Files
Test_context_pretrain/artifacts/training/trainer_state.json
ModelHub XC 200675bd2d 初始化项目,由ModelHub XC社区提供模型
Model: MathMindsAGI/Test_context_pretrain
Source: Original Platform
2026-04-11 11:04:57 +08:00

13183 lines
323 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 18779,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000532509718302359,
"grad_norm": 23.296741485595703,
"learning_rate": 9.584664536741213e-07,
"loss": 7.8793,
"step": 10
},
{
"epoch": 0.001065019436604718,
"grad_norm": 19.85402488708496,
"learning_rate": 2.023429179978701e-06,
"loss": 7.7093,
"step": 20
},
{
"epoch": 0.001597529154907077,
"grad_norm": 10.07481861114502,
"learning_rate": 3.08839190628328e-06,
"loss": 7.0137,
"step": 30
},
{
"epoch": 0.002130038873209436,
"grad_norm": 5.072307109832764,
"learning_rate": 4.153354632587859e-06,
"loss": 6.1742,
"step": 40
},
{
"epoch": 0.002662548591511795,
"grad_norm": 3.5329959392547607,
"learning_rate": 5.218317358892439e-06,
"loss": 5.6068,
"step": 50
},
{
"epoch": 0.003195058309814154,
"grad_norm": 3.8453404903411865,
"learning_rate": 6.283280085197018e-06,
"loss": 5.1784,
"step": 60
},
{
"epoch": 0.003727568028116513,
"grad_norm": 6.184778213500977,
"learning_rate": 7.3482428115015974e-06,
"loss": 4.6031,
"step": 70
},
{
"epoch": 0.004260077746418872,
"grad_norm": 4.906091690063477,
"learning_rate": 8.413205537806178e-06,
"loss": 3.9631,
"step": 80
},
{
"epoch": 0.004792587464721231,
"grad_norm": 9.875988960266113,
"learning_rate": 9.478168264110757e-06,
"loss": 3.5113,
"step": 90
},
{
"epoch": 0.00532509718302359,
"grad_norm": 5.586822986602783,
"learning_rate": 1.0543130990415335e-05,
"loss": 3.1884,
"step": 100
},
{
"epoch": 0.005857606901325949,
"grad_norm": 9.180880546569824,
"learning_rate": 1.1608093716719916e-05,
"loss": 2.9137,
"step": 110
},
{
"epoch": 0.006390116619628308,
"grad_norm": 17.583784103393555,
"learning_rate": 1.2673056443024495e-05,
"loss": 2.7283,
"step": 120
},
{
"epoch": 0.006922626337930667,
"grad_norm": 16.766233444213867,
"learning_rate": 1.3738019169329076e-05,
"loss": 2.5699,
"step": 130
},
{
"epoch": 0.007455136056233026,
"grad_norm": 11.388614654541016,
"learning_rate": 1.4802981895633653e-05,
"loss": 2.4291,
"step": 140
},
{
"epoch": 0.007987645774535385,
"grad_norm": 18.473289489746094,
"learning_rate": 1.5867944621938232e-05,
"loss": 2.3001,
"step": 150
},
{
"epoch": 0.008520155492837744,
"grad_norm": 12.690078735351562,
"learning_rate": 1.693290734824281e-05,
"loss": 2.1744,
"step": 160
},
{
"epoch": 0.009052665211140103,
"grad_norm": 10.144042015075684,
"learning_rate": 1.799787007454739e-05,
"loss": 2.0552,
"step": 170
},
{
"epoch": 0.009585174929442462,
"grad_norm": 11.107041358947754,
"learning_rate": 1.906283280085197e-05,
"loss": 1.9585,
"step": 180
},
{
"epoch": 0.010117684647744821,
"grad_norm": 14.497051239013672,
"learning_rate": 2.0127795527156552e-05,
"loss": 1.8718,
"step": 190
},
{
"epoch": 0.01065019436604718,
"grad_norm": 10.508237838745117,
"learning_rate": 2.1192758253461128e-05,
"loss": 1.8153,
"step": 200
},
{
"epoch": 0.01118270408434954,
"grad_norm": 11.81551742553711,
"learning_rate": 2.2257720979765707e-05,
"loss": 1.7521,
"step": 210
},
{
"epoch": 0.011715213802651898,
"grad_norm": 7.003968238830566,
"learning_rate": 2.332268370607029e-05,
"loss": 1.7067,
"step": 220
},
{
"epoch": 0.012247723520954257,
"grad_norm": 9.637007713317871,
"learning_rate": 2.438764643237487e-05,
"loss": 1.658,
"step": 230
},
{
"epoch": 0.012780233239256616,
"grad_norm": 11.963647842407227,
"learning_rate": 2.5452609158679448e-05,
"loss": 1.6003,
"step": 240
},
{
"epoch": 0.013312742957558975,
"grad_norm": 15.572464942932129,
"learning_rate": 2.6517571884984027e-05,
"loss": 1.5396,
"step": 250
},
{
"epoch": 0.013845252675861335,
"grad_norm": 10.560100555419922,
"learning_rate": 2.7582534611288606e-05,
"loss": 1.4679,
"step": 260
},
{
"epoch": 0.014377762394163694,
"grad_norm": 14.625675201416016,
"learning_rate": 2.864749733759319e-05,
"loss": 1.3871,
"step": 270
},
{
"epoch": 0.014910272112466053,
"grad_norm": 15.250794410705566,
"learning_rate": 2.971246006389776e-05,
"loss": 1.2908,
"step": 280
},
{
"epoch": 0.015442781830768412,
"grad_norm": 10.370095252990723,
"learning_rate": 3.0777422790202344e-05,
"loss": 1.1773,
"step": 290
},
{
"epoch": 0.01597529154907077,
"grad_norm": 14.734580993652344,
"learning_rate": 3.1842385516506926e-05,
"loss": 1.0634,
"step": 300
},
{
"epoch": 0.01650780126737313,
"grad_norm": 11.359335899353027,
"learning_rate": 3.29073482428115e-05,
"loss": 0.9367,
"step": 310
},
{
"epoch": 0.01704031098567549,
"grad_norm": 15.065919876098633,
"learning_rate": 3.3972310969116084e-05,
"loss": 0.851,
"step": 320
},
{
"epoch": 0.017572820703977848,
"grad_norm": 11.290328025817871,
"learning_rate": 3.503727369542067e-05,
"loss": 0.7698,
"step": 330
},
{
"epoch": 0.018105330422280207,
"grad_norm": 9.410698890686035,
"learning_rate": 3.610223642172524e-05,
"loss": 0.6942,
"step": 340
},
{
"epoch": 0.018637840140582566,
"grad_norm": 7.125499725341797,
"learning_rate": 3.716719914802982e-05,
"loss": 0.6353,
"step": 350
},
{
"epoch": 0.019170349858884925,
"grad_norm": 11.152689933776855,
"learning_rate": 3.82321618743344e-05,
"loss": 0.6036,
"step": 360
},
{
"epoch": 0.019702859577187284,
"grad_norm": 7.263124465942383,
"learning_rate": 3.929712460063898e-05,
"loss": 0.5523,
"step": 370
},
{
"epoch": 0.020235369295489643,
"grad_norm": 6.285194396972656,
"learning_rate": 4.036208732694356e-05,
"loss": 0.5131,
"step": 380
},
{
"epoch": 0.020767879013792002,
"grad_norm": 3.8969569206237793,
"learning_rate": 4.142705005324814e-05,
"loss": 0.4857,
"step": 390
},
{
"epoch": 0.02130038873209436,
"grad_norm": 4.850637912750244,
"learning_rate": 4.249201277955272e-05,
"loss": 0.4606,
"step": 400
},
{
"epoch": 0.02183289845039672,
"grad_norm": 3.3862061500549316,
"learning_rate": 4.355697550585729e-05,
"loss": 0.4423,
"step": 410
},
{
"epoch": 0.02236540816869908,
"grad_norm": 3.709831714630127,
"learning_rate": 4.4621938232161876e-05,
"loss": 0.4269,
"step": 420
},
{
"epoch": 0.022897917887001438,
"grad_norm": 3.6177499294281006,
"learning_rate": 4.568690095846646e-05,
"loss": 0.4109,
"step": 430
},
{
"epoch": 0.023430427605303797,
"grad_norm": 3.2016589641571045,
"learning_rate": 4.6751863684771034e-05,
"loss": 0.4038,
"step": 440
},
{
"epoch": 0.023962937323606156,
"grad_norm": 3.1846377849578857,
"learning_rate": 4.781682641107562e-05,
"loss": 0.3922,
"step": 450
},
{
"epoch": 0.024495447041908515,
"grad_norm": 3.7085001468658447,
"learning_rate": 4.88817891373802e-05,
"loss": 0.3852,
"step": 460
},
{
"epoch": 0.025027956760210874,
"grad_norm": 1.9565516710281372,
"learning_rate": 4.994675186368477e-05,
"loss": 0.3727,
"step": 470
},
{
"epoch": 0.025560466478513233,
"grad_norm": 2.378927230834961,
"learning_rate": 5.101171458998936e-05,
"loss": 0.3654,
"step": 480
},
{
"epoch": 0.026092976196815592,
"grad_norm": 1.7363628149032593,
"learning_rate": 5.207667731629393e-05,
"loss": 0.3528,
"step": 490
},
{
"epoch": 0.02662548591511795,
"grad_norm": 2.0001909732818604,
"learning_rate": 5.314164004259851e-05,
"loss": 0.3472,
"step": 500
},
{
"epoch": 0.02715799563342031,
"grad_norm": 2.2477715015411377,
"learning_rate": 5.420660276890309e-05,
"loss": 0.3334,
"step": 510
},
{
"epoch": 0.02769050535172267,
"grad_norm": 1.9201889038085938,
"learning_rate": 5.527156549520767e-05,
"loss": 0.3183,
"step": 520
},
{
"epoch": 0.028223015070025028,
"grad_norm": 1.6685700416564941,
"learning_rate": 5.633652822151225e-05,
"loss": 0.3067,
"step": 530
},
{
"epoch": 0.028755524788327387,
"grad_norm": 1.8929866552352905,
"learning_rate": 5.7401490947816826e-05,
"loss": 0.293,
"step": 540
},
{
"epoch": 0.029288034506629746,
"grad_norm": 1.649090051651001,
"learning_rate": 5.8466453674121415e-05,
"loss": 0.2802,
"step": 550
},
{
"epoch": 0.029820544224932105,
"grad_norm": 0.9757211804389954,
"learning_rate": 5.953141640042599e-05,
"loss": 0.2706,
"step": 560
},
{
"epoch": 0.030353053943234464,
"grad_norm": 1.1477692127227783,
"learning_rate": 6.059637912673056e-05,
"loss": 0.2616,
"step": 570
},
{
"epoch": 0.030885563661536823,
"grad_norm": 1.2151044607162476,
"learning_rate": 6.166134185303514e-05,
"loss": 0.2488,
"step": 580
},
{
"epoch": 0.03141807337983918,
"grad_norm": 0.9159660339355469,
"learning_rate": 6.272630457933972e-05,
"loss": 0.2435,
"step": 590
},
{
"epoch": 0.03195058309814154,
"grad_norm": 0.9337270855903625,
"learning_rate": 6.379126730564431e-05,
"loss": 0.2341,
"step": 600
},
{
"epoch": 0.0324830928164439,
"grad_norm": 1.3477636575698853,
"learning_rate": 6.485623003194888e-05,
"loss": 0.2271,
"step": 610
},
{
"epoch": 0.03301560253474626,
"grad_norm": 0.9192898869514465,
"learning_rate": 6.592119275825347e-05,
"loss": 0.2222,
"step": 620
},
{
"epoch": 0.033548112253048615,
"grad_norm": 0.6390801668167114,
"learning_rate": 6.698615548455805e-05,
"loss": 0.2165,
"step": 630
},
{
"epoch": 0.03408062197135098,
"grad_norm": 0.6918138265609741,
"learning_rate": 6.805111821086262e-05,
"loss": 0.213,
"step": 640
},
{
"epoch": 0.03461313168965333,
"grad_norm": 0.6979911923408508,
"learning_rate": 6.91160809371672e-05,
"loss": 0.2126,
"step": 650
},
{
"epoch": 0.035145641407955695,
"grad_norm": 0.7399368286132812,
"learning_rate": 7.018104366347178e-05,
"loss": 0.2094,
"step": 660
},
{
"epoch": 0.03567815112625805,
"grad_norm": 0.7001500725746155,
"learning_rate": 7.124600638977636e-05,
"loss": 0.2083,
"step": 670
},
{
"epoch": 0.03621066084456041,
"grad_norm": 0.7533488273620605,
"learning_rate": 7.231096911608094e-05,
"loss": 0.207,
"step": 680
},
{
"epoch": 0.03674317056286277,
"grad_norm": 0.6705746054649353,
"learning_rate": 7.337593184238552e-05,
"loss": 0.2053,
"step": 690
},
{
"epoch": 0.03727568028116513,
"grad_norm": 0.4637382924556732,
"learning_rate": 7.44408945686901e-05,
"loss": 0.204,
"step": 700
},
{
"epoch": 0.03780818999946749,
"grad_norm": 0.40789374709129333,
"learning_rate": 7.550585729499468e-05,
"loss": 0.2019,
"step": 710
},
{
"epoch": 0.03834069971776985,
"grad_norm": 0.591678261756897,
"learning_rate": 7.657082002129926e-05,
"loss": 0.2011,
"step": 720
},
{
"epoch": 0.038873209436072205,
"grad_norm": 0.5219926834106445,
"learning_rate": 7.763578274760383e-05,
"loss": 0.1987,
"step": 730
},
{
"epoch": 0.03940571915437457,
"grad_norm": 0.47858574986457825,
"learning_rate": 7.870074547390842e-05,
"loss": 0.1976,
"step": 740
},
{
"epoch": 0.03993822887267692,
"grad_norm": 0.752047061920166,
"learning_rate": 7.9765708200213e-05,
"loss": 0.1987,
"step": 750
},
{
"epoch": 0.040470738590979285,
"grad_norm": 0.4126583933830261,
"learning_rate": 8.083067092651757e-05,
"loss": 0.1971,
"step": 760
},
{
"epoch": 0.04100324830928164,
"grad_norm": 0.8404585719108582,
"learning_rate": 8.189563365282216e-05,
"loss": 0.1948,
"step": 770
},
{
"epoch": 0.041535758027584004,
"grad_norm": 0.5083792209625244,
"learning_rate": 8.296059637912672e-05,
"loss": 0.194,
"step": 780
},
{
"epoch": 0.04206826774588636,
"grad_norm": 0.3890551030635834,
"learning_rate": 8.402555910543131e-05,
"loss": 0.1923,
"step": 790
},
{
"epoch": 0.04260077746418872,
"grad_norm": 0.7016918063163757,
"learning_rate": 8.509052183173589e-05,
"loss": 0.1912,
"step": 800
},
{
"epoch": 0.04313328718249108,
"grad_norm": 0.44527336955070496,
"learning_rate": 8.615548455804048e-05,
"loss": 0.1919,
"step": 810
},
{
"epoch": 0.04366579690079344,
"grad_norm": 0.3990408778190613,
"learning_rate": 8.722044728434506e-05,
"loss": 0.1905,
"step": 820
},
{
"epoch": 0.044198306619095795,
"grad_norm": 0.3964357078075409,
"learning_rate": 8.828541001064963e-05,
"loss": 0.1882,
"step": 830
},
{
"epoch": 0.04473081633739816,
"grad_norm": 0.6267169713973999,
"learning_rate": 8.93503727369542e-05,
"loss": 0.1894,
"step": 840
},
{
"epoch": 0.04526332605570051,
"grad_norm": 0.3614656329154968,
"learning_rate": 9.041533546325878e-05,
"loss": 0.1873,
"step": 850
},
{
"epoch": 0.045795835774002876,
"grad_norm": 0.3725983202457428,
"learning_rate": 9.148029818956337e-05,
"loss": 0.1854,
"step": 860
},
{
"epoch": 0.04632834549230523,
"grad_norm": 0.7198257446289062,
"learning_rate": 9.254526091586795e-05,
"loss": 0.1854,
"step": 870
},
{
"epoch": 0.046860855210607594,
"grad_norm": 0.5347720980644226,
"learning_rate": 9.361022364217252e-05,
"loss": 0.1856,
"step": 880
},
{
"epoch": 0.04739336492890995,
"grad_norm": 0.36126938462257385,
"learning_rate": 9.467518636847711e-05,
"loss": 0.1832,
"step": 890
},
{
"epoch": 0.04792587464721231,
"grad_norm": 0.5364170670509338,
"learning_rate": 9.574014909478169e-05,
"loss": 0.1837,
"step": 900
},
{
"epoch": 0.04845838436551467,
"grad_norm": 0.3289523422718048,
"learning_rate": 9.680511182108626e-05,
"loss": 0.1819,
"step": 910
},
{
"epoch": 0.04899089408381703,
"grad_norm": 0.3482621908187866,
"learning_rate": 9.787007454739084e-05,
"loss": 0.1825,
"step": 920
},
{
"epoch": 0.049523403802119385,
"grad_norm": 0.5768856406211853,
"learning_rate": 9.893503727369543e-05,
"loss": 0.1829,
"step": 930
},
{
"epoch": 0.05005591352042175,
"grad_norm": 0.337167352437973,
"learning_rate": 0.0001,
"loss": 0.1821,
"step": 940
},
{
"epoch": 0.0505884232387241,
"grad_norm": 0.2395765632390976,
"learning_rate": 9.99999457314795e-05,
"loss": 0.1807,
"step": 950
},
{
"epoch": 0.051120932957026466,
"grad_norm": 0.37906885147094727,
"learning_rate": 9.999978292608627e-05,
"loss": 0.1795,
"step": 960
},
{
"epoch": 0.05165344267532882,
"grad_norm": 0.4830165207386017,
"learning_rate": 9.999951158432521e-05,
"loss": 0.1801,
"step": 970
},
{
"epoch": 0.052185952393631184,
"grad_norm": 0.3381877839565277,
"learning_rate": 9.999913170703776e-05,
"loss": 0.1786,
"step": 980
},
{
"epoch": 0.05271846211193354,
"grad_norm": 0.4923486113548279,
"learning_rate": 9.999864329540193e-05,
"loss": 0.1792,
"step": 990
},
{
"epoch": 0.0532509718302359,
"grad_norm": 0.29676854610443115,
"learning_rate": 9.999804635093233e-05,
"loss": 0.1775,
"step": 1000
},
{
"epoch": 0.05378348154853826,
"grad_norm": 0.5925562977790833,
"learning_rate": 9.999734087548009e-05,
"loss": 0.1774,
"step": 1010
},
{
"epoch": 0.05431599126684062,
"grad_norm": 0.5015407800674438,
"learning_rate": 9.999652687123293e-05,
"loss": 0.1762,
"step": 1020
},
{
"epoch": 0.054848500985142976,
"grad_norm": 0.2539210915565491,
"learning_rate": 9.999560434071517e-05,
"loss": 0.1756,
"step": 1030
},
{
"epoch": 0.05538101070344534,
"grad_norm": 0.3802579343318939,
"learning_rate": 9.999457328678761e-05,
"loss": 0.1749,
"step": 1040
},
{
"epoch": 0.055913520421747694,
"grad_norm": 0.3460189998149872,
"learning_rate": 9.999343371264757e-05,
"loss": 0.1751,
"step": 1050
},
{
"epoch": 0.056446030140050056,
"grad_norm": 0.4987145960330963,
"learning_rate": 9.9992185621829e-05,
"loss": 0.1732,
"step": 1060
},
{
"epoch": 0.05697853985835241,
"grad_norm": 0.3508310616016388,
"learning_rate": 9.999082901820225e-05,
"loss": 0.173,
"step": 1070
},
{
"epoch": 0.057511049576654774,
"grad_norm": 0.6434867978096008,
"learning_rate": 9.998936390597424e-05,
"loss": 0.1733,
"step": 1080
},
{
"epoch": 0.05804355929495713,
"grad_norm": 0.25631335377693176,
"learning_rate": 9.998779028968839e-05,
"loss": 0.1727,
"step": 1090
},
{
"epoch": 0.05857606901325949,
"grad_norm": 0.7324220538139343,
"learning_rate": 9.998610817422456e-05,
"loss": 0.1721,
"step": 1100
},
{
"epoch": 0.05910857873156185,
"grad_norm": 0.2557462453842163,
"learning_rate": 9.998431756479907e-05,
"loss": 0.1718,
"step": 1110
},
{
"epoch": 0.05964108844986421,
"grad_norm": 0.23237043619155884,
"learning_rate": 9.998241846696474e-05,
"loss": 0.1708,
"step": 1120
},
{
"epoch": 0.060173598168166566,
"grad_norm": 0.7491874098777771,
"learning_rate": 9.998041088661076e-05,
"loss": 0.1709,
"step": 1130
},
{
"epoch": 0.06070610788646893,
"grad_norm": 0.3187454640865326,
"learning_rate": 9.997829482996277e-05,
"loss": 0.1706,
"step": 1140
},
{
"epoch": 0.061238617604771284,
"grad_norm": 0.28493937849998474,
"learning_rate": 9.997607030358276e-05,
"loss": 0.1696,
"step": 1150
},
{
"epoch": 0.061771127323073646,
"grad_norm": 0.6693065166473389,
"learning_rate": 9.997373731436915e-05,
"loss": 0.1693,
"step": 1160
},
{
"epoch": 0.062303637041376,
"grad_norm": 0.4238905608654022,
"learning_rate": 9.997129586955665e-05,
"loss": 0.1691,
"step": 1170
},
{
"epoch": 0.06283614675967836,
"grad_norm": 0.5617002248764038,
"learning_rate": 9.996874597671633e-05,
"loss": 0.1698,
"step": 1180
},
{
"epoch": 0.06336865647798072,
"grad_norm": 0.29317113757133484,
"learning_rate": 9.996608764375555e-05,
"loss": 0.1684,
"step": 1190
},
{
"epoch": 0.06390116619628308,
"grad_norm": 0.4380682706832886,
"learning_rate": 9.996332087891795e-05,
"loss": 0.1688,
"step": 1200
},
{
"epoch": 0.06443367591458544,
"grad_norm": 0.7158795595169067,
"learning_rate": 9.996044569078347e-05,
"loss": 0.1681,
"step": 1210
},
{
"epoch": 0.0649661856328878,
"grad_norm": 0.39761292934417725,
"learning_rate": 9.99574620882682e-05,
"loss": 0.1679,
"step": 1220
},
{
"epoch": 0.06549869535119016,
"grad_norm": 0.2829475700855255,
"learning_rate": 9.995437008062444e-05,
"loss": 0.1683,
"step": 1230
},
{
"epoch": 0.06603120506949252,
"grad_norm": 0.22211559116840363,
"learning_rate": 9.995116967744076e-05,
"loss": 0.1676,
"step": 1240
},
{
"epoch": 0.06656371478779488,
"grad_norm": 0.24345025420188904,
"learning_rate": 9.994786088864172e-05,
"loss": 0.1654,
"step": 1250
},
{
"epoch": 0.06709622450609723,
"grad_norm": 0.2510230243206024,
"learning_rate": 9.994444372448812e-05,
"loss": 0.1671,
"step": 1260
},
{
"epoch": 0.06762873422439959,
"grad_norm": 0.26244106888771057,
"learning_rate": 9.994091819557676e-05,
"loss": 0.1659,
"step": 1270
},
{
"epoch": 0.06816124394270195,
"grad_norm": 0.2892049252986908,
"learning_rate": 9.993728431284053e-05,
"loss": 0.1652,
"step": 1280
},
{
"epoch": 0.06869375366100432,
"grad_norm": 0.4036615788936615,
"learning_rate": 9.993354208754828e-05,
"loss": 0.165,
"step": 1290
},
{
"epoch": 0.06922626337930667,
"grad_norm": 0.32690417766571045,
"learning_rate": 9.992969153130491e-05,
"loss": 0.1646,
"step": 1300
},
{
"epoch": 0.06975877309760903,
"grad_norm": 0.9821091294288635,
"learning_rate": 9.992573265605119e-05,
"loss": 0.1651,
"step": 1310
},
{
"epoch": 0.07029128281591139,
"grad_norm": 0.23433181643486023,
"learning_rate": 9.992166547406383e-05,
"loss": 0.1659,
"step": 1320
},
{
"epoch": 0.07082379253421375,
"grad_norm": 0.3498155474662781,
"learning_rate": 9.99174899979554e-05,
"loss": 0.165,
"step": 1330
},
{
"epoch": 0.0713563022525161,
"grad_norm": 0.19588203728199005,
"learning_rate": 9.991320624067431e-05,
"loss": 0.1632,
"step": 1340
},
{
"epoch": 0.07188881197081846,
"grad_norm": 0.3548436462879181,
"learning_rate": 9.99088142155047e-05,
"loss": 0.1639,
"step": 1350
},
{
"epoch": 0.07242132168912083,
"grad_norm": 0.3274150788784027,
"learning_rate": 9.990431393606654e-05,
"loss": 0.1623,
"step": 1360
},
{
"epoch": 0.07295383140742319,
"grad_norm": 0.5695179104804993,
"learning_rate": 9.989970541631544e-05,
"loss": 0.1634,
"step": 1370
},
{
"epoch": 0.07348634112572554,
"grad_norm": 0.21706371009349823,
"learning_rate": 9.989498867054268e-05,
"loss": 0.1619,
"step": 1380
},
{
"epoch": 0.0740188508440279,
"grad_norm": 0.45233970880508423,
"learning_rate": 9.989016371337518e-05,
"loss": 0.1622,
"step": 1390
},
{
"epoch": 0.07455136056233026,
"grad_norm": 0.5718231797218323,
"learning_rate": 9.988523055977541e-05,
"loss": 0.1631,
"step": 1400
},
{
"epoch": 0.07508387028063263,
"grad_norm": 0.6669481992721558,
"learning_rate": 9.988018922504137e-05,
"loss": 0.1625,
"step": 1410
},
{
"epoch": 0.07561637999893497,
"grad_norm": 0.25058674812316895,
"learning_rate": 9.987503972480652e-05,
"loss": 0.162,
"step": 1420
},
{
"epoch": 0.07614888971723734,
"grad_norm": 0.2735210359096527,
"learning_rate": 9.986978207503977e-05,
"loss": 0.1617,
"step": 1430
},
{
"epoch": 0.0766813994355397,
"grad_norm": 0.2729678750038147,
"learning_rate": 9.98644162920454e-05,
"loss": 0.1607,
"step": 1440
},
{
"epoch": 0.07721390915384206,
"grad_norm": 0.20154890418052673,
"learning_rate": 9.985894239246298e-05,
"loss": 0.1612,
"step": 1450
},
{
"epoch": 0.07774641887214441,
"grad_norm": 0.37646111845970154,
"learning_rate": 9.985336039326747e-05,
"loss": 0.1602,
"step": 1460
},
{
"epoch": 0.07827892859044677,
"grad_norm": 0.29742431640625,
"learning_rate": 9.98476703117689e-05,
"loss": 0.1609,
"step": 1470
},
{
"epoch": 0.07881143830874913,
"grad_norm": 0.2643822729587555,
"learning_rate": 9.984187216561258e-05,
"loss": 0.1612,
"step": 1480
},
{
"epoch": 0.0793439480270515,
"grad_norm": 0.22973056137561798,
"learning_rate": 9.98359659727789e-05,
"loss": 0.1614,
"step": 1490
},
{
"epoch": 0.07987645774535385,
"grad_norm": 0.24614231288433075,
"learning_rate": 9.982995175158327e-05,
"loss": 0.1621,
"step": 1500
},
{
"epoch": 0.08040896746365621,
"grad_norm": 0.2537037134170532,
"learning_rate": 9.98238295206762e-05,
"loss": 0.1608,
"step": 1510
},
{
"epoch": 0.08094147718195857,
"grad_norm": 0.26126566529273987,
"learning_rate": 9.981759929904306e-05,
"loss": 0.1596,
"step": 1520
},
{
"epoch": 0.08147398690026093,
"grad_norm": 1.0934852361679077,
"learning_rate": 9.981126110600411e-05,
"loss": 0.1598,
"step": 1530
},
{
"epoch": 0.08200649661856328,
"grad_norm": 0.21107517182826996,
"learning_rate": 9.98048149612145e-05,
"loss": 0.1601,
"step": 1540
},
{
"epoch": 0.08253900633686564,
"grad_norm": 0.2115686535835266,
"learning_rate": 9.979826088466405e-05,
"loss": 0.1598,
"step": 1550
},
{
"epoch": 0.08307151605516801,
"grad_norm": 0.23121733963489532,
"learning_rate": 9.979159889667738e-05,
"loss": 0.1592,
"step": 1560
},
{
"epoch": 0.08360402577347037,
"grad_norm": 0.19117231667041779,
"learning_rate": 9.978482901791366e-05,
"loss": 0.1591,
"step": 1570
},
{
"epoch": 0.08413653549177272,
"grad_norm": 0.274919331073761,
"learning_rate": 9.977795126936671e-05,
"loss": 0.1593,
"step": 1580
},
{
"epoch": 0.08466904521007508,
"grad_norm": 0.17615851759910583,
"learning_rate": 9.977096567236481e-05,
"loss": 0.1586,
"step": 1590
},
{
"epoch": 0.08520155492837744,
"grad_norm": 0.3320156931877136,
"learning_rate": 9.976387224857071e-05,
"loss": 0.1575,
"step": 1600
},
{
"epoch": 0.0857340646466798,
"grad_norm": 0.56144779920578,
"learning_rate": 9.975667101998153e-05,
"loss": 0.1587,
"step": 1610
},
{
"epoch": 0.08626657436498215,
"grad_norm": 0.3506183326244354,
"learning_rate": 9.974936200892874e-05,
"loss": 0.1588,
"step": 1620
},
{
"epoch": 0.08679908408328452,
"grad_norm": 0.22701147198677063,
"learning_rate": 9.974194523807796e-05,
"loss": 0.1581,
"step": 1630
},
{
"epoch": 0.08733159380158688,
"grad_norm": 0.27218353748321533,
"learning_rate": 9.973442073042903e-05,
"loss": 0.1584,
"step": 1640
},
{
"epoch": 0.08786410351988924,
"grad_norm": 0.21976235508918762,
"learning_rate": 9.972678850931589e-05,
"loss": 0.1571,
"step": 1650
},
{
"epoch": 0.08839661323819159,
"grad_norm": 0.22460529208183289,
"learning_rate": 9.971904859840653e-05,
"loss": 0.157,
"step": 1660
},
{
"epoch": 0.08892912295649395,
"grad_norm": 0.16970294713974,
"learning_rate": 9.971120102170283e-05,
"loss": 0.1575,
"step": 1670
},
{
"epoch": 0.08946163267479632,
"grad_norm": 0.31050947308540344,
"learning_rate": 9.970324580354063e-05,
"loss": 0.1568,
"step": 1680
},
{
"epoch": 0.08999414239309868,
"grad_norm": 0.22615467011928558,
"learning_rate": 9.969518296858946e-05,
"loss": 0.1574,
"step": 1690
},
{
"epoch": 0.09052665211140103,
"grad_norm": 0.15380023419857025,
"learning_rate": 9.968701254185271e-05,
"loss": 0.1567,
"step": 1700
},
{
"epoch": 0.09105916182970339,
"grad_norm": 0.20212256908416748,
"learning_rate": 9.96787345486673e-05,
"loss": 0.157,
"step": 1710
},
{
"epoch": 0.09159167154800575,
"grad_norm": 0.24987904727458954,
"learning_rate": 9.967034901470377e-05,
"loss": 0.1574,
"step": 1720
},
{
"epoch": 0.09212418126630811,
"grad_norm": 0.30531537532806396,
"learning_rate": 9.966185596596618e-05,
"loss": 0.1557,
"step": 1730
},
{
"epoch": 0.09265669098461046,
"grad_norm": 0.23161855340003967,
"learning_rate": 9.965325542879196e-05,
"loss": 0.1568,
"step": 1740
},
{
"epoch": 0.09318920070291282,
"grad_norm": 0.5445181727409363,
"learning_rate": 9.964454742985188e-05,
"loss": 0.1557,
"step": 1750
},
{
"epoch": 0.09372171042121519,
"grad_norm": 0.28040483593940735,
"learning_rate": 9.963573199614992e-05,
"loss": 0.1573,
"step": 1760
},
{
"epoch": 0.09425422013951755,
"grad_norm": 0.16802328824996948,
"learning_rate": 9.962680915502331e-05,
"loss": 0.157,
"step": 1770
},
{
"epoch": 0.0947867298578199,
"grad_norm": 0.15088757872581482,
"learning_rate": 9.961777893414226e-05,
"loss": 0.1568,
"step": 1780
},
{
"epoch": 0.09531923957612226,
"grad_norm": 0.1732264906167984,
"learning_rate": 9.960864136151e-05,
"loss": 0.1562,
"step": 1790
},
{
"epoch": 0.09585174929442462,
"grad_norm": 0.5040917992591858,
"learning_rate": 9.959939646546272e-05,
"loss": 0.1558,
"step": 1800
},
{
"epoch": 0.09638425901272699,
"grad_norm": 0.19744379818439484,
"learning_rate": 9.959004427466935e-05,
"loss": 0.1559,
"step": 1810
},
{
"epoch": 0.09691676873102933,
"grad_norm": 0.18064717948436737,
"learning_rate": 9.958058481813158e-05,
"loss": 0.1552,
"step": 1820
},
{
"epoch": 0.0974492784493317,
"grad_norm": 0.1613135039806366,
"learning_rate": 9.957101812518377e-05,
"loss": 0.1556,
"step": 1830
},
{
"epoch": 0.09798178816763406,
"grad_norm": 0.20252278447151184,
"learning_rate": 9.956134422549275e-05,
"loss": 0.1551,
"step": 1840
},
{
"epoch": 0.09851429788593642,
"grad_norm": 0.3912264108657837,
"learning_rate": 9.955156314905785e-05,
"loss": 0.1549,
"step": 1850
},
{
"epoch": 0.09904680760423877,
"grad_norm": 0.19006063044071198,
"learning_rate": 9.954167492621079e-05,
"loss": 0.156,
"step": 1860
},
{
"epoch": 0.09957931732254113,
"grad_norm": 0.37275323271751404,
"learning_rate": 9.953167958761552e-05,
"loss": 0.1557,
"step": 1870
},
{
"epoch": 0.1001118270408435,
"grad_norm": 0.17530041933059692,
"learning_rate": 9.952157716426813e-05,
"loss": 0.1551,
"step": 1880
},
{
"epoch": 0.10064433675914586,
"grad_norm": 0.5232445597648621,
"learning_rate": 9.951136768749685e-05,
"loss": 0.1546,
"step": 1890
},
{
"epoch": 0.1011768464774482,
"grad_norm": 0.42586958408355713,
"learning_rate": 9.950105118896186e-05,
"loss": 0.1551,
"step": 1900
},
{
"epoch": 0.10170935619575057,
"grad_norm": 0.2536565065383911,
"learning_rate": 9.949062770065525e-05,
"loss": 0.155,
"step": 1910
},
{
"epoch": 0.10224186591405293,
"grad_norm": 0.15420402586460114,
"learning_rate": 9.948009725490082e-05,
"loss": 0.154,
"step": 1920
},
{
"epoch": 0.1027743756323553,
"grad_norm": 0.20187288522720337,
"learning_rate": 9.946945988435414e-05,
"loss": 0.1551,
"step": 1930
},
{
"epoch": 0.10330688535065764,
"grad_norm": 0.17882299423217773,
"learning_rate": 9.945871562200226e-05,
"loss": 0.1548,
"step": 1940
},
{
"epoch": 0.10383939506896,
"grad_norm": 0.18503925204277039,
"learning_rate": 9.944786450116384e-05,
"loss": 0.1544,
"step": 1950
},
{
"epoch": 0.10437190478726237,
"grad_norm": 0.21693278849124908,
"learning_rate": 9.943690655548876e-05,
"loss": 0.154,
"step": 1960
},
{
"epoch": 0.10490441450556473,
"grad_norm": 0.2536572515964508,
"learning_rate": 9.942584181895831e-05,
"loss": 0.154,
"step": 1970
},
{
"epoch": 0.10543692422386708,
"grad_norm": 0.20351417362689972,
"learning_rate": 9.941467032588483e-05,
"loss": 0.1531,
"step": 1980
},
{
"epoch": 0.10596943394216944,
"grad_norm": 0.14263711869716644,
"learning_rate": 9.940339211091182e-05,
"loss": 0.1541,
"step": 1990
},
{
"epoch": 0.1065019436604718,
"grad_norm": 0.21371303498744965,
"learning_rate": 9.939200720901367e-05,
"loss": 0.1544,
"step": 2000
},
{
"epoch": 0.10703445337877417,
"grad_norm": 0.18548338115215302,
"learning_rate": 9.93805156554956e-05,
"loss": 0.154,
"step": 2010
},
{
"epoch": 0.10756696309707652,
"grad_norm": 0.14423610270023346,
"learning_rate": 9.936891748599362e-05,
"loss": 0.1555,
"step": 2020
},
{
"epoch": 0.10809947281537888,
"grad_norm": 0.2253378927707672,
"learning_rate": 9.935721273647429e-05,
"loss": 0.1537,
"step": 2030
},
{
"epoch": 0.10863198253368124,
"grad_norm": 0.17178262770175934,
"learning_rate": 9.934540144323477e-05,
"loss": 0.1533,
"step": 2040
},
{
"epoch": 0.1091644922519836,
"grad_norm": 0.23457373678684235,
"learning_rate": 9.933348364290253e-05,
"loss": 0.1541,
"step": 2050
},
{
"epoch": 0.10969700197028595,
"grad_norm": 0.3339991867542267,
"learning_rate": 9.932145937243537e-05,
"loss": 0.1526,
"step": 2060
},
{
"epoch": 0.11022951168858831,
"grad_norm": 0.18367235362529755,
"learning_rate": 9.930932866912128e-05,
"loss": 0.1536,
"step": 2070
},
{
"epoch": 0.11076202140689068,
"grad_norm": 0.1779884397983551,
"learning_rate": 9.929709157057828e-05,
"loss": 0.1522,
"step": 2080
},
{
"epoch": 0.11129453112519304,
"grad_norm": 0.24092677235603333,
"learning_rate": 9.928474811475426e-05,
"loss": 0.1528,
"step": 2090
},
{
"epoch": 0.11182704084349539,
"grad_norm": 0.16710165143013,
"learning_rate": 9.927229833992706e-05,
"loss": 0.1525,
"step": 2100
},
{
"epoch": 0.11235955056179775,
"grad_norm": 0.3042786121368408,
"learning_rate": 9.925974228470415e-05,
"loss": 0.1545,
"step": 2110
},
{
"epoch": 0.11289206028010011,
"grad_norm": 0.14976242184638977,
"learning_rate": 9.924707998802259e-05,
"loss": 0.1531,
"step": 2120
},
{
"epoch": 0.11342456999840247,
"grad_norm": 0.1597498059272766,
"learning_rate": 9.923431148914885e-05,
"loss": 0.1523,
"step": 2130
},
{
"epoch": 0.11395707971670482,
"grad_norm": 0.2503865957260132,
"learning_rate": 9.922143682767886e-05,
"loss": 0.1527,
"step": 2140
},
{
"epoch": 0.11448958943500719,
"grad_norm": 0.240915447473526,
"learning_rate": 9.920845604353768e-05,
"loss": 0.1531,
"step": 2150
},
{
"epoch": 0.11502209915330955,
"grad_norm": 0.28919148445129395,
"learning_rate": 9.919536917697942e-05,
"loss": 0.1527,
"step": 2160
},
{
"epoch": 0.11555460887161191,
"grad_norm": 0.20423804223537445,
"learning_rate": 9.91821762685873e-05,
"loss": 0.153,
"step": 2170
},
{
"epoch": 0.11608711858991426,
"grad_norm": 0.18328100442886353,
"learning_rate": 9.916887735927326e-05,
"loss": 0.1515,
"step": 2180
},
{
"epoch": 0.11661962830821662,
"grad_norm": 0.22620701789855957,
"learning_rate": 9.915547249027795e-05,
"loss": 0.1514,
"step": 2190
},
{
"epoch": 0.11715213802651898,
"grad_norm": 0.23197805881500244,
"learning_rate": 9.914196170317074e-05,
"loss": 0.1526,
"step": 2200
},
{
"epoch": 0.11768464774482135,
"grad_norm": 0.320434033870697,
"learning_rate": 9.912834503984929e-05,
"loss": 0.1526,
"step": 2210
},
{
"epoch": 0.1182171574631237,
"grad_norm": 0.16544243693351746,
"learning_rate": 9.911462254253971e-05,
"loss": 0.1523,
"step": 2220
},
{
"epoch": 0.11874966718142606,
"grad_norm": 0.21110887825489044,
"learning_rate": 9.910079425379626e-05,
"loss": 0.1518,
"step": 2230
},
{
"epoch": 0.11928217689972842,
"grad_norm": 0.16100363433361053,
"learning_rate": 9.908686021650124e-05,
"loss": 0.152,
"step": 2240
},
{
"epoch": 0.11981468661803078,
"grad_norm": 0.14773668348789215,
"learning_rate": 9.907282047386497e-05,
"loss": 0.152,
"step": 2250
},
{
"epoch": 0.12034719633633313,
"grad_norm": 0.14374825358390808,
"learning_rate": 9.905867506942544e-05,
"loss": 0.1511,
"step": 2260
},
{
"epoch": 0.1208797060546355,
"grad_norm": 0.13494443893432617,
"learning_rate": 9.904442404704843e-05,
"loss": 0.1515,
"step": 2270
},
{
"epoch": 0.12141221577293786,
"grad_norm": 0.18906742334365845,
"learning_rate": 9.903006745092716e-05,
"loss": 0.1519,
"step": 2280
},
{
"epoch": 0.12194472549124022,
"grad_norm": 0.24630281329154968,
"learning_rate": 9.901560532558229e-05,
"loss": 0.1527,
"step": 2290
},
{
"epoch": 0.12247723520954257,
"grad_norm": 0.19350433349609375,
"learning_rate": 9.900103771586171e-05,
"loss": 0.1509,
"step": 2300
},
{
"epoch": 0.12300974492784493,
"grad_norm": 0.1677471250295639,
"learning_rate": 9.898636466694042e-05,
"loss": 0.1515,
"step": 2310
},
{
"epoch": 0.12354225464614729,
"grad_norm": 0.30285850167274475,
"learning_rate": 9.897158622432041e-05,
"loss": 0.1517,
"step": 2320
},
{
"epoch": 0.12407476436444966,
"grad_norm": 0.16860969364643097,
"learning_rate": 9.895670243383048e-05,
"loss": 0.1516,
"step": 2330
},
{
"epoch": 0.124607274082752,
"grad_norm": 0.10707177966833115,
"learning_rate": 9.894171334162614e-05,
"loss": 0.1517,
"step": 2340
},
{
"epoch": 0.12513978380105437,
"grad_norm": 0.16667041182518005,
"learning_rate": 9.892661899418945e-05,
"loss": 0.152,
"step": 2350
},
{
"epoch": 0.12567229351935671,
"grad_norm": 0.21757569909095764,
"learning_rate": 9.891141943832883e-05,
"loss": 0.1519,
"step": 2360
},
{
"epoch": 0.1262048032376591,
"grad_norm": 0.1555328369140625,
"learning_rate": 9.889611472117902e-05,
"loss": 0.1506,
"step": 2370
},
{
"epoch": 0.12673731295596144,
"grad_norm": 0.24122075736522675,
"learning_rate": 9.888070489020083e-05,
"loss": 0.1519,
"step": 2380
},
{
"epoch": 0.12726982267426382,
"grad_norm": 0.15034180879592896,
"learning_rate": 9.886518999318104e-05,
"loss": 0.1513,
"step": 2390
},
{
"epoch": 0.12780233239256616,
"grad_norm": 0.1593770682811737,
"learning_rate": 9.884957007823226e-05,
"loss": 0.1508,
"step": 2400
},
{
"epoch": 0.1283348421108685,
"grad_norm": 0.1536262482404709,
"learning_rate": 9.883384519379273e-05,
"loss": 0.1512,
"step": 2410
},
{
"epoch": 0.1288673518291709,
"grad_norm": 0.1585126668214798,
"learning_rate": 9.881801538862627e-05,
"loss": 0.1512,
"step": 2420
},
{
"epoch": 0.12939986154747324,
"grad_norm": 0.13629089295864105,
"learning_rate": 9.880208071182203e-05,
"loss": 0.1506,
"step": 2430
},
{
"epoch": 0.1299323712657756,
"grad_norm": 0.18671053647994995,
"learning_rate": 9.878604121279434e-05,
"loss": 0.1513,
"step": 2440
},
{
"epoch": 0.13046488098407796,
"grad_norm": 0.14679767191410065,
"learning_rate": 9.876989694128263e-05,
"loss": 0.1498,
"step": 2450
},
{
"epoch": 0.1309973907023803,
"grad_norm": 0.21141186356544495,
"learning_rate": 9.875364794735124e-05,
"loss": 0.1511,
"step": 2460
},
{
"epoch": 0.1315299004206827,
"grad_norm": 0.28956910967826843,
"learning_rate": 9.873729428138924e-05,
"loss": 0.1514,
"step": 2470
},
{
"epoch": 0.13206241013898504,
"grad_norm": 0.20682266354560852,
"learning_rate": 9.87208359941103e-05,
"loss": 0.1504,
"step": 2480
},
{
"epoch": 0.13259491985728739,
"grad_norm": 0.1642565280199051,
"learning_rate": 9.870427313655256e-05,
"loss": 0.1519,
"step": 2490
},
{
"epoch": 0.13312742957558976,
"grad_norm": 0.2124072164297104,
"learning_rate": 9.868760576007835e-05,
"loss": 0.1512,
"step": 2500
},
{
"epoch": 0.1336599392938921,
"grad_norm": 0.12310315668582916,
"learning_rate": 9.867083391637422e-05,
"loss": 0.1505,
"step": 2510
},
{
"epoch": 0.13419244901219446,
"grad_norm": 0.11877293884754181,
"learning_rate": 9.865395765745062e-05,
"loss": 0.1504,
"step": 2520
},
{
"epoch": 0.13472495873049684,
"grad_norm": 0.14998012781143188,
"learning_rate": 9.863697703564183e-05,
"loss": 0.1504,
"step": 2530
},
{
"epoch": 0.13525746844879918,
"grad_norm": 0.17364120483398438,
"learning_rate": 9.861989210360572e-05,
"loss": 0.1506,
"step": 2540
},
{
"epoch": 0.13578997816710156,
"grad_norm": 0.2578318119049072,
"learning_rate": 9.860270291432367e-05,
"loss": 0.15,
"step": 2550
},
{
"epoch": 0.1363224878854039,
"grad_norm": 0.19344595074653625,
"learning_rate": 9.858540952110036e-05,
"loss": 0.1506,
"step": 2560
},
{
"epoch": 0.13685499760370626,
"grad_norm": 0.12059523165225983,
"learning_rate": 9.856801197756362e-05,
"loss": 0.1504,
"step": 2570
},
{
"epoch": 0.13738750732200863,
"grad_norm": 0.14271683990955353,
"learning_rate": 9.855051033766424e-05,
"loss": 0.1503,
"step": 2580
},
{
"epoch": 0.13792001704031098,
"grad_norm": 0.1416383981704712,
"learning_rate": 9.853290465567582e-05,
"loss": 0.1499,
"step": 2590
},
{
"epoch": 0.13845252675861333,
"grad_norm": 0.2926510274410248,
"learning_rate": 9.851519498619462e-05,
"loss": 0.1505,
"step": 2600
},
{
"epoch": 0.1389850364769157,
"grad_norm": 0.169399231672287,
"learning_rate": 9.849738138413936e-05,
"loss": 0.1507,
"step": 2610
},
{
"epoch": 0.13951754619521806,
"grad_norm": 0.2096475064754486,
"learning_rate": 9.847946390475103e-05,
"loss": 0.1501,
"step": 2620
},
{
"epoch": 0.14005005591352043,
"grad_norm": 0.17753221094608307,
"learning_rate": 9.84614426035928e-05,
"loss": 0.1501,
"step": 2630
},
{
"epoch": 0.14058256563182278,
"grad_norm": 0.13106787204742432,
"learning_rate": 9.844331753654978e-05,
"loss": 0.1492,
"step": 2640
},
{
"epoch": 0.14111507535012513,
"grad_norm": 0.1869879961013794,
"learning_rate": 9.842508875982885e-05,
"loss": 0.1495,
"step": 2650
},
{
"epoch": 0.1416475850684275,
"grad_norm": 0.15248249471187592,
"learning_rate": 9.840675632995852e-05,
"loss": 0.1489,
"step": 2660
},
{
"epoch": 0.14218009478672985,
"grad_norm": 0.13266538083553314,
"learning_rate": 9.838832030378871e-05,
"loss": 0.1501,
"step": 2670
},
{
"epoch": 0.1427126045050322,
"grad_norm": 0.1557317078113556,
"learning_rate": 9.836978073849061e-05,
"loss": 0.1505,
"step": 2680
},
{
"epoch": 0.14324511422333458,
"grad_norm": 0.11426721513271332,
"learning_rate": 9.835113769155653e-05,
"loss": 0.1505,
"step": 2690
},
{
"epoch": 0.14377762394163693,
"grad_norm": 0.2386896014213562,
"learning_rate": 9.83323912207996e-05,
"loss": 0.1504,
"step": 2700
},
{
"epoch": 0.1443101336599393,
"grad_norm": 0.17772549390792847,
"learning_rate": 9.831354138435373e-05,
"loss": 0.1502,
"step": 2710
},
{
"epoch": 0.14484264337824165,
"grad_norm": 0.13859009742736816,
"learning_rate": 9.82945882406734e-05,
"loss": 0.1492,
"step": 2720
},
{
"epoch": 0.145375153096544,
"grad_norm": 0.16319668292999268,
"learning_rate": 9.827553184853333e-05,
"loss": 0.1495,
"step": 2730
},
{
"epoch": 0.14590766281484638,
"grad_norm": 0.17237436771392822,
"learning_rate": 9.82563722670286e-05,
"loss": 0.1497,
"step": 2740
},
{
"epoch": 0.14644017253314873,
"grad_norm": 0.22656778991222382,
"learning_rate": 9.823710955557413e-05,
"loss": 0.1494,
"step": 2750
},
{
"epoch": 0.14697268225145108,
"grad_norm": 0.12643253803253174,
"learning_rate": 9.821774377390474e-05,
"loss": 0.1497,
"step": 2760
},
{
"epoch": 0.14750519196975345,
"grad_norm": 0.1322176307439804,
"learning_rate": 9.819827498207481e-05,
"loss": 0.1494,
"step": 2770
},
{
"epoch": 0.1480377016880558,
"grad_norm": 0.13178198039531708,
"learning_rate": 9.817870324045824e-05,
"loss": 0.1497,
"step": 2780
},
{
"epoch": 0.14857021140635818,
"grad_norm": 0.16573889553546906,
"learning_rate": 9.815902860974812e-05,
"loss": 0.1484,
"step": 2790
},
{
"epoch": 0.14910272112466053,
"grad_norm": 0.17168866097927094,
"learning_rate": 9.813925115095663e-05,
"loss": 0.1498,
"step": 2800
},
{
"epoch": 0.14963523084296287,
"grad_norm": 0.11743076145648956,
"learning_rate": 9.811937092541483e-05,
"loss": 0.1489,
"step": 2810
},
{
"epoch": 0.15016774056126525,
"grad_norm": 0.09591732919216156,
"learning_rate": 9.809938799477247e-05,
"loss": 0.1492,
"step": 2820
},
{
"epoch": 0.1507002502795676,
"grad_norm": 0.2620985209941864,
"learning_rate": 9.807930242099777e-05,
"loss": 0.1484,
"step": 2830
},
{
"epoch": 0.15123275999786995,
"grad_norm": 0.1720651537179947,
"learning_rate": 9.805911426637723e-05,
"loss": 0.1489,
"step": 2840
},
{
"epoch": 0.15176526971617232,
"grad_norm": 0.12655815482139587,
"learning_rate": 9.803882359351556e-05,
"loss": 0.1489,
"step": 2850
},
{
"epoch": 0.15229777943447467,
"grad_norm": 0.1679336279630661,
"learning_rate": 9.801843046533527e-05,
"loss": 0.1474,
"step": 2860
},
{
"epoch": 0.15283028915277705,
"grad_norm": 0.16842088103294373,
"learning_rate": 9.799793494507667e-05,
"loss": 0.1487,
"step": 2870
},
{
"epoch": 0.1533627988710794,
"grad_norm": 0.15344814956188202,
"learning_rate": 9.797733709629755e-05,
"loss": 0.1491,
"step": 2880
},
{
"epoch": 0.15389530858938175,
"grad_norm": 0.14372903108596802,
"learning_rate": 9.795663698287305e-05,
"loss": 0.1486,
"step": 2890
},
{
"epoch": 0.15442781830768412,
"grad_norm": 0.14276905357837677,
"learning_rate": 9.793583466899541e-05,
"loss": 0.1483,
"step": 2900
},
{
"epoch": 0.15496032802598647,
"grad_norm": 0.21898868680000305,
"learning_rate": 9.791493021917384e-05,
"loss": 0.1473,
"step": 2910
},
{
"epoch": 0.15549283774428882,
"grad_norm": 0.21863441169261932,
"learning_rate": 9.789392369823423e-05,
"loss": 0.1472,
"step": 2920
},
{
"epoch": 0.1560253474625912,
"grad_norm": 0.14358623325824738,
"learning_rate": 9.787281517131905e-05,
"loss": 0.1479,
"step": 2930
},
{
"epoch": 0.15655785718089354,
"grad_norm": 0.12543822824954987,
"learning_rate": 9.785160470388706e-05,
"loss": 0.1479,
"step": 2940
},
{
"epoch": 0.15709036689919592,
"grad_norm": 0.14014865458011627,
"learning_rate": 9.783029236171317e-05,
"loss": 0.1474,
"step": 2950
},
{
"epoch": 0.15762287661749827,
"grad_norm": 0.15217439830303192,
"learning_rate": 9.78088782108882e-05,
"loss": 0.1478,
"step": 2960
},
{
"epoch": 0.15815538633580062,
"grad_norm": 0.10565731674432755,
"learning_rate": 9.778736231781864e-05,
"loss": 0.1472,
"step": 2970
},
{
"epoch": 0.158687896054103,
"grad_norm": 0.13025479018688202,
"learning_rate": 9.77657447492266e-05,
"loss": 0.147,
"step": 2980
},
{
"epoch": 0.15922040577240534,
"grad_norm": 0.26370614767074585,
"learning_rate": 9.774402557214934e-05,
"loss": 0.1468,
"step": 2990
},
{
"epoch": 0.1597529154907077,
"grad_norm": 0.11994566768407822,
"learning_rate": 9.772220485393935e-05,
"loss": 0.1468,
"step": 3000
},
{
"epoch": 0.16028542520901007,
"grad_norm": 0.13632826507091522,
"learning_rate": 9.770028266226392e-05,
"loss": 0.1465,
"step": 3010
},
{
"epoch": 0.16081793492731242,
"grad_norm": 0.18415699899196625,
"learning_rate": 9.767825906510508e-05,
"loss": 0.1461,
"step": 3020
},
{
"epoch": 0.1613504446456148,
"grad_norm": 0.1199774518609047,
"learning_rate": 9.765613413075925e-05,
"loss": 0.1462,
"step": 3030
},
{
"epoch": 0.16188295436391714,
"grad_norm": 0.12944312393665314,
"learning_rate": 9.763390792783718e-05,
"loss": 0.1456,
"step": 3040
},
{
"epoch": 0.1624154640822195,
"grad_norm": 0.12913690507411957,
"learning_rate": 9.761158052526357e-05,
"loss": 0.1461,
"step": 3050
},
{
"epoch": 0.16294797380052187,
"grad_norm": 0.13733190298080444,
"learning_rate": 9.758915199227704e-05,
"loss": 0.1454,
"step": 3060
},
{
"epoch": 0.16348048351882422,
"grad_norm": 0.13602448999881744,
"learning_rate": 9.756662239842977e-05,
"loss": 0.1453,
"step": 3070
},
{
"epoch": 0.16401299323712656,
"grad_norm": 0.23941437900066376,
"learning_rate": 9.754399181358735e-05,
"loss": 0.1447,
"step": 3080
},
{
"epoch": 0.16454550295542894,
"grad_norm": 0.17770028114318848,
"learning_rate": 9.752126030792852e-05,
"loss": 0.1448,
"step": 3090
},
{
"epoch": 0.1650780126737313,
"grad_norm": 0.12279467284679413,
"learning_rate": 9.749842795194502e-05,
"loss": 0.1445,
"step": 3100
},
{
"epoch": 0.16561052239203367,
"grad_norm": 0.16316959261894226,
"learning_rate": 9.747549481644132e-05,
"loss": 0.1436,
"step": 3110
},
{
"epoch": 0.16614303211033601,
"grad_norm": 0.1606248915195465,
"learning_rate": 9.74524609725344e-05,
"loss": 0.1447,
"step": 3120
},
{
"epoch": 0.16667554182863836,
"grad_norm": 0.14306576550006866,
"learning_rate": 9.742932649165357e-05,
"loss": 0.1448,
"step": 3130
},
{
"epoch": 0.16720805154694074,
"grad_norm": 0.16349278390407562,
"learning_rate": 9.740609144554018e-05,
"loss": 0.1443,
"step": 3140
},
{
"epoch": 0.1677405612652431,
"grad_norm": 0.26308995485305786,
"learning_rate": 9.738275590624748e-05,
"loss": 0.1436,
"step": 3150
},
{
"epoch": 0.16827307098354544,
"grad_norm": 0.13254424929618835,
"learning_rate": 9.735931994614034e-05,
"loss": 0.1436,
"step": 3160
},
{
"epoch": 0.1688055807018478,
"grad_norm": 0.14894609153270721,
"learning_rate": 9.733578363789504e-05,
"loss": 0.1428,
"step": 3170
},
{
"epoch": 0.16933809042015016,
"grad_norm": 0.2038808912038803,
"learning_rate": 9.731214705449902e-05,
"loss": 0.1427,
"step": 3180
},
{
"epoch": 0.16987060013845254,
"grad_norm": 0.1420915275812149,
"learning_rate": 9.728841026925072e-05,
"loss": 0.1431,
"step": 3190
},
{
"epoch": 0.1704031098567549,
"grad_norm": 0.12903986871242523,
"learning_rate": 9.726457335575931e-05,
"loss": 0.1426,
"step": 3200
},
{
"epoch": 0.17093561957505723,
"grad_norm": 0.15605418384075165,
"learning_rate": 9.724063638794445e-05,
"loss": 0.1435,
"step": 3210
},
{
"epoch": 0.1714681292933596,
"grad_norm": 0.179864302277565,
"learning_rate": 9.721659944003605e-05,
"loss": 0.1418,
"step": 3220
},
{
"epoch": 0.17200063901166196,
"grad_norm": 0.13642147183418274,
"learning_rate": 9.719246258657408e-05,
"loss": 0.1425,
"step": 3230
},
{
"epoch": 0.1725331487299643,
"grad_norm": 0.19007375836372375,
"learning_rate": 9.716822590240835e-05,
"loss": 0.1429,
"step": 3240
},
{
"epoch": 0.17306565844826668,
"grad_norm": 0.11586272716522217,
"learning_rate": 9.714388946269824e-05,
"loss": 0.1423,
"step": 3250
},
{
"epoch": 0.17359816816656903,
"grad_norm": 0.09750824421644211,
"learning_rate": 9.711945334291243e-05,
"loss": 0.1421,
"step": 3260
},
{
"epoch": 0.1741306778848714,
"grad_norm": 0.16159775853157043,
"learning_rate": 9.709491761882881e-05,
"loss": 0.1422,
"step": 3270
},
{
"epoch": 0.17466318760317376,
"grad_norm": 0.1439363956451416,
"learning_rate": 9.707028236653406e-05,
"loss": 0.1428,
"step": 3280
},
{
"epoch": 0.1751956973214761,
"grad_norm": 0.15214209258556366,
"learning_rate": 9.704554766242351e-05,
"loss": 0.1419,
"step": 3290
},
{
"epoch": 0.17572820703977848,
"grad_norm": 0.14261415600776672,
"learning_rate": 9.702071358320095e-05,
"loss": 0.142,
"step": 3300
},
{
"epoch": 0.17626071675808083,
"grad_norm": 0.17040428519248962,
"learning_rate": 9.69957802058783e-05,
"loss": 0.1426,
"step": 3310
},
{
"epoch": 0.17679322647638318,
"grad_norm": 0.11275117844343185,
"learning_rate": 9.697074760777542e-05,
"loss": 0.1423,
"step": 3320
},
{
"epoch": 0.17732573619468556,
"grad_norm": 0.11213172972202301,
"learning_rate": 9.694561586651985e-05,
"loss": 0.1416,
"step": 3330
},
{
"epoch": 0.1778582459129879,
"grad_norm": 0.08868248015642166,
"learning_rate": 9.692038506004659e-05,
"loss": 0.1414,
"step": 3340
},
{
"epoch": 0.17839075563129028,
"grad_norm": 0.09049142897129059,
"learning_rate": 9.689505526659783e-05,
"loss": 0.1409,
"step": 3350
},
{
"epoch": 0.17892326534959263,
"grad_norm": 0.21919691562652588,
"learning_rate": 9.686962656472278e-05,
"loss": 0.1423,
"step": 3360
},
{
"epoch": 0.17945577506789498,
"grad_norm": 0.11099066585302353,
"learning_rate": 9.684409903327728e-05,
"loss": 0.1417,
"step": 3370
},
{
"epoch": 0.17998828478619736,
"grad_norm": 0.10436002165079117,
"learning_rate": 9.681847275142371e-05,
"loss": 0.1413,
"step": 3380
},
{
"epoch": 0.1805207945044997,
"grad_norm": 0.11029750108718872,
"learning_rate": 9.679274779863065e-05,
"loss": 0.1407,
"step": 3390
},
{
"epoch": 0.18105330422280205,
"grad_norm": 0.15080855786800385,
"learning_rate": 9.67669242546727e-05,
"loss": 0.1414,
"step": 3400
},
{
"epoch": 0.18158581394110443,
"grad_norm": 0.11175508052110672,
"learning_rate": 9.674100219963018e-05,
"loss": 0.1407,
"step": 3410
},
{
"epoch": 0.18211832365940678,
"grad_norm": 0.10869117826223373,
"learning_rate": 9.671498171388889e-05,
"loss": 0.1401,
"step": 3420
},
{
"epoch": 0.18265083337770915,
"grad_norm": 0.14162185788154602,
"learning_rate": 9.668886287813985e-05,
"loss": 0.1406,
"step": 3430
},
{
"epoch": 0.1831833430960115,
"grad_norm": 0.2033168226480484,
"learning_rate": 9.666264577337908e-05,
"loss": 0.1407,
"step": 3440
},
{
"epoch": 0.18371585281431385,
"grad_norm": 3.439692497253418,
"learning_rate": 9.663633048090744e-05,
"loss": 0.1527,
"step": 3450
},
{
"epoch": 0.18424836253261623,
"grad_norm": 3.6431403160095215,
"learning_rate": 9.660991708233009e-05,
"loss": 0.7171,
"step": 3460
},
{
"epoch": 0.18478087225091858,
"grad_norm": 1.1392711400985718,
"learning_rate": 9.658340565955654e-05,
"loss": 0.3094,
"step": 3470
},
{
"epoch": 0.18531338196922092,
"grad_norm": 0.24986179172992706,
"learning_rate": 9.655679629480032e-05,
"loss": 0.2164,
"step": 3480
},
{
"epoch": 0.1858458916875233,
"grad_norm": 0.16256773471832275,
"learning_rate": 9.653008907057855e-05,
"loss": 0.1672,
"step": 3490
},
{
"epoch": 0.18637840140582565,
"grad_norm": 0.11707092821598053,
"learning_rate": 9.65032840697119e-05,
"loss": 0.1552,
"step": 3500
},
{
"epoch": 0.18691091112412803,
"grad_norm": 0.12210855633020401,
"learning_rate": 9.647638137532428e-05,
"loss": 0.1495,
"step": 3510
},
{
"epoch": 0.18744342084243037,
"grad_norm": 0.10740665346384048,
"learning_rate": 9.644938107084247e-05,
"loss": 0.1468,
"step": 3520
},
{
"epoch": 0.18797593056073272,
"grad_norm": 0.09589366614818573,
"learning_rate": 9.642228323999603e-05,
"loss": 0.1453,
"step": 3530
},
{
"epoch": 0.1885084402790351,
"grad_norm": 0.12820713222026825,
"learning_rate": 9.639508796681688e-05,
"loss": 0.1439,
"step": 3540
},
{
"epoch": 0.18904094999733745,
"grad_norm": 0.3186265528202057,
"learning_rate": 9.636779533563915e-05,
"loss": 0.1429,
"step": 3550
},
{
"epoch": 0.1895734597156398,
"grad_norm": 0.1107301339507103,
"learning_rate": 9.63404054310989e-05,
"loss": 0.1427,
"step": 3560
},
{
"epoch": 0.19010596943394217,
"grad_norm": 0.0950065553188324,
"learning_rate": 9.631291833813383e-05,
"loss": 0.1425,
"step": 3570
},
{
"epoch": 0.19063847915224452,
"grad_norm": 0.1297433227300644,
"learning_rate": 9.628533414198298e-05,
"loss": 0.1419,
"step": 3580
},
{
"epoch": 0.1911709888705469,
"grad_norm": 0.1097961962223053,
"learning_rate": 9.625765292818658e-05,
"loss": 0.1413,
"step": 3590
},
{
"epoch": 0.19170349858884925,
"grad_norm": 0.11060044914484024,
"learning_rate": 9.622987478258567e-05,
"loss": 0.1413,
"step": 3600
},
{
"epoch": 0.1922360083071516,
"grad_norm": 0.11111301183700562,
"learning_rate": 9.620199979132191e-05,
"loss": 0.1408,
"step": 3610
},
{
"epoch": 0.19276851802545397,
"grad_norm": 0.08389998227357864,
"learning_rate": 9.617402804083729e-05,
"loss": 0.141,
"step": 3620
},
{
"epoch": 0.19330102774375632,
"grad_norm": 0.1361977458000183,
"learning_rate": 9.61459596178738e-05,
"loss": 0.1409,
"step": 3630
},
{
"epoch": 0.19383353746205867,
"grad_norm": 0.12257982045412064,
"learning_rate": 9.61177946094733e-05,
"loss": 0.1405,
"step": 3640
},
{
"epoch": 0.19436604718036105,
"grad_norm": 0.09580480307340622,
"learning_rate": 9.608953310297708e-05,
"loss": 0.141,
"step": 3650
},
{
"epoch": 0.1948985568986634,
"grad_norm": 0.09593943506479263,
"learning_rate": 9.606117518602575e-05,
"loss": 0.1404,
"step": 3660
},
{
"epoch": 0.19543106661696577,
"grad_norm": 0.09266688674688339,
"learning_rate": 9.603272094655886e-05,
"loss": 0.1404,
"step": 3670
},
{
"epoch": 0.19596357633526812,
"grad_norm": 0.1070714071393013,
"learning_rate": 9.600417047281464e-05,
"loss": 0.1398,
"step": 3680
},
{
"epoch": 0.19649608605357047,
"grad_norm": 0.10331781953573227,
"learning_rate": 9.597552385332982e-05,
"loss": 0.1401,
"step": 3690
},
{
"epoch": 0.19702859577187284,
"grad_norm": 0.09512060880661011,
"learning_rate": 9.594678117693921e-05,
"loss": 0.1394,
"step": 3700
},
{
"epoch": 0.1975611054901752,
"grad_norm": 0.0843188613653183,
"learning_rate": 9.591794253277551e-05,
"loss": 0.14,
"step": 3710
},
{
"epoch": 0.19809361520847754,
"grad_norm": 0.0754111111164093,
"learning_rate": 9.588900801026907e-05,
"loss": 0.1406,
"step": 3720
},
{
"epoch": 0.19862612492677992,
"grad_norm": 0.09565232694149017,
"learning_rate": 9.585997769914752e-05,
"loss": 0.1399,
"step": 3730
},
{
"epoch": 0.19915863464508227,
"grad_norm": 0.10425586253404617,
"learning_rate": 9.583085168943555e-05,
"loss": 0.1391,
"step": 3740
},
{
"epoch": 0.19969114436338464,
"grad_norm": 0.1333099901676178,
"learning_rate": 9.580163007145459e-05,
"loss": 0.1402,
"step": 3750
},
{
"epoch": 0.200223654081687,
"grad_norm": 0.12000375241041183,
"learning_rate": 9.57723129358226e-05,
"loss": 0.1401,
"step": 3760
},
{
"epoch": 0.20075616379998934,
"grad_norm": 0.08403091132640839,
"learning_rate": 9.574290037345375e-05,
"loss": 0.1393,
"step": 3770
},
{
"epoch": 0.20128867351829172,
"grad_norm": 0.08062135428190231,
"learning_rate": 9.571339247555809e-05,
"loss": 0.1396,
"step": 3780
},
{
"epoch": 0.20182118323659407,
"grad_norm": 0.17963799834251404,
"learning_rate": 9.568378933364131e-05,
"loss": 0.14,
"step": 3790
},
{
"epoch": 0.2023536929548964,
"grad_norm": 0.09086289256811142,
"learning_rate": 9.565409103950451e-05,
"loss": 0.1397,
"step": 3800
},
{
"epoch": 0.2028862026731988,
"grad_norm": 0.10271194577217102,
"learning_rate": 9.562429768524381e-05,
"loss": 0.1394,
"step": 3810
},
{
"epoch": 0.20341871239150114,
"grad_norm": 0.076598159968853,
"learning_rate": 9.559440936325017e-05,
"loss": 0.1395,
"step": 3820
},
{
"epoch": 0.20395122210980351,
"grad_norm": 0.1196560189127922,
"learning_rate": 9.556442616620899e-05,
"loss": 0.1388,
"step": 3830
},
{
"epoch": 0.20448373182810586,
"grad_norm": 0.082634337246418,
"learning_rate": 9.553434818709992e-05,
"loss": 0.1394,
"step": 3840
},
{
"epoch": 0.2050162415464082,
"grad_norm": 0.13070203363895416,
"learning_rate": 9.550417551919655e-05,
"loss": 0.1393,
"step": 3850
},
{
"epoch": 0.2055487512647106,
"grad_norm": 0.21860548853874207,
"learning_rate": 9.547390825606606e-05,
"loss": 0.1387,
"step": 3860
},
{
"epoch": 0.20608126098301294,
"grad_norm": 0.08260785788297653,
"learning_rate": 9.544354649156899e-05,
"loss": 0.139,
"step": 3870
},
{
"epoch": 0.20661377070131529,
"grad_norm": 0.0788755938410759,
"learning_rate": 9.541309031985895e-05,
"loss": 0.1392,
"step": 3880
},
{
"epoch": 0.20714628041961766,
"grad_norm": 0.12903687357902527,
"learning_rate": 9.538253983538232e-05,
"loss": 0.1395,
"step": 3890
},
{
"epoch": 0.20767879013792,
"grad_norm": 0.19277387857437134,
"learning_rate": 9.535189513287792e-05,
"loss": 0.1389,
"step": 3900
},
{
"epoch": 0.2082112998562224,
"grad_norm": 0.1530824899673462,
"learning_rate": 9.532115630737674e-05,
"loss": 0.1388,
"step": 3910
},
{
"epoch": 0.20874380957452474,
"grad_norm": 0.0937756597995758,
"learning_rate": 9.52903234542017e-05,
"loss": 0.1398,
"step": 3920
},
{
"epoch": 0.20927631929282708,
"grad_norm": 0.12323369830846786,
"learning_rate": 9.52593966689673e-05,
"loss": 0.14,
"step": 3930
},
{
"epoch": 0.20980882901112946,
"grad_norm": 0.14708684384822845,
"learning_rate": 9.522837604757924e-05,
"loss": 0.1388,
"step": 3940
},
{
"epoch": 0.2103413387294318,
"grad_norm": 0.10080372542142868,
"learning_rate": 9.519726168623433e-05,
"loss": 0.1379,
"step": 3950
},
{
"epoch": 0.21087384844773416,
"grad_norm": 0.11739426851272583,
"learning_rate": 9.516605368141998e-05,
"loss": 0.1388,
"step": 3960
},
{
"epoch": 0.21140635816603653,
"grad_norm": 0.0870957151055336,
"learning_rate": 9.513475212991406e-05,
"loss": 0.1388,
"step": 3970
},
{
"epoch": 0.21193886788433888,
"grad_norm": 0.12498774379491806,
"learning_rate": 9.510335712878446e-05,
"loss": 0.139,
"step": 3980
},
{
"epoch": 0.21247137760264126,
"grad_norm": 0.13790611922740936,
"learning_rate": 9.507186877538899e-05,
"loss": 0.1391,
"step": 3990
},
{
"epoch": 0.2130038873209436,
"grad_norm": 0.09463178366422653,
"learning_rate": 9.504028716737481e-05,
"loss": 0.1387,
"step": 4000
},
{
"epoch": 0.21353639703924596,
"grad_norm": 0.0896778553724289,
"learning_rate": 9.500861240267836e-05,
"loss": 0.1384,
"step": 4010
},
{
"epoch": 0.21406890675754833,
"grad_norm": 0.09591860324144363,
"learning_rate": 9.49768445795249e-05,
"loss": 0.1379,
"step": 4020
},
{
"epoch": 0.21460141647585068,
"grad_norm": 0.12393760681152344,
"learning_rate": 9.49449837964283e-05,
"loss": 0.138,
"step": 4030
},
{
"epoch": 0.21513392619415303,
"grad_norm": 0.1211247369647026,
"learning_rate": 9.491303015219075e-05,
"loss": 0.1381,
"step": 4040
},
{
"epoch": 0.2156664359124554,
"grad_norm": 0.09435896575450897,
"learning_rate": 9.488098374590232e-05,
"loss": 0.1384,
"step": 4050
},
{
"epoch": 0.21619894563075776,
"grad_norm": 0.2162541151046753,
"learning_rate": 9.484884467694082e-05,
"loss": 0.1386,
"step": 4060
},
{
"epoch": 0.21673145534906013,
"grad_norm": 0.11500007659196854,
"learning_rate": 9.481661304497136e-05,
"loss": 0.139,
"step": 4070
},
{
"epoch": 0.21726396506736248,
"grad_norm": 0.12248394638299942,
"learning_rate": 9.478428894994612e-05,
"loss": 0.1387,
"step": 4080
},
{
"epoch": 0.21779647478566483,
"grad_norm": 0.08362865447998047,
"learning_rate": 9.475187249210396e-05,
"loss": 0.1386,
"step": 4090
},
{
"epoch": 0.2183289845039672,
"grad_norm": 0.09139638394117355,
"learning_rate": 9.471936377197025e-05,
"loss": 0.1382,
"step": 4100
},
{
"epoch": 0.21886149422226955,
"grad_norm": 0.08786854147911072,
"learning_rate": 9.468676289035643e-05,
"loss": 0.1377,
"step": 4110
},
{
"epoch": 0.2193940039405719,
"grad_norm": 0.09356456995010376,
"learning_rate": 9.465406994835972e-05,
"loss": 0.1377,
"step": 4120
},
{
"epoch": 0.21992651365887428,
"grad_norm": 0.07609741389751434,
"learning_rate": 9.462128504736286e-05,
"loss": 0.1382,
"step": 4130
},
{
"epoch": 0.22045902337717663,
"grad_norm": 0.12431398779153824,
"learning_rate": 9.458840828903368e-05,
"loss": 0.1382,
"step": 4140
},
{
"epoch": 0.220991533095479,
"grad_norm": 0.1060996949672699,
"learning_rate": 9.4555439775325e-05,
"loss": 0.1382,
"step": 4150
},
{
"epoch": 0.22152404281378135,
"grad_norm": 0.07396227866411209,
"learning_rate": 9.452237960847405e-05,
"loss": 0.1379,
"step": 4160
},
{
"epoch": 0.2220565525320837,
"grad_norm": 0.07973285764455795,
"learning_rate": 9.448922789100238e-05,
"loss": 0.1376,
"step": 4170
},
{
"epoch": 0.22258906225038608,
"grad_norm": 0.08671050518751144,
"learning_rate": 9.445598472571535e-05,
"loss": 0.1387,
"step": 4180
},
{
"epoch": 0.22312157196868843,
"grad_norm": 0.07978523522615433,
"learning_rate": 9.442265021570198e-05,
"loss": 0.1379,
"step": 4190
},
{
"epoch": 0.22365408168699077,
"grad_norm": 0.07077804952859879,
"learning_rate": 9.438922446433454e-05,
"loss": 0.1382,
"step": 4200
},
{
"epoch": 0.22418659140529315,
"grad_norm": 0.08330279588699341,
"learning_rate": 9.435570757526823e-05,
"loss": 0.1374,
"step": 4210
},
{
"epoch": 0.2247191011235955,
"grad_norm": 0.1032426580786705,
"learning_rate": 9.432209965244085e-05,
"loss": 0.1382,
"step": 4220
},
{
"epoch": 0.22525161084189788,
"grad_norm": 0.10053195804357529,
"learning_rate": 9.428840080007255e-05,
"loss": 0.1373,
"step": 4230
},
{
"epoch": 0.22578412056020022,
"grad_norm": 0.08895772695541382,
"learning_rate": 9.425461112266545e-05,
"loss": 0.1379,
"step": 4240
},
{
"epoch": 0.22631663027850257,
"grad_norm": 0.08644817024469376,
"learning_rate": 9.422073072500328e-05,
"loss": 0.1381,
"step": 4250
},
{
"epoch": 0.22684913999680495,
"grad_norm": 0.07521601766347885,
"learning_rate": 9.418675971215113e-05,
"loss": 0.1377,
"step": 4260
},
{
"epoch": 0.2273816497151073,
"grad_norm": 0.0699540451169014,
"learning_rate": 9.415269818945513e-05,
"loss": 0.1378,
"step": 4270
},
{
"epoch": 0.22791415943340965,
"grad_norm": 0.16751086711883545,
"learning_rate": 9.411854626254202e-05,
"loss": 0.1371,
"step": 4280
},
{
"epoch": 0.22844666915171202,
"grad_norm": 0.11966162919998169,
"learning_rate": 9.408430403731891e-05,
"loss": 0.1374,
"step": 4290
},
{
"epoch": 0.22897917887001437,
"grad_norm": 0.0889548733830452,
"learning_rate": 9.404997161997295e-05,
"loss": 0.1376,
"step": 4300
},
{
"epoch": 0.22951168858831675,
"grad_norm": 0.10110121965408325,
"learning_rate": 9.4015549116971e-05,
"loss": 0.1379,
"step": 4310
},
{
"epoch": 0.2300441983066191,
"grad_norm": 0.10038761049509048,
"learning_rate": 9.398103663505917e-05,
"loss": 0.1368,
"step": 4320
},
{
"epoch": 0.23057670802492145,
"grad_norm": 0.14014580845832825,
"learning_rate": 9.394643428126272e-05,
"loss": 0.1369,
"step": 4330
},
{
"epoch": 0.23110921774322382,
"grad_norm": 0.07071101665496826,
"learning_rate": 9.391174216288561e-05,
"loss": 0.1375,
"step": 4340
},
{
"epoch": 0.23164172746152617,
"grad_norm": 0.08393870294094086,
"learning_rate": 9.387696038751006e-05,
"loss": 0.1378,
"step": 4350
},
{
"epoch": 0.23217423717982852,
"grad_norm": 0.18702152371406555,
"learning_rate": 9.384208906299641e-05,
"loss": 0.1379,
"step": 4360
},
{
"epoch": 0.2327067468981309,
"grad_norm": 0.12593114376068115,
"learning_rate": 9.380712829748266e-05,
"loss": 0.1377,
"step": 4370
},
{
"epoch": 0.23323925661643324,
"grad_norm": 0.1111498549580574,
"learning_rate": 9.37720781993842e-05,
"loss": 0.138,
"step": 4380
},
{
"epoch": 0.23377176633473562,
"grad_norm": 0.11401405185461044,
"learning_rate": 9.37369388773934e-05,
"loss": 0.1373,
"step": 4390
},
{
"epoch": 0.23430427605303797,
"grad_norm": 0.07457905262708664,
"learning_rate": 9.370171044047937e-05,
"loss": 0.1376,
"step": 4400
},
{
"epoch": 0.23483678577134032,
"grad_norm": 0.07042038440704346,
"learning_rate": 9.366639299788758e-05,
"loss": 0.1379,
"step": 4410
},
{
"epoch": 0.2353692954896427,
"grad_norm": 0.06665973365306854,
"learning_rate": 9.363098665913941e-05,
"loss": 0.1368,
"step": 4420
},
{
"epoch": 0.23590180520794504,
"grad_norm": 0.06450683623552322,
"learning_rate": 9.3595491534032e-05,
"loss": 0.1376,
"step": 4430
},
{
"epoch": 0.2364343149262474,
"grad_norm": 0.0732714980840683,
"learning_rate": 9.355990773263782e-05,
"loss": 0.137,
"step": 4440
},
{
"epoch": 0.23696682464454977,
"grad_norm": 0.09271900355815887,
"learning_rate": 9.352423536530432e-05,
"loss": 0.1366,
"step": 4450
},
{
"epoch": 0.23749933436285212,
"grad_norm": 0.35426032543182373,
"learning_rate": 9.34884745426536e-05,
"loss": 0.1383,
"step": 4460
},
{
"epoch": 0.2380318440811545,
"grad_norm": 0.0786311998963356,
"learning_rate": 9.3452625375582e-05,
"loss": 0.1378,
"step": 4470
},
{
"epoch": 0.23856435379945684,
"grad_norm": 0.07759775966405869,
"learning_rate": 9.341668797525993e-05,
"loss": 0.1373,
"step": 4480
},
{
"epoch": 0.2390968635177592,
"grad_norm": 0.06604979932308197,
"learning_rate": 9.338066245313134e-05,
"loss": 0.1376,
"step": 4490
},
{
"epoch": 0.23962937323606157,
"grad_norm": 0.06817334145307541,
"learning_rate": 9.334454892091349e-05,
"loss": 0.1368,
"step": 4500
},
{
"epoch": 0.24016188295436391,
"grad_norm": 0.07450228929519653,
"learning_rate": 9.330834749059654e-05,
"loss": 0.1368,
"step": 4510
},
{
"epoch": 0.24069439267266626,
"grad_norm": 0.09656868129968643,
"learning_rate": 9.327205827444322e-05,
"loss": 0.1365,
"step": 4520
},
{
"epoch": 0.24122690239096864,
"grad_norm": 0.1612931787967682,
"learning_rate": 9.323568138498855e-05,
"loss": 0.1374,
"step": 4530
},
{
"epoch": 0.241759412109271,
"grad_norm": 0.1097157672047615,
"learning_rate": 9.319921693503935e-05,
"loss": 0.1374,
"step": 4540
},
{
"epoch": 0.24229192182757336,
"grad_norm": 0.07911382615566254,
"learning_rate": 9.316266503767402e-05,
"loss": 0.1375,
"step": 4550
},
{
"epoch": 0.2428244315458757,
"grad_norm": 0.07367183268070221,
"learning_rate": 9.31260258062421e-05,
"loss": 0.1372,
"step": 4560
},
{
"epoch": 0.24335694126417806,
"grad_norm": 0.0891132801771164,
"learning_rate": 9.308929935436404e-05,
"loss": 0.1362,
"step": 4570
},
{
"epoch": 0.24388945098248044,
"grad_norm": 0.10352007299661636,
"learning_rate": 9.305248579593064e-05,
"loss": 0.1378,
"step": 4580
},
{
"epoch": 0.2444219607007828,
"grad_norm": 0.10638166218996048,
"learning_rate": 9.301558524510293e-05,
"loss": 0.1367,
"step": 4590
},
{
"epoch": 0.24495447041908514,
"grad_norm": 0.07406862825155258,
"learning_rate": 9.297859781631166e-05,
"loss": 0.1365,
"step": 4600
},
{
"epoch": 0.2454869801373875,
"grad_norm": 0.10420636832714081,
"learning_rate": 9.294152362425701e-05,
"loss": 0.1372,
"step": 4610
},
{
"epoch": 0.24601948985568986,
"grad_norm": 0.0823531523346901,
"learning_rate": 9.290436278390821e-05,
"loss": 0.1367,
"step": 4620
},
{
"epoch": 0.24655199957399224,
"grad_norm": 0.08028628677129745,
"learning_rate": 9.286711541050322e-05,
"loss": 0.1371,
"step": 4630
},
{
"epoch": 0.24708450929229459,
"grad_norm": 0.13905274868011475,
"learning_rate": 9.282978161954825e-05,
"loss": 0.1368,
"step": 4640
},
{
"epoch": 0.24761701901059693,
"grad_norm": 0.07757926732301712,
"learning_rate": 9.279236152681763e-05,
"loss": 0.1369,
"step": 4650
},
{
"epoch": 0.2481495287288993,
"grad_norm": 0.08740050345659256,
"learning_rate": 9.275485524835319e-05,
"loss": 0.1371,
"step": 4660
},
{
"epoch": 0.24868203844720166,
"grad_norm": 0.07663418352603912,
"learning_rate": 9.271726290046413e-05,
"loss": 0.1375,
"step": 4670
},
{
"epoch": 0.249214548165504,
"grad_norm": 0.11915243417024612,
"learning_rate": 9.267958459972652e-05,
"loss": 0.1365,
"step": 4680
},
{
"epoch": 0.24974705788380638,
"grad_norm": 0.11448535323143005,
"learning_rate": 9.264182046298294e-05,
"loss": 0.137,
"step": 4690
},
{
"epoch": 0.25027956760210873,
"grad_norm": 0.09888239204883575,
"learning_rate": 9.260397060734219e-05,
"loss": 0.1373,
"step": 4700
},
{
"epoch": 0.2508120773204111,
"grad_norm": 0.12656264007091522,
"learning_rate": 9.256603515017885e-05,
"loss": 0.1364,
"step": 4710
},
{
"epoch": 0.25134458703871343,
"grad_norm": 0.1620924174785614,
"learning_rate": 9.252801420913304e-05,
"loss": 0.136,
"step": 4720
},
{
"epoch": 0.2518770967570158,
"grad_norm": 0.07459171861410141,
"learning_rate": 9.24899079021099e-05,
"loss": 0.1373,
"step": 4730
},
{
"epoch": 0.2524096064753182,
"grad_norm": 0.09508336335420609,
"learning_rate": 9.245171634727926e-05,
"loss": 0.1363,
"step": 4740
},
{
"epoch": 0.25294211619362056,
"grad_norm": 0.07613290101289749,
"learning_rate": 9.241343966307543e-05,
"loss": 0.1363,
"step": 4750
},
{
"epoch": 0.2534746259119229,
"grad_norm": 0.08999643474817276,
"learning_rate": 9.237507796819662e-05,
"loss": 0.1365,
"step": 4760
},
{
"epoch": 0.25400713563022526,
"grad_norm": 0.11467399448156357,
"learning_rate": 9.233663138160464e-05,
"loss": 0.1364,
"step": 4770
},
{
"epoch": 0.25453964534852763,
"grad_norm": 0.0825829803943634,
"learning_rate": 9.229810002252464e-05,
"loss": 0.1367,
"step": 4780
},
{
"epoch": 0.25507215506682995,
"grad_norm": 0.08100995421409607,
"learning_rate": 9.225948401044457e-05,
"loss": 0.137,
"step": 4790
},
{
"epoch": 0.25560466478513233,
"grad_norm": 0.08392170816659927,
"learning_rate": 9.222078346511502e-05,
"loss": 0.1366,
"step": 4800
},
{
"epoch": 0.2561371745034347,
"grad_norm": 0.061139799654483795,
"learning_rate": 9.218199850654854e-05,
"loss": 0.1368,
"step": 4810
},
{
"epoch": 0.256669684221737,
"grad_norm": 0.1355183869600296,
"learning_rate": 9.21431292550196e-05,
"loss": 0.1363,
"step": 4820
},
{
"epoch": 0.2572021939400394,
"grad_norm": 0.08287263661623001,
"learning_rate": 9.210417583106401e-05,
"loss": 0.1363,
"step": 4830
},
{
"epoch": 0.2577347036583418,
"grad_norm": 0.0793054848909378,
"learning_rate": 9.206513835547861e-05,
"loss": 0.1362,
"step": 4840
},
{
"epoch": 0.2582672133766441,
"grad_norm": 0.09595254063606262,
"learning_rate": 9.202601694932087e-05,
"loss": 0.136,
"step": 4850
},
{
"epoch": 0.2587997230949465,
"grad_norm": 0.07301712781190872,
"learning_rate": 9.198681173390858e-05,
"loss": 0.1371,
"step": 4860
},
{
"epoch": 0.25933223281324885,
"grad_norm": 0.11917870491743088,
"learning_rate": 9.194752283081937e-05,
"loss": 0.137,
"step": 4870
},
{
"epoch": 0.2598647425315512,
"grad_norm": 0.07802341878414154,
"learning_rate": 9.190815036189042e-05,
"loss": 0.1363,
"step": 4880
},
{
"epoch": 0.26039725224985355,
"grad_norm": 0.07218264043331146,
"learning_rate": 9.186869444921808e-05,
"loss": 0.136,
"step": 4890
},
{
"epoch": 0.2609297619681559,
"grad_norm": 0.07441945374011993,
"learning_rate": 9.182915521515745e-05,
"loss": 0.137,
"step": 4900
},
{
"epoch": 0.2614622716864583,
"grad_norm": 0.16663500666618347,
"learning_rate": 9.178953278232193e-05,
"loss": 0.1369,
"step": 4910
},
{
"epoch": 0.2619947814047606,
"grad_norm": 0.09314275532960892,
"learning_rate": 9.174982727358306e-05,
"loss": 0.1359,
"step": 4920
},
{
"epoch": 0.262527291123063,
"grad_norm": 0.07567309588193893,
"learning_rate": 9.171003881206992e-05,
"loss": 0.1364,
"step": 4930
},
{
"epoch": 0.2630598008413654,
"grad_norm": 0.10356537252664566,
"learning_rate": 9.167016752116883e-05,
"loss": 0.1367,
"step": 4940
},
{
"epoch": 0.2635923105596677,
"grad_norm": 0.07590640336275101,
"learning_rate": 9.163021352452302e-05,
"loss": 0.1359,
"step": 4950
},
{
"epoch": 0.2641248202779701,
"grad_norm": 0.09072890132665634,
"learning_rate": 9.159017694603214e-05,
"loss": 0.1367,
"step": 4960
},
{
"epoch": 0.26465732999627245,
"grad_norm": 0.10349312424659729,
"learning_rate": 9.155005790985197e-05,
"loss": 0.136,
"step": 4970
},
{
"epoch": 0.26518983971457477,
"grad_norm": 0.08810363709926605,
"learning_rate": 9.150985654039394e-05,
"loss": 0.1362,
"step": 4980
},
{
"epoch": 0.26572234943287715,
"grad_norm": 0.13400596380233765,
"learning_rate": 9.14695729623249e-05,
"loss": 0.1362,
"step": 4990
},
{
"epoch": 0.2662548591511795,
"grad_norm": 0.12602917850017548,
"learning_rate": 9.142920730056652e-05,
"loss": 0.1353,
"step": 5000
},
{
"epoch": 0.26678736886948184,
"grad_norm": 0.08865707367658615,
"learning_rate": 9.138875968029512e-05,
"loss": 0.1361,
"step": 5010
},
{
"epoch": 0.2673198785877842,
"grad_norm": 0.11873424053192139,
"learning_rate": 9.13482302269411e-05,
"loss": 0.1371,
"step": 5020
},
{
"epoch": 0.2678523883060866,
"grad_norm": 0.08005053550004959,
"learning_rate": 9.13076190661887e-05,
"loss": 0.1356,
"step": 5030
},
{
"epoch": 0.2683848980243889,
"grad_norm": 0.14600218832492828,
"learning_rate": 9.126692632397543e-05,
"loss": 0.1359,
"step": 5040
},
{
"epoch": 0.2689174077426913,
"grad_norm": 0.09237764775753021,
"learning_rate": 9.122615212649189e-05,
"loss": 0.136,
"step": 5050
},
{
"epoch": 0.26944991746099367,
"grad_norm": 0.06887295097112656,
"learning_rate": 9.118529660018125e-05,
"loss": 0.1371,
"step": 5060
},
{
"epoch": 0.26998242717929605,
"grad_norm": 0.07038972526788712,
"learning_rate": 9.114435987173886e-05,
"loss": 0.1365,
"step": 5070
},
{
"epoch": 0.27051493689759837,
"grad_norm": 0.06756497174501419,
"learning_rate": 9.110334206811195e-05,
"loss": 0.1359,
"step": 5080
},
{
"epoch": 0.27104744661590074,
"grad_norm": 0.07146366685628891,
"learning_rate": 9.106224331649906e-05,
"loss": 0.1362,
"step": 5090
},
{
"epoch": 0.2715799563342031,
"grad_norm": 0.07654134929180145,
"learning_rate": 9.102106374434984e-05,
"loss": 0.1357,
"step": 5100
},
{
"epoch": 0.27211246605250544,
"grad_norm": 0.11621958762407303,
"learning_rate": 9.097980347936457e-05,
"loss": 0.1362,
"step": 5110
},
{
"epoch": 0.2726449757708078,
"grad_norm": 0.08447077125310898,
"learning_rate": 9.093846264949368e-05,
"loss": 0.1354,
"step": 5120
},
{
"epoch": 0.2731774854891102,
"grad_norm": 0.06373301893472672,
"learning_rate": 9.089704138293756e-05,
"loss": 0.1357,
"step": 5130
},
{
"epoch": 0.2737099952074125,
"grad_norm": 0.07651172578334808,
"learning_rate": 9.085553980814592e-05,
"loss": 0.1361,
"step": 5140
},
{
"epoch": 0.2742425049257149,
"grad_norm": 0.08544403314590454,
"learning_rate": 9.081395805381761e-05,
"loss": 0.1356,
"step": 5150
},
{
"epoch": 0.27477501464401727,
"grad_norm": 0.1044570654630661,
"learning_rate": 9.077229624890002e-05,
"loss": 0.1355,
"step": 5160
},
{
"epoch": 0.2753075243623196,
"grad_norm": 0.09831110388040543,
"learning_rate": 9.073055452258889e-05,
"loss": 0.1364,
"step": 5170
},
{
"epoch": 0.27584003408062197,
"grad_norm": 0.07489628344774246,
"learning_rate": 9.068873300432772e-05,
"loss": 0.1369,
"step": 5180
},
{
"epoch": 0.27637254379892434,
"grad_norm": 0.06893607974052429,
"learning_rate": 9.064683182380749e-05,
"loss": 0.1359,
"step": 5190
},
{
"epoch": 0.27690505351722666,
"grad_norm": 0.08887787163257599,
"learning_rate": 9.060485111096617e-05,
"loss": 0.1347,
"step": 5200
},
{
"epoch": 0.27743756323552904,
"grad_norm": 0.074364572763443,
"learning_rate": 9.056279099598845e-05,
"loss": 0.1357,
"step": 5210
},
{
"epoch": 0.2779700729538314,
"grad_norm": 0.15046152472496033,
"learning_rate": 9.052065160930516e-05,
"loss": 0.1348,
"step": 5220
},
{
"epoch": 0.2785025826721338,
"grad_norm": 0.08475669473409653,
"learning_rate": 9.0478433081593e-05,
"loss": 0.1357,
"step": 5230
},
{
"epoch": 0.2790350923904361,
"grad_norm": 0.09118683636188507,
"learning_rate": 9.043613554377411e-05,
"loss": 0.1354,
"step": 5240
},
{
"epoch": 0.2795676021087385,
"grad_norm": 0.08327824622392654,
"learning_rate": 9.03937591270156e-05,
"loss": 0.135,
"step": 5250
},
{
"epoch": 0.28010011182704087,
"grad_norm": 0.12977443635463715,
"learning_rate": 9.035130396272922e-05,
"loss": 0.1353,
"step": 5260
},
{
"epoch": 0.2806326215453432,
"grad_norm": 0.07361641526222229,
"learning_rate": 9.030877018257091e-05,
"loss": 0.1363,
"step": 5270
},
{
"epoch": 0.28116513126364556,
"grad_norm": 0.06867006421089172,
"learning_rate": 9.02661579184404e-05,
"loss": 0.1353,
"step": 5280
},
{
"epoch": 0.28169764098194794,
"grad_norm": 0.09308381378650665,
"learning_rate": 9.022346730248079e-05,
"loss": 0.1357,
"step": 5290
},
{
"epoch": 0.28223015070025026,
"grad_norm": 0.07800911366939545,
"learning_rate": 9.01806984670782e-05,
"loss": 0.1348,
"step": 5300
},
{
"epoch": 0.28276266041855264,
"grad_norm": 0.06358273327350616,
"learning_rate": 9.013785154486127e-05,
"loss": 0.1352,
"step": 5310
},
{
"epoch": 0.283295170136855,
"grad_norm": 0.06616450846195221,
"learning_rate": 9.009492666870078e-05,
"loss": 0.135,
"step": 5320
},
{
"epoch": 0.28382767985515733,
"grad_norm": 0.11287859827280045,
"learning_rate": 9.005192397170932e-05,
"loss": 0.1363,
"step": 5330
},
{
"epoch": 0.2843601895734597,
"grad_norm": 0.06982850283384323,
"learning_rate": 9.000884358724073e-05,
"loss": 0.1358,
"step": 5340
},
{
"epoch": 0.2848926992917621,
"grad_norm": 0.0698726698756218,
"learning_rate": 8.996568564888978e-05,
"loss": 0.1349,
"step": 5350
},
{
"epoch": 0.2854252090100644,
"grad_norm": 0.0825994461774826,
"learning_rate": 8.99224502904918e-05,
"loss": 0.1351,
"step": 5360
},
{
"epoch": 0.2859577187283668,
"grad_norm": 0.10726054012775421,
"learning_rate": 8.987913764612212e-05,
"loss": 0.1359,
"step": 5370
},
{
"epoch": 0.28649022844666916,
"grad_norm": 0.06900358200073242,
"learning_rate": 8.983574785009578e-05,
"loss": 0.1358,
"step": 5380
},
{
"epoch": 0.28702273816497154,
"grad_norm": 0.062367282807826996,
"learning_rate": 8.979228103696709e-05,
"loss": 0.1359,
"step": 5390
},
{
"epoch": 0.28755524788327386,
"grad_norm": 0.11140688508749008,
"learning_rate": 8.974873734152915e-05,
"loss": 0.1349,
"step": 5400
},
{
"epoch": 0.28808775760157623,
"grad_norm": 0.07785354554653168,
"learning_rate": 8.970511689881351e-05,
"loss": 0.1357,
"step": 5410
},
{
"epoch": 0.2886202673198786,
"grad_norm": 0.09785955399274826,
"learning_rate": 8.96614198440897e-05,
"loss": 0.1349,
"step": 5420
},
{
"epoch": 0.28915277703818093,
"grad_norm": 0.07891997694969177,
"learning_rate": 8.961764631286487e-05,
"loss": 0.1357,
"step": 5430
},
{
"epoch": 0.2896852867564833,
"grad_norm": 0.08100765943527222,
"learning_rate": 8.957379644088325e-05,
"loss": 0.1356,
"step": 5440
},
{
"epoch": 0.2902177964747857,
"grad_norm": 0.08178524672985077,
"learning_rate": 8.952987036412584e-05,
"loss": 0.135,
"step": 5450
},
{
"epoch": 0.290750306193088,
"grad_norm": 0.11689390987157822,
"learning_rate": 8.948586821880997e-05,
"loss": 0.1349,
"step": 5460
},
{
"epoch": 0.2912828159113904,
"grad_norm": 0.06157712638378143,
"learning_rate": 8.944179014138891e-05,
"loss": 0.1352,
"step": 5470
},
{
"epoch": 0.29181532562969276,
"grad_norm": 0.09353891015052795,
"learning_rate": 8.939763626855129e-05,
"loss": 0.1347,
"step": 5480
},
{
"epoch": 0.2923478353479951,
"grad_norm": 0.09575408697128296,
"learning_rate": 8.93534067372209e-05,
"loss": 0.1351,
"step": 5490
},
{
"epoch": 0.29288034506629745,
"grad_norm": 0.05233992263674736,
"learning_rate": 8.930910168455603e-05,
"loss": 0.1354,
"step": 5500
},
{
"epoch": 0.29341285478459983,
"grad_norm": 0.1354876458644867,
"learning_rate": 8.926472124794931e-05,
"loss": 0.1348,
"step": 5510
},
{
"epoch": 0.29394536450290215,
"grad_norm": 0.10183115303516388,
"learning_rate": 8.922026556502699e-05,
"loss": 0.135,
"step": 5520
},
{
"epoch": 0.2944778742212045,
"grad_norm": 0.08267311006784439,
"learning_rate": 8.917573477364876e-05,
"loss": 0.1351,
"step": 5530
},
{
"epoch": 0.2950103839395069,
"grad_norm": 0.07382847368717194,
"learning_rate": 8.91311290119072e-05,
"loss": 0.1354,
"step": 5540
},
{
"epoch": 0.2955428936578093,
"grad_norm": 0.06079572066664696,
"learning_rate": 8.908644841812739e-05,
"loss": 0.1355,
"step": 5550
},
{
"epoch": 0.2960754033761116,
"grad_norm": 0.09829048812389374,
"learning_rate": 8.904169313086645e-05,
"loss": 0.1353,
"step": 5560
},
{
"epoch": 0.296607913094414,
"grad_norm": 0.09984841197729111,
"learning_rate": 8.899686328891315e-05,
"loss": 0.1348,
"step": 5570
},
{
"epoch": 0.29714042281271635,
"grad_norm": 0.1072811409831047,
"learning_rate": 8.895195903128739e-05,
"loss": 0.1356,
"step": 5580
},
{
"epoch": 0.2976729325310187,
"grad_norm": 0.0519433431327343,
"learning_rate": 8.890698049723995e-05,
"loss": 0.1345,
"step": 5590
},
{
"epoch": 0.29820544224932105,
"grad_norm": 0.057259172201156616,
"learning_rate": 8.886192782625189e-05,
"loss": 0.1338,
"step": 5600
},
{
"epoch": 0.2987379519676234,
"grad_norm": 0.102280393242836,
"learning_rate": 8.881680115803412e-05,
"loss": 0.1347,
"step": 5610
},
{
"epoch": 0.29927046168592575,
"grad_norm": 0.07144474983215332,
"learning_rate": 8.877160063252712e-05,
"loss": 0.1353,
"step": 5620
},
{
"epoch": 0.2998029714042281,
"grad_norm": 0.07198172062635422,
"learning_rate": 8.87263263899003e-05,
"loss": 0.1354,
"step": 5630
},
{
"epoch": 0.3003354811225305,
"grad_norm": 0.08418303728103638,
"learning_rate": 8.86809785705518e-05,
"loss": 0.1353,
"step": 5640
},
{
"epoch": 0.3008679908408328,
"grad_norm": 0.09237035363912582,
"learning_rate": 8.86355573151078e-05,
"loss": 0.1356,
"step": 5650
},
{
"epoch": 0.3014005005591352,
"grad_norm": 0.12669327855110168,
"learning_rate": 8.859006276442226e-05,
"loss": 0.1343,
"step": 5660
},
{
"epoch": 0.3019330102774376,
"grad_norm": 0.05533193424344063,
"learning_rate": 8.854449505957645e-05,
"loss": 0.1351,
"step": 5670
},
{
"epoch": 0.3024655199957399,
"grad_norm": 0.06650611758232117,
"learning_rate": 8.849885434187848e-05,
"loss": 0.1345,
"step": 5680
},
{
"epoch": 0.30299802971404227,
"grad_norm": 0.07978025078773499,
"learning_rate": 8.845314075286286e-05,
"loss": 0.1347,
"step": 5690
},
{
"epoch": 0.30353053943234465,
"grad_norm": 0.06983333081007004,
"learning_rate": 8.840735443429014e-05,
"loss": 0.1345,
"step": 5700
},
{
"epoch": 0.304063049150647,
"grad_norm": 0.08889699727296829,
"learning_rate": 8.836149552814632e-05,
"loss": 0.1341,
"step": 5710
},
{
"epoch": 0.30459555886894935,
"grad_norm": 0.17295877635478973,
"learning_rate": 8.831556417664255e-05,
"loss": 0.1349,
"step": 5720
},
{
"epoch": 0.3051280685872517,
"grad_norm": 0.08790881931781769,
"learning_rate": 8.826956052221464e-05,
"loss": 0.1356,
"step": 5730
},
{
"epoch": 0.3056605783055541,
"grad_norm": 0.11530311405658722,
"learning_rate": 8.822348470752263e-05,
"loss": 0.1341,
"step": 5740
},
{
"epoch": 0.3061930880238564,
"grad_norm": 0.07714807987213135,
"learning_rate": 8.817733687545024e-05,
"loss": 0.1349,
"step": 5750
},
{
"epoch": 0.3067255977421588,
"grad_norm": 0.07843048125505447,
"learning_rate": 8.813111716910463e-05,
"loss": 0.135,
"step": 5760
},
{
"epoch": 0.30725810746046117,
"grad_norm": 0.059752389788627625,
"learning_rate": 8.808482573181583e-05,
"loss": 0.134,
"step": 5770
},
{
"epoch": 0.3077906171787635,
"grad_norm": 0.07728555053472519,
"learning_rate": 8.803846270713622e-05,
"loss": 0.1347,
"step": 5780
},
{
"epoch": 0.30832312689706587,
"grad_norm": 0.11523959785699844,
"learning_rate": 8.79920282388403e-05,
"loss": 0.1345,
"step": 5790
},
{
"epoch": 0.30885563661536825,
"grad_norm": 0.0835232064127922,
"learning_rate": 8.794552247092404e-05,
"loss": 0.1342,
"step": 5800
},
{
"epoch": 0.30938814633367057,
"grad_norm": 0.08657065033912659,
"learning_rate": 8.789894554760456e-05,
"loss": 0.1351,
"step": 5810
},
{
"epoch": 0.30992065605197294,
"grad_norm": 0.17280389368534088,
"learning_rate": 8.78522976133196e-05,
"loss": 0.1345,
"step": 5820
},
{
"epoch": 0.3104531657702753,
"grad_norm": 0.07250665873289108,
"learning_rate": 8.780557881272711e-05,
"loss": 0.1339,
"step": 5830
},
{
"epoch": 0.31098567548857764,
"grad_norm": 0.135615274310112,
"learning_rate": 8.775878929070483e-05,
"loss": 0.1348,
"step": 5840
},
{
"epoch": 0.31151818520688,
"grad_norm": 0.09052561223506927,
"learning_rate": 8.77119291923498e-05,
"loss": 0.1342,
"step": 5850
},
{
"epoch": 0.3120506949251824,
"grad_norm": 0.06174413859844208,
"learning_rate": 8.766499866297791e-05,
"loss": 0.1338,
"step": 5860
},
{
"epoch": 0.31258320464348477,
"grad_norm": 0.08344202488660812,
"learning_rate": 8.761799784812348e-05,
"loss": 0.1343,
"step": 5870
},
{
"epoch": 0.3131157143617871,
"grad_norm": 0.11124816536903381,
"learning_rate": 8.757092689353876e-05,
"loss": 0.1348,
"step": 5880
},
{
"epoch": 0.31364822408008947,
"grad_norm": 0.19503851234912872,
"learning_rate": 8.752378594519355e-05,
"loss": 0.1351,
"step": 5890
},
{
"epoch": 0.31418073379839184,
"grad_norm": 0.0828915536403656,
"learning_rate": 8.747657514927463e-05,
"loss": 0.135,
"step": 5900
},
{
"epoch": 0.31471324351669416,
"grad_norm": 0.07251901179552078,
"learning_rate": 8.742929465218548e-05,
"loss": 0.1337,
"step": 5910
},
{
"epoch": 0.31524575323499654,
"grad_norm": 0.05215257406234741,
"learning_rate": 8.738194460054567e-05,
"loss": 0.1335,
"step": 5920
},
{
"epoch": 0.3157782629532989,
"grad_norm": 0.06828713417053223,
"learning_rate": 8.733452514119048e-05,
"loss": 0.1341,
"step": 5930
},
{
"epoch": 0.31631077267160124,
"grad_norm": 0.05530816689133644,
"learning_rate": 8.728703642117038e-05,
"loss": 0.1342,
"step": 5940
},
{
"epoch": 0.3168432823899036,
"grad_norm": 0.08321405947208405,
"learning_rate": 8.723947858775068e-05,
"loss": 0.1342,
"step": 5950
},
{
"epoch": 0.317375792108206,
"grad_norm": 0.08132331073284149,
"learning_rate": 8.7191851788411e-05,
"loss": 0.1344,
"step": 5960
},
{
"epoch": 0.3179083018265083,
"grad_norm": 0.062430258840322495,
"learning_rate": 8.714415617084484e-05,
"loss": 0.1346,
"step": 5970
},
{
"epoch": 0.3184408115448107,
"grad_norm": 0.09331026673316956,
"learning_rate": 8.709639188295906e-05,
"loss": 0.1346,
"step": 5980
},
{
"epoch": 0.31897332126311306,
"grad_norm": 0.05821016803383827,
"learning_rate": 8.70485590728735e-05,
"loss": 0.1347,
"step": 5990
},
{
"epoch": 0.3195058309814154,
"grad_norm": 0.059810176491737366,
"learning_rate": 8.700065788892053e-05,
"loss": 0.1341,
"step": 6000
},
{
"epoch": 0.32003834069971776,
"grad_norm": 0.07964300364255905,
"learning_rate": 8.695268847964449e-05,
"loss": 0.1348,
"step": 6010
},
{
"epoch": 0.32057085041802014,
"grad_norm": 0.08967084437608719,
"learning_rate": 8.690465099380131e-05,
"loss": 0.1343,
"step": 6020
},
{
"epoch": 0.3211033601363225,
"grad_norm": 0.05978870391845703,
"learning_rate": 8.685654558035803e-05,
"loss": 0.1335,
"step": 6030
},
{
"epoch": 0.32163586985462483,
"grad_norm": 0.05357252061367035,
"learning_rate": 8.680837238849237e-05,
"loss": 0.134,
"step": 6040
},
{
"epoch": 0.3221683795729272,
"grad_norm": 0.09251965582370758,
"learning_rate": 8.676013156759219e-05,
"loss": 0.1344,
"step": 6050
},
{
"epoch": 0.3227008892912296,
"grad_norm": 0.06674574315547943,
"learning_rate": 8.671182326725509e-05,
"loss": 0.1346,
"step": 6060
},
{
"epoch": 0.3232333990095319,
"grad_norm": 0.0678028017282486,
"learning_rate": 8.666344763728793e-05,
"loss": 0.1334,
"step": 6070
},
{
"epoch": 0.3237659087278343,
"grad_norm": 0.08303205668926239,
"learning_rate": 8.661500482770635e-05,
"loss": 0.1333,
"step": 6080
},
{
"epoch": 0.32429841844613666,
"grad_norm": 0.059111885726451874,
"learning_rate": 8.656649498873435e-05,
"loss": 0.1334,
"step": 6090
},
{
"epoch": 0.324830928164439,
"grad_norm": 0.0759367048740387,
"learning_rate": 8.651791827080373e-05,
"loss": 0.1334,
"step": 6100
},
{
"epoch": 0.32536343788274136,
"grad_norm": 0.05576184391975403,
"learning_rate": 8.646927482455375e-05,
"loss": 0.1347,
"step": 6110
},
{
"epoch": 0.32589594760104373,
"grad_norm": 0.12025826424360275,
"learning_rate": 8.642056480083058e-05,
"loss": 0.1341,
"step": 6120
},
{
"epoch": 0.32642845731934605,
"grad_norm": 0.055676814168691635,
"learning_rate": 8.637178835068685e-05,
"loss": 0.1342,
"step": 6130
},
{
"epoch": 0.32696096703764843,
"grad_norm": 0.06213228031992912,
"learning_rate": 8.632294562538114e-05,
"loss": 0.1336,
"step": 6140
},
{
"epoch": 0.3274934767559508,
"grad_norm": 0.05842900648713112,
"learning_rate": 8.627403677637762e-05,
"loss": 0.1339,
"step": 6150
},
{
"epoch": 0.32802598647425313,
"grad_norm": 0.07621738314628601,
"learning_rate": 8.62250619553455e-05,
"loss": 0.1339,
"step": 6160
},
{
"epoch": 0.3285584961925555,
"grad_norm": 0.06162210926413536,
"learning_rate": 8.61760213141585e-05,
"loss": 0.1345,
"step": 6170
},
{
"epoch": 0.3290910059108579,
"grad_norm": 0.0727054551243782,
"learning_rate": 8.612691500489453e-05,
"loss": 0.1342,
"step": 6180
},
{
"epoch": 0.32962351562916026,
"grad_norm": 0.05833178386092186,
"learning_rate": 8.607774317983515e-05,
"loss": 0.1337,
"step": 6190
},
{
"epoch": 0.3301560253474626,
"grad_norm": 0.11485815793275833,
"learning_rate": 8.602850599146502e-05,
"loss": 0.1334,
"step": 6200
},
{
"epoch": 0.33068853506576495,
"grad_norm": 0.0697018951177597,
"learning_rate": 8.597920359247156e-05,
"loss": 0.1335,
"step": 6210
},
{
"epoch": 0.33122104478406733,
"grad_norm": 0.05111430957913399,
"learning_rate": 8.592983613574435e-05,
"loss": 0.1332,
"step": 6220
},
{
"epoch": 0.33175355450236965,
"grad_norm": 0.06886550784111023,
"learning_rate": 8.588040377437479e-05,
"loss": 0.1338,
"step": 6230
},
{
"epoch": 0.33228606422067203,
"grad_norm": 0.04854755103588104,
"learning_rate": 8.58309066616555e-05,
"loss": 0.1336,
"step": 6240
},
{
"epoch": 0.3328185739389744,
"grad_norm": 0.0921018123626709,
"learning_rate": 8.57813449510799e-05,
"loss": 0.1338,
"step": 6250
},
{
"epoch": 0.3333510836572767,
"grad_norm": 0.09607180953025818,
"learning_rate": 8.573171879634177e-05,
"loss": 0.1341,
"step": 6260
},
{
"epoch": 0.3338835933755791,
"grad_norm": 0.16610988974571228,
"learning_rate": 8.568202835133468e-05,
"loss": 0.1343,
"step": 6270
},
{
"epoch": 0.3344161030938815,
"grad_norm": 0.07573292404413223,
"learning_rate": 8.563227377015162e-05,
"loss": 0.1336,
"step": 6280
},
{
"epoch": 0.3349486128121838,
"grad_norm": 0.10059863328933716,
"learning_rate": 8.558245520708444e-05,
"loss": 0.1338,
"step": 6290
},
{
"epoch": 0.3354811225304862,
"grad_norm": 0.0501171350479126,
"learning_rate": 8.553257281662342e-05,
"loss": 0.1331,
"step": 6300
},
{
"epoch": 0.33601363224878855,
"grad_norm": 0.07637584954500198,
"learning_rate": 8.548262675345673e-05,
"loss": 0.1336,
"step": 6310
},
{
"epoch": 0.3365461419670909,
"grad_norm": 0.08559510856866837,
"learning_rate": 8.543261717247006e-05,
"loss": 0.1345,
"step": 6320
},
{
"epoch": 0.33707865168539325,
"grad_norm": 0.07463840395212173,
"learning_rate": 8.5382544228746e-05,
"loss": 0.134,
"step": 6330
},
{
"epoch": 0.3376111614036956,
"grad_norm": 0.06291361898183823,
"learning_rate": 8.533240807756373e-05,
"loss": 0.134,
"step": 6340
},
{
"epoch": 0.338143671121998,
"grad_norm": 0.054589059203863144,
"learning_rate": 8.52822088743983e-05,
"loss": 0.1336,
"step": 6350
},
{
"epoch": 0.3386761808403003,
"grad_norm": 0.08773118257522583,
"learning_rate": 8.523194677492044e-05,
"loss": 0.1332,
"step": 6360
},
{
"epoch": 0.3392086905586027,
"grad_norm": 0.08630936592817307,
"learning_rate": 8.518162193499581e-05,
"loss": 0.1339,
"step": 6370
},
{
"epoch": 0.3397412002769051,
"grad_norm": 0.0654667541384697,
"learning_rate": 8.513123451068467e-05,
"loss": 0.1339,
"step": 6380
},
{
"epoch": 0.3402737099952074,
"grad_norm": 0.04769926890730858,
"learning_rate": 8.508078465824138e-05,
"loss": 0.1335,
"step": 6390
},
{
"epoch": 0.3408062197135098,
"grad_norm": 0.10144821554422379,
"learning_rate": 8.503027253411387e-05,
"loss": 0.1328,
"step": 6400
},
{
"epoch": 0.34133872943181215,
"grad_norm": 0.09670275449752808,
"learning_rate": 8.497969829494319e-05,
"loss": 0.1338,
"step": 6410
},
{
"epoch": 0.34187123915011447,
"grad_norm": 0.08334879577159882,
"learning_rate": 8.492906209756294e-05,
"loss": 0.1328,
"step": 6420
},
{
"epoch": 0.34240374886841685,
"grad_norm": 0.06717374920845032,
"learning_rate": 8.487836409899905e-05,
"loss": 0.134,
"step": 6430
},
{
"epoch": 0.3429362585867192,
"grad_norm": 0.1657373309135437,
"learning_rate": 8.482760445646885e-05,
"loss": 0.1333,
"step": 6440
},
{
"epoch": 0.34346876830502154,
"grad_norm": 0.07656820863485336,
"learning_rate": 8.477678332738102e-05,
"loss": 0.1331,
"step": 6450
},
{
"epoch": 0.3440012780233239,
"grad_norm": 0.06148603931069374,
"learning_rate": 8.472590086933479e-05,
"loss": 0.1338,
"step": 6460
},
{
"epoch": 0.3445337877416263,
"grad_norm": 0.0530422069132328,
"learning_rate": 8.467495724011967e-05,
"loss": 0.1335,
"step": 6470
},
{
"epoch": 0.3450662974599286,
"grad_norm": 1.325517177581787,
"learning_rate": 8.462395259771483e-05,
"loss": 0.1388,
"step": 6480
},
{
"epoch": 0.345598807178231,
"grad_norm": 4.959922790527344,
"learning_rate": 8.457288710028862e-05,
"loss": 0.2724,
"step": 6490
},
{
"epoch": 0.34613131689653337,
"grad_norm": 0.29457658529281616,
"learning_rate": 8.452176090619812e-05,
"loss": 0.1983,
"step": 6500
},
{
"epoch": 0.34666382661483575,
"grad_norm": 0.10155448317527771,
"learning_rate": 8.447057417398866e-05,
"loss": 0.1556,
"step": 6510
},
{
"epoch": 0.34719633633313807,
"grad_norm": 0.08506519347429276,
"learning_rate": 8.441932706239329e-05,
"loss": 0.1438,
"step": 6520
},
{
"epoch": 0.34772884605144044,
"grad_norm": 0.06791000813245773,
"learning_rate": 8.436801973033227e-05,
"loss": 0.1402,
"step": 6530
},
{
"epoch": 0.3482613557697428,
"grad_norm": 0.0665493905544281,
"learning_rate": 8.43166523369126e-05,
"loss": 0.1381,
"step": 6540
},
{
"epoch": 0.34879386548804514,
"grad_norm": 0.14150767028331757,
"learning_rate": 8.42652250414276e-05,
"loss": 0.136,
"step": 6550
},
{
"epoch": 0.3493263752063475,
"grad_norm": 0.07686637341976166,
"learning_rate": 8.421373800335632e-05,
"loss": 0.1352,
"step": 6560
},
{
"epoch": 0.3498588849246499,
"grad_norm": 0.06520914286375046,
"learning_rate": 8.416219138236308e-05,
"loss": 0.1349,
"step": 6570
},
{
"epoch": 0.3503913946429522,
"grad_norm": 0.1157696321606636,
"learning_rate": 8.411058533829688e-05,
"loss": 0.1346,
"step": 6580
},
{
"epoch": 0.3509239043612546,
"grad_norm": 0.10777822136878967,
"learning_rate": 8.405892003119115e-05,
"loss": 0.1354,
"step": 6590
},
{
"epoch": 0.35145641407955697,
"grad_norm": 0.07395236939191818,
"learning_rate": 8.4007195621263e-05,
"loss": 0.1335,
"step": 6600
},
{
"epoch": 0.3519889237978593,
"grad_norm": 0.0857616439461708,
"learning_rate": 8.395541226891283e-05,
"loss": 0.1339,
"step": 6610
},
{
"epoch": 0.35252143351616166,
"grad_norm": 0.055322933942079544,
"learning_rate": 8.390357013472386e-05,
"loss": 0.1339,
"step": 6620
},
{
"epoch": 0.35305394323446404,
"grad_norm": 0.06957754492759705,
"learning_rate": 8.385166937946154e-05,
"loss": 0.1337,
"step": 6630
},
{
"epoch": 0.35358645295276636,
"grad_norm": 0.06771986186504364,
"learning_rate": 8.379971016407313e-05,
"loss": 0.1344,
"step": 6640
},
{
"epoch": 0.35411896267106874,
"grad_norm": 0.08326587826013565,
"learning_rate": 8.374769264968722e-05,
"loss": 0.1333,
"step": 6650
},
{
"epoch": 0.3546514723893711,
"grad_norm": 0.0682111382484436,
"learning_rate": 8.369561699761317e-05,
"loss": 0.1347,
"step": 6660
},
{
"epoch": 0.3551839821076735,
"grad_norm": 0.10408024489879608,
"learning_rate": 8.364348336934056e-05,
"loss": 0.1335,
"step": 6670
},
{
"epoch": 0.3557164918259758,
"grad_norm": 0.07545497268438339,
"learning_rate": 8.359129192653883e-05,
"loss": 0.1329,
"step": 6680
},
{
"epoch": 0.3562490015442782,
"grad_norm": 0.06856414675712585,
"learning_rate": 8.353904283105671e-05,
"loss": 0.1333,
"step": 6690
},
{
"epoch": 0.35678151126258056,
"grad_norm": 0.12046464532613754,
"learning_rate": 8.34867362449217e-05,
"loss": 0.1341,
"step": 6700
},
{
"epoch": 0.3573140209808829,
"grad_norm": 0.08481092005968094,
"learning_rate": 8.343437233033952e-05,
"loss": 0.1331,
"step": 6710
},
{
"epoch": 0.35784653069918526,
"grad_norm": 0.0763193815946579,
"learning_rate": 8.338195124969377e-05,
"loss": 0.133,
"step": 6720
},
{
"epoch": 0.35837904041748764,
"grad_norm": 0.05080103129148483,
"learning_rate": 8.332947316554527e-05,
"loss": 0.1332,
"step": 6730
},
{
"epoch": 0.35891155013578996,
"grad_norm": 0.09795154631137848,
"learning_rate": 8.327693824063158e-05,
"loss": 0.1339,
"step": 6740
},
{
"epoch": 0.35944405985409233,
"grad_norm": 0.10069025307893753,
"learning_rate": 8.322434663786662e-05,
"loss": 0.1335,
"step": 6750
},
{
"epoch": 0.3599765695723947,
"grad_norm": 0.08091656118631363,
"learning_rate": 8.317169852034002e-05,
"loss": 0.1336,
"step": 6760
},
{
"epoch": 0.36050907929069703,
"grad_norm": 0.06075895577669144,
"learning_rate": 8.31189940513166e-05,
"loss": 0.1341,
"step": 6770
},
{
"epoch": 0.3610415890089994,
"grad_norm": 0.051195014268159866,
"learning_rate": 8.306623339423605e-05,
"loss": 0.1338,
"step": 6780
},
{
"epoch": 0.3615740987273018,
"grad_norm": 0.05651082843542099,
"learning_rate": 8.301341671271222e-05,
"loss": 0.1332,
"step": 6790
},
{
"epoch": 0.3621066084456041,
"grad_norm": 0.05757668614387512,
"learning_rate": 8.29605441705327e-05,
"loss": 0.133,
"step": 6800
},
{
"epoch": 0.3626391181639065,
"grad_norm": 0.05644191801548004,
"learning_rate": 8.290761593165836e-05,
"loss": 0.1333,
"step": 6810
},
{
"epoch": 0.36317162788220886,
"grad_norm": 0.07972195744514465,
"learning_rate": 8.285463216022276e-05,
"loss": 0.133,
"step": 6820
},
{
"epoch": 0.36370413760051123,
"grad_norm": 0.04617351293563843,
"learning_rate": 8.280159302053163e-05,
"loss": 0.1328,
"step": 6830
},
{
"epoch": 0.36423664731881356,
"grad_norm": 0.09602131694555283,
"learning_rate": 8.274849867706247e-05,
"loss": 0.1331,
"step": 6840
},
{
"epoch": 0.36476915703711593,
"grad_norm": 0.04789271950721741,
"learning_rate": 8.269534929446392e-05,
"loss": 0.133,
"step": 6850
},
{
"epoch": 0.3653016667554183,
"grad_norm": 0.09267139434814453,
"learning_rate": 8.26421450375553e-05,
"loss": 0.1325,
"step": 6860
},
{
"epoch": 0.36583417647372063,
"grad_norm": 0.058588556945323944,
"learning_rate": 8.258888607132614e-05,
"loss": 0.1336,
"step": 6870
},
{
"epoch": 0.366366686192023,
"grad_norm": 0.052210818976163864,
"learning_rate": 8.253557256093558e-05,
"loss": 0.1328,
"step": 6880
},
{
"epoch": 0.3668991959103254,
"grad_norm": 0.09430071711540222,
"learning_rate": 8.248220467171195e-05,
"loss": 0.1328,
"step": 6890
},
{
"epoch": 0.3674317056286277,
"grad_norm": 0.07742954045534134,
"learning_rate": 8.242878256915216e-05,
"loss": 0.1328,
"step": 6900
},
{
"epoch": 0.3679642153469301,
"grad_norm": 0.06042707711458206,
"learning_rate": 8.237530641892128e-05,
"loss": 0.133,
"step": 6910
},
{
"epoch": 0.36849672506523246,
"grad_norm": 0.06480567157268524,
"learning_rate": 8.232177638685194e-05,
"loss": 0.1328,
"step": 6920
},
{
"epoch": 0.3690292347835348,
"grad_norm": 0.047677043825387955,
"learning_rate": 8.226819263894395e-05,
"loss": 0.1331,
"step": 6930
},
{
"epoch": 0.36956174450183715,
"grad_norm": 0.051471047103405,
"learning_rate": 8.221455534136358e-05,
"loss": 0.1324,
"step": 6940
},
{
"epoch": 0.37009425422013953,
"grad_norm": 0.07004884630441666,
"learning_rate": 8.216086466044323e-05,
"loss": 0.1327,
"step": 6950
},
{
"epoch": 0.37062676393844185,
"grad_norm": 0.07678276300430298,
"learning_rate": 8.210712076268088e-05,
"loss": 0.1327,
"step": 6960
},
{
"epoch": 0.3711592736567442,
"grad_norm": 0.07195029407739639,
"learning_rate": 8.205332381473942e-05,
"loss": 0.1324,
"step": 6970
},
{
"epoch": 0.3716917833750466,
"grad_norm": 0.061837486922740936,
"learning_rate": 8.199947398344639e-05,
"loss": 0.1325,
"step": 6980
},
{
"epoch": 0.372224293093349,
"grad_norm": 0.1034204512834549,
"learning_rate": 8.19455714357932e-05,
"loss": 0.1326,
"step": 6990
},
{
"epoch": 0.3727568028116513,
"grad_norm": 0.10331778973340988,
"learning_rate": 8.189161633893481e-05,
"loss": 0.1328,
"step": 7000
},
{
"epoch": 0.3732893125299537,
"grad_norm": 0.06943188607692719,
"learning_rate": 8.183760886018914e-05,
"loss": 0.1323,
"step": 7010
},
{
"epoch": 0.37382182224825605,
"grad_norm": 0.050394218415021896,
"learning_rate": 8.178354916703654e-05,
"loss": 0.1324,
"step": 7020
},
{
"epoch": 0.3743543319665584,
"grad_norm": 0.06192854419350624,
"learning_rate": 8.172943742711923e-05,
"loss": 0.1323,
"step": 7030
},
{
"epoch": 0.37488684168486075,
"grad_norm": 0.12752105295658112,
"learning_rate": 8.16752738082409e-05,
"loss": 0.1329,
"step": 7040
},
{
"epoch": 0.3754193514031631,
"grad_norm": 0.07455851882696152,
"learning_rate": 8.162105847836605e-05,
"loss": 0.1331,
"step": 7050
},
{
"epoch": 0.37595186112146545,
"grad_norm": 0.07023312151432037,
"learning_rate": 8.156679160561963e-05,
"loss": 0.1326,
"step": 7060
},
{
"epoch": 0.3764843708397678,
"grad_norm": 0.057135872542858124,
"learning_rate": 8.151247335828638e-05,
"loss": 0.1334,
"step": 7070
},
{
"epoch": 0.3770168805580702,
"grad_norm": 0.10991890728473663,
"learning_rate": 8.145810390481033e-05,
"loss": 0.1328,
"step": 7080
},
{
"epoch": 0.3775493902763725,
"grad_norm": 0.06575486063957214,
"learning_rate": 8.140368341379431e-05,
"loss": 0.133,
"step": 7090
},
{
"epoch": 0.3780818999946749,
"grad_norm": 0.06990350782871246,
"learning_rate": 8.134921205399945e-05,
"loss": 0.1321,
"step": 7100
},
{
"epoch": 0.3786144097129773,
"grad_norm": 0.06953799724578857,
"learning_rate": 8.129468999434464e-05,
"loss": 0.132,
"step": 7110
},
{
"epoch": 0.3791469194312796,
"grad_norm": 0.09842592477798462,
"learning_rate": 8.124011740390591e-05,
"loss": 0.1323,
"step": 7120
},
{
"epoch": 0.37967942914958197,
"grad_norm": 0.08032160997390747,
"learning_rate": 8.118549445191613e-05,
"loss": 0.1324,
"step": 7130
},
{
"epoch": 0.38021193886788435,
"grad_norm": 0.07145192474126816,
"learning_rate": 8.113082130776417e-05,
"loss": 0.1315,
"step": 7140
},
{
"epoch": 0.3807444485861867,
"grad_norm": 0.05545572564005852,
"learning_rate": 8.107609814099466e-05,
"loss": 0.1327,
"step": 7150
},
{
"epoch": 0.38127695830448904,
"grad_norm": 0.06006612256169319,
"learning_rate": 8.102132512130738e-05,
"loss": 0.1316,
"step": 7160
},
{
"epoch": 0.3818094680227914,
"grad_norm": 0.08068816363811493,
"learning_rate": 8.096650241855661e-05,
"loss": 0.1319,
"step": 7170
},
{
"epoch": 0.3823419777410938,
"grad_norm": 0.08527512848377228,
"learning_rate": 8.091163020275077e-05,
"loss": 0.1324,
"step": 7180
},
{
"epoch": 0.3828744874593961,
"grad_norm": 0.06154448911547661,
"learning_rate": 8.085670864405179e-05,
"loss": 0.1327,
"step": 7190
},
{
"epoch": 0.3834069971776985,
"grad_norm": 0.05169384926557541,
"learning_rate": 8.080173791277463e-05,
"loss": 0.132,
"step": 7200
},
{
"epoch": 0.38393950689600087,
"grad_norm": 0.11166296899318695,
"learning_rate": 8.074671817938674e-05,
"loss": 0.1318,
"step": 7210
},
{
"epoch": 0.3844720166143032,
"grad_norm": 0.05975338816642761,
"learning_rate": 8.069164961450751e-05,
"loss": 0.1313,
"step": 7220
},
{
"epoch": 0.38500452633260557,
"grad_norm": 0.07280656695365906,
"learning_rate": 8.063653238890779e-05,
"loss": 0.1324,
"step": 7230
},
{
"epoch": 0.38553703605090794,
"grad_norm": 0.050891146063804626,
"learning_rate": 8.058136667350928e-05,
"loss": 0.132,
"step": 7240
},
{
"epoch": 0.38606954576921027,
"grad_norm": 0.10308956354856491,
"learning_rate": 8.05261526393841e-05,
"loss": 0.1323,
"step": 7250
},
{
"epoch": 0.38660205548751264,
"grad_norm": 0.08276902139186859,
"learning_rate": 8.04708904577542e-05,
"loss": 0.1324,
"step": 7260
},
{
"epoch": 0.387134565205815,
"grad_norm": 0.06150532513856888,
"learning_rate": 8.041558029999081e-05,
"loss": 0.1324,
"step": 7270
},
{
"epoch": 0.38766707492411734,
"grad_norm": 0.08963697403669357,
"learning_rate": 8.036022233761396e-05,
"loss": 0.1332,
"step": 7280
},
{
"epoch": 0.3881995846424197,
"grad_norm": 0.08556204289197922,
"learning_rate": 8.030481674229192e-05,
"loss": 0.1319,
"step": 7290
},
{
"epoch": 0.3887320943607221,
"grad_norm": 0.0741380900144577,
"learning_rate": 8.024936368584066e-05,
"loss": 0.132,
"step": 7300
},
{
"epoch": 0.38926460407902447,
"grad_norm": 0.060994237661361694,
"learning_rate": 8.019386334022336e-05,
"loss": 0.1328,
"step": 7310
},
{
"epoch": 0.3897971137973268,
"grad_norm": 0.053207580000162125,
"learning_rate": 8.013831587754984e-05,
"loss": 0.1321,
"step": 7320
},
{
"epoch": 0.39032962351562916,
"grad_norm": 0.08496523648500443,
"learning_rate": 8.008272147007597e-05,
"loss": 0.1317,
"step": 7330
},
{
"epoch": 0.39086213323393154,
"grad_norm": 0.06788633018732071,
"learning_rate": 8.002708029020329e-05,
"loss": 0.1323,
"step": 7340
},
{
"epoch": 0.39139464295223386,
"grad_norm": 0.05240168422460556,
"learning_rate": 7.997139251047835e-05,
"loss": 0.1323,
"step": 7350
},
{
"epoch": 0.39192715267053624,
"grad_norm": 0.08682172000408173,
"learning_rate": 7.991565830359218e-05,
"loss": 0.1321,
"step": 7360
},
{
"epoch": 0.3924596623888386,
"grad_norm": 0.05870863422751427,
"learning_rate": 7.985987784237981e-05,
"loss": 0.1317,
"step": 7370
},
{
"epoch": 0.39299217210714094,
"grad_norm": 0.053884461522102356,
"learning_rate": 7.980405129981971e-05,
"loss": 0.1322,
"step": 7380
},
{
"epoch": 0.3935246818254433,
"grad_norm": 0.051192574203014374,
"learning_rate": 7.974817884903325e-05,
"loss": 0.132,
"step": 7390
},
{
"epoch": 0.3940571915437457,
"grad_norm": 0.07789867371320724,
"learning_rate": 7.969226066328415e-05,
"loss": 0.1322,
"step": 7400
},
{
"epoch": 0.394589701262048,
"grad_norm": 0.12169856578111649,
"learning_rate": 7.963629691597794e-05,
"loss": 0.1331,
"step": 7410
},
{
"epoch": 0.3951222109803504,
"grad_norm": 0.05751097947359085,
"learning_rate": 7.95802877806615e-05,
"loss": 0.1317,
"step": 7420
},
{
"epoch": 0.39565472069865276,
"grad_norm": 0.0670279935002327,
"learning_rate": 7.952423343102242e-05,
"loss": 0.1321,
"step": 7430
},
{
"epoch": 0.3961872304169551,
"grad_norm": 0.12209637463092804,
"learning_rate": 7.946813404088849e-05,
"loss": 0.1318,
"step": 7440
},
{
"epoch": 0.39671974013525746,
"grad_norm": 0.06626468896865845,
"learning_rate": 7.94119897842272e-05,
"loss": 0.1318,
"step": 7450
},
{
"epoch": 0.39725224985355984,
"grad_norm": 0.04306609556078911,
"learning_rate": 7.935580083514516e-05,
"loss": 0.1318,
"step": 7460
},
{
"epoch": 0.3977847595718622,
"grad_norm": 0.07492338865995407,
"learning_rate": 7.929956736788759e-05,
"loss": 0.1318,
"step": 7470
},
{
"epoch": 0.39831726929016453,
"grad_norm": 0.051630035042762756,
"learning_rate": 7.924328955683774e-05,
"loss": 0.1314,
"step": 7480
},
{
"epoch": 0.3988497790084669,
"grad_norm": 0.06161106750369072,
"learning_rate": 7.918696757651637e-05,
"loss": 0.1319,
"step": 7490
},
{
"epoch": 0.3993822887267693,
"grad_norm": 0.048934947699308395,
"learning_rate": 7.913060160158125e-05,
"loss": 0.1318,
"step": 7500
},
{
"epoch": 0.3999147984450716,
"grad_norm": 0.08472836762666702,
"learning_rate": 7.907419180682656e-05,
"loss": 0.1324,
"step": 7510
},
{
"epoch": 0.400447308163374,
"grad_norm": 0.07017608731985092,
"learning_rate": 7.901773836718234e-05,
"loss": 0.1315,
"step": 7520
},
{
"epoch": 0.40097981788167636,
"grad_norm": 0.09098348021507263,
"learning_rate": 7.8961241457714e-05,
"loss": 0.1316,
"step": 7530
},
{
"epoch": 0.4015123275999787,
"grad_norm": 0.07034831494092941,
"learning_rate": 7.890470125362174e-05,
"loss": 0.132,
"step": 7540
},
{
"epoch": 0.40204483731828106,
"grad_norm": 0.08528514206409454,
"learning_rate": 7.884811793024009e-05,
"loss": 0.1317,
"step": 7550
},
{
"epoch": 0.40257734703658343,
"grad_norm": 0.10862760245800018,
"learning_rate": 7.879149166303719e-05,
"loss": 0.1315,
"step": 7560
},
{
"epoch": 0.40310985675488575,
"grad_norm": 0.04836263135075569,
"learning_rate": 7.873482262761438e-05,
"loss": 0.1317,
"step": 7570
},
{
"epoch": 0.40364236647318813,
"grad_norm": 0.051307760179042816,
"learning_rate": 7.867811099970568e-05,
"loss": 0.1328,
"step": 7580
},
{
"epoch": 0.4041748761914905,
"grad_norm": 0.05256601795554161,
"learning_rate": 7.862135695517712e-05,
"loss": 0.1321,
"step": 7590
},
{
"epoch": 0.4047073859097928,
"grad_norm": 0.05649365857243538,
"learning_rate": 7.856456067002633e-05,
"loss": 0.1314,
"step": 7600
},
{
"epoch": 0.4052398956280952,
"grad_norm": 0.04195050150156021,
"learning_rate": 7.85077223203819e-05,
"loss": 0.1327,
"step": 7610
},
{
"epoch": 0.4057724053463976,
"grad_norm": 0.07042062282562256,
"learning_rate": 7.845084208250286e-05,
"loss": 0.1319,
"step": 7620
},
{
"epoch": 0.40630491506469996,
"grad_norm": 0.048713624477386475,
"learning_rate": 7.839392013277814e-05,
"loss": 0.1315,
"step": 7630
},
{
"epoch": 0.4068374247830023,
"grad_norm": 0.05016913264989853,
"learning_rate": 7.833695664772605e-05,
"loss": 0.132,
"step": 7640
},
{
"epoch": 0.40736993450130465,
"grad_norm": 0.04809438809752464,
"learning_rate": 7.827995180399364e-05,
"loss": 0.1315,
"step": 7650
},
{
"epoch": 0.40790244421960703,
"grad_norm": 0.0424528494477272,
"learning_rate": 7.822290577835627e-05,
"loss": 0.1312,
"step": 7660
},
{
"epoch": 0.40843495393790935,
"grad_norm": 0.049090851098299026,
"learning_rate": 7.8165818747717e-05,
"loss": 0.1318,
"step": 7670
},
{
"epoch": 0.4089674636562117,
"grad_norm": 0.09739360958337784,
"learning_rate": 7.810869088910604e-05,
"loss": 0.1314,
"step": 7680
},
{
"epoch": 0.4094999733745141,
"grad_norm": 0.06400451064109802,
"learning_rate": 7.805152237968019e-05,
"loss": 0.1319,
"step": 7690
},
{
"epoch": 0.4100324830928164,
"grad_norm": 0.09439321607351303,
"learning_rate": 7.799431339672238e-05,
"loss": 0.1315,
"step": 7700
},
{
"epoch": 0.4105649928111188,
"grad_norm": 0.061424221843481064,
"learning_rate": 7.793706411764095e-05,
"loss": 0.132,
"step": 7710
},
{
"epoch": 0.4110975025294212,
"grad_norm": 0.06444218754768372,
"learning_rate": 7.787977471996928e-05,
"loss": 0.1313,
"step": 7720
},
{
"epoch": 0.4116300122477235,
"grad_norm": 0.052814047783613205,
"learning_rate": 7.782244538136513e-05,
"loss": 0.1316,
"step": 7730
},
{
"epoch": 0.4121625219660259,
"grad_norm": 0.06464862823486328,
"learning_rate": 7.776507627961012e-05,
"loss": 0.1313,
"step": 7740
},
{
"epoch": 0.41269503168432825,
"grad_norm": 0.05052724853157997,
"learning_rate": 7.770766759260918e-05,
"loss": 0.1317,
"step": 7750
},
{
"epoch": 0.41322754140263057,
"grad_norm": 0.10346025973558426,
"learning_rate": 7.765021949839e-05,
"loss": 0.1319,
"step": 7760
},
{
"epoch": 0.41376005112093295,
"grad_norm": 0.07890909165143967,
"learning_rate": 7.759273217510246e-05,
"loss": 0.1316,
"step": 7770
},
{
"epoch": 0.4142925608392353,
"grad_norm": 0.04561850428581238,
"learning_rate": 7.75352058010181e-05,
"loss": 0.1317,
"step": 7780
},
{
"epoch": 0.4148250705575377,
"grad_norm": 0.09326593577861786,
"learning_rate": 7.747764055452957e-05,
"loss": 0.1309,
"step": 7790
},
{
"epoch": 0.41535758027584,
"grad_norm": 0.06307931989431381,
"learning_rate": 7.742003661415007e-05,
"loss": 0.1307,
"step": 7800
},
{
"epoch": 0.4158900899941424,
"grad_norm": 0.07909877598285675,
"learning_rate": 7.736239415851274e-05,
"loss": 0.1312,
"step": 7810
},
{
"epoch": 0.4164225997124448,
"grad_norm": 0.05338076129555702,
"learning_rate": 7.730471336637024e-05,
"loss": 0.1309,
"step": 7820
},
{
"epoch": 0.4169551094307471,
"grad_norm": 0.08736453205347061,
"learning_rate": 7.724699441659404e-05,
"loss": 0.1321,
"step": 7830
},
{
"epoch": 0.41748761914904947,
"grad_norm": 0.062187109142541885,
"learning_rate": 7.718923748817397e-05,
"loss": 0.132,
"step": 7840
},
{
"epoch": 0.41802012886735185,
"grad_norm": 0.0855235755443573,
"learning_rate": 7.713144276021768e-05,
"loss": 0.1306,
"step": 7850
},
{
"epoch": 0.41855263858565417,
"grad_norm": 0.04441085830330849,
"learning_rate": 7.707361041194992e-05,
"loss": 0.1313,
"step": 7860
},
{
"epoch": 0.41908514830395655,
"grad_norm": 0.06373197585344315,
"learning_rate": 7.70157406227122e-05,
"loss": 0.1316,
"step": 7870
},
{
"epoch": 0.4196176580222589,
"grad_norm": 0.05832177773118019,
"learning_rate": 7.695783357196214e-05,
"loss": 0.1312,
"step": 7880
},
{
"epoch": 0.42015016774056124,
"grad_norm": 0.0553959384560585,
"learning_rate": 7.689988943927285e-05,
"loss": 0.1317,
"step": 7890
},
{
"epoch": 0.4206826774588636,
"grad_norm": 0.07334991544485092,
"learning_rate": 7.684190840433247e-05,
"loss": 0.1312,
"step": 7900
},
{
"epoch": 0.421215187177166,
"grad_norm": 0.08733541518449783,
"learning_rate": 7.67838906469436e-05,
"loss": 0.1317,
"step": 7910
},
{
"epoch": 0.4217476968954683,
"grad_norm": 0.07919137924909592,
"learning_rate": 7.672583634702262e-05,
"loss": 0.131,
"step": 7920
},
{
"epoch": 0.4222802066137707,
"grad_norm": 0.08723526448011398,
"learning_rate": 7.666774568459938e-05,
"loss": 0.1318,
"step": 7930
},
{
"epoch": 0.42281271633207307,
"grad_norm": 0.053012095391750336,
"learning_rate": 7.660961883981636e-05,
"loss": 0.1317,
"step": 7940
},
{
"epoch": 0.42334522605037544,
"grad_norm": 0.09206791967153549,
"learning_rate": 7.65514559929283e-05,
"loss": 0.1319,
"step": 7950
},
{
"epoch": 0.42387773576867777,
"grad_norm": 0.06498973816633224,
"learning_rate": 7.649325732430161e-05,
"loss": 0.1311,
"step": 7960
},
{
"epoch": 0.42441024548698014,
"grad_norm": 0.07660607993602753,
"learning_rate": 7.643502301441373e-05,
"loss": 0.1314,
"step": 7970
},
{
"epoch": 0.4249427552052825,
"grad_norm": 0.08989237993955612,
"learning_rate": 7.637675324385266e-05,
"loss": 0.1304,
"step": 7980
},
{
"epoch": 0.42547526492358484,
"grad_norm": 0.08027999103069305,
"learning_rate": 7.631844819331633e-05,
"loss": 0.1311,
"step": 7990
},
{
"epoch": 0.4260077746418872,
"grad_norm": 0.05923927202820778,
"learning_rate": 7.626010804361216e-05,
"loss": 0.1303,
"step": 8000
},
{
"epoch": 0.4265402843601896,
"grad_norm": 0.05849640443921089,
"learning_rate": 7.62017329756563e-05,
"loss": 0.1307,
"step": 8010
},
{
"epoch": 0.4270727940784919,
"grad_norm": 0.05768370255827904,
"learning_rate": 7.614332317047326e-05,
"loss": 0.1315,
"step": 8020
},
{
"epoch": 0.4276053037967943,
"grad_norm": 0.05652983486652374,
"learning_rate": 7.608487880919525e-05,
"loss": 0.1311,
"step": 8030
},
{
"epoch": 0.42813781351509667,
"grad_norm": 0.0556759238243103,
"learning_rate": 7.602640007306165e-05,
"loss": 0.1316,
"step": 8040
},
{
"epoch": 0.428670323233399,
"grad_norm": 0.04655342176556587,
"learning_rate": 7.596788714341843e-05,
"loss": 0.1313,
"step": 8050
},
{
"epoch": 0.42920283295170136,
"grad_norm": 0.048768457025289536,
"learning_rate": 7.590934020171758e-05,
"loss": 0.1308,
"step": 8060
},
{
"epoch": 0.42973534267000374,
"grad_norm": 0.05214981734752655,
"learning_rate": 7.58507594295166e-05,
"loss": 0.131,
"step": 8070
},
{
"epoch": 0.43026785238830606,
"grad_norm": 0.060043588280677795,
"learning_rate": 7.579214500847789e-05,
"loss": 0.1315,
"step": 8080
},
{
"epoch": 0.43080036210660844,
"grad_norm": 0.04958285391330719,
"learning_rate": 7.573349712036815e-05,
"loss": 0.1314,
"step": 8090
},
{
"epoch": 0.4313328718249108,
"grad_norm": 0.05665591359138489,
"learning_rate": 7.567481594705795e-05,
"loss": 0.1314,
"step": 8100
},
{
"epoch": 0.4318653815432132,
"grad_norm": 0.056042492389678955,
"learning_rate": 7.561610167052095e-05,
"loss": 0.131,
"step": 8110
},
{
"epoch": 0.4323978912615155,
"grad_norm": 0.05700002983212471,
"learning_rate": 7.555735447283364e-05,
"loss": 0.1313,
"step": 8120
},
{
"epoch": 0.4329304009798179,
"grad_norm": 0.05349269136786461,
"learning_rate": 7.549857453617446e-05,
"loss": 0.1313,
"step": 8130
},
{
"epoch": 0.43346291069812026,
"grad_norm": 0.05427918955683708,
"learning_rate": 7.543976204282342e-05,
"loss": 0.131,
"step": 8140
},
{
"epoch": 0.4339954204164226,
"grad_norm": 0.12502682209014893,
"learning_rate": 7.538091717516149e-05,
"loss": 0.1309,
"step": 8150
},
{
"epoch": 0.43452793013472496,
"grad_norm": 0.06011335179209709,
"learning_rate": 7.532204011567006e-05,
"loss": 0.1315,
"step": 8160
},
{
"epoch": 0.43506043985302734,
"grad_norm": 0.07122571021318436,
"learning_rate": 7.526313104693031e-05,
"loss": 0.1314,
"step": 8170
},
{
"epoch": 0.43559294957132966,
"grad_norm": 0.04538768157362938,
"learning_rate": 7.520419015162267e-05,
"loss": 0.1315,
"step": 8180
},
{
"epoch": 0.43612545928963203,
"grad_norm": 0.04720662534236908,
"learning_rate": 7.514521761252635e-05,
"loss": 0.131,
"step": 8190
},
{
"epoch": 0.4366579690079344,
"grad_norm": 0.07761963456869125,
"learning_rate": 7.508621361251858e-05,
"loss": 0.1316,
"step": 8200
},
{
"epoch": 0.43719047872623673,
"grad_norm": 0.08107470721006393,
"learning_rate": 7.502717833457424e-05,
"loss": 0.1308,
"step": 8210
},
{
"epoch": 0.4377229884445391,
"grad_norm": 0.08958134800195694,
"learning_rate": 7.496811196176513e-05,
"loss": 0.1314,
"step": 8220
},
{
"epoch": 0.4382554981628415,
"grad_norm": 0.04781255125999451,
"learning_rate": 7.490901467725957e-05,
"loss": 0.1306,
"step": 8230
},
{
"epoch": 0.4387880078811438,
"grad_norm": 0.06295894831418991,
"learning_rate": 7.484988666432165e-05,
"loss": 0.1311,
"step": 8240
},
{
"epoch": 0.4393205175994462,
"grad_norm": 0.06639114022254944,
"learning_rate": 7.479072810631078e-05,
"loss": 0.1311,
"step": 8250
},
{
"epoch": 0.43985302731774856,
"grad_norm": 0.06550955027341843,
"learning_rate": 7.473153918668112e-05,
"loss": 0.1307,
"step": 8260
},
{
"epoch": 0.44038553703605093,
"grad_norm": 0.06374099105596542,
"learning_rate": 7.467232008898098e-05,
"loss": 0.131,
"step": 8270
},
{
"epoch": 0.44091804675435325,
"grad_norm": 0.055466748774051666,
"learning_rate": 7.461307099685218e-05,
"loss": 0.1306,
"step": 8280
},
{
"epoch": 0.44145055647265563,
"grad_norm": 0.06467512249946594,
"learning_rate": 7.455379209402964e-05,
"loss": 0.1312,
"step": 8290
},
{
"epoch": 0.441983066190958,
"grad_norm": 0.10842160880565643,
"learning_rate": 7.44944835643407e-05,
"loss": 0.131,
"step": 8300
},
{
"epoch": 0.44251557590926033,
"grad_norm": 0.10142064094543457,
"learning_rate": 7.443514559170456e-05,
"loss": 0.1303,
"step": 8310
},
{
"epoch": 0.4430480856275627,
"grad_norm": 0.07040092349052429,
"learning_rate": 7.437577836013174e-05,
"loss": 0.1314,
"step": 8320
},
{
"epoch": 0.4435805953458651,
"grad_norm": 0.06632167845964432,
"learning_rate": 7.431638205372348e-05,
"loss": 0.1305,
"step": 8330
},
{
"epoch": 0.4441131050641674,
"grad_norm": 0.0742000862956047,
"learning_rate": 7.425695685667118e-05,
"loss": 0.1313,
"step": 8340
},
{
"epoch": 0.4446456147824698,
"grad_norm": 0.05238117650151253,
"learning_rate": 7.419750295325587e-05,
"loss": 0.131,
"step": 8350
},
{
"epoch": 0.44517812450077215,
"grad_norm": 0.06212290748953819,
"learning_rate": 7.413802052784756e-05,
"loss": 0.131,
"step": 8360
},
{
"epoch": 0.4457106342190745,
"grad_norm": 0.07771137356758118,
"learning_rate": 7.407850976490469e-05,
"loss": 0.1309,
"step": 8370
},
{
"epoch": 0.44624314393737685,
"grad_norm": 0.0551883801817894,
"learning_rate": 7.401897084897365e-05,
"loss": 0.1301,
"step": 8380
},
{
"epoch": 0.44677565365567923,
"grad_norm": 0.06460625678300858,
"learning_rate": 7.395940396468808e-05,
"loss": 0.1307,
"step": 8390
},
{
"epoch": 0.44730816337398155,
"grad_norm": 0.08054537326097488,
"learning_rate": 7.389980929676835e-05,
"loss": 0.1305,
"step": 8400
},
{
"epoch": 0.4478406730922839,
"grad_norm": 0.08456294983625412,
"learning_rate": 7.384018703002098e-05,
"loss": 0.1309,
"step": 8410
},
{
"epoch": 0.4483731828105863,
"grad_norm": 0.06319648027420044,
"learning_rate": 7.378053734933814e-05,
"loss": 0.1304,
"step": 8420
},
{
"epoch": 0.4489056925288887,
"grad_norm": 0.05323270335793495,
"learning_rate": 7.372086043969694e-05,
"loss": 0.1316,
"step": 8430
},
{
"epoch": 0.449438202247191,
"grad_norm": 0.05555250123143196,
"learning_rate": 7.366115648615898e-05,
"loss": 0.1313,
"step": 8440
},
{
"epoch": 0.4499707119654934,
"grad_norm": 0.08050252497196198,
"learning_rate": 7.360142567386968e-05,
"loss": 0.1303,
"step": 8450
},
{
"epoch": 0.45050322168379575,
"grad_norm": 0.0804496631026268,
"learning_rate": 7.354166818805776e-05,
"loss": 0.1305,
"step": 8460
},
{
"epoch": 0.4510357314020981,
"grad_norm": 0.09748142957687378,
"learning_rate": 7.34818842140347e-05,
"loss": 0.1308,
"step": 8470
},
{
"epoch": 0.45156824112040045,
"grad_norm": 0.0667809545993805,
"learning_rate": 7.34220739371941e-05,
"loss": 0.1312,
"step": 8480
},
{
"epoch": 0.4521007508387028,
"grad_norm": 0.08125482499599457,
"learning_rate": 7.336223754301105e-05,
"loss": 0.1311,
"step": 8490
},
{
"epoch": 0.45263326055700515,
"grad_norm": 0.057649750262498856,
"learning_rate": 7.330237521704177e-05,
"loss": 0.1307,
"step": 8500
},
{
"epoch": 0.4531657702753075,
"grad_norm": 0.06427519768476486,
"learning_rate": 7.324248714492279e-05,
"loss": 0.131,
"step": 8510
},
{
"epoch": 0.4536982799936099,
"grad_norm": 0.05290444567799568,
"learning_rate": 7.31825735123705e-05,
"loss": 0.1304,
"step": 8520
},
{
"epoch": 0.4542307897119122,
"grad_norm": 0.056924887001514435,
"learning_rate": 7.312263450518061e-05,
"loss": 0.1296,
"step": 8530
},
{
"epoch": 0.4547632994302146,
"grad_norm": 0.06351561844348907,
"learning_rate": 7.306267030922745e-05,
"loss": 0.1306,
"step": 8540
},
{
"epoch": 0.455295809148517,
"grad_norm": 0.08165629208087921,
"learning_rate": 7.300268111046348e-05,
"loss": 0.1307,
"step": 8550
},
{
"epoch": 0.4558283188668193,
"grad_norm": 0.059766896069049835,
"learning_rate": 7.294266709491873e-05,
"loss": 0.1305,
"step": 8560
},
{
"epoch": 0.45636082858512167,
"grad_norm": 0.05260982736945152,
"learning_rate": 7.288262844870013e-05,
"loss": 0.1301,
"step": 8570
},
{
"epoch": 0.45689333830342405,
"grad_norm": 0.06455428898334503,
"learning_rate": 7.282256535799106e-05,
"loss": 0.1304,
"step": 8580
},
{
"epoch": 0.4574258480217264,
"grad_norm": 0.05693411827087402,
"learning_rate": 7.276247800905063e-05,
"loss": 0.1304,
"step": 8590
},
{
"epoch": 0.45795835774002874,
"grad_norm": 0.05784597992897034,
"learning_rate": 7.270236658821322e-05,
"loss": 0.1308,
"step": 8600
},
{
"epoch": 0.4584908674583311,
"grad_norm": 0.10032429546117783,
"learning_rate": 7.264223128188789e-05,
"loss": 0.1308,
"step": 8610
},
{
"epoch": 0.4590233771766335,
"grad_norm": 0.0799618735909462,
"learning_rate": 7.258207227655768e-05,
"loss": 0.1314,
"step": 8620
},
{
"epoch": 0.4595558868949358,
"grad_norm": 0.08555562049150467,
"learning_rate": 7.25218897587792e-05,
"loss": 0.1301,
"step": 8630
},
{
"epoch": 0.4600883966132382,
"grad_norm": 0.06158687174320221,
"learning_rate": 7.246168391518196e-05,
"loss": 0.1302,
"step": 8640
},
{
"epoch": 0.46062090633154057,
"grad_norm": 0.06019744649529457,
"learning_rate": 7.240145493246776e-05,
"loss": 0.1304,
"step": 8650
},
{
"epoch": 0.4611534160498429,
"grad_norm": 0.06112377345561981,
"learning_rate": 7.234120299741021e-05,
"loss": 0.1301,
"step": 8660
},
{
"epoch": 0.46168592576814527,
"grad_norm": 0.04358561709523201,
"learning_rate": 7.228092829685406e-05,
"loss": 0.1299,
"step": 8670
},
{
"epoch": 0.46221843548644764,
"grad_norm": 0.04648636281490326,
"learning_rate": 7.22206310177147e-05,
"loss": 0.1296,
"step": 8680
},
{
"epoch": 0.46275094520474996,
"grad_norm": 0.040558718144893646,
"learning_rate": 7.216031134697747e-05,
"loss": 0.1307,
"step": 8690
},
{
"epoch": 0.46328345492305234,
"grad_norm": 0.04816916212439537,
"learning_rate": 7.209996947169719e-05,
"loss": 0.1307,
"step": 8700
},
{
"epoch": 0.4638159646413547,
"grad_norm": 0.08434905111789703,
"learning_rate": 7.203960557899758e-05,
"loss": 0.1304,
"step": 8710
},
{
"epoch": 0.46434847435965704,
"grad_norm": 0.04249223694205284,
"learning_rate": 7.197921985607055e-05,
"loss": 0.1307,
"step": 8720
},
{
"epoch": 0.4648809840779594,
"grad_norm": 0.04334559664130211,
"learning_rate": 7.191881249017574e-05,
"loss": 0.1301,
"step": 8730
},
{
"epoch": 0.4654134937962618,
"grad_norm": 0.06121005490422249,
"learning_rate": 7.185838366863995e-05,
"loss": 0.1307,
"step": 8740
},
{
"epoch": 0.46594600351456417,
"grad_norm": 0.05099225789308548,
"learning_rate": 7.179793357885645e-05,
"loss": 0.1305,
"step": 8750
},
{
"epoch": 0.4664785132328665,
"grad_norm": 0.04724998399615288,
"learning_rate": 7.173746240828451e-05,
"loss": 0.1295,
"step": 8760
},
{
"epoch": 0.46701102295116886,
"grad_norm": 0.04000856354832649,
"learning_rate": 7.167697034444874e-05,
"loss": 0.13,
"step": 8770
},
{
"epoch": 0.46754353266947124,
"grad_norm": 0.09295206516981125,
"learning_rate": 7.161645757493858e-05,
"loss": 0.1301,
"step": 8780
},
{
"epoch": 0.46807604238777356,
"grad_norm": 0.05277612432837486,
"learning_rate": 7.155592428740765e-05,
"loss": 0.1303,
"step": 8790
},
{
"epoch": 0.46860855210607594,
"grad_norm": 0.05306980386376381,
"learning_rate": 7.14953706695732e-05,
"loss": 0.1297,
"step": 8800
},
{
"epoch": 0.4691410618243783,
"grad_norm": 0.06097976118326187,
"learning_rate": 7.14347969092155e-05,
"loss": 0.1308,
"step": 8810
},
{
"epoch": 0.46967357154268063,
"grad_norm": 0.059332527220249176,
"learning_rate": 7.137420319417738e-05,
"loss": 0.1296,
"step": 8820
},
{
"epoch": 0.470206081260983,
"grad_norm": 0.09293901175260544,
"learning_rate": 7.131358971236344e-05,
"loss": 0.1296,
"step": 8830
},
{
"epoch": 0.4707385909792854,
"grad_norm": 0.046720948070287704,
"learning_rate": 7.125295665173964e-05,
"loss": 0.1304,
"step": 8840
},
{
"epoch": 0.4712711006975877,
"grad_norm": 0.06865198165178299,
"learning_rate": 7.119230420033259e-05,
"loss": 0.1306,
"step": 8850
},
{
"epoch": 0.4718036104158901,
"grad_norm": 0.11196744441986084,
"learning_rate": 7.113163254622915e-05,
"loss": 0.1301,
"step": 8860
},
{
"epoch": 0.47233612013419246,
"grad_norm": 0.056259218603372574,
"learning_rate": 7.107094187757559e-05,
"loss": 0.1298,
"step": 8870
},
{
"epoch": 0.4728686298524948,
"grad_norm": 0.06268846988677979,
"learning_rate": 7.101023238257725e-05,
"loss": 0.1303,
"step": 8880
},
{
"epoch": 0.47340113957079716,
"grad_norm": 0.05749877542257309,
"learning_rate": 7.094950424949784e-05,
"loss": 0.1305,
"step": 8890
},
{
"epoch": 0.47393364928909953,
"grad_norm": 0.05980097874999046,
"learning_rate": 7.088875766665879e-05,
"loss": 0.1299,
"step": 8900
},
{
"epoch": 0.4744661590074019,
"grad_norm": 0.048347923904657364,
"learning_rate": 7.082799282243881e-05,
"loss": 0.1302,
"step": 8910
},
{
"epoch": 0.47499866872570423,
"grad_norm": 0.0524616502225399,
"learning_rate": 7.076720990527324e-05,
"loss": 0.1301,
"step": 8920
},
{
"epoch": 0.4755311784440066,
"grad_norm": 0.06477531045675278,
"learning_rate": 7.070640910365344e-05,
"loss": 0.1306,
"step": 8930
},
{
"epoch": 0.476063688162309,
"grad_norm": 0.05950429290533066,
"learning_rate": 7.064559060612625e-05,
"loss": 0.13,
"step": 8940
},
{
"epoch": 0.4765961978806113,
"grad_norm": 0.0458899661898613,
"learning_rate": 7.058475460129337e-05,
"loss": 0.1299,
"step": 8950
},
{
"epoch": 0.4771287075989137,
"grad_norm": 0.04977622628211975,
"learning_rate": 7.05239012778108e-05,
"loss": 0.1297,
"step": 8960
},
{
"epoch": 0.47766121731721606,
"grad_norm": 0.052012983709573746,
"learning_rate": 7.046303082438823e-05,
"loss": 0.1304,
"step": 8970
},
{
"epoch": 0.4781937270355184,
"grad_norm": 0.09166349470615387,
"learning_rate": 7.040214342978851e-05,
"loss": 0.1303,
"step": 8980
},
{
"epoch": 0.47872623675382076,
"grad_norm": 0.057922665029764175,
"learning_rate": 7.034123928282699e-05,
"loss": 0.1292,
"step": 8990
},
{
"epoch": 0.47925874647212313,
"grad_norm": 0.05284808203577995,
"learning_rate": 7.028031857237098e-05,
"loss": 0.1299,
"step": 9000
},
{
"epoch": 0.47979125619042545,
"grad_norm": 0.05781892314553261,
"learning_rate": 7.021938148733918e-05,
"loss": 0.1304,
"step": 9010
},
{
"epoch": 0.48032376590872783,
"grad_norm": 0.04390615597367287,
"learning_rate": 7.0158428216701e-05,
"loss": 0.1295,
"step": 9020
},
{
"epoch": 0.4808562756270302,
"grad_norm": 0.06015874817967415,
"learning_rate": 7.009745894947612e-05,
"loss": 0.1299,
"step": 9030
},
{
"epoch": 0.4813887853453325,
"grad_norm": 0.17922475934028625,
"learning_rate": 7.003647387473378e-05,
"loss": 0.1299,
"step": 9040
},
{
"epoch": 0.4819212950636349,
"grad_norm": 0.07132676243782043,
"learning_rate": 6.997547318159225e-05,
"loss": 0.1304,
"step": 9050
},
{
"epoch": 0.4824538047819373,
"grad_norm": 0.06266484409570694,
"learning_rate": 6.991445705921825e-05,
"loss": 0.1302,
"step": 9060
},
{
"epoch": 0.48298631450023966,
"grad_norm": 0.04912625625729561,
"learning_rate": 6.985342569682632e-05,
"loss": 0.1299,
"step": 9070
},
{
"epoch": 0.483518824218542,
"grad_norm": 0.05088292434811592,
"learning_rate": 6.979237928367827e-05,
"loss": 0.1298,
"step": 9080
},
{
"epoch": 0.48405133393684435,
"grad_norm": 0.09333918988704681,
"learning_rate": 6.973131800908262e-05,
"loss": 0.13,
"step": 9090
},
{
"epoch": 0.48458384365514673,
"grad_norm": 0.05258602276444435,
"learning_rate": 6.967024206239392e-05,
"loss": 0.1292,
"step": 9100
},
{
"epoch": 0.48511635337344905,
"grad_norm": 0.05117359384894371,
"learning_rate": 6.960915163301222e-05,
"loss": 0.1298,
"step": 9110
},
{
"epoch": 0.4856488630917514,
"grad_norm": 0.0650695338845253,
"learning_rate": 6.954804691038255e-05,
"loss": 0.1302,
"step": 9120
},
{
"epoch": 0.4861813728100538,
"grad_norm": 0.07531211525201797,
"learning_rate": 6.948692808399417e-05,
"loss": 0.129,
"step": 9130
},
{
"epoch": 0.4867138825283561,
"grad_norm": 0.0522490069270134,
"learning_rate": 6.942579534338018e-05,
"loss": 0.1302,
"step": 9140
},
{
"epoch": 0.4872463922466585,
"grad_norm": 0.0909682959318161,
"learning_rate": 6.93646488781167e-05,
"loss": 0.1288,
"step": 9150
},
{
"epoch": 0.4877789019649609,
"grad_norm": 0.0672360509634018,
"learning_rate": 6.930348887782257e-05,
"loss": 0.1298,
"step": 9160
},
{
"epoch": 0.4883114116832632,
"grad_norm": 0.050222091376781464,
"learning_rate": 6.924231553215845e-05,
"loss": 0.1291,
"step": 9170
},
{
"epoch": 0.4888439214015656,
"grad_norm": 0.0731450617313385,
"learning_rate": 6.918112903082648e-05,
"loss": 0.1295,
"step": 9180
},
{
"epoch": 0.48937643111986795,
"grad_norm": 0.044536300003528595,
"learning_rate": 6.911992956356958e-05,
"loss": 0.1296,
"step": 9190
},
{
"epoch": 0.48990894083817027,
"grad_norm": 0.10119880735874176,
"learning_rate": 6.905871732017083e-05,
"loss": 0.1297,
"step": 9200
},
{
"epoch": 0.49044145055647265,
"grad_norm": 0.04427400976419449,
"learning_rate": 6.8997492490453e-05,
"loss": 0.1296,
"step": 9210
},
{
"epoch": 0.490973960274775,
"grad_norm": 0.05631903558969498,
"learning_rate": 6.893625526427785e-05,
"loss": 0.1294,
"step": 9220
},
{
"epoch": 0.4915064699930774,
"grad_norm": 0.05250485986471176,
"learning_rate": 6.88750058315456e-05,
"loss": 0.1288,
"step": 9230
},
{
"epoch": 0.4920389797113797,
"grad_norm": 0.04813829064369202,
"learning_rate": 6.881374438219426e-05,
"loss": 0.1299,
"step": 9240
},
{
"epoch": 0.4925714894296821,
"grad_norm": 0.10428118705749512,
"learning_rate": 6.875247110619923e-05,
"loss": 0.1293,
"step": 9250
},
{
"epoch": 0.4931039991479845,
"grad_norm": 0.05188250541687012,
"learning_rate": 6.869118619357244e-05,
"loss": 0.1298,
"step": 9260
},
{
"epoch": 0.4936365088662868,
"grad_norm": 0.06389789283275604,
"learning_rate": 6.862988983436205e-05,
"loss": 0.1297,
"step": 9270
},
{
"epoch": 0.49416901858458917,
"grad_norm": 0.05871303752064705,
"learning_rate": 6.856858221865158e-05,
"loss": 0.1296,
"step": 9280
},
{
"epoch": 0.49470152830289155,
"grad_norm": 0.09698927402496338,
"learning_rate": 6.850726353655956e-05,
"loss": 0.13,
"step": 9290
},
{
"epoch": 0.49523403802119387,
"grad_norm": 0.0667075663805008,
"learning_rate": 6.844593397823881e-05,
"loss": 0.1294,
"step": 9300
},
{
"epoch": 0.49576654773949624,
"grad_norm": 0.05773301422595978,
"learning_rate": 6.838459373387583e-05,
"loss": 0.1294,
"step": 9310
},
{
"epoch": 0.4962990574577986,
"grad_norm": 0.06608272343873978,
"learning_rate": 6.83232429936903e-05,
"loss": 0.1293,
"step": 9320
},
{
"epoch": 0.49683156717610094,
"grad_norm": 0.057207848876714706,
"learning_rate": 6.826188194793447e-05,
"loss": 0.1291,
"step": 9330
},
{
"epoch": 0.4973640768944033,
"grad_norm": 0.03619164600968361,
"learning_rate": 6.82005107868925e-05,
"loss": 0.1294,
"step": 9340
},
{
"epoch": 0.4978965866127057,
"grad_norm": 0.046284269541502,
"learning_rate": 6.813912970087994e-05,
"loss": 0.1294,
"step": 9350
},
{
"epoch": 0.498429096331008,
"grad_norm": 0.0476924329996109,
"learning_rate": 6.807773888024314e-05,
"loss": 0.1288,
"step": 9360
},
{
"epoch": 0.4989616060493104,
"grad_norm": 0.06622269749641418,
"learning_rate": 6.801633851535857e-05,
"loss": 0.1288,
"step": 9370
},
{
"epoch": 0.49949411576761277,
"grad_norm": 0.042118556797504425,
"learning_rate": 6.795492879663237e-05,
"loss": 0.1285,
"step": 9380
},
{
"epoch": 0.5000266254859151,
"grad_norm": 0.044616151601076126,
"learning_rate": 6.789350991449966e-05,
"loss": 0.1282,
"step": 9390
},
{
"epoch": 0.5005591352042175,
"grad_norm": 0.053620822727680206,
"learning_rate": 6.783208205942399e-05,
"loss": 0.1288,
"step": 9400
},
{
"epoch": 0.5010916449225198,
"grad_norm": 0.05040338635444641,
"learning_rate": 6.777064542189668e-05,
"loss": 0.1294,
"step": 9410
},
{
"epoch": 0.5016241546408222,
"grad_norm": 0.07730654627084732,
"learning_rate": 6.770920019243636e-05,
"loss": 0.1291,
"step": 9420
},
{
"epoch": 0.5021566643591245,
"grad_norm": 0.04234246164560318,
"learning_rate": 6.764774656158825e-05,
"loss": 0.1291,
"step": 9430
},
{
"epoch": 0.5026891740774269,
"grad_norm": 0.04666012525558472,
"learning_rate": 6.758628471992365e-05,
"loss": 0.1286,
"step": 9440
},
{
"epoch": 0.5032216837957293,
"grad_norm": 0.043177202343940735,
"learning_rate": 6.752481485803933e-05,
"loss": 0.1287,
"step": 9450
},
{
"epoch": 0.5037541935140316,
"grad_norm": 0.05593249201774597,
"learning_rate": 6.746333716655691e-05,
"loss": 0.1296,
"step": 9460
},
{
"epoch": 0.504286703232334,
"grad_norm": 0.050741735845804214,
"learning_rate": 6.740185183612227e-05,
"loss": 0.1286,
"step": 9470
},
{
"epoch": 0.5048192129506364,
"grad_norm": 0.08293752372264862,
"learning_rate": 6.734035905740504e-05,
"loss": 0.1289,
"step": 9480
},
{
"epoch": 0.5053517226689387,
"grad_norm": 0.11741827428340912,
"learning_rate": 6.727885902109785e-05,
"loss": 0.1286,
"step": 9490
},
{
"epoch": 0.5058842323872411,
"grad_norm": 0.05878937989473343,
"learning_rate": 6.7217351917916e-05,
"loss": 0.1288,
"step": 9500
},
{
"epoch": 0.5064167421055434,
"grad_norm": 0.04729843512177467,
"learning_rate": 6.715583793859652e-05,
"loss": 0.1291,
"step": 9510
},
{
"epoch": 0.5069492518238458,
"grad_norm": 0.04623175784945488,
"learning_rate": 6.709431727389789e-05,
"loss": 0.1275,
"step": 9520
},
{
"epoch": 0.5074817615421482,
"grad_norm": 0.047292426228523254,
"learning_rate": 6.703279011459927e-05,
"loss": 0.1285,
"step": 9530
},
{
"epoch": 0.5080142712604505,
"grad_norm": 0.04683827981352806,
"learning_rate": 6.697125665149993e-05,
"loss": 0.1283,
"step": 9540
},
{
"epoch": 0.5085467809787528,
"grad_norm": 0.08465840667486191,
"learning_rate": 6.69097170754188e-05,
"loss": 0.1289,
"step": 9550
},
{
"epoch": 0.5090792906970553,
"grad_norm": 0.04499583691358566,
"learning_rate": 6.684817157719364e-05,
"loss": 0.1295,
"step": 9560
},
{
"epoch": 0.5096118004153576,
"grad_norm": 0.05609264224767685,
"learning_rate": 6.678662034768063e-05,
"loss": 0.1291,
"step": 9570
},
{
"epoch": 0.5101443101336599,
"grad_norm": 0.04982760548591614,
"learning_rate": 6.672506357775375e-05,
"loss": 0.1287,
"step": 9580
},
{
"epoch": 0.5106768198519623,
"grad_norm": 0.04551566392183304,
"learning_rate": 6.666350145830413e-05,
"loss": 0.1287,
"step": 9590
},
{
"epoch": 0.5112093295702647,
"grad_norm": 0.06523692607879639,
"learning_rate": 6.660193418023947e-05,
"loss": 0.1289,
"step": 9600
},
{
"epoch": 0.511741839288567,
"grad_norm": 0.09148914366960526,
"learning_rate": 6.654036193448349e-05,
"loss": 0.1285,
"step": 9610
},
{
"epoch": 0.5122743490068694,
"grad_norm": 0.040613338351249695,
"learning_rate": 6.647878491197535e-05,
"loss": 0.1281,
"step": 9620
},
{
"epoch": 0.5128068587251717,
"grad_norm": 0.06947502493858337,
"learning_rate": 6.641720330366894e-05,
"loss": 0.1281,
"step": 9630
},
{
"epoch": 0.513339368443474,
"grad_norm": 0.060511503368616104,
"learning_rate": 6.635561730053245e-05,
"loss": 0.1285,
"step": 9640
},
{
"epoch": 0.5138718781617765,
"grad_norm": 0.06579563021659851,
"learning_rate": 6.629402709354766e-05,
"loss": 0.1282,
"step": 9650
},
{
"epoch": 0.5144043878800788,
"grad_norm": 0.055754803121089935,
"learning_rate": 6.62324328737094e-05,
"loss": 0.1284,
"step": 9660
},
{
"epoch": 0.5149368975983811,
"grad_norm": 0.05002092942595482,
"learning_rate": 6.617083483202493e-05,
"loss": 0.1295,
"step": 9670
},
{
"epoch": 0.5154694073166836,
"grad_norm": 0.03860372677445412,
"learning_rate": 6.610923315951336e-05,
"loss": 0.1284,
"step": 9680
},
{
"epoch": 0.5160019170349859,
"grad_norm": 0.08359445631504059,
"learning_rate": 6.604762804720508e-05,
"loss": 0.1285,
"step": 9690
},
{
"epoch": 0.5165344267532882,
"grad_norm": 0.05430614575743675,
"learning_rate": 6.598601968614115e-05,
"loss": 0.1283,
"step": 9700
},
{
"epoch": 0.5170669364715906,
"grad_norm": 0.05833563208580017,
"learning_rate": 6.592440826737266e-05,
"loss": 0.1289,
"step": 9710
},
{
"epoch": 0.517599446189893,
"grad_norm": 0.05940975248813629,
"learning_rate": 6.586279398196023e-05,
"loss": 0.1284,
"step": 9720
},
{
"epoch": 0.5181319559081953,
"grad_norm": 0.0453868992626667,
"learning_rate": 6.580117702097332e-05,
"loss": 0.1288,
"step": 9730
},
{
"epoch": 0.5186644656264977,
"grad_norm": 0.042583536356687546,
"learning_rate": 6.57395575754898e-05,
"loss": 0.1293,
"step": 9740
},
{
"epoch": 0.5191969753448,
"grad_norm": 0.05306556820869446,
"learning_rate": 6.567793583659507e-05,
"loss": 0.128,
"step": 9750
},
{
"epoch": 0.5197294850631023,
"grad_norm": 0.04358596354722977,
"learning_rate": 6.561631199538179e-05,
"loss": 0.129,
"step": 9760
},
{
"epoch": 0.5202619947814048,
"grad_norm": 0.11662445962429047,
"learning_rate": 6.555468624294907e-05,
"loss": 0.1276,
"step": 9770
},
{
"epoch": 0.5207945044997071,
"grad_norm": 0.050507139414548874,
"learning_rate": 6.549305877040199e-05,
"loss": 0.1291,
"step": 9780
},
{
"epoch": 0.5213270142180095,
"grad_norm": 0.059976451098918915,
"learning_rate": 6.543142976885088e-05,
"loss": 0.1279,
"step": 9790
},
{
"epoch": 0.5218595239363119,
"grad_norm": 0.04601925238966942,
"learning_rate": 6.536979942941091e-05,
"loss": 0.1288,
"step": 9800
},
{
"epoch": 0.5223920336546142,
"grad_norm": 0.05751890689134598,
"learning_rate": 6.530816794320134e-05,
"loss": 0.1283,
"step": 9810
},
{
"epoch": 0.5229245433729166,
"grad_norm": 0.05591721832752228,
"learning_rate": 6.524653550134501e-05,
"loss": 0.1287,
"step": 9820
},
{
"epoch": 0.5234570530912189,
"grad_norm": 0.05766240507364273,
"learning_rate": 6.518490229496772e-05,
"loss": 0.1285,
"step": 9830
},
{
"epoch": 0.5239895628095212,
"grad_norm": 0.054135777056217194,
"learning_rate": 6.512326851519762e-05,
"loss": 0.1287,
"step": 9840
},
{
"epoch": 0.5245220725278237,
"grad_norm": 0.04491560161113739,
"learning_rate": 6.506163435316468e-05,
"loss": 0.1276,
"step": 9850
},
{
"epoch": 0.525054582246126,
"grad_norm": 0.044994186609983444,
"learning_rate": 6.5e-05,
"loss": 0.1286,
"step": 9860
},
{
"epoch": 0.5255870919644283,
"grad_norm": 0.0446479506790638,
"learning_rate": 6.493836564683533e-05,
"loss": 0.1286,
"step": 9870
},
{
"epoch": 0.5261196016827308,
"grad_norm": 0.06419171392917633,
"learning_rate": 6.48767314848024e-05,
"loss": 0.1283,
"step": 9880
},
{
"epoch": 0.5266521114010331,
"grad_norm": 0.041707735508680344,
"learning_rate": 6.481509770503229e-05,
"loss": 0.1275,
"step": 9890
},
{
"epoch": 0.5271846211193354,
"grad_norm": 0.08214934170246124,
"learning_rate": 6.475346449865499e-05,
"loss": 0.1287,
"step": 9900
},
{
"epoch": 0.5277171308376378,
"grad_norm": 0.09313659369945526,
"learning_rate": 6.469183205679865e-05,
"loss": 0.1277,
"step": 9910
},
{
"epoch": 0.5282496405559401,
"grad_norm": 0.05460633337497711,
"learning_rate": 6.46302005705891e-05,
"loss": 0.1287,
"step": 9920
},
{
"epoch": 0.5287821502742425,
"grad_norm": 0.0486149825155735,
"learning_rate": 6.456857023114913e-05,
"loss": 0.1276,
"step": 9930
},
{
"epoch": 0.5293146599925449,
"grad_norm": 0.04761586710810661,
"learning_rate": 6.450694122959801e-05,
"loss": 0.1287,
"step": 9940
},
{
"epoch": 0.5298471697108472,
"grad_norm": 0.04752049222588539,
"learning_rate": 6.444531375705092e-05,
"loss": 0.1285,
"step": 9950
},
{
"epoch": 0.5303796794291495,
"grad_norm": 0.06729278713464737,
"learning_rate": 6.438368800461821e-05,
"loss": 0.1286,
"step": 9960
},
{
"epoch": 0.530912189147452,
"grad_norm": 0.04480813071131706,
"learning_rate": 6.432206416340492e-05,
"loss": 0.1284,
"step": 9970
},
{
"epoch": 0.5314446988657543,
"grad_norm": 0.040219422429800034,
"learning_rate": 6.426044242451022e-05,
"loss": 0.1287,
"step": 9980
},
{
"epoch": 0.5319772085840566,
"grad_norm": 0.03565455228090286,
"learning_rate": 6.419882297902667e-05,
"loss": 0.1284,
"step": 9990
},
{
"epoch": 0.532509718302359,
"grad_norm": 0.05310383439064026,
"learning_rate": 6.413720601803979e-05,
"loss": 0.1285,
"step": 10000
},
{
"epoch": 0.5330422280206614,
"grad_norm": 0.07043947279453278,
"learning_rate": 6.407559173262735e-05,
"loss": 0.1283,
"step": 10010
},
{
"epoch": 0.5335747377389637,
"grad_norm": 0.04902435466647148,
"learning_rate": 6.401398031385886e-05,
"loss": 0.1276,
"step": 10020
},
{
"epoch": 0.5341072474572661,
"grad_norm": 0.05668781325221062,
"learning_rate": 6.395237195279491e-05,
"loss": 0.1283,
"step": 10030
},
{
"epoch": 0.5346397571755684,
"grad_norm": 0.05145740881562233,
"learning_rate": 6.389076684048664e-05,
"loss": 0.1264,
"step": 10040
},
{
"epoch": 0.5351722668938708,
"grad_norm": 0.1139606162905693,
"learning_rate": 6.382916516797508e-05,
"loss": 0.1283,
"step": 10050
},
{
"epoch": 0.5357047766121732,
"grad_norm": 0.05299168825149536,
"learning_rate": 6.376756712629059e-05,
"loss": 0.1281,
"step": 10060
},
{
"epoch": 0.5362372863304755,
"grad_norm": 0.06942315399646759,
"learning_rate": 6.370597290645234e-05,
"loss": 0.1281,
"step": 10070
},
{
"epoch": 0.5367697960487778,
"grad_norm": 0.07276537269353867,
"learning_rate": 6.364438269946755e-05,
"loss": 0.1284,
"step": 10080
},
{
"epoch": 0.5373023057670803,
"grad_norm": 0.043881241232156754,
"learning_rate": 6.358279669633106e-05,
"loss": 0.1275,
"step": 10090
},
{
"epoch": 0.5378348154853826,
"grad_norm": 0.047917068004608154,
"learning_rate": 6.352121508802467e-05,
"loss": 0.1282,
"step": 10100
},
{
"epoch": 0.538367325203685,
"grad_norm": 0.04417307674884796,
"learning_rate": 6.345963806551651e-05,
"loss": 0.1281,
"step": 10110
},
{
"epoch": 0.5388998349219873,
"grad_norm": 0.053708259016275406,
"learning_rate": 6.339806581976055e-05,
"loss": 0.1286,
"step": 10120
},
{
"epoch": 0.5394323446402897,
"grad_norm": 0.05327571928501129,
"learning_rate": 6.333649854169587e-05,
"loss": 0.1284,
"step": 10130
},
{
"epoch": 0.5399648543585921,
"grad_norm": 0.062333524227142334,
"learning_rate": 6.327493642224624e-05,
"loss": 0.1281,
"step": 10140
},
{
"epoch": 0.5404973640768944,
"grad_norm": 0.04436059668660164,
"learning_rate": 6.321337965231937e-05,
"loss": 0.1278,
"step": 10150
},
{
"epoch": 0.5410298737951967,
"grad_norm": 0.07489614933729172,
"learning_rate": 6.315182842280638e-05,
"loss": 0.1284,
"step": 10160
},
{
"epoch": 0.5415623835134992,
"grad_norm": 0.06262974441051483,
"learning_rate": 6.309028292458122e-05,
"loss": 0.1269,
"step": 10170
},
{
"epoch": 0.5420948932318015,
"grad_norm": 0.08940589427947998,
"learning_rate": 6.302874334850006e-05,
"loss": 0.128,
"step": 10180
},
{
"epoch": 0.5426274029501038,
"grad_norm": 0.039577096700668335,
"learning_rate": 6.296720988540075e-05,
"loss": 0.1273,
"step": 10190
},
{
"epoch": 0.5431599126684062,
"grad_norm": 0.05988942086696625,
"learning_rate": 6.290568272610211e-05,
"loss": 0.1276,
"step": 10200
},
{
"epoch": 0.5436924223867086,
"grad_norm": 0.047797802835702896,
"learning_rate": 6.284416206140348e-05,
"loss": 0.1278,
"step": 10210
},
{
"epoch": 0.5442249321050109,
"grad_norm": 0.05901528522372246,
"learning_rate": 6.278264808208402e-05,
"loss": 0.1283,
"step": 10220
},
{
"epoch": 0.5447574418233133,
"grad_norm": 0.10273321717977524,
"learning_rate": 6.272114097890213e-05,
"loss": 0.1279,
"step": 10230
},
{
"epoch": 0.5452899515416156,
"grad_norm": 0.07229287177324295,
"learning_rate": 6.265964094259498e-05,
"loss": 0.1283,
"step": 10240
},
{
"epoch": 0.545822461259918,
"grad_norm": 0.04700973257422447,
"learning_rate": 6.259814816387775e-05,
"loss": 0.1276,
"step": 10250
},
{
"epoch": 0.5463549709782204,
"grad_norm": 0.10428871214389801,
"learning_rate": 6.25366628334431e-05,
"loss": 0.1285,
"step": 10260
},
{
"epoch": 0.5468874806965227,
"grad_norm": 0.048143282532691956,
"learning_rate": 6.247518514196067e-05,
"loss": 0.1275,
"step": 10270
},
{
"epoch": 0.547419990414825,
"grad_norm": 0.054553814232349396,
"learning_rate": 6.241371528007634e-05,
"loss": 0.1281,
"step": 10280
},
{
"epoch": 0.5479525001331275,
"grad_norm": 0.07188910245895386,
"learning_rate": 6.235225343841174e-05,
"loss": 0.1276,
"step": 10290
},
{
"epoch": 0.5484850098514298,
"grad_norm": 0.05446217581629753,
"learning_rate": 6.229079980756365e-05,
"loss": 0.1283,
"step": 10300
},
{
"epoch": 0.5490175195697321,
"grad_norm": 0.041187744587659836,
"learning_rate": 6.222935457810333e-05,
"loss": 0.1277,
"step": 10310
},
{
"epoch": 0.5495500292880345,
"grad_norm": 0.05523503199219704,
"learning_rate": 6.216791794057601e-05,
"loss": 0.1276,
"step": 10320
},
{
"epoch": 0.5500825390063369,
"grad_norm": 0.044267792254686356,
"learning_rate": 6.210649008550033e-05,
"loss": 0.1279,
"step": 10330
},
{
"epoch": 0.5506150487246392,
"grad_norm": 0.04887842759490013,
"learning_rate": 6.204507120336764e-05,
"loss": 0.128,
"step": 10340
},
{
"epoch": 0.5511475584429416,
"grad_norm": 0.04334214702248573,
"learning_rate": 6.198366148464143e-05,
"loss": 0.1276,
"step": 10350
},
{
"epoch": 0.5516800681612439,
"grad_norm": 0.05289037153124809,
"learning_rate": 6.192226111975687e-05,
"loss": 0.1275,
"step": 10360
},
{
"epoch": 0.5522125778795463,
"grad_norm": 0.05000938102602959,
"learning_rate": 6.186087029912005e-05,
"loss": 0.1277,
"step": 10370
},
{
"epoch": 0.5527450875978487,
"grad_norm": 0.05687737837433815,
"learning_rate": 6.179948921310749e-05,
"loss": 0.1282,
"step": 10380
},
{
"epoch": 0.553277597316151,
"grad_norm": 0.040263786911964417,
"learning_rate": 6.173811805206553e-05,
"loss": 0.1275,
"step": 10390
},
{
"epoch": 0.5538101070344533,
"grad_norm": 0.056092556565999985,
"learning_rate": 6.16767570063097e-05,
"loss": 0.1272,
"step": 10400
},
{
"epoch": 0.5543426167527558,
"grad_norm": 0.04456920921802521,
"learning_rate": 6.161540626612419e-05,
"loss": 0.1271,
"step": 10410
},
{
"epoch": 0.5548751264710581,
"grad_norm": 0.05201718211174011,
"learning_rate": 6.15540660217612e-05,
"loss": 0.1273,
"step": 10420
},
{
"epoch": 0.5554076361893605,
"grad_norm": 0.045165225863456726,
"learning_rate": 6.149273646344044e-05,
"loss": 0.1271,
"step": 10430
},
{
"epoch": 0.5559401459076628,
"grad_norm": 0.07475852966308594,
"learning_rate": 6.14314177813484e-05,
"loss": 0.128,
"step": 10440
},
{
"epoch": 0.5564726556259652,
"grad_norm": 0.08229029923677444,
"learning_rate": 6.137011016563797e-05,
"loss": 0.1277,
"step": 10450
},
{
"epoch": 0.5570051653442676,
"grad_norm": 0.09118565171957016,
"learning_rate": 6.130881380642755e-05,
"loss": 0.1277,
"step": 10460
},
{
"epoch": 0.5575376750625699,
"grad_norm": 0.04762515053153038,
"learning_rate": 6.124752889380079e-05,
"loss": 0.1275,
"step": 10470
},
{
"epoch": 0.5580701847808722,
"grad_norm": 0.03888937830924988,
"learning_rate": 6.118625561780574e-05,
"loss": 0.1275,
"step": 10480
},
{
"epoch": 0.5586026944991747,
"grad_norm": 0.04357834532856941,
"learning_rate": 6.112499416845443e-05,
"loss": 0.1279,
"step": 10490
},
{
"epoch": 0.559135204217477,
"grad_norm": 0.06639399379491806,
"learning_rate": 6.106374473572216e-05,
"loss": 0.128,
"step": 10500
},
{
"epoch": 0.5596677139357793,
"grad_norm": 0.051041729748249054,
"learning_rate": 6.100250750954699e-05,
"loss": 0.128,
"step": 10510
},
{
"epoch": 0.5602002236540817,
"grad_norm": 0.08065960556268692,
"learning_rate": 6.094128267982916e-05,
"loss": 0.1275,
"step": 10520
},
{
"epoch": 0.560732733372384,
"grad_norm": 0.04977899789810181,
"learning_rate": 6.0880070436430424e-05,
"loss": 0.1283,
"step": 10530
},
{
"epoch": 0.5612652430906864,
"grad_norm": 0.04065399989485741,
"learning_rate": 6.081887096917351e-05,
"loss": 0.1272,
"step": 10540
},
{
"epoch": 0.5617977528089888,
"grad_norm": 0.05486559495329857,
"learning_rate": 6.075768446784154e-05,
"loss": 0.1273,
"step": 10550
},
{
"epoch": 0.5623302625272911,
"grad_norm": 0.053763121366500854,
"learning_rate": 6.0696511122177436e-05,
"loss": 0.1275,
"step": 10560
},
{
"epoch": 0.5628627722455934,
"grad_norm": 0.049751050770282745,
"learning_rate": 6.063535112188329e-05,
"loss": 0.1263,
"step": 10570
},
{
"epoch": 0.5633952819638959,
"grad_norm": 0.036564771085977554,
"learning_rate": 6.057420465661982e-05,
"loss": 0.128,
"step": 10580
},
{
"epoch": 0.5639277916821982,
"grad_norm": 0.05281112715601921,
"learning_rate": 6.051307191600581e-05,
"loss": 0.1269,
"step": 10590
},
{
"epoch": 0.5644603014005005,
"grad_norm": 0.053066980093717575,
"learning_rate": 6.045195308961746e-05,
"loss": 0.1269,
"step": 10600
},
{
"epoch": 0.564992811118803,
"grad_norm": 0.05914291366934776,
"learning_rate": 6.039084836698779e-05,
"loss": 0.1275,
"step": 10610
},
{
"epoch": 0.5655253208371053,
"grad_norm": 0.06061727926135063,
"learning_rate": 6.032975793760609e-05,
"loss": 0.1274,
"step": 10620
},
{
"epoch": 0.5660578305554076,
"grad_norm": 0.06171563267707825,
"learning_rate": 6.026868199091737e-05,
"loss": 0.1273,
"step": 10630
},
{
"epoch": 0.56659034027371,
"grad_norm": 0.07542983442544937,
"learning_rate": 6.020762071632172e-05,
"loss": 0.1276,
"step": 10640
},
{
"epoch": 0.5671228499920123,
"grad_norm": 0.04028952494263649,
"learning_rate": 6.014657430317368e-05,
"loss": 0.1267,
"step": 10650
},
{
"epoch": 0.5676553597103147,
"grad_norm": 0.05201804265379906,
"learning_rate": 6.0085542940781755e-05,
"loss": 0.1273,
"step": 10660
},
{
"epoch": 0.5681878694286171,
"grad_norm": 0.047102462500333786,
"learning_rate": 6.0024526818407745e-05,
"loss": 0.1267,
"step": 10670
},
{
"epoch": 0.5687203791469194,
"grad_norm": 0.06168229877948761,
"learning_rate": 5.996352612526623e-05,
"loss": 0.1272,
"step": 10680
},
{
"epoch": 0.5692528888652217,
"grad_norm": 0.05583483725786209,
"learning_rate": 5.9902541050523886e-05,
"loss": 0.1278,
"step": 10690
},
{
"epoch": 0.5697853985835242,
"grad_norm": 0.06403730064630508,
"learning_rate": 5.9841571783299e-05,
"loss": 0.1273,
"step": 10700
},
{
"epoch": 0.5703179083018265,
"grad_norm": 0.042866677045822144,
"learning_rate": 5.9780618512660834e-05,
"loss": 0.1275,
"step": 10710
},
{
"epoch": 0.5708504180201288,
"grad_norm": 0.05416185408830643,
"learning_rate": 5.971968142762903e-05,
"loss": 0.1276,
"step": 10720
},
{
"epoch": 0.5713829277384312,
"grad_norm": 0.04284673184156418,
"learning_rate": 5.9658760717173e-05,
"loss": 0.1276,
"step": 10730
},
{
"epoch": 0.5719154374567336,
"grad_norm": 0.050528384745121,
"learning_rate": 5.959785657021149e-05,
"loss": 0.1272,
"step": 10740
},
{
"epoch": 0.572447947175036,
"grad_norm": 0.0541527085006237,
"learning_rate": 5.953696917561178e-05,
"loss": 0.1272,
"step": 10750
},
{
"epoch": 0.5729804568933383,
"grad_norm": 0.04789347946643829,
"learning_rate": 5.947609872218922e-05,
"loss": 0.1276,
"step": 10760
},
{
"epoch": 0.5735129666116406,
"grad_norm": 0.04346901550889015,
"learning_rate": 5.9415245398706645e-05,
"loss": 0.1273,
"step": 10770
},
{
"epoch": 0.5740454763299431,
"grad_norm": 0.03955162316560745,
"learning_rate": 5.9354409393873756e-05,
"loss": 0.1271,
"step": 10780
},
{
"epoch": 0.5745779860482454,
"grad_norm": 0.05987564101815224,
"learning_rate": 5.929359089634657e-05,
"loss": 0.1269,
"step": 10790
},
{
"epoch": 0.5751104957665477,
"grad_norm": 0.0456010103225708,
"learning_rate": 5.923279009472678e-05,
"loss": 0.1269,
"step": 10800
},
{
"epoch": 0.5756430054848501,
"grad_norm": 0.062419842928647995,
"learning_rate": 5.9172007177561194e-05,
"loss": 0.1274,
"step": 10810
},
{
"epoch": 0.5761755152031525,
"grad_norm": 0.0384056381881237,
"learning_rate": 5.911124233334122e-05,
"loss": 0.1274,
"step": 10820
},
{
"epoch": 0.5767080249214548,
"grad_norm": 0.06525867432355881,
"learning_rate": 5.905049575050218e-05,
"loss": 0.1271,
"step": 10830
},
{
"epoch": 0.5772405346397572,
"grad_norm": 0.05695752054452896,
"learning_rate": 5.8989767617422744e-05,
"loss": 0.1279,
"step": 10840
},
{
"epoch": 0.5777730443580595,
"grad_norm": 0.05444275960326195,
"learning_rate": 5.8929058122424406e-05,
"loss": 0.1273,
"step": 10850
},
{
"epoch": 0.5783055540763619,
"grad_norm": 0.051563095301389694,
"learning_rate": 5.886836745377087e-05,
"loss": 0.1273,
"step": 10860
},
{
"epoch": 0.5788380637946643,
"grad_norm": 0.07469698041677475,
"learning_rate": 5.8807695799667416e-05,
"loss": 0.1272,
"step": 10870
},
{
"epoch": 0.5793705735129666,
"grad_norm": 0.049753960222005844,
"learning_rate": 5.874704334826038e-05,
"loss": 0.1276,
"step": 10880
},
{
"epoch": 0.5799030832312689,
"grad_norm": 0.04683075100183487,
"learning_rate": 5.8686410287636575e-05,
"loss": 0.1273,
"step": 10890
},
{
"epoch": 0.5804355929495714,
"grad_norm": 0.04198311269283295,
"learning_rate": 5.862579680582263e-05,
"loss": 0.1277,
"step": 10900
},
{
"epoch": 0.5809681026678737,
"grad_norm": 0.045857448130846024,
"learning_rate": 5.8565203090784484e-05,
"loss": 0.1271,
"step": 10910
},
{
"epoch": 0.581500612386176,
"grad_norm": 0.040586717426776886,
"learning_rate": 5.8504629330426816e-05,
"loss": 0.1274,
"step": 10920
},
{
"epoch": 0.5820331221044784,
"grad_norm": 0.050801508128643036,
"learning_rate": 5.844407571259235e-05,
"loss": 0.127,
"step": 10930
},
{
"epoch": 0.5825656318227808,
"grad_norm": 0.050907645374536514,
"learning_rate": 5.8383542425061424e-05,
"loss": 0.1272,
"step": 10940
},
{
"epoch": 0.5830981415410831,
"grad_norm": 0.08759643882513046,
"learning_rate": 5.8323029655551266e-05,
"loss": 0.1267,
"step": 10950
},
{
"epoch": 0.5836306512593855,
"grad_norm": 0.05688736215233803,
"learning_rate": 5.8262537591715493e-05,
"loss": 0.1275,
"step": 10960
},
{
"epoch": 0.5841631609776878,
"grad_norm": 0.05095606669783592,
"learning_rate": 5.820206642114355e-05,
"loss": 0.1271,
"step": 10970
},
{
"epoch": 0.5846956706959902,
"grad_norm": 0.07440601289272308,
"learning_rate": 5.814161633136006e-05,
"loss": 0.1275,
"step": 10980
},
{
"epoch": 0.5852281804142926,
"grad_norm": 0.07147916406393051,
"learning_rate": 5.808118750982427e-05,
"loss": 0.1276,
"step": 10990
},
{
"epoch": 0.5857606901325949,
"grad_norm": 0.06379908323287964,
"learning_rate": 5.802078014392946e-05,
"loss": 0.1269,
"step": 11000
},
{
"epoch": 0.5862931998508972,
"grad_norm": 0.09271499514579773,
"learning_rate": 5.796039442100243e-05,
"loss": 0.126,
"step": 11010
},
{
"epoch": 0.5868257095691997,
"grad_norm": 0.04959186539053917,
"learning_rate": 5.7900030528302804e-05,
"loss": 0.1277,
"step": 11020
},
{
"epoch": 0.587358219287502,
"grad_norm": 0.03943556919693947,
"learning_rate": 5.783968865302254e-05,
"loss": 0.1264,
"step": 11030
},
{
"epoch": 0.5878907290058043,
"grad_norm": 0.04361870139837265,
"learning_rate": 5.777936898228531e-05,
"loss": 0.1276,
"step": 11040
},
{
"epoch": 0.5884232387241067,
"grad_norm": 0.0441637746989727,
"learning_rate": 5.771907170314593e-05,
"loss": 0.1274,
"step": 11050
},
{
"epoch": 0.588955748442409,
"grad_norm": 0.048377875238657,
"learning_rate": 5.7658797002589796e-05,
"loss": 0.1271,
"step": 11060
},
{
"epoch": 0.5894882581607115,
"grad_norm": 0.04017746075987816,
"learning_rate": 5.759854506753224e-05,
"loss": 0.1271,
"step": 11070
},
{
"epoch": 0.5900207678790138,
"grad_norm": 0.08903171867132187,
"learning_rate": 5.753831608481803e-05,
"loss": 0.1265,
"step": 11080
},
{
"epoch": 0.5905532775973161,
"grad_norm": 0.05974121764302254,
"learning_rate": 5.747811024122081e-05,
"loss": 0.127,
"step": 11090
},
{
"epoch": 0.5910857873156186,
"grad_norm": 0.04399004206061363,
"learning_rate": 5.741792772344232e-05,
"loss": 0.127,
"step": 11100
},
{
"epoch": 0.5916182970339209,
"grad_norm": 0.042719513177871704,
"learning_rate": 5.7357768718112114e-05,
"loss": 0.1271,
"step": 11110
},
{
"epoch": 0.5921508067522232,
"grad_norm": 0.04649467021226883,
"learning_rate": 5.729763341178678e-05,
"loss": 0.1275,
"step": 11120
},
{
"epoch": 0.5926833164705256,
"grad_norm": 0.061461612582206726,
"learning_rate": 5.723752199094938e-05,
"loss": 0.127,
"step": 11130
},
{
"epoch": 0.593215826188828,
"grad_norm": 0.09266576170921326,
"learning_rate": 5.717743464200895e-05,
"loss": 0.1276,
"step": 11140
},
{
"epoch": 0.5937483359071303,
"grad_norm": 0.039458803832530975,
"learning_rate": 5.711737155129987e-05,
"loss": 0.1273,
"step": 11150
},
{
"epoch": 0.5942808456254327,
"grad_norm": 0.0349600687623024,
"learning_rate": 5.7057332905081275e-05,
"loss": 0.127,
"step": 11160
},
{
"epoch": 0.594813355343735,
"grad_norm": 0.07518605887889862,
"learning_rate": 5.699731888953653e-05,
"loss": 0.1257,
"step": 11170
},
{
"epoch": 0.5953458650620373,
"grad_norm": 0.056736767292022705,
"learning_rate": 5.6937329690772554e-05,
"loss": 0.1275,
"step": 11180
},
{
"epoch": 0.5958783747803398,
"grad_norm": 0.0569731779396534,
"learning_rate": 5.687736549481939e-05,
"loss": 0.1269,
"step": 11190
},
{
"epoch": 0.5964108844986421,
"grad_norm": 0.06685450673103333,
"learning_rate": 5.681742648762949e-05,
"loss": 0.1275,
"step": 11200
},
{
"epoch": 0.5969433942169444,
"grad_norm": 0.05476146563887596,
"learning_rate": 5.675751285507722e-05,
"loss": 0.1266,
"step": 11210
},
{
"epoch": 0.5974759039352469,
"grad_norm": 0.04533839598298073,
"learning_rate": 5.6697624782958235e-05,
"loss": 0.1273,
"step": 11220
},
{
"epoch": 0.5980084136535492,
"grad_norm": 0.04754569008946419,
"learning_rate": 5.6637762456988943e-05,
"loss": 0.126,
"step": 11230
},
{
"epoch": 0.5985409233718515,
"grad_norm": 0.06229964643716812,
"learning_rate": 5.657792606280592e-05,
"loss": 0.126,
"step": 11240
},
{
"epoch": 0.5990734330901539,
"grad_norm": 0.057759564369916916,
"learning_rate": 5.65181157859653e-05,
"loss": 0.1274,
"step": 11250
},
{
"epoch": 0.5996059428084562,
"grad_norm": 0.05225152522325516,
"learning_rate": 5.6458331811942235e-05,
"loss": 0.1264,
"step": 11260
},
{
"epoch": 0.6001384525267586,
"grad_norm": 0.0388801135122776,
"learning_rate": 5.639857432613034e-05,
"loss": 0.127,
"step": 11270
},
{
"epoch": 0.600670962245061,
"grad_norm": 0.044373005628585815,
"learning_rate": 5.633884351384103e-05,
"loss": 0.1274,
"step": 11280
},
{
"epoch": 0.6012034719633633,
"grad_norm": 0.07331310957670212,
"learning_rate": 5.627913956030306e-05,
"loss": 0.1272,
"step": 11290
},
{
"epoch": 0.6017359816816656,
"grad_norm": 0.05038286745548248,
"learning_rate": 5.621946265066187e-05,
"loss": 0.1273,
"step": 11300
},
{
"epoch": 0.6022684913999681,
"grad_norm": 0.058994751423597336,
"learning_rate": 5.615981296997901e-05,
"loss": 0.1273,
"step": 11310
},
{
"epoch": 0.6028010011182704,
"grad_norm": 0.036481309682130814,
"learning_rate": 5.610019070323167e-05,
"loss": 0.1271,
"step": 11320
},
{
"epoch": 0.6033335108365727,
"grad_norm": 0.03966144099831581,
"learning_rate": 5.604059603531193e-05,
"loss": 0.127,
"step": 11330
},
{
"epoch": 0.6038660205548751,
"grad_norm": 0.055649157613515854,
"learning_rate": 5.5981029151026345e-05,
"loss": 0.1277,
"step": 11340
},
{
"epoch": 0.6043985302731775,
"grad_norm": 0.05199277400970459,
"learning_rate": 5.59214902350953e-05,
"loss": 0.1267,
"step": 11350
},
{
"epoch": 0.6049310399914798,
"grad_norm": 0.04140447452664375,
"learning_rate": 5.586197947215246e-05,
"loss": 0.1263,
"step": 11360
},
{
"epoch": 0.6054635497097822,
"grad_norm": 0.03502384573221207,
"learning_rate": 5.580249704674414e-05,
"loss": 0.1268,
"step": 11370
},
{
"epoch": 0.6059960594280845,
"grad_norm": 0.04866914451122284,
"learning_rate": 5.574304314332881e-05,
"loss": 0.1258,
"step": 11380
},
{
"epoch": 0.606528569146387,
"grad_norm": 0.039416830986738205,
"learning_rate": 5.568361794627652e-05,
"loss": 0.1266,
"step": 11390
},
{
"epoch": 0.6070610788646893,
"grad_norm": 0.04279816523194313,
"learning_rate": 5.562422163986827e-05,
"loss": 0.1265,
"step": 11400
},
{
"epoch": 0.6075935885829916,
"grad_norm": 0.08062811195850372,
"learning_rate": 5.556485440829544e-05,
"loss": 0.1266,
"step": 11410
},
{
"epoch": 0.608126098301294,
"grad_norm": 0.042986951768398285,
"learning_rate": 5.550551643565931e-05,
"loss": 0.1269,
"step": 11420
},
{
"epoch": 0.6086586080195964,
"grad_norm": 0.056119028478860855,
"learning_rate": 5.544620790597037e-05,
"loss": 0.1273,
"step": 11430
},
{
"epoch": 0.6091911177378987,
"grad_norm": 0.0498378686606884,
"learning_rate": 5.5386929003147835e-05,
"loss": 0.1273,
"step": 11440
},
{
"epoch": 0.6097236274562011,
"grad_norm": 0.04148755222558975,
"learning_rate": 5.5327679911019034e-05,
"loss": 0.1265,
"step": 11450
},
{
"epoch": 0.6102561371745034,
"grad_norm": 0.04891781508922577,
"learning_rate": 5.5268460813318866e-05,
"loss": 0.127,
"step": 11460
},
{
"epoch": 0.6107886468928058,
"grad_norm": 0.07420381903648376,
"learning_rate": 5.520927189368923e-05,
"loss": 0.1268,
"step": 11470
},
{
"epoch": 0.6113211566111082,
"grad_norm": 0.12143438309431076,
"learning_rate": 5.5150113335678365e-05,
"loss": 0.1273,
"step": 11480
},
{
"epoch": 0.6118536663294105,
"grad_norm": 0.045520998537540436,
"learning_rate": 5.509098532274044e-05,
"loss": 0.127,
"step": 11490
},
{
"epoch": 0.6123861760477128,
"grad_norm": 0.05857592076063156,
"learning_rate": 5.503188803823487e-05,
"loss": 0.1269,
"step": 11500
},
{
"epoch": 0.6129186857660153,
"grad_norm": 0.04991764947772026,
"learning_rate": 5.497282166542579e-05,
"loss": 0.1267,
"step": 11510
},
{
"epoch": 0.6134511954843176,
"grad_norm": 0.059020016342401505,
"learning_rate": 5.4913786387481426e-05,
"loss": 0.127,
"step": 11520
},
{
"epoch": 0.6139837052026199,
"grad_norm": 0.06219782307744026,
"learning_rate": 5.485478238747367e-05,
"loss": 0.1268,
"step": 11530
},
{
"epoch": 0.6145162149209223,
"grad_norm": 0.060054097324609756,
"learning_rate": 5.4795809848377323e-05,
"loss": 0.1266,
"step": 11540
},
{
"epoch": 0.6150487246392247,
"grad_norm": 0.05419805273413658,
"learning_rate": 5.473686895306971e-05,
"loss": 0.1269,
"step": 11550
},
{
"epoch": 0.615581234357527,
"grad_norm": 0.07475223392248154,
"learning_rate": 5.4677959884329944e-05,
"loss": 0.1264,
"step": 11560
},
{
"epoch": 0.6161137440758294,
"grad_norm": 0.052985042333602905,
"learning_rate": 5.4619082824838506e-05,
"loss": 0.1266,
"step": 11570
},
{
"epoch": 0.6166462537941317,
"grad_norm": 0.062309183180332184,
"learning_rate": 5.45602379571766e-05,
"loss": 0.1276,
"step": 11580
},
{
"epoch": 0.6171787635124341,
"grad_norm": 0.0704023614525795,
"learning_rate": 5.450142546382555e-05,
"loss": 0.126,
"step": 11590
},
{
"epoch": 0.6177112732307365,
"grad_norm": 0.04271765798330307,
"learning_rate": 5.444264552716636e-05,
"loss": 0.1267,
"step": 11600
},
{
"epoch": 0.6182437829490388,
"grad_norm": 0.03997405245900154,
"learning_rate": 5.438389832947903e-05,
"loss": 0.1267,
"step": 11610
},
{
"epoch": 0.6187762926673411,
"grad_norm": 0.06841737031936646,
"learning_rate": 5.432518405294208e-05,
"loss": 0.1259,
"step": 11620
},
{
"epoch": 0.6193088023856436,
"grad_norm": 0.07736402004957199,
"learning_rate": 5.426650287963186e-05,
"loss": 0.1277,
"step": 11630
},
{
"epoch": 0.6198413121039459,
"grad_norm": 0.05138285458087921,
"learning_rate": 5.4207854991522125e-05,
"loss": 0.1268,
"step": 11640
},
{
"epoch": 0.6203738218222482,
"grad_norm": 0.055199526250362396,
"learning_rate": 5.4149240570483394e-05,
"loss": 0.1258,
"step": 11650
},
{
"epoch": 0.6209063315405506,
"grad_norm": 0.0480291023850441,
"learning_rate": 5.409065979828243e-05,
"loss": 0.1267,
"step": 11660
},
{
"epoch": 0.621438841258853,
"grad_norm": 0.0840907022356987,
"learning_rate": 5.403211285658158e-05,
"loss": 0.1265,
"step": 11670
},
{
"epoch": 0.6219713509771553,
"grad_norm": 0.042071383446455,
"learning_rate": 5.397359992693835e-05,
"loss": 0.1262,
"step": 11680
},
{
"epoch": 0.6225038606954577,
"grad_norm": 0.0721912607550621,
"learning_rate": 5.3915121190804755e-05,
"loss": 0.1268,
"step": 11690
},
{
"epoch": 0.62303637041376,
"grad_norm": 0.058796901255846024,
"learning_rate": 5.385667682952675e-05,
"loss": 0.1263,
"step": 11700
},
{
"epoch": 0.6235688801320625,
"grad_norm": 0.04958143085241318,
"learning_rate": 5.3798267024343706e-05,
"loss": 0.1266,
"step": 11710
},
{
"epoch": 0.6241013898503648,
"grad_norm": 0.07576627284288406,
"learning_rate": 5.373989195638785e-05,
"loss": 0.1268,
"step": 11720
},
{
"epoch": 0.6246338995686671,
"grad_norm": 0.050743598490953445,
"learning_rate": 5.368155180668366e-05,
"loss": 0.1262,
"step": 11730
},
{
"epoch": 0.6251664092869695,
"grad_norm": 0.046020250767469406,
"learning_rate": 5.3623246756147346e-05,
"loss": 0.1262,
"step": 11740
},
{
"epoch": 0.6256989190052719,
"grad_norm": 0.04735419899225235,
"learning_rate": 5.356497698558628e-05,
"loss": 0.1265,
"step": 11750
},
{
"epoch": 0.6262314287235742,
"grad_norm": 0.058625295758247375,
"learning_rate": 5.3506742675698384e-05,
"loss": 0.1267,
"step": 11760
},
{
"epoch": 0.6267639384418766,
"grad_norm": 0.05440155416727066,
"learning_rate": 5.34485440070717e-05,
"loss": 0.1265,
"step": 11770
},
{
"epoch": 0.6272964481601789,
"grad_norm": 0.06765516102313995,
"learning_rate": 5.3390381160183645e-05,
"loss": 0.1273,
"step": 11780
},
{
"epoch": 0.6278289578784813,
"grad_norm": 0.042375244200229645,
"learning_rate": 5.333225431540062e-05,
"loss": 0.1269,
"step": 11790
},
{
"epoch": 0.6283614675967837,
"grad_norm": 0.04143916070461273,
"learning_rate": 5.327416365297737e-05,
"loss": 0.1267,
"step": 11800
},
{
"epoch": 0.628893977315086,
"grad_norm": 0.03681569918990135,
"learning_rate": 5.321610935305643e-05,
"loss": 0.1265,
"step": 11810
},
{
"epoch": 0.6294264870333883,
"grad_norm": 0.04781223088502884,
"learning_rate": 5.315809159566753e-05,
"loss": 0.1262,
"step": 11820
},
{
"epoch": 0.6299589967516908,
"grad_norm": 0.05528895556926727,
"learning_rate": 5.3100110560727155e-05,
"loss": 0.1261,
"step": 11830
},
{
"epoch": 0.6304915064699931,
"grad_norm": 0.050820931792259216,
"learning_rate": 5.3042166428037867e-05,
"loss": 0.1265,
"step": 11840
},
{
"epoch": 0.6310240161882954,
"grad_norm": 0.049191396683454514,
"learning_rate": 5.2984259377287795e-05,
"loss": 0.1266,
"step": 11850
},
{
"epoch": 0.6315565259065978,
"grad_norm": 0.04525256156921387,
"learning_rate": 5.2926389588050095e-05,
"loss": 0.127,
"step": 11860
},
{
"epoch": 0.6320890356249002,
"grad_norm": 0.03086530603468418,
"learning_rate": 5.2868557239782335e-05,
"loss": 0.1268,
"step": 11870
},
{
"epoch": 0.6326215453432025,
"grad_norm": 0.06164710223674774,
"learning_rate": 5.2810762511826017e-05,
"loss": 0.1265,
"step": 11880
},
{
"epoch": 0.6331540550615049,
"grad_norm": 0.10141383111476898,
"learning_rate": 5.275300558340596e-05,
"loss": 0.1264,
"step": 11890
},
{
"epoch": 0.6336865647798072,
"grad_norm": 0.043847665190696716,
"learning_rate": 5.269528663362976e-05,
"loss": 0.1266,
"step": 11900
},
{
"epoch": 0.6342190744981095,
"grad_norm": 0.04844609647989273,
"learning_rate": 5.2637605841487246e-05,
"loss": 0.1264,
"step": 11910
},
{
"epoch": 0.634751584216412,
"grad_norm": 0.04017659276723862,
"learning_rate": 5.257996338584994e-05,
"loss": 0.1266,
"step": 11920
},
{
"epoch": 0.6352840939347143,
"grad_norm": 0.04086530581116676,
"learning_rate": 5.2522359445470434e-05,
"loss": 0.1264,
"step": 11930
},
{
"epoch": 0.6358166036530166,
"grad_norm": 0.0746808871626854,
"learning_rate": 5.246479419898191e-05,
"loss": 0.1265,
"step": 11940
},
{
"epoch": 0.636349113371319,
"grad_norm": 0.047690387815237045,
"learning_rate": 5.2407267824897556e-05,
"loss": 0.1261,
"step": 11950
},
{
"epoch": 0.6368816230896214,
"grad_norm": 0.07566772401332855,
"learning_rate": 5.234978050161002e-05,
"loss": 0.1263,
"step": 11960
},
{
"epoch": 0.6374141328079237,
"grad_norm": 0.07059159129858017,
"learning_rate": 5.229233240739082e-05,
"loss": 0.1263,
"step": 11970
},
{
"epoch": 0.6379466425262261,
"grad_norm": 0.06677578389644623,
"learning_rate": 5.223492372038989e-05,
"loss": 0.1273,
"step": 11980
},
{
"epoch": 0.6384791522445284,
"grad_norm": 0.09771794825792313,
"learning_rate": 5.217755461863487e-05,
"loss": 0.1268,
"step": 11990
},
{
"epoch": 0.6390116619628308,
"grad_norm": 0.0393114909529686,
"learning_rate": 5.212022528003072e-05,
"loss": 0.1264,
"step": 12000
},
{
"epoch": 0.6395441716811332,
"grad_norm": 0.05768098682165146,
"learning_rate": 5.2062935882359054e-05,
"loss": 0.1267,
"step": 12010
},
{
"epoch": 0.6400766813994355,
"grad_norm": 0.04626571014523506,
"learning_rate": 5.2005686603277625e-05,
"loss": 0.1272,
"step": 12020
},
{
"epoch": 0.640609191117738,
"grad_norm": 0.06112409383058548,
"learning_rate": 5.1948477620319805e-05,
"loss": 0.1265,
"step": 12030
},
{
"epoch": 0.6411417008360403,
"grad_norm": 0.04936950281262398,
"learning_rate": 5.1891309110893974e-05,
"loss": 0.1268,
"step": 12040
},
{
"epoch": 0.6416742105543426,
"grad_norm": 0.053032536059617996,
"learning_rate": 5.183418125228301e-05,
"loss": 0.1269,
"step": 12050
},
{
"epoch": 0.642206720272645,
"grad_norm": 0.053800616413354874,
"learning_rate": 5.177709422164374e-05,
"loss": 0.1269,
"step": 12060
},
{
"epoch": 0.6427392299909473,
"grad_norm": 0.03961695730686188,
"learning_rate": 5.1720048196006376e-05,
"loss": 0.1265,
"step": 12070
},
{
"epoch": 0.6432717397092497,
"grad_norm": 0.040204983204603195,
"learning_rate": 5.166304335227396e-05,
"loss": 0.1262,
"step": 12080
},
{
"epoch": 0.6438042494275521,
"grad_norm": 0.11371159553527832,
"learning_rate": 5.160607986722186e-05,
"loss": 0.1263,
"step": 12090
},
{
"epoch": 0.6443367591458544,
"grad_norm": 0.037854380905628204,
"learning_rate": 5.154915791749715e-05,
"loss": 0.1265,
"step": 12100
},
{
"epoch": 0.6448692688641567,
"grad_norm": 0.07416236400604248,
"learning_rate": 5.1492277679618104e-05,
"loss": 0.1261,
"step": 12110
},
{
"epoch": 0.6454017785824592,
"grad_norm": 0.0411413200199604,
"learning_rate": 5.1435439329973664e-05,
"loss": 0.1262,
"step": 12120
},
{
"epoch": 0.6459342883007615,
"grad_norm": 0.06878205388784409,
"learning_rate": 5.1378643044822884e-05,
"loss": 0.1268,
"step": 12130
},
{
"epoch": 0.6464667980190638,
"grad_norm": 0.05869507044553757,
"learning_rate": 5.132188900029433e-05,
"loss": 0.1258,
"step": 12140
},
{
"epoch": 0.6469993077373662,
"grad_norm": 0.049447815865278244,
"learning_rate": 5.126517737238563e-05,
"loss": 0.1266,
"step": 12150
},
{
"epoch": 0.6475318174556686,
"grad_norm": 0.09829236567020416,
"learning_rate": 5.120850833696282e-05,
"loss": 0.1262,
"step": 12160
},
{
"epoch": 0.6480643271739709,
"grad_norm": 0.03944886848330498,
"learning_rate": 5.115188206975992e-05,
"loss": 0.127,
"step": 12170
},
{
"epoch": 0.6485968368922733,
"grad_norm": 0.038028497248888016,
"learning_rate": 5.109529874637824e-05,
"loss": 0.1266,
"step": 12180
},
{
"epoch": 0.6491293466105756,
"grad_norm": 0.05182207375764847,
"learning_rate": 5.103875854228601e-05,
"loss": 0.1259,
"step": 12190
},
{
"epoch": 0.649661856328878,
"grad_norm": 0.04164310172200203,
"learning_rate": 5.098226163281767e-05,
"loss": 0.1256,
"step": 12200
},
{
"epoch": 0.6501943660471804,
"grad_norm": 0.04075628146529198,
"learning_rate": 5.0925808193173454e-05,
"loss": 0.1267,
"step": 12210
},
{
"epoch": 0.6507268757654827,
"grad_norm": 0.05309925228357315,
"learning_rate": 5.0869398398418744e-05,
"loss": 0.1257,
"step": 12220
},
{
"epoch": 0.651259385483785,
"grad_norm": 0.05489126220345497,
"learning_rate": 5.081303242348363e-05,
"loss": 0.1262,
"step": 12230
},
{
"epoch": 0.6517918952020875,
"grad_norm": 0.06438528001308441,
"learning_rate": 5.075671044316228e-05,
"loss": 0.1268,
"step": 12240
},
{
"epoch": 0.6523244049203898,
"grad_norm": 0.045175325125455856,
"learning_rate": 5.070043263211242e-05,
"loss": 0.1262,
"step": 12250
},
{
"epoch": 0.6528569146386921,
"grad_norm": 0.05378909409046173,
"learning_rate": 5.064419916485485e-05,
"loss": 0.1264,
"step": 12260
},
{
"epoch": 0.6533894243569945,
"grad_norm": 0.05675683543086052,
"learning_rate": 5.058801021577282e-05,
"loss": 0.1266,
"step": 12270
},
{
"epoch": 0.6539219340752969,
"grad_norm": 0.0350642092525959,
"learning_rate": 5.053186595911152e-05,
"loss": 0.1269,
"step": 12280
},
{
"epoch": 0.6544544437935992,
"grad_norm": 0.05776926130056381,
"learning_rate": 5.0475766568977586e-05,
"loss": 0.1264,
"step": 12290
},
{
"epoch": 0.6549869535119016,
"grad_norm": 0.053256552666425705,
"learning_rate": 5.041971221933851e-05,
"loss": 0.127,
"step": 12300
},
{
"epoch": 0.6555194632302039,
"grad_norm": 0.03956317901611328,
"learning_rate": 5.0363703084022065e-05,
"loss": 0.1265,
"step": 12310
},
{
"epoch": 0.6560519729485063,
"grad_norm": 0.033014725893735886,
"learning_rate": 5.0307739336715864e-05,
"loss": 0.1261,
"step": 12320
},
{
"epoch": 0.6565844826668087,
"grad_norm": 0.040162548422813416,
"learning_rate": 5.0251821150966746e-05,
"loss": 0.1266,
"step": 12330
},
{
"epoch": 0.657116992385111,
"grad_norm": 0.08668463677167892,
"learning_rate": 5.0195948700180294e-05,
"loss": 0.1256,
"step": 12340
},
{
"epoch": 0.6576495021034134,
"grad_norm": 0.05517444759607315,
"learning_rate": 5.0140122157620185e-05,
"loss": 0.1262,
"step": 12350
},
{
"epoch": 0.6581820118217158,
"grad_norm": 0.05813097953796387,
"learning_rate": 5.008434169640781e-05,
"loss": 0.1258,
"step": 12360
},
{
"epoch": 0.6587145215400181,
"grad_norm": 0.036499012261629105,
"learning_rate": 5.002860748952165e-05,
"loss": 0.1261,
"step": 12370
},
{
"epoch": 0.6592470312583205,
"grad_norm": 0.03472182899713516,
"learning_rate": 4.997291970979672e-05,
"loss": 0.1268,
"step": 12380
},
{
"epoch": 0.6597795409766228,
"grad_norm": 0.06570050120353699,
"learning_rate": 4.9917278529924036e-05,
"loss": 0.1265,
"step": 12390
},
{
"epoch": 0.6603120506949252,
"grad_norm": 0.0551844947040081,
"learning_rate": 4.9861684122450166e-05,
"loss": 0.1262,
"step": 12400
},
{
"epoch": 0.6608445604132276,
"grad_norm": 0.05402039363980293,
"learning_rate": 4.9806136659776625e-05,
"loss": 0.1264,
"step": 12410
},
{
"epoch": 0.6613770701315299,
"grad_norm": 0.05242108181118965,
"learning_rate": 4.975063631415934e-05,
"loss": 0.1262,
"step": 12420
},
{
"epoch": 0.6619095798498322,
"grad_norm": 0.04954907298088074,
"learning_rate": 4.96951832577081e-05,
"loss": 0.1259,
"step": 12430
},
{
"epoch": 0.6624420895681347,
"grad_norm": 0.031282830983400345,
"learning_rate": 4.963977766238604e-05,
"loss": 0.1259,
"step": 12440
},
{
"epoch": 0.662974599286437,
"grad_norm": 0.037177179008722305,
"learning_rate": 4.95844197000092e-05,
"loss": 0.1263,
"step": 12450
},
{
"epoch": 0.6635071090047393,
"grad_norm": 0.059044573456048965,
"learning_rate": 4.95291095422458e-05,
"loss": 0.1251,
"step": 12460
},
{
"epoch": 0.6640396187230417,
"grad_norm": 0.04457508400082588,
"learning_rate": 4.94738473606159e-05,
"loss": 0.127,
"step": 12470
},
{
"epoch": 0.6645721284413441,
"grad_norm": 0.04706263169646263,
"learning_rate": 4.941863332649072e-05,
"loss": 0.1256,
"step": 12480
},
{
"epoch": 0.6651046381596464,
"grad_norm": 0.04623222351074219,
"learning_rate": 4.936346761109223e-05,
"loss": 0.1256,
"step": 12490
},
{
"epoch": 0.6656371478779488,
"grad_norm": 0.04978486895561218,
"learning_rate": 4.9308350385492494e-05,
"loss": 0.126,
"step": 12500
},
{
"epoch": 0.6661696575962511,
"grad_norm": 0.055559538304805756,
"learning_rate": 4.925328182061326e-05,
"loss": 0.126,
"step": 12510
},
{
"epoch": 0.6667021673145535,
"grad_norm": 0.04878619685769081,
"learning_rate": 4.9198262087225375e-05,
"loss": 0.1262,
"step": 12520
},
{
"epoch": 0.6672346770328559,
"grad_norm": 0.045161280781030655,
"learning_rate": 4.9143291355948225e-05,
"loss": 0.1259,
"step": 12530
},
{
"epoch": 0.6677671867511582,
"grad_norm": 0.08460939675569534,
"learning_rate": 4.9088369797249234e-05,
"loss": 0.1261,
"step": 12540
},
{
"epoch": 0.6682996964694605,
"grad_norm": 0.05501072108745575,
"learning_rate": 4.903349758144339e-05,
"loss": 0.1265,
"step": 12550
},
{
"epoch": 0.668832206187763,
"grad_norm": 0.04381651058793068,
"learning_rate": 4.897867487869262e-05,
"loss": 0.1259,
"step": 12560
},
{
"epoch": 0.6693647159060653,
"grad_norm": 0.051561057567596436,
"learning_rate": 4.8923901859005335e-05,
"loss": 0.1261,
"step": 12570
},
{
"epoch": 0.6698972256243676,
"grad_norm": 0.03529192507266998,
"learning_rate": 4.886917869223585e-05,
"loss": 0.1261,
"step": 12580
},
{
"epoch": 0.67042973534267,
"grad_norm": 0.047314297407865524,
"learning_rate": 4.881450554808389e-05,
"loss": 0.1266,
"step": 12590
},
{
"epoch": 0.6709622450609724,
"grad_norm": 0.06020704656839371,
"learning_rate": 4.875988259609407e-05,
"loss": 0.1258,
"step": 12600
},
{
"epoch": 0.6714947547792747,
"grad_norm": 0.040109023451805115,
"learning_rate": 4.870531000565537e-05,
"loss": 0.1261,
"step": 12610
},
{
"epoch": 0.6720272644975771,
"grad_norm": 0.03262796998023987,
"learning_rate": 4.865078794600053e-05,
"loss": 0.1264,
"step": 12620
},
{
"epoch": 0.6725597742158794,
"grad_norm": 0.039773985743522644,
"learning_rate": 4.859631658620569e-05,
"loss": 0.1265,
"step": 12630
},
{
"epoch": 0.6730922839341817,
"grad_norm": 0.046503521502017975,
"learning_rate": 4.854189609518969e-05,
"loss": 0.1259,
"step": 12640
},
{
"epoch": 0.6736247936524842,
"grad_norm": 0.04692930728197098,
"learning_rate": 4.848752664171362e-05,
"loss": 0.1258,
"step": 12650
},
{
"epoch": 0.6741573033707865,
"grad_norm": 0.04684825614094734,
"learning_rate": 4.843320839438035e-05,
"loss": 0.1266,
"step": 12660
},
{
"epoch": 0.6746898130890889,
"grad_norm": 0.051075540482997894,
"learning_rate": 4.837894152163395e-05,
"loss": 0.1257,
"step": 12670
},
{
"epoch": 0.6752223228073913,
"grad_norm": 0.08315866440534592,
"learning_rate": 4.832472619175913e-05,
"loss": 0.1256,
"step": 12680
},
{
"epoch": 0.6757548325256936,
"grad_norm": 0.05411198362708092,
"learning_rate": 4.827056257288079e-05,
"loss": 0.1265,
"step": 12690
},
{
"epoch": 0.676287342243996,
"grad_norm": 0.045681897550821304,
"learning_rate": 4.821645083296347e-05,
"loss": 0.1264,
"step": 12700
},
{
"epoch": 0.6768198519622983,
"grad_norm": 0.03494982793927193,
"learning_rate": 4.8162391139810845e-05,
"loss": 0.1265,
"step": 12710
},
{
"epoch": 0.6773523616806006,
"grad_norm": 0.04865271970629692,
"learning_rate": 4.8108383661065185e-05,
"loss": 0.126,
"step": 12720
},
{
"epoch": 0.6778848713989031,
"grad_norm": 0.055251702666282654,
"learning_rate": 4.805442856420682e-05,
"loss": 0.1264,
"step": 12730
},
{
"epoch": 0.6784173811172054,
"grad_norm": 0.051663871854543686,
"learning_rate": 4.800052601655362e-05,
"loss": 0.1255,
"step": 12740
},
{
"epoch": 0.6789498908355077,
"grad_norm": 0.05289029702544212,
"learning_rate": 4.794667618526057e-05,
"loss": 0.126,
"step": 12750
},
{
"epoch": 0.6794824005538102,
"grad_norm": 0.05994449183344841,
"learning_rate": 4.7892879237319136e-05,
"loss": 0.1261,
"step": 12760
},
{
"epoch": 0.6800149102721125,
"grad_norm": 0.03531305119395256,
"learning_rate": 4.783913533955675e-05,
"loss": 0.1265,
"step": 12770
},
{
"epoch": 0.6805474199904148,
"grad_norm": 0.04480816796422005,
"learning_rate": 4.7785444658636427e-05,
"loss": 0.126,
"step": 12780
},
{
"epoch": 0.6810799297087172,
"grad_norm": 0.045987531542778015,
"learning_rate": 4.773180736105607e-05,
"loss": 0.1268,
"step": 12790
},
{
"epoch": 0.6816124394270195,
"grad_norm": 0.06958389282226562,
"learning_rate": 4.767822361314805e-05,
"loss": 0.1259,
"step": 12800
},
{
"epoch": 0.6821449491453219,
"grad_norm": 0.03225488215684891,
"learning_rate": 4.762469358107873e-05,
"loss": 0.1257,
"step": 12810
},
{
"epoch": 0.6826774588636243,
"grad_norm": 0.05594348534941673,
"learning_rate": 4.757121743084784e-05,
"loss": 0.1261,
"step": 12820
},
{
"epoch": 0.6832099685819266,
"grad_norm": 0.06453056633472443,
"learning_rate": 4.751779532828806e-05,
"loss": 0.1263,
"step": 12830
},
{
"epoch": 0.6837424783002289,
"grad_norm": 0.05000981315970421,
"learning_rate": 4.746442743906442e-05,
"loss": 0.1257,
"step": 12840
},
{
"epoch": 0.6842749880185314,
"grad_norm": 0.06121028959751129,
"learning_rate": 4.741111392867386e-05,
"loss": 0.1258,
"step": 12850
},
{
"epoch": 0.6848074977368337,
"grad_norm": 0.04783171787858009,
"learning_rate": 4.7357854962444686e-05,
"loss": 0.1259,
"step": 12860
},
{
"epoch": 0.685340007455136,
"grad_norm": 0.0399705208837986,
"learning_rate": 4.7304650705536084e-05,
"loss": 0.1261,
"step": 12870
},
{
"epoch": 0.6858725171734384,
"grad_norm": 0.03981216251850128,
"learning_rate": 4.7251501322937534e-05,
"loss": 0.126,
"step": 12880
},
{
"epoch": 0.6864050268917408,
"grad_norm": 0.034330256283283234,
"learning_rate": 4.7198406979468366e-05,
"loss": 0.126,
"step": 12890
},
{
"epoch": 0.6869375366100431,
"grad_norm": 0.06301886588335037,
"learning_rate": 4.7145367839777237e-05,
"loss": 0.1255,
"step": 12900
},
{
"epoch": 0.6874700463283455,
"grad_norm": 0.07015033811330795,
"learning_rate": 4.709238406834164e-05,
"loss": 0.1257,
"step": 12910
},
{
"epoch": 0.6880025560466478,
"grad_norm": 0.04845889285206795,
"learning_rate": 4.703945582946729e-05,
"loss": 0.1266,
"step": 12920
},
{
"epoch": 0.6885350657649502,
"grad_norm": 0.043622374534606934,
"learning_rate": 4.69865832872878e-05,
"loss": 0.1255,
"step": 12930
},
{
"epoch": 0.6890675754832526,
"grad_norm": 0.046708934009075165,
"learning_rate": 4.6933766605763955e-05,
"loss": 0.1262,
"step": 12940
},
{
"epoch": 0.6896000852015549,
"grad_norm": 0.09181608259677887,
"learning_rate": 4.688100594868341e-05,
"loss": 0.1262,
"step": 12950
},
{
"epoch": 0.6901325949198572,
"grad_norm": 0.0670885518193245,
"learning_rate": 4.682830147965999e-05,
"loss": 0.1262,
"step": 12960
},
{
"epoch": 0.6906651046381597,
"grad_norm": 0.05586402490735054,
"learning_rate": 4.6775653362133356e-05,
"loss": 0.126,
"step": 12970
},
{
"epoch": 0.691197614356462,
"grad_norm": 0.0520888976752758,
"learning_rate": 4.6723061759368405e-05,
"loss": 0.1256,
"step": 12980
},
{
"epoch": 0.6917301240747644,
"grad_norm": 0.04255915433168411,
"learning_rate": 4.667052683445474e-05,
"loss": 0.1256,
"step": 12990
},
{
"epoch": 0.6922626337930667,
"grad_norm": 0.04210617393255234,
"learning_rate": 4.661804875030623e-05,
"loss": 0.1259,
"step": 13000
},
{
"epoch": 0.6927951435113691,
"grad_norm": 0.049725860357284546,
"learning_rate": 4.656562766966047e-05,
"loss": 0.1259,
"step": 13010
},
{
"epoch": 0.6933276532296715,
"grad_norm": 0.04117880016565323,
"learning_rate": 4.6513263755078305e-05,
"loss": 0.1252,
"step": 13020
},
{
"epoch": 0.6938601629479738,
"grad_norm": 0.06293977797031403,
"learning_rate": 4.6460957168943286e-05,
"loss": 0.1257,
"step": 13030
},
{
"epoch": 0.6943926726662761,
"grad_norm": 0.08014130592346191,
"learning_rate": 4.640870807346116e-05,
"loss": 0.1259,
"step": 13040
},
{
"epoch": 0.6949251823845786,
"grad_norm": 0.05291053652763367,
"learning_rate": 4.6356516630659444e-05,
"loss": 0.1261,
"step": 13050
},
{
"epoch": 0.6954576921028809,
"grad_norm": 0.061003703624010086,
"learning_rate": 4.630438300238684e-05,
"loss": 0.1256,
"step": 13060
},
{
"epoch": 0.6959902018211832,
"grad_norm": 0.06733989715576172,
"learning_rate": 4.625230735031276e-05,
"loss": 0.1259,
"step": 13070
},
{
"epoch": 0.6965227115394856,
"grad_norm": 0.07360579818487167,
"learning_rate": 4.620028983592687e-05,
"loss": 0.1261,
"step": 13080
},
{
"epoch": 0.697055221257788,
"grad_norm": 0.0536913201212883,
"learning_rate": 4.6148330620538474e-05,
"loss": 0.1263,
"step": 13090
},
{
"epoch": 0.6975877309760903,
"grad_norm": 0.04993463680148125,
"learning_rate": 4.609642986527615e-05,
"loss": 0.1262,
"step": 13100
},
{
"epoch": 0.6981202406943927,
"grad_norm": 0.04666028916835785,
"learning_rate": 4.6044587731087155e-05,
"loss": 0.1258,
"step": 13110
},
{
"epoch": 0.698652750412695,
"grad_norm": 0.07630308717489243,
"learning_rate": 4.599280437873699e-05,
"loss": 0.1259,
"step": 13120
},
{
"epoch": 0.6991852601309974,
"grad_norm": 0.07336148619651794,
"learning_rate": 4.594107996880884e-05,
"loss": 0.1261,
"step": 13130
},
{
"epoch": 0.6997177698492998,
"grad_norm": 0.0583728589117527,
"learning_rate": 4.588941466170312e-05,
"loss": 0.1263,
"step": 13140
},
{
"epoch": 0.7002502795676021,
"grad_norm": 0.04657367989420891,
"learning_rate": 4.5837808617636935e-05,
"loss": 0.1254,
"step": 13150
},
{
"epoch": 0.7007827892859044,
"grad_norm": 0.03507756069302559,
"learning_rate": 4.5786261996643664e-05,
"loss": 0.1255,
"step": 13160
},
{
"epoch": 0.7013152990042069,
"grad_norm": 0.10152143239974976,
"learning_rate": 4.57347749585724e-05,
"loss": 0.1261,
"step": 13170
},
{
"epoch": 0.7018478087225092,
"grad_norm": 0.04690668731927872,
"learning_rate": 4.568334766308741e-05,
"loss": 0.126,
"step": 13180
},
{
"epoch": 0.7023803184408115,
"grad_norm": 0.050610288977622986,
"learning_rate": 4.563198026966776e-05,
"loss": 0.1259,
"step": 13190
},
{
"epoch": 0.7029128281591139,
"grad_norm": 0.07565128803253174,
"learning_rate": 4.558067293760672e-05,
"loss": 0.1253,
"step": 13200
},
{
"epoch": 0.7034453378774163,
"grad_norm": 0.03581630438566208,
"learning_rate": 4.552942582601134e-05,
"loss": 0.125,
"step": 13210
},
{
"epoch": 0.7039778475957186,
"grad_norm": 0.06287883222103119,
"learning_rate": 4.547823909380188e-05,
"loss": 0.1264,
"step": 13220
},
{
"epoch": 0.704510357314021,
"grad_norm": 0.03954106569290161,
"learning_rate": 4.542711289971139e-05,
"loss": 0.1258,
"step": 13230
},
{
"epoch": 0.7050428670323233,
"grad_norm": 0.07557252049446106,
"learning_rate": 4.537604740228517e-05,
"loss": 0.125,
"step": 13240
},
{
"epoch": 0.7055753767506256,
"grad_norm": 0.07294019311666489,
"learning_rate": 4.532504275988033e-05,
"loss": 0.1259,
"step": 13250
},
{
"epoch": 0.7061078864689281,
"grad_norm": 0.06293601542711258,
"learning_rate": 4.527409913066522e-05,
"loss": 0.1259,
"step": 13260
},
{
"epoch": 0.7066403961872304,
"grad_norm": 0.043173741549253464,
"learning_rate": 4.5223216672619e-05,
"loss": 0.1255,
"step": 13270
},
{
"epoch": 0.7071729059055327,
"grad_norm": 0.05080621689558029,
"learning_rate": 4.517239554353116e-05,
"loss": 0.1256,
"step": 13280
},
{
"epoch": 0.7077054156238352,
"grad_norm": 0.04947923868894577,
"learning_rate": 4.512163590100097e-05,
"loss": 0.1257,
"step": 13290
},
{
"epoch": 0.7082379253421375,
"grad_norm": 0.054971180856227875,
"learning_rate": 4.507093790243704e-05,
"loss": 0.1259,
"step": 13300
},
{
"epoch": 0.7087704350604399,
"grad_norm": 0.0491788424551487,
"learning_rate": 4.5020301705056825e-05,
"loss": 0.1251,
"step": 13310
},
{
"epoch": 0.7093029447787422,
"grad_norm": 0.06562227010726929,
"learning_rate": 4.496972746588614e-05,
"loss": 0.1256,
"step": 13320
},
{
"epoch": 0.7098354544970445,
"grad_norm": 0.05146334320306778,
"learning_rate": 4.4919215341758614e-05,
"loss": 0.1259,
"step": 13330
},
{
"epoch": 0.710367964215347,
"grad_norm": 0.04213017597794533,
"learning_rate": 4.486876548931533e-05,
"loss": 0.1258,
"step": 13340
},
{
"epoch": 0.7109004739336493,
"grad_norm": 0.07794417440891266,
"learning_rate": 4.481837806500419e-05,
"loss": 0.1255,
"step": 13350
},
{
"epoch": 0.7114329836519516,
"grad_norm": 0.06125866621732712,
"learning_rate": 4.4768053225079565e-05,
"loss": 0.1255,
"step": 13360
},
{
"epoch": 0.711965493370254,
"grad_norm": 0.056374140083789825,
"learning_rate": 4.471779112560168e-05,
"loss": 0.1262,
"step": 13370
},
{
"epoch": 0.7124980030885564,
"grad_norm": 0.05176498368382454,
"learning_rate": 4.466759192243627e-05,
"loss": 0.1264,
"step": 13380
},
{
"epoch": 0.7130305128068587,
"grad_norm": 0.1045917272567749,
"learning_rate": 4.461745577125399e-05,
"loss": 0.1253,
"step": 13390
},
{
"epoch": 0.7135630225251611,
"grad_norm": 0.03207787126302719,
"learning_rate": 4.456738282752996e-05,
"loss": 0.125,
"step": 13400
},
{
"epoch": 0.7140955322434634,
"grad_norm": 0.06704405695199966,
"learning_rate": 4.451737324654328e-05,
"loss": 0.1253,
"step": 13410
},
{
"epoch": 0.7146280419617658,
"grad_norm": 0.0693150982260704,
"learning_rate": 4.4467427183376596e-05,
"loss": 0.1257,
"step": 13420
},
{
"epoch": 0.7151605516800682,
"grad_norm": 0.04728610813617706,
"learning_rate": 4.441754479291557e-05,
"loss": 0.125,
"step": 13430
},
{
"epoch": 0.7156930613983705,
"grad_norm": 0.05448344349861145,
"learning_rate": 4.43677262298484e-05,
"loss": 0.1261,
"step": 13440
},
{
"epoch": 0.7162255711166728,
"grad_norm": 0.036701589822769165,
"learning_rate": 4.431797164866533e-05,
"loss": 0.1259,
"step": 13450
},
{
"epoch": 0.7167580808349753,
"grad_norm": 0.043069060891866684,
"learning_rate": 4.426828120365824e-05,
"loss": 0.1254,
"step": 13460
},
{
"epoch": 0.7172905905532776,
"grad_norm": 0.07068092375993729,
"learning_rate": 4.421865504892011e-05,
"loss": 0.1251,
"step": 13470
},
{
"epoch": 0.7178231002715799,
"grad_norm": 0.045086752623319626,
"learning_rate": 4.416909333834451e-05,
"loss": 0.1259,
"step": 13480
},
{
"epoch": 0.7183556099898823,
"grad_norm": 0.03668762743473053,
"learning_rate": 4.4119596225625216e-05,
"loss": 0.1254,
"step": 13490
},
{
"epoch": 0.7188881197081847,
"grad_norm": 0.05897703021764755,
"learning_rate": 4.4070163864255644e-05,
"loss": 0.1256,
"step": 13500
},
{
"epoch": 0.719420629426487,
"grad_norm": 0.047495052218437195,
"learning_rate": 4.4020796407528455e-05,
"loss": 0.126,
"step": 13510
},
{
"epoch": 0.7199531391447894,
"grad_norm": 0.06927572190761566,
"learning_rate": 4.397149400853498e-05,
"loss": 0.1256,
"step": 13520
},
{
"epoch": 0.7204856488630917,
"grad_norm": 0.03571341931819916,
"learning_rate": 4.3922256820164856e-05,
"loss": 0.1257,
"step": 13530
},
{
"epoch": 0.7210181585813941,
"grad_norm": 0.04303283616900444,
"learning_rate": 4.3873084995105475e-05,
"loss": 0.1255,
"step": 13540
},
{
"epoch": 0.7215506682996965,
"grad_norm": 0.05867360904812813,
"learning_rate": 4.382397868584151e-05,
"loss": 0.1257,
"step": 13550
},
{
"epoch": 0.7220831780179988,
"grad_norm": 0.05930043384432793,
"learning_rate": 4.377493804465452e-05,
"loss": 0.1254,
"step": 13560
},
{
"epoch": 0.7226156877363011,
"grad_norm": 0.04837455227971077,
"learning_rate": 4.372596322362237e-05,
"loss": 0.1256,
"step": 13570
},
{
"epoch": 0.7231481974546036,
"grad_norm": 0.04174095019698143,
"learning_rate": 4.3677054374618844e-05,
"loss": 0.1255,
"step": 13580
},
{
"epoch": 0.7236807071729059,
"grad_norm": 0.0439835861325264,
"learning_rate": 4.3628211649313164e-05,
"loss": 0.1256,
"step": 13590
},
{
"epoch": 0.7242132168912082,
"grad_norm": 0.06301723420619965,
"learning_rate": 4.357943519916942e-05,
"loss": 0.1263,
"step": 13600
},
{
"epoch": 0.7247457266095106,
"grad_norm": 0.06303390860557556,
"learning_rate": 4.353072517544624e-05,
"loss": 0.1254,
"step": 13610
},
{
"epoch": 0.725278236327813,
"grad_norm": 0.06305810809135437,
"learning_rate": 4.348208172919626e-05,
"loss": 0.1252,
"step": 13620
},
{
"epoch": 0.7258107460461154,
"grad_norm": 0.08300595730543137,
"learning_rate": 4.343350501126566e-05,
"loss": 0.1258,
"step": 13630
},
{
"epoch": 0.7263432557644177,
"grad_norm": 0.054570749402046204,
"learning_rate": 4.338499517229365e-05,
"loss": 0.1255,
"step": 13640
},
{
"epoch": 0.72687576548272,
"grad_norm": 0.06060722470283508,
"learning_rate": 4.333655236271207e-05,
"loss": 0.1259,
"step": 13650
},
{
"epoch": 0.7274082752010225,
"grad_norm": 0.04300956055521965,
"learning_rate": 4.328817673274491e-05,
"loss": 0.1263,
"step": 13660
},
{
"epoch": 0.7279407849193248,
"grad_norm": 0.07667776197195053,
"learning_rate": 4.3239868432407804e-05,
"loss": 0.1256,
"step": 13670
},
{
"epoch": 0.7284732946376271,
"grad_norm": 0.04549676924943924,
"learning_rate": 4.3191627611507625e-05,
"loss": 0.126,
"step": 13680
},
{
"epoch": 0.7290058043559295,
"grad_norm": 0.03738940879702568,
"learning_rate": 4.314345441964197e-05,
"loss": 0.1258,
"step": 13690
},
{
"epoch": 0.7295383140742319,
"grad_norm": 0.04415878280997276,
"learning_rate": 4.3095349006198704e-05,
"loss": 0.1253,
"step": 13700
},
{
"epoch": 0.7300708237925342,
"grad_norm": 0.04898412898182869,
"learning_rate": 4.304731152035552e-05,
"loss": 0.1254,
"step": 13710
},
{
"epoch": 0.7306033335108366,
"grad_norm": 0.06354010105133057,
"learning_rate": 4.299934211107947e-05,
"loss": 0.1259,
"step": 13720
},
{
"epoch": 0.7311358432291389,
"grad_norm": 0.055742815136909485,
"learning_rate": 4.295144092712648e-05,
"loss": 0.1253,
"step": 13730
},
{
"epoch": 0.7316683529474413,
"grad_norm": 0.06040511652827263,
"learning_rate": 4.290360811704094e-05,
"loss": 0.1254,
"step": 13740
},
{
"epoch": 0.7322008626657437,
"grad_norm": 0.04668520390987396,
"learning_rate": 4.2855843829155166e-05,
"loss": 0.1251,
"step": 13750
},
{
"epoch": 0.732733372384046,
"grad_norm": 0.045908767729997635,
"learning_rate": 4.280814821158899e-05,
"loss": 0.1257,
"step": 13760
},
{
"epoch": 0.7332658821023483,
"grad_norm": 0.06110945716500282,
"learning_rate": 4.276052141224931e-05,
"loss": 0.1253,
"step": 13770
},
{
"epoch": 0.7337983918206508,
"grad_norm": 0.05198313668370247,
"learning_rate": 4.271296357882962e-05,
"loss": 0.1254,
"step": 13780
},
{
"epoch": 0.7343309015389531,
"grad_norm": 0.08895553648471832,
"learning_rate": 4.266547485880954e-05,
"loss": 0.1245,
"step": 13790
},
{
"epoch": 0.7348634112572554,
"grad_norm": 0.0400872640311718,
"learning_rate": 4.261805539945433e-05,
"loss": 0.1258,
"step": 13800
},
{
"epoch": 0.7353959209755578,
"grad_norm": 0.05898161605000496,
"learning_rate": 4.257070534781452e-05,
"loss": 0.1257,
"step": 13810
},
{
"epoch": 0.7359284306938602,
"grad_norm": 0.05569084361195564,
"learning_rate": 4.2523424850725366e-05,
"loss": 0.1256,
"step": 13820
},
{
"epoch": 0.7364609404121625,
"grad_norm": 0.03815682604908943,
"learning_rate": 4.2476214054806464e-05,
"loss": 0.1258,
"step": 13830
},
{
"epoch": 0.7369934501304649,
"grad_norm": 0.05617569014430046,
"learning_rate": 4.242907310646124e-05,
"loss": 0.1256,
"step": 13840
},
{
"epoch": 0.7375259598487672,
"grad_norm": 0.036379266530275345,
"learning_rate": 4.238200215187653e-05,
"loss": 0.125,
"step": 13850
},
{
"epoch": 0.7380584695670696,
"grad_norm": 0.03899050131440163,
"learning_rate": 4.233500133702209e-05,
"loss": 0.1252,
"step": 13860
},
{
"epoch": 0.738590979285372,
"grad_norm": 0.08941038697957993,
"learning_rate": 4.2288070807650195e-05,
"loss": 0.1258,
"step": 13870
},
{
"epoch": 0.7391234890036743,
"grad_norm": 0.04613060876727104,
"learning_rate": 4.2241210709295157e-05,
"loss": 0.1257,
"step": 13880
},
{
"epoch": 0.7396559987219766,
"grad_norm": 0.05115320160984993,
"learning_rate": 4.219442118727289e-05,
"loss": 0.1253,
"step": 13890
},
{
"epoch": 0.7401885084402791,
"grad_norm": 0.050107911229133606,
"learning_rate": 4.214770238668041e-05,
"loss": 0.1249,
"step": 13900
},
{
"epoch": 0.7407210181585814,
"grad_norm": 0.05156391113996506,
"learning_rate": 4.210105445239544e-05,
"loss": 0.1247,
"step": 13910
},
{
"epoch": 0.7412535278768837,
"grad_norm": 0.03768635913729668,
"learning_rate": 4.205447752907594e-05,
"loss": 0.1255,
"step": 13920
},
{
"epoch": 0.7417860375951861,
"grad_norm": 0.0679803118109703,
"learning_rate": 4.20079717611597e-05,
"loss": 0.1251,
"step": 13930
},
{
"epoch": 0.7423185473134885,
"grad_norm": 0.04139627143740654,
"learning_rate": 4.196153729286377e-05,
"loss": 0.1254,
"step": 13940
},
{
"epoch": 0.7428510570317909,
"grad_norm": 0.07428357750177383,
"learning_rate": 4.191517426818419e-05,
"loss": 0.1261,
"step": 13950
},
{
"epoch": 0.7433835667500932,
"grad_norm": 0.04479588195681572,
"learning_rate": 4.186888283089537e-05,
"loss": 0.1251,
"step": 13960
},
{
"epoch": 0.7439160764683955,
"grad_norm": 0.06392911076545715,
"learning_rate": 4.182266312454977e-05,
"loss": 0.1256,
"step": 13970
},
{
"epoch": 0.744448586186698,
"grad_norm": 0.054266393184661865,
"learning_rate": 4.177651529247739e-05,
"loss": 0.125,
"step": 13980
},
{
"epoch": 0.7449810959050003,
"grad_norm": 0.06033441051840782,
"learning_rate": 4.173043947778536e-05,
"loss": 0.1253,
"step": 13990
},
{
"epoch": 0.7455136056233026,
"grad_norm": 0.04095017537474632,
"learning_rate": 4.1684435823357454e-05,
"loss": 0.1255,
"step": 14000
},
{
"epoch": 0.746046115341605,
"grad_norm": 0.037291690707206726,
"learning_rate": 4.163850447185369e-05,
"loss": 0.1245,
"step": 14010
},
{
"epoch": 0.7465786250599074,
"grad_norm": 0.06799422949552536,
"learning_rate": 4.159264556570986e-05,
"loss": 0.1255,
"step": 14020
},
{
"epoch": 0.7471111347782097,
"grad_norm": 0.044954586774110794,
"learning_rate": 4.1546859247137124e-05,
"loss": 0.1264,
"step": 14030
},
{
"epoch": 0.7476436444965121,
"grad_norm": 0.041422173380851746,
"learning_rate": 4.1501145658121525e-05,
"loss": 0.1254,
"step": 14040
},
{
"epoch": 0.7481761542148144,
"grad_norm": 0.09260525554418564,
"learning_rate": 4.145550494042356e-05,
"loss": 0.1244,
"step": 14050
},
{
"epoch": 0.7487086639331167,
"grad_norm": 0.03994472324848175,
"learning_rate": 4.140993723557775e-05,
"loss": 0.1262,
"step": 14060
},
{
"epoch": 0.7492411736514192,
"grad_norm": 0.04395360127091408,
"learning_rate": 4.136444268489221e-05,
"loss": 0.1263,
"step": 14070
},
{
"epoch": 0.7497736833697215,
"grad_norm": 0.0409519337117672,
"learning_rate": 4.1319021429448204e-05,
"loss": 0.126,
"step": 14080
},
{
"epoch": 0.7503061930880238,
"grad_norm": 0.048877742141485214,
"learning_rate": 4.1273673610099675e-05,
"loss": 0.1253,
"step": 14090
},
{
"epoch": 0.7508387028063263,
"grad_norm": 0.04159548133611679,
"learning_rate": 4.122839936747289e-05,
"loss": 0.1255,
"step": 14100
},
{
"epoch": 0.7513712125246286,
"grad_norm": 0.036307524889707565,
"learning_rate": 4.118319884196587e-05,
"loss": 0.1249,
"step": 14110
},
{
"epoch": 0.7519037222429309,
"grad_norm": 0.039279136806726456,
"learning_rate": 4.1138072173748116e-05,
"loss": 0.125,
"step": 14120
},
{
"epoch": 0.7524362319612333,
"grad_norm": 0.05518367886543274,
"learning_rate": 4.109301950276003e-05,
"loss": 0.1256,
"step": 14130
},
{
"epoch": 0.7529687416795356,
"grad_norm": 0.043891094624996185,
"learning_rate": 4.104804096871259e-05,
"loss": 0.1257,
"step": 14140
},
{
"epoch": 0.753501251397838,
"grad_norm": 0.045587554574012756,
"learning_rate": 4.1003136711086875e-05,
"loss": 0.1263,
"step": 14150
},
{
"epoch": 0.7540337611161404,
"grad_norm": 0.05378378927707672,
"learning_rate": 4.0958306869133555e-05,
"loss": 0.1253,
"step": 14160
},
{
"epoch": 0.7545662708344427,
"grad_norm": 0.05503176152706146,
"learning_rate": 4.091355158187261e-05,
"loss": 0.1258,
"step": 14170
},
{
"epoch": 0.755098780552745,
"grad_norm": 0.10129349678754807,
"learning_rate": 4.0868870988092795e-05,
"loss": 0.1244,
"step": 14180
},
{
"epoch": 0.7556312902710475,
"grad_norm": 0.05240345746278763,
"learning_rate": 4.082426522635125e-05,
"loss": 0.1257,
"step": 14190
},
{
"epoch": 0.7561637999893498,
"grad_norm": 0.05487096309661865,
"learning_rate": 4.077973443497303e-05,
"loss": 0.1258,
"step": 14200
},
{
"epoch": 0.7566963097076521,
"grad_norm": 0.04014230892062187,
"learning_rate": 4.073527875205071e-05,
"loss": 0.1246,
"step": 14210
},
{
"epoch": 0.7572288194259545,
"grad_norm": 0.04955144226551056,
"learning_rate": 4.0690898315443955e-05,
"loss": 0.1252,
"step": 14220
},
{
"epoch": 0.7577613291442569,
"grad_norm": 0.05915694311261177,
"learning_rate": 4.064659326277911e-05,
"loss": 0.1255,
"step": 14230
},
{
"epoch": 0.7582938388625592,
"grad_norm": 0.07433107495307922,
"learning_rate": 4.0602363731448696e-05,
"loss": 0.1247,
"step": 14240
},
{
"epoch": 0.7588263485808616,
"grad_norm": 0.041825130581855774,
"learning_rate": 4.0558209858611093e-05,
"loss": 0.1253,
"step": 14250
},
{
"epoch": 0.7593588582991639,
"grad_norm": 0.07135327905416489,
"learning_rate": 4.051413178119002e-05,
"loss": 0.1253,
"step": 14260
},
{
"epoch": 0.7598913680174664,
"grad_norm": 0.0779680609703064,
"learning_rate": 4.0470129635874176e-05,
"loss": 0.1242,
"step": 14270
},
{
"epoch": 0.7604238777357687,
"grad_norm": 0.03307312726974487,
"learning_rate": 4.042620355911677e-05,
"loss": 0.1255,
"step": 14280
},
{
"epoch": 0.760956387454071,
"grad_norm": 0.037016890943050385,
"learning_rate": 4.0382353687135136e-05,
"loss": 0.1249,
"step": 14290
},
{
"epoch": 0.7614888971723734,
"grad_norm": 0.08262995630502701,
"learning_rate": 4.0338580155910284e-05,
"loss": 0.1249,
"step": 14300
},
{
"epoch": 0.7620214068906758,
"grad_norm": 0.05580204352736473,
"learning_rate": 4.029488310118648e-05,
"loss": 0.1251,
"step": 14310
},
{
"epoch": 0.7625539166089781,
"grad_norm": 0.0543997660279274,
"learning_rate": 4.025126265847084e-05,
"loss": 0.1261,
"step": 14320
},
{
"epoch": 0.7630864263272805,
"grad_norm": 0.06115228682756424,
"learning_rate": 4.02077189630329e-05,
"loss": 0.1251,
"step": 14330
},
{
"epoch": 0.7636189360455828,
"grad_norm": 0.046210747212171555,
"learning_rate": 4.016425214990421e-05,
"loss": 0.1255,
"step": 14340
},
{
"epoch": 0.7641514457638852,
"grad_norm": 0.0543675497174263,
"learning_rate": 4.0120862353877884e-05,
"loss": 0.1258,
"step": 14350
},
{
"epoch": 0.7646839554821876,
"grad_norm": 0.06712432205677032,
"learning_rate": 4.007754970950821e-05,
"loss": 0.1256,
"step": 14360
},
{
"epoch": 0.7652164652004899,
"grad_norm": 0.050090424716472626,
"learning_rate": 4.0034314351110216e-05,
"loss": 0.1257,
"step": 14370
},
{
"epoch": 0.7657489749187922,
"grad_norm": 0.039436932653188705,
"learning_rate": 3.999115641275929e-05,
"loss": 0.1259,
"step": 14380
},
{
"epoch": 0.7662814846370947,
"grad_norm": 0.03885102644562721,
"learning_rate": 3.994807602829068e-05,
"loss": 0.125,
"step": 14390
},
{
"epoch": 0.766813994355397,
"grad_norm": 0.03700343519449234,
"learning_rate": 3.990507333129922e-05,
"loss": 0.1254,
"step": 14400
},
{
"epoch": 0.7673465040736993,
"grad_norm": 0.06743155419826508,
"learning_rate": 3.986214845513874e-05,
"loss": 0.1252,
"step": 14410
},
{
"epoch": 0.7678790137920017,
"grad_norm": 0.05197859928011894,
"learning_rate": 3.9819301532921807e-05,
"loss": 0.1244,
"step": 14420
},
{
"epoch": 0.7684115235103041,
"grad_norm": 0.09455039352178574,
"learning_rate": 3.9776532697519206e-05,
"loss": 0.1254,
"step": 14430
},
{
"epoch": 0.7689440332286064,
"grad_norm": 0.04639993980526924,
"learning_rate": 3.97338420815596e-05,
"loss": 0.1252,
"step": 14440
},
{
"epoch": 0.7694765429469088,
"grad_norm": 0.03305187448859215,
"learning_rate": 3.969122981742909e-05,
"loss": 0.1244,
"step": 14450
},
{
"epoch": 0.7700090526652111,
"grad_norm": 0.06983647495508194,
"learning_rate": 3.9648696037270786e-05,
"loss": 0.1259,
"step": 14460
},
{
"epoch": 0.7705415623835135,
"grad_norm": 0.07447967678308487,
"learning_rate": 3.960624087298439e-05,
"loss": 0.1251,
"step": 14470
},
{
"epoch": 0.7710740721018159,
"grad_norm": 0.03923282399773598,
"learning_rate": 3.956386445622589e-05,
"loss": 0.1254,
"step": 14480
},
{
"epoch": 0.7716065818201182,
"grad_norm": 0.05779058113694191,
"learning_rate": 3.9521566918406984e-05,
"loss": 0.1248,
"step": 14490
},
{
"epoch": 0.7721390915384205,
"grad_norm": 0.043516259640455246,
"learning_rate": 3.947934839069485e-05,
"loss": 0.1248,
"step": 14500
},
{
"epoch": 0.772671601256723,
"grad_norm": 0.05518548563122749,
"learning_rate": 3.943720900401157e-05,
"loss": 0.1261,
"step": 14510
},
{
"epoch": 0.7732041109750253,
"grad_norm": 0.045164406299591064,
"learning_rate": 3.939514888903383e-05,
"loss": 0.1251,
"step": 14520
},
{
"epoch": 0.7737366206933276,
"grad_norm": 0.042605891823768616,
"learning_rate": 3.935316817619252e-05,
"loss": 0.1251,
"step": 14530
},
{
"epoch": 0.77426913041163,
"grad_norm": 0.05655062943696976,
"learning_rate": 3.931126699567228e-05,
"loss": 0.1258,
"step": 14540
},
{
"epoch": 0.7748016401299324,
"grad_norm": 0.06695695966482162,
"learning_rate": 3.926944547741112e-05,
"loss": 0.1257,
"step": 14550
},
{
"epoch": 0.7753341498482347,
"grad_norm": 0.0719684287905693,
"learning_rate": 3.922770375109997e-05,
"loss": 0.1256,
"step": 14560
},
{
"epoch": 0.7758666595665371,
"grad_norm": 0.043789032846689224,
"learning_rate": 3.918604194618241e-05,
"loss": 0.1254,
"step": 14570
},
{
"epoch": 0.7763991692848394,
"grad_norm": 0.03638778626918793,
"learning_rate": 3.9144460191854075e-05,
"loss": 0.1247,
"step": 14580
},
{
"epoch": 0.7769316790031419,
"grad_norm": 0.0492616705596447,
"learning_rate": 3.910295861706244e-05,
"loss": 0.1248,
"step": 14590
},
{
"epoch": 0.7774641887214442,
"grad_norm": 0.051167041063308716,
"learning_rate": 3.906153735050632e-05,
"loss": 0.1255,
"step": 14600
},
{
"epoch": 0.7779966984397465,
"grad_norm": 0.09880778193473816,
"learning_rate": 3.9020196520635454e-05,
"loss": 0.1256,
"step": 14610
},
{
"epoch": 0.7785292081580489,
"grad_norm": 0.06614736467599869,
"learning_rate": 3.897893625565016e-05,
"loss": 0.1248,
"step": 14620
},
{
"epoch": 0.7790617178763513,
"grad_norm": 0.05390491709113121,
"learning_rate": 3.893775668350095e-05,
"loss": 0.125,
"step": 14630
},
{
"epoch": 0.7795942275946536,
"grad_norm": 0.05455655977129936,
"learning_rate": 3.8896657931888056e-05,
"loss": 0.1248,
"step": 14640
},
{
"epoch": 0.780126737312956,
"grad_norm": 0.06813376396894455,
"learning_rate": 3.8855640128261135e-05,
"loss": 0.1254,
"step": 14650
},
{
"epoch": 0.7806592470312583,
"grad_norm": 0.03827499598264694,
"learning_rate": 3.8814703399818756e-05,
"loss": 0.1255,
"step": 14660
},
{
"epoch": 0.7811917567495607,
"grad_norm": 0.07484028488397598,
"learning_rate": 3.877384787350812e-05,
"loss": 0.1258,
"step": 14670
},
{
"epoch": 0.7817242664678631,
"grad_norm": 0.04092638939619064,
"learning_rate": 3.873307367602458e-05,
"loss": 0.1256,
"step": 14680
},
{
"epoch": 0.7822567761861654,
"grad_norm": 0.044872015714645386,
"learning_rate": 3.869238093381131e-05,
"loss": 0.125,
"step": 14690
},
{
"epoch": 0.7827892859044677,
"grad_norm": 0.053619783371686935,
"learning_rate": 3.8651769773058894e-05,
"loss": 0.1244,
"step": 14700
},
{
"epoch": 0.7833217956227702,
"grad_norm": 0.053385183215141296,
"learning_rate": 3.861124031970487e-05,
"loss": 0.1252,
"step": 14710
},
{
"epoch": 0.7838543053410725,
"grad_norm": 0.039729390293359756,
"learning_rate": 3.857079269943348e-05,
"loss": 0.1252,
"step": 14720
},
{
"epoch": 0.7843868150593748,
"grad_norm": 0.055133990943431854,
"learning_rate": 3.853042703767511e-05,
"loss": 0.1255,
"step": 14730
},
{
"epoch": 0.7849193247776772,
"grad_norm": 0.0439545176923275,
"learning_rate": 3.849014345960605e-05,
"loss": 0.1254,
"step": 14740
},
{
"epoch": 0.7854518344959796,
"grad_norm": 0.04726070538163185,
"learning_rate": 3.844994209014805e-05,
"loss": 0.1243,
"step": 14750
},
{
"epoch": 0.7859843442142819,
"grad_norm": 0.11086293309926987,
"learning_rate": 3.840982305396787e-05,
"loss": 0.1254,
"step": 14760
},
{
"epoch": 0.7865168539325843,
"grad_norm": 0.05439605191349983,
"learning_rate": 3.8369786475476986e-05,
"loss": 0.1247,
"step": 14770
},
{
"epoch": 0.7870493636508866,
"grad_norm": 0.0793689638376236,
"learning_rate": 3.832983247883116e-05,
"loss": 0.125,
"step": 14780
},
{
"epoch": 0.787581873369189,
"grad_norm": 0.04240609332919121,
"learning_rate": 3.8289961187930076e-05,
"loss": 0.1255,
"step": 14790
},
{
"epoch": 0.7881143830874914,
"grad_norm": 0.04460853338241577,
"learning_rate": 3.825017272641693e-05,
"loss": 0.1258,
"step": 14800
},
{
"epoch": 0.7886468928057937,
"grad_norm": 0.04268253594636917,
"learning_rate": 3.821046721767806e-05,
"loss": 0.1256,
"step": 14810
},
{
"epoch": 0.789179402524096,
"grad_norm": 0.07220305502414703,
"learning_rate": 3.817084478484256e-05,
"loss": 0.1258,
"step": 14820
},
{
"epoch": 0.7897119122423985,
"grad_norm": 0.05979606509208679,
"learning_rate": 3.8131305550781906e-05,
"loss": 0.1262,
"step": 14830
},
{
"epoch": 0.7902444219607008,
"grad_norm": 0.06392871588468552,
"learning_rate": 3.8091849638109575e-05,
"loss": 0.1244,
"step": 14840
},
{
"epoch": 0.7907769316790031,
"grad_norm": 0.042751483619213104,
"learning_rate": 3.8052477169180634e-05,
"loss": 0.125,
"step": 14850
},
{
"epoch": 0.7913094413973055,
"grad_norm": 0.03440069034695625,
"learning_rate": 3.801318826609144e-05,
"loss": 0.1255,
"step": 14860
},
{
"epoch": 0.7918419511156078,
"grad_norm": 0.061454493552446365,
"learning_rate": 3.797398305067914e-05,
"loss": 0.1251,
"step": 14870
},
{
"epoch": 0.7923744608339102,
"grad_norm": 0.058559708297252655,
"learning_rate": 3.7934861644521405e-05,
"loss": 0.125,
"step": 14880
},
{
"epoch": 0.7929069705522126,
"grad_norm": 0.0407247468829155,
"learning_rate": 3.789582416893599e-05,
"loss": 0.1254,
"step": 14890
},
{
"epoch": 0.7934394802705149,
"grad_norm": 0.05672033876180649,
"learning_rate": 3.78568707449804e-05,
"loss": 0.1253,
"step": 14900
},
{
"epoch": 0.7939719899888174,
"grad_norm": 0.056891556829214096,
"learning_rate": 3.781800149345146e-05,
"loss": 0.1244,
"step": 14910
},
{
"epoch": 0.7945044997071197,
"grad_norm": 0.05665665119886398,
"learning_rate": 3.7779216534885e-05,
"loss": 0.1249,
"step": 14920
},
{
"epoch": 0.795037009425422,
"grad_norm": 0.03597261756658554,
"learning_rate": 3.774051598955541e-05,
"loss": 0.1239,
"step": 14930
},
{
"epoch": 0.7955695191437244,
"grad_norm": 0.06507623195648193,
"learning_rate": 3.770189997747536e-05,
"loss": 0.1246,
"step": 14940
},
{
"epoch": 0.7961020288620267,
"grad_norm": 0.05575447157025337,
"learning_rate": 3.7663368618395365e-05,
"loss": 0.1251,
"step": 14950
},
{
"epoch": 0.7966345385803291,
"grad_norm": 0.07422123104333878,
"learning_rate": 3.7624922031803403e-05,
"loss": 0.1248,
"step": 14960
},
{
"epoch": 0.7971670482986315,
"grad_norm": 0.03994056582450867,
"learning_rate": 3.758656033692457e-05,
"loss": 0.1254,
"step": 14970
},
{
"epoch": 0.7976995580169338,
"grad_norm": 0.06453961879014969,
"learning_rate": 3.754828365272072e-05,
"loss": 0.1248,
"step": 14980
},
{
"epoch": 0.7982320677352361,
"grad_norm": 0.03173014149069786,
"learning_rate": 3.751009209789011e-05,
"loss": 0.1246,
"step": 14990
},
{
"epoch": 0.7987645774535386,
"grad_norm": 0.05219841003417969,
"learning_rate": 3.747198579086695e-05,
"loss": 0.125,
"step": 15000
},
{
"epoch": 0.7992970871718409,
"grad_norm": 0.04533257335424423,
"learning_rate": 3.7433964849821145e-05,
"loss": 0.1247,
"step": 15010
},
{
"epoch": 0.7998295968901432,
"grad_norm": 0.05037694424390793,
"learning_rate": 3.7396029392657835e-05,
"loss": 0.1254,
"step": 15020
},
{
"epoch": 0.8003621066084456,
"grad_norm": 0.047749314457178116,
"learning_rate": 3.7358179537017066e-05,
"loss": 0.1251,
"step": 15030
},
{
"epoch": 0.800894616326748,
"grad_norm": 0.06332427263259888,
"learning_rate": 3.732041540027348e-05,
"loss": 0.1246,
"step": 15040
},
{
"epoch": 0.8014271260450503,
"grad_norm": 0.04189267009496689,
"learning_rate": 3.728273709953586e-05,
"loss": 0.1248,
"step": 15050
},
{
"epoch": 0.8019596357633527,
"grad_norm": 0.04982787370681763,
"learning_rate": 3.724514475164681e-05,
"loss": 0.1251,
"step": 15060
},
{
"epoch": 0.802492145481655,
"grad_norm": 0.032108910381793976,
"learning_rate": 3.720763847318239e-05,
"loss": 0.125,
"step": 15070
},
{
"epoch": 0.8030246551999574,
"grad_norm": 0.03504796326160431,
"learning_rate": 3.717021838045175e-05,
"loss": 0.1247,
"step": 15080
},
{
"epoch": 0.8035571649182598,
"grad_norm": 0.05086008459329605,
"learning_rate": 3.713288458949679e-05,
"loss": 0.1253,
"step": 15090
},
{
"epoch": 0.8040896746365621,
"grad_norm": 0.04598323255777359,
"learning_rate": 3.709563721609178e-05,
"loss": 0.1246,
"step": 15100
},
{
"epoch": 0.8046221843548644,
"grad_norm": 0.034302182495594025,
"learning_rate": 3.705847637574299e-05,
"loss": 0.1256,
"step": 15110
},
{
"epoch": 0.8051546940731669,
"grad_norm": 0.04984142258763313,
"learning_rate": 3.7021402183688334e-05,
"loss": 0.1249,
"step": 15120
},
{
"epoch": 0.8056872037914692,
"grad_norm": 0.040011048316955566,
"learning_rate": 3.698441475489707e-05,
"loss": 0.1245,
"step": 15130
},
{
"epoch": 0.8062197135097715,
"grad_norm": 0.0699247419834137,
"learning_rate": 3.694751420406937e-05,
"loss": 0.1244,
"step": 15140
},
{
"epoch": 0.8067522232280739,
"grad_norm": 0.035953816026449203,
"learning_rate": 3.6910700645635975e-05,
"loss": 0.1251,
"step": 15150
},
{
"epoch": 0.8072847329463763,
"grad_norm": 0.09570103138685226,
"learning_rate": 3.68739741937579e-05,
"loss": 0.1245,
"step": 15160
},
{
"epoch": 0.8078172426646786,
"grad_norm": 0.051727280020713806,
"learning_rate": 3.683733496232599e-05,
"loss": 0.1257,
"step": 15170
},
{
"epoch": 0.808349752382981,
"grad_norm": 0.05984990671277046,
"learning_rate": 3.680078306496066e-05,
"loss": 0.1251,
"step": 15180
},
{
"epoch": 0.8088822621012833,
"grad_norm": 0.050374679267406464,
"learning_rate": 3.676431861501146e-05,
"loss": 0.1245,
"step": 15190
},
{
"epoch": 0.8094147718195857,
"grad_norm": 0.08578687161207199,
"learning_rate": 3.672794172555677e-05,
"loss": 0.1253,
"step": 15200
},
{
"epoch": 0.8099472815378881,
"grad_norm": 0.0373394675552845,
"learning_rate": 3.6691652509403475e-05,
"loss": 0.1244,
"step": 15210
},
{
"epoch": 0.8104797912561904,
"grad_norm": 0.03546525537967682,
"learning_rate": 3.6655451079086525e-05,
"loss": 0.1244,
"step": 15220
},
{
"epoch": 0.8110123009744928,
"grad_norm": 0.04470152407884598,
"learning_rate": 3.661933754686867e-05,
"loss": 0.1251,
"step": 15230
},
{
"epoch": 0.8115448106927952,
"grad_norm": 0.05563315749168396,
"learning_rate": 3.6583312024740076e-05,
"loss": 0.1254,
"step": 15240
},
{
"epoch": 0.8120773204110975,
"grad_norm": 0.04946048930287361,
"learning_rate": 3.654737462441801e-05,
"loss": 0.1246,
"step": 15250
},
{
"epoch": 0.8126098301293999,
"grad_norm": 0.05127432197332382,
"learning_rate": 3.651152545734643e-05,
"loss": 0.1247,
"step": 15260
},
{
"epoch": 0.8131423398477022,
"grad_norm": 0.05603098124265671,
"learning_rate": 3.6475764634695674e-05,
"loss": 0.1246,
"step": 15270
},
{
"epoch": 0.8136748495660046,
"grad_norm": 0.047840967774391174,
"learning_rate": 3.644009226736217e-05,
"loss": 0.1248,
"step": 15280
},
{
"epoch": 0.814207359284307,
"grad_norm": 0.04910242184996605,
"learning_rate": 3.6404508465968e-05,
"loss": 0.1252,
"step": 15290
},
{
"epoch": 0.8147398690026093,
"grad_norm": 0.07945267856121063,
"learning_rate": 3.6369013340860606e-05,
"loss": 0.1249,
"step": 15300
},
{
"epoch": 0.8152723787209116,
"grad_norm": 0.054363641887903214,
"learning_rate": 3.633360700211243e-05,
"loss": 0.125,
"step": 15310
},
{
"epoch": 0.8158048884392141,
"grad_norm": 0.051253627985715866,
"learning_rate": 3.629828955952062e-05,
"loss": 0.1252,
"step": 15320
},
{
"epoch": 0.8163373981575164,
"grad_norm": 0.049010276794433594,
"learning_rate": 3.62630611226066e-05,
"loss": 0.124,
"step": 15330
},
{
"epoch": 0.8168699078758187,
"grad_norm": 0.05660669878125191,
"learning_rate": 3.62279218006158e-05,
"loss": 0.1245,
"step": 15340
},
{
"epoch": 0.8174024175941211,
"grad_norm": 0.038386616855859756,
"learning_rate": 3.619287170251734e-05,
"loss": 0.125,
"step": 15350
},
{
"epoch": 0.8179349273124235,
"grad_norm": 0.06046159192919731,
"learning_rate": 3.6157910937003597e-05,
"loss": 0.1245,
"step": 15360
},
{
"epoch": 0.8184674370307258,
"grad_norm": 0.06180752068758011,
"learning_rate": 3.612303961248995e-05,
"loss": 0.1246,
"step": 15370
},
{
"epoch": 0.8189999467490282,
"grad_norm": 0.041465550661087036,
"learning_rate": 3.60882578371144e-05,
"loss": 0.1247,
"step": 15380
},
{
"epoch": 0.8195324564673305,
"grad_norm": 0.0629926398396492,
"learning_rate": 3.6053565718737265e-05,
"loss": 0.1248,
"step": 15390
},
{
"epoch": 0.8200649661856328,
"grad_norm": 0.05619725584983826,
"learning_rate": 3.601896336494083e-05,
"loss": 0.1251,
"step": 15400
},
{
"epoch": 0.8205974759039353,
"grad_norm": 0.06445404887199402,
"learning_rate": 3.598445088302901e-05,
"loss": 0.1244,
"step": 15410
},
{
"epoch": 0.8211299856222376,
"grad_norm": 0.04841604083776474,
"learning_rate": 3.595002838002704e-05,
"loss": 0.1249,
"step": 15420
},
{
"epoch": 0.8216624953405399,
"grad_norm": 0.04060814529657364,
"learning_rate": 3.591569596268108e-05,
"loss": 0.1251,
"step": 15430
},
{
"epoch": 0.8221950050588424,
"grad_norm": 0.03796577826142311,
"learning_rate": 3.5881453737457984e-05,
"loss": 0.1246,
"step": 15440
},
{
"epoch": 0.8227275147771447,
"grad_norm": 0.033980198204517365,
"learning_rate": 3.5847301810544856e-05,
"loss": 0.1252,
"step": 15450
},
{
"epoch": 0.823260024495447,
"grad_norm": 0.05901845172047615,
"learning_rate": 3.581324028784886e-05,
"loss": 0.1244,
"step": 15460
},
{
"epoch": 0.8237925342137494,
"grad_norm": 0.05464969575405121,
"learning_rate": 3.577926927499673e-05,
"loss": 0.1249,
"step": 15470
},
{
"epoch": 0.8243250439320517,
"grad_norm": 0.05274730920791626,
"learning_rate": 3.574538887733456e-05,
"loss": 0.1253,
"step": 15480
},
{
"epoch": 0.8248575536503541,
"grad_norm": 0.0523492768406868,
"learning_rate": 3.5711599199927446e-05,
"loss": 0.1246,
"step": 15490
},
{
"epoch": 0.8253900633686565,
"grad_norm": 0.03017192892730236,
"learning_rate": 3.5677900347559146e-05,
"loss": 0.1246,
"step": 15500
},
{
"epoch": 0.8259225730869588,
"grad_norm": 0.058320943266153336,
"learning_rate": 3.564429242473178e-05,
"loss": 0.1253,
"step": 15510
},
{
"epoch": 0.8264550828052611,
"grad_norm": 0.05324307456612587,
"learning_rate": 3.5610775535665465e-05,
"loss": 0.1247,
"step": 15520
},
{
"epoch": 0.8269875925235636,
"grad_norm": 0.05818801745772362,
"learning_rate": 3.557734978429801e-05,
"loss": 0.125,
"step": 15530
},
{
"epoch": 0.8275201022418659,
"grad_norm": 0.06262166053056717,
"learning_rate": 3.554401527428465e-05,
"loss": 0.1246,
"step": 15540
},
{
"epoch": 0.8280526119601683,
"grad_norm": 0.04980841279029846,
"learning_rate": 3.551077210899763e-05,
"loss": 0.1244,
"step": 15550
},
{
"epoch": 0.8285851216784706,
"grad_norm": 0.03848971053957939,
"learning_rate": 3.547762039152594e-05,
"loss": 0.1247,
"step": 15560
},
{
"epoch": 0.829117631396773,
"grad_norm": 0.04948917403817177,
"learning_rate": 3.5444560224675e-05,
"loss": 0.126,
"step": 15570
},
{
"epoch": 0.8296501411150754,
"grad_norm": 0.04893777146935463,
"learning_rate": 3.541159171096631e-05,
"loss": 0.1252,
"step": 15580
},
{
"epoch": 0.8301826508333777,
"grad_norm": 0.0531187430024147,
"learning_rate": 3.537871495263716e-05,
"loss": 0.1242,
"step": 15590
},
{
"epoch": 0.83071516055168,
"grad_norm": 0.04317576438188553,
"learning_rate": 3.534593005164027e-05,
"loss": 0.1241,
"step": 15600
},
{
"epoch": 0.8312476702699825,
"grad_norm": 0.03508533909916878,
"learning_rate": 3.531323710964356e-05,
"loss": 0.1251,
"step": 15610
},
{
"epoch": 0.8317801799882848,
"grad_norm": 0.04336007684469223,
"learning_rate": 3.528063622802974e-05,
"loss": 0.1255,
"step": 15620
},
{
"epoch": 0.8323126897065871,
"grad_norm": 0.05976368486881256,
"learning_rate": 3.5248127507896045e-05,
"loss": 0.1243,
"step": 15630
},
{
"epoch": 0.8328451994248895,
"grad_norm": 0.03515305742621422,
"learning_rate": 3.52157110500539e-05,
"loss": 0.1256,
"step": 15640
},
{
"epoch": 0.8333777091431919,
"grad_norm": 0.08611953258514404,
"learning_rate": 3.518338695502864e-05,
"loss": 0.1243,
"step": 15650
},
{
"epoch": 0.8339102188614942,
"grad_norm": 0.041929975152015686,
"learning_rate": 3.515115532305918e-05,
"loss": 0.1246,
"step": 15660
},
{
"epoch": 0.8344427285797966,
"grad_norm": 0.03388476371765137,
"learning_rate": 3.511901625409768e-05,
"loss": 0.1246,
"step": 15670
},
{
"epoch": 0.8349752382980989,
"grad_norm": 0.04702109470963478,
"learning_rate": 3.5086969847809256e-05,
"loss": 0.1241,
"step": 15680
},
{
"epoch": 0.8355077480164013,
"grad_norm": 0.04313468933105469,
"learning_rate": 3.50550162035717e-05,
"loss": 0.125,
"step": 15690
},
{
"epoch": 0.8360402577347037,
"grad_norm": 0.04187025874853134,
"learning_rate": 3.502315542047512e-05,
"loss": 0.1244,
"step": 15700
},
{
"epoch": 0.836572767453006,
"grad_norm": 0.0742115005850792,
"learning_rate": 3.4991387597321654e-05,
"loss": 0.1247,
"step": 15710
},
{
"epoch": 0.8371052771713083,
"grad_norm": 0.047620195895433426,
"learning_rate": 3.495971283262519e-05,
"loss": 0.1247,
"step": 15720
},
{
"epoch": 0.8376377868896108,
"grad_norm": 0.06339036673307419,
"learning_rate": 3.492813122461101e-05,
"loss": 0.1249,
"step": 15730
},
{
"epoch": 0.8381702966079131,
"grad_norm": 0.03321847692131996,
"learning_rate": 3.489664287121553e-05,
"loss": 0.1249,
"step": 15740
},
{
"epoch": 0.8387028063262154,
"grad_norm": 0.06577350944280624,
"learning_rate": 3.486524787008595e-05,
"loss": 0.1241,
"step": 15750
},
{
"epoch": 0.8392353160445178,
"grad_norm": 0.04866393655538559,
"learning_rate": 3.4833946318580026e-05,
"loss": 0.1244,
"step": 15760
},
{
"epoch": 0.8397678257628202,
"grad_norm": 0.045106563717126846,
"learning_rate": 3.4802738313765685e-05,
"loss": 0.1246,
"step": 15770
},
{
"epoch": 0.8403003354811225,
"grad_norm": 0.06193890795111656,
"learning_rate": 3.477162395242076e-05,
"loss": 0.1251,
"step": 15780
},
{
"epoch": 0.8408328451994249,
"grad_norm": 0.06153490021824837,
"learning_rate": 3.4740603331032706e-05,
"loss": 0.125,
"step": 15790
},
{
"epoch": 0.8413653549177272,
"grad_norm": 0.05703847110271454,
"learning_rate": 3.470967654579828e-05,
"loss": 0.1251,
"step": 15800
},
{
"epoch": 0.8418978646360296,
"grad_norm": 0.03664189949631691,
"learning_rate": 3.467884369262325e-05,
"loss": 0.1249,
"step": 15810
},
{
"epoch": 0.842430374354332,
"grad_norm": 0.037624064832925797,
"learning_rate": 3.46481048671221e-05,
"loss": 0.1245,
"step": 15820
},
{
"epoch": 0.8429628840726343,
"grad_norm": 0.04117140918970108,
"learning_rate": 3.4617460164617684e-05,
"loss": 0.1252,
"step": 15830
},
{
"epoch": 0.8434953937909366,
"grad_norm": 0.03222690895199776,
"learning_rate": 3.4586909680141047e-05,
"loss": 0.1245,
"step": 15840
},
{
"epoch": 0.8440279035092391,
"grad_norm": 0.057400964200496674,
"learning_rate": 3.455645350843102e-05,
"loss": 0.1248,
"step": 15850
},
{
"epoch": 0.8445604132275414,
"grad_norm": 0.04511050879955292,
"learning_rate": 3.452609174393395e-05,
"loss": 0.1248,
"step": 15860
},
{
"epoch": 0.8450929229458438,
"grad_norm": 0.03972748667001724,
"learning_rate": 3.4495824480803455e-05,
"loss": 0.1247,
"step": 15870
},
{
"epoch": 0.8456254326641461,
"grad_norm": 0.08818963170051575,
"learning_rate": 3.446565181290007e-05,
"loss": 0.125,
"step": 15880
},
{
"epoch": 0.8461579423824485,
"grad_norm": 0.07608040422201157,
"learning_rate": 3.4435573833791016e-05,
"loss": 0.1246,
"step": 15890
},
{
"epoch": 0.8466904521007509,
"grad_norm": 0.055682647973299026,
"learning_rate": 3.4405590636749836e-05,
"loss": 0.1255,
"step": 15900
},
{
"epoch": 0.8472229618190532,
"grad_norm": 0.0459553599357605,
"learning_rate": 3.437570231475618e-05,
"loss": 0.1247,
"step": 15910
},
{
"epoch": 0.8477554715373555,
"grad_norm": 0.07165340334177017,
"learning_rate": 3.43459089604955e-05,
"loss": 0.1245,
"step": 15920
},
{
"epoch": 0.848287981255658,
"grad_norm": 0.04513763263821602,
"learning_rate": 3.43162106663587e-05,
"loss": 0.1249,
"step": 15930
},
{
"epoch": 0.8488204909739603,
"grad_norm": 0.07213608175516129,
"learning_rate": 3.428660752444193e-05,
"loss": 0.1242,
"step": 15940
},
{
"epoch": 0.8493530006922626,
"grad_norm": 0.030396446585655212,
"learning_rate": 3.425709962654625e-05,
"loss": 0.1252,
"step": 15950
},
{
"epoch": 0.849885510410565,
"grad_norm": 0.030090300366282463,
"learning_rate": 3.4227687064177385e-05,
"loss": 0.1247,
"step": 15960
},
{
"epoch": 0.8504180201288674,
"grad_norm": 0.05374327301979065,
"learning_rate": 3.419836992854541e-05,
"loss": 0.1243,
"step": 15970
},
{
"epoch": 0.8509505298471697,
"grad_norm": 0.04545629397034645,
"learning_rate": 3.416914831056446e-05,
"loss": 0.1245,
"step": 15980
},
{
"epoch": 0.8514830395654721,
"grad_norm": 0.055011678487062454,
"learning_rate": 3.414002230085248e-05,
"loss": 0.1249,
"step": 15990
},
{
"epoch": 0.8520155492837744,
"grad_norm": 0.05431196093559265,
"learning_rate": 3.411099198973092e-05,
"loss": 0.1241,
"step": 16000
},
{
"epoch": 0.8525480590020768,
"grad_norm": 0.05473232641816139,
"learning_rate": 3.4082057467224484e-05,
"loss": 0.1253,
"step": 16010
},
{
"epoch": 0.8530805687203792,
"grad_norm": 0.07439985126256943,
"learning_rate": 3.40532188230608e-05,
"loss": 0.1245,
"step": 16020
},
{
"epoch": 0.8536130784386815,
"grad_norm": 0.07038458436727524,
"learning_rate": 3.402447614667018e-05,
"loss": 0.125,
"step": 16030
},
{
"epoch": 0.8541455881569838,
"grad_norm": 0.061058055609464645,
"learning_rate": 3.3995829527185354e-05,
"loss": 0.1249,
"step": 16040
},
{
"epoch": 0.8546780978752863,
"grad_norm": 0.057675547897815704,
"learning_rate": 3.396727905344115e-05,
"loss": 0.124,
"step": 16050
},
{
"epoch": 0.8552106075935886,
"grad_norm": 0.038779694586992264,
"learning_rate": 3.3938824813974254e-05,
"loss": 0.1242,
"step": 16060
},
{
"epoch": 0.8557431173118909,
"grad_norm": 0.044711895287036896,
"learning_rate": 3.391046689702292e-05,
"loss": 0.1241,
"step": 16070
},
{
"epoch": 0.8562756270301933,
"grad_norm": 0.0402277447283268,
"learning_rate": 3.388220539052671e-05,
"loss": 0.1241,
"step": 16080
},
{
"epoch": 0.8568081367484957,
"grad_norm": 0.07318955659866333,
"learning_rate": 3.3854040382126196e-05,
"loss": 0.125,
"step": 16090
},
{
"epoch": 0.857340646466798,
"grad_norm": 0.05128632113337517,
"learning_rate": 3.382597195916271e-05,
"loss": 0.1252,
"step": 16100
},
{
"epoch": 0.8578731561851004,
"grad_norm": 0.04426991939544678,
"learning_rate": 3.379800020867808e-05,
"loss": 0.124,
"step": 16110
},
{
"epoch": 0.8584056659034027,
"grad_norm": 0.09766895323991776,
"learning_rate": 3.377012521741433e-05,
"loss": 0.1246,
"step": 16120
},
{
"epoch": 0.858938175621705,
"grad_norm": 0.04723978415131569,
"learning_rate": 3.3742347071813424e-05,
"loss": 0.1255,
"step": 16130
},
{
"epoch": 0.8594706853400075,
"grad_norm": 0.0812908411026001,
"learning_rate": 3.3714665858017015e-05,
"loss": 0.1239,
"step": 16140
},
{
"epoch": 0.8600031950583098,
"grad_norm": 0.0860326737165451,
"learning_rate": 3.3687081661866164e-05,
"loss": 0.1241,
"step": 16150
},
{
"epoch": 0.8605357047766121,
"grad_norm": 0.04998904466629028,
"learning_rate": 3.365959456890109e-05,
"loss": 0.1247,
"step": 16160
},
{
"epoch": 0.8610682144949146,
"grad_norm": 0.035985738039016724,
"learning_rate": 3.3632204664360836e-05,
"loss": 0.1238,
"step": 16170
},
{
"epoch": 0.8616007242132169,
"grad_norm": 0.09907463192939758,
"learning_rate": 3.3604912033183126e-05,
"loss": 0.1244,
"step": 16180
},
{
"epoch": 0.8621332339315193,
"grad_norm": 0.07233595103025436,
"learning_rate": 3.357771676000397e-05,
"loss": 0.1253,
"step": 16190
},
{
"epoch": 0.8626657436498216,
"grad_norm": 0.038175683468580246,
"learning_rate": 3.355061892915752e-05,
"loss": 0.1245,
"step": 16200
},
{
"epoch": 0.863198253368124,
"grad_norm": 0.03324522450566292,
"learning_rate": 3.352361862467572e-05,
"loss": 0.1242,
"step": 16210
},
{
"epoch": 0.8637307630864264,
"grad_norm": 0.03613545373082161,
"learning_rate": 3.349671593028809e-05,
"loss": 0.1244,
"step": 16220
},
{
"epoch": 0.8642632728047287,
"grad_norm": 0.10772500932216644,
"learning_rate": 3.346991092942146e-05,
"loss": 0.1247,
"step": 16230
},
{
"epoch": 0.864795782523031,
"grad_norm": 0.05393153801560402,
"learning_rate": 3.3443203705199686e-05,
"loss": 0.1247,
"step": 16240
},
{
"epoch": 0.8653282922413335,
"grad_norm": 0.04021570831537247,
"learning_rate": 3.3416594340443444e-05,
"loss": 0.1248,
"step": 16250
},
{
"epoch": 0.8658608019596358,
"grad_norm": 0.04227181524038315,
"learning_rate": 3.339008291766991e-05,
"loss": 0.1245,
"step": 16260
},
{
"epoch": 0.8663933116779381,
"grad_norm": 0.044721730053424835,
"learning_rate": 3.3363669519092563e-05,
"loss": 0.1247,
"step": 16270
},
{
"epoch": 0.8669258213962405,
"grad_norm": 0.1196049377322197,
"learning_rate": 3.33373542266209e-05,
"loss": 0.1251,
"step": 16280
},
{
"epoch": 0.8674583311145428,
"grad_norm": 0.03756421059370041,
"learning_rate": 3.331113712186016e-05,
"loss": 0.1242,
"step": 16290
},
{
"epoch": 0.8679908408328452,
"grad_norm": 0.03522124141454697,
"learning_rate": 3.328501828611112e-05,
"loss": 0.1245,
"step": 16300
},
{
"epoch": 0.8685233505511476,
"grad_norm": 0.04906485602259636,
"learning_rate": 3.325899780036982e-05,
"loss": 0.125,
"step": 16310
},
{
"epoch": 0.8690558602694499,
"grad_norm": 0.046862684190273285,
"learning_rate": 3.3233075745327286e-05,
"loss": 0.124,
"step": 16320
},
{
"epoch": 0.8695883699877522,
"grad_norm": 0.1074092835187912,
"learning_rate": 3.320725220136934e-05,
"loss": 0.1245,
"step": 16330
},
{
"epoch": 0.8701208797060547,
"grad_norm": 0.05268271267414093,
"learning_rate": 3.3181527248576294e-05,
"loss": 0.1243,
"step": 16340
},
{
"epoch": 0.870653389424357,
"grad_norm": 0.049087993800640106,
"learning_rate": 3.3155900966722727e-05,
"loss": 0.1242,
"step": 16350
},
{
"epoch": 0.8711858991426593,
"grad_norm": 0.060601964592933655,
"learning_rate": 3.313037343527722e-05,
"loss": 0.1247,
"step": 16360
},
{
"epoch": 0.8717184088609617,
"grad_norm": 0.05477839335799217,
"learning_rate": 3.310494473340215e-05,
"loss": 0.1254,
"step": 16370
},
{
"epoch": 0.8722509185792641,
"grad_norm": 0.055110715329647064,
"learning_rate": 3.3079614939953416e-05,
"loss": 0.1246,
"step": 16380
},
{
"epoch": 0.8727834282975664,
"grad_norm": 0.0602547712624073,
"learning_rate": 3.305438413348016e-05,
"loss": 0.125,
"step": 16390
},
{
"epoch": 0.8733159380158688,
"grad_norm": 0.05673711746931076,
"learning_rate": 3.3029252392224584e-05,
"loss": 0.1245,
"step": 16400
},
{
"epoch": 0.8738484477341711,
"grad_norm": 0.05631018802523613,
"learning_rate": 3.30042197941217e-05,
"loss": 0.125,
"step": 16410
},
{
"epoch": 0.8743809574524735,
"grad_norm": 0.047678008675575256,
"learning_rate": 3.297928641679906e-05,
"loss": 0.1242,
"step": 16420
},
{
"epoch": 0.8749134671707759,
"grad_norm": 0.05217251926660538,
"learning_rate": 3.2954452337576504e-05,
"loss": 0.1245,
"step": 16430
},
{
"epoch": 0.8754459768890782,
"grad_norm": 0.05652473866939545,
"learning_rate": 3.2929717633465954e-05,
"loss": 0.1243,
"step": 16440
},
{
"epoch": 0.8759784866073805,
"grad_norm": 0.03848657384514809,
"learning_rate": 3.2905082381171184e-05,
"loss": 0.1243,
"step": 16450
},
{
"epoch": 0.876510996325683,
"grad_norm": 0.047618966549634933,
"learning_rate": 3.2880546657087554e-05,
"loss": 0.1246,
"step": 16460
},
{
"epoch": 0.8770435060439853,
"grad_norm": 0.06333454698324203,
"learning_rate": 3.2856110537301756e-05,
"loss": 0.1244,
"step": 16470
},
{
"epoch": 0.8775760157622876,
"grad_norm": 0.04720817133784294,
"learning_rate": 3.283177409759164e-05,
"loss": 0.1239,
"step": 16480
},
{
"epoch": 0.87810852548059,
"grad_norm": 0.03655124083161354,
"learning_rate": 3.280753741342592e-05,
"loss": 0.1248,
"step": 16490
},
{
"epoch": 0.8786410351988924,
"grad_norm": 0.05196612700819969,
"learning_rate": 3.278340055996396e-05,
"loss": 0.1245,
"step": 16500
},
{
"epoch": 0.8791735449171948,
"grad_norm": 0.039216578006744385,
"learning_rate": 3.275936361205555e-05,
"loss": 0.1248,
"step": 16510
},
{
"epoch": 0.8797060546354971,
"grad_norm": 0.055273279547691345,
"learning_rate": 3.2735426644240665e-05,
"loss": 0.1248,
"step": 16520
},
{
"epoch": 0.8802385643537994,
"grad_norm": 0.05333053693175316,
"learning_rate": 3.2711589730749266e-05,
"loss": 0.1242,
"step": 16530
},
{
"epoch": 0.8807710740721019,
"grad_norm": 0.062082525342702866,
"learning_rate": 3.268785294550098e-05,
"loss": 0.1251,
"step": 16540
},
{
"epoch": 0.8813035837904042,
"grad_norm": 0.03454854339361191,
"learning_rate": 3.266421636210497e-05,
"loss": 0.1241,
"step": 16550
},
{
"epoch": 0.8818360935087065,
"grad_norm": 0.036358997225761414,
"learning_rate": 3.264068005385965e-05,
"loss": 0.1246,
"step": 16560
},
{
"epoch": 0.8823686032270089,
"grad_norm": 0.03956957161426544,
"learning_rate": 3.261724409375252e-05,
"loss": 0.1241,
"step": 16570
},
{
"epoch": 0.8829011129453113,
"grad_norm": 0.07209271937608719,
"learning_rate": 3.259390855445982e-05,
"loss": 0.125,
"step": 16580
},
{
"epoch": 0.8834336226636136,
"grad_norm": 0.06704261153936386,
"learning_rate": 3.257067350834644e-05,
"loss": 0.1247,
"step": 16590
},
{
"epoch": 0.883966132381916,
"grad_norm": 0.06499594449996948,
"learning_rate": 3.25475390274656e-05,
"loss": 0.1253,
"step": 16600
},
{
"epoch": 0.8844986421002183,
"grad_norm": 0.03783570975065231,
"learning_rate": 3.2524505183558684e-05,
"loss": 0.1246,
"step": 16610
},
{
"epoch": 0.8850311518185207,
"grad_norm": 0.04036329314112663,
"learning_rate": 3.250157204805498e-05,
"loss": 0.1249,
"step": 16620
},
{
"epoch": 0.8855636615368231,
"grad_norm": 0.04968998581171036,
"learning_rate": 3.247873969207148e-05,
"loss": 0.125,
"step": 16630
},
{
"epoch": 0.8860961712551254,
"grad_norm": 0.045320551842451096,
"learning_rate": 3.245600818641265e-05,
"loss": 0.1244,
"step": 16640
},
{
"epoch": 0.8866286809734277,
"grad_norm": 0.06106564775109291,
"learning_rate": 3.243337760157022e-05,
"loss": 0.1247,
"step": 16650
},
{
"epoch": 0.8871611906917302,
"grad_norm": 0.04613622650504112,
"learning_rate": 3.241084800772296e-05,
"loss": 0.1245,
"step": 16660
},
{
"epoch": 0.8876937004100325,
"grad_norm": 0.05316569283604622,
"learning_rate": 3.238841947473642e-05,
"loss": 0.1236,
"step": 16670
},
{
"epoch": 0.8882262101283348,
"grad_norm": 0.0546153299510479,
"learning_rate": 3.236609207216283e-05,
"loss": 0.1245,
"step": 16680
},
{
"epoch": 0.8887587198466372,
"grad_norm": 0.06547331809997559,
"learning_rate": 3.2343865869240746e-05,
"loss": 0.1243,
"step": 16690
},
{
"epoch": 0.8892912295649396,
"grad_norm": 0.055185478180646896,
"learning_rate": 3.2321740934894925e-05,
"loss": 0.1245,
"step": 16700
},
{
"epoch": 0.8898237392832419,
"grad_norm": 0.046210877597332,
"learning_rate": 3.2299717337736076e-05,
"loss": 0.1242,
"step": 16710
},
{
"epoch": 0.8903562490015443,
"grad_norm": 0.04753991216421127,
"learning_rate": 3.2277795146060645e-05,
"loss": 0.1246,
"step": 16720
},
{
"epoch": 0.8908887587198466,
"grad_norm": 0.05761198326945305,
"learning_rate": 3.2255974427850666e-05,
"loss": 0.124,
"step": 16730
},
{
"epoch": 0.891421268438149,
"grad_norm": 0.061156004667282104,
"learning_rate": 3.223425525077342e-05,
"loss": 0.1244,
"step": 16740
},
{
"epoch": 0.8919537781564514,
"grad_norm": 0.07980604469776154,
"learning_rate": 3.2212637682181354e-05,
"loss": 0.1244,
"step": 16750
},
{
"epoch": 0.8924862878747537,
"grad_norm": 0.04657996818423271,
"learning_rate": 3.219112178911181e-05,
"loss": 0.1248,
"step": 16760
},
{
"epoch": 0.893018797593056,
"grad_norm": 0.040127284824848175,
"learning_rate": 3.216970763828683e-05,
"loss": 0.1245,
"step": 16770
},
{
"epoch": 0.8935513073113585,
"grad_norm": 0.04287361726164818,
"learning_rate": 3.2148395296112945e-05,
"loss": 0.1248,
"step": 16780
},
{
"epoch": 0.8940838170296608,
"grad_norm": 0.0566687285900116,
"learning_rate": 3.212718482868096e-05,
"loss": 0.1241,
"step": 16790
},
{
"epoch": 0.8946163267479631,
"grad_norm": 0.036797747015953064,
"learning_rate": 3.210607630176578e-05,
"loss": 0.1252,
"step": 16800
},
{
"epoch": 0.8951488364662655,
"grad_norm": 0.049759261310100555,
"learning_rate": 3.208506978082617e-05,
"loss": 0.1245,
"step": 16810
},
{
"epoch": 0.8956813461845679,
"grad_norm": 0.061853665858507156,
"learning_rate": 3.2064165331004594e-05,
"loss": 0.1252,
"step": 16820
},
{
"epoch": 0.8962138559028703,
"grad_norm": 0.05073931813240051,
"learning_rate": 3.2043363017126956e-05,
"loss": 0.1251,
"step": 16830
},
{
"epoch": 0.8967463656211726,
"grad_norm": 0.06072097271680832,
"learning_rate": 3.202266290370245e-05,
"loss": 0.1239,
"step": 16840
},
{
"epoch": 0.8972788753394749,
"grad_norm": 0.05999981239438057,
"learning_rate": 3.2002065054923325e-05,
"loss": 0.1246,
"step": 16850
},
{
"epoch": 0.8978113850577774,
"grad_norm": 0.048532549291849136,
"learning_rate": 3.198156953466472e-05,
"loss": 0.1242,
"step": 16860
},
{
"epoch": 0.8983438947760797,
"grad_norm": 0.05031272768974304,
"learning_rate": 3.196117640648444e-05,
"loss": 0.1247,
"step": 16870
},
{
"epoch": 0.898876404494382,
"grad_norm": 0.05845622345805168,
"learning_rate": 3.1940885733622754e-05,
"loss": 0.1239,
"step": 16880
},
{
"epoch": 0.8994089142126844,
"grad_norm": 0.05167698487639427,
"learning_rate": 3.192069757900224e-05,
"loss": 0.1245,
"step": 16890
},
{
"epoch": 0.8999414239309868,
"grad_norm": 0.04188617318868637,
"learning_rate": 3.190061200522753e-05,
"loss": 0.1246,
"step": 16900
},
{
"epoch": 0.9004739336492891,
"grad_norm": 0.0383358858525753,
"learning_rate": 3.188062907458516e-05,
"loss": 0.124,
"step": 16910
},
{
"epoch": 0.9010064433675915,
"grad_norm": 0.0524710975587368,
"learning_rate": 3.186074884904336e-05,
"loss": 0.1244,
"step": 16920
},
{
"epoch": 0.9015389530858938,
"grad_norm": 0.034921254962682724,
"learning_rate": 3.184097139025189e-05,
"loss": 0.1246,
"step": 16930
},
{
"epoch": 0.9020714628041961,
"grad_norm": 0.052057795226573944,
"learning_rate": 3.1821296759541764e-05,
"loss": 0.124,
"step": 16940
},
{
"epoch": 0.9026039725224986,
"grad_norm": 0.06420300155878067,
"learning_rate": 3.1801725017925195e-05,
"loss": 0.124,
"step": 16950
},
{
"epoch": 0.9031364822408009,
"grad_norm": 0.03623140975832939,
"learning_rate": 3.178225622609528e-05,
"loss": 0.1235,
"step": 16960
},
{
"epoch": 0.9036689919591032,
"grad_norm": 0.04458535462617874,
"learning_rate": 3.1762890444425875e-05,
"loss": 0.1249,
"step": 16970
},
{
"epoch": 0.9042015016774057,
"grad_norm": 0.03834957256913185,
"learning_rate": 3.174362773297141e-05,
"loss": 0.1246,
"step": 16980
},
{
"epoch": 0.904734011395708,
"grad_norm": 0.04682791605591774,
"learning_rate": 3.1724468151466665e-05,
"loss": 0.1245,
"step": 16990
},
{
"epoch": 0.9052665211140103,
"grad_norm": 0.0558556504547596,
"learning_rate": 3.170541175932662e-05,
"loss": 0.1244,
"step": 17000
},
{
"epoch": 0.9057990308323127,
"grad_norm": 0.046385906636714935,
"learning_rate": 3.168645861564627e-05,
"loss": 0.1241,
"step": 17010
},
{
"epoch": 0.906331540550615,
"grad_norm": 0.04825804755091667,
"learning_rate": 3.166760877920041e-05,
"loss": 0.1246,
"step": 17020
},
{
"epoch": 0.9068640502689174,
"grad_norm": 0.0882338434457779,
"learning_rate": 3.164886230844348e-05,
"loss": 0.125,
"step": 17030
},
{
"epoch": 0.9073965599872198,
"grad_norm": 0.08609329909086227,
"learning_rate": 3.163021926150939e-05,
"loss": 0.1242,
"step": 17040
},
{
"epoch": 0.9079290697055221,
"grad_norm": 0.05161284655332565,
"learning_rate": 3.1611679696211294e-05,
"loss": 0.125,
"step": 17050
},
{
"epoch": 0.9084615794238244,
"grad_norm": 0.04110497981309891,
"learning_rate": 3.159324367004148e-05,
"loss": 0.1253,
"step": 17060
},
{
"epoch": 0.9089940891421269,
"grad_norm": 0.07606612145900726,
"learning_rate": 3.157491124017115e-05,
"loss": 0.1236,
"step": 17070
},
{
"epoch": 0.9095265988604292,
"grad_norm": 0.04594139754772186,
"learning_rate": 3.1556682463450214e-05,
"loss": 0.1234,
"step": 17080
},
{
"epoch": 0.9100591085787315,
"grad_norm": 0.039515670388936996,
"learning_rate": 3.15385573964072e-05,
"loss": 0.1243,
"step": 17090
},
{
"epoch": 0.910591618297034,
"grad_norm": 0.04435297101736069,
"learning_rate": 3.152053609524897e-05,
"loss": 0.1245,
"step": 17100
},
{
"epoch": 0.9111241280153363,
"grad_norm": 0.03617672622203827,
"learning_rate": 3.150261861586065e-05,
"loss": 0.1243,
"step": 17110
},
{
"epoch": 0.9116566377336386,
"grad_norm": 0.053447507321834564,
"learning_rate": 3.148480501380538e-05,
"loss": 0.1251,
"step": 17120
},
{
"epoch": 0.912189147451941,
"grad_norm": 0.041530635207891464,
"learning_rate": 3.1467095344324174e-05,
"loss": 0.1242,
"step": 17130
},
{
"epoch": 0.9127216571702433,
"grad_norm": 0.1386973112821579,
"learning_rate": 3.144948966233577e-05,
"loss": 0.1244,
"step": 17140
},
{
"epoch": 0.9132541668885458,
"grad_norm": 0.06289440393447876,
"learning_rate": 3.143198802243638e-05,
"loss": 0.1246,
"step": 17150
},
{
"epoch": 0.9137866766068481,
"grad_norm": 0.03927746042609215,
"learning_rate": 3.141459047889964e-05,
"loss": 0.1242,
"step": 17160
},
{
"epoch": 0.9143191863251504,
"grad_norm": 0.03453196585178375,
"learning_rate": 3.1397297085676336e-05,
"loss": 0.1243,
"step": 17170
},
{
"epoch": 0.9148516960434528,
"grad_norm": 0.04992485046386719,
"learning_rate": 3.138010789639429e-05,
"loss": 0.1242,
"step": 17180
},
{
"epoch": 0.9153842057617552,
"grad_norm": 0.04788126423954964,
"learning_rate": 3.136302296435818e-05,
"loss": 0.1246,
"step": 17190
},
{
"epoch": 0.9159167154800575,
"grad_norm": 0.05929577723145485,
"learning_rate": 3.1346042342549376e-05,
"loss": 0.1245,
"step": 17200
},
{
"epoch": 0.9164492251983599,
"grad_norm": 0.08015090227127075,
"learning_rate": 3.132916608362578e-05,
"loss": 0.1244,
"step": 17210
},
{
"epoch": 0.9169817349166622,
"grad_norm": 0.06038287281990051,
"learning_rate": 3.131239423992165e-05,
"loss": 0.1245,
"step": 17220
},
{
"epoch": 0.9175142446349646,
"grad_norm": 0.04849204048514366,
"learning_rate": 3.129572686344745e-05,
"loss": 0.1244,
"step": 17230
},
{
"epoch": 0.918046754353267,
"grad_norm": 0.0919271856546402,
"learning_rate": 3.1279164005889696e-05,
"loss": 0.1249,
"step": 17240
},
{
"epoch": 0.9185792640715693,
"grad_norm": 0.039166927337646484,
"learning_rate": 3.126270571861076e-05,
"loss": 0.1246,
"step": 17250
},
{
"epoch": 0.9191117737898716,
"grad_norm": 0.05480289086699486,
"learning_rate": 3.1246352052648764e-05,
"loss": 0.124,
"step": 17260
},
{
"epoch": 0.9196442835081741,
"grad_norm": 0.0434199757874012,
"learning_rate": 3.1230103058717373e-05,
"loss": 0.1245,
"step": 17270
},
{
"epoch": 0.9201767932264764,
"grad_norm": 0.03532974794507027,
"learning_rate": 3.121395878720567e-05,
"loss": 0.1242,
"step": 17280
},
{
"epoch": 0.9207093029447787,
"grad_norm": 0.09806732088327408,
"learning_rate": 3.119791928817798e-05,
"loss": 0.1251,
"step": 17290
},
{
"epoch": 0.9212418126630811,
"grad_norm": 0.0625268891453743,
"learning_rate": 3.1181984611373735e-05,
"loss": 0.1239,
"step": 17300
},
{
"epoch": 0.9217743223813835,
"grad_norm": 0.04700905457139015,
"learning_rate": 3.116615480620727e-05,
"loss": 0.1247,
"step": 17310
},
{
"epoch": 0.9223068320996858,
"grad_norm": 0.0478329174220562,
"learning_rate": 3.1150429921767754e-05,
"loss": 0.1239,
"step": 17320
},
{
"epoch": 0.9228393418179882,
"grad_norm": 0.041897084563970566,
"learning_rate": 3.113481000681897e-05,
"loss": 0.1249,
"step": 17330
},
{
"epoch": 0.9233718515362905,
"grad_norm": 0.04760069027543068,
"learning_rate": 3.111929510979918e-05,
"loss": 0.124,
"step": 17340
},
{
"epoch": 0.9239043612545929,
"grad_norm": 0.06711754202842712,
"learning_rate": 3.110388527882099e-05,
"loss": 0.1239,
"step": 17350
},
{
"epoch": 0.9244368709728953,
"grad_norm": 0.034946054220199585,
"learning_rate": 3.108858056167117e-05,
"loss": 0.1246,
"step": 17360
},
{
"epoch": 0.9249693806911976,
"grad_norm": 0.03772689029574394,
"learning_rate": 3.107338100581056e-05,
"loss": 0.1245,
"step": 17370
},
{
"epoch": 0.9255018904094999,
"grad_norm": 0.04687857627868652,
"learning_rate": 3.105828665837386e-05,
"loss": 0.1249,
"step": 17380
},
{
"epoch": 0.9260344001278024,
"grad_norm": 0.04124782606959343,
"learning_rate": 3.104329756616952e-05,
"loss": 0.1243,
"step": 17390
},
{
"epoch": 0.9265669098461047,
"grad_norm": 0.052532244473695755,
"learning_rate": 3.1028413775679595e-05,
"loss": 0.1245,
"step": 17400
},
{
"epoch": 0.927099419564407,
"grad_norm": 0.0953177809715271,
"learning_rate": 3.101363533305958e-05,
"loss": 0.1241,
"step": 17410
},
{
"epoch": 0.9276319292827094,
"grad_norm": 0.05009876564145088,
"learning_rate": 3.099896228413829e-05,
"loss": 0.1238,
"step": 17420
},
{
"epoch": 0.9281644390010118,
"grad_norm": 0.08097761869430542,
"learning_rate": 3.098439467441771e-05,
"loss": 0.1249,
"step": 17430
},
{
"epoch": 0.9286969487193141,
"grad_norm": 0.047098558396101,
"learning_rate": 3.0969932549072835e-05,
"loss": 0.1233,
"step": 17440
},
{
"epoch": 0.9292294584376165,
"grad_norm": 0.047214169055223465,
"learning_rate": 3.0955575952951575e-05,
"loss": 0.1245,
"step": 17450
},
{
"epoch": 0.9297619681559188,
"grad_norm": 0.048827920109033585,
"learning_rate": 3.0941324930574554e-05,
"loss": 0.1241,
"step": 17460
},
{
"epoch": 0.9302944778742213,
"grad_norm": 0.07658734172582626,
"learning_rate": 3.0927179526135044e-05,
"loss": 0.1237,
"step": 17470
},
{
"epoch": 0.9308269875925236,
"grad_norm": 0.034373264759778976,
"learning_rate": 3.091313978349875e-05,
"loss": 0.1252,
"step": 17480
},
{
"epoch": 0.9313594973108259,
"grad_norm": 0.05511806905269623,
"learning_rate": 3.089920574620375e-05,
"loss": 0.1248,
"step": 17490
},
{
"epoch": 0.9318920070291283,
"grad_norm": 0.04354240372776985,
"learning_rate": 3.0885377457460294e-05,
"loss": 0.125,
"step": 17500
},
{
"epoch": 0.9324245167474307,
"grad_norm": 0.0334496833384037,
"learning_rate": 3.0871654960150706e-05,
"loss": 0.1239,
"step": 17510
},
{
"epoch": 0.932957026465733,
"grad_norm": 0.043775349855422974,
"learning_rate": 3.085803829682928e-05,
"loss": 0.124,
"step": 17520
},
{
"epoch": 0.9334895361840354,
"grad_norm": 0.03564087674021721,
"learning_rate": 3.0844527509722045e-05,
"loss": 0.1238,
"step": 17530
},
{
"epoch": 0.9340220459023377,
"grad_norm": 0.03102003037929535,
"learning_rate": 3.083112264072676e-05,
"loss": 0.1248,
"step": 17540
},
{
"epoch": 0.93455455562064,
"grad_norm": 0.03893466666340828,
"learning_rate": 3.0817823731412704e-05,
"loss": 0.1242,
"step": 17550
},
{
"epoch": 0.9350870653389425,
"grad_norm": 0.05906695872545242,
"learning_rate": 3.0804630823020575e-05,
"loss": 0.1235,
"step": 17560
},
{
"epoch": 0.9356195750572448,
"grad_norm": 0.07867705076932907,
"learning_rate": 3.079154395646233e-05,
"loss": 0.1246,
"step": 17570
},
{
"epoch": 0.9361520847755471,
"grad_norm": 0.047349728643894196,
"learning_rate": 3.077856317232114e-05,
"loss": 0.1238,
"step": 17580
},
{
"epoch": 0.9366845944938496,
"grad_norm": 0.04795532301068306,
"learning_rate": 3.0765688510851144e-05,
"loss": 0.1243,
"step": 17590
},
{
"epoch": 0.9372171042121519,
"grad_norm": 0.06184261292219162,
"learning_rate": 3.075292001197743e-05,
"loss": 0.1252,
"step": 17600
},
{
"epoch": 0.9377496139304542,
"grad_norm": 0.05014181509613991,
"learning_rate": 3.074025771529585e-05,
"loss": 0.1248,
"step": 17610
},
{
"epoch": 0.9382821236487566,
"grad_norm": 0.039317913353443146,
"learning_rate": 3.0727701660072925e-05,
"loss": 0.1239,
"step": 17620
},
{
"epoch": 0.938814633367059,
"grad_norm": 0.10161686688661575,
"learning_rate": 3.0715251885245734e-05,
"loss": 0.1232,
"step": 17630
},
{
"epoch": 0.9393471430853613,
"grad_norm": 0.0897764042019844,
"learning_rate": 3.070290842942173e-05,
"loss": 0.1244,
"step": 17640
},
{
"epoch": 0.9398796528036637,
"grad_norm": 0.056616149842739105,
"learning_rate": 3.0690671330878704e-05,
"loss": 0.1242,
"step": 17650
},
{
"epoch": 0.940412162521966,
"grad_norm": 0.04649018496274948,
"learning_rate": 3.0678540627564614e-05,
"loss": 0.1241,
"step": 17660
},
{
"epoch": 0.9409446722402683,
"grad_norm": 0.06587915867567062,
"learning_rate": 3.066651635709746e-05,
"loss": 0.1239,
"step": 17670
},
{
"epoch": 0.9414771819585708,
"grad_norm": 0.04945458844304085,
"learning_rate": 3.065459855676523e-05,
"loss": 0.124,
"step": 17680
},
{
"epoch": 0.9420096916768731,
"grad_norm": 0.0448901429772377,
"learning_rate": 3.06427872635257e-05,
"loss": 0.1234,
"step": 17690
},
{
"epoch": 0.9425422013951754,
"grad_norm": 0.04893770441412926,
"learning_rate": 3.063108251400638e-05,
"loss": 0.1249,
"step": 17700
},
{
"epoch": 0.9430747111134778,
"grad_norm": 0.050453029572963715,
"learning_rate": 3.06194843445044e-05,
"loss": 0.125,
"step": 17710
},
{
"epoch": 0.9436072208317802,
"grad_norm": 0.07537666708230972,
"learning_rate": 3.060799279098633e-05,
"loss": 0.1249,
"step": 17720
},
{
"epoch": 0.9441397305500825,
"grad_norm": 0.04989492520689964,
"learning_rate": 3.059660788908817e-05,
"loss": 0.1246,
"step": 17730
},
{
"epoch": 0.9446722402683849,
"grad_norm": 0.06518174707889557,
"learning_rate": 3.058532967411516e-05,
"loss": 0.1251,
"step": 17740
},
{
"epoch": 0.9452047499866872,
"grad_norm": 0.046202413737773895,
"learning_rate": 3.057415818104169e-05,
"loss": 0.1238,
"step": 17750
},
{
"epoch": 0.9457372597049896,
"grad_norm": 0.05071398615837097,
"learning_rate": 3.056309344451123e-05,
"loss": 0.1243,
"step": 17760
},
{
"epoch": 0.946269769423292,
"grad_norm": 0.08481655269861221,
"learning_rate": 3.0552135498836165e-05,
"loss": 0.1247,
"step": 17770
},
{
"epoch": 0.9468022791415943,
"grad_norm": 0.05741250142455101,
"learning_rate": 3.0541284377997724e-05,
"loss": 0.1251,
"step": 17780
},
{
"epoch": 0.9473347888598967,
"grad_norm": 0.060524262487888336,
"learning_rate": 3.053054011564587e-05,
"loss": 0.1239,
"step": 17790
},
{
"epoch": 0.9478672985781991,
"grad_norm": 0.056961771100759506,
"learning_rate": 3.051990274509917e-05,
"loss": 0.1249,
"step": 17800
},
{
"epoch": 0.9483998082965014,
"grad_norm": 0.053151581436395645,
"learning_rate": 3.050937229934475e-05,
"loss": 0.1247,
"step": 17810
},
{
"epoch": 0.9489323180148038,
"grad_norm": 0.04388433322310448,
"learning_rate": 3.049894881103813e-05,
"loss": 0.1241,
"step": 17820
},
{
"epoch": 0.9494648277331061,
"grad_norm": 0.05028518661856651,
"learning_rate": 3.0488632312503152e-05,
"loss": 0.1233,
"step": 17830
},
{
"epoch": 0.9499973374514085,
"grad_norm": 0.06016720086336136,
"learning_rate": 3.0478422835731874e-05,
"loss": 0.1246,
"step": 17840
},
{
"epoch": 0.9505298471697109,
"grad_norm": 0.05909838154911995,
"learning_rate": 3.0468320412384498e-05,
"loss": 0.1246,
"step": 17850
},
{
"epoch": 0.9510623568880132,
"grad_norm": 0.06923595815896988,
"learning_rate": 3.0458325073789212e-05,
"loss": 0.1242,
"step": 17860
},
{
"epoch": 0.9515948666063155,
"grad_norm": 0.041968394070863724,
"learning_rate": 3.0448436850942146e-05,
"loss": 0.1243,
"step": 17870
},
{
"epoch": 0.952127376324618,
"grad_norm": 0.038111716508865356,
"learning_rate": 3.0438655774507256e-05,
"loss": 0.1241,
"step": 17880
},
{
"epoch": 0.9526598860429203,
"grad_norm": 0.09092561900615692,
"learning_rate": 3.0428981874816235e-05,
"loss": 0.1237,
"step": 17890
},
{
"epoch": 0.9531923957612226,
"grad_norm": 0.031990304589271545,
"learning_rate": 3.0419415181868416e-05,
"loss": 0.1241,
"step": 17900
},
{
"epoch": 0.953724905479525,
"grad_norm": 0.05379229038953781,
"learning_rate": 3.0409955725330652e-05,
"loss": 0.1236,
"step": 17910
},
{
"epoch": 0.9542574151978274,
"grad_norm": 0.05789874866604805,
"learning_rate": 3.0400603534537282e-05,
"loss": 0.1236,
"step": 17920
},
{
"epoch": 0.9547899249161297,
"grad_norm": 0.045958537608385086,
"learning_rate": 3.0391358638489997e-05,
"loss": 0.1241,
"step": 17930
},
{
"epoch": 0.9553224346344321,
"grad_norm": 0.06445147842168808,
"learning_rate": 3.0382221065857753e-05,
"loss": 0.1239,
"step": 17940
},
{
"epoch": 0.9558549443527344,
"grad_norm": 0.04746498540043831,
"learning_rate": 3.0373190844976695e-05,
"loss": 0.1239,
"step": 17950
},
{
"epoch": 0.9563874540710368,
"grad_norm": 0.037877731025218964,
"learning_rate": 3.0364268003850065e-05,
"loss": 0.1242,
"step": 17960
},
{
"epoch": 0.9569199637893392,
"grad_norm": 0.07475633174180984,
"learning_rate": 3.0355452570148126e-05,
"loss": 0.1241,
"step": 17970
},
{
"epoch": 0.9574524735076415,
"grad_norm": 0.061180293560028076,
"learning_rate": 3.0346744571208034e-05,
"loss": 0.1241,
"step": 17980
},
{
"epoch": 0.9579849832259438,
"grad_norm": 0.05577493831515312,
"learning_rate": 3.033814403403381e-05,
"loss": 0.1236,
"step": 17990
},
{
"epoch": 0.9585174929442463,
"grad_norm": 0.04320796579122543,
"learning_rate": 3.0329650985296228e-05,
"loss": 0.1236,
"step": 18000
},
{
"epoch": 0.9590500026625486,
"grad_norm": 0.07489881664514542,
"learning_rate": 3.032126545133271e-05,
"loss": 0.1249,
"step": 18010
},
{
"epoch": 0.9595825123808509,
"grad_norm": 0.050032421946525574,
"learning_rate": 3.0312987458147298e-05,
"loss": 0.1245,
"step": 18020
},
{
"epoch": 0.9601150220991533,
"grad_norm": 0.035527851432561874,
"learning_rate": 3.030481703141053e-05,
"loss": 0.1239,
"step": 18030
},
{
"epoch": 0.9606475318174557,
"grad_norm": 0.0719723030924797,
"learning_rate": 3.0296754196459377e-05,
"loss": 0.1234,
"step": 18040
},
{
"epoch": 0.961180041535758,
"grad_norm": 0.07368568331003189,
"learning_rate": 3.028879897829716e-05,
"loss": 0.1244,
"step": 18050
},
{
"epoch": 0.9617125512540604,
"grad_norm": 0.08034256100654602,
"learning_rate": 3.028095140159347e-05,
"loss": 0.1249,
"step": 18060
},
{
"epoch": 0.9622450609723627,
"grad_norm": 0.06598822772502899,
"learning_rate": 3.0273211490684106e-05,
"loss": 0.1243,
"step": 18070
},
{
"epoch": 0.962777570690665,
"grad_norm": 0.048927973955869675,
"learning_rate": 3.0265579269570976e-05,
"loss": 0.125,
"step": 18080
},
{
"epoch": 0.9633100804089675,
"grad_norm": 0.035660270601511,
"learning_rate": 3.025805476192205e-05,
"loss": 0.1239,
"step": 18090
},
{
"epoch": 0.9638425901272698,
"grad_norm": 0.055542632937431335,
"learning_rate": 3.025063799107126e-05,
"loss": 0.1237,
"step": 18100
},
{
"epoch": 0.9643750998455722,
"grad_norm": 0.06694008409976959,
"learning_rate": 3.0243328980018447e-05,
"loss": 0.1232,
"step": 18110
},
{
"epoch": 0.9649076095638746,
"grad_norm": 0.04520373046398163,
"learning_rate": 3.0236127751429284e-05,
"loss": 0.1245,
"step": 18120
},
{
"epoch": 0.9654401192821769,
"grad_norm": 0.050599873065948486,
"learning_rate": 3.022903432763519e-05,
"loss": 0.1236,
"step": 18130
},
{
"epoch": 0.9659726290004793,
"grad_norm": 0.10253104567527771,
"learning_rate": 3.02220487306333e-05,
"loss": 0.1239,
"step": 18140
},
{
"epoch": 0.9665051387187816,
"grad_norm": 0.0527169369161129,
"learning_rate": 3.021517098208635e-05,
"loss": 0.1243,
"step": 18150
},
{
"epoch": 0.967037648437084,
"grad_norm": 0.06449782848358154,
"learning_rate": 3.0208401103322637e-05,
"loss": 0.1237,
"step": 18160
},
{
"epoch": 0.9675701581553864,
"grad_norm": 0.061875950545072556,
"learning_rate": 3.0201739115335952e-05,
"loss": 0.1238,
"step": 18170
},
{
"epoch": 0.9681026678736887,
"grad_norm": 0.037850238382816315,
"learning_rate": 3.0195185038785507e-05,
"loss": 0.1249,
"step": 18180
},
{
"epoch": 0.968635177591991,
"grad_norm": 0.04714973270893097,
"learning_rate": 3.0188738893995878e-05,
"loss": 0.1244,
"step": 18190
},
{
"epoch": 0.9691676873102935,
"grad_norm": 0.09865976870059967,
"learning_rate": 3.0182400700956943e-05,
"loss": 0.1248,
"step": 18200
},
{
"epoch": 0.9697001970285958,
"grad_norm": 0.03763122111558914,
"learning_rate": 3.0176170479323794e-05,
"loss": 0.1242,
"step": 18210
},
{
"epoch": 0.9702327067468981,
"grad_norm": 0.038522519171237946,
"learning_rate": 3.017004824841672e-05,
"loss": 0.1245,
"step": 18220
},
{
"epoch": 0.9707652164652005,
"grad_norm": 0.06458954513072968,
"learning_rate": 3.0164034027221112e-05,
"loss": 0.1235,
"step": 18230
},
{
"epoch": 0.9712977261835029,
"grad_norm": 0.04066864028573036,
"learning_rate": 3.015812783438743e-05,
"loss": 0.1247,
"step": 18240
},
{
"epoch": 0.9718302359018052,
"grad_norm": 0.0487934835255146,
"learning_rate": 3.0152329688231107e-05,
"loss": 0.125,
"step": 18250
},
{
"epoch": 0.9723627456201076,
"grad_norm": 0.0468660444021225,
"learning_rate": 3.014663960673254e-05,
"loss": 0.1238,
"step": 18260
},
{
"epoch": 0.9728952553384099,
"grad_norm": 0.05677594989538193,
"learning_rate": 3.014105760753701e-05,
"loss": 0.1236,
"step": 18270
},
{
"epoch": 0.9734277650567122,
"grad_norm": 0.042526934295892715,
"learning_rate": 3.0135583707954613e-05,
"loss": 0.1234,
"step": 18280
},
{
"epoch": 0.9739602747750147,
"grad_norm": 0.06067803502082825,
"learning_rate": 3.0130217924960234e-05,
"loss": 0.1248,
"step": 18290
},
{
"epoch": 0.974492784493317,
"grad_norm": 0.04792502894997597,
"learning_rate": 3.012496027519348e-05,
"loss": 0.1242,
"step": 18300
},
{
"epoch": 0.9750252942116193,
"grad_norm": 0.05716263875365257,
"learning_rate": 3.011981077495863e-05,
"loss": 0.1241,
"step": 18310
},
{
"epoch": 0.9755578039299218,
"grad_norm": 0.04997260496020317,
"learning_rate": 3.011476944022458e-05,
"loss": 0.124,
"step": 18320
},
{
"epoch": 0.9760903136482241,
"grad_norm": 0.09484563767910004,
"learning_rate": 3.010983628662481e-05,
"loss": 0.1242,
"step": 18330
},
{
"epoch": 0.9766228233665264,
"grad_norm": 0.0866529643535614,
"learning_rate": 3.010501132945731e-05,
"loss": 0.1242,
"step": 18340
},
{
"epoch": 0.9771553330848288,
"grad_norm": 0.0486617274582386,
"learning_rate": 3.0100294583684557e-05,
"loss": 0.1238,
"step": 18350
},
{
"epoch": 0.9776878428031311,
"grad_norm": 0.032813165336847305,
"learning_rate": 3.0095686063933453e-05,
"loss": 0.1243,
"step": 18360
},
{
"epoch": 0.9782203525214335,
"grad_norm": 0.07168902456760406,
"learning_rate": 3.009118578449529e-05,
"loss": 0.1249,
"step": 18370
},
{
"epoch": 0.9787528622397359,
"grad_norm": 0.06777796894311905,
"learning_rate": 3.0086793759325693e-05,
"loss": 0.1246,
"step": 18380
},
{
"epoch": 0.9792853719580382,
"grad_norm": 0.0522090420126915,
"learning_rate": 3.0082510002044588e-05,
"loss": 0.1238,
"step": 18390
},
{
"epoch": 0.9798178816763405,
"grad_norm": 0.07006611675024033,
"learning_rate": 3.0078334525936163e-05,
"loss": 0.1243,
"step": 18400
},
{
"epoch": 0.980350391394643,
"grad_norm": 0.06946975737810135,
"learning_rate": 3.0074267343948805e-05,
"loss": 0.1244,
"step": 18410
},
{
"epoch": 0.9808829011129453,
"grad_norm": 0.057238370180130005,
"learning_rate": 3.0070308468695084e-05,
"loss": 0.1234,
"step": 18420
},
{
"epoch": 0.9814154108312477,
"grad_norm": 0.06210003048181534,
"learning_rate": 3.0066457912451707e-05,
"loss": 0.1237,
"step": 18430
},
{
"epoch": 0.98194792054955,
"grad_norm": 0.0649491548538208,
"learning_rate": 3.006271568715947e-05,
"loss": 0.1233,
"step": 18440
},
{
"epoch": 0.9824804302678524,
"grad_norm": 0.038359202444553375,
"learning_rate": 3.0059081804423232e-05,
"loss": 0.1236,
"step": 18450
},
{
"epoch": 0.9830129399861548,
"grad_norm": 0.05497262626886368,
"learning_rate": 3.0055556275511883e-05,
"loss": 0.1239,
"step": 18460
},
{
"epoch": 0.9835454497044571,
"grad_norm": 0.038170114159584045,
"learning_rate": 3.005213911135828e-05,
"loss": 0.1242,
"step": 18470
},
{
"epoch": 0.9840779594227594,
"grad_norm": 0.04138299450278282,
"learning_rate": 3.004883032255925e-05,
"loss": 0.1246,
"step": 18480
},
{
"epoch": 0.9846104691410619,
"grad_norm": 0.0714152529835701,
"learning_rate": 3.004562991937555e-05,
"loss": 0.1239,
"step": 18490
},
{
"epoch": 0.9851429788593642,
"grad_norm": 0.08420536667108536,
"learning_rate": 3.0042537911731818e-05,
"loss": 0.1242,
"step": 18500
},
{
"epoch": 0.9856754885776665,
"grad_norm": 0.036863774061203,
"learning_rate": 3.0039554309216533e-05,
"loss": 0.1245,
"step": 18510
},
{
"epoch": 0.986207998295969,
"grad_norm": 0.059715207666158676,
"learning_rate": 3.003667912108204e-05,
"loss": 0.1249,
"step": 18520
},
{
"epoch": 0.9867405080142713,
"grad_norm": 0.039495982229709625,
"learning_rate": 3.0033912356244453e-05,
"loss": 0.1241,
"step": 18530
},
{
"epoch": 0.9872730177325736,
"grad_norm": 0.0337030254304409,
"learning_rate": 3.0031254023283678e-05,
"loss": 0.1244,
"step": 18540
},
{
"epoch": 0.987805527450876,
"grad_norm": 0.04630092531442642,
"learning_rate": 3.0028704130443352e-05,
"loss": 0.1237,
"step": 18550
},
{
"epoch": 0.9883380371691783,
"grad_norm": 0.03899266943335533,
"learning_rate": 3.0026262685630846e-05,
"loss": 0.124,
"step": 18560
},
{
"epoch": 0.9888705468874807,
"grad_norm": 0.04212348535656929,
"learning_rate": 3.002392969641723e-05,
"loss": 0.1234,
"step": 18570
},
{
"epoch": 0.9894030566057831,
"grad_norm": 0.03455604612827301,
"learning_rate": 3.0021705170037227e-05,
"loss": 0.1241,
"step": 18580
},
{
"epoch": 0.9899355663240854,
"grad_norm": 0.08309216052293777,
"learning_rate": 3.0019589113389234e-05,
"loss": 0.1238,
"step": 18590
},
{
"epoch": 0.9904680760423877,
"grad_norm": 0.09158316254615784,
"learning_rate": 3.0017581533035255e-05,
"loss": 0.1238,
"step": 18600
},
{
"epoch": 0.9910005857606902,
"grad_norm": 0.043327104300260544,
"learning_rate": 3.0015682435200926e-05,
"loss": 0.1249,
"step": 18610
},
{
"epoch": 0.9915330954789925,
"grad_norm": 0.05239805579185486,
"learning_rate": 3.001389182577545e-05,
"loss": 0.1243,
"step": 18620
},
{
"epoch": 0.9920656051972948,
"grad_norm": 0.07973194122314453,
"learning_rate": 3.0012209710311613e-05,
"loss": 0.1239,
"step": 18630
},
{
"epoch": 0.9925981149155972,
"grad_norm": 0.04951346665620804,
"learning_rate": 3.001063609402576e-05,
"loss": 0.1229,
"step": 18640
},
{
"epoch": 0.9931306246338996,
"grad_norm": 0.038230083882808685,
"learning_rate": 3.0009170981797758e-05,
"loss": 0.1236,
"step": 18650
},
{
"epoch": 0.9936631343522019,
"grad_norm": 0.05738453194499016,
"learning_rate": 3.0007814378171008e-05,
"loss": 0.1242,
"step": 18660
},
{
"epoch": 0.9941956440705043,
"grad_norm": 0.060486044734716415,
"learning_rate": 3.0006566287352423e-05,
"loss": 0.1242,
"step": 18670
},
{
"epoch": 0.9947281537888066,
"grad_norm": 0.05927567929029465,
"learning_rate": 3.0005426713212397e-05,
"loss": 0.1243,
"step": 18680
},
{
"epoch": 0.995260663507109,
"grad_norm": 0.10084035247564316,
"learning_rate": 3.000439565928482e-05,
"loss": 0.1243,
"step": 18690
},
{
"epoch": 0.9957931732254114,
"grad_norm": 0.04450371488928795,
"learning_rate": 3.0003473128767058e-05,
"loss": 0.1244,
"step": 18700
},
{
"epoch": 0.9963256829437137,
"grad_norm": 0.059074439108371735,
"learning_rate": 3.000265912451991e-05,
"loss": 0.1232,
"step": 18710
},
{
"epoch": 0.996858192662016,
"grad_norm": 0.04628562554717064,
"learning_rate": 3.0001953649067676e-05,
"loss": 0.1247,
"step": 18720
},
{
"epoch": 0.9973907023803185,
"grad_norm": 0.0613800473511219,
"learning_rate": 3.000135670459806e-05,
"loss": 0.1241,
"step": 18730
},
{
"epoch": 0.9979232120986208,
"grad_norm": 0.060382645577192307,
"learning_rate": 3.000086829296223e-05,
"loss": 0.1247,
"step": 18740
},
{
"epoch": 0.9984557218169232,
"grad_norm": 0.03309040144085884,
"learning_rate": 3.0000488415674777e-05,
"loss": 0.1242,
"step": 18750
},
{
"epoch": 0.9989882315352255,
"grad_norm": 0.047177914530038834,
"learning_rate": 3.0000217073913716e-05,
"loss": 0.124,
"step": 18760
},
{
"epoch": 0.9995207412535279,
"grad_norm": 0.03689567372202873,
"learning_rate": 3.00000542685205e-05,
"loss": 0.1245,
"step": 18770
},
{
"epoch": 1.0,
"step": 18779,
"total_flos": 5.994863411375112e+18,
"train_loss": 0.031714059101823636,
"train_runtime": 3465.0177,
"train_samples_per_second": 1387.365,
"train_steps_per_second": 5.42
}
],
"logging_steps": 10,
"max_steps": 18779,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.994863411375112e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}