{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 834,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0036019810895992796,
"grad_norm": 8.959743121022326,
"learning_rate": 0.0,
"loss": 1.8318,
"step": 1
},
{
"epoch": 0.007203962179198559,
"grad_norm": 8.599514774705224,
"learning_rate": 1.1904761904761906e-07,
"loss": 1.8629,
"step": 2
},
{
"epoch": 0.010805943268797838,
"grad_norm": 8.27099405489232,
"learning_rate": 2.3809523809523811e-07,
"loss": 1.8047,
"step": 3
},
{
"epoch": 0.014407924358397118,
"grad_norm": 8.807297661164533,
"learning_rate": 3.5714285714285716e-07,
"loss": 1.838,
"step": 4
},
{
"epoch": 0.018009905447996397,
"grad_norm": 9.118038313686268,
"learning_rate": 4.7619047619047623e-07,
"loss": 1.8445,
"step": 5
},
{
"epoch": 0.021611886537595677,
"grad_norm": 9.342871560722859,
"learning_rate": 5.952380952380953e-07,
"loss": 1.8716,
"step": 6
},
{
"epoch": 0.025213867627194957,
"grad_norm": 8.104634918034305,
"learning_rate": 7.142857142857143e-07,
"loss": 1.7625,
"step": 7
},
{
"epoch": 0.028815848716794237,
"grad_norm": 8.497419415075033,
"learning_rate": 8.333333333333333e-07,
"loss": 1.8662,
"step": 8
},
{
"epoch": 0.03241782980639352,
"grad_norm": 8.997271493582199,
"learning_rate": 9.523809523809525e-07,
"loss": 1.8798,
"step": 9
},
{
"epoch": 0.03601981089599279,
"grad_norm": 7.186846943308143,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.7647,
"step": 10
},
{
"epoch": 0.03962179198559208,
"grad_norm": 7.023004901959916,
"learning_rate": 1.1904761904761906e-06,
"loss": 1.8083,
"step": 11
},
{
"epoch": 0.04322377307519135,
"grad_norm": 7.3197143781084195,
"learning_rate": 1.3095238095238096e-06,
"loss": 1.8125,
"step": 12
},
{
"epoch": 0.04682575416479064,
"grad_norm": 5.742362390459089,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.7355,
"step": 13
},
{
"epoch": 0.05042773525438991,
"grad_norm": 5.054071547886992,
"learning_rate": 1.5476190476190479e-06,
"loss": 1.7257,
"step": 14
},
{
"epoch": 0.0540297163439892,
"grad_norm": 5.035497816412668,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.744,
"step": 15
},
{
"epoch": 0.05763169743358847,
"grad_norm": 4.701882228340393,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.6886,
"step": 16
},
{
"epoch": 0.06123367852318776,
"grad_norm": 4.331238204850519,
"learning_rate": 1.904761904761905e-06,
"loss": 1.6805,
"step": 17
},
{
"epoch": 0.06483565961278703,
"grad_norm": 2.767544145879741,
"learning_rate": 2.023809523809524e-06,
"loss": 1.6131,
"step": 18
},
{
"epoch": 0.06843764070238631,
"grad_norm": 2.6868680980231345,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.5981,
"step": 19
},
{
"epoch": 0.07203962179198559,
"grad_norm": 2.4108574787799673,
"learning_rate": 2.261904761904762e-06,
"loss": 1.5926,
"step": 20
},
{
"epoch": 0.07564160288158488,
"grad_norm": 2.4672291267909237,
"learning_rate": 2.380952380952381e-06,
"loss": 1.5968,
"step": 21
},
{
"epoch": 0.07924358397118415,
"grad_norm": 2.1181123521738914,
"learning_rate": 2.5e-06,
"loss": 1.5456,
"step": 22
},
{
"epoch": 0.08284556506078343,
"grad_norm": 2.3108498703640565,
"learning_rate": 2.6190476190476192e-06,
"loss": 1.5908,
"step": 23
},
{
"epoch": 0.0864475461503827,
"grad_norm": 1.593382451878654,
"learning_rate": 2.7380952380952387e-06,
"loss": 1.5203,
"step": 24
},
{
"epoch": 0.090049527239982,
"grad_norm": 1.8594286026551032,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.4988,
"step": 25
},
{
"epoch": 0.09365150832958127,
"grad_norm": 1.812932314030098,
"learning_rate": 2.9761904761904763e-06,
"loss": 1.5038,
"step": 26
},
{
"epoch": 0.09725348941918055,
"grad_norm": 1.6854819502367606,
"learning_rate": 3.0952380952380957e-06,
"loss": 1.5037,
"step": 27
},
{
"epoch": 0.10085547050877983,
"grad_norm": 1.4363843088807504,
"learning_rate": 3.2142857142857147e-06,
"loss": 1.4492,
"step": 28
},
{
"epoch": 0.1044574515983791,
"grad_norm": 1.2720392687132038,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.4398,
"step": 29
},
{
"epoch": 0.1080594326879784,
"grad_norm": 1.151633877445218,
"learning_rate": 3.4523809523809528e-06,
"loss": 1.427,
"step": 30
},
{
"epoch": 0.11166141377757767,
"grad_norm": 0.9908597997660091,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.4175,
"step": 31
},
{
"epoch": 0.11526339486717695,
"grad_norm": 0.8183196011685436,
"learning_rate": 3.690476190476191e-06,
"loss": 1.3983,
"step": 32
},
{
"epoch": 0.11886537595677622,
"grad_norm": 0.8635221903260426,
"learning_rate": 3.80952380952381e-06,
"loss": 1.4133,
"step": 33
},
{
"epoch": 0.12246735704637551,
"grad_norm": 0.8674489391428222,
"learning_rate": 3.928571428571429e-06,
"loss": 1.3498,
"step": 34
},
{
"epoch": 0.1260693381359748,
"grad_norm": 0.9238078524289024,
"learning_rate": 4.047619047619048e-06,
"loss": 1.3676,
"step": 35
},
{
"epoch": 0.12967131922557407,
"grad_norm": 0.8890230038452177,
"learning_rate": 4.166666666666667e-06,
"loss": 1.3671,
"step": 36
},
{
"epoch": 0.13327330031517334,
"grad_norm": 0.7855135929116516,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.3752,
"step": 37
},
{
"epoch": 0.13687528140477262,
"grad_norm": 0.7376255448971343,
"learning_rate": 4.404761904761905e-06,
"loss": 1.3343,
"step": 38
},
{
"epoch": 0.1404772624943719,
"grad_norm": 0.594654333690764,
"learning_rate": 4.523809523809524e-06,
"loss": 1.3251,
"step": 39
},
{
"epoch": 0.14407924358397117,
"grad_norm": 0.5613128279020647,
"learning_rate": 4.642857142857144e-06,
"loss": 1.3126,
"step": 40
},
{
"epoch": 0.14768122467357048,
"grad_norm": 0.560019251447276,
"learning_rate": 4.761904761904762e-06,
"loss": 1.3397,
"step": 41
},
{
"epoch": 0.15128320576316975,
"grad_norm": 0.5314471256259126,
"learning_rate": 4.880952380952381e-06,
"loss": 1.2782,
"step": 42
},
{
"epoch": 0.15488518685276903,
"grad_norm": 0.5332998102282385,
"learning_rate": 5e-06,
"loss": 1.3021,
"step": 43
},
{
"epoch": 0.1584871679423683,
"grad_norm": 0.5282474767077582,
"learning_rate": 5.119047619047619e-06,
"loss": 1.2855,
"step": 44
},
{
"epoch": 0.16208914903196758,
"grad_norm": 0.5050735166019568,
"learning_rate": 5.2380952380952384e-06,
"loss": 1.2798,
"step": 45
},
{
"epoch": 0.16569113012156686,
"grad_norm": 0.5264954959532085,
"learning_rate": 5.357142857142857e-06,
"loss": 1.2685,
"step": 46
},
{
"epoch": 0.16929311121116614,
"grad_norm": 0.49891809978749935,
"learning_rate": 5.476190476190477e-06,
"loss": 1.3067,
"step": 47
},
{
"epoch": 0.1728950923007654,
"grad_norm": 0.43713019827527205,
"learning_rate": 5.595238095238096e-06,
"loss": 1.2912,
"step": 48
},
{
"epoch": 0.1764970733903647,
"grad_norm": 0.42268295993435495,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.2677,
"step": 49
},
{
"epoch": 0.180099054479964,
"grad_norm": 0.42192747607841885,
"learning_rate": 5.833333333333334e-06,
"loss": 1.2945,
"step": 50
},
{
"epoch": 0.18370103556956327,
"grad_norm": 0.37897863688600525,
"learning_rate": 5.9523809523809525e-06,
"loss": 1.2431,
"step": 51
},
{
"epoch": 0.18730301665916255,
"grad_norm": 0.40405168811471465,
"learning_rate": 6.071428571428571e-06,
"loss": 1.2804,
"step": 52
},
{
"epoch": 0.19090499774876182,
"grad_norm": 0.3934601575838702,
"learning_rate": 6.1904761904761914e-06,
"loss": 1.2597,
"step": 53
},
{
"epoch": 0.1945069788383611,
"grad_norm": 0.41031465320208005,
"learning_rate": 6.30952380952381e-06,
"loss": 1.2451,
"step": 54
},
{
"epoch": 0.19810895992796038,
"grad_norm": 0.3681773008428082,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.26,
"step": 55
},
{
"epoch": 0.20171094101755965,
"grad_norm": 0.36871798332732425,
"learning_rate": 6.547619047619048e-06,
"loss": 1.2251,
"step": 56
},
{
"epoch": 0.20531292210715893,
"grad_norm": 0.35510983491031706,
"learning_rate": 6.666666666666667e-06,
"loss": 1.2455,
"step": 57
},
{
"epoch": 0.2089149031967582,
"grad_norm": 0.3348595552564557,
"learning_rate": 6.785714285714287e-06,
"loss": 1.2582,
"step": 58
},
{
"epoch": 0.2125168842863575,
"grad_norm": 0.33479239236035946,
"learning_rate": 6.9047619047619055e-06,
"loss": 1.229,
"step": 59
},
{
"epoch": 0.2161188653759568,
"grad_norm": 0.35235529909722807,
"learning_rate": 7.023809523809524e-06,
"loss": 1.194,
"step": 60
},
{
"epoch": 0.21972084646555606,
"grad_norm": 0.3612868104143937,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.216,
"step": 61
},
{
"epoch": 0.22332282755515534,
"grad_norm": 0.33124367306424535,
"learning_rate": 7.261904761904762e-06,
"loss": 1.2322,
"step": 62
},
{
"epoch": 0.22692480864475462,
"grad_norm": 0.31005107875726384,
"learning_rate": 7.380952380952382e-06,
"loss": 1.1965,
"step": 63
},
{
"epoch": 0.2305267897343539,
"grad_norm": 0.35704168784229545,
"learning_rate": 7.500000000000001e-06,
"loss": 1.2472,
"step": 64
},
{
"epoch": 0.23412877082395317,
"grad_norm": 0.3009141198350835,
"learning_rate": 7.61904761904762e-06,
"loss": 1.2042,
"step": 65
},
{
"epoch": 0.23773075191355245,
"grad_norm": 0.3361466848573483,
"learning_rate": 7.738095238095238e-06,
"loss": 1.1972,
"step": 66
},
{
"epoch": 0.24133273300315172,
"grad_norm": 0.32300651355695426,
"learning_rate": 7.857142857142858e-06,
"loss": 1.1821,
"step": 67
},
{
"epoch": 0.24493471409275103,
"grad_norm": 0.34432276589140803,
"learning_rate": 7.976190476190477e-06,
"loss": 1.2158,
"step": 68
},
{
"epoch": 0.2485366951823503,
"grad_norm": 0.32159883103552483,
"learning_rate": 8.095238095238097e-06,
"loss": 1.2273,
"step": 69
},
{
"epoch": 0.2521386762719496,
"grad_norm": 0.3132280942086526,
"learning_rate": 8.214285714285714e-06,
"loss": 1.217,
"step": 70
},
{
"epoch": 0.25574065736154883,
"grad_norm": 0.34857390044798864,
"learning_rate": 8.333333333333334e-06,
"loss": 1.1963,
"step": 71
},
{
"epoch": 0.25934263845114813,
"grad_norm": 0.28499241440527673,
"learning_rate": 8.452380952380953e-06,
"loss": 1.1921,
"step": 72
},
{
"epoch": 0.26294461954074744,
"grad_norm": 0.3170111251216066,
"learning_rate": 8.571428571428571e-06,
"loss": 1.1975,
"step": 73
},
{
"epoch": 0.2665466006303467,
"grad_norm": 0.3211922078756118,
"learning_rate": 8.690476190476192e-06,
"loss": 1.1704,
"step": 74
},
{
"epoch": 0.270148581719946,
"grad_norm": 0.30453515336097836,
"learning_rate": 8.80952380952381e-06,
"loss": 1.2062,
"step": 75
},
{
"epoch": 0.27375056280954524,
"grad_norm": 0.3064941559502552,
"learning_rate": 8.92857142857143e-06,
"loss": 1.1928,
"step": 76
},
{
"epoch": 0.27735254389914454,
"grad_norm": 0.33218232714495777,
"learning_rate": 9.047619047619049e-06,
"loss": 1.205,
"step": 77
},
{
"epoch": 0.2809545249887438,
"grad_norm": 0.29079080164563587,
"learning_rate": 9.166666666666666e-06,
"loss": 1.2031,
"step": 78
},
{
"epoch": 0.2845565060783431,
"grad_norm": 0.31159296882004955,
"learning_rate": 9.285714285714288e-06,
"loss": 1.212,
"step": 79
},
{
"epoch": 0.28815848716794235,
"grad_norm": 0.2950167931965713,
"learning_rate": 9.404761904761905e-06,
"loss": 1.21,
"step": 80
},
{
"epoch": 0.29176046825754165,
"grad_norm": 0.3168487800792039,
"learning_rate": 9.523809523809525e-06,
"loss": 1.1467,
"step": 81
},
{
"epoch": 0.29536244934714095,
"grad_norm": 0.31180168015480736,
"learning_rate": 9.642857142857144e-06,
"loss": 1.1904,
"step": 82
},
{
"epoch": 0.2989644304367402,
"grad_norm": 0.31394964544202014,
"learning_rate": 9.761904761904762e-06,
"loss": 1.1742,
"step": 83
},
{
"epoch": 0.3025664115263395,
"grad_norm": 0.28380748068760736,
"learning_rate": 9.880952380952381e-06,
"loss": 1.2007,
"step": 84
},
{
"epoch": 0.30616839261593876,
"grad_norm": 0.3122970930157758,
"learning_rate": 1e-05,
"loss": 1.1607,
"step": 85
},
{
"epoch": 0.30977037370553806,
"grad_norm": 0.30341382381199433,
"learning_rate": 9.999956135155688e-06,
"loss": 1.1943,
"step": 86
},
{
"epoch": 0.3133723547951373,
"grad_norm": 0.29699266871397906,
"learning_rate": 9.999824541392404e-06,
"loss": 1.156,
"step": 87
},
{
"epoch": 0.3169743358847366,
"grad_norm": 0.3370219209966227,
"learning_rate": 9.999605221019082e-06,
"loss": 1.166,
"step": 88
},
{
"epoch": 0.32057631697433586,
"grad_norm": 0.30301135573648547,
"learning_rate": 9.999298177883902e-06,
"loss": 1.186,
"step": 89
},
{
"epoch": 0.32417829806393517,
"grad_norm": 0.30025420893856164,
"learning_rate": 9.998903417374228e-06,
"loss": 1.1832,
"step": 90
},
{
"epoch": 0.32778027915353447,
"grad_norm": 0.3211722926193115,
"learning_rate": 9.9984209464165e-06,
"loss": 1.1309,
"step": 91
},
{
"epoch": 0.3313822602431337,
"grad_norm": 0.34378918525170493,
"learning_rate": 9.997850773476126e-06,
"loss": 1.1822,
"step": 92
},
{
"epoch": 0.334984241332733,
"grad_norm": 0.3202545861207382,
"learning_rate": 9.997192908557322e-06,
"loss": 1.1644,
"step": 93
},
{
"epoch": 0.3385862224223323,
"grad_norm": 0.3192066377734346,
"learning_rate": 9.996447363202947e-06,
"loss": 1.1827,
"step": 94
},
{
"epoch": 0.3421882035119316,
"grad_norm": 0.32504247647618456,
"learning_rate": 9.995614150494293e-06,
"loss": 1.16,
"step": 95
},
{
"epoch": 0.3457901846015308,
"grad_norm": 0.37075579456497426,
"learning_rate": 9.994693285050858e-06,
"loss": 1.1813,
"step": 96
},
{
"epoch": 0.34939216569113013,
"grad_norm": 0.3604600871283949,
"learning_rate": 9.99368478303009e-06,
"loss": 1.1535,
"step": 97
},
{
"epoch": 0.3529941467807294,
"grad_norm": 0.3272940747117161,
"learning_rate": 9.9925886621271e-06,
"loss": 1.1636,
"step": 98
},
{
"epoch": 0.3565961278703287,
"grad_norm": 0.3449509830414838,
"learning_rate": 9.99140494157436e-06,
"loss": 1.1575,
"step": 99
},
{
"epoch": 0.360198108959928,
"grad_norm": 0.35962181776653873,
"learning_rate": 9.990133642141359e-06,
"loss": 1.1756,
"step": 100
},
{
"epoch": 0.36380009004952724,
"grad_norm": 0.3255881417609746,
"learning_rate": 9.988774786134235e-06,
"loss": 1.1751,
"step": 101
},
{
"epoch": 0.36740207113912654,
"grad_norm": 0.3466801749265495,
"learning_rate": 9.987328397395389e-06,
"loss": 1.148,
"step": 102
},
{
"epoch": 0.3710040522287258,
"grad_norm": 0.3781154633191771,
"learning_rate": 9.98579450130307e-06,
"loss": 1.1672,
"step": 103
},
{
"epoch": 0.3746060333183251,
"grad_norm": 0.3177289518646908,
"learning_rate": 9.984173124770924e-06,
"loss": 1.1767,
"step": 104
},
{
"epoch": 0.37820801440792434,
"grad_norm": 0.3287531127302142,
"learning_rate": 9.982464296247523e-06,
"loss": 1.1729,
"step": 105
},
{
"epoch": 0.38180999549752365,
"grad_norm": 0.35236574446805197,
"learning_rate": 9.980668045715864e-06,
"loss": 1.162,
"step": 106
},
{
"epoch": 0.3854119765871229,
"grad_norm": 0.32366666300178654,
"learning_rate": 9.978784404692847e-06,
"loss": 1.1541,
"step": 107
},
{
"epoch": 0.3890139576767222,
"grad_norm": 0.3441412737476968,
"learning_rate": 9.97681340622872e-06,
"loss": 1.1483,
"step": 108
},
{
"epoch": 0.3926159387663215,
"grad_norm": 0.3368352565729486,
"learning_rate": 9.974755084906503e-06,
"loss": 1.1587,
"step": 109
},
{
"epoch": 0.39621791985592075,
"grad_norm": 0.3146485277926942,
"learning_rate": 9.972609476841368e-06,
"loss": 1.1603,
"step": 110
},
{
"epoch": 0.39981990094552006,
"grad_norm": 0.34336750676307926,
"learning_rate": 9.970376619680024e-06,
"loss": 1.1793,
"step": 111
},
{
"epoch": 0.4034218820351193,
"grad_norm": 0.3079869001100948,
"learning_rate": 9.968056552600043e-06,
"loss": 1.1601,
"step": 112
},
{
"epoch": 0.4070238631247186,
"grad_norm": 0.3194899482588308,
"learning_rate": 9.965649316309178e-06,
"loss": 1.1931,
"step": 113
},
{
"epoch": 0.41062584421431786,
"grad_norm": 0.31236725178854713,
"learning_rate": 9.963154953044646e-06,
"loss": 1.1157,
"step": 114
},
{
"epoch": 0.41422782530391716,
"grad_norm": 0.3641377454803935,
"learning_rate": 9.960573506572391e-06,
"loss": 1.1257,
"step": 115
},
{
"epoch": 0.4178298063935164,
"grad_norm": 0.3367690084363564,
"learning_rate": 9.957905022186309e-06,
"loss": 1.1332,
"step": 116
},
{
"epoch": 0.4214317874831157,
"grad_norm": 0.3282965716002517,
"learning_rate": 9.955149546707465e-06,
"loss": 1.0959,
"step": 117
},
{
"epoch": 0.425033768572715,
"grad_norm": 0.3764974200013322,
"learning_rate": 9.952307128483257e-06,
"loss": 1.168,
"step": 118
},
{
"epoch": 0.42863574966231427,
"grad_norm": 0.332077880161025,
"learning_rate": 9.94937781738658e-06,
"loss": 1.1847,
"step": 119
},
{
"epoch": 0.4322377307519136,
"grad_norm": 0.3122087138952814,
"learning_rate": 9.946361664814942e-06,
"loss": 1.1214,
"step": 120
},
{
"epoch": 0.4358397118415128,
"grad_norm": 0.321004643708737,
"learning_rate": 9.94325872368957e-06,
"loss": 1.1235,
"step": 121
},
{
"epoch": 0.4394416929311121,
"grad_norm": 0.35397785938333604,
"learning_rate": 9.940069048454478e-06,
"loss": 1.1792,
"step": 122
},
{
"epoch": 0.4430436740207114,
"grad_norm": 0.34751295835336804,
"learning_rate": 9.936792695075502e-06,
"loss": 1.1626,
"step": 123
},
{
"epoch": 0.4466456551103107,
"grad_norm": 0.33334683436011303,
"learning_rate": 9.93342972103934e-06,
"loss": 1.1156,
"step": 124
},
{
"epoch": 0.45024763619990993,
"grad_norm": 0.35572890521109984,
"learning_rate": 9.929980185352525e-06,
"loss": 1.134,
"step": 125
},
{
"epoch": 0.45384961728950923,
"grad_norm": 0.3600252106382079,
"learning_rate": 9.926444148540394e-06,
"loss": 1.1552,
"step": 126
},
{
"epoch": 0.45745159837910854,
"grad_norm": 0.31574099694060664,
"learning_rate": 9.922821672646028e-06,
"loss": 1.1294,
"step": 127
},
{
"epoch": 0.4610535794687078,
"grad_norm": 0.3384836348033959,
"learning_rate": 9.919112821229165e-06,
"loss": 1.1415,
"step": 128
},
{
"epoch": 0.4646555605583071,
"grad_norm": 0.36082038793653404,
"learning_rate": 9.915317659365078e-06,
"loss": 1.1486,
"step": 129
},
{
"epoch": 0.46825754164790634,
"grad_norm": 0.3475974165432403,
"learning_rate": 9.911436253643445e-06,
"loss": 1.1265,
"step": 130
},
{
"epoch": 0.47185952273750564,
"grad_norm": 0.3635545773479418,
"learning_rate": 9.907468672167165e-06,
"loss": 1.1549,
"step": 131
},
{
"epoch": 0.4754615038271049,
"grad_norm": 0.347794452842081,
"learning_rate": 9.903414984551178e-06,
"loss": 1.1461,
"step": 132
},
{
"epoch": 0.4790634849167042,
"grad_norm": 0.32822120698172536,
"learning_rate": 9.899275261921236e-06,
"loss": 1.1649,
"step": 133
},
{
"epoch": 0.48266546600630345,
"grad_norm": 0.29837185314643394,
"learning_rate": 9.89504957691265e-06,
"loss": 1.1571,
"step": 134
},
{
"epoch": 0.48626744709590275,
"grad_norm": 0.38306110575116503,
"learning_rate": 9.890738003669029e-06,
"loss": 1.1252,
"step": 135
},
{
"epoch": 0.48986942818550205,
"grad_norm": 0.34086844442205383,
"learning_rate": 9.886340617840968e-06,
"loss": 1.1382,
"step": 136
},
{
"epoch": 0.4934714092751013,
"grad_norm": 0.30015621390458924,
"learning_rate": 9.881857496584726e-06,
"loss": 1.1275,
"step": 137
},
{
"epoch": 0.4970733903647006,
"grad_norm": 0.3614100125796906,
"learning_rate": 9.877288718560866e-06,
"loss": 1.1161,
"step": 138
},
{
"epoch": 0.5006753714542999,
"grad_norm": 0.3267608562772437,
"learning_rate": 9.872634363932887e-06,
"loss": 1.1316,
"step": 139
},
{
"epoch": 0.5042773525438992,
"grad_norm": 0.30874876519759176,
"learning_rate": 9.867894514365802e-06,
"loss": 1.1485,
"step": 140
},
{
"epoch": 0.5078793336334985,
"grad_norm": 0.3155720475990474,
"learning_rate": 9.863069253024719e-06,
"loss": 1.1481,
"step": 141
},
{
"epoch": 0.5114813147230977,
"grad_norm": 0.33367944966848107,
"learning_rate": 9.85815866457337e-06,
"loss": 1.108,
"step": 142
},
{
"epoch": 0.515083295812697,
"grad_norm": 0.337631206990529,
"learning_rate": 9.853162835172638e-06,
"loss": 1.1292,
"step": 143
},
{
"epoch": 0.5186852769022963,
"grad_norm": 0.353550852505754,
"learning_rate": 9.84808185247903e-06,
"loss": 1.1378,
"step": 144
},
{
"epoch": 0.5222872579918956,
"grad_norm": 0.3545306891033168,
"learning_rate": 9.842915805643156e-06,
"loss": 1.108,
"step": 145
},
{
"epoch": 0.5258892390814949,
"grad_norm": 0.42557248608508313,
"learning_rate": 9.83766478530815e-06,
"loss": 1.1334,
"step": 146
},
{
"epoch": 0.5294912201710941,
"grad_norm": 0.3728030570909934,
"learning_rate": 9.832328883608088e-06,
"loss": 1.1381,
"step": 147
},
{
"epoch": 0.5330932012606934,
"grad_norm": 0.49924286590903955,
"learning_rate": 9.82690819416637e-06,
"loss": 1.097,
"step": 148
},
{
"epoch": 0.5366951823502927,
"grad_norm": 0.3439764618050822,
"learning_rate": 9.821402812094074e-06,
"loss": 1.1577,
"step": 149
},
{
"epoch": 0.540297163439892,
"grad_norm": 0.4159129574954092,
"learning_rate": 9.815812833988292e-06,
"loss": 1.132,
"step": 150
},
{
"epoch": 0.5438991445294912,
"grad_norm": 0.4129107171971605,
"learning_rate": 9.81013835793043e-06,
"loss": 1.1574,
"step": 151
},
{
"epoch": 0.5475011256190905,
"grad_norm": 0.4296938671727486,
"learning_rate": 9.804379483484493e-06,
"loss": 1.1459,
"step": 152
},
{
"epoch": 0.5511031067086898,
"grad_norm": 0.3766243233797345,
"learning_rate": 9.798536311695334e-06,
"loss": 1.1545,
"step": 153
},
{
"epoch": 0.5547050877982891,
"grad_norm": 0.35435312036719,
"learning_rate": 9.79260894508688e-06,
"loss": 1.1171,
"step": 154
},
{
"epoch": 0.5583070688878884,
"grad_norm": 0.37652608806645427,
"learning_rate": 9.786597487660336e-06,
"loss": 1.111,
"step": 155
},
{
"epoch": 0.5619090499774876,
"grad_norm": 0.3453394867906792,
"learning_rate": 9.780502044892363e-06,
"loss": 1.1749,
"step": 156
},
{
"epoch": 0.5655110310670869,
"grad_norm": 0.379890664116539,
"learning_rate": 9.774322723733216e-06,
"loss": 1.1481,
"step": 157
},
{
"epoch": 0.5691130121566862,
"grad_norm": 0.39501637379719035,
"learning_rate": 9.768059632604881e-06,
"loss": 1.1061,
"step": 158
},
{
"epoch": 0.5727149932462855,
"grad_norm": 0.3326705156892084,
"learning_rate": 9.761712881399164e-06,
"loss": 1.1412,
"step": 159
},
{
"epoch": 0.5763169743358847,
"grad_norm": 0.3617516742176726,
"learning_rate": 9.755282581475769e-06,
"loss": 1.1789,
"step": 160
},
{
"epoch": 0.579918955425484,
"grad_norm": 0.39263243654387053,
"learning_rate": 9.748768845660335e-06,
"loss": 1.152,
"step": 161
},
{
"epoch": 0.5835209365150833,
"grad_norm": 0.34780105035332837,
"learning_rate": 9.742171788242468e-06,
"loss": 1.1267,
"step": 162
},
{
"epoch": 0.5871229176046826,
"grad_norm": 0.45976305202766604,
"learning_rate": 9.735491524973723e-06,
"loss": 1.1043,
"step": 163
},
{
"epoch": 0.5907248986942819,
"grad_norm": 0.3604888952284618,
"learning_rate": 9.728728173065584e-06,
"loss": 1.1105,
"step": 164
},
{
"epoch": 0.5943268797838811,
"grad_norm": 0.4168296170026391,
"learning_rate": 9.721881851187406e-06,
"loss": 1.1362,
"step": 165
},
{
"epoch": 0.5979288608734804,
"grad_norm": 0.3757592230549266,
"learning_rate": 9.714952679464324e-06,
"loss": 1.1405,
"step": 166
},
{
"epoch": 0.6015308419630797,
"grad_norm": 0.40888701962345614,
"learning_rate": 9.707940779475151e-06,
"loss": 1.0968,
"step": 167
},
{
"epoch": 0.605132823052679,
"grad_norm": 0.36126726128985687,
"learning_rate": 9.700846274250252e-06,
"loss": 1.1197,
"step": 168
},
{
"epoch": 0.6087348041422782,
"grad_norm": 0.3792632202992676,
"learning_rate": 9.693669288269371e-06,
"loss": 1.1129,
"step": 169
},
{
"epoch": 0.6123367852318775,
"grad_norm": 0.4094377957060575,
"learning_rate": 9.68640994745946e-06,
"loss": 1.1235,
"step": 170
},
{
"epoch": 0.6159387663214768,
"grad_norm": 0.39486664256207166,
"learning_rate": 9.679068379192455e-06,
"loss": 1.1189,
"step": 171
},
{
"epoch": 0.6195407474110761,
"grad_norm": 0.37256294824637853,
"learning_rate": 9.671644712283061e-06,
"loss": 1.0951,
"step": 172
},
{
"epoch": 0.6231427285006754,
"grad_norm": 0.3627397815654964,
"learning_rate": 9.664139076986473e-06,
"loss": 1.1321,
"step": 173
},
{
"epoch": 0.6267447095902746,
"grad_norm": 0.4248545009770537,
"learning_rate": 9.656551604996102e-06,
"loss": 1.1338,
"step": 174
},
{
"epoch": 0.6303466906798739,
"grad_norm": 0.3492982058791957,
"learning_rate": 9.648882429441258e-06,
"loss": 1.1383,
"step": 175
},
{
"epoch": 0.6339486717694732,
"grad_norm": 0.37748320014327436,
"learning_rate": 9.641131684884817e-06,
"loss": 1.1316,
"step": 176
},
{
"epoch": 0.6375506528590725,
"grad_norm": 0.35682755754679785,
"learning_rate": 9.633299507320862e-06,
"loss": 1.09,
"step": 177
},
{
"epoch": 0.6411526339486717,
"grad_norm": 0.34106929576049877,
"learning_rate": 9.62538603417229e-06,
"loss": 1.136,
"step": 178
},
{
"epoch": 0.644754615038271,
"grad_norm": 0.34746889086894356,
"learning_rate": 9.617391404288412e-06,
"loss": 1.0943,
"step": 179
},
{
"epoch": 0.6483565961278703,
"grad_norm": 0.36723025633050504,
"learning_rate": 9.609315757942504e-06,
"loss": 1.182,
"step": 180
},
{
"epoch": 0.6519585772174696,
"grad_norm": 0.33451032057782015,
"learning_rate": 9.601159236829353e-06,
"loss": 1.1351,
"step": 181
},
{
"epoch": 0.6555605583070689,
"grad_norm": 0.33248672869162116,
"learning_rate": 9.592921984062771e-06,
"loss": 1.1187,
"step": 182
},
{
"epoch": 0.6591625393966681,
"grad_norm": 0.3331636644805929,
"learning_rate": 9.584604144173084e-06,
"loss": 1.1009,
"step": 183
},
{
"epoch": 0.6627645204862674,
"grad_norm": 0.3268063515189842,
"learning_rate": 9.576205863104588e-06,
"loss": 1.1048,
"step": 184
},
{
"epoch": 0.6663665015758667,
"grad_norm": 0.3651627841761447,
"learning_rate": 9.567727288213005e-06,
"loss": 1.1534,
"step": 185
},
{
"epoch": 0.669968482665466,
"grad_norm": 0.3347297887818968,
"learning_rate": 9.55916856826288e-06,
"loss": 1.1567,
"step": 186
},
{
"epoch": 0.6735704637550652,
"grad_norm": 0.3615161248575356,
"learning_rate": 9.550529853424979e-06,
"loss": 1.1278,
"step": 187
},
{
"epoch": 0.6771724448446645,
"grad_norm": 0.3713014349141257,
"learning_rate": 9.541811295273657e-06,
"loss": 1.1101,
"step": 188
},
{
"epoch": 0.6807744259342638,
"grad_norm": 0.34004372649926684,
"learning_rate": 9.53301304678419e-06,
"loss": 1.1276,
"step": 189
},
{
"epoch": 0.6843764070238632,
"grad_norm": 0.35758706995009787,
"learning_rate": 9.524135262330098e-06,
"loss": 1.1109,
"step": 190
},
{
"epoch": 0.6879783881134625,
"grad_norm": 0.3220236515093335,
"learning_rate": 9.515178097680437e-06,
"loss": 1.119,
"step": 191
},
{
"epoch": 0.6915803692030617,
"grad_norm": 0.35195981649297625,
"learning_rate": 9.506141709997058e-06,
"loss": 1.0968,
"step": 192
},
{
"epoch": 0.695182350292661,
"grad_norm": 0.3960324035518941,
"learning_rate": 9.497026257831856e-06,
"loss": 1.1396,
"step": 193
},
{
"epoch": 0.6987843313822603,
"grad_norm": 0.36557327721045585,
"learning_rate": 9.487831901123989e-06,
"loss": 1.1238,
"step": 194
},
{
"epoch": 0.7023863124718596,
"grad_norm": 0.3343162963854113,
"learning_rate": 9.478558801197065e-06,
"loss": 1.1293,
"step": 195
},
{
"epoch": 0.7059882935614588,
"grad_norm": 0.40726069321694147,
"learning_rate": 9.46920712075632e-06,
"loss": 1.1103,
"step": 196
},
{
"epoch": 0.7095902746510581,
"grad_norm": 0.3381887572157023,
"learning_rate": 9.459777023885754e-06,
"loss": 1.0944,
"step": 197
},
{
"epoch": 0.7131922557406574,
"grad_norm": 0.3733440854292112,
"learning_rate": 9.450268676045261e-06,
"loss": 1.1909,
"step": 198
},
{
"epoch": 0.7167942368302567,
"grad_norm": 0.3755723398224028,
"learning_rate": 9.440682244067724e-06,
"loss": 1.0909,
"step": 199
},
{
"epoch": 0.720396217919856,
"grad_norm": 0.38426065259470316,
"learning_rate": 9.431017896156074e-06,
"loss": 1.136,
"step": 200
},
{
"epoch": 0.7239981990094552,
"grad_norm": 0.379071589505192,
"learning_rate": 9.421275801880363e-06,
"loss": 1.1121,
"step": 201
},
{
"epoch": 0.7276001800990545,
"grad_norm": 0.34461539082999465,
"learning_rate": 9.411456132174768e-06,
"loss": 1.0988,
"step": 202
},
{
"epoch": 0.7312021611886538,
"grad_norm": 0.3654979888018069,
"learning_rate": 9.401559059334601e-06,
"loss": 1.1238,
"step": 203
},
{
"epoch": 0.7348041422782531,
"grad_norm": 0.34977548124519137,
"learning_rate": 9.39158475701329e-06,
"loss": 1.11,
"step": 204
},
{
"epoch": 0.7384061233678523,
"grad_norm": 0.3738440708255404,
"learning_rate": 9.381533400219319e-06,
"loss": 1.1023,
"step": 205
},
{
"epoch": 0.7420081044574516,
"grad_norm": 0.34015234663312666,
"learning_rate": 9.371405165313169e-06,
"loss": 1.1271,
"step": 206
},
{
"epoch": 0.7456100855470509,
"grad_norm": 0.344237085861616,
"learning_rate": 9.361200230004219e-06,
"loss": 1.1192,
"step": 207
},
{
"epoch": 0.7492120666366502,
"grad_norm": 0.3485062946685098,
"learning_rate": 9.35091877334763e-06,
"loss": 1.147,
"step": 208
},
{
"epoch": 0.7528140477262495,
"grad_norm": 0.3458232266553943,
"learning_rate": 9.340560975741198e-06,
"loss": 1.15,
"step": 209
},
{
"epoch": 0.7564160288158487,
"grad_norm": 0.338158493876411,
"learning_rate": 9.330127018922195e-06,
"loss": 1.1367,
"step": 210
},
{
"epoch": 0.760018009905448,
"grad_norm": 0.34361214214695057,
"learning_rate": 9.319617085964177e-06,
"loss": 1.0956,
"step": 211
},
{
"epoch": 0.7636199909950473,
"grad_norm": 0.3321785776797715,
"learning_rate": 9.309031361273775e-06,
"loss": 1.138,
"step": 212
},
{
"epoch": 0.7672219720846466,
"grad_norm": 0.36852431928120577,
"learning_rate": 9.298370030587456e-06,
"loss": 1.1271,
"step": 213
},
{
"epoch": 0.7708239531742458,
"grad_norm": 0.36986638396232147,
"learning_rate": 9.287633280968263e-06,
"loss": 1.112,
"step": 214
},
{
"epoch": 0.7744259342638451,
"grad_norm": 0.32561497084045427,
"learning_rate": 9.276821300802535e-06,
"loss": 1.08,
"step": 215
},
{
"epoch": 0.7780279153534444,
"grad_norm": 0.3746970517644171,
"learning_rate": 9.265934279796602e-06,
"loss": 1.1136,
"step": 216
},
{
"epoch": 0.7816298964430437,
"grad_norm": 0.4173073842610304,
"learning_rate": 9.25497240897346e-06,
"loss": 1.1273,
"step": 217
},
{
"epoch": 0.785231877532643,
"grad_norm": 0.37809222920052,
"learning_rate": 9.24393588066941e-06,
"loss": 1.0955,
"step": 218
},
{
"epoch": 0.7888338586222422,
"grad_norm": 0.35280564760591715,
"learning_rate": 9.232824888530689e-06,
"loss": 1.1037,
"step": 219
},
{
"epoch": 0.7924358397118415,
"grad_norm": 0.4219603934746488,
"learning_rate": 9.221639627510076e-06,
"loss": 1.1389,
"step": 220
},
{
"epoch": 0.7960378208014408,
"grad_norm": 0.34392171607237565,
"learning_rate": 9.210380293863462e-06,
"loss": 1.1329,
"step": 221
},
{
"epoch": 0.7996398018910401,
"grad_norm": 0.3553225127256251,
"learning_rate": 9.199047085146415e-06,
"loss": 1.0945,
"step": 222
},
{
"epoch": 0.8032417829806393,
"grad_norm": 0.3492982055796588,
"learning_rate": 9.18764020021071e-06,
"loss": 1.1537,
"step": 223
},
{
"epoch": 0.8068437640702386,
"grad_norm": 0.329347516548987,
"learning_rate": 9.176159839200838e-06,
"loss": 1.0952,
"step": 224
},
{
"epoch": 0.8104457451598379,
"grad_norm": 0.3256059168725448,
"learning_rate": 9.164606203550498e-06,
"loss": 1.1352,
"step": 225
},
{
"epoch": 0.8140477262494372,
"grad_norm": 0.40382790613119113,
"learning_rate": 9.152979495979064e-06,
"loss": 1.1412,
"step": 226
},
{
"epoch": 0.8176497073390365,
"grad_norm": 0.3575693980124408,
"learning_rate": 9.141279920488021e-06,
"loss": 1.1295,
"step": 227
},
{
"epoch": 0.8212516884286357,
"grad_norm": 0.3488694495678355,
"learning_rate": 9.129507682357393e-06,
"loss": 1.0832,
"step": 228
},
{
"epoch": 0.824853669518235,
"grad_norm": 0.3539230531756584,
"learning_rate": 9.117662988142138e-06,
"loss": 1.1281,
"step": 229
},
{
"epoch": 0.8284556506078343,
"grad_norm": 0.38111067495290035,
"learning_rate": 9.10574604566852e-06,
"loss": 1.1212,
"step": 230
},
{
"epoch": 0.8320576316974336,
"grad_norm": 0.3883299352729627,
"learning_rate": 9.093757064030473e-06,
"loss": 1.138,
"step": 231
},
{
"epoch": 0.8356596127870328,
"grad_norm": 0.32925848731485013,
"learning_rate": 9.08169625358592e-06,
"loss": 1.1304,
"step": 232
},
{
"epoch": 0.8392615938766321,
"grad_norm": 0.4180598977166807,
"learning_rate": 9.069563825953092e-06,
"loss": 1.1038,
"step": 233
},
{
"epoch": 0.8428635749662314,
"grad_norm": 0.34456574190208067,
"learning_rate": 9.057359994006806e-06,
"loss": 1.0855,
"step": 234
},
{
"epoch": 0.8464655560558307,
"grad_norm": 0.39699049033870243,
"learning_rate": 9.045084971874738e-06,
"loss": 1.1254,
"step": 235
},
{
"epoch": 0.85006753714543,
"grad_norm": 0.40763691295552823,
"learning_rate": 9.032738974933663e-06,
"loss": 1.0794,
"step": 236
},
{
"epoch": 0.8536695182350292,
"grad_norm": 0.36236914147842597,
"learning_rate": 9.020322219805674e-06,
"loss": 1.1203,
"step": 237
},
{
"epoch": 0.8572714993246285,
"grad_norm": 0.3833655304553943,
"learning_rate": 9.007834924354384e-06,
"loss": 1.1262,
"step": 238
},
{
"epoch": 0.8608734804142278,
"grad_norm": 0.37172135259336675,
"learning_rate": 8.9952773076811e-06,
"loss": 1.0626,
"step": 239
},
{
"epoch": 0.8644754615038271,
"grad_norm": 0.3334721033923123,
"learning_rate": 8.982649590120982e-06,
"loss": 1.0791,
"step": 240
},
{
"epoch": 0.8680774425934263,
"grad_norm": 0.34086631317663335,
"learning_rate": 8.969951993239177e-06,
"loss": 1.1154,
"step": 241
},
{
"epoch": 0.8716794236830256,
"grad_norm": 0.3371727774416591,
"learning_rate": 8.957184739826929e-06,
"loss": 1.1387,
"step": 242
},
{
"epoch": 0.875281404772625,
"grad_norm": 0.3633470123731805,
"learning_rate": 8.944348053897672e-06,
"loss": 1.1505,
"step": 243
},
{
"epoch": 0.8788833858622243,
"grad_norm": 0.3695807933678788,
"learning_rate": 8.931442160683094e-06,
"loss": 1.1351,
"step": 244
},
{
"epoch": 0.8824853669518236,
"grad_norm": 0.38189837725904396,
"learning_rate": 8.9184672866292e-06,
"loss": 1.099,
"step": 245
},
{
"epoch": 0.8860873480414228,
"grad_norm": 0.38223867232950853,
"learning_rate": 8.905423659392316e-06,
"loss": 1.0558,
"step": 246
},
{
"epoch": 0.889689329131022,
"grad_norm": 0.33464001960015227,
"learning_rate": 8.892311507835118e-06,
"loss": 1.1106,
"step": 247
},
{
"epoch": 0.8932913102206214,
"grad_norm": 0.3500335015064184,
"learning_rate": 8.879131062022598e-06,
"loss": 1.0943,
"step": 248
},
{
"epoch": 0.8968932913102207,
"grad_norm": 0.38615622184380055,
"learning_rate": 8.865882553218036e-06,
"loss": 1.1362,
"step": 249
},
{
"epoch": 0.9004952723998199,
"grad_norm": 0.3549431372544129,
"learning_rate": 8.852566213878947e-06,
"loss": 1.1453,
"step": 250
},
{
"epoch": 0.9040972534894192,
"grad_norm": 0.3324763448668655,
"learning_rate": 8.83918227765299e-06,
"loss": 1.1157,
"step": 251
},
{
"epoch": 0.9076992345790185,
"grad_norm": 0.35594427834572845,
"learning_rate": 8.825730979373873e-06,
"loss": 1.1095,
"step": 252
},
{
"epoch": 0.9113012156686178,
"grad_norm": 0.34567215550933944,
"learning_rate": 8.81221255505724e-06,
"loss": 1.1086,
"step": 253
},
{
"epoch": 0.9149031967582171,
"grad_norm": 0.3653541234957228,
"learning_rate": 8.798627241896524e-06,
"loss": 1.0936,
"step": 254
},
{
"epoch": 0.9185051778478163,
"grad_norm": 0.3795042606149247,
"learning_rate": 8.784975278258783e-06,
"loss": 1.1185,
"step": 255
},
{
"epoch": 0.9221071589374156,
"grad_norm": 0.3660926060585707,
"learning_rate": 8.77125690368052e-06,
"loss": 1.1117,
"step": 256
},
{
"epoch": 0.9257091400270149,
"grad_norm": 0.377064805408519,
"learning_rate": 8.757472358863481e-06,
"loss": 1.1265,
"step": 257
},
{
"epoch": 0.9293111211166142,
"grad_norm": 0.36507443565402037,
"learning_rate": 8.743621885670431e-06,
"loss": 1.1493,
"step": 258
},
{
"epoch": 0.9329131022062134,
"grad_norm": 0.36199439427145674,
"learning_rate": 8.729705727120911e-06,
"loss": 1.0902,
"step": 259
},
{
"epoch": 0.9365150832958127,
"grad_norm": 0.365398128659048,
"learning_rate": 8.715724127386971e-06,
"loss": 1.1199,
"step": 260
},
{
"epoch": 0.940117064385412,
"grad_norm": 0.3597974956274622,
"learning_rate": 8.701677331788891e-06,
"loss": 1.1349,
"step": 261
},
{
"epoch": 0.9437190454750113,
"grad_norm": 0.34565112893965727,
"learning_rate": 8.68756558679087e-06,
"loss": 1.1093,
"step": 262
},
{
"epoch": 0.9473210265646106,
"grad_norm": 0.35741324005523417,
"learning_rate": 8.673389139996708e-06,
"loss": 1.0965,
"step": 263
},
{
"epoch": 0.9509230076542098,
"grad_norm": 0.32598922479655423,
"learning_rate": 8.659148240145456e-06,
"loss": 1.105,
"step": 264
},
{
"epoch": 0.9545249887438091,
"grad_norm": 0.3329117672496498,
"learning_rate": 8.644843137107058e-06,
"loss": 1.0749,
"step": 265
},
{
"epoch": 0.9581269698334084,
"grad_norm": 0.3937476383650613,
"learning_rate": 8.630474081877959e-06,
"loss": 1.1018,
"step": 266
},
{
"epoch": 0.9617289509230077,
"grad_norm": 0.31471194111230605,
"learning_rate": 8.616041326576711e-06,
"loss": 1.1058,
"step": 267
},
{
"epoch": 0.9653309320126069,
"grad_norm": 0.36368215737547877,
"learning_rate": 8.601545124439535e-06,
"loss": 1.1285,
"step": 268
},
{
"epoch": 0.9689329131022062,
"grad_norm": 0.3473607661443015,
"learning_rate": 8.586985729815895e-06,
"loss": 1.1326,
"step": 269
},
{
"epoch": 0.9725348941918055,
"grad_norm": 0.377774377998179,
"learning_rate": 8.572363398164017e-06,
"loss": 1.1253,
"step": 270
},
{
"epoch": 0.9761368752814048,
"grad_norm": 0.3611382706567514,
"learning_rate": 8.557678386046429e-06,
"loss": 1.1152,
"step": 271
},
{
"epoch": 0.9797388563710041,
"grad_norm": 0.360467222912271,
"learning_rate": 8.542930951125432e-06,
"loss": 1.0612,
"step": 272
},
{
"epoch": 0.9833408374606033,
"grad_norm": 0.36148714655712805,
"learning_rate": 8.528121352158604e-06,
"loss": 1.1254,
"step": 273
},
{
"epoch": 0.9869428185502026,
"grad_norm": 0.4620821380277117,
"learning_rate": 8.513249848994248e-06,
"loss": 1.1151,
"step": 274
},
{
"epoch": 0.9905447996398019,
"grad_norm": 0.3699494680308604,
"learning_rate": 8.498316702566828e-06,
"loss": 1.1331,
"step": 275
},
{
"epoch": 0.9941467807294012,
"grad_norm": 0.3944738632436142,
"learning_rate": 8.483322174892404e-06,
"loss": 1.1218,
"step": 276
},
{
"epoch": 0.9977487618190004,
"grad_norm": 0.37443116561016265,
"learning_rate": 8.468266529064025e-06,
"loss": 1.0858,
"step": 277
},
{
"epoch": 1.0,
"grad_norm": 0.37443116561016265,
"learning_rate": 8.453150029247115e-06,
"loss": 1.1388,
"step": 278
},
{
"epoch": 1.0036019810895993,
"grad_norm": 0.5412674452239241,
"learning_rate": 8.437972940674838e-06,
"loss": 1.0955,
"step": 279
},
{
"epoch": 1.0072039621791986,
"grad_norm": 0.3606994742407109,
"learning_rate": 8.422735529643445e-06,
"loss": 1.0508,
"step": 280
},
{
"epoch": 1.010805943268798,
"grad_norm": 0.500891921192956,
"learning_rate": 8.4074380635076e-06,
"loss": 1.0681,
"step": 281
},
{
"epoch": 1.0144079243583972,
"grad_norm": 0.3250502662600929,
"learning_rate": 8.392080810675692e-06,
"loss": 1.0734,
"step": 282
},
{
"epoch": 1.0180099054479963,
"grad_norm": 0.4496303032317973,
"learning_rate": 8.376664040605122e-06,
"loss": 1.0971,
"step": 283
},
{
"epoch": 1.0216118865375956,
"grad_norm": 0.3582282301299312,
"learning_rate": 8.361188023797581e-06,
"loss": 1.1034,
"step": 284
},
{
"epoch": 1.025213867627195,
"grad_norm": 0.4887190983762863,
"learning_rate": 8.345653031794292e-06,
"loss": 1.0937,
"step": 285
},
{
"epoch": 1.0288158487167942,
"grad_norm": 0.419655234473874,
"learning_rate": 8.33005933717126e-06,
"loss": 1.0917,
"step": 286
},
{
"epoch": 1.0324178298063935,
"grad_norm": 0.4289753872257514,
"learning_rate": 8.314407213534477e-06,
"loss": 1.0846,
"step": 287
},
{
"epoch": 1.0360198108959928,
"grad_norm": 0.3868135415082272,
"learning_rate": 8.298696935515132e-06,
"loss": 1.0901,
"step": 288
},
{
"epoch": 1.0396217919855921,
"grad_norm": 0.39062885945304754,
"learning_rate": 8.282928778764783e-06,
"loss": 1.1005,
"step": 289
},
{
"epoch": 1.0432237730751914,
"grad_norm": 0.3750887902646042,
"learning_rate": 8.267103019950529e-06,
"loss": 1.092,
"step": 290
},
{
"epoch": 1.0468257541647907,
"grad_norm": 0.3944074092851082,
"learning_rate": 8.251219936750145e-06,
"loss": 1.0559,
"step": 291
},
{
"epoch": 1.0504277352543898,
"grad_norm": 0.442523607996674,
"learning_rate": 8.235279807847223e-06,
"loss": 1.0879,
"step": 292
},
{
"epoch": 1.0540297163439891,
"grad_norm": 0.400701104226115,
"learning_rate": 8.21928291292627e-06,
"loss": 1.0761,
"step": 293
},
{
"epoch": 1.0576316974335884,
"grad_norm": 0.3929872862213915,
"learning_rate": 8.203229532667808e-06,
"loss": 1.1122,
"step": 294
},
{
"epoch": 1.0612336785231877,
"grad_norm": 0.4802600223498965,
"learning_rate": 8.18711994874345e-06,
"loss": 1.0431,
"step": 295
},
{
"epoch": 1.064835659612787,
"grad_norm": 0.3896837748940082,
"learning_rate": 8.170954443810947e-06,
"loss": 1.0706,
"step": 296
},
{
"epoch": 1.0684376407023863,
"grad_norm": 0.46461703487742634,
"learning_rate": 8.154733301509249e-06,
"loss": 1.1189,
"step": 297
},
{
"epoch": 1.0720396217919856,
"grad_norm": 0.37467583421393963,
"learning_rate": 8.138456806453503e-06,
"loss": 1.0592,
"step": 298
},
{
"epoch": 1.075641602881585,
"grad_norm": 0.4532029044825418,
"learning_rate": 8.12212524423008e-06,
"loss": 1.0787,
"step": 299
},
{
"epoch": 1.0792435839711843,
"grad_norm": 0.3732477898818737,
"learning_rate": 8.105738901391553e-06,
"loss": 1.0592,
"step": 300
},
{
"epoch": 1.0828455650607833,
"grad_norm": 0.39110742601041953,
"learning_rate": 8.089298065451673e-06,
"loss": 1.0412,
"step": 301
},
{
"epoch": 1.0864475461503826,
"grad_norm": 0.4819959554895645,
"learning_rate": 8.072803024880322e-06,
"loss": 1.1164,
"step": 302
},
{
"epoch": 1.090049527239982,
"grad_norm": 0.40121766412624266,
"learning_rate": 8.05625406909846e-06,
"loss": 1.1179,
"step": 303
},
{
"epoch": 1.0936515083295812,
"grad_norm": 0.43991989176939206,
"learning_rate": 8.039651488473028e-06,
"loss": 1.0665,
"step": 304
},
{
"epoch": 1.0972534894191805,
"grad_norm": 0.3655171144729394,
"learning_rate": 8.022995574311876e-06,
"loss": 1.0892,
"step": 305
},
{
"epoch": 1.1008554705087799,
"grad_norm": 0.33944440562879685,
"learning_rate": 8.006286618858634e-06,
"loss": 1.0412,
"step": 306
},
{
"epoch": 1.1044574515983792,
"grad_norm": 0.4079616754346733,
"learning_rate": 7.989524915287595e-06,
"loss": 1.0643,
"step": 307
},
{
"epoch": 1.1080594326879785,
"grad_norm": 0.345780124558647,
"learning_rate": 7.972710757698567e-06,
"loss": 1.0982,
"step": 308
},
{
"epoch": 1.1116614137775778,
"grad_norm": 0.42242099900664976,
"learning_rate": 7.95584444111171e-06,
"loss": 1.1034,
"step": 309
},
{
"epoch": 1.1152633948671768,
"grad_norm": 0.4005977051798865,
"learning_rate": 7.938926261462366e-06,
"loss": 1.0703,
"step": 310
},
{
"epoch": 1.1188653759567762,
"grad_norm": 0.36549179047552754,
"learning_rate": 7.921956515595861e-06,
"loss": 1.1015,
"step": 311
},
{
"epoch": 1.1224673570463755,
"grad_norm": 0.4262110579462978,
"learning_rate": 7.904935501262301e-06,
"loss": 1.0648,
"step": 312
},
{
"epoch": 1.1260693381359748,
"grad_norm": 0.37185963973126124,
"learning_rate": 7.887863517111337e-06,
"loss": 1.1019,
"step": 313
},
{
"epoch": 1.129671319225574,
"grad_norm": 0.34072768994255465,
"learning_rate": 7.87074086268695e-06,
"loss": 1.049,
"step": 314
},
{
"epoch": 1.1332733003151734,
"grad_norm": 0.38829026634405583,
"learning_rate": 7.85356783842216e-06,
"loss": 1.099,
"step": 315
},
{
"epoch": 1.1368752814047727,
"grad_norm": 0.3454891333781692,
"learning_rate": 7.836344745633785e-06,
"loss": 1.0896,
"step": 316
},
{
"epoch": 1.140477262494372,
"grad_norm": 0.40407073745828537,
"learning_rate": 7.819071886517134e-06,
"loss": 1.0885,
"step": 317
},
{
"epoch": 1.1440792435839713,
"grad_norm": 0.40693031599207263,
"learning_rate": 7.801749564140724e-06,
"loss": 1.0621,
"step": 318
},
{
"epoch": 1.1476812246735704,
"grad_norm": 0.41527967835091,
"learning_rate": 7.78437808244094e-06,
"loss": 1.0661,
"step": 319
},
{
"epoch": 1.1512832057631697,
"grad_norm": 0.32461739079088076,
"learning_rate": 7.76695774621672e-06,
"loss": 1.0738,
"step": 320
},
{
"epoch": 1.154885186852769,
"grad_norm": 0.369117935448055,
"learning_rate": 7.7494888611242e-06,
"loss": 1.0507,
"step": 321
},
{
"epoch": 1.1584871679423683,
"grad_norm": 0.40286832831280445,
"learning_rate": 7.731971733671347e-06,
"loss": 1.0115,
"step": 322
},
{
"epoch": 1.1620891490319676,
"grad_norm": 0.3562069260238656,
"learning_rate": 7.714406671212589e-06,
"loss": 1.0678,
"step": 323
},
{
"epoch": 1.1656911301215669,
"grad_norm": 0.4156873725872057,
"learning_rate": 7.696793981943418e-06,
"loss": 1.0846,
"step": 324
},
{
"epoch": 1.1692931112111662,
"grad_norm": 0.40288662850406076,
"learning_rate": 7.679133974894984e-06,
"loss": 1.0544,
"step": 325
},
{
"epoch": 1.1728950923007655,
"grad_norm": 0.3739155064577844,
"learning_rate": 7.66142695992867e-06,
"loss": 1.0737,
"step": 326
},
{
"epoch": 1.1764970733903648,
"grad_norm": 0.4406680015338461,
"learning_rate": 7.64367324773066e-06,
"loss": 1.1106,
"step": 327
},
{
"epoch": 1.1800990544799639,
"grad_norm": 0.34345904522451653,
"learning_rate": 7.6258731498064796e-06,
"loss": 1.0759,
"step": 328
},
{
"epoch": 1.1837010355695632,
"grad_norm": 0.4365590987719929,
"learning_rate": 7.6080269784755405e-06,
"loss": 1.0738,
"step": 329
},
{
"epoch": 1.1873030166591625,
"grad_norm": 0.3633076504956387,
"learning_rate": 7.590135046865652e-06,
"loss": 1.1089,
"step": 330
},
{
"epoch": 1.1909049977487618,
"grad_norm": 0.395068007524633,
"learning_rate": 7.572197668907533e-06,
"loss": 1.1244,
"step": 331
},
{
"epoch": 1.194506978838361,
"grad_norm": 0.39729351127387735,
"learning_rate": 7.5542151593293e-06,
"loss": 1.0889,
"step": 332
},
{
"epoch": 1.1981089599279604,
"grad_norm": 2.5701237358423255,
"learning_rate": 7.536187833650947e-06,
"loss": 1.1059,
"step": 333
},
{
"epoch": 1.2017109410175597,
"grad_norm": 0.47480486252921483,
"learning_rate": 7.518116008178805e-06,
"loss": 1.0482,
"step": 334
},
{
"epoch": 1.205312922107159,
"grad_norm": 0.40028253752269954,
"learning_rate": 7.500000000000001e-06,
"loss": 1.0897,
"step": 335
},
{
"epoch": 1.2089149031967583,
"grad_norm": 0.45760958914786226,
"learning_rate": 7.481840126976885e-06,
"loss": 1.0907,
"step": 336
},
{
"epoch": 1.2125168842863574,
"grad_norm": 0.40820185657609664,
"learning_rate": 7.463636707741458e-06,
"loss": 1.0526,
"step": 337
},
{
"epoch": 1.2161188653759567,
"grad_norm": 0.4393002377107819,
"learning_rate": 7.445390061689782e-06,
"loss": 1.1063,
"step": 338
},
{
"epoch": 1.219720846465556,
"grad_norm": 0.3828530976851444,
"learning_rate": 7.42710050897637e-06,
"loss": 1.0735,
"step": 339
},
{
"epoch": 1.2233228275551553,
"grad_norm": 0.43353384279187324,
"learning_rate": 7.408768370508577e-06,
"loss": 1.0893,
"step": 340
},
{
"epoch": 1.2269248086447546,
"grad_norm": 0.3888399760706739,
"learning_rate": 7.390393967940962e-06,
"loss": 1.0666,
"step": 341
},
{
"epoch": 1.230526789734354,
"grad_norm": 0.4143338997564131,
"learning_rate": 7.371977623669646e-06,
"loss": 1.1293,
"step": 342
},
{
"epoch": 1.2341287708239532,
"grad_norm": 0.32879022172398614,
"learning_rate": 7.353519660826665e-06,
"loss": 1.0879,
"step": 343
},
{
"epoch": 1.2377307519135525,
"grad_norm": 0.357811781692411,
"learning_rate": 7.335020403274277e-06,
"loss": 1.0792,
"step": 344
},
{
"epoch": 1.2413327330031518,
"grad_norm": 0.37966132927138474,
"learning_rate": 7.31648017559931e-06,
"loss": 1.0718,
"step": 345
},
{
"epoch": 1.244934714092751,
"grad_norm": 0.3968902781929655,
"learning_rate": 7.297899303107441e-06,
"loss": 1.0676,
"step": 346
},
{
"epoch": 1.2485366951823502,
"grad_norm": 0.35750082909244935,
"learning_rate": 7.279278111817502e-06,
"loss": 1.0227,
"step": 347
},
{
"epoch": 1.2521386762719495,
"grad_norm": 0.42773056447698266,
"learning_rate": 7.260616928455754e-06,
"loss": 1.0496,
"step": 348
},
{
"epoch": 1.2557406573615488,
"grad_norm": 0.40151554123482264,
"learning_rate": 7.241916080450163e-06,
"loss": 1.0696,
"step": 349
},
{
"epoch": 1.2593426384511481,
"grad_norm": 0.4131771160185376,
"learning_rate": 7.223175895924638e-06,
"loss": 1.0915,
"step": 350
},
{
"epoch": 1.2629446195407474,
"grad_norm": 0.4106261415779341,
"learning_rate": 7.2043967036932935e-06,
"loss": 1.0328,
"step": 351
},
{
"epoch": 1.2665466006303467,
"grad_norm": 0.34879774503343425,
"learning_rate": 7.185578833254665e-06,
"loss": 1.057,
"step": 352
},
{
"epoch": 1.270148581719946,
"grad_norm": 0.3836853786701187,
"learning_rate": 7.166722614785937e-06,
"loss": 1.0754,
"step": 353
},
{
"epoch": 1.2737505628095454,
"grad_norm": 0.3800741116171829,
"learning_rate": 7.1478283791371415e-06,
"loss": 1.0841,
"step": 354
},
{
"epoch": 1.2773525438991444,
"grad_norm": 0.34355233879245345,
"learning_rate": 7.128896457825364e-06,
"loss": 1.0632,
"step": 355
},
{
"epoch": 1.2809545249887437,
"grad_norm": 0.42485239689947135,
"learning_rate": 7.1099271830289155e-06,
"loss": 1.0985,
"step": 356
},
{
"epoch": 1.284556506078343,
"grad_norm": 0.4215135642816349,
"learning_rate": 7.090920887581507e-06,
"loss": 1.0707,
"step": 357
},
{
"epoch": 1.2881584871679423,
"grad_norm": 0.35779792395285226,
"learning_rate": 7.071877904966422e-06,
"loss": 1.0815,
"step": 358
},
{
"epoch": 1.2917604682575416,
"grad_norm": 0.43513590600966623,
"learning_rate": 7.052798569310641e-06,
"loss": 1.1024,
"step": 359
},
{
"epoch": 1.295362449347141,
"grad_norm": 0.3709332183690677,
"learning_rate": 7.033683215379002e-06,
"loss": 1.0788,
"step": 360
},
{
"epoch": 1.2989644304367403,
"grad_norm": 0.3951949031993291,
"learning_rate": 7.014532178568314e-06,
"loss": 1.05,
"step": 361
},
{
"epoch": 1.3025664115263396,
"grad_norm": 0.33987216437327716,
"learning_rate": 6.995345794901477e-06,
"loss": 1.0697,
"step": 362
},
{
"epoch": 1.3061683926159389,
"grad_norm": 0.3780771954200527,
"learning_rate": 6.976124401021583e-06,
"loss": 1.0729,
"step": 363
},
{
"epoch": 1.309770373705538,
"grad_norm": 0.3908638367202069,
"learning_rate": 6.9568683341860135e-06,
"loss": 1.0328,
"step": 364
},
{
"epoch": 1.3133723547951373,
"grad_norm": 0.38289027631075667,
"learning_rate": 6.9375779322605154e-06,
"loss": 1.0928,
"step": 365
},
{
"epoch": 1.3169743358847366,
"grad_norm": 0.3614859878801377,
"learning_rate": 6.9182535337132824e-06,
"loss": 1.0756,
"step": 366
},
{
"epoch": 1.3205763169743359,
"grad_norm": 0.3598395419280253,
"learning_rate": 6.898895477609007e-06,
"loss": 1.103,
"step": 367
},
{
"epoch": 1.3241782980639352,
"grad_norm": 0.37455771512012165,
"learning_rate": 6.879504103602934e-06,
"loss": 1.0758,
"step": 368
},
{
"epoch": 1.3277802791535345,
"grad_norm": 0.40574624953367006,
"learning_rate": 6.860079751934908e-06,
"loss": 1.0794,
"step": 369
},
{
"epoch": 1.3313822602431338,
"grad_norm": 0.3725008186416374,
"learning_rate": 6.840622763423391e-06,
"loss": 1.0756,
"step": 370
},
{
"epoch": 1.334984241332733,
"grad_norm": 0.3646005212300081,
"learning_rate": 6.821133479459492e-06,
"loss": 1.0959,
"step": 371
},
{
"epoch": 1.3385862224223324,
"grad_norm": 0.4113247656948934,
"learning_rate": 6.8016122420009745e-06,
"loss": 1.0835,
"step": 372
},
{
"epoch": 1.3421882035119315,
"grad_norm": 0.35986780686629694,
"learning_rate": 6.782059393566254e-06,
"loss": 1.0589,
"step": 373
},
{
"epoch": 1.3457901846015308,
"grad_norm": 0.40465760674379114,
"learning_rate": 6.762475277228393e-06,
"loss": 1.0825,
"step": 374
},
{
"epoch": 1.34939216569113,
"grad_norm": 0.3988091604755472,
"learning_rate": 6.7428602366090764e-06,
"loss": 1.0716,
"step": 375
},
{
"epoch": 1.3529941467807294,
"grad_norm": 0.40966197148471073,
"learning_rate": 6.723214615872585e-06,
"loss": 1.0922,
"step": 376
},
{
"epoch": 1.3565961278703287,
"grad_norm": 0.42278479417615644,
"learning_rate": 6.70353875971976e-06,
"loss": 1.0802,
"step": 377
},
{
"epoch": 1.360198108959928,
"grad_norm": 0.35541048253819246,
"learning_rate": 6.683833013381942e-06,
"loss": 1.0872,
"step": 378
},
{
"epoch": 1.3638000900495273,
"grad_norm": 0.3502392276010618,
"learning_rate": 6.664097722614934e-06,
"loss": 1.0583,
"step": 379
},
{
"epoch": 1.3674020711391266,
"grad_norm": 0.38138433478225825,
"learning_rate": 6.644333233692917e-06,
"loss": 1.0692,
"step": 380
},
{
"epoch": 1.371004052228726,
"grad_norm": 0.3377587100898131,
"learning_rate": 6.624539893402383e-06,
"loss": 1.079,
"step": 381
},
{
"epoch": 1.374606033318325,
"grad_norm": 0.3409130145543055,
"learning_rate": 6.604718049036047e-06,
"loss": 1.0794,
"step": 382
},
{
"epoch": 1.3782080144079243,
"grad_norm": 0.37652977172080476,
"learning_rate": 6.58486804838676e-06,
"loss": 1.0908,
"step": 383
},
{
"epoch": 1.3818099954975236,
"grad_norm": 0.3257917788013114,
"learning_rate": 6.5649902397413915e-06,
"loss": 1.0856,
"step": 384
},
{
"epoch": 1.385411976587123,
"grad_norm": 0.3195784470315217,
"learning_rate": 6.545084971874738e-06,
"loss": 1.1038,
"step": 385
},
{
"epoch": 1.3890139576767222,
"grad_norm": 0.3616341536622016,
"learning_rate": 6.525152594043389e-06,
"loss": 1.0814,
"step": 386
},
{
"epoch": 1.3926159387663215,
"grad_norm": 0.40527856810901475,
"learning_rate": 6.505193455979603e-06,
"loss": 1.0994,
"step": 387
},
{
"epoch": 1.3962179198559208,
"grad_norm": 0.35495797940458634,
"learning_rate": 6.485207907885175e-06,
"loss": 1.063,
"step": 388
},
{
"epoch": 1.3998199009455201,
"grad_norm": 0.3327326350539732,
"learning_rate": 6.465196300425287e-06,
"loss": 1.1113,
"step": 389
},
{
"epoch": 1.4034218820351194,
"grad_norm": 0.36061962924782925,
"learning_rate": 6.445158984722358e-06,
"loss": 1.0644,
"step": 390
},
{
"epoch": 1.4070238631247185,
"grad_norm": 0.34339629638301855,
"learning_rate": 6.425096312349881e-06,
"loss": 1.0904,
"step": 391
},
{
"epoch": 1.4106258442143178,
"grad_norm": 0.34978035011668274,
"learning_rate": 6.4050086353262565e-06,
"loss": 1.0788,
"step": 392
},
{
"epoch": 1.414227825303917,
"grad_norm": 0.38376522345500463,
"learning_rate": 6.384896306108612e-06,
"loss": 1.0564,
"step": 393
},
{
"epoch": 1.4178298063935164,
"grad_norm": 0.35566301025630226,
"learning_rate": 6.364759677586627e-06,
"loss": 1.083,
"step": 394
},
{
"epoch": 1.4214317874831157,
"grad_norm": 0.32888274597150713,
"learning_rate": 6.344599103076329e-06,
"loss": 1.0868,
"step": 395
},
{
"epoch": 1.425033768572715,
"grad_norm": 0.3669493819260475,
"learning_rate": 6.324414936313904e-06,
"loss": 1.0402,
"step": 396
},
{
"epoch": 1.4286357496623143,
"grad_norm": 0.3617926547245432,
"learning_rate": 6.304207531449486e-06,
"loss": 1.0905,
"step": 397
},
{
"epoch": 1.4322377307519136,
"grad_norm": 0.36414284887525294,
"learning_rate": 6.28397724304094e-06,
"loss": 1.0827,
"step": 398
},
{
"epoch": 1.435839711841513,
"grad_norm": 0.3386723453983854,
"learning_rate": 6.2637244260476474e-06,
"loss": 1.061,
"step": 399
},
{
"epoch": 1.439441692931112,
"grad_norm": 0.34781644394622324,
"learning_rate": 6.243449435824276e-06,
"loss": 1.0802,
"step": 400
},
{
"epoch": 1.4430436740207113,
"grad_norm": 0.3503485349700194,
"learning_rate": 6.223152628114537e-06,
"loss": 1.0664,
"step": 401
},
{
"epoch": 1.4466456551103106,
"grad_norm": 0.3328702702553546,
"learning_rate": 6.202834359044959e-06,
"loss": 1.0701,
"step": 402
},
{
"epoch": 1.45024763619991,
"grad_norm": 0.33914752395171327,
"learning_rate": 6.182494985118625e-06,
"loss": 1.0538,
"step": 403
},
{
"epoch": 1.4538496172895092,
"grad_norm": 0.39570376225734244,
"learning_rate": 6.1621348632089205e-06,
"loss": 1.0608,
"step": 404
},
{
"epoch": 1.4574515983791085,
"grad_norm": 0.3515088932100447,
"learning_rate": 6.141754350553279e-06,
"loss": 1.0665,
"step": 405
},
{
"epoch": 1.4610535794687078,
"grad_norm": 0.3393163686581987,
"learning_rate": 6.121353804746907e-06,
"loss": 1.0678,
"step": 406
},
{
"epoch": 1.4646555605583071,
"grad_norm": 0.363682561032409,
"learning_rate": 6.100933583736508e-06,
"loss": 1.0712,
"step": 407
},
{
"epoch": 1.4682575416479065,
"grad_norm": 0.3852133533106999,
"learning_rate": 6.080494045814011e-06,
"loss": 1.0675,
"step": 408
},
{
"epoch": 1.4718595227375055,
"grad_norm": 0.33227273331193696,
"learning_rate": 6.060035549610275e-06,
"loss": 1.0702,
"step": 409
},
{
"epoch": 1.4754615038271048,
"grad_norm": 0.34242254777920744,
"learning_rate": 6.039558454088796e-06,
"loss": 1.0887,
"step": 410
},
{
"epoch": 1.4790634849167041,
"grad_norm": 0.36440240139479513,
"learning_rate": 6.019063118539425e-06,
"loss": 1.0629,
"step": 411
},
{
"epoch": 1.4826654660063034,
"grad_norm": 0.34522394073241824,
"learning_rate": 5.9985499025720354e-06,
"loss": 1.0604,
"step": 412
},
{
"epoch": 1.4862674470959027,
"grad_norm": 0.35469984623846545,
"learning_rate": 5.978019166110242e-06,
"loss": 1.0732,
"step": 413
},
{
"epoch": 1.489869428185502,
"grad_norm": 0.3327229153625461,
"learning_rate": 5.957471269385065e-06,
"loss": 1.0916,
"step": 414
},
{
"epoch": 1.4934714092751014,
"grad_norm": 0.35947455556579844,
"learning_rate": 5.936906572928625e-06,
"loss": 1.0857,
"step": 415
},
{
"epoch": 1.4970733903647007,
"grad_norm": 0.3523660373102713,
"learning_rate": 5.9163254375677995e-06,
"loss": 1.0354,
"step": 416
},
{
"epoch": 1.5006753714543,
"grad_norm": 0.33259457908461304,
"learning_rate": 5.8957282244179125e-06,
"loss": 1.081,
"step": 417
},
{
"epoch": 1.504277352543899,
"grad_norm": 0.36043116131180003,
"learning_rate": 5.8751152948763815e-06,
"loss": 1.0882,
"step": 418
},
{
"epoch": 1.5078793336334986,
"grad_norm": 0.3644427491180247,
"learning_rate": 5.854487010616384e-06,
"loss": 1.0753,
"step": 419
},
{
"epoch": 1.5114813147230977,
"grad_norm": 0.35395707576036406,
"learning_rate": 5.8338437335805124e-06,
"loss": 1.0953,
"step": 420
},
{
"epoch": 1.515083295812697,
"grad_norm": 0.48229948381614823,
"learning_rate": 5.813185825974419e-06,
"loss": 1.1207,
"step": 421
},
{
"epoch": 1.5186852769022963,
"grad_norm": 0.38738601915939525,
"learning_rate": 5.792513650260465e-06,
"loss": 1.0958,
"step": 422
},
{
"epoch": 1.5222872579918956,
"grad_norm": 0.3259233561804611,
"learning_rate": 5.771827569151357e-06,
"loss": 1.0954,
"step": 423
},
{
"epoch": 1.5258892390814949,
"grad_norm": 0.3332133749381523,
"learning_rate": 5.751127945603786e-06,
"loss": 1.0927,
"step": 424
},
{
"epoch": 1.529491220171094,
"grad_norm": 0.32989085101834736,
"learning_rate": 5.730415142812059e-06,
"loss": 1.0527,
"step": 425
},
{
"epoch": 1.5330932012606935,
"grad_norm": 0.36851444118947263,
"learning_rate": 5.709689524201723e-06,
"loss": 1.0583,
"step": 426
},
{
"epoch": 1.5366951823502926,
"grad_norm": 0.35379667109046153,
"learning_rate": 5.68895145342319e-06,
"loss": 1.0943,
"step": 427
},
{
"epoch": 1.540297163439892,
"grad_norm": 0.37779417160624446,
"learning_rate": 5.668201294345363e-06,
"loss": 1.0812,
"step": 428
},
{
"epoch": 1.5438991445294912,
"grad_norm": 0.3256143401933206,
"learning_rate": 5.647439411049235e-06,
"loss": 1.0646,
"step": 429
},
{
"epoch": 1.5475011256190905,
"grad_norm": 0.34874005725965246,
"learning_rate": 5.626666167821522e-06,
"loss": 1.0753,
"step": 430
},
{
"epoch": 1.5511031067086898,
"grad_norm": 0.315911724351532,
"learning_rate": 5.605881929148254e-06,
"loss": 1.062,
"step": 431
},
{
"epoch": 1.554705087798289,
"grad_norm": 0.3684384723712539,
"learning_rate": 5.585087059708389e-06,
"loss": 1.0853,
"step": 432
},
{
"epoch": 1.5583070688878884,
"grad_norm": 0.3803316237726428,
"learning_rate": 5.5642819243674085e-06,
"loss": 1.0471,
"step": 433
},
{
"epoch": 1.5619090499774875,
"grad_norm": 0.3322373067202158,
"learning_rate": 5.543466888170927e-06,
"loss": 1.0472,
"step": 434
},
{
"epoch": 1.565511031067087,
"grad_norm": 0.35808606269858695,
"learning_rate": 5.522642316338268e-06,
"loss": 1.0435,
"step": 435
},
{
"epoch": 1.569113012156686,
"grad_norm": 0.3651095676354401,
"learning_rate": 5.5018085742560745e-06,
"loss": 1.0415,
"step": 436
},
{
"epoch": 1.5727149932462856,
"grad_norm": 0.3434911579987937,
"learning_rate": 5.480966027471889e-06,
"loss": 1.0683,
"step": 437
},
{
"epoch": 1.5763169743358847,
"grad_norm": 0.3704214813504543,
"learning_rate": 5.460115041687737e-06,
"loss": 1.0413,
"step": 438
},
{
"epoch": 1.579918955425484,
"grad_norm": 0.34635012302936774,
"learning_rate": 5.439255982753717e-06,
"loss": 1.097,
"step": 439
},
{
"epoch": 1.5835209365150833,
"grad_norm": 0.3894870354521757,
"learning_rate": 5.41838921666158e-06,
"loss": 1.0078,
"step": 440
},
{
"epoch": 1.5871229176046826,
"grad_norm": 0.32628973132390515,
"learning_rate": 5.3975151095383e-06,
"loss": 1.0708,
"step": 441
},
{
"epoch": 1.590724898694282,
"grad_norm": 0.353493468436304,
"learning_rate": 5.376634027639664e-06,
"loss": 1.0893,
"step": 442
},
{
"epoch": 1.594326879783881,
"grad_norm": 0.35574519333131815,
"learning_rate": 5.355746337343835e-06,
"loss": 1.0992,
"step": 443
},
{
"epoch": 1.5979288608734805,
"grad_norm": 0.3721178525169961,
"learning_rate": 5.334852405144926e-06,
"loss": 1.08,
"step": 444
},
{
"epoch": 1.6015308419630796,
"grad_norm": 0.35260152951310547,
"learning_rate": 5.3139525976465675e-06,
"loss": 1.0573,
"step": 445
},
{
"epoch": 1.6051328230526791,
"grad_norm": 0.3102871481104457,
"learning_rate": 5.293047281555482e-06,
"loss": 1.0845,
"step": 446
},
{
"epoch": 1.6087348041422782,
"grad_norm": 0.34166757066038683,
"learning_rate": 5.272136823675046e-06,
"loss": 1.0644,
"step": 447
},
{
"epoch": 1.6123367852318775,
"grad_norm": 0.3555363375900774,
"learning_rate": 5.251221590898848e-06,
"loss": 1.086,
"step": 448
},
{
"epoch": 1.6159387663214768,
"grad_norm": 0.3487085514723649,
"learning_rate": 5.230301950204261e-06,
"loss": 1.0655,
"step": 449
},
{
"epoch": 1.6195407474110761,
"grad_norm": 0.3481330917315912,
"learning_rate": 5.209378268645998e-06,
"loss": 1.0804,
"step": 450
},
{
"epoch": 1.6231427285006754,
"grad_norm": 0.3479613181209226,
"learning_rate": 5.188450913349674e-06,
"loss": 1.0802,
"step": 451
},
{
"epoch": 1.6267447095902745,
"grad_norm": 0.329411221852206,
"learning_rate": 5.167520251505358e-06,
"loss": 1.049,
"step": 452
},
{
"epoch": 1.630346690679874,
"grad_norm": 0.373931387358175,
"learning_rate": 5.146586650361143e-06,
"loss": 1.05,
"step": 453
},
{
"epoch": 1.6339486717694731,
"grad_norm": 0.33179661919955405,
"learning_rate": 5.1256504772166885e-06,
"loss": 1.068,
"step": 454
},
{
"epoch": 1.6375506528590726,
"grad_norm": 0.3248298243422403,
"learning_rate": 5.1047120994167855e-06,
"loss": 1.0459,
"step": 455
},
{
"epoch": 1.6411526339486717,
"grad_norm": 0.32872162641417635,
"learning_rate": 5.083771884344908e-06,
"loss": 1.1005,
"step": 456
},
{
"epoch": 1.644754615038271,
"grad_norm": 0.3350940372390754,
"learning_rate": 5.062830199416764e-06,
"loss": 1.0729,
"step": 457
},
{
"epoch": 1.6483565961278703,
"grad_norm": 0.3500027893173043,
"learning_rate": 5.041887412073853e-06,
"loss": 1.056,
"step": 458
},
{
"epoch": 1.6519585772174696,
"grad_norm": 0.352966799403548,
"learning_rate": 5.0209438897770205e-06,
"loss": 1.0368,
"step": 459
},
{
"epoch": 1.655560558307069,
"grad_norm": 0.3570213767690983,
"learning_rate": 5e-06,
"loss": 1.1005,
"step": 460
},
{
"epoch": 1.659162539396668,
"grad_norm": 0.35681386612113786,
"learning_rate": 4.979056110222982e-06,
"loss": 1.0552,
"step": 461
},
{
"epoch": 1.6627645204862675,
"grad_norm": 0.37777499026486816,
"learning_rate": 4.9581125879261476e-06,
"loss": 1.0655,
"step": 462
},
{
"epoch": 1.6663665015758666,
"grad_norm": 0.3720854550185324,
"learning_rate": 4.937169800583237e-06,
"loss": 1.0905,
"step": 463
},
{
"epoch": 1.6699684826654662,
"grad_norm": 0.35871679077028334,
"learning_rate": 4.9162281156550945e-06,
"loss": 1.0735,
"step": 464
},
{
"epoch": 1.6735704637550652,
"grad_norm": 0.32746933618519874,
"learning_rate": 4.895287900583216e-06,
"loss": 1.0375,
"step": 465
},
{
"epoch": 1.6771724448446645,
"grad_norm": 0.38118331962443885,
"learning_rate": 4.874349522783313e-06,
"loss": 1.0646,
"step": 466
},
{
"epoch": 1.6807744259342638,
"grad_norm": 0.35946893815132736,
"learning_rate": 4.853413349638859e-06,
"loss": 1.0828,
"step": 467
},
{
"epoch": 1.6843764070238632,
"grad_norm": 0.3666759251106939,
"learning_rate": 4.832479748494643e-06,
"loss": 1.0551,
"step": 468
},
{
"epoch": 1.6879783881134625,
"grad_norm": 0.3508414333413744,
"learning_rate": 4.811549086650327e-06,
"loss": 1.0599,
"step": 469
},
{
"epoch": 1.6915803692030615,
"grad_norm": 0.35707827749845356,
"learning_rate": 4.7906217313540035e-06,
"loss": 1.0759,
"step": 470
},
{
"epoch": 1.695182350292661,
"grad_norm": 0.3788611425520442,
"learning_rate": 4.769698049795739e-06,
"loss": 1.0357,
"step": 471
},
{
"epoch": 1.6987843313822601,
"grad_norm": 0.31406557358200793,
"learning_rate": 4.748778409101153e-06,
"loss": 1.0835,
"step": 472
},
{
"epoch": 1.7023863124718597,
"grad_norm": 0.330042469142287,
"learning_rate": 4.727863176324955e-06,
"loss": 1.0356,
"step": 473
},
{
"epoch": 1.7059882935614588,
"grad_norm": 0.3692982637487251,
"learning_rate": 4.706952718444518e-06,
"loss": 1.0224,
"step": 474
},
{
"epoch": 1.709590274651058,
"grad_norm": 0.3401427800549121,
"learning_rate": 4.686047402353433e-06,
"loss": 1.056,
"step": 475
},
{
"epoch": 1.7131922557406574,
"grad_norm": 0.32559689457568264,
"learning_rate": 4.6651475948550765e-06,
"loss": 1.072,
"step": 476
},
{
"epoch": 1.7167942368302567,
"grad_norm": 0.33516871825696326,
"learning_rate": 4.644253662656167e-06,
"loss": 1.056,
"step": 477
},
{
"epoch": 1.720396217919856,
"grad_norm": 0.34667604652264583,
"learning_rate": 4.6233659723603374e-06,
"loss": 1.0555,
"step": 478
},
{
"epoch": 1.723998199009455,
"grad_norm": 0.37161554561132193,
"learning_rate": 4.602484890461702e-06,
"loss": 1.0563,
"step": 479
},
{
"epoch": 1.7276001800990546,
"grad_norm": 0.31848164234901527,
"learning_rate": 4.581610783338424e-06,
"loss": 1.0941,
"step": 480
},
{
"epoch": 1.7312021611886537,
"grad_norm": 0.3594252163107309,
"learning_rate": 4.560744017246284e-06,
"loss": 1.0751,
"step": 481
},
{
"epoch": 1.7348041422782532,
"grad_norm": 0.3544059030193539,
"learning_rate": 4.539884958312265e-06,
"loss": 1.0469,
"step": 482
},
{
"epoch": 1.7384061233678523,
"grad_norm": 0.32959836916197055,
"learning_rate": 4.519033972528114e-06,
"loss": 1.063,
"step": 483
},
{
"epoch": 1.7420081044574516,
"grad_norm": 0.31875669846466737,
"learning_rate": 4.4981914257439254e-06,
"loss": 1.0575,
"step": 484
},
{
"epoch": 1.7456100855470509,
"grad_norm": 0.31894865236781544,
"learning_rate": 4.477357683661734e-06,
"loss": 1.0591,
"step": 485
},
{
"epoch": 1.7492120666366502,
"grad_norm": 0.355450564535886,
"learning_rate": 4.456533111829076e-06,
"loss": 1.1063,
"step": 486
},
{
"epoch": 1.7528140477262495,
"grad_norm": 0.35660689731655426,
"learning_rate": 4.4357180756325915e-06,
"loss": 1.025,
"step": 487
},
{
"epoch": 1.7564160288158486,
"grad_norm": 0.35786503142864406,
"learning_rate": 4.414912940291614e-06,
"loss": 1.0731,
"step": 488
},
{
"epoch": 1.760018009905448,
"grad_norm": 0.33168551352286857,
"learning_rate": 4.394118070851749e-06,
"loss": 1.0152,
"step": 489
},
{
"epoch": 1.7636199909950472,
"grad_norm": 0.31557735871932313,
"learning_rate": 4.373333832178478e-06,
"loss": 1.0658,
"step": 490
},
{
"epoch": 1.7672219720846467,
"grad_norm": 0.3643796172518758,
"learning_rate": 4.352560588950766e-06,
"loss": 1.0788,
"step": 491
},
{
"epoch": 1.7708239531742458,
"grad_norm": 0.3342250830340594,
"learning_rate": 4.331798705654639e-06,
"loss": 1.0929,
"step": 492
},
{
"epoch": 1.774425934263845,
"grad_norm": 0.8828365078514346,
"learning_rate": 4.31104854657681e-06,
"loss": 1.0723,
"step": 493
},
{
"epoch": 1.7780279153534444,
"grad_norm": 0.32325289629440884,
"learning_rate": 4.290310475798278e-06,
"loss": 1.0472,
"step": 494
},
{
"epoch": 1.7816298964430437,
"grad_norm": 0.36167307979154,
"learning_rate": 4.269584857187942e-06,
"loss": 1.0803,
"step": 495
},
{
"epoch": 1.785231877532643,
"grad_norm": 0.3112535067026799,
"learning_rate": 4.248872054396215e-06,
"loss": 1.0522,
"step": 496
},
{
"epoch": 1.788833858622242,
"grad_norm": 0.34086457664496106,
"learning_rate": 4.228172430848645e-06,
"loss": 1.0775,
"step": 497
},
{
"epoch": 1.7924358397118416,
"grad_norm": 0.30310958551948036,
"learning_rate": 4.207486349739538e-06,
"loss": 1.0487,
"step": 498
},
{
"epoch": 1.7960378208014407,
"grad_norm": 0.3511261941310994,
"learning_rate": 4.186814174025582e-06,
"loss": 1.0844,
"step": 499
},
{
"epoch": 1.7996398018910402,
"grad_norm": 0.3503896982707501,
"learning_rate": 4.166156266419489e-06,
"loss": 1.0132,
"step": 500
},
{
"epoch": 1.8032417829806393,
"grad_norm": 0.3436670120686724,
"learning_rate": 4.145512989383618e-06,
"loss": 1.0598,
"step": 501
},
{
"epoch": 1.8068437640702386,
"grad_norm": 0.34785159088570466,
"learning_rate": 4.124884705123619e-06,
"loss": 1.035,
"step": 502
},
{
"epoch": 1.810445745159838,
"grad_norm": 0.33165542782534474,
"learning_rate": 4.104271775582089e-06,
"loss": 1.0358,
"step": 503
},
{
"epoch": 1.8140477262494372,
"grad_norm": 0.4070554454819295,
"learning_rate": 4.083674562432203e-06,
"loss": 1.0434,
"step": 504
},
{
"epoch": 1.8176497073390365,
"grad_norm": 0.3624273928106286,
"learning_rate": 4.063093427071376e-06,
"loss": 1.0995,
"step": 505
},
{
"epoch": 1.8212516884286356,
"grad_norm": 0.32728343459448644,
"learning_rate": 4.042528730614935e-06,
"loss": 1.0178,
"step": 506
},
{
"epoch": 1.8248536695182351,
"grad_norm": 0.31687128264594894,
"learning_rate": 4.02198083388976e-06,
"loss": 1.0649,
"step": 507
},
{
"epoch": 1.8284556506078342,
"grad_norm": 0.3744296721151693,
"learning_rate": 4.001450097427965e-06,
"loss": 1.0252,
"step": 508
},
{
"epoch": 1.8320576316974337,
"grad_norm": 0.3132845490517284,
"learning_rate": 3.980936881460576e-06,
"loss": 1.0795,
"step": 509
},
{
"epoch": 1.8356596127870328,
"grad_norm": 0.3420985303611589,
"learning_rate": 3.960441545911205e-06,
"loss": 1.0746,
"step": 510
},
{
"epoch": 1.8392615938766321,
"grad_norm": 0.307478947705401,
"learning_rate": 3.939964450389728e-06,
"loss": 1.0853,
"step": 511
},
{
"epoch": 1.8428635749662314,
"grad_norm": 0.33071208183560175,
"learning_rate": 3.91950595418599e-06,
"loss": 1.0528,
"step": 512
},
{
"epoch": 1.8464655560558307,
"grad_norm": 0.3340631255697292,
"learning_rate": 3.899066416263493e-06,
"loss": 1.0198,
"step": 513
},
{
"epoch": 1.85006753714543,
"grad_norm": 0.32856275060694273,
"learning_rate": 3.8786461952530955e-06,
"loss": 1.0969,
"step": 514
},
{
"epoch": 1.8536695182350291,
"grad_norm": 0.3416526016934508,
"learning_rate": 3.8582456494467214e-06,
"loss": 1.0863,
"step": 515
},
{
"epoch": 1.8572714993246286,
"grad_norm": 0.3185489633634486,
"learning_rate": 3.83786513679108e-06,
"loss": 1.0385,
"step": 516
},
{
"epoch": 1.8608734804142277,
"grad_norm": 0.32623706503747446,
"learning_rate": 3.817505014881378e-06,
"loss": 1.0144,
"step": 517
},
{
"epoch": 1.8644754615038273,
"grad_norm": 0.32218552881112167,
"learning_rate": 3.797165640955041e-06,
"loss": 1.0705,
"step": 518
},
{
"epoch": 1.8680774425934263,
"grad_norm": 0.30139922286880777,
"learning_rate": 3.776847371885464e-06,
"loss": 1.0493,
"step": 519
},
{
"epoch": 1.8716794236830256,
"grad_norm": 0.377029678977555,
"learning_rate": 3.756550564175727e-06,
"loss": 1.0494,
"step": 520
},
{
"epoch": 1.875281404772625,
"grad_norm": 0.345992186114263,
"learning_rate": 3.736275573952354e-06,
"loss": 1.0228,
"step": 521
},
{
"epoch": 1.8788833858622243,
"grad_norm": 0.3033717191767872,
"learning_rate": 3.716022756959061e-06,
"loss": 1.0896,
"step": 522
},
{
"epoch": 1.8824853669518236,
"grad_norm": 0.32124744536039596,
"learning_rate": 3.695792468550517e-06,
"loss": 1.0467,
"step": 523
},
{
"epoch": 1.8860873480414226,
"grad_norm": 0.3470918739977987,
"learning_rate": 3.6755850636860956e-06,
"loss": 1.0524,
"step": 524
},
{
"epoch": 1.8896893291310222,
"grad_norm": 0.38530316140769727,
"learning_rate": 3.655400896923672e-06,
"loss": 1.0685,
"step": 525
},
{
"epoch": 1.8932913102206212,
"grad_norm": 0.34989092164325525,
"learning_rate": 3.635240322413375e-06,
"loss": 1.0953,
"step": 526
},
{
"epoch": 1.8968932913102208,
"grad_norm": 0.30619682455422553,
"learning_rate": 3.6151036938913887e-06,
"loss": 1.0866,
"step": 527
},
{
"epoch": 1.9004952723998199,
"grad_norm": 0.33603759713948095,
"learning_rate": 3.5949913646737456e-06,
"loss": 1.0562,
"step": 528
},
{
"epoch": 1.9040972534894192,
"grad_norm": 0.36283301326571393,
"learning_rate": 3.5749036876501196e-06,
"loss": 1.0877,
"step": 529
},
{
"epoch": 1.9076992345790185,
"grad_norm": 0.32449495469021755,
"learning_rate": 3.5548410152776414e-06,
"loss": 1.0926,
"step": 530
},
{
"epoch": 1.9113012156686178,
"grad_norm": 0.30909965481121454,
"learning_rate": 3.5348036995747135e-06,
"loss": 1.0924,
"step": 531
},
{
"epoch": 1.914903196758217,
"grad_norm": 0.31489637458548564,
"learning_rate": 3.5147920921148267e-06,
"loss": 1.0828,
"step": 532
},
{
"epoch": 1.9185051778478162,
"grad_norm": 0.3330350499732472,
"learning_rate": 3.4948065440203982e-06,
"loss": 1.0685,
"step": 533
},
{
"epoch": 1.9221071589374157,
"grad_norm": 0.3107239605147675,
"learning_rate": 3.474847405956613e-06,
"loss": 1.0618,
"step": 534
},
{
"epoch": 1.9257091400270148,
"grad_norm": 0.34315983029146,
"learning_rate": 3.4549150281252635e-06,
"loss": 1.1048,
"step": 535
},
{
"epoch": 1.9293111211166143,
"grad_norm": 0.3227737291510128,
"learning_rate": 3.4350097602586085e-06,
"loss": 1.0491,
"step": 536
},
{
"epoch": 1.9329131022062134,
"grad_norm": 0.30320716585058866,
"learning_rate": 3.4151319516132414e-06,
"loss": 1.0179,
"step": 537
},
{
"epoch": 1.9365150832958127,
"grad_norm": 0.32851520671529355,
"learning_rate": 3.3952819509639534e-06,
"loss": 1.0495,
"step": 538
},
{
"epoch": 1.940117064385412,
"grad_norm": 0.33703603920644415,
"learning_rate": 3.375460106597619e-06,
"loss": 1.0148,
"step": 539
},
{
"epoch": 1.9437190454750113,
"grad_norm": 0.3097947001373875,
"learning_rate": 3.355666766307084e-06,
"loss": 1.1156,
"step": 540
},
{
"epoch": 1.9473210265646106,
"grad_norm": 0.3068291690397491,
"learning_rate": 3.3359022773850673e-06,
"loss": 1.0315,
"step": 541
},
{
"epoch": 1.9509230076542097,
"grad_norm": 0.3376120770611421,
"learning_rate": 3.31616698661806e-06,
"loss": 1.0564,
"step": 542
},
{
"epoch": 1.9545249887438092,
"grad_norm": 0.29124426624612915,
"learning_rate": 3.2964612402802422e-06,
"loss": 1.0689,
"step": 543
},
{
"epoch": 1.9581269698334083,
"grad_norm": 0.2948609004870433,
"learning_rate": 3.2767853841274154e-06,
"loss": 1.0684,
"step": 544
},
{
"epoch": 1.9617289509230078,
"grad_norm": 0.3558137852450314,
"learning_rate": 3.2571397633909252e-06,
"loss": 1.0452,
"step": 545
},
{
"epoch": 1.965330932012607,
"grad_norm": 0.3008490022930337,
"learning_rate": 3.2375247227716077e-06,
"loss": 1.0235,
"step": 546
},
{
"epoch": 1.9689329131022062,
"grad_norm": 0.31102274440087274,
"learning_rate": 3.217940606433747e-06,
"loss": 1.0575,
"step": 547
},
{
"epoch": 1.9725348941918055,
"grad_norm": 0.33011139669007306,
"learning_rate": 3.1983877579990276e-06,
"loss": 1.0419,
"step": 548
},
{
"epoch": 1.9761368752814048,
"grad_norm": 0.314573659163516,
"learning_rate": 3.178866520540509e-06,
"loss": 1.0364,
"step": 549
},
{
"epoch": 1.979738856371004,
"grad_norm": 0.313348256207242,
"learning_rate": 3.1593772365766107e-06,
"loss": 1.0615,
"step": 550
},
{
"epoch": 1.9833408374606032,
"grad_norm": 0.32258850263277733,
"learning_rate": 3.139920248065095e-06,
"loss": 1.0896,
"step": 551
},
{
"epoch": 1.9869428185502027,
"grad_norm": 0.3154271741355316,
"learning_rate": 3.1204958963970666e-06,
"loss": 1.0501,
"step": 552
},
{
"epoch": 1.9905447996398018,
"grad_norm": 0.3406321762516789,
"learning_rate": 3.1011045223909954e-06,
"loss": 1.0761,
"step": 553
},
{
"epoch": 1.9941467807294013,
"grad_norm": 0.31522039092493703,
"learning_rate": 3.0817464662867192e-06,
"loss": 1.0556,
"step": 554
},
{
"epoch": 1.9977487618190004,
"grad_norm": 0.3147135586324213,
"learning_rate": 3.0624220677394854e-06,
"loss": 1.0857,
"step": 555
},
{
"epoch": 2.0,
"grad_norm": 0.44015502627582576,
"learning_rate": 3.043131665813988e-06,
"loss": 1.0463,
"step": 556
},
{
"epoch": 2.003601981089599,
"grad_norm": 0.3983894180930443,
"learning_rate": 3.023875598978419e-06,
"loss": 1.0393,
"step": 557
},
{
"epoch": 2.0072039621791986,
"grad_norm": 0.3067099309247845,
"learning_rate": 3.004654205098524e-06,
"loss": 1.0605,
"step": 558
},
{
"epoch": 2.0108059432687977,
"grad_norm": 0.3357353560695444,
"learning_rate": 2.9854678214316875e-06,
"loss": 1.0417,
"step": 559
},
{
"epoch": 2.014407924358397,
"grad_norm": 0.33999056854523163,
"learning_rate": 2.966316784621e-06,
"loss": 1.024,
"step": 560
},
{
"epoch": 2.0180099054479963,
"grad_norm": 0.31226386349104857,
"learning_rate": 2.9472014306893605e-06,
"loss": 1.0485,
"step": 561
},
{
"epoch": 2.021611886537596,
"grad_norm": 0.31603080640243814,
"learning_rate": 2.92812209503358e-06,
"loss": 1.0414,
"step": 562
},
{
"epoch": 2.025213867627195,
"grad_norm": 0.3459067464369587,
"learning_rate": 2.9090791124184934e-06,
"loss": 1.0756,
"step": 563
},
{
"epoch": 2.0288158487167944,
"grad_norm": 0.3330987662321279,
"learning_rate": 2.8900728169710866e-06,
"loss": 1.054,
"step": 564
},
{
"epoch": 2.0324178298063935,
"grad_norm": 0.2927527791629305,
"learning_rate": 2.871103542174637e-06,
"loss": 1.0665,
"step": 565
},
{
"epoch": 2.0360198108959926,
"grad_norm": 0.33886103259967365,
"learning_rate": 2.8521716208628597e-06,
"loss": 1.0595,
"step": 566
},
{
"epoch": 2.039621791985592,
"grad_norm": 0.31642624101440003,
"learning_rate": 2.8332773852140644e-06,
"loss": 1.0177,
"step": 567
},
{
"epoch": 2.043223773075191,
"grad_norm": 0.32001337639878913,
"learning_rate": 2.814421166745337e-06,
"loss": 1.0461,
"step": 568
},
{
"epoch": 2.0468257541647907,
"grad_norm": 0.3024517860403017,
"learning_rate": 2.795603296306708e-06,
"loss": 1.0665,
"step": 569
},
{
"epoch": 2.05042773525439,
"grad_norm": 0.3442352491720829,
"learning_rate": 2.776824104075364e-06,
"loss": 1.0368,
"step": 570
},
{
"epoch": 2.0540297163439893,
"grad_norm": 0.3293545677344158,
"learning_rate": 2.7580839195498397e-06,
"loss": 1.043,
"step": 571
},
{
"epoch": 2.0576316974335884,
"grad_norm": 0.31332600205637107,
"learning_rate": 2.739383071544246e-06,
"loss": 1.0476,
"step": 572
},
{
"epoch": 2.061233678523188,
"grad_norm": 0.32030515645666624,
"learning_rate": 2.7207218881825016e-06,
"loss": 1.0542,
"step": 573
},
{
"epoch": 2.064835659612787,
"grad_norm": 0.3364547680473557,
"learning_rate": 2.7021006968925613e-06,
"loss": 1.0364,
"step": 574
},
{
"epoch": 2.068437640702386,
"grad_norm": 0.3145267889190637,
"learning_rate": 2.683519824400693e-06,
"loss": 1.0621,
"step": 575
},
{
"epoch": 2.0720396217919856,
"grad_norm": 0.3221194724938311,
"learning_rate": 2.6649795967257243e-06,
"loss": 1.0827,
"step": 576
},
{
"epoch": 2.0756416028815847,
"grad_norm": 0.31171845327350106,
"learning_rate": 2.646480339173337e-06,
"loss": 1.0327,
"step": 577
},
{
"epoch": 2.0792435839711843,
"grad_norm": 0.2884769700670282,
"learning_rate": 2.6280223763303546e-06,
"loss": 1.0488,
"step": 578
},
{
"epoch": 2.0828455650607833,
"grad_norm": 0.3214670803689811,
"learning_rate": 2.6096060320590393e-06,
"loss": 1.0175,
"step": 579
},
{
"epoch": 2.086447546150383,
"grad_norm": 0.29588927971342016,
"learning_rate": 2.5912316294914232e-06,
"loss": 1.0299,
"step": 580
},
{
"epoch": 2.090049527239982,
"grad_norm": 0.3179276570664923,
"learning_rate": 2.5728994910236304e-06,
"loss": 1.0416,
"step": 581
},
{
"epoch": 2.0936515083295815,
"grad_norm": 0.32306719861333466,
"learning_rate": 2.5546099383102206e-06,
"loss": 1.043,
"step": 582
},
{
"epoch": 2.0972534894191805,
"grad_norm": 0.3201304980987189,
"learning_rate": 2.536363292258543e-06,
"loss": 1.0612,
"step": 583
},
{
"epoch": 2.1008554705087796,
"grad_norm": 0.2925431078505077,
"learning_rate": 2.518159873023116e-06,
"loss": 1.0317,
"step": 584
},
{
"epoch": 2.104457451598379,
"grad_norm": 0.33029491252375176,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.0468,
"step": 585
},
{
"epoch": 2.1080594326879782,
"grad_norm": 0.3093075735098707,
"learning_rate": 2.4818839918211963e-06,
"loss": 1.0264,
"step": 586
},
{
"epoch": 2.1116614137775778,
"grad_norm": 0.33517262514779006,
"learning_rate": 2.4638121663490546e-06,
"loss": 1.0125,
"step": 587
},
{
"epoch": 2.115263394867177,
"grad_norm": 0.29875408693671646,
"learning_rate": 2.4457848406707014e-06,
"loss": 1.0145,
"step": 588
},
{
"epoch": 2.1188653759567764,
"grad_norm": 0.3461841979966471,
"learning_rate": 2.4278023310924676e-06,
"loss": 1.0526,
"step": 589
},
{
"epoch": 2.1224673570463755,
"grad_norm": 0.316595399237612,
"learning_rate": 2.40986495313435e-06,
"loss": 1.0276,
"step": 590
},
{
"epoch": 2.126069338135975,
"grad_norm": 0.35509810185127894,
"learning_rate": 2.391973021524461e-06,
"loss": 1.0236,
"step": 591
},
{
"epoch": 2.129671319225574,
"grad_norm": 0.31577714871143664,
"learning_rate": 2.3741268501935212e-06,
"loss": 1.0668,
"step": 592
},
{
"epoch": 2.133273300315173,
"grad_norm": 0.30363778483757914,
"learning_rate": 2.356326752269342e-06,
"loss": 1.0399,
"step": 593
},
{
"epoch": 2.1368752814047727,
"grad_norm": 0.2951535358604565,
"learning_rate": 2.338573040071332e-06,
"loss": 1.0674,
"step": 594
},
{
"epoch": 2.1404772624943718,
"grad_norm": 0.2916733821330133,
"learning_rate": 2.320866025105016e-06,
"loss": 1.0251,
"step": 595
},
{
"epoch": 2.1440792435839713,
"grad_norm": 0.28186913326636054,
"learning_rate": 2.303206018056583e-06,
"loss": 1.04,
"step": 596
},
{
"epoch": 2.1476812246735704,
"grad_norm": 0.3291151473315765,
"learning_rate": 2.285593328787414e-06,
"loss": 1.0173,
"step": 597
},
{
"epoch": 2.15128320576317,
"grad_norm": 0.3065042887064959,
"learning_rate": 2.268028266328655e-06,
"loss": 1.0294,
"step": 598
},
{
"epoch": 2.154885186852769,
"grad_norm": 0.3025327291635849,
"learning_rate": 2.250511138875801e-06,
"loss": 1.046,
"step": 599
},
{
"epoch": 2.1584871679423685,
"grad_norm": 0.28100597708822905,
"learning_rate": 2.23304225378328e-06,
"loss": 1.0289,
"step": 600
},
{
"epoch": 2.1620891490319676,
"grad_norm": 0.31006702418509163,
"learning_rate": 2.2156219175590623e-06,
"loss": 1.023,
"step": 601
},
{
"epoch": 2.1656911301215667,
"grad_norm": 0.2962970912734003,
"learning_rate": 2.1982504358592777e-06,
"loss": 1.0775,
"step": 602
},
{
"epoch": 2.169293111211166,
"grad_norm": 0.32597592835782413,
"learning_rate": 2.1809281134828663e-06,
"loss": 1.0409,
"step": 603
},
{
"epoch": 2.1728950923007653,
"grad_norm": 0.30783288748012444,
"learning_rate": 2.1636552543662187e-06,
"loss": 1.0609,
"step": 604
},
{
"epoch": 2.176497073390365,
"grad_norm": 0.3197715534590088,
"learning_rate": 2.146432161577842e-06,
"loss": 1.0341,
"step": 605
},
{
"epoch": 2.180099054479964,
"grad_norm": 0.30989671790932927,
"learning_rate": 2.1292591373130515e-06,
"loss": 1.0292,
"step": 606
},
{
"epoch": 2.1837010355695634,
"grad_norm": 0.315438048629008,
"learning_rate": 2.112136482888663e-06,
"loss": 1.0171,
"step": 607
},
{
"epoch": 2.1873030166591625,
"grad_norm": 0.3140514935443536,
"learning_rate": 2.095064498737701e-06,
"loss": 1.0406,
"step": 608
},
{
"epoch": 2.190904997748762,
"grad_norm": 0.36444536122323545,
"learning_rate": 2.07804348440414e-06,
"loss": 1.0474,
"step": 609
},
{
"epoch": 2.194506978838361,
"grad_norm": 0.32038773790017444,
"learning_rate": 2.061073738537635e-06,
"loss": 1.0406,
"step": 610
},
{
"epoch": 2.19810895992796,
"grad_norm": 0.3327293671870662,
"learning_rate": 2.04415555888829e-06,
"loss": 1.0473,
"step": 611
},
{
"epoch": 2.2017109410175597,
"grad_norm": 0.3110981786314029,
"learning_rate": 2.027289242301435e-06,
"loss": 1.0674,
"step": 612
},
{
"epoch": 2.205312922107159,
"grad_norm": 0.3001638141607491,
"learning_rate": 2.0104750847124075e-06,
"loss": 1.0543,
"step": 613
},
{
"epoch": 2.2089149031967583,
"grad_norm": 0.3336943922980599,
"learning_rate": 1.9937133811413666e-06,
"loss": 1.0378,
"step": 614
},
{
"epoch": 2.2125168842863574,
"grad_norm": 0.3189387269628659,
"learning_rate": 1.977004425688126e-06,
"loss": 1.0182,
"step": 615
},
{
"epoch": 2.216118865375957,
"grad_norm": 0.2953897029068166,
"learning_rate": 1.9603485115269743e-06,
"loss": 1.0307,
"step": 616
},
{
"epoch": 2.219720846465556,
"grad_norm": 0.3354775874836228,
"learning_rate": 1.9437459309015426e-06,
"loss": 1.0582,
"step": 617
},
{
"epoch": 2.2233228275551555,
"grad_norm": 0.3155756131913934,
"learning_rate": 1.927196975119678e-06,
"loss": 1.0729,
"step": 618
},
{
"epoch": 2.2269248086447546,
"grad_norm": 0.30314021638523436,
"learning_rate": 1.910701934548329e-06,
"loss": 1.0675,
"step": 619
},
{
"epoch": 2.2305267897343537,
"grad_norm": 0.3178641177812712,
"learning_rate": 1.8942610986084487e-06,
"loss": 1.0193,
"step": 620
},
{
"epoch": 2.2341287708239532,
"grad_norm": 0.27497288823347316,
"learning_rate": 1.8778747557699223e-06,
"loss": 1.0397,
"step": 621
},
{
"epoch": 2.2377307519135523,
"grad_norm": 0.27473090413221113,
"learning_rate": 1.8615431935464984e-06,
"loss": 1.0814,
"step": 622
},
{
"epoch": 2.241332733003152,
"grad_norm": 0.29969676069735396,
"learning_rate": 1.8452666984907519e-06,
"loss": 1.0464,
"step": 623
},
{
"epoch": 2.244934714092751,
"grad_norm": 0.29678899593418184,
"learning_rate": 1.829045556189053e-06,
"loss": 1.0635,
"step": 624
},
{
"epoch": 2.2485366951823504,
"grad_norm": 0.30067878966920947,
"learning_rate": 1.8128800512565514e-06,
"loss": 1.0325,
"step": 625
},
{
"epoch": 2.2521386762719495,
"grad_norm": 0.28275287186797854,
"learning_rate": 1.7967704673321917e-06,
"loss": 1.0606,
"step": 626
},
{
"epoch": 2.2557406573615486,
"grad_norm": 0.27780010911864134,
"learning_rate": 1.7807170870737317e-06,
"loss": 1.061,
"step": 627
},
{
"epoch": 2.259342638451148,
"grad_norm": 0.30625432629700144,
"learning_rate": 1.7647201921527802e-06,
"loss": 1.0008,
"step": 628
},
{
"epoch": 2.2629446195407477,
"grad_norm": 0.30785488054179105,
"learning_rate": 1.7487800632498547e-06,
"loss": 1.0072,
"step": 629
},
{
"epoch": 2.2665466006303467,
"grad_norm": 0.3052922709373008,
"learning_rate": 1.7328969800494727e-06,
"loss": 1.0549,
"step": 630
},
{
"epoch": 2.270148581719946,
"grad_norm": 0.3040902290300385,
"learning_rate": 1.7170712212352187e-06,
"loss": 1.0484,
"step": 631
},
{
"epoch": 2.2737505628095454,
"grad_norm": 0.3114270358030963,
"learning_rate": 1.7013030644848698e-06,
"loss": 1.065,
"step": 632
},
{
"epoch": 2.2773525438991444,
"grad_norm": 0.30134193406734877,
"learning_rate": 1.6855927864655241e-06,
"loss": 1.0003,
"step": 633
},
{
"epoch": 2.280954524988744,
"grad_norm": 0.29795611858634163,
"learning_rate": 1.6699406628287423e-06,
"loss": 1.0447,
"step": 634
},
{
"epoch": 2.284556506078343,
"grad_norm": 0.3148996427378843,
"learning_rate": 1.6543469682057105e-06,
"loss": 1.0709,
"step": 635
},
{
"epoch": 2.2881584871679426,
"grad_norm": 0.29121080028920326,
"learning_rate": 1.6388119762024213e-06,
"loss": 1.0482,
"step": 636
},
{
"epoch": 2.2917604682575416,
"grad_norm": 0.288859982023708,
"learning_rate": 1.6233359593948777e-06,
"loss": 1.0803,
"step": 637
},
{
"epoch": 2.2953624493471407,
"grad_norm": 0.33078409972348893,
"learning_rate": 1.6079191893243102e-06,
"loss": 1.0652,
"step": 638
},
{
"epoch": 2.2989644304367403,
"grad_norm": 0.306929918665794,
"learning_rate": 1.5925619364924016e-06,
"loss": 1.0426,
"step": 639
},
{
"epoch": 2.3025664115263393,
"grad_norm": 0.27619745792679457,
"learning_rate": 1.5772644703565564e-06,
"loss": 1.0295,
"step": 640
},
{
"epoch": 2.306168392615939,
"grad_norm": 0.2889850236837387,
"learning_rate": 1.5620270593251635e-06,
"loss": 1.0017,
"step": 641
},
{
"epoch": 2.309770373705538,
"grad_norm": 0.2997580312987809,
"learning_rate": 1.5468499707528856e-06,
"loss": 1.0433,
"step": 642
},
{
"epoch": 2.3133723547951375,
"grad_norm": 0.2934596363171497,
"learning_rate": 1.531733470935976e-06,
"loss": 1.1025,
"step": 643
},
{
"epoch": 2.3169743358847366,
"grad_norm": 0.2984641308954964,
"learning_rate": 1.5166778251075964e-06,
"loss": 1.0295,
"step": 644
},
{
"epoch": 2.3205763169743356,
"grad_norm": 0.30803103965283024,
"learning_rate": 1.5016832974331725e-06,
"loss": 1.0625,
"step": 645
},
{
"epoch": 2.324178298063935,
"grad_norm": 0.3075838465758903,
"learning_rate": 1.4867501510057548e-06,
"loss": 1.0274,
"step": 646
},
{
"epoch": 2.3277802791535347,
"grad_norm": 0.2794304614904837,
"learning_rate": 1.4718786478413983e-06,
"loss": 1.0705,
"step": 647
},
{
"epoch": 2.3313822602431338,
"grad_norm": 0.2854931127642351,
"learning_rate": 1.4570690488745687e-06,
"loss": 1.072,
"step": 648
},
{
"epoch": 2.334984241332733,
"grad_norm": 0.2976715865621331,
"learning_rate": 1.4423216139535735e-06,
"loss": 1.0519,
"step": 649
},
{
"epoch": 2.3385862224223324,
"grad_norm": 0.2867454551588936,
"learning_rate": 1.4276366018359845e-06,
"loss": 1.0656,
"step": 650
},
{
"epoch": 2.3421882035119315,
"grad_norm": 0.2997877110355175,
"learning_rate": 1.4130142701841076e-06,
"loss": 1.0207,
"step": 651
},
{
"epoch": 2.345790184601531,
"grad_norm": 0.29242665391081074,
"learning_rate": 1.3984548755604655e-06,
"loss": 1.0295,
"step": 652
},
{
"epoch": 2.34939216569113,
"grad_norm": 0.28957351738255677,
"learning_rate": 1.3839586734232907e-06,
"loss": 1.0187,
"step": 653
},
{
"epoch": 2.3529941467807296,
"grad_norm": 0.31920070289406366,
"learning_rate": 1.3695259181220405e-06,
"loss": 1.0309,
"step": 654
},
{
"epoch": 2.3565961278703287,
"grad_norm": 0.3015410819374185,
"learning_rate": 1.3551568628929434e-06,
"loss": 1.0334,
"step": 655
},
{
"epoch": 2.3601981089599278,
"grad_norm": 0.3076515876213567,
"learning_rate": 1.3408517598545446e-06,
"loss": 0.9868,
"step": 656
},
{
"epoch": 2.3638000900495273,
"grad_norm": 0.3176154294797028,
"learning_rate": 1.3266108600032928e-06,
"loss": 1.0543,
"step": 657
},
{
"epoch": 2.3674020711391264,
"grad_norm": 0.3071265193798959,
"learning_rate": 1.312434413209131e-06,
"loss": 1.0443,
"step": 658
},
{
"epoch": 2.371004052228726,
"grad_norm": 0.2868126401515119,
"learning_rate": 1.2983226682111094e-06,
"loss": 1.0031,
"step": 659
},
{
"epoch": 2.374606033318325,
"grad_norm": 0.30911382077034116,
"learning_rate": 1.2842758726130283e-06,
"loss": 1.0161,
"step": 660
},
{
"epoch": 2.3782080144079245,
"grad_norm": 0.292638404203282,
"learning_rate": 1.2702942728790897e-06,
"loss": 1.0435,
"step": 661
},
{
"epoch": 2.3818099954975236,
"grad_norm": 0.3131805047178825,
"learning_rate": 1.2563781143295705e-06,
"loss": 1.0817,
"step": 662
},
{
"epoch": 2.3854119765871227,
"grad_norm": 0.27540737813728455,
"learning_rate": 1.24252764113652e-06,
"loss": 1.0299,
"step": 663
},
{
"epoch": 2.389013957676722,
"grad_norm": 0.36871741578270023,
"learning_rate": 1.2287430963194807e-06,
"loss": 1.0207,
"step": 664
},
{
"epoch": 2.3926159387663217,
"grad_norm": 0.2795983680218255,
"learning_rate": 1.2150247217412186e-06,
"loss": 1.0227,
"step": 665
},
{
"epoch": 2.396217919855921,
"grad_norm": 0.32352488852148475,
"learning_rate": 1.2013727581034783e-06,
"loss": 1.0234,
"step": 666
},
{
"epoch": 2.39981990094552,
"grad_norm": 0.297035764973451,
"learning_rate": 1.18778744494276e-06,
"loss": 1.0286,
"step": 667
},
{
"epoch": 2.4034218820351194,
"grad_norm": 0.3019576240092055,
"learning_rate": 1.1742690206261293e-06,
"loss": 1.0221,
"step": 668
},
{
"epoch": 2.4070238631247185,
"grad_norm": 0.2935186392415227,
"learning_rate": 1.160817722347014e-06,
"loss": 1.0341,
"step": 669
},
{
"epoch": 2.410625844214318,
"grad_norm": 0.29550059263202566,
"learning_rate": 1.1474337861210543e-06,
"loss": 1.0312,
"step": 670
},
{
"epoch": 2.414227825303917,
"grad_norm": 0.2919473066109118,
"learning_rate": 1.1341174467819637e-06,
"loss": 1.0145,
"step": 671
},
{
"epoch": 2.4178298063935166,
"grad_norm": 0.2770789920290827,
"learning_rate": 1.120868937977404e-06,
"loss": 1.0233,
"step": 672
},
{
"epoch": 2.4214317874831157,
"grad_norm": 0.2726692177451945,
"learning_rate": 1.1076884921648834e-06,
"loss": 1.0763,
"step": 673
},
{
"epoch": 2.425033768572715,
"grad_norm": 0.2955147912539993,
"learning_rate": 1.0945763406076837e-06,
"loss": 1.0431,
"step": 674
},
{
"epoch": 2.4286357496623143,
"grad_norm": 0.2937917795969739,
"learning_rate": 1.0815327133708015e-06,
"loss": 1.0238,
"step": 675
},
{
"epoch": 2.4322377307519134,
"grad_norm": 0.26163762430548565,
"learning_rate": 1.0685578393169054e-06,
"loss": 1.0572,
"step": 676
},
{
"epoch": 2.435839711841513,
"grad_norm": 0.30900006884034514,
"learning_rate": 1.0556519461023301e-06,
"loss": 1.0148,
"step": 677
},
{
"epoch": 2.439441692931112,
"grad_norm": 0.28768364202012864,
"learning_rate": 1.0428152601730718e-06,
"loss": 1.0526,
"step": 678
},
{
"epoch": 2.4430436740207115,
"grad_norm": 0.31064843044382456,
"learning_rate": 1.0300480067608232e-06,
"loss": 1.0131,
"step": 679
},
{
"epoch": 2.4466456551103106,
"grad_norm": 0.2790738585338088,
"learning_rate": 1.0173504098790188e-06,
"loss": 1.0432,
"step": 680
},
{
"epoch": 2.4502476361999097,
"grad_norm": 0.2675802235255538,
"learning_rate": 1.0047226923189024e-06,
"loss": 1.0592,
"step": 681
},
{
"epoch": 2.4538496172895092,
"grad_norm": 0.2820500392897806,
"learning_rate": 9.921650756456164e-07,
"loss": 1.0406,
"step": 682
},
{
"epoch": 2.4574515983791088,
"grad_norm": 0.2712979205650806,
"learning_rate": 9.79677780194327e-07,
"loss": 1.0469,
"step": 683
},
{
"epoch": 2.461053579468708,
"grad_norm": 0.28678767820139744,
"learning_rate": 9.67261025066339e-07,
"loss": 0.9984,
"step": 684
},
{
"epoch": 2.464655560558307,
"grad_norm": 0.2826669677633045,
"learning_rate": 9.549150281252633e-07,
"loss": 1.0439,
"step": 685
},
{
"epoch": 2.4682575416479065,
"grad_norm": 0.2898206571793001,
"learning_rate": 9.426400059931956e-07,
"loss": 1.0046,
"step": 686
},
{
"epoch": 2.4718595227375055,
"grad_norm": 0.3055004594875761,
"learning_rate": 9.304361740469103e-07,
"loss": 1.0269,
"step": 687
},
{
"epoch": 2.475461503827105,
"grad_norm": 0.28000409560086253,
"learning_rate": 9.183037464140804e-07,
"loss": 1.0274,
"step": 688
},
{
"epoch": 2.479063484916704,
"grad_norm": 0.2956303701365367,
"learning_rate": 9.06242935969528e-07,
"loss": 1.0458,
"step": 689
},
{
"epoch": 2.4826654660063037,
"grad_norm": 0.29216773948217756,
"learning_rate": 8.942539543314799e-07,
"loss": 1.0254,
"step": 690
},
{
"epoch": 2.4862674470959027,
"grad_norm": 0.3038042079243165,
"learning_rate": 8.823370118578628e-07,
"loss": 1.048,
"step": 691
},
{
"epoch": 2.489869428185502,
"grad_norm": 0.27942852348483804,
"learning_rate": 8.704923176426072e-07,
"loss": 1.0092,
"step": 692
},
{
"epoch": 2.4934714092751014,
"grad_norm": 0.28829213899925576,
"learning_rate": 8.587200795119793e-07,
"loss": 1.0443,
"step": 693
},
{
"epoch": 2.4970733903647004,
"grad_norm": 0.3075901120188406,
"learning_rate": 8.470205040209362e-07,
"loss": 1.0754,
"step": 694
},
{
"epoch": 2.5006753714543,
"grad_norm": 0.28958620497520454,
"learning_rate": 8.353937964495029e-07,
"loss": 1.0209,
"step": 695
},
{
"epoch": 2.504277352543899,
"grad_norm": 0.27671142949543276,
"learning_rate": 8.238401607991647e-07,
"loss": 1.0168,
"step": 696
},
{
"epoch": 2.5078793336334986,
"grad_norm": 0.273803663136562,
"learning_rate": 8.123597997892918e-07,
"loss": 1.0222,
"step": 697
},
{
"epoch": 2.5114813147230977,
"grad_norm": 0.2822637921207432,
"learning_rate": 8.009529148535855e-07,
"loss": 1.0219,
"step": 698
},
{
"epoch": 2.5150832958126967,
"grad_norm": 0.2943865446548967,
"learning_rate": 7.89619706136539e-07,
"loss": 1.0429,
"step": 699
},
{
"epoch": 2.5186852769022963,
"grad_norm": 0.27277951808804135,
"learning_rate": 7.783603724899258e-07,
"loss": 1.0446,
"step": 700
},
{
"epoch": 2.522287257991896,
"grad_norm": 0.27077579945237057,
"learning_rate": 7.671751114693104e-07,
"loss": 1.0381,
"step": 701
},
{
"epoch": 2.525889239081495,
"grad_norm": 0.27280532919773465,
"learning_rate": 7.560641193305912e-07,
"loss": 1.0239,
"step": 702
},
{
"epoch": 2.529491220171094,
"grad_norm": 0.2790422908370001,
"learning_rate": 7.450275910265415e-07,
"loss": 1.0249,
"step": 703
},
{
"epoch": 2.5330932012606935,
"grad_norm": 0.27523036662265893,
"learning_rate": 7.34065720203399e-07,
"loss": 1.0051,
"step": 704
},
{
"epoch": 2.5366951823502926,
"grad_norm": 0.2771023164701056,
"learning_rate": 7.23178699197467e-07,
"loss": 1.0354,
"step": 705
},
{
"epoch": 2.540297163439892,
"grad_norm": 0.2666702640648939,
"learning_rate": 7.123667190317396e-07,
"loss": 1.0277,
"step": 706
},
{
"epoch": 2.543899144529491,
"grad_norm": 0.2781381560124755,
"learning_rate": 7.01629969412545e-07,
"loss": 1.0658,
"step": 707
},
{
"epoch": 2.5475011256190907,
"grad_norm": 0.2880670528582805,
"learning_rate": 6.909686387262255e-07,
"loss": 1.0257,
"step": 708
},
{
"epoch": 2.55110310670869,
"grad_norm": 0.2772242207866922,
"learning_rate": 6.803829140358237e-07,
"loss": 1.0496,
"step": 709
},
{
"epoch": 2.554705087798289,
"grad_norm": 0.34213950210212746,
"learning_rate": 6.698729810778065e-07,
"loss": 1.0317,
"step": 710
},
{
"epoch": 2.5583070688878884,
"grad_norm": 0.29889700157480303,
"learning_rate": 6.594390242588044e-07,
"loss": 1.0357,
"step": 711
},
{
"epoch": 2.5619090499774875,
"grad_norm": 0.281098345760278,
"learning_rate": 6.490812266523716e-07,
"loss": 1.0532,
"step": 712
},
{
"epoch": 2.565511031067087,
"grad_norm": 0.27502660732949663,
"learning_rate": 6.387997699957815e-07,
"loss": 1.0581,
"step": 713
},
{
"epoch": 2.569113012156686,
"grad_norm": 0.2914484204504316,
"learning_rate": 6.28594834686832e-07,
"loss": 0.9895,
"step": 714
},
{
"epoch": 2.5727149932462856,
"grad_norm": 0.3129296314272526,
"learning_rate": 6.184665997806832e-07,
"loss": 1.0147,
"step": 715
},
{
"epoch": 2.5763169743358847,
"grad_norm": 0.32079210829379234,
"learning_rate": 6.084152429867113e-07,
"loss": 1.0406,
"step": 716
},
{
"epoch": 2.5799189554254838,
"grad_norm": 0.29202037625742366,
"learning_rate": 5.98440940665399e-07,
"loss": 1.0466,
"step": 717
},
{
"epoch": 2.5835209365150833,
"grad_norm": 0.295396320039525,
"learning_rate": 5.885438678252342e-07,
"loss": 1.0549,
"step": 718
},
{
"epoch": 2.587122917604683,
"grad_norm": 0.3054962097094399,
"learning_rate": 5.787241981196384e-07,
"loss": 1.0325,
"step": 719
},
{
"epoch": 2.590724898694282,
"grad_norm": 0.28026222206941487,
"learning_rate": 5.689821038439264e-07,
"loss": 1.051,
"step": 720
},
{
"epoch": 2.594326879783881,
"grad_norm": 0.32032760630187107,
"learning_rate": 5.593177559322776e-07,
"loss": 1.0006,
"step": 721
},
{
"epoch": 2.5979288608734805,
"grad_norm": 0.3213596256851929,
"learning_rate": 5.497313239547374e-07,
"loss": 0.9861,
"step": 722
},
{
"epoch": 2.6015308419630796,
"grad_norm": 0.28144430111804236,
"learning_rate": 5.402229761142464e-07,
"loss": 1.0751,
"step": 723
},
{
"epoch": 2.605132823052679,
"grad_norm": 0.31364935141684536,
"learning_rate": 5.307928792436812e-07,
"loss": 1.0723,
"step": 724
},
{
"epoch": 2.608734804142278,
"grad_norm": 0.27300612039452254,
"learning_rate": 5.214411988029355e-07,
"loss": 1.0382,
"step": 725
},
{
"epoch": 2.6123367852318777,
"grad_norm": 0.2856505846764118,
"learning_rate": 5.121680988760125e-07,
"loss": 1.0649,
"step": 726
},
{
"epoch": 2.615938766321477,
"grad_norm": 0.30242925411320504,
"learning_rate": 5.029737421681446e-07,
"loss": 1.0253,
"step": 727
},
{
"epoch": 2.619540747411076,
"grad_norm": 0.2760307219806458,
"learning_rate": 4.938582900029437e-07,
"loss": 1.0117,
"step": 728
},
{
"epoch": 2.6231427285006754,
"grad_norm": 0.2813571189889987,
"learning_rate": 4.848219023195644e-07,
"loss": 1.0558,
"step": 729
},
{
"epoch": 2.6267447095902745,
"grad_norm": 0.27273042137937875,
"learning_rate": 4.758647376699033e-07,
"loss": 1.044,
"step": 730
},
{
"epoch": 2.630346690679874,
"grad_norm": 0.2962356696193387,
"learning_rate": 4.6698695321581165e-07,
"loss": 1.0591,
"step": 731
},
{
"epoch": 2.633948671769473,
"grad_norm": 0.3021705699786841,
"learning_rate": 4.581887047263445e-07,
"loss": 1.0098,
"step": 732
},
{
"epoch": 2.6375506528590726,
"grad_norm": 0.3116020937459149,
"learning_rate": 4.494701465750217e-07,
"loss": 0.9851,
"step": 733
},
{
"epoch": 2.6411526339486717,
"grad_norm": 0.26030040597442616,
"learning_rate": 4.4083143173712207e-07,
"loss": 1.0507,
"step": 734
},
{
"epoch": 2.644754615038271,
"grad_norm": 0.2754248870408236,
"learning_rate": 4.322727117869951e-07,
"loss": 1.0644,
"step": 735
},
{
"epoch": 2.6483565961278703,
"grad_norm": 0.28205843108275575,
"learning_rate": 4.237941368954124e-07,
"loss": 1.0424,
"step": 736
},
{
"epoch": 2.65195857721747,
"grad_norm": 0.29294591358526023,
"learning_rate": 4.153958558269189e-07,
"loss": 1.0482,
"step": 737
},
{
"epoch": 2.655560558307069,
"grad_norm": 0.27169863680028755,
"learning_rate": 4.0707801593723006e-07,
"loss": 1.0163,
"step": 738
},
{
"epoch": 2.659162539396668,
"grad_norm": 0.27860960386729466,
"learning_rate": 3.9884076317064813e-07,
"loss": 1.0333,
"step": 739
},
{
"epoch": 2.6627645204862675,
"grad_norm": 0.2770379539586451,
"learning_rate": 3.90684242057498e-07,
"loss": 1.0574,
"step": 740
},
{
"epoch": 2.6663665015758666,
"grad_norm": 0.2668295100673454,
"learning_rate": 3.8260859571158883e-07,
"loss": 1.018,
"step": 741
},
{
"epoch": 2.669968482665466,
"grad_norm": 0.30903619747795014,
"learning_rate": 3.7461396582771035e-07,
"loss": 1.033,
"step": 742
},
{
"epoch": 2.6735704637550652,
"grad_norm": 0.28672971110408385,
"learning_rate": 3.6670049267913954e-07,
"loss": 1.0349,
"step": 743
},
{
"epoch": 2.6771724448446648,
"grad_norm": 0.276249475350017,
"learning_rate": 3.5886831511518336e-07,
"loss": 1.0317,
"step": 744
},
{
"epoch": 2.680774425934264,
"grad_norm": 0.2676884521307514,
"learning_rate": 3.511175705587433e-07,
"loss": 1.0254,
"step": 745
},
{
"epoch": 2.684376407023863,
"grad_norm": 0.2983580659797731,
"learning_rate": 3.434483950038986e-07,
"loss": 1.0412,
"step": 746
},
{
"epoch": 2.6879783881134625,
"grad_norm": 0.28438744098885343,
"learning_rate": 3.358609230135268e-07,
"loss": 1.0502,
"step": 747
},
{
"epoch": 2.6915803692030615,
"grad_norm": 0.2715735909059339,
"learning_rate": 3.283552877169399e-07,
"loss": 1.0289,
"step": 748
},
{
"epoch": 2.695182350292661,
"grad_norm": 0.2970172751919579,
"learning_rate": 3.2093162080754634e-07,
"loss": 1.0279,
"step": 749
},
{
"epoch": 2.69878433138226,
"grad_norm": 0.2845334892172726,
"learning_rate": 3.135900525405428e-07,
"loss": 1.0137,
"step": 750
},
{
"epoch": 2.7023863124718597,
"grad_norm": 0.27255472615309784,
"learning_rate": 3.0633071173062966e-07,
"loss": 1.0262,
"step": 751
},
{
"epoch": 2.7059882935614588,
"grad_norm": 0.27752394798698354,
"learning_rate": 2.99153725749749e-07,
"loss": 1.0454,
"step": 752
},
{
"epoch": 2.709590274651058,
"grad_norm": 0.276728317098971,
"learning_rate": 2.920592205248496e-07,
"loss": 0.9963,
"step": 753
},
{
"epoch": 2.7131922557406574,
"grad_norm": 0.27889890261263245,
"learning_rate": 2.850473205356774e-07,
"loss": 1.0716,
"step": 754
},
{
"epoch": 2.716794236830257,
"grad_norm": 0.31799452745614826,
"learning_rate": 2.7811814881259503e-07,
"loss": 1.0446,
"step": 755
},
{
"epoch": 2.720396217919856,
"grad_norm": 0.27681370096335217,
"learning_rate": 2.712718269344161e-07,
"loss": 1.0388,
"step": 756
},
{
"epoch": 2.723998199009455,
"grad_norm": 0.36693230305260593,
"learning_rate": 2.6450847502627883e-07,
"loss": 1.0537,
"step": 757
},
{
"epoch": 2.7276001800990546,
"grad_norm": 0.29241171362928314,
"learning_rate": 2.578282117575343e-07,
"loss": 1.0404,
"step": 758
},
{
"epoch": 2.7312021611886537,
"grad_norm": 0.2579639214579812,
"learning_rate": 2.5123115433966615e-07,
"loss": 1.0798,
"step": 759
},
{
"epoch": 2.734804142278253,
"grad_norm": 0.27185445345296794,
"learning_rate": 2.447174185242324e-07,
"loss": 1.0262,
"step": 760
},
{
"epoch": 2.7384061233678523,
"grad_norm": 0.2886015166476546,
"learning_rate": 2.3828711860083676e-07,
"loss": 1.0316,
"step": 761
},
{
"epoch": 2.742008104457452,
"grad_norm": 0.3031146928198293,
"learning_rate": 2.319403673951204e-07,
"loss": 1.0525,
"step": 762
},
{
"epoch": 2.745610085547051,
"grad_norm": 0.2703340838231308,
"learning_rate": 2.2567727626678527e-07,
"loss": 1.0723,
"step": 763
},
{
"epoch": 2.74921206663665,
"grad_norm": 0.2650868418407948,
"learning_rate": 2.1949795510763872e-07,
"loss": 1.0205,
"step": 764
},
{
"epoch": 2.7528140477262495,
"grad_norm": 0.26760082891770176,
"learning_rate": 2.134025123396638e-07,
"loss": 1.0261,
"step": 765
},
{
"epoch": 2.7564160288158486,
"grad_norm": 0.2717529102257122,
"learning_rate": 2.0739105491312028e-07,
"loss": 1.044,
"step": 766
},
{
"epoch": 2.760018009905448,
"grad_norm": 0.26225450814352563,
"learning_rate": 2.0146368830466668e-07,
"loss": 1.0793,
"step": 767
},
{
"epoch": 2.763619990995047,
"grad_norm": 0.3063150762890465,
"learning_rate": 1.9562051651550784e-07,
"loss": 1.0673,
"step": 768
},
{
"epoch": 2.7672219720846467,
"grad_norm": 0.2774099125784962,
"learning_rate": 1.8986164206957037e-07,
"loss": 1.0673,
"step": 769
},
{
"epoch": 2.770823953174246,
"grad_norm": 0.2847835154599416,
"learning_rate": 1.841871660117095e-07,
"loss": 1.0311,
"step": 770
},
{
"epoch": 2.774425934263845,
"grad_norm": 0.25146022356844167,
"learning_rate": 1.785971879059273e-07,
"loss": 1.0194,
"step": 771
},
{
"epoch": 2.7780279153534444,
"grad_norm": 0.3024944481082907,
"learning_rate": 1.7309180583363062e-07,
"loss": 0.9938,
"step": 772
},
{
"epoch": 2.781629896443044,
"grad_norm": 0.2875295564587982,
"learning_rate": 1.6767111639191202e-07,
"loss": 1.0043,
"step": 773
},
{
"epoch": 2.785231877532643,
"grad_norm": 0.28086121255020596,
"learning_rate": 1.6233521469185054e-07,
"loss": 1.0393,
"step": 774
},
{
"epoch": 2.788833858622242,
"grad_norm": 0.28337014562304264,
"learning_rate": 1.5708419435684463e-07,
"loss": 1.0467,
"step": 775
},
{
"epoch": 2.7924358397118416,
"grad_norm": 0.2695975671118399,
"learning_rate": 1.5191814752097024e-07,
"loss": 1.0277,
"step": 776
},
{
"epoch": 2.7960378208014407,
"grad_norm": 0.2596950818220317,
"learning_rate": 1.4683716482736364e-07,
"loss": 1.0375,
"step": 777
},
{
"epoch": 2.7996398018910402,
"grad_norm": 0.2833370915529864,
"learning_rate": 1.4184133542663014e-07,
"loss": 1.0402,
"step": 778
},
{
"epoch": 2.8032417829806393,
"grad_norm": 0.2837127477632618,
"learning_rate": 1.3693074697528231e-07,
"loss": 1.0663,
"step": 779
},
{
"epoch": 2.806843764070239,
"grad_norm": 0.258408242099485,
"learning_rate": 1.3210548563419857e-07,
"loss": 1.0245,
"step": 780
},
{
"epoch": 2.810445745159838,
"grad_norm": 0.29442506294804527,
"learning_rate": 1.2736563606711384e-07,
"loss": 0.9917,
"step": 781
},
{
"epoch": 2.814047726249437,
"grad_norm": 0.29244090698076763,
"learning_rate": 1.2271128143913458e-07,
"loss": 1.0409,
"step": 782
},
{
"epoch": 2.8176497073390365,
"grad_norm": 0.26403315463361793,
"learning_rate": 1.1814250341527611e-07,
"loss": 1.0486,
"step": 783
},
{
"epoch": 2.8212516884286356,
"grad_norm": 0.2723144354203953,
"learning_rate": 1.136593821590326e-07,
"loss": 1.0543,
"step": 784
},
{
"epoch": 2.824853669518235,
"grad_norm": 0.2850711230800217,
"learning_rate": 1.0926199633097156e-07,
"loss": 1.0095,
"step": 785
},
{
"epoch": 2.828455650607834,
"grad_norm": 0.2771326731497052,
"learning_rate": 1.0495042308735104e-07,
"loss": 1.0138,
"step": 786
},
{
"epoch": 2.8320576316974337,
"grad_norm": 0.2667788079924329,
"learning_rate": 1.007247380787657e-07,
"loss": 1.0354,
"step": 787
},
{
"epoch": 2.835659612787033,
"grad_norm": 0.27205262075900666,
"learning_rate": 9.658501544882182e-08,
"loss": 1.008,
"step": 788
},
{
"epoch": 2.839261593876632,
"grad_norm": 0.278153414604589,
"learning_rate": 9.253132783283548e-08,
"loss": 1.0575,
"step": 789
},
{
"epoch": 2.8428635749662314,
"grad_norm": 0.2722033011712004,
"learning_rate": 8.856374635655696e-08,
"loss": 1.0373,
"step": 790
},
{
"epoch": 2.846465556055831,
"grad_norm": 0.2643963535331226,
"learning_rate": 8.468234063492287e-08,
"loss": 1.0331,
"step": 791
},
{
"epoch": 2.85006753714543,
"grad_norm": 0.2772917134427986,
"learning_rate": 8.088717877083706e-08,
"loss": 0.9933,
"step": 792
},
{
"epoch": 2.853669518235029,
"grad_norm": 0.2567788777229302,
"learning_rate": 7.717832735397335e-08,
"loss": 1.039,
"step": 793
},
{
"epoch": 2.8572714993246286,
"grad_norm": 0.3279492936255398,
"learning_rate": 7.355585145960743e-08,
"loss": 1.066,
"step": 794
},
{
"epoch": 2.8608734804142277,
"grad_norm": 0.26103624246805707,
"learning_rate": 7.001981464747565e-08,
"loss": 1.0502,
"step": 795
},
{
"epoch": 2.8644754615038273,
"grad_norm": 0.28523237313847577,
"learning_rate": 6.657027896065982e-08,
"loss": 1.0071,
"step": 796
},
{
"epoch": 2.8680774425934263,
"grad_norm": 0.2714272091000359,
"learning_rate": 6.3207304924498e-08,
"loss": 1.0614,
"step": 797
},
{
"epoch": 2.871679423683026,
"grad_norm": 0.28016748152731064,
"learning_rate": 5.993095154552431e-08,
"loss": 1.0324,
"step": 798
},
{
"epoch": 2.875281404772625,
"grad_norm": 0.28138701855091797,
"learning_rate": 5.674127631043025e-08,
"loss": 1.0357,
"step": 799
},
{
"epoch": 2.878883385862224,
"grad_norm": 0.259827571346895,
"learning_rate": 5.363833518505834e-08,
"loss": 1.0203,
"step": 800
},
{
"epoch": 2.8824853669518236,
"grad_norm": 0.30527969392963294,
"learning_rate": 5.062218261342122e-08,
"loss": 1.04,
"step": 801
},
{
"epoch": 2.8860873480414226,
"grad_norm": 0.3336787676428948,
"learning_rate": 4.769287151674407e-08,
"loss": 1.0177,
"step": 802
},
{
"epoch": 2.889689329131022,
"grad_norm": 0.2673565531562563,
"learning_rate": 4.485045329253646e-08,
"loss": 0.9941,
"step": 803
},
{
"epoch": 2.8932913102206212,
"grad_norm": 0.27644924732915116,
"learning_rate": 4.209497781369143e-08,
"loss": 1.047,
"step": 804
},
{
"epoch": 2.8968932913102208,
"grad_norm": 0.2762539625768617,
"learning_rate": 3.9426493427611177e-08,
"loss": 1.0618,
"step": 805
},
{
"epoch": 2.90049527239982,
"grad_norm": 0.279438570935992,
"learning_rate": 3.684504695535496e-08,
"loss": 1.0407,
"step": 806
},
{
"epoch": 2.904097253489419,
"grad_norm": 0.25093375155659065,
"learning_rate": 3.435068369082306e-08,
"loss": 1.0263,
"step": 807
},
{
"epoch": 2.9076992345790185,
"grad_norm": 0.28318467682760556,
"learning_rate": 3.194344739995803e-08,
"loss": 1.0519,
"step": 808
},
{
"epoch": 2.911301215668618,
"grad_norm": 0.2904491497208285,
"learning_rate": 2.9623380319976912e-08,
"loss": 1.0549,
"step": 809
},
{
"epoch": 2.914903196758217,
"grad_norm": 0.2605165703364476,
"learning_rate": 2.7390523158633552e-08,
"loss": 1.0166,
"step": 810
},
{
"epoch": 2.918505177847816,
"grad_norm": 0.27714338429064156,
"learning_rate": 2.5244915093499134e-08,
"loss": 1.0533,
"step": 811
},
{
"epoch": 2.9221071589374157,
"grad_norm": 0.275239630057375,
"learning_rate": 2.3186593771280518e-08,
"loss": 1.0233,
"step": 812
},
{
"epoch": 2.9257091400270148,
"grad_norm": 0.27858108381395574,
"learning_rate": 2.1215595307154667e-08,
"loss": 0.9939,
"step": 813
},
{
"epoch": 2.9293111211166143,
"grad_norm": 0.2685380458847235,
"learning_rate": 1.9331954284137476e-08,
"loss": 1.0429,
"step": 814
},
{
"epoch": 2.9329131022062134,
"grad_norm": 0.2721576649644654,
"learning_rate": 1.753570375247815e-08,
"loss": 1.0401,
"step": 815
},
{
"epoch": 2.936515083295813,
"grad_norm": 0.25219694366538187,
"learning_rate": 1.582687522907633e-08,
"loss": 1.0321,
"step": 816
},
{
"epoch": 2.940117064385412,
"grad_norm": 0.31225958064880055,
"learning_rate": 1.4205498696930332e-08,
"loss": 1.0564,
"step": 817
},
{
"epoch": 2.943719045475011,
"grad_norm": 0.2907802700059505,
"learning_rate": 1.2671602604612531e-08,
"loss": 1.0549,
"step": 818
},
{
"epoch": 2.9473210265646106,
"grad_norm": 0.2620211100717753,
"learning_rate": 1.1225213865767026e-08,
"loss": 1.0135,
"step": 819
},
{
"epoch": 2.9509230076542097,
"grad_norm": 0.2662621532694749,
"learning_rate": 9.866357858642206e-09,
"loss": 1.0217,
"step": 820
},
{
"epoch": 2.954524988743809,
"grad_norm": 0.26026540743415805,
"learning_rate": 8.595058425640012e-09,
"loss": 1.0242,
"step": 821
},
{
"epoch": 2.9581269698334083,
"grad_norm": 0.27157295590276637,
"learning_rate": 7.411337872900715e-09,
"loss": 0.9976,
"step": 822
},
{
"epoch": 2.961728950923008,
"grad_norm": 0.2810030913242284,
"learning_rate": 6.315216969912663e-09,
"loss": 1.0426,
"step": 823
},
{
"epoch": 2.965330932012607,
"grad_norm": 0.27067627376772047,
"learning_rate": 5.306714949143699e-09,
"loss": 1.0691,
"step": 824
},
{
"epoch": 2.968932913102206,
"grad_norm": 0.26312224124888717,
"learning_rate": 4.385849505708084e-09,
"loss": 1.0305,
"step": 825
},
{
"epoch": 2.9725348941918055,
"grad_norm": 0.28983516142663135,
"learning_rate": 3.5526367970539765e-09,
"loss": 1.0732,
"step": 826
},
{
"epoch": 2.976136875281405,
"grad_norm": 0.2824286593101622,
"learning_rate": 2.8070914426786555e-09,
"loss": 1.0435,
"step": 827
},
{
"epoch": 2.979738856371004,
"grad_norm": 0.26087637492866167,
"learning_rate": 2.149226523874837e-09,
"loss": 1.0591,
"step": 828
},
{
"epoch": 2.983340837460603,
"grad_norm": 0.2909906050244136,
"learning_rate": 1.5790535835003006e-09,
"loss": 1.0739,
"step": 829
},
{
"epoch": 2.9869428185502027,
"grad_norm": 0.26693864211195434,
"learning_rate": 1.096582625772502e-09,
"loss": 1.037,
"step": 830
},
{
"epoch": 2.990544799639802,
"grad_norm": 0.28489493631726986,
"learning_rate": 7.018221160981498e-10,
"loss": 1.0347,
"step": 831
},
{
"epoch": 2.9941467807294013,
"grad_norm": 0.2886886682421759,
"learning_rate": 3.9477898091944135e-10,
"loss": 1.0298,
"step": 832
},
{
"epoch": 2.9977487618190004,
"grad_norm": 0.28032318013340757,
"learning_rate": 1.7545860759693446e-10,
"loss": 0.9798,
"step": 833
},
{
"epoch": 3.0,
"grad_norm": 0.28032318013340757,
"learning_rate": 4.3864844311847235e-11,
"loss": 1.0169,
"step": 834
}
],
"logging_steps": 1,
"max_steps": 834,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1614971456716800.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}