{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999066554653225,
"eval_steps": 14,
"global_step": 1339,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000746756277419957,
"grad_norm": 39.864603672245025,
"learning_rate": 2.439024390243903e-07,
"loss": 1.872,
"step": 1
},
{
"epoch": 0.001493512554839914,
"grad_norm": 25.154230105266365,
"learning_rate": 4.878048780487805e-07,
"loss": 1.9997,
"step": 2
},
{
"epoch": 0.0022402688322598714,
"grad_norm": 39.12364037859748,
"learning_rate": 7.317073170731707e-07,
"loss": 1.9296,
"step": 3
},
{
"epoch": 0.002987025109679828,
"grad_norm": 33.83330023997382,
"learning_rate": 9.75609756097561e-07,
"loss": 1.9772,
"step": 4
},
{
"epoch": 0.003733781387099785,
"grad_norm": 21.94955478058591,
"learning_rate": 1.2195121951219514e-06,
"loss": 1.87,
"step": 5
},
{
"epoch": 0.004480537664519743,
"grad_norm": 20.905610860537625,
"learning_rate": 1.4634146341463414e-06,
"loss": 1.7998,
"step": 6
},
{
"epoch": 0.005227293941939699,
"grad_norm": 22.006935536653316,
"learning_rate": 1.707317073170732e-06,
"loss": 1.8928,
"step": 7
},
{
"epoch": 0.005974050219359656,
"grad_norm": 19.70076963746501,
"learning_rate": 1.951219512195122e-06,
"loss": 1.9504,
"step": 8
},
{
"epoch": 0.006720806496779613,
"grad_norm": 22.22349502935227,
"learning_rate": 2.1951219512195125e-06,
"loss": 2.1065,
"step": 9
},
{
"epoch": 0.00746756277419957,
"grad_norm": 153.22088865153165,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.8163,
"step": 10
},
{
"epoch": 0.008214319051619528,
"grad_norm": 12.758400252716752,
"learning_rate": 2.682926829268293e-06,
"loss": 1.7599,
"step": 11
},
{
"epoch": 0.008961075329039486,
"grad_norm": 17.03707793518957,
"learning_rate": 2.926829268292683e-06,
"loss": 1.8174,
"step": 12
},
{
"epoch": 0.009707831606459442,
"grad_norm": 12.542896293073882,
"learning_rate": 3.1707317073170736e-06,
"loss": 1.6822,
"step": 13
},
{
"epoch": 0.010454587883879398,
"grad_norm": 16.440723028682665,
"learning_rate": 3.414634146341464e-06,
"loss": 1.7731,
"step": 14
},
{
"epoch": 0.010454587883879398,
"eval_loss": 1.4025799036026,
"eval_runtime": 179.1157,
"eval_samples_per_second": 100.656,
"eval_steps_per_second": 1.574,
"step": 14
},
{
"epoch": 0.011201344161299356,
"grad_norm": 17.64290219497995,
"learning_rate": 3.6585365853658537e-06,
"loss": 1.7477,
"step": 15
},
{
"epoch": 0.011948100438719312,
"grad_norm": 14.176146937463686,
"learning_rate": 3.902439024390244e-06,
"loss": 1.6269,
"step": 16
},
{
"epoch": 0.01269485671613927,
"grad_norm": 10.32131977607697,
"learning_rate": 4.146341463414634e-06,
"loss": 1.5132,
"step": 17
},
{
"epoch": 0.013441612993559227,
"grad_norm": 11.77667555622052,
"learning_rate": 4.390243902439025e-06,
"loss": 1.6,
"step": 18
},
{
"epoch": 0.014188369270979185,
"grad_norm": 11.65934333405466,
"learning_rate": 4.634146341463416e-06,
"loss": 1.5687,
"step": 19
},
{
"epoch": 0.01493512554839914,
"grad_norm": 14.71495813797744,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.4833,
"step": 20
},
{
"epoch": 0.0156818818258191,
"grad_norm": 19.563751131877677,
"learning_rate": 5.121951219512195e-06,
"loss": 1.5502,
"step": 21
},
{
"epoch": 0.016428638103239055,
"grad_norm": 9.697052484002269,
"learning_rate": 5.365853658536586e-06,
"loss": 1.499,
"step": 22
},
{
"epoch": 0.01717539438065901,
"grad_norm": 7.808180192853328,
"learning_rate": 5.609756097560977e-06,
"loss": 1.4112,
"step": 23
},
{
"epoch": 0.01792215065807897,
"grad_norm": 7.808693680801286,
"learning_rate": 5.853658536585366e-06,
"loss": 1.379,
"step": 24
},
{
"epoch": 0.018668906935498927,
"grad_norm": 10.205161557594558,
"learning_rate": 6.0975609756097564e-06,
"loss": 1.5749,
"step": 25
},
{
"epoch": 0.019415663212918884,
"grad_norm": 10.082022554511203,
"learning_rate": 6.341463414634147e-06,
"loss": 1.5026,
"step": 26
},
{
"epoch": 0.02016241949033884,
"grad_norm": 7.5938534821295285,
"learning_rate": 6.585365853658538e-06,
"loss": 1.4385,
"step": 27
},
{
"epoch": 0.020909175767758796,
"grad_norm": 8.080362258830748,
"learning_rate": 6.829268292682928e-06,
"loss": 1.368,
"step": 28
},
{
"epoch": 0.020909175767758796,
"eval_loss": 1.2178118228912354,
"eval_runtime": 160.607,
"eval_samples_per_second": 112.255,
"eval_steps_per_second": 1.756,
"step": 28
},
{
"epoch": 0.021655932045178756,
"grad_norm": 6.42748880141974,
"learning_rate": 7.0731707317073175e-06,
"loss": 1.373,
"step": 29
},
{
"epoch": 0.022402688322598712,
"grad_norm": 11.993286705868165,
"learning_rate": 7.317073170731707e-06,
"loss": 1.2981,
"step": 30
},
{
"epoch": 0.02314944460001867,
"grad_norm": 9.70428655700436,
"learning_rate": 7.560975609756098e-06,
"loss": 1.3723,
"step": 31
},
{
"epoch": 0.023896200877438625,
"grad_norm": 7.543123356855298,
"learning_rate": 7.804878048780489e-06,
"loss": 1.3837,
"step": 32
},
{
"epoch": 0.024642957154858584,
"grad_norm": 6.3551480225016865,
"learning_rate": 8.048780487804879e-06,
"loss": 1.3434,
"step": 33
},
{
"epoch": 0.02538971343227854,
"grad_norm": 6.170362178766895,
"learning_rate": 8.292682926829268e-06,
"loss": 1.2914,
"step": 34
},
{
"epoch": 0.026136469709698497,
"grad_norm": 7.579209609335756,
"learning_rate": 8.536585365853658e-06,
"loss": 1.4315,
"step": 35
},
{
"epoch": 0.026883225987118453,
"grad_norm": 8.161772733498161,
"learning_rate": 8.78048780487805e-06,
"loss": 1.4156,
"step": 36
},
{
"epoch": 0.027629982264538413,
"grad_norm": 9.153242946428481,
"learning_rate": 9.02439024390244e-06,
"loss": 1.3437,
"step": 37
},
{
"epoch": 0.02837673854195837,
"grad_norm": 8.270074388479511,
"learning_rate": 9.268292682926831e-06,
"loss": 1.3882,
"step": 38
},
{
"epoch": 0.029123494819378325,
"grad_norm": 5.810086051514348,
"learning_rate": 9.51219512195122e-06,
"loss": 1.2423,
"step": 39
},
{
"epoch": 0.02987025109679828,
"grad_norm": 5.940688408073622,
"learning_rate": 9.756097560975611e-06,
"loss": 1.3012,
"step": 40
},
{
"epoch": 0.030617007374218238,
"grad_norm": 7.446040839269437,
"learning_rate": 1e-05,
"loss": 1.3854,
"step": 41
},
{
"epoch": 0.0313637636516382,
"grad_norm": 7.823717873679139,
"learning_rate": 9.999985354973661e-06,
"loss": 1.2707,
"step": 42
},
{
"epoch": 0.0313637636516382,
"eval_loss": 1.1517595052719116,
"eval_runtime": 160.6701,
"eval_samples_per_second": 112.211,
"eval_steps_per_second": 1.755,
"step": 42
},
{
"epoch": 0.032110519929058154,
"grad_norm": 5.646313932886119,
"learning_rate": 9.999941419980432e-06,
"loss": 1.2381,
"step": 43
},
{
"epoch": 0.03285727620647811,
"grad_norm": 6.520800231029172,
"learning_rate": 9.999868195277684e-06,
"loss": 1.2704,
"step": 44
},
{
"epoch": 0.033604032483898066,
"grad_norm": 5.235413753926133,
"learning_rate": 9.999765681294371e-06,
"loss": 1.2493,
"step": 45
},
{
"epoch": 0.03435078876131802,
"grad_norm": 8.080884030634016,
"learning_rate": 9.99963387863102e-06,
"loss": 1.2873,
"step": 46
},
{
"epoch": 0.03509754503873798,
"grad_norm": 7.188542456253142,
"learning_rate": 9.999472788059732e-06,
"loss": 1.3258,
"step": 47
},
{
"epoch": 0.03584430131615794,
"grad_norm": 5.530317330717678,
"learning_rate": 9.999282410524176e-06,
"loss": 1.2104,
"step": 48
},
{
"epoch": 0.0365910575935779,
"grad_norm": 5.429267063162806,
"learning_rate": 9.999062747139587e-06,
"loss": 1.2585,
"step": 49
},
{
"epoch": 0.037337813870997855,
"grad_norm": 6.8607568553163425,
"learning_rate": 9.998813799192756e-06,
"loss": 1.2217,
"step": 50
},
{
"epoch": 0.03808457014841781,
"grad_norm": 5.864542890549609,
"learning_rate": 9.99853556814202e-06,
"loss": 1.1748,
"step": 51
},
{
"epoch": 0.03883132642583777,
"grad_norm": 6.776573554690618,
"learning_rate": 9.998228055617264e-06,
"loss": 1.2681,
"step": 52
},
{
"epoch": 0.03957808270325772,
"grad_norm": 5.240453608787099,
"learning_rate": 9.997891263419896e-06,
"loss": 1.2825,
"step": 53
},
{
"epoch": 0.04032483898067768,
"grad_norm": 5.502379192462283,
"learning_rate": 9.997525193522848e-06,
"loss": 1.1202,
"step": 54
},
{
"epoch": 0.041071595258097636,
"grad_norm": 4.981330215088245,
"learning_rate": 9.997129848070563e-06,
"loss": 1.237,
"step": 55
},
{
"epoch": 0.04181835153551759,
"grad_norm": 5.658491502222876,
"learning_rate": 9.99670522937898e-06,
"loss": 1.1674,
"step": 56
},
{
"epoch": 0.04181835153551759,
"eval_loss": 1.1058392524719238,
"eval_runtime": 160.6229,
"eval_samples_per_second": 112.244,
"eval_steps_per_second": 1.756,
"step": 56
},
{
"epoch": 0.042565107812937555,
"grad_norm": 5.781043637294228,
"learning_rate": 9.996251339935517e-06,
"loss": 1.2731,
"step": 57
},
{
"epoch": 0.04331186409035751,
"grad_norm": 6.798923459655398,
"learning_rate": 9.995768182399063e-06,
"loss": 1.2602,
"step": 58
},
{
"epoch": 0.04405862036777747,
"grad_norm": 8.183844154095041,
"learning_rate": 9.995255759599963e-06,
"loss": 1.2871,
"step": 59
},
{
"epoch": 0.044805376645197424,
"grad_norm": 5.8833180570242805,
"learning_rate": 9.994714074539991e-06,
"loss": 1.2261,
"step": 60
},
{
"epoch": 0.04555213292261738,
"grad_norm": 5.9510640282981395,
"learning_rate": 9.99414313039235e-06,
"loss": 1.1705,
"step": 61
},
{
"epoch": 0.04629888920003734,
"grad_norm": 4.960391929203876,
"learning_rate": 9.993542930501629e-06,
"loss": 1.1892,
"step": 62
},
{
"epoch": 0.04704564547745729,
"grad_norm": 6.082371592489799,
"learning_rate": 9.99291347838381e-06,
"loss": 1.2665,
"step": 63
},
{
"epoch": 0.04779240175487725,
"grad_norm": 5.928756573070347,
"learning_rate": 9.992254777726231e-06,
"loss": 1.2466,
"step": 64
},
{
"epoch": 0.04853915803229721,
"grad_norm": 5.784351587819939,
"learning_rate": 9.991566832387564e-06,
"loss": 1.1332,
"step": 65
},
{
"epoch": 0.04928591430971717,
"grad_norm": 5.309638624254364,
"learning_rate": 9.990849646397803e-06,
"loss": 1.0934,
"step": 66
},
{
"epoch": 0.050032670587137125,
"grad_norm": 5.282903270237203,
"learning_rate": 9.99010322395823e-06,
"loss": 1.1775,
"step": 67
},
{
"epoch": 0.05077942686455708,
"grad_norm": 7.505150676696945,
"learning_rate": 9.989327569441395e-06,
"loss": 1.1281,
"step": 68
},
{
"epoch": 0.05152618314197704,
"grad_norm": 5.111300188914756,
"learning_rate": 9.988522687391092e-06,
"loss": 1.0796,
"step": 69
},
{
"epoch": 0.052272939419396994,
"grad_norm": 5.220282812165929,
"learning_rate": 9.987688582522325e-06,
"loss": 1.1937,
"step": 70
},
{
"epoch": 0.052272939419396994,
"eval_loss": 1.0714255571365356,
"eval_runtime": 160.6215,
"eval_samples_per_second": 112.245,
"eval_steps_per_second": 1.756,
"step": 70
},
{
"epoch": 0.05301969569681695,
"grad_norm": 6.120957463564528,
"learning_rate": 9.986825259721292e-06,
"loss": 1.0982,
"step": 71
},
{
"epoch": 0.053766451974236906,
"grad_norm": 4.980966853682578,
"learning_rate": 9.985932724045347e-06,
"loss": 1.1755,
"step": 72
},
{
"epoch": 0.05451320825165686,
"grad_norm": 4.331150227314127,
"learning_rate": 9.985010980722974e-06,
"loss": 1.103,
"step": 73
},
{
"epoch": 0.055259964529076826,
"grad_norm": 5.124677563712541,
"learning_rate": 9.984060035153752e-06,
"loss": 1.1304,
"step": 74
},
{
"epoch": 0.05600672080649678,
"grad_norm": 5.5037953572716685,
"learning_rate": 9.983079892908332e-06,
"loss": 1.0897,
"step": 75
},
{
"epoch": 0.05675347708391674,
"grad_norm": 6.6795743519930415,
"learning_rate": 9.982070559728398e-06,
"loss": 1.2371,
"step": 76
},
{
"epoch": 0.057500233361336694,
"grad_norm": 5.052137551270288,
"learning_rate": 9.981032041526635e-06,
"loss": 1.124,
"step": 77
},
{
"epoch": 0.05824698963875665,
"grad_norm": 4.687977226171484,
"learning_rate": 9.979964344386692e-06,
"loss": 1.151,
"step": 78
},
{
"epoch": 0.05899374591617661,
"grad_norm": 4.4520211232435125,
"learning_rate": 9.978867474563151e-06,
"loss": 1.1519,
"step": 79
},
{
"epoch": 0.05974050219359656,
"grad_norm": 3.8288574242884774,
"learning_rate": 9.977741438481487e-06,
"loss": 1.0624,
"step": 80
},
{
"epoch": 0.06048725847101652,
"grad_norm": 4.534421709411,
"learning_rate": 9.976586242738032e-06,
"loss": 1.1844,
"step": 81
},
{
"epoch": 0.061234014748436476,
"grad_norm": 3.884735997829129,
"learning_rate": 9.975401894099933e-06,
"loss": 1.1259,
"step": 82
},
{
"epoch": 0.06198077102585644,
"grad_norm": 5.654542687957219,
"learning_rate": 9.974188399505116e-06,
"loss": 1.0949,
"step": 83
},
{
"epoch": 0.0627275273032764,
"grad_norm": 11.863718915842355,
"learning_rate": 9.972945766062248e-06,
"loss": 1.1209,
"step": 84
},
{
"epoch": 0.0627275273032764,
"eval_loss": 1.0480048656463623,
"eval_runtime": 160.6234,
"eval_samples_per_second": 112.244,
"eval_steps_per_second": 1.756,
"step": 84
},
{
"epoch": 0.06347428358069634,
"grad_norm": 5.672730835621609,
"learning_rate": 9.971674001050687e-06,
"loss": 1.0811,
"step": 85
},
{
"epoch": 0.06422103985811631,
"grad_norm": 4.58816995647344,
"learning_rate": 9.970373111920447e-06,
"loss": 1.0538,
"step": 86
},
{
"epoch": 0.06496779613553627,
"grad_norm": 3.901498541644042,
"learning_rate": 9.969043106292149e-06,
"loss": 1.1502,
"step": 87
},
{
"epoch": 0.06571455241295622,
"grad_norm": 4.564513799166725,
"learning_rate": 9.96768399195698e-06,
"loss": 1.0951,
"step": 88
},
{
"epoch": 0.06646130869037618,
"grad_norm": 5.186730910602631,
"learning_rate": 9.966295776876648e-06,
"loss": 1.1669,
"step": 89
},
{
"epoch": 0.06720806496779613,
"grad_norm": 4.925917142678016,
"learning_rate": 9.96487846918333e-06,
"loss": 1.1522,
"step": 90
},
{
"epoch": 0.0679548212452161,
"grad_norm": 6.5957881769118405,
"learning_rate": 9.963432077179629e-06,
"loss": 1.1221,
"step": 91
},
{
"epoch": 0.06870157752263605,
"grad_norm": 4.099086784960917,
"learning_rate": 9.961956609338526e-06,
"loss": 1.1186,
"step": 92
},
{
"epoch": 0.06944833380005601,
"grad_norm": 3.70039872123764,
"learning_rate": 9.960452074303327e-06,
"loss": 1.1268,
"step": 93
},
{
"epoch": 0.07019509007747596,
"grad_norm": 4.785660260223015,
"learning_rate": 9.958918480887612e-06,
"loss": 1.11,
"step": 94
},
{
"epoch": 0.07094184635489592,
"grad_norm": 3.968060355634415,
"learning_rate": 9.957355838075188e-06,
"loss": 1.1706,
"step": 95
},
{
"epoch": 0.07168860263231588,
"grad_norm": 3.793655960096394,
"learning_rate": 9.955764155020037e-06,
"loss": 1.0165,
"step": 96
},
{
"epoch": 0.07243535890973583,
"grad_norm": 4.377595345492589,
"learning_rate": 9.95414344104625e-06,
"loss": 1.1269,
"step": 97
},
{
"epoch": 0.0731821151871558,
"grad_norm": 3.7820990485608044,
"learning_rate": 9.952493705647989e-06,
"loss": 1.1133,
"step": 98
},
{
"epoch": 0.0731821151871558,
"eval_loss": 1.0365121364593506,
"eval_runtime": 160.5554,
"eval_samples_per_second": 112.291,
"eval_steps_per_second": 1.756,
"step": 98
},
{
"epoch": 0.07392887146457575,
"grad_norm": 5.305536373865056,
"learning_rate": 9.950814958489421e-06,
"loss": 1.1091,
"step": 99
},
{
"epoch": 0.07467562774199571,
"grad_norm": 4.4015698622632895,
"learning_rate": 9.949107209404664e-06,
"loss": 1.1791,
"step": 100
},
{
"epoch": 0.07542238401941566,
"grad_norm": 4.975666896127531,
"learning_rate": 9.947370468397731e-06,
"loss": 1.1585,
"step": 101
},
{
"epoch": 0.07616914029683562,
"grad_norm": 3.7736499179128167,
"learning_rate": 9.94560474564247e-06,
"loss": 1.038,
"step": 102
},
{
"epoch": 0.07691589657425557,
"grad_norm": 4.240252244792403,
"learning_rate": 9.9438100514825e-06,
"loss": 1.0741,
"step": 103
},
{
"epoch": 0.07766265285167553,
"grad_norm": 6.410869819768315,
"learning_rate": 9.941986396431161e-06,
"loss": 1.1008,
"step": 104
},
{
"epoch": 0.0784094091290955,
"grad_norm": 3.9651365430008463,
"learning_rate": 9.940133791171445e-06,
"loss": 1.0818,
"step": 105
},
{
"epoch": 0.07915616540651545,
"grad_norm": 4.449129793463904,
"learning_rate": 9.938252246555929e-06,
"loss": 1.1769,
"step": 106
},
{
"epoch": 0.07990292168393541,
"grad_norm": 4.552135217930411,
"learning_rate": 9.936341773606723e-06,
"loss": 1.0777,
"step": 107
},
{
"epoch": 0.08064967796135536,
"grad_norm": 4.190554265440604,
"learning_rate": 9.9344023835154e-06,
"loss": 1.054,
"step": 108
},
{
"epoch": 0.08139643423877532,
"grad_norm": 4.16264776639165,
"learning_rate": 9.932434087642924e-06,
"loss": 1.0736,
"step": 109
},
{
"epoch": 0.08214319051619527,
"grad_norm": 5.68044977176061,
"learning_rate": 9.930436897519595e-06,
"loss": 1.0091,
"step": 110
},
{
"epoch": 0.08288994679361524,
"grad_norm": 3.0998166629146042,
"learning_rate": 9.928410824844974e-06,
"loss": 1.0378,
"step": 111
},
{
"epoch": 0.08363670307103518,
"grad_norm": 4.842084164752693,
"learning_rate": 9.926355881487815e-06,
"loss": 1.1629,
"step": 112
},
{
"epoch": 0.08363670307103518,
"eval_loss": 1.032652735710144,
"eval_runtime": 160.5893,
"eval_samples_per_second": 112.268,
"eval_steps_per_second": 1.756,
"step": 112
},
{
"epoch": 0.08438345934845515,
"grad_norm": 3.7500012310156094,
"learning_rate": 9.924272079485996e-06,
"loss": 1.1139,
"step": 113
},
{
"epoch": 0.08513021562587511,
"grad_norm": 3.6304218738612177,
"learning_rate": 9.922159431046457e-06,
"loss": 1.0562,
"step": 114
},
{
"epoch": 0.08587697190329506,
"grad_norm": 4.219419886285629,
"learning_rate": 9.920017948545109e-06,
"loss": 1.0334,
"step": 115
},
{
"epoch": 0.08662372818071502,
"grad_norm": 4.106591390142076,
"learning_rate": 9.91784764452678e-06,
"loss": 1.0092,
"step": 116
},
{
"epoch": 0.08737048445813497,
"grad_norm": 3.2149890199105893,
"learning_rate": 9.91564853170514e-06,
"loss": 1.0324,
"step": 117
},
{
"epoch": 0.08811724073555494,
"grad_norm": 4.303949996240386,
"learning_rate": 9.913420622962606e-06,
"loss": 1.0643,
"step": 118
},
{
"epoch": 0.08886399701297489,
"grad_norm": 3.475558336468595,
"learning_rate": 9.911163931350296e-06,
"loss": 1.0757,
"step": 119
},
{
"epoch": 0.08961075329039485,
"grad_norm": 3.812619171039642,
"learning_rate": 9.908878470087931e-06,
"loss": 1.1746,
"step": 120
},
{
"epoch": 0.0903575095678148,
"grad_norm": 3.990770542904442,
"learning_rate": 9.906564252563769e-06,
"loss": 1.0596,
"step": 121
},
{
"epoch": 0.09110426584523476,
"grad_norm": 3.8102423981670337,
"learning_rate": 9.904221292334521e-06,
"loss": 1.0186,
"step": 122
},
{
"epoch": 0.09185102212265472,
"grad_norm": 3.766965024133607,
"learning_rate": 9.901849603125271e-06,
"loss": 1.1101,
"step": 123
},
{
"epoch": 0.09259777840007467,
"grad_norm": 3.3405241849704788,
"learning_rate": 9.8994491988294e-06,
"loss": 0.9937,
"step": 124
},
{
"epoch": 0.09334453467749464,
"grad_norm": 3.9927196426527383,
"learning_rate": 9.897020093508502e-06,
"loss": 1.0596,
"step": 125
},
{
"epoch": 0.09409129095491459,
"grad_norm": 3.2836748843642254,
"learning_rate": 9.894562301392301e-06,
"loss": 1.0812,
"step": 126
},
{
"epoch": 0.09409129095491459,
"eval_loss": 1.0256747007369995,
"eval_runtime": 160.6424,
"eval_samples_per_second": 112.231,
"eval_steps_per_second": 1.755,
"step": 126
},
{
"epoch": 0.09483804723233455,
"grad_norm": 4.363380401494807,
"learning_rate": 9.89207583687857e-06,
"loss": 1.0794,
"step": 127
},
{
"epoch": 0.0955848035097545,
"grad_norm": 6.478735079038311,
"learning_rate": 9.889560714533043e-06,
"loss": 1.066,
"step": 128
},
{
"epoch": 0.09633155978717446,
"grad_norm": 3.5425697986871705,
"learning_rate": 9.887016949089334e-06,
"loss": 1.0932,
"step": 129
},
{
"epoch": 0.09707831606459442,
"grad_norm": 3.693076593064551,
"learning_rate": 9.884444555448848e-06,
"loss": 1.0519,
"step": 130
},
{
"epoch": 0.09782507234201437,
"grad_norm": 7.860038890324692,
"learning_rate": 9.881843548680694e-06,
"loss": 1.0848,
"step": 131
},
{
"epoch": 0.09857182861943434,
"grad_norm": 7.4142740851339335,
"learning_rate": 9.879213944021597e-06,
"loss": 1.1747,
"step": 132
},
{
"epoch": 0.09931858489685429,
"grad_norm": 3.3316594853280947,
"learning_rate": 9.876555756875807e-06,
"loss": 0.9748,
"step": 133
},
{
"epoch": 0.10006534117427425,
"grad_norm": 3.78741230296177,
"learning_rate": 9.873869002815015e-06,
"loss": 1.0438,
"step": 134
},
{
"epoch": 0.1008120974516942,
"grad_norm": 4.609277603093571,
"learning_rate": 9.871153697578254e-06,
"loss": 0.9756,
"step": 135
},
{
"epoch": 0.10155885372911416,
"grad_norm": 4.188743485193134,
"learning_rate": 9.86840985707181e-06,
"loss": 1.0559,
"step": 136
},
{
"epoch": 0.10230561000653411,
"grad_norm": 3.428561895633466,
"learning_rate": 9.86563749736913e-06,
"loss": 1.0102,
"step": 137
},
{
"epoch": 0.10305236628395407,
"grad_norm": 6.01857672692751,
"learning_rate": 9.862836634710724e-06,
"loss": 1.0943,
"step": 138
},
{
"epoch": 0.10379912256137404,
"grad_norm": 4.3281257428742785,
"learning_rate": 9.860007285504079e-06,
"loss": 1.1187,
"step": 139
},
{
"epoch": 0.10454587883879399,
"grad_norm": 3.7218504298177675,
"learning_rate": 9.85714946632355e-06,
"loss": 1.0917,
"step": 140
},
{
"epoch": 0.10454587883879399,
"eval_loss": 1.023650884628296,
"eval_runtime": 160.5345,
"eval_samples_per_second": 112.306,
"eval_steps_per_second": 1.757,
"step": 140
},
{
"epoch": 0.10529263511621395,
"grad_norm": 3.4489168662027176,
"learning_rate": 9.854263193910274e-06,
"loss": 1.1063,
"step": 141
},
{
"epoch": 0.1060393913936339,
"grad_norm": 4.423623706010156,
"learning_rate": 9.85134848517206e-06,
"loss": 1.0728,
"step": 142
},
{
"epoch": 0.10678614767105386,
"grad_norm": 2.8649958611873076,
"learning_rate": 9.84840535718331e-06,
"loss": 1.0078,
"step": 143
},
{
"epoch": 0.10753290394847381,
"grad_norm": 4.338171037781992,
"learning_rate": 9.845433827184894e-06,
"loss": 1.0381,
"step": 144
},
{
"epoch": 0.10827966022589378,
"grad_norm": 5.171977346386107,
"learning_rate": 9.842433912584066e-06,
"loss": 1.1569,
"step": 145
},
{
"epoch": 0.10902641650331373,
"grad_norm": 3.3107619410438662,
"learning_rate": 9.839405630954358e-06,
"loss": 1.086,
"step": 146
},
{
"epoch": 0.10977317278073369,
"grad_norm": 2.840297473307081,
"learning_rate": 9.836349000035477e-06,
"loss": 1.0401,
"step": 147
},
{
"epoch": 0.11051992905815365,
"grad_norm": 5.3601003369045594,
"learning_rate": 9.833264037733198e-06,
"loss": 1.1039,
"step": 148
},
{
"epoch": 0.1112666853355736,
"grad_norm": 4.4022528506550955,
"learning_rate": 9.83015076211926e-06,
"loss": 1.0417,
"step": 149
},
{
"epoch": 0.11201344161299356,
"grad_norm": 3.547705426654357,
"learning_rate": 9.827009191431271e-06,
"loss": 0.9775,
"step": 150
},
{
"epoch": 0.11276019789041351,
"grad_norm": 4.149903504313127,
"learning_rate": 9.823839344072582e-06,
"loss": 1.0646,
"step": 151
},
{
"epoch": 0.11350695416783348,
"grad_norm": 3.9437966555055355,
"learning_rate": 9.820641238612187e-06,
"loss": 1.1617,
"step": 152
},
{
"epoch": 0.11425371044525343,
"grad_norm": 3.443249729940194,
"learning_rate": 9.81741489378463e-06,
"loss": 1.0669,
"step": 153
},
{
"epoch": 0.11500046672267339,
"grad_norm": 3.975523331591033,
"learning_rate": 9.814160328489867e-06,
"loss": 1.0538,
"step": 154
},
{
"epoch": 0.11500046672267339,
"eval_loss": 1.0189063549041748,
"eval_runtime": 160.548,
"eval_samples_per_second": 112.297,
"eval_steps_per_second": 1.756,
"step": 154
},
{
"epoch": 0.11574722300009334,
"grad_norm": 6.747016403332581,
"learning_rate": 9.810877561793178e-06,
"loss": 1.0677,
"step": 155
},
{
"epoch": 0.1164939792775133,
"grad_norm": 3.502112711353132,
"learning_rate": 9.807566612925044e-06,
"loss": 1.1483,
"step": 156
},
{
"epoch": 0.11724073555493326,
"grad_norm": 4.033103295708667,
"learning_rate": 9.804227501281041e-06,
"loss": 1.0036,
"step": 157
},
{
"epoch": 0.11798749183235321,
"grad_norm": 3.353125159155293,
"learning_rate": 9.800860246421717e-06,
"loss": 1.0754,
"step": 158
},
{
"epoch": 0.11873424810977318,
"grad_norm": 5.233916333194442,
"learning_rate": 9.797464868072489e-06,
"loss": 1.0573,
"step": 159
},
{
"epoch": 0.11948100438719313,
"grad_norm": 4.551032759166754,
"learning_rate": 9.794041386123517e-06,
"loss": 1.1049,
"step": 160
},
{
"epoch": 0.12022776066461309,
"grad_norm": 4.518239629943308,
"learning_rate": 9.790589820629594e-06,
"loss": 1.024,
"step": 161
},
{
"epoch": 0.12097451694203304,
"grad_norm": 3.8193115164547984,
"learning_rate": 9.787110191810027e-06,
"loss": 1.13,
"step": 162
},
{
"epoch": 0.121721273219453,
"grad_norm": 3.7115307636722674,
"learning_rate": 9.783602520048524e-06,
"loss": 1.1001,
"step": 163
},
{
"epoch": 0.12246802949687295,
"grad_norm": 5.181860033123124,
"learning_rate": 9.780066825893055e-06,
"loss": 1.1233,
"step": 164
},
{
"epoch": 0.12321478577429291,
"grad_norm": 4.818202897694641,
"learning_rate": 9.776503130055758e-06,
"loss": 1.0216,
"step": 165
},
{
"epoch": 0.12396154205171288,
"grad_norm": 7.927525142394235,
"learning_rate": 9.7729114534128e-06,
"loss": 1.0463,
"step": 166
},
{
"epoch": 0.12470829832913283,
"grad_norm": 3.743721622719181,
"learning_rate": 9.76929181700426e-06,
"loss": 1.1261,
"step": 167
},
{
"epoch": 0.1254550546065528,
"grad_norm": 5.6431869998808875,
"learning_rate": 9.765644242034009e-06,
"loss": 1.0746,
"step": 168
},
{
"epoch": 0.1254550546065528,
"eval_loss": 1.017249345779419,
"eval_runtime": 160.6003,
"eval_samples_per_second": 112.26,
"eval_steps_per_second": 1.756,
"step": 168
},
{
"epoch": 0.12620181088397275,
"grad_norm": 3.131298638211881,
"learning_rate": 9.761968749869576e-06,
"loss": 0.9417,
"step": 169
},
{
"epoch": 0.1269485671613927,
"grad_norm": 7.862269405575784,
"learning_rate": 9.758265362042035e-06,
"loss": 1.1121,
"step": 170
},
{
"epoch": 0.12769532343881265,
"grad_norm": 3.640159123263889,
"learning_rate": 9.754534100245867e-06,
"loss": 1.0805,
"step": 171
},
{
"epoch": 0.12844207971623262,
"grad_norm": 4.5439716593057105,
"learning_rate": 9.750774986338851e-06,
"loss": 1.0338,
"step": 172
},
{
"epoch": 0.12918883599365258,
"grad_norm": 3.969721172469628,
"learning_rate": 9.746988042341907e-06,
"loss": 1.1218,
"step": 173
},
{
"epoch": 0.12993559227107254,
"grad_norm": 3.7519993464454453,
"learning_rate": 9.743173290438998e-06,
"loss": 1.0642,
"step": 174
},
{
"epoch": 0.13068234854849248,
"grad_norm": 3.9426723091303617,
"learning_rate": 9.739330752976981e-06,
"loss": 1.0608,
"step": 175
},
{
"epoch": 0.13142910482591244,
"grad_norm": 4.1886923692493285,
"learning_rate": 9.735460452465477e-06,
"loss": 1.1175,
"step": 176
},
{
"epoch": 0.1321758611033324,
"grad_norm": 3.7222485604926336,
"learning_rate": 9.731562411576751e-06,
"loss": 1.028,
"step": 177
},
{
"epoch": 0.13292261738075237,
"grad_norm": 3.3695163719588743,
"learning_rate": 9.727636653145567e-06,
"loss": 1.1298,
"step": 178
},
{
"epoch": 0.1336693736581723,
"grad_norm": 3.566733005393415,
"learning_rate": 9.723683200169059e-06,
"loss": 1.0149,
"step": 179
},
{
"epoch": 0.13441612993559227,
"grad_norm": 3.5062777269977223,
"learning_rate": 9.719702075806594e-06,
"loss": 1.0478,
"step": 180
},
{
"epoch": 0.13516288621301223,
"grad_norm": 4.408630041983221,
"learning_rate": 9.715693303379643e-06,
"loss": 1.0856,
"step": 181
},
{
"epoch": 0.1359096424904322,
"grad_norm": 4.602186459996313,
"learning_rate": 9.711656906371636e-06,
"loss": 1.0201,
"step": 182
},
{
"epoch": 0.1359096424904322,
"eval_loss": 1.0116474628448486,
"eval_runtime": 160.494,
"eval_samples_per_second": 112.334,
"eval_steps_per_second": 1.757,
"step": 182
},
{
"epoch": 0.13665639876785216,
"grad_norm": 4.712040307247732,
"learning_rate": 9.70759290842783e-06,
"loss": 1.0098,
"step": 183
},
{
"epoch": 0.1374031550452721,
"grad_norm": 5.620657515416197,
"learning_rate": 9.703501333355167e-06,
"loss": 1.0496,
"step": 184
},
{
"epoch": 0.13814991132269205,
"grad_norm": 5.801872955784515,
"learning_rate": 9.699382205122138e-06,
"loss": 1.1084,
"step": 185
},
{
"epoch": 0.13889666760011202,
"grad_norm": 3.8516216420714504,
"learning_rate": 9.695235547858638e-06,
"loss": 1.061,
"step": 186
},
{
"epoch": 0.13964342387753198,
"grad_norm": 3.8555280916375785,
"learning_rate": 9.69106138585583e-06,
"loss": 1.0538,
"step": 187
},
{
"epoch": 0.14039018015495192,
"grad_norm": 3.726076167412021,
"learning_rate": 9.686859743565997e-06,
"loss": 1.0334,
"step": 188
},
{
"epoch": 0.14113693643237188,
"grad_norm": 4.733689742037587,
"learning_rate": 9.682630645602409e-06,
"loss": 1.081,
"step": 189
},
{
"epoch": 0.14188369270979184,
"grad_norm": 3.858770100544451,
"learning_rate": 9.678374116739159e-06,
"loss": 1.0443,
"step": 190
},
{
"epoch": 0.1426304489872118,
"grad_norm": 4.070999582637023,
"learning_rate": 9.674090181911044e-06,
"loss": 1.1021,
"step": 191
},
{
"epoch": 0.14337720526463177,
"grad_norm": 4.5872247594173805,
"learning_rate": 9.669778866213397e-06,
"loss": 1.1395,
"step": 192
},
{
"epoch": 0.1441239615420517,
"grad_norm": 4.031163577626464,
"learning_rate": 9.665440194901951e-06,
"loss": 1.0292,
"step": 193
},
{
"epoch": 0.14487071781947167,
"grad_norm": 3.52474459878042,
"learning_rate": 9.661074193392689e-06,
"loss": 0.958,
"step": 194
},
{
"epoch": 0.14561747409689163,
"grad_norm": 3.436050793052391,
"learning_rate": 9.656680887261693e-06,
"loss": 0.986,
"step": 195
},
{
"epoch": 0.1463642303743116,
"grad_norm": 5.0419620789744295,
"learning_rate": 9.652260302244996e-06,
"loss": 1.0678,
"step": 196
},
{
"epoch": 0.1463642303743116,
"eval_loss": 1.0114675760269165,
"eval_runtime": 160.5029,
"eval_samples_per_second": 112.328,
"eval_steps_per_second": 1.757,
"step": 196
},
{
"epoch": 0.14711098665173153,
"grad_norm": 3.6147654687509694,
"learning_rate": 9.647812464238434e-06,
"loss": 1.028,
"step": 197
},
{
"epoch": 0.1478577429291515,
"grad_norm": 3.586767777623825,
"learning_rate": 9.643337399297485e-06,
"loss": 1.0449,
"step": 198
},
{
"epoch": 0.14860449920657146,
"grad_norm": 3.715503317019716,
"learning_rate": 9.638835133637129e-06,
"loss": 1.0655,
"step": 199
},
{
"epoch": 0.14935125548399142,
"grad_norm": 4.532690290017325,
"learning_rate": 9.634305693631686e-06,
"loss": 1.0047,
"step": 200
},
{
"epoch": 0.15009801176141138,
"grad_norm": 4.260013712201999,
"learning_rate": 9.629749105814664e-06,
"loss": 1.0509,
"step": 201
},
{
"epoch": 0.15084476803883132,
"grad_norm": 3.6166187921664794,
"learning_rate": 9.625165396878599e-06,
"loss": 1.0305,
"step": 202
},
{
"epoch": 0.15159152431625128,
"grad_norm": 4.163490064044642,
"learning_rate": 9.62055459367491e-06,
"loss": 1.093,
"step": 203
},
{
"epoch": 0.15233828059367124,
"grad_norm": 13.019898994869333,
"learning_rate": 9.615916723213728e-06,
"loss": 1.1112,
"step": 204
},
{
"epoch": 0.1530850368710912,
"grad_norm": 3.175945878707496,
"learning_rate": 9.611251812663748e-06,
"loss": 1.0688,
"step": 205
},
{
"epoch": 0.15383179314851114,
"grad_norm": 4.8715801343243195,
"learning_rate": 9.606559889352065e-06,
"loss": 1.0823,
"step": 206
},
{
"epoch": 0.1545785494259311,
"grad_norm": 3.232522890237694,
"learning_rate": 9.601840980764016e-06,
"loss": 0.9584,
"step": 207
},
{
"epoch": 0.15532530570335107,
"grad_norm": 4.556117894865469,
"learning_rate": 9.597095114543018e-06,
"loss": 1.0848,
"step": 208
},
{
"epoch": 0.15607206198077103,
"grad_norm": 3.404791411157451,
"learning_rate": 9.592322318490404e-06,
"loss": 1.0357,
"step": 209
},
{
"epoch": 0.156818818258191,
"grad_norm": 3.056791925639437,
"learning_rate": 9.587522620565263e-06,
"loss": 0.915,
"step": 210
},
{
"epoch": 0.156818818258191,
"eval_loss": 1.0087823867797852,
"eval_runtime": 160.6551,
"eval_samples_per_second": 112.222,
"eval_steps_per_second": 1.755,
"step": 210
},
{
"epoch": 0.15756557453561093,
"grad_norm": 3.7714924940308987,
"learning_rate": 9.582696048884277e-06,
"loss": 1.1256,
"step": 211
},
{
"epoch": 0.1583123308130309,
"grad_norm": 3.26219138631936,
"learning_rate": 9.577842631721553e-06,
"loss": 1.0654,
"step": 212
},
{
"epoch": 0.15905908709045086,
"grad_norm": 3.435138396589902,
"learning_rate": 9.57296239750846e-06,
"loss": 1.0228,
"step": 213
},
{
"epoch": 0.15980584336787082,
"grad_norm": 5.346686608096492,
"learning_rate": 9.568055374833463e-06,
"loss": 1.0289,
"step": 214
},
{
"epoch": 0.16055259964529076,
"grad_norm": 3.9966819427916347,
"learning_rate": 9.563121592441949e-06,
"loss": 1.1345,
"step": 215
},
{
"epoch": 0.16129935592271072,
"grad_norm": 5.555416407810244,
"learning_rate": 9.558161079236073e-06,
"loss": 1.1554,
"step": 216
},
{
"epoch": 0.16204611220013068,
"grad_norm": 4.159577619579252,
"learning_rate": 9.553173864274567e-06,
"loss": 1.0403,
"step": 217
},
{
"epoch": 0.16279286847755065,
"grad_norm": 3.318000661865423,
"learning_rate": 9.548159976772593e-06,
"loss": 1.0505,
"step": 218
},
{
"epoch": 0.1635396247549706,
"grad_norm": 3.2489399076259566,
"learning_rate": 9.543119446101556e-06,
"loss": 1.0231,
"step": 219
},
{
"epoch": 0.16428638103239054,
"grad_norm": 3.4135688094134933,
"learning_rate": 9.538052301788937e-06,
"loss": 1.1055,
"step": 220
},
{
"epoch": 0.1650331373098105,
"grad_norm": 3.268136264861273,
"learning_rate": 9.532958573518121e-06,
"loss": 0.9477,
"step": 221
},
{
"epoch": 0.16577989358723047,
"grad_norm": 3.5039317588782195,
"learning_rate": 9.527838291128222e-06,
"loss": 0.9773,
"step": 222
},
{
"epoch": 0.16652664986465043,
"grad_norm": 3.7195766398408088,
"learning_rate": 9.52269148461391e-06,
"loss": 1.0539,
"step": 223
},
{
"epoch": 0.16727340614207037,
"grad_norm": 3.5220614370608154,
"learning_rate": 9.51751818412523e-06,
"loss": 1.0342,
"step": 224
},
{
"epoch": 0.16727340614207037,
"eval_loss": 1.0059709548950195,
"eval_runtime": 160.5464,
"eval_samples_per_second": 112.298,
"eval_steps_per_second": 1.757,
"step": 224
},
{
"epoch": 0.16802016241949033,
"grad_norm": 4.075160908849732,
"learning_rate": 9.512318419967427e-06,
"loss": 1.0168,
"step": 225
},
{
"epoch": 0.1687669186969103,
"grad_norm": 2.9114360706072664,
"learning_rate": 9.507092222600783e-06,
"loss": 1.0028,
"step": 226
},
{
"epoch": 0.16951367497433026,
"grad_norm": 4.132080117167401,
"learning_rate": 9.50183962264041e-06,
"loss": 1.0656,
"step": 227
},
{
"epoch": 0.17026043125175022,
"grad_norm": 3.672183049449128,
"learning_rate": 9.496560650856097e-06,
"loss": 1.0088,
"step": 228
},
{
"epoch": 0.17100718752917016,
"grad_norm": 3.4549536917783943,
"learning_rate": 9.491255338172116e-06,
"loss": 1.0091,
"step": 229
},
{
"epoch": 0.17175394380659012,
"grad_norm": 5.640659404596868,
"learning_rate": 9.485923715667043e-06,
"loss": 1.1446,
"step": 230
},
{
"epoch": 0.17250070008401008,
"grad_norm": 3.6480354854741193,
"learning_rate": 9.48056581457358e-06,
"loss": 1.0238,
"step": 231
},
{
"epoch": 0.17324745636143005,
"grad_norm": 3.236903605406026,
"learning_rate": 9.47518166627837e-06,
"loss": 0.943,
"step": 232
},
{
"epoch": 0.17399421263884998,
"grad_norm": 4.153416002763863,
"learning_rate": 9.469771302321806e-06,
"loss": 1.0034,
"step": 233
},
{
"epoch": 0.17474096891626995,
"grad_norm": 3.6956623376560973,
"learning_rate": 9.464334754397861e-06,
"loss": 1.0605,
"step": 234
},
{
"epoch": 0.1754877251936899,
"grad_norm": 3.206378764325457,
"learning_rate": 9.458872054353888e-06,
"loss": 1.008,
"step": 235
},
{
"epoch": 0.17623448147110987,
"grad_norm": 5.157203956458338,
"learning_rate": 9.453383234190443e-06,
"loss": 1.085,
"step": 236
},
{
"epoch": 0.17698123774852983,
"grad_norm": 4.037061991227878,
"learning_rate": 9.44786832606109e-06,
"loss": 1.0144,
"step": 237
},
{
"epoch": 0.17772799402594977,
"grad_norm": 4.908695178645077,
"learning_rate": 9.44232736227222e-06,
"loss": 1.0413,
"step": 238
},
{
"epoch": 0.17772799402594977,
"eval_loss": 1.0027769804000854,
"eval_runtime": 160.4435,
"eval_samples_per_second": 112.37,
"eval_steps_per_second": 1.758,
"step": 238
},
{
"epoch": 0.17847475030336973,
"grad_norm": 4.237247843930998,
"learning_rate": 9.436760375282858e-06,
"loss": 1.1329,
"step": 239
},
{
"epoch": 0.1792215065807897,
"grad_norm": 3.13851110753343,
"learning_rate": 9.431167397704473e-06,
"loss": 1.0213,
"step": 240
},
{
"epoch": 0.17996826285820966,
"grad_norm": 4.018367507468041,
"learning_rate": 9.425548462300784e-06,
"loss": 1.0049,
"step": 241
},
{
"epoch": 0.1807150191356296,
"grad_norm": 3.847188380013737,
"learning_rate": 9.419903601987577e-06,
"loss": 0.9974,
"step": 242
},
{
"epoch": 0.18146177541304956,
"grad_norm": 3.6783145957465275,
"learning_rate": 9.414232849832501e-06,
"loss": 1.0258,
"step": 243
},
{
"epoch": 0.18220853169046952,
"grad_norm": 4.133477837431737,
"learning_rate": 9.408536239054881e-06,
"loss": 1.0285,
"step": 244
},
{
"epoch": 0.18295528796788949,
"grad_norm": 3.568010560178771,
"learning_rate": 9.402813803025526e-06,
"loss": 1.0521,
"step": 245
},
{
"epoch": 0.18370204424530945,
"grad_norm": 2.808739846372454,
"learning_rate": 9.397065575266524e-06,
"loss": 0.9989,
"step": 246
},
{
"epoch": 0.18444880052272938,
"grad_norm": 3.07097954551416,
"learning_rate": 9.391291589451056e-06,
"loss": 0.9804,
"step": 247
},
{
"epoch": 0.18519555680014935,
"grad_norm": 3.2507539771287566,
"learning_rate": 9.38549187940319e-06,
"loss": 1.0255,
"step": 248
},
{
"epoch": 0.1859423130775693,
"grad_norm": 2.936663911951155,
"learning_rate": 9.379666479097688e-06,
"loss": 1.0129,
"step": 249
},
{
"epoch": 0.18668906935498927,
"grad_norm": 4.046385332301949,
"learning_rate": 9.373815422659806e-06,
"loss": 0.9575,
"step": 250
},
{
"epoch": 0.1874358256324092,
"grad_norm": 3.1585373599718625,
"learning_rate": 9.3679387443651e-06,
"loss": 1.0272,
"step": 251
},
{
"epoch": 0.18818258190982917,
"grad_norm": 3.5612606806923277,
"learning_rate": 9.362036478639206e-06,
"loss": 1.0497,
"step": 252
},
{
"epoch": 0.18818258190982917,
"eval_loss": 1.000356912612915,
"eval_runtime": 160.5493,
"eval_samples_per_second": 112.296,
"eval_steps_per_second": 1.756,
"step": 252
},
{
"epoch": 0.18892933818724914,
"grad_norm": 3.4215406870010567,
"learning_rate": 9.356108660057662e-06,
"loss": 1.0153,
"step": 253
},
{
"epoch": 0.1896760944646691,
"grad_norm": 6.549260486707147,
"learning_rate": 9.35015532334569e-06,
"loss": 1.0474,
"step": 254
},
{
"epoch": 0.19042285074208906,
"grad_norm": 3.7221855827438604,
"learning_rate": 9.344176503378003e-06,
"loss": 1.0034,
"step": 255
},
{
"epoch": 0.191169607019509,
"grad_norm": 4.286172100728777,
"learning_rate": 9.33817223517859e-06,
"loss": 1.1106,
"step": 256
},
{
"epoch": 0.19191636329692896,
"grad_norm": 3.7718076814034602,
"learning_rate": 9.332142553920513e-06,
"loss": 1.004,
"step": 257
},
{
"epoch": 0.19266311957434892,
"grad_norm": 4.027126629991061,
"learning_rate": 9.326087494925715e-06,
"loss": 1.0292,
"step": 258
},
{
"epoch": 0.1934098758517689,
"grad_norm": 4.013082273454176,
"learning_rate": 9.32000709366479e-06,
"loss": 1.0707,
"step": 259
},
{
"epoch": 0.19415663212918885,
"grad_norm": 3.6414888535002388,
"learning_rate": 9.313901385756794e-06,
"loss": 1.0261,
"step": 260
},
{
"epoch": 0.19490338840660879,
"grad_norm": 3.1393801326018043,
"learning_rate": 9.307770406969032e-06,
"loss": 1.0124,
"step": 261
},
{
"epoch": 0.19565014468402875,
"grad_norm": 4.255795797008218,
"learning_rate": 9.301614193216837e-06,
"loss": 1.0726,
"step": 262
},
{
"epoch": 0.1963969009614487,
"grad_norm": 3.2995671562106095,
"learning_rate": 9.295432780563378e-06,
"loss": 0.9887,
"step": 263
},
{
"epoch": 0.19714365723886867,
"grad_norm": 3.023791866548189,
"learning_rate": 9.289226205219432e-06,
"loss": 0.9578,
"step": 264
},
{
"epoch": 0.1978904135162886,
"grad_norm": 2.9272861770466543,
"learning_rate": 9.282994503543185e-06,
"loss": 0.917,
"step": 265
},
{
"epoch": 0.19863716979370857,
"grad_norm": 3.9852539375546447,
"learning_rate": 9.276737712040012e-06,
"loss": 1.0247,
"step": 266
},
{
"epoch": 0.19863716979370857,
"eval_loss": 0.9999991059303284,
"eval_runtime": 160.5983,
"eval_samples_per_second": 112.261,
"eval_steps_per_second": 1.756,
"step": 266
},
{
"epoch": 0.19938392607112854,
"grad_norm": 4.4042754214389594,
"learning_rate": 9.270455867362262e-06,
"loss": 1.0177,
"step": 267
},
{
"epoch": 0.2001306823485485,
"grad_norm": 3.680747210357818,
"learning_rate": 9.264149006309048e-06,
"loss": 1.0334,
"step": 268
},
{
"epoch": 0.20087743862596846,
"grad_norm": 3.8140169790142218,
"learning_rate": 9.257817165826027e-06,
"loss": 1.0524,
"step": 269
},
{
"epoch": 0.2016241949033884,
"grad_norm": 5.114797128136089,
"learning_rate": 9.251460383005188e-06,
"loss": 1.0574,
"step": 270
},
{
"epoch": 0.20237095118080836,
"grad_norm": 3.1878689642025995,
"learning_rate": 9.245078695084632e-06,
"loss": 1.029,
"step": 271
},
{
"epoch": 0.20311770745822832,
"grad_norm": 3.8148562988799295,
"learning_rate": 9.238672139448354e-06,
"loss": 1.0469,
"step": 272
},
{
"epoch": 0.2038644637356483,
"grad_norm": 3.767249082048484,
"learning_rate": 9.232240753626027e-06,
"loss": 1.0501,
"step": 273
},
{
"epoch": 0.20461122001306822,
"grad_norm": 2.5236070719630885,
"learning_rate": 9.225784575292772e-06,
"loss": 0.9713,
"step": 274
},
{
"epoch": 0.2053579762904882,
"grad_norm": 4.582403432492713,
"learning_rate": 9.219303642268953e-06,
"loss": 1.0448,
"step": 275
},
{
"epoch": 0.20610473256790815,
"grad_norm": 2.810180928314182,
"learning_rate": 9.212797992519942e-06,
"loss": 0.963,
"step": 276
},
{
"epoch": 0.2068514888453281,
"grad_norm": 3.2399813947929834,
"learning_rate": 9.206267664155906e-06,
"loss": 0.9717,
"step": 277
},
{
"epoch": 0.20759824512274808,
"grad_norm": 4.912297938186508,
"learning_rate": 9.199712695431577e-06,
"loss": 1.1234,
"step": 278
},
{
"epoch": 0.208345001400168,
"grad_norm": 3.0084002580336304,
"learning_rate": 9.193133124746029e-06,
"loss": 1.0269,
"step": 279
},
{
"epoch": 0.20909175767758797,
"grad_norm": 3.760705263479795,
"learning_rate": 9.186528990642456e-06,
"loss": 1.0662,
"step": 280
},
{
"epoch": 0.20909175767758797,
"eval_loss": 0.9944674372673035,
"eval_runtime": 160.5401,
"eval_samples_per_second": 112.302,
"eval_steps_per_second": 1.757,
"step": 280
},
{
"epoch": 0.20983851395500794,
"grad_norm": 3.3378272588505196,
"learning_rate": 9.179900331807949e-06,
"loss": 1.0086,
"step": 281
},
{
"epoch": 0.2105852702324279,
"grad_norm": 3.4975415402391614,
"learning_rate": 9.173247187073258e-06,
"loss": 1.0851,
"step": 282
},
{
"epoch": 0.21133202650984784,
"grad_norm": 3.469930363846197,
"learning_rate": 9.166569595412576e-06,
"loss": 0.9715,
"step": 283
},
{
"epoch": 0.2120787827872678,
"grad_norm": 3.2839874663262734,
"learning_rate": 9.159867595943305e-06,
"loss": 1.0602,
"step": 284
},
{
"epoch": 0.21282553906468776,
"grad_norm": 5.543480811563083,
"learning_rate": 9.153141227925828e-06,
"loss": 1.0608,
"step": 285
},
{
"epoch": 0.21357229534210773,
"grad_norm": 2.8606552784980463,
"learning_rate": 9.146390530763281e-06,
"loss": 1.056,
"step": 286
},
{
"epoch": 0.2143190516195277,
"grad_norm": 3.948112135838377,
"learning_rate": 9.139615544001319e-06,
"loss": 1.1039,
"step": 287
},
{
"epoch": 0.21506580789694763,
"grad_norm": 3.2663451420355787,
"learning_rate": 9.132816307327886e-06,
"loss": 1.0605,
"step": 288
},
{
"epoch": 0.2158125641743676,
"grad_norm": 3.6151154254671596,
"learning_rate": 9.125992860572979e-06,
"loss": 1.0952,
"step": 289
},
{
"epoch": 0.21655932045178755,
"grad_norm": 4.912648681546459,
"learning_rate": 9.119145243708425e-06,
"loss": 1.0311,
"step": 290
},
{
"epoch": 0.21730607672920751,
"grad_norm": 4.21191863900281,
"learning_rate": 9.112273496847633e-06,
"loss": 1.1001,
"step": 291
},
{
"epoch": 0.21805283300662745,
"grad_norm": 3.3815445362628336,
"learning_rate": 9.10537766024537e-06,
"loss": 1.0159,
"step": 292
},
{
"epoch": 0.2187995892840474,
"grad_norm": 3.0735326892001917,
"learning_rate": 9.09845777429752e-06,
"loss": 0.9702,
"step": 293
},
{
"epoch": 0.21954634556146738,
"grad_norm": 4.053765796563824,
"learning_rate": 9.091513879540845e-06,
"loss": 0.9826,
"step": 294
},
{
"epoch": 0.21954634556146738,
"eval_loss": 0.9944608807563782,
"eval_runtime": 160.5629,
"eval_samples_per_second": 112.286,
"eval_steps_per_second": 1.756,
"step": 294
},
{
"epoch": 0.22029310183888734,
"grad_norm": 4.865522980455722,
"learning_rate": 9.084546016652758e-06,
"loss": 1.1022,
"step": 295
},
{
"epoch": 0.2210398581163073,
"grad_norm": 3.747759280503234,
"learning_rate": 9.07755422645107e-06,
"loss": 1.0149,
"step": 296
},
{
"epoch": 0.22178661439372724,
"grad_norm": 2.886224555660969,
"learning_rate": 9.070538549893762e-06,
"loss": 0.9359,
"step": 297
},
{
"epoch": 0.2225333706711472,
"grad_norm": 3.707336145864212,
"learning_rate": 9.063499028078742e-06,
"loss": 1.0696,
"step": 298
},
{
"epoch": 0.22328012694856716,
"grad_norm": 2.918913323052194,
"learning_rate": 9.056435702243601e-06,
"loss": 0.9829,
"step": 299
},
{
"epoch": 0.22402688322598713,
"grad_norm": 3.5655254434314485,
"learning_rate": 9.049348613765379e-06,
"loss": 1.1198,
"step": 300
},
{
"epoch": 0.22477363950340706,
"grad_norm": 3.87864421227377,
"learning_rate": 9.042237804160313e-06,
"loss": 1.045,
"step": 301
},
{
"epoch": 0.22552039578082703,
"grad_norm": 5.860458069015467,
"learning_rate": 9.035103315083603e-06,
"loss": 1.0307,
"step": 302
},
{
"epoch": 0.226267152058247,
"grad_norm": 3.383434386692763,
"learning_rate": 9.027945188329157e-06,
"loss": 0.989,
"step": 303
},
{
"epoch": 0.22701390833566695,
"grad_norm": 3.65424563045076,
"learning_rate": 9.020763465829361e-06,
"loss": 1.025,
"step": 304
},
{
"epoch": 0.22776066461308692,
"grad_norm": 3.6361185953370856,
"learning_rate": 9.013558189654819e-06,
"loss": 0.9718,
"step": 305
},
{
"epoch": 0.22850742089050685,
"grad_norm": 4.603618291504726,
"learning_rate": 9.006329402014115e-06,
"loss": 0.976,
"step": 306
},
{
"epoch": 0.22925417716792681,
"grad_norm": 4.4620408030223775,
"learning_rate": 8.999077145253564e-06,
"loss": 1.0732,
"step": 307
},
{
"epoch": 0.23000093344534678,
"grad_norm": 4.541176700346841,
"learning_rate": 8.991801461856961e-06,
"loss": 1.067,
"step": 308
},
{
"epoch": 0.23000093344534678,
"eval_loss": 0.9934854507446289,
"eval_runtime": 160.5833,
"eval_samples_per_second": 112.272,
"eval_steps_per_second": 1.756,
"step": 308
},
{
"epoch": 0.23074768972276674,
"grad_norm": 3.195979915169788,
"learning_rate": 8.984502394445338e-06,
"loss": 1.0315,
"step": 309
},
{
"epoch": 0.23149444600018668,
"grad_norm": 3.835832525500061,
"learning_rate": 8.977179985776707e-06,
"loss": 1.1007,
"step": 310
},
{
"epoch": 0.23224120227760664,
"grad_norm": 5.0483013462475075,
"learning_rate": 8.969834278745817e-06,
"loss": 1.0427,
"step": 311
},
{
"epoch": 0.2329879585550266,
"grad_norm": 4.252092405014885,
"learning_rate": 8.962465316383894e-06,
"loss": 1.0672,
"step": 312
},
{
"epoch": 0.23373471483244657,
"grad_norm": 3.3931880091176083,
"learning_rate": 8.955073141858401e-06,
"loss": 1.0445,
"step": 313
},
{
"epoch": 0.23448147110986653,
"grad_norm": 3.7519330808807987,
"learning_rate": 8.94765779847277e-06,
"loss": 1.0875,
"step": 314
},
{
"epoch": 0.23522822738728646,
"grad_norm": 3.404074995155419,
"learning_rate": 8.940219329666167e-06,
"loss": 1.0125,
"step": 315
},
{
"epoch": 0.23597498366470643,
"grad_norm": 3.978543801899103,
"learning_rate": 8.932757779013214e-06,
"loss": 1.0249,
"step": 316
},
{
"epoch": 0.2367217399421264,
"grad_norm": 3.7272416239455763,
"learning_rate": 8.925273190223756e-06,
"loss": 1.089,
"step": 317
},
{
"epoch": 0.23746849621954635,
"grad_norm": 3.8636012935517448,
"learning_rate": 8.917765607142594e-06,
"loss": 1.0153,
"step": 318
},
{
"epoch": 0.2382152524969663,
"grad_norm": 3.2747280611352894,
"learning_rate": 8.910235073749226e-06,
"loss": 1.0644,
"step": 319
},
{
"epoch": 0.23896200877438625,
"grad_norm": 5.279253406480692,
"learning_rate": 8.9026816341576e-06,
"loss": 1.0308,
"step": 320
},
{
"epoch": 0.23970876505180622,
"grad_norm": 3.11287763067271,
"learning_rate": 8.895105332615841e-06,
"loss": 1.0005,
"step": 321
},
{
"epoch": 0.24045552132922618,
"grad_norm": 4.973692199255152,
"learning_rate": 8.887506213506005e-06,
"loss": 1.0565,
"step": 322
},
{
"epoch": 0.24045552132922618,
"eval_loss": 0.9922440648078918,
"eval_runtime": 160.7313,
"eval_samples_per_second": 112.169,
"eval_steps_per_second": 1.754,
"step": 322
},
{
"epoch": 0.24120227760664614,
"grad_norm": 3.1961095681046605,
"learning_rate": 8.879884321343813e-06,
"loss": 1.0421,
"step": 323
},
{
"epoch": 0.24194903388406608,
"grad_norm": 3.4689654047952816,
"learning_rate": 8.872239700778387e-06,
"loss": 1.0353,
"step": 324
},
{
"epoch": 0.24269579016148604,
"grad_norm": 3.316236552129336,
"learning_rate": 8.864572396591996e-06,
"loss": 1.0268,
"step": 325
},
{
"epoch": 0.243442546438906,
"grad_norm": 4.010944666361697,
"learning_rate": 8.856882453699789e-06,
"loss": 1.0012,
"step": 326
},
{
"epoch": 0.24418930271632597,
"grad_norm": 3.6005519269255504,
"learning_rate": 8.849169917149532e-06,
"loss": 1.0483,
"step": 327
},
{
"epoch": 0.2449360589937459,
"grad_norm": 3.039548950649496,
"learning_rate": 8.841434832121345e-06,
"loss": 1.053,
"step": 328
},
{
"epoch": 0.24568281527116587,
"grad_norm": 4.617344719406804,
"learning_rate": 8.833677243927439e-06,
"loss": 0.9565,
"step": 329
},
{
"epoch": 0.24642957154858583,
"grad_norm": 3.0892815649842986,
"learning_rate": 8.825897198011847e-06,
"loss": 1.0735,
"step": 330
},
{
"epoch": 0.2471763278260058,
"grad_norm": 3.675909285421093,
"learning_rate": 8.818094739950157e-06,
"loss": 1.0462,
"step": 331
},
{
"epoch": 0.24792308410342576,
"grad_norm": 3.927754317009532,
"learning_rate": 8.810269915449255e-06,
"loss": 1.114,
"step": 332
},
{
"epoch": 0.2486698403808457,
"grad_norm": 2.990101012798793,
"learning_rate": 8.802422770347044e-06,
"loss": 1.0208,
"step": 333
},
{
"epoch": 0.24941659665826565,
"grad_norm": 3.3346146894348956,
"learning_rate": 8.79455335061218e-06,
"loss": 0.978,
"step": 334
},
{
"epoch": 0.2501633529356856,
"grad_norm": 3.319351140293654,
"learning_rate": 8.786661702343811e-06,
"loss": 1.0032,
"step": 335
},
{
"epoch": 0.2509101092131056,
"grad_norm": 3.1815233823997775,
"learning_rate": 8.778747871771293e-06,
"loss": 0.9834,
"step": 336
},
{
"epoch": 0.2509101092131056,
"eval_loss": 0.9869978427886963,
"eval_runtime": 160.5665,
"eval_samples_per_second": 112.284,
"eval_steps_per_second": 1.756,
"step": 336
},
{
"epoch": 0.2516568654905255,
"grad_norm": 3.8614964239023575,
"learning_rate": 8.770811905253929e-06,
"loss": 0.9907,
"step": 337
},
{
"epoch": 0.2524036217679455,
"grad_norm": 4.010000964702555,
"learning_rate": 8.762853849280692e-06,
"loss": 0.9913,
"step": 338
},
{
"epoch": 0.25315037804536544,
"grad_norm": 3.390634860886564,
"learning_rate": 8.754873750469964e-06,
"loss": 1.0259,
"step": 339
},
{
"epoch": 0.2538971343227854,
"grad_norm": 3.889606894130351,
"learning_rate": 8.746871655569245e-06,
"loss": 1.1029,
"step": 340
},
{
"epoch": 0.25464389060020537,
"grad_norm": 5.308002153451313,
"learning_rate": 8.738847611454887e-06,
"loss": 1.0702,
"step": 341
},
{
"epoch": 0.2553906468776253,
"grad_norm": 3.588824400370797,
"learning_rate": 8.730801665131831e-06,
"loss": 1.0099,
"step": 342
},
{
"epoch": 0.2561374031550453,
"grad_norm": 5.807491833529212,
"learning_rate": 8.722733863733314e-06,
"loss": 1.1411,
"step": 343
},
{
"epoch": 0.25688415943246523,
"grad_norm": 3.2267656811014396,
"learning_rate": 8.714644254520599e-06,
"loss": 1.0253,
"step": 344
},
{
"epoch": 0.25763091570988517,
"grad_norm": 4.5457930141059,
"learning_rate": 8.706532884882704e-06,
"loss": 1.1174,
"step": 345
},
{
"epoch": 0.25837767198730516,
"grad_norm": 10.348057960352017,
"learning_rate": 8.698399802336117e-06,
"loss": 0.9912,
"step": 346
},
{
"epoch": 0.2591244282647251,
"grad_norm": 3.066110108501509,
"learning_rate": 8.690245054524522e-06,
"loss": 1.0336,
"step": 347
},
{
"epoch": 0.2598711845421451,
"grad_norm": 3.370718413246445,
"learning_rate": 8.682068689218517e-06,
"loss": 1.0249,
"step": 348
},
{
"epoch": 0.260617940819565,
"grad_norm": 3.1496821083014184,
"learning_rate": 8.673870754315336e-06,
"loss": 0.9605,
"step": 349
},
{
"epoch": 0.26136469709698495,
"grad_norm": 3.8758345843988233,
"learning_rate": 8.665651297838572e-06,
"loss": 1.0054,
"step": 350
},
{
"epoch": 0.26136469709698495,
"eval_loss": 0.9857860207557678,
"eval_runtime": 160.6029,
"eval_samples_per_second": 112.258,
"eval_steps_per_second": 1.756,
"step": 350
},
{
"epoch": 0.26211145337440495,
"grad_norm": 3.651482334551038,
"learning_rate": 8.65741036793788e-06,
"loss": 1.0849,
"step": 351
},
{
"epoch": 0.2628582096518249,
"grad_norm": 3.3246917496100306,
"learning_rate": 8.649148012888717e-06,
"loss": 0.959,
"step": 352
},
{
"epoch": 0.2636049659292448,
"grad_norm": 3.4869853483297235,
"learning_rate": 8.640864281092051e-06,
"loss": 1.0238,
"step": 353
},
{
"epoch": 0.2643517222066648,
"grad_norm": 3.62379026669033,
"learning_rate": 8.632559221074063e-06,
"loss": 1.0621,
"step": 354
},
{
"epoch": 0.26509847848408474,
"grad_norm": 3.407945964026508,
"learning_rate": 8.624232881485887e-06,
"loss": 1.0635,
"step": 355
},
{
"epoch": 0.26584523476150473,
"grad_norm": 2.7911014267413417,
"learning_rate": 8.615885311103306e-06,
"loss": 1.0197,
"step": 356
},
{
"epoch": 0.26659199103892467,
"grad_norm": 3.087248719032191,
"learning_rate": 8.607516558826477e-06,
"loss": 1.0095,
"step": 357
},
{
"epoch": 0.2673387473163446,
"grad_norm": 3.232901307867413,
"learning_rate": 8.599126673679636e-06,
"loss": 0.984,
"step": 358
},
{
"epoch": 0.2680855035937646,
"grad_norm": 4.085245454288716,
"learning_rate": 8.590715704810823e-06,
"loss": 1.0483,
"step": 359
},
{
"epoch": 0.26883225987118453,
"grad_norm": 6.945488421551598,
"learning_rate": 8.582283701491576e-06,
"loss": 1.0444,
"step": 360
},
{
"epoch": 0.2695790161486045,
"grad_norm": 3.6035457167003067,
"learning_rate": 8.573830713116663e-06,
"loss": 0.989,
"step": 361
},
{
"epoch": 0.27032577242602446,
"grad_norm": 4.121541341754975,
"learning_rate": 8.565356789203781e-06,
"loss": 1.0037,
"step": 362
},
{
"epoch": 0.2710725287034444,
"grad_norm": 3.79404977076993,
"learning_rate": 8.556861979393263e-06,
"loss": 1.0403,
"step": 363
},
{
"epoch": 0.2718192849808644,
"grad_norm": 4.030031666437055,
"learning_rate": 8.548346333447794e-06,
"loss": 1.0121,
"step": 364
},
{
"epoch": 0.2718192849808644,
"eval_loss": 0.9849192500114441,
"eval_runtime": 160.5363,
"eval_samples_per_second": 112.305,
"eval_steps_per_second": 1.757,
"step": 364
},
{
"epoch": 0.2725660412582843,
"grad_norm": 2.888450802695812,
"learning_rate": 8.539809901252118e-06,
"loss": 0.9501,
"step": 365
},
{
"epoch": 0.2733127975357043,
"grad_norm": 3.9527081940985545,
"learning_rate": 8.531252732812744e-06,
"loss": 1.0831,
"step": 366
},
{
"epoch": 0.27405955381312425,
"grad_norm": 6.463294702133942,
"learning_rate": 8.522674878257658e-06,
"loss": 1.0366,
"step": 367
},
{
"epoch": 0.2748063100905442,
"grad_norm": 3.095330854017702,
"learning_rate": 8.514076387836022e-06,
"loss": 0.9823,
"step": 368
},
{
"epoch": 0.2755530663679642,
"grad_norm": 3.8233257574656743,
"learning_rate": 8.505457311917878e-06,
"loss": 1.0125,
"step": 369
},
{
"epoch": 0.2762998226453841,
"grad_norm": 4.0903866779791676,
"learning_rate": 8.496817700993869e-06,
"loss": 1.0439,
"step": 370
},
{
"epoch": 0.27704657892280404,
"grad_norm": 3.8768659844134743,
"learning_rate": 8.488157605674924e-06,
"loss": 1.1039,
"step": 371
},
{
"epoch": 0.27779333520022403,
"grad_norm": 2.9318267792600357,
"learning_rate": 8.479477076691975e-06,
"loss": 0.9654,
"step": 372
},
{
"epoch": 0.27854009147764397,
"grad_norm": 4.231056764323647,
"learning_rate": 8.47077616489565e-06,
"loss": 0.9985,
"step": 373
},
{
"epoch": 0.27928684775506396,
"grad_norm": 3.0018019927339936,
"learning_rate": 8.462054921255984e-06,
"loss": 1.07,
"step": 374
},
{
"epoch": 0.2800336040324839,
"grad_norm": 3.860901132155319,
"learning_rate": 8.453313396862113e-06,
"loss": 1.0714,
"step": 375
},
{
"epoch": 0.28078036030990383,
"grad_norm": 3.397395596102485,
"learning_rate": 8.44455164292198e-06,
"loss": 1.0387,
"step": 376
},
{
"epoch": 0.2815271165873238,
"grad_norm": 2.8794162395513654,
"learning_rate": 8.43576971076203e-06,
"loss": 0.9757,
"step": 377
},
{
"epoch": 0.28227387286474376,
"grad_norm": 3.4033543290966928,
"learning_rate": 8.426967651826914e-06,
"loss": 0.9365,
"step": 378
},
{
"epoch": 0.28227387286474376,
"eval_loss": 0.9828009009361267,
"eval_runtime": 160.585,
"eval_samples_per_second": 112.271,
"eval_steps_per_second": 1.756,
"step": 378
},
{
"epoch": 0.28302062914216375,
"grad_norm": 3.3989624743375617,
"learning_rate": 8.418145517679188e-06,
"loss": 1.0231,
"step": 379
},
{
"epoch": 0.2837673854195837,
"grad_norm": 4.328116900729524,
"learning_rate": 8.409303359999007e-06,
"loss": 0.9752,
"step": 380
},
{
"epoch": 0.2845141416970036,
"grad_norm": 3.602577669926632,
"learning_rate": 8.400441230583822e-06,
"loss": 1.0199,
"step": 381
},
{
"epoch": 0.2852608979744236,
"grad_norm": 3.5809369419716752,
"learning_rate": 8.391559181348081e-06,
"loss": 0.9868,
"step": 382
},
{
"epoch": 0.28600765425184355,
"grad_norm": 3.395383899337209,
"learning_rate": 8.382657264322924e-06,
"loss": 1.0212,
"step": 383
},
{
"epoch": 0.28675441052926354,
"grad_norm": 6.772601000085416,
"learning_rate": 8.37373553165587e-06,
"loss": 1.1117,
"step": 384
},
{
"epoch": 0.2875011668066835,
"grad_norm": 3.252451755396697,
"learning_rate": 8.364794035610527e-06,
"loss": 1.0028,
"step": 385
},
{
"epoch": 0.2882479230841034,
"grad_norm": 3.5098322414248395,
"learning_rate": 8.355832828566273e-06,
"loss": 0.9566,
"step": 386
},
{
"epoch": 0.2889946793615234,
"grad_norm": 2.9266861457219187,
"learning_rate": 8.346851963017952e-06,
"loss": 0.9624,
"step": 387
},
{
"epoch": 0.28974143563894333,
"grad_norm": 3.8574986005071814,
"learning_rate": 8.337851491575569e-06,
"loss": 1.1127,
"step": 388
},
{
"epoch": 0.29048819191636327,
"grad_norm": 3.7829383097996323,
"learning_rate": 8.32883146696398e-06,
"loss": 1.054,
"step": 389
},
{
"epoch": 0.29123494819378326,
"grad_norm": 4.299196127336594,
"learning_rate": 8.319791942022586e-06,
"loss": 1.1059,
"step": 390
},
{
"epoch": 0.2919817044712032,
"grad_norm": 3.537643283730942,
"learning_rate": 8.310732969705018e-06,
"loss": 1.0657,
"step": 391
},
{
"epoch": 0.2927284607486232,
"grad_norm": 3.1645266690038616,
"learning_rate": 8.301654603078832e-06,
"loss": 1.037,
"step": 392
},
{
"epoch": 0.2927284607486232,
"eval_loss": 0.9834852814674377,
"eval_runtime": 160.6069,
"eval_samples_per_second": 112.255,
"eval_steps_per_second": 1.756,
"step": 392
},
{
"epoch": 0.2934752170260431,
"grad_norm": 4.187400458258949,
"learning_rate": 8.292556895325195e-06,
"loss": 0.998,
"step": 393
},
{
"epoch": 0.29422197330346306,
"grad_norm": 3.5667687656977387,
"learning_rate": 8.283439899738574e-06,
"loss": 1.0179,
"step": 394
},
{
"epoch": 0.29496872958088305,
"grad_norm": 3.765218136068551,
"learning_rate": 8.274303669726427e-06,
"loss": 1.0147,
"step": 395
},
{
"epoch": 0.295715485858303,
"grad_norm": 3.864131300606489,
"learning_rate": 8.265148258808884e-06,
"loss": 0.9901,
"step": 396
},
{
"epoch": 0.296462242135723,
"grad_norm": 2.949157107673803,
"learning_rate": 8.255973720618438e-06,
"loss": 1.0055,
"step": 397
},
{
"epoch": 0.2972089984131429,
"grad_norm": 3.7436652842793374,
"learning_rate": 8.246780108899635e-06,
"loss": 1.1692,
"step": 398
},
{
"epoch": 0.29795575469056285,
"grad_norm": 3.2192130117671343,
"learning_rate": 8.237567477508744e-06,
"loss": 0.9998,
"step": 399
},
{
"epoch": 0.29870251096798284,
"grad_norm": 3.6902903702330443,
"learning_rate": 8.228335880413458e-06,
"loss": 1.0037,
"step": 400
},
{
"epoch": 0.2994492672454028,
"grad_norm": 3.4271815650751436,
"learning_rate": 8.219085371692573e-06,
"loss": 1.0365,
"step": 401
},
{
"epoch": 0.30019602352282276,
"grad_norm": 3.4017253839996133,
"learning_rate": 8.209816005535665e-06,
"loss": 1.0868,
"step": 402
},
{
"epoch": 0.3009427798002427,
"grad_norm": 2.7458347961593566,
"learning_rate": 8.200527836242775e-06,
"loss": 0.9653,
"step": 403
},
{
"epoch": 0.30168953607766263,
"grad_norm": 3.1745766085250007,
"learning_rate": 8.191220918224102e-06,
"loss": 0.993,
"step": 404
},
{
"epoch": 0.3024362923550826,
"grad_norm": 3.090654241484588,
"learning_rate": 8.181895305999665e-06,
"loss": 0.9881,
"step": 405
},
{
"epoch": 0.30318304863250256,
"grad_norm": 3.6314806742898282,
"learning_rate": 8.172551054199002e-06,
"loss": 1.0375,
"step": 406
},
{
"epoch": 0.30318304863250256,
"eval_loss": 0.9811131358146667,
"eval_runtime": 160.5504,
"eval_samples_per_second": 112.295,
"eval_steps_per_second": 1.756,
"step": 406
},
{
"epoch": 0.3039298049099225,
"grad_norm": 3.7357325248458624,
"learning_rate": 8.16318821756084e-06,
"loss": 1.0248,
"step": 407
},
{
"epoch": 0.3046765611873425,
"grad_norm": 2.5737530837652383,
"learning_rate": 8.153806850932771e-06,
"loss": 1.0217,
"step": 408
},
{
"epoch": 0.3054233174647624,
"grad_norm": 3.1463228977712467,
"learning_rate": 8.144407009270939e-06,
"loss": 1.0029,
"step": 409
},
{
"epoch": 0.3061700737421824,
"grad_norm": 9.739807547203563,
"learning_rate": 8.134988747639719e-06,
"loss": 1.0793,
"step": 410
},
{
"epoch": 0.30691683001960235,
"grad_norm": 3.4794447208645187,
"learning_rate": 8.125552121211385e-06,
"loss": 1.0399,
"step": 411
},
{
"epoch": 0.3076635862970223,
"grad_norm": 3.3525431927586578,
"learning_rate": 8.116097185265793e-06,
"loss": 1.0431,
"step": 412
},
{
"epoch": 0.3084103425744423,
"grad_norm": 3.4084157252106837,
"learning_rate": 8.106623995190058e-06,
"loss": 1.0727,
"step": 413
},
{
"epoch": 0.3091570988518622,
"grad_norm": 3.476679438345711,
"learning_rate": 8.09713260647823e-06,
"loss": 1.1045,
"step": 414
},
{
"epoch": 0.3099038551292822,
"grad_norm": 3.5348116522716078,
"learning_rate": 8.08762307473096e-06,
"loss": 0.9603,
"step": 415
},
{
"epoch": 0.31065061140670214,
"grad_norm": 3.6167949527936254,
"learning_rate": 8.078095455655188e-06,
"loss": 1.0288,
"step": 416
},
{
"epoch": 0.3113973676841221,
"grad_norm": 3.0637330231202013,
"learning_rate": 8.068549805063806e-06,
"loss": 0.9912,
"step": 417
},
{
"epoch": 0.31214412396154206,
"grad_norm": 3.240446093775731,
"learning_rate": 8.058986178875337e-06,
"loss": 0.9449,
"step": 418
},
{
"epoch": 0.312890880238962,
"grad_norm": 3.5029259821878593,
"learning_rate": 8.0494046331136e-06,
"loss": 0.9791,
"step": 419
},
{
"epoch": 0.313637636516382,
"grad_norm": 4.857373088195226,
"learning_rate": 8.039805223907396e-06,
"loss": 1.0656,
"step": 420
},
{
"epoch": 0.313637636516382,
"eval_loss": 0.9795615673065186,
"eval_runtime": 160.5273,
"eval_samples_per_second": 112.311,
"eval_steps_per_second": 1.757,
"step": 420
},
{
"epoch": 0.3143843927938019,
"grad_norm": 3.621695917176158,
"learning_rate": 8.030188007490164e-06,
"loss": 0.9901,
"step": 421
},
{
"epoch": 0.31513114907122186,
"grad_norm": 3.134150159669535,
"learning_rate": 8.020553040199654e-06,
"loss": 1.017,
"step": 422
},
{
"epoch": 0.31587790534864185,
"grad_norm": 3.7042154643581395,
"learning_rate": 8.010900378477612e-06,
"loss": 1.0453,
"step": 423
},
{
"epoch": 0.3166246616260618,
"grad_norm": 2.91252648100696,
"learning_rate": 8.00123007886943e-06,
"loss": 1.0093,
"step": 424
},
{
"epoch": 0.3173714179034818,
"grad_norm": 3.211153472938924,
"learning_rate": 7.991542198023827e-06,
"loss": 0.9395,
"step": 425
},
{
"epoch": 0.3181181741809017,
"grad_norm": 3.2362263093734764,
"learning_rate": 7.981836792692508e-06,
"loss": 1.0098,
"step": 426
},
{
"epoch": 0.31886493045832165,
"grad_norm": 4.070963322234874,
"learning_rate": 7.97211391972984e-06,
"loss": 1.0552,
"step": 427
},
{
"epoch": 0.31961168673574164,
"grad_norm": 3.212324506648779,
"learning_rate": 7.962373636092517e-06,
"loss": 0.9705,
"step": 428
},
{
"epoch": 0.3203584430131616,
"grad_norm": 3.861278889833234,
"learning_rate": 7.952615998839222e-06,
"loss": 1.0376,
"step": 429
},
{
"epoch": 0.3211051992905815,
"grad_norm": 3.040788601978854,
"learning_rate": 7.942841065130296e-06,
"loss": 0.9765,
"step": 430
},
{
"epoch": 0.3218519555680015,
"grad_norm": 3.7318713732273934,
"learning_rate": 7.933048892227406e-06,
"loss": 1.0966,
"step": 431
},
{
"epoch": 0.32259871184542144,
"grad_norm": 3.0465602998502153,
"learning_rate": 7.923239537493204e-06,
"loss": 0.9536,
"step": 432
},
{
"epoch": 0.32334546812284143,
"grad_norm": 3.3796680686549117,
"learning_rate": 7.913413058390989e-06,
"loss": 1.1145,
"step": 433
},
{
"epoch": 0.32409222440026136,
"grad_norm": 3.6473790330312825,
"learning_rate": 7.903569512484383e-06,
"loss": 1.0295,
"step": 434
},
{
"epoch": 0.32409222440026136,
"eval_loss": 0.9785324335098267,
"eval_runtime": 160.554,
"eval_samples_per_second": 112.292,
"eval_steps_per_second": 1.756,
"step": 434
},
{
"epoch": 0.3248389806776813,
"grad_norm": 2.9530024474970236,
"learning_rate": 7.893708957436982e-06,
"loss": 1.0426,
"step": 435
},
{
"epoch": 0.3255857369551013,
"grad_norm": 3.766650384182543,
"learning_rate": 7.88383145101202e-06,
"loss": 1.0902,
"step": 436
},
{
"epoch": 0.3263324932325212,
"grad_norm": 3.0225531834819157,
"learning_rate": 7.873937051072037e-06,
"loss": 1.0197,
"step": 437
},
{
"epoch": 0.3270792495099412,
"grad_norm": 6.747688160994902,
"learning_rate": 7.864025815578524e-06,
"loss": 1.0138,
"step": 438
},
{
"epoch": 0.32782600578736115,
"grad_norm": 3.3502645696292466,
"learning_rate": 7.85409780259161e-06,
"loss": 1.026,
"step": 439
},
{
"epoch": 0.3285727620647811,
"grad_norm": 3.390696827000236,
"learning_rate": 7.844153070269697e-06,
"loss": 1.0403,
"step": 440
},
{
"epoch": 0.3293195183422011,
"grad_norm": 3.270199160958159,
"learning_rate": 7.834191676869135e-06,
"loss": 1.0221,
"step": 441
},
{
"epoch": 0.330066274619621,
"grad_norm": 3.361500760720287,
"learning_rate": 7.824213680743867e-06,
"loss": 0.9559,
"step": 442
},
{
"epoch": 0.330813030897041,
"grad_norm": 3.6775652760129125,
"learning_rate": 7.8142191403451e-06,
"loss": 1.0061,
"step": 443
},
{
"epoch": 0.33155978717446094,
"grad_norm": 2.9665469767448425,
"learning_rate": 7.80420811422096e-06,
"loss": 1.0567,
"step": 444
},
{
"epoch": 0.3323065434518809,
"grad_norm": 3.5075474240286377,
"learning_rate": 7.794180661016143e-06,
"loss": 1.0241,
"step": 445
},
{
"epoch": 0.33305329972930087,
"grad_norm": 3.1914109742785266,
"learning_rate": 7.784136839471573e-06,
"loss": 1.0758,
"step": 446
},
{
"epoch": 0.3338000560067208,
"grad_norm": 4.066852799813992,
"learning_rate": 7.774076708424062e-06,
"loss": 1.0315,
"step": 447
},
{
"epoch": 0.33454681228414074,
"grad_norm": 3.0279095506480496,
"learning_rate": 7.764000326805967e-06,
"loss": 0.9507,
"step": 448
},
{
"epoch": 0.33454681228414074,
"eval_loss": 0.9775028228759766,
"eval_runtime": 160.6104,
"eval_samples_per_second": 112.253,
"eval_steps_per_second": 1.756,
"step": 448
},
{
"epoch": 0.33529356856156073,
"grad_norm": 3.102821716169309,
"learning_rate": 7.753907753644835e-06,
"loss": 1.0906,
"step": 449
},
{
"epoch": 0.33604032483898066,
"grad_norm": 3.1371422748805644,
"learning_rate": 7.74379904806307e-06,
"loss": 1.038,
"step": 450
},
{
"epoch": 0.33678708111640066,
"grad_norm": 3.664718417806466,
"learning_rate": 7.733674269277572e-06,
"loss": 1.0063,
"step": 451
},
{
"epoch": 0.3375338373938206,
"grad_norm": 8.663230070425483,
"learning_rate": 7.7235334765994e-06,
"loss": 0.9922,
"step": 452
},
{
"epoch": 0.3382805936712405,
"grad_norm": 3.4498456909319053,
"learning_rate": 7.71337672943343e-06,
"loss": 0.929,
"step": 453
},
{
"epoch": 0.3390273499486605,
"grad_norm": 3.3294787014480924,
"learning_rate": 7.703204087277989e-06,
"loss": 1.025,
"step": 454
},
{
"epoch": 0.33977410622608045,
"grad_norm": 10.093120102912547,
"learning_rate": 7.693015609724524e-06,
"loss": 1.0889,
"step": 455
},
{
"epoch": 0.34052086250350044,
"grad_norm": 3.427826565448867,
"learning_rate": 7.682811356457245e-06,
"loss": 1.0335,
"step": 456
},
{
"epoch": 0.3412676187809204,
"grad_norm": 3.1300498945062705,
"learning_rate": 7.672591387252773e-06,
"loss": 0.9643,
"step": 457
},
{
"epoch": 0.3420143750583403,
"grad_norm": 4.834020434756831,
"learning_rate": 7.662355761979794e-06,
"loss": 1.0139,
"step": 458
},
{
"epoch": 0.3427611313357603,
"grad_norm": 3.206146987660176,
"learning_rate": 7.652104540598712e-06,
"loss": 0.9946,
"step": 459
},
{
"epoch": 0.34350788761318024,
"grad_norm": 3.0060430098945,
"learning_rate": 7.64183778316129e-06,
"loss": 0.9548,
"step": 460
},
{
"epoch": 0.34425464389060023,
"grad_norm": 2.700718919045169,
"learning_rate": 7.6315555498103e-06,
"loss": 1.0126,
"step": 461
},
{
"epoch": 0.34500140016802017,
"grad_norm": 4.606267437808485,
"learning_rate": 7.621257900779173e-06,
"loss": 1.014,
"step": 462
},
{
"epoch": 0.34500140016802017,
"eval_loss": 0.9751178026199341,
"eval_runtime": 162.1058,
"eval_samples_per_second": 111.217,
"eval_steps_per_second": 1.74,
"step": 462
},
{
"epoch": 0.3457481564454401,
"grad_norm": 3.8917302169214243,
"learning_rate": 7.610944896391644e-06,
"loss": 1.0268,
"step": 463
},
{
"epoch": 0.3464949127228601,
"grad_norm": 3.3196051557196586,
"learning_rate": 7.6006165970614045e-06,
"loss": 1.0196,
"step": 464
},
{
"epoch": 0.34724166900028003,
"grad_norm": 3.53901881155908,
"learning_rate": 7.5902730632917395e-06,
"loss": 0.9993,
"step": 465
},
{
"epoch": 0.34798842527769996,
"grad_norm": 3.0682482146093477,
"learning_rate": 7.579914355675177e-06,
"loss": 0.9473,
"step": 466
},
{
"epoch": 0.34873518155511996,
"grad_norm": 3.087914341379048,
"learning_rate": 7.569540534893139e-06,
"loss": 1.0363,
"step": 467
},
{
"epoch": 0.3494819378325399,
"grad_norm": 3.8282273155400426,
"learning_rate": 7.559151661715574e-06,
"loss": 0.9911,
"step": 468
},
{
"epoch": 0.3502286941099599,
"grad_norm": 3.7879619577438475,
"learning_rate": 7.548747797000611e-06,
"loss": 1.1124,
"step": 469
},
{
"epoch": 0.3509754503873798,
"grad_norm": 3.678911283929549,
"learning_rate": 7.5383290016942e-06,
"loss": 1.089,
"step": 470
},
{
"epoch": 0.35172220666479975,
"grad_norm": 3.153223573399342,
"learning_rate": 7.527895336829754e-06,
"loss": 0.9562,
"step": 471
},
{
"epoch": 0.35246896294221974,
"grad_norm": 4.016430707128402,
"learning_rate": 7.517446863527791e-06,
"loss": 1.0842,
"step": 472
},
{
"epoch": 0.3532157192196397,
"grad_norm": 3.016008204142368,
"learning_rate": 7.506983642995576e-06,
"loss": 0.9873,
"step": 473
},
{
"epoch": 0.35396247549705967,
"grad_norm": 3.138460161693774,
"learning_rate": 7.496505736526769e-06,
"loss": 1.0124,
"step": 474
},
{
"epoch": 0.3547092317744796,
"grad_norm": 2.8855105537883046,
"learning_rate": 7.486013205501053e-06,
"loss": 0.9669,
"step": 475
},
{
"epoch": 0.35545598805189954,
"grad_norm": 3.1177066824549597,
"learning_rate": 7.475506111383787e-06,
"loss": 0.9873,
"step": 476
},
{
"epoch": 0.35545598805189954,
"eval_loss": 0.9743978977203369,
"eval_runtime": 160.456,
"eval_samples_per_second": 112.361,
"eval_steps_per_second": 1.757,
"step": 476
},
{
"epoch": 0.35620274432931953,
"grad_norm": 3.0598520129588374,
"learning_rate": 7.464984515725638e-06,
"loss": 0.9891,
"step": 477
},
{
"epoch": 0.35694950060673947,
"grad_norm": 2.98269608960095,
"learning_rate": 7.454448480162226e-06,
"loss": 1.0347,
"step": 478
},
{
"epoch": 0.35769625688415946,
"grad_norm": 3.2757092754418107,
"learning_rate": 7.443898066413755e-06,
"loss": 1.1134,
"step": 479
},
{
"epoch": 0.3584430131615794,
"grad_norm": 3.846293749413655,
"learning_rate": 7.433333336284665e-06,
"loss": 1.0546,
"step": 480
},
{
"epoch": 0.35918976943899933,
"grad_norm": 3.0031396693349346,
"learning_rate": 7.422754351663252e-06,
"loss": 0.9413,
"step": 481
},
{
"epoch": 0.3599365257164193,
"grad_norm": 5.879276289496315,
"learning_rate": 7.412161174521321e-06,
"loss": 1.0645,
"step": 482
},
{
"epoch": 0.36068328199383926,
"grad_norm": 2.9630046791659304,
"learning_rate": 7.4015538669138144e-06,
"loss": 1.0015,
"step": 483
},
{
"epoch": 0.3614300382712592,
"grad_norm": 3.1007268346585417,
"learning_rate": 7.390932490978453e-06,
"loss": 0.9843,
"step": 484
},
{
"epoch": 0.3621767945486792,
"grad_norm": 3.5996328301501155,
"learning_rate": 7.3802971089353696e-06,
"loss": 1.0323,
"step": 485
},
{
"epoch": 0.3629235508260991,
"grad_norm": 3.6048723852155846,
"learning_rate": 7.369647783086742e-06,
"loss": 1.015,
"step": 486
},
{
"epoch": 0.3636703071035191,
"grad_norm": 2.7669538427602487,
"learning_rate": 7.358984575816437e-06,
"loss": 0.9157,
"step": 487
},
{
"epoch": 0.36441706338093904,
"grad_norm": 2.7712108102753126,
"learning_rate": 7.3483075495896296e-06,
"loss": 0.9782,
"step": 488
},
{
"epoch": 0.365163819658359,
"grad_norm": 3.151704417287714,
"learning_rate": 7.337616766952455e-06,
"loss": 0.9626,
"step": 489
},
{
"epoch": 0.36591057593577897,
"grad_norm": 3.5437250736566366,
"learning_rate": 7.326912290531634e-06,
"loss": 1.0085,
"step": 490
},
{
"epoch": 0.36591057593577897,
"eval_loss": 0.9715595841407776,
"eval_runtime": 160.5498,
"eval_samples_per_second": 112.295,
"eval_steps_per_second": 1.756,
"step": 490
},
{
"epoch": 0.3666573322131989,
"grad_norm": 3.774562387910661,
"learning_rate": 7.316194183034096e-06,
"loss": 1.087,
"step": 491
},
{
"epoch": 0.3674040884906189,
"grad_norm": 2.8372057210924746,
"learning_rate": 7.30546250724663e-06,
"loss": 0.9763,
"step": 492
},
{
"epoch": 0.36815084476803883,
"grad_norm": 2.560789631009045,
"learning_rate": 7.294717326035508e-06,
"loss": 0.9571,
"step": 493
},
{
"epoch": 0.36889760104545877,
"grad_norm": 3.5990199559087785,
"learning_rate": 7.283958702346111e-06,
"loss": 0.9785,
"step": 494
},
{
"epoch": 0.36964435732287876,
"grad_norm": 2.669595595250994,
"learning_rate": 7.273186699202572e-06,
"loss": 0.9195,
"step": 495
},
{
"epoch": 0.3703911136002987,
"grad_norm": 4.258528121719663,
"learning_rate": 7.262401379707401e-06,
"loss": 0.983,
"step": 496
},
{
"epoch": 0.3711378698777187,
"grad_norm": 9.165533012445232,
"learning_rate": 7.251602807041111e-06,
"loss": 1.0183,
"step": 497
},
{
"epoch": 0.3718846261551386,
"grad_norm": 5.239008418773379,
"learning_rate": 7.240791044461853e-06,
"loss": 1.0623,
"step": 498
},
{
"epoch": 0.37263138243255856,
"grad_norm": 4.43366736674882,
"learning_rate": 7.2299661553050474e-06,
"loss": 1.0722,
"step": 499
},
{
"epoch": 0.37337813870997855,
"grad_norm": 2.7329812801042426,
"learning_rate": 7.21912820298301e-06,
"loss": 1.0754,
"step": 500
},
{
"epoch": 0.3741248949873985,
"grad_norm": 3.716579945077537,
"learning_rate": 7.208277250984577e-06,
"loss": 1.0111,
"step": 501
},
{
"epoch": 0.3748716512648184,
"grad_norm": 3.196376735234458,
"learning_rate": 7.1974133628747435e-06,
"loss": 1.032,
"step": 502
},
{
"epoch": 0.3756184075422384,
"grad_norm": 3.418307936162068,
"learning_rate": 7.186536602294278e-06,
"loss": 0.9652,
"step": 503
},
{
"epoch": 0.37636516381965834,
"grad_norm": 3.0583787014333748,
"learning_rate": 7.175647032959358e-06,
"loss": 0.99,
"step": 504
},
{
"epoch": 0.37636516381965834,
"eval_loss": 0.970821738243103,
"eval_runtime": 160.5549,
"eval_samples_per_second": 112.292,
"eval_steps_per_second": 1.756,
"step": 504
},
{
"epoch": 0.37711192009707833,
"grad_norm": 3.9375426470992196,
"learning_rate": 7.164744718661198e-06,
"loss": 1.0763,
"step": 505
},
{
"epoch": 0.37785867637449827,
"grad_norm": 3.670004043591105,
"learning_rate": 7.153829723265666e-06,
"loss": 1.0512,
"step": 506
},
{
"epoch": 0.3786054326519182,
"grad_norm": 4.08797931465596,
"learning_rate": 7.142902110712925e-06,
"loss": 1.0742,
"step": 507
},
{
"epoch": 0.3793521889293382,
"grad_norm": 2.969955919693604,
"learning_rate": 7.131961945017041e-06,
"loss": 0.9621,
"step": 508
},
{
"epoch": 0.38009894520675813,
"grad_norm": 3.5178350581181874,
"learning_rate": 7.121009290265619e-06,
"loss": 0.9393,
"step": 509
},
{
"epoch": 0.3808457014841781,
"grad_norm": 3.307634748721632,
"learning_rate": 7.11004421061943e-06,
"loss": 0.9751,
"step": 510
},
{
"epoch": 0.38159245776159806,
"grad_norm": 3.0771731917177365,
"learning_rate": 7.099066770312023e-06,
"loss": 1.1015,
"step": 511
},
{
"epoch": 0.382339214039018,
"grad_norm": 3.0141374010446724,
"learning_rate": 7.088077033649359e-06,
"loss": 1.0432,
"step": 512
},
{
"epoch": 0.383085970316438,
"grad_norm": 3.1493573609897063,
"learning_rate": 7.0770750650094335e-06,
"loss": 1.0117,
"step": 513
},
{
"epoch": 0.3838327265938579,
"grad_norm": 2.5211640616380278,
"learning_rate": 7.066060928841891e-06,
"loss": 0.9526,
"step": 514
},
{
"epoch": 0.3845794828712779,
"grad_norm": 2.7231216253469874,
"learning_rate": 7.055034689667661e-06,
"loss": 0.962,
"step": 515
},
{
"epoch": 0.38532623914869785,
"grad_norm": 2.944911073727331,
"learning_rate": 7.0439964120785665e-06,
"loss": 0.9764,
"step": 516
},
{
"epoch": 0.3860729954261178,
"grad_norm": 4.058582096841929,
"learning_rate": 7.032946160736956e-06,
"loss": 1.0465,
"step": 517
},
{
"epoch": 0.3868197517035378,
"grad_norm": 3.3936349644680566,
"learning_rate": 7.021884000375315e-06,
"loss": 1.0345,
"step": 518
},
{
"epoch": 0.3868197517035378,
"eval_loss": 0.9695369005203247,
"eval_runtime": 162.0438,
"eval_samples_per_second": 111.26,
"eval_steps_per_second": 1.74,
"step": 518
},
{
"epoch": 0.3875665079809577,
"grad_norm": 3.0088080373888877,
"learning_rate": 7.010809995795897e-06,
"loss": 0.9921,
"step": 519
},
{
"epoch": 0.3883132642583777,
"grad_norm": 3.7876856086376463,
"learning_rate": 6.999724211870339e-06,
"loss": 1.0344,
"step": 520
},
{
"epoch": 0.38906002053579763,
"grad_norm": 3.554627765753071,
"learning_rate": 6.98862671353928e-06,
"loss": 1.0494,
"step": 521
},
{
"epoch": 0.38980677681321757,
"grad_norm": 3.6313523510559493,
"learning_rate": 6.977517565811977e-06,
"loss": 1.1087,
"step": 522
},
{
"epoch": 0.39055353309063756,
"grad_norm": 2.7107679507341143,
"learning_rate": 6.966396833765941e-06,
"loss": 0.9847,
"step": 523
},
{
"epoch": 0.3913002893680575,
"grad_norm": 3.1970196122685417,
"learning_rate": 6.955264582546536e-06,
"loss": 0.9458,
"step": 524
},
{
"epoch": 0.39204704564547743,
"grad_norm": 3.7052386420585766,
"learning_rate": 6.944120877366605e-06,
"loss": 0.9409,
"step": 525
},
{
"epoch": 0.3927938019228974,
"grad_norm": 2.8311602798021216,
"learning_rate": 6.932965783506089e-06,
"loss": 0.9862,
"step": 526
},
{
"epoch": 0.39354055820031736,
"grad_norm": 3.1398517884202954,
"learning_rate": 6.92179936631165e-06,
"loss": 0.9867,
"step": 527
},
{
"epoch": 0.39428731447773735,
"grad_norm": 3.2070435051142065,
"learning_rate": 6.910621691196274e-06,
"loss": 1.0831,
"step": 528
},
{
"epoch": 0.3950340707551573,
"grad_norm": 3.2754924049472924,
"learning_rate": 6.8994328236389006e-06,
"loss": 0.9178,
"step": 529
},
{
"epoch": 0.3957808270325772,
"grad_norm": 2.758911582337925,
"learning_rate": 6.888232829184035e-06,
"loss": 1.0161,
"step": 530
},
{
"epoch": 0.3965275833099972,
"grad_norm": 3.081654379594517,
"learning_rate": 6.8770217734413606e-06,
"loss": 0.9263,
"step": 531
},
{
"epoch": 0.39727433958741715,
"grad_norm": 3.2054417126811683,
"learning_rate": 6.8657997220853615e-06,
"loss": 0.9714,
"step": 532
},
{
"epoch": 0.39727433958741715,
"eval_loss": 0.9671504497528076,
"eval_runtime": 160.5647,
"eval_samples_per_second": 112.285,
"eval_steps_per_second": 1.756,
"step": 532
},
{
"epoch": 0.39802109586483714,
"grad_norm": 3.5709587722352683,
"learning_rate": 6.854566740854932e-06,
"loss": 1.0274,
"step": 533
},
{
"epoch": 0.3987678521422571,
"grad_norm": 2.8375890052138493,
"learning_rate": 6.843322895552995e-06,
"loss": 1.0538,
"step": 534
},
{
"epoch": 0.399514608419677,
"grad_norm": 3.8436688249330153,
"learning_rate": 6.832068252046116e-06,
"loss": 0.9943,
"step": 535
},
{
"epoch": 0.400261364697097,
"grad_norm": 3.0838296325254593,
"learning_rate": 6.820802876264112e-06,
"loss": 1.0524,
"step": 536
},
{
"epoch": 0.40100812097451694,
"grad_norm": 3.2520329037248814,
"learning_rate": 6.809526834199675e-06,
"loss": 0.9378,
"step": 537
},
{
"epoch": 0.4017548772519369,
"grad_norm": 3.437376705017015,
"learning_rate": 6.798240191907979e-06,
"loss": 0.9232,
"step": 538
},
{
"epoch": 0.40250163352935686,
"grad_norm": 3.8117996083043564,
"learning_rate": 6.786943015506292e-06,
"loss": 1.1646,
"step": 539
},
{
"epoch": 0.4032483898067768,
"grad_norm": 3.086449782517495,
"learning_rate": 6.775635371173595e-06,
"loss": 1.0376,
"step": 540
},
{
"epoch": 0.4039951460841968,
"grad_norm": 3.5594550947217605,
"learning_rate": 6.764317325150183e-06,
"loss": 1.0664,
"step": 541
},
{
"epoch": 0.4047419023616167,
"grad_norm": 2.929617271103529,
"learning_rate": 6.752988943737291e-06,
"loss": 0.9074,
"step": 542
},
{
"epoch": 0.40548865863903666,
"grad_norm": 3.4414325543869397,
"learning_rate": 6.7416502932967e-06,
"loss": 1.0103,
"step": 543
},
{
"epoch": 0.40623541491645665,
"grad_norm": 3.6632742569900927,
"learning_rate": 6.730301440250337e-06,
"loss": 1.0326,
"step": 544
},
{
"epoch": 0.4069821711938766,
"grad_norm": 2.8687836258205666,
"learning_rate": 6.718942451079911e-06,
"loss": 1.0152,
"step": 545
},
{
"epoch": 0.4077289274712966,
"grad_norm": 3.321912105204247,
"learning_rate": 6.707573392326493e-06,
"loss": 1.0539,
"step": 546
},
{
"epoch": 0.4077289274712966,
"eval_loss": 0.966361403465271,
"eval_runtime": 160.7022,
"eval_samples_per_second": 112.189,
"eval_steps_per_second": 1.755,
"step": 546
},
{
"epoch": 0.4084756837487165,
"grad_norm": 3.272095356187146,
"learning_rate": 6.6961943305901515e-06,
"loss": 1.0243,
"step": 547
},
{
"epoch": 0.40922244002613645,
"grad_norm": 4.464138677869332,
"learning_rate": 6.6848053325295525e-06,
"loss": 0.9455,
"step": 548
},
{
"epoch": 0.40996919630355644,
"grad_norm": 3.6980284220501702,
"learning_rate": 6.673406464861563e-06,
"loss": 0.968,
"step": 549
},
{
"epoch": 0.4107159525809764,
"grad_norm": 3.9712580585477326,
"learning_rate": 6.661997794360872e-06,
"loss": 1.0356,
"step": 550
},
{
"epoch": 0.41146270885839636,
"grad_norm": 3.6234064719129013,
"learning_rate": 6.65057938785959e-06,
"loss": 0.9782,
"step": 551
},
{
"epoch": 0.4122094651358163,
"grad_norm": 3.430576862937015,
"learning_rate": 6.639151312246863e-06,
"loss": 1.0377,
"step": 552
},
{
"epoch": 0.41295622141323624,
"grad_norm": 3.653367041079211,
"learning_rate": 6.62771363446848e-06,
"loss": 0.9338,
"step": 553
},
{
"epoch": 0.4137029776906562,
"grad_norm": 2.9772192093714778,
"learning_rate": 6.616266421526477e-06,
"loss": 0.9065,
"step": 554
},
{
"epoch": 0.41444973396807616,
"grad_norm": 5.845516786477963,
"learning_rate": 6.604809740478748e-06,
"loss": 1.0693,
"step": 555
},
{
"epoch": 0.41519649024549615,
"grad_norm": 3.306244287174315,
"learning_rate": 6.593343658438649e-06,
"loss": 1.0179,
"step": 556
},
{
"epoch": 0.4159432465229161,
"grad_norm": 3.092506060971144,
"learning_rate": 6.581868242574613e-06,
"loss": 1.0373,
"step": 557
},
{
"epoch": 0.416690002800336,
"grad_norm": 2.793320476040383,
"learning_rate": 6.570383560109745e-06,
"loss": 1.0184,
"step": 558
},
{
"epoch": 0.417436759077756,
"grad_norm": 3.4955390820801853,
"learning_rate": 6.558889678321436e-06,
"loss": 1.0392,
"step": 559
},
{
"epoch": 0.41818351535517595,
"grad_norm": 3.273061028251747,
"learning_rate": 6.547386664540968e-06,
"loss": 0.9409,
"step": 560
},
{
"epoch": 0.41818351535517595,
"eval_loss": 0.9662355780601501,
"eval_runtime": 160.6718,
"eval_samples_per_second": 112.21,
"eval_steps_per_second": 1.755,
"step": 560
},
{
"epoch": 0.4189302716325959,
"grad_norm": 4.087603010650717,
"learning_rate": 6.535874586153115e-06,
"loss": 1.0936,
"step": 561
},
{
"epoch": 0.4196770279100159,
"grad_norm": 3.1783778248379226,
"learning_rate": 6.524353510595754e-06,
"loss": 1.0416,
"step": 562
},
{
"epoch": 0.4204237841874358,
"grad_norm": 3.5032912207274247,
"learning_rate": 6.512823505359469e-06,
"loss": 0.9697,
"step": 563
},
{
"epoch": 0.4211705404648558,
"grad_norm": 2.924411630831047,
"learning_rate": 6.501284637987148e-06,
"loss": 0.9833,
"step": 564
},
{
"epoch": 0.42191729674227574,
"grad_norm": 3.292319469279369,
"learning_rate": 6.489736976073603e-06,
"loss": 0.9739,
"step": 565
},
{
"epoch": 0.4226640530196957,
"grad_norm": 2.8074608140919146,
"learning_rate": 6.4781805872651536e-06,
"loss": 1.0121,
"step": 566
},
{
"epoch": 0.42341080929711566,
"grad_norm": 3.8979341939201237,
"learning_rate": 6.466615539259252e-06,
"loss": 0.9085,
"step": 567
},
{
"epoch": 0.4241575655745356,
"grad_norm": 3.0105273821322642,
"learning_rate": 6.4550418998040686e-06,
"loss": 0.9803,
"step": 568
},
{
"epoch": 0.4249043218519556,
"grad_norm": 3.018386515792839,
"learning_rate": 6.443459736698106e-06,
"loss": 1.0349,
"step": 569
},
{
"epoch": 0.4256510781293755,
"grad_norm": 4.3297434955866105,
"learning_rate": 6.431869117789797e-06,
"loss": 0.9927,
"step": 570
},
{
"epoch": 0.42639783440679546,
"grad_norm": 3.6098279806564326,
"learning_rate": 6.4202701109771105e-06,
"loss": 1.0685,
"step": 571
},
{
"epoch": 0.42714459068421545,
"grad_norm": 3.27616752754168,
"learning_rate": 6.408662784207149e-06,
"loss": 0.9819,
"step": 572
},
{
"epoch": 0.4278913469616354,
"grad_norm": 2.755375498502825,
"learning_rate": 6.397047205475757e-06,
"loss": 0.9871,
"step": 573
},
{
"epoch": 0.4286381032390554,
"grad_norm": 2.978173123042259,
"learning_rate": 6.385423442827116e-06,
"loss": 0.9861,
"step": 574
},
{
"epoch": 0.4286381032390554,
"eval_loss": 0.9649081826210022,
"eval_runtime": 160.6682,
"eval_samples_per_second": 112.213,
"eval_steps_per_second": 1.755,
"step": 574
},
{
"epoch": 0.4293848595164753,
"grad_norm": 3.1552421177846997,
"learning_rate": 6.3737915643533484e-06,
"loss": 1.0343,
"step": 575
},
{
"epoch": 0.43013161579389525,
"grad_norm": 4.0712453485088345,
"learning_rate": 6.362151638194125e-06,
"loss": 0.9875,
"step": 576
},
{
"epoch": 0.43087837207131524,
"grad_norm": 3.6072794575911353,
"learning_rate": 6.3505037325362515e-06,
"loss": 0.9823,
"step": 577
},
{
"epoch": 0.4316251283487352,
"grad_norm": 3.74072340333252,
"learning_rate": 6.338847915613285e-06,
"loss": 1.0738,
"step": 578
},
{
"epoch": 0.4323718846261551,
"grad_norm": 3.9042759629608668,
"learning_rate": 6.327184255705123e-06,
"loss": 0.9318,
"step": 579
},
{
"epoch": 0.4331186409035751,
"grad_norm": 2.7602755638221805,
"learning_rate": 6.315512821137606e-06,
"loss": 0.9505,
"step": 580
},
{
"epoch": 0.43386539718099504,
"grad_norm": 3.663747781306997,
"learning_rate": 6.303833680282125e-06,
"loss": 0.9821,
"step": 581
},
{
"epoch": 0.43461215345841503,
"grad_norm": 4.012768339701778,
"learning_rate": 6.292146901555207e-06,
"loss": 1.0239,
"step": 582
},
{
"epoch": 0.43535890973583496,
"grad_norm": 3.6313447848071903,
"learning_rate": 6.280452553418126e-06,
"loss": 1.0401,
"step": 583
},
{
"epoch": 0.4361056660132549,
"grad_norm": 3.2153420258632255,
"learning_rate": 6.268750704376494e-06,
"loss": 1.0054,
"step": 584
},
{
"epoch": 0.4368524222906749,
"grad_norm": 3.4771984642840947,
"learning_rate": 6.257041422979871e-06,
"loss": 0.9913,
"step": 585
},
{
"epoch": 0.4375991785680948,
"grad_norm": 2.792906968691294,
"learning_rate": 6.245324777821346e-06,
"loss": 0.953,
"step": 586
},
{
"epoch": 0.4383459348455148,
"grad_norm": 3.1678763709553266,
"learning_rate": 6.233600837537153e-06,
"loss": 1.0841,
"step": 587
},
{
"epoch": 0.43909269112293475,
"grad_norm": 3.9018561243297136,
"learning_rate": 6.221869670806257e-06,
"loss": 0.9856,
"step": 588
},
{
"epoch": 0.43909269112293475,
"eval_loss": 0.9642106294631958,
"eval_runtime": 162.0199,
"eval_samples_per_second": 111.276,
"eval_steps_per_second": 1.741,
"step": 588
},
{
"epoch": 0.4398394474003547,
"grad_norm": 3.434760853329825,
"learning_rate": 6.210131346349953e-06,
"loss": 0.9926,
"step": 589
},
{
"epoch": 0.4405862036777747,
"grad_norm": 3.4084803668234738,
"learning_rate": 6.1983859329314745e-06,
"loss": 1.0572,
"step": 590
},
{
"epoch": 0.4413329599551946,
"grad_norm": 2.968509928932182,
"learning_rate": 6.186633499355576e-06,
"loss": 0.9848,
"step": 591
},
{
"epoch": 0.4420797162326146,
"grad_norm": 5.328332695240182,
"learning_rate": 6.174874114468132e-06,
"loss": 1.0451,
"step": 592
},
{
"epoch": 0.44282647251003454,
"grad_norm": 3.0263469144237143,
"learning_rate": 6.16310784715575e-06,
"loss": 0.971,
"step": 593
},
{
"epoch": 0.4435732287874545,
"grad_norm": 3.1339960008604826,
"learning_rate": 6.151334766345345e-06,
"loss": 1.0407,
"step": 594
},
{
"epoch": 0.44431998506487447,
"grad_norm": 3.0931627663439984,
"learning_rate": 6.139554941003747e-06,
"loss": 1.0377,
"step": 595
},
{
"epoch": 0.4450667413422944,
"grad_norm": 2.909763131231064,
"learning_rate": 6.127768440137298e-06,
"loss": 1.0148,
"step": 596
},
{
"epoch": 0.44581349761971434,
"grad_norm": 3.4356772897225287,
"learning_rate": 6.115975332791446e-06,
"loss": 0.9894,
"step": 597
},
{
"epoch": 0.44656025389713433,
"grad_norm": 2.8810376760097474,
"learning_rate": 6.104175688050336e-06,
"loss": 1.0024,
"step": 598
},
{
"epoch": 0.44730701017455426,
"grad_norm": 2.963947770292573,
"learning_rate": 6.092369575036411e-06,
"loss": 0.9927,
"step": 599
},
{
"epoch": 0.44805376645197426,
"grad_norm": 3.4408370155382513,
"learning_rate": 6.0805570629100075e-06,
"loss": 0.9892,
"step": 600
},
{
"epoch": 0.4488005227293942,
"grad_norm": 3.2679972430155235,
"learning_rate": 6.068738220868944e-06,
"loss": 1.0083,
"step": 601
},
{
"epoch": 0.4495472790068141,
"grad_norm": 3.3266341400886725,
"learning_rate": 6.056913118148122e-06,
"loss": 0.9947,
"step": 602
},
{
"epoch": 0.4495472790068141,
"eval_loss": 0.9621589183807373,
"eval_runtime": 162.1706,
"eval_samples_per_second": 111.173,
"eval_steps_per_second": 1.739,
"step": 602
},
{
"epoch": 0.4502940352842341,
"grad_norm": 2.760574168207489,
"learning_rate": 6.045081824019119e-06,
"loss": 1.0301,
"step": 603
},
{
"epoch": 0.45104079156165405,
"grad_norm": 4.395437647043472,
"learning_rate": 6.03324440778978e-06,
"loss": 0.9877,
"step": 604
},
{
"epoch": 0.45178754783907404,
"grad_norm": 2.9581029369662892,
"learning_rate": 6.021400938803813e-06,
"loss": 1.0625,
"step": 605
},
{
"epoch": 0.452534304116494,
"grad_norm": 4.9741175183125845,
"learning_rate": 6.009551486440387e-06,
"loss": 0.984,
"step": 606
},
{
"epoch": 0.4532810603939139,
"grad_norm": 3.083559757545901,
"learning_rate": 5.9976961201137155e-06,
"loss": 0.9369,
"step": 607
},
{
"epoch": 0.4540278166713339,
"grad_norm": 3.5027682562154014,
"learning_rate": 5.985834909272661e-06,
"loss": 1.0684,
"step": 608
},
{
"epoch": 0.45477457294875384,
"grad_norm": 4.142396452688926,
"learning_rate": 5.973967923400321e-06,
"loss": 1.0164,
"step": 609
},
{
"epoch": 0.45552132922617383,
"grad_norm": 2.8673317625362835,
"learning_rate": 5.9620952320136225e-06,
"loss": 0.9815,
"step": 610
},
{
"epoch": 0.45626808550359377,
"grad_norm": 3.021928936402679,
"learning_rate": 5.95021690466292e-06,
"loss": 1.0028,
"step": 611
},
{
"epoch": 0.4570148417810137,
"grad_norm": 3.2317785993970007,
"learning_rate": 5.938333010931578e-06,
"loss": 1.1046,
"step": 612
},
{
"epoch": 0.4577615980584337,
"grad_norm": 2.726268768685177,
"learning_rate": 5.926443620435572e-06,
"loss": 1.0048,
"step": 613
},
{
"epoch": 0.45850835433585363,
"grad_norm": 3.1064415558539706,
"learning_rate": 5.914548802823077e-06,
"loss": 0.9325,
"step": 614
},
{
"epoch": 0.4592551106132736,
"grad_norm": 2.6123605318850665,
"learning_rate": 5.902648627774059e-06,
"loss": 0.9743,
"step": 615
},
{
"epoch": 0.46000186689069356,
"grad_norm": 3.0579324009097517,
"learning_rate": 5.8907431649998695e-06,
"loss": 0.9612,
"step": 616
},
{
"epoch": 0.46000186689069356,
"eval_loss": 0.9606157541275024,
"eval_runtime": 160.5917,
"eval_samples_per_second": 112.266,
"eval_steps_per_second": 1.756,
"step": 616
},
{
"epoch": 0.4607486231681135,
"grad_norm": 3.772368202333684,
"learning_rate": 5.878832484242833e-06,
"loss": 1.0323,
"step": 617
},
{
"epoch": 0.4614953794455335,
"grad_norm": 3.0158939394506943,
"learning_rate": 5.866916655275846e-06,
"loss": 0.947,
"step": 618
},
{
"epoch": 0.4622421357229534,
"grad_norm": 2.828941271333206,
"learning_rate": 5.854995747901958e-06,
"loss": 0.9681,
"step": 619
},
{
"epoch": 0.46298889200037335,
"grad_norm": 5.490674513764256,
"learning_rate": 5.84306983195397e-06,
"loss": 1.0043,
"step": 620
},
{
"epoch": 0.46373564827779334,
"grad_norm": 3.6863195592071727,
"learning_rate": 5.831138977294025e-06,
"loss": 1.0246,
"step": 621
},
{
"epoch": 0.4644824045552133,
"grad_norm": 8.996745121039877,
"learning_rate": 5.819203253813194e-06,
"loss": 0.9893,
"step": 622
},
{
"epoch": 0.46522916083263327,
"grad_norm": 2.9938080204280606,
"learning_rate": 5.807262731431069e-06,
"loss": 0.9104,
"step": 623
},
{
"epoch": 0.4659759171100532,
"grad_norm": 3.2831084055526207,
"learning_rate": 5.795317480095361e-06,
"loss": 0.9939,
"step": 624
},
{
"epoch": 0.46672267338747314,
"grad_norm": 3.491567046308877,
"learning_rate": 5.783367569781474e-06,
"loss": 0.9589,
"step": 625
},
{
"epoch": 0.46746942966489313,
"grad_norm": 3.8464462248209936,
"learning_rate": 5.77141307049211e-06,
"loss": 0.9308,
"step": 626
},
{
"epoch": 0.46821618594231307,
"grad_norm": 3.518690870541806,
"learning_rate": 5.7594540522568495e-06,
"loss": 0.9067,
"step": 627
},
{
"epoch": 0.46896294221973306,
"grad_norm": 4.039900909656756,
"learning_rate": 5.7474905851317505e-06,
"loss": 0.9626,
"step": 628
},
{
"epoch": 0.469709698497153,
"grad_norm": 3.6025011295590863,
"learning_rate": 5.73552273919893e-06,
"loss": 0.9861,
"step": 629
},
{
"epoch": 0.47045645477457293,
"grad_norm": 4.493424057626318,
"learning_rate": 5.723550584566151e-06,
"loss": 0.9795,
"step": 630
},
{
"epoch": 0.47045645477457293,
"eval_loss": 0.9606096744537354,
"eval_runtime": 160.7417,
"eval_samples_per_second": 112.161,
"eval_steps_per_second": 1.754,
"step": 630
},
{
"epoch": 0.4712032110519929,
"grad_norm": 4.352468775772123,
"learning_rate": 5.711574191366427e-06,
"loss": 0.975,
"step": 631
},
{
"epoch": 0.47194996732941286,
"grad_norm": 4.037016067946189,
"learning_rate": 5.699593629757591e-06,
"loss": 1.0385,
"step": 632
},
{
"epoch": 0.47269672360683285,
"grad_norm": 2.9029927769111388,
"learning_rate": 5.6876089699219016e-06,
"loss": 1.0004,
"step": 633
},
{
"epoch": 0.4734434798842528,
"grad_norm": 6.004548534705236,
"learning_rate": 5.675620282065621e-06,
"loss": 0.9879,
"step": 634
},
{
"epoch": 0.4741902361616727,
"grad_norm": 3.391029290471578,
"learning_rate": 5.663627636418611e-06,
"loss": 1.064,
"step": 635
},
{
"epoch": 0.4749369924390927,
"grad_norm": 3.043307222730041,
"learning_rate": 5.651631103233914e-06,
"loss": 1.0092,
"step": 636
},
{
"epoch": 0.47568374871651264,
"grad_norm": 3.5752213883205366,
"learning_rate": 5.639630752787349e-06,
"loss": 0.9599,
"step": 637
},
{
"epoch": 0.4764305049939326,
"grad_norm": 3.1804375282053288,
"learning_rate": 5.627626655377094e-06,
"loss": 0.9713,
"step": 638
},
{
"epoch": 0.47717726127135257,
"grad_norm": 2.790112215997805,
"learning_rate": 5.6156188813232806e-06,
"loss": 0.9912,
"step": 639
},
{
"epoch": 0.4779240175487725,
"grad_norm": 3.246922721538881,
"learning_rate": 5.603607500967574e-06,
"loss": 0.963,
"step": 640
},
{
"epoch": 0.4786707738261925,
"grad_norm": 3.392869083327263,
"learning_rate": 5.591592584672767e-06,
"loss": 1.0094,
"step": 641
},
{
"epoch": 0.47941753010361243,
"grad_norm": 3.0902607501283668,
"learning_rate": 5.579574202822366e-06,
"loss": 0.9703,
"step": 642
},
{
"epoch": 0.48016428638103237,
"grad_norm": 2.974464180221207,
"learning_rate": 5.567552425820177e-06,
"loss": 0.9171,
"step": 643
},
{
"epoch": 0.48091104265845236,
"grad_norm": 2.821388532495256,
"learning_rate": 5.5555273240899e-06,
"loss": 0.9527,
"step": 644
},
{
"epoch": 0.48091104265845236,
"eval_loss": 0.9591115713119507,
"eval_runtime": 160.7107,
"eval_samples_per_second": 112.183,
"eval_steps_per_second": 1.755,
"step": 644
},
{
"epoch": 0.4816577989358723,
"grad_norm": 3.260572717739725,
"learning_rate": 5.543498968074704e-06,
"loss": 0.9426,
"step": 645
},
{
"epoch": 0.4824045552132923,
"grad_norm": 2.903284442683706,
"learning_rate": 5.531467428236827e-06,
"loss": 1.0116,
"step": 646
},
{
"epoch": 0.4831513114907122,
"grad_norm": 3.2692857499423456,
"learning_rate": 5.519432775057158e-06,
"loss": 1.0015,
"step": 647
},
{
"epoch": 0.48389806776813216,
"grad_norm": 3.5780325125931833,
"learning_rate": 5.507395079034816e-06,
"loss": 1.0247,
"step": 648
},
{
"epoch": 0.48464482404555215,
"grad_norm": 2.943060253168757,
"learning_rate": 5.4953544106867594e-06,
"loss": 0.8597,
"step": 649
},
{
"epoch": 0.4853915803229721,
"grad_norm": 3.7864502304307854,
"learning_rate": 5.4833108405473425e-06,
"loss": 0.9909,
"step": 650
},
{
"epoch": 0.4861383366003921,
"grad_norm": 3.34360958188669,
"learning_rate": 5.471264439167932e-06,
"loss": 1.0366,
"step": 651
},
{
"epoch": 0.486885092877812,
"grad_norm": 3.22674707296479,
"learning_rate": 5.45921527711647e-06,
"loss": 1.0114,
"step": 652
},
{
"epoch": 0.48763184915523194,
"grad_norm": 3.3442512460984632,
"learning_rate": 5.447163424977076e-06,
"loss": 0.9198,
"step": 653
},
{
"epoch": 0.48837860543265194,
"grad_norm": 3.4465116733186485,
"learning_rate": 5.4351089533496286e-06,
"loss": 0.9371,
"step": 654
},
{
"epoch": 0.48912536171007187,
"grad_norm": 3.6231881310417995,
"learning_rate": 5.423051932849348e-06,
"loss": 1.0369,
"step": 655
},
{
"epoch": 0.4898721179874918,
"grad_norm": 3.150691425065748,
"learning_rate": 5.410992434106387e-06,
"loss": 0.9355,
"step": 656
},
{
"epoch": 0.4906188742649118,
"grad_norm": 2.7706082715810365,
"learning_rate": 5.398930527765416e-06,
"loss": 0.9466,
"step": 657
},
{
"epoch": 0.49136563054233173,
"grad_norm": 2.9257751655736337,
"learning_rate": 5.386866284485212e-06,
"loss": 1.0128,
"step": 658
},
{
"epoch": 0.49136563054233173,
"eval_loss": 0.9578000903129578,
"eval_runtime": 160.6458,
"eval_samples_per_second": 112.228,
"eval_steps_per_second": 1.755,
"step": 658
},
{
"epoch": 0.4921123868197517,
"grad_norm": 4.572313571723131,
"learning_rate": 5.374799774938236e-06,
"loss": 1.0165,
"step": 659
},
{
"epoch": 0.49285914309717166,
"grad_norm": 3.254503964827434,
"learning_rate": 5.36273106981023e-06,
"loss": 0.961,
"step": 660
},
{
"epoch": 0.4936058993745916,
"grad_norm": 2.941336279988955,
"learning_rate": 5.350660239799795e-06,
"loss": 1.0056,
"step": 661
},
{
"epoch": 0.4943526556520116,
"grad_norm": 3.3699370670436863,
"learning_rate": 5.338587355617981e-06,
"loss": 0.9501,
"step": 662
},
{
"epoch": 0.4950994119294315,
"grad_norm": 3.1050200793960356,
"learning_rate": 5.326512487987871e-06,
"loss": 0.9597,
"step": 663
},
{
"epoch": 0.4958461682068515,
"grad_norm": 3.1508330708960584,
"learning_rate": 5.314435707644166e-06,
"loss": 0.9765,
"step": 664
},
{
"epoch": 0.49659292448427145,
"grad_norm": 3.5815299911038587,
"learning_rate": 5.3023570853327725e-06,
"loss": 0.9374,
"step": 665
},
{
"epoch": 0.4973396807616914,
"grad_norm": 2.8686531462477918,
"learning_rate": 5.290276691810388e-06,
"loss": 0.9601,
"step": 666
},
{
"epoch": 0.4980864370391114,
"grad_norm": 3.117674265246126,
"learning_rate": 5.278194597844083e-06,
"loss": 1.0391,
"step": 667
},
{
"epoch": 0.4988331933165313,
"grad_norm": 3.093494135534134,
"learning_rate": 5.266110874210893e-06,
"loss": 0.9477,
"step": 668
},
{
"epoch": 0.4995799495939513,
"grad_norm": 3.139075567298836,
"learning_rate": 5.2540255916974005e-06,
"loss": 1.0269,
"step": 669
},
{
"epoch": 0.5003267058713712,
"grad_norm": 3.3071077238436963,
"learning_rate": 5.241938821099313e-06,
"loss": 1.0532,
"step": 670
},
{
"epoch": 0.5010734621487912,
"grad_norm": 2.816646250547839,
"learning_rate": 5.229850633221063e-06,
"loss": 0.9031,
"step": 671
},
{
"epoch": 0.5018202184262112,
"grad_norm": 3.4889209023924144,
"learning_rate": 5.217761098875383e-06,
"loss": 0.9739,
"step": 672
},
{
"epoch": 0.5018202184262112,
"eval_loss": 0.9554553627967834,
"eval_runtime": 160.5922,
"eval_samples_per_second": 112.266,
"eval_steps_per_second": 1.756,
"step": 672
},
{
"epoch": 0.5025669747036311,
"grad_norm": 3.042163670813616,
"learning_rate": 5.205670288882889e-06,
"loss": 1.0226,
"step": 673
},
{
"epoch": 0.503313730981051,
"grad_norm": 3.5555861229297205,
"learning_rate": 5.19357827407168e-06,
"loss": 1.0199,
"step": 674
},
{
"epoch": 0.504060487258471,
"grad_norm": 2.6753462198941347,
"learning_rate": 5.181485125276898e-06,
"loss": 0.9951,
"step": 675
},
{
"epoch": 0.504807243535891,
"grad_norm": 3.430987000958468,
"learning_rate": 5.169390913340342e-06,
"loss": 0.9776,
"step": 676
},
{
"epoch": 0.505553999813311,
"grad_norm": 2.836448999390065,
"learning_rate": 5.157295709110031e-06,
"loss": 0.9318,
"step": 677
},
{
"epoch": 0.5063007560907309,
"grad_norm": 3.9055759931822704,
"learning_rate": 5.1451995834397975e-06,
"loss": 1.0254,
"step": 678
},
{
"epoch": 0.5070475123681508,
"grad_norm": 3.2262289649091387,
"learning_rate": 5.133102607188875e-06,
"loss": 1.0698,
"step": 679
},
{
"epoch": 0.5077942686455708,
"grad_norm": 6.111956461530283,
"learning_rate": 5.121004851221477e-06,
"loss": 0.9786,
"step": 680
},
{
"epoch": 0.5085410249229908,
"grad_norm": 3.4939998439456303,
"learning_rate": 5.108906386406385e-06,
"loss": 0.9358,
"step": 681
},
{
"epoch": 0.5092877812004107,
"grad_norm": 2.927278408485738,
"learning_rate": 5.096807283616535e-06,
"loss": 0.964,
"step": 682
},
{
"epoch": 0.5100345374778307,
"grad_norm": 3.476986083429208,
"learning_rate": 5.084707613728598e-06,
"loss": 0.9,
"step": 683
},
{
"epoch": 0.5107812937552506,
"grad_norm": 3.775237997290502,
"learning_rate": 5.0726074476225675e-06,
"loss": 0.969,
"step": 684
},
{
"epoch": 0.5115280500326705,
"grad_norm": 4.127873726003733,
"learning_rate": 5.060506856181342e-06,
"loss": 1.0825,
"step": 685
},
{
"epoch": 0.5122748063100906,
"grad_norm": 3.435131733296269,
"learning_rate": 5.0484059102903174e-06,
"loss": 0.979,
"step": 686
},
{
"epoch": 0.5122748063100906,
"eval_loss": 0.9553431272506714,
"eval_runtime": 160.5802,
"eval_samples_per_second": 112.274,
"eval_steps_per_second": 1.756,
"step": 686
},
{
"epoch": 0.5130215625875105,
"grad_norm": 3.339425996015394,
"learning_rate": 5.036304680836959e-06,
"loss": 1.0607,
"step": 687
},
{
"epoch": 0.5137683188649305,
"grad_norm": 4.076321482116909,
"learning_rate": 5.0242032387103974e-06,
"loss": 1.0518,
"step": 688
},
{
"epoch": 0.5145150751423504,
"grad_norm": 3.089030123276057,
"learning_rate": 5.01210165480101e-06,
"loss": 0.9193,
"step": 689
},
{
"epoch": 0.5152618314197703,
"grad_norm": 4.513388204954891,
"learning_rate": 5e-06,
"loss": 1.036,
"step": 690
},
{
"epoch": 0.5160085876971904,
"grad_norm": 3.369925467145746,
"learning_rate": 4.9878983451989904e-06,
"loss": 0.9661,
"step": 691
},
{
"epoch": 0.5167553439746103,
"grad_norm": 3.2836780434682993,
"learning_rate": 4.9757967612896025e-06,
"loss": 0.9677,
"step": 692
},
{
"epoch": 0.5175021002520303,
"grad_norm": 4.0775074108281215,
"learning_rate": 4.963695319163041e-06,
"loss": 1.0827,
"step": 693
},
{
"epoch": 0.5182488565294502,
"grad_norm": 3.325029804236844,
"learning_rate": 4.951594089709685e-06,
"loss": 1.0138,
"step": 694
},
{
"epoch": 0.5189956128068701,
"grad_norm": 3.5885563133061487,
"learning_rate": 4.939493143818659e-06,
"loss": 1.0677,
"step": 695
},
{
"epoch": 0.5197423690842902,
"grad_norm": 3.156282854149789,
"learning_rate": 4.927392552377434e-06,
"loss": 0.9617,
"step": 696
},
{
"epoch": 0.5204891253617101,
"grad_norm": 3.0416344384601026,
"learning_rate": 4.915292386271403e-06,
"loss": 0.9855,
"step": 697
},
{
"epoch": 0.52123588163913,
"grad_norm": 3.2733777511786095,
"learning_rate": 4.9031927163834655e-06,
"loss": 0.8939,
"step": 698
},
{
"epoch": 0.52198263791655,
"grad_norm": 3.398814579607184,
"learning_rate": 4.891093613593615e-06,
"loss": 0.9741,
"step": 699
},
{
"epoch": 0.5227293941939699,
"grad_norm": 3.1153916205245777,
"learning_rate": 4.878995148778525e-06,
"loss": 0.9968,
"step": 700
},
{
"epoch": 0.5227293941939699,
"eval_loss": 0.9541786313056946,
"eval_runtime": 162.0716,
"eval_samples_per_second": 111.241,
"eval_steps_per_second": 1.74,
"step": 700
},
{
"epoch": 0.5234761504713898,
"grad_norm": 3.3647452892177117,
"learning_rate": 4.866897392811127e-06,
"loss": 0.9962,
"step": 701
},
{
"epoch": 0.5242229067488099,
"grad_norm": 3.113204959768887,
"learning_rate": 4.854800416560205e-06,
"loss": 1.0054,
"step": 702
},
{
"epoch": 0.5249696630262298,
"grad_norm": 3.8495342835228583,
"learning_rate": 4.842704290889971e-06,
"loss": 0.8952,
"step": 703
},
{
"epoch": 0.5257164193036498,
"grad_norm": 3.174795087074365,
"learning_rate": 4.830609086659659e-06,
"loss": 0.9851,
"step": 704
},
{
"epoch": 0.5264631755810697,
"grad_norm": 2.729342539133593,
"learning_rate": 4.818514874723103e-06,
"loss": 0.9213,
"step": 705
},
{
"epoch": 0.5272099318584896,
"grad_norm": 3.076282518758111,
"learning_rate": 4.806421725928323e-06,
"loss": 0.9964,
"step": 706
},
{
"epoch": 0.5279566881359097,
"grad_norm": 3.2836882108192165,
"learning_rate": 4.7943297111171115e-06,
"loss": 0.9746,
"step": 707
},
{
"epoch": 0.5287034444133296,
"grad_norm": 3.8352591409956336,
"learning_rate": 4.782238901124618e-06,
"loss": 1.043,
"step": 708
},
{
"epoch": 0.5294502006907496,
"grad_norm": 2.7535052802015634,
"learning_rate": 4.770149366778938e-06,
"loss": 0.8983,
"step": 709
},
{
"epoch": 0.5301969569681695,
"grad_norm": 3.852480148863445,
"learning_rate": 4.758061178900687e-06,
"loss": 0.9914,
"step": 710
},
{
"epoch": 0.5309437132455894,
"grad_norm": 3.4470151143902337,
"learning_rate": 4.745974408302602e-06,
"loss": 0.9474,
"step": 711
},
{
"epoch": 0.5316904695230095,
"grad_norm": 2.5890177372536516,
"learning_rate": 4.7338891257891085e-06,
"loss": 0.9773,
"step": 712
},
{
"epoch": 0.5324372258004294,
"grad_norm": 4.222415888473242,
"learning_rate": 4.721805402155919e-06,
"loss": 1.0616,
"step": 713
},
{
"epoch": 0.5331839820778493,
"grad_norm": 3.0869687607345395,
"learning_rate": 4.709723308189614e-06,
"loss": 1.0079,
"step": 714
},
{
"epoch": 0.5331839820778493,
"eval_loss": 0.9527038931846619,
"eval_runtime": 160.8101,
"eval_samples_per_second": 112.114,
"eval_steps_per_second": 1.754,
"step": 714
},
{
"epoch": 0.5339307383552693,
"grad_norm": 3.670148127602683,
"learning_rate": 4.697642914667229e-06,
"loss": 1.0192,
"step": 715
},
{
"epoch": 0.5346774946326892,
"grad_norm": 3.107343398040105,
"learning_rate": 4.6855642923558345e-06,
"loss": 0.9228,
"step": 716
},
{
"epoch": 0.5354242509101093,
"grad_norm": 3.5996371455869367,
"learning_rate": 4.67348751201213e-06,
"loss": 1.0376,
"step": 717
},
{
"epoch": 0.5361710071875292,
"grad_norm": 3.1878765692346605,
"learning_rate": 4.661412644382021e-06,
"loss": 0.9582,
"step": 718
},
{
"epoch": 0.5369177634649491,
"grad_norm": 3.771190985798084,
"learning_rate": 4.649339760200206e-06,
"loss": 0.9596,
"step": 719
},
{
"epoch": 0.5376645197423691,
"grad_norm": 3.38014392455014,
"learning_rate": 4.637268930189772e-06,
"loss": 0.9532,
"step": 720
},
{
"epoch": 0.538411276019789,
"grad_norm": 3.8749251965793965,
"learning_rate": 4.625200225061765e-06,
"loss": 1.0393,
"step": 721
},
{
"epoch": 0.539158032297209,
"grad_norm": 3.9117960489662127,
"learning_rate": 4.61313371551479e-06,
"loss": 1.006,
"step": 722
},
{
"epoch": 0.539904788574629,
"grad_norm": 2.8086121993308857,
"learning_rate": 4.601069472234584e-06,
"loss": 0.9323,
"step": 723
},
{
"epoch": 0.5406515448520489,
"grad_norm": 2.7551273219778825,
"learning_rate": 4.589007565893615e-06,
"loss": 0.9317,
"step": 724
},
{
"epoch": 0.5413983011294689,
"grad_norm": 3.362198847742376,
"learning_rate": 4.576948067150655e-06,
"loss": 0.9358,
"step": 725
},
{
"epoch": 0.5421450574068888,
"grad_norm": 3.5974105806012235,
"learning_rate": 4.564891046650373e-06,
"loss": 1.0596,
"step": 726
},
{
"epoch": 0.5428918136843088,
"grad_norm": 3.552430736775191,
"learning_rate": 4.552836575022925e-06,
"loss": 0.9611,
"step": 727
},
{
"epoch": 0.5436385699617288,
"grad_norm": 2.573569543965171,
"learning_rate": 4.540784722883532e-06,
"loss": 0.9552,
"step": 728
},
{
"epoch": 0.5436385699617288,
"eval_loss": 0.951053261756897,
"eval_runtime": 160.6105,
"eval_samples_per_second": 112.253,
"eval_steps_per_second": 1.756,
"step": 728
},
{
"epoch": 0.5443853262391487,
"grad_norm": 3.565425106205681,
"learning_rate": 4.528735560832071e-06,
"loss": 1.0667,
"step": 729
},
{
"epoch": 0.5451320825165686,
"grad_norm": 2.7926401591953383,
"learning_rate": 4.51668915945266e-06,
"loss": 0.976,
"step": 730
},
{
"epoch": 0.5458788387939886,
"grad_norm": 3.0421797900802003,
"learning_rate": 4.504645589313243e-06,
"loss": 1.0462,
"step": 731
},
{
"epoch": 0.5466255950714086,
"grad_norm": 3.503454575591918,
"learning_rate": 4.492604920965185e-06,
"loss": 1.0857,
"step": 732
},
{
"epoch": 0.5473723513488286,
"grad_norm": 2.904815956092458,
"learning_rate": 4.480567224942845e-06,
"loss": 1.0072,
"step": 733
},
{
"epoch": 0.5481191076262485,
"grad_norm": 3.5102905705105742,
"learning_rate": 4.468532571763174e-06,
"loss": 0.9472,
"step": 734
},
{
"epoch": 0.5488658639036684,
"grad_norm": 3.7183800704590033,
"learning_rate": 4.456501031925297e-06,
"loss": 1.0463,
"step": 735
},
{
"epoch": 0.5496126201810884,
"grad_norm": 2.9419058702603555,
"learning_rate": 4.444472675910103e-06,
"loss": 1.031,
"step": 736
},
{
"epoch": 0.5503593764585084,
"grad_norm": 3.9094266797244877,
"learning_rate": 4.4324475741798235e-06,
"loss": 0.9969,
"step": 737
},
{
"epoch": 0.5511061327359283,
"grad_norm": 3.20510351193743,
"learning_rate": 4.420425797177637e-06,
"loss": 1.0058,
"step": 738
},
{
"epoch": 0.5518528890133483,
"grad_norm": 2.862584885372328,
"learning_rate": 4.4084074153272346e-06,
"loss": 0.9266,
"step": 739
},
{
"epoch": 0.5525996452907682,
"grad_norm": 3.230765906011071,
"learning_rate": 4.396392499032428e-06,
"loss": 0.928,
"step": 740
},
{
"epoch": 0.5533464015681882,
"grad_norm": 7.447272293937854,
"learning_rate": 4.38438111867672e-06,
"loss": 1.0577,
"step": 741
},
{
"epoch": 0.5540931578456081,
"grad_norm": 4.421418489574816,
"learning_rate": 4.372373344622906e-06,
"loss": 1.0461,
"step": 742
},
{
"epoch": 0.5540931578456081,
"eval_loss": 0.9515445232391357,
"eval_runtime": 162.0634,
"eval_samples_per_second": 111.247,
"eval_steps_per_second": 1.74,
"step": 742
},
{
"epoch": 0.5548399141230281,
"grad_norm": 2.7712010504789544,
"learning_rate": 4.360369247212653e-06,
"loss": 0.988,
"step": 743
},
{
"epoch": 0.5555866704004481,
"grad_norm": 2.8683975287912435,
"learning_rate": 4.3483688967660875e-06,
"loss": 0.8898,
"step": 744
},
{
"epoch": 0.556333426677868,
"grad_norm": 3.3419411279482003,
"learning_rate": 4.336372363581391e-06,
"loss": 1.0419,
"step": 745
},
{
"epoch": 0.5570801829552879,
"grad_norm": 3.422260194867919,
"learning_rate": 4.3243797179343795e-06,
"loss": 0.9626,
"step": 746
},
{
"epoch": 0.5578269392327079,
"grad_norm": 3.932467663326973,
"learning_rate": 4.3123910300781e-06,
"loss": 1.0097,
"step": 747
},
{
"epoch": 0.5585736955101279,
"grad_norm": 3.220509235991095,
"learning_rate": 4.300406370242409e-06,
"loss": 0.9915,
"step": 748
},
{
"epoch": 0.5593204517875479,
"grad_norm": 3.1936031478949873,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.958,
"step": 749
},
{
"epoch": 0.5600672080649678,
"grad_norm": 3.1854522468121935,
"learning_rate": 4.276449415433851e-06,
"loss": 0.9964,
"step": 750
},
{
"epoch": 0.5608139643423877,
"grad_norm": 2.6714557730618553,
"learning_rate": 4.264477260801072e-06,
"loss": 1.0101,
"step": 751
},
{
"epoch": 0.5615607206198077,
"grad_norm": 3.647313021227066,
"learning_rate": 4.25250941486825e-06,
"loss": 0.9056,
"step": 752
},
{
"epoch": 0.5623074768972277,
"grad_norm": 3.3124446614726826,
"learning_rate": 4.2405459477431505e-06,
"loss": 0.9657,
"step": 753
},
{
"epoch": 0.5630542331746476,
"grad_norm": 3.4799141929813904,
"learning_rate": 4.228586929507892e-06,
"loss": 0.9853,
"step": 754
},
{
"epoch": 0.5638009894520676,
"grad_norm": 3.506153943735322,
"learning_rate": 4.216632430218528e-06,
"loss": 0.9887,
"step": 755
},
{
"epoch": 0.5645477457294875,
"grad_norm": 2.5744780478609335,
"learning_rate": 4.204682519904641e-06,
"loss": 0.936,
"step": 756
},
{
"epoch": 0.5645477457294875,
"eval_loss": 0.9495623707771301,
"eval_runtime": 160.7518,
"eval_samples_per_second": 112.154,
"eval_steps_per_second": 1.754,
"step": 756
},
{
"epoch": 0.5652945020069075,
"grad_norm": 3.098794189486562,
"learning_rate": 4.1927372685689315e-06,
"loss": 0.9661,
"step": 757
},
{
"epoch": 0.5660412582843275,
"grad_norm": 3.4619013278607578,
"learning_rate": 4.180796746186808e-06,
"loss": 0.9115,
"step": 758
},
{
"epoch": 0.5667880145617474,
"grad_norm": 3.5592551507334433,
"learning_rate": 4.168861022705976e-06,
"loss": 1.0279,
"step": 759
},
{
"epoch": 0.5675347708391674,
"grad_norm": 3.57134196506803,
"learning_rate": 4.1569301680460304e-06,
"loss": 0.9903,
"step": 760
},
{
"epoch": 0.5682815271165873,
"grad_norm": 2.9011475792179664,
"learning_rate": 4.145004252098044e-06,
"loss": 0.9266,
"step": 761
},
{
"epoch": 0.5690282833940072,
"grad_norm": 3.3121205852213564,
"learning_rate": 4.133083344724156e-06,
"loss": 0.9647,
"step": 762
},
{
"epoch": 0.5697750396714273,
"grad_norm": 4.324275423359752,
"learning_rate": 4.121167515757168e-06,
"loss": 0.926,
"step": 763
},
{
"epoch": 0.5705217959488472,
"grad_norm": 2.8518532935568475,
"learning_rate": 4.109256835000132e-06,
"loss": 1.0484,
"step": 764
},
{
"epoch": 0.5712685522262672,
"grad_norm": 2.8603023862843737,
"learning_rate": 4.097351372225943e-06,
"loss": 0.9625,
"step": 765
},
{
"epoch": 0.5720153085036871,
"grad_norm": 3.2495297372313936,
"learning_rate": 4.085451197176924e-06,
"loss": 0.9934,
"step": 766
},
{
"epoch": 0.572762064781107,
"grad_norm": 2.9168937333210736,
"learning_rate": 4.073556379564429e-06,
"loss": 0.9692,
"step": 767
},
{
"epoch": 0.5735088210585271,
"grad_norm": 3.4739754957829994,
"learning_rate": 4.061666989068423e-06,
"loss": 1.0596,
"step": 768
},
{
"epoch": 0.574255577335947,
"grad_norm": 2.524612817650384,
"learning_rate": 4.049783095337081e-06,
"loss": 0.934,
"step": 769
},
{
"epoch": 0.575002333613367,
"grad_norm": 3.1228879201643713,
"learning_rate": 4.037904767986378e-06,
"loss": 0.9707,
"step": 770
},
{
"epoch": 0.575002333613367,
"eval_loss": 0.9477403163909912,
"eval_runtime": 162.0633,
"eval_samples_per_second": 111.247,
"eval_steps_per_second": 1.74,
"step": 770
},
{
"epoch": 0.5757490898907869,
"grad_norm": 3.257360843044048,
"learning_rate": 4.026032076599681e-06,
"loss": 0.9881,
"step": 771
},
{
"epoch": 0.5764958461682068,
"grad_norm": 4.000425859190567,
"learning_rate": 4.014165090727341e-06,
"loss": 0.9536,
"step": 772
},
{
"epoch": 0.5772426024456269,
"grad_norm": 2.753962998604683,
"learning_rate": 4.002303879886288e-06,
"loss": 0.9246,
"step": 773
},
{
"epoch": 0.5779893587230468,
"grad_norm": 3.7186990398787967,
"learning_rate": 3.990448513559615e-06,
"loss": 1.0088,
"step": 774
},
{
"epoch": 0.5787361150004667,
"grad_norm": 3.158075187326005,
"learning_rate": 3.978599061196188e-06,
"loss": 0.9959,
"step": 775
},
{
"epoch": 0.5794828712778867,
"grad_norm": 4.303108732794814,
"learning_rate": 3.9667555922102214e-06,
"loss": 1.0725,
"step": 776
},
{
"epoch": 0.5802296275553066,
"grad_norm": 2.870021324090936,
"learning_rate": 3.954918175980882e-06,
"loss": 0.924,
"step": 777
},
{
"epoch": 0.5809763838327265,
"grad_norm": 4.264645613049714,
"learning_rate": 3.9430868818518786e-06,
"loss": 0.9837,
"step": 778
},
{
"epoch": 0.5817231401101466,
"grad_norm": 3.3495172772032986,
"learning_rate": 3.931261779131058e-06,
"loss": 0.9841,
"step": 779
},
{
"epoch": 0.5824698963875665,
"grad_norm": 3.246610836443149,
"learning_rate": 3.919442937089996e-06,
"loss": 0.9947,
"step": 780
},
{
"epoch": 0.5832166526649865,
"grad_norm": 2.7004006268478444,
"learning_rate": 3.9076304249635905e-06,
"loss": 0.9138,
"step": 781
},
{
"epoch": 0.5839634089424064,
"grad_norm": 2.7834796179870325,
"learning_rate": 3.895824311949665e-06,
"loss": 1.0329,
"step": 782
},
{
"epoch": 0.5847101652198263,
"grad_norm": 2.8998183145518124,
"learning_rate": 3.884024667208556e-06,
"loss": 1.0747,
"step": 783
},
{
"epoch": 0.5854569214972464,
"grad_norm": 3.5440597227303194,
"learning_rate": 3.872231559862702e-06,
"loss": 0.9257,
"step": 784
},
{
"epoch": 0.5854569214972464,
"eval_loss": 0.9469641447067261,
"eval_runtime": 160.547,
"eval_samples_per_second": 112.297,
"eval_steps_per_second": 1.756,
"step": 784
},
{
"epoch": 0.5862036777746663,
"grad_norm": 3.3382847763731767,
"learning_rate": 3.860445058996255e-06,
"loss": 0.9998,
"step": 785
},
{
"epoch": 0.5869504340520862,
"grad_norm": 2.6762939914119226,
"learning_rate": 3.848665233654658e-06,
"loss": 0.9951,
"step": 786
},
{
"epoch": 0.5876971903295062,
"grad_norm": 2.4239663023709093,
"learning_rate": 3.836892152844251e-06,
"loss": 1.0196,
"step": 787
},
{
"epoch": 0.5884439466069261,
"grad_norm": 2.7049762394604313,
"learning_rate": 3.825125885531869e-06,
"loss": 0.9878,
"step": 788
},
{
"epoch": 0.5891907028843462,
"grad_norm": 3.486650028911791,
"learning_rate": 3.813366500644426e-06,
"loss": 0.977,
"step": 789
},
{
"epoch": 0.5899374591617661,
"grad_norm": 3.9630866861155116,
"learning_rate": 3.8016140670685263e-06,
"loss": 0.9626,
"step": 790
},
{
"epoch": 0.590684215439186,
"grad_norm": 4.033104870432513,
"learning_rate": 3.789868653650046e-06,
"loss": 0.9761,
"step": 791
},
{
"epoch": 0.591430971716606,
"grad_norm": 3.346871826227189,
"learning_rate": 3.7781303291937453e-06,
"loss": 1.0169,
"step": 792
},
{
"epoch": 0.5921777279940259,
"grad_norm": 3.936340048603582,
"learning_rate": 3.7663991624628495e-06,
"loss": 0.8851,
"step": 793
},
{
"epoch": 0.592924484271446,
"grad_norm": 3.6485292207922058,
"learning_rate": 3.7546752221786553e-06,
"loss": 0.8865,
"step": 794
},
{
"epoch": 0.5936712405488659,
"grad_norm": 3.6264257513729343,
"learning_rate": 3.7429585770201314e-06,
"loss": 0.9835,
"step": 795
},
{
"epoch": 0.5944179968262858,
"grad_norm": 4.573602430354129,
"learning_rate": 3.7312492956235058e-06,
"loss": 1.0293,
"step": 796
},
{
"epoch": 0.5951647531037058,
"grad_norm": 2.85790348242624,
"learning_rate": 3.719547446581876e-06,
"loss": 0.976,
"step": 797
},
{
"epoch": 0.5959115093811257,
"grad_norm": 2.8881328636872623,
"learning_rate": 3.7078530984447956e-06,
"loss": 0.9718,
"step": 798
},
{
"epoch": 0.5959115093811257,
"eval_loss": 0.9462713599205017,
"eval_runtime": 160.8542,
"eval_samples_per_second": 112.083,
"eval_steps_per_second": 1.753,
"step": 798
},
{
"epoch": 0.5966582656585457,
"grad_norm": 3.2246121361724254,
"learning_rate": 3.6961663197178767e-06,
"loss": 0.8918,
"step": 799
},
{
"epoch": 0.5974050219359657,
"grad_norm": 4.065115346169568,
"learning_rate": 3.6844871788623946e-06,
"loss": 0.9987,
"step": 800
},
{
"epoch": 0.5981517782133856,
"grad_norm": 3.1428749321319236,
"learning_rate": 3.6728157442948786e-06,
"loss": 1.0329,
"step": 801
},
{
"epoch": 0.5988985344908055,
"grad_norm": 3.2108025912704763,
"learning_rate": 3.6611520843867155e-06,
"loss": 0.9817,
"step": 802
},
{
"epoch": 0.5996452907682255,
"grad_norm": 3.3376030924698674,
"learning_rate": 3.649496267463749e-06,
"loss": 0.9244,
"step": 803
},
{
"epoch": 0.6003920470456455,
"grad_norm": 2.797596396920881,
"learning_rate": 3.6378483618058774e-06,
"loss": 0.9516,
"step": 804
},
{
"epoch": 0.6011388033230655,
"grad_norm": 2.6634774502196605,
"learning_rate": 3.626208435646652e-06,
"loss": 0.9045,
"step": 805
},
{
"epoch": 0.6018855596004854,
"grad_norm": 4.020178611133311,
"learning_rate": 3.6145765571728863e-06,
"loss": 0.997,
"step": 806
},
{
"epoch": 0.6026323158779053,
"grad_norm": 3.4517830596099497,
"learning_rate": 3.6029527945242436e-06,
"loss": 0.9844,
"step": 807
},
{
"epoch": 0.6033790721553253,
"grad_norm": 3.408570855279641,
"learning_rate": 3.5913372157928515e-06,
"loss": 0.9571,
"step": 808
},
{
"epoch": 0.6041258284327453,
"grad_norm": 3.102469639035291,
"learning_rate": 3.5797298890228903e-06,
"loss": 0.9399,
"step": 809
},
{
"epoch": 0.6048725847101653,
"grad_norm": 4.116854993159461,
"learning_rate": 3.5681308822102046e-06,
"loss": 1.0615,
"step": 810
},
{
"epoch": 0.6056193409875852,
"grad_norm": 3.3466880091581372,
"learning_rate": 3.5565402633018963e-06,
"loss": 1.0105,
"step": 811
},
{
"epoch": 0.6063660972650051,
"grad_norm": 2.8545068302805126,
"learning_rate": 3.5449581001959327e-06,
"loss": 1.0286,
"step": 812
},
{
"epoch": 0.6063660972650051,
"eval_loss": 0.9455689191818237,
"eval_runtime": 160.9732,
"eval_samples_per_second": 112.0,
"eval_steps_per_second": 1.752,
"step": 812
},
{
"epoch": 0.6071128535424251,
"grad_norm": 3.155802958052963,
"learning_rate": 3.5333844607407497e-06,
"loss": 0.924,
"step": 813
},
{
"epoch": 0.607859609819845,
"grad_norm": 2.9985600832229333,
"learning_rate": 3.521819412734846e-06,
"loss": 1.0208,
"step": 814
},
{
"epoch": 0.608606366097265,
"grad_norm": 3.456904877508385,
"learning_rate": 3.5102630239263986e-06,
"loss": 0.9405,
"step": 815
},
{
"epoch": 0.609353122374685,
"grad_norm": 3.506919400122488,
"learning_rate": 3.4987153620128534e-06,
"loss": 1.0131,
"step": 816
},
{
"epoch": 0.6100998786521049,
"grad_norm": 3.562709368000536,
"learning_rate": 3.487176494640533e-06,
"loss": 1.0204,
"step": 817
},
{
"epoch": 0.6108466349295248,
"grad_norm": 3.029728546928659,
"learning_rate": 3.4756464894042475e-06,
"loss": 0.9355,
"step": 818
},
{
"epoch": 0.6115933912069448,
"grad_norm": 4.701806928246403,
"learning_rate": 3.464125413846886e-06,
"loss": 1.0966,
"step": 819
},
{
"epoch": 0.6123401474843648,
"grad_norm": 4.345487604862701,
"learning_rate": 3.4526133354590342e-06,
"loss": 1.0304,
"step": 820
},
{
"epoch": 0.6130869037617848,
"grad_norm": 2.9882707953224097,
"learning_rate": 3.4411103216785645e-06,
"loss": 1.0434,
"step": 821
},
{
"epoch": 0.6138336600392047,
"grad_norm": 3.6824324317588997,
"learning_rate": 3.4296164398902576e-06,
"loss": 1.0781,
"step": 822
},
{
"epoch": 0.6145804163166246,
"grad_norm": 3.1022094976858012,
"learning_rate": 3.418131757425389e-06,
"loss": 0.9307,
"step": 823
},
{
"epoch": 0.6153271725940446,
"grad_norm": 6.184516709697887,
"learning_rate": 3.4066563415613523e-06,
"loss": 0.9767,
"step": 824
},
{
"epoch": 0.6160739288714646,
"grad_norm": 3.18927582757545,
"learning_rate": 3.395190259521254e-06,
"loss": 0.9867,
"step": 825
},
{
"epoch": 0.6168206851488846,
"grad_norm": 3.103890715286569,
"learning_rate": 3.3837335784735244e-06,
"loss": 0.9872,
"step": 826
},
{
"epoch": 0.6168206851488846,
"eval_loss": 0.9444602727890015,
"eval_runtime": 160.7087,
"eval_samples_per_second": 112.184,
"eval_steps_per_second": 1.755,
"step": 826
},
{
"epoch": 0.6175674414263045,
"grad_norm": 4.650015908882197,
"learning_rate": 3.37228636553152e-06,
"loss": 0.9987,
"step": 827
},
{
"epoch": 0.6183141977037244,
"grad_norm": 3.5399264590081265,
"learning_rate": 3.360848687753138e-06,
"loss": 0.9564,
"step": 828
},
{
"epoch": 0.6190609539811444,
"grad_norm": 3.0564069474101756,
"learning_rate": 3.349420612140412e-06,
"loss": 0.936,
"step": 829
},
{
"epoch": 0.6198077102585644,
"grad_norm": 3.6031607198080886,
"learning_rate": 3.33800220563913e-06,
"loss": 0.9446,
"step": 830
},
{
"epoch": 0.6205544665359843,
"grad_norm": 3.4094903377690953,
"learning_rate": 3.3265935351384386e-06,
"loss": 0.9977,
"step": 831
},
{
"epoch": 0.6213012228134043,
"grad_norm": 2.848149256988196,
"learning_rate": 3.3151946674704487e-06,
"loss": 0.9697,
"step": 832
},
{
"epoch": 0.6220479790908242,
"grad_norm": 3.0079233857559977,
"learning_rate": 3.3038056694098485e-06,
"loss": 0.9622,
"step": 833
},
{
"epoch": 0.6227947353682441,
"grad_norm": 3.518051937950098,
"learning_rate": 3.2924266076735094e-06,
"loss": 0.9171,
"step": 834
},
{
"epoch": 0.6235414916456642,
"grad_norm": 3.3430473288801013,
"learning_rate": 3.281057548920091e-06,
"loss": 0.9487,
"step": 835
},
{
"epoch": 0.6242882479230841,
"grad_norm": 3.258832229112217,
"learning_rate": 3.2696985597496633e-06,
"loss": 0.9402,
"step": 836
},
{
"epoch": 0.6250350042005041,
"grad_norm": 4.025430224245148,
"learning_rate": 3.258349706703302e-06,
"loss": 1.035,
"step": 837
},
{
"epoch": 0.625781760477924,
"grad_norm": 3.6604686310111374,
"learning_rate": 3.24701105626271e-06,
"loss": 0.9754,
"step": 838
},
{
"epoch": 0.6265285167553439,
"grad_norm": 3.25423601241418,
"learning_rate": 3.2356826748498182e-06,
"loss": 0.984,
"step": 839
},
{
"epoch": 0.627275273032764,
"grad_norm": 3.28584168969409,
"learning_rate": 3.2243646288264073e-06,
"loss": 0.9791,
"step": 840
},
{
"epoch": 0.627275273032764,
"eval_loss": 0.9437100291252136,
"eval_runtime": 160.7821,
"eval_samples_per_second": 112.133,
"eval_steps_per_second": 1.754,
"step": 840
},
{
"epoch": 0.6280220293101839,
"grad_norm": 4.366912401633733,
"learning_rate": 3.2130569844937097e-06,
"loss": 0.9599,
"step": 841
},
{
"epoch": 0.6287687855876039,
"grad_norm": 2.9219395535212587,
"learning_rate": 3.2017598080920224e-06,
"loss": 0.9939,
"step": 842
},
{
"epoch": 0.6295155418650238,
"grad_norm": 3.786606223724064,
"learning_rate": 3.1904731658003264e-06,
"loss": 0.9879,
"step": 843
},
{
"epoch": 0.6302622981424437,
"grad_norm": 3.3886784078360432,
"learning_rate": 3.1791971237358893e-06,
"loss": 0.9483,
"step": 844
},
{
"epoch": 0.6310090544198638,
"grad_norm": 2.644489445271042,
"learning_rate": 3.1679317479538864e-06,
"loss": 0.9832,
"step": 845
},
{
"epoch": 0.6317558106972837,
"grad_norm": 3.2481495332858237,
"learning_rate": 3.1566771044470057e-06,
"loss": 1.0207,
"step": 846
},
{
"epoch": 0.6325025669747036,
"grad_norm": 3.053150214876181,
"learning_rate": 3.1454332591450697e-06,
"loss": 1.0517,
"step": 847
},
{
"epoch": 0.6332493232521236,
"grad_norm": 3.1143024638740635,
"learning_rate": 3.1342002779146398e-06,
"loss": 0.8988,
"step": 848
},
{
"epoch": 0.6339960795295435,
"grad_norm": 3.123010178193279,
"learning_rate": 3.122978226558641e-06,
"loss": 0.9868,
"step": 849
},
{
"epoch": 0.6347428358069636,
"grad_norm": 3.1659905184153048,
"learning_rate": 3.1117671708159665e-06,
"loss": 1.0189,
"step": 850
},
{
"epoch": 0.6354895920843835,
"grad_norm": 2.875486846492392,
"learning_rate": 3.1005671763611003e-06,
"loss": 0.9028,
"step": 851
},
{
"epoch": 0.6362363483618034,
"grad_norm": 2.9689142045380406,
"learning_rate": 3.0893783088037264e-06,
"loss": 0.8993,
"step": 852
},
{
"epoch": 0.6369831046392234,
"grad_norm": 3.6622092513517517,
"learning_rate": 3.078200633688352e-06,
"loss": 0.9756,
"step": 853
},
{
"epoch": 0.6377298609166433,
"grad_norm": 3.8793883255926582,
"learning_rate": 3.0670342164939126e-06,
"loss": 0.9594,
"step": 854
},
{
"epoch": 0.6377298609166433,
"eval_loss": 0.9425017237663269,
"eval_runtime": 160.7149,
"eval_samples_per_second": 112.18,
"eval_steps_per_second": 1.755,
"step": 854
},
{
"epoch": 0.6384766171940632,
"grad_norm": 2.9732643744244034,
"learning_rate": 3.0558791226333974e-06,
"loss": 0.9466,
"step": 855
},
{
"epoch": 0.6392233734714833,
"grad_norm": 2.548903115004312,
"learning_rate": 3.044735417453466e-06,
"loss": 0.9401,
"step": 856
},
{
"epoch": 0.6399701297489032,
"grad_norm": 2.9378623619360753,
"learning_rate": 3.033603166234059e-06,
"loss": 0.9978,
"step": 857
},
{
"epoch": 0.6407168860263232,
"grad_norm": 3.3994193331015143,
"learning_rate": 3.0224824341880226e-06,
"loss": 0.9302,
"step": 858
},
{
"epoch": 0.6414636423037431,
"grad_norm": 3.7318362244634526,
"learning_rate": 3.0113732864607236e-06,
"loss": 0.9188,
"step": 859
},
{
"epoch": 0.642210398581163,
"grad_norm": 3.6257765842788543,
"learning_rate": 3.000275788129662e-06,
"loss": 0.9515,
"step": 860
},
{
"epoch": 0.6429571548585831,
"grad_norm": 2.741192045611565,
"learning_rate": 2.9891900042041043e-06,
"loss": 0.9686,
"step": 861
},
{
"epoch": 0.643703911136003,
"grad_norm": 3.262238253576203,
"learning_rate": 2.978115999624686e-06,
"loss": 0.9904,
"step": 862
},
{
"epoch": 0.6444506674134229,
"grad_norm": 3.0017525383643124,
"learning_rate": 2.967053839263046e-06,
"loss": 0.9432,
"step": 863
},
{
"epoch": 0.6451974236908429,
"grad_norm": 3.6774257261294023,
"learning_rate": 2.956003587921433e-06,
"loss": 0.9398,
"step": 864
},
{
"epoch": 0.6459441799682628,
"grad_norm": 3.027953563829782,
"learning_rate": 2.9449653103323405e-06,
"loss": 0.9681,
"step": 865
},
{
"epoch": 0.6466909362456829,
"grad_norm": 3.112438742457364,
"learning_rate": 2.9339390711581105e-06,
"loss": 0.9468,
"step": 866
},
{
"epoch": 0.6474376925231028,
"grad_norm": 3.5341048864406948,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.9825,
"step": 867
},
{
"epoch": 0.6481844488005227,
"grad_norm": 3.54521516556626,
"learning_rate": 2.9119229663506417e-06,
"loss": 1.0349,
"step": 868
},
{
"epoch": 0.6481844488005227,
"eval_loss": 0.9415974617004395,
"eval_runtime": 162.2767,
"eval_samples_per_second": 111.1,
"eval_steps_per_second": 1.738,
"step": 868
},
{
"epoch": 0.6489312050779427,
"grad_norm": 3.1153407411887657,
"learning_rate": 2.900933229687978e-06,
"loss": 0.9653,
"step": 869
},
{
"epoch": 0.6496779613553626,
"grad_norm": 3.3514637459968695,
"learning_rate": 2.889955789380572e-06,
"loss": 0.9628,
"step": 870
},
{
"epoch": 0.6504247176327826,
"grad_norm": 3.085482591143154,
"learning_rate": 2.8789907097343818e-06,
"loss": 0.9068,
"step": 871
},
{
"epoch": 0.6511714739102026,
"grad_norm": 3.5723275185915515,
"learning_rate": 2.868038054982962e-06,
"loss": 0.9639,
"step": 872
},
{
"epoch": 0.6519182301876225,
"grad_norm": 2.8915352489194492,
"learning_rate": 2.8570978892870777e-06,
"loss": 1.0256,
"step": 873
},
{
"epoch": 0.6526649864650425,
"grad_norm": 3.091290299462198,
"learning_rate": 2.8461702767343336e-06,
"loss": 0.9228,
"step": 874
},
{
"epoch": 0.6534117427424624,
"grad_norm": 3.248482495382901,
"learning_rate": 2.8352552813388035e-06,
"loss": 0.9985,
"step": 875
},
{
"epoch": 0.6541584990198824,
"grad_norm": 3.6787026511903984,
"learning_rate": 2.824352967040642e-06,
"loss": 0.9921,
"step": 876
},
{
"epoch": 0.6549052552973024,
"grad_norm": 3.4197970247343727,
"learning_rate": 2.8134633977057236e-06,
"loss": 0.8962,
"step": 877
},
{
"epoch": 0.6556520115747223,
"grad_norm": 2.914535165179358,
"learning_rate": 2.802586637125258e-06,
"loss": 0.859,
"step": 878
},
{
"epoch": 0.6563987678521422,
"grad_norm": 2.9645614071642097,
"learning_rate": 2.791722749015424e-06,
"loss": 1.0076,
"step": 879
},
{
"epoch": 0.6571455241295622,
"grad_norm": 3.3043036104064014,
"learning_rate": 2.7808717970169928e-06,
"loss": 1.022,
"step": 880
},
{
"epoch": 0.6578922804069822,
"grad_norm": 3.9485650303396382,
"learning_rate": 2.770033844694954e-06,
"loss": 0.9906,
"step": 881
},
{
"epoch": 0.6586390366844022,
"grad_norm": 4.366695146187443,
"learning_rate": 2.7592089555381486e-06,
"loss": 1.081,
"step": 882
},
{
"epoch": 0.6586390366844022,
"eval_loss": 0.94098299741745,
"eval_runtime": 160.6764,
"eval_samples_per_second": 112.207,
"eval_steps_per_second": 1.755,
"step": 882
},
{
"epoch": 0.6593857929618221,
"grad_norm": 3.0599765753650563,
"learning_rate": 2.748397192958893e-06,
"loss": 0.9745,
"step": 883
},
{
"epoch": 0.660132549239242,
"grad_norm": 3.0490735460024525,
"learning_rate": 2.7375986202926003e-06,
"loss": 0.9087,
"step": 884
},
{
"epoch": 0.660879305516662,
"grad_norm": 2.944396457362802,
"learning_rate": 2.7268133007974284e-06,
"loss": 0.9665,
"step": 885
},
{
"epoch": 0.661626061794082,
"grad_norm": 4.033811803016879,
"learning_rate": 2.716041297653891e-06,
"loss": 1.0236,
"step": 886
},
{
"epoch": 0.662372818071502,
"grad_norm": 3.4943770273065016,
"learning_rate": 2.705282673964495e-06,
"loss": 0.9446,
"step": 887
},
{
"epoch": 0.6631195743489219,
"grad_norm": 4.178024868422968,
"learning_rate": 2.69453749275337e-06,
"loss": 0.9134,
"step": 888
},
{
"epoch": 0.6638663306263418,
"grad_norm": 2.9796556875857503,
"learning_rate": 2.6838058169659076e-06,
"loss": 0.9714,
"step": 889
},
{
"epoch": 0.6646130869037618,
"grad_norm": 2.5919847870009023,
"learning_rate": 2.67308770946837e-06,
"loss": 0.9558,
"step": 890
},
{
"epoch": 0.6653598431811817,
"grad_norm": 3.2509492718731527,
"learning_rate": 2.6623832330475454e-06,
"loss": 0.9933,
"step": 891
},
{
"epoch": 0.6661065994586017,
"grad_norm": 2.998873106447495,
"learning_rate": 2.651692450410372e-06,
"loss": 1.005,
"step": 892
},
{
"epoch": 0.6668533557360217,
"grad_norm": 3.561009888811348,
"learning_rate": 2.6410154241835663e-06,
"loss": 0.9905,
"step": 893
},
{
"epoch": 0.6676001120134416,
"grad_norm": 3.0540541341319694,
"learning_rate": 2.630352216913258e-06,
"loss": 0.8646,
"step": 894
},
{
"epoch": 0.6683468682908615,
"grad_norm": 2.823799259947434,
"learning_rate": 2.6197028910646304e-06,
"loss": 1.01,
"step": 895
},
{
"epoch": 0.6690936245682815,
"grad_norm": 2.711618389928562,
"learning_rate": 2.609067509021549e-06,
"loss": 0.9544,
"step": 896
},
{
"epoch": 0.6690936245682815,
"eval_loss": 0.9402115345001221,
"eval_runtime": 162.2582,
"eval_samples_per_second": 111.113,
"eval_steps_per_second": 1.738,
"step": 896
},
{
"epoch": 0.6698403808457015,
"grad_norm": 3.009980995035587,
"learning_rate": 2.5984461330861864e-06,
"loss": 1.024,
"step": 897
},
{
"epoch": 0.6705871371231215,
"grad_norm": 3.2368826166146385,
"learning_rate": 2.58783882547868e-06,
"loss": 0.9605,
"step": 898
},
{
"epoch": 0.6713338934005414,
"grad_norm": 4.00112194900498,
"learning_rate": 2.57724564833675e-06,
"loss": 0.99,
"step": 899
},
{
"epoch": 0.6720806496779613,
"grad_norm": 3.4500845570375462,
"learning_rate": 2.566666663715337e-06,
"loss": 1.0087,
"step": 900
},
{
"epoch": 0.6728274059553813,
"grad_norm": 2.8366354860399525,
"learning_rate": 2.5561019335862435e-06,
"loss": 0.8821,
"step": 901
},
{
"epoch": 0.6735741622328013,
"grad_norm": 7.210495835093528,
"learning_rate": 2.545551519837777e-06,
"loss": 1.0363,
"step": 902
},
{
"epoch": 0.6743209185102212,
"grad_norm": 3.348557095005959,
"learning_rate": 2.5350154842743643e-06,
"loss": 0.9791,
"step": 903
},
{
"epoch": 0.6750676747876412,
"grad_norm": 2.988330744084953,
"learning_rate": 2.524493888616214e-06,
"loss": 0.9316,
"step": 904
},
{
"epoch": 0.6758144310650611,
"grad_norm": 3.8502952989746797,
"learning_rate": 2.5139867944989483e-06,
"loss": 0.9556,
"step": 905
},
{
"epoch": 0.676561187342481,
"grad_norm": 3.0740946428548774,
"learning_rate": 2.503494263473233e-06,
"loss": 0.9857,
"step": 906
},
{
"epoch": 0.6773079436199011,
"grad_norm": 2.639674000747994,
"learning_rate": 2.4930163570044245e-06,
"loss": 0.897,
"step": 907
},
{
"epoch": 0.678054699897321,
"grad_norm": 3.76069599349086,
"learning_rate": 2.482553136472211e-06,
"loss": 0.9361,
"step": 908
},
{
"epoch": 0.678801456174741,
"grad_norm": 2.99525819871286,
"learning_rate": 2.4721046631702478e-06,
"loss": 0.9624,
"step": 909
},
{
"epoch": 0.6795482124521609,
"grad_norm": 3.3991168090917254,
"learning_rate": 2.461670998305802e-06,
"loss": 1.0163,
"step": 910
},
{
"epoch": 0.6795482124521609,
"eval_loss": 0.9402364492416382,
"eval_runtime": 160.891,
"eval_samples_per_second": 112.057,
"eval_steps_per_second": 1.753,
"step": 910
},
{
"epoch": 0.6802949687295808,
"grad_norm": 5.192382465431668,
"learning_rate": 2.451252202999389e-06,
"loss": 0.9618,
"step": 911
},
{
"epoch": 0.6810417250070009,
"grad_norm": 2.994654361310334,
"learning_rate": 2.440848338284427e-06,
"loss": 0.9835,
"step": 912
},
{
"epoch": 0.6817884812844208,
"grad_norm": 3.856560498633684,
"learning_rate": 2.4304594651068626e-06,
"loss": 1.0109,
"step": 913
},
{
"epoch": 0.6825352375618408,
"grad_norm": 4.578051753619612,
"learning_rate": 2.420085644324824e-06,
"loss": 1.0303,
"step": 914
},
{
"epoch": 0.6832819938392607,
"grad_norm": 3.4159235584433425,
"learning_rate": 2.409726936708263e-06,
"loss": 0.9201,
"step": 915
},
{
"epoch": 0.6840287501166806,
"grad_norm": 3.266824512755902,
"learning_rate": 2.3993834029385976e-06,
"loss": 0.9419,
"step": 916
},
{
"epoch": 0.6847755063941007,
"grad_norm": 3.1027816566962696,
"learning_rate": 2.3890551036083564e-06,
"loss": 0.989,
"step": 917
},
{
"epoch": 0.6855222626715206,
"grad_norm": 2.7420536258047963,
"learning_rate": 2.378742099220829e-06,
"loss": 0.9537,
"step": 918
},
{
"epoch": 0.6862690189489405,
"grad_norm": 3.4229182865230983,
"learning_rate": 2.3684444501897012e-06,
"loss": 1.0261,
"step": 919
},
{
"epoch": 0.6870157752263605,
"grad_norm": 2.848588746926309,
"learning_rate": 2.3581622168387107e-06,
"loss": 0.9573,
"step": 920
},
{
"epoch": 0.6877625315037804,
"grad_norm": 3.740798933248872,
"learning_rate": 2.3478954594012884e-06,
"loss": 0.9586,
"step": 921
},
{
"epoch": 0.6885092877812005,
"grad_norm": 4.476847035187571,
"learning_rate": 2.337644238020207e-06,
"loss": 0.9325,
"step": 922
},
{
"epoch": 0.6892560440586204,
"grad_norm": 3.797199429587135,
"learning_rate": 2.32740861274723e-06,
"loss": 0.9058,
"step": 923
},
{
"epoch": 0.6900028003360403,
"grad_norm": 2.787060332572636,
"learning_rate": 2.3171886435427567e-06,
"loss": 0.9898,
"step": 924
},
{
"epoch": 0.6900028003360403,
"eval_loss": 0.9391377568244934,
"eval_runtime": 160.7788,
"eval_samples_per_second": 112.135,
"eval_steps_per_second": 1.754,
"step": 924
},
{
"epoch": 0.6907495566134603,
"grad_norm": 3.3922204747022375,
"learning_rate": 2.3069843902754767e-06,
"loss": 1.0498,
"step": 925
},
{
"epoch": 0.6914963128908802,
"grad_norm": 3.0040350675291787,
"learning_rate": 2.296795912722014e-06,
"loss": 0.9636,
"step": 926
},
{
"epoch": 0.6922430691683003,
"grad_norm": 3.0670888522236286,
"learning_rate": 2.286623270566572e-06,
"loss": 0.95,
"step": 927
},
{
"epoch": 0.6929898254457202,
"grad_norm": 2.992366541085708,
"learning_rate": 2.2764665234006008e-06,
"loss": 0.983,
"step": 928
},
{
"epoch": 0.6937365817231401,
"grad_norm": 3.862457594099046,
"learning_rate": 2.2663257307224308e-06,
"loss": 0.9717,
"step": 929
},
{
"epoch": 0.6944833380005601,
"grad_norm": 2.865765904294805,
"learning_rate": 2.2562009519369314e-06,
"loss": 0.9047,
"step": 930
},
{
"epoch": 0.69523009427798,
"grad_norm": 2.6036045157584264,
"learning_rate": 2.246092246355163e-06,
"loss": 0.8313,
"step": 931
},
{
"epoch": 0.6959768505553999,
"grad_norm": 3.031699920926545,
"learning_rate": 2.2359996731940348e-06,
"loss": 0.9678,
"step": 932
},
{
"epoch": 0.69672360683282,
"grad_norm": 2.740843180643503,
"learning_rate": 2.22592329157594e-06,
"loss": 0.9964,
"step": 933
},
{
"epoch": 0.6974703631102399,
"grad_norm": 2.861698814059541,
"learning_rate": 2.215863160528429e-06,
"loss": 0.9205,
"step": 934
},
{
"epoch": 0.6982171193876598,
"grad_norm": 2.757313651567981,
"learning_rate": 2.205819338983859e-06,
"loss": 0.9189,
"step": 935
},
{
"epoch": 0.6989638756650798,
"grad_norm": 3.2909156447917725,
"learning_rate": 2.195791885779041e-06,
"loss": 0.9368,
"step": 936
},
{
"epoch": 0.6997106319424997,
"grad_norm": 6.203638391772724,
"learning_rate": 2.1857808596548992e-06,
"loss": 0.9975,
"step": 937
},
{
"epoch": 0.7004573882199198,
"grad_norm": 3.0377699161386076,
"learning_rate": 2.1757863192561356e-06,
"loss": 0.9513,
"step": 938
},
{
"epoch": 0.7004573882199198,
"eval_loss": 0.9378783106803894,
"eval_runtime": 160.7274,
"eval_samples_per_second": 112.171,
"eval_steps_per_second": 1.755,
"step": 938
},
{
"epoch": 0.7012041444973397,
"grad_norm": 3.8530318792360383,
"learning_rate": 2.165808323130868e-06,
"loss": 0.9743,
"step": 939
},
{
"epoch": 0.7019509007747596,
"grad_norm": 3.764384225357672,
"learning_rate": 2.1558469297303025e-06,
"loss": 0.8979,
"step": 940
},
{
"epoch": 0.7026976570521796,
"grad_norm": 3.607231090891381,
"learning_rate": 2.1459021974083905e-06,
"loss": 0.8938,
"step": 941
},
{
"epoch": 0.7034444133295995,
"grad_norm": 2.850451936817253,
"learning_rate": 2.135974184421477e-06,
"loss": 0.9309,
"step": 942
},
{
"epoch": 0.7041911696070196,
"grad_norm": 3.4414586735950548,
"learning_rate": 2.1260629489279662e-06,
"loss": 0.9612,
"step": 943
},
{
"epoch": 0.7049379258844395,
"grad_norm": 3.306379000135249,
"learning_rate": 2.1161685489879784e-06,
"loss": 0.9423,
"step": 944
},
{
"epoch": 0.7056846821618594,
"grad_norm": 3.3569042890381877,
"learning_rate": 2.106291042563019e-06,
"loss": 1.0236,
"step": 945
},
{
"epoch": 0.7064314384392794,
"grad_norm": 3.3135012728201017,
"learning_rate": 2.096430487515618e-06,
"loss": 0.9959,
"step": 946
},
{
"epoch": 0.7071781947166993,
"grad_norm": 3.038549894496397,
"learning_rate": 2.086586941609011e-06,
"loss": 1.0162,
"step": 947
},
{
"epoch": 0.7079249509941193,
"grad_norm": 3.08173088117554,
"learning_rate": 2.076760462506798e-06,
"loss": 0.9131,
"step": 948
},
{
"epoch": 0.7086717072715393,
"grad_norm": 3.028394966652236,
"learning_rate": 2.0669511077725945e-06,
"loss": 0.9259,
"step": 949
},
{
"epoch": 0.7094184635489592,
"grad_norm": 3.3989843796174934,
"learning_rate": 2.0571589348697045e-06,
"loss": 0.9344,
"step": 950
},
{
"epoch": 0.7101652198263791,
"grad_norm": 2.975011090701034,
"learning_rate": 2.04738400116078e-06,
"loss": 0.9218,
"step": 951
},
{
"epoch": 0.7109119761037991,
"grad_norm": 3.265896334464912,
"learning_rate": 2.037626363907485e-06,
"loss": 0.9765,
"step": 952
},
{
"epoch": 0.7109119761037991,
"eval_loss": 0.9376602172851562,
"eval_runtime": 160.7633,
"eval_samples_per_second": 112.146,
"eval_steps_per_second": 1.754,
"step": 952
},
{
"epoch": 0.7116587323812191,
"grad_norm": 3.6778027198208982,
"learning_rate": 2.0278860802701616e-06,
"loss": 0.9278,
"step": 953
},
{
"epoch": 0.7124054886586391,
"grad_norm": 2.6384835692393542,
"learning_rate": 2.0181632073074925e-06,
"loss": 0.9659,
"step": 954
},
{
"epoch": 0.713152244936059,
"grad_norm": 3.786013581216713,
"learning_rate": 2.0084578019761738e-06,
"loss": 0.9555,
"step": 955
},
{
"epoch": 0.7138990012134789,
"grad_norm": 3.4033801567287436,
"learning_rate": 1.9987699211305696e-06,
"loss": 0.8764,
"step": 956
},
{
"epoch": 0.7146457574908989,
"grad_norm": 3.026170739363414,
"learning_rate": 1.9890996215223885e-06,
"loss": 0.9646,
"step": 957
},
{
"epoch": 0.7153925137683189,
"grad_norm": 2.8698966567286646,
"learning_rate": 1.979446959800347e-06,
"loss": 0.958,
"step": 958
},
{
"epoch": 0.7161392700457389,
"grad_norm": 3.094820756318611,
"learning_rate": 1.9698119925098398e-06,
"loss": 1.0365,
"step": 959
},
{
"epoch": 0.7168860263231588,
"grad_norm": 3.283111017009602,
"learning_rate": 1.9601947760926044e-06,
"loss": 0.9294,
"step": 960
},
{
"epoch": 0.7176327826005787,
"grad_norm": 3.3801881029401977,
"learning_rate": 1.9505953668863996e-06,
"loss": 0.9826,
"step": 961
},
{
"epoch": 0.7183795388779987,
"grad_norm": 2.6507459684657437,
"learning_rate": 1.9410138211246644e-06,
"loss": 0.9045,
"step": 962
},
{
"epoch": 0.7191262951554187,
"grad_norm": 3.20895770074743,
"learning_rate": 1.9314501949361946e-06,
"loss": 0.9533,
"step": 963
},
{
"epoch": 0.7198730514328386,
"grad_norm": 2.9463529533656967,
"learning_rate": 1.9219045443448133e-06,
"loss": 0.9577,
"step": 964
},
{
"epoch": 0.7206198077102586,
"grad_norm": 3.0591917801650887,
"learning_rate": 1.912376925269041e-06,
"loss": 0.9461,
"step": 965
},
{
"epoch": 0.7213665639876785,
"grad_norm": 3.333232970293983,
"learning_rate": 1.9028673935217723e-06,
"loss": 0.8945,
"step": 966
},
{
"epoch": 0.7213665639876785,
"eval_loss": 0.9377442002296448,
"eval_runtime": 162.3492,
"eval_samples_per_second": 111.051,
"eval_steps_per_second": 1.737,
"step": 966
},
{
"epoch": 0.7221133202650984,
"grad_norm": 2.964178856719797,
"learning_rate": 1.893376004809942e-06,
"loss": 0.9926,
"step": 967
},
{
"epoch": 0.7228600765425184,
"grad_norm": 2.9396645690446093,
"learning_rate": 1.8839028147342087e-06,
"loss": 0.996,
"step": 968
},
{
"epoch": 0.7236068328199384,
"grad_norm": 2.9374240680791277,
"learning_rate": 1.8744478787886188e-06,
"loss": 0.9515,
"step": 969
},
{
"epoch": 0.7243535890973584,
"grad_norm": 3.1383072383295123,
"learning_rate": 1.8650112523602832e-06,
"loss": 0.9053,
"step": 970
},
{
"epoch": 0.7251003453747783,
"grad_norm": 2.9457028560535043,
"learning_rate": 1.8555929907290627e-06,
"loss": 0.8926,
"step": 971
},
{
"epoch": 0.7258471016521982,
"grad_norm": 3.6033895225192185,
"learning_rate": 1.846193149067232e-06,
"loss": 1.0745,
"step": 972
},
{
"epoch": 0.7265938579296182,
"grad_norm": 2.905441286184863,
"learning_rate": 1.8368117824391623e-06,
"loss": 0.961,
"step": 973
},
{
"epoch": 0.7273406142070382,
"grad_norm": 3.823010945665864,
"learning_rate": 1.827448945800997e-06,
"loss": 0.9202,
"step": 974
},
{
"epoch": 0.7280873704844582,
"grad_norm": 2.9910986917683755,
"learning_rate": 1.8181046940003366e-06,
"loss": 0.9435,
"step": 975
},
{
"epoch": 0.7288341267618781,
"grad_norm": 3.862933976303629,
"learning_rate": 1.808779081775901e-06,
"loss": 1.0576,
"step": 976
},
{
"epoch": 0.729580883039298,
"grad_norm": 4.606305556163073,
"learning_rate": 1.799472163757226e-06,
"loss": 1.0184,
"step": 977
},
{
"epoch": 0.730327639316718,
"grad_norm": 5.027834361171428,
"learning_rate": 1.7901839944643373e-06,
"loss": 1.0321,
"step": 978
},
{
"epoch": 0.731074395594138,
"grad_norm": 2.9581566301260147,
"learning_rate": 1.780914628307428e-06,
"loss": 0.9176,
"step": 979
},
{
"epoch": 0.7318211518715579,
"grad_norm": 4.036645532947177,
"learning_rate": 1.7716641195865408e-06,
"loss": 1.0221,
"step": 980
},
{
"epoch": 0.7318211518715579,
"eval_loss": 0.9362857937812805,
"eval_runtime": 160.7578,
"eval_samples_per_second": 112.15,
"eval_steps_per_second": 1.754,
"step": 980
},
{
"epoch": 0.7325679081489779,
"grad_norm": 3.986822927862001,
"learning_rate": 1.762432522491258e-06,
"loss": 1.043,
"step": 981
},
{
"epoch": 0.7333146644263978,
"grad_norm": 3.585322403590216,
"learning_rate": 1.7532198911003677e-06,
"loss": 0.9715,
"step": 982
},
{
"epoch": 0.7340614207038177,
"grad_norm": 3.452113127334812,
"learning_rate": 1.7440262793815615e-06,
"loss": 0.966,
"step": 983
},
{
"epoch": 0.7348081769812378,
"grad_norm": 3.0003018548266818,
"learning_rate": 1.7348517411911176e-06,
"loss": 0.8986,
"step": 984
},
{
"epoch": 0.7355549332586577,
"grad_norm": 3.7074201960046462,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.9799,
"step": 985
},
{
"epoch": 0.7363016895360777,
"grad_norm": 3.0268210048371826,
"learning_rate": 1.716560100261428e-06,
"loss": 1.004,
"step": 986
},
{
"epoch": 0.7370484458134976,
"grad_norm": 4.555026395818409,
"learning_rate": 1.7074431046748075e-06,
"loss": 0.9813,
"step": 987
},
{
"epoch": 0.7377952020909175,
"grad_norm": 2.854485460589325,
"learning_rate": 1.6983453969211706e-06,
"loss": 0.9223,
"step": 988
},
{
"epoch": 0.7385419583683376,
"grad_norm": 3.780899514822457,
"learning_rate": 1.6892670302949842e-06,
"loss": 1.0049,
"step": 989
},
{
"epoch": 0.7392887146457575,
"grad_norm": 2.708058621198111,
"learning_rate": 1.680208057977415e-06,
"loss": 0.9578,
"step": 990
},
{
"epoch": 0.7400354709231775,
"grad_norm": 5.261839966075888,
"learning_rate": 1.6711685330360212e-06,
"loss": 0.9491,
"step": 991
},
{
"epoch": 0.7407822272005974,
"grad_norm": 3.554553077382045,
"learning_rate": 1.6621485084244327e-06,
"loss": 0.9547,
"step": 992
},
{
"epoch": 0.7415289834780173,
"grad_norm": 3.4576454634816116,
"learning_rate": 1.6531480369820497e-06,
"loss": 0.9735,
"step": 993
},
{
"epoch": 0.7422757397554374,
"grad_norm": 2.7953413513430068,
"learning_rate": 1.6441671714337288e-06,
"loss": 0.9573,
"step": 994
},
{
"epoch": 0.7422757397554374,
"eval_loss": 0.9361117482185364,
"eval_runtime": 162.2503,
"eval_samples_per_second": 111.118,
"eval_steps_per_second": 1.738,
"step": 994
},
{
"epoch": 0.7430224960328573,
"grad_norm": 3.5618656933093877,
"learning_rate": 1.635205964389474e-06,
"loss": 0.9789,
"step": 995
},
{
"epoch": 0.7437692523102772,
"grad_norm": 3.508297212134497,
"learning_rate": 1.626264468344132e-06,
"loss": 0.9089,
"step": 996
},
{
"epoch": 0.7445160085876972,
"grad_norm": 2.8461158774574113,
"learning_rate": 1.6173427356770778e-06,
"loss": 0.9407,
"step": 997
},
{
"epoch": 0.7452627648651171,
"grad_norm": 2.8680121302683914,
"learning_rate": 1.6084408186519195e-06,
"loss": 0.9893,
"step": 998
},
{
"epoch": 0.7460095211425372,
"grad_norm": 2.6994982967303023,
"learning_rate": 1.599558769416179e-06,
"loss": 0.9235,
"step": 999
},
{
"epoch": 0.7467562774199571,
"grad_norm": 2.946565912220613,
"learning_rate": 1.5906966400009944e-06,
"loss": 0.9575,
"step": 1000
},
{
"epoch": 0.747503033697377,
"grad_norm": 2.924215952122836,
"learning_rate": 1.5818544823208126e-06,
"loss": 0.8939,
"step": 1001
},
{
"epoch": 0.748249789974797,
"grad_norm": 3.0590249491881147,
"learning_rate": 1.573032348173087e-06,
"loss": 1.0089,
"step": 1002
},
{
"epoch": 0.7489965462522169,
"grad_norm": 3.196914793142116,
"learning_rate": 1.5642302892379708e-06,
"loss": 0.9879,
"step": 1003
},
{
"epoch": 0.7497433025296368,
"grad_norm": 3.794505980226742,
"learning_rate": 1.555448357078021e-06,
"loss": 1.0676,
"step": 1004
},
{
"epoch": 0.7504900588070569,
"grad_norm": 3.347035703229042,
"learning_rate": 1.5466866031378874e-06,
"loss": 0.9242,
"step": 1005
},
{
"epoch": 0.7512368150844768,
"grad_norm": 5.075461756876524,
"learning_rate": 1.5379450787440163e-06,
"loss": 1.0153,
"step": 1006
},
{
"epoch": 0.7519835713618968,
"grad_norm": 3.532477622458222,
"learning_rate": 1.5292238351043503e-06,
"loss": 0.9921,
"step": 1007
},
{
"epoch": 0.7527303276393167,
"grad_norm": 2.756518632689471,
"learning_rate": 1.5205229233080266e-06,
"loss": 0.9969,
"step": 1008
},
{
"epoch": 0.7527303276393167,
"eval_loss": 0.935323178768158,
"eval_runtime": 160.7419,
"eval_samples_per_second": 112.161,
"eval_steps_per_second": 1.754,
"step": 1008
},
{
"epoch": 0.7534770839167366,
"grad_norm": 3.276940054933946,
"learning_rate": 1.511842394325077e-06,
"loss": 0.9367,
"step": 1009
},
{
"epoch": 0.7542238401941567,
"grad_norm": 4.197551947365772,
"learning_rate": 1.5031822990061318e-06,
"loss": 1.0045,
"step": 1010
},
{
"epoch": 0.7549705964715766,
"grad_norm": 2.742225216417972,
"learning_rate": 1.494542688082123e-06,
"loss": 0.9001,
"step": 1011
},
{
"epoch": 0.7557173527489965,
"grad_norm": 3.1759063839874093,
"learning_rate": 1.485923612163982e-06,
"loss": 0.9752,
"step": 1012
},
{
"epoch": 0.7564641090264165,
"grad_norm": 3.234441153665573,
"learning_rate": 1.4773251217423424e-06,
"loss": 0.9566,
"step": 1013
},
{
"epoch": 0.7572108653038364,
"grad_norm": 3.328867377350979,
"learning_rate": 1.468747267187256e-06,
"loss": 0.8425,
"step": 1014
},
{
"epoch": 0.7579576215812565,
"grad_norm": 3.4507904655828963,
"learning_rate": 1.4601900987478834e-06,
"loss": 1.0111,
"step": 1015
},
{
"epoch": 0.7587043778586764,
"grad_norm": 3.0879533408630526,
"learning_rate": 1.451653666552208e-06,
"loss": 0.884,
"step": 1016
},
{
"epoch": 0.7594511341360963,
"grad_norm": 2.9599334649521345,
"learning_rate": 1.4431380206067374e-06,
"loss": 0.9592,
"step": 1017
},
{
"epoch": 0.7601978904135163,
"grad_norm": 3.355339344519482,
"learning_rate": 1.4346432107962205e-06,
"loss": 0.9678,
"step": 1018
},
{
"epoch": 0.7609446466909362,
"grad_norm": 4.7621784133806795,
"learning_rate": 1.4261692868833376e-06,
"loss": 0.9767,
"step": 1019
},
{
"epoch": 0.7616914029683562,
"grad_norm": 3.0462672117608225,
"learning_rate": 1.4177162985084242e-06,
"loss": 0.9758,
"step": 1020
},
{
"epoch": 0.7624381592457762,
"grad_norm": 3.3235265510570327,
"learning_rate": 1.4092842951891788e-06,
"loss": 0.9605,
"step": 1021
},
{
"epoch": 0.7631849155231961,
"grad_norm": 2.9873139379742044,
"learning_rate": 1.400873326320364e-06,
"loss": 0.947,
"step": 1022
},
{
"epoch": 0.7631849155231961,
"eval_loss": 0.934901237487793,
"eval_runtime": 160.8713,
"eval_samples_per_second": 112.071,
"eval_steps_per_second": 1.753,
"step": 1022
},
{
"epoch": 0.763931671800616,
"grad_norm": 2.925828560741243,
"learning_rate": 1.3924834411735238e-06,
"loss": 0.9255,
"step": 1023
},
{
"epoch": 0.764678428078036,
"grad_norm": 2.929366653908739,
"learning_rate": 1.3841146888966944e-06,
"loss": 0.9649,
"step": 1024
},
{
"epoch": 0.765425184355456,
"grad_norm": 3.5848412522150093,
"learning_rate": 1.3757671185141136e-06,
"loss": 0.9942,
"step": 1025
},
{
"epoch": 0.766171940632876,
"grad_norm": 4.313625493258559,
"learning_rate": 1.367440778925938e-06,
"loss": 0.9989,
"step": 1026
},
{
"epoch": 0.7669186969102959,
"grad_norm": 3.1883660056342444,
"learning_rate": 1.35913571890795e-06,
"loss": 0.934,
"step": 1027
},
{
"epoch": 0.7676654531877158,
"grad_norm": 3.0451342971999034,
"learning_rate": 1.350851987111283e-06,
"loss": 0.9309,
"step": 1028
},
{
"epoch": 0.7684122094651358,
"grad_norm": 3.0995695469892826,
"learning_rate": 1.3425896320621224e-06,
"loss": 0.9187,
"step": 1029
},
{
"epoch": 0.7691589657425558,
"grad_norm": 2.9172531099918624,
"learning_rate": 1.3343487021614315e-06,
"loss": 0.9461,
"step": 1030
},
{
"epoch": 0.7699057220199758,
"grad_norm": 2.7992115604306993,
"learning_rate": 1.3261292456846648e-06,
"loss": 0.9552,
"step": 1031
},
{
"epoch": 0.7706524782973957,
"grad_norm": 2.895791747367878,
"learning_rate": 1.3179313107814844e-06,
"loss": 0.9338,
"step": 1032
},
{
"epoch": 0.7713992345748156,
"grad_norm": 3.185964164923729,
"learning_rate": 1.3097549454754782e-06,
"loss": 0.9724,
"step": 1033
},
{
"epoch": 0.7721459908522356,
"grad_norm": 2.5940064617026564,
"learning_rate": 1.3016001976638836e-06,
"loss": 0.8629,
"step": 1034
},
{
"epoch": 0.7728927471296556,
"grad_norm": 10.058975733269536,
"learning_rate": 1.2934671151172974e-06,
"loss": 0.9243,
"step": 1035
},
{
"epoch": 0.7736395034070755,
"grad_norm": 3.365992421531039,
"learning_rate": 1.2853557454794025e-06,
"loss": 1.0129,
"step": 1036
},
{
"epoch": 0.7736395034070755,
"eval_loss": 0.934323787689209,
"eval_runtime": 162.2911,
"eval_samples_per_second": 111.091,
"eval_steps_per_second": 1.738,
"step": 1036
},
{
"epoch": 0.7743862596844955,
"grad_norm": 3.4360489381196446,
"learning_rate": 1.2772661362666877e-06,
"loss": 1.0044,
"step": 1037
},
{
"epoch": 0.7751330159619154,
"grad_norm": 4.245525265730096,
"learning_rate": 1.2691983348681698e-06,
"loss": 0.9676,
"step": 1038
},
{
"epoch": 0.7758797722393354,
"grad_norm": 3.7645844451140666,
"learning_rate": 1.2611523885451137e-06,
"loss": 0.9585,
"step": 1039
},
{
"epoch": 0.7766265285167554,
"grad_norm": 2.9642595691153444,
"learning_rate": 1.2531283444307567e-06,
"loss": 1.0301,
"step": 1040
},
{
"epoch": 0.7773732847941753,
"grad_norm": 2.6311114422503605,
"learning_rate": 1.2451262495300366e-06,
"loss": 1.0166,
"step": 1041
},
{
"epoch": 0.7781200410715953,
"grad_norm": 3.151248153435886,
"learning_rate": 1.2371461507193077e-06,
"loss": 0.9414,
"step": 1042
},
{
"epoch": 0.7788667973490152,
"grad_norm": 3.2520539000528057,
"learning_rate": 1.2291880947460732e-06,
"loss": 0.9074,
"step": 1043
},
{
"epoch": 0.7796135536264351,
"grad_norm": 3.2890092101452435,
"learning_rate": 1.2212521282287093e-06,
"loss": 0.8918,
"step": 1044
},
{
"epoch": 0.7803603099038551,
"grad_norm": 3.2370242322883875,
"learning_rate": 1.213338297656191e-06,
"loss": 0.9752,
"step": 1045
},
{
"epoch": 0.7811070661812751,
"grad_norm": 2.7758770607541368,
"learning_rate": 1.2054466493878209e-06,
"loss": 0.9171,
"step": 1046
},
{
"epoch": 0.7818538224586951,
"grad_norm": 3.605541137271554,
"learning_rate": 1.1975772296529564e-06,
"loss": 0.9332,
"step": 1047
},
{
"epoch": 0.782600578736115,
"grad_norm": 3.041323832919354,
"learning_rate": 1.1897300845507447e-06,
"loss": 0.9344,
"step": 1048
},
{
"epoch": 0.7833473350135349,
"grad_norm": 2.8405142198832998,
"learning_rate": 1.1819052600498444e-06,
"loss": 0.9004,
"step": 1049
},
{
"epoch": 0.7840940912909549,
"grad_norm": 3.021597526668893,
"learning_rate": 1.1741028019881546e-06,
"loss": 0.9212,
"step": 1050
},
{
"epoch": 0.7840940912909549,
"eval_loss": 0.9340171217918396,
"eval_runtime": 162.2347,
"eval_samples_per_second": 111.129,
"eval_steps_per_second": 1.738,
"step": 1050
},
{
"epoch": 0.7848408475683749,
"grad_norm": 2.950883129014722,
"learning_rate": 1.166322756072562e-06,
"loss": 0.9675,
"step": 1051
},
{
"epoch": 0.7855876038457948,
"grad_norm": 3.0912726001846256,
"learning_rate": 1.1585651678786558e-06,
"loss": 0.8971,
"step": 1052
},
{
"epoch": 0.7863343601232148,
"grad_norm": 4.15327113314584,
"learning_rate": 1.1508300828504682e-06,
"loss": 0.9149,
"step": 1053
},
{
"epoch": 0.7870811164006347,
"grad_norm": 3.2238088469540704,
"learning_rate": 1.1431175463002114e-06,
"loss": 0.9713,
"step": 1054
},
{
"epoch": 0.7878278726780547,
"grad_norm": 3.0917040248252428,
"learning_rate": 1.1354276034080059e-06,
"loss": 0.8971,
"step": 1055
},
{
"epoch": 0.7885746289554747,
"grad_norm": 4.274054201982966,
"learning_rate": 1.1277602992216142e-06,
"loss": 0.9776,
"step": 1056
},
{
"epoch": 0.7893213852328946,
"grad_norm": 3.4775465819659312,
"learning_rate": 1.1201156786561884e-06,
"loss": 1.0144,
"step": 1057
},
{
"epoch": 0.7900681415103146,
"grad_norm": 3.353681585305471,
"learning_rate": 1.1124937864939956e-06,
"loss": 0.9548,
"step": 1058
},
{
"epoch": 0.7908148977877345,
"grad_norm": 3.129257365590115,
"learning_rate": 1.1048946673841598e-06,
"loss": 1.0154,
"step": 1059
},
{
"epoch": 0.7915616540651544,
"grad_norm": 3.590240334184257,
"learning_rate": 1.0973183658424008e-06,
"loss": 0.9403,
"step": 1060
},
{
"epoch": 0.7923084103425745,
"grad_norm": 2.8470937895204407,
"learning_rate": 1.0897649262507753e-06,
"loss": 0.9681,
"step": 1061
},
{
"epoch": 0.7930551666199944,
"grad_norm": 3.532189828005365,
"learning_rate": 1.0822343928574087e-06,
"loss": 0.9324,
"step": 1062
},
{
"epoch": 0.7938019228974144,
"grad_norm": 3.1397557159820395,
"learning_rate": 1.0747268097762454e-06,
"loss": 0.9094,
"step": 1063
},
{
"epoch": 0.7945486791748343,
"grad_norm": 6.086890250561432,
"learning_rate": 1.0672422209867879e-06,
"loss": 1.0163,
"step": 1064
},
{
"epoch": 0.7945486791748343,
"eval_loss": 0.933886706829071,
"eval_runtime": 162.281,
"eval_samples_per_second": 111.097,
"eval_steps_per_second": 1.738,
"step": 1064
},
{
"epoch": 0.7952954354522542,
"grad_norm": 3.169410119671673,
"learning_rate": 1.0597806703338354e-06,
"loss": 0.9663,
"step": 1065
},
{
"epoch": 0.7960421917296743,
"grad_norm": 3.3940195467110765,
"learning_rate": 1.0523422015272299e-06,
"loss": 0.9519,
"step": 1066
},
{
"epoch": 0.7967889480070942,
"grad_norm": 4.508947232616116,
"learning_rate": 1.0449268581416012e-06,
"loss": 0.904,
"step": 1067
},
{
"epoch": 0.7975357042845141,
"grad_norm": 2.632774375597297,
"learning_rate": 1.0375346836161071e-06,
"loss": 0.9245,
"step": 1068
},
{
"epoch": 0.7982824605619341,
"grad_norm": 2.9785159713949714,
"learning_rate": 1.0301657212541854e-06,
"loss": 0.9514,
"step": 1069
},
{
"epoch": 0.799029216839354,
"grad_norm": 3.155524617990433,
"learning_rate": 1.022820014223293e-06,
"loss": 0.8659,
"step": 1070
},
{
"epoch": 0.7997759731167741,
"grad_norm": 2.805966276939344,
"learning_rate": 1.0154976055546627e-06,
"loss": 1.0431,
"step": 1071
},
{
"epoch": 0.800522729394194,
"grad_norm": 3.8141120841423706,
"learning_rate": 1.0081985381430392e-06,
"loss": 0.9828,
"step": 1072
},
{
"epoch": 0.8012694856716139,
"grad_norm": 3.2436963757912824,
"learning_rate": 1.0009228547464373e-06,
"loss": 0.9252,
"step": 1073
},
{
"epoch": 0.8020162419490339,
"grad_norm": 3.034785333690608,
"learning_rate": 9.936705979858863e-07,
"loss": 0.9408,
"step": 1074
},
{
"epoch": 0.8027629982264538,
"grad_norm": 3.247230477369662,
"learning_rate": 9.86441810345183e-07,
"loss": 0.9407,
"step": 1075
},
{
"epoch": 0.8035097545038739,
"grad_norm": 3.1230396825638223,
"learning_rate": 9.792365341706395e-07,
"loss": 1.0738,
"step": 1076
},
{
"epoch": 0.8042565107812938,
"grad_norm": 3.634705534638435,
"learning_rate": 9.720548116708434e-07,
"loss": 1.0531,
"step": 1077
},
{
"epoch": 0.8050032670587137,
"grad_norm": 3.2524650618531403,
"learning_rate": 9.648966849163987e-07,
"loss": 0.9508,
"step": 1078
},
{
"epoch": 0.8050032670587137,
"eval_loss": 0.9330756068229675,
"eval_runtime": 162.464,
"eval_samples_per_second": 110.972,
"eval_steps_per_second": 1.736,
"step": 1078
},
{
"epoch": 0.8057500233361337,
"grad_norm": 3.195766017025719,
"learning_rate": 9.577621958396876e-07,
"loss": 0.9247,
"step": 1079
},
{
"epoch": 0.8064967796135536,
"grad_norm": 3.231704715181529,
"learning_rate": 9.506513862346223e-07,
"loss": 0.9718,
"step": 1080
},
{
"epoch": 0.8072435358909735,
"grad_norm": 3.08352329883453,
"learning_rate": 9.435642977564002e-07,
"loss": 0.953,
"step": 1081
},
{
"epoch": 0.8079902921683936,
"grad_norm": 2.7239297991167146,
"learning_rate": 9.365009719212609e-07,
"loss": 1.019,
"step": 1082
},
{
"epoch": 0.8087370484458135,
"grad_norm": 3.4569217007757103,
"learning_rate": 9.294614501062393e-07,
"loss": 1.0053,
"step": 1083
},
{
"epoch": 0.8094838047232334,
"grad_norm": 2.8952768629298182,
"learning_rate": 9.224457735489312e-07,
"loss": 0.879,
"step": 1084
},
{
"epoch": 0.8102305610006534,
"grad_norm": 2.635860772081649,
"learning_rate": 9.154539833472442e-07,
"loss": 0.9415,
"step": 1085
},
{
"epoch": 0.8109773172780733,
"grad_norm": 2.981500012106527,
"learning_rate": 9.08486120459155e-07,
"loss": 0.8901,
"step": 1086
},
{
"epoch": 0.8117240735554934,
"grad_norm": 3.2151674731774955,
"learning_rate": 9.015422257024814e-07,
"loss": 1.0142,
"step": 1087
},
{
"epoch": 0.8124708298329133,
"grad_norm": 4.460127069540021,
"learning_rate": 8.946223397546311e-07,
"loss": 0.9761,
"step": 1088
},
{
"epoch": 0.8132175861103332,
"grad_norm": 2.64438098456573,
"learning_rate": 8.877265031523685e-07,
"loss": 0.9587,
"step": 1089
},
{
"epoch": 0.8139643423877532,
"grad_norm": 3.075964264839053,
"learning_rate": 8.80854756291576e-07,
"loss": 1.0468,
"step": 1090
},
{
"epoch": 0.8147110986651731,
"grad_norm": 3.701739504153379,
"learning_rate": 8.740071394270217e-07,
"loss": 0.905,
"step": 1091
},
{
"epoch": 0.8154578549425932,
"grad_norm": 2.9113831337724507,
"learning_rate": 8.671836926721172e-07,
"loss": 0.9129,
"step": 1092
},
{
"epoch": 0.8154578549425932,
"eval_loss": 0.9328471422195435,
"eval_runtime": 162.392,
"eval_samples_per_second": 111.021,
"eval_steps_per_second": 1.737,
"step": 1092
},
{
"epoch": 0.8162046112200131,
"grad_norm": 4.310707752332849,
"learning_rate": 8.603844559986823e-07,
"loss": 1.0048,
"step": 1093
},
{
"epoch": 0.816951367497433,
"grad_norm": 3.8711564841693384,
"learning_rate": 8.536094692367197e-07,
"loss": 0.9851,
"step": 1094
},
{
"epoch": 0.817698123774853,
"grad_norm": 8.753075618875652,
"learning_rate": 8.468587720741728e-07,
"loss": 0.9883,
"step": 1095
},
{
"epoch": 0.8184448800522729,
"grad_norm": 2.8672638891822864,
"learning_rate": 8.401324040566955e-07,
"loss": 0.9291,
"step": 1096
},
{
"epoch": 0.8191916363296929,
"grad_norm": 4.776201464319421,
"learning_rate": 8.334304045874248e-07,
"loss": 0.9755,
"step": 1097
},
{
"epoch": 0.8199383926071129,
"grad_norm": 3.16525486972403,
"learning_rate": 8.267528129267438e-07,
"loss": 0.9925,
"step": 1098
},
{
"epoch": 0.8206851488845328,
"grad_norm": 4.511439354536366,
"learning_rate": 8.20099668192052e-07,
"loss": 0.9539,
"step": 1099
},
{
"epoch": 0.8214319051619527,
"grad_norm": 3.4286525245441797,
"learning_rate": 8.134710093575444e-07,
"loss": 0.9686,
"step": 1100
},
{
"epoch": 0.8221786614393727,
"grad_norm": 3.0049333808435343,
"learning_rate": 8.068668752539726e-07,
"loss": 0.9661,
"step": 1101
},
{
"epoch": 0.8229254177167927,
"grad_norm": 4.385250881338943,
"learning_rate": 8.002873045684245e-07,
"loss": 0.9325,
"step": 1102
},
{
"epoch": 0.8236721739942127,
"grad_norm": 3.027199330607413,
"learning_rate": 7.937323358440935e-07,
"loss": 0.9295,
"step": 1103
},
{
"epoch": 0.8244189302716326,
"grad_norm": 3.4801606475057505,
"learning_rate": 7.872020074800585e-07,
"loss": 1.0889,
"step": 1104
},
{
"epoch": 0.8251656865490525,
"grad_norm": 2.881651694848292,
"learning_rate": 7.80696357731049e-07,
"loss": 0.9259,
"step": 1105
},
{
"epoch": 0.8259124428264725,
"grad_norm": 3.232131277331923,
"learning_rate": 7.742154247072287e-07,
"loss": 0.9543,
"step": 1106
},
{
"epoch": 0.8259124428264725,
"eval_loss": 0.9323558211326599,
"eval_runtime": 160.8356,
"eval_samples_per_second": 112.096,
"eval_steps_per_second": 1.753,
"step": 1106
},
{
"epoch": 0.8266591991038925,
"grad_norm": 3.839039683871901,
"learning_rate": 7.677592463739741e-07,
"loss": 1.0328,
"step": 1107
},
{
"epoch": 0.8274059553813125,
"grad_norm": 3.451018921091918,
"learning_rate": 7.613278605516455e-07,
"loss": 0.9324,
"step": 1108
},
{
"epoch": 0.8281527116587324,
"grad_norm": 2.741822965758025,
"learning_rate": 7.549213049153687e-07,
"loss": 0.9556,
"step": 1109
},
{
"epoch": 0.8288994679361523,
"grad_norm": 2.809367958739757,
"learning_rate": 7.485396169948133e-07,
"loss": 1.0265,
"step": 1110
},
{
"epoch": 0.8296462242135723,
"grad_norm": 2.5599878133704914,
"learning_rate": 7.421828341739751e-07,
"loss": 0.888,
"step": 1111
},
{
"epoch": 0.8303929804909923,
"grad_norm": 3.482845460987069,
"learning_rate": 7.358509936909541e-07,
"loss": 1.025,
"step": 1112
},
{
"epoch": 0.8311397367684122,
"grad_norm": 3.736730021811191,
"learning_rate": 7.295441326377384e-07,
"loss": 0.9231,
"step": 1113
},
{
"epoch": 0.8318864930458322,
"grad_norm": 2.945367690866313,
"learning_rate": 7.232622879599882e-07,
"loss": 0.8896,
"step": 1114
},
{
"epoch": 0.8326332493232521,
"grad_norm": 2.786535060510927,
"learning_rate": 7.170054964568146e-07,
"loss": 0.9609,
"step": 1115
},
{
"epoch": 0.833380005600672,
"grad_norm": 4.311286527154838,
"learning_rate": 7.107737947805688e-07,
"loss": 1.0491,
"step": 1116
},
{
"epoch": 0.834126761878092,
"grad_norm": 3.356639280458624,
"learning_rate": 7.045672194366238e-07,
"loss": 0.8749,
"step": 1117
},
{
"epoch": 0.834873518155512,
"grad_norm": 2.703485397651996,
"learning_rate": 6.983858067831645e-07,
"loss": 0.9012,
"step": 1118
},
{
"epoch": 0.835620274432932,
"grad_norm": 4.065892457563659,
"learning_rate": 6.922295930309691e-07,
"loss": 1.0221,
"step": 1119
},
{
"epoch": 0.8363670307103519,
"grad_norm": 3.418730924372397,
"learning_rate": 6.860986142432057e-07,
"loss": 1.0233,
"step": 1120
},
{
"epoch": 0.8363670307103519,
"eval_loss": 0.9319419264793396,
"eval_runtime": 162.3687,
"eval_samples_per_second": 111.037,
"eval_steps_per_second": 1.737,
"step": 1120
},
{
"epoch": 0.8371137869877718,
"grad_norm": 2.996912276789265,
"learning_rate": 6.799929063352112e-07,
"loss": 0.9296,
"step": 1121
},
{
"epoch": 0.8378605432651918,
"grad_norm": 2.7030140370045483,
"learning_rate": 6.739125050742873e-07,
"loss": 1.014,
"step": 1122
},
{
"epoch": 0.8386072995426118,
"grad_norm": 2.7714835166723937,
"learning_rate": 6.678574460794879e-07,
"loss": 0.9642,
"step": 1123
},
{
"epoch": 0.8393540558200318,
"grad_norm": 3.0335494350345162,
"learning_rate": 6.618277648214127e-07,
"loss": 0.9585,
"step": 1124
},
{
"epoch": 0.8401008120974517,
"grad_norm": 3.05369699501868,
"learning_rate": 6.558234966219984e-07,
"loss": 0.9921,
"step": 1125
},
{
"epoch": 0.8408475683748716,
"grad_norm": 3.2366643164869218,
"learning_rate": 6.498446766543098e-07,
"loss": 0.969,
"step": 1126
},
{
"epoch": 0.8415943246522916,
"grad_norm": 3.3113556891155964,
"learning_rate": 6.438913399423396e-07,
"loss": 0.9841,
"step": 1127
},
{
"epoch": 0.8423410809297116,
"grad_norm": 3.742348768556645,
"learning_rate": 6.379635213607971e-07,
"loss": 0.9123,
"step": 1128
},
{
"epoch": 0.8430878372071315,
"grad_norm": 4.013813043282346,
"learning_rate": 6.320612556349027e-07,
"loss": 1.0005,
"step": 1129
},
{
"epoch": 0.8438345934845515,
"grad_norm": 3.235695639624682,
"learning_rate": 6.261845773401936e-07,
"loss": 0.9728,
"step": 1130
},
{
"epoch": 0.8445813497619714,
"grad_norm": 3.279666788848323,
"learning_rate": 6.203335209023137e-07,
"loss": 0.9955,
"step": 1131
},
{
"epoch": 0.8453281060393913,
"grad_norm": 2.985404353225797,
"learning_rate": 6.145081205968123e-07,
"loss": 0.9569,
"step": 1132
},
{
"epoch": 0.8460748623168114,
"grad_norm": 2.9264881627568933,
"learning_rate": 6.087084105489449e-07,
"loss": 0.97,
"step": 1133
},
{
"epoch": 0.8468216185942313,
"grad_norm": 2.8871274133914526,
"learning_rate": 6.029344247334773e-07,
"loss": 0.9063,
"step": 1134
},
{
"epoch": 0.8468216185942313,
"eval_loss": 0.9319166541099548,
"eval_runtime": 160.895,
"eval_samples_per_second": 112.054,
"eval_steps_per_second": 1.753,
"step": 1134
},
{
"epoch": 0.8475683748716513,
"grad_norm": 2.9254630933971604,
"learning_rate": 5.971861969744758e-07,
"loss": 0.9854,
"step": 1135
},
{
"epoch": 0.8483151311490712,
"grad_norm": 3.9276232939207363,
"learning_rate": 5.914637609451191e-07,
"loss": 1.0273,
"step": 1136
},
{
"epoch": 0.8490618874264911,
"grad_norm": 2.507415577305572,
"learning_rate": 5.857671501675005e-07,
"loss": 0.9728,
"step": 1137
},
{
"epoch": 0.8498086437039112,
"grad_norm": 2.4460330720059504,
"learning_rate": 5.800963980124241e-07,
"loss": 0.9021,
"step": 1138
},
{
"epoch": 0.8505553999813311,
"grad_norm": 3.0103053437385334,
"learning_rate": 5.744515376992155e-07,
"loss": 0.9459,
"step": 1139
},
{
"epoch": 0.851302156258751,
"grad_norm": 2.817309734726002,
"learning_rate": 5.688326022955276e-07,
"loss": 0.95,
"step": 1140
},
{
"epoch": 0.852048912536171,
"grad_norm": 2.6530611977707976,
"learning_rate": 5.632396247171429e-07,
"loss": 0.9195,
"step": 1141
},
{
"epoch": 0.8527956688135909,
"grad_norm": 3.2183899358377213,
"learning_rate": 5.576726377277803e-07,
"loss": 0.9738,
"step": 1142
},
{
"epoch": 0.853542425091011,
"grad_norm": 2.7141680726405926,
"learning_rate": 5.521316739389116e-07,
"loss": 0.9675,
"step": 1143
},
{
"epoch": 0.8542891813684309,
"grad_norm": 2.7689523546663146,
"learning_rate": 5.46616765809559e-07,
"loss": 0.9566,
"step": 1144
},
{
"epoch": 0.8550359376458508,
"grad_norm": 2.6466811955985095,
"learning_rate": 5.411279456461133e-07,
"loss": 0.865,
"step": 1145
},
{
"epoch": 0.8557826939232708,
"grad_norm": 3.5195514432600206,
"learning_rate": 5.3566524560214e-07,
"loss": 0.923,
"step": 1146
},
{
"epoch": 0.8565294502006907,
"grad_norm": 2.871952018242148,
"learning_rate": 5.302286976781956e-07,
"loss": 1.0447,
"step": 1147
},
{
"epoch": 0.8572762064781108,
"grad_norm": 2.925114725852367,
"learning_rate": 5.248183337216328e-07,
"loss": 0.9123,
"step": 1148
},
{
"epoch": 0.8572762064781108,
"eval_loss": 0.9314769506454468,
"eval_runtime": 162.7458,
"eval_samples_per_second": 110.78,
"eval_steps_per_second": 1.733,
"step": 1148
},
{
"epoch": 0.8580229627555307,
"grad_norm": 4.717832820933871,
"learning_rate": 5.194341854264206e-07,
"loss": 1.0074,
"step": 1149
},
{
"epoch": 0.8587697190329506,
"grad_norm": 3.809263009430757,
"learning_rate": 5.140762843329583e-07,
"loss": 0.9953,
"step": 1150
},
{
"epoch": 0.8595164753103706,
"grad_norm": 4.127384933862416,
"learning_rate": 5.087446618278858e-07,
"loss": 0.9496,
"step": 1151
},
{
"epoch": 0.8602632315877905,
"grad_norm": 4.437603789678461,
"learning_rate": 5.034393491439044e-07,
"loss": 0.9705,
"step": 1152
},
{
"epoch": 0.8610099878652105,
"grad_norm": 3.243478071729358,
"learning_rate": 4.981603773595911e-07,
"loss": 0.9363,
"step": 1153
},
{
"epoch": 0.8617567441426305,
"grad_norm": 2.6993410282554007,
"learning_rate": 4.929077773992186e-07,
"loss": 0.9113,
"step": 1154
},
{
"epoch": 0.8625035004200504,
"grad_norm": 2.91054239299717,
"learning_rate": 4.87681580032573e-07,
"loss": 0.9461,
"step": 1155
},
{
"epoch": 0.8632502566974704,
"grad_norm": 2.568086078013279,
"learning_rate": 4.824818158747718e-07,
"loss": 0.8377,
"step": 1156
},
{
"epoch": 0.8639970129748903,
"grad_norm": 3.029881375353677,
"learning_rate": 4.773085153860912e-07,
"loss": 0.9166,
"step": 1157
},
{
"epoch": 0.8647437692523102,
"grad_norm": 2.88437306298097,
"learning_rate": 4.7216170887177834e-07,
"loss": 0.9333,
"step": 1158
},
{
"epoch": 0.8654905255297303,
"grad_norm": 3.137367070847363,
"learning_rate": 4.6704142648188013e-07,
"loss": 0.9337,
"step": 1159
},
{
"epoch": 0.8662372818071502,
"grad_norm": 2.9359952034209273,
"learning_rate": 4.619476982110649e-07,
"loss": 0.9736,
"step": 1160
},
{
"epoch": 0.8669840380845701,
"grad_norm": 3.1468262493105184,
"learning_rate": 4.568805538984461e-07,
"loss": 0.9122,
"step": 1161
},
{
"epoch": 0.8677307943619901,
"grad_norm": 3.5463405622760105,
"learning_rate": 4.5184002322740784e-07,
"loss": 1.0057,
"step": 1162
},
{
"epoch": 0.8677307943619901,
"eval_loss": 0.9312469363212585,
"eval_runtime": 162.7083,
"eval_samples_per_second": 110.806,
"eval_steps_per_second": 1.733,
"step": 1162
},
{
"epoch": 0.86847755063941,
"grad_norm": 2.9402861286207607,
"learning_rate": 4.468261357254339e-07,
"loss": 0.918,
"step": 1163
},
{
"epoch": 0.8692243069168301,
"grad_norm": 3.5252882533020564,
"learning_rate": 4.41838920763929e-07,
"loss": 0.9697,
"step": 1164
},
{
"epoch": 0.86997106319425,
"grad_norm": 3.682368504764222,
"learning_rate": 4.368784075580512e-07,
"loss": 0.9509,
"step": 1165
},
{
"epoch": 0.8707178194716699,
"grad_norm": 3.3322856389909865,
"learning_rate": 4.319446251665388e-07,
"loss": 1.0236,
"step": 1166
},
{
"epoch": 0.8714645757490899,
"grad_norm": 3.4771965818234882,
"learning_rate": 4.2703760249154124e-07,
"loss": 0.9486,
"step": 1167
},
{
"epoch": 0.8722113320265098,
"grad_norm": 3.069323768435232,
"learning_rate": 4.221573682784486e-07,
"loss": 0.9591,
"step": 1168
},
{
"epoch": 0.8729580883039298,
"grad_norm": 4.090015934591448,
"learning_rate": 4.1730395111572397e-07,
"loss": 0.969,
"step": 1169
},
{
"epoch": 0.8737048445813498,
"grad_norm": 6.2126368267999705,
"learning_rate": 4.124773794347375e-07,
"loss": 0.9801,
"step": 1170
},
{
"epoch": 0.8744516008587697,
"grad_norm": 2.655406724824259,
"learning_rate": 4.0767768150959785e-07,
"loss": 0.9159,
"step": 1171
},
{
"epoch": 0.8751983571361897,
"grad_norm": 2.840595662469952,
"learning_rate": 4.0290488545698224e-07,
"loss": 0.8925,
"step": 1172
},
{
"epoch": 0.8759451134136096,
"grad_norm": 3.2666873166974373,
"learning_rate": 3.9815901923598354e-07,
"loss": 0.9983,
"step": 1173
},
{
"epoch": 0.8766918696910296,
"grad_norm": 2.899555759428235,
"learning_rate": 3.934401106479352e-07,
"loss": 0.983,
"step": 1174
},
{
"epoch": 0.8774386259684496,
"grad_norm": 3.082400103066319,
"learning_rate": 3.8874818733625363e-07,
"loss": 0.9847,
"step": 1175
},
{
"epoch": 0.8781853822458695,
"grad_norm": 3.4527125973541892,
"learning_rate": 3.8408327678627343e-07,
"loss": 0.9698,
"step": 1176
},
{
"epoch": 0.8781853822458695,
"eval_loss": 0.9311810731887817,
"eval_runtime": 161.1374,
"eval_samples_per_second": 111.886,
"eval_steps_per_second": 1.75,
"step": 1176
},
{
"epoch": 0.8789321385232894,
"grad_norm": 3.090415752994308,
"learning_rate": 3.79445406325093e-07,
"loss": 1.0307,
"step": 1177
},
{
"epoch": 0.8796788948007094,
"grad_norm": 2.6147140868395784,
"learning_rate": 3.7483460312140343e-07,
"loss": 0.9311,
"step": 1178
},
{
"epoch": 0.8804256510781294,
"grad_norm": 2.7086102542703174,
"learning_rate": 3.702508941853383e-07,
"loss": 0.9121,
"step": 1179
},
{
"epoch": 0.8811724073555494,
"grad_norm": 3.1100117614192557,
"learning_rate": 3.6569430636831496e-07,
"loss": 1.0066,
"step": 1180
},
{
"epoch": 0.8819191636329693,
"grad_norm": 3.5375133182103884,
"learning_rate": 3.611648663628725e-07,
"loss": 1.0192,
"step": 1181
},
{
"epoch": 0.8826659199103892,
"grad_norm": 3.4809829662795053,
"learning_rate": 3.566626007025159e-07,
"loss": 0.9227,
"step": 1182
},
{
"epoch": 0.8834126761878092,
"grad_norm": 2.807725591167475,
"learning_rate": 3.5218753576156837e-07,
"loss": 0.9324,
"step": 1183
},
{
"epoch": 0.8841594324652292,
"grad_norm": 4.1307538335322995,
"learning_rate": 3.477396977550052e-07,
"loss": 1.0402,
"step": 1184
},
{
"epoch": 0.8849061887426491,
"grad_norm": 3.1065655720227943,
"learning_rate": 3.433191127383079e-07,
"loss": 0.9984,
"step": 1185
},
{
"epoch": 0.8856529450200691,
"grad_norm": 3.0423671480418006,
"learning_rate": 3.3892580660731146e-07,
"loss": 0.9477,
"step": 1186
},
{
"epoch": 0.886399701297489,
"grad_norm": 5.506314111297366,
"learning_rate": 3.3455980509804865e-07,
"loss": 0.9375,
"step": 1187
},
{
"epoch": 0.887146457574909,
"grad_norm": 2.7257294061538526,
"learning_rate": 3.302211337866029e-07,
"loss": 0.9616,
"step": 1188
},
{
"epoch": 0.887893213852329,
"grad_norm": 4.286522315516212,
"learning_rate": 3.2590981808895637e-07,
"loss": 1.002,
"step": 1189
},
{
"epoch": 0.8886399701297489,
"grad_norm": 3.266886884058907,
"learning_rate": 3.21625883260841e-07,
"loss": 0.9467,
"step": 1190
},
{
"epoch": 0.8886399701297489,
"eval_loss": 0.9309723377227783,
"eval_runtime": 161.2639,
"eval_samples_per_second": 111.798,
"eval_steps_per_second": 1.749,
"step": 1190
},
{
"epoch": 0.8893867264071689,
"grad_norm": 2.594459317115802,
"learning_rate": 3.173693543975931e-07,
"loss": 0.9563,
"step": 1191
},
{
"epoch": 0.8901334826845888,
"grad_norm": 3.4125681478868297,
"learning_rate": 3.1314025643400246e-07,
"loss": 0.9761,
"step": 1192
},
{
"epoch": 0.8908802389620087,
"grad_norm": 3.2367487026935855,
"learning_rate": 3.089386141441714e-07,
"loss": 1.0675,
"step": 1193
},
{
"epoch": 0.8916269952394287,
"grad_norm": 2.7617284333366316,
"learning_rate": 3.0476445214136343e-07,
"loss": 0.9114,
"step": 1194
},
{
"epoch": 0.8923737515168487,
"grad_norm": 3.4059049786673707,
"learning_rate": 3.0061779487786325e-07,
"loss": 1.0092,
"step": 1195
},
{
"epoch": 0.8931205077942687,
"grad_norm": 2.9466488189376094,
"learning_rate": 2.9649866664483387e-07,
"loss": 1.0071,
"step": 1196
},
{
"epoch": 0.8938672640716886,
"grad_norm": 3.2214019544717356,
"learning_rate": 2.9240709157217107e-07,
"loss": 0.9705,
"step": 1197
},
{
"epoch": 0.8946140203491085,
"grad_norm": 2.92519808633413,
"learning_rate": 2.883430936283649e-07,
"loss": 0.9843,
"step": 1198
},
{
"epoch": 0.8953607766265285,
"grad_norm": 3.173147787422756,
"learning_rate": 2.8430669662035784e-07,
"loss": 0.961,
"step": 1199
},
{
"epoch": 0.8961075329039485,
"grad_norm": 3.73113933358305,
"learning_rate": 2.802979241934067e-07,
"loss": 0.9631,
"step": 1200
},
{
"epoch": 0.8968542891813684,
"grad_norm": 2.8388033769300582,
"learning_rate": 2.7631679983094293e-07,
"loss": 0.9686,
"step": 1201
},
{
"epoch": 0.8976010454587884,
"grad_norm": 2.580484676835834,
"learning_rate": 2.72363346854434e-07,
"loss": 0.8967,
"step": 1202
},
{
"epoch": 0.8983478017362083,
"grad_norm": 2.6372158270212616,
"learning_rate": 2.684375884232493e-07,
"loss": 0.8848,
"step": 1203
},
{
"epoch": 0.8990945580136283,
"grad_norm": 5.045405465113533,
"learning_rate": 2.645395475345236e-07,
"loss": 0.9653,
"step": 1204
},
{
"epoch": 0.8990945580136283,
"eval_loss": 0.9308164119720459,
"eval_runtime": 160.9645,
"eval_samples_per_second": 112.006,
"eval_steps_per_second": 1.752,
"step": 1204
},
{
"epoch": 0.8998413142910483,
"grad_norm": 3.81237492723543,
"learning_rate": 2.6066924702302044e-07,
"loss": 0.985,
"step": 1205
},
{
"epoch": 0.9005880705684682,
"grad_norm": 4.444299985817935,
"learning_rate": 2.568267095610022e-07,
"loss": 0.9718,
"step": 1206
},
{
"epoch": 0.9013348268458882,
"grad_norm": 3.213116580736095,
"learning_rate": 2.530119576580936e-07,
"loss": 0.95,
"step": 1207
},
{
"epoch": 0.9020815831233081,
"grad_norm": 3.304199593315397,
"learning_rate": 2.492250136611513e-07,
"loss": 0.9401,
"step": 1208
},
{
"epoch": 0.902828339400728,
"grad_norm": 3.216958801550224,
"learning_rate": 2.454658997541326e-07,
"loss": 0.9949,
"step": 1209
},
{
"epoch": 0.9035750956781481,
"grad_norm": 2.8366418091093633,
"learning_rate": 2.417346379579671e-07,
"loss": 0.9415,
"step": 1210
},
{
"epoch": 0.904321851955568,
"grad_norm": 3.4418483482994353,
"learning_rate": 2.380312501304255e-07,
"loss": 0.9263,
"step": 1211
},
{
"epoch": 0.905068608232988,
"grad_norm": 3.2162625122987283,
"learning_rate": 2.343557579659922e-07,
"loss": 0.9481,
"step": 1212
},
{
"epoch": 0.9058153645104079,
"grad_norm": 3.6606526579447363,
"learning_rate": 2.3070818299573972e-07,
"loss": 0.9496,
"step": 1213
},
{
"epoch": 0.9065621207878278,
"grad_norm": 3.413782160357342,
"learning_rate": 2.2708854658720135e-07,
"loss": 0.8925,
"step": 1214
},
{
"epoch": 0.9073088770652479,
"grad_norm": 3.37342109636671,
"learning_rate": 2.2349686994424303e-07,
"loss": 0.9775,
"step": 1215
},
{
"epoch": 0.9080556333426678,
"grad_norm": 3.226917932046761,
"learning_rate": 2.1993317410694605e-07,
"loss": 0.9228,
"step": 1216
},
{
"epoch": 0.9088023896200877,
"grad_norm": 2.9170592051096937,
"learning_rate": 2.1639747995147843e-07,
"loss": 0.9238,
"step": 1217
},
{
"epoch": 0.9095491458975077,
"grad_norm": 4.372205178863148,
"learning_rate": 2.1288980818997272e-07,
"loss": 0.974,
"step": 1218
},
{
"epoch": 0.9095491458975077,
"eval_loss": 0.9307811260223389,
"eval_runtime": 162.4851,
"eval_samples_per_second": 110.958,
"eval_steps_per_second": 1.736,
"step": 1218
},
{
"epoch": 0.9102959021749276,
"grad_norm": 2.9674581524340713,
"learning_rate": 2.094101793704073e-07,
"loss": 0.9195,
"step": 1219
},
{
"epoch": 0.9110426584523477,
"grad_norm": 2.9235998255992537,
"learning_rate": 2.0595861387648574e-07,
"loss": 0.9142,
"step": 1220
},
{
"epoch": 0.9117894147297676,
"grad_norm": 3.485012716463773,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.9069,
"step": 1221
},
{
"epoch": 0.9125361710071875,
"grad_norm": 2.6461707104119876,
"learning_rate": 1.9913975357828408e-07,
"loss": 0.9067,
"step": 1222
},
{
"epoch": 0.9132829272846075,
"grad_norm": 2.661675283327674,
"learning_rate": 1.957724987189602e-07,
"loss": 0.9367,
"step": 1223
},
{
"epoch": 0.9140296835620274,
"grad_norm": 3.1094316780050812,
"learning_rate": 1.9243338707495618e-07,
"loss": 0.8989,
"step": 1224
},
{
"epoch": 0.9147764398394475,
"grad_norm": 3.183968339471646,
"learning_rate": 1.8912243820682296e-07,
"loss": 0.9342,
"step": 1225
},
{
"epoch": 0.9155231961168674,
"grad_norm": 2.9166944722749273,
"learning_rate": 1.858396715101346e-07,
"loss": 1.0136,
"step": 1226
},
{
"epoch": 0.9162699523942873,
"grad_norm": 4.554593719123295,
"learning_rate": 1.8258510621537219e-07,
"loss": 0.9789,
"step": 1227
},
{
"epoch": 0.9170167086717073,
"grad_norm": 2.8500992040594473,
"learning_rate": 1.7935876138781284e-07,
"loss": 0.9422,
"step": 1228
},
{
"epoch": 0.9177634649491272,
"grad_norm": 3.314999491197659,
"learning_rate": 1.7616065592742038e-07,
"loss": 0.9746,
"step": 1229
},
{
"epoch": 0.9185102212265472,
"grad_norm": 2.785252191895322,
"learning_rate": 1.729908085687293e-07,
"loss": 0.8996,
"step": 1230
},
{
"epoch": 0.9192569775039672,
"grad_norm": 3.312621983480616,
"learning_rate": 1.698492378807387e-07,
"loss": 0.9983,
"step": 1231
},
{
"epoch": 0.9200037337813871,
"grad_norm": 2.970201392187938,
"learning_rate": 1.6673596226680356e-07,
"loss": 0.9679,
"step": 1232
},
{
"epoch": 0.9200037337813871,
"eval_loss": 0.930705726146698,
"eval_runtime": 162.5052,
"eval_samples_per_second": 110.944,
"eval_steps_per_second": 1.735,
"step": 1232
},
{
"epoch": 0.920750490058807,
"grad_norm": 2.766948354777863,
"learning_rate": 1.6365099996452416e-07,
"loss": 0.9815,
"step": 1233
},
{
"epoch": 0.921497246336227,
"grad_norm": 3.775583043641789,
"learning_rate": 1.6059436904564296e-07,
"loss": 0.9999,
"step": 1234
},
{
"epoch": 0.9222440026136469,
"grad_norm": 3.1430081879332823,
"learning_rate": 1.575660874159346e-07,
"loss": 0.969,
"step": 1235
},
{
"epoch": 0.922990758891067,
"grad_norm": 3.861696256410228,
"learning_rate": 1.545661728151071e-07,
"loss": 1.0212,
"step": 1236
},
{
"epoch": 0.9237375151684869,
"grad_norm": 3.6672560176616704,
"learning_rate": 1.515946428166909e-07,
"loss": 0.9356,
"step": 1237
},
{
"epoch": 0.9244842714459068,
"grad_norm": 4.106696856846996,
"learning_rate": 1.4865151482793938e-07,
"loss": 0.9449,
"step": 1238
},
{
"epoch": 0.9252310277233268,
"grad_norm": 2.9579773642607066,
"learning_rate": 1.4573680608972796e-07,
"loss": 0.9129,
"step": 1239
},
{
"epoch": 0.9259777840007467,
"grad_norm": 2.808101044256339,
"learning_rate": 1.4285053367645074e-07,
"loss": 0.866,
"step": 1240
},
{
"epoch": 0.9267245402781668,
"grad_norm": 2.9339064632805307,
"learning_rate": 1.3999271449592177e-07,
"loss": 0.9948,
"step": 1241
},
{
"epoch": 0.9274712965555867,
"grad_norm": 3.0554176273960563,
"learning_rate": 1.371633652892762e-07,
"loss": 1.0511,
"step": 1242
},
{
"epoch": 0.9282180528330066,
"grad_norm": 5.510569175235229,
"learning_rate": 1.3436250263087204e-07,
"loss": 0.928,
"step": 1243
},
{
"epoch": 0.9289648091104266,
"grad_norm": 2.844624274973105,
"learning_rate": 1.3159014292819126e-07,
"loss": 0.9032,
"step": 1244
},
{
"epoch": 0.9297115653878465,
"grad_norm": 4.434412677362633,
"learning_rate": 1.2884630242174734e-07,
"loss": 0.9385,
"step": 1245
},
{
"epoch": 0.9304583216652665,
"grad_norm": 3.4872159250577295,
"learning_rate": 1.2613099718498556e-07,
"loss": 0.9364,
"step": 1246
},
{
"epoch": 0.9304583216652665,
"eval_loss": 0.9305473566055298,
"eval_runtime": 162.4805,
"eval_samples_per_second": 110.961,
"eval_steps_per_second": 1.736,
"step": 1246
},
{
"epoch": 0.9312050779426865,
"grad_norm": 3.5713420230443695,
"learning_rate": 1.234442431241939e-07,
"loss": 1.0143,
"step": 1247
},
{
"epoch": 0.9319518342201064,
"grad_norm": 2.8182072549192214,
"learning_rate": 1.207860559784052e-07,
"loss": 0.9826,
"step": 1248
},
{
"epoch": 0.9326985904975263,
"grad_norm": 2.875098039824205,
"learning_rate": 1.181564513193073e-07,
"loss": 0.9561,
"step": 1249
},
{
"epoch": 0.9334453467749463,
"grad_norm": 3.08129402935908,
"learning_rate": 1.1555544455115253e-07,
"loss": 0.9792,
"step": 1250
},
{
"epoch": 0.9341921030523663,
"grad_norm": 4.008794109759551,
"learning_rate": 1.1298305091066664e-07,
"loss": 1.0236,
"step": 1251
},
{
"epoch": 0.9349388593297863,
"grad_norm": 3.25803536742016,
"learning_rate": 1.1043928546695782e-07,
"loss": 0.9582,
"step": 1252
},
{
"epoch": 0.9356856156072062,
"grad_norm": 2.8272859290809005,
"learning_rate": 1.0792416312143172e-07,
"loss": 0.939,
"step": 1253
},
{
"epoch": 0.9364323718846261,
"grad_norm": 3.27251051941531,
"learning_rate": 1.0543769860769992e-07,
"loss": 0.9998,
"step": 1254
},
{
"epoch": 0.9371791281620461,
"grad_norm": 3.3364183862421113,
"learning_rate": 1.029799064914988e-07,
"loss": 0.9757,
"step": 1255
},
{
"epoch": 0.9379258844394661,
"grad_norm": 3.3619896026786265,
"learning_rate": 1.0055080117060079e-07,
"loss": 1.0142,
"step": 1256
},
{
"epoch": 0.938672640716886,
"grad_norm": 3.0948786475102508,
"learning_rate": 9.81503968747305e-08,
"loss": 0.9619,
"step": 1257
},
{
"epoch": 0.939419396994306,
"grad_norm": 3.0788291331198674,
"learning_rate": 9.577870766547981e-08,
"loss": 0.9774,
"step": 1258
},
{
"epoch": 0.9401661532717259,
"grad_norm": 3.2586272595359516,
"learning_rate": 9.34357474362313e-08,
"loss": 0.9681,
"step": 1259
},
{
"epoch": 0.9409129095491459,
"grad_norm": 3.0686211394939424,
"learning_rate": 9.112152991206991e-08,
"loss": 0.992,
"step": 1260
},
{
"epoch": 0.9409129095491459,
"eval_loss": 0.9305338859558105,
"eval_runtime": 161.1412,
"eval_samples_per_second": 111.883,
"eval_steps_per_second": 1.75,
"step": 1260
},
{
"epoch": 0.9416596658265659,
"grad_norm": 2.8839831606268143,
"learning_rate": 8.883606864970585e-08,
"loss": 0.8647,
"step": 1261
},
{
"epoch": 0.9424064221039858,
"grad_norm": 3.5045862915012886,
"learning_rate": 8.657937703739516e-08,
"loss": 0.9615,
"step": 1262
},
{
"epoch": 0.9431531783814058,
"grad_norm": 3.013831723584786,
"learning_rate": 8.435146829486263e-08,
"loss": 0.9761,
"step": 1263
},
{
"epoch": 0.9438999346588257,
"grad_norm": 3.05235918180062,
"learning_rate": 8.215235547321897e-08,
"loss": 0.9317,
"step": 1264
},
{
"epoch": 0.9446466909362456,
"grad_norm": 2.8893059761689592,
"learning_rate": 7.998205145489157e-08,
"loss": 0.9059,
"step": 1265
},
{
"epoch": 0.9453934472136657,
"grad_norm": 2.9521748418786355,
"learning_rate": 7.784056895354386e-08,
"loss": 0.9681,
"step": 1266
},
{
"epoch": 0.9461402034910856,
"grad_norm": 3.1653409809050665,
"learning_rate": 7.572792051400325e-08,
"loss": 0.9913,
"step": 1267
},
{
"epoch": 0.9468869597685056,
"grad_norm": 2.6124719184273477,
"learning_rate": 7.364411851218667e-08,
"loss": 0.8977,
"step": 1268
},
{
"epoch": 0.9476337160459255,
"grad_norm": 2.858057980612677,
"learning_rate": 7.158917515502739e-08,
"loss": 0.9498,
"step": 1269
},
{
"epoch": 0.9483804723233454,
"grad_norm": 3.549096737039596,
"learning_rate": 6.95631024804061e-08,
"loss": 0.9432,
"step": 1270
},
{
"epoch": 0.9491272286007654,
"grad_norm": 3.1098491003954307,
"learning_rate": 6.75659123570771e-08,
"loss": 0.9664,
"step": 1271
},
{
"epoch": 0.9498739848781854,
"grad_norm": 3.4670315690611377,
"learning_rate": 6.559761648460117e-08,
"loss": 0.8994,
"step": 1272
},
{
"epoch": 0.9506207411556054,
"grad_norm": 4.325331230417902,
"learning_rate": 6.365822639327724e-08,
"loss": 0.9636,
"step": 1273
},
{
"epoch": 0.9513674974330253,
"grad_norm": 2.9355017549957942,
"learning_rate": 6.174775344407246e-08,
"loss": 0.9531,
"step": 1274
},
{
"epoch": 0.9513674974330253,
"eval_loss": 0.9305031299591064,
"eval_runtime": 162.3818,
"eval_samples_per_second": 111.028,
"eval_steps_per_second": 1.737,
"step": 1274
},
{
"epoch": 0.9521142537104452,
"grad_norm": 2.7787168628182286,
"learning_rate": 5.986620882855676e-08,
"loss": 0.9989,
"step": 1275
},
{
"epoch": 0.9528610099878652,
"grad_norm": 3.250198943576934,
"learning_rate": 5.801360356883945e-08,
"loss": 1.0186,
"step": 1276
},
{
"epoch": 0.9536077662652852,
"grad_norm": 3.5045478545742372,
"learning_rate": 5.618994851750104e-08,
"loss": 1.0243,
"step": 1277
},
{
"epoch": 0.9543545225427051,
"grad_norm": 3.596766310602903,
"learning_rate": 5.439525435753157e-08,
"loss": 1.0089,
"step": 1278
},
{
"epoch": 0.9551012788201251,
"grad_norm": 3.2110723577196394,
"learning_rate": 5.262953160226958e-08,
"loss": 1.0486,
"step": 1279
},
{
"epoch": 0.955848035097545,
"grad_norm": 2.945046395031802,
"learning_rate": 5.089279059533658e-08,
"loss": 0.8746,
"step": 1280
},
{
"epoch": 0.956594791374965,
"grad_norm": 2.742862157777454,
"learning_rate": 4.91850415105799e-08,
"loss": 0.905,
"step": 1281
},
{
"epoch": 0.957341547652385,
"grad_norm": 2.816790640635472,
"learning_rate": 4.7506294352011596e-08,
"loss": 0.9554,
"step": 1282
},
{
"epoch": 0.9580883039298049,
"grad_norm": 3.3497034429748975,
"learning_rate": 4.5856558953750744e-08,
"loss": 0.8932,
"step": 1283
},
{
"epoch": 0.9588350602072249,
"grad_norm": 2.9107805279966543,
"learning_rate": 4.423584497996458e-08,
"loss": 0.9526,
"step": 1284
},
{
"epoch": 0.9595818164846448,
"grad_norm": 2.700904755255844,
"learning_rate": 4.2644161924811353e-08,
"loss": 0.9566,
"step": 1285
},
{
"epoch": 0.9603285727620647,
"grad_norm": 3.2097128106381225,
"learning_rate": 4.108151911238922e-08,
"loss": 0.9119,
"step": 1286
},
{
"epoch": 0.9610753290394848,
"grad_norm": 3.1787175007546455,
"learning_rate": 3.9547925696675206e-08,
"loss": 0.9617,
"step": 1287
},
{
"epoch": 0.9618220853169047,
"grad_norm": 3.468752659560656,
"learning_rate": 3.804339066147467e-08,
"loss": 1.004,
"step": 1288
},
{
"epoch": 0.9618220853169047,
"eval_loss": 0.9304465055465698,
"eval_runtime": 162.5341,
"eval_samples_per_second": 110.924,
"eval_steps_per_second": 1.735,
"step": 1288
},
{
"epoch": 0.9625688415943247,
"grad_norm": 2.8644336249094513,
"learning_rate": 3.656792282037136e-08,
"loss": 0.8925,
"step": 1289
},
{
"epoch": 0.9633155978717446,
"grad_norm": 3.0822614830737627,
"learning_rate": 3.512153081667135e-08,
"loss": 0.9553,
"step": 1290
},
{
"epoch": 0.9640623541491645,
"grad_norm": 2.907482583857242,
"learning_rate": 3.370422312335309e-08,
"loss": 0.971,
"step": 1291
},
{
"epoch": 0.9648091104265846,
"grad_norm": 3.070952453759052,
"learning_rate": 3.2316008043020154e-08,
"loss": 0.9381,
"step": 1292
},
{
"epoch": 0.9655558667040045,
"grad_norm": 3.397673851079207,
"learning_rate": 3.095689370785249e-08,
"loss": 0.9627,
"step": 1293
},
{
"epoch": 0.9663026229814244,
"grad_norm": 2.9351868857737684,
"learning_rate": 2.9626888079554716e-08,
"loss": 0.9772,
"step": 1294
},
{
"epoch": 0.9670493792588444,
"grad_norm": 2.8830349043131154,
"learning_rate": 2.8325998949314536e-08,
"loss": 0.9592,
"step": 1295
},
{
"epoch": 0.9677961355362643,
"grad_norm": 2.7889585719099848,
"learning_rate": 2.705423393775386e-08,
"loss": 0.9692,
"step": 1296
},
{
"epoch": 0.9685428918136844,
"grad_norm": 3.3872739189532446,
"learning_rate": 2.5811600494885512e-08,
"loss": 0.9343,
"step": 1297
},
{
"epoch": 0.9692896480911043,
"grad_norm": 3.735163743338174,
"learning_rate": 2.4598105900069392e-08,
"loss": 0.9593,
"step": 1298
},
{
"epoch": 0.9700364043685242,
"grad_norm": 2.71350691275283,
"learning_rate": 2.341375726197026e-08,
"loss": 0.9819,
"step": 1299
},
{
"epoch": 0.9707831606459442,
"grad_norm": 2.6806573316875517,
"learning_rate": 2.2258561518513912e-08,
"loss": 0.9477,
"step": 1300
},
{
"epoch": 0.9715299169233641,
"grad_norm": 3.2230529938354278,
"learning_rate": 2.1132525436849406e-08,
"loss": 0.9044,
"step": 1301
},
{
"epoch": 0.9722766732007841,
"grad_norm": 3.2452809812911276,
"learning_rate": 2.003565561330856e-08,
"loss": 0.8966,
"step": 1302
},
{
"epoch": 0.9722766732007841,
"eval_loss": 0.9304633736610413,
"eval_runtime": 162.5458,
"eval_samples_per_second": 110.916,
"eval_steps_per_second": 1.735,
"step": 1302
},
{
"epoch": 0.9730234294782041,
"grad_norm": 3.169797354119332,
"learning_rate": 1.896795847336541e-08,
"loss": 0.9213,
"step": 1303
},
{
"epoch": 0.973770185755624,
"grad_norm": 2.9915639532159046,
"learning_rate": 1.792944027160237e-08,
"loss": 0.8673,
"step": 1304
},
{
"epoch": 0.974516942033044,
"grad_norm": 4.0354586375745765,
"learning_rate": 1.6920107091668582e-08,
"loss": 0.9383,
"step": 1305
},
{
"epoch": 0.9752636983104639,
"grad_norm": 2.8461272392012247,
"learning_rate": 1.593996484624938e-08,
"loss": 0.9598,
"step": 1306
},
{
"epoch": 0.9760104545878838,
"grad_norm": 3.0983794245126086,
"learning_rate": 1.4989019277028004e-08,
"loss": 1.0039,
"step": 1307
},
{
"epoch": 0.9767572108653039,
"grad_norm": 3.1350242555646743,
"learning_rate": 1.4067275954653403e-08,
"loss": 0.9235,
"step": 1308
},
{
"epoch": 0.9775039671427238,
"grad_norm": 2.9339669005824467,
"learning_rate": 1.3174740278708575e-08,
"loss": 0.9089,
"step": 1309
},
{
"epoch": 0.9782507234201437,
"grad_norm": 4.474824857246958,
"learning_rate": 1.2311417477676168e-08,
"loss": 0.9183,
"step": 1310
},
{
"epoch": 0.9789974796975637,
"grad_norm": 2.989598754747387,
"learning_rate": 1.1477312608910162e-08,
"loss": 0.9991,
"step": 1311
},
{
"epoch": 0.9797442359749836,
"grad_norm": 2.9625742101217014,
"learning_rate": 1.0672430558605895e-08,
"loss": 0.953,
"step": 1312
},
{
"epoch": 0.9804909922524037,
"grad_norm": 2.951799943908215,
"learning_rate": 9.89677604177064e-09,
"loss": 0.9771,
"step": 1313
},
{
"epoch": 0.9812377485298236,
"grad_norm": 3.131290656868523,
"learning_rate": 9.150353602197516e-09,
"loss": 0.8718,
"step": 1314
},
{
"epoch": 0.9819845048072435,
"grad_norm": 2.953246971653502,
"learning_rate": 8.433167612436066e-09,
"loss": 0.9561,
"step": 1315
},
{
"epoch": 0.9827312610846635,
"grad_norm": 3.994087349668577,
"learning_rate": 7.745222273770059e-09,
"loss": 0.9905,
"step": 1316
},
{
"epoch": 0.9827312610846635,
"eval_loss": 0.9304625988006592,
"eval_runtime": 162.5204,
"eval_samples_per_second": 110.934,
"eval_steps_per_second": 1.735,
"step": 1316
},
{
"epoch": 0.9834780173620834,
"grad_norm": 2.854502234217566,
"learning_rate": 7.0865216161902785e-09,
"loss": 1.0256,
"step": 1317
},
{
"epoch": 0.9842247736395034,
"grad_norm": 3.1821866741449867,
"learning_rate": 6.457069498372326e-09,
"loss": 0.9699,
"step": 1318
},
{
"epoch": 0.9849715299169234,
"grad_norm": 2.869146882764378,
"learning_rate": 5.856869607652749e-09,
"loss": 0.9833,
"step": 1319
},
{
"epoch": 0.9857182861943433,
"grad_norm": 3.1328413430563975,
"learning_rate": 5.285925460009056e-09,
"loss": 0.9053,
"step": 1320
},
{
"epoch": 0.9864650424717633,
"grad_norm": 2.6428303935877917,
"learning_rate": 4.744240400038624e-09,
"loss": 0.9862,
"step": 1321
},
{
"epoch": 0.9872117987491832,
"grad_norm": 3.6370165390584277,
"learning_rate": 4.231817600938159e-09,
"loss": 0.9705,
"step": 1322
},
{
"epoch": 0.9879585550266032,
"grad_norm": 3.206324861128839,
"learning_rate": 3.748660064484821e-09,
"loss": 0.9382,
"step": 1323
},
{
"epoch": 0.9887053113040232,
"grad_norm": 3.3127354281132635,
"learning_rate": 3.2947706210217923e-09,
"loss": 0.9847,
"step": 1324
},
{
"epoch": 0.9894520675814431,
"grad_norm": 2.5984978136593284,
"learning_rate": 2.8701519294371815e-09,
"loss": 0.9211,
"step": 1325
},
{
"epoch": 0.990198823858863,
"grad_norm": 2.806011670037097,
"learning_rate": 2.4748064771529247e-09,
"loss": 0.9384,
"step": 1326
},
{
"epoch": 0.990945580136283,
"grad_norm": 3.06945560096709,
"learning_rate": 2.1087365801053526e-09,
"loss": 0.9233,
"step": 1327
},
{
"epoch": 0.991692336413703,
"grad_norm": 3.170045754610596,
"learning_rate": 1.7719443827368677e-09,
"loss": 1.0009,
"step": 1328
},
{
"epoch": 0.992439092691123,
"grad_norm": 2.5106733677134887,
"learning_rate": 1.4644318579798422e-09,
"loss": 0.8646,
"step": 1329
},
{
"epoch": 0.9931858489685429,
"grad_norm": 2.6476074361565733,
"learning_rate": 1.186200807245519e-09,
"loss": 1.007,
"step": 1330
},
{
"epoch": 0.9931858489685429,
"eval_loss": 0.9304366111755371,
"eval_runtime": 160.9597,
"eval_samples_per_second": 112.009,
"eval_steps_per_second": 1.752,
"step": 1330
},
{
"epoch": 0.9939326052459628,
"grad_norm": 3.095198427340493,
"learning_rate": 9.372528604134623e-10,
"loss": 0.9486,
"step": 1331
},
{
"epoch": 0.9946793615233828,
"grad_norm": 3.941739091074789,
"learning_rate": 7.17589475824898e-10,
"loss": 0.9998,
"step": 1332
},
{
"epoch": 0.9954261178008028,
"grad_norm": 2.6143928805877543,
"learning_rate": 5.272119402693898e-10,
"loss": 0.9061,
"step": 1333
},
{
"epoch": 0.9961728740782227,
"grad_norm": 3.161138570652923,
"learning_rate": 3.6612136898039885e-10,
"loss": 0.9688,
"step": 1334
},
{
"epoch": 0.9969196303556427,
"grad_norm": 4.104053917839182,
"learning_rate": 2.3431870562917735e-10,
"loss": 1.0086,
"step": 1335
},
{
"epoch": 0.9976663866330626,
"grad_norm": 2.9990664641234006,
"learning_rate": 1.3180472231588694e-10,
"loss": 1.0231,
"step": 1336
},
{
"epoch": 0.9984131429104826,
"grad_norm": 2.6250712172326782,
"learning_rate": 5.858001956904335e-11,
"loss": 0.9374,
"step": 1337
},
{
"epoch": 0.9991598991879026,
"grad_norm": 2.8904822230789065,
"learning_rate": 1.464502633996556e-11,
"loss": 0.87,
"step": 1338
},
{
"epoch": 0.9999066554653225,
"grad_norm": 2.4805900744497618,
"learning_rate": 0.0,
"loss": 0.8749,
"step": 1339
},
{
"epoch": 0.9999066554653225,
"step": 1339,
"total_flos": 1690624755499008.0,
"train_loss": 1.0187733439120719,
"train_runtime": 28145.7978,
"train_samples_per_second": 12.179,
"train_steps_per_second": 0.048
}
],
"logging_steps": 1.0,
"max_steps": 1339,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 134,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1690624755499008.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}