Files
salamandra-7b-instruct-tool…/trainer_state.json
ModelHub XC ead9e3c533 初始化项目,由ModelHub XC社区提供模型
Model: BSC-LT/salamandra-7b-instruct-tools-16k
Source: Original Platform
2026-05-22 09:40:17 +08:00

13282 lines
322 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9996635488863468,
"eval_steps": 60,
"global_step": 1857,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005383217818450979,
"grad_norm": 109.41815851913972,
"learning_rate": 1.7857142857142858e-07,
"loss": 1.8223,
"step": 1
},
{
"epoch": 0.0010766435636901958,
"grad_norm": 6.467874390653583,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.995,
"step": 2
},
{
"epoch": 0.0016149653455352936,
"grad_norm": 4802.638302957245,
"learning_rate": 5.357142857142857e-07,
"loss": 1.8859,
"step": 3
},
{
"epoch": 0.0021532871273803916,
"grad_norm": 16.08026403474417,
"learning_rate": 7.142857142857143e-07,
"loss": 0.953,
"step": 4
},
{
"epoch": 0.0026916089092254895,
"grad_norm": 57.36353411210698,
"learning_rate": 8.928571428571429e-07,
"loss": 0.9429,
"step": 5
},
{
"epoch": 0.0032299306910705873,
"grad_norm": 19675.34780096024,
"learning_rate": 1.0714285714285714e-06,
"loss": 2.6006,
"step": 6
},
{
"epoch": 0.0037682524729156855,
"grad_norm": 818.3276445922152,
"learning_rate": 1.25e-06,
"loss": 0.9918,
"step": 7
},
{
"epoch": 0.004306574254760783,
"grad_norm": 104.38551556669266,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.9549,
"step": 8
},
{
"epoch": 0.004844896036605881,
"grad_norm": 367.74940177761715,
"learning_rate": 1.6071428571428574e-06,
"loss": 2.68,
"step": 9
},
{
"epoch": 0.005383217818450979,
"grad_norm": 2496.9575083108416,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.0041,
"step": 10
},
{
"epoch": 0.005921539600296077,
"grad_norm": 8.38602908383877,
"learning_rate": 1.9642857142857144e-06,
"loss": 0.9457,
"step": 11
},
{
"epoch": 0.0064598613821411745,
"grad_norm": 8.592616439259098,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.8897,
"step": 12
},
{
"epoch": 0.006998183163986273,
"grad_norm": 438.07218134142477,
"learning_rate": 2.321428571428572e-06,
"loss": 0.9019,
"step": 13
},
{
"epoch": 0.007536504945831371,
"grad_norm": 9.994776961708206,
"learning_rate": 2.5e-06,
"loss": 0.825,
"step": 14
},
{
"epoch": 0.008074826727676468,
"grad_norm": 10.743498710130337,
"learning_rate": 2.6785714285714285e-06,
"loss": 0.849,
"step": 15
},
{
"epoch": 0.008613148509521567,
"grad_norm": 15.967903678723232,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.8012,
"step": 16
},
{
"epoch": 0.009151470291366665,
"grad_norm": 4.1668693518220055,
"learning_rate": 3.0357142857142856e-06,
"loss": 0.7744,
"step": 17
},
{
"epoch": 0.009689792073211762,
"grad_norm": 6.926360685293587,
"learning_rate": 3.2142857142857147e-06,
"loss": 0.692,
"step": 18
},
{
"epoch": 0.010228113855056861,
"grad_norm": 3.2346473797515074,
"learning_rate": 3.3928571428571435e-06,
"loss": 0.6831,
"step": 19
},
{
"epoch": 0.010766435636901958,
"grad_norm": 3.7571978637997177,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.6537,
"step": 20
},
{
"epoch": 0.011304757418747056,
"grad_norm": 7.559105627558757,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.6141,
"step": 21
},
{
"epoch": 0.011843079200592153,
"grad_norm": 2.387397312463083,
"learning_rate": 3.928571428571429e-06,
"loss": 0.695,
"step": 22
},
{
"epoch": 0.012381400982437252,
"grad_norm": 4.391512430632287,
"learning_rate": 4.107142857142857e-06,
"loss": 0.6185,
"step": 23
},
{
"epoch": 0.012919722764282349,
"grad_norm": 4.15230181408785,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.5936,
"step": 24
},
{
"epoch": 0.013458044546127448,
"grad_norm": 2.0653396060051605,
"learning_rate": 4.464285714285715e-06,
"loss": 0.5898,
"step": 25
},
{
"epoch": 0.013996366327972546,
"grad_norm": 17.979519982673633,
"learning_rate": 4.642857142857144e-06,
"loss": 0.5905,
"step": 26
},
{
"epoch": 0.014534688109817643,
"grad_norm": 8.151818005754823,
"learning_rate": 4.821428571428572e-06,
"loss": 0.5533,
"step": 27
},
{
"epoch": 0.015073009891662742,
"grad_norm": 5.987974608299902,
"learning_rate": 5e-06,
"loss": 0.6107,
"step": 28
},
{
"epoch": 0.015611331673507839,
"grad_norm": 2.330194497601682,
"learning_rate": 5.1785714285714296e-06,
"loss": 0.5506,
"step": 29
},
{
"epoch": 0.016149653455352936,
"grad_norm": 12.794302959608173,
"learning_rate": 5.357142857142857e-06,
"loss": 0.5766,
"step": 30
},
{
"epoch": 0.016687975237198036,
"grad_norm": 1.9007212740143298,
"learning_rate": 5.535714285714286e-06,
"loss": 0.5627,
"step": 31
},
{
"epoch": 0.017226297019043133,
"grad_norm": 1.9146904127163438,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.5883,
"step": 32
},
{
"epoch": 0.01776461880088823,
"grad_norm": 2.187675355948441,
"learning_rate": 5.892857142857144e-06,
"loss": 0.5035,
"step": 33
},
{
"epoch": 0.01830294058273333,
"grad_norm": 2.6806082565798124,
"learning_rate": 6.071428571428571e-06,
"loss": 0.5542,
"step": 34
},
{
"epoch": 0.018841262364578427,
"grad_norm": 3.3951990046554323,
"learning_rate": 6.25e-06,
"loss": 0.538,
"step": 35
},
{
"epoch": 0.019379584146423524,
"grad_norm": 2.5099129162892853,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.5251,
"step": 36
},
{
"epoch": 0.01991790592826862,
"grad_norm": 2.0412677432451627,
"learning_rate": 6.607142857142858e-06,
"loss": 0.5121,
"step": 37
},
{
"epoch": 0.020456227710113722,
"grad_norm": 2.781712184570042,
"learning_rate": 6.785714285714287e-06,
"loss": 0.5254,
"step": 38
},
{
"epoch": 0.02099454949195882,
"grad_norm": 6.805805985669443,
"learning_rate": 6.964285714285714e-06,
"loss": 0.5831,
"step": 39
},
{
"epoch": 0.021532871273803916,
"grad_norm": 2.0490434598423652,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.5393,
"step": 40
},
{
"epoch": 0.022071193055649013,
"grad_norm": 1.9372367290098516,
"learning_rate": 7.321428571428572e-06,
"loss": 0.5972,
"step": 41
},
{
"epoch": 0.022609514837494113,
"grad_norm": 2.7227356306942165,
"learning_rate": 7.500000000000001e-06,
"loss": 0.5369,
"step": 42
},
{
"epoch": 0.02314783661933921,
"grad_norm": 2.011860407760454,
"learning_rate": 7.67857142857143e-06,
"loss": 0.5164,
"step": 43
},
{
"epoch": 0.023686158401184307,
"grad_norm": 2.380752182458038,
"learning_rate": 7.857142857142858e-06,
"loss": 0.4715,
"step": 44
},
{
"epoch": 0.024224480183029407,
"grad_norm": 2.0112153484283537,
"learning_rate": 8.035714285714286e-06,
"loss": 0.4943,
"step": 45
},
{
"epoch": 0.024762801964874504,
"grad_norm": 1.7657871236862508,
"learning_rate": 8.214285714285714e-06,
"loss": 0.5792,
"step": 46
},
{
"epoch": 0.0253011237467196,
"grad_norm": 2.012306508738324,
"learning_rate": 8.392857142857144e-06,
"loss": 0.5704,
"step": 47
},
{
"epoch": 0.025839445528564698,
"grad_norm": 2.0657223159326743,
"learning_rate": 8.571428571428571e-06,
"loss": 0.5145,
"step": 48
},
{
"epoch": 0.0263777673104098,
"grad_norm": 2.137310846323582,
"learning_rate": 8.750000000000001e-06,
"loss": 0.5067,
"step": 49
},
{
"epoch": 0.026916089092254895,
"grad_norm": 2.2166052489861534,
"learning_rate": 8.92857142857143e-06,
"loss": 0.5799,
"step": 50
},
{
"epoch": 0.027454410874099992,
"grad_norm": 2.029493952864758,
"learning_rate": 9.107142857142858e-06,
"loss": 0.5817,
"step": 51
},
{
"epoch": 0.027992732655945093,
"grad_norm": 1.5628607433382145,
"learning_rate": 9.285714285714288e-06,
"loss": 0.4722,
"step": 52
},
{
"epoch": 0.02853105443779019,
"grad_norm": 1.686683459837313,
"learning_rate": 9.464285714285714e-06,
"loss": 0.5233,
"step": 53
},
{
"epoch": 0.029069376219635287,
"grad_norm": 1.7287851726495882,
"learning_rate": 9.642857142857144e-06,
"loss": 0.5744,
"step": 54
},
{
"epoch": 0.029607698001480384,
"grad_norm": 2.246853321344625,
"learning_rate": 9.821428571428573e-06,
"loss": 0.4972,
"step": 55
},
{
"epoch": 0.030146019783325484,
"grad_norm": 1.9175548738162544,
"learning_rate": 1e-05,
"loss": 0.535,
"step": 56
},
{
"epoch": 0.03068434156517058,
"grad_norm": 2.169109901402676,
"learning_rate": 9.999992393020984e-06,
"loss": 0.5429,
"step": 57
},
{
"epoch": 0.031222663347015678,
"grad_norm": 2.260825281362616,
"learning_rate": 9.99996957210708e-06,
"loss": 0.521,
"step": 58
},
{
"epoch": 0.031760985128860775,
"grad_norm": 1.660309077201794,
"learning_rate": 9.999931537327727e-06,
"loss": 0.531,
"step": 59
},
{
"epoch": 0.03229930691070587,
"grad_norm": 2.069841458563405,
"learning_rate": 9.999878288798659e-06,
"loss": 0.5661,
"step": 60
},
{
"epoch": 0.03229930691070587,
"eval_loss": 0.5262647271156311,
"eval_runtime": 1569.0341,
"eval_samples_per_second": 15.94,
"eval_steps_per_second": 0.498,
"step": 60
},
{
"epoch": 0.032837628692550976,
"grad_norm": 2.6347591222570577,
"learning_rate": 9.999809826681898e-06,
"loss": 0.544,
"step": 61
},
{
"epoch": 0.03337595047439607,
"grad_norm": 2.286499156997404,
"learning_rate": 9.999726151185762e-06,
"loss": 0.5387,
"step": 62
},
{
"epoch": 0.03391427225624117,
"grad_norm": 1.8415858956026085,
"learning_rate": 9.999627262564856e-06,
"loss": 0.5148,
"step": 63
},
{
"epoch": 0.034452594038086266,
"grad_norm": 1.6900844200859937,
"learning_rate": 9.999513161120078e-06,
"loss": 0.5291,
"step": 64
},
{
"epoch": 0.03499091581993136,
"grad_norm": 1.7125448582732223,
"learning_rate": 9.999383847198618e-06,
"loss": 0.5535,
"step": 65
},
{
"epoch": 0.03552923760177646,
"grad_norm": 1.9111631206584763,
"learning_rate": 9.999239321193946e-06,
"loss": 0.5146,
"step": 66
},
{
"epoch": 0.03606755938362156,
"grad_norm": 1.5772484080951499,
"learning_rate": 9.999079583545829e-06,
"loss": 0.4713,
"step": 67
},
{
"epoch": 0.03660588116546666,
"grad_norm": 1.8895632782472054,
"learning_rate": 9.998904634740313e-06,
"loss": 0.5802,
"step": 68
},
{
"epoch": 0.03714420294731176,
"grad_norm": 1.7764047564754841,
"learning_rate": 9.998714475309733e-06,
"loss": 0.4893,
"step": 69
},
{
"epoch": 0.037682524729156855,
"grad_norm": 1.6552020383306354,
"learning_rate": 9.9985091058327e-06,
"loss": 0.5265,
"step": 70
},
{
"epoch": 0.03822084651100195,
"grad_norm": 1.6488442266603467,
"learning_rate": 9.998288526934115e-06,
"loss": 0.5231,
"step": 71
},
{
"epoch": 0.03875916829284705,
"grad_norm": 2.563488205094923,
"learning_rate": 9.998052739285151e-06,
"loss": 0.5305,
"step": 72
},
{
"epoch": 0.039297490074692146,
"grad_norm": 1.7898615543554037,
"learning_rate": 9.997801743603264e-06,
"loss": 0.5237,
"step": 73
},
{
"epoch": 0.03983581185653724,
"grad_norm": 1.7633259864675677,
"learning_rate": 9.997535540652177e-06,
"loss": 0.5502,
"step": 74
},
{
"epoch": 0.04037413363838234,
"grad_norm": 1.8121416043404328,
"learning_rate": 9.997254131241893e-06,
"loss": 0.4952,
"step": 75
},
{
"epoch": 0.040912455420227443,
"grad_norm": 1.5652647418073986,
"learning_rate": 9.996957516228682e-06,
"loss": 0.4945,
"step": 76
},
{
"epoch": 0.04145077720207254,
"grad_norm": 2.048844737679617,
"learning_rate": 9.996645696515082e-06,
"loss": 0.5123,
"step": 77
},
{
"epoch": 0.04198909898391764,
"grad_norm": 1.6687520157181732,
"learning_rate": 9.996318673049893e-06,
"loss": 0.5443,
"step": 78
},
{
"epoch": 0.042527420765762734,
"grad_norm": 1.66167477759581,
"learning_rate": 9.995976446828182e-06,
"loss": 0.5029,
"step": 79
},
{
"epoch": 0.04306574254760783,
"grad_norm": 1.5077402156848434,
"learning_rate": 9.99561901889127e-06,
"loss": 0.5197,
"step": 80
},
{
"epoch": 0.04360406432945293,
"grad_norm": 1.8622381731018631,
"learning_rate": 9.995246390326739e-06,
"loss": 0.5048,
"step": 81
},
{
"epoch": 0.044142386111298025,
"grad_norm": 1.6038417564132132,
"learning_rate": 9.994858562268415e-06,
"loss": 0.5779,
"step": 82
},
{
"epoch": 0.04468070789314313,
"grad_norm": 2.2450492036773126,
"learning_rate": 9.994455535896383e-06,
"loss": 0.5407,
"step": 83
},
{
"epoch": 0.045219029674988226,
"grad_norm": 1.7319893085330837,
"learning_rate": 9.994037312436963e-06,
"loss": 0.4857,
"step": 84
},
{
"epoch": 0.04575735145683332,
"grad_norm": 1.6718459312817726,
"learning_rate": 9.99360389316273e-06,
"loss": 0.4815,
"step": 85
},
{
"epoch": 0.04629567323867842,
"grad_norm": 2.7232264171397276,
"learning_rate": 9.993155279392479e-06,
"loss": 0.5877,
"step": 86
},
{
"epoch": 0.04683399502052352,
"grad_norm": 1.9404135244552454,
"learning_rate": 9.992691472491253e-06,
"loss": 0.5062,
"step": 87
},
{
"epoch": 0.047372316802368614,
"grad_norm": 1.9213426547558368,
"learning_rate": 9.99221247387032e-06,
"loss": 0.5188,
"step": 88
},
{
"epoch": 0.04791063858421371,
"grad_norm": 1.5451598644824311,
"learning_rate": 9.991718284987173e-06,
"loss": 0.5397,
"step": 89
},
{
"epoch": 0.048448960366058814,
"grad_norm": 2.5679521016629385,
"learning_rate": 9.991208907345524e-06,
"loss": 0.541,
"step": 90
},
{
"epoch": 0.04898728214790391,
"grad_norm": 2.98985646242629,
"learning_rate": 9.990684342495304e-06,
"loss": 0.4854,
"step": 91
},
{
"epoch": 0.04952560392974901,
"grad_norm": 1.9886055940456542,
"learning_rate": 9.990144592032657e-06,
"loss": 0.5256,
"step": 92
},
{
"epoch": 0.050063925711594105,
"grad_norm": 2.083677922083048,
"learning_rate": 9.989589657599927e-06,
"loss": 0.4859,
"step": 93
},
{
"epoch": 0.0506022474934392,
"grad_norm": 1.5145771411744222,
"learning_rate": 9.989019540885664e-06,
"loss": 0.4744,
"step": 94
},
{
"epoch": 0.0511405692752843,
"grad_norm": 1.655565898472542,
"learning_rate": 9.98843424362462e-06,
"loss": 0.4615,
"step": 95
},
{
"epoch": 0.051678891057129396,
"grad_norm": 1.9814143121579568,
"learning_rate": 9.987833767597726e-06,
"loss": 0.4806,
"step": 96
},
{
"epoch": 0.0522172128389745,
"grad_norm": 1.5166169599719224,
"learning_rate": 9.987218114632109e-06,
"loss": 0.5279,
"step": 97
},
{
"epoch": 0.0527555346208196,
"grad_norm": 1.7338166251896456,
"learning_rate": 9.98658728660107e-06,
"loss": 0.4885,
"step": 98
},
{
"epoch": 0.053293856402664694,
"grad_norm": 2.059909188253357,
"learning_rate": 9.98594128542409e-06,
"loss": 0.4878,
"step": 99
},
{
"epoch": 0.05383217818450979,
"grad_norm": 1.946469408161261,
"learning_rate": 9.985280113066816e-06,
"loss": 0.5423,
"step": 100
},
{
"epoch": 0.05437049996635489,
"grad_norm": 2.2782083747319333,
"learning_rate": 9.984603771541055e-06,
"loss": 0.5132,
"step": 101
},
{
"epoch": 0.054908821748199985,
"grad_norm": 2.057010956887204,
"learning_rate": 9.983912262904775e-06,
"loss": 0.5092,
"step": 102
},
{
"epoch": 0.05544714353004508,
"grad_norm": 1.7498707830077607,
"learning_rate": 9.983205589262093e-06,
"loss": 0.4711,
"step": 103
},
{
"epoch": 0.055985465311890185,
"grad_norm": 2.08857966446578,
"learning_rate": 9.98248375276327e-06,
"loss": 0.5405,
"step": 104
},
{
"epoch": 0.05652378709373528,
"grad_norm": 1.6492587393982439,
"learning_rate": 9.981746755604703e-06,
"loss": 0.5346,
"step": 105
},
{
"epoch": 0.05706210887558038,
"grad_norm": 2.4884932019084203,
"learning_rate": 9.980994600028919e-06,
"loss": 0.4979,
"step": 106
},
{
"epoch": 0.057600430657425476,
"grad_norm": 2.357643749019895,
"learning_rate": 9.980227288324576e-06,
"loss": 0.547,
"step": 107
},
{
"epoch": 0.05813875243927057,
"grad_norm": 1.7013608238808469,
"learning_rate": 9.979444822826438e-06,
"loss": 0.4984,
"step": 108
},
{
"epoch": 0.05867707422111567,
"grad_norm": 1.6424667181868076,
"learning_rate": 9.978647205915386e-06,
"loss": 0.5501,
"step": 109
},
{
"epoch": 0.05921539600296077,
"grad_norm": 1.8701509501400961,
"learning_rate": 9.977834440018406e-06,
"loss": 0.5478,
"step": 110
},
{
"epoch": 0.05975371778480587,
"grad_norm": 1.8496243899167086,
"learning_rate": 9.977006527608569e-06,
"loss": 0.4782,
"step": 111
},
{
"epoch": 0.06029203956665097,
"grad_norm": 1.6878413932010692,
"learning_rate": 9.976163471205045e-06,
"loss": 0.4832,
"step": 112
},
{
"epoch": 0.060830361348496065,
"grad_norm": 1.9099800850936837,
"learning_rate": 9.975305273373075e-06,
"loss": 0.515,
"step": 113
},
{
"epoch": 0.06136868313034116,
"grad_norm": 1.5649119566569916,
"learning_rate": 9.974431936723979e-06,
"loss": 0.4561,
"step": 114
},
{
"epoch": 0.06190700491218626,
"grad_norm": 1.7341754469580601,
"learning_rate": 9.973543463915139e-06,
"loss": 0.5348,
"step": 115
},
{
"epoch": 0.062445326694031356,
"grad_norm": 1.7476560123562952,
"learning_rate": 9.972639857649989e-06,
"loss": 0.5287,
"step": 116
},
{
"epoch": 0.06298364847587645,
"grad_norm": 2.0434137346621624,
"learning_rate": 9.971721120678018e-06,
"loss": 0.5932,
"step": 117
},
{
"epoch": 0.06352197025772155,
"grad_norm": 1.62299849715006,
"learning_rate": 9.97078725579475e-06,
"loss": 0.5077,
"step": 118
},
{
"epoch": 0.06406029203956665,
"grad_norm": 1.7228929187523507,
"learning_rate": 9.969838265841739e-06,
"loss": 0.5859,
"step": 119
},
{
"epoch": 0.06459861382141174,
"grad_norm": 1.6625474372880666,
"learning_rate": 9.968874153706567e-06,
"loss": 0.4655,
"step": 120
},
{
"epoch": 0.06459861382141174,
"eval_loss": 0.5072533488273621,
"eval_runtime": 1577.1777,
"eval_samples_per_second": 15.857,
"eval_steps_per_second": 0.496,
"step": 120
},
{
"epoch": 0.06513693560325684,
"grad_norm": 2.0716206061611486,
"learning_rate": 9.967894922322824e-06,
"loss": 0.539,
"step": 121
},
{
"epoch": 0.06567525738510195,
"grad_norm": 1.6205145916384769,
"learning_rate": 9.96690057467011e-06,
"loss": 0.5478,
"step": 122
},
{
"epoch": 0.06621357916694705,
"grad_norm": 1.587372514164151,
"learning_rate": 9.965891113774015e-06,
"loss": 0.538,
"step": 123
},
{
"epoch": 0.06675190094879214,
"grad_norm": 1.4772510136765666,
"learning_rate": 9.964866542706119e-06,
"loss": 0.5349,
"step": 124
},
{
"epoch": 0.06729022273063724,
"grad_norm": 1.7801746551956565,
"learning_rate": 9.963826864583979e-06,
"loss": 0.4909,
"step": 125
},
{
"epoch": 0.06782854451248234,
"grad_norm": 5.729919312521928,
"learning_rate": 9.962772082571115e-06,
"loss": 0.6005,
"step": 126
},
{
"epoch": 0.06836686629432744,
"grad_norm": 1.6619105967880943,
"learning_rate": 9.961702199877014e-06,
"loss": 0.4715,
"step": 127
},
{
"epoch": 0.06890518807617253,
"grad_norm": 1.5987631874828743,
"learning_rate": 9.960617219757105e-06,
"loss": 0.4807,
"step": 128
},
{
"epoch": 0.06944350985801763,
"grad_norm": 1.625681174655454,
"learning_rate": 9.959517145512754e-06,
"loss": 0.535,
"step": 129
},
{
"epoch": 0.06998183163986273,
"grad_norm": 2.100345459551234,
"learning_rate": 9.958401980491259e-06,
"loss": 0.5264,
"step": 130
},
{
"epoch": 0.07052015342170782,
"grad_norm": 1.7787800977162425,
"learning_rate": 9.957271728085836e-06,
"loss": 0.5171,
"step": 131
},
{
"epoch": 0.07105847520355292,
"grad_norm": 1.6985346393670706,
"learning_rate": 9.956126391735605e-06,
"loss": 0.5016,
"step": 132
},
{
"epoch": 0.07159679698539802,
"grad_norm": 1.3787117088478043,
"learning_rate": 9.954965974925586e-06,
"loss": 0.502,
"step": 133
},
{
"epoch": 0.07213511876724311,
"grad_norm": 1.547259961768447,
"learning_rate": 9.953790481186689e-06,
"loss": 0.5046,
"step": 134
},
{
"epoch": 0.07267344054908821,
"grad_norm": 1.7755359789986371,
"learning_rate": 9.952599914095692e-06,
"loss": 0.5385,
"step": 135
},
{
"epoch": 0.07321176233093332,
"grad_norm": 1.5896819627160363,
"learning_rate": 9.951394277275247e-06,
"loss": 0.4749,
"step": 136
},
{
"epoch": 0.07375008411277842,
"grad_norm": 1.6875256792153286,
"learning_rate": 9.950173574393853e-06,
"loss": 0.4763,
"step": 137
},
{
"epoch": 0.07428840589462352,
"grad_norm": 1.437266797535168,
"learning_rate": 9.948937809165853e-06,
"loss": 0.4833,
"step": 138
},
{
"epoch": 0.07482672767646861,
"grad_norm": 1.7282025114929471,
"learning_rate": 9.947686985351427e-06,
"loss": 0.4767,
"step": 139
},
{
"epoch": 0.07536504945831371,
"grad_norm": 1.8616012721247828,
"learning_rate": 9.946421106756568e-06,
"loss": 0.5093,
"step": 140
},
{
"epoch": 0.0759033712401588,
"grad_norm": 1.8460263465465812,
"learning_rate": 9.94514017723308e-06,
"loss": 0.517,
"step": 141
},
{
"epoch": 0.0764416930220039,
"grad_norm": 2.0057873955643823,
"learning_rate": 9.94384420067857e-06,
"loss": 0.5154,
"step": 142
},
{
"epoch": 0.076980014803849,
"grad_norm": 1.65882505385735,
"learning_rate": 9.94253318103642e-06,
"loss": 0.4701,
"step": 143
},
{
"epoch": 0.0775183365856941,
"grad_norm": 2.3628830084290806,
"learning_rate": 9.941207122295789e-06,
"loss": 0.5405,
"step": 144
},
{
"epoch": 0.0780566583675392,
"grad_norm": 1.6577450103892044,
"learning_rate": 9.9398660284916e-06,
"loss": 0.4927,
"step": 145
},
{
"epoch": 0.07859498014938429,
"grad_norm": 1.4186036899765784,
"learning_rate": 9.938509903704521e-06,
"loss": 0.4898,
"step": 146
},
{
"epoch": 0.07913330193122939,
"grad_norm": 1.544561300695159,
"learning_rate": 9.937138752060958e-06,
"loss": 0.4893,
"step": 147
},
{
"epoch": 0.07967162371307449,
"grad_norm": 2.396784154476515,
"learning_rate": 9.935752577733038e-06,
"loss": 0.5326,
"step": 148
},
{
"epoch": 0.08020994549491958,
"grad_norm": 1.6617814624124967,
"learning_rate": 9.9343513849386e-06,
"loss": 0.5131,
"step": 149
},
{
"epoch": 0.08074826727676468,
"grad_norm": 1.7862849588167096,
"learning_rate": 9.932935177941185e-06,
"loss": 0.571,
"step": 150
},
{
"epoch": 0.08128658905860979,
"grad_norm": 1.4319233814203582,
"learning_rate": 9.931503961050012e-06,
"loss": 0.5017,
"step": 151
},
{
"epoch": 0.08182491084045489,
"grad_norm": 4.306871831666418,
"learning_rate": 9.93005773861998e-06,
"loss": 0.4935,
"step": 152
},
{
"epoch": 0.08236323262229998,
"grad_norm": 2.160758045969246,
"learning_rate": 9.928596515051639e-06,
"loss": 0.4985,
"step": 153
},
{
"epoch": 0.08290155440414508,
"grad_norm": 1.5540015811422117,
"learning_rate": 9.927120294791188e-06,
"loss": 0.4575,
"step": 154
},
{
"epoch": 0.08343987618599018,
"grad_norm": 1.5794711992375656,
"learning_rate": 9.92562908233046e-06,
"loss": 0.5031,
"step": 155
},
{
"epoch": 0.08397819796783527,
"grad_norm": 2.034943473794147,
"learning_rate": 9.9241228822069e-06,
"loss": 0.4829,
"step": 156
},
{
"epoch": 0.08451651974968037,
"grad_norm": 1.878275757652009,
"learning_rate": 9.922601699003567e-06,
"loss": 0.5468,
"step": 157
},
{
"epoch": 0.08505484153152547,
"grad_norm": 1.8197718876914466,
"learning_rate": 9.921065537349097e-06,
"loss": 0.5228,
"step": 158
},
{
"epoch": 0.08559316331337057,
"grad_norm": 1.850901219005824,
"learning_rate": 9.919514401917717e-06,
"loss": 0.4894,
"step": 159
},
{
"epoch": 0.08613148509521566,
"grad_norm": 1.6912529326600465,
"learning_rate": 9.917948297429202e-06,
"loss": 0.4783,
"step": 160
},
{
"epoch": 0.08666980687706076,
"grad_norm": 1.9572290713193328,
"learning_rate": 9.916367228648887e-06,
"loss": 0.4889,
"step": 161
},
{
"epoch": 0.08720812865890586,
"grad_norm": 2.2412763350776497,
"learning_rate": 9.914771200387634e-06,
"loss": 0.5196,
"step": 162
},
{
"epoch": 0.08774645044075095,
"grad_norm": 2.0096075056146527,
"learning_rate": 9.913160217501822e-06,
"loss": 0.5098,
"step": 163
},
{
"epoch": 0.08828477222259605,
"grad_norm": 1.561955725348752,
"learning_rate": 9.911534284893336e-06,
"loss": 0.4993,
"step": 164
},
{
"epoch": 0.08882309400444116,
"grad_norm": 2.2239745440823113,
"learning_rate": 9.909893407509554e-06,
"loss": 0.5189,
"step": 165
},
{
"epoch": 0.08936141578628626,
"grad_norm": 2.1956593936333606,
"learning_rate": 9.90823759034332e-06,
"loss": 0.4956,
"step": 166
},
{
"epoch": 0.08989973756813135,
"grad_norm": 1.7245617400478288,
"learning_rate": 9.906566838432943e-06,
"loss": 0.5076,
"step": 167
},
{
"epoch": 0.09043805934997645,
"grad_norm": 1.6846599680454537,
"learning_rate": 9.904881156862172e-06,
"loss": 0.4546,
"step": 168
},
{
"epoch": 0.09097638113182155,
"grad_norm": 1.713604562000994,
"learning_rate": 9.903180550760184e-06,
"loss": 0.5622,
"step": 169
},
{
"epoch": 0.09151470291366665,
"grad_norm": 1.4559714724478827,
"learning_rate": 9.901465025301571e-06,
"loss": 0.499,
"step": 170
},
{
"epoch": 0.09205302469551174,
"grad_norm": 1.748975091207079,
"learning_rate": 9.899734585706316e-06,
"loss": 0.4823,
"step": 171
},
{
"epoch": 0.09259134647735684,
"grad_norm": 1.6268147978199312,
"learning_rate": 9.89798923723979e-06,
"loss": 0.5452,
"step": 172
},
{
"epoch": 0.09312966825920194,
"grad_norm": 1.7343158101478648,
"learning_rate": 9.896228985212722e-06,
"loss": 0.4359,
"step": 173
},
{
"epoch": 0.09366799004104703,
"grad_norm": 2.07042169826696,
"learning_rate": 9.894453834981194e-06,
"loss": 0.511,
"step": 174
},
{
"epoch": 0.09420631182289213,
"grad_norm": 1.791222622400255,
"learning_rate": 9.892663791946617e-06,
"loss": 0.5451,
"step": 175
},
{
"epoch": 0.09474463360473723,
"grad_norm": 2.20105621306618,
"learning_rate": 9.890858861555719e-06,
"loss": 0.5144,
"step": 176
},
{
"epoch": 0.09528295538658232,
"grad_norm": 1.6902715423027703,
"learning_rate": 9.889039049300526e-06,
"loss": 0.5445,
"step": 177
},
{
"epoch": 0.09582127716842742,
"grad_norm": 1.6384822244675972,
"learning_rate": 9.88720436071835e-06,
"loss": 0.5164,
"step": 178
},
{
"epoch": 0.09635959895027253,
"grad_norm": 1.486764051130488,
"learning_rate": 9.885354801391764e-06,
"loss": 0.478,
"step": 179
},
{
"epoch": 0.09689792073211763,
"grad_norm": 1.701132133672937,
"learning_rate": 9.883490376948593e-06,
"loss": 0.5027,
"step": 180
},
{
"epoch": 0.09689792073211763,
"eval_loss": 0.49806535243988037,
"eval_runtime": 1515.9148,
"eval_samples_per_second": 16.498,
"eval_steps_per_second": 0.516,
"step": 180
},
{
"epoch": 0.09743624251396273,
"grad_norm": 1.9402448136247314,
"learning_rate": 9.881611093061891e-06,
"loss": 0.5127,
"step": 181
},
{
"epoch": 0.09797456429580782,
"grad_norm": 1.7830082860168288,
"learning_rate": 9.879716955449927e-06,
"loss": 0.4977,
"step": 182
},
{
"epoch": 0.09851288607765292,
"grad_norm": 1.8728338162339362,
"learning_rate": 9.877807969876167e-06,
"loss": 0.5303,
"step": 183
},
{
"epoch": 0.09905120785949802,
"grad_norm": 1.9418905923773875,
"learning_rate": 9.875884142149258e-06,
"loss": 0.4924,
"step": 184
},
{
"epoch": 0.09958952964134311,
"grad_norm": 1.7198468996934395,
"learning_rate": 9.873945478123006e-06,
"loss": 0.4753,
"step": 185
},
{
"epoch": 0.10012785142318821,
"grad_norm": 1.9960103116925314,
"learning_rate": 9.87199198369636e-06,
"loss": 0.5277,
"step": 186
},
{
"epoch": 0.10066617320503331,
"grad_norm": 1.627744057918891,
"learning_rate": 9.870023664813399e-06,
"loss": 0.46,
"step": 187
},
{
"epoch": 0.1012044949868784,
"grad_norm": 1.689952574264165,
"learning_rate": 9.868040527463305e-06,
"loss": 0.4994,
"step": 188
},
{
"epoch": 0.1017428167687235,
"grad_norm": 1.5603624594142342,
"learning_rate": 9.866042577680354e-06,
"loss": 0.5304,
"step": 189
},
{
"epoch": 0.1022811385505686,
"grad_norm": 1.748472496778829,
"learning_rate": 9.86402982154389e-06,
"loss": 0.4964,
"step": 190
},
{
"epoch": 0.1028194603324137,
"grad_norm": 1.7431819106596798,
"learning_rate": 9.862002265178308e-06,
"loss": 0.4783,
"step": 191
},
{
"epoch": 0.10335778211425879,
"grad_norm": 1.837418537016329,
"learning_rate": 9.859959914753042e-06,
"loss": 0.4862,
"step": 192
},
{
"epoch": 0.1038961038961039,
"grad_norm": 2.596761998177084,
"learning_rate": 9.857902776482538e-06,
"loss": 0.5261,
"step": 193
},
{
"epoch": 0.104434425677949,
"grad_norm": 1.893467433056967,
"learning_rate": 9.85583085662624e-06,
"loss": 0.5324,
"step": 194
},
{
"epoch": 0.1049727474597941,
"grad_norm": 1.5311561663354358,
"learning_rate": 9.853744161488568e-06,
"loss": 0.4934,
"step": 195
},
{
"epoch": 0.1055110692416392,
"grad_norm": 1.573948338119931,
"learning_rate": 9.851642697418898e-06,
"loss": 0.5137,
"step": 196
},
{
"epoch": 0.10604939102348429,
"grad_norm": 1.7486390517463863,
"learning_rate": 9.84952647081155e-06,
"loss": 0.535,
"step": 197
},
{
"epoch": 0.10658771280532939,
"grad_norm": 1.589021194069147,
"learning_rate": 9.847395488105761e-06,
"loss": 0.443,
"step": 198
},
{
"epoch": 0.10712603458717448,
"grad_norm": 1.9185393015026924,
"learning_rate": 9.845249755785665e-06,
"loss": 0.5281,
"step": 199
},
{
"epoch": 0.10766435636901958,
"grad_norm": 2.3792026849321704,
"learning_rate": 9.84308928038028e-06,
"loss": 0.5031,
"step": 200
},
{
"epoch": 0.10820267815086468,
"grad_norm": 1.9165328926467609,
"learning_rate": 9.840914068463482e-06,
"loss": 0.5557,
"step": 201
},
{
"epoch": 0.10874099993270978,
"grad_norm": 2.5946215311840315,
"learning_rate": 9.838724126653987e-06,
"loss": 0.4922,
"step": 202
},
{
"epoch": 0.10927932171455487,
"grad_norm": 2.13076319151747,
"learning_rate": 9.836519461615331e-06,
"loss": 0.5781,
"step": 203
},
{
"epoch": 0.10981764349639997,
"grad_norm": 1.663228941320188,
"learning_rate": 9.834300080055854e-06,
"loss": 0.484,
"step": 204
},
{
"epoch": 0.11035596527824507,
"grad_norm": 2.225077581890442,
"learning_rate": 9.832065988728667e-06,
"loss": 0.4869,
"step": 205
},
{
"epoch": 0.11089428706009016,
"grad_norm": 1.4816502494413102,
"learning_rate": 9.829817194431646e-06,
"loss": 0.4782,
"step": 206
},
{
"epoch": 0.11143260884193526,
"grad_norm": 1.9584675295393534,
"learning_rate": 9.827553704007403e-06,
"loss": 0.4572,
"step": 207
},
{
"epoch": 0.11197093062378037,
"grad_norm": 1.4348786359320973,
"learning_rate": 9.82527552434327e-06,
"loss": 0.4682,
"step": 208
},
{
"epoch": 0.11250925240562547,
"grad_norm": 1.836643464151516,
"learning_rate": 9.82298266237127e-06,
"loss": 0.475,
"step": 209
},
{
"epoch": 0.11304757418747056,
"grad_norm": 1.6780795457698512,
"learning_rate": 9.820675125068105e-06,
"loss": 0.4903,
"step": 210
},
{
"epoch": 0.11358589596931566,
"grad_norm": 2.0824594091852124,
"learning_rate": 9.818352919455133e-06,
"loss": 0.5396,
"step": 211
},
{
"epoch": 0.11412421775116076,
"grad_norm": 1.7381485522277624,
"learning_rate": 9.816016052598336e-06,
"loss": 0.536,
"step": 212
},
{
"epoch": 0.11466253953300586,
"grad_norm": 1.7730039428627105,
"learning_rate": 9.813664531608319e-06,
"loss": 0.5344,
"step": 213
},
{
"epoch": 0.11520086131485095,
"grad_norm": 1.726577182888005,
"learning_rate": 9.811298363640265e-06,
"loss": 0.4686,
"step": 214
},
{
"epoch": 0.11573918309669605,
"grad_norm": 1.4284226913661735,
"learning_rate": 9.808917555893934e-06,
"loss": 0.417,
"step": 215
},
{
"epoch": 0.11627750487854115,
"grad_norm": 1.8490676859358208,
"learning_rate": 9.806522115613624e-06,
"loss": 0.4734,
"step": 216
},
{
"epoch": 0.11681582666038624,
"grad_norm": 1.9252320315263673,
"learning_rate": 9.804112050088164e-06,
"loss": 0.5216,
"step": 217
},
{
"epoch": 0.11735414844223134,
"grad_norm": 2.039324491259981,
"learning_rate": 9.801687366650882e-06,
"loss": 0.5209,
"step": 218
},
{
"epoch": 0.11789247022407644,
"grad_norm": 2.9773699463269168,
"learning_rate": 9.799248072679581e-06,
"loss": 0.5341,
"step": 219
},
{
"epoch": 0.11843079200592153,
"grad_norm": 2.742476530553411,
"learning_rate": 9.796794175596526e-06,
"loss": 0.5013,
"step": 220
},
{
"epoch": 0.11896911378776663,
"grad_norm": 1.7756468554357536,
"learning_rate": 9.794325682868413e-06,
"loss": 0.4789,
"step": 221
},
{
"epoch": 0.11950743556961174,
"grad_norm": 1.6809704903695406,
"learning_rate": 9.791842602006355e-06,
"loss": 0.4661,
"step": 222
},
{
"epoch": 0.12004575735145684,
"grad_norm": 1.5983552620095136,
"learning_rate": 9.789344940565844e-06,
"loss": 0.4525,
"step": 223
},
{
"epoch": 0.12058407913330194,
"grad_norm": 1.6785718872740183,
"learning_rate": 9.786832706146745e-06,
"loss": 0.5614,
"step": 224
},
{
"epoch": 0.12112240091514703,
"grad_norm": 1.8472396669798028,
"learning_rate": 9.784305906393266e-06,
"loss": 0.5442,
"step": 225
},
{
"epoch": 0.12166072269699213,
"grad_norm": 2.233728320756155,
"learning_rate": 9.781764548993932e-06,
"loss": 0.5065,
"step": 226
},
{
"epoch": 0.12219904447883723,
"grad_norm": 1.7583669595786098,
"learning_rate": 9.77920864168156e-06,
"loss": 0.5031,
"step": 227
},
{
"epoch": 0.12273736626068232,
"grad_norm": 1.856107901761449,
"learning_rate": 9.77663819223325e-06,
"loss": 0.5218,
"step": 228
},
{
"epoch": 0.12327568804252742,
"grad_norm": 1.5999284716572806,
"learning_rate": 9.774053208470338e-06,
"loss": 0.447,
"step": 229
},
{
"epoch": 0.12381400982437252,
"grad_norm": 3.170181526472491,
"learning_rate": 9.771453698258392e-06,
"loss": 0.4549,
"step": 230
},
{
"epoch": 0.12435233160621761,
"grad_norm": 1.7567006972999655,
"learning_rate": 9.768839669507185e-06,
"loss": 0.5203,
"step": 231
},
{
"epoch": 0.12489065338806271,
"grad_norm": 1.6024823185860628,
"learning_rate": 9.766211130170653e-06,
"loss": 0.5035,
"step": 232
},
{
"epoch": 0.1254289751699078,
"grad_norm": 1.9234982966827474,
"learning_rate": 9.7635680882469e-06,
"loss": 0.5742,
"step": 233
},
{
"epoch": 0.1259672969517529,
"grad_norm": 1.526400617412084,
"learning_rate": 9.760910551778149e-06,
"loss": 0.4953,
"step": 234
},
{
"epoch": 0.126505618733598,
"grad_norm": 1.7460568880199783,
"learning_rate": 9.758238528850733e-06,
"loss": 0.4705,
"step": 235
},
{
"epoch": 0.1270439405154431,
"grad_norm": 5.681983754980635,
"learning_rate": 9.755552027595055e-06,
"loss": 0.5499,
"step": 236
},
{
"epoch": 0.1275822622972882,
"grad_norm": 1.9059517301514561,
"learning_rate": 9.752851056185583e-06,
"loss": 0.5016,
"step": 237
},
{
"epoch": 0.1281205840791333,
"grad_norm": 2.032081768465102,
"learning_rate": 9.750135622840811e-06,
"loss": 0.4761,
"step": 238
},
{
"epoch": 0.1286589058609784,
"grad_norm": 2.044888486278771,
"learning_rate": 9.747405735823232e-06,
"loss": 0.535,
"step": 239
},
{
"epoch": 0.1291972276428235,
"grad_norm": 1.7814262228625417,
"learning_rate": 9.744661403439328e-06,
"loss": 0.5524,
"step": 240
},
{
"epoch": 0.1291972276428235,
"eval_loss": 0.4923091232776642,
"eval_runtime": 1516.8995,
"eval_samples_per_second": 16.488,
"eval_steps_per_second": 0.516,
"step": 240
},
{
"epoch": 0.12973554942466858,
"grad_norm": 3.1298270206538,
"learning_rate": 9.74190263403953e-06,
"loss": 0.4938,
"step": 241
},
{
"epoch": 0.13027387120651368,
"grad_norm": 1.4984946811035116,
"learning_rate": 9.739129436018193e-06,
"loss": 0.4417,
"step": 242
},
{
"epoch": 0.1308121929883588,
"grad_norm": 1.364613667269671,
"learning_rate": 9.736341817813586e-06,
"loss": 0.4698,
"step": 243
},
{
"epoch": 0.1313505147702039,
"grad_norm": 1.4558332152005662,
"learning_rate": 9.733539787907851e-06,
"loss": 0.51,
"step": 244
},
{
"epoch": 0.131888836552049,
"grad_norm": 1.605378069117634,
"learning_rate": 9.730723354826978e-06,
"loss": 0.4502,
"step": 245
},
{
"epoch": 0.1324271583338941,
"grad_norm": 1.6741314580897366,
"learning_rate": 9.727892527140787e-06,
"loss": 0.4445,
"step": 246
},
{
"epoch": 0.1329654801157392,
"grad_norm": 2.306950410094544,
"learning_rate": 9.725047313462897e-06,
"loss": 0.541,
"step": 247
},
{
"epoch": 0.1335038018975843,
"grad_norm": 2.110791301537649,
"learning_rate": 9.722187722450699e-06,
"loss": 0.5105,
"step": 248
},
{
"epoch": 0.1340421236794294,
"grad_norm": 1.8250944708952,
"learning_rate": 9.719313762805334e-06,
"loss": 0.5233,
"step": 249
},
{
"epoch": 0.13458044546127448,
"grad_norm": 1.5279014760068415,
"learning_rate": 9.716425443271663e-06,
"loss": 0.4978,
"step": 250
},
{
"epoch": 0.13511876724311958,
"grad_norm": 1.6155139379634116,
"learning_rate": 9.713522772638238e-06,
"loss": 0.489,
"step": 251
},
{
"epoch": 0.13565708902496468,
"grad_norm": 1.7541916143762504,
"learning_rate": 9.710605759737281e-06,
"loss": 0.5058,
"step": 252
},
{
"epoch": 0.13619541080680977,
"grad_norm": 2.0770411769433914,
"learning_rate": 9.707674413444658e-06,
"loss": 0.4765,
"step": 253
},
{
"epoch": 0.13673373258865487,
"grad_norm": 2.20017292136363,
"learning_rate": 9.70472874267984e-06,
"loss": 0.5073,
"step": 254
},
{
"epoch": 0.13727205437049997,
"grad_norm": 2.5155355882755495,
"learning_rate": 9.701768756405894e-06,
"loss": 0.5271,
"step": 255
},
{
"epoch": 0.13781037615234507,
"grad_norm": 1.6203966463313373,
"learning_rate": 9.698794463629438e-06,
"loss": 0.5328,
"step": 256
},
{
"epoch": 0.13834869793419016,
"grad_norm": 1.776204296227151,
"learning_rate": 9.695805873400627e-06,
"loss": 0.4975,
"step": 257
},
{
"epoch": 0.13888701971603526,
"grad_norm": 1.817996887986963,
"learning_rate": 9.692802994813117e-06,
"loss": 0.5076,
"step": 258
},
{
"epoch": 0.13942534149788036,
"grad_norm": 1.5387316388819356,
"learning_rate": 9.68978583700404e-06,
"loss": 0.4783,
"step": 259
},
{
"epoch": 0.13996366327972545,
"grad_norm": 1.4525191587799346,
"learning_rate": 9.686754409153984e-06,
"loss": 0.4541,
"step": 260
},
{
"epoch": 0.14050198506157055,
"grad_norm": 2.5072786042500286,
"learning_rate": 9.683708720486947e-06,
"loss": 0.4321,
"step": 261
},
{
"epoch": 0.14104030684341565,
"grad_norm": 1.928234336171056,
"learning_rate": 9.680648780270327e-06,
"loss": 0.5026,
"step": 262
},
{
"epoch": 0.14157862862526074,
"grad_norm": 1.9095002820990152,
"learning_rate": 9.677574597814884e-06,
"loss": 0.5048,
"step": 263
},
{
"epoch": 0.14211695040710584,
"grad_norm": 2.7537047870453777,
"learning_rate": 9.674486182474716e-06,
"loss": 0.5202,
"step": 264
},
{
"epoch": 0.14265527218895094,
"grad_norm": 1.5411698281683408,
"learning_rate": 9.671383543647225e-06,
"loss": 0.473,
"step": 265
},
{
"epoch": 0.14319359397079603,
"grad_norm": 1.6351867542673815,
"learning_rate": 9.668266690773094e-06,
"loss": 0.4734,
"step": 266
},
{
"epoch": 0.14373191575264113,
"grad_norm": 1.8884810300636565,
"learning_rate": 9.66513563333626e-06,
"loss": 0.5014,
"step": 267
},
{
"epoch": 0.14427023753448623,
"grad_norm": 1.6743904016832571,
"learning_rate": 9.661990380863876e-06,
"loss": 0.4782,
"step": 268
},
{
"epoch": 0.14480855931633133,
"grad_norm": 1.9090758165263444,
"learning_rate": 9.658830942926291e-06,
"loss": 0.5003,
"step": 269
},
{
"epoch": 0.14534688109817642,
"grad_norm": 1.4937405913115736,
"learning_rate": 9.655657329137015e-06,
"loss": 0.4432,
"step": 270
},
{
"epoch": 0.14588520288002152,
"grad_norm": 1.9026943182309153,
"learning_rate": 9.652469549152695e-06,
"loss": 0.529,
"step": 271
},
{
"epoch": 0.14642352466186664,
"grad_norm": 1.8186943886881364,
"learning_rate": 9.649267612673079e-06,
"loss": 0.4737,
"step": 272
},
{
"epoch": 0.14696184644371174,
"grad_norm": 1.8259823260308685,
"learning_rate": 9.646051529440993e-06,
"loss": 0.4985,
"step": 273
},
{
"epoch": 0.14750016822555684,
"grad_norm": 1.9385932273349529,
"learning_rate": 9.64282130924231e-06,
"loss": 0.4838,
"step": 274
},
{
"epoch": 0.14803849000740193,
"grad_norm": 2.04013899262351,
"learning_rate": 9.639576961905915e-06,
"loss": 0.5434,
"step": 275
},
{
"epoch": 0.14857681178924703,
"grad_norm": 1.4822512590060632,
"learning_rate": 9.636318497303679e-06,
"loss": 0.5105,
"step": 276
},
{
"epoch": 0.14911513357109213,
"grad_norm": 1.580055299090581,
"learning_rate": 9.633045925350436e-06,
"loss": 0.5236,
"step": 277
},
{
"epoch": 0.14965345535293723,
"grad_norm": 1.947058506268201,
"learning_rate": 9.629759256003936e-06,
"loss": 0.517,
"step": 278
},
{
"epoch": 0.15019177713478232,
"grad_norm": 2.09097300966892,
"learning_rate": 9.626458499264833e-06,
"loss": 0.4795,
"step": 279
},
{
"epoch": 0.15073009891662742,
"grad_norm": 1.9281815370039999,
"learning_rate": 9.623143665176636e-06,
"loss": 0.5091,
"step": 280
},
{
"epoch": 0.15126842069847252,
"grad_norm": 1.8942765435710498,
"learning_rate": 9.6198147638257e-06,
"loss": 0.486,
"step": 281
},
{
"epoch": 0.1518067424803176,
"grad_norm": 1.5680877122601742,
"learning_rate": 9.616471805341175e-06,
"loss": 0.5756,
"step": 282
},
{
"epoch": 0.1523450642621627,
"grad_norm": 1.8187589637332664,
"learning_rate": 9.613114799894989e-06,
"loss": 0.4848,
"step": 283
},
{
"epoch": 0.1528833860440078,
"grad_norm": 2.845269186548161,
"learning_rate": 9.609743757701806e-06,
"loss": 0.5196,
"step": 284
},
{
"epoch": 0.1534217078258529,
"grad_norm": 1.6573799451128552,
"learning_rate": 9.60635868901901e-06,
"loss": 0.5256,
"step": 285
},
{
"epoch": 0.153960029607698,
"grad_norm": 1.403409672767778,
"learning_rate": 9.602959604146658e-06,
"loss": 0.4591,
"step": 286
},
{
"epoch": 0.1544983513895431,
"grad_norm": 1.5756224710697608,
"learning_rate": 9.599546513427455e-06,
"loss": 0.4499,
"step": 287
},
{
"epoch": 0.1550366731713882,
"grad_norm": 1.8561161081867996,
"learning_rate": 9.596119427246727e-06,
"loss": 0.514,
"step": 288
},
{
"epoch": 0.1555749949532333,
"grad_norm": 1.6430886050709819,
"learning_rate": 9.592678356032382e-06,
"loss": 0.4916,
"step": 289
},
{
"epoch": 0.1561133167350784,
"grad_norm": 1.5608831001537813,
"learning_rate": 9.589223310254881e-06,
"loss": 0.4845,
"step": 290
},
{
"epoch": 0.15665163851692349,
"grad_norm": 2.041472319934021,
"learning_rate": 9.58575430042721e-06,
"loss": 0.5105,
"step": 291
},
{
"epoch": 0.15718996029876858,
"grad_norm": 1.879252835980779,
"learning_rate": 9.582271337104844e-06,
"loss": 0.5254,
"step": 292
},
{
"epoch": 0.15772828208061368,
"grad_norm": 1.7353738362985391,
"learning_rate": 9.578774430885714e-06,
"loss": 0.545,
"step": 293
},
{
"epoch": 0.15826660386245878,
"grad_norm": 1.6167983704567415,
"learning_rate": 9.575263592410176e-06,
"loss": 0.484,
"step": 294
},
{
"epoch": 0.15880492564430387,
"grad_norm": 1.6983057165346465,
"learning_rate": 9.571738832360979e-06,
"loss": 0.5001,
"step": 295
},
{
"epoch": 0.15934324742614897,
"grad_norm": 2.081190213763369,
"learning_rate": 9.568200161463237e-06,
"loss": 0.4722,
"step": 296
},
{
"epoch": 0.15988156920799407,
"grad_norm": 2.246655796617688,
"learning_rate": 9.564647590484384e-06,
"loss": 0.5171,
"step": 297
},
{
"epoch": 0.16041989098983916,
"grad_norm": 1.4481263563444773,
"learning_rate": 9.561081130234155e-06,
"loss": 0.471,
"step": 298
},
{
"epoch": 0.16095821277168426,
"grad_norm": 1.6254902571476582,
"learning_rate": 9.557500791564545e-06,
"loss": 0.4709,
"step": 299
},
{
"epoch": 0.16149653455352936,
"grad_norm": 1.6522030181707457,
"learning_rate": 9.55390658536978e-06,
"loss": 0.4314,
"step": 300
},
{
"epoch": 0.16149653455352936,
"eval_loss": 0.48600396513938904,
"eval_runtime": 1525.5556,
"eval_samples_per_second": 16.394,
"eval_steps_per_second": 0.513,
"step": 300
},
{
"epoch": 0.16203485633537448,
"grad_norm": 1.6735119675316397,
"learning_rate": 9.550298522586277e-06,
"loss": 0.4981,
"step": 301
},
{
"epoch": 0.16257317811721958,
"grad_norm": 1.7492206784400102,
"learning_rate": 9.546676614192623e-06,
"loss": 0.5166,
"step": 302
},
{
"epoch": 0.16311149989906468,
"grad_norm": 1.8716369675908593,
"learning_rate": 9.543040871209528e-06,
"loss": 0.4587,
"step": 303
},
{
"epoch": 0.16364982168090977,
"grad_norm": 1.5260344735318792,
"learning_rate": 9.5393913046998e-06,
"loss": 0.4637,
"step": 304
},
{
"epoch": 0.16418814346275487,
"grad_norm": 1.9514934425079693,
"learning_rate": 9.535727925768312e-06,
"loss": 0.5018,
"step": 305
},
{
"epoch": 0.16472646524459997,
"grad_norm": 1.9239888955973004,
"learning_rate": 9.53205074556196e-06,
"loss": 0.5156,
"step": 306
},
{
"epoch": 0.16526478702644506,
"grad_norm": 1.4397611201745624,
"learning_rate": 9.528359775269637e-06,
"loss": 0.4876,
"step": 307
},
{
"epoch": 0.16580310880829016,
"grad_norm": 1.6314792528136741,
"learning_rate": 9.524655026122199e-06,
"loss": 0.4466,
"step": 308
},
{
"epoch": 0.16634143059013526,
"grad_norm": 1.7046994741333183,
"learning_rate": 9.520936509392425e-06,
"loss": 0.5137,
"step": 309
},
{
"epoch": 0.16687975237198036,
"grad_norm": 1.6773498230286716,
"learning_rate": 9.517204236394983e-06,
"loss": 0.4857,
"step": 310
},
{
"epoch": 0.16741807415382545,
"grad_norm": 1.9407453364887826,
"learning_rate": 9.513458218486404e-06,
"loss": 0.569,
"step": 311
},
{
"epoch": 0.16795639593567055,
"grad_norm": 2.3596815310352355,
"learning_rate": 9.509698467065042e-06,
"loss": 0.4823,
"step": 312
},
{
"epoch": 0.16849471771751565,
"grad_norm": 1.491461623274511,
"learning_rate": 9.505924993571037e-06,
"loss": 0.4814,
"step": 313
},
{
"epoch": 0.16903303949936074,
"grad_norm": 1.755984194501031,
"learning_rate": 9.502137809486277e-06,
"loss": 0.4953,
"step": 314
},
{
"epoch": 0.16957136128120584,
"grad_norm": 1.4330639099631888,
"learning_rate": 9.49833692633438e-06,
"loss": 0.4566,
"step": 315
},
{
"epoch": 0.17010968306305094,
"grad_norm": 2.8224430252996413,
"learning_rate": 9.49452235568064e-06,
"loss": 0.5356,
"step": 316
},
{
"epoch": 0.17064800484489603,
"grad_norm": 1.6038158256481398,
"learning_rate": 9.490694109131997e-06,
"loss": 0.4667,
"step": 317
},
{
"epoch": 0.17118632662674113,
"grad_norm": 1.5264996881581228,
"learning_rate": 9.486852198337013e-06,
"loss": 0.5066,
"step": 318
},
{
"epoch": 0.17172464840858623,
"grad_norm": 2.1960133726792987,
"learning_rate": 9.482996634985818e-06,
"loss": 0.51,
"step": 319
},
{
"epoch": 0.17226297019043132,
"grad_norm": 1.8025162435130595,
"learning_rate": 9.479127430810087e-06,
"loss": 0.4542,
"step": 320
},
{
"epoch": 0.17280129197227642,
"grad_norm": 1.573351382907097,
"learning_rate": 9.475244597583007e-06,
"loss": 0.4932,
"step": 321
},
{
"epoch": 0.17333961375412152,
"grad_norm": 1.8667569419712537,
"learning_rate": 9.471348147119226e-06,
"loss": 0.5095,
"step": 322
},
{
"epoch": 0.17387793553596662,
"grad_norm": 1.7668055772396445,
"learning_rate": 9.467438091274831e-06,
"loss": 0.5407,
"step": 323
},
{
"epoch": 0.1744162573178117,
"grad_norm": 1.8953472452582216,
"learning_rate": 9.46351444194731e-06,
"loss": 0.5128,
"step": 324
},
{
"epoch": 0.1749545790996568,
"grad_norm": 1.4178882398027213,
"learning_rate": 9.459577211075505e-06,
"loss": 0.4783,
"step": 325
},
{
"epoch": 0.1754929008815019,
"grad_norm": 2.0556054399757833,
"learning_rate": 9.455626410639595e-06,
"loss": 0.4883,
"step": 326
},
{
"epoch": 0.176031222663347,
"grad_norm": 1.7326020245251583,
"learning_rate": 9.451662052661042e-06,
"loss": 0.5118,
"step": 327
},
{
"epoch": 0.1765695444451921,
"grad_norm": 4.171939008569256,
"learning_rate": 9.447684149202555e-06,
"loss": 0.5034,
"step": 328
},
{
"epoch": 0.17710786622703723,
"grad_norm": 1.4094510294695572,
"learning_rate": 9.44369271236807e-06,
"loss": 0.485,
"step": 329
},
{
"epoch": 0.17764618800888232,
"grad_norm": 1.7412556596004685,
"learning_rate": 9.4396877543027e-06,
"loss": 0.5202,
"step": 330
},
{
"epoch": 0.17818450979072742,
"grad_norm": 2.605859372043168,
"learning_rate": 9.435669287192691e-06,
"loss": 0.4685,
"step": 331
},
{
"epoch": 0.17872283157257252,
"grad_norm": 1.751047130574041,
"learning_rate": 9.431637323265406e-06,
"loss": 0.5435,
"step": 332
},
{
"epoch": 0.1792611533544176,
"grad_norm": 1.6979113314955865,
"learning_rate": 9.42759187478927e-06,
"loss": 0.5082,
"step": 333
},
{
"epoch": 0.1797994751362627,
"grad_norm": 1.655193667961951,
"learning_rate": 9.423532954073737e-06,
"loss": 0.52,
"step": 334
},
{
"epoch": 0.1803377969181078,
"grad_norm": 1.715183078111553,
"learning_rate": 9.419460573469262e-06,
"loss": 0.4876,
"step": 335
},
{
"epoch": 0.1808761186999529,
"grad_norm": 1.755206515543788,
"learning_rate": 9.415374745367245e-06,
"loss": 0.4826,
"step": 336
},
{
"epoch": 0.181414440481798,
"grad_norm": 1.530238277234238,
"learning_rate": 9.411275482200015e-06,
"loss": 0.5227,
"step": 337
},
{
"epoch": 0.1819527622636431,
"grad_norm": 1.4873212835334444,
"learning_rate": 9.40716279644077e-06,
"loss": 0.4784,
"step": 338
},
{
"epoch": 0.1824910840454882,
"grad_norm": 1.4713841358562554,
"learning_rate": 9.403036700603561e-06,
"loss": 0.4872,
"step": 339
},
{
"epoch": 0.1830294058273333,
"grad_norm": 1.5551919063027968,
"learning_rate": 9.398897207243232e-06,
"loss": 0.4817,
"step": 340
},
{
"epoch": 0.1835677276091784,
"grad_norm": 1.8717050820441055,
"learning_rate": 9.394744328955403e-06,
"loss": 0.5002,
"step": 341
},
{
"epoch": 0.18410604939102349,
"grad_norm": 1.9843100820794195,
"learning_rate": 9.390578078376417e-06,
"loss": 0.4799,
"step": 342
},
{
"epoch": 0.18464437117286858,
"grad_norm": 2.156998251608843,
"learning_rate": 9.386398468183304e-06,
"loss": 0.4469,
"step": 343
},
{
"epoch": 0.18518269295471368,
"grad_norm": 1.7123477834586953,
"learning_rate": 9.38220551109375e-06,
"loss": 0.5312,
"step": 344
},
{
"epoch": 0.18572101473655878,
"grad_norm": 1.862901860663747,
"learning_rate": 9.377999219866046e-06,
"loss": 0.5146,
"step": 345
},
{
"epoch": 0.18625933651840387,
"grad_norm": 1.8400145206055536,
"learning_rate": 9.373779607299061e-06,
"loss": 0.498,
"step": 346
},
{
"epoch": 0.18679765830024897,
"grad_norm": 1.4419967374301528,
"learning_rate": 9.369546686232199e-06,
"loss": 0.491,
"step": 347
},
{
"epoch": 0.18733598008209407,
"grad_norm": 1.6800971553110484,
"learning_rate": 9.365300469545352e-06,
"loss": 0.453,
"step": 348
},
{
"epoch": 0.18787430186393916,
"grad_norm": 1.4414646625492236,
"learning_rate": 9.361040970158876e-06,
"loss": 0.4844,
"step": 349
},
{
"epoch": 0.18841262364578426,
"grad_norm": 1.4693828151901231,
"learning_rate": 9.356768201033542e-06,
"loss": 0.4846,
"step": 350
},
{
"epoch": 0.18895094542762936,
"grad_norm": 1.6213301090422854,
"learning_rate": 9.35248217517049e-06,
"loss": 0.4528,
"step": 351
},
{
"epoch": 0.18948926720947445,
"grad_norm": 1.3998204036117714,
"learning_rate": 9.348182905611209e-06,
"loss": 0.4677,
"step": 352
},
{
"epoch": 0.19002758899131955,
"grad_norm": 1.4713366703366633,
"learning_rate": 9.343870405437477e-06,
"loss": 0.4292,
"step": 353
},
{
"epoch": 0.19056591077316465,
"grad_norm": 1.941068700941172,
"learning_rate": 9.339544687771334e-06,
"loss": 0.5102,
"step": 354
},
{
"epoch": 0.19110423255500975,
"grad_norm": 1.828849112653357,
"learning_rate": 9.335205765775039e-06,
"loss": 0.4638,
"step": 355
},
{
"epoch": 0.19164255433685484,
"grad_norm": 1.6885129161638754,
"learning_rate": 9.330853652651026e-06,
"loss": 0.4391,
"step": 356
},
{
"epoch": 0.19218087611869994,
"grad_norm": 1.7268115477491062,
"learning_rate": 9.326488361641867e-06,
"loss": 0.4557,
"step": 357
},
{
"epoch": 0.19271919790054506,
"grad_norm": 1.369390489248521,
"learning_rate": 9.322109906030237e-06,
"loss": 0.4451,
"step": 358
},
{
"epoch": 0.19325751968239016,
"grad_norm": 1.653269795096283,
"learning_rate": 9.31771829913886e-06,
"loss": 0.4466,
"step": 359
},
{
"epoch": 0.19379584146423526,
"grad_norm": 1.6015504141518857,
"learning_rate": 9.313313554330484e-06,
"loss": 0.4977,
"step": 360
},
{
"epoch": 0.19379584146423526,
"eval_loss": 0.4812440574169159,
"eval_runtime": 1528.9254,
"eval_samples_per_second": 16.358,
"eval_steps_per_second": 0.511,
"step": 360
},
{
"epoch": 0.19433416324608035,
"grad_norm": 1.6899547102686612,
"learning_rate": 9.308895685007824e-06,
"loss": 0.5404,
"step": 361
},
{
"epoch": 0.19487248502792545,
"grad_norm": 1.8153441873291498,
"learning_rate": 9.304464704613541e-06,
"loss": 0.5128,
"step": 362
},
{
"epoch": 0.19541080680977055,
"grad_norm": 1.6094259149494354,
"learning_rate": 9.300020626630184e-06,
"loss": 0.4854,
"step": 363
},
{
"epoch": 0.19594912859161565,
"grad_norm": 1.726004590201776,
"learning_rate": 9.295563464580153e-06,
"loss": 0.4827,
"step": 364
},
{
"epoch": 0.19648745037346074,
"grad_norm": 1.7917006550897865,
"learning_rate": 9.29109323202567e-06,
"loss": 0.4689,
"step": 365
},
{
"epoch": 0.19702577215530584,
"grad_norm": 2.067420755566304,
"learning_rate": 9.286609942568712e-06,
"loss": 0.4411,
"step": 366
},
{
"epoch": 0.19756409393715094,
"grad_norm": 1.9439738397276571,
"learning_rate": 9.282113609851002e-06,
"loss": 0.4748,
"step": 367
},
{
"epoch": 0.19810241571899603,
"grad_norm": 1.6206588657538272,
"learning_rate": 9.277604247553939e-06,
"loss": 0.5215,
"step": 368
},
{
"epoch": 0.19864073750084113,
"grad_norm": 2.0968303117516136,
"learning_rate": 9.273081869398577e-06,
"loss": 0.4466,
"step": 369
},
{
"epoch": 0.19917905928268623,
"grad_norm": 1.5483077144548956,
"learning_rate": 9.268546489145566e-06,
"loss": 0.5042,
"step": 370
},
{
"epoch": 0.19971738106453132,
"grad_norm": 1.6430391903483688,
"learning_rate": 9.263998120595124e-06,
"loss": 0.4798,
"step": 371
},
{
"epoch": 0.20025570284637642,
"grad_norm": 1.451263876582638,
"learning_rate": 9.259436777586991e-06,
"loss": 0.4498,
"step": 372
},
{
"epoch": 0.20079402462822152,
"grad_norm": 1.924895097651951,
"learning_rate": 9.25486247400038e-06,
"loss": 0.4971,
"step": 373
},
{
"epoch": 0.20133234641006661,
"grad_norm": 1.5044716731151997,
"learning_rate": 9.250275223753948e-06,
"loss": 0.4761,
"step": 374
},
{
"epoch": 0.2018706681919117,
"grad_norm": 1.8105401635317677,
"learning_rate": 9.245675040805738e-06,
"loss": 0.4645,
"step": 375
},
{
"epoch": 0.2024089899737568,
"grad_norm": 1.4400001043179194,
"learning_rate": 9.241061939153146e-06,
"loss": 0.5052,
"step": 376
},
{
"epoch": 0.2029473117556019,
"grad_norm": 2.1898160128283046,
"learning_rate": 9.236435932832883e-06,
"loss": 0.4571,
"step": 377
},
{
"epoch": 0.203485633537447,
"grad_norm": 1.728102995146478,
"learning_rate": 9.231797035920921e-06,
"loss": 0.459,
"step": 378
},
{
"epoch": 0.2040239553192921,
"grad_norm": 1.5484346370702677,
"learning_rate": 9.227145262532458e-06,
"loss": 0.5106,
"step": 379
},
{
"epoch": 0.2045622771011372,
"grad_norm": 1.5623742217769747,
"learning_rate": 9.222480626821868e-06,
"loss": 0.444,
"step": 380
},
{
"epoch": 0.2051005988829823,
"grad_norm": 1.7091436440987169,
"learning_rate": 9.217803142982668e-06,
"loss": 0.4732,
"step": 381
},
{
"epoch": 0.2056389206648274,
"grad_norm": 1.4196906974845203,
"learning_rate": 9.213112825247466e-06,
"loss": 0.4779,
"step": 382
},
{
"epoch": 0.2061772424466725,
"grad_norm": 1.5167704426292719,
"learning_rate": 9.20840968788792e-06,
"loss": 0.4967,
"step": 383
},
{
"epoch": 0.20671556422851758,
"grad_norm": 1.4170871947038493,
"learning_rate": 9.203693745214698e-06,
"loss": 0.491,
"step": 384
},
{
"epoch": 0.20725388601036268,
"grad_norm": 1.5152939794668674,
"learning_rate": 9.19896501157743e-06,
"loss": 0.4541,
"step": 385
},
{
"epoch": 0.2077922077922078,
"grad_norm": 1.9536536833455793,
"learning_rate": 9.19422350136467e-06,
"loss": 0.4799,
"step": 386
},
{
"epoch": 0.2083305295740529,
"grad_norm": 2.316326510948496,
"learning_rate": 9.18946922900384e-06,
"loss": 0.4658,
"step": 387
},
{
"epoch": 0.208868851355898,
"grad_norm": 1.2922243986398827,
"learning_rate": 9.184702208961204e-06,
"loss": 0.4057,
"step": 388
},
{
"epoch": 0.2094071731377431,
"grad_norm": 1.8303479595554093,
"learning_rate": 9.179922455741812e-06,
"loss": 0.4427,
"step": 389
},
{
"epoch": 0.2099454949195882,
"grad_norm": 1.541720900007236,
"learning_rate": 9.175129983889452e-06,
"loss": 0.516,
"step": 390
},
{
"epoch": 0.2104838167014333,
"grad_norm": 1.9307101459341938,
"learning_rate": 9.17032480798662e-06,
"loss": 0.4349,
"step": 391
},
{
"epoch": 0.2110221384832784,
"grad_norm": 1.3922182421272982,
"learning_rate": 9.165506942654468e-06,
"loss": 0.4816,
"step": 392
},
{
"epoch": 0.21156046026512348,
"grad_norm": 1.6974151932118977,
"learning_rate": 9.16067640255275e-06,
"loss": 0.4812,
"step": 393
},
{
"epoch": 0.21209878204696858,
"grad_norm": 1.4726854167474133,
"learning_rate": 9.155833202379798e-06,
"loss": 0.4717,
"step": 394
},
{
"epoch": 0.21263710382881368,
"grad_norm": 1.8790922445419658,
"learning_rate": 9.150977356872456e-06,
"loss": 0.4885,
"step": 395
},
{
"epoch": 0.21317542561065878,
"grad_norm": 1.9084443087840661,
"learning_rate": 9.146108880806056e-06,
"loss": 0.4633,
"step": 396
},
{
"epoch": 0.21371374739250387,
"grad_norm": 1.6996601490386696,
"learning_rate": 9.141227788994348e-06,
"loss": 0.4453,
"step": 397
},
{
"epoch": 0.21425206917434897,
"grad_norm": 1.7127514086857762,
"learning_rate": 9.136334096289485e-06,
"loss": 0.5144,
"step": 398
},
{
"epoch": 0.21479039095619407,
"grad_norm": 1.4183339048304517,
"learning_rate": 9.131427817581953e-06,
"loss": 0.476,
"step": 399
},
{
"epoch": 0.21532871273803916,
"grad_norm": 1.5688801517253075,
"learning_rate": 9.12650896780053e-06,
"loss": 0.4657,
"step": 400
},
{
"epoch": 0.21586703451988426,
"grad_norm": 1.391080609496865,
"learning_rate": 9.121577561912256e-06,
"loss": 0.5043,
"step": 401
},
{
"epoch": 0.21640535630172936,
"grad_norm": 3.302547702490585,
"learning_rate": 9.11663361492237e-06,
"loss": 0.497,
"step": 402
},
{
"epoch": 0.21694367808357445,
"grad_norm": 1.7874988296563226,
"learning_rate": 9.111677141874273e-06,
"loss": 0.4465,
"step": 403
},
{
"epoch": 0.21748199986541955,
"grad_norm": 1.830004021479594,
"learning_rate": 9.106708157849478e-06,
"loss": 0.5088,
"step": 404
},
{
"epoch": 0.21802032164726465,
"grad_norm": 2.4236747379642267,
"learning_rate": 9.101726677967569e-06,
"loss": 0.4922,
"step": 405
},
{
"epoch": 0.21855864342910974,
"grad_norm": 1.5488577176317244,
"learning_rate": 9.096732717386152e-06,
"loss": 0.497,
"step": 406
},
{
"epoch": 0.21909696521095484,
"grad_norm": 2.3263014189367306,
"learning_rate": 9.091726291300806e-06,
"loss": 0.4791,
"step": 407
},
{
"epoch": 0.21963528699279994,
"grad_norm": 1.7243223143837634,
"learning_rate": 9.086707414945044e-06,
"loss": 0.5192,
"step": 408
},
{
"epoch": 0.22017360877464504,
"grad_norm": 1.3667216442420331,
"learning_rate": 9.08167610359026e-06,
"loss": 0.4816,
"step": 409
},
{
"epoch": 0.22071193055649013,
"grad_norm": 1.4675898960533509,
"learning_rate": 9.076632372545688e-06,
"loss": 0.4694,
"step": 410
},
{
"epoch": 0.22125025233833523,
"grad_norm": 1.725309532729321,
"learning_rate": 9.071576237158348e-06,
"loss": 0.5097,
"step": 411
},
{
"epoch": 0.22178857412018033,
"grad_norm": 1.48659542538949,
"learning_rate": 9.066507712813009e-06,
"loss": 0.445,
"step": 412
},
{
"epoch": 0.22232689590202542,
"grad_norm": 1.6287270540094485,
"learning_rate": 9.06142681493213e-06,
"loss": 0.4948,
"step": 413
},
{
"epoch": 0.22286521768387052,
"grad_norm": 1.5275233090165254,
"learning_rate": 9.056333558975828e-06,
"loss": 0.4556,
"step": 414
},
{
"epoch": 0.22340353946571564,
"grad_norm": 1.6620168630066545,
"learning_rate": 9.051227960441819e-06,
"loss": 0.4652,
"step": 415
},
{
"epoch": 0.22394186124756074,
"grad_norm": 2.059601149156459,
"learning_rate": 9.046110034865374e-06,
"loss": 0.5085,
"step": 416
},
{
"epoch": 0.22448018302940584,
"grad_norm": 1.762324556385875,
"learning_rate": 9.040979797819275e-06,
"loss": 0.4461,
"step": 417
},
{
"epoch": 0.22501850481125094,
"grad_norm": 1.7567357923246754,
"learning_rate": 9.035837264913764e-06,
"loss": 0.4732,
"step": 418
},
{
"epoch": 0.22555682659309603,
"grad_norm": 1.6696886078675257,
"learning_rate": 9.030682451796497e-06,
"loss": 0.4642,
"step": 419
},
{
"epoch": 0.22609514837494113,
"grad_norm": 1.8175306322549967,
"learning_rate": 9.025515374152498e-06,
"loss": 0.4613,
"step": 420
},
{
"epoch": 0.22609514837494113,
"eval_loss": 0.4776149392127991,
"eval_runtime": 1533.2316,
"eval_samples_per_second": 16.312,
"eval_steps_per_second": 0.51,
"step": 420
},
{
"epoch": 0.22663347015678623,
"grad_norm": 1.7934239843519915,
"learning_rate": 9.020336047704105e-06,
"loss": 0.516,
"step": 421
},
{
"epoch": 0.22717179193863132,
"grad_norm": 1.5310720805604554,
"learning_rate": 9.015144488210927e-06,
"loss": 0.489,
"step": 422
},
{
"epoch": 0.22771011372047642,
"grad_norm": 1.48774951332565,
"learning_rate": 9.009940711469804e-06,
"loss": 0.5009,
"step": 423
},
{
"epoch": 0.22824843550232152,
"grad_norm": 2.4756529462562145,
"learning_rate": 9.004724733314738e-06,
"loss": 0.4406,
"step": 424
},
{
"epoch": 0.22878675728416661,
"grad_norm": 1.4505668733407078,
"learning_rate": 8.999496569616867e-06,
"loss": 0.4554,
"step": 425
},
{
"epoch": 0.2293250790660117,
"grad_norm": 1.7945762191089136,
"learning_rate": 8.994256236284402e-06,
"loss": 0.4632,
"step": 426
},
{
"epoch": 0.2298634008478568,
"grad_norm": 1.6376843185311614,
"learning_rate": 8.989003749262587e-06,
"loss": 0.4885,
"step": 427
},
{
"epoch": 0.2304017226297019,
"grad_norm": 1.8830741232863908,
"learning_rate": 8.983739124533644e-06,
"loss": 0.5075,
"step": 428
},
{
"epoch": 0.230940044411547,
"grad_norm": 1.3195150579928587,
"learning_rate": 8.978462378116729e-06,
"loss": 0.4708,
"step": 429
},
{
"epoch": 0.2314783661933921,
"grad_norm": 3.7495214134368977,
"learning_rate": 8.973173526067883e-06,
"loss": 0.4286,
"step": 430
},
{
"epoch": 0.2320166879752372,
"grad_norm": 2.359888838059791,
"learning_rate": 8.967872584479977e-06,
"loss": 0.5009,
"step": 431
},
{
"epoch": 0.2325550097570823,
"grad_norm": 2.307039087438763,
"learning_rate": 8.962559569482677e-06,
"loss": 0.5676,
"step": 432
},
{
"epoch": 0.2330933315389274,
"grad_norm": 1.6816015759212095,
"learning_rate": 8.957234497242378e-06,
"loss": 0.4741,
"step": 433
},
{
"epoch": 0.2336316533207725,
"grad_norm": 1.322921614998224,
"learning_rate": 8.951897383962163e-06,
"loss": 0.4688,
"step": 434
},
{
"epoch": 0.23416997510261758,
"grad_norm": 1.4430047272258668,
"learning_rate": 8.946548245881758e-06,
"loss": 0.4711,
"step": 435
},
{
"epoch": 0.23470829688446268,
"grad_norm": 1.5731159349637571,
"learning_rate": 8.941187099277475e-06,
"loss": 0.5128,
"step": 436
},
{
"epoch": 0.23524661866630778,
"grad_norm": 1.7731819377906834,
"learning_rate": 8.935813960462166e-06,
"loss": 0.4669,
"step": 437
},
{
"epoch": 0.23578494044815287,
"grad_norm": 1.5736170200351274,
"learning_rate": 8.930428845785171e-06,
"loss": 0.5151,
"step": 438
},
{
"epoch": 0.23632326222999797,
"grad_norm": 1.9488876650276103,
"learning_rate": 8.925031771632273e-06,
"loss": 0.449,
"step": 439
},
{
"epoch": 0.23686158401184307,
"grad_norm": 1.8677275264654012,
"learning_rate": 8.919622754425645e-06,
"loss": 0.4758,
"step": 440
},
{
"epoch": 0.23739990579368817,
"grad_norm": 1.6185523790901868,
"learning_rate": 8.914201810623796e-06,
"loss": 0.4539,
"step": 441
},
{
"epoch": 0.23793822757553326,
"grad_norm": 1.7808483857096469,
"learning_rate": 8.908768956721535e-06,
"loss": 0.5022,
"step": 442
},
{
"epoch": 0.2384765493573784,
"grad_norm": 1.5766134824810658,
"learning_rate": 8.903324209249895e-06,
"loss": 0.448,
"step": 443
},
{
"epoch": 0.23901487113922348,
"grad_norm": 1.734675342226781,
"learning_rate": 8.897867584776114e-06,
"loss": 0.4646,
"step": 444
},
{
"epoch": 0.23955319292106858,
"grad_norm": 1.5790149541067802,
"learning_rate": 8.892399099903564e-06,
"loss": 0.4786,
"step": 445
},
{
"epoch": 0.24009151470291368,
"grad_norm": 1.4746994503206987,
"learning_rate": 8.8869187712717e-06,
"loss": 0.5055,
"step": 446
},
{
"epoch": 0.24062983648475877,
"grad_norm": 1.629202002564735,
"learning_rate": 8.881426615556023e-06,
"loss": 0.4572,
"step": 447
},
{
"epoch": 0.24116815826660387,
"grad_norm": 2.060742412650639,
"learning_rate": 8.875922649468019e-06,
"loss": 0.5032,
"step": 448
},
{
"epoch": 0.24170648004844897,
"grad_norm": 1.5621749237333817,
"learning_rate": 8.87040688975511e-06,
"loss": 0.4654,
"step": 449
},
{
"epoch": 0.24224480183029407,
"grad_norm": 1.4674899116105513,
"learning_rate": 8.864879353200599e-06,
"loss": 0.4747,
"step": 450
},
{
"epoch": 0.24278312361213916,
"grad_norm": 1.5183875651941505,
"learning_rate": 8.859340056623632e-06,
"loss": 0.4982,
"step": 451
},
{
"epoch": 0.24332144539398426,
"grad_norm": 1.5706370531453442,
"learning_rate": 8.853789016879134e-06,
"loss": 0.4667,
"step": 452
},
{
"epoch": 0.24385976717582936,
"grad_norm": 1.6305623278282155,
"learning_rate": 8.84822625085776e-06,
"loss": 0.456,
"step": 453
},
{
"epoch": 0.24439808895767445,
"grad_norm": 1.6523301690172285,
"learning_rate": 8.842651775485848e-06,
"loss": 0.5383,
"step": 454
},
{
"epoch": 0.24493641073951955,
"grad_norm": 1.5998220743266833,
"learning_rate": 8.837065607725368e-06,
"loss": 0.4829,
"step": 455
},
{
"epoch": 0.24547473252136465,
"grad_norm": 1.7862569885991761,
"learning_rate": 8.831467764573863e-06,
"loss": 0.5101,
"step": 456
},
{
"epoch": 0.24601305430320974,
"grad_norm": 1.704691179868801,
"learning_rate": 8.8258582630644e-06,
"loss": 0.4627,
"step": 457
},
{
"epoch": 0.24655137608505484,
"grad_norm": 1.7756811764982563,
"learning_rate": 8.820237120265526e-06,
"loss": 0.5079,
"step": 458
},
{
"epoch": 0.24708969786689994,
"grad_norm": 1.3696742776597963,
"learning_rate": 8.814604353281206e-06,
"loss": 0.4393,
"step": 459
},
{
"epoch": 0.24762801964874503,
"grad_norm": 2.7637461827933083,
"learning_rate": 8.80895997925078e-06,
"loss": 0.4548,
"step": 460
},
{
"epoch": 0.24816634143059013,
"grad_norm": 1.9115795242982947,
"learning_rate": 8.803304015348894e-06,
"loss": 0.4805,
"step": 461
},
{
"epoch": 0.24870466321243523,
"grad_norm": 1.6805506691737162,
"learning_rate": 8.797636478785475e-06,
"loss": 0.4786,
"step": 462
},
{
"epoch": 0.24924298499428033,
"grad_norm": 1.865661091263274,
"learning_rate": 8.791957386805651e-06,
"loss": 0.4722,
"step": 463
},
{
"epoch": 0.24978130677612542,
"grad_norm": 1.9405317358586787,
"learning_rate": 8.78626675668972e-06,
"loss": 0.4705,
"step": 464
},
{
"epoch": 0.2503196285579705,
"grad_norm": 1.4415009315383829,
"learning_rate": 8.78056460575308e-06,
"loss": 0.4301,
"step": 465
},
{
"epoch": 0.2508579503398156,
"grad_norm": 1.6060330602526178,
"learning_rate": 8.774850951346188e-06,
"loss": 0.4114,
"step": 466
},
{
"epoch": 0.2513962721216607,
"grad_norm": 1.7567677906852937,
"learning_rate": 8.769125810854504e-06,
"loss": 0.4922,
"step": 467
},
{
"epoch": 0.2519345939035058,
"grad_norm": 1.4281502602519498,
"learning_rate": 8.763389201698438e-06,
"loss": 0.4426,
"step": 468
},
{
"epoch": 0.2524729156853509,
"grad_norm": 1.787920776798679,
"learning_rate": 8.757641141333296e-06,
"loss": 0.4451,
"step": 469
},
{
"epoch": 0.253011237467196,
"grad_norm": 1.4246034781799948,
"learning_rate": 8.751881647249228e-06,
"loss": 0.4353,
"step": 470
},
{
"epoch": 0.2535495592490411,
"grad_norm": 1.6679185342871934,
"learning_rate": 8.746110736971175e-06,
"loss": 0.4573,
"step": 471
},
{
"epoch": 0.2540878810308862,
"grad_norm": 1.6765594656197593,
"learning_rate": 8.740328428058813e-06,
"loss": 0.4797,
"step": 472
},
{
"epoch": 0.2546262028127313,
"grad_norm": 1.7826390062476167,
"learning_rate": 8.734534738106503e-06,
"loss": 0.473,
"step": 473
},
{
"epoch": 0.2551645245945764,
"grad_norm": 2.195730177211015,
"learning_rate": 8.728729684743238e-06,
"loss": 0.4648,
"step": 474
},
{
"epoch": 0.2557028463764215,
"grad_norm": 1.475566632306908,
"learning_rate": 8.722913285632584e-06,
"loss": 0.4845,
"step": 475
},
{
"epoch": 0.2562411681582666,
"grad_norm": 1.7347583810505152,
"learning_rate": 8.717085558472631e-06,
"loss": 0.4708,
"step": 476
},
{
"epoch": 0.2567794899401117,
"grad_norm": 1.6902146229456119,
"learning_rate": 8.71124652099594e-06,
"loss": 0.4817,
"step": 477
},
{
"epoch": 0.2573178117219568,
"grad_norm": 1.7071042054828858,
"learning_rate": 8.705396190969484e-06,
"loss": 0.4712,
"step": 478
},
{
"epoch": 0.2578561335038019,
"grad_norm": 1.729348975756144,
"learning_rate": 8.699534586194598e-06,
"loss": 0.4881,
"step": 479
},
{
"epoch": 0.258394455285647,
"grad_norm": 1.4614872127177663,
"learning_rate": 8.693661724506924e-06,
"loss": 0.457,
"step": 480
},
{
"epoch": 0.258394455285647,
"eval_loss": 0.4751787483692169,
"eval_runtime": 1539.7899,
"eval_samples_per_second": 16.242,
"eval_steps_per_second": 0.508,
"step": 480
},
{
"epoch": 0.25893277706749207,
"grad_norm": 2.1154756500873977,
"learning_rate": 8.687777623776357e-06,
"loss": 0.4842,
"step": 481
},
{
"epoch": 0.25947109884933717,
"grad_norm": 1.5862460419373354,
"learning_rate": 8.681882301906988e-06,
"loss": 0.4432,
"step": 482
},
{
"epoch": 0.26000942063118226,
"grad_norm": 1.796404843665338,
"learning_rate": 8.675975776837053e-06,
"loss": 0.4759,
"step": 483
},
{
"epoch": 0.26054774241302736,
"grad_norm": 1.5555927859924092,
"learning_rate": 8.67005806653888e-06,
"loss": 0.509,
"step": 484
},
{
"epoch": 0.26108606419487246,
"grad_norm": 2.1699720622194354,
"learning_rate": 8.664129189018826e-06,
"loss": 0.5334,
"step": 485
},
{
"epoch": 0.2616243859767176,
"grad_norm": 1.690073634180223,
"learning_rate": 8.658189162317226e-06,
"loss": 0.4356,
"step": 486
},
{
"epoch": 0.2621627077585627,
"grad_norm": 1.8294975401345657,
"learning_rate": 8.65223800450835e-06,
"loss": 0.4387,
"step": 487
},
{
"epoch": 0.2627010295404078,
"grad_norm": 2.5288130694594337,
"learning_rate": 8.646275733700327e-06,
"loss": 0.4567,
"step": 488
},
{
"epoch": 0.2632393513222529,
"grad_norm": 1.957861459161194,
"learning_rate": 8.640302368035105e-06,
"loss": 0.4614,
"step": 489
},
{
"epoch": 0.263777673104098,
"grad_norm": 1.5304950580333017,
"learning_rate": 8.634317925688392e-06,
"loss": 0.4655,
"step": 490
},
{
"epoch": 0.2643159948859431,
"grad_norm": 1.667011172421826,
"learning_rate": 8.628322424869599e-06,
"loss": 0.4834,
"step": 491
},
{
"epoch": 0.2648543166677882,
"grad_norm": 2.1636641173694464,
"learning_rate": 8.622315883821783e-06,
"loss": 0.4776,
"step": 492
},
{
"epoch": 0.2653926384496333,
"grad_norm": 1.46798046973594,
"learning_rate": 8.616298320821601e-06,
"loss": 0.4272,
"step": 493
},
{
"epoch": 0.2659309602314784,
"grad_norm": 1.861178177564276,
"learning_rate": 8.61026975417924e-06,
"loss": 0.4784,
"step": 494
},
{
"epoch": 0.2664692820133235,
"grad_norm": 1.6268110739530368,
"learning_rate": 8.604230202238373e-06,
"loss": 0.5029,
"step": 495
},
{
"epoch": 0.2670076037951686,
"grad_norm": 1.5680263307618678,
"learning_rate": 8.598179683376098e-06,
"loss": 0.4225,
"step": 496
},
{
"epoch": 0.2675459255770137,
"grad_norm": 1.5774347517397593,
"learning_rate": 8.592118216002883e-06,
"loss": 0.4879,
"step": 497
},
{
"epoch": 0.2680842473588588,
"grad_norm": 2.670832440569625,
"learning_rate": 8.586045818562508e-06,
"loss": 0.4667,
"step": 498
},
{
"epoch": 0.26862256914070387,
"grad_norm": 2.2055704035459787,
"learning_rate": 8.579962509532016e-06,
"loss": 0.4331,
"step": 499
},
{
"epoch": 0.26916089092254897,
"grad_norm": 1.4435727148058994,
"learning_rate": 8.573868307421648e-06,
"loss": 0.4894,
"step": 500
},
{
"epoch": 0.26969921270439406,
"grad_norm": 1.6814136996880347,
"learning_rate": 8.567763230774789e-06,
"loss": 0.4697,
"step": 501
},
{
"epoch": 0.27023753448623916,
"grad_norm": 1.5774141123551826,
"learning_rate": 8.561647298167918e-06,
"loss": 0.503,
"step": 502
},
{
"epoch": 0.27077585626808426,
"grad_norm": 1.5778826165083357,
"learning_rate": 8.555520528210541e-06,
"loss": 0.4535,
"step": 503
},
{
"epoch": 0.27131417804992936,
"grad_norm": 1.7129721491097367,
"learning_rate": 8.549382939545143e-06,
"loss": 0.4494,
"step": 504
},
{
"epoch": 0.27185249983177445,
"grad_norm": 1.8943346844828264,
"learning_rate": 8.543234550847128e-06,
"loss": 0.5063,
"step": 505
},
{
"epoch": 0.27239082161361955,
"grad_norm": 1.5886936361058726,
"learning_rate": 8.537075380824761e-06,
"loss": 0.4652,
"step": 506
},
{
"epoch": 0.27292914339546465,
"grad_norm": 1.4831172032030655,
"learning_rate": 8.530905448219112e-06,
"loss": 0.4243,
"step": 507
},
{
"epoch": 0.27346746517730974,
"grad_norm": 1.7919686995453996,
"learning_rate": 8.524724771804001e-06,
"loss": 0.5049,
"step": 508
},
{
"epoch": 0.27400578695915484,
"grad_norm": 1.7505822684442558,
"learning_rate": 8.518533370385939e-06,
"loss": 0.4423,
"step": 509
},
{
"epoch": 0.27454410874099994,
"grad_norm": 1.5798026347891434,
"learning_rate": 8.512331262804069e-06,
"loss": 0.4866,
"step": 510
},
{
"epoch": 0.27508243052284503,
"grad_norm": 1.8464155171834333,
"learning_rate": 8.506118467930112e-06,
"loss": 0.4708,
"step": 511
},
{
"epoch": 0.27562075230469013,
"grad_norm": 1.6897436623195476,
"learning_rate": 8.499895004668308e-06,
"loss": 0.4903,
"step": 512
},
{
"epoch": 0.27615907408653523,
"grad_norm": 1.7863457448170967,
"learning_rate": 8.49366089195536e-06,
"loss": 0.5092,
"step": 513
},
{
"epoch": 0.2766973958683803,
"grad_norm": 1.7320740104134424,
"learning_rate": 8.487416148760375e-06,
"loss": 0.48,
"step": 514
},
{
"epoch": 0.2772357176502254,
"grad_norm": 1.7064456081649735,
"learning_rate": 8.481160794084799e-06,
"loss": 0.4754,
"step": 515
},
{
"epoch": 0.2777740394320705,
"grad_norm": 1.7525756365837095,
"learning_rate": 8.47489484696238e-06,
"loss": 0.427,
"step": 516
},
{
"epoch": 0.2783123612139156,
"grad_norm": 2.058946941055886,
"learning_rate": 8.468618326459086e-06,
"loss": 0.4847,
"step": 517
},
{
"epoch": 0.2788506829957607,
"grad_norm": 2.0477477556261467,
"learning_rate": 8.46233125167306e-06,
"loss": 0.4579,
"step": 518
},
{
"epoch": 0.2793890047776058,
"grad_norm": 1.783616738245662,
"learning_rate": 8.456033641734562e-06,
"loss": 0.4858,
"step": 519
},
{
"epoch": 0.2799273265594509,
"grad_norm": 2.0513841896237444,
"learning_rate": 8.449725515805907e-06,
"loss": 0.5352,
"step": 520
},
{
"epoch": 0.280465648341296,
"grad_norm": 1.6372025528727123,
"learning_rate": 8.443406893081406e-06,
"loss": 0.4618,
"step": 521
},
{
"epoch": 0.2810039701231411,
"grad_norm": 1.5571805104955587,
"learning_rate": 8.437077792787314e-06,
"loss": 0.4038,
"step": 522
},
{
"epoch": 0.2815422919049862,
"grad_norm": 1.75233105631481,
"learning_rate": 8.43073823418176e-06,
"loss": 0.4845,
"step": 523
},
{
"epoch": 0.2820806136868313,
"grad_norm": 1.6881033261753147,
"learning_rate": 8.424388236554704e-06,
"loss": 0.4865,
"step": 524
},
{
"epoch": 0.2826189354686764,
"grad_norm": 1.796069079351986,
"learning_rate": 8.418027819227861e-06,
"loss": 0.4538,
"step": 525
},
{
"epoch": 0.2831572572505215,
"grad_norm": 1.24349614978993,
"learning_rate": 8.41165700155466e-06,
"loss": 0.4166,
"step": 526
},
{
"epoch": 0.2836955790323666,
"grad_norm": 1.932274887854439,
"learning_rate": 8.405275802920168e-06,
"loss": 0.5061,
"step": 527
},
{
"epoch": 0.2842339008142117,
"grad_norm": 1.5593268393001998,
"learning_rate": 8.398884242741045e-06,
"loss": 0.4894,
"step": 528
},
{
"epoch": 0.2847722225960568,
"grad_norm": 1.7069043502360113,
"learning_rate": 8.392482340465475e-06,
"loss": 0.4485,
"step": 529
},
{
"epoch": 0.2853105443779019,
"grad_norm": 1.5063144141336193,
"learning_rate": 8.386070115573115e-06,
"loss": 0.4175,
"step": 530
},
{
"epoch": 0.285848866159747,
"grad_norm": 1.4364305869165457,
"learning_rate": 8.379647587575026e-06,
"loss": 0.4766,
"step": 531
},
{
"epoch": 0.28638718794159207,
"grad_norm": 1.3932649525614649,
"learning_rate": 8.373214776013625e-06,
"loss": 0.406,
"step": 532
},
{
"epoch": 0.28692550972343717,
"grad_norm": 1.5523357464392091,
"learning_rate": 8.366771700462615e-06,
"loss": 0.508,
"step": 533
},
{
"epoch": 0.28746383150528226,
"grad_norm": 2.1213305217928613,
"learning_rate": 8.360318380526932e-06,
"loss": 0.4985,
"step": 534
},
{
"epoch": 0.28800215328712736,
"grad_norm": 1.5873480547904262,
"learning_rate": 8.353854835842685e-06,
"loss": 0.4919,
"step": 535
},
{
"epoch": 0.28854047506897246,
"grad_norm": 1.5670280821673355,
"learning_rate": 8.347381086077095e-06,
"loss": 0.4708,
"step": 536
},
{
"epoch": 0.28907879685081755,
"grad_norm": 1.6763746949820768,
"learning_rate": 8.34089715092843e-06,
"loss": 0.4165,
"step": 537
},
{
"epoch": 0.28961711863266265,
"grad_norm": 1.5717106133141925,
"learning_rate": 8.334403050125956e-06,
"loss": 0.4554,
"step": 538
},
{
"epoch": 0.29015544041450775,
"grad_norm": 1.9743994746638458,
"learning_rate": 8.327898803429866e-06,
"loss": 0.4695,
"step": 539
},
{
"epoch": 0.29069376219635285,
"grad_norm": 1.5473676266482859,
"learning_rate": 8.32138443063123e-06,
"loss": 0.4712,
"step": 540
},
{
"epoch": 0.29069376219635285,
"eval_loss": 0.47182729840278625,
"eval_runtime": 1553.992,
"eval_samples_per_second": 16.094,
"eval_steps_per_second": 0.503,
"step": 540
},
{
"epoch": 0.29123208397819794,
"grad_norm": 1.4425882953477511,
"learning_rate": 8.314859951551926e-06,
"loss": 0.4837,
"step": 541
},
{
"epoch": 0.29177040576004304,
"grad_norm": 1.3326493426074462,
"learning_rate": 8.308325386044583e-06,
"loss": 0.4814,
"step": 542
},
{
"epoch": 0.2923087275418882,
"grad_norm": 1.6128638362772016,
"learning_rate": 8.301780753992523e-06,
"loss": 0.4575,
"step": 543
},
{
"epoch": 0.2928470493237333,
"grad_norm": 1.4423693981211698,
"learning_rate": 8.295226075309697e-06,
"loss": 0.4633,
"step": 544
},
{
"epoch": 0.2933853711055784,
"grad_norm": 1.6198600771922913,
"learning_rate": 8.288661369940627e-06,
"loss": 0.4463,
"step": 545
},
{
"epoch": 0.2939236928874235,
"grad_norm": 1.5249628074643904,
"learning_rate": 8.282086657860342e-06,
"loss": 0.4668,
"step": 546
},
{
"epoch": 0.2944620146692686,
"grad_norm": 1.8125904384120293,
"learning_rate": 8.275501959074325e-06,
"loss": 0.4825,
"step": 547
},
{
"epoch": 0.2950003364511137,
"grad_norm": 1.9606743516276068,
"learning_rate": 8.268907293618437e-06,
"loss": 0.4684,
"step": 548
},
{
"epoch": 0.2955386582329588,
"grad_norm": 1.494990763192773,
"learning_rate": 8.262302681558872e-06,
"loss": 0.4664,
"step": 549
},
{
"epoch": 0.29607698001480387,
"grad_norm": 1.8337579001893594,
"learning_rate": 8.255688142992089e-06,
"loss": 0.4699,
"step": 550
},
{
"epoch": 0.29661530179664897,
"grad_norm": 1.779841389754219,
"learning_rate": 8.24906369804475e-06,
"loss": 0.4857,
"step": 551
},
{
"epoch": 0.29715362357849406,
"grad_norm": 1.6593925240524081,
"learning_rate": 8.242429366873663e-06,
"loss": 0.5038,
"step": 552
},
{
"epoch": 0.29769194536033916,
"grad_norm": 1.9956877344800352,
"learning_rate": 8.235785169665711e-06,
"loss": 0.4911,
"step": 553
},
{
"epoch": 0.29823026714218426,
"grad_norm": 1.579568204329291,
"learning_rate": 8.229131126637804e-06,
"loss": 0.4552,
"step": 554
},
{
"epoch": 0.29876858892402935,
"grad_norm": 1.5989428055850947,
"learning_rate": 8.222467258036808e-06,
"loss": 0.5177,
"step": 555
},
{
"epoch": 0.29930691070587445,
"grad_norm": 2.349536199541145,
"learning_rate": 8.215793584139485e-06,
"loss": 0.4911,
"step": 556
},
{
"epoch": 0.29984523248771955,
"grad_norm": 1.9403593317863332,
"learning_rate": 8.209110125252435e-06,
"loss": 0.5061,
"step": 557
},
{
"epoch": 0.30038355426956465,
"grad_norm": 1.7346564666609186,
"learning_rate": 8.202416901712033e-06,
"loss": 0.4357,
"step": 558
},
{
"epoch": 0.30092187605140974,
"grad_norm": 1.710471255918245,
"learning_rate": 8.195713933884359e-06,
"loss": 0.5015,
"step": 559
},
{
"epoch": 0.30146019783325484,
"grad_norm": 2.207816727293276,
"learning_rate": 8.189001242165151e-06,
"loss": 0.527,
"step": 560
},
{
"epoch": 0.30199851961509994,
"grad_norm": 1.428363458277829,
"learning_rate": 8.182278846979728e-06,
"loss": 0.4983,
"step": 561
},
{
"epoch": 0.30253684139694503,
"grad_norm": 1.77069966551508,
"learning_rate": 8.175546768782938e-06,
"loss": 0.4996,
"step": 562
},
{
"epoch": 0.30307516317879013,
"grad_norm": 1.631420375855133,
"learning_rate": 8.168805028059095e-06,
"loss": 0.4899,
"step": 563
},
{
"epoch": 0.3036134849606352,
"grad_norm": 1.6234744365340297,
"learning_rate": 8.162053645321908e-06,
"loss": 0.4275,
"step": 564
},
{
"epoch": 0.3041518067424803,
"grad_norm": 1.7151129037835051,
"learning_rate": 8.15529264111443e-06,
"loss": 0.4628,
"step": 565
},
{
"epoch": 0.3046901285243254,
"grad_norm": 1.6757537025608307,
"learning_rate": 8.148522036008985e-06,
"loss": 0.4636,
"step": 566
},
{
"epoch": 0.3052284503061705,
"grad_norm": 1.157809434742461,
"learning_rate": 8.141741850607117e-06,
"loss": 0.3868,
"step": 567
},
{
"epoch": 0.3057667720880156,
"grad_norm": 1.4360027236144732,
"learning_rate": 8.134952105539515e-06,
"loss": 0.4725,
"step": 568
},
{
"epoch": 0.3063050938698607,
"grad_norm": 1.6762158717929798,
"learning_rate": 8.128152821465957e-06,
"loss": 0.4818,
"step": 569
},
{
"epoch": 0.3068434156517058,
"grad_norm": 1.6736535469921034,
"learning_rate": 8.121344019075253e-06,
"loss": 0.4805,
"step": 570
},
{
"epoch": 0.3073817374335509,
"grad_norm": 1.5918931966460608,
"learning_rate": 8.114525719085163e-06,
"loss": 0.5152,
"step": 571
},
{
"epoch": 0.307920059215396,
"grad_norm": 1.4169517878992852,
"learning_rate": 8.107697942242356e-06,
"loss": 0.4731,
"step": 572
},
{
"epoch": 0.3084583809972411,
"grad_norm": 1.5959353428431666,
"learning_rate": 8.100860709322334e-06,
"loss": 0.4463,
"step": 573
},
{
"epoch": 0.3089967027790862,
"grad_norm": 1.4569323564340282,
"learning_rate": 8.094014041129373e-06,
"loss": 0.4046,
"step": 574
},
{
"epoch": 0.3095350245609313,
"grad_norm": 1.5558748525412556,
"learning_rate": 8.087157958496456e-06,
"loss": 0.4644,
"step": 575
},
{
"epoch": 0.3100733463427764,
"grad_norm": 1.6641076139574378,
"learning_rate": 8.080292482285213e-06,
"loss": 0.5064,
"step": 576
},
{
"epoch": 0.3106116681246215,
"grad_norm": 1.5793644667521578,
"learning_rate": 8.07341763338586e-06,
"loss": 0.515,
"step": 577
},
{
"epoch": 0.3111499899064666,
"grad_norm": 1.895774618714942,
"learning_rate": 8.066533432717127e-06,
"loss": 0.4763,
"step": 578
},
{
"epoch": 0.3116883116883117,
"grad_norm": 1.6689610869771314,
"learning_rate": 8.059639901226203e-06,
"loss": 0.4487,
"step": 579
},
{
"epoch": 0.3122266334701568,
"grad_norm": 1.4289516860868958,
"learning_rate": 8.05273705988867e-06,
"loss": 0.426,
"step": 580
},
{
"epoch": 0.3127649552520019,
"grad_norm": 1.448460429863824,
"learning_rate": 8.04582492970843e-06,
"loss": 0.4622,
"step": 581
},
{
"epoch": 0.31330327703384697,
"grad_norm": 1.562340995796949,
"learning_rate": 8.038903531717662e-06,
"loss": 0.4644,
"step": 582
},
{
"epoch": 0.31384159881569207,
"grad_norm": 1.4837986133941243,
"learning_rate": 8.031972886976731e-06,
"loss": 0.4845,
"step": 583
},
{
"epoch": 0.31437992059753717,
"grad_norm": 1.696043847539263,
"learning_rate": 8.025033016574148e-06,
"loss": 0.4631,
"step": 584
},
{
"epoch": 0.31491824237938226,
"grad_norm": 1.8636443570370922,
"learning_rate": 8.018083941626494e-06,
"loss": 0.4582,
"step": 585
},
{
"epoch": 0.31545656416122736,
"grad_norm": 1.6588060343624296,
"learning_rate": 8.011125683278351e-06,
"loss": 0.4118,
"step": 586
},
{
"epoch": 0.31599488594307246,
"grad_norm": 2.064927405044272,
"learning_rate": 8.004158262702253e-06,
"loss": 0.5307,
"step": 587
},
{
"epoch": 0.31653320772491755,
"grad_norm": 1.7599540523459494,
"learning_rate": 7.997181701098608e-06,
"loss": 0.4542,
"step": 588
},
{
"epoch": 0.31707152950676265,
"grad_norm": 1.679120614548226,
"learning_rate": 7.99019601969564e-06,
"loss": 0.4462,
"step": 589
},
{
"epoch": 0.31760985128860775,
"grad_norm": 1.6748781594901945,
"learning_rate": 7.983201239749321e-06,
"loss": 0.4435,
"step": 590
},
{
"epoch": 0.31814817307045284,
"grad_norm": 1.6895768411385892,
"learning_rate": 7.976197382543306e-06,
"loss": 0.5043,
"step": 591
},
{
"epoch": 0.31868649485229794,
"grad_norm": 1.4551705590923076,
"learning_rate": 7.969184469388877e-06,
"loss": 0.4992,
"step": 592
},
{
"epoch": 0.31922481663414304,
"grad_norm": 1.8224446520059305,
"learning_rate": 7.962162521624865e-06,
"loss": 0.5242,
"step": 593
},
{
"epoch": 0.31976313841598814,
"grad_norm": 1.5471915857747345,
"learning_rate": 7.955131560617595e-06,
"loss": 0.4672,
"step": 594
},
{
"epoch": 0.32030146019783323,
"grad_norm": 1.943277469873626,
"learning_rate": 7.948091607760815e-06,
"loss": 0.4817,
"step": 595
},
{
"epoch": 0.32083978197967833,
"grad_norm": 1.361762394527565,
"learning_rate": 7.941042684475635e-06,
"loss": 0.4341,
"step": 596
},
{
"epoch": 0.3213781037615234,
"grad_norm": 1.578768861245864,
"learning_rate": 7.933984812210459e-06,
"loss": 0.452,
"step": 597
},
{
"epoch": 0.3219164255433685,
"grad_norm": 1.3732353872225034,
"learning_rate": 7.926918012440923e-06,
"loss": 0.4349,
"step": 598
},
{
"epoch": 0.3224547473252136,
"grad_norm": 1.8064334973816905,
"learning_rate": 7.919842306669825e-06,
"loss": 0.4499,
"step": 599
},
{
"epoch": 0.3229930691070587,
"grad_norm": 1.582853458222087,
"learning_rate": 7.912757716427062e-06,
"loss": 0.4865,
"step": 600
},
{
"epoch": 0.3229930691070587,
"eval_loss": 0.4672350585460663,
"eval_runtime": 1563.3319,
"eval_samples_per_second": 15.998,
"eval_steps_per_second": 0.5,
"step": 600
},
{
"epoch": 0.32353139088890387,
"grad_norm": 1.6009402167895466,
"learning_rate": 7.905664263269567e-06,
"loss": 0.4576,
"step": 601
},
{
"epoch": 0.32406971267074897,
"grad_norm": 1.6832973254975117,
"learning_rate": 7.898561968781242e-06,
"loss": 0.457,
"step": 602
},
{
"epoch": 0.32460803445259406,
"grad_norm": 4.046599916473538,
"learning_rate": 7.891450854572884e-06,
"loss": 0.49,
"step": 603
},
{
"epoch": 0.32514635623443916,
"grad_norm": 1.5254137578843718,
"learning_rate": 7.884330942282136e-06,
"loss": 0.4533,
"step": 604
},
{
"epoch": 0.32568467801628426,
"grad_norm": 1.5392402810831298,
"learning_rate": 7.877202253573404e-06,
"loss": 0.4566,
"step": 605
},
{
"epoch": 0.32622299979812935,
"grad_norm": 1.5838863815714255,
"learning_rate": 7.870064810137806e-06,
"loss": 0.4224,
"step": 606
},
{
"epoch": 0.32676132157997445,
"grad_norm": 1.5112598539099842,
"learning_rate": 7.862918633693091e-06,
"loss": 0.4537,
"step": 607
},
{
"epoch": 0.32729964336181955,
"grad_norm": 1.7380984306062113,
"learning_rate": 7.855763745983588e-06,
"loss": 0.5168,
"step": 608
},
{
"epoch": 0.32783796514366464,
"grad_norm": 1.3686616623355445,
"learning_rate": 7.848600168780127e-06,
"loss": 0.4774,
"step": 609
},
{
"epoch": 0.32837628692550974,
"grad_norm": 1.8037345014596735,
"learning_rate": 7.841427923879982e-06,
"loss": 0.4841,
"step": 610
},
{
"epoch": 0.32891460870735484,
"grad_norm": 1.5578093278723995,
"learning_rate": 7.834247033106798e-06,
"loss": 0.4494,
"step": 611
},
{
"epoch": 0.32945293048919994,
"grad_norm": 1.7470526074648303,
"learning_rate": 7.827057518310532e-06,
"loss": 0.4316,
"step": 612
},
{
"epoch": 0.32999125227104503,
"grad_norm": 1.344635684714144,
"learning_rate": 7.819859401367376e-06,
"loss": 0.4277,
"step": 613
},
{
"epoch": 0.33052957405289013,
"grad_norm": 1.6142148463610868,
"learning_rate": 7.8126527041797e-06,
"loss": 0.4732,
"step": 614
},
{
"epoch": 0.3310678958347352,
"grad_norm": 1.4894686294102883,
"learning_rate": 7.805437448675986e-06,
"loss": 0.4804,
"step": 615
},
{
"epoch": 0.3316062176165803,
"grad_norm": 1.959553525810308,
"learning_rate": 7.798213656810747e-06,
"loss": 0.5052,
"step": 616
},
{
"epoch": 0.3321445393984254,
"grad_norm": 1.5799236754205312,
"learning_rate": 7.790981350564482e-06,
"loss": 0.432,
"step": 617
},
{
"epoch": 0.3326828611802705,
"grad_norm": 1.82490515289263,
"learning_rate": 7.783740551943586e-06,
"loss": 0.4394,
"step": 618
},
{
"epoch": 0.3332211829621156,
"grad_norm": 1.5031228288941465,
"learning_rate": 7.776491282980305e-06,
"loss": 0.5064,
"step": 619
},
{
"epoch": 0.3337595047439607,
"grad_norm": 1.4329349118783261,
"learning_rate": 7.76923356573265e-06,
"loss": 0.489,
"step": 620
},
{
"epoch": 0.3342978265258058,
"grad_norm": 1.4961946186338742,
"learning_rate": 7.761967422284347e-06,
"loss": 0.4704,
"step": 621
},
{
"epoch": 0.3348361483076509,
"grad_norm": 1.7319823672043928,
"learning_rate": 7.754692874744752e-06,
"loss": 0.4621,
"step": 622
},
{
"epoch": 0.335374470089496,
"grad_norm": 2.0507693298974035,
"learning_rate": 7.747409945248797e-06,
"loss": 0.502,
"step": 623
},
{
"epoch": 0.3359127918713411,
"grad_norm": 1.4817353671174234,
"learning_rate": 7.74011865595692e-06,
"loss": 0.4975,
"step": 624
},
{
"epoch": 0.3364511136531862,
"grad_norm": 1.5154706925154366,
"learning_rate": 7.732819029054999e-06,
"loss": 0.4819,
"step": 625
},
{
"epoch": 0.3369894354350313,
"grad_norm": 2.9866409096863507,
"learning_rate": 7.725511086754269e-06,
"loss": 0.4947,
"step": 626
},
{
"epoch": 0.3375277572168764,
"grad_norm": 1.7699700957236326,
"learning_rate": 7.718194851291284e-06,
"loss": 0.4703,
"step": 627
},
{
"epoch": 0.3380660789987215,
"grad_norm": 2.371528841529566,
"learning_rate": 7.710870344927817e-06,
"loss": 0.5458,
"step": 628
},
{
"epoch": 0.3386044007805666,
"grad_norm": 1.5200234564971724,
"learning_rate": 7.703537589950819e-06,
"loss": 0.4562,
"step": 629
},
{
"epoch": 0.3391427225624117,
"grad_norm": 1.371146036616362,
"learning_rate": 7.696196608672333e-06,
"loss": 0.4196,
"step": 630
},
{
"epoch": 0.3396810443442568,
"grad_norm": 1.5627852767313657,
"learning_rate": 7.688847423429434e-06,
"loss": 0.505,
"step": 631
},
{
"epoch": 0.3402193661261019,
"grad_norm": 1.3089486655111793,
"learning_rate": 7.68149005658417e-06,
"loss": 0.4532,
"step": 632
},
{
"epoch": 0.34075768790794697,
"grad_norm": 1.72862210074593,
"learning_rate": 7.674124530523461e-06,
"loss": 0.5431,
"step": 633
},
{
"epoch": 0.34129600968979207,
"grad_norm": 1.397330557638678,
"learning_rate": 7.666750867659078e-06,
"loss": 0.46,
"step": 634
},
{
"epoch": 0.34183433147163717,
"grad_norm": 1.5822930242940645,
"learning_rate": 7.659369090427537e-06,
"loss": 0.5183,
"step": 635
},
{
"epoch": 0.34237265325348226,
"grad_norm": 1.517257101602274,
"learning_rate": 7.651979221290049e-06,
"loss": 0.4847,
"step": 636
},
{
"epoch": 0.34291097503532736,
"grad_norm": 1.569552765274582,
"learning_rate": 7.644581282732445e-06,
"loss": 0.5237,
"step": 637
},
{
"epoch": 0.34344929681717246,
"grad_norm": 1.5173887839906304,
"learning_rate": 7.637175297265109e-06,
"loss": 0.444,
"step": 638
},
{
"epoch": 0.34398761859901755,
"grad_norm": 2.6037607041595883,
"learning_rate": 7.629761287422915e-06,
"loss": 0.4271,
"step": 639
},
{
"epoch": 0.34452594038086265,
"grad_norm": 1.6900192017878133,
"learning_rate": 7.622339275765147e-06,
"loss": 0.4631,
"step": 640
},
{
"epoch": 0.34506426216270775,
"grad_norm": 1.6204089265699804,
"learning_rate": 7.61490928487544e-06,
"loss": 0.4798,
"step": 641
},
{
"epoch": 0.34560258394455284,
"grad_norm": 2.072148397739707,
"learning_rate": 7.6074713373617094e-06,
"loss": 0.5169,
"step": 642
},
{
"epoch": 0.34614090572639794,
"grad_norm": 1.4489303833679512,
"learning_rate": 7.600025455856078e-06,
"loss": 0.4477,
"step": 643
},
{
"epoch": 0.34667922750824304,
"grad_norm": 1.808968142318587,
"learning_rate": 7.592571663014811e-06,
"loss": 0.4591,
"step": 644
},
{
"epoch": 0.34721754929008813,
"grad_norm": 1.4861828747421941,
"learning_rate": 7.5851099815182505e-06,
"loss": 0.4792,
"step": 645
},
{
"epoch": 0.34775587107193323,
"grad_norm": 1.6729126421729203,
"learning_rate": 7.577640434070734e-06,
"loss": 0.4832,
"step": 646
},
{
"epoch": 0.34829419285377833,
"grad_norm": 1.871195222211602,
"learning_rate": 7.5701630434005405e-06,
"loss": 0.4417,
"step": 647
},
{
"epoch": 0.3488325146356234,
"grad_norm": 1.51735945461571,
"learning_rate": 7.56267783225981e-06,
"loss": 0.4741,
"step": 648
},
{
"epoch": 0.3493708364174685,
"grad_norm": 2.071142969866682,
"learning_rate": 7.555184823424479e-06,
"loss": 0.4127,
"step": 649
},
{
"epoch": 0.3499091581993136,
"grad_norm": 1.910282433363155,
"learning_rate": 7.547684039694216e-06,
"loss": 0.4531,
"step": 650
},
{
"epoch": 0.3504474799811587,
"grad_norm": 1.9652818314978835,
"learning_rate": 7.54017550389234e-06,
"loss": 0.5085,
"step": 651
},
{
"epoch": 0.3509858017630038,
"grad_norm": 1.6117024086203307,
"learning_rate": 7.5326592388657605e-06,
"loss": 0.5148,
"step": 652
},
{
"epoch": 0.3515241235448489,
"grad_norm": 1.4960314880258612,
"learning_rate": 7.525135267484906e-06,
"loss": 0.4629,
"step": 653
},
{
"epoch": 0.352062445326694,
"grad_norm": 1.604228922752054,
"learning_rate": 7.517603612643653e-06,
"loss": 0.5117,
"step": 654
},
{
"epoch": 0.3526007671085391,
"grad_norm": 2.136019956641433,
"learning_rate": 7.5100642972592606e-06,
"loss": 0.4629,
"step": 655
},
{
"epoch": 0.3531390888903842,
"grad_norm": 1.4857693238664922,
"learning_rate": 7.50251734427229e-06,
"loss": 0.4671,
"step": 656
},
{
"epoch": 0.3536774106722293,
"grad_norm": 1.4380772688023766,
"learning_rate": 7.494962776646549e-06,
"loss": 0.428,
"step": 657
},
{
"epoch": 0.35421573245407445,
"grad_norm": 1.7510803552126726,
"learning_rate": 7.487400617369013e-06,
"loss": 0.4417,
"step": 658
},
{
"epoch": 0.35475405423591955,
"grad_norm": 1.8718328199464012,
"learning_rate": 7.479830889449754e-06,
"loss": 0.4489,
"step": 659
},
{
"epoch": 0.35529237601776464,
"grad_norm": 1.3987482870509058,
"learning_rate": 7.472253615921878e-06,
"loss": 0.5121,
"step": 660
},
{
"epoch": 0.35529237601776464,
"eval_loss": 0.4641415774822235,
"eval_runtime": 1581.4987,
"eval_samples_per_second": 15.814,
"eval_steps_per_second": 0.494,
"step": 660
},
{
"epoch": 0.35583069779960974,
"grad_norm": 1.5856953831241587,
"learning_rate": 7.464668819841453e-06,
"loss": 0.4429,
"step": 661
},
{
"epoch": 0.35636901958145484,
"grad_norm": 1.648655956667231,
"learning_rate": 7.457076524287426e-06,
"loss": 0.4794,
"step": 662
},
{
"epoch": 0.35690734136329993,
"grad_norm": 1.8056054836187343,
"learning_rate": 7.4494767523615754e-06,
"loss": 0.4488,
"step": 663
},
{
"epoch": 0.35744566314514503,
"grad_norm": 1.7062432057396102,
"learning_rate": 7.441869527188421e-06,
"loss": 0.4506,
"step": 664
},
{
"epoch": 0.35798398492699013,
"grad_norm": 1.4819375518870144,
"learning_rate": 7.434254871915166e-06,
"loss": 0.4135,
"step": 665
},
{
"epoch": 0.3585223067088352,
"grad_norm": 1.734074823822691,
"learning_rate": 7.426632809711617e-06,
"loss": 0.4744,
"step": 666
},
{
"epoch": 0.3590606284906803,
"grad_norm": 1.5235626105944915,
"learning_rate": 7.4190033637701216e-06,
"loss": 0.4646,
"step": 667
},
{
"epoch": 0.3595989502725254,
"grad_norm": 1.9128329967338416,
"learning_rate": 7.411366557305495e-06,
"loss": 0.4626,
"step": 668
},
{
"epoch": 0.3601372720543705,
"grad_norm": 2.5022708068016097,
"learning_rate": 7.403722413554947e-06,
"loss": 0.4959,
"step": 669
},
{
"epoch": 0.3606755938362156,
"grad_norm": 1.8966801972869858,
"learning_rate": 7.396070955778013e-06,
"loss": 0.45,
"step": 670
},
{
"epoch": 0.3612139156180607,
"grad_norm": 2.061313497940433,
"learning_rate": 7.388412207256486e-06,
"loss": 0.4961,
"step": 671
},
{
"epoch": 0.3617522373999058,
"grad_norm": 1.6720715956995327,
"learning_rate": 7.380746191294341e-06,
"loss": 0.4667,
"step": 672
},
{
"epoch": 0.3622905591817509,
"grad_norm": 1.5487990630837682,
"learning_rate": 7.373072931217669e-06,
"loss": 0.527,
"step": 673
},
{
"epoch": 0.362828880963596,
"grad_norm": 1.4996736955806738,
"learning_rate": 7.365392450374598e-06,
"loss": 0.4353,
"step": 674
},
{
"epoch": 0.3633672027454411,
"grad_norm": 1.6372189463929279,
"learning_rate": 7.357704772135231e-06,
"loss": 0.469,
"step": 675
},
{
"epoch": 0.3639055245272862,
"grad_norm": 1.5447454253844684,
"learning_rate": 7.350009919891574e-06,
"loss": 0.4278,
"step": 676
},
{
"epoch": 0.3644438463091313,
"grad_norm": 1.4107385578994651,
"learning_rate": 7.342307917057457e-06,
"loss": 0.44,
"step": 677
},
{
"epoch": 0.3649821680909764,
"grad_norm": 1.4950963156286234,
"learning_rate": 7.334598787068469e-06,
"loss": 0.4529,
"step": 678
},
{
"epoch": 0.3655204898728215,
"grad_norm": 2.047196931688194,
"learning_rate": 7.326882553381886e-06,
"loss": 0.4993,
"step": 679
},
{
"epoch": 0.3660588116546666,
"grad_norm": 1.8078116478641435,
"learning_rate": 7.319159239476601e-06,
"loss": 0.4903,
"step": 680
},
{
"epoch": 0.3665971334365117,
"grad_norm": 1.6585777335125267,
"learning_rate": 7.311428868853047e-06,
"loss": 0.449,
"step": 681
},
{
"epoch": 0.3671354552183568,
"grad_norm": 1.644551492901717,
"learning_rate": 7.30369146503313e-06,
"loss": 0.4359,
"step": 682
},
{
"epoch": 0.3676737770002019,
"grad_norm": 1.566051715226832,
"learning_rate": 7.29594705156016e-06,
"loss": 0.5171,
"step": 683
},
{
"epoch": 0.36821209878204697,
"grad_norm": 1.860361723636211,
"learning_rate": 7.288195651998772e-06,
"loss": 0.5058,
"step": 684
},
{
"epoch": 0.36875042056389207,
"grad_norm": 1.479824820585221,
"learning_rate": 7.280437289934858e-06,
"loss": 0.5082,
"step": 685
},
{
"epoch": 0.36928874234573716,
"grad_norm": 1.5621912841951935,
"learning_rate": 7.272671988975499e-06,
"loss": 0.4861,
"step": 686
},
{
"epoch": 0.36982706412758226,
"grad_norm": 1.6260728405178757,
"learning_rate": 7.264899772748889e-06,
"loss": 0.5003,
"step": 687
},
{
"epoch": 0.37036538590942736,
"grad_norm": 1.5646367035382582,
"learning_rate": 7.2571206649042584e-06,
"loss": 0.4559,
"step": 688
},
{
"epoch": 0.37090370769127246,
"grad_norm": 1.7472551729015091,
"learning_rate": 7.249334689111814e-06,
"loss": 0.4541,
"step": 689
},
{
"epoch": 0.37144202947311755,
"grad_norm": 1.6362939723396042,
"learning_rate": 7.241541869062656e-06,
"loss": 0.4733,
"step": 690
},
{
"epoch": 0.37198035125496265,
"grad_norm": 1.4710809281537391,
"learning_rate": 7.2337422284687135e-06,
"loss": 0.4523,
"step": 691
},
{
"epoch": 0.37251867303680775,
"grad_norm": 1.6849371563467512,
"learning_rate": 7.225935791062665e-06,
"loss": 0.4976,
"step": 692
},
{
"epoch": 0.37305699481865284,
"grad_norm": 1.7850003378424297,
"learning_rate": 7.2181225805978745e-06,
"loss": 0.4482,
"step": 693
},
{
"epoch": 0.37359531660049794,
"grad_norm": 2.355398835881447,
"learning_rate": 7.210302620848315e-06,
"loss": 0.4599,
"step": 694
},
{
"epoch": 0.37413363838234304,
"grad_norm": 1.617194741699657,
"learning_rate": 7.20247593560849e-06,
"loss": 0.4543,
"step": 695
},
{
"epoch": 0.37467196016418813,
"grad_norm": 1.4733355105927,
"learning_rate": 7.1946425486933755e-06,
"loss": 0.4125,
"step": 696
},
{
"epoch": 0.37521028194603323,
"grad_norm": 1.4512303803275823,
"learning_rate": 7.186802483938333e-06,
"loss": 0.4515,
"step": 697
},
{
"epoch": 0.3757486037278783,
"grad_norm": 1.4829224037632613,
"learning_rate": 7.178955765199048e-06,
"loss": 0.475,
"step": 698
},
{
"epoch": 0.3762869255097234,
"grad_norm": 1.4882203445110318,
"learning_rate": 7.171102416351448e-06,
"loss": 0.4485,
"step": 699
},
{
"epoch": 0.3768252472915685,
"grad_norm": 1.6613200067557963,
"learning_rate": 7.163242461291639e-06,
"loss": 0.4402,
"step": 700
},
{
"epoch": 0.3773635690734136,
"grad_norm": 1.7483634690103926,
"learning_rate": 7.155375923935826e-06,
"loss": 0.4936,
"step": 701
},
{
"epoch": 0.3779018908552587,
"grad_norm": 1.6616671629226913,
"learning_rate": 7.14750282822024e-06,
"loss": 0.4644,
"step": 702
},
{
"epoch": 0.3784402126371038,
"grad_norm": 1.5260208283942596,
"learning_rate": 7.139623198101073e-06,
"loss": 0.489,
"step": 703
},
{
"epoch": 0.3789785344189489,
"grad_norm": 1.361965813750003,
"learning_rate": 7.131737057554399e-06,
"loss": 0.3901,
"step": 704
},
{
"epoch": 0.379516856200794,
"grad_norm": 1.620874046214403,
"learning_rate": 7.1238444305760975e-06,
"loss": 0.458,
"step": 705
},
{
"epoch": 0.3800551779826391,
"grad_norm": 1.7744718469804224,
"learning_rate": 7.115945341181789e-06,
"loss": 0.4585,
"step": 706
},
{
"epoch": 0.3805934997644842,
"grad_norm": 1.4959797567409379,
"learning_rate": 7.108039813406755e-06,
"loss": 0.4497,
"step": 707
},
{
"epoch": 0.3811318215463293,
"grad_norm": 1.645088668489625,
"learning_rate": 7.10012787130587e-06,
"loss": 0.4419,
"step": 708
},
{
"epoch": 0.3816701433281744,
"grad_norm": 1.5908205648141605,
"learning_rate": 7.092209538953527e-06,
"loss": 0.4768,
"step": 709
},
{
"epoch": 0.3822084651100195,
"grad_norm": 1.2865059891101038,
"learning_rate": 7.0842848404435574e-06,
"loss": 0.4432,
"step": 710
},
{
"epoch": 0.3827467868918646,
"grad_norm": 1.438686585698748,
"learning_rate": 7.07635379988917e-06,
"loss": 0.463,
"step": 711
},
{
"epoch": 0.3832851086737097,
"grad_norm": 1.5810030390346108,
"learning_rate": 7.068416441422867e-06,
"loss": 0.4324,
"step": 712
},
{
"epoch": 0.3838234304555548,
"grad_norm": 1.8920886247581228,
"learning_rate": 7.060472789196378e-06,
"loss": 0.4513,
"step": 713
},
{
"epoch": 0.3843617522373999,
"grad_norm": 1.4721512319324748,
"learning_rate": 7.052522867380578e-06,
"loss": 0.4794,
"step": 714
},
{
"epoch": 0.38490007401924503,
"grad_norm": 1.8748283518664401,
"learning_rate": 7.044566700165426e-06,
"loss": 0.5359,
"step": 715
},
{
"epoch": 0.38543839580109013,
"grad_norm": 2.1664339926414247,
"learning_rate": 7.036604311759879e-06,
"loss": 0.4696,
"step": 716
},
{
"epoch": 0.3859767175829352,
"grad_norm": 1.599064767192068,
"learning_rate": 7.028635726391826e-06,
"loss": 0.5009,
"step": 717
},
{
"epoch": 0.3865150393647803,
"grad_norm": 1.658951664965314,
"learning_rate": 7.020660968308011e-06,
"loss": 0.526,
"step": 718
},
{
"epoch": 0.3870533611466254,
"grad_norm": 1.5566803387570707,
"learning_rate": 7.012680061773962e-06,
"loss": 0.4944,
"step": 719
},
{
"epoch": 0.3875916829284705,
"grad_norm": 1.5561052872784167,
"learning_rate": 7.0046930310739145e-06,
"loss": 0.4023,
"step": 720
},
{
"epoch": 0.3875916829284705,
"eval_loss": 0.4598337709903717,
"eval_runtime": 1512.3789,
"eval_samples_per_second": 16.537,
"eval_steps_per_second": 0.517,
"step": 720
},
{
"epoch": 0.3881300047103156,
"grad_norm": 1.5343444055056177,
"learning_rate": 6.996699900510736e-06,
"loss": 0.4661,
"step": 721
},
{
"epoch": 0.3886683264921607,
"grad_norm": 1.5835711750557553,
"learning_rate": 6.988700694405861e-06,
"loss": 0.5243,
"step": 722
},
{
"epoch": 0.3892066482740058,
"grad_norm": 1.739458700941234,
"learning_rate": 6.980695437099203e-06,
"loss": 0.468,
"step": 723
},
{
"epoch": 0.3897449700558509,
"grad_norm": 1.4597418259308022,
"learning_rate": 6.972684152949095e-06,
"loss": 0.4312,
"step": 724
},
{
"epoch": 0.390283291837696,
"grad_norm": 1.4822140659700849,
"learning_rate": 6.964666866332202e-06,
"loss": 0.4171,
"step": 725
},
{
"epoch": 0.3908216136195411,
"grad_norm": 2.219448742321713,
"learning_rate": 6.956643601643459e-06,
"loss": 0.4682,
"step": 726
},
{
"epoch": 0.3913599354013862,
"grad_norm": 1.6249675680199915,
"learning_rate": 6.948614383295988e-06,
"loss": 0.467,
"step": 727
},
{
"epoch": 0.3918982571832313,
"grad_norm": 2.5331886913847916,
"learning_rate": 6.940579235721027e-06,
"loss": 0.5046,
"step": 728
},
{
"epoch": 0.3924365789650764,
"grad_norm": 1.651989792055275,
"learning_rate": 6.932538183367854e-06,
"loss": 0.4432,
"step": 729
},
{
"epoch": 0.3929749007469215,
"grad_norm": 1.4451051204854284,
"learning_rate": 6.924491250703716e-06,
"loss": 0.436,
"step": 730
},
{
"epoch": 0.3935132225287666,
"grad_norm": 1.6726948542569147,
"learning_rate": 6.916438462213756e-06,
"loss": 0.4701,
"step": 731
},
{
"epoch": 0.3940515443106117,
"grad_norm": 1.3458270610890806,
"learning_rate": 6.908379842400926e-06,
"loss": 0.461,
"step": 732
},
{
"epoch": 0.3945898660924568,
"grad_norm": 1.8671906958135296,
"learning_rate": 6.90031541578593e-06,
"loss": 0.4621,
"step": 733
},
{
"epoch": 0.3951281878743019,
"grad_norm": 1.6937643401491398,
"learning_rate": 6.892245206907136e-06,
"loss": 0.4403,
"step": 734
},
{
"epoch": 0.39566650965614697,
"grad_norm": 1.6011629978962008,
"learning_rate": 6.88416924032051e-06,
"loss": 0.4832,
"step": 735
},
{
"epoch": 0.39620483143799207,
"grad_norm": 1.7023847640279732,
"learning_rate": 6.876087540599532e-06,
"loss": 0.4871,
"step": 736
},
{
"epoch": 0.39674315321983716,
"grad_norm": 1.5639503808317925,
"learning_rate": 6.868000132335132e-06,
"loss": 0.504,
"step": 737
},
{
"epoch": 0.39728147500168226,
"grad_norm": 1.6209519657967315,
"learning_rate": 6.859907040135609e-06,
"loss": 0.4947,
"step": 738
},
{
"epoch": 0.39781979678352736,
"grad_norm": 1.4902231086791655,
"learning_rate": 6.851808288626554e-06,
"loss": 0.4329,
"step": 739
},
{
"epoch": 0.39835811856537245,
"grad_norm": 1.4751989923406863,
"learning_rate": 6.843703902450781e-06,
"loss": 0.469,
"step": 740
},
{
"epoch": 0.39889644034721755,
"grad_norm": 1.7318655949983495,
"learning_rate": 6.8355939062682485e-06,
"loss": 0.4646,
"step": 741
},
{
"epoch": 0.39943476212906265,
"grad_norm": 2.0477062374958312,
"learning_rate": 6.827478324755986e-06,
"loss": 0.4527,
"step": 742
},
{
"epoch": 0.39997308391090775,
"grad_norm": 1.5357049173396753,
"learning_rate": 6.819357182608014e-06,
"loss": 0.4119,
"step": 743
},
{
"epoch": 0.40051140569275284,
"grad_norm": 1.6669074072618764,
"learning_rate": 6.811230504535276e-06,
"loss": 0.4123,
"step": 744
},
{
"epoch": 0.40104972747459794,
"grad_norm": 2.0238793916536095,
"learning_rate": 6.803098315265563e-06,
"loss": 0.4607,
"step": 745
},
{
"epoch": 0.40158804925644304,
"grad_norm": 1.7302550872159141,
"learning_rate": 6.7949606395434294e-06,
"loss": 0.5252,
"step": 746
},
{
"epoch": 0.40212637103828813,
"grad_norm": 1.5575167275155066,
"learning_rate": 6.786817502130127e-06,
"loss": 0.4484,
"step": 747
},
{
"epoch": 0.40266469282013323,
"grad_norm": 1.3960320100955355,
"learning_rate": 6.778668927803526e-06,
"loss": 0.444,
"step": 748
},
{
"epoch": 0.4032030146019783,
"grad_norm": 1.5537207671933355,
"learning_rate": 6.770514941358041e-06,
"loss": 0.4522,
"step": 749
},
{
"epoch": 0.4037413363838234,
"grad_norm": 1.6191186519608955,
"learning_rate": 6.762355567604553e-06,
"loss": 0.489,
"step": 750
},
{
"epoch": 0.4042796581656685,
"grad_norm": 1.7320364851332162,
"learning_rate": 6.7541908313703355e-06,
"loss": 0.4746,
"step": 751
},
{
"epoch": 0.4048179799475136,
"grad_norm": 1.5268044530623444,
"learning_rate": 6.746020757498979e-06,
"loss": 0.4138,
"step": 752
},
{
"epoch": 0.4053563017293587,
"grad_norm": 1.522928297135606,
"learning_rate": 6.737845370850317e-06,
"loss": 0.4938,
"step": 753
},
{
"epoch": 0.4058946235112038,
"grad_norm": 1.567608770456755,
"learning_rate": 6.729664696300347e-06,
"loss": 0.4745,
"step": 754
},
{
"epoch": 0.4064329452930489,
"grad_norm": 1.5048680773669196,
"learning_rate": 6.721478758741155e-06,
"loss": 0.4714,
"step": 755
},
{
"epoch": 0.406971267074894,
"grad_norm": 1.7508536934704277,
"learning_rate": 6.713287583080845e-06,
"loss": 0.4778,
"step": 756
},
{
"epoch": 0.4075095888567391,
"grad_norm": 1.6217945250756625,
"learning_rate": 6.70509119424346e-06,
"loss": 0.4529,
"step": 757
},
{
"epoch": 0.4080479106385842,
"grad_norm": 1.6092594479977214,
"learning_rate": 6.696889617168897e-06,
"loss": 0.4674,
"step": 758
},
{
"epoch": 0.4085862324204293,
"grad_norm": 1.5153766468742507,
"learning_rate": 6.688682876812851e-06,
"loss": 0.4612,
"step": 759
},
{
"epoch": 0.4091245542022744,
"grad_norm": 1.6200362705011053,
"learning_rate": 6.6804709981467195e-06,
"loss": 0.4812,
"step": 760
},
{
"epoch": 0.4096628759841195,
"grad_norm": 1.6047382022765324,
"learning_rate": 6.672254006157541e-06,
"loss": 0.4758,
"step": 761
},
{
"epoch": 0.4102011977659646,
"grad_norm": 1.8520426373676713,
"learning_rate": 6.664031925847908e-06,
"loss": 0.4184,
"step": 762
},
{
"epoch": 0.4107395195478097,
"grad_norm": 2.2658987317474195,
"learning_rate": 6.6558047822358975e-06,
"loss": 0.5178,
"step": 763
},
{
"epoch": 0.4112778413296548,
"grad_norm": 1.580321228406977,
"learning_rate": 6.6475726003549934e-06,
"loss": 0.4249,
"step": 764
},
{
"epoch": 0.4118161631114999,
"grad_norm": 1.4077736219835957,
"learning_rate": 6.639335405254008e-06,
"loss": 0.4586,
"step": 765
},
{
"epoch": 0.412354484893345,
"grad_norm": 1.5112139801178681,
"learning_rate": 6.631093221997012e-06,
"loss": 0.4316,
"step": 766
},
{
"epoch": 0.41289280667519007,
"grad_norm": 1.4529648200398257,
"learning_rate": 6.6228460756632496e-06,
"loss": 0.4571,
"step": 767
},
{
"epoch": 0.41343112845703517,
"grad_norm": 1.826148495373045,
"learning_rate": 6.61459399134707e-06,
"loss": 0.4278,
"step": 768
},
{
"epoch": 0.41396945023888027,
"grad_norm": 1.5179851185666227,
"learning_rate": 6.6063369941578445e-06,
"loss": 0.4622,
"step": 769
},
{
"epoch": 0.41450777202072536,
"grad_norm": 1.3529363726674315,
"learning_rate": 6.5980751092198955e-06,
"loss": 0.4215,
"step": 770
},
{
"epoch": 0.41504609380257046,
"grad_norm": 2.0731132539429944,
"learning_rate": 6.589808361672417e-06,
"loss": 0.484,
"step": 771
},
{
"epoch": 0.4155844155844156,
"grad_norm": 1.4870501106627148,
"learning_rate": 6.581536776669402e-06,
"loss": 0.4863,
"step": 772
},
{
"epoch": 0.4161227373662607,
"grad_norm": 1.9062099501037697,
"learning_rate": 6.5732603793795535e-06,
"loss": 0.4238,
"step": 773
},
{
"epoch": 0.4166610591481058,
"grad_norm": 1.5565227999579219,
"learning_rate": 6.564979194986229e-06,
"loss": 0.4524,
"step": 774
},
{
"epoch": 0.4171993809299509,
"grad_norm": 2.306172957615922,
"learning_rate": 6.5566932486873455e-06,
"loss": 0.4964,
"step": 775
},
{
"epoch": 0.417737702711796,
"grad_norm": 1.401583156601946,
"learning_rate": 6.54840256569531e-06,
"loss": 0.4304,
"step": 776
},
{
"epoch": 0.4182760244936411,
"grad_norm": 1.749412909981746,
"learning_rate": 6.540107171236943e-06,
"loss": 0.4844,
"step": 777
},
{
"epoch": 0.4188143462754862,
"grad_norm": 1.6322807652870075,
"learning_rate": 6.531807090553402e-06,
"loss": 0.4853,
"step": 778
},
{
"epoch": 0.4193526680573313,
"grad_norm": 1.2479234535295218,
"learning_rate": 6.5235023489001046e-06,
"loss": 0.4491,
"step": 779
},
{
"epoch": 0.4198909898391764,
"grad_norm": 1.5833625576839316,
"learning_rate": 6.515192971546645e-06,
"loss": 0.4171,
"step": 780
},
{
"epoch": 0.4198909898391764,
"eval_loss": 0.4564184546470642,
"eval_runtime": 1517.3821,
"eval_samples_per_second": 16.482,
"eval_steps_per_second": 0.515,
"step": 780
},
{
"epoch": 0.4204293116210215,
"grad_norm": 1.5809122747897906,
"learning_rate": 6.50687898377673e-06,
"loss": 0.4087,
"step": 781
},
{
"epoch": 0.4209676334028666,
"grad_norm": 1.5387429096209948,
"learning_rate": 6.49856041088809e-06,
"loss": 0.4414,
"step": 782
},
{
"epoch": 0.4215059551847117,
"grad_norm": 1.6020701369523538,
"learning_rate": 6.49023727819241e-06,
"loss": 0.4237,
"step": 783
},
{
"epoch": 0.4220442769665568,
"grad_norm": 1.6896383664306511,
"learning_rate": 6.481909611015249e-06,
"loss": 0.5049,
"step": 784
},
{
"epoch": 0.42258259874840187,
"grad_norm": 1.4623261927757227,
"learning_rate": 6.47357743469596e-06,
"loss": 0.4513,
"step": 785
},
{
"epoch": 0.42312092053024697,
"grad_norm": 1.8063028002015338,
"learning_rate": 6.465240774587623e-06,
"loss": 0.4917,
"step": 786
},
{
"epoch": 0.42365924231209207,
"grad_norm": 1.639390083578586,
"learning_rate": 6.4568996560569515e-06,
"loss": 0.4578,
"step": 787
},
{
"epoch": 0.42419756409393716,
"grad_norm": 1.337761070121856,
"learning_rate": 6.448554104484236e-06,
"loss": 0.4523,
"step": 788
},
{
"epoch": 0.42473588587578226,
"grad_norm": 1.518872556678575,
"learning_rate": 6.44020414526325e-06,
"loss": 0.4384,
"step": 789
},
{
"epoch": 0.42527420765762736,
"grad_norm": 1.491028002743192,
"learning_rate": 6.431849803801179e-06,
"loss": 0.451,
"step": 790
},
{
"epoch": 0.42581252943947245,
"grad_norm": 2.093042650030991,
"learning_rate": 6.423491105518542e-06,
"loss": 0.4656,
"step": 791
},
{
"epoch": 0.42635085122131755,
"grad_norm": 1.9063256309499805,
"learning_rate": 6.415128075849118e-06,
"loss": 0.4848,
"step": 792
},
{
"epoch": 0.42688917300316265,
"grad_norm": 1.7660120890204227,
"learning_rate": 6.4067607402398625e-06,
"loss": 0.4451,
"step": 793
},
{
"epoch": 0.42742749478500774,
"grad_norm": 1.577961253859089,
"learning_rate": 6.398389124150832e-06,
"loss": 0.485,
"step": 794
},
{
"epoch": 0.42796581656685284,
"grad_norm": 1.6746798086361996,
"learning_rate": 6.3900132530551125e-06,
"loss": 0.4521,
"step": 795
},
{
"epoch": 0.42850413834869794,
"grad_norm": 1.696615006593536,
"learning_rate": 6.381633152438733e-06,
"loss": 0.4406,
"step": 796
},
{
"epoch": 0.42904246013054304,
"grad_norm": 3.213801364228645,
"learning_rate": 6.373248847800595e-06,
"loss": 0.5115,
"step": 797
},
{
"epoch": 0.42958078191238813,
"grad_norm": 1.719986070739237,
"learning_rate": 6.364860364652388e-06,
"loss": 0.4237,
"step": 798
},
{
"epoch": 0.43011910369423323,
"grad_norm": 1.778509802687885,
"learning_rate": 6.3564677285185196e-06,
"loss": 0.4568,
"step": 799
},
{
"epoch": 0.4306574254760783,
"grad_norm": 1.5260126863179546,
"learning_rate": 6.348070964936032e-06,
"loss": 0.4337,
"step": 800
},
{
"epoch": 0.4311957472579234,
"grad_norm": 1.5937231247097972,
"learning_rate": 6.339670099454526e-06,
"loss": 0.4642,
"step": 801
},
{
"epoch": 0.4317340690397685,
"grad_norm": 2.9535392042792465,
"learning_rate": 6.3312651576360866e-06,
"loss": 0.4434,
"step": 802
},
{
"epoch": 0.4322723908216136,
"grad_norm": 1.49472223900728,
"learning_rate": 6.322856165055198e-06,
"loss": 0.4125,
"step": 803
},
{
"epoch": 0.4328107126034587,
"grad_norm": 2.242176131558003,
"learning_rate": 6.314443147298675e-06,
"loss": 0.49,
"step": 804
},
{
"epoch": 0.4333490343853038,
"grad_norm": 1.681655235385771,
"learning_rate": 6.306026129965573e-06,
"loss": 0.4245,
"step": 805
},
{
"epoch": 0.4338873561671489,
"grad_norm": 1.5909295811480582,
"learning_rate": 6.297605138667127e-06,
"loss": 0.4748,
"step": 806
},
{
"epoch": 0.434425677948994,
"grad_norm": 1.5145278838582474,
"learning_rate": 6.289180199026654e-06,
"loss": 0.4578,
"step": 807
},
{
"epoch": 0.4349639997308391,
"grad_norm": 1.459737051246134,
"learning_rate": 6.280751336679495e-06,
"loss": 0.4637,
"step": 808
},
{
"epoch": 0.4355023215126842,
"grad_norm": 1.6191142290587295,
"learning_rate": 6.2723185772729166e-06,
"loss": 0.4582,
"step": 809
},
{
"epoch": 0.4360406432945293,
"grad_norm": 2.0040844342157422,
"learning_rate": 6.263881946466049e-06,
"loss": 0.4783,
"step": 810
},
{
"epoch": 0.4365789650763744,
"grad_norm": 1.7322826082498741,
"learning_rate": 6.255441469929804e-06,
"loss": 0.5002,
"step": 811
},
{
"epoch": 0.4371172868582195,
"grad_norm": 1.4894619670010198,
"learning_rate": 6.2469971733467925e-06,
"loss": 0.4253,
"step": 812
},
{
"epoch": 0.4376556086400646,
"grad_norm": 1.6488111913669299,
"learning_rate": 6.238549082411247e-06,
"loss": 0.4539,
"step": 813
},
{
"epoch": 0.4381939304219097,
"grad_norm": 1.3488898562178637,
"learning_rate": 6.230097222828949e-06,
"loss": 0.4623,
"step": 814
},
{
"epoch": 0.4387322522037548,
"grad_norm": 1.6423043283763479,
"learning_rate": 6.221641620317147e-06,
"loss": 0.4921,
"step": 815
},
{
"epoch": 0.4392705739855999,
"grad_norm": 1.9335639612379423,
"learning_rate": 6.2131823006044756e-06,
"loss": 0.4453,
"step": 816
},
{
"epoch": 0.439808895767445,
"grad_norm": 1.389152591337612,
"learning_rate": 6.2047192894308815e-06,
"loss": 0.4413,
"step": 817
},
{
"epoch": 0.44034721754929007,
"grad_norm": 1.983305422880984,
"learning_rate": 6.196252612547545e-06,
"loss": 0.5093,
"step": 818
},
{
"epoch": 0.44088553933113517,
"grad_norm": 2.053814295705837,
"learning_rate": 6.187782295716802e-06,
"loss": 0.4381,
"step": 819
},
{
"epoch": 0.44142386111298026,
"grad_norm": 1.547864349515979,
"learning_rate": 6.179308364712056e-06,
"loss": 0.4932,
"step": 820
},
{
"epoch": 0.44196218289482536,
"grad_norm": 1.4111506897228125,
"learning_rate": 6.170830845317717e-06,
"loss": 0.4695,
"step": 821
},
{
"epoch": 0.44250050467667046,
"grad_norm": 2.5994615269947485,
"learning_rate": 6.162349763329109e-06,
"loss": 0.5318,
"step": 822
},
{
"epoch": 0.44303882645851556,
"grad_norm": 1.5802737203663468,
"learning_rate": 6.153865144552398e-06,
"loss": 0.4676,
"step": 823
},
{
"epoch": 0.44357714824036065,
"grad_norm": 1.4711770748421387,
"learning_rate": 6.145377014804509e-06,
"loss": 0.4687,
"step": 824
},
{
"epoch": 0.44411547002220575,
"grad_norm": 1.3383114582462243,
"learning_rate": 6.136885399913052e-06,
"loss": 0.4514,
"step": 825
},
{
"epoch": 0.44465379180405085,
"grad_norm": 1.375700143244168,
"learning_rate": 6.1283903257162434e-06,
"loss": 0.4581,
"step": 826
},
{
"epoch": 0.44519211358589594,
"grad_norm": 1.6933351988143874,
"learning_rate": 6.119891818062822e-06,
"loss": 0.4399,
"step": 827
},
{
"epoch": 0.44573043536774104,
"grad_norm": 1.4137670063234855,
"learning_rate": 6.1113899028119764e-06,
"loss": 0.4298,
"step": 828
},
{
"epoch": 0.4462687571495862,
"grad_norm": 1.8781325581931287,
"learning_rate": 6.102884605833262e-06,
"loss": 0.4921,
"step": 829
},
{
"epoch": 0.4468070789314313,
"grad_norm": 1.5329498351981126,
"learning_rate": 6.094375953006527e-06,
"loss": 0.4518,
"step": 830
},
{
"epoch": 0.4473454007132764,
"grad_norm": 1.6692806133274172,
"learning_rate": 6.085863970221827e-06,
"loss": 0.5337,
"step": 831
},
{
"epoch": 0.4478837224951215,
"grad_norm": 1.5092683621943173,
"learning_rate": 6.077348683379351e-06,
"loss": 0.4578,
"step": 832
},
{
"epoch": 0.4484220442769666,
"grad_norm": 1.6510945855973929,
"learning_rate": 6.068830118389345e-06,
"loss": 0.479,
"step": 833
},
{
"epoch": 0.4489603660588117,
"grad_norm": 2.639396623007194,
"learning_rate": 6.060308301172026e-06,
"loss": 0.451,
"step": 834
},
{
"epoch": 0.4494986878406568,
"grad_norm": 1.8709014826106682,
"learning_rate": 6.051783257657508e-06,
"loss": 0.5109,
"step": 835
},
{
"epoch": 0.45003700962250187,
"grad_norm": 2.1325245569205284,
"learning_rate": 6.04325501378572e-06,
"loss": 0.4874,
"step": 836
},
{
"epoch": 0.45057533140434697,
"grad_norm": 1.4972184191802396,
"learning_rate": 6.034723595506334e-06,
"loss": 0.4671,
"step": 837
},
{
"epoch": 0.45111365318619207,
"grad_norm": 1.3179174814289414,
"learning_rate": 6.026189028778675e-06,
"loss": 0.4078,
"step": 838
},
{
"epoch": 0.45165197496803716,
"grad_norm": 1.521198968359238,
"learning_rate": 6.017651339571652e-06,
"loss": 0.4456,
"step": 839
},
{
"epoch": 0.45219029674988226,
"grad_norm": 1.4836797423023151,
"learning_rate": 6.009110553863674e-06,
"loss": 0.4497,
"step": 840
},
{
"epoch": 0.45219029674988226,
"eval_loss": 0.4534289836883545,
"eval_runtime": 1525.9354,
"eval_samples_per_second": 16.39,
"eval_steps_per_second": 0.512,
"step": 840
},
{
"epoch": 0.45272861853172736,
"grad_norm": 1.808617433298175,
"learning_rate": 6.000566697642575e-06,
"loss": 0.435,
"step": 841
},
{
"epoch": 0.45326694031357245,
"grad_norm": 2.008290454012663,
"learning_rate": 5.992019796905524e-06,
"loss": 0.4626,
"step": 842
},
{
"epoch": 0.45380526209541755,
"grad_norm": 1.7710157949578111,
"learning_rate": 5.9834698776589614e-06,
"loss": 0.4311,
"step": 843
},
{
"epoch": 0.45434358387726265,
"grad_norm": 1.6230775011015806,
"learning_rate": 5.9749169659185104e-06,
"loss": 0.4693,
"step": 844
},
{
"epoch": 0.45488190565910774,
"grad_norm": 1.3639464284433171,
"learning_rate": 5.966361087708898e-06,
"loss": 0.4658,
"step": 845
},
{
"epoch": 0.45542022744095284,
"grad_norm": 1.8137146027163404,
"learning_rate": 5.957802269063878e-06,
"loss": 0.4567,
"step": 846
},
{
"epoch": 0.45595854922279794,
"grad_norm": 1.6758956331351547,
"learning_rate": 5.949240536026153e-06,
"loss": 0.467,
"step": 847
},
{
"epoch": 0.45649687100464303,
"grad_norm": 1.5131926980070547,
"learning_rate": 5.940675914647293e-06,
"loss": 0.4106,
"step": 848
},
{
"epoch": 0.45703519278648813,
"grad_norm": 1.5046633719884865,
"learning_rate": 5.9321084309876555e-06,
"loss": 0.4282,
"step": 849
},
{
"epoch": 0.45757351456833323,
"grad_norm": 1.6481158877878923,
"learning_rate": 5.923538111116307e-06,
"loss": 0.4414,
"step": 850
},
{
"epoch": 0.4581118363501783,
"grad_norm": 2.175705374474076,
"learning_rate": 5.914964981110944e-06,
"loss": 0.5038,
"step": 851
},
{
"epoch": 0.4586501581320234,
"grad_norm": 1.748850851161863,
"learning_rate": 5.906389067057819e-06,
"loss": 0.4603,
"step": 852
},
{
"epoch": 0.4591884799138685,
"grad_norm": 1.5440809581743327,
"learning_rate": 5.897810395051646e-06,
"loss": 0.4697,
"step": 853
},
{
"epoch": 0.4597268016957136,
"grad_norm": 1.5332714275032744,
"learning_rate": 5.889228991195539e-06,
"loss": 0.4549,
"step": 854
},
{
"epoch": 0.4602651234775587,
"grad_norm": 1.6246537267152152,
"learning_rate": 5.880644881600921e-06,
"loss": 0.4413,
"step": 855
},
{
"epoch": 0.4608034452594038,
"grad_norm": 1.7384003721983572,
"learning_rate": 5.872058092387449e-06,
"loss": 0.5178,
"step": 856
},
{
"epoch": 0.4613417670412489,
"grad_norm": 1.4306474231507047,
"learning_rate": 5.863468649682933e-06,
"loss": 0.4584,
"step": 857
},
{
"epoch": 0.461880088823094,
"grad_norm": 1.7487008875581123,
"learning_rate": 5.8548765796232565e-06,
"loss": 0.4775,
"step": 858
},
{
"epoch": 0.4624184106049391,
"grad_norm": 1.6200058585564832,
"learning_rate": 5.846281908352299e-06,
"loss": 0.4718,
"step": 859
},
{
"epoch": 0.4629567323867842,
"grad_norm": 1.4993582658806037,
"learning_rate": 5.837684662021856e-06,
"loss": 0.4367,
"step": 860
},
{
"epoch": 0.4634950541686293,
"grad_norm": 1.6215871681690963,
"learning_rate": 5.829084866791551e-06,
"loss": 0.4891,
"step": 861
},
{
"epoch": 0.4640333759504744,
"grad_norm": 1.6479378578126422,
"learning_rate": 5.820482548828773e-06,
"loss": 0.4701,
"step": 862
},
{
"epoch": 0.4645716977323195,
"grad_norm": 1.709497613352161,
"learning_rate": 5.811877734308583e-06,
"loss": 0.4314,
"step": 863
},
{
"epoch": 0.4651100195141646,
"grad_norm": 1.850585526202356,
"learning_rate": 5.803270449413636e-06,
"loss": 0.4399,
"step": 864
},
{
"epoch": 0.4656483412960097,
"grad_norm": 1.4300437023045451,
"learning_rate": 5.7946607203341075e-06,
"loss": 0.4434,
"step": 865
},
{
"epoch": 0.4661866630778548,
"grad_norm": 1.4799373263095972,
"learning_rate": 5.786048573267608e-06,
"loss": 0.4065,
"step": 866
},
{
"epoch": 0.4667249848596999,
"grad_norm": 1.8869037850434587,
"learning_rate": 5.777434034419111e-06,
"loss": 0.4823,
"step": 867
},
{
"epoch": 0.467263306641545,
"grad_norm": 1.720619241457494,
"learning_rate": 5.768817130000857e-06,
"loss": 0.4444,
"step": 868
},
{
"epoch": 0.46780162842339007,
"grad_norm": 1.3809501342652182,
"learning_rate": 5.760197886232292e-06,
"loss": 0.4058,
"step": 869
},
{
"epoch": 0.46833995020523517,
"grad_norm": 1.6474446895806825,
"learning_rate": 5.75157632933998e-06,
"loss": 0.4244,
"step": 870
},
{
"epoch": 0.46887827198708026,
"grad_norm": 1.3347455312904397,
"learning_rate": 5.7429524855575216e-06,
"loss": 0.4509,
"step": 871
},
{
"epoch": 0.46941659376892536,
"grad_norm": 2.4700574740497583,
"learning_rate": 5.7343263811254746e-06,
"loss": 0.4078,
"step": 872
},
{
"epoch": 0.46995491555077046,
"grad_norm": 1.6808144924631037,
"learning_rate": 5.725698042291279e-06,
"loss": 0.445,
"step": 873
},
{
"epoch": 0.47049323733261555,
"grad_norm": 1.6561338534624221,
"learning_rate": 5.717067495309172e-06,
"loss": 0.4626,
"step": 874
},
{
"epoch": 0.47103155911446065,
"grad_norm": 1.4357104359447126,
"learning_rate": 5.708434766440109e-06,
"loss": 0.4253,
"step": 875
},
{
"epoch": 0.47156988089630575,
"grad_norm": 1.5584705980730198,
"learning_rate": 5.699799881951684e-06,
"loss": 0.4326,
"step": 876
},
{
"epoch": 0.47210820267815085,
"grad_norm": 1.6134096232268902,
"learning_rate": 5.691162868118052e-06,
"loss": 0.4361,
"step": 877
},
{
"epoch": 0.47264652445999594,
"grad_norm": 1.4597620039500387,
"learning_rate": 5.682523751219846e-06,
"loss": 0.4009,
"step": 878
},
{
"epoch": 0.47318484624184104,
"grad_norm": 1.6065681327100592,
"learning_rate": 5.673882557544098e-06,
"loss": 0.4859,
"step": 879
},
{
"epoch": 0.47372316802368614,
"grad_norm": 1.5207533993363942,
"learning_rate": 5.665239313384161e-06,
"loss": 0.4281,
"step": 880
},
{
"epoch": 0.47426148980553123,
"grad_norm": 1.4714029139534557,
"learning_rate": 5.656594045039623e-06,
"loss": 0.4364,
"step": 881
},
{
"epoch": 0.47479981158737633,
"grad_norm": 1.7055967072229654,
"learning_rate": 5.647946778816238e-06,
"loss": 0.5044,
"step": 882
},
{
"epoch": 0.4753381333692214,
"grad_norm": 1.7261543220071143,
"learning_rate": 5.639297541025831e-06,
"loss": 0.486,
"step": 883
},
{
"epoch": 0.4758764551510665,
"grad_norm": 1.6626927738024924,
"learning_rate": 5.630646357986232e-06,
"loss": 0.5142,
"step": 884
},
{
"epoch": 0.4764147769329116,
"grad_norm": 1.5653946306822688,
"learning_rate": 5.621993256021188e-06,
"loss": 0.4364,
"step": 885
},
{
"epoch": 0.4769530987147568,
"grad_norm": 1.8026208698346797,
"learning_rate": 5.613338261460287e-06,
"loss": 0.4538,
"step": 886
},
{
"epoch": 0.47749142049660187,
"grad_norm": 1.6799784860946594,
"learning_rate": 5.6046814006388705e-06,
"loss": 0.4644,
"step": 887
},
{
"epoch": 0.47802974227844697,
"grad_norm": 1.4364276865950356,
"learning_rate": 5.596022699897963e-06,
"loss": 0.4051,
"step": 888
},
{
"epoch": 0.47856806406029206,
"grad_norm": 1.6914469502870713,
"learning_rate": 5.587362185584189e-06,
"loss": 0.4871,
"step": 889
},
{
"epoch": 0.47910638584213716,
"grad_norm": 1.4415518156055118,
"learning_rate": 5.578699884049683e-06,
"loss": 0.4429,
"step": 890
},
{
"epoch": 0.47964470762398226,
"grad_norm": 1.4674935937695475,
"learning_rate": 5.570035821652029e-06,
"loss": 0.426,
"step": 891
},
{
"epoch": 0.48018302940582736,
"grad_norm": 2.1147351198112982,
"learning_rate": 5.561370024754161e-06,
"loss": 0.4789,
"step": 892
},
{
"epoch": 0.48072135118767245,
"grad_norm": 1.4253127193278772,
"learning_rate": 5.552702519724294e-06,
"loss": 0.4346,
"step": 893
},
{
"epoch": 0.48125967296951755,
"grad_norm": 3.7503200169998676,
"learning_rate": 5.544033332935838e-06,
"loss": 0.4393,
"step": 894
},
{
"epoch": 0.48179799475136265,
"grad_norm": 2.1079137772003818,
"learning_rate": 5.535362490767323e-06,
"loss": 0.5118,
"step": 895
},
{
"epoch": 0.48233631653320774,
"grad_norm": 2.2185325950005477,
"learning_rate": 5.526690019602315e-06,
"loss": 0.3894,
"step": 896
},
{
"epoch": 0.48287463831505284,
"grad_norm": 1.5274617672885367,
"learning_rate": 5.518015945829337e-06,
"loss": 0.42,
"step": 897
},
{
"epoch": 0.48341296009689794,
"grad_norm": 1.622273471984762,
"learning_rate": 5.509340295841785e-06,
"loss": 0.5112,
"step": 898
},
{
"epoch": 0.48395128187874303,
"grad_norm": 1.5776105686627353,
"learning_rate": 5.500663096037856e-06,
"loss": 0.4577,
"step": 899
},
{
"epoch": 0.48448960366058813,
"grad_norm": 1.4494216604414056,
"learning_rate": 5.491984372820461e-06,
"loss": 0.4585,
"step": 900
},
{
"epoch": 0.48448960366058813,
"eval_loss": 0.4497644305229187,
"eval_runtime": 1526.5252,
"eval_samples_per_second": 16.384,
"eval_steps_per_second": 0.512,
"step": 900
},
{
"epoch": 0.4850279254424332,
"grad_norm": 1.5164622603897875,
"learning_rate": 5.483304152597145e-06,
"loss": 0.4488,
"step": 901
},
{
"epoch": 0.4855662472242783,
"grad_norm": 1.5363015107046971,
"learning_rate": 5.474622461780011e-06,
"loss": 0.424,
"step": 902
},
{
"epoch": 0.4861045690061234,
"grad_norm": 1.5955517741757022,
"learning_rate": 5.465939326785634e-06,
"loss": 0.4544,
"step": 903
},
{
"epoch": 0.4866428907879685,
"grad_norm": 1.879614888686265,
"learning_rate": 5.457254774034983e-06,
"loss": 0.5032,
"step": 904
},
{
"epoch": 0.4871812125698136,
"grad_norm": 1.5621620080191398,
"learning_rate": 5.448568829953344e-06,
"loss": 0.4675,
"step": 905
},
{
"epoch": 0.4877195343516587,
"grad_norm": 1.463009731317384,
"learning_rate": 5.439881520970234e-06,
"loss": 0.5112,
"step": 906
},
{
"epoch": 0.4882578561335038,
"grad_norm": 1.4309448662315376,
"learning_rate": 5.431192873519326e-06,
"loss": 0.4532,
"step": 907
},
{
"epoch": 0.4887961779153489,
"grad_norm": 1.8077348129923718,
"learning_rate": 5.422502914038359e-06,
"loss": 0.4498,
"step": 908
},
{
"epoch": 0.489334499697194,
"grad_norm": 1.770786349097794,
"learning_rate": 5.413811668969072e-06,
"loss": 0.5081,
"step": 909
},
{
"epoch": 0.4898728214790391,
"grad_norm": 1.911624959064584,
"learning_rate": 5.4051191647571126e-06,
"loss": 0.4297,
"step": 910
},
{
"epoch": 0.4904111432608842,
"grad_norm": 2.238598280094612,
"learning_rate": 5.396425427851958e-06,
"loss": 0.4722,
"step": 911
},
{
"epoch": 0.4909494650427293,
"grad_norm": 1.7184560772593453,
"learning_rate": 5.387730484706839e-06,
"loss": 0.4778,
"step": 912
},
{
"epoch": 0.4914877868245744,
"grad_norm": 1.452205930174256,
"learning_rate": 5.3790343617786555e-06,
"loss": 0.4233,
"step": 913
},
{
"epoch": 0.4920261086064195,
"grad_norm": 1.6315132839706739,
"learning_rate": 5.3703370855278995e-06,
"loss": 0.4429,
"step": 914
},
{
"epoch": 0.4925644303882646,
"grad_norm": 2.1202501474227984,
"learning_rate": 5.361638682418565e-06,
"loss": 0.461,
"step": 915
},
{
"epoch": 0.4931027521701097,
"grad_norm": 1.4850726589476337,
"learning_rate": 5.352939178918084e-06,
"loss": 0.5053,
"step": 916
},
{
"epoch": 0.4936410739519548,
"grad_norm": 2.5715760460764505,
"learning_rate": 5.344238601497231e-06,
"loss": 0.523,
"step": 917
},
{
"epoch": 0.4941793957337999,
"grad_norm": 1.6641597075498922,
"learning_rate": 5.335536976630052e-06,
"loss": 0.4452,
"step": 918
},
{
"epoch": 0.494717717515645,
"grad_norm": 1.579954501546705,
"learning_rate": 5.326834330793775e-06,
"loss": 0.4365,
"step": 919
},
{
"epoch": 0.49525603929749007,
"grad_norm": 1.8639771696751175,
"learning_rate": 5.318130690468741e-06,
"loss": 0.4956,
"step": 920
},
{
"epoch": 0.49579436107933517,
"grad_norm": 1.6264721082016091,
"learning_rate": 5.309426082138311e-06,
"loss": 0.4592,
"step": 921
},
{
"epoch": 0.49633268286118026,
"grad_norm": 1.624012882860616,
"learning_rate": 5.300720532288798e-06,
"loss": 0.437,
"step": 922
},
{
"epoch": 0.49687100464302536,
"grad_norm": 1.6131788103239653,
"learning_rate": 5.29201406740937e-06,
"loss": 0.4335,
"step": 923
},
{
"epoch": 0.49740932642487046,
"grad_norm": 1.4350753111666732,
"learning_rate": 5.28330671399199e-06,
"loss": 0.4462,
"step": 924
},
{
"epoch": 0.49794764820671555,
"grad_norm": 1.9075044926150524,
"learning_rate": 5.274598498531318e-06,
"loss": 0.5123,
"step": 925
},
{
"epoch": 0.49848596998856065,
"grad_norm": 2.2955162228107233,
"learning_rate": 5.265889447524641e-06,
"loss": 0.4649,
"step": 926
},
{
"epoch": 0.49902429177040575,
"grad_norm": 1.8752294916309997,
"learning_rate": 5.257179587471784e-06,
"loss": 0.4339,
"step": 927
},
{
"epoch": 0.49956261355225084,
"grad_norm": 1.776206864828494,
"learning_rate": 5.248468944875036e-06,
"loss": 0.4047,
"step": 928
},
{
"epoch": 0.5001009353340959,
"grad_norm": 1.6863520776370677,
"learning_rate": 5.239757546239069e-06,
"loss": 0.4041,
"step": 929
},
{
"epoch": 0.500639257115941,
"grad_norm": 1.6004117617835396,
"learning_rate": 5.231045418070852e-06,
"loss": 0.4026,
"step": 930
},
{
"epoch": 0.5011775788977861,
"grad_norm": 1.6497898215404967,
"learning_rate": 5.222332586879576e-06,
"loss": 0.4953,
"step": 931
},
{
"epoch": 0.5017159006796312,
"grad_norm": 1.6264336562152901,
"learning_rate": 5.2136190791765714e-06,
"loss": 0.4697,
"step": 932
},
{
"epoch": 0.5022542224614763,
"grad_norm": 1.4687648507656423,
"learning_rate": 5.204904921475226e-06,
"loss": 0.4608,
"step": 933
},
{
"epoch": 0.5027925442433214,
"grad_norm": 1.555407852307028,
"learning_rate": 5.196190140290905e-06,
"loss": 0.4191,
"step": 934
},
{
"epoch": 0.5033308660251665,
"grad_norm": 1.6926089059266405,
"learning_rate": 5.1874747621408705e-06,
"loss": 0.4034,
"step": 935
},
{
"epoch": 0.5038691878070116,
"grad_norm": 1.5853166612648868,
"learning_rate": 5.178758813544203e-06,
"loss": 0.4288,
"step": 936
},
{
"epoch": 0.5044075095888567,
"grad_norm": 1.5462488708677307,
"learning_rate": 5.170042321021721e-06,
"loss": 0.5049,
"step": 937
},
{
"epoch": 0.5049458313707018,
"grad_norm": 1.6860561151031408,
"learning_rate": 5.161325311095889e-06,
"loss": 0.4673,
"step": 938
},
{
"epoch": 0.5054841531525469,
"grad_norm": 1.603506680608381,
"learning_rate": 5.1526078102907565e-06,
"loss": 0.4613,
"step": 939
},
{
"epoch": 0.506022474934392,
"grad_norm": 1.7493626988274396,
"learning_rate": 5.143889845131859e-06,
"loss": 0.4563,
"step": 940
},
{
"epoch": 0.5065607967162371,
"grad_norm": 1.7677497007408356,
"learning_rate": 5.135171442146147e-06,
"loss": 0.4389,
"step": 941
},
{
"epoch": 0.5070991184980822,
"grad_norm": 1.7686507376112643,
"learning_rate": 5.126452627861906e-06,
"loss": 0.469,
"step": 942
},
{
"epoch": 0.5076374402799273,
"grad_norm": 2.03881052798833,
"learning_rate": 5.117733428808671e-06,
"loss": 0.473,
"step": 943
},
{
"epoch": 0.5081757620617724,
"grad_norm": 1.5924723958151055,
"learning_rate": 5.109013871517148e-06,
"loss": 0.4449,
"step": 944
},
{
"epoch": 0.5087140838436175,
"grad_norm": 1.787982594535362,
"learning_rate": 5.10029398251913e-06,
"loss": 0.4575,
"step": 945
},
{
"epoch": 0.5092524056254626,
"grad_norm": 1.8443122029947836,
"learning_rate": 5.091573788347424e-06,
"loss": 0.4825,
"step": 946
},
{
"epoch": 0.5097907274073077,
"grad_norm": 1.5660114035251782,
"learning_rate": 5.082853315535764e-06,
"loss": 0.4705,
"step": 947
},
{
"epoch": 0.5103290491891528,
"grad_norm": 1.4015195298555256,
"learning_rate": 5.074132590618731e-06,
"loss": 0.4222,
"step": 948
},
{
"epoch": 0.5108673709709979,
"grad_norm": 1.6261999654731143,
"learning_rate": 5.065411640131672e-06,
"loss": 0.4172,
"step": 949
},
{
"epoch": 0.511405692752843,
"grad_norm": 1.6580955314247148,
"learning_rate": 5.0566904906106254e-06,
"loss": 0.4803,
"step": 950
},
{
"epoch": 0.5119440145346881,
"grad_norm": 1.6882580545035042,
"learning_rate": 5.047969168592229e-06,
"loss": 0.4959,
"step": 951
},
{
"epoch": 0.5124823363165332,
"grad_norm": 1.2734853203083423,
"learning_rate": 5.039247700613649e-06,
"loss": 0.4532,
"step": 952
},
{
"epoch": 0.5130206580983783,
"grad_norm": 1.6598696282615735,
"learning_rate": 5.030526113212494e-06,
"loss": 0.4443,
"step": 953
},
{
"epoch": 0.5135589798802234,
"grad_norm": 1.555381309193185,
"learning_rate": 5.021804432926739e-06,
"loss": 0.4704,
"step": 954
},
{
"epoch": 0.5140973016620685,
"grad_norm": 1.5525351037863324,
"learning_rate": 5.013082686294639e-06,
"loss": 0.4373,
"step": 955
},
{
"epoch": 0.5146356234439136,
"grad_norm": 1.5575470355469987,
"learning_rate": 5.00436089985465e-06,
"loss": 0.4242,
"step": 956
},
{
"epoch": 0.5151739452257587,
"grad_norm": 1.7457061624641392,
"learning_rate": 4.995639100145352e-06,
"loss": 0.4685,
"step": 957
},
{
"epoch": 0.5157122670076038,
"grad_norm": 1.6284837184280405,
"learning_rate": 4.9869173137053625e-06,
"loss": 0.4702,
"step": 958
},
{
"epoch": 0.5162505887894488,
"grad_norm": 2.191085743474062,
"learning_rate": 4.978195567073262e-06,
"loss": 0.5185,
"step": 959
},
{
"epoch": 0.516788910571294,
"grad_norm": 1.5407588424547343,
"learning_rate": 4.969473886787507e-06,
"loss": 0.505,
"step": 960
},
{
"epoch": 0.516788910571294,
"eval_loss": 0.44528621435165405,
"eval_runtime": 1532.2971,
"eval_samples_per_second": 16.322,
"eval_steps_per_second": 0.51,
"step": 960
},
{
"epoch": 0.517327232353139,
"grad_norm": 1.7214959560480187,
"learning_rate": 4.960752299386353e-06,
"loss": 0.4826,
"step": 961
},
{
"epoch": 0.5178655541349841,
"grad_norm": 1.5649628360297678,
"learning_rate": 4.9520308314077726e-06,
"loss": 0.4224,
"step": 962
},
{
"epoch": 0.5184038759168292,
"grad_norm": 1.6424636557347856,
"learning_rate": 4.943309509389377e-06,
"loss": 0.4148,
"step": 963
},
{
"epoch": 0.5189421976986743,
"grad_norm": 1.98993484637264,
"learning_rate": 4.934588359868329e-06,
"loss": 0.4307,
"step": 964
},
{
"epoch": 0.5194805194805194,
"grad_norm": 2.0804456077787123,
"learning_rate": 4.92586740938127e-06,
"loss": 0.4108,
"step": 965
},
{
"epoch": 0.5200188412623645,
"grad_norm": 1.748710199317067,
"learning_rate": 4.917146684464238e-06,
"loss": 0.4567,
"step": 966
},
{
"epoch": 0.5205571630442096,
"grad_norm": 1.4755067360374794,
"learning_rate": 4.908426211652577e-06,
"loss": 0.4523,
"step": 967
},
{
"epoch": 0.5210954848260547,
"grad_norm": 1.6340640272431366,
"learning_rate": 4.899706017480872e-06,
"loss": 0.4697,
"step": 968
},
{
"epoch": 0.5216338066078998,
"grad_norm": 1.5338487326156454,
"learning_rate": 4.890986128482854e-06,
"loss": 0.4108,
"step": 969
},
{
"epoch": 0.5221721283897449,
"grad_norm": 1.4204187507894679,
"learning_rate": 4.88226657119133e-06,
"loss": 0.4175,
"step": 970
},
{
"epoch": 0.52271045017159,
"grad_norm": 1.4916766712552136,
"learning_rate": 4.873547372138095e-06,
"loss": 0.4274,
"step": 971
},
{
"epoch": 0.5232487719534352,
"grad_norm": 1.514306526603469,
"learning_rate": 4.864828557853854e-06,
"loss": 0.4745,
"step": 972
},
{
"epoch": 0.5237870937352803,
"grad_norm": 1.774262113242822,
"learning_rate": 4.856110154868143e-06,
"loss": 0.4172,
"step": 973
},
{
"epoch": 0.5243254155171254,
"grad_norm": 1.4311594537408503,
"learning_rate": 4.847392189709246e-06,
"loss": 0.4499,
"step": 974
},
{
"epoch": 0.5248637372989705,
"grad_norm": 2.045966100772589,
"learning_rate": 4.8386746889041116e-06,
"loss": 0.496,
"step": 975
},
{
"epoch": 0.5254020590808156,
"grad_norm": 1.3914439869095196,
"learning_rate": 4.82995767897828e-06,
"loss": 0.4068,
"step": 976
},
{
"epoch": 0.5259403808626607,
"grad_norm": 1.3260222946498679,
"learning_rate": 4.8212411864557975e-06,
"loss": 0.4344,
"step": 977
},
{
"epoch": 0.5264787026445058,
"grad_norm": 1.7672350290368148,
"learning_rate": 4.812525237859131e-06,
"loss": 0.4647,
"step": 978
},
{
"epoch": 0.5270170244263509,
"grad_norm": 1.5287264304361414,
"learning_rate": 4.803809859709097e-06,
"loss": 0.4406,
"step": 979
},
{
"epoch": 0.527555346208196,
"grad_norm": 1.5180822455976997,
"learning_rate": 4.795095078524775e-06,
"loss": 0.4462,
"step": 980
},
{
"epoch": 0.5280936679900411,
"grad_norm": 1.5390017294524125,
"learning_rate": 4.78638092082343e-06,
"loss": 0.4427,
"step": 981
},
{
"epoch": 0.5286319897718862,
"grad_norm": 1.8490518419390272,
"learning_rate": 4.777667413120425e-06,
"loss": 0.4716,
"step": 982
},
{
"epoch": 0.5291703115537313,
"grad_norm": 1.9241747880139426,
"learning_rate": 4.7689545819291484e-06,
"loss": 0.4471,
"step": 983
},
{
"epoch": 0.5297086333355764,
"grad_norm": 1.5723366516079713,
"learning_rate": 4.760242453760932e-06,
"loss": 0.3616,
"step": 984
},
{
"epoch": 0.5302469551174215,
"grad_norm": 2.125474240340618,
"learning_rate": 4.751531055124965e-06,
"loss": 0.4567,
"step": 985
},
{
"epoch": 0.5307852768992666,
"grad_norm": 1.5872857045985345,
"learning_rate": 4.742820412528217e-06,
"loss": 0.4311,
"step": 986
},
{
"epoch": 0.5313235986811117,
"grad_norm": 1.5991351116825514,
"learning_rate": 4.73411055247536e-06,
"loss": 0.4572,
"step": 987
},
{
"epoch": 0.5318619204629568,
"grad_norm": 1.5620726404348677,
"learning_rate": 4.725401501468683e-06,
"loss": 0.4299,
"step": 988
},
{
"epoch": 0.5324002422448019,
"grad_norm": 1.6599112973852914,
"learning_rate": 4.716693286008011e-06,
"loss": 0.4444,
"step": 989
},
{
"epoch": 0.532938564026647,
"grad_norm": 1.7825302359359856,
"learning_rate": 4.707985932590631e-06,
"loss": 0.4321,
"step": 990
},
{
"epoch": 0.5334768858084921,
"grad_norm": 1.5739707930921258,
"learning_rate": 4.699279467711204e-06,
"loss": 0.4567,
"step": 991
},
{
"epoch": 0.5340152075903372,
"grad_norm": 1.5857670482566744,
"learning_rate": 4.69057391786169e-06,
"loss": 0.4312,
"step": 992
},
{
"epoch": 0.5345535293721823,
"grad_norm": 1.3615110605746865,
"learning_rate": 4.68186930953126e-06,
"loss": 0.376,
"step": 993
},
{
"epoch": 0.5350918511540274,
"grad_norm": 1.4263273424189502,
"learning_rate": 4.673165669206226e-06,
"loss": 0.4424,
"step": 994
},
{
"epoch": 0.5356301729358725,
"grad_norm": 2.8748098476059933,
"learning_rate": 4.6644630233699495e-06,
"loss": 0.4828,
"step": 995
},
{
"epoch": 0.5361684947177175,
"grad_norm": 1.7530111025052908,
"learning_rate": 4.65576139850277e-06,
"loss": 0.4565,
"step": 996
},
{
"epoch": 0.5367068164995626,
"grad_norm": 1.625700838321751,
"learning_rate": 4.647060821081918e-06,
"loss": 0.4397,
"step": 997
},
{
"epoch": 0.5372451382814077,
"grad_norm": 1.7382100638812064,
"learning_rate": 4.638361317581437e-06,
"loss": 0.4701,
"step": 998
},
{
"epoch": 0.5377834600632528,
"grad_norm": 2.153555864190946,
"learning_rate": 4.629662914472103e-06,
"loss": 0.45,
"step": 999
},
{
"epoch": 0.5383217818450979,
"grad_norm": 1.6756544006397587,
"learning_rate": 4.620965638221346e-06,
"loss": 0.4373,
"step": 1000
},
{
"epoch": 0.538860103626943,
"grad_norm": 2.115872641463188,
"learning_rate": 4.612269515293162e-06,
"loss": 0.4807,
"step": 1001
},
{
"epoch": 0.5393984254087881,
"grad_norm": 1.7162266935661588,
"learning_rate": 4.603574572148043e-06,
"loss": 0.4231,
"step": 1002
},
{
"epoch": 0.5399367471906332,
"grad_norm": 1.828685276454168,
"learning_rate": 4.59488083524289e-06,
"loss": 0.4405,
"step": 1003
},
{
"epoch": 0.5404750689724783,
"grad_norm": 1.6864896839159536,
"learning_rate": 4.58618833103093e-06,
"loss": 0.4144,
"step": 1004
},
{
"epoch": 0.5410133907543234,
"grad_norm": 1.4876643937775926,
"learning_rate": 4.5774970859616426e-06,
"loss": 0.4628,
"step": 1005
},
{
"epoch": 0.5415517125361685,
"grad_norm": 1.5038750034441302,
"learning_rate": 4.568807126480676e-06,
"loss": 0.4595,
"step": 1006
},
{
"epoch": 0.5420900343180136,
"grad_norm": 1.3366252716503892,
"learning_rate": 4.560118479029768e-06,
"loss": 0.4447,
"step": 1007
},
{
"epoch": 0.5426283560998587,
"grad_norm": 1.5955474786951926,
"learning_rate": 4.5514311700466575e-06,
"loss": 0.4731,
"step": 1008
},
{
"epoch": 0.5431666778817038,
"grad_norm": 1.415371321661975,
"learning_rate": 4.5427452259650185e-06,
"loss": 0.4565,
"step": 1009
},
{
"epoch": 0.5437049996635489,
"grad_norm": 1.414837591715847,
"learning_rate": 4.534060673214367e-06,
"loss": 0.439,
"step": 1010
},
{
"epoch": 0.544243321445394,
"grad_norm": 1.6390543819341332,
"learning_rate": 4.525377538219991e-06,
"loss": 0.4434,
"step": 1011
},
{
"epoch": 0.5447816432272391,
"grad_norm": 1.9027726313032218,
"learning_rate": 4.516695847402857e-06,
"loss": 0.4841,
"step": 1012
},
{
"epoch": 0.5453199650090842,
"grad_norm": 1.6549184700101718,
"learning_rate": 4.50801562717954e-06,
"loss": 0.4187,
"step": 1013
},
{
"epoch": 0.5458582867909293,
"grad_norm": 1.672495923944031,
"learning_rate": 4.499336903962146e-06,
"loss": 0.461,
"step": 1014
},
{
"epoch": 0.5463966085727744,
"grad_norm": 1.9002456572131434,
"learning_rate": 4.490659704158218e-06,
"loss": 0.4305,
"step": 1015
},
{
"epoch": 0.5469349303546195,
"grad_norm": 1.3438622389285284,
"learning_rate": 4.481984054170666e-06,
"loss": 0.4569,
"step": 1016
},
{
"epoch": 0.5474732521364646,
"grad_norm": 1.6738782134152472,
"learning_rate": 4.473309980397686e-06,
"loss": 0.4574,
"step": 1017
},
{
"epoch": 0.5480115739183097,
"grad_norm": 1.410079098904291,
"learning_rate": 4.464637509232679e-06,
"loss": 0.4616,
"step": 1018
},
{
"epoch": 0.5485498957001548,
"grad_norm": 1.5059024241541985,
"learning_rate": 4.455966667064164e-06,
"loss": 0.4257,
"step": 1019
},
{
"epoch": 0.5490882174819999,
"grad_norm": 1.8743979543800648,
"learning_rate": 4.447297480275708e-06,
"loss": 0.4468,
"step": 1020
},
{
"epoch": 0.5490882174819999,
"eval_loss": 0.44231292605400085,
"eval_runtime": 1542.3429,
"eval_samples_per_second": 16.216,
"eval_steps_per_second": 0.507,
"step": 1020
},
{
"epoch": 0.549626539263845,
"grad_norm": 2.326652305551719,
"learning_rate": 4.4386299752458405e-06,
"loss": 0.5123,
"step": 1021
},
{
"epoch": 0.5501648610456901,
"grad_norm": 1.5214313173590028,
"learning_rate": 4.429964178347973e-06,
"loss": 0.4525,
"step": 1022
},
{
"epoch": 0.5507031828275352,
"grad_norm": 1.578588355929213,
"learning_rate": 4.4213001159503185e-06,
"loss": 0.4511,
"step": 1023
},
{
"epoch": 0.5512415046093803,
"grad_norm": 1.5736153928065848,
"learning_rate": 4.4126378144158145e-06,
"loss": 0.402,
"step": 1024
},
{
"epoch": 0.5517798263912254,
"grad_norm": 1.4881049360513776,
"learning_rate": 4.4039773001020394e-06,
"loss": 0.4312,
"step": 1025
},
{
"epoch": 0.5523181481730705,
"grad_norm": 1.5453517436989277,
"learning_rate": 4.395318599361133e-06,
"loss": 0.4297,
"step": 1026
},
{
"epoch": 0.5528564699549156,
"grad_norm": 1.7401645944762647,
"learning_rate": 4.386661738539716e-06,
"loss": 0.4021,
"step": 1027
},
{
"epoch": 0.5533947917367606,
"grad_norm": 1.6594295806955806,
"learning_rate": 4.3780067439788125e-06,
"loss": 0.3936,
"step": 1028
},
{
"epoch": 0.5539331135186057,
"grad_norm": 1.4018911995650016,
"learning_rate": 4.3693536420137704e-06,
"loss": 0.4208,
"step": 1029
},
{
"epoch": 0.5544714353004508,
"grad_norm": 1.554369257290078,
"learning_rate": 4.360702458974172e-06,
"loss": 0.3869,
"step": 1030
},
{
"epoch": 0.5550097570822959,
"grad_norm": 1.7013778785431986,
"learning_rate": 4.3520532211837645e-06,
"loss": 0.4557,
"step": 1031
},
{
"epoch": 0.555548078864141,
"grad_norm": 1.5141795112180816,
"learning_rate": 4.343405954960378e-06,
"loss": 0.437,
"step": 1032
},
{
"epoch": 0.5560864006459861,
"grad_norm": 1.6876343830074998,
"learning_rate": 4.334760686615842e-06,
"loss": 0.4632,
"step": 1033
},
{
"epoch": 0.5566247224278312,
"grad_norm": 1.7137409506750598,
"learning_rate": 4.326117442455904e-06,
"loss": 0.451,
"step": 1034
},
{
"epoch": 0.5571630442096763,
"grad_norm": 2.2054388725094993,
"learning_rate": 4.3174762487801554e-06,
"loss": 0.4845,
"step": 1035
},
{
"epoch": 0.5577013659915214,
"grad_norm": 1.4514781472802996,
"learning_rate": 4.30883713188195e-06,
"loss": 0.4713,
"step": 1036
},
{
"epoch": 0.5582396877733665,
"grad_norm": 1.3155208362445518,
"learning_rate": 4.300200118048318e-06,
"loss": 0.4048,
"step": 1037
},
{
"epoch": 0.5587780095552116,
"grad_norm": 1.7594624250292574,
"learning_rate": 4.291565233559893e-06,
"loss": 0.4719,
"step": 1038
},
{
"epoch": 0.5593163313370567,
"grad_norm": 1.5899320924503517,
"learning_rate": 4.282932504690829e-06,
"loss": 0.4889,
"step": 1039
},
{
"epoch": 0.5598546531189018,
"grad_norm": 1.5400899090595648,
"learning_rate": 4.274301957708723e-06,
"loss": 0.48,
"step": 1040
},
{
"epoch": 0.5603929749007469,
"grad_norm": 1.9340975529821163,
"learning_rate": 4.265673618874527e-06,
"loss": 0.4558,
"step": 1041
},
{
"epoch": 0.560931296682592,
"grad_norm": 1.1875057467361612,
"learning_rate": 4.257047514442481e-06,
"loss": 0.4308,
"step": 1042
},
{
"epoch": 0.5614696184644371,
"grad_norm": 1.7255919834039524,
"learning_rate": 4.248423670660022e-06,
"loss": 0.4637,
"step": 1043
},
{
"epoch": 0.5620079402462822,
"grad_norm": 1.552937296818888,
"learning_rate": 4.239802113767711e-06,
"loss": 0.5167,
"step": 1044
},
{
"epoch": 0.5625462620281273,
"grad_norm": 1.4241418668403774,
"learning_rate": 4.231182869999146e-06,
"loss": 0.4262,
"step": 1045
},
{
"epoch": 0.5630845838099724,
"grad_norm": 1.4079020132555902,
"learning_rate": 4.222565965580892e-06,
"loss": 0.4527,
"step": 1046
},
{
"epoch": 0.5636229055918175,
"grad_norm": 1.3617602268653886,
"learning_rate": 4.2139514267323925e-06,
"loss": 0.4546,
"step": 1047
},
{
"epoch": 0.5641612273736626,
"grad_norm": 1.5838734348735288,
"learning_rate": 4.205339279665895e-06,
"loss": 0.3903,
"step": 1048
},
{
"epoch": 0.5646995491555077,
"grad_norm": 1.451984176062728,
"learning_rate": 4.196729550586367e-06,
"loss": 0.4211,
"step": 1049
},
{
"epoch": 0.5652378709373528,
"grad_norm": 1.5454288468811321,
"learning_rate": 4.18812226569142e-06,
"loss": 0.3856,
"step": 1050
},
{
"epoch": 0.5657761927191979,
"grad_norm": 1.6143068691418476,
"learning_rate": 4.17951745117123e-06,
"loss": 0.4137,
"step": 1051
},
{
"epoch": 0.566314514501043,
"grad_norm": 1.5780823976901985,
"learning_rate": 4.170915133208452e-06,
"loss": 0.4402,
"step": 1052
},
{
"epoch": 0.5668528362828881,
"grad_norm": 1.4482990847613153,
"learning_rate": 4.162315337978148e-06,
"loss": 0.5056,
"step": 1053
},
{
"epoch": 0.5673911580647332,
"grad_norm": 1.534829858260644,
"learning_rate": 4.153718091647702e-06,
"loss": 0.4212,
"step": 1054
},
{
"epoch": 0.5679294798465783,
"grad_norm": 1.6872941151721794,
"learning_rate": 4.145123420376745e-06,
"loss": 0.4604,
"step": 1055
},
{
"epoch": 0.5684678016284234,
"grad_norm": 1.3923901318290877,
"learning_rate": 4.136531350317069e-06,
"loss": 0.4608,
"step": 1056
},
{
"epoch": 0.5690061234102685,
"grad_norm": 1.7627677860939457,
"learning_rate": 4.127941907612553e-06,
"loss": 0.4345,
"step": 1057
},
{
"epoch": 0.5695444451921136,
"grad_norm": 1.6236383393521263,
"learning_rate": 4.11935511839908e-06,
"loss": 0.4599,
"step": 1058
},
{
"epoch": 0.5700827669739587,
"grad_norm": 1.5390392661613181,
"learning_rate": 4.110771008804463e-06,
"loss": 0.4822,
"step": 1059
},
{
"epoch": 0.5706210887558038,
"grad_norm": 1.6460116304075034,
"learning_rate": 4.102189604948356e-06,
"loss": 0.4277,
"step": 1060
},
{
"epoch": 0.5711594105376488,
"grad_norm": 1.4089445870425645,
"learning_rate": 4.093610932942184e-06,
"loss": 0.4055,
"step": 1061
},
{
"epoch": 0.571697732319494,
"grad_norm": 1.4912945610802475,
"learning_rate": 4.085035018889058e-06,
"loss": 0.4081,
"step": 1062
},
{
"epoch": 0.572236054101339,
"grad_norm": 1.7313554326427134,
"learning_rate": 4.076461888883696e-06,
"loss": 0.4516,
"step": 1063
},
{
"epoch": 0.5727743758831841,
"grad_norm": 1.438398770463997,
"learning_rate": 4.067891569012347e-06,
"loss": 0.4591,
"step": 1064
},
{
"epoch": 0.5733126976650292,
"grad_norm": 1.2911877198700585,
"learning_rate": 4.059324085352709e-06,
"loss": 0.3877,
"step": 1065
},
{
"epoch": 0.5738510194468743,
"grad_norm": 1.4799665950387828,
"learning_rate": 4.050759463973849e-06,
"loss": 0.4027,
"step": 1066
},
{
"epoch": 0.5743893412287194,
"grad_norm": 1.31856553741587,
"learning_rate": 4.042197730936124e-06,
"loss": 0.4385,
"step": 1067
},
{
"epoch": 0.5749276630105645,
"grad_norm": 1.4681673368671948,
"learning_rate": 4.033638912291104e-06,
"loss": 0.4699,
"step": 1068
},
{
"epoch": 0.5754659847924096,
"grad_norm": 1.8186933987892613,
"learning_rate": 4.025083034081492e-06,
"loss": 0.474,
"step": 1069
},
{
"epoch": 0.5760043065742547,
"grad_norm": 1.7243406009536202,
"learning_rate": 4.016530122341039e-06,
"loss": 0.4664,
"step": 1070
},
{
"epoch": 0.5765426283560998,
"grad_norm": 1.7574219154990909,
"learning_rate": 4.007980203094476e-06,
"loss": 0.412,
"step": 1071
},
{
"epoch": 0.5770809501379449,
"grad_norm": 3.3723520725361325,
"learning_rate": 3.999433302357427e-06,
"loss": 0.3745,
"step": 1072
},
{
"epoch": 0.57761927191979,
"grad_norm": 1.470644839329035,
"learning_rate": 3.990889446136326e-06,
"loss": 0.4192,
"step": 1073
},
{
"epoch": 0.5781575937016351,
"grad_norm": 1.8064402874305607,
"learning_rate": 3.982348660428349e-06,
"loss": 0.4633,
"step": 1074
},
{
"epoch": 0.5786959154834802,
"grad_norm": 1.5560108586108519,
"learning_rate": 3.9738109712213255e-06,
"loss": 0.4554,
"step": 1075
},
{
"epoch": 0.5792342372653253,
"grad_norm": 1.390022072661602,
"learning_rate": 3.965276404493667e-06,
"loss": 0.4468,
"step": 1076
},
{
"epoch": 0.5797725590471704,
"grad_norm": 1.5485174930428875,
"learning_rate": 3.956744986214281e-06,
"loss": 0.4406,
"step": 1077
},
{
"epoch": 0.5803108808290155,
"grad_norm": 1.377328803064819,
"learning_rate": 3.948216742342492e-06,
"loss": 0.3914,
"step": 1078
},
{
"epoch": 0.5808492026108606,
"grad_norm": 1.7377815121930535,
"learning_rate": 3.939691698827975e-06,
"loss": 0.4409,
"step": 1079
},
{
"epoch": 0.5813875243927057,
"grad_norm": 1.584949416405362,
"learning_rate": 3.931169881610655e-06,
"loss": 0.4909,
"step": 1080
},
{
"epoch": 0.5813875243927057,
"eval_loss": 0.43915173411369324,
"eval_runtime": 1551.2876,
"eval_samples_per_second": 16.122,
"eval_steps_per_second": 0.504,
"step": 1080
},
{
"epoch": 0.5819258461745508,
"grad_norm": 1.4259479318176305,
"learning_rate": 3.922651316620648e-06,
"loss": 0.419,
"step": 1081
},
{
"epoch": 0.5824641679563959,
"grad_norm": 1.883836889268125,
"learning_rate": 3.914136029778173e-06,
"loss": 0.4847,
"step": 1082
},
{
"epoch": 0.583002489738241,
"grad_norm": 1.5440830790183266,
"learning_rate": 3.905624046993474e-06,
"loss": 0.4484,
"step": 1083
},
{
"epoch": 0.5835408115200861,
"grad_norm": 1.711059696428319,
"learning_rate": 3.897115394166738e-06,
"loss": 0.4682,
"step": 1084
},
{
"epoch": 0.5840791333019312,
"grad_norm": 1.8908190002251042,
"learning_rate": 3.8886100971880235e-06,
"loss": 0.4325,
"step": 1085
},
{
"epoch": 0.5846174550837764,
"grad_norm": 1.5374015806352503,
"learning_rate": 3.880108181937178e-06,
"loss": 0.4434,
"step": 1086
},
{
"epoch": 0.5851557768656215,
"grad_norm": 1.864521131460447,
"learning_rate": 3.871609674283757e-06,
"loss": 0.4649,
"step": 1087
},
{
"epoch": 0.5856940986474666,
"grad_norm": 1.9214802187823141,
"learning_rate": 3.863114600086948e-06,
"loss": 0.452,
"step": 1088
},
{
"epoch": 0.5862324204293117,
"grad_norm": 1.3598584887277212,
"learning_rate": 3.854622985195492e-06,
"loss": 0.466,
"step": 1089
},
{
"epoch": 0.5867707422111568,
"grad_norm": 1.6127091744766286,
"learning_rate": 3.846134855447602e-06,
"loss": 0.4627,
"step": 1090
},
{
"epoch": 0.5873090639930019,
"grad_norm": 1.4648349504902127,
"learning_rate": 3.837650236670892e-06,
"loss": 0.3967,
"step": 1091
},
{
"epoch": 0.587847385774847,
"grad_norm": 1.8146408700451369,
"learning_rate": 3.829169154682283e-06,
"loss": 0.4271,
"step": 1092
},
{
"epoch": 0.5883857075566921,
"grad_norm": 1.7751846942753446,
"learning_rate": 3.8206916352879446e-06,
"loss": 0.4464,
"step": 1093
},
{
"epoch": 0.5889240293385372,
"grad_norm": 1.6612024138612147,
"learning_rate": 3.8122177042832e-06,
"loss": 0.4107,
"step": 1094
},
{
"epoch": 0.5894623511203823,
"grad_norm": 2.812616379162355,
"learning_rate": 3.8037473874524542e-06,
"loss": 0.4584,
"step": 1095
},
{
"epoch": 0.5900006729022274,
"grad_norm": 1.3709537212409602,
"learning_rate": 3.7952807105691185e-06,
"loss": 0.4356,
"step": 1096
},
{
"epoch": 0.5905389946840724,
"grad_norm": 1.2984038273503478,
"learning_rate": 3.7868176993955253e-06,
"loss": 0.426,
"step": 1097
},
{
"epoch": 0.5910773164659175,
"grad_norm": 1.6589883894837865,
"learning_rate": 3.7783583796828543e-06,
"loss": 0.4449,
"step": 1098
},
{
"epoch": 0.5916156382477626,
"grad_norm": 1.66006556219293,
"learning_rate": 3.769902777171051e-06,
"loss": 0.493,
"step": 1099
},
{
"epoch": 0.5921539600296077,
"grad_norm": 1.5937225644555308,
"learning_rate": 3.761450917588753e-06,
"loss": 0.4723,
"step": 1100
},
{
"epoch": 0.5926922818114528,
"grad_norm": 1.3456146090228862,
"learning_rate": 3.7530028266532074e-06,
"loss": 0.4137,
"step": 1101
},
{
"epoch": 0.5932306035932979,
"grad_norm": 1.679198037724048,
"learning_rate": 3.744558530070196e-06,
"loss": 0.4261,
"step": 1102
},
{
"epoch": 0.593768925375143,
"grad_norm": 1.581894355411804,
"learning_rate": 3.7361180535339504e-06,
"loss": 0.4612,
"step": 1103
},
{
"epoch": 0.5943072471569881,
"grad_norm": 1.4999393803804146,
"learning_rate": 3.7276814227270842e-06,
"loss": 0.4242,
"step": 1104
},
{
"epoch": 0.5948455689388332,
"grad_norm": 1.6700110113661726,
"learning_rate": 3.719248663320506e-06,
"loss": 0.4536,
"step": 1105
},
{
"epoch": 0.5953838907206783,
"grad_norm": 1.4628534581538355,
"learning_rate": 3.7108198009733454e-06,
"loss": 0.3885,
"step": 1106
},
{
"epoch": 0.5959222125025234,
"grad_norm": 1.5174908060004981,
"learning_rate": 3.7023948613328736e-06,
"loss": 0.4688,
"step": 1107
},
{
"epoch": 0.5964605342843685,
"grad_norm": 1.6277090494975097,
"learning_rate": 3.6939738700344264e-06,
"loss": 0.4404,
"step": 1108
},
{
"epoch": 0.5969988560662136,
"grad_norm": 2.5097831655290954,
"learning_rate": 3.6855568527013273e-06,
"loss": 0.4608,
"step": 1109
},
{
"epoch": 0.5975371778480587,
"grad_norm": 1.4992012722834578,
"learning_rate": 3.677143834944803e-06,
"loss": 0.4446,
"step": 1110
},
{
"epoch": 0.5980754996299038,
"grad_norm": 1.4139401580995998,
"learning_rate": 3.6687348423639147e-06,
"loss": 0.4098,
"step": 1111
},
{
"epoch": 0.5986138214117489,
"grad_norm": 2.0752058550686585,
"learning_rate": 3.6603299005454744e-06,
"loss": 0.4234,
"step": 1112
},
{
"epoch": 0.599152143193594,
"grad_norm": 1.6967487088214965,
"learning_rate": 3.6519290350639697e-06,
"loss": 0.4348,
"step": 1113
},
{
"epoch": 0.5996904649754391,
"grad_norm": 1.7094622508466781,
"learning_rate": 3.6435322714814813e-06,
"loss": 0.4584,
"step": 1114
},
{
"epoch": 0.6002287867572842,
"grad_norm": 1.5333043053128887,
"learning_rate": 3.635139635347612e-06,
"loss": 0.4211,
"step": 1115
},
{
"epoch": 0.6007671085391293,
"grad_norm": 1.447440380533825,
"learning_rate": 3.626751152199406e-06,
"loss": 0.4392,
"step": 1116
},
{
"epoch": 0.6013054303209744,
"grad_norm": 1.558545230893266,
"learning_rate": 3.6183668475612665e-06,
"loss": 0.4553,
"step": 1117
},
{
"epoch": 0.6018437521028195,
"grad_norm": 1.7341397982742823,
"learning_rate": 3.6099867469448874e-06,
"loss": 0.4521,
"step": 1118
},
{
"epoch": 0.6023820738846646,
"grad_norm": 3.5577384559068075,
"learning_rate": 3.601610875849168e-06,
"loss": 0.4999,
"step": 1119
},
{
"epoch": 0.6029203956665097,
"grad_norm": 1.3499033786926813,
"learning_rate": 3.5932392597601396e-06,
"loss": 0.4273,
"step": 1120
},
{
"epoch": 0.6034587174483548,
"grad_norm": 1.49775810523526,
"learning_rate": 3.584871924150883e-06,
"loss": 0.4275,
"step": 1121
},
{
"epoch": 0.6039970392301999,
"grad_norm": 1.4867216376875734,
"learning_rate": 3.576508894481458e-06,
"loss": 0.443,
"step": 1122
},
{
"epoch": 0.604535361012045,
"grad_norm": 1.8077118144262816,
"learning_rate": 3.5681501961988212e-06,
"loss": 0.408,
"step": 1123
},
{
"epoch": 0.6050736827938901,
"grad_norm": 2.0530433441295535,
"learning_rate": 3.5597958547367507e-06,
"loss": 0.3988,
"step": 1124
},
{
"epoch": 0.6056120045757352,
"grad_norm": 1.4118492293118154,
"learning_rate": 3.551445895515765e-06,
"loss": 0.477,
"step": 1125
},
{
"epoch": 0.6061503263575803,
"grad_norm": 1.7018214299556869,
"learning_rate": 3.5431003439430493e-06,
"loss": 0.4441,
"step": 1126
},
{
"epoch": 0.6066886481394254,
"grad_norm": 1.434018580532193,
"learning_rate": 3.5347592254123795e-06,
"loss": 0.4539,
"step": 1127
},
{
"epoch": 0.6072269699212705,
"grad_norm": 1.4867130289511963,
"learning_rate": 3.526422565304042e-06,
"loss": 0.4158,
"step": 1128
},
{
"epoch": 0.6077652917031156,
"grad_norm": 1.4715457603229556,
"learning_rate": 3.518090388984753e-06,
"loss": 0.425,
"step": 1129
},
{
"epoch": 0.6083036134849606,
"grad_norm": 1.4891631829297116,
"learning_rate": 3.5097627218075905e-06,
"loss": 0.4551,
"step": 1130
},
{
"epoch": 0.6088419352668057,
"grad_norm": 1.38559309859237,
"learning_rate": 3.5014395891119112e-06,
"loss": 0.3903,
"step": 1131
},
{
"epoch": 0.6093802570486508,
"grad_norm": 1.5211311736282844,
"learning_rate": 3.4931210162232716e-06,
"loss": 0.474,
"step": 1132
},
{
"epoch": 0.6099185788304959,
"grad_norm": 3.910273590345733,
"learning_rate": 3.484807028453356e-06,
"loss": 0.4386,
"step": 1133
},
{
"epoch": 0.610456900612341,
"grad_norm": 1.21915593287012,
"learning_rate": 3.476497651099897e-06,
"loss": 0.4214,
"step": 1134
},
{
"epoch": 0.6109952223941861,
"grad_norm": 7.218438211629208,
"learning_rate": 3.4681929094465987e-06,
"loss": 0.4368,
"step": 1135
},
{
"epoch": 0.6115335441760312,
"grad_norm": 1.5885679173464573,
"learning_rate": 3.4598928287630585e-06,
"loss": 0.4304,
"step": 1136
},
{
"epoch": 0.6120718659578763,
"grad_norm": 1.6276966755475062,
"learning_rate": 3.451597434304692e-06,
"loss": 0.4303,
"step": 1137
},
{
"epoch": 0.6126101877397214,
"grad_norm": 2.4974771072637227,
"learning_rate": 3.443306751312656e-06,
"loss": 0.4812,
"step": 1138
},
{
"epoch": 0.6131485095215665,
"grad_norm": 1.8523418655749138,
"learning_rate": 3.435020805013773e-06,
"loss": 0.4464,
"step": 1139
},
{
"epoch": 0.6136868313034116,
"grad_norm": 1.6153961476534389,
"learning_rate": 3.4267396206204477e-06,
"loss": 0.4258,
"step": 1140
},
{
"epoch": 0.6136868313034116,
"eval_loss": 0.4358210265636444,
"eval_runtime": 1559.0889,
"eval_samples_per_second": 16.041,
"eval_steps_per_second": 0.502,
"step": 1140
},
{
"epoch": 0.6142251530852567,
"grad_norm": 1.5200314946583775,
"learning_rate": 3.4184632233306004e-06,
"loss": 0.4328,
"step": 1141
},
{
"epoch": 0.6147634748671018,
"grad_norm": 1.753239287330404,
"learning_rate": 3.4101916383275836e-06,
"loss": 0.4164,
"step": 1142
},
{
"epoch": 0.6153017966489469,
"grad_norm": 1.3784614615536817,
"learning_rate": 3.4019248907801058e-06,
"loss": 0.407,
"step": 1143
},
{
"epoch": 0.615840118430792,
"grad_norm": 1.4916546024442217,
"learning_rate": 3.3936630058421567e-06,
"loss": 0.4449,
"step": 1144
},
{
"epoch": 0.6163784402126371,
"grad_norm": 1.411016335795447,
"learning_rate": 3.385406008652931e-06,
"loss": 0.4137,
"step": 1145
},
{
"epoch": 0.6169167619944822,
"grad_norm": 1.969929829038151,
"learning_rate": 3.3771539243367517e-06,
"loss": 0.4569,
"step": 1146
},
{
"epoch": 0.6174550837763273,
"grad_norm": 1.4268646662770854,
"learning_rate": 3.3689067780029895e-06,
"loss": 0.4399,
"step": 1147
},
{
"epoch": 0.6179934055581724,
"grad_norm": 1.4858645297475759,
"learning_rate": 3.3606645947459933e-06,
"loss": 0.4318,
"step": 1148
},
{
"epoch": 0.6185317273400175,
"grad_norm": 2.07970165108201,
"learning_rate": 3.3524273996450087e-06,
"loss": 0.4804,
"step": 1149
},
{
"epoch": 0.6190700491218626,
"grad_norm": 1.5524399522642343,
"learning_rate": 3.3441952177641046e-06,
"loss": 0.448,
"step": 1150
},
{
"epoch": 0.6196083709037077,
"grad_norm": 1.5025047668730835,
"learning_rate": 3.335968074152094e-06,
"loss": 0.4229,
"step": 1151
},
{
"epoch": 0.6201466926855528,
"grad_norm": 1.51932290948172,
"learning_rate": 3.32774599384246e-06,
"loss": 0.4238,
"step": 1152
},
{
"epoch": 0.6206850144673979,
"grad_norm": 1.4003637291864899,
"learning_rate": 3.319529001853282e-06,
"loss": 0.4618,
"step": 1153
},
{
"epoch": 0.621223336249243,
"grad_norm": 1.3792399628540106,
"learning_rate": 3.311317123187151e-06,
"loss": 0.4052,
"step": 1154
},
{
"epoch": 0.6217616580310881,
"grad_norm": 1.4341824487711958,
"learning_rate": 3.3031103828311044e-06,
"loss": 0.4452,
"step": 1155
},
{
"epoch": 0.6222999798129332,
"grad_norm": 1.8890388921678993,
"learning_rate": 3.294908805756543e-06,
"loss": 0.4311,
"step": 1156
},
{
"epoch": 0.6228383015947783,
"grad_norm": 1.6873174271659632,
"learning_rate": 3.286712416919156e-06,
"loss": 0.465,
"step": 1157
},
{
"epoch": 0.6233766233766234,
"grad_norm": 2.113957712483436,
"learning_rate": 3.2785212412588464e-06,
"loss": 0.4103,
"step": 1158
},
{
"epoch": 0.6239149451584685,
"grad_norm": 1.6169473829408894,
"learning_rate": 3.2703353036996553e-06,
"loss": 0.4042,
"step": 1159
},
{
"epoch": 0.6244532669403136,
"grad_norm": 1.6678579140480474,
"learning_rate": 3.262154629149684e-06,
"loss": 0.4849,
"step": 1160
},
{
"epoch": 0.6249915887221587,
"grad_norm": 1.5133551741537392,
"learning_rate": 3.253979242501023e-06,
"loss": 0.4479,
"step": 1161
},
{
"epoch": 0.6255299105040037,
"grad_norm": 1.5463516633606489,
"learning_rate": 3.2458091686296666e-06,
"loss": 0.4589,
"step": 1162
},
{
"epoch": 0.6260682322858488,
"grad_norm": 1.3908513399535982,
"learning_rate": 3.2376444323954487e-06,
"loss": 0.407,
"step": 1163
},
{
"epoch": 0.6266065540676939,
"grad_norm": 1.4911824388993882,
"learning_rate": 3.2294850586419603e-06,
"loss": 0.4016,
"step": 1164
},
{
"epoch": 0.627144875849539,
"grad_norm": 1.4342504928355473,
"learning_rate": 3.2213310721964753e-06,
"loss": 0.4269,
"step": 1165
},
{
"epoch": 0.6276831976313841,
"grad_norm": 1.5982636474188436,
"learning_rate": 3.2131824978698744e-06,
"loss": 0.4532,
"step": 1166
},
{
"epoch": 0.6282215194132292,
"grad_norm": 1.3672342575621805,
"learning_rate": 3.2050393604565722e-06,
"loss": 0.3972,
"step": 1167
},
{
"epoch": 0.6287598411950743,
"grad_norm": 1.6874817093257244,
"learning_rate": 3.196901684734439e-06,
"loss": 0.457,
"step": 1168
},
{
"epoch": 0.6292981629769194,
"grad_norm": 1.5723777384143767,
"learning_rate": 3.188769495464725e-06,
"loss": 0.3892,
"step": 1169
},
{
"epoch": 0.6298364847587645,
"grad_norm": 1.601524939347794,
"learning_rate": 3.180642817391988e-06,
"loss": 0.4433,
"step": 1170
},
{
"epoch": 0.6303748065406096,
"grad_norm": 2.25805654454037,
"learning_rate": 3.172521675244016e-06,
"loss": 0.4322,
"step": 1171
},
{
"epoch": 0.6309131283224547,
"grad_norm": 1.5555079250741115,
"learning_rate": 3.1644060937317523e-06,
"loss": 0.391,
"step": 1172
},
{
"epoch": 0.6314514501042998,
"grad_norm": 1.4992699551350894,
"learning_rate": 3.1562960975492194e-06,
"loss": 0.4044,
"step": 1173
},
{
"epoch": 0.6319897718861449,
"grad_norm": 1.5799132322735037,
"learning_rate": 3.1481917113734474e-06,
"loss": 0.3812,
"step": 1174
},
{
"epoch": 0.63252809366799,
"grad_norm": 1.7698333563655604,
"learning_rate": 3.140092959864392e-06,
"loss": 0.4353,
"step": 1175
},
{
"epoch": 0.6330664154498351,
"grad_norm": 1.568455528145148,
"learning_rate": 3.1319998676648695e-06,
"loss": 0.4307,
"step": 1176
},
{
"epoch": 0.6336047372316802,
"grad_norm": 1.6539679705814518,
"learning_rate": 3.12391245940047e-06,
"loss": 0.4269,
"step": 1177
},
{
"epoch": 0.6341430590135253,
"grad_norm": 1.7204853297231233,
"learning_rate": 3.115830759679492e-06,
"loss": 0.4857,
"step": 1178
},
{
"epoch": 0.6346813807953704,
"grad_norm": 1.6626863719528417,
"learning_rate": 3.1077547930928652e-06,
"loss": 0.4681,
"step": 1179
},
{
"epoch": 0.6352197025772155,
"grad_norm": 1.6842711637823262,
"learning_rate": 3.0996845842140716e-06,
"loss": 0.4312,
"step": 1180
},
{
"epoch": 0.6357580243590606,
"grad_norm": 1.7431784823037149,
"learning_rate": 3.091620157599075e-06,
"loss": 0.4206,
"step": 1181
},
{
"epoch": 0.6362963461409057,
"grad_norm": 1.7565059915579697,
"learning_rate": 3.0835615377862453e-06,
"loss": 0.4787,
"step": 1182
},
{
"epoch": 0.6368346679227508,
"grad_norm": 1.5940508036600212,
"learning_rate": 3.0755087492962844e-06,
"loss": 0.3977,
"step": 1183
},
{
"epoch": 0.6373729897045959,
"grad_norm": 1.4265440236436624,
"learning_rate": 3.0674618166321477e-06,
"loss": 0.4455,
"step": 1184
},
{
"epoch": 0.637911311486441,
"grad_norm": 1.5203806820148102,
"learning_rate": 3.059420764278975e-06,
"loss": 0.4421,
"step": 1185
},
{
"epoch": 0.6384496332682861,
"grad_norm": 1.7485388075672719,
"learning_rate": 3.0513856167040123e-06,
"loss": 0.4337,
"step": 1186
},
{
"epoch": 0.6389879550501312,
"grad_norm": 1.5758916072812403,
"learning_rate": 3.0433563983565415e-06,
"loss": 0.483,
"step": 1187
},
{
"epoch": 0.6395262768319763,
"grad_norm": 1.7757740619316615,
"learning_rate": 3.0353331336677984e-06,
"loss": 0.402,
"step": 1188
},
{
"epoch": 0.6400645986138214,
"grad_norm": 1.5639356203741708,
"learning_rate": 3.027315847050906e-06,
"loss": 0.4588,
"step": 1189
},
{
"epoch": 0.6406029203956665,
"grad_norm": 1.900913903628273,
"learning_rate": 3.0193045629007982e-06,
"loss": 0.4318,
"step": 1190
},
{
"epoch": 0.6411412421775116,
"grad_norm": 1.7813979669008324,
"learning_rate": 3.011299305594141e-06,
"loss": 0.4444,
"step": 1191
},
{
"epoch": 0.6416795639593567,
"grad_norm": 1.4267787696799576,
"learning_rate": 3.0033000994892646e-06,
"loss": 0.4394,
"step": 1192
},
{
"epoch": 0.6422178857412018,
"grad_norm": 1.425734282167891,
"learning_rate": 2.995306968926087e-06,
"loss": 0.4729,
"step": 1193
},
{
"epoch": 0.6427562075230469,
"grad_norm": 1.6415657973276232,
"learning_rate": 2.98731993822604e-06,
"loss": 0.4644,
"step": 1194
},
{
"epoch": 0.643294529304892,
"grad_norm": 1.8314597950910743,
"learning_rate": 2.97933903169199e-06,
"loss": 0.5308,
"step": 1195
},
{
"epoch": 0.643832851086737,
"grad_norm": 1.5314208582263587,
"learning_rate": 2.9713642736081755e-06,
"loss": 0.4539,
"step": 1196
},
{
"epoch": 0.6443711728685821,
"grad_norm": 1.7043966331574372,
"learning_rate": 2.9633956882401215e-06,
"loss": 0.4478,
"step": 1197
},
{
"epoch": 0.6449094946504272,
"grad_norm": 1.3896380014466228,
"learning_rate": 2.955433299834576e-06,
"loss": 0.4274,
"step": 1198
},
{
"epoch": 0.6454478164322723,
"grad_norm": 1.328466975562685,
"learning_rate": 2.947477132619423e-06,
"loss": 0.4151,
"step": 1199
},
{
"epoch": 0.6459861382141174,
"grad_norm": 1.4947495053829816,
"learning_rate": 2.939527210803624e-06,
"loss": 0.4225,
"step": 1200
},
{
"epoch": 0.6459861382141174,
"eval_loss": 0.43335118889808655,
"eval_runtime": 1568.1591,
"eval_samples_per_second": 15.949,
"eval_steps_per_second": 0.499,
"step": 1200
},
{
"epoch": 0.6465244599959626,
"grad_norm": 1.7770419353679783,
"learning_rate": 2.9315835585771334e-06,
"loss": 0.4443,
"step": 1201
},
{
"epoch": 0.6470627817778077,
"grad_norm": 1.509257884926516,
"learning_rate": 2.923646200110832e-06,
"loss": 0.403,
"step": 1202
},
{
"epoch": 0.6476011035596528,
"grad_norm": 1.413359799607147,
"learning_rate": 2.915715159556444e-06,
"loss": 0.3995,
"step": 1203
},
{
"epoch": 0.6481394253414979,
"grad_norm": 1.4051405846579907,
"learning_rate": 2.9077904610464745e-06,
"loss": 0.3597,
"step": 1204
},
{
"epoch": 0.648677747123343,
"grad_norm": 1.5857210618229394,
"learning_rate": 2.89987212869413e-06,
"loss": 0.448,
"step": 1205
},
{
"epoch": 0.6492160689051881,
"grad_norm": 1.3723187404527468,
"learning_rate": 2.8919601865932456e-06,
"loss": 0.4522,
"step": 1206
},
{
"epoch": 0.6497543906870332,
"grad_norm": 1.3511061410304184,
"learning_rate": 2.884054658818214e-06,
"loss": 0.3792,
"step": 1207
},
{
"epoch": 0.6502927124688783,
"grad_norm": 1.387760091675675,
"learning_rate": 2.8761555694239046e-06,
"loss": 0.4515,
"step": 1208
},
{
"epoch": 0.6508310342507234,
"grad_norm": 1.4247593593472396,
"learning_rate": 2.868262942445603e-06,
"loss": 0.4489,
"step": 1209
},
{
"epoch": 0.6513693560325685,
"grad_norm": 1.600671347691334,
"learning_rate": 2.8603768018989275e-06,
"loss": 0.3944,
"step": 1210
},
{
"epoch": 0.6519076778144136,
"grad_norm": 1.4284428882228806,
"learning_rate": 2.852497171779761e-06,
"loss": 0.432,
"step": 1211
},
{
"epoch": 0.6524459995962587,
"grad_norm": 1.8170320001458748,
"learning_rate": 2.8446240760641762e-06,
"loss": 0.483,
"step": 1212
},
{
"epoch": 0.6529843213781038,
"grad_norm": 1.872300633931277,
"learning_rate": 2.836757538708362e-06,
"loss": 0.4226,
"step": 1213
},
{
"epoch": 0.6535226431599489,
"grad_norm": 1.5545253276420463,
"learning_rate": 2.8288975836485523e-06,
"loss": 0.4452,
"step": 1214
},
{
"epoch": 0.654060964941794,
"grad_norm": 1.4689119979210103,
"learning_rate": 2.8210442348009543e-06,
"loss": 0.4206,
"step": 1215
},
{
"epoch": 0.6545992867236391,
"grad_norm": 1.495722266239985,
"learning_rate": 2.8131975160616686e-06,
"loss": 0.4555,
"step": 1216
},
{
"epoch": 0.6551376085054842,
"grad_norm": 1.4286754464458904,
"learning_rate": 2.805357451306626e-06,
"loss": 0.4531,
"step": 1217
},
{
"epoch": 0.6556759302873293,
"grad_norm": 1.6604089854519999,
"learning_rate": 2.797524064391511e-06,
"loss": 0.4351,
"step": 1218
},
{
"epoch": 0.6562142520691744,
"grad_norm": 1.677727217993553,
"learning_rate": 2.7896973791516867e-06,
"loss": 0.4797,
"step": 1219
},
{
"epoch": 0.6567525738510195,
"grad_norm": 1.8188528752490087,
"learning_rate": 2.781877419402126e-06,
"loss": 0.3942,
"step": 1220
},
{
"epoch": 0.6572908956328646,
"grad_norm": 1.518304729497582,
"learning_rate": 2.7740642089373356e-06,
"loss": 0.4567,
"step": 1221
},
{
"epoch": 0.6578292174147097,
"grad_norm": 1.9076520179847476,
"learning_rate": 2.76625777153129e-06,
"loss": 0.4761,
"step": 1222
},
{
"epoch": 0.6583675391965548,
"grad_norm": 1.6501027454283104,
"learning_rate": 2.758458130937346e-06,
"loss": 0.4568,
"step": 1223
},
{
"epoch": 0.6589058609783999,
"grad_norm": 1.4971909664683323,
"learning_rate": 2.7506653108881885e-06,
"loss": 0.4534,
"step": 1224
},
{
"epoch": 0.659444182760245,
"grad_norm": 1.8216935826384455,
"learning_rate": 2.742879335095743e-06,
"loss": 0.4872,
"step": 1225
},
{
"epoch": 0.6599825045420901,
"grad_norm": 1.441369836777809,
"learning_rate": 2.735100227251113e-06,
"loss": 0.3857,
"step": 1226
},
{
"epoch": 0.6605208263239352,
"grad_norm": 1.3907320663098741,
"learning_rate": 2.7273280110245e-06,
"loss": 0.4055,
"step": 1227
},
{
"epoch": 0.6610591481057803,
"grad_norm": 1.3629302314750185,
"learning_rate": 2.719562710065142e-06,
"loss": 0.4059,
"step": 1228
},
{
"epoch": 0.6615974698876254,
"grad_norm": 1.5181251515722511,
"learning_rate": 2.711804348001231e-06,
"loss": 0.4927,
"step": 1229
},
{
"epoch": 0.6621357916694705,
"grad_norm": 1.583461554714453,
"learning_rate": 2.704052948439842e-06,
"loss": 0.4139,
"step": 1230
},
{
"epoch": 0.6626741134513155,
"grad_norm": 1.597683792644596,
"learning_rate": 2.6963085349668718e-06,
"loss": 0.4299,
"step": 1231
},
{
"epoch": 0.6632124352331606,
"grad_norm": 1.4538764746820028,
"learning_rate": 2.6885711311469547e-06,
"loss": 0.4238,
"step": 1232
},
{
"epoch": 0.6637507570150057,
"grad_norm": 1.5760098860778269,
"learning_rate": 2.6808407605234006e-06,
"loss": 0.4605,
"step": 1233
},
{
"epoch": 0.6642890787968508,
"grad_norm": 1.8819638022647283,
"learning_rate": 2.673117446618114e-06,
"loss": 0.4176,
"step": 1234
},
{
"epoch": 0.6648274005786959,
"grad_norm": 1.7467867886896942,
"learning_rate": 2.665401212931532e-06,
"loss": 0.4284,
"step": 1235
},
{
"epoch": 0.665365722360541,
"grad_norm": 1.3582161008888671,
"learning_rate": 2.6576920829425434e-06,
"loss": 0.449,
"step": 1236
},
{
"epoch": 0.6659040441423861,
"grad_norm": 1.7112669988534182,
"learning_rate": 2.6499900801084283e-06,
"loss": 0.4702,
"step": 1237
},
{
"epoch": 0.6664423659242312,
"grad_norm": 2.099925951296545,
"learning_rate": 2.6422952278647705e-06,
"loss": 0.4592,
"step": 1238
},
{
"epoch": 0.6669806877060763,
"grad_norm": 1.4352705146813356,
"learning_rate": 2.6346075496254054e-06,
"loss": 0.384,
"step": 1239
},
{
"epoch": 0.6675190094879214,
"grad_norm": 1.89895053480487,
"learning_rate": 2.6269270687823337e-06,
"loss": 0.4632,
"step": 1240
},
{
"epoch": 0.6680573312697665,
"grad_norm": 1.527126991788229,
"learning_rate": 2.619253808705661e-06,
"loss": 0.4304,
"step": 1241
},
{
"epoch": 0.6685956530516116,
"grad_norm": 1.9088122860113825,
"learning_rate": 2.6115877927435152e-06,
"loss": 0.4615,
"step": 1242
},
{
"epoch": 0.6691339748334567,
"grad_norm": 1.5152814714510374,
"learning_rate": 2.6039290442219884e-06,
"loss": 0.4019,
"step": 1243
},
{
"epoch": 0.6696722966153018,
"grad_norm": 1.490222426325067,
"learning_rate": 2.5962775864450563e-06,
"loss": 0.425,
"step": 1244
},
{
"epoch": 0.6702106183971469,
"grad_norm": 1.5269175130136061,
"learning_rate": 2.588633442694508e-06,
"loss": 0.3988,
"step": 1245
},
{
"epoch": 0.670748940178992,
"grad_norm": 1.4416954872355545,
"learning_rate": 2.5809966362298805e-06,
"loss": 0.4603,
"step": 1246
},
{
"epoch": 0.6712872619608371,
"grad_norm": 2.6364873275752014,
"learning_rate": 2.573367190288385e-06,
"loss": 0.4648,
"step": 1247
},
{
"epoch": 0.6718255837426822,
"grad_norm": 1.788546820645697,
"learning_rate": 2.5657451280848355e-06,
"loss": 0.4635,
"step": 1248
},
{
"epoch": 0.6723639055245273,
"grad_norm": 1.3806063124644692,
"learning_rate": 2.5581304728115797e-06,
"loss": 0.4943,
"step": 1249
},
{
"epoch": 0.6729022273063724,
"grad_norm": 1.402487270939909,
"learning_rate": 2.550523247638426e-06,
"loss": 0.4006,
"step": 1250
},
{
"epoch": 0.6734405490882175,
"grad_norm": 1.910681275697032,
"learning_rate": 2.542923475712574e-06,
"loss": 0.4609,
"step": 1251
},
{
"epoch": 0.6739788708700626,
"grad_norm": 1.446121535462886,
"learning_rate": 2.5353311801585507e-06,
"loss": 0.4092,
"step": 1252
},
{
"epoch": 0.6745171926519077,
"grad_norm": 1.6008122915794563,
"learning_rate": 2.5277463840781236e-06,
"loss": 0.4648,
"step": 1253
},
{
"epoch": 0.6750555144337528,
"grad_norm": 1.8052193116478468,
"learning_rate": 2.520169110550248e-06,
"loss": 0.4325,
"step": 1254
},
{
"epoch": 0.6755938362155979,
"grad_norm": 2.0544496666589245,
"learning_rate": 2.5125993826309904e-06,
"loss": 0.4102,
"step": 1255
},
{
"epoch": 0.676132157997443,
"grad_norm": 1.5511129757696938,
"learning_rate": 2.5050372233534526e-06,
"loss": 0.4443,
"step": 1256
},
{
"epoch": 0.6766704797792881,
"grad_norm": 1.8672906417068529,
"learning_rate": 2.4974826557277115e-06,
"loss": 0.4516,
"step": 1257
},
{
"epoch": 0.6772088015611332,
"grad_norm": 1.4831806217941237,
"learning_rate": 2.489935702740741e-06,
"loss": 0.4347,
"step": 1258
},
{
"epoch": 0.6777471233429783,
"grad_norm": 1.5986607931002996,
"learning_rate": 2.4823963873563487e-06,
"loss": 0.427,
"step": 1259
},
{
"epoch": 0.6782854451248234,
"grad_norm": 1.481767434298922,
"learning_rate": 2.4748647325150966e-06,
"loss": 0.4135,
"step": 1260
},
{
"epoch": 0.6782854451248234,
"eval_loss": 0.43108630180358887,
"eval_runtime": 1581.7954,
"eval_samples_per_second": 15.811,
"eval_steps_per_second": 0.494,
"step": 1260
},
{
"epoch": 0.6788237669066685,
"grad_norm": 1.491812080960543,
"learning_rate": 2.467340761134242e-06,
"loss": 0.4392,
"step": 1261
},
{
"epoch": 0.6793620886885136,
"grad_norm": 1.5403059882131847,
"learning_rate": 2.459824496107662e-06,
"loss": 0.4631,
"step": 1262
},
{
"epoch": 0.6799004104703587,
"grad_norm": 1.4488066174399352,
"learning_rate": 2.4523159603057858e-06,
"loss": 0.4401,
"step": 1263
},
{
"epoch": 0.6804387322522037,
"grad_norm": 1.6997928715987718,
"learning_rate": 2.444815176575521e-06,
"loss": 0.4671,
"step": 1264
},
{
"epoch": 0.6809770540340488,
"grad_norm": 1.6242395825984155,
"learning_rate": 2.4373221677401916e-06,
"loss": 0.4227,
"step": 1265
},
{
"epoch": 0.6815153758158939,
"grad_norm": 1.3272959133305353,
"learning_rate": 2.429836956599463e-06,
"loss": 0.3586,
"step": 1266
},
{
"epoch": 0.682053697597739,
"grad_norm": 1.723455688742321,
"learning_rate": 2.422359565929268e-06,
"loss": 0.4275,
"step": 1267
},
{
"epoch": 0.6825920193795841,
"grad_norm": 1.3911086482449566,
"learning_rate": 2.414890018481752e-06,
"loss": 0.4383,
"step": 1268
},
{
"epoch": 0.6831303411614292,
"grad_norm": 1.515918050738459,
"learning_rate": 2.40742833698519e-06,
"loss": 0.4342,
"step": 1269
},
{
"epoch": 0.6836686629432743,
"grad_norm": 1.6928322026664087,
"learning_rate": 2.3999745441439243e-06,
"loss": 0.4156,
"step": 1270
},
{
"epoch": 0.6842069847251194,
"grad_norm": 1.3632558682947689,
"learning_rate": 2.3925286626382926e-06,
"loss": 0.3914,
"step": 1271
},
{
"epoch": 0.6847453065069645,
"grad_norm": 3.139130094162036,
"learning_rate": 2.385090715124562e-06,
"loss": 0.4637,
"step": 1272
},
{
"epoch": 0.6852836282888096,
"grad_norm": 1.434440598705869,
"learning_rate": 2.3776607242348547e-06,
"loss": 0.437,
"step": 1273
},
{
"epoch": 0.6858219500706547,
"grad_norm": 1.5144260531076574,
"learning_rate": 2.3702387125770882e-06,
"loss": 0.4234,
"step": 1274
},
{
"epoch": 0.6863602718524998,
"grad_norm": 1.693660818176695,
"learning_rate": 2.362824702734893e-06,
"loss": 0.4164,
"step": 1275
},
{
"epoch": 0.6868985936343449,
"grad_norm": 1.3894626651308215,
"learning_rate": 2.355418717267558e-06,
"loss": 0.4221,
"step": 1276
},
{
"epoch": 0.68743691541619,
"grad_norm": 1.697033782203384,
"learning_rate": 2.3480207787099534e-06,
"loss": 0.4383,
"step": 1277
},
{
"epoch": 0.6879752371980351,
"grad_norm": 1.4858347246883488,
"learning_rate": 2.340630909572465e-06,
"loss": 0.4265,
"step": 1278
},
{
"epoch": 0.6885135589798802,
"grad_norm": 1.500359176091357,
"learning_rate": 2.3332491323409234e-06,
"loss": 0.4481,
"step": 1279
},
{
"epoch": 0.6890518807617253,
"grad_norm": 1.5297356725220441,
"learning_rate": 2.32587546947654e-06,
"loss": 0.4348,
"step": 1280
},
{
"epoch": 0.6895902025435704,
"grad_norm": 2.508398158502729,
"learning_rate": 2.3185099434158352e-06,
"loss": 0.4437,
"step": 1281
},
{
"epoch": 0.6901285243254155,
"grad_norm": 1.523641981004582,
"learning_rate": 2.311152576570566e-06,
"loss": 0.4575,
"step": 1282
},
{
"epoch": 0.6906668461072606,
"grad_norm": 1.6114434265747755,
"learning_rate": 2.303803391327669e-06,
"loss": 0.4378,
"step": 1283
},
{
"epoch": 0.6912051678891057,
"grad_norm": 1.4928444150803868,
"learning_rate": 2.296462410049183e-06,
"loss": 0.4411,
"step": 1284
},
{
"epoch": 0.6917434896709508,
"grad_norm": 1.5345549032626111,
"learning_rate": 2.289129655072185e-06,
"loss": 0.4324,
"step": 1285
},
{
"epoch": 0.6922818114527959,
"grad_norm": 1.4298368477097725,
"learning_rate": 2.2818051487087183e-06,
"loss": 0.426,
"step": 1286
},
{
"epoch": 0.692820133234641,
"grad_norm": 1.8725369506254443,
"learning_rate": 2.2744889132457314e-06,
"loss": 0.4541,
"step": 1287
},
{
"epoch": 0.6933584550164861,
"grad_norm": 1.77702449875276,
"learning_rate": 2.267180970945003e-06,
"loss": 0.432,
"step": 1288
},
{
"epoch": 0.6938967767983312,
"grad_norm": 1.4563290123647166,
"learning_rate": 2.259881344043081e-06,
"loss": 0.3832,
"step": 1289
},
{
"epoch": 0.6944350985801763,
"grad_norm": 1.3449801230990073,
"learning_rate": 2.252590054751205e-06,
"loss": 0.3962,
"step": 1290
},
{
"epoch": 0.6949734203620214,
"grad_norm": 1.8854534900995603,
"learning_rate": 2.2453071252552515e-06,
"loss": 0.4807,
"step": 1291
},
{
"epoch": 0.6955117421438665,
"grad_norm": 1.762423954535133,
"learning_rate": 2.238032577715656e-06,
"loss": 0.384,
"step": 1292
},
{
"epoch": 0.6960500639257116,
"grad_norm": 1.476803369543656,
"learning_rate": 2.2307664342673506e-06,
"loss": 0.4539,
"step": 1293
},
{
"epoch": 0.6965883857075567,
"grad_norm": 1.4854619250041479,
"learning_rate": 2.2235087170196966e-06,
"loss": 0.4396,
"step": 1294
},
{
"epoch": 0.6971267074894018,
"grad_norm": 1.41098403179678,
"learning_rate": 2.2162594480564155e-06,
"loss": 0.4005,
"step": 1295
},
{
"epoch": 0.6976650292712469,
"grad_norm": 1.2989632950912373,
"learning_rate": 2.2090186494355203e-06,
"loss": 0.4151,
"step": 1296
},
{
"epoch": 0.698203351053092,
"grad_norm": 1.6133874577700047,
"learning_rate": 2.2017863431892534e-06,
"loss": 0.4285,
"step": 1297
},
{
"epoch": 0.698741672834937,
"grad_norm": 1.333799397613619,
"learning_rate": 2.1945625513240154e-06,
"loss": 0.4041,
"step": 1298
},
{
"epoch": 0.6992799946167821,
"grad_norm": 1.4390186504294415,
"learning_rate": 2.1873472958202997e-06,
"loss": 0.4365,
"step": 1299
},
{
"epoch": 0.6998183163986272,
"grad_norm": 1.2866738586576456,
"learning_rate": 2.1801405986326245e-06,
"loss": 0.4665,
"step": 1300
},
{
"epoch": 0.7003566381804723,
"grad_norm": 2.2273828713275865,
"learning_rate": 2.1729424816894685e-06,
"loss": 0.4564,
"step": 1301
},
{
"epoch": 0.7008949599623174,
"grad_norm": 1.4546138888578992,
"learning_rate": 2.165752966893203e-06,
"loss": 0.4051,
"step": 1302
},
{
"epoch": 0.7014332817441625,
"grad_norm": 1.3514329197218915,
"learning_rate": 2.158572076120019e-06,
"loss": 0.4154,
"step": 1303
},
{
"epoch": 0.7019716035260076,
"grad_norm": 1.3870510485604055,
"learning_rate": 2.1513998312198734e-06,
"loss": 0.4269,
"step": 1304
},
{
"epoch": 0.7025099253078527,
"grad_norm": 1.6439661727082362,
"learning_rate": 2.1442362540164123e-06,
"loss": 0.4472,
"step": 1305
},
{
"epoch": 0.7030482470896978,
"grad_norm": 2.036208978375709,
"learning_rate": 2.1370813663069086e-06,
"loss": 0.4952,
"step": 1306
},
{
"epoch": 0.7035865688715429,
"grad_norm": 1.4306434260587932,
"learning_rate": 2.1299351898621938e-06,
"loss": 0.3815,
"step": 1307
},
{
"epoch": 0.704124890653388,
"grad_norm": 1.5518498802370642,
"learning_rate": 2.122797746426595e-06,
"loss": 0.4656,
"step": 1308
},
{
"epoch": 0.7046632124352331,
"grad_norm": 1.353149193018473,
"learning_rate": 2.1156690577178657e-06,
"loss": 0.4414,
"step": 1309
},
{
"epoch": 0.7052015342170782,
"grad_norm": 1.3081505827837419,
"learning_rate": 2.108549145427117e-06,
"loss": 0.4355,
"step": 1310
},
{
"epoch": 0.7057398559989233,
"grad_norm": 1.5741831120177514,
"learning_rate": 2.1014380312187593e-06,
"loss": 0.4396,
"step": 1311
},
{
"epoch": 0.7062781777807684,
"grad_norm": 1.5628460516936316,
"learning_rate": 2.094335736730433e-06,
"loss": 0.3687,
"step": 1312
},
{
"epoch": 0.7068164995626135,
"grad_norm": 3.0284027392779986,
"learning_rate": 2.0872422835729384e-06,
"loss": 0.4463,
"step": 1313
},
{
"epoch": 0.7073548213444586,
"grad_norm": 1.3447501399327724,
"learning_rate": 2.0801576933301757e-06,
"loss": 0.4371,
"step": 1314
},
{
"epoch": 0.7078931431263038,
"grad_norm": 1.8116776445346612,
"learning_rate": 2.073081987559077e-06,
"loss": 0.4109,
"step": 1315
},
{
"epoch": 0.7084314649081489,
"grad_norm": 1.571648134209876,
"learning_rate": 2.06601518778954e-06,
"loss": 0.432,
"step": 1316
},
{
"epoch": 0.708969786689994,
"grad_norm": 1.596166756734421,
"learning_rate": 2.0589573155243663e-06,
"loss": 0.4291,
"step": 1317
},
{
"epoch": 0.7095081084718391,
"grad_norm": 1.4446289087866433,
"learning_rate": 2.051908392239186e-06,
"loss": 0.4094,
"step": 1318
},
{
"epoch": 0.7100464302536842,
"grad_norm": 1.377063116073787,
"learning_rate": 2.044868439382406e-06,
"loss": 0.4696,
"step": 1319
},
{
"epoch": 0.7105847520355293,
"grad_norm": 1.3694098512093758,
"learning_rate": 2.0378374783751352e-06,
"loss": 0.402,
"step": 1320
},
{
"epoch": 0.7105847520355293,
"eval_loss": 0.4282020330429077,
"eval_runtime": 1515.7705,
"eval_samples_per_second": 16.5,
"eval_steps_per_second": 0.516,
"step": 1320
},
{
"epoch": 0.7111230738173744,
"grad_norm": 1.929826065439873,
"learning_rate": 2.030815530611123e-06,
"loss": 0.4159,
"step": 1321
},
{
"epoch": 0.7116613955992195,
"grad_norm": 1.4082500795847726,
"learning_rate": 2.023802617456694e-06,
"loss": 0.3941,
"step": 1322
},
{
"epoch": 0.7121997173810646,
"grad_norm": 1.8816103595399847,
"learning_rate": 2.01679876025068e-06,
"loss": 0.4244,
"step": 1323
},
{
"epoch": 0.7127380391629097,
"grad_norm": 1.5683369901785116,
"learning_rate": 2.0098039803043612e-06,
"loss": 0.4332,
"step": 1324
},
{
"epoch": 0.7132763609447548,
"grad_norm": 1.4453103994083734,
"learning_rate": 2.0028182989013923e-06,
"loss": 0.3945,
"step": 1325
},
{
"epoch": 0.7138146827265999,
"grad_norm": 1.6267798252157584,
"learning_rate": 1.9958417372977474e-06,
"loss": 0.4528,
"step": 1326
},
{
"epoch": 0.714353004508445,
"grad_norm": 1.6214655041789812,
"learning_rate": 1.9888743167216493e-06,
"loss": 0.4074,
"step": 1327
},
{
"epoch": 0.7148913262902901,
"grad_norm": 1.8595682807437428,
"learning_rate": 1.9819160583735077e-06,
"loss": 0.4494,
"step": 1328
},
{
"epoch": 0.7154296480721352,
"grad_norm": 1.4662467013475076,
"learning_rate": 1.974966983425852e-06,
"loss": 0.4066,
"step": 1329
},
{
"epoch": 0.7159679698539803,
"grad_norm": 2.5261174973160716,
"learning_rate": 1.9680271130232693e-06,
"loss": 0.4394,
"step": 1330
},
{
"epoch": 0.7165062916358254,
"grad_norm": 1.8084272539130577,
"learning_rate": 1.9610964682823407e-06,
"loss": 0.4601,
"step": 1331
},
{
"epoch": 0.7170446134176705,
"grad_norm": 1.820018846201368,
"learning_rate": 1.9541750702915706e-06,
"loss": 0.4446,
"step": 1332
},
{
"epoch": 0.7175829351995155,
"grad_norm": 1.3923517314522877,
"learning_rate": 1.9472629401113325e-06,
"loss": 0.3857,
"step": 1333
},
{
"epoch": 0.7181212569813606,
"grad_norm": 1.527238991242769,
"learning_rate": 1.9403600987737976e-06,
"loss": 0.4381,
"step": 1334
},
{
"epoch": 0.7186595787632057,
"grad_norm": 1.4006251254778943,
"learning_rate": 1.9334665672828736e-06,
"loss": 0.4332,
"step": 1335
},
{
"epoch": 0.7191979005450508,
"grad_norm": 2.1367769390904,
"learning_rate": 1.926582366614141e-06,
"loss": 0.4331,
"step": 1336
},
{
"epoch": 0.7197362223268959,
"grad_norm": 1.661348731930383,
"learning_rate": 1.9197075177147866e-06,
"loss": 0.4877,
"step": 1337
},
{
"epoch": 0.720274544108741,
"grad_norm": 1.4928525414429736,
"learning_rate": 1.9128420415035442e-06,
"loss": 0.4239,
"step": 1338
},
{
"epoch": 0.7208128658905861,
"grad_norm": 1.533499882863047,
"learning_rate": 1.9059859588706287e-06,
"loss": 0.3951,
"step": 1339
},
{
"epoch": 0.7213511876724312,
"grad_norm": 1.8392687775713348,
"learning_rate": 1.8991392906776668e-06,
"loss": 0.4395,
"step": 1340
},
{
"epoch": 0.7218895094542763,
"grad_norm": 1.573889490157054,
"learning_rate": 1.8923020577576452e-06,
"loss": 0.4162,
"step": 1341
},
{
"epoch": 0.7224278312361214,
"grad_norm": 1.5526149616819422,
"learning_rate": 1.885474280914838e-06,
"loss": 0.4579,
"step": 1342
},
{
"epoch": 0.7229661530179665,
"grad_norm": 1.5191810245344743,
"learning_rate": 1.8786559809247485e-06,
"loss": 0.4216,
"step": 1343
},
{
"epoch": 0.7235044747998116,
"grad_norm": 1.5555786435185341,
"learning_rate": 1.8718471785340414e-06,
"loss": 0.4122,
"step": 1344
},
{
"epoch": 0.7240427965816567,
"grad_norm": 1.3557551585285899,
"learning_rate": 1.8650478944604844e-06,
"loss": 0.3932,
"step": 1345
},
{
"epoch": 0.7245811183635018,
"grad_norm": 1.4728885839955113,
"learning_rate": 1.8582581493928837e-06,
"loss": 0.4934,
"step": 1346
},
{
"epoch": 0.7251194401453469,
"grad_norm": 1.5560703862712066,
"learning_rate": 1.8514779639910152e-06,
"loss": 0.4565,
"step": 1347
},
{
"epoch": 0.725657761927192,
"grad_norm": 1.4005810948444959,
"learning_rate": 1.8447073588855707e-06,
"loss": 0.45,
"step": 1348
},
{
"epoch": 0.7261960837090371,
"grad_norm": 1.4372886671511238,
"learning_rate": 1.8379463546780923e-06,
"loss": 0.4076,
"step": 1349
},
{
"epoch": 0.7267344054908822,
"grad_norm": 1.3561213817272149,
"learning_rate": 1.8311949719409056e-06,
"loss": 0.3991,
"step": 1350
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.592180627183088,
"learning_rate": 1.824453231217062e-06,
"loss": 0.4395,
"step": 1351
},
{
"epoch": 0.7278110490545724,
"grad_norm": 1.674234401633556,
"learning_rate": 1.8177211530202733e-06,
"loss": 0.5076,
"step": 1352
},
{
"epoch": 0.7283493708364175,
"grad_norm": 1.3869830990008478,
"learning_rate": 1.8109987578348504e-06,
"loss": 0.3823,
"step": 1353
},
{
"epoch": 0.7288876926182626,
"grad_norm": 1.8958736579636137,
"learning_rate": 1.8042860661156425e-06,
"loss": 0.4283,
"step": 1354
},
{
"epoch": 0.7294260144001077,
"grad_norm": 2.277391563720137,
"learning_rate": 1.7975830982879688e-06,
"loss": 0.4344,
"step": 1355
},
{
"epoch": 0.7299643361819528,
"grad_norm": 1.3788436987213148,
"learning_rate": 1.7908898747475656e-06,
"loss": 0.42,
"step": 1356
},
{
"epoch": 0.7305026579637979,
"grad_norm": 1.472584181988221,
"learning_rate": 1.784206415860516e-06,
"loss": 0.4554,
"step": 1357
},
{
"epoch": 0.731040979745643,
"grad_norm": 1.441497867695086,
"learning_rate": 1.7775327419631938e-06,
"loss": 0.3914,
"step": 1358
},
{
"epoch": 0.7315793015274881,
"grad_norm": 1.413962400530734,
"learning_rate": 1.7708688733621971e-06,
"loss": 0.4271,
"step": 1359
},
{
"epoch": 0.7321176233093332,
"grad_norm": 1.467777866704718,
"learning_rate": 1.7642148303342894e-06,
"loss": 0.4613,
"step": 1360
},
{
"epoch": 0.7326559450911783,
"grad_norm": 1.4588809601870538,
"learning_rate": 1.7575706331263392e-06,
"loss": 0.3732,
"step": 1361
},
{
"epoch": 0.7331942668730234,
"grad_norm": 1.9984141502445067,
"learning_rate": 1.7509363019552506e-06,
"loss": 0.4337,
"step": 1362
},
{
"epoch": 0.7337325886548685,
"grad_norm": 1.7211596185425657,
"learning_rate": 1.744311857007912e-06,
"loss": 0.4237,
"step": 1363
},
{
"epoch": 0.7342709104367136,
"grad_norm": 1.3275340316554045,
"learning_rate": 1.7376973184411294e-06,
"loss": 0.4026,
"step": 1364
},
{
"epoch": 0.7348092322185587,
"grad_norm": 1.3704150312314805,
"learning_rate": 1.7310927063815647e-06,
"loss": 0.4221,
"step": 1365
},
{
"epoch": 0.7353475540004037,
"grad_norm": 1.6240778919766734,
"learning_rate": 1.7244980409256768e-06,
"loss": 0.3956,
"step": 1366
},
{
"epoch": 0.7358858757822488,
"grad_norm": 1.5916150137066967,
"learning_rate": 1.7179133421396571e-06,
"loss": 0.449,
"step": 1367
},
{
"epoch": 0.7364241975640939,
"grad_norm": 1.3674325981426028,
"learning_rate": 1.7113386300593749e-06,
"loss": 0.469,
"step": 1368
},
{
"epoch": 0.736962519345939,
"grad_norm": 1.823579935483228,
"learning_rate": 1.7047739246903044e-06,
"loss": 0.4256,
"step": 1369
},
{
"epoch": 0.7375008411277841,
"grad_norm": 1.5992570631473233,
"learning_rate": 1.6982192460074787e-06,
"loss": 0.4364,
"step": 1370
},
{
"epoch": 0.7380391629096292,
"grad_norm": 1.83556587779534,
"learning_rate": 1.6916746139554186e-06,
"loss": 0.462,
"step": 1371
},
{
"epoch": 0.7385774846914743,
"grad_norm": 1.63962319033326,
"learning_rate": 1.6851400484480757e-06,
"loss": 0.4647,
"step": 1372
},
{
"epoch": 0.7391158064733194,
"grad_norm": 1.489565256988372,
"learning_rate": 1.6786155693687712e-06,
"loss": 0.4391,
"step": 1373
},
{
"epoch": 0.7396541282551645,
"grad_norm": 1.8781762497357959,
"learning_rate": 1.6721011965701344e-06,
"loss": 0.4429,
"step": 1374
},
{
"epoch": 0.7401924500370096,
"grad_norm": 1.394724821422672,
"learning_rate": 1.6655969498740455e-06,
"loss": 0.3781,
"step": 1375
},
{
"epoch": 0.7407307718188547,
"grad_norm": 1.7954529740174663,
"learning_rate": 1.6591028490715722e-06,
"loss": 0.4437,
"step": 1376
},
{
"epoch": 0.7412690936006998,
"grad_norm": 1.5625366322113399,
"learning_rate": 1.6526189139229072e-06,
"loss": 0.4221,
"step": 1377
},
{
"epoch": 0.7418074153825449,
"grad_norm": 1.49000718617141,
"learning_rate": 1.6461451641573156e-06,
"loss": 0.3824,
"step": 1378
},
{
"epoch": 0.74234573716439,
"grad_norm": 1.5501486593751905,
"learning_rate": 1.639681619473069e-06,
"loss": 0.4316,
"step": 1379
},
{
"epoch": 0.7428840589462351,
"grad_norm": 1.6012264627466746,
"learning_rate": 1.6332282995373867e-06,
"loss": 0.4414,
"step": 1380
},
{
"epoch": 0.7428840589462351,
"eval_loss": 0.4260067939758301,
"eval_runtime": 1520.5135,
"eval_samples_per_second": 16.448,
"eval_steps_per_second": 0.514,
"step": 1380
},
{
"epoch": 0.7434223807280802,
"grad_norm": 1.3868379821786618,
"learning_rate": 1.6267852239863763e-06,
"loss": 0.3962,
"step": 1381
},
{
"epoch": 0.7439607025099253,
"grad_norm": 1.563201406467786,
"learning_rate": 1.6203524124249742e-06,
"loss": 0.4359,
"step": 1382
},
{
"epoch": 0.7444990242917704,
"grad_norm": 2.0744885451879895,
"learning_rate": 1.613929884426887e-06,
"loss": 0.472,
"step": 1383
},
{
"epoch": 0.7450373460736155,
"grad_norm": 1.7165383734256863,
"learning_rate": 1.607517659534526e-06,
"loss": 0.4449,
"step": 1384
},
{
"epoch": 0.7455756678554606,
"grad_norm": 1.420966932605389,
"learning_rate": 1.6011157572589565e-06,
"loss": 0.4594,
"step": 1385
},
{
"epoch": 0.7461139896373057,
"grad_norm": 1.3843843466818937,
"learning_rate": 1.5947241970798332e-06,
"loss": 0.4021,
"step": 1386
},
{
"epoch": 0.7466523114191508,
"grad_norm": 2.021869994898455,
"learning_rate": 1.588342998445342e-06,
"loss": 0.4973,
"step": 1387
},
{
"epoch": 0.7471906332009959,
"grad_norm": 1.6308202289723368,
"learning_rate": 1.58197218077214e-06,
"loss": 0.4448,
"step": 1388
},
{
"epoch": 0.747728954982841,
"grad_norm": 1.5609319044422376,
"learning_rate": 1.5756117634452977e-06,
"loss": 0.4512,
"step": 1389
},
{
"epoch": 0.7482672767646861,
"grad_norm": 1.3798571945954525,
"learning_rate": 1.5692617658182402e-06,
"loss": 0.4332,
"step": 1390
},
{
"epoch": 0.7488055985465312,
"grad_norm": 1.5464889993436788,
"learning_rate": 1.5629222072126888e-06,
"loss": 0.4716,
"step": 1391
},
{
"epoch": 0.7493439203283763,
"grad_norm": 1.7517747662085987,
"learning_rate": 1.5565931069185946e-06,
"loss": 0.4305,
"step": 1392
},
{
"epoch": 0.7498822421102214,
"grad_norm": 1.5029346054542445,
"learning_rate": 1.5502744841940936e-06,
"loss": 0.4657,
"step": 1393
},
{
"epoch": 0.7504205638920665,
"grad_norm": 1.3544718143048395,
"learning_rate": 1.543966358265438e-06,
"loss": 0.418,
"step": 1394
},
{
"epoch": 0.7509588856739116,
"grad_norm": 1.52275975192662,
"learning_rate": 1.5376687483269404e-06,
"loss": 0.3732,
"step": 1395
},
{
"epoch": 0.7514972074557567,
"grad_norm": 1.691512607761959,
"learning_rate": 1.5313816735409148e-06,
"loss": 0.4606,
"step": 1396
},
{
"epoch": 0.7520355292376018,
"grad_norm": 1.6421517222533963,
"learning_rate": 1.5251051530376199e-06,
"loss": 0.413,
"step": 1397
},
{
"epoch": 0.7525738510194468,
"grad_norm": 1.7994036447279773,
"learning_rate": 1.518839205915202e-06,
"loss": 0.4167,
"step": 1398
},
{
"epoch": 0.753112172801292,
"grad_norm": 1.4116743542426848,
"learning_rate": 1.5125838512396278e-06,
"loss": 0.4502,
"step": 1399
},
{
"epoch": 0.753650494583137,
"grad_norm": 2.9318193198163414,
"learning_rate": 1.5063391080446404e-06,
"loss": 0.4523,
"step": 1400
},
{
"epoch": 0.7541888163649821,
"grad_norm": 1.3582596783082035,
"learning_rate": 1.500104995331692e-06,
"loss": 0.3758,
"step": 1401
},
{
"epoch": 0.7547271381468272,
"grad_norm": 2.1921211591651435,
"learning_rate": 1.493881532069889e-06,
"loss": 0.4725,
"step": 1402
},
{
"epoch": 0.7552654599286723,
"grad_norm": 1.5078767590789557,
"learning_rate": 1.487668737195932e-06,
"loss": 0.4137,
"step": 1403
},
{
"epoch": 0.7558037817105174,
"grad_norm": 1.7747344554372293,
"learning_rate": 1.4814666296140617e-06,
"loss": 0.4519,
"step": 1404
},
{
"epoch": 0.7563421034923625,
"grad_norm": 1.4869616706516326,
"learning_rate": 1.4752752281960003e-06,
"loss": 0.3805,
"step": 1405
},
{
"epoch": 0.7568804252742076,
"grad_norm": 1.688795973706041,
"learning_rate": 1.4690945517808897e-06,
"loss": 0.4993,
"step": 1406
},
{
"epoch": 0.7574187470560527,
"grad_norm": 1.583736337415557,
"learning_rate": 1.4629246191752406e-06,
"loss": 0.4382,
"step": 1407
},
{
"epoch": 0.7579570688378978,
"grad_norm": 1.405921968173557,
"learning_rate": 1.4567654491528732e-06,
"loss": 0.3952,
"step": 1408
},
{
"epoch": 0.7584953906197429,
"grad_norm": 1.3449184128012615,
"learning_rate": 1.4506170604548575e-06,
"loss": 0.4443,
"step": 1409
},
{
"epoch": 0.759033712401588,
"grad_norm": 1.5849926738123288,
"learning_rate": 1.4444794717894596e-06,
"loss": 0.4131,
"step": 1410
},
{
"epoch": 0.7595720341834331,
"grad_norm": 1.6555281403636608,
"learning_rate": 1.4383527018320825e-06,
"loss": 0.4414,
"step": 1411
},
{
"epoch": 0.7601103559652782,
"grad_norm": 1.6263621942357136,
"learning_rate": 1.432236769225211e-06,
"loss": 0.4346,
"step": 1412
},
{
"epoch": 0.7606486777471233,
"grad_norm": 2.0460094225135044,
"learning_rate": 1.426131692578354e-06,
"loss": 0.4493,
"step": 1413
},
{
"epoch": 0.7611869995289684,
"grad_norm": 1.472378438798274,
"learning_rate": 1.4200374904679853e-06,
"loss": 0.4562,
"step": 1414
},
{
"epoch": 0.7617253213108135,
"grad_norm": 1.7242311556580157,
"learning_rate": 1.413954181437493e-06,
"loss": 0.4043,
"step": 1415
},
{
"epoch": 0.7622636430926586,
"grad_norm": 1.6120964716761355,
"learning_rate": 1.4078817839971193e-06,
"loss": 0.4815,
"step": 1416
},
{
"epoch": 0.7628019648745037,
"grad_norm": 2.00633033152504,
"learning_rate": 1.4018203166239032e-06,
"loss": 0.5084,
"step": 1417
},
{
"epoch": 0.7633402866563488,
"grad_norm": 1.593451139015103,
"learning_rate": 1.3957697977616275e-06,
"loss": 0.4089,
"step": 1418
},
{
"epoch": 0.7638786084381939,
"grad_norm": 1.520947317999593,
"learning_rate": 1.38973024582076e-06,
"loss": 0.4204,
"step": 1419
},
{
"epoch": 0.764416930220039,
"grad_norm": 1.5671907812915762,
"learning_rate": 1.3837016791784002e-06,
"loss": 0.4011,
"step": 1420
},
{
"epoch": 0.7649552520018841,
"grad_norm": 2.3136360187940435,
"learning_rate": 1.3776841161782174e-06,
"loss": 0.5217,
"step": 1421
},
{
"epoch": 0.7654935737837292,
"grad_norm": 1.6259616459954453,
"learning_rate": 1.3716775751304024e-06,
"loss": 0.4094,
"step": 1422
},
{
"epoch": 0.7660318955655743,
"grad_norm": 1.2851781752532265,
"learning_rate": 1.365682074311609e-06,
"loss": 0.4371,
"step": 1423
},
{
"epoch": 0.7665702173474194,
"grad_norm": 1.6356127807123704,
"learning_rate": 1.3596976319648957e-06,
"loss": 0.4305,
"step": 1424
},
{
"epoch": 0.7671085391292645,
"grad_norm": 1.7847217896835836,
"learning_rate": 1.3537242662996741e-06,
"loss": 0.4228,
"step": 1425
},
{
"epoch": 0.7676468609111096,
"grad_norm": 1.9347446509271482,
"learning_rate": 1.347761995491651e-06,
"loss": 0.3528,
"step": 1426
},
{
"epoch": 0.7681851826929547,
"grad_norm": 1.7975930657160712,
"learning_rate": 1.3418108376827738e-06,
"loss": 0.4782,
"step": 1427
},
{
"epoch": 0.7687235044747998,
"grad_norm": 1.4744627345322843,
"learning_rate": 1.3358708109811775e-06,
"loss": 0.3919,
"step": 1428
},
{
"epoch": 0.769261826256645,
"grad_norm": 2.7855979759464926,
"learning_rate": 1.3299419334611213e-06,
"loss": 0.4646,
"step": 1429
},
{
"epoch": 0.7698001480384901,
"grad_norm": 1.4805916259048137,
"learning_rate": 1.324024223162947e-06,
"loss": 0.3906,
"step": 1430
},
{
"epoch": 0.7703384698203352,
"grad_norm": 1.7443733531704324,
"learning_rate": 1.3181176980930133e-06,
"loss": 0.4046,
"step": 1431
},
{
"epoch": 0.7708767916021803,
"grad_norm": 1.3403811088010225,
"learning_rate": 1.3122223762236446e-06,
"loss": 0.4585,
"step": 1432
},
{
"epoch": 0.7714151133840254,
"grad_norm": 1.8083215069181602,
"learning_rate": 1.306338275493077e-06,
"loss": 0.4488,
"step": 1433
},
{
"epoch": 0.7719534351658704,
"grad_norm": 2.257570529751952,
"learning_rate": 1.3004654138054035e-06,
"loss": 0.4411,
"step": 1434
},
{
"epoch": 0.7724917569477155,
"grad_norm": 1.5282453915471157,
"learning_rate": 1.2946038090305186e-06,
"loss": 0.3982,
"step": 1435
},
{
"epoch": 0.7730300787295606,
"grad_norm": 1.3350543760395588,
"learning_rate": 1.2887534790040623e-06,
"loss": 0.3529,
"step": 1436
},
{
"epoch": 0.7735684005114057,
"grad_norm": 1.5872897107277366,
"learning_rate": 1.2829144415273703e-06,
"loss": 0.4175,
"step": 1437
},
{
"epoch": 0.7741067222932508,
"grad_norm": 1.461133941363055,
"learning_rate": 1.2770867143674176e-06,
"loss": 0.4225,
"step": 1438
},
{
"epoch": 0.7746450440750959,
"grad_norm": 1.977273812214763,
"learning_rate": 1.2712703152567634e-06,
"loss": 0.3955,
"step": 1439
},
{
"epoch": 0.775183365856941,
"grad_norm": 1.6743349069669249,
"learning_rate": 1.2654652618934977e-06,
"loss": 0.3861,
"step": 1440
},
{
"epoch": 0.775183365856941,
"eval_loss": 0.42436715960502625,
"eval_runtime": 1522.7354,
"eval_samples_per_second": 16.424,
"eval_steps_per_second": 0.514,
"step": 1440
},
{
"epoch": 0.7757216876387861,
"grad_norm": 1.499262565396223,
"learning_rate": 1.2596715719411877e-06,
"loss": 0.4024,
"step": 1441
},
{
"epoch": 0.7762600094206312,
"grad_norm": 1.6235233768215886,
"learning_rate": 1.253889263028827e-06,
"loss": 0.3789,
"step": 1442
},
{
"epoch": 0.7767983312024763,
"grad_norm": 1.4115144384917186,
"learning_rate": 1.2481183527507734e-06,
"loss": 0.4605,
"step": 1443
},
{
"epoch": 0.7773366529843214,
"grad_norm": 1.4061010836073027,
"learning_rate": 1.2423588586667058e-06,
"loss": 0.394,
"step": 1444
},
{
"epoch": 0.7778749747661665,
"grad_norm": 1.4756730352326592,
"learning_rate": 1.2366107983015636e-06,
"loss": 0.3997,
"step": 1445
},
{
"epoch": 0.7784132965480116,
"grad_norm": 1.7767670811956109,
"learning_rate": 1.2308741891454978e-06,
"loss": 0.4388,
"step": 1446
},
{
"epoch": 0.7789516183298567,
"grad_norm": 1.9567881229548667,
"learning_rate": 1.2251490486538143e-06,
"loss": 0.4457,
"step": 1447
},
{
"epoch": 0.7794899401117018,
"grad_norm": 1.7149877959759003,
"learning_rate": 1.2194353942469217e-06,
"loss": 0.4482,
"step": 1448
},
{
"epoch": 0.7800282618935469,
"grad_norm": 1.5521839437257912,
"learning_rate": 1.2137332433102806e-06,
"loss": 0.469,
"step": 1449
},
{
"epoch": 0.780566583675392,
"grad_norm": 2.688209146479993,
"learning_rate": 1.2080426131943496e-06,
"loss": 0.3849,
"step": 1450
},
{
"epoch": 0.7811049054572371,
"grad_norm": 1.4274278905750635,
"learning_rate": 1.2023635212145262e-06,
"loss": 0.3923,
"step": 1451
},
{
"epoch": 0.7816432272390822,
"grad_norm": 1.5796240111966617,
"learning_rate": 1.1966959846511068e-06,
"loss": 0.4567,
"step": 1452
},
{
"epoch": 0.7821815490209273,
"grad_norm": 2.368565849047706,
"learning_rate": 1.191040020749223e-06,
"loss": 0.3885,
"step": 1453
},
{
"epoch": 0.7827198708027724,
"grad_norm": 1.7831232578884653,
"learning_rate": 1.1853956467187943e-06,
"loss": 0.3873,
"step": 1454
},
{
"epoch": 0.7832581925846175,
"grad_norm": 2.2089394022551363,
"learning_rate": 1.1797628797344752e-06,
"loss": 0.4341,
"step": 1455
},
{
"epoch": 0.7837965143664626,
"grad_norm": 1.7921663918566133,
"learning_rate": 1.1741417369356011e-06,
"loss": 0.4138,
"step": 1456
},
{
"epoch": 0.7843348361483077,
"grad_norm": 1.503278809860387,
"learning_rate": 1.1685322354261402e-06,
"loss": 0.4608,
"step": 1457
},
{
"epoch": 0.7848731579301528,
"grad_norm": 1.567305564830315,
"learning_rate": 1.1629343922746334e-06,
"loss": 0.4444,
"step": 1458
},
{
"epoch": 0.7854114797119979,
"grad_norm": 1.4431401966395603,
"learning_rate": 1.1573482245141525e-06,
"loss": 0.4353,
"step": 1459
},
{
"epoch": 0.785949801493843,
"grad_norm": 1.7031469874820835,
"learning_rate": 1.1517737491422415e-06,
"loss": 0.4433,
"step": 1460
},
{
"epoch": 0.7864881232756881,
"grad_norm": 1.9609977211459744,
"learning_rate": 1.1462109831208679e-06,
"loss": 0.4482,
"step": 1461
},
{
"epoch": 0.7870264450575332,
"grad_norm": 2.150596318263902,
"learning_rate": 1.1406599433763694e-06,
"loss": 0.4755,
"step": 1462
},
{
"epoch": 0.7875647668393783,
"grad_norm": 1.3265638431410287,
"learning_rate": 1.1351206467994018e-06,
"loss": 0.4102,
"step": 1463
},
{
"epoch": 0.7881030886212234,
"grad_norm": 4.188075621147485,
"learning_rate": 1.129593110244892e-06,
"loss": 0.3644,
"step": 1464
},
{
"epoch": 0.7886414104030685,
"grad_norm": 1.5439643283706193,
"learning_rate": 1.1240773505319824e-06,
"loss": 0.4707,
"step": 1465
},
{
"epoch": 0.7891797321849136,
"grad_norm": 1.695949064351043,
"learning_rate": 1.1185733844439778e-06,
"loss": 0.4506,
"step": 1466
},
{
"epoch": 0.7897180539667586,
"grad_norm": 1.4925323276596911,
"learning_rate": 1.113081228728301e-06,
"loss": 0.4062,
"step": 1467
},
{
"epoch": 0.7902563757486037,
"grad_norm": 1.810916777909123,
"learning_rate": 1.1076009000964384e-06,
"loss": 0.4617,
"step": 1468
},
{
"epoch": 0.7907946975304488,
"grad_norm": 1.5391006325796759,
"learning_rate": 1.102132415223886e-06,
"loss": 0.4341,
"step": 1469
},
{
"epoch": 0.7913330193122939,
"grad_norm": 1.3539603638585116,
"learning_rate": 1.0966757907501058e-06,
"loss": 0.4045,
"step": 1470
},
{
"epoch": 0.791871341094139,
"grad_norm": 1.585969494802185,
"learning_rate": 1.0912310432784673e-06,
"loss": 0.4889,
"step": 1471
},
{
"epoch": 0.7924096628759841,
"grad_norm": 1.3636312861290756,
"learning_rate": 1.0857981893762048e-06,
"loss": 0.4352,
"step": 1472
},
{
"epoch": 0.7929479846578292,
"grad_norm": 1.5823372906311277,
"learning_rate": 1.0803772455743572e-06,
"loss": 0.398,
"step": 1473
},
{
"epoch": 0.7934863064396743,
"grad_norm": 1.5278694836184388,
"learning_rate": 1.0749682283677288e-06,
"loss": 0.4228,
"step": 1474
},
{
"epoch": 0.7940246282215194,
"grad_norm": 1.1652690918407183,
"learning_rate": 1.0695711542148313e-06,
"loss": 0.3811,
"step": 1475
},
{
"epoch": 0.7945629500033645,
"grad_norm": 1.4886602129753284,
"learning_rate": 1.0641860395378367e-06,
"loss": 0.4037,
"step": 1476
},
{
"epoch": 0.7951012717852096,
"grad_norm": 1.5390850918633818,
"learning_rate": 1.0588129007225266e-06,
"loss": 0.3754,
"step": 1477
},
{
"epoch": 0.7956395935670547,
"grad_norm": 1.676720868561217,
"learning_rate": 1.0534517541182431e-06,
"loss": 0.4599,
"step": 1478
},
{
"epoch": 0.7961779153488998,
"grad_norm": 1.676144009500296,
"learning_rate": 1.0481026160378394e-06,
"loss": 0.4203,
"step": 1479
},
{
"epoch": 0.7967162371307449,
"grad_norm": 1.3949722623692342,
"learning_rate": 1.042765502757625e-06,
"loss": 0.4149,
"step": 1480
},
{
"epoch": 0.79725455891259,
"grad_norm": 1.6398344004557446,
"learning_rate": 1.0374404305173247e-06,
"loss": 0.4215,
"step": 1481
},
{
"epoch": 0.7977928806944351,
"grad_norm": 1.6715940485370635,
"learning_rate": 1.0321274155200234e-06,
"loss": 0.4393,
"step": 1482
},
{
"epoch": 0.7983312024762802,
"grad_norm": 1.395308837290767,
"learning_rate": 1.0268264739321194e-06,
"loss": 0.4398,
"step": 1483
},
{
"epoch": 0.7988695242581253,
"grad_norm": 1.6597231226511682,
"learning_rate": 1.0215376218832723e-06,
"loss": 0.4185,
"step": 1484
},
{
"epoch": 0.7994078460399704,
"grad_norm": 1.5059702316944186,
"learning_rate": 1.0162608754663572e-06,
"loss": 0.4428,
"step": 1485
},
{
"epoch": 0.7999461678218155,
"grad_norm": 1.774717767949121,
"learning_rate": 1.0109962507374139e-06,
"loss": 0.456,
"step": 1486
},
{
"epoch": 0.8004844896036606,
"grad_norm": 1.5763966693479707,
"learning_rate": 1.0057437637155997e-06,
"loss": 0.4742,
"step": 1487
},
{
"epoch": 0.8010228113855057,
"grad_norm": 1.66961890257069,
"learning_rate": 1.0005034303831352e-06,
"loss": 0.4479,
"step": 1488
},
{
"epoch": 0.8015611331673508,
"grad_norm": 1.4312052717987154,
"learning_rate": 9.95275266685264e-07,
"loss": 0.3894,
"step": 1489
},
{
"epoch": 0.8020994549491959,
"grad_norm": 1.5395533368166758,
"learning_rate": 9.900592885301986e-07,
"loss": 0.433,
"step": 1490
},
{
"epoch": 0.802637776731041,
"grad_norm": 1.7267038818610854,
"learning_rate": 9.848555117890734e-07,
"loss": 0.4399,
"step": 1491
},
{
"epoch": 0.8031760985128861,
"grad_norm": 1.588155903799363,
"learning_rate": 9.796639522958972e-07,
"loss": 0.4662,
"step": 1492
},
{
"epoch": 0.8037144202947312,
"grad_norm": 1.278378381771794,
"learning_rate": 9.744846258475032e-07,
"loss": 0.4023,
"step": 1493
},
{
"epoch": 0.8042527420765763,
"grad_norm": 1.630276962177858,
"learning_rate": 9.693175482035038e-07,
"loss": 0.4352,
"step": 1494
},
{
"epoch": 0.8047910638584214,
"grad_norm": 1.7375887913272672,
"learning_rate": 9.641627350862371e-07,
"loss": 0.4451,
"step": 1495
},
{
"epoch": 0.8053293856402665,
"grad_norm": 1.5671830810820253,
"learning_rate": 9.590202021807266e-07,
"loss": 0.4944,
"step": 1496
},
{
"epoch": 0.8058677074221116,
"grad_norm": 1.5984498803682108,
"learning_rate": 9.538899651346278e-07,
"loss": 0.4171,
"step": 1497
},
{
"epoch": 0.8064060292039567,
"grad_norm": 1.4646889528560627,
"learning_rate": 9.487720395581829e-07,
"loss": 0.3802,
"step": 1498
},
{
"epoch": 0.8069443509858018,
"grad_norm": 1.3512741257951366,
"learning_rate": 9.436664410241736e-07,
"loss": 0.4309,
"step": 1499
},
{
"epoch": 0.8074826727676468,
"grad_norm": 1.5243040161927932,
"learning_rate": 9.385731850678714e-07,
"loss": 0.4321,
"step": 1500
},
{
"epoch": 0.8074826727676468,
"eval_loss": 0.42280885577201843,
"eval_runtime": 1525.8015,
"eval_samples_per_second": 16.391,
"eval_steps_per_second": 0.513,
"step": 1500
},
{
"epoch": 0.8080209945494919,
"grad_norm": 1.7335916518675676,
"learning_rate": 9.334922871869933e-07,
"loss": 0.4613,
"step": 1501
},
{
"epoch": 0.808559316331337,
"grad_norm": 1.4183990627505498,
"learning_rate": 9.284237628416537e-07,
"loss": 0.4245,
"step": 1502
},
{
"epoch": 0.8090976381131821,
"grad_norm": 1.6705452727321846,
"learning_rate": 9.233676274543141e-07,
"loss": 0.4186,
"step": 1503
},
{
"epoch": 0.8096359598950272,
"grad_norm": 1.6195072788491132,
"learning_rate": 9.183238964097408e-07,
"loss": 0.4606,
"step": 1504
},
{
"epoch": 0.8101742816768723,
"grad_norm": 1.5392537994753088,
"learning_rate": 9.132925850549573e-07,
"loss": 0.4261,
"step": 1505
},
{
"epoch": 0.8107126034587174,
"grad_norm": 1.5937406024477896,
"learning_rate": 9.082737086991955e-07,
"loss": 0.378,
"step": 1506
},
{
"epoch": 0.8112509252405625,
"grad_norm": 1.6757621701627432,
"learning_rate": 9.0326728261385e-07,
"loss": 0.4782,
"step": 1507
},
{
"epoch": 0.8117892470224076,
"grad_norm": 2.005066048659624,
"learning_rate": 8.982733220324319e-07,
"loss": 0.4419,
"step": 1508
},
{
"epoch": 0.8123275688042527,
"grad_norm": 1.5506134684388948,
"learning_rate": 8.932918421505244e-07,
"loss": 0.4669,
"step": 1509
},
{
"epoch": 0.8128658905860978,
"grad_norm": 1.8474324824508042,
"learning_rate": 8.883228581257297e-07,
"loss": 0.4416,
"step": 1510
},
{
"epoch": 0.8134042123679429,
"grad_norm": 1.5536434524734581,
"learning_rate": 8.83366385077632e-07,
"loss": 0.4377,
"step": 1511
},
{
"epoch": 0.813942534149788,
"grad_norm": 1.399796692285853,
"learning_rate": 8.784224380877454e-07,
"loss": 0.4392,
"step": 1512
},
{
"epoch": 0.8144808559316331,
"grad_norm": 1.5556950965685121,
"learning_rate": 8.734910321994717e-07,
"loss": 0.406,
"step": 1513
},
{
"epoch": 0.8150191777134782,
"grad_norm": 1.5480188724931883,
"learning_rate": 8.685721824180499e-07,
"loss": 0.4433,
"step": 1514
},
{
"epoch": 0.8155574994953233,
"grad_norm": 1.4971651714962706,
"learning_rate": 8.636659037105149e-07,
"loss": 0.3966,
"step": 1515
},
{
"epoch": 0.8160958212771684,
"grad_norm": 1.6155911416639859,
"learning_rate": 8.587722110056529e-07,
"loss": 0.4212,
"step": 1516
},
{
"epoch": 0.8166341430590135,
"grad_norm": 1.976217129048654,
"learning_rate": 8.538911191939475e-07,
"loss": 0.4107,
"step": 1517
},
{
"epoch": 0.8171724648408586,
"grad_norm": 1.9846803772964912,
"learning_rate": 8.490226431275456e-07,
"loss": 0.4094,
"step": 1518
},
{
"epoch": 0.8177107866227037,
"grad_norm": 3.0586074935315133,
"learning_rate": 8.441667976202045e-07,
"loss": 0.4492,
"step": 1519
},
{
"epoch": 0.8182491084045488,
"grad_norm": 1.6149445557914077,
"learning_rate": 8.393235974472497e-07,
"loss": 0.4361,
"step": 1520
},
{
"epoch": 0.8187874301863939,
"grad_norm": 1.4631036764406664,
"learning_rate": 8.344930573455323e-07,
"loss": 0.4343,
"step": 1521
},
{
"epoch": 0.819325751968239,
"grad_norm": 1.3342306529935604,
"learning_rate": 8.296751920133794e-07,
"loss": 0.3546,
"step": 1522
},
{
"epoch": 0.8198640737500841,
"grad_norm": 2.0226246030817356,
"learning_rate": 8.248700161105483e-07,
"loss": 0.4281,
"step": 1523
},
{
"epoch": 0.8204023955319292,
"grad_norm": 1.9696807317895189,
"learning_rate": 8.200775442581893e-07,
"loss": 0.4215,
"step": 1524
},
{
"epoch": 0.8209407173137743,
"grad_norm": 1.4820095683603027,
"learning_rate": 8.152977910387955e-07,
"loss": 0.4928,
"step": 1525
},
{
"epoch": 0.8214790390956194,
"grad_norm": 1.5809021302001485,
"learning_rate": 8.105307709961602e-07,
"loss": 0.442,
"step": 1526
},
{
"epoch": 0.8220173608774645,
"grad_norm": 1.3682019844229378,
"learning_rate": 8.057764986353317e-07,
"loss": 0.448,
"step": 1527
},
{
"epoch": 0.8225556826593096,
"grad_norm": 1.6136391165039332,
"learning_rate": 8.010349884225699e-07,
"loss": 0.4458,
"step": 1528
},
{
"epoch": 0.8230940044411547,
"grad_norm": 1.2595845723052967,
"learning_rate": 7.963062547853023e-07,
"loss": 0.4014,
"step": 1529
},
{
"epoch": 0.8236323262229998,
"grad_norm": 2.650357568288943,
"learning_rate": 7.915903121120816e-07,
"loss": 0.4475,
"step": 1530
},
{
"epoch": 0.8241706480048449,
"grad_norm": 1.5993270434912978,
"learning_rate": 7.868871747525353e-07,
"loss": 0.3952,
"step": 1531
},
{
"epoch": 0.82470896978669,
"grad_norm": 1.5445035783730348,
"learning_rate": 7.821968570173321e-07,
"loss": 0.4546,
"step": 1532
},
{
"epoch": 0.825247291568535,
"grad_norm": 1.7600163478435773,
"learning_rate": 7.775193731781316e-07,
"loss": 0.3925,
"step": 1533
},
{
"epoch": 0.8257856133503801,
"grad_norm": 1.9376227278838558,
"learning_rate": 7.728547374675421e-07,
"loss": 0.4142,
"step": 1534
},
{
"epoch": 0.8263239351322252,
"grad_norm": 1.5661272939035957,
"learning_rate": 7.682029640790783e-07,
"loss": 0.408,
"step": 1535
},
{
"epoch": 0.8268622569140703,
"grad_norm": 1.7751314318755442,
"learning_rate": 7.635640671671168e-07,
"loss": 0.4748,
"step": 1536
},
{
"epoch": 0.8274005786959154,
"grad_norm": 1.4328800747976576,
"learning_rate": 7.589380608468549e-07,
"loss": 0.445,
"step": 1537
},
{
"epoch": 0.8279389004777605,
"grad_norm": 1.770544068666416,
"learning_rate": 7.543249591942647e-07,
"loss": 0.3877,
"step": 1538
},
{
"epoch": 0.8284772222596056,
"grad_norm": 1.4644257793154838,
"learning_rate": 7.497247762460535e-07,
"loss": 0.4729,
"step": 1539
},
{
"epoch": 0.8290155440414507,
"grad_norm": 2.0251569316621354,
"learning_rate": 7.451375259996196e-07,
"loss": 0.3926,
"step": 1540
},
{
"epoch": 0.8295538658232958,
"grad_norm": 1.5659705563939743,
"learning_rate": 7.405632224130094e-07,
"loss": 0.3978,
"step": 1541
},
{
"epoch": 0.8300921876051409,
"grad_norm": 1.5791357169071338,
"learning_rate": 7.360018794048757e-07,
"loss": 0.4482,
"step": 1542
},
{
"epoch": 0.830630509386986,
"grad_norm": 1.5219436138787439,
"learning_rate": 7.314535108544346e-07,
"loss": 0.3993,
"step": 1543
},
{
"epoch": 0.8311688311688312,
"grad_norm": 1.5116221556805869,
"learning_rate": 7.26918130601425e-07,
"loss": 0.4431,
"step": 1544
},
{
"epoch": 0.8317071529506763,
"grad_norm": 1.5355423700033741,
"learning_rate": 7.223957524460612e-07,
"loss": 0.3847,
"step": 1545
},
{
"epoch": 0.8322454747325214,
"grad_norm": 1.6301347275924607,
"learning_rate": 7.17886390148999e-07,
"loss": 0.4149,
"step": 1546
},
{
"epoch": 0.8327837965143665,
"grad_norm": 1.39164969438826,
"learning_rate": 7.133900574312885e-07,
"loss": 0.444,
"step": 1547
},
{
"epoch": 0.8333221182962116,
"grad_norm": 1.6360359120384138,
"learning_rate": 7.089067679743322e-07,
"loss": 0.4387,
"step": 1548
},
{
"epoch": 0.8338604400780567,
"grad_norm": 1.1463330927551836,
"learning_rate": 7.044365354198462e-07,
"loss": 0.367,
"step": 1549
},
{
"epoch": 0.8343987618599018,
"grad_norm": 1.3951952353250727,
"learning_rate": 6.999793733698168e-07,
"loss": 0.4537,
"step": 1550
},
{
"epoch": 0.8349370836417469,
"grad_norm": 1.444313279525601,
"learning_rate": 6.955352953864592e-07,
"loss": 0.4517,
"step": 1551
},
{
"epoch": 0.835475405423592,
"grad_norm": 1.4922885632634126,
"learning_rate": 6.91104314992177e-07,
"loss": 0.4182,
"step": 1552
},
{
"epoch": 0.8360137272054371,
"grad_norm": 1.361490120387784,
"learning_rate": 6.866864456695189e-07,
"loss": 0.3819,
"step": 1553
},
{
"epoch": 0.8365520489872822,
"grad_norm": 1.3785822196112183,
"learning_rate": 6.822817008611409e-07,
"loss": 0.4315,
"step": 1554
},
{
"epoch": 0.8370903707691273,
"grad_norm": 1.786812938484116,
"learning_rate": 6.778900939697642e-07,
"loss": 0.4352,
"step": 1555
},
{
"epoch": 0.8376286925509724,
"grad_norm": 1.51980814160385,
"learning_rate": 6.735116383581325e-07,
"loss": 0.4681,
"step": 1556
},
{
"epoch": 0.8381670143328175,
"grad_norm": 1.6909398106864937,
"learning_rate": 6.691463473489751e-07,
"loss": 0.3764,
"step": 1557
},
{
"epoch": 0.8387053361146626,
"grad_norm": 1.3032028525505768,
"learning_rate": 6.647942342249619e-07,
"loss": 0.4571,
"step": 1558
},
{
"epoch": 0.8392436578965077,
"grad_norm": 2.673478994173862,
"learning_rate": 6.604553122286672e-07,
"loss": 0.4424,
"step": 1559
},
{
"epoch": 0.8397819796783528,
"grad_norm": 1.8774151039134228,
"learning_rate": 6.561295945625246e-07,
"loss": 0.4289,
"step": 1560
},
{
"epoch": 0.8397819796783528,
"eval_loss": 0.42163270711898804,
"eval_runtime": 1532.1805,
"eval_samples_per_second": 16.323,
"eval_steps_per_second": 0.51,
"step": 1560
},
{
"epoch": 0.8403203014601979,
"grad_norm": 1.3658795551777532,
"learning_rate": 6.51817094388793e-07,
"loss": 0.4041,
"step": 1561
},
{
"epoch": 0.840858623242043,
"grad_norm": 2.0775682420189683,
"learning_rate": 6.475178248295111e-07,
"loss": 0.4626,
"step": 1562
},
{
"epoch": 0.8413969450238881,
"grad_norm": 2.0811838469436137,
"learning_rate": 6.432317989664599e-07,
"loss": 0.4316,
"step": 1563
},
{
"epoch": 0.8419352668057332,
"grad_norm": 1.6387122228577398,
"learning_rate": 6.389590298411236e-07,
"loss": 0.4198,
"step": 1564
},
{
"epoch": 0.8424735885875783,
"grad_norm": 1.6679858558099225,
"learning_rate": 6.346995304546482e-07,
"loss": 0.3999,
"step": 1565
},
{
"epoch": 0.8430119103694234,
"grad_norm": 1.4149904617289844,
"learning_rate": 6.304533137678026e-07,
"loss": 0.418,
"step": 1566
},
{
"epoch": 0.8435502321512685,
"grad_norm": 1.58157239985269,
"learning_rate": 6.262203927009403e-07,
"loss": 0.4279,
"step": 1567
},
{
"epoch": 0.8440885539331136,
"grad_norm": 1.7638599414290634,
"learning_rate": 6.220007801339562e-07,
"loss": 0.4042,
"step": 1568
},
{
"epoch": 0.8446268757149586,
"grad_norm": 1.5007385916657803,
"learning_rate": 6.17794488906252e-07,
"loss": 0.4402,
"step": 1569
},
{
"epoch": 0.8451651974968037,
"grad_norm": 1.345366896432651,
"learning_rate": 6.136015318166966e-07,
"loss": 0.3642,
"step": 1570
},
{
"epoch": 0.8457035192786488,
"grad_norm": 1.5235663558748846,
"learning_rate": 6.094219216235841e-07,
"loss": 0.3964,
"step": 1571
},
{
"epoch": 0.8462418410604939,
"grad_norm": 1.3657476470037149,
"learning_rate": 6.052556710445972e-07,
"loss": 0.3748,
"step": 1572
},
{
"epoch": 0.846780162842339,
"grad_norm": 1.4394596688138968,
"learning_rate": 6.011027927567681e-07,
"loss": 0.441,
"step": 1573
},
{
"epoch": 0.8473184846241841,
"grad_norm": 1.5318361149430813,
"learning_rate": 5.969632993964414e-07,
"loss": 0.4621,
"step": 1574
},
{
"epoch": 0.8478568064060292,
"grad_norm": 1.6075753885114712,
"learning_rate": 5.928372035592306e-07,
"loss": 0.4645,
"step": 1575
},
{
"epoch": 0.8483951281878743,
"grad_norm": 1.5722006469692726,
"learning_rate": 5.887245177999867e-07,
"loss": 0.4446,
"step": 1576
},
{
"epoch": 0.8489334499697194,
"grad_norm": 1.4551383751314828,
"learning_rate": 5.846252546327547e-07,
"loss": 0.43,
"step": 1577
},
{
"epoch": 0.8494717717515645,
"grad_norm": 1.4487392657122655,
"learning_rate": 5.805394265307391e-07,
"loss": 0.4032,
"step": 1578
},
{
"epoch": 0.8500100935334096,
"grad_norm": 1.6691803468661808,
"learning_rate": 5.764670459262622e-07,
"loss": 0.4328,
"step": 1579
},
{
"epoch": 0.8505484153152547,
"grad_norm": 1.6197190610235175,
"learning_rate": 5.724081252107311e-07,
"loss": 0.4045,
"step": 1580
},
{
"epoch": 0.8510867370970998,
"grad_norm": 1.6633094952520224,
"learning_rate": 5.683626767345951e-07,
"loss": 0.4271,
"step": 1581
},
{
"epoch": 0.8516250588789449,
"grad_norm": 1.3383638616282105,
"learning_rate": 5.6433071280731e-07,
"loss": 0.3742,
"step": 1582
},
{
"epoch": 0.85216338066079,
"grad_norm": 1.3573201978569531,
"learning_rate": 5.60312245697302e-07,
"loss": 0.355,
"step": 1583
},
{
"epoch": 0.8527017024426351,
"grad_norm": 1.5087600985731158,
"learning_rate": 5.563072876319292e-07,
"loss": 0.4275,
"step": 1584
},
{
"epoch": 0.8532400242244802,
"grad_norm": 1.9174671861368988,
"learning_rate": 5.523158507974452e-07,
"loss": 0.4523,
"step": 1585
},
{
"epoch": 0.8537783460063253,
"grad_norm": 1.2701535232392451,
"learning_rate": 5.483379473389599e-07,
"loss": 0.4157,
"step": 1586
},
{
"epoch": 0.8543166677881704,
"grad_norm": 1.3648674048032239,
"learning_rate": 5.443735893604041e-07,
"loss": 0.443,
"step": 1587
},
{
"epoch": 0.8548549895700155,
"grad_norm": 1.7303772028968518,
"learning_rate": 5.404227889244939e-07,
"loss": 0.3945,
"step": 1588
},
{
"epoch": 0.8553933113518606,
"grad_norm": 1.4650825399074572,
"learning_rate": 5.364855580526923e-07,
"loss": 0.4183,
"step": 1589
},
{
"epoch": 0.8559316331337057,
"grad_norm": 1.7612420028556155,
"learning_rate": 5.325619087251704e-07,
"loss": 0.4472,
"step": 1590
},
{
"epoch": 0.8564699549155508,
"grad_norm": 1.6090688100302808,
"learning_rate": 5.28651852880776e-07,
"loss": 0.4348,
"step": 1591
},
{
"epoch": 0.8570082766973959,
"grad_norm": 1.59025634923398,
"learning_rate": 5.247554024169949e-07,
"loss": 0.4132,
"step": 1592
},
{
"epoch": 0.857546598479241,
"grad_norm": 1.8249117304980227,
"learning_rate": 5.20872569189913e-07,
"loss": 0.415,
"step": 1593
},
{
"epoch": 0.8580849202610861,
"grad_norm": 1.3724204134525155,
"learning_rate": 5.170033650141837e-07,
"loss": 0.4645,
"step": 1594
},
{
"epoch": 0.8586232420429312,
"grad_norm": 2.066798117946357,
"learning_rate": 5.131478016629888e-07,
"loss": 0.4225,
"step": 1595
},
{
"epoch": 0.8591615638247763,
"grad_norm": 2.780252323052268,
"learning_rate": 5.093058908680043e-07,
"loss": 0.4048,
"step": 1596
},
{
"epoch": 0.8596998856066214,
"grad_norm": 1.4726854180656292,
"learning_rate": 5.054776443193626e-07,
"loss": 0.4337,
"step": 1597
},
{
"epoch": 0.8602382073884665,
"grad_norm": 1.7991832445280496,
"learning_rate": 5.016630736656213e-07,
"loss": 0.3871,
"step": 1598
},
{
"epoch": 0.8607765291703116,
"grad_norm": 1.6803342666413155,
"learning_rate": 4.978621905137238e-07,
"loss": 0.4332,
"step": 1599
},
{
"epoch": 0.8613148509521567,
"grad_norm": 1.4355251448306459,
"learning_rate": 4.940750064289657e-07,
"loss": 0.3924,
"step": 1600
},
{
"epoch": 0.8618531727340017,
"grad_norm": 1.3604897046592517,
"learning_rate": 4.903015329349581e-07,
"loss": 0.4057,
"step": 1601
},
{
"epoch": 0.8623914945158468,
"grad_norm": 1.6598958205265515,
"learning_rate": 4.865417815135958e-07,
"loss": 0.3885,
"step": 1602
},
{
"epoch": 0.8629298162976919,
"grad_norm": 1.4613049538096838,
"learning_rate": 4.827957636050179e-07,
"loss": 0.3922,
"step": 1603
},
{
"epoch": 0.863468138079537,
"grad_norm": 1.5965664706849296,
"learning_rate": 4.790634906075775e-07,
"loss": 0.4828,
"step": 1604
},
{
"epoch": 0.8640064598613821,
"grad_norm": 1.8120189192545764,
"learning_rate": 4.753449738778021e-07,
"loss": 0.429,
"step": 1605
},
{
"epoch": 0.8645447816432272,
"grad_norm": 1.8371969884713577,
"learning_rate": 4.716402247303631e-07,
"loss": 0.4074,
"step": 1606
},
{
"epoch": 0.8650831034250723,
"grad_norm": 1.5256250240541858,
"learning_rate": 4.6794925443804097e-07,
"loss": 0.4015,
"step": 1607
},
{
"epoch": 0.8656214252069174,
"grad_norm": 1.6504131905617414,
"learning_rate": 4.642720742316886e-07,
"loss": 0.4619,
"step": 1608
},
{
"epoch": 0.8661597469887625,
"grad_norm": 1.7464812669613627,
"learning_rate": 4.6060869530019983e-07,
"loss": 0.4537,
"step": 1609
},
{
"epoch": 0.8666980687706076,
"grad_norm": 1.8767060082708276,
"learning_rate": 4.569591287904723e-07,
"loss": 0.4612,
"step": 1610
},
{
"epoch": 0.8672363905524527,
"grad_norm": 1.3070105173969313,
"learning_rate": 4.5332338580737824e-07,
"loss": 0.3629,
"step": 1611
},
{
"epoch": 0.8677747123342978,
"grad_norm": 4.572221630177869,
"learning_rate": 4.4970147741372315e-07,
"loss": 0.4587,
"step": 1612
},
{
"epoch": 0.8683130341161429,
"grad_norm": 1.4960042467223587,
"learning_rate": 4.460934146302215e-07,
"loss": 0.4734,
"step": 1613
},
{
"epoch": 0.868851355897988,
"grad_norm": 1.9121190508560355,
"learning_rate": 4.424992084354551e-07,
"loss": 0.4016,
"step": 1614
},
{
"epoch": 0.8693896776798331,
"grad_norm": 1.706342167134769,
"learning_rate": 4.389188697658453e-07,
"loss": 0.4207,
"step": 1615
},
{
"epoch": 0.8699279994616782,
"grad_norm": 1.5621521598790504,
"learning_rate": 4.3535240951561695e-07,
"loss": 0.4101,
"step": 1616
},
{
"epoch": 0.8704663212435233,
"grad_norm": 1.4806315484210542,
"learning_rate": 4.3179983853676386e-07,
"loss": 0.4608,
"step": 1617
},
{
"epoch": 0.8710046430253684,
"grad_norm": 1.526083402719131,
"learning_rate": 4.2826116763902135e-07,
"loss": 0.4183,
"step": 1618
},
{
"epoch": 0.8715429648072135,
"grad_norm": 1.6689772565592038,
"learning_rate": 4.247364075898258e-07,
"loss": 0.4288,
"step": 1619
},
{
"epoch": 0.8720812865890586,
"grad_norm": 1.3834588776364911,
"learning_rate": 4.2122556911428744e-07,
"loss": 0.4032,
"step": 1620
},
{
"epoch": 0.8720812865890586,
"eval_loss": 0.42079228162765503,
"eval_runtime": 1541.5294,
"eval_samples_per_second": 16.224,
"eval_steps_per_second": 0.507,
"step": 1620
},
{
"epoch": 0.8726196083709037,
"grad_norm": 1.5791149363732657,
"learning_rate": 4.177286628951566e-07,
"loss": 0.4388,
"step": 1621
},
{
"epoch": 0.8731579301527488,
"grad_norm": 1.7565308716827732,
"learning_rate": 4.142456995727906e-07,
"loss": 0.4403,
"step": 1622
},
{
"epoch": 0.8736962519345939,
"grad_norm": 1.8536625820585364,
"learning_rate": 4.107766897451204e-07,
"loss": 0.377,
"step": 1623
},
{
"epoch": 0.874234573716439,
"grad_norm": 1.557798623706775,
"learning_rate": 4.073216439676203e-07,
"loss": 0.4099,
"step": 1624
},
{
"epoch": 0.8747728954982841,
"grad_norm": 1.5848805929742247,
"learning_rate": 4.0388057275327466e-07,
"loss": 0.4127,
"step": 1625
},
{
"epoch": 0.8753112172801292,
"grad_norm": 1.4737469672067065,
"learning_rate": 4.004534865725462e-07,
"loss": 0.4125,
"step": 1626
},
{
"epoch": 0.8758495390619743,
"grad_norm": 1.4866822244945306,
"learning_rate": 3.970403958533436e-07,
"loss": 0.4081,
"step": 1627
},
{
"epoch": 0.8763878608438194,
"grad_norm": 1.6255821682103373,
"learning_rate": 3.936413109809906e-07,
"loss": 0.4465,
"step": 1628
},
{
"epoch": 0.8769261826256645,
"grad_norm": 1.4642881317646486,
"learning_rate": 3.902562422981937e-07,
"loss": 0.4286,
"step": 1629
},
{
"epoch": 0.8774645044075096,
"grad_norm": 1.580573409189922,
"learning_rate": 3.8688520010501276e-07,
"loss": 0.4527,
"step": 1630
},
{
"epoch": 0.8780028261893547,
"grad_norm": 2.0543315708956387,
"learning_rate": 3.835281946588254e-07,
"loss": 0.4377,
"step": 1631
},
{
"epoch": 0.8785411479711998,
"grad_norm": 1.5115782436115135,
"learning_rate": 3.801852361743008e-07,
"loss": 0.4525,
"step": 1632
},
{
"epoch": 0.8790794697530449,
"grad_norm": 1.8374746527735237,
"learning_rate": 3.7685633482336504e-07,
"loss": 0.4242,
"step": 1633
},
{
"epoch": 0.87961779153489,
"grad_norm": 1.5036770046647692,
"learning_rate": 3.7354150073516947e-07,
"loss": 0.4474,
"step": 1634
},
{
"epoch": 0.880156113316735,
"grad_norm": 1.658882270187231,
"learning_rate": 3.702407439960648e-07,
"loss": 0.4321,
"step": 1635
},
{
"epoch": 0.8806944350985801,
"grad_norm": 1.6020319338410256,
"learning_rate": 3.669540746495653e-07,
"loss": 0.4212,
"step": 1636
},
{
"epoch": 0.8812327568804252,
"grad_norm": 1.7415071086793177,
"learning_rate": 3.636815026963214e-07,
"loss": 0.4229,
"step": 1637
},
{
"epoch": 0.8817710786622703,
"grad_norm": 1.328144623680027,
"learning_rate": 3.604230380940871e-07,
"loss": 0.4135,
"step": 1638
},
{
"epoch": 0.8823094004441154,
"grad_norm": 1.8361744282067538,
"learning_rate": 3.5717869075769187e-07,
"loss": 0.4448,
"step": 1639
},
{
"epoch": 0.8828477222259605,
"grad_norm": 1.4454157174291669,
"learning_rate": 3.5394847055900794e-07,
"loss": 0.4339,
"step": 1640
},
{
"epoch": 0.8833860440078056,
"grad_norm": 1.6322475345286311,
"learning_rate": 3.5073238732692305e-07,
"loss": 0.4176,
"step": 1641
},
{
"epoch": 0.8839243657896507,
"grad_norm": 1.445292085363601,
"learning_rate": 3.475304508473071e-07,
"loss": 0.4554,
"step": 1642
},
{
"epoch": 0.8844626875714958,
"grad_norm": 1.4938616353672438,
"learning_rate": 3.44342670862986e-07,
"loss": 0.4088,
"step": 1643
},
{
"epoch": 0.8850010093533409,
"grad_norm": 1.47760594711673,
"learning_rate": 3.411690570737097e-07,
"loss": 0.3793,
"step": 1644
},
{
"epoch": 0.885539331135186,
"grad_norm": 1.6041036008050786,
"learning_rate": 3.3800961913612427e-07,
"loss": 0.4648,
"step": 1645
},
{
"epoch": 0.8860776529170311,
"grad_norm": 1.6055085861001368,
"learning_rate": 3.3486436666374024e-07,
"loss": 0.3958,
"step": 1646
},
{
"epoch": 0.8866159746988762,
"grad_norm": 1.592597656491022,
"learning_rate": 3.3173330922690594e-07,
"loss": 0.4534,
"step": 1647
},
{
"epoch": 0.8871542964807213,
"grad_norm": 1.3972942678399092,
"learning_rate": 3.2861645635277715e-07,
"loss": 0.4075,
"step": 1648
},
{
"epoch": 0.8876926182625664,
"grad_norm": 1.299571800868061,
"learning_rate": 3.255138175252859e-07,
"loss": 0.4322,
"step": 1649
},
{
"epoch": 0.8882309400444115,
"grad_norm": 1.6074089216828915,
"learning_rate": 3.22425402185117e-07,
"loss": 0.4442,
"step": 1650
},
{
"epoch": 0.8887692618262566,
"grad_norm": 1.6515277192815747,
"learning_rate": 3.1935121972967387e-07,
"loss": 0.3974,
"step": 1651
},
{
"epoch": 0.8893075836081017,
"grad_norm": 1.9560867162587892,
"learning_rate": 3.1629127951305407e-07,
"loss": 0.4419,
"step": 1652
},
{
"epoch": 0.8898459053899468,
"grad_norm": 1.4109620050170866,
"learning_rate": 3.132455908460175e-07,
"loss": 0.4006,
"step": 1653
},
{
"epoch": 0.8903842271717919,
"grad_norm": 1.3778369174445322,
"learning_rate": 3.1021416299595985e-07,
"loss": 0.3917,
"step": 1654
},
{
"epoch": 0.890922548953637,
"grad_norm": 1.7547858079840999,
"learning_rate": 3.0719700518688447e-07,
"loss": 0.4698,
"step": 1655
},
{
"epoch": 0.8914608707354821,
"grad_norm": 1.5659476763978994,
"learning_rate": 3.0419412659937477e-07,
"loss": 0.4172,
"step": 1656
},
{
"epoch": 0.8919991925173272,
"grad_norm": 3.093400384631848,
"learning_rate": 3.0120553637056293e-07,
"loss": 0.3883,
"step": 1657
},
{
"epoch": 0.8925375142991724,
"grad_norm": 1.4466790084982413,
"learning_rate": 2.9823124359410706e-07,
"loss": 0.391,
"step": 1658
},
{
"epoch": 0.8930758360810175,
"grad_norm": 1.2602029099448362,
"learning_rate": 2.9527125732015995e-07,
"loss": 0.41,
"step": 1659
},
{
"epoch": 0.8936141578628626,
"grad_norm": 1.5682198116188635,
"learning_rate": 2.923255865553432e-07,
"loss": 0.4361,
"step": 1660
},
{
"epoch": 0.8941524796447077,
"grad_norm": 1.7284038118874672,
"learning_rate": 2.8939424026271923e-07,
"loss": 0.4248,
"step": 1661
},
{
"epoch": 0.8946908014265528,
"grad_norm": 1.4256983828332148,
"learning_rate": 2.8647722736176333e-07,
"loss": 0.4291,
"step": 1662
},
{
"epoch": 0.8952291232083979,
"grad_norm": 1.4976102627551229,
"learning_rate": 2.8357455672833933e-07,
"loss": 0.3813,
"step": 1663
},
{
"epoch": 0.895767444990243,
"grad_norm": 1.8854495681463317,
"learning_rate": 2.8068623719466725e-07,
"loss": 0.4516,
"step": 1664
},
{
"epoch": 0.8963057667720881,
"grad_norm": 1.5693149002013742,
"learning_rate": 2.7781227754930253e-07,
"loss": 0.4585,
"step": 1665
},
{
"epoch": 0.8968440885539332,
"grad_norm": 1.573734503341506,
"learning_rate": 2.7495268653710493e-07,
"loss": 0.4483,
"step": 1666
},
{
"epoch": 0.8973824103357783,
"grad_norm": 1.5481263062327042,
"learning_rate": 2.7210747285921435e-07,
"loss": 0.4468,
"step": 1667
},
{
"epoch": 0.8979207321176234,
"grad_norm": 1.7822442462595496,
"learning_rate": 2.692766451730233e-07,
"loss": 0.4234,
"step": 1668
},
{
"epoch": 0.8984590538994685,
"grad_norm": 1.8797060608535148,
"learning_rate": 2.6646021209215003e-07,
"loss": 0.4063,
"step": 1669
},
{
"epoch": 0.8989973756813135,
"grad_norm": 1.4047802142985153,
"learning_rate": 2.636581821864148e-07,
"loss": 0.3933,
"step": 1670
},
{
"epoch": 0.8995356974631586,
"grad_norm": 1.9919594742667397,
"learning_rate": 2.6087056398180823e-07,
"loss": 0.4259,
"step": 1671
},
{
"epoch": 0.9000740192450037,
"grad_norm": 1.439697905572551,
"learning_rate": 2.580973659604735e-07,
"loss": 0.4234,
"step": 1672
},
{
"epoch": 0.9006123410268488,
"grad_norm": 1.4340034850095604,
"learning_rate": 2.553385965606736e-07,
"loss": 0.4011,
"step": 1673
},
{
"epoch": 0.9011506628086939,
"grad_norm": 1.6008407880111504,
"learning_rate": 2.525942641767687e-07,
"loss": 0.4064,
"step": 1674
},
{
"epoch": 0.901688984590539,
"grad_norm": 1.393769083088064,
"learning_rate": 2.498643771591908e-07,
"loss": 0.3878,
"step": 1675
},
{
"epoch": 0.9022273063723841,
"grad_norm": 1.5473000323872435,
"learning_rate": 2.47148943814417e-07,
"loss": 0.4125,
"step": 1676
},
{
"epoch": 0.9027656281542292,
"grad_norm": 1.504947787937997,
"learning_rate": 2.4444797240494533e-07,
"loss": 0.4328,
"step": 1677
},
{
"epoch": 0.9033039499360743,
"grad_norm": 1.8071042005817233,
"learning_rate": 2.4176147114927e-07,
"loss": 0.4429,
"step": 1678
},
{
"epoch": 0.9038422717179194,
"grad_norm": 1.5975781936612632,
"learning_rate": 2.3908944822185144e-07,
"loss": 0.4279,
"step": 1679
},
{
"epoch": 0.9043805934997645,
"grad_norm": 1.4408734852067904,
"learning_rate": 2.364319117531011e-07,
"loss": 0.404,
"step": 1680
},
{
"epoch": 0.9043805934997645,
"eval_loss": 0.42025431990623474,
"eval_runtime": 1550.3923,
"eval_samples_per_second": 16.131,
"eval_steps_per_second": 0.504,
"step": 1680
},
{
"epoch": 0.9049189152816096,
"grad_norm": 1.6629310324181896,
"learning_rate": 2.3378886982934778e-07,
"loss": 0.4876,
"step": 1681
},
{
"epoch": 0.9054572370634547,
"grad_norm": 1.5275509334845596,
"learning_rate": 2.311603304928173e-07,
"loss": 0.4428,
"step": 1682
},
{
"epoch": 0.9059955588452998,
"grad_norm": 1.6372832685609333,
"learning_rate": 2.285463017416073e-07,
"loss": 0.4815,
"step": 1683
},
{
"epoch": 0.9065338806271449,
"grad_norm": 1.846596894090347,
"learning_rate": 2.2594679152966258e-07,
"loss": 0.4724,
"step": 1684
},
{
"epoch": 0.90707220240899,
"grad_norm": 1.7091710123282846,
"learning_rate": 2.2336180776675154e-07,
"loss": 0.4447,
"step": 1685
},
{
"epoch": 0.9076105241908351,
"grad_norm": 1.4759554995733482,
"learning_rate": 2.2079135831843956e-07,
"loss": 0.4421,
"step": 1686
},
{
"epoch": 0.9081488459726802,
"grad_norm": 1.4044547819882969,
"learning_rate": 2.1823545100606914e-07,
"loss": 0.4438,
"step": 1687
},
{
"epoch": 0.9086871677545253,
"grad_norm": 1.6839786445608516,
"learning_rate": 2.1569409360673422e-07,
"loss": 0.4295,
"step": 1688
},
{
"epoch": 0.9092254895363704,
"grad_norm": 1.695687328944884,
"learning_rate": 2.131672938532553e-07,
"loss": 0.4001,
"step": 1689
},
{
"epoch": 0.9097638113182155,
"grad_norm": 1.6064285368620497,
"learning_rate": 2.1065505943415775e-07,
"loss": 0.426,
"step": 1690
},
{
"epoch": 0.9103021331000606,
"grad_norm": 1.805677873651136,
"learning_rate": 2.0815739799364743e-07,
"loss": 0.4109,
"step": 1691
},
{
"epoch": 0.9108404548819057,
"grad_norm": 1.6393066274059234,
"learning_rate": 2.0567431713158726e-07,
"loss": 0.4377,
"step": 1692
},
{
"epoch": 0.9113787766637508,
"grad_norm": 1.6183131956225818,
"learning_rate": 2.032058244034757e-07,
"loss": 0.4412,
"step": 1693
},
{
"epoch": 0.9119170984455959,
"grad_norm": 1.5002695967364554,
"learning_rate": 2.007519273204206e-07,
"loss": 0.4437,
"step": 1694
},
{
"epoch": 0.912455420227441,
"grad_norm": 1.647362717510626,
"learning_rate": 1.9831263334911977e-07,
"loss": 0.4808,
"step": 1695
},
{
"epoch": 0.9129937420092861,
"grad_norm": 1.5964438963275278,
"learning_rate": 1.95887949911836e-07,
"loss": 0.4393,
"step": 1696
},
{
"epoch": 0.9135320637911312,
"grad_norm": 1.8713869106599383,
"learning_rate": 1.934778843863766e-07,
"loss": 0.434,
"step": 1697
},
{
"epoch": 0.9140703855729763,
"grad_norm": 1.9039547376831083,
"learning_rate": 1.9108244410606823e-07,
"loss": 0.4364,
"step": 1698
},
{
"epoch": 0.9146087073548214,
"grad_norm": 1.5450254177283191,
"learning_rate": 1.887016363597366e-07,
"loss": 0.4589,
"step": 1699
},
{
"epoch": 0.9151470291366665,
"grad_norm": 1.543879530191546,
"learning_rate": 1.8633546839168403e-07,
"loss": 0.4064,
"step": 1700
},
{
"epoch": 0.9156853509185116,
"grad_norm": 1.5304353330893454,
"learning_rate": 1.839839474016658e-07,
"loss": 0.442,
"step": 1701
},
{
"epoch": 0.9162236727003567,
"grad_norm": 2.3452574340826233,
"learning_rate": 1.8164708054487002e-07,
"loss": 0.422,
"step": 1702
},
{
"epoch": 0.9167619944822017,
"grad_norm": 1.9150867244566236,
"learning_rate": 1.7932487493189598e-07,
"loss": 0.4294,
"step": 1703
},
{
"epoch": 0.9173003162640468,
"grad_norm": 1.6124806051656038,
"learning_rate": 1.7701733762873152e-07,
"loss": 0.428,
"step": 1704
},
{
"epoch": 0.9178386380458919,
"grad_norm": 1.4187608860726189,
"learning_rate": 1.7472447565673177e-07,
"loss": 0.4038,
"step": 1705
},
{
"epoch": 0.918376959827737,
"grad_norm": 1.4661931221135862,
"learning_rate": 1.7244629599259767e-07,
"loss": 0.3848,
"step": 1706
},
{
"epoch": 0.9189152816095821,
"grad_norm": 1.6206434175751971,
"learning_rate": 1.7018280556835632e-07,
"loss": 0.3851,
"step": 1707
},
{
"epoch": 0.9194536033914272,
"grad_norm": 1.8423442465927384,
"learning_rate": 1.6793401127133513e-07,
"loss": 0.4079,
"step": 1708
},
{
"epoch": 0.9199919251732723,
"grad_norm": 1.3950233471823357,
"learning_rate": 1.6569991994414835e-07,
"loss": 0.3994,
"step": 1709
},
{
"epoch": 0.9205302469551174,
"grad_norm": 1.5142214065755961,
"learning_rate": 1.6348053838466937e-07,
"loss": 0.4189,
"step": 1710
},
{
"epoch": 0.9210685687369625,
"grad_norm": 1.5917351975615364,
"learning_rate": 1.6127587334601458e-07,
"loss": 0.4314,
"step": 1711
},
{
"epoch": 0.9216068905188076,
"grad_norm": 1.605064219083874,
"learning_rate": 1.5908593153651952e-07,
"loss": 0.4237,
"step": 1712
},
{
"epoch": 0.9221452123006527,
"grad_norm": 1.7341654884483175,
"learning_rate": 1.5691071961972116e-07,
"loss": 0.4131,
"step": 1713
},
{
"epoch": 0.9226835340824978,
"grad_norm": 1.6343186301580133,
"learning_rate": 1.547502442143356e-07,
"loss": 0.4233,
"step": 1714
},
{
"epoch": 0.9232218558643429,
"grad_norm": 1.5099995374537671,
"learning_rate": 1.526045118942404e-07,
"loss": 0.3982,
"step": 1715
},
{
"epoch": 0.923760177646188,
"grad_norm": 1.7958348974891065,
"learning_rate": 1.504735291884507e-07,
"loss": 0.4331,
"step": 1716
},
{
"epoch": 0.9242984994280331,
"grad_norm": 1.7356588334735397,
"learning_rate": 1.4835730258110303e-07,
"loss": 0.4357,
"step": 1717
},
{
"epoch": 0.9248368212098782,
"grad_norm": 2.500196744283525,
"learning_rate": 1.4625583851143432e-07,
"loss": 0.3799,
"step": 1718
},
{
"epoch": 0.9253751429917233,
"grad_norm": 1.3646453068750661,
"learning_rate": 1.4416914337376132e-07,
"loss": 0.4128,
"step": 1719
},
{
"epoch": 0.9259134647735684,
"grad_norm": 1.642640642870041,
"learning_rate": 1.420972235174628e-07,
"loss": 0.4506,
"step": 1720
},
{
"epoch": 0.9264517865554135,
"grad_norm": 1.592814733182936,
"learning_rate": 1.4004008524695912e-07,
"loss": 0.4296,
"step": 1721
},
{
"epoch": 0.9269901083372586,
"grad_norm": 1.4652552983592342,
"learning_rate": 1.3799773482169378e-07,
"loss": 0.4233,
"step": 1722
},
{
"epoch": 0.9275284301191037,
"grad_norm": 1.7410090898687602,
"learning_rate": 1.3597017845611181e-07,
"loss": 0.4594,
"step": 1723
},
{
"epoch": 0.9280667519009488,
"grad_norm": 1.559448064084867,
"learning_rate": 1.3395742231964658e-07,
"loss": 0.4336,
"step": 1724
},
{
"epoch": 0.9286050736827939,
"grad_norm": 1.9623398348887997,
"learning_rate": 1.3195947253669518e-07,
"loss": 0.4724,
"step": 1725
},
{
"epoch": 0.929143395464639,
"grad_norm": 1.4765323135961603,
"learning_rate": 1.2997633518660125e-07,
"loss": 0.4122,
"step": 1726
},
{
"epoch": 0.9296817172464841,
"grad_norm": 1.9030353185015407,
"learning_rate": 1.2800801630364013e-07,
"loss": 0.4414,
"step": 1727
},
{
"epoch": 0.9302200390283292,
"grad_norm": 1.3486307498615422,
"learning_rate": 1.2605452187699484e-07,
"loss": 0.4799,
"step": 1728
},
{
"epoch": 0.9307583608101743,
"grad_norm": 1.4474994381201687,
"learning_rate": 1.2411585785074232e-07,
"loss": 0.4353,
"step": 1729
},
{
"epoch": 0.9312966825920194,
"grad_norm": 1.460955137197927,
"learning_rate": 1.221920301238333e-07,
"loss": 0.4248,
"step": 1730
},
{
"epoch": 0.9318350043738645,
"grad_norm": 1.8140612572363009,
"learning_rate": 1.2028304455007412e-07,
"loss": 0.3888,
"step": 1731
},
{
"epoch": 0.9323733261557096,
"grad_norm": 1.4724419135884532,
"learning_rate": 1.1838890693811055e-07,
"loss": 0.3868,
"step": 1732
},
{
"epoch": 0.9329116479375547,
"grad_norm": 1.4562877473919869,
"learning_rate": 1.1650962305140845e-07,
"loss": 0.4305,
"step": 1733
},
{
"epoch": 0.9334499697193998,
"grad_norm": 2.0045234339432763,
"learning_rate": 1.1464519860823698e-07,
"loss": 0.5062,
"step": 1734
},
{
"epoch": 0.9339882915012448,
"grad_norm": 1.8962618785171959,
"learning_rate": 1.1279563928165094e-07,
"loss": 0.4049,
"step": 1735
},
{
"epoch": 0.93452661328309,
"grad_norm": 1.580337734175196,
"learning_rate": 1.1096095069947466e-07,
"loss": 0.4465,
"step": 1736
},
{
"epoch": 0.935064935064935,
"grad_norm": 1.6703156179249958,
"learning_rate": 1.091411384442831e-07,
"loss": 0.4174,
"step": 1737
},
{
"epoch": 0.9356032568467801,
"grad_norm": 1.4707795804039079,
"learning_rate": 1.0733620805338462e-07,
"loss": 0.3582,
"step": 1738
},
{
"epoch": 0.9361415786286252,
"grad_norm": 1.5443607495595517,
"learning_rate": 1.0554616501880722e-07,
"loss": 0.4322,
"step": 1739
},
{
"epoch": 0.9366799004104703,
"grad_norm": 1.647874029047969,
"learning_rate": 1.0377101478727835e-07,
"loss": 0.4465,
"step": 1740
},
{
"epoch": 0.9366799004104703,
"eval_loss": 0.41988879442214966,
"eval_runtime": 1559.0337,
"eval_samples_per_second": 16.042,
"eval_steps_per_second": 0.502,
"step": 1740
},
{
"epoch": 0.9372182221923154,
"grad_norm": 1.6210033117188805,
"learning_rate": 1.0201076276021072e-07,
"loss": 0.4432,
"step": 1741
},
{
"epoch": 0.9377565439741605,
"grad_norm": 1.9123170938822815,
"learning_rate": 1.0026541429368431e-07,
"loss": 0.4024,
"step": 1742
},
{
"epoch": 0.9382948657560056,
"grad_norm": 2.5680416907462864,
"learning_rate": 9.853497469843043e-08,
"loss": 0.3973,
"step": 1743
},
{
"epoch": 0.9388331875378507,
"grad_norm": 1.462242975230514,
"learning_rate": 9.681944923981724e-08,
"loss": 0.455,
"step": 1744
},
{
"epoch": 0.9393715093196958,
"grad_norm": 1.4330622858448745,
"learning_rate": 9.511884313782915e-08,
"loss": 0.409,
"step": 1745
},
{
"epoch": 0.9399098311015409,
"grad_norm": 1.5924131568344673,
"learning_rate": 9.343316156705751e-08,
"loss": 0.4709,
"step": 1746
},
{
"epoch": 0.940448152883386,
"grad_norm": 2.1748083360521,
"learning_rate": 9.176240965668049e-08,
"loss": 0.4975,
"step": 1747
},
{
"epoch": 0.9409864746652311,
"grad_norm": 2.240808535802813,
"learning_rate": 9.01065924904465e-08,
"loss": 0.4817,
"step": 1748
},
{
"epoch": 0.9415247964470762,
"grad_norm": 1.7231015704313604,
"learning_rate": 8.846571510666369e-08,
"loss": 0.4094,
"step": 1749
},
{
"epoch": 0.9420631182289213,
"grad_norm": 1.4693480082476622,
"learning_rate": 8.683978249817981e-08,
"loss": 0.4453,
"step": 1750
},
{
"epoch": 0.9426014400107664,
"grad_norm": 1.6509935540008158,
"learning_rate": 8.52287996123674e-08,
"loss": 0.4065,
"step": 1751
},
{
"epoch": 0.9431397617926115,
"grad_norm": 1.6701873629796138,
"learning_rate": 8.363277135111314e-08,
"loss": 0.3761,
"step": 1752
},
{
"epoch": 0.9436780835744566,
"grad_norm": 1.2809352240300242,
"learning_rate": 8.205170257079786e-08,
"loss": 0.4159,
"step": 1753
},
{
"epoch": 0.9442164053563017,
"grad_norm": 1.62872520153001,
"learning_rate": 8.048559808228496e-08,
"loss": 0.3973,
"step": 1754
},
{
"epoch": 0.9447547271381468,
"grad_norm": 1.6888413344801536,
"learning_rate": 7.89344626509031e-08,
"loss": 0.4219,
"step": 1755
},
{
"epoch": 0.9452930489199919,
"grad_norm": 1.6223202323912347,
"learning_rate": 7.739830099643464e-08,
"loss": 0.4303,
"step": 1756
},
{
"epoch": 0.945831370701837,
"grad_norm": 1.2810729846885742,
"learning_rate": 7.587711779309947e-08,
"loss": 0.3868,
"step": 1757
},
{
"epoch": 0.9463696924836821,
"grad_norm": 1.6840497326805903,
"learning_rate": 7.437091766954119e-08,
"loss": 0.434,
"step": 1758
},
{
"epoch": 0.9469080142655272,
"grad_norm": 1.765752446431431,
"learning_rate": 7.287970520881205e-08,
"loss": 0.4461,
"step": 1759
},
{
"epoch": 0.9474463360473723,
"grad_norm": 1.4694297184744327,
"learning_rate": 7.140348494836191e-08,
"loss": 0.4374,
"step": 1760
},
{
"epoch": 0.9479846578292174,
"grad_norm": 1.456090878683348,
"learning_rate": 6.994226138002047e-08,
"loss": 0.4204,
"step": 1761
},
{
"epoch": 0.9485229796110625,
"grad_norm": 1.5114503786906142,
"learning_rate": 6.849603894998725e-08,
"loss": 0.4431,
"step": 1762
},
{
"epoch": 0.9490613013929076,
"grad_norm": 1.9303693867033398,
"learning_rate": 6.706482205881548e-08,
"loss": 0.4292,
"step": 1763
},
{
"epoch": 0.9495996231747527,
"grad_norm": 1.3436489528854563,
"learning_rate": 6.564861506139996e-08,
"loss": 0.3854,
"step": 1764
},
{
"epoch": 0.9501379449565978,
"grad_norm": 1.3843500014884988,
"learning_rate": 6.424742226696312e-08,
"loss": 0.3969,
"step": 1765
},
{
"epoch": 0.9506762667384429,
"grad_norm": 1.3401735876692071,
"learning_rate": 6.286124793904336e-08,
"loss": 0.4183,
"step": 1766
},
{
"epoch": 0.951214588520288,
"grad_norm": 1.685672633138118,
"learning_rate": 6.149009629547897e-08,
"loss": 0.4468,
"step": 1767
},
{
"epoch": 0.951752910302133,
"grad_norm": 1.8943339017606036,
"learning_rate": 6.013397150839983e-08,
"loss": 0.4361,
"step": 1768
},
{
"epoch": 0.9522912320839781,
"grad_norm": 1.7967244404705551,
"learning_rate": 5.8792877704211274e-08,
"loss": 0.4491,
"step": 1769
},
{
"epoch": 0.9528295538658232,
"grad_norm": 1.4606147240071112,
"learning_rate": 5.746681896358131e-08,
"loss": 0.4019,
"step": 1770
},
{
"epoch": 0.9533678756476683,
"grad_norm": 1.455938194249448,
"learning_rate": 5.615579932143067e-08,
"loss": 0.3948,
"step": 1771
},
{
"epoch": 0.9539061974295135,
"grad_norm": 1.2759206549407909,
"learning_rate": 5.485982276691892e-08,
"loss": 0.3949,
"step": 1772
},
{
"epoch": 0.9544445192113586,
"grad_norm": 1.5731889340664074,
"learning_rate": 5.35788932434328e-08,
"loss": 0.4422,
"step": 1773
},
{
"epoch": 0.9549828409932037,
"grad_norm": 1.4900834870938766,
"learning_rate": 5.2313014648573966e-08,
"loss": 0.3651,
"step": 1774
},
{
"epoch": 0.9555211627750488,
"grad_norm": 1.3653648358156305,
"learning_rate": 5.1062190834146875e-08,
"loss": 0.403,
"step": 1775
},
{
"epoch": 0.9560594845568939,
"grad_norm": 1.5012692588758656,
"learning_rate": 4.9826425606148145e-08,
"loss": 0.4056,
"step": 1776
},
{
"epoch": 0.956597806338739,
"grad_norm": 1.7114437223613954,
"learning_rate": 4.860572272475384e-08,
"loss": 0.4219,
"step": 1777
},
{
"epoch": 0.9571361281205841,
"grad_norm": 1.5710449681536929,
"learning_rate": 4.740008590430778e-08,
"loss": 0.4504,
"step": 1778
},
{
"epoch": 0.9576744499024292,
"grad_norm": 1.5334464777855485,
"learning_rate": 4.620951881331215e-08,
"loss": 0.4078,
"step": 1779
},
{
"epoch": 0.9582127716842743,
"grad_norm": 1.665311340751073,
"learning_rate": 4.5034025074414124e-08,
"loss": 0.388,
"step": 1780
},
{
"epoch": 0.9587510934661194,
"grad_norm": 1.6819133415223784,
"learning_rate": 4.3873608264397014e-08,
"loss": 0.4318,
"step": 1781
},
{
"epoch": 0.9592894152479645,
"grad_norm": 2.1910803064926947,
"learning_rate": 4.272827191416584e-08,
"loss": 0.3862,
"step": 1782
},
{
"epoch": 0.9598277370298096,
"grad_norm": 1.3743310605178427,
"learning_rate": 4.159801950874176e-08,
"loss": 0.382,
"step": 1783
},
{
"epoch": 0.9603660588116547,
"grad_norm": 1.753291691489888,
"learning_rate": 4.048285448724709e-08,
"loss": 0.4677,
"step": 1784
},
{
"epoch": 0.9609043805934998,
"grad_norm": 1.4424214242693971,
"learning_rate": 3.938278024289644e-08,
"loss": 0.4012,
"step": 1785
},
{
"epoch": 0.9614427023753449,
"grad_norm": 1.4573151134275804,
"learning_rate": 3.829780012298612e-08,
"loss": 0.4058,
"step": 1786
},
{
"epoch": 0.96198102415719,
"grad_norm": 1.4245212432098524,
"learning_rate": 3.722791742888476e-08,
"loss": 0.3958,
"step": 1787
},
{
"epoch": 0.9625193459390351,
"grad_norm": 1.533496999870574,
"learning_rate": 3.617313541602274e-08,
"loss": 0.4195,
"step": 1788
},
{
"epoch": 0.9630576677208802,
"grad_norm": 1.854726516234056,
"learning_rate": 3.5133457293881626e-08,
"loss": 0.4376,
"step": 1789
},
{
"epoch": 0.9635959895027253,
"grad_norm": 1.9373159151394588,
"learning_rate": 3.410888622598585e-08,
"loss": 0.4312,
"step": 1790
},
{
"epoch": 0.9641343112845704,
"grad_norm": 2.153201724460075,
"learning_rate": 3.3099425329890525e-08,
"loss": 0.4494,
"step": 1791
},
{
"epoch": 0.9646726330664155,
"grad_norm": 1.4498518000265068,
"learning_rate": 3.210507767717586e-08,
"loss": 0.4199,
"step": 1792
},
{
"epoch": 0.9652109548482606,
"grad_norm": 1.6032986767797375,
"learning_rate": 3.1125846293433846e-08,
"loss": 0.3771,
"step": 1793
},
{
"epoch": 0.9657492766301057,
"grad_norm": 2.1622319654687057,
"learning_rate": 3.0161734158261625e-08,
"loss": 0.4214,
"step": 1794
},
{
"epoch": 0.9662875984119508,
"grad_norm": 1.4345400536711836,
"learning_rate": 2.9212744205252553e-08,
"loss": 0.3797,
"step": 1795
},
{
"epoch": 0.9668259201937959,
"grad_norm": 1.6565073229021858,
"learning_rate": 2.8278879321983477e-08,
"loss": 0.3874,
"step": 1796
},
{
"epoch": 0.967364241975641,
"grad_norm": 2.0557097314570196,
"learning_rate": 2.736014235001194e-08,
"loss": 0.4341,
"step": 1797
},
{
"epoch": 0.9679025637574861,
"grad_norm": 1.64490095462292,
"learning_rate": 2.6456536084862872e-08,
"loss": 0.3979,
"step": 1798
},
{
"epoch": 0.9684408855393312,
"grad_norm": 1.6729564375619899,
"learning_rate": 2.5568063276021347e-08,
"loss": 0.397,
"step": 1799
},
{
"epoch": 0.9689792073211763,
"grad_norm": 1.5597222162662605,
"learning_rate": 2.4694726626925403e-08,
"loss": 0.432,
"step": 1800
},
{
"epoch": 0.9689792073211763,
"eval_loss": 0.4197918474674225,
"eval_runtime": 1571.0705,
"eval_samples_per_second": 15.919,
"eval_steps_per_second": 0.498,
"step": 1800
},
{
"epoch": 0.9695175291030214,
"grad_norm": 1.4076281710448164,
"learning_rate": 2.383652879495657e-08,
"loss": 0.3963,
"step": 1801
},
{
"epoch": 0.9700558508848665,
"grad_norm": 1.645367632025504,
"learning_rate": 2.299347239143157e-08,
"loss": 0.4272,
"step": 1802
},
{
"epoch": 0.9705941726667116,
"grad_norm": 1.3956889574044051,
"learning_rate": 2.2165559981595642e-08,
"loss": 0.429,
"step": 1803
},
{
"epoch": 0.9711324944485566,
"grad_norm": 1.4793349281728767,
"learning_rate": 2.1352794084613658e-08,
"loss": 0.4479,
"step": 1804
},
{
"epoch": 0.9716708162304017,
"grad_norm": 1.580535608856093,
"learning_rate": 2.0555177173562925e-08,
"loss": 0.431,
"step": 1805
},
{
"epoch": 0.9722091380122468,
"grad_norm": 1.7015563233283766,
"learning_rate": 1.9772711675425937e-08,
"loss": 0.3984,
"step": 1806
},
{
"epoch": 0.9727474597940919,
"grad_norm": 1.5158636017258738,
"learning_rate": 1.9005399971080974e-08,
"loss": 0.4166,
"step": 1807
},
{
"epoch": 0.973285781575937,
"grad_norm": 1.4220838677616172,
"learning_rate": 1.8253244395298186e-08,
"loss": 0.3988,
"step": 1808
},
{
"epoch": 0.9738241033577821,
"grad_norm": 1.3963959999222404,
"learning_rate": 1.7516247236731288e-08,
"loss": 0.4224,
"step": 1809
},
{
"epoch": 0.9743624251396272,
"grad_norm": 1.7337278360138024,
"learning_rate": 1.679441073790755e-08,
"loss": 0.4738,
"step": 1810
},
{
"epoch": 0.9749007469214723,
"grad_norm": 1.4861221398216466,
"learning_rate": 1.6087737095225598e-08,
"loss": 0.4449,
"step": 1811
},
{
"epoch": 0.9754390687033174,
"grad_norm": 1.3145810749185178,
"learning_rate": 1.539622845894595e-08,
"loss": 0.3885,
"step": 1812
},
{
"epoch": 0.9759773904851625,
"grad_norm": 1.3176971825763986,
"learning_rate": 1.471988693318549e-08,
"loss": 0.4232,
"step": 1813
},
{
"epoch": 0.9765157122670076,
"grad_norm": 1.442309770679218,
"learning_rate": 1.4058714575910238e-08,
"loss": 0.4328,
"step": 1814
},
{
"epoch": 0.9770540340488527,
"grad_norm": 1.5157478456952573,
"learning_rate": 1.3412713398930355e-08,
"loss": 0.3911,
"step": 1815
},
{
"epoch": 0.9775923558306978,
"grad_norm": 1.779840899462066,
"learning_rate": 1.2781885367892377e-08,
"loss": 0.4179,
"step": 1816
},
{
"epoch": 0.9781306776125429,
"grad_norm": 1.6067561255260123,
"learning_rate": 1.2166232402275325e-08,
"loss": 0.3987,
"step": 1817
},
{
"epoch": 0.978668999394388,
"grad_norm": 1.4429159861518235,
"learning_rate": 1.156575637538182e-08,
"loss": 0.3752,
"step": 1818
},
{
"epoch": 0.9792073211762331,
"grad_norm": 1.6134101059886168,
"learning_rate": 1.0980459114335318e-08,
"loss": 0.4491,
"step": 1819
},
{
"epoch": 0.9797456429580782,
"grad_norm": 1.3430032688894593,
"learning_rate": 1.0410342400073992e-08,
"loss": 0.4446,
"step": 1820
},
{
"epoch": 0.9802839647399233,
"grad_norm": 1.5854543749606242,
"learning_rate": 9.855407967344078e-09,
"loss": 0.4022,
"step": 1821
},
{
"epoch": 0.9808222865217684,
"grad_norm": 1.3429626400579588,
"learning_rate": 9.31565750469543e-09,
"loss": 0.4173,
"step": 1822
},
{
"epoch": 0.9813606083036135,
"grad_norm": 1.8181594324695687,
"learning_rate": 8.791092654476529e-09,
"loss": 0.4699,
"step": 1823
},
{
"epoch": 0.9818989300854586,
"grad_norm": 1.3189784151442827,
"learning_rate": 8.281715012827817e-09,
"loss": 0.3847,
"step": 1824
},
{
"epoch": 0.9824372518673037,
"grad_norm": 1.29942395236663,
"learning_rate": 7.78752612968059e-09,
"loss": 0.3989,
"step": 1825
},
{
"epoch": 0.9829755736491488,
"grad_norm": 1.6481398184837366,
"learning_rate": 7.3085275087475535e-09,
"loss": 0.385,
"step": 1826
},
{
"epoch": 0.9835138954309939,
"grad_norm": 1.2097016930732503,
"learning_rate": 6.844720607522282e-09,
"loss": 0.4635,
"step": 1827
},
{
"epoch": 0.984052217212839,
"grad_norm": 1.3353672523995217,
"learning_rate": 6.3961068372725425e-09,
"loss": 0.4659,
"step": 1828
},
{
"epoch": 0.9845905389946841,
"grad_norm": 1.6604758834668205,
"learning_rate": 5.962687563036418e-09,
"loss": 0.4182,
"step": 1829
},
{
"epoch": 0.9851288607765292,
"grad_norm": 1.365766973195823,
"learning_rate": 5.544464103618419e-09,
"loss": 0.4496,
"step": 1830
},
{
"epoch": 0.9856671825583743,
"grad_norm": 1.7311791534397065,
"learning_rate": 5.1414377315855965e-09,
"loss": 0.4091,
"step": 1831
},
{
"epoch": 0.9862055043402194,
"grad_norm": 1.6223056568910816,
"learning_rate": 4.753609673263104e-09,
"loss": 0.435,
"step": 1832
},
{
"epoch": 0.9867438261220645,
"grad_norm": 1.4811187708876057,
"learning_rate": 4.380981108730309e-09,
"loss": 0.4229,
"step": 1833
},
{
"epoch": 0.9872821479039096,
"grad_norm": 1.5639619332709622,
"learning_rate": 4.023553171819128e-09,
"loss": 0.4434,
"step": 1834
},
{
"epoch": 0.9878204696857547,
"grad_norm": 1.4607336838401341,
"learning_rate": 3.681326950107922e-09,
"loss": 0.3892,
"step": 1835
},
{
"epoch": 0.9883587914675998,
"grad_norm": 1.4459818740856154,
"learning_rate": 3.3543034849192746e-09,
"loss": 0.4613,
"step": 1836
},
{
"epoch": 0.9888971132494448,
"grad_norm": 1.727956071768554,
"learning_rate": 3.0424837713188825e-09,
"loss": 0.4321,
"step": 1837
},
{
"epoch": 0.98943543503129,
"grad_norm": 1.4250494159267046,
"learning_rate": 2.7458687581072284e-09,
"loss": 0.4361,
"step": 1838
},
{
"epoch": 0.989973756813135,
"grad_norm": 1.6825614414547043,
"learning_rate": 2.4644593478240218e-09,
"loss": 0.4247,
"step": 1839
},
{
"epoch": 0.9905120785949801,
"grad_norm": 1.3394226647545722,
"learning_rate": 2.1982563967376525e-09,
"loss": 0.4224,
"step": 1840
},
{
"epoch": 0.9910504003768252,
"grad_norm": 1.3878090062249357,
"learning_rate": 1.9472607148490752e-09,
"loss": 0.4671,
"step": 1841
},
{
"epoch": 0.9915887221586703,
"grad_norm": 1.8045067084462034,
"learning_rate": 1.71147306588626e-09,
"loss": 0.4093,
"step": 1842
},
{
"epoch": 0.9921270439405154,
"grad_norm": 1.6487465697670387,
"learning_rate": 1.4908941673008604e-09,
"loss": 0.4768,
"step": 1843
},
{
"epoch": 0.9926653657223605,
"grad_norm": 1.3894142004683563,
"learning_rate": 1.2855246902693241e-09,
"loss": 0.4126,
"step": 1844
},
{
"epoch": 0.9932036875042056,
"grad_norm": 1.5382669595746958,
"learning_rate": 1.0953652596878972e-09,
"loss": 0.4662,
"step": 1845
},
{
"epoch": 0.9937420092860507,
"grad_norm": 1.5055759777025033,
"learning_rate": 9.204164541720683e-10,
"loss": 0.3911,
"step": 1846
},
{
"epoch": 0.9942803310678958,
"grad_norm": 1.4883627722190473,
"learning_rate": 7.606788060543491e-10,
"loss": 0.4005,
"step": 1847
},
{
"epoch": 0.9948186528497409,
"grad_norm": 1.7929841052447726,
"learning_rate": 6.16152801383163e-10,
"loss": 0.4239,
"step": 1848
},
{
"epoch": 0.995356974631586,
"grad_norm": 1.3514634100350202,
"learning_rate": 4.86838879921736e-10,
"loss": 0.4122,
"step": 1849
},
{
"epoch": 0.9958952964134311,
"grad_norm": 1.5688583282415778,
"learning_rate": 3.7273743514476544e-10,
"loss": 0.3613,
"step": 1850
},
{
"epoch": 0.9964336181952762,
"grad_norm": 1.3790895255701852,
"learning_rate": 2.73848814238975e-10,
"loss": 0.3974,
"step": 1851
},
{
"epoch": 0.9969719399771213,
"grad_norm": 1.4609310145673613,
"learning_rate": 1.9017331810256002e-10,
"loss": 0.4287,
"step": 1852
},
{
"epoch": 0.9975102617589664,
"grad_norm": 1.6915446904327818,
"learning_rate": 1.2171120134185643e-10,
"loss": 0.4238,
"step": 1853
},
{
"epoch": 0.9980485835408115,
"grad_norm": 1.636253995850887,
"learning_rate": 6.846267227356152e-11,
"loss": 0.4105,
"step": 1854
},
{
"epoch": 0.9985869053226566,
"grad_norm": 1.3210272324277625,
"learning_rate": 3.042789292140302e-11,
"loss": 0.3978,
"step": 1855
},
{
"epoch": 0.9991252271045017,
"grad_norm": 1.7798971238230394,
"learning_rate": 7.606979016694383e-12,
"loss": 0.4537,
"step": 1856
},
{
"epoch": 0.9996635488863468,
"grad_norm": 1.6132079869080023,
"learning_rate": 0.0,
"loss": 0.4395,
"step": 1857
},
{
"epoch": 0.9996635488863468,
"step": 1857,
"total_flos": 1.243798906601472e+16,
"train_loss": 0.0,
"train_runtime": 0.4818,
"train_samples_per_second": 987062.335,
"train_steps_per_second": 3854.561
}
],
"logging_steps": 1.0,
"max_steps": 1857,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.243798906601472e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}