Files
gemma-3-270m-it-Text-Cleaner/train/log.json
ModelHub XC ddc6dbdd40 初始化项目,由ModelHub XC社区提供模型
Model: kth8/gemma-3-270m-it-Text-Cleaner
Source: Original Platform
2026-04-24 10:42:39 +08:00

8820 lines
197 KiB
JSON

[
{
"loss": 0.311,
"grad_norm": 7.0026350021362305,
"learning_rate": 0.0,
"epoch": 0.0008019246190858059,
"step": 1
},
{
"loss": 0.4209,
"grad_norm": 7.763463973999023,
"learning_rate": 3.1746031746031746e-06,
"epoch": 0.0016038492381716118,
"step": 2
},
{
"loss": 0.5177,
"grad_norm": 7.721052646636963,
"learning_rate": 6.349206349206349e-06,
"epoch": 0.0024057738572574178,
"step": 3
},
{
"loss": 0.3841,
"grad_norm": 5.636883735656738,
"learning_rate": 9.523809523809523e-06,
"epoch": 0.0032076984763432237,
"step": 4
},
{
"loss": 0.3934,
"grad_norm": 5.894153118133545,
"learning_rate": 1.2698412698412699e-05,
"epoch": 0.00400962309542903,
"step": 5
},
{
"loss": 0.2577,
"grad_norm": 3.8658251762390137,
"learning_rate": 1.5873015873015872e-05,
"epoch": 0.0048115477145148355,
"step": 6
},
{
"loss": 0.1507,
"grad_norm": 3.220764398574829,
"learning_rate": 1.9047619047619046e-05,
"epoch": 0.0056134723336006415,
"step": 7
},
{
"loss": 0.1547,
"grad_norm": 1.97222101688385,
"learning_rate": 2.2222222222222223e-05,
"epoch": 0.006415396952686447,
"step": 8
},
{
"loss": 0.1619,
"grad_norm": 2.004807472229004,
"learning_rate": 2.5396825396825397e-05,
"epoch": 0.007217321571772253,
"step": 9
},
{
"loss": 0.2521,
"grad_norm": 2.6470654010772705,
"learning_rate": 2.857142857142857e-05,
"epoch": 0.00801924619085806,
"step": 10
},
{
"loss": 0.3305,
"grad_norm": 3.026132106781006,
"learning_rate": 3.1746031746031745e-05,
"epoch": 0.008821170809943865,
"step": 11
},
{
"loss": 0.2042,
"grad_norm": 2.123467206954956,
"learning_rate": 3.492063492063492e-05,
"epoch": 0.009623095429029671,
"step": 12
},
{
"loss": 0.2047,
"grad_norm": 1.958135962486267,
"learning_rate": 3.809523809523809e-05,
"epoch": 0.010425020048115477,
"step": 13
},
{
"loss": 0.1499,
"grad_norm": 1.2876746654510498,
"learning_rate": 4.126984126984127e-05,
"epoch": 0.011226944667201283,
"step": 14
},
{
"loss": 0.0589,
"grad_norm": 0.7209349870681763,
"learning_rate": 4.4444444444444447e-05,
"epoch": 0.012028869286287089,
"step": 15
},
{
"loss": 0.1404,
"grad_norm": 1.2799328565597534,
"learning_rate": 4.761904761904762e-05,
"epoch": 0.012830793905372895,
"step": 16
},
{
"loss": 0.1594,
"grad_norm": 1.2897180318832397,
"learning_rate": 5.0793650793650794e-05,
"epoch": 0.0136327185244587,
"step": 17
},
{
"loss": 0.1236,
"grad_norm": 0.907631516456604,
"learning_rate": 5.396825396825397e-05,
"epoch": 0.014434643143544507,
"step": 18
},
{
"loss": 0.0899,
"grad_norm": 0.7336040139198303,
"learning_rate": 5.714285714285714e-05,
"epoch": 0.015236567762630313,
"step": 19
},
{
"loss": 0.1434,
"grad_norm": 1.4779671430587769,
"learning_rate": 6.0317460317460316e-05,
"epoch": 0.01603849238171612,
"step": 20
},
{
"loss": 0.1072,
"grad_norm": 0.6834859251976013,
"learning_rate": 6.349206349206349e-05,
"epoch": 0.016840417000801924,
"step": 21
},
{
"loss": 0.1077,
"grad_norm": 0.9278278946876526,
"learning_rate": 6.666666666666667e-05,
"epoch": 0.01764234161988773,
"step": 22
},
{
"loss": 0.1463,
"grad_norm": 1.041062593460083,
"learning_rate": 6.984126984126984e-05,
"epoch": 0.018444266238973536,
"step": 23
},
{
"loss": 0.1311,
"grad_norm": 1.007616639137268,
"learning_rate": 7.301587301587302e-05,
"epoch": 0.019246190858059342,
"step": 24
},
{
"loss": 0.1652,
"grad_norm": 1.5278170108795166,
"learning_rate": 7.619047619047618e-05,
"epoch": 0.020048115477145148,
"step": 25
},
{
"loss": 0.2241,
"grad_norm": 1.5930604934692383,
"learning_rate": 7.936507936507937e-05,
"epoch": 0.020850040096230954,
"step": 26
},
{
"loss": 0.0831,
"grad_norm": 0.7199026942253113,
"learning_rate": 8.253968253968255e-05,
"epoch": 0.02165196471531676,
"step": 27
},
{
"loss": 0.1204,
"grad_norm": 0.986321210861206,
"learning_rate": 8.571428571428571e-05,
"epoch": 0.022453889334402566,
"step": 28
},
{
"loss": 0.1153,
"grad_norm": 1.234464168548584,
"learning_rate": 8.888888888888889e-05,
"epoch": 0.023255813953488372,
"step": 29
},
{
"loss": 0.081,
"grad_norm": 1.0115418434143066,
"learning_rate": 9.206349206349206e-05,
"epoch": 0.024057738572574178,
"step": 30
},
{
"loss": 0.103,
"grad_norm": 1.4726132154464722,
"learning_rate": 9.523809523809524e-05,
"epoch": 0.024859663191659984,
"step": 31
},
{
"loss": 0.0841,
"grad_norm": 0.7434117197990417,
"learning_rate": 9.841269841269841e-05,
"epoch": 0.02566158781074579,
"step": 32
},
{
"loss": 0.0806,
"grad_norm": 0.6968424916267395,
"learning_rate": 0.00010158730158730159,
"epoch": 0.026463512429831595,
"step": 33
},
{
"loss": 0.1339,
"grad_norm": 1.8305174112319946,
"learning_rate": 0.00010476190476190477,
"epoch": 0.0272654370489174,
"step": 34
},
{
"loss": 0.148,
"grad_norm": 1.3083935976028442,
"learning_rate": 0.00010793650793650794,
"epoch": 0.028067361668003207,
"step": 35
},
{
"loss": 0.0659,
"grad_norm": 0.5363959074020386,
"learning_rate": 0.00011111111111111112,
"epoch": 0.028869286287089013,
"step": 36
},
{
"loss": 0.0761,
"grad_norm": 0.7278910875320435,
"learning_rate": 0.00011428571428571428,
"epoch": 0.02967121090617482,
"step": 37
},
{
"loss": 0.0628,
"grad_norm": 0.5862115621566772,
"learning_rate": 0.00011746031746031746,
"epoch": 0.030473135525260625,
"step": 38
},
{
"loss": 0.0892,
"grad_norm": 0.8882272243499756,
"learning_rate": 0.00012063492063492063,
"epoch": 0.03127506014434643,
"step": 39
},
{
"loss": 0.0907,
"grad_norm": 0.8315787315368652,
"learning_rate": 0.0001238095238095238,
"epoch": 0.03207698476343224,
"step": 40
},
{
"loss": 0.048,
"grad_norm": 0.6063331365585327,
"learning_rate": 0.00012698412698412698,
"epoch": 0.03287890938251804,
"step": 41
},
{
"loss": 0.066,
"grad_norm": 0.6467223167419434,
"learning_rate": 0.00013015873015873017,
"epoch": 0.03368083400160385,
"step": 42
},
{
"loss": 0.1351,
"grad_norm": 1.2565680742263794,
"learning_rate": 0.00013333333333333334,
"epoch": 0.034482758620689655,
"step": 43
},
{
"loss": 0.0867,
"grad_norm": 0.8123145699501038,
"learning_rate": 0.0001365079365079365,
"epoch": 0.03528468323977546,
"step": 44
},
{
"loss": 0.1072,
"grad_norm": 0.8433717489242554,
"learning_rate": 0.00013968253968253967,
"epoch": 0.03608660785886127,
"step": 45
},
{
"loss": 0.0438,
"grad_norm": 0.5360523462295532,
"learning_rate": 0.00014285714285714287,
"epoch": 0.03688853247794707,
"step": 46
},
{
"loss": 0.0866,
"grad_norm": 0.644867479801178,
"learning_rate": 0.00014603174603174603,
"epoch": 0.03769045709703288,
"step": 47
},
{
"loss": 0.0839,
"grad_norm": 0.8485159873962402,
"learning_rate": 0.00014920634920634923,
"epoch": 0.038492381716118684,
"step": 48
},
{
"loss": 0.0774,
"grad_norm": 0.5638540387153625,
"learning_rate": 0.00015238095238095237,
"epoch": 0.03929430633520449,
"step": 49
},
{
"loss": 0.0611,
"grad_norm": 0.7566853761672974,
"learning_rate": 0.00015555555555555556,
"epoch": 0.040096230954290296,
"step": 50
},
{
"loss": 0.1411,
"grad_norm": 1.0959564447402954,
"learning_rate": 0.00015873015873015873,
"epoch": 0.0408981555733761,
"step": 51
},
{
"loss": 0.0594,
"grad_norm": 0.6066744923591614,
"learning_rate": 0.00016190476190476192,
"epoch": 0.04170008019246191,
"step": 52
},
{
"loss": 0.0337,
"grad_norm": 0.5505036115646362,
"learning_rate": 0.0001650793650793651,
"epoch": 0.042502004811547714,
"step": 53
},
{
"loss": 0.0572,
"grad_norm": 0.6075869798660278,
"learning_rate": 0.00016825396825396826,
"epoch": 0.04330392943063352,
"step": 54
},
{
"loss": 0.0869,
"grad_norm": 0.9212067723274231,
"learning_rate": 0.00017142857142857143,
"epoch": 0.044105854049719326,
"step": 55
},
{
"loss": 0.033,
"grad_norm": 0.4611626863479614,
"learning_rate": 0.00017460317460317462,
"epoch": 0.04490777866880513,
"step": 56
},
{
"loss": 0.0711,
"grad_norm": 0.8158572912216187,
"learning_rate": 0.00017777777777777779,
"epoch": 0.04570970328789094,
"step": 57
},
{
"loss": 0.1482,
"grad_norm": 1.3836172819137573,
"learning_rate": 0.00018095238095238095,
"epoch": 0.046511627906976744,
"step": 58
},
{
"loss": 0.0953,
"grad_norm": 0.6279105544090271,
"learning_rate": 0.00018412698412698412,
"epoch": 0.04731355252606255,
"step": 59
},
{
"loss": 0.1133,
"grad_norm": 1.3958708047866821,
"learning_rate": 0.00018730158730158731,
"epoch": 0.048115477145148355,
"step": 60
},
{
"loss": 0.1674,
"grad_norm": 1.2703611850738525,
"learning_rate": 0.00019047619047619048,
"epoch": 0.04891740176423416,
"step": 61
},
{
"loss": 0.0756,
"grad_norm": 0.8350338935852051,
"learning_rate": 0.00019365079365079365,
"epoch": 0.04971932638331997,
"step": 62
},
{
"loss": 0.0776,
"grad_norm": 0.7750063538551331,
"learning_rate": 0.00019682539682539682,
"epoch": 0.05052125100240577,
"step": 63
},
{
"loss": 0.0258,
"grad_norm": 0.4177851974964142,
"learning_rate": 0.0002,
"epoch": 0.05132317562149158,
"step": 64
},
{
"loss": 0.0743,
"grad_norm": 0.9661064743995667,
"learning_rate": 0.00019999964798101197,
"epoch": 0.052125100240577385,
"step": 65
},
{
"loss": 0.1386,
"grad_norm": 1.2234452962875366,
"learning_rate": 0.0001999985919265261,
"epoch": 0.05292702485966319,
"step": 66
},
{
"loss": 0.0466,
"grad_norm": 0.3697403073310852,
"learning_rate": 0.00019999683184397752,
"epoch": 0.053728949478749,
"step": 67
},
{
"loss": 0.1913,
"grad_norm": 1.1906723976135254,
"learning_rate": 0.0001999943677457578,
"epoch": 0.0545308740978348,
"step": 68
},
{
"loss": 0.0784,
"grad_norm": 0.6538499593734741,
"learning_rate": 0.0001999911996492152,
"epoch": 0.05533279871692061,
"step": 69
},
{
"loss": 0.0477,
"grad_norm": 0.7172570824623108,
"learning_rate": 0.00019998732757665427,
"epoch": 0.056134723336006415,
"step": 70
},
{
"loss": 0.0691,
"grad_norm": 0.5724918842315674,
"learning_rate": 0.00019998275155533587,
"epoch": 0.05693664795509222,
"step": 71
},
{
"loss": 0.0746,
"grad_norm": 0.8900654911994934,
"learning_rate": 0.00019997747161747695,
"epoch": 0.057738572574178026,
"step": 72
},
{
"loss": 0.0356,
"grad_norm": 0.43228501081466675,
"learning_rate": 0.00019997148780025027,
"epoch": 0.05854049719326383,
"step": 73
},
{
"loss": 0.0341,
"grad_norm": 0.4748842716217041,
"learning_rate": 0.0001999648001457842,
"epoch": 0.05934242181234964,
"step": 74
},
{
"loss": 0.0751,
"grad_norm": 0.5790648460388184,
"learning_rate": 0.00019995740870116233,
"epoch": 0.060144346431435444,
"step": 75
},
{
"loss": 0.1419,
"grad_norm": 0.8083305358886719,
"learning_rate": 0.00019994931351842327,
"epoch": 0.06094627105052125,
"step": 76
},
{
"loss": 0.0738,
"grad_norm": 1.0509905815124512,
"learning_rate": 0.00019994051465456014,
"epoch": 0.061748195669607056,
"step": 77
},
{
"loss": 0.09,
"grad_norm": 1.006076693534851,
"learning_rate": 0.00019993101217152028,
"epoch": 0.06255012028869286,
"step": 78
},
{
"loss": 0.0671,
"grad_norm": 0.7131031155586243,
"learning_rate": 0.00019992080613620485,
"epoch": 0.06335204490777867,
"step": 79
},
{
"loss": 0.0476,
"grad_norm": 0.6254774332046509,
"learning_rate": 0.00019990989662046818,
"epoch": 0.06415396952686447,
"step": 80
},
{
"loss": 0.0313,
"grad_norm": 0.5143452882766724,
"learning_rate": 0.00019989828370111737,
"epoch": 0.06495589414595028,
"step": 81
},
{
"loss": 0.0727,
"grad_norm": 0.616113007068634,
"learning_rate": 0.00019988596745991179,
"epoch": 0.06575781876503609,
"step": 82
},
{
"loss": 0.0693,
"grad_norm": 1.8450731039047241,
"learning_rate": 0.00019987294798356247,
"epoch": 0.06655974338412189,
"step": 83
},
{
"loss": 0.1301,
"grad_norm": 0.8279522657394409,
"learning_rate": 0.00019985922536373146,
"epoch": 0.0673616680032077,
"step": 84
},
{
"loss": 0.0457,
"grad_norm": 0.6411037445068359,
"learning_rate": 0.00019984479969703127,
"epoch": 0.0681635926222935,
"step": 85
},
{
"loss": 0.0636,
"grad_norm": 0.5541757941246033,
"learning_rate": 0.000199829671085024,
"epoch": 0.06896551724137931,
"step": 86
},
{
"loss": 0.0644,
"grad_norm": 0.5471921563148499,
"learning_rate": 0.00019981383963422087,
"epoch": 0.06976744186046512,
"step": 87
},
{
"loss": 0.0486,
"grad_norm": 0.7092999219894409,
"learning_rate": 0.00019979730545608126,
"epoch": 0.07056936647955092,
"step": 88
},
{
"loss": 0.124,
"grad_norm": 1.2980421781539917,
"learning_rate": 0.00019978006866701211,
"epoch": 0.07137129109863673,
"step": 89
},
{
"loss": 0.1298,
"grad_norm": 0.778945803642273,
"learning_rate": 0.0001997621293883669,
"epoch": 0.07217321571772253,
"step": 90
},
{
"loss": 0.0542,
"grad_norm": 0.4509424865245819,
"learning_rate": 0.00019974348774644501,
"epoch": 0.07297514033680834,
"step": 91
},
{
"loss": 0.0468,
"grad_norm": 0.6056888103485107,
"learning_rate": 0.00019972414387249072,
"epoch": 0.07377706495589414,
"step": 92
},
{
"loss": 0.0815,
"grad_norm": 0.7726762294769287,
"learning_rate": 0.00019970409790269215,
"epoch": 0.07457898957497995,
"step": 93
},
{
"loss": 0.0668,
"grad_norm": 0.6297205090522766,
"learning_rate": 0.00019968334997818064,
"epoch": 0.07538091419406576,
"step": 94
},
{
"loss": 0.1778,
"grad_norm": 1.166032075881958,
"learning_rate": 0.00019966190024502939,
"epoch": 0.07618283881315156,
"step": 95
},
{
"loss": 0.0662,
"grad_norm": 0.7612900733947754,
"learning_rate": 0.00019963974885425266,
"epoch": 0.07698476343223737,
"step": 96
},
{
"loss": 0.0433,
"grad_norm": 0.43482959270477295,
"learning_rate": 0.00019961689596180467,
"epoch": 0.07778668805132317,
"step": 97
},
{
"loss": 0.0616,
"grad_norm": 0.5207836627960205,
"learning_rate": 0.0001995933417285785,
"epoch": 0.07858861267040898,
"step": 98
},
{
"loss": 0.0523,
"grad_norm": 0.6553881764411926,
"learning_rate": 0.0001995690863204049,
"epoch": 0.07939053728949479,
"step": 99
},
{
"loss": 0.1302,
"grad_norm": 1.2842791080474854,
"learning_rate": 0.00019954412990805107,
"epoch": 0.08019246190858059,
"step": 100
},
{
"loss": 0.0922,
"grad_norm": 0.5699795484542847,
"learning_rate": 0.0001995184726672197,
"epoch": 0.0809943865276664,
"step": 101
},
{
"loss": 0.0807,
"grad_norm": 0.5272155404090881,
"learning_rate": 0.00019949211477854749,
"epoch": 0.0817963111467522,
"step": 102
},
{
"loss": 0.1362,
"grad_norm": 0.6196130514144897,
"learning_rate": 0.00019946505642760398,
"epoch": 0.08259823576583801,
"step": 103
},
{
"loss": 0.0705,
"grad_norm": 0.6336621046066284,
"learning_rate": 0.00019943729780489027,
"epoch": 0.08340016038492382,
"step": 104
},
{
"loss": 0.0652,
"grad_norm": 0.7032070755958557,
"learning_rate": 0.00019940883910583756,
"epoch": 0.08420208500400962,
"step": 105
},
{
"loss": 0.1116,
"grad_norm": 0.908371090888977,
"learning_rate": 0.0001993796805308059,
"epoch": 0.08500400962309543,
"step": 106
},
{
"loss": 0.0818,
"grad_norm": 0.7326153516769409,
"learning_rate": 0.00019934982228508278,
"epoch": 0.08580593424218123,
"step": 107
},
{
"loss": 0.1018,
"grad_norm": 0.8321508169174194,
"learning_rate": 0.00019931926457888156,
"epoch": 0.08660785886126704,
"step": 108
},
{
"loss": 0.0259,
"grad_norm": 0.2848133146762848,
"learning_rate": 0.00019928800762734005,
"epoch": 0.08740978348035285,
"step": 109
},
{
"loss": 0.0884,
"grad_norm": 1.1061406135559082,
"learning_rate": 0.00019925605165051918,
"epoch": 0.08821170809943865,
"step": 110
},
{
"loss": 0.0813,
"grad_norm": 0.5895913243293762,
"learning_rate": 0.000199223396873401,
"epoch": 0.08901363271852446,
"step": 111
},
{
"loss": 0.1933,
"grad_norm": 1.0626415014266968,
"learning_rate": 0.00019919004352588767,
"epoch": 0.08981555733761026,
"step": 112
},
{
"loss": 0.0619,
"grad_norm": 0.5373443365097046,
"learning_rate": 0.00019915599184279942,
"epoch": 0.09061748195669607,
"step": 113
},
{
"loss": 0.1071,
"grad_norm": 0.6781280636787415,
"learning_rate": 0.00019912124206387295,
"epoch": 0.09141940657578188,
"step": 114
},
{
"loss": 0.0754,
"grad_norm": 0.42521488666534424,
"learning_rate": 0.00019908579443375996,
"epoch": 0.09222133119486768,
"step": 115
},
{
"loss": 0.0705,
"grad_norm": 0.5241889357566833,
"learning_rate": 0.0001990496492020252,
"epoch": 0.09302325581395349,
"step": 116
},
{
"loss": 0.1076,
"grad_norm": 0.6329948902130127,
"learning_rate": 0.00019901280662314484,
"epoch": 0.09382518043303929,
"step": 117
},
{
"loss": 0.0346,
"grad_norm": 0.3218804597854614,
"learning_rate": 0.0001989752669565046,
"epoch": 0.0946271050521251,
"step": 118
},
{
"loss": 0.1206,
"grad_norm": 0.5836507081985474,
"learning_rate": 0.00019893703046639804,
"epoch": 0.0954290296712109,
"step": 119
},
{
"loss": 0.0955,
"grad_norm": 0.6629716157913208,
"learning_rate": 0.00019889809742202455,
"epoch": 0.09623095429029671,
"step": 120
},
{
"loss": 0.1089,
"grad_norm": 0.9768190383911133,
"learning_rate": 0.00019885846809748753,
"epoch": 0.09703287890938252,
"step": 121
},
{
"loss": 0.0323,
"grad_norm": 0.27991437911987305,
"learning_rate": 0.00019881814277179248,
"epoch": 0.09783480352846832,
"step": 122
},
{
"loss": 0.0466,
"grad_norm": 0.5002017617225647,
"learning_rate": 0.00019877712172884502,
"epoch": 0.09863672814755413,
"step": 123
},
{
"loss": 0.0311,
"grad_norm": 0.4994860589504242,
"learning_rate": 0.00019873540525744887,
"epoch": 0.09943865276663993,
"step": 124
},
{
"eval_loss": 0.06782178580760956,
"eval_runtime": 50.5786,
"eval_samples_per_second": 20.76,
"eval_steps_per_second": 5.2,
"epoch": 0.09943865276663993,
"step": 124
},
{
"loss": 0.0705,
"grad_norm": 0.6123189926147461,
"learning_rate": 0.00019869299365130383,
"epoch": 0.10024057738572574,
"step": 125
},
{
"loss": 0.0841,
"grad_norm": 0.7357730865478516,
"learning_rate": 0.00019864988720900368,
"epoch": 0.10104250200481155,
"step": 126
},
{
"loss": 0.0674,
"grad_norm": 0.46437254548072815,
"learning_rate": 0.0001986060862340342,
"epoch": 0.10184442662389735,
"step": 127
},
{
"loss": 0.0376,
"grad_norm": 0.5510705709457397,
"learning_rate": 0.00019856159103477086,
"epoch": 0.10264635124298316,
"step": 128
},
{
"loss": 0.0396,
"grad_norm": 0.5313072204589844,
"learning_rate": 0.00019851640192447673,
"epoch": 0.10344827586206896,
"step": 129
},
{
"loss": 0.0801,
"grad_norm": 0.6203364133834839,
"learning_rate": 0.00019847051922130038,
"epoch": 0.10425020048115477,
"step": 130
},
{
"loss": 0.0499,
"grad_norm": 0.3568151891231537,
"learning_rate": 0.00019842394324827341,
"epoch": 0.10505212510024058,
"step": 131
},
{
"loss": 0.0561,
"grad_norm": 0.3815423548221588,
"learning_rate": 0.00019837667433330838,
"epoch": 0.10585404971932638,
"step": 132
},
{
"loss": 0.0722,
"grad_norm": 0.4797166585922241,
"learning_rate": 0.00019832871280919635,
"epoch": 0.10665597433841219,
"step": 133
},
{
"loss": 0.0516,
"grad_norm": 0.47701454162597656,
"learning_rate": 0.00019828005901360475,
"epoch": 0.107457898957498,
"step": 134
},
{
"loss": 0.0309,
"grad_norm": 0.37124770879745483,
"learning_rate": 0.00019823071328907473,
"epoch": 0.1082598235765838,
"step": 135
},
{
"loss": 0.0707,
"grad_norm": 0.6959102749824524,
"learning_rate": 0.0001981806759830189,
"epoch": 0.1090617481956696,
"step": 136
},
{
"loss": 0.1417,
"grad_norm": 0.7774357795715332,
"learning_rate": 0.00019812994744771898,
"epoch": 0.10986367281475541,
"step": 137
},
{
"loss": 0.0405,
"grad_norm": 0.4280378818511963,
"learning_rate": 0.00019807852804032305,
"epoch": 0.11066559743384122,
"step": 138
},
{
"loss": 0.0506,
"grad_norm": 0.5292235016822815,
"learning_rate": 0.00019802641812284328,
"epoch": 0.11146752205292702,
"step": 139
},
{
"loss": 0.0624,
"grad_norm": 0.5091221332550049,
"learning_rate": 0.00019797361806215332,
"epoch": 0.11226944667201283,
"step": 140
},
{
"loss": 0.0598,
"grad_norm": 0.5391169786453247,
"learning_rate": 0.0001979201282299856,
"epoch": 0.11307137129109864,
"step": 141
},
{
"loss": 0.0861,
"grad_norm": 0.7957108616828918,
"learning_rate": 0.00019786594900292887,
"epoch": 0.11387329591018444,
"step": 142
},
{
"loss": 0.0698,
"grad_norm": 0.5378095507621765,
"learning_rate": 0.00019781108076242547,
"epoch": 0.11467522052927025,
"step": 143
},
{
"loss": 0.0612,
"grad_norm": 0.5657555460929871,
"learning_rate": 0.00019775552389476864,
"epoch": 0.11547714514835605,
"step": 144
},
{
"loss": 0.1241,
"grad_norm": 0.8074794411659241,
"learning_rate": 0.00019769927879109982,
"epoch": 0.11627906976744186,
"step": 145
},
{
"loss": 0.0831,
"grad_norm": 0.5241571068763733,
"learning_rate": 0.0001976423458474059,
"epoch": 0.11708099438652766,
"step": 146
},
{
"loss": 0.0462,
"grad_norm": 0.3452630043029785,
"learning_rate": 0.00019758472546451645,
"epoch": 0.11788291900561347,
"step": 147
},
{
"loss": 0.0656,
"grad_norm": 0.38813871145248413,
"learning_rate": 0.00019752641804810084,
"epoch": 0.11868484362469928,
"step": 148
},
{
"loss": 0.0512,
"grad_norm": 0.5402405261993408,
"learning_rate": 0.0001974674240086654,
"epoch": 0.11948676824378508,
"step": 149
},
{
"loss": 0.039,
"grad_norm": 0.35998794436454773,
"learning_rate": 0.00019740774376155061,
"epoch": 0.12028869286287089,
"step": 150
},
{
"loss": 0.0275,
"grad_norm": 0.2361939251422882,
"learning_rate": 0.000197347377726928,
"epoch": 0.1210906174819567,
"step": 151
},
{
"loss": 0.0476,
"grad_norm": 0.48203134536743164,
"learning_rate": 0.00019728632632979746,
"epoch": 0.1218925421010425,
"step": 152
},
{
"loss": 0.0255,
"grad_norm": 0.2733021676540375,
"learning_rate": 0.00019722458999998398,
"epoch": 0.1226944667201283,
"step": 153
},
{
"loss": 0.0506,
"grad_norm": 0.3442985713481903,
"learning_rate": 0.00019716216917213476,
"epoch": 0.12349639133921411,
"step": 154
},
{
"loss": 0.0441,
"grad_norm": 0.46731194853782654,
"learning_rate": 0.00019709906428571616,
"epoch": 0.12429831595829992,
"step": 155
},
{
"loss": 0.0794,
"grad_norm": 0.9103071689605713,
"learning_rate": 0.0001970352757850105,
"epoch": 0.12510024057738572,
"step": 156
},
{
"loss": 0.0394,
"grad_norm": 0.46548521518707275,
"learning_rate": 0.0001969708041191131,
"epoch": 0.12590216519647154,
"step": 157
},
{
"loss": 0.0485,
"grad_norm": 0.4331950545310974,
"learning_rate": 0.00019690564974192892,
"epoch": 0.12670408981555734,
"step": 158
},
{
"loss": 0.0417,
"grad_norm": 0.4089224636554718,
"learning_rate": 0.00019683981311216959,
"epoch": 0.12750601443464316,
"step": 159
},
{
"loss": 0.067,
"grad_norm": 0.7565222978591919,
"learning_rate": 0.0001967732946933499,
"epoch": 0.12830793905372895,
"step": 160
},
{
"loss": 0.1036,
"grad_norm": 0.5942509174346924,
"learning_rate": 0.00019670609495378482,
"epoch": 0.12910986367281477,
"step": 161
},
{
"loss": 0.079,
"grad_norm": 0.6143490672111511,
"learning_rate": 0.00019663821436658604,
"epoch": 0.12991178829190056,
"step": 162
},
{
"loss": 0.0825,
"grad_norm": 0.4321056306362152,
"learning_rate": 0.0001965696534096587,
"epoch": 0.13071371291098638,
"step": 163
},
{
"loss": 0.0583,
"grad_norm": 0.5038022398948669,
"learning_rate": 0.00019650041256569792,
"epoch": 0.13151563753007217,
"step": 164
},
{
"loss": 0.0422,
"grad_norm": 0.31969794631004333,
"learning_rate": 0.00019643049232218553,
"epoch": 0.132317562149158,
"step": 165
},
{
"loss": 0.0574,
"grad_norm": 0.33682748675346375,
"learning_rate": 0.00019635989317138666,
"epoch": 0.13311948676824378,
"step": 166
},
{
"loss": 0.1009,
"grad_norm": 0.8040818572044373,
"learning_rate": 0.0001962886156103461,
"epoch": 0.1339214113873296,
"step": 167
},
{
"loss": 0.0342,
"grad_norm": 0.3678284287452698,
"learning_rate": 0.00019621666014088494,
"epoch": 0.1347233360064154,
"step": 168
},
{
"loss": 0.1214,
"grad_norm": 0.5396429300308228,
"learning_rate": 0.00019614402726959705,
"epoch": 0.13552526062550121,
"step": 169
},
{
"loss": 0.0595,
"grad_norm": 0.46996235847473145,
"learning_rate": 0.0001960707175078454,
"epoch": 0.136327185244587,
"step": 170
},
{
"loss": 0.0714,
"grad_norm": 0.5289244055747986,
"learning_rate": 0.00019599673137175855,
"epoch": 0.13712910986367283,
"step": 171
},
{
"loss": 0.0443,
"grad_norm": 0.45096877217292786,
"learning_rate": 0.00019592206938222703,
"epoch": 0.13793103448275862,
"step": 172
},
{
"loss": 0.0552,
"grad_norm": 0.451477587223053,
"learning_rate": 0.00019584673206489954,
"epoch": 0.13873295910184444,
"step": 173
},
{
"loss": 0.0539,
"grad_norm": 0.37521955370903015,
"learning_rate": 0.00019577071995017945,
"epoch": 0.13953488372093023,
"step": 174
},
{
"loss": 0.0612,
"grad_norm": 0.34417349100112915,
"learning_rate": 0.0001956940335732209,
"epoch": 0.14033680834001605,
"step": 175
},
{
"loss": 0.0488,
"grad_norm": 0.30773675441741943,
"learning_rate": 0.00019561667347392508,
"epoch": 0.14113873295910184,
"step": 176
},
{
"loss": 0.0713,
"grad_norm": 0.633940577507019,
"learning_rate": 0.00019553864019693652,
"epoch": 0.14194065757818766,
"step": 177
},
{
"loss": 0.046,
"grad_norm": 0.5169980525970459,
"learning_rate": 0.00019545993429163913,
"epoch": 0.14274258219727345,
"step": 178
},
{
"loss": 0.1141,
"grad_norm": 0.6768614053726196,
"learning_rate": 0.0001953805563121523,
"epoch": 0.14354450681635927,
"step": 179
},
{
"loss": 0.0451,
"grad_norm": 0.4514835476875305,
"learning_rate": 0.0001953005068173272,
"epoch": 0.14434643143544507,
"step": 180
},
{
"loss": 0.0473,
"grad_norm": 0.4141976237297058,
"learning_rate": 0.00019521978637074267,
"epoch": 0.14514835605453089,
"step": 181
},
{
"loss": 0.0551,
"grad_norm": 0.5272877812385559,
"learning_rate": 0.0001951383955407013,
"epoch": 0.14595028067361668,
"step": 182
},
{
"loss": 0.0714,
"grad_norm": 0.9530414342880249,
"learning_rate": 0.00019505633490022546,
"epoch": 0.1467522052927025,
"step": 183
},
{
"loss": 0.1042,
"grad_norm": 0.8545625805854797,
"learning_rate": 0.0001949736050270532,
"epoch": 0.1475541299117883,
"step": 184
},
{
"loss": 0.0809,
"grad_norm": 0.4434387683868408,
"learning_rate": 0.00019489020650363426,
"epoch": 0.1483560545308741,
"step": 185
},
{
"loss": 0.0775,
"grad_norm": 0.6959827542304993,
"learning_rate": 0.00019480613991712588,
"epoch": 0.1491579791499599,
"step": 186
},
{
"loss": 0.0479,
"grad_norm": 0.6551741361618042,
"learning_rate": 0.00019472140585938882,
"epoch": 0.14995990376904572,
"step": 187
},
{
"loss": 0.0719,
"grad_norm": 0.6390430331230164,
"learning_rate": 0.00019463600492698296,
"epoch": 0.1507618283881315,
"step": 188
},
{
"loss": 0.0513,
"grad_norm": 0.4449121356010437,
"learning_rate": 0.00019454993772116336,
"epoch": 0.15156375300721733,
"step": 189
},
{
"loss": 0.0419,
"grad_norm": 0.3439355492591858,
"learning_rate": 0.00019446320484787575,
"epoch": 0.15236567762630313,
"step": 190
},
{
"loss": 0.0594,
"grad_norm": 0.3776263892650604,
"learning_rate": 0.00019437580691775258,
"epoch": 0.15316760224538895,
"step": 191
},
{
"loss": 0.0544,
"grad_norm": 0.5031057000160217,
"learning_rate": 0.00019428774454610843,
"epoch": 0.15396952686447474,
"step": 192
},
{
"loss": 0.0451,
"grad_norm": 0.33334505558013916,
"learning_rate": 0.00019419901835293583,
"epoch": 0.15477145148356056,
"step": 193
},
{
"loss": 0.0433,
"grad_norm": 0.28815487027168274,
"learning_rate": 0.0001941096289629009,
"epoch": 0.15557337610264635,
"step": 194
},
{
"loss": 0.0648,
"grad_norm": 0.5981271266937256,
"learning_rate": 0.00019401957700533888,
"epoch": 0.15637530072173217,
"step": 195
},
{
"loss": 0.0469,
"grad_norm": 0.3985323905944824,
"learning_rate": 0.00019392886311424973,
"epoch": 0.15717722534081796,
"step": 196
},
{
"loss": 0.0467,
"grad_norm": 0.3379111886024475,
"learning_rate": 0.00019383748792829372,
"epoch": 0.15797914995990378,
"step": 197
},
{
"loss": 0.0467,
"grad_norm": 0.4567181468009949,
"learning_rate": 0.00019374545209078687,
"epoch": 0.15878107457898957,
"step": 198
},
{
"loss": 0.0494,
"grad_norm": 0.38965797424316406,
"learning_rate": 0.0001936527562496964,
"epoch": 0.1595829991980754,
"step": 199
},
{
"loss": 0.0552,
"grad_norm": 0.5935775637626648,
"learning_rate": 0.0001935594010576362,
"epoch": 0.16038492381716118,
"step": 200
},
{
"loss": 0.0558,
"grad_norm": 0.4197022318840027,
"learning_rate": 0.0001934653871718624,
"epoch": 0.161186848436247,
"step": 201
},
{
"loss": 0.0582,
"grad_norm": 0.46609750390052795,
"learning_rate": 0.0001933707152542683,
"epoch": 0.1619887730553328,
"step": 202
},
{
"loss": 0.0686,
"grad_norm": 0.6024537086486816,
"learning_rate": 0.00019327538597138029,
"epoch": 0.16279069767441862,
"step": 203
},
{
"loss": 0.0394,
"grad_norm": 0.36054351925849915,
"learning_rate": 0.0001931793999943526,
"epoch": 0.1635926222935044,
"step": 204
},
{
"loss": 0.0756,
"grad_norm": 0.6744378805160522,
"learning_rate": 0.0001930827579989631,
"epoch": 0.16439454691259023,
"step": 205
},
{
"loss": 0.0492,
"grad_norm": 0.3806591331958771,
"learning_rate": 0.00019298546066560802,
"epoch": 0.16519647153167602,
"step": 206
},
{
"loss": 0.0189,
"grad_norm": 0.30359482765197754,
"learning_rate": 0.00019288750867929756,
"epoch": 0.16599839615076184,
"step": 207
},
{
"loss": 0.0396,
"grad_norm": 0.4475793242454529,
"learning_rate": 0.00019278890272965096,
"epoch": 0.16680032076984763,
"step": 208
},
{
"loss": 0.0935,
"grad_norm": 0.6485787630081177,
"learning_rate": 0.00019268964351089148,
"epoch": 0.16760224538893345,
"step": 209
},
{
"loss": 0.0615,
"grad_norm": 0.4646718502044678,
"learning_rate": 0.00019258973172184174,
"epoch": 0.16840417000801924,
"step": 210
},
{
"loss": 0.0414,
"grad_norm": 0.49095892906188965,
"learning_rate": 0.0001924891680659187,
"epoch": 0.16920609462710506,
"step": 211
},
{
"loss": 0.0265,
"grad_norm": 0.835785984992981,
"learning_rate": 0.0001923879532511287,
"epoch": 0.17000801924619086,
"step": 212
},
{
"loss": 0.0696,
"grad_norm": 0.43860942125320435,
"learning_rate": 0.0001922860879900624,
"epoch": 0.17080994386527668,
"step": 213
},
{
"loss": 0.0578,
"grad_norm": 0.5574386119842529,
"learning_rate": 0.00019218357299988998,
"epoch": 0.17161186848436247,
"step": 214
},
{
"loss": 0.0454,
"grad_norm": 0.3218346834182739,
"learning_rate": 0.0001920804090023559,
"epoch": 0.1724137931034483,
"step": 215
},
{
"loss": 0.0453,
"grad_norm": 0.4190017879009247,
"learning_rate": 0.0001919765967237739,
"epoch": 0.17321571772253408,
"step": 216
},
{
"loss": 0.0449,
"grad_norm": 0.35313868522644043,
"learning_rate": 0.00019187213689502176,
"epoch": 0.1740176423416199,
"step": 217
},
{
"loss": 0.0813,
"grad_norm": 0.44302183389663696,
"learning_rate": 0.00019176703025153643,
"epoch": 0.1748195669607057,
"step": 218
},
{
"loss": 0.0276,
"grad_norm": 0.2679917812347412,
"learning_rate": 0.00019166127753330857,
"epoch": 0.1756214915797915,
"step": 219
},
{
"loss": 0.0323,
"grad_norm": 0.2973562777042389,
"learning_rate": 0.00019155487948487748,
"epoch": 0.1764234161988773,
"step": 220
},
{
"loss": 0.044,
"grad_norm": 0.3215465247631073,
"learning_rate": 0.00019144783685532578,
"epoch": 0.17722534081796312,
"step": 221
},
{
"loss": 0.0353,
"grad_norm": 0.3549197018146515,
"learning_rate": 0.00019134015039827431,
"epoch": 0.17802726543704891,
"step": 222
},
{
"loss": 0.051,
"grad_norm": 0.42532142996788025,
"learning_rate": 0.00019123182087187656,
"epoch": 0.17882919005613473,
"step": 223
},
{
"loss": 0.0479,
"grad_norm": 0.39455631375312805,
"learning_rate": 0.0001911228490388136,
"epoch": 0.17963111467522053,
"step": 224
},
{
"loss": 0.0573,
"grad_norm": 0.45477625727653503,
"learning_rate": 0.00019101323566628843,
"epoch": 0.18043303929430635,
"step": 225
},
{
"loss": 0.0342,
"grad_norm": 0.2890731692314148,
"learning_rate": 0.0001909029815260209,
"epoch": 0.18123496391339214,
"step": 226
},
{
"loss": 0.0361,
"grad_norm": 0.34329336881637573,
"learning_rate": 0.00019079208739424197,
"epoch": 0.18203688853247796,
"step": 227
},
{
"loss": 0.0714,
"grad_norm": 0.6790952682495117,
"learning_rate": 0.0001906805540516885,
"epoch": 0.18283881315156375,
"step": 228
},
{
"loss": 0.032,
"grad_norm": 0.26758837699890137,
"learning_rate": 0.00019056838228359753,
"epoch": 0.18364073777064957,
"step": 229
},
{
"loss": 0.0401,
"grad_norm": 0.5012450218200684,
"learning_rate": 0.0001904555728797009,
"epoch": 0.18444266238973536,
"step": 230
},
{
"loss": 0.0301,
"grad_norm": 0.28922465443611145,
"learning_rate": 0.00019034212663421969,
"epoch": 0.18524458700882118,
"step": 231
},
{
"loss": 0.0639,
"grad_norm": 0.44703420996665955,
"learning_rate": 0.00019022804434585852,
"epoch": 0.18604651162790697,
"step": 232
},
{
"loss": 0.0446,
"grad_norm": 0.25191769003868103,
"learning_rate": 0.00019011332681780006,
"epoch": 0.1868484362469928,
"step": 233
},
{
"loss": 0.0623,
"grad_norm": 0.46206915378570557,
"learning_rate": 0.00018999797485769925,
"epoch": 0.18765036086607859,
"step": 234
},
{
"loss": 0.0379,
"grad_norm": 0.25015994906425476,
"learning_rate": 0.0001898819892776777,
"epoch": 0.1884522854851644,
"step": 235
},
{
"loss": 0.0602,
"grad_norm": 0.3543962836265564,
"learning_rate": 0.0001897653708943179,
"epoch": 0.1892542101042502,
"step": 236
},
{
"loss": 0.0483,
"grad_norm": 0.29529276490211487,
"learning_rate": 0.00018964812052865764,
"epoch": 0.19005613472333602,
"step": 237
},
{
"loss": 0.1344,
"grad_norm": 0.5841286778450012,
"learning_rate": 0.00018953023900618397,
"epoch": 0.1908580593424218,
"step": 238
},
{
"loss": 0.0651,
"grad_norm": 0.45970141887664795,
"learning_rate": 0.00018941172715682757,
"epoch": 0.19165998396150763,
"step": 239
},
{
"loss": 0.0552,
"grad_norm": 0.4103776514530182,
"learning_rate": 0.00018929258581495685,
"epoch": 0.19246190858059342,
"step": 240
},
{
"loss": 0.0525,
"grad_norm": 0.3026215434074402,
"learning_rate": 0.00018917281581937214,
"epoch": 0.19326383319967924,
"step": 241
},
{
"loss": 0.0343,
"grad_norm": 0.28369593620300293,
"learning_rate": 0.00018905241801329972,
"epoch": 0.19406575781876503,
"step": 242
},
{
"loss": 0.05,
"grad_norm": 0.36326268315315247,
"learning_rate": 0.00018893139324438577,
"epoch": 0.19486768243785085,
"step": 243
},
{
"loss": 0.0539,
"grad_norm": 0.38554129004478455,
"learning_rate": 0.0001888097423646907,
"epoch": 0.19566960705693665,
"step": 244
},
{
"loss": 0.0202,
"grad_norm": 0.16676409542560577,
"learning_rate": 0.00018868746623068293,
"epoch": 0.19647153167602247,
"step": 245
},
{
"loss": 0.0282,
"grad_norm": 0.37308400869369507,
"learning_rate": 0.00018856456570323277,
"epoch": 0.19727345629510826,
"step": 246
},
{
"loss": 0.0517,
"grad_norm": 0.38957253098487854,
"learning_rate": 0.0001884410416476067,
"epoch": 0.19807538091419408,
"step": 247
},
{
"loss": 0.0236,
"grad_norm": 0.33424702286720276,
"learning_rate": 0.00018831689493346095,
"epoch": 0.19887730553327987,
"step": 248
},
{
"eval_loss": 0.056792400777339935,
"eval_runtime": 32.4803,
"eval_samples_per_second": 32.327,
"eval_steps_per_second": 8.097,
"epoch": 0.19887730553327987,
"step": 248
},
{
"loss": 0.0303,
"grad_norm": 0.308700829744339,
"learning_rate": 0.0001881921264348355,
"epoch": 0.1996792301523657,
"step": 249
},
{
"loss": 0.0619,
"grad_norm": 0.31528714299201965,
"learning_rate": 0.00018806673703014804,
"epoch": 0.20048115477145148,
"step": 250
},
{
"loss": 0.0917,
"grad_norm": 0.6735418438911438,
"learning_rate": 0.00018794072760218753,
"epoch": 0.2012830793905373,
"step": 251
},
{
"loss": 0.0882,
"grad_norm": 0.793260931968689,
"learning_rate": 0.00018781409903810821,
"epoch": 0.2020850040096231,
"step": 252
},
{
"loss": 0.0455,
"grad_norm": 0.39877378940582275,
"learning_rate": 0.0001876868522294233,
"epoch": 0.2028869286287089,
"step": 253
},
{
"loss": 0.0959,
"grad_norm": 0.931326687335968,
"learning_rate": 0.00018755898807199856,
"epoch": 0.2036888532477947,
"step": 254
},
{
"loss": 0.0983,
"grad_norm": 0.8125079274177551,
"learning_rate": 0.00018743050746604633,
"epoch": 0.20449077786688052,
"step": 255
},
{
"loss": 0.0523,
"grad_norm": 0.4794807434082031,
"learning_rate": 0.00018730141131611882,
"epoch": 0.20529270248596632,
"step": 256
},
{
"loss": 0.0311,
"grad_norm": 0.24229726195335388,
"learning_rate": 0.00018717170053110196,
"epoch": 0.20609462710505214,
"step": 257
},
{
"loss": 0.073,
"grad_norm": 0.3876228630542755,
"learning_rate": 0.0001870413760242089,
"epoch": 0.20689655172413793,
"step": 258
},
{
"loss": 0.0837,
"grad_norm": 0.3997587263584137,
"learning_rate": 0.0001869104387129737,
"epoch": 0.20769847634322375,
"step": 259
},
{
"loss": 0.0395,
"grad_norm": 0.3965926468372345,
"learning_rate": 0.00018677888951924474,
"epoch": 0.20850040096230954,
"step": 260
},
{
"loss": 0.0554,
"grad_norm": 0.4688607156276703,
"learning_rate": 0.00018664672936917828,
"epoch": 0.20930232558139536,
"step": 261
},
{
"loss": 0.1299,
"grad_norm": 0.9481903910636902,
"learning_rate": 0.00018651395919323202,
"epoch": 0.21010425020048115,
"step": 262
},
{
"loss": 0.0503,
"grad_norm": 0.29419174790382385,
"learning_rate": 0.00018638057992615838,
"epoch": 0.21090617481956697,
"step": 263
},
{
"loss": 0.0969,
"grad_norm": 0.5171178579330444,
"learning_rate": 0.00018624659250699805,
"epoch": 0.21170809943865276,
"step": 264
},
{
"loss": 0.0345,
"grad_norm": 0.3265765309333801,
"learning_rate": 0.00018611199787907338,
"epoch": 0.21251002405773858,
"step": 265
},
{
"loss": 0.0362,
"grad_norm": 0.33063560724258423,
"learning_rate": 0.00018597679698998163,
"epoch": 0.21331194867682438,
"step": 266
},
{
"loss": 0.0511,
"grad_norm": 0.660375714302063,
"learning_rate": 0.00018584099079158842,
"epoch": 0.2141138732959102,
"step": 267
},
{
"loss": 0.0815,
"grad_norm": 0.580894410610199,
"learning_rate": 0.00018570458024002093,
"epoch": 0.214915797914996,
"step": 268
},
{
"loss": 0.1139,
"grad_norm": 0.7811892032623291,
"learning_rate": 0.0001855675662956613,
"epoch": 0.2157177225340818,
"step": 269
},
{
"loss": 0.0454,
"grad_norm": 0.40047529339790344,
"learning_rate": 0.0001854299499231397,
"epoch": 0.2165196471531676,
"step": 270
},
{
"loss": 0.0691,
"grad_norm": 0.6034380197525024,
"learning_rate": 0.0001852917320913276,
"epoch": 0.21732157177225342,
"step": 271
},
{
"loss": 0.0392,
"grad_norm": 0.3567689061164856,
"learning_rate": 0.00018515291377333112,
"epoch": 0.2181234963913392,
"step": 272
},
{
"loss": 0.0426,
"grad_norm": 0.346510648727417,
"learning_rate": 0.00018501349594648395,
"epoch": 0.21892542101042503,
"step": 273
},
{
"loss": 0.0326,
"grad_norm": 0.5042007565498352,
"learning_rate": 0.0001848734795923404,
"epoch": 0.21972734562951082,
"step": 274
},
{
"loss": 0.0446,
"grad_norm": 0.5788083076477051,
"learning_rate": 0.0001847328656966689,
"epoch": 0.22052927024859664,
"step": 275
},
{
"loss": 0.0573,
"grad_norm": 0.3450917601585388,
"learning_rate": 0.0001845916552494446,
"epoch": 0.22133119486768243,
"step": 276
},
{
"loss": 0.0495,
"grad_norm": 0.36637723445892334,
"learning_rate": 0.00018444984924484277,
"epoch": 0.22213311948676825,
"step": 277
},
{
"loss": 0.0342,
"grad_norm": 0.24668528139591217,
"learning_rate": 0.00018430744868123145,
"epoch": 0.22293504410585405,
"step": 278
},
{
"loss": 0.0499,
"grad_norm": 0.49873360991477966,
"learning_rate": 0.0001841644545611647,
"epoch": 0.22373696872493987,
"step": 279
},
{
"loss": 0.1,
"grad_norm": 1.0184354782104492,
"learning_rate": 0.00018402086789137546,
"epoch": 0.22453889334402566,
"step": 280
},
{
"loss": 0.0401,
"grad_norm": 0.43986013531684875,
"learning_rate": 0.00018387668968276836,
"epoch": 0.22534081796311148,
"step": 281
},
{
"loss": 0.0451,
"grad_norm": 0.3782620131969452,
"learning_rate": 0.0001837319209504128,
"epoch": 0.22614274258219727,
"step": 282
},
{
"loss": 0.0494,
"grad_norm": 0.396990031003952,
"learning_rate": 0.00018358656271353559,
"epoch": 0.2269446672012831,
"step": 283
},
{
"loss": 0.0262,
"grad_norm": 0.34569329023361206,
"learning_rate": 0.00018344061599551398,
"epoch": 0.22774659182036888,
"step": 284
},
{
"loss": 0.049,
"grad_norm": 0.36551395058631897,
"learning_rate": 0.0001832940818238682,
"epoch": 0.2285485164394547,
"step": 285
},
{
"loss": 0.0493,
"grad_norm": 0.3316669166088104,
"learning_rate": 0.00018314696123025454,
"epoch": 0.2293504410585405,
"step": 286
},
{
"loss": 0.0974,
"grad_norm": 0.9379229545593262,
"learning_rate": 0.0001829992552504578,
"epoch": 0.2301523656776263,
"step": 287
},
{
"loss": 0.035,
"grad_norm": 0.4309346079826355,
"learning_rate": 0.00018285096492438424,
"epoch": 0.2309542902967121,
"step": 288
},
{
"loss": 0.0573,
"grad_norm": 0.47338199615478516,
"learning_rate": 0.00018270209129605397,
"epoch": 0.23175621491579793,
"step": 289
},
{
"loss": 0.026,
"grad_norm": 0.3351285457611084,
"learning_rate": 0.00018255263541359397,
"epoch": 0.23255813953488372,
"step": 290
},
{
"loss": 0.0291,
"grad_norm": 0.2552240192890167,
"learning_rate": 0.00018240259832923034,
"epoch": 0.23336006415396954,
"step": 291
},
{
"loss": 0.02,
"grad_norm": 0.23985892534255981,
"learning_rate": 0.00018225198109928114,
"epoch": 0.23416198877305533,
"step": 292
},
{
"loss": 0.0195,
"grad_norm": 0.26093894243240356,
"learning_rate": 0.00018210078478414894,
"epoch": 0.23496391339214115,
"step": 293
},
{
"loss": 0.0669,
"grad_norm": 0.5353745222091675,
"learning_rate": 0.00018194901044831313,
"epoch": 0.23576583801122694,
"step": 294
},
{
"loss": 0.0193,
"grad_norm": 0.25396963953971863,
"learning_rate": 0.00018179665916032273,
"epoch": 0.23656776263031276,
"step": 295
},
{
"loss": 0.0656,
"grad_norm": 0.3989141881465912,
"learning_rate": 0.00018164373199278856,
"epoch": 0.23736968724939855,
"step": 296
},
{
"loss": 0.0297,
"grad_norm": 0.31333601474761963,
"learning_rate": 0.00018149023002237612,
"epoch": 0.23817161186848437,
"step": 297
},
{
"loss": 0.0471,
"grad_norm": 0.8698596954345703,
"learning_rate": 0.00018133615432979744,
"epoch": 0.23897353648757017,
"step": 298
},
{
"loss": 0.081,
"grad_norm": 0.46993565559387207,
"learning_rate": 0.00018118150599980397,
"epoch": 0.23977546110665598,
"step": 299
},
{
"loss": 0.049,
"grad_norm": 0.5059134364128113,
"learning_rate": 0.00018102628612117865,
"epoch": 0.24057738572574178,
"step": 300
},
{
"loss": 0.0653,
"grad_norm": 0.5506439805030823,
"learning_rate": 0.00018087049578672845,
"epoch": 0.2413793103448276,
"step": 301
},
{
"loss": 0.0554,
"grad_norm": 0.5644898414611816,
"learning_rate": 0.00018071413609327638,
"epoch": 0.2421812349639134,
"step": 302
},
{
"loss": 0.0536,
"grad_norm": 0.4158555269241333,
"learning_rate": 0.00018055720814165414,
"epoch": 0.2429831595829992,
"step": 303
},
{
"loss": 0.0593,
"grad_norm": 0.4376695454120636,
"learning_rate": 0.00018039971303669407,
"epoch": 0.243785084202085,
"step": 304
},
{
"loss": 0.0698,
"grad_norm": 0.5507003664970398,
"learning_rate": 0.00018024165188722151,
"epoch": 0.24458700882117082,
"step": 305
},
{
"loss": 0.0219,
"grad_norm": 0.25363439321517944,
"learning_rate": 0.000180083025806047,
"epoch": 0.2453889334402566,
"step": 306
},
{
"loss": 0.0134,
"grad_norm": 0.2080700397491455,
"learning_rate": 0.00017992383590995838,
"epoch": 0.24619085805934243,
"step": 307
},
{
"loss": 0.0656,
"grad_norm": 0.421975702047348,
"learning_rate": 0.00017976408331971298,
"epoch": 0.24699278267842822,
"step": 308
},
{
"loss": 0.027,
"grad_norm": 0.3046298921108246,
"learning_rate": 0.00017960376916002972,
"epoch": 0.24779470729751404,
"step": 309
},
{
"loss": 0.0534,
"grad_norm": 0.3668377995491028,
"learning_rate": 0.00017944289455958112,
"epoch": 0.24859663191659984,
"step": 310
},
{
"loss": 0.0553,
"grad_norm": 0.4287368059158325,
"learning_rate": 0.0001792814606509855,
"epoch": 0.24939855653568566,
"step": 311
},
{
"loss": 0.0487,
"grad_norm": 0.370373010635376,
"learning_rate": 0.00017911946857079888,
"epoch": 0.25020048115477145,
"step": 312
},
{
"loss": 0.0422,
"grad_norm": 0.4405466616153717,
"learning_rate": 0.00017895691945950696,
"epoch": 0.25100240577385724,
"step": 313
},
{
"loss": 0.1081,
"grad_norm": 0.6124715805053711,
"learning_rate": 0.0001787938144615173,
"epoch": 0.2518043303929431,
"step": 314
},
{
"loss": 0.0403,
"grad_norm": 0.22574079036712646,
"learning_rate": 0.000178630154725151,
"epoch": 0.2526062550120289,
"step": 315
},
{
"loss": 0.1046,
"grad_norm": 0.5598015189170837,
"learning_rate": 0.00017846594140263474,
"epoch": 0.25340817963111467,
"step": 316
},
{
"loss": 0.0544,
"grad_norm": 0.3449535667896271,
"learning_rate": 0.0001783011756500927,
"epoch": 0.25421010425020046,
"step": 317
},
{
"loss": 0.058,
"grad_norm": 0.40914788842201233,
"learning_rate": 0.0001781358586275383,
"epoch": 0.2550120288692863,
"step": 318
},
{
"loss": 0.1132,
"grad_norm": 0.7423124313354492,
"learning_rate": 0.0001779699914988662,
"epoch": 0.2558139534883721,
"step": 319
},
{
"loss": 0.0601,
"grad_norm": 0.6021925210952759,
"learning_rate": 0.00017780357543184397,
"epoch": 0.2566158781074579,
"step": 320
},
{
"loss": 0.1014,
"grad_norm": 0.48059457540512085,
"learning_rate": 0.0001776366115981039,
"epoch": 0.2574178027265437,
"step": 321
},
{
"loss": 0.0548,
"grad_norm": 0.5897157788276672,
"learning_rate": 0.00017746910117313482,
"epoch": 0.25821972734562953,
"step": 322
},
{
"loss": 0.0483,
"grad_norm": 0.36229458451271057,
"learning_rate": 0.0001773010453362737,
"epoch": 0.2590216519647153,
"step": 323
},
{
"loss": 0.0632,
"grad_norm": 0.49136513471603394,
"learning_rate": 0.0001771324452706975,
"epoch": 0.2598235765838011,
"step": 324
},
{
"loss": 0.0324,
"grad_norm": 0.6286053657531738,
"learning_rate": 0.00017696330216341463,
"epoch": 0.2606255012028869,
"step": 325
},
{
"loss": 0.054,
"grad_norm": 0.49283909797668457,
"learning_rate": 0.0001767936172052569,
"epoch": 0.26142742582197276,
"step": 326
},
{
"loss": 0.0256,
"grad_norm": 0.2010183483362198,
"learning_rate": 0.00017662339159087078,
"epoch": 0.26222935044105855,
"step": 327
},
{
"loss": 0.0451,
"grad_norm": 0.39567244052886963,
"learning_rate": 0.00017645262651870926,
"epoch": 0.26303127506014434,
"step": 328
},
{
"loss": 0.1059,
"grad_norm": 0.5877751708030701,
"learning_rate": 0.00017628132319102332,
"epoch": 0.26383319967923013,
"step": 329
},
{
"loss": 0.0314,
"grad_norm": 0.28202834725379944,
"learning_rate": 0.0001761094828138534,
"epoch": 0.264635124298316,
"step": 330
},
{
"loss": 0.0241,
"grad_norm": 0.3100980222225189,
"learning_rate": 0.00017593710659702104,
"epoch": 0.2654370489174018,
"step": 331
},
{
"loss": 0.0515,
"grad_norm": 0.28590792417526245,
"learning_rate": 0.0001757641957541203,
"epoch": 0.26623897353648757,
"step": 332
},
{
"loss": 0.0691,
"grad_norm": 0.7025532126426697,
"learning_rate": 0.0001755907515025091,
"epoch": 0.26704089815557336,
"step": 333
},
{
"loss": 0.0398,
"grad_norm": 0.3652035593986511,
"learning_rate": 0.0001754167750633009,
"epoch": 0.2678428227746592,
"step": 334
},
{
"loss": 0.0597,
"grad_norm": 0.41364148259162903,
"learning_rate": 0.00017524226766135588,
"epoch": 0.268644747393745,
"step": 335
},
{
"loss": 0.0582,
"grad_norm": 0.3338804244995117,
"learning_rate": 0.00017506723052527242,
"epoch": 0.2694466720128308,
"step": 336
},
{
"loss": 0.0833,
"grad_norm": 0.9465529322624207,
"learning_rate": 0.00017489166488737846,
"epoch": 0.2702485966319166,
"step": 337
},
{
"loss": 0.0848,
"grad_norm": 0.44553694128990173,
"learning_rate": 0.00017471557198372274,
"epoch": 0.27105052125100243,
"step": 338
},
{
"loss": 0.1273,
"grad_norm": 1.1346548795700073,
"learning_rate": 0.00017453895305406616,
"epoch": 0.2718524458700882,
"step": 339
},
{
"loss": 0.0736,
"grad_norm": 0.4856693744659424,
"learning_rate": 0.00017436180934187308,
"epoch": 0.272654370489174,
"step": 340
},
{
"loss": 0.0296,
"grad_norm": 0.27393412590026855,
"learning_rate": 0.0001741841420943025,
"epoch": 0.2734562951082598,
"step": 341
},
{
"loss": 0.0504,
"grad_norm": 0.3282850384712219,
"learning_rate": 0.00017400595256219928,
"epoch": 0.27425821972734565,
"step": 342
},
{
"loss": 0.0501,
"grad_norm": 0.3622792363166809,
"learning_rate": 0.00017382724200008546,
"epoch": 0.27506014434643145,
"step": 343
},
{
"loss": 0.0875,
"grad_norm": 0.5967736840248108,
"learning_rate": 0.00017364801166615124,
"epoch": 0.27586206896551724,
"step": 344
},
{
"loss": 0.0797,
"grad_norm": 0.665009617805481,
"learning_rate": 0.0001734682628222462,
"epoch": 0.27666399358460303,
"step": 345
},
{
"loss": 0.0367,
"grad_norm": 0.31664225459098816,
"learning_rate": 0.0001732879967338705,
"epoch": 0.2774659182036889,
"step": 346
},
{
"loss": 0.078,
"grad_norm": 0.613771915435791,
"learning_rate": 0.00017310721467016587,
"epoch": 0.27826784282277467,
"step": 347
},
{
"loss": 0.0309,
"grad_norm": 0.29217079281806946,
"learning_rate": 0.00017292591790390665,
"epoch": 0.27906976744186046,
"step": 348
},
{
"loss": 0.0147,
"grad_norm": 0.1654537171125412,
"learning_rate": 0.00017274410771149094,
"epoch": 0.27987169206094625,
"step": 349
},
{
"loss": 0.0372,
"grad_norm": 0.2641878128051758,
"learning_rate": 0.0001725617853729316,
"epoch": 0.2806736166800321,
"step": 350
},
{
"loss": 0.0438,
"grad_norm": 0.4984488785266876,
"learning_rate": 0.00017237895217184703,
"epoch": 0.2814755412991179,
"step": 351
},
{
"loss": 0.0641,
"grad_norm": 0.4201189875602722,
"learning_rate": 0.00017219560939545246,
"epoch": 0.2822774659182037,
"step": 352
},
{
"loss": 0.0582,
"grad_norm": 0.3273194134235382,
"learning_rate": 0.00017201175833455066,
"epoch": 0.2830793905372895,
"step": 353
},
{
"loss": 0.0358,
"grad_norm": 0.2902083396911621,
"learning_rate": 0.0001718274002835229,
"epoch": 0.2838813151563753,
"step": 354
},
{
"loss": 0.0383,
"grad_norm": 0.1811976581811905,
"learning_rate": 0.00017164253654031986,
"epoch": 0.2846832397754611,
"step": 355
},
{
"loss": 0.0519,
"grad_norm": 0.4728938639163971,
"learning_rate": 0.00017145716840645254,
"epoch": 0.2854851643945469,
"step": 356
},
{
"loss": 0.0437,
"grad_norm": 0.48397713899612427,
"learning_rate": 0.00017127129718698297,
"epoch": 0.2862870890136327,
"step": 357
},
{
"loss": 0.0416,
"grad_norm": 0.3491261303424835,
"learning_rate": 0.0001710849241905151,
"epoch": 0.28708901363271855,
"step": 358
},
{
"loss": 0.0688,
"grad_norm": 0.4765617251396179,
"learning_rate": 0.00017089805072918567,
"epoch": 0.28789093825180434,
"step": 359
},
{
"loss": 0.0959,
"grad_norm": 0.7366757988929749,
"learning_rate": 0.00017071067811865476,
"epoch": 0.28869286287089013,
"step": 360
},
{
"loss": 0.033,
"grad_norm": 0.3149030804634094,
"learning_rate": 0.00017052280767809673,
"epoch": 0.2894947874899759,
"step": 361
},
{
"loss": 0.0458,
"grad_norm": 0.3187673091888428,
"learning_rate": 0.00017033444073019077,
"epoch": 0.29029671210906177,
"step": 362
},
{
"loss": 0.0446,
"grad_norm": 0.3986169099807739,
"learning_rate": 0.0001701455786011118,
"epoch": 0.29109863672814756,
"step": 363
},
{
"loss": 0.0341,
"grad_norm": 0.3107149302959442,
"learning_rate": 0.00016995622262052092,
"epoch": 0.29190056134723336,
"step": 364
},
{
"loss": 0.0392,
"grad_norm": 0.38037049770355225,
"learning_rate": 0.00016976637412155612,
"epoch": 0.29270248596631915,
"step": 365
},
{
"loss": 0.0406,
"grad_norm": 0.35384100675582886,
"learning_rate": 0.00016957603444082295,
"epoch": 0.293504410585405,
"step": 366
},
{
"loss": 0.0676,
"grad_norm": 0.6596208810806274,
"learning_rate": 0.000169385204918385,
"epoch": 0.2943063352044908,
"step": 367
},
{
"loss": 0.0496,
"grad_norm": 0.3856953978538513,
"learning_rate": 0.00016919388689775464,
"epoch": 0.2951082598235766,
"step": 368
},
{
"loss": 0.0541,
"grad_norm": 0.3974038362503052,
"learning_rate": 0.00016900208172588332,
"epoch": 0.29591018444266237,
"step": 369
},
{
"loss": 0.0419,
"grad_norm": 0.40319862961769104,
"learning_rate": 0.00016880979075315237,
"epoch": 0.2967121090617482,
"step": 370
},
{
"loss": 0.0565,
"grad_norm": 0.27359071373939514,
"learning_rate": 0.00016861701533336322,
"epoch": 0.297514033680834,
"step": 371
},
{
"loss": 0.0262,
"grad_norm": 0.351244181394577,
"learning_rate": 0.00016842375682372805,
"epoch": 0.2983159582999198,
"step": 372
},
{
"eval_loss": 0.05102652311325073,
"eval_runtime": 31.7718,
"eval_samples_per_second": 33.048,
"eval_steps_per_second": 8.278,
"epoch": 0.2983159582999198,
"step": 372
},
{
"loss": 0.0428,
"grad_norm": 0.42074060440063477,
"learning_rate": 0.00016823001658486012,
"epoch": 0.2991178829190056,
"step": 373
},
{
"loss": 0.0224,
"grad_norm": 0.2260231077671051,
"learning_rate": 0.00016803579598076432,
"epoch": 0.29991980753809144,
"step": 374
},
{
"loss": 0.0492,
"grad_norm": 0.47774842381477356,
"learning_rate": 0.0001678410963788275,
"epoch": 0.30072173215717724,
"step": 375
},
{
"loss": 0.0638,
"grad_norm": 0.5587054491043091,
"learning_rate": 0.0001676459191498087,
"epoch": 0.301523656776263,
"step": 376
},
{
"loss": 0.0707,
"grad_norm": 0.4895194172859192,
"learning_rate": 0.0001674502656678298,
"epoch": 0.3023255813953488,
"step": 377
},
{
"loss": 0.0279,
"grad_norm": 0.24737556278705597,
"learning_rate": 0.00016725413731036561,
"epoch": 0.30312750601443467,
"step": 378
},
{
"loss": 0.0305,
"grad_norm": 0.35510316491127014,
"learning_rate": 0.00016705753545823423,
"epoch": 0.30392943063352046,
"step": 379
},
{
"loss": 0.0204,
"grad_norm": 0.253121942281723,
"learning_rate": 0.00016686046149558736,
"epoch": 0.30473135525260625,
"step": 380
},
{
"loss": 0.0226,
"grad_norm": 0.25866273045539856,
"learning_rate": 0.00016666291680990055,
"epoch": 0.30553327987169204,
"step": 381
},
{
"loss": 0.0829,
"grad_norm": 0.4675450325012207,
"learning_rate": 0.00016646490279196343,
"epoch": 0.3063352044907779,
"step": 382
},
{
"loss": 0.0203,
"grad_norm": 0.30080100893974304,
"learning_rate": 0.00016626642083586985,
"epoch": 0.3071371291098637,
"step": 383
},
{
"loss": 0.0454,
"grad_norm": 0.5222088694572449,
"learning_rate": 0.00016606747233900815,
"epoch": 0.3079390537289495,
"step": 384
},
{
"loss": 0.0502,
"grad_norm": 0.32578209042549133,
"learning_rate": 0.00016586805870205134,
"epoch": 0.30874097834803527,
"step": 385
},
{
"loss": 0.0567,
"grad_norm": 0.3294476568698883,
"learning_rate": 0.0001656681813289471,
"epoch": 0.3095429029671211,
"step": 386
},
{
"loss": 0.0817,
"grad_norm": 0.7187215685844421,
"learning_rate": 0.0001654678416269081,
"epoch": 0.3103448275862069,
"step": 387
},
{
"loss": 0.0305,
"grad_norm": 0.31030380725860596,
"learning_rate": 0.0001652670410064019,
"epoch": 0.3111467522052927,
"step": 388
},
{
"loss": 0.0614,
"grad_norm": 0.5844921469688416,
"learning_rate": 0.00016506578088114107,
"epoch": 0.3119486768243785,
"step": 389
},
{
"loss": 0.0291,
"grad_norm": 0.2818225622177124,
"learning_rate": 0.00016486406266807345,
"epoch": 0.31275060144346434,
"step": 390
},
{
"loss": 0.1276,
"grad_norm": 0.6056419610977173,
"learning_rate": 0.0001646618877873717,
"epoch": 0.31355252606255013,
"step": 391
},
{
"loss": 0.0534,
"grad_norm": 0.36668699979782104,
"learning_rate": 0.00016445925766242391,
"epoch": 0.3143544506816359,
"step": 392
},
{
"loss": 0.031,
"grad_norm": 0.34223347902297974,
"learning_rate": 0.00016425617371982303,
"epoch": 0.3151563753007217,
"step": 393
},
{
"loss": 0.0862,
"grad_norm": 0.394709050655365,
"learning_rate": 0.00016405263738935718,
"epoch": 0.31595829991980756,
"step": 394
},
{
"loss": 0.0489,
"grad_norm": 0.41530197858810425,
"learning_rate": 0.00016384865010399935,
"epoch": 0.31676022453889335,
"step": 395
},
{
"loss": 0.1056,
"grad_norm": 0.450509637594223,
"learning_rate": 0.00016364421329989755,
"epoch": 0.31756214915797915,
"step": 396
},
{
"loss": 0.0511,
"grad_norm": 0.4890766441822052,
"learning_rate": 0.00016343932841636456,
"epoch": 0.31836407377706494,
"step": 397
},
{
"loss": 0.16,
"grad_norm": 0.6917940974235535,
"learning_rate": 0.00016323399689586768,
"epoch": 0.3191659983961508,
"step": 398
},
{
"loss": 0.0584,
"grad_norm": 0.4217245280742645,
"learning_rate": 0.00016302822018401884,
"epoch": 0.3199679230152366,
"step": 399
},
{
"loss": 0.0471,
"grad_norm": 0.33742472529411316,
"learning_rate": 0.00016282199972956425,
"epoch": 0.32076984763432237,
"step": 400
},
{
"loss": 0.0561,
"grad_norm": 0.30320796370506287,
"learning_rate": 0.00016261533698437418,
"epoch": 0.32157177225340816,
"step": 401
},
{
"loss": 0.0504,
"grad_norm": 0.41129252314567566,
"learning_rate": 0.00016240823340343285,
"epoch": 0.322373696872494,
"step": 402
},
{
"loss": 0.0191,
"grad_norm": 0.21539658308029175,
"learning_rate": 0.00016220069044482814,
"epoch": 0.3231756214915798,
"step": 403
},
{
"loss": 0.0846,
"grad_norm": 0.5003443360328674,
"learning_rate": 0.00016199270956974128,
"epoch": 0.3239775461106656,
"step": 404
},
{
"loss": 0.0821,
"grad_norm": 0.3936382532119751,
"learning_rate": 0.00016178429224243663,
"epoch": 0.3247794707297514,
"step": 405
},
{
"loss": 0.1342,
"grad_norm": 1.055274248123169,
"learning_rate": 0.00016157543993025134,
"epoch": 0.32558139534883723,
"step": 406
},
{
"loss": 0.0784,
"grad_norm": 0.33087801933288574,
"learning_rate": 0.00016136615410358493,
"epoch": 0.326383319967923,
"step": 407
},
{
"loss": 0.0415,
"grad_norm": 0.27356383204460144,
"learning_rate": 0.00016115643623588915,
"epoch": 0.3271852445870088,
"step": 408
},
{
"loss": 0.0449,
"grad_norm": 0.39037784934043884,
"learning_rate": 0.00016094628780365743,
"epoch": 0.3279871692060946,
"step": 409
},
{
"loss": 0.0643,
"grad_norm": 0.3727872967720032,
"learning_rate": 0.00016073571028641452,
"epoch": 0.32878909382518046,
"step": 410
},
{
"loss": 0.0366,
"grad_norm": 0.30508482456207275,
"learning_rate": 0.0001605247051667061,
"epoch": 0.32959101844426625,
"step": 411
},
{
"loss": 0.0384,
"grad_norm": 0.313531756401062,
"learning_rate": 0.00016031327393008845,
"epoch": 0.33039294306335204,
"step": 412
},
{
"loss": 0.034,
"grad_norm": 0.3675989806652069,
"learning_rate": 0.00016010141806511766,
"epoch": 0.33119486768243783,
"step": 413
},
{
"loss": 0.0383,
"grad_norm": 0.2861047685146332,
"learning_rate": 0.00015988913906333946,
"epoch": 0.3319967923015237,
"step": 414
},
{
"loss": 0.0232,
"grad_norm": 0.30425795912742615,
"learning_rate": 0.0001596764384192787,
"epoch": 0.33279871692060947,
"step": 415
},
{
"loss": 0.0582,
"grad_norm": 0.5757021307945251,
"learning_rate": 0.00015946331763042867,
"epoch": 0.33360064153969526,
"step": 416
},
{
"loss": 0.0341,
"grad_norm": 0.2700221538543701,
"learning_rate": 0.00015924977819724068,
"epoch": 0.33440256615878106,
"step": 417
},
{
"loss": 0.0458,
"grad_norm": 0.3216298818588257,
"learning_rate": 0.00015903582162311337,
"epoch": 0.3352044907778669,
"step": 418
},
{
"loss": 0.0497,
"grad_norm": 0.2954160273075104,
"learning_rate": 0.00015882144941438233,
"epoch": 0.3360064153969527,
"step": 419
},
{
"loss": 0.0345,
"grad_norm": 0.30057498812675476,
"learning_rate": 0.00015860666308030932,
"epoch": 0.3368083400160385,
"step": 420
},
{
"loss": 0.046,
"grad_norm": 0.31479984521865845,
"learning_rate": 0.00015839146413307165,
"epoch": 0.3376102646351243,
"step": 421
},
{
"loss": 0.0455,
"grad_norm": 0.31836825609207153,
"learning_rate": 0.00015817585408775168,
"epoch": 0.3384121892542101,
"step": 422
},
{
"loss": 0.021,
"grad_norm": 0.24365834891796112,
"learning_rate": 0.000157959834462326,
"epoch": 0.3392141138732959,
"step": 423
},
{
"loss": 0.0354,
"grad_norm": 0.38124316930770874,
"learning_rate": 0.0001577434067776548,
"epoch": 0.3400160384923817,
"step": 424
},
{
"loss": 0.0853,
"grad_norm": 0.6972952485084534,
"learning_rate": 0.00015752657255747122,
"epoch": 0.3408179631114675,
"step": 425
},
{
"loss": 0.0189,
"grad_norm": 0.2013692706823349,
"learning_rate": 0.00015730933332837045,
"epoch": 0.34161988773055335,
"step": 426
},
{
"loss": 0.0561,
"grad_norm": 0.3334507346153259,
"learning_rate": 0.00015709169061979913,
"epoch": 0.34242181234963914,
"step": 427
},
{
"loss": 0.0652,
"grad_norm": 0.8858683109283447,
"learning_rate": 0.0001568736459640447,
"epoch": 0.34322373696872494,
"step": 428
},
{
"loss": 0.0267,
"grad_norm": 0.2540907561779022,
"learning_rate": 0.00015665520089622423,
"epoch": 0.3440256615878107,
"step": 429
},
{
"loss": 0.0475,
"grad_norm": 0.3518412709236145,
"learning_rate": 0.00015643635695427403,
"epoch": 0.3448275862068966,
"step": 430
},
{
"loss": 0.0311,
"grad_norm": 0.17859573662281036,
"learning_rate": 0.00015621711567893854,
"epoch": 0.34562951082598237,
"step": 431
},
{
"loss": 0.0202,
"grad_norm": 0.3969719409942627,
"learning_rate": 0.00015599747861375955,
"epoch": 0.34643143544506816,
"step": 432
},
{
"loss": 0.0249,
"grad_norm": 0.21450327336788177,
"learning_rate": 0.00015577744730506545,
"epoch": 0.34723336006415395,
"step": 433
},
{
"loss": 0.0416,
"grad_norm": 0.37466296553611755,
"learning_rate": 0.00015555702330196023,
"epoch": 0.3480352846832398,
"step": 434
},
{
"loss": 0.064,
"grad_norm": 0.5470214486122131,
"learning_rate": 0.00015533620815631256,
"epoch": 0.3488372093023256,
"step": 435
},
{
"loss": 0.0988,
"grad_norm": 0.6237538456916809,
"learning_rate": 0.0001551150034227449,
"epoch": 0.3496391339214114,
"step": 436
},
{
"loss": 0.1344,
"grad_norm": 0.5647206902503967,
"learning_rate": 0.0001548934106586226,
"epoch": 0.3504410585404972,
"step": 437
},
{
"loss": 0.0561,
"grad_norm": 0.326889306306839,
"learning_rate": 0.0001546714314240429,
"epoch": 0.351242983159583,
"step": 438
},
{
"loss": 0.0785,
"grad_norm": 0.4708334803581238,
"learning_rate": 0.00015444906728182385,
"epoch": 0.3520449077786688,
"step": 439
},
{
"loss": 0.0392,
"grad_norm": 0.4006723165512085,
"learning_rate": 0.00015422631979749354,
"epoch": 0.3528468323977546,
"step": 440
},
{
"loss": 0.0291,
"grad_norm": 0.25906902551651,
"learning_rate": 0.00015400319053927874,
"epoch": 0.3536487570168404,
"step": 441
},
{
"loss": 0.0242,
"grad_norm": 0.31759947538375854,
"learning_rate": 0.00015377968107809425,
"epoch": 0.35445068163592625,
"step": 442
},
{
"loss": 0.0368,
"grad_norm": 0.2436400055885315,
"learning_rate": 0.00015355579298753153,
"epoch": 0.35525260625501204,
"step": 443
},
{
"loss": 0.0704,
"grad_norm": 0.4932403564453125,
"learning_rate": 0.00015333152784384777,
"epoch": 0.35605453087409783,
"step": 444
},
{
"loss": 0.0529,
"grad_norm": 0.4474373757839203,
"learning_rate": 0.00015310688722595473,
"epoch": 0.3568564554931836,
"step": 445
},
{
"loss": 0.0773,
"grad_norm": 0.5451852679252625,
"learning_rate": 0.00015288187271540767,
"epoch": 0.35765838011226947,
"step": 446
},
{
"loss": 0.0567,
"grad_norm": 0.3486538231372833,
"learning_rate": 0.00015265648589639423,
"epoch": 0.35846030473135526,
"step": 447
},
{
"loss": 0.0445,
"grad_norm": 0.33438971638679504,
"learning_rate": 0.00015243072835572318,
"epoch": 0.35926222935044105,
"step": 448
},
{
"loss": 0.0656,
"grad_norm": 0.6021797060966492,
"learning_rate": 0.00015220460168281335,
"epoch": 0.36006415396952685,
"step": 449
},
{
"loss": 0.0539,
"grad_norm": 0.2629101276397705,
"learning_rate": 0.0001519781074696824,
"epoch": 0.3608660785886127,
"step": 450
},
{
"loss": 0.0559,
"grad_norm": 0.38639259338378906,
"learning_rate": 0.00015175124731093553,
"epoch": 0.3616680032076985,
"step": 451
},
{
"loss": 0.0632,
"grad_norm": 0.40031421184539795,
"learning_rate": 0.00015152402280375454,
"epoch": 0.3624699278267843,
"step": 452
},
{
"loss": 0.0196,
"grad_norm": 0.24561044573783875,
"learning_rate": 0.00015129643554788612,
"epoch": 0.36327185244587007,
"step": 453
},
{
"loss": 0.0563,
"grad_norm": 0.8373734354972839,
"learning_rate": 0.00015106848714563112,
"epoch": 0.3640737770649559,
"step": 454
},
{
"loss": 0.0388,
"grad_norm": 0.38167354464530945,
"learning_rate": 0.00015084017920183272,
"epoch": 0.3648757016840417,
"step": 455
},
{
"loss": 0.0506,
"grad_norm": 0.46959736943244934,
"learning_rate": 0.00015061151332386566,
"epoch": 0.3656776263031275,
"step": 456
},
{
"loss": 0.0314,
"grad_norm": 0.30401480197906494,
"learning_rate": 0.00015038249112162445,
"epoch": 0.3664795509222133,
"step": 457
},
{
"loss": 0.0466,
"grad_norm": 0.2866615355014801,
"learning_rate": 0.00015015311420751244,
"epoch": 0.36728147554129914,
"step": 458
},
{
"loss": 0.0963,
"grad_norm": 0.787212073802948,
"learning_rate": 0.00014992338419643022,
"epoch": 0.36808340016038493,
"step": 459
},
{
"loss": 0.0872,
"grad_norm": 0.49554625153541565,
"learning_rate": 0.00014969330270576427,
"epoch": 0.3688853247794707,
"step": 460
},
{
"loss": 0.0268,
"grad_norm": 0.2807726562023163,
"learning_rate": 0.0001494628713553757,
"epoch": 0.3696872493985565,
"step": 461
},
{
"loss": 0.0488,
"grad_norm": 0.28138288855552673,
"learning_rate": 0.0001492320917675887,
"epoch": 0.37048917401764236,
"step": 462
},
{
"loss": 0.0582,
"grad_norm": 0.3524348735809326,
"learning_rate": 0.0001490009655671792,
"epoch": 0.37129109863672816,
"step": 463
},
{
"loss": 0.0627,
"grad_norm": 0.38492485880851746,
"learning_rate": 0.00014876949438136347,
"epoch": 0.37209302325581395,
"step": 464
},
{
"loss": 0.0581,
"grad_norm": 0.31561005115509033,
"learning_rate": 0.0001485376798397865,
"epoch": 0.37289494787489974,
"step": 465
},
{
"loss": 0.0437,
"grad_norm": 0.30238181352615356,
"learning_rate": 0.00014830552357451076,
"epoch": 0.3736968724939856,
"step": 466
},
{
"loss": 0.0498,
"grad_norm": 0.3918459117412567,
"learning_rate": 0.00014807302722000447,
"epoch": 0.3744987971130714,
"step": 467
},
{
"loss": 0.0245,
"grad_norm": 0.20536094903945923,
"learning_rate": 0.00014784019241313026,
"epoch": 0.37530072173215717,
"step": 468
},
{
"loss": 0.0327,
"grad_norm": 0.2256690412759781,
"learning_rate": 0.0001476070207931336,
"epoch": 0.37610264635124296,
"step": 469
},
{
"loss": 0.0626,
"grad_norm": 0.42872869968414307,
"learning_rate": 0.00014737351400163128,
"epoch": 0.3769045709703288,
"step": 470
},
{
"loss": 0.0555,
"grad_norm": 0.3690952658653259,
"learning_rate": 0.0001471396736825998,
"epoch": 0.3777064955894146,
"step": 471
},
{
"loss": 0.0675,
"grad_norm": 0.4958707094192505,
"learning_rate": 0.0001469055014823637,
"epoch": 0.3785084202085004,
"step": 472
},
{
"loss": 0.0505,
"grad_norm": 0.319414883852005,
"learning_rate": 0.0001466709990495843,
"epoch": 0.3793103448275862,
"step": 473
},
{
"loss": 0.0455,
"grad_norm": 0.34806713461875916,
"learning_rate": 0.00014643616803524778,
"epoch": 0.38011226944667204,
"step": 474
},
{
"loss": 0.0284,
"grad_norm": 0.25858795642852783,
"learning_rate": 0.0001462010100926536,
"epoch": 0.3809141940657578,
"step": 475
},
{
"loss": 0.0587,
"grad_norm": 0.39808589220046997,
"learning_rate": 0.00014596552687740302,
"epoch": 0.3817161186848436,
"step": 476
},
{
"loss": 0.0476,
"grad_norm": 0.4907149374485016,
"learning_rate": 0.00014572972004738732,
"epoch": 0.3825180433039294,
"step": 477
},
{
"loss": 0.0506,
"grad_norm": 0.30614417791366577,
"learning_rate": 0.00014549359126277608,
"epoch": 0.38331996792301526,
"step": 478
},
{
"loss": 0.0259,
"grad_norm": 0.3281151354312897,
"learning_rate": 0.00014525714218600565,
"epoch": 0.38412189254210105,
"step": 479
},
{
"loss": 0.0376,
"grad_norm": 0.34824758768081665,
"learning_rate": 0.00014502037448176734,
"epoch": 0.38492381716118684,
"step": 480
},
{
"loss": 0.0425,
"grad_norm": 0.2705196440219879,
"learning_rate": 0.00014478328981699568,
"epoch": 0.38572574178027264,
"step": 481
},
{
"loss": 0.0466,
"grad_norm": 0.2696325480937958,
"learning_rate": 0.00014454588986085676,
"epoch": 0.3865276663993585,
"step": 482
},
{
"loss": 0.0372,
"grad_norm": 0.3687107264995575,
"learning_rate": 0.00014430817628473638,
"epoch": 0.3873295910184443,
"step": 483
},
{
"loss": 0.0554,
"grad_norm": 0.3724960684776306,
"learning_rate": 0.00014407015076222846,
"epoch": 0.38813151563753007,
"step": 484
},
{
"loss": 0.079,
"grad_norm": 0.5664525032043457,
"learning_rate": 0.000143831814969123,
"epoch": 0.38893344025661586,
"step": 485
},
{
"loss": 0.0318,
"grad_norm": 0.20477205514907837,
"learning_rate": 0.00014359317058339457,
"epoch": 0.3897353648757017,
"step": 486
},
{
"loss": 0.0456,
"grad_norm": 0.3792808949947357,
"learning_rate": 0.0001433542192851902,
"epoch": 0.3905372894947875,
"step": 487
},
{
"loss": 0.0253,
"grad_norm": 0.26179176568984985,
"learning_rate": 0.00014311496275681783,
"epoch": 0.3913392141138733,
"step": 488
},
{
"loss": 0.0398,
"grad_norm": 0.29624319076538086,
"learning_rate": 0.00014287540268273426,
"epoch": 0.3921411387329591,
"step": 489
},
{
"loss": 0.0425,
"grad_norm": 0.3284585773944855,
"learning_rate": 0.00014263554074953337,
"epoch": 0.39294306335204493,
"step": 490
},
{
"loss": 0.0277,
"grad_norm": 0.23194313049316406,
"learning_rate": 0.00014239537864593432,
"epoch": 0.3937449879711307,
"step": 491
},
{
"loss": 0.047,
"grad_norm": 0.557132363319397,
"learning_rate": 0.00014215491806276944,
"epoch": 0.3945469125902165,
"step": 492
},
{
"loss": 0.0495,
"grad_norm": 0.3186132311820984,
"learning_rate": 0.0001419141606929726,
"epoch": 0.3953488372093023,
"step": 493
},
{
"loss": 0.0476,
"grad_norm": 0.4139769375324249,
"learning_rate": 0.0001416731082315671,
"epoch": 0.39615076182838815,
"step": 494
},
{
"loss": 0.0707,
"grad_norm": 0.6908156275749207,
"learning_rate": 0.00014143176237565387,
"epoch": 0.39695268644747395,
"step": 495
},
{
"loss": 0.0328,
"grad_norm": 0.47614389657974243,
"learning_rate": 0.0001411901248243993,
"epoch": 0.39775461106655974,
"step": 496
},
{
"eval_loss": 0.04790589585900307,
"eval_runtime": 31.9045,
"eval_samples_per_second": 32.911,
"eval_steps_per_second": 8.243,
"epoch": 0.39775461106655974,
"step": 496
},
{
"loss": 0.0491,
"grad_norm": 0.4075859487056732,
"learning_rate": 0.00014094819727902353,
"epoch": 0.39855653568564553,
"step": 497
},
{
"loss": 0.0679,
"grad_norm": 0.2855551838874817,
"learning_rate": 0.0001407059814427884,
"epoch": 0.3993584603047314,
"step": 498
},
{
"loss": 0.0366,
"grad_norm": 0.7473935484886169,
"learning_rate": 0.00014046347902098535,
"epoch": 0.40016038492381717,
"step": 499
},
{
"loss": 0.0177,
"grad_norm": 0.16580775380134583,
"learning_rate": 0.00014022069172092352,
"epoch": 0.40096230954290296,
"step": 500
},
{
"loss": 0.04,
"grad_norm": 0.3346802592277527,
"learning_rate": 0.00013997762125191773,
"epoch": 0.40176423416198875,
"step": 501
},
{
"loss": 0.065,
"grad_norm": 0.5194714069366455,
"learning_rate": 0.00013973426932527636,
"epoch": 0.4025661587810746,
"step": 502
},
{
"loss": 0.0412,
"grad_norm": 0.25542134046554565,
"learning_rate": 0.00013949063765428943,
"epoch": 0.4033680834001604,
"step": 503
},
{
"loss": 0.0768,
"grad_norm": 0.46887674927711487,
"learning_rate": 0.00013924672795421637,
"epoch": 0.4041700080192462,
"step": 504
},
{
"loss": 0.0508,
"grad_norm": 0.3275587558746338,
"learning_rate": 0.00013900254194227415,
"epoch": 0.404971932638332,
"step": 505
},
{
"loss": 0.0253,
"grad_norm": 0.2020861655473709,
"learning_rate": 0.000138758081337625,
"epoch": 0.4057738572574178,
"step": 506
},
{
"loss": 0.0322,
"grad_norm": 0.5022090673446655,
"learning_rate": 0.0001385133478613644,
"epoch": 0.4065757818765036,
"step": 507
},
{
"loss": 0.121,
"grad_norm": 1.1316415071487427,
"learning_rate": 0.000138268343236509,
"epoch": 0.4073777064955894,
"step": 508
},
{
"loss": 0.0239,
"grad_norm": 0.29626041650772095,
"learning_rate": 0.00013802306918798437,
"epoch": 0.4081796311146752,
"step": 509
},
{
"loss": 0.0354,
"grad_norm": 0.19514746963977814,
"learning_rate": 0.00013777752744261295,
"epoch": 0.40898155573376105,
"step": 510
},
{
"loss": 0.0692,
"grad_norm": 0.4436163902282715,
"learning_rate": 0.0001375317197291019,
"epoch": 0.40978348035284684,
"step": 511
},
{
"loss": 0.0427,
"grad_norm": 0.36557817459106445,
"learning_rate": 0.00013728564777803088,
"epoch": 0.41058540497193263,
"step": 512
},
{
"loss": 0.0407,
"grad_norm": 0.3514234721660614,
"learning_rate": 0.00013703931332183987,
"epoch": 0.4113873295910184,
"step": 513
},
{
"loss": 0.0235,
"grad_norm": 0.24922512471675873,
"learning_rate": 0.00013679271809481693,
"epoch": 0.41218925421010427,
"step": 514
},
{
"loss": 0.0492,
"grad_norm": 0.4417109787464142,
"learning_rate": 0.00013654586383308619,
"epoch": 0.41299117882919006,
"step": 515
},
{
"loss": 0.0973,
"grad_norm": 0.5984606146812439,
"learning_rate": 0.00013629875227459532,
"epoch": 0.41379310344827586,
"step": 516
},
{
"loss": 0.0597,
"grad_norm": 0.5426322221755981,
"learning_rate": 0.0001360513851591036,
"epoch": 0.41459502806736165,
"step": 517
},
{
"loss": 0.081,
"grad_norm": 0.7733796238899231,
"learning_rate": 0.00013580376422816945,
"epoch": 0.4153969526864475,
"step": 518
},
{
"loss": 0.031,
"grad_norm": 0.33183905482292175,
"learning_rate": 0.00013555589122513827,
"epoch": 0.4161988773055333,
"step": 519
},
{
"loss": 0.0592,
"grad_norm": 0.4072870910167694,
"learning_rate": 0.0001353077678951301,
"epoch": 0.4170008019246191,
"step": 520
},
{
"loss": 0.0523,
"grad_norm": 0.3927518427371979,
"learning_rate": 0.0001350593959850274,
"epoch": 0.41780272654370487,
"step": 521
},
{
"loss": 0.0332,
"grad_norm": 0.3755587637424469,
"learning_rate": 0.00013481077724346278,
"epoch": 0.4186046511627907,
"step": 522
},
{
"loss": 0.1049,
"grad_norm": 0.5004737377166748,
"learning_rate": 0.0001345619134208066,
"epoch": 0.4194065757818765,
"step": 523
},
{
"loss": 0.0878,
"grad_norm": 0.3315165042877197,
"learning_rate": 0.00013431280626915467,
"epoch": 0.4202085004009623,
"step": 524
},
{
"loss": 0.0339,
"grad_norm": 0.27768945693969727,
"learning_rate": 0.00013406345754231588,
"epoch": 0.4210104250200481,
"step": 525
},
{
"loss": 0.0433,
"grad_norm": 0.3195447325706482,
"learning_rate": 0.00013381386899580003,
"epoch": 0.42181234963913394,
"step": 526
},
{
"loss": 0.028,
"grad_norm": 0.2721582055091858,
"learning_rate": 0.00013356404238680527,
"epoch": 0.42261427425821974,
"step": 527
},
{
"loss": 0.0324,
"grad_norm": 0.2353498488664627,
"learning_rate": 0.00013331397947420576,
"epoch": 0.4234161988773055,
"step": 528
},
{
"loss": 0.0572,
"grad_norm": 0.49510321021080017,
"learning_rate": 0.0001330636820185394,
"epoch": 0.4242181234963913,
"step": 529
},
{
"loss": 0.0586,
"grad_norm": 0.5035674571990967,
"learning_rate": 0.00013281315178199536,
"epoch": 0.42502004811547717,
"step": 530
},
{
"loss": 0.0337,
"grad_norm": 0.761020839214325,
"learning_rate": 0.00013256239052840155,
"epoch": 0.42582197273456296,
"step": 531
},
{
"loss": 0.0587,
"grad_norm": 0.2618282735347748,
"learning_rate": 0.00013231140002321253,
"epoch": 0.42662389735364875,
"step": 532
},
{
"loss": 0.0257,
"grad_norm": 0.2896956503391266,
"learning_rate": 0.0001320601820334967,
"epoch": 0.42742582197273454,
"step": 533
},
{
"loss": 0.0461,
"grad_norm": 0.48962509632110596,
"learning_rate": 0.00013180873832792416,
"epoch": 0.4282277465918204,
"step": 534
},
{
"loss": 0.0093,
"grad_norm": 0.13504081964492798,
"learning_rate": 0.00013155707067675406,
"epoch": 0.4290296712109062,
"step": 535
},
{
"loss": 0.0417,
"grad_norm": 0.3743266463279724,
"learning_rate": 0.00013130518085182225,
"epoch": 0.429831595829992,
"step": 536
},
{
"loss": 0.0343,
"grad_norm": 0.29630181193351746,
"learning_rate": 0.00013105307062652872,
"epoch": 0.43063352044907777,
"step": 537
},
{
"loss": 0.0291,
"grad_norm": 0.25488558411598206,
"learning_rate": 0.00013080074177582526,
"epoch": 0.4314354450681636,
"step": 538
},
{
"loss": 0.091,
"grad_norm": 0.4586013853549957,
"learning_rate": 0.00013054819607620274,
"epoch": 0.4322373696872494,
"step": 539
},
{
"loss": 0.1163,
"grad_norm": 0.7305994033813477,
"learning_rate": 0.00013029543530567884,
"epoch": 0.4330392943063352,
"step": 540
},
{
"loss": 0.0339,
"grad_norm": 0.234614759683609,
"learning_rate": 0.00013004246124378535,
"epoch": 0.433841218925421,
"step": 541
},
{
"loss": 0.0321,
"grad_norm": 0.2804659903049469,
"learning_rate": 0.00012978927567155573,
"epoch": 0.43464314354450684,
"step": 542
},
{
"loss": 0.0514,
"grad_norm": 0.5687031745910645,
"learning_rate": 0.0001295358803715126,
"epoch": 0.43544506816359263,
"step": 543
},
{
"loss": 0.0824,
"grad_norm": 0.583227276802063,
"learning_rate": 0.00012928227712765504,
"epoch": 0.4362469927826784,
"step": 544
},
{
"loss": 0.0453,
"grad_norm": 0.31921252608299255,
"learning_rate": 0.00012902846772544624,
"epoch": 0.4370489174017642,
"step": 545
},
{
"loss": 0.0495,
"grad_norm": 0.4188879430294037,
"learning_rate": 0.00012877445395180078,
"epoch": 0.43785084202085006,
"step": 546
},
{
"loss": 0.0393,
"grad_norm": 0.2866995334625244,
"learning_rate": 0.00012852023759507203,
"epoch": 0.43865276663993585,
"step": 547
},
{
"loss": 0.0772,
"grad_norm": 0.48335814476013184,
"learning_rate": 0.00012826582044503978,
"epoch": 0.43945469125902165,
"step": 548
},
{
"loss": 0.0537,
"grad_norm": 0.3400033414363861,
"learning_rate": 0.0001280112042928973,
"epoch": 0.44025661587810744,
"step": 549
},
{
"loss": 0.0503,
"grad_norm": 0.43847382068634033,
"learning_rate": 0.00012775639093123907,
"epoch": 0.4410585404971933,
"step": 550
},
{
"loss": 0.0659,
"grad_norm": 0.3055131137371063,
"learning_rate": 0.00012750138215404782,
"epoch": 0.4418604651162791,
"step": 551
},
{
"loss": 0.0532,
"grad_norm": 0.31449994444847107,
"learning_rate": 0.0001272461797566823,
"epoch": 0.44266238973536487,
"step": 552
},
{
"loss": 0.0273,
"grad_norm": 0.39831122756004333,
"learning_rate": 0.00012699078553586422,
"epoch": 0.44346431435445066,
"step": 553
},
{
"loss": 0.04,
"grad_norm": 0.464834600687027,
"learning_rate": 0.00012673520128966592,
"epoch": 0.4442662389735365,
"step": 554
},
{
"loss": 0.0679,
"grad_norm": 0.3944595158100128,
"learning_rate": 0.00012647942881749755,
"epoch": 0.4450681635926223,
"step": 555
},
{
"loss": 0.0271,
"grad_norm": 0.21679094433784485,
"learning_rate": 0.00012622346992009447,
"epoch": 0.4458700882117081,
"step": 556
},
{
"loss": 0.0349,
"grad_norm": 0.34640711545944214,
"learning_rate": 0.00012596732639950442,
"epoch": 0.4466720128307939,
"step": 557
},
{
"loss": 0.0445,
"grad_norm": 0.5096455216407776,
"learning_rate": 0.00012571100005907523,
"epoch": 0.44747393744987973,
"step": 558
},
{
"loss": 0.0544,
"grad_norm": 0.35034018754959106,
"learning_rate": 0.0001254544927034415,
"epoch": 0.4482758620689655,
"step": 559
},
{
"loss": 0.1161,
"grad_norm": 0.4701795279979706,
"learning_rate": 0.00012519780613851254,
"epoch": 0.4490777866880513,
"step": 560
},
{
"loss": 0.0259,
"grad_norm": 0.25175973773002625,
"learning_rate": 0.00012494094217145918,
"epoch": 0.4498797113071371,
"step": 561
},
{
"loss": 0.0431,
"grad_norm": 0.30269894003868103,
"learning_rate": 0.00012468390261070138,
"epoch": 0.45068163592622296,
"step": 562
},
{
"loss": 0.0234,
"grad_norm": 0.23327726125717163,
"learning_rate": 0.0001244266892658952,
"epoch": 0.45148356054530875,
"step": 563
},
{
"loss": 0.0433,
"grad_norm": 0.26909253001213074,
"learning_rate": 0.00012416930394792026,
"epoch": 0.45228548516439454,
"step": 564
},
{
"loss": 0.0676,
"grad_norm": 0.4461866319179535,
"learning_rate": 0.00012391174846886698,
"epoch": 0.45308740978348033,
"step": 565
},
{
"loss": 0.0461,
"grad_norm": 0.4100785553455353,
"learning_rate": 0.0001236540246420237,
"epoch": 0.4538893344025662,
"step": 566
},
{
"loss": 0.0338,
"grad_norm": 0.35902178287506104,
"learning_rate": 0.00012339613428186407,
"epoch": 0.454691259021652,
"step": 567
},
{
"loss": 0.0544,
"grad_norm": 0.43561217188835144,
"learning_rate": 0.00012313807920403419,
"epoch": 0.45549318364073776,
"step": 568
},
{
"loss": 0.0476,
"grad_norm": 0.34299418330192566,
"learning_rate": 0.0001228798612253397,
"epoch": 0.45629510825982356,
"step": 569
},
{
"loss": 0.1276,
"grad_norm": 0.5789246559143066,
"learning_rate": 0.00012262148216373331,
"epoch": 0.4570970328789094,
"step": 570
},
{
"loss": 0.0243,
"grad_norm": 0.42919760942459106,
"learning_rate": 0.00012236294383830175,
"epoch": 0.4578989574979952,
"step": 571
},
{
"loss": 0.0459,
"grad_norm": 0.24285271763801575,
"learning_rate": 0.00012210424806925301,
"epoch": 0.458700882117081,
"step": 572
},
{
"loss": 0.0573,
"grad_norm": 0.46728515625,
"learning_rate": 0.00012184539667790349,
"epoch": 0.4595028067361668,
"step": 573
},
{
"loss": 0.0543,
"grad_norm": 0.2979477643966675,
"learning_rate": 0.00012158639148666534,
"epoch": 0.4603047313552526,
"step": 574
},
{
"loss": 0.0613,
"grad_norm": 0.35671502351760864,
"learning_rate": 0.00012132723431903341,
"epoch": 0.4611066559743384,
"step": 575
},
{
"loss": 0.0328,
"grad_norm": 0.279118150472641,
"learning_rate": 0.00012106792699957263,
"epoch": 0.4619085805934242,
"step": 576
},
{
"loss": 0.0595,
"grad_norm": 0.6142110824584961,
"learning_rate": 0.000120808471353905,
"epoch": 0.46271050521251,
"step": 577
},
{
"loss": 0.0691,
"grad_norm": 0.7308236956596375,
"learning_rate": 0.00012054886920869681,
"epoch": 0.46351242983159585,
"step": 578
},
{
"loss": 0.0528,
"grad_norm": 0.45223355293273926,
"learning_rate": 0.00012028912239164569,
"epoch": 0.46431435445068164,
"step": 579
},
{
"loss": 0.0373,
"grad_norm": 0.2948494255542755,
"learning_rate": 0.00012002923273146794,
"epoch": 0.46511627906976744,
"step": 580
},
{
"loss": 0.0414,
"grad_norm": 0.27661287784576416,
"learning_rate": 0.00011976920205788542,
"epoch": 0.4659182036888532,
"step": 581
},
{
"loss": 0.0578,
"grad_norm": 0.4644034504890442,
"learning_rate": 0.00011950903220161285,
"epoch": 0.4667201283079391,
"step": 582
},
{
"loss": 0.0565,
"grad_norm": 0.6451210379600525,
"learning_rate": 0.00011924872499434479,
"epoch": 0.46752205292702487,
"step": 583
},
{
"loss": 0.0231,
"grad_norm": 0.21448062360286713,
"learning_rate": 0.00011898828226874284,
"epoch": 0.46832397754611066,
"step": 584
},
{
"loss": 0.0166,
"grad_norm": 0.15424512326717377,
"learning_rate": 0.00011872770585842273,
"epoch": 0.46912590216519645,
"step": 585
},
{
"loss": 0.0473,
"grad_norm": 0.31540054082870483,
"learning_rate": 0.0001184669975979413,
"epoch": 0.4699278267842823,
"step": 586
},
{
"loss": 0.0165,
"grad_norm": 0.13097421824932098,
"learning_rate": 0.00011820615932278374,
"epoch": 0.4707297514033681,
"step": 587
},
{
"loss": 0.0318,
"grad_norm": 0.308799684047699,
"learning_rate": 0.00011794519286935055,
"epoch": 0.4715316760224539,
"step": 588
},
{
"loss": 0.0471,
"grad_norm": 0.2947872579097748,
"learning_rate": 0.00011768410007494466,
"epoch": 0.4723336006415397,
"step": 589
},
{
"loss": 0.0516,
"grad_norm": 0.22661037743091583,
"learning_rate": 0.0001174228827777585,
"epoch": 0.4731355252606255,
"step": 590
},
{
"loss": 0.03,
"grad_norm": 0.24548248946666718,
"learning_rate": 0.00011716154281686105,
"epoch": 0.4739374498797113,
"step": 591
},
{
"loss": 0.0365,
"grad_norm": 0.2837478220462799,
"learning_rate": 0.00011690008203218493,
"epoch": 0.4747393744987971,
"step": 592
},
{
"loss": 0.0538,
"grad_norm": 0.3481287360191345,
"learning_rate": 0.00011663850226451327,
"epoch": 0.4755412991178829,
"step": 593
},
{
"loss": 0.048,
"grad_norm": 0.4488002061843872,
"learning_rate": 0.000116376805355467,
"epoch": 0.47634322373696875,
"step": 594
},
{
"loss": 0.015,
"grad_norm": 0.16303379833698273,
"learning_rate": 0.00011611499314749177,
"epoch": 0.47714514835605454,
"step": 595
},
{
"loss": 0.0246,
"grad_norm": 0.22950126230716705,
"learning_rate": 0.0001158530674838449,
"epoch": 0.47794707297514033,
"step": 596
},
{
"loss": 0.0116,
"grad_norm": 0.1625395268201828,
"learning_rate": 0.0001155910302085826,
"epoch": 0.4787489975942261,
"step": 597
},
{
"loss": 0.0301,
"grad_norm": 0.23239369690418243,
"learning_rate": 0.00011532888316654675,
"epoch": 0.47955092221331197,
"step": 598
},
{
"loss": 0.1168,
"grad_norm": 0.7024423480033875,
"learning_rate": 0.00011506662820335208,
"epoch": 0.48035284683239776,
"step": 599
},
{
"loss": 0.0615,
"grad_norm": 0.31283116340637207,
"learning_rate": 0.00011480426716537315,
"epoch": 0.48115477145148355,
"step": 600
},
{
"loss": 0.0387,
"grad_norm": 0.22865501046180725,
"learning_rate": 0.0001145418018997313,
"epoch": 0.48195669607056935,
"step": 601
},
{
"loss": 0.0189,
"grad_norm": 0.2138299196958542,
"learning_rate": 0.00011427923425428164,
"epoch": 0.4827586206896552,
"step": 602
},
{
"loss": 0.0703,
"grad_norm": 0.3493439257144928,
"learning_rate": 0.00011401656607760015,
"epoch": 0.483560545308741,
"step": 603
},
{
"loss": 0.0178,
"grad_norm": 0.2075956165790558,
"learning_rate": 0.00011375379921897051,
"epoch": 0.4843624699278268,
"step": 604
},
{
"loss": 0.0456,
"grad_norm": 0.4019928276538849,
"learning_rate": 0.0001134909355283712,
"epoch": 0.48516439454691257,
"step": 605
},
{
"loss": 0.0332,
"grad_norm": 0.3662348687648773,
"learning_rate": 0.00011322797685646242,
"epoch": 0.4859663191659984,
"step": 606
},
{
"loss": 0.0289,
"grad_norm": 0.26660025119781494,
"learning_rate": 0.00011296492505457314,
"epoch": 0.4867682437850842,
"step": 607
},
{
"loss": 0.0251,
"grad_norm": 0.1749676614999771,
"learning_rate": 0.00011270178197468789,
"epoch": 0.48757016840417,
"step": 608
},
{
"loss": 0.0338,
"grad_norm": 0.2791067957878113,
"learning_rate": 0.00011243854946943388,
"epoch": 0.4883720930232558,
"step": 609
},
{
"loss": 0.0395,
"grad_norm": 0.21187956631183624,
"learning_rate": 0.00011217522939206795,
"epoch": 0.48917401764234164,
"step": 610
},
{
"loss": 0.0596,
"grad_norm": 0.4193437695503235,
"learning_rate": 0.00011191182359646337,
"epoch": 0.48997594226142743,
"step": 611
},
{
"loss": 0.0437,
"grad_norm": 0.42110878229141235,
"learning_rate": 0.00011164833393709706,
"epoch": 0.4907778668805132,
"step": 612
},
{
"loss": 0.0364,
"grad_norm": 0.3795287013053894,
"learning_rate": 0.00011138476226903625,
"epoch": 0.491579791499599,
"step": 613
},
{
"loss": 0.0537,
"grad_norm": 0.307650089263916,
"learning_rate": 0.00011112111044792557,
"epoch": 0.49238171611868486,
"step": 614
},
{
"loss": 0.044,
"grad_norm": 0.33749890327453613,
"learning_rate": 0.00011085738032997398,
"epoch": 0.49318364073777066,
"step": 615
},
{
"loss": 0.0479,
"grad_norm": 0.3227038085460663,
"learning_rate": 0.00011059357377194161,
"epoch": 0.49398556535685645,
"step": 616
},
{
"loss": 0.0452,
"grad_norm": 0.3482477068901062,
"learning_rate": 0.00011032969263112688,
"epoch": 0.49478748997594224,
"step": 617
},
{
"loss": 0.0315,
"grad_norm": 0.27159547805786133,
"learning_rate": 0.00011006573876535322,
"epoch": 0.4955894145950281,
"step": 618
},
{
"loss": 0.0637,
"grad_norm": 0.40270885825157166,
"learning_rate": 0.0001098017140329561,
"epoch": 0.4963913392141139,
"step": 619
},
{
"loss": 0.022,
"grad_norm": 0.21836791932582855,
"learning_rate": 0.00010953762029276982,
"epoch": 0.4971932638331997,
"step": 620
},
{
"eval_loss": 0.04481621831655502,
"eval_runtime": 32.1222,
"eval_samples_per_second": 32.688,
"eval_steps_per_second": 8.187,
"epoch": 0.4971932638331997,
"step": 620
},
{
"loss": 0.0203,
"grad_norm": 0.1830679029226303,
"learning_rate": 0.00010927345940411467,
"epoch": 0.49799518845228546,
"step": 621
},
{
"loss": 0.0607,
"grad_norm": 0.4090077579021454,
"learning_rate": 0.00010900923322678364,
"epoch": 0.4987971130713713,
"step": 622
},
{
"loss": 0.0342,
"grad_norm": 0.28506171703338623,
"learning_rate": 0.00010874494362102931,
"epoch": 0.4995990376904571,
"step": 623
},
{
"loss": 0.0318,
"grad_norm": 0.31976205110549927,
"learning_rate": 0.00010848059244755093,
"epoch": 0.5004009623095429,
"step": 624
},
{
"loss": 0.0556,
"grad_norm": 0.2998436391353607,
"learning_rate": 0.0001082161815674811,
"epoch": 0.5012028869286287,
"step": 625
},
{
"loss": 0.021,
"grad_norm": 0.22129428386688232,
"learning_rate": 0.00010795171284237284,
"epoch": 0.5020048115477145,
"step": 626
},
{
"loss": 0.0266,
"grad_norm": 0.2941289246082306,
"learning_rate": 0.00010768718813418644,
"epoch": 0.5028067361668003,
"step": 627
},
{
"loss": 0.0568,
"grad_norm": 0.3848710358142853,
"learning_rate": 0.00010742260930527625,
"epoch": 0.5036086607858862,
"step": 628
},
{
"loss": 0.0388,
"grad_norm": 0.33324113488197327,
"learning_rate": 0.00010715797821837776,
"epoch": 0.504410585404972,
"step": 629
},
{
"loss": 0.0357,
"grad_norm": 0.350759893655777,
"learning_rate": 0.00010689329673659429,
"epoch": 0.5052125100240578,
"step": 630
},
{
"loss": 0.0111,
"grad_norm": 0.15862928330898285,
"learning_rate": 0.00010662856672338397,
"epoch": 0.5060144346431436,
"step": 631
},
{
"loss": 0.0382,
"grad_norm": 0.26137423515319824,
"learning_rate": 0.00010636379004254664,
"epoch": 0.5068163592622293,
"step": 632
},
{
"loss": 0.0244,
"grad_norm": 0.28696557879447937,
"learning_rate": 0.00010609896855821068,
"epoch": 0.5076182838813151,
"step": 633
},
{
"loss": 0.0497,
"grad_norm": 0.3603985905647278,
"learning_rate": 0.00010583410413481994,
"epoch": 0.5084202085004009,
"step": 634
},
{
"loss": 0.0837,
"grad_norm": 0.653423011302948,
"learning_rate": 0.00010556919863712054,
"epoch": 0.5092221331194867,
"step": 635
},
{
"loss": 0.0243,
"grad_norm": 0.23948614299297333,
"learning_rate": 0.00010530425393014774,
"epoch": 0.5100240577385726,
"step": 636
},
{
"loss": 0.0271,
"grad_norm": 0.22972430288791656,
"learning_rate": 0.00010503927187921292,
"epoch": 0.5108259823576584,
"step": 637
},
{
"loss": 0.047,
"grad_norm": 0.4855923354625702,
"learning_rate": 0.00010477425434989036,
"epoch": 0.5116279069767442,
"step": 638
},
{
"loss": 0.0319,
"grad_norm": 0.3573042154312134,
"learning_rate": 0.0001045092032080041,
"epoch": 0.51242983159583,
"step": 639
},
{
"loss": 0.0679,
"grad_norm": 0.4812779426574707,
"learning_rate": 0.00010424412031961484,
"epoch": 0.5132317562149158,
"step": 640
},
{
"loss": 0.016,
"grad_norm": 0.21666432917118073,
"learning_rate": 0.00010397900755100678,
"epoch": 0.5140336808340016,
"step": 641
},
{
"loss": 0.018,
"grad_norm": 0.19402359426021576,
"learning_rate": 0.00010371386676867447,
"epoch": 0.5148356054530874,
"step": 642
},
{
"loss": 0.0917,
"grad_norm": 0.5789539217948914,
"learning_rate": 0.00010344869983930974,
"epoch": 0.5156375300721732,
"step": 643
},
{
"loss": 0.02,
"grad_norm": 0.19617126882076263,
"learning_rate": 0.00010318350862978848,
"epoch": 0.5164394546912591,
"step": 644
},
{
"loss": 0.0407,
"grad_norm": 0.33302173018455505,
"learning_rate": 0.00010291829500715744,
"epoch": 0.5172413793103449,
"step": 645
},
{
"loss": 0.0685,
"grad_norm": 0.4327728748321533,
"learning_rate": 0.00010265306083862134,
"epoch": 0.5180433039294307,
"step": 646
},
{
"loss": 0.029,
"grad_norm": 0.3352719843387604,
"learning_rate": 0.00010238780799152938,
"epoch": 0.5188452285485164,
"step": 647
},
{
"loss": 0.0195,
"grad_norm": 0.20400014519691467,
"learning_rate": 0.00010212253833336237,
"epoch": 0.5196471531676022,
"step": 648
},
{
"loss": 0.052,
"grad_norm": 0.5209816098213196,
"learning_rate": 0.00010185725373171942,
"epoch": 0.520449077786688,
"step": 649
},
{
"loss": 0.0123,
"grad_norm": 0.2923823595046997,
"learning_rate": 0.0001015919560543049,
"epoch": 0.5212510024057738,
"step": 650
},
{
"loss": 0.0412,
"grad_norm": 0.3393188714981079,
"learning_rate": 0.0001013266471689152,
"epoch": 0.5220529270248596,
"step": 651
},
{
"loss": 0.0188,
"grad_norm": 0.24097828567028046,
"learning_rate": 0.00010106132894342564,
"epoch": 0.5228548516439455,
"step": 652
},
{
"loss": 0.0686,
"grad_norm": 0.44344210624694824,
"learning_rate": 0.00010079600324577722,
"epoch": 0.5236567762630313,
"step": 653
},
{
"loss": 0.0143,
"grad_norm": 0.2262842059135437,
"learning_rate": 0.0001005306719439637,
"epoch": 0.5244587008821171,
"step": 654
},
{
"loss": 0.0288,
"grad_norm": 0.2735036611557007,
"learning_rate": 0.00010026533690601814,
"epoch": 0.5252606255012029,
"step": 655
},
{
"loss": 0.0554,
"grad_norm": 0.5491762757301331,
"learning_rate": 0.0001,
"epoch": 0.5260625501202887,
"step": 656
},
{
"loss": 0.052,
"grad_norm": 0.3667290508747101,
"learning_rate": 9.973466309398187e-05,
"epoch": 0.5268644747393745,
"step": 657
},
{
"loss": 0.029,
"grad_norm": 0.24463889002799988,
"learning_rate": 9.946932805603635e-05,
"epoch": 0.5276663993584603,
"step": 658
},
{
"loss": 0.0305,
"grad_norm": 0.34307271242141724,
"learning_rate": 9.92039967542228e-05,
"epoch": 0.5284683239775461,
"step": 659
},
{
"loss": 0.0543,
"grad_norm": 0.32049161195755005,
"learning_rate": 9.89386710565744e-05,
"epoch": 0.529270248596632,
"step": 660
},
{
"loss": 0.0745,
"grad_norm": 0.5253795981407166,
"learning_rate": 9.867335283108479e-05,
"epoch": 0.5300721732157178,
"step": 661
},
{
"loss": 0.0205,
"grad_norm": 0.22634099423885345,
"learning_rate": 9.840804394569513e-05,
"epoch": 0.5308740978348035,
"step": 662
},
{
"loss": 0.0478,
"grad_norm": 0.3835356831550598,
"learning_rate": 9.81427462682806e-05,
"epoch": 0.5316760224538893,
"step": 663
},
{
"loss": 0.0198,
"grad_norm": 0.25156858563423157,
"learning_rate": 9.787746166663764e-05,
"epoch": 0.5324779470729751,
"step": 664
},
{
"loss": 0.066,
"grad_norm": 0.577354907989502,
"learning_rate": 9.761219200847065e-05,
"epoch": 0.5332798716920609,
"step": 665
},
{
"loss": 0.0769,
"grad_norm": 0.5115137696266174,
"learning_rate": 9.73469391613787e-05,
"epoch": 0.5340817963111467,
"step": 666
},
{
"loss": 0.0391,
"grad_norm": 0.3202758729457855,
"learning_rate": 9.708170499284256e-05,
"epoch": 0.5348837209302325,
"step": 667
},
{
"loss": 0.0445,
"grad_norm": 0.422722727060318,
"learning_rate": 9.681649137021158e-05,
"epoch": 0.5356856455493184,
"step": 668
},
{
"loss": 0.0446,
"grad_norm": 0.32844579219818115,
"learning_rate": 9.655130016069028e-05,
"epoch": 0.5364875701684042,
"step": 669
},
{
"loss": 0.045,
"grad_norm": 0.3552158772945404,
"learning_rate": 9.628613323132554e-05,
"epoch": 0.53728949478749,
"step": 670
},
{
"loss": 0.0516,
"grad_norm": 0.37886497378349304,
"learning_rate": 9.602099244899323e-05,
"epoch": 0.5380914194065758,
"step": 671
},
{
"loss": 0.0224,
"grad_norm": 0.25544053316116333,
"learning_rate": 9.57558796803852e-05,
"epoch": 0.5388933440256616,
"step": 672
},
{
"loss": 0.0433,
"grad_norm": 0.2606353163719177,
"learning_rate": 9.549079679199592e-05,
"epoch": 0.5396952686447474,
"step": 673
},
{
"loss": 0.053,
"grad_norm": 0.3851439654827118,
"learning_rate": 9.522574565010965e-05,
"epoch": 0.5404971932638332,
"step": 674
},
{
"loss": 0.0314,
"grad_norm": 0.26221150159835815,
"learning_rate": 9.496072812078712e-05,
"epoch": 0.541299117882919,
"step": 675
},
{
"loss": 0.0569,
"grad_norm": 0.5227025747299194,
"learning_rate": 9.46957460698523e-05,
"epoch": 0.5421010425020049,
"step": 676
},
{
"loss": 0.0181,
"grad_norm": 0.20920135080814362,
"learning_rate": 9.44308013628795e-05,
"epoch": 0.5429029671210907,
"step": 677
},
{
"loss": 0.0326,
"grad_norm": 0.2929348647594452,
"learning_rate": 9.416589586518008e-05,
"epoch": 0.5437048917401764,
"step": 678
},
{
"loss": 0.0346,
"grad_norm": 0.38871344923973083,
"learning_rate": 9.390103144178932e-05,
"epoch": 0.5445068163592622,
"step": 679
},
{
"loss": 0.0603,
"grad_norm": 0.392945259809494,
"learning_rate": 9.363620995745337e-05,
"epoch": 0.545308740978348,
"step": 680
},
{
"loss": 0.0579,
"grad_norm": 0.6106362342834473,
"learning_rate": 9.337143327661604e-05,
"epoch": 0.5461106655974338,
"step": 681
},
{
"loss": 0.1305,
"grad_norm": 0.6625472903251648,
"learning_rate": 9.310670326340576e-05,
"epoch": 0.5469125902165196,
"step": 682
},
{
"loss": 0.0954,
"grad_norm": 0.5873953104019165,
"learning_rate": 9.284202178162226e-05,
"epoch": 0.5477145148356054,
"step": 683
},
{
"loss": 0.0214,
"grad_norm": 0.2383047342300415,
"learning_rate": 9.257739069472374e-05,
"epoch": 0.5485164394546913,
"step": 684
},
{
"loss": 0.0395,
"grad_norm": 0.46583423018455505,
"learning_rate": 9.23128118658136e-05,
"epoch": 0.5493183640737771,
"step": 685
},
{
"loss": 0.0847,
"grad_norm": 0.42172953486442566,
"learning_rate": 9.204828715762718e-05,
"epoch": 0.5501202886928629,
"step": 686
},
{
"loss": 0.0182,
"grad_norm": 0.17326125502586365,
"learning_rate": 9.178381843251891e-05,
"epoch": 0.5509222133119487,
"step": 687
},
{
"loss": 0.0706,
"grad_norm": 0.4465944468975067,
"learning_rate": 9.151940755244912e-05,
"epoch": 0.5517241379310345,
"step": 688
},
{
"loss": 0.0361,
"grad_norm": 0.3605600893497467,
"learning_rate": 9.12550563789707e-05,
"epoch": 0.5525260625501203,
"step": 689
},
{
"loss": 0.0637,
"grad_norm": 0.5488521456718445,
"learning_rate": 9.099076677321638e-05,
"epoch": 0.5533279871692061,
"step": 690
},
{
"loss": 0.0413,
"grad_norm": 0.3144517242908478,
"learning_rate": 9.072654059588533e-05,
"epoch": 0.5541299117882919,
"step": 691
},
{
"loss": 0.0469,
"grad_norm": 0.356842041015625,
"learning_rate": 9.04623797072302e-05,
"epoch": 0.5549318364073778,
"step": 692
},
{
"loss": 0.0976,
"grad_norm": 0.5099210143089294,
"learning_rate": 9.019828596704394e-05,
"epoch": 0.5557337610264635,
"step": 693
},
{
"loss": 0.0491,
"grad_norm": 0.5059170126914978,
"learning_rate": 8.99342612346468e-05,
"epoch": 0.5565356856455493,
"step": 694
},
{
"loss": 0.0378,
"grad_norm": 0.29008913040161133,
"learning_rate": 8.967030736887314e-05,
"epoch": 0.5573376102646351,
"step": 695
},
{
"loss": 0.0998,
"grad_norm": 0.6845918297767639,
"learning_rate": 8.94064262280584e-05,
"epoch": 0.5581395348837209,
"step": 696
},
{
"loss": 0.0582,
"grad_norm": 0.3744989335536957,
"learning_rate": 8.914261967002605e-05,
"epoch": 0.5589414595028067,
"step": 697
},
{
"loss": 0.0581,
"grad_norm": 0.467715859413147,
"learning_rate": 8.887888955207444e-05,
"epoch": 0.5597433841218925,
"step": 698
},
{
"loss": 0.0444,
"grad_norm": 0.3465082347393036,
"learning_rate": 8.861523773096378e-05,
"epoch": 0.5605453087409783,
"step": 699
},
{
"loss": 0.0616,
"grad_norm": 0.4096762537956238,
"learning_rate": 8.835166606290295e-05,
"epoch": 0.5613472333600642,
"step": 700
},
{
"loss": 0.0286,
"grad_norm": 0.3438918888568878,
"learning_rate": 8.808817640353661e-05,
"epoch": 0.56214915797915,
"step": 701
},
{
"loss": 0.0796,
"grad_norm": 0.503362774848938,
"learning_rate": 8.782477060793211e-05,
"epoch": 0.5629510825982358,
"step": 702
},
{
"loss": 0.0403,
"grad_norm": 0.36747029423713684,
"learning_rate": 8.756145053056615e-05,
"epoch": 0.5637530072173216,
"step": 703
},
{
"loss": 0.0339,
"grad_norm": 0.2829087972640991,
"learning_rate": 8.729821802531212e-05,
"epoch": 0.5645549318364074,
"step": 704
},
{
"loss": 0.0333,
"grad_norm": 0.3334031105041504,
"learning_rate": 8.703507494542691e-05,
"epoch": 0.5653568564554932,
"step": 705
},
{
"loss": 0.0437,
"grad_norm": 0.38484475016593933,
"learning_rate": 8.67720231435376e-05,
"epoch": 0.566158781074579,
"step": 706
},
{
"loss": 0.0485,
"grad_norm": 0.3287144601345062,
"learning_rate": 8.650906447162884e-05,
"epoch": 0.5669607056936647,
"step": 707
},
{
"loss": 0.0255,
"grad_norm": 0.22345122694969177,
"learning_rate": 8.624620078102951e-05,
"epoch": 0.5677626303127506,
"step": 708
},
{
"loss": 0.1315,
"grad_norm": 0.7388908267021179,
"learning_rate": 8.598343392239989e-05,
"epoch": 0.5685645549318364,
"step": 709
},
{
"loss": 0.0334,
"grad_norm": 0.2406347393989563,
"learning_rate": 8.572076574571838e-05,
"epoch": 0.5693664795509222,
"step": 710
},
{
"loss": 0.0225,
"grad_norm": 0.3055616021156311,
"learning_rate": 8.545819810026871e-05,
"epoch": 0.570168404170008,
"step": 711
},
{
"loss": 0.0139,
"grad_norm": 0.23999330401420593,
"learning_rate": 8.519573283462687e-05,
"epoch": 0.5709703287890938,
"step": 712
},
{
"loss": 0.0783,
"grad_norm": 0.6496703624725342,
"learning_rate": 8.493337179664793e-05,
"epoch": 0.5717722534081796,
"step": 713
},
{
"loss": 0.0103,
"grad_norm": 0.2168056070804596,
"learning_rate": 8.467111683345326e-05,
"epoch": 0.5725741780272654,
"step": 714
},
{
"loss": 0.1149,
"grad_norm": 0.9031127095222473,
"learning_rate": 8.440896979141744e-05,
"epoch": 0.5733761026463512,
"step": 715
},
{
"loss": 0.0517,
"grad_norm": 0.3651449680328369,
"learning_rate": 8.414693251615512e-05,
"epoch": 0.5741780272654371,
"step": 716
},
{
"loss": 0.0603,
"grad_norm": 0.35386982560157776,
"learning_rate": 8.388500685250827e-05,
"epoch": 0.5749799518845229,
"step": 717
},
{
"loss": 0.0563,
"grad_norm": 0.3657480478286743,
"learning_rate": 8.3623194644533e-05,
"epoch": 0.5757818765036087,
"step": 718
},
{
"loss": 0.0391,
"grad_norm": 0.2397533357143402,
"learning_rate": 8.336149773548678e-05,
"epoch": 0.5765838011226945,
"step": 719
},
{
"loss": 0.0399,
"grad_norm": 0.33155348896980286,
"learning_rate": 8.309991796781511e-05,
"epoch": 0.5773857257417803,
"step": 720
},
{
"loss": 0.0514,
"grad_norm": 0.4915727972984314,
"learning_rate": 8.283845718313894e-05,
"epoch": 0.5781876503608661,
"step": 721
},
{
"loss": 0.026,
"grad_norm": 0.22791197896003723,
"learning_rate": 8.257711722224152e-05,
"epoch": 0.5789895749799518,
"step": 722
},
{
"loss": 0.0435,
"grad_norm": 0.40722930431365967,
"learning_rate": 8.231589992505536e-05,
"epoch": 0.5797914995990376,
"step": 723
},
{
"loss": 0.0217,
"grad_norm": 0.24059796333312988,
"learning_rate": 8.205480713064946e-05,
"epoch": 0.5805934242181235,
"step": 724
},
{
"loss": 0.0339,
"grad_norm": 0.40672048926353455,
"learning_rate": 8.179384067721631e-05,
"epoch": 0.5813953488372093,
"step": 725
},
{
"loss": 0.0744,
"grad_norm": 0.3509446680545807,
"learning_rate": 8.153300240205873e-05,
"epoch": 0.5821972734562951,
"step": 726
},
{
"loss": 0.0644,
"grad_norm": 0.28646859526634216,
"learning_rate": 8.12722941415773e-05,
"epoch": 0.5829991980753809,
"step": 727
},
{
"loss": 0.0163,
"grad_norm": 0.222028449177742,
"learning_rate": 8.101171773125716e-05,
"epoch": 0.5838011226944667,
"step": 728
},
{
"loss": 0.0215,
"grad_norm": 0.23310942947864532,
"learning_rate": 8.075127500565525e-05,
"epoch": 0.5846030473135525,
"step": 729
},
{
"loss": 0.0297,
"grad_norm": 0.2620904743671417,
"learning_rate": 8.049096779838719e-05,
"epoch": 0.5854049719326383,
"step": 730
},
{
"loss": 0.0773,
"grad_norm": 0.6896341443061829,
"learning_rate": 8.023079794211459e-05,
"epoch": 0.5862068965517241,
"step": 731
},
{
"loss": 0.0654,
"grad_norm": 0.3588181138038635,
"learning_rate": 7.99707672685321e-05,
"epoch": 0.58700882117081,
"step": 732
},
{
"loss": 0.0348,
"grad_norm": 0.2889043390750885,
"learning_rate": 7.971087760835432e-05,
"epoch": 0.5878107457898958,
"step": 733
},
{
"loss": 0.0174,
"grad_norm": 0.5972622632980347,
"learning_rate": 7.945113079130323e-05,
"epoch": 0.5886126704089816,
"step": 734
},
{
"loss": 0.0359,
"grad_norm": 0.25957322120666504,
"learning_rate": 7.919152864609499e-05,
"epoch": 0.5894145950280674,
"step": 735
},
{
"loss": 0.0433,
"grad_norm": 0.3756544888019562,
"learning_rate": 7.89320730004274e-05,
"epoch": 0.5902165196471532,
"step": 736
},
{
"loss": 0.0429,
"grad_norm": 0.25527504086494446,
"learning_rate": 7.867276568096662e-05,
"epoch": 0.591018444266239,
"step": 737
},
{
"loss": 0.0727,
"grad_norm": 0.49652037024497986,
"learning_rate": 7.84136085133347e-05,
"epoch": 0.5918203688853247,
"step": 738
},
{
"loss": 0.0555,
"grad_norm": 0.46329352259635925,
"learning_rate": 7.815460332209656e-05,
"epoch": 0.5926222935044105,
"step": 739
},
{
"loss": 0.0231,
"grad_norm": 0.217621847987175,
"learning_rate": 7.789575193074704e-05,
"epoch": 0.5934242181234964,
"step": 740
},
{
"loss": 0.0689,
"grad_norm": 0.5665069818496704,
"learning_rate": 7.763705616169825e-05,
"epoch": 0.5942261427425822,
"step": 741
},
{
"loss": 0.0463,
"grad_norm": 0.42257973551750183,
"learning_rate": 7.737851783626671e-05,
"epoch": 0.595028067361668,
"step": 742
},
{
"loss": 0.0348,
"grad_norm": 0.34942853450775146,
"learning_rate": 7.712013877466032e-05,
"epoch": 0.5958299919807538,
"step": 743
},
{
"loss": 0.0189,
"grad_norm": 0.2506210505962372,
"learning_rate": 7.686192079596586e-05,
"epoch": 0.5966319165998396,
"step": 744
},
{
"eval_loss": 0.042267050594091415,
"eval_runtime": 31.7194,
"eval_samples_per_second": 33.103,
"eval_steps_per_second": 8.291,
"epoch": 0.5966319165998396,
"step": 744
},
{
"loss": 0.0516,
"grad_norm": 0.3015764355659485,
"learning_rate": 7.660386571813593e-05,
"epoch": 0.5974338412189254,
"step": 745
},
{
"loss": 0.04,
"grad_norm": 0.333032488822937,
"learning_rate": 7.634597535797633e-05,
"epoch": 0.5982357658380112,
"step": 746
},
{
"loss": 0.058,
"grad_norm": 0.34520605206489563,
"learning_rate": 7.608825153113305e-05,
"epoch": 0.599037690457097,
"step": 747
},
{
"loss": 0.0177,
"grad_norm": 0.24561840295791626,
"learning_rate": 7.583069605207975e-05,
"epoch": 0.5998396150761829,
"step": 748
},
{
"loss": 0.0406,
"grad_norm": 0.30027586221694946,
"learning_rate": 7.557331073410485e-05,
"epoch": 0.6006415396952687,
"step": 749
},
{
"loss": 0.0641,
"grad_norm": 0.41032275557518005,
"learning_rate": 7.531609738929865e-05,
"epoch": 0.6014434643143545,
"step": 750
},
{
"loss": 0.0209,
"grad_norm": 0.20874442160129547,
"learning_rate": 7.505905782854081e-05,
"epoch": 0.6022453889334403,
"step": 751
},
{
"loss": 0.0501,
"grad_norm": 0.3524108827114105,
"learning_rate": 7.48021938614875e-05,
"epoch": 0.603047313552526,
"step": 752
},
{
"loss": 0.0323,
"grad_norm": 0.3698127269744873,
"learning_rate": 7.454550729655852e-05,
"epoch": 0.6038492381716118,
"step": 753
},
{
"loss": 0.047,
"grad_norm": 0.40356680750846863,
"learning_rate": 7.428899994092483e-05,
"epoch": 0.6046511627906976,
"step": 754
},
{
"loss": 0.024,
"grad_norm": 0.2525324523448944,
"learning_rate": 7.403267360049556e-05,
"epoch": 0.6054530874097834,
"step": 755
},
{
"loss": 0.0143,
"grad_norm": 0.416182279586792,
"learning_rate": 7.37765300799056e-05,
"epoch": 0.6062550120288693,
"step": 756
},
{
"loss": 0.0715,
"grad_norm": 0.4480084478855133,
"learning_rate": 7.352057118250246e-05,
"epoch": 0.6070569366479551,
"step": 757
},
{
"loss": 0.01,
"grad_norm": 0.22036206722259521,
"learning_rate": 7.326479871033409e-05,
"epoch": 0.6078588612670409,
"step": 758
},
{
"loss": 0.0329,
"grad_norm": 0.2710481882095337,
"learning_rate": 7.300921446413583e-05,
"epoch": 0.6086607858861267,
"step": 759
},
{
"loss": 0.0332,
"grad_norm": 0.241096630692482,
"learning_rate": 7.275382024331772e-05,
"epoch": 0.6094627105052125,
"step": 760
},
{
"loss": 0.0367,
"grad_norm": 0.37980324029922485,
"learning_rate": 7.249861784595217e-05,
"epoch": 0.6102646351242983,
"step": 761
},
{
"loss": 0.0499,
"grad_norm": 0.4780760407447815,
"learning_rate": 7.2243609068761e-05,
"epoch": 0.6110665597433841,
"step": 762
},
{
"loss": 0.0261,
"grad_norm": 0.21910789608955383,
"learning_rate": 7.198879570710272e-05,
"epoch": 0.6118684843624699,
"step": 763
},
{
"loss": 0.047,
"grad_norm": 0.30522310733795166,
"learning_rate": 7.173417955496024e-05,
"epoch": 0.6126704089815558,
"step": 764
},
{
"loss": 0.0361,
"grad_norm": 0.4247373044490814,
"learning_rate": 7.147976240492795e-05,
"epoch": 0.6134723336006416,
"step": 765
},
{
"loss": 0.0543,
"grad_norm": 0.30531254410743713,
"learning_rate": 7.122554604819925e-05,
"epoch": 0.6142742582197274,
"step": 766
},
{
"loss": 0.0394,
"grad_norm": 0.3051380515098572,
"learning_rate": 7.097153227455379e-05,
"epoch": 0.6150761828388132,
"step": 767
},
{
"loss": 0.0556,
"grad_norm": 0.3333624601364136,
"learning_rate": 7.071772287234497e-05,
"epoch": 0.615878107457899,
"step": 768
},
{
"loss": 0.032,
"grad_norm": 0.2435581535100937,
"learning_rate": 7.046411962848744e-05,
"epoch": 0.6166800320769847,
"step": 769
},
{
"loss": 0.0175,
"grad_norm": 0.21923010051250458,
"learning_rate": 7.021072432844426e-05,
"epoch": 0.6174819566960705,
"step": 770
},
{
"loss": 0.0308,
"grad_norm": 0.344446063041687,
"learning_rate": 6.995753875621464e-05,
"epoch": 0.6182838813151563,
"step": 771
},
{
"loss": 0.0579,
"grad_norm": 0.6894804835319519,
"learning_rate": 6.970456469432117e-05,
"epoch": 0.6190858059342422,
"step": 772
},
{
"loss": 0.0412,
"grad_norm": 0.9697020053863525,
"learning_rate": 6.945180392379729e-05,
"epoch": 0.619887730553328,
"step": 773
},
{
"loss": 0.0313,
"grad_norm": 0.30235642194747925,
"learning_rate": 6.919925822417476e-05,
"epoch": 0.6206896551724138,
"step": 774
},
{
"loss": 0.058,
"grad_norm": 0.42743489146232605,
"learning_rate": 6.894692937347127e-05,
"epoch": 0.6214915797914996,
"step": 775
},
{
"loss": 0.0405,
"grad_norm": 0.38457682728767395,
"learning_rate": 6.869481914817779e-05,
"epoch": 0.6222935044105854,
"step": 776
},
{
"loss": 0.0319,
"grad_norm": 0.31749409437179565,
"learning_rate": 6.844292932324597e-05,
"epoch": 0.6230954290296712,
"step": 777
},
{
"loss": 0.0363,
"grad_norm": 0.4263424575328827,
"learning_rate": 6.819126167207585e-05,
"epoch": 0.623897353648757,
"step": 778
},
{
"loss": 0.0393,
"grad_norm": 0.25529760122299194,
"learning_rate": 6.793981796650333e-05,
"epoch": 0.6246992782678428,
"step": 779
},
{
"loss": 0.0294,
"grad_norm": 0.203300341963768,
"learning_rate": 6.768859997678751e-05,
"epoch": 0.6255012028869287,
"step": 780
},
{
"loss": 0.0714,
"grad_norm": 0.43434929847717285,
"learning_rate": 6.743760947159846e-05,
"epoch": 0.6263031275060145,
"step": 781
},
{
"loss": 0.0236,
"grad_norm": 0.3486297130584717,
"learning_rate": 6.718684821800467e-05,
"epoch": 0.6271050521251003,
"step": 782
},
{
"loss": 0.0401,
"grad_norm": 0.36812183260917664,
"learning_rate": 6.69363179814606e-05,
"epoch": 0.627906976744186,
"step": 783
},
{
"loss": 0.0466,
"grad_norm": 0.40551620721817017,
"learning_rate": 6.668602052579424e-05,
"epoch": 0.6287089013632718,
"step": 784
},
{
"loss": 0.0548,
"grad_norm": 0.39897987246513367,
"learning_rate": 6.643595761319474e-05,
"epoch": 0.6295108259823576,
"step": 785
},
{
"loss": 0.0287,
"grad_norm": 0.23864711821079254,
"learning_rate": 6.61861310042e-05,
"epoch": 0.6303127506014434,
"step": 786
},
{
"loss": 0.0342,
"grad_norm": 0.32459014654159546,
"learning_rate": 6.593654245768415e-05,
"epoch": 0.6311146752205292,
"step": 787
},
{
"loss": 0.103,
"grad_norm": 0.8521727323532104,
"learning_rate": 6.568719373084538e-05,
"epoch": 0.6319165998396151,
"step": 788
},
{
"loss": 0.0183,
"grad_norm": 0.20950952172279358,
"learning_rate": 6.543808657919345e-05,
"epoch": 0.6327185244587009,
"step": 789
},
{
"loss": 0.0399,
"grad_norm": 0.41553550958633423,
"learning_rate": 6.518922275653724e-05,
"epoch": 0.6335204490777867,
"step": 790
},
{
"loss": 0.0347,
"grad_norm": 0.2640535831451416,
"learning_rate": 6.494060401497261e-05,
"epoch": 0.6343223736968725,
"step": 791
},
{
"loss": 0.0288,
"grad_norm": 0.2901599407196045,
"learning_rate": 6.469223210486992e-05,
"epoch": 0.6351242983159583,
"step": 792
},
{
"loss": 0.036,
"grad_norm": 0.30714696645736694,
"learning_rate": 6.444410877486178e-05,
"epoch": 0.6359262229350441,
"step": 793
},
{
"loss": 0.0168,
"grad_norm": 0.16659529507160187,
"learning_rate": 6.419623577183056e-05,
"epoch": 0.6367281475541299,
"step": 794
},
{
"loss": 0.0391,
"grad_norm": 0.2610877454280853,
"learning_rate": 6.394861484089641e-05,
"epoch": 0.6375300721732157,
"step": 795
},
{
"loss": 0.0113,
"grad_norm": 0.14762139320373535,
"learning_rate": 6.370124772540469e-05,
"epoch": 0.6383319967923016,
"step": 796
},
{
"loss": 0.1048,
"grad_norm": 0.5695735216140747,
"learning_rate": 6.345413616691385e-05,
"epoch": 0.6391339214113874,
"step": 797
},
{
"loss": 0.0379,
"grad_norm": 0.2888137996196747,
"learning_rate": 6.320728190518308e-05,
"epoch": 0.6399358460304732,
"step": 798
},
{
"loss": 0.0287,
"grad_norm": 0.3997354805469513,
"learning_rate": 6.29606866781602e-05,
"epoch": 0.640737770649559,
"step": 799
},
{
"loss": 0.0347,
"grad_norm": 0.23028384149074554,
"learning_rate": 6.271435222196916e-05,
"epoch": 0.6415396952686447,
"step": 800
},
{
"loss": 0.0375,
"grad_norm": 0.332156240940094,
"learning_rate": 6.246828027089811e-05,
"epoch": 0.6423416198877305,
"step": 801
},
{
"loss": 0.0492,
"grad_norm": 0.41977575421333313,
"learning_rate": 6.222247255738706e-05,
"epoch": 0.6431435445068163,
"step": 802
},
{
"loss": 0.0199,
"grad_norm": 0.24224106967449188,
"learning_rate": 6.197693081201567e-05,
"epoch": 0.6439454691259021,
"step": 803
},
{
"loss": 0.0513,
"grad_norm": 0.46784040331840515,
"learning_rate": 6.173165676349103e-05,
"epoch": 0.644747393744988,
"step": 804
},
{
"loss": 0.0472,
"grad_norm": 0.38110026717185974,
"learning_rate": 6.14866521386356e-05,
"epoch": 0.6455493183640738,
"step": 805
},
{
"loss": 0.0649,
"grad_norm": 0.3705803453922272,
"learning_rate": 6.124191866237504e-05,
"epoch": 0.6463512429831596,
"step": 806
},
{
"loss": 0.0437,
"grad_norm": 0.28756698966026306,
"learning_rate": 6.0997458057725877e-05,
"epoch": 0.6471531676022454,
"step": 807
},
{
"loss": 0.0297,
"grad_norm": 0.3769364356994629,
"learning_rate": 6.0753272045783625e-05,
"epoch": 0.6479550922213312,
"step": 808
},
{
"loss": 0.0351,
"grad_norm": 0.2772417962551117,
"learning_rate": 6.0509362345710585e-05,
"epoch": 0.648757016840417,
"step": 809
},
{
"loss": 0.0276,
"grad_norm": 0.20303967595100403,
"learning_rate": 6.026573067472366e-05,
"epoch": 0.6495589414595028,
"step": 810
},
{
"loss": 0.0409,
"grad_norm": 0.26352015137672424,
"learning_rate": 6.00223787480823e-05,
"epoch": 0.6503608660785886,
"step": 811
},
{
"loss": 0.0392,
"grad_norm": 0.4463076591491699,
"learning_rate": 5.977930827907649e-05,
"epoch": 0.6511627906976745,
"step": 812
},
{
"loss": 0.0556,
"grad_norm": 0.3590082824230194,
"learning_rate": 5.9536520979014676e-05,
"epoch": 0.6519647153167603,
"step": 813
},
{
"loss": 0.0316,
"grad_norm": 0.38499733805656433,
"learning_rate": 5.929401855721162e-05,
"epoch": 0.652766639935846,
"step": 814
},
{
"loss": 0.0456,
"grad_norm": 1.1676615476608276,
"learning_rate": 5.905180272097648e-05,
"epoch": 0.6535685645549318,
"step": 815
},
{
"loss": 0.0783,
"grad_norm": 0.5431867837905884,
"learning_rate": 5.880987517560075e-05,
"epoch": 0.6543704891740176,
"step": 816
},
{
"loss": 0.0501,
"grad_norm": 0.35964494943618774,
"learning_rate": 5.856823762434618e-05,
"epoch": 0.6551724137931034,
"step": 817
},
{
"loss": 0.0408,
"grad_norm": 0.323234498500824,
"learning_rate": 5.832689176843291e-05,
"epoch": 0.6559743384121892,
"step": 818
},
{
"loss": 0.0442,
"grad_norm": 1.2825475931167603,
"learning_rate": 5.808583930702739e-05,
"epoch": 0.656776263031275,
"step": 819
},
{
"loss": 0.0291,
"grad_norm": 0.19290927052497864,
"learning_rate": 5.784508193723057e-05,
"epoch": 0.6575781876503609,
"step": 820
},
{
"loss": 0.0176,
"grad_norm": 0.18903003633022308,
"learning_rate": 5.76046213540657e-05,
"epoch": 0.6583801122694467,
"step": 821
},
{
"loss": 0.0336,
"grad_norm": 0.39771515130996704,
"learning_rate": 5.7364459250466596e-05,
"epoch": 0.6591820368885325,
"step": 822
},
{
"loss": 0.0294,
"grad_norm": 0.3263964354991913,
"learning_rate": 5.712459731726577e-05,
"epoch": 0.6599839615076183,
"step": 823
},
{
"loss": 0.0371,
"grad_norm": 0.2918822765350342,
"learning_rate": 5.688503724318217e-05,
"epoch": 0.6607858861267041,
"step": 824
},
{
"loss": 0.0395,
"grad_norm": 0.24919533729553223,
"learning_rate": 5.6645780714809814e-05,
"epoch": 0.6615878107457899,
"step": 825
},
{
"loss": 0.0374,
"grad_norm": 0.23720526695251465,
"learning_rate": 5.640682941660547e-05,
"epoch": 0.6623897353648757,
"step": 826
},
{
"loss": 0.0199,
"grad_norm": 0.1959155648946762,
"learning_rate": 5.616818503087704e-05,
"epoch": 0.6631916599839615,
"step": 827
},
{
"loss": 0.0641,
"grad_norm": 2.0196421146392822,
"learning_rate": 5.5929849237771556e-05,
"epoch": 0.6639935846030474,
"step": 828
},
{
"loss": 0.0397,
"grad_norm": 0.24733805656433105,
"learning_rate": 5.569182371526365e-05,
"epoch": 0.6647955092221332,
"step": 829
},
{
"loss": 0.0359,
"grad_norm": 0.24535015225410461,
"learning_rate": 5.545411013914329e-05,
"epoch": 0.6655974338412189,
"step": 830
},
{
"loss": 0.0384,
"grad_norm": 0.4017760753631592,
"learning_rate": 5.521671018300436e-05,
"epoch": 0.6663993584603047,
"step": 831
},
{
"loss": 0.0287,
"grad_norm": 0.2186603546142578,
"learning_rate": 5.497962551823266e-05,
"epoch": 0.6672012830793905,
"step": 832
},
{
"loss": 0.0885,
"grad_norm": 0.4830259680747986,
"learning_rate": 5.4742857813994356e-05,
"epoch": 0.6680032076984763,
"step": 833
},
{
"loss": 0.0302,
"grad_norm": 0.3544902205467224,
"learning_rate": 5.450640873722395e-05,
"epoch": 0.6688051323175621,
"step": 834
},
{
"loss": 0.0764,
"grad_norm": 0.3852503299713135,
"learning_rate": 5.427027995261269e-05,
"epoch": 0.6696070569366479,
"step": 835
},
{
"loss": 0.0466,
"grad_norm": 0.32173559069633484,
"learning_rate": 5.403447312259702e-05,
"epoch": 0.6704089815557338,
"step": 836
},
{
"loss": 0.0149,
"grad_norm": 0.19790256023406982,
"learning_rate": 5.379898990734641e-05,
"epoch": 0.6712109061748196,
"step": 837
},
{
"loss": 0.0309,
"grad_norm": 0.2427252233028412,
"learning_rate": 5.356383196475225e-05,
"epoch": 0.6720128307939054,
"step": 838
},
{
"loss": 0.0637,
"grad_norm": 0.45702651143074036,
"learning_rate": 5.332900095041569e-05,
"epoch": 0.6728147554129912,
"step": 839
},
{
"loss": 0.0258,
"grad_norm": 0.21773581206798553,
"learning_rate": 5.309449851763633e-05,
"epoch": 0.673616680032077,
"step": 840
},
{
"loss": 0.0383,
"grad_norm": 0.34996020793914795,
"learning_rate": 5.286032631740023e-05,
"epoch": 0.6744186046511628,
"step": 841
},
{
"loss": 0.053,
"grad_norm": 0.3601475656032562,
"learning_rate": 5.2626485998368726e-05,
"epoch": 0.6752205292702486,
"step": 842
},
{
"loss": 0.0334,
"grad_norm": 0.2879583537578583,
"learning_rate": 5.239297920686641e-05,
"epoch": 0.6760224538893344,
"step": 843
},
{
"loss": 0.025,
"grad_norm": 0.3214558959007263,
"learning_rate": 5.215980758686978e-05,
"epoch": 0.6768243785084203,
"step": 844
},
{
"loss": 0.038,
"grad_norm": 0.33000731468200684,
"learning_rate": 5.1926972779995564e-05,
"epoch": 0.677626303127506,
"step": 845
},
{
"loss": 0.0325,
"grad_norm": 0.45079219341278076,
"learning_rate": 5.169447642548928e-05,
"epoch": 0.6784282277465918,
"step": 846
},
{
"loss": 0.0186,
"grad_norm": 0.27335742115974426,
"learning_rate": 5.146232016021353e-05,
"epoch": 0.6792301523656776,
"step": 847
},
{
"loss": 0.0674,
"grad_norm": 0.480881929397583,
"learning_rate": 5.123050561863657e-05,
"epoch": 0.6800320769847634,
"step": 848
},
{
"loss": 0.0358,
"grad_norm": 0.26150617003440857,
"learning_rate": 5.099903443282079e-05,
"epoch": 0.6808340016038492,
"step": 849
},
{
"loss": 0.0358,
"grad_norm": 0.4229785203933716,
"learning_rate": 5.0767908232411306e-05,
"epoch": 0.681635926222935,
"step": 850
},
{
"loss": 0.0964,
"grad_norm": 0.6302306652069092,
"learning_rate": 5.053712864462432e-05,
"epoch": 0.6824378508420208,
"step": 851
},
{
"loss": 0.0529,
"grad_norm": 0.40250223875045776,
"learning_rate": 5.0306697294235714e-05,
"epoch": 0.6832397754611067,
"step": 852
},
{
"loss": 0.0349,
"grad_norm": 0.40601104497909546,
"learning_rate": 5.007661580356982e-05,
"epoch": 0.6840417000801925,
"step": 853
},
{
"loss": 0.0264,
"grad_norm": 0.20523907244205475,
"learning_rate": 4.984688579248756e-05,
"epoch": 0.6848436246992783,
"step": 854
},
{
"loss": 0.0233,
"grad_norm": 0.2532117962837219,
"learning_rate": 4.961750887837557e-05,
"epoch": 0.6856455493183641,
"step": 855
},
{
"loss": 0.0222,
"grad_norm": 0.23107284307479858,
"learning_rate": 4.938848667613436e-05,
"epoch": 0.6864474739374499,
"step": 856
},
{
"loss": 0.0243,
"grad_norm": 0.2529151141643524,
"learning_rate": 4.915982079816732e-05,
"epoch": 0.6872493985565357,
"step": 857
},
{
"loss": 0.0391,
"grad_norm": 0.2575894892215729,
"learning_rate": 4.8931512854368913e-05,
"epoch": 0.6880513231756215,
"step": 858
},
{
"loss": 0.0773,
"grad_norm": 0.6415811777114868,
"learning_rate": 4.870356445211388e-05,
"epoch": 0.6888532477947072,
"step": 859
},
{
"loss": 0.0391,
"grad_norm": 0.4123080372810364,
"learning_rate": 4.8475977196245504e-05,
"epoch": 0.6896551724137931,
"step": 860
},
{
"loss": 0.0457,
"grad_norm": 0.31931477785110474,
"learning_rate": 4.8248752689064494e-05,
"epoch": 0.6904570970328789,
"step": 861
},
{
"loss": 0.0268,
"grad_norm": 0.2256850302219391,
"learning_rate": 4.802189253031764e-05,
"epoch": 0.6912590216519647,
"step": 862
},
{
"loss": 0.029,
"grad_norm": 0.4967258870601654,
"learning_rate": 4.779539831718668e-05,
"epoch": 0.6920609462710505,
"step": 863
},
{
"loss": 0.0175,
"grad_norm": 0.18003036081790924,
"learning_rate": 4.756927164427685e-05,
"epoch": 0.6928628708901363,
"step": 864
},
{
"loss": 0.0215,
"grad_norm": 0.2862027585506439,
"learning_rate": 4.7343514103605767e-05,
"epoch": 0.6936647955092221,
"step": 865
},
{
"loss": 0.0455,
"grad_norm": 0.32969915866851807,
"learning_rate": 4.711812728459233e-05,
"epoch": 0.6944667201283079,
"step": 866
},
{
"loss": 0.0135,
"grad_norm": 0.16625012457370758,
"learning_rate": 4.689311277404529e-05,
"epoch": 0.6952686447473937,
"step": 867
},
{
"loss": 0.0223,
"grad_norm": 0.4092782139778137,
"learning_rate": 4.666847215615226e-05,
"epoch": 0.6960705693664796,
"step": 868
},
{
"eval_loss": 0.04034050926566124,
"eval_runtime": 31.8203,
"eval_samples_per_second": 32.998,
"eval_steps_per_second": 8.265,
"epoch": 0.6960705693664796,
"step": 868
},
{
"loss": 0.0779,
"grad_norm": 0.5360198616981506,
"learning_rate": 4.6444207012468465e-05,
"epoch": 0.6968724939855654,
"step": 869
},
{
"loss": 0.0513,
"grad_norm": 0.49572035670280457,
"learning_rate": 4.622031892190579e-05,
"epoch": 0.6976744186046512,
"step": 870
},
{
"loss": 0.0337,
"grad_norm": 0.27240556478500366,
"learning_rate": 4.599680946072127e-05,
"epoch": 0.698476343223737,
"step": 871
},
{
"loss": 0.0367,
"grad_norm": 0.3386692702770233,
"learning_rate": 4.57736802025065e-05,
"epoch": 0.6992782678428228,
"step": 872
},
{
"loss": 0.0502,
"grad_norm": 0.47822192311286926,
"learning_rate": 4.555093271817616e-05,
"epoch": 0.7000801924619086,
"step": 873
},
{
"loss": 0.0752,
"grad_norm": 0.6390823125839233,
"learning_rate": 4.532856857595714e-05,
"epoch": 0.7008821170809943,
"step": 874
},
{
"loss": 0.0452,
"grad_norm": 0.37735825777053833,
"learning_rate": 4.5106589341377394e-05,
"epoch": 0.7016840417000801,
"step": 875
},
{
"loss": 0.021,
"grad_norm": 0.18500256538391113,
"learning_rate": 4.488499657725511e-05,
"epoch": 0.702485966319166,
"step": 876
},
{
"loss": 0.0621,
"grad_norm": 0.43604958057403564,
"learning_rate": 4.466379184368747e-05,
"epoch": 0.7032878909382518,
"step": 877
},
{
"loss": 0.0204,
"grad_norm": 0.5190374851226807,
"learning_rate": 4.444297669803981e-05,
"epoch": 0.7040898155573376,
"step": 878
},
{
"loss": 0.0131,
"grad_norm": 0.3594497740268707,
"learning_rate": 4.422255269493455e-05,
"epoch": 0.7048917401764234,
"step": 879
},
{
"loss": 0.0234,
"grad_norm": 0.26429349184036255,
"learning_rate": 4.4002521386240466e-05,
"epoch": 0.7056936647955092,
"step": 880
},
{
"loss": 0.0205,
"grad_norm": 0.3388163447380066,
"learning_rate": 4.37828843210615e-05,
"epoch": 0.706495589414595,
"step": 881
},
{
"loss": 0.0264,
"grad_norm": 0.21950915455818176,
"learning_rate": 4.3563643045725964e-05,
"epoch": 0.7072975140336808,
"step": 882
},
{
"loss": 0.0542,
"grad_norm": 0.40110042691230774,
"learning_rate": 4.334479910377577e-05,
"epoch": 0.7080994386527666,
"step": 883
},
{
"loss": 0.078,
"grad_norm": 0.4152352511882782,
"learning_rate": 4.312635403595532e-05,
"epoch": 0.7089013632718525,
"step": 884
},
{
"loss": 0.0244,
"grad_norm": 0.20890304446220398,
"learning_rate": 4.290830938020087e-05,
"epoch": 0.7097032878909383,
"step": 885
},
{
"loss": 0.0318,
"grad_norm": 0.32398372888565063,
"learning_rate": 4.269066667162956e-05,
"epoch": 0.7105052125100241,
"step": 886
},
{
"loss": 0.0615,
"grad_norm": 0.3690579831600189,
"learning_rate": 4.247342744252883e-05,
"epoch": 0.7113071371291099,
"step": 887
},
{
"loss": 0.0299,
"grad_norm": 0.4021519422531128,
"learning_rate": 4.2256593222345185e-05,
"epoch": 0.7121090617481957,
"step": 888
},
{
"loss": 0.0228,
"grad_norm": 0.24381564557552338,
"learning_rate": 4.2040165537674006e-05,
"epoch": 0.7129109863672815,
"step": 889
},
{
"loss": 0.0786,
"grad_norm": 0.5315597057342529,
"learning_rate": 4.182414591224833e-05,
"epoch": 0.7137129109863672,
"step": 890
},
{
"loss": 0.0306,
"grad_norm": 0.29537150263786316,
"learning_rate": 4.160853586692839e-05,
"epoch": 0.714514835605453,
"step": 891
},
{
"loss": 0.029,
"grad_norm": 0.3600987195968628,
"learning_rate": 4.139333691969071e-05,
"epoch": 0.7153167602245389,
"step": 892
},
{
"loss": 0.0235,
"grad_norm": 0.23906612396240234,
"learning_rate": 4.117855058561769e-05,
"epoch": 0.7161186848436247,
"step": 893
},
{
"loss": 0.1348,
"grad_norm": 0.7623001337051392,
"learning_rate": 4.096417837688666e-05,
"epoch": 0.7169206094627105,
"step": 894
},
{
"loss": 0.0216,
"grad_norm": 0.33109530806541443,
"learning_rate": 4.075022180275935e-05,
"epoch": 0.7177225340817963,
"step": 895
},
{
"loss": 0.0213,
"grad_norm": 0.172570139169693,
"learning_rate": 4.053668236957134e-05,
"epoch": 0.7185244587008821,
"step": 896
},
{
"loss": 0.0213,
"grad_norm": 0.27047714591026306,
"learning_rate": 4.032356158072131e-05,
"epoch": 0.7193263833199679,
"step": 897
},
{
"loss": 0.0791,
"grad_norm": 0.4213772118091583,
"learning_rate": 4.0110860936660566e-05,
"epoch": 0.7201283079390537,
"step": 898
},
{
"loss": 0.0458,
"grad_norm": 0.38493579626083374,
"learning_rate": 3.989858193488236e-05,
"epoch": 0.7209302325581395,
"step": 899
},
{
"loss": 0.022,
"grad_norm": 0.23332200944423676,
"learning_rate": 3.96867260699116e-05,
"epoch": 0.7217321571772254,
"step": 900
},
{
"loss": 0.0438,
"grad_norm": 0.3719151020050049,
"learning_rate": 3.947529483329387e-05,
"epoch": 0.7225340817963112,
"step": 901
},
{
"loss": 0.0294,
"grad_norm": 0.18766042590141296,
"learning_rate": 3.92642897135855e-05,
"epoch": 0.723336006415397,
"step": 902
},
{
"loss": 0.0163,
"grad_norm": 0.17008039355278015,
"learning_rate": 3.905371219634257e-05,
"epoch": 0.7241379310344828,
"step": 903
},
{
"loss": 0.0293,
"grad_norm": 0.2763400673866272,
"learning_rate": 3.884356376411089e-05,
"epoch": 0.7249398556535686,
"step": 904
},
{
"loss": 0.0286,
"grad_norm": 0.3425106704235077,
"learning_rate": 3.863384589641509e-05,
"epoch": 0.7257417802726543,
"step": 905
},
{
"loss": 0.0221,
"grad_norm": 0.3655487596988678,
"learning_rate": 3.8424560069748706e-05,
"epoch": 0.7265437048917401,
"step": 906
},
{
"loss": 0.0191,
"grad_norm": 0.22211404144763947,
"learning_rate": 3.821570775756339e-05,
"epoch": 0.7273456295108259,
"step": 907
},
{
"loss": 0.0415,
"grad_norm": 0.3968844711780548,
"learning_rate": 3.800729043025871e-05,
"epoch": 0.7281475541299118,
"step": 908
},
{
"loss": 0.0477,
"grad_norm": 0.30855193734169006,
"learning_rate": 3.779930955517187e-05,
"epoch": 0.7289494787489976,
"step": 909
},
{
"loss": 0.0186,
"grad_norm": 0.20964409410953522,
"learning_rate": 3.759176659656717e-05,
"epoch": 0.7297514033680834,
"step": 910
},
{
"loss": 0.0169,
"grad_norm": 0.20416317880153656,
"learning_rate": 3.7384663015625854e-05,
"epoch": 0.7305533279871692,
"step": 911
},
{
"loss": 0.1093,
"grad_norm": 0.6007756590843201,
"learning_rate": 3.717800027043576e-05,
"epoch": 0.731355252606255,
"step": 912
},
{
"loss": 0.0393,
"grad_norm": 0.4740281403064728,
"learning_rate": 3.697177981598115e-05,
"epoch": 0.7321571772253408,
"step": 913
},
{
"loss": 0.0507,
"grad_norm": 0.39300405979156494,
"learning_rate": 3.676600310413233e-05,
"epoch": 0.7329591018444266,
"step": 914
},
{
"loss": 0.0255,
"grad_norm": 0.29988205432891846,
"learning_rate": 3.6560671583635467e-05,
"epoch": 0.7337610264635124,
"step": 915
},
{
"loss": 0.0219,
"grad_norm": 0.22536736726760864,
"learning_rate": 3.635578670010242e-05,
"epoch": 0.7345629510825983,
"step": 916
},
{
"loss": 0.0398,
"grad_norm": 0.29492881894111633,
"learning_rate": 3.615134989600069e-05,
"epoch": 0.7353648757016841,
"step": 917
},
{
"loss": 0.0413,
"grad_norm": 0.3680134415626526,
"learning_rate": 3.5947362610642854e-05,
"epoch": 0.7361668003207699,
"step": 918
},
{
"loss": 0.0451,
"grad_norm": 0.2880399525165558,
"learning_rate": 3.5743826280177e-05,
"epoch": 0.7369687249398557,
"step": 919
},
{
"loss": 0.0463,
"grad_norm": 0.38011434674263,
"learning_rate": 3.554074233757608e-05,
"epoch": 0.7377706495589414,
"step": 920
},
{
"loss": 0.0236,
"grad_norm": 0.21362242102622986,
"learning_rate": 3.533811221262833e-05,
"epoch": 0.7385725741780272,
"step": 921
},
{
"loss": 0.0611,
"grad_norm": 0.49550414085388184,
"learning_rate": 3.5135937331926596e-05,
"epoch": 0.739374498797113,
"step": 922
},
{
"loss": 0.0312,
"grad_norm": 0.2971956133842468,
"learning_rate": 3.4934219118858936e-05,
"epoch": 0.7401764234161988,
"step": 923
},
{
"loss": 0.0168,
"grad_norm": 0.21751493215560913,
"learning_rate": 3.4732958993598154e-05,
"epoch": 0.7409783480352847,
"step": 924
},
{
"loss": 0.0316,
"grad_norm": 0.25100478529930115,
"learning_rate": 3.453215837309192e-05,
"epoch": 0.7417802726543705,
"step": 925
},
{
"loss": 0.0355,
"grad_norm": 0.23625293374061584,
"learning_rate": 3.4331818671052906e-05,
"epoch": 0.7425821972734563,
"step": 926
},
{
"loss": 0.0158,
"grad_norm": 0.4031226336956024,
"learning_rate": 3.413194129794869e-05,
"epoch": 0.7433841218925421,
"step": 927
},
{
"loss": 0.0269,
"grad_norm": 0.27065587043762207,
"learning_rate": 3.393252766099187e-05,
"epoch": 0.7441860465116279,
"step": 928
},
{
"loss": 0.0348,
"grad_norm": 0.2262876033782959,
"learning_rate": 3.373357916413016e-05,
"epoch": 0.7449879711307137,
"step": 929
},
{
"loss": 0.0519,
"grad_norm": 0.4427652359008789,
"learning_rate": 3.353509720803658e-05,
"epoch": 0.7457898957497995,
"step": 930
},
{
"loss": 0.0686,
"grad_norm": 0.46217551827430725,
"learning_rate": 3.333708319009945e-05,
"epoch": 0.7465918203688853,
"step": 931
},
{
"loss": 0.0278,
"grad_norm": 0.3490634262561798,
"learning_rate": 3.313953850441266e-05,
"epoch": 0.7473937449879712,
"step": 932
},
{
"loss": 0.0184,
"grad_norm": 0.23873376846313477,
"learning_rate": 3.294246454176577e-05,
"epoch": 0.748195669607057,
"step": 933
},
{
"loss": 0.0297,
"grad_norm": 0.35058480501174927,
"learning_rate": 3.274586268963443e-05,
"epoch": 0.7489975942261428,
"step": 934
},
{
"loss": 0.0616,
"grad_norm": 0.7911800742149353,
"learning_rate": 3.254973433217021e-05,
"epoch": 0.7497995188452286,
"step": 935
},
{
"loss": 0.0355,
"grad_norm": 0.2944418489933014,
"learning_rate": 3.2354080850191324e-05,
"epoch": 0.7506014434643143,
"step": 936
},
{
"loss": 0.0705,
"grad_norm": 0.495128333568573,
"learning_rate": 3.215890362117256e-05,
"epoch": 0.7514033680834001,
"step": 937
},
{
"loss": 0.0297,
"grad_norm": 0.3445483446121216,
"learning_rate": 3.196420401923566e-05,
"epoch": 0.7522052927024859,
"step": 938
},
{
"loss": 0.025,
"grad_norm": 0.28738442063331604,
"learning_rate": 3.176998341513989e-05,
"epoch": 0.7530072173215717,
"step": 939
},
{
"loss": 0.0215,
"grad_norm": 0.24774937331676483,
"learning_rate": 3.157624317627195e-05,
"epoch": 0.7538091419406576,
"step": 940
},
{
"loss": 0.0215,
"grad_norm": 0.2146318256855011,
"learning_rate": 3.138298466663681e-05,
"epoch": 0.7546110665597434,
"step": 941
},
{
"loss": 0.0433,
"grad_norm": 0.27725639939308167,
"learning_rate": 3.119020924684762e-05,
"epoch": 0.7554129911788292,
"step": 942
},
{
"loss": 0.0978,
"grad_norm": 0.4043017029762268,
"learning_rate": 3.099791827411668e-05,
"epoch": 0.756214915797915,
"step": 943
},
{
"loss": 0.0499,
"grad_norm": 0.33598214387893677,
"learning_rate": 3.080611310224539e-05,
"epoch": 0.7570168404170008,
"step": 944
},
{
"loss": 0.0422,
"grad_norm": 0.5092307925224304,
"learning_rate": 3.061479508161502e-05,
"epoch": 0.7578187650360866,
"step": 945
},
{
"loss": 0.0192,
"grad_norm": 0.37134242057800293,
"learning_rate": 3.042396555917707e-05,
"epoch": 0.7586206896551724,
"step": 946
},
{
"loss": 0.0539,
"grad_norm": 0.36531612277030945,
"learning_rate": 3.0233625878443927e-05,
"epoch": 0.7594226142742582,
"step": 947
},
{
"loss": 0.0405,
"grad_norm": 0.3020681142807007,
"learning_rate": 3.0043777379479098e-05,
"epoch": 0.7602245388933441,
"step": 948
},
{
"loss": 0.093,
"grad_norm": 0.45718201994895935,
"learning_rate": 2.985442139888821e-05,
"epoch": 0.7610264635124299,
"step": 949
},
{
"loss": 0.0479,
"grad_norm": 0.26453983783721924,
"learning_rate": 2.9665559269809217e-05,
"epoch": 0.7618283881315157,
"step": 950
},
{
"loss": 0.06,
"grad_norm": 0.3121758699417114,
"learning_rate": 2.9477192321903292e-05,
"epoch": 0.7626303127506014,
"step": 951
},
{
"loss": 0.0204,
"grad_norm": 0.20015697181224823,
"learning_rate": 2.9289321881345254e-05,
"epoch": 0.7634322373696872,
"step": 952
},
{
"loss": 0.0371,
"grad_norm": 0.36291614174842834,
"learning_rate": 2.9101949270814344e-05,
"epoch": 0.764234161988773,
"step": 953
},
{
"loss": 0.0229,
"grad_norm": 0.2591784596443176,
"learning_rate": 2.8915075809484904e-05,
"epoch": 0.7650360866078588,
"step": 954
},
{
"loss": 0.0261,
"grad_norm": 0.2614014744758606,
"learning_rate": 2.872870281301704e-05,
"epoch": 0.7658380112269446,
"step": 955
},
{
"loss": 0.0228,
"grad_norm": 0.3347266912460327,
"learning_rate": 2.854283159354748e-05,
"epoch": 0.7666399358460305,
"step": 956
},
{
"loss": 0.0368,
"grad_norm": 0.3613988757133484,
"learning_rate": 2.835746345968012e-05,
"epoch": 0.7674418604651163,
"step": 957
},
{
"loss": 0.0175,
"grad_norm": 0.2561342716217041,
"learning_rate": 2.8172599716477143e-05,
"epoch": 0.7682437850842021,
"step": 958
},
{
"loss": 0.0396,
"grad_norm": 0.2864450216293335,
"learning_rate": 2.7988241665449354e-05,
"epoch": 0.7690457097032879,
"step": 959
},
{
"loss": 0.0294,
"grad_norm": 0.2632593512535095,
"learning_rate": 2.7804390604547557e-05,
"epoch": 0.7698476343223737,
"step": 960
},
{
"loss": 0.0252,
"grad_norm": 0.2442079782485962,
"learning_rate": 2.7621047828153e-05,
"epoch": 0.7706495589414595,
"step": 961
},
{
"loss": 0.027,
"grad_norm": 0.30140507221221924,
"learning_rate": 2.7438214627068448e-05,
"epoch": 0.7714514835605453,
"step": 962
},
{
"loss": 0.0602,
"grad_norm": 0.4731035828590393,
"learning_rate": 2.7255892288509043e-05,
"epoch": 0.7722534081796311,
"step": 963
},
{
"loss": 0.0201,
"grad_norm": 0.25631648302078247,
"learning_rate": 2.707408209609339e-05,
"epoch": 0.773055332798717,
"step": 964
},
{
"loss": 0.0316,
"grad_norm": 0.2207869291305542,
"learning_rate": 2.689278532983416e-05,
"epoch": 0.7738572574178028,
"step": 965
},
{
"loss": 0.0997,
"grad_norm": 0.574215292930603,
"learning_rate": 2.6712003266129525e-05,
"epoch": 0.7746591820368885,
"step": 966
},
{
"loss": 0.107,
"grad_norm": 0.5775609612464905,
"learning_rate": 2.65317371777538e-05,
"epoch": 0.7754611066559743,
"step": 967
},
{
"loss": 0.028,
"grad_norm": 0.37875795364379883,
"learning_rate": 2.6351988333848788e-05,
"epoch": 0.7762630312750601,
"step": 968
},
{
"loss": 0.026,
"grad_norm": 0.2766773998737335,
"learning_rate": 2.6172757999914554e-05,
"epoch": 0.7770649558941459,
"step": 969
},
{
"loss": 0.066,
"grad_norm": 0.3758871555328369,
"learning_rate": 2.5994047437800706e-05,
"epoch": 0.7778668805132317,
"step": 970
},
{
"loss": 0.019,
"grad_norm": 0.2001071274280548,
"learning_rate": 2.5815857905697548e-05,
"epoch": 0.7786688051323175,
"step": 971
},
{
"loss": 0.0469,
"grad_norm": 0.35508066415786743,
"learning_rate": 2.5638190658126938e-05,
"epoch": 0.7794707297514034,
"step": 972
},
{
"loss": 0.05,
"grad_norm": 0.4158821403980255,
"learning_rate": 2.5461046945933854e-05,
"epoch": 0.7802726543704892,
"step": 973
},
{
"loss": 0.0471,
"grad_norm": 0.4256257712841034,
"learning_rate": 2.5284428016277284e-05,
"epoch": 0.781074578989575,
"step": 974
},
{
"loss": 0.0345,
"grad_norm": 0.4111097753047943,
"learning_rate": 2.5108335112621562e-05,
"epoch": 0.7818765036086608,
"step": 975
},
{
"loss": 0.0287,
"grad_norm": 0.2494903802871704,
"learning_rate": 2.493276947472756e-05,
"epoch": 0.7826784282277466,
"step": 976
},
{
"loss": 0.0141,
"grad_norm": 0.3150325119495392,
"learning_rate": 2.4757732338644124e-05,
"epoch": 0.7834803528468324,
"step": 977
},
{
"loss": 0.0281,
"grad_norm": 0.33352166414260864,
"learning_rate": 2.458322493669911e-05,
"epoch": 0.7842822774659182,
"step": 978
},
{
"loss": 0.0404,
"grad_norm": 0.2655154764652252,
"learning_rate": 2.4409248497490922e-05,
"epoch": 0.785084202085004,
"step": 979
},
{
"loss": 0.0338,
"grad_norm": 0.4300474524497986,
"learning_rate": 2.4235804245879723e-05,
"epoch": 0.7858861267040899,
"step": 980
},
{
"loss": 0.0173,
"grad_norm": 0.18177032470703125,
"learning_rate": 2.4062893402978958e-05,
"epoch": 0.7866880513231757,
"step": 981
},
{
"loss": 0.0422,
"grad_norm": 0.3139914572238922,
"learning_rate": 2.389051718614662e-05,
"epoch": 0.7874899759422614,
"step": 982
},
{
"loss": 0.0139,
"grad_norm": 0.16391263902187347,
"learning_rate": 2.371867680897668e-05,
"epoch": 0.7882919005613472,
"step": 983
},
{
"loss": 0.0496,
"grad_norm": 0.3264078199863434,
"learning_rate": 2.354737348129077e-05,
"epoch": 0.789093825180433,
"step": 984
},
{
"loss": 0.0342,
"grad_norm": 0.3129443824291229,
"learning_rate": 2.337660840912923e-05,
"epoch": 0.7898957497995188,
"step": 985
},
{
"loss": 0.0892,
"grad_norm": 0.5504446029663086,
"learning_rate": 2.320638279474312e-05,
"epoch": 0.7906976744186046,
"step": 986
},
{
"loss": 0.0212,
"grad_norm": 0.2645837664604187,
"learning_rate": 2.3036697836585353e-05,
"epoch": 0.7914995990376904,
"step": 987
},
{
"loss": 0.0374,
"grad_norm": 0.3089625835418701,
"learning_rate": 2.2867554729302542e-05,
"epoch": 0.7923015236567763,
"step": 988
},
{
"loss": 0.068,
"grad_norm": 0.5308516025543213,
"learning_rate": 2.26989546637263e-05,
"epoch": 0.7931034482758621,
"step": 989
},
{
"loss": 0.0077,
"grad_norm": 0.20258042216300964,
"learning_rate": 2.25308988268652e-05,
"epoch": 0.7939053728949479,
"step": 990
},
{
"loss": 0.0377,
"grad_norm": 0.3660711348056793,
"learning_rate": 2.2363388401896124e-05,
"epoch": 0.7947072975140337,
"step": 991
},
{
"loss": 0.037,
"grad_norm": 0.3499682545661926,
"learning_rate": 2.2196424568156073e-05,
"epoch": 0.7955092221331195,
"step": 992
},
{
"eval_loss": 0.039402980357408524,
"eval_runtime": 31.7538,
"eval_samples_per_second": 33.067,
"eval_steps_per_second": 8.282,
"epoch": 0.7955092221331195,
"step": 992
},
{
"loss": 0.0333,
"grad_norm": 0.259440541267395,
"learning_rate": 2.2030008501133815e-05,
"epoch": 0.7963111467522053,
"step": 993
},
{
"loss": 0.0071,
"grad_norm": 0.16091641783714294,
"learning_rate": 2.186414137246172e-05,
"epoch": 0.7971130713712911,
"step": 994
},
{
"loss": 0.0539,
"grad_norm": 0.4330938160419464,
"learning_rate": 2.1698824349907344e-05,
"epoch": 0.7979149959903769,
"step": 995
},
{
"loss": 0.0287,
"grad_norm": 0.2377873659133911,
"learning_rate": 2.153405859736528e-05,
"epoch": 0.7987169206094628,
"step": 996
},
{
"loss": 0.0286,
"grad_norm": 0.30016374588012695,
"learning_rate": 2.136984527484901e-05,
"epoch": 0.7995188452285485,
"step": 997
},
{
"loss": 0.0644,
"grad_norm": 0.3702533543109894,
"learning_rate": 2.1206185538482703e-05,
"epoch": 0.8003207698476343,
"step": 998
},
{
"loss": 0.0425,
"grad_norm": 0.31753045320510864,
"learning_rate": 2.1043080540493056e-05,
"epoch": 0.8011226944667201,
"step": 999
},
{
"loss": 0.0495,
"grad_norm": 0.26273587346076965,
"learning_rate": 2.0880531429201145e-05,
"epoch": 0.8019246190858059,
"step": 1000
},
{
"loss": 0.0328,
"grad_norm": 0.2284964621067047,
"learning_rate": 2.0718539349014544e-05,
"epoch": 0.8027265437048917,
"step": 1001
},
{
"loss": 0.0299,
"grad_norm": 0.31368839740753174,
"learning_rate": 2.05571054404189e-05,
"epoch": 0.8035284683239775,
"step": 1002
},
{
"loss": 0.0734,
"grad_norm": 0.5880581736564636,
"learning_rate": 2.039623083997031e-05,
"epoch": 0.8043303929430633,
"step": 1003
},
{
"loss": 0.0626,
"grad_norm": 0.6813879609107971,
"learning_rate": 2.0235916680287015e-05,
"epoch": 0.8051323175621492,
"step": 1004
},
{
"loss": 0.0493,
"grad_norm": 0.8034153580665588,
"learning_rate": 2.007616409004165e-05,
"epoch": 0.805934242181235,
"step": 1005
},
{
"loss": 0.0275,
"grad_norm": 0.2636951208114624,
"learning_rate": 1.991697419395301e-05,
"epoch": 0.8067361668003208,
"step": 1006
},
{
"loss": 0.0725,
"grad_norm": 0.7305841445922852,
"learning_rate": 1.97583481127785e-05,
"epoch": 0.8075380914194066,
"step": 1007
},
{
"loss": 0.0484,
"grad_norm": 0.3095945119857788,
"learning_rate": 1.9600286963305957e-05,
"epoch": 0.8083400160384924,
"step": 1008
},
{
"loss": 0.0204,
"grad_norm": 0.21153652667999268,
"learning_rate": 1.9442791858345887e-05,
"epoch": 0.8091419406575782,
"step": 1009
},
{
"loss": 0.0456,
"grad_norm": 0.2735914885997772,
"learning_rate": 1.928586390672361e-05,
"epoch": 0.809943865276664,
"step": 1010
},
{
"loss": 0.0288,
"grad_norm": 0.2760597765445709,
"learning_rate": 1.9129504213271564e-05,
"epoch": 0.8107457898957497,
"step": 1011
},
{
"loss": 0.0254,
"grad_norm": 0.25427863001823425,
"learning_rate": 1.897371387882134e-05,
"epoch": 0.8115477145148356,
"step": 1012
},
{
"loss": 0.0204,
"grad_norm": 0.23352181911468506,
"learning_rate": 1.881849400019602e-05,
"epoch": 0.8123496391339214,
"step": 1013
},
{
"loss": 0.0979,
"grad_norm": 0.5435330867767334,
"learning_rate": 1.8663845670202563e-05,
"epoch": 0.8131515637530072,
"step": 1014
},
{
"loss": 0.025,
"grad_norm": 0.2618383467197418,
"learning_rate": 1.85097699776239e-05,
"epoch": 0.813953488372093,
"step": 1015
},
{
"loss": 0.024,
"grad_norm": 0.26069387793540955,
"learning_rate": 1.835626800721144e-05,
"epoch": 0.8147554129911788,
"step": 1016
},
{
"loss": 0.0237,
"grad_norm": 0.2411407083272934,
"learning_rate": 1.8203340839677308e-05,
"epoch": 0.8155573376102646,
"step": 1017
},
{
"loss": 0.0388,
"grad_norm": 0.4038242697715759,
"learning_rate": 1.8050989551686914e-05,
"epoch": 0.8163592622293504,
"step": 1018
},
{
"loss": 0.068,
"grad_norm": 0.5744785070419312,
"learning_rate": 1.7899215215851084e-05,
"epoch": 0.8171611868484362,
"step": 1019
},
{
"loss": 0.0195,
"grad_norm": 0.23403626680374146,
"learning_rate": 1.7748018900718854e-05,
"epoch": 0.8179631114675221,
"step": 1020
},
{
"loss": 0.0095,
"grad_norm": 0.11110510677099228,
"learning_rate": 1.7597401670769685e-05,
"epoch": 0.8187650360866079,
"step": 1021
},
{
"loss": 0.0321,
"grad_norm": 0.38874614238739014,
"learning_rate": 1.7447364586406066e-05,
"epoch": 0.8195669607056937,
"step": 1022
},
{
"loss": 0.083,
"grad_norm": 0.46676599979400635,
"learning_rate": 1.729790870394603e-05,
"epoch": 0.8203688853247795,
"step": 1023
},
{
"loss": 0.0314,
"grad_norm": 0.4288753569126129,
"learning_rate": 1.7149035075615794e-05,
"epoch": 0.8211708099438653,
"step": 1024
},
{
"loss": 0.0651,
"grad_norm": 0.29615238308906555,
"learning_rate": 1.7000744749542208e-05,
"epoch": 0.8219727345629511,
"step": 1025
},
{
"loss": 0.068,
"grad_norm": 0.4459689259529114,
"learning_rate": 1.6853038769745467e-05,
"epoch": 0.8227746591820368,
"step": 1026
},
{
"loss": 0.0259,
"grad_norm": 0.22963477671146393,
"learning_rate": 1.670591817613181e-05,
"epoch": 0.8235765838011226,
"step": 1027
},
{
"loss": 0.0222,
"grad_norm": 0.2704809010028839,
"learning_rate": 1.6559384004486055e-05,
"epoch": 0.8243785084202085,
"step": 1028
},
{
"loss": 0.028,
"grad_norm": 0.28117361664772034,
"learning_rate": 1.6413437286464417e-05,
"epoch": 0.8251804330392943,
"step": 1029
},
{
"loss": 0.0303,
"grad_norm": 0.22778946161270142,
"learning_rate": 1.6268079049587203e-05,
"epoch": 0.8259823576583801,
"step": 1030
},
{
"loss": 0.1011,
"grad_norm": 0.7209060788154602,
"learning_rate": 1.6123310317231643e-05,
"epoch": 0.8267842822774659,
"step": 1031
},
{
"loss": 0.0223,
"grad_norm": 0.383091002702713,
"learning_rate": 1.5979132108624574e-05,
"epoch": 0.8275862068965517,
"step": 1032
},
{
"loss": 0.0628,
"grad_norm": 0.5048542618751526,
"learning_rate": 1.583554543883532e-05,
"epoch": 0.8283881315156375,
"step": 1033
},
{
"loss": 0.0255,
"grad_norm": 0.21313592791557312,
"learning_rate": 1.5692551318768556e-05,
"epoch": 0.8291900561347233,
"step": 1034
},
{
"loss": 0.0251,
"grad_norm": 0.2026532143354416,
"learning_rate": 1.5550150755157268e-05,
"epoch": 0.8299919807538091,
"step": 1035
},
{
"loss": 0.0779,
"grad_norm": 0.3825243413448334,
"learning_rate": 1.5408344750555383e-05,
"epoch": 0.830793905372895,
"step": 1036
},
{
"loss": 0.0298,
"grad_norm": 0.31827080249786377,
"learning_rate": 1.5267134303331122e-05,
"epoch": 0.8315958299919808,
"step": 1037
},
{
"loss": 0.0338,
"grad_norm": 0.25245165824890137,
"learning_rate": 1.5126520407659617e-05,
"epoch": 0.8323977546110666,
"step": 1038
},
{
"loss": 0.026,
"grad_norm": 0.22130174934864044,
"learning_rate": 1.4986504053516105e-05,
"epoch": 0.8331996792301524,
"step": 1039
},
{
"loss": 0.0215,
"grad_norm": 0.22145700454711914,
"learning_rate": 1.4847086226668872e-05,
"epoch": 0.8340016038492382,
"step": 1040
},
{
"loss": 0.0759,
"grad_norm": 0.42060795426368713,
"learning_rate": 1.4708267908672401e-05,
"epoch": 0.834803528468324,
"step": 1041
},
{
"loss": 0.0294,
"grad_norm": 0.32257431745529175,
"learning_rate": 1.4570050076860342e-05,
"epoch": 0.8356054530874097,
"step": 1042
},
{
"loss": 0.0242,
"grad_norm": 0.16260656714439392,
"learning_rate": 1.4432433704338722e-05,
"epoch": 0.8364073777064955,
"step": 1043
},
{
"loss": 0.0232,
"grad_norm": 0.2972884476184845,
"learning_rate": 1.429541975997908e-05,
"epoch": 0.8372093023255814,
"step": 1044
},
{
"loss": 0.0321,
"grad_norm": 0.3135718107223511,
"learning_rate": 1.415900920841161e-05,
"epoch": 0.8380112269446672,
"step": 1045
},
{
"loss": 0.0203,
"grad_norm": 0.21164710819721222,
"learning_rate": 1.4023203010018394e-05,
"epoch": 0.838813151563753,
"step": 1046
},
{
"loss": 0.021,
"grad_norm": 0.2070372849702835,
"learning_rate": 1.3888002120926623e-05,
"epoch": 0.8396150761828388,
"step": 1047
},
{
"loss": 0.0428,
"grad_norm": 0.39735156297683716,
"learning_rate": 1.3753407493001968e-05,
"epoch": 0.8404170008019246,
"step": 1048
},
{
"loss": 0.02,
"grad_norm": 0.23848125338554382,
"learning_rate": 1.3619420073841637e-05,
"epoch": 0.8412189254210104,
"step": 1049
},
{
"loss": 0.056,
"grad_norm": 0.3926337957382202,
"learning_rate": 1.3486040806767996e-05,
"epoch": 0.8420208500400962,
"step": 1050
},
{
"loss": 0.0319,
"grad_norm": 0.2710540294647217,
"learning_rate": 1.3353270630821712e-05,
"epoch": 0.842822774659182,
"step": 1051
},
{
"loss": 0.0402,
"grad_norm": 0.41882696747779846,
"learning_rate": 1.3221110480755305e-05,
"epoch": 0.8436246992782679,
"step": 1052
},
{
"loss": 0.0475,
"grad_norm": 0.4259793162345886,
"learning_rate": 1.3089561287026319e-05,
"epoch": 0.8444266238973537,
"step": 1053
},
{
"loss": 0.0314,
"grad_norm": 0.26140275597572327,
"learning_rate": 1.2958623975791118e-05,
"epoch": 0.8452285485164395,
"step": 1054
},
{
"loss": 0.0476,
"grad_norm": 0.3547419309616089,
"learning_rate": 1.2828299468898076e-05,
"epoch": 0.8460304731355253,
"step": 1055
},
{
"loss": 0.0493,
"grad_norm": 0.40817224979400635,
"learning_rate": 1.2698588683881186e-05,
"epoch": 0.846832397754611,
"step": 1056
},
{
"loss": 0.0312,
"grad_norm": 0.2400854080915451,
"learning_rate": 1.2569492533953665e-05,
"epoch": 0.8476343223736968,
"step": 1057
},
{
"loss": 0.0251,
"grad_norm": 0.3751298785209656,
"learning_rate": 1.2441011928001433e-05,
"epoch": 0.8484362469927826,
"step": 1058
},
{
"loss": 0.0223,
"grad_norm": 0.2388581931591034,
"learning_rate": 1.2313147770576749e-05,
"epoch": 0.8492381716118684,
"step": 1059
},
{
"loss": 0.0415,
"grad_norm": 0.3184750974178314,
"learning_rate": 1.2185900961891794e-05,
"epoch": 0.8500400962309543,
"step": 1060
},
{
"loss": 0.0818,
"grad_norm": 0.5252367258071899,
"learning_rate": 1.2059272397812493e-05,
"epoch": 0.8508420208500401,
"step": 1061
},
{
"loss": 0.0277,
"grad_norm": 0.3043983578681946,
"learning_rate": 1.1933262969851988e-05,
"epoch": 0.8516439454691259,
"step": 1062
},
{
"loss": 0.0106,
"grad_norm": 0.09968919306993484,
"learning_rate": 1.1807873565164506e-05,
"epoch": 0.8524458700882117,
"step": 1063
},
{
"loss": 0.0365,
"grad_norm": 0.27977362275123596,
"learning_rate": 1.1683105066539068e-05,
"epoch": 0.8532477947072975,
"step": 1064
},
{
"loss": 0.0589,
"grad_norm": 0.3934069275856018,
"learning_rate": 1.1558958352393334e-05,
"epoch": 0.8540497193263833,
"step": 1065
},
{
"loss": 0.0272,
"grad_norm": 0.24241115152835846,
"learning_rate": 1.1435434296767233e-05,
"epoch": 0.8548516439454691,
"step": 1066
},
{
"loss": 0.0502,
"grad_norm": 0.31850865483283997,
"learning_rate": 1.1312533769317103e-05,
"epoch": 0.8556535685645549,
"step": 1067
},
{
"loss": 0.0295,
"grad_norm": 0.2685650885105133,
"learning_rate": 1.1190257635309275e-05,
"epoch": 0.8564554931836408,
"step": 1068
},
{
"loss": 0.0346,
"grad_norm": 0.20911841094493866,
"learning_rate": 1.106860675561424e-05,
"epoch": 0.8572574178027266,
"step": 1069
},
{
"loss": 0.0187,
"grad_norm": 0.24455945193767548,
"learning_rate": 1.0947581986700306e-05,
"epoch": 0.8580593424218124,
"step": 1070
},
{
"loss": 0.0396,
"grad_norm": 0.3328882157802582,
"learning_rate": 1.0827184180627858e-05,
"epoch": 0.8588612670408982,
"step": 1071
},
{
"loss": 0.0463,
"grad_norm": 0.4071497917175293,
"learning_rate": 1.0707414185043163e-05,
"epoch": 0.859663191659984,
"step": 1072
},
{
"loss": 0.0366,
"grad_norm": 0.411491334438324,
"learning_rate": 1.0588272843172454e-05,
"epoch": 0.8604651162790697,
"step": 1073
},
{
"loss": 0.0312,
"grad_norm": 0.4028700888156891,
"learning_rate": 1.0469760993816057e-05,
"epoch": 0.8612670408981555,
"step": 1074
},
{
"loss": 0.0297,
"grad_norm": 0.17155224084854126,
"learning_rate": 1.0351879471342374e-05,
"epoch": 0.8620689655172413,
"step": 1075
},
{
"loss": 0.047,
"grad_norm": 0.3104127049446106,
"learning_rate": 1.0234629105682103e-05,
"epoch": 0.8628708901363272,
"step": 1076
},
{
"loss": 0.0207,
"grad_norm": 0.2133323699235916,
"learning_rate": 1.0118010722322314e-05,
"epoch": 0.863672814755413,
"step": 1077
},
{
"loss": 0.0143,
"grad_norm": 0.17464157938957214,
"learning_rate": 1.0002025142300765e-05,
"epoch": 0.8644747393744988,
"step": 1078
},
{
"loss": 0.0289,
"grad_norm": 0.3865419328212738,
"learning_rate": 9.886673182199957e-06,
"epoch": 0.8652766639935846,
"step": 1079
},
{
"loss": 0.0185,
"grad_norm": 0.2113240659236908,
"learning_rate": 9.771955654141496e-06,
"epoch": 0.8660785886126704,
"step": 1080
},
{
"loss": 0.0135,
"grad_norm": 0.138031005859375,
"learning_rate": 9.657873365780323e-06,
"epoch": 0.8668805132317562,
"step": 1081
},
{
"loss": 0.0308,
"grad_norm": 0.3399095833301544,
"learning_rate": 9.544427120299138e-06,
"epoch": 0.867682437850842,
"step": 1082
},
{
"loss": 0.0146,
"grad_norm": 0.26471027731895447,
"learning_rate": 9.431617716402507e-06,
"epoch": 0.8684843624699278,
"step": 1083
},
{
"loss": 0.0202,
"grad_norm": 0.19661951065063477,
"learning_rate": 9.319445948311534e-06,
"epoch": 0.8692862870890137,
"step": 1084
},
{
"loss": 0.0636,
"grad_norm": 0.49121007323265076,
"learning_rate": 9.207912605758052e-06,
"epoch": 0.8700882117080995,
"step": 1085
},
{
"loss": 0.0532,
"grad_norm": 0.4061635434627533,
"learning_rate": 9.097018473979124e-06,
"epoch": 0.8708901363271853,
"step": 1086
},
{
"loss": 0.0411,
"grad_norm": 0.29139432311058044,
"learning_rate": 8.986764333711584e-06,
"epoch": 0.871692060946271,
"step": 1087
},
{
"loss": 0.0191,
"grad_norm": 0.14276857674121857,
"learning_rate": 8.87715096118642e-06,
"epoch": 0.8724939855653568,
"step": 1088
},
{
"loss": 0.0147,
"grad_norm": 0.19102653861045837,
"learning_rate": 8.768179128123455e-06,
"epoch": 0.8732959101844426,
"step": 1089
},
{
"loss": 0.106,
"grad_norm": 0.6417858004570007,
"learning_rate": 8.659849601725701e-06,
"epoch": 0.8740978348035284,
"step": 1090
},
{
"loss": 0.0448,
"grad_norm": 0.3078593313694,
"learning_rate": 8.55216314467422e-06,
"epoch": 0.8748997594226142,
"step": 1091
},
{
"loss": 0.0169,
"grad_norm": 0.22024403512477875,
"learning_rate": 8.445120515122551e-06,
"epoch": 0.8757016840417001,
"step": 1092
},
{
"loss": 0.0637,
"grad_norm": 0.4088142514228821,
"learning_rate": 8.338722466691451e-06,
"epoch": 0.8765036086607859,
"step": 1093
},
{
"loss": 0.0353,
"grad_norm": 0.2129702866077423,
"learning_rate": 8.23296974846357e-06,
"epoch": 0.8773055332798717,
"step": 1094
},
{
"loss": 0.0619,
"grad_norm": 0.438853919506073,
"learning_rate": 8.127863104978261e-06,
"epoch": 0.8781074578989575,
"step": 1095
},
{
"loss": 0.0339,
"grad_norm": 0.30360084772109985,
"learning_rate": 8.023403276226126e-06,
"epoch": 0.8789093825180433,
"step": 1096
},
{
"loss": 0.0257,
"grad_norm": 0.2785508930683136,
"learning_rate": 7.91959099764411e-06,
"epoch": 0.8797113071371291,
"step": 1097
},
{
"loss": 0.0355,
"grad_norm": 0.2872433066368103,
"learning_rate": 7.816427000110015e-06,
"epoch": 0.8805132317562149,
"step": 1098
},
{
"loss": 0.0103,
"grad_norm": 0.11933287978172302,
"learning_rate": 7.713912009937608e-06,
"epoch": 0.8813151563753007,
"step": 1099
},
{
"loss": 0.0392,
"grad_norm": 0.26215773820877075,
"learning_rate": 7.612046748871327e-06,
"epoch": 0.8821170809943866,
"step": 1100
},
{
"loss": 0.0147,
"grad_norm": 1.0445473194122314,
"learning_rate": 7.5108319340813085e-06,
"epoch": 0.8829190056134724,
"step": 1101
},
{
"loss": 0.0212,
"grad_norm": 0.18120186030864716,
"learning_rate": 7.410268278158272e-06,
"epoch": 0.8837209302325582,
"step": 1102
},
{
"loss": 0.0471,
"grad_norm": 0.37671101093292236,
"learning_rate": 7.310356489108538e-06,
"epoch": 0.884522854851644,
"step": 1103
},
{
"loss": 0.0455,
"grad_norm": 0.38250431418418884,
"learning_rate": 7.211097270349066e-06,
"epoch": 0.8853247794707297,
"step": 1104
},
{
"loss": 0.0233,
"grad_norm": 0.36999034881591797,
"learning_rate": 7.112491320702441e-06,
"epoch": 0.8861267040898155,
"step": 1105
},
{
"loss": 0.0264,
"grad_norm": 0.2825574278831482,
"learning_rate": 7.014539334392012e-06,
"epoch": 0.8869286287089013,
"step": 1106
},
{
"loss": 0.064,
"grad_norm": 0.35153815150260925,
"learning_rate": 6.917242001036917e-06,
"epoch": 0.8877305533279871,
"step": 1107
},
{
"loss": 0.0188,
"grad_norm": 0.14561624825000763,
"learning_rate": 6.820600005647382e-06,
"epoch": 0.888532477947073,
"step": 1108
},
{
"loss": 0.0561,
"grad_norm": 0.463888555765152,
"learning_rate": 6.7246140286197355e-06,
"epoch": 0.8893344025661588,
"step": 1109
},
{
"loss": 0.0041,
"grad_norm": 0.0676012933254242,
"learning_rate": 6.629284745731701e-06,
"epoch": 0.8901363271852446,
"step": 1110
},
{
"loss": 0.0159,
"grad_norm": 0.18029426038265228,
"learning_rate": 6.5346128281376204e-06,
"epoch": 0.8909382518043304,
"step": 1111
},
{
"loss": 0.063,
"grad_norm": 0.35441911220550537,
"learning_rate": 6.440598942363796e-06,
"epoch": 0.8917401764234162,
"step": 1112
},
{
"loss": 0.0502,
"grad_norm": 0.37630465626716614,
"learning_rate": 6.347243750303622e-06,
"epoch": 0.892542101042502,
"step": 1113
},
{
"loss": 0.0604,
"grad_norm": 0.3076138496398926,
"learning_rate": 6.254547909213149e-06,
"epoch": 0.8933440256615878,
"step": 1114
},
{
"loss": 0.0382,
"grad_norm": 0.2812439203262329,
"learning_rate": 6.162512071706272e-06,
"epoch": 0.8941459502806736,
"step": 1115
},
{
"loss": 0.0421,
"grad_norm": 0.26998552680015564,
"learning_rate": 6.071136885750272e-06,
"epoch": 0.8949478748997595,
"step": 1116
},
{
"eval_loss": 0.03864981606602669,
"eval_runtime": 31.6055,
"eval_samples_per_second": 33.222,
"eval_steps_per_second": 8.321,
"epoch": 0.8949478748997595,
"step": 1116
},
{
"loss": 0.0632,
"grad_norm": 0.4778763949871063,
"learning_rate": 5.980422994661139e-06,
"epoch": 0.8957497995188453,
"step": 1117
},
{
"loss": 0.0152,
"grad_norm": 0.18333400785923004,
"learning_rate": 5.890371037099107e-06,
"epoch": 0.896551724137931,
"step": 1118
},
{
"loss": 0.027,
"grad_norm": 0.28018802404403687,
"learning_rate": 5.800981647064186e-06,
"epoch": 0.8973536487570168,
"step": 1119
},
{
"loss": 0.0281,
"grad_norm": 0.2619428336620331,
"learning_rate": 5.71225545389158e-06,
"epoch": 0.8981555733761026,
"step": 1120
},
{
"loss": 0.0274,
"grad_norm": 0.2673223912715912,
"learning_rate": 5.624193082247431e-06,
"epoch": 0.8989574979951884,
"step": 1121
},
{
"loss": 0.0482,
"grad_norm": 0.2983281910419464,
"learning_rate": 5.536795152124252e-06,
"epoch": 0.8997594226142742,
"step": 1122
},
{
"loss": 0.0384,
"grad_norm": 0.30985018610954285,
"learning_rate": 5.450062278836677e-06,
"epoch": 0.90056134723336,
"step": 1123
},
{
"loss": 0.0169,
"grad_norm": 0.1679602563381195,
"learning_rate": 5.363995073017047e-06,
"epoch": 0.9013632718524459,
"step": 1124
},
{
"loss": 0.0396,
"grad_norm": 0.29093822836875916,
"learning_rate": 5.278594140611204e-06,
"epoch": 0.9021651964715317,
"step": 1125
},
{
"loss": 0.0218,
"grad_norm": 0.26348739862442017,
"learning_rate": 5.193860082874125e-06,
"epoch": 0.9029671210906175,
"step": 1126
},
{
"loss": 0.0458,
"grad_norm": 0.5017328262329102,
"learning_rate": 5.1097934963657665e-06,
"epoch": 0.9037690457097033,
"step": 1127
},
{
"loss": 0.0321,
"grad_norm": 0.3663092851638794,
"learning_rate": 5.026394972946813e-06,
"epoch": 0.9045709703287891,
"step": 1128
},
{
"loss": 0.0308,
"grad_norm": 0.28317686915397644,
"learning_rate": 4.943665099774553e-06,
"epoch": 0.9053728949478749,
"step": 1129
},
{
"loss": 0.0492,
"grad_norm": 0.4701959490776062,
"learning_rate": 4.861604459298696e-06,
"epoch": 0.9061748195669607,
"step": 1130
},
{
"loss": 0.0677,
"grad_norm": 0.5933964252471924,
"learning_rate": 4.780213629257324e-06,
"epoch": 0.9069767441860465,
"step": 1131
},
{
"loss": 0.0532,
"grad_norm": 0.35850825905799866,
"learning_rate": 4.69949318267281e-06,
"epoch": 0.9077786688051324,
"step": 1132
},
{
"loss": 0.0429,
"grad_norm": 0.3367816209793091,
"learning_rate": 4.619443687847702e-06,
"epoch": 0.9085805934242182,
"step": 1133
},
{
"loss": 0.0223,
"grad_norm": 0.3213984966278076,
"learning_rate": 4.540065708360886e-06,
"epoch": 0.909382518043304,
"step": 1134
},
{
"loss": 0.0452,
"grad_norm": 0.4436606168746948,
"learning_rate": 4.461359803063458e-06,
"epoch": 0.9101844426623897,
"step": 1135
},
{
"loss": 0.0179,
"grad_norm": 0.3105975389480591,
"learning_rate": 4.383326526074916e-06,
"epoch": 0.9109863672814755,
"step": 1136
},
{
"loss": 0.0313,
"grad_norm": 0.3733506202697754,
"learning_rate": 4.305966426779118e-06,
"epoch": 0.9117882919005613,
"step": 1137
},
{
"loss": 0.0142,
"grad_norm": 0.28305160999298096,
"learning_rate": 4.229280049820561e-06,
"epoch": 0.9125902165196471,
"step": 1138
},
{
"loss": 0.0351,
"grad_norm": 0.431251585483551,
"learning_rate": 4.15326793510048e-06,
"epoch": 0.9133921411387329,
"step": 1139
},
{
"loss": 0.0231,
"grad_norm": 0.20681129395961761,
"learning_rate": 4.077930617773007e-06,
"epoch": 0.9141940657578188,
"step": 1140
},
{
"loss": 0.0448,
"grad_norm": 0.314126193523407,
"learning_rate": 4.003268628241452e-06,
"epoch": 0.9149959903769046,
"step": 1141
},
{
"loss": 0.0495,
"grad_norm": 0.3883209228515625,
"learning_rate": 3.929282492154607e-06,
"epoch": 0.9157979149959904,
"step": 1142
},
{
"loss": 0.0204,
"grad_norm": 0.28020593523979187,
"learning_rate": 3.855972730402968e-06,
"epoch": 0.9165998396150762,
"step": 1143
},
{
"loss": 0.0161,
"grad_norm": 0.23583762347698212,
"learning_rate": 3.783339859115065e-06,
"epoch": 0.917401764234162,
"step": 1144
},
{
"loss": 0.0306,
"grad_norm": 0.2325585037469864,
"learning_rate": 3.711384389653916e-06,
"epoch": 0.9182036888532478,
"step": 1145
},
{
"loss": 0.0235,
"grad_norm": 0.24619098007678986,
"learning_rate": 3.6401068286133542e-06,
"epoch": 0.9190056134723336,
"step": 1146
},
{
"loss": 0.033,
"grad_norm": 0.3259875476360321,
"learning_rate": 3.5695076778144875e-06,
"epoch": 0.9198075380914194,
"step": 1147
},
{
"loss": 0.0304,
"grad_norm": 0.25106412172317505,
"learning_rate": 3.4995874343021094e-06,
"epoch": 0.9206094627105053,
"step": 1148
},
{
"loss": 0.0469,
"grad_norm": 0.4743681252002716,
"learning_rate": 3.430346590341338e-06,
"epoch": 0.921411387329591,
"step": 1149
},
{
"loss": 0.0539,
"grad_norm": 0.34995362162590027,
"learning_rate": 3.3617856334139607e-06,
"epoch": 0.9222133119486768,
"step": 1150
},
{
"loss": 0.0662,
"grad_norm": 0.7248356342315674,
"learning_rate": 3.2939050462151953e-06,
"epoch": 0.9230152365677626,
"step": 1151
},
{
"loss": 0.014,
"grad_norm": 0.2265823483467102,
"learning_rate": 3.226705306650113e-06,
"epoch": 0.9238171611868484,
"step": 1152
},
{
"loss": 0.013,
"grad_norm": 0.20024491846561432,
"learning_rate": 3.1601868878304406e-06,
"epoch": 0.9246190858059342,
"step": 1153
},
{
"loss": 0.0347,
"grad_norm": 0.25708380341529846,
"learning_rate": 3.0943502580710772e-06,
"epoch": 0.92542101042502,
"step": 1154
},
{
"loss": 0.0149,
"grad_norm": 0.2992137372493744,
"learning_rate": 3.0291958808869037e-06,
"epoch": 0.9262229350441058,
"step": 1155
},
{
"loss": 0.0162,
"grad_norm": 0.24897243082523346,
"learning_rate": 2.9647242149895006e-06,
"epoch": 0.9270248596631917,
"step": 1156
},
{
"loss": 0.0193,
"grad_norm": 0.19664904475212097,
"learning_rate": 2.9009357142838477e-06,
"epoch": 0.9278267842822775,
"step": 1157
},
{
"loss": 0.0288,
"grad_norm": 0.40675321221351624,
"learning_rate": 2.8378308278652288e-06,
"epoch": 0.9286287089013633,
"step": 1158
},
{
"loss": 0.029,
"grad_norm": 0.2802353501319885,
"learning_rate": 2.775410000016021e-06,
"epoch": 0.9294306335204491,
"step": 1159
},
{
"loss": 0.0561,
"grad_norm": 0.9122768044471741,
"learning_rate": 2.7136736702025433e-06,
"epoch": 0.9302325581395349,
"step": 1160
},
{
"loss": 0.032,
"grad_norm": 0.36887556314468384,
"learning_rate": 2.652622273072003e-06,
"epoch": 0.9310344827586207,
"step": 1161
},
{
"loss": 0.0285,
"grad_norm": 0.31024497747421265,
"learning_rate": 2.5922562384494196e-06,
"epoch": 0.9318364073777065,
"step": 1162
},
{
"loss": 0.0499,
"grad_norm": 0.27006012201309204,
"learning_rate": 2.532575991334618e-06,
"epoch": 0.9326383319967922,
"step": 1163
},
{
"loss": 0.0422,
"grad_norm": 0.3135731816291809,
"learning_rate": 2.473581951899184e-06,
"epoch": 0.9334402566158782,
"step": 1164
},
{
"loss": 0.0249,
"grad_norm": 0.23915302753448486,
"learning_rate": 2.415274535483547e-06,
"epoch": 0.9342421812349639,
"step": 1165
},
{
"loss": 0.0736,
"grad_norm": 0.5155062675476074,
"learning_rate": 2.357654152594113e-06,
"epoch": 0.9350441058540497,
"step": 1166
},
{
"loss": 0.0379,
"grad_norm": 0.28162410855293274,
"learning_rate": 2.3007212089001916e-06,
"epoch": 0.9358460304731355,
"step": 1167
},
{
"loss": 0.0247,
"grad_norm": 0.2652982771396637,
"learning_rate": 2.2444761052313856e-06,
"epoch": 0.9366479550922213,
"step": 1168
},
{
"loss": 0.0219,
"grad_norm": 0.1709524244070053,
"learning_rate": 2.1889192375745494e-06,
"epoch": 0.9374498797113071,
"step": 1169
},
{
"loss": 0.0725,
"grad_norm": 0.5203680396080017,
"learning_rate": 2.1340509970711466e-06,
"epoch": 0.9382518043303929,
"step": 1170
},
{
"loss": 0.0466,
"grad_norm": 0.5651960372924805,
"learning_rate": 2.0798717700144077e-06,
"epoch": 0.9390537289494787,
"step": 1171
},
{
"loss": 0.0316,
"grad_norm": 0.219880610704422,
"learning_rate": 2.0263819378466884e-06,
"epoch": 0.9398556535685646,
"step": 1172
},
{
"loss": 0.037,
"grad_norm": 0.3648744225502014,
"learning_rate": 1.973581877156716e-06,
"epoch": 0.9406575781876504,
"step": 1173
},
{
"loss": 0.0337,
"grad_norm": 0.35861659049987793,
"learning_rate": 1.921471959676957e-06,
"epoch": 0.9414595028067362,
"step": 1174
},
{
"loss": 0.0255,
"grad_norm": 0.3175615072250366,
"learning_rate": 1.870052552281032e-06,
"epoch": 0.942261427425822,
"step": 1175
},
{
"loss": 0.0521,
"grad_norm": 0.41771504282951355,
"learning_rate": 1.8193240169810943e-06,
"epoch": 0.9430633520449078,
"step": 1176
},
{
"loss": 0.113,
"grad_norm": 0.5672475695610046,
"learning_rate": 1.7692867109252886e-06,
"epoch": 0.9438652766639936,
"step": 1177
},
{
"loss": 0.0231,
"grad_norm": 0.24217240512371063,
"learning_rate": 1.7199409863952521e-06,
"epoch": 0.9446672012830793,
"step": 1178
},
{
"loss": 0.0286,
"grad_norm": 0.34596091508865356,
"learning_rate": 1.6712871908036387e-06,
"epoch": 0.9454691259021651,
"step": 1179
},
{
"loss": 0.0096,
"grad_norm": 0.1532655507326126,
"learning_rate": 1.623325666691644e-06,
"epoch": 0.946271050521251,
"step": 1180
},
{
"loss": 0.0107,
"grad_norm": 0.14835034310817719,
"learning_rate": 1.5760567517266066e-06,
"epoch": 0.9470729751403368,
"step": 1181
},
{
"loss": 0.0378,
"grad_norm": 0.31077146530151367,
"learning_rate": 1.5294807786996213e-06,
"epoch": 0.9478748997594226,
"step": 1182
},
{
"loss": 0.0459,
"grad_norm": 0.33775973320007324,
"learning_rate": 1.4835980755232626e-06,
"epoch": 0.9486768243785084,
"step": 1183
},
{
"loss": 0.0722,
"grad_norm": 0.3435156047344208,
"learning_rate": 1.4384089652291543e-06,
"epoch": 0.9494787489975942,
"step": 1184
},
{
"loss": 0.0259,
"grad_norm": 0.29428747296333313,
"learning_rate": 1.3939137659658153e-06,
"epoch": 0.95028067361668,
"step": 1185
},
{
"loss": 0.0189,
"grad_norm": 0.21608836948871613,
"learning_rate": 1.3501127909963274e-06,
"epoch": 0.9510825982357658,
"step": 1186
},
{
"loss": 0.0352,
"grad_norm": 0.35777002573013306,
"learning_rate": 1.3070063486961936e-06,
"epoch": 0.9518845228548516,
"step": 1187
},
{
"loss": 0.0284,
"grad_norm": 0.2871112525463104,
"learning_rate": 1.2645947425511395e-06,
"epoch": 0.9526864474739375,
"step": 1188
},
{
"loss": 0.0225,
"grad_norm": 0.22989165782928467,
"learning_rate": 1.2228782711549924e-06,
"epoch": 0.9534883720930233,
"step": 1189
},
{
"loss": 0.027,
"grad_norm": 0.3213796317577362,
"learning_rate": 1.181857228207539e-06,
"epoch": 0.9542902967121091,
"step": 1190
},
{
"loss": 0.0232,
"grad_norm": 0.27565819025039673,
"learning_rate": 1.1415319025124938e-06,
"epoch": 0.9550922213311949,
"step": 1191
},
{
"loss": 0.0385,
"grad_norm": 0.5389794111251831,
"learning_rate": 1.1019025779754666e-06,
"epoch": 0.9558941459502807,
"step": 1192
},
{
"loss": 0.0419,
"grad_norm": 0.27501630783081055,
"learning_rate": 1.0629695336019763e-06,
"epoch": 0.9566960705693665,
"step": 1193
},
{
"loss": 0.0423,
"grad_norm": 0.2711651027202606,
"learning_rate": 1.0247330434954071e-06,
"epoch": 0.9574979951884522,
"step": 1194
},
{
"loss": 0.0222,
"grad_norm": 0.20319171249866486,
"learning_rate": 9.871933768551888e-07,
"epoch": 0.958299919807538,
"step": 1195
},
{
"loss": 0.0718,
"grad_norm": 0.3775697648525238,
"learning_rate": 9.503507979748305e-07,
"epoch": 0.9591018444266239,
"step": 1196
},
{
"loss": 0.0313,
"grad_norm": 0.27909061312675476,
"learning_rate": 9.142055662400673e-07,
"epoch": 0.9599037690457097,
"step": 1197
},
{
"loss": 0.0754,
"grad_norm": 0.5719237923622131,
"learning_rate": 8.787579361270614e-07,
"epoch": 0.9607056936647955,
"step": 1198
},
{
"loss": 0.0706,
"grad_norm": 0.6314537525177002,
"learning_rate": 8.440081572005931e-07,
"epoch": 0.9615076182838813,
"step": 1199
},
{
"loss": 0.038,
"grad_norm": 0.45612236857414246,
"learning_rate": 8.099564741123166e-07,
"epoch": 0.9623095429029671,
"step": 1200
},
{
"loss": 0.0525,
"grad_norm": 0.3892579674720764,
"learning_rate": 7.766031265989849e-07,
"epoch": 0.9631114675220529,
"step": 1201
},
{
"loss": 0.0146,
"grad_norm": 0.20876550674438477,
"learning_rate": 7.439483494808497e-07,
"epoch": 0.9639133921411387,
"step": 1202
},
{
"loss": 0.0245,
"grad_norm": 0.2424009144306183,
"learning_rate": 7.11992372659942e-07,
"epoch": 0.9647153167602245,
"step": 1203
},
{
"loss": 0.0171,
"grad_norm": 0.20193764567375183,
"learning_rate": 6.807354211184613e-07,
"epoch": 0.9655172413793104,
"step": 1204
},
{
"loss": 0.0357,
"grad_norm": 0.3591613471508026,
"learning_rate": 6.501777149172328e-07,
"epoch": 0.9663191659983962,
"step": 1205
},
{
"loss": 0.0481,
"grad_norm": 0.40879741311073303,
"learning_rate": 6.203194691940972e-07,
"epoch": 0.967121090617482,
"step": 1206
},
{
"loss": 0.0316,
"grad_norm": 0.2834092080593109,
"learning_rate": 5.91160894162468e-07,
"epoch": 0.9679230152365678,
"step": 1207
},
{
"loss": 0.0357,
"grad_norm": 0.23049277067184448,
"learning_rate": 5.627021951097545e-07,
"epoch": 0.9687249398556536,
"step": 1208
},
{
"loss": 0.0289,
"grad_norm": 0.2661738693714142,
"learning_rate": 5.349435723960183e-07,
"epoch": 0.9695268644747393,
"step": 1209
},
{
"loss": 0.0295,
"grad_norm": 0.2500978708267212,
"learning_rate": 5.078852214525198e-07,
"epoch": 0.9703287890938251,
"step": 1210
},
{
"loss": 0.0253,
"grad_norm": 0.20364578068256378,
"learning_rate": 4.815273327803182e-07,
"epoch": 0.9711307137129109,
"step": 1211
},
{
"loss": 0.0444,
"grad_norm": 0.33421790599823,
"learning_rate": 4.5587009194894004e-07,
"epoch": 0.9719326383319968,
"step": 1212
},
{
"loss": 0.0181,
"grad_norm": 0.2810249626636505,
"learning_rate": 4.3091367959512407e-07,
"epoch": 0.9727345629510826,
"step": 1213
},
{
"loss": 0.0252,
"grad_norm": 0.24068693816661835,
"learning_rate": 4.066582714214895e-07,
"epoch": 0.9735364875701684,
"step": 1214
},
{
"loss": 0.0529,
"grad_norm": 0.3940003514289856,
"learning_rate": 3.831040381953144e-07,
"epoch": 0.9743384121892542,
"step": 1215
},
{
"loss": 0.0359,
"grad_norm": 0.7913379073143005,
"learning_rate": 3.6025114574734785e-07,
"epoch": 0.97514033680834,
"step": 1216
},
{
"loss": 0.1689,
"grad_norm": 0.7005312442779541,
"learning_rate": 3.380997549706444e-07,
"epoch": 0.9759422614274258,
"step": 1217
},
{
"loss": 0.0493,
"grad_norm": 0.38180509209632874,
"learning_rate": 3.166500218193758e-07,
"epoch": 0.9767441860465116,
"step": 1218
},
{
"loss": 0.0252,
"grad_norm": 0.20567728579044342,
"learning_rate": 2.9590209730784304e-07,
"epoch": 0.9775461106655974,
"step": 1219
},
{
"loss": 0.0352,
"grad_norm": 0.6058043241500854,
"learning_rate": 2.758561275092886e-07,
"epoch": 0.9783480352846833,
"step": 1220
},
{
"loss": 0.0272,
"grad_norm": 0.32482075691223145,
"learning_rate": 2.5651225355497464e-07,
"epoch": 0.9791499599037691,
"step": 1221
},
{
"loss": 0.031,
"grad_norm": 0.16501711308956146,
"learning_rate": 2.378706116330953e-07,
"epoch": 0.9799518845228549,
"step": 1222
},
{
"loss": 0.0579,
"grad_norm": 0.3395942747592926,
"learning_rate": 2.1993133298791046e-07,
"epoch": 0.9807538091419407,
"step": 1223
},
{
"loss": 0.013,
"grad_norm": 0.18358808755874634,
"learning_rate": 2.0269454391874666e-07,
"epoch": 0.9815557337610264,
"step": 1224
},
{
"loss": 0.0441,
"grad_norm": 0.4380914866924286,
"learning_rate": 1.861603657791422e-07,
"epoch": 0.9823576583801122,
"step": 1225
},
{
"loss": 0.0216,
"grad_norm": 0.19042056798934937,
"learning_rate": 1.7032891497600345e-07,
"epoch": 0.983159582999198,
"step": 1226
},
{
"loss": 0.0208,
"grad_norm": 0.21681898832321167,
"learning_rate": 1.5520030296873877e-07,
"epoch": 0.9839615076182838,
"step": 1227
},
{
"loss": 0.0664,
"grad_norm": 0.4365503489971161,
"learning_rate": 1.4077463626852582e-07,
"epoch": 0.9847634322373697,
"step": 1228
},
{
"loss": 0.0224,
"grad_norm": 0.18103234469890594,
"learning_rate": 1.270520164375344e-07,
"epoch": 0.9855653568564555,
"step": 1229
},
{
"loss": 0.029,
"grad_norm": 0.26580461859703064,
"learning_rate": 1.1403254008822695e-07,
"epoch": 0.9863672814755413,
"step": 1230
},
{
"loss": 0.0414,
"grad_norm": 0.2631921172142029,
"learning_rate": 1.0171629888265921e-07,
"epoch": 0.9871692060946271,
"step": 1231
},
{
"loss": 0.0267,
"grad_norm": 0.34322279691696167,
"learning_rate": 9.010337953185843e-08,
"epoch": 0.9879711307137129,
"step": 1232
},
{
"loss": 0.0245,
"grad_norm": 0.24382710456848145,
"learning_rate": 7.919386379515726e-08,
"epoch": 0.9887730553327987,
"step": 1233
},
{
"loss": 0.0652,
"grad_norm": 0.5472224354743958,
"learning_rate": 6.89878284797163e-08,
"epoch": 0.9895749799518845,
"step": 1234
},
{
"loss": 0.0412,
"grad_norm": 0.2964751720428467,
"learning_rate": 5.948534543988027e-08,
"epoch": 0.9903769045709703,
"step": 1235
},
{
"loss": 0.0436,
"grad_norm": 0.3399409353733063,
"learning_rate": 5.068648157675604e-08,
"epoch": 0.9911788291900562,
"step": 1236
},
{
"loss": 0.0481,
"grad_norm": 0.346693754196167,
"learning_rate": 4.259129883767976e-08,
"epoch": 0.991980753809142,
"step": 1237
},
{
"loss": 0.098,
"grad_norm": 0.5714817047119141,
"learning_rate": 3.5199854215817176e-08,
"epoch": 0.9927826784282278,
"step": 1238
},
{
"loss": 0.0611,
"grad_norm": 0.36500847339630127,
"learning_rate": 2.8512199749730628e-08,
"epoch": 0.9935846030473136,
"step": 1239
},
{
"loss": 0.0238,
"grad_norm": 0.21105419099330902,
"learning_rate": 2.2528382523057113e-08,
"epoch": 0.9943865276663993,
"step": 1240
},
{
"eval_loss": 0.038508880883455276,
"eval_runtime": 31.7523,
"eval_samples_per_second": 33.068,
"eval_steps_per_second": 8.283,
"epoch": 0.9943865276663993,
"step": 1240
},
{
"loss": 0.0441,
"grad_norm": 0.3713122010231018,
"learning_rate": 1.7248444664141884e-08,
"epoch": 0.9951884522854851,
"step": 1241
},
{
"loss": 0.0319,
"grad_norm": 0.1977764517068863,
"learning_rate": 1.2672423345760909e-08,
"epoch": 0.9959903769045709,
"step": 1242
},
{
"loss": 0.0478,
"grad_norm": 0.42995980381965637,
"learning_rate": 8.80035078482111e-09,
"epoch": 0.9967923015236567,
"step": 1243
},
{
"loss": 0.0306,
"grad_norm": 0.39976274967193604,
"learning_rate": 5.6322542422049266e-09,
"epoch": 0.9975942261427426,
"step": 1244
},
{
"loss": 0.0413,
"grad_norm": 0.28021112084388733,
"learning_rate": 3.1681560225038654e-09,
"epoch": 0.9983961507618284,
"step": 1245
},
{
"loss": 0.0335,
"grad_norm": 0.25943997502326965,
"learning_rate": 1.4080734739074786e-09,
"epoch": 0.9991980753809142,
"step": 1246
},
{
"loss": 0.0384,
"grad_norm": 0.45012426376342773,
"learning_rate": 3.52018988059033e-10,
"epoch": 1.0,
"step": 1247
},
{
"train_runtime": 1545.9245,
"train_samples_per_second": 12.901,
"train_steps_per_second": 0.807,
"total_flos": 1.433925787776e+16,
"train_loss": 0.05218265543086118,
"epoch": 1.0,
"step": 1247
}
]