Files
qwen2.5-VL-3B-atm-finetune-…/trainer_state.json
ModelHub XC f11041f846 初始化项目,由ModelHub XC社区提供模型
Model: DaDing777/qwen2.5-VL-3B-atm-finetune-cot-full
Source: Original Platform
2026-06-03 23:19:11 +08:00

11573 lines
280 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1647,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006074411541381929,
"grad_norm": 4.187357914113359,
"learning_rate": 0.0,
"loss": 1.3011,
"step": 1
},
{
"epoch": 0.0012148823082763858,
"grad_norm": 4.498512475534671,
"learning_rate": 6.060606060606061e-08,
"loss": 1.3126,
"step": 2
},
{
"epoch": 0.0018223234624145787,
"grad_norm": 4.0434971741932495,
"learning_rate": 1.2121212121212122e-07,
"loss": 1.2803,
"step": 3
},
{
"epoch": 0.0024297646165527716,
"grad_norm": 4.162307471665992,
"learning_rate": 1.8181818181818183e-07,
"loss": 1.3162,
"step": 4
},
{
"epoch": 0.0030372057706909645,
"grad_norm": 4.278843392925189,
"learning_rate": 2.4242424242424244e-07,
"loss": 1.3803,
"step": 5
},
{
"epoch": 0.0036446469248291574,
"grad_norm": 4.123864416498828,
"learning_rate": 3.0303030303030305e-07,
"loss": 1.3463,
"step": 6
},
{
"epoch": 0.00425208807896735,
"grad_norm": 4.304171763132361,
"learning_rate": 3.6363636363636366e-07,
"loss": 1.3397,
"step": 7
},
{
"epoch": 0.004859529233105543,
"grad_norm": 3.9277126463319307,
"learning_rate": 4.242424242424243e-07,
"loss": 1.3189,
"step": 8
},
{
"epoch": 0.005466970387243736,
"grad_norm": 3.922407631596546,
"learning_rate": 4.848484848484849e-07,
"loss": 1.2963,
"step": 9
},
{
"epoch": 0.006074411541381929,
"grad_norm": 3.895403645466344,
"learning_rate": 5.454545454545455e-07,
"loss": 1.2897,
"step": 10
},
{
"epoch": 0.006681852695520122,
"grad_norm": 4.0159782701678814,
"learning_rate": 6.060606060606061e-07,
"loss": 1.2556,
"step": 11
},
{
"epoch": 0.007289293849658315,
"grad_norm": 4.3312492479849976,
"learning_rate": 6.666666666666667e-07,
"loss": 1.2826,
"step": 12
},
{
"epoch": 0.007896735003796507,
"grad_norm": 3.8648934637566277,
"learning_rate": 7.272727272727273e-07,
"loss": 1.3033,
"step": 13
},
{
"epoch": 0.0085041761579347,
"grad_norm": 4.286961260584155,
"learning_rate": 7.878787878787879e-07,
"loss": 1.3182,
"step": 14
},
{
"epoch": 0.009111617312072893,
"grad_norm": 3.9563681075623696,
"learning_rate": 8.484848484848486e-07,
"loss": 1.2456,
"step": 15
},
{
"epoch": 0.009719058466211086,
"grad_norm": 4.609717367893727,
"learning_rate": 9.090909090909091e-07,
"loss": 1.3312,
"step": 16
},
{
"epoch": 0.010326499620349278,
"grad_norm": 3.8182264142948212,
"learning_rate": 9.696969696969698e-07,
"loss": 1.2702,
"step": 17
},
{
"epoch": 0.010933940774487472,
"grad_norm": 3.9030184249626765,
"learning_rate": 1.0303030303030304e-06,
"loss": 1.2982,
"step": 18
},
{
"epoch": 0.011541381928625664,
"grad_norm": 3.630670067928241,
"learning_rate": 1.090909090909091e-06,
"loss": 1.2928,
"step": 19
},
{
"epoch": 0.012148823082763858,
"grad_norm": 3.627122940350712,
"learning_rate": 1.1515151515151516e-06,
"loss": 1.3399,
"step": 20
},
{
"epoch": 0.01275626423690205,
"grad_norm": 3.4178400978584498,
"learning_rate": 1.2121212121212122e-06,
"loss": 1.201,
"step": 21
},
{
"epoch": 0.013363705391040244,
"grad_norm": 3.8805835742800308,
"learning_rate": 1.2727272727272728e-06,
"loss": 1.3176,
"step": 22
},
{
"epoch": 0.013971146545178436,
"grad_norm": 5.15609913207104,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.2253,
"step": 23
},
{
"epoch": 0.01457858769931663,
"grad_norm": 3.361843882497272,
"learning_rate": 1.3939393939393942e-06,
"loss": 1.2849,
"step": 24
},
{
"epoch": 0.015186028853454821,
"grad_norm": 3.419552970347977,
"learning_rate": 1.4545454545454546e-06,
"loss": 1.0856,
"step": 25
},
{
"epoch": 0.015793470007593013,
"grad_norm": 4.990297184335331,
"learning_rate": 1.5151515151515152e-06,
"loss": 1.1542,
"step": 26
},
{
"epoch": 0.01640091116173121,
"grad_norm": 3.6910946706226024,
"learning_rate": 1.5757575757575759e-06,
"loss": 1.1332,
"step": 27
},
{
"epoch": 0.0170083523158694,
"grad_norm": 3.2707017108314105,
"learning_rate": 1.6363636363636365e-06,
"loss": 1.0859,
"step": 28
},
{
"epoch": 0.017615793470007593,
"grad_norm": 3.6848753932789022,
"learning_rate": 1.6969696969696973e-06,
"loss": 1.1462,
"step": 29
},
{
"epoch": 0.018223234624145785,
"grad_norm": 3.539975633137712,
"learning_rate": 1.7575757575757577e-06,
"loss": 1.1017,
"step": 30
},
{
"epoch": 0.018830675778283977,
"grad_norm": 5.020856655134608,
"learning_rate": 1.8181818181818183e-06,
"loss": 1.1052,
"step": 31
},
{
"epoch": 0.019438116932422173,
"grad_norm": 2.8322427452989403,
"learning_rate": 1.878787878787879e-06,
"loss": 1.0702,
"step": 32
},
{
"epoch": 0.020045558086560365,
"grad_norm": 11.978063076522911,
"learning_rate": 1.9393939393939395e-06,
"loss": 1.0783,
"step": 33
},
{
"epoch": 0.020652999240698557,
"grad_norm": 3.0620204118757615,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9578,
"step": 34
},
{
"epoch": 0.02126044039483675,
"grad_norm": 2.6396729812241992,
"learning_rate": 2.0606060606060607e-06,
"loss": 1.0325,
"step": 35
},
{
"epoch": 0.021867881548974944,
"grad_norm": 2.605944141300131,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.9263,
"step": 36
},
{
"epoch": 0.022475322703113136,
"grad_norm": 2.253997875811574,
"learning_rate": 2.181818181818182e-06,
"loss": 0.9275,
"step": 37
},
{
"epoch": 0.023082763857251328,
"grad_norm": 2.713661141656442,
"learning_rate": 2.2424242424242428e-06,
"loss": 0.9474,
"step": 38
},
{
"epoch": 0.02369020501138952,
"grad_norm": 3.6449984569878833,
"learning_rate": 2.303030303030303e-06,
"loss": 0.9734,
"step": 39
},
{
"epoch": 0.024297646165527716,
"grad_norm": 2.810830857915748,
"learning_rate": 2.363636363636364e-06,
"loss": 0.8991,
"step": 40
},
{
"epoch": 0.024905087319665908,
"grad_norm": 2.567407308011712,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.8963,
"step": 41
},
{
"epoch": 0.0255125284738041,
"grad_norm": 2.637128320235312,
"learning_rate": 2.4848484848484848e-06,
"loss": 0.8778,
"step": 42
},
{
"epoch": 0.026119969627942292,
"grad_norm": 3.0903128789155754,
"learning_rate": 2.5454545454545456e-06,
"loss": 0.8738,
"step": 43
},
{
"epoch": 0.026727410782080487,
"grad_norm": 2.6726103064757214,
"learning_rate": 2.6060606060606064e-06,
"loss": 0.8107,
"step": 44
},
{
"epoch": 0.02733485193621868,
"grad_norm": 2.4879173840412006,
"learning_rate": 2.666666666666667e-06,
"loss": 0.8709,
"step": 45
},
{
"epoch": 0.02794229309035687,
"grad_norm": 2.7470444516164294,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.8008,
"step": 46
},
{
"epoch": 0.028549734244495063,
"grad_norm": 3.0832323670913566,
"learning_rate": 2.7878787878787885e-06,
"loss": 0.8585,
"step": 47
},
{
"epoch": 0.02915717539863326,
"grad_norm": 2.3479372726052334,
"learning_rate": 2.848484848484849e-06,
"loss": 0.799,
"step": 48
},
{
"epoch": 0.02976461655277145,
"grad_norm": 3.333892504748674,
"learning_rate": 2.9090909090909093e-06,
"loss": 0.7643,
"step": 49
},
{
"epoch": 0.030372057706909643,
"grad_norm": 4.005716160822946,
"learning_rate": 2.96969696969697e-06,
"loss": 0.7075,
"step": 50
},
{
"epoch": 0.030979498861047835,
"grad_norm": 2.9637475493256393,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.752,
"step": 51
},
{
"epoch": 0.03158694001518603,
"grad_norm": 6.448028143148418,
"learning_rate": 3.090909090909091e-06,
"loss": 0.7298,
"step": 52
},
{
"epoch": 0.03219438116932422,
"grad_norm": 2.5226109840726267,
"learning_rate": 3.1515151515151517e-06,
"loss": 0.6652,
"step": 53
},
{
"epoch": 0.03280182232346242,
"grad_norm": 2.493123673145167,
"learning_rate": 3.2121212121212125e-06,
"loss": 0.6731,
"step": 54
},
{
"epoch": 0.033409263477600606,
"grad_norm": 2.605987670786732,
"learning_rate": 3.272727272727273e-06,
"loss": 0.7381,
"step": 55
},
{
"epoch": 0.0340167046317388,
"grad_norm": 2.29936704850675,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.6703,
"step": 56
},
{
"epoch": 0.03462414578587699,
"grad_norm": 5.095296867406457,
"learning_rate": 3.3939393939393946e-06,
"loss": 0.6278,
"step": 57
},
{
"epoch": 0.035231586940015186,
"grad_norm": 4.103295631128622,
"learning_rate": 3.454545454545455e-06,
"loss": 0.6501,
"step": 58
},
{
"epoch": 0.03583902809415338,
"grad_norm": 3.266586264244061,
"learning_rate": 3.5151515151515154e-06,
"loss": 0.6557,
"step": 59
},
{
"epoch": 0.03644646924829157,
"grad_norm": 1.996000103297209,
"learning_rate": 3.575757575757576e-06,
"loss": 0.605,
"step": 60
},
{
"epoch": 0.037053910402429766,
"grad_norm": 2.616817368318823,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.6313,
"step": 61
},
{
"epoch": 0.037661351556567954,
"grad_norm": 2.977002184120951,
"learning_rate": 3.6969696969696974e-06,
"loss": 0.6025,
"step": 62
},
{
"epoch": 0.03826879271070615,
"grad_norm": 1.8839479352203694,
"learning_rate": 3.757575757575758e-06,
"loss": 0.5803,
"step": 63
},
{
"epoch": 0.038876233864844345,
"grad_norm": 1.8084677913612675,
"learning_rate": 3.818181818181819e-06,
"loss": 0.6036,
"step": 64
},
{
"epoch": 0.039483675018982534,
"grad_norm": 3.298041724732867,
"learning_rate": 3.878787878787879e-06,
"loss": 0.5619,
"step": 65
},
{
"epoch": 0.04009111617312073,
"grad_norm": 3.210051259243264,
"learning_rate": 3.93939393939394e-06,
"loss": 0.6117,
"step": 66
},
{
"epoch": 0.040698557327258925,
"grad_norm": 1.7548473190282035,
"learning_rate": 4.000000000000001e-06,
"loss": 0.6115,
"step": 67
},
{
"epoch": 0.04130599848139711,
"grad_norm": 4.027003074298192,
"learning_rate": 4.060606060606061e-06,
"loss": 0.555,
"step": 68
},
{
"epoch": 0.04191343963553531,
"grad_norm": 1.7049695945671344,
"learning_rate": 4.1212121212121215e-06,
"loss": 0.5281,
"step": 69
},
{
"epoch": 0.0425208807896735,
"grad_norm": 1.9295753107848288,
"learning_rate": 4.181818181818182e-06,
"loss": 0.4976,
"step": 70
},
{
"epoch": 0.04312832194381169,
"grad_norm": 5.6724940525137315,
"learning_rate": 4.242424242424243e-06,
"loss": 0.5599,
"step": 71
},
{
"epoch": 0.04373576309794989,
"grad_norm": 2.5165077043031925,
"learning_rate": 4.303030303030303e-06,
"loss": 0.5043,
"step": 72
},
{
"epoch": 0.04434320425208808,
"grad_norm": 2.7600416718167438,
"learning_rate": 4.363636363636364e-06,
"loss": 0.5835,
"step": 73
},
{
"epoch": 0.04495064540622627,
"grad_norm": 4.577709098881687,
"learning_rate": 4.424242424242425e-06,
"loss": 0.5495,
"step": 74
},
{
"epoch": 0.04555808656036447,
"grad_norm": 3.185211056113932,
"learning_rate": 4.4848484848484855e-06,
"loss": 0.5371,
"step": 75
},
{
"epoch": 0.046165527714502656,
"grad_norm": 1.7788256477959088,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.4778,
"step": 76
},
{
"epoch": 0.04677296886864085,
"grad_norm": 2.0763650855771254,
"learning_rate": 4.606060606060606e-06,
"loss": 0.4913,
"step": 77
},
{
"epoch": 0.04738041002277904,
"grad_norm": 2.1625612516573605,
"learning_rate": 4.666666666666667e-06,
"loss": 0.453,
"step": 78
},
{
"epoch": 0.047987851176917236,
"grad_norm": 2.186861003935316,
"learning_rate": 4.727272727272728e-06,
"loss": 0.559,
"step": 79
},
{
"epoch": 0.04859529233105543,
"grad_norm": 2.7651744089253056,
"learning_rate": 4.787878787878788e-06,
"loss": 0.4996,
"step": 80
},
{
"epoch": 0.04920273348519362,
"grad_norm": 1.8551809178974643,
"learning_rate": 4.848484848484849e-06,
"loss": 0.5501,
"step": 81
},
{
"epoch": 0.049810174639331815,
"grad_norm": 1.8080650695028413,
"learning_rate": 4.90909090909091e-06,
"loss": 0.5637,
"step": 82
},
{
"epoch": 0.05041761579347001,
"grad_norm": 1.830828959106882,
"learning_rate": 4.9696969696969696e-06,
"loss": 0.4904,
"step": 83
},
{
"epoch": 0.0510250569476082,
"grad_norm": 1.7016759327222268,
"learning_rate": 5.030303030303031e-06,
"loss": 0.485,
"step": 84
},
{
"epoch": 0.051632498101746395,
"grad_norm": 1.5379148608918123,
"learning_rate": 5.090909090909091e-06,
"loss": 0.4898,
"step": 85
},
{
"epoch": 0.052239939255884583,
"grad_norm": 1.8661306362602696,
"learning_rate": 5.151515151515152e-06,
"loss": 0.4896,
"step": 86
},
{
"epoch": 0.05284738041002278,
"grad_norm": 2.0602919860599482,
"learning_rate": 5.212121212121213e-06,
"loss": 0.4677,
"step": 87
},
{
"epoch": 0.053454821564160974,
"grad_norm": 1.772568608105656,
"learning_rate": 5.272727272727273e-06,
"loss": 0.4233,
"step": 88
},
{
"epoch": 0.05406226271829916,
"grad_norm": 1.6245327242364143,
"learning_rate": 5.333333333333334e-06,
"loss": 0.4784,
"step": 89
},
{
"epoch": 0.05466970387243736,
"grad_norm": 1.4521657126765681,
"learning_rate": 5.3939393939393945e-06,
"loss": 0.4646,
"step": 90
},
{
"epoch": 0.05527714502657555,
"grad_norm": 6.093340972704043,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.4966,
"step": 91
},
{
"epoch": 0.05588458618071374,
"grad_norm": 3.72118634286479,
"learning_rate": 5.515151515151515e-06,
"loss": 0.5535,
"step": 92
},
{
"epoch": 0.05649202733485194,
"grad_norm": 2.058084505025063,
"learning_rate": 5.575757575757577e-06,
"loss": 0.4948,
"step": 93
},
{
"epoch": 0.05709946848899013,
"grad_norm": 1.6056271512619456,
"learning_rate": 5.636363636363636e-06,
"loss": 0.4807,
"step": 94
},
{
"epoch": 0.05770690964312832,
"grad_norm": 2.583298049219201,
"learning_rate": 5.696969696969698e-06,
"loss": 0.4591,
"step": 95
},
{
"epoch": 0.05831435079726652,
"grad_norm": 1.5715554881698062,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.477,
"step": 96
},
{
"epoch": 0.058921791951404706,
"grad_norm": 2.8917057921549425,
"learning_rate": 5.8181818181818185e-06,
"loss": 0.5028,
"step": 97
},
{
"epoch": 0.0595292331055429,
"grad_norm": 1.620684043715009,
"learning_rate": 5.878787878787879e-06,
"loss": 0.4828,
"step": 98
},
{
"epoch": 0.06013667425968109,
"grad_norm": 1.375161799406732,
"learning_rate": 5.93939393939394e-06,
"loss": 0.4249,
"step": 99
},
{
"epoch": 0.060744115413819286,
"grad_norm": 1.4172846210611314,
"learning_rate": 6e-06,
"loss": 0.3987,
"step": 100
},
{
"epoch": 0.06135155656795748,
"grad_norm": 1.8383152351112293,
"learning_rate": 6.060606060606061e-06,
"loss": 0.459,
"step": 101
},
{
"epoch": 0.06195899772209567,
"grad_norm": 1.609009063333299,
"learning_rate": 6.121212121212121e-06,
"loss": 0.4439,
"step": 102
},
{
"epoch": 0.06256643887623387,
"grad_norm": 2.8231836394185734,
"learning_rate": 6.181818181818182e-06,
"loss": 0.529,
"step": 103
},
{
"epoch": 0.06317388003037205,
"grad_norm": 1.4932765424378436,
"learning_rate": 6.2424242424242434e-06,
"loss": 0.4752,
"step": 104
},
{
"epoch": 0.06378132118451026,
"grad_norm": 1.4265072751982764,
"learning_rate": 6.303030303030303e-06,
"loss": 0.4559,
"step": 105
},
{
"epoch": 0.06438876233864844,
"grad_norm": 1.7232201589379095,
"learning_rate": 6.363636363636364e-06,
"loss": 0.4081,
"step": 106
},
{
"epoch": 0.06499620349278663,
"grad_norm": 1.9090397829519035,
"learning_rate": 6.424242424242425e-06,
"loss": 0.4906,
"step": 107
},
{
"epoch": 0.06560364464692484,
"grad_norm": 5.566791403419221,
"learning_rate": 6.484848484848485e-06,
"loss": 0.5131,
"step": 108
},
{
"epoch": 0.06621108580106302,
"grad_norm": 2.1515254306159153,
"learning_rate": 6.545454545454546e-06,
"loss": 0.4312,
"step": 109
},
{
"epoch": 0.06681852695520121,
"grad_norm": 1.53243371760272,
"learning_rate": 6.606060606060607e-06,
"loss": 0.4083,
"step": 110
},
{
"epoch": 0.0674259681093394,
"grad_norm": 1.5854107306879113,
"learning_rate": 6.666666666666667e-06,
"loss": 0.4338,
"step": 111
},
{
"epoch": 0.0680334092634776,
"grad_norm": 1.7854671288961894,
"learning_rate": 6.7272727272727275e-06,
"loss": 0.4259,
"step": 112
},
{
"epoch": 0.06864085041761579,
"grad_norm": 1.603406823497176,
"learning_rate": 6.787878787878789e-06,
"loss": 0.4359,
"step": 113
},
{
"epoch": 0.06924829157175398,
"grad_norm": 1.4707223980643627,
"learning_rate": 6.848484848484849e-06,
"loss": 0.4215,
"step": 114
},
{
"epoch": 0.06985573272589218,
"grad_norm": 1.6534737889131494,
"learning_rate": 6.90909090909091e-06,
"loss": 0.4414,
"step": 115
},
{
"epoch": 0.07046317388003037,
"grad_norm": 1.532739371631904,
"learning_rate": 6.969696969696971e-06,
"loss": 0.4297,
"step": 116
},
{
"epoch": 0.07107061503416856,
"grad_norm": 1.4297734448243231,
"learning_rate": 7.030303030303031e-06,
"loss": 0.4479,
"step": 117
},
{
"epoch": 0.07167805618830676,
"grad_norm": 1.4036156057391842,
"learning_rate": 7.0909090909090916e-06,
"loss": 0.4278,
"step": 118
},
{
"epoch": 0.07228549734244495,
"grad_norm": 1.6693103514376786,
"learning_rate": 7.151515151515152e-06,
"loss": 0.4375,
"step": 119
},
{
"epoch": 0.07289293849658314,
"grad_norm": 1.8164935073923107,
"learning_rate": 7.212121212121212e-06,
"loss": 0.3987,
"step": 120
},
{
"epoch": 0.07350037965072134,
"grad_norm": 1.6026752209068431,
"learning_rate": 7.272727272727273e-06,
"loss": 0.4095,
"step": 121
},
{
"epoch": 0.07410782080485953,
"grad_norm": 1.652701468229036,
"learning_rate": 7.333333333333333e-06,
"loss": 0.4165,
"step": 122
},
{
"epoch": 0.07471526195899772,
"grad_norm": 1.5322103898261112,
"learning_rate": 7.393939393939395e-06,
"loss": 0.4534,
"step": 123
},
{
"epoch": 0.07532270311313591,
"grad_norm": 1.5253210647459732,
"learning_rate": 7.454545454545456e-06,
"loss": 0.4315,
"step": 124
},
{
"epoch": 0.07593014426727411,
"grad_norm": 1.774830478931673,
"learning_rate": 7.515151515151516e-06,
"loss": 0.4044,
"step": 125
},
{
"epoch": 0.0765375854214123,
"grad_norm": 1.887705476184069,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.4153,
"step": 126
},
{
"epoch": 0.07714502657555049,
"grad_norm": 2.38958143214645,
"learning_rate": 7.636363636363638e-06,
"loss": 0.3912,
"step": 127
},
{
"epoch": 0.07775246772968869,
"grad_norm": 1.6032007121408827,
"learning_rate": 7.696969696969696e-06,
"loss": 0.3989,
"step": 128
},
{
"epoch": 0.07835990888382688,
"grad_norm": 1.3782945465938736,
"learning_rate": 7.757575757575758e-06,
"loss": 0.4357,
"step": 129
},
{
"epoch": 0.07896735003796507,
"grad_norm": 1.332527605688991,
"learning_rate": 7.81818181818182e-06,
"loss": 0.3616,
"step": 130
},
{
"epoch": 0.07957479119210327,
"grad_norm": 1.8131621493966852,
"learning_rate": 7.87878787878788e-06,
"loss": 0.4498,
"step": 131
},
{
"epoch": 0.08018223234624146,
"grad_norm": 4.745812951213747,
"learning_rate": 7.93939393939394e-06,
"loss": 0.4247,
"step": 132
},
{
"epoch": 0.08078967350037965,
"grad_norm": 2.294695141070265,
"learning_rate": 8.000000000000001e-06,
"loss": 0.3744,
"step": 133
},
{
"epoch": 0.08139711465451785,
"grad_norm": 1.6356983798797735,
"learning_rate": 8.060606060606061e-06,
"loss": 0.4373,
"step": 134
},
{
"epoch": 0.08200455580865604,
"grad_norm": 1.9911745307305873,
"learning_rate": 8.121212121212121e-06,
"loss": 0.4294,
"step": 135
},
{
"epoch": 0.08261199696279423,
"grad_norm": 1.3737076521844085,
"learning_rate": 8.181818181818183e-06,
"loss": 0.4106,
"step": 136
},
{
"epoch": 0.08321943811693243,
"grad_norm": 1.5919226800718935,
"learning_rate": 8.242424242424243e-06,
"loss": 0.4537,
"step": 137
},
{
"epoch": 0.08382687927107062,
"grad_norm": 1.4783198785793517,
"learning_rate": 8.303030303030305e-06,
"loss": 0.3788,
"step": 138
},
{
"epoch": 0.0844343204252088,
"grad_norm": 1.6410317800155916,
"learning_rate": 8.363636363636365e-06,
"loss": 0.3867,
"step": 139
},
{
"epoch": 0.085041761579347,
"grad_norm": 1.6735556190365386,
"learning_rate": 8.424242424242425e-06,
"loss": 0.4355,
"step": 140
},
{
"epoch": 0.0856492027334852,
"grad_norm": 1.5607650619109157,
"learning_rate": 8.484848484848486e-06,
"loss": 0.4044,
"step": 141
},
{
"epoch": 0.08625664388762339,
"grad_norm": 1.4477035531105342,
"learning_rate": 8.545454545454546e-06,
"loss": 0.4603,
"step": 142
},
{
"epoch": 0.08686408504176157,
"grad_norm": 2.315589018866609,
"learning_rate": 8.606060606060606e-06,
"loss": 0.3701,
"step": 143
},
{
"epoch": 0.08747152619589978,
"grad_norm": 1.282913362768846,
"learning_rate": 8.666666666666668e-06,
"loss": 0.4052,
"step": 144
},
{
"epoch": 0.08807896735003796,
"grad_norm": 1.441795861741466,
"learning_rate": 8.727272727272728e-06,
"loss": 0.3743,
"step": 145
},
{
"epoch": 0.08868640850417615,
"grad_norm": 1.6705399490739978,
"learning_rate": 8.787878787878788e-06,
"loss": 0.4156,
"step": 146
},
{
"epoch": 0.08929384965831436,
"grad_norm": 1.3022330139774325,
"learning_rate": 8.84848484848485e-06,
"loss": 0.403,
"step": 147
},
{
"epoch": 0.08990129081245254,
"grad_norm": 1.3289138423968723,
"learning_rate": 8.90909090909091e-06,
"loss": 0.4241,
"step": 148
},
{
"epoch": 0.09050873196659073,
"grad_norm": 1.9110745404266152,
"learning_rate": 8.969696969696971e-06,
"loss": 0.3843,
"step": 149
},
{
"epoch": 0.09111617312072894,
"grad_norm": 1.4735930505959485,
"learning_rate": 9.030303030303031e-06,
"loss": 0.3963,
"step": 150
},
{
"epoch": 0.09172361427486712,
"grad_norm": 1.542399354189347,
"learning_rate": 9.090909090909091e-06,
"loss": 0.3909,
"step": 151
},
{
"epoch": 0.09233105542900531,
"grad_norm": 3.7339191436590244,
"learning_rate": 9.151515151515153e-06,
"loss": 0.3864,
"step": 152
},
{
"epoch": 0.0929384965831435,
"grad_norm": 1.3925338316644487,
"learning_rate": 9.212121212121213e-06,
"loss": 0.3766,
"step": 153
},
{
"epoch": 0.0935459377372817,
"grad_norm": 1.4555873503122925,
"learning_rate": 9.272727272727273e-06,
"loss": 0.352,
"step": 154
},
{
"epoch": 0.09415337889141989,
"grad_norm": 1.5097093818651814,
"learning_rate": 9.333333333333334e-06,
"loss": 0.3938,
"step": 155
},
{
"epoch": 0.09476082004555808,
"grad_norm": 1.5316324465377855,
"learning_rate": 9.393939393939396e-06,
"loss": 0.3722,
"step": 156
},
{
"epoch": 0.09536826119969628,
"grad_norm": 1.3962340669474915,
"learning_rate": 9.454545454545456e-06,
"loss": 0.4142,
"step": 157
},
{
"epoch": 0.09597570235383447,
"grad_norm": 1.2746895088209749,
"learning_rate": 9.515151515151516e-06,
"loss": 0.3723,
"step": 158
},
{
"epoch": 0.09658314350797266,
"grad_norm": 1.818570241099621,
"learning_rate": 9.575757575757576e-06,
"loss": 0.4198,
"step": 159
},
{
"epoch": 0.09719058466211086,
"grad_norm": 1.2766595136162489,
"learning_rate": 9.636363636363638e-06,
"loss": 0.3832,
"step": 160
},
{
"epoch": 0.09779802581624905,
"grad_norm": 1.2291199390553869,
"learning_rate": 9.696969696969698e-06,
"loss": 0.3825,
"step": 161
},
{
"epoch": 0.09840546697038724,
"grad_norm": 1.316405861184458,
"learning_rate": 9.757575757575758e-06,
"loss": 0.3418,
"step": 162
},
{
"epoch": 0.09901290812452544,
"grad_norm": 1.9801803743837283,
"learning_rate": 9.81818181818182e-06,
"loss": 0.3554,
"step": 163
},
{
"epoch": 0.09962034927866363,
"grad_norm": 1.3735171287256052,
"learning_rate": 9.87878787878788e-06,
"loss": 0.4433,
"step": 164
},
{
"epoch": 0.10022779043280182,
"grad_norm": 1.548447451053059,
"learning_rate": 9.939393939393939e-06,
"loss": 0.3988,
"step": 165
},
{
"epoch": 0.10083523158694002,
"grad_norm": 1.9424963562307047,
"learning_rate": 1e-05,
"loss": 0.4318,
"step": 166
},
{
"epoch": 0.10144267274107821,
"grad_norm": 1.4796993703531827,
"learning_rate": 9.999988765773283e-06,
"loss": 0.3901,
"step": 167
},
{
"epoch": 0.1020501138952164,
"grad_norm": 2.718931345910638,
"learning_rate": 9.99995506314361e-06,
"loss": 0.3733,
"step": 168
},
{
"epoch": 0.10265755504935459,
"grad_norm": 1.5993151857382264,
"learning_rate": 9.999898892262433e-06,
"loss": 0.3702,
"step": 169
},
{
"epoch": 0.10326499620349279,
"grad_norm": 2.0474737975378914,
"learning_rate": 9.99982025338217e-06,
"loss": 0.3925,
"step": 170
},
{
"epoch": 0.10387243735763098,
"grad_norm": 1.36801258921155,
"learning_rate": 9.999719146856191e-06,
"loss": 0.3601,
"step": 171
},
{
"epoch": 0.10447987851176917,
"grad_norm": 2.62979787342245,
"learning_rate": 9.999595573138845e-06,
"loss": 0.3676,
"step": 172
},
{
"epoch": 0.10508731966590737,
"grad_norm": 1.651611235990868,
"learning_rate": 9.99944953278543e-06,
"loss": 0.3896,
"step": 173
},
{
"epoch": 0.10569476082004556,
"grad_norm": 1.537437823682991,
"learning_rate": 9.99928102645221e-06,
"loss": 0.3931,
"step": 174
},
{
"epoch": 0.10630220197418375,
"grad_norm": 1.483529775109398,
"learning_rate": 9.999090054896397e-06,
"loss": 0.4084,
"step": 175
},
{
"epoch": 0.10690964312832195,
"grad_norm": 1.846243435186876,
"learning_rate": 9.99887661897616e-06,
"loss": 0.4019,
"step": 176
},
{
"epoch": 0.10751708428246014,
"grad_norm": 1.9400365455774309,
"learning_rate": 9.998640719650609e-06,
"loss": 0.353,
"step": 177
},
{
"epoch": 0.10812452543659833,
"grad_norm": 1.6331047828876988,
"learning_rate": 9.99838235797981e-06,
"loss": 0.3934,
"step": 178
},
{
"epoch": 0.10873196659073653,
"grad_norm": 1.3201503553736853,
"learning_rate": 9.998101535124758e-06,
"loss": 0.3784,
"step": 179
},
{
"epoch": 0.10933940774487472,
"grad_norm": 1.5036507542871547,
"learning_rate": 9.997798252347382e-06,
"loss": 0.3829,
"step": 180
},
{
"epoch": 0.1099468488990129,
"grad_norm": 1.8873362444241242,
"learning_rate": 9.997472511010543e-06,
"loss": 0.3468,
"step": 181
},
{
"epoch": 0.1105542900531511,
"grad_norm": 1.7817085438588727,
"learning_rate": 9.99712431257802e-06,
"loss": 0.3942,
"step": 182
},
{
"epoch": 0.1111617312072893,
"grad_norm": 1.4579688480639608,
"learning_rate": 9.99675365861451e-06,
"loss": 0.3493,
"step": 183
},
{
"epoch": 0.11176917236142749,
"grad_norm": 1.3333541821120902,
"learning_rate": 9.996360550785619e-06,
"loss": 0.3748,
"step": 184
},
{
"epoch": 0.11237661351556567,
"grad_norm": 1.4462662425656174,
"learning_rate": 9.995944990857848e-06,
"loss": 0.3929,
"step": 185
},
{
"epoch": 0.11298405466970388,
"grad_norm": 2.8011059160646785,
"learning_rate": 9.9955069806986e-06,
"loss": 0.3832,
"step": 186
},
{
"epoch": 0.11359149582384206,
"grad_norm": 1.795646163613219,
"learning_rate": 9.995046522276152e-06,
"loss": 0.3726,
"step": 187
},
{
"epoch": 0.11419893697798025,
"grad_norm": 1.6154980564971173,
"learning_rate": 9.994563617659665e-06,
"loss": 0.4186,
"step": 188
},
{
"epoch": 0.11480637813211846,
"grad_norm": 4.098449801468239,
"learning_rate": 9.994058269019163e-06,
"loss": 0.3649,
"step": 189
},
{
"epoch": 0.11541381928625664,
"grad_norm": 2.0366760130819452,
"learning_rate": 9.993530478625524e-06,
"loss": 0.3889,
"step": 190
},
{
"epoch": 0.11602126044039483,
"grad_norm": 1.3683556243295762,
"learning_rate": 9.992980248850476e-06,
"loss": 0.3505,
"step": 191
},
{
"epoch": 0.11662870159453304,
"grad_norm": 1.631109364001529,
"learning_rate": 9.992407582166582e-06,
"loss": 0.3716,
"step": 192
},
{
"epoch": 0.11723614274867122,
"grad_norm": 1.2863914949680042,
"learning_rate": 9.99181248114723e-06,
"loss": 0.353,
"step": 193
},
{
"epoch": 0.11784358390280941,
"grad_norm": 2.3770367523137512,
"learning_rate": 9.991194948466615e-06,
"loss": 0.3212,
"step": 194
},
{
"epoch": 0.11845102505694761,
"grad_norm": 1.251810621948992,
"learning_rate": 9.990554986899745e-06,
"loss": 0.3383,
"step": 195
},
{
"epoch": 0.1190584662110858,
"grad_norm": 1.3642089088744158,
"learning_rate": 9.989892599322404e-06,
"loss": 0.4176,
"step": 196
},
{
"epoch": 0.11966590736522399,
"grad_norm": 1.6922372698689354,
"learning_rate": 9.98920778871116e-06,
"loss": 0.3655,
"step": 197
},
{
"epoch": 0.12027334851936218,
"grad_norm": 3.407996768283883,
"learning_rate": 9.988500558143337e-06,
"loss": 0.3844,
"step": 198
},
{
"epoch": 0.12088078967350038,
"grad_norm": 1.609402206597202,
"learning_rate": 9.987770910797014e-06,
"loss": 0.4128,
"step": 199
},
{
"epoch": 0.12148823082763857,
"grad_norm": 1.3140316757802264,
"learning_rate": 9.987018849950996e-06,
"loss": 0.3962,
"step": 200
},
{
"epoch": 0.12209567198177676,
"grad_norm": 1.7469090493735866,
"learning_rate": 9.986244378984817e-06,
"loss": 0.3691,
"step": 201
},
{
"epoch": 0.12270311313591496,
"grad_norm": 1.33925380250546,
"learning_rate": 9.985447501378706e-06,
"loss": 0.3566,
"step": 202
},
{
"epoch": 0.12331055429005315,
"grad_norm": 1.809637543111782,
"learning_rate": 9.984628220713587e-06,
"loss": 0.33,
"step": 203
},
{
"epoch": 0.12391799544419134,
"grad_norm": 8.810976676413983,
"learning_rate": 9.983786540671052e-06,
"loss": 0.3745,
"step": 204
},
{
"epoch": 0.12452543659832954,
"grad_norm": 1.5678141251242255,
"learning_rate": 9.98292246503335e-06,
"loss": 0.3863,
"step": 205
},
{
"epoch": 0.12513287775246773,
"grad_norm": 1.2444290855149283,
"learning_rate": 9.982035997683372e-06,
"loss": 0.3449,
"step": 206
},
{
"epoch": 0.12574031890660592,
"grad_norm": 1.262107824746014,
"learning_rate": 9.981127142604628e-06,
"loss": 0.3553,
"step": 207
},
{
"epoch": 0.1263477600607441,
"grad_norm": 2.261971867644215,
"learning_rate": 9.980195903881231e-06,
"loss": 0.3929,
"step": 208
},
{
"epoch": 0.1269552012148823,
"grad_norm": 1.8939164327645688,
"learning_rate": 9.979242285697878e-06,
"loss": 0.3894,
"step": 209
},
{
"epoch": 0.1275626423690205,
"grad_norm": 1.3109993122933488,
"learning_rate": 9.978266292339838e-06,
"loss": 0.3462,
"step": 210
},
{
"epoch": 0.1281700835231587,
"grad_norm": 1.2689263598262492,
"learning_rate": 9.97726792819292e-06,
"loss": 0.3756,
"step": 211
},
{
"epoch": 0.1287775246772969,
"grad_norm": 1.413911853511702,
"learning_rate": 9.976247197743465e-06,
"loss": 0.3517,
"step": 212
},
{
"epoch": 0.12938496583143508,
"grad_norm": 1.1477594568471046,
"learning_rate": 9.975204105578318e-06,
"loss": 0.3524,
"step": 213
},
{
"epoch": 0.12999240698557327,
"grad_norm": 1.3268983856893921,
"learning_rate": 9.974138656384815e-06,
"loss": 0.3703,
"step": 214
},
{
"epoch": 0.13059984813971146,
"grad_norm": 1.6085096632167164,
"learning_rate": 9.973050854950756e-06,
"loss": 0.4107,
"step": 215
},
{
"epoch": 0.13120728929384967,
"grad_norm": 1.2710644494087506,
"learning_rate": 9.97194070616438e-06,
"loss": 0.3704,
"step": 216
},
{
"epoch": 0.13181473044798786,
"grad_norm": 1.7007435080900732,
"learning_rate": 9.970808215014357e-06,
"loss": 0.3616,
"step": 217
},
{
"epoch": 0.13242217160212605,
"grad_norm": 2.8035141258704965,
"learning_rate": 9.969653386589749e-06,
"loss": 0.3476,
"step": 218
},
{
"epoch": 0.13302961275626424,
"grad_norm": 1.4530105606551262,
"learning_rate": 9.968476226079997e-06,
"loss": 0.3658,
"step": 219
},
{
"epoch": 0.13363705391040243,
"grad_norm": 1.365931702075457,
"learning_rate": 9.967276738774897e-06,
"loss": 0.3559,
"step": 220
},
{
"epoch": 0.13424449506454061,
"grad_norm": 2.4105147491908845,
"learning_rate": 9.966054930064577e-06,
"loss": 0.3464,
"step": 221
},
{
"epoch": 0.1348519362186788,
"grad_norm": 1.460590918478701,
"learning_rate": 9.964810805439464e-06,
"loss": 0.3835,
"step": 222
},
{
"epoch": 0.13545937737281702,
"grad_norm": 1.4592160813082191,
"learning_rate": 9.96354437049027e-06,
"loss": 0.3649,
"step": 223
},
{
"epoch": 0.1360668185269552,
"grad_norm": 1.3706484221475523,
"learning_rate": 9.962255630907964e-06,
"loss": 0.306,
"step": 224
},
{
"epoch": 0.1366742596810934,
"grad_norm": 1.9923965820088605,
"learning_rate": 9.96094459248374e-06,
"loss": 0.4094,
"step": 225
},
{
"epoch": 0.13728170083523158,
"grad_norm": 1.212563998181567,
"learning_rate": 9.959611261108999e-06,
"loss": 0.3601,
"step": 226
},
{
"epoch": 0.13788914198936977,
"grad_norm": 1.0692124626260124,
"learning_rate": 9.95825564277532e-06,
"loss": 0.3532,
"step": 227
},
{
"epoch": 0.13849658314350796,
"grad_norm": 1.1033511924540673,
"learning_rate": 9.956877743574437e-06,
"loss": 0.3384,
"step": 228
},
{
"epoch": 0.13910402429764618,
"grad_norm": 1.2707398286483838,
"learning_rate": 9.955477569698197e-06,
"loss": 0.3367,
"step": 229
},
{
"epoch": 0.13971146545178437,
"grad_norm": 1.405772249647566,
"learning_rate": 9.954055127438554e-06,
"loss": 0.3673,
"step": 230
},
{
"epoch": 0.14031890660592256,
"grad_norm": 1.8473504788016004,
"learning_rate": 9.952610423187516e-06,
"loss": 0.4095,
"step": 231
},
{
"epoch": 0.14092634776006074,
"grad_norm": 1.4016757128131039,
"learning_rate": 9.951143463437145e-06,
"loss": 0.3503,
"step": 232
},
{
"epoch": 0.14153378891419893,
"grad_norm": 1.3608962633812787,
"learning_rate": 9.949654254779499e-06,
"loss": 0.3897,
"step": 233
},
{
"epoch": 0.14214123006833712,
"grad_norm": 1.2731854733891155,
"learning_rate": 9.948142803906623e-06,
"loss": 0.3596,
"step": 234
},
{
"epoch": 0.1427486712224753,
"grad_norm": 2.744204155126872,
"learning_rate": 9.946609117610508e-06,
"loss": 0.3311,
"step": 235
},
{
"epoch": 0.14335611237661353,
"grad_norm": 1.4126443556826698,
"learning_rate": 9.94505320278307e-06,
"loss": 0.3417,
"step": 236
},
{
"epoch": 0.14396355353075171,
"grad_norm": 1.8771416206071374,
"learning_rate": 9.943475066416105e-06,
"loss": 0.3246,
"step": 237
},
{
"epoch": 0.1445709946848899,
"grad_norm": 1.662514482004372,
"learning_rate": 9.94187471560127e-06,
"loss": 0.336,
"step": 238
},
{
"epoch": 0.1451784358390281,
"grad_norm": 1.3252537752081341,
"learning_rate": 9.940252157530048e-06,
"loss": 0.3728,
"step": 239
},
{
"epoch": 0.14578587699316628,
"grad_norm": 1.4379575767599655,
"learning_rate": 9.938607399493714e-06,
"loss": 0.3349,
"step": 240
},
{
"epoch": 0.14639331814730447,
"grad_norm": 1.7085189808679144,
"learning_rate": 9.936940448883299e-06,
"loss": 0.3732,
"step": 241
},
{
"epoch": 0.14700075930144268,
"grad_norm": 1.3475867031172948,
"learning_rate": 9.935251313189564e-06,
"loss": 0.3614,
"step": 242
},
{
"epoch": 0.14760820045558087,
"grad_norm": 1.3757718265613308,
"learning_rate": 9.933540000002966e-06,
"loss": 0.3495,
"step": 243
},
{
"epoch": 0.14821564160971906,
"grad_norm": 2.5569808145548016,
"learning_rate": 9.931806517013612e-06,
"loss": 0.3846,
"step": 244
},
{
"epoch": 0.14882308276385725,
"grad_norm": 2.3675332462526724,
"learning_rate": 9.930050872011242e-06,
"loss": 0.3927,
"step": 245
},
{
"epoch": 0.14943052391799544,
"grad_norm": 1.257087872816816,
"learning_rate": 9.92827307288518e-06,
"loss": 0.347,
"step": 246
},
{
"epoch": 0.15003796507213363,
"grad_norm": 1.1999532829094337,
"learning_rate": 9.926473127624306e-06,
"loss": 0.3099,
"step": 247
},
{
"epoch": 0.15064540622627182,
"grad_norm": 1.3143658832484064,
"learning_rate": 9.924651044317017e-06,
"loss": 0.3476,
"step": 248
},
{
"epoch": 0.15125284738041003,
"grad_norm": 1.2916975919820823,
"learning_rate": 9.922806831151192e-06,
"loss": 0.3829,
"step": 249
},
{
"epoch": 0.15186028853454822,
"grad_norm": 1.3792160543399081,
"learning_rate": 9.920940496414153e-06,
"loss": 0.3414,
"step": 250
},
{
"epoch": 0.1524677296886864,
"grad_norm": 1.552844179932476,
"learning_rate": 9.919052048492633e-06,
"loss": 0.329,
"step": 251
},
{
"epoch": 0.1530751708428246,
"grad_norm": 1.2281468240128537,
"learning_rate": 9.917141495872733e-06,
"loss": 0.3112,
"step": 252
},
{
"epoch": 0.1536826119969628,
"grad_norm": 1.2302352301120831,
"learning_rate": 9.915208847139883e-06,
"loss": 0.3576,
"step": 253
},
{
"epoch": 0.15429005315110098,
"grad_norm": 1.618659315298819,
"learning_rate": 9.913254110978812e-06,
"loss": 0.3669,
"step": 254
},
{
"epoch": 0.1548974943052392,
"grad_norm": 1.8288474552445757,
"learning_rate": 9.911277296173498e-06,
"loss": 0.3572,
"step": 255
},
{
"epoch": 0.15550493545937738,
"grad_norm": 1.1302723791543823,
"learning_rate": 9.909278411607134e-06,
"loss": 0.3432,
"step": 256
},
{
"epoch": 0.15611237661351557,
"grad_norm": 1.1877481410718438,
"learning_rate": 9.90725746626209e-06,
"loss": 0.3096,
"step": 257
},
{
"epoch": 0.15671981776765376,
"grad_norm": 1.168952450453289,
"learning_rate": 9.90521446921987e-06,
"loss": 0.3067,
"step": 258
},
{
"epoch": 0.15732725892179195,
"grad_norm": 1.3157190721989047,
"learning_rate": 9.903149429661072e-06,
"loss": 0.3666,
"step": 259
},
{
"epoch": 0.15793470007593013,
"grad_norm": 1.1390654533947697,
"learning_rate": 9.90106235686534e-06,
"loss": 0.3506,
"step": 260
},
{
"epoch": 0.15854214123006835,
"grad_norm": 4.00051251257636,
"learning_rate": 9.89895326021134e-06,
"loss": 0.3249,
"step": 261
},
{
"epoch": 0.15914958238420654,
"grad_norm": 1.2234262044458446,
"learning_rate": 9.896822149176695e-06,
"loss": 0.3318,
"step": 262
},
{
"epoch": 0.15975702353834473,
"grad_norm": 1.3897253859010028,
"learning_rate": 9.894669033337962e-06,
"loss": 0.396,
"step": 263
},
{
"epoch": 0.16036446469248292,
"grad_norm": 1.7242712965980627,
"learning_rate": 9.892493922370575e-06,
"loss": 0.3188,
"step": 264
},
{
"epoch": 0.1609719058466211,
"grad_norm": 6.4297649243853225,
"learning_rate": 9.89029682604881e-06,
"loss": 0.3379,
"step": 265
},
{
"epoch": 0.1615793470007593,
"grad_norm": 1.1537471226413523,
"learning_rate": 9.888077754245741e-06,
"loss": 0.3493,
"step": 266
},
{
"epoch": 0.16218678815489748,
"grad_norm": 1.6875687617066326,
"learning_rate": 9.88583671693319e-06,
"loss": 0.3608,
"step": 267
},
{
"epoch": 0.1627942293090357,
"grad_norm": 1.2568745648537023,
"learning_rate": 9.883573724181683e-06,
"loss": 0.3795,
"step": 268
},
{
"epoch": 0.1634016704631739,
"grad_norm": 3.273702543597702,
"learning_rate": 9.881288786160413e-06,
"loss": 0.3669,
"step": 269
},
{
"epoch": 0.16400911161731208,
"grad_norm": 1.6681243106945143,
"learning_rate": 9.878981913137178e-06,
"loss": 0.3045,
"step": 270
},
{
"epoch": 0.16461655277145026,
"grad_norm": 1.3040049020132651,
"learning_rate": 9.87665311547836e-06,
"loss": 0.3748,
"step": 271
},
{
"epoch": 0.16522399392558845,
"grad_norm": 1.4186566442108688,
"learning_rate": 9.87430240364885e-06,
"loss": 0.317,
"step": 272
},
{
"epoch": 0.16583143507972664,
"grad_norm": 1.2927229360317918,
"learning_rate": 9.871929788212022e-06,
"loss": 0.3444,
"step": 273
},
{
"epoch": 0.16643887623386486,
"grad_norm": 1.2231558908365099,
"learning_rate": 9.869535279829674e-06,
"loss": 0.3606,
"step": 274
},
{
"epoch": 0.16704631738800305,
"grad_norm": 1.9640328612851339,
"learning_rate": 9.867118889261988e-06,
"loss": 0.3473,
"step": 275
},
{
"epoch": 0.16765375854214123,
"grad_norm": 1.627402530642274,
"learning_rate": 9.864680627367476e-06,
"loss": 0.3278,
"step": 276
},
{
"epoch": 0.16826119969627942,
"grad_norm": 1.2427133272428312,
"learning_rate": 9.862220505102933e-06,
"loss": 0.3521,
"step": 277
},
{
"epoch": 0.1688686408504176,
"grad_norm": 1.3565240591384948,
"learning_rate": 9.859738533523384e-06,
"loss": 0.319,
"step": 278
},
{
"epoch": 0.1694760820045558,
"grad_norm": 1.2107487526492342,
"learning_rate": 9.857234723782044e-06,
"loss": 0.3352,
"step": 279
},
{
"epoch": 0.170083523158694,
"grad_norm": 1.3180319945310117,
"learning_rate": 9.854709087130261e-06,
"loss": 0.3139,
"step": 280
},
{
"epoch": 0.1706909643128322,
"grad_norm": 3.2813293101863916,
"learning_rate": 9.852161634917463e-06,
"loss": 0.3349,
"step": 281
},
{
"epoch": 0.1712984054669704,
"grad_norm": 1.245803428057021,
"learning_rate": 9.849592378591113e-06,
"loss": 0.3077,
"step": 282
},
{
"epoch": 0.17190584662110858,
"grad_norm": 1.223977954013305,
"learning_rate": 9.847001329696653e-06,
"loss": 0.3069,
"step": 283
},
{
"epoch": 0.17251328777524677,
"grad_norm": 1.3069739682646992,
"learning_rate": 9.844388499877457e-06,
"loss": 0.3291,
"step": 284
},
{
"epoch": 0.17312072892938496,
"grad_norm": 1.131190253610025,
"learning_rate": 9.841753900874774e-06,
"loss": 0.3289,
"step": 285
},
{
"epoch": 0.17372817008352315,
"grad_norm": 1.34440810872577,
"learning_rate": 9.839097544527674e-06,
"loss": 0.3267,
"step": 286
},
{
"epoch": 0.17433561123766136,
"grad_norm": 1.3644537366382798,
"learning_rate": 9.836419442773004e-06,
"loss": 0.3443,
"step": 287
},
{
"epoch": 0.17494305239179955,
"grad_norm": 1.2569085833757287,
"learning_rate": 9.833719607645325e-06,
"loss": 0.3241,
"step": 288
},
{
"epoch": 0.17555049354593774,
"grad_norm": 1.2077036555513847,
"learning_rate": 9.830998051276858e-06,
"loss": 0.3541,
"step": 289
},
{
"epoch": 0.17615793470007593,
"grad_norm": 3.9279665228825187,
"learning_rate": 9.82825478589744e-06,
"loss": 0.3666,
"step": 290
},
{
"epoch": 0.17676537585421412,
"grad_norm": 1.3633793584504903,
"learning_rate": 9.825489823834454e-06,
"loss": 0.3162,
"step": 291
},
{
"epoch": 0.1773728170083523,
"grad_norm": 1.190852630500219,
"learning_rate": 9.822703177512783e-06,
"loss": 0.3281,
"step": 292
},
{
"epoch": 0.1779802581624905,
"grad_norm": 1.032612295087311,
"learning_rate": 9.819894859454756e-06,
"loss": 0.2902,
"step": 293
},
{
"epoch": 0.1785876993166287,
"grad_norm": 2.2666599075970058,
"learning_rate": 9.817064882280085e-06,
"loss": 0.3872,
"step": 294
},
{
"epoch": 0.1791951404707669,
"grad_norm": 1.5056944572723148,
"learning_rate": 9.814213258705813e-06,
"loss": 0.4009,
"step": 295
},
{
"epoch": 0.1798025816249051,
"grad_norm": 1.2750096010881427,
"learning_rate": 9.811340001546252e-06,
"loss": 0.335,
"step": 296
},
{
"epoch": 0.18041002277904328,
"grad_norm": 1.3167579603123851,
"learning_rate": 9.808445123712934e-06,
"loss": 0.3789,
"step": 297
},
{
"epoch": 0.18101746393318147,
"grad_norm": 1.2917298455538913,
"learning_rate": 9.805528638214543e-06,
"loss": 0.365,
"step": 298
},
{
"epoch": 0.18162490508731965,
"grad_norm": 1.2249270654309992,
"learning_rate": 9.802590558156863e-06,
"loss": 0.3267,
"step": 299
},
{
"epoch": 0.18223234624145787,
"grad_norm": 1.28147771791881,
"learning_rate": 9.799630896742716e-06,
"loss": 0.3258,
"step": 300
},
{
"epoch": 0.18283978739559606,
"grad_norm": 2.076161195627259,
"learning_rate": 9.796649667271905e-06,
"loss": 0.3588,
"step": 301
},
{
"epoch": 0.18344722854973425,
"grad_norm": 1.1215708430697366,
"learning_rate": 9.793646883141155e-06,
"loss": 0.32,
"step": 302
},
{
"epoch": 0.18405466970387244,
"grad_norm": 1.2637743993804484,
"learning_rate": 9.790622557844047e-06,
"loss": 0.3561,
"step": 303
},
{
"epoch": 0.18466211085801063,
"grad_norm": 2.4961674050461635,
"learning_rate": 9.787576704970965e-06,
"loss": 0.343,
"step": 304
},
{
"epoch": 0.1852695520121488,
"grad_norm": 1.8062025541980924,
"learning_rate": 9.784509338209026e-06,
"loss": 0.339,
"step": 305
},
{
"epoch": 0.185876993166287,
"grad_norm": 1.1705663644423028,
"learning_rate": 9.781420471342035e-06,
"loss": 0.3204,
"step": 306
},
{
"epoch": 0.18648443432042522,
"grad_norm": 1.3501675244896367,
"learning_rate": 9.778310118250397e-06,
"loss": 0.3598,
"step": 307
},
{
"epoch": 0.1870918754745634,
"grad_norm": 1.2093391302114258,
"learning_rate": 9.77517829291108e-06,
"loss": 0.3397,
"step": 308
},
{
"epoch": 0.1876993166287016,
"grad_norm": 1.3119917853957324,
"learning_rate": 9.772025009397538e-06,
"loss": 0.3291,
"step": 309
},
{
"epoch": 0.18830675778283978,
"grad_norm": 3.4574677223030217,
"learning_rate": 9.768850281879651e-06,
"loss": 0.3297,
"step": 310
},
{
"epoch": 0.18891419893697797,
"grad_norm": 1.3155763470245156,
"learning_rate": 9.765654124623664e-06,
"loss": 0.3317,
"step": 311
},
{
"epoch": 0.18952164009111616,
"grad_norm": 1.2868677150111685,
"learning_rate": 9.762436551992117e-06,
"loss": 0.3545,
"step": 312
},
{
"epoch": 0.19012908124525438,
"grad_norm": 1.2047620112147,
"learning_rate": 9.759197578443787e-06,
"loss": 0.3282,
"step": 313
},
{
"epoch": 0.19073652239939257,
"grad_norm": 1.341450099932963,
"learning_rate": 9.755937218533622e-06,
"loss": 0.348,
"step": 314
},
{
"epoch": 0.19134396355353075,
"grad_norm": 1.3549028797747085,
"learning_rate": 9.752655486912666e-06,
"loss": 0.3258,
"step": 315
},
{
"epoch": 0.19195140470766894,
"grad_norm": 1.2140305820046362,
"learning_rate": 9.74935239832801e-06,
"loss": 0.3441,
"step": 316
},
{
"epoch": 0.19255884586180713,
"grad_norm": 1.3240982666908445,
"learning_rate": 9.746027967622709e-06,
"loss": 0.3322,
"step": 317
},
{
"epoch": 0.19316628701594532,
"grad_norm": 1.1866493550648762,
"learning_rate": 9.742682209735727e-06,
"loss": 0.3387,
"step": 318
},
{
"epoch": 0.19377372817008354,
"grad_norm": 1.4708765807963506,
"learning_rate": 9.739315139701868e-06,
"loss": 0.3234,
"step": 319
},
{
"epoch": 0.19438116932422173,
"grad_norm": 1.0795492879673514,
"learning_rate": 9.735926772651703e-06,
"loss": 0.3182,
"step": 320
},
{
"epoch": 0.19498861047835991,
"grad_norm": 1.1995704100657156,
"learning_rate": 9.732517123811502e-06,
"loss": 0.3267,
"step": 321
},
{
"epoch": 0.1955960516324981,
"grad_norm": 1.2807722640717565,
"learning_rate": 9.729086208503174e-06,
"loss": 0.3439,
"step": 322
},
{
"epoch": 0.1962034927866363,
"grad_norm": 1.1439191799968789,
"learning_rate": 9.725634042144192e-06,
"loss": 0.3035,
"step": 323
},
{
"epoch": 0.19681093394077448,
"grad_norm": 1.123574240810596,
"learning_rate": 9.722160640247523e-06,
"loss": 0.3402,
"step": 324
},
{
"epoch": 0.19741837509491267,
"grad_norm": 1.9809732054403608,
"learning_rate": 9.71866601842156e-06,
"loss": 0.3596,
"step": 325
},
{
"epoch": 0.19802581624905088,
"grad_norm": 3.2675821795654474,
"learning_rate": 9.715150192370054e-06,
"loss": 0.3378,
"step": 326
},
{
"epoch": 0.19863325740318907,
"grad_norm": 1.1513389970747174,
"learning_rate": 9.71161317789204e-06,
"loss": 0.312,
"step": 327
},
{
"epoch": 0.19924069855732726,
"grad_norm": 1.0732590413444016,
"learning_rate": 9.708054990881763e-06,
"loss": 0.3028,
"step": 328
},
{
"epoch": 0.19984813971146545,
"grad_norm": 1.130054869998939,
"learning_rate": 9.70447564732862e-06,
"loss": 0.3161,
"step": 329
},
{
"epoch": 0.20045558086560364,
"grad_norm": 1.4279288053692063,
"learning_rate": 9.700875163317072e-06,
"loss": 0.3159,
"step": 330
},
{
"epoch": 0.20106302201974183,
"grad_norm": 1.4180293061625155,
"learning_rate": 9.69725355502658e-06,
"loss": 0.3555,
"step": 331
},
{
"epoch": 0.20167046317388004,
"grad_norm": 1.5620650901784414,
"learning_rate": 9.693610838731532e-06,
"loss": 0.3256,
"step": 332
},
{
"epoch": 0.20227790432801823,
"grad_norm": 1.3488433094850794,
"learning_rate": 9.689947030801168e-06,
"loss": 0.358,
"step": 333
},
{
"epoch": 0.20288534548215642,
"grad_norm": 1.2086835793396953,
"learning_rate": 9.686262147699507e-06,
"loss": 0.3648,
"step": 334
},
{
"epoch": 0.2034927866362946,
"grad_norm": 1.0080244863547254,
"learning_rate": 9.682556205985274e-06,
"loss": 0.3197,
"step": 335
},
{
"epoch": 0.2041002277904328,
"grad_norm": 1.0405594952124566,
"learning_rate": 9.678829222311827e-06,
"loss": 0.304,
"step": 336
},
{
"epoch": 0.204707668944571,
"grad_norm": 1.669499151030841,
"learning_rate": 9.675081213427076e-06,
"loss": 0.3282,
"step": 337
},
{
"epoch": 0.20531511009870917,
"grad_norm": 1.162339657589905,
"learning_rate": 9.671312196173413e-06,
"loss": 0.328,
"step": 338
},
{
"epoch": 0.2059225512528474,
"grad_norm": 1.1111055689988498,
"learning_rate": 9.667522187487635e-06,
"loss": 0.3352,
"step": 339
},
{
"epoch": 0.20652999240698558,
"grad_norm": 1.3779787045612117,
"learning_rate": 9.663711204400872e-06,
"loss": 0.3575,
"step": 340
},
{
"epoch": 0.20713743356112377,
"grad_norm": 1.6323496019886752,
"learning_rate": 9.659879264038499e-06,
"loss": 0.365,
"step": 341
},
{
"epoch": 0.20774487471526196,
"grad_norm": 1.493535511167361,
"learning_rate": 9.656026383620076e-06,
"loss": 0.3445,
"step": 342
},
{
"epoch": 0.20835231586940015,
"grad_norm": 1.3056823613349453,
"learning_rate": 9.65215258045925e-06,
"loss": 0.2948,
"step": 343
},
{
"epoch": 0.20895975702353833,
"grad_norm": 1.0670662511449958,
"learning_rate": 9.6482578719637e-06,
"loss": 0.3139,
"step": 344
},
{
"epoch": 0.20956719817767655,
"grad_norm": 1.3642861541498819,
"learning_rate": 9.644342275635036e-06,
"loss": 0.3015,
"step": 345
},
{
"epoch": 0.21017463933181474,
"grad_norm": 1.0747742911930387,
"learning_rate": 9.640405809068743e-06,
"loss": 0.3228,
"step": 346
},
{
"epoch": 0.21078208048595293,
"grad_norm": 1.1565608956431175,
"learning_rate": 9.636448489954077e-06,
"loss": 0.307,
"step": 347
},
{
"epoch": 0.21138952164009112,
"grad_norm": 1.195151098550731,
"learning_rate": 9.632470336074009e-06,
"loss": 0.3284,
"step": 348
},
{
"epoch": 0.2119969627942293,
"grad_norm": 1.1885220245152495,
"learning_rate": 9.628471365305134e-06,
"loss": 0.3437,
"step": 349
},
{
"epoch": 0.2126044039483675,
"grad_norm": 1.0344142275699475,
"learning_rate": 9.624451595617588e-06,
"loss": 0.3185,
"step": 350
},
{
"epoch": 0.21321184510250568,
"grad_norm": 1.2656391323297032,
"learning_rate": 9.620411045074972e-06,
"loss": 0.3626,
"step": 351
},
{
"epoch": 0.2138192862566439,
"grad_norm": 1.0752778164280428,
"learning_rate": 9.616349731834271e-06,
"loss": 0.3225,
"step": 352
},
{
"epoch": 0.2144267274107821,
"grad_norm": 1.2178645720798402,
"learning_rate": 9.612267674145772e-06,
"loss": 0.3534,
"step": 353
},
{
"epoch": 0.21503416856492027,
"grad_norm": 1.4072309869153488,
"learning_rate": 9.608164890352977e-06,
"loss": 0.3459,
"step": 354
},
{
"epoch": 0.21564160971905846,
"grad_norm": 1.1875602285502396,
"learning_rate": 9.604041398892528e-06,
"loss": 0.3288,
"step": 355
},
{
"epoch": 0.21624905087319665,
"grad_norm": 1.2188563316023242,
"learning_rate": 9.599897218294122e-06,
"loss": 0.3509,
"step": 356
},
{
"epoch": 0.21685649202733484,
"grad_norm": 1.1569315648201919,
"learning_rate": 9.595732367180422e-06,
"loss": 0.3173,
"step": 357
},
{
"epoch": 0.21746393318147306,
"grad_norm": 1.5018233135579402,
"learning_rate": 9.591546864266983e-06,
"loss": 0.3507,
"step": 358
},
{
"epoch": 0.21807137433561125,
"grad_norm": 1.0272557252775882,
"learning_rate": 9.58734072836216e-06,
"loss": 0.3001,
"step": 359
},
{
"epoch": 0.21867881548974943,
"grad_norm": 1.245040066414171,
"learning_rate": 9.583113978367026e-06,
"loss": 0.2957,
"step": 360
},
{
"epoch": 0.21928625664388762,
"grad_norm": 1.3275806437802142,
"learning_rate": 9.578866633275289e-06,
"loss": 0.3383,
"step": 361
},
{
"epoch": 0.2198936977980258,
"grad_norm": 1.1727768085477153,
"learning_rate": 9.574598712173202e-06,
"loss": 0.2735,
"step": 362
},
{
"epoch": 0.220501138952164,
"grad_norm": 1.22146926918798,
"learning_rate": 9.570310234239483e-06,
"loss": 0.3166,
"step": 363
},
{
"epoch": 0.2211085801063022,
"grad_norm": 1.2281212541536195,
"learning_rate": 9.56600121874523e-06,
"loss": 0.3249,
"step": 364
},
{
"epoch": 0.2217160212604404,
"grad_norm": 2.3299644415456036,
"learning_rate": 9.561671685053818e-06,
"loss": 0.3467,
"step": 365
},
{
"epoch": 0.2223234624145786,
"grad_norm": 1.1619894211736224,
"learning_rate": 9.557321652620839e-06,
"loss": 0.3077,
"step": 366
},
{
"epoch": 0.22293090356871678,
"grad_norm": 1.1786921246836153,
"learning_rate": 9.55295114099399e-06,
"loss": 0.3294,
"step": 367
},
{
"epoch": 0.22353834472285497,
"grad_norm": 1.1859186341534969,
"learning_rate": 9.548560169812997e-06,
"loss": 0.3167,
"step": 368
},
{
"epoch": 0.22414578587699316,
"grad_norm": 1.1441591110703015,
"learning_rate": 9.544148758809528e-06,
"loss": 0.3193,
"step": 369
},
{
"epoch": 0.22475322703113135,
"grad_norm": 1.1128313603192685,
"learning_rate": 9.539716927807102e-06,
"loss": 0.3093,
"step": 370
},
{
"epoch": 0.22536066818526956,
"grad_norm": 1.4910675750318487,
"learning_rate": 9.535264696720993e-06,
"loss": 0.3253,
"step": 371
},
{
"epoch": 0.22596810933940775,
"grad_norm": 1.189666585401165,
"learning_rate": 9.530792085558151e-06,
"loss": 0.3558,
"step": 372
},
{
"epoch": 0.22657555049354594,
"grad_norm": 1.2061368174942724,
"learning_rate": 9.526299114417108e-06,
"loss": 0.3253,
"step": 373
},
{
"epoch": 0.22718299164768413,
"grad_norm": 1.0468374477355344,
"learning_rate": 9.521785803487888e-06,
"loss": 0.3178,
"step": 374
},
{
"epoch": 0.22779043280182232,
"grad_norm": 1.2500589524530483,
"learning_rate": 9.517252173051912e-06,
"loss": 0.3066,
"step": 375
},
{
"epoch": 0.2283978739559605,
"grad_norm": 2.396102762626989,
"learning_rate": 9.512698243481914e-06,
"loss": 0.3087,
"step": 376
},
{
"epoch": 0.22900531511009872,
"grad_norm": 13.857607430818112,
"learning_rate": 9.508124035241843e-06,
"loss": 0.3001,
"step": 377
},
{
"epoch": 0.2296127562642369,
"grad_norm": 1.5821099485242913,
"learning_rate": 9.50352956888678e-06,
"loss": 0.3393,
"step": 378
},
{
"epoch": 0.2302201974183751,
"grad_norm": 1.2074450897701234,
"learning_rate": 9.498914865062831e-06,
"loss": 0.334,
"step": 379
},
{
"epoch": 0.2308276385725133,
"grad_norm": 1.1721133873450802,
"learning_rate": 9.49427994450705e-06,
"loss": 0.3285,
"step": 380
},
{
"epoch": 0.23143507972665148,
"grad_norm": 1.3430574212868829,
"learning_rate": 9.489624828047336e-06,
"loss": 0.3137,
"step": 381
},
{
"epoch": 0.23204252088078967,
"grad_norm": 1.5915750295490272,
"learning_rate": 9.484949536602343e-06,
"loss": 0.3505,
"step": 382
},
{
"epoch": 0.23264996203492785,
"grad_norm": 1.440309732703764,
"learning_rate": 9.480254091181385e-06,
"loss": 0.3441,
"step": 383
},
{
"epoch": 0.23325740318906607,
"grad_norm": 1.2699897467088066,
"learning_rate": 9.47553851288434e-06,
"loss": 0.328,
"step": 384
},
{
"epoch": 0.23386484434320426,
"grad_norm": 1.118564432611521,
"learning_rate": 9.470802822901558e-06,
"loss": 0.2914,
"step": 385
},
{
"epoch": 0.23447228549734245,
"grad_norm": 2.267610070857606,
"learning_rate": 9.466047042513767e-06,
"loss": 0.3194,
"step": 386
},
{
"epoch": 0.23507972665148064,
"grad_norm": 1.1933730736468051,
"learning_rate": 9.461271193091971e-06,
"loss": 0.3329,
"step": 387
},
{
"epoch": 0.23568716780561882,
"grad_norm": 1.2144519426747942,
"learning_rate": 9.45647529609736e-06,
"loss": 0.3295,
"step": 388
},
{
"epoch": 0.236294608959757,
"grad_norm": 1.3580165469389258,
"learning_rate": 9.451659373081214e-06,
"loss": 0.3447,
"step": 389
},
{
"epoch": 0.23690205011389523,
"grad_norm": 1.1100635616742127,
"learning_rate": 9.4468234456848e-06,
"loss": 0.3337,
"step": 390
},
{
"epoch": 0.23750949126803342,
"grad_norm": 1.3428354458732323,
"learning_rate": 9.44196753563928e-06,
"loss": 0.2838,
"step": 391
},
{
"epoch": 0.2381169324221716,
"grad_norm": 1.3921865014793011,
"learning_rate": 9.437091664765611e-06,
"loss": 0.3256,
"step": 392
},
{
"epoch": 0.2387243735763098,
"grad_norm": 1.0558795045496834,
"learning_rate": 9.43219585497445e-06,
"loss": 0.2924,
"step": 393
},
{
"epoch": 0.23933181473044798,
"grad_norm": 1.0852216391713467,
"learning_rate": 9.427280128266049e-06,
"loss": 0.3159,
"step": 394
},
{
"epoch": 0.23993925588458617,
"grad_norm": 1.2663888095002538,
"learning_rate": 9.422344506730168e-06,
"loss": 0.3223,
"step": 395
},
{
"epoch": 0.24054669703872436,
"grad_norm": 1.2598248186127823,
"learning_rate": 9.41738901254596e-06,
"loss": 0.3114,
"step": 396
},
{
"epoch": 0.24115413819286258,
"grad_norm": 1.2212737926468238,
"learning_rate": 9.412413667981884e-06,
"loss": 0.365,
"step": 397
},
{
"epoch": 0.24176157934700077,
"grad_norm": 1.1959984881869385,
"learning_rate": 9.4074184953956e-06,
"loss": 0.3723,
"step": 398
},
{
"epoch": 0.24236902050113895,
"grad_norm": 1.291676435173795,
"learning_rate": 9.402403517233867e-06,
"loss": 0.3455,
"step": 399
},
{
"epoch": 0.24297646165527714,
"grad_norm": 1.1960840171763427,
"learning_rate": 9.397368756032445e-06,
"loss": 0.3453,
"step": 400
},
{
"epoch": 0.24358390280941533,
"grad_norm": 1.0826676561819115,
"learning_rate": 9.392314234415999e-06,
"loss": 0.3047,
"step": 401
},
{
"epoch": 0.24419134396355352,
"grad_norm": 1.1755710795468963,
"learning_rate": 9.38723997509798e-06,
"loss": 0.313,
"step": 402
},
{
"epoch": 0.24479878511769174,
"grad_norm": 1.3499086111109924,
"learning_rate": 9.38214600088054e-06,
"loss": 0.3285,
"step": 403
},
{
"epoch": 0.24540622627182992,
"grad_norm": 1.3200873501528223,
"learning_rate": 9.37703233465443e-06,
"loss": 0.369,
"step": 404
},
{
"epoch": 0.2460136674259681,
"grad_norm": 1.3826162413858059,
"learning_rate": 9.371898999398876e-06,
"loss": 0.3527,
"step": 405
},
{
"epoch": 0.2466211085801063,
"grad_norm": 1.2278576280498825,
"learning_rate": 9.366746018181503e-06,
"loss": 0.3277,
"step": 406
},
{
"epoch": 0.2472285497342445,
"grad_norm": 1.1329545797723328,
"learning_rate": 9.361573414158215e-06,
"loss": 0.3229,
"step": 407
},
{
"epoch": 0.24783599088838268,
"grad_norm": 1.0477461146627898,
"learning_rate": 9.356381210573092e-06,
"loss": 0.2919,
"step": 408
},
{
"epoch": 0.24844343204252087,
"grad_norm": 2.4554214547877704,
"learning_rate": 9.351169430758293e-06,
"loss": 0.3438,
"step": 409
},
{
"epoch": 0.24905087319665908,
"grad_norm": 3.5352794640669685,
"learning_rate": 9.345938098133946e-06,
"loss": 0.3262,
"step": 410
},
{
"epoch": 0.24965831435079727,
"grad_norm": 2.021698412413005,
"learning_rate": 9.340687236208037e-06,
"loss": 0.3011,
"step": 411
},
{
"epoch": 0.25026575550493546,
"grad_norm": 2.2039087798756465,
"learning_rate": 9.33541686857632e-06,
"loss": 0.3663,
"step": 412
},
{
"epoch": 0.25087319665907365,
"grad_norm": 1.2067385283486605,
"learning_rate": 9.330127018922195e-06,
"loss": 0.3212,
"step": 413
},
{
"epoch": 0.25148063781321184,
"grad_norm": 1.2316558886892726,
"learning_rate": 9.324817711016609e-06,
"loss": 0.3419,
"step": 414
},
{
"epoch": 0.25208807896735,
"grad_norm": 1.4875274199476636,
"learning_rate": 9.31948896871795e-06,
"loss": 0.3348,
"step": 415
},
{
"epoch": 0.2526955201214882,
"grad_norm": 1.1115019195847107,
"learning_rate": 9.31414081597194e-06,
"loss": 0.3512,
"step": 416
},
{
"epoch": 0.2533029612756264,
"grad_norm": 1.2295268471367569,
"learning_rate": 9.30877327681152e-06,
"loss": 0.3503,
"step": 417
},
{
"epoch": 0.2539104024297646,
"grad_norm": 1.0786831844077154,
"learning_rate": 9.303386375356752e-06,
"loss": 0.3162,
"step": 418
},
{
"epoch": 0.25451784358390284,
"grad_norm": 1.0475351605214382,
"learning_rate": 9.297980135814706e-06,
"loss": 0.3103,
"step": 419
},
{
"epoch": 0.255125284738041,
"grad_norm": 1.0774726768233147,
"learning_rate": 9.292554582479349e-06,
"loss": 0.3187,
"step": 420
},
{
"epoch": 0.2557327258921792,
"grad_norm": 1.1341288746675326,
"learning_rate": 9.28710973973144e-06,
"loss": 0.3267,
"step": 421
},
{
"epoch": 0.2563401670463174,
"grad_norm": 1.7271345636020505,
"learning_rate": 9.281645632038417e-06,
"loss": 0.3329,
"step": 422
},
{
"epoch": 0.2569476082004556,
"grad_norm": 1.1865979526898363,
"learning_rate": 9.276162283954293e-06,
"loss": 0.3148,
"step": 423
},
{
"epoch": 0.2575550493545938,
"grad_norm": 1.050767524282803,
"learning_rate": 9.270659720119533e-06,
"loss": 0.3431,
"step": 424
},
{
"epoch": 0.25816249050873197,
"grad_norm": 1.196771620264985,
"learning_rate": 9.265137965260962e-06,
"loss": 0.3422,
"step": 425
},
{
"epoch": 0.25876993166287016,
"grad_norm": 1.0955500339958768,
"learning_rate": 9.259597044191635e-06,
"loss": 0.3195,
"step": 426
},
{
"epoch": 0.25937737281700834,
"grad_norm": 1.115960598799573,
"learning_rate": 9.254036981810741e-06,
"loss": 0.3238,
"step": 427
},
{
"epoch": 0.25998481397114653,
"grad_norm": 1.0005694364136173,
"learning_rate": 9.248457803103476e-06,
"loss": 0.309,
"step": 428
},
{
"epoch": 0.2605922551252847,
"grad_norm": 1.0726223391150986,
"learning_rate": 9.242859533140947e-06,
"loss": 0.3031,
"step": 429
},
{
"epoch": 0.2611996962794229,
"grad_norm": 1.1657341953685123,
"learning_rate": 9.237242197080045e-06,
"loss": 0.2901,
"step": 430
},
{
"epoch": 0.2618071374335611,
"grad_norm": 1.013927670160624,
"learning_rate": 9.231605820163343e-06,
"loss": 0.2932,
"step": 431
},
{
"epoch": 0.26241457858769934,
"grad_norm": 1.1498383770847378,
"learning_rate": 9.225950427718974e-06,
"loss": 0.3333,
"step": 432
},
{
"epoch": 0.26302201974183753,
"grad_norm": 1.2571593199046411,
"learning_rate": 9.220276045160524e-06,
"loss": 0.3098,
"step": 433
},
{
"epoch": 0.2636294608959757,
"grad_norm": 1.004792456164235,
"learning_rate": 9.21458269798691e-06,
"loss": 0.2914,
"step": 434
},
{
"epoch": 0.2642369020501139,
"grad_norm": 1.0509819009913075,
"learning_rate": 9.208870411782276e-06,
"loss": 0.3191,
"step": 435
},
{
"epoch": 0.2648443432042521,
"grad_norm": 1.0808764700945872,
"learning_rate": 9.203139212215868e-06,
"loss": 0.3397,
"step": 436
},
{
"epoch": 0.2654517843583903,
"grad_norm": 1.9069945576857954,
"learning_rate": 9.197389125041925e-06,
"loss": 0.3696,
"step": 437
},
{
"epoch": 0.2660592255125285,
"grad_norm": 0.9868084753868711,
"learning_rate": 9.191620176099559e-06,
"loss": 0.2926,
"step": 438
},
{
"epoch": 0.26666666666666666,
"grad_norm": 1.236174799741463,
"learning_rate": 9.185832391312644e-06,
"loss": 0.3532,
"step": 439
},
{
"epoch": 0.26727410782080485,
"grad_norm": 1.085032203623591,
"learning_rate": 9.180025796689692e-06,
"loss": 0.3313,
"step": 440
},
{
"epoch": 0.26788154897494304,
"grad_norm": 0.9582279787310294,
"learning_rate": 9.174200418323746e-06,
"loss": 0.2886,
"step": 441
},
{
"epoch": 0.26848899012908123,
"grad_norm": 0.9169437747174185,
"learning_rate": 9.168356282392253e-06,
"loss": 0.2921,
"step": 442
},
{
"epoch": 0.2690964312832194,
"grad_norm": 1.0775595515767766,
"learning_rate": 9.16249341515695e-06,
"loss": 0.3391,
"step": 443
},
{
"epoch": 0.2697038724373576,
"grad_norm": 1.0089375719934834,
"learning_rate": 9.156611842963753e-06,
"loss": 0.3159,
"step": 444
},
{
"epoch": 0.27031131359149585,
"grad_norm": 1.2417432186550288,
"learning_rate": 9.150711592242627e-06,
"loss": 0.3654,
"step": 445
},
{
"epoch": 0.27091875474563404,
"grad_norm": 1.0274792842367653,
"learning_rate": 9.144792689507471e-06,
"loss": 0.3107,
"step": 446
},
{
"epoch": 0.2715261958997722,
"grad_norm": 1.1641760376218897,
"learning_rate": 9.138855161356006e-06,
"loss": 0.3219,
"step": 447
},
{
"epoch": 0.2721336370539104,
"grad_norm": 1.038602346175002,
"learning_rate": 9.132899034469648e-06,
"loss": 0.3262,
"step": 448
},
{
"epoch": 0.2727410782080486,
"grad_norm": 1.0193170343688194,
"learning_rate": 9.126924335613385e-06,
"loss": 0.2947,
"step": 449
},
{
"epoch": 0.2733485193621868,
"grad_norm": 1.0412898987806527,
"learning_rate": 9.120931091635669e-06,
"loss": 0.2982,
"step": 450
},
{
"epoch": 0.273955960516325,
"grad_norm": 1.0086545373941327,
"learning_rate": 9.114919329468283e-06,
"loss": 0.3189,
"step": 451
},
{
"epoch": 0.27456340167046317,
"grad_norm": 0.9279048193591232,
"learning_rate": 9.108889076126226e-06,
"loss": 0.2653,
"step": 452
},
{
"epoch": 0.27517084282460136,
"grad_norm": 0.995776916618753,
"learning_rate": 9.102840358707594e-06,
"loss": 0.2785,
"step": 453
},
{
"epoch": 0.27577828397873955,
"grad_norm": 1.1209448523987642,
"learning_rate": 9.09677320439345e-06,
"loss": 0.3484,
"step": 454
},
{
"epoch": 0.27638572513287774,
"grad_norm": 1.0086795340037955,
"learning_rate": 9.090687640447709e-06,
"loss": 0.3039,
"step": 455
},
{
"epoch": 0.2769931662870159,
"grad_norm": 1.0635853354673632,
"learning_rate": 9.084583694217012e-06,
"loss": 0.3368,
"step": 456
},
{
"epoch": 0.2776006074411541,
"grad_norm": 1.3855504059825723,
"learning_rate": 9.07846139313061e-06,
"loss": 0.3416,
"step": 457
},
{
"epoch": 0.27820804859529236,
"grad_norm": 1.0574619979291167,
"learning_rate": 9.072320764700223e-06,
"loss": 0.2921,
"step": 458
},
{
"epoch": 0.27881548974943055,
"grad_norm": 0.9945293438865139,
"learning_rate": 9.066161836519942e-06,
"loss": 0.2738,
"step": 459
},
{
"epoch": 0.27942293090356873,
"grad_norm": 1.0200515276803628,
"learning_rate": 9.059984636266082e-06,
"loss": 0.3244,
"step": 460
},
{
"epoch": 0.2800303720577069,
"grad_norm": 0.9552331626040478,
"learning_rate": 9.053789191697072e-06,
"loss": 0.2867,
"step": 461
},
{
"epoch": 0.2806378132118451,
"grad_norm": 0.9682413028760446,
"learning_rate": 9.047575530653324e-06,
"loss": 0.2914,
"step": 462
},
{
"epoch": 0.2812452543659833,
"grad_norm": 1.0309908640635839,
"learning_rate": 9.041343681057106e-06,
"loss": 0.2882,
"step": 463
},
{
"epoch": 0.2818526955201215,
"grad_norm": 1.038796502390337,
"learning_rate": 9.035093670912424e-06,
"loss": 0.2814,
"step": 464
},
{
"epoch": 0.2824601366742597,
"grad_norm": 1.1394035890796435,
"learning_rate": 9.028825528304892e-06,
"loss": 0.3444,
"step": 465
},
{
"epoch": 0.28306757782839786,
"grad_norm": 1.4464664575484871,
"learning_rate": 9.022539281401601e-06,
"loss": 0.3403,
"step": 466
},
{
"epoch": 0.28367501898253605,
"grad_norm": 1.725019753808846,
"learning_rate": 9.016234958451002e-06,
"loss": 0.3225,
"step": 467
},
{
"epoch": 0.28428246013667424,
"grad_norm": 1.390207557379254,
"learning_rate": 9.009912587782772e-06,
"loss": 0.2972,
"step": 468
},
{
"epoch": 0.28488990129081243,
"grad_norm": 1.1656375132150465,
"learning_rate": 9.00357219780769e-06,
"loss": 0.3025,
"step": 469
},
{
"epoch": 0.2854973424449506,
"grad_norm": 2.458767731063712,
"learning_rate": 8.997213817017508e-06,
"loss": 0.3368,
"step": 470
},
{
"epoch": 0.28610478359908886,
"grad_norm": 1.0665249956470793,
"learning_rate": 8.990837473984818e-06,
"loss": 0.3208,
"step": 471
},
{
"epoch": 0.28671222475322705,
"grad_norm": 1.1467648154057113,
"learning_rate": 8.984443197362938e-06,
"loss": 0.2963,
"step": 472
},
{
"epoch": 0.28731966590736524,
"grad_norm": 1.188817694944298,
"learning_rate": 8.978031015885767e-06,
"loss": 0.3049,
"step": 473
},
{
"epoch": 0.28792710706150343,
"grad_norm": 0.9664646995536351,
"learning_rate": 8.971600958367668e-06,
"loss": 0.2873,
"step": 474
},
{
"epoch": 0.2885345482156416,
"grad_norm": 1.0103760175347862,
"learning_rate": 8.965153053703325e-06,
"loss": 0.2933,
"step": 475
},
{
"epoch": 0.2891419893697798,
"grad_norm": 0.969093412655753,
"learning_rate": 8.958687330867634e-06,
"loss": 0.3211,
"step": 476
},
{
"epoch": 0.289749430523918,
"grad_norm": 1.025864636561286,
"learning_rate": 8.952203818915548e-06,
"loss": 0.3216,
"step": 477
},
{
"epoch": 0.2903568716780562,
"grad_norm": 1.109916705902771,
"learning_rate": 8.94570254698197e-06,
"loss": 0.2862,
"step": 478
},
{
"epoch": 0.29096431283219437,
"grad_norm": 0.9713323047580559,
"learning_rate": 8.939183544281597e-06,
"loss": 0.3105,
"step": 479
},
{
"epoch": 0.29157175398633256,
"grad_norm": 1.081777308673117,
"learning_rate": 8.932646840108818e-06,
"loss": 0.3272,
"step": 480
},
{
"epoch": 0.29217919514047075,
"grad_norm": 0.967596641805163,
"learning_rate": 8.926092463837557e-06,
"loss": 0.2761,
"step": 481
},
{
"epoch": 0.29278663629460894,
"grad_norm": 1.155987695983836,
"learning_rate": 8.919520444921153e-06,
"loss": 0.3064,
"step": 482
},
{
"epoch": 0.2933940774487471,
"grad_norm": 1.3696382864069947,
"learning_rate": 8.912930812892228e-06,
"loss": 0.2865,
"step": 483
},
{
"epoch": 0.29400151860288537,
"grad_norm": 1.0400446945814443,
"learning_rate": 8.906323597362547e-06,
"loss": 0.2686,
"step": 484
},
{
"epoch": 0.29460895975702356,
"grad_norm": 1.0325058068789128,
"learning_rate": 8.899698828022895e-06,
"loss": 0.2879,
"step": 485
},
{
"epoch": 0.29521640091116175,
"grad_norm": 1.0137176891852828,
"learning_rate": 8.893056534642938e-06,
"loss": 0.3086,
"step": 486
},
{
"epoch": 0.29582384206529994,
"grad_norm": 1.2122910237107214,
"learning_rate": 8.886396747071085e-06,
"loss": 0.3277,
"step": 487
},
{
"epoch": 0.2964312832194381,
"grad_norm": 1.073984642104128,
"learning_rate": 8.879719495234363e-06,
"loss": 0.3181,
"step": 488
},
{
"epoch": 0.2970387243735763,
"grad_norm": 1.095543075839678,
"learning_rate": 8.873024809138272e-06,
"loss": 0.3102,
"step": 489
},
{
"epoch": 0.2976461655277145,
"grad_norm": 1.01242570236515,
"learning_rate": 8.866312718866669e-06,
"loss": 0.2998,
"step": 490
},
{
"epoch": 0.2982536066818527,
"grad_norm": 0.9488282118596906,
"learning_rate": 8.859583254581604e-06,
"loss": 0.3099,
"step": 491
},
{
"epoch": 0.2988610478359909,
"grad_norm": 0.9722813663051023,
"learning_rate": 8.852836446523213e-06,
"loss": 0.3386,
"step": 492
},
{
"epoch": 0.29946848899012907,
"grad_norm": 1.0326428240517584,
"learning_rate": 8.846072325009562e-06,
"loss": 0.2987,
"step": 493
},
{
"epoch": 0.30007593014426726,
"grad_norm": 1.0268489993356678,
"learning_rate": 8.83929092043652e-06,
"loss": 0.3282,
"step": 494
},
{
"epoch": 0.30068337129840544,
"grad_norm": 1.0504404390194961,
"learning_rate": 8.832492263277624e-06,
"loss": 0.331,
"step": 495
},
{
"epoch": 0.30129081245254363,
"grad_norm": 1.0223568612198137,
"learning_rate": 8.825676384083936e-06,
"loss": 0.3073,
"step": 496
},
{
"epoch": 0.3018982536066819,
"grad_norm": 0.9798177174047413,
"learning_rate": 8.818843313483907e-06,
"loss": 0.2886,
"step": 497
},
{
"epoch": 0.30250569476082007,
"grad_norm": 1.0532247166652768,
"learning_rate": 8.811993082183243e-06,
"loss": 0.2974,
"step": 498
},
{
"epoch": 0.30311313591495825,
"grad_norm": 0.975121772940988,
"learning_rate": 8.805125720964766e-06,
"loss": 0.2997,
"step": 499
},
{
"epoch": 0.30372057706909644,
"grad_norm": 1.137742002586217,
"learning_rate": 8.798241260688273e-06,
"loss": 0.3188,
"step": 500
},
{
"epoch": 0.30432801822323463,
"grad_norm": 2.779671078767294,
"learning_rate": 8.791339732290398e-06,
"loss": 0.3156,
"step": 501
},
{
"epoch": 0.3049354593773728,
"grad_norm": 1.2822627424235575,
"learning_rate": 8.784421166784476e-06,
"loss": 0.3006,
"step": 502
},
{
"epoch": 0.305542900531511,
"grad_norm": 0.9483433674131612,
"learning_rate": 8.7774855952604e-06,
"loss": 0.2795,
"step": 503
},
{
"epoch": 0.3061503416856492,
"grad_norm": 1.0109525806584068,
"learning_rate": 8.770533048884483e-06,
"loss": 0.3045,
"step": 504
},
{
"epoch": 0.3067577828397874,
"grad_norm": 0.9625557196195801,
"learning_rate": 8.763563558899317e-06,
"loss": 0.2759,
"step": 505
},
{
"epoch": 0.3073652239939256,
"grad_norm": 1.0453305774145718,
"learning_rate": 8.756577156623636e-06,
"loss": 0.3117,
"step": 506
},
{
"epoch": 0.30797266514806376,
"grad_norm": 0.8875823603586328,
"learning_rate": 8.749573873452169e-06,
"loss": 0.2716,
"step": 507
},
{
"epoch": 0.30858010630220195,
"grad_norm": 0.968540908249718,
"learning_rate": 8.742553740855507e-06,
"loss": 0.2851,
"step": 508
},
{
"epoch": 0.30918754745634014,
"grad_norm": 0.9903789827417551,
"learning_rate": 8.735516790379952e-06,
"loss": 0.2897,
"step": 509
},
{
"epoch": 0.3097949886104784,
"grad_norm": 1.0711211077943226,
"learning_rate": 8.728463053647382e-06,
"loss": 0.2584,
"step": 510
},
{
"epoch": 0.31040242976461657,
"grad_norm": 1.0473520524330442,
"learning_rate": 8.721392562355113e-06,
"loss": 0.3144,
"step": 511
},
{
"epoch": 0.31100987091875476,
"grad_norm": 1.0396429716589337,
"learning_rate": 8.71430534827574e-06,
"loss": 0.3046,
"step": 512
},
{
"epoch": 0.31161731207289295,
"grad_norm": 2.22911229260033,
"learning_rate": 8.707201443257015e-06,
"loss": 0.3096,
"step": 513
},
{
"epoch": 0.31222475322703114,
"grad_norm": 1.0461555035207433,
"learning_rate": 8.700080879221689e-06,
"loss": 0.3344,
"step": 514
},
{
"epoch": 0.3128321943811693,
"grad_norm": 1.0243099223034418,
"learning_rate": 8.692943688167371e-06,
"loss": 0.3317,
"step": 515
},
{
"epoch": 0.3134396355353075,
"grad_norm": 1.0001264307440594,
"learning_rate": 8.685789902166395e-06,
"loss": 0.3035,
"step": 516
},
{
"epoch": 0.3140470766894457,
"grad_norm": 1.067152048322313,
"learning_rate": 8.67861955336566e-06,
"loss": 0.305,
"step": 517
},
{
"epoch": 0.3146545178435839,
"grad_norm": 1.030162408720675,
"learning_rate": 8.671432673986493e-06,
"loss": 0.3161,
"step": 518
},
{
"epoch": 0.3152619589977221,
"grad_norm": 0.9760695402458939,
"learning_rate": 8.664229296324514e-06,
"loss": 0.3157,
"step": 519
},
{
"epoch": 0.31586940015186027,
"grad_norm": 0.9705147238493051,
"learning_rate": 8.657009452749466e-06,
"loss": 0.3048,
"step": 520
},
{
"epoch": 0.31647684130599846,
"grad_norm": 0.8938796766666607,
"learning_rate": 8.649773175705099e-06,
"loss": 0.2668,
"step": 521
},
{
"epoch": 0.3170842824601367,
"grad_norm": 1.011233752497999,
"learning_rate": 8.642520497709001e-06,
"loss": 0.3098,
"step": 522
},
{
"epoch": 0.3176917236142749,
"grad_norm": 1.137929526930306,
"learning_rate": 8.635251451352463e-06,
"loss": 0.3015,
"step": 523
},
{
"epoch": 0.3182991647684131,
"grad_norm": 1.0130770256069037,
"learning_rate": 8.627966069300332e-06,
"loss": 0.3245,
"step": 524
},
{
"epoch": 0.31890660592255127,
"grad_norm": 1.1354507509321872,
"learning_rate": 8.620664384290863e-06,
"loss": 0.3039,
"step": 525
},
{
"epoch": 0.31951404707668946,
"grad_norm": 1.2939991366637584,
"learning_rate": 8.613346429135567e-06,
"loss": 0.3078,
"step": 526
},
{
"epoch": 0.32012148823082764,
"grad_norm": 1.1684109323065632,
"learning_rate": 8.606012236719073e-06,
"loss": 0.3385,
"step": 527
},
{
"epoch": 0.32072892938496583,
"grad_norm": 1.9632990216219541,
"learning_rate": 8.598661839998972e-06,
"loss": 0.2775,
"step": 528
},
{
"epoch": 0.321336370539104,
"grad_norm": 1.0999059016474897,
"learning_rate": 8.591295272005674e-06,
"loss": 0.2942,
"step": 529
},
{
"epoch": 0.3219438116932422,
"grad_norm": 1.1924126987732822,
"learning_rate": 8.583912565842258e-06,
"loss": 0.2957,
"step": 530
},
{
"epoch": 0.3225512528473804,
"grad_norm": 0.9830056635559057,
"learning_rate": 8.576513754684318e-06,
"loss": 0.2871,
"step": 531
},
{
"epoch": 0.3231586940015186,
"grad_norm": 1.003723254647249,
"learning_rate": 8.569098871779828e-06,
"loss": 0.3159,
"step": 532
},
{
"epoch": 0.3237661351556568,
"grad_norm": 1.1546050485627521,
"learning_rate": 8.561667950448973e-06,
"loss": 0.3274,
"step": 533
},
{
"epoch": 0.32437357630979496,
"grad_norm": 7.898389008169903,
"learning_rate": 8.554221024084019e-06,
"loss": 0.2923,
"step": 534
},
{
"epoch": 0.3249810174639332,
"grad_norm": 1.066007060725776,
"learning_rate": 8.546758126149148e-06,
"loss": 0.3172,
"step": 535
},
{
"epoch": 0.3255884586180714,
"grad_norm": 1.0571265266645584,
"learning_rate": 8.539279290180315e-06,
"loss": 0.3294,
"step": 536
},
{
"epoch": 0.3261958997722096,
"grad_norm": 1.1006676423033956,
"learning_rate": 8.531784549785098e-06,
"loss": 0.3524,
"step": 537
},
{
"epoch": 0.3268033409263478,
"grad_norm": 1.1991261856518842,
"learning_rate": 8.524273938642539e-06,
"loss": 0.3158,
"step": 538
},
{
"epoch": 0.32741078208048596,
"grad_norm": 1.0695433507748728,
"learning_rate": 8.516747490503001e-06,
"loss": 0.3318,
"step": 539
},
{
"epoch": 0.32801822323462415,
"grad_norm": 1.009828314645369,
"learning_rate": 8.509205239188017e-06,
"loss": 0.3034,
"step": 540
},
{
"epoch": 0.32862566438876234,
"grad_norm": 1.0614942016734776,
"learning_rate": 8.501647218590127e-06,
"loss": 0.3249,
"step": 541
},
{
"epoch": 0.32923310554290053,
"grad_norm": 1.0535816660890713,
"learning_rate": 8.494073462672743e-06,
"loss": 0.3245,
"step": 542
},
{
"epoch": 0.3298405466970387,
"grad_norm": 1.1116235063360127,
"learning_rate": 8.486484005469977e-06,
"loss": 0.3111,
"step": 543
},
{
"epoch": 0.3304479878511769,
"grad_norm": 1.2499481577520084,
"learning_rate": 8.478878881086505e-06,
"loss": 0.2774,
"step": 544
},
{
"epoch": 0.3310554290053151,
"grad_norm": 1.2434383006954242,
"learning_rate": 8.471258123697403e-06,
"loss": 0.3591,
"step": 545
},
{
"epoch": 0.3316628701594533,
"grad_norm": 2.629111128401328,
"learning_rate": 8.463621767547998e-06,
"loss": 0.2964,
"step": 546
},
{
"epoch": 0.33227031131359147,
"grad_norm": 1.0085205462907783,
"learning_rate": 8.455969846953711e-06,
"loss": 0.2782,
"step": 547
},
{
"epoch": 0.3328777524677297,
"grad_norm": 1.0091177434087597,
"learning_rate": 8.448302396299906e-06,
"loss": 0.2923,
"step": 548
},
{
"epoch": 0.3334851936218679,
"grad_norm": 0.9548710150038249,
"learning_rate": 8.440619450041736e-06,
"loss": 0.256,
"step": 549
},
{
"epoch": 0.3340926347760061,
"grad_norm": 1.0061487620910332,
"learning_rate": 8.432921042703985e-06,
"loss": 0.2978,
"step": 550
},
{
"epoch": 0.3347000759301443,
"grad_norm": 1.0205034766401575,
"learning_rate": 8.425207208880914e-06,
"loss": 0.3307,
"step": 551
},
{
"epoch": 0.33530751708428247,
"grad_norm": 0.9995329253562294,
"learning_rate": 8.417477983236107e-06,
"loss": 0.3149,
"step": 552
},
{
"epoch": 0.33591495823842066,
"grad_norm": 0.9454465727784129,
"learning_rate": 8.409733400502311e-06,
"loss": 0.3152,
"step": 553
},
{
"epoch": 0.33652239939255885,
"grad_norm": 1.1810082686737633,
"learning_rate": 8.401973495481289e-06,
"loss": 0.2706,
"step": 554
},
{
"epoch": 0.33712984054669703,
"grad_norm": 0.8806178339494458,
"learning_rate": 8.39419830304365e-06,
"loss": 0.2847,
"step": 555
},
{
"epoch": 0.3377372817008352,
"grad_norm": 1.1386249132431194,
"learning_rate": 8.386407858128707e-06,
"loss": 0.2851,
"step": 556
},
{
"epoch": 0.3383447228549734,
"grad_norm": 1.028806142818622,
"learning_rate": 8.378602195744308e-06,
"loss": 0.3078,
"step": 557
},
{
"epoch": 0.3389521640091116,
"grad_norm": 1.0457874244613665,
"learning_rate": 8.370781350966683e-06,
"loss": 0.3397,
"step": 558
},
{
"epoch": 0.3395596051632498,
"grad_norm": 0.8684256285368641,
"learning_rate": 8.362945358940295e-06,
"loss": 0.2842,
"step": 559
},
{
"epoch": 0.340167046317388,
"grad_norm": 0.9167915425454639,
"learning_rate": 8.355094254877665e-06,
"loss": 0.2658,
"step": 560
},
{
"epoch": 0.3407744874715262,
"grad_norm": 1.0174829618995838,
"learning_rate": 8.347228074059227e-06,
"loss": 0.3266,
"step": 561
},
{
"epoch": 0.3413819286256644,
"grad_norm": 0.960163727054609,
"learning_rate": 8.339346851833163e-06,
"loss": 0.2889,
"step": 562
},
{
"epoch": 0.3419893697798026,
"grad_norm": 0.9190612974975497,
"learning_rate": 8.33145062361525e-06,
"loss": 0.2993,
"step": 563
},
{
"epoch": 0.3425968109339408,
"grad_norm": 1.260688931755127,
"learning_rate": 8.323539424888695e-06,
"loss": 0.3011,
"step": 564
},
{
"epoch": 0.343204252088079,
"grad_norm": 0.8862857315713791,
"learning_rate": 8.315613291203977e-06,
"loss": 0.2745,
"step": 565
},
{
"epoch": 0.34381169324221716,
"grad_norm": 1.534384818793313,
"learning_rate": 8.30767225817869e-06,
"loss": 0.3015,
"step": 566
},
{
"epoch": 0.34441913439635535,
"grad_norm": 1.0421030425662317,
"learning_rate": 8.299716361497377e-06,
"loss": 0.2937,
"step": 567
},
{
"epoch": 0.34502657555049354,
"grad_norm": 1.078821559929711,
"learning_rate": 8.291745636911382e-06,
"loss": 0.3104,
"step": 568
},
{
"epoch": 0.34563401670463173,
"grad_norm": 1.0058600245745741,
"learning_rate": 8.283760120238672e-06,
"loss": 0.3077,
"step": 569
},
{
"epoch": 0.3462414578587699,
"grad_norm": 0.8608607521318042,
"learning_rate": 8.27575984736369e-06,
"loss": 0.2701,
"step": 570
},
{
"epoch": 0.3468488990129081,
"grad_norm": 1.0527010355534043,
"learning_rate": 8.26774485423719e-06,
"loss": 0.3196,
"step": 571
},
{
"epoch": 0.3474563401670463,
"grad_norm": 0.92875430774757,
"learning_rate": 8.259715176876069e-06,
"loss": 0.2782,
"step": 572
},
{
"epoch": 0.3480637813211845,
"grad_norm": 1.3921020583644004,
"learning_rate": 8.251670851363214e-06,
"loss": 0.3346,
"step": 573
},
{
"epoch": 0.34867122247532273,
"grad_norm": 0.9590626440697687,
"learning_rate": 8.243611913847337e-06,
"loss": 0.2824,
"step": 574
},
{
"epoch": 0.3492786636294609,
"grad_norm": 0.9520186243816708,
"learning_rate": 8.235538400542809e-06,
"loss": 0.27,
"step": 575
},
{
"epoch": 0.3498861047835991,
"grad_norm": 0.9591343436311952,
"learning_rate": 8.2274503477295e-06,
"loss": 0.2719,
"step": 576
},
{
"epoch": 0.3504935459377373,
"grad_norm": 1.0279410178241586,
"learning_rate": 8.21934779175262e-06,
"loss": 0.3191,
"step": 577
},
{
"epoch": 0.3511009870918755,
"grad_norm": 0.941703751218152,
"learning_rate": 8.211230769022552e-06,
"loss": 0.302,
"step": 578
},
{
"epoch": 0.35170842824601367,
"grad_norm": 0.9536297735317896,
"learning_rate": 8.203099316014679e-06,
"loss": 0.3236,
"step": 579
},
{
"epoch": 0.35231586940015186,
"grad_norm": 1.0851175895827145,
"learning_rate": 8.19495346926924e-06,
"loss": 0.3174,
"step": 580
},
{
"epoch": 0.35292331055429005,
"grad_norm": 0.9375847401656001,
"learning_rate": 8.18679326539115e-06,
"loss": 0.2984,
"step": 581
},
{
"epoch": 0.35353075170842824,
"grad_norm": 1.2513870142868544,
"learning_rate": 8.178618741049841e-06,
"loss": 0.3142,
"step": 582
},
{
"epoch": 0.3541381928625664,
"grad_norm": 1.0326942039827636,
"learning_rate": 8.170429932979097e-06,
"loss": 0.3118,
"step": 583
},
{
"epoch": 0.3547456340167046,
"grad_norm": 0.97117910690105,
"learning_rate": 8.162226877976886e-06,
"loss": 0.3114,
"step": 584
},
{
"epoch": 0.3553530751708428,
"grad_norm": 1.105458493777876,
"learning_rate": 8.154009612905205e-06,
"loss": 0.3252,
"step": 585
},
{
"epoch": 0.355960516324981,
"grad_norm": 1.1341020674749815,
"learning_rate": 8.145778174689897e-06,
"loss": 0.3388,
"step": 586
},
{
"epoch": 0.35656795747911924,
"grad_norm": 0.8971554688120575,
"learning_rate": 8.137532600320502e-06,
"loss": 0.2955,
"step": 587
},
{
"epoch": 0.3571753986332574,
"grad_norm": 1.0151980524067494,
"learning_rate": 8.129272926850079e-06,
"loss": 0.2949,
"step": 588
},
{
"epoch": 0.3577828397873956,
"grad_norm": 0.941672453155367,
"learning_rate": 8.120999191395048e-06,
"loss": 0.2819,
"step": 589
},
{
"epoch": 0.3583902809415338,
"grad_norm": 1.057217581283722,
"learning_rate": 8.112711431135014e-06,
"loss": 0.288,
"step": 590
},
{
"epoch": 0.358997722095672,
"grad_norm": 1.375861607230768,
"learning_rate": 8.10440968331261e-06,
"loss": 0.2897,
"step": 591
},
{
"epoch": 0.3596051632498102,
"grad_norm": 6.874880808118071,
"learning_rate": 8.096093985233323e-06,
"loss": 0.3182,
"step": 592
},
{
"epoch": 0.36021260440394837,
"grad_norm": 1.0459697501354848,
"learning_rate": 8.087764374265325e-06,
"loss": 0.3171,
"step": 593
},
{
"epoch": 0.36082004555808656,
"grad_norm": 1.0424410896636964,
"learning_rate": 8.079420887839316e-06,
"loss": 0.2841,
"step": 594
},
{
"epoch": 0.36142748671222474,
"grad_norm": 0.9398782517670305,
"learning_rate": 8.071063563448341e-06,
"loss": 0.2975,
"step": 595
},
{
"epoch": 0.36203492786636293,
"grad_norm": 0.9740196516759212,
"learning_rate": 8.062692438647628e-06,
"loss": 0.3001,
"step": 596
},
{
"epoch": 0.3626423690205011,
"grad_norm": 1.1401753407703934,
"learning_rate": 8.054307551054427e-06,
"loss": 0.3006,
"step": 597
},
{
"epoch": 0.3632498101746393,
"grad_norm": 1.5471172495911225,
"learning_rate": 8.045908938347828e-06,
"loss": 0.2829,
"step": 598
},
{
"epoch": 0.3638572513287775,
"grad_norm": 1.024433089085474,
"learning_rate": 8.037496638268599e-06,
"loss": 0.3338,
"step": 599
},
{
"epoch": 0.36446469248291574,
"grad_norm": 1.0602623846991535,
"learning_rate": 8.029070688619013e-06,
"loss": 0.2817,
"step": 600
},
{
"epoch": 0.36507213363705393,
"grad_norm": 0.9832965892055614,
"learning_rate": 8.020631127262681e-06,
"loss": 0.2928,
"step": 601
},
{
"epoch": 0.3656795747911921,
"grad_norm": 1.0377272606893748,
"learning_rate": 8.012177992124385e-06,
"loss": 0.3163,
"step": 602
},
{
"epoch": 0.3662870159453303,
"grad_norm": 1.162756936534878,
"learning_rate": 8.003711321189895e-06,
"loss": 0.3026,
"step": 603
},
{
"epoch": 0.3668944570994685,
"grad_norm": 0.9081590107485684,
"learning_rate": 7.995231152505815e-06,
"loss": 0.278,
"step": 604
},
{
"epoch": 0.3675018982536067,
"grad_norm": 1.0285104632076432,
"learning_rate": 7.986737524179398e-06,
"loss": 0.3198,
"step": 605
},
{
"epoch": 0.3681093394077449,
"grad_norm": 1.0089602950227836,
"learning_rate": 7.978230474378383e-06,
"loss": 0.2896,
"step": 606
},
{
"epoch": 0.36871678056188306,
"grad_norm": 0.9253892326788151,
"learning_rate": 7.96971004133082e-06,
"loss": 0.2973,
"step": 607
},
{
"epoch": 0.36932422171602125,
"grad_norm": 0.9373958997004501,
"learning_rate": 7.961176263324902e-06,
"loss": 0.2702,
"step": 608
},
{
"epoch": 0.36993166287015944,
"grad_norm": 1.0340689976239543,
"learning_rate": 7.952629178708783e-06,
"loss": 0.3086,
"step": 609
},
{
"epoch": 0.3705391040242976,
"grad_norm": 0.9434565199275107,
"learning_rate": 7.944068825890424e-06,
"loss": 0.2844,
"step": 610
},
{
"epoch": 0.3711465451784358,
"grad_norm": 2.0206497983922165,
"learning_rate": 7.935495243337397e-06,
"loss": 0.2996,
"step": 611
},
{
"epoch": 0.371753986332574,
"grad_norm": 1.220826721106363,
"learning_rate": 7.92690846957673e-06,
"loss": 0.2564,
"step": 612
},
{
"epoch": 0.37236142748671225,
"grad_norm": 0.9495640406098673,
"learning_rate": 7.918308543194735e-06,
"loss": 0.2789,
"step": 613
},
{
"epoch": 0.37296886864085044,
"grad_norm": 1.1839526557503743,
"learning_rate": 7.909695502836814e-06,
"loss": 0.3291,
"step": 614
},
{
"epoch": 0.3735763097949886,
"grad_norm": 0.9422030510378584,
"learning_rate": 7.90106938720731e-06,
"loss": 0.2658,
"step": 615
},
{
"epoch": 0.3741837509491268,
"grad_norm": 2.6158876200316943,
"learning_rate": 7.892430235069317e-06,
"loss": 0.3267,
"step": 616
},
{
"epoch": 0.374791192103265,
"grad_norm": 1.0569339252672818,
"learning_rate": 7.883778085244514e-06,
"loss": 0.3302,
"step": 617
},
{
"epoch": 0.3753986332574032,
"grad_norm": 0.9481970640626258,
"learning_rate": 7.875112976612984e-06,
"loss": 0.2825,
"step": 618
},
{
"epoch": 0.3760060744115414,
"grad_norm": 0.9971575781640729,
"learning_rate": 7.866434948113046e-06,
"loss": 0.2988,
"step": 619
},
{
"epoch": 0.37661351556567957,
"grad_norm": 0.9807473222056035,
"learning_rate": 7.857744038741076e-06,
"loss": 0.3192,
"step": 620
},
{
"epoch": 0.37722095671981776,
"grad_norm": 1.0243743769936524,
"learning_rate": 7.849040287551331e-06,
"loss": 0.3149,
"step": 621
},
{
"epoch": 0.37782839787395595,
"grad_norm": 0.9586280652231142,
"learning_rate": 7.84032373365578e-06,
"loss": 0.2815,
"step": 622
},
{
"epoch": 0.37843583902809413,
"grad_norm": 0.9369075735191363,
"learning_rate": 7.831594416223916e-06,
"loss": 0.3008,
"step": 623
},
{
"epoch": 0.3790432801822323,
"grad_norm": 3.308019410340958,
"learning_rate": 7.822852374482597e-06,
"loss": 0.3148,
"step": 624
},
{
"epoch": 0.37965072133637057,
"grad_norm": 1.164444173629437,
"learning_rate": 7.814097647715848e-06,
"loss": 0.3058,
"step": 625
},
{
"epoch": 0.38025816249050876,
"grad_norm": 1.4177016659749746,
"learning_rate": 7.805330275264707e-06,
"loss": 0.2889,
"step": 626
},
{
"epoch": 0.38086560364464694,
"grad_norm": 1.4357393277200918,
"learning_rate": 7.796550296527032e-06,
"loss": 0.2636,
"step": 627
},
{
"epoch": 0.38147304479878513,
"grad_norm": 1.0253772001951136,
"learning_rate": 7.787757750957335e-06,
"loss": 0.3026,
"step": 628
},
{
"epoch": 0.3820804859529233,
"grad_norm": 0.926193976032409,
"learning_rate": 7.778952678066591e-06,
"loss": 0.2613,
"step": 629
},
{
"epoch": 0.3826879271070615,
"grad_norm": 0.9522660951838047,
"learning_rate": 7.77013511742208e-06,
"loss": 0.3146,
"step": 630
},
{
"epoch": 0.3832953682611997,
"grad_norm": 1.0242396609306572,
"learning_rate": 7.761305108647188e-06,
"loss": 0.2957,
"step": 631
},
{
"epoch": 0.3839028094153379,
"grad_norm": 0.9802706086421458,
"learning_rate": 7.752462691421245e-06,
"loss": 0.2947,
"step": 632
},
{
"epoch": 0.3845102505694761,
"grad_norm": 0.9593635598943385,
"learning_rate": 7.743607905479338e-06,
"loss": 0.3063,
"step": 633
},
{
"epoch": 0.38511769172361426,
"grad_norm": 0.9149349282493551,
"learning_rate": 7.734740790612137e-06,
"loss": 0.2824,
"step": 634
},
{
"epoch": 0.38572513287775245,
"grad_norm": 0.8104078792665621,
"learning_rate": 7.72586138666571e-06,
"loss": 0.2546,
"step": 635
},
{
"epoch": 0.38633257403189064,
"grad_norm": 1.0286565476756544,
"learning_rate": 7.716969733541357e-06,
"loss": 0.2704,
"step": 636
},
{
"epoch": 0.38694001518602883,
"grad_norm": 0.8490863136577027,
"learning_rate": 7.708065871195413e-06,
"loss": 0.2606,
"step": 637
},
{
"epoch": 0.3875474563401671,
"grad_norm": 1.0903672375251494,
"learning_rate": 7.699149839639086e-06,
"loss": 0.3175,
"step": 638
},
{
"epoch": 0.38815489749430526,
"grad_norm": 1.4642223666112006,
"learning_rate": 7.690221678938258e-06,
"loss": 0.3641,
"step": 639
},
{
"epoch": 0.38876233864844345,
"grad_norm": 0.9534979437652094,
"learning_rate": 7.681281429213328e-06,
"loss": 0.2731,
"step": 640
},
{
"epoch": 0.38936977980258164,
"grad_norm": 1.1167627986990094,
"learning_rate": 7.672329130639007e-06,
"loss": 0.2791,
"step": 641
},
{
"epoch": 0.38997722095671983,
"grad_norm": 1.1634813414266154,
"learning_rate": 7.663364823444157e-06,
"loss": 0.3173,
"step": 642
},
{
"epoch": 0.390584662110858,
"grad_norm": 0.9674791602076483,
"learning_rate": 7.654388547911605e-06,
"loss": 0.3198,
"step": 643
},
{
"epoch": 0.3911921032649962,
"grad_norm": 0.9326003350894531,
"learning_rate": 7.645400344377953e-06,
"loss": 0.2446,
"step": 644
},
{
"epoch": 0.3917995444191344,
"grad_norm": 1.0664043566671932,
"learning_rate": 7.63640025323341e-06,
"loss": 0.2897,
"step": 645
},
{
"epoch": 0.3924069855732726,
"grad_norm": 1.1042940465708586,
"learning_rate": 7.627388314921602e-06,
"loss": 0.2964,
"step": 646
},
{
"epoch": 0.39301442672741077,
"grad_norm": 0.9931594882104945,
"learning_rate": 7.61836456993939e-06,
"loss": 0.28,
"step": 647
},
{
"epoch": 0.39362186788154896,
"grad_norm": 1.471343399649596,
"learning_rate": 7.609329058836694e-06,
"loss": 0.3354,
"step": 648
},
{
"epoch": 0.39422930903568715,
"grad_norm": 1.368059808767781,
"learning_rate": 7.600281822216307e-06,
"loss": 0.312,
"step": 649
},
{
"epoch": 0.39483675018982534,
"grad_norm": 0.9867176052292568,
"learning_rate": 7.59122290073371e-06,
"loss": 0.2691,
"step": 650
},
{
"epoch": 0.3954441913439636,
"grad_norm": 1.0007304468451435,
"learning_rate": 7.582152335096896e-06,
"loss": 0.2817,
"step": 651
},
{
"epoch": 0.39605163249810177,
"grad_norm": 1.0529002222158175,
"learning_rate": 7.5730701660661795e-06,
"loss": 0.2804,
"step": 652
},
{
"epoch": 0.39665907365223996,
"grad_norm": 0.9111676203769294,
"learning_rate": 7.563976434454021e-06,
"loss": 0.2674,
"step": 653
},
{
"epoch": 0.39726651480637815,
"grad_norm": 0.928422868586772,
"learning_rate": 7.554871181124836e-06,
"loss": 0.2842,
"step": 654
},
{
"epoch": 0.39787395596051633,
"grad_norm": 0.9776630589483184,
"learning_rate": 7.5457544469948164e-06,
"loss": 0.2891,
"step": 655
},
{
"epoch": 0.3984813971146545,
"grad_norm": 0.9696610718502006,
"learning_rate": 7.536626273031747e-06,
"loss": 0.2815,
"step": 656
},
{
"epoch": 0.3990888382687927,
"grad_norm": 0.9352484358853785,
"learning_rate": 7.5274867002548154e-06,
"loss": 0.2666,
"step": 657
},
{
"epoch": 0.3996962794229309,
"grad_norm": 0.9663340246117754,
"learning_rate": 7.5183357697344395e-06,
"loss": 0.2834,
"step": 658
},
{
"epoch": 0.4003037205770691,
"grad_norm": 1.0770927677521116,
"learning_rate": 7.509173522592066e-06,
"loss": 0.3175,
"step": 659
},
{
"epoch": 0.4009111617312073,
"grad_norm": 0.9005571117374582,
"learning_rate": 7.500000000000001e-06,
"loss": 0.2918,
"step": 660
},
{
"epoch": 0.40151860288534547,
"grad_norm": 1.053393785111227,
"learning_rate": 7.4908152431812175e-06,
"loss": 0.2636,
"step": 661
},
{
"epoch": 0.40212604403948365,
"grad_norm": 0.9355577937726377,
"learning_rate": 7.481619293409173e-06,
"loss": 0.2656,
"step": 662
},
{
"epoch": 0.40273348519362184,
"grad_norm": 1.067579553680483,
"learning_rate": 7.472412192007619e-06,
"loss": 0.2734,
"step": 663
},
{
"epoch": 0.4033409263477601,
"grad_norm": 0.8813079944231941,
"learning_rate": 7.4631939803504215e-06,
"loss": 0.2411,
"step": 664
},
{
"epoch": 0.4039483675018983,
"grad_norm": 1.008354055836425,
"learning_rate": 7.453964699861376e-06,
"loss": 0.2621,
"step": 665
},
{
"epoch": 0.40455580865603646,
"grad_norm": 1.3145794725051865,
"learning_rate": 7.44472439201401e-06,
"loss": 0.3353,
"step": 666
},
{
"epoch": 0.40516324981017465,
"grad_norm": 1.0499020867013482,
"learning_rate": 7.435473098331411e-06,
"loss": 0.309,
"step": 667
},
{
"epoch": 0.40577069096431284,
"grad_norm": 1.0498570111550733,
"learning_rate": 7.426210860386032e-06,
"loss": 0.2863,
"step": 668
},
{
"epoch": 0.40637813211845103,
"grad_norm": 0.9381854841212456,
"learning_rate": 7.416937719799502e-06,
"loss": 0.3162,
"step": 669
},
{
"epoch": 0.4069855732725892,
"grad_norm": 1.0356224890807904,
"learning_rate": 7.407653718242449e-06,
"loss": 0.2835,
"step": 670
},
{
"epoch": 0.4075930144267274,
"grad_norm": 1.1471319493901884,
"learning_rate": 7.398358897434303e-06,
"loss": 0.2995,
"step": 671
},
{
"epoch": 0.4082004555808656,
"grad_norm": 0.9661407586663348,
"learning_rate": 7.3890532991431174e-06,
"loss": 0.2815,
"step": 672
},
{
"epoch": 0.4088078967350038,
"grad_norm": 1.236474905505436,
"learning_rate": 7.379736965185369e-06,
"loss": 0.3244,
"step": 673
},
{
"epoch": 0.409415337889142,
"grad_norm": 1.0600796964601655,
"learning_rate": 7.370409937425781e-06,
"loss": 0.2994,
"step": 674
},
{
"epoch": 0.41002277904328016,
"grad_norm": 0.9816189966958595,
"learning_rate": 7.361072257777132e-06,
"loss": 0.3046,
"step": 675
},
{
"epoch": 0.41063022019741835,
"grad_norm": 1.1164647876294425,
"learning_rate": 7.3517239682000675e-06,
"loss": 0.2676,
"step": 676
},
{
"epoch": 0.4112376613515566,
"grad_norm": 0.836806710584883,
"learning_rate": 7.342365110702907e-06,
"loss": 0.2233,
"step": 677
},
{
"epoch": 0.4118451025056948,
"grad_norm": 1.0083900885239343,
"learning_rate": 7.332995727341462e-06,
"loss": 0.3282,
"step": 678
},
{
"epoch": 0.41245254365983297,
"grad_norm": 0.8982701658241349,
"learning_rate": 7.323615860218844e-06,
"loss": 0.2366,
"step": 679
},
{
"epoch": 0.41305998481397116,
"grad_norm": 0.8535635256156215,
"learning_rate": 7.314225551485273e-06,
"loss": 0.2424,
"step": 680
},
{
"epoch": 0.41366742596810935,
"grad_norm": 1.122301845672972,
"learning_rate": 7.304824843337893e-06,
"loss": 0.3075,
"step": 681
},
{
"epoch": 0.41427486712224754,
"grad_norm": 0.9413071187408356,
"learning_rate": 7.295413778020579e-06,
"loss": 0.2862,
"step": 682
},
{
"epoch": 0.4148823082763857,
"grad_norm": 1.0778162018478468,
"learning_rate": 7.285992397823747e-06,
"loss": 0.3039,
"step": 683
},
{
"epoch": 0.4154897494305239,
"grad_norm": 1.0058545387610198,
"learning_rate": 7.276560745084167e-06,
"loss": 0.2982,
"step": 684
},
{
"epoch": 0.4160971905846621,
"grad_norm": 1.247146796633824,
"learning_rate": 7.267118862184767e-06,
"loss": 0.2769,
"step": 685
},
{
"epoch": 0.4167046317388003,
"grad_norm": 0.9580540977436537,
"learning_rate": 7.257666791554448e-06,
"loss": 0.3057,
"step": 686
},
{
"epoch": 0.4173120728929385,
"grad_norm": 4.047797878525422,
"learning_rate": 7.248204575667893e-06,
"loss": 0.2686,
"step": 687
},
{
"epoch": 0.41791951404707667,
"grad_norm": 0.9842953093021652,
"learning_rate": 7.2387322570453724e-06,
"loss": 0.2819,
"step": 688
},
{
"epoch": 0.41852695520121486,
"grad_norm": 0.9798887505619451,
"learning_rate": 7.229249878252558e-06,
"loss": 0.2659,
"step": 689
},
{
"epoch": 0.4191343963553531,
"grad_norm": 1.0112521392212896,
"learning_rate": 7.219757481900325e-06,
"loss": 0.2878,
"step": 690
},
{
"epoch": 0.4197418375094913,
"grad_norm": 0.9383147067402705,
"learning_rate": 7.210255110644569e-06,
"loss": 0.3096,
"step": 691
},
{
"epoch": 0.4203492786636295,
"grad_norm": 0.9751290815844371,
"learning_rate": 7.2007428071860045e-06,
"loss": 0.3036,
"step": 692
},
{
"epoch": 0.42095671981776767,
"grad_norm": 0.954940864157478,
"learning_rate": 7.191220614269981e-06,
"loss": 0.2748,
"step": 693
},
{
"epoch": 0.42156416097190585,
"grad_norm": 0.955968898320391,
"learning_rate": 7.181688574686292e-06,
"loss": 0.294,
"step": 694
},
{
"epoch": 0.42217160212604404,
"grad_norm": 1.0024277031805326,
"learning_rate": 7.17214673126897e-06,
"loss": 0.2817,
"step": 695
},
{
"epoch": 0.42277904328018223,
"grad_norm": 1.1088991455813102,
"learning_rate": 7.162595126896111e-06,
"loss": 0.2699,
"step": 696
},
{
"epoch": 0.4233864844343204,
"grad_norm": 0.9068874217571249,
"learning_rate": 7.15303380448967e-06,
"loss": 0.2666,
"step": 697
},
{
"epoch": 0.4239939255884586,
"grad_norm": 0.9846540050512566,
"learning_rate": 7.143462807015271e-06,
"loss": 0.2996,
"step": 698
},
{
"epoch": 0.4246013667425968,
"grad_norm": 1.2920144608956823,
"learning_rate": 7.133882177482019e-06,
"loss": 0.283,
"step": 699
},
{
"epoch": 0.425208807896735,
"grad_norm": 0.9844981703237418,
"learning_rate": 7.1242919589422974e-06,
"loss": 0.283,
"step": 700
},
{
"epoch": 0.4258162490508732,
"grad_norm": 1.0792092917236789,
"learning_rate": 7.114692194491583e-06,
"loss": 0.2771,
"step": 701
},
{
"epoch": 0.42642369020501136,
"grad_norm": 1.1366050037820674,
"learning_rate": 7.105082927268247e-06,
"loss": 0.3462,
"step": 702
},
{
"epoch": 0.4270311313591496,
"grad_norm": 1.9892569077431685,
"learning_rate": 7.095464200453366e-06,
"loss": 0.2657,
"step": 703
},
{
"epoch": 0.4276385725132878,
"grad_norm": 1.030861638233516,
"learning_rate": 7.085836057270521e-06,
"loss": 0.3113,
"step": 704
},
{
"epoch": 0.428246013667426,
"grad_norm": 1.070680183715172,
"learning_rate": 7.07619854098561e-06,
"loss": 0.2622,
"step": 705
},
{
"epoch": 0.4288534548215642,
"grad_norm": 1.0086658824371333,
"learning_rate": 7.066551694906651e-06,
"loss": 0.254,
"step": 706
},
{
"epoch": 0.42946089597570236,
"grad_norm": 1.2178606474849707,
"learning_rate": 7.056895562383585e-06,
"loss": 0.283,
"step": 707
},
{
"epoch": 0.43006833712984055,
"grad_norm": 1.0905334429821039,
"learning_rate": 7.047230186808085e-06,
"loss": 0.2979,
"step": 708
},
{
"epoch": 0.43067577828397874,
"grad_norm": 1.0513698337032553,
"learning_rate": 7.0375556116133605e-06,
"loss": 0.3149,
"step": 709
},
{
"epoch": 0.4312832194381169,
"grad_norm": 0.905817609480293,
"learning_rate": 7.027871880273959e-06,
"loss": 0.267,
"step": 710
},
{
"epoch": 0.4318906605922551,
"grad_norm": 0.8779355405635809,
"learning_rate": 7.018179036305574e-06,
"loss": 0.2777,
"step": 711
},
{
"epoch": 0.4324981017463933,
"grad_norm": 0.9741991469020347,
"learning_rate": 7.008477123264849e-06,
"loss": 0.2881,
"step": 712
},
{
"epoch": 0.4331055429005315,
"grad_norm": 1.0958805822729734,
"learning_rate": 6.9987661847491786e-06,
"loss": 0.2688,
"step": 713
},
{
"epoch": 0.4337129840546697,
"grad_norm": 0.9812819976899622,
"learning_rate": 6.989046264396516e-06,
"loss": 0.3073,
"step": 714
},
{
"epoch": 0.43432042520880787,
"grad_norm": 0.9280917573243459,
"learning_rate": 6.9793174058851805e-06,
"loss": 0.2942,
"step": 715
},
{
"epoch": 0.4349278663629461,
"grad_norm": 0.9208122228856829,
"learning_rate": 6.96957965293365e-06,
"loss": 0.2603,
"step": 716
},
{
"epoch": 0.4355353075170843,
"grad_norm": 1.0481626035767557,
"learning_rate": 6.959833049300376e-06,
"loss": 0.2659,
"step": 717
},
{
"epoch": 0.4361427486712225,
"grad_norm": 0.8888068906740508,
"learning_rate": 6.9500776387835785e-06,
"loss": 0.2626,
"step": 718
},
{
"epoch": 0.4367501898253607,
"grad_norm": 0.9005255044425835,
"learning_rate": 6.940313465221057e-06,
"loss": 0.283,
"step": 719
},
{
"epoch": 0.43735763097949887,
"grad_norm": 0.9313053275341852,
"learning_rate": 6.9305405724899876e-06,
"loss": 0.2878,
"step": 720
},
{
"epoch": 0.43796507213363706,
"grad_norm": 1.0001487266577187,
"learning_rate": 6.920759004506723e-06,
"loss": 0.2381,
"step": 721
},
{
"epoch": 0.43857251328777525,
"grad_norm": 1.104902424962282,
"learning_rate": 6.91096880522661e-06,
"loss": 0.3125,
"step": 722
},
{
"epoch": 0.43917995444191343,
"grad_norm": 0.9710695360675384,
"learning_rate": 6.90117001864377e-06,
"loss": 0.2778,
"step": 723
},
{
"epoch": 0.4397873955960516,
"grad_norm": 0.8790773492913548,
"learning_rate": 6.891362688790925e-06,
"loss": 0.2741,
"step": 724
},
{
"epoch": 0.4403948367501898,
"grad_norm": 1.27448461099361,
"learning_rate": 6.8815468597391785e-06,
"loss": 0.2961,
"step": 725
},
{
"epoch": 0.441002277904328,
"grad_norm": 1.8340194618418497,
"learning_rate": 6.871722575597829e-06,
"loss": 0.2806,
"step": 726
},
{
"epoch": 0.4416097190584662,
"grad_norm": 0.9529710613769301,
"learning_rate": 6.8618898805141744e-06,
"loss": 0.2788,
"step": 727
},
{
"epoch": 0.4422171602126044,
"grad_norm": 1.1180921584509682,
"learning_rate": 6.8520488186733e-06,
"loss": 0.2916,
"step": 728
},
{
"epoch": 0.4428246013667426,
"grad_norm": 0.9569903924698052,
"learning_rate": 6.8421994342979e-06,
"loss": 0.297,
"step": 729
},
{
"epoch": 0.4434320425208808,
"grad_norm": 0.9663109439234678,
"learning_rate": 6.832341771648057e-06,
"loss": 0.2934,
"step": 730
},
{
"epoch": 0.444039483675019,
"grad_norm": 0.9525035732233259,
"learning_rate": 6.822475875021057e-06,
"loss": 0.2924,
"step": 731
},
{
"epoch": 0.4446469248291572,
"grad_norm": 1.6736139762109075,
"learning_rate": 6.812601788751192e-06,
"loss": 0.2875,
"step": 732
},
{
"epoch": 0.4452543659832954,
"grad_norm": 1.3961577247630492,
"learning_rate": 6.802719557209547e-06,
"loss": 0.2723,
"step": 733
},
{
"epoch": 0.44586180713743356,
"grad_norm": 1.0192583283909253,
"learning_rate": 6.792829224803816e-06,
"loss": 0.2902,
"step": 734
},
{
"epoch": 0.44646924829157175,
"grad_norm": 1.403077195941947,
"learning_rate": 6.782930835978094e-06,
"loss": 0.3298,
"step": 735
},
{
"epoch": 0.44707668944570994,
"grad_norm": 0.912658050114097,
"learning_rate": 6.773024435212678e-06,
"loss": 0.2654,
"step": 736
},
{
"epoch": 0.44768413059984813,
"grad_norm": 1.0482352218310602,
"learning_rate": 6.76311006702387e-06,
"loss": 0.27,
"step": 737
},
{
"epoch": 0.4482915717539863,
"grad_norm": 0.8572502039817809,
"learning_rate": 6.753187775963773e-06,
"loss": 0.245,
"step": 738
},
{
"epoch": 0.4488990129081245,
"grad_norm": 1.0913308805109663,
"learning_rate": 6.743257606620094e-06,
"loss": 0.2551,
"step": 739
},
{
"epoch": 0.4495064540622627,
"grad_norm": 0.9439876776084938,
"learning_rate": 6.733319603615941e-06,
"loss": 0.274,
"step": 740
},
{
"epoch": 0.45011389521640094,
"grad_norm": 0.9593594109115189,
"learning_rate": 6.723373811609628e-06,
"loss": 0.2698,
"step": 741
},
{
"epoch": 0.4507213363705391,
"grad_norm": 1.065483676595223,
"learning_rate": 6.713420275294467e-06,
"loss": 0.3096,
"step": 742
},
{
"epoch": 0.4513287775246773,
"grad_norm": 1.0245095563483146,
"learning_rate": 6.703459039398571e-06,
"loss": 0.3101,
"step": 743
},
{
"epoch": 0.4519362186788155,
"grad_norm": 0.8544225145366473,
"learning_rate": 6.693490148684654e-06,
"loss": 0.2478,
"step": 744
},
{
"epoch": 0.4525436598329537,
"grad_norm": 1.1603691969888617,
"learning_rate": 6.683513647949826e-06,
"loss": 0.3075,
"step": 745
},
{
"epoch": 0.4531511009870919,
"grad_norm": 1.052592239520445,
"learning_rate": 6.673529582025398e-06,
"loss": 0.2737,
"step": 746
},
{
"epoch": 0.45375854214123007,
"grad_norm": 3.117949544460077,
"learning_rate": 6.66353799577667e-06,
"loss": 0.2791,
"step": 747
},
{
"epoch": 0.45436598329536826,
"grad_norm": 0.9613711534282726,
"learning_rate": 6.653538934102743e-06,
"loss": 0.3014,
"step": 748
},
{
"epoch": 0.45497342444950645,
"grad_norm": 0.9748105893226726,
"learning_rate": 6.643532441936307e-06,
"loss": 0.2749,
"step": 749
},
{
"epoch": 0.45558086560364464,
"grad_norm": 0.981233876336576,
"learning_rate": 6.633518564243442e-06,
"loss": 0.2971,
"step": 750
},
{
"epoch": 0.4561883067577828,
"grad_norm": 0.9457213271619488,
"learning_rate": 6.6234973460234184e-06,
"loss": 0.2868,
"step": 751
},
{
"epoch": 0.456795747911921,
"grad_norm": 2.686802516445107,
"learning_rate": 6.6134688323084884e-06,
"loss": 0.2731,
"step": 752
},
{
"epoch": 0.4574031890660592,
"grad_norm": 0.9223972209238886,
"learning_rate": 6.603433068163694e-06,
"loss": 0.2616,
"step": 753
},
{
"epoch": 0.45801063022019745,
"grad_norm": 1.025121154265423,
"learning_rate": 6.593390098686653e-06,
"loss": 0.2907,
"step": 754
},
{
"epoch": 0.45861807137433563,
"grad_norm": 0.9622665588756997,
"learning_rate": 6.583339969007364e-06,
"loss": 0.3044,
"step": 755
},
{
"epoch": 0.4592255125284738,
"grad_norm": 0.9057032918476721,
"learning_rate": 6.573282724288001e-06,
"loss": 0.2728,
"step": 756
},
{
"epoch": 0.459832953682612,
"grad_norm": 0.9234610842082313,
"learning_rate": 6.563218409722712e-06,
"loss": 0.276,
"step": 757
},
{
"epoch": 0.4604403948367502,
"grad_norm": 0.958933012871059,
"learning_rate": 6.553147070537413e-06,
"loss": 0.2777,
"step": 758
},
{
"epoch": 0.4610478359908884,
"grad_norm": 1.0680318113393608,
"learning_rate": 6.543068751989585e-06,
"loss": 0.2765,
"step": 759
},
{
"epoch": 0.4616552771450266,
"grad_norm": 0.9152338969224387,
"learning_rate": 6.532983499368078e-06,
"loss": 0.2931,
"step": 760
},
{
"epoch": 0.46226271829916477,
"grad_norm": 0.9026579775576341,
"learning_rate": 6.522891357992895e-06,
"loss": 0.2519,
"step": 761
},
{
"epoch": 0.46287015945330295,
"grad_norm": 0.9283253808655062,
"learning_rate": 6.512792373215e-06,
"loss": 0.2804,
"step": 762
},
{
"epoch": 0.46347760060744114,
"grad_norm": 0.9173677882111563,
"learning_rate": 6.502686590416105e-06,
"loss": 0.2734,
"step": 763
},
{
"epoch": 0.46408504176157933,
"grad_norm": 0.9717106384462386,
"learning_rate": 6.492574055008474e-06,
"loss": 0.2671,
"step": 764
},
{
"epoch": 0.4646924829157175,
"grad_norm": 1.0114399277783983,
"learning_rate": 6.482454812434711e-06,
"loss": 0.2843,
"step": 765
},
{
"epoch": 0.4652999240698557,
"grad_norm": 0.9171006340811538,
"learning_rate": 6.472328908167562e-06,
"loss": 0.2744,
"step": 766
},
{
"epoch": 0.46590736522399395,
"grad_norm": 1.0070946523066167,
"learning_rate": 6.4621963877097105e-06,
"loss": 0.2838,
"step": 767
},
{
"epoch": 0.46651480637813214,
"grad_norm": 0.9699071928177333,
"learning_rate": 6.452057296593568e-06,
"loss": 0.3075,
"step": 768
},
{
"epoch": 0.46712224753227033,
"grad_norm": 1.2714437302384922,
"learning_rate": 6.441911680381074e-06,
"loss": 0.2803,
"step": 769
},
{
"epoch": 0.4677296886864085,
"grad_norm": 0.8642721638499515,
"learning_rate": 6.431759584663492e-06,
"loss": 0.2594,
"step": 770
},
{
"epoch": 0.4683371298405467,
"grad_norm": 0.9642319769554307,
"learning_rate": 6.421601055061195e-06,
"loss": 0.294,
"step": 771
},
{
"epoch": 0.4689445709946849,
"grad_norm": 0.9718176611982676,
"learning_rate": 6.411436137223479e-06,
"loss": 0.276,
"step": 772
},
{
"epoch": 0.4695520121488231,
"grad_norm": 1.044555593128521,
"learning_rate": 6.401264876828335e-06,
"loss": 0.2739,
"step": 773
},
{
"epoch": 0.47015945330296127,
"grad_norm": 1.2599360050994348,
"learning_rate": 6.391087319582264e-06,
"loss": 0.2689,
"step": 774
},
{
"epoch": 0.47076689445709946,
"grad_norm": 1.0049522316698594,
"learning_rate": 6.38090351122006e-06,
"loss": 0.2523,
"step": 775
},
{
"epoch": 0.47137433561123765,
"grad_norm": 0.966627950881477,
"learning_rate": 6.370713497504607e-06,
"loss": 0.2443,
"step": 776
},
{
"epoch": 0.47198177676537584,
"grad_norm": 0.9792802121829564,
"learning_rate": 6.360517324226676e-06,
"loss": 0.2783,
"step": 777
},
{
"epoch": 0.472589217919514,
"grad_norm": 0.8715410479118867,
"learning_rate": 6.350315037204714e-06,
"loss": 0.272,
"step": 778
},
{
"epoch": 0.4731966590736522,
"grad_norm": 1.0001665140104699,
"learning_rate": 6.340106682284645e-06,
"loss": 0.2838,
"step": 779
},
{
"epoch": 0.47380410022779046,
"grad_norm": 0.934121158767379,
"learning_rate": 6.329892305339659e-06,
"loss": 0.2557,
"step": 780
},
{
"epoch": 0.47441154138192865,
"grad_norm": 1.00237931674153,
"learning_rate": 6.319671952270004e-06,
"loss": 0.2729,
"step": 781
},
{
"epoch": 0.47501898253606684,
"grad_norm": 1.222988126809238,
"learning_rate": 6.309445669002787e-06,
"loss": 0.2493,
"step": 782
},
{
"epoch": 0.475626423690205,
"grad_norm": 1.107165014150917,
"learning_rate": 6.299213501491761e-06,
"loss": 0.3008,
"step": 783
},
{
"epoch": 0.4762338648443432,
"grad_norm": 1.2066643126228012,
"learning_rate": 6.288975495717124e-06,
"loss": 0.2867,
"step": 784
},
{
"epoch": 0.4768413059984814,
"grad_norm": 0.9028058894006271,
"learning_rate": 6.2787316976853045e-06,
"loss": 0.2495,
"step": 785
},
{
"epoch": 0.4774487471526196,
"grad_norm": 0.8790390490726189,
"learning_rate": 6.268482153428763e-06,
"loss": 0.2348,
"step": 786
},
{
"epoch": 0.4780561883067578,
"grad_norm": 0.9822253223369787,
"learning_rate": 6.258226909005783e-06,
"loss": 0.2809,
"step": 787
},
{
"epoch": 0.47866362946089597,
"grad_norm": 0.9996010163514261,
"learning_rate": 6.247966010500258e-06,
"loss": 0.2794,
"step": 788
},
{
"epoch": 0.47927107061503416,
"grad_norm": 0.95106120425031,
"learning_rate": 6.237699504021495e-06,
"loss": 0.2892,
"step": 789
},
{
"epoch": 0.47987851176917234,
"grad_norm": 0.9498966653557624,
"learning_rate": 6.227427435703997e-06,
"loss": 0.277,
"step": 790
},
{
"epoch": 0.48048595292331053,
"grad_norm": 0.9516961785802474,
"learning_rate": 6.217149851707261e-06,
"loss": 0.253,
"step": 791
},
{
"epoch": 0.4810933940774487,
"grad_norm": 0.948579051817591,
"learning_rate": 6.206866798215571e-06,
"loss": 0.2675,
"step": 792
},
{
"epoch": 0.48170083523158697,
"grad_norm": 0.8987027506170783,
"learning_rate": 6.1965783214377895e-06,
"loss": 0.2498,
"step": 793
},
{
"epoch": 0.48230827638572515,
"grad_norm": 0.8985403214008115,
"learning_rate": 6.186284467607149e-06,
"loss": 0.2594,
"step": 794
},
{
"epoch": 0.48291571753986334,
"grad_norm": 0.9277532386845908,
"learning_rate": 6.175985282981042e-06,
"loss": 0.2644,
"step": 795
},
{
"epoch": 0.48352315869400153,
"grad_norm": 1.1995838839316881,
"learning_rate": 6.165680813840822e-06,
"loss": 0.2546,
"step": 796
},
{
"epoch": 0.4841305998481397,
"grad_norm": 1.801894016231239,
"learning_rate": 6.155371106491584e-06,
"loss": 0.3234,
"step": 797
},
{
"epoch": 0.4847380410022779,
"grad_norm": 0.906615804627482,
"learning_rate": 6.1450562072619635e-06,
"loss": 0.245,
"step": 798
},
{
"epoch": 0.4853454821564161,
"grad_norm": 1.0302579157248837,
"learning_rate": 6.134736162503929e-06,
"loss": 0.2631,
"step": 799
},
{
"epoch": 0.4859529233105543,
"grad_norm": 0.9384658833738736,
"learning_rate": 6.124411018592568e-06,
"loss": 0.2632,
"step": 800
},
{
"epoch": 0.4865603644646925,
"grad_norm": 0.8947013802717417,
"learning_rate": 6.114080821925885e-06,
"loss": 0.272,
"step": 801
},
{
"epoch": 0.48716780561883066,
"grad_norm": 1.3316960693705406,
"learning_rate": 6.103745618924587e-06,
"loss": 0.2577,
"step": 802
},
{
"epoch": 0.48777524677296885,
"grad_norm": 1.0028338947085624,
"learning_rate": 6.09340545603188e-06,
"loss": 0.2925,
"step": 803
},
{
"epoch": 0.48838268792710704,
"grad_norm": 1.0004427933236308,
"learning_rate": 6.0830603797132574e-06,
"loss": 0.2582,
"step": 804
},
{
"epoch": 0.48899012908124523,
"grad_norm": 0.9812737917561756,
"learning_rate": 6.072710436456293e-06,
"loss": 0.2832,
"step": 805
},
{
"epoch": 0.4895975702353835,
"grad_norm": 0.9548200362731749,
"learning_rate": 6.0623556727704306e-06,
"loss": 0.2676,
"step": 806
},
{
"epoch": 0.49020501138952166,
"grad_norm": 0.9929722264569331,
"learning_rate": 6.051996135186774e-06,
"loss": 0.289,
"step": 807
},
{
"epoch": 0.49081245254365985,
"grad_norm": 1.2854841849196001,
"learning_rate": 6.041631870257882e-06,
"loss": 0.2847,
"step": 808
},
{
"epoch": 0.49141989369779804,
"grad_norm": 1.0000863726047369,
"learning_rate": 6.0312629245575534e-06,
"loss": 0.3195,
"step": 809
},
{
"epoch": 0.4920273348519362,
"grad_norm": 0.9562458448178358,
"learning_rate": 6.020889344680627e-06,
"loss": 0.3136,
"step": 810
},
{
"epoch": 0.4926347760060744,
"grad_norm": 3.4103757955936653,
"learning_rate": 6.010511177242757e-06,
"loss": 0.285,
"step": 811
},
{
"epoch": 0.4932422171602126,
"grad_norm": 0.9236674911746163,
"learning_rate": 6.000128468880223e-06,
"loss": 0.2651,
"step": 812
},
{
"epoch": 0.4938496583143508,
"grad_norm": 1.50715645288713,
"learning_rate": 5.989741266249701e-06,
"loss": 0.2961,
"step": 813
},
{
"epoch": 0.494457099468489,
"grad_norm": 1.2283119233067459,
"learning_rate": 5.979349616028067e-06,
"loss": 0.3049,
"step": 814
},
{
"epoch": 0.49506454062262717,
"grad_norm": 1.0356567298104182,
"learning_rate": 5.9689535649121855e-06,
"loss": 0.2891,
"step": 815
},
{
"epoch": 0.49567198177676536,
"grad_norm": 0.9560994912289477,
"learning_rate": 5.958553159618693e-06,
"loss": 0.2619,
"step": 816
},
{
"epoch": 0.49627942293090355,
"grad_norm": 0.8976018878445121,
"learning_rate": 5.948148446883794e-06,
"loss": 0.2753,
"step": 817
},
{
"epoch": 0.49688686408504174,
"grad_norm": 1.057276020274219,
"learning_rate": 5.937739473463047e-06,
"loss": 0.3255,
"step": 818
},
{
"epoch": 0.49749430523918,
"grad_norm": 0.9249628177128568,
"learning_rate": 5.927326286131162e-06,
"loss": 0.2774,
"step": 819
},
{
"epoch": 0.49810174639331817,
"grad_norm": 0.9503742934078119,
"learning_rate": 5.916908931681781e-06,
"loss": 0.2771,
"step": 820
},
{
"epoch": 0.49870918754745636,
"grad_norm": 1.2103474150033977,
"learning_rate": 5.906487456927273e-06,
"loss": 0.2949,
"step": 821
},
{
"epoch": 0.49931662870159454,
"grad_norm": 0.9494133739592079,
"learning_rate": 5.896061908698521e-06,
"loss": 0.2771,
"step": 822
},
{
"epoch": 0.49992406985573273,
"grad_norm": 0.9595078564844419,
"learning_rate": 5.885632333844714e-06,
"loss": 0.2746,
"step": 823
},
{
"epoch": 0.5005315110098709,
"grad_norm": 0.8889046754711648,
"learning_rate": 5.8751987792331365e-06,
"loss": 0.2728,
"step": 824
},
{
"epoch": 0.5011389521640092,
"grad_norm": 0.9984221504514925,
"learning_rate": 5.864761291748956e-06,
"loss": 0.2669,
"step": 825
},
{
"epoch": 0.5017463933181473,
"grad_norm": 1.0642385661388791,
"learning_rate": 5.854319918295012e-06,
"loss": 0.2711,
"step": 826
},
{
"epoch": 0.5023538344722855,
"grad_norm": 1.1473823060409172,
"learning_rate": 5.843874705791607e-06,
"loss": 0.2463,
"step": 827
},
{
"epoch": 0.5029612756264237,
"grad_norm": 1.0570678499864414,
"learning_rate": 5.833425701176294e-06,
"loss": 0.3234,
"step": 828
},
{
"epoch": 0.5035687167805619,
"grad_norm": 1.1140589963258467,
"learning_rate": 5.82297295140367e-06,
"loss": 0.2757,
"step": 829
},
{
"epoch": 0.5041761579347,
"grad_norm": 0.8357461454559573,
"learning_rate": 5.812516503445158e-06,
"loss": 0.2555,
"step": 830
},
{
"epoch": 0.5047835990888383,
"grad_norm": 0.9851929754213505,
"learning_rate": 5.8020564042888015e-06,
"loss": 0.2864,
"step": 831
},
{
"epoch": 0.5053910402429764,
"grad_norm": 0.9649129922182244,
"learning_rate": 5.79159270093905e-06,
"loss": 0.2871,
"step": 832
},
{
"epoch": 0.5059984813971147,
"grad_norm": 0.9056292022966355,
"learning_rate": 5.781125440416552e-06,
"loss": 0.2611,
"step": 833
},
{
"epoch": 0.5066059225512528,
"grad_norm": 1.2561528927314698,
"learning_rate": 5.770654669757935e-06,
"loss": 0.2938,
"step": 834
},
{
"epoch": 0.507213363705391,
"grad_norm": 1.429458180902475,
"learning_rate": 5.760180436015604e-06,
"loss": 0.2726,
"step": 835
},
{
"epoch": 0.5078208048595292,
"grad_norm": 0.9977501941214085,
"learning_rate": 5.749702786257529e-06,
"loss": 0.2808,
"step": 836
},
{
"epoch": 0.5084282460136674,
"grad_norm": 0.8542206729805362,
"learning_rate": 5.739221767567025e-06,
"loss": 0.2515,
"step": 837
},
{
"epoch": 0.5090356871678057,
"grad_norm": 1.1569449493080222,
"learning_rate": 5.7287374270425475e-06,
"loss": 0.2764,
"step": 838
},
{
"epoch": 0.5096431283219438,
"grad_norm": 0.9444050762828973,
"learning_rate": 5.718249811797482e-06,
"loss": 0.2895,
"step": 839
},
{
"epoch": 0.510250569476082,
"grad_norm": 0.9584586543782557,
"learning_rate": 5.707758968959923e-06,
"loss": 0.2746,
"step": 840
},
{
"epoch": 0.5108580106302202,
"grad_norm": 0.8938350182024917,
"learning_rate": 5.69726494567248e-06,
"loss": 0.2646,
"step": 841
},
{
"epoch": 0.5114654517843584,
"grad_norm": 0.9952500187835128,
"learning_rate": 5.686767789092041e-06,
"loss": 0.2927,
"step": 842
},
{
"epoch": 0.5120728929384966,
"grad_norm": 0.8980910835024705,
"learning_rate": 5.676267546389587e-06,
"loss": 0.255,
"step": 843
},
{
"epoch": 0.5126803340926348,
"grad_norm": 0.9195976062449588,
"learning_rate": 5.6657642647499545e-06,
"loss": 0.2825,
"step": 844
},
{
"epoch": 0.5132877752467729,
"grad_norm": 1.0411762343769737,
"learning_rate": 5.655257991371646e-06,
"loss": 0.2614,
"step": 845
},
{
"epoch": 0.5138952164009112,
"grad_norm": 0.8901603514925267,
"learning_rate": 5.644748773466606e-06,
"loss": 0.2739,
"step": 846
},
{
"epoch": 0.5145026575550493,
"grad_norm": 0.995174149677262,
"learning_rate": 5.6342366582600035e-06,
"loss": 0.3136,
"step": 847
},
{
"epoch": 0.5151100987091876,
"grad_norm": 1.7141964167900545,
"learning_rate": 5.62372169299004e-06,
"loss": 0.2931,
"step": 848
},
{
"epoch": 0.5157175398633257,
"grad_norm": 0.9986737322206383,
"learning_rate": 5.613203924907711e-06,
"loss": 0.2635,
"step": 849
},
{
"epoch": 0.5163249810174639,
"grad_norm": 0.9184311409988306,
"learning_rate": 5.6026834012766155e-06,
"loss": 0.2523,
"step": 850
},
{
"epoch": 0.5169324221716022,
"grad_norm": 1.0089379913752443,
"learning_rate": 5.592160169372734e-06,
"loss": 0.2884,
"step": 851
},
{
"epoch": 0.5175398633257403,
"grad_norm": 0.9856271821475499,
"learning_rate": 5.581634276484211e-06,
"loss": 0.2701,
"step": 852
},
{
"epoch": 0.5181473044798786,
"grad_norm": 1.4723310483980634,
"learning_rate": 5.571105769911159e-06,
"loss": 0.3305,
"step": 853
},
{
"epoch": 0.5187547456340167,
"grad_norm": 0.972595176753459,
"learning_rate": 5.560574696965425e-06,
"loss": 0.2401,
"step": 854
},
{
"epoch": 0.5193621867881549,
"grad_norm": 0.9359549675099497,
"learning_rate": 5.550041104970398e-06,
"loss": 0.2647,
"step": 855
},
{
"epoch": 0.5199696279422931,
"grad_norm": 1.0700686033849647,
"learning_rate": 5.539505041260779e-06,
"loss": 0.2867,
"step": 856
},
{
"epoch": 0.5205770690964313,
"grad_norm": 1.8194028377094806,
"learning_rate": 5.528966553182379e-06,
"loss": 0.2492,
"step": 857
},
{
"epoch": 0.5211845102505694,
"grad_norm": 0.9354512112441699,
"learning_rate": 5.518425688091906e-06,
"loss": 0.2945,
"step": 858
},
{
"epoch": 0.5217919514047077,
"grad_norm": 0.8575059431791342,
"learning_rate": 5.507882493356745e-06,
"loss": 0.2579,
"step": 859
},
{
"epoch": 0.5223993925588458,
"grad_norm": 0.8801723522103503,
"learning_rate": 5.497337016354757e-06,
"loss": 0.2843,
"step": 860
},
{
"epoch": 0.5230068337129841,
"grad_norm": 1.055054705643407,
"learning_rate": 5.486789304474047e-06,
"loss": 0.2463,
"step": 861
},
{
"epoch": 0.5236142748671222,
"grad_norm": 0.9847077421418023,
"learning_rate": 5.476239405112775e-06,
"loss": 0.2961,
"step": 862
},
{
"epoch": 0.5242217160212604,
"grad_norm": 0.9528909479486961,
"learning_rate": 5.465687365678921e-06,
"loss": 0.2883,
"step": 863
},
{
"epoch": 0.5248291571753987,
"grad_norm": 0.9331008172836965,
"learning_rate": 5.45513323359009e-06,
"loss": 0.3138,
"step": 864
},
{
"epoch": 0.5254365983295368,
"grad_norm": 0.9666089596639151,
"learning_rate": 5.444577056273284e-06,
"loss": 0.2755,
"step": 865
},
{
"epoch": 0.5260440394836751,
"grad_norm": 0.9347956099649323,
"learning_rate": 5.434018881164702e-06,
"loss": 0.2701,
"step": 866
},
{
"epoch": 0.5266514806378132,
"grad_norm": 0.8850204653085014,
"learning_rate": 5.423458755709516e-06,
"loss": 0.284,
"step": 867
},
{
"epoch": 0.5272589217919514,
"grad_norm": 0.846363898916957,
"learning_rate": 5.412896727361663e-06,
"loss": 0.2381,
"step": 868
},
{
"epoch": 0.5278663629460896,
"grad_norm": 1.0043627806351294,
"learning_rate": 5.402332843583631e-06,
"loss": 0.2748,
"step": 869
},
{
"epoch": 0.5284738041002278,
"grad_norm": 1.0846231794328964,
"learning_rate": 5.391767151846247e-06,
"loss": 0.2717,
"step": 870
},
{
"epoch": 0.529081245254366,
"grad_norm": 1.046609605860445,
"learning_rate": 5.381199699628459e-06,
"loss": 0.2982,
"step": 871
},
{
"epoch": 0.5296886864085042,
"grad_norm": 0.912932730088245,
"learning_rate": 5.370630534417133e-06,
"loss": 0.2531,
"step": 872
},
{
"epoch": 0.5302961275626423,
"grad_norm": 1.1500626604620934,
"learning_rate": 5.360059703706823e-06,
"loss": 0.2995,
"step": 873
},
{
"epoch": 0.5309035687167806,
"grad_norm": 1.0117429899245785,
"learning_rate": 5.349487254999579e-06,
"loss": 0.2959,
"step": 874
},
{
"epoch": 0.5315110098709187,
"grad_norm": 1.2473392199840632,
"learning_rate": 5.3389132358047115e-06,
"loss": 0.28,
"step": 875
},
{
"epoch": 0.532118451025057,
"grad_norm": 0.9342083750338547,
"learning_rate": 5.328337693638591e-06,
"loss": 0.2856,
"step": 876
},
{
"epoch": 0.5327258921791952,
"grad_norm": 0.9455670305571291,
"learning_rate": 5.317760676024436e-06,
"loss": 0.2757,
"step": 877
},
{
"epoch": 0.5333333333333333,
"grad_norm": 1.7595753273077446,
"learning_rate": 5.307182230492089e-06,
"loss": 0.2757,
"step": 878
},
{
"epoch": 0.5339407744874716,
"grad_norm": 0.8967029314384648,
"learning_rate": 5.296602404577814e-06,
"loss": 0.2455,
"step": 879
},
{
"epoch": 0.5345482156416097,
"grad_norm": 1.3247032268401917,
"learning_rate": 5.286021245824075e-06,
"loss": 0.2947,
"step": 880
},
{
"epoch": 0.535155656795748,
"grad_norm": 0.9423673536300602,
"learning_rate": 5.275438801779328e-06,
"loss": 0.2687,
"step": 881
},
{
"epoch": 0.5357630979498861,
"grad_norm": 0.9635039934677604,
"learning_rate": 5.264855119997803e-06,
"loss": 0.283,
"step": 882
},
{
"epoch": 0.5363705391040243,
"grad_norm": 0.8537857300482113,
"learning_rate": 5.254270248039291e-06,
"loss": 0.2448,
"step": 883
},
{
"epoch": 0.5369779802581625,
"grad_norm": 0.983767837657037,
"learning_rate": 5.243684233468933e-06,
"loss": 0.285,
"step": 884
},
{
"epoch": 0.5375854214123007,
"grad_norm": 0.8691004474077461,
"learning_rate": 5.233097123857004e-06,
"loss": 0.2556,
"step": 885
},
{
"epoch": 0.5381928625664388,
"grad_norm": 0.9137454862081088,
"learning_rate": 5.222508966778702e-06,
"loss": 0.2484,
"step": 886
},
{
"epoch": 0.5388003037205771,
"grad_norm": 0.9557993995814925,
"learning_rate": 5.211919809813927e-06,
"loss": 0.2568,
"step": 887
},
{
"epoch": 0.5394077448747152,
"grad_norm": 0.9810092195544174,
"learning_rate": 5.201329700547077e-06,
"loss": 0.296,
"step": 888
},
{
"epoch": 0.5400151860288535,
"grad_norm": 0.9230699355344197,
"learning_rate": 5.190738686566826e-06,
"loss": 0.2641,
"step": 889
},
{
"epoch": 0.5406226271829917,
"grad_norm": 1.6906836035620436,
"learning_rate": 5.180146815465915e-06,
"loss": 0.28,
"step": 890
},
{
"epoch": 0.5412300683371298,
"grad_norm": 0.9241916165908446,
"learning_rate": 5.169554134840937e-06,
"loss": 0.2646,
"step": 891
},
{
"epoch": 0.5418375094912681,
"grad_norm": 0.9358728519319159,
"learning_rate": 5.158960692292122e-06,
"loss": 0.267,
"step": 892
},
{
"epoch": 0.5424449506454062,
"grad_norm": 1.1048133481375857,
"learning_rate": 5.148366535423126e-06,
"loss": 0.2777,
"step": 893
},
{
"epoch": 0.5430523917995445,
"grad_norm": 0.9378744728399504,
"learning_rate": 5.137771711840811e-06,
"loss": 0.2678,
"step": 894
},
{
"epoch": 0.5436598329536826,
"grad_norm": 12.537370676924667,
"learning_rate": 5.1271762691550375e-06,
"loss": 0.2639,
"step": 895
},
{
"epoch": 0.5442672741078208,
"grad_norm": 0.9266142790273743,
"learning_rate": 5.116580254978447e-06,
"loss": 0.2659,
"step": 896
},
{
"epoch": 0.544874715261959,
"grad_norm": 0.8872145017815025,
"learning_rate": 5.1059837169262506e-06,
"loss": 0.2657,
"step": 897
},
{
"epoch": 0.5454821564160972,
"grad_norm": 0.9596765024185803,
"learning_rate": 5.095386702616012e-06,
"loss": 0.2737,
"step": 898
},
{
"epoch": 0.5460895975702353,
"grad_norm": 0.8505452588238741,
"learning_rate": 5.084789259667437e-06,
"loss": 0.2229,
"step": 899
},
{
"epoch": 0.5466970387243736,
"grad_norm": 0.9742318710472475,
"learning_rate": 5.074191435702155e-06,
"loss": 0.2621,
"step": 900
},
{
"epoch": 0.5473044798785117,
"grad_norm": 0.9174362261127138,
"learning_rate": 5.06359327834351e-06,
"loss": 0.2735,
"step": 901
},
{
"epoch": 0.54791192103265,
"grad_norm": 1.0480082974980471,
"learning_rate": 5.05299483521634e-06,
"loss": 0.2804,
"step": 902
},
{
"epoch": 0.5485193621867882,
"grad_norm": 0.9044085564794266,
"learning_rate": 5.0423961539467754e-06,
"loss": 0.251,
"step": 903
},
{
"epoch": 0.5491268033409263,
"grad_norm": 1.0846067776083739,
"learning_rate": 5.031797282162007e-06,
"loss": 0.275,
"step": 904
},
{
"epoch": 0.5497342444950646,
"grad_norm": 1.054126628964702,
"learning_rate": 5.021198267490088e-06,
"loss": 0.3109,
"step": 905
},
{
"epoch": 0.5503416856492027,
"grad_norm": 0.9731663440580473,
"learning_rate": 5.010599157559713e-06,
"loss": 0.2744,
"step": 906
},
{
"epoch": 0.550949126803341,
"grad_norm": 0.935385971022661,
"learning_rate": 5e-06,
"loss": 0.2833,
"step": 907
},
{
"epoch": 0.5515565679574791,
"grad_norm": 0.95732647622787,
"learning_rate": 4.98940084244029e-06,
"loss": 0.2738,
"step": 908
},
{
"epoch": 0.5521640091116173,
"grad_norm": 0.9778546392598653,
"learning_rate": 4.9788017325099134e-06,
"loss": 0.2902,
"step": 909
},
{
"epoch": 0.5527714502657555,
"grad_norm": 1.161637726781862,
"learning_rate": 4.968202717837996e-06,
"loss": 0.2448,
"step": 910
},
{
"epoch": 0.5533788914198937,
"grad_norm": 1.0325179748583595,
"learning_rate": 4.957603846053225e-06,
"loss": 0.2777,
"step": 911
},
{
"epoch": 0.5539863325740318,
"grad_norm": 0.9385803768928881,
"learning_rate": 4.947005164783661e-06,
"loss": 0.252,
"step": 912
},
{
"epoch": 0.5545937737281701,
"grad_norm": 0.8813294108193872,
"learning_rate": 4.936406721656492e-06,
"loss": 0.262,
"step": 913
},
{
"epoch": 0.5552012148823082,
"grad_norm": 1.0450236387704728,
"learning_rate": 4.925808564297847e-06,
"loss": 0.2809,
"step": 914
},
{
"epoch": 0.5558086560364465,
"grad_norm": 0.9743579657662962,
"learning_rate": 4.915210740332564e-06,
"loss": 0.2727,
"step": 915
},
{
"epoch": 0.5564160971905847,
"grad_norm": 1.1211404996380052,
"learning_rate": 4.9046132973839895e-06,
"loss": 0.3337,
"step": 916
},
{
"epoch": 0.5570235383447228,
"grad_norm": 0.9501582701592088,
"learning_rate": 4.894016283073753e-06,
"loss": 0.2598,
"step": 917
},
{
"epoch": 0.5576309794988611,
"grad_norm": 1.023090140186835,
"learning_rate": 4.883419745021554e-06,
"loss": 0.2483,
"step": 918
},
{
"epoch": 0.5582384206529992,
"grad_norm": 0.9242815154848782,
"learning_rate": 4.872823730844966e-06,
"loss": 0.255,
"step": 919
},
{
"epoch": 0.5588458618071375,
"grad_norm": 1.0437928013824458,
"learning_rate": 4.862228288159191e-06,
"loss": 0.2804,
"step": 920
},
{
"epoch": 0.5594533029612756,
"grad_norm": 0.928662371565127,
"learning_rate": 4.851633464576876e-06,
"loss": 0.2487,
"step": 921
},
{
"epoch": 0.5600607441154138,
"grad_norm": 0.8860877007011291,
"learning_rate": 4.841039307707878e-06,
"loss": 0.2567,
"step": 922
},
{
"epoch": 0.560668185269552,
"grad_norm": 0.9520319656600132,
"learning_rate": 4.8304458651590645e-06,
"loss": 0.2736,
"step": 923
},
{
"epoch": 0.5612756264236902,
"grad_norm": 0.9185300392147626,
"learning_rate": 4.819853184534085e-06,
"loss": 0.2638,
"step": 924
},
{
"epoch": 0.5618830675778284,
"grad_norm": 0.8949825038070447,
"learning_rate": 4.809261313433176e-06,
"loss": 0.2582,
"step": 925
},
{
"epoch": 0.5624905087319666,
"grad_norm": 1.0105141831102669,
"learning_rate": 4.798670299452926e-06,
"loss": 0.2606,
"step": 926
},
{
"epoch": 0.5630979498861047,
"grad_norm": 0.8759995193174664,
"learning_rate": 4.788080190186075e-06,
"loss": 0.2725,
"step": 927
},
{
"epoch": 0.563705391040243,
"grad_norm": 0.9842928399442494,
"learning_rate": 4.7774910332213005e-06,
"loss": 0.2889,
"step": 928
},
{
"epoch": 0.5643128321943812,
"grad_norm": 0.9982390837595454,
"learning_rate": 4.766902876142996e-06,
"loss": 0.2536,
"step": 929
},
{
"epoch": 0.5649202733485194,
"grad_norm": 0.8982965289569348,
"learning_rate": 4.756315766531069e-06,
"loss": 0.2748,
"step": 930
},
{
"epoch": 0.5655277145026576,
"grad_norm": 0.9053592532295419,
"learning_rate": 4.74572975196071e-06,
"loss": 0.2453,
"step": 931
},
{
"epoch": 0.5661351556567957,
"grad_norm": 0.9728048806580342,
"learning_rate": 4.735144880002199e-06,
"loss": 0.2834,
"step": 932
},
{
"epoch": 0.566742596810934,
"grad_norm": 1.0237268565258881,
"learning_rate": 4.724561198220672e-06,
"loss": 0.2525,
"step": 933
},
{
"epoch": 0.5673500379650721,
"grad_norm": 0.8905565256751365,
"learning_rate": 4.713978754175926e-06,
"loss": 0.2698,
"step": 934
},
{
"epoch": 0.5679574791192104,
"grad_norm": 0.9139326984298836,
"learning_rate": 4.703397595422188e-06,
"loss": 0.2674,
"step": 935
},
{
"epoch": 0.5685649202733485,
"grad_norm": 0.8833013039618229,
"learning_rate": 4.692817769507912e-06,
"loss": 0.2684,
"step": 936
},
{
"epoch": 0.5691723614274867,
"grad_norm": 0.9238763026108181,
"learning_rate": 4.682239323975566e-06,
"loss": 0.2558,
"step": 937
},
{
"epoch": 0.5697798025816249,
"grad_norm": 1.5115130854908239,
"learning_rate": 4.671662306361409e-06,
"loss": 0.2935,
"step": 938
},
{
"epoch": 0.5703872437357631,
"grad_norm": 0.9482923489278967,
"learning_rate": 4.66108676419529e-06,
"loss": 0.294,
"step": 939
},
{
"epoch": 0.5709946848899012,
"grad_norm": 0.9477355385020175,
"learning_rate": 4.6505127450004216e-06,
"loss": 0.2632,
"step": 940
},
{
"epoch": 0.5716021260440395,
"grad_norm": 0.9464692568004452,
"learning_rate": 4.6399402962931775e-06,
"loss": 0.2688,
"step": 941
},
{
"epoch": 0.5722095671981777,
"grad_norm": 1.5567830309937185,
"learning_rate": 4.62936946558287e-06,
"loss": 0.2712,
"step": 942
},
{
"epoch": 0.5728170083523159,
"grad_norm": 0.9424657036172921,
"learning_rate": 4.618800300371543e-06,
"loss": 0.2545,
"step": 943
},
{
"epoch": 0.5734244495064541,
"grad_norm": 0.8823928171248179,
"learning_rate": 4.608232848153757e-06,
"loss": 0.2412,
"step": 944
},
{
"epoch": 0.5740318906605922,
"grad_norm": 0.9999498692731567,
"learning_rate": 4.597667156416371e-06,
"loss": 0.2893,
"step": 945
},
{
"epoch": 0.5746393318147305,
"grad_norm": 1.3850490330933427,
"learning_rate": 4.587103272638339e-06,
"loss": 0.272,
"step": 946
},
{
"epoch": 0.5752467729688686,
"grad_norm": 0.9012990165306813,
"learning_rate": 4.576541244290484e-06,
"loss": 0.2735,
"step": 947
},
{
"epoch": 0.5758542141230069,
"grad_norm": 0.9444183793088788,
"learning_rate": 4.565981118835299e-06,
"loss": 0.2747,
"step": 948
},
{
"epoch": 0.576461655277145,
"grad_norm": 1.0837085618557278,
"learning_rate": 4.555422943726715e-06,
"loss": 0.318,
"step": 949
},
{
"epoch": 0.5770690964312832,
"grad_norm": 0.9744715635599894,
"learning_rate": 4.5448667664099125e-06,
"loss": 0.2991,
"step": 950
},
{
"epoch": 0.5776765375854214,
"grad_norm": 0.8967007719257041,
"learning_rate": 4.534312634321081e-06,
"loss": 0.2748,
"step": 951
},
{
"epoch": 0.5782839787395596,
"grad_norm": 0.9164171208837589,
"learning_rate": 4.523760594887228e-06,
"loss": 0.246,
"step": 952
},
{
"epoch": 0.5788914198936977,
"grad_norm": 0.8748749367095119,
"learning_rate": 4.513210695525954e-06,
"loss": 0.2521,
"step": 953
},
{
"epoch": 0.579498861047836,
"grad_norm": 0.9504021763832875,
"learning_rate": 4.5026629836452445e-06,
"loss": 0.2965,
"step": 954
},
{
"epoch": 0.5801063022019742,
"grad_norm": 1.0694432667746632,
"learning_rate": 4.492117506643256e-06,
"loss": 0.2487,
"step": 955
},
{
"epoch": 0.5807137433561124,
"grad_norm": 0.9126259590301874,
"learning_rate": 4.481574311908096e-06,
"loss": 0.2714,
"step": 956
},
{
"epoch": 0.5813211845102506,
"grad_norm": 0.8752331832756258,
"learning_rate": 4.471033446817623e-06,
"loss": 0.2645,
"step": 957
},
{
"epoch": 0.5819286256643887,
"grad_norm": 0.9292223873298213,
"learning_rate": 4.460494958739223e-06,
"loss": 0.2827,
"step": 958
},
{
"epoch": 0.582536066818527,
"grad_norm": 0.9249697636061491,
"learning_rate": 4.449958895029604e-06,
"loss": 0.2889,
"step": 959
},
{
"epoch": 0.5831435079726651,
"grad_norm": 0.896078007157414,
"learning_rate": 4.439425303034576e-06,
"loss": 0.2438,
"step": 960
},
{
"epoch": 0.5837509491268034,
"grad_norm": 0.994404825370418,
"learning_rate": 4.428894230088842e-06,
"loss": 0.2567,
"step": 961
},
{
"epoch": 0.5843583902809415,
"grad_norm": 1.003722332588181,
"learning_rate": 4.418365723515791e-06,
"loss": 0.2777,
"step": 962
},
{
"epoch": 0.5849658314350797,
"grad_norm": 0.9051722268130655,
"learning_rate": 4.407839830627269e-06,
"loss": 0.2723,
"step": 963
},
{
"epoch": 0.5855732725892179,
"grad_norm": 0.9993245696949893,
"learning_rate": 4.397316598723385e-06,
"loss": 0.261,
"step": 964
},
{
"epoch": 0.5861807137433561,
"grad_norm": 1.129318363010175,
"learning_rate": 4.38679607509229e-06,
"loss": 0.2785,
"step": 965
},
{
"epoch": 0.5867881548974943,
"grad_norm": 0.9652429592155066,
"learning_rate": 4.376278307009962e-06,
"loss": 0.3026,
"step": 966
},
{
"epoch": 0.5873955960516325,
"grad_norm": 1.08394459651315,
"learning_rate": 4.365763341739996e-06,
"loss": 0.2583,
"step": 967
},
{
"epoch": 0.5880030372057707,
"grad_norm": 0.9919795857924707,
"learning_rate": 4.355251226533396e-06,
"loss": 0.2825,
"step": 968
},
{
"epoch": 0.5886104783599089,
"grad_norm": 1.3252968659585784,
"learning_rate": 4.344742008628356e-06,
"loss": 0.2525,
"step": 969
},
{
"epoch": 0.5892179195140471,
"grad_norm": 1.2622690111591564,
"learning_rate": 4.334235735250047e-06,
"loss": 0.2289,
"step": 970
},
{
"epoch": 0.5898253606681853,
"grad_norm": 1.051902837554305,
"learning_rate": 4.3237324536104165e-06,
"loss": 0.2478,
"step": 971
},
{
"epoch": 0.5904328018223235,
"grad_norm": 1.0527055472641202,
"learning_rate": 4.313232210907959e-06,
"loss": 0.2898,
"step": 972
},
{
"epoch": 0.5910402429764616,
"grad_norm": 1.0101095079376434,
"learning_rate": 4.302735054327523e-06,
"loss": 0.2682,
"step": 973
},
{
"epoch": 0.5916476841305999,
"grad_norm": 0.8621015161725616,
"learning_rate": 4.292241031040077e-06,
"loss": 0.2485,
"step": 974
},
{
"epoch": 0.592255125284738,
"grad_norm": 0.8726005630346255,
"learning_rate": 4.28175018820252e-06,
"loss": 0.249,
"step": 975
},
{
"epoch": 0.5928625664388762,
"grad_norm": 0.9751348393117523,
"learning_rate": 4.271262572957453e-06,
"loss": 0.2877,
"step": 976
},
{
"epoch": 0.5934700075930144,
"grad_norm": 3.66261461720758,
"learning_rate": 4.2607782324329776e-06,
"loss": 0.3063,
"step": 977
},
{
"epoch": 0.5940774487471526,
"grad_norm": 0.8943406986539186,
"learning_rate": 4.250297213742473e-06,
"loss": 0.2405,
"step": 978
},
{
"epoch": 0.5946848899012908,
"grad_norm": 1.1389623298222618,
"learning_rate": 4.239819563984397e-06,
"loss": 0.304,
"step": 979
},
{
"epoch": 0.595292331055429,
"grad_norm": 0.9274441396848276,
"learning_rate": 4.229345330242067e-06,
"loss": 0.2592,
"step": 980
},
{
"epoch": 0.5958997722095672,
"grad_norm": 0.920248304426081,
"learning_rate": 4.21887455958345e-06,
"loss": 0.2656,
"step": 981
},
{
"epoch": 0.5965072133637054,
"grad_norm": 1.0045840102748669,
"learning_rate": 4.2084072990609505e-06,
"loss": 0.2793,
"step": 982
},
{
"epoch": 0.5971146545178436,
"grad_norm": 1.0001754794903,
"learning_rate": 4.1979435957111984e-06,
"loss": 0.2836,
"step": 983
},
{
"epoch": 0.5977220956719818,
"grad_norm": 0.9044442095717217,
"learning_rate": 4.187483496554844e-06,
"loss": 0.2583,
"step": 984
},
{
"epoch": 0.59832953682612,
"grad_norm": 0.9671571248544576,
"learning_rate": 4.17702704859633e-06,
"loss": 0.2675,
"step": 985
},
{
"epoch": 0.5989369779802581,
"grad_norm": 1.0757021127620503,
"learning_rate": 4.166574298823707e-06,
"loss": 0.2676,
"step": 986
},
{
"epoch": 0.5995444191343964,
"grad_norm": 0.9735187115759455,
"learning_rate": 4.156125294208396e-06,
"loss": 0.242,
"step": 987
},
{
"epoch": 0.6001518602885345,
"grad_norm": 0.9405044204067678,
"learning_rate": 4.145680081704989e-06,
"loss": 0.2516,
"step": 988
},
{
"epoch": 0.6007593014426728,
"grad_norm": 0.9651846089243521,
"learning_rate": 4.135238708251045e-06,
"loss": 0.2758,
"step": 989
},
{
"epoch": 0.6013667425968109,
"grad_norm": 0.9646286486818011,
"learning_rate": 4.1248012207668635e-06,
"loss": 0.2565,
"step": 990
},
{
"epoch": 0.6019741837509491,
"grad_norm": 1.003985298512368,
"learning_rate": 4.1143676661552876e-06,
"loss": 0.2828,
"step": 991
},
{
"epoch": 0.6025816249050873,
"grad_norm": 1.3272162976579898,
"learning_rate": 4.103938091301479e-06,
"loss": 0.2374,
"step": 992
},
{
"epoch": 0.6031890660592255,
"grad_norm": 1.029165825211411,
"learning_rate": 4.093512543072729e-06,
"loss": 0.256,
"step": 993
},
{
"epoch": 0.6037965072133638,
"grad_norm": 1.015855525425474,
"learning_rate": 4.08309106831822e-06,
"loss": 0.2732,
"step": 994
},
{
"epoch": 0.6044039483675019,
"grad_norm": 1.0983207807319615,
"learning_rate": 4.07267371386884e-06,
"loss": 0.2808,
"step": 995
},
{
"epoch": 0.6050113895216401,
"grad_norm": 1.0434667435702822,
"learning_rate": 4.062260526536955e-06,
"loss": 0.2936,
"step": 996
},
{
"epoch": 0.6056188306757783,
"grad_norm": 2.7705088228683183,
"learning_rate": 4.051851553116208e-06,
"loss": 0.2797,
"step": 997
},
{
"epoch": 0.6062262718299165,
"grad_norm": 1.0175524502413011,
"learning_rate": 4.041446840381309e-06,
"loss": 0.2847,
"step": 998
},
{
"epoch": 0.6068337129840546,
"grad_norm": 1.0263313633646869,
"learning_rate": 4.0310464350878145e-06,
"loss": 0.2803,
"step": 999
},
{
"epoch": 0.6074411541381929,
"grad_norm": 0.9619358823368712,
"learning_rate": 4.0206503839719335e-06,
"loss": 0.2762,
"step": 1000
},
{
"epoch": 0.608048595292331,
"grad_norm": 1.1203236915172694,
"learning_rate": 4.0102587337503e-06,
"loss": 0.2813,
"step": 1001
},
{
"epoch": 0.6086560364464693,
"grad_norm": 0.9232892656793803,
"learning_rate": 3.999871531119779e-06,
"loss": 0.2791,
"step": 1002
},
{
"epoch": 0.6092634776006074,
"grad_norm": 1.2408650869198945,
"learning_rate": 3.989488822757244e-06,
"loss": 0.2529,
"step": 1003
},
{
"epoch": 0.6098709187547456,
"grad_norm": 1.3861837200018423,
"learning_rate": 3.9791106553193746e-06,
"loss": 0.2681,
"step": 1004
},
{
"epoch": 0.6104783599088838,
"grad_norm": 1.94554998345937,
"learning_rate": 3.968737075442449e-06,
"loss": 0.2774,
"step": 1005
},
{
"epoch": 0.611085801063022,
"grad_norm": 1.049688439654031,
"learning_rate": 3.9583681297421194e-06,
"loss": 0.2738,
"step": 1006
},
{
"epoch": 0.6116932422171603,
"grad_norm": 1.1130517893828742,
"learning_rate": 3.9480038648132285e-06,
"loss": 0.258,
"step": 1007
},
{
"epoch": 0.6123006833712984,
"grad_norm": 0.9091158603102817,
"learning_rate": 3.937644327229572e-06,
"loss": 0.256,
"step": 1008
},
{
"epoch": 0.6129081245254366,
"grad_norm": 0.8593163726216395,
"learning_rate": 3.927289563543709e-06,
"loss": 0.2139,
"step": 1009
},
{
"epoch": 0.6135155656795748,
"grad_norm": 2.2304132729314436,
"learning_rate": 3.916939620286743e-06,
"loss": 0.3042,
"step": 1010
},
{
"epoch": 0.614123006833713,
"grad_norm": 0.8846471003483667,
"learning_rate": 3.906594543968122e-06,
"loss": 0.2461,
"step": 1011
},
{
"epoch": 0.6147304479878511,
"grad_norm": 0.9578022421490531,
"learning_rate": 3.896254381075416e-06,
"loss": 0.2135,
"step": 1012
},
{
"epoch": 0.6153378891419894,
"grad_norm": 0.9488296987772777,
"learning_rate": 3.885919178074116e-06,
"loss": 0.2656,
"step": 1013
},
{
"epoch": 0.6159453302961275,
"grad_norm": 0.9095777905606226,
"learning_rate": 3.875588981407433e-06,
"loss": 0.2696,
"step": 1014
},
{
"epoch": 0.6165527714502658,
"grad_norm": 1.172457556031403,
"learning_rate": 3.865263837496072e-06,
"loss": 0.2807,
"step": 1015
},
{
"epoch": 0.6171602126044039,
"grad_norm": 0.880155690724818,
"learning_rate": 3.854943792738037e-06,
"loss": 0.2724,
"step": 1016
},
{
"epoch": 0.6177676537585421,
"grad_norm": 0.9726329271199242,
"learning_rate": 3.844628893508417e-06,
"loss": 0.2849,
"step": 1017
},
{
"epoch": 0.6183750949126803,
"grad_norm": 0.9428793397994366,
"learning_rate": 3.834319186159179e-06,
"loss": 0.2807,
"step": 1018
},
{
"epoch": 0.6189825360668185,
"grad_norm": 1.1315897196957132,
"learning_rate": 3.8240147170189575e-06,
"loss": 0.2674,
"step": 1019
},
{
"epoch": 0.6195899772209568,
"grad_norm": 0.9181683595405062,
"learning_rate": 3.8137155323928526e-06,
"loss": 0.2801,
"step": 1020
},
{
"epoch": 0.6201974183750949,
"grad_norm": 1.2695788430486319,
"learning_rate": 3.803421678562213e-06,
"loss": 0.2464,
"step": 1021
},
{
"epoch": 0.6208048595292331,
"grad_norm": 0.9830056869409215,
"learning_rate": 3.7931332017844302e-06,
"loss": 0.2219,
"step": 1022
},
{
"epoch": 0.6214123006833713,
"grad_norm": 0.9647382303431887,
"learning_rate": 3.7828501482927416e-06,
"loss": 0.2841,
"step": 1023
},
{
"epoch": 0.6220197418375095,
"grad_norm": 0.9717568307189096,
"learning_rate": 3.7725725642960047e-06,
"loss": 0.2977,
"step": 1024
},
{
"epoch": 0.6226271829916477,
"grad_norm": 0.9411703602062548,
"learning_rate": 3.7623004959785066e-06,
"loss": 0.2373,
"step": 1025
},
{
"epoch": 0.6232346241457859,
"grad_norm": 1.0071266637924543,
"learning_rate": 3.752033989499742e-06,
"loss": 0.2786,
"step": 1026
},
{
"epoch": 0.623842065299924,
"grad_norm": 0.9059441456779171,
"learning_rate": 3.7417730909942184e-06,
"loss": 0.231,
"step": 1027
},
{
"epoch": 0.6244495064540623,
"grad_norm": 1.012261264702401,
"learning_rate": 3.7315178465712364e-06,
"loss": 0.2623,
"step": 1028
},
{
"epoch": 0.6250569476082004,
"grad_norm": 0.9552757738267845,
"learning_rate": 3.721268302314698e-06,
"loss": 0.262,
"step": 1029
},
{
"epoch": 0.6256643887623387,
"grad_norm": 1.027708835953696,
"learning_rate": 3.7110245042828786e-06,
"loss": 0.2576,
"step": 1030
},
{
"epoch": 0.6262718299164769,
"grad_norm": 1.4132850004379782,
"learning_rate": 3.70078649850824e-06,
"loss": 0.2911,
"step": 1031
},
{
"epoch": 0.626879271070615,
"grad_norm": 1.299324195215338,
"learning_rate": 3.690554330997215e-06,
"loss": 0.2699,
"step": 1032
},
{
"epoch": 0.6274867122247533,
"grad_norm": 0.9195896367280948,
"learning_rate": 3.6803280477299975e-06,
"loss": 0.2449,
"step": 1033
},
{
"epoch": 0.6280941533788914,
"grad_norm": 1.2092094482741347,
"learning_rate": 3.670107694660343e-06,
"loss": 0.3003,
"step": 1034
},
{
"epoch": 0.6287015945330297,
"grad_norm": 0.9323263275420439,
"learning_rate": 3.659893317715355e-06,
"loss": 0.2712,
"step": 1035
},
{
"epoch": 0.6293090356871678,
"grad_norm": 0.9032020829812812,
"learning_rate": 3.6496849627952875e-06,
"loss": 0.2838,
"step": 1036
},
{
"epoch": 0.629916476841306,
"grad_norm": 0.9083525940287548,
"learning_rate": 3.639482675773324e-06,
"loss": 0.2729,
"step": 1037
},
{
"epoch": 0.6305239179954442,
"grad_norm": 0.9352716602967113,
"learning_rate": 3.6292865024953945e-06,
"loss": 0.2541,
"step": 1038
},
{
"epoch": 0.6311313591495824,
"grad_norm": 1.1060377092126286,
"learning_rate": 3.6190964887799418e-06,
"loss": 0.3177,
"step": 1039
},
{
"epoch": 0.6317388003037205,
"grad_norm": 0.9438466117610348,
"learning_rate": 3.6089126804177373e-06,
"loss": 0.2253,
"step": 1040
},
{
"epoch": 0.6323462414578588,
"grad_norm": 0.9463389610475652,
"learning_rate": 3.5987351231716665e-06,
"loss": 0.2484,
"step": 1041
},
{
"epoch": 0.6329536826119969,
"grad_norm": 1.0525074350281083,
"learning_rate": 3.5885638627765228e-06,
"loss": 0.2747,
"step": 1042
},
{
"epoch": 0.6335611237661352,
"grad_norm": 0.9145186747529931,
"learning_rate": 3.5783989449388063e-06,
"loss": 0.2631,
"step": 1043
},
{
"epoch": 0.6341685649202734,
"grad_norm": 0.8785607963190103,
"learning_rate": 3.568240415336509e-06,
"loss": 0.2438,
"step": 1044
},
{
"epoch": 0.6347760060744115,
"grad_norm": 0.9826184323548174,
"learning_rate": 3.5580883196189265e-06,
"loss": 0.2784,
"step": 1045
},
{
"epoch": 0.6353834472285498,
"grad_norm": 1.2684340321470944,
"learning_rate": 3.547942703406433e-06,
"loss": 0.2494,
"step": 1046
},
{
"epoch": 0.6359908883826879,
"grad_norm": 1.080171576140306,
"learning_rate": 3.5378036122902907e-06,
"loss": 0.2277,
"step": 1047
},
{
"epoch": 0.6365983295368262,
"grad_norm": 1.0670243919181694,
"learning_rate": 3.52767109183244e-06,
"loss": 0.2479,
"step": 1048
},
{
"epoch": 0.6372057706909643,
"grad_norm": 1.16999981733913,
"learning_rate": 3.5175451875652906e-06,
"loss": 0.3218,
"step": 1049
},
{
"epoch": 0.6378132118451025,
"grad_norm": 0.9540949950156941,
"learning_rate": 3.507425944991529e-06,
"loss": 0.2782,
"step": 1050
},
{
"epoch": 0.6384206529992407,
"grad_norm": 1.091171127154527,
"learning_rate": 3.4973134095838943e-06,
"loss": 0.2587,
"step": 1051
},
{
"epoch": 0.6390280941533789,
"grad_norm": 0.9498496318574927,
"learning_rate": 3.4872076267850015e-06,
"loss": 0.2541,
"step": 1052
},
{
"epoch": 0.639635535307517,
"grad_norm": 1.0314901642287697,
"learning_rate": 3.4771086420071053e-06,
"loss": 0.2664,
"step": 1053
},
{
"epoch": 0.6402429764616553,
"grad_norm": 0.9996172120430332,
"learning_rate": 3.4670165006319236e-06,
"loss": 0.2799,
"step": 1054
},
{
"epoch": 0.6408504176157934,
"grad_norm": 0.9350416152513497,
"learning_rate": 3.4569312480104157e-06,
"loss": 0.2829,
"step": 1055
},
{
"epoch": 0.6414578587699317,
"grad_norm": 1.1632073226641764,
"learning_rate": 3.4468529294625895e-06,
"loss": 0.2574,
"step": 1056
},
{
"epoch": 0.6420652999240699,
"grad_norm": 0.9447749084911037,
"learning_rate": 3.4367815902772917e-06,
"loss": 0.2562,
"step": 1057
},
{
"epoch": 0.642672741078208,
"grad_norm": 1.2070485411268692,
"learning_rate": 3.4267172757120005e-06,
"loss": 0.2635,
"step": 1058
},
{
"epoch": 0.6432801822323463,
"grad_norm": 0.9408532287008231,
"learning_rate": 3.416660030992639e-06,
"loss": 0.2631,
"step": 1059
},
{
"epoch": 0.6438876233864844,
"grad_norm": 1.279419634233724,
"learning_rate": 3.406609901313349e-06,
"loss": 0.2716,
"step": 1060
},
{
"epoch": 0.6444950645406227,
"grad_norm": 1.0437711278438688,
"learning_rate": 3.396566931836308e-06,
"loss": 0.2633,
"step": 1061
},
{
"epoch": 0.6451025056947608,
"grad_norm": 0.9477234648384033,
"learning_rate": 3.386531167691512e-06,
"loss": 0.2551,
"step": 1062
},
{
"epoch": 0.645709946848899,
"grad_norm": 0.8821252495312435,
"learning_rate": 3.3765026539765832e-06,
"loss": 0.2484,
"step": 1063
},
{
"epoch": 0.6463173880030372,
"grad_norm": 1.0630208248660034,
"learning_rate": 3.36648143575656e-06,
"loss": 0.2724,
"step": 1064
},
{
"epoch": 0.6469248291571754,
"grad_norm": 0.8969008349261947,
"learning_rate": 3.3564675580636946e-06,
"loss": 0.2544,
"step": 1065
},
{
"epoch": 0.6475322703113136,
"grad_norm": 0.8758051487969973,
"learning_rate": 3.3464610658972584e-06,
"loss": 0.2518,
"step": 1066
},
{
"epoch": 0.6481397114654518,
"grad_norm": 0.9068783910815723,
"learning_rate": 3.3364620042233316e-06,
"loss": 0.2362,
"step": 1067
},
{
"epoch": 0.6487471526195899,
"grad_norm": 0.9711243894474835,
"learning_rate": 3.326470417974604e-06,
"loss": 0.2417,
"step": 1068
},
{
"epoch": 0.6493545937737282,
"grad_norm": 0.9040626259172609,
"learning_rate": 3.3164863520501744e-06,
"loss": 0.2289,
"step": 1069
},
{
"epoch": 0.6499620349278664,
"grad_norm": 4.042527322341976,
"learning_rate": 3.3065098513153473e-06,
"loss": 0.2839,
"step": 1070
},
{
"epoch": 0.6505694760820045,
"grad_norm": 0.8905148331919976,
"learning_rate": 3.29654096060143e-06,
"loss": 0.2758,
"step": 1071
},
{
"epoch": 0.6511769172361428,
"grad_norm": 1.0792077197141197,
"learning_rate": 3.2865797247055354e-06,
"loss": 0.2662,
"step": 1072
},
{
"epoch": 0.6517843583902809,
"grad_norm": 0.9497844499158115,
"learning_rate": 3.2766261883903744e-06,
"loss": 0.2549,
"step": 1073
},
{
"epoch": 0.6523917995444192,
"grad_norm": 1.0939981168516681,
"learning_rate": 3.266680396384061e-06,
"loss": 0.293,
"step": 1074
},
{
"epoch": 0.6529992406985573,
"grad_norm": 2.703463930587898,
"learning_rate": 3.256742393379909e-06,
"loss": 0.225,
"step": 1075
},
{
"epoch": 0.6536066818526955,
"grad_norm": 0.8280179235274505,
"learning_rate": 3.2468122240362287e-06,
"loss": 0.224,
"step": 1076
},
{
"epoch": 0.6542141230068337,
"grad_norm": 1.439064209594826,
"learning_rate": 3.2368899329761316e-06,
"loss": 0.2607,
"step": 1077
},
{
"epoch": 0.6548215641609719,
"grad_norm": 0.8868653685170421,
"learning_rate": 3.226975564787322e-06,
"loss": 0.2276,
"step": 1078
},
{
"epoch": 0.6554290053151101,
"grad_norm": 1.0980898669737127,
"learning_rate": 3.2170691640219077e-06,
"loss": 0.2648,
"step": 1079
},
{
"epoch": 0.6560364464692483,
"grad_norm": 0.9994606567591712,
"learning_rate": 3.2071707751961838e-06,
"loss": 0.2785,
"step": 1080
},
{
"epoch": 0.6566438876233864,
"grad_norm": 1.0995716769114432,
"learning_rate": 3.197280442790455e-06,
"loss": 0.2503,
"step": 1081
},
{
"epoch": 0.6572513287775247,
"grad_norm": 0.9205922048045703,
"learning_rate": 3.187398211248811e-06,
"loss": 0.2367,
"step": 1082
},
{
"epoch": 0.6578587699316629,
"grad_norm": 0.9635183689443267,
"learning_rate": 3.1775241249789434e-06,
"loss": 0.254,
"step": 1083
},
{
"epoch": 0.6584662110858011,
"grad_norm": 0.924480671064993,
"learning_rate": 3.1676582283519454e-06,
"loss": 0.2279,
"step": 1084
},
{
"epoch": 0.6590736522399393,
"grad_norm": 0.906465381610937,
"learning_rate": 3.1578005657021004e-06,
"loss": 0.2285,
"step": 1085
},
{
"epoch": 0.6596810933940774,
"grad_norm": 0.92471932747414,
"learning_rate": 3.1479511813267006e-06,
"loss": 0.2655,
"step": 1086
},
{
"epoch": 0.6602885345482157,
"grad_norm": 1.0142140195825766,
"learning_rate": 3.1381101194858264e-06,
"loss": 0.2407,
"step": 1087
},
{
"epoch": 0.6608959757023538,
"grad_norm": 1.0686680921013536,
"learning_rate": 3.1282774244021717e-06,
"loss": 0.2604,
"step": 1088
},
{
"epoch": 0.661503416856492,
"grad_norm": 1.0244620367393154,
"learning_rate": 3.118453140260823e-06,
"loss": 0.284,
"step": 1089
},
{
"epoch": 0.6621108580106302,
"grad_norm": 1.0170876360692431,
"learning_rate": 3.1086373112090762e-06,
"loss": 0.2523,
"step": 1090
},
{
"epoch": 0.6627182991647684,
"grad_norm": 0.9802160451886665,
"learning_rate": 3.0988299813562304e-06,
"loss": 0.2783,
"step": 1091
},
{
"epoch": 0.6633257403189066,
"grad_norm": 0.9103056036600278,
"learning_rate": 3.089031194773392e-06,
"loss": 0.2502,
"step": 1092
},
{
"epoch": 0.6639331814730448,
"grad_norm": 1.359229570037281,
"learning_rate": 3.079240995493279e-06,
"loss": 0.2479,
"step": 1093
},
{
"epoch": 0.6645406226271829,
"grad_norm": 1.0565783940509939,
"learning_rate": 3.069459427510014e-06,
"loss": 0.2442,
"step": 1094
},
{
"epoch": 0.6651480637813212,
"grad_norm": 0.8822214330244847,
"learning_rate": 3.0596865347789444e-06,
"loss": 0.2722,
"step": 1095
},
{
"epoch": 0.6657555049354594,
"grad_norm": 0.9150480419339629,
"learning_rate": 3.049922361216422e-06,
"loss": 0.2425,
"step": 1096
},
{
"epoch": 0.6663629460895976,
"grad_norm": 0.9517674541707122,
"learning_rate": 3.040166950699626e-06,
"loss": 0.2496,
"step": 1097
},
{
"epoch": 0.6669703872437358,
"grad_norm": 0.9599880059742387,
"learning_rate": 3.0304203470663507e-06,
"loss": 0.2619,
"step": 1098
},
{
"epoch": 0.6675778283978739,
"grad_norm": 0.9460242432498148,
"learning_rate": 3.0206825941148203e-06,
"loss": 0.3065,
"step": 1099
},
{
"epoch": 0.6681852695520122,
"grad_norm": 1.4106793360221765,
"learning_rate": 3.0109537356034856e-06,
"loss": 0.2737,
"step": 1100
},
{
"epoch": 0.6687927107061503,
"grad_norm": 1.2737238922916891,
"learning_rate": 3.001233815250823e-06,
"loss": 0.2899,
"step": 1101
},
{
"epoch": 0.6694001518602886,
"grad_norm": 0.9517912503819469,
"learning_rate": 2.991522876735154e-06,
"loss": 0.2624,
"step": 1102
},
{
"epoch": 0.6700075930144267,
"grad_norm": 1.015322451634877,
"learning_rate": 2.981820963694427e-06,
"loss": 0.2301,
"step": 1103
},
{
"epoch": 0.6706150341685649,
"grad_norm": 1.4417460163504778,
"learning_rate": 2.9721281197260427e-06,
"loss": 0.2864,
"step": 1104
},
{
"epoch": 0.6712224753227031,
"grad_norm": 1.3786447145331062,
"learning_rate": 2.9624443883866403e-06,
"loss": 0.2441,
"step": 1105
},
{
"epoch": 0.6718299164768413,
"grad_norm": 1.0007659923579586,
"learning_rate": 2.9527698131919156e-06,
"loss": 0.2891,
"step": 1106
},
{
"epoch": 0.6724373576309794,
"grad_norm": 1.1633207582868845,
"learning_rate": 2.9431044376164165e-06,
"loss": 0.2978,
"step": 1107
},
{
"epoch": 0.6730447987851177,
"grad_norm": 1.032656472550036,
"learning_rate": 2.9334483050933506e-06,
"loss": 0.2507,
"step": 1108
},
{
"epoch": 0.6736522399392559,
"grad_norm": 0.9333208809560491,
"learning_rate": 2.9238014590143925e-06,
"loss": 0.2376,
"step": 1109
},
{
"epoch": 0.6742596810933941,
"grad_norm": 1.1289827469541969,
"learning_rate": 2.91416394272948e-06,
"loss": 0.2582,
"step": 1110
},
{
"epoch": 0.6748671222475323,
"grad_norm": 0.925760485434696,
"learning_rate": 2.904535799546636e-06,
"loss": 0.2177,
"step": 1111
},
{
"epoch": 0.6754745634016704,
"grad_norm": 0.9162175321455921,
"learning_rate": 2.894917072731753e-06,
"loss": 0.2607,
"step": 1112
},
{
"epoch": 0.6760820045558087,
"grad_norm": 1.0213577123976934,
"learning_rate": 2.8853078055084192e-06,
"loss": 0.2588,
"step": 1113
},
{
"epoch": 0.6766894457099468,
"grad_norm": 0.9140846537600611,
"learning_rate": 2.8757080410577042e-06,
"loss": 0.2701,
"step": 1114
},
{
"epoch": 0.6772968868640851,
"grad_norm": 0.9558677038661029,
"learning_rate": 2.866117822517982e-06,
"loss": 0.2078,
"step": 1115
},
{
"epoch": 0.6779043280182232,
"grad_norm": 1.6365480186656665,
"learning_rate": 2.8565371929847286e-06,
"loss": 0.2519,
"step": 1116
},
{
"epoch": 0.6785117691723614,
"grad_norm": 0.8999981416609766,
"learning_rate": 2.846966195510332e-06,
"loss": 0.2586,
"step": 1117
},
{
"epoch": 0.6791192103264996,
"grad_norm": 0.8986580797788825,
"learning_rate": 2.83740487310389e-06,
"loss": 0.2651,
"step": 1118
},
{
"epoch": 0.6797266514806378,
"grad_norm": 1.0174347095785217,
"learning_rate": 2.82785326873103e-06,
"loss": 0.2593,
"step": 1119
},
{
"epoch": 0.680334092634776,
"grad_norm": 0.922083211098202,
"learning_rate": 2.81831142531371e-06,
"loss": 0.2597,
"step": 1120
},
{
"epoch": 0.6809415337889142,
"grad_norm": 1.006983963737431,
"learning_rate": 2.8087793857300193e-06,
"loss": 0.2682,
"step": 1121
},
{
"epoch": 0.6815489749430524,
"grad_norm": 1.8659752629573085,
"learning_rate": 2.7992571928139984e-06,
"loss": 0.2481,
"step": 1122
},
{
"epoch": 0.6821564160971906,
"grad_norm": 0.9701288254765418,
"learning_rate": 2.7897448893554335e-06,
"loss": 0.2581,
"step": 1123
},
{
"epoch": 0.6827638572513288,
"grad_norm": 0.9122901540097156,
"learning_rate": 2.780242518099675e-06,
"loss": 0.2503,
"step": 1124
},
{
"epoch": 0.683371298405467,
"grad_norm": 0.9807303601001196,
"learning_rate": 2.7707501217474443e-06,
"loss": 0.2744,
"step": 1125
},
{
"epoch": 0.6839787395596052,
"grad_norm": 6.645271440733989,
"learning_rate": 2.761267742954629e-06,
"loss": 0.2524,
"step": 1126
},
{
"epoch": 0.6845861807137433,
"grad_norm": 1.534147172507746,
"learning_rate": 2.7517954243321097e-06,
"loss": 0.2659,
"step": 1127
},
{
"epoch": 0.6851936218678816,
"grad_norm": 0.9373688303596897,
"learning_rate": 2.7423332084455543e-06,
"loss": 0.2851,
"step": 1128
},
{
"epoch": 0.6858010630220197,
"grad_norm": 0.8831368522306644,
"learning_rate": 2.7328811378152355e-06,
"loss": 0.2557,
"step": 1129
},
{
"epoch": 0.686408504176158,
"grad_norm": 0.9679109580287109,
"learning_rate": 2.723439254915834e-06,
"loss": 0.275,
"step": 1130
},
{
"epoch": 0.6870159453302961,
"grad_norm": 1.2218248931218192,
"learning_rate": 2.714007602176254e-06,
"loss": 0.2413,
"step": 1131
},
{
"epoch": 0.6876233864844343,
"grad_norm": 0.9708101090046806,
"learning_rate": 2.704586221979422e-06,
"loss": 0.2645,
"step": 1132
},
{
"epoch": 0.6882308276385725,
"grad_norm": 1.2522386234048026,
"learning_rate": 2.695175156662107e-06,
"loss": 0.2574,
"step": 1133
},
{
"epoch": 0.6888382687927107,
"grad_norm": 0.8419129242667286,
"learning_rate": 2.6857744485147286e-06,
"loss": 0.2383,
"step": 1134
},
{
"epoch": 0.689445709946849,
"grad_norm": 1.0193831420490371,
"learning_rate": 2.6763841397811576e-06,
"loss": 0.2735,
"step": 1135
},
{
"epoch": 0.6900531511009871,
"grad_norm": 0.9241296407909637,
"learning_rate": 2.667004272658541e-06,
"loss": 0.2768,
"step": 1136
},
{
"epoch": 0.6906605922551253,
"grad_norm": 1.1359832523999245,
"learning_rate": 2.6576348892970947e-06,
"loss": 0.2636,
"step": 1137
},
{
"epoch": 0.6912680334092635,
"grad_norm": 0.8941017093322563,
"learning_rate": 2.6482760317999338e-06,
"loss": 0.2559,
"step": 1138
},
{
"epoch": 0.6918754745634017,
"grad_norm": 0.8968310666010292,
"learning_rate": 2.638927742222868e-06,
"loss": 0.2537,
"step": 1139
},
{
"epoch": 0.6924829157175398,
"grad_norm": 0.959662408955417,
"learning_rate": 2.629590062574221e-06,
"loss": 0.2656,
"step": 1140
},
{
"epoch": 0.6930903568716781,
"grad_norm": 0.9000247977135409,
"learning_rate": 2.6202630348146323e-06,
"loss": 0.2899,
"step": 1141
},
{
"epoch": 0.6936977980258162,
"grad_norm": 1.0079650585712254,
"learning_rate": 2.610946700856885e-06,
"loss": 0.267,
"step": 1142
},
{
"epoch": 0.6943052391799545,
"grad_norm": 0.8890790658066724,
"learning_rate": 2.6016411025656973e-06,
"loss": 0.2535,
"step": 1143
},
{
"epoch": 0.6949126803340926,
"grad_norm": 0.9198656469536414,
"learning_rate": 2.592346281757552e-06,
"loss": 0.2509,
"step": 1144
},
{
"epoch": 0.6955201214882308,
"grad_norm": 0.9738974660627011,
"learning_rate": 2.583062280200501e-06,
"loss": 0.2593,
"step": 1145
},
{
"epoch": 0.696127562642369,
"grad_norm": 0.9837219510435016,
"learning_rate": 2.5737891396139713e-06,
"loss": 0.255,
"step": 1146
},
{
"epoch": 0.6967350037965072,
"grad_norm": 0.9076420454349192,
"learning_rate": 2.5645269016685905e-06,
"loss": 0.2704,
"step": 1147
},
{
"epoch": 0.6973424449506455,
"grad_norm": 0.8898433106915349,
"learning_rate": 2.5552756079859904e-06,
"loss": 0.2594,
"step": 1148
},
{
"epoch": 0.6979498861047836,
"grad_norm": 0.9063509680084296,
"learning_rate": 2.5460353001386263e-06,
"loss": 0.2529,
"step": 1149
},
{
"epoch": 0.6985573272589218,
"grad_norm": 0.8948107859742076,
"learning_rate": 2.5368060196495785e-06,
"loss": 0.2564,
"step": 1150
},
{
"epoch": 0.69916476841306,
"grad_norm": 0.8945325429627021,
"learning_rate": 2.527587807992383e-06,
"loss": 0.2387,
"step": 1151
},
{
"epoch": 0.6997722095671982,
"grad_norm": 0.9769838924293288,
"learning_rate": 2.5183807065908296e-06,
"loss": 0.2542,
"step": 1152
},
{
"epoch": 0.7003796507213363,
"grad_norm": 0.9645249197834942,
"learning_rate": 2.5091847568187834e-06,
"loss": 0.2281,
"step": 1153
},
{
"epoch": 0.7009870918754746,
"grad_norm": 0.9496141254681564,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.241,
"step": 1154
},
{
"epoch": 0.7015945330296127,
"grad_norm": 0.9380133404008738,
"learning_rate": 2.4908264774079355e-06,
"loss": 0.2605,
"step": 1155
},
{
"epoch": 0.702201974183751,
"grad_norm": 0.9074627496390306,
"learning_rate": 2.4816642302655634e-06,
"loss": 0.2541,
"step": 1156
},
{
"epoch": 0.7028094153378891,
"grad_norm": 1.0027152368026724,
"learning_rate": 2.4725132997451833e-06,
"loss": 0.2601,
"step": 1157
},
{
"epoch": 0.7034168564920273,
"grad_norm": 2.6289745813296284,
"learning_rate": 2.4633737269682546e-06,
"loss": 0.3022,
"step": 1158
},
{
"epoch": 0.7040242976461655,
"grad_norm": 0.9370322619588107,
"learning_rate": 2.454245553005184e-06,
"loss": 0.2643,
"step": 1159
},
{
"epoch": 0.7046317388003037,
"grad_norm": 1.468995991623161,
"learning_rate": 2.445128818875166e-06,
"loss": 0.2852,
"step": 1160
},
{
"epoch": 0.705239179954442,
"grad_norm": 0.9901499615769476,
"learning_rate": 2.4360235655459804e-06,
"loss": 0.3014,
"step": 1161
},
{
"epoch": 0.7058466211085801,
"grad_norm": 0.9762587690316699,
"learning_rate": 2.4269298339338205e-06,
"loss": 0.2464,
"step": 1162
},
{
"epoch": 0.7064540622627183,
"grad_norm": 0.9823369995064071,
"learning_rate": 2.4178476649031057e-06,
"loss": 0.2611,
"step": 1163
},
{
"epoch": 0.7070615034168565,
"grad_norm": 1.040540505906759,
"learning_rate": 2.408777099266291e-06,
"loss": 0.2628,
"step": 1164
},
{
"epoch": 0.7076689445709947,
"grad_norm": 0.9626462256229749,
"learning_rate": 2.3997181777836955e-06,
"loss": 0.3069,
"step": 1165
},
{
"epoch": 0.7082763857251329,
"grad_norm": 1.2283451848928204,
"learning_rate": 2.3906709411633073e-06,
"loss": 0.2405,
"step": 1166
},
{
"epoch": 0.7088838268792711,
"grad_norm": 0.9137970515612295,
"learning_rate": 2.381635430060611e-06,
"loss": 0.28,
"step": 1167
},
{
"epoch": 0.7094912680334092,
"grad_norm": 0.9656636216601993,
"learning_rate": 2.3726116850783987e-06,
"loss": 0.2696,
"step": 1168
},
{
"epoch": 0.7100987091875475,
"grad_norm": 0.9461355671018838,
"learning_rate": 2.3635997467665905e-06,
"loss": 0.2527,
"step": 1169
},
{
"epoch": 0.7107061503416856,
"grad_norm": 0.8499788622610774,
"learning_rate": 2.354599655622049e-06,
"loss": 0.2425,
"step": 1170
},
{
"epoch": 0.7113135914958238,
"grad_norm": 0.9394600691367851,
"learning_rate": 2.3456114520883956e-06,
"loss": 0.2478,
"step": 1171
},
{
"epoch": 0.711921032649962,
"grad_norm": 0.9539174173321666,
"learning_rate": 2.3366351765558437e-06,
"loss": 0.2552,
"step": 1172
},
{
"epoch": 0.7125284738041002,
"grad_norm": 1.0414224878560323,
"learning_rate": 2.3276708693609947e-06,
"loss": 0.2798,
"step": 1173
},
{
"epoch": 0.7131359149582385,
"grad_norm": 0.9245170066700932,
"learning_rate": 2.318718570786675e-06,
"loss": 0.2463,
"step": 1174
},
{
"epoch": 0.7137433561123766,
"grad_norm": 0.9803347614971838,
"learning_rate": 2.309778321061742e-06,
"loss": 0.2416,
"step": 1175
},
{
"epoch": 0.7143507972665148,
"grad_norm": 0.9130379562604788,
"learning_rate": 2.3008501603609147e-06,
"loss": 0.275,
"step": 1176
},
{
"epoch": 0.714958238420653,
"grad_norm": 0.8761644255482913,
"learning_rate": 2.2919341288045853e-06,
"loss": 0.2502,
"step": 1177
},
{
"epoch": 0.7155656795747912,
"grad_norm": 0.9584496523002601,
"learning_rate": 2.283030266458644e-06,
"loss": 0.2754,
"step": 1178
},
{
"epoch": 0.7161731207289294,
"grad_norm": 0.8665475599966695,
"learning_rate": 2.2741386133342923e-06,
"loss": 0.2505,
"step": 1179
},
{
"epoch": 0.7167805618830676,
"grad_norm": 0.8576308093825102,
"learning_rate": 2.265259209387867e-06,
"loss": 0.2304,
"step": 1180
},
{
"epoch": 0.7173880030372057,
"grad_norm": 0.9335114757524509,
"learning_rate": 2.256392094520664e-06,
"loss": 0.2697,
"step": 1181
},
{
"epoch": 0.717995444191344,
"grad_norm": 1.2455184951743299,
"learning_rate": 2.2475373085787568e-06,
"loss": 0.2644,
"step": 1182
},
{
"epoch": 0.7186028853454821,
"grad_norm": 1.8108379220736726,
"learning_rate": 2.238694891352814e-06,
"loss": 0.2637,
"step": 1183
},
{
"epoch": 0.7192103264996204,
"grad_norm": 0.884996855921727,
"learning_rate": 2.229864882577921e-06,
"loss": 0.2303,
"step": 1184
},
{
"epoch": 0.7198177676537585,
"grad_norm": 0.9318262199049523,
"learning_rate": 2.2210473219334083e-06,
"loss": 0.255,
"step": 1185
},
{
"epoch": 0.7204252088078967,
"grad_norm": 0.8164042083609618,
"learning_rate": 2.2122422490426676e-06,
"loss": 0.2384,
"step": 1186
},
{
"epoch": 0.721032649962035,
"grad_norm": 1.0115748704170144,
"learning_rate": 2.203449703472969e-06,
"loss": 0.268,
"step": 1187
},
{
"epoch": 0.7216400911161731,
"grad_norm": 1.1883752167190924,
"learning_rate": 2.194669724735296e-06,
"loss": 0.2755,
"step": 1188
},
{
"epoch": 0.7222475322703114,
"grad_norm": 1.0308833660996168,
"learning_rate": 2.1859023522841543e-06,
"loss": 0.2327,
"step": 1189
},
{
"epoch": 0.7228549734244495,
"grad_norm": 0.9120682340740801,
"learning_rate": 2.1771476255174056e-06,
"loss": 0.2735,
"step": 1190
},
{
"epoch": 0.7234624145785877,
"grad_norm": 1.013727297073577,
"learning_rate": 2.1684055837760837e-06,
"loss": 0.2757,
"step": 1191
},
{
"epoch": 0.7240698557327259,
"grad_norm": 0.9458973783887059,
"learning_rate": 2.159676266344222e-06,
"loss": 0.268,
"step": 1192
},
{
"epoch": 0.7246772968868641,
"grad_norm": 1.3758680862531418,
"learning_rate": 2.1509597124486693e-06,
"loss": 0.2367,
"step": 1193
},
{
"epoch": 0.7252847380410022,
"grad_norm": 1.2033412446646528,
"learning_rate": 2.1422559612589266e-06,
"loss": 0.2964,
"step": 1194
},
{
"epoch": 0.7258921791951405,
"grad_norm": 1.0020458048011924,
"learning_rate": 2.1335650518869555e-06,
"loss": 0.2625,
"step": 1195
},
{
"epoch": 0.7264996203492786,
"grad_norm": 0.956244480904364,
"learning_rate": 2.124887023387017e-06,
"loss": 0.2974,
"step": 1196
},
{
"epoch": 0.7271070615034169,
"grad_norm": 0.969906166075625,
"learning_rate": 2.1162219147554884e-06,
"loss": 0.2858,
"step": 1197
},
{
"epoch": 0.727714502657555,
"grad_norm": 1.242369815476469,
"learning_rate": 2.1075697649306838e-06,
"loss": 0.2651,
"step": 1198
},
{
"epoch": 0.7283219438116932,
"grad_norm": 1.0002620329734284,
"learning_rate": 2.09893061279269e-06,
"loss": 0.2611,
"step": 1199
},
{
"epoch": 0.7289293849658315,
"grad_norm": 1.2055397902089622,
"learning_rate": 2.0903044971631854e-06,
"loss": 0.2498,
"step": 1200
},
{
"epoch": 0.7295368261199696,
"grad_norm": 0.9189460422388009,
"learning_rate": 2.0816914568052664e-06,
"loss": 0.2549,
"step": 1201
},
{
"epoch": 0.7301442672741079,
"grad_norm": 0.9278551686313573,
"learning_rate": 2.0730915304232692e-06,
"loss": 0.2753,
"step": 1202
},
{
"epoch": 0.730751708428246,
"grad_norm": 0.9259931376538301,
"learning_rate": 2.0645047566626057e-06,
"loss": 0.2429,
"step": 1203
},
{
"epoch": 0.7313591495823842,
"grad_norm": 1.0024167165948739,
"learning_rate": 2.055931174109579e-06,
"loss": 0.2923,
"step": 1204
},
{
"epoch": 0.7319665907365224,
"grad_norm": 0.8930847111145748,
"learning_rate": 2.0473708212912167e-06,
"loss": 0.2416,
"step": 1205
},
{
"epoch": 0.7325740318906606,
"grad_norm": 0.8864928181427251,
"learning_rate": 2.0388237366751005e-06,
"loss": 0.2538,
"step": 1206
},
{
"epoch": 0.7331814730447987,
"grad_norm": 0.8850193923988731,
"learning_rate": 2.030289958669181e-06,
"loss": 0.2649,
"step": 1207
},
{
"epoch": 0.733788914198937,
"grad_norm": 0.9739541510077735,
"learning_rate": 2.02176952562162e-06,
"loss": 0.2517,
"step": 1208
},
{
"epoch": 0.7343963553530751,
"grad_norm": 0.9799826210952297,
"learning_rate": 2.013262475820602e-06,
"loss": 0.2716,
"step": 1209
},
{
"epoch": 0.7350037965072134,
"grad_norm": 1.0600437940139478,
"learning_rate": 2.004768847494186e-06,
"loss": 0.2365,
"step": 1210
},
{
"epoch": 0.7356112376613515,
"grad_norm": 0.9207976884534176,
"learning_rate": 1.996288678810105e-06,
"loss": 0.2632,
"step": 1211
},
{
"epoch": 0.7362186788154897,
"grad_norm": 1.0260990612904581,
"learning_rate": 1.987822007875617e-06,
"loss": 0.2675,
"step": 1212
},
{
"epoch": 0.736826119969628,
"grad_norm": 1.0643126752862775,
"learning_rate": 1.979368872737319e-06,
"loss": 0.2282,
"step": 1213
},
{
"epoch": 0.7374335611237661,
"grad_norm": 1.002822192943713,
"learning_rate": 1.9709293113809876e-06,
"loss": 0.237,
"step": 1214
},
{
"epoch": 0.7380410022779044,
"grad_norm": 0.8146421380435214,
"learning_rate": 1.962503361731403e-06,
"loss": 0.2347,
"step": 1215
},
{
"epoch": 0.7386484434320425,
"grad_norm": 1.44451984462559,
"learning_rate": 1.954091061652172e-06,
"loss": 0.249,
"step": 1216
},
{
"epoch": 0.7392558845861807,
"grad_norm": 1.0048037676428558,
"learning_rate": 1.945692448945574e-06,
"loss": 0.2684,
"step": 1217
},
{
"epoch": 0.7398633257403189,
"grad_norm": 0.9134185361949188,
"learning_rate": 1.9373075613523728e-06,
"loss": 0.269,
"step": 1218
},
{
"epoch": 0.7404707668944571,
"grad_norm": 1.3119909261331324,
"learning_rate": 1.928936436551661e-06,
"loss": 0.2422,
"step": 1219
},
{
"epoch": 0.7410782080485953,
"grad_norm": 0.8520218708347329,
"learning_rate": 1.920579112160685e-06,
"loss": 0.2199,
"step": 1220
},
{
"epoch": 0.7416856492027335,
"grad_norm": 2.6495798687566086,
"learning_rate": 1.912235625734676e-06,
"loss": 0.2854,
"step": 1221
},
{
"epoch": 0.7422930903568716,
"grad_norm": 1.6880210137763603,
"learning_rate": 1.903906014766681e-06,
"loss": 0.2761,
"step": 1222
},
{
"epoch": 0.7429005315110099,
"grad_norm": 0.9375069285048585,
"learning_rate": 1.8955903166873924e-06,
"loss": 0.25,
"step": 1223
},
{
"epoch": 0.743507972665148,
"grad_norm": 1.239196974591738,
"learning_rate": 1.8872885688649879e-06,
"loss": 0.2876,
"step": 1224
},
{
"epoch": 0.7441154138192863,
"grad_norm": 0.9780811220285901,
"learning_rate": 1.8790008086049534e-06,
"loss": 0.255,
"step": 1225
},
{
"epoch": 0.7447228549734245,
"grad_norm": 0.9109861670066071,
"learning_rate": 1.8707270731499223e-06,
"loss": 0.2401,
"step": 1226
},
{
"epoch": 0.7453302961275626,
"grad_norm": 0.9572237400053779,
"learning_rate": 1.862467399679499e-06,
"loss": 0.2855,
"step": 1227
},
{
"epoch": 0.7459377372817009,
"grad_norm": 0.9835745008775473,
"learning_rate": 1.854221825310103e-06,
"loss": 0.2376,
"step": 1228
},
{
"epoch": 0.746545178435839,
"grad_norm": 0.9473916357579606,
"learning_rate": 1.8459903870947954e-06,
"loss": 0.277,
"step": 1229
},
{
"epoch": 0.7471526195899773,
"grad_norm": 0.8905602752123002,
"learning_rate": 1.8377731220231144e-06,
"loss": 0.2506,
"step": 1230
},
{
"epoch": 0.7477600607441154,
"grad_norm": 0.9604834177033152,
"learning_rate": 1.829570067020906e-06,
"loss": 0.2448,
"step": 1231
},
{
"epoch": 0.7483675018982536,
"grad_norm": 0.8845596702356572,
"learning_rate": 1.8213812589501611e-06,
"loss": 0.2547,
"step": 1232
},
{
"epoch": 0.7489749430523918,
"grad_norm": 0.9925482849261942,
"learning_rate": 1.813206734608851e-06,
"loss": 0.2603,
"step": 1233
},
{
"epoch": 0.74958238420653,
"grad_norm": 1.0974123543068024,
"learning_rate": 1.8050465307307602e-06,
"loss": 0.2461,
"step": 1234
},
{
"epoch": 0.7501898253606681,
"grad_norm": 0.9382971676426907,
"learning_rate": 1.7969006839853227e-06,
"loss": 0.2226,
"step": 1235
},
{
"epoch": 0.7507972665148064,
"grad_norm": 1.6609571035124198,
"learning_rate": 1.78876923097745e-06,
"loss": 0.2553,
"step": 1236
},
{
"epoch": 0.7514047076689445,
"grad_norm": 1.0951347816044532,
"learning_rate": 1.7806522082473809e-06,
"loss": 0.2549,
"step": 1237
},
{
"epoch": 0.7520121488230828,
"grad_norm": 0.9563113436198466,
"learning_rate": 1.7725496522704998e-06,
"loss": 0.2582,
"step": 1238
},
{
"epoch": 0.752619589977221,
"grad_norm": 0.9531971950924529,
"learning_rate": 1.7644615994571934e-06,
"loss": 0.2509,
"step": 1239
},
{
"epoch": 0.7532270311313591,
"grad_norm": 1.0124203802056453,
"learning_rate": 1.7563880861526656e-06,
"loss": 0.2444,
"step": 1240
},
{
"epoch": 0.7538344722854974,
"grad_norm": 0.9577538726500576,
"learning_rate": 1.748329148636787e-06,
"loss": 0.2236,
"step": 1241
},
{
"epoch": 0.7544419134396355,
"grad_norm": 0.9178846740365786,
"learning_rate": 1.7402848231239317e-06,
"loss": 0.2544,
"step": 1242
},
{
"epoch": 0.7550493545937738,
"grad_norm": 0.9382975967726378,
"learning_rate": 1.73225514576281e-06,
"loss": 0.2665,
"step": 1243
},
{
"epoch": 0.7556567957479119,
"grad_norm": 1.3363746570955906,
"learning_rate": 1.7242401526363095e-06,
"loss": 0.2745,
"step": 1244
},
{
"epoch": 0.7562642369020501,
"grad_norm": 1.3353177833317331,
"learning_rate": 1.7162398797613284e-06,
"loss": 0.251,
"step": 1245
},
{
"epoch": 0.7568716780561883,
"grad_norm": 1.0235724081052306,
"learning_rate": 1.70825436308862e-06,
"loss": 0.2699,
"step": 1246
},
{
"epoch": 0.7574791192103265,
"grad_norm": 0.9858229899690142,
"learning_rate": 1.7002836385026234e-06,
"loss": 0.2429,
"step": 1247
},
{
"epoch": 0.7580865603644646,
"grad_norm": 1.1427480361449285,
"learning_rate": 1.692327741821312e-06,
"loss": 0.2733,
"step": 1248
},
{
"epoch": 0.7586940015186029,
"grad_norm": 0.9835216641075001,
"learning_rate": 1.6843867087960252e-06,
"loss": 0.2671,
"step": 1249
},
{
"epoch": 0.7593014426727411,
"grad_norm": 0.9784659480769877,
"learning_rate": 1.676460575111306e-06,
"loss": 0.2515,
"step": 1250
},
{
"epoch": 0.7599088838268793,
"grad_norm": 0.9481282295943736,
"learning_rate": 1.6685493763847515e-06,
"loss": 0.259,
"step": 1251
},
{
"epoch": 0.7605163249810175,
"grad_norm": 1.0452063539012255,
"learning_rate": 1.6606531481668364e-06,
"loss": 0.2633,
"step": 1252
},
{
"epoch": 0.7611237661351556,
"grad_norm": 0.9745231850326872,
"learning_rate": 1.6527719259407743e-06,
"loss": 0.249,
"step": 1253
},
{
"epoch": 0.7617312072892939,
"grad_norm": 1.3873562933934764,
"learning_rate": 1.6449057451223354e-06,
"loss": 0.253,
"step": 1254
},
{
"epoch": 0.762338648443432,
"grad_norm": 1.1512772469160202,
"learning_rate": 1.6370546410597066e-06,
"loss": 0.2799,
"step": 1255
},
{
"epoch": 0.7629460895975703,
"grad_norm": 1.0250344913669225,
"learning_rate": 1.6292186490333172e-06,
"loss": 0.265,
"step": 1256
},
{
"epoch": 0.7635535307517084,
"grad_norm": 1.1545097210078017,
"learning_rate": 1.6213978042556938e-06,
"loss": 0.2319,
"step": 1257
},
{
"epoch": 0.7641609719058466,
"grad_norm": 0.9063231723472821,
"learning_rate": 1.6135921418712959e-06,
"loss": 0.2512,
"step": 1258
},
{
"epoch": 0.7647684130599848,
"grad_norm": 1.061377737003138,
"learning_rate": 1.6058016969563512e-06,
"loss": 0.2598,
"step": 1259
},
{
"epoch": 0.765375854214123,
"grad_norm": 0.9337491753620247,
"learning_rate": 1.5980265045187139e-06,
"loss": 0.2707,
"step": 1260
},
{
"epoch": 0.7659832953682612,
"grad_norm": 1.2607704019846233,
"learning_rate": 1.5902665994976896e-06,
"loss": 0.269,
"step": 1261
},
{
"epoch": 0.7665907365223994,
"grad_norm": 1.086835681982817,
"learning_rate": 1.5825220167638945e-06,
"loss": 0.2215,
"step": 1262
},
{
"epoch": 0.7671981776765376,
"grad_norm": 1.0948737291989328,
"learning_rate": 1.5747927911190858e-06,
"loss": 0.2713,
"step": 1263
},
{
"epoch": 0.7678056188306758,
"grad_norm": 1.102976831652949,
"learning_rate": 1.567078957296016e-06,
"loss": 0.266,
"step": 1264
},
{
"epoch": 0.768413059984814,
"grad_norm": 1.046644859411465,
"learning_rate": 1.5593805499582659e-06,
"loss": 0.2365,
"step": 1265
},
{
"epoch": 0.7690205011389522,
"grad_norm": 0.8613582389098838,
"learning_rate": 1.5516976037000941e-06,
"loss": 0.2188,
"step": 1266
},
{
"epoch": 0.7696279422930904,
"grad_norm": 1.7227164484553419,
"learning_rate": 1.544030153046291e-06,
"loss": 0.2567,
"step": 1267
},
{
"epoch": 0.7702353834472285,
"grad_norm": 2.92880493600265,
"learning_rate": 1.5363782324520033e-06,
"loss": 0.2803,
"step": 1268
},
{
"epoch": 0.7708428246013668,
"grad_norm": 1.0122911998142148,
"learning_rate": 1.528741876302598e-06,
"loss": 0.2772,
"step": 1269
},
{
"epoch": 0.7714502657555049,
"grad_norm": 1.3980473384689627,
"learning_rate": 1.5211211189134955e-06,
"loss": 0.2478,
"step": 1270
},
{
"epoch": 0.7720577069096431,
"grad_norm": 0.8508504199116661,
"learning_rate": 1.5135159945300232e-06,
"loss": 0.2401,
"step": 1271
},
{
"epoch": 0.7726651480637813,
"grad_norm": 0.9969962158012547,
"learning_rate": 1.5059265373272574e-06,
"loss": 0.2617,
"step": 1272
},
{
"epoch": 0.7732725892179195,
"grad_norm": 0.964226927135635,
"learning_rate": 1.4983527814098736e-06,
"loss": 0.267,
"step": 1273
},
{
"epoch": 0.7738800303720577,
"grad_norm": 1.0115140691606623,
"learning_rate": 1.4907947608119866e-06,
"loss": 0.2421,
"step": 1274
},
{
"epoch": 0.7744874715261959,
"grad_norm": 1.2895393161155704,
"learning_rate": 1.4832525094970007e-06,
"loss": 0.2452,
"step": 1275
},
{
"epoch": 0.7750949126803341,
"grad_norm": 0.8408524834828659,
"learning_rate": 1.475726061357463e-06,
"loss": 0.2166,
"step": 1276
},
{
"epoch": 0.7757023538344723,
"grad_norm": 1.333514308934424,
"learning_rate": 1.4682154502149025e-06,
"loss": 0.2415,
"step": 1277
},
{
"epoch": 0.7763097949886105,
"grad_norm": 1.036857242677457,
"learning_rate": 1.4607207098196851e-06,
"loss": 0.2569,
"step": 1278
},
{
"epoch": 0.7769172361427487,
"grad_norm": 7.103965440866741,
"learning_rate": 1.4532418738508525e-06,
"loss": 0.2648,
"step": 1279
},
{
"epoch": 0.7775246772968869,
"grad_norm": 0.8400952986765654,
"learning_rate": 1.4457789759159813e-06,
"loss": 0.2018,
"step": 1280
},
{
"epoch": 0.778132118451025,
"grad_norm": 1.4757073564314478,
"learning_rate": 1.4383320495510267e-06,
"loss": 0.2616,
"step": 1281
},
{
"epoch": 0.7787395596051633,
"grad_norm": 1.004482929976758,
"learning_rate": 1.430901128220174e-06,
"loss": 0.2529,
"step": 1282
},
{
"epoch": 0.7793470007593014,
"grad_norm": 1.0139377829258103,
"learning_rate": 1.4234862453156839e-06,
"loss": 0.2756,
"step": 1283
},
{
"epoch": 0.7799544419134397,
"grad_norm": 1.0100199779353403,
"learning_rate": 1.4160874341577447e-06,
"loss": 0.2484,
"step": 1284
},
{
"epoch": 0.7805618830675778,
"grad_norm": 1.1168401047776593,
"learning_rate": 1.4087047279943267e-06,
"loss": 0.2687,
"step": 1285
},
{
"epoch": 0.781169324221716,
"grad_norm": 0.9503234909845282,
"learning_rate": 1.4013381600010278e-06,
"loss": 0.2563,
"step": 1286
},
{
"epoch": 0.7817767653758542,
"grad_norm": 1.0368290840258114,
"learning_rate": 1.3939877632809279e-06,
"loss": 0.2866,
"step": 1287
},
{
"epoch": 0.7823842065299924,
"grad_norm": 1.0086813795279805,
"learning_rate": 1.3866535708644335e-06,
"loss": 0.2418,
"step": 1288
},
{
"epoch": 0.7829916476841307,
"grad_norm": 1.1754106526081984,
"learning_rate": 1.3793356157091387e-06,
"loss": 0.2582,
"step": 1289
},
{
"epoch": 0.7835990888382688,
"grad_norm": 0.9640662064683282,
"learning_rate": 1.3720339306996666e-06,
"loss": 0.2834,
"step": 1290
},
{
"epoch": 0.784206529992407,
"grad_norm": 1.741496452010621,
"learning_rate": 1.3647485486475376e-06,
"loss": 0.2374,
"step": 1291
},
{
"epoch": 0.7848139711465452,
"grad_norm": 1.0182189209813342,
"learning_rate": 1.3574795022910014e-06,
"loss": 0.2531,
"step": 1292
},
{
"epoch": 0.7854214123006834,
"grad_norm": 0.9760934213660039,
"learning_rate": 1.3502268242949025e-06,
"loss": 0.2575,
"step": 1293
},
{
"epoch": 0.7860288534548215,
"grad_norm": 1.8383703679855188,
"learning_rate": 1.3429905472505344e-06,
"loss": 0.2383,
"step": 1294
},
{
"epoch": 0.7866362946089598,
"grad_norm": 0.9502317083781607,
"learning_rate": 1.3357707036754875e-06,
"loss": 0.2585,
"step": 1295
},
{
"epoch": 0.7872437357630979,
"grad_norm": 0.9297333282490423,
"learning_rate": 1.3285673260135073e-06,
"loss": 0.2452,
"step": 1296
},
{
"epoch": 0.7878511769172362,
"grad_norm": 0.9116623980444865,
"learning_rate": 1.321380446634342e-06,
"loss": 0.2514,
"step": 1297
},
{
"epoch": 0.7884586180713743,
"grad_norm": 1.0165136386704785,
"learning_rate": 1.314210097833607e-06,
"loss": 0.2698,
"step": 1298
},
{
"epoch": 0.7890660592255125,
"grad_norm": 1.0097975118483586,
"learning_rate": 1.3070563118326295e-06,
"loss": 0.2623,
"step": 1299
},
{
"epoch": 0.7896735003796507,
"grad_norm": 0.9511823883591485,
"learning_rate": 1.2999191207783129e-06,
"loss": 0.2227,
"step": 1300
},
{
"epoch": 0.7902809415337889,
"grad_norm": 1.083790995951702,
"learning_rate": 1.2927985567429868e-06,
"loss": 0.2386,
"step": 1301
},
{
"epoch": 0.7908883826879272,
"grad_norm": 0.9289196009158714,
"learning_rate": 1.2856946517242608e-06,
"loss": 0.2299,
"step": 1302
},
{
"epoch": 0.7914958238420653,
"grad_norm": 0.9764047061340886,
"learning_rate": 1.27860743764489e-06,
"loss": 0.2536,
"step": 1303
},
{
"epoch": 0.7921032649962035,
"grad_norm": 1.066452400553731,
"learning_rate": 1.2715369463526173e-06,
"loss": 0.2485,
"step": 1304
},
{
"epoch": 0.7927107061503417,
"grad_norm": 0.8830887915707148,
"learning_rate": 1.2644832096200498e-06,
"loss": 0.241,
"step": 1305
},
{
"epoch": 0.7933181473044799,
"grad_norm": 0.8852127517971522,
"learning_rate": 1.257446259144494e-06,
"loss": 0.2236,
"step": 1306
},
{
"epoch": 0.793925588458618,
"grad_norm": 0.9428821852573767,
"learning_rate": 1.2504261265478324e-06,
"loss": 0.2552,
"step": 1307
},
{
"epoch": 0.7945330296127563,
"grad_norm": 1.0320142315688623,
"learning_rate": 1.2434228433763657e-06,
"loss": 0.2469,
"step": 1308
},
{
"epoch": 0.7951404707668944,
"grad_norm": 1.0170174001133827,
"learning_rate": 1.2364364411006841e-06,
"loss": 0.2437,
"step": 1309
},
{
"epoch": 0.7957479119210327,
"grad_norm": 1.0560205729362242,
"learning_rate": 1.2294669511155193e-06,
"loss": 0.2327,
"step": 1310
},
{
"epoch": 0.7963553530751708,
"grad_norm": 1.239053820039574,
"learning_rate": 1.2225144047396015e-06,
"loss": 0.2627,
"step": 1311
},
{
"epoch": 0.796962794229309,
"grad_norm": 1.0988583237532765,
"learning_rate": 1.215578833215526e-06,
"loss": 0.262,
"step": 1312
},
{
"epoch": 0.7975702353834472,
"grad_norm": 0.867925390100386,
"learning_rate": 1.2086602677096033e-06,
"loss": 0.2416,
"step": 1313
},
{
"epoch": 0.7981776765375854,
"grad_norm": 1.2397806394064825,
"learning_rate": 1.201758739311728e-06,
"loss": 0.2478,
"step": 1314
},
{
"epoch": 0.7987851176917237,
"grad_norm": 1.7686159599994773,
"learning_rate": 1.1948742790352342e-06,
"loss": 0.2663,
"step": 1315
},
{
"epoch": 0.7993925588458618,
"grad_norm": 1.23788413067218,
"learning_rate": 1.1880069178167586e-06,
"loss": 0.2271,
"step": 1316
},
{
"epoch": 0.8,
"grad_norm": 0.899946412274731,
"learning_rate": 1.1811566865160961e-06,
"loss": 0.222,
"step": 1317
},
{
"epoch": 0.8006074411541382,
"grad_norm": 0.87270457464745,
"learning_rate": 1.1743236159160654e-06,
"loss": 0.2592,
"step": 1318
},
{
"epoch": 0.8012148823082764,
"grad_norm": 1.3219310442302556,
"learning_rate": 1.167507736722377e-06,
"loss": 0.266,
"step": 1319
},
{
"epoch": 0.8018223234624146,
"grad_norm": 0.9267745860014674,
"learning_rate": 1.1607090795634802e-06,
"loss": 0.249,
"step": 1320
},
{
"epoch": 0.8024297646165528,
"grad_norm": 1.145281359593592,
"learning_rate": 1.15392767499044e-06,
"loss": 0.273,
"step": 1321
},
{
"epoch": 0.8030372057706909,
"grad_norm": 0.9943238830649733,
"learning_rate": 1.1471635534767877e-06,
"loss": 0.2713,
"step": 1322
},
{
"epoch": 0.8036446469248292,
"grad_norm": 0.8925102247035651,
"learning_rate": 1.1404167454183957e-06,
"loss": 0.2509,
"step": 1323
},
{
"epoch": 0.8042520880789673,
"grad_norm": 2.8209341271672557,
"learning_rate": 1.133687281133331e-06,
"loss": 0.2414,
"step": 1324
},
{
"epoch": 0.8048595292331056,
"grad_norm": 0.8913737685748099,
"learning_rate": 1.1269751908617277e-06,
"loss": 0.2382,
"step": 1325
},
{
"epoch": 0.8054669703872437,
"grad_norm": 0.8955833156392411,
"learning_rate": 1.1202805047656406e-06,
"loss": 0.2336,
"step": 1326
},
{
"epoch": 0.8060744115413819,
"grad_norm": 1.1427272635049914,
"learning_rate": 1.113603252928917e-06,
"loss": 0.2576,
"step": 1327
},
{
"epoch": 0.8066818526955202,
"grad_norm": 0.9867069146988969,
"learning_rate": 1.1069434653570633e-06,
"loss": 0.2703,
"step": 1328
},
{
"epoch": 0.8072892938496583,
"grad_norm": 1.0341115309067341,
"learning_rate": 1.1003011719771046e-06,
"loss": 0.251,
"step": 1329
},
{
"epoch": 0.8078967350037966,
"grad_norm": 0.941968386464464,
"learning_rate": 1.0936764026374547e-06,
"loss": 0.2523,
"step": 1330
},
{
"epoch": 0.8085041761579347,
"grad_norm": 0.9109334482631996,
"learning_rate": 1.0870691871077738e-06,
"loss": 0.2573,
"step": 1331
},
{
"epoch": 0.8091116173120729,
"grad_norm": 1.1679903287794757,
"learning_rate": 1.0804795550788473e-06,
"loss": 0.2727,
"step": 1332
},
{
"epoch": 0.8097190584662111,
"grad_norm": 1.0262961821675425,
"learning_rate": 1.073907536162443e-06,
"loss": 0.2499,
"step": 1333
},
{
"epoch": 0.8103264996203493,
"grad_norm": 0.8638969582311489,
"learning_rate": 1.0673531598911824e-06,
"loss": 0.2077,
"step": 1334
},
{
"epoch": 0.8109339407744874,
"grad_norm": 1.065445241867707,
"learning_rate": 1.0608164557184042e-06,
"loss": 0.2733,
"step": 1335
},
{
"epoch": 0.8115413819286257,
"grad_norm": 1.615302331483808,
"learning_rate": 1.0542974530180327e-06,
"loss": 0.2712,
"step": 1336
},
{
"epoch": 0.8121488230827638,
"grad_norm": 0.9558232094508515,
"learning_rate": 1.0477961810844517e-06,
"loss": 0.281,
"step": 1337
},
{
"epoch": 0.8127562642369021,
"grad_norm": 1.0153735128244517,
"learning_rate": 1.0413126691323667e-06,
"loss": 0.2521,
"step": 1338
},
{
"epoch": 0.8133637053910402,
"grad_norm": 0.9566927051256368,
"learning_rate": 1.0348469462966753e-06,
"loss": 0.2869,
"step": 1339
},
{
"epoch": 0.8139711465451784,
"grad_norm": 1.2528160410336169,
"learning_rate": 1.0283990416323336e-06,
"loss": 0.2747,
"step": 1340
},
{
"epoch": 0.8145785876993167,
"grad_norm": 0.8811463059194917,
"learning_rate": 1.0219689841142343e-06,
"loss": 0.2071,
"step": 1341
},
{
"epoch": 0.8151860288534548,
"grad_norm": 1.9173571112957752,
"learning_rate": 1.0155568026370637e-06,
"loss": 0.2345,
"step": 1342
},
{
"epoch": 0.8157934700075931,
"grad_norm": 1.0402835209362526,
"learning_rate": 1.0091625260151827e-06,
"loss": 0.2435,
"step": 1343
},
{
"epoch": 0.8164009111617312,
"grad_norm": 1.6822710586636964,
"learning_rate": 1.0027861829824953e-06,
"loss": 0.287,
"step": 1344
},
{
"epoch": 0.8170083523158694,
"grad_norm": 1.397843369835043,
"learning_rate": 9.964278021923107e-07,
"loss": 0.2605,
"step": 1345
},
{
"epoch": 0.8176157934700076,
"grad_norm": 1.123716581875761,
"learning_rate": 9.900874122172294e-07,
"loss": 0.2647,
"step": 1346
},
{
"epoch": 0.8182232346241458,
"grad_norm": 1.0901754083400064,
"learning_rate": 9.83765041548998e-07,
"loss": 0.2707,
"step": 1347
},
{
"epoch": 0.818830675778284,
"grad_norm": 0.9517248980182025,
"learning_rate": 9.774607185984004e-07,
"loss": 0.2515,
"step": 1348
},
{
"epoch": 0.8194381169324222,
"grad_norm": 0.93504230650595,
"learning_rate": 9.711744716951093e-07,
"loss": 0.241,
"step": 1349
},
{
"epoch": 0.8200455580865603,
"grad_norm": 1.0424392516819492,
"learning_rate": 9.649063290875771e-07,
"loss": 0.2197,
"step": 1350
},
{
"epoch": 0.8206529992406986,
"grad_norm": 2.171189768685288,
"learning_rate": 9.586563189428954e-07,
"loss": 0.2367,
"step": 1351
},
{
"epoch": 0.8212604403948367,
"grad_norm": 0.9333141948132236,
"learning_rate": 9.524244693466773e-07,
"loss": 0.2391,
"step": 1352
},
{
"epoch": 0.8218678815489749,
"grad_norm": 0.986280542191797,
"learning_rate": 9.462108083029287e-07,
"loss": 0.247,
"step": 1353
},
{
"epoch": 0.8224753227031132,
"grad_norm": 1.1077079150850047,
"learning_rate": 9.400153637339182e-07,
"loss": 0.2908,
"step": 1354
},
{
"epoch": 0.8230827638572513,
"grad_norm": 1.3469195622589663,
"learning_rate": 9.338381634800597e-07,
"loss": 0.2264,
"step": 1355
},
{
"epoch": 0.8236902050113896,
"grad_norm": 2.3223661832113476,
"learning_rate": 9.276792352997782e-07,
"loss": 0.2334,
"step": 1356
},
{
"epoch": 0.8242976461655277,
"grad_norm": 0.965162694370609,
"learning_rate": 9.215386068693927e-07,
"loss": 0.2554,
"step": 1357
},
{
"epoch": 0.8249050873196659,
"grad_norm": 1.1093572084109473,
"learning_rate": 9.154163057829879e-07,
"loss": 0.2328,
"step": 1358
},
{
"epoch": 0.8255125284738041,
"grad_norm": 0.9658006597774278,
"learning_rate": 9.093123595522929e-07,
"loss": 0.2641,
"step": 1359
},
{
"epoch": 0.8261199696279423,
"grad_norm": 1.1141024465330946,
"learning_rate": 9.032267956065516e-07,
"loss": 0.2168,
"step": 1360
},
{
"epoch": 0.8267274107820805,
"grad_norm": 1.085834176055846,
"learning_rate": 8.971596412924067e-07,
"loss": 0.2665,
"step": 1361
},
{
"epoch": 0.8273348519362187,
"grad_norm": 1.0277054005618411,
"learning_rate": 8.911109238737748e-07,
"loss": 0.2654,
"step": 1362
},
{
"epoch": 0.8279422930903568,
"grad_norm": 1.1286512062535323,
"learning_rate": 8.850806705317183e-07,
"loss": 0.2572,
"step": 1363
},
{
"epoch": 0.8285497342444951,
"grad_norm": 0.9881387168493946,
"learning_rate": 8.790689083643328e-07,
"loss": 0.2762,
"step": 1364
},
{
"epoch": 0.8291571753986332,
"grad_norm": 1.7913780275956543,
"learning_rate": 8.730756643866157e-07,
"loss": 0.2728,
"step": 1365
},
{
"epoch": 0.8297646165527715,
"grad_norm": 1.0438179673664785,
"learning_rate": 8.671009655303531e-07,
"loss": 0.2876,
"step": 1366
},
{
"epoch": 0.8303720577069097,
"grad_norm": 1.1157108355581231,
"learning_rate": 8.611448386439936e-07,
"loss": 0.2582,
"step": 1367
},
{
"epoch": 0.8309794988610478,
"grad_norm": 1.5568021179946305,
"learning_rate": 8.552073104925296e-07,
"loss": 0.2428,
"step": 1368
},
{
"epoch": 0.8315869400151861,
"grad_norm": 1.0699739124750929,
"learning_rate": 8.492884077573749e-07,
"loss": 0.2568,
"step": 1369
},
{
"epoch": 0.8321943811693242,
"grad_norm": 0.7926743610930811,
"learning_rate": 8.433881570362484e-07,
"loss": 0.2176,
"step": 1370
},
{
"epoch": 0.8328018223234624,
"grad_norm": 1.225524035303792,
"learning_rate": 8.375065848430508e-07,
"loss": 0.274,
"step": 1371
},
{
"epoch": 0.8334092634776006,
"grad_norm": 0.9030010611168635,
"learning_rate": 8.316437176077491e-07,
"loss": 0.2649,
"step": 1372
},
{
"epoch": 0.8340167046317388,
"grad_norm": 0.8628776555924657,
"learning_rate": 8.257995816762559e-07,
"loss": 0.238,
"step": 1373
},
{
"epoch": 0.834624145785877,
"grad_norm": 0.9519929470187486,
"learning_rate": 8.199742033103091e-07,
"loss": 0.22,
"step": 1374
},
{
"epoch": 0.8352315869400152,
"grad_norm": 0.9316429752072123,
"learning_rate": 8.141676086873574e-07,
"loss": 0.2523,
"step": 1375
},
{
"epoch": 0.8358390280941533,
"grad_norm": 1.5230699643150925,
"learning_rate": 8.083798239004408e-07,
"loss": 0.2601,
"step": 1376
},
{
"epoch": 0.8364464692482916,
"grad_norm": 1.0766184599801747,
"learning_rate": 8.026108749580758e-07,
"loss": 0.2538,
"step": 1377
},
{
"epoch": 0.8370539104024297,
"grad_norm": 2.320789746346856,
"learning_rate": 7.968607877841333e-07,
"loss": 0.2844,
"step": 1378
},
{
"epoch": 0.837661351556568,
"grad_norm": 0.8792983215848907,
"learning_rate": 7.911295882177256e-07,
"loss": 0.236,
"step": 1379
},
{
"epoch": 0.8382687927107062,
"grad_norm": 1.6787572932160961,
"learning_rate": 7.854173020130906e-07,
"loss": 0.2403,
"step": 1380
},
{
"epoch": 0.8388762338648443,
"grad_norm": 0.9829023277192255,
"learning_rate": 7.79723954839477e-07,
"loss": 0.2287,
"step": 1381
},
{
"epoch": 0.8394836750189826,
"grad_norm": 1.7023786251385593,
"learning_rate": 7.740495722810271e-07,
"loss": 0.2435,
"step": 1382
},
{
"epoch": 0.8400911161731207,
"grad_norm": 0.9441864654332802,
"learning_rate": 7.683941798366578e-07,
"loss": 0.2906,
"step": 1383
},
{
"epoch": 0.840698557327259,
"grad_norm": 1.2877471436954095,
"learning_rate": 7.627578029199562e-07,
"loss": 0.2498,
"step": 1384
},
{
"epoch": 0.8413059984813971,
"grad_norm": 0.9842560572472016,
"learning_rate": 7.571404668590532e-07,
"loss": 0.2742,
"step": 1385
},
{
"epoch": 0.8419134396355353,
"grad_norm": 1.130499069526576,
"learning_rate": 7.515421968965242e-07,
"loss": 0.2285,
"step": 1386
},
{
"epoch": 0.8425208807896735,
"grad_norm": 0.9696805118746494,
"learning_rate": 7.459630181892608e-07,
"loss": 0.262,
"step": 1387
},
{
"epoch": 0.8431283219438117,
"grad_norm": 1.2144471408339115,
"learning_rate": 7.404029558083653e-07,
"loss": 0.2675,
"step": 1388
},
{
"epoch": 0.8437357630979498,
"grad_norm": 1.0391512842651585,
"learning_rate": 7.348620347390384e-07,
"loss": 0.2855,
"step": 1389
},
{
"epoch": 0.8443432042520881,
"grad_norm": 0.9328634429354515,
"learning_rate": 7.293402798804667e-07,
"loss": 0.2345,
"step": 1390
},
{
"epoch": 0.8449506454062262,
"grad_norm": 0.9723351719934986,
"learning_rate": 7.238377160457094e-07,
"loss": 0.2645,
"step": 1391
},
{
"epoch": 0.8455580865603645,
"grad_norm": 1.4147751762924923,
"learning_rate": 7.183543679615834e-07,
"loss": 0.2626,
"step": 1392
},
{
"epoch": 0.8461655277145027,
"grad_norm": 1.177367440845403,
"learning_rate": 7.128902602685617e-07,
"loss": 0.2709,
"step": 1393
},
{
"epoch": 0.8467729688686408,
"grad_norm": 1.0052526002592344,
"learning_rate": 7.074454175206524e-07,
"loss": 0.2464,
"step": 1394
},
{
"epoch": 0.8473804100227791,
"grad_norm": 0.977663222241663,
"learning_rate": 7.020198641852949e-07,
"loss": 0.2447,
"step": 1395
},
{
"epoch": 0.8479878511769172,
"grad_norm": 2.675337956474618,
"learning_rate": 6.966136246432492e-07,
"loss": 0.2647,
"step": 1396
},
{
"epoch": 0.8485952923310555,
"grad_norm": 0.9873856637443686,
"learning_rate": 6.912267231884817e-07,
"loss": 0.266,
"step": 1397
},
{
"epoch": 0.8492027334851936,
"grad_norm": 1.003763602141105,
"learning_rate": 6.858591840280627e-07,
"loss": 0.2891,
"step": 1398
},
{
"epoch": 0.8498101746393318,
"grad_norm": 0.917577761815308,
"learning_rate": 6.805110312820501e-07,
"loss": 0.2545,
"step": 1399
},
{
"epoch": 0.85041761579347,
"grad_norm": 0.8727291361789857,
"learning_rate": 6.751822889833926e-07,
"loss": 0.2522,
"step": 1400
},
{
"epoch": 0.8510250569476082,
"grad_norm": 0.9567515585475891,
"learning_rate": 6.698729810778065e-07,
"loss": 0.2411,
"step": 1401
},
{
"epoch": 0.8516324981017463,
"grad_norm": 0.8896081239860772,
"learning_rate": 6.645831314236817e-07,
"loss": 0.2484,
"step": 1402
},
{
"epoch": 0.8522399392558846,
"grad_norm": 1.1157181737389836,
"learning_rate": 6.593127637919633e-07,
"loss": 0.2852,
"step": 1403
},
{
"epoch": 0.8528473804100227,
"grad_norm": 0.9750689112401771,
"learning_rate": 6.540619018660555e-07,
"loss": 0.2512,
"step": 1404
},
{
"epoch": 0.853454821564161,
"grad_norm": 0.8887565653775829,
"learning_rate": 6.488305692417074e-07,
"loss": 0.2538,
"step": 1405
},
{
"epoch": 0.8540622627182992,
"grad_norm": 0.9449518854689749,
"learning_rate": 6.436187894269086e-07,
"loss": 0.2412,
"step": 1406
},
{
"epoch": 0.8546697038724373,
"grad_norm": 1.1922033805891283,
"learning_rate": 6.384265858417877e-07,
"loss": 0.2618,
"step": 1407
},
{
"epoch": 0.8552771450265756,
"grad_norm": 0.9056621667680124,
"learning_rate": 6.332539818184985e-07,
"loss": 0.2363,
"step": 1408
},
{
"epoch": 0.8558845861807137,
"grad_norm": 1.0040516816367477,
"learning_rate": 6.281010006011256e-07,
"loss": 0.2511,
"step": 1409
},
{
"epoch": 0.856492027334852,
"grad_norm": 0.9636326863778567,
"learning_rate": 6.229676653455719e-07,
"loss": 0.2861,
"step": 1410
},
{
"epoch": 0.8570994684889901,
"grad_norm": 1.267651631290491,
"learning_rate": 6.178539991194599e-07,
"loss": 0.2562,
"step": 1411
},
{
"epoch": 0.8577069096431283,
"grad_norm": 1.125217028031925,
"learning_rate": 6.127600249020216e-07,
"loss": 0.2394,
"step": 1412
},
{
"epoch": 0.8583143507972665,
"grad_norm": 1.16266685601348,
"learning_rate": 6.076857655840024e-07,
"loss": 0.2844,
"step": 1413
},
{
"epoch": 0.8589217919514047,
"grad_norm": 0.9618647822548747,
"learning_rate": 6.026312439675553e-07,
"loss": 0.2221,
"step": 1414
},
{
"epoch": 0.8595292331055429,
"grad_norm": 1.0967811765567483,
"learning_rate": 5.975964827661346e-07,
"loss": 0.2839,
"step": 1415
},
{
"epoch": 0.8601366742596811,
"grad_norm": 1.2542292575403695,
"learning_rate": 5.925815046044026e-07,
"loss": 0.283,
"step": 1416
},
{
"epoch": 0.8607441154138192,
"grad_norm": 0.9685662585305622,
"learning_rate": 5.875863320181175e-07,
"loss": 0.2386,
"step": 1417
},
{
"epoch": 0.8613515565679575,
"grad_norm": 1.2191630870814079,
"learning_rate": 5.826109874540409e-07,
"loss": 0.2672,
"step": 1418
},
{
"epoch": 0.8619589977220957,
"grad_norm": 0.9467425337398014,
"learning_rate": 5.776554932698325e-07,
"loss": 0.2645,
"step": 1419
},
{
"epoch": 0.8625664388762339,
"grad_norm": 3.223483400807002,
"learning_rate": 5.727198717339511e-07,
"loss": 0.2326,
"step": 1420
},
{
"epoch": 0.8631738800303721,
"grad_norm": 0.9512745639027146,
"learning_rate": 5.678041450255512e-07,
"loss": 0.2629,
"step": 1421
},
{
"epoch": 0.8637813211845102,
"grad_norm": 0.9320452148866075,
"learning_rate": 5.6290833523439e-07,
"loss": 0.2641,
"step": 1422
},
{
"epoch": 0.8643887623386485,
"grad_norm": 1.0278008843267301,
"learning_rate": 5.58032464360721e-07,
"loss": 0.2803,
"step": 1423
},
{
"epoch": 0.8649962034927866,
"grad_norm": 0.908323450955481,
"learning_rate": 5.531765543152002e-07,
"loss": 0.2356,
"step": 1424
},
{
"epoch": 0.8656036446469249,
"grad_norm": 0.8303574957373083,
"learning_rate": 5.483406269187869e-07,
"loss": 0.2189,
"step": 1425
},
{
"epoch": 0.866211085801063,
"grad_norm": 1.1970452420325983,
"learning_rate": 5.435247039026398e-07,
"loss": 0.2094,
"step": 1426
},
{
"epoch": 0.8668185269552012,
"grad_norm": 1.2257111130524938,
"learning_rate": 5.387288069080298e-07,
"loss": 0.231,
"step": 1427
},
{
"epoch": 0.8674259681093394,
"grad_norm": 0.9627292722754438,
"learning_rate": 5.33952957486234e-07,
"loss": 0.2333,
"step": 1428
},
{
"epoch": 0.8680334092634776,
"grad_norm": 1.032967615425608,
"learning_rate": 5.291971770984428e-07,
"loss": 0.2958,
"step": 1429
},
{
"epoch": 0.8686408504176157,
"grad_norm": 1.139677124417918,
"learning_rate": 5.244614871156612e-07,
"loss": 0.2405,
"step": 1430
},
{
"epoch": 0.869248291571754,
"grad_norm": 1.2580527412823377,
"learning_rate": 5.197459088186163e-07,
"loss": 0.221,
"step": 1431
},
{
"epoch": 0.8698557327258922,
"grad_norm": 1.1944284727855752,
"learning_rate": 5.150504633976572e-07,
"loss": 0.2859,
"step": 1432
},
{
"epoch": 0.8704631738800304,
"grad_norm": 1.0957916763809294,
"learning_rate": 5.103751719526639e-07,
"loss": 0.2239,
"step": 1433
},
{
"epoch": 0.8710706150341686,
"grad_norm": 1.0470854505908578,
"learning_rate": 5.057200554929509e-07,
"loss": 0.2574,
"step": 1434
},
{
"epoch": 0.8716780561883067,
"grad_norm": 1.1296719722218975,
"learning_rate": 5.010851349371704e-07,
"loss": 0.2639,
"step": 1435
},
{
"epoch": 0.872285497342445,
"grad_norm": 2.711592251059139,
"learning_rate": 4.964704311132224e-07,
"loss": 0.2488,
"step": 1436
},
{
"epoch": 0.8728929384965831,
"grad_norm": 0.953048159062841,
"learning_rate": 4.918759647581578e-07,
"loss": 0.2581,
"step": 1437
},
{
"epoch": 0.8735003796507214,
"grad_norm": 1.0707876735381872,
"learning_rate": 4.873017565180871e-07,
"loss": 0.2578,
"step": 1438
},
{
"epoch": 0.8741078208048595,
"grad_norm": 0.9374479476013973,
"learning_rate": 4.827478269480895e-07,
"loss": 0.2405,
"step": 1439
},
{
"epoch": 0.8747152619589977,
"grad_norm": 1.0277390378554292,
"learning_rate": 4.782141965121129e-07,
"loss": 0.2701,
"step": 1440
},
{
"epoch": 0.8753227031131359,
"grad_norm": 1.4882213121058918,
"learning_rate": 4.7370088558289175e-07,
"loss": 0.2886,
"step": 1441
},
{
"epoch": 0.8759301442672741,
"grad_norm": 1.3548392131624356,
"learning_rate": 4.6920791444184934e-07,
"loss": 0.2471,
"step": 1442
},
{
"epoch": 0.8765375854214122,
"grad_norm": 1.0466228565297642,
"learning_rate": 4.647353032790086e-07,
"loss": 0.2414,
"step": 1443
},
{
"epoch": 0.8771450265755505,
"grad_norm": 0.8743868205121337,
"learning_rate": 4.602830721928997e-07,
"loss": 0.2079,
"step": 1444
},
{
"epoch": 0.8777524677296887,
"grad_norm": 1.0599925032639006,
"learning_rate": 4.558512411904731e-07,
"loss": 0.2949,
"step": 1445
},
{
"epoch": 0.8783599088838269,
"grad_norm": 1.4979260055048251,
"learning_rate": 4.5143983018700485e-07,
"loss": 0.249,
"step": 1446
},
{
"epoch": 0.8789673500379651,
"grad_norm": 0.9462464867555567,
"learning_rate": 4.4704885900601236e-07,
"loss": 0.2422,
"step": 1447
},
{
"epoch": 0.8795747911921032,
"grad_norm": 1.0122245637859872,
"learning_rate": 4.4267834737916295e-07,
"loss": 0.2516,
"step": 1448
},
{
"epoch": 0.8801822323462415,
"grad_norm": 1.2775093153721113,
"learning_rate": 4.3832831494618255e-07,
"loss": 0.2585,
"step": 1449
},
{
"epoch": 0.8807896735003796,
"grad_norm": 0.9907820157094275,
"learning_rate": 4.33998781254773e-07,
"loss": 0.2508,
"step": 1450
},
{
"epoch": 0.8813971146545179,
"grad_norm": 0.9761789386358818,
"learning_rate": 4.2968976576051703e-07,
"loss": 0.2848,
"step": 1451
},
{
"epoch": 0.882004555808656,
"grad_norm": 0.8788756221973065,
"learning_rate": 4.2540128782679934e-07,
"loss": 0.2185,
"step": 1452
},
{
"epoch": 0.8826119969627942,
"grad_norm": 0.9350228380136899,
"learning_rate": 4.211333667247125e-07,
"loss": 0.2464,
"step": 1453
},
{
"epoch": 0.8832194381169324,
"grad_norm": 0.9813301382441217,
"learning_rate": 4.1688602163297564e-07,
"loss": 0.2666,
"step": 1454
},
{
"epoch": 0.8838268792710706,
"grad_norm": 0.9810743433744146,
"learning_rate": 4.126592716378408e-07,
"loss": 0.2296,
"step": 1455
},
{
"epoch": 0.8844343204252088,
"grad_norm": 1.2038878027224096,
"learning_rate": 4.0845313573301736e-07,
"loss": 0.2682,
"step": 1456
},
{
"epoch": 0.885041761579347,
"grad_norm": 0.9884971893945054,
"learning_rate": 4.042676328195788e-07,
"loss": 0.2643,
"step": 1457
},
{
"epoch": 0.8856492027334852,
"grad_norm": 5.323233068234899,
"learning_rate": 4.001027817058789e-07,
"loss": 0.238,
"step": 1458
},
{
"epoch": 0.8862566438876234,
"grad_norm": 0.9225650218982654,
"learning_rate": 3.959586011074729e-07,
"loss": 0.2155,
"step": 1459
},
{
"epoch": 0.8868640850417616,
"grad_norm": 0.9528679131681773,
"learning_rate": 3.9183510964702463e-07,
"loss": 0.2418,
"step": 1460
},
{
"epoch": 0.8874715261958998,
"grad_norm": 1.1303434388021751,
"learning_rate": 3.8773232585422924e-07,
"loss": 0.2297,
"step": 1461
},
{
"epoch": 0.888078967350038,
"grad_norm": 0.9900217922322905,
"learning_rate": 3.836502681657289e-07,
"loss": 0.2462,
"step": 1462
},
{
"epoch": 0.8886864085041761,
"grad_norm": 0.887907349960081,
"learning_rate": 3.795889549250292e-07,
"loss": 0.2171,
"step": 1463
},
{
"epoch": 0.8892938496583144,
"grad_norm": 0.9248553866957503,
"learning_rate": 3.755484043824131e-07,
"loss": 0.2243,
"step": 1464
},
{
"epoch": 0.8899012908124525,
"grad_norm": 0.8912025953132797,
"learning_rate": 3.715286346948671e-07,
"loss": 0.2149,
"step": 1465
},
{
"epoch": 0.8905087319665907,
"grad_norm": 0.8751189326641786,
"learning_rate": 3.675296639259912e-07,
"loss": 0.228,
"step": 1466
},
{
"epoch": 0.8911161731207289,
"grad_norm": 1.0691558945787711,
"learning_rate": 3.6355151004592414e-07,
"loss": 0.2233,
"step": 1467
},
{
"epoch": 0.8917236142748671,
"grad_norm": 1.3028028349470695,
"learning_rate": 3.595941909312595e-07,
"loss": 0.2603,
"step": 1468
},
{
"epoch": 0.8923310554290053,
"grad_norm": 0.8896967704695612,
"learning_rate": 3.5565772436496336e-07,
"loss": 0.2269,
"step": 1469
},
{
"epoch": 0.8929384965831435,
"grad_norm": 0.907343561878061,
"learning_rate": 3.517421280363004e-07,
"loss": 0.2477,
"step": 1470
},
{
"epoch": 0.8935459377372817,
"grad_norm": 0.9527250045159997,
"learning_rate": 3.4784741954074884e-07,
"loss": 0.2645,
"step": 1471
},
{
"epoch": 0.8941533788914199,
"grad_norm": 0.9958249751109337,
"learning_rate": 3.439736163799251e-07,
"loss": 0.2331,
"step": 1472
},
{
"epoch": 0.8947608200455581,
"grad_norm": 1.2873604817141377,
"learning_rate": 3.4012073596150106e-07,
"loss": 0.235,
"step": 1473
},
{
"epoch": 0.8953682611996963,
"grad_norm": 0.8961669297469559,
"learning_rate": 3.362887955991301e-07,
"loss": 0.2408,
"step": 1474
},
{
"epoch": 0.8959757023538345,
"grad_norm": 1.051005440352167,
"learning_rate": 3.3247781251236623e-07,
"loss": 0.256,
"step": 1475
},
{
"epoch": 0.8965831435079726,
"grad_norm": 1.396804063195131,
"learning_rate": 3.2868780382658895e-07,
"loss": 0.2259,
"step": 1476
},
{
"epoch": 0.8971905846621109,
"grad_norm": 0.8551324108863454,
"learning_rate": 3.2491878657292643e-07,
"loss": 0.2552,
"step": 1477
},
{
"epoch": 0.897798025816249,
"grad_norm": 0.9274670411652475,
"learning_rate": 3.2117077768817395e-07,
"loss": 0.2271,
"step": 1478
},
{
"epoch": 0.8984054669703873,
"grad_norm": 1.2221587711171944,
"learning_rate": 3.174437940147268e-07,
"loss": 0.2447,
"step": 1479
},
{
"epoch": 0.8990129081245254,
"grad_norm": 6.662057062986324,
"learning_rate": 3.1373785230049356e-07,
"loss": 0.2718,
"step": 1480
},
{
"epoch": 0.8996203492786636,
"grad_norm": 2.162458231338751,
"learning_rate": 3.1005296919883354e-07,
"loss": 0.2563,
"step": 1481
},
{
"epoch": 0.9002277904328019,
"grad_norm": 1.1484595739033663,
"learning_rate": 3.0638916126846885e-07,
"loss": 0.2488,
"step": 1482
},
{
"epoch": 0.90083523158694,
"grad_norm": 0.9027654629735503,
"learning_rate": 3.0274644497342133e-07,
"loss": 0.2304,
"step": 1483
},
{
"epoch": 0.9014426727410783,
"grad_norm": 0.950573848140933,
"learning_rate": 2.991248366829291e-07,
"loss": 0.2141,
"step": 1484
},
{
"epoch": 0.9020501138952164,
"grad_norm": 0.8792954327784936,
"learning_rate": 2.955243526713808e-07,
"loss": 0.2382,
"step": 1485
},
{
"epoch": 0.9026575550493546,
"grad_norm": 1.1214719838180265,
"learning_rate": 2.91945009118238e-07,
"loss": 0.2527,
"step": 1486
},
{
"epoch": 0.9032649962034928,
"grad_norm": 1.1337122451080186,
"learning_rate": 2.883868221079628e-07,
"loss": 0.3125,
"step": 1487
},
{
"epoch": 0.903872437357631,
"grad_norm": 1.149501989804444,
"learning_rate": 2.848498076299483e-07,
"loss": 0.2788,
"step": 1488
},
{
"epoch": 0.9044798785117691,
"grad_norm": 0.9593512296584564,
"learning_rate": 2.813339815784416e-07,
"loss": 0.2439,
"step": 1489
},
{
"epoch": 0.9050873196659074,
"grad_norm": 1.0889375186520727,
"learning_rate": 2.7783935975247867e-07,
"loss": 0.2679,
"step": 1490
},
{
"epoch": 0.9056947608200455,
"grad_norm": 1.0266400030570082,
"learning_rate": 2.743659578558089e-07,
"loss": 0.2375,
"step": 1491
},
{
"epoch": 0.9063022019741838,
"grad_norm": 0.9071334642534141,
"learning_rate": 2.7091379149682683e-07,
"loss": 0.2293,
"step": 1492
},
{
"epoch": 0.9069096431283219,
"grad_norm": 1.2936790169599448,
"learning_rate": 2.6748287618849957e-07,
"loss": 0.2409,
"step": 1493
},
{
"epoch": 0.9075170842824601,
"grad_norm": 1.0584178177194592,
"learning_rate": 2.6407322734829763e-07,
"loss": 0.236,
"step": 1494
},
{
"epoch": 0.9081245254365984,
"grad_norm": 1.3313437851181316,
"learning_rate": 2.6068486029813154e-07,
"loss": 0.2356,
"step": 1495
},
{
"epoch": 0.9087319665907365,
"grad_norm": 0.9005677622751922,
"learning_rate": 2.573177902642726e-07,
"loss": 0.2245,
"step": 1496
},
{
"epoch": 0.9093394077448748,
"grad_norm": 0.9376710431209911,
"learning_rate": 2.539720323772926e-07,
"loss": 0.2416,
"step": 1497
},
{
"epoch": 0.9099468488990129,
"grad_norm": 0.9311283280877473,
"learning_rate": 2.506476016719922e-07,
"loss": 0.2341,
"step": 1498
},
{
"epoch": 0.9105542900531511,
"grad_norm": 1.1243831646626379,
"learning_rate": 2.473445130873353e-07,
"loss": 0.2628,
"step": 1499
},
{
"epoch": 0.9111617312072893,
"grad_norm": 0.9100015428632143,
"learning_rate": 2.440627814663804e-07,
"loss": 0.235,
"step": 1500
},
{
"epoch": 0.9117691723614275,
"grad_norm": 1.8437664997799614,
"learning_rate": 2.4080242155621327e-07,
"loss": 0.2469,
"step": 1501
},
{
"epoch": 0.9123766135155656,
"grad_norm": 1.0290473471963233,
"learning_rate": 2.3756344800788421e-07,
"loss": 0.2474,
"step": 1502
},
{
"epoch": 0.9129840546697039,
"grad_norm": 1.1477504753716588,
"learning_rate": 2.343458753763378e-07,
"loss": 0.2242,
"step": 1503
},
{
"epoch": 0.913591495823842,
"grad_norm": 1.1198944079500255,
"learning_rate": 2.3114971812034981e-07,
"loss": 0.2504,
"step": 1504
},
{
"epoch": 0.9141989369779803,
"grad_norm": 0.945076046858483,
"learning_rate": 2.2797499060246253e-07,
"loss": 0.2517,
"step": 1505
},
{
"epoch": 0.9148063781321184,
"grad_norm": 1.285167047982773,
"learning_rate": 2.2482170708892083e-07,
"loss": 0.2333,
"step": 1506
},
{
"epoch": 0.9154138192862566,
"grad_norm": 0.9188875249417393,
"learning_rate": 2.2168988174960382e-07,
"loss": 0.242,
"step": 1507
},
{
"epoch": 0.9160212604403949,
"grad_norm": 0.9392029956247224,
"learning_rate": 2.1857952865796616e-07,
"loss": 0.2494,
"step": 1508
},
{
"epoch": 0.916628701594533,
"grad_norm": 3.5960021463661223,
"learning_rate": 2.1549066179097355e-07,
"loss": 0.2581,
"step": 1509
},
{
"epoch": 0.9172361427486713,
"grad_norm": 1.4581742375551667,
"learning_rate": 2.124232950290367e-07,
"loss": 0.2536,
"step": 1510
},
{
"epoch": 0.9178435839028094,
"grad_norm": 1.0527162495034155,
"learning_rate": 2.0937744215595467e-07,
"loss": 0.2409,
"step": 1511
},
{
"epoch": 0.9184510250569476,
"grad_norm": 1.4110685445772864,
"learning_rate": 2.0635311685884675e-07,
"loss": 0.2095,
"step": 1512
},
{
"epoch": 0.9190584662110858,
"grad_norm": 1.213907408406235,
"learning_rate": 2.0335033272809612e-07,
"loss": 0.2757,
"step": 1513
},
{
"epoch": 0.919665907365224,
"grad_norm": 2.0561680152756114,
"learning_rate": 2.0036910325728521e-07,
"loss": 0.2397,
"step": 1514
},
{
"epoch": 0.9202733485193622,
"grad_norm": 1.0091633854522606,
"learning_rate": 1.9740944184313882e-07,
"loss": 0.256,
"step": 1515
},
{
"epoch": 0.9208807896735004,
"grad_norm": 1.374619992070271,
"learning_rate": 1.9447136178545766e-07,
"loss": 0.2351,
"step": 1516
},
{
"epoch": 0.9214882308276385,
"grad_norm": 1.0149607815681039,
"learning_rate": 1.9155487628706672e-07,
"loss": 0.2149,
"step": 1517
},
{
"epoch": 0.9220956719817768,
"grad_norm": 1.5286222741924442,
"learning_rate": 1.8865999845374794e-07,
"loss": 0.2401,
"step": 1518
},
{
"epoch": 0.9227031131359149,
"grad_norm": 1.1238029435165344,
"learning_rate": 1.857867412941883e-07,
"loss": 0.2259,
"step": 1519
},
{
"epoch": 0.9233105542900532,
"grad_norm": 1.047113679889672,
"learning_rate": 1.8293511771991624e-07,
"loss": 0.2562,
"step": 1520
},
{
"epoch": 0.9239179954441914,
"grad_norm": 1.1015346889794326,
"learning_rate": 1.8010514054524531e-07,
"loss": 0.2496,
"step": 1521
},
{
"epoch": 0.9245254365983295,
"grad_norm": 0.9348867843858392,
"learning_rate": 1.7729682248721848e-07,
"loss": 0.2193,
"step": 1522
},
{
"epoch": 0.9251328777524678,
"grad_norm": 0.900662481617006,
"learning_rate": 1.7451017616554822e-07,
"loss": 0.2346,
"step": 1523
},
{
"epoch": 0.9257403189066059,
"grad_norm": 1.0079629512704111,
"learning_rate": 1.7174521410256162e-07,
"loss": 0.2739,
"step": 1524
},
{
"epoch": 0.9263477600607442,
"grad_norm": 1.0034552226211848,
"learning_rate": 1.69001948723142e-07,
"loss": 0.2709,
"step": 1525
},
{
"epoch": 0.9269552012148823,
"grad_norm": 1.1773428284591294,
"learning_rate": 1.6628039235467686e-07,
"loss": 0.2472,
"step": 1526
},
{
"epoch": 0.9275626423690205,
"grad_norm": 1.1087538756550075,
"learning_rate": 1.6358055722699662e-07,
"loss": 0.2376,
"step": 1527
},
{
"epoch": 0.9281700835231587,
"grad_norm": 3.5302378698596972,
"learning_rate": 1.6090245547232707e-07,
"loss": 0.2445,
"step": 1528
},
{
"epoch": 0.9287775246772969,
"grad_norm": 0.9318045452754465,
"learning_rate": 1.5824609912522825e-07,
"loss": 0.2495,
"step": 1529
},
{
"epoch": 0.929384965831435,
"grad_norm": 0.9925458437861561,
"learning_rate": 1.5561150012254446e-07,
"loss": 0.252,
"step": 1530
},
{
"epoch": 0.9299924069855733,
"grad_norm": 0.9204194360128435,
"learning_rate": 1.5299867030334815e-07,
"loss": 0.2544,
"step": 1531
},
{
"epoch": 0.9305998481397114,
"grad_norm": 1.4389165535169934,
"learning_rate": 1.5040762140888843e-07,
"loss": 0.2509,
"step": 1532
},
{
"epoch": 0.9312072892938497,
"grad_norm": 0.9464143937114549,
"learning_rate": 1.4783836508253823e-07,
"loss": 0.219,
"step": 1533
},
{
"epoch": 0.9318147304479879,
"grad_norm": 1.1584265502532431,
"learning_rate": 1.4529091286973994e-07,
"loss": 0.2584,
"step": 1534
},
{
"epoch": 0.932422171602126,
"grad_norm": 1.0767653117954572,
"learning_rate": 1.4276527621795655e-07,
"loss": 0.2477,
"step": 1535
},
{
"epoch": 0.9330296127562643,
"grad_norm": 1.1174557743113676,
"learning_rate": 1.402614664766172e-07,
"loss": 0.2515,
"step": 1536
},
{
"epoch": 0.9336370539104024,
"grad_norm": 0.9313798735305144,
"learning_rate": 1.3777949489706898e-07,
"loss": 0.231,
"step": 1537
},
{
"epoch": 0.9342444950645407,
"grad_norm": 1.1379310451818712,
"learning_rate": 1.353193726325247e-07,
"loss": 0.2503,
"step": 1538
},
{
"epoch": 0.9348519362186788,
"grad_norm": 1.1585882440499968,
"learning_rate": 1.3288111073801235e-07,
"loss": 0.2784,
"step": 1539
},
{
"epoch": 0.935459377372817,
"grad_norm": 1.5930803179580344,
"learning_rate": 1.3046472017032685e-07,
"loss": 0.2418,
"step": 1540
},
{
"epoch": 0.9360668185269552,
"grad_norm": 0.9991292646052891,
"learning_rate": 1.280702117879795e-07,
"loss": 0.2397,
"step": 1541
},
{
"epoch": 0.9366742596810934,
"grad_norm": 1.212096857283085,
"learning_rate": 1.2569759635115086e-07,
"loss": 0.2582,
"step": 1542
},
{
"epoch": 0.9372817008352315,
"grad_norm": 1.1250193762265426,
"learning_rate": 1.2334688452164122e-07,
"loss": 0.2575,
"step": 1543
},
{
"epoch": 0.9378891419893698,
"grad_norm": 0.971115660382781,
"learning_rate": 1.210180868628219e-07,
"loss": 0.271,
"step": 1544
},
{
"epoch": 0.9384965831435079,
"grad_norm": 0.907500253470022,
"learning_rate": 1.1871121383958961e-07,
"loss": 0.2392,
"step": 1545
},
{
"epoch": 0.9391040242976462,
"grad_norm": 1.47219686578771,
"learning_rate": 1.1642627581831767e-07,
"loss": 0.2533,
"step": 1546
},
{
"epoch": 0.9397114654517844,
"grad_norm": 1.4887563913664645,
"learning_rate": 1.1416328306681046e-07,
"loss": 0.2665,
"step": 1547
},
{
"epoch": 0.9403189066059225,
"grad_norm": 0.9182957946443633,
"learning_rate": 1.1192224575425848e-07,
"loss": 0.2233,
"step": 1548
},
{
"epoch": 0.9409263477600608,
"grad_norm": 1.0661000364774975,
"learning_rate": 1.0970317395119001e-07,
"loss": 0.2722,
"step": 1549
},
{
"epoch": 0.9415337889141989,
"grad_norm": 1.156185857708016,
"learning_rate": 1.0750607762942622e-07,
"loss": 0.2374,
"step": 1550
},
{
"epoch": 0.9421412300683372,
"grad_norm": 1.0021679642199284,
"learning_rate": 1.0533096666203946e-07,
"loss": 0.247,
"step": 1551
},
{
"epoch": 0.9427486712224753,
"grad_norm": 1.062302865690974,
"learning_rate": 1.0317785082330555e-07,
"loss": 0.2415,
"step": 1552
},
{
"epoch": 0.9433561123766135,
"grad_norm": 0.8324437312272753,
"learning_rate": 1.0104673978866164e-07,
"loss": 0.2131,
"step": 1553
},
{
"epoch": 0.9439635535307517,
"grad_norm": 1.2649813678192605,
"learning_rate": 9.89376431346606e-08,
"loss": 0.276,
"step": 1554
},
{
"epoch": 0.9445709946848899,
"grad_norm": 1.622757976163991,
"learning_rate": 9.685057033892998e-08,
"loss": 0.2582,
"step": 1555
},
{
"epoch": 0.945178435839028,
"grad_norm": 1.765578442579649,
"learning_rate": 9.478553078013042e-08,
"loss": 0.2553,
"step": 1556
},
{
"epoch": 0.9457858769931663,
"grad_norm": 1.4608500363168406,
"learning_rate": 9.274253373791064e-08,
"loss": 0.2555,
"step": 1557
},
{
"epoch": 0.9463933181473044,
"grad_norm": 0.8888287744971497,
"learning_rate": 9.072158839286748e-08,
"loss": 0.2405,
"step": 1558
},
{
"epoch": 0.9470007593014427,
"grad_norm": 1.269488158961429,
"learning_rate": 8.872270382650372e-08,
"loss": 0.2397,
"step": 1559
},
{
"epoch": 0.9476082004555809,
"grad_norm": 0.9140050155362377,
"learning_rate": 8.674588902118919e-08,
"loss": 0.2581,
"step": 1560
},
{
"epoch": 0.948215641609719,
"grad_norm": 0.9350253085292872,
"learning_rate": 8.479115286011752e-08,
"loss": 0.2578,
"step": 1561
},
{
"epoch": 0.9488230827638573,
"grad_norm": 1.294156199026534,
"learning_rate": 8.285850412726837e-08,
"loss": 0.2768,
"step": 1562
},
{
"epoch": 0.9494305239179954,
"grad_norm": 0.8781441810000316,
"learning_rate": 8.094795150736745e-08,
"loss": 0.2124,
"step": 1563
},
{
"epoch": 0.9500379650721337,
"grad_norm": 0.8821255230738889,
"learning_rate": 7.905950358584768e-08,
"loss": 0.2358,
"step": 1564
},
{
"epoch": 0.9506454062262718,
"grad_norm": 1.08985926239788,
"learning_rate": 7.719316884880922e-08,
"loss": 0.2615,
"step": 1565
},
{
"epoch": 0.95125284738041,
"grad_norm": 0.8755195395367136,
"learning_rate": 7.534895568298395e-08,
"loss": 0.2352,
"step": 1566
},
{
"epoch": 0.9518602885345482,
"grad_norm": 0.9952147011406434,
"learning_rate": 7.352687237569489e-08,
"loss": 0.2557,
"step": 1567
},
{
"epoch": 0.9524677296886864,
"grad_norm": 1.1520050119158871,
"learning_rate": 7.172692711482022e-08,
"loss": 0.2156,
"step": 1568
},
{
"epoch": 0.9530751708428246,
"grad_norm": 1.0057754256396354,
"learning_rate": 6.994912798875875e-08,
"loss": 0.2465,
"step": 1569
},
{
"epoch": 0.9536826119969628,
"grad_norm": 1.0040861530461729,
"learning_rate": 6.819348298638839e-08,
"loss": 0.2816,
"step": 1570
},
{
"epoch": 0.9542900531511009,
"grad_norm": 1.0700694831197364,
"learning_rate": 6.6459999997035e-08,
"loss": 0.2412,
"step": 1571
},
{
"epoch": 0.9548974943052392,
"grad_norm": 1.038954346833618,
"learning_rate": 6.474868681043578e-08,
"loss": 0.2782,
"step": 1572
},
{
"epoch": 0.9555049354593774,
"grad_norm": 0.9247657016066214,
"learning_rate": 6.305955111670204e-08,
"loss": 0.241,
"step": 1573
},
{
"epoch": 0.9561123766135156,
"grad_norm": 1.2125088257095862,
"learning_rate": 6.13926005062876e-08,
"loss": 0.2506,
"step": 1574
},
{
"epoch": 0.9567198177676538,
"grad_norm": 2.8028781405959005,
"learning_rate": 5.974784246995214e-08,
"loss": 0.2117,
"step": 1575
},
{
"epoch": 0.9573272589217919,
"grad_norm": 1.2280962513812903,
"learning_rate": 5.8125284398730666e-08,
"loss": 0.2237,
"step": 1576
},
{
"epoch": 0.9579347000759302,
"grad_norm": 1.2300450579760327,
"learning_rate": 5.6524933583896326e-08,
"loss": 0.2126,
"step": 1577
},
{
"epoch": 0.9585421412300683,
"grad_norm": 2.4999481290325116,
"learning_rate": 5.4946797216931524e-08,
"loss": 0.2545,
"step": 1578
},
{
"epoch": 0.9591495823842066,
"grad_norm": 6.486220633951538,
"learning_rate": 5.339088238949186e-08,
"loss": 0.2354,
"step": 1579
},
{
"epoch": 0.9597570235383447,
"grad_norm": 0.9307011705790993,
"learning_rate": 5.185719609337836e-08,
"loss": 0.2342,
"step": 1580
},
{
"epoch": 0.9603644646924829,
"grad_norm": 0.9393824630850843,
"learning_rate": 5.034574522050251e-08,
"loss": 0.2467,
"step": 1581
},
{
"epoch": 0.9609719058466211,
"grad_norm": 1.4403224099920036,
"learning_rate": 4.885653656285627e-08,
"loss": 0.254,
"step": 1582
},
{
"epoch": 0.9615793470007593,
"grad_norm": 0.8733649961719668,
"learning_rate": 4.73895768124838e-08,
"loss": 0.2441,
"step": 1583
},
{
"epoch": 0.9621867881548974,
"grad_norm": 1.197435869062917,
"learning_rate": 4.5944872561448084e-08,
"loss": 0.2331,
"step": 1584
},
{
"epoch": 0.9627942293090357,
"grad_norm": 1.096594381356183,
"learning_rate": 4.45224303018027e-08,
"loss": 0.2402,
"step": 1585
},
{
"epoch": 0.9634016704631739,
"grad_norm": 1.0250873227147124,
"learning_rate": 4.3122256425563444e-08,
"loss": 0.266,
"step": 1586
},
{
"epoch": 0.9640091116173121,
"grad_norm": 1.0372168523415322,
"learning_rate": 4.174435722467951e-08,
"loss": 0.2625,
"step": 1587
},
{
"epoch": 0.9646165527714503,
"grad_norm": 0.9433606458038362,
"learning_rate": 4.038873889100237e-08,
"loss": 0.2642,
"step": 1588
},
{
"epoch": 0.9652239939255884,
"grad_norm": 0.9807120841293877,
"learning_rate": 3.905540751626191e-08,
"loss": 0.2472,
"step": 1589
},
{
"epoch": 0.9658314350797267,
"grad_norm": 0.9083975877708611,
"learning_rate": 3.77443690920376e-08,
"loss": 0.2307,
"step": 1590
},
{
"epoch": 0.9664388762338648,
"grad_norm": 3.6391836768271273,
"learning_rate": 3.645562950973014e-08,
"loss": 0.2495,
"step": 1591
},
{
"epoch": 0.9670463173880031,
"grad_norm": 1.0037148121403654,
"learning_rate": 3.518919456053649e-08,
"loss": 0.2738,
"step": 1592
},
{
"epoch": 0.9676537585421412,
"grad_norm": 1.1951622922237168,
"learning_rate": 3.3945069935423234e-08,
"loss": 0.2449,
"step": 1593
},
{
"epoch": 0.9682611996962794,
"grad_norm": 1.5169082502067843,
"learning_rate": 3.2723261225102164e-08,
"loss": 0.2377,
"step": 1594
},
{
"epoch": 0.9688686408504176,
"grad_norm": 3.6717572091002997,
"learning_rate": 3.152377392000361e-08,
"loss": 0.2671,
"step": 1595
},
{
"epoch": 0.9694760820045558,
"grad_norm": 0.8724528062444226,
"learning_rate": 3.034661341025258e-08,
"loss": 0.231,
"step": 1596
},
{
"epoch": 0.970083523158694,
"grad_norm": 0.9374629987363168,
"learning_rate": 2.9191784985644345e-08,
"loss": 0.241,
"step": 1597
},
{
"epoch": 0.9706909643128322,
"grad_norm": 0.9282402240595091,
"learning_rate": 2.8059293835620006e-08,
"loss": 0.2349,
"step": 1598
},
{
"epoch": 0.9712984054669704,
"grad_norm": 1.2644746158155864,
"learning_rate": 2.6949145049245396e-08,
"loss": 0.249,
"step": 1599
},
{
"epoch": 0.9719058466211086,
"grad_norm": 0.9000862092444777,
"learning_rate": 2.5861343615184997e-08,
"loss": 0.245,
"step": 1600
},
{
"epoch": 0.9725132877752468,
"grad_norm": 1.368391294736083,
"learning_rate": 2.479589442168251e-08,
"loss": 0.2333,
"step": 1601
},
{
"epoch": 0.973120728929385,
"grad_norm": 1.29020328624742,
"learning_rate": 2.3752802256536423e-08,
"loss": 0.2683,
"step": 1602
},
{
"epoch": 0.9737281700835232,
"grad_norm": 0.9410886564376265,
"learning_rate": 2.2732071807081147e-08,
"loss": 0.2285,
"step": 1603
},
{
"epoch": 0.9743356112376613,
"grad_norm": 1.4761677232747976,
"learning_rate": 2.173370766016314e-08,
"loss": 0.2315,
"step": 1604
},
{
"epoch": 0.9749430523917996,
"grad_norm": 0.9501609497089889,
"learning_rate": 2.0757714302122035e-08,
"loss": 0.241,
"step": 1605
},
{
"epoch": 0.9755504935459377,
"grad_norm": 1.2314833591182839,
"learning_rate": 1.98040961187701e-08,
"loss": 0.2703,
"step": 1606
},
{
"epoch": 0.976157934700076,
"grad_norm": 1.0158036848314118,
"learning_rate": 1.8872857395372812e-08,
"loss": 0.2597,
"step": 1607
},
{
"epoch": 0.9767653758542141,
"grad_norm": 1.1694648264181446,
"learning_rate": 1.7964002316628316e-08,
"loss": 0.2916,
"step": 1608
},
{
"epoch": 0.9773728170083523,
"grad_norm": 1.0688476934543394,
"learning_rate": 1.7077534966650767e-08,
"loss": 0.2558,
"step": 1609
},
{
"epoch": 0.9779802581624905,
"grad_norm": 0.9549062578238298,
"learning_rate": 1.6213459328950355e-08,
"loss": 0.244,
"step": 1610
},
{
"epoch": 0.9785876993166287,
"grad_norm": 2.412110087204276,
"learning_rate": 1.537177928641498e-08,
"loss": 0.2462,
"step": 1611
},
{
"epoch": 0.979195140470767,
"grad_norm": 0.941429067459168,
"learning_rate": 1.4552498621295264e-08,
"loss": 0.2535,
"step": 1612
},
{
"epoch": 0.9798025816249051,
"grad_norm": 0.9371186125393489,
"learning_rate": 1.3755621015184018e-08,
"loss": 0.2515,
"step": 1613
},
{
"epoch": 0.9804100227790433,
"grad_norm": 1.2909491242156825,
"learning_rate": 1.2981150049004021e-08,
"loss": 0.2327,
"step": 1614
},
{
"epoch": 0.9810174639331815,
"grad_norm": 0.9804844053080275,
"learning_rate": 1.2229089202987487e-08,
"loss": 0.2317,
"step": 1615
},
{
"epoch": 0.9816249050873197,
"grad_norm": 1.0876245667656317,
"learning_rate": 1.1499441856663296e-08,
"loss": 0.2516,
"step": 1616
},
{
"epoch": 0.9822323462414578,
"grad_norm": 0.8612954321226812,
"learning_rate": 1.0792211288841447e-08,
"loss": 0.2599,
"step": 1617
},
{
"epoch": 0.9828397873955961,
"grad_norm": 0.888353089870358,
"learning_rate": 1.0107400677596413e-08,
"loss": 0.2547,
"step": 1618
},
{
"epoch": 0.9834472285497342,
"grad_norm": 0.9827706274173068,
"learning_rate": 9.44501310025603e-09,
"loss": 0.2519,
"step": 1619
},
{
"epoch": 0.9840546697038725,
"grad_norm": 0.9827464321021941,
"learning_rate": 8.805051533384846e-09,
"loss": 0.2315,
"step": 1620
},
{
"epoch": 0.9846621108580106,
"grad_norm": 0.9339543239161487,
"learning_rate": 8.187518852771914e-09,
"loss": 0.2636,
"step": 1621
},
{
"epoch": 0.9852695520121488,
"grad_norm": 0.8991352561684951,
"learning_rate": 7.59241783341913e-09,
"loss": 0.2535,
"step": 1622
},
{
"epoch": 0.985876993166287,
"grad_norm": 0.8964567756827048,
"learning_rate": 7.019751149525133e-09,
"loss": 0.2098,
"step": 1623
},
{
"epoch": 0.9864844343204252,
"grad_norm": 1.0511236807122393,
"learning_rate": 6.469521374477539e-09,
"loss": 0.2739,
"step": 1624
},
{
"epoch": 0.9870918754745635,
"grad_norm": 0.8742334390126567,
"learning_rate": 5.941730980839056e-09,
"loss": 0.2423,
"step": 1625
},
{
"epoch": 0.9876993166287016,
"grad_norm": 0.8982580571091601,
"learning_rate": 5.436382340335833e-09,
"loss": 0.2275,
"step": 1626
},
{
"epoch": 0.9883067577828398,
"grad_norm": 0.9802151698563373,
"learning_rate": 4.9534777238485764e-09,
"loss": 0.2458,
"step": 1627
},
{
"epoch": 0.988914198936978,
"grad_norm": 1.021763994953802,
"learning_rate": 4.493019301401447e-09,
"loss": 0.2413,
"step": 1628
},
{
"epoch": 0.9895216400911162,
"grad_norm": 1.4457966401308637,
"learning_rate": 4.055009142152066e-09,
"loss": 0.2717,
"step": 1629
},
{
"epoch": 0.9901290812452543,
"grad_norm": 0.9291088704888705,
"learning_rate": 3.6394492143820847e-09,
"loss": 0.2631,
"step": 1630
},
{
"epoch": 0.9907365223993926,
"grad_norm": 0.9333078223932707,
"learning_rate": 3.2463413854899594e-09,
"loss": 0.2356,
"step": 1631
},
{
"epoch": 0.9913439635535307,
"grad_norm": 0.8258221417650397,
"learning_rate": 2.875687421980966e-09,
"loss": 0.2435,
"step": 1632
},
{
"epoch": 0.991951404707669,
"grad_norm": 1.3664318253744514,
"learning_rate": 2.5274889894583156e-09,
"loss": 0.2455,
"step": 1633
},
{
"epoch": 0.9925588458618071,
"grad_norm": 1.152187566076446,
"learning_rate": 2.201747652618713e-09,
"loss": 0.2713,
"step": 1634
},
{
"epoch": 0.9931662870159453,
"grad_norm": 1.085926992818226,
"learning_rate": 1.8984648752429222e-09,
"loss": 0.2716,
"step": 1635
},
{
"epoch": 0.9937737281700835,
"grad_norm": 1.1962597289024564,
"learning_rate": 1.6176420201902132e-09,
"loss": 0.2427,
"step": 1636
},
{
"epoch": 0.9943811693242217,
"grad_norm": 0.9297404890182976,
"learning_rate": 1.3592803493905904e-09,
"loss": 0.2492,
"step": 1637
},
{
"epoch": 0.99498861047836,
"grad_norm": 1.4127019374082828,
"learning_rate": 1.1233810238425735e-09,
"loss": 0.2329,
"step": 1638
},
{
"epoch": 0.9955960516324981,
"grad_norm": 0.9889547616457451,
"learning_rate": 9.099451036048701e-10,
"loss": 0.2464,
"step": 1639
},
{
"epoch": 0.9962034927866363,
"grad_norm": 1.479970043099125,
"learning_rate": 7.189735477913795e-10,
"loss": 0.243,
"step": 1640
},
{
"epoch": 0.9968109339407745,
"grad_norm": 0.9559279964912788,
"learning_rate": 5.504672145700829e-10,
"loss": 0.294,
"step": 1641
},
{
"epoch": 0.9974183750949127,
"grad_norm": 1.013358124558348,
"learning_rate": 4.0442686115582665e-10,
"loss": 0.2607,
"step": 1642
},
{
"epoch": 0.9980258162490508,
"grad_norm": 1.7596074983835073,
"learning_rate": 2.8085314380976725e-10,
"loss": 0.2626,
"step": 1643
},
{
"epoch": 0.9986332574031891,
"grad_norm": 1.0764621343908087,
"learning_rate": 1.797466178327101e-10,
"loss": 0.2622,
"step": 1644
},
{
"epoch": 0.9992406985573272,
"grad_norm": 1.1105069084316046,
"learning_rate": 1.011077375662195e-10,
"loss": 0.2386,
"step": 1645
},
{
"epoch": 0.9998481397114655,
"grad_norm": 1.0590187008021932,
"learning_rate": 4.4936856390398465e-11,
"loss": 0.2989,
"step": 1646
},
{
"epoch": 1.0,
"grad_norm": 1.0590187008021932,
"learning_rate": 1.1234226718337405e-11,
"loss": 0.0578,
"step": 1647
},
{
"epoch": 1.0,
"step": 1647,
"total_flos": 669099333058560.0,
"train_loss": 0.32129256987551813,
"train_runtime": 70988.8215,
"train_samples_per_second": 0.742,
"train_steps_per_second": 0.023
}
],
"logging_steps": 1,
"max_steps": 1647,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 669099333058560.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}