Files
PsychAgent-Qwen3-32B/trainer_state.json
ModelHub XC 90763e8666 初始化项目,由ModelHub XC社区提供模型
Model: ecnu-icalk/PsychAgent-Qwen3-32B
Source: Original Platform
2026-06-04 10:54:17 +08:00

5217 lines
126 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1479,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004058853373921867,
"grad_norm": 1.6877160845613697,
"learning_rate": 6.756756756756757e-08,
"loss": 0.3778,
"step": 2
},
{
"epoch": 0.008117706747843734,
"grad_norm": 1.7651022477491063,
"learning_rate": 2.0270270270270273e-07,
"loss": 0.3539,
"step": 4
},
{
"epoch": 0.0121765601217656,
"grad_norm": 1.7346867297246225,
"learning_rate": 3.378378378378379e-07,
"loss": 0.3621,
"step": 6
},
{
"epoch": 0.016235413495687467,
"grad_norm": 1.572133422531505,
"learning_rate": 4.7297297297297305e-07,
"loss": 0.3345,
"step": 8
},
{
"epoch": 0.020294266869609334,
"grad_norm": 1.619733499528023,
"learning_rate": 6.081081081081082e-07,
"loss": 0.3408,
"step": 10
},
{
"epoch": 0.0243531202435312,
"grad_norm": 1.658160676313538,
"learning_rate": 7.432432432432434e-07,
"loss": 0.351,
"step": 12
},
{
"epoch": 0.028411973617453068,
"grad_norm": 1.5596628408438766,
"learning_rate": 8.783783783783785e-07,
"loss": 0.3319,
"step": 14
},
{
"epoch": 0.032470826991374935,
"grad_norm": 1.582923343783565,
"learning_rate": 1.0135135135135136e-06,
"loss": 0.3631,
"step": 16
},
{
"epoch": 0.0365296803652968,
"grad_norm": 1.6210099160307392,
"learning_rate": 1.148648648648649e-06,
"loss": 0.3369,
"step": 18
},
{
"epoch": 0.04058853373921867,
"grad_norm": 1.4490925925277196,
"learning_rate": 1.2837837837837838e-06,
"loss": 0.3401,
"step": 20
},
{
"epoch": 0.044647387113140535,
"grad_norm": 1.5981937222734808,
"learning_rate": 1.418918918918919e-06,
"loss": 0.4064,
"step": 22
},
{
"epoch": 0.0487062404870624,
"grad_norm": 1.4745472664304582,
"learning_rate": 1.5540540540540541e-06,
"loss": 0.3511,
"step": 24
},
{
"epoch": 0.05276509386098427,
"grad_norm": 1.5094086215074392,
"learning_rate": 1.6891891891891894e-06,
"loss": 0.3303,
"step": 26
},
{
"epoch": 0.056823947234906136,
"grad_norm": 1.585436570063055,
"learning_rate": 1.8243243243243245e-06,
"loss": 0.3458,
"step": 28
},
{
"epoch": 0.060882800608828,
"grad_norm": 1.738359897683998,
"learning_rate": 1.9594594594594595e-06,
"loss": 0.3401,
"step": 30
},
{
"epoch": 0.06494165398274987,
"grad_norm": 1.5432250105335408,
"learning_rate": 2.0945945945945946e-06,
"loss": 0.341,
"step": 32
},
{
"epoch": 0.06900050735667174,
"grad_norm": 1.4879837682144732,
"learning_rate": 2.22972972972973e-06,
"loss": 0.333,
"step": 34
},
{
"epoch": 0.0730593607305936,
"grad_norm": 1.5249793712374056,
"learning_rate": 2.364864864864865e-06,
"loss": 0.3389,
"step": 36
},
{
"epoch": 0.07711821410451547,
"grad_norm": 1.5591368574163726,
"learning_rate": 2.5e-06,
"loss": 0.3422,
"step": 38
},
{
"epoch": 0.08117706747843734,
"grad_norm": 1.6074325994660499,
"learning_rate": 2.6351351351351353e-06,
"loss": 0.348,
"step": 40
},
{
"epoch": 0.0852359208523592,
"grad_norm": 1.517968202412236,
"learning_rate": 2.7702702702702703e-06,
"loss": 0.3376,
"step": 42
},
{
"epoch": 0.08929477422628107,
"grad_norm": 1.5371668709250539,
"learning_rate": 2.9054054054054054e-06,
"loss": 0.3556,
"step": 44
},
{
"epoch": 0.09335362760020294,
"grad_norm": 1.4812012090460671,
"learning_rate": 3.040540540540541e-06,
"loss": 0.3524,
"step": 46
},
{
"epoch": 0.0974124809741248,
"grad_norm": 1.5363097735419804,
"learning_rate": 3.1756756756756755e-06,
"loss": 0.3279,
"step": 48
},
{
"epoch": 0.10147133434804667,
"grad_norm": 1.4293275584526721,
"learning_rate": 3.310810810810811e-06,
"loss": 0.3063,
"step": 50
},
{
"epoch": 0.10553018772196854,
"grad_norm": 1.5566072184509325,
"learning_rate": 3.445945945945946e-06,
"loss": 0.3453,
"step": 52
},
{
"epoch": 0.1095890410958904,
"grad_norm": 1.411331335157791,
"learning_rate": 3.5810810810810816e-06,
"loss": 0.3151,
"step": 54
},
{
"epoch": 0.11364789446981227,
"grad_norm": 1.559975757164133,
"learning_rate": 3.7162162162162162e-06,
"loss": 0.3267,
"step": 56
},
{
"epoch": 0.11770674784373414,
"grad_norm": 1.3575124310454543,
"learning_rate": 3.851351351351352e-06,
"loss": 0.3426,
"step": 58
},
{
"epoch": 0.121765601217656,
"grad_norm": 1.566074257576769,
"learning_rate": 3.986486486486487e-06,
"loss": 0.3431,
"step": 60
},
{
"epoch": 0.12582445459157787,
"grad_norm": 1.4663548045652957,
"learning_rate": 4.121621621621622e-06,
"loss": 0.3328,
"step": 62
},
{
"epoch": 0.12988330796549974,
"grad_norm": 1.4786594077137154,
"learning_rate": 4.256756756756757e-06,
"loss": 0.318,
"step": 64
},
{
"epoch": 0.1339421613394216,
"grad_norm": 1.4943486857884478,
"learning_rate": 4.391891891891892e-06,
"loss": 0.3457,
"step": 66
},
{
"epoch": 0.13800101471334347,
"grad_norm": 1.4843598946184238,
"learning_rate": 4.527027027027027e-06,
"loss": 0.329,
"step": 68
},
{
"epoch": 0.14205986808726534,
"grad_norm": 1.4234589903170214,
"learning_rate": 4.6621621621621625e-06,
"loss": 0.3346,
"step": 70
},
{
"epoch": 0.1461187214611872,
"grad_norm": 1.363410197784669,
"learning_rate": 4.797297297297297e-06,
"loss": 0.33,
"step": 72
},
{
"epoch": 0.15017757483510907,
"grad_norm": 1.3795399226185014,
"learning_rate": 4.932432432432433e-06,
"loss": 0.3111,
"step": 74
},
{
"epoch": 0.15423642820903094,
"grad_norm": 1.4630782525423722,
"learning_rate": 5.067567567567568e-06,
"loss": 0.3164,
"step": 76
},
{
"epoch": 0.1582952815829528,
"grad_norm": 1.535896727076301,
"learning_rate": 5.202702702702704e-06,
"loss": 0.3462,
"step": 78
},
{
"epoch": 0.16235413495687467,
"grad_norm": 1.3384695460866296,
"learning_rate": 5.337837837837838e-06,
"loss": 0.328,
"step": 80
},
{
"epoch": 0.16641298833079654,
"grad_norm": 1.564060215758407,
"learning_rate": 5.472972972972973e-06,
"loss": 0.3356,
"step": 82
},
{
"epoch": 0.1704718417047184,
"grad_norm": 1.5875357299856645,
"learning_rate": 5.608108108108109e-06,
"loss": 0.3317,
"step": 84
},
{
"epoch": 0.17453069507864027,
"grad_norm": 1.3150986310423165,
"learning_rate": 5.743243243243244e-06,
"loss": 0.3197,
"step": 86
},
{
"epoch": 0.17858954845256214,
"grad_norm": 1.4780350536596663,
"learning_rate": 5.8783783783783786e-06,
"loss": 0.3328,
"step": 88
},
{
"epoch": 0.182648401826484,
"grad_norm": 1.3565098348860962,
"learning_rate": 6.013513513513514e-06,
"loss": 0.3081,
"step": 90
},
{
"epoch": 0.18670725520040587,
"grad_norm": 1.5123567866175038,
"learning_rate": 6.1486486486486495e-06,
"loss": 0.3276,
"step": 92
},
{
"epoch": 0.19076610857432774,
"grad_norm": 1.4884022484673987,
"learning_rate": 6.283783783783784e-06,
"loss": 0.3523,
"step": 94
},
{
"epoch": 0.1948249619482496,
"grad_norm": 1.601046629607006,
"learning_rate": 6.41891891891892e-06,
"loss": 0.3612,
"step": 96
},
{
"epoch": 0.19888381532217148,
"grad_norm": 1.479212704609007,
"learning_rate": 6.554054054054054e-06,
"loss": 0.3106,
"step": 98
},
{
"epoch": 0.20294266869609334,
"grad_norm": 1.4220008074976263,
"learning_rate": 6.689189189189191e-06,
"loss": 0.3441,
"step": 100
},
{
"epoch": 0.2070015220700152,
"grad_norm": 1.3514096746438489,
"learning_rate": 6.824324324324325e-06,
"loss": 0.334,
"step": 102
},
{
"epoch": 0.21106037544393708,
"grad_norm": 1.4495742654024877,
"learning_rate": 6.95945945945946e-06,
"loss": 0.3365,
"step": 104
},
{
"epoch": 0.21511922881785894,
"grad_norm": 1.4742675513138555,
"learning_rate": 7.0945945945945946e-06,
"loss": 0.3306,
"step": 106
},
{
"epoch": 0.2191780821917808,
"grad_norm": 1.5180836151687533,
"learning_rate": 7.229729729729731e-06,
"loss": 0.3435,
"step": 108
},
{
"epoch": 0.22323693556570268,
"grad_norm": 1.54712083151063,
"learning_rate": 7.3648648648648655e-06,
"loss": 0.3698,
"step": 110
},
{
"epoch": 0.22729578893962454,
"grad_norm": 1.5589085128625726,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3209,
"step": 112
},
{
"epoch": 0.2313546423135464,
"grad_norm": 1.5013917554759992,
"learning_rate": 7.635135135135135e-06,
"loss": 0.3488,
"step": 114
},
{
"epoch": 0.23541349568746828,
"grad_norm": 1.4511795332039656,
"learning_rate": 7.77027027027027e-06,
"loss": 0.3506,
"step": 116
},
{
"epoch": 0.23947234906139014,
"grad_norm": 1.4069378206541412,
"learning_rate": 7.905405405405406e-06,
"loss": 0.3481,
"step": 118
},
{
"epoch": 0.243531202435312,
"grad_norm": 1.582455691561815,
"learning_rate": 8.040540540540541e-06,
"loss": 0.3603,
"step": 120
},
{
"epoch": 0.24759005580923388,
"grad_norm": 1.434757897777904,
"learning_rate": 8.175675675675677e-06,
"loss": 0.3351,
"step": 122
},
{
"epoch": 0.25164890918315574,
"grad_norm": 1.4655011777161737,
"learning_rate": 8.31081081081081e-06,
"loss": 0.3579,
"step": 124
},
{
"epoch": 0.2557077625570776,
"grad_norm": 1.2773884939900029,
"learning_rate": 8.445945945945948e-06,
"loss": 0.3344,
"step": 126
},
{
"epoch": 0.2597666159309995,
"grad_norm": 1.566232859360701,
"learning_rate": 8.581081081081082e-06,
"loss": 0.3671,
"step": 128
},
{
"epoch": 0.26382546930492135,
"grad_norm": 1.600278240352609,
"learning_rate": 8.716216216216217e-06,
"loss": 0.3668,
"step": 130
},
{
"epoch": 0.2678843226788432,
"grad_norm": 1.4285015615854608,
"learning_rate": 8.851351351351351e-06,
"loss": 0.3701,
"step": 132
},
{
"epoch": 0.2719431760527651,
"grad_norm": 1.567165427429299,
"learning_rate": 8.986486486486488e-06,
"loss": 0.3718,
"step": 134
},
{
"epoch": 0.27600202942668695,
"grad_norm": 1.4657335810014254,
"learning_rate": 9.121621621621622e-06,
"loss": 0.3576,
"step": 136
},
{
"epoch": 0.2800608828006088,
"grad_norm": 1.5291620734959124,
"learning_rate": 9.256756756756757e-06,
"loss": 0.3838,
"step": 138
},
{
"epoch": 0.2841197361745307,
"grad_norm": 1.4553153344151037,
"learning_rate": 9.391891891891893e-06,
"loss": 0.3782,
"step": 140
},
{
"epoch": 0.28817858954845255,
"grad_norm": 1.406160463172771,
"learning_rate": 9.527027027027028e-06,
"loss": 0.3666,
"step": 142
},
{
"epoch": 0.2922374429223744,
"grad_norm": 1.5917073052853832,
"learning_rate": 9.662162162162164e-06,
"loss": 0.392,
"step": 144
},
{
"epoch": 0.2962962962962963,
"grad_norm": 1.511564904546133,
"learning_rate": 9.797297297297298e-06,
"loss": 0.3852,
"step": 146
},
{
"epoch": 0.30035514967021815,
"grad_norm": 1.4461988102226726,
"learning_rate": 9.932432432432433e-06,
"loss": 0.3947,
"step": 148
},
{
"epoch": 0.30441400304414,
"grad_norm": 1.5436908100507405,
"learning_rate": 9.999986072170506e-06,
"loss": 0.3778,
"step": 150
},
{
"epoch": 0.3084728564180619,
"grad_norm": 1.413238952587016,
"learning_rate": 9.99987465000011e-06,
"loss": 0.3634,
"step": 152
},
{
"epoch": 0.31253170979198375,
"grad_norm": 1.1989032000359305,
"learning_rate": 9.999651808142305e-06,
"loss": 0.3629,
"step": 154
},
{
"epoch": 0.3165905631659056,
"grad_norm": 1.2868027887610292,
"learning_rate": 9.999317551563011e-06,
"loss": 0.3674,
"step": 156
},
{
"epoch": 0.3206494165398275,
"grad_norm": 1.3871216563915811,
"learning_rate": 9.998871887710965e-06,
"loss": 0.3844,
"step": 158
},
{
"epoch": 0.32470826991374935,
"grad_norm": 1.532088928705796,
"learning_rate": 9.998314826517564e-06,
"loss": 0.3986,
"step": 160
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.5418249912864774,
"learning_rate": 9.997646380396633e-06,
"loss": 0.3934,
"step": 162
},
{
"epoch": 0.3328259766615931,
"grad_norm": 1.4957436906571129,
"learning_rate": 9.996866564244158e-06,
"loss": 0.3958,
"step": 164
},
{
"epoch": 0.33688483003551495,
"grad_norm": 1.4005270008180681,
"learning_rate": 9.995975395437952e-06,
"loss": 0.3697,
"step": 166
},
{
"epoch": 0.3409436834094368,
"grad_norm": 1.3477487401644073,
"learning_rate": 9.994972893837259e-06,
"loss": 0.382,
"step": 168
},
{
"epoch": 0.3450025367833587,
"grad_norm": 1.4090292300223908,
"learning_rate": 9.993859081782322e-06,
"loss": 0.3989,
"step": 170
},
{
"epoch": 0.34906139015728055,
"grad_norm": 1.2787360296779213,
"learning_rate": 9.992633984093886e-06,
"loss": 0.3746,
"step": 172
},
{
"epoch": 0.3531202435312024,
"grad_norm": 1.410889902419896,
"learning_rate": 9.991297628072632e-06,
"loss": 0.3965,
"step": 174
},
{
"epoch": 0.3571790969051243,
"grad_norm": 1.5306391383373583,
"learning_rate": 9.98985004349858e-06,
"loss": 0.418,
"step": 176
},
{
"epoch": 0.36123795027904615,
"grad_norm": 1.412306543323218,
"learning_rate": 9.988291262630425e-06,
"loss": 0.3954,
"step": 178
},
{
"epoch": 0.365296803652968,
"grad_norm": 1.376967456626685,
"learning_rate": 9.986621320204813e-06,
"loss": 0.3944,
"step": 180
},
{
"epoch": 0.3693556570268899,
"grad_norm": 1.325650682628611,
"learning_rate": 9.984840253435569e-06,
"loss": 0.396,
"step": 182
},
{
"epoch": 0.37341451040081175,
"grad_norm": 1.248201726411196,
"learning_rate": 9.982948102012866e-06,
"loss": 0.3783,
"step": 184
},
{
"epoch": 0.3774733637747336,
"grad_norm": 1.3492739121359127,
"learning_rate": 9.98094490810235e-06,
"loss": 0.4078,
"step": 186
},
{
"epoch": 0.3815322171486555,
"grad_norm": 1.4387342464186235,
"learning_rate": 9.978830716344185e-06,
"loss": 0.3892,
"step": 188
},
{
"epoch": 0.38559107052257735,
"grad_norm": 1.2509510227667138,
"learning_rate": 9.976605573852071e-06,
"loss": 0.3696,
"step": 190
},
{
"epoch": 0.3896499238964992,
"grad_norm": 1.3154238912694323,
"learning_rate": 9.974269530212185e-06,
"loss": 0.405,
"step": 192
},
{
"epoch": 0.3937087772704211,
"grad_norm": 1.5250960877423454,
"learning_rate": 9.971822637482085e-06,
"loss": 0.4135,
"step": 194
},
{
"epoch": 0.39776763064434295,
"grad_norm": 1.3908097291583248,
"learning_rate": 9.969264950189539e-06,
"loss": 0.4006,
"step": 196
},
{
"epoch": 0.4018264840182648,
"grad_norm": 1.4790280124932251,
"learning_rate": 9.966596525331324e-06,
"loss": 0.4188,
"step": 198
},
{
"epoch": 0.4058853373921867,
"grad_norm": 1.4233060958120591,
"learning_rate": 9.96381742237194e-06,
"loss": 0.4042,
"step": 200
},
{
"epoch": 0.40994419076610855,
"grad_norm": 1.3044712016817912,
"learning_rate": 9.960927703242298e-06,
"loss": 0.3956,
"step": 202
},
{
"epoch": 0.4140030441400304,
"grad_norm": 1.2296559721817601,
"learning_rate": 9.957927432338332e-06,
"loss": 0.3813,
"step": 204
},
{
"epoch": 0.4180618975139523,
"grad_norm": 1.247811043333453,
"learning_rate": 9.954816676519569e-06,
"loss": 0.3846,
"step": 206
},
{
"epoch": 0.42212075088787415,
"grad_norm": 1.4552540186289,
"learning_rate": 9.951595505107633e-06,
"loss": 0.3826,
"step": 208
},
{
"epoch": 0.426179604261796,
"grad_norm": 1.3877999146640874,
"learning_rate": 9.948263989884708e-06,
"loss": 0.4118,
"step": 210
},
{
"epoch": 0.4302384576357179,
"grad_norm": 1.3634788475367725,
"learning_rate": 9.944822205091929e-06,
"loss": 0.3974,
"step": 212
},
{
"epoch": 0.43429731100963975,
"grad_norm": 1.2479391778044153,
"learning_rate": 9.94127022742774e-06,
"loss": 0.3784,
"step": 214
},
{
"epoch": 0.4383561643835616,
"grad_norm": 1.2120926044150644,
"learning_rate": 9.937608136046171e-06,
"loss": 0.3857,
"step": 216
},
{
"epoch": 0.4424150177574835,
"grad_norm": 1.363599562180868,
"learning_rate": 9.933836012555083e-06,
"loss": 0.4089,
"step": 218
},
{
"epoch": 0.44647387113140535,
"grad_norm": 1.169807478788221,
"learning_rate": 9.929953941014349e-06,
"loss": 0.3649,
"step": 220
},
{
"epoch": 0.4505327245053272,
"grad_norm": 1.3307716150293825,
"learning_rate": 9.925962007933975e-06,
"loss": 0.4093,
"step": 222
},
{
"epoch": 0.4545915778792491,
"grad_norm": 1.2092559857310445,
"learning_rate": 9.921860302272184e-06,
"loss": 0.3959,
"step": 224
},
{
"epoch": 0.45865043125317095,
"grad_norm": 1.40047176469619,
"learning_rate": 9.917648915433413e-06,
"loss": 0.4271,
"step": 226
},
{
"epoch": 0.4627092846270928,
"grad_norm": 1.2607295390446736,
"learning_rate": 9.9133279412663e-06,
"loss": 0.3963,
"step": 228
},
{
"epoch": 0.4667681380010147,
"grad_norm": 1.2887307875981555,
"learning_rate": 9.908897476061576e-06,
"loss": 0.4128,
"step": 230
},
{
"epoch": 0.47082699137493655,
"grad_norm": 1.3538240839163793,
"learning_rate": 9.904357618549925e-06,
"loss": 0.4032,
"step": 232
},
{
"epoch": 0.4748858447488584,
"grad_norm": 1.2106198378461424,
"learning_rate": 9.899708469899786e-06,
"loss": 0.402,
"step": 234
},
{
"epoch": 0.4789446981227803,
"grad_norm": 1.214556237570029,
"learning_rate": 9.894950133715094e-06,
"loss": 0.4079,
"step": 236
},
{
"epoch": 0.48300355149670215,
"grad_norm": 1.3315305311943295,
"learning_rate": 9.89008271603297e-06,
"loss": 0.3908,
"step": 238
},
{
"epoch": 0.487062404870624,
"grad_norm": 1.2949635168468638,
"learning_rate": 9.885106325321371e-06,
"loss": 0.418,
"step": 240
},
{
"epoch": 0.4911212582445459,
"grad_norm": 1.3622823844909377,
"learning_rate": 9.880021072476651e-06,
"loss": 0.4136,
"step": 242
},
{
"epoch": 0.49518011161846776,
"grad_norm": 1.298645600673932,
"learning_rate": 9.874827070821112e-06,
"loss": 0.4037,
"step": 244
},
{
"epoch": 0.4992389649923896,
"grad_norm": 1.1869885168664382,
"learning_rate": 9.869524436100458e-06,
"loss": 0.3723,
"step": 246
},
{
"epoch": 0.5032978183663115,
"grad_norm": 1.1960510107751574,
"learning_rate": 9.864113286481237e-06,
"loss": 0.3665,
"step": 248
},
{
"epoch": 0.5073566717402334,
"grad_norm": 1.4192185941613773,
"learning_rate": 9.85859374254819e-06,
"loss": 0.4243,
"step": 250
},
{
"epoch": 0.5114155251141552,
"grad_norm": 1.2602840725571196,
"learning_rate": 9.852965927301573e-06,
"loss": 0.3945,
"step": 252
},
{
"epoch": 0.5154743784880771,
"grad_norm": 1.3076466906647164,
"learning_rate": 9.847229966154415e-06,
"loss": 0.4303,
"step": 254
},
{
"epoch": 0.519533231861999,
"grad_norm": 1.2994672624028094,
"learning_rate": 9.841385986929716e-06,
"loss": 0.4223,
"step": 256
},
{
"epoch": 0.5235920852359208,
"grad_norm": 1.2718506455560323,
"learning_rate": 9.835434119857612e-06,
"loss": 0.4124,
"step": 258
},
{
"epoch": 0.5276509386098427,
"grad_norm": 1.233515929409816,
"learning_rate": 9.829374497572461e-06,
"loss": 0.4156,
"step": 260
},
{
"epoch": 0.5317097919837646,
"grad_norm": 1.192750940555937,
"learning_rate": 9.823207255109891e-06,
"loss": 0.3865,
"step": 262
},
{
"epoch": 0.5357686453576864,
"grad_norm": 1.2146025999616998,
"learning_rate": 9.816932529903795e-06,
"loss": 0.381,
"step": 264
},
{
"epoch": 0.5398274987316083,
"grad_norm": 1.254427394175359,
"learning_rate": 9.810550461783261e-06,
"loss": 0.4209,
"step": 266
},
{
"epoch": 0.5438863521055302,
"grad_norm": 1.2007483465859314,
"learning_rate": 9.804061192969465e-06,
"loss": 0.3935,
"step": 268
},
{
"epoch": 0.547945205479452,
"grad_norm": 1.1962317702272547,
"learning_rate": 9.797464868072489e-06,
"loss": 0.4055,
"step": 270
},
{
"epoch": 0.5520040588533739,
"grad_norm": 1.2388775998656307,
"learning_rate": 9.790761634088108e-06,
"loss": 0.4016,
"step": 272
},
{
"epoch": 0.5560629122272958,
"grad_norm": 1.1565724419815788,
"learning_rate": 9.78395164039452e-06,
"loss": 0.4066,
"step": 274
},
{
"epoch": 0.5601217656012176,
"grad_norm": 1.287795256739133,
"learning_rate": 9.777035038749002e-06,
"loss": 0.4072,
"step": 276
},
{
"epoch": 0.5641806189751395,
"grad_norm": 1.1990919314621633,
"learning_rate": 9.77001198328453e-06,
"loss": 0.385,
"step": 278
},
{
"epoch": 0.5682394723490614,
"grad_norm": 1.1155810438092542,
"learning_rate": 9.762882630506366e-06,
"loss": 0.4138,
"step": 280
},
{
"epoch": 0.5722983257229832,
"grad_norm": 1.270188734252511,
"learning_rate": 9.75564713928854e-06,
"loss": 0.4108,
"step": 282
},
{
"epoch": 0.5763571790969051,
"grad_norm": 1.2854062183745893,
"learning_rate": 9.748305670870326e-06,
"loss": 0.4105,
"step": 284
},
{
"epoch": 0.580416032470827,
"grad_norm": 1.2822199195202089,
"learning_rate": 9.740858388852652e-06,
"loss": 0.4187,
"step": 286
},
{
"epoch": 0.5844748858447488,
"grad_norm": 1.1789452806981648,
"learning_rate": 9.733305459194444e-06,
"loss": 0.4026,
"step": 288
},
{
"epoch": 0.5885337392186707,
"grad_norm": 1.2063791823863752,
"learning_rate": 9.725647050208936e-06,
"loss": 0.4194,
"step": 290
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.1212621894773256,
"learning_rate": 9.717883332559911e-06,
"loss": 0.4043,
"step": 292
},
{
"epoch": 0.5966514459665144,
"grad_norm": 1.236354295472038,
"learning_rate": 9.710014479257906e-06,
"loss": 0.4279,
"step": 294
},
{
"epoch": 0.6007102993404363,
"grad_norm": 1.230960872966148,
"learning_rate": 9.702040665656353e-06,
"loss": 0.417,
"step": 296
},
{
"epoch": 0.6047691527143582,
"grad_norm": 1.302936449552778,
"learning_rate": 9.693962069447669e-06,
"loss": 0.4399,
"step": 298
},
{
"epoch": 0.60882800608828,
"grad_norm": 1.1296630845707911,
"learning_rate": 9.685778870659301e-06,
"loss": 0.4024,
"step": 300
},
{
"epoch": 0.6128868594622019,
"grad_norm": 1.1746225535864936,
"learning_rate": 9.677491251649711e-06,
"loss": 0.3912,
"step": 302
},
{
"epoch": 0.6169457128361238,
"grad_norm": 1.241320530846212,
"learning_rate": 9.669099397104314e-06,
"loss": 0.4174,
"step": 304
},
{
"epoch": 0.6210045662100456,
"grad_norm": 1.2219507615770004,
"learning_rate": 9.660603494031358e-06,
"loss": 0.3918,
"step": 306
},
{
"epoch": 0.6250634195839675,
"grad_norm": 1.1589702070871013,
"learning_rate": 9.652003731757763e-06,
"loss": 0.4157,
"step": 308
},
{
"epoch": 0.6291222729578894,
"grad_norm": 1.1220293339629992,
"learning_rate": 9.643300301924902e-06,
"loss": 0.4015,
"step": 310
},
{
"epoch": 0.6331811263318112,
"grad_norm": 1.2563582002979947,
"learning_rate": 9.634493398484319e-06,
"loss": 0.4128,
"step": 312
},
{
"epoch": 0.6372399797057331,
"grad_norm": 1.1888367524986483,
"learning_rate": 9.625583217693419e-06,
"loss": 0.3874,
"step": 314
},
{
"epoch": 0.641298833079655,
"grad_norm": 1.1925360068598152,
"learning_rate": 9.616569958111097e-06,
"loss": 0.4219,
"step": 316
},
{
"epoch": 0.6453576864535768,
"grad_norm": 1.2776062593085378,
"learning_rate": 9.607453820593297e-06,
"loss": 0.4138,
"step": 318
},
{
"epoch": 0.6494165398274987,
"grad_norm": 1.157480079096016,
"learning_rate": 9.598235008288551e-06,
"loss": 0.4075,
"step": 320
},
{
"epoch": 0.6534753932014206,
"grad_norm": 1.2352282756489477,
"learning_rate": 9.58891372663345e-06,
"loss": 0.4111,
"step": 322
},
{
"epoch": 0.6575342465753424,
"grad_norm": 1.2837461432435215,
"learning_rate": 9.579490183348052e-06,
"loss": 0.4358,
"step": 324
},
{
"epoch": 0.6615930999492643,
"grad_norm": 1.172789813592292,
"learning_rate": 9.56996458843128e-06,
"loss": 0.3986,
"step": 326
},
{
"epoch": 0.6656519533231862,
"grad_norm": 1.194020795966964,
"learning_rate": 9.56033715415621e-06,
"loss": 0.4075,
"step": 328
},
{
"epoch": 0.669710806697108,
"grad_norm": 1.0964374769088712,
"learning_rate": 9.550608095065367e-06,
"loss": 0.4071,
"step": 330
},
{
"epoch": 0.6737696600710299,
"grad_norm": 1.089373021702181,
"learning_rate": 9.540777627965933e-06,
"loss": 0.3957,
"step": 332
},
{
"epoch": 0.6778285134449518,
"grad_norm": 1.1992667011972529,
"learning_rate": 9.53084597192491e-06,
"loss": 0.4158,
"step": 334
},
{
"epoch": 0.6818873668188736,
"grad_norm": 1.2172349749770106,
"learning_rate": 9.520813348264252e-06,
"loss": 0.4277,
"step": 336
},
{
"epoch": 0.6859462201927955,
"grad_norm": 1.2574902319962946,
"learning_rate": 9.510679980555922e-06,
"loss": 0.3995,
"step": 338
},
{
"epoch": 0.6900050735667174,
"grad_norm": 1.131615777672815,
"learning_rate": 9.500446094616911e-06,
"loss": 0.4005,
"step": 340
},
{
"epoch": 0.6940639269406392,
"grad_norm": 1.246895890559624,
"learning_rate": 9.490111918504213e-06,
"loss": 0.4169,
"step": 342
},
{
"epoch": 0.6981227803145611,
"grad_norm": 1.181624286199365,
"learning_rate": 9.479677682509737e-06,
"loss": 0.3986,
"step": 344
},
{
"epoch": 0.702181633688483,
"grad_norm": 1.132690259540531,
"learning_rate": 9.469143619155172e-06,
"loss": 0.3923,
"step": 346
},
{
"epoch": 0.7062404870624048,
"grad_norm": 1.047890655047983,
"learning_rate": 9.458509963186815e-06,
"loss": 0.4043,
"step": 348
},
{
"epoch": 0.7102993404363267,
"grad_norm": 1.147246283887197,
"learning_rate": 9.44777695157033e-06,
"loss": 0.4066,
"step": 350
},
{
"epoch": 0.7143581938102486,
"grad_norm": 1.215824000969317,
"learning_rate": 9.436944823485475e-06,
"loss": 0.4146,
"step": 352
},
{
"epoch": 0.7184170471841704,
"grad_norm": 1.1437849010452408,
"learning_rate": 9.426013820320764e-06,
"loss": 0.4206,
"step": 354
},
{
"epoch": 0.7224759005580923,
"grad_norm": 1.1210034262809383,
"learning_rate": 9.414984185668097e-06,
"loss": 0.3991,
"step": 356
},
{
"epoch": 0.7265347539320142,
"grad_norm": 1.3628388125490938,
"learning_rate": 9.403856165317322e-06,
"loss": 0.4359,
"step": 358
},
{
"epoch": 0.730593607305936,
"grad_norm": 1.236630161906545,
"learning_rate": 9.392630007250769e-06,
"loss": 0.4415,
"step": 360
},
{
"epoch": 0.7346524606798579,
"grad_norm": 1.1213292521942286,
"learning_rate": 9.381305961637713e-06,
"loss": 0.4219,
"step": 362
},
{
"epoch": 0.7387113140537798,
"grad_norm": 1.209577588106072,
"learning_rate": 9.369884280828806e-06,
"loss": 0.4308,
"step": 364
},
{
"epoch": 0.7427701674277016,
"grad_norm": 1.3143877988319919,
"learning_rate": 9.358365219350448e-06,
"loss": 0.4376,
"step": 366
},
{
"epoch": 0.7468290208016235,
"grad_norm": 1.1794072785475278,
"learning_rate": 9.346749033899121e-06,
"loss": 0.4331,
"step": 368
},
{
"epoch": 0.7508878741755454,
"grad_norm": 1.1813808165518036,
"learning_rate": 9.335035983335667e-06,
"loss": 0.3992,
"step": 370
},
{
"epoch": 0.7549467275494672,
"grad_norm": 1.1568780935799914,
"learning_rate": 9.323226328679512e-06,
"loss": 0.4044,
"step": 372
},
{
"epoch": 0.7590055809233891,
"grad_norm": 1.0660018515522698,
"learning_rate": 9.311320333102864e-06,
"loss": 0.3954,
"step": 374
},
{
"epoch": 0.763064434297311,
"grad_norm": 1.108920689047685,
"learning_rate": 9.299318261924834e-06,
"loss": 0.3998,
"step": 376
},
{
"epoch": 0.7671232876712328,
"grad_norm": 1.136789158664533,
"learning_rate": 9.287220382605532e-06,
"loss": 0.4042,
"step": 378
},
{
"epoch": 0.7711821410451547,
"grad_norm": 1.2496770566654822,
"learning_rate": 9.275026964740101e-06,
"loss": 0.4067,
"step": 380
},
{
"epoch": 0.7752409944190766,
"grad_norm": 1.1996061958131852,
"learning_rate": 9.262738280052715e-06,
"loss": 0.4183,
"step": 382
},
{
"epoch": 0.7792998477929984,
"grad_norm": 1.2149866273575285,
"learning_rate": 9.250354602390523e-06,
"loss": 0.4409,
"step": 384
},
{
"epoch": 0.7833587011669203,
"grad_norm": 1.1177682668450932,
"learning_rate": 9.237876207717538e-06,
"loss": 0.4029,
"step": 386
},
{
"epoch": 0.7874175545408422,
"grad_norm": 1.1677530585024758,
"learning_rate": 9.225303374108503e-06,
"loss": 0.4178,
"step": 388
},
{
"epoch": 0.791476407914764,
"grad_norm": 1.3678602117567324,
"learning_rate": 9.212636381742676e-06,
"loss": 0.4197,
"step": 390
},
{
"epoch": 0.7955352612886859,
"grad_norm": 1.189997795752436,
"learning_rate": 9.199875512897602e-06,
"loss": 0.4173,
"step": 392
},
{
"epoch": 0.7995941146626078,
"grad_norm": 1.186213329132832,
"learning_rate": 9.187021051942814e-06,
"loss": 0.4145,
"step": 394
},
{
"epoch": 0.8036529680365296,
"grad_norm": 1.203493805658719,
"learning_rate": 9.174073285333498e-06,
"loss": 0.4181,
"step": 396
},
{
"epoch": 0.8077118214104515,
"grad_norm": 1.175802247814532,
"learning_rate": 9.161032501604106e-06,
"loss": 0.3949,
"step": 398
},
{
"epoch": 0.8117706747843734,
"grad_norm": 1.212190555266731,
"learning_rate": 9.147898991361936e-06,
"loss": 0.4076,
"step": 400
},
{
"epoch": 0.8158295281582952,
"grad_norm": 1.191973289112244,
"learning_rate": 9.134673047280644e-06,
"loss": 0.4233,
"step": 402
},
{
"epoch": 0.8198883815322171,
"grad_norm": 1.2279490044480763,
"learning_rate": 9.121354964093732e-06,
"loss": 0.4127,
"step": 404
},
{
"epoch": 0.823947234906139,
"grad_norm": 1.151451234197627,
"learning_rate": 9.107945038587974e-06,
"loss": 0.4226,
"step": 406
},
{
"epoch": 0.8280060882800608,
"grad_norm": 1.119728494545527,
"learning_rate": 9.094443569596802e-06,
"loss": 0.4033,
"step": 408
},
{
"epoch": 0.8320649416539827,
"grad_norm": 1.13012343405543,
"learning_rate": 9.08085085799365e-06,
"loss": 0.4088,
"step": 410
},
{
"epoch": 0.8361237950279046,
"grad_norm": 1.159098363094475,
"learning_rate": 9.067167206685248e-06,
"loss": 0.4124,
"step": 412
},
{
"epoch": 0.8401826484018264,
"grad_norm": 1.2347110694455659,
"learning_rate": 9.05339292060487e-06,
"loss": 0.434,
"step": 414
},
{
"epoch": 0.8442415017757483,
"grad_norm": 1.2402415983547357,
"learning_rate": 9.039528306705543e-06,
"loss": 0.425,
"step": 416
},
{
"epoch": 0.8483003551496702,
"grad_norm": 1.1683847145500172,
"learning_rate": 9.025573673953201e-06,
"loss": 0.4423,
"step": 418
},
{
"epoch": 0.852359208523592,
"grad_norm": 0.9967973069250277,
"learning_rate": 9.011529333319804e-06,
"loss": 0.3987,
"step": 420
},
{
"epoch": 0.8564180618975139,
"grad_norm": 1.0663571132874041,
"learning_rate": 8.997395597776404e-06,
"loss": 0.3908,
"step": 422
},
{
"epoch": 0.8604769152714358,
"grad_norm": 1.1585376506062766,
"learning_rate": 8.98317278228618e-06,
"loss": 0.4055,
"step": 424
},
{
"epoch": 0.8645357686453576,
"grad_norm": 1.3272038292907982,
"learning_rate": 8.96886120379741e-06,
"loss": 0.4241,
"step": 426
},
{
"epoch": 0.8685946220192795,
"grad_norm": 1.1134457404863736,
"learning_rate": 8.954461181236406e-06,
"loss": 0.4343,
"step": 428
},
{
"epoch": 0.8726534753932014,
"grad_norm": 1.152191927893708,
"learning_rate": 8.939973035500418e-06,
"loss": 0.4012,
"step": 430
},
{
"epoch": 0.8767123287671232,
"grad_norm": 1.157272959329367,
"learning_rate": 8.925397089450473e-06,
"loss": 0.4116,
"step": 432
},
{
"epoch": 0.8807711821410451,
"grad_norm": 1.1617646168179858,
"learning_rate": 8.910733667904186e-06,
"loss": 0.4128,
"step": 434
},
{
"epoch": 0.884830035514967,
"grad_norm": 1.2116957635700267,
"learning_rate": 8.895983097628515e-06,
"loss": 0.4332,
"step": 436
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.1950006191376203,
"learning_rate": 8.88114570733249e-06,
"loss": 0.4005,
"step": 438
},
{
"epoch": 0.8929477422628107,
"grad_norm": 1.1470604791719932,
"learning_rate": 8.866221827659876e-06,
"loss": 0.4233,
"step": 440
},
{
"epoch": 0.8970065956367326,
"grad_norm": 1.2230557240685258,
"learning_rate": 8.851211791181813e-06,
"loss": 0.4133,
"step": 442
},
{
"epoch": 0.9010654490106544,
"grad_norm": 1.2680481957203948,
"learning_rate": 8.8361159323894e-06,
"loss": 0.447,
"step": 444
},
{
"epoch": 0.9051243023845763,
"grad_norm": 1.095625464598396,
"learning_rate": 8.820934587686247e-06,
"loss": 0.3884,
"step": 446
},
{
"epoch": 0.9091831557584982,
"grad_norm": 1.1796975421785947,
"learning_rate": 8.805668095380969e-06,
"loss": 0.4139,
"step": 448
},
{
"epoch": 0.91324200913242,
"grad_norm": 1.2337117620848044,
"learning_rate": 8.790316795679654e-06,
"loss": 0.4258,
"step": 450
},
{
"epoch": 0.9173008625063419,
"grad_norm": 1.060326310057752,
"learning_rate": 8.774881030678284e-06,
"loss": 0.4039,
"step": 452
},
{
"epoch": 0.9213597158802638,
"grad_norm": 1.0818729217545202,
"learning_rate": 8.759361144355103e-06,
"loss": 0.4186,
"step": 454
},
{
"epoch": 0.9254185692541856,
"grad_norm": 1.2188647624805096,
"learning_rate": 8.74375748256296e-06,
"loss": 0.43,
"step": 456
},
{
"epoch": 0.9294774226281075,
"grad_norm": 1.1517012313266344,
"learning_rate": 8.728070393021595e-06,
"loss": 0.3952,
"step": 458
},
{
"epoch": 0.9335362760020294,
"grad_norm": 1.146308857460623,
"learning_rate": 8.712300225309894e-06,
"loss": 0.419,
"step": 460
},
{
"epoch": 0.9375951293759512,
"grad_norm": 1.1733285567505642,
"learning_rate": 8.6964473308581e-06,
"loss": 0.4295,
"step": 462
},
{
"epoch": 0.9416539827498731,
"grad_norm": 1.2133986892308575,
"learning_rate": 8.680512062939976e-06,
"loss": 0.3994,
"step": 464
},
{
"epoch": 0.945712836123795,
"grad_norm": 1.2860321510839698,
"learning_rate": 8.664494776664942e-06,
"loss": 0.4305,
"step": 466
},
{
"epoch": 0.9497716894977168,
"grad_norm": 1.222015638603744,
"learning_rate": 8.64839582897015e-06,
"loss": 0.4247,
"step": 468
},
{
"epoch": 0.9538305428716387,
"grad_norm": 1.1303294072263912,
"learning_rate": 8.63221557861254e-06,
"loss": 0.414,
"step": 470
},
{
"epoch": 0.9578893962455606,
"grad_norm": 1.1883694044651687,
"learning_rate": 8.615954386160836e-06,
"loss": 0.3944,
"step": 472
},
{
"epoch": 0.9619482496194824,
"grad_norm": 1.0686371570768038,
"learning_rate": 8.599612613987522e-06,
"loss": 0.4138,
"step": 474
},
{
"epoch": 0.9660071029934043,
"grad_norm": 1.1520582178885161,
"learning_rate": 8.583190626260754e-06,
"loss": 0.408,
"step": 476
},
{
"epoch": 0.9700659563673262,
"grad_norm": 1.2111448095961146,
"learning_rate": 8.566688788936254e-06,
"loss": 0.4326,
"step": 478
},
{
"epoch": 0.974124809741248,
"grad_norm": 1.0960113471423047,
"learning_rate": 8.550107469749159e-06,
"loss": 0.4095,
"step": 480
},
{
"epoch": 0.9781836631151699,
"grad_norm": 1.2031487005930193,
"learning_rate": 8.533447038205805e-06,
"loss": 0.4019,
"step": 482
},
{
"epoch": 0.9822425164890918,
"grad_norm": 1.0541006609815473,
"learning_rate": 8.516707865575515e-06,
"loss": 0.4301,
"step": 484
},
{
"epoch": 0.9863013698630136,
"grad_norm": 1.1625544941021624,
"learning_rate": 8.499890324882323e-06,
"loss": 0.3998,
"step": 486
},
{
"epoch": 0.9903602232369355,
"grad_norm": 1.3066175946513412,
"learning_rate": 8.482994790896645e-06,
"loss": 0.4422,
"step": 488
},
{
"epoch": 0.9944190766108574,
"grad_norm": 1.0701571633478897,
"learning_rate": 8.466021640126946e-06,
"loss": 0.4122,
"step": 490
},
{
"epoch": 0.9984779299847792,
"grad_norm": 1.167166516481942,
"learning_rate": 8.448971250811337e-06,
"loss": 0.4137,
"step": 492
},
{
"epoch": 1.002029426686961,
"grad_norm": 1.2941849102817737,
"learning_rate": 8.431844002909153e-06,
"loss": 0.3068,
"step": 494
},
{
"epoch": 1.0060882800608828,
"grad_norm": 1.111084517678234,
"learning_rate": 8.414640278092485e-06,
"loss": 0.2196,
"step": 496
},
{
"epoch": 1.0101471334348047,
"grad_norm": 1.0562648719597976,
"learning_rate": 8.397360459737673e-06,
"loss": 0.214,
"step": 498
},
{
"epoch": 1.0142059868087265,
"grad_norm": 1.1997363186960575,
"learning_rate": 8.38000493291676e-06,
"loss": 0.1968,
"step": 500
},
{
"epoch": 1.0182648401826484,
"grad_norm": 1.173481743208335,
"learning_rate": 8.362574084388921e-06,
"loss": 0.2037,
"step": 502
},
{
"epoch": 1.0223236935565703,
"grad_norm": 1.0170814335662817,
"learning_rate": 8.34506830259183e-06,
"loss": 0.1732,
"step": 504
},
{
"epoch": 1.0263825469304921,
"grad_norm": 0.997455021719729,
"learning_rate": 8.327487977633013e-06,
"loss": 0.198,
"step": 506
},
{
"epoch": 1.030441400304414,
"grad_norm": 1.0596243435147559,
"learning_rate": 8.309833501281159e-06,
"loss": 0.1968,
"step": 508
},
{
"epoch": 1.0345002536783359,
"grad_norm": 1.1329929736584996,
"learning_rate": 8.292105266957372e-06,
"loss": 0.2058,
"step": 510
},
{
"epoch": 1.0385591070522577,
"grad_norm": 1.0617871718782863,
"learning_rate": 8.274303669726427e-06,
"loss": 0.1837,
"step": 512
},
{
"epoch": 1.0426179604261796,
"grad_norm": 1.0158373026810432,
"learning_rate": 8.256429106287944e-06,
"loss": 0.1937,
"step": 514
},
{
"epoch": 1.0466768138001015,
"grad_norm": 1.1124406488041407,
"learning_rate": 8.238481974967567e-06,
"loss": 0.2044,
"step": 516
},
{
"epoch": 1.0507356671740233,
"grad_norm": 1.0954828147640017,
"learning_rate": 8.220462675708075e-06,
"loss": 0.2025,
"step": 518
},
{
"epoch": 1.0547945205479452,
"grad_norm": 1.0243819070320326,
"learning_rate": 8.202371610060471e-06,
"loss": 0.1944,
"step": 520
},
{
"epoch": 1.058853373921867,
"grad_norm": 1.0760742610687821,
"learning_rate": 8.184209181175038e-06,
"loss": 0.1949,
"step": 522
},
{
"epoch": 1.062912227295789,
"grad_norm": 1.020386464750481,
"learning_rate": 8.165975793792355e-06,
"loss": 0.1923,
"step": 524
},
{
"epoch": 1.0669710806697108,
"grad_norm": 1.1029549076667262,
"learning_rate": 8.14767185423427e-06,
"loss": 0.18,
"step": 526
},
{
"epoch": 1.0710299340436327,
"grad_norm": 1.1869108297914424,
"learning_rate": 8.129297770394855e-06,
"loss": 0.199,
"step": 528
},
{
"epoch": 1.0750887874175545,
"grad_norm": 1.042008597384453,
"learning_rate": 8.11085395173131e-06,
"loss": 0.1758,
"step": 530
},
{
"epoch": 1.0791476407914764,
"grad_norm": 1.0824678674361556,
"learning_rate": 8.092340809254844e-06,
"loss": 0.183,
"step": 532
},
{
"epoch": 1.0832064941653983,
"grad_norm": 1.1733425605990007,
"learning_rate": 8.073758755521506e-06,
"loss": 0.2001,
"step": 534
},
{
"epoch": 1.0872653475393201,
"grad_norm": 0.989241357527303,
"learning_rate": 8.055108204623001e-06,
"loss": 0.1854,
"step": 536
},
{
"epoch": 1.091324200913242,
"grad_norm": 1.105028315498873,
"learning_rate": 8.03638957217746e-06,
"loss": 0.1887,
"step": 538
},
{
"epoch": 1.0953830542871639,
"grad_norm": 1.1215793382714723,
"learning_rate": 8.017603275320176e-06,
"loss": 0.206,
"step": 540
},
{
"epoch": 1.0994419076610857,
"grad_norm": 1.0430299772389053,
"learning_rate": 7.998749732694308e-06,
"loss": 0.1852,
"step": 542
},
{
"epoch": 1.1035007610350076,
"grad_norm": 1.0539243906524998,
"learning_rate": 7.979829364441555e-06,
"loss": 0.1792,
"step": 544
},
{
"epoch": 1.1075596144089295,
"grad_norm": 1.056850242292317,
"learning_rate": 7.960842592192792e-06,
"loss": 0.1914,
"step": 546
},
{
"epoch": 1.1116184677828513,
"grad_norm": 1.0273529138082944,
"learning_rate": 7.94178983905867e-06,
"loss": 0.1947,
"step": 548
},
{
"epoch": 1.1156773211567732,
"grad_norm": 1.0677655866471754,
"learning_rate": 7.922671529620192e-06,
"loss": 0.1901,
"step": 550
},
{
"epoch": 1.119736174530695,
"grad_norm": 1.012213849308213,
"learning_rate": 7.903488089919253e-06,
"loss": 0.1732,
"step": 552
},
{
"epoch": 1.123795027904617,
"grad_norm": 1.1676973953753516,
"learning_rate": 7.88423994744914e-06,
"loss": 0.2106,
"step": 554
},
{
"epoch": 1.1278538812785388,
"grad_norm": 1.0599154280202072,
"learning_rate": 7.864927531145012e-06,
"loss": 0.1868,
"step": 556
},
{
"epoch": 1.1319127346524607,
"grad_norm": 1.0897459769656754,
"learning_rate": 7.845551271374333e-06,
"loss": 0.1814,
"step": 558
},
{
"epoch": 1.1359715880263825,
"grad_norm": 1.055142014684741,
"learning_rate": 7.82611159992729e-06,
"loss": 0.1851,
"step": 560
},
{
"epoch": 1.1400304414003044,
"grad_norm": 1.1116965314079303,
"learning_rate": 7.80660895000717e-06,
"loss": 0.196,
"step": 562
},
{
"epoch": 1.1440892947742263,
"grad_norm": 1.039510707609459,
"learning_rate": 7.787043756220698e-06,
"loss": 0.1721,
"step": 564
},
{
"epoch": 1.1481481481481481,
"grad_norm": 1.128546678780832,
"learning_rate": 7.767416454568358e-06,
"loss": 0.1848,
"step": 566
},
{
"epoch": 1.15220700152207,
"grad_norm": 1.1259620179696028,
"learning_rate": 7.747727482434679e-06,
"loss": 0.2007,
"step": 568
},
{
"epoch": 1.1562658548959919,
"grad_norm": 1.1097229809191436,
"learning_rate": 7.727977278578484e-06,
"loss": 0.1881,
"step": 570
},
{
"epoch": 1.1603247082699137,
"grad_norm": 1.0616464097857343,
"learning_rate": 7.708166283123118e-06,
"loss": 0.1945,
"step": 572
},
{
"epoch": 1.1643835616438356,
"grad_norm": 1.073617478066992,
"learning_rate": 7.68829493754663e-06,
"loss": 0.1858,
"step": 574
},
{
"epoch": 1.1684424150177575,
"grad_norm": 1.0887445235919726,
"learning_rate": 7.668363684671947e-06,
"loss": 0.1857,
"step": 576
},
{
"epoch": 1.1725012683916793,
"grad_norm": 1.0401775398806878,
"learning_rate": 7.648372968656995e-06,
"loss": 0.1786,
"step": 578
},
{
"epoch": 1.1765601217656012,
"grad_norm": 1.072786873168531,
"learning_rate": 7.628323234984806e-06,
"loss": 0.1848,
"step": 580
},
{
"epoch": 1.180618975139523,
"grad_norm": 1.183804677665548,
"learning_rate": 7.608214930453597e-06,
"loss": 0.2032,
"step": 582
},
{
"epoch": 1.184677828513445,
"grad_norm": 1.1546921624510742,
"learning_rate": 7.588048503166801e-06,
"loss": 0.1933,
"step": 584
},
{
"epoch": 1.1887366818873668,
"grad_norm": 1.0646260835850125,
"learning_rate": 7.5678244025230894e-06,
"loss": 0.1842,
"step": 586
},
{
"epoch": 1.1927955352612887,
"grad_norm": 0.9351171981377732,
"learning_rate": 7.547543079206355e-06,
"loss": 0.1711,
"step": 588
},
{
"epoch": 1.1968543886352105,
"grad_norm": 1.1893988642652746,
"learning_rate": 7.5272049851756716e-06,
"loss": 0.2027,
"step": 590
},
{
"epoch": 1.2009132420091324,
"grad_norm": 1.0632981222064524,
"learning_rate": 7.506810573655215e-06,
"loss": 0.1852,
"step": 592
},
{
"epoch": 1.2049720953830543,
"grad_norm": 1.0836002498537225,
"learning_rate": 7.486360299124169e-06,
"loss": 0.1887,
"step": 594
},
{
"epoch": 1.2090309487569761,
"grad_norm": 1.0213871056780877,
"learning_rate": 7.4658546173066005e-06,
"loss": 0.1826,
"step": 596
},
{
"epoch": 1.213089802130898,
"grad_norm": 0.9528373737399318,
"learning_rate": 7.445293985161296e-06,
"loss": 0.1722,
"step": 598
},
{
"epoch": 1.2171486555048199,
"grad_norm": 0.9267772302163672,
"learning_rate": 7.424678860871584e-06,
"loss": 0.1754,
"step": 600
},
{
"epoch": 1.2212075088787417,
"grad_norm": 1.0580239859843474,
"learning_rate": 7.404009703835121e-06,
"loss": 0.1828,
"step": 602
},
{
"epoch": 1.2252663622526636,
"grad_norm": 1.1609412830600723,
"learning_rate": 7.383286974653659e-06,
"loss": 0.2043,
"step": 604
},
{
"epoch": 1.2293252156265855,
"grad_norm": 1.2441637701281891,
"learning_rate": 7.362511135122779e-06,
"loss": 0.2,
"step": 606
},
{
"epoch": 1.2333840690005073,
"grad_norm": 1.0712250042029285,
"learning_rate": 7.341682648221591e-06,
"loss": 0.1823,
"step": 608
},
{
"epoch": 1.2374429223744292,
"grad_norm": 0.9995902931065666,
"learning_rate": 7.320801978102434e-06,
"loss": 0.1826,
"step": 610
},
{
"epoch": 1.241501775748351,
"grad_norm": 1.1066959207293212,
"learning_rate": 7.299869590080524e-06,
"loss": 0.1916,
"step": 612
},
{
"epoch": 1.245560629122273,
"grad_norm": 1.102809387398261,
"learning_rate": 7.278885950623578e-06,
"loss": 0.2034,
"step": 614
},
{
"epoch": 1.2496194824961948,
"grad_norm": 1.2015908572580698,
"learning_rate": 7.257851527341429e-06,
"loss": 0.2007,
"step": 616
},
{
"epoch": 1.2536783358701167,
"grad_norm": 1.0215395009781163,
"learning_rate": 7.236766788975603e-06,
"loss": 0.1926,
"step": 618
},
{
"epoch": 1.2577371892440385,
"grad_norm": 0.9684806459895816,
"learning_rate": 7.215632205388872e-06,
"loss": 0.1738,
"step": 620
},
{
"epoch": 1.2617960426179604,
"grad_norm": 1.1014687188825973,
"learning_rate": 7.19444824755478e-06,
"loss": 0.1895,
"step": 622
},
{
"epoch": 1.2658548959918823,
"grad_norm": 1.0685183294149176,
"learning_rate": 7.173215387547155e-06,
"loss": 0.1798,
"step": 624
},
{
"epoch": 1.2699137493658041,
"grad_norm": 1.100155763152369,
"learning_rate": 7.151934098529583e-06,
"loss": 0.1876,
"step": 626
},
{
"epoch": 1.273972602739726,
"grad_norm": 1.1041782673624663,
"learning_rate": 7.130604854744871e-06,
"loss": 0.1959,
"step": 628
},
{
"epoch": 1.2780314561136479,
"grad_norm": 1.1670239082453848,
"learning_rate": 7.109228131504465e-06,
"loss": 0.2055,
"step": 630
},
{
"epoch": 1.2820903094875697,
"grad_norm": 1.0829456391928898,
"learning_rate": 7.087804405177876e-06,
"loss": 0.1866,
"step": 632
},
{
"epoch": 1.2861491628614916,
"grad_norm": 1.0004546162300938,
"learning_rate": 7.066334153182049e-06,
"loss": 0.1805,
"step": 634
},
{
"epoch": 1.2902080162354135,
"grad_norm": 1.032451555529876,
"learning_rate": 7.044817853970732e-06,
"loss": 0.1866,
"step": 636
},
{
"epoch": 1.2942668696093353,
"grad_norm": 1.0864837314090992,
"learning_rate": 7.023255987023813e-06,
"loss": 0.182,
"step": 638
},
{
"epoch": 1.2983257229832572,
"grad_norm": 1.0459756892568486,
"learning_rate": 7.001649032836631e-06,
"loss": 0.1863,
"step": 640
},
{
"epoch": 1.302384576357179,
"grad_norm": 1.0601944062820794,
"learning_rate": 6.9799974729092765e-06,
"loss": 0.1732,
"step": 642
},
{
"epoch": 1.306443429731101,
"grad_norm": 1.018697234233559,
"learning_rate": 6.958301789735853e-06,
"loss": 0.1763,
"step": 644
},
{
"epoch": 1.3105022831050228,
"grad_norm": 1.2251116288030364,
"learning_rate": 6.936562466793724e-06,
"loss": 0.21,
"step": 646
},
{
"epoch": 1.3145611364789447,
"grad_norm": 1.0662349795156443,
"learning_rate": 6.914779988532755e-06,
"loss": 0.1889,
"step": 648
},
{
"epoch": 1.3186199898528665,
"grad_norm": 1.1235644124407285,
"learning_rate": 6.892954840364493e-06,
"loss": 0.2028,
"step": 650
},
{
"epoch": 1.3226788432267884,
"grad_norm": 1.074570291741179,
"learning_rate": 6.871087508651373e-06,
"loss": 0.1884,
"step": 652
},
{
"epoch": 1.3267376966007103,
"grad_norm": 1.0531061994655868,
"learning_rate": 6.8491784806958616e-06,
"loss": 0.2021,
"step": 654
},
{
"epoch": 1.3307965499746321,
"grad_norm": 1.1819152398440131,
"learning_rate": 6.827228244729609e-06,
"loss": 0.1932,
"step": 656
},
{
"epoch": 1.334855403348554,
"grad_norm": 1.045483939181271,
"learning_rate": 6.805237289902565e-06,
"loss": 0.1965,
"step": 658
},
{
"epoch": 1.3389142567224759,
"grad_norm": 1.1758637342179898,
"learning_rate": 6.783206106272076e-06,
"loss": 0.198,
"step": 660
},
{
"epoch": 1.3429731100963977,
"grad_norm": 1.0914064475419278,
"learning_rate": 6.761135184791969e-06,
"loss": 0.1846,
"step": 662
},
{
"epoch": 1.3470319634703196,
"grad_norm": 1.0211966623620905,
"learning_rate": 6.7390250173016104e-06,
"loss": 0.181,
"step": 664
},
{
"epoch": 1.3510908168442415,
"grad_norm": 1.193565468654654,
"learning_rate": 6.716876096514944e-06,
"loss": 0.2095,
"step": 666
},
{
"epoch": 1.3551496702181633,
"grad_norm": 1.1271722792849745,
"learning_rate": 6.694688916009505e-06,
"loss": 0.1848,
"step": 668
},
{
"epoch": 1.3592085235920852,
"grad_norm": 1.1098782394361217,
"learning_rate": 6.672463970215436e-06,
"loss": 0.1961,
"step": 670
},
{
"epoch": 1.363267376966007,
"grad_norm": 1.1374613038031431,
"learning_rate": 6.650201754404455e-06,
"loss": 0.1836,
"step": 672
},
{
"epoch": 1.367326230339929,
"grad_norm": 1.0341949368176346,
"learning_rate": 6.627902764678824e-06,
"loss": 0.1881,
"step": 674
},
{
"epoch": 1.3713850837138508,
"grad_norm": 1.109962989096539,
"learning_rate": 6.605567497960295e-06,
"loss": 0.1803,
"step": 676
},
{
"epoch": 1.3754439370877727,
"grad_norm": 1.107735146712493,
"learning_rate": 6.583196451979031e-06,
"loss": 0.1917,
"step": 678
},
{
"epoch": 1.3795027904616946,
"grad_norm": 1.1579886280607274,
"learning_rate": 6.560790125262524e-06,
"loss": 0.1979,
"step": 680
},
{
"epoch": 1.3835616438356164,
"grad_norm": 0.878075470155148,
"learning_rate": 6.538349017124472e-06,
"loss": 0.1631,
"step": 682
},
{
"epoch": 1.3876204972095383,
"grad_norm": 1.062317827656781,
"learning_rate": 6.515873627653663e-06,
"loss": 0.1808,
"step": 684
},
{
"epoch": 1.3916793505834602,
"grad_norm": 1.0327124357250628,
"learning_rate": 6.493364457702831e-06,
"loss": 0.1799,
"step": 686
},
{
"epoch": 1.395738203957382,
"grad_norm": 1.132131135132688,
"learning_rate": 6.470822008877482e-06,
"loss": 0.1822,
"step": 688
},
{
"epoch": 1.3997970573313039,
"grad_norm": 1.0462034604519863,
"learning_rate": 6.448246783524734e-06,
"loss": 0.1919,
"step": 690
},
{
"epoch": 1.4038559107052258,
"grad_norm": 1.1044742895931243,
"learning_rate": 6.42563928472211e-06,
"loss": 0.1851,
"step": 692
},
{
"epoch": 1.4079147640791476,
"grad_norm": 1.1674244148460076,
"learning_rate": 6.403000016266326e-06,
"loss": 0.1866,
"step": 694
},
{
"epoch": 1.4119736174530695,
"grad_norm": 1.1296402785524131,
"learning_rate": 6.380329482662078e-06,
"loss": 0.2035,
"step": 696
},
{
"epoch": 1.4160324708269914,
"grad_norm": 1.0524131174312268,
"learning_rate": 6.35762818911078e-06,
"loss": 0.1717,
"step": 698
},
{
"epoch": 1.4200913242009132,
"grad_norm": 1.1191774371111942,
"learning_rate": 6.334896641499324e-06,
"loss": 0.178,
"step": 700
},
{
"epoch": 1.424150177574835,
"grad_norm": 1.047752132697504,
"learning_rate": 6.312135346388793e-06,
"loss": 0.1881,
"step": 702
},
{
"epoch": 1.428209030948757,
"grad_norm": 1.1743382728694667,
"learning_rate": 6.289344811003184e-06,
"loss": 0.2033,
"step": 704
},
{
"epoch": 1.4322678843226788,
"grad_norm": 1.1804585608436726,
"learning_rate": 6.2665255432180916e-06,
"loss": 0.1931,
"step": 706
},
{
"epoch": 1.4363267376966007,
"grad_norm": 1.0677096903056138,
"learning_rate": 6.2436780515494035e-06,
"loss": 0.1837,
"step": 708
},
{
"epoch": 1.4403855910705226,
"grad_norm": 1.2099132702699213,
"learning_rate": 6.2208028451419575e-06,
"loss": 0.2112,
"step": 710
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.1837036949871973,
"learning_rate": 6.197900433758205e-06,
"loss": 0.2021,
"step": 712
},
{
"epoch": 1.4485032978183663,
"grad_norm": 1.167969784888959,
"learning_rate": 6.174971327766842e-06,
"loss": 0.1958,
"step": 714
},
{
"epoch": 1.4525621511922882,
"grad_norm": 1.0584127834879178,
"learning_rate": 6.1520160381314465e-06,
"loss": 0.1854,
"step": 716
},
{
"epoch": 1.45662100456621,
"grad_norm": 1.0674665549424147,
"learning_rate": 6.129035076399077e-06,
"loss": 0.1896,
"step": 718
},
{
"epoch": 1.4606798579401319,
"grad_norm": 1.1061278306008033,
"learning_rate": 6.106028954688892e-06,
"loss": 0.1903,
"step": 720
},
{
"epoch": 1.4647387113140538,
"grad_norm": 1.0435415673333681,
"learning_rate": 6.082998185680718e-06,
"loss": 0.1872,
"step": 722
},
{
"epoch": 1.4687975646879756,
"grad_norm": 1.058891200904623,
"learning_rate": 6.059943282603642e-06,
"loss": 0.1983,
"step": 724
},
{
"epoch": 1.4728564180618975,
"grad_norm": 1.0795684120382831,
"learning_rate": 6.03686475922456e-06,
"loss": 0.178,
"step": 726
},
{
"epoch": 1.4769152714358194,
"grad_norm": 1.1384310108536333,
"learning_rate": 6.013763129836739e-06,
"loss": 0.1874,
"step": 728
},
{
"epoch": 1.4809741248097412,
"grad_norm": 1.1039777651990725,
"learning_rate": 5.990638909248352e-06,
"loss": 0.1941,
"step": 730
},
{
"epoch": 1.485032978183663,
"grad_norm": 1.0762677800080636,
"learning_rate": 5.967492612770999e-06,
"loss": 0.1869,
"step": 732
},
{
"epoch": 1.489091831557585,
"grad_norm": 1.0660512751481621,
"learning_rate": 5.944324756208238e-06,
"loss": 0.1807,
"step": 734
},
{
"epoch": 1.4931506849315068,
"grad_norm": 1.0935996132707635,
"learning_rate": 5.92113585584408e-06,
"loss": 0.1945,
"step": 736
},
{
"epoch": 1.4972095383054287,
"grad_norm": 1.1460959395776262,
"learning_rate": 5.897926428431485e-06,
"loss": 0.193,
"step": 738
},
{
"epoch": 1.5012683916793506,
"grad_norm": 1.1965912612959004,
"learning_rate": 5.87469699118085e-06,
"loss": 0.1941,
"step": 740
},
{
"epoch": 1.5053272450532724,
"grad_norm": 1.1926255825530645,
"learning_rate": 5.851448061748477e-06,
"loss": 0.1954,
"step": 742
},
{
"epoch": 1.5093860984271943,
"grad_norm": 1.0050356960035571,
"learning_rate": 5.828180158225047e-06,
"loss": 0.1812,
"step": 744
},
{
"epoch": 1.5134449518011162,
"grad_norm": 1.0259967073861873,
"learning_rate": 5.804893799124068e-06,
"loss": 0.1892,
"step": 746
},
{
"epoch": 1.517503805175038,
"grad_norm": 1.1502053032831951,
"learning_rate": 5.7815895033703164e-06,
"loss": 0.1965,
"step": 748
},
{
"epoch": 1.52156265854896,
"grad_norm": 1.2448018070646114,
"learning_rate": 5.758267790288282e-06,
"loss": 0.2082,
"step": 750
},
{
"epoch": 1.5256215119228818,
"grad_norm": 1.0310538568503425,
"learning_rate": 5.734929179590593e-06,
"loss": 0.1801,
"step": 752
},
{
"epoch": 1.5296803652968036,
"grad_norm": 1.0835597986581949,
"learning_rate": 5.711574191366427e-06,
"loss": 0.1807,
"step": 754
},
{
"epoch": 1.5337392186707255,
"grad_norm": 1.1039579795978836,
"learning_rate": 5.6882033460699294e-06,
"loss": 0.1934,
"step": 756
},
{
"epoch": 1.5377980720446474,
"grad_norm": 1.1564614770433477,
"learning_rate": 5.664817164508614e-06,
"loss": 0.183,
"step": 758
},
{
"epoch": 1.5418569254185692,
"grad_norm": 1.1933306807050015,
"learning_rate": 5.641416167831752e-06,
"loss": 0.1983,
"step": 760
},
{
"epoch": 1.545915778792491,
"grad_norm": 1.1642784370439,
"learning_rate": 5.618000877518767e-06,
"loss": 0.205,
"step": 762
},
{
"epoch": 1.549974632166413,
"grad_norm": 1.096750111637783,
"learning_rate": 5.594571815367602e-06,
"loss": 0.1871,
"step": 764
},
{
"epoch": 1.5540334855403348,
"grad_norm": 0.8896315598965203,
"learning_rate": 5.5711295034831034e-06,
"loss": 0.1588,
"step": 766
},
{
"epoch": 1.5580923389142567,
"grad_norm": 1.021696941589894,
"learning_rate": 5.547674464265384e-06,
"loss": 0.1885,
"step": 768
},
{
"epoch": 1.5621511922881786,
"grad_norm": 1.0760610238279678,
"learning_rate": 5.524207220398169e-06,
"loss": 0.1844,
"step": 770
},
{
"epoch": 1.5662100456621004,
"grad_norm": 1.0146299892564568,
"learning_rate": 5.500728294837168e-06,
"loss": 0.1717,
"step": 772
},
{
"epoch": 1.5702688990360223,
"grad_norm": 1.1203690278420046,
"learning_rate": 5.477238210798406e-06,
"loss": 0.1816,
"step": 774
},
{
"epoch": 1.5743277524099442,
"grad_norm": 1.213922753663776,
"learning_rate": 5.453737491746572e-06,
"loss": 0.1956,
"step": 776
},
{
"epoch": 1.578386605783866,
"grad_norm": 1.0789723847536306,
"learning_rate": 5.430226661383348e-06,
"loss": 0.1831,
"step": 778
},
{
"epoch": 1.582445459157788,
"grad_norm": 1.0165965114615476,
"learning_rate": 5.406706243635742e-06,
"loss": 0.1859,
"step": 780
},
{
"epoch": 1.5865043125317098,
"grad_norm": 0.9244907929973665,
"learning_rate": 5.383176762644416e-06,
"loss": 0.1799,
"step": 782
},
{
"epoch": 1.5905631659056316,
"grad_norm": 1.015459762165936,
"learning_rate": 5.359638742751994e-06,
"loss": 0.1859,
"step": 784
},
{
"epoch": 1.5946220192795535,
"grad_norm": 1.119032836602815,
"learning_rate": 5.3360927084913925e-06,
"loss": 0.1949,
"step": 786
},
{
"epoch": 1.5986808726534754,
"grad_norm": 0.9834799242339374,
"learning_rate": 5.312539184574123e-06,
"loss": 0.1795,
"step": 788
},
{
"epoch": 1.6027397260273972,
"grad_norm": 1.1257597437225455,
"learning_rate": 5.288978695878596e-06,
"loss": 0.1842,
"step": 790
},
{
"epoch": 1.606798579401319,
"grad_norm": 1.1490293546501014,
"learning_rate": 5.265411767438432e-06,
"loss": 0.1892,
"step": 792
},
{
"epoch": 1.610857432775241,
"grad_norm": 1.096272375625098,
"learning_rate": 5.241838924430757e-06,
"loss": 0.1857,
"step": 794
},
{
"epoch": 1.6149162861491628,
"grad_norm": 0.9881695704441573,
"learning_rate": 5.2182606921645e-06,
"loss": 0.1839,
"step": 796
},
{
"epoch": 1.6189751395230847,
"grad_norm": 1.0055701217382587,
"learning_rate": 5.194677596068689e-06,
"loss": 0.1974,
"step": 798
},
{
"epoch": 1.6230339928970066,
"grad_norm": 1.0445320687597668,
"learning_rate": 5.171090161680736e-06,
"loss": 0.186,
"step": 800
},
{
"epoch": 1.6270928462709284,
"grad_norm": 1.010532137401968,
"learning_rate": 5.1474989146347355e-06,
"loss": 0.1818,
"step": 802
},
{
"epoch": 1.6311516996448503,
"grad_norm": 1.059728950180328,
"learning_rate": 5.1239043806497365e-06,
"loss": 0.1878,
"step": 804
},
{
"epoch": 1.6352105530187722,
"grad_norm": 1.02492281938429,
"learning_rate": 5.100307085518046e-06,
"loss": 0.1792,
"step": 806
},
{
"epoch": 1.639269406392694,
"grad_norm": 1.1092364608711534,
"learning_rate": 5.076707555093491e-06,
"loss": 0.1816,
"step": 808
},
{
"epoch": 1.643328259766616,
"grad_norm": 0.9816745515421457,
"learning_rate": 5.053106315279721e-06,
"loss": 0.2025,
"step": 810
},
{
"epoch": 1.6473871131405378,
"grad_norm": 1.0000356438781097,
"learning_rate": 5.029503892018472e-06,
"loss": 0.1669,
"step": 812
},
{
"epoch": 1.6514459665144596,
"grad_norm": 1.0450858430582273,
"learning_rate": 5.005900811277856e-06,
"loss": 0.1802,
"step": 814
},
{
"epoch": 1.6555048198883815,
"grad_norm": 0.9390205529074375,
"learning_rate": 4.982297599040633e-06,
"loss": 0.1636,
"step": 816
},
{
"epoch": 1.6595636732623034,
"grad_norm": 1.154620857603639,
"learning_rate": 4.958694781292496e-06,
"loss": 0.1923,
"step": 818
},
{
"epoch": 1.6636225266362252,
"grad_norm": 1.1872938869192748,
"learning_rate": 4.935092884010347e-06,
"loss": 0.1873,
"step": 820
},
{
"epoch": 1.667681380010147,
"grad_norm": 1.0599792451386443,
"learning_rate": 4.911492433150573e-06,
"loss": 0.1809,
"step": 822
},
{
"epoch": 1.671740233384069,
"grad_norm": 1.1108344863089323,
"learning_rate": 4.887893954637335e-06,
"loss": 0.1864,
"step": 824
},
{
"epoch": 1.6757990867579908,
"grad_norm": 1.058121443964045,
"learning_rate": 4.86429797435083e-06,
"loss": 0.1766,
"step": 826
},
{
"epoch": 1.6798579401319127,
"grad_norm": 1.1323101168080565,
"learning_rate": 4.840705018115595e-06,
"loss": 0.1808,
"step": 828
},
{
"epoch": 1.6839167935058346,
"grad_norm": 1.128025551519256,
"learning_rate": 4.8171156116887725e-06,
"loss": 0.1757,
"step": 830
},
{
"epoch": 1.6879756468797564,
"grad_norm": 1.0934679444423028,
"learning_rate": 4.7935302807483965e-06,
"loss": 0.1924,
"step": 832
},
{
"epoch": 1.6920345002536783,
"grad_norm": 1.0760501447523048,
"learning_rate": 4.769949550881687e-06,
"loss": 0.1902,
"step": 834
},
{
"epoch": 1.6960933536276002,
"grad_norm": 1.0550473463955812,
"learning_rate": 4.746373947573325e-06,
"loss": 0.1787,
"step": 836
},
{
"epoch": 1.700152207001522,
"grad_norm": 1.2275660092618677,
"learning_rate": 4.722803996193753e-06,
"loss": 0.197,
"step": 838
},
{
"epoch": 1.704211060375444,
"grad_norm": 1.0505064696078903,
"learning_rate": 4.699240221987461e-06,
"loss": 0.1819,
"step": 840
},
{
"epoch": 1.7082699137493658,
"grad_norm": 1.1523046961319277,
"learning_rate": 4.6756831500612846e-06,
"loss": 0.1888,
"step": 842
},
{
"epoch": 1.7123287671232876,
"grad_norm": 0.9989431343495883,
"learning_rate": 4.652133305372705e-06,
"loss": 0.1727,
"step": 844
},
{
"epoch": 1.7163876204972095,
"grad_norm": 1.0740718139978316,
"learning_rate": 4.628591212718144e-06,
"loss": 0.1756,
"step": 846
},
{
"epoch": 1.7204464738711314,
"grad_norm": 1.1041425692480016,
"learning_rate": 4.605057396721275e-06,
"loss": 0.1741,
"step": 848
},
{
"epoch": 1.7245053272450532,
"grad_norm": 1.212666044737014,
"learning_rate": 4.58153238182133e-06,
"loss": 0.1841,
"step": 850
},
{
"epoch": 1.728564180618975,
"grad_norm": 1.0783964750466963,
"learning_rate": 4.558016692261412e-06,
"loss": 0.1698,
"step": 852
},
{
"epoch": 1.732623033992897,
"grad_norm": 1.0683744846668402,
"learning_rate": 4.534510852076817e-06,
"loss": 0.1886,
"step": 854
},
{
"epoch": 1.7366818873668188,
"grad_norm": 1.1011269960255068,
"learning_rate": 4.511015385083345e-06,
"loss": 0.1945,
"step": 856
},
{
"epoch": 1.7407407407407407,
"grad_norm": 0.9991671395240459,
"learning_rate": 4.487530814865646e-06,
"loss": 0.1824,
"step": 858
},
{
"epoch": 1.7447995941146626,
"grad_norm": 1.0566808884558716,
"learning_rate": 4.464057664765532e-06,
"loss": 0.1823,
"step": 860
},
{
"epoch": 1.7488584474885844,
"grad_norm": 1.0940203228626781,
"learning_rate": 4.440596457870327e-06,
"loss": 0.1834,
"step": 862
},
{
"epoch": 1.7529173008625063,
"grad_norm": 1.0234332049105062,
"learning_rate": 4.417147717001205e-06,
"loss": 0.1746,
"step": 864
},
{
"epoch": 1.7569761542364282,
"grad_norm": 0.9623367599486023,
"learning_rate": 4.393711964701541e-06,
"loss": 0.1682,
"step": 866
},
{
"epoch": 1.76103500761035,
"grad_norm": 1.0516978200243972,
"learning_rate": 4.37028972322527e-06,
"loss": 0.1786,
"step": 868
},
{
"epoch": 1.765093860984272,
"grad_norm": 1.1391069464012384,
"learning_rate": 4.346881514525236e-06,
"loss": 0.1791,
"step": 870
},
{
"epoch": 1.7691527143581938,
"grad_norm": 0.971499677774941,
"learning_rate": 4.323487860241582e-06,
"loss": 0.1672,
"step": 872
},
{
"epoch": 1.7732115677321156,
"grad_norm": 1.1577835890912351,
"learning_rate": 4.3001092816901055e-06,
"loss": 0.1854,
"step": 874
},
{
"epoch": 1.7772704211060375,
"grad_norm": 1.1217645675230743,
"learning_rate": 4.2767462998506485e-06,
"loss": 0.1823,
"step": 876
},
{
"epoch": 1.7813292744799594,
"grad_norm": 1.1190282031824559,
"learning_rate": 4.253399435355492e-06,
"loss": 0.1895,
"step": 878
},
{
"epoch": 1.7853881278538812,
"grad_norm": 1.0134907069750374,
"learning_rate": 4.230069208477745e-06,
"loss": 0.175,
"step": 880
},
{
"epoch": 1.789446981227803,
"grad_norm": 1.1494619938574746,
"learning_rate": 4.206756139119762e-06,
"loss": 0.1953,
"step": 882
},
{
"epoch": 1.793505834601725,
"grad_norm": 0.9248356141218419,
"learning_rate": 4.183460746801546e-06,
"loss": 0.1702,
"step": 884
},
{
"epoch": 1.7975646879756468,
"grad_norm": 1.0725930962377248,
"learning_rate": 4.160183550649176e-06,
"loss": 0.1778,
"step": 886
},
{
"epoch": 1.8016235413495687,
"grad_norm": 1.0788894008577279,
"learning_rate": 4.136925069383243e-06,
"loss": 0.1917,
"step": 888
},
{
"epoch": 1.8056823947234906,
"grad_norm": 1.0122516476461982,
"learning_rate": 4.113685821307282e-06,
"loss": 0.1898,
"step": 890
},
{
"epoch": 1.8097412480974124,
"grad_norm": 1.027424121449119,
"learning_rate": 4.090466324296228e-06,
"loss": 0.1822,
"step": 892
},
{
"epoch": 1.8138001014713343,
"grad_norm": 1.1269393961404834,
"learning_rate": 4.067267095784871e-06,
"loss": 0.1841,
"step": 894
},
{
"epoch": 1.8178589548452562,
"grad_norm": 1.0052196803723334,
"learning_rate": 4.044088652756332e-06,
"loss": 0.1629,
"step": 896
},
{
"epoch": 1.821917808219178,
"grad_norm": 1.079578577258494,
"learning_rate": 4.020931511730533e-06,
"loss": 0.1774,
"step": 898
},
{
"epoch": 1.8259766615931,
"grad_norm": 0.9587436391914074,
"learning_rate": 3.997796188752695e-06,
"loss": 0.1733,
"step": 900
},
{
"epoch": 1.8300355149670218,
"grad_norm": 0.9992614549374934,
"learning_rate": 3.974683199381836e-06,
"loss": 0.1685,
"step": 902
},
{
"epoch": 1.8340943683409436,
"grad_norm": 0.9418897276184947,
"learning_rate": 3.951593058679276e-06,
"loss": 0.1672,
"step": 904
},
{
"epoch": 1.8381532217148655,
"grad_norm": 1.1397268358795776,
"learning_rate": 3.928526281197169e-06,
"loss": 0.1749,
"step": 906
},
{
"epoch": 1.8422120750887874,
"grad_norm": 1.0440206163216095,
"learning_rate": 3.905483380967027e-06,
"loss": 0.1722,
"step": 908
},
{
"epoch": 1.8462709284627092,
"grad_norm": 1.048561547401053,
"learning_rate": 3.882464871488273e-06,
"loss": 0.1693,
"step": 910
},
{
"epoch": 1.850329781836631,
"grad_norm": 1.0284223905418497,
"learning_rate": 3.859471265716791e-06,
"loss": 0.1691,
"step": 912
},
{
"epoch": 1.854388635210553,
"grad_norm": 1.004974372673609,
"learning_rate": 3.836503076053501e-06,
"loss": 0.1751,
"step": 914
},
{
"epoch": 1.8584474885844748,
"grad_norm": 1.1435852033856233,
"learning_rate": 3.8135608143329404e-06,
"loss": 0.1809,
"step": 916
},
{
"epoch": 1.8625063419583967,
"grad_norm": 0.9996592509283232,
"learning_rate": 3.7906449918118493e-06,
"loss": 0.1696,
"step": 918
},
{
"epoch": 1.8665651953323186,
"grad_norm": 1.0632454270482863,
"learning_rate": 3.7677561191577873e-06,
"loss": 0.17,
"step": 920
},
{
"epoch": 1.8706240487062404,
"grad_norm": 1.026248873958979,
"learning_rate": 3.7448947064377496e-06,
"loss": 0.1768,
"step": 922
},
{
"epoch": 1.8746829020801623,
"grad_norm": 1.0006033609765281,
"learning_rate": 3.722061263106797e-06,
"loss": 0.1712,
"step": 924
},
{
"epoch": 1.8787417554540842,
"grad_norm": 1.0394166532597735,
"learning_rate": 3.699256297996714e-06,
"loss": 0.1802,
"step": 926
},
{
"epoch": 1.882800608828006,
"grad_norm": 1.1087386752635604,
"learning_rate": 3.6764803193046538e-06,
"loss": 0.1787,
"step": 928
},
{
"epoch": 1.886859462201928,
"grad_norm": 1.1539792806225302,
"learning_rate": 3.6537338345818273e-06,
"loss": 0.177,
"step": 930
},
{
"epoch": 1.8909183155758498,
"grad_norm": 1.1358496184900775,
"learning_rate": 3.6310173507221884e-06,
"loss": 0.1784,
"step": 932
},
{
"epoch": 1.8949771689497716,
"grad_norm": 0.9648197271266891,
"learning_rate": 3.6083313739511316e-06,
"loss": 0.1613,
"step": 934
},
{
"epoch": 1.8990360223236935,
"grad_norm": 1.069217067686545,
"learning_rate": 3.5856764098142207e-06,
"loss": 0.1722,
"step": 936
},
{
"epoch": 1.9030948756976154,
"grad_norm": 0.9827567009351711,
"learning_rate": 3.563052963165915e-06,
"loss": 0.1619,
"step": 938
},
{
"epoch": 1.9071537290715372,
"grad_norm": 1.0416626747952469,
"learning_rate": 3.5404615381583264e-06,
"loss": 0.1786,
"step": 940
},
{
"epoch": 1.911212582445459,
"grad_norm": 0.9796952362181767,
"learning_rate": 3.5179026382299752e-06,
"loss": 0.1635,
"step": 942
},
{
"epoch": 1.915271435819381,
"grad_norm": 1.0913636067798673,
"learning_rate": 3.4953767660945825e-06,
"loss": 0.1849,
"step": 944
},
{
"epoch": 1.9193302891933028,
"grad_norm": 1.054155532699976,
"learning_rate": 3.472884423729861e-06,
"loss": 0.1824,
"step": 946
},
{
"epoch": 1.9233891425672247,
"grad_norm": 1.1299730992487989,
"learning_rate": 3.4504261123663243e-06,
"loss": 0.1741,
"step": 948
},
{
"epoch": 1.9274479959411466,
"grad_norm": 0.9544541000662791,
"learning_rate": 3.4280023324761287e-06,
"loss": 0.1622,
"step": 950
},
{
"epoch": 1.9315068493150684,
"grad_norm": 1.0633189960260987,
"learning_rate": 3.4056135837619077e-06,
"loss": 0.1714,
"step": 952
},
{
"epoch": 1.9355657026889903,
"grad_norm": 0.9744187925381573,
"learning_rate": 3.3832603651456486e-06,
"loss": 0.1704,
"step": 954
},
{
"epoch": 1.9396245560629122,
"grad_norm": 1.0876850028674756,
"learning_rate": 3.360943174757564e-06,
"loss": 0.1835,
"step": 956
},
{
"epoch": 1.943683409436834,
"grad_norm": 0.9933052886092801,
"learning_rate": 3.3386625099249957e-06,
"loss": 0.1722,
"step": 958
},
{
"epoch": 1.947742262810756,
"grad_norm": 1.0507655110716982,
"learning_rate": 3.3164188671613382e-06,
"loss": 0.1799,
"step": 960
},
{
"epoch": 1.9518011161846778,
"grad_norm": 0.9635760320459535,
"learning_rate": 3.29421274215496e-06,
"loss": 0.1665,
"step": 962
},
{
"epoch": 1.9558599695585996,
"grad_norm": 1.0098097588789372,
"learning_rate": 3.2720446297581696e-06,
"loss": 0.1756,
"step": 964
},
{
"epoch": 1.9599188229325215,
"grad_norm": 0.9487863077375068,
"learning_rate": 3.2499150239761813e-06,
"loss": 0.1674,
"step": 966
},
{
"epoch": 1.9639776763064434,
"grad_norm": 1.0886147748414823,
"learning_rate": 3.2278244179561107e-06,
"loss": 0.176,
"step": 968
},
{
"epoch": 1.9680365296803652,
"grad_norm": 1.0933879192223048,
"learning_rate": 3.205773303975982e-06,
"loss": 0.1649,
"step": 970
},
{
"epoch": 1.972095383054287,
"grad_norm": 1.032499198840103,
"learning_rate": 3.1837621734337607e-06,
"loss": 0.1712,
"step": 972
},
{
"epoch": 1.976154236428209,
"grad_norm": 1.0255453322259884,
"learning_rate": 3.1617915168363994e-06,
"loss": 0.1835,
"step": 974
},
{
"epoch": 1.9802130898021308,
"grad_norm": 0.9986750671875287,
"learning_rate": 3.1398618237889124e-06,
"loss": 0.1685,
"step": 976
},
{
"epoch": 1.9842719431760527,
"grad_norm": 1.018414001751852,
"learning_rate": 3.11797358298346e-06,
"loss": 0.1707,
"step": 978
},
{
"epoch": 1.9883307965499746,
"grad_norm": 1.0770634669309533,
"learning_rate": 3.096127282188458e-06,
"loss": 0.1687,
"step": 980
},
{
"epoch": 1.9923896499238964,
"grad_norm": 1.1276389976988863,
"learning_rate": 3.074323408237716e-06,
"loss": 0.1788,
"step": 982
},
{
"epoch": 1.9964485032978183,
"grad_norm": 1.1430608051198146,
"learning_rate": 3.0525624470195746e-06,
"loss": 0.1878,
"step": 984
},
{
"epoch": 2.0,
"grad_norm": 1.1495443627078172,
"learning_rate": 3.0308448834660953e-06,
"loss": 0.1664,
"step": 986
},
{
"epoch": 2.004058853373922,
"grad_norm": 0.7595072680880963,
"learning_rate": 3.009171201542235e-06,
"loss": 0.073,
"step": 988
},
{
"epoch": 2.0081177067478437,
"grad_norm": 0.6553749481487482,
"learning_rate": 2.987541884235078e-06,
"loss": 0.0666,
"step": 990
},
{
"epoch": 2.0121765601217656,
"grad_norm": 0.6758730321723564,
"learning_rate": 2.965957413543063e-06,
"loss": 0.068,
"step": 992
},
{
"epoch": 2.0162354134956875,
"grad_norm": 0.7705357959194599,
"learning_rate": 2.944418270465243e-06,
"loss": 0.0722,
"step": 994
},
{
"epoch": 2.0202942668696093,
"grad_norm": 0.7447986269637112,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.0636,
"step": 996
},
{
"epoch": 2.024353120243531,
"grad_norm": 0.7070522483364615,
"learning_rate": 2.9014778860871916e-06,
"loss": 0.056,
"step": 998
},
{
"epoch": 2.028411973617453,
"grad_norm": 0.8137822848077799,
"learning_rate": 2.880077601691793e-06,
"loss": 0.0711,
"step": 1000
},
{
"epoch": 2.032470826991375,
"grad_norm": 0.6930740331498185,
"learning_rate": 2.8587245586989265e-06,
"loss": 0.0619,
"step": 1002
},
{
"epoch": 2.036529680365297,
"grad_norm": 0.6609838916960153,
"learning_rate": 2.8374192329503934e-06,
"loss": 0.0604,
"step": 1004
},
{
"epoch": 2.0405885337392187,
"grad_norm": 0.6985284878939515,
"learning_rate": 2.8161620992246497e-06,
"loss": 0.0616,
"step": 1006
},
{
"epoch": 2.0446473871131405,
"grad_norm": 0.697512001428481,
"learning_rate": 2.7949536312262048e-06,
"loss": 0.0649,
"step": 1008
},
{
"epoch": 2.0487062404870624,
"grad_norm": 0.6960549223161825,
"learning_rate": 2.7737943015750862e-06,
"loss": 0.0699,
"step": 1010
},
{
"epoch": 2.0527650938609843,
"grad_norm": 0.7147137635432833,
"learning_rate": 2.752684581796292e-06,
"loss": 0.0626,
"step": 1012
},
{
"epoch": 2.056823947234906,
"grad_norm": 0.6478212242747857,
"learning_rate": 2.7316249423092923e-06,
"loss": 0.0594,
"step": 1014
},
{
"epoch": 2.060882800608828,
"grad_norm": 0.7050061564241327,
"learning_rate": 2.7106158524175396e-06,
"loss": 0.0646,
"step": 1016
},
{
"epoch": 2.06494165398275,
"grad_norm": 0.5836812360722653,
"learning_rate": 2.689657780298019e-06,
"loss": 0.0552,
"step": 1018
},
{
"epoch": 2.0690005073566717,
"grad_norm": 0.6963206280767881,
"learning_rate": 2.6687511929908093e-06,
"loss": 0.0633,
"step": 1020
},
{
"epoch": 2.0730593607305936,
"grad_norm": 0.6223916195319845,
"learning_rate": 2.6478965563886745e-06,
"loss": 0.0567,
"step": 1022
},
{
"epoch": 2.0771182141045155,
"grad_norm": 0.7658362298283596,
"learning_rate": 2.627094335226682e-06,
"loss": 0.059,
"step": 1024
},
{
"epoch": 2.0811770674784373,
"grad_norm": 0.6242034540359834,
"learning_rate": 2.6063449930718487e-06,
"loss": 0.0566,
"step": 1026
},
{
"epoch": 2.085235920852359,
"grad_norm": 0.6057344953235689,
"learning_rate": 2.5856489923128136e-06,
"loss": 0.0573,
"step": 1028
},
{
"epoch": 2.089294774226281,
"grad_norm": 0.6209117592060627,
"learning_rate": 2.5650067941495236e-06,
"loss": 0.0543,
"step": 1030
},
{
"epoch": 2.093353627600203,
"grad_norm": 0.626145712835931,
"learning_rate": 2.5444188585829634e-06,
"loss": 0.0573,
"step": 1032
},
{
"epoch": 2.097412480974125,
"grad_norm": 0.7019968254783542,
"learning_rate": 2.523885644404906e-06,
"loss": 0.0629,
"step": 1034
},
{
"epoch": 2.1014713343480467,
"grad_norm": 0.672737803900476,
"learning_rate": 2.5034076091876813e-06,
"loss": 0.0599,
"step": 1036
},
{
"epoch": 2.1055301877219685,
"grad_norm": 0.7666550066371539,
"learning_rate": 2.48298520927399e-06,
"loss": 0.0685,
"step": 1038
},
{
"epoch": 2.1095890410958904,
"grad_norm": 0.6186035266270663,
"learning_rate": 2.4626188997667224e-06,
"loss": 0.0528,
"step": 1040
},
{
"epoch": 2.1136478944698123,
"grad_norm": 0.6362086867459513,
"learning_rate": 2.4423091345188244e-06,
"loss": 0.0609,
"step": 1042
},
{
"epoch": 2.117706747843734,
"grad_norm": 0.7043829575983082,
"learning_rate": 2.4220563661231793e-06,
"loss": 0.0607,
"step": 1044
},
{
"epoch": 2.121765601217656,
"grad_norm": 0.6342780093035242,
"learning_rate": 2.4018610459025317e-06,
"loss": 0.0614,
"step": 1046
},
{
"epoch": 2.125824454591578,
"grad_norm": 0.6371937965298768,
"learning_rate": 2.381723623899412e-06,
"loss": 0.0576,
"step": 1048
},
{
"epoch": 2.1298833079654997,
"grad_norm": 0.7261372327288902,
"learning_rate": 2.361644548866127e-06,
"loss": 0.0612,
"step": 1050
},
{
"epoch": 2.1339421613394216,
"grad_norm": 0.6921022790180155,
"learning_rate": 2.341624268254747e-06,
"loss": 0.0637,
"step": 1052
},
{
"epoch": 2.1380010147133435,
"grad_norm": 0.7180722302503428,
"learning_rate": 2.3216632282071345e-06,
"loss": 0.0653,
"step": 1054
},
{
"epoch": 2.1420598680872653,
"grad_norm": 0.6048159926460217,
"learning_rate": 2.3017618735450142e-06,
"loss": 0.055,
"step": 1056
},
{
"epoch": 2.146118721461187,
"grad_norm": 0.6876882976918033,
"learning_rate": 2.2819206477600462e-06,
"loss": 0.0593,
"step": 1058
},
{
"epoch": 2.150177574835109,
"grad_norm": 0.6830162087163217,
"learning_rate": 2.2621399930039493e-06,
"loss": 0.0576,
"step": 1060
},
{
"epoch": 2.154236428209031,
"grad_norm": 0.572916709401609,
"learning_rate": 2.2424203500786473e-06,
"loss": 0.0565,
"step": 1062
},
{
"epoch": 2.158295281582953,
"grad_norm": 0.6064104088259805,
"learning_rate": 2.2227621584264505e-06,
"loss": 0.0609,
"step": 1064
},
{
"epoch": 2.1623541349568747,
"grad_norm": 0.5345159650560205,
"learning_rate": 2.203165856120251e-06,
"loss": 0.0486,
"step": 1066
},
{
"epoch": 2.1664129883307965,
"grad_norm": 0.708241954697193,
"learning_rate": 2.183631879853776e-06,
"loss": 0.0592,
"step": 1068
},
{
"epoch": 2.1704718417047184,
"grad_norm": 0.6237247767204619,
"learning_rate": 2.164160664931843e-06,
"loss": 0.0564,
"step": 1070
},
{
"epoch": 2.1745306950786403,
"grad_norm": 0.6913274477398279,
"learning_rate": 2.1447526452606658e-06,
"loss": 0.0608,
"step": 1072
},
{
"epoch": 2.178589548452562,
"grad_norm": 0.615699302647019,
"learning_rate": 2.125408253338183e-06,
"loss": 0.0572,
"step": 1074
},
{
"epoch": 2.182648401826484,
"grad_norm": 0.7114790842641555,
"learning_rate": 2.106127920244423e-06,
"loss": 0.056,
"step": 1076
},
{
"epoch": 2.186707255200406,
"grad_norm": 0.6498843232992042,
"learning_rate": 2.086912075631896e-06,
"loss": 0.0579,
"step": 1078
},
{
"epoch": 2.1907661085743277,
"grad_norm": 0.6342534301664807,
"learning_rate": 2.067761147716017e-06,
"loss": 0.0573,
"step": 1080
},
{
"epoch": 2.1948249619482496,
"grad_norm": 0.7027558078351507,
"learning_rate": 2.0486755632655643e-06,
"loss": 0.0593,
"step": 1082
},
{
"epoch": 2.1988838153221715,
"grad_norm": 0.6558346648533067,
"learning_rate": 2.029655747593169e-06,
"loss": 0.0605,
"step": 1084
},
{
"epoch": 2.2029426686960933,
"grad_norm": 0.6450513139758751,
"learning_rate": 2.010702124545845e-06,
"loss": 0.0598,
"step": 1086
},
{
"epoch": 2.207001522070015,
"grad_norm": 0.7322704077213636,
"learning_rate": 1.9918151164955303e-06,
"loss": 0.0617,
"step": 1088
},
{
"epoch": 2.211060375443937,
"grad_norm": 0.7103114005030767,
"learning_rate": 1.9729951443296823e-06,
"loss": 0.0564,
"step": 1090
},
{
"epoch": 2.215119228817859,
"grad_norm": 0.6912085691591306,
"learning_rate": 1.9542426274418975e-06,
"loss": 0.0628,
"step": 1092
},
{
"epoch": 2.219178082191781,
"grad_norm": 0.7492868491245555,
"learning_rate": 1.9355579837225673e-06,
"loss": 0.0601,
"step": 1094
},
{
"epoch": 2.2232369355657027,
"grad_norm": 0.6846296844726598,
"learning_rate": 1.916941629549565e-06,
"loss": 0.0562,
"step": 1096
},
{
"epoch": 2.2272957889396245,
"grad_norm": 0.6860703433669731,
"learning_rate": 1.8983939797789624e-06,
"loss": 0.0604,
"step": 1098
},
{
"epoch": 2.2313546423135464,
"grad_norm": 0.6583737331854461,
"learning_rate": 1.8799154477357883e-06,
"loss": 0.057,
"step": 1100
},
{
"epoch": 2.2354134956874683,
"grad_norm": 0.6015963283689161,
"learning_rate": 1.8615064452048181e-06,
"loss": 0.0529,
"step": 1102
},
{
"epoch": 2.23947234906139,
"grad_norm": 0.6596224589385736,
"learning_rate": 1.8431673824214013e-06,
"loss": 0.0607,
"step": 1104
},
{
"epoch": 2.243531202435312,
"grad_norm": 0.6295377331681089,
"learning_rate": 1.8248986680623077e-06,
"loss": 0.0524,
"step": 1106
},
{
"epoch": 2.247590055809234,
"grad_norm": 0.7623992953499044,
"learning_rate": 1.8067007092366368e-06,
"loss": 0.0633,
"step": 1108
},
{
"epoch": 2.2516489091831557,
"grad_norm": 0.6747010480441555,
"learning_rate": 1.7885739114767292e-06,
"loss": 0.0575,
"step": 1110
},
{
"epoch": 2.2557077625570776,
"grad_norm": 0.6640676472618579,
"learning_rate": 1.770518678729139e-06,
"loss": 0.0532,
"step": 1112
},
{
"epoch": 2.2597666159309995,
"grad_norm": 0.6588950267013456,
"learning_rate": 1.752535413345634e-06,
"loss": 0.0572,
"step": 1114
},
{
"epoch": 2.2638254693049213,
"grad_norm": 0.6957155982625279,
"learning_rate": 1.734624516074221e-06,
"loss": 0.0591,
"step": 1116
},
{
"epoch": 2.267884322678843,
"grad_norm": 0.7568659886745189,
"learning_rate": 1.716786386050221e-06,
"loss": 0.0619,
"step": 1118
},
{
"epoch": 2.271943176052765,
"grad_norm": 0.7149883674184384,
"learning_rate": 1.6990214207873723e-06,
"loss": 0.0603,
"step": 1120
},
{
"epoch": 2.276002029426687,
"grad_norm": 0.6375454124392296,
"learning_rate": 1.681330016168977e-06,
"loss": 0.0583,
"step": 1122
},
{
"epoch": 2.280060882800609,
"grad_norm": 0.7001731734101665,
"learning_rate": 1.6637125664390747e-06,
"loss": 0.06,
"step": 1124
},
{
"epoch": 2.2841197361745307,
"grad_norm": 0.5744052821303995,
"learning_rate": 1.6461694641936544e-06,
"loss": 0.0532,
"step": 1126
},
{
"epoch": 2.2881785895484525,
"grad_norm": 0.6379817597651213,
"learning_rate": 1.6287011003719105e-06,
"loss": 0.0581,
"step": 1128
},
{
"epoch": 2.2922374429223744,
"grad_norm": 0.6936132311539229,
"learning_rate": 1.61130786424753e-06,
"loss": 0.0578,
"step": 1130
},
{
"epoch": 2.2962962962962963,
"grad_norm": 0.7643680282316857,
"learning_rate": 1.5939901434200145e-06,
"loss": 0.0587,
"step": 1132
},
{
"epoch": 2.300355149670218,
"grad_norm": 0.676227187971244,
"learning_rate": 1.5767483238060498e-06,
"loss": 0.0568,
"step": 1134
},
{
"epoch": 2.30441400304414,
"grad_norm": 0.6485083388679013,
"learning_rate": 1.5595827896308968e-06,
"loss": 0.0615,
"step": 1136
},
{
"epoch": 2.308472856418062,
"grad_norm": 0.6513490211287265,
"learning_rate": 1.5424939234198377e-06,
"loss": 0.0558,
"step": 1138
},
{
"epoch": 2.3125317097919837,
"grad_norm": 0.6470779888157924,
"learning_rate": 1.5254821059896452e-06,
"loss": 0.0569,
"step": 1140
},
{
"epoch": 2.3165905631659056,
"grad_norm": 0.6690972634285901,
"learning_rate": 1.5085477164400975e-06,
"loss": 0.0564,
"step": 1142
},
{
"epoch": 2.3206494165398275,
"grad_norm": 0.6192010726163617,
"learning_rate": 1.4916911321455362e-06,
"loss": 0.0566,
"step": 1144
},
{
"epoch": 2.3247082699137493,
"grad_norm": 0.6718507373057219,
"learning_rate": 1.4749127287464483e-06,
"loss": 0.0566,
"step": 1146
},
{
"epoch": 2.328767123287671,
"grad_norm": 0.6291964856953526,
"learning_rate": 1.458212880141099e-06,
"loss": 0.0568,
"step": 1148
},
{
"epoch": 2.332825976661593,
"grad_norm": 0.6090771051076672,
"learning_rate": 1.4415919584771999e-06,
"loss": 0.0547,
"step": 1150
},
{
"epoch": 2.336884830035515,
"grad_norm": 0.6315155634950337,
"learning_rate": 1.425050334143616e-06,
"loss": 0.0586,
"step": 1152
},
{
"epoch": 2.340943683409437,
"grad_norm": 0.682944731395333,
"learning_rate": 1.408588375762114e-06,
"loss": 0.0575,
"step": 1154
},
{
"epoch": 2.3450025367833587,
"grad_norm": 0.6828351505916127,
"learning_rate": 1.39220645017914e-06,
"loss": 0.0575,
"step": 1156
},
{
"epoch": 2.3490613901572805,
"grad_norm": 0.6343438475682116,
"learning_rate": 1.3759049224576516e-06,
"loss": 0.054,
"step": 1158
},
{
"epoch": 2.3531202435312024,
"grad_norm": 0.6608391141452298,
"learning_rate": 1.3596841558689788e-06,
"loss": 0.0611,
"step": 1160
},
{
"epoch": 2.3571790969051243,
"grad_norm": 0.6847038775101427,
"learning_rate": 1.3435445118847362e-06,
"loss": 0.0597,
"step": 1162
},
{
"epoch": 2.361237950279046,
"grad_norm": 0.6758006038700237,
"learning_rate": 1.3274863501687546e-06,
"loss": 0.0582,
"step": 1164
},
{
"epoch": 2.365296803652968,
"grad_norm": 0.679273586750369,
"learning_rate": 1.3115100285690795e-06,
"loss": 0.0586,
"step": 1166
},
{
"epoch": 2.36935565702689,
"grad_norm": 0.6455382817232485,
"learning_rate": 1.2956159031099874e-06,
"loss": 0.0572,
"step": 1168
},
{
"epoch": 2.3734145104008117,
"grad_norm": 0.6494368906651962,
"learning_rate": 1.2798043279840544e-06,
"loss": 0.0573,
"step": 1170
},
{
"epoch": 2.3774733637747336,
"grad_norm": 0.6652797010014586,
"learning_rate": 1.2640756555442684e-06,
"loss": 0.0585,
"step": 1172
},
{
"epoch": 2.3815322171486555,
"grad_norm": 0.7085143214543068,
"learning_rate": 1.248430236296168e-06,
"loss": 0.0531,
"step": 1174
},
{
"epoch": 2.3855910705225774,
"grad_norm": 0.7017172691247079,
"learning_rate": 1.2328684188900392e-06,
"loss": 0.0562,
"step": 1176
},
{
"epoch": 2.389649923896499,
"grad_norm": 0.5911653357727898,
"learning_rate": 1.2173905501131395e-06,
"loss": 0.0555,
"step": 1178
},
{
"epoch": 2.393708777270421,
"grad_norm": 0.7052853605271917,
"learning_rate": 1.2019969748819783e-06,
"loss": 0.0633,
"step": 1180
},
{
"epoch": 2.397767630644343,
"grad_norm": 0.5468703307014926,
"learning_rate": 1.186688036234625e-06,
"loss": 0.0512,
"step": 1182
},
{
"epoch": 2.401826484018265,
"grad_norm": 0.6298551223639655,
"learning_rate": 1.1714640753230628e-06,
"loss": 0.0523,
"step": 1184
},
{
"epoch": 2.4058853373921867,
"grad_norm": 0.7357211211763364,
"learning_rate": 1.1563254314055893e-06,
"loss": 0.0553,
"step": 1186
},
{
"epoch": 2.4099441907661086,
"grad_norm": 0.6645237994069922,
"learning_rate": 1.1412724418392562e-06,
"loss": 0.0544,
"step": 1188
},
{
"epoch": 2.4140030441400304,
"grad_norm": 0.661565716961166,
"learning_rate": 1.126305442072354e-06,
"loss": 0.055,
"step": 1190
},
{
"epoch": 2.4180618975139523,
"grad_norm": 0.6206331269413049,
"learning_rate": 1.1114247656369305e-06,
"loss": 0.0545,
"step": 1192
},
{
"epoch": 2.422120750887874,
"grad_norm": 0.7180840889505126,
"learning_rate": 1.0966307441413598e-06,
"loss": 0.0581,
"step": 1194
},
{
"epoch": 2.426179604261796,
"grad_norm": 0.7354347742714406,
"learning_rate": 1.0819237072629606e-06,
"loss": 0.0597,
"step": 1196
},
{
"epoch": 2.430238457635718,
"grad_norm": 0.664963118331295,
"learning_rate": 1.0673039827406373e-06,
"loss": 0.0592,
"step": 1198
},
{
"epoch": 2.4342973110096398,
"grad_norm": 0.6596757154643482,
"learning_rate": 1.0527718963675871e-06,
"loss": 0.0543,
"step": 1200
},
{
"epoch": 2.4383561643835616,
"grad_norm": 0.614862575729698,
"learning_rate": 1.0383277719840318e-06,
"loss": 0.051,
"step": 1202
},
{
"epoch": 2.4424150177574835,
"grad_norm": 0.6239737484592334,
"learning_rate": 1.0239719314700052e-06,
"loss": 0.0569,
"step": 1204
},
{
"epoch": 2.4464738711314054,
"grad_norm": 0.7008527413286773,
"learning_rate": 1.0097046947381805e-06,
"loss": 0.0622,
"step": 1206
},
{
"epoch": 2.450532724505327,
"grad_norm": 0.646447221626618,
"learning_rate": 9.955263797267379e-07,
"loss": 0.0593,
"step": 1208
},
{
"epoch": 2.454591577879249,
"grad_norm": 0.6736248842428098,
"learning_rate": 9.814373023922851e-07,
"loss": 0.0573,
"step": 1210
},
{
"epoch": 2.458650431253171,
"grad_norm": 0.7520656749859748,
"learning_rate": 9.674377767028142e-07,
"loss": 0.0595,
"step": 1212
},
{
"epoch": 2.462709284627093,
"grad_norm": 0.6256286530852058,
"learning_rate": 9.53528114630699e-07,
"loss": 0.0539,
"step": 1214
},
{
"epoch": 2.4667681380010147,
"grad_norm": 0.7163476466314366,
"learning_rate": 9.397086261457511e-07,
"loss": 0.0587,
"step": 1216
},
{
"epoch": 2.4708269913749366,
"grad_norm": 0.6810504627251797,
"learning_rate": 9.259796192083071e-07,
"loss": 0.0576,
"step": 1218
},
{
"epoch": 2.4748858447488584,
"grad_norm": 0.6288859512257164,
"learning_rate": 9.123413997623714e-07,
"loss": 0.0543,
"step": 1220
},
{
"epoch": 2.4789446981227803,
"grad_norm": 0.6740418425171263,
"learning_rate": 8.987942717287923e-07,
"loss": 0.0578,
"step": 1222
},
{
"epoch": 2.483003551496702,
"grad_norm": 0.6224042862768536,
"learning_rate": 8.853385369984901e-07,
"loss": 0.0537,
"step": 1224
},
{
"epoch": 2.487062404870624,
"grad_norm": 0.6485173083194978,
"learning_rate": 8.719744954257375e-07,
"loss": 0.056,
"step": 1226
},
{
"epoch": 2.491121258244546,
"grad_norm": 0.6582025110825541,
"learning_rate": 8.587024448214637e-07,
"loss": 0.0541,
"step": 1228
},
{
"epoch": 2.4951801116184678,
"grad_norm": 0.7255418838785723,
"learning_rate": 8.455226809466327e-07,
"loss": 0.0592,
"step": 1230
},
{
"epoch": 2.4992389649923896,
"grad_norm": 0.6507900300493289,
"learning_rate": 8.324354975056403e-07,
"loss": 0.0539,
"step": 1232
},
{
"epoch": 2.5032978183663115,
"grad_norm": 0.6681824734246471,
"learning_rate": 8.19441186139776e-07,
"loss": 0.0591,
"step": 1234
},
{
"epoch": 2.5073566717402334,
"grad_norm": 0.6936354997773724,
"learning_rate": 8.065400364207194e-07,
"loss": 0.0584,
"step": 1236
},
{
"epoch": 2.5114155251141552,
"grad_norm": 0.6718221656136698,
"learning_rate": 7.937323358440935e-07,
"loss": 0.0543,
"step": 1238
},
{
"epoch": 2.515474378488077,
"grad_norm": 0.6632718887777156,
"learning_rate": 7.810183698230539e-07,
"loss": 0.0572,
"step": 1240
},
{
"epoch": 2.519533231861999,
"grad_norm": 0.6239004674471934,
"learning_rate": 7.683984216819262e-07,
"loss": 0.0545,
"step": 1242
},
{
"epoch": 2.523592085235921,
"grad_norm": 0.5944403471674328,
"learning_rate": 7.55872772649896e-07,
"loss": 0.0535,
"step": 1244
},
{
"epoch": 2.5276509386098427,
"grad_norm": 0.6165723170085607,
"learning_rate": 7.434417018547396e-07,
"loss": 0.0514,
"step": 1246
},
{
"epoch": 2.5317097919837646,
"grad_norm": 0.7183547419188132,
"learning_rate": 7.311054863166095e-07,
"loss": 0.0588,
"step": 1248
},
{
"epoch": 2.5357686453576864,
"grad_norm": 0.7242691023134634,
"learning_rate": 7.188644009418517e-07,
"loss": 0.0603,
"step": 1250
},
{
"epoch": 2.5398274987316083,
"grad_norm": 0.5791350405500479,
"learning_rate": 7.067187185168862e-07,
"loss": 0.0531,
"step": 1252
},
{
"epoch": 2.54388635210553,
"grad_norm": 0.6776524885992443,
"learning_rate": 6.946687097021249e-07,
"loss": 0.0544,
"step": 1254
},
{
"epoch": 2.547945205479452,
"grad_norm": 0.6068439005721586,
"learning_rate": 6.827146430259446e-07,
"loss": 0.0504,
"step": 1256
},
{
"epoch": 2.552004058853374,
"grad_norm": 0.6517780554148217,
"learning_rate": 6.70856784878699e-07,
"loss": 0.0576,
"step": 1258
},
{
"epoch": 2.5560629122272958,
"grad_norm": 0.6551482566155284,
"learning_rate": 6.590953995067812e-07,
"loss": 0.0585,
"step": 1260
},
{
"epoch": 2.5601217656012176,
"grad_norm": 0.7195939705815774,
"learning_rate": 6.474307490067383e-07,
"loss": 0.0591,
"step": 1262
},
{
"epoch": 2.5641806189751395,
"grad_norm": 0.7410151414847665,
"learning_rate": 6.358630933194282e-07,
"loss": 0.0618,
"step": 1264
},
{
"epoch": 2.5682394723490614,
"grad_norm": 0.6972961543769066,
"learning_rate": 6.24392690224232e-07,
"loss": 0.0607,
"step": 1266
},
{
"epoch": 2.5722983257229832,
"grad_norm": 0.6627967555045137,
"learning_rate": 6.130197953333017e-07,
"loss": 0.0602,
"step": 1268
},
{
"epoch": 2.576357179096905,
"grad_norm": 0.6185680861600283,
"learning_rate": 6.017446620858708e-07,
"loss": 0.0565,
"step": 1270
},
{
"epoch": 2.580416032470827,
"grad_norm": 0.6250339920749016,
"learning_rate": 5.905675417426027e-07,
"loss": 0.0572,
"step": 1272
},
{
"epoch": 2.584474885844749,
"grad_norm": 0.6417143051901513,
"learning_rate": 5.794886833799923e-07,
"loss": 0.0514,
"step": 1274
},
{
"epoch": 2.5885337392186707,
"grad_norm": 0.5742514445618982,
"learning_rate": 5.685083338848152e-07,
"loss": 0.0509,
"step": 1276
},
{
"epoch": 2.5925925925925926,
"grad_norm": 0.6842845774267343,
"learning_rate": 5.576267379486294e-07,
"loss": 0.0608,
"step": 1278
},
{
"epoch": 2.5966514459665144,
"grad_norm": 0.7682701087480387,
"learning_rate": 5.468441380623169e-07,
"loss": 0.0619,
"step": 1280
},
{
"epoch": 2.6007102993404363,
"grad_norm": 0.6379342121635503,
"learning_rate": 5.361607745106817e-07,
"loss": 0.0534,
"step": 1282
},
{
"epoch": 2.604769152714358,
"grad_norm": 0.6746813956871355,
"learning_rate": 5.255768853671011e-07,
"loss": 0.0568,
"step": 1284
},
{
"epoch": 2.60882800608828,
"grad_norm": 0.5662682410250746,
"learning_rate": 5.150927064882089e-07,
"loss": 0.0488,
"step": 1286
},
{
"epoch": 2.612886859462202,
"grad_norm": 0.7495492021339842,
"learning_rate": 5.047084715086515e-07,
"loss": 0.0627,
"step": 1288
},
{
"epoch": 2.6169457128361238,
"grad_norm": 0.5821244808744749,
"learning_rate": 4.944244118358721e-07,
"loss": 0.0496,
"step": 1290
},
{
"epoch": 2.6210045662100456,
"grad_norm": 0.6716113774136223,
"learning_rate": 4.842407566449591e-07,
"loss": 0.0527,
"step": 1292
},
{
"epoch": 2.6250634195839675,
"grad_norm": 0.6675478536309039,
"learning_rate": 4.741577328735364e-07,
"loss": 0.0562,
"step": 1294
},
{
"epoch": 2.6291222729578894,
"grad_norm": 0.6215223340536244,
"learning_rate": 4.641755652167107e-07,
"loss": 0.0557,
"step": 1296
},
{
"epoch": 2.6331811263318112,
"grad_norm": 0.6998801247830552,
"learning_rate": 4.5429447612205635e-07,
"loss": 0.0559,
"step": 1298
},
{
"epoch": 2.637239979705733,
"grad_norm": 0.7251216921883125,
"learning_rate": 4.445146857846672e-07,
"loss": 0.0505,
"step": 1300
},
{
"epoch": 2.641298833079655,
"grad_norm": 0.674778502737252,
"learning_rate": 4.3483641214224325e-07,
"loss": 0.0536,
"step": 1302
},
{
"epoch": 2.645357686453577,
"grad_norm": 0.603226222531606,
"learning_rate": 4.2525987087023433e-07,
"loss": 0.0492,
"step": 1304
},
{
"epoch": 2.6494165398274987,
"grad_norm": 0.7870111250427357,
"learning_rate": 4.1578527537703973e-07,
"loss": 0.061,
"step": 1306
},
{
"epoch": 2.6534753932014206,
"grad_norm": 0.6451056158523719,
"learning_rate": 4.064128367992459e-07,
"loss": 0.0556,
"step": 1308
},
{
"epoch": 2.6575342465753424,
"grad_norm": 0.7137232587300326,
"learning_rate": 3.971427639969233e-07,
"loss": 0.0557,
"step": 1310
},
{
"epoch": 2.6615930999492643,
"grad_norm": 0.6325976622831373,
"learning_rate": 3.879752635489736e-07,
"loss": 0.0525,
"step": 1312
},
{
"epoch": 2.665651953323186,
"grad_norm": 0.670332593338025,
"learning_rate": 3.7891053974852597e-07,
"loss": 0.0524,
"step": 1314
},
{
"epoch": 2.669710806697108,
"grad_norm": 0.6627524615579112,
"learning_rate": 3.6994879459838375e-07,
"loss": 0.0557,
"step": 1316
},
{
"epoch": 2.67376966007103,
"grad_norm": 0.6143115984862325,
"learning_rate": 3.6109022780652147e-07,
"loss": 0.0569,
"step": 1318
},
{
"epoch": 2.6778285134449518,
"grad_norm": 0.645959952937934,
"learning_rate": 3.5233503678163696e-07,
"loss": 0.0571,
"step": 1320
},
{
"epoch": 2.6818873668188736,
"grad_norm": 0.6681068621470589,
"learning_rate": 3.4368341662875004e-07,
"loss": 0.0535,
"step": 1322
},
{
"epoch": 2.6859462201927955,
"grad_norm": 0.6686418775510267,
"learning_rate": 3.3513556014485805e-07,
"loss": 0.0615,
"step": 1324
},
{
"epoch": 2.6900050735667174,
"grad_norm": 0.7424511108861405,
"learning_rate": 3.26691657814634e-07,
"loss": 0.0592,
"step": 1326
},
{
"epoch": 2.6940639269406392,
"grad_norm": 0.659620559037728,
"learning_rate": 3.183518978061895e-07,
"loss": 0.0555,
"step": 1328
},
{
"epoch": 2.698122780314561,
"grad_norm": 0.6744968743539785,
"learning_rate": 3.101164659668732e-07,
"loss": 0.0557,
"step": 1330
},
{
"epoch": 2.702181633688483,
"grad_norm": 0.6611821221163022,
"learning_rate": 3.0198554581913343e-07,
"loss": 0.0572,
"step": 1332
},
{
"epoch": 2.706240487062405,
"grad_norm": 0.6422102786010568,
"learning_rate": 2.9395931855643043e-07,
"loss": 0.0529,
"step": 1334
},
{
"epoch": 2.7102993404363267,
"grad_norm": 0.5990751150942317,
"learning_rate": 2.860379630391935e-07,
"loss": 0.0522,
"step": 1336
},
{
"epoch": 2.7143581938102486,
"grad_norm": 0.6397947037965095,
"learning_rate": 2.7822165579084013e-07,
"loss": 0.0516,
"step": 1338
},
{
"epoch": 2.7184170471841704,
"grad_norm": 0.6033731180075299,
"learning_rate": 2.705105709938388e-07,
"loss": 0.0522,
"step": 1340
},
{
"epoch": 2.7224759005580923,
"grad_norm": 0.5850767244585305,
"learning_rate": 2.629048804858275e-07,
"loss": 0.0571,
"step": 1342
},
{
"epoch": 2.726534753932014,
"grad_norm": 0.6793421437999266,
"learning_rate": 2.5540475375578967e-07,
"loss": 0.0579,
"step": 1344
},
{
"epoch": 2.730593607305936,
"grad_norm": 0.6089471972028935,
"learning_rate": 2.4801035794026987e-07,
"loss": 0.0537,
"step": 1346
},
{
"epoch": 2.734652460679858,
"grad_norm": 0.5978644646078147,
"learning_rate": 2.407218578196524e-07,
"loss": 0.0521,
"step": 1348
},
{
"epoch": 2.7387113140537798,
"grad_norm": 0.6521169806760104,
"learning_rate": 2.3353941581449048e-07,
"loss": 0.0584,
"step": 1350
},
{
"epoch": 2.7427701674277016,
"grad_norm": 0.6427655880180726,
"learning_rate": 2.2646319198188495e-07,
"loss": 0.0531,
"step": 1352
},
{
"epoch": 2.7468290208016235,
"grad_norm": 0.7400104362172217,
"learning_rate": 2.1949334401192013e-07,
"loss": 0.0597,
"step": 1354
},
{
"epoch": 2.7508878741755454,
"grad_norm": 0.7047967548664266,
"learning_rate": 2.1263002722414383e-07,
"loss": 0.0593,
"step": 1356
},
{
"epoch": 2.7549467275494672,
"grad_norm": 0.6950676115493591,
"learning_rate": 2.0587339456411503e-07,
"loss": 0.0558,
"step": 1358
},
{
"epoch": 2.759005580923389,
"grad_norm": 0.6247096449549893,
"learning_rate": 1.9922359659998724e-07,
"loss": 0.0535,
"step": 1360
},
{
"epoch": 2.763064434297311,
"grad_norm": 0.6324280760328311,
"learning_rate": 1.9268078151915724e-07,
"loss": 0.0572,
"step": 1362
},
{
"epoch": 2.767123287671233,
"grad_norm": 0.6887667157732265,
"learning_rate": 1.8624509512496336e-07,
"loss": 0.0567,
"step": 1364
},
{
"epoch": 2.7711821410451547,
"grad_norm": 0.6432677564664775,
"learning_rate": 1.799166808334335e-07,
"loss": 0.0561,
"step": 1366
},
{
"epoch": 2.7752409944190766,
"grad_norm": 0.6374676188470956,
"learning_rate": 1.7369567967009226e-07,
"loss": 0.052,
"step": 1368
},
{
"epoch": 2.7792998477929984,
"grad_norm": 0.6411778692609862,
"learning_rate": 1.6758223026681507e-07,
"loss": 0.056,
"step": 1370
},
{
"epoch": 2.7833587011669203,
"grad_norm": 0.5631134199833117,
"learning_rate": 1.615764688587429e-07,
"loss": 0.0508,
"step": 1372
},
{
"epoch": 2.787417554540842,
"grad_norm": 0.6819140357771947,
"learning_rate": 1.5567852928124237e-07,
"loss": 0.0571,
"step": 1374
},
{
"epoch": 2.791476407914764,
"grad_norm": 0.5799311609819854,
"learning_rate": 1.4988854296692557e-07,
"loss": 0.0503,
"step": 1376
},
{
"epoch": 2.795535261288686,
"grad_norm": 0.6901834332410363,
"learning_rate": 1.442066389427199e-07,
"loss": 0.0599,
"step": 1378
},
{
"epoch": 2.7995941146626078,
"grad_norm": 0.6242349895835719,
"learning_rate": 1.386329438269929e-07,
"loss": 0.0563,
"step": 1380
},
{
"epoch": 2.8036529680365296,
"grad_norm": 0.6747442124346058,
"learning_rate": 1.3316758182673307e-07,
"loss": 0.0559,
"step": 1382
},
{
"epoch": 2.8077118214104515,
"grad_norm": 0.6806732183113077,
"learning_rate": 1.2781067473477905e-07,
"loss": 0.0553,
"step": 1384
},
{
"epoch": 2.8117706747843734,
"grad_norm": 0.5453489129772455,
"learning_rate": 1.225623419271055e-07,
"loss": 0.0492,
"step": 1386
},
{
"epoch": 2.8158295281582952,
"grad_norm": 0.6628204710761366,
"learning_rate": 1.1742270036016523e-07,
"loss": 0.0542,
"step": 1388
},
{
"epoch": 2.819888381532217,
"grad_norm": 0.6122682091369429,
"learning_rate": 1.1239186456828033e-07,
"loss": 0.0551,
"step": 1390
},
{
"epoch": 2.823947234906139,
"grad_norm": 0.6936962374041317,
"learning_rate": 1.0746994666109234e-07,
"loss": 0.0573,
"step": 1392
},
{
"epoch": 2.828006088280061,
"grad_norm": 0.5643200467907136,
"learning_rate": 1.0265705632106216e-07,
"loss": 0.0546,
"step": 1394
},
{
"epoch": 2.8320649416539827,
"grad_norm": 0.6353165412067484,
"learning_rate": 9.795330080102527e-08,
"loss": 0.0541,
"step": 1396
},
{
"epoch": 2.8361237950279046,
"grad_norm": 0.6131605918166376,
"learning_rate": 9.335878492180373e-08,
"loss": 0.0519,
"step": 1398
},
{
"epoch": 2.8401826484018264,
"grad_norm": 0.6838700561564602,
"learning_rate": 8.887361106986848e-08,
"loss": 0.0557,
"step": 1400
},
{
"epoch": 2.8442415017757483,
"grad_norm": 0.6903440686599182,
"learning_rate": 8.44978791950607e-08,
"loss": 0.0594,
"step": 1402
},
{
"epoch": 2.84830035514967,
"grad_norm": 0.6727563667189779,
"learning_rate": 8.023168680835913e-08,
"loss": 0.0599,
"step": 1404
},
{
"epoch": 2.852359208523592,
"grad_norm": 0.6714864747461081,
"learning_rate": 7.60751289797118e-08,
"loss": 0.0596,
"step": 1406
},
{
"epoch": 2.856418061897514,
"grad_norm": 0.6100055956786066,
"learning_rate": 7.202829833591496e-08,
"loss": 0.056,
"step": 1408
},
{
"epoch": 2.8604769152714358,
"grad_norm": 0.6561132306996892,
"learning_rate": 6.809128505855189e-08,
"loss": 0.0556,
"step": 1410
},
{
"epoch": 2.8645357686453576,
"grad_norm": 0.6332585194044941,
"learning_rate": 6.426417688197961e-08,
"loss": 0.0532,
"step": 1412
},
{
"epoch": 2.8685946220192795,
"grad_norm": 0.6170375673674818,
"learning_rate": 6.054705909137426e-08,
"loss": 0.0496,
"step": 1414
},
{
"epoch": 2.8726534753932014,
"grad_norm": 0.60220768047444,
"learning_rate": 5.6940014520834865e-08,
"loss": 0.0539,
"step": 1416
},
{
"epoch": 2.8767123287671232,
"grad_norm": 0.5816521708453392,
"learning_rate": 5.344312355153036e-08,
"loss": 0.0523,
"step": 1418
},
{
"epoch": 2.880771182141045,
"grad_norm": 0.6233216693560418,
"learning_rate": 5.005646410991549e-08,
"loss": 0.0547,
"step": 1420
},
{
"epoch": 2.884830035514967,
"grad_norm": 0.6419014793947228,
"learning_rate": 4.678011166598884e-08,
"loss": 0.0619,
"step": 1422
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.6187502232959146,
"learning_rate": 4.3614139231614725e-08,
"loss": 0.0506,
"step": 1424
},
{
"epoch": 2.8929477422628107,
"grad_norm": 0.6573679267165627,
"learning_rate": 4.0558617358892326e-08,
"loss": 0.054,
"step": 1426
},
{
"epoch": 2.8970065956367326,
"grad_norm": 0.7332565562788992,
"learning_rate": 3.7613614138587995e-08,
"loss": 0.0587,
"step": 1428
},
{
"epoch": 2.9010654490106544,
"grad_norm": 0.6415121216139319,
"learning_rate": 3.477919519861428e-08,
"loss": 0.0537,
"step": 1430
},
{
"epoch": 2.9051243023845763,
"grad_norm": 0.7104972559533141,
"learning_rate": 3.205542370256997e-08,
"loss": 0.0549,
"step": 1432
},
{
"epoch": 2.909183155758498,
"grad_norm": 0.6659234362952994,
"learning_rate": 2.944236034832959e-08,
"loss": 0.059,
"step": 1434
},
{
"epoch": 2.91324200913242,
"grad_norm": 0.6504771749865272,
"learning_rate": 2.6940063366693303e-08,
"loss": 0.0545,
"step": 1436
},
{
"epoch": 2.917300862506342,
"grad_norm": 0.6035759484889454,
"learning_rate": 2.4548588520089123e-08,
"loss": 0.0544,
"step": 1438
},
{
"epoch": 2.9213597158802638,
"grad_norm": 0.6114721671996689,
"learning_rate": 2.2267989101328878e-08,
"loss": 0.0531,
"step": 1440
},
{
"epoch": 2.9254185692541856,
"grad_norm": 0.6776141271918054,
"learning_rate": 2.0098315932421952e-08,
"loss": 0.0548,
"step": 1442
},
{
"epoch": 2.9294774226281075,
"grad_norm": 0.5785157075548263,
"learning_rate": 1.803961736344062e-08,
"loss": 0.0489,
"step": 1444
},
{
"epoch": 2.9335362760020294,
"grad_norm": 0.6257664598177896,
"learning_rate": 1.6091939271446478e-08,
"loss": 0.0541,
"step": 1446
},
{
"epoch": 2.9375951293759512,
"grad_norm": 0.6581827205937377,
"learning_rate": 1.4255325059463477e-08,
"loss": 0.057,
"step": 1448
},
{
"epoch": 2.941653982749873,
"grad_norm": 0.5842892930844981,
"learning_rate": 1.252981565551481e-08,
"loss": 0.0524,
"step": 1450
},
{
"epoch": 2.945712836123795,
"grad_norm": 0.660064548851826,
"learning_rate": 1.0915449511708088e-08,
"loss": 0.0546,
"step": 1452
},
{
"epoch": 2.949771689497717,
"grad_norm": 0.6419857691002046,
"learning_rate": 9.412262603378797e-09,
"loss": 0.0544,
"step": 1454
},
{
"epoch": 2.9538305428716387,
"grad_norm": 0.5794278961094809,
"learning_rate": 8.020288428289836e-09,
"loss": 0.0532,
"step": 1456
},
{
"epoch": 2.9578893962455606,
"grad_norm": 0.7121133884438979,
"learning_rate": 6.739558005884883e-09,
"loss": 0.0577,
"step": 1458
},
{
"epoch": 2.9619482496194824,
"grad_norm": 0.6610137998761876,
"learning_rate": 5.570099876595625e-09,
"loss": 0.0582,
"step": 1460
},
{
"epoch": 2.9660071029934043,
"grad_norm": 0.5782156107576852,
"learning_rate": 4.511940101207812e-09,
"loss": 0.0517,
"step": 1462
},
{
"epoch": 2.970065956367326,
"grad_norm": 0.6497750292771783,
"learning_rate": 3.565102260278397e-09,
"loss": 0.0566,
"step": 1464
},
{
"epoch": 2.974124809741248,
"grad_norm": 0.6075980392613102,
"learning_rate": 2.72960745361206e-09,
"loss": 0.0521,
"step": 1466
},
{
"epoch": 2.97818366311517,
"grad_norm": 0.6593396919810925,
"learning_rate": 2.0054742997893674e-09,
"loss": 0.0561,
"step": 1468
},
{
"epoch": 2.9822425164890918,
"grad_norm": 0.6349479858496213,
"learning_rate": 1.392718935752102e-09,
"loss": 0.0527,
"step": 1470
},
{
"epoch": 2.9863013698630136,
"grad_norm": 0.6624916331368633,
"learning_rate": 8.913550164463269e-10,
"loss": 0.053,
"step": 1472
},
{
"epoch": 2.9903602232369355,
"grad_norm": 0.6290549360832741,
"learning_rate": 5.013937145131875e-10,
"loss": 0.0563,
"step": 1474
},
{
"epoch": 2.9944190766108574,
"grad_norm": 0.625615525565498,
"learning_rate": 2.2284372004410804e-10,
"loss": 0.0562,
"step": 1476
},
{
"epoch": 2.9984779299847792,
"grad_norm": 0.5809460613562152,
"learning_rate": 5.5711240385392106e-11,
"loss": 0.0501,
"step": 1478
},
{
"epoch": 3.0,
"step": 1479,
"total_flos": 4127658346151936.0,
"train_loss": 0.21015015432889433,
"train_runtime": 144374.0796,
"train_samples_per_second": 1.311,
"train_steps_per_second": 0.01
}
],
"logging_steps": 2,
"max_steps": 1479,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4127658346151936.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}