Files
qwen2.5-VL-3B-atm-finetune-…/trainer_state.json

2221 lines
53 KiB
JSON
Raw Permalink Normal View History

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 311,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003215434083601286,
"grad_norm": 7.461493968963623,
"learning_rate": 0.0,
"loss": 1.4509,
"step": 1
},
{
"epoch": 0.006430868167202572,
"grad_norm": 7.841219425201416,
"learning_rate": 3.125e-07,
"loss": 1.4192,
"step": 2
},
{
"epoch": 0.00964630225080386,
"grad_norm": 8.970290184020996,
"learning_rate": 6.25e-07,
"loss": 1.4011,
"step": 3
},
{
"epoch": 0.012861736334405145,
"grad_norm": 6.137041091918945,
"learning_rate": 9.375000000000001e-07,
"loss": 1.3749,
"step": 4
},
{
"epoch": 0.01607717041800643,
"grad_norm": 5.494370460510254,
"learning_rate": 1.25e-06,
"loss": 1.3428,
"step": 5
},
{
"epoch": 0.01929260450160772,
"grad_norm": 4.324854373931885,
"learning_rate": 1.5625e-06,
"loss": 1.2579,
"step": 6
},
{
"epoch": 0.022508038585209004,
"grad_norm": 5.789402008056641,
"learning_rate": 1.8750000000000003e-06,
"loss": 1.2162,
"step": 7
},
{
"epoch": 0.02572347266881029,
"grad_norm": 4.819009304046631,
"learning_rate": 2.1875000000000002e-06,
"loss": 1.1246,
"step": 8
},
{
"epoch": 0.028938906752411574,
"grad_norm": 4.173788070678711,
"learning_rate": 2.5e-06,
"loss": 1.0988,
"step": 9
},
{
"epoch": 0.03215434083601286,
"grad_norm": 3.964716911315918,
"learning_rate": 2.8125e-06,
"loss": 1.0293,
"step": 10
},
{
"epoch": 0.03536977491961415,
"grad_norm": 3.7624258995056152,
"learning_rate": 3.125e-06,
"loss": 1.0261,
"step": 11
},
{
"epoch": 0.03858520900321544,
"grad_norm": 2.7567455768585205,
"learning_rate": 3.4375e-06,
"loss": 0.9731,
"step": 12
},
{
"epoch": 0.04180064308681672,
"grad_norm": 11.352989196777344,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.9391,
"step": 13
},
{
"epoch": 0.04501607717041801,
"grad_norm": 2.96602201461792,
"learning_rate": 4.0625000000000005e-06,
"loss": 0.8457,
"step": 14
},
{
"epoch": 0.04823151125401929,
"grad_norm": 3.602654218673706,
"learning_rate": 4.3750000000000005e-06,
"loss": 0.854,
"step": 15
},
{
"epoch": 0.05144694533762058,
"grad_norm": 3.3031013011932373,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.7838,
"step": 16
},
{
"epoch": 0.05466237942122187,
"grad_norm": 3.8468689918518066,
"learning_rate": 5e-06,
"loss": 0.8028,
"step": 17
},
{
"epoch": 0.05787781350482315,
"grad_norm": 5.588563919067383,
"learning_rate": 5.3125e-06,
"loss": 0.7586,
"step": 18
},
{
"epoch": 0.06109324758842444,
"grad_norm": 3.002431631088257,
"learning_rate": 5.625e-06,
"loss": 0.7981,
"step": 19
},
{
"epoch": 0.06430868167202572,
"grad_norm": 2.289362668991089,
"learning_rate": 5.9375e-06,
"loss": 0.6801,
"step": 20
},
{
"epoch": 0.06752411575562701,
"grad_norm": 2.4948556423187256,
"learning_rate": 6.25e-06,
"loss": 0.7076,
"step": 21
},
{
"epoch": 0.0707395498392283,
"grad_norm": 3.449002742767334,
"learning_rate": 6.5625e-06,
"loss": 0.6911,
"step": 22
},
{
"epoch": 0.07395498392282958,
"grad_norm": 2.8027281761169434,
"learning_rate": 6.875e-06,
"loss": 0.6124,
"step": 23
},
{
"epoch": 0.07717041800643087,
"grad_norm": 3.1212947368621826,
"learning_rate": 7.1875e-06,
"loss": 0.6253,
"step": 24
},
{
"epoch": 0.08038585209003216,
"grad_norm": 2.3612632751464844,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6117,
"step": 25
},
{
"epoch": 0.08360128617363344,
"grad_norm": 3.0025129318237305,
"learning_rate": 7.8125e-06,
"loss": 0.5949,
"step": 26
},
{
"epoch": 0.08681672025723473,
"grad_norm": 2.812004566192627,
"learning_rate": 8.125000000000001e-06,
"loss": 0.5983,
"step": 27
},
{
"epoch": 0.09003215434083602,
"grad_norm": 3.4180963039398193,
"learning_rate": 8.4375e-06,
"loss": 0.5706,
"step": 28
},
{
"epoch": 0.0932475884244373,
"grad_norm": 3.952913522720337,
"learning_rate": 8.750000000000001e-06,
"loss": 0.536,
"step": 29
},
{
"epoch": 0.09646302250803858,
"grad_norm": 2.5324349403381348,
"learning_rate": 9.0625e-06,
"loss": 0.5384,
"step": 30
},
{
"epoch": 0.09967845659163987,
"grad_norm": 2.9339852333068848,
"learning_rate": 9.375000000000001e-06,
"loss": 0.5491,
"step": 31
},
{
"epoch": 0.10289389067524116,
"grad_norm": 2.1388156414031982,
"learning_rate": 9.6875e-06,
"loss": 0.524,
"step": 32
},
{
"epoch": 0.10610932475884244,
"grad_norm": 2.724695920944214,
"learning_rate": 1e-05,
"loss": 0.5224,
"step": 33
},
{
"epoch": 0.10932475884244373,
"grad_norm": 2.3847827911376953,
"learning_rate": 9.999683023724021e-06,
"loss": 0.5185,
"step": 34
},
{
"epoch": 0.11254019292604502,
"grad_norm": 5.435914039611816,
"learning_rate": 9.998732135085665e-06,
"loss": 0.5219,
"step": 35
},
{
"epoch": 0.1157556270096463,
"grad_norm": 2.3512284755706787,
"learning_rate": 9.99714745464859e-06,
"loss": 0.4951,
"step": 36
},
{
"epoch": 0.1189710610932476,
"grad_norm": 2.4358675479888916,
"learning_rate": 9.994929183335237e-06,
"loss": 0.5087,
"step": 37
},
{
"epoch": 0.12218649517684887,
"grad_norm": 4.703172206878662,
"learning_rate": 9.992077602401358e-06,
"loss": 0.4586,
"step": 38
},
{
"epoch": 0.12540192926045016,
"grad_norm": 1.9784342050552368,
"learning_rate": 9.988593073400354e-06,
"loss": 0.4962,
"step": 39
},
{
"epoch": 0.12861736334405144,
"grad_norm": 2.9739573001861572,
"learning_rate": 9.984476038137437e-06,
"loss": 0.5232,
"step": 40
},
{
"epoch": 0.13183279742765272,
"grad_norm": 4.0715413093566895,
"learning_rate": 9.979727018613607e-06,
"loss": 0.4603,
"step": 41
},
{
"epoch": 0.13504823151125403,
"grad_norm": 3.099459409713745,
"learning_rate": 9.974346616959476e-06,
"loss": 0.456,
"step": 42
},
{
"epoch": 0.1382636655948553,
"grad_norm": 3.039191246032715,
"learning_rate": 9.968335515358916e-06,
"loss": 0.45,
"step": 43
},
{
"epoch": 0.1414790996784566,
"grad_norm": 22.894182205200195,
"learning_rate": 9.961694475962562e-06,
"loss": 0.4605,
"step": 44
},
{
"epoch": 0.14469453376205788,
"grad_norm": 2.549868106842041,
"learning_rate": 9.954424340791195e-06,
"loss": 0.4636,
"step": 45
},
{
"epoch": 0.14790996784565916,
"grad_norm": 2.1484735012054443,
"learning_rate": 9.94652603162896e-06,
"loss": 0.4533,
"step": 46
},
{
"epoch": 0.15112540192926044,
"grad_norm": 2.6148245334625244,
"learning_rate": 9.938000549906509e-06,
"loss": 0.442,
"step": 47
},
{
"epoch": 0.15434083601286175,
"grad_norm": 3.580359697341919,
"learning_rate": 9.92884897657402e-06,
"loss": 0.477,
"step": 48
},
{
"epoch": 0.15755627009646303,
"grad_norm": 2.5778746604919434,
"learning_rate": 9.919072471964146e-06,
"loss": 0.4266,
"step": 49
},
{
"epoch": 0.1607717041800643,
"grad_norm": 9.264074325561523,
"learning_rate": 9.908672275644898e-06,
"loss": 0.4375,
"step": 50
},
{
"epoch": 0.1639871382636656,
"grad_norm": 3.2539267539978027,
"learning_rate": 9.897649706262474e-06,
"loss": 0.4438,
"step": 51
},
{
"epoch": 0.16720257234726688,
"grad_norm": 3.373600721359253,
"learning_rate": 9.88600616137407e-06,
"loss": 0.4388,
"step": 52
},
{
"epoch": 0.17041800643086816,
"grad_norm": 5.1715898513793945,
"learning_rate": 9.873743117270691e-06,
"loss": 0.4612,
"step": 53
},
{
"epoch": 0.17363344051446947,
"grad_norm": 4.894754409790039,
"learning_rate": 9.860862128789954e-06,
"loss": 0.4513,
"step": 54
},
{
"epoch": 0.17684887459807075,
"grad_norm": 3.1228713989257812,
"learning_rate": 9.847364829118963e-06,
"loss": 0.4439,
"step": 55
},
{
"epoch": 0.18006430868167203,
"grad_norm": 5.72307014465332,
"learning_rate": 9.833252929587231e-06,
"loss": 0.4484,
"step": 56
},
{
"epoch": 0.1832797427652733,
"grad_norm": 7.6115336418151855,
"learning_rate": 9.818528219449705e-06,
"loss": 0.4642,
"step": 57
},
{
"epoch": 0.1864951768488746,
"grad_norm": 4.580008506774902,
"learning_rate": 9.803192565659898e-06,
"loss": 0.4289,
"step": 58
},
{
"epoch": 0.18971061093247588,
"grad_norm": 4.601083278656006,
"learning_rate": 9.78724791263318e-06,
"loss": 0.416,
"step": 59
},
{
"epoch": 0.19292604501607716,
"grad_norm": 5.066440105438232,
"learning_rate": 9.770696282000245e-06,
"loss": 0.4083,
"step": 60
},
{
"epoch": 0.19614147909967847,
"grad_norm": 4.051520824432373,
"learning_rate": 9.753539772350792e-06,
"loss": 0.4177,
"step": 61
},
{
"epoch": 0.19935691318327975,
"grad_norm": 3.406569242477417,
"learning_rate": 9.735780558967434e-06,
"loss": 0.4328,
"step": 62
},
{
"epoch": 0.20257234726688103,
"grad_norm": 4.951582908630371,
"learning_rate": 9.717420893549902e-06,
"loss": 0.424,
"step": 63
},
{
"epoch": 0.2057877813504823,
"grad_norm": 2.5664072036743164,
"learning_rate": 9.698463103929542e-06,
"loss": 0.4254,
"step": 64
},
{
"epoch": 0.2090032154340836,
"grad_norm": 2.8900935649871826,
"learning_rate": 9.67890959377418e-06,
"loss": 0.4202,
"step": 65
},
{
"epoch": 0.21221864951768488,
"grad_norm": 3.5653440952301025,
"learning_rate": 9.658762842283343e-06,
"loss": 0.397,
"step": 66
},
{
"epoch": 0.21543408360128619,
"grad_norm": 2.600797414779663,
"learning_rate": 9.638025403873939e-06,
"loss": 0.3912,
"step": 67
},
{
"epoch": 0.21864951768488747,
"grad_norm": 7.3537397384643555,
"learning_rate": 9.616699907856368e-06,
"loss": 0.3758,
"step": 68
},
{
"epoch": 0.22186495176848875,
"grad_norm": 5.298691272735596,
"learning_rate": 9.594789058101154e-06,
"loss": 0.4368,
"step": 69
},
{
"epoch": 0.22508038585209003,
"grad_norm": 2.9018328189849854,
"learning_rate": 9.57229563269612e-06,
"loss": 0.4067,
"step": 70
},
{
"epoch": 0.2282958199356913,
"grad_norm": 2.2843472957611084,
"learning_rate": 9.549222483594154e-06,
"loss": 0.3884,
"step": 71
},
{
"epoch": 0.2315112540192926,
"grad_norm": 4.242974281311035,
"learning_rate": 9.525572536251608e-06,
"loss": 0.3895,
"step": 72
},
{
"epoch": 0.2347266881028939,
"grad_norm": 3.73633074760437,
"learning_rate": 9.501348789257373e-06,
"loss": 0.408,
"step": 73
},
{
"epoch": 0.2379421221864952,
"grad_norm": 7.407820224761963,
"learning_rate": 9.476554313952697e-06,
"loss": 0.3862,
"step": 74
},
{
"epoch": 0.24115755627009647,
"grad_norm": 4.708957672119141,
"learning_rate": 9.451192254041759e-06,
"loss": 0.4149,
"step": 75
},
{
"epoch": 0.24437299035369775,
"grad_norm": 6.709017276763916,
"learning_rate": 9.425265825193077e-06,
"loss": 0.38,
"step": 76
},
{
"epoch": 0.24758842443729903,
"grad_norm": 2.1756374835968018,
"learning_rate": 9.398778314631801e-06,
"loss": 0.3799,
"step": 77
},
{
"epoch": 0.2508038585209003,
"grad_norm": 3.1432833671569824,
"learning_rate": 9.371733080722911e-06,
"loss": 0.3882,
"step": 78
},
{
"epoch": 0.2540192926045016,
"grad_norm": 2.0718796253204346,
"learning_rate": 9.34413355254542e-06,
"loss": 0.4123,
"step": 79
},
{
"epoch": 0.2572347266881029,
"grad_norm": 3.231426954269409,
"learning_rate": 9.31598322945759e-06,
"loss": 0.3627,
"step": 80
},
{
"epoch": 0.2604501607717042,
"grad_norm": 3.3169357776641846,
"learning_rate": 9.287285680653254e-06,
"loss": 0.3747,
"step": 81
},
{
"epoch": 0.26366559485530544,
"grad_norm": 1.9841314554214478,
"learning_rate": 9.258044544709276e-06,
"loss": 0.399,
"step": 82
},
{
"epoch": 0.26688102893890675,
"grad_norm": 2.6931841373443604,
"learning_rate": 9.228263529124199e-06,
"loss": 0.3995,
"step": 83
},
{
"epoch": 0.27009646302250806,
"grad_norm": 2.48873233795166,
"learning_rate": 9.197946409848196e-06,
"loss": 0.4221,
"step": 84
},
{
"epoch": 0.2733118971061093,
"grad_norm": 20.441673278808594,
"learning_rate": 9.167097030804289e-06,
"loss": 0.3649,
"step": 85
},
{
"epoch": 0.2765273311897106,
"grad_norm": 2.6681602001190186,
"learning_rate": 9.135719303400995e-06,
"loss": 0.3638,
"step": 86
},
{
"epoch": 0.2797427652733119,
"grad_norm": 4.435401439666748,
"learning_rate": 9.103817206036383e-06,
"loss": 0.3722,
"step": 87
},
{
"epoch": 0.2829581993569132,
"grad_norm": 5.914163589477539,
"learning_rate": 9.071394783593664e-06,
"loss": 0.3656,
"step": 88
},
{
"epoch": 0.2861736334405145,
"grad_norm": 6.216729640960693,
"learning_rate": 9.038456146928325e-06,
"loss": 0.3916,
"step": 89
},
{
"epoch": 0.28938906752411575,
"grad_norm": 2.873570442199707,
"learning_rate": 9.005005472346923e-06,
"loss": 0.3903,
"step": 90
},
{
"epoch": 0.29260450160771706,
"grad_norm": 4.470005035400391,
"learning_rate": 8.971047001077561e-06,
"loss": 0.3987,
"step": 91
},
{
"epoch": 0.2958199356913183,
"grad_norm": 2.5284571647644043,
"learning_rate": 8.936585038732143e-06,
"loss": 0.4044,
"step": 92
},
{
"epoch": 0.2990353697749196,
"grad_norm": 2.339695692062378,
"learning_rate": 8.90162395476046e-06,
"loss": 0.3858,
"step": 93
},
{
"epoch": 0.3022508038585209,
"grad_norm": 2.0064709186553955,
"learning_rate": 8.866168181896198e-06,
"loss": 0.4002,
"step": 94
},
{
"epoch": 0.3054662379421222,
"grad_norm": 3.07234525680542,
"learning_rate": 8.83022221559489e-06,
"loss": 0.375,
"step": 95
},
{
"epoch": 0.3086816720257235,
"grad_norm": 2.4521424770355225,
"learning_rate": 8.793790613463956e-06,
"loss": 0.3549,
"step": 96
},
{
"epoch": 0.31189710610932475,
"grad_norm": 2.3006341457366943,
"learning_rate": 8.756877994684818e-06,
"loss": 0.3798,
"step": 97
},
{
"epoch": 0.31511254019292606,
"grad_norm": 3.3463075160980225,
"learning_rate": 8.719489039427256e-06,
"loss": 0.3871,
"step": 98
},
{
"epoch": 0.3183279742765273,
"grad_norm": 2.5349507331848145,
"learning_rate": 8.681628488255986e-06,
"loss": 0.4025,
"step": 99
},
{
"epoch": 0.3215434083601286,
"grad_norm": 2.8825855255126953,
"learning_rate": 8.643301141529619e-06,
"loss": 0.3998,
"step": 100
},
{
"epoch": 0.3247588424437299,
"grad_norm": 4.388237953186035,
"learning_rate": 8.604511858792006e-06,
"loss": 0.3714,
"step": 101
},
{
"epoch": 0.3279742765273312,
"grad_norm": 2.6666557788848877,
"learning_rate": 8.565265558156101e-06,
"loss": 0.3509,
"step": 102
},
{
"epoch": 0.3311897106109325,
"grad_norm": 2.7230324745178223,
"learning_rate": 8.525567215680397e-06,
"loss": 0.366,
"step": 103
},
{
"epoch": 0.33440514469453375,
"grad_norm": 2.4554688930511475,
"learning_rate": 8.485421864737997e-06,
"loss": 0.3919,
"step": 104
},
{
"epoch": 0.33762057877813506,
"grad_norm": 7.866596221923828,
"learning_rate": 8.444834595378434e-06,
"loss": 0.3623,
"step": 105
},
{
"epoch": 0.3408360128617363,
"grad_norm": 2.528653144836426,
"learning_rate": 8.403810553682307e-06,
"loss": 0.3758,
"step": 106
},
{
"epoch": 0.3440514469453376,
"grad_norm": 2.836378335952759,
"learning_rate": 8.362354941108803e-06,
"loss": 0.3456,
"step": 107
},
{
"epoch": 0.34726688102893893,
"grad_norm": 1.8620100021362305,
"learning_rate": 8.320473013836197e-06,
"loss": 0.3754,
"step": 108
},
{
"epoch": 0.3504823151125402,
"grad_norm": 2.056680679321289,
"learning_rate": 8.278170082095422e-06,
"loss": 0.3858,
"step": 109
},
{
"epoch": 0.3536977491961415,
"grad_norm": 1.9714686870574951,
"learning_rate": 8.23545150949679e-06,
"loss": 0.3941,
"step": 110
},
{
"epoch": 0.35691318327974275,
"grad_norm": 2.2530500888824463,
"learning_rate": 8.192322712349917e-06,
"loss": 0.3712,
"step": 111
},
{
"epoch": 0.36012861736334406,
"grad_norm": 1.7236007452011108,
"learning_rate": 8.148789158977012e-06,
"loss": 0.3532,
"step": 112
},
{
"epoch": 0.3633440514469453,
"grad_norm": 1.8990964889526367,
"learning_rate": 8.104856369019525e-06,
"loss": 0.3801,
"step": 113
},
{
"epoch": 0.3665594855305466,
"grad_norm": 5.287169933319092,
"learning_rate": 8.060529912738316e-06,
"loss": 0.3594,
"step": 114
},
{
"epoch": 0.36977491961414793,
"grad_norm": 2.917484998703003,
"learning_rate": 8.0158154103074e-06,
"loss": 0.3696,
"step": 115
},
{
"epoch": 0.3729903536977492,
"grad_norm": 2.5253026485443115,
"learning_rate": 7.970718531101365e-06,
"loss": 0.3553,
"step": 116
},
{
"epoch": 0.3762057877813505,
"grad_norm": 2.7132797241210938,
"learning_rate": 7.925244992976538e-06,
"loss": 0.3775,
"step": 117
},
{
"epoch": 0.37942122186495175,
"grad_norm": 5.237837791442871,
"learning_rate": 7.879400561546033e-06,
"loss": 0.3591,
"step": 118
},
{
"epoch": 0.38263665594855306,
"grad_norm": 2.0805959701538086,
"learning_rate": 7.833191049448706e-06,
"loss": 0.3723,
"step": 119
},
{
"epoch": 0.3858520900321543,
"grad_norm": 1.8187751770019531,
"learning_rate": 7.786622315612182e-06,
"loss": 0.3566,
"step": 120
},
{
"epoch": 0.3890675241157556,
"grad_norm": 2.222515821456909,
"learning_rate": 7.739700264509993e-06,
"loss": 0.3809,
"step": 121
},
{
"epoch": 0.39228295819935693,
"grad_norm": 8.328165054321289,
"learning_rate": 7.692430845412946e-06,
"loss": 0.3707,
"step": 122
},
{
"epoch": 0.3954983922829582,
"grad_norm": 2.218949317932129,
"learning_rate": 7.644820051634813e-06,
"loss": 0.3642,
"step": 123
},
{
"epoch": 0.3987138263665595,
"grad_norm": 1.9735389947891235,
"learning_rate": 7.596873919772438e-06,
"loss": 0.3605,
"step": 124
},
{
"epoch": 0.40192926045016075,
"grad_norm": 3.412888526916504,
"learning_rate": 7.548598528940354e-06,
"loss": 0.3648,
"step": 125
},
{
"epoch": 0.40514469453376206,
"grad_norm": 4.399238109588623,
"learning_rate": 7.500000000000001e-06,
"loss": 0.3735,
"step": 126
},
{
"epoch": 0.40836012861736337,
"grad_norm": 1.8429063558578491,
"learning_rate": 7.451084494783668e-06,
"loss": 0.3775,
"step": 127
},
{
"epoch": 0.4115755627009646,
"grad_norm": 2.099372386932373,
"learning_rate": 7.401858215313228e-06,
"loss": 0.3646,
"step": 128
},
{
"epoch": 0.41479099678456594,
"grad_norm": 2.8833494186401367,
"learning_rate": 7.352327403013779e-06,
"loss": 0.3752,
"step": 129
},
{
"epoch": 0.4180064308681672,
"grad_norm": 2.006443500518799,
"learning_rate": 7.302498337922293e-06,
"loss": 0.3567,
"step": 130
},
{
"epoch": 0.4212218649517685,
"grad_norm": 2.024747371673584,
"learning_rate": 7.2523773378913655e-06,
"loss": 0.3623,
"step": 131
},
{
"epoch": 0.42443729903536975,
"grad_norm": 1.9539835453033447,
"learning_rate": 7.201970757788172e-06,
"loss": 0.3709,
"step": 132
},
{
"epoch": 0.42765273311897106,
"grad_norm": 1.9126152992248535,
"learning_rate": 7.151284988688731e-06,
"loss": 0.3518,
"step": 133
},
{
"epoch": 0.43086816720257237,
"grad_norm": 1.9806180000305176,
"learning_rate": 7.100326457067576e-06,
"loss": 0.3623,
"step": 134
},
{
"epoch": 0.4340836012861736,
"grad_norm": 4.260410785675049,
"learning_rate": 7.049101623982938e-06,
"loss": 0.3518,
"step": 135
},
{
"epoch": 0.43729903536977494,
"grad_norm": 2.0884203910827637,
"learning_rate": 6.9976169842575526e-06,
"loss": 0.3812,
"step": 136
},
{
"epoch": 0.4405144694533762,
"grad_norm": 4.204238414764404,
"learning_rate": 6.945879065655164e-06,
"loss": 0.3615,
"step": 137
},
{
"epoch": 0.4437299035369775,
"grad_norm": 1.9794977903366089,
"learning_rate": 6.893894428052881e-06,
"loss": 0.3898,
"step": 138
},
{
"epoch": 0.44694533762057875,
"grad_norm": 2.9515440464019775,
"learning_rate": 6.841669662609437e-06,
"loss": 0.3437,
"step": 139
},
{
"epoch": 0.45016077170418006,
"grad_norm": 2.980576992034912,
"learning_rate": 6.789211390929497e-06,
"loss": 0.3523,
"step": 140
},
{
"epoch": 0.4533762057877814,
"grad_norm": 4.675036907196045,
"learning_rate": 6.736526264224101e-06,
"loss": 0.3738,
"step": 141
},
{
"epoch": 0.4565916398713826,
"grad_norm": 3.4226956367492676,
"learning_rate": 6.6836209624673575e-06,
"loss": 0.3726,
"step": 142
},
{
"epoch": 0.45980707395498394,
"grad_norm": 2.1817691326141357,
"learning_rate": 6.6305021935494755e-06,
"loss": 0.3322,
"step": 143
},
{
"epoch": 0.4630225080385852,
"grad_norm": 2.0901007652282715,
"learning_rate": 6.5771766924262795e-06,
"loss": 0.3328,
"step": 144
},
{
"epoch": 0.4662379421221865,
"grad_norm": 1.8397691249847412,
"learning_rate": 6.523651220265269e-06,
"loss": 0.3492,
"step": 145
},
{
"epoch": 0.4694533762057878,
"grad_norm": 2.156468391418457,
"learning_rate": 6.469932563588386e-06,
"loss": 0.3362,
"step": 146
},
{
"epoch": 0.47266881028938906,
"grad_norm": 2.963684320449829,
"learning_rate": 6.41602753341152e-06,
"loss": 0.3438,
"step": 147
},
{
"epoch": 0.4758842443729904,
"grad_norm": 1.6273006200790405,
"learning_rate": 6.361942964380967e-06,
"loss": 0.3434,
"step": 148
},
{
"epoch": 0.4790996784565916,
"grad_norm": 2.0362226963043213,
"learning_rate": 6.307685713906835e-06,
"loss": 0.3487,
"step": 149
},
{
"epoch": 0.48231511254019294,
"grad_norm": 1.889363169670105,
"learning_rate": 6.2532626612936035e-06,
"loss": 0.3335,
"step": 150
},
{
"epoch": 0.4855305466237942,
"grad_norm": 5.194770336151123,
"learning_rate": 6.1986807068678926e-06,
"loss": 0.3578,
"step": 151
},
{
"epoch": 0.4887459807073955,
"grad_norm": 2.6607186794281006,
"learning_rate": 6.143946771103561e-06,
"loss": 0.3585,
"step": 152
},
{
"epoch": 0.4919614147909968,
"grad_norm": 1.6458920240402222,
"learning_rate": 6.089067793744258e-06,
"loss": 0.3163,
"step": 153
},
{
"epoch": 0.49517684887459806,
"grad_norm": 1.745608925819397,
"learning_rate": 6.034050732923538e-06,
"loss": 0.3513,
"step": 154
},
{
"epoch": 0.4983922829581994,
"grad_norm": 2.75510835647583,
"learning_rate": 5.978902564282616e-06,
"loss": 0.3436,
"step": 155
},
{
"epoch": 0.5016077170418006,
"grad_norm": 3.112760543823242,
"learning_rate": 5.923630280085948e-06,
"loss": 0.3321,
"step": 156
},
{
"epoch": 0.5048231511254019,
"grad_norm": 1.7312897443771362,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.3464,
"step": 157
},
{
"epoch": 0.5080385852090032,
"grad_norm": 3.926618814468384,
"learning_rate": 5.8127414118779825e-06,
"loss": 0.366,
"step": 158
},
{
"epoch": 0.5112540192926045,
"grad_norm": 2.1648647785186768,
"learning_rate": 5.757138887522884e-06,
"loss": 0.3592,
"step": 159
},
{
"epoch": 0.5144694533762058,
"grad_norm": 3.066451072692871,
"learning_rate": 5.701440365141799e-06,
"loss": 0.3374,
"step": 160
},
{
"epoch": 0.5176848874598071,
"grad_norm": 1.5853915214538574,
"learning_rate": 5.645652906778808e-06,
"loss": 0.3354,
"step": 161
},
{
"epoch": 0.5209003215434084,
"grad_norm": 1.4135924577713013,
"learning_rate": 5.5897835857542315e-06,
"loss": 0.3402,
"step": 162
},
{
"epoch": 0.5241157556270096,
"grad_norm": 2.379409074783325,
"learning_rate": 5.533839485767795e-06,
"loss": 0.349,
"step": 163
},
{
"epoch": 0.5273311897106109,
"grad_norm": 1.670160174369812,
"learning_rate": 5.477827700000492e-06,
"loss": 0.3314,
"step": 164
},
{
"epoch": 0.5305466237942122,
"grad_norm": 2.1935579776763916,
"learning_rate": 5.421755330215223e-06,
"loss": 0.3147,
"step": 165
},
{
"epoch": 0.5337620578778135,
"grad_norm": 3.3353383541107178,
"learning_rate": 5.365629485856381e-06,
"loss": 0.3427,
"step": 166
},
{
"epoch": 0.5369774919614148,
"grad_norm": 1.6737383604049683,
"learning_rate": 5.30945728314841e-06,
"loss": 0.3091,
"step": 167
},
{
"epoch": 0.5401929260450161,
"grad_norm": 1.815943717956543,
"learning_rate": 5.253245844193564e-06,
"loss": 0.3197,
"step": 168
},
{
"epoch": 0.5434083601286174,
"grad_norm": 2.064694404602051,
"learning_rate": 5.197002296068878e-06,
"loss": 0.3491,
"step": 169
},
{
"epoch": 0.5466237942122186,
"grad_norm": 2.5412817001342773,
"learning_rate": 5.140733769922525e-06,
"loss": 0.3323,
"step": 170
},
{
"epoch": 0.5498392282958199,
"grad_norm": 1.7079787254333496,
"learning_rate": 5.084447400069656e-06,
"loss": 0.3382,
"step": 171
},
{
"epoch": 0.5530546623794212,
"grad_norm": 1.8138501644134521,
"learning_rate": 5.0281503230878304e-06,
"loss": 0.3424,
"step": 172
},
{
"epoch": 0.5562700964630225,
"grad_norm": 3.9634087085723877,
"learning_rate": 4.971849676912172e-06,
"loss": 0.3357,
"step": 173
},
{
"epoch": 0.5594855305466238,
"grad_norm": 2.114734172821045,
"learning_rate": 4.915552599930345e-06,
"loss": 0.3544,
"step": 174
},
{
"epoch": 0.5627009646302251,
"grad_norm": 1.3108409643173218,
"learning_rate": 4.859266230077474e-06,
"loss": 0.3134,
"step": 175
},
{
"epoch": 0.5659163987138264,
"grad_norm": 1.882356882095337,
"learning_rate": 4.802997703931124e-06,
"loss": 0.3472,
"step": 176
},
{
"epoch": 0.5691318327974276,
"grad_norm": 3.0320799350738525,
"learning_rate": 4.746754155806437e-06,
"loss": 0.3484,
"step": 177
},
{
"epoch": 0.572347266881029,
"grad_norm": 1.5372339487075806,
"learning_rate": 4.6905427168515914e-06,
"loss": 0.3511,
"step": 178
},
{
"epoch": 0.5755627009646302,
"grad_norm": 2.2504475116729736,
"learning_rate": 4.63437051414362e-06,
"loss": 0.3823,
"step": 179
},
{
"epoch": 0.5787781350482315,
"grad_norm": 2.124473810195923,
"learning_rate": 4.5782446697847775e-06,
"loss": 0.3537,
"step": 180
},
{
"epoch": 0.5819935691318328,
"grad_norm": 1.7836929559707642,
"learning_rate": 4.52217229999951e-06,
"loss": 0.3281,
"step": 181
},
{
"epoch": 0.5852090032154341,
"grad_norm": 1.819919228553772,
"learning_rate": 4.466160514232206e-06,
"loss": 0.3307,
"step": 182
},
{
"epoch": 0.5884244372990354,
"grad_norm": 2.2056925296783447,
"learning_rate": 4.410216414245771e-06,
"loss": 0.3289,
"step": 183
},
{
"epoch": 0.5916398713826366,
"grad_norm": 1.8819239139556885,
"learning_rate": 4.354347093221194e-06,
"loss": 0.3139,
"step": 184
},
{
"epoch": 0.594855305466238,
"grad_norm": 1.8985276222229004,
"learning_rate": 4.298559634858202e-06,
"loss": 0.3249,
"step": 185
},
{
"epoch": 0.5980707395498392,
"grad_norm": 3.259624481201172,
"learning_rate": 4.2428611124771184e-06,
"loss": 0.3566,
"step": 186
},
{
"epoch": 0.6012861736334405,
"grad_norm": 2.25924015045166,
"learning_rate": 4.187258588122019e-06,
"loss": 0.3359,
"step": 187
},
{
"epoch": 0.6045016077170418,
"grad_norm": 1.6317423582077026,
"learning_rate": 4.131759111665349e-06,
"loss": 0.3231,
"step": 188
},
{
"epoch": 0.6077170418006431,
"grad_norm": 1.4268584251403809,
"learning_rate": 4.076369719914055e-06,
"loss": 0.3621,
"step": 189
},
{
"epoch": 0.6109324758842444,
"grad_norm": 1.7488436698913574,
"learning_rate": 4.021097435717386e-06,
"loss": 0.3263,
"step": 190
},
{
"epoch": 0.6141479099678456,
"grad_norm": 2.6326963901519775,
"learning_rate": 3.965949267076465e-06,
"loss": 0.3376,
"step": 191
},
{
"epoch": 0.617363344051447,
"grad_norm": 1.5136549472808838,
"learning_rate": 3.910932206255742e-06,
"loss": 0.3161,
"step": 192
},
{
"epoch": 0.6205787781350482,
"grad_norm": 1.4793072938919067,
"learning_rate": 3.856053228896442e-06,
"loss": 0.3241,
"step": 193
},
{
"epoch": 0.6237942122186495,
"grad_norm": 2.289064884185791,
"learning_rate": 3.8013192931321095e-06,
"loss": 0.3207,
"step": 194
},
{
"epoch": 0.6270096463022508,
"grad_norm": 1.8162267208099365,
"learning_rate": 3.7467373387063973e-06,
"loss": 0.3242,
"step": 195
},
{
"epoch": 0.6302250803858521,
"grad_norm": 1.8329249620437622,
"learning_rate": 3.692314286093167e-06,
"loss": 0.3248,
"step": 196
},
{
"epoch": 0.6334405144694534,
"grad_norm": 1.6766780614852905,
"learning_rate": 3.6380570356190346e-06,
"loss": 0.3291,
"step": 197
},
{
"epoch": 0.6366559485530546,
"grad_norm": 66.03868865966797,
"learning_rate": 3.58397246658848e-06,
"loss": 0.3078,
"step": 198
},
{
"epoch": 0.639871382636656,
"grad_norm": 2.198519706726074,
"learning_rate": 3.5300674364116173e-06,
"loss": 0.3197,
"step": 199
},
{
"epoch": 0.6430868167202572,
"grad_norm": 2.2276482582092285,
"learning_rate": 3.476348779734732e-06,
"loss": 0.3141,
"step": 200
},
{
"epoch": 0.6463022508038585,
"grad_norm": 1.4550248384475708,
"learning_rate": 3.4228233075737225e-06,
"loss": 0.3327,
"step": 201
},
{
"epoch": 0.6495176848874598,
"grad_norm": 1.832467794418335,
"learning_rate": 3.3694978064505258e-06,
"loss": 0.3196,
"step": 202
},
{
"epoch": 0.6527331189710611,
"grad_norm": 1.8890045881271362,
"learning_rate": 3.316379037532644e-06,
"loss": 0.355,
"step": 203
},
{
"epoch": 0.6559485530546624,
"grad_norm": 2.667874813079834,
"learning_rate": 3.2634737357758994e-06,
"loss": 0.3358,
"step": 204
},
{
"epoch": 0.6591639871382636,
"grad_norm": 1.426277756690979,
"learning_rate": 3.2107886090705035e-06,
"loss": 0.3134,
"step": 205
},
{
"epoch": 0.662379421221865,
"grad_norm": 2.3840363025665283,
"learning_rate": 3.158330337390565e-06,
"loss": 0.3144,
"step": 206
},
{
"epoch": 0.6655948553054662,
"grad_norm": 1.9086871147155762,
"learning_rate": 3.10610557194712e-06,
"loss": 0.3043,
"step": 207
},
{
"epoch": 0.6688102893890675,
"grad_norm": 1.7807660102844238,
"learning_rate": 3.0541209343448373e-06,
"loss": 0.3121,
"step": 208
},
{
"epoch": 0.6720257234726688,
"grad_norm": 1.6139692068099976,
"learning_rate": 3.0023830157424504e-06,
"loss": 0.3454,
"step": 209
},
{
"epoch": 0.6752411575562701,
"grad_norm": 1.9122083187103271,
"learning_rate": 2.950898376017064e-06,
"loss": 0.3175,
"step": 210
},
{
"epoch": 0.6784565916398714,
"grad_norm": 2.084561586380005,
"learning_rate": 2.8996735429324256e-06,
"loss": 0.319,
"step": 211
},
{
"epoch": 0.6816720257234726,
"grad_norm": 1.411837100982666,
"learning_rate": 2.848715011311271e-06,
"loss": 0.3085,
"step": 212
},
{
"epoch": 0.684887459807074,
"grad_norm": 1.4443124532699585,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.3366,
"step": 213
},
{
"epoch": 0.6881028938906752,
"grad_norm": 1.447710394859314,
"learning_rate": 2.7476226621086354e-06,
"loss": 0.3171,
"step": 214
},
{
"epoch": 0.6913183279742765,
"grad_norm": 1.3682136535644531,
"learning_rate": 2.697501662077707e-06,
"loss": 0.3158,
"step": 215
},
{
"epoch": 0.6945337620578779,
"grad_norm": 8.954407691955566,
"learning_rate": 2.6476725969862227e-06,
"loss": 0.3474,
"step": 216
},
{
"epoch": 0.6977491961414791,
"grad_norm": 1.565764307975769,
"learning_rate": 2.5981417846867753e-06,
"loss": 0.3287,
"step": 217
},
{
"epoch": 0.7009646302250804,
"grad_norm": 1.5535356998443604,
"learning_rate": 2.548915505216333e-06,
"loss": 0.3206,
"step": 218
},
{
"epoch": 0.7041800643086816,
"grad_norm": 1.5272207260131836,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.3228,
"step": 219
},
{
"epoch": 0.707395498392283,
"grad_norm": 1.4562255144119263,
"learning_rate": 2.4514014710596467e-06,
"loss": 0.296,
"step": 220
},
{
"epoch": 0.7106109324758842,
"grad_norm": 1.6484540700912476,
"learning_rate": 2.4031260802275623e-06,
"loss": 0.3358,
"step": 221
},
{
"epoch": 0.7138263665594855,
"grad_norm": 1.4094409942626953,
"learning_rate": 2.3551799483651894e-06,
"loss": 0.3332,
"step": 222
},
{
"epoch": 0.7170418006430869,
"grad_norm": 1.8686710596084595,
"learning_rate": 2.307569154587056e-06,
"loss": 0.3356,
"step": 223
},
{
"epoch": 0.7202572347266881,
"grad_norm": 1.5808024406433105,
"learning_rate": 2.2602997354900075e-06,
"loss": 0.315,
"step": 224
},
{
"epoch": 0.7234726688102894,
"grad_norm": 1.5298748016357422,
"learning_rate": 2.2133776843878185e-06,
"loss": 0.3355,
"step": 225
},
{
"epoch": 0.7266881028938906,
"grad_norm": 12.085217475891113,
"learning_rate": 2.166808950551296e-06,
"loss": 0.3301,
"step": 226
},
{
"epoch": 0.729903536977492,
"grad_norm": 1.5451043844223022,
"learning_rate": 2.120599438453968e-06,
"loss": 0.3321,
"step": 227
},
{
"epoch": 0.7331189710610932,
"grad_norm": 1.8228951692581177,
"learning_rate": 2.074755007023461e-06,
"loss": 0.3074,
"step": 228
},
{
"epoch": 0.7363344051446945,
"grad_norm": 1.5614581108093262,
"learning_rate": 2.0292814688986375e-06,
"loss": 0.342,
"step": 229
},
{
"epoch": 0.7395498392282959,
"grad_norm": 1.4238361120224,
"learning_rate": 1.9841845896926022e-06,
"loss": 0.3261,
"step": 230
},
{
"epoch": 0.7427652733118971,
"grad_norm": 1.7577193975448608,
"learning_rate": 1.9394700872616856e-06,
"loss": 0.3312,
"step": 231
},
{
"epoch": 0.7459807073954984,
"grad_norm": 2.6132872104644775,
"learning_rate": 1.8951436309804766e-06,
"loss": 0.341,
"step": 232
},
{
"epoch": 0.7491961414790996,
"grad_norm": 1.9483258724212646,
"learning_rate": 1.8512108410229878e-06,
"loss": 0.3039,
"step": 233
},
{
"epoch": 0.752411575562701,
"grad_norm": 1.289170742034912,
"learning_rate": 1.8076772876500831e-06,
"loss": 0.3003,
"step": 234
},
{
"epoch": 0.7556270096463023,
"grad_norm": 2.7016942501068115,
"learning_rate": 1.7645484905032129e-06,
"loss": 0.3283,
"step": 235
},
{
"epoch": 0.7588424437299035,
"grad_norm": 1.3249751329421997,
"learning_rate": 1.7218299179045789e-06,
"loss": 0.3128,
"step": 236
},
{
"epoch": 0.7620578778135049,
"grad_norm": 2.7184793949127197,
"learning_rate": 1.6795269861638041e-06,
"loss": 0.3338,
"step": 237
},
{
"epoch": 0.7652733118971061,
"grad_norm": 1.27732253074646,
"learning_rate": 1.6376450588911985e-06,
"loss": 0.2865,
"step": 238
},
{
"epoch": 0.7684887459807074,
"grad_norm": 1.7141942977905273,
"learning_rate": 1.5961894463176942e-06,
"loss": 0.3154,
"step": 239
},
{
"epoch": 0.7717041800643086,
"grad_norm": 4.537605285644531,
"learning_rate": 1.555165404621567e-06,
"loss": 0.3207,
"step": 240
},
{
"epoch": 0.77491961414791,
"grad_norm": 1.5014591217041016,
"learning_rate": 1.5145781352620054e-06,
"loss": 0.3403,
"step": 241
},
{
"epoch": 0.7781350482315113,
"grad_norm": 1.7332754135131836,
"learning_rate": 1.4744327843196043e-06,
"loss": 0.2983,
"step": 242
},
{
"epoch": 0.7813504823151125,
"grad_norm": 1.8262887001037598,
"learning_rate": 1.434734441843899e-06,
"loss": 0.3052,
"step": 243
},
{
"epoch": 0.7845659163987139,
"grad_norm": 3.564021348953247,
"learning_rate": 1.3954881412079945e-06,
"loss": 0.3155,
"step": 244
},
{
"epoch": 0.7877813504823151,
"grad_norm": 1.3386446237564087,
"learning_rate": 1.3566988584703817e-06,
"loss": 0.29,
"step": 245
},
{
"epoch": 0.7909967845659164,
"grad_norm": 14.863595962524414,
"learning_rate": 1.3183715117440143e-06,
"loss": 0.303,
"step": 246
},
{
"epoch": 0.7942122186495176,
"grad_norm": 10.220840454101562,
"learning_rate": 1.280510960572745e-06,
"loss": 0.3258,
"step": 247
},
{
"epoch": 0.797427652733119,
"grad_norm": 1.610312819480896,
"learning_rate": 1.2431220053151832e-06,
"loss": 0.3301,
"step": 248
},
{
"epoch": 0.8006430868167203,
"grad_norm": 1.4349790811538696,
"learning_rate": 1.2062093865360458e-06,
"loss": 0.2993,
"step": 249
},
{
"epoch": 0.8038585209003215,
"grad_norm": 3.160371780395508,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.3119,
"step": 250
},
{
"epoch": 0.8070739549839229,
"grad_norm": 1.5855354070663452,
"learning_rate": 1.1338318181038037e-06,
"loss": 0.3017,
"step": 251
},
{
"epoch": 0.8102893890675241,
"grad_norm": 2.18475341796875,
"learning_rate": 1.0983760452395415e-06,
"loss": 0.3205,
"step": 252
},
{
"epoch": 0.8135048231511254,
"grad_norm": 1.4034461975097656,
"learning_rate": 1.063414961267859e-06,
"loss": 0.3265,
"step": 253
},
{
"epoch": 0.8167202572347267,
"grad_norm": 1.7966201305389404,
"learning_rate": 1.02895299892244e-06,
"loss": 0.3048,
"step": 254
},
{
"epoch": 0.819935691318328,
"grad_norm": 1.4181287288665771,
"learning_rate": 9.949945276530782e-07,
"loss": 0.327,
"step": 255
},
{
"epoch": 0.8231511254019293,
"grad_norm": 1.8698980808258057,
"learning_rate": 9.615438530716753e-07,
"loss": 0.304,
"step": 256
},
{
"epoch": 0.8263665594855305,
"grad_norm": 1.4744179248809814,
"learning_rate": 9.286052164063369e-07,
"loss": 0.3335,
"step": 257
},
{
"epoch": 0.8295819935691319,
"grad_norm": 1.7521005868911743,
"learning_rate": 8.961827939636198e-07,
"loss": 0.3438,
"step": 258
},
{
"epoch": 0.8327974276527331,
"grad_norm": 2.057781934738159,
"learning_rate": 8.64280696599008e-07,
"loss": 0.3156,
"step": 259
},
{
"epoch": 0.8360128617363344,
"grad_norm": 1.6581817865371704,
"learning_rate": 8.329029691957124e-07,
"loss": 0.3126,
"step": 260
},
{
"epoch": 0.8392282958199357,
"grad_norm": 2.1192777156829834,
"learning_rate": 8.02053590151805e-07,
"loss": 0.3188,
"step": 261
},
{
"epoch": 0.842443729903537,
"grad_norm": 1.4133118391036987,
"learning_rate": 7.717364708758024e-07,
"loss": 0.3211,
"step": 262
},
{
"epoch": 0.8456591639871383,
"grad_norm": 1.5940773487091064,
"learning_rate": 7.41955455290726e-07,
"loss": 0.2978,
"step": 263
},
{
"epoch": 0.8488745980707395,
"grad_norm": 2.956995725631714,
"learning_rate": 7.127143193467445e-07,
"loss": 0.3173,
"step": 264
},
{
"epoch": 0.8520900321543409,
"grad_norm": 2.5843443870544434,
"learning_rate": 6.840167705424106e-07,
"loss": 0.3002,
"step": 265
},
{
"epoch": 0.8553054662379421,
"grad_norm": 1.8771467208862305,
"learning_rate": 6.558664474545817e-07,
"loss": 0.3243,
"step": 266
},
{
"epoch": 0.8585209003215434,
"grad_norm": 2.241569757461548,
"learning_rate": 6.282669192770896e-07,
"loss": 0.2968,
"step": 267
},
{
"epoch": 0.8617363344051447,
"grad_norm": 1.668226957321167,
"learning_rate": 6.012216853682001e-07,
"loss": 0.32,
"step": 268
},
{
"epoch": 0.864951768488746,
"grad_norm": 1.5583616495132446,
"learning_rate": 5.747341748069229e-07,
"loss": 0.309,
"step": 269
},
{
"epoch": 0.8681672025723473,
"grad_norm": 1.433117389678955,
"learning_rate": 5.488077459582425e-07,
"loss": 0.3231,
"step": 270
},
{
"epoch": 0.8713826366559485,
"grad_norm": 1.5444004535675049,
"learning_rate": 5.234456860473042e-07,
"loss": 0.292,
"step": 271
},
{
"epoch": 0.8745980707395499,
"grad_norm": 1.4406189918518066,
"learning_rate": 4.986512107426283e-07,
"loss": 0.3043,
"step": 272
},
{
"epoch": 0.8778135048231511,
"grad_norm": 1.3344569206237793,
"learning_rate": 4.7442746374839363e-07,
"loss": 0.2818,
"step": 273
},
{
"epoch": 0.8810289389067524,
"grad_norm": 1.5688618421554565,
"learning_rate": 4.50777516405847e-07,
"loss": 0.295,
"step": 274
},
{
"epoch": 0.8842443729903537,
"grad_norm": 1.4727739095687866,
"learning_rate": 4.2770436730388166e-07,
"loss": 0.2951,
"step": 275
},
{
"epoch": 0.887459807073955,
"grad_norm": 2.1879146099090576,
"learning_rate": 4.05210941898847e-07,
"loss": 0.3191,
"step": 276
},
{
"epoch": 0.8906752411575563,
"grad_norm": 1.7236080169677734,
"learning_rate": 3.8330009214363197e-07,
"loss": 0.3118,
"step": 277
},
{
"epoch": 0.8938906752411575,
"grad_norm": 1.4538291692733765,
"learning_rate": 3.619745961260623e-07,
"loss": 0.3207,
"step": 278
},
{
"epoch": 0.8971061093247589,
"grad_norm": 1.4028717279434204,
"learning_rate": 3.4123715771665786e-07,
"loss": 0.3278,
"step": 279
},
{
"epoch": 0.9003215434083601,
"grad_norm": 1.755362629890442,
"learning_rate": 3.2109040622582186e-07,
"loss": 0.2798,
"step": 280
},
{
"epoch": 0.9035369774919614,
"grad_norm": 2.1135966777801514,
"learning_rate": 3.015368960704584e-07,
"loss": 0.307,
"step": 281
},
{
"epoch": 0.9067524115755627,
"grad_norm": 1.4916549921035767,
"learning_rate": 2.8257910645009935e-07,
"loss": 0.2861,
"step": 282
},
{
"epoch": 0.909967845659164,
"grad_norm": 1.6314096450805664,
"learning_rate": 2.6421944103256657e-07,
"loss": 0.3065,
"step": 283
},
{
"epoch": 0.9131832797427653,
"grad_norm": 2.6644771099090576,
"learning_rate": 2.4646022764920843e-07,
"loss": 0.3013,
"step": 284
},
{
"epoch": 0.9163987138263665,
"grad_norm": 1.4383426904678345,
"learning_rate": 2.2930371799975593e-07,
"loss": 0.309,
"step": 285
},
{
"epoch": 0.9196141479099679,
"grad_norm": 1.4383573532104492,
"learning_rate": 2.1275208736682262e-07,
"loss": 0.2973,
"step": 286
},
{
"epoch": 0.9228295819935691,
"grad_norm": 1.249383568763733,
"learning_rate": 1.9680743434010385e-07,
"loss": 0.307,
"step": 287
},
{
"epoch": 0.9260450160771704,
"grad_norm": 3.4106099605560303,
"learning_rate": 1.814717805502958e-07,
"loss": 0.3035,
"step": 288
},
{
"epoch": 0.9292604501607717,
"grad_norm": 2.925081729888916,
"learning_rate": 1.667470704127694e-07,
"loss": 0.2966,
"step": 289
},
{
"epoch": 0.932475884244373,
"grad_norm": 3.2131919860839844,
"learning_rate": 1.5263517088103862e-07,
"loss": 0.3021,
"step": 290
},
{
"epoch": 0.9356913183279743,
"grad_norm": 1.7186495065689087,
"learning_rate": 1.3913787121004717e-07,
"loss": 0.3164,
"step": 291
},
{
"epoch": 0.9389067524115756,
"grad_norm": 1.486461877822876,
"learning_rate": 1.2625688272930925e-07,
"loss": 0.3191,
"step": 292
},
{
"epoch": 0.9421221864951769,
"grad_norm": 1.584938406944275,
"learning_rate": 1.1399383862592928e-07,
"loss": 0.3005,
"step": 293
},
{
"epoch": 0.9453376205787781,
"grad_norm": 3.820082426071167,
"learning_rate": 1.0235029373752758e-07,
"loss": 0.3019,
"step": 294
},
{
"epoch": 0.9485530546623794,
"grad_norm": 1.3474429845809937,
"learning_rate": 9.132772435510362e-08,
"loss": 0.2809,
"step": 295
},
{
"epoch": 0.9517684887459807,
"grad_norm": 1.2956693172454834,
"learning_rate": 8.092752803585513e-08,
"loss": 0.2959,
"step": 296
},
{
"epoch": 0.954983922829582,
"grad_norm": 7.804889678955078,
"learning_rate": 7.115102342598101e-08,
"loss": 0.2872,
"step": 297
},
{
"epoch": 0.9581993569131833,
"grad_norm": 3.1022164821624756,
"learning_rate": 6.199945009349173e-08,
"loss": 0.3173,
"step": 298
},
{
"epoch": 0.9614147909967846,
"grad_norm": 4.706262111663818,
"learning_rate": 5.3473968371040575e-08,
"loss": 0.2896,
"step": 299
},
{
"epoch": 0.9646302250803859,
"grad_norm": 1.4467219114303589,
"learning_rate": 4.55756592088058e-08,
"loss": 0.2965,
"step": 300
},
{
"epoch": 0.9678456591639871,
"grad_norm": 1.4120930433273315,
"learning_rate": 3.8305524037438035e-08,
"loss": 0.3084,
"step": 301
},
{
"epoch": 0.9710610932475884,
"grad_norm": 1.5261682271957397,
"learning_rate": 3.166448464108629e-08,
"loss": 0.328,
"step": 302
},
{
"epoch": 0.9742765273311897,
"grad_norm": 1.4508821964263916,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.2849,
"step": 303
},
{
"epoch": 0.977491961414791,
"grad_norm": 6.040746212005615,
"learning_rate": 2.0272981386393332e-08,
"loss": 0.3468,
"step": 304
},
{
"epoch": 0.9807073954983923,
"grad_norm": 2.0974323749542236,
"learning_rate": 1.552396186256411e-08,
"loss": 0.2976,
"step": 305
},
{
"epoch": 0.9839228295819936,
"grad_norm": 1.473928451538086,
"learning_rate": 1.1406926599646373e-08,
"loss": 0.3228,
"step": 306
},
{
"epoch": 0.9871382636655949,
"grad_norm": 2.54904842376709,
"learning_rate": 7.922397598642551e-09,
"loss": 0.2999,
"step": 307
},
{
"epoch": 0.9903536977491961,
"grad_norm": 1.7829190492630005,
"learning_rate": 5.0708166647628345e-09,
"loss": 0.3042,
"step": 308
},
{
"epoch": 0.9935691318327974,
"grad_norm": 1.8181827068328857,
"learning_rate": 2.8525453514099966e-09,
"loss": 0.3057,
"step": 309
},
{
"epoch": 0.9967845659163987,
"grad_norm": 3.868682384490967,
"learning_rate": 1.2678649143349485e-09,
"loss": 0.3086,
"step": 310
},
{
"epoch": 1.0,
"grad_norm": 1.5854872465133667,
"learning_rate": 3.1697627597970794e-10,
"loss": 0.3017,
"step": 311
},
{
"epoch": 1.0,
"step": 311,
"total_flos": 3.149345964351816e+17,
"train_loss": 0.4079481167808606,
"train_runtime": 5827.269,
"train_samples_per_second": 3.415,
"train_steps_per_second": 0.053
}
],
"logging_steps": 1,
"max_steps": 311,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.149345964351816e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}