{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.008947845247016454,
"eval_steps": 500,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.1184806558770566e-05,
"grad_norm": 9.17889404296875,
"learning_rate": 0.00019999999993826567,
"loss": 4.6752,
"step": 1
},
{
"epoch": 2.236961311754113e-05,
"grad_norm": 15.023734092712402,
"learning_rate": 0.0001999999997530627,
"loss": 4.8816,
"step": 2
},
{
"epoch": 3.35544196763117e-05,
"grad_norm": 4.711775302886963,
"learning_rate": 0.00019999999944439107,
"loss": 4.3122,
"step": 3
},
{
"epoch": 4.473922623508226e-05,
"grad_norm": 5.1041460037231445,
"learning_rate": 0.0001999999990122508,
"loss": 4.0207,
"step": 4
},
{
"epoch": 5.5924032793852833e-05,
"grad_norm": 11.579492568969727,
"learning_rate": 0.0001999999984566419,
"loss": 3.656,
"step": 5
},
{
"epoch": 6.71088393526234e-05,
"grad_norm": 4.573488235473633,
"learning_rate": 0.00019999999777756431,
"loss": 3.3136,
"step": 6
},
{
"epoch": 7.829364591139397e-05,
"grad_norm": 3.6844234466552734,
"learning_rate": 0.0001999999969750181,
"loss": 3.0363,
"step": 7
},
{
"epoch": 8.947845247016453e-05,
"grad_norm": 2.9362566471099854,
"learning_rate": 0.00019999999604900323,
"loss": 2.7911,
"step": 8
},
{
"epoch": 0.0001006632590289351,
"grad_norm": 2.6654202938079834,
"learning_rate": 0.0001999999949995197,
"loss": 2.6176,
"step": 9
},
{
"epoch": 0.00011184806558770567,
"grad_norm": 1.8987364768981934,
"learning_rate": 0.00019999999382656758,
"loss": 2.5172,
"step": 10
},
{
"epoch": 0.00012303287214647624,
"grad_norm": 2.596072196960449,
"learning_rate": 0.0001999999925301468,
"loss": 2.3978,
"step": 11
},
{
"epoch": 0.0001342176787052468,
"grad_norm": 1.6658835411071777,
"learning_rate": 0.00019999999111025733,
"loss": 2.2888,
"step": 12
},
{
"epoch": 0.00014540248526401735,
"grad_norm": 1.5891242027282715,
"learning_rate": 0.00019999998956689926,
"loss": 2.1966,
"step": 13
},
{
"epoch": 0.00015658729182278794,
"grad_norm": 1.7551047801971436,
"learning_rate": 0.00019999998790007256,
"loss": 2.1286,
"step": 14
},
{
"epoch": 0.0001677720983815585,
"grad_norm": 1.515837550163269,
"learning_rate": 0.0001999999861097772,
"loss": 2.0329,
"step": 15
},
{
"epoch": 0.00017895690494032905,
"grad_norm": 2.0875966548919678,
"learning_rate": 0.0001999999841960132,
"loss": 1.9836,
"step": 16
},
{
"epoch": 0.00019014171149909964,
"grad_norm": 1.3364766836166382,
"learning_rate": 0.0001999999821587806,
"loss": 1.9412,
"step": 17
},
{
"epoch": 0.0002013265180578702,
"grad_norm": 0.9586036205291748,
"learning_rate": 0.00019999997999807934,
"loss": 1.9021,
"step": 18
},
{
"epoch": 0.00021251132461664075,
"grad_norm": 0.610419750213623,
"learning_rate": 0.00019999997771390947,
"loss": 1.9278,
"step": 19
},
{
"epoch": 0.00022369613117541133,
"grad_norm": 0.6592239141464233,
"learning_rate": 0.000199999975306271,
"loss": 1.8956,
"step": 20
},
{
"epoch": 0.0002348809377341819,
"grad_norm": 0.7091565132141113,
"learning_rate": 0.00019999997277516388,
"loss": 1.8629,
"step": 21
},
{
"epoch": 0.0002460657442929525,
"grad_norm": 0.6687048077583313,
"learning_rate": 0.00019999997012058819,
"loss": 1.818,
"step": 22
},
{
"epoch": 0.000257250550851723,
"grad_norm": 0.29321762919425964,
"learning_rate": 0.00019999996734254382,
"loss": 1.8024,
"step": 23
},
{
"epoch": 0.0002684353574104936,
"grad_norm": 0.6186531186103821,
"learning_rate": 0.00019999996444103086,
"loss": 1.7958,
"step": 24
},
{
"epoch": 0.0002796201639692642,
"grad_norm": 0.4960622489452362,
"learning_rate": 0.0001999999614160493,
"loss": 1.7714,
"step": 25
},
{
"epoch": 0.0002908049705280347,
"grad_norm": 0.25318390130996704,
"learning_rate": 0.00019999995826759916,
"loss": 1.7419,
"step": 26
},
{
"epoch": 0.0003019897770868053,
"grad_norm": 0.5521177649497986,
"learning_rate": 0.0001999999549956804,
"loss": 1.7336,
"step": 27
},
{
"epoch": 0.0003131745836455759,
"grad_norm": 0.3085158169269562,
"learning_rate": 0.00019999995160029305,
"loss": 1.7304,
"step": 28
},
{
"epoch": 0.0003243593902043464,
"grad_norm": 0.2978903353214264,
"learning_rate": 0.0001999999480814371,
"loss": 1.7283,
"step": 29
},
{
"epoch": 0.000335544196763117,
"grad_norm": 0.40339481830596924,
"learning_rate": 0.00019999994443911258,
"loss": 1.7577,
"step": 30
},
{
"epoch": 0.00034672900332188757,
"grad_norm": 0.13451404869556427,
"learning_rate": 0.00019999994067331945,
"loss": 1.7435,
"step": 31
},
{
"epoch": 0.0003579138098806581,
"grad_norm": 0.3141914904117584,
"learning_rate": 0.0001999999367840578,
"loss": 1.7479,
"step": 32
},
{
"epoch": 0.0003690986164394287,
"grad_norm": 0.12182258069515228,
"learning_rate": 0.00019999993277132754,
"loss": 1.7391,
"step": 33
},
{
"epoch": 0.00038028342299819927,
"grad_norm": 0.3160305917263031,
"learning_rate": 0.00019999992863512872,
"loss": 1.7231,
"step": 34
},
{
"epoch": 0.0003914682295569698,
"grad_norm": 0.18215563893318176,
"learning_rate": 0.00019999992437546134,
"loss": 1.7067,
"step": 35
},
{
"epoch": 0.0004026530361157404,
"grad_norm": 0.24103401601314545,
"learning_rate": 0.0001999999199923254,
"loss": 1.6877,
"step": 36
},
{
"epoch": 0.00041383784267451097,
"grad_norm": 0.17353500425815582,
"learning_rate": 0.0001999999154857209,
"loss": 1.6746,
"step": 37
},
{
"epoch": 0.0004250226492332815,
"grad_norm": 0.19149154424667358,
"learning_rate": 0.00019999991085564784,
"loss": 1.6734,
"step": 38
},
{
"epoch": 0.0004362074557920521,
"grad_norm": 0.15810631215572357,
"learning_rate": 0.0001999999061021063,
"loss": 1.6773,
"step": 39
},
{
"epoch": 0.00044739226235082267,
"grad_norm": 0.14770038425922394,
"learning_rate": 0.00019999990122509614,
"loss": 1.6967,
"step": 40
},
{
"epoch": 0.0004585770689095932,
"grad_norm": 0.15101520717144012,
"learning_rate": 0.0001999998962246175,
"loss": 1.6816,
"step": 41
},
{
"epoch": 0.0004697618754683638,
"grad_norm": 0.1085171177983284,
"learning_rate": 0.00019999989110067035,
"loss": 1.6875,
"step": 42
},
{
"epoch": 0.00048094668202713437,
"grad_norm": 0.13066136837005615,
"learning_rate": 0.00019999988585325468,
"loss": 1.6768,
"step": 43
},
{
"epoch": 0.000492131488585905,
"grad_norm": 0.09783171862363815,
"learning_rate": 0.0001999998804823705,
"loss": 1.6559,
"step": 44
},
{
"epoch": 0.0005033162951446755,
"grad_norm": 0.15001484751701355,
"learning_rate": 0.00019999987498801777,
"loss": 1.6662,
"step": 45
},
{
"epoch": 0.000514501101703446,
"grad_norm": 0.06484243273735046,
"learning_rate": 0.0001999998693701966,
"loss": 1.6528,
"step": 46
},
{
"epoch": 0.0005256859082622166,
"grad_norm": 0.12908180058002472,
"learning_rate": 0.00019999986362890693,
"loss": 1.67,
"step": 47
},
{
"epoch": 0.0005368707148209872,
"grad_norm": 0.05700545758008957,
"learning_rate": 0.00019999985776414877,
"loss": 1.6643,
"step": 48
},
{
"epoch": 0.0005480555213797577,
"grad_norm": 0.10586538165807724,
"learning_rate": 0.00019999985177592211,
"loss": 1.6725,
"step": 49
},
{
"epoch": 0.0005592403279385283,
"grad_norm": 0.05411362275481224,
"learning_rate": 0.00019999984566422703,
"loss": 1.6495,
"step": 50
},
{
"epoch": 0.0005704251344972989,
"grad_norm": 0.08841974288225174,
"learning_rate": 0.00019999983942906347,
"loss": 1.6397,
"step": 51
},
{
"epoch": 0.0005816099410560694,
"grad_norm": 0.049202822148799896,
"learning_rate": 0.00019999983307043145,
"loss": 1.6601,
"step": 52
},
{
"epoch": 0.00059279474761484,
"grad_norm": 0.06537283957004547,
"learning_rate": 0.00019999982658833098,
"loss": 1.6405,
"step": 53
},
{
"epoch": 0.0006039795541736106,
"grad_norm": 0.04358899965882301,
"learning_rate": 0.0001999998199827621,
"loss": 1.6586,
"step": 54
},
{
"epoch": 0.0006151643607323811,
"grad_norm": 0.05924156308174133,
"learning_rate": 0.0001999998132537248,
"loss": 1.6609,
"step": 55
},
{
"epoch": 0.0006263491672911517,
"grad_norm": 0.047364529222249985,
"learning_rate": 0.00019999980640121904,
"loss": 1.6561,
"step": 56
},
{
"epoch": 0.0006375339738499223,
"grad_norm": 0.05860909819602966,
"learning_rate": 0.00019999979942524488,
"loss": 1.644,
"step": 57
},
{
"epoch": 0.0006487187804086928,
"grad_norm": 0.058639075607061386,
"learning_rate": 0.00019999979232580235,
"loss": 1.6582,
"step": 58
},
{
"epoch": 0.0006599035869674634,
"grad_norm": 0.049288246780633926,
"learning_rate": 0.00019999978510289138,
"loss": 1.6828,
"step": 59
},
{
"epoch": 0.000671088393526234,
"grad_norm": 0.06293074041604996,
"learning_rate": 0.00019999977775651207,
"loss": 1.6852,
"step": 60
},
{
"epoch": 0.0006822732000850045,
"grad_norm": 0.03848947212100029,
"learning_rate": 0.00019999977028666436,
"loss": 1.6792,
"step": 61
},
{
"epoch": 0.0006934580066437751,
"grad_norm": 0.1217992901802063,
"learning_rate": 0.00019999976269334828,
"loss": 1.7142,
"step": 62
},
{
"epoch": 0.0007046428132025457,
"grad_norm": 0.04006423428654671,
"learning_rate": 0.00019999975497656384,
"loss": 1.7108,
"step": 63
},
{
"epoch": 0.0007158276197613162,
"grad_norm": 0.057927753776311874,
"learning_rate": 0.0001999997471363111,
"loss": 1.6902,
"step": 64
},
{
"epoch": 0.0007270124263200868,
"grad_norm": 0.04537767171859741,
"learning_rate": 0.00019999973917258997,
"loss": 1.6627,
"step": 65
},
{
"epoch": 0.0007381972328788574,
"grad_norm": 0.057307057082653046,
"learning_rate": 0.00019999973108540052,
"loss": 1.6561,
"step": 66
},
{
"epoch": 0.0007493820394376279,
"grad_norm": 0.044002167880535126,
"learning_rate": 0.00019999972287474272,
"loss": 1.6595,
"step": 67
},
{
"epoch": 0.0007605668459963985,
"grad_norm": 0.03991294279694557,
"learning_rate": 0.00019999971454061666,
"loss": 1.6563,
"step": 68
},
{
"epoch": 0.0007717516525551691,
"grad_norm": 0.044279683381319046,
"learning_rate": 0.0001999997060830223,
"loss": 1.6352,
"step": 69
},
{
"epoch": 0.0007829364591139396,
"grad_norm": 0.04319776967167854,
"learning_rate": 0.00019999969750195967,
"loss": 1.6319,
"step": 70
},
{
"epoch": 0.0007941212656727102,
"grad_norm": 0.04436005651950836,
"learning_rate": 0.00019999968879742873,
"loss": 1.6556,
"step": 71
},
{
"epoch": 0.0008053060722314808,
"grad_norm": 0.0416998416185379,
"learning_rate": 0.00019999967996942952,
"loss": 1.6646,
"step": 72
},
{
"epoch": 0.0008164908787902513,
"grad_norm": 0.03601493313908577,
"learning_rate": 0.00019999967101796208,
"loss": 1.6605,
"step": 73
},
{
"epoch": 0.0008276756853490219,
"grad_norm": 0.03957024961709976,
"learning_rate": 0.00019999966194302637,
"loss": 1.6714,
"step": 74
},
{
"epoch": 0.0008388604919077925,
"grad_norm": 0.04401829466223717,
"learning_rate": 0.00019999965274462245,
"loss": 1.6497,
"step": 75
},
{
"epoch": 0.000850045298466563,
"grad_norm": 0.04251580312848091,
"learning_rate": 0.0001999996434227503,
"loss": 1.6567,
"step": 76
},
{
"epoch": 0.0008612301050253336,
"grad_norm": 0.038915786892175674,
"learning_rate": 0.00019999963397740995,
"loss": 1.6515,
"step": 77
},
{
"epoch": 0.0008724149115841042,
"grad_norm": 0.04063812270760536,
"learning_rate": 0.00019999962440860137,
"loss": 1.6764,
"step": 78
},
{
"epoch": 0.0008835997181428747,
"grad_norm": 0.04114954546093941,
"learning_rate": 0.00019999961471632463,
"loss": 1.6553,
"step": 79
},
{
"epoch": 0.0008947845247016453,
"grad_norm": 0.039563097059726715,
"learning_rate": 0.0001999996049005797,
"loss": 1.6595,
"step": 80
},
{
"epoch": 0.0009059693312604159,
"grad_norm": 0.03477632254362106,
"learning_rate": 0.00019999959496136663,
"loss": 1.6612,
"step": 81
},
{
"epoch": 0.0009171541378191864,
"grad_norm": 0.04104992002248764,
"learning_rate": 0.0001999995848986854,
"loss": 1.6476,
"step": 82
},
{
"epoch": 0.000928338944377957,
"grad_norm": 0.03909602388739586,
"learning_rate": 0.00019999957471253602,
"loss": 1.6445,
"step": 83
},
{
"epoch": 0.0009395237509367276,
"grad_norm": 0.039677415043115616,
"learning_rate": 0.00019999956440291855,
"loss": 1.6368,
"step": 84
},
{
"epoch": 0.0009507085574954981,
"grad_norm": 0.03493763506412506,
"learning_rate": 0.00019999955396983292,
"loss": 1.6343,
"step": 85
},
{
"epoch": 0.0009618933640542687,
"grad_norm": 0.03530074283480644,
"learning_rate": 0.0001999995434132792,
"loss": 1.6843,
"step": 86
},
{
"epoch": 0.0009730781706130393,
"grad_norm": 0.037156637758016586,
"learning_rate": 0.00019999953273325743,
"loss": 1.6721,
"step": 87
},
{
"epoch": 0.00098426297717181,
"grad_norm": 0.04006032645702362,
"learning_rate": 0.00019999952192976755,
"loss": 1.6724,
"step": 88
},
{
"epoch": 0.0009954477837305804,
"grad_norm": 0.03743763267993927,
"learning_rate": 0.0001999995110028096,
"loss": 1.6613,
"step": 89
},
{
"epoch": 0.001006632590289351,
"grad_norm": 0.04100384563207626,
"learning_rate": 0.00019999949995238369,
"loss": 1.6444,
"step": 90
},
{
"epoch": 0.0010178173968481215,
"grad_norm": 0.03799253702163696,
"learning_rate": 0.00019999948877848965,
"loss": 1.6641,
"step": 91
},
{
"epoch": 0.001029002203406892,
"grad_norm": 0.040163811296224594,
"learning_rate": 0.00019999947748112763,
"loss": 1.6411,
"step": 92
},
{
"epoch": 0.0010401870099656626,
"grad_norm": 0.03576591610908508,
"learning_rate": 0.0001999994660602976,
"loss": 1.6378,
"step": 93
},
{
"epoch": 0.0010513718165244333,
"grad_norm": 0.03735070303082466,
"learning_rate": 0.00019999945451599957,
"loss": 1.644,
"step": 94
},
{
"epoch": 0.0010625566230832038,
"grad_norm": 0.04157353192567825,
"learning_rate": 0.00019999944284823358,
"loss": 1.6532,
"step": 95
},
{
"epoch": 0.0010737414296419744,
"grad_norm": 0.046478450298309326,
"learning_rate": 0.0001999994310569996,
"loss": 1.6744,
"step": 96
},
{
"epoch": 0.0010849262362007449,
"grad_norm": 0.04813043400645256,
"learning_rate": 0.0001999994191422977,
"loss": 1.6716,
"step": 97
},
{
"epoch": 0.0010961110427595154,
"grad_norm": 0.03780042380094528,
"learning_rate": 0.00019999940710412788,
"loss": 1.6502,
"step": 98
},
{
"epoch": 0.001107295849318286,
"grad_norm": 0.03811471536755562,
"learning_rate": 0.0001999993949424901,
"loss": 1.6405,
"step": 99
},
{
"epoch": 0.0011184806558770567,
"grad_norm": 0.03838631138205528,
"learning_rate": 0.00019999938265738445,
"loss": 1.6202,
"step": 100
},
{
"epoch": 0.0011296654624358272,
"grad_norm": 0.04033266752958298,
"learning_rate": 0.0001999993702488109,
"loss": 1.6305,
"step": 101
},
{
"epoch": 0.0011408502689945978,
"grad_norm": 0.038872625678777695,
"learning_rate": 0.0001999993577167695,
"loss": 1.6243,
"step": 102
},
{
"epoch": 0.0011520350755533683,
"grad_norm": 0.047068770974874496,
"learning_rate": 0.0001999993450612602,
"loss": 1.6126,
"step": 103
},
{
"epoch": 0.0011632198821121388,
"grad_norm": 0.038774486631155014,
"learning_rate": 0.0001999993322822831,
"loss": 1.6125,
"step": 104
},
{
"epoch": 0.0011744046886709093,
"grad_norm": 0.046706534922122955,
"learning_rate": 0.00019999931937983814,
"loss": 1.6062,
"step": 105
},
{
"epoch": 0.00118558949522968,
"grad_norm": 0.038454532623291016,
"learning_rate": 0.00019999930635392538,
"loss": 1.6145,
"step": 106
},
{
"epoch": 0.0011967743017884506,
"grad_norm": 0.042938027530908585,
"learning_rate": 0.0001999992932045448,
"loss": 1.6064,
"step": 107
},
{
"epoch": 0.0012079591083472212,
"grad_norm": 0.03728878125548363,
"learning_rate": 0.00019999927993169652,
"loss": 1.6203,
"step": 108
},
{
"epoch": 0.0012191439149059917,
"grad_norm": 0.03831510245800018,
"learning_rate": 0.00019999926653538043,
"loss": 1.6214,
"step": 109
},
{
"epoch": 0.0012303287214647622,
"grad_norm": 0.04032002389431,
"learning_rate": 0.00019999925301559659,
"loss": 1.6116,
"step": 110
},
{
"epoch": 0.0012415135280235327,
"grad_norm": 0.04648670554161072,
"learning_rate": 0.00019999923937234505,
"loss": 1.6228,
"step": 111
},
{
"epoch": 0.0012526983345823035,
"grad_norm": 0.03572435304522514,
"learning_rate": 0.00019999922560562575,
"loss": 1.6382,
"step": 112
},
{
"epoch": 0.001263883141141074,
"grad_norm": 0.03520497307181358,
"learning_rate": 0.0001999992117154388,
"loss": 1.642,
"step": 113
},
{
"epoch": 0.0012750679476998446,
"grad_norm": 0.037833768874406815,
"learning_rate": 0.00019999919770178414,
"loss": 1.6809,
"step": 114
},
{
"epoch": 0.001286252754258615,
"grad_norm": 0.04116043448448181,
"learning_rate": 0.00019999918356466186,
"loss": 1.6669,
"step": 115
},
{
"epoch": 0.0012974375608173856,
"grad_norm": 0.03686891868710518,
"learning_rate": 0.00019999916930407192,
"loss": 1.6336,
"step": 116
},
{
"epoch": 0.0013086223673761561,
"grad_norm": 0.04540928825736046,
"learning_rate": 0.00019999915492001434,
"loss": 1.636,
"step": 117
},
{
"epoch": 0.0013198071739349269,
"grad_norm": 0.03894014656543732,
"learning_rate": 0.00019999914041248917,
"loss": 1.642,
"step": 118
},
{
"epoch": 0.0013309919804936974,
"grad_norm": 0.03868821635842323,
"learning_rate": 0.0001999991257814964,
"loss": 1.6459,
"step": 119
},
{
"epoch": 0.001342176787052468,
"grad_norm": 0.05047065392136574,
"learning_rate": 0.00019999911102703606,
"loss": 1.6566,
"step": 120
},
{
"epoch": 0.0013533615936112385,
"grad_norm": 0.037026163190603256,
"learning_rate": 0.0001999990961491082,
"loss": 1.644,
"step": 121
},
{
"epoch": 0.001364546400170009,
"grad_norm": 0.05025867745280266,
"learning_rate": 0.00019999908114771278,
"loss": 1.6375,
"step": 122
},
{
"epoch": 0.0013757312067287795,
"grad_norm": 0.05618799850344658,
"learning_rate": 0.0001999990660228498,
"loss": 1.6414,
"step": 123
},
{
"epoch": 0.0013869160132875503,
"grad_norm": 0.04012198746204376,
"learning_rate": 0.0001999990507745194,
"loss": 1.6338,
"step": 124
},
{
"epoch": 0.0013981008198463208,
"grad_norm": 0.03692522644996643,
"learning_rate": 0.00019999903540272147,
"loss": 1.6338,
"step": 125
},
{
"epoch": 0.0014092856264050913,
"grad_norm": 0.04653295874595642,
"learning_rate": 0.0001999990199074561,
"loss": 1.6242,
"step": 126
},
{
"epoch": 0.0014204704329638619,
"grad_norm": 0.038410138338804245,
"learning_rate": 0.0001999990042887233,
"loss": 1.6535,
"step": 127
},
{
"epoch": 0.0014316552395226324,
"grad_norm": 0.03855321556329727,
"learning_rate": 0.00019999898854652307,
"loss": 1.6622,
"step": 128
},
{
"epoch": 0.001442840046081403,
"grad_norm": 0.039161138236522675,
"learning_rate": 0.00019999897268085543,
"loss": 1.6596,
"step": 129
},
{
"epoch": 0.0014540248526401737,
"grad_norm": 0.04229150339961052,
"learning_rate": 0.00019999895669172042,
"loss": 1.6044,
"step": 130
},
{
"epoch": 0.0014652096591989442,
"grad_norm": 0.04695671424269676,
"learning_rate": 0.00019999894057911804,
"loss": 1.593,
"step": 131
},
{
"epoch": 0.0014763944657577147,
"grad_norm": 0.05007043853402138,
"learning_rate": 0.00019999892434304832,
"loss": 1.6371,
"step": 132
},
{
"epoch": 0.0014875792723164853,
"grad_norm": 0.04140834882855415,
"learning_rate": 0.00019999890798351127,
"loss": 1.6333,
"step": 133
},
{
"epoch": 0.0014987640788752558,
"grad_norm": 0.03771176561713219,
"learning_rate": 0.0001999988915005069,
"loss": 1.6253,
"step": 134
},
{
"epoch": 0.0015099488854340263,
"grad_norm": 0.03835882246494293,
"learning_rate": 0.00019999887489403532,
"loss": 1.6096,
"step": 135
},
{
"epoch": 0.001521133691992797,
"grad_norm": 0.042071614414453506,
"learning_rate": 0.00019999885816409643,
"loss": 1.6158,
"step": 136
},
{
"epoch": 0.0015323184985515676,
"grad_norm": 0.04736172780394554,
"learning_rate": 0.0001999988413106903,
"loss": 1.6076,
"step": 137
},
{
"epoch": 0.0015435033051103381,
"grad_norm": 0.045473724603652954,
"learning_rate": 0.00019999882433381695,
"loss": 1.6002,
"step": 138
},
{
"epoch": 0.0015546881116691087,
"grad_norm": 0.038239121437072754,
"learning_rate": 0.0001999988072334764,
"loss": 1.6093,
"step": 139
},
{
"epoch": 0.0015658729182278792,
"grad_norm": 0.03806670382618904,
"learning_rate": 0.0001999987900096687,
"loss": 1.5958,
"step": 140
},
{
"epoch": 0.0015770577247866497,
"grad_norm": 0.034363262355327606,
"learning_rate": 0.00019999877266239382,
"loss": 1.6182,
"step": 141
},
{
"epoch": 0.0015882425313454205,
"grad_norm": 0.03781922906637192,
"learning_rate": 0.0001999987551916518,
"loss": 1.6041,
"step": 142
},
{
"epoch": 0.001599427337904191,
"grad_norm": 0.037946417927742004,
"learning_rate": 0.00019999873759744268,
"loss": 1.6154,
"step": 143
},
{
"epoch": 0.0016106121444629615,
"grad_norm": 0.03221859410405159,
"learning_rate": 0.00019999871987976645,
"loss": 1.6205,
"step": 144
},
{
"epoch": 0.001621796951021732,
"grad_norm": 0.034399550408124924,
"learning_rate": 0.00019999870203862318,
"loss": 1.6077,
"step": 145
},
{
"epoch": 0.0016329817575805026,
"grad_norm": 0.037965498864650726,
"learning_rate": 0.00019999868407401285,
"loss": 1.6226,
"step": 146
},
{
"epoch": 0.0016441665641392731,
"grad_norm": 0.034158267080783844,
"learning_rate": 0.00019999866598593549,
"loss": 1.6364,
"step": 147
},
{
"epoch": 0.0016553513706980439,
"grad_norm": 0.035179782658815384,
"learning_rate": 0.00019999864777439113,
"loss": 1.6015,
"step": 148
},
{
"epoch": 0.0016665361772568144,
"grad_norm": 0.07380399852991104,
"learning_rate": 0.00019999862943937977,
"loss": 1.6036,
"step": 149
},
{
"epoch": 0.001677720983815585,
"grad_norm": 0.055961351841688156,
"learning_rate": 0.00019999861098090146,
"loss": 1.6209,
"step": 150
},
{
"epoch": 0.0016889057903743555,
"grad_norm": 0.06556452065706253,
"learning_rate": 0.00019999859239895623,
"loss": 1.6179,
"step": 151
},
{
"epoch": 0.001700090596933126,
"grad_norm": 0.0406390018761158,
"learning_rate": 0.0001999985736935441,
"loss": 1.6251,
"step": 152
},
{
"epoch": 0.0017112754034918965,
"grad_norm": 0.04337646812200546,
"learning_rate": 0.00019999855486466504,
"loss": 1.6229,
"step": 153
},
{
"epoch": 0.0017224602100506673,
"grad_norm": 0.03773214668035507,
"learning_rate": 0.00019999853591231914,
"loss": 1.6163,
"step": 154
},
{
"epoch": 0.0017336450166094378,
"grad_norm": 0.04170192405581474,
"learning_rate": 0.0001999985168365064,
"loss": 1.6187,
"step": 155
},
{
"epoch": 0.0017448298231682083,
"grad_norm": 0.046784352511167526,
"learning_rate": 0.00019999849763722684,
"loss": 1.6313,
"step": 156
},
{
"epoch": 0.0017560146297269789,
"grad_norm": 0.04641957953572273,
"learning_rate": 0.00019999847831448048,
"loss": 1.6353,
"step": 157
},
{
"epoch": 0.0017671994362857494,
"grad_norm": 0.04125691205263138,
"learning_rate": 0.00019999845886826736,
"loss": 1.6136,
"step": 158
},
{
"epoch": 0.00177838424284452,
"grad_norm": 0.03794560208916664,
"learning_rate": 0.00019999843929858748,
"loss": 1.6124,
"step": 159
},
{
"epoch": 0.0017895690494032907,
"grad_norm": 0.04494895413517952,
"learning_rate": 0.00019999841960544087,
"loss": 1.6038,
"step": 160
},
{
"epoch": 0.0018007538559620612,
"grad_norm": 0.0605354905128479,
"learning_rate": 0.00019999839978882756,
"loss": 1.6129,
"step": 161
},
{
"epoch": 0.0018119386625208317,
"grad_norm": 0.03629598766565323,
"learning_rate": 0.0001999983798487476,
"loss": 1.6136,
"step": 162
},
{
"epoch": 0.0018231234690796023,
"grad_norm": 0.051288772374391556,
"learning_rate": 0.00019999835978520099,
"loss": 1.6121,
"step": 163
},
{
"epoch": 0.0018343082756383728,
"grad_norm": 0.0646372139453888,
"learning_rate": 0.00019999833959818774,
"loss": 1.6097,
"step": 164
},
{
"epoch": 0.0018454930821971433,
"grad_norm": 0.05430466681718826,
"learning_rate": 0.00019999831928770788,
"loss": 1.6336,
"step": 165
},
{
"epoch": 0.001856677888755914,
"grad_norm": 0.04446660354733467,
"learning_rate": 0.00019999829885376146,
"loss": 1.6446,
"step": 166
},
{
"epoch": 0.0018678626953146846,
"grad_norm": 0.05089535191655159,
"learning_rate": 0.0001999982782963485,
"loss": 1.6295,
"step": 167
},
{
"epoch": 0.0018790475018734551,
"grad_norm": 0.04738520458340645,
"learning_rate": 0.000199998257615469,
"loss": 1.6129,
"step": 168
},
{
"epoch": 0.0018902323084322257,
"grad_norm": 0.06705950200557709,
"learning_rate": 0.00019999823681112296,
"loss": 1.6091,
"step": 169
},
{
"epoch": 0.0019014171149909962,
"grad_norm": 0.08080725371837616,
"learning_rate": 0.0001999982158833105,
"loss": 1.6041,
"step": 170
},
{
"epoch": 0.0019126019215497667,
"grad_norm": 0.08998562395572662,
"learning_rate": 0.00019999819483203162,
"loss": 1.61,
"step": 171
},
{
"epoch": 0.0019237867281085375,
"grad_norm": 0.0561324767768383,
"learning_rate": 0.00019999817365728626,
"loss": 1.6019,
"step": 172
},
{
"epoch": 0.001934971534667308,
"grad_norm": 0.06384976953268051,
"learning_rate": 0.00019999815235907453,
"loss": 1.6045,
"step": 173
},
{
"epoch": 0.0019461563412260785,
"grad_norm": 0.08231502771377563,
"learning_rate": 0.00019999813093739643,
"loss": 1.5851,
"step": 174
},
{
"epoch": 0.001957341147784849,
"grad_norm": 0.06544940173625946,
"learning_rate": 0.00019999810939225196,
"loss": 1.5902,
"step": 175
},
{
"epoch": 0.00196852595434362,
"grad_norm": 0.05647379904985428,
"learning_rate": 0.0001999980877236412,
"loss": 1.5972,
"step": 176
},
{
"epoch": 0.00197971076090239,
"grad_norm": 0.04889946058392525,
"learning_rate": 0.00019999806593156417,
"loss": 1.6039,
"step": 177
},
{
"epoch": 0.001990895567461161,
"grad_norm": 0.04563042148947716,
"learning_rate": 0.00019999804401602087,
"loss": 1.6137,
"step": 178
},
{
"epoch": 0.002002080374019931,
"grad_norm": 0.057878173887729645,
"learning_rate": 0.0001999980219770113,
"loss": 1.603,
"step": 179
},
{
"epoch": 0.002013265180578702,
"grad_norm": 0.04965892806649208,
"learning_rate": 0.00019999799981453554,
"loss": 1.6016,
"step": 180
},
{
"epoch": 0.0020244499871374727,
"grad_norm": 0.042930275201797485,
"learning_rate": 0.00019999797752859362,
"loss": 1.5872,
"step": 181
},
{
"epoch": 0.002035634793696243,
"grad_norm": 0.07251135259866714,
"learning_rate": 0.00019999795511918553,
"loss": 1.5973,
"step": 182
},
{
"epoch": 0.0020468196002550137,
"grad_norm": 0.053799793124198914,
"learning_rate": 0.00019999793258631133,
"loss": 1.6177,
"step": 183
},
{
"epoch": 0.002058004406813784,
"grad_norm": 0.05833510681986809,
"learning_rate": 0.00019999790992997101,
"loss": 1.5898,
"step": 184
},
{
"epoch": 0.002069189213372555,
"grad_norm": 0.06137058511376381,
"learning_rate": 0.0001999978871501646,
"loss": 1.5836,
"step": 185
},
{
"epoch": 0.002080374019931325,
"grad_norm": 0.04289720579981804,
"learning_rate": 0.0001999978642468922,
"loss": 1.5893,
"step": 186
},
{
"epoch": 0.002091558826490096,
"grad_norm": 0.054850902408361435,
"learning_rate": 0.00019999784122015375,
"loss": 1.5894,
"step": 187
},
{
"epoch": 0.0021027436330488666,
"grad_norm": 0.06448347866535187,
"learning_rate": 0.00019999781806994934,
"loss": 1.5862,
"step": 188
},
{
"epoch": 0.002113928439607637,
"grad_norm": 0.06109186261892319,
"learning_rate": 0.00019999779479627897,
"loss": 1.5951,
"step": 189
},
{
"epoch": 0.0021251132461664077,
"grad_norm": 0.05638093128800392,
"learning_rate": 0.00019999777139914263,
"loss": 1.5929,
"step": 190
},
{
"epoch": 0.002136298052725178,
"grad_norm": 0.05871524661779404,
"learning_rate": 0.00019999774787854047,
"loss": 1.5979,
"step": 191
},
{
"epoch": 0.0021474828592839487,
"grad_norm": 0.05536816641688347,
"learning_rate": 0.00019999772423447238,
"loss": 1.598,
"step": 192
},
{
"epoch": 0.0021586676658427195,
"grad_norm": 0.04985928162932396,
"learning_rate": 0.00019999770046693848,
"loss": 1.6067,
"step": 193
},
{
"epoch": 0.0021698524724014898,
"grad_norm": 0.03956759348511696,
"learning_rate": 0.00019999767657593874,
"loss": 1.6101,
"step": 194
},
{
"epoch": 0.0021810372789602605,
"grad_norm": 0.043052803725004196,
"learning_rate": 0.00019999765256147324,
"loss": 1.5925,
"step": 195
},
{
"epoch": 0.002192222085519031,
"grad_norm": 0.049073830246925354,
"learning_rate": 0.000199997628423542,
"loss": 1.5997,
"step": 196
},
{
"epoch": 0.0022034068920778016,
"grad_norm": 0.0505862683057785,
"learning_rate": 0.00019999760416214503,
"loss": 1.5976,
"step": 197
},
{
"epoch": 0.002214591698636572,
"grad_norm": 0.04482674226164818,
"learning_rate": 0.00019999757977728235,
"loss": 1.5762,
"step": 198
},
{
"epoch": 0.0022257765051953426,
"grad_norm": 0.03707127273082733,
"learning_rate": 0.00019999755526895402,
"loss": 1.5805,
"step": 199
},
{
"epoch": 0.0022369613117541134,
"grad_norm": 0.041122857481241226,
"learning_rate": 0.00019999753063716002,
"loss": 1.6041,
"step": 200
},
{
"epoch": 0.0022481461183128837,
"grad_norm": 0.05945773050189018,
"learning_rate": 0.00019999750588190046,
"loss": 1.6068,
"step": 201
},
{
"epoch": 0.0022593309248716545,
"grad_norm": 0.058612506836652756,
"learning_rate": 0.00019999748100317532,
"loss": 1.6,
"step": 202
},
{
"epoch": 0.0022705157314304248,
"grad_norm": 0.05361739546060562,
"learning_rate": 0.00019999745600098466,
"loss": 1.5891,
"step": 203
},
{
"epoch": 0.0022817005379891955,
"grad_norm": 0.045548051595687866,
"learning_rate": 0.00019999743087532846,
"loss": 1.6161,
"step": 204
},
{
"epoch": 0.002292885344547966,
"grad_norm": 0.04524560272693634,
"learning_rate": 0.00019999740562620682,
"loss": 1.6092,
"step": 205
},
{
"epoch": 0.0023040701511067366,
"grad_norm": 0.04346180334687233,
"learning_rate": 0.0001999973802536197,
"loss": 1.6076,
"step": 206
},
{
"epoch": 0.0023152549576655073,
"grad_norm": 0.047505974769592285,
"learning_rate": 0.00019999735475756717,
"loss": 1.5825,
"step": 207
},
{
"epoch": 0.0023264397642242776,
"grad_norm": 0.03851678594946861,
"learning_rate": 0.00019999732913804927,
"loss": 1.5991,
"step": 208
},
{
"epoch": 0.0023376245707830484,
"grad_norm": 0.051913902163505554,
"learning_rate": 0.000199997303395066,
"loss": 1.6281,
"step": 209
},
{
"epoch": 0.0023488093773418187,
"grad_norm": 0.06070960685610771,
"learning_rate": 0.0001999972775286174,
"loss": 1.641,
"step": 210
},
{
"epoch": 0.0023599941839005894,
"grad_norm": 0.13532494008541107,
"learning_rate": 0.00019999725153870354,
"loss": 1.6009,
"step": 211
},
{
"epoch": 0.00237117899045936,
"grad_norm": 0.30072659254074097,
"learning_rate": 0.00019999722542532442,
"loss": 1.6163,
"step": 212
},
{
"epoch": 0.0023823637970181305,
"grad_norm": 0.3511681854724884,
"learning_rate": 0.00019999719918848004,
"loss": 1.6465,
"step": 213
},
{
"epoch": 0.0023935486035769012,
"grad_norm": 0.2487097680568695,
"learning_rate": 0.00019999717282817052,
"loss": 1.6367,
"step": 214
},
{
"epoch": 0.0024047334101356716,
"grad_norm": 0.23582719266414642,
"learning_rate": 0.00019999714634439582,
"loss": 1.6202,
"step": 215
},
{
"epoch": 0.0024159182166944423,
"grad_norm": 0.17389710247516632,
"learning_rate": 0.000199997119737156,
"loss": 1.6308,
"step": 216
},
{
"epoch": 0.0024271030232532126,
"grad_norm": 0.14822594821453094,
"learning_rate": 0.00019999709300645105,
"loss": 1.6175,
"step": 217
},
{
"epoch": 0.0024382878298119834,
"grad_norm": 0.12898804247379303,
"learning_rate": 0.00019999706615228107,
"loss": 1.6069,
"step": 218
},
{
"epoch": 0.002449472636370754,
"grad_norm": 0.16721111536026,
"learning_rate": 0.00019999703917464605,
"loss": 1.6206,
"step": 219
},
{
"epoch": 0.0024606574429295244,
"grad_norm": 0.08022027462720871,
"learning_rate": 0.00019999701207354606,
"loss": 1.6082,
"step": 220
},
{
"epoch": 0.002471842249488295,
"grad_norm": 0.1281793862581253,
"learning_rate": 0.0001999969848489811,
"loss": 1.5928,
"step": 221
},
{
"epoch": 0.0024830270560470655,
"grad_norm": 0.10240975767374039,
"learning_rate": 0.00019999695750095117,
"loss": 1.6058,
"step": 222
},
{
"epoch": 0.0024942118626058362,
"grad_norm": 0.08368588238954544,
"learning_rate": 0.00019999693002945642,
"loss": 1.6193,
"step": 223
},
{
"epoch": 0.002505396669164607,
"grad_norm": 0.09998615831136703,
"learning_rate": 0.00019999690243449676,
"loss": 1.6096,
"step": 224
},
{
"epoch": 0.0025165814757233773,
"grad_norm": 0.08170673996210098,
"learning_rate": 0.00019999687471607228,
"loss": 1.5874,
"step": 225
},
{
"epoch": 0.002527766282282148,
"grad_norm": 0.09015469253063202,
"learning_rate": 0.000199996846874183,
"loss": 1.5903,
"step": 226
},
{
"epoch": 0.0025389510888409184,
"grad_norm": 0.07382986694574356,
"learning_rate": 0.000199996818908829,
"loss": 1.5958,
"step": 227
},
{
"epoch": 0.002550135895399689,
"grad_norm": 0.06669515371322632,
"learning_rate": 0.00019999679082001023,
"loss": 1.5881,
"step": 228
},
{
"epoch": 0.0025613207019584594,
"grad_norm": 0.0616774745285511,
"learning_rate": 0.0001999967626077268,
"loss": 1.6038,
"step": 229
},
{
"epoch": 0.00257250550851723,
"grad_norm": 0.05451146885752678,
"learning_rate": 0.00019999673427197872,
"loss": 1.6021,
"step": 230
},
{
"epoch": 0.002583690315076001,
"grad_norm": 0.05640149116516113,
"learning_rate": 0.000199996705812766,
"loss": 1.6022,
"step": 231
},
{
"epoch": 0.0025948751216347712,
"grad_norm": 0.06033660098910332,
"learning_rate": 0.00019999667723008871,
"loss": 1.6002,
"step": 232
},
{
"epoch": 0.002606059928193542,
"grad_norm": 0.07876270264387131,
"learning_rate": 0.0001999966485239469,
"loss": 1.6043,
"step": 233
},
{
"epoch": 0.0026172447347523123,
"grad_norm": 0.0829700380563736,
"learning_rate": 0.00019999661969434055,
"loss": 1.5915,
"step": 234
},
{
"epoch": 0.002628429541311083,
"grad_norm": 0.05654975026845932,
"learning_rate": 0.0001999965907412697,
"loss": 1.5893,
"step": 235
},
{
"epoch": 0.0026396143478698538,
"grad_norm": 0.06751634925603867,
"learning_rate": 0.00019999656166473444,
"loss": 1.5757,
"step": 236
},
{
"epoch": 0.002650799154428624,
"grad_norm": 0.06081743538379669,
"learning_rate": 0.00019999653246473477,
"loss": 1.5702,
"step": 237
},
{
"epoch": 0.002661983960987395,
"grad_norm": 0.0666998103260994,
"learning_rate": 0.00019999650314127075,
"loss": 1.568,
"step": 238
},
{
"epoch": 0.002673168767546165,
"grad_norm": 0.04934430122375488,
"learning_rate": 0.00019999647369434235,
"loss": 1.6017,
"step": 239
},
{
"epoch": 0.002684353574104936,
"grad_norm": 0.0574209988117218,
"learning_rate": 0.00019999644412394972,
"loss": 1.5935,
"step": 240
},
{
"epoch": 0.002695538380663706,
"grad_norm": 0.04870286583900452,
"learning_rate": 0.00019999641443009278,
"loss": 1.6035,
"step": 241
},
{
"epoch": 0.002706723187222477,
"grad_norm": 0.04176439344882965,
"learning_rate": 0.00019999638461277162,
"loss": 1.598,
"step": 242
},
{
"epoch": 0.0027179079937812477,
"grad_norm": 0.05534802004694939,
"learning_rate": 0.0001999963546719863,
"loss": 1.5945,
"step": 243
},
{
"epoch": 0.002729092800340018,
"grad_norm": 0.04214160889387131,
"learning_rate": 0.0001999963246077368,
"loss": 1.6019,
"step": 244
},
{
"epoch": 0.0027402776068987888,
"grad_norm": 0.04326852038502693,
"learning_rate": 0.00019999629442002322,
"loss": 1.604,
"step": 245
},
{
"epoch": 0.002751462413457559,
"grad_norm": 0.04295732453465462,
"learning_rate": 0.00019999626410884553,
"loss": 1.6136,
"step": 246
},
{
"epoch": 0.00276264722001633,
"grad_norm": 0.038508690893650055,
"learning_rate": 0.00019999623367420385,
"loss": 1.5904,
"step": 247
},
{
"epoch": 0.0027738320265751006,
"grad_norm": 0.040281713008880615,
"learning_rate": 0.0001999962031160981,
"loss": 1.5955,
"step": 248
},
{
"epoch": 0.002785016833133871,
"grad_norm": 0.041424721479415894,
"learning_rate": 0.00019999617243452844,
"loss": 1.608,
"step": 249
},
{
"epoch": 0.0027962016396926416,
"grad_norm": 0.03804994374513626,
"learning_rate": 0.00019999614162949484,
"loss": 1.6125,
"step": 250
},
{
"epoch": 0.002807386446251412,
"grad_norm": 0.04370785504579544,
"learning_rate": 0.0001999961107009974,
"loss": 1.6125,
"step": 251
},
{
"epoch": 0.0028185712528101827,
"grad_norm": 0.047021038830280304,
"learning_rate": 0.000199996079649036,
"loss": 1.5931,
"step": 252
},
{
"epoch": 0.002829756059368953,
"grad_norm": 0.036128588020801544,
"learning_rate": 0.0001999960484736109,
"loss": 1.5707,
"step": 253
},
{
"epoch": 0.0028409408659277238,
"grad_norm": 0.04315561056137085,
"learning_rate": 0.00019999601717472199,
"loss": 1.5902,
"step": 254
},
{
"epoch": 0.0028521256724864945,
"grad_norm": 0.04395722597837448,
"learning_rate": 0.00019999598575236934,
"loss": 1.5995,
"step": 255
},
{
"epoch": 0.002863310479045265,
"grad_norm": 0.038929786533117294,
"learning_rate": 0.000199995954206553,
"loss": 1.5854,
"step": 256
},
{
"epoch": 0.0028744952856040356,
"grad_norm": 0.041567280888557434,
"learning_rate": 0.00019999592253727299,
"loss": 1.5783,
"step": 257
},
{
"epoch": 0.002885680092162806,
"grad_norm": 0.03894374892115593,
"learning_rate": 0.0001999958907445294,
"loss": 1.5846,
"step": 258
},
{
"epoch": 0.0028968648987215766,
"grad_norm": 0.04269428178668022,
"learning_rate": 0.00019999585882832222,
"loss": 1.6023,
"step": 259
},
{
"epoch": 0.0029080497052803474,
"grad_norm": 0.04121831804513931,
"learning_rate": 0.00019999582678865147,
"loss": 1.6051,
"step": 260
},
{
"epoch": 0.0029192345118391177,
"grad_norm": 0.038076166063547134,
"learning_rate": 0.00019999579462551728,
"loss": 1.6136,
"step": 261
},
{
"epoch": 0.0029304193183978884,
"grad_norm": 0.042008642107248306,
"learning_rate": 0.0001999957623389196,
"loss": 1.6063,
"step": 262
},
{
"epoch": 0.0029416041249566587,
"grad_norm": 0.042438406497240067,
"learning_rate": 0.0001999957299288585,
"loss": 1.6033,
"step": 263
},
{
"epoch": 0.0029527889315154295,
"grad_norm": 0.041119206696748734,
"learning_rate": 0.000199995697395334,
"loss": 1.6031,
"step": 264
},
{
"epoch": 0.0029639737380742,
"grad_norm": 0.045258279889822006,
"learning_rate": 0.00019999566473834622,
"loss": 1.5853,
"step": 265
},
{
"epoch": 0.0029751585446329705,
"grad_norm": 0.04734019562602043,
"learning_rate": 0.0001999956319578951,
"loss": 1.5921,
"step": 266
},
{
"epoch": 0.0029863433511917413,
"grad_norm": 0.04389064759016037,
"learning_rate": 0.00019999559905398072,
"loss": 1.581,
"step": 267
},
{
"epoch": 0.0029975281577505116,
"grad_norm": 0.04582642391324043,
"learning_rate": 0.00019999556602660318,
"loss": 1.5928,
"step": 268
},
{
"epoch": 0.0030087129643092824,
"grad_norm": 0.04518941417336464,
"learning_rate": 0.00019999553287576238,
"loss": 1.5809,
"step": 269
},
{
"epoch": 0.0030198977708680527,
"grad_norm": 0.04687381908297539,
"learning_rate": 0.0001999954996014585,
"loss": 1.5999,
"step": 270
},
{
"epoch": 0.0030310825774268234,
"grad_norm": 0.04809357225894928,
"learning_rate": 0.00019999546620369152,
"loss": 1.5972,
"step": 271
},
{
"epoch": 0.003042267383985594,
"grad_norm": 0.05907173454761505,
"learning_rate": 0.0001999954326824615,
"loss": 1.5862,
"step": 272
},
{
"epoch": 0.0030534521905443645,
"grad_norm": 0.06583942472934723,
"learning_rate": 0.00019999539903776842,
"loss": 1.5846,
"step": 273
},
{
"epoch": 0.0030646369971031352,
"grad_norm": 0.07557905465364456,
"learning_rate": 0.0001999953652696124,
"loss": 1.5962,
"step": 274
},
{
"epoch": 0.0030758218036619055,
"grad_norm": 0.07007817178964615,
"learning_rate": 0.00019999533137799347,
"loss": 1.5951,
"step": 275
},
{
"epoch": 0.0030870066102206763,
"grad_norm": 0.05711887776851654,
"learning_rate": 0.00019999529736291162,
"loss": 1.5932,
"step": 276
},
{
"epoch": 0.0030981914167794466,
"grad_norm": 0.04450292885303497,
"learning_rate": 0.00019999526322436696,
"loss": 1.6071,
"step": 277
},
{
"epoch": 0.0031093762233382173,
"grad_norm": 0.04260997474193573,
"learning_rate": 0.00019999522896235947,
"loss": 1.6038,
"step": 278
},
{
"epoch": 0.003120561029896988,
"grad_norm": 0.05689796805381775,
"learning_rate": 0.00019999519457688925,
"loss": 1.5688,
"step": 279
},
{
"epoch": 0.0031317458364557584,
"grad_norm": 0.06330379098653793,
"learning_rate": 0.0001999951600679563,
"loss": 1.5617,
"step": 280
},
{
"epoch": 0.003142930643014529,
"grad_norm": 0.06195618584752083,
"learning_rate": 0.00019999512543556066,
"loss": 1.5602,
"step": 281
},
{
"epoch": 0.0031541154495732995,
"grad_norm": 0.06677111238241196,
"learning_rate": 0.0001999950906797024,
"loss": 1.5659,
"step": 282
},
{
"epoch": 0.00316530025613207,
"grad_norm": 0.05750421807169914,
"learning_rate": 0.00019999505580038153,
"loss": 1.5759,
"step": 283
},
{
"epoch": 0.003176485062690841,
"grad_norm": 0.04907039552927017,
"learning_rate": 0.00019999502079759817,
"loss": 1.5833,
"step": 284
},
{
"epoch": 0.0031876698692496113,
"grad_norm": 0.048877034336328506,
"learning_rate": 0.00019999498567135223,
"loss": 1.5836,
"step": 285
},
{
"epoch": 0.003198854675808382,
"grad_norm": 0.05494236946105957,
"learning_rate": 0.0001999949504216439,
"loss": 1.5837,
"step": 286
},
{
"epoch": 0.0032100394823671523,
"grad_norm": 0.04953937977552414,
"learning_rate": 0.00019999491504847313,
"loss": 1.5794,
"step": 287
},
{
"epoch": 0.003221224288925923,
"grad_norm": 0.05240803211927414,
"learning_rate": 0.00019999487955184,
"loss": 1.5894,
"step": 288
},
{
"epoch": 0.0032324090954846934,
"grad_norm": 0.05633338540792465,
"learning_rate": 0.0001999948439317445,
"loss": 1.5755,
"step": 289
},
{
"epoch": 0.003243593902043464,
"grad_norm": 0.06563600897789001,
"learning_rate": 0.00019999480818818675,
"loss": 1.5912,
"step": 290
},
{
"epoch": 0.003254778708602235,
"grad_norm": 0.05903002619743347,
"learning_rate": 0.00019999477232116676,
"loss": 1.5856,
"step": 291
},
{
"epoch": 0.003265963515161005,
"grad_norm": 0.03582334890961647,
"learning_rate": 0.00019999473633068457,
"loss": 1.6079,
"step": 292
},
{
"epoch": 0.003277148321719776,
"grad_norm": 0.05011364817619324,
"learning_rate": 0.00019999470021674025,
"loss": 1.5907,
"step": 293
},
{
"epoch": 0.0032883331282785463,
"grad_norm": 0.0577118918299675,
"learning_rate": 0.0001999946639793338,
"loss": 1.5955,
"step": 294
},
{
"epoch": 0.003299517934837317,
"grad_norm": 0.05170518904924393,
"learning_rate": 0.00019999462761846528,
"loss": 1.5884,
"step": 295
},
{
"epoch": 0.0033107027413960878,
"grad_norm": 0.05011725425720215,
"learning_rate": 0.00019999459113413475,
"loss": 1.6046,
"step": 296
},
{
"epoch": 0.003321887547954858,
"grad_norm": 0.05645633116364479,
"learning_rate": 0.00019999455452634224,
"loss": 1.596,
"step": 297
},
{
"epoch": 0.003333072354513629,
"grad_norm": 0.05705921724438667,
"learning_rate": 0.0001999945177950878,
"loss": 1.6136,
"step": 298
},
{
"epoch": 0.003344257161072399,
"grad_norm": 0.05761184170842171,
"learning_rate": 0.0001999944809403715,
"loss": 1.6095,
"step": 299
},
{
"epoch": 0.00335544196763117,
"grad_norm": 0.0613851472735405,
"learning_rate": 0.00019999444396219337,
"loss": 1.6141,
"step": 300
},
{
"epoch": 0.00336662677418994,
"grad_norm": 0.06220489367842674,
"learning_rate": 0.00019999440686055344,
"loss": 1.5988,
"step": 301
},
{
"epoch": 0.003377811580748711,
"grad_norm": 0.062393296509981155,
"learning_rate": 0.00019999436963545177,
"loss": 1.6075,
"step": 302
},
{
"epoch": 0.0033889963873074817,
"grad_norm": 0.0625912994146347,
"learning_rate": 0.00019999433228688838,
"loss": 1.6102,
"step": 303
},
{
"epoch": 0.003400181193866252,
"grad_norm": 0.06049802899360657,
"learning_rate": 0.00019999429481486335,
"loss": 1.5968,
"step": 304
},
{
"epoch": 0.0034113660004250227,
"grad_norm": 0.05767315998673439,
"learning_rate": 0.00019999425721937674,
"loss": 1.5906,
"step": 305
},
{
"epoch": 0.003422550806983793,
"grad_norm": 0.049920015037059784,
"learning_rate": 0.00019999421950042854,
"loss": 1.5694,
"step": 306
},
{
"epoch": 0.003433735613542564,
"grad_norm": 0.04852724075317383,
"learning_rate": 0.0001999941816580188,
"loss": 1.5609,
"step": 307
},
{
"epoch": 0.0034449204201013345,
"grad_norm": 0.05249037966132164,
"learning_rate": 0.00019999414369214767,
"loss": 1.5675,
"step": 308
},
{
"epoch": 0.003456105226660105,
"grad_norm": 0.05167357623577118,
"learning_rate": 0.00019999410560281506,
"loss": 1.5645,
"step": 309
},
{
"epoch": 0.0034672900332188756,
"grad_norm": 0.05197747051715851,
"learning_rate": 0.00019999406739002108,
"loss": 1.5707,
"step": 310
},
{
"epoch": 0.003478474839777646,
"grad_norm": 0.05140923336148262,
"learning_rate": 0.00019999402905376582,
"loss": 1.5745,
"step": 311
},
{
"epoch": 0.0034896596463364167,
"grad_norm": 0.05357779935002327,
"learning_rate": 0.00019999399059404923,
"loss": 1.5824,
"step": 312
},
{
"epoch": 0.003500844452895187,
"grad_norm": 0.04196302220225334,
"learning_rate": 0.00019999395201087143,
"loss": 1.5685,
"step": 313
},
{
"epoch": 0.0035120292594539577,
"grad_norm": 0.04698769748210907,
"learning_rate": 0.00019999391330423246,
"loss": 1.5645,
"step": 314
},
{
"epoch": 0.0035232140660127285,
"grad_norm": 0.055174414068460464,
"learning_rate": 0.00019999387447413236,
"loss": 1.5702,
"step": 315
},
{
"epoch": 0.003534398872571499,
"grad_norm": 0.05560048297047615,
"learning_rate": 0.00019999383552057114,
"loss": 1.5746,
"step": 316
},
{
"epoch": 0.0035455836791302695,
"grad_norm": 0.059730809181928635,
"learning_rate": 0.0001999937964435489,
"loss": 1.5698,
"step": 317
},
{
"epoch": 0.00355676848568904,
"grad_norm": 0.06850636750459671,
"learning_rate": 0.00019999375724306568,
"loss": 1.5837,
"step": 318
},
{
"epoch": 0.0035679532922478106,
"grad_norm": 0.0658111497759819,
"learning_rate": 0.00019999371791912148,
"loss": 1.5759,
"step": 319
},
{
"epoch": 0.0035791380988065813,
"grad_norm": 0.05440279841423035,
"learning_rate": 0.00019999367847171643,
"loss": 1.5873,
"step": 320
},
{
"epoch": 0.0035903229053653517,
"grad_norm": 0.06169600412249565,
"learning_rate": 0.0001999936389008505,
"loss": 1.5791,
"step": 321
},
{
"epoch": 0.0036015077119241224,
"grad_norm": 0.06897033751010895,
"learning_rate": 0.0001999935992065238,
"loss": 1.5815,
"step": 322
},
{
"epoch": 0.0036126925184828927,
"grad_norm": 0.06641620397567749,
"learning_rate": 0.00019999355938873635,
"loss": 1.5826,
"step": 323
},
{
"epoch": 0.0036238773250416635,
"grad_norm": 0.057002220302820206,
"learning_rate": 0.00019999351944748818,
"loss": 1.5716,
"step": 324
},
{
"epoch": 0.0036350621316004338,
"grad_norm": 0.06431427597999573,
"learning_rate": 0.00019999347938277938,
"loss": 1.5626,
"step": 325
},
{
"epoch": 0.0036462469381592045,
"grad_norm": 0.06504250317811966,
"learning_rate": 0.00019999343919460997,
"loss": 1.574,
"step": 326
},
{
"epoch": 0.0036574317447179753,
"grad_norm": 0.06940289586782455,
"learning_rate": 0.00019999339888298004,
"loss": 1.5676,
"step": 327
},
{
"epoch": 0.0036686165512767456,
"grad_norm": 0.06492604315280914,
"learning_rate": 0.00019999335844788957,
"loss": 1.5676,
"step": 328
},
{
"epoch": 0.0036798013578355163,
"grad_norm": 0.07069146633148193,
"learning_rate": 0.0001999933178893387,
"loss": 1.5601,
"step": 329
},
{
"epoch": 0.0036909861643942866,
"grad_norm": 0.07502440363168716,
"learning_rate": 0.00019999327720732736,
"loss": 1.5651,
"step": 330
},
{
"epoch": 0.0037021709709530574,
"grad_norm": 0.06407099217176437,
"learning_rate": 0.00019999323640185573,
"loss": 1.5818,
"step": 331
},
{
"epoch": 0.003713355777511828,
"grad_norm": 0.05621904134750366,
"learning_rate": 0.00019999319547292377,
"loss": 1.5975,
"step": 332
},
{
"epoch": 0.0037245405840705985,
"grad_norm": 0.053219810128211975,
"learning_rate": 0.00019999315442053157,
"loss": 1.5925,
"step": 333
},
{
"epoch": 0.003735725390629369,
"grad_norm": 0.0538603812456131,
"learning_rate": 0.00019999311324467919,
"loss": 1.5784,
"step": 334
},
{
"epoch": 0.0037469101971881395,
"grad_norm": 0.05239463597536087,
"learning_rate": 0.00019999307194536664,
"loss": 1.5782,
"step": 335
},
{
"epoch": 0.0037580950037469103,
"grad_norm": 0.053746242076158524,
"learning_rate": 0.00019999303052259398,
"loss": 1.5802,
"step": 336
},
{
"epoch": 0.0037692798103056806,
"grad_norm": 0.04721551761031151,
"learning_rate": 0.0001999929889763613,
"loss": 1.5739,
"step": 337
},
{
"epoch": 0.0037804646168644513,
"grad_norm": 0.04483070224523544,
"learning_rate": 0.00019999294730666862,
"loss": 1.5823,
"step": 338
},
{
"epoch": 0.003791649423423222,
"grad_norm": 0.05224015936255455,
"learning_rate": 0.000199992905513516,
"loss": 1.5559,
"step": 339
},
{
"epoch": 0.0038028342299819924,
"grad_norm": 0.05772995948791504,
"learning_rate": 0.0001999928635969035,
"loss": 1.5574,
"step": 340
},
{
"epoch": 0.003814019036540763,
"grad_norm": 0.059287503361701965,
"learning_rate": 0.00019999282155683116,
"loss": 1.5694,
"step": 341
},
{
"epoch": 0.0038252038430995334,
"grad_norm": 0.050815433263778687,
"learning_rate": 0.00019999277939329902,
"loss": 1.5655,
"step": 342
},
{
"epoch": 0.003836388649658304,
"grad_norm": 0.047384679317474365,
"learning_rate": 0.00019999273710630714,
"loss": 1.5704,
"step": 343
},
{
"epoch": 0.003847573456217075,
"grad_norm": 0.04918666183948517,
"learning_rate": 0.00019999269469585555,
"loss": 1.5694,
"step": 344
},
{
"epoch": 0.0038587582627758452,
"grad_norm": 0.058182474225759506,
"learning_rate": 0.0001999926521619444,
"loss": 1.5971,
"step": 345
},
{
"epoch": 0.003869943069334616,
"grad_norm": 0.07194909453392029,
"learning_rate": 0.00019999260950457362,
"loss": 1.6033,
"step": 346
},
{
"epoch": 0.0038811278758933863,
"grad_norm": 0.08051355183124542,
"learning_rate": 0.0001999925667237433,
"loss": 1.5969,
"step": 347
},
{
"epoch": 0.003892312682452157,
"grad_norm": 0.09640984982252121,
"learning_rate": 0.00019999252381945357,
"loss": 1.6414,
"step": 348
},
{
"epoch": 0.0039034974890109274,
"grad_norm": 0.11357203125953674,
"learning_rate": 0.00019999248079170437,
"loss": 1.6522,
"step": 349
},
{
"epoch": 0.003914682295569698,
"grad_norm": 0.14623580873012543,
"learning_rate": 0.00019999243764049586,
"loss": 1.6482,
"step": 350
},
{
"epoch": 0.003925867102128468,
"grad_norm": 0.158810093998909,
"learning_rate": 0.00019999239436582796,
"loss": 1.5814,
"step": 351
},
{
"epoch": 0.00393705190868724,
"grad_norm": 0.11811669170856476,
"learning_rate": 0.00019999235096770086,
"loss": 1.5782,
"step": 352
},
{
"epoch": 0.00394823671524601,
"grad_norm": 0.09518411755561829,
"learning_rate": 0.0001999923074461145,
"loss": 1.5852,
"step": 353
},
{
"epoch": 0.00395942152180478,
"grad_norm": 0.1165471076965332,
"learning_rate": 0.00019999226380106906,
"loss": 1.5766,
"step": 354
},
{
"epoch": 0.0039706063283635505,
"grad_norm": 0.09517768025398254,
"learning_rate": 0.00019999222003256448,
"loss": 1.5829,
"step": 355
},
{
"epoch": 0.003981791134922322,
"grad_norm": 0.06006886065006256,
"learning_rate": 0.00019999217614060085,
"loss": 1.5708,
"step": 356
},
{
"epoch": 0.003992975941481092,
"grad_norm": 0.08933499455451965,
"learning_rate": 0.00019999213212517825,
"loss": 1.5718,
"step": 357
},
{
"epoch": 0.004004160748039862,
"grad_norm": 0.0867113471031189,
"learning_rate": 0.0001999920879862967,
"loss": 1.5711,
"step": 358
},
{
"epoch": 0.0040153455545986335,
"grad_norm": 0.06603030860424042,
"learning_rate": 0.00019999204372395628,
"loss": 1.5778,
"step": 359
},
{
"epoch": 0.004026530361157404,
"grad_norm": 0.07514684647321701,
"learning_rate": 0.00019999199933815702,
"loss": 1.5738,
"step": 360
},
{
"epoch": 0.004037715167716174,
"grad_norm": 0.060728590935468674,
"learning_rate": 0.00019999195482889897,
"loss": 1.5694,
"step": 361
},
{
"epoch": 0.004048899974274945,
"grad_norm": 0.06927715986967087,
"learning_rate": 0.00019999191019618224,
"loss": 1.5664,
"step": 362
},
{
"epoch": 0.004060084780833716,
"grad_norm": 0.0576176755130291,
"learning_rate": 0.00019999186544000685,
"loss": 1.5704,
"step": 363
},
{
"epoch": 0.004071269587392486,
"grad_norm": 0.047579534351825714,
"learning_rate": 0.00019999182056037285,
"loss": 1.5732,
"step": 364
},
{
"epoch": 0.004082454393951256,
"grad_norm": 0.06114533543586731,
"learning_rate": 0.00019999177555728027,
"loss": 1.5776,
"step": 365
},
{
"epoch": 0.0040936392005100275,
"grad_norm": 0.05183887854218483,
"learning_rate": 0.0001999917304307292,
"loss": 1.5697,
"step": 366
},
{
"epoch": 0.004104824007068798,
"grad_norm": 0.05595005676150322,
"learning_rate": 0.0001999916851807197,
"loss": 1.5611,
"step": 367
},
{
"epoch": 0.004116008813627568,
"grad_norm": 0.04884869232773781,
"learning_rate": 0.00019999163980725183,
"loss": 1.5754,
"step": 368
},
{
"epoch": 0.004127193620186339,
"grad_norm": 0.050160013139247894,
"learning_rate": 0.00019999159431032562,
"loss": 1.5818,
"step": 369
},
{
"epoch": 0.00413837842674511,
"grad_norm": 0.0458194725215435,
"learning_rate": 0.00019999154868994111,
"loss": 1.5821,
"step": 370
},
{
"epoch": 0.00414956323330388,
"grad_norm": 0.04877634719014168,
"learning_rate": 0.00019999150294609845,
"loss": 1.5828,
"step": 371
},
{
"epoch": 0.00416074803986265,
"grad_norm": 0.045320361852645874,
"learning_rate": 0.00019999145707879758,
"loss": 1.5946,
"step": 372
},
{
"epoch": 0.004171932846421421,
"grad_norm": 0.05053291842341423,
"learning_rate": 0.00019999141108803864,
"loss": 1.6002,
"step": 373
},
{
"epoch": 0.004183117652980192,
"grad_norm": 0.053211431950330734,
"learning_rate": 0.0001999913649738216,
"loss": 1.5914,
"step": 374
},
{
"epoch": 0.004194302459538962,
"grad_norm": 0.045671332627534866,
"learning_rate": 0.00019999131873614664,
"loss": 1.5806,
"step": 375
},
{
"epoch": 0.004205487266097733,
"grad_norm": 0.051272232085466385,
"learning_rate": 0.0001999912723750137,
"loss": 1.5898,
"step": 376
},
{
"epoch": 0.0042166720726565035,
"grad_norm": 0.05297670140862465,
"learning_rate": 0.0001999912258904229,
"loss": 1.6003,
"step": 377
},
{
"epoch": 0.004227856879215274,
"grad_norm": 0.044414643198251724,
"learning_rate": 0.00019999117928237427,
"loss": 1.6069,
"step": 378
},
{
"epoch": 0.004239041685774044,
"grad_norm": 0.04553841054439545,
"learning_rate": 0.0001999911325508679,
"loss": 1.5798,
"step": 379
},
{
"epoch": 0.004250226492332815,
"grad_norm": 0.05364730581641197,
"learning_rate": 0.00019999108569590383,
"loss": 1.5856,
"step": 380
},
{
"epoch": 0.004261411298891586,
"grad_norm": 0.05173739790916443,
"learning_rate": 0.0001999910387174821,
"loss": 1.5764,
"step": 381
},
{
"epoch": 0.004272596105450356,
"grad_norm": 0.05577515438199043,
"learning_rate": 0.00019999099161560282,
"loss": 1.5855,
"step": 382
},
{
"epoch": 0.004283780912009127,
"grad_norm": 0.057436104863882065,
"learning_rate": 0.00019999094439026598,
"loss": 1.5785,
"step": 383
},
{
"epoch": 0.0042949657185678974,
"grad_norm": 0.03927776962518692,
"learning_rate": 0.00019999089704147166,
"loss": 1.5735,
"step": 384
},
{
"epoch": 0.004306150525126668,
"grad_norm": 0.04739474132657051,
"learning_rate": 0.00019999084956921997,
"loss": 1.5805,
"step": 385
},
{
"epoch": 0.004317335331685439,
"grad_norm": 0.04832485690712929,
"learning_rate": 0.0001999908019735109,
"loss": 1.6164,
"step": 386
},
{
"epoch": 0.004328520138244209,
"grad_norm": 0.049625612795352936,
"learning_rate": 0.00019999075425434452,
"loss": 1.6468,
"step": 387
},
{
"epoch": 0.0043397049448029796,
"grad_norm": 0.04835371673107147,
"learning_rate": 0.00019999070641172094,
"loss": 1.6438,
"step": 388
},
{
"epoch": 0.00435088975136175,
"grad_norm": 0.05029625818133354,
"learning_rate": 0.00019999065844564018,
"loss": 1.6103,
"step": 389
},
{
"epoch": 0.004362074557920521,
"grad_norm": 0.055567558854818344,
"learning_rate": 0.0001999906103561023,
"loss": 1.602,
"step": 390
},
{
"epoch": 0.004373259364479291,
"grad_norm": 0.06654093414545059,
"learning_rate": 0.00019999056214310733,
"loss": 1.5815,
"step": 391
},
{
"epoch": 0.004384444171038062,
"grad_norm": 0.06477198004722595,
"learning_rate": 0.0001999905138066554,
"loss": 1.5883,
"step": 392
},
{
"epoch": 0.004395628977596833,
"grad_norm": 0.0623844638466835,
"learning_rate": 0.00019999046534674656,
"loss": 1.5877,
"step": 393
},
{
"epoch": 0.004406813784155603,
"grad_norm": 0.059734586626291275,
"learning_rate": 0.00019999041676338077,
"loss": 1.5855,
"step": 394
},
{
"epoch": 0.0044179985907143735,
"grad_norm": 0.05187408998608589,
"learning_rate": 0.0001999903680565582,
"loss": 1.5827,
"step": 395
},
{
"epoch": 0.004429183397273144,
"grad_norm": 0.05175703763961792,
"learning_rate": 0.00019999031922627886,
"loss": 1.5758,
"step": 396
},
{
"epoch": 0.004440368203831915,
"grad_norm": 0.05059249326586723,
"learning_rate": 0.00019999027027254286,
"loss": 1.5758,
"step": 397
},
{
"epoch": 0.004451553010390685,
"grad_norm": 0.050220511853694916,
"learning_rate": 0.0001999902211953502,
"loss": 1.5771,
"step": 398
},
{
"epoch": 0.004462737816949456,
"grad_norm": 0.057194001972675323,
"learning_rate": 0.00019999017199470094,
"loss": 1.5828,
"step": 399
},
{
"epoch": 0.004473922623508227,
"grad_norm": 0.07026943564414978,
"learning_rate": 0.00019999012267059519,
"loss": 1.5728,
"step": 400
},
{
"epoch": 0.004485107430066997,
"grad_norm": 0.08094791322946548,
"learning_rate": 0.00019999007322303296,
"loss": 1.5634,
"step": 401
},
{
"epoch": 0.004496292236625767,
"grad_norm": 0.08370808511972427,
"learning_rate": 0.0001999900236520144,
"loss": 1.5747,
"step": 402
},
{
"epoch": 0.004507477043184538,
"grad_norm": 0.09409447014331818,
"learning_rate": 0.00019998997395753945,
"loss": 1.5703,
"step": 403
},
{
"epoch": 0.004518661849743309,
"grad_norm": 0.09207552671432495,
"learning_rate": 0.0001999899241396082,
"loss": 1.5628,
"step": 404
},
{
"epoch": 0.004529846656302079,
"grad_norm": 0.07077619433403015,
"learning_rate": 0.0001999898741982208,
"loss": 1.5826,
"step": 405
},
{
"epoch": 0.0045410314628608495,
"grad_norm": 0.06743451207876205,
"learning_rate": 0.00019998982413337724,
"loss": 1.6047,
"step": 406
},
{
"epoch": 0.004552216269419621,
"grad_norm": 0.10414531826972961,
"learning_rate": 0.0001999897739450776,
"loss": 1.6005,
"step": 407
},
{
"epoch": 0.004563401075978391,
"grad_norm": 0.10947459191083908,
"learning_rate": 0.0001999897236333219,
"loss": 1.6025,
"step": 408
},
{
"epoch": 0.004574585882537161,
"grad_norm": 0.06659277528524399,
"learning_rate": 0.00019998967319811027,
"loss": 1.6059,
"step": 409
},
{
"epoch": 0.004585770689095932,
"grad_norm": 0.06038579344749451,
"learning_rate": 0.00019998962263944274,
"loss": 1.5826,
"step": 410
},
{
"epoch": 0.004596955495654703,
"grad_norm": 0.08695773035287857,
"learning_rate": 0.00019998957195731934,
"loss": 1.5779,
"step": 411
},
{
"epoch": 0.004608140302213473,
"grad_norm": 0.07446157187223434,
"learning_rate": 0.0001999895211517402,
"loss": 1.5812,
"step": 412
},
{
"epoch": 0.0046193251087722435,
"grad_norm": 0.04878260940313339,
"learning_rate": 0.00019998947022270534,
"loss": 1.6033,
"step": 413
},
{
"epoch": 0.004630509915331015,
"grad_norm": 0.06283459812402725,
"learning_rate": 0.00019998941917021484,
"loss": 1.5879,
"step": 414
},
{
"epoch": 0.004641694721889785,
"grad_norm": 0.05891675129532814,
"learning_rate": 0.00019998936799426874,
"loss": 1.59,
"step": 415
},
{
"epoch": 0.004652879528448555,
"grad_norm": 0.04745139181613922,
"learning_rate": 0.0001999893166948671,
"loss": 1.5981,
"step": 416
},
{
"epoch": 0.0046640643350073265,
"grad_norm": 0.05297010764479637,
"learning_rate": 0.00019998926527201003,
"loss": 1.5937,
"step": 417
},
{
"epoch": 0.004675249141566097,
"grad_norm": 0.05115678906440735,
"learning_rate": 0.00019998921372569757,
"loss": 1.5941,
"step": 418
},
{
"epoch": 0.004686433948124867,
"grad_norm": 0.05678752437233925,
"learning_rate": 0.00019998916205592974,
"loss": 1.5631,
"step": 419
},
{
"epoch": 0.004697618754683637,
"grad_norm": 0.05227034166455269,
"learning_rate": 0.00019998911026270668,
"loss": 1.5755,
"step": 420
},
{
"epoch": 0.004708803561242409,
"grad_norm": 0.05871938541531563,
"learning_rate": 0.0001999890583460284,
"loss": 1.5843,
"step": 421
},
{
"epoch": 0.004719988367801179,
"grad_norm": 0.06751812249422073,
"learning_rate": 0.00019998900630589493,
"loss": 1.5922,
"step": 422
},
{
"epoch": 0.004731173174359949,
"grad_norm": 0.061179131269454956,
"learning_rate": 0.00019998895414230646,
"loss": 1.5939,
"step": 423
},
{
"epoch": 0.00474235798091872,
"grad_norm": 0.06404510140419006,
"learning_rate": 0.00019998890185526292,
"loss": 1.5857,
"step": 424
},
{
"epoch": 0.004753542787477491,
"grad_norm": 0.07117751985788345,
"learning_rate": 0.0001999888494447645,
"loss": 1.5676,
"step": 425
},
{
"epoch": 0.004764727594036261,
"grad_norm": 0.06996233761310577,
"learning_rate": 0.00019998879691081114,
"loss": 1.5769,
"step": 426
},
{
"epoch": 0.004775912400595031,
"grad_norm": 0.0711674690246582,
"learning_rate": 0.00019998874425340298,
"loss": 1.5622,
"step": 427
},
{
"epoch": 0.0047870972071538025,
"grad_norm": 0.079228475689888,
"learning_rate": 0.0001999886914725401,
"loss": 1.571,
"step": 428
},
{
"epoch": 0.004798282013712573,
"grad_norm": 0.08016793429851532,
"learning_rate": 0.00019998863856822248,
"loss": 1.6029,
"step": 429
},
{
"epoch": 0.004809466820271343,
"grad_norm": 0.0795593336224556,
"learning_rate": 0.00019998858554045026,
"loss": 1.6062,
"step": 430
},
{
"epoch": 0.004820651626830114,
"grad_norm": 0.06341350823640823,
"learning_rate": 0.00019998853238922348,
"loss": 1.5884,
"step": 431
},
{
"epoch": 0.004831836433388885,
"grad_norm": 0.05454142764210701,
"learning_rate": 0.00019998847911454219,
"loss": 1.5921,
"step": 432
},
{
"epoch": 0.004843021239947655,
"grad_norm": 0.0625983327627182,
"learning_rate": 0.0001999884257164065,
"loss": 1.582,
"step": 433
},
{
"epoch": 0.004854206046506425,
"grad_norm": 0.06116988882422447,
"learning_rate": 0.00019998837219481645,
"loss": 1.5804,
"step": 434
},
{
"epoch": 0.004865390853065196,
"grad_norm": 0.057872429490089417,
"learning_rate": 0.0001999883185497721,
"loss": 1.5721,
"step": 435
},
{
"epoch": 0.004876575659623967,
"grad_norm": 0.04977230727672577,
"learning_rate": 0.00019998826478127352,
"loss": 1.5539,
"step": 436
},
{
"epoch": 0.004887760466182737,
"grad_norm": 0.06137122958898544,
"learning_rate": 0.00019998821088932077,
"loss": 1.5715,
"step": 437
},
{
"epoch": 0.004898945272741508,
"grad_norm": 0.05595165491104126,
"learning_rate": 0.00019998815687391396,
"loss": 1.5841,
"step": 438
},
{
"epoch": 0.0049101300793002785,
"grad_norm": 0.049001295119524,
"learning_rate": 0.00019998810273505311,
"loss": 1.571,
"step": 439
},
{
"epoch": 0.004921314885859049,
"grad_norm": 0.04792420566082001,
"learning_rate": 0.00019998804847273828,
"loss": 1.5695,
"step": 440
},
{
"epoch": 0.00493249969241782,
"grad_norm": 0.052222106605768204,
"learning_rate": 0.00019998799408696956,
"loss": 1.5671,
"step": 441
},
{
"epoch": 0.00494368449897659,
"grad_norm": 0.05545085668563843,
"learning_rate": 0.00019998793957774703,
"loss": 1.5679,
"step": 442
},
{
"epoch": 0.004954869305535361,
"grad_norm": 0.06147260218858719,
"learning_rate": 0.00019998788494507075,
"loss": 1.5746,
"step": 443
},
{
"epoch": 0.004966054112094131,
"grad_norm": 0.06299655884504318,
"learning_rate": 0.00019998783018894073,
"loss": 1.5719,
"step": 444
},
{
"epoch": 0.004977238918652902,
"grad_norm": 0.05477641522884369,
"learning_rate": 0.00019998777530935713,
"loss": 1.5933,
"step": 445
},
{
"epoch": 0.0049884237252116725,
"grad_norm": 0.054924603551626205,
"learning_rate": 0.00019998772030631993,
"loss": 1.6381,
"step": 446
},
{
"epoch": 0.004999608531770443,
"grad_norm": 0.05982334539294243,
"learning_rate": 0.0001999876651798293,
"loss": 1.6049,
"step": 447
},
{
"epoch": 0.005010793338329214,
"grad_norm": 0.07177302241325378,
"learning_rate": 0.0001999876099298852,
"loss": 1.5885,
"step": 448
},
{
"epoch": 0.005021978144887984,
"grad_norm": 0.06020566448569298,
"learning_rate": 0.00019998755455648778,
"loss": 1.5918,
"step": 449
},
{
"epoch": 0.005033162951446755,
"grad_norm": 0.0725252702832222,
"learning_rate": 0.00019998749905963706,
"loss": 1.5948,
"step": 450
},
{
"epoch": 0.005044347758005525,
"grad_norm": 0.07799220085144043,
"learning_rate": 0.00019998744343933313,
"loss": 1.5903,
"step": 451
},
{
"epoch": 0.005055532564564296,
"grad_norm": 0.06732252240180969,
"learning_rate": 0.00019998738769557605,
"loss": 1.6123,
"step": 452
},
{
"epoch": 0.005066717371123066,
"grad_norm": 0.056653380393981934,
"learning_rate": 0.0001999873318283659,
"loss": 1.5976,
"step": 453
},
{
"epoch": 0.005077902177681837,
"grad_norm": 0.06148442253470421,
"learning_rate": 0.00019998727583770274,
"loss": 1.5826,
"step": 454
},
{
"epoch": 0.005089086984240608,
"grad_norm": 0.0657142624258995,
"learning_rate": 0.00019998721972358662,
"loss": 1.5504,
"step": 455
},
{
"epoch": 0.005100271790799378,
"grad_norm": 0.06259225308895111,
"learning_rate": 0.00019998716348601766,
"loss": 1.552,
"step": 456
},
{
"epoch": 0.0051114565973581485,
"grad_norm": 0.08781653642654419,
"learning_rate": 0.00019998710712499585,
"loss": 1.5523,
"step": 457
},
{
"epoch": 0.005122641403916919,
"grad_norm": 0.0888068750500679,
"learning_rate": 0.00019998705064052137,
"loss": 1.5581,
"step": 458
},
{
"epoch": 0.00513382621047569,
"grad_norm": 0.11727220565080643,
"learning_rate": 0.0001999869940325942,
"loss": 1.5714,
"step": 459
},
{
"epoch": 0.00514501101703446,
"grad_norm": 0.10440998524427414,
"learning_rate": 0.00019998693730121443,
"loss": 1.5567,
"step": 460
},
{
"epoch": 0.005156195823593231,
"grad_norm": 0.0791282206773758,
"learning_rate": 0.00019998688044638215,
"loss": 1.5605,
"step": 461
},
{
"epoch": 0.005167380630152002,
"grad_norm": 0.07670507580041885,
"learning_rate": 0.0001999868234680974,
"loss": 1.5676,
"step": 462
},
{
"epoch": 0.005178565436710772,
"grad_norm": 0.09401030838489532,
"learning_rate": 0.0001999867663663603,
"loss": 1.555,
"step": 463
},
{
"epoch": 0.0051897502432695424,
"grad_norm": 0.08279041945934296,
"learning_rate": 0.00019998670914117087,
"loss": 1.5436,
"step": 464
},
{
"epoch": 0.005200935049828314,
"grad_norm": 0.17070412635803223,
"learning_rate": 0.0001999866517925292,
"loss": 1.5575,
"step": 465
},
{
"epoch": 0.005212119856387084,
"grad_norm": 0.09067776054143906,
"learning_rate": 0.00019998659432043537,
"loss": 1.5665,
"step": 466
},
{
"epoch": 0.005223304662945854,
"grad_norm": 0.08899036049842834,
"learning_rate": 0.00019998653672488942,
"loss": 1.5678,
"step": 467
},
{
"epoch": 0.005234489469504625,
"grad_norm": 0.06300584226846695,
"learning_rate": 0.00019998647900589144,
"loss": 1.5826,
"step": 468
},
{
"epoch": 0.005245674276063396,
"grad_norm": 0.07066696137189865,
"learning_rate": 0.00019998642116344156,
"loss": 1.5696,
"step": 469
},
{
"epoch": 0.005256859082622166,
"grad_norm": 0.0663958340883255,
"learning_rate": 0.00019998636319753973,
"loss": 1.5878,
"step": 470
},
{
"epoch": 0.005268043889180936,
"grad_norm": 0.06905809789896011,
"learning_rate": 0.00019998630510818612,
"loss": 1.5667,
"step": 471
},
{
"epoch": 0.0052792286957397076,
"grad_norm": 0.05643589049577713,
"learning_rate": 0.00019998624689538077,
"loss": 1.5742,
"step": 472
},
{
"epoch": 0.005290413502298478,
"grad_norm": 0.05323821306228638,
"learning_rate": 0.00019998618855912375,
"loss": 1.5965,
"step": 473
},
{
"epoch": 0.005301598308857248,
"grad_norm": 0.07279177010059357,
"learning_rate": 0.0001999861300994151,
"loss": 1.5954,
"step": 474
},
{
"epoch": 0.0053127831154160185,
"grad_norm": 0.06261339038610458,
"learning_rate": 0.00019998607151625497,
"loss": 1.5771,
"step": 475
},
{
"epoch": 0.00532396792197479,
"grad_norm": 0.0605684369802475,
"learning_rate": 0.00019998601280964335,
"loss": 1.5674,
"step": 476
},
{
"epoch": 0.00533515272853356,
"grad_norm": 0.05855708196759224,
"learning_rate": 0.0001999859539795804,
"loss": 1.5641,
"step": 477
},
{
"epoch": 0.00534633753509233,
"grad_norm": 0.04459947720170021,
"learning_rate": 0.0001999858950260661,
"loss": 1.5706,
"step": 478
},
{
"epoch": 0.0053575223416511015,
"grad_norm": 0.05174221470952034,
"learning_rate": 0.00019998583594910057,
"loss": 1.5525,
"step": 479
},
{
"epoch": 0.005368707148209872,
"grad_norm": 0.047726552933454514,
"learning_rate": 0.0001999857767486839,
"loss": 1.5613,
"step": 480
},
{
"epoch": 0.005379891954768642,
"grad_norm": 0.056866295635700226,
"learning_rate": 0.0001999857174248161,
"loss": 1.5851,
"step": 481
},
{
"epoch": 0.005391076761327412,
"grad_norm": 0.05624596029520035,
"learning_rate": 0.00019998565797749732,
"loss": 1.5862,
"step": 482
},
{
"epoch": 0.005402261567886184,
"grad_norm": 0.057655058801174164,
"learning_rate": 0.0001999855984067276,
"loss": 1.5848,
"step": 483
},
{
"epoch": 0.005413446374444954,
"grad_norm": 0.06511086970567703,
"learning_rate": 0.000199985538712507,
"loss": 1.6081,
"step": 484
},
{
"epoch": 0.005424631181003724,
"grad_norm": 0.0913616269826889,
"learning_rate": 0.0001999854788948356,
"loss": 1.5973,
"step": 485
},
{
"epoch": 0.005435815987562495,
"grad_norm": 0.11560064554214478,
"learning_rate": 0.00019998541895371345,
"loss": 1.5767,
"step": 486
},
{
"epoch": 0.005447000794121266,
"grad_norm": 0.10205356776714325,
"learning_rate": 0.00019998535888914073,
"loss": 1.5641,
"step": 487
},
{
"epoch": 0.005458185600680036,
"grad_norm": 0.06806248426437378,
"learning_rate": 0.00019998529870111735,
"loss": 1.5586,
"step": 488
},
{
"epoch": 0.005469370407238807,
"grad_norm": 0.06459362804889679,
"learning_rate": 0.00019998523838964355,
"loss": 1.5532,
"step": 489
},
{
"epoch": 0.0054805552137975775,
"grad_norm": 0.07120149582624435,
"learning_rate": 0.00019998517795471928,
"loss": 1.5515,
"step": 490
},
{
"epoch": 0.005491740020356348,
"grad_norm": 0.059738751500844955,
"learning_rate": 0.00019998511739634464,
"loss": 1.5567,
"step": 491
},
{
"epoch": 0.005502924826915118,
"grad_norm": 0.06517786532640457,
"learning_rate": 0.00019998505671451976,
"loss": 1.595,
"step": 492
},
{
"epoch": 0.005514109633473889,
"grad_norm": 0.06841952353715897,
"learning_rate": 0.0001999849959092447,
"loss": 1.5805,
"step": 493
},
{
"epoch": 0.00552529444003266,
"grad_norm": 0.06286690384149551,
"learning_rate": 0.0001999849349805195,
"loss": 1.571,
"step": 494
},
{
"epoch": 0.00553647924659143,
"grad_norm": 0.0559217631816864,
"learning_rate": 0.00019998487392834422,
"loss": 1.5569,
"step": 495
},
{
"epoch": 0.005547664053150201,
"grad_norm": 0.0687541738152504,
"learning_rate": 0.000199984812752719,
"loss": 1.557,
"step": 496
},
{
"epoch": 0.0055588488597089715,
"grad_norm": 0.06690400838851929,
"learning_rate": 0.00019998475145364383,
"loss": 1.567,
"step": 497
},
{
"epoch": 0.005570033666267742,
"grad_norm": 0.06469254940748215,
"learning_rate": 0.00019998469003111892,
"loss": 1.5783,
"step": 498
},
{
"epoch": 0.005581218472826512,
"grad_norm": 0.06279771029949188,
"learning_rate": 0.0001999846284851442,
"loss": 1.588,
"step": 499
},
{
"epoch": 0.005592403279385283,
"grad_norm": 0.05500609427690506,
"learning_rate": 0.00019998456681571982,
"loss": 1.5775,
"step": 500
},
{
"epoch": 0.005603588085944054,
"grad_norm": 0.05660603567957878,
"learning_rate": 0.00019998450502284584,
"loss": 1.5847,
"step": 501
},
{
"epoch": 0.005614772892502824,
"grad_norm": 0.06350179761648178,
"learning_rate": 0.00019998444310652237,
"loss": 1.5859,
"step": 502
},
{
"epoch": 0.005625957699061595,
"grad_norm": 0.06947285681962967,
"learning_rate": 0.00019998438106674945,
"loss": 1.6142,
"step": 503
},
{
"epoch": 0.005637142505620365,
"grad_norm": 0.08556090295314789,
"learning_rate": 0.00019998431890352712,
"loss": 1.5949,
"step": 504
},
{
"epoch": 0.005648327312179136,
"grad_norm": 0.10163229703903198,
"learning_rate": 0.00019998425661685553,
"loss": 1.5819,
"step": 505
},
{
"epoch": 0.005659512118737906,
"grad_norm": 0.11865612119436264,
"learning_rate": 0.00019998419420673476,
"loss": 1.5799,
"step": 506
},
{
"epoch": 0.005670696925296677,
"grad_norm": 0.12710048258304596,
"learning_rate": 0.0001999841316731648,
"loss": 1.5837,
"step": 507
},
{
"epoch": 0.0056818817318554475,
"grad_norm": 0.10893180966377258,
"learning_rate": 0.00019998406901614583,
"loss": 1.5547,
"step": 508
},
{
"epoch": 0.005693066538414218,
"grad_norm": 0.06662983447313309,
"learning_rate": 0.00019998400623567788,
"loss": 1.5412,
"step": 509
},
{
"epoch": 0.005704251344972989,
"grad_norm": 0.06717602163553238,
"learning_rate": 0.000199983943331761,
"loss": 1.5471,
"step": 510
},
{
"epoch": 0.005715436151531759,
"grad_norm": 0.08228597790002823,
"learning_rate": 0.0001999838803043953,
"loss": 1.5446,
"step": 511
},
{
"epoch": 0.00572662095809053,
"grad_norm": 0.07614196836948395,
"learning_rate": 0.00019998381715358084,
"loss": 1.5574,
"step": 512
},
{
"epoch": 0.005737805764649301,
"grad_norm": 0.06075645610690117,
"learning_rate": 0.00019998375387931774,
"loss": 1.5597,
"step": 513
},
{
"epoch": 0.005748990571208071,
"grad_norm": 0.05882800742983818,
"learning_rate": 0.00019998369048160604,
"loss": 1.5559,
"step": 514
},
{
"epoch": 0.0057601753777668414,
"grad_norm": 0.07097506523132324,
"learning_rate": 0.0001999836269604458,
"loss": 1.5648,
"step": 515
},
{
"epoch": 0.005771360184325612,
"grad_norm": 0.06486310064792633,
"learning_rate": 0.00019998356331583716,
"loss": 1.5735,
"step": 516
},
{
"epoch": 0.005782544990884383,
"grad_norm": 0.05333361402153969,
"learning_rate": 0.00019998349954778016,
"loss": 1.567,
"step": 517
},
{
"epoch": 0.005793729797443153,
"grad_norm": 0.07817003130912781,
"learning_rate": 0.00019998343565627488,
"loss": 1.5902,
"step": 518
},
{
"epoch": 0.0058049146040019236,
"grad_norm": 0.07619974762201309,
"learning_rate": 0.00019998337164132138,
"loss": 1.5819,
"step": 519
},
{
"epoch": 0.005816099410560695,
"grad_norm": 0.06044092774391174,
"learning_rate": 0.0001999833075029198,
"loss": 1.6163,
"step": 520
},
{
"epoch": 0.005827284217119465,
"grad_norm": 0.06666608154773712,
"learning_rate": 0.00019998324324107015,
"loss": 1.5977,
"step": 521
},
{
"epoch": 0.005838469023678235,
"grad_norm": 0.06902644038200378,
"learning_rate": 0.00019998317885577254,
"loss": 1.5818,
"step": 522
},
{
"epoch": 0.005849653830237006,
"grad_norm": 0.05606195330619812,
"learning_rate": 0.00019998311434702703,
"loss": 1.5784,
"step": 523
},
{
"epoch": 0.005860838636795777,
"grad_norm": 0.08465290814638138,
"learning_rate": 0.00019998304971483374,
"loss": 1.5653,
"step": 524
},
{
"epoch": 0.005872023443354547,
"grad_norm": 0.08544803410768509,
"learning_rate": 0.00019998298495919274,
"loss": 1.5616,
"step": 525
},
{
"epoch": 0.0058832082499133175,
"grad_norm": 0.06527858972549438,
"learning_rate": 0.0001999829200801041,
"loss": 1.5879,
"step": 526
},
{
"epoch": 0.005894393056472089,
"grad_norm": 0.07150562107563019,
"learning_rate": 0.00019998285507756789,
"loss": 1.5642,
"step": 527
},
{
"epoch": 0.005905577863030859,
"grad_norm": 0.08152669668197632,
"learning_rate": 0.00019998278995158418,
"loss": 1.5744,
"step": 528
},
{
"epoch": 0.005916762669589629,
"grad_norm": 0.09149385243654251,
"learning_rate": 0.0001999827247021531,
"loss": 1.5715,
"step": 529
},
{
"epoch": 0.0059279474761484,
"grad_norm": 0.08523254096508026,
"learning_rate": 0.00019998265932927466,
"loss": 1.5822,
"step": 530
},
{
"epoch": 0.005939132282707171,
"grad_norm": 0.062498655170202255,
"learning_rate": 0.000199982593832949,
"loss": 1.5862,
"step": 531
},
{
"epoch": 0.005950317089265941,
"grad_norm": 0.07431355118751526,
"learning_rate": 0.0001999825282131762,
"loss": 1.574,
"step": 532
},
{
"epoch": 0.005961501895824711,
"grad_norm": 0.0720391795039177,
"learning_rate": 0.00019998246246995632,
"loss": 1.57,
"step": 533
},
{
"epoch": 0.005972686702383483,
"grad_norm": 0.06915175914764404,
"learning_rate": 0.00019998239660328943,
"loss": 1.5721,
"step": 534
},
{
"epoch": 0.005983871508942253,
"grad_norm": 0.058932509273290634,
"learning_rate": 0.00019998233061317561,
"loss": 1.583,
"step": 535
},
{
"epoch": 0.005995056315501023,
"grad_norm": 0.05271697789430618,
"learning_rate": 0.000199982264499615,
"loss": 1.5762,
"step": 536
},
{
"epoch": 0.006006241122059794,
"grad_norm": 0.05659927427768707,
"learning_rate": 0.0001999821982626076,
"loss": 1.5718,
"step": 537
},
{
"epoch": 0.006017425928618565,
"grad_norm": 0.05749264359474182,
"learning_rate": 0.00019998213190215353,
"loss": 1.5897,
"step": 538
},
{
"epoch": 0.006028610735177335,
"grad_norm": 0.06748930364847183,
"learning_rate": 0.0001999820654182529,
"loss": 1.5844,
"step": 539
},
{
"epoch": 0.006039795541736105,
"grad_norm": 0.06751269847154617,
"learning_rate": 0.0001999819988109057,
"loss": 1.5926,
"step": 540
},
{
"epoch": 0.0060509803482948765,
"grad_norm": 0.06434184312820435,
"learning_rate": 0.00019998193208011213,
"loss": 1.5742,
"step": 541
},
{
"epoch": 0.006062165154853647,
"grad_norm": 0.06833555549383163,
"learning_rate": 0.0001999818652258722,
"loss": 1.5818,
"step": 542
},
{
"epoch": 0.006073349961412417,
"grad_norm": 0.06202944368124008,
"learning_rate": 0.000199981798248186,
"loss": 1.5806,
"step": 543
},
{
"epoch": 0.006084534767971188,
"grad_norm": 0.06603850424289703,
"learning_rate": 0.00019998173114705366,
"loss": 1.5892,
"step": 544
},
{
"epoch": 0.006095719574529959,
"grad_norm": 0.04424119368195534,
"learning_rate": 0.00019998166392247522,
"loss": 1.6017,
"step": 545
},
{
"epoch": 0.006106904381088729,
"grad_norm": 0.04886182025074959,
"learning_rate": 0.00019998159657445074,
"loss": 1.5971,
"step": 546
},
{
"epoch": 0.006118089187647499,
"grad_norm": 0.055009353905916214,
"learning_rate": 0.00019998152910298035,
"loss": 1.5942,
"step": 547
},
{
"epoch": 0.0061292739942062704,
"grad_norm": 0.04949553683400154,
"learning_rate": 0.00019998146150806411,
"loss": 1.5851,
"step": 548
},
{
"epoch": 0.006140458800765041,
"grad_norm": 0.05669408291578293,
"learning_rate": 0.0001999813937897021,
"loss": 1.5565,
"step": 549
},
{
"epoch": 0.006151643607323811,
"grad_norm": 0.052044518291950226,
"learning_rate": 0.00019998132594789444,
"loss": 1.5652,
"step": 550
},
{
"epoch": 0.006162828413882582,
"grad_norm": 0.055031049996614456,
"learning_rate": 0.00019998125798264117,
"loss": 1.5509,
"step": 551
},
{
"epoch": 0.006174013220441353,
"grad_norm": 0.0653780847787857,
"learning_rate": 0.0001999811898939424,
"loss": 1.5672,
"step": 552
},
{
"epoch": 0.006185198027000123,
"grad_norm": 0.06647983938455582,
"learning_rate": 0.00019998112168179822,
"loss": 1.5744,
"step": 553
},
{
"epoch": 0.006196382833558893,
"grad_norm": 0.062013398855924606,
"learning_rate": 0.00019998105334620867,
"loss": 1.5545,
"step": 554
},
{
"epoch": 0.006207567640117664,
"grad_norm": 0.04633625969290733,
"learning_rate": 0.00019998098488717384,
"loss": 1.5643,
"step": 555
},
{
"epoch": 0.006218752446676435,
"grad_norm": 0.04178265854716301,
"learning_rate": 0.00019998091630469387,
"loss": 1.5484,
"step": 556
},
{
"epoch": 0.006229937253235205,
"grad_norm": 0.04647913947701454,
"learning_rate": 0.00019998084759876883,
"loss": 1.554,
"step": 557
},
{
"epoch": 0.006241122059793976,
"grad_norm": 0.05254526063799858,
"learning_rate": 0.00019998077876939876,
"loss": 1.5671,
"step": 558
},
{
"epoch": 0.0062523068663527465,
"grad_norm": 0.054666776210069656,
"learning_rate": 0.00019998070981658376,
"loss": 1.5709,
"step": 559
},
{
"epoch": 0.006263491672911517,
"grad_norm": 0.04982587322592735,
"learning_rate": 0.00019998064074032396,
"loss": 1.5645,
"step": 560
},
{
"epoch": 0.006274676479470288,
"grad_norm": 0.05644576624035835,
"learning_rate": 0.00019998057154061938,
"loss": 1.5512,
"step": 561
},
{
"epoch": 0.006285861286029058,
"grad_norm": 0.05311651527881622,
"learning_rate": 0.00019998050221747016,
"loss": 1.5399,
"step": 562
},
{
"epoch": 0.006297046092587829,
"grad_norm": 0.05626964941620827,
"learning_rate": 0.00019998043277087634,
"loss": 1.5389,
"step": 563
},
{
"epoch": 0.006308230899146599,
"grad_norm": 0.06463497877120972,
"learning_rate": 0.00019998036320083808,
"loss": 1.549,
"step": 564
},
{
"epoch": 0.00631941570570537,
"grad_norm": 0.07779069244861603,
"learning_rate": 0.00019998029350735538,
"loss": 1.5591,
"step": 565
},
{
"epoch": 0.00633060051226414,
"grad_norm": 0.10419348627328873,
"learning_rate": 0.00019998022369042837,
"loss": 1.5669,
"step": 566
},
{
"epoch": 0.006341785318822911,
"grad_norm": 0.11543929576873779,
"learning_rate": 0.00019998015375005709,
"loss": 1.5669,
"step": 567
},
{
"epoch": 0.006352970125381682,
"grad_norm": 0.08400971442461014,
"learning_rate": 0.0001999800836862417,
"loss": 1.589,
"step": 568
},
{
"epoch": 0.006364154931940452,
"grad_norm": 0.056216295808553696,
"learning_rate": 0.00019998001349898225,
"loss": 1.5781,
"step": 569
},
{
"epoch": 0.0063753397384992225,
"grad_norm": 0.07171747833490372,
"learning_rate": 0.0001999799431882788,
"loss": 1.5722,
"step": 570
},
{
"epoch": 0.006386524545057993,
"grad_norm": 0.07911943644285202,
"learning_rate": 0.0001999798727541315,
"loss": 1.5617,
"step": 571
},
{
"epoch": 0.006397709351616764,
"grad_norm": 0.07471580803394318,
"learning_rate": 0.0001999798021965404,
"loss": 1.5674,
"step": 572
},
{
"epoch": 0.006408894158175534,
"grad_norm": 0.06016454100608826,
"learning_rate": 0.00019997973151550556,
"loss": 1.589,
"step": 573
},
{
"epoch": 0.006420078964734305,
"grad_norm": 0.06692295521497726,
"learning_rate": 0.00019997966071102713,
"loss": 1.5721,
"step": 574
},
{
"epoch": 0.006431263771293076,
"grad_norm": 0.06640581041574478,
"learning_rate": 0.00019997958978310514,
"loss": 1.5781,
"step": 575
},
{
"epoch": 0.006442448577851846,
"grad_norm": 0.058826372027397156,
"learning_rate": 0.0001999795187317397,
"loss": 1.5666,
"step": 576
},
{
"epoch": 0.0064536333844106165,
"grad_norm": 0.055648185312747955,
"learning_rate": 0.0001999794475569309,
"loss": 1.5707,
"step": 577
},
{
"epoch": 0.006464818190969387,
"grad_norm": 0.058248959481716156,
"learning_rate": 0.00019997937625867884,
"loss": 1.57,
"step": 578
},
{
"epoch": 0.006476002997528158,
"grad_norm": 0.05667665973305702,
"learning_rate": 0.00019997930483698357,
"loss": 1.5715,
"step": 579
},
{
"epoch": 0.006487187804086928,
"grad_norm": 0.051860544830560684,
"learning_rate": 0.00019997923329184524,
"loss": 1.5875,
"step": 580
},
{
"epoch": 0.006498372610645699,
"grad_norm": 0.05429021269083023,
"learning_rate": 0.00019997916162326385,
"loss": 1.606,
"step": 581
},
{
"epoch": 0.00650955741720447,
"grad_norm": 0.055650923401117325,
"learning_rate": 0.00019997908983123956,
"loss": 1.6024,
"step": 582
},
{
"epoch": 0.00652074222376324,
"grad_norm": 0.061447255313396454,
"learning_rate": 0.00019997901791577244,
"loss": 1.5888,
"step": 583
},
{
"epoch": 0.00653192703032201,
"grad_norm": 0.06065785884857178,
"learning_rate": 0.00019997894587686255,
"loss": 1.5739,
"step": 584
},
{
"epoch": 0.006543111836880782,
"grad_norm": 0.07358521968126297,
"learning_rate": 0.00019997887371451002,
"loss": 1.5682,
"step": 585
},
{
"epoch": 0.006554296643439552,
"grad_norm": 0.08286885917186737,
"learning_rate": 0.00019997880142871494,
"loss": 1.5702,
"step": 586
},
{
"epoch": 0.006565481449998322,
"grad_norm": 0.09056065231561661,
"learning_rate": 0.0001999787290194774,
"loss": 1.5822,
"step": 587
},
{
"epoch": 0.0065766662565570925,
"grad_norm": 0.08298853039741516,
"learning_rate": 0.00019997865648679745,
"loss": 1.5818,
"step": 588
},
{
"epoch": 0.006587851063115864,
"grad_norm": 0.08499585837125778,
"learning_rate": 0.00019997858383067517,
"loss": 1.5775,
"step": 589
},
{
"epoch": 0.006599035869674634,
"grad_norm": 0.08271525800228119,
"learning_rate": 0.00019997851105111073,
"loss": 1.5756,
"step": 590
},
{
"epoch": 0.006610220676233404,
"grad_norm": 0.07318850606679916,
"learning_rate": 0.00019997843814810416,
"loss": 1.5674,
"step": 591
},
{
"epoch": 0.0066214054827921755,
"grad_norm": 0.07372857630252838,
"learning_rate": 0.00019997836512165558,
"loss": 1.5589,
"step": 592
},
{
"epoch": 0.006632590289350946,
"grad_norm": 0.09608045220375061,
"learning_rate": 0.00019997829197176503,
"loss": 1.5483,
"step": 593
},
{
"epoch": 0.006643775095909716,
"grad_norm": 0.13775509595870972,
"learning_rate": 0.00019997821869843264,
"loss": 1.5534,
"step": 594
},
{
"epoch": 0.0066549599024684864,
"grad_norm": 0.1282949000597,
"learning_rate": 0.00019997814530165847,
"loss": 1.5707,
"step": 595
},
{
"epoch": 0.006666144709027258,
"grad_norm": 0.09030576795339584,
"learning_rate": 0.00019997807178144268,
"loss": 1.5759,
"step": 596
},
{
"epoch": 0.006677329515586028,
"grad_norm": 0.08960919827222824,
"learning_rate": 0.00019997799813778531,
"loss": 1.5747,
"step": 597
},
{
"epoch": 0.006688514322144798,
"grad_norm": 0.08592968434095383,
"learning_rate": 0.00019997792437068644,
"loss": 1.5837,
"step": 598
},
{
"epoch": 0.0066996991287035694,
"grad_norm": 0.07291566580533981,
"learning_rate": 0.00019997785048014616,
"loss": 1.5797,
"step": 599
},
{
"epoch": 0.00671088393526234,
"grad_norm": 0.07706471532583237,
"learning_rate": 0.0001999777764661646,
"loss": 1.5715,
"step": 600
},
{
"epoch": 0.00672206874182111,
"grad_norm": 0.06954386830329895,
"learning_rate": 0.00019997770232874182,
"loss": 1.563,
"step": 601
},
{
"epoch": 0.00673325354837988,
"grad_norm": 0.06999648362398148,
"learning_rate": 0.00019997762806787792,
"loss": 1.5717,
"step": 602
},
{
"epoch": 0.0067444383549386516,
"grad_norm": 0.05400196090340614,
"learning_rate": 0.00019997755368357298,
"loss": 1.5862,
"step": 603
},
{
"epoch": 0.006755623161497422,
"grad_norm": 0.06418072432279587,
"learning_rate": 0.00019997747917582714,
"loss": 1.5908,
"step": 604
},
{
"epoch": 0.006766807968056192,
"grad_norm": 0.05838518589735031,
"learning_rate": 0.00019997740454464044,
"loss": 1.5718,
"step": 605
},
{
"epoch": 0.006777992774614963,
"grad_norm": 0.05882187560200691,
"learning_rate": 0.00019997732979001298,
"loss": 1.5769,
"step": 606
},
{
"epoch": 0.006789177581173734,
"grad_norm": 0.05842543765902519,
"learning_rate": 0.00019997725491194482,
"loss": 1.5746,
"step": 607
},
{
"epoch": 0.006800362387732504,
"grad_norm": 0.0527958944439888,
"learning_rate": 0.00019997717991043616,
"loss": 1.5682,
"step": 608
},
{
"epoch": 0.006811547194291274,
"grad_norm": 0.06266690045595169,
"learning_rate": 0.00019997710478548698,
"loss": 1.572,
"step": 609
},
{
"epoch": 0.0068227320008500455,
"grad_norm": 0.05554317682981491,
"learning_rate": 0.00019997702953709746,
"loss": 1.583,
"step": 610
},
{
"epoch": 0.006833916807408816,
"grad_norm": 0.04651861637830734,
"learning_rate": 0.00019997695416526761,
"loss": 1.5738,
"step": 611
},
{
"epoch": 0.006845101613967586,
"grad_norm": 0.053717561066150665,
"learning_rate": 0.0001999768786699976,
"loss": 1.5766,
"step": 612
},
{
"epoch": 0.006856286420526357,
"grad_norm": 0.050605516880750656,
"learning_rate": 0.0001999768030512875,
"loss": 1.606,
"step": 613
},
{
"epoch": 0.006867471227085128,
"grad_norm": 0.054307371377944946,
"learning_rate": 0.00019997672730913735,
"loss": 1.6187,
"step": 614
},
{
"epoch": 0.006878656033643898,
"grad_norm": 0.06506580859422684,
"learning_rate": 0.00019997665144354728,
"loss": 1.6113,
"step": 615
},
{
"epoch": 0.006889840840202669,
"grad_norm": 0.06480210274457932,
"learning_rate": 0.00019997657545451744,
"loss": 1.594,
"step": 616
},
{
"epoch": 0.006901025646761439,
"grad_norm": 0.04906410723924637,
"learning_rate": 0.00019997649934204784,
"loss": 1.5809,
"step": 617
},
{
"epoch": 0.00691221045332021,
"grad_norm": 0.05194586515426636,
"learning_rate": 0.00019997642310613857,
"loss": 1.5913,
"step": 618
},
{
"epoch": 0.00692339525987898,
"grad_norm": 0.05839546024799347,
"learning_rate": 0.0001999763467467898,
"loss": 1.5746,
"step": 619
},
{
"epoch": 0.006934580066437751,
"grad_norm": 0.06750357896089554,
"learning_rate": 0.0001999762702640016,
"loss": 1.575,
"step": 620
},
{
"epoch": 0.0069457648729965215,
"grad_norm": 0.07982991635799408,
"learning_rate": 0.00019997619365777402,
"loss": 1.576,
"step": 621
},
{
"epoch": 0.006956949679555292,
"grad_norm": 0.08816216886043549,
"learning_rate": 0.00019997611692810718,
"loss": 1.5744,
"step": 622
},
{
"epoch": 0.006968134486114063,
"grad_norm": 0.09619053453207016,
"learning_rate": 0.0001999760400750012,
"loss": 1.5791,
"step": 623
},
{
"epoch": 0.006979319292672833,
"grad_norm": 0.09412987530231476,
"learning_rate": 0.00019997596309845612,
"loss": 1.5799,
"step": 624
},
{
"epoch": 0.006990504099231604,
"grad_norm": 0.09227743744850159,
"learning_rate": 0.0001999758859984721,
"loss": 1.574,
"step": 625
},
{
"epoch": 0.007001688905790374,
"grad_norm": 0.07984034717082977,
"learning_rate": 0.00019997580877504918,
"loss": 1.5612,
"step": 626
},
{
"epoch": 0.007012873712349145,
"grad_norm": 0.05941009521484375,
"learning_rate": 0.00019997573142818752,
"loss": 1.5655,
"step": 627
},
{
"epoch": 0.0070240585189079155,
"grad_norm": 0.06787893921136856,
"learning_rate": 0.0001999756539578871,
"loss": 1.5662,
"step": 628
},
{
"epoch": 0.007035243325466686,
"grad_norm": 0.07556013017892838,
"learning_rate": 0.00019997557636414816,
"loss": 1.5877,
"step": 629
},
{
"epoch": 0.007046428132025457,
"grad_norm": 0.06565730273723602,
"learning_rate": 0.0001999754986469707,
"loss": 1.5701,
"step": 630
},
{
"epoch": 0.007057612938584227,
"grad_norm": 0.05801456421613693,
"learning_rate": 0.00019997542080635482,
"loss": 1.5604,
"step": 631
},
{
"epoch": 0.007068797745142998,
"grad_norm": 0.058777451515197754,
"learning_rate": 0.00019997534284230066,
"loss": 1.5695,
"step": 632
},
{
"epoch": 0.007079982551701768,
"grad_norm": 0.0650622621178627,
"learning_rate": 0.0001999752647548083,
"loss": 1.5612,
"step": 633
},
{
"epoch": 0.007091167358260539,
"grad_norm": 0.05834876000881195,
"learning_rate": 0.00019997518654387783,
"loss": 1.5617,
"step": 634
},
{
"epoch": 0.007102352164819309,
"grad_norm": 0.06384813785552979,
"learning_rate": 0.00019997510820950933,
"loss": 1.5567,
"step": 635
},
{
"epoch": 0.00711353697137808,
"grad_norm": 0.05401776731014252,
"learning_rate": 0.00019997502975170291,
"loss": 1.5558,
"step": 636
},
{
"epoch": 0.007124721777936851,
"grad_norm": 0.06590646505355835,
"learning_rate": 0.00019997495117045867,
"loss": 1.5474,
"step": 637
},
{
"epoch": 0.007135906584495621,
"grad_norm": 0.0560823492705822,
"learning_rate": 0.00019997487246577674,
"loss": 1.5404,
"step": 638
},
{
"epoch": 0.0071470913910543915,
"grad_norm": 0.0544624887406826,
"learning_rate": 0.00019997479363765717,
"loss": 1.5321,
"step": 639
},
{
"epoch": 0.007158276197613163,
"grad_norm": 0.04914103075861931,
"learning_rate": 0.00019997471468610005,
"loss": 1.5395,
"step": 640
},
{
"epoch": 0.007169461004171933,
"grad_norm": 0.0481346994638443,
"learning_rate": 0.00019997463561110553,
"loss": 1.5513,
"step": 641
},
{
"epoch": 0.007180645810730703,
"grad_norm": 0.04910167306661606,
"learning_rate": 0.00019997455641267367,
"loss": 1.5436,
"step": 642
},
{
"epoch": 0.007191830617289474,
"grad_norm": 0.05214869976043701,
"learning_rate": 0.00019997447709080456,
"loss": 1.5589,
"step": 643
},
{
"epoch": 0.007203015423848245,
"grad_norm": 0.06242618337273598,
"learning_rate": 0.00019997439764549832,
"loss": 1.5395,
"step": 644
},
{
"epoch": 0.007214200230407015,
"grad_norm": 0.07024102658033371,
"learning_rate": 0.00019997431807675505,
"loss": 1.5624,
"step": 645
},
{
"epoch": 0.0072253850369657854,
"grad_norm": 0.07082174718379974,
"learning_rate": 0.0001999742383845748,
"loss": 1.5505,
"step": 646
},
{
"epoch": 0.007236569843524557,
"grad_norm": 0.06821414083242416,
"learning_rate": 0.00019997415856895775,
"loss": 1.5489,
"step": 647
},
{
"epoch": 0.007247754650083327,
"grad_norm": 0.062424443662166595,
"learning_rate": 0.00019997407862990395,
"loss": 1.5341,
"step": 648
},
{
"epoch": 0.007258939456642097,
"grad_norm": 0.05251247063279152,
"learning_rate": 0.00019997399856741348,
"loss": 1.5257,
"step": 649
},
{
"epoch": 0.0072701242632008676,
"grad_norm": 0.04626723378896713,
"learning_rate": 0.0001999739183814865,
"loss": 1.5387,
"step": 650
},
{
"epoch": 0.007281309069759639,
"grad_norm": 0.05231785774230957,
"learning_rate": 0.00019997383807212306,
"loss": 1.5378,
"step": 651
},
{
"epoch": 0.007292493876318409,
"grad_norm": 0.06309188902378082,
"learning_rate": 0.00019997375763932323,
"loss": 1.5531,
"step": 652
},
{
"epoch": 0.007303678682877179,
"grad_norm": 0.05309786647558212,
"learning_rate": 0.0001999736770830872,
"loss": 1.5694,
"step": 653
},
{
"epoch": 0.0073148634894359505,
"grad_norm": 0.04923882707953453,
"learning_rate": 0.000199973596403415,
"loss": 1.5837,
"step": 654
},
{
"epoch": 0.007326048295994721,
"grad_norm": 0.05534524843096733,
"learning_rate": 0.00019997351560030677,
"loss": 1.5918,
"step": 655
},
{
"epoch": 0.007337233102553491,
"grad_norm": 0.1240246444940567,
"learning_rate": 0.00019997343467376258,
"loss": 1.5881,
"step": 656
},
{
"epoch": 0.0073484179091122615,
"grad_norm": 0.06112068518996239,
"learning_rate": 0.00019997335362378254,
"loss": 1.5761,
"step": 657
},
{
"epoch": 0.007359602715671033,
"grad_norm": 0.06589160114526749,
"learning_rate": 0.00019997327245036673,
"loss": 1.5641,
"step": 658
},
{
"epoch": 0.007370787522229803,
"grad_norm": 0.060181207954883575,
"learning_rate": 0.0001999731911535153,
"loss": 1.5573,
"step": 659
},
{
"epoch": 0.007381972328788573,
"grad_norm": 0.06380990892648697,
"learning_rate": 0.0001999731097332283,
"loss": 1.5624,
"step": 660
},
{
"epoch": 0.0073931571353473445,
"grad_norm": 0.06176357343792915,
"learning_rate": 0.00019997302818950584,
"loss": 1.5499,
"step": 661
},
{
"epoch": 0.007404341941906115,
"grad_norm": 0.055721499025821686,
"learning_rate": 0.00019997294652234805,
"loss": 1.557,
"step": 662
},
{
"epoch": 0.007415526748464885,
"grad_norm": 0.051978956907987595,
"learning_rate": 0.000199972864731755,
"loss": 1.5623,
"step": 663
},
{
"epoch": 0.007426711555023656,
"grad_norm": 0.04754827544093132,
"learning_rate": 0.00019997278281772682,
"loss": 1.5465,
"step": 664
},
{
"epoch": 0.007437896361582427,
"grad_norm": 0.0538279265165329,
"learning_rate": 0.0001999727007802636,
"loss": 1.5326,
"step": 665
},
{
"epoch": 0.007449081168141197,
"grad_norm": 0.0629352405667305,
"learning_rate": 0.00019997261861936543,
"loss": 1.5365,
"step": 666
},
{
"epoch": 0.007460265974699967,
"grad_norm": 0.06892745941877365,
"learning_rate": 0.00019997253633503238,
"loss": 1.5607,
"step": 667
},
{
"epoch": 0.007471450781258738,
"grad_norm": 0.07525767385959625,
"learning_rate": 0.00019997245392726465,
"loss": 1.5728,
"step": 668
},
{
"epoch": 0.007482635587817509,
"grad_norm": 0.10010071098804474,
"learning_rate": 0.00019997237139606224,
"loss": 1.559,
"step": 669
},
{
"epoch": 0.007493820394376279,
"grad_norm": 0.12011202424764633,
"learning_rate": 0.0001999722887414253,
"loss": 1.5694,
"step": 670
},
{
"epoch": 0.00750500520093505,
"grad_norm": 0.12278566509485245,
"learning_rate": 0.00019997220596335393,
"loss": 1.5939,
"step": 671
},
{
"epoch": 0.0075161900074938205,
"grad_norm": 0.10163605213165283,
"learning_rate": 0.00019997212306184823,
"loss": 1.5722,
"step": 672
},
{
"epoch": 0.007527374814052591,
"grad_norm": 0.09386469423770905,
"learning_rate": 0.00019997204003690828,
"loss": 1.5748,
"step": 673
},
{
"epoch": 0.007538559620611361,
"grad_norm": 0.1031983494758606,
"learning_rate": 0.00019997195688853422,
"loss": 1.5778,
"step": 674
},
{
"epoch": 0.007549744427170132,
"grad_norm": 0.09082422405481339,
"learning_rate": 0.00019997187361672615,
"loss": 1.5904,
"step": 675
},
{
"epoch": 0.007560929233728903,
"grad_norm": 0.05761239677667618,
"learning_rate": 0.0001999717902214841,
"loss": 1.5753,
"step": 676
},
{
"epoch": 0.007572114040287673,
"grad_norm": 0.0772882029414177,
"learning_rate": 0.0001999717067028083,
"loss": 1.5631,
"step": 677
},
{
"epoch": 0.007583298846846444,
"grad_norm": 0.09266892075538635,
"learning_rate": 0.00019997162306069875,
"loss": 1.5798,
"step": 678
},
{
"epoch": 0.0075944836534052144,
"grad_norm": 0.07755053043365479,
"learning_rate": 0.00019997153929515558,
"loss": 1.5969,
"step": 679
},
{
"epoch": 0.007605668459963985,
"grad_norm": 0.061833951622247696,
"learning_rate": 0.0001999714554061789,
"loss": 1.5781,
"step": 680
},
{
"epoch": 0.007616853266522755,
"grad_norm": 0.07911964505910873,
"learning_rate": 0.00019997137139376883,
"loss": 1.611,
"step": 681
},
{
"epoch": 0.007628038073081526,
"grad_norm": 0.07502644509077072,
"learning_rate": 0.00019997128725792544,
"loss": 1.6201,
"step": 682
},
{
"epoch": 0.007639222879640297,
"grad_norm": 0.11084458976984024,
"learning_rate": 0.00019997120299864886,
"loss": 1.5882,
"step": 683
},
{
"epoch": 0.007650407686199067,
"grad_norm": 0.1428053230047226,
"learning_rate": 0.00019997111861593921,
"loss": 1.5737,
"step": 684
},
{
"epoch": 0.007661592492757838,
"grad_norm": 0.1456058770418167,
"learning_rate": 0.00019997103410979652,
"loss": 1.583,
"step": 685
},
{
"epoch": 0.007672777299316608,
"grad_norm": 0.10741148889064789,
"learning_rate": 0.00019997094948022098,
"loss": 1.5736,
"step": 686
},
{
"epoch": 0.007683962105875379,
"grad_norm": 0.08248301595449448,
"learning_rate": 0.00019997086472721263,
"loss": 1.5559,
"step": 687
},
{
"epoch": 0.00769514691243415,
"grad_norm": 0.09595336019992828,
"learning_rate": 0.00019997077985077163,
"loss": 1.5513,
"step": 688
},
{
"epoch": 0.00770633171899292,
"grad_norm": 0.06806618720293045,
"learning_rate": 0.00019997069485089804,
"loss": 1.5624,
"step": 689
},
{
"epoch": 0.0077175165255516905,
"grad_norm": 0.07510481029748917,
"learning_rate": 0.00019997060972759198,
"loss": 1.5401,
"step": 690
},
{
"epoch": 0.007728701332110461,
"grad_norm": 0.06557908654212952,
"learning_rate": 0.00019997052448085358,
"loss": 1.5507,
"step": 691
},
{
"epoch": 0.007739886138669232,
"grad_norm": 0.07286231964826584,
"learning_rate": 0.0001999704391106829,
"loss": 1.5597,
"step": 692
},
{
"epoch": 0.007751070945228002,
"grad_norm": 0.06357460469007492,
"learning_rate": 0.0001999703536170801,
"loss": 1.5591,
"step": 693
},
{
"epoch": 0.007762255751786773,
"grad_norm": 0.06700731813907623,
"learning_rate": 0.00019997026800004522,
"loss": 1.5521,
"step": 694
},
{
"epoch": 0.007773440558345544,
"grad_norm": 0.05840053781867027,
"learning_rate": 0.00019997018225957839,
"loss": 1.5688,
"step": 695
},
{
"epoch": 0.007784625364904314,
"grad_norm": 0.06327050924301147,
"learning_rate": 0.00019997009639567974,
"loss": 1.5547,
"step": 696
},
{
"epoch": 0.007795810171463084,
"grad_norm": 0.06035961955785751,
"learning_rate": 0.00019997001040834936,
"loss": 1.5701,
"step": 697
},
{
"epoch": 0.007806994978021855,
"grad_norm": 0.05573936179280281,
"learning_rate": 0.0001999699242975874,
"loss": 1.5878,
"step": 698
},
{
"epoch": 0.007818179784580626,
"grad_norm": 0.05611170828342438,
"learning_rate": 0.00019996983806339387,
"loss": 1.5899,
"step": 699
},
{
"epoch": 0.007829364591139396,
"grad_norm": 0.05826570466160774,
"learning_rate": 0.00019996975170576896,
"loss": 1.5533,
"step": 700
},
{
"epoch": 0.007840549397698167,
"grad_norm": 0.050937049090862274,
"learning_rate": 0.00019996966522471273,
"loss": 1.5545,
"step": 701
},
{
"epoch": 0.007851734204256937,
"grad_norm": 0.06593479216098785,
"learning_rate": 0.0001999695786202253,
"loss": 1.5685,
"step": 702
},
{
"epoch": 0.007862919010815707,
"grad_norm": 0.06405465304851532,
"learning_rate": 0.0001999694918923068,
"loss": 1.6121,
"step": 703
},
{
"epoch": 0.00787410381737448,
"grad_norm": 0.052820343524217606,
"learning_rate": 0.0001999694050409573,
"loss": 1.6007,
"step": 704
},
{
"epoch": 0.00788528862393325,
"grad_norm": 0.05512186512351036,
"learning_rate": 0.00019996931806617695,
"loss": 1.5792,
"step": 705
},
{
"epoch": 0.00789647343049202,
"grad_norm": 0.0432184673845768,
"learning_rate": 0.0001999692309679658,
"loss": 1.5852,
"step": 706
},
{
"epoch": 0.00790765823705079,
"grad_norm": 0.05248282849788666,
"learning_rate": 0.00019996914374632402,
"loss": 1.5917,
"step": 707
},
{
"epoch": 0.00791884304360956,
"grad_norm": 0.04413476958870888,
"learning_rate": 0.00019996905640125165,
"loss": 1.5584,
"step": 708
},
{
"epoch": 0.00793002785016833,
"grad_norm": 0.04878908023238182,
"learning_rate": 0.00019996896893274886,
"loss": 1.5506,
"step": 709
},
{
"epoch": 0.007941212656727101,
"grad_norm": 0.04344234988093376,
"learning_rate": 0.00019996888134081575,
"loss": 1.5684,
"step": 710
},
{
"epoch": 0.007952397463285873,
"grad_norm": 0.047158923000097275,
"learning_rate": 0.0001999687936254524,
"loss": 1.6133,
"step": 711
},
{
"epoch": 0.007963582269844643,
"grad_norm": 0.050282686948776245,
"learning_rate": 0.00019996870578665893,
"loss": 1.6021,
"step": 712
},
{
"epoch": 0.007974767076403414,
"grad_norm": 0.043916840106248856,
"learning_rate": 0.0001999686178244354,
"loss": 1.5794,
"step": 713
},
{
"epoch": 0.007985951882962184,
"grad_norm": 0.079423688352108,
"learning_rate": 0.00019996852973878205,
"loss": 1.5666,
"step": 714
},
{
"epoch": 0.007997136689520954,
"grad_norm": 0.047040194272994995,
"learning_rate": 0.00019996844152969884,
"loss": 1.5643,
"step": 715
},
{
"epoch": 0.008008321496079725,
"grad_norm": 0.04954817518591881,
"learning_rate": 0.00019996835319718596,
"loss": 1.5756,
"step": 716
},
{
"epoch": 0.008019506302638495,
"grad_norm": 0.0529201366007328,
"learning_rate": 0.00019996826474124352,
"loss": 1.5693,
"step": 717
},
{
"epoch": 0.008030691109197267,
"grad_norm": 0.0555887334048748,
"learning_rate": 0.00019996817616187162,
"loss": 1.5699,
"step": 718
},
{
"epoch": 0.008041875915756037,
"grad_norm": 0.05515376478433609,
"learning_rate": 0.00019996808745907036,
"loss": 1.5729,
"step": 719
},
{
"epoch": 0.008053060722314808,
"grad_norm": 0.05125884339213371,
"learning_rate": 0.0001999679986328398,
"loss": 1.5618,
"step": 720
},
{
"epoch": 0.008064245528873578,
"grad_norm": 0.046284329146146774,
"learning_rate": 0.0001999679096831802,
"loss": 1.5723,
"step": 721
},
{
"epoch": 0.008075430335432348,
"grad_norm": 0.07273488491773605,
"learning_rate": 0.0001999678206100915,
"loss": 1.5701,
"step": 722
},
{
"epoch": 0.008086615141991119,
"grad_norm": 0.047560565173625946,
"learning_rate": 0.0001999677314135739,
"loss": 1.5579,
"step": 723
},
{
"epoch": 0.00809779994854989,
"grad_norm": 0.060283761471509933,
"learning_rate": 0.00019996764209362749,
"loss": 1.5615,
"step": 724
},
{
"epoch": 0.008108984755108661,
"grad_norm": 0.0602620430290699,
"learning_rate": 0.00019996755265025236,
"loss": 1.5602,
"step": 725
},
{
"epoch": 0.008120169561667431,
"grad_norm": 0.05383098125457764,
"learning_rate": 0.00019996746308344868,
"loss": 1.5769,
"step": 726
},
{
"epoch": 0.008131354368226202,
"grad_norm": 0.04577267915010452,
"learning_rate": 0.0001999673733932165,
"loss": 1.619,
"step": 727
},
{
"epoch": 0.008142539174784972,
"grad_norm": 0.04550078883767128,
"learning_rate": 0.00019996728357955595,
"loss": 1.5907,
"step": 728
},
{
"epoch": 0.008153723981343742,
"grad_norm": 0.050287820398807526,
"learning_rate": 0.00019996719364246714,
"loss": 1.5809,
"step": 729
},
{
"epoch": 0.008164908787902513,
"grad_norm": 0.05841783806681633,
"learning_rate": 0.00019996710358195018,
"loss": 1.5521,
"step": 730
},
{
"epoch": 0.008176093594461285,
"grad_norm": 0.07749857753515244,
"learning_rate": 0.0001999670133980052,
"loss": 1.5848,
"step": 731
},
{
"epoch": 0.008187278401020055,
"grad_norm": 0.08802466839551926,
"learning_rate": 0.00019996692309063232,
"loss": 1.6046,
"step": 732
},
{
"epoch": 0.008198463207578825,
"grad_norm": 0.09324830025434494,
"learning_rate": 0.00019996683265983162,
"loss": 1.5969,
"step": 733
},
{
"epoch": 0.008209648014137596,
"grad_norm": 0.07845516502857208,
"learning_rate": 0.0001999667421056032,
"loss": 1.5831,
"step": 734
},
{
"epoch": 0.008220832820696366,
"grad_norm": 0.06912586092948914,
"learning_rate": 0.0001999666514279472,
"loss": 1.5706,
"step": 735
},
{
"epoch": 0.008232017627255136,
"grad_norm": 0.0572381317615509,
"learning_rate": 0.00019996656062686374,
"loss": 1.5609,
"step": 736
},
{
"epoch": 0.008243202433813906,
"grad_norm": 0.06219245865941048,
"learning_rate": 0.00019996646970235287,
"loss": 1.5541,
"step": 737
},
{
"epoch": 0.008254387240372679,
"grad_norm": 0.0628521591424942,
"learning_rate": 0.0001999663786544148,
"loss": 1.5556,
"step": 738
},
{
"epoch": 0.008265572046931449,
"grad_norm": 0.06389934569597244,
"learning_rate": 0.0001999662874830496,
"loss": 1.5235,
"step": 739
},
{
"epoch": 0.00827675685349022,
"grad_norm": 0.052320901304483414,
"learning_rate": 0.00019996619618825733,
"loss": 1.539,
"step": 740
},
{
"epoch": 0.00828794166004899,
"grad_norm": 0.05470295995473862,
"learning_rate": 0.00019996610477003817,
"loss": 1.5415,
"step": 741
},
{
"epoch": 0.00829912646660776,
"grad_norm": 0.06103771552443504,
"learning_rate": 0.00019996601322839222,
"loss": 1.5422,
"step": 742
},
{
"epoch": 0.00831031127316653,
"grad_norm": 0.06434791535139084,
"learning_rate": 0.00019996592156331958,
"loss": 1.5527,
"step": 743
},
{
"epoch": 0.0083214960797253,
"grad_norm": 0.06087024137377739,
"learning_rate": 0.00019996582977482036,
"loss": 1.562,
"step": 744
},
{
"epoch": 0.008332680886284072,
"grad_norm": 0.060757141560316086,
"learning_rate": 0.00019996573786289465,
"loss": 1.5641,
"step": 745
},
{
"epoch": 0.008343865692842843,
"grad_norm": 0.07097544521093369,
"learning_rate": 0.00019996564582754265,
"loss": 1.542,
"step": 746
},
{
"epoch": 0.008355050499401613,
"grad_norm": 0.07591135054826736,
"learning_rate": 0.00019996555366876437,
"loss": 1.5557,
"step": 747
},
{
"epoch": 0.008366235305960383,
"grad_norm": 0.07860101759433746,
"learning_rate": 0.00019996546138655998,
"loss": 1.5592,
"step": 748
},
{
"epoch": 0.008377420112519154,
"grad_norm": 0.08454012125730515,
"learning_rate": 0.00019996536898092958,
"loss": 1.5428,
"step": 749
},
{
"epoch": 0.008388604919077924,
"grad_norm": 0.08686886727809906,
"learning_rate": 0.0001999652764518733,
"loss": 1.5604,
"step": 750
},
{
"epoch": 0.008399789725636694,
"grad_norm": 0.07752903550863266,
"learning_rate": 0.00019996518379939126,
"loss": 1.5663,
"step": 751
},
{
"epoch": 0.008410974532195466,
"grad_norm": 0.07272690534591675,
"learning_rate": 0.00019996509102348356,
"loss": 1.5463,
"step": 752
},
{
"epoch": 0.008422159338754237,
"grad_norm": 0.07069668918848038,
"learning_rate": 0.00019996499812415026,
"loss": 1.5403,
"step": 753
},
{
"epoch": 0.008433344145313007,
"grad_norm": 0.06617298722267151,
"learning_rate": 0.00019996490510139155,
"loss": 1.5452,
"step": 754
},
{
"epoch": 0.008444528951871777,
"grad_norm": 0.06795412302017212,
"learning_rate": 0.00019996481195520756,
"loss": 1.5355,
"step": 755
},
{
"epoch": 0.008455713758430548,
"grad_norm": 0.06670048087835312,
"learning_rate": 0.00019996471868559832,
"loss": 1.5529,
"step": 756
},
{
"epoch": 0.008466898564989318,
"grad_norm": 0.06750231981277466,
"learning_rate": 0.000199964625292564,
"loss": 1.5626,
"step": 757
},
{
"epoch": 0.008478083371548088,
"grad_norm": 0.06446841359138489,
"learning_rate": 0.0001999645317761047,
"loss": 1.5599,
"step": 758
},
{
"epoch": 0.00848926817810686,
"grad_norm": 0.0593569353222847,
"learning_rate": 0.00019996443813622057,
"loss": 1.5538,
"step": 759
},
{
"epoch": 0.00850045298466563,
"grad_norm": 0.0496729277074337,
"learning_rate": 0.00019996434437291168,
"loss": 1.5427,
"step": 760
},
{
"epoch": 0.008511637791224401,
"grad_norm": 0.04995394125580788,
"learning_rate": 0.00019996425048617814,
"loss": 1.5326,
"step": 761
},
{
"epoch": 0.008522822597783171,
"grad_norm": 0.061305031180381775,
"learning_rate": 0.00019996415647602014,
"loss": 1.5553,
"step": 762
},
{
"epoch": 0.008534007404341942,
"grad_norm": 0.07046514004468918,
"learning_rate": 0.0001999640623424377,
"loss": 1.5305,
"step": 763
},
{
"epoch": 0.008545192210900712,
"grad_norm": 0.0729839950799942,
"learning_rate": 0.00019996396808543102,
"loss": 1.5448,
"step": 764
},
{
"epoch": 0.008556377017459482,
"grad_norm": 0.07597866654396057,
"learning_rate": 0.00019996387370500016,
"loss": 1.5339,
"step": 765
},
{
"epoch": 0.008567561824018254,
"grad_norm": 0.07808911800384521,
"learning_rate": 0.00019996377920114525,
"loss": 1.5474,
"step": 766
},
{
"epoch": 0.008578746630577025,
"grad_norm": 0.07077853381633759,
"learning_rate": 0.0001999636845738664,
"loss": 1.5456,
"step": 767
},
{
"epoch": 0.008589931437135795,
"grad_norm": 0.05932854115962982,
"learning_rate": 0.00019996358982316378,
"loss": 1.5581,
"step": 768
},
{
"epoch": 0.008601116243694565,
"grad_norm": 0.055467307567596436,
"learning_rate": 0.00019996349494903743,
"loss": 1.552,
"step": 769
},
{
"epoch": 0.008612301050253336,
"grad_norm": 0.0684780701994896,
"learning_rate": 0.0001999633999514875,
"loss": 1.5691,
"step": 770
},
{
"epoch": 0.008623485856812106,
"grad_norm": 0.07558051496744156,
"learning_rate": 0.0001999633048305141,
"loss": 1.5704,
"step": 771
},
{
"epoch": 0.008634670663370878,
"grad_norm": 0.07451245933771133,
"learning_rate": 0.0001999632095861174,
"loss": 1.562,
"step": 772
},
{
"epoch": 0.008645855469929648,
"grad_norm": 0.06852173060178757,
"learning_rate": 0.00019996311421829744,
"loss": 1.5582,
"step": 773
},
{
"epoch": 0.008657040276488418,
"grad_norm": 0.05201677978038788,
"learning_rate": 0.00019996301872705438,
"loss": 1.5405,
"step": 774
},
{
"epoch": 0.008668225083047189,
"grad_norm": 0.05331201106309891,
"learning_rate": 0.00019996292311238832,
"loss": 1.5389,
"step": 775
},
{
"epoch": 0.008679409889605959,
"grad_norm": 0.06298915296792984,
"learning_rate": 0.00019996282737429942,
"loss": 1.5544,
"step": 776
},
{
"epoch": 0.00869059469616473,
"grad_norm": 0.06354406476020813,
"learning_rate": 0.00019996273151278774,
"loss": 1.5654,
"step": 777
},
{
"epoch": 0.0087017795027235,
"grad_norm": 0.0683928057551384,
"learning_rate": 0.00019996263552785344,
"loss": 1.5515,
"step": 778
},
{
"epoch": 0.008712964309282272,
"grad_norm": 0.08236062526702881,
"learning_rate": 0.0001999625394194966,
"loss": 1.5617,
"step": 779
},
{
"epoch": 0.008724149115841042,
"grad_norm": 0.07203904539346695,
"learning_rate": 0.0001999624431877174,
"loss": 1.555,
"step": 780
},
{
"epoch": 0.008735333922399812,
"grad_norm": 0.06245394051074982,
"learning_rate": 0.0001999623468325159,
"loss": 1.5696,
"step": 781
},
{
"epoch": 0.008746518728958583,
"grad_norm": 0.06451458483934402,
"learning_rate": 0.00019996225035389222,
"loss": 1.5695,
"step": 782
},
{
"epoch": 0.008757703535517353,
"grad_norm": 0.06131128594279289,
"learning_rate": 0.00019996215375184652,
"loss": 1.5749,
"step": 783
},
{
"epoch": 0.008768888342076123,
"grad_norm": 0.07188650965690613,
"learning_rate": 0.00019996205702637888,
"loss": 1.5647,
"step": 784
},
{
"epoch": 0.008780073148634894,
"grad_norm": 0.0647532194852829,
"learning_rate": 0.00019996196017748948,
"loss": 1.5906,
"step": 785
},
{
"epoch": 0.008791257955193666,
"grad_norm": 0.06955672800540924,
"learning_rate": 0.00019996186320517836,
"loss": 1.5923,
"step": 786
},
{
"epoch": 0.008802442761752436,
"grad_norm": 0.08375248312950134,
"learning_rate": 0.00019996176610944568,
"loss": 1.5949,
"step": 787
},
{
"epoch": 0.008813627568311206,
"grad_norm": 0.07389501482248306,
"learning_rate": 0.00019996166889029156,
"loss": 1.5859,
"step": 788
},
{
"epoch": 0.008824812374869977,
"grad_norm": 0.08891887962818146,
"learning_rate": 0.0001999615715477161,
"loss": 1.5799,
"step": 789
},
{
"epoch": 0.008835997181428747,
"grad_norm": 0.08630786836147308,
"learning_rate": 0.00019996147408171948,
"loss": 1.5648,
"step": 790
},
{
"epoch": 0.008847181987987517,
"grad_norm": 0.09771011024713516,
"learning_rate": 0.00019996137649230176,
"loss": 1.5505,
"step": 791
},
{
"epoch": 0.008858366794546288,
"grad_norm": 0.09192012995481491,
"learning_rate": 0.00019996127877946307,
"loss": 1.5704,
"step": 792
},
{
"epoch": 0.00886955160110506,
"grad_norm": 0.07876724004745483,
"learning_rate": 0.00019996118094320355,
"loss": 1.5822,
"step": 793
},
{
"epoch": 0.00888073640766383,
"grad_norm": 0.06203979253768921,
"learning_rate": 0.00019996108298352328,
"loss": 1.5599,
"step": 794
},
{
"epoch": 0.0088919212142226,
"grad_norm": 0.06725753843784332,
"learning_rate": 0.00019996098490042242,
"loss": 1.562,
"step": 795
},
{
"epoch": 0.00890310602078137,
"grad_norm": 0.07860880345106125,
"learning_rate": 0.0001999608866939011,
"loss": 1.5554,
"step": 796
},
{
"epoch": 0.008914290827340141,
"grad_norm": 0.07922618836164474,
"learning_rate": 0.0001999607883639594,
"loss": 1.564,
"step": 797
},
{
"epoch": 0.008925475633898911,
"grad_norm": 0.07509887218475342,
"learning_rate": 0.0001999606899105975,
"loss": 1.5531,
"step": 798
},
{
"epoch": 0.008936660440457682,
"grad_norm": 0.0813961774110794,
"learning_rate": 0.00019996059133381547,
"loss": 1.5479,
"step": 799
},
{
"epoch": 0.008947845247016454,
"grad_norm": 0.09687768667936325,
"learning_rate": 0.00019996049263361343,
"loss": 1.5527,
"step": 800
}
],
"logging_steps": 1,
"max_steps": 89407,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6661988212201226e+20,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}