2565 lines
62 KiB
JSON
2565 lines
62 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.9983216783216782,
|
|
"eval_steps": 500,
|
|
"global_step": 1786,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0011188811188811189,
|
|
"grad_norm": 153.8507844996399,
|
|
"learning_rate": 1.6759776536312848e-06,
|
|
"loss": 4.8778,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.005594405594405594,
|
|
"grad_norm": 153.39272858190785,
|
|
"learning_rate": 8.379888268156423e-06,
|
|
"loss": 4.7304,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.011188811188811189,
|
|
"grad_norm": 96.41612092958107,
|
|
"learning_rate": 1.6759776536312845e-05,
|
|
"loss": 4.1396,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.016783216783216783,
|
|
"grad_norm": 23.87392825650757,
|
|
"learning_rate": 2.513966480446927e-05,
|
|
"loss": 2.5431,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.022377622377622378,
|
|
"grad_norm": 6.903016270261399,
|
|
"learning_rate": 3.351955307262569e-05,
|
|
"loss": 1.742,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.027972027972027972,
|
|
"grad_norm": 2.441067831027445,
|
|
"learning_rate": 4.189944134078212e-05,
|
|
"loss": 1.3057,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.033566433566433566,
|
|
"grad_norm": 1.0984763176398198,
|
|
"learning_rate": 5.027932960893854e-05,
|
|
"loss": 1.1221,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.039160839160839164,
|
|
"grad_norm": 0.5734705027472714,
|
|
"learning_rate": 5.865921787709496e-05,
|
|
"loss": 1.0604,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.044755244755244755,
|
|
"grad_norm": 0.47491071619847164,
|
|
"learning_rate": 6.703910614525138e-05,
|
|
"loss": 0.9951,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.05034965034965035,
|
|
"grad_norm": 0.48157929747551576,
|
|
"learning_rate": 7.541899441340782e-05,
|
|
"loss": 0.9713,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.055944055944055944,
|
|
"grad_norm": 0.45861937937875213,
|
|
"learning_rate": 8.379888268156423e-05,
|
|
"loss": 0.9551,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.06153846153846154,
|
|
"grad_norm": 0.5047783392223855,
|
|
"learning_rate": 9.217877094972066e-05,
|
|
"loss": 0.948,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.06713286713286713,
|
|
"grad_norm": 0.4461540398939836,
|
|
"learning_rate": 0.00010055865921787709,
|
|
"loss": 0.9392,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.07272727272727272,
|
|
"grad_norm": 0.3316401246824197,
|
|
"learning_rate": 0.00010893854748603351,
|
|
"loss": 0.9146,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.07832167832167833,
|
|
"grad_norm": 0.21722066760341413,
|
|
"learning_rate": 0.00011731843575418992,
|
|
"loss": 0.9012,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.08391608391608392,
|
|
"grad_norm": 0.1499443883049875,
|
|
"learning_rate": 0.00012569832402234635,
|
|
"loss": 0.8927,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.08951048951048951,
|
|
"grad_norm": 0.22243490348769648,
|
|
"learning_rate": 0.00013407821229050276,
|
|
"loss": 0.8937,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.0951048951048951,
|
|
"grad_norm": 0.37625345216705214,
|
|
"learning_rate": 0.0001424581005586592,
|
|
"loss": 0.8718,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.1006993006993007,
|
|
"grad_norm": 0.24963786248117179,
|
|
"learning_rate": 0.00015083798882681564,
|
|
"loss": 0.8921,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.1062937062937063,
|
|
"grad_norm": 0.26880921530140445,
|
|
"learning_rate": 0.00015921787709497208,
|
|
"loss": 0.8799,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.11188811188811189,
|
|
"grad_norm": 0.20889087265244094,
|
|
"learning_rate": 0.00016759776536312847,
|
|
"loss": 0.8678,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.11748251748251748,
|
|
"grad_norm": 0.20865989038699168,
|
|
"learning_rate": 0.0001759776536312849,
|
|
"loss": 0.8715,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.12307692307692308,
|
|
"grad_norm": 0.17048448558431092,
|
|
"learning_rate": 0.00018435754189944132,
|
|
"loss": 0.866,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.12867132867132866,
|
|
"grad_norm": 0.2563726823096772,
|
|
"learning_rate": 0.00019273743016759776,
|
|
"loss": 0.8786,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.13426573426573427,
|
|
"grad_norm": 0.22854402328286066,
|
|
"learning_rate": 0.00020111731843575417,
|
|
"loss": 0.8641,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.13986013986013987,
|
|
"grad_norm": 0.17876773160294948,
|
|
"learning_rate": 0.00020949720670391058,
|
|
"loss": 0.8625,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.14545454545454545,
|
|
"grad_norm": 0.3615856863579962,
|
|
"learning_rate": 0.00021787709497206702,
|
|
"loss": 0.8504,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.15104895104895105,
|
|
"grad_norm": 0.22751374781762473,
|
|
"learning_rate": 0.00022625698324022346,
|
|
"loss": 0.8683,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.15664335664335666,
|
|
"grad_norm": 0.21790912834318224,
|
|
"learning_rate": 0.00023463687150837985,
|
|
"loss": 0.8519,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.16223776223776223,
|
|
"grad_norm": 0.18707627983249808,
|
|
"learning_rate": 0.0002430167597765363,
|
|
"loss": 0.8426,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.16783216783216784,
|
|
"grad_norm": 0.22404831326656477,
|
|
"learning_rate": 0.0002513966480446927,
|
|
"loss": 0.8509,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.17342657342657342,
|
|
"grad_norm": 0.20919591839479193,
|
|
"learning_rate": 0.00025977653631284914,
|
|
"loss": 0.8594,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.17902097902097902,
|
|
"grad_norm": 0.15579954649232924,
|
|
"learning_rate": 0.0002681564245810055,
|
|
"loss": 0.8319,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.18461538461538463,
|
|
"grad_norm": 0.4055853577053493,
|
|
"learning_rate": 0.00027653631284916196,
|
|
"loss": 0.8577,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.1902097902097902,
|
|
"grad_norm": 0.2945253672148607,
|
|
"learning_rate": 0.0002849162011173184,
|
|
"loss": 0.8579,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.1958041958041958,
|
|
"grad_norm": 0.21772142109085163,
|
|
"learning_rate": 0.00029329608938547484,
|
|
"loss": 0.8382,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.2013986013986014,
|
|
"grad_norm": 0.2281215849497856,
|
|
"learning_rate": 0.00029999971336506766,
|
|
"loss": 0.8331,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.206993006993007,
|
|
"grad_norm": 0.17899592128620762,
|
|
"learning_rate": 0.0002999896812574594,
|
|
"loss": 0.8434,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.2125874125874126,
|
|
"grad_norm": 0.14731246899285452,
|
|
"learning_rate": 0.0002999653184986775,
|
|
"loss": 0.8458,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.21818181818181817,
|
|
"grad_norm": 0.14945248170502354,
|
|
"learning_rate": 0.00029992662741644334,
|
|
"loss": 0.8457,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.22377622377622378,
|
|
"grad_norm": 0.14124929357614965,
|
|
"learning_rate": 0.0002998736117074673,
|
|
"loss": 0.8219,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.22937062937062938,
|
|
"grad_norm": 0.18621904528265473,
|
|
"learning_rate": 0.0002998062764370954,
|
|
"loss": 0.8299,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.23496503496503496,
|
|
"grad_norm": 0.22513445720588005,
|
|
"learning_rate": 0.00029972462803882523,
|
|
"loss": 0.8502,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.24055944055944056,
|
|
"grad_norm": 0.1636384983699745,
|
|
"learning_rate": 0.0002996286743136916,
|
|
"loss": 0.8506,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.24615384615384617,
|
|
"grad_norm": 0.19587892370520038,
|
|
"learning_rate": 0.000299518424429521,
|
|
"loss": 0.8382,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.2517482517482518,
|
|
"grad_norm": 0.17371559645370255,
|
|
"learning_rate": 0.0002993938889200556,
|
|
"loss": 0.8322,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.2573426573426573,
|
|
"grad_norm": 0.13779589201692538,
|
|
"learning_rate": 0.0002992550796839468,
|
|
"loss": 0.8243,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.2629370629370629,
|
|
"grad_norm": 0.22266694135489237,
|
|
"learning_rate": 0.00029910200998361857,
|
|
"loss": 0.8332,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.26853146853146853,
|
|
"grad_norm": 0.21324201617886082,
|
|
"learning_rate": 0.0002989346944440003,
|
|
"loss": 0.8377,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.27412587412587414,
|
|
"grad_norm": 0.1378342855087125,
|
|
"learning_rate": 0.0002987531490511291,
|
|
"loss": 0.8346,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.27972027972027974,
|
|
"grad_norm": 0.13885067623091277,
|
|
"learning_rate": 0.000298557391150623,
|
|
"loss": 0.8366,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.2853146853146853,
|
|
"grad_norm": 0.13254675983004904,
|
|
"learning_rate": 0.00029834743944602316,
|
|
"loss": 0.8389,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.2909090909090909,
|
|
"grad_norm": 0.15399189190157112,
|
|
"learning_rate": 0.0002981233139970071,
|
|
"loss": 0.8247,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.2965034965034965,
|
|
"grad_norm": 0.16475565381996887,
|
|
"learning_rate": 0.0002978850362174722,
|
|
"loss": 0.8042,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.3020979020979021,
|
|
"grad_norm": 0.1449584149552635,
|
|
"learning_rate": 0.0002976326288734894,
|
|
"loss": 0.8113,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.3076923076923077,
|
|
"grad_norm": 0.1479148461740847,
|
|
"learning_rate": 0.0002973661160811284,
|
|
"loss": 0.8327,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.3132867132867133,
|
|
"grad_norm": 0.11641111946130961,
|
|
"learning_rate": 0.00029708552330415337,
|
|
"loss": 0.8266,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.31888111888111886,
|
|
"grad_norm": 0.1289644823631784,
|
|
"learning_rate": 0.0002967908773515898,
|
|
"loss": 0.8031,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.32447552447552447,
|
|
"grad_norm": 0.19696780235530817,
|
|
"learning_rate": 0.0002964822063751635,
|
|
"loss": 0.8195,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.3300699300699301,
|
|
"grad_norm": 0.13415245793361397,
|
|
"learning_rate": 0.00029615953986661056,
|
|
"loss": 0.8232,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.3356643356643357,
|
|
"grad_norm": 0.1289064453230423,
|
|
"learning_rate": 0.0002958229086548595,
|
|
"loss": 0.811,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.3412587412587413,
|
|
"grad_norm": 0.2030532789276685,
|
|
"learning_rate": 0.00029547234490308604,
|
|
"loss": 0.8196,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.34685314685314683,
|
|
"grad_norm": 0.1690018504223368,
|
|
"learning_rate": 0.00029510788210563996,
|
|
"loss": 0.8176,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.35244755244755244,
|
|
"grad_norm": 0.18348727298089326,
|
|
"learning_rate": 0.0002947295550848448,
|
|
"loss": 0.8106,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.35804195804195804,
|
|
"grad_norm": 0.12176145136770436,
|
|
"learning_rate": 0.000294337399987671,
|
|
"loss": 0.8106,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.36363636363636365,
|
|
"grad_norm": 0.14033633981287297,
|
|
"learning_rate": 0.0002939314542822821,
|
|
"loss": 0.8061,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.36923076923076925,
|
|
"grad_norm": 0.1360601366236884,
|
|
"learning_rate": 0.0002935117567544547,
|
|
"loss": 0.8026,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.3748251748251748,
|
|
"grad_norm": 0.15461610864168454,
|
|
"learning_rate": 0.0002930783475038734,
|
|
"loss": 0.8232,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.3804195804195804,
|
|
"grad_norm": 0.10363695784053935,
|
|
"learning_rate": 0.0002926312679402985,
|
|
"loss": 0.8049,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.386013986013986,
|
|
"grad_norm": 0.12894656658416323,
|
|
"learning_rate": 0.00029217056077961043,
|
|
"loss": 0.7993,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.3916083916083916,
|
|
"grad_norm": 0.12013728582042035,
|
|
"learning_rate": 0.000291696270039728,
|
|
"loss": 0.7863,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.3972027972027972,
|
|
"grad_norm": 0.10813698150493352,
|
|
"learning_rate": 0.0002912084410364029,
|
|
"loss": 0.7997,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.4027972027972028,
|
|
"grad_norm": 0.17771298556035153,
|
|
"learning_rate": 0.00029070712037889,
|
|
"loss": 0.8018,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.4083916083916084,
|
|
"grad_norm": 0.10837497819616222,
|
|
"learning_rate": 0.00029019235596549394,
|
|
"loss": 0.8078,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.413986013986014,
|
|
"grad_norm": 0.12560903325827824,
|
|
"learning_rate": 0.0002896641969789932,
|
|
"loss": 0.8182,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.4195804195804196,
|
|
"grad_norm": 0.17753966157649756,
|
|
"learning_rate": 0.0002891226938819405,
|
|
"loss": 0.8059,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.4251748251748252,
|
|
"grad_norm": 0.12975220832490364,
|
|
"learning_rate": 0.0002885678984118415,
|
|
"loss": 0.7811,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.4307692307692308,
|
|
"grad_norm": 0.18552799892105348,
|
|
"learning_rate": 0.0002879998635762118,
|
|
"loss": 0.799,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.43636363636363634,
|
|
"grad_norm": 0.17744047449094297,
|
|
"learning_rate": 0.000287418643647512,
|
|
"loss": 0.7974,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.44195804195804195,
|
|
"grad_norm": 0.13797503272598644,
|
|
"learning_rate": 0.00028682429415796267,
|
|
"loss": 0.7931,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.44755244755244755,
|
|
"grad_norm": 0.12860800142514783,
|
|
"learning_rate": 0.0002862168718942383,
|
|
"loss": 0.7861,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.45314685314685316,
|
|
"grad_norm": 0.12668805301774957,
|
|
"learning_rate": 0.00028559643489204186,
|
|
"loss": 0.8107,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.45874125874125876,
|
|
"grad_norm": 0.11338400624283074,
|
|
"learning_rate": 0.0002849630424305595,
|
|
"loss": 0.8088,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.4643356643356643,
|
|
"grad_norm": 0.1345570809457518,
|
|
"learning_rate": 0.00028431675502679717,
|
|
"loss": 0.8038,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.4699300699300699,
|
|
"grad_norm": 0.11102734283999488,
|
|
"learning_rate": 0.00028365763442979823,
|
|
"loss": 0.8163,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.4755244755244755,
|
|
"grad_norm": 0.12810877098380058,
|
|
"learning_rate": 0.000282985743614744,
|
|
"loss": 0.8017,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.4811188811188811,
|
|
"grad_norm": 0.11873107897366113,
|
|
"learning_rate": 0.0002823011467769364,
|
|
"loss": 0.7957,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.48671328671328673,
|
|
"grad_norm": 0.13272846461187762,
|
|
"learning_rate": 0.000281603909325665,
|
|
"loss": 0.8054,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.49230769230769234,
|
|
"grad_norm": 0.10875049751356179,
|
|
"learning_rate": 0.00028089409787795716,
|
|
"loss": 0.7976,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.4979020979020979,
|
|
"grad_norm": 0.11634952268956057,
|
|
"learning_rate": 0.0002801717802522132,
|
|
"loss": 0.792,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.5034965034965035,
|
|
"grad_norm": 0.10607280522202725,
|
|
"learning_rate": 0.00027943702546172697,
|
|
"loss": 0.8078,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.509090909090909,
|
|
"grad_norm": 0.15901805386261234,
|
|
"learning_rate": 0.00027868990370809164,
|
|
"loss": 0.8023,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.5146853146853146,
|
|
"grad_norm": 0.12222508621029372,
|
|
"learning_rate": 0.00027793048637449273,
|
|
"loss": 0.7956,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.5202797202797202,
|
|
"grad_norm": 0.10769602993732762,
|
|
"learning_rate": 0.0002771588460188876,
|
|
"loss": 0.7897,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.5258741258741259,
|
|
"grad_norm": 0.1119724845850892,
|
|
"learning_rate": 0.00027637505636707315,
|
|
"loss": 0.7901,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.5314685314685315,
|
|
"grad_norm": 0.11279764446201575,
|
|
"learning_rate": 0.0002755791923056415,
|
|
"loss": 0.79,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.5370629370629371,
|
|
"grad_norm": 0.16791268697388115,
|
|
"learning_rate": 0.0002747713298748253,
|
|
"loss": 0.7909,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.5426573426573427,
|
|
"grad_norm": 0.14084691053374204,
|
|
"learning_rate": 0.00027395154626123225,
|
|
"loss": 0.8013,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.5482517482517483,
|
|
"grad_norm": 0.12452861008817544,
|
|
"learning_rate": 0.00027311991979047046,
|
|
"loss": 0.7888,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.5538461538461539,
|
|
"grad_norm": 0.12432408060887955,
|
|
"learning_rate": 0.00027227652991966507,
|
|
"loss": 0.7736,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.5594405594405595,
|
|
"grad_norm": 0.1167273022435753,
|
|
"learning_rate": 0.00027142145722986637,
|
|
"loss": 0.7892,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.5650349650349651,
|
|
"grad_norm": 0.1570515101772784,
|
|
"learning_rate": 0.0002705547834183506,
|
|
"loss": 0.7735,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.5706293706293706,
|
|
"grad_norm": 0.1757157276572827,
|
|
"learning_rate": 0.00026967659129081465,
|
|
"loss": 0.7947,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.5762237762237762,
|
|
"grad_norm": 0.10842613803532425,
|
|
"learning_rate": 0.0002687869647534643,
|
|
"loss": 0.7844,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.5818181818181818,
|
|
"grad_norm": 0.14559223171878447,
|
|
"learning_rate": 0.0002678859888049972,
|
|
"loss": 0.7881,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.5874125874125874,
|
|
"grad_norm": 0.13398356861671987,
|
|
"learning_rate": 0.0002669737495284819,
|
|
"loss": 0.7731,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.593006993006993,
|
|
"grad_norm": 0.16426998549603838,
|
|
"learning_rate": 0.00026605033408313354,
|
|
"loss": 0.7819,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.5986013986013986,
|
|
"grad_norm": 0.12765279377973468,
|
|
"learning_rate": 0.0002651158306959855,
|
|
"loss": 0.7725,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.6041958041958042,
|
|
"grad_norm": 0.11028117529632633,
|
|
"learning_rate": 0.00026417032865346023,
|
|
"loss": 0.7926,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.6097902097902098,
|
|
"grad_norm": 0.12782289803306612,
|
|
"learning_rate": 0.00026321391829283884,
|
|
"loss": 0.7634,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.6153846153846154,
|
|
"grad_norm": 0.1369549911517866,
|
|
"learning_rate": 0.0002622466909936289,
|
|
"loss": 0.7628,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 0.620979020979021,
|
|
"grad_norm": 0.11521467825966626,
|
|
"learning_rate": 0.0002612687391688347,
|
|
"loss": 0.7763,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 0.6265734265734266,
|
|
"grad_norm": 0.12350512688772212,
|
|
"learning_rate": 0.00026028015625612706,
|
|
"loss": 0.7884,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 0.6321678321678321,
|
|
"grad_norm": 0.10825882258061882,
|
|
"learning_rate": 0.000259281036708916,
|
|
"loss": 0.7945,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 0.6377622377622377,
|
|
"grad_norm": 0.11150400278758185,
|
|
"learning_rate": 0.00025827147598732656,
|
|
"loss": 0.7862,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 0.6433566433566433,
|
|
"grad_norm": 0.10337975508652891,
|
|
"learning_rate": 0.00025725157054907777,
|
|
"loss": 0.7838,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 0.6489510489510489,
|
|
"grad_norm": 0.09770680702107988,
|
|
"learning_rate": 0.0002562214178402669,
|
|
"loss": 0.7969,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 0.6545454545454545,
|
|
"grad_norm": 0.1163323874465344,
|
|
"learning_rate": 0.00025518111628605885,
|
|
"loss": 0.7819,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 0.6601398601398601,
|
|
"grad_norm": 0.11896295851189105,
|
|
"learning_rate": 0.00025413076528128255,
|
|
"loss": 0.7709,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 0.6657342657342658,
|
|
"grad_norm": 0.11330549108690711,
|
|
"learning_rate": 0.0002530704651809339,
|
|
"loss": 0.7744,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 0.6713286713286714,
|
|
"grad_norm": 0.13654029100152085,
|
|
"learning_rate": 0.0002520003172905878,
|
|
"loss": 0.7952,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.676923076923077,
|
|
"grad_norm": 0.10351068732360345,
|
|
"learning_rate": 0.0002509204238567186,
|
|
"loss": 0.7755,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 0.6825174825174826,
|
|
"grad_norm": 0.14077884569885882,
|
|
"learning_rate": 0.00024983088805693163,
|
|
"loss": 0.7831,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 0.6881118881118881,
|
|
"grad_norm": 0.11948510265909233,
|
|
"learning_rate": 0.00024873181399010446,
|
|
"loss": 0.7861,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 0.6937062937062937,
|
|
"grad_norm": 0.12755481254736167,
|
|
"learning_rate": 0.00024762330666644136,
|
|
"loss": 0.7782,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 0.6993006993006993,
|
|
"grad_norm": 0.10102773025347063,
|
|
"learning_rate": 0.0002465054719974401,
|
|
"loss": 0.7731,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 0.7048951048951049,
|
|
"grad_norm": 0.11665875780674151,
|
|
"learning_rate": 0.0002453784167857725,
|
|
"loss": 0.7839,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 0.7104895104895105,
|
|
"grad_norm": 0.1074509342573267,
|
|
"learning_rate": 0.00024424224871508014,
|
|
"loss": 0.7769,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 0.7160839160839161,
|
|
"grad_norm": 0.12422918258396683,
|
|
"learning_rate": 0.0002430970763396861,
|
|
"loss": 0.7754,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 0.7216783216783217,
|
|
"grad_norm": 0.11786940518440715,
|
|
"learning_rate": 0.00024194300907422276,
|
|
"loss": 0.7974,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 0.7272727272727273,
|
|
"grad_norm": 0.10398253596299759,
|
|
"learning_rate": 0.00024078015718317818,
|
|
"loss": 0.7729,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 0.7328671328671329,
|
|
"grad_norm": 0.09903776559603776,
|
|
"learning_rate": 0.00023960863177036079,
|
|
"loss": 0.774,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 0.7384615384615385,
|
|
"grad_norm": 0.09179018818178636,
|
|
"learning_rate": 0.00023842854476828411,
|
|
"loss": 0.7629,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 0.7440559440559441,
|
|
"grad_norm": 0.10143244330106851,
|
|
"learning_rate": 0.0002372400089274724,
|
|
"loss": 0.781,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 0.7496503496503496,
|
|
"grad_norm": 0.1121174501554115,
|
|
"learning_rate": 0.00023604313780568772,
|
|
"loss": 0.7811,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 0.7552447552447552,
|
|
"grad_norm": 0.10047620860251714,
|
|
"learning_rate": 0.00023483804575708027,
|
|
"loss": 0.7752,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 0.7608391608391608,
|
|
"grad_norm": 0.11068987100435401,
|
|
"learning_rate": 0.0002336248479212626,
|
|
"loss": 0.7657,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 0.7664335664335664,
|
|
"grad_norm": 0.12535370469312143,
|
|
"learning_rate": 0.0002324036602123086,
|
|
"loss": 0.7731,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 0.772027972027972,
|
|
"grad_norm": 0.11059830294321518,
|
|
"learning_rate": 0.00023117459930767847,
|
|
"loss": 0.7831,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 0.7776223776223776,
|
|
"grad_norm": 0.11073279493368117,
|
|
"learning_rate": 0.00022993778263707105,
|
|
"loss": 0.7705,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 0.7832167832167832,
|
|
"grad_norm": 0.12764331660853,
|
|
"learning_rate": 0.000228693328371204,
|
|
"loss": 0.7816,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.7888111888111888,
|
|
"grad_norm": 0.10951855988735575,
|
|
"learning_rate": 0.0002274413554105232,
|
|
"loss": 0.7577,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 0.7944055944055944,
|
|
"grad_norm": 0.12320185793067193,
|
|
"learning_rate": 0.00022618198337384264,
|
|
"loss": 0.7744,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 0.8,
|
|
"grad_norm": 0.10716655164464266,
|
|
"learning_rate": 0.00022491533258691546,
|
|
"loss": 0.7752,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 0.8055944055944056,
|
|
"grad_norm": 0.11962877035421185,
|
|
"learning_rate": 0.00022364152407093737,
|
|
"loss": 0.7812,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 0.8111888111888111,
|
|
"grad_norm": 0.10651252517324981,
|
|
"learning_rate": 0.00022236067953098414,
|
|
"loss": 0.78,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 0.8167832167832167,
|
|
"grad_norm": 0.1086890422520719,
|
|
"learning_rate": 0.00022107292134438298,
|
|
"loss": 0.7801,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 0.8223776223776224,
|
|
"grad_norm": 0.09743032182612474,
|
|
"learning_rate": 0.00021977837254902034,
|
|
"loss": 0.7762,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 0.827972027972028,
|
|
"grad_norm": 0.09964450561189496,
|
|
"learning_rate": 0.0002184771568315862,
|
|
"loss": 0.7809,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 0.8335664335664336,
|
|
"grad_norm": 0.09852105354101268,
|
|
"learning_rate": 0.0002171693985157567,
|
|
"loss": 0.7803,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 0.8391608391608392,
|
|
"grad_norm": 0.10751247181443387,
|
|
"learning_rate": 0.00021585522255031554,
|
|
"loss": 0.754,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 0.8447552447552448,
|
|
"grad_norm": 0.10083658202350868,
|
|
"learning_rate": 0.00021453475449721593,
|
|
"loss": 0.7689,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 0.8503496503496504,
|
|
"grad_norm": 0.1132302634053956,
|
|
"learning_rate": 0.00021320812051958392,
|
|
"loss": 0.7667,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 0.855944055944056,
|
|
"grad_norm": 0.09942757452904256,
|
|
"learning_rate": 0.00021187544736966403,
|
|
"loss": 0.7798,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 0.8615384615384616,
|
|
"grad_norm": 0.11119180151464317,
|
|
"learning_rate": 0.00021053686237670912,
|
|
"loss": 0.7768,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 0.8671328671328671,
|
|
"grad_norm": 0.11012292213675227,
|
|
"learning_rate": 0.0002091924934348146,
|
|
"loss": 0.7641,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 0.8727272727272727,
|
|
"grad_norm": 0.10508899250762296,
|
|
"learning_rate": 0.0002078424689906988,
|
|
"loss": 0.772,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 0.8783216783216783,
|
|
"grad_norm": 0.09916724236606267,
|
|
"learning_rate": 0.00020648691803143088,
|
|
"loss": 0.7798,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 0.8839160839160839,
|
|
"grad_norm": 0.10139136991498161,
|
|
"learning_rate": 0.00020512597007210672,
|
|
"loss": 0.7595,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 0.8895104895104895,
|
|
"grad_norm": 0.09922677378640356,
|
|
"learning_rate": 0.00020375975514347447,
|
|
"loss": 0.7582,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 0.8951048951048951,
|
|
"grad_norm": 0.10734407771584646,
|
|
"learning_rate": 0.0002023884037795109,
|
|
"loss": 0.7747,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.9006993006993007,
|
|
"grad_norm": 0.09003146154158007,
|
|
"learning_rate": 0.00020101204700494963,
|
|
"loss": 0.772,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 0.9062937062937063,
|
|
"grad_norm": 0.10224227095976963,
|
|
"learning_rate": 0.00019963081632276244,
|
|
"loss": 0.7632,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 0.9118881118881119,
|
|
"grad_norm": 0.10634964894349899,
|
|
"learning_rate": 0.00019824484370159511,
|
|
"loss": 0.7621,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 0.9174825174825175,
|
|
"grad_norm": 0.08669203171955091,
|
|
"learning_rate": 0.00019685426156315817,
|
|
"loss": 0.7678,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 0.9230769230769231,
|
|
"grad_norm": 0.09236536065178236,
|
|
"learning_rate": 0.00019545920276957512,
|
|
"loss": 0.7615,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 0.9286713286713286,
|
|
"grad_norm": 0.09051106183048227,
|
|
"learning_rate": 0.00019405980061068813,
|
|
"loss": 0.7538,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 0.9342657342657342,
|
|
"grad_norm": 0.09075314123364801,
|
|
"learning_rate": 0.00019265618879132294,
|
|
"loss": 0.7695,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 0.9398601398601398,
|
|
"grad_norm": 0.09779655862085175,
|
|
"learning_rate": 0.000191248501418514,
|
|
"loss": 0.7486,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 0.9454545454545454,
|
|
"grad_norm": 0.09284439899834651,
|
|
"learning_rate": 0.00018983687298869165,
|
|
"loss": 0.7757,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 0.951048951048951,
|
|
"grad_norm": 0.10906263614923466,
|
|
"learning_rate": 0.00018842143837483137,
|
|
"loss": 0.7654,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 0.9566433566433566,
|
|
"grad_norm": 0.08711563033296012,
|
|
"learning_rate": 0.00018700233281356774,
|
|
"loss": 0.7661,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 0.9622377622377623,
|
|
"grad_norm": 0.10105386217514606,
|
|
"learning_rate": 0.00018557969189227327,
|
|
"loss": 0.7566,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 0.9678321678321679,
|
|
"grad_norm": 0.09374075010702233,
|
|
"learning_rate": 0.00018415365153610363,
|
|
"loss": 0.7505,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 0.9734265734265735,
|
|
"grad_norm": 0.09217739631223294,
|
|
"learning_rate": 0.00018272434799501108,
|
|
"loss": 0.7513,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 0.9790209790209791,
|
|
"grad_norm": 0.10860300198578388,
|
|
"learning_rate": 0.00018129191783072644,
|
|
"loss": 0.7586,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 0.9846153846153847,
|
|
"grad_norm": 0.10382198085691989,
|
|
"learning_rate": 0.00017985649790371123,
|
|
"loss": 0.7712,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 0.9902097902097902,
|
|
"grad_norm": 0.09718012392096963,
|
|
"learning_rate": 0.00017841822536008174,
|
|
"loss": 0.7548,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 0.9958041958041958,
|
|
"grad_norm": 0.09129380340752612,
|
|
"learning_rate": 0.00017697723761850529,
|
|
"loss": 0.7442,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 0.9991608391608392,
|
|
"eval_loss": 1.078917145729065,
|
|
"eval_runtime": 368.4649,
|
|
"eval_samples_per_second": 55.878,
|
|
"eval_steps_per_second": 1.748,
|
|
"step": 893
|
|
},
|
|
{
|
|
"epoch": 1.0013986013986014,
|
|
"grad_norm": 0.2193429577704184,
|
|
"learning_rate": 0.0001755336723570709,
|
|
"loss": 0.7304,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 1.006993006993007,
|
|
"grad_norm": 0.12029696024418647,
|
|
"learning_rate": 0.00017408766750013455,
|
|
"loss": 0.6883,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 1.0125874125874126,
|
|
"grad_norm": 0.11157245101821324,
|
|
"learning_rate": 0.0001726393612051416,
|
|
"loss": 0.6937,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 1.018181818181818,
|
|
"grad_norm": 0.11562847307206518,
|
|
"learning_rate": 0.0001711888918494268,
|
|
"loss": 0.7072,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 1.0237762237762238,
|
|
"grad_norm": 0.09699091401508494,
|
|
"learning_rate": 0.00016973639801699258,
|
|
"loss": 0.7002,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 1.0293706293706293,
|
|
"grad_norm": 0.10329864513789806,
|
|
"learning_rate": 0.0001682820184852687,
|
|
"loss": 0.7049,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 1.034965034965035,
|
|
"grad_norm": 0.10505439144371424,
|
|
"learning_rate": 0.0001668258922118525,
|
|
"loss": 0.7062,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 1.0405594405594405,
|
|
"grad_norm": 0.09345853743066072,
|
|
"learning_rate": 0.0001653681583212326,
|
|
"loss": 0.705,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 1.0461538461538462,
|
|
"grad_norm": 0.14624259074546206,
|
|
"learning_rate": 0.00016390895609149608,
|
|
"loss": 0.6862,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 1.0517482517482517,
|
|
"grad_norm": 0.09484870970062163,
|
|
"learning_rate": 0.00016244842494102135,
|
|
"loss": 0.6794,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 1.0573426573426574,
|
|
"grad_norm": 0.1093126433741654,
|
|
"learning_rate": 0.00016098670441515759,
|
|
"loss": 0.6965,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 1.062937062937063,
|
|
"grad_norm": 0.08709955416922696,
|
|
"learning_rate": 0.000159523934172892,
|
|
"loss": 0.6875,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 1.0685314685314686,
|
|
"grad_norm": 0.08587078936550793,
|
|
"learning_rate": 0.00015806025397350617,
|
|
"loss": 0.6816,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 1.0741258741258741,
|
|
"grad_norm": 0.09226255228367097,
|
|
"learning_rate": 0.00015659580366322265,
|
|
"loss": 0.6909,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 1.0797202797202796,
|
|
"grad_norm": 0.0936240844549708,
|
|
"learning_rate": 0.00015513072316184393,
|
|
"loss": 0.6904,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 1.0853146853146853,
|
|
"grad_norm": 0.085659439411888,
|
|
"learning_rate": 0.0001536651524493834,
|
|
"loss": 0.6874,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 1.0909090909090908,
|
|
"grad_norm": 0.09924612907304702,
|
|
"learning_rate": 0.00015219923155269157,
|
|
"loss": 0.6953,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 1.0965034965034965,
|
|
"grad_norm": 0.10028513455623837,
|
|
"learning_rate": 0.00015073310053207665,
|
|
"loss": 0.6967,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 1.102097902097902,
|
|
"grad_norm": 0.09345136457275896,
|
|
"learning_rate": 0.00014926689946792332,
|
|
"loss": 0.6905,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 1.1076923076923078,
|
|
"grad_norm": 0.08893419418399162,
|
|
"learning_rate": 0.00014780076844730849,
|
|
"loss": 0.6985,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 1.1132867132867132,
|
|
"grad_norm": 0.08853702003030435,
|
|
"learning_rate": 0.00014633484755061658,
|
|
"loss": 0.7014,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 1.118881118881119,
|
|
"grad_norm": 0.08365464476596743,
|
|
"learning_rate": 0.0001448692768381561,
|
|
"loss": 0.697,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 1.1244755244755245,
|
|
"grad_norm": 0.08780213786162074,
|
|
"learning_rate": 0.00014340419633677732,
|
|
"loss": 0.7025,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 1.1300699300699302,
|
|
"grad_norm": 0.0884295613746686,
|
|
"learning_rate": 0.00014193974602649386,
|
|
"loss": 0.6993,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 1.1356643356643357,
|
|
"grad_norm": 0.09059540370700289,
|
|
"learning_rate": 0.00014047606582710798,
|
|
"loss": 0.6948,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 1.1412587412587412,
|
|
"grad_norm": 0.09899100050319447,
|
|
"learning_rate": 0.00013901329558484236,
|
|
"loss": 0.6992,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 1.1468531468531469,
|
|
"grad_norm": 0.09283620847093584,
|
|
"learning_rate": 0.00013755157505897868,
|
|
"loss": 0.7184,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 1.1524475524475524,
|
|
"grad_norm": 0.08887024389956383,
|
|
"learning_rate": 0.00013609104390850392,
|
|
"loss": 0.697,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 1.158041958041958,
|
|
"grad_norm": 0.08693816785751791,
|
|
"learning_rate": 0.0001346318416787674,
|
|
"loss": 0.6939,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 1.1636363636363636,
|
|
"grad_norm": 0.09116425941497776,
|
|
"learning_rate": 0.00013317410778814745,
|
|
"loss": 0.6989,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 1.1692307692307693,
|
|
"grad_norm": 0.08570466903745366,
|
|
"learning_rate": 0.00013171798151473133,
|
|
"loss": 0.6956,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 1.1748251748251748,
|
|
"grad_norm": 0.0974948891545614,
|
|
"learning_rate": 0.0001302636019830074,
|
|
"loss": 0.6965,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 1.1804195804195805,
|
|
"grad_norm": 0.08995597994573838,
|
|
"learning_rate": 0.0001288111081505732,
|
|
"loss": 0.7041,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 1.186013986013986,
|
|
"grad_norm": 0.07861264078439004,
|
|
"learning_rate": 0.00012736063879485837,
|
|
"loss": 0.7032,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 1.1916083916083915,
|
|
"grad_norm": 0.0841514205254347,
|
|
"learning_rate": 0.0001259123324998655,
|
|
"loss": 0.6905,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 1.1972027972027972,
|
|
"grad_norm": 0.08692540683602883,
|
|
"learning_rate": 0.0001244663276429291,
|
|
"loss": 0.7074,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 1.2027972027972027,
|
|
"grad_norm": 0.08776833500890557,
|
|
"learning_rate": 0.00012302276238149463,
|
|
"loss": 0.7041,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 1.2083916083916084,
|
|
"grad_norm": 0.08727180482054969,
|
|
"learning_rate": 0.00012158177463991828,
|
|
"loss": 0.696,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 1.213986013986014,
|
|
"grad_norm": 0.08046559487267867,
|
|
"learning_rate": 0.00012014350209628875,
|
|
"loss": 0.6826,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 1.2195804195804196,
|
|
"grad_norm": 0.08781752670322365,
|
|
"learning_rate": 0.00011870808216927356,
|
|
"loss": 0.6999,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 1.2251748251748251,
|
|
"grad_norm": 0.08813740501797863,
|
|
"learning_rate": 0.00011727565200498888,
|
|
"loss": 0.7037,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 1.2307692307692308,
|
|
"grad_norm": 0.08772222997454243,
|
|
"learning_rate": 0.00011584634846389638,
|
|
"loss": 0.6986,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 1.2363636363636363,
|
|
"grad_norm": 0.10332929019104932,
|
|
"learning_rate": 0.00011442030810772673,
|
|
"loss": 0.6725,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 1.241958041958042,
|
|
"grad_norm": 0.08709301562019137,
|
|
"learning_rate": 0.00011299766718643226,
|
|
"loss": 0.7063,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 1.2475524475524475,
|
|
"grad_norm": 0.08449696490718896,
|
|
"learning_rate": 0.00011157856162516863,
|
|
"loss": 0.692,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 1.2531468531468533,
|
|
"grad_norm": 0.08124616270666368,
|
|
"learning_rate": 0.00011016312701130841,
|
|
"loss": 0.6915,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 1.2587412587412588,
|
|
"grad_norm": 0.09376360999458151,
|
|
"learning_rate": 0.000108751498581486,
|
|
"loss": 0.6939,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 1.2643356643356642,
|
|
"grad_norm": 0.08677694965635109,
|
|
"learning_rate": 0.00010734381120867707,
|
|
"loss": 0.7029,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 1.26993006993007,
|
|
"grad_norm": 0.08737879433308637,
|
|
"learning_rate": 0.00010594019938931187,
|
|
"loss": 0.6849,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 1.2755244755244755,
|
|
"grad_norm": 0.08311271942623545,
|
|
"learning_rate": 0.00010454079723042485,
|
|
"loss": 0.6799,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 1.2811188811188812,
|
|
"grad_norm": 0.07508666997892824,
|
|
"learning_rate": 0.00010314573843684183,
|
|
"loss": 0.6979,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 1.2867132867132867,
|
|
"grad_norm": 0.07967922610661686,
|
|
"learning_rate": 0.00010175515629840487,
|
|
"loss": 0.6793,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 1.2923076923076924,
|
|
"grad_norm": 0.08373250104545919,
|
|
"learning_rate": 0.00010036918367723754,
|
|
"loss": 0.6942,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 1.2979020979020979,
|
|
"grad_norm": 0.08415435083106365,
|
|
"learning_rate": 9.898795299505037e-05,
|
|
"loss": 0.6843,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 1.3034965034965036,
|
|
"grad_norm": 0.0972925237131085,
|
|
"learning_rate": 9.761159622048914e-05,
|
|
"loss": 0.6786,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 1.309090909090909,
|
|
"grad_norm": 0.08332589008281578,
|
|
"learning_rate": 9.624024485652552e-05,
|
|
"loss": 0.6895,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 1.3146853146853146,
|
|
"grad_norm": 0.08324352673680815,
|
|
"learning_rate": 9.48740299278933e-05,
|
|
"loss": 0.6902,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 1.3202797202797203,
|
|
"grad_norm": 0.08212510567034546,
|
|
"learning_rate": 9.351308196856911e-05,
|
|
"loss": 0.6861,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 1.325874125874126,
|
|
"grad_norm": 0.08320001036155726,
|
|
"learning_rate": 9.215753100930118e-05,
|
|
"loss": 0.6943,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 1.3314685314685315,
|
|
"grad_norm": 0.09026366102833903,
|
|
"learning_rate": 9.08075065651854e-05,
|
|
"loss": 0.7031,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 1.337062937062937,
|
|
"grad_norm": 0.07809171125830346,
|
|
"learning_rate": 8.946313762329081e-05,
|
|
"loss": 0.6974,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 1.3426573426573427,
|
|
"grad_norm": 0.07708050011669573,
|
|
"learning_rate": 8.812455263033595e-05,
|
|
"loss": 0.7072,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 1.3482517482517482,
|
|
"grad_norm": 0.08672106026682487,
|
|
"learning_rate": 8.679187948041605e-05,
|
|
"loss": 0.6946,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 1.353846153846154,
|
|
"grad_norm": 0.08223294476361295,
|
|
"learning_rate": 8.546524550278405e-05,
|
|
"loss": 0.6917,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 1.3594405594405594,
|
|
"grad_norm": 0.07572041983819904,
|
|
"learning_rate": 8.414477744968441e-05,
|
|
"loss": 0.7068,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 1.365034965034965,
|
|
"grad_norm": 0.07935310328973877,
|
|
"learning_rate": 8.283060148424328e-05,
|
|
"loss": 0.6825,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 1.3706293706293706,
|
|
"grad_norm": 0.08084222220036809,
|
|
"learning_rate": 8.152284316841382e-05,
|
|
"loss": 0.6895,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 1.3762237762237763,
|
|
"grad_norm": 0.08362754142551204,
|
|
"learning_rate": 8.02216274509797e-05,
|
|
"loss": 0.6855,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 1.3818181818181818,
|
|
"grad_norm": 0.08263146843817087,
|
|
"learning_rate": 7.892707865561702e-05,
|
|
"loss": 0.685,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 1.3874125874125873,
|
|
"grad_norm": 0.07614085795637057,
|
|
"learning_rate": 7.763932046901587e-05,
|
|
"loss": 0.698,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 1.393006993006993,
|
|
"grad_norm": 0.08297289392137173,
|
|
"learning_rate": 7.635847592906259e-05,
|
|
"loss": 0.6892,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 1.3986013986013985,
|
|
"grad_norm": 0.09257478488971461,
|
|
"learning_rate": 7.50846674130845e-05,
|
|
"loss": 0.6819,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 1.4041958041958043,
|
|
"grad_norm": 0.0820635181024322,
|
|
"learning_rate": 7.381801662615731e-05,
|
|
"loss": 0.6836,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 1.4097902097902097,
|
|
"grad_norm": 0.08479286795375468,
|
|
"learning_rate": 7.255864458947677e-05,
|
|
"loss": 0.6838,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 1.4153846153846155,
|
|
"grad_norm": 0.08169299364006637,
|
|
"learning_rate": 7.130667162879602e-05,
|
|
"loss": 0.6912,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 1.420979020979021,
|
|
"grad_norm": 0.0847296311273153,
|
|
"learning_rate": 7.006221736292892e-05,
|
|
"loss": 0.6824,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 1.4265734265734267,
|
|
"grad_norm": 0.07999623959693618,
|
|
"learning_rate": 6.882540069232155e-05,
|
|
"loss": 0.6806,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 1.4321678321678322,
|
|
"grad_norm": 0.07964123970317083,
|
|
"learning_rate": 6.759633978769139e-05,
|
|
"loss": 0.7052,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 1.4377622377622377,
|
|
"grad_norm": 0.0966871184419301,
|
|
"learning_rate": 6.63751520787374e-05,
|
|
"loss": 0.7002,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 1.4433566433566434,
|
|
"grad_norm": 0.09626897693462537,
|
|
"learning_rate": 6.516195424291972e-05,
|
|
"loss": 0.6912,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 1.4489510489510489,
|
|
"grad_norm": 0.08145120964067751,
|
|
"learning_rate": 6.395686219431232e-05,
|
|
"loss": 0.6877,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 1.4545454545454546,
|
|
"grad_norm": 0.08789566288442652,
|
|
"learning_rate": 6.275999107252758e-05,
|
|
"loss": 0.6847,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 1.46013986013986,
|
|
"grad_norm": 0.0806679485875012,
|
|
"learning_rate": 6.157145523171587e-05,
|
|
"loss": 0.6869,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 1.4657342657342658,
|
|
"grad_norm": 0.07942983212018809,
|
|
"learning_rate": 6.039136822963924e-05,
|
|
"loss": 0.6767,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 1.4713286713286713,
|
|
"grad_norm": 0.08489122954397245,
|
|
"learning_rate": 5.9219842816821796e-05,
|
|
"loss": 0.6814,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 1.476923076923077,
|
|
"grad_norm": 0.08653144402466852,
|
|
"learning_rate": 5.805699092577722e-05,
|
|
"loss": 0.6968,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 1.4825174825174825,
|
|
"grad_norm": 0.08218462407837061,
|
|
"learning_rate": 5.6902923660313855e-05,
|
|
"loss": 0.6781,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 1.488111888111888,
|
|
"grad_norm": 0.07605468398853794,
|
|
"learning_rate": 5.5757751284919836e-05,
|
|
"loss": 0.6837,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 1.4937062937062937,
|
|
"grad_norm": 0.07347540817224318,
|
|
"learning_rate": 5.462158321422751e-05,
|
|
"loss": 0.678,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 1.4993006993006994,
|
|
"grad_norm": 0.08136761352155132,
|
|
"learning_rate": 5.34945280025599e-05,
|
|
"loss": 0.6843,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 1.504895104895105,
|
|
"grad_norm": 0.079793665685405,
|
|
"learning_rate": 5.237669333355863e-05,
|
|
"loss": 0.6919,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 1.5104895104895104,
|
|
"grad_norm": 0.07721773411337415,
|
|
"learning_rate": 5.126818600989557e-05,
|
|
"loss": 0.6826,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 1.5160839160839161,
|
|
"grad_norm": 0.07622519698946323,
|
|
"learning_rate": 5.0169111943068374e-05,
|
|
"loss": 0.6901,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 1.5216783216783218,
|
|
"grad_norm": 0.07862095262493349,
|
|
"learning_rate": 4.9079576143281326e-05,
|
|
"loss": 0.691,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 1.5272727272727273,
|
|
"grad_norm": 0.07949120411421966,
|
|
"learning_rate": 4.7999682709412216e-05,
|
|
"loss": 0.6806,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 1.5328671328671328,
|
|
"grad_norm": 0.08530944602336236,
|
|
"learning_rate": 4.692953481906605e-05,
|
|
"loss": 0.6847,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 1.5384615384615383,
|
|
"grad_norm": 0.08074798934800313,
|
|
"learning_rate": 4.586923471871743e-05,
|
|
"loss": 0.681,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 1.544055944055944,
|
|
"grad_norm": 0.0777449518216711,
|
|
"learning_rate": 4.481888371394115e-05,
|
|
"loss": 0.6874,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 1.5496503496503498,
|
|
"grad_norm": 0.07535968734459726,
|
|
"learning_rate": 4.377858215973318e-05,
|
|
"loss": 0.6751,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 1.5552447552447553,
|
|
"grad_norm": 0.07188075306681382,
|
|
"learning_rate": 4.2748429450922263e-05,
|
|
"loss": 0.6745,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 1.5608391608391607,
|
|
"grad_norm": 0.0781312708212868,
|
|
"learning_rate": 4.172852401267347e-05,
|
|
"loss": 0.688,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 1.5664335664335665,
|
|
"grad_norm": 0.07521827314744757,
|
|
"learning_rate": 4.0718963291084e-05,
|
|
"loss": 0.6757,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 1.5720279720279722,
|
|
"grad_norm": 0.07373545234599518,
|
|
"learning_rate": 3.9719843743872964e-05,
|
|
"loss": 0.6778,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 1.5776223776223777,
|
|
"grad_norm": 0.07264270828709123,
|
|
"learning_rate": 3.873126083116525e-05,
|
|
"loss": 0.6864,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 1.5832167832167832,
|
|
"grad_norm": 0.0763271584430483,
|
|
"learning_rate": 3.775330900637108e-05,
|
|
"loss": 0.683,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 1.5888111888111887,
|
|
"grad_norm": 0.07981572030031307,
|
|
"learning_rate": 3.678608170716117e-05,
|
|
"loss": 0.6795,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 1.5944055944055944,
|
|
"grad_norm": 0.0746896810608302,
|
|
"learning_rate": 3.582967134653972e-05,
|
|
"loss": 0.675,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 1.6,
|
|
"grad_norm": 0.07449703423407898,
|
|
"learning_rate": 3.488416930401457e-05,
|
|
"loss": 0.6805,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 1.6055944055944056,
|
|
"grad_norm": 0.07104343697673421,
|
|
"learning_rate": 3.3949665916866466e-05,
|
|
"loss": 0.6752,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 1.611188811188811,
|
|
"grad_norm": 0.07498548014087274,
|
|
"learning_rate": 3.302625047151807e-05,
|
|
"loss": 0.6949,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 1.6167832167832168,
|
|
"grad_norm": 0.07539631481012679,
|
|
"learning_rate": 3.211401119500283e-05,
|
|
"loss": 0.6892,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 1.6223776223776225,
|
|
"grad_norm": 0.0765200306903301,
|
|
"learning_rate": 3.12130352465357e-05,
|
|
"loss": 0.6891,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 1.627972027972028,
|
|
"grad_norm": 0.0730441214772004,
|
|
"learning_rate": 3.032340870918527e-05,
|
|
"loss": 0.6981,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 1.6335664335664335,
|
|
"grad_norm": 0.08171074628055888,
|
|
"learning_rate": 2.9445216581649384e-05,
|
|
"loss": 0.6936,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 1.6391608391608392,
|
|
"grad_norm": 0.083112396316528,
|
|
"learning_rate": 2.8578542770133654e-05,
|
|
"loss": 0.6737,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 1.6447552447552447,
|
|
"grad_norm": 0.08252295941296424,
|
|
"learning_rate": 2.772347008033492e-05,
|
|
"loss": 0.701,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 1.6503496503496504,
|
|
"grad_norm": 0.07835293344801121,
|
|
"learning_rate": 2.688008020952952e-05,
|
|
"loss": 0.6921,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 1.655944055944056,
|
|
"grad_norm": 0.0778212904803543,
|
|
"learning_rate": 2.6048453738767755e-05,
|
|
"loss": 0.6764,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 1.6615384615384614,
|
|
"grad_norm": 0.07182079524548696,
|
|
"learning_rate": 2.5228670125174704e-05,
|
|
"loss": 0.6841,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 1.6671328671328671,
|
|
"grad_norm": 0.072274928421158,
|
|
"learning_rate": 2.4420807694358468e-05,
|
|
"loss": 0.6823,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 1.6727272727272728,
|
|
"grad_norm": 0.07124500713293849,
|
|
"learning_rate": 2.3624943632926853e-05,
|
|
"loss": 0.6816,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 1.6783216783216783,
|
|
"grad_norm": 0.06940723884896695,
|
|
"learning_rate": 2.2841153981112397e-05,
|
|
"loss": 0.6805,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 1.6839160839160838,
|
|
"grad_norm": 0.07325019748524,
|
|
"learning_rate": 2.20695136255073e-05,
|
|
"loss": 0.6614,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 1.6895104895104895,
|
|
"grad_norm": 0.0727025067676513,
|
|
"learning_rate": 2.1310096291908347e-05,
|
|
"loss": 0.6851,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 1.6951048951048953,
|
|
"grad_norm": 0.07406758207149883,
|
|
"learning_rate": 2.0562974538273024e-05,
|
|
"loss": 0.6978,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 1.7006993006993008,
|
|
"grad_norm": 0.0684645890113912,
|
|
"learning_rate": 1.9828219747786733e-05,
|
|
"loss": 0.6814,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 1.7062937062937062,
|
|
"grad_norm": 0.07103130256598,
|
|
"learning_rate": 1.910590212204281e-05,
|
|
"loss": 0.6955,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 1.7118881118881117,
|
|
"grad_norm": 0.08660646823989741,
|
|
"learning_rate": 1.839609067433495e-05,
|
|
"loss": 0.6768,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 1.7174825174825175,
|
|
"grad_norm": 0.07000885068796428,
|
|
"learning_rate": 1.7698853223063554e-05,
|
|
"loss": 0.6814,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 1.7230769230769232,
|
|
"grad_norm": 0.07227717547773495,
|
|
"learning_rate": 1.701425638525601e-05,
|
|
"loss": 0.6863,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 1.7286713286713287,
|
|
"grad_norm": 0.08472297754432541,
|
|
"learning_rate": 1.634236557020174e-05,
|
|
"loss": 0.6739,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 1.7342657342657342,
|
|
"grad_norm": 0.07635635084022904,
|
|
"learning_rate": 1.5683244973202848e-05,
|
|
"loss": 0.6849,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 1.7398601398601399,
|
|
"grad_norm": 0.0733376716964744,
|
|
"learning_rate": 1.5036957569440488e-05,
|
|
"loss": 0.6736,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 1.7454545454545456,
|
|
"grad_norm": 0.06984335423469197,
|
|
"learning_rate": 1.4403565107958142e-05,
|
|
"loss": 0.6801,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 1.751048951048951,
|
|
"grad_norm": 0.07299591046039727,
|
|
"learning_rate": 1.3783128105761649e-05,
|
|
"loss": 0.6703,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 1.7566433566433566,
|
|
"grad_norm": 0.06856518432971286,
|
|
"learning_rate": 1.3175705842037332e-05,
|
|
"loss": 0.6811,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 1.762237762237762,
|
|
"grad_norm": 0.06892528407637724,
|
|
"learning_rate": 1.2581356352488003e-05,
|
|
"loss": 0.6895,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 1.7678321678321678,
|
|
"grad_norm": 0.0731333956161155,
|
|
"learning_rate": 1.2000136423788226e-05,
|
|
"loss": 0.6947,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 1.7734265734265735,
|
|
"grad_norm": 0.07510563722528918,
|
|
"learning_rate": 1.1432101588158487e-05,
|
|
"loss": 0.6782,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 1.779020979020979,
|
|
"grad_norm": 0.07335149840877105,
|
|
"learning_rate": 1.0877306118059498e-05,
|
|
"loss": 0.6832,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 1.7846153846153845,
|
|
"grad_norm": 0.07016146300305476,
|
|
"learning_rate": 1.0335803021006783e-05,
|
|
"loss": 0.6735,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 1.7902097902097902,
|
|
"grad_norm": 0.07110506905460164,
|
|
"learning_rate": 9.807644034506024e-06,
|
|
"loss": 0.6933,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 1.795804195804196,
|
|
"grad_norm": 0.07366692101954259,
|
|
"learning_rate": 9.292879621110022e-06,
|
|
"loss": 0.6775,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 1.8013986013986014,
|
|
"grad_norm": 0.07086756664676215,
|
|
"learning_rate": 8.791558963597045e-06,
|
|
"loss": 0.6847,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 1.806993006993007,
|
|
"grad_norm": 0.0704179943193231,
|
|
"learning_rate": 8.30372996027195e-06,
|
|
"loss": 0.6802,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 1.8125874125874126,
|
|
"grad_norm": 0.07178710401033903,
|
|
"learning_rate": 7.829439220389521e-06,
|
|
"loss": 0.6892,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 1.8181818181818183,
|
|
"grad_norm": 0.07500788961567526,
|
|
"learning_rate": 7.368732059701499e-06,
|
|
"loss": 0.6822,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 1.8237762237762238,
|
|
"grad_norm": 0.07030773912011665,
|
|
"learning_rate": 6.921652496126623e-06,
|
|
"loss": 0.6749,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 1.8293706293706293,
|
|
"grad_norm": 0.07046870723595076,
|
|
"learning_rate": 6.4882432455452606e-06,
|
|
"loss": 0.6748,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 1.8349650349650348,
|
|
"grad_norm": 0.07117585543308619,
|
|
"learning_rate": 6.068545717717916e-06,
|
|
"loss": 0.6828,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 1.8405594405594405,
|
|
"grad_norm": 0.06915527262243856,
|
|
"learning_rate": 5.662600012328944e-06,
|
|
"loss": 0.6883,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 1.8461538461538463,
|
|
"grad_norm": 0.06677186359807996,
|
|
"learning_rate": 5.27044491515512e-06,
|
|
"loss": 0.6701,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 1.8517482517482518,
|
|
"grad_norm": 0.06975135811394965,
|
|
"learning_rate": 4.892117894359981e-06,
|
|
"loss": 0.6896,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 1.8573426573426572,
|
|
"grad_norm": 0.06646442932877118,
|
|
"learning_rate": 4.527655096913913e-06,
|
|
"loss": 0.6736,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 1.862937062937063,
|
|
"grad_norm": 0.0696338374277028,
|
|
"learning_rate": 4.177091345140488e-06,
|
|
"loss": 0.6824,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 1.8685314685314687,
|
|
"grad_norm": 0.06913237278308428,
|
|
"learning_rate": 3.840460133389434e-06,
|
|
"loss": 0.6708,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 1.8741258741258742,
|
|
"grad_norm": 0.07472788742872244,
|
|
"learning_rate": 3.5177936248364236e-06,
|
|
"loss": 0.6843,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 1.8797202797202797,
|
|
"grad_norm": 0.07149423452101296,
|
|
"learning_rate": 3.2091226484101506e-06,
|
|
"loss": 0.6716,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 1.8853146853146852,
|
|
"grad_norm": 0.0676192443302248,
|
|
"learning_rate": 2.9144766958466014e-06,
|
|
"loss": 0.6816,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 1.8909090909090909,
|
|
"grad_norm": 0.07125330690039695,
|
|
"learning_rate": 2.6338839188715433e-06,
|
|
"loss": 0.686,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 1.8965034965034966,
|
|
"grad_norm": 0.07150788853257964,
|
|
"learning_rate": 2.3673711265105754e-06,
|
|
"loss": 0.6845,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 1.902097902097902,
|
|
"grad_norm": 0.06921602353027008,
|
|
"learning_rate": 2.1149637825277953e-06,
|
|
"loss": 0.6851,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 1.9076923076923076,
|
|
"grad_norm": 0.07103009932661779,
|
|
"learning_rate": 1.876686002992861e-06,
|
|
"loss": 0.6879,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 1.9132867132867133,
|
|
"grad_norm": 0.07078839389394587,
|
|
"learning_rate": 1.6525605539768173e-06,
|
|
"loss": 0.6842,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 1.918881118881119,
|
|
"grad_norm": 0.0677463790575426,
|
|
"learning_rate": 1.4426088493769695e-06,
|
|
"loss": 0.6822,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 1.9244755244755245,
|
|
"grad_norm": 0.0689426207416468,
|
|
"learning_rate": 1.2468509488708534e-06,
|
|
"loss": 0.671,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 1.93006993006993,
|
|
"grad_norm": 0.06820108494343277,
|
|
"learning_rate": 1.0653055559997014e-06,
|
|
"loss": 0.6775,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 1.9356643356643357,
|
|
"grad_norm": 0.06821597918112166,
|
|
"learning_rate": 8.979900163813891e-07,
|
|
"loss": 0.6701,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 1.9412587412587412,
|
|
"grad_norm": 0.06837657131866866,
|
|
"learning_rate": 7.449203160532102e-07,
|
|
"loss": 0.6725,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 1.946853146853147,
|
|
"grad_norm": 0.07000431064997485,
|
|
"learning_rate": 6.061110799443991e-07,
|
|
"loss": 0.6709,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 1.9524475524475524,
|
|
"grad_norm": 0.07108137280182684,
|
|
"learning_rate": 4.815755704789481e-07,
|
|
"loss": 0.6846,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 1.958041958041958,
|
|
"grad_norm": 0.07117415645327746,
|
|
"learning_rate": 3.7132568630833804e-07,
|
|
"loss": 0.6865,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 1.9636363636363636,
|
|
"grad_norm": 0.06606487210968705,
|
|
"learning_rate": 2.753719611747474e-07,
|
|
"loss": 0.6736,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 1.9692307692307693,
|
|
"grad_norm": 0.06965546022369541,
|
|
"learning_rate": 1.9372356290460744e-07,
|
|
"loss": 0.7023,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 1.9748251748251748,
|
|
"grad_norm": 0.06925022116566176,
|
|
"learning_rate": 1.2638829253265316e-07,
|
|
"loss": 0.6665,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 1.9804195804195803,
|
|
"grad_norm": 0.06743940727744605,
|
|
"learning_rate": 7.337258355660236e-08,
|
|
"loss": 0.6704,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 1.986013986013986,
|
|
"grad_norm": 0.06697392386506328,
|
|
"learning_rate": 3.4681501322464386e-08,
|
|
"loss": 0.6703,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 1.9916083916083918,
|
|
"grad_norm": 0.06864360169538268,
|
|
"learning_rate": 1.0318742540560421e-08,
|
|
"loss": 0.6753,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 1.9972027972027973,
|
|
"grad_norm": 0.06821105929452137,
|
|
"learning_rate": 2.8663493232272684e-10,
|
|
"loss": 0.6705,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 1.9983216783216782,
|
|
"eval_loss": 1.0629972219467163,
|
|
"eval_runtime": 366.656,
|
|
"eval_samples_per_second": 56.153,
|
|
"eval_steps_per_second": 1.756,
|
|
"step": 1786
|
|
},
|
|
{
|
|
"epoch": 1.9983216783216782,
|
|
"step": 1786,
|
|
"total_flos": 1127451463778304.0,
|
|
"train_loss": 0.777597175032935,
|
|
"train_runtime": 16051.5455,
|
|
"train_samples_per_second": 14.253,
|
|
"train_steps_per_second": 0.111
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 1786,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 2,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": false,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1127451463778304.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|