1391 lines
34 KiB
JSON
1391 lines
34 KiB
JSON
|
|
{
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.9994655264564404,
|
||
|
|
"eval_steps": 200,
|
||
|
|
"global_step": 935,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0010689470871191875,
|
||
|
|
"grad_norm": 4.714404593717477e+18,
|
||
|
|
"learning_rate": 2.1276595744680852e-07,
|
||
|
|
"loss": 1.5727,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005344735435595938,
|
||
|
|
"grad_norm": 3523.978271484375,
|
||
|
|
"learning_rate": 1.0638297872340427e-06,
|
||
|
|
"loss": 1.5802,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010689470871191877,
|
||
|
|
"grad_norm": 36.699684143066406,
|
||
|
|
"learning_rate": 2.1276595744680853e-06,
|
||
|
|
"loss": 1.539,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.016034206306787813,
|
||
|
|
"grad_norm": 6.539373874664307,
|
||
|
|
"learning_rate": 3.191489361702128e-06,
|
||
|
|
"loss": 1.4339,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.021378941742383754,
|
||
|
|
"grad_norm": 2.321072816848755,
|
||
|
|
"learning_rate": 4.255319148936171e-06,
|
||
|
|
"loss": 1.3441,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02672367717797969,
|
||
|
|
"grad_norm": 2.6276676654815674,
|
||
|
|
"learning_rate": 5.319148936170213e-06,
|
||
|
|
"loss": 1.2829,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.032068412613575625,
|
||
|
|
"grad_norm": 1.8791221380233765,
|
||
|
|
"learning_rate": 6.382978723404256e-06,
|
||
|
|
"loss": 1.2579,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03741314804917156,
|
||
|
|
"grad_norm": 1.898274540901184,
|
||
|
|
"learning_rate": 7.446808510638298e-06,
|
||
|
|
"loss": 1.2338,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04275788348476751,
|
||
|
|
"grad_norm": 1.6828669309616089,
|
||
|
|
"learning_rate": 8.510638297872341e-06,
|
||
|
|
"loss": 1.2116,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.048102618920363445,
|
||
|
|
"grad_norm": 1.6606314182281494,
|
||
|
|
"learning_rate": 9.574468085106385e-06,
|
||
|
|
"loss": 1.1804,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05344735435595938,
|
||
|
|
"grad_norm": 1.5807420015335083,
|
||
|
|
"learning_rate": 1.0638297872340426e-05,
|
||
|
|
"loss": 1.191,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05879208979155532,
|
||
|
|
"grad_norm": 1.5150554180145264,
|
||
|
|
"learning_rate": 1.170212765957447e-05,
|
||
|
|
"loss": 1.1707,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06413682522715125,
|
||
|
|
"grad_norm": 1.198437213897705,
|
||
|
|
"learning_rate": 1.2765957446808513e-05,
|
||
|
|
"loss": 1.1583,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06948156066274719,
|
||
|
|
"grad_norm": 1.2076998949050903,
|
||
|
|
"learning_rate": 1.3829787234042556e-05,
|
||
|
|
"loss": 1.1604,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07482629609834313,
|
||
|
|
"grad_norm": 1.4316256046295166,
|
||
|
|
"learning_rate": 1.4893617021276596e-05,
|
||
|
|
"loss": 1.1664,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08017103153393906,
|
||
|
|
"grad_norm": 1.2154533863067627,
|
||
|
|
"learning_rate": 1.595744680851064e-05,
|
||
|
|
"loss": 1.1603,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08551576696953501,
|
||
|
|
"grad_norm": 1.6808208227157593,
|
||
|
|
"learning_rate": 1.7021276595744682e-05,
|
||
|
|
"loss": 1.1415,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09086050240513095,
|
||
|
|
"grad_norm": 1.1716338396072388,
|
||
|
|
"learning_rate": 1.8085106382978724e-05,
|
||
|
|
"loss": 1.1511,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09620523784072689,
|
||
|
|
"grad_norm": 1.4733761548995972,
|
||
|
|
"learning_rate": 1.914893617021277e-05,
|
||
|
|
"loss": 1.1462,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10154997327632283,
|
||
|
|
"grad_norm": 1.0243571996688843,
|
||
|
|
"learning_rate": 1.9999930228629612e-05,
|
||
|
|
"loss": 1.1484,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10689470871191876,
|
||
|
|
"grad_norm": 1.1259580850601196,
|
||
|
|
"learning_rate": 1.999748833289337e-05,
|
||
|
|
"loss": 1.1425,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1122394441475147,
|
||
|
|
"grad_norm": 1.4838624000549316,
|
||
|
|
"learning_rate": 1.999155884218539e-05,
|
||
|
|
"loss": 1.161,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11758417958311064,
|
||
|
|
"grad_norm": 1.2224469184875488,
|
||
|
|
"learning_rate": 1.9982143824991402e-05,
|
||
|
|
"loss": 1.1318,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12292891501870658,
|
||
|
|
"grad_norm": 0.9379732608795166,
|
||
|
|
"learning_rate": 1.9969246565713005e-05,
|
||
|
|
"loss": 1.1533,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1282736504543025,
|
||
|
|
"grad_norm": 1.209952473640442,
|
||
|
|
"learning_rate": 1.99528715635219e-05,
|
||
|
|
"loss": 1.1494,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13361838588989844,
|
||
|
|
"grad_norm": 1.0505174398422241,
|
||
|
|
"learning_rate": 1.9933024530790377e-05,
|
||
|
|
"loss": 1.142,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13896312132549438,
|
||
|
|
"grad_norm": 1.0835527181625366,
|
||
|
|
"learning_rate": 1.990971239109856e-05,
|
||
|
|
"loss": 1.1377,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14430785676109031,
|
||
|
|
"grad_norm": 1.0426019430160522,
|
||
|
|
"learning_rate": 1.9882943276819153e-05,
|
||
|
|
"loss": 1.1378,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14965259219668625,
|
||
|
|
"grad_norm": 1.105392336845398,
|
||
|
|
"learning_rate": 1.9852726526280467e-05,
|
||
|
|
"loss": 1.1298,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1549973276322822,
|
||
|
|
"grad_norm": 1.0859570503234863,
|
||
|
|
"learning_rate": 1.981907268050878e-05,
|
||
|
|
"loss": 1.144,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16034206306787813,
|
||
|
|
"grad_norm": 1.0400607585906982,
|
||
|
|
"learning_rate": 1.9781993479551124e-05,
|
||
|
|
"loss": 1.1431,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16568679850347406,
|
||
|
|
"grad_norm": 0.8823645710945129,
|
||
|
|
"learning_rate": 1.9741501858379828e-05,
|
||
|
|
"loss": 1.1449,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17103153393907003,
|
||
|
|
"grad_norm": 0.9304960370063782,
|
||
|
|
"learning_rate": 1.969761194238015e-05,
|
||
|
|
"loss": 1.1289,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17637626937466597,
|
||
|
|
"grad_norm": 0.9176273345947266,
|
||
|
|
"learning_rate": 1.9650339042422707e-05,
|
||
|
|
"loss": 1.1303,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1817210048102619,
|
||
|
|
"grad_norm": 0.8516846299171448,
|
||
|
|
"learning_rate": 1.9599699649522318e-05,
|
||
|
|
"loss": 1.1169,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18706574024585784,
|
||
|
|
"grad_norm": 0.9081844687461853,
|
||
|
|
"learning_rate": 1.9545711429085138e-05,
|
||
|
|
"loss": 1.1263,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19241047568145378,
|
||
|
|
"grad_norm": 0.8308460712432861,
|
||
|
|
"learning_rate": 1.948839321474617e-05,
|
||
|
|
"loss": 1.1327,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19775521111704972,
|
||
|
|
"grad_norm": 0.7651630640029907,
|
||
|
|
"learning_rate": 1.942776500179918e-05,
|
||
|
|
"loss": 1.1364,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20309994655264565,
|
||
|
|
"grad_norm": 0.8433904647827148,
|
||
|
|
"learning_rate": 1.9363847940221396e-05,
|
||
|
|
"loss": 1.1305,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2084446819882416,
|
||
|
|
"grad_norm": 0.7462895512580872,
|
||
|
|
"learning_rate": 1.929666432729541e-05,
|
||
|
|
"loss": 1.1378,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21378941742383753,
|
||
|
|
"grad_norm": 0.7528314590454102,
|
||
|
|
"learning_rate": 1.9226237599830834e-05,
|
||
|
|
"loss": 1.1457,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21378941742383753,
|
||
|
|
"eval_loss": 1.1342483758926392,
|
||
|
|
"eval_runtime": 298.4179,
|
||
|
|
"eval_samples_per_second": 44.387,
|
||
|
|
"eval_steps_per_second": 5.549,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21913415285943347,
|
||
|
|
"grad_norm": 0.8370968103408813,
|
||
|
|
"learning_rate": 1.9152592325988428e-05,
|
||
|
|
"loss": 1.1102,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2244788882950294,
|
||
|
|
"grad_norm": 0.8102027177810669,
|
||
|
|
"learning_rate": 1.9075754196709574e-05,
|
||
|
|
"loss": 1.1246,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22982362373062534,
|
||
|
|
"grad_norm": 0.7332049012184143,
|
||
|
|
"learning_rate": 1.8995750016754066e-05,
|
||
|
|
"loss": 1.1459,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23516835916622128,
|
||
|
|
"grad_norm": 0.8379830718040466,
|
||
|
|
"learning_rate": 1.8912607695349348e-05,
|
||
|
|
"loss": 1.1129,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24051309460181722,
|
||
|
|
"grad_norm": 0.8001225590705872,
|
||
|
|
"learning_rate": 1.882635623645446e-05,
|
||
|
|
"loss": 1.1253,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24585783003741316,
|
||
|
|
"grad_norm": 0.8222916722297668,
|
||
|
|
"learning_rate": 1.873702572864208e-05,
|
||
|
|
"loss": 1.1273,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25120256547300907,
|
||
|
|
"grad_norm": 0.7916209697723389,
|
||
|
|
"learning_rate": 1.8644647334602225e-05,
|
||
|
|
"loss": 1.1032,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.256547300908605,
|
||
|
|
"grad_norm": 0.8102867603302002,
|
||
|
|
"learning_rate": 1.8549253280271232e-05,
|
||
|
|
"loss": 1.1098,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26189203634420094,
|
||
|
|
"grad_norm": 0.7470313906669617,
|
||
|
|
"learning_rate": 1.8450876843589837e-05,
|
||
|
|
"loss": 1.1207,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2672367717797969,
|
||
|
|
"grad_norm": 0.8269715309143066,
|
||
|
|
"learning_rate": 1.834955234289425e-05,
|
||
|
|
"loss": 1.1254,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2725815072153928,
|
||
|
|
"grad_norm": 0.8336250185966492,
|
||
|
|
"learning_rate": 1.824531512494432e-05,
|
||
|
|
"loss": 1.1306,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27792624265098875,
|
||
|
|
"grad_norm": 0.7656748294830322,
|
||
|
|
"learning_rate": 1.81382015525929e-05,
|
||
|
|
"loss": 1.1106,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2832709780865847,
|
||
|
|
"grad_norm": 0.7885479927062988,
|
||
|
|
"learning_rate": 1.8028248992100783e-05,
|
||
|
|
"loss": 1.1133,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28861571352218063,
|
||
|
|
"grad_norm": 0.8161848783493042,
|
||
|
|
"learning_rate": 1.7915495800101594e-05,
|
||
|
|
"loss": 1.114,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29396044895777657,
|
||
|
|
"grad_norm": 0.7918674945831299,
|
||
|
|
"learning_rate": 1.7799981310221172e-05,
|
||
|
|
"loss": 1.1255,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2993051843933725,
|
||
|
|
"grad_norm": 0.7479060292243958,
|
||
|
|
"learning_rate": 1.7681745819356163e-05,
|
||
|
|
"loss": 1.111,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30464991982896844,
|
||
|
|
"grad_norm": 0.7473365664482117,
|
||
|
|
"learning_rate": 1.756083057361657e-05,
|
||
|
|
"loss": 1.1083,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3099946552645644,
|
||
|
|
"grad_norm": 0.7499710917472839,
|
||
|
|
"learning_rate": 1.743727775393713e-05,
|
||
|
|
"loss": 1.1198,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3153393907001603,
|
||
|
|
"grad_norm": 0.7550699710845947,
|
||
|
|
"learning_rate": 1.7311130461362658e-05,
|
||
|
|
"loss": 1.103,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32068412613575625,
|
||
|
|
"grad_norm": 0.7409452795982361,
|
||
|
|
"learning_rate": 1.7182432702012363e-05,
|
||
|
|
"loss": 1.111,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3260288615713522,
|
||
|
|
"grad_norm": 0.7733418941497803,
|
||
|
|
"learning_rate": 1.7051229371728418e-05,
|
||
|
|
"loss": 1.128,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33137359700694813,
|
||
|
|
"grad_norm": 0.7811718583106995,
|
||
|
|
"learning_rate": 1.6917566240414197e-05,
|
||
|
|
"loss": 1.1172,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3367183324425441,
|
||
|
|
"grad_norm": 0.7376381158828735,
|
||
|
|
"learning_rate": 1.678148993606757e-05,
|
||
|
|
"loss": 1.11,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34206306787814006,
|
||
|
|
"grad_norm": 0.8394958972930908,
|
||
|
|
"learning_rate": 1.6643047928514862e-05,
|
||
|
|
"loss": 1.1133,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.347407803313736,
|
||
|
|
"grad_norm": 0.7389253377914429,
|
||
|
|
"learning_rate": 1.6502288512851124e-05,
|
||
|
|
"loss": 1.1056,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35275253874933193,
|
||
|
|
"grad_norm": 0.811557948589325,
|
||
|
|
"learning_rate": 1.635926079259257e-05,
|
||
|
|
"loss": 1.1121,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35809727418492787,
|
||
|
|
"grad_norm": 0.718371570110321,
|
||
|
|
"learning_rate": 1.6214014662546897e-05,
|
||
|
|
"loss": 1.1188,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3634420096205238,
|
||
|
|
"grad_norm": 0.8406382203102112,
|
||
|
|
"learning_rate": 1.606660079140769e-05,
|
||
|
|
"loss": 1.094,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36878674505611975,
|
||
|
|
"grad_norm": 0.8330205678939819,
|
||
|
|
"learning_rate": 1.5917070604078736e-05,
|
||
|
|
"loss": 1.1225,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3741314804917157,
|
||
|
|
"grad_norm": 0.8457123637199402,
|
||
|
|
"learning_rate": 1.576547626373464e-05,
|
||
|
|
"loss": 1.1178,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3794762159273116,
|
||
|
|
"grad_norm": 0.7380478382110596,
|
||
|
|
"learning_rate": 1.5611870653623826e-05,
|
||
|
|
"loss": 1.0822,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38482095136290756,
|
||
|
|
"grad_norm": 0.8306733965873718,
|
||
|
|
"learning_rate": 1.5456307358620372e-05,
|
||
|
|
"loss": 1.1232,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3901656867985035,
|
||
|
|
"grad_norm": 1.030051589012146,
|
||
|
|
"learning_rate": 1.5298840646531093e-05,
|
||
|
|
"loss": 1.0938,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39551042223409943,
|
||
|
|
"grad_norm": 0.7108510732650757,
|
||
|
|
"learning_rate": 1.5139525449164358e-05,
|
||
|
|
"loss": 1.096,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40085515766969537,
|
||
|
|
"grad_norm": 0.6886569261550903,
|
||
|
|
"learning_rate": 1.49784173431673e-05,
|
||
|
|
"loss": 1.1025,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4061998931052913,
|
||
|
|
"grad_norm": 0.7875552177429199,
|
||
|
|
"learning_rate": 1.4815572530638046e-05,
|
||
|
|
"loss": 1.1171,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41154462854088725,
|
||
|
|
"grad_norm": 0.7455640435218811,
|
||
|
|
"learning_rate": 1.4651047819519804e-05,
|
||
|
|
"loss": 1.1102,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4168893639764832,
|
||
|
|
"grad_norm": 0.815104067325592,
|
||
|
|
"learning_rate": 1.4484900603783544e-05,
|
||
|
|
"loss": 1.1004,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4222340994120791,
|
||
|
|
"grad_norm": 0.7801194190979004,
|
||
|
|
"learning_rate": 1.4317188843406304e-05,
|
||
|
|
"loss": 1.1172,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42757883484767506,
|
||
|
|
"grad_norm": 0.6944659352302551,
|
||
|
|
"learning_rate": 1.4147971044152002e-05,
|
||
|
|
"loss": 1.0927,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42757883484767506,
|
||
|
|
"eval_loss": 1.1091774702072144,
|
||
|
|
"eval_runtime": 299.2382,
|
||
|
|
"eval_samples_per_second": 44.266,
|
||
|
|
"eval_steps_per_second": 5.534,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.432923570283271,
|
||
|
|
"grad_norm": 0.7244230508804321,
|
||
|
|
"learning_rate": 1.3977306237161877e-05,
|
||
|
|
"loss": 1.1015,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43826830571886694,
|
||
|
|
"grad_norm": 0.7366450428962708,
|
||
|
|
"learning_rate": 1.3805253958361641e-05,
|
||
|
|
"loss": 1.1064,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4436130411544629,
|
||
|
|
"grad_norm": 0.728798508644104,
|
||
|
|
"learning_rate": 1.3631874227692549e-05,
|
||
|
|
"loss": 1.0934,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4489577765900588,
|
||
|
|
"grad_norm": 0.6889216303825378,
|
||
|
|
"learning_rate": 1.3457227528173613e-05,
|
||
|
|
"loss": 1.0692,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45430251202565475,
|
||
|
|
"grad_norm": 0.8071165084838867,
|
||
|
|
"learning_rate": 1.3281374784802263e-05,
|
||
|
|
"loss": 1.1183,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4596472474612507,
|
||
|
|
"grad_norm": 0.7213979363441467,
|
||
|
|
"learning_rate": 1.3104377343300868e-05,
|
||
|
|
"loss": 1.0848,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4649919828968466,
|
||
|
|
"grad_norm": 0.7587602138519287,
|
||
|
|
"learning_rate": 1.292629694871642e-05,
|
||
|
|
"loss": 1.0936,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47033671833244256,
|
||
|
|
"grad_norm": 0.7268496155738831,
|
||
|
|
"learning_rate": 1.2747195723880976e-05,
|
||
|
|
"loss": 1.1016,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4756814537680385,
|
||
|
|
"grad_norm": 0.7437026500701904,
|
||
|
|
"learning_rate": 1.2567136147740294e-05,
|
||
|
|
"loss": 1.0934,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48102618920363444,
|
||
|
|
"grad_norm": 0.7416156530380249,
|
||
|
|
"learning_rate": 1.2386181033558205e-05,
|
||
|
|
"loss": 1.0981,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4863709246392304,
|
||
|
|
"grad_norm": 2.2510459423065186,
|
||
|
|
"learning_rate": 1.2204393507004404e-05,
|
||
|
|
"loss": 1.0839,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4917156600748263,
|
||
|
|
"grad_norm": 0.7769641280174255,
|
||
|
|
"learning_rate": 1.2021836984133255e-05,
|
||
|
|
"loss": 1.1056,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49706039551042225,
|
||
|
|
"grad_norm": 0.7865785956382751,
|
||
|
|
"learning_rate": 1.1838575149261256e-05,
|
||
|
|
"loss": 1.0846,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5024051309460181,
|
||
|
|
"grad_norm": 0.7650848031044006,
|
||
|
|
"learning_rate": 1.165467193275097e-05,
|
||
|
|
"loss": 1.0914,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5077498663816141,
|
||
|
|
"grad_norm": 0.7357913851737976,
|
||
|
|
"learning_rate": 1.1470191488709086e-05,
|
||
|
|
"loss": 1.0999,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.51309460181721,
|
||
|
|
"grad_norm": 0.7361023426055908,
|
||
|
|
"learning_rate": 1.1285198172606466e-05,
|
||
|
|
"loss": 1.0888,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.518439337252806,
|
||
|
|
"grad_norm": 0.6836273074150085,
|
||
|
|
"learning_rate": 1.1099756518827895e-05,
|
||
|
|
"loss": 1.0858,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5237840726884019,
|
||
|
|
"grad_norm": 0.7213249802589417,
|
||
|
|
"learning_rate": 1.0913931218159482e-05,
|
||
|
|
"loss": 1.0799,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5291288081239979,
|
||
|
|
"grad_norm": 0.6905862092971802,
|
||
|
|
"learning_rate": 1.072778709522143e-05,
|
||
|
|
"loss": 1.0845,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5344735435595938,
|
||
|
|
"grad_norm": 0.6546228528022766,
|
||
|
|
"learning_rate": 1.0541389085854177e-05,
|
||
|
|
"loss": 1.072,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5398182789951897,
|
||
|
|
"grad_norm": 0.7062543630599976,
|
||
|
|
"learning_rate": 1.0354802214465715e-05,
|
||
|
|
"loss": 1.0642,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5451630144307856,
|
||
|
|
"grad_norm": 0.6921901106834412,
|
||
|
|
"learning_rate": 1.0168091571348003e-05,
|
||
|
|
"loss": 1.0773,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5505077498663816,
|
||
|
|
"grad_norm": 0.696793258190155,
|
||
|
|
"learning_rate": 9.981322289970407e-06,
|
||
|
|
"loss": 1.085,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5558524853019775,
|
||
|
|
"grad_norm": 0.6655827164649963,
|
||
|
|
"learning_rate": 9.794559524258089e-06,
|
||
|
|
"loss": 1.1033,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5611972207375735,
|
||
|
|
"grad_norm": 0.7247300148010254,
|
||
|
|
"learning_rate": 9.607868425863235e-06,
|
||
|
|
"loss": 1.0884,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5665419561731694,
|
||
|
|
"grad_norm": 0.704136312007904,
|
||
|
|
"learning_rate": 9.421314121437093e-06,
|
||
|
|
"loss": 1.0921,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5718866916087654,
|
||
|
|
"grad_norm": 0.6913681626319885,
|
||
|
|
"learning_rate": 9.234961689910735e-06,
|
||
|
|
"loss": 1.092,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5772314270443613,
|
||
|
|
"grad_norm": 0.7085949182510376,
|
||
|
|
"learning_rate": 9.04887613979244e-06,
|
||
|
|
"loss": 1.0779,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5825761624799572,
|
||
|
|
"grad_norm": 0.6966550350189209,
|
||
|
|
"learning_rate": 8.863122386489704e-06,
|
||
|
|
"loss": 1.0858,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5879208979155531,
|
||
|
|
"grad_norm": 0.6893490552902222,
|
||
|
|
"learning_rate": 8.677765229663634e-06,
|
||
|
|
"loss": 1.074,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5932656333511491,
|
||
|
|
"grad_norm": 0.6713391542434692,
|
||
|
|
"learning_rate": 8.492869330623813e-06,
|
||
|
|
"loss": 1.0883,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.598610368786745,
|
||
|
|
"grad_norm": 0.6851401329040527,
|
||
|
|
"learning_rate": 8.308499189771375e-06,
|
||
|
|
"loss": 1.0786,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.603955104222341,
|
||
|
|
"grad_norm": 0.6715465188026428,
|
||
|
|
"learning_rate": 8.124719124098218e-06,
|
||
|
|
"loss": 1.0586,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6092998396579369,
|
||
|
|
"grad_norm": 0.7100517153739929,
|
||
|
|
"learning_rate": 7.941593244750232e-06,
|
||
|
|
"loss": 1.0852,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6146445750935329,
|
||
|
|
"grad_norm": 0.6566494703292847,
|
||
|
|
"learning_rate": 7.759185434662281e-06,
|
||
|
|
"loss": 1.0728,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6199893105291288,
|
||
|
|
"grad_norm": 0.6481814980506897,
|
||
|
|
"learning_rate": 7.57755932627284e-06,
|
||
|
|
"loss": 1.0824,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6253340459647247,
|
||
|
|
"grad_norm": 0.6776189208030701,
|
||
|
|
"learning_rate": 7.396778279326006e-06,
|
||
|
|
"loss": 1.083,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6306787814003206,
|
||
|
|
"grad_norm": 0.666384756565094,
|
||
|
|
"learning_rate": 7.216905358768622e-06,
|
||
|
|
"loss": 1.0879,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6360235168359166,
|
||
|
|
"grad_norm": 0.6301067471504211,
|
||
|
|
"learning_rate": 7.038003312750263e-06,
|
||
|
|
"loss": 1.0773,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6413682522715125,
|
||
|
|
"grad_norm": 0.664985716342926,
|
||
|
|
"learning_rate": 6.860134550733727e-06,
|
||
|
|
"loss": 1.0811,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6413682522715125,
|
||
|
|
"eval_loss": 1.0846972465515137,
|
||
|
|
"eval_runtime": 306.2721,
|
||
|
|
"eval_samples_per_second": 43.249,
|
||
|
|
"eval_steps_per_second": 5.407,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6467129877071085,
|
||
|
|
"grad_norm": 0.6260780096054077,
|
||
|
|
"learning_rate": 6.68336112172366e-06,
|
||
|
|
"loss": 1.0902,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6520577231427044,
|
||
|
|
"grad_norm": 0.6706629395484924,
|
||
|
|
"learning_rate": 6.5077446926209475e-06,
|
||
|
|
"loss": 1.0781,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6574024585783004,
|
||
|
|
"grad_norm": 0.6518301963806152,
|
||
|
|
"learning_rate": 6.333346526710398e-06,
|
||
|
|
"loss": 1.0538,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6627471940138963,
|
||
|
|
"grad_norm": 0.6535404920578003,
|
||
|
|
"learning_rate": 6.1602274622892175e-06,
|
||
|
|
"loss": 1.0582,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6680919294494923,
|
||
|
|
"grad_norm": 0.681788980960846,
|
||
|
|
"learning_rate": 5.988447891443744e-06,
|
||
|
|
"loss": 1.0796,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6734366648850882,
|
||
|
|
"grad_norm": 0.6322731971740723,
|
||
|
|
"learning_rate": 5.818067738981851e-06,
|
||
|
|
"loss": 1.0557,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6787814003206841,
|
||
|
|
"grad_norm": 0.6635384559631348,
|
||
|
|
"learning_rate": 5.649146441528341e-06,
|
||
|
|
"loss": 1.0889,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6841261357562801,
|
||
|
|
"grad_norm": 0.6500101089477539,
|
||
|
|
"learning_rate": 5.48174292679065e-06,
|
||
|
|
"loss": 1.0699,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.689470871191876,
|
||
|
|
"grad_norm": 0.6220327615737915,
|
||
|
|
"learning_rate": 5.3159155930021e-06,
|
||
|
|
"loss": 1.0885,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.694815606627472,
|
||
|
|
"grad_norm": 0.6437745094299316,
|
||
|
|
"learning_rate": 5.151722288549828e-06,
|
||
|
|
"loss": 1.0731,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7001603420630679,
|
||
|
|
"grad_norm": 0.607314944267273,
|
||
|
|
"learning_rate": 4.989220291794549e-06,
|
||
|
|
"loss": 1.0514,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7055050774986639,
|
||
|
|
"grad_norm": 0.6423488259315491,
|
||
|
|
"learning_rate": 4.82846629108917e-06,
|
||
|
|
"loss": 1.0769,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7108498129342598,
|
||
|
|
"grad_norm": 0.6608098149299622,
|
||
|
|
"learning_rate": 4.66951636500322e-06,
|
||
|
|
"loss": 1.077,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7161945483698557,
|
||
|
|
"grad_norm": 0.6718878149986267,
|
||
|
|
"learning_rate": 4.512425962759992e-06,
|
||
|
|
"loss": 1.0588,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7215392838054516,
|
||
|
|
"grad_norm": 0.6484812498092651,
|
||
|
|
"learning_rate": 4.357249884893252e-06,
|
||
|
|
"loss": 1.0599,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7268840192410476,
|
||
|
|
"grad_norm": 0.6472474336624146,
|
||
|
|
"learning_rate": 4.204042264130227e-06,
|
||
|
|
"loss": 1.0646,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7322287546766435,
|
||
|
|
"grad_norm": 0.6388882398605347,
|
||
|
|
"learning_rate": 4.052856546507565e-06,
|
||
|
|
"loss": 1.0855,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7375734901122395,
|
||
|
|
"grad_norm": 0.6360565423965454,
|
||
|
|
"learning_rate": 3.9037454727268375e-06,
|
||
|
|
"loss": 1.0778,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7429182255478354,
|
||
|
|
"grad_norm": 0.6330989003181458,
|
||
|
|
"learning_rate": 3.7567610597560854e-06,
|
||
|
|
"loss": 1.0813,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7482629609834314,
|
||
|
|
"grad_norm": 0.6103405952453613,
|
||
|
|
"learning_rate": 3.611954582683861e-06,
|
||
|
|
"loss": 1.0548,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7536076964190273,
|
||
|
|
"grad_norm": 0.6149457097053528,
|
||
|
|
"learning_rate": 3.469376556832069e-06,
|
||
|
|
"loss": 1.0617,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7589524318546232,
|
||
|
|
"grad_norm": 0.6596876382827759,
|
||
|
|
"learning_rate": 3.3290767201338247e-06,
|
||
|
|
"loss": 1.0632,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7642971672902191,
|
||
|
|
"grad_norm": 0.6746628284454346,
|
||
|
|
"learning_rate": 3.1911040157825256e-06,
|
||
|
|
"loss": 1.0642,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7696419027258151,
|
||
|
|
"grad_norm": 0.6416762471199036,
|
||
|
|
"learning_rate": 3.055506575158168e-06,
|
||
|
|
"loss": 1.0641,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.774986638161411,
|
||
|
|
"grad_norm": 0.6470797657966614,
|
||
|
|
      "learning_rate": 2.922331701036848e-06,
      "loss": 1.0592,
      "step": 725
    },
    {
      "epoch": 0.780331373597007,
      "grad_norm": 0.6156336069107056,
      "learning_rate": 2.791625851089317e-06,
      "loss": 1.0917,
      "step": 730
    },
    {
      "epoch": 0.7856761090326029,
      "grad_norm": 0.6239079833030701,
      "learning_rate": 2.663434621674367e-06,
      "loss": 1.071,
      "step": 735
    },
    {
      "epoch": 0.7910208444681989,
      "grad_norm": 0.6546154618263245,
      "learning_rate": 2.537802731932674e-06,
      "loss": 1.0616,
      "step": 740
    },
    {
      "epoch": 0.7963655799037948,
      "grad_norm": 0.6523792147636414,
      "learning_rate": 2.4147740081866423e-06,
      "loss": 1.0687,
      "step": 745
    },
    {
      "epoch": 0.8017103153393907,
      "grad_norm": 0.6275489330291748,
      "learning_rate": 2.294391368651735e-06,
      "loss": 1.0502,
      "step": 750
    },
    {
      "epoch": 0.8070550507749866,
      "grad_norm": 0.6088879704475403,
      "learning_rate": 2.176696808464559e-06,
      "loss": 1.0632,
      "step": 755
    },
    {
      "epoch": 0.8123997862105826,
      "grad_norm": 0.6577759981155396,
      "learning_rate": 2.0617313850330067e-06,
      "loss": 1.0601,
      "step": 760
    },
    {
      "epoch": 0.8177445216461785,
      "grad_norm": 0.6168049573898315,
      "learning_rate": 1.949535203713474e-06,
      "loss": 1.0583,
      "step": 765
    },
    {
      "epoch": 0.8230892570817745,
      "grad_norm": 0.6288221478462219,
      "learning_rate": 1.8401474038202338e-06,
      "loss": 1.061,
      "step": 770
    },
    {
      "epoch": 0.8284339925173704,
      "grad_norm": 0.6249226927757263,
      "learning_rate": 1.7336061449717967e-06,
      "loss": 1.0555,
      "step": 775
    },
    {
      "epoch": 0.8337787279529664,
      "grad_norm": 0.6102626323699951,
      "learning_rate": 1.6299485937790505e-06,
      "loss": 1.0724,
      "step": 780
    },
    {
      "epoch": 0.8391234633885623,
      "grad_norm": 0.623283326625824,
      "learning_rate": 1.5292109108797726e-06,
      "loss": 1.0696,
      "step": 785
    },
    {
      "epoch": 0.8444681988241582,
      "grad_norm": 0.6192260384559631,
      "learning_rate": 1.4314282383241097e-06,
      "loss": 1.0768,
      "step": 790
    },
    {
      "epoch": 0.8498129342597541,
      "grad_norm": 0.6554280519485474,
      "learning_rate": 1.3366346873153703e-06,
      "loss": 1.0525,
      "step": 795
    },
    {
      "epoch": 0.8551576696953501,
      "grad_norm": 0.5920155048370361,
      "learning_rate": 1.2448633263104415e-06,
      "loss": 1.0531,
      "step": 800
    },
    {
      "epoch": 0.8551576696953501,
      "eval_loss": 1.0703972578048706,
      "eval_runtime": 298.1755,
      "eval_samples_per_second": 44.424,
      "eval_steps_per_second": 5.554,
      "step": 800
    },
    {
      "epoch": 0.860502405130946,
      "grad_norm": 0.6562113761901855,
      "learning_rate": 1.1561461694839304e-06,
      "loss": 1.0479,
      "step": 805
    },
    {
      "epoch": 0.865847140566542,
      "grad_norm": 0.5837886333465576,
      "learning_rate": 1.070514165560138e-06,
      "loss": 1.045,
      "step": 810
    },
    {
      "epoch": 0.8711918760021379,
      "grad_norm": 0.6308085918426514,
      "learning_rate": 9.879971870166628e-07,
      "loss": 1.0544,
      "step": 815
    },
    {
      "epoch": 0.8765366114377339,
      "grad_norm": 0.6370830535888672,
      "learning_rate": 9.086240196634899e-07,
      "loss": 1.0663,
      "step": 820
    },
    {
      "epoch": 0.8818813468733298,
      "grad_norm": 0.6129716634750366,
      "learning_rate": 8.324223526011321e-07,
      "loss": 1.059,
      "step": 825
    },
    {
      "epoch": 0.8872260823089257,
      "grad_norm": 0.615529477596283,
      "learning_rate": 7.594187685613763e-07,
      "loss": 1.06,
      "step": 830
    },
    {
      "epoch": 0.8925708177445216,
      "grad_norm": 0.6301218867301941,
      "learning_rate": 6.896387346339683e-07,
      "loss": 1.0409,
      "step": 835
    },
    {
      "epoch": 0.8979155531801176,
      "grad_norm": 0.6093020439147949,
      "learning_rate": 6.231065933824975e-07,
      "loss": 1.0489,
      "step": 840
    },
    {
      "epoch": 0.9032602886157135,
      "grad_norm": 0.6007770299911499,
      "learning_rate": 5.598455543525571e-07,
      "loss": 1.0489,
      "step": 845
    },
    {
      "epoch": 0.9086050240513095,
      "grad_norm": 0.6080732941627502,
      "learning_rate": 4.998776859751619e-07,
      "loss": 1.067,
      "step": 850
    },
    {
      "epoch": 0.9139497594869054,
      "grad_norm": 0.6055812835693359,
      "learning_rate": 4.4322390786824986e-07,
      "loss": 1.0577,
      "step": 855
    },
    {
      "epoch": 0.9192944949225014,
      "grad_norm": 0.608756959438324,
      "learning_rate": 3.8990398353891954e-07,
      "loss": 1.0603,
      "step": 860
    },
    {
      "epoch": 0.9246392303580973,
      "grad_norm": 0.5943930149078369,
      "learning_rate": 3.3993651348899537e-07,
      "loss": 1.0735,
      "step": 865
    },
    {
      "epoch": 0.9299839657936932,
      "grad_norm": 0.6033700108528137,
      "learning_rate": 2.9333892872629664e-07,
      "loss": 1.0772,
      "step": 870
    },
    {
      "epoch": 0.9353287012292891,
      "grad_norm": 0.6112857460975647,
      "learning_rate": 2.501274846838797e-07,
      "loss": 1.0584,
      "step": 875
    },
    {
      "epoch": 0.9406734366648851,
      "grad_norm": 0.6191092729568481,
      "learning_rate": 2.1031725554937378e-07,
      "loss": 1.0667,
      "step": 880
    },
    {
      "epoch": 0.946018172100481,
      "grad_norm": 0.5986559391021729,
      "learning_rate": 1.739221290063986e-07,
      "loss": 1.0548,
      "step": 885
    },
    {
      "epoch": 0.951362907536077,
      "grad_norm": 0.6083486676216125,
      "learning_rate": 1.4095480138988204e-07,
      "loss": 1.0605,
      "step": 890
    },
    {
      "epoch": 0.9567076429716729,
      "grad_norm": 0.6409095525741577,
      "learning_rate": 1.1142677325698514e-07,
      "loss": 1.0561,
      "step": 895
    },
    {
      "epoch": 0.9620523784072689,
      "grad_norm": 0.6235837340354919,
      "learning_rate": 8.534834537516246e-08,
      "loss": 1.0667,
      "step": 900
    },
    {
      "epoch": 0.9673971138428648,
      "grad_norm": 0.606450617313385,
      "learning_rate": 6.272861512876871e-08,
      "loss": 1.0558,
      "step": 905
    },
    {
      "epoch": 0.9727418492784607,
      "grad_norm": 0.6099078059196472,
      "learning_rate": 4.357547334546408e-08,
      "loss": 1.0739,
      "step": 910
    },
    {
      "epoch": 0.9780865847140566,
      "grad_norm": 0.6281602382659912,
      "learning_rate": 2.7895601543520557e-08,
      "loss": 1.0675,
      "step": 915
    },
    {
      "epoch": 0.9834313201496526,
      "grad_norm": 0.6282256245613098,
      "learning_rate": 1.56944696009953e-08,
      "loss": 1.0526,
      "step": 920
    },
    {
      "epoch": 0.9887760555852485,
      "grad_norm": 0.5956407785415649,
      "learning_rate": 6.976333847578121e-09,
      "loss": 1.0485,
      "step": 925
    },
    {
      "epoch": 0.9941207910208445,
      "grad_norm": 0.6351339221000671,
      "learning_rate": 1.7442355797825383e-09,
      "loss": 1.0796,
      "step": 930
    },
    {
      "epoch": 0.9994655264564404,
      "grad_norm": 0.580534040927887,
      "learning_rate": 0.0,
      "loss": 1.0454,
      "step": 935
    },
    {
      "epoch": 0.9994655264564404,
      "step": 935,
      "total_flos": 2.759237889794507e+18,
      "train_loss": 1.1061673424460672,
      "train_runtime": 11435.0508,
      "train_samples_per_second": 10.471,
      "train_steps_per_second": 0.082
    }
  ],
  "logging_steps": 5,
  "max_steps": 935,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.759237889794507e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}