1294 lines
31 KiB
JSON
1294 lines
31 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 18053,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.005539245554755442,
|
|
"grad_norm": 1.546007513999939,
|
|
"learning_rate": 4.972580734503961e-05,
|
|
"loss": 1.2806,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.011078491109510884,
|
|
"grad_norm": 1.6162946224212646,
|
|
"learning_rate": 4.945161469007921e-05,
|
|
"loss": 1.2654,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.016617736664266327,
|
|
"grad_norm": 2.059379816055298,
|
|
"learning_rate": 4.917465241234144e-05,
|
|
"loss": 1.2231,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.022156982219021768,
|
|
"grad_norm": 1.6531622409820557,
|
|
"learning_rate": 4.890045975738105e-05,
|
|
"loss": 1.2161,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.027696227773777213,
|
|
"grad_norm": 1.57131028175354,
|
|
"learning_rate": 4.8623497479643274e-05,
|
|
"loss": 1.1963,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.033235473328532654,
|
|
"grad_norm": 1.4883884191513062,
|
|
"learning_rate": 4.83465352019055e-05,
|
|
"loss": 1.1716,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 0.038774718883288095,
|
|
"grad_norm": 1.4190359115600586,
|
|
"learning_rate": 4.806957292416773e-05,
|
|
"loss": 1.1969,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 0.044313964438043536,
|
|
"grad_norm": 1.6759884357452393,
|
|
"learning_rate": 4.779261064642996e-05,
|
|
"loss": 1.179,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 0.04985320999279898,
|
|
"grad_norm": 1.4575251340866089,
|
|
"learning_rate": 4.7515648368692186e-05,
|
|
"loss": 1.1543,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 0.055392455547554426,
|
|
"grad_norm": 1.4753471612930298,
|
|
"learning_rate": 4.723868609095441e-05,
|
|
"loss": 1.1502,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 0.06093170110230987,
|
|
"grad_norm": 1.660173773765564,
|
|
"learning_rate": 4.696172381321664e-05,
|
|
"loss": 1.1231,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 0.06647094665706531,
|
|
"grad_norm": 1.5432391166687012,
|
|
"learning_rate": 4.668476153547887e-05,
|
|
"loss": 1.1036,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 0.07201019221182076,
|
|
"grad_norm": 1.3922276496887207,
|
|
"learning_rate": 4.64077992577411e-05,
|
|
"loss": 1.1286,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 0.07754943776657619,
|
|
"grad_norm": 1.361965298652649,
|
|
"learning_rate": 4.613083698000333e-05,
|
|
"loss": 1.1039,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 0.08308868332133164,
|
|
"grad_norm": 1.3993804454803467,
|
|
"learning_rate": 4.585387470226555e-05,
|
|
"loss": 1.1166,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 0.08862792887608707,
|
|
"grad_norm": 1.5207942724227905,
|
|
"learning_rate": 4.557691242452778e-05,
|
|
"loss": 1.1079,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 0.09416717443084252,
|
|
"grad_norm": 1.6368706226348877,
|
|
"learning_rate": 4.529995014679001e-05,
|
|
"loss": 1.1103,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 0.09970641998559795,
|
|
"grad_norm": 1.4837383031845093,
|
|
"learning_rate": 4.5022987869052236e-05,
|
|
"loss": 1.1003,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 0.1052456655403534,
|
|
"grad_norm": 1.483654499053955,
|
|
"learning_rate": 4.474602559131447e-05,
|
|
"loss": 1.0686,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 0.11078491109510885,
|
|
"grad_norm": 1.776803731918335,
|
|
"learning_rate": 4.446906331357669e-05,
|
|
"loss": 1.0892,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 0.11632415664986429,
|
|
"grad_norm": 1.6472382545471191,
|
|
"learning_rate": 4.419210103583892e-05,
|
|
"loss": 1.0892,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 0.12186340220461973,
|
|
"grad_norm": 1.4846845865249634,
|
|
"learning_rate": 4.391513875810115e-05,
|
|
"loss": 1.0421,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 0.12740264775937518,
|
|
"grad_norm": 1.4608880281448364,
|
|
"learning_rate": 4.3638176480363375e-05,
|
|
"loss": 1.0578,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 0.13294189331413062,
|
|
"grad_norm": 1.515207052230835,
|
|
"learning_rate": 4.336121420262561e-05,
|
|
"loss": 1.0652,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 0.13848113886888605,
|
|
"grad_norm": 1.5072288513183594,
|
|
"learning_rate": 4.308425192488783e-05,
|
|
"loss": 1.0855,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 0.1440203844236415,
|
|
"grad_norm": 1.4887125492095947,
|
|
"learning_rate": 4.281005926992744e-05,
|
|
"loss": 1.0739,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 0.14955962997839695,
|
|
"grad_norm": 1.3779743909835815,
|
|
"learning_rate": 4.2533096992189665e-05,
|
|
"loss": 1.0558,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 0.15509887553315238,
|
|
"grad_norm": 1.4754050970077515,
|
|
"learning_rate": 4.22561347144519e-05,
|
|
"loss": 1.0458,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 0.16063812108790781,
|
|
"grad_norm": 1.3826239109039307,
|
|
"learning_rate": 4.197917243671412e-05,
|
|
"loss": 1.0416,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 0.16617736664266328,
|
|
"grad_norm": 1.3614164590835571,
|
|
"learning_rate": 4.170221015897635e-05,
|
|
"loss": 1.0405,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.1717166121974187,
|
|
"grad_norm": 1.8164361715316772,
|
|
"learning_rate": 4.142524788123858e-05,
|
|
"loss": 1.0464,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 0.17725585775217415,
|
|
"grad_norm": 1.497776746749878,
|
|
"learning_rate": 4.11482856035008e-05,
|
|
"loss": 1.0242,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 0.1827951033069296,
|
|
"grad_norm": 1.5668854713439941,
|
|
"learning_rate": 4.0871323325763036e-05,
|
|
"loss": 1.0444,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 0.18833434886168504,
|
|
"grad_norm": 1.3012648820877075,
|
|
"learning_rate": 4.0594361048025256e-05,
|
|
"loss": 1.019,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 0.19387359441644048,
|
|
"grad_norm": 1.4110270738601685,
|
|
"learning_rate": 4.031739877028749e-05,
|
|
"loss": 1.031,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 0.1994128399711959,
|
|
"grad_norm": 1.7231452465057373,
|
|
"learning_rate": 4.0040436492549715e-05,
|
|
"loss": 1.0458,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 0.20495208552595137,
|
|
"grad_norm": 1.5503065586090088,
|
|
"learning_rate": 3.976347421481195e-05,
|
|
"loss": 1.0141,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 0.2104913310807068,
|
|
"grad_norm": 1.3918309211730957,
|
|
"learning_rate": 3.9486511937074175e-05,
|
|
"loss": 1.0169,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 0.21603057663546224,
|
|
"grad_norm": 1.4343814849853516,
|
|
"learning_rate": 3.9209549659336394e-05,
|
|
"loss": 1.0247,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 0.2215698221902177,
|
|
"grad_norm": 1.4719725847244263,
|
|
"learning_rate": 3.893258738159863e-05,
|
|
"loss": 1.0043,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 0.22710906774497314,
|
|
"grad_norm": 1.3622781038284302,
|
|
"learning_rate": 3.8655625103860854e-05,
|
|
"loss": 1.0282,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 0.23264831329972857,
|
|
"grad_norm": 1.5316309928894043,
|
|
"learning_rate": 3.837866282612309e-05,
|
|
"loss": 1.0034,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 0.23818755885448403,
|
|
"grad_norm": 1.3682796955108643,
|
|
"learning_rate": 3.8104470171162684e-05,
|
|
"loss": 1.0456,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 0.24372680440923947,
|
|
"grad_norm": 1.4921820163726807,
|
|
"learning_rate": 3.782750789342492e-05,
|
|
"loss": 1.0166,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 0.2492660499639949,
|
|
"grad_norm": 1.4957038164138794,
|
|
"learning_rate": 3.7550545615687143e-05,
|
|
"loss": 1.0079,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 0.25480529551875036,
|
|
"grad_norm": 1.5265921354293823,
|
|
"learning_rate": 3.7273583337949377e-05,
|
|
"loss": 1.0297,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 0.2603445410735058,
|
|
"grad_norm": 1.4102026224136353,
|
|
"learning_rate": 3.69966210602116e-05,
|
|
"loss": 0.978,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 0.26588378662826123,
|
|
"grad_norm": 1.3901424407958984,
|
|
"learning_rate": 3.671965878247383e-05,
|
|
"loss": 1.0043,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 0.27142303218301667,
|
|
"grad_norm": 1.4173741340637207,
|
|
"learning_rate": 3.6442696504736056e-05,
|
|
"loss": 0.9725,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 0.2769622777377721,
|
|
"grad_norm": 1.2784532308578491,
|
|
"learning_rate": 3.616573422699828e-05,
|
|
"loss": 0.9876,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 0.28250152329252753,
|
|
"grad_norm": 1.4648102521896362,
|
|
"learning_rate": 3.5888771949260515e-05,
|
|
"loss": 0.9715,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 0.288040768847283,
|
|
"grad_norm": 1.410418152809143,
|
|
"learning_rate": 3.561180967152274e-05,
|
|
"loss": 1.0037,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 0.29358001440203846,
|
|
"grad_norm": 1.6049284934997559,
|
|
"learning_rate": 3.533484739378497e-05,
|
|
"loss": 0.9674,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 0.2991192599567939,
|
|
"grad_norm": 1.3856281042099,
|
|
"learning_rate": 3.5057885116047194e-05,
|
|
"loss": 0.9927,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 0.3046585055115493,
|
|
"grad_norm": 1.485903024673462,
|
|
"learning_rate": 3.478092283830943e-05,
|
|
"loss": 0.9421,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 0.31019775106630476,
|
|
"grad_norm": 1.2787377834320068,
|
|
"learning_rate": 3.4503960560571654e-05,
|
|
"loss": 0.9697,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 0.3157369966210602,
|
|
"grad_norm": 1.4596630334854126,
|
|
"learning_rate": 3.422699828283388e-05,
|
|
"loss": 0.9743,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 0.32127624217581563,
|
|
"grad_norm": 1.6178170442581177,
|
|
"learning_rate": 3.3950036005096106e-05,
|
|
"loss": 0.979,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 0.3268154877305711,
|
|
"grad_norm": 1.5128576755523682,
|
|
"learning_rate": 3.367307372735833e-05,
|
|
"loss": 0.9507,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 0.33235473328532655,
|
|
"grad_norm": 1.4769525527954102,
|
|
"learning_rate": 3.3396111449620566e-05,
|
|
"loss": 0.9651,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.337893978840082,
|
|
"grad_norm": 1.3777639865875244,
|
|
"learning_rate": 3.311914917188279e-05,
|
|
"loss": 0.9747,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 0.3434332243948374,
|
|
"grad_norm": 1.492494821548462,
|
|
"learning_rate": 3.284218689414502e-05,
|
|
"loss": 0.9858,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 0.34897246994959286,
|
|
"grad_norm": 1.282101035118103,
|
|
"learning_rate": 3.2565224616407245e-05,
|
|
"loss": 0.9761,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 0.3545117155043483,
|
|
"grad_norm": 1.3632028102874756,
|
|
"learning_rate": 3.228826233866947e-05,
|
|
"loss": 0.958,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 0.3600509610591037,
|
|
"grad_norm": 1.4041866064071655,
|
|
"learning_rate": 3.2011300060931704e-05,
|
|
"loss": 0.9772,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 0.3655902066138592,
|
|
"grad_norm": 1.3332486152648926,
|
|
"learning_rate": 3.173433778319393e-05,
|
|
"loss": 0.9669,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 0.37112945216861465,
|
|
"grad_norm": 1.3557853698730469,
|
|
"learning_rate": 3.145737550545616e-05,
|
|
"loss": 0.9431,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 0.3766686977233701,
|
|
"grad_norm": 1.311341404914856,
|
|
"learning_rate": 3.118041322771838e-05,
|
|
"loss": 0.9675,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 0.3822079432781255,
|
|
"grad_norm": 1.3420363664627075,
|
|
"learning_rate": 3.0903450949980616e-05,
|
|
"loss": 0.9443,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 0.38774718883288095,
|
|
"grad_norm": 1.5872620344161987,
|
|
"learning_rate": 3.062648867224284e-05,
|
|
"loss": 0.9534,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 0.3932864343876364,
|
|
"grad_norm": 1.5827562808990479,
|
|
"learning_rate": 3.035229601728245e-05,
|
|
"loss": 0.9618,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 0.3988256799423918,
|
|
"grad_norm": 1.4099847078323364,
|
|
"learning_rate": 3.0075333739544676e-05,
|
|
"loss": 0.94,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 0.4043649254971473,
|
|
"grad_norm": 1.6116340160369873,
|
|
"learning_rate": 2.97983714618069e-05,
|
|
"loss": 0.9461,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 0.40990417105190274,
|
|
"grad_norm": 1.3645659685134888,
|
|
"learning_rate": 2.9521409184069133e-05,
|
|
"loss": 0.9427,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 0.4154434166066582,
|
|
"grad_norm": 1.7124513387680054,
|
|
"learning_rate": 2.924444690633136e-05,
|
|
"loss": 0.9431,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 0.4209826621614136,
|
|
"grad_norm": 1.47300124168396,
|
|
"learning_rate": 2.896748462859359e-05,
|
|
"loss": 0.94,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 0.42652190771616905,
|
|
"grad_norm": 1.4425369501113892,
|
|
"learning_rate": 2.8690522350855815e-05,
|
|
"loss": 0.9439,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"epoch": 0.4320611532709245,
|
|
"grad_norm": 1.1862937211990356,
|
|
"learning_rate": 2.8413560073118045e-05,
|
|
"loss": 0.9429,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"epoch": 0.4376003988256799,
|
|
"grad_norm": 1.3416625261306763,
|
|
"learning_rate": 2.813659779538027e-05,
|
|
"loss": 0.9228,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"epoch": 0.4431396443804354,
|
|
"grad_norm": 1.4095027446746826,
|
|
"learning_rate": 2.7859635517642497e-05,
|
|
"loss": 0.9477,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"epoch": 0.44867888993519084,
|
|
"grad_norm": 1.3794485330581665,
|
|
"learning_rate": 2.7582673239904727e-05,
|
|
"loss": 0.9234,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"epoch": 0.4542181354899463,
|
|
"grad_norm": 1.638173222541809,
|
|
"learning_rate": 2.7305710962166953e-05,
|
|
"loss": 0.9239,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"epoch": 0.4597573810447017,
|
|
"grad_norm": 1.4151724576950073,
|
|
"learning_rate": 2.7028748684429183e-05,
|
|
"loss": 0.9488,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"epoch": 0.46529662659945714,
|
|
"grad_norm": 1.4664607048034668,
|
|
"learning_rate": 2.675178640669141e-05,
|
|
"loss": 0.9281,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"epoch": 0.4708358721542126,
|
|
"grad_norm": 1.4272499084472656,
|
|
"learning_rate": 2.647482412895364e-05,
|
|
"loss": 0.9252,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"epoch": 0.47637511770896807,
|
|
"grad_norm": 1.380058765411377,
|
|
"learning_rate": 2.6197861851215866e-05,
|
|
"loss": 0.9067,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"epoch": 0.4819143632637235,
|
|
"grad_norm": 1.385686993598938,
|
|
"learning_rate": 2.5920899573478092e-05,
|
|
"loss": 0.921,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"epoch": 0.48745360881847893,
|
|
"grad_norm": 1.4072484970092773,
|
|
"learning_rate": 2.5643937295740322e-05,
|
|
"loss": 0.9134,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"epoch": 0.49299285437323437,
|
|
"grad_norm": 1.5359854698181152,
|
|
"learning_rate": 2.5366975018002548e-05,
|
|
"loss": 0.9355,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"epoch": 0.4985320999279898,
|
|
"grad_norm": 1.3707598447799683,
|
|
"learning_rate": 2.5090012740264778e-05,
|
|
"loss": 0.932,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.5040713454827452,
|
|
"grad_norm": 1.4937622547149658,
|
|
"learning_rate": 2.4815820085304385e-05,
|
|
"loss": 0.929,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"epoch": 0.5096105910375007,
|
|
"grad_norm": 1.4973763227462769,
|
|
"learning_rate": 2.4538857807566608e-05,
|
|
"loss": 0.9313,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"epoch": 0.5151498365922561,
|
|
"grad_norm": 1.3731300830841064,
|
|
"learning_rate": 2.4261895529828838e-05,
|
|
"loss": 0.8977,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"epoch": 0.5206890821470116,
|
|
"grad_norm": 1.5620144605636597,
|
|
"learning_rate": 2.3984933252091064e-05,
|
|
"loss": 0.9255,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"epoch": 0.526228327701767,
|
|
"grad_norm": 1.4226034879684448,
|
|
"learning_rate": 2.3707970974353294e-05,
|
|
"loss": 0.8994,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"epoch": 0.5317675732565225,
|
|
"grad_norm": 1.3993937969207764,
|
|
"learning_rate": 2.3431008696615524e-05,
|
|
"loss": 0.9212,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"epoch": 0.537306818811278,
|
|
"grad_norm": 1.4125133752822876,
|
|
"learning_rate": 2.315404641887775e-05,
|
|
"loss": 0.9072,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"epoch": 0.5428460643660333,
|
|
"grad_norm": 1.2466013431549072,
|
|
"learning_rate": 2.287708414113998e-05,
|
|
"loss": 0.9155,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"epoch": 0.5483853099207888,
|
|
"grad_norm": 1.319427251815796,
|
|
"learning_rate": 2.2600121863402206e-05,
|
|
"loss": 0.8968,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"epoch": 0.5539245554755442,
|
|
"grad_norm": 1.5401791334152222,
|
|
"learning_rate": 2.2323159585664432e-05,
|
|
"loss": 0.9146,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"epoch": 0.5594638010302997,
|
|
"grad_norm": 1.2928884029388428,
|
|
"learning_rate": 2.2046197307926662e-05,
|
|
"loss": 0.8996,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"epoch": 0.5650030465850551,
|
|
"grad_norm": 1.5264612436294556,
|
|
"learning_rate": 2.176923503018889e-05,
|
|
"loss": 0.8889,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"epoch": 0.5705422921398106,
|
|
"grad_norm": 1.5760724544525146,
|
|
"learning_rate": 2.1492272752451118e-05,
|
|
"loss": 0.8978,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"epoch": 0.576081537694566,
|
|
"grad_norm": 1.414896845817566,
|
|
"learning_rate": 2.1215310474713345e-05,
|
|
"loss": 0.8989,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"epoch": 0.5816207832493214,
|
|
"grad_norm": 1.422662377357483,
|
|
"learning_rate": 2.0938348196975574e-05,
|
|
"loss": 0.8695,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"epoch": 0.5871600288040769,
|
|
"grad_norm": 1.6162989139556885,
|
|
"learning_rate": 2.06613859192378e-05,
|
|
"loss": 0.9017,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"epoch": 0.5926992743588323,
|
|
"grad_norm": 1.537391185760498,
|
|
"learning_rate": 2.0384423641500027e-05,
|
|
"loss": 0.8946,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"epoch": 0.5982385199135878,
|
|
"grad_norm": 1.3381567001342773,
|
|
"learning_rate": 2.0107461363762257e-05,
|
|
"loss": 0.8901,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"epoch": 0.6037777654683432,
|
|
"grad_norm": 1.3765337467193604,
|
|
"learning_rate": 1.9830499086024483e-05,
|
|
"loss": 0.8872,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"epoch": 0.6093170110230987,
|
|
"grad_norm": 1.5019334554672241,
|
|
"learning_rate": 1.9553536808286713e-05,
|
|
"loss": 0.8688,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"epoch": 0.6148562565778541,
|
|
"grad_norm": 1.5896552801132202,
|
|
"learning_rate": 1.9276574530548943e-05,
|
|
"loss": 0.8863,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"epoch": 0.6203955021326095,
|
|
"grad_norm": 1.481461524963379,
|
|
"learning_rate": 1.899961225281117e-05,
|
|
"loss": 0.8735,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"epoch": 0.625934747687365,
|
|
"grad_norm": 1.5073565244674683,
|
|
"learning_rate": 1.8722649975073395e-05,
|
|
"loss": 0.8929,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"epoch": 0.6314739932421204,
|
|
"grad_norm": 1.3693833351135254,
|
|
"learning_rate": 1.844568769733562e-05,
|
|
"loss": 0.8478,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"epoch": 0.6370132387968759,
|
|
"grad_norm": 1.481180191040039,
|
|
"learning_rate": 1.816872541959785e-05,
|
|
"loss": 0.8784,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"epoch": 0.6425524843516313,
|
|
"grad_norm": 1.5158451795578003,
|
|
"learning_rate": 1.789176314186008e-05,
|
|
"loss": 0.8697,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"epoch": 0.6480917299063867,
|
|
"grad_norm": 1.4979525804519653,
|
|
"learning_rate": 1.7614800864122307e-05,
|
|
"loss": 0.9006,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"epoch": 0.6536309754611422,
|
|
"grad_norm": 1.3702558279037476,
|
|
"learning_rate": 1.7337838586384537e-05,
|
|
"loss": 0.8745,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"epoch": 0.6591702210158976,
|
|
"grad_norm": 1.298034429550171,
|
|
"learning_rate": 1.7060876308646763e-05,
|
|
"loss": 0.8719,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"epoch": 0.6647094665706531,
|
|
"grad_norm": 1.3086830377578735,
|
|
"learning_rate": 1.678668365368637e-05,
|
|
"loss": 0.8478,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 0.6702487121254085,
|
|
"grad_norm": 1.4746365547180176,
|
|
"learning_rate": 1.6509721375948597e-05,
|
|
"loss": 0.8843,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"epoch": 0.675787957680164,
|
|
"grad_norm": 1.503086805343628,
|
|
"learning_rate": 1.6232759098210827e-05,
|
|
"loss": 0.8948,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"epoch": 0.6813272032349194,
|
|
"grad_norm": 1.4060204029083252,
|
|
"learning_rate": 1.595579682047305e-05,
|
|
"loss": 0.8695,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"epoch": 0.6868664487896748,
|
|
"grad_norm": 1.5291565656661987,
|
|
"learning_rate": 1.568160416551266e-05,
|
|
"loss": 0.9056,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"epoch": 0.6924056943444303,
|
|
"grad_norm": 1.3221312761306763,
|
|
"learning_rate": 1.5404641887774887e-05,
|
|
"loss": 0.8651,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"epoch": 0.6979449398991857,
|
|
"grad_norm": 1.5272188186645508,
|
|
"learning_rate": 1.5127679610037113e-05,
|
|
"loss": 0.8725,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"epoch": 0.7034841854539412,
|
|
"grad_norm": 1.4335006475448608,
|
|
"learning_rate": 1.4850717332299341e-05,
|
|
"loss": 0.8663,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"epoch": 0.7090234310086966,
|
|
"grad_norm": 1.321509838104248,
|
|
"learning_rate": 1.457375505456157e-05,
|
|
"loss": 0.8565,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"epoch": 0.7145626765634521,
|
|
"grad_norm": 1.3843624591827393,
|
|
"learning_rate": 1.4296792776823797e-05,
|
|
"loss": 0.8614,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"epoch": 0.7201019221182074,
|
|
"grad_norm": 1.3091601133346558,
|
|
"learning_rate": 1.4019830499086025e-05,
|
|
"loss": 0.8538,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"epoch": 0.7256411676729629,
|
|
"grad_norm": 1.5815715789794922,
|
|
"learning_rate": 1.3742868221348253e-05,
|
|
"loss": 0.8624,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"epoch": 0.7311804132277184,
|
|
"grad_norm": 1.6078437566757202,
|
|
"learning_rate": 1.346590594361048e-05,
|
|
"loss": 0.8405,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"epoch": 0.7367196587824738,
|
|
"grad_norm": 1.446047306060791,
|
|
"learning_rate": 1.3188943665872708e-05,
|
|
"loss": 0.8445,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"epoch": 0.7422589043372293,
|
|
"grad_norm": 1.207653284072876,
|
|
"learning_rate": 1.2911981388134936e-05,
|
|
"loss": 0.8431,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"epoch": 0.7477981498919847,
|
|
"grad_norm": 1.3394980430603027,
|
|
"learning_rate": 1.2635019110397164e-05,
|
|
"loss": 0.868,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"epoch": 0.7533373954467402,
|
|
"grad_norm": 1.549267053604126,
|
|
"learning_rate": 1.2358056832659392e-05,
|
|
"loss": 0.8604,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"epoch": 0.7588766410014955,
|
|
"grad_norm": 1.3387057781219482,
|
|
"learning_rate": 1.2081094554921622e-05,
|
|
"loss": 0.8563,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"epoch": 0.764415886556251,
|
|
"grad_norm": 1.3845041990280151,
|
|
"learning_rate": 1.1804132277183848e-05,
|
|
"loss": 0.8365,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"epoch": 0.7699551321110065,
|
|
"grad_norm": 1.4750843048095703,
|
|
"learning_rate": 1.1527169999446076e-05,
|
|
"loss": 0.8322,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"epoch": 0.7754943776657619,
|
|
"grad_norm": 1.4781407117843628,
|
|
"learning_rate": 1.1250207721708304e-05,
|
|
"loss": 0.8696,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"epoch": 0.7810336232205174,
|
|
"grad_norm": 1.5403274297714233,
|
|
"learning_rate": 1.0973245443970532e-05,
|
|
"loss": 0.8381,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"epoch": 0.7865728687752728,
|
|
"grad_norm": 1.4403986930847168,
|
|
"learning_rate": 1.069628316623276e-05,
|
|
"loss": 0.8341,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"epoch": 0.7921121143300283,
|
|
"grad_norm": 1.509552001953125,
|
|
"learning_rate": 1.0419320888494988e-05,
|
|
"loss": 0.8689,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"epoch": 0.7976513598847836,
|
|
"grad_norm": 1.567030668258667,
|
|
"learning_rate": 1.0142358610757214e-05,
|
|
"loss": 0.8496,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"epoch": 0.8031906054395391,
|
|
"grad_norm": 1.4610817432403564,
|
|
"learning_rate": 9.865396333019443e-06,
|
|
"loss": 0.8694,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"epoch": 0.8087298509942946,
|
|
"grad_norm": 1.4665472507476807,
|
|
"learning_rate": 9.58843405528167e-06,
|
|
"loss": 0.843,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"epoch": 0.81426909654905,
|
|
"grad_norm": 1.45100998878479,
|
|
"learning_rate": 9.3114717775439e-06,
|
|
"loss": 0.8529,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"epoch": 0.8198083421038055,
|
|
"grad_norm": 1.4459706544876099,
|
|
"learning_rate": 9.034509499806127e-06,
|
|
"loss": 0.8251,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"epoch": 0.8253475876585609,
|
|
"grad_norm": 1.408592700958252,
|
|
"learning_rate": 8.757547222068355e-06,
|
|
"loss": 0.858,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"epoch": 0.8308868332133164,
|
|
"grad_norm": 1.4576612710952759,
|
|
"learning_rate": 8.480584944330583e-06,
|
|
"loss": 0.8294,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 0.8364260787680717,
|
|
"grad_norm": 1.4453418254852295,
|
|
"learning_rate": 8.203622666592809e-06,
|
|
"loss": 0.8421,
|
|
"step": 15100
|
|
},
|
|
{
|
|
"epoch": 0.8419653243228272,
|
|
"grad_norm": 1.4345848560333252,
|
|
"learning_rate": 7.926660388855039e-06,
|
|
"loss": 0.8338,
|
|
"step": 15200
|
|
},
|
|
{
|
|
"epoch": 0.8475045698775827,
|
|
"grad_norm": 1.4117823839187622,
|
|
"learning_rate": 7.649698111117267e-06,
|
|
"loss": 0.8494,
|
|
"step": 15300
|
|
},
|
|
{
|
|
"epoch": 0.8530438154323381,
|
|
"grad_norm": 1.7380582094192505,
|
|
"learning_rate": 7.372735833379495e-06,
|
|
"loss": 0.831,
|
|
"step": 15400
|
|
},
|
|
{
|
|
"epoch": 0.8585830609870936,
|
|
"grad_norm": 1.372897744178772,
|
|
"learning_rate": 7.095773555641721e-06,
|
|
"loss": 0.8201,
|
|
"step": 15500
|
|
},
|
|
{
|
|
"epoch": 0.864122306541849,
|
|
"grad_norm": 1.482770562171936,
|
|
"learning_rate": 6.81881127790395e-06,
|
|
"loss": 0.8221,
|
|
"step": 15600
|
|
},
|
|
{
|
|
"epoch": 0.8696615520966045,
|
|
"grad_norm": 1.394877314567566,
|
|
"learning_rate": 6.541849000166178e-06,
|
|
"loss": 0.8441,
|
|
"step": 15700
|
|
},
|
|
{
|
|
"epoch": 0.8752007976513598,
|
|
"grad_norm": 1.4818041324615479,
|
|
"learning_rate": 6.264886722428405e-06,
|
|
"loss": 0.843,
|
|
"step": 15800
|
|
},
|
|
{
|
|
"epoch": 0.8807400432061153,
|
|
"grad_norm": 1.8270115852355957,
|
|
"learning_rate": 5.9879244446906334e-06,
|
|
"loss": 0.851,
|
|
"step": 15900
|
|
},
|
|
{
|
|
"epoch": 0.8862792887608708,
|
|
"grad_norm": 1.4031203985214233,
|
|
"learning_rate": 5.7109621669528615e-06,
|
|
"loss": 0.8515,
|
|
"step": 16000
|
|
},
|
|
{
|
|
"epoch": 0.8918185343156262,
|
|
"grad_norm": 1.5000931024551392,
|
|
"learning_rate": 5.4339998892150895e-06,
|
|
"loss": 0.819,
|
|
"step": 16100
|
|
},
|
|
{
|
|
"epoch": 0.8973577798703817,
|
|
"grad_norm": 1.4219608306884766,
|
|
"learning_rate": 5.1570376114773176e-06,
|
|
"loss": 0.8528,
|
|
"step": 16200
|
|
},
|
|
{
|
|
"epoch": 0.9028970254251371,
|
|
"grad_norm": 1.6713409423828125,
|
|
"learning_rate": 4.880075333739545e-06,
|
|
"loss": 0.8253,
|
|
"step": 16300
|
|
},
|
|
{
|
|
"epoch": 0.9084362709798925,
|
|
"grad_norm": 1.642972707748413,
|
|
"learning_rate": 4.603113056001773e-06,
|
|
"loss": 0.8199,
|
|
"step": 16400
|
|
},
|
|
{
|
|
"epoch": 0.913975516534648,
|
|
"grad_norm": 1.4362276792526245,
|
|
"learning_rate": 4.328920401041378e-06,
|
|
"loss": 0.8186,
|
|
"step": 16500
|
|
},
|
|
{
|
|
"epoch": 0.9195147620894034,
|
|
"grad_norm": 1.387813925743103,
|
|
"learning_rate": 4.051958123303606e-06,
|
|
"loss": 0.8447,
|
|
"step": 16600
|
|
},
|
|
{
|
|
"epoch": 0.9250540076441589,
|
|
"grad_norm": 1.7197731733322144,
|
|
"learning_rate": 3.774995845565834e-06,
|
|
"loss": 0.8416,
|
|
"step": 16700
|
|
},
|
|
{
|
|
"epoch": 0.9305932531989143,
|
|
"grad_norm": 1.4148505926132202,
|
|
"learning_rate": 3.498033567828062e-06,
|
|
"loss": 0.8388,
|
|
"step": 16800
|
|
},
|
|
{
|
|
"epoch": 0.9361324987536698,
|
|
"grad_norm": 1.5057902336120605,
|
|
"learning_rate": 3.2210712900902897e-06,
|
|
"loss": 0.8141,
|
|
"step": 16900
|
|
},
|
|
{
|
|
"epoch": 0.9416717443084252,
|
|
"grad_norm": 1.6533297300338745,
|
|
"learning_rate": 2.9441090123525177e-06,
|
|
"loss": 0.8131,
|
|
"step": 17000
|
|
},
|
|
{
|
|
"epoch": 0.9472109898631806,
|
|
"grad_norm": 1.550384759902954,
|
|
"learning_rate": 2.6671467346147458e-06,
|
|
"loss": 0.8474,
|
|
"step": 17100
|
|
},
|
|
{
|
|
"epoch": 0.9527502354179361,
|
|
"grad_norm": 1.3930208683013916,
|
|
"learning_rate": 2.3901844568769734e-06,
|
|
"loss": 0.8252,
|
|
"step": 17200
|
|
},
|
|
{
|
|
"epoch": 0.9582894809726915,
|
|
"grad_norm": 1.534490704536438,
|
|
"learning_rate": 2.1132221791392014e-06,
|
|
"loss": 0.8333,
|
|
"step": 17300
|
|
},
|
|
{
|
|
"epoch": 0.963828726527447,
|
|
"grad_norm": 1.496036171913147,
|
|
"learning_rate": 1.836259901401429e-06,
|
|
"loss": 0.8452,
|
|
"step": 17400
|
|
},
|
|
{
|
|
"epoch": 0.9693679720822024,
|
|
"grad_norm": 1.4429802894592285,
|
|
"learning_rate": 1.559297623663657e-06,
|
|
"loss": 0.8368,
|
|
"step": 17500
|
|
},
|
|
{
|
|
"epoch": 0.9749072176369579,
|
|
"grad_norm": 1.4180331230163574,
|
|
"learning_rate": 1.282335345925885e-06,
|
|
"loss": 0.8139,
|
|
"step": 17600
|
|
},
|
|
{
|
|
"epoch": 0.9804464631917132,
|
|
"grad_norm": 1.423274278640747,
|
|
"learning_rate": 1.005373068188113e-06,
|
|
"loss": 0.809,
|
|
"step": 17700
|
|
},
|
|
{
|
|
"epoch": 0.9859857087464687,
|
|
"grad_norm": 1.4893913269042969,
|
|
"learning_rate": 7.284107904503407e-07,
|
|
"loss": 0.8271,
|
|
"step": 17800
|
|
},
|
|
{
|
|
"epoch": 0.9915249543012242,
|
|
"grad_norm": 1.5484572649002075,
|
|
"learning_rate": 4.5144851271256857e-07,
|
|
"loss": 0.8409,
|
|
"step": 17900
|
|
},
|
|
{
|
|
"epoch": 0.9970641998559796,
|
|
"grad_norm": 1.423147439956665,
|
|
"learning_rate": 1.7448623497479645e-07,
|
|
"loss": 0.8285,
|
|
"step": 18000
|
|
}
|
|
],
|
|
"logging_steps": 100,
|
|
"max_steps": 18053,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 1,
|
|
"save_steps": 18053,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 5.47361101474431e+17,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|