Files
Qllama-tiny-.5B-test-1/trainer_state.json
ModelHub XC ef8182ee05 初始化项目,由ModelHub XC社区提供模型
Model: Josephgflowers/Qllama-tiny-.5B-test-1
Source: Original Platform
2026-06-24 12:02:19 +08:00

1294 lines
31 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 18053,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005539245554755442,
"grad_norm": 1.546007513999939,
"learning_rate": 4.972580734503961e-05,
"loss": 1.2806,
"step": 100
},
{
"epoch": 0.011078491109510884,
"grad_norm": 1.6162946224212646,
"learning_rate": 4.945161469007921e-05,
"loss": 1.2654,
"step": 200
},
{
"epoch": 0.016617736664266327,
"grad_norm": 2.059379816055298,
"learning_rate": 4.917465241234144e-05,
"loss": 1.2231,
"step": 300
},
{
"epoch": 0.022156982219021768,
"grad_norm": 1.6531622409820557,
"learning_rate": 4.890045975738105e-05,
"loss": 1.2161,
"step": 400
},
{
"epoch": 0.027696227773777213,
"grad_norm": 1.57131028175354,
"learning_rate": 4.8623497479643274e-05,
"loss": 1.1963,
"step": 500
},
{
"epoch": 0.033235473328532654,
"grad_norm": 1.4883884191513062,
"learning_rate": 4.83465352019055e-05,
"loss": 1.1716,
"step": 600
},
{
"epoch": 0.038774718883288095,
"grad_norm": 1.4190359115600586,
"learning_rate": 4.806957292416773e-05,
"loss": 1.1969,
"step": 700
},
{
"epoch": 0.044313964438043536,
"grad_norm": 1.6759884357452393,
"learning_rate": 4.779261064642996e-05,
"loss": 1.179,
"step": 800
},
{
"epoch": 0.04985320999279898,
"grad_norm": 1.4575251340866089,
"learning_rate": 4.7515648368692186e-05,
"loss": 1.1543,
"step": 900
},
{
"epoch": 0.055392455547554426,
"grad_norm": 1.4753471612930298,
"learning_rate": 4.723868609095441e-05,
"loss": 1.1502,
"step": 1000
},
{
"epoch": 0.06093170110230987,
"grad_norm": 1.660173773765564,
"learning_rate": 4.696172381321664e-05,
"loss": 1.1231,
"step": 1100
},
{
"epoch": 0.06647094665706531,
"grad_norm": 1.5432391166687012,
"learning_rate": 4.668476153547887e-05,
"loss": 1.1036,
"step": 1200
},
{
"epoch": 0.07201019221182076,
"grad_norm": 1.3922276496887207,
"learning_rate": 4.64077992577411e-05,
"loss": 1.1286,
"step": 1300
},
{
"epoch": 0.07754943776657619,
"grad_norm": 1.361965298652649,
"learning_rate": 4.613083698000333e-05,
"loss": 1.1039,
"step": 1400
},
{
"epoch": 0.08308868332133164,
"grad_norm": 1.3993804454803467,
"learning_rate": 4.585387470226555e-05,
"loss": 1.1166,
"step": 1500
},
{
"epoch": 0.08862792887608707,
"grad_norm": 1.5207942724227905,
"learning_rate": 4.557691242452778e-05,
"loss": 1.1079,
"step": 1600
},
{
"epoch": 0.09416717443084252,
"grad_norm": 1.6368706226348877,
"learning_rate": 4.529995014679001e-05,
"loss": 1.1103,
"step": 1700
},
{
"epoch": 0.09970641998559795,
"grad_norm": 1.4837383031845093,
"learning_rate": 4.5022987869052236e-05,
"loss": 1.1003,
"step": 1800
},
{
"epoch": 0.1052456655403534,
"grad_norm": 1.483654499053955,
"learning_rate": 4.474602559131447e-05,
"loss": 1.0686,
"step": 1900
},
{
"epoch": 0.11078491109510885,
"grad_norm": 1.776803731918335,
"learning_rate": 4.446906331357669e-05,
"loss": 1.0892,
"step": 2000
},
{
"epoch": 0.11632415664986429,
"grad_norm": 1.6472382545471191,
"learning_rate": 4.419210103583892e-05,
"loss": 1.0892,
"step": 2100
},
{
"epoch": 0.12186340220461973,
"grad_norm": 1.4846845865249634,
"learning_rate": 4.391513875810115e-05,
"loss": 1.0421,
"step": 2200
},
{
"epoch": 0.12740264775937518,
"grad_norm": 1.4608880281448364,
"learning_rate": 4.3638176480363375e-05,
"loss": 1.0578,
"step": 2300
},
{
"epoch": 0.13294189331413062,
"grad_norm": 1.515207052230835,
"learning_rate": 4.336121420262561e-05,
"loss": 1.0652,
"step": 2400
},
{
"epoch": 0.13848113886888605,
"grad_norm": 1.5072288513183594,
"learning_rate": 4.308425192488783e-05,
"loss": 1.0855,
"step": 2500
},
{
"epoch": 0.1440203844236415,
"grad_norm": 1.4887125492095947,
"learning_rate": 4.281005926992744e-05,
"loss": 1.0739,
"step": 2600
},
{
"epoch": 0.14955962997839695,
"grad_norm": 1.3779743909835815,
"learning_rate": 4.2533096992189665e-05,
"loss": 1.0558,
"step": 2700
},
{
"epoch": 0.15509887553315238,
"grad_norm": 1.4754050970077515,
"learning_rate": 4.22561347144519e-05,
"loss": 1.0458,
"step": 2800
},
{
"epoch": 0.16063812108790781,
"grad_norm": 1.3826239109039307,
"learning_rate": 4.197917243671412e-05,
"loss": 1.0416,
"step": 2900
},
{
"epoch": 0.16617736664266328,
"grad_norm": 1.3614164590835571,
"learning_rate": 4.170221015897635e-05,
"loss": 1.0405,
"step": 3000
},
{
"epoch": 0.1717166121974187,
"grad_norm": 1.8164361715316772,
"learning_rate": 4.142524788123858e-05,
"loss": 1.0464,
"step": 3100
},
{
"epoch": 0.17725585775217415,
"grad_norm": 1.497776746749878,
"learning_rate": 4.11482856035008e-05,
"loss": 1.0242,
"step": 3200
},
{
"epoch": 0.1827951033069296,
"grad_norm": 1.5668854713439941,
"learning_rate": 4.0871323325763036e-05,
"loss": 1.0444,
"step": 3300
},
{
"epoch": 0.18833434886168504,
"grad_norm": 1.3012648820877075,
"learning_rate": 4.0594361048025256e-05,
"loss": 1.019,
"step": 3400
},
{
"epoch": 0.19387359441644048,
"grad_norm": 1.4110270738601685,
"learning_rate": 4.031739877028749e-05,
"loss": 1.031,
"step": 3500
},
{
"epoch": 0.1994128399711959,
"grad_norm": 1.7231452465057373,
"learning_rate": 4.0040436492549715e-05,
"loss": 1.0458,
"step": 3600
},
{
"epoch": 0.20495208552595137,
"grad_norm": 1.5503065586090088,
"learning_rate": 3.976347421481195e-05,
"loss": 1.0141,
"step": 3700
},
{
"epoch": 0.2104913310807068,
"grad_norm": 1.3918309211730957,
"learning_rate": 3.9486511937074175e-05,
"loss": 1.0169,
"step": 3800
},
{
"epoch": 0.21603057663546224,
"grad_norm": 1.4343814849853516,
"learning_rate": 3.9209549659336394e-05,
"loss": 1.0247,
"step": 3900
},
{
"epoch": 0.2215698221902177,
"grad_norm": 1.4719725847244263,
"learning_rate": 3.893258738159863e-05,
"loss": 1.0043,
"step": 4000
},
{
"epoch": 0.22710906774497314,
"grad_norm": 1.3622781038284302,
"learning_rate": 3.8655625103860854e-05,
"loss": 1.0282,
"step": 4100
},
{
"epoch": 0.23264831329972857,
"grad_norm": 1.5316309928894043,
"learning_rate": 3.837866282612309e-05,
"loss": 1.0034,
"step": 4200
},
{
"epoch": 0.23818755885448403,
"grad_norm": 1.3682796955108643,
"learning_rate": 3.8104470171162684e-05,
"loss": 1.0456,
"step": 4300
},
{
"epoch": 0.24372680440923947,
"grad_norm": 1.4921820163726807,
"learning_rate": 3.782750789342492e-05,
"loss": 1.0166,
"step": 4400
},
{
"epoch": 0.2492660499639949,
"grad_norm": 1.4957038164138794,
"learning_rate": 3.7550545615687143e-05,
"loss": 1.0079,
"step": 4500
},
{
"epoch": 0.25480529551875036,
"grad_norm": 1.5265921354293823,
"learning_rate": 3.7273583337949377e-05,
"loss": 1.0297,
"step": 4600
},
{
"epoch": 0.2603445410735058,
"grad_norm": 1.4102026224136353,
"learning_rate": 3.69966210602116e-05,
"loss": 0.978,
"step": 4700
},
{
"epoch": 0.26588378662826123,
"grad_norm": 1.3901424407958984,
"learning_rate": 3.671965878247383e-05,
"loss": 1.0043,
"step": 4800
},
{
"epoch": 0.27142303218301667,
"grad_norm": 1.4173741340637207,
"learning_rate": 3.6442696504736056e-05,
"loss": 0.9725,
"step": 4900
},
{
"epoch": 0.2769622777377721,
"grad_norm": 1.2784532308578491,
"learning_rate": 3.616573422699828e-05,
"loss": 0.9876,
"step": 5000
},
{
"epoch": 0.28250152329252753,
"grad_norm": 1.4648102521896362,
"learning_rate": 3.5888771949260515e-05,
"loss": 0.9715,
"step": 5100
},
{
"epoch": 0.288040768847283,
"grad_norm": 1.410418152809143,
"learning_rate": 3.561180967152274e-05,
"loss": 1.0037,
"step": 5200
},
{
"epoch": 0.29358001440203846,
"grad_norm": 1.6049284934997559,
"learning_rate": 3.533484739378497e-05,
"loss": 0.9674,
"step": 5300
},
{
"epoch": 0.2991192599567939,
"grad_norm": 1.3856281042099,
"learning_rate": 3.5057885116047194e-05,
"loss": 0.9927,
"step": 5400
},
{
"epoch": 0.3046585055115493,
"grad_norm": 1.485903024673462,
"learning_rate": 3.478092283830943e-05,
"loss": 0.9421,
"step": 5500
},
{
"epoch": 0.31019775106630476,
"grad_norm": 1.2787377834320068,
"learning_rate": 3.4503960560571654e-05,
"loss": 0.9697,
"step": 5600
},
{
"epoch": 0.3157369966210602,
"grad_norm": 1.4596630334854126,
"learning_rate": 3.422699828283388e-05,
"loss": 0.9743,
"step": 5700
},
{
"epoch": 0.32127624217581563,
"grad_norm": 1.6178170442581177,
"learning_rate": 3.3950036005096106e-05,
"loss": 0.979,
"step": 5800
},
{
"epoch": 0.3268154877305711,
"grad_norm": 1.5128576755523682,
"learning_rate": 3.367307372735833e-05,
"loss": 0.9507,
"step": 5900
},
{
"epoch": 0.33235473328532655,
"grad_norm": 1.4769525527954102,
"learning_rate": 3.3396111449620566e-05,
"loss": 0.9651,
"step": 6000
},
{
"epoch": 0.337893978840082,
"grad_norm": 1.3777639865875244,
"learning_rate": 3.311914917188279e-05,
"loss": 0.9747,
"step": 6100
},
{
"epoch": 0.3434332243948374,
"grad_norm": 1.492494821548462,
"learning_rate": 3.284218689414502e-05,
"loss": 0.9858,
"step": 6200
},
{
"epoch": 0.34897246994959286,
"grad_norm": 1.282101035118103,
"learning_rate": 3.2565224616407245e-05,
"loss": 0.9761,
"step": 6300
},
{
"epoch": 0.3545117155043483,
"grad_norm": 1.3632028102874756,
"learning_rate": 3.228826233866947e-05,
"loss": 0.958,
"step": 6400
},
{
"epoch": 0.3600509610591037,
"grad_norm": 1.4041866064071655,
"learning_rate": 3.2011300060931704e-05,
"loss": 0.9772,
"step": 6500
},
{
"epoch": 0.3655902066138592,
"grad_norm": 1.3332486152648926,
"learning_rate": 3.173433778319393e-05,
"loss": 0.9669,
"step": 6600
},
{
"epoch": 0.37112945216861465,
"grad_norm": 1.3557853698730469,
"learning_rate": 3.145737550545616e-05,
"loss": 0.9431,
"step": 6700
},
{
"epoch": 0.3766686977233701,
"grad_norm": 1.311341404914856,
"learning_rate": 3.118041322771838e-05,
"loss": 0.9675,
"step": 6800
},
{
"epoch": 0.3822079432781255,
"grad_norm": 1.3420363664627075,
"learning_rate": 3.0903450949980616e-05,
"loss": 0.9443,
"step": 6900
},
{
"epoch": 0.38774718883288095,
"grad_norm": 1.5872620344161987,
"learning_rate": 3.062648867224284e-05,
"loss": 0.9534,
"step": 7000
},
{
"epoch": 0.3932864343876364,
"grad_norm": 1.5827562808990479,
"learning_rate": 3.035229601728245e-05,
"loss": 0.9618,
"step": 7100
},
{
"epoch": 0.3988256799423918,
"grad_norm": 1.4099847078323364,
"learning_rate": 3.0075333739544676e-05,
"loss": 0.94,
"step": 7200
},
{
"epoch": 0.4043649254971473,
"grad_norm": 1.6116340160369873,
"learning_rate": 2.97983714618069e-05,
"loss": 0.9461,
"step": 7300
},
{
"epoch": 0.40990417105190274,
"grad_norm": 1.3645659685134888,
"learning_rate": 2.9521409184069133e-05,
"loss": 0.9427,
"step": 7400
},
{
"epoch": 0.4154434166066582,
"grad_norm": 1.7124513387680054,
"learning_rate": 2.924444690633136e-05,
"loss": 0.9431,
"step": 7500
},
{
"epoch": 0.4209826621614136,
"grad_norm": 1.47300124168396,
"learning_rate": 2.896748462859359e-05,
"loss": 0.94,
"step": 7600
},
{
"epoch": 0.42652190771616905,
"grad_norm": 1.4425369501113892,
"learning_rate": 2.8690522350855815e-05,
"loss": 0.9439,
"step": 7700
},
{
"epoch": 0.4320611532709245,
"grad_norm": 1.1862937211990356,
"learning_rate": 2.8413560073118045e-05,
"loss": 0.9429,
"step": 7800
},
{
"epoch": 0.4376003988256799,
"grad_norm": 1.3416625261306763,
"learning_rate": 2.813659779538027e-05,
"loss": 0.9228,
"step": 7900
},
{
"epoch": 0.4431396443804354,
"grad_norm": 1.4095027446746826,
"learning_rate": 2.7859635517642497e-05,
"loss": 0.9477,
"step": 8000
},
{
"epoch": 0.44867888993519084,
"grad_norm": 1.3794485330581665,
"learning_rate": 2.7582673239904727e-05,
"loss": 0.9234,
"step": 8100
},
{
"epoch": 0.4542181354899463,
"grad_norm": 1.638173222541809,
"learning_rate": 2.7305710962166953e-05,
"loss": 0.9239,
"step": 8200
},
{
"epoch": 0.4597573810447017,
"grad_norm": 1.4151724576950073,
"learning_rate": 2.7028748684429183e-05,
"loss": 0.9488,
"step": 8300
},
{
"epoch": 0.46529662659945714,
"grad_norm": 1.4664607048034668,
"learning_rate": 2.675178640669141e-05,
"loss": 0.9281,
"step": 8400
},
{
"epoch": 0.4708358721542126,
"grad_norm": 1.4272499084472656,
"learning_rate": 2.647482412895364e-05,
"loss": 0.9252,
"step": 8500
},
{
"epoch": 0.47637511770896807,
"grad_norm": 1.380058765411377,
"learning_rate": 2.6197861851215866e-05,
"loss": 0.9067,
"step": 8600
},
{
"epoch": 0.4819143632637235,
"grad_norm": 1.385686993598938,
"learning_rate": 2.5920899573478092e-05,
"loss": 0.921,
"step": 8700
},
{
"epoch": 0.48745360881847893,
"grad_norm": 1.4072484970092773,
"learning_rate": 2.5643937295740322e-05,
"loss": 0.9134,
"step": 8800
},
{
"epoch": 0.49299285437323437,
"grad_norm": 1.5359854698181152,
"learning_rate": 2.5366975018002548e-05,
"loss": 0.9355,
"step": 8900
},
{
"epoch": 0.4985320999279898,
"grad_norm": 1.3707598447799683,
"learning_rate": 2.5090012740264778e-05,
"loss": 0.932,
"step": 9000
},
{
"epoch": 0.5040713454827452,
"grad_norm": 1.4937622547149658,
"learning_rate": 2.4815820085304385e-05,
"loss": 0.929,
"step": 9100
},
{
"epoch": 0.5096105910375007,
"grad_norm": 1.4973763227462769,
"learning_rate": 2.4538857807566608e-05,
"loss": 0.9313,
"step": 9200
},
{
"epoch": 0.5151498365922561,
"grad_norm": 1.3731300830841064,
"learning_rate": 2.4261895529828838e-05,
"loss": 0.8977,
"step": 9300
},
{
"epoch": 0.5206890821470116,
"grad_norm": 1.5620144605636597,
"learning_rate": 2.3984933252091064e-05,
"loss": 0.9255,
"step": 9400
},
{
"epoch": 0.526228327701767,
"grad_norm": 1.4226034879684448,
"learning_rate": 2.3707970974353294e-05,
"loss": 0.8994,
"step": 9500
},
{
"epoch": 0.5317675732565225,
"grad_norm": 1.3993937969207764,
"learning_rate": 2.3431008696615524e-05,
"loss": 0.9212,
"step": 9600
},
{
"epoch": 0.537306818811278,
"grad_norm": 1.4125133752822876,
"learning_rate": 2.315404641887775e-05,
"loss": 0.9072,
"step": 9700
},
{
"epoch": 0.5428460643660333,
"grad_norm": 1.2466013431549072,
"learning_rate": 2.287708414113998e-05,
"loss": 0.9155,
"step": 9800
},
{
"epoch": 0.5483853099207888,
"grad_norm": 1.319427251815796,
"learning_rate": 2.2600121863402206e-05,
"loss": 0.8968,
"step": 9900
},
{
"epoch": 0.5539245554755442,
"grad_norm": 1.5401791334152222,
"learning_rate": 2.2323159585664432e-05,
"loss": 0.9146,
"step": 10000
},
{
"epoch": 0.5594638010302997,
"grad_norm": 1.2928884029388428,
"learning_rate": 2.2046197307926662e-05,
"loss": 0.8996,
"step": 10100
},
{
"epoch": 0.5650030465850551,
"grad_norm": 1.5264612436294556,
"learning_rate": 2.176923503018889e-05,
"loss": 0.8889,
"step": 10200
},
{
"epoch": 0.5705422921398106,
"grad_norm": 1.5760724544525146,
"learning_rate": 2.1492272752451118e-05,
"loss": 0.8978,
"step": 10300
},
{
"epoch": 0.576081537694566,
"grad_norm": 1.414896845817566,
"learning_rate": 2.1215310474713345e-05,
"loss": 0.8989,
"step": 10400
},
{
"epoch": 0.5816207832493214,
"grad_norm": 1.422662377357483,
"learning_rate": 2.0938348196975574e-05,
"loss": 0.8695,
"step": 10500
},
{
"epoch": 0.5871600288040769,
"grad_norm": 1.6162989139556885,
"learning_rate": 2.06613859192378e-05,
"loss": 0.9017,
"step": 10600
},
{
"epoch": 0.5926992743588323,
"grad_norm": 1.537391185760498,
"learning_rate": 2.0384423641500027e-05,
"loss": 0.8946,
"step": 10700
},
{
"epoch": 0.5982385199135878,
"grad_norm": 1.3381567001342773,
"learning_rate": 2.0107461363762257e-05,
"loss": 0.8901,
"step": 10800
},
{
"epoch": 0.6037777654683432,
"grad_norm": 1.3765337467193604,
"learning_rate": 1.9830499086024483e-05,
"loss": 0.8872,
"step": 10900
},
{
"epoch": 0.6093170110230987,
"grad_norm": 1.5019334554672241,
"learning_rate": 1.9553536808286713e-05,
"loss": 0.8688,
"step": 11000
},
{
"epoch": 0.6148562565778541,
"grad_norm": 1.5896552801132202,
"learning_rate": 1.9276574530548943e-05,
"loss": 0.8863,
"step": 11100
},
{
"epoch": 0.6203955021326095,
"grad_norm": 1.481461524963379,
"learning_rate": 1.899961225281117e-05,
"loss": 0.8735,
"step": 11200
},
{
"epoch": 0.625934747687365,
"grad_norm": 1.5073565244674683,
"learning_rate": 1.8722649975073395e-05,
"loss": 0.8929,
"step": 11300
},
{
"epoch": 0.6314739932421204,
"grad_norm": 1.3693833351135254,
"learning_rate": 1.844568769733562e-05,
"loss": 0.8478,
"step": 11400
},
{
"epoch": 0.6370132387968759,
"grad_norm": 1.481180191040039,
"learning_rate": 1.816872541959785e-05,
"loss": 0.8784,
"step": 11500
},
{
"epoch": 0.6425524843516313,
"grad_norm": 1.5158451795578003,
"learning_rate": 1.789176314186008e-05,
"loss": 0.8697,
"step": 11600
},
{
"epoch": 0.6480917299063867,
"grad_norm": 1.4979525804519653,
"learning_rate": 1.7614800864122307e-05,
"loss": 0.9006,
"step": 11700
},
{
"epoch": 0.6536309754611422,
"grad_norm": 1.3702558279037476,
"learning_rate": 1.7337838586384537e-05,
"loss": 0.8745,
"step": 11800
},
{
"epoch": 0.6591702210158976,
"grad_norm": 1.298034429550171,
"learning_rate": 1.7060876308646763e-05,
"loss": 0.8719,
"step": 11900
},
{
"epoch": 0.6647094665706531,
"grad_norm": 1.3086830377578735,
"learning_rate": 1.678668365368637e-05,
"loss": 0.8478,
"step": 12000
},
{
"epoch": 0.6702487121254085,
"grad_norm": 1.4746365547180176,
"learning_rate": 1.6509721375948597e-05,
"loss": 0.8843,
"step": 12100
},
{
"epoch": 0.675787957680164,
"grad_norm": 1.503086805343628,
"learning_rate": 1.6232759098210827e-05,
"loss": 0.8948,
"step": 12200
},
{
"epoch": 0.6813272032349194,
"grad_norm": 1.4060204029083252,
"learning_rate": 1.595579682047305e-05,
"loss": 0.8695,
"step": 12300
},
{
"epoch": 0.6868664487896748,
"grad_norm": 1.5291565656661987,
"learning_rate": 1.568160416551266e-05,
"loss": 0.9056,
"step": 12400
},
{
"epoch": 0.6924056943444303,
"grad_norm": 1.3221312761306763,
"learning_rate": 1.5404641887774887e-05,
"loss": 0.8651,
"step": 12500
},
{
"epoch": 0.6979449398991857,
"grad_norm": 1.5272188186645508,
"learning_rate": 1.5127679610037113e-05,
"loss": 0.8725,
"step": 12600
},
{
"epoch": 0.7034841854539412,
"grad_norm": 1.4335006475448608,
"learning_rate": 1.4850717332299341e-05,
"loss": 0.8663,
"step": 12700
},
{
"epoch": 0.7090234310086966,
"grad_norm": 1.321509838104248,
"learning_rate": 1.457375505456157e-05,
"loss": 0.8565,
"step": 12800
},
{
"epoch": 0.7145626765634521,
"grad_norm": 1.3843624591827393,
"learning_rate": 1.4296792776823797e-05,
"loss": 0.8614,
"step": 12900
},
{
"epoch": 0.7201019221182074,
"grad_norm": 1.3091601133346558,
"learning_rate": 1.4019830499086025e-05,
"loss": 0.8538,
"step": 13000
},
{
"epoch": 0.7256411676729629,
"grad_norm": 1.5815715789794922,
"learning_rate": 1.3742868221348253e-05,
"loss": 0.8624,
"step": 13100
},
{
"epoch": 0.7311804132277184,
"grad_norm": 1.6078437566757202,
"learning_rate": 1.346590594361048e-05,
"loss": 0.8405,
"step": 13200
},
{
"epoch": 0.7367196587824738,
"grad_norm": 1.446047306060791,
"learning_rate": 1.3188943665872708e-05,
"loss": 0.8445,
"step": 13300
},
{
"epoch": 0.7422589043372293,
"grad_norm": 1.207653284072876,
"learning_rate": 1.2911981388134936e-05,
"loss": 0.8431,
"step": 13400
},
{
"epoch": 0.7477981498919847,
"grad_norm": 1.3394980430603027,
"learning_rate": 1.2635019110397164e-05,
"loss": 0.868,
"step": 13500
},
{
"epoch": 0.7533373954467402,
"grad_norm": 1.549267053604126,
"learning_rate": 1.2358056832659392e-05,
"loss": 0.8604,
"step": 13600
},
{
"epoch": 0.7588766410014955,
"grad_norm": 1.3387057781219482,
"learning_rate": 1.2081094554921622e-05,
"loss": 0.8563,
"step": 13700
},
{
"epoch": 0.764415886556251,
"grad_norm": 1.3845041990280151,
"learning_rate": 1.1804132277183848e-05,
"loss": 0.8365,
"step": 13800
},
{
"epoch": 0.7699551321110065,
"grad_norm": 1.4750843048095703,
"learning_rate": 1.1527169999446076e-05,
"loss": 0.8322,
"step": 13900
},
{
"epoch": 0.7754943776657619,
"grad_norm": 1.4781407117843628,
"learning_rate": 1.1250207721708304e-05,
"loss": 0.8696,
"step": 14000
},
{
"epoch": 0.7810336232205174,
"grad_norm": 1.5403274297714233,
"learning_rate": 1.0973245443970532e-05,
"loss": 0.8381,
"step": 14100
},
{
"epoch": 0.7865728687752728,
"grad_norm": 1.4403986930847168,
"learning_rate": 1.069628316623276e-05,
"loss": 0.8341,
"step": 14200
},
{
"epoch": 0.7921121143300283,
"grad_norm": 1.509552001953125,
"learning_rate": 1.0419320888494988e-05,
"loss": 0.8689,
"step": 14300
},
{
"epoch": 0.7976513598847836,
"grad_norm": 1.567030668258667,
"learning_rate": 1.0142358610757214e-05,
"loss": 0.8496,
"step": 14400
},
{
"epoch": 0.8031906054395391,
"grad_norm": 1.4610817432403564,
"learning_rate": 9.865396333019443e-06,
"loss": 0.8694,
"step": 14500
},
{
"epoch": 0.8087298509942946,
"grad_norm": 1.4665472507476807,
"learning_rate": 9.58843405528167e-06,
"loss": 0.843,
"step": 14600
},
{
"epoch": 0.81426909654905,
"grad_norm": 1.45100998878479,
"learning_rate": 9.3114717775439e-06,
"loss": 0.8529,
"step": 14700
},
{
"epoch": 0.8198083421038055,
"grad_norm": 1.4459706544876099,
"learning_rate": 9.034509499806127e-06,
"loss": 0.8251,
"step": 14800
},
{
"epoch": 0.8253475876585609,
"grad_norm": 1.408592700958252,
"learning_rate": 8.757547222068355e-06,
"loss": 0.858,
"step": 14900
},
{
"epoch": 0.8308868332133164,
"grad_norm": 1.4576612710952759,
"learning_rate": 8.480584944330583e-06,
"loss": 0.8294,
"step": 15000
},
{
"epoch": 0.8364260787680717,
"grad_norm": 1.4453418254852295,
"learning_rate": 8.203622666592809e-06,
"loss": 0.8421,
"step": 15100
},
{
"epoch": 0.8419653243228272,
"grad_norm": 1.4345848560333252,
"learning_rate": 7.926660388855039e-06,
"loss": 0.8338,
"step": 15200
},
{
"epoch": 0.8475045698775827,
"grad_norm": 1.4117823839187622,
"learning_rate": 7.649698111117267e-06,
"loss": 0.8494,
"step": 15300
},
{
"epoch": 0.8530438154323381,
"grad_norm": 1.7380582094192505,
"learning_rate": 7.372735833379495e-06,
"loss": 0.831,
"step": 15400
},
{
"epoch": 0.8585830609870936,
"grad_norm": 1.372897744178772,
"learning_rate": 7.095773555641721e-06,
"loss": 0.8201,
"step": 15500
},
{
"epoch": 0.864122306541849,
"grad_norm": 1.482770562171936,
"learning_rate": 6.81881127790395e-06,
"loss": 0.8221,
"step": 15600
},
{
"epoch": 0.8696615520966045,
"grad_norm": 1.394877314567566,
"learning_rate": 6.541849000166178e-06,
"loss": 0.8441,
"step": 15700
},
{
"epoch": 0.8752007976513598,
"grad_norm": 1.4818041324615479,
"learning_rate": 6.264886722428405e-06,
"loss": 0.843,
"step": 15800
},
{
"epoch": 0.8807400432061153,
"grad_norm": 1.8270115852355957,
"learning_rate": 5.9879244446906334e-06,
"loss": 0.851,
"step": 15900
},
{
"epoch": 0.8862792887608708,
"grad_norm": 1.4031203985214233,
"learning_rate": 5.7109621669528615e-06,
"loss": 0.8515,
"step": 16000
},
{
"epoch": 0.8918185343156262,
"grad_norm": 1.5000931024551392,
"learning_rate": 5.4339998892150895e-06,
"loss": 0.819,
"step": 16100
},
{
"epoch": 0.8973577798703817,
"grad_norm": 1.4219608306884766,
"learning_rate": 5.1570376114773176e-06,
"loss": 0.8528,
"step": 16200
},
{
"epoch": 0.9028970254251371,
"grad_norm": 1.6713409423828125,
"learning_rate": 4.880075333739545e-06,
"loss": 0.8253,
"step": 16300
},
{
"epoch": 0.9084362709798925,
"grad_norm": 1.642972707748413,
"learning_rate": 4.603113056001773e-06,
"loss": 0.8199,
"step": 16400
},
{
"epoch": 0.913975516534648,
"grad_norm": 1.4362276792526245,
"learning_rate": 4.328920401041378e-06,
"loss": 0.8186,
"step": 16500
},
{
"epoch": 0.9195147620894034,
"grad_norm": 1.387813925743103,
"learning_rate": 4.051958123303606e-06,
"loss": 0.8447,
"step": 16600
},
{
"epoch": 0.9250540076441589,
"grad_norm": 1.7197731733322144,
"learning_rate": 3.774995845565834e-06,
"loss": 0.8416,
"step": 16700
},
{
"epoch": 0.9305932531989143,
"grad_norm": 1.4148505926132202,
"learning_rate": 3.498033567828062e-06,
"loss": 0.8388,
"step": 16800
},
{
"epoch": 0.9361324987536698,
"grad_norm": 1.5057902336120605,
"learning_rate": 3.2210712900902897e-06,
"loss": 0.8141,
"step": 16900
},
{
"epoch": 0.9416717443084252,
"grad_norm": 1.6533297300338745,
"learning_rate": 2.9441090123525177e-06,
"loss": 0.8131,
"step": 17000
},
{
"epoch": 0.9472109898631806,
"grad_norm": 1.550384759902954,
"learning_rate": 2.6671467346147458e-06,
"loss": 0.8474,
"step": 17100
},
{
"epoch": 0.9527502354179361,
"grad_norm": 1.3930208683013916,
"learning_rate": 2.3901844568769734e-06,
"loss": 0.8252,
"step": 17200
},
{
"epoch": 0.9582894809726915,
"grad_norm": 1.534490704536438,
"learning_rate": 2.1132221791392014e-06,
"loss": 0.8333,
"step": 17300
},
{
"epoch": 0.963828726527447,
"grad_norm": 1.496036171913147,
"learning_rate": 1.836259901401429e-06,
"loss": 0.8452,
"step": 17400
},
{
"epoch": 0.9693679720822024,
"grad_norm": 1.4429802894592285,
"learning_rate": 1.559297623663657e-06,
"loss": 0.8368,
"step": 17500
},
{
"epoch": 0.9749072176369579,
"grad_norm": 1.4180331230163574,
"learning_rate": 1.282335345925885e-06,
"loss": 0.8139,
"step": 17600
},
{
"epoch": 0.9804464631917132,
"grad_norm": 1.423274278640747,
"learning_rate": 1.005373068188113e-06,
"loss": 0.809,
"step": 17700
},
{
"epoch": 0.9859857087464687,
"grad_norm": 1.4893913269042969,
"learning_rate": 7.284107904503407e-07,
"loss": 0.8271,
"step": 17800
},
{
"epoch": 0.9915249543012242,
"grad_norm": 1.5484572649002075,
"learning_rate": 4.5144851271256857e-07,
"loss": 0.8409,
"step": 17900
},
{
"epoch": 0.9970641998559796,
"grad_norm": 1.423147439956665,
"learning_rate": 1.7448623497479645e-07,
"loss": 0.8285,
"step": 18000
}
],
"logging_steps": 100,
"max_steps": 18053,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 18053,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.47361101474431e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}