Files
llama-3-8b-base-kto-ultrafe…/trainer_state.json
ModelHub XC c1cab8bfec 初始化项目,由ModelHub XC社区提供模型
Model: jackf857/llama-3-8b-base-kto-ultrafeedback-8xh200
Source: Original Platform
2026-04-23 23:37:08 +08:00

1548 lines
49 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 200,
"global_step": 955,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0010471204188481676,
"grad_norm": 45.70638656616211,
"kl": 0.03365863859653473,
"learning_rate": 0.0,
"logits/chosen": -133350016.0,
"logits/rejected": -100751848.0,
"logps/chosen": -199.38916015625,
"logps/rejected": -248.57103704637098,
"loss": 1.9996,
"rewards/chosen": -0.006603976993849783,
"rewards/margins": 0.002509254980413562,
"rewards/rejected": -0.009113231974263345,
"step": 1
},
{
"epoch": 0.010471204188481676,
"grad_norm": 45.63622283935547,
"kl": 0.05001102015376091,
"learning_rate": 4.6875e-08,
"logits/chosen": -114836760.0,
"logits/rejected": -115496552.0,
"logps/chosen": -280.943407960199,
"logps/rejected": -255.2594489981785,
"loss": 2.0019,
"rewards/chosen": -0.001783605436385172,
"rewards/margins": -0.0038228675079356074,
"rewards/rejected": 0.0020392620715504353,
"step": 10
},
{
"epoch": 0.020942408376963352,
"grad_norm": 49.609336853027344,
"kl": 0.050704918801784515,
"learning_rate": 9.895833333333332e-08,
"logits/chosen": -125602032.0,
"logits/rejected": -116276880.0,
"logps/chosen": -279.09286115269464,
"logps/rejected": -261.840047998366,
"loss": 1.9992,
"rewards/chosen": 0.0030849476060467564,
"rewards/margins": 0.0017726313628812977,
"rewards/rejected": 0.0013123162431654587,
"step": 20
},
{
"epoch": 0.031413612565445025,
"grad_norm": 45.7076301574707,
"kl": 0.0796850323677063,
"learning_rate": 1.5104166666666664e-07,
"logits/chosen": -117311640.0,
"logits/rejected": -114144304.0,
"logps/chosen": -294.7149727852853,
"logps/rejected": -242.79242162052117,
"loss": 1.9982,
"rewards/chosen": 0.009787982648557372,
"rewards/margins": 0.002934079104880057,
"rewards/rejected": 0.006853903543677315,
"step": 30
},
{
"epoch": 0.041884816753926704,
"grad_norm": 53.927669525146484,
"kl": 0.20030224323272705,
"learning_rate": 2.03125e-07,
"logits/chosen": -119643952.0,
"logits/rejected": -120977776.0,
"logps/chosen": -306.0683379120879,
"logps/rejected": -278.4179383748056,
"loss": 1.9963,
"rewards/chosen": 0.03329542031280661,
"rewards/margins": 0.007960547439639757,
"rewards/rejected": 0.025334872873166856,
"step": 40
},
{
"epoch": 0.05235602094240838,
"grad_norm": 51.90446090698242,
"kl": 0.34236329793930054,
"learning_rate": 2.552083333333333e-07,
"logits/chosen": -119124032.0,
"logits/rejected": -115569560.0,
"logps/chosen": -311.1760096153846,
"logps/rejected": -268.8574900793651,
"loss": 1.9864,
"rewards/chosen": 0.07745902428260217,
"rewards/margins": 0.02584011520367111,
"rewards/rejected": 0.051618909078931054,
"step": 50
},
{
"epoch": 0.06282722513089005,
"grad_norm": 51.83867263793945,
"kl": 0.503300666809082,
"learning_rate": 3.0729166666666665e-07,
"logits/chosen": -114966488.0,
"logits/rejected": -120589360.0,
"logps/chosen": -299.37085962145113,
"logps/rejected": -272.62553212074306,
"loss": 1.9757,
"rewards/chosen": 0.12960234199788667,
"rewards/margins": 0.050474802944834835,
"rewards/rejected": 0.07912753905305184,
"step": 60
},
{
"epoch": 0.07329842931937172,
"grad_norm": 55.578922271728516,
"kl": 0.31499481201171875,
"learning_rate": 3.59375e-07,
"logits/chosen": -111468848.0,
"logits/rejected": -121516712.0,
"logps/chosen": -281.68463625401927,
"logps/rejected": -274.1886635638298,
"loss": 1.9691,
"rewards/chosen": 0.17197007304985806,
"rewards/margins": 0.07172654231262758,
"rewards/rejected": 0.10024353073723048,
"step": 70
},
{
"epoch": 0.08376963350785341,
"grad_norm": 53.44794464111328,
"kl": 0.14020584523677826,
"learning_rate": 4.114583333333333e-07,
"logits/chosen": -124242176.0,
"logits/rejected": -114853336.0,
"logps/chosen": -309.7781393568147,
"logps/rejected": -254.96207137161085,
"loss": 1.919,
"rewards/chosen": 0.2425672171857054,
"rewards/margins": 0.16257205424617438,
"rewards/rejected": 0.079995162939531,
"step": 80
},
{
"epoch": 0.09424083769633508,
"grad_norm": 45.9337043762207,
"kl": 0.010224603116512299,
"learning_rate": 4.6354166666666664e-07,
"logits/chosen": -114247248.0,
"logits/rejected": -121002304.0,
"logps/chosen": -255.63410433070865,
"logps/rejected": -255.94968507751938,
"loss": 1.9103,
"rewards/chosen": 0.2659489669199065,
"rewards/margins": 0.1896964983080636,
"rewards/rejected": 0.0762524686118429,
"step": 90
},
{
"epoch": 0.10471204188481675,
"grad_norm": 47.694000244140625,
"kl": 0.0,
"learning_rate": 4.999849525959245e-07,
"logits/chosen": -116696832.0,
"logits/rejected": -135091520.0,
"logps/chosen": -299.64565284653463,
"logps/rejected": -257.15984328635017,
"loss": 1.8452,
"rewards/chosen": 0.3131070278658725,
"rewards/margins": 0.35410608100594115,
"rewards/rejected": -0.04099905314006862,
"step": 100
},
{
"epoch": 0.11518324607329843,
"grad_norm": 45.92950439453125,
"kl": 0.0,
"learning_rate": 4.997174935782199e-07,
"logits/chosen": -114301056.0,
"logits/rejected": -125180608.0,
"logps/chosen": -289.91893468118195,
"logps/rejected": -249.99958300627944,
"loss": 1.8212,
"rewards/chosen": 0.0881744610206521,
"rewards/margins": 0.3989718005393923,
"rewards/rejected": -0.31079733951874017,
"step": 110
},
{
"epoch": 0.1256544502617801,
"grad_norm": 44.388973236083984,
"kl": 0.0,
"learning_rate": 4.9911605954668e-07,
"logits/chosen": -128287936.0,
"logits/rejected": -123292968.0,
"logps/chosen": -274.05433947772656,
"logps/rejected": -291.2209608505564,
"loss": 1.7655,
"rewards/chosen": 0.06379634663805983,
"rewards/margins": 0.5666303574066409,
"rewards/rejected": -0.502834010768581,
"step": 120
},
{
"epoch": 0.13612565445026178,
"grad_norm": 43.125389099121094,
"kl": 0.0,
"learning_rate": 4.981814548660135e-07,
"logits/chosen": -118625360.0,
"logits/rejected": -142319376.0,
"logps/chosen": -286.44763163349916,
"logps/rejected": -260.8665758862629,
"loss": 1.688,
"rewards/chosen": 0.16187965810595462,
"rewards/margins": 0.7612699556618077,
"rewards/rejected": -0.599390297555853,
"step": 130
},
{
"epoch": 0.14659685863874344,
"grad_norm": 43.5549201965332,
"kl": 0.0,
"learning_rate": 4.969149294871417e-07,
"logits/chosen": -134174704.0,
"logits/rejected": -129014448.0,
"logps/chosen": -270.30858126996804,
"logps/rejected": -286.30975248470946,
"loss": 1.6923,
"rewards/chosen": -0.13654317642553165,
"rewards/margins": 0.839808220327622,
"rewards/rejected": -0.9763513967531536,
"step": 140
},
{
"epoch": 0.15706806282722513,
"grad_norm": 38.105342864990234,
"kl": 0.0,
"learning_rate": 4.953181772754997e-07,
"logits/chosen": -140920352.0,
"logits/rejected": -132664600.0,
"logps/chosen": -272.86307251908397,
"logps/rejected": -266.89385,
"loss": 1.6369,
"rewards/chosen": -0.0188656086230096,
"rewards/margins": 1.1427980632519903,
"rewards/rejected": -1.161663671875,
"step": 150
},
{
"epoch": 0.16753926701570682,
"grad_norm": 38.238868713378906,
"kl": 0.0,
"learning_rate": 4.93393333745642e-07,
"logits/chosen": -130763216.0,
"logits/rejected": -132181120.0,
"logps/chosen": -267.7964184253247,
"logps/rejected": -265.75249435240966,
"loss": 1.6156,
"rewards/chosen": 0.03252483962418197,
"rewards/margins": 1.1381634585530223,
"rewards/rejected": -1.1056386189288403,
"step": 160
},
{
"epoch": 0.17801047120418848,
"grad_norm": 149.7418212890625,
"kl": 0.0,
"learning_rate": 4.9114297320518e-07,
"logits/chosen": -148455008.0,
"logits/rejected": -148213216.0,
"logps/chosen": -292.17505877742946,
"logps/rejected": -289.26684190031153,
"loss": 1.6163,
"rewards/chosen": -0.49163005195067594,
"rewards/margins": 1.4294290544185801,
"rewards/rejected": -1.9210591063692561,
"step": 170
},
{
"epoch": 0.18848167539267016,
"grad_norm": 43.059410095214844,
"kl": 0.0,
"learning_rate": 4.885701053118751e-07,
"logits/chosen": -147940400.0,
"logits/rejected": -144332512.0,
"logps/chosen": -282.0602057573416,
"logps/rejected": -282.34577409162716,
"loss": 1.5876,
"rewards/chosen": 0.034519776310397446,
"rewards/margins": 1.3361885912602889,
"rewards/rejected": -1.3016688149498914,
"step": 180
},
{
"epoch": 0.19895287958115182,
"grad_norm": 40.49046325683594,
"kl": 0.0,
"learning_rate": 4.856781710484872e-07,
"logits/chosen": -139495344.0,
"logits/rejected": -144202096.0,
"logps/chosen": -280.0632974481659,
"logps/rejected": -290.6424196018377,
"loss": 1.5551,
"rewards/chosen": 0.12813351523172722,
"rewards/margins": 1.4480502681389824,
"rewards/rejected": -1.319916752907255,
"step": 190
},
{
"epoch": 0.2094240837696335,
"grad_norm": 40.46213912963867,
"kl": 0.0,
"learning_rate": 4.824710381207655e-07,
"logits/chosen": -144665424.0,
"logits/rejected": -151805984.0,
"logps/chosen": -291.33322447749197,
"logps/rejected": -285.26674107142856,
"loss": 1.5841,
"rewards/chosen": 0.006562507420874102,
"rewards/margins": 1.4862394700443544,
"rewards/rejected": -1.4796769626234803,
"step": 200
},
{
"epoch": 0.2094240837696335,
"eval_kl": 0.0,
"eval_logits/chosen": -151004736.0,
"eval_logits/rejected": -149476768.0,
"eval_logps/chosen": -289.55475,
"eval_logps/rejected": -284.09784375,
"eval_loss": 0.39710375666618347,
"eval_rewards/chosen": -0.16988844299316405,
"eval_rewards/margins": 1.544695541381836,
"eval_rewards/rejected": -1.714583984375,
"eval_runtime": 92.5994,
"eval_samples_per_second": 43.197,
"eval_steps_per_second": 1.35,
"step": 200
},
{
"epoch": 0.2198952879581152,
"grad_norm": 36.308265686035156,
"kl": 0.0,
"learning_rate": 4.789529957847353e-07,
"logits/chosen": -152687040.0,
"logits/rejected": -144599248.0,
"logps/chosen": -300.46617366412215,
"logps/rejected": -276.6361,
"loss": 1.5542,
"rewards/chosen": -0.020091176215018935,
"rewards/margins": 1.684735972222481,
"rewards/rejected": -1.7048271484375,
"step": 210
},
{
"epoch": 0.23036649214659685,
"grad_norm": 42.04066467285156,
"kl": 0.0,
"learning_rate": 4.751287491101977e-07,
"logits/chosen": -153300528.0,
"logits/rejected": -141240576.0,
"logps/chosen": -284.2961228649068,
"logps/rejected": -269.83765723270443,
"loss": 1.626,
"rewards/chosen": -0.18326281908876407,
"rewards/margins": 1.4757443193374349,
"rewards/rejected": -1.6590071384261988,
"step": 220
},
{
"epoch": 0.24083769633507854,
"grad_norm": 36.196807861328125,
"kl": 0.0,
"learning_rate": 4.710034126881159e-07,
"logits/chosen": -159471936.0,
"logits/rejected": -135821408.0,
"logps/chosen": -301.3033342430859,
"logps/rejected": -293.3502055227656,
"loss": 1.5672,
"rewards/chosen": 0.1034569663945938,
"rewards/margins": 1.6865190209305867,
"rewards/rejected": -1.5830620545359928,
"step": 230
},
{
"epoch": 0.2513089005235602,
"grad_norm": 36.31395721435547,
"kl": 0.0,
"learning_rate": 4.665825037903035e-07,
"logits/chosen": -151562448.0,
"logits/rejected": -142325232.0,
"logps/chosen": -277.93509244992293,
"logps/rejected": -272.8287688193344,
"loss": 1.5525,
"rewards/chosen": 0.3289636691288882,
"rewards/margins": 1.6360129685110691,
"rewards/rejected": -1.307049299382181,
"step": 240
},
{
"epoch": 0.2617801047120419,
"grad_norm": 39.01026916503906,
"kl": 0.0,
"learning_rate": 4.618719349905619e-07,
"logits/chosen": -156142944.0,
"logits/rejected": -141863280.0,
"logps/chosen": -294.4734971374046,
"logps/rejected": -273.881775,
"loss": 1.525,
"rewards/chosen": 0.3023297957791627,
"rewards/margins": 1.7884680770291626,
"rewards/rejected": -1.48613828125,
"step": 250
},
{
"epoch": 0.27225130890052357,
"grad_norm": 40.547950744628906,
"kl": 0.0,
"learning_rate": 4.568780062571374e-07,
"logits/chosen": -152771744.0,
"logits/rejected": -151975136.0,
"logps/chosen": -278.8975861378205,
"logps/rejected": -288.9916634908537,
"loss": 1.5231,
"rewards/chosen": 0.07649397850036621,
"rewards/margins": 1.9374570090596268,
"rewards/rejected": -1.8609630305592606,
"step": 260
},
{
"epoch": 0.28272251308900526,
"grad_norm": 38.78754806518555,
"kl": 0.0,
"learning_rate": 4.516073965270717e-07,
"logits/chosen": -147246848.0,
"logits/rejected": -140734832.0,
"logps/chosen": -275.6838321596244,
"logps/rejected": -295.4493467238689,
"loss": 1.5248,
"rewards/chosen": 0.03273308743520149,
"rewards/margins": 1.9670623063070813,
"rewards/rejected": -1.93432921887188,
"step": 270
},
{
"epoch": 0.2931937172774869,
"grad_norm": 38.30440902709961,
"kl": 0.0,
"learning_rate": 4.460671547737158e-07,
"logits/chosen": -139242080.0,
"logits/rejected": -144891984.0,
"logps/chosen": -307.5467996382637,
"logps/rejected": -275.4886018237082,
"loss": 1.5128,
"rewards/chosen": -0.2156752313448302,
"rewards/margins": 1.9168140977062718,
"rewards/rejected": -2.132489329051102,
"step": 280
},
{
"epoch": 0.3036649214659686,
"grad_norm": 52.507747650146484,
"kl": 0.0,
"learning_rate": 4.40264690579353e-07,
"logits/chosen": -153254432.0,
"logits/rejected": -148096032.0,
"logps/chosen": -296.4355230564024,
"logps/rejected": -277.0494791666667,
"loss": 1.5223,
"rewards/chosen": -0.014792419061428162,
"rewards/margins": 2.2441466881976866,
"rewards/rejected": -2.2589391072591147,
"step": 290
},
{
"epoch": 0.31413612565445026,
"grad_norm": 42.4326286315918,
"kl": 0.0,
"learning_rate": 4.3420776422553916e-07,
"logits/chosen": -146678224.0,
"logits/rejected": -144266464.0,
"logps/chosen": -289.21781823394497,
"logps/rejected": -280.0624001597444,
"loss": 1.5308,
"rewards/chosen": 0.00835518734899865,
"rewards/margins": 2.1249945409457442,
"rewards/rejected": -2.1166393535967454,
"step": 300
},
{
"epoch": 0.32460732984293195,
"grad_norm": 48.941036224365234,
"kl": 0.0,
"learning_rate": 4.279044763144141e-07,
"logits/chosen": -139984352.0,
"logits/rejected": -148744560.0,
"logps/chosen": -271.40555111821084,
"logps/rejected": -308.63379204892965,
"loss": 1.5119,
"rewards/chosen": -0.12764440176966854,
"rewards/margins": 2.093687366674521,
"rewards/rejected": -2.22133176844419,
"step": 310
},
{
"epoch": 0.33507853403141363,
"grad_norm": 30.565563201904297,
"kl": 0.0,
"learning_rate": 4.213632569348639e-07,
"logits/chosen": -160167840.0,
"logits/rejected": -138572496.0,
"logps/chosen": -290.84521412884334,
"logps/rejected": -290.7674309045226,
"loss": 1.5273,
"rewards/chosen": -0.013026183032012091,
"rewards/margins": 2.366796105493951,
"rewards/rejected": -2.379822288525963,
"step": 320
},
{
"epoch": 0.34554973821989526,
"grad_norm": 37.133567810058594,
"kl": 0.0,
"learning_rate": 4.145928543880249e-07,
"logits/chosen": -143369248.0,
"logits/rejected": -145142368.0,
"logps/chosen": -288.8722847551343,
"logps/rejected": -284.5384949768161,
"loss": 1.4706,
"rewards/chosen": 0.3644241646379468,
"rewards/margins": 2.3581714100373867,
"rewards/rejected": -1.9937472453994398,
"step": 330
},
{
"epoch": 0.35602094240837695,
"grad_norm": 42.86240005493164,
"kl": 0.0,
"learning_rate": 4.076023234872057e-07,
"logits/chosen": -134226816.0,
"logits/rejected": -150647472.0,
"logps/chosen": -290.2114297253635,
"logps/rejected": -284.6018816187595,
"loss": 1.4998,
"rewards/chosen": 0.00031860425514627927,
"rewards/margins": 2.320413741535309,
"rewards/rejected": -2.3200951372801626,
"step": 340
},
{
"epoch": 0.36649214659685864,
"grad_norm": 43.323272705078125,
"kl": 0.0,
"learning_rate": 4.004010134478771e-07,
"logits/chosen": -153940128.0,
"logits/rejected": -142368416.0,
"logps/chosen": -277.28628954475306,
"logps/rejected": -281.2114319620253,
"loss": 1.4997,
"rewards/chosen": 0.20580647315508055,
"rewards/margins": 2.183485568845844,
"rewards/rejected": -1.9776790956907635,
"step": 350
},
{
"epoch": 0.3769633507853403,
"grad_norm": 46.848731994628906,
"kl": 0.0,
"learning_rate": 3.9299855538392534e-07,
"logits/chosen": -146128720.0,
"logits/rejected": -144031184.0,
"logps/chosen": -288.1243632445141,
"logps/rejected": -282.6408197040498,
"loss": 1.4687,
"rewards/chosen": 0.38341842699200385,
"rewards/margins": 2.226742892164317,
"rewards/rejected": -1.843324465172313,
"step": 360
},
{
"epoch": 0.387434554973822,
"grad_norm": 41.83399963378906,
"kl": 0.0,
"learning_rate": 3.8540484942689075e-07,
"logits/chosen": -145012880.0,
"logits/rejected": -147854064.0,
"logps/chosen": -280.8901771496815,
"logps/rejected": -297.4892398389571,
"loss": 1.4964,
"rewards/chosen": 0.17396898937832778,
"rewards/margins": 2.185319656196771,
"rewards/rejected": -2.0113506668184433,
"step": 370
},
{
"epoch": 0.39790575916230364,
"grad_norm": 41.95068359375,
"kl": 0.0,
"learning_rate": 3.77630051485419e-07,
"logits/chosen": -158335904.0,
"logits/rejected": -135963312.0,
"logps/chosen": -299.00375645280235,
"logps/rejected": -290.30250726744185,
"loss": 1.5113,
"rewards/chosen": -0.136355295997102,
"rewards/margins": 2.5501342781437826,
"rewards/rejected": -2.6864895741408845,
"step": 380
},
{
"epoch": 0.4083769633507853,
"grad_norm": 52.55752944946289,
"kl": 0.0,
"learning_rate": 3.696845596626342e-07,
"logits/chosen": -136675232.0,
"logits/rejected": -139554592.0,
"logps/chosen": -267.1172371031746,
"logps/rejected": -287.2497836538462,
"loss": 1.4728,
"rewards/chosen": 0.29143521747891865,
"rewards/margins": 2.468798123127957,
"rewards/rejected": -2.1773629056490384,
"step": 390
},
{
"epoch": 0.418848167539267,
"grad_norm": 42.524559020996094,
"kl": 0.0,
"learning_rate": 3.61579000349597e-07,
"logits/chosen": -145115344.0,
"logits/rejected": -138730784.0,
"logps/chosen": -291.6719227828746,
"logps/rejected": -286.7621056309904,
"loss": 1.404,
"rewards/chosen": 0.3901278061239727,
"rewards/margins": 2.5747456987457777,
"rewards/rejected": -2.184617892621805,
"step": 400
},
{
"epoch": 0.418848167539267,
"eval_kl": 0.0,
"eval_logits/chosen": -143785152.0,
"eval_logits/rejected": -142386976.0,
"eval_logps/chosen": -288.1983125,
"eval_logps/rejected": -290.82553125,
"eval_loss": 0.3773096799850464,
"eval_rewards/chosen": -0.03424349975585937,
"eval_rewards/margins": 2.3531070861816406,
"eval_rewards/rejected": -2.3873505859375,
"eval_runtime": 92.585,
"eval_samples_per_second": 43.204,
"eval_steps_per_second": 1.35,
"step": 400
},
{
"epoch": 0.4293193717277487,
"grad_norm": 42.05149459838867,
"kl": 0.0,
"learning_rate": 3.5332421401344837e-07,
"logits/chosen": -123967896.0,
"logits/rejected": -151420832.0,
"logps/chosen": -290.8375683922559,
"logps/rejected": -284.5394497084548,
"loss": 1.5018,
"rewards/chosen": -0.1862513866488781,
"rewards/margins": 2.4929647368820076,
"rewards/rejected": -2.679216123530886,
"step": 410
},
{
"epoch": 0.4397905759162304,
"grad_norm": 45.76744842529297,
"kl": 0.0,
"learning_rate": 3.4493124069924635e-07,
"logits/chosen": -141672128.0,
"logits/rejected": -143752144.0,
"logps/chosen": -296.493825,
"logps/rejected": -276.59630248091605,
"loss": 1.489,
"rewards/chosen": -0.059238671875,
"rewards/margins": 2.682879686009065,
"rewards/rejected": -2.742118357884065,
"step": 420
},
{
"epoch": 0.450261780104712,
"grad_norm": 48.224552154541016,
"kl": 0.0,
"learning_rate": 3.3641130526488335e-07,
"logits/chosen": -128212800.0,
"logits/rejected": -138041008.0,
"logps/chosen": -262.0190345368917,
"logps/rejected": -302.6821539657854,
"loss": 1.5488,
"rewards/chosen": 0.1527433649898511,
"rewards/margins": 2.138388427805753,
"rewards/rejected": -1.985645062815902,
"step": 430
},
{
"epoch": 0.4607329842931937,
"grad_norm": 38.78076171875,
"kl": 0.0,
"learning_rate": 3.2777580236883473e-07,
"logits/chosen": -137593360.0,
"logits/rejected": -143156048.0,
"logps/chosen": -262.82073682108626,
"logps/rejected": -282.15691896024464,
"loss": 1.4863,
"rewards/chosen": 0.48399031276520066,
"rewards/margins": 2.222597083598534,
"rewards/rejected": -1.7386067708333333,
"step": 440
},
{
"epoch": 0.4712041884816754,
"grad_norm": 32.755828857421875,
"kl": 0.0,
"learning_rate": 3.1903628123081196e-07,
"logits/chosen": -145392688.0,
"logits/rejected": -133878232.0,
"logps/chosen": -279.9309734083851,
"logps/rejected": -278.34424135220127,
"loss": 1.4144,
"rewards/chosen": 0.4770013560419497,
"rewards/margins": 2.7351111625106013,
"rewards/rejected": -2.2581098064686516,
"step": 450
},
{
"epoch": 0.4816753926701571,
"grad_norm": 42.82301330566406,
"kl": 0.0,
"learning_rate": 3.1020443018570556e-07,
"logits/chosen": -127823832.0,
"logits/rejected": -148714048.0,
"logps/chosen": -277.72029728084414,
"logps/rejected": -274.04221573795184,
"loss": 1.4946,
"rewards/chosen": 0.2742920664997844,
"rewards/margins": 2.2432816111039826,
"rewards/rejected": -1.968989544604198,
"step": 460
},
{
"epoch": 0.49214659685863876,
"grad_norm": 35.25486755371094,
"kl": 0.0,
"learning_rate": 3.0129206105147343e-07,
"logits/chosen": -128224592.0,
"logits/rejected": -141323744.0,
"logps/chosen": -297.7064896003263,
"logps/rejected": -277.0822245127436,
"loss": 1.5014,
"rewards/chosen": -0.1534212246221197,
"rewards/margins": 2.4885884590126253,
"rewards/rejected": -2.642009683634745,
"step": 470
},
{
"epoch": 0.5026178010471204,
"grad_norm": 49.871315002441406,
"kl": 0.0,
"learning_rate": 2.923110933318805e-07,
"logits/chosen": -138666448.0,
"logits/rejected": -125876032.0,
"logps/chosen": -282.11912313432833,
"logps/rejected": -271.0550204918033,
"loss": 1.5235,
"rewards/chosen": -0.26497638759328357,
"rewards/margins": 2.5066780853831507,
"rewards/rejected": -2.7716544729764343,
"step": 480
},
{
"epoch": 0.5130890052356021,
"grad_norm": 39.03130340576172,
"kl": 0.0,
"learning_rate": 2.832735382752194e-07,
"logits/chosen": -144244752.0,
"logits/rejected": -139237664.0,
"logps/chosen": -280.307546898928,
"logps/rejected": -291.85860247208933,
"loss": 1.5082,
"rewards/chosen": -0.07937168746410796,
"rewards/margins": 2.5424585728902582,
"rewards/rejected": -2.621830260354366,
"step": 490
},
{
"epoch": 0.5235602094240838,
"grad_norm": 31.400175094604492,
"kl": 0.0,
"learning_rate": 2.741914828103307e-07,
"logits/chosen": -134795200.0,
"logits/rejected": -140993584.0,
"logps/chosen": -274.8821624803768,
"logps/rejected": -273.449115474339,
"loss": 1.4409,
"rewards/chosen": 0.3371262003900118,
"rewards/margins": 2.429343961677434,
"rewards/rejected": -2.092217761287422,
"step": 500
},
{
"epoch": 0.5340314136125655,
"grad_norm": 39.398651123046875,
"kl": 0.0,
"learning_rate": 2.650770733814065e-07,
"logits/chosen": -139524336.0,
"logits/rejected": -137561184.0,
"logps/chosen": -281.1636513157895,
"logps/rejected": -272.26749138591117,
"loss": 1.4701,
"rewards/chosen": 0.545446806547174,
"rewards/margins": 2.443065145583353,
"rewards/rejected": -1.8976183390361792,
"step": 510
},
{
"epoch": 0.5445026178010471,
"grad_norm": 40.88848114013672,
"kl": 0.0,
"learning_rate": 2.55942499703198e-07,
"logits/chosen": -147061424.0,
"logits/rejected": -143406240.0,
"logps/chosen": -285.4889,
"logps/rejected": -274.23685591603055,
"loss": 1.4519,
"rewards/chosen": 0.4090586181640625,
"rewards/margins": 2.4761641167327646,
"rewards/rejected": -2.067105498568702,
"step": 520
},
{
"epoch": 0.5549738219895288,
"grad_norm": 41.25908660888672,
"kl": 0.0,
"learning_rate": 2.467999784583527e-07,
"logits/chosen": -131054160.0,
"logits/rejected": -139775840.0,
"logps/chosen": -270.7588608226837,
"logps/rejected": -278.70869170489294,
"loss": 1.4667,
"rewards/chosen": 0.1376024556997866,
"rewards/margins": 2.5865234959926955,
"rewards/rejected": -2.448921040292909,
"step": 530
},
{
"epoch": 0.5654450261780105,
"grad_norm": 58.745155334472656,
"kl": 0.0,
"learning_rate": 2.3766173695868388e-07,
"logits/chosen": -139035088.0,
"logits/rejected": -133750928.0,
"logps/chosen": -290.9726024119449,
"logps/rejected": -290.5950209330144,
"loss": 1.5503,
"rewards/chosen": -0.12228842205146918,
"rewards/margins": 2.3166335900099346,
"rewards/rejected": -2.4389220120614037,
"step": 540
},
{
"epoch": 0.5759162303664922,
"grad_norm": 66.44160461425781,
"kl": 0.0,
"learning_rate": 2.285399967922253e-07,
"logits/chosen": -140837504.0,
"logits/rejected": -148332576.0,
"logps/chosen": -269.52528454472844,
"logps/rejected": -282.80800840978594,
"loss": 1.4314,
"rewards/chosen": -0.13683468998430637,
"rewards/margins": 2.8514840770383425,
"rewards/rejected": -2.988318767022649,
"step": 550
},
{
"epoch": 0.5863874345549738,
"grad_norm": 34.65999221801758,
"kl": 0.0,
"learning_rate": 2.194469574779397e-07,
"logits/chosen": -155893536.0,
"logits/rejected": -136393088.0,
"logps/chosen": -289.53893209408193,
"logps/rejected": -284.4916465378422,
"loss": 1.4864,
"rewards/chosen": -0.09103836383732751,
"rewards/margins": 2.7043548873452408,
"rewards/rejected": -2.7953932511825683,
"step": 560
},
{
"epoch": 0.5968586387434555,
"grad_norm": 50.38192367553711,
"kl": 0.0,
"learning_rate": 2.1039478014994441e-07,
"logits/chosen": -140968768.0,
"logits/rejected": -132994816.0,
"logps/chosen": -269.5297433903577,
"logps/rejected": -288.17197311616957,
"loss": 1.4443,
"rewards/chosen": 0.11533069758912082,
"rewards/margins": 2.7949352204379236,
"rewards/rejected": -2.679604522848803,
"step": 570
},
{
"epoch": 0.6073298429319371,
"grad_norm": 39.10985565185547,
"kl": 0.0,
"learning_rate": 2.0139557129307149e-07,
"logits/chosen": -141384624.0,
"logits/rejected": -141585264.0,
"logps/chosen": -298.5900179140127,
"logps/rejected": -308.7128067484663,
"loss": 1.4304,
"rewards/chosen": -0.01027871393094397,
"rewards/margins": 2.8328863970257276,
"rewards/rejected": -2.843165110956672,
"step": 580
},
{
"epoch": 0.6178010471204188,
"grad_norm": 48.67289733886719,
"kl": 0.0,
"learning_rate": 1.9246136655151808e-07,
"logits/chosen": -145905728.0,
"logits/rejected": -138221376.0,
"logps/chosen": -293.7313226744186,
"logps/rejected": -306.55437992125985,
"loss": 1.4509,
"rewards/chosen": -0.13455553868020229,
"rewards/margins": 2.8649036994472583,
"rewards/rejected": -2.9994592381274607,
"step": 590
},
{
"epoch": 0.6282722513089005,
"grad_norm": 68.55772399902344,
"kl": 0.0,
"learning_rate": 1.8360411463223873e-07,
"logits/chosen": -136852608.0,
"logits/rejected": -143516144.0,
"logps/chosen": -284.4403070349762,
"logps/rejected": -294.88001632104454,
"loss": 1.4253,
"rewards/chosen": -0.10330413672806538,
"rewards/margins": 2.9733640825379677,
"rewards/rejected": -3.076668219266033,
"step": 600
},
{
"epoch": 0.6282722513089005,
"eval_kl": 0.0,
"eval_logits/chosen": -145117536.0,
"eval_logits/rejected": -143700400.0,
"eval_logps/chosen": -291.06696875,
"eval_logps/rejected": -298.3589375,
"eval_loss": 0.36837950348854065,
"eval_rewards/chosen": -0.32111080932617186,
"eval_rewards/margins": 2.819581085205078,
"eval_rewards/rejected": -3.14069189453125,
"eval_runtime": 92.5853,
"eval_samples_per_second": 43.203,
"eval_steps_per_second": 1.35,
"step": 600
},
{
"epoch": 0.6387434554973822,
"grad_norm": 46.867210388183594,
"kl": 0.0,
"learning_rate": 1.7483566132460865e-07,
"logits/chosen": -136255600.0,
"logits/rejected": -144252464.0,
"logps/chosen": -299.9070411392405,
"logps/rejected": -282.2795138888889,
"loss": 1.4918,
"rewards/chosen": -0.4866646392435967,
"rewards/margins": 2.6991346651063264,
"rewards/rejected": -3.185799304349923,
"step": 610
},
{
"epoch": 0.6492146596858639,
"grad_norm": 66.88858032226562,
"kl": 0.0,
"learning_rate": 1.66167733657731e-07,
"logits/chosen": -140277344.0,
"logits/rejected": -142182944.0,
"logps/chosen": -301.9623953349282,
"logps/rejected": -294.2529192189893,
"loss": 1.4825,
"rewards/chosen": -0.35411040140301037,
"rewards/margins": 2.692060965126469,
"rewards/rejected": -3.0461713665294794,
"step": 620
},
{
"epoch": 0.6596858638743456,
"grad_norm": 62.48853302001953,
"kl": 0.0,
"learning_rate": 1.5761192421657456e-07,
"logits/chosen": -133893392.0,
"logits/rejected": -143692496.0,
"logps/chosen": -292.95065395367413,
"logps/rejected": -299.167311735474,
"loss": 1.4055,
"rewards/chosen": 0.11951812159139127,
"rewards/margins": 2.9790377767782226,
"rewards/rejected": -2.859519655186831,
"step": 630
},
{
"epoch": 0.6701570680628273,
"grad_norm": 38.459293365478516,
"kl": 0.0,
"learning_rate": 1.491796756379185e-07,
"logits/chosen": -148631472.0,
"logits/rejected": -137124976.0,
"logps/chosen": -307.80620335820896,
"logps/rejected": -284.41946721311473,
"loss": 1.4798,
"rewards/chosen": 0.12505948650303172,
"rewards/margins": 2.7576216423047737,
"rewards/rejected": -2.632562155801742,
"step": 640
},
{
"epoch": 0.680628272251309,
"grad_norm": 51.62284469604492,
"kl": 0.0,
"learning_rate": 1.4088226530684071e-07,
"logits/chosen": -145016352.0,
"logits/rejected": -135913600.0,
"logps/chosen": -293.0742607526882,
"logps/rejected": -286.65267289348174,
"loss": 1.4106,
"rewards/chosen": 0.37428661059307794,
"rewards/margins": 2.8034505085213373,
"rewards/rejected": -2.4291638979282593,
"step": 650
},
{
"epoch": 0.6910994764397905,
"grad_norm": 44.217506408691406,
"kl": 0.0,
"learning_rate": 1.327307902742142e-07,
"logits/chosen": -153775056.0,
"logits/rejected": -142987488.0,
"logps/chosen": -278.56211538461537,
"logps/rejected": -295.2840277777778,
"loss": 1.4113,
"rewards/chosen": 0.3199942486102764,
"rewards/margins": 3.0220053085681133,
"rewards/rejected": -2.702011059957837,
"step": 660
},
{
"epoch": 0.7015706806282722,
"grad_norm": 52.56444549560547,
"kl": 0.0,
"learning_rate": 1.2473615241538523e-07,
"logits/chosen": -138428624.0,
"logits/rejected": -125599760.0,
"logps/chosen": -270.62024962742174,
"logps/rejected": -297.9065578817734,
"loss": 1.5102,
"rewards/chosen": 0.21654559389844916,
"rewards/margins": 2.3964325355420244,
"rewards/rejected": -2.1798869416435753,
"step": 670
},
{
"epoch": 0.7120418848167539,
"grad_norm": 38.48976516723633,
"kl": 0.0,
"learning_rate": 1.169090438498816e-07,
"logits/chosen": -140096608.0,
"logits/rejected": -141314656.0,
"logps/chosen": -289.16740023474176,
"logps/rejected": -289.05847796411854,
"loss": 1.4,
"rewards/chosen": 0.4871681464110182,
"rewards/margins": 2.8861619137789,
"rewards/rejected": -2.398993767367882,
"step": 680
},
{
"epoch": 0.7225130890052356,
"grad_norm": 56.998104095458984,
"kl": 0.0,
"learning_rate": 1.0925993264165045e-07,
"logits/chosen": -136509200.0,
"logits/rejected": -140580992.0,
"logps/chosen": -284.6041084265176,
"logps/rejected": -296.2347333715596,
"loss": 1.4483,
"rewards/chosen": 0.1843722529304675,
"rewards/margins": 2.723567089094936,
"rewards/rejected": -2.5391948361644685,
"step": 690
},
{
"epoch": 0.7329842931937173,
"grad_norm": 45.59560012817383,
"kl": 0.0,
"learning_rate": 1.0179904879894998e-07,
"logits/chosen": -139792672.0,
"logits/rejected": -133128280.0,
"logps/chosen": -282.0086206896552,
"logps/rejected": -297.9873685747664,
"loss": 1.4197,
"rewards/chosen": 0.06277323516558704,
"rewards/margins": 3.0981276117475574,
"rewards/rejected": -3.0353543765819704,
"step": 700
},
{
"epoch": 0.743455497382199,
"grad_norm": 48.215816497802734,
"kl": 0.0,
"learning_rate": 9.453637059262117e-08,
"logits/chosen": -127794064.0,
"logits/rejected": -130284464.0,
"logps/chosen": -276.90582061068704,
"logps/rejected": -275.3603,
"loss": 1.5249,
"rewards/chosen": -0.14867283478947996,
"rewards/margins": 2.47092736052302,
"rewards/rejected": -2.6196001953125,
"step": 710
},
{
"epoch": 0.7539267015706806,
"grad_norm": 61.68962097167969,
"kl": 0.0,
"learning_rate": 8.748161121103406e-08,
"logits/chosen": -140951328.0,
"logits/rejected": -141405104.0,
"logps/chosen": -288.2717027559055,
"logps/rejected": -306.3112403100775,
"loss": 1.3695,
"rewards/chosen": 0.361008819820374,
"rewards/margins": 3.0993951070660133,
"rewards/rejected": -2.7383862872456395,
"step": 720
},
{
"epoch": 0.7643979057591623,
"grad_norm": 43.57563400268555,
"kl": 0.0,
"learning_rate": 8.064420576955965e-08,
"logits/chosen": -144350032.0,
"logits/rejected": -144956128.0,
"logps/chosen": -289.1480224609375,
"logps/rejected": -297.3230224609375,
"loss": 1.4858,
"rewards/chosen": 0.04629603624343872,
"rewards/margins": 2.6986050724983217,
"rewards/rejected": -2.652309036254883,
"step": 730
},
{
"epoch": 0.774869109947644,
"grad_norm": 47.147090911865234,
"kl": 0.0,
"learning_rate": 7.403329869193922e-08,
"logits/chosen": -135583312.0,
"logits/rejected": -131832256.0,
"logps/chosen": -277.6656105990783,
"logps/rejected": -275.17182531796504,
"loss": 1.3656,
"rewards/chosen": 0.1653434061967466,
"rewards/margins": 3.2219976161281854,
"rewards/rejected": -3.056654209931439,
"step": 740
},
{
"epoch": 0.7853403141361257,
"grad_norm": 44.07842254638672,
"kl": 0.0,
"learning_rate": 6.765773148042858e-08,
"logits/chosen": -143625632.0,
"logits/rejected": -132837352.0,
"logps/chosen": -285.6507056451613,
"logps/rejected": -281.92257054848966,
"loss": 1.4614,
"rewards/chosen": 0.21915514311665946,
"rewards/margins": 2.62252863091972,
"rewards/rejected": -2.4033734878030604,
"step": 750
},
{
"epoch": 0.7958115183246073,
"grad_norm": 37.42493438720703,
"kl": 0.0,
"learning_rate": 6.152603089107139e-08,
"logits/chosen": -136608224.0,
"logits/rejected": -131950376.0,
"logps/chosen": -276.34929128614914,
"logps/rejected": -277.7825792536116,
"loss": 1.4942,
"rewards/chosen": 0.24006759666779634,
"rewards/margins": 2.4807506731374995,
"rewards/rejected": -2.240683076469703,
"step": 760
},
{
"epoch": 0.806282722513089,
"grad_norm": 68.92852020263672,
"kl": 0.0,
"learning_rate": 5.5646397529920175e-08,
"logits/chosen": -132386256.0,
"logits/rejected": -138109456.0,
"logps/chosen": -303.72342011128774,
"logps/rejected": -284.04025057603684,
"loss": 1.3831,
"rewards/chosen": 0.32797061695772056,
"rewards/margins": 2.99956472398057,
"rewards/rejected": -2.6715941070228495,
"step": 770
},
{
"epoch": 0.8167539267015707,
"grad_norm": 36.25162124633789,
"kl": 0.0,
"learning_rate": 5.002669488545111e-08,
"logits/chosen": -126804304.0,
"logits/rejected": -149925328.0,
"logps/chosen": -280.9178725369458,
"logps/rejected": -298.3454033159463,
"loss": 1.4707,
"rewards/chosen": 0.24425755893851345,
"rewards/margins": 2.512562284278491,
"rewards/rejected": -2.2683047253399775,
"step": 780
},
{
"epoch": 0.8272251308900523,
"grad_norm": 45.3673095703125,
"kl": 0.0,
"learning_rate": 4.467443881184646e-08,
"logits/chosen": -137377824.0,
"logits/rejected": -140888416.0,
"logps/chosen": -284.73014937106916,
"logps/rejected": -271.3363742236025,
"loss": 1.4641,
"rewards/chosen": 0.15725456093842127,
"rewards/margins": 2.4592872211917207,
"rewards/rejected": -2.3020326602532997,
"step": 790
},
{
"epoch": 0.837696335078534,
"grad_norm": 44.92776870727539,
"kl": 0.0,
"learning_rate": 3.959678747720488e-08,
"logits/chosen": -147636928.0,
"logits/rejected": -129594688.0,
"logps/chosen": -278.11655092592594,
"logps/rejected": -284.1502840909091,
"loss": 1.4432,
"rewards/chosen": 0.22765783239293982,
"rewards/margins": 2.823709614417733,
"rewards/rejected": -2.5960517820247935,
"step": 800
},
{
"epoch": 0.837696335078534,
"eval_kl": 0.0,
"eval_logits/chosen": -140467840.0,
"eval_logits/rejected": -139209600.0,
"eval_logps/chosen": -286.2336875,
"eval_logps/rejected": -292.39625,
"eval_loss": 0.3657679557800293,
"eval_rewards/chosen": 0.16221832275390624,
"eval_rewards/margins": 2.7066421508789062,
"eval_rewards/rejected": -2.544423828125,
"eval_runtime": 92.5899,
"eval_samples_per_second": 43.201,
"eval_steps_per_second": 1.35,
"step": 800
},
{
"epoch": 0.8481675392670157,
"grad_norm": 64.69525909423828,
"kl": 0.0,
"learning_rate": 3.480053179012654e-08,
"logits/chosen": -129839872.0,
"logits/rejected": -140454848.0,
"logps/chosen": -266.87487579491255,
"logps/rejected": -288.485599078341,
"loss": 1.5392,
"rewards/chosen": -0.023006766702867273,
"rewards/margins": 2.3186599749686,
"rewards/rejected": -2.341666741671467,
"step": 810
},
{
"epoch": 0.8586387434554974,
"grad_norm": 63.29912185668945,
"kl": 0.0,
"learning_rate": 3.029208631747446e-08,
"logits/chosen": -138798032.0,
"logits/rejected": -130695232.0,
"logps/chosen": -273.63010448619633,
"logps/rejected": -289.54072949840764,
"loss": 1.3905,
"rewards/chosen": 0.2373036811688195,
"rewards/margins": 3.154649639319695,
"rewards/rejected": -2.9173459581508756,
"step": 820
},
{
"epoch": 0.8691099476439791,
"grad_norm": 35.773826599121094,
"kl": 0.0,
"learning_rate": 2.607748070546037e-08,
"logits/chosen": -138341072.0,
"logits/rejected": -140245856.0,
"logps/chosen": -276.33953568611986,
"logps/rejected": -295.26535893962847,
"loss": 1.4761,
"rewards/chosen": 0.13098442968133872,
"rewards/margins": 2.7483675067300037,
"rewards/rejected": -2.617383077048665,
"step": 830
},
{
"epoch": 0.8795811518324608,
"grad_norm": 36.50845718383789,
"kl": 0.0,
"learning_rate": 2.2162351615526544e-08,
"logits/chosen": -140731280.0,
"logits/rejected": -148560064.0,
"logps/chosen": -301.4704117063492,
"logps/rejected": -290.1577644230769,
"loss": 1.4465,
"rewards/chosen": 0.18666185651506698,
"rewards/margins": 2.7894090019477593,
"rewards/rejected": -2.6027471454326925,
"step": 840
},
{
"epoch": 0.8900523560209425,
"grad_norm": 55.24102020263672,
"kl": 0.0,
"learning_rate": 1.8551935185811717e-08,
"logits/chosen": -132794856.0,
"logits/rejected": -138046480.0,
"logps/chosen": -282.9409226190476,
"logps/rejected": -302.71471153846153,
"loss": 1.4063,
"rewards/chosen": 0.10835386003766741,
"rewards/margins": 3.033290759076129,
"rewards/rejected": -2.9249368990384617,
"step": 850
},
{
"epoch": 0.900523560209424,
"grad_norm": 70.16515350341797,
"kl": 0.0,
"learning_rate": 1.5251060028279612e-08,
"logits/chosen": -143098928.0,
"logits/rejected": -126897944.0,
"logps/chosen": -272.46225367078824,
"logps/rejected": -305.9686759478673,
"loss": 1.5084,
"rewards/chosen": 0.03112101370621317,
"rewards/margins": 2.5436736215754867,
"rewards/rejected": -2.5125526078692735,
"step": 860
},
{
"epoch": 0.9109947643979057,
"grad_norm": 28.95891761779785,
"kl": 0.0,
"learning_rate": 1.2264140770878839e-08,
"logits/chosen": -137280736.0,
"logits/rejected": -143405088.0,
"logps/chosen": -299.961469889065,
"logps/rejected": -295.115875385208,
"loss": 1.4681,
"rewards/chosen": -0.01648792260800224,
"rewards/margins": 2.59923304083335,
"rewards/rejected": -2.615720963441352,
"step": 870
},
{
"epoch": 0.9214659685863874,
"grad_norm": 37.45644760131836,
"kl": 0.0,
"learning_rate": 9.59517215336922e-09,
"logits/chosen": -128477976.0,
"logits/rejected": -129677584.0,
"logps/chosen": -280.0816461267606,
"logps/rejected": -291.42394695787834,
"loss": 1.4164,
"rewards/chosen": -0.018490225682982444,
"rewards/margins": 3.1044779987100557,
"rewards/rejected": -3.122968224393038,
"step": 880
},
{
"epoch": 0.9319371727748691,
"grad_norm": 43.50038528442383,
"kl": 0.0,
"learning_rate": 7.247723684711382e-09,
"logits/chosen": -137378768.0,
"logits/rejected": -126123072.0,
"logps/chosen": -271.5022151295732,
"logps/rejected": -294.2158453525641,
"loss": 1.4426,
"rewards/chosen": 0.19363278877444384,
"rewards/margins": 2.807892557529452,
"rewards/rejected": -2.614259768755008,
"step": 890
},
{
"epoch": 0.9424083769633508,
"grad_norm": 58.68267059326172,
"kl": 0.0,
"learning_rate": 5.224934869164976e-09,
"logits/chosen": -139006912.0,
"logits/rejected": -141954272.0,
"logps/chosen": -292.2946211507293,
"logps/rejected": -301.43368212669685,
"loss": 1.4844,
"rewards/chosen": -0.05539148785113515,
"rewards/margins": 2.60611269089368,
"rewards/rejected": -2.6615041787448153,
"step": 900
},
{
"epoch": 0.9528795811518325,
"grad_norm": 51.242652893066406,
"kl": 0.0,
"learning_rate": 3.529511007479946e-09,
"logits/chosen": -140018880.0,
"logits/rejected": -134809904.0,
"logps/chosen": -292.4076660906298,
"logps/rejected": -281.15220091414943,
"loss": 1.4564,
"rewards/chosen": 0.1075638450235815,
"rewards/margins": 2.6941378960967137,
"rewards/rejected": -2.586574051073132,
"step": 910
},
{
"epoch": 0.9633507853403142,
"grad_norm": 60.67096710205078,
"kl": 0.0,
"learning_rate": 2.1637195787966857e-09,
"logits/chosen": -132927744.0,
"logits/rejected": -145037952.0,
"logps/chosen": -294.478180176565,
"logps/rejected": -286.2923801369863,
"loss": 1.4298,
"rewards/chosen": 0.23743079906481993,
"rewards/margins": 2.895192835903671,
"rewards/rejected": -2.657762036838851,
"step": 920
},
{
"epoch": 0.9738219895287958,
"grad_norm": 44.87836837768555,
"kl": 0.0,
"learning_rate": 1.1293872080934963e-09,
"logits/chosen": -125942888.0,
"logits/rejected": -145652336.0,
"logps/chosen": -283.0784801136364,
"logps/rejected": -296.9692206325301,
"loss": 1.4259,
"rewards/chosen": 0.11567323858087714,
"rewards/margins": 2.9895860038999658,
"rewards/rejected": -2.873912765319089,
"step": 930
},
{
"epoch": 0.9842931937172775,
"grad_norm": 43.4354362487793,
"kl": 0.0,
"learning_rate": 4.2789722323760546e-10,
"logits/chosen": -142621872.0,
"logits/rejected": -136877328.0,
"logps/chosen": -288.74076066616766,
"logps/rejected": -292.02558210784315,
"loss": 1.4129,
"rewards/chosen": 0.27186810613392354,
"rewards/margins": 3.1179713504915623,
"rewards/rejected": -2.846103244357639,
"step": 940
},
{
"epoch": 0.9947643979057592,
"grad_norm": 49.643978118896484,
"kl": 0.0,
"learning_rate": 6.018780490690822e-11,
"logits/chosen": -147837520.0,
"logits/rejected": -131119704.0,
"logps/chosen": -285.26368371212124,
"logps/rejected": -282.18916330645163,
"loss": 1.4033,
"rewards/chosen": 0.14758417534105706,
"rewards/margins": 3.1335240379922666,
"rewards/rejected": -2.9859398626512097,
"step": 950
},
{
"epoch": 1.0,
"step": 955,
"total_flos": 0.0,
"train_loss": 1.5426912418834826,
"train_runtime": 5367.6535,
"train_samples_per_second": 22.779,
"train_steps_per_second": 0.178
}
],
"logging_steps": 10,
"max_steps": 955,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}