{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 496, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020163831127914304, "grad_norm": 15.5, "learning_rate": 3e-08, "logits/chosen": -0.4867519736289978, "logits/rejected": 0.7175194621086121, "logps/chosen": -30.936298370361328, "logps/rejected": -33.71613311767578, "loss": 0.6928, "rewards/accuracies": 0.4234375059604645, "rewards/chosen": 0.0013589367736130953, "rewards/margins": 0.0008156307740136981, "rewards/rejected": 0.0005433057667687535, "step": 10 }, { "epoch": 0.04032766225582861, "grad_norm": 14.75, "learning_rate": 6.333333333333333e-08, "logits/chosen": -0.42843765020370483, "logits/rejected": 0.7893258929252625, "logps/chosen": -30.931198120117188, "logps/rejected": -33.70547866821289, "loss": 0.6923, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0024414127692580223, "rewards/margins": 0.0018455162644386292, "rewards/rejected": 0.0005958965630270541, "step": 20 }, { "epoch": 0.06049149338374291, "grad_norm": 13.5, "learning_rate": 9.666666666666666e-08, "logits/chosen": -0.4219973087310791, "logits/rejected": 0.6850587725639343, "logps/chosen": -30.893016815185547, "logps/rejected": -33.663429260253906, "loss": 0.6918, "rewards/accuracies": 0.5453125238418579, "rewards/chosen": 0.004747429862618446, "rewards/margins": 0.0027347125578671694, "rewards/rejected": 0.0020127175375819206, "step": 30 }, { "epoch": 0.08065532451165722, "grad_norm": 15.125, "learning_rate": 1.3e-07, "logits/chosen": -0.46360141038894653, "logits/rejected": 0.7466679215431213, "logps/chosen": -30.849849700927734, "logps/rejected": -33.69337463378906, "loss": 0.6893, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.009496917016804218, "rewards/margins": 0.007778028957545757, "rewards/rejected": 0.001718888757750392, "step": 40 }, { "epoch": 0.10081915563957151, "grad_norm": 14.3125, "learning_rate": 1.6333333333333331e-07, "logits/chosen": -0.5125963091850281, "logits/rejected": 0.7158086895942688, "logps/chosen": -30.67547607421875, "logps/rejected": -33.68535232543945, "loss": 0.6864, "rewards/accuracies": 0.75, "rewards/chosen": 0.015847254544496536, "rewards/margins": 0.013664955273270607, "rewards/rejected": 0.002182298805564642, "step": 50 }, { "epoch": 0.12098298676748583, "grad_norm": 15.25, "learning_rate": 1.9666666666666665e-07, "logits/chosen": -0.5598157644271851, "logits/rejected": 0.6922208070755005, "logps/chosen": -30.706085205078125, "logps/rejected": -33.59804153442383, "loss": 0.6824, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02341878041625023, "rewards/margins": 0.02176792547106743, "rewards/rejected": 0.0016508543631061912, "step": 60 }, { "epoch": 0.14114681789540012, "grad_norm": 13.5625, "learning_rate": 2.3e-07, "logits/chosen": -0.5082000494003296, "logits/rejected": 0.6696754693984985, "logps/chosen": -30.65346908569336, "logps/rejected": -33.60075378417969, "loss": 0.6784, "rewards/accuracies": 0.84375, "rewards/chosen": 0.03311777114868164, "rewards/margins": 0.02995235286653042, "rewards/rejected": 0.003165417117998004, "step": 70 }, { "epoch": 0.16131064902331443, "grad_norm": 16.125, "learning_rate": 2.633333333333333e-07, "logits/chosen": -0.48689335584640503, "logits/rejected": 0.7162936925888062, "logps/chosen": -30.42413330078125, "logps/rejected": -33.7276611328125, "loss": 0.6722, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 0.042802099138498306, "rewards/margins": 0.04268326610326767, "rewards/rejected": 0.0001188320602523163, "step": 80 }, { "epoch": 0.18147448015122875, "grad_norm": 15.0625, "learning_rate": 2.966666666666667e-07, "logits/chosen": -0.6490235328674316, "logits/rejected": 0.663810670375824, "logps/chosen": -30.447296142578125, "logps/rejected": -33.84953689575195, "loss": 0.6663, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": 0.05355549976229668, "rewards/margins": 0.05490420013666153, "rewards/rejected": -0.0013486887328326702, "step": 90 }, { "epoch": 0.20163831127914303, "grad_norm": 14.6875, "learning_rate": 3.3e-07, "logits/chosen": -0.6071578860282898, "logits/rejected": 0.6672302484512329, "logps/chosen": -30.24630355834961, "logps/rejected": -33.704193115234375, "loss": 0.6589, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.06409426033496857, "rewards/margins": 0.07057920843362808, "rewards/rejected": -0.006484942976385355, "step": 100 }, { "epoch": 0.20163831127914303, "eval_logits/chosen": -0.8155572414398193, "eval_logits/rejected": 0.4097934663295746, "eval_logps/chosen": -30.2824764251709, "eval_logps/rejected": -33.65603256225586, "eval_loss": 0.656726062297821, "eval_rewards/accuracies": 0.8702152967453003, "eval_rewards/chosen": 0.06777840107679367, "eval_rewards/margins": 0.0754016563296318, "eval_rewards/rejected": -0.007623251993209124, "eval_runtime": 44.5266, "eval_samples_per_second": 37.506, "eval_steps_per_second": 9.388, "step": 100 }, { "epoch": 0.22180214240705734, "grad_norm": 15.5, "learning_rate": 3.6333333333333333e-07, "logits/chosen": -0.6406155824661255, "logits/rejected": 0.6143032312393188, "logps/chosen": -30.176578521728516, "logps/rejected": -33.91298294067383, "loss": 0.6486, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.07676664739847183, "rewards/margins": 0.09246878325939178, "rewards/rejected": -0.0157021377235651, "step": 110 }, { "epoch": 0.24196597353497165, "grad_norm": 16.5, "learning_rate": 3.9666666666666665e-07, "logits/chosen": -0.6949701905250549, "logits/rejected": 0.5297445058822632, "logps/chosen": -30.219324111938477, "logps/rejected": -33.860145568847656, "loss": 0.639, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 0.0840606540441513, "rewards/margins": 0.11379051208496094, "rewards/rejected": -0.02972986176609993, "step": 120 }, { "epoch": 0.26212980466288593, "grad_norm": 15.875, "learning_rate": 4.2999999999999996e-07, "logits/chosen": -0.8164470791816711, "logits/rejected": 0.4915032386779785, "logps/chosen": -29.9921817779541, "logps/rejected": -34.11717224121094, "loss": 0.6243, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 0.09985624253749847, "rewards/margins": 0.14640672504901886, "rewards/rejected": -0.04655047133564949, "step": 130 }, { "epoch": 0.28229363579080025, "grad_norm": 17.25, "learning_rate": 4.633333333333333e-07, "logits/chosen": -0.8576955795288086, "logits/rejected": 0.475827693939209, "logps/chosen": -29.987823486328125, "logps/rejected": -34.64146041870117, "loss": 0.606, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.10108546912670135, "rewards/margins": 0.1895439326763153, "rewards/rejected": -0.08845846354961395, "step": 140 }, { "epoch": 0.30245746691871456, "grad_norm": 15.75, "learning_rate": 4.966666666666666e-07, "logits/chosen": -0.9962993860244751, "logits/rejected": 0.2535431385040283, "logps/chosen": -30.001697540283203, "logps/rejected": -34.91169357299805, "loss": 0.5907, "rewards/accuracies": 0.8609374761581421, "rewards/chosen": 0.0953492671251297, "rewards/margins": 0.23015904426574707, "rewards/rejected": -0.13480977714061737, "step": 150 }, { "epoch": 0.32262129804662887, "grad_norm": 19.5, "learning_rate": 4.869942196531791e-07, "logits/chosen": -1.1013362407684326, "logits/rejected": 0.16825838387012482, "logps/chosen": -30.271495819091797, "logps/rejected": -35.96307373046875, "loss": 0.558, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.07991272956132889, "rewards/margins": 0.3163560628890991, "rewards/rejected": -0.23644332587718964, "step": 160 }, { "epoch": 0.3427851291745432, "grad_norm": 20.0, "learning_rate": 4.7254335260115607e-07, "logits/chosen": -1.3539522886276245, "logits/rejected": -0.050642382353544235, "logps/chosen": -30.70322608947754, "logps/rejected": -36.963932037353516, "loss": 0.5439, "rewards/accuracies": 0.854687511920929, "rewards/chosen": 0.04025987908244133, "rewards/margins": 0.3653072416782379, "rewards/rejected": -0.3250473737716675, "step": 170 }, { "epoch": 0.3629489603024575, "grad_norm": 19.375, "learning_rate": 4.5809248554913295e-07, "logits/chosen": -1.5499536991119385, "logits/rejected": -0.12814846634864807, "logps/chosen": -30.65024185180664, "logps/rejected": -38.008567810058594, "loss": 0.5077, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 0.030102571472525597, "rewards/margins": 0.46569380164146423, "rewards/rejected": -0.4355912208557129, "step": 180 }, { "epoch": 0.38311279143037175, "grad_norm": 21.75, "learning_rate": 4.436416184971098e-07, "logits/chosen": -1.6617387533187866, "logits/rejected": -0.32283174991607666, "logps/chosen": -30.94146156311035, "logps/rejected": -39.50160598754883, "loss": 0.4704, "rewards/accuracies": 0.895312488079071, "rewards/chosen": 0.004333639983087778, "rewards/margins": 0.5781550407409668, "rewards/rejected": -0.5738214254379272, "step": 190 }, { "epoch": 0.40327662255828606, "grad_norm": 14.6875, "learning_rate": 4.291907514450867e-07, "logits/chosen": -1.791582703590393, "logits/rejected": -0.44964680075645447, "logps/chosen": -31.526784896850586, "logps/rejected": -39.81529998779297, "loss": 0.4811, "rewards/accuracies": 0.864062488079071, "rewards/chosen": -0.03861772269010544, "rewards/margins": 0.56865394115448, "rewards/rejected": -0.607271671295166, "step": 200 }, { "epoch": 0.40327662255828606, "eval_logits/chosen": -1.988352656364441, "eval_logits/rejected": -0.7473806738853455, "eval_logps/chosen": -31.514951705932617, "eval_logps/rejected": -40.30702209472656, "eval_loss": 0.4655759632587433, "eval_rewards/accuracies": 0.8690191507339478, "eval_rewards/chosen": -0.05546921119093895, "eval_rewards/margins": 0.6172530651092529, "eval_rewards/rejected": -0.6727222204208374, "eval_runtime": 44.074, "eval_samples_per_second": 37.891, "eval_steps_per_second": 9.484, "step": 200 }, { "epoch": 0.42344045368620037, "grad_norm": 17.875, "learning_rate": 4.1473988439306354e-07, "logits/chosen": -1.9743998050689697, "logits/rejected": -0.7126643061637878, "logps/chosen": -31.816967010498047, "logps/rejected": -40.79069519042969, "loss": 0.4591, "rewards/accuracies": 0.879687488079071, "rewards/chosen": -0.06654059141874313, "rewards/margins": 0.6414004564285278, "rewards/rejected": -0.7079410552978516, "step": 210 }, { "epoch": 0.4436042848141147, "grad_norm": 15.875, "learning_rate": 4.002890173410404e-07, "logits/chosen": -1.9798154830932617, "logits/rejected": -0.7139844298362732, "logps/chosen": -32.13506317138672, "logps/rejected": -41.47466278076172, "loss": 0.4525, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": -0.0931621789932251, "rewards/margins": 0.6879409551620483, "rewards/rejected": -0.7811031341552734, "step": 220 }, { "epoch": 0.463768115942029, "grad_norm": 20.0, "learning_rate": 3.8583815028901736e-07, "logits/chosen": -2.0034127235412598, "logits/rejected": -0.7428504228591919, "logps/chosen": -31.648120880126953, "logps/rejected": -42.38803482055664, "loss": 0.4149, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.07446546852588654, "rewards/margins": 0.8017433285713196, "rewards/rejected": -0.8762086629867554, "step": 230 }, { "epoch": 0.4839319470699433, "grad_norm": 18.25, "learning_rate": 3.713872832369942e-07, "logits/chosen": -2.0142064094543457, "logits/rejected": -0.9079702496528625, "logps/chosen": -32.108821868896484, "logps/rejected": -43.06040954589844, "loss": 0.4058, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.10588344186544418, "rewards/margins": 0.8354071378707886, "rewards/rejected": -0.941290557384491, "step": 240 }, { "epoch": 0.5040957781978576, "grad_norm": 16.5, "learning_rate": 3.5693641618497107e-07, "logits/chosen": -2.275698184967041, "logits/rejected": -1.122511863708496, "logps/chosen": -32.408470153808594, "logps/rejected": -43.69465255737305, "loss": 0.4089, "rewards/accuracies": 0.875, "rewards/chosen": -0.13990476727485657, "rewards/margins": 0.8530368804931641, "rewards/rejected": -0.992941677570343, "step": 250 }, { "epoch": 0.5242596093257719, "grad_norm": 13.9375, "learning_rate": 3.4248554913294795e-07, "logits/chosen": -2.2892613410949707, "logits/rejected": -1.177565336227417, "logps/chosen": -32.277252197265625, "logps/rejected": -44.352439880371094, "loss": 0.3905, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.13188917934894562, "rewards/margins": 0.9304295778274536, "rewards/rejected": -1.0623188018798828, "step": 260 }, { "epoch": 0.5444234404536862, "grad_norm": 16.875, "learning_rate": 3.2803468208092484e-07, "logits/chosen": -2.309782028198242, "logits/rejected": -1.1110422611236572, "logps/chosen": -32.4368896484375, "logps/rejected": -44.47226333618164, "loss": 0.3876, "rewards/accuracies": 0.8734375238418579, "rewards/chosen": -0.14435531198978424, "rewards/margins": 0.945163369178772, "rewards/rejected": -1.0895185470581055, "step": 270 }, { "epoch": 0.5645872715816005, "grad_norm": 20.0, "learning_rate": 3.135838150289017e-07, "logits/chosen": -2.4775633811950684, "logits/rejected": -1.2171634435653687, "logps/chosen": -31.85956382751465, "logps/rejected": -44.78974151611328, "loss": 0.3722, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.10046832263469696, "rewards/margins": 1.0068700313568115, "rewards/rejected": -1.107338309288025, "step": 280 }, { "epoch": 0.5847511027095148, "grad_norm": 15.3125, "learning_rate": 2.991329479768786e-07, "logits/chosen": -2.3516058921813965, "logits/rejected": -1.2547065019607544, "logps/chosen": -32.33803939819336, "logps/rejected": -44.54314422607422, "loss": 0.3877, "rewards/accuracies": 0.8734375238418579, "rewards/chosen": -0.1320447474718094, "rewards/margins": 0.960769534111023, "rewards/rejected": -1.0928142070770264, "step": 290 }, { "epoch": 0.6049149338374291, "grad_norm": 14.3125, "learning_rate": 2.846820809248555e-07, "logits/chosen": -2.5609071254730225, "logits/rejected": -1.3568521738052368, "logps/chosen": -31.781259536743164, "logps/rejected": -45.361305236816406, "loss": 0.3534, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.0874972864985466, "rewards/margins": 1.0803875923156738, "rewards/rejected": -1.1678849458694458, "step": 300 }, { "epoch": 0.6049149338374291, "eval_logits/chosen": -2.652808666229248, "eval_logits/rejected": -1.5094201564788818, "eval_logps/chosen": -32.167049407958984, "eval_logps/rejected": -45.16587829589844, "eval_loss": 0.37229523062705994, "eval_rewards/accuracies": 0.8755980730056763, "eval_rewards/chosen": -0.12067891657352448, "eval_rewards/margins": 1.0379289388656616, "eval_rewards/rejected": -1.158607840538025, "eval_runtime": 44.3186, "eval_samples_per_second": 37.682, "eval_steps_per_second": 9.432, "step": 300 }, { "epoch": 0.6250787649653434, "grad_norm": 12.125, "learning_rate": 2.7023121387283236e-07, "logits/chosen": -2.574951410293579, "logits/rejected": -1.4039162397384644, "logps/chosen": -32.359107971191406, "logps/rejected": -45.18638229370117, "loss": 0.3833, "rewards/accuracies": 0.870312511920929, "rewards/chosen": -0.12439367920160294, "rewards/margins": 1.0330053567886353, "rewards/rejected": -1.1573989391326904, "step": 310 }, { "epoch": 0.6452425960932577, "grad_norm": 16.125, "learning_rate": 2.5578034682080925e-07, "logits/chosen": -2.5044591426849365, "logits/rejected": -1.4151450395584106, "logps/chosen": -32.13268280029297, "logps/rejected": -45.22514343261719, "loss": 0.3719, "rewards/accuracies": 0.875, "rewards/chosen": -0.11237072944641113, "rewards/margins": 1.047181487083435, "rewards/rejected": -1.1595523357391357, "step": 320 }, { "epoch": 0.665406427221172, "grad_norm": 15.0, "learning_rate": 2.4132947976878613e-07, "logits/chosen": -2.631962776184082, "logits/rejected": -1.3861881494522095, "logps/chosen": -31.74435043334961, "logps/rejected": -45.44641876220703, "loss": 0.3586, "rewards/accuracies": 0.885937511920929, "rewards/chosen": -0.074491485953331, "rewards/margins": 1.1014248132705688, "rewards/rejected": -1.1759161949157715, "step": 330 }, { "epoch": 0.6855702583490864, "grad_norm": 16.0, "learning_rate": 2.26878612716763e-07, "logits/chosen": -2.5997676849365234, "logits/rejected": -1.4487764835357666, "logps/chosen": -31.48516845703125, "logps/rejected": -45.57600021362305, "loss": 0.3413, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.054619044065475464, "rewards/margins": 1.1495531797409058, "rewards/rejected": -1.2041722536087036, "step": 340 }, { "epoch": 0.7057340894770007, "grad_norm": 12.9375, "learning_rate": 2.1242774566473987e-07, "logits/chosen": -2.581223726272583, "logits/rejected": -1.451586127281189, "logps/chosen": -31.826080322265625, "logps/rejected": -45.72737503051758, "loss": 0.3609, "rewards/accuracies": 0.879687488079071, "rewards/chosen": -0.08685452491044998, "rewards/margins": 1.1219041347503662, "rewards/rejected": -1.2087585926055908, "step": 350 }, { "epoch": 0.725897920604915, "grad_norm": 14.125, "learning_rate": 1.9797687861271675e-07, "logits/chosen": -2.6039886474609375, "logits/rejected": -1.593008279800415, "logps/chosen": -32.02338790893555, "logps/rejected": -45.7207145690918, "loss": 0.3591, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.09107818454504013, "rewards/margins": 1.1208597421646118, "rewards/rejected": -1.2119379043579102, "step": 360 }, { "epoch": 0.7460617517328293, "grad_norm": 14.3125, "learning_rate": 1.8352601156069363e-07, "logits/chosen": -2.6979289054870605, "logits/rejected": -1.5796029567718506, "logps/chosen": -31.300853729248047, "logps/rejected": -45.72681427001953, "loss": 0.3522, "rewards/accuracies": 0.878125011920929, "rewards/chosen": -0.03190985321998596, "rewards/margins": 1.1756919622421265, "rewards/rejected": -1.20760178565979, "step": 370 }, { "epoch": 0.7662255828607435, "grad_norm": 10.875, "learning_rate": 1.690751445086705e-07, "logits/chosen": -2.669769525527954, "logits/rejected": -1.4769527912139893, "logps/chosen": -30.98702049255371, "logps/rejected": -46.33720779418945, "loss": 0.3226, "rewards/accuracies": 0.90625, "rewards/chosen": -0.009888170287013054, "rewards/margins": 1.2525317668914795, "rewards/rejected": -1.2624199390411377, "step": 380 }, { "epoch": 0.7863894139886578, "grad_norm": 12.5, "learning_rate": 1.546242774566474e-07, "logits/chosen": -2.6219515800476074, "logits/rejected": -1.4883930683135986, "logps/chosen": -31.58437156677246, "logps/rejected": -46.32578659057617, "loss": 0.348, "rewards/accuracies": 0.875, "rewards/chosen": -0.05657508969306946, "rewards/margins": 1.205727458000183, "rewards/rejected": -1.2623026371002197, "step": 390 }, { "epoch": 0.8065532451165721, "grad_norm": 13.4375, "learning_rate": 1.4017341040462428e-07, "logits/chosen": -2.6918234825134277, "logits/rejected": -1.4419605731964111, "logps/chosen": -31.05796241760254, "logps/rejected": -45.93518829345703, "loss": 0.3322, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.011668933555483818, "rewards/margins": 1.2215235233306885, "rewards/rejected": -1.2331925630569458, "step": 400 }, { "epoch": 0.8065532451165721, "eval_logits/chosen": -2.7881991863250732, "eval_logits/rejected": -1.6745110750198364, "eval_logps/chosen": -31.51004981994629, "eval_logps/rejected": -45.77407455444336, "eval_loss": 0.3527255356311798, "eval_rewards/accuracies": 0.8761961460113525, "eval_rewards/chosen": -0.05497899651527405, "eval_rewards/margins": 1.164448618888855, "eval_rewards/rejected": -1.219427466392517, "eval_runtime": 43.521, "eval_samples_per_second": 38.372, "eval_steps_per_second": 9.605, "step": 400 }, { "epoch": 0.8267170762444864, "grad_norm": 11.6875, "learning_rate": 1.2572254335260116e-07, "logits/chosen": -2.651202440261841, "logits/rejected": -1.4713218212127686, "logps/chosen": -31.563549041748047, "logps/rejected": -46.01884078979492, "loss": 0.3475, "rewards/accuracies": 0.8921874761581421, "rewards/chosen": -0.05552523210644722, "rewards/margins": 1.1869945526123047, "rewards/rejected": -1.242519736289978, "step": 410 }, { "epoch": 0.8468809073724007, "grad_norm": 9.6875, "learning_rate": 1.1127167630057803e-07, "logits/chosen": -2.605095624923706, "logits/rejected": -1.4782483577728271, "logps/chosen": -31.462610244750977, "logps/rejected": -45.76819610595703, "loss": 0.3681, "rewards/accuracies": 0.878125011920929, "rewards/chosen": -0.059452712535858154, "rewards/margins": 1.1480482816696167, "rewards/rejected": -1.2075010538101196, "step": 420 }, { "epoch": 0.867044738500315, "grad_norm": 13.5, "learning_rate": 9.68208092485549e-08, "logits/chosen": -2.7015011310577393, "logits/rejected": -1.6397815942764282, "logps/chosen": -30.976587295532227, "logps/rejected": -46.11310958862305, "loss": 0.3244, "rewards/accuracies": 0.90625, "rewards/chosen": -0.003909923601895571, "rewards/margins": 1.2422012090682983, "rewards/rejected": -1.2461111545562744, "step": 430 }, { "epoch": 0.8872085696282294, "grad_norm": 13.375, "learning_rate": 8.236994219653179e-08, "logits/chosen": -2.649815082550049, "logits/rejected": -1.571324110031128, "logps/chosen": -31.4038028717041, "logps/rejected": -46.44231414794922, "loss": 0.3272, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.03583191707730293, "rewards/margins": 1.2405941486358643, "rewards/rejected": -1.2764259576797485, "step": 440 }, { "epoch": 0.9073724007561437, "grad_norm": 12.1875, "learning_rate": 6.791907514450866e-08, "logits/chosen": -2.7028326988220215, "logits/rejected": -1.6298167705535889, "logps/chosen": -31.396902084350586, "logps/rejected": -46.344139099121094, "loss": 0.3365, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -0.03214337304234505, "rewards/margins": 1.235580325126648, "rewards/rejected": -1.2677236795425415, "step": 450 }, { "epoch": 0.927536231884058, "grad_norm": 9.375, "learning_rate": 5.346820809248555e-08, "logits/chosen": -2.6308987140655518, "logits/rejected": -1.5787830352783203, "logps/chosen": -31.125768661499023, "logps/rejected": -45.810577392578125, "loss": 0.341, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.014469502493739128, "rewards/margins": 1.20750892162323, "rewards/rejected": -1.2219784259796143, "step": 460 }, { "epoch": 0.9477000630119723, "grad_norm": 12.375, "learning_rate": 3.901734104046243e-08, "logits/chosen": -2.748487949371338, "logits/rejected": -1.6348745822906494, "logps/chosen": -31.541439056396484, "logps/rejected": -46.1522216796875, "loss": 0.3417, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.040760189294815063, "rewards/margins": 1.2017240524291992, "rewards/rejected": -1.2424843311309814, "step": 470 }, { "epoch": 0.9678638941398866, "grad_norm": 13.8125, "learning_rate": 2.4566473988439306e-08, "logits/chosen": -2.7292351722717285, "logits/rejected": -1.6388839483261108, "logps/chosen": -31.504674911499023, "logps/rejected": -46.303550720214844, "loss": 0.3416, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -0.040863461792469025, "rewards/margins": 1.2127190828323364, "rewards/rejected": -1.2535823583602905, "step": 480 }, { "epoch": 0.9880277252678009, "grad_norm": 14.5625, "learning_rate": 1.0115606936416185e-08, "logits/chosen": -2.6773369312286377, "logits/rejected": -1.5286281108856201, "logps/chosen": -31.112579345703125, "logps/rejected": -45.940792083740234, "loss": 0.3377, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.019895082339644432, "rewards/margins": 1.210729956626892, "rewards/rejected": -1.230625033378601, "step": 490 } ], "logging_steps": 10, "max_steps": 496, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }