{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014684287812041115, "epsilon_dpo/beta": 0.0999474972486496, "epsilon_dpo/beta_margin_grad_mean": -0.5006521940231323, "epsilon_dpo/beta_margin_grad_std": 0.010521039366722107, "epsilon_dpo/beta_margin_mean": -0.0026135474909096956, "epsilon_dpo/beta_margin_std": 0.04210928454995155, "epsilon_dpo/loss_margin_mean": -0.02287048101425171, "grad_norm": 83.56657409667969, "kl/avg_steps": 0.0625, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 0.0, "logits/chosen": -0.6899334788322449, "logits/rejected": -0.37887901067733765, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.3894, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00041806945228017867, "rewards/margins": -0.0026135058142244816, "rewards/rejected": 0.003031575120985508, "step": 1 }, { "epoch": 0.002936857562408223, "epsilon_dpo/beta": 0.1000724583864212, "epsilon_dpo/beta_margin_grad_mean": -0.5017112493515015, "epsilon_dpo/beta_margin_grad_std": 0.0088164322078228, "epsilon_dpo/beta_margin_mean": -0.006847253534942865, "epsilon_dpo/beta_margin_std": 0.03527917340397835, "epsilon_dpo/loss_margin_mean": -0.06572240591049194, "grad_norm": 72.1585464477539, "kl/avg_steps": -0.125, "kl/beta": 0.09993753582239151, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.6022520065307617, "logits/rejected": -0.36671221256256104, "logps/chosen": -52.65569305419922, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.3935, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0036358418874442577, "rewards/margins": -0.00684727355837822, "rewards/rejected": 0.003211432136595249, "step": 2 }, { "epoch": 0.004405286343612335, "epsilon_dpo/beta": 0.0999162569642067, "epsilon_dpo/beta_margin_grad_mean": -0.4991426467895508, "epsilon_dpo/beta_margin_grad_std": 0.008554468862712383, "epsilon_dpo/beta_margin_mean": 0.00343105080537498, "epsilon_dpo/beta_margin_std": 0.0342276468873024, "epsilon_dpo/loss_margin_mean": 0.037074267864227295, "grad_norm": 70.64185333251953, "kl/avg_steps": 0.15625, "kl/beta": 0.10006261616945267, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.5908145308494568, "logits/rejected": -0.4275705814361572, "logps/chosen": -60.96543502807617, "logps/ref_chosen": -60.98159408569336, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.69351196289062, "loss": 1.3832, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015193297294899821, "rewards/margins": 0.0034310566261410713, "rewards/rejected": -0.0019117268966510892, "step": 3 }, { "epoch": 0.005873715124816446, "epsilon_dpo/beta": 0.10013506561517715, "epsilon_dpo/beta_margin_grad_mean": -0.5023244619369507, "epsilon_dpo/beta_margin_grad_std": 0.009719975292682648, "epsilon_dpo/beta_margin_mean": -0.009302487596869469, "epsilon_dpo/beta_margin_std": 0.038918618112802505, "epsilon_dpo/loss_margin_mean": -0.09038430452346802, "grad_norm": 72.27873229980469, "kl/avg_steps": -0.21875, "kl/beta": 0.09990651160478592, "kl/n_epsilon_steps": 0.609375, "kl/p_epsilon_steps": 0.390625, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.6021588444709778, "logits/rejected": -0.42872855067253113, "logps/chosen": -56.75792694091797, "logps/ref_chosen": -56.76771545410156, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.54693603515625, "loss": 1.396, "rewards/accuracies": 0.390625, "rewards/chosen": 0.0008886073483154178, "rewards/margins": -0.009302439168095589, "rewards/rejected": 0.010191047564148903, "step": 4 }, { "epoch": 0.007342143906020558, "epsilon_dpo/beta": 0.10004167258739471, "epsilon_dpo/beta_margin_grad_mean": -0.49954700469970703, "epsilon_dpo/beta_margin_grad_std": 0.008118817582726479, "epsilon_dpo/beta_margin_mean": 0.001813046750612557, "epsilon_dpo/beta_margin_std": 0.03248732164502144, "epsilon_dpo/loss_margin_mean": 0.02056872844696045, "grad_norm": 89.54268646240234, "kl/avg_steps": 0.09375, "kl/beta": 0.10012553632259369, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.743955135345459, "logits/rejected": -0.4869227111339569, "logps/chosen": -53.81658935546875, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.12696838378906, "loss": 1.3847, "rewards/accuracies": 0.515625, "rewards/chosen": 0.00417137099429965, "rewards/margins": 0.0018129991367459297, "rewards/rejected": 0.002358372090384364, "step": 5 }, { "epoch": 0.00881057268722467, "epsilon_dpo/beta": 0.09979165345430374, "epsilon_dpo/beta_margin_grad_mean": -0.4990909695625305, "epsilon_dpo/beta_margin_grad_std": 0.00914138276129961, "epsilon_dpo/beta_margin_mean": 0.0036372877657413483, "epsilon_dpo/beta_margin_std": 0.036579351872205734, "epsilon_dpo/loss_margin_mean": 0.039216578006744385, "grad_norm": 91.50358581542969, "kl/avg_steps": 0.25, "kl/beta": 0.10003175586462021, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.6965059638023376, "logits/rejected": -0.5487236976623535, "logps/chosen": -62.976524353027344, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.65360260009766, "loss": 1.383, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0029447507113218307, "rewards/margins": 0.0036372365429997444, "rewards/rejected": -0.0006924858316779137, "step": 6 }, { "epoch": 0.010279001468428781, "epsilon_dpo/beta": 0.09972991049289703, "epsilon_dpo/beta_margin_grad_mean": -0.5003304481506348, "epsilon_dpo/beta_margin_grad_std": 0.01168179139494896, "epsilon_dpo/beta_margin_mean": -0.0013204828137531877, "epsilon_dpo/beta_margin_std": 0.046759072691202164, "epsilon_dpo/loss_margin_mean": -0.009589701890945435, "grad_norm": 82.27672576904297, "kl/avg_steps": 0.0625, "kl/beta": 0.0997823029756546, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.6239166259765625, "logits/rejected": -0.4245404899120331, "logps/chosen": -57.77162551879883, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.90780639648438, "loss": 1.3882, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0001622685813345015, "rewards/margins": -0.0013204591814428568, "rewards/rejected": 0.0014827274717390537, "step": 7 }, { "epoch": 0.011747430249632892, "epsilon_dpo/beta": 0.09966761618852615, "epsilon_dpo/beta_margin_grad_mean": -0.5001169443130493, "epsilon_dpo/beta_margin_grad_std": 0.010085917077958584, "epsilon_dpo/beta_margin_mean": -0.0004724572936538607, "epsilon_dpo/beta_margin_std": 0.040366336703300476, "epsilon_dpo/loss_margin_mean": -0.0018860399723052979, "grad_norm": 78.22746276855469, "kl/avg_steps": 0.0625, "kl/beta": 0.09971997886896133, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5672747492790222, "logits/rejected": -0.4737367630004883, "logps/chosen": -58.69141387939453, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.284912109375, "loss": 1.3872, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0023498879745602608, "rewards/margins": -0.0004724874161183834, "rewards/rejected": 0.002822375390678644, "step": 8 }, { "epoch": 0.013215859030837005, "epsilon_dpo/beta": 0.099636510014534, "epsilon_dpo/beta_margin_grad_mean": -0.4998748302459717, "epsilon_dpo/beta_margin_grad_std": 0.01016149390488863, "epsilon_dpo/beta_margin_mean": 0.0005050363834016025, "epsilon_dpo/beta_margin_std": 0.04067489877343178, "epsilon_dpo/loss_margin_mean": 0.008003711700439453, "grad_norm": 85.07678985595703, "kl/avg_steps": 0.03125, "kl/beta": 0.09965769201517105, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.669657826423645, "logits/rejected": -0.44206157326698303, "logps/chosen": -69.83699035644531, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.580810546875, "loss": 1.3862, "rewards/accuracies": 0.484375, "rewards/chosen": 0.002859012922272086, "rewards/margins": 0.0005050955805927515, "rewards/rejected": 0.0023539173416793346, "step": 9 }, { "epoch": 0.014684287812041116, "epsilon_dpo/beta": 0.0995742455124855, "epsilon_dpo/beta_margin_grad_mean": -0.5003238916397095, "epsilon_dpo/beta_margin_grad_std": 0.008978527970612049, "epsilon_dpo/beta_margin_mean": -0.0012951751705259085, "epsilon_dpo/beta_margin_std": 0.03592904284596443, "epsilon_dpo/loss_margin_mean": -0.010241597890853882, "grad_norm": 70.06171417236328, "kl/avg_steps": 0.0625, "kl/beta": 0.0996265560388565, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.6811233162879944, "logits/rejected": -0.44765201210975647, "logps/chosen": -48.37286376953125, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.37699890136719, "loss": 1.3879, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016418680315837264, "rewards/margins": -0.0012952459510415792, "rewards/rejected": -0.000346622196957469, "step": 10 }, { "epoch": 0.016152716593245228, "epsilon_dpo/beta": 0.09954316914081573, "epsilon_dpo/beta_margin_grad_mean": -0.49935105443000793, "epsilon_dpo/beta_margin_grad_std": 0.008948074653744698, "epsilon_dpo/beta_margin_mean": 0.0025976356118917465, "epsilon_dpo/beta_margin_std": 0.03580310195684433, "epsilon_dpo/loss_margin_mean": 0.028973519802093506, "grad_norm": 67.98162078857422, "kl/avg_steps": 0.03125, "kl/beta": 0.099564328789711, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.5316141843795776, "logits/rejected": -0.3694092035293579, "logps/chosen": -53.018104553222656, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.81060791015625, "loss": 1.384, "rewards/accuracies": 0.515625, "rewards/chosen": -0.00022831570822745562, "rewards/margins": 0.0025976833421736956, "rewards/rejected": -0.0028259989339858294, "step": 11 }, { "epoch": 0.01762114537444934, "epsilon_dpo/beta": 0.09948095679283142, "epsilon_dpo/beta_margin_grad_mean": -0.4997762441635132, "epsilon_dpo/beta_margin_grad_std": 0.01158287562429905, "epsilon_dpo/beta_margin_mean": 0.0008899245294742286, "epsilon_dpo/beta_margin_std": 0.04637879133224487, "epsilon_dpo/loss_margin_mean": 0.012266382575035095, "grad_norm": 89.55339813232422, "kl/avg_steps": 0.0625, "kl/beta": 0.09953322261571884, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.6296329498291016, "logits/rejected": -0.4155291020870209, "logps/chosen": -61.834747314453125, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.85826873779297, "logps/rejected": -104.89984893798828, "loss": 1.3859, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0030919623095542192, "rewards/margins": 0.0008900631219148636, "rewards/rejected": -0.0039820256642997265, "step": 12 }, { "epoch": 0.01908957415565345, "epsilon_dpo/beta": 0.09944991022348404, "epsilon_dpo/beta_margin_grad_mean": -0.4985347092151642, "epsilon_dpo/beta_margin_grad_std": 0.011819672770798206, "epsilon_dpo/beta_margin_mean": 0.005867047235369682, "epsilon_dpo/beta_margin_std": 0.04732148349285126, "epsilon_dpo/loss_margin_mean": 0.06244337558746338, "grad_norm": 79.01435852050781, "kl/avg_steps": 0.03125, "kl/beta": 0.09947105497121811, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.6465901136398315, "logits/rejected": -0.5206432342529297, "logps/chosen": -64.20757293701172, "logps/ref_chosen": -64.26036071777344, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.21273803710938, "loss": 1.381, "rewards/accuracies": 0.515625, "rewards/chosen": 0.005090477876365185, "rewards/margins": 0.005867065396159887, "rewards/rejected": -0.0007765874033793807, "step": 13 }, { "epoch": 0.020558002936857563, "epsilon_dpo/beta": 0.09954315423965454, "epsilon_dpo/beta_margin_grad_mean": -0.5012027025222778, "epsilon_dpo/beta_margin_grad_std": 0.010310296900570393, "epsilon_dpo/beta_margin_mean": -0.004813310690224171, "epsilon_dpo/beta_margin_std": 0.04126282408833504, "epsilon_dpo/loss_margin_mean": -0.045211225748062134, "grad_norm": 85.693603515625, "kl/avg_steps": -0.09375, "kl/beta": 0.09943997859954834, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.6981677412986755, "logits/rejected": -0.4689730107784271, "logps/chosen": -58.15471649169922, "logps/ref_chosen": -58.11021423339844, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.04637145996094, "loss": 1.3915, "rewards/accuracies": 0.46875, "rewards/chosen": -0.004551096353679895, "rewards/margins": -0.0048133376985788345, "rewards/rejected": 0.00026224181056022644, "step": 14 }, { "epoch": 0.022026431718061675, "epsilon_dpo/beta": 0.09929438680410385, "epsilon_dpo/beta_margin_grad_mean": -0.498632550239563, "epsilon_dpo/beta_margin_grad_std": 0.009638694114983082, "epsilon_dpo/beta_margin_mean": 0.005466893315315247, "epsilon_dpo/beta_margin_std": 0.03858000040054321, "epsilon_dpo/loss_margin_mean": 0.05783188343048096, "grad_norm": 63.776878356933594, "kl/avg_steps": 0.25, "kl/beta": 0.09953329712152481, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.4637385308742523, "logits/rejected": -0.35175687074661255, "logps/chosen": -56.960899353027344, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.86045837402344, "loss": 1.3812, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0004750732332468033, "rewards/margins": 0.005466861184686422, "rewards/rejected": -0.004991788417100906, "step": 15 }, { "epoch": 0.023494860499265784, "epsilon_dpo/beta": 0.09935706853866577, "epsilon_dpo/beta_margin_grad_mean": -0.49960967898368835, "epsilon_dpo/beta_margin_grad_std": 0.008690658025443554, "epsilon_dpo/beta_margin_mean": 0.0015602321363985538, "epsilon_dpo/beta_margin_std": 0.034774623811244965, "epsilon_dpo/loss_margin_mean": 0.018438905477523804, "grad_norm": 83.94454956054688, "kl/avg_steps": -0.0625, "kl/beta": 0.0992850810289383, "kl/n_epsilon_steps": 0.53125, "kl/p_epsilon_steps": 0.46875, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.6619457006454468, "logits/rejected": -0.5016107559204102, "logps/chosen": -61.76502990722656, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.41305541992188, "loss": 1.385, "rewards/accuracies": 0.484375, "rewards/chosen": -0.002612006152048707, "rewards/margins": 0.0015602526254951954, "rewards/rejected": -0.004172259010374546, "step": 16 }, { "epoch": 0.024963289280469897, "epsilon_dpo/beta": 0.09907767176628113, "epsilon_dpo/beta_margin_grad_mean": -0.4984363615512848, "epsilon_dpo/beta_margin_grad_std": 0.0094489436596632, "epsilon_dpo/beta_margin_mean": 0.00625709630548954, "epsilon_dpo/beta_margin_std": 0.03781072795391083, "epsilon_dpo/loss_margin_mean": 0.0661078691482544, "grad_norm": 77.86913299560547, "kl/avg_steps": 0.28125, "kl/beta": 0.09934717416763306, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.6108927130699158, "logits/rejected": -0.4828973710536957, "logps/chosen": -67.6737060546875, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.40813446044922, "loss": 1.3804, "rewards/accuracies": 0.65625, "rewards/chosen": 0.003507263958454132, "rewards/margins": 0.006256964989006519, "rewards/rejected": -0.0027497014962136745, "step": 17 }, { "epoch": 0.02643171806167401, "epsilon_dpo/beta": 0.09879979491233826, "epsilon_dpo/beta_margin_grad_mean": -0.4986821115016937, "epsilon_dpo/beta_margin_grad_std": 0.008960261940956116, "epsilon_dpo/beta_margin_mean": 0.005270513240247965, "epsilon_dpo/beta_margin_std": 0.03585405647754669, "epsilon_dpo/loss_margin_mean": 0.05598863959312439, "grad_norm": 81.35855865478516, "kl/avg_steps": 0.28125, "kl/beta": 0.0990685448050499, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.7670595645904541, "logits/rejected": -0.5500361323356628, "logps/chosen": -47.718833923339844, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.50761413574219, "loss": 1.3814, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0019111181609332561, "rewards/margins": 0.005270537454634905, "rewards/rejected": -0.003359419060871005, "step": 18 }, { "epoch": 0.027900146842878122, "epsilon_dpo/beta": 0.09870795160531998, "epsilon_dpo/beta_margin_grad_mean": -0.49873286485671997, "epsilon_dpo/beta_margin_grad_std": 0.008805947378277779, "epsilon_dpo/beta_margin_mean": 0.005072311032563448, "epsilon_dpo/beta_margin_std": 0.03523925691843033, "epsilon_dpo/loss_margin_mean": 0.053946733474731445, "grad_norm": 73.26107788085938, "kl/avg_steps": 0.09375, "kl/beta": 0.09879069775342941, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.6282287836074829, "logits/rejected": -0.41298654675483704, "logps/chosen": -70.22738647460938, "logps/ref_chosen": -70.20535278320312, "logps/ref_rejected": -89.75758361816406, "logps/rejected": -89.83357238769531, "loss": 1.3815, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0023102399427443743, "rewards/margins": 0.005072316154837608, "rewards/rejected": -0.007382555864751339, "step": 19 }, { "epoch": 0.02936857562408223, "epsilon_dpo/beta": 0.09867718070745468, "epsilon_dpo/beta_margin_grad_mean": -0.4981527328491211, "epsilon_dpo/beta_margin_grad_std": 0.010064210742712021, "epsilon_dpo/beta_margin_mean": 0.007393495179712772, "epsilon_dpo/beta_margin_std": 0.040283456444740295, "epsilon_dpo/loss_margin_mean": 0.07789051532745361, "grad_norm": 72.92608642578125, "kl/avg_steps": 0.03125, "kl/beta": 0.09869816154241562, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.7782789468765259, "logits/rejected": -0.538977324962616, "logps/chosen": -50.805747985839844, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.8233413696289, "logps/rejected": -78.90374755859375, "loss": 1.3793, "rewards/accuracies": 0.484375, "rewards/chosen": -0.00037431088276207447, "rewards/margins": 0.007393484003841877, "rewards/rejected": -0.007767794653773308, "step": 20 }, { "epoch": 0.030837004405286344, "epsilon_dpo/beta": 0.09850744158029556, "epsilon_dpo/beta_margin_grad_mean": -0.49679034948349, "epsilon_dpo/beta_margin_grad_std": 0.009602558799088001, "epsilon_dpo/beta_margin_mean": 0.01284959726035595, "epsilon_dpo/beta_margin_std": 0.038444884121418, "epsilon_dpo/loss_margin_mean": 0.13301609456539154, "grad_norm": 75.66818237304688, "kl/avg_steps": 0.171875, "kl/beta": 0.09866733103990555, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.6122225522994995, "logits/rejected": -0.5136980414390564, "logps/chosen": -50.05017852783203, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -77.98896789550781, "loss": 1.3739, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0011464322451502085, "rewards/margins": 0.012849586084485054, "rewards/rejected": -0.01170315407216549, "step": 21 }, { "epoch": 0.032305433186490456, "epsilon_dpo/beta": 0.09816926717758179, "epsilon_dpo/beta_margin_grad_mean": -0.4954878091812134, "epsilon_dpo/beta_margin_grad_std": 0.010731114074587822, "epsilon_dpo/beta_margin_mean": 0.018057547509670258, "epsilon_dpo/beta_margin_std": 0.04294878616929054, "epsilon_dpo/loss_margin_mean": 0.18695014715194702, "grad_norm": 82.43994903564453, "kl/avg_steps": 0.34375, "kl/beta": 0.09849803894758224, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.6706408262252808, "logits/rejected": -0.4660327434539795, "logps/chosen": -59.02484893798828, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.65882873535156, "loss": 1.3688, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0030678450129926205, "rewards/margins": 0.018057584762573242, "rewards/rejected": -0.014989741146564484, "step": 22 }, { "epoch": 0.033773861967694566, "epsilon_dpo/beta": 0.0978022888302803, "epsilon_dpo/beta_margin_grad_mean": -0.495043009519577, "epsilon_dpo/beta_margin_grad_std": 0.010730421170592308, "epsilon_dpo/beta_margin_mean": 0.019842475652694702, "epsilon_dpo/beta_margin_std": 0.04295789822936058, "epsilon_dpo/loss_margin_mean": 0.20566779375076294, "grad_norm": 78.68445587158203, "kl/avg_steps": 0.375, "kl/beta": 0.09816060960292816, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.612334132194519, "logits/rejected": -0.5152074098587036, "logps/chosen": -60.02301788330078, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.1395492553711, "logps/rejected": -81.29054260253906, "loss": 1.367, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005194256082177162, "rewards/margins": 0.01984243094921112, "rewards/rejected": -0.014648174867033958, "step": 23 }, { "epoch": 0.03524229074889868, "epsilon_dpo/beta": 0.09728408604860306, "epsilon_dpo/beta_margin_grad_mean": -0.4949635863304138, "epsilon_dpo/beta_margin_grad_std": 0.009255305863916874, "epsilon_dpo/beta_margin_mean": 0.020157571882009506, "epsilon_dpo/beta_margin_std": 0.03706182911992073, "epsilon_dpo/loss_margin_mean": 0.20909583568572998, "grad_norm": 88.37494659423828, "kl/avg_steps": 0.53125, "kl/beta": 0.09779388457536697, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.6284483075141907, "logits/rejected": -0.49340900778770447, "logps/chosen": -44.25568771362305, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.29896545410156, "loss": 1.3666, "rewards/accuracies": 0.734375, "rewards/chosen": 0.003375026863068342, "rewards/margins": 0.020157575607299805, "rewards/rejected": -0.016782548278570175, "step": 24 }, { "epoch": 0.03671071953010279, "epsilon_dpo/beta": 0.09698280692100525, "epsilon_dpo/beta_margin_grad_mean": -0.4951268136501312, "epsilon_dpo/beta_margin_grad_std": 0.011948227882385254, "epsilon_dpo/beta_margin_mean": 0.019504927098751068, "epsilon_dpo/beta_margin_std": 0.047829385846853256, "epsilon_dpo/loss_margin_mean": 0.20440703630447388, "grad_norm": 71.66119384765625, "kl/avg_steps": 0.3125, "kl/beta": 0.09727709740400314, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.6306965351104736, "logits/rejected": -0.4757160544395447, "logps/chosen": -52.48844528198242, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.49798583984375, "loss": 1.3675, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004545444622635841, "rewards/margins": 0.019504955038428307, "rewards/rejected": -0.01495951134711504, "step": 25 }, { "epoch": 0.0381791483113069, "epsilon_dpo/beta": 0.09646852314472198, "epsilon_dpo/beta_margin_grad_mean": -0.49040067195892334, "epsilon_dpo/beta_margin_grad_std": 0.014042048715054989, "epsilon_dpo/beta_margin_mean": 0.03843845799565315, "epsilon_dpo/beta_margin_std": 0.05624152719974518, "epsilon_dpo/loss_margin_mean": 0.40167027711868286, "grad_norm": 84.22888946533203, "kl/avg_steps": 0.53125, "kl/beta": 0.09697405248880386, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.6835383176803589, "logits/rejected": -0.5212547779083252, "logps/chosen": -53.822509765625, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.66108703613281, "loss": 1.349, "rewards/accuracies": 0.765625, "rewards/chosen": 0.009511815384030342, "rewards/margins": 0.038438428193330765, "rewards/rejected": -0.028926612809300423, "step": 26 }, { "epoch": 0.039647577092511016, "epsilon_dpo/beta": 0.0959133729338646, "epsilon_dpo/beta_margin_grad_mean": -0.48844072222709656, "epsilon_dpo/beta_margin_grad_std": 0.014806153252720833, "epsilon_dpo/beta_margin_mean": 0.04631367698311806, "epsilon_dpo/beta_margin_std": 0.05942818522453308, "epsilon_dpo/loss_margin_mean": 0.48550090193748474, "grad_norm": 89.76054382324219, "kl/avg_steps": 0.578125, "kl/beta": 0.09646160155534744, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.691501796245575, "logits/rejected": -0.48829805850982666, "logps/chosen": -42.80644989013672, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72420501708984, "logps/rejected": -99.11763000488281, "loss": 1.3414, "rewards/accuracies": 0.78125, "rewards/chosen": 0.008736366406083107, "rewards/margins": 0.04631367325782776, "rewards/rejected": -0.0375773087143898, "step": 27 }, { "epoch": 0.041116005873715125, "epsilon_dpo/beta": 0.09543713927268982, "epsilon_dpo/beta_margin_grad_mean": -0.4904250204563141, "epsilon_dpo/beta_margin_grad_std": 0.013653564266860485, "epsilon_dpo/beta_margin_mean": 0.038332872092723846, "epsilon_dpo/beta_margin_std": 0.054671693593263626, "epsilon_dpo/loss_margin_mean": 0.4052448570728302, "grad_norm": 71.43858337402344, "kl/avg_steps": 0.5, "kl/beta": 0.09590713679790497, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.7132616639137268, "logits/rejected": -0.48149383068084717, "logps/chosen": -60.492679595947266, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.74253845214844, "loss": 1.3491, "rewards/accuracies": 0.765625, "rewards/chosen": 0.005918354727327824, "rewards/margins": 0.03833283483982086, "rewards/rejected": -0.03241448104381561, "step": 28 }, { "epoch": 0.042584434654919234, "epsilon_dpo/beta": 0.09478338062763214, "epsilon_dpo/beta_margin_grad_mean": -0.48638561367988586, "epsilon_dpo/beta_margin_grad_std": 0.013761184178292751, "epsilon_dpo/beta_margin_mean": 0.05452274531126022, "epsilon_dpo/beta_margin_std": 0.05518447607755661, "epsilon_dpo/loss_margin_mean": 0.5777689218521118, "grad_norm": 86.3988037109375, "kl/avg_steps": 0.6875, "kl/beta": 0.09542998671531677, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.7569383382797241, "logits/rejected": -0.5672882795333862, "logps/chosen": -57.68357849121094, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.84791564941406, "loss": 1.3333, "rewards/accuracies": 0.84375, "rewards/chosen": 0.011695785447955132, "rewards/margins": 0.05452270805835724, "rewards/rejected": -0.04282692074775696, "step": 29 }, { "epoch": 0.04405286343612335, "epsilon_dpo/beta": 0.0940769612789154, "epsilon_dpo/beta_margin_grad_mean": -0.4850979447364807, "epsilon_dpo/beta_margin_grad_std": 0.014148331247270107, "epsilon_dpo/beta_margin_mean": 0.059684351086616516, "epsilon_dpo/beta_margin_std": 0.05674216151237488, "epsilon_dpo/loss_margin_mean": 0.636677622795105, "grad_norm": 82.63673400878906, "kl/avg_steps": 0.75, "kl/beta": 0.0947783887386322, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.6795220375061035, "logits/rejected": -0.5398270487785339, "logps/chosen": -52.4403076171875, "logps/ref_chosen": -52.57737350463867, "logps/ref_rejected": -98.48921203613281, "logps/rejected": -98.98881530761719, "loss": 1.3283, "rewards/accuracies": 0.875, "rewards/chosen": 0.012802567332983017, "rewards/margins": 0.05968429893255234, "rewards/rejected": -0.04688173532485962, "step": 30 }, { "epoch": 0.04552129221732746, "epsilon_dpo/beta": 0.09358243644237518, "epsilon_dpo/beta_margin_grad_mean": -0.4891579747200012, "epsilon_dpo/beta_margin_grad_std": 0.0171290785074234, "epsilon_dpo/beta_margin_mean": 0.04344891756772995, "epsilon_dpo/beta_margin_std": 0.0687476322054863, "epsilon_dpo/loss_margin_mean": 0.468301922082901, "grad_norm": 63.605506896972656, "kl/avg_steps": 0.53125, "kl/beta": 0.0940728411078453, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.700573742389679, "logits/rejected": -0.4872978627681732, "logps/chosen": -63.730369567871094, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.28575897216797, "loss": 1.3445, "rewards/accuracies": 0.75, "rewards/chosen": 0.006982723250985146, "rewards/margins": 0.04344893991947174, "rewards/rejected": -0.036466218531131744, "step": 31 }, { "epoch": 0.04698972099853157, "epsilon_dpo/beta": 0.09297093003988266, "epsilon_dpo/beta_margin_grad_mean": -0.4829469323158264, "epsilon_dpo/beta_margin_grad_std": 0.020606767386198044, "epsilon_dpo/beta_margin_mean": 0.06842009723186493, "epsilon_dpo/beta_margin_std": 0.08292694389820099, "epsilon_dpo/loss_margin_mean": 0.7392706274986267, "grad_norm": 76.32506561279297, "kl/avg_steps": 0.65625, "kl/beta": 0.09357572346925735, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.6239430904388428, "logits/rejected": -0.42263031005859375, "logps/chosen": -62.531192779541016, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.84844207763672, "loss": 1.3208, "rewards/accuracies": 0.84375, "rewards/chosen": 0.019259147346019745, "rewards/margins": 0.06842006742954254, "rewards/rejected": -0.0491609200835228, "step": 32 }, { "epoch": 0.048458149779735685, "epsilon_dpo/beta": 0.09245194494724274, "epsilon_dpo/beta_margin_grad_mean": -0.4862017035484314, "epsilon_dpo/beta_margin_grad_std": 0.014605310745537281, "epsilon_dpo/beta_margin_mean": 0.05525689572095871, "epsilon_dpo/beta_margin_std": 0.058518461883068085, "epsilon_dpo/loss_margin_mean": 0.6014795303344727, "grad_norm": 67.19760131835938, "kl/avg_steps": 0.5625, "kl/beta": 0.09296563267707825, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.5926761627197266, "logits/rejected": -0.4174070358276367, "logps/chosen": -53.162803649902344, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.3884506225586, "loss": 1.3327, "rewards/accuracies": 0.78125, "rewards/chosen": 0.00890004076063633, "rewards/margins": 0.055256880819797516, "rewards/rejected": -0.04635683447122574, "step": 33 }, { "epoch": 0.049926578560939794, "epsilon_dpo/beta": 0.0917903482913971, "epsilon_dpo/beta_margin_grad_mean": -0.48294445872306824, "epsilon_dpo/beta_margin_grad_std": 0.019153540953993797, "epsilon_dpo/beta_margin_mean": 0.06836734712123871, "epsilon_dpo/beta_margin_std": 0.07685627788305283, "epsilon_dpo/loss_margin_mean": 0.7480142116546631, "grad_norm": 71.88612365722656, "kl/avg_steps": 0.71875, "kl/beta": 0.09244562685489655, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.5952507257461548, "logits/rejected": -0.52412348985672, "logps/chosen": -50.7568359375, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.609375, "loss": 1.3206, "rewards/accuracies": 0.84375, "rewards/chosen": 0.005410528276115656, "rewards/margins": 0.06836740672588348, "rewards/rejected": -0.06295688450336456, "step": 34 }, { "epoch": 0.0513950073421439, "epsilon_dpo/beta": 0.09099189192056656, "epsilon_dpo/beta_margin_grad_mean": -0.4718893766403198, "epsilon_dpo/beta_margin_grad_std": 0.02585284784436226, "epsilon_dpo/beta_margin_mean": 0.1130870133638382, "epsilon_dpo/beta_margin_std": 0.10551401227712631, "epsilon_dpo/loss_margin_mean": 1.2445440292358398, "grad_norm": 75.97843933105469, "kl/avg_steps": 0.875, "kl/beta": 0.09178591519594193, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.7257874608039856, "logits/rejected": -0.461169570684433, "logps/chosen": -50.86943054199219, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.9139175415039, "loss": 1.2792, "rewards/accuracies": 0.953125, "rewards/chosen": 0.014054520055651665, "rewards/margins": 0.11308702826499939, "rewards/rejected": -0.09903251379728317, "step": 35 }, { "epoch": 0.05286343612334802, "epsilon_dpo/beta": 0.09040167182683945, "epsilon_dpo/beta_margin_grad_mean": -0.47390487790107727, "epsilon_dpo/beta_margin_grad_std": 0.026591215282678604, "epsilon_dpo/beta_margin_mean": 0.1047983318567276, "epsilon_dpo/beta_margin_std": 0.10706175863742828, "epsilon_dpo/loss_margin_mean": 1.164489507675171, "grad_norm": 66.66797637939453, "kl/avg_steps": 0.65625, "kl/beta": 0.09098975360393524, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.735052227973938, "logits/rejected": -0.5852710604667664, "logps/chosen": -51.944091796875, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.04061889648438, "logps/rejected": -87.15771484375, "loss": 1.2871, "rewards/accuracies": 0.828125, "rewards/chosen": 0.004147795960307121, "rewards/margins": 0.10479836165904999, "rewards/rejected": -0.10065056383609772, "step": 36 }, { "epoch": 0.05433186490455213, "epsilon_dpo/beta": 0.08986878395080566, "epsilon_dpo/beta_margin_grad_mean": -0.47582224011421204, "epsilon_dpo/beta_margin_grad_std": 0.0320659838616848, "epsilon_dpo/beta_margin_mean": 0.09735116362571716, "epsilon_dpo/beta_margin_std": 0.12976804375648499, "epsilon_dpo/loss_margin_mean": 1.0894930362701416, "grad_norm": 56.18361282348633, "kl/avg_steps": 0.59375, "kl/beta": 0.09039653092622757, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.7193362712860107, "logits/rejected": -0.4781792163848877, "logps/chosen": -62.758453369140625, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.9359130859375, "loss": 1.2955, "rewards/accuracies": 0.828125, "rewards/chosen": 0.004034676589071751, "rewards/margins": 0.09735117852687836, "rewards/rejected": -0.09331650286912918, "step": 37 }, { "epoch": 0.055800293685756244, "epsilon_dpo/beta": 0.08922599256038666, "epsilon_dpo/beta_margin_grad_mean": -0.4700261652469635, "epsilon_dpo/beta_margin_grad_std": 0.035584937781095505, "epsilon_dpo/beta_margin_mean": 0.12100663781166077, "epsilon_dpo/beta_margin_std": 0.14498375356197357, "epsilon_dpo/loss_margin_mean": 1.361170768737793, "grad_norm": 63.268428802490234, "kl/avg_steps": 0.71875, "kl/beta": 0.08986296504735947, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.6519103646278381, "logits/rejected": -0.4997590184211731, "logps/chosen": -48.251590728759766, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.13468170166016, "loss": 1.2741, "rewards/accuracies": 0.875, "rewards/chosen": 0.012151028029620647, "rewards/margins": 0.12100662291049957, "rewards/rejected": -0.10885559767484665, "step": 38 }, { "epoch": 0.05726872246696035, "epsilon_dpo/beta": 0.088477723300457, "epsilon_dpo/beta_margin_grad_mean": -0.46417590975761414, "epsilon_dpo/beta_margin_grad_std": 0.03130076080560684, "epsilon_dpo/beta_margin_mean": 0.14417992532253265, "epsilon_dpo/beta_margin_std": 0.12662391364574432, "epsilon_dpo/loss_margin_mean": 1.6333224773406982, "grad_norm": 65.90448760986328, "kl/avg_steps": 0.84375, "kl/beta": 0.08922168612480164, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.7812178134918213, "logits/rejected": -0.5999255776405334, "logps/chosen": -50.66172409057617, "logps/ref_chosen": -50.75046920776367, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.11408996582031, "loss": 1.2513, "rewards/accuracies": 0.921875, "rewards/chosen": 0.007626615464687347, "rewards/margins": 0.14417986571788788, "rewards/rejected": -0.13655325770378113, "step": 39 }, { "epoch": 0.05873715124816446, "epsilon_dpo/beta": 0.08793099224567413, "epsilon_dpo/beta_margin_grad_mean": -0.4664282202720642, "epsilon_dpo/beta_margin_grad_std": 0.038107842206954956, "epsilon_dpo/beta_margin_mean": 0.13568727672100067, "epsilon_dpo/beta_margin_std": 0.15566600859165192, "epsilon_dpo/loss_margin_mean": 1.5504910945892334, "grad_norm": 53.64518356323242, "kl/avg_steps": 0.625, "kl/beta": 0.08847517520189285, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.6166332364082336, "logits/rejected": -0.45730146765708923, "logps/chosen": -57.77972412109375, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.30007934570312, "logps/rejected": -75.64521789550781, "loss": 1.2612, "rewards/accuracies": 0.8125, "rewards/chosen": 0.017830543220043182, "rewards/margins": 0.13568724691867828, "rewards/rejected": -0.1178567111492157, "step": 40 }, { "epoch": 0.06020558002936858, "epsilon_dpo/beta": 0.087274931371212, "epsilon_dpo/beta_margin_grad_mean": -0.4586746394634247, "epsilon_dpo/beta_margin_grad_std": 0.0441967137157917, "epsilon_dpo/beta_margin_mean": 0.16744600236415863, "epsilon_dpo/beta_margin_std": 0.18128535151481628, "epsilon_dpo/loss_margin_mean": 1.9254682064056396, "grad_norm": 60.75354766845703, "kl/avg_steps": 0.75, "kl/beta": 0.08792564272880554, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.7145446538925171, "logits/rejected": -0.5281996130943298, "logps/chosen": -62.65257263183594, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.90574645996094, "loss": 1.2339, "rewards/accuracies": 0.875, "rewards/chosen": 0.0033399879466742277, "rewards/margins": 0.16744601726531982, "rewards/rejected": -0.16410601139068604, "step": 41 }, { "epoch": 0.06167400881057269, "epsilon_dpo/beta": 0.08657068759202957, "epsilon_dpo/beta_margin_grad_mean": -0.4439954161643982, "epsilon_dpo/beta_margin_grad_std": 0.051451511681079865, "epsilon_dpo/beta_margin_mean": 0.22843961417675018, "epsilon_dpo/beta_margin_std": 0.2150142788887024, "epsilon_dpo/loss_margin_mean": 2.6446244716644287, "grad_norm": 70.50254821777344, "kl/avg_steps": 0.8125, "kl/beta": 0.08727110922336578, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.748748779296875, "logits/rejected": -0.471982479095459, "logps/chosen": -58.705352783203125, "logps/ref_chosen": -58.96642303466797, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.29192352294922, "loss": 1.1821, "rewards/accuracies": 0.9375, "rewards/chosen": 0.022369947284460068, "rewards/margins": 0.22843967378139496, "rewards/rejected": -0.2060697227716446, "step": 42 }, { "epoch": 0.0631424375917768, "epsilon_dpo/beta": 0.08581885695457458, "epsilon_dpo/beta_margin_grad_mean": -0.44696906208992004, "epsilon_dpo/beta_margin_grad_std": 0.04155328497290611, "epsilon_dpo/beta_margin_mean": 0.21492145955562592, "epsilon_dpo/beta_margin_std": 0.17198152840137482, "epsilon_dpo/loss_margin_mean": 2.508019208908081, "grad_norm": 63.363746643066406, "kl/avg_steps": 0.875, "kl/beta": 0.08656774461269379, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.6911792755126953, "logits/rejected": -0.5561075210571289, "logps/chosen": -53.596961975097656, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.42918395996094, "loss": 1.1901, "rewards/accuracies": 0.9375, "rewards/chosen": 0.047858819365501404, "rewards/margins": 0.2149214893579483, "rewards/rejected": -0.1670626699924469, "step": 43 }, { "epoch": 0.06461086637298091, "epsilon_dpo/beta": 0.0850476324558258, "epsilon_dpo/beta_margin_grad_mean": -0.44107890129089355, "epsilon_dpo/beta_margin_grad_std": 0.04519936442375183, "epsilon_dpo/beta_margin_mean": 0.23933164775371552, "epsilon_dpo/beta_margin_std": 0.18726012110710144, "epsilon_dpo/loss_margin_mean": 2.8173367977142334, "grad_norm": 69.75694274902344, "kl/avg_steps": 0.90625, "kl/beta": 0.08581684529781342, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.7419127225875854, "logits/rejected": -0.561978280544281, "logps/chosen": -49.84899139404297, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.37159729003906, "loss": 1.1698, "rewards/accuracies": 0.953125, "rewards/chosen": 0.019421285018324852, "rewards/margins": 0.23933161795139313, "rewards/rejected": -0.21991033852100372, "step": 44 }, { "epoch": 0.06607929515418502, "epsilon_dpo/beta": 0.08444329351186752, "epsilon_dpo/beta_margin_grad_mean": -0.45379340648651123, "epsilon_dpo/beta_margin_grad_std": 0.05130607634782791, "epsilon_dpo/beta_margin_mean": 0.18834719061851501, "epsilon_dpo/beta_margin_std": 0.21307416260242462, "epsilon_dpo/loss_margin_mean": 2.238414764404297, "grad_norm": 54.971275329589844, "kl/avg_steps": 0.71875, "kl/beta": 0.08504611998796463, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.5838513970375061, "logits/rejected": -0.5171458721160889, "logps/chosen": -48.300140380859375, "logps/ref_chosen": -48.41493225097656, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -80.06005859375, "loss": 1.2179, "rewards/accuracies": 0.875, "rewards/chosen": 0.009439542889595032, "rewards/margins": 0.18834716081619263, "rewards/rejected": -0.1789076179265976, "step": 45 }, { "epoch": 0.06754772393538913, "epsilon_dpo/beta": 0.08384069055318832, "epsilon_dpo/beta_margin_grad_mean": -0.4394093155860901, "epsilon_dpo/beta_margin_grad_std": 0.0630989596247673, "epsilon_dpo/beta_margin_mean": 0.24937555193901062, "epsilon_dpo/beta_margin_std": 0.26771342754364014, "epsilon_dpo/loss_margin_mean": 2.984797954559326, "grad_norm": 61.659027099609375, "kl/avg_steps": 0.71875, "kl/beta": 0.08443921059370041, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.7949700355529785, "logits/rejected": -0.5555962920188904, "logps/chosen": -55.753944396972656, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.39190673828125, "loss": 1.1696, "rewards/accuracies": 0.890625, "rewards/chosen": 0.020097916945815086, "rewards/margins": 0.24937552213668823, "rewards/rejected": -0.2292776107788086, "step": 46 }, { "epoch": 0.06901615271659324, "epsilon_dpo/beta": 0.08320295065641403, "epsilon_dpo/beta_margin_grad_mean": -0.4386330842971802, "epsilon_dpo/beta_margin_grad_std": 0.055102963000535965, "epsilon_dpo/beta_margin_mean": 0.25048893690109253, "epsilon_dpo/beta_margin_std": 0.22870446741580963, "epsilon_dpo/loss_margin_mean": 3.0181827545166016, "grad_norm": 56.881492614746094, "kl/avg_steps": 0.765625, "kl/beta": 0.0838366374373436, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.7932885885238647, "logits/rejected": -0.51214599609375, "logps/chosen": -57.51045608520508, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.28176879882812, "loss": 1.1642, "rewards/accuracies": 0.90625, "rewards/chosen": 0.03409276157617569, "rewards/margins": 0.25048884749412537, "rewards/rejected": -0.21639610826969147, "step": 47 }, { "epoch": 0.07048458149779736, "epsilon_dpo/beta": 0.08260989934206009, "epsilon_dpo/beta_margin_grad_mean": -0.43596893548965454, "epsilon_dpo/beta_margin_grad_std": 0.06429051607847214, "epsilon_dpo/beta_margin_mean": 0.2623628079891205, "epsilon_dpo/beta_margin_std": 0.26902657747268677, "epsilon_dpo/loss_margin_mean": 3.188123941421509, "grad_norm": 64.7890396118164, "kl/avg_steps": 0.71875, "kl/beta": 0.0831996351480484, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.8307114839553833, "logits/rejected": -0.577639102935791, "logps/chosen": -57.117889404296875, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -91.13453674316406, "loss": 1.1586, "rewards/accuracies": 0.875, "rewards/chosen": 0.005281925667077303, "rewards/margins": 0.2623627781867981, "rewards/rejected": -0.2570808529853821, "step": 48 }, { "epoch": 0.07195301027900147, "epsilon_dpo/beta": 0.0819687470793724, "epsilon_dpo/beta_margin_grad_mean": -0.42563095688819885, "epsilon_dpo/beta_margin_grad_std": 0.07447288185358047, "epsilon_dpo/beta_margin_mean": 0.3100949227809906, "epsilon_dpo/beta_margin_std": 0.3251339793205261, "epsilon_dpo/loss_margin_mean": 3.7948062419891357, "grad_norm": 55.275516510009766, "kl/avg_steps": 0.78125, "kl/beta": 0.08260590583086014, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.7270078659057617, "logits/rejected": -0.5109246969223022, "logps/chosen": -61.376583099365234, "logps/ref_chosen": -61.685264587402344, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -87.25360107421875, "loss": 1.125, "rewards/accuracies": 0.890625, "rewards/chosen": 0.02468142658472061, "rewards/margins": 0.310094952583313, "rewards/rejected": -0.285413533449173, "step": 49 }, { "epoch": 0.07342143906020558, "epsilon_dpo/beta": 0.08143579214811325, "epsilon_dpo/beta_margin_grad_mean": -0.418803870677948, "epsilon_dpo/beta_margin_grad_std": 0.07662991434335709, "epsilon_dpo/beta_margin_mean": 0.3378123939037323, "epsilon_dpo/beta_margin_std": 0.32804766297340393, "epsilon_dpo/loss_margin_mean": 4.165571689605713, "grad_norm": 54.424171447753906, "kl/avg_steps": 0.65625, "kl/beta": 0.08196555078029633, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.7568400502204895, "logits/rejected": -0.5175820589065552, "logps/chosen": -58.89253234863281, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -100.69210815429688, "loss": 1.1024, "rewards/accuracies": 0.84375, "rewards/chosen": -0.014449729584157467, "rewards/margins": 0.3378123641014099, "rewards/rejected": -0.35226207971572876, "step": 50 }, { "epoch": 0.07488986784140969, "epsilon_dpo/beta": 0.08103210479021072, "epsilon_dpo/beta_margin_grad_mean": -0.42181292176246643, "epsilon_dpo/beta_margin_grad_std": 0.09374556690454483, "epsilon_dpo/beta_margin_mean": 0.33399254083633423, "epsilon_dpo/beta_margin_std": 0.41953185200691223, "epsilon_dpo/loss_margin_mean": 4.146895885467529, "grad_norm": 46.214134216308594, "kl/avg_steps": 0.5, "kl/beta": 0.08143115788698196, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.8009949922561646, "logits/rejected": -0.6197192072868347, "logps/chosen": -61.633811950683594, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -80.40904235839844, "loss": 1.1207, "rewards/accuracies": 0.828125, "rewards/chosen": -0.02197762206196785, "rewards/margins": 0.333992600440979, "rewards/rejected": -0.35597023367881775, "step": 51 }, { "epoch": 0.0763582966226138, "epsilon_dpo/beta": 0.08042638003826141, "epsilon_dpo/beta_margin_grad_mean": -0.3843227028846741, "epsilon_dpo/beta_margin_grad_std": 0.08970463275909424, "epsilon_dpo/beta_margin_mean": 0.49331915378570557, "epsilon_dpo/beta_margin_std": 0.40902501344680786, "epsilon_dpo/loss_margin_mean": 6.149931907653809, "grad_norm": 53.337032318115234, "kl/avg_steps": 0.75, "kl/beta": 0.08102603256702423, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.8910727500915527, "logits/rejected": -0.6129493713378906, "logps/chosen": -52.00531768798828, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -85.79180908203125, "loss": 0.9909, "rewards/accuracies": 0.9375, "rewards/chosen": 0.025913957506418228, "rewards/margins": 0.49331915378570557, "rewards/rejected": -0.46740520000457764, "step": 52 }, { "epoch": 0.07782672540381791, "epsilon_dpo/beta": 0.07980253547430038, "epsilon_dpo/beta_margin_grad_mean": -0.3882746994495392, "epsilon_dpo/beta_margin_grad_std": 0.1083177998661995, "epsilon_dpo/beta_margin_mean": 0.4963873624801636, "epsilon_dpo/beta_margin_std": 0.5382514595985413, "epsilon_dpo/loss_margin_mean": 6.23687219619751, "grad_norm": 51.91477966308594, "kl/avg_steps": 0.78125, "kl/beta": 0.08042285591363907, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.6966301202774048, "logits/rejected": -0.5892688632011414, "logps/chosen": -53.52302551269531, "logps/ref_chosen": -53.31465530395508, "logps/ref_rejected": -91.7835922241211, "logps/rejected": -98.22883605957031, "loss": 1.0123, "rewards/accuracies": 0.921875, "rewards/chosen": -0.017171800136566162, "rewards/margins": 0.4963873326778412, "rewards/rejected": -0.513559103012085, "step": 53 }, { "epoch": 0.07929515418502203, "epsilon_dpo/beta": 0.0791962519288063, "epsilon_dpo/beta_margin_grad_mean": -0.3992304503917694, "epsilon_dpo/beta_margin_grad_std": 0.0889367163181305, "epsilon_dpo/beta_margin_mean": 0.4301222860813141, "epsilon_dpo/beta_margin_std": 0.4083307385444641, "epsilon_dpo/loss_margin_mean": 5.4435248374938965, "grad_norm": 47.81783676147461, "kl/avg_steps": 0.765625, "kl/beta": 0.07979942858219147, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.9264237880706787, "logits/rejected": -0.6701672077178955, "logps/chosen": -51.0803337097168, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -97.55059051513672, "loss": 1.0396, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03115382045507431, "rewards/margins": 0.4301223158836365, "rewards/rejected": -0.461276113986969, "step": 54 }, { "epoch": 0.08076358296622614, "epsilon_dpo/beta": 0.07875551283359528, "epsilon_dpo/beta_margin_grad_mean": -0.38479316234588623, "epsilon_dpo/beta_margin_grad_std": 0.12403902411460876, "epsilon_dpo/beta_margin_mean": 0.5141162872314453, "epsilon_dpo/beta_margin_std": 0.5916620492935181, "epsilon_dpo/loss_margin_mean": 6.566575527191162, "grad_norm": 46.457801818847656, "kl/avg_steps": 0.5625, "kl/beta": 0.0791931003332138, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.9492754340171814, "logits/rejected": -0.7138346433639526, "logps/chosen": -63.550445556640625, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -96.49528503417969, "loss": 1.014, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07498433440923691, "rewards/margins": 0.5141162872314453, "rewards/rejected": -0.5891005992889404, "step": 55 }, { "epoch": 0.08223201174743025, "epsilon_dpo/beta": 0.0783396065235138, "epsilon_dpo/beta_margin_grad_mean": -0.3912721276283264, "epsilon_dpo/beta_margin_grad_std": 0.1250195950269699, "epsilon_dpo/beta_margin_mean": 0.48327428102493286, "epsilon_dpo/beta_margin_std": 0.5860716104507446, "epsilon_dpo/loss_margin_mean": 6.207910060882568, "grad_norm": 42.523895263671875, "kl/avg_steps": 0.53125, "kl/beta": 0.0787501335144043, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.8505500555038452, "logits/rejected": -0.6876404881477356, "logps/chosen": -58.679595947265625, "logps/ref_chosen": -57.93273162841797, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -101.12921142578125, "loss": 1.037, "rewards/accuracies": 0.828125, "rewards/chosen": -0.059790801256895065, "rewards/margins": 0.4832742214202881, "rewards/rejected": -0.543065071105957, "step": 56 }, { "epoch": 0.08370044052863436, "epsilon_dpo/beta": 0.07772976905107498, "epsilon_dpo/beta_margin_grad_mean": -0.37548625469207764, "epsilon_dpo/beta_margin_grad_std": 0.11424616724252701, "epsilon_dpo/beta_margin_mean": 0.5436288714408875, "epsilon_dpo/beta_margin_std": 0.5275634527206421, "epsilon_dpo/loss_margin_mean": 7.017059803009033, "grad_norm": 47.585750579833984, "kl/avg_steps": 0.78125, "kl/beta": 0.07833398133516312, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.7789304852485657, "logits/rejected": -0.6877784729003906, "logps/chosen": -71.28856658935547, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -103.37579345703125, "loss": 0.9777, "rewards/accuracies": 0.921875, "rewards/chosen": -0.06252375990152359, "rewards/margins": 0.5436288714408875, "rewards/rejected": -0.6061526536941528, "step": 57 }, { "epoch": 0.08516886930983847, "epsilon_dpo/beta": 0.07718782871961594, "epsilon_dpo/beta_margin_grad_mean": -0.36823755502700806, "epsilon_dpo/beta_margin_grad_std": 0.1302955597639084, "epsilon_dpo/beta_margin_mean": 0.600260317325592, "epsilon_dpo/beta_margin_std": 0.6401125192642212, "epsilon_dpo/loss_margin_mean": 7.806028366088867, "grad_norm": 50.20505142211914, "kl/avg_steps": 0.703125, "kl/beta": 0.07772674411535263, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.9927153587341309, "logits/rejected": -0.7306280732154846, "logps/chosen": -63.216793060302734, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -93.5071792602539, "loss": 0.9617, "rewards/accuracies": 0.890625, "rewards/chosen": -0.08548370748758316, "rewards/margins": 0.6002602577209473, "rewards/rejected": -0.6857439875602722, "step": 58 }, { "epoch": 0.08663729809104258, "epsilon_dpo/beta": 0.07668519020080566, "epsilon_dpo/beta_margin_grad_mean": -0.35859009623527527, "epsilon_dpo/beta_margin_grad_std": 0.13512957096099854, "epsilon_dpo/beta_margin_mean": 0.6526122689247131, "epsilon_dpo/beta_margin_std": 0.6807378530502319, "epsilon_dpo/loss_margin_mean": 8.546601295471191, "grad_norm": 48.971317291259766, "kl/avg_steps": 0.65625, "kl/beta": 0.0771840438246727, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.9326772689819336, "logits/rejected": -0.7697768211364746, "logps/chosen": -53.49982452392578, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -98.99909973144531, "loss": 0.9341, "rewards/accuracies": 0.875, "rewards/chosen": -0.1215255856513977, "rewards/margins": 0.6526123285293579, "rewards/rejected": -0.7741378545761108, "step": 59 }, { "epoch": 0.0881057268722467, "epsilon_dpo/beta": 0.0762091875076294, "epsilon_dpo/beta_margin_grad_mean": -0.38297078013420105, "epsilon_dpo/beta_margin_grad_std": 0.13203725218772888, "epsilon_dpo/beta_margin_mean": 0.5120560526847839, "epsilon_dpo/beta_margin_std": 0.606110155582428, "epsilon_dpo/loss_margin_mean": 6.76201868057251, "grad_norm": 53.07661819458008, "kl/avg_steps": 0.625, "kl/beta": 0.07668082416057587, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.9355441927909851, "logits/rejected": -0.6807034611701965, "logps/chosen": -63.75890350341797, "logps/ref_chosen": -60.94218444824219, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -94.97213745117188, "loss": 1.0221, "rewards/accuracies": 0.828125, "rewards/chosen": -0.21674984693527222, "rewards/margins": 0.5120559930801392, "rewards/rejected": -0.7288058996200562, "step": 60 }, { "epoch": 0.08957415565345081, "epsilon_dpo/beta": 0.07587873935699463, "epsilon_dpo/beta_margin_grad_mean": -0.37099823355674744, "epsilon_dpo/beta_margin_grad_std": 0.15391331911087036, "epsilon_dpo/beta_margin_mean": 0.6379275918006897, "epsilon_dpo/beta_margin_std": 0.8875248432159424, "epsilon_dpo/loss_margin_mean": 8.468091011047363, "grad_norm": 45.54325485229492, "kl/avg_steps": 0.4375, "kl/beta": 0.07620454579591751, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.865566611289978, "logits/rejected": -0.6939293146133423, "logps/chosen": -62.2430419921875, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -99.93009948730469, "loss": 0.9906, "rewards/accuracies": 0.78125, "rewards/chosen": -0.12385329604148865, "rewards/margins": 0.6379275321960449, "rewards/rejected": -0.761780858039856, "step": 61 }, { "epoch": 0.09104258443465492, "epsilon_dpo/beta": 0.07542965561151505, "epsilon_dpo/beta_margin_grad_mean": -0.3952219784259796, "epsilon_dpo/beta_margin_grad_std": 0.1335650235414505, "epsilon_dpo/beta_margin_mean": 0.47400033473968506, "epsilon_dpo/beta_margin_std": 0.6434755921363831, "epsilon_dpo/loss_margin_mean": 6.3259053230285645, "grad_norm": 45.17091751098633, "kl/avg_steps": 0.59375, "kl/beta": 0.07587260752916336, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.8267738223075867, "logits/rejected": -0.6903345584869385, "logps/chosen": -57.91468811035156, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -83.65602111816406, "loss": 1.058, "rewards/accuracies": 0.859375, "rewards/chosen": -0.13497035205364227, "rewards/margins": 0.4740002751350403, "rewards/rejected": -0.6089706420898438, "step": 62 }, { "epoch": 0.09251101321585903, "epsilon_dpo/beta": 0.07498443126678467, "epsilon_dpo/beta_margin_grad_mean": -0.3600352704524994, "epsilon_dpo/beta_margin_grad_std": 0.1420409381389618, "epsilon_dpo/beta_margin_mean": 0.6376527547836304, "epsilon_dpo/beta_margin_std": 0.6982914805412292, "epsilon_dpo/loss_margin_mean": 8.549195289611816, "grad_norm": 47.9777717590332, "kl/avg_steps": 0.59375, "kl/beta": 0.07542476803064346, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.9301478862762451, "logits/rejected": -0.719444751739502, "logps/chosen": -76.01518249511719, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -109.02705383300781, "loss": 0.9518, "rewards/accuracies": 0.875, "rewards/chosen": -0.21734055876731873, "rewards/margins": 0.6376527547836304, "rewards/rejected": -0.8549933433532715, "step": 63 }, { "epoch": 0.09397944199706314, "epsilon_dpo/beta": 0.07462374866008759, "epsilon_dpo/beta_margin_grad_mean": -0.34572863578796387, "epsilon_dpo/beta_margin_grad_std": 0.15755517780780792, "epsilon_dpo/beta_margin_mean": 0.7506232857704163, "epsilon_dpo/beta_margin_std": 0.8795703053474426, "epsilon_dpo/loss_margin_mean": 10.123734474182129, "grad_norm": 43.91283416748047, "kl/avg_steps": 0.484375, "kl/beta": 0.07497958093881607, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.8505758047103882, "logits/rejected": -0.6992577314376831, "logps/chosen": -55.14314651489258, "logps/ref_chosen": -53.99859619140625, "logps/ref_rejected": -93.53020477294922, "logps/rejected": -104.79847717285156, "loss": 0.9192, "rewards/accuracies": 0.796875, "rewards/chosen": -0.08719173073768616, "rewards/margins": 0.7506231665611267, "rewards/rejected": -0.8378149271011353, "step": 64 }, { "epoch": 0.09544787077826726, "epsilon_dpo/beta": 0.07420583814382553, "epsilon_dpo/beta_margin_grad_mean": -0.3416244089603424, "epsilon_dpo/beta_margin_grad_std": 0.1670026183128357, "epsilon_dpo/beta_margin_mean": 0.7576022744178772, "epsilon_dpo/beta_margin_std": 0.8702723383903503, "epsilon_dpo/loss_margin_mean": 10.269062042236328, "grad_norm": 45.28920364379883, "kl/avg_steps": 0.5625, "kl/beta": 0.0746181458234787, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.9071321487426758, "logits/rejected": -0.8409342169761658, "logps/chosen": -68.27507019042969, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -123.65458679199219, "loss": 0.9228, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2566946744918823, "rewards/margins": 0.7576022744178772, "rewards/rejected": -1.0142968893051147, "step": 65 }, { "epoch": 0.09691629955947137, "epsilon_dpo/beta": 0.0737907662987709, "epsilon_dpo/beta_margin_grad_mean": -0.3596351146697998, "epsilon_dpo/beta_margin_grad_std": 0.15223053097724915, "epsilon_dpo/beta_margin_mean": 0.6837877035140991, "epsilon_dpo/beta_margin_std": 0.8476912975311279, "epsilon_dpo/loss_margin_mean": 9.32065200805664, "grad_norm": 42.59511184692383, "kl/avg_steps": 0.5625, "kl/beta": 0.07420077174901962, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.9653792381286621, "logits/rejected": -0.8392778635025024, "logps/chosen": -54.66441345214844, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629150390625, "logps/rejected": -88.17784118652344, "loss": 0.953, "rewards/accuracies": 0.828125, "rewards/chosen": -0.23922501504421234, "rewards/margins": 0.6837877035140991, "rewards/rejected": -0.9230127334594727, "step": 66 }, { "epoch": 0.09838472834067548, "epsilon_dpo/beta": 0.07342413067817688, "epsilon_dpo/beta_margin_grad_mean": -0.35666364431381226, "epsilon_dpo/beta_margin_grad_std": 0.15520258247852325, "epsilon_dpo/beta_margin_mean": 0.6994439959526062, "epsilon_dpo/beta_margin_std": 0.856719434261322, "epsilon_dpo/loss_margin_mean": 9.584704399108887, "grad_norm": 43.226829528808594, "kl/avg_steps": 0.5, "kl/beta": 0.07378572225570679, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.9449235200881958, "logits/rejected": -0.8122668266296387, "logps/chosen": -62.196495056152344, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78729248046875, "logps/rejected": -85.2276840209961, "loss": 0.9462, "rewards/accuracies": 0.8125, "rewards/chosen": -0.21227487921714783, "rewards/margins": 0.699444055557251, "rewards/rejected": -0.9117189049720764, "step": 67 }, { "epoch": 0.09985315712187959, "epsilon_dpo/beta": 0.07299000769853592, "epsilon_dpo/beta_margin_grad_mean": -0.3574289083480835, "epsilon_dpo/beta_margin_grad_std": 0.1356179416179657, "epsilon_dpo/beta_margin_mean": 0.6625055074691772, "epsilon_dpo/beta_margin_std": 0.7009024024009705, "epsilon_dpo/loss_margin_mean": 9.117342948913574, "grad_norm": 43.32976150512695, "kl/avg_steps": 0.59375, "kl/beta": 0.07341863214969635, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.9087271094322205, "logits/rejected": -0.74156653881073, "logps/chosen": -68.5649642944336, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -89.68373107910156, "loss": 0.9309, "rewards/accuracies": 0.875, "rewards/chosen": -0.24640579521656036, "rewards/margins": 0.6625055074691772, "rewards/rejected": -0.9089112877845764, "step": 68 }, { "epoch": 0.1013215859030837, "epsilon_dpo/beta": 0.07258199155330658, "epsilon_dpo/beta_margin_grad_mean": -0.3325771987438202, "epsilon_dpo/beta_margin_grad_std": 0.14603158831596375, "epsilon_dpo/beta_margin_mean": 0.8206022381782532, "epsilon_dpo/beta_margin_std": 0.8429078459739685, "epsilon_dpo/loss_margin_mean": 11.354151725769043, "grad_norm": 44.75086212158203, "kl/avg_steps": 0.5625, "kl/beta": 0.07298527657985687, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.9024134874343872, "logits/rejected": -0.7688239216804504, "logps/chosen": -63.78904342651367, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -118.71281433105469, "loss": 0.8579, "rewards/accuracies": 0.875, "rewards/chosen": -0.28851306438446045, "rewards/margins": 0.8206021785736084, "rewards/rejected": -1.1091153621673584, "step": 69 }, { "epoch": 0.1027900146842878, "epsilon_dpo/beta": 0.0721760094165802, "epsilon_dpo/beta_margin_grad_mean": -0.32791411876678467, "epsilon_dpo/beta_margin_grad_std": 0.17222338914871216, "epsilon_dpo/beta_margin_mean": 0.8811073899269104, "epsilon_dpo/beta_margin_std": 0.9766340851783752, "epsilon_dpo/loss_margin_mean": 12.275406837463379, "grad_norm": 48.5513801574707, "kl/avg_steps": 0.5625, "kl/beta": 0.07257703691720963, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5e-07, "logits/chosen": -0.9441779851913452, "logits/rejected": -0.8562849760055542, "logps/chosen": -67.6795654296875, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.060791015625, "logps/rejected": -109.08511352539062, "loss": 0.8686, "rewards/accuracies": 0.84375, "rewards/chosen": -0.41746020317077637, "rewards/margins": 0.8811073303222656, "rewards/rejected": -1.298567533493042, "step": 70 }, { "epoch": 0.10425844346549193, "epsilon_dpo/beta": 0.07177228480577469, "epsilon_dpo/beta_margin_grad_mean": -0.30481892824172974, "epsilon_dpo/beta_margin_grad_std": 0.15653713047504425, "epsilon_dpo/beta_margin_mean": 1.0375287532806396, "epsilon_dpo/beta_margin_std": 1.0877079963684082, "epsilon_dpo/loss_margin_mean": 14.51425838470459, "grad_norm": 43.49538040161133, "kl/avg_steps": 0.5625, "kl/beta": 0.07217106968164444, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.9862484931945801, "logits/rejected": -0.8802157044410706, "logps/chosen": -67.21176147460938, "logps/ref_chosen": -61.750343322753906, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -117.31230163574219, "loss": 0.7838, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3936518430709839, "rewards/margins": 1.0375288724899292, "rewards/rejected": -1.431180715560913, "step": 71 }, { "epoch": 0.10572687224669604, "epsilon_dpo/beta": 0.07130353897809982, "epsilon_dpo/beta_margin_grad_mean": -0.30151036381721497, "epsilon_dpo/beta_margin_grad_std": 0.1891818344593048, "epsilon_dpo/beta_margin_mean": 1.0700877904891968, "epsilon_dpo/beta_margin_std": 1.190470814704895, "epsilon_dpo/loss_margin_mean": 15.082220077514648, "grad_norm": 55.14749526977539, "kl/avg_steps": 0.65625, "kl/beta": 0.07176738232374191, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.9780253171920776, "logits/rejected": -0.8922737836837769, "logps/chosen": -72.2786636352539, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -116.5944595336914, "loss": 0.8218, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4470614492893219, "rewards/margins": 1.0700877904891968, "rewards/rejected": -1.5171492099761963, "step": 72 }, { "epoch": 0.10719530102790015, "epsilon_dpo/beta": 0.07097236067056656, "epsilon_dpo/beta_margin_grad_mean": -0.3319794535636902, "epsilon_dpo/beta_margin_grad_std": 0.23944905400276184, "epsilon_dpo/beta_margin_mean": 1.0602235794067383, "epsilon_dpo/beta_margin_std": 1.6266452074050903, "epsilon_dpo/loss_margin_mean": 15.071878433227539, "grad_norm": 63.31758499145508, "kl/avg_steps": 0.46875, "kl/beta": 0.0712994784116745, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.999703557245192e-07, "logits/chosen": -1.0396358966827393, "logits/rejected": -0.9342153072357178, "logps/chosen": -74.35950469970703, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613861083984, "logps/rejected": -113.63124084472656, "loss": 1.0033, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5787585973739624, "rewards/margins": 1.0602235794067383, "rewards/rejected": -1.6389822959899902, "step": 73 }, { "epoch": 0.10866372980910426, "epsilon_dpo/beta": 0.07061904668807983, "epsilon_dpo/beta_margin_grad_mean": -0.3144068717956543, "epsilon_dpo/beta_margin_grad_std": 0.20813730359077454, "epsilon_dpo/beta_margin_mean": 1.1345840692520142, "epsilon_dpo/beta_margin_std": 1.767999291419983, "epsilon_dpo/loss_margin_mean": 16.178430557250977, "grad_norm": 58.661258697509766, "kl/avg_steps": 0.5, "kl/beta": 0.07096681743860245, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.999472998758977e-07, "logits/chosen": -1.0253856182098389, "logits/rejected": -0.9532393217086792, "logps/chosen": -61.79954528808594, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -120.50003051757812, "loss": 0.9142, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5950509309768677, "rewards/margins": 1.1345840692520142, "rewards/rejected": -1.7296350002288818, "step": 74 }, { "epoch": 0.11013215859030837, "epsilon_dpo/beta": 0.07017943263053894, "epsilon_dpo/beta_margin_grad_mean": -0.2569746673107147, "epsilon_dpo/beta_margin_grad_std": 0.1957414597272873, "epsilon_dpo/beta_margin_mean": 1.5688680410385132, "epsilon_dpo/beta_margin_std": 1.6174330711364746, "epsilon_dpo/loss_margin_mean": 22.451784133911133, "grad_norm": 43.96208190917969, "kl/avg_steps": 0.625, "kl/beta": 0.07061374932527542, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.999176576834721e-07, "logits/chosen": -1.0390228033065796, "logits/rejected": -0.9516497254371643, "logps/chosen": -59.737876892089844, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25397491455078, "logps/rejected": -141.58197021484375, "loss": 0.6817, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5542978048324585, "rewards/margins": 1.5688681602478027, "rewards/rejected": -2.123166084289551, "step": 75 }, { "epoch": 0.11160058737151249, "epsilon_dpo/beta": 0.06976546347141266, "epsilon_dpo/beta_margin_grad_mean": -0.311787486076355, "epsilon_dpo/beta_margin_grad_std": 0.19355669617652893, "epsilon_dpo/beta_margin_mean": 0.982628583908081, "epsilon_dpo/beta_margin_std": 1.223758339881897, "epsilon_dpo/loss_margin_mean": 14.169283866882324, "grad_norm": 55.30027389526367, "kl/avg_steps": 0.59375, "kl/beta": 0.0701751559972763, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.998814299283415e-07, "logits/chosen": -1.0337252616882324, "logits/rejected": -0.9552336931228638, "logps/chosen": -62.58606719970703, "logps/ref_chosen": -53.26604080200195, "logps/ref_rejected": -78.21662139892578, "logps/rejected": -101.7059326171875, "loss": 0.8908, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6522326469421387, "rewards/margins": 0.9826285243034363, "rewards/rejected": -1.6348612308502197, "step": 76 }, { "epoch": 0.1130690161527166, "epsilon_dpo/beta": 0.06928827613592148, "epsilon_dpo/beta_margin_grad_mean": -0.24933093786239624, "epsilon_dpo/beta_margin_grad_std": 0.21627967059612274, "epsilon_dpo/beta_margin_mean": 1.558729648590088, "epsilon_dpo/beta_margin_std": 1.6443172693252563, "epsilon_dpo/loss_margin_mean": 22.599130630493164, "grad_norm": 63.71240997314453, "kl/avg_steps": 0.6875, "kl/beta": 0.06976094841957092, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.998386175651409e-07, "logits/chosen": -1.067899465560913, "logits/rejected": -1.0597002506256104, "logps/chosen": -66.05867004394531, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -124.33473205566406, "loss": 0.7442, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5564785003662109, "rewards/margins": 1.558729648590088, "rewards/rejected": -2.115208148956299, "step": 77 }, { "epoch": 0.1145374449339207, "epsilon_dpo/beta": 0.06890178471803665, "epsilon_dpo/beta_margin_grad_mean": -0.2992877960205078, "epsilon_dpo/beta_margin_grad_std": 0.2032037228345871, "epsilon_dpo/beta_margin_mean": 1.1561099290847778, "epsilon_dpo/beta_margin_std": 1.3384202718734741, "epsilon_dpo/loss_margin_mean": 16.87738037109375, "grad_norm": 54.934146881103516, "kl/avg_steps": 0.5625, "kl/beta": 0.06928461790084839, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.997892217220159e-07, "logits/chosen": -1.0190231800079346, "logits/rejected": -0.9725791215896606, "logps/chosen": -63.288551330566406, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -109.48651123046875, "loss": 0.8215, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5324300527572632, "rewards/margins": 1.1561098098754883, "rewards/rejected": -1.688539981842041, "step": 78 }, { "epoch": 0.11600587371512482, "epsilon_dpo/beta": 0.06860251724720001, "epsilon_dpo/beta_margin_grad_mean": -0.30763792991638184, "epsilon_dpo/beta_margin_grad_std": 0.2211284041404724, "epsilon_dpo/beta_margin_mean": 1.2479766607284546, "epsilon_dpo/beta_margin_std": 1.5779584646224976, "epsilon_dpo/loss_margin_mean": 18.32201385498047, "grad_norm": 49.639915466308594, "kl/avg_steps": 0.4375, "kl/beta": 0.0688970759510994, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.997332437005931e-07, "logits/chosen": -1.024808645248413, "logits/rejected": -0.9812244176864624, "logps/chosen": -63.02421188354492, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -113.54330444335938, "loss": 0.8577, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5230532288551331, "rewards/margins": 1.2479766607284546, "rewards/rejected": -1.7710298299789429, "step": 79 }, { "epoch": 0.11747430249632893, "epsilon_dpo/beta": 0.06835716217756271, "epsilon_dpo/beta_margin_grad_mean": -0.3214108943939209, "epsilon_dpo/beta_margin_grad_std": 0.21686489880084991, "epsilon_dpo/beta_margin_mean": 1.12192702293396, "epsilon_dpo/beta_margin_std": 1.5621590614318848, "epsilon_dpo/loss_margin_mean": 16.538299560546875, "grad_norm": 54.372802734375, "kl/avg_steps": 0.359375, "kl/beta": 0.06859695911407471, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.996706849759452e-07, "logits/chosen": -1.0545909404754639, "logits/rejected": -0.9393061399459839, "logps/chosen": -68.34993743896484, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -113.91645050048828, "loss": 0.9049, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6747243404388428, "rewards/margins": 1.12192702293396, "rewards/rejected": -1.7966513633728027, "step": 80 }, { "epoch": 0.11894273127753303, "epsilon_dpo/beta": 0.06799500435590744, "epsilon_dpo/beta_margin_grad_mean": -0.2737717628479004, "epsilon_dpo/beta_margin_grad_std": 0.22165818512439728, "epsilon_dpo/beta_margin_mean": 1.488916277885437, "epsilon_dpo/beta_margin_std": 1.7577481269836426, "epsilon_dpo/loss_margin_mean": 22.02306365966797, "grad_norm": 58.842952728271484, "kl/avg_steps": 0.53125, "kl/beta": 0.06835132092237473, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.996015471965529e-07, "logits/chosen": -1.1194102764129639, "logits/rejected": -1.0304535627365112, "logps/chosen": -74.84884643554688, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -160.08575439453125, "loss": 0.7737, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5745280981063843, "rewards/margins": 1.4889161586761475, "rewards/rejected": -2.063444137573242, "step": 81 }, { "epoch": 0.12041116005873716, "epsilon_dpo/beta": 0.06761444360017776, "epsilon_dpo/beta_margin_grad_mean": -0.31272122263908386, "epsilon_dpo/beta_margin_grad_std": 0.24187932908535004, "epsilon_dpo/beta_margin_mean": 1.176439881324768, "epsilon_dpo/beta_margin_std": 1.8416661024093628, "epsilon_dpo/loss_margin_mean": 17.541545867919922, "grad_norm": 72.6407699584961, "kl/avg_steps": 0.5625, "kl/beta": 0.06799012422561646, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.995258321842611e-07, "logits/chosen": -1.0402679443359375, "logits/rejected": -1.000870943069458, "logps/chosen": -62.255680084228516, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -118.30809020996094, "loss": 1.0089, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6823013424873352, "rewards/margins": 1.176439881324768, "rewards/rejected": -1.8587411642074585, "step": 82 }, { "epoch": 0.12187958883994127, "epsilon_dpo/beta": 0.06732076406478882, "epsilon_dpo/beta_margin_grad_mean": -0.29003429412841797, "epsilon_dpo/beta_margin_grad_std": 0.22237923741340637, "epsilon_dpo/beta_margin_mean": 1.3209935426712036, "epsilon_dpo/beta_margin_std": 1.548567295074463, "epsilon_dpo/loss_margin_mean": 19.759384155273438, "grad_norm": 59.325870513916016, "kl/avg_steps": 0.4375, "kl/beta": 0.06760982424020767, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.994435419342304e-07, "logits/chosen": -1.1070338487625122, "logits/rejected": -0.9864081144332886, "logps/chosen": -65.63611602783203, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71590423583984, "logps/rejected": -133.28402709960938, "loss": 0.8203, "rewards/accuracies": 0.8125, "rewards/chosen": -0.662756085395813, "rewards/margins": 1.3209935426712036, "rewards/rejected": -1.9837496280670166, "step": 83 }, { "epoch": 0.12334801762114538, "epsilon_dpo/beta": 0.06692232191562653, "epsilon_dpo/beta_margin_grad_mean": -0.2845773994922638, "epsilon_dpo/beta_margin_grad_std": 0.18569572269916534, "epsilon_dpo/beta_margin_mean": 1.1468771696090698, "epsilon_dpo/beta_margin_std": 1.1552484035491943, "epsilon_dpo/loss_margin_mean": 17.218107223510742, "grad_norm": 47.83616256713867, "kl/avg_steps": 0.59375, "kl/beta": 0.06731531769037247, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.993546786148857e-07, "logits/chosen": -1.0478136539459229, "logits/rejected": -1.0096745491027832, "logps/chosen": -75.1614990234375, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -112.50204467773438, "loss": 0.7733, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5357580184936523, "rewards/margins": 1.1468771696090698, "rewards/rejected": -1.6826353073120117, "step": 84 }, { "epoch": 0.12481644640234948, "epsilon_dpo/beta": 0.06669463217258453, "epsilon_dpo/beta_margin_grad_mean": -0.30865395069122314, "epsilon_dpo/beta_margin_grad_std": 0.20696672797203064, "epsilon_dpo/beta_margin_mean": 1.1267473697662354, "epsilon_dpo/beta_margin_std": 1.3287572860717773, "epsilon_dpo/loss_margin_mean": 17.026790618896484, "grad_norm": 52.211517333984375, "kl/avg_steps": 0.34375, "kl/beta": 0.06691799312829971, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.992592445678582e-07, "logits/chosen": -1.0085999965667725, "logits/rejected": -1.0254071950912476, "logps/chosen": -66.77091979980469, "logps/ref_chosen": -58.406620025634766, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -104.02989196777344, "loss": 0.8465, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5604950189590454, "rewards/margins": 1.1267473697662354, "rewards/rejected": -1.6872422695159912, "step": 85 }, { "epoch": 0.1262848751835536, "epsilon_dpo/beta": 0.06642446666955948, "epsilon_dpo/beta_margin_grad_mean": -0.32995861768722534, "epsilon_dpo/beta_margin_grad_std": 0.23720116913318634, "epsilon_dpo/beta_margin_mean": 1.1591877937316895, "epsilon_dpo/beta_margin_std": 1.8596081733703613, "epsilon_dpo/loss_margin_mean": 17.604108810424805, "grad_norm": 66.92620849609375, "kl/avg_steps": 0.40625, "kl/beta": 0.06668874621391296, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.991572423079235e-07, "logits/chosen": -1.037870168685913, "logits/rejected": -1.011461853981018, "logps/chosen": -66.24519348144531, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -115.83348083496094, "loss": 0.9947, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6760549545288086, "rewards/margins": 1.1591877937316895, "rewards/rejected": -1.835242748260498, "step": 86 }, { "epoch": 0.1277533039647577, "epsilon_dpo/beta": 0.06601040810346603, "epsilon_dpo/beta_margin_grad_mean": -0.2889256775379181, "epsilon_dpo/beta_margin_grad_std": 0.22318826615810394, "epsilon_dpo/beta_margin_mean": 1.2812418937683105, "epsilon_dpo/beta_margin_std": 1.6002838611602783, "epsilon_dpo/loss_margin_mean": 19.522178649902344, "grad_norm": 52.268272399902344, "kl/avg_steps": 0.625, "kl/beta": 0.06641892343759537, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.990486745229364e-07, "logits/chosen": -1.0352756977081299, "logits/rejected": -0.9126079082489014, "logps/chosen": -65.29768371582031, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -124.65135192871094, "loss": 0.8572, "rewards/accuracies": 0.828125, "rewards/chosen": -0.641059398651123, "rewards/margins": 1.2812418937683105, "rewards/rejected": -1.9223012924194336, "step": 87 }, { "epoch": 0.12922173274596183, "epsilon_dpo/beta": 0.06572417914867401, "epsilon_dpo/beta_margin_grad_mean": -0.33352962136268616, "epsilon_dpo/beta_margin_grad_std": 0.22125916182994843, "epsilon_dpo/beta_margin_mean": 0.9624807834625244, "epsilon_dpo/beta_margin_std": 1.440002202987671, "epsilon_dpo/loss_margin_mean": 14.769360542297363, "grad_norm": 58.52445602416992, "kl/avg_steps": 0.4375, "kl/beta": 0.0660063847899437, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.989335440737586e-07, "logits/chosen": -1.0081560611724854, "logits/rejected": -0.9678352475166321, "logps/chosen": -85.53303527832031, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -133.33975219726562, "loss": 0.9858, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7829633951187134, "rewards/margins": 0.9624808430671692, "rewards/rejected": -1.7454442977905273, "step": 88 }, { "epoch": 0.13069016152716592, "epsilon_dpo/beta": 0.06527356803417206, "epsilon_dpo/beta_margin_grad_mean": -0.31276053190231323, "epsilon_dpo/beta_margin_grad_std": 0.17064036428928375, "epsilon_dpo/beta_margin_mean": 0.9959526658058167, "epsilon_dpo/beta_margin_std": 1.146094799041748, "epsilon_dpo/loss_margin_mean": 15.32180404663086, "grad_norm": 42.91395950317383, "kl/avg_steps": 0.6875, "kl/beta": 0.06571885943412781, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.9808931946754456, "logits/rejected": -0.8977110385894775, "logps/chosen": -67.0821304321289, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -103.86257934570312, "loss": 0.8338, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4233604073524475, "rewards/margins": 0.9959526658058167, "rewards/rejected": -1.4193130731582642, "step": 89 }, { "epoch": 0.13215859030837004, "epsilon_dpo/beta": 0.0648890808224678, "epsilon_dpo/beta_margin_grad_mean": -0.31154337525367737, "epsilon_dpo/beta_margin_grad_std": 0.21933160722255707, "epsilon_dpo/beta_margin_mean": 1.2439275979995728, "epsilon_dpo/beta_margin_std": 1.7182693481445312, "epsilon_dpo/loss_margin_mean": 19.285219192504883, "grad_norm": 52.33935546875, "kl/avg_steps": 0.59375, "kl/beta": 0.06527013331651688, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.986836074908615e-07, "logits/chosen": -1.033602237701416, "logits/rejected": -0.973494291305542, "logps/chosen": -61.970252990722656, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -139.51487731933594, "loss": 0.8947, "rewards/accuracies": 0.828125, "rewards/chosen": -0.565422773361206, "rewards/margins": 1.2439274787902832, "rewards/rejected": -1.8093502521514893, "step": 90 }, { "epoch": 0.13362701908957417, "epsilon_dpo/beta": 0.0646277442574501, "epsilon_dpo/beta_margin_grad_mean": -0.3069167733192444, "epsilon_dpo/beta_margin_grad_std": 0.21806450188159943, "epsilon_dpo/beta_margin_mean": 1.1736429929733276, "epsilon_dpo/beta_margin_std": 1.418158769607544, "epsilon_dpo/loss_margin_mean": 18.3018741607666, "grad_norm": 51.4193229675293, "kl/avg_steps": 0.40625, "kl/beta": 0.06488487869501114, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.985488079432037e-07, "logits/chosen": -1.039869785308838, "logits/rejected": -1.0203289985656738, "logps/chosen": -69.62163543701172, "logps/ref_chosen": -61.80295944213867, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -113.9945068359375, "loss": 0.8525, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5083088874816895, "rewards/margins": 1.1736429929733276, "rewards/rejected": -1.681951880455017, "step": 91 }, { "epoch": 0.13509544787077826, "epsilon_dpo/beta": 0.06437624990940094, "epsilon_dpo/beta_margin_grad_mean": -0.31448835134506226, "epsilon_dpo/beta_margin_grad_std": 0.21340087056159973, "epsilon_dpo/beta_margin_mean": 1.106092929840088, "epsilon_dpo/beta_margin_std": 1.3934167623519897, "epsilon_dpo/loss_margin_mean": 17.31808853149414, "grad_norm": 45.67806625366211, "kl/avg_steps": 0.390625, "kl/beta": 0.06462235003709793, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.984074589033043e-07, "logits/chosen": -1.0301735401153564, "logits/rejected": -1.0114562511444092, "logps/chosen": -58.884674072265625, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -102.44317626953125, "loss": 0.8756, "rewards/accuracies": 0.75, "rewards/chosen": -0.469423770904541, "rewards/margins": 1.106092929840088, "rewards/rejected": -1.575516700744629, "step": 92 }, { "epoch": 0.13656387665198239, "epsilon_dpo/beta": 0.06395485997200012, "epsilon_dpo/beta_margin_grad_mean": -0.29885995388031006, "epsilon_dpo/beta_margin_grad_std": 0.17560887336730957, "epsilon_dpo/beta_margin_mean": 1.105426549911499, "epsilon_dpo/beta_margin_std": 1.2089306116104126, "epsilon_dpo/loss_margin_mean": 17.356679916381836, "grad_norm": 39.11857986450195, "kl/avg_steps": 0.65625, "kl/beta": 0.06437090039253235, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.982595640958425e-07, "logits/chosen": -1.0493080615997314, "logits/rejected": -0.9445855617523193, "logps/chosen": -60.38475036621094, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.1607437133789, "logps/rejected": -102.37294006347656, "loss": 0.7855, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5029890537261963, "rewards/margins": 1.105426549911499, "rewards/rejected": -1.6084156036376953, "step": 93 }, { "epoch": 0.13803230543318648, "epsilon_dpo/beta": 0.06359785050153732, "epsilon_dpo/beta_margin_grad_mean": -0.2872365713119507, "epsilon_dpo/beta_margin_grad_std": 0.18094860017299652, "epsilon_dpo/beta_margin_mean": 1.1820956468582153, "epsilon_dpo/beta_margin_std": 1.1670469045639038, "epsilon_dpo/loss_margin_mean": 18.67878532409668, "grad_norm": 41.127235412597656, "kl/avg_steps": 0.5625, "kl/beta": 0.06395121663808823, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.98105127417984e-07, "logits/chosen": -1.0464091300964355, "logits/rejected": -1.0004725456237793, "logps/chosen": -69.6494140625, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -126.70462036132812, "loss": 0.7487, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5378746390342712, "rewards/margins": 1.1820956468582153, "rewards/rejected": -1.7199702262878418, "step": 94 }, { "epoch": 0.1395007342143906, "epsilon_dpo/beta": 0.06318248808383942, "epsilon_dpo/beta_margin_grad_mean": -0.31443729996681213, "epsilon_dpo/beta_margin_grad_std": 0.17064118385314941, "epsilon_dpo/beta_margin_mean": 0.9451765418052673, "epsilon_dpo/beta_margin_std": 1.0015146732330322, "epsilon_dpo/loss_margin_mean": 15.025580406188965, "grad_norm": 40.1151237487793, "kl/avg_steps": 0.65625, "kl/beta": 0.06359350681304932, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.979441529392784e-07, "logits/chosen": -1.0391969680786133, "logits/rejected": -0.9397677779197693, "logps/chosen": -59.46489715576172, "logps/ref_chosen": -52.52364730834961, "logps/ref_rejected": -75.88035583496094, "logps/rejected": -97.84718322753906, "loss": 0.8374, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4412747621536255, "rewards/margins": 0.9451765418052673, "rewards/rejected": -1.386451244354248, "step": 95 }, { "epoch": 0.14096916299559473, "epsilon_dpo/beta": 0.06277056038379669, "epsilon_dpo/beta_margin_grad_mean": -0.27842971682548523, "epsilon_dpo/beta_margin_grad_std": 0.17272476851940155, "epsilon_dpo/beta_margin_mean": 1.2603580951690674, "epsilon_dpo/beta_margin_std": 1.311813235282898, "epsilon_dpo/loss_margin_mean": 20.152423858642578, "grad_norm": 40.65923309326172, "kl/avg_steps": 0.65625, "kl/beta": 0.0631788969039917, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.977766449015534e-07, "logits/chosen": -1.0081816911697388, "logits/rejected": -0.9590755701065063, "logps/chosen": -68.17340850830078, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -122.7648696899414, "loss": 0.727, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3794512152671814, "rewards/margins": 1.2603580951690674, "rewards/rejected": -1.6398093700408936, "step": 96 }, { "epoch": 0.14243759177679882, "epsilon_dpo/beta": 0.062439776957035065, "epsilon_dpo/beta_margin_grad_mean": -0.30438145995140076, "epsilon_dpo/beta_margin_grad_std": 0.16797401010990143, "epsilon_dpo/beta_margin_mean": 0.9990195631980896, "epsilon_dpo/beta_margin_std": 0.9852281808853149, "epsilon_dpo/loss_margin_mean": 16.083852767944336, "grad_norm": 42.303070068359375, "kl/avg_steps": 0.53125, "kl/beta": 0.06276698410511017, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.976026077188012e-07, "logits/chosen": -1.06300687789917, "logits/rejected": -0.8783408403396606, "logps/chosen": -61.59731674194336, "logps/ref_chosen": -54.64636993408203, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -99.99954986572266, "loss": 0.7976, "rewards/accuracies": 0.828125, "rewards/chosen": -0.43561291694641113, "rewards/margins": 0.9990196228027344, "rewards/rejected": -1.4346325397491455, "step": 97 }, { "epoch": 0.14390602055800295, "epsilon_dpo/beta": 0.062109820544719696, "epsilon_dpo/beta_margin_grad_mean": -0.2962830364704132, "epsilon_dpo/beta_margin_grad_std": 0.19924990832805634, "epsilon_dpo/beta_margin_mean": 1.0890288352966309, "epsilon_dpo/beta_margin_std": 1.186418890953064, "epsilon_dpo/loss_margin_mean": 17.645435333251953, "grad_norm": 43.674556732177734, "kl/avg_steps": 0.53125, "kl/beta": 0.062435299158096313, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.974220459770639e-07, "logits/chosen": -1.0256155729293823, "logits/rejected": -0.9630335569381714, "logps/chosen": -73.58518981933594, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -122.49948120117188, "loss": 0.8297, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5198632478713989, "rewards/margins": 1.0890288352966309, "rewards/rejected": -1.6088919639587402, "step": 98 }, { "epoch": 0.14537444933920704, "epsilon_dpo/beta": 0.06172337755560875, "epsilon_dpo/beta_margin_grad_mean": -0.29098957777023315, "epsilon_dpo/beta_margin_grad_std": 0.1694328337907791, "epsilon_dpo/beta_margin_mean": 1.2009061574935913, "epsilon_dpo/beta_margin_std": 1.299239993095398, "epsilon_dpo/loss_margin_mean": 19.53193473815918, "grad_norm": 39.50008010864258, "kl/avg_steps": 0.625, "kl/beta": 0.062105365097522736, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.972349644343108e-07, "logits/chosen": -1.0253294706344604, "logits/rejected": -0.9613098502159119, "logps/chosen": -52.740440368652344, "logps/ref_chosen": -45.63848114013672, "logps/ref_rejected": -86.43792724609375, "logps/rejected": -113.07182312011719, "loss": 0.7496, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4397137761116028, "rewards/margins": 1.2009060382843018, "rewards/rejected": -1.6406198740005493, "step": 99 }, { "epoch": 0.14684287812041116, "epsilon_dpo/beta": 0.061397869139909744, "epsilon_dpo/beta_margin_grad_mean": -0.34565305709838867, "epsilon_dpo/beta_margin_grad_std": 0.20056740939617157, "epsilon_dpo/beta_margin_mean": 0.8253319263458252, "epsilon_dpo/beta_margin_std": 1.1934319734573364, "epsilon_dpo/loss_margin_mean": 13.542726516723633, "grad_norm": 48.6550178527832, "kl/avg_steps": 0.53125, "kl/beta": 0.0617196150124073, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.9946659207344055, "logits/rejected": -0.8828315734863281, "logps/chosen": -65.0089111328125, "logps/ref_chosen": -57.5939826965332, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -95.01786804199219, "loss": 0.9809, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4583064317703247, "rewards/margins": 0.82533198595047, "rewards/rejected": -1.28363835811615, "step": 100 }, { "epoch": 0.14684287812041116, "eval_epsilon_dpo/beta": 0.061230581253767014, "eval_epsilon_dpo/beta_margin_grad_mean": -0.39301300048828125, "eval_epsilon_dpo/beta_margin_grad_std": 0.20782990753650665, "eval_epsilon_dpo/beta_margin_mean": 0.6044580340385437, "eval_epsilon_dpo/beta_margin_std": 1.2196648120880127, "eval_epsilon_dpo/loss_margin_mean": 10.001635551452637, "eval_kl/n_epsilon_steps": 0.3617294430732727, "eval_kl/p_epsilon_steps": 0.6369863152503967, "eval_logits/chosen": -0.9442313313484192, "eval_logits/rejected": -0.8901473879814148, "eval_logps/chosen": -90.66112518310547, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -108.40965270996094, "eval_loss": 0.5818918347358704, "eval_rewards/accuracies": 0.6926369667053223, "eval_rewards/chosen": -0.7142966389656067, "eval_rewards/margins": 0.6044580936431885, "eval_rewards/rejected": -1.3187546730041504, "eval_runtime": 43.1958, "eval_samples_per_second": 54.149, "eval_steps_per_second": 1.713, "step": 100 }, { "epoch": 0.14831130690161526, "epsilon_dpo/beta": 0.06118854135274887, "epsilon_dpo/beta_margin_grad_mean": -0.3312467336654663, "epsilon_dpo/beta_margin_grad_std": 0.19234833121299744, "epsilon_dpo/beta_margin_mean": 0.9552551507949829, "epsilon_dpo/beta_margin_std": 1.2462098598480225, "epsilon_dpo/loss_margin_mean": 15.74006462097168, "grad_norm": 43.89051055908203, "kl/avg_steps": 0.34375, "kl/beta": 0.061393462121486664, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.968412618365215e-07, "logits/chosen": -1.0468535423278809, "logits/rejected": -0.9560986757278442, "logps/chosen": -70.41122436523438, "logps/ref_chosen": -61.64884948730469, "logps/ref_rejected": -83.18968963623047, "logps/rejected": -107.69213104248047, "loss": 0.8997, "rewards/accuracies": 0.71875, "rewards/chosen": -0.538151741027832, "rewards/margins": 0.9552551507949829, "rewards/rejected": -1.4934070110321045, "step": 101 }, { "epoch": 0.14977973568281938, "epsilon_dpo/beta": 0.061055414378643036, "epsilon_dpo/beta_margin_grad_mean": -0.3631453812122345, "epsilon_dpo/beta_margin_grad_std": 0.20328044891357422, "epsilon_dpo/beta_margin_mean": 0.7262924313545227, "epsilon_dpo/beta_margin_std": 1.132162094116211, "epsilon_dpo/loss_margin_mean": 12.029383659362793, "grad_norm": 52.17008972167969, "kl/avg_steps": 0.21875, "kl/beta": 0.0611831471323967, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.966346511559149e-07, "logits/chosen": -1.0717209577560425, "logits/rejected": -0.9202479124069214, "logps/chosen": -74.46989440917969, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -90.60747528076172, "loss": 1.0303, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6374403238296509, "rewards/margins": 0.7262923717498779, "rewards/rejected": -1.3637328147888184, "step": 102 }, { "epoch": 0.1512481644640235, "epsilon_dpo/beta": 0.06069318577647209, "epsilon_dpo/beta_margin_grad_mean": -0.2652726471424103, "epsilon_dpo/beta_margin_grad_std": 0.16527369618415833, "epsilon_dpo/beta_margin_mean": 1.313633680343628, "epsilon_dpo/beta_margin_std": 1.19468092918396, "epsilon_dpo/loss_margin_mean": 21.731555938720703, "grad_norm": 37.721797943115234, "kl/avg_steps": 0.59375, "kl/beta": 0.06104959920048714, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.964215414228785e-07, "logits/chosen": -1.0768955945968628, "logits/rejected": -1.024803876876831, "logps/chosen": -67.62309265136719, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57271575927734, "logps/rejected": -121.62808227539062, "loss": 0.6741, "rewards/accuracies": 0.90625, "rewards/chosen": -0.38536566495895386, "rewards/margins": 1.3136337995529175, "rewards/rejected": -1.6989994049072266, "step": 103 }, { "epoch": 0.1527165932452276, "epsilon_dpo/beta": 0.06035391986370087, "epsilon_dpo/beta_margin_grad_mean": -0.2930792570114136, "epsilon_dpo/beta_margin_grad_std": 0.1929297298192978, "epsilon_dpo/beta_margin_mean": 1.2390574216842651, "epsilon_dpo/beta_margin_std": 1.3837366104125977, "epsilon_dpo/loss_margin_mean": 20.630268096923828, "grad_norm": 41.5215950012207, "kl/avg_steps": 0.5625, "kl/beta": 0.06068925932049751, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.96201938253052e-07, "logits/chosen": -1.0918048620224, "logits/rejected": -0.9679174423217773, "logps/chosen": -62.31468200683594, "logps/ref_chosen": -54.37277603149414, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -118.13689422607422, "loss": 0.7819, "rewards/accuracies": 0.875, "rewards/chosen": -0.4813727140426636, "rewards/margins": 1.2390574216842651, "rewards/rejected": -1.7204301357269287, "step": 104 }, { "epoch": 0.15418502202643172, "epsilon_dpo/beta": 0.05986543372273445, "epsilon_dpo/beta_margin_grad_mean": -0.21603839099407196, "epsilon_dpo/beta_margin_grad_std": 0.16202878952026367, "epsilon_dpo/beta_margin_mean": 1.6685210466384888, "epsilon_dpo/beta_margin_std": 1.2866175174713135, "epsilon_dpo/loss_margin_mean": 27.929157257080078, "grad_norm": 34.132720947265625, "kl/avg_steps": 0.8125, "kl/beta": 0.06034978851675987, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.959758474331832e-07, "logits/chosen": -1.059356927871704, "logits/rejected": -1.0111595392227173, "logps/chosen": -61.135772705078125, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -132.39950561523438, "loss": 0.5393, "rewards/accuracies": 0.953125, "rewards/chosen": -0.3894655704498291, "rewards/margins": 1.6685209274291992, "rewards/rejected": -2.0579864978790283, "step": 105 }, { "epoch": 0.15565345080763582, "epsilon_dpo/beta": 0.05953262001276016, "epsilon_dpo/beta_margin_grad_mean": -0.29940372705459595, "epsilon_dpo/beta_margin_grad_std": 0.17994937300682068, "epsilon_dpo/beta_margin_mean": 1.0705699920654297, "epsilon_dpo/beta_margin_std": 1.1025373935699463, "epsilon_dpo/loss_margin_mean": 18.07825469970703, "grad_norm": 38.402015686035156, "kl/avg_steps": 0.5625, "kl/beta": 0.05986339971423149, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.957432749209755e-07, "logits/chosen": -1.0292755365371704, "logits/rejected": -0.854220986366272, "logps/chosen": -62.74475860595703, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -111.21473693847656, "loss": 0.7896, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4724421501159668, "rewards/margins": 1.0705699920654297, "rewards/rejected": -1.5430121421813965, "step": 106 }, { "epoch": 0.15712187958883994, "epsilon_dpo/beta": 0.059143807739019394, "epsilon_dpo/beta_margin_grad_mean": -0.2823401391506195, "epsilon_dpo/beta_margin_grad_std": 0.19471247494220734, "epsilon_dpo/beta_margin_mean": 1.2131283283233643, "epsilon_dpo/beta_margin_std": 1.206012487411499, "epsilon_dpo/loss_margin_mean": 20.60358428955078, "grad_norm": 39.720516204833984, "kl/avg_steps": 0.65625, "kl/beta": 0.05952855199575424, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.955042268449307e-07, "logits/chosen": -1.0828496217727661, "logits/rejected": -1.0270475149154663, "logps/chosen": -79.37728118896484, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -125.01255798339844, "loss": 0.7603, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5732927322387695, "rewards/margins": 1.2131283283233643, "rewards/rejected": -1.7864210605621338, "step": 107 }, { "epoch": 0.15859030837004406, "epsilon_dpo/beta": 0.05883214250206947, "epsilon_dpo/beta_margin_grad_mean": -0.3005242347717285, "epsilon_dpo/beta_margin_grad_std": 0.21865715086460114, "epsilon_dpo/beta_margin_mean": 1.2708423137664795, "epsilon_dpo/beta_margin_std": 1.5279698371887207, "epsilon_dpo/loss_margin_mean": 21.735319137573242, "grad_norm": 44.16614532470703, "kl/avg_steps": 0.53125, "kl/beta": 0.05914044380187988, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.952587095041881e-07, "logits/chosen": -1.11293625831604, "logits/rejected": -0.9888235330581665, "logps/chosen": -65.3058853149414, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -126.82733154296875, "loss": 0.8338, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5490582585334778, "rewards/margins": 1.2708423137664795, "rewards/rejected": -1.819900631904602, "step": 108 }, { "epoch": 0.16005873715124816, "epsilon_dpo/beta": 0.05848447605967522, "epsilon_dpo/beta_margin_grad_mean": -0.2573312222957611, "epsilon_dpo/beta_margin_grad_std": 0.1849949210882187, "epsilon_dpo/beta_margin_mean": 1.4004172086715698, "epsilon_dpo/beta_margin_std": 1.2429378032684326, "epsilon_dpo/loss_margin_mean": 24.054006576538086, "grad_norm": 38.34709548950195, "kl/avg_steps": 0.59375, "kl/beta": 0.05882791802287102, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.9814876317977905, "logits/rejected": -0.9408524036407471, "logps/chosen": -71.12234497070312, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -130.97659301757812, "loss": 0.6671, "rewards/accuracies": 0.875, "rewards/chosen": -0.48322200775146484, "rewards/margins": 1.4004172086715698, "rewards/rejected": -1.8836392164230347, "step": 109 }, { "epoch": 0.16152716593245228, "epsilon_dpo/beta": 0.058102719485759735, "epsilon_dpo/beta_margin_grad_mean": -0.28056707978248596, "epsilon_dpo/beta_margin_grad_std": 0.2003585249185562, "epsilon_dpo/beta_margin_mean": 1.2914304733276367, "epsilon_dpo/beta_margin_std": 1.4332082271575928, "epsilon_dpo/loss_margin_mean": 22.326982498168945, "grad_norm": 39.42861557006836, "kl/avg_steps": 0.65625, "kl/beta": 0.05848069116473198, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.947482930773511e-07, "logits/chosen": -1.0638456344604492, "logits/rejected": -0.9847558736801147, "logps/chosen": -66.33636474609375, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -109.65968322753906, "loss": 0.7777, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4425739347934723, "rewards/margins": 1.2914303541183472, "rewards/rejected": -1.734004259109497, "step": 110 }, { "epoch": 0.16299559471365638, "epsilon_dpo/beta": 0.05779653787612915, "epsilon_dpo/beta_margin_grad_mean": -0.2765791118144989, "epsilon_dpo/beta_margin_grad_std": 0.20906668901443481, "epsilon_dpo/beta_margin_mean": 1.3465930223464966, "epsilon_dpo/beta_margin_std": 1.4630502462387085, "epsilon_dpo/loss_margin_mean": 23.435821533203125, "grad_norm": 43.166236877441406, "kl/avg_steps": 0.53125, "kl/beta": 0.0580994114279747, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.985710859298706, "logits/rejected": -0.9343112111091614, "logps/chosen": -78.71395874023438, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -131.9545440673828, "loss": 0.7691, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5861130952835083, "rewards/margins": 1.3465930223464966, "rewards/rejected": -1.9327061176300049, "step": 111 }, { "epoch": 0.1644640234948605, "epsilon_dpo/beta": 0.05750006064772606, "epsilon_dpo/beta_margin_grad_mean": -0.32882383465766907, "epsilon_dpo/beta_margin_grad_std": 0.1954524666070938, "epsilon_dpo/beta_margin_mean": 0.9281901717185974, "epsilon_dpo/beta_margin_std": 1.1477144956588745, "epsilon_dpo/loss_margin_mean": 16.249736785888672, "grad_norm": 42.380157470703125, "kl/avg_steps": 0.515625, "kl/beta": 0.05779239162802696, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.942120794399002e-07, "logits/chosen": -1.0260038375854492, "logits/rejected": -0.8925676345825195, "logps/chosen": -60.01226043701172, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -90.78678894042969, "loss": 0.9025, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5631338953971863, "rewards/margins": 0.9281901121139526, "rewards/rejected": -1.4913240671157837, "step": 112 }, { "epoch": 0.16593245227606462, "epsilon_dpo/beta": 0.057196203619241714, "epsilon_dpo/beta_margin_grad_mean": -0.29975658655166626, "epsilon_dpo/beta_margin_grad_std": 0.17696668207645416, "epsilon_dpo/beta_margin_mean": 1.0448459386825562, "epsilon_dpo/beta_margin_std": 1.0058784484863281, "epsilon_dpo/loss_margin_mean": 18.36528205871582, "grad_norm": 40.14544677734375, "kl/avg_steps": 0.53125, "kl/beta": 0.057495929300785065, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.939343162231841e-07, "logits/chosen": -1.042406439781189, "logits/rejected": -0.9559611082077026, "logps/chosen": -76.94298553466797, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -106.56402587890625, "loss": 0.7846, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5865628719329834, "rewards/margins": 1.0448459386825562, "rewards/rejected": -1.63140869140625, "step": 113 }, { "epoch": 0.16740088105726872, "epsilon_dpo/beta": 0.05689394846558571, "epsilon_dpo/beta_margin_grad_mean": -0.26561489701271057, "epsilon_dpo/beta_margin_grad_std": 0.1946914792060852, "epsilon_dpo/beta_margin_mean": 1.4782589673995972, "epsilon_dpo/beta_margin_std": 1.6168479919433594, "epsilon_dpo/loss_margin_mean": 26.11954689025879, "grad_norm": 37.15214157104492, "kl/avg_steps": 0.53125, "kl/beta": 0.05719209462404251, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.936501251103751e-07, "logits/chosen": -1.0384024381637573, "logits/rejected": -0.9960717558860779, "logps/chosen": -67.10043334960938, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -122.54457092285156, "loss": 0.7, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5322611331939697, "rewards/margins": 1.4782588481903076, "rewards/rejected": -2.0105199813842773, "step": 114 }, { "epoch": 0.16886930983847284, "epsilon_dpo/beta": 0.05662885308265686, "epsilon_dpo/beta_margin_grad_mean": -0.3195987939834595, "epsilon_dpo/beta_margin_grad_std": 0.20884445309638977, "epsilon_dpo/beta_margin_mean": 1.0727407932281494, "epsilon_dpo/beta_margin_std": 1.4862561225891113, "epsilon_dpo/loss_margin_mean": 19.081743240356445, "grad_norm": 56.88962173461914, "kl/avg_steps": 0.46875, "kl/beta": 0.056889865547418594, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.933595135901732e-07, "logits/chosen": -1.0595698356628418, "logits/rejected": -0.9536565542221069, "logps/chosen": -78.7204360961914, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -130.78504943847656, "loss": 0.9201, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7461957335472107, "rewards/margins": 1.0727407932281494, "rewards/rejected": -1.8189365863800049, "step": 115 }, { "epoch": 0.17033773861967694, "epsilon_dpo/beta": 0.05629386007785797, "epsilon_dpo/beta_margin_grad_mean": -0.2952349781990051, "epsilon_dpo/beta_margin_grad_std": 0.1610753983259201, "epsilon_dpo/beta_margin_mean": 1.086800456047058, "epsilon_dpo/beta_margin_std": 1.0847556591033936, "epsilon_dpo/loss_margin_mean": 19.388042449951172, "grad_norm": 39.08403778076172, "kl/avg_steps": 0.59375, "kl/beta": 0.05662443861365318, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.930624893204624e-07, "logits/chosen": -1.0231701135635376, "logits/rejected": -0.954992413520813, "logps/chosen": -61.05341720581055, "logps/ref_chosen": -51.40031051635742, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -109.56300354003906, "loss": 0.7571, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5440513491630554, "rewards/margins": 1.086800456047058, "rewards/rejected": -1.6308517456054688, "step": 116 }, { "epoch": 0.17180616740088106, "epsilon_dpo/beta": 0.05599677190184593, "epsilon_dpo/beta_margin_grad_mean": -0.31095898151397705, "epsilon_dpo/beta_margin_grad_std": 0.18366140127182007, "epsilon_dpo/beta_margin_mean": 1.0290024280548096, "epsilon_dpo/beta_margin_std": 1.1844969987869263, "epsilon_dpo/loss_margin_mean": 18.480804443359375, "grad_norm": 47.90737533569336, "kl/avg_steps": 0.53125, "kl/beta": 0.056290216743946075, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.9935444593429565, "logits/rejected": -0.9353665113449097, "logps/chosen": -80.16091918945312, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.58399200439453, "logps/rejected": -95.92730712890625, "loss": 0.8342, "rewards/accuracies": 0.859375, "rewards/chosen": -0.611723780632019, "rewards/margins": 1.0290024280548096, "rewards/rejected": -1.6407263278961182, "step": 117 }, { "epoch": 0.17327459618208516, "epsilon_dpo/beta": 0.05568336695432663, "epsilon_dpo/beta_margin_grad_mean": -0.3081415593624115, "epsilon_dpo/beta_margin_grad_std": 0.1649520993232727, "epsilon_dpo/beta_margin_mean": 1.0005450248718262, "epsilon_dpo/beta_margin_std": 1.0125638246536255, "epsilon_dpo/loss_margin_mean": 18.05414581298828, "grad_norm": 38.3569450378418, "kl/avg_steps": 0.5625, "kl/beta": 0.05599275603890419, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.924492340087524e-07, "logits/chosen": -1.0015466213226318, "logits/rejected": -0.8971748352050781, "logps/chosen": -66.13504791259766, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905212402344, "logps/rejected": -104.21726989746094, "loss": 0.799, "rewards/accuracies": 0.875, "rewards/chosen": -0.5855417847633362, "rewards/margins": 1.0005450248718262, "rewards/rejected": -1.5860867500305176, "step": 118 }, { "epoch": 0.17474302496328928, "epsilon_dpo/beta": 0.05544150620698929, "epsilon_dpo/beta_margin_grad_mean": -0.31184121966362, "epsilon_dpo/beta_margin_grad_std": 0.19244518876075745, "epsilon_dpo/beta_margin_mean": 1.0441036224365234, "epsilon_dpo/beta_margin_std": 1.2219512462615967, "epsilon_dpo/loss_margin_mean": 18.95892333984375, "grad_norm": 42.377586364746094, "kl/avg_steps": 0.4375, "kl/beta": 0.0556795597076416, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.92133019126601e-07, "logits/chosen": -1.0072647333145142, "logits/rejected": -0.9816898107528687, "logps/chosen": -85.67257690429688, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.97728729248047, "logps/rejected": -134.0985870361328, "loss": 0.8431, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6764189004898071, "rewards/margins": 1.0441036224365234, "rewards/rejected": -1.7205225229263306, "step": 119 }, { "epoch": 0.1762114537444934, "epsilon_dpo/beta": 0.05509604886174202, "epsilon_dpo/beta_margin_grad_mean": -0.2703264355659485, "epsilon_dpo/beta_margin_grad_std": 0.19106176495552063, "epsilon_dpo/beta_margin_mean": 1.3362863063812256, "epsilon_dpo/beta_margin_std": 1.3112635612487793, "epsilon_dpo/loss_margin_mean": 24.35825538635254, "grad_norm": 39.41484069824219, "kl/avg_steps": 0.625, "kl/beta": 0.05543702095746994, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.9771697521209717, "logits/rejected": -0.9445418119430542, "logps/chosen": -90.03053283691406, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -145.63168334960938, "loss": 0.7151, "rewards/accuracies": 0.875, "rewards/chosen": -0.7321771383285522, "rewards/margins": 1.3362863063812256, "rewards/rejected": -2.0684633255004883, "step": 120 }, { "epoch": 0.1776798825256975, "epsilon_dpo/beta": 0.05478827282786369, "epsilon_dpo/beta_margin_grad_mean": -0.26886865496635437, "epsilon_dpo/beta_margin_grad_std": 0.19572946429252625, "epsilon_dpo/beta_margin_mean": 1.4292089939117432, "epsilon_dpo/beta_margin_std": 1.4435065984725952, "epsilon_dpo/loss_margin_mean": 26.207809448242188, "grad_norm": 38.79628372192383, "kl/avg_steps": 0.5625, "kl/beta": 0.055092692375183105, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.9736794233322144, "logits/rejected": -0.9580074548721313, "logps/chosen": -74.22486877441406, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -148.63735961914062, "loss": 0.7091, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6830889582633972, "rewards/margins": 1.4292089939117432, "rewards/rejected": -2.112298011779785, "step": 121 }, { "epoch": 0.17914831130690162, "epsilon_dpo/beta": 0.054310593754053116, "epsilon_dpo/beta_margin_grad_mean": -0.21327242255210876, "epsilon_dpo/beta_margin_grad_std": 0.17507927119731903, "epsilon_dpo/beta_margin_mean": 1.7653812170028687, "epsilon_dpo/beta_margin_std": 1.3834340572357178, "epsilon_dpo/loss_margin_mean": 32.55738830566406, "grad_norm": 35.855751037597656, "kl/avg_steps": 0.875, "kl/beta": 0.05478452891111374, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.911461260693638e-07, "logits/chosen": -1.0290205478668213, "logits/rejected": -0.9942089319229126, "logps/chosen": -58.41440200805664, "logps/ref_chosen": -46.90221405029297, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -150.78378295898438, "loss": 0.5408, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6259321570396423, "rewards/margins": 1.765381097793579, "rewards/rejected": -2.391313314437866, "step": 122 }, { "epoch": 0.18061674008810572, "epsilon_dpo/beta": 0.05406015366315842, "epsilon_dpo/beta_margin_grad_mean": -0.30481162667274475, "epsilon_dpo/beta_margin_grad_std": 0.22019901871681213, "epsilon_dpo/beta_margin_mean": 1.1788209676742554, "epsilon_dpo/beta_margin_std": 1.5325920581817627, "epsilon_dpo/loss_margin_mean": 21.96431541442871, "grad_norm": 47.754364013671875, "kl/avg_steps": 0.46875, "kl/beta": 0.05430932343006134, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.9941624402999878, "logits/rejected": -0.912921130657196, "logps/chosen": -73.27442169189453, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.77539825439453, "logps/rejected": -121.67549896240234, "loss": 0.8874, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6485357284545898, "rewards/margins": 1.178821086883545, "rewards/rejected": -1.8273568153381348, "step": 123 }, { "epoch": 0.18208516886930984, "epsilon_dpo/beta": 0.05372345820069313, "epsilon_dpo/beta_margin_grad_mean": -0.27301228046417236, "epsilon_dpo/beta_margin_grad_std": 0.21599474549293518, "epsilon_dpo/beta_margin_mean": 1.5537796020507812, "epsilon_dpo/beta_margin_std": 1.8719865083694458, "epsilon_dpo/loss_margin_mean": 29.06615447998047, "grad_norm": 47.07590866088867, "kl/avg_steps": 0.625, "kl/beta": 0.05405593663454056, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.9897609353065491, "logits/rejected": -0.9946451187133789, "logps/chosen": -84.17681121826172, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -159.37521362304688, "loss": 0.7583, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6873254776000977, "rewards/margins": 1.5537796020507812, "rewards/rejected": -2.241105079650879, "step": 124 }, { "epoch": 0.18355359765051396, "epsilon_dpo/beta": 0.0533394031226635, "epsilon_dpo/beta_margin_grad_mean": -0.26133814454078674, "epsilon_dpo/beta_margin_grad_std": 0.17945978045463562, "epsilon_dpo/beta_margin_mean": 1.4211194515228271, "epsilon_dpo/beta_margin_std": 1.3498996496200562, "epsilon_dpo/loss_margin_mean": 26.72475814819336, "grad_norm": 38.591426849365234, "kl/avg_steps": 0.71875, "kl/beta": 0.05372018367052078, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.90102044194588e-07, "logits/chosen": -1.0314831733703613, "logits/rejected": -1.006223201751709, "logps/chosen": -59.979026794433594, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -120.55546569824219, "loss": 0.6743, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5260579586029053, "rewards/margins": 1.4211194515228271, "rewards/rejected": -1.947177529335022, "step": 125 }, { "epoch": 0.18502202643171806, "epsilon_dpo/beta": 0.052958764135837555, "epsilon_dpo/beta_margin_grad_mean": -0.26814988255500793, "epsilon_dpo/beta_margin_grad_std": 0.18324759602546692, "epsilon_dpo/beta_margin_mean": 1.3413618803024292, "epsilon_dpo/beta_margin_std": 1.3076198101043701, "epsilon_dpo/loss_margin_mean": 25.416467666625977, "grad_norm": 40.11631393432617, "kl/avg_steps": 0.71875, "kl/beta": 0.05333682522177696, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.897413506838102e-07, "logits/chosen": -1.0392162799835205, "logits/rejected": -0.9651429653167725, "logps/chosen": -66.92804718017578, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -134.8072052001953, "loss": 0.7042, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5989965200424194, "rewards/margins": 1.3413619995117188, "rewards/rejected": -1.9403584003448486, "step": 126 }, { "epoch": 0.18649045521292218, "epsilon_dpo/beta": 0.05264703556895256, "epsilon_dpo/beta_margin_grad_mean": -0.293317586183548, "epsilon_dpo/beta_margin_grad_std": 0.18321077525615692, "epsilon_dpo/beta_margin_mean": 1.1014037132263184, "epsilon_dpo/beta_margin_std": 1.0729624032974243, "epsilon_dpo/loss_margin_mean": 21.02937889099121, "grad_norm": 37.68070602416992, "kl/avg_steps": 0.59375, "kl/beta": 0.05295620113611221, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.89374339765481e-07, "logits/chosen": -1.018848180770874, "logits/rejected": -0.9406877756118774, "logps/chosen": -66.27595520019531, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -107.54643249511719, "loss": 0.774, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5129358172416687, "rewards/margins": 1.1014037132263184, "rewards/rejected": -1.6143395900726318, "step": 127 }, { "epoch": 0.18795888399412627, "epsilon_dpo/beta": 0.05238565057516098, "epsilon_dpo/beta_margin_grad_mean": -0.30218613147735596, "epsilon_dpo/beta_margin_grad_std": 0.19723427295684814, "epsilon_dpo/beta_margin_mean": 1.1567561626434326, "epsilon_dpo/beta_margin_std": 1.334637999534607, "epsilon_dpo/loss_margin_mean": 22.21176528930664, "grad_norm": 41.57168197631836, "kl/avg_steps": 0.5, "kl/beta": 0.052643630653619766, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.890010211106795e-07, "logits/chosen": -1.0094084739685059, "logits/rejected": -0.9334766268730164, "logps/chosen": -68.6881332397461, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -109.21790313720703, "loss": 0.8188, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5555476546287537, "rewards/margins": 1.1567561626434326, "rewards/rejected": -1.712303876876831, "step": 128 }, { "epoch": 0.1894273127753304, "epsilon_dpo/beta": 0.05219051241874695, "epsilon_dpo/beta_margin_grad_mean": -0.30846473574638367, "epsilon_dpo/beta_margin_grad_std": 0.2237718552350998, "epsilon_dpo/beta_margin_mean": 1.1693782806396484, "epsilon_dpo/beta_margin_std": 1.5052525997161865, "epsilon_dpo/loss_margin_mean": 22.593595504760742, "grad_norm": 49.767723083496094, "kl/avg_steps": 0.375, "kl/beta": 0.0523817241191864, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.88621404556699e-07, "logits/chosen": -1.002424716949463, "logits/rejected": -0.9351119995117188, "logps/chosen": -81.81732177734375, "logps/ref_chosen": -66.91636657714844, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -134.1367645263672, "loss": 0.888, "rewards/accuracies": 0.75, "rewards/chosen": -0.7800949811935425, "rewards/margins": 1.1693782806396484, "rewards/rejected": -1.9494731426239014, "step": 129 }, { "epoch": 0.19089574155653452, "epsilon_dpo/beta": 0.05187311768531799, "epsilon_dpo/beta_margin_grad_mean": -0.24806594848632812, "epsilon_dpo/beta_margin_grad_std": 0.21258293092250824, "epsilon_dpo/beta_margin_mean": 1.5920820236206055, "epsilon_dpo/beta_margin_std": 1.5427429676055908, "epsilon_dpo/loss_margin_mean": 30.837533950805664, "grad_norm": 36.21017837524414, "kl/avg_steps": 0.609375, "kl/beta": 0.0521860234439373, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.9198658466339111, "logits/rejected": -0.914577305316925, "logps/chosen": -55.8094482421875, "logps/ref_chosen": -44.666847229003906, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -124.76178741455078, "loss": 0.6934, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5801820158958435, "rewards/margins": 1.592081904411316, "rewards/rejected": -2.1722640991210938, "step": 130 }, { "epoch": 0.19236417033773862, "epsilon_dpo/beta": 0.05145364627242088, "epsilon_dpo/beta_margin_grad_mean": -0.24963055551052094, "epsilon_dpo/beta_margin_grad_std": 0.1847582757472992, "epsilon_dpo/beta_margin_mean": 1.4447486400604248, "epsilon_dpo/beta_margin_std": 1.290471076965332, "epsilon_dpo/loss_margin_mean": 28.153446197509766, "grad_norm": 33.95417404174805, "kl/avg_steps": 0.8125, "kl/beta": 0.051869943737983704, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.9668587446212769, "logits/rejected": -0.9406773447990417, "logps/chosen": -53.15331268310547, "logps/ref_chosen": -44.92458724975586, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -124.82617950439453, "loss": 0.6592, "rewards/accuracies": 0.921875, "rewards/chosen": -0.424873948097229, "rewards/margins": 1.4447486400604248, "rewards/rejected": -1.8696224689483643, "step": 131 }, { "epoch": 0.19383259911894274, "epsilon_dpo/beta": 0.05107111111283302, "epsilon_dpo/beta_margin_grad_mean": -0.2733632028102875, "epsilon_dpo/beta_margin_grad_std": 0.18104958534240723, "epsilon_dpo/beta_margin_mean": 1.331608533859253, "epsilon_dpo/beta_margin_std": 1.3731529712677002, "epsilon_dpo/loss_margin_mean": 26.155336380004883, "grad_norm": 40.16446304321289, "kl/avg_steps": 0.75, "kl/beta": 0.051451895385980606, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.9922490119934082, "logits/rejected": -0.9857528209686279, "logps/chosen": -70.96759033203125, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -126.01399230957031, "loss": 0.7129, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6130307912826538, "rewards/margins": 1.331608533859253, "rewards/rejected": -1.9446392059326172, "step": 132 }, { "epoch": 0.19530102790014683, "epsilon_dpo/beta": 0.05072285234928131, "epsilon_dpo/beta_margin_grad_mean": -0.3002629578113556, "epsilon_dpo/beta_margin_grad_std": 0.20250080525875092, "epsilon_dpo/beta_margin_mean": 1.211213231086731, "epsilon_dpo/beta_margin_std": 1.4373672008514404, "epsilon_dpo/loss_margin_mean": 23.984243392944336, "grad_norm": 39.40069580078125, "kl/avg_steps": 0.6875, "kl/beta": 0.051068879663944244, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.9905763268470764, "logits/rejected": -0.9981982111930847, "logps/chosen": -80.45793914794922, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -134.17124938964844, "loss": 0.817, "rewards/accuracies": 0.875, "rewards/chosen": -0.7051622867584229, "rewards/margins": 1.2112131118774414, "rewards/rejected": -1.9163753986358643, "step": 133 }, { "epoch": 0.19676945668135096, "epsilon_dpo/beta": 0.050313107669353485, "epsilon_dpo/beta_margin_grad_mean": -0.2648751139640808, "epsilon_dpo/beta_margin_grad_std": 0.1557236611843109, "epsilon_dpo/beta_margin_mean": 1.313256859779358, "epsilon_dpo/beta_margin_std": 1.1702913045883179, "epsilon_dpo/loss_margin_mean": 26.153579711914062, "grad_norm": 35.90312194824219, "kl/avg_steps": 0.8125, "kl/beta": 0.050720177590847015, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.866292092063986e-07, "logits/chosen": -1.0099027156829834, "logits/rejected": -0.9128708839416504, "logps/chosen": -62.06022644042969, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -123.79906463623047, "loss": 0.6622, "rewards/accuracies": 0.953125, "rewards/chosen": -0.5038759708404541, "rewards/margins": 1.3132569789886475, "rewards/rejected": -1.817132830619812, "step": 134 }, { "epoch": 0.19823788546255505, "epsilon_dpo/beta": 0.04997050389647484, "epsilon_dpo/beta_margin_grad_mean": -0.2640392780303955, "epsilon_dpo/beta_margin_grad_std": 0.19785915315151215, "epsilon_dpo/beta_margin_mean": 1.470131754875183, "epsilon_dpo/beta_margin_std": 1.504582405090332, "epsilon_dpo/loss_margin_mean": 29.53673553466797, "grad_norm": 38.08821105957031, "kl/avg_steps": 0.6875, "kl/beta": 0.05031139776110649, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.9997594356536865, "logits/rejected": -0.9596564769744873, "logps/chosen": -63.85767364501953, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -159.02029418945312, "loss": 0.7081, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6766850352287292, "rewards/margins": 1.4701316356658936, "rewards/rejected": -2.1468167304992676, "step": 135 }, { "epoch": 0.19970631424375918, "epsilon_dpo/beta": 0.04970738664269447, "epsilon_dpo/beta_margin_grad_mean": -0.3144148588180542, "epsilon_dpo/beta_margin_grad_std": 0.17996704578399658, "epsilon_dpo/beta_margin_mean": 1.1049261093139648, "epsilon_dpo/beta_margin_std": 1.4463449716567993, "epsilon_dpo/loss_margin_mean": 22.351482391357422, "grad_norm": 42.638153076171875, "kl/avg_steps": 0.53125, "kl/beta": 0.049967870116233826, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.995400071144104, "logits/rejected": -0.9106892347335815, "logps/chosen": -79.67277526855469, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -133.27297973632812, "loss": 0.8294, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7287014126777649, "rewards/margins": 1.1049261093139648, "rewards/rejected": -1.833627462387085, "step": 136 }, { "epoch": 0.2011747430249633, "epsilon_dpo/beta": 0.04935150593519211, "epsilon_dpo/beta_margin_grad_mean": -0.26569247245788574, "epsilon_dpo/beta_margin_grad_std": 0.21095705032348633, "epsilon_dpo/beta_margin_mean": 1.5682029724121094, "epsilon_dpo/beta_margin_std": 1.7559775114059448, "epsilon_dpo/loss_margin_mean": 31.89786148071289, "grad_norm": 42.921722412109375, "kl/avg_steps": 0.71875, "kl/beta": 0.04970381781458855, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.9733572602272034, "logits/rejected": -1.0097665786743164, "logps/chosen": -64.98374938964844, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86377716064453, "logps/rejected": -161.98626708984375, "loss": 0.7372, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8035503625869751, "rewards/margins": 1.5682029724121094, "rewards/rejected": -2.371753215789795, "step": 137 }, { "epoch": 0.2026431718061674, "epsilon_dpo/beta": 0.04903016984462738, "epsilon_dpo/beta_margin_grad_mean": -0.2841445505619049, "epsilon_dpo/beta_margin_grad_std": 0.20331744849681854, "epsilon_dpo/beta_margin_mean": 1.2138417959213257, "epsilon_dpo/beta_margin_std": 1.288333535194397, "epsilon_dpo/loss_margin_mean": 24.87476348876953, "grad_norm": 48.054229736328125, "kl/avg_steps": 0.65625, "kl/beta": 0.04934912174940109, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.9696782827377319, "logits/rejected": -0.9463719129562378, "logps/chosen": -76.98085021972656, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -134.53289794921875, "loss": 0.7886, "rewards/accuracies": 0.875, "rewards/chosen": -0.8097862601280212, "rewards/margins": 1.2138417959213257, "rewards/rejected": -2.0236282348632812, "step": 138 }, { "epoch": 0.20411160058737152, "epsilon_dpo/beta": 0.048725828528404236, "epsilon_dpo/beta_margin_grad_mean": -0.25709182024002075, "epsilon_dpo/beta_margin_grad_std": 0.1820717751979828, "epsilon_dpo/beta_margin_mean": 1.4469445943832397, "epsilon_dpo/beta_margin_std": 1.3607902526855469, "epsilon_dpo/loss_margin_mean": 29.81831169128418, "grad_norm": 35.817955017089844, "kl/avg_steps": 0.625, "kl/beta": 0.04902737960219383, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.956308126449585, "logits/rejected": -0.9260801076889038, "logps/chosen": -59.003021240234375, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -121.65792846679688, "loss": 0.6635, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5916452407836914, "rewards/margins": 1.4469447135925293, "rewards/rejected": -2.0385899543762207, "step": 139 }, { "epoch": 0.2055800293685756, "epsilon_dpo/beta": 0.04848409444093704, "epsilon_dpo/beta_margin_grad_mean": -0.28536489605903625, "epsilon_dpo/beta_margin_grad_std": 0.21349573135375977, "epsilon_dpo/beta_margin_mean": 1.322952389717102, "epsilon_dpo/beta_margin_std": 1.463498830795288, "epsilon_dpo/loss_margin_mean": 27.453405380249023, "grad_norm": 39.32830047607422, "kl/avg_steps": 0.5, "kl/beta": 0.04872285947203636, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.9830155372619629, "logits/rejected": -0.9642812609672546, "logps/chosen": -72.8682861328125, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28411102294922, "logps/rejected": -124.63108825683594, "loss": 0.7857, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6751251816749573, "rewards/margins": 1.3229525089263916, "rewards/rejected": -1.998077630996704, "step": 140 }, { "epoch": 0.20704845814977973, "epsilon_dpo/beta": 0.048182275146245956, "epsilon_dpo/beta_margin_grad_mean": -0.2698647379875183, "epsilon_dpo/beta_margin_grad_std": 0.2030099481344223, "epsilon_dpo/beta_margin_mean": 1.308327317237854, "epsilon_dpo/beta_margin_std": 1.2912057638168335, "epsilon_dpo/loss_margin_mean": 27.29104995727539, "grad_norm": 42.428138732910156, "kl/avg_steps": 0.625, "kl/beta": 0.04848045855760574, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.9564207792282104, "logits/rejected": -0.956634521484375, "logps/chosen": -87.76817321777344, "logps/ref_chosen": -75.0756607055664, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -138.17581176757812, "loss": 0.7446, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6139034032821655, "rewards/margins": 1.3083271980285645, "rewards/rejected": -1.9222307205200195, "step": 141 }, { "epoch": 0.20851688693098386, "epsilon_dpo/beta": 0.0478980615735054, "epsilon_dpo/beta_margin_grad_mean": -0.2826434075832367, "epsilon_dpo/beta_margin_grad_std": 0.22222787141799927, "epsilon_dpo/beta_margin_mean": 1.4014191627502441, "epsilon_dpo/beta_margin_std": 1.6505073308944702, "epsilon_dpo/loss_margin_mean": 29.41887092590332, "grad_norm": 47.383052825927734, "kl/avg_steps": 0.59375, "kl/beta": 0.04817933589220047, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.9501470327377319, "logits/rejected": -0.9797188639640808, "logps/chosen": -75.01841735839844, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222198486328, "logps/rejected": -140.9915771484375, "loss": 0.8234, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8165905475616455, "rewards/margins": 1.4014191627502441, "rewards/rejected": -2.2180097103118896, "step": 142 }, { "epoch": 0.20998531571218795, "epsilon_dpo/beta": 0.04767522215843201, "epsilon_dpo/beta_margin_grad_mean": -0.33072641491889954, "epsilon_dpo/beta_margin_grad_std": 0.2191799283027649, "epsilon_dpo/beta_margin_mean": 0.9733087420463562, "epsilon_dpo/beta_margin_std": 1.377323031425476, "epsilon_dpo/loss_margin_mean": 20.585609436035156, "grad_norm": 50.030784606933594, "kl/avg_steps": 0.46875, "kl/beta": 0.04789496213197708, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.9663059115409851, "logits/rejected": -0.8719507455825806, "logps/chosen": -74.62933349609375, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -116.61805725097656, "loss": 0.9549, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8152145147323608, "rewards/margins": 0.973308801651001, "rewards/rejected": -1.7885233163833618, "step": 143 }, { "epoch": 0.21145374449339208, "epsilon_dpo/beta": 0.04739319160580635, "epsilon_dpo/beta_margin_grad_mean": -0.26462069153785706, "epsilon_dpo/beta_margin_grad_std": 0.17797112464904785, "epsilon_dpo/beta_margin_mean": 1.2886942625045776, "epsilon_dpo/beta_margin_std": 1.145198106765747, "epsilon_dpo/loss_margin_mean": 27.316715240478516, "grad_norm": 33.45161056518555, "kl/avg_steps": 0.59375, "kl/beta": 0.04767150059342384, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.976190447807312, "logits/rejected": -0.8981922268867493, "logps/chosen": -72.54024505615234, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -121.95085906982422, "loss": 0.6876, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6001367568969727, "rewards/margins": 1.2886942625045776, "rewards/rejected": -1.8888310194015503, "step": 144 }, { "epoch": 0.21292217327459617, "epsilon_dpo/beta": 0.04709864407777786, "epsilon_dpo/beta_margin_grad_mean": -0.27273982763290405, "epsilon_dpo/beta_margin_grad_std": 0.1858561635017395, "epsilon_dpo/beta_margin_mean": 1.333594799041748, "epsilon_dpo/beta_margin_std": 1.3202054500579834, "epsilon_dpo/loss_margin_mean": 28.433860778808594, "grad_norm": 37.968711853027344, "kl/avg_steps": 0.625, "kl/beta": 0.04739012196660042, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.9659013748168945, "logits/rejected": -0.9058011770248413, "logps/chosen": -71.90216064453125, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -121.60166931152344, "loss": 0.713, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7217481136322021, "rewards/margins": 1.333594799041748, "rewards/rejected": -2.05534291267395, "step": 145 }, { "epoch": 0.2143906020558003, "epsilon_dpo/beta": 0.046894416213035583, "epsilon_dpo/beta_margin_grad_mean": -0.3107375502586365, "epsilon_dpo/beta_margin_grad_std": 0.21199391782283783, "epsilon_dpo/beta_margin_mean": 1.0807816982269287, "epsilon_dpo/beta_margin_std": 1.3126076459884644, "epsilon_dpo/loss_margin_mean": 23.22393035888672, "grad_norm": 51.1514892578125, "kl/avg_steps": 0.4375, "kl/beta": 0.04709577187895775, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.812146767012779e-07, "logits/chosen": -1.014100432395935, "logits/rejected": -0.9278345108032227, "logps/chosen": -86.86761474609375, "logps/ref_chosen": -66.00045776367188, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -125.79388427734375, "loss": 0.8731, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9812708497047424, "rewards/margins": 1.0807816982269287, "rewards/rejected": -2.0620524883270264, "step": 146 }, { "epoch": 0.21585903083700442, "epsilon_dpo/beta": 0.046616874635219574, "epsilon_dpo/beta_margin_grad_mean": -0.27530568838119507, "epsilon_dpo/beta_margin_grad_std": 0.19929742813110352, "epsilon_dpo/beta_margin_mean": 1.381503939628601, "epsilon_dpo/beta_margin_std": 1.4857523441314697, "epsilon_dpo/loss_margin_mean": 29.78620719909668, "grad_norm": 46.896018981933594, "kl/avg_steps": 0.59375, "kl/beta": 0.04689062759280205, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.9725676774978638, "logits/rejected": -0.9196237325668335, "logps/chosen": -70.034912109375, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39061737060547, "logps/rejected": -117.80624389648438, "loss": 0.7435, "rewards/accuracies": 0.875, "rewards/chosen": -0.7782865762710571, "rewards/margins": 1.381503939628601, "rewards/rejected": -2.159790515899658, "step": 147 }, { "epoch": 0.2173274596182085, "epsilon_dpo/beta": 0.04629801586270332, "epsilon_dpo/beta_margin_grad_mean": -0.30957266688346863, "epsilon_dpo/beta_margin_grad_std": 0.18236467242240906, "epsilon_dpo/beta_margin_mean": 1.0623618364334106, "epsilon_dpo/beta_margin_std": 1.202706217765808, "epsilon_dpo/loss_margin_mean": 23.042098999023438, "grad_norm": 37.24595642089844, "kl/avg_steps": 0.6875, "kl/beta": 0.04661385715007782, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.802263794862384e-07, "logits/chosen": -1.011152982711792, "logits/rejected": -0.9253931045532227, "logps/chosen": -79.60943603515625, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -140.80828857421875, "loss": 0.8231, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6816513538360596, "rewards/margins": 1.062361717224121, "rewards/rejected": -1.7440130710601807, "step": 148 }, { "epoch": 0.21879588839941264, "epsilon_dpo/beta": 0.04595295339822769, "epsilon_dpo/beta_margin_grad_mean": -0.2586212158203125, "epsilon_dpo/beta_margin_grad_std": 0.18093499541282654, "epsilon_dpo/beta_margin_mean": 1.3834995031356812, "epsilon_dpo/beta_margin_std": 1.2863571643829346, "epsilon_dpo/loss_margin_mean": 30.207395553588867, "grad_norm": 39.60329818725586, "kl/avg_steps": 0.75, "kl/beta": 0.04629557579755783, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.9560129642486572, "logits/rejected": -0.901535153388977, "logps/chosen": -73.84778594970703, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -144.8961639404297, "loss": 0.6747, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7077004909515381, "rewards/margins": 1.3834993839263916, "rewards/rejected": -2.0911998748779297, "step": 149 }, { "epoch": 0.22026431718061673, "epsilon_dpo/beta": 0.04565395414829254, "epsilon_dpo/beta_margin_grad_mean": -0.2837304472923279, "epsilon_dpo/beta_margin_grad_std": 0.1890309900045395, "epsilon_dpo/beta_margin_mean": 1.318402886390686, "epsilon_dpo/beta_margin_std": 1.513938069343567, "epsilon_dpo/loss_margin_mean": 29.001182556152344, "grad_norm": 41.226314544677734, "kl/avg_steps": 0.65625, "kl/beta": 0.04595094174146652, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.9622288346290588, "logits/rejected": -0.9198128581047058, "logps/chosen": -60.1712646484375, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -126.81423950195312, "loss": 0.7574, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6629461050033569, "rewards/margins": 1.318402886390686, "rewards/rejected": -1.981348991394043, "step": 150 }, { "epoch": 0.22173274596182085, "epsilon_dpo/beta": 0.045313503593206406, "epsilon_dpo/beta_margin_grad_mean": -0.2545073926448822, "epsilon_dpo/beta_margin_grad_std": 0.18467025458812714, "epsilon_dpo/beta_margin_mean": 1.4252184629440308, "epsilon_dpo/beta_margin_std": 1.310182809829712, "epsilon_dpo/loss_margin_mean": 31.559906005859375, "grad_norm": 38.9255256652832, "kl/avg_steps": 0.75, "kl/beta": 0.045651357620954514, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.9893289804458618, "logits/rejected": -0.93805992603302, "logps/chosen": -86.97854614257812, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -148.43142700195312, "loss": 0.6688, "rewards/accuracies": 0.875, "rewards/chosen": -0.7452378273010254, "rewards/margins": 1.4252184629440308, "rewards/rejected": -2.1704564094543457, "step": 151 }, { "epoch": 0.22320117474302498, "epsilon_dpo/beta": 0.04501866176724434, "epsilon_dpo/beta_margin_grad_mean": -0.2555217742919922, "epsilon_dpo/beta_margin_grad_std": 0.18799203634262085, "epsilon_dpo/beta_margin_mean": 1.4897184371948242, "epsilon_dpo/beta_margin_std": 1.462400197982788, "epsilon_dpo/loss_margin_mean": 33.2269401550293, "grad_norm": 37.86301040649414, "kl/avg_steps": 0.65625, "kl/beta": 0.045311518013477325, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.9898856282234192, "logits/rejected": -0.9902868866920471, "logps/chosen": -73.75241088867188, "logps/ref_chosen": -60.164390563964844, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -152.95542907714844, "loss": 0.6743, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6140701770782471, "rewards/margins": 1.4897184371948242, "rewards/rejected": -2.1037886142730713, "step": 152 }, { "epoch": 0.22466960352422907, "epsilon_dpo/beta": 0.0447532944381237, "epsilon_dpo/beta_margin_grad_mean": -0.290993869304657, "epsilon_dpo/beta_margin_grad_std": 0.20372426509857178, "epsilon_dpo/beta_margin_mean": 1.2581030130386353, "epsilon_dpo/beta_margin_std": 1.4385782480239868, "epsilon_dpo/loss_margin_mean": 28.263267517089844, "grad_norm": 35.41409683227539, "kl/avg_steps": 0.59375, "kl/beta": 0.045016102492809296, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.9467694759368896, "logits/rejected": -0.9733778238296509, "logps/chosen": -70.63774108886719, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -128.24156188964844, "loss": 0.7931, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6429731845855713, "rewards/margins": 1.2581028938293457, "rewards/rejected": -1.901076078414917, "step": 153 }, { "epoch": 0.2261380323054332, "epsilon_dpo/beta": 0.04454508051276207, "epsilon_dpo/beta_margin_grad_mean": -0.2973214089870453, "epsilon_dpo/beta_margin_grad_std": 0.22685933113098145, "epsilon_dpo/beta_margin_mean": 1.2366024255752563, "epsilon_dpo/beta_margin_std": 1.4697744846343994, "epsilon_dpo/loss_margin_mean": 27.96182632446289, "grad_norm": 49.316978454589844, "kl/avg_steps": 0.46875, "kl/beta": 0.0447503961622715, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.9589699506759644, "logits/rejected": -0.9176832437515259, "logps/chosen": -80.27618408203125, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -149.73963928222656, "loss": 0.8487, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7843384742736816, "rewards/margins": 1.2366024255752563, "rewards/rejected": -2.0209410190582275, "step": 154 }, { "epoch": 0.2276064610866373, "epsilon_dpo/beta": 0.04423980787396431, "epsilon_dpo/beta_margin_grad_mean": -0.2698431611061096, "epsilon_dpo/beta_margin_grad_std": 0.18442155420780182, "epsilon_dpo/beta_margin_mean": 1.3126617670059204, "epsilon_dpo/beta_margin_std": 1.3255443572998047, "epsilon_dpo/loss_margin_mean": 29.781213760375977, "grad_norm": 36.14684295654297, "kl/avg_steps": 0.6875, "kl/beta": 0.0445416085422039, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.9246869683265686, "logits/rejected": -0.8986223936080933, "logps/chosen": -74.22544860839844, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -120.84568786621094, "loss": 0.7225, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6022658944129944, "rewards/margins": 1.31266188621521, "rewards/rejected": -1.9149277210235596, "step": 155 }, { "epoch": 0.2290748898678414, "epsilon_dpo/beta": 0.04403451830148697, "epsilon_dpo/beta_margin_grad_mean": -0.33595940470695496, "epsilon_dpo/beta_margin_grad_std": 0.23996010422706604, "epsilon_dpo/beta_margin_mean": 0.9327062368392944, "epsilon_dpo/beta_margin_std": 1.5123710632324219, "epsilon_dpo/loss_margin_mean": 21.390121459960938, "grad_norm": 59.35513687133789, "kl/avg_steps": 0.46875, "kl/beta": 0.04423747584223747, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.9638940095901489, "logits/rejected": -0.9159474968910217, "logps/chosen": -90.10908508300781, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.1692886352539, "logps/rejected": -119.17662048339844, "loss": 1.0654, "rewards/accuracies": 0.75, "rewards/chosen": -0.9126459360122681, "rewards/margins": 0.9327062368392944, "rewards/rejected": -1.8453521728515625, "step": 156 }, { "epoch": 0.2305433186490455, "epsilon_dpo/beta": 0.04371897876262665, "epsilon_dpo/beta_margin_grad_mean": -0.2587592601776123, "epsilon_dpo/beta_margin_grad_std": 0.19853951036930084, "epsilon_dpo/beta_margin_mean": 1.4568617343902588, "epsilon_dpo/beta_margin_std": 1.4228980541229248, "epsilon_dpo/loss_margin_mean": 33.44245147705078, "grad_norm": 44.633541107177734, "kl/avg_steps": 0.71875, "kl/beta": 0.04403107985854149, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.9832438230514526, "logits/rejected": -0.9874979257583618, "logps/chosen": -81.43819427490234, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -161.1585693359375, "loss": 0.6928, "rewards/accuracies": 0.875, "rewards/chosen": -0.8795930743217468, "rewards/margins": 1.4568617343902588, "rewards/rejected": -2.3364548683166504, "step": 157 }, { "epoch": 0.23201174743024963, "epsilon_dpo/beta": 0.0434616394340992, "epsilon_dpo/beta_margin_grad_mean": -0.2763029634952545, "epsilon_dpo/beta_margin_grad_std": 0.196333646774292, "epsilon_dpo/beta_margin_mean": 1.4541841745376587, "epsilon_dpo/beta_margin_std": 1.6093320846557617, "epsilon_dpo/loss_margin_mean": 33.61172866821289, "grad_norm": 33.16934585571289, "kl/avg_steps": 0.59375, "kl/beta": 0.04371686279773712, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -1.0052913427352905, "logits/rejected": -0.96925950050354, "logps/chosen": -74.00665283203125, "logps/ref_chosen": -57.61292266845703, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -163.70013427734375, "loss": 0.7293, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7132701873779297, "rewards/margins": 1.4541841745376587, "rewards/rejected": -2.167454242706299, "step": 158 }, { "epoch": 0.23348017621145375, "epsilon_dpo/beta": 0.04320510849356651, "epsilon_dpo/beta_margin_grad_mean": -0.29521963000297546, "epsilon_dpo/beta_margin_grad_std": 0.20369917154312134, "epsilon_dpo/beta_margin_mean": 1.243503451347351, "epsilon_dpo/beta_margin_std": 1.4695652723312378, "epsilon_dpo/loss_margin_mean": 28.925600051879883, "grad_norm": 62.41481399536133, "kl/avg_steps": 0.59375, "kl/beta": 0.04345882683992386, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.9751964211463928, "logits/rejected": -0.901130199432373, "logps/chosen": -100.16021728515625, "logps/ref_chosen": -81.56034088134766, "logps/ref_rejected": -88.8987045288086, "logps/rejected": -136.42417907714844, "loss": 0.8066, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8058692216873169, "rewards/margins": 1.243503451347351, "rewards/rejected": -2.049372673034668, "step": 159 }, { "epoch": 0.23494860499265785, "epsilon_dpo/beta": 0.04299060255289078, "epsilon_dpo/beta_margin_grad_mean": -0.2983653247356415, "epsilon_dpo/beta_margin_grad_std": 0.21221430599689484, "epsilon_dpo/beta_margin_mean": 1.3093961477279663, "epsilon_dpo/beta_margin_std": 1.614226222038269, "epsilon_dpo/loss_margin_mean": 30.649219512939453, "grad_norm": 38.94950485229492, "kl/avg_steps": 0.5, "kl/beta": 0.04320231452584267, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.737908228387656e-07, "logits/chosen": -1.0091383457183838, "logits/rejected": -0.9705837965011597, "logps/chosen": -84.16133880615234, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -146.29750061035156, "loss": 0.817, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7951179146766663, "rewards/margins": 1.3093961477279663, "rewards/rejected": -2.1045141220092773, "step": 160 }, { "epoch": 0.23641703377386197, "epsilon_dpo/beta": 0.04274984449148178, "epsilon_dpo/beta_margin_grad_mean": -0.29090237617492676, "epsilon_dpo/beta_margin_grad_std": 0.2110779881477356, "epsilon_dpo/beta_margin_mean": 1.2623058557510376, "epsilon_dpo/beta_margin_std": 1.418753981590271, "epsilon_dpo/loss_margin_mean": 29.705718994140625, "grad_norm": 37.854209899902344, "kl/avg_steps": 0.5625, "kl/beta": 0.04298737645149231, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.9591707587242126, "logits/rejected": -0.9641485214233398, "logps/chosen": -69.65299224853516, "logps/ref_chosen": -52.43647766113281, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -130.35317993164062, "loss": 0.7967, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7385674118995667, "rewards/margins": 1.2623059749603271, "rewards/rejected": -2.000873327255249, "step": 161 }, { "epoch": 0.23788546255506607, "epsilon_dpo/beta": 0.04251072183251381, "epsilon_dpo/beta_margin_grad_mean": -0.2922504246234894, "epsilon_dpo/beta_margin_grad_std": 0.1976582258939743, "epsilon_dpo/beta_margin_mean": 1.254723072052002, "epsilon_dpo/beta_margin_std": 1.5166453123092651, "epsilon_dpo/loss_margin_mean": 29.6732234954834, "grad_norm": 33.096405029296875, "kl/avg_steps": 0.5625, "kl/beta": 0.04274692386388779, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.9918665289878845, "logits/rejected": -0.9454479813575745, "logps/chosen": -78.78065490722656, "logps/ref_chosen": -62.61058807373047, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -135.23387145996094, "loss": 0.7858, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6898952722549438, "rewards/margins": 1.254723072052002, "rewards/rejected": -1.9446183443069458, "step": 162 }, { "epoch": 0.2393538913362702, "epsilon_dpo/beta": 0.04220651462674141, "epsilon_dpo/beta_margin_grad_mean": -0.28332507610321045, "epsilon_dpo/beta_margin_grad_std": 0.18418292701244354, "epsilon_dpo/beta_margin_mean": 1.2095342874526978, "epsilon_dpo/beta_margin_std": 1.213563323020935, "epsilon_dpo/loss_margin_mean": 28.76431655883789, "grad_norm": 29.936630249023438, "kl/avg_steps": 0.71875, "kl/beta": 0.04250781983137131, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.720482655449212e-07, "logits/chosen": -1.0189510583877563, "logits/rejected": -0.9549261331558228, "logps/chosen": -70.54529571533203, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.41822052001953, "logps/rejected": -119.7061996459961, "loss": 0.7472, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6555919647216797, "rewards/margins": 1.2095344066619873, "rewards/rejected": -1.865126371383667, "step": 163 }, { "epoch": 0.24082232011747431, "epsilon_dpo/beta": 0.041931699961423874, "epsilon_dpo/beta_margin_grad_mean": -0.2573654055595398, "epsilon_dpo/beta_margin_grad_std": 0.16751733422279358, "epsilon_dpo/beta_margin_mean": 1.3391965627670288, "epsilon_dpo/beta_margin_std": 1.185653567314148, "epsilon_dpo/loss_margin_mean": 32.058197021484375, "grad_norm": 33.62155532836914, "kl/avg_steps": 0.65625, "kl/beta": 0.04220447316765785, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.714556901942599e-07, "logits/chosen": -1.0321714878082275, "logits/rejected": -0.9380872249603271, "logps/chosen": -69.88985443115234, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -125.97201538085938, "loss": 0.6574, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5993364453315735, "rewards/margins": 1.3391965627670288, "rewards/rejected": -1.938533067703247, "step": 164 }, { "epoch": 0.2422907488986784, "epsilon_dpo/beta": 0.04171073064208031, "epsilon_dpo/beta_margin_grad_mean": -0.32541587948799133, "epsilon_dpo/beta_margin_grad_std": 0.18152815103530884, "epsilon_dpo/beta_margin_mean": 0.9150287508964539, "epsilon_dpo/beta_margin_std": 1.077598214149475, "epsilon_dpo/loss_margin_mean": 22.06475067138672, "grad_norm": 44.52377700805664, "kl/avg_steps": 0.53125, "kl/beta": 0.04192931205034256, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.998910665512085, "logits/rejected": -0.90199875831604, "logps/chosen": -78.4998779296875, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -112.92454528808594, "loss": 0.8825, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7182124853134155, "rewards/margins": 0.9150286912918091, "rewards/rejected": -1.6332411766052246, "step": 165 }, { "epoch": 0.24375917767988253, "epsilon_dpo/beta": 0.04142513871192932, "epsilon_dpo/beta_margin_grad_mean": -0.26308882236480713, "epsilon_dpo/beta_margin_grad_std": 0.18034981191158295, "epsilon_dpo/beta_margin_mean": 1.5421535968780518, "epsilon_dpo/beta_margin_std": 1.679103970527649, "epsilon_dpo/loss_margin_mean": 37.36027908325195, "grad_norm": 32.624271392822266, "kl/avg_steps": 0.6875, "kl/beta": 0.04170773923397064, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.9749380350112915, "logits/rejected": -0.9805365204811096, "logps/chosen": -67.81795501708984, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -152.28976440429688, "loss": 0.6783, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6984573602676392, "rewards/margins": 1.5421535968780518, "rewards/rejected": -2.2406110763549805, "step": 166 }, { "epoch": 0.24522760646108663, "epsilon_dpo/beta": 0.04112933948636055, "epsilon_dpo/beta_margin_grad_mean": -0.23302772641181946, "epsilon_dpo/beta_margin_grad_std": 0.1919611096382141, "epsilon_dpo/beta_margin_mean": 1.593092918395996, "epsilon_dpo/beta_margin_std": 1.3703241348266602, "epsilon_dpo/loss_margin_mean": 38.86928176879883, "grad_norm": 34.27349090576172, "kl/avg_steps": 0.71875, "kl/beta": 0.04142295569181442, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.9137979745864868, "logits/rejected": -0.9279575347900391, "logps/chosen": -67.69206237792969, "logps/ref_chosen": -50.42409133911133, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -152.1676788330078, "loss": 0.6263, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7120723128318787, "rewards/margins": 1.593092918395996, "rewards/rejected": -2.3051652908325195, "step": 167 }, { "epoch": 0.24669603524229075, "epsilon_dpo/beta": 0.04083583503961563, "epsilon_dpo/beta_margin_grad_mean": -0.27528858184814453, "epsilon_dpo/beta_margin_grad_std": 0.17864958941936493, "epsilon_dpo/beta_margin_mean": 1.239363431930542, "epsilon_dpo/beta_margin_std": 1.1697802543640137, "epsilon_dpo/loss_margin_mean": 30.45475196838379, "grad_norm": 35.55723571777344, "kl/avg_steps": 0.71875, "kl/beta": 0.04112735390663147, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.690271916109034e-07, "logits/chosen": -1.0056126117706299, "logits/rejected": -0.9262582063674927, "logps/chosen": -66.53511047363281, "logps/ref_chosen": -49.46282196044922, "logps/ref_rejected": -75.30854797363281, "logps/rejected": -122.8355941772461, "loss": 0.7168, "rewards/accuracies": 0.875, "rewards/chosen": -0.6987807750701904, "rewards/margins": 1.239363431930542, "rewards/rejected": -1.9381442070007324, "step": 168 }, { "epoch": 0.24816446402349487, "epsilon_dpo/beta": 0.040633752942085266, "epsilon_dpo/beta_margin_grad_mean": -0.32107600569725037, "epsilon_dpo/beta_margin_grad_std": 0.20667105913162231, "epsilon_dpo/beta_margin_mean": 1.1159132719039917, "epsilon_dpo/beta_margin_std": 1.5563249588012695, "epsilon_dpo/loss_margin_mean": 27.650426864624023, "grad_norm": 36.4353141784668, "kl/avg_steps": 0.5, "kl/beta": 0.040833860635757446, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.9751948118209839, "logits/rejected": -0.8828585147857666, "logps/chosen": -76.23262023925781, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -127.42534637451172, "loss": 0.8922, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6698789596557617, "rewards/margins": 1.1159132719039917, "rewards/rejected": -1.7857921123504639, "step": 169 }, { "epoch": 0.24963289280469897, "epsilon_dpo/beta": 0.04034271091222763, "epsilon_dpo/beta_margin_grad_mean": -0.25905343890190125, "epsilon_dpo/beta_margin_grad_std": 0.19562862813472748, "epsilon_dpo/beta_margin_mean": 1.4130287170410156, "epsilon_dpo/beta_margin_std": 1.3224432468414307, "epsilon_dpo/loss_margin_mean": 35.15060043334961, "grad_norm": 30.6046085357666, "kl/avg_steps": 0.71875, "kl/beta": 0.04063070937991142, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.9482662081718445, "logits/rejected": -0.8848442435264587, "logps/chosen": -63.6239013671875, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -125.22006225585938, "loss": 0.6946, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5722982883453369, "rewards/margins": 1.4130287170410156, "rewards/rejected": -1.9853267669677734, "step": 170 }, { "epoch": 0.2511013215859031, "epsilon_dpo/beta": 0.040105246007442474, "epsilon_dpo/beta_margin_grad_mean": -0.3009694814682007, "epsilon_dpo/beta_margin_grad_std": 0.19512800872325897, "epsilon_dpo/beta_margin_mean": 1.1584964990615845, "epsilon_dpo/beta_margin_std": 1.4104453325271606, "epsilon_dpo/loss_margin_mean": 29.043243408203125, "grad_norm": 42.23140335083008, "kl/avg_steps": 0.59375, "kl/beta": 0.04034075886011124, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.9987205266952515, "logits/rejected": -0.8955119848251343, "logps/chosen": -104.87657928466797, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -158.8026123046875, "loss": 0.8207, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8190073370933533, "rewards/margins": 1.158496379852295, "rewards/rejected": -1.977503776550293, "step": 171 }, { "epoch": 0.2525697503671072, "epsilon_dpo/beta": 0.03994372487068176, "epsilon_dpo/beta_margin_grad_mean": -0.2966817319393158, "epsilon_dpo/beta_margin_grad_std": 0.21017903089523315, "epsilon_dpo/beta_margin_mean": 1.1768347024917603, "epsilon_dpo/beta_margin_std": 1.378002405166626, "epsilon_dpo/loss_margin_mean": 29.682003021240234, "grad_norm": 41.4985237121582, "kl/avg_steps": 0.40625, "kl/beta": 0.040102649480104446, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.9402850866317749, "logits/rejected": -0.9079384207725525, "logps/chosen": -91.90399169921875, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -138.36874389648438, "loss": 0.8268, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9317343235015869, "rewards/margins": 1.1768348217010498, "rewards/rejected": -2.1085691452026367, "step": 172 }, { "epoch": 0.2540381791483113, "epsilon_dpo/beta": 0.03970721364021301, "epsilon_dpo/beta_margin_grad_mean": -0.29116085171699524, "epsilon_dpo/beta_margin_grad_std": 0.18318597972393036, "epsilon_dpo/beta_margin_mean": 1.1734623908996582, "epsilon_dpo/beta_margin_std": 1.2339973449707031, "epsilon_dpo/loss_margin_mean": 29.693958282470703, "grad_norm": 38.0942497253418, "kl/avg_steps": 0.59375, "kl/beta": 0.03994039073586464, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -1.0178707838058472, "logits/rejected": -0.9571582078933716, "logps/chosen": -82.31532287597656, "logps/ref_chosen": -63.050872802734375, "logps/ref_rejected": -78.68392944335938, "logps/rejected": -127.64232635498047, "loss": 0.7684, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7673953771591187, "rewards/margins": 1.1734623908996582, "rewards/rejected": -1.9408578872680664, "step": 173 }, { "epoch": 0.2555066079295154, "epsilon_dpo/beta": 0.03947284445166588, "epsilon_dpo/beta_margin_grad_mean": -0.2851825952529907, "epsilon_dpo/beta_margin_grad_std": 0.19597502052783966, "epsilon_dpo/beta_margin_mean": 1.236145257949829, "epsilon_dpo/beta_margin_std": 1.3220787048339844, "epsilon_dpo/loss_margin_mean": 31.475608825683594, "grad_norm": 40.52580642700195, "kl/avg_steps": 0.59375, "kl/beta": 0.039704643189907074, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.8906412720680237, "logits/rejected": -0.9368263483047485, "logps/chosen": -74.1144790649414, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -154.13833618164062, "loss": 0.7697, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8219078779220581, "rewards/margins": 1.236145257949829, "rewards/rejected": -2.0580530166625977, "step": 174 }, { "epoch": 0.25697503671071953, "epsilon_dpo/beta": 0.03917818143963814, "epsilon_dpo/beta_margin_grad_mean": -0.23521888256072998, "epsilon_dpo/beta_margin_grad_std": 0.17553327977657318, "epsilon_dpo/beta_margin_mean": 1.6595497131347656, "epsilon_dpo/beta_margin_std": 1.46583890914917, "epsilon_dpo/loss_margin_mean": 42.47246170043945, "grad_norm": 38.26108169555664, "kl/avg_steps": 0.75, "kl/beta": 0.03947028890252113, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.9278110265731812, "logits/rejected": -0.9340702295303345, "logps/chosen": -65.43023681640625, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -151.99073791503906, "loss": 0.5952, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7862248420715332, "rewards/margins": 1.6595497131347656, "rewards/rejected": -2.445774555206299, "step": 175 }, { "epoch": 0.25844346549192365, "epsilon_dpo/beta": 0.0388987772166729, "epsilon_dpo/beta_margin_grad_mean": -0.25364938378334045, "epsilon_dpo/beta_margin_grad_std": 0.17404069006443024, "epsilon_dpo/beta_margin_mean": 1.4662517309188843, "epsilon_dpo/beta_margin_std": 1.378067135810852, "epsilon_dpo/loss_margin_mean": 37.79985809326172, "grad_norm": 36.76453399658203, "kl/avg_steps": 0.71875, "kl/beta": 0.039176467806100845, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.944208025932312, "logits/rejected": -0.9565688371658325, "logps/chosen": -70.38160705566406, "logps/ref_chosen": -50.45283889770508, "logps/ref_rejected": -95.55896759033203, "logps/rejected": -153.28758239746094, "loss": 0.6481, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7769966125488281, "rewards/margins": 1.4662517309188843, "rewards/rejected": -2.243248462677002, "step": 176 }, { "epoch": 0.2599118942731278, "epsilon_dpo/beta": 0.03864549845457077, "epsilon_dpo/beta_margin_grad_mean": -0.2690354287624359, "epsilon_dpo/beta_margin_grad_std": 0.19591771066188812, "epsilon_dpo/beta_margin_mean": 1.4547548294067383, "epsilon_dpo/beta_margin_std": 1.526091456413269, "epsilon_dpo/loss_margin_mean": 37.79157257080078, "grad_norm": 39.08116912841797, "kl/avg_steps": 0.65625, "kl/beta": 0.038896895945072174, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.9477738738059998, "logits/rejected": -0.856330156326294, "logps/chosen": -86.77143859863281, "logps/ref_chosen": -61.21646499633789, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -159.24032592773438, "loss": 0.7101, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9890835285186768, "rewards/margins": 1.4547548294067383, "rewards/rejected": -2.443838357925415, "step": 177 }, { "epoch": 0.26138032305433184, "epsilon_dpo/beta": 0.03840561583638191, "epsilon_dpo/beta_margin_grad_mean": -0.24208901822566986, "epsilon_dpo/beta_margin_grad_std": 0.20867198705673218, "epsilon_dpo/beta_margin_mean": 1.749585747718811, "epsilon_dpo/beta_margin_std": 1.7728079557418823, "epsilon_dpo/loss_margin_mean": 45.76614761352539, "grad_norm": 45.900428771972656, "kl/avg_steps": 0.625, "kl/beta": 0.03864329680800438, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.8913235664367676, "logits/rejected": -0.9729048013687134, "logps/chosen": -88.7005615234375, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.36532592773438, "logps/rejected": -181.5672607421875, "loss": 0.6573, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1707124710083008, "rewards/margins": 1.7495858669281006, "rewards/rejected": -2.9202980995178223, "step": 178 }, { "epoch": 0.26284875183553597, "epsilon_dpo/beta": 0.03826308995485306, "epsilon_dpo/beta_margin_grad_mean": -0.26975318789482117, "epsilon_dpo/beta_margin_grad_std": 0.2499701827764511, "epsilon_dpo/beta_margin_mean": 1.577806830406189, "epsilon_dpo/beta_margin_std": 1.8105027675628662, "epsilon_dpo/loss_margin_mean": 41.57501220703125, "grad_norm": 53.537803649902344, "kl/avg_steps": 0.375, "kl/beta": 0.03840327635407448, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.8972229361534119, "logits/rejected": -0.9002971649169922, "logps/chosen": -89.78822326660156, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -160.83273315429688, "loss": 0.8261, "rewards/accuracies": 0.75, "rewards/chosen": -1.1028212308883667, "rewards/margins": 1.577806830406189, "rewards/rejected": -2.6806280612945557, "step": 179 }, { "epoch": 0.2643171806167401, "epsilon_dpo/beta": 0.03803643956780434, "epsilon_dpo/beta_margin_grad_mean": -0.24183642864227295, "epsilon_dpo/beta_margin_grad_std": 0.19888649880886078, "epsilon_dpo/beta_margin_mean": 1.6517789363861084, "epsilon_dpo/beta_margin_std": 1.5315907001495361, "epsilon_dpo/loss_margin_mean": 43.62617492675781, "grad_norm": 35.250980377197266, "kl/avg_steps": 0.59375, "kl/beta": 0.038259804248809814, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.9804345369338989, "logits/rejected": -0.9229779839515686, "logps/chosen": -78.31442260742188, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -165.80970764160156, "loss": 0.6396, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9133108258247375, "rewards/margins": 1.651779055595398, "rewards/rejected": -2.565089702606201, "step": 180 }, { "epoch": 0.2657856093979442, "epsilon_dpo/beta": 0.03780004382133484, "epsilon_dpo/beta_margin_grad_mean": -0.30530738830566406, "epsilon_dpo/beta_margin_grad_std": 0.20790939033031464, "epsilon_dpo/beta_margin_mean": 1.1538174152374268, "epsilon_dpo/beta_margin_std": 1.4224164485931396, "epsilon_dpo/loss_margin_mean": 30.68983268737793, "grad_norm": 41.931888580322266, "kl/avg_steps": 0.625, "kl/beta": 0.03803397715091705, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.9457237720489502, "logits/rejected": -0.8364189863204956, "logps/chosen": -76.48634338378906, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -113.8318862915039, "loss": 0.8516, "rewards/accuracies": 0.875, "rewards/chosen": -0.8143583536148071, "rewards/margins": 1.1538174152374268, "rewards/rejected": -1.9681758880615234, "step": 181 }, { "epoch": 0.26725403817914833, "epsilon_dpo/beta": 0.03752982243895531, "epsilon_dpo/beta_margin_grad_mean": -0.23621736466884613, "epsilon_dpo/beta_margin_grad_std": 0.22672967612743378, "epsilon_dpo/beta_margin_mean": 1.784500241279602, "epsilon_dpo/beta_margin_std": 1.7586556673049927, "epsilon_dpo/loss_margin_mean": 47.745147705078125, "grad_norm": 43.90743637084961, "kl/avg_steps": 0.71875, "kl/beta": 0.03779774159193039, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.8483109474182129, "logits/rejected": -0.8673996925354004, "logps/chosen": -63.344337463378906, "logps/ref_chosen": -41.10784912109375, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -159.5032196044922, "loss": 0.6955, "rewards/accuracies": 0.875, "rewards/chosen": -0.837371826171875, "rewards/margins": 1.7845001220703125, "rewards/rejected": -2.6218719482421875, "step": 182 }, { "epoch": 0.2687224669603524, "epsilon_dpo/beta": 0.0372854582965374, "epsilon_dpo/beta_margin_grad_mean": -0.288994699716568, "epsilon_dpo/beta_margin_grad_std": 0.1929122507572174, "epsilon_dpo/beta_margin_mean": 1.2557570934295654, "epsilon_dpo/beta_margin_std": 1.3659968376159668, "epsilon_dpo/loss_margin_mean": 33.83012390136719, "grad_norm": 46.23603439331055, "kl/avg_steps": 0.65625, "kl/beta": 0.037528008222579956, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.9779186844825745, "logits/rejected": -0.9011868238449097, "logps/chosen": -88.38247680664062, "logps/ref_chosen": -57.524559020996094, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -140.6637725830078, "loss": 0.7675, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1529462337493896, "rewards/margins": 1.2557570934295654, "rewards/rejected": -2.408703327178955, "step": 183 }, { "epoch": 0.2701908957415565, "epsilon_dpo/beta": 0.037077322602272034, "epsilon_dpo/beta_margin_grad_mean": -0.31754952669143677, "epsilon_dpo/beta_margin_grad_std": 0.18516047298908234, "epsilon_dpo/beta_margin_mean": 1.047448992729187, "epsilon_dpo/beta_margin_std": 1.2702914476394653, "epsilon_dpo/loss_margin_mean": 28.40550422668457, "grad_norm": 37.191226959228516, "kl/avg_steps": 0.5625, "kl/beta": 0.037283334881067276, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.9359087347984314, "logits/rejected": -0.9384806752204895, "logps/chosen": -81.17167663574219, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -127.66629028320312, "loss": 0.8443, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8395744562149048, "rewards/margins": 1.047448992729187, "rewards/rejected": -1.8870235681533813, "step": 184 }, { "epoch": 0.27165932452276065, "epsilon_dpo/beta": 0.03690469264984131, "epsilon_dpo/beta_margin_grad_mean": -0.33891424536705017, "epsilon_dpo/beta_margin_grad_std": 0.20880207419395447, "epsilon_dpo/beta_margin_mean": 0.9352792501449585, "epsilon_dpo/beta_margin_std": 1.3410850763320923, "epsilon_dpo/loss_margin_mean": 25.53728485107422, "grad_norm": 43.51972961425781, "kl/avg_steps": 0.46875, "kl/beta": 0.03707478940486908, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.9461748600006104, "logits/rejected": -0.8326639533042908, "logps/chosen": -87.82810974121094, "logps/ref_chosen": -62.02584457397461, "logps/ref_rejected": -73.76260375976562, "logps/rejected": -125.1021499633789, "loss": 0.9611, "rewards/accuracies": 0.75, "rewards/chosen": -0.9536153078079224, "rewards/margins": 0.9352792501449585, "rewards/rejected": -1.8888945579528809, "step": 185 }, { "epoch": 0.27312775330396477, "epsilon_dpo/beta": 0.03665177896618843, "epsilon_dpo/beta_margin_grad_mean": -0.24522437155246735, "epsilon_dpo/beta_margin_grad_std": 0.1821317821741104, "epsilon_dpo/beta_margin_mean": 1.5288625955581665, "epsilon_dpo/beta_margin_std": 1.3740565776824951, "epsilon_dpo/loss_margin_mean": 41.85858917236328, "grad_norm": 37.097618103027344, "kl/avg_steps": 0.6875, "kl/beta": 0.0369018130004406, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.937578558921814, "logits/rejected": -0.9114998579025269, "logps/chosen": -94.59858703613281, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -155.17616271972656, "loss": 0.634, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9274721145629883, "rewards/margins": 1.5288625955581665, "rewards/rejected": -2.4563345909118652, "step": 186 }, { "epoch": 0.2745961820851689, "epsilon_dpo/beta": 0.0364244244992733, "epsilon_dpo/beta_margin_grad_mean": -0.3008478581905365, "epsilon_dpo/beta_margin_grad_std": 0.20724257826805115, "epsilon_dpo/beta_margin_mean": 1.1786620616912842, "epsilon_dpo/beta_margin_std": 1.4209734201431274, "epsilon_dpo/loss_margin_mean": 32.53537368774414, "grad_norm": 44.46013641357422, "kl/avg_steps": 0.625, "kl/beta": 0.03664984554052353, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.8802215456962585, "logits/rejected": -0.9012373685836792, "logps/chosen": -78.84783935546875, "logps/ref_chosen": -52.75646209716797, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -140.59585571289062, "loss": 0.8414, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9526379704475403, "rewards/margins": 1.1786620616912842, "rewards/rejected": -2.1312999725341797, "step": 187 }, { "epoch": 0.27606461086637296, "epsilon_dpo/beta": 0.03614127263426781, "epsilon_dpo/beta_margin_grad_mean": -0.2645493447780609, "epsilon_dpo/beta_margin_grad_std": 0.19629958271980286, "epsilon_dpo/beta_margin_mean": 1.4356456995010376, "epsilon_dpo/beta_margin_std": 1.441051721572876, "epsilon_dpo/loss_margin_mean": 39.84414291381836, "grad_norm": 38.8013916015625, "kl/avg_steps": 0.78125, "kl/beta": 0.036422207951545715, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.955854594707489, "logits/rejected": -0.9559851884841919, "logps/chosen": -70.17094421386719, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -150.14002990722656, "loss": 0.7102, "rewards/accuracies": 0.875, "rewards/chosen": -0.7516555786132812, "rewards/margins": 1.4356456995010376, "rewards/rejected": -2.1873011589050293, "step": 188 }, { "epoch": 0.2775330396475771, "epsilon_dpo/beta": 0.03589498996734619, "epsilon_dpo/beta_margin_grad_mean": -0.2797743082046509, "epsilon_dpo/beta_margin_grad_std": 0.19247855246067047, "epsilon_dpo/beta_margin_mean": 1.3004655838012695, "epsilon_dpo/beta_margin_std": 1.388077735900879, "epsilon_dpo/loss_margin_mean": 36.37957000732422, "grad_norm": 44.822998046875, "kl/avg_steps": 0.6875, "kl/beta": 0.03613986447453499, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.9416651725769043, "logits/rejected": -0.8099236488342285, "logps/chosen": -75.24497985839844, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -131.3929443359375, "loss": 0.7521, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8231290578842163, "rewards/margins": 1.3004655838012695, "rewards/rejected": -2.1235947608947754, "step": 189 }, { "epoch": 0.2790014684287812, "epsilon_dpo/beta": 0.035649895668029785, "epsilon_dpo/beta_margin_grad_mean": -0.27818140387535095, "epsilon_dpo/beta_margin_grad_std": 0.19762156903743744, "epsilon_dpo/beta_margin_mean": 1.3540294170379639, "epsilon_dpo/beta_margin_std": 1.4708027839660645, "epsilon_dpo/loss_margin_mean": 38.14091491699219, "grad_norm": 40.37140655517578, "kl/avg_steps": 0.6875, "kl/beta": 0.03589309751987457, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -1.0179307460784912, "logits/rejected": -0.9395278096199036, "logps/chosen": -87.93982696533203, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -163.94821166992188, "loss": 0.7481, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8317223787307739, "rewards/margins": 1.3540294170379639, "rewards/rejected": -2.1857519149780273, "step": 190 }, { "epoch": 0.28046989720998533, "epsilon_dpo/beta": 0.035328492522239685, "epsilon_dpo/beta_margin_grad_mean": -0.22660250961780548, "epsilon_dpo/beta_margin_grad_std": 0.17660540342330933, "epsilon_dpo/beta_margin_mean": 1.7651500701904297, "epsilon_dpo/beta_margin_std": 1.6534723043441772, "epsilon_dpo/loss_margin_mean": 50.026649475097656, "grad_norm": 30.353147506713867, "kl/avg_steps": 0.90625, "kl/beta": 0.03564801812171936, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.9302985072135925, "logits/rejected": -0.9496049284934998, "logps/chosen": -87.33015441894531, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -178.8376007080078, "loss": 0.579, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6598148345947266, "rewards/margins": 1.7651500701904297, "rewards/rejected": -2.4249649047851562, "step": 191 }, { "epoch": 0.28193832599118945, "epsilon_dpo/beta": 0.03509952872991562, "epsilon_dpo/beta_margin_grad_mean": -0.27115482091903687, "epsilon_dpo/beta_margin_grad_std": 0.19542157649993896, "epsilon_dpo/beta_margin_mean": 1.406541347503662, "epsilon_dpo/beta_margin_std": 1.4579341411590576, "epsilon_dpo/loss_margin_mean": 40.24146270751953, "grad_norm": 36.47545623779297, "kl/avg_steps": 0.65625, "kl/beta": 0.03532785922288895, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.9530336260795593, "logits/rejected": -0.917807936668396, "logps/chosen": -96.71080780029297, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -170.54669189453125, "loss": 0.7162, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9497445225715637, "rewards/margins": 1.406541347503662, "rewards/rejected": -2.356285810470581, "step": 192 }, { "epoch": 0.2834067547723935, "epsilon_dpo/beta": 0.034914568066596985, "epsilon_dpo/beta_margin_grad_mean": -0.3270193040370941, "epsilon_dpo/beta_margin_grad_std": 0.20700420439243317, "epsilon_dpo/beta_margin_mean": 0.8627244234085083, "epsilon_dpo/beta_margin_std": 1.2464419603347778, "epsilon_dpo/loss_margin_mean": 24.909847259521484, "grad_norm": 57.78805923461914, "kl/avg_steps": 0.53125, "kl/beta": 0.03509753197431564, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.9580802917480469, "logits/rejected": -0.8371573686599731, "logps/chosen": -88.631591796875, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -129.75851440429688, "loss": 0.9925, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9967535734176636, "rewards/margins": 0.8627244234085083, "rewards/rejected": -1.8594779968261719, "step": 193 }, { "epoch": 0.28487518355359764, "epsilon_dpo/beta": 0.03467550873756409, "epsilon_dpo/beta_margin_grad_mean": -0.27004995942115784, "epsilon_dpo/beta_margin_grad_std": 0.17530405521392822, "epsilon_dpo/beta_margin_mean": 1.3117589950561523, "epsilon_dpo/beta_margin_std": 1.2078568935394287, "epsilon_dpo/loss_margin_mean": 37.94846725463867, "grad_norm": 28.85865592956543, "kl/avg_steps": 0.6875, "kl/beta": 0.03491206094622612, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.8273730278015137, "logits/rejected": -0.7798274159431458, "logps/chosen": -55.676612854003906, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -116.46949768066406, "loss": 0.691, "rewards/accuracies": 0.875, "rewards/chosen": -0.6194702982902527, "rewards/margins": 1.3117589950561523, "rewards/rejected": -1.9312293529510498, "step": 194 }, { "epoch": 0.28634361233480177, "epsilon_dpo/beta": 0.03444957733154297, "epsilon_dpo/beta_margin_grad_mean": -0.2650885283946991, "epsilon_dpo/beta_margin_grad_std": 0.17504550516605377, "epsilon_dpo/beta_margin_mean": 1.3131177425384521, "epsilon_dpo/beta_margin_std": 1.2515735626220703, "epsilon_dpo/loss_margin_mean": 38.27104568481445, "grad_norm": 40.16360855102539, "kl/avg_steps": 0.65625, "kl/beta": 0.03467367962002754, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.9197718501091003, "logits/rejected": -0.9446795582771301, "logps/chosen": -83.88518524169922, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -164.03561401367188, "loss": 0.6854, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0012941360473633, "rewards/margins": 1.3131178617477417, "rewards/rejected": -2.3144121170043945, "step": 195 }, { "epoch": 0.2878120411160059, "epsilon_dpo/beta": 0.034192681312561035, "epsilon_dpo/beta_margin_grad_mean": -0.2511465847492218, "epsilon_dpo/beta_margin_grad_std": 0.1939229518175125, "epsilon_dpo/beta_margin_mean": 1.5818175077438354, "epsilon_dpo/beta_margin_std": 1.527445912361145, "epsilon_dpo/loss_margin_mean": 46.410396575927734, "grad_norm": 35.67520523071289, "kl/avg_steps": 0.75, "kl/beta": 0.03444761782884598, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.9461580514907837, "logits/rejected": -0.8389493227005005, "logps/chosen": -76.25343322753906, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -145.47154235839844, "loss": 0.6619, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7892789244651794, "rewards/margins": 1.581817626953125, "rewards/rejected": -2.371096611022949, "step": 196 }, { "epoch": 0.28928046989721, "epsilon_dpo/beta": 0.03392745554447174, "epsilon_dpo/beta_margin_grad_mean": -0.276800274848938, "epsilon_dpo/beta_margin_grad_std": 0.16978055238723755, "epsilon_dpo/beta_margin_mean": 1.2544382810592651, "epsilon_dpo/beta_margin_std": 1.2259612083435059, "epsilon_dpo/loss_margin_mean": 37.06828308105469, "grad_norm": 34.02505111694336, "kl/avg_steps": 0.78125, "kl/beta": 0.03419118374586105, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.8266817331314087, "logits/rejected": -0.8321689963340759, "logps/chosen": -83.6280517578125, "logps/ref_chosen": -60.42033767700195, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -137.4849090576172, "loss": 0.7128, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7887219190597534, "rewards/margins": 1.2544382810592651, "rewards/rejected": -2.0431602001190186, "step": 197 }, { "epoch": 0.2907488986784141, "epsilon_dpo/beta": 0.03367505967617035, "epsilon_dpo/beta_margin_grad_mean": -0.270523339509964, "epsilon_dpo/beta_margin_grad_std": 0.17167897522449493, "epsilon_dpo/beta_margin_mean": 1.3996421098709106, "epsilon_dpo/beta_margin_std": 1.5758426189422607, "epsilon_dpo/loss_margin_mean": 41.68669128417969, "grad_norm": 31.8677921295166, "kl/avg_steps": 0.75, "kl/beta": 0.033926136791706085, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.9528543949127197, "logits/rejected": -0.9351816177368164, "logps/chosen": -79.49683380126953, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -163.39059448242188, "loss": 0.696, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8244791030883789, "rewards/margins": 1.3996421098709106, "rewards/rejected": -2.22412109375, "step": 198 }, { "epoch": 0.2922173274596182, "epsilon_dpo/beta": 0.03346646949648857, "epsilon_dpo/beta_margin_grad_mean": -0.2789730429649353, "epsilon_dpo/beta_margin_grad_std": 0.20289357006549835, "epsilon_dpo/beta_margin_mean": 1.3355058431625366, "epsilon_dpo/beta_margin_std": 1.4378360509872437, "epsilon_dpo/loss_margin_mean": 40.09720993041992, "grad_norm": 34.80937957763672, "kl/avg_steps": 0.625, "kl/beta": 0.03367358446121216, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.9109436273574829, "logits/rejected": -0.8725607991218567, "logps/chosen": -81.19833374023438, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -149.11492919921875, "loss": 0.7562, "rewards/accuracies": 0.859375, "rewards/chosen": -0.817104697227478, "rewards/margins": 1.335505723953247, "rewards/rejected": -2.1526105403900146, "step": 199 }, { "epoch": 0.2936857562408223, "epsilon_dpo/beta": 0.03325860574841499, "epsilon_dpo/beta_margin_grad_mean": -0.2773591876029968, "epsilon_dpo/beta_margin_grad_std": 0.21103039383888245, "epsilon_dpo/beta_margin_mean": 1.4414976835250854, "epsilon_dpo/beta_margin_std": 1.6528784036636353, "epsilon_dpo/loss_margin_mean": 43.556739807128906, "grad_norm": 40.760223388671875, "kl/avg_steps": 0.625, "kl/beta": 0.03346443176269531, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.9012882709503174, "logits/rejected": -0.8924180865287781, "logps/chosen": -79.0744857788086, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -150.172607421875, "loss": 0.7763, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8672366142272949, "rewards/margins": 1.4414976835250854, "rewards/rejected": -2.30873441696167, "step": 200 }, { "epoch": 0.2936857562408223, "eval_epsilon_dpo/beta": 0.0331425815820694, "eval_epsilon_dpo/beta_margin_grad_mean": -0.3643336594104767, "eval_epsilon_dpo/beta_margin_grad_std": 0.21575622260570526, "eval_epsilon_dpo/beta_margin_mean": 0.830310583114624, "eval_epsilon_dpo/beta_margin_std": 1.4035773277282715, "eval_epsilon_dpo/loss_margin_mean": 25.31026840209961, "eval_kl/n_epsilon_steps": 0.3232020437717438, "eval_kl/p_epsilon_steps": 0.6759417653083801, "eval_logits/chosen": -0.9108002781867981, "eval_logits/rejected": -0.8736971020698547, "eval_logps/chosen": -112.24243927001953, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -145.29959106445312, "eval_loss": 0.534026563167572, "eval_rewards/accuracies": 0.7148972749710083, "eval_rewards/chosen": -1.1029342412948608, "eval_rewards/margins": 0.830310583114624, "eval_rewards/rejected": -1.9332449436187744, "eval_runtime": 43.1548, "eval_samples_per_second": 54.2, "eval_steps_per_second": 1.715, "step": 200 }, { "epoch": 0.29515418502202645, "epsilon_dpo/beta": 0.03303124010562897, "epsilon_dpo/beta_margin_grad_mean": -0.2606509029865265, "epsilon_dpo/beta_margin_grad_std": 0.19733315706253052, "epsilon_dpo/beta_margin_mean": 1.4968202114105225, "epsilon_dpo/beta_margin_std": 1.5476200580596924, "epsilon_dpo/loss_margin_mean": 45.50050735473633, "grad_norm": 32.52497482299805, "kl/avg_steps": 0.6875, "kl/beta": 0.03325657919049263, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.9245244264602661, "logits/rejected": -0.8867864608764648, "logps/chosen": -98.13409423828125, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -183.04058837890625, "loss": 0.6978, "rewards/accuracies": 0.875, "rewards/chosen": -0.7529622316360474, "rewards/margins": 1.4968202114105225, "rewards/rejected": -2.2497823238372803, "step": 201 }, { "epoch": 0.2966226138032305, "epsilon_dpo/beta": 0.03284699469804764, "epsilon_dpo/beta_margin_grad_mean": -0.3225107192993164, "epsilon_dpo/beta_margin_grad_std": 0.2235589325428009, "epsilon_dpo/beta_margin_mean": 1.0417420864105225, "epsilon_dpo/beta_margin_std": 1.408299446105957, "epsilon_dpo/loss_margin_mean": 31.94036293029785, "grad_norm": 43.633670806884766, "kl/avg_steps": 0.5625, "kl/beta": 0.033029500395059586, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.8974350094795227, "logits/rejected": -0.8948566913604736, "logps/chosen": -76.72681427001953, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -145.55088806152344, "loss": 0.9358, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8780945539474487, "rewards/margins": 1.0417420864105225, "rewards/rejected": -1.9198366403579712, "step": 202 }, { "epoch": 0.29809104258443464, "epsilon_dpo/beta": 0.03264273330569267, "epsilon_dpo/beta_margin_grad_mean": -0.28617146611213684, "epsilon_dpo/beta_margin_grad_std": 0.17759016156196594, "epsilon_dpo/beta_margin_mean": 1.1962802410125732, "epsilon_dpo/beta_margin_std": 1.203896403312683, "epsilon_dpo/loss_margin_mean": 36.81545639038086, "grad_norm": 31.58559799194336, "kl/avg_steps": 0.625, "kl/beta": 0.032844748347997665, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.9989201426506042, "logits/rejected": -0.887505292892456, "logps/chosen": -83.34878540039062, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -145.45053100585938, "loss": 0.7446, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7440138459205627, "rewards/margins": 1.1962802410125732, "rewards/rejected": -1.9402940273284912, "step": 203 }, { "epoch": 0.29955947136563876, "epsilon_dpo/beta": 0.03243998438119888, "epsilon_dpo/beta_margin_grad_mean": -0.2921689748764038, "epsilon_dpo/beta_margin_grad_std": 0.1850796341896057, "epsilon_dpo/beta_margin_mean": 1.230268955230713, "epsilon_dpo/beta_margin_std": 1.4135836362838745, "epsilon_dpo/loss_margin_mean": 38.102867126464844, "grad_norm": 34.97591018676758, "kl/avg_steps": 0.625, "kl/beta": 0.032640744000673294, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.9733670949935913, "logits/rejected": -0.8489159941673279, "logps/chosen": -107.8044204711914, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -153.438232421875, "loss": 0.7734, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8980864882469177, "rewards/margins": 1.230268955230713, "rewards/rejected": -2.1283555030822754, "step": 204 }, { "epoch": 0.3010279001468429, "epsilon_dpo/beta": 0.03224863111972809, "epsilon_dpo/beta_margin_grad_mean": -0.266499400138855, "epsilon_dpo/beta_margin_grad_std": 0.21775342524051666, "epsilon_dpo/beta_margin_mean": 1.4351390600204468, "epsilon_dpo/beta_margin_std": 1.4817283153533936, "epsilon_dpo/loss_margin_mean": 44.7449951171875, "grad_norm": 37.37656021118164, "kl/avg_steps": 0.59375, "kl/beta": 0.0324380062520504, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.9361932277679443, "logits/rejected": -0.8849596381187439, "logps/chosen": -88.91207885742188, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -159.39739990234375, "loss": 0.7467, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9544239044189453, "rewards/margins": 1.4351390600204468, "rewards/rejected": -2.3895630836486816, "step": 205 }, { "epoch": 0.302496328928047, "epsilon_dpo/beta": 0.0320381261408329, "epsilon_dpo/beta_margin_grad_mean": -0.2319086343050003, "epsilon_dpo/beta_margin_grad_std": 0.19648852944374084, "epsilon_dpo/beta_margin_mean": 1.6427158117294312, "epsilon_dpo/beta_margin_std": 1.4306899309158325, "epsilon_dpo/loss_margin_mean": 51.48557662963867, "grad_norm": 31.1833553314209, "kl/avg_steps": 0.65625, "kl/beta": 0.03224654123187065, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.9038573503494263, "logits/rejected": -0.919549822807312, "logps/chosen": -76.15838623046875, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -179.63294982910156, "loss": 0.6167, "rewards/accuracies": 0.875, "rewards/chosen": -0.9376621246337891, "rewards/margins": 1.6427156925201416, "rewards/rejected": -2.5803780555725098, "step": 206 }, { "epoch": 0.3039647577092511, "epsilon_dpo/beta": 0.031819239258766174, "epsilon_dpo/beta_margin_grad_mean": -0.24663381278514862, "epsilon_dpo/beta_margin_grad_std": 0.18707990646362305, "epsilon_dpo/beta_margin_mean": 1.7104490995407104, "epsilon_dpo/beta_margin_std": 1.7299070358276367, "epsilon_dpo/loss_margin_mean": 53.9384880065918, "grad_norm": 33.46366500854492, "kl/avg_steps": 0.6875, "kl/beta": 0.03203630447387695, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.9281418323516846, "logits/rejected": -0.862322211265564, "logps/chosen": -79.54106140136719, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -161.87109375, "loss": 0.6375, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7494818568229675, "rewards/margins": 1.710448980331421, "rewards/rejected": -2.459930896759033, "step": 207 }, { "epoch": 0.3054331864904552, "epsilon_dpo/beta": 0.031592030078172684, "epsilon_dpo/beta_margin_grad_mean": -0.2750071585178375, "epsilon_dpo/beta_margin_grad_std": 0.19815897941589355, "epsilon_dpo/beta_margin_mean": 1.3792335987091064, "epsilon_dpo/beta_margin_std": 1.4800665378570557, "epsilon_dpo/loss_margin_mean": 43.82465744018555, "grad_norm": 47.45777893066406, "kl/avg_steps": 0.71875, "kl/beta": 0.03181755915284157, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.922553539276123, "logits/rejected": -0.8577385544776917, "logps/chosen": -101.65859985351562, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -172.70254516601562, "loss": 0.7398, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0961982011795044, "rewards/margins": 1.3792335987091064, "rewards/rejected": -2.4754316806793213, "step": 208 }, { "epoch": 0.3069016152716593, "epsilon_dpo/beta": 0.03140607476234436, "epsilon_dpo/beta_margin_grad_mean": -0.27811017632484436, "epsilon_dpo/beta_margin_grad_std": 0.20568938553333282, "epsilon_dpo/beta_margin_mean": 1.3316532373428345, "epsilon_dpo/beta_margin_std": 1.4257279634475708, "epsilon_dpo/loss_margin_mean": 42.62771987915039, "grad_norm": 33.01478958129883, "kl/avg_steps": 0.59375, "kl/beta": 0.031590502709150314, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.8361650109291077, "logits/rejected": -0.807563304901123, "logps/chosen": -84.05644226074219, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -151.44398498535156, "loss": 0.7621, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8777600526809692, "rewards/margins": 1.331653356552124, "rewards/rejected": -2.209413528442383, "step": 209 }, { "epoch": 0.30837004405286345, "epsilon_dpo/beta": 0.031171627342700958, "epsilon_dpo/beta_margin_grad_mean": -0.2719811201095581, "epsilon_dpo/beta_margin_grad_std": 0.17441512644290924, "epsilon_dpo/beta_margin_mean": 1.3284344673156738, "epsilon_dpo/beta_margin_std": 1.2857555150985718, "epsilon_dpo/loss_margin_mean": 42.733673095703125, "grad_norm": 34.61418914794922, "kl/avg_steps": 0.75, "kl/beta": 0.0314040407538414, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.8779923915863037, "logits/rejected": -0.8523108959197998, "logps/chosen": -72.92080688476562, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -145.9642791748047, "loss": 0.6964, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8289717435836792, "rewards/margins": 1.3284344673156738, "rewards/rejected": -2.1574063301086426, "step": 210 }, { "epoch": 0.30983847283406757, "epsilon_dpo/beta": 0.031017513945698738, "epsilon_dpo/beta_margin_grad_mean": -0.3004344403743744, "epsilon_dpo/beta_margin_grad_std": 0.21112920343875885, "epsilon_dpo/beta_margin_mean": 1.1957648992538452, "epsilon_dpo/beta_margin_std": 1.424613118171692, "epsilon_dpo/loss_margin_mean": 38.81417465209961, "grad_norm": 46.81315994262695, "kl/avg_steps": 0.5, "kl/beta": 0.03117026388645172, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.9604991674423218, "logits/rejected": -0.8495550155639648, "logps/chosen": -118.53823852539062, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -165.2410430908203, "loss": 0.8303, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2315609455108643, "rewards/margins": 1.1957648992538452, "rewards/rejected": -2.42732572555542, "step": 211 }, { "epoch": 0.31130690161527164, "epsilon_dpo/beta": 0.030834117904305458, "epsilon_dpo/beta_margin_grad_mean": -0.24542976915836334, "epsilon_dpo/beta_margin_grad_std": 0.20712795853614807, "epsilon_dpo/beta_margin_mean": 1.7108030319213867, "epsilon_dpo/beta_margin_std": 1.6675610542297363, "epsilon_dpo/loss_margin_mean": 55.742095947265625, "grad_norm": 35.138973236083984, "kl/avg_steps": 0.59375, "kl/beta": 0.03101518750190735, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.8966690897941589, "logits/rejected": -0.919607400894165, "logps/chosen": -90.64710998535156, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05784606933594, "logps/rejected": -191.25003051757812, "loss": 0.6557, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0024387836456299, "rewards/margins": 1.7108030319213867, "rewards/rejected": -2.7132415771484375, "step": 212 }, { "epoch": 0.31277533039647576, "epsilon_dpo/beta": 0.030632847920060158, "epsilon_dpo/beta_margin_grad_mean": -0.24080544710159302, "epsilon_dpo/beta_margin_grad_std": 0.1985619217157364, "epsilon_dpo/beta_margin_mean": 1.7104308605194092, "epsilon_dpo/beta_margin_std": 1.6659079790115356, "epsilon_dpo/loss_margin_mean": 56.05814743041992, "grad_norm": 38.70331954956055, "kl/avg_steps": 0.65625, "kl/beta": 0.030832121148705482, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.9104989767074585, "logits/rejected": -0.8471628427505493, "logps/chosen": -98.09537506103516, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -180.55552673339844, "loss": 0.6496, "rewards/accuracies": 0.875, "rewards/chosen": -0.9383978247642517, "rewards/margins": 1.7104308605194092, "rewards/rejected": -2.6488285064697266, "step": 213 }, { "epoch": 0.3142437591776799, "epsilon_dpo/beta": 0.030418725684285164, "epsilon_dpo/beta_margin_grad_mean": -0.28497371077537537, "epsilon_dpo/beta_margin_grad_std": 0.19882433116436005, "epsilon_dpo/beta_margin_mean": 1.3660856485366821, "epsilon_dpo/beta_margin_std": 1.5981017351150513, "epsilon_dpo/loss_margin_mean": 45.08562469482422, "grad_norm": 46.01312255859375, "kl/avg_steps": 0.703125, "kl/beta": 0.030631106346845627, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.7757738828659058, "logits/rejected": -0.7755211591720581, "logps/chosen": -72.26516723632812, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -153.2633056640625, "loss": 0.7848, "rewards/accuracies": 0.875, "rewards/chosen": -0.9344457387924194, "rewards/margins": 1.3660856485366821, "rewards/rejected": -2.3005313873291016, "step": 214 }, { "epoch": 0.315712187958884, "epsilon_dpo/beta": 0.030230149626731873, "epsilon_dpo/beta_margin_grad_mean": -0.2637866735458374, "epsilon_dpo/beta_margin_grad_std": 0.1917775273323059, "epsilon_dpo/beta_margin_mean": 1.4585318565368652, "epsilon_dpo/beta_margin_std": 1.5505485534667969, "epsilon_dpo/loss_margin_mean": 48.46710205078125, "grad_norm": 36.04100036621094, "kl/avg_steps": 0.625, "kl/beta": 0.03041723370552063, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.8914802670478821, "logits/rejected": -0.8997035622596741, "logps/chosen": -86.42012023925781, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -171.57260131835938, "loss": 0.7021, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0044194459915161, "rewards/margins": 1.4585318565368652, "rewards/rejected": -2.462951183319092, "step": 215 }, { "epoch": 0.31718061674008813, "epsilon_dpo/beta": 0.030023491010069847, "epsilon_dpo/beta_margin_grad_mean": -0.26890814304351807, "epsilon_dpo/beta_margin_grad_std": 0.2200879454612732, "epsilon_dpo/beta_margin_mean": 1.5447771549224854, "epsilon_dpo/beta_margin_std": 1.6737757921218872, "epsilon_dpo/loss_margin_mean": 51.67399215698242, "grad_norm": 47.91659927368164, "kl/avg_steps": 0.6875, "kl/beta": 0.030228307470679283, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.8279985189437866, "logits/rejected": -0.7683322429656982, "logps/chosen": -85.96902465820312, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -165.9541473388672, "loss": 0.7448, "rewards/accuracies": 0.84375, "rewards/chosen": -1.115185260772705, "rewards/margins": 1.5447771549224854, "rewards/rejected": -2.6599624156951904, "step": 216 }, { "epoch": 0.3186490455212922, "epsilon_dpo/beta": 0.029762189835309982, "epsilon_dpo/beta_margin_grad_mean": -0.23406146466732025, "epsilon_dpo/beta_margin_grad_std": 0.18837563693523407, "epsilon_dpo/beta_margin_mean": 1.6387605667114258, "epsilon_dpo/beta_margin_std": 1.45741868019104, "epsilon_dpo/loss_margin_mean": 55.16669845581055, "grad_norm": 41.643062591552734, "kl/avg_steps": 0.875, "kl/beta": 0.03002190589904785, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.8984875082969666, "logits/rejected": -0.8015980124473572, "logps/chosen": -82.93193054199219, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -188.33164978027344, "loss": 0.6145, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9862960577011108, "rewards/margins": 1.6387605667114258, "rewards/rejected": -2.625056743621826, "step": 217 }, { "epoch": 0.3201174743024963, "epsilon_dpo/beta": 0.029578441753983498, "epsilon_dpo/beta_margin_grad_mean": -0.2751648426055908, "epsilon_dpo/beta_margin_grad_std": 0.20698566734790802, "epsilon_dpo/beta_margin_mean": 1.385319709777832, "epsilon_dpo/beta_margin_std": 1.5074750185012817, "epsilon_dpo/loss_margin_mean": 47.062767028808594, "grad_norm": 42.43049621582031, "kl/avg_steps": 0.625, "kl/beta": 0.02976149320602417, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.925573468208313, "logits/rejected": -0.8360555768013, "logps/chosen": -92.91569519042969, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -169.2481689453125, "loss": 0.7488, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0180344581604004, "rewards/margins": 1.385319709777832, "rewards/rejected": -2.4033541679382324, "step": 218 }, { "epoch": 0.32158590308370044, "epsilon_dpo/beta": 0.029385482892394066, "epsilon_dpo/beta_margin_grad_mean": -0.29207971692085266, "epsilon_dpo/beta_margin_grad_std": 0.18771642446517944, "epsilon_dpo/beta_margin_mean": 1.1679359674453735, "epsilon_dpo/beta_margin_std": 1.2931469678878784, "epsilon_dpo/loss_margin_mean": 39.917320251464844, "grad_norm": 42.281761169433594, "kl/avg_steps": 0.65625, "kl/beta": 0.02957664057612419, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.8818705677986145, "logits/rejected": -0.8401570320129395, "logps/chosen": -95.96543884277344, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -160.62933349609375, "loss": 0.7962, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0273648500442505, "rewards/margins": 1.167935848236084, "rewards/rejected": -2.195300817489624, "step": 219 }, { "epoch": 0.32305433186490456, "epsilon_dpo/beta": 0.029193896800279617, "epsilon_dpo/beta_margin_grad_mean": -0.3001166582107544, "epsilon_dpo/beta_margin_grad_std": 0.17635947465896606, "epsilon_dpo/beta_margin_mean": 1.0498942136764526, "epsilon_dpo/beta_margin_std": 1.0387635231018066, "epsilon_dpo/loss_margin_mean": 36.12261199951172, "grad_norm": 40.59983444213867, "kl/avg_steps": 0.65625, "kl/beta": 0.029383808374404907, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.9992510080337524, "logits/rejected": -0.8547923564910889, "logps/chosen": -106.59527587890625, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -160.37210083007812, "loss": 0.7876, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0686886310577393, "rewards/margins": 1.0498943328857422, "rewards/rejected": -2.1185829639434814, "step": 220 }, { "epoch": 0.3245227606461087, "epsilon_dpo/beta": 0.028957944363355637, "epsilon_dpo/beta_margin_grad_mean": -0.20571331679821014, "epsilon_dpo/beta_margin_grad_std": 0.17811840772628784, "epsilon_dpo/beta_margin_mean": 1.8437591791152954, "epsilon_dpo/beta_margin_std": 1.4009723663330078, "epsilon_dpo/loss_margin_mean": 63.807823181152344, "grad_norm": 35.39186096191406, "kl/avg_steps": 0.8125, "kl/beta": 0.029192235320806503, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.8663440346717834, "logits/rejected": -0.9057981967926025, "logps/chosen": -89.57997131347656, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -207.70077514648438, "loss": 0.5239, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0850903987884521, "rewards/margins": 1.843759298324585, "rewards/rejected": -2.928849697113037, "step": 221 }, { "epoch": 0.32599118942731276, "epsilon_dpo/beta": 0.0287607554346323, "epsilon_dpo/beta_margin_grad_mean": -0.27154117822647095, "epsilon_dpo/beta_margin_grad_std": 0.20897042751312256, "epsilon_dpo/beta_margin_mean": 1.3865907192230225, "epsilon_dpo/beta_margin_std": 1.4618659019470215, "epsilon_dpo/loss_margin_mean": 48.42705154418945, "grad_norm": 45.9877815246582, "kl/avg_steps": 0.6875, "kl/beta": 0.028956959024071693, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.951972246170044, "logits/rejected": -0.9163509607315063, "logps/chosen": -99.87564086914062, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -187.33274841308594, "loss": 0.7478, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1236631870269775, "rewards/margins": 1.3865907192230225, "rewards/rejected": -2.51025390625, "step": 222 }, { "epoch": 0.3274596182085169, "epsilon_dpo/beta": 0.028600329533219337, "epsilon_dpo/beta_margin_grad_mean": -0.2979002594947815, "epsilon_dpo/beta_margin_grad_std": 0.23002079129219055, "epsilon_dpo/beta_margin_mean": 1.200039029121399, "epsilon_dpo/beta_margin_std": 1.4345189332962036, "epsilon_dpo/loss_margin_mean": 42.24653625488281, "grad_norm": 52.527828216552734, "kl/avg_steps": 0.5625, "kl/beta": 0.028759239241480827, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.9274756908416748, "logits/rejected": -0.7958655953407288, "logps/chosen": -95.27392578125, "logps/ref_chosen": -52.64057922363281, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -167.70492553710938, "loss": 0.8655, "rewards/accuracies": 0.765625, "rewards/chosen": -1.2237632274627686, "rewards/margins": 1.2000389099121094, "rewards/rejected": -2.423802375793457, "step": 223 }, { "epoch": 0.328928046989721, "epsilon_dpo/beta": 0.02845822647213936, "epsilon_dpo/beta_margin_grad_mean": -0.26928919553756714, "epsilon_dpo/beta_margin_grad_std": 0.2087675780057907, "epsilon_dpo/beta_margin_mean": 1.4641945362091064, "epsilon_dpo/beta_margin_std": 1.5582221746444702, "epsilon_dpo/loss_margin_mean": 51.74797821044922, "grad_norm": 43.929931640625, "kl/avg_steps": 0.5, "kl/beta": 0.028598373755812645, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.9127311706542969, "logits/rejected": -0.8163399696350098, "logps/chosen": -83.71321105957031, "logps/ref_chosen": -48.59540939331055, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -163.98226928710938, "loss": 0.7383, "rewards/accuracies": 0.859375, "rewards/chosen": -1.002156376838684, "rewards/margins": 1.4641945362091064, "rewards/rejected": -2.46635103225708, "step": 224 }, { "epoch": 0.3303964757709251, "epsilon_dpo/beta": 0.028254389762878418, "epsilon_dpo/beta_margin_grad_mean": -0.22889071702957153, "epsilon_dpo/beta_margin_grad_std": 0.1976906806230545, "epsilon_dpo/beta_margin_mean": 1.7548757791519165, "epsilon_dpo/beta_margin_std": 1.5744292736053467, "epsilon_dpo/loss_margin_mean": 62.31322479248047, "grad_norm": 35.99250411987305, "kl/avg_steps": 0.71875, "kl/beta": 0.02845609374344349, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.9548609256744385, "logits/rejected": -0.9496945142745972, "logps/chosen": -97.8477783203125, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90290832519531, "logps/rejected": -202.06344604492188, "loss": 0.6025, "rewards/accuracies": 0.890625, "rewards/chosen": -1.1272399425506592, "rewards/margins": 1.754875659942627, "rewards/rejected": -2.882115602493286, "step": 225 }, { "epoch": 0.33186490455212925, "epsilon_dpo/beta": 0.02807479165494442, "epsilon_dpo/beta_margin_grad_mean": -0.2813953459262848, "epsilon_dpo/beta_margin_grad_std": 0.18881773948669434, "epsilon_dpo/beta_margin_mean": 1.2987498044967651, "epsilon_dpo/beta_margin_std": 1.3646435737609863, "epsilon_dpo/loss_margin_mean": 46.46550369262695, "grad_norm": 39.40699768066406, "kl/avg_steps": 0.640625, "kl/beta": 0.02825302444398403, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.9883425831794739, "logits/rejected": -0.858059823513031, "logps/chosen": -91.37918090820312, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -157.63363647460938, "loss": 0.7437, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9137367010116577, "rewards/margins": 1.2987498044967651, "rewards/rejected": -2.2124862670898438, "step": 226 }, { "epoch": 0.3333333333333333, "epsilon_dpo/beta": 0.027891740202903748, "epsilon_dpo/beta_margin_grad_mean": -0.2355383336544037, "epsilon_dpo/beta_margin_grad_std": 0.19523394107818604, "epsilon_dpo/beta_margin_mean": 1.6658804416656494, "epsilon_dpo/beta_margin_std": 1.555821180343628, "epsilon_dpo/loss_margin_mean": 59.96611404418945, "grad_norm": 36.23415756225586, "kl/avg_steps": 0.65625, "kl/beta": 0.028073180466890335, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.9570825099945068, "logits/rejected": -0.9183826446533203, "logps/chosen": -95.04913330078125, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -195.35543823242188, "loss": 0.6364, "rewards/accuracies": 0.875, "rewards/chosen": -1.0061688423156738, "rewards/margins": 1.6658804416656494, "rewards/rejected": -2.6720492839813232, "step": 227 }, { "epoch": 0.33480176211453744, "epsilon_dpo/beta": 0.027744758874177933, "epsilon_dpo/beta_margin_grad_mean": -0.2855328619480133, "epsilon_dpo/beta_margin_grad_std": 0.19863800704479218, "epsilon_dpo/beta_margin_mean": 1.2065411806106567, "epsilon_dpo/beta_margin_std": 1.2601219415664673, "epsilon_dpo/loss_margin_mean": 43.744407653808594, "grad_norm": 37.50095748901367, "kl/avg_steps": 0.53125, "kl/beta": 0.027890151366591454, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.9429744482040405, "logits/rejected": -0.9067270755767822, "logps/chosen": -101.53741455078125, "logps/ref_chosen": -65.89129638671875, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -170.43927001953125, "loss": 0.7764, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9902582168579102, "rewards/margins": 1.2065410614013672, "rewards/rejected": -2.1967992782592773, "step": 228 }, { "epoch": 0.33627019089574156, "epsilon_dpo/beta": 0.027563462033867836, "epsilon_dpo/beta_margin_grad_mean": -0.2710328698158264, "epsilon_dpo/beta_margin_grad_std": 0.2131662368774414, "epsilon_dpo/beta_margin_mean": 1.3845741748809814, "epsilon_dpo/beta_margin_std": 1.4678245782852173, "epsilon_dpo/loss_margin_mean": 50.469581604003906, "grad_norm": 38.05533981323242, "kl/avg_steps": 0.65625, "kl/beta": 0.027742767706513405, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.963711142539978, "logits/rejected": -0.8549022078514099, "logps/chosen": -108.31786346435547, "logps/ref_chosen": -70.70636749267578, "logps/ref_rejected": -84.52740478515625, "logps/rejected": -172.60848999023438, "loss": 0.7629, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0388529300689697, "rewards/margins": 1.3845741748809814, "rewards/rejected": -2.423427104949951, "step": 229 }, { "epoch": 0.3377386196769457, "epsilon_dpo/beta": 0.0273579154163599, "epsilon_dpo/beta_margin_grad_mean": -0.23891520500183105, "epsilon_dpo/beta_margin_grad_std": 0.1923268884420395, "epsilon_dpo/beta_margin_mean": 1.6638827323913574, "epsilon_dpo/beta_margin_std": 1.6286611557006836, "epsilon_dpo/loss_margin_mean": 61.025386810302734, "grad_norm": 33.867340087890625, "kl/avg_steps": 0.75, "kl/beta": 0.027561893686652184, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.8555896282196045, "logits/rejected": -0.8511315584182739, "logps/chosen": -71.353759765625, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -178.71905517578125, "loss": 0.6508, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8789701461791992, "rewards/margins": 1.6638827323913574, "rewards/rejected": -2.5428528785705566, "step": 230 }, { "epoch": 0.3392070484581498, "epsilon_dpo/beta": 0.0271841399371624, "epsilon_dpo/beta_margin_grad_mean": -0.2801465094089508, "epsilon_dpo/beta_margin_grad_std": 0.17204877734184265, "epsilon_dpo/beta_margin_mean": 1.3108634948730469, "epsilon_dpo/beta_margin_std": 1.3848408460617065, "epsilon_dpo/loss_margin_mean": 48.394615173339844, "grad_norm": 30.824636459350586, "kl/avg_steps": 0.640625, "kl/beta": 0.027356717735528946, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.9609138369560242, "logits/rejected": -0.8604573607444763, "logps/chosen": -94.687255859375, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -153.92933654785156, "loss": 0.7175, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8553760051727295, "rewards/margins": 1.3108634948730469, "rewards/rejected": -2.1662395000457764, "step": 231 }, { "epoch": 0.3406754772393539, "epsilon_dpo/beta": 0.027032380923628807, "epsilon_dpo/beta_margin_grad_mean": -0.32459476590156555, "epsilon_dpo/beta_margin_grad_std": 0.2017158418893814, "epsilon_dpo/beta_margin_mean": 1.0300805568695068, "epsilon_dpo/beta_margin_std": 1.3635324239730835, "epsilon_dpo/loss_margin_mean": 38.336116790771484, "grad_norm": 41.94512939453125, "kl/avg_steps": 0.5625, "kl/beta": 0.027182579040527344, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.9774780869483948, "logits/rejected": -0.7848711013793945, "logps/chosen": -110.14714813232422, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -161.71157836914062, "loss": 0.8942, "rewards/accuracies": 0.84375, "rewards/chosen": -1.068268060684204, "rewards/margins": 1.0300805568695068, "rewards/rejected": -2.098348617553711, "step": 232 }, { "epoch": 0.342143906020558, "epsilon_dpo/beta": 0.0268896222114563, "epsilon_dpo/beta_margin_grad_mean": -0.27420952916145325, "epsilon_dpo/beta_margin_grad_std": 0.22176600992679596, "epsilon_dpo/beta_margin_mean": 1.5114142894744873, "epsilon_dpo/beta_margin_std": 1.7114201784133911, "epsilon_dpo/loss_margin_mean": 56.54182815551758, "grad_norm": 36.417903900146484, "kl/avg_steps": 0.53125, "kl/beta": 0.02703053317964077, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.9822086095809937, "logits/rejected": -0.9409425258636475, "logps/chosen": -90.49263763427734, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.48007202148438, "logps/rejected": -199.63160705566406, "loss": 0.7647, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9597580432891846, "rewards/margins": 1.5114142894744873, "rewards/rejected": -2.471172332763672, "step": 233 }, { "epoch": 0.3436123348017621, "epsilon_dpo/beta": 0.026680300012230873, "epsilon_dpo/beta_margin_grad_mean": -0.23809513449668884, "epsilon_dpo/beta_margin_grad_std": 0.18818014860153198, "epsilon_dpo/beta_margin_mean": 1.5255002975463867, "epsilon_dpo/beta_margin_std": 1.3624063730239868, "epsilon_dpo/loss_margin_mean": 57.353092193603516, "grad_norm": 34.17942428588867, "kl/avg_steps": 0.78125, "kl/beta": 0.026887692511081696, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.9014260768890381, "logits/rejected": -0.9556354284286499, "logps/chosen": -76.05662536621094, "logps/ref_chosen": -44.09451675415039, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -189.32183837890625, "loss": 0.6489, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8547545671463013, "rewards/margins": 1.5255002975463867, "rewards/rejected": -2.3802549839019775, "step": 234 }, { "epoch": 0.34508076358296624, "epsilon_dpo/beta": 0.026548519730567932, "epsilon_dpo/beta_margin_grad_mean": -0.3122892677783966, "epsilon_dpo/beta_margin_grad_std": 0.2024298459291458, "epsilon_dpo/beta_margin_mean": 1.119795322418213, "epsilon_dpo/beta_margin_std": 1.4032968282699585, "epsilon_dpo/loss_margin_mean": 42.453731536865234, "grad_norm": 47.0859489440918, "kl/avg_steps": 0.5, "kl/beta": 0.026679260656237602, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.9289396405220032, "logits/rejected": -0.8603818416595459, "logps/chosen": -106.87683868408203, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39505767822266, "logps/rejected": -177.4877166748047, "loss": 0.8567, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1885778903961182, "rewards/margins": 1.119795322418213, "rewards/rejected": -2.308373212814331, "step": 235 }, { "epoch": 0.3465491923641703, "epsilon_dpo/beta": 0.026354169473052025, "epsilon_dpo/beta_margin_grad_mean": -0.26097211241722107, "epsilon_dpo/beta_margin_grad_std": 0.18908995389938354, "epsilon_dpo/beta_margin_mean": 1.3955023288726807, "epsilon_dpo/beta_margin_std": 1.2780405282974243, "epsilon_dpo/loss_margin_mean": 53.129547119140625, "grad_norm": 44.04022216796875, "kl/avg_steps": 0.734375, "kl/beta": 0.026546526700258255, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.8796597719192505, "logits/rejected": -0.9121089577674866, "logps/chosen": -82.28189086914062, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -189.5817108154297, "loss": 0.6821, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8701371550559998, "rewards/margins": 1.3955023288726807, "rewards/rejected": -2.265639543533325, "step": 236 }, { "epoch": 0.34801762114537443, "epsilon_dpo/beta": 0.02617851458489895, "epsilon_dpo/beta_margin_grad_mean": -0.2683834135532379, "epsilon_dpo/beta_margin_grad_std": 0.1993410587310791, "epsilon_dpo/beta_margin_mean": 1.3627753257751465, "epsilon_dpo/beta_margin_std": 1.378995656967163, "epsilon_dpo/loss_margin_mean": 52.28477478027344, "grad_norm": 43.82939910888672, "kl/avg_steps": 0.671875, "kl/beta": 0.02635299786925316, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.8822282552719116, "logits/rejected": -0.8408148288726807, "logps/chosen": -93.03535461425781, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -185.932373046875, "loss": 0.7318, "rewards/accuracies": 0.875, "rewards/chosen": -1.0194590091705322, "rewards/margins": 1.3627753257751465, "rewards/rejected": -2.3822340965270996, "step": 237 }, { "epoch": 0.34948604992657856, "epsilon_dpo/beta": 0.02600793167948723, "epsilon_dpo/beta_margin_grad_mean": -0.2891097664833069, "epsilon_dpo/beta_margin_grad_std": 0.17171606421470642, "epsilon_dpo/beta_margin_mean": 1.1959774494171143, "epsilon_dpo/beta_margin_std": 1.265442967414856, "epsilon_dpo/loss_margin_mean": 46.166481018066406, "grad_norm": 48.45499801635742, "kl/avg_steps": 0.65625, "kl/beta": 0.026177119463682175, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.8099647760391235, "logits/rejected": -0.8988782167434692, "logps/chosen": -92.368408203125, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -173.97970581054688, "loss": 0.7486, "rewards/accuracies": 0.890625, "rewards/chosen": -0.999852180480957, "rewards/margins": 1.1959774494171143, "rewards/rejected": -2.1958298683166504, "step": 238 }, { "epoch": 0.3509544787077827, "epsilon_dpo/beta": 0.025830240920186043, "epsilon_dpo/beta_margin_grad_mean": -0.27064305543899536, "epsilon_dpo/beta_margin_grad_std": 0.18519093096256256, "epsilon_dpo/beta_margin_mean": 1.352831244468689, "epsilon_dpo/beta_margin_std": 1.3413230180740356, "epsilon_dpo/loss_margin_mean": 52.579769134521484, "grad_norm": 49.931846618652344, "kl/avg_steps": 0.6875, "kl/beta": 0.02600645273923874, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.8864554166793823, "logits/rejected": -0.8159253597259521, "logps/chosen": -100.35231018066406, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -188.42718505859375, "loss": 0.7085, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0936906337738037, "rewards/margins": 1.3528313636779785, "rewards/rejected": -2.446521759033203, "step": 239 }, { "epoch": 0.3524229074889868, "epsilon_dpo/beta": 0.025670016184449196, "epsilon_dpo/beta_margin_grad_mean": -0.28202009201049805, "epsilon_dpo/beta_margin_grad_std": 0.2018457055091858, "epsilon_dpo/beta_margin_mean": 1.24077308177948, "epsilon_dpo/beta_margin_std": 1.2659971714019775, "epsilon_dpo/loss_margin_mean": 48.58975601196289, "grad_norm": 36.69154739379883, "kl/avg_steps": 0.625, "kl/beta": 0.025828879326581955, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.861579954624176, "logits/rejected": -0.8070861101150513, "logps/chosen": -103.43043518066406, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -165.52423095703125, "loss": 0.7647, "rewards/accuracies": 0.828125, "rewards/chosen": -1.100210189819336, "rewards/margins": 1.2407732009887695, "rewards/rejected": -2.3409833908081055, "step": 240 }, { "epoch": 0.35389133627019087, "epsilon_dpo/beta": 0.025486508384346962, "epsilon_dpo/beta_margin_grad_mean": -0.2389088273048401, "epsilon_dpo/beta_margin_grad_std": 0.1938583105802536, "epsilon_dpo/beta_margin_mean": 1.7226759195327759, "epsilon_dpo/beta_margin_std": 1.789014458656311, "epsilon_dpo/loss_margin_mean": 67.83097076416016, "grad_norm": 38.793495178222656, "kl/avg_steps": 0.71875, "kl/beta": 0.025668451562523842, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.8475504517555237, "logits/rejected": -0.7058205604553223, "logps/chosen": -104.88059997558594, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -194.681396484375, "loss": 0.6398, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9906549453735352, "rewards/margins": 1.7226760387420654, "rewards/rejected": -2.7133307456970215, "step": 241 }, { "epoch": 0.355359765051395, "epsilon_dpo/beta": 0.025296665728092194, "epsilon_dpo/beta_margin_grad_mean": -0.2969255745410919, "epsilon_dpo/beta_margin_grad_std": 0.1766965687274933, "epsilon_dpo/beta_margin_mean": 1.155187726020813, "epsilon_dpo/beta_margin_std": 1.2903164625167847, "epsilon_dpo/loss_margin_mean": 45.810943603515625, "grad_norm": 69.306884765625, "kl/avg_steps": 0.75, "kl/beta": 0.02548527531325817, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.7396622896194458, "logits/rejected": -0.7340766191482544, "logps/chosen": -115.20914459228516, "logps/ref_chosen": -67.86392211914062, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -176.51649475097656, "loss": 0.7818, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1993852853775024, "rewards/margins": 1.1551876068115234, "rewards/rejected": -2.3545730113983154, "step": 242 }, { "epoch": 0.3568281938325991, "epsilon_dpo/beta": 0.025108352303504944, "epsilon_dpo/beta_margin_grad_mean": -0.23020483553409576, "epsilon_dpo/beta_margin_grad_std": 0.17036239802837372, "epsilon_dpo/beta_margin_mean": 1.6202038526535034, "epsilon_dpo/beta_margin_std": 1.3733315467834473, "epsilon_dpo/loss_margin_mean": 64.70569610595703, "grad_norm": 44.07530975341797, "kl/avg_steps": 0.75, "kl/beta": 0.02529555931687355, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.8574939966201782, "logits/rejected": -0.7249910831451416, "logps/chosen": -111.91984558105469, "logps/ref_chosen": -63.08424377441406, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -189.87693786621094, "loss": 0.5833, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2278523445129395, "rewards/margins": 1.6202038526535034, "rewards/rejected": -2.8480563163757324, "step": 243 }, { "epoch": 0.35829662261380324, "epsilon_dpo/beta": 0.024944983422756195, "epsilon_dpo/beta_margin_grad_mean": -0.2581092119216919, "epsilon_dpo/beta_margin_grad_std": 0.18795832991600037, "epsilon_dpo/beta_margin_mean": 1.4344532489776611, "epsilon_dpo/beta_margin_std": 1.32624351978302, "epsilon_dpo/loss_margin_mean": 57.7309455871582, "grad_norm": 49.93527603149414, "kl/avg_steps": 0.65625, "kl/beta": 0.025107255205512047, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.7641308903694153, "logits/rejected": -0.7154449224472046, "logps/chosen": -104.9389877319336, "logps/ref_chosen": -61.14069366455078, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -196.42117309570312, "loss": 0.6755, "rewards/accuracies": 0.875, "rewards/chosen": -1.093597650527954, "rewards/margins": 1.4344532489776611, "rewards/rejected": -2.5280508995056152, "step": 244 }, { "epoch": 0.35976505139500736, "epsilon_dpo/beta": 0.024797938764095306, "epsilon_dpo/beta_margin_grad_mean": -0.277778685092926, "epsilon_dpo/beta_margin_grad_std": 0.20800314843654633, "epsilon_dpo/beta_margin_mean": 1.4760335683822632, "epsilon_dpo/beta_margin_std": 1.7903791666030884, "epsilon_dpo/loss_margin_mean": 59.82563400268555, "grad_norm": 54.38076400756836, "kl/avg_steps": 0.59375, "kl/beta": 0.024943562224507332, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.8543267250061035, "logits/rejected": -0.7700981497764587, "logps/chosen": -117.57855224609375, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -197.78199768066406, "loss": 0.7694, "rewards/accuracies": 0.828125, "rewards/chosen": -1.249393343925476, "rewards/margins": 1.4760334491729736, "rewards/rejected": -2.72542667388916, "step": 245 }, { "epoch": 0.36123348017621143, "epsilon_dpo/beta": 0.024620573967695236, "epsilon_dpo/beta_margin_grad_mean": -0.28294581174850464, "epsilon_dpo/beta_margin_grad_std": 0.19259461760520935, "epsilon_dpo/beta_margin_mean": 1.3314363956451416, "epsilon_dpo/beta_margin_std": 1.5045994520187378, "epsilon_dpo/loss_margin_mean": 54.27648162841797, "grad_norm": 41.48917007446289, "kl/avg_steps": 0.71875, "kl/beta": 0.02479633502662182, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.8397436141967773, "logits/rejected": -0.7311065196990967, "logps/chosen": -113.2144546508789, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -185.14031982421875, "loss": 0.757, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1463537216186523, "rewards/margins": 1.3314363956451416, "rewards/rejected": -2.477789878845215, "step": 246 }, { "epoch": 0.36270190895741555, "epsilon_dpo/beta": 0.024429485201835632, "epsilon_dpo/beta_margin_grad_mean": -0.22065015137195587, "epsilon_dpo/beta_margin_grad_std": 0.19751501083374023, "epsilon_dpo/beta_margin_mean": 1.9003351926803589, "epsilon_dpo/beta_margin_std": 1.767119288444519, "epsilon_dpo/loss_margin_mean": 78.00141143798828, "grad_norm": 43.01525115966797, "kl/avg_steps": 0.78125, "kl/beta": 0.02461938187479973, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.7675491571426392, "logits/rejected": -0.7368471622467041, "logps/chosen": -110.41958618164062, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29327392578125, "logps/rejected": -238.10891723632812, "loss": 0.5939, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3169013261795044, "rewards/margins": 1.9003353118896484, "rewards/rejected": -3.2172365188598633, "step": 247 }, { "epoch": 0.3641703377386197, "epsilon_dpo/beta": 0.024217206984758377, "epsilon_dpo/beta_margin_grad_mean": -0.2548842430114746, "epsilon_dpo/beta_margin_grad_std": 0.15879112482070923, "epsilon_dpo/beta_margin_mean": 1.3608661890029907, "epsilon_dpo/beta_margin_std": 1.123822569847107, "epsilon_dpo/loss_margin_mean": 56.26406478881836, "grad_norm": 49.93627166748047, "kl/avg_steps": 0.875, "kl/beta": 0.024428535252809525, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.7783492803573608, "logits/rejected": -0.7796909809112549, "logps/chosen": -89.22013854980469, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -193.29786682128906, "loss": 0.6365, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0947335958480835, "rewards/margins": 1.3608663082122803, "rewards/rejected": -2.455599784851074, "step": 248 }, { "epoch": 0.3656387665198238, "epsilon_dpo/beta": 0.02412823960185051, "epsilon_dpo/beta_margin_grad_mean": -0.3194911479949951, "epsilon_dpo/beta_margin_grad_std": 0.2219369113445282, "epsilon_dpo/beta_margin_mean": 1.0834875106811523, "epsilon_dpo/beta_margin_std": 1.4188543558120728, "epsilon_dpo/loss_margin_mean": 45.29121398925781, "grad_norm": 63.952247619628906, "kl/avg_steps": 0.375, "kl/beta": 0.02421663887798786, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.832280158996582, "logits/rejected": -0.7321330308914185, "logps/chosen": -115.44468688964844, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -178.76162719726562, "loss": 0.9049, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2819901704788208, "rewards/margins": 1.0834875106811523, "rewards/rejected": -2.3654775619506836, "step": 249 }, { "epoch": 0.3671071953010279, "epsilon_dpo/beta": 0.0239400751888752, "epsilon_dpo/beta_margin_grad_mean": -0.23697948455810547, "epsilon_dpo/beta_margin_grad_std": 0.16869625449180603, "epsilon_dpo/beta_margin_mean": 1.4498459100723267, "epsilon_dpo/beta_margin_std": 1.092609167098999, "epsilon_dpo/loss_margin_mean": 60.717708587646484, "grad_norm": 42.29915237426758, "kl/avg_steps": 0.78125, "kl/beta": 0.024126166477799416, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.8870829343795776, "logits/rejected": -0.6928812265396118, "logps/chosen": -108.88627624511719, "logps/ref_chosen": -65.6366958618164, "logps/ref_rejected": -73.87183380126953, "logps/rejected": -177.83912658691406, "loss": 0.6012, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0348234176635742, "rewards/margins": 1.449845790863037, "rewards/rejected": -2.4846692085266113, "step": 250 }, { "epoch": 0.368575624082232, "epsilon_dpo/beta": 0.023791901767253876, "epsilon_dpo/beta_margin_grad_mean": -0.2746790051460266, "epsilon_dpo/beta_margin_grad_std": 0.19281445443630219, "epsilon_dpo/beta_margin_mean": 1.3085737228393555, "epsilon_dpo/beta_margin_std": 1.2904928922653198, "epsilon_dpo/loss_margin_mean": 55.25014877319336, "grad_norm": 38.71930694580078, "kl/avg_steps": 0.625, "kl/beta": 0.023939142003655434, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.8276119232177734, "logits/rejected": -0.7018730640411377, "logps/chosen": -111.89410400390625, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -187.62496948242188, "loss": 0.7294, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3019583225250244, "rewards/margins": 1.3085737228393555, "rewards/rejected": -2.610532283782959, "step": 251 }, { "epoch": 0.3700440528634361, "epsilon_dpo/beta": 0.023599514737725258, "epsilon_dpo/beta_margin_grad_mean": -0.23693950474262238, "epsilon_dpo/beta_margin_grad_std": 0.17138709127902985, "epsilon_dpo/beta_margin_mean": 1.4469765424728394, "epsilon_dpo/beta_margin_std": 1.207213282585144, "epsilon_dpo/loss_margin_mean": 61.47700119018555, "grad_norm": 54.85141372680664, "kl/avg_steps": 0.8125, "kl/beta": 0.023790450766682625, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.8772724866867065, "logits/rejected": -0.7638910412788391, "logps/chosen": -123.50946044921875, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75798797607422, "logps/rejected": -198.05880737304688, "loss": 0.6352, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2233631610870361, "rewards/margins": 1.4469765424728394, "rewards/rejected": -2.670339584350586, "step": 252 }, { "epoch": 0.37151248164464024, "epsilon_dpo/beta": 0.02346094138920307, "epsilon_dpo/beta_margin_grad_mean": -0.2852194905281067, "epsilon_dpo/beta_margin_grad_std": 0.21233250200748444, "epsilon_dpo/beta_margin_mean": 1.269000768661499, "epsilon_dpo/beta_margin_std": 1.4000648260116577, "epsilon_dpo/loss_margin_mean": 54.386932373046875, "grad_norm": 51.151695251464844, "kl/avg_steps": 0.59375, "kl/beta": 0.02359871193766594, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.8559330701828003, "logits/rejected": -0.8246597051620483, "logps/chosen": -115.36244201660156, "logps/ref_chosen": -69.13392639160156, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -199.31797790527344, "loss": 0.8005, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0857230424880981, "rewards/margins": 1.269000768661499, "rewards/rejected": -2.3547239303588867, "step": 253 }, { "epoch": 0.37298091042584436, "epsilon_dpo/beta": 0.023337125778198242, "epsilon_dpo/beta_margin_grad_mean": -0.29857712984085083, "epsilon_dpo/beta_margin_grad_std": 0.21847309172153473, "epsilon_dpo/beta_margin_mean": 1.268110990524292, "epsilon_dpo/beta_margin_std": 1.560611367225647, "epsilon_dpo/loss_margin_mean": 54.68608856201172, "grad_norm": 73.59979248046875, "kl/avg_steps": 0.53125, "kl/beta": 0.0234594214707613, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.8435882329940796, "logits/rejected": -0.7198815941810608, "logps/chosen": -108.90068054199219, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -199.73941040039062, "loss": 0.8367, "rewards/accuracies": 0.796875, "rewards/chosen": -1.280766487121582, "rewards/margins": 1.268110990524292, "rewards/rejected": -2.548877716064453, "step": 254 }, { "epoch": 0.3744493392070485, "epsilon_dpo/beta": 0.023177336901426315, "epsilon_dpo/beta_margin_grad_mean": -0.296540766954422, "epsilon_dpo/beta_margin_grad_std": 0.2287638783454895, "epsilon_dpo/beta_margin_mean": 1.3300968408584595, "epsilon_dpo/beta_margin_std": 1.7163633108139038, "epsilon_dpo/loss_margin_mean": 57.690494537353516, "grad_norm": 83.69367218017578, "kl/avg_steps": 0.6875, "kl/beta": 0.023335451260209084, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.7430202960968018, "logits/rejected": -0.6945962905883789, "logps/chosen": -118.17829895019531, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -208.9357147216797, "loss": 0.8796, "rewards/accuracies": 0.859375, "rewards/chosen": -1.417651891708374, "rewards/margins": 1.330096960067749, "rewards/rejected": -2.747748851776123, "step": 255 }, { "epoch": 0.37591776798825255, "epsilon_dpo/beta": 0.023040810599923134, "epsilon_dpo/beta_margin_grad_mean": -0.26910004019737244, "epsilon_dpo/beta_margin_grad_std": 0.2048642784357071, "epsilon_dpo/beta_margin_mean": 1.4319080114364624, "epsilon_dpo/beta_margin_std": 1.4983351230621338, "epsilon_dpo/loss_margin_mean": 62.45432662963867, "grad_norm": 53.05121994018555, "kl/avg_steps": 0.59375, "kl/beta": 0.02317611500620842, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.7182353734970093, "logits/rejected": -0.665188729763031, "logps/chosen": -111.40750122070312, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -211.26126098632812, "loss": 0.7279, "rewards/accuracies": 0.859375, "rewards/chosen": -1.298081874847412, "rewards/margins": 1.4319078922271729, "rewards/rejected": -2.729990005493164, "step": 256 }, { "epoch": 0.37738619676945667, "epsilon_dpo/beta": 0.022919215261936188, "epsilon_dpo/beta_margin_grad_mean": -0.2688346207141876, "epsilon_dpo/beta_margin_grad_std": 0.20320047438144684, "epsilon_dpo/beta_margin_mean": 1.5189297199249268, "epsilon_dpo/beta_margin_std": 1.622308611869812, "epsilon_dpo/loss_margin_mean": 66.6268539428711, "grad_norm": 67.98104095458984, "kl/avg_steps": 0.53125, "kl/beta": 0.02303932048380375, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.7001844644546509, "logits/rejected": -0.662509024143219, "logps/chosen": -100.78417205810547, "logps/ref_chosen": -49.4236946105957, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -197.52523803710938, "loss": 0.7147, "rewards/accuracies": 0.78125, "rewards/chosen": -1.176638126373291, "rewards/margins": 1.5189297199249268, "rewards/rejected": -2.6955676078796387, "step": 257 }, { "epoch": 0.3788546255506608, "epsilon_dpo/beta": 0.022769449278712273, "epsilon_dpo/beta_margin_grad_mean": -0.27944010496139526, "epsilon_dpo/beta_margin_grad_std": 0.2202530950307846, "epsilon_dpo/beta_margin_mean": 1.296033501625061, "epsilon_dpo/beta_margin_std": 1.511138677597046, "epsilon_dpo/loss_margin_mean": 57.230464935302734, "grad_norm": 54.61482620239258, "kl/avg_steps": 0.65625, "kl/beta": 0.022917570546269417, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.7847526669502258, "logits/rejected": -0.742351770401001, "logps/chosen": -116.78972625732422, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.9901123046875, "logps/rejected": -210.6261749267578, "loss": 0.8349, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3097673654556274, "rewards/margins": 1.296033501625061, "rewards/rejected": -2.6058008670806885, "step": 258 }, { "epoch": 0.3803230543318649, "epsilon_dpo/beta": 0.0226423479616642, "epsilon_dpo/beta_margin_grad_mean": -0.29370635747909546, "epsilon_dpo/beta_margin_grad_std": 0.1979660838842392, "epsilon_dpo/beta_margin_mean": 1.197819709777832, "epsilon_dpo/beta_margin_std": 1.2998988628387451, "epsilon_dpo/loss_margin_mean": 53.186553955078125, "grad_norm": 43.3511848449707, "kl/avg_steps": 0.5625, "kl/beta": 0.022768154740333557, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.7879418134689331, "logits/rejected": -0.7493730783462524, "logps/chosen": -103.28313446044922, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.19165802001953, "logps/rejected": -192.8330078125, "loss": 0.7926, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1445544958114624, "rewards/margins": 1.197819709777832, "rewards/rejected": -2.342374324798584, "step": 259 }, { "epoch": 0.38179148311306904, "epsilon_dpo/beta": 0.02247324213385582, "epsilon_dpo/beta_margin_grad_mean": -0.28044986724853516, "epsilon_dpo/beta_margin_grad_std": 0.1815163493156433, "epsilon_dpo/beta_margin_mean": 1.2883044481277466, "epsilon_dpo/beta_margin_std": 1.362417221069336, "epsilon_dpo/loss_margin_mean": 57.508216857910156, "grad_norm": 37.80736541748047, "kl/avg_steps": 0.75, "kl/beta": 0.02264080010354519, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.7794230580329895, "logits/rejected": -0.8244825601577759, "logps/chosen": -96.36808776855469, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08979034423828, "logps/rejected": -201.5484161376953, "loss": 0.7334, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1006656885147095, "rewards/margins": 1.2883045673370361, "rewards/rejected": -2.388970375061035, "step": 260 }, { "epoch": 0.3832599118942731, "epsilon_dpo/beta": 0.022312970831990242, "epsilon_dpo/beta_margin_grad_mean": -0.2600063979625702, "epsilon_dpo/beta_margin_grad_std": 0.178856760263443, "epsilon_dpo/beta_margin_mean": 1.3869123458862305, "epsilon_dpo/beta_margin_std": 1.3287155628204346, "epsilon_dpo/loss_margin_mean": 62.372074127197266, "grad_norm": 46.86500549316406, "kl/avg_steps": 0.71875, "kl/beta": 0.022472258657217026, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.8502944707870483, "logits/rejected": -0.7972038984298706, "logps/chosen": -92.98090362548828, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -190.8365478515625, "loss": 0.6733, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8925292491912842, "rewards/margins": 1.3869123458862305, "rewards/rejected": -2.2794415950775146, "step": 261 }, { "epoch": 0.38472834067547723, "epsilon_dpo/beta": 0.022153738886117935, "epsilon_dpo/beta_margin_grad_mean": -0.28383615612983704, "epsilon_dpo/beta_margin_grad_std": 0.18365508317947388, "epsilon_dpo/beta_margin_mean": 1.1327235698699951, "epsilon_dpo/beta_margin_std": 1.0738329887390137, "epsilon_dpo/loss_margin_mean": 51.34547424316406, "grad_norm": 43.433895111083984, "kl/avg_steps": 0.71875, "kl/beta": 0.022311890497803688, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.8100330233573914, "logits/rejected": -0.6867516040802002, "logps/chosen": -105.44173431396484, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -183.58560180664062, "loss": 0.7602, "rewards/accuracies": 0.875, "rewards/chosen": -1.0171130895614624, "rewards/margins": 1.1327235698699951, "rewards/rejected": -2.149836540222168, "step": 262 }, { "epoch": 0.38619676945668135, "epsilon_dpo/beta": 0.021988723427057266, "epsilon_dpo/beta_margin_grad_mean": -0.2505612075328827, "epsilon_dpo/beta_margin_grad_std": 0.183075949549675, "epsilon_dpo/beta_margin_mean": 1.4616928100585938, "epsilon_dpo/beta_margin_std": 1.3058973550796509, "epsilon_dpo/loss_margin_mean": 66.67501068115234, "grad_norm": 43.3455696105957, "kl/avg_steps": 0.75, "kl/beta": 0.022152669727802277, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.9041777849197388, "logits/rejected": -0.7808328866958618, "logps/chosen": -99.48373413085938, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -203.59500122070312, "loss": 0.6508, "rewards/accuracies": 0.90625, "rewards/chosen": -0.884239673614502, "rewards/margins": 1.4616928100585938, "rewards/rejected": -2.3459324836730957, "step": 263 }, { "epoch": 0.3876651982378855, "epsilon_dpo/beta": 0.021831907331943512, "epsilon_dpo/beta_margin_grad_mean": -0.2944892644882202, "epsilon_dpo/beta_margin_grad_std": 0.17753368616104126, "epsilon_dpo/beta_margin_mean": 1.1068028211593628, "epsilon_dpo/beta_margin_std": 1.1502090692520142, "epsilon_dpo/loss_margin_mean": 50.89043426513672, "grad_norm": 45.67006301879883, "kl/avg_steps": 0.71875, "kl/beta": 0.021987760439515114, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.8379767537117004, "logits/rejected": -0.7533121109008789, "logps/chosen": -104.73934936523438, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85621643066406, "logps/rejected": -186.763916015625, "loss": 0.7875, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9191652536392212, "rewards/margins": 1.1068028211593628, "rewards/rejected": -2.025968074798584, "step": 264 }, { "epoch": 0.3891336270190896, "epsilon_dpo/beta": 0.021703332662582397, "epsilon_dpo/beta_margin_grad_mean": -0.29720309376716614, "epsilon_dpo/beta_margin_grad_std": 0.18713915348052979, "epsilon_dpo/beta_margin_mean": 1.1945571899414062, "epsilon_dpo/beta_margin_std": 1.3362030982971191, "epsilon_dpo/loss_margin_mean": 55.31159973144531, "grad_norm": 56.90526580810547, "kl/avg_steps": 0.59375, "kl/beta": 0.021830851212143898, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.8070585131645203, "logits/rejected": -0.7428088784217834, "logps/chosen": -110.93099975585938, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -192.29173278808594, "loss": 0.7847, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0657553672790527, "rewards/margins": 1.1945571899414062, "rewards/rejected": -2.260312557220459, "step": 265 }, { "epoch": 0.39060205580029367, "epsilon_dpo/beta": 0.021548166871070862, "epsilon_dpo/beta_margin_grad_mean": -0.28442031145095825, "epsilon_dpo/beta_margin_grad_std": 0.1898299902677536, "epsilon_dpo/beta_margin_mean": 1.2238138914108276, "epsilon_dpo/beta_margin_std": 1.2477452754974365, "epsilon_dpo/loss_margin_mean": 57.00858688354492, "grad_norm": 32.84288787841797, "kl/avg_steps": 0.71875, "kl/beta": 0.02170199528336525, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.8406482338905334, "logits/rejected": -0.6030203104019165, "logps/chosen": -113.43331146240234, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -171.31344604492188, "loss": 0.7567, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9995484352111816, "rewards/margins": 1.2238138914108276, "rewards/rejected": -2.223362445831299, "step": 266 }, { "epoch": 0.3920704845814978, "epsilon_dpo/beta": 0.02138766087591648, "epsilon_dpo/beta_margin_grad_mean": -0.2644921839237213, "epsilon_dpo/beta_margin_grad_std": 0.1655421257019043, "epsilon_dpo/beta_margin_mean": 1.2985275983810425, "epsilon_dpo/beta_margin_std": 1.1735737323760986, "epsilon_dpo/loss_margin_mean": 60.872066497802734, "grad_norm": 36.54864501953125, "kl/avg_steps": 0.75, "kl/beta": 0.02154712565243244, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.823701798915863, "logits/rejected": -0.6632372140884399, "logps/chosen": -117.4482650756836, "logps/ref_chosen": -68.97074890136719, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -199.51803588867188, "loss": 0.6767, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0386977195739746, "rewards/margins": 1.2985275983810425, "rewards/rejected": -2.3372254371643066, "step": 267 }, { "epoch": 0.3935389133627019, "epsilon_dpo/beta": 0.021241815760731697, "epsilon_dpo/beta_margin_grad_mean": -0.2792663872241974, "epsilon_dpo/beta_margin_grad_std": 0.1909855753183365, "epsilon_dpo/beta_margin_mean": 1.2270984649658203, "epsilon_dpo/beta_margin_std": 1.2154557704925537, "epsilon_dpo/loss_margin_mean": 58.02248764038086, "grad_norm": 45.175697326660156, "kl/avg_steps": 0.6875, "kl/beta": 0.021386725828051567, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.6772645711898804, "logits/rejected": -0.6899411678314209, "logps/chosen": -110.12238311767578, "logps/ref_chosen": -55.900306701660156, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -213.89219665527344, "loss": 0.7472, "rewards/accuracies": 0.875, "rewards/chosen": -1.1551294326782227, "rewards/margins": 1.2270984649658203, "rewards/rejected": -2.382227897644043, "step": 268 }, { "epoch": 0.39500734214390604, "epsilon_dpo/beta": 0.02111669071018696, "epsilon_dpo/beta_margin_grad_mean": -0.2359836995601654, "epsilon_dpo/beta_margin_grad_std": 0.20190633833408356, "epsilon_dpo/beta_margin_mean": 1.6167008876800537, "epsilon_dpo/beta_margin_std": 1.4006026983261108, "epsilon_dpo/loss_margin_mean": 76.9288101196289, "grad_norm": 55.38663101196289, "kl/avg_steps": 0.59375, "kl/beta": 0.02124069631099701, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.777718722820282, "logits/rejected": -0.6842841506004333, "logps/chosen": -129.2608642578125, "logps/ref_chosen": -70.03955078125, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -243.49948120117188, "loss": 0.6333, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2531479597091675, "rewards/margins": 1.6167008876800537, "rewards/rejected": -2.8698487281799316, "step": 269 }, { "epoch": 0.3964757709251101, "epsilon_dpo/beta": 0.020985450595617294, "epsilon_dpo/beta_margin_grad_mean": -0.29173266887664795, "epsilon_dpo/beta_margin_grad_std": 0.1945246160030365, "epsilon_dpo/beta_margin_mean": 1.2036516666412354, "epsilon_dpo/beta_margin_std": 1.3258358240127563, "epsilon_dpo/loss_margin_mean": 57.62965393066406, "grad_norm": 39.35811233520508, "kl/avg_steps": 0.625, "kl/beta": 0.021115323528647423, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.7690585851669312, "logits/rejected": -0.7463020086288452, "logps/chosen": -120.31138610839844, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -218.33621215820312, "loss": 0.7883, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0666934251785278, "rewards/margins": 1.2036516666412354, "rewards/rejected": -2.2703449726104736, "step": 270 }, { "epoch": 0.39794419970631423, "epsilon_dpo/beta": 0.020835433155298233, "epsilon_dpo/beta_margin_grad_mean": -0.2704339027404785, "epsilon_dpo/beta_margin_grad_std": 0.1832066774368286, "epsilon_dpo/beta_margin_mean": 1.3792264461517334, "epsilon_dpo/beta_margin_std": 1.3681683540344238, "epsilon_dpo/loss_margin_mean": 66.40867614746094, "grad_norm": 50.04530334472656, "kl/avg_steps": 0.71875, "kl/beta": 0.020984172821044922, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.7107840776443481, "logits/rejected": -0.6057737469673157, "logps/chosen": -116.04220581054688, "logps/ref_chosen": -56.76457214355469, "logps/ref_rejected": -92.51383209228516, "logps/rejected": -218.20013427734375, "loss": 0.702, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2361798286437988, "rewards/margins": 1.3792264461517334, "rewards/rejected": -2.6154062747955322, "step": 271 }, { "epoch": 0.39941262848751835, "epsilon_dpo/beta": 0.020693259313702583, "epsilon_dpo/beta_margin_grad_mean": -0.2399667203426361, "epsilon_dpo/beta_margin_grad_std": 0.2069646120071411, "epsilon_dpo/beta_margin_mean": 1.6370471715927124, "epsilon_dpo/beta_margin_std": 1.55239999294281, "epsilon_dpo/loss_margin_mean": 79.44245147705078, "grad_norm": 41.83186721801758, "kl/avg_steps": 0.6875, "kl/beta": 0.0208344254642725, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.5893428325653076, "logits/rejected": -0.6096173524856567, "logps/chosen": -108.57723999023438, "logps/ref_chosen": -49.49715805053711, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -244.06532287597656, "loss": 0.6611, "rewards/accuracies": 0.84375, "rewards/chosen": -1.223039150238037, "rewards/margins": 1.6370470523834229, "rewards/rejected": -2.860086441040039, "step": 272 }, { "epoch": 0.4008810572687225, "epsilon_dpo/beta": 0.02051962912082672, "epsilon_dpo/beta_margin_grad_mean": -0.23856356739997864, "epsilon_dpo/beta_margin_grad_std": 0.18430212140083313, "epsilon_dpo/beta_margin_mean": 1.6161974668502808, "epsilon_dpo/beta_margin_std": 1.4859600067138672, "epsilon_dpo/loss_margin_mean": 78.91922760009766, "grad_norm": 78.92858123779297, "kl/avg_steps": 0.84375, "kl/beta": 0.020692165940999985, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.6929817199707031, "logits/rejected": -0.5817391872406006, "logps/chosen": -125.79875183105469, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -234.24114990234375, "loss": 0.6271, "rewards/accuracies": 0.9375, "rewards/chosen": -1.290513038635254, "rewards/margins": 1.6161974668502808, "rewards/rejected": -2.906710624694824, "step": 273 }, { "epoch": 0.4023494860499266, "epsilon_dpo/beta": 0.020380007103085518, "epsilon_dpo/beta_margin_grad_mean": -0.2764292061328888, "epsilon_dpo/beta_margin_grad_std": 0.19955939054489136, "epsilon_dpo/beta_margin_mean": 1.309850811958313, "epsilon_dpo/beta_margin_std": 1.3032691478729248, "epsilon_dpo/loss_margin_mean": 64.52693939208984, "grad_norm": 54.08884048461914, "kl/avg_steps": 0.6875, "kl/beta": 0.020519036799669266, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.75e-07, "logits/chosen": -0.6644202470779419, "logits/rejected": -0.5181941390037537, "logps/chosen": -130.21035766601562, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -216.40267944335938, "loss": 0.7387, "rewards/accuracies": 0.828125, "rewards/chosen": -1.521093487739563, "rewards/margins": 1.309850811958313, "rewards/rejected": -2.830944538116455, "step": 274 }, { "epoch": 0.40381791483113066, "epsilon_dpo/beta": 0.020253589376807213, "epsilon_dpo/beta_margin_grad_mean": -0.2785060405731201, "epsilon_dpo/beta_margin_grad_std": 0.20673537254333496, "epsilon_dpo/beta_margin_mean": 1.3525274991989136, "epsilon_dpo/beta_margin_std": 1.4203792810440063, "epsilon_dpo/loss_margin_mean": 67.09994506835938, "grad_norm": 55.116641998291016, "kl/avg_steps": 0.625, "kl/beta": 0.02037893235683441, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.6706832647323608, "logits/rejected": -0.6582698822021484, "logps/chosen": -103.64492797851562, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -215.45387268066406, "loss": 0.7525, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1173958778381348, "rewards/margins": 1.3525274991989136, "rewards/rejected": -2.469923496246338, "step": 275 }, { "epoch": 0.4052863436123348, "epsilon_dpo/beta": 0.020127790048718452, "epsilon_dpo/beta_margin_grad_mean": -0.28192076086997986, "epsilon_dpo/beta_margin_grad_std": 0.1965862363576889, "epsilon_dpo/beta_margin_mean": 1.339874267578125, "epsilon_dpo/beta_margin_std": 1.46302330493927, "epsilon_dpo/loss_margin_mean": 66.88359832763672, "grad_norm": 97.52819061279297, "kl/avg_steps": 0.625, "kl/beta": 0.020252354443073273, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.7014357447624207, "logits/rejected": -0.5417762994766235, "logps/chosen": -118.94365692138672, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -199.61550903320312, "loss": 0.7503, "rewards/accuracies": 0.796875, "rewards/chosen": -1.257150650024414, "rewards/margins": 1.339874267578125, "rewards/rejected": -2.597024917602539, "step": 276 }, { "epoch": 0.4067547723935389, "epsilon_dpo/beta": 0.019971320405602455, "epsilon_dpo/beta_margin_grad_mean": -0.2710755169391632, "epsilon_dpo/beta_margin_grad_std": 0.16731785237789154, "epsilon_dpo/beta_margin_mean": 1.283422589302063, "epsilon_dpo/beta_margin_std": 1.175058364868164, "epsilon_dpo/loss_margin_mean": 64.41889190673828, "grad_norm": 76.95030212402344, "kl/avg_steps": 0.78125, "kl/beta": 0.02012656256556511, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.6967241764068604, "logits/rejected": -0.6158395409584045, "logps/chosen": -115.48424530029297, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -216.51283264160156, "loss": 0.6888, "rewards/accuracies": 0.953125, "rewards/chosen": -1.182989239692688, "rewards/margins": 1.283422589302063, "rewards/rejected": -2.46641206741333, "step": 277 }, { "epoch": 0.40822320117474303, "epsilon_dpo/beta": 0.019841471686959267, "epsilon_dpo/beta_margin_grad_mean": -0.264595627784729, "epsilon_dpo/beta_margin_grad_std": 0.19803930819034576, "epsilon_dpo/beta_margin_mean": 1.5051084756851196, "epsilon_dpo/beta_margin_std": 1.559203863143921, "epsilon_dpo/loss_margin_mean": 76.17341613769531, "grad_norm": 43.44647979736328, "kl/avg_steps": 0.65625, "kl/beta": 0.019970543682575226, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.5835840702056885, "logits/rejected": -0.5952056646347046, "logps/chosen": -113.42312622070312, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -227.90817260742188, "loss": 0.7055, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2011182308197021, "rewards/margins": 1.50510835647583, "rewards/rejected": -2.7062265872955322, "step": 278 }, { "epoch": 0.40969162995594716, "epsilon_dpo/beta": 0.019724512472748756, "epsilon_dpo/beta_margin_grad_mean": -0.25120463967323303, "epsilon_dpo/beta_margin_grad_std": 0.21320270001888275, "epsilon_dpo/beta_margin_mean": 1.616261601448059, "epsilon_dpo/beta_margin_std": 1.5595982074737549, "epsilon_dpo/loss_margin_mean": 82.3366470336914, "grad_norm": 56.63746643066406, "kl/avg_steps": 0.59375, "kl/beta": 0.019840341061353683, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.5176438689231873, "logits/rejected": -0.4667193591594696, "logps/chosen": -122.50505065917969, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -244.05181884765625, "loss": 0.6789, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4587175846099854, "rewards/margins": 1.616261601448059, "rewards/rejected": -3.074979305267334, "step": 279 }, { "epoch": 0.4111600587371512, "epsilon_dpo/beta": 0.01956493966281414, "epsilon_dpo/beta_margin_grad_mean": -0.2162458449602127, "epsilon_dpo/beta_margin_grad_std": 0.16580165922641754, "epsilon_dpo/beta_margin_mean": 1.5945125818252563, "epsilon_dpo/beta_margin_std": 1.1257351636886597, "epsilon_dpo/loss_margin_mean": 81.6865463256836, "grad_norm": 51.763580322265625, "kl/avg_steps": 0.8125, "kl/beta": 0.019723234698176384, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.5518442988395691, "logits/rejected": -0.5683473944664001, "logps/chosen": -132.6591033935547, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -254.16738891601562, "loss": 0.5458, "rewards/accuracies": 0.90625, "rewards/chosen": -1.44656240940094, "rewards/margins": 1.5945125818252563, "rewards/rejected": -3.041074752807617, "step": 280 }, { "epoch": 0.41262848751835535, "epsilon_dpo/beta": 0.01940114051103592, "epsilon_dpo/beta_margin_grad_mean": -0.24567550420761108, "epsilon_dpo/beta_margin_grad_std": 0.19012600183486938, "epsilon_dpo/beta_margin_mean": 1.42535400390625, "epsilon_dpo/beta_margin_std": 1.3284828662872314, "epsilon_dpo/loss_margin_mean": 73.67842102050781, "grad_norm": 64.7247543334961, "kl/avg_steps": 0.84375, "kl/beta": 0.019564274698495865, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.5656956434249878, "logits/rejected": -0.42098718881607056, "logps/chosen": -127.68087768554688, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -226.75335693359375, "loss": 0.6922, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4034972190856934, "rewards/margins": 1.42535400390625, "rewards/rejected": -2.8288512229919434, "step": 281 }, { "epoch": 0.41409691629955947, "epsilon_dpo/beta": 0.01927519217133522, "epsilon_dpo/beta_margin_grad_mean": -0.27404311299324036, "epsilon_dpo/beta_margin_grad_std": 0.16918019950389862, "epsilon_dpo/beta_margin_mean": 1.269365906715393, "epsilon_dpo/beta_margin_std": 1.2346893548965454, "epsilon_dpo/loss_margin_mean": 66.10945892333984, "grad_norm": 50.608116149902344, "kl/avg_steps": 0.65625, "kl/beta": 0.01940058171749115, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.6203180551528931, "logits/rejected": -0.36443668603897095, "logps/chosen": -152.64346313476562, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -241.0337677001953, "loss": 0.701, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5348063707351685, "rewards/margins": 1.2693657875061035, "rewards/rejected": -2.8041722774505615, "step": 282 }, { "epoch": 0.4155653450807636, "epsilon_dpo/beta": 0.019131455570459366, "epsilon_dpo/beta_margin_grad_mean": -0.24582470953464508, "epsilon_dpo/beta_margin_grad_std": 0.20526982843875885, "epsilon_dpo/beta_margin_mean": 1.57723867893219, "epsilon_dpo/beta_margin_std": 1.5101546049118042, "epsilon_dpo/loss_margin_mean": 82.75170135498047, "grad_norm": 56.80989074707031, "kl/avg_steps": 0.75, "kl/beta": 0.019274096935987473, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.52199387550354, "logits/rejected": -0.4504436254501343, "logps/chosen": -145.3206787109375, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -263.1604919433594, "loss": 0.6792, "rewards/accuracies": 0.875, "rewards/chosen": -1.5973310470581055, "rewards/margins": 1.5772387981414795, "rewards/rejected": -3.174569845199585, "step": 283 }, { "epoch": 0.4170337738619677, "epsilon_dpo/beta": 0.018989035859704018, "epsilon_dpo/beta_margin_grad_mean": -0.24250973761081696, "epsilon_dpo/beta_margin_grad_std": 0.1711161881685257, "epsilon_dpo/beta_margin_mean": 1.589562177658081, "epsilon_dpo/beta_margin_std": 1.5010643005371094, "epsilon_dpo/loss_margin_mean": 83.95030975341797, "grad_norm": 54.7606315612793, "kl/avg_steps": 0.75, "kl/beta": 0.019130617380142212, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.46351319551467896, "logits/rejected": -0.44188055396080017, "logps/chosen": -143.29278564453125, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -262.27685546875, "loss": 0.6139, "rewards/accuracies": 0.890625, "rewards/chosen": -1.6118556261062622, "rewards/margins": 1.589562177658081, "rewards/rejected": -3.2014176845550537, "step": 284 }, { "epoch": 0.4185022026431718, "epsilon_dpo/beta": 0.01885361224412918, "epsilon_dpo/beta_margin_grad_mean": -0.25107693672180176, "epsilon_dpo/beta_margin_grad_std": 0.18937461078166962, "epsilon_dpo/beta_margin_mean": 1.436224341392517, "epsilon_dpo/beta_margin_std": 1.289421796798706, "epsilon_dpo/loss_margin_mean": 76.44730377197266, "grad_norm": 71.8442153930664, "kl/avg_steps": 0.71875, "kl/beta": 0.01898820511996746, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.5949057340621948, "logits/rejected": -0.5442003607749939, "logps/chosen": -150.2645263671875, "logps/ref_chosen": -66.2322006225586, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -259.6064453125, "loss": 0.6651, "rewards/accuracies": 0.875, "rewards/chosen": -1.5869226455688477, "rewards/margins": 1.4362244606018066, "rewards/rejected": -3.023146867752075, "step": 285 }, { "epoch": 0.4199706314243759, "epsilon_dpo/beta": 0.018724961206316948, "epsilon_dpo/beta_margin_grad_mean": -0.24600212275981903, "epsilon_dpo/beta_margin_grad_std": 0.19994834065437317, "epsilon_dpo/beta_margin_mean": 1.5680627822875977, "epsilon_dpo/beta_margin_std": 1.476419448852539, "epsilon_dpo/loss_margin_mean": 84.06401824951172, "grad_norm": 45.75380325317383, "kl/avg_steps": 0.6875, "kl/beta": 0.018852701410651207, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.6205496788024902, "logits/rejected": -0.4964269995689392, "logps/chosen": -150.214599609375, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -249.91607666015625, "loss": 0.6691, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4485076665878296, "rewards/margins": 1.5680627822875977, "rewards/rejected": -3.016570568084717, "step": 286 }, { "epoch": 0.42143906020558003, "epsilon_dpo/beta": 0.018591254949569702, "epsilon_dpo/beta_margin_grad_mean": -0.2821826934814453, "epsilon_dpo/beta_margin_grad_std": 0.212895929813385, "epsilon_dpo/beta_margin_mean": 1.3162546157836914, "epsilon_dpo/beta_margin_std": 1.5128830671310425, "epsilon_dpo/loss_margin_mean": 71.11557006835938, "grad_norm": 51.563961029052734, "kl/avg_steps": 0.71875, "kl/beta": 0.018723974004387856, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.5361831188201904, "logits/rejected": -0.3548717200756073, "logps/chosen": -140.733154296875, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.6960678100586, "logps/rejected": -228.00364685058594, "loss": 0.7981, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4748058319091797, "rewards/margins": 1.3162546157836914, "rewards/rejected": -2.791060447692871, "step": 287 }, { "epoch": 0.42290748898678415, "epsilon_dpo/beta": 0.018441151827573776, "epsilon_dpo/beta_margin_grad_mean": -0.2382936030626297, "epsilon_dpo/beta_margin_grad_std": 0.18369296193122864, "epsilon_dpo/beta_margin_mean": 1.5130971670150757, "epsilon_dpo/beta_margin_std": 1.265858769416809, "epsilon_dpo/loss_margin_mean": 82.2579116821289, "grad_norm": 72.87432098388672, "kl/avg_steps": 0.8125, "kl/beta": 0.018590355291962624, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.5659801959991455, "logits/rejected": -0.4238738715648651, "logps/chosen": -135.8625030517578, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.335693359375, "logps/rejected": -248.79489135742188, "loss": 0.6233, "rewards/accuracies": 0.921875, "rewards/chosen": -1.462754726409912, "rewards/margins": 1.5130971670150757, "rewards/rejected": -2.9758520126342773, "step": 288 }, { "epoch": 0.4243759177679883, "epsilon_dpo/beta": 0.018321340903639793, "epsilon_dpo/beta_margin_grad_mean": -0.25129714608192444, "epsilon_dpo/beta_margin_grad_std": 0.19622960686683655, "epsilon_dpo/beta_margin_mean": 1.6442151069641113, "epsilon_dpo/beta_margin_std": 1.6481398344039917, "epsilon_dpo/loss_margin_mean": 90.0957260131836, "grad_norm": 43.197750091552734, "kl/avg_steps": 0.65625, "kl/beta": 0.018440525978803635, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.4051462411880493, "logits/rejected": -0.3806743025779724, "logps/chosen": -127.05389404296875, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -259.5618896484375, "loss": 0.66, "rewards/accuracies": 0.875, "rewards/chosen": -1.500335693359375, "rewards/margins": 1.6442151069641113, "rewards/rejected": -3.1445508003234863, "step": 289 }, { "epoch": 0.42584434654919234, "epsilon_dpo/beta": 0.018167538568377495, "epsilon_dpo/beta_margin_grad_mean": -0.23720751702785492, "epsilon_dpo/beta_margin_grad_std": 0.19694750010967255, "epsilon_dpo/beta_margin_mean": 1.7543230056762695, "epsilon_dpo/beta_margin_std": 1.7301348447799683, "epsilon_dpo/loss_margin_mean": 96.78536987304688, "grad_norm": 60.19779968261719, "kl/avg_steps": 0.84375, "kl/beta": 0.018320299685001373, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.41647857427597046, "logits/rejected": -0.5270963907241821, "logps/chosen": -145.60018920898438, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -303.62261962890625, "loss": 0.6338, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6394047737121582, "rewards/margins": 1.7543230056762695, "rewards/rejected": -3.3937277793884277, "step": 290 }, { "epoch": 0.42731277533039647, "epsilon_dpo/beta": 0.018032565712928772, "epsilon_dpo/beta_margin_grad_mean": -0.24202531576156616, "epsilon_dpo/beta_margin_grad_std": 0.19435711205005646, "epsilon_dpo/beta_margin_mean": 1.5499616861343384, "epsilon_dpo/beta_margin_std": 1.3567811250686646, "epsilon_dpo/loss_margin_mean": 86.23190307617188, "grad_norm": 49.096588134765625, "kl/avg_steps": 0.75, "kl/beta": 0.01816701516509056, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.45129257440567017, "logits/rejected": -0.40018701553344727, "logps/chosen": -137.8589630126953, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -250.12759399414062, "loss": 0.6417, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3472115993499756, "rewards/margins": 1.549961805343628, "rewards/rejected": -2.8971734046936035, "step": 291 }, { "epoch": 0.4287812041116006, "epsilon_dpo/beta": 0.017903964966535568, "epsilon_dpo/beta_margin_grad_mean": -0.260786235332489, "epsilon_dpo/beta_margin_grad_std": 0.21484361588954926, "epsilon_dpo/beta_margin_mean": 1.5039082765579224, "epsilon_dpo/beta_margin_std": 1.5898772478103638, "epsilon_dpo/loss_margin_mean": 84.35847473144531, "grad_norm": 136.08177185058594, "kl/avg_steps": 0.71875, "kl/beta": 0.018031777814030647, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.36719846725463867, "logits/rejected": -0.2681925892829895, "logps/chosen": -153.21641540527344, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -273.61773681640625, "loss": 0.7375, "rewards/accuracies": 0.859375, "rewards/chosen": -1.7816226482391357, "rewards/margins": 1.5039082765579224, "rewards/rejected": -3.2855310440063477, "step": 292 }, { "epoch": 0.4302496328928047, "epsilon_dpo/beta": 0.01780417375266552, "epsilon_dpo/beta_margin_grad_mean": -0.2954978048801422, "epsilon_dpo/beta_margin_grad_std": 0.22622421383857727, "epsilon_dpo/beta_margin_mean": 1.2976562976837158, "epsilon_dpo/beta_margin_std": 1.5245400667190552, "epsilon_dpo/loss_margin_mean": 73.33763885498047, "grad_norm": 65.80419158935547, "kl/avg_steps": 0.5625, "kl/beta": 0.017903098836541176, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.286105751991272, "logits/rejected": -0.1886579692363739, "logps/chosen": -152.51913452148438, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.8524398803711, "logps/rejected": -243.33319091796875, "loss": 0.834, "rewards/accuracies": 0.796875, "rewards/chosen": -1.6426957845687866, "rewards/margins": 1.2976562976837158, "rewards/rejected": -2.940351963043213, "step": 293 }, { "epoch": 0.43171806167400884, "epsilon_dpo/beta": 0.01769345812499523, "epsilon_dpo/beta_margin_grad_mean": -0.27741488814353943, "epsilon_dpo/beta_margin_grad_std": 0.20217780768871307, "epsilon_dpo/beta_margin_mean": 1.4067116975784302, "epsilon_dpo/beta_margin_std": 1.5747895240783691, "epsilon_dpo/loss_margin_mean": 79.890869140625, "grad_norm": 62.94172286987305, "kl/avg_steps": 0.625, "kl/beta": 0.017802957445383072, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.23500093817710876, "logits/rejected": -0.21314044296741486, "logps/chosen": -135.5957794189453, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -249.2960968017578, "loss": 0.7448, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5494587421417236, "rewards/margins": 1.4067118167877197, "rewards/rejected": -2.9561705589294434, "step": 294 }, { "epoch": 0.4331864904552129, "epsilon_dpo/beta": 0.01757803000509739, "epsilon_dpo/beta_margin_grad_mean": -0.27356937527656555, "epsilon_dpo/beta_margin_grad_std": 0.20483291149139404, "epsilon_dpo/beta_margin_mean": 1.3722447156906128, "epsilon_dpo/beta_margin_std": 1.434361219406128, "epsilon_dpo/loss_margin_mean": 78.42586517333984, "grad_norm": 83.32816314697266, "kl/avg_steps": 0.65625, "kl/beta": 0.017692379653453827, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.19670921564102173, "logits/rejected": -0.21700705587863922, "logps/chosen": -150.30224609375, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -266.259765625, "loss": 0.741, "rewards/accuracies": 0.828125, "rewards/chosen": -1.7668020725250244, "rewards/margins": 1.3722447156906128, "rewards/rejected": -3.1390466690063477, "step": 295 }, { "epoch": 0.434654919236417, "epsilon_dpo/beta": 0.017485400661826134, "epsilon_dpo/beta_margin_grad_mean": -0.3116765022277832, "epsilon_dpo/beta_margin_grad_std": 0.2334553748369217, "epsilon_dpo/beta_margin_mean": 1.1315889358520508, "epsilon_dpo/beta_margin_std": 1.5379483699798584, "epsilon_dpo/loss_margin_mean": 65.20050048828125, "grad_norm": 100.70362854003906, "kl/avg_steps": 0.53125, "kl/beta": 0.017577029764652252, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.46437016129493713, "logits/rejected": -0.2735271751880646, "logps/chosen": -166.9285888671875, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -259.7269287109375, "loss": 0.9407, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7783277034759521, "rewards/margins": 1.1315889358520508, "rewards/rejected": -2.909916639328003, "step": 296 }, { "epoch": 0.43612334801762115, "epsilon_dpo/beta": 0.017354749143123627, "epsilon_dpo/beta_margin_grad_mean": -0.2685171365737915, "epsilon_dpo/beta_margin_grad_std": 0.18320339918136597, "epsilon_dpo/beta_margin_mean": 1.2928276062011719, "epsilon_dpo/beta_margin_std": 1.2205746173858643, "epsilon_dpo/loss_margin_mean": 74.73949432373047, "grad_norm": 55.77240753173828, "kl/avg_steps": 0.75, "kl/beta": 0.017484145238995552, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.3529755473136902, "logits/rejected": -0.2527937591075897, "logps/chosen": -151.24240112304688, "logps/ref_chosen": -56.47694778442383, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -264.64349365234375, "loss": 0.7113, "rewards/accuracies": 0.890625, "rewards/chosen": -1.6452429294586182, "rewards/margins": 1.2928276062011719, "rewards/rejected": -2.93807053565979, "step": 297 }, { "epoch": 0.43759177679882527, "epsilon_dpo/beta": 0.0172418300062418, "epsilon_dpo/beta_margin_grad_mean": -0.2541157007217407, "epsilon_dpo/beta_margin_grad_std": 0.18940328061580658, "epsilon_dpo/beta_margin_mean": 1.5506335496902466, "epsilon_dpo/beta_margin_std": 1.5275505781173706, "epsilon_dpo/loss_margin_mean": 90.27720642089844, "grad_norm": 48.75471878051758, "kl/avg_steps": 0.65625, "kl/beta": 0.01735399104654789, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.37584325671195984, "logits/rejected": -0.48060327768325806, "logps/chosen": -154.344970703125, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -293.95916748046875, "loss": 0.6597, "rewards/accuracies": 0.875, "rewards/chosen": -1.5013866424560547, "rewards/margins": 1.5506335496902466, "rewards/rejected": -3.0520200729370117, "step": 298 }, { "epoch": 0.4390602055800294, "epsilon_dpo/beta": 0.01714019477367401, "epsilon_dpo/beta_margin_grad_mean": -0.2694842517375946, "epsilon_dpo/beta_margin_grad_std": 0.22855404019355774, "epsilon_dpo/beta_margin_mean": 1.5130068063735962, "epsilon_dpo/beta_margin_std": 1.702101469039917, "epsilon_dpo/loss_margin_mean": 88.78156280517578, "grad_norm": 57.401187896728516, "kl/avg_steps": 0.59375, "kl/beta": 0.017240848392248154, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.24702782928943634, "logits/rejected": -0.19395104050636292, "logps/chosen": -130.8524627685547, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -255.00015258789062, "loss": 0.7716, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4078408479690552, "rewards/margins": 1.5130068063735962, "rewards/rejected": -2.9208476543426514, "step": 299 }, { "epoch": 0.44052863436123346, "epsilon_dpo/beta": 0.017025606706738472, "epsilon_dpo/beta_margin_grad_mean": -0.25651147961616516, "epsilon_dpo/beta_margin_grad_std": 0.24618221819400787, "epsilon_dpo/beta_margin_mean": 1.6073572635650635, "epsilon_dpo/beta_margin_std": 1.7735728025436401, "epsilon_dpo/loss_margin_mean": 94.92350769042969, "grad_norm": 118.19168853759766, "kl/avg_steps": 0.671875, "kl/beta": 0.01713908463716507, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.23180323839187622, "logits/rejected": -0.23513799905776978, "logps/chosen": -168.39486694335938, "logps/ref_chosen": -59.073707580566406, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -300.21112060546875, "loss": 0.7955, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8646724224090576, "rewards/margins": 1.6073572635650635, "rewards/rejected": -3.472029685974121, "step": 300 }, { "epoch": 0.44052863436123346, "eval_epsilon_dpo/beta": 0.016948556527495384, "eval_epsilon_dpo/beta_margin_grad_mean": -0.34138724207878113, "eval_epsilon_dpo/beta_margin_grad_std": 0.21339240670204163, "eval_epsilon_dpo/beta_margin_mean": 0.9211956858634949, "eval_epsilon_dpo/beta_margin_std": 1.3268096446990967, "eval_epsilon_dpo/loss_margin_mean": 54.82255172729492, "eval_kl/n_epsilon_steps": 0.2709760367870331, "eval_kl/p_epsilon_steps": 0.7281678318977356, "eval_logits/chosen": -0.4338293671607971, "eval_logits/rejected": -0.29129528999328613, "eval_logps/chosen": -180.23182678222656, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -242.80128479003906, "eval_loss": 0.4928291141986847, "eval_rewards/accuracies": 0.7452911138534546, "eval_rewards/chosen": -1.717902421951294, "eval_rewards/margins": 0.9211956858634949, "eval_rewards/rejected": -2.6390981674194336, "eval_runtime": 43.1421, "eval_samples_per_second": 54.216, "eval_steps_per_second": 1.715, "step": 300 }, { "epoch": 0.4419970631424376, "epsilon_dpo/beta": 0.016898702830076218, "epsilon_dpo/beta_margin_grad_mean": -0.23044808208942413, "epsilon_dpo/beta_margin_grad_std": 0.18402184545993805, "epsilon_dpo/beta_margin_mean": 1.726190447807312, "epsilon_dpo/beta_margin_std": 1.530474305152893, "epsilon_dpo/loss_margin_mean": 102.42485809326172, "grad_norm": 57.23482131958008, "kl/avg_steps": 0.75, "kl/beta": 0.017024699598550797, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.38795414566993713, "logits/rejected": -0.3201250433921814, "logps/chosen": -149.53274536132812, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -287.0617980957031, "loss": 0.5917, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5618090629577637, "rewards/margins": 1.726190447807312, "rewards/rejected": -3.2879996299743652, "step": 301 }, { "epoch": 0.4434654919236417, "epsilon_dpo/beta": 0.016788750886917114, "epsilon_dpo/beta_margin_grad_mean": -0.2679104208946228, "epsilon_dpo/beta_margin_grad_std": 0.20277279615402222, "epsilon_dpo/beta_margin_mean": 1.4023065567016602, "epsilon_dpo/beta_margin_std": 1.4703813791275024, "epsilon_dpo/loss_margin_mean": 83.92284393310547, "grad_norm": 46.08768844604492, "kl/avg_steps": 0.65625, "kl/beta": 0.016897965222597122, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.31420189142227173, "logits/rejected": -0.3468834459781647, "logps/chosen": -131.73231506347656, "logps/ref_chosen": -51.19799041748047, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -261.68353271484375, "loss": 0.7302, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3534595966339111, "rewards/margins": 1.4023065567016602, "rewards/rejected": -2.7557661533355713, "step": 302 }, { "epoch": 0.44493392070484583, "epsilon_dpo/beta": 0.016695033758878708, "epsilon_dpo/beta_margin_grad_mean": -0.28386080265045166, "epsilon_dpo/beta_margin_grad_std": 0.21949513256549835, "epsilon_dpo/beta_margin_mean": 1.410112738609314, "epsilon_dpo/beta_margin_std": 1.6264557838439941, "epsilon_dpo/loss_margin_mean": 84.96031188964844, "grad_norm": 65.71988677978516, "kl/avg_steps": 0.5625, "kl/beta": 0.01678779534995556, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.4323018193244934, "logits/rejected": -0.23482966423034668, "logps/chosen": -156.1943359375, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -261.3861389160156, "loss": 0.7884, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4964756965637207, "rewards/margins": 1.4101126194000244, "rewards/rejected": -2.906588554382324, "step": 303 }, { "epoch": 0.44640234948604995, "epsilon_dpo/beta": 0.01657034456729889, "epsilon_dpo/beta_margin_grad_mean": -0.24798835813999176, "epsilon_dpo/beta_margin_grad_std": 0.18337440490722656, "epsilon_dpo/beta_margin_mean": 1.3948038816452026, "epsilon_dpo/beta_margin_std": 1.129565954208374, "epsilon_dpo/loss_margin_mean": 84.44613647460938, "grad_norm": 53.74188232421875, "kl/avg_steps": 0.75, "kl/beta": 0.01669389195740223, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.4842193126678467, "logits/rejected": -0.3841709494590759, "logps/chosen": -154.2616729736328, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -257.2291259765625, "loss": 0.6452, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3671255111694336, "rewards/margins": 1.394803762435913, "rewards/rejected": -2.7619292736053467, "step": 304 }, { "epoch": 0.447870778267254, "epsilon_dpo/beta": 0.01645217090845108, "epsilon_dpo/beta_margin_grad_mean": -0.2687958776950836, "epsilon_dpo/beta_margin_grad_std": 0.18299052119255066, "epsilon_dpo/beta_margin_mean": 1.354810118675232, "epsilon_dpo/beta_margin_std": 1.3857295513153076, "epsilon_dpo/loss_margin_mean": 82.63323974609375, "grad_norm": 55.70637130737305, "kl/avg_steps": 0.71875, "kl/beta": 0.01656961999833584, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.39701157808303833, "logits/rejected": -0.2356944978237152, "logps/chosen": -158.01931762695312, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -266.6119384765625, "loss": 0.7054, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5025837421417236, "rewards/margins": 1.354810118675232, "rewards/rejected": -2.857393741607666, "step": 305 }, { "epoch": 0.44933920704845814, "epsilon_dpo/beta": 0.016334764659404755, "epsilon_dpo/beta_margin_grad_mean": -0.2529657781124115, "epsilon_dpo/beta_margin_grad_std": 0.19209551811218262, "epsilon_dpo/beta_margin_mean": 1.4291316270828247, "epsilon_dpo/beta_margin_std": 1.3057225942611694, "epsilon_dpo/loss_margin_mean": 87.81014251708984, "grad_norm": 55.52931594848633, "kl/avg_steps": 0.71875, "kl/beta": 0.01645137555897236, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.43966060876846313, "logits/rejected": -0.21656040847301483, "logps/chosen": -162.42086791992188, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -265.706298828125, "loss": 0.6797, "rewards/accuracies": 0.875, "rewards/chosen": -1.5165860652923584, "rewards/margins": 1.4291316270828247, "rewards/rejected": -2.9457178115844727, "step": 306 }, { "epoch": 0.45080763582966227, "epsilon_dpo/beta": 0.016228405758738518, "epsilon_dpo/beta_margin_grad_mean": -0.2852145731449127, "epsilon_dpo/beta_margin_grad_std": 0.2094935029745102, "epsilon_dpo/beta_margin_mean": 1.2783092260360718, "epsilon_dpo/beta_margin_std": 1.412838101387024, "epsilon_dpo/loss_margin_mean": 79.1700439453125, "grad_norm": 48.41770935058594, "kl/avg_steps": 0.65625, "kl/beta": 0.01633397489786148, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.3732296824455261, "logits/rejected": -0.22560788691043854, "logps/chosen": -162.98159790039062, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -258.95849609375, "loss": 0.7878, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5072572231292725, "rewards/margins": 1.2783092260360718, "rewards/rejected": -2.785566568374634, "step": 307 }, { "epoch": 0.4522760646108664, "epsilon_dpo/beta": 0.016102313995361328, "epsilon_dpo/beta_margin_grad_mean": -0.24816231429576874, "epsilon_dpo/beta_margin_grad_std": 0.19829751551151276, "epsilon_dpo/beta_margin_mean": 1.508320927619934, "epsilon_dpo/beta_margin_std": 1.43063223361969, "epsilon_dpo/loss_margin_mean": 93.97611999511719, "grad_norm": 59.59919357299805, "kl/avg_steps": 0.78125, "kl/beta": 0.0162274818867445, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.1325426995754242, "logits/rejected": -0.00846935249865055, "logps/chosen": -157.41693115234375, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -275.52032470703125, "loss": 0.6795, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6461052894592285, "rewards/margins": 1.5083208084106445, "rewards/rejected": -3.154426097869873, "step": 308 }, { "epoch": 0.45374449339207046, "epsilon_dpo/beta": 0.015982523560523987, "epsilon_dpo/beta_margin_grad_mean": -0.2601964771747589, "epsilon_dpo/beta_margin_grad_std": 0.1988908052444458, "epsilon_dpo/beta_margin_mean": 1.4376085996627808, "epsilon_dpo/beta_margin_std": 1.40444016456604, "epsilon_dpo/loss_margin_mean": 90.27399444580078, "grad_norm": 70.78994750976562, "kl/avg_steps": 0.75, "kl/beta": 0.016101688146591187, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.10324697196483612, "logits/rejected": 0.07860840111970901, "logps/chosen": -158.23501586914062, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -281.8906555175781, "loss": 0.7035, "rewards/accuracies": 0.875, "rewards/chosen": -1.7477052211761475, "rewards/margins": 1.4376085996627808, "rewards/rejected": -3.1853137016296387, "step": 309 }, { "epoch": 0.4552129221732746, "epsilon_dpo/beta": 0.015853555873036385, "epsilon_dpo/beta_margin_grad_mean": -0.2507603168487549, "epsilon_dpo/beta_margin_grad_std": 0.16641587018966675, "epsilon_dpo/beta_margin_mean": 1.3794183731079102, "epsilon_dpo/beta_margin_std": 1.1298142671585083, "epsilon_dpo/loss_margin_mean": 87.19569396972656, "grad_norm": 95.67975616455078, "kl/avg_steps": 0.8125, "kl/beta": 0.015981823205947876, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.17005358636379242, "logits/rejected": 0.10568149387836456, "logps/chosen": -175.5594940185547, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -279.9437255859375, "loss": 0.6344, "rewards/accuracies": 0.90625, "rewards/chosen": -1.7901716232299805, "rewards/margins": 1.3794183731079102, "rewards/rejected": -3.1695899963378906, "step": 310 }, { "epoch": 0.4566813509544787, "epsilon_dpo/beta": 0.015755511820316315, "epsilon_dpo/beta_margin_grad_mean": -0.24761131405830383, "epsilon_dpo/beta_margin_grad_std": 0.20032590627670288, "epsilon_dpo/beta_margin_mean": 1.613990068435669, "epsilon_dpo/beta_margin_std": 1.5465534925460815, "epsilon_dpo/loss_margin_mean": 102.88874816894531, "grad_norm": 72.37922668457031, "kl/avg_steps": 0.625, "kl/beta": 0.015853017568588257, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.15288975834846497, "logits/rejected": 0.19276544451713562, "logps/chosen": -178.68092346191406, "logps/ref_chosen": -53.027976989746094, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -305.9798889160156, "loss": 0.6546, "rewards/accuracies": 0.859375, "rewards/chosen": -1.9825176000595093, "rewards/margins": 1.613990068435669, "rewards/rejected": -3.5965075492858887, "step": 311 }, { "epoch": 0.4581497797356828, "epsilon_dpo/beta": 0.015633033588528633, "epsilon_dpo/beta_margin_grad_mean": -0.25820305943489075, "epsilon_dpo/beta_margin_grad_std": 0.19453281164169312, "epsilon_dpo/beta_margin_mean": 1.4380865097045898, "epsilon_dpo/beta_margin_std": 1.4255266189575195, "epsilon_dpo/loss_margin_mean": 92.2966537475586, "grad_norm": 58.559967041015625, "kl/avg_steps": 0.78125, "kl/beta": 0.01575455255806446, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.07131820917129517, "logits/rejected": 0.3034754693508148, "logps/chosen": -160.26889038085938, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280090332031, "logps/rejected": -278.96612548828125, "loss": 0.6956, "rewards/accuracies": 0.875, "rewards/chosen": -1.7354340553283691, "rewards/margins": 1.4380865097045898, "rewards/rejected": -3.173520565032959, "step": 312 }, { "epoch": 0.45961820851688695, "epsilon_dpo/beta": 0.015528921969234943, "epsilon_dpo/beta_margin_grad_mean": -0.2961232364177704, "epsilon_dpo/beta_margin_grad_std": 0.21177497506141663, "epsilon_dpo/beta_margin_mean": 1.2246079444885254, "epsilon_dpo/beta_margin_std": 1.4380840063095093, "epsilon_dpo/loss_margin_mean": 79.250732421875, "grad_norm": 53.46128463745117, "kl/avg_steps": 0.671875, "kl/beta": 0.015632424503564835, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.18855774402618408, "logits/rejected": -0.04356323182582855, "logps/chosen": -143.72442626953125, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -259.2286071777344, "loss": 0.8263, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4555275440216064, "rewards/margins": 1.2246079444885254, "rewards/rejected": -2.6801352500915527, "step": 313 }, { "epoch": 0.461086637298091, "epsilon_dpo/beta": 0.015442293137311935, "epsilon_dpo/beta_margin_grad_mean": -0.29345953464508057, "epsilon_dpo/beta_margin_grad_std": 0.21309317648410797, "epsilon_dpo/beta_margin_mean": 1.2201604843139648, "epsilon_dpo/beta_margin_std": 1.3797729015350342, "epsilon_dpo/loss_margin_mean": 79.48578643798828, "grad_norm": 105.91136932373047, "kl/avg_steps": 0.5625, "kl/beta": 0.015528094954788685, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.2723826467990875, "logits/rejected": -0.20200121402740479, "logps/chosen": -151.65988159179688, "logps/ref_chosen": -57.23758316040039, "logps/ref_rejected": -97.59652709960938, "logps/rejected": -271.504638671875, "loss": 0.8108, "rewards/accuracies": 0.765625, "rewards/chosen": -1.4609575271606445, "rewards/margins": 1.2201604843139648, "rewards/rejected": -2.6811180114746094, "step": 314 }, { "epoch": 0.46255506607929514, "epsilon_dpo/beta": 0.015331787057220936, "epsilon_dpo/beta_margin_grad_mean": -0.29342058300971985, "epsilon_dpo/beta_margin_grad_std": 0.1769658774137497, "epsilon_dpo/beta_margin_mean": 1.1589797735214233, "epsilon_dpo/beta_margin_std": 1.2406238317489624, "epsilon_dpo/loss_margin_mean": 75.88170623779297, "grad_norm": 55.90388870239258, "kl/avg_steps": 0.71875, "kl/beta": 0.015441237948834896, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.08375975489616394, "logits/rejected": 0.001368771307170391, "logps/chosen": -142.81259155273438, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -255.30560302734375, "loss": 0.7708, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4388582706451416, "rewards/margins": 1.1589796543121338, "rewards/rejected": -2.5978379249572754, "step": 315 }, { "epoch": 0.46402349486049926, "epsilon_dpo/beta": 0.015217584557831287, "epsilon_dpo/beta_margin_grad_mean": -0.2518807351589203, "epsilon_dpo/beta_margin_grad_std": 0.17856837809085846, "epsilon_dpo/beta_margin_mean": 1.4667308330535889, "epsilon_dpo/beta_margin_std": 1.3366727828979492, "epsilon_dpo/loss_margin_mean": 96.67438507080078, "grad_norm": 46.615665435791016, "kl/avg_steps": 0.75, "kl/beta": 0.015331045724451542, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.24756154417991638, "logits/rejected": -0.2685829997062683, "logps/chosen": -144.43292236328125, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -288.1280517578125, "loss": 0.6472, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3722679615020752, "rewards/margins": 1.4667308330535889, "rewards/rejected": -2.838998794555664, "step": 316 }, { "epoch": 0.4654919236417034, "epsilon_dpo/beta": 0.015109057538211346, "epsilon_dpo/beta_margin_grad_mean": -0.22824791073799133, "epsilon_dpo/beta_margin_grad_std": 0.17922361195087433, "epsilon_dpo/beta_margin_mean": 1.5885720252990723, "epsilon_dpo/beta_margin_std": 1.2415167093276978, "epsilon_dpo/loss_margin_mean": 105.47412109375, "grad_norm": 79.0168685913086, "kl/avg_steps": 0.71875, "kl/beta": 0.015216918662190437, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.35903796553611755, "logits/rejected": -0.09899880737066269, "logps/chosen": -140.70082092285156, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -290.77978515625, "loss": 0.5824, "rewards/accuracies": 0.875, "rewards/chosen": -1.279386281967163, "rewards/margins": 1.5885720252990723, "rewards/rejected": -2.8679583072662354, "step": 317 }, { "epoch": 0.4669603524229075, "epsilon_dpo/beta": 0.014991792850196362, "epsilon_dpo/beta_margin_grad_mean": -0.24415971338748932, "epsilon_dpo/beta_margin_grad_std": 0.1679057478904724, "epsilon_dpo/beta_margin_mean": 1.4487322568893433, "epsilon_dpo/beta_margin_std": 1.1893333196640015, "epsilon_dpo/loss_margin_mean": 96.87982177734375, "grad_norm": 78.77783203125, "kl/avg_steps": 0.78125, "kl/beta": 0.015108327381312847, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.48737025260925293, "logits/rejected": -0.21573328971862793, "logps/chosen": -144.57675170898438, "logps/ref_chosen": -64.64570617675781, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -259.57513427734375, "loss": 0.6172, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1990365982055664, "rewards/margins": 1.4487323760986328, "rewards/rejected": -2.647768974304199, "step": 318 }, { "epoch": 0.4684287812041116, "epsilon_dpo/beta": 0.014866207726299763, "epsilon_dpo/beta_margin_grad_mean": -0.24987944960594177, "epsilon_dpo/beta_margin_grad_std": 0.1708785742521286, "epsilon_dpo/beta_margin_mean": 1.363257646560669, "epsilon_dpo/beta_margin_std": 1.1079721450805664, "epsilon_dpo/loss_margin_mean": 91.90858459472656, "grad_norm": 73.16886901855469, "kl/avg_steps": 0.84375, "kl/beta": 0.014991208910942078, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.29700884222984314, "logits/rejected": -0.34763428568840027, "logps/chosen": -131.5271759033203, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -287.95849609375, "loss": 0.6412, "rewards/accuracies": 0.921875, "rewards/chosen": -1.221997857093811, "rewards/margins": 1.363257646560669, "rewards/rejected": -2.5852556228637695, "step": 319 }, { "epoch": 0.4698972099853157, "epsilon_dpo/beta": 0.014751114882528782, "epsilon_dpo/beta_margin_grad_mean": -0.24163083732128143, "epsilon_dpo/beta_margin_grad_std": 0.17252741754055023, "epsilon_dpo/beta_margin_mean": 1.5045992136001587, "epsilon_dpo/beta_margin_std": 1.272070288658142, "epsilon_dpo/loss_margin_mean": 102.26066589355469, "grad_norm": 53.9862174987793, "kl/avg_steps": 0.78125, "kl/beta": 0.01486577931791544, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.27596646547317505, "logits/rejected": -0.23841209709644318, "logps/chosen": -157.13699340820312, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66716766357422, "logps/rejected": -297.5599365234375, "loss": 0.6149, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4394325017929077, "rewards/margins": 1.5045990943908691, "rewards/rejected": -2.9440317153930664, "step": 320 }, { "epoch": 0.4713656387665198, "epsilon_dpo/beta": 0.014662097208201885, "epsilon_dpo/beta_margin_grad_mean": -0.28940823674201965, "epsilon_dpo/beta_margin_grad_std": 0.21540525555610657, "epsilon_dpo/beta_margin_mean": 1.2925175428390503, "epsilon_dpo/beta_margin_std": 1.4737564325332642, "epsilon_dpo/loss_margin_mean": 88.63056945800781, "grad_norm": 88.17047882080078, "kl/avg_steps": 0.609375, "kl/beta": 0.014750540256500244, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.08846712112426758, "logits/rejected": 0.12448375672101974, "logps/chosen": -195.017578125, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -313.7405090332031, "loss": 0.8002, "rewards/accuracies": 0.828125, "rewards/chosen": -1.9587950706481934, "rewards/margins": 1.2925175428390503, "rewards/rejected": -3.251312732696533, "step": 321 }, { "epoch": 0.47283406754772395, "epsilon_dpo/beta": 0.014557276852428913, "epsilon_dpo/beta_margin_grad_mean": -0.2569010853767395, "epsilon_dpo/beta_margin_grad_std": 0.19904620945453644, "epsilon_dpo/beta_margin_mean": 1.4192512035369873, "epsilon_dpo/beta_margin_std": 1.3140568733215332, "epsilon_dpo/loss_margin_mean": 97.86225891113281, "grad_norm": 57.257720947265625, "kl/avg_steps": 0.71875, "kl/beta": 0.014661198481917381, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.12550796568393707, "logits/rejected": 0.017032817006111145, "logps/chosen": -182.1523895263672, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -318.47454833984375, "loss": 0.6915, "rewards/accuracies": 0.890625, "rewards/chosen": -1.8180735111236572, "rewards/margins": 1.4192513227462769, "rewards/rejected": -3.2373247146606445, "step": 322 }, { "epoch": 0.47430249632892807, "epsilon_dpo/beta": 0.014444295316934586, "epsilon_dpo/beta_margin_grad_mean": -0.23776492476463318, "epsilon_dpo/beta_margin_grad_std": 0.17510882019996643, "epsilon_dpo/beta_margin_mean": 1.5026826858520508, "epsilon_dpo/beta_margin_std": 1.202879786491394, "epsilon_dpo/loss_margin_mean": 104.3030014038086, "grad_norm": 58.186004638671875, "kl/avg_steps": 0.78125, "kl/beta": 0.014556573703885078, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.171805115074251e-07, "logits/chosen": 0.07482028752565384, "logits/rejected": 0.24660231173038483, "logps/chosen": -181.14083862304688, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -309.27178955078125, "loss": 0.606, "rewards/accuracies": 0.90625, "rewards/chosen": -1.8771140575408936, "rewards/margins": 1.5026826858520508, "rewards/rejected": -3.3797965049743652, "step": 323 }, { "epoch": 0.47577092511013214, "epsilon_dpo/beta": 0.014341351576149464, "epsilon_dpo/beta_margin_grad_mean": -0.25068747997283936, "epsilon_dpo/beta_margin_grad_std": 0.2099665254354477, "epsilon_dpo/beta_margin_mean": 1.4978466033935547, "epsilon_dpo/beta_margin_std": 1.4200392961502075, "epsilon_dpo/loss_margin_mean": 104.87434387207031, "grad_norm": 89.12545013427734, "kl/avg_steps": 0.71875, "kl/beta": 0.014443731866776943, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.0508156418800354, "logits/rejected": 0.2568337023258209, "logps/chosen": -215.462646484375, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750091552734, "logps/rejected": -341.6793212890625, "loss": 0.6933, "rewards/accuracies": 0.859375, "rewards/chosen": -2.1571807861328125, "rewards/margins": 1.4978466033935547, "rewards/rejected": -3.6550276279449463, "step": 324 }, { "epoch": 0.47723935389133626, "epsilon_dpo/beta": 0.01422556210309267, "epsilon_dpo/beta_margin_grad_mean": -0.25655460357666016, "epsilon_dpo/beta_margin_grad_std": 0.17581619322299957, "epsilon_dpo/beta_margin_mean": 1.3322529792785645, "epsilon_dpo/beta_margin_std": 1.1336437463760376, "epsilon_dpo/loss_margin_mean": 93.88763427734375, "grad_norm": 53.843570709228516, "kl/avg_steps": 0.8125, "kl/beta": 0.014340657740831375, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.20970404148101807, "logits/rejected": 0.24382324516773224, "logps/chosen": -175.234130859375, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.0859375, "logps/rejected": -282.99212646484375, "loss": 0.6641, "rewards/accuracies": 0.90625, "rewards/chosen": -1.69527268409729, "rewards/margins": 1.3322529792785645, "rewards/rejected": -3.0275256633758545, "step": 325 }, { "epoch": 0.4787077826725404, "epsilon_dpo/beta": 0.014128695242106915, "epsilon_dpo/beta_margin_grad_mean": -0.28498223423957825, "epsilon_dpo/beta_margin_grad_std": 0.1808035671710968, "epsilon_dpo/beta_margin_mean": 1.1853976249694824, "epsilon_dpo/beta_margin_std": 1.1836676597595215, "epsilon_dpo/loss_margin_mean": 84.24244689941406, "grad_norm": 54.919639587402344, "kl/avg_steps": 0.6875, "kl/beta": 0.01422507967799902, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.2864975333213806, "logits/rejected": 0.1083693727850914, "logps/chosen": -200.7134552001953, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.84678649902344, "logps/rejected": -292.0777282714844, "loss": 0.7544, "rewards/accuracies": 0.875, "rewards/chosen": -1.809903860092163, "rewards/margins": 1.1853976249694824, "rewards/rejected": -2.9953014850616455, "step": 326 }, { "epoch": 0.4801762114537445, "epsilon_dpo/beta": 0.014023392461240292, "epsilon_dpo/beta_margin_grad_mean": -0.22554190456867218, "epsilon_dpo/beta_margin_grad_std": 0.18653175234794617, "epsilon_dpo/beta_margin_mean": 1.639188289642334, "epsilon_dpo/beta_margin_std": 1.3321127891540527, "epsilon_dpo/loss_margin_mean": 117.2615737915039, "grad_norm": 65.24697875976562, "kl/avg_steps": 0.75, "kl/beta": 0.014127950184047222, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.3468073904514313, "logits/rejected": -0.188334122300148, "logps/chosen": -177.39767456054688, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -337.4586181640625, "loss": 0.5909, "rewards/accuracies": 0.875, "rewards/chosen": -1.5198478698730469, "rewards/margins": 1.639188289642334, "rewards/rejected": -3.159036159515381, "step": 327 }, { "epoch": 0.48164464023494863, "epsilon_dpo/beta": 0.013918999582529068, "epsilon_dpo/beta_margin_grad_mean": -0.25955086946487427, "epsilon_dpo/beta_margin_grad_std": 0.17522956430912018, "epsilon_dpo/beta_margin_mean": 1.3695151805877686, "epsilon_dpo/beta_margin_std": 1.2336775064468384, "epsilon_dpo/loss_margin_mean": 98.67748260498047, "grad_norm": 57.12346267700195, "kl/avg_steps": 0.75, "kl/beta": 0.01402277871966362, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.1713501214981079, "logits/rejected": 0.03233088552951813, "logps/chosen": -177.60702514648438, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -307.45233154296875, "loss": 0.6664, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6427186727523804, "rewards/margins": 1.3695151805877686, "rewards/rejected": -3.0122337341308594, "step": 328 }, { "epoch": 0.4831130690161527, "epsilon_dpo/beta": 0.013811035081744194, "epsilon_dpo/beta_margin_grad_mean": -0.23463240265846252, "epsilon_dpo/beta_margin_grad_std": 0.18572793900966644, "epsilon_dpo/beta_margin_mean": 1.6088722944259644, "epsilon_dpo/beta_margin_std": 1.4085825681686401, "epsilon_dpo/loss_margin_mean": 116.8152847290039, "grad_norm": 52.39393997192383, "kl/avg_steps": 0.78125, "kl/beta": 0.013918391428887844, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.29046374559402466, "logits/rejected": -0.12796850502490997, "logps/chosen": -182.58395385742188, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -334.8507385253906, "loss": 0.6155, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6222935914993286, "rewards/margins": 1.608872413635254, "rewards/rejected": -3.231165885925293, "step": 329 }, { "epoch": 0.4845814977973568, "epsilon_dpo/beta": 0.013712604530155659, "epsilon_dpo/beta_margin_grad_mean": -0.24368812143802643, "epsilon_dpo/beta_margin_grad_std": 0.17975428700447083, "epsilon_dpo/beta_margin_mean": 1.4507081508636475, "epsilon_dpo/beta_margin_std": 1.2100770473480225, "epsilon_dpo/loss_margin_mean": 106.16722106933594, "grad_norm": 49.25497055053711, "kl/avg_steps": 0.71875, "kl/beta": 0.013810496777296066, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.084861204504122e-07, "logits/chosen": 0.06375602632761002, "logits/rejected": 0.08258255571126938, "logps/chosen": -165.3278350830078, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -311.3735656738281, "loss": 0.6328, "rewards/accuracies": 0.890625, "rewards/chosen": -1.6240264177322388, "rewards/margins": 1.4507081508636475, "rewards/rejected": -3.074734687805176, "step": 330 }, { "epoch": 0.48604992657856094, "epsilon_dpo/beta": 0.01360617857426405, "epsilon_dpo/beta_margin_grad_mean": -0.2268812209367752, "epsilon_dpo/beta_margin_grad_std": 0.1594099998474121, "epsilon_dpo/beta_margin_mean": 1.5454741716384888, "epsilon_dpo/beta_margin_std": 1.1557540893554688, "epsilon_dpo/loss_margin_mean": 113.86687469482422, "grad_norm": 51.23035430908203, "kl/avg_steps": 0.78125, "kl/beta": 0.013711942359805107, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.07446587085723877, "logits/rejected": 0.13634458184242249, "logps/chosen": -168.69863891601562, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -321.05670166015625, "loss": 0.5642, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6087958812713623, "rewards/margins": 1.5454741716384888, "rewards/rejected": -3.1542701721191406, "step": 331 }, { "epoch": 0.48751835535976507, "epsilon_dpo/beta": 0.013509208336472511, "epsilon_dpo/beta_margin_grad_mean": -0.28300759196281433, "epsilon_dpo/beta_margin_grad_std": 0.18324074149131775, "epsilon_dpo/beta_margin_mean": 1.1878498792648315, "epsilon_dpo/beta_margin_std": 1.1348134279251099, "epsilon_dpo/loss_margin_mean": 88.24505615234375, "grad_norm": 55.885032653808594, "kl/avg_steps": 0.71875, "kl/beta": 0.013605647720396519, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.014329172670841217, "logits/rejected": 0.17287902534008026, "logps/chosen": -178.20919799804688, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -293.7591247558594, "loss": 0.7446, "rewards/accuracies": 0.859375, "rewards/chosen": -1.7443733215332031, "rewards/margins": 1.187849760055542, "rewards/rejected": -2.932223320007324, "step": 332 }, { "epoch": 0.4889867841409692, "epsilon_dpo/beta": 0.013425469398498535, "epsilon_dpo/beta_margin_grad_mean": -0.2561701238155365, "epsilon_dpo/beta_margin_grad_std": 0.21217595040798187, "epsilon_dpo/beta_margin_mean": 1.5489706993103027, "epsilon_dpo/beta_margin_std": 1.5074926614761353, "epsilon_dpo/loss_margin_mean": 115.92626953125, "grad_norm": 50.82048416137695, "kl/avg_steps": 0.625, "kl/beta": 0.013508555479347706, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.0473617970527015e-07, "logits/chosen": 0.03314550966024399, "logits/rejected": 0.1178080290555954, "logps/chosen": -202.36419677734375, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -349.578857421875, "loss": 0.6949, "rewards/accuracies": 0.828125, "rewards/chosen": -1.8626306056976318, "rewards/margins": 1.5489706993103027, "rewards/rejected": -3.4116015434265137, "step": 333 }, { "epoch": 0.49045521292217326, "epsilon_dpo/beta": 0.013316906988620758, "epsilon_dpo/beta_margin_grad_mean": -0.25936245918273926, "epsilon_dpo/beta_margin_grad_std": 0.19826123118400574, "epsilon_dpo/beta_margin_mean": 1.36473548412323, "epsilon_dpo/beta_margin_std": 1.2763848304748535, "epsilon_dpo/loss_margin_mean": 102.80318450927734, "grad_norm": 86.10340118408203, "kl/avg_steps": 0.8125, "kl/beta": 0.013424650765955448, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.02073417231440544, "logits/rejected": 0.10776689648628235, "logps/chosen": -203.4918212890625, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -334.6322021484375, "loss": 0.7076, "rewards/accuracies": 0.90625, "rewards/chosen": -1.819995403289795, "rewards/margins": 1.36473548412323, "rewards/rejected": -3.1847307682037354, "step": 334 }, { "epoch": 0.4919236417033774, "epsilon_dpo/beta": 0.013234549202024937, "epsilon_dpo/beta_margin_grad_mean": -0.3029221296310425, "epsilon_dpo/beta_margin_grad_std": 0.2137981653213501, "epsilon_dpo/beta_margin_mean": 1.1251932382583618, "epsilon_dpo/beta_margin_std": 1.3580584526062012, "epsilon_dpo/loss_margin_mean": 85.52513122558594, "grad_norm": 89.16724395751953, "kl/avg_steps": 0.625, "kl/beta": 0.013316454365849495, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.022289525640531e-07, "logits/chosen": 0.02390405163168907, "logits/rejected": 0.2575310170650482, "logps/chosen": -210.85337829589844, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.6176986694336, "logps/rejected": -321.4537353515625, "loss": 0.8569, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9678313732147217, "rewards/margins": 1.1251931190490723, "rewards/rejected": -3.093024730682373, "step": 335 }, { "epoch": 0.4933920704845815, "epsilon_dpo/beta": 0.013144075870513916, "epsilon_dpo/beta_margin_grad_mean": -0.2478855401277542, "epsilon_dpo/beta_margin_grad_std": 0.20943517982959747, "epsilon_dpo/beta_margin_mean": 1.5077518224716187, "epsilon_dpo/beta_margin_std": 1.5056235790252686, "epsilon_dpo/loss_margin_mean": 115.24197387695312, "grad_norm": 67.42304992675781, "kl/avg_steps": 0.6875, "kl/beta": 0.013233743607997894, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.009732580450086e-07, "logits/chosen": 0.07678806036710739, "logits/rejected": 0.11152809858322144, "logps/chosen": -194.21603393554688, "logps/ref_chosen": -54.531150817871094, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -359.3310852050781, "loss": 0.71, "rewards/accuracies": 0.859375, "rewards/chosen": -1.839827299118042, "rewards/margins": 1.5077519416809082, "rewards/rejected": -3.347579002380371, "step": 336 }, { "epoch": 0.4948604992657856, "epsilon_dpo/beta": 0.013046111911535263, "epsilon_dpo/beta_margin_grad_mean": -0.24443912506103516, "epsilon_dpo/beta_margin_grad_std": 0.19509734213352203, "epsilon_dpo/beta_margin_mean": 1.4538227319717407, "epsilon_dpo/beta_margin_std": 1.2511672973632812, "epsilon_dpo/loss_margin_mean": 111.8341064453125, "grad_norm": 53.20949935913086, "kl/avg_steps": 0.75, "kl/beta": 0.013143382966518402, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.1043112725019455, "logits/rejected": -0.023440301418304443, "logps/chosen": -185.51742553710938, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -333.9498291015625, "loss": 0.6585, "rewards/accuracies": 0.890625, "rewards/chosen": -1.572738766670227, "rewards/margins": 1.4538227319717407, "rewards/rejected": -3.0265614986419678, "step": 337 }, { "epoch": 0.49632892804698975, "epsilon_dpo/beta": 0.012948994524776936, "epsilon_dpo/beta_margin_grad_mean": -0.2558075785636902, "epsilon_dpo/beta_margin_grad_std": 0.180355042219162, "epsilon_dpo/beta_margin_mean": 1.374841570854187, "epsilon_dpo/beta_margin_std": 1.1893986463546753, "epsilon_dpo/loss_margin_mean": 106.50833892822266, "grad_norm": 55.55891418457031, "kl/avg_steps": 0.75, "kl/beta": 0.013045541942119598, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.019446374848484993, "logits/rejected": -0.0027777403593063354, "logps/chosen": -175.84194946289062, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -312.9961242675781, "loss": 0.6629, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5235812664031982, "rewards/margins": 1.374841570854187, "rewards/rejected": -2.8984227180480957, "step": 338 }, { "epoch": 0.4977973568281938, "epsilon_dpo/beta": 0.01285259984433651, "epsilon_dpo/beta_margin_grad_mean": -0.25017017126083374, "epsilon_dpo/beta_margin_grad_std": 0.1595836877822876, "epsilon_dpo/beta_margin_mean": 1.3577648401260376, "epsilon_dpo/beta_margin_std": 1.1023646593093872, "epsilon_dpo/loss_margin_mean": 105.93833923339844, "grad_norm": 47.104557037353516, "kl/avg_steps": 0.75, "kl/beta": 0.01294842828065157, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.3091191351413727, "logits/rejected": -0.19397714734077454, "logps/chosen": -163.83560180664062, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -313.6895446777344, "loss": 0.6288, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3350287675857544, "rewards/margins": 1.3577649593353271, "rewards/rejected": -2.692793607711792, "step": 339 }, { "epoch": 0.49926578560939794, "epsilon_dpo/beta": 0.012758912518620491, "epsilon_dpo/beta_margin_grad_mean": -0.26304712891578674, "epsilon_dpo/beta_margin_grad_std": 0.1791672557592392, "epsilon_dpo/beta_margin_mean": 1.332050085067749, "epsilon_dpo/beta_margin_std": 1.2350640296936035, "epsilon_dpo/loss_margin_mean": 104.75978088378906, "grad_norm": 54.65445327758789, "kl/avg_steps": 0.734375, "kl/beta": 0.012852038256824017, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.2100875824689865, "logits/rejected": 0.047447167336940765, "logps/chosen": -154.89137268066406, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723876953125, "logps/rejected": -279.92816162109375, "loss": 0.6862, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3033416271209717, "rewards/margins": 1.332050085067749, "rewards/rejected": -2.6353917121887207, "step": 340 }, { "epoch": 0.5007342143906021, "epsilon_dpo/beta": 0.012671898119151592, "epsilon_dpo/beta_margin_grad_mean": -0.24114340543746948, "epsilon_dpo/beta_margin_grad_std": 0.19073477387428284, "epsilon_dpo/beta_margin_mean": 1.4527372121810913, "epsilon_dpo/beta_margin_std": 1.1797953844070435, "epsilon_dpo/loss_margin_mean": 115.11068725585938, "grad_norm": 54.916526794433594, "kl/avg_steps": 0.6875, "kl/beta": 0.012758344411849976, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.10914070904254913, "logits/rejected": -0.013678308576345444, "logps/chosen": -151.44482421875, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -320.465576171875, "loss": 0.638, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3139156103134155, "rewards/margins": 1.4527373313903809, "rewards/rejected": -2.766652822494507, "step": 341 }, { "epoch": 0.5022026431718062, "epsilon_dpo/beta": 0.012585373595356941, "epsilon_dpo/beta_margin_grad_mean": -0.26522305607795715, "epsilon_dpo/beta_margin_grad_std": 0.18035577237606049, "epsilon_dpo/beta_margin_mean": 1.3305476903915405, "epsilon_dpo/beta_margin_std": 1.2396719455718994, "epsilon_dpo/loss_margin_mean": 106.14080047607422, "grad_norm": 42.698951721191406, "kl/avg_steps": 0.6875, "kl/beta": 0.012671229429543018, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.33909744024276733, "logits/rejected": -0.02335232123732567, "logps/chosen": -164.9725341796875, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -291.706298828125, "loss": 0.6909, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1726986169815063, "rewards/margins": 1.33054780960083, "rewards/rejected": -2.503246307373047, "step": 342 }, { "epoch": 0.5036710719530103, "epsilon_dpo/beta": 0.012471907772123814, "epsilon_dpo/beta_margin_grad_mean": -0.24438782036304474, "epsilon_dpo/beta_margin_grad_std": 0.1653384417295456, "epsilon_dpo/beta_margin_mean": 1.3828749656677246, "epsilon_dpo/beta_margin_std": 1.0594128370285034, "epsilon_dpo/loss_margin_mean": 111.03284454345703, "grad_norm": 47.392791748046875, "kl/avg_steps": 0.90625, "kl/beta": 0.01258470956236124, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.2089230716228485, "logits/rejected": -0.17183662950992584, "logps/chosen": -145.00962829589844, "logps/ref_chosen": -54.40562438964844, "logps/ref_rejected": -111.04141998291016, "logps/rejected": -312.67828369140625, "loss": 0.6202, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1313905715942383, "rewards/margins": 1.3828749656677246, "rewards/rejected": -2.514265537261963, "step": 343 }, { "epoch": 0.5051395007342144, "epsilon_dpo/beta": 0.012383283115923405, "epsilon_dpo/beta_margin_grad_mean": -0.27973777055740356, "epsilon_dpo/beta_margin_grad_std": 0.18304574489593506, "epsilon_dpo/beta_margin_mean": 1.183401107788086, "epsilon_dpo/beta_margin_std": 1.180587887763977, "epsilon_dpo/loss_margin_mean": 95.94915008544922, "grad_norm": 50.056968688964844, "kl/avg_steps": 0.71875, "kl/beta": 0.01247168518602848, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.13662785291671753, "logits/rejected": 0.021654516458511353, "logps/chosen": -155.55010986328125, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -288.1579284667969, "loss": 0.7518, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2603166103363037, "rewards/margins": 1.183401107788086, "rewards/rejected": -2.4437174797058105, "step": 344 }, { "epoch": 0.5066079295154186, "epsilon_dpo/beta": 0.012298782356083393, "epsilon_dpo/beta_margin_grad_mean": -0.26846975088119507, "epsilon_dpo/beta_margin_grad_std": 0.19623839855194092, "epsilon_dpo/beta_margin_mean": 1.3173902034759521, "epsilon_dpo/beta_margin_std": 1.2630058526992798, "epsilon_dpo/loss_margin_mean": 107.56978607177734, "grad_norm": 53.13343811035156, "kl/avg_steps": 0.6875, "kl/beta": 0.012382684275507927, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.17985820770263672, "logits/rejected": -0.0756099596619606, "logps/chosen": -184.08670043945312, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49040985107422, "logps/rejected": -329.461181640625, "loss": 0.7218, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5073106288909912, "rewards/margins": 1.3173902034759521, "rewards/rejected": -2.8247008323669434, "step": 345 }, { "epoch": 0.5080763582966226, "epsilon_dpo/beta": 0.012226336635649204, "epsilon_dpo/beta_margin_grad_mean": -0.28209608793258667, "epsilon_dpo/beta_margin_grad_std": 0.21230106055736542, "epsilon_dpo/beta_margin_mean": 1.222169041633606, "epsilon_dpo/beta_margin_std": 1.290783166885376, "epsilon_dpo/loss_margin_mean": 100.54989624023438, "grad_norm": 61.39634704589844, "kl/avg_steps": 0.59375, "kl/beta": 0.01229813415557146, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.883479137196714e-07, "logits/chosen": 0.08478528261184692, "logits/rejected": 0.20450150966644287, "logps/chosen": -187.56558227539062, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -310.2745361328125, "loss": 0.7973, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6205660104751587, "rewards/margins": 1.222169041633606, "rewards/rejected": -2.8427350521087646, "step": 346 }, { "epoch": 0.5095447870778267, "epsilon_dpo/beta": 0.01213888730853796, "epsilon_dpo/beta_margin_grad_mean": -0.2620489299297333, "epsilon_dpo/beta_margin_grad_std": 0.2006431370973587, "epsilon_dpo/beta_margin_mean": 1.3762738704681396, "epsilon_dpo/beta_margin_std": 1.3768495321273804, "epsilon_dpo/loss_margin_mean": 113.8448486328125, "grad_norm": 56.24908447265625, "kl/avg_steps": 0.71875, "kl/beta": 0.012225545011460781, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.0002256631851196289, "logits/rejected": 0.14817695319652557, "logps/chosen": -186.6798095703125, "logps/ref_chosen": -57.56624221801758, "logps/ref_rejected": -92.35508728027344, "logps/rejected": -335.31353759765625, "loss": 0.7247, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5691325664520264, "rewards/margins": 1.3762738704681396, "rewards/rejected": -2.945406436920166, "step": 347 }, { "epoch": 0.5110132158590308, "epsilon_dpo/beta": 0.012059849686920643, "epsilon_dpo/beta_margin_grad_mean": -0.2906031012535095, "epsilon_dpo/beta_margin_grad_std": 0.20753613114356995, "epsilon_dpo/beta_margin_mean": 1.181694746017456, "epsilon_dpo/beta_margin_std": 1.2876741886138916, "epsilon_dpo/loss_margin_mean": 98.48127746582031, "grad_norm": 69.49148559570312, "kl/avg_steps": 0.65625, "kl/beta": 0.012138301506638527, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.858096518347179e-07, "logits/chosen": 0.057972244918346405, "logits/rejected": 0.12979300320148468, "logps/chosen": -175.77296447753906, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13837432861328, "logps/rejected": -307.0749206542969, "loss": 0.8094, "rewards/accuracies": 0.828125, "rewards/chosen": -1.442892074584961, "rewards/margins": 1.181694746017456, "rewards/rejected": -2.624586820602417, "step": 348 }, { "epoch": 0.5124816446402349, "epsilon_dpo/beta": 0.011988760903477669, "epsilon_dpo/beta_margin_grad_mean": -0.28718581795692444, "epsilon_dpo/beta_margin_grad_std": 0.20010027289390564, "epsilon_dpo/beta_margin_mean": 1.253356695175171, "epsilon_dpo/beta_margin_std": 1.3836594820022583, "epsilon_dpo/loss_margin_mean": 105.09356689453125, "grad_norm": 58.3474235534668, "kl/avg_steps": 0.59375, "kl/beta": 0.012059163302183151, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.845390887379706e-07, "logits/chosen": 0.06532445549964905, "logits/rejected": 0.09254080057144165, "logps/chosen": -173.82147216796875, "logps/ref_chosen": -58.0255126953125, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -318.3946838378906, "loss": 0.7825, "rewards/accuracies": 0.859375, "rewards/chosen": -1.390855073928833, "rewards/margins": 1.253356695175171, "rewards/rejected": -2.644211769104004, "step": 349 }, { "epoch": 0.5139500734214391, "epsilon_dpo/beta": 0.011903010308742523, "epsilon_dpo/beta_margin_grad_mean": -0.2723287343978882, "epsilon_dpo/beta_margin_grad_std": 0.18795283138751984, "epsilon_dpo/beta_margin_mean": 1.2802366018295288, "epsilon_dpo/beta_margin_std": 1.2380613088607788, "epsilon_dpo/loss_margin_mean": 107.95018005371094, "grad_norm": 73.92295837402344, "kl/avg_steps": 0.71875, "kl/beta": 0.01198798418045044, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.8326761550411346e-07, "logits/chosen": 0.1495445966720581, "logits/rejected": 0.19291891157627106, "logps/chosen": -190.48580932617188, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -323.97711181640625, "loss": 0.721, "rewards/accuracies": 0.890625, "rewards/chosen": -1.503514289855957, "rewards/margins": 1.2802364826202393, "rewards/rejected": -2.7837507724761963, "step": 350 }, { "epoch": 0.5154185022026432, "epsilon_dpo/beta": 0.011818069033324718, "epsilon_dpo/beta_margin_grad_mean": -0.24587078392505646, "epsilon_dpo/beta_margin_grad_std": 0.2066507488489151, "epsilon_dpo/beta_margin_mean": 1.5357071161270142, "epsilon_dpo/beta_margin_std": 1.447570562362671, "epsilon_dpo/loss_margin_mean": 130.4733123779297, "grad_norm": 59.51049041748047, "kl/avg_steps": 0.71875, "kl/beta": 0.011902435682713985, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.819952656376487e-07, "logits/chosen": 0.11146371066570282, "logits/rejected": 0.15263676643371582, "logps/chosen": -188.93045043945312, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -360.29705810546875, "loss": 0.6817, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5186275243759155, "rewards/margins": 1.5357072353363037, "rewards/rejected": -3.0543346405029297, "step": 351 }, { "epoch": 0.5168869309838473, "epsilon_dpo/beta": 0.01175589207559824, "epsilon_dpo/beta_margin_grad_mean": -0.32376158237457275, "epsilon_dpo/beta_margin_grad_std": 0.2207159847021103, "epsilon_dpo/beta_margin_mean": 1.0264657735824585, "epsilon_dpo/beta_margin_std": 1.3690868616104126, "epsilon_dpo/loss_margin_mean": 87.97261047363281, "grad_norm": 85.13878631591797, "kl/avg_steps": 0.53125, "kl/beta": 0.011817497201263905, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.8072207266617854e-07, "logits/chosen": 0.007934626191854477, "logits/rejected": 0.4826154410839081, "logps/chosen": -215.90182495117188, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -309.57293701171875, "loss": 0.925, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7098991870880127, "rewards/margins": 1.0264657735824585, "rewards/rejected": -2.7363648414611816, "step": 352 }, { "epoch": 0.5183553597650514, "epsilon_dpo/beta": 0.011690095067024231, "epsilon_dpo/beta_margin_grad_mean": -0.29996585845947266, "epsilon_dpo/beta_margin_grad_std": 0.21679334342479706, "epsilon_dpo/beta_margin_mean": 1.2114415168762207, "epsilon_dpo/beta_margin_std": 1.421209454536438, "epsilon_dpo/loss_margin_mean": 104.27862548828125, "grad_norm": 60.23612976074219, "kl/avg_steps": 0.5625, "kl/beta": 0.011755048297345638, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.794480701395219e-07, "logits/chosen": 0.22534312307834625, "logits/rejected": 0.4321860671043396, "logps/chosen": -200.62216186523438, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33552551269531, "logps/rejected": -326.84100341796875, "loss": 0.8387, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6637961864471436, "rewards/margins": 1.2114415168762207, "rewards/rejected": -2.8752379417419434, "step": 353 }, { "epoch": 0.5198237885462555, "epsilon_dpo/beta": 0.011602786369621754, "epsilon_dpo/beta_margin_grad_mean": -0.24083960056304932, "epsilon_dpo/beta_margin_grad_std": 0.16048724949359894, "epsilon_dpo/beta_margin_mean": 1.402290940284729, "epsilon_dpo/beta_margin_std": 1.0611681938171387, "epsilon_dpo/loss_margin_mean": 121.20055389404297, "grad_norm": 54.1386833190918, "kl/avg_steps": 0.75, "kl/beta": 0.011689295992255211, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.781732916288303e-07, "logits/chosen": 0.01698947697877884, "logits/rejected": 0.240818053483963, "logps/chosen": -175.9649658203125, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -326.1200256347656, "loss": 0.6058, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3482849597930908, "rewards/margins": 1.4022910594940186, "rewards/rejected": -2.7505760192871094, "step": 354 }, { "epoch": 0.5212922173274597, "epsilon_dpo/beta": 0.011512788012623787, "epsilon_dpo/beta_margin_grad_mean": -0.24913077056407928, "epsilon_dpo/beta_margin_grad_std": 0.17663627862930298, "epsilon_dpo/beta_margin_mean": 1.4240295886993408, "epsilon_dpo/beta_margin_std": 1.2208130359649658, "epsilon_dpo/loss_margin_mean": 124.04373168945312, "grad_norm": 48.92887496948242, "kl/avg_steps": 0.78125, "kl/beta": 0.011602279730141163, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.03938338905572891, "logits/rejected": 0.2880319356918335, "logps/chosen": -172.64614868164062, "logps/ref_chosen": -54.128501892089844, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -324.9674377441406, "loss": 0.6429, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3659427165985107, "rewards/margins": 1.4240295886993408, "rewards/rejected": -2.7899723052978516, "step": 355 }, { "epoch": 0.5227606461086637, "epsilon_dpo/beta": 0.011470315046608448, "epsilon_dpo/beta_margin_grad_mean": -0.3513711094856262, "epsilon_dpo/beta_margin_grad_std": 0.23108676075935364, "epsilon_dpo/beta_margin_mean": 0.8818143010139465, "epsilon_dpo/beta_margin_std": 1.3648110628128052, "epsilon_dpo/loss_margin_mean": 77.65859985351562, "grad_norm": 80.60954284667969, "kl/avg_steps": 0.375, "kl/beta": 0.011512339115142822, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.7562154104130176e-07, "logits/chosen": 0.08925038576126099, "logits/rejected": 0.4124196767807007, "logps/chosen": -213.70367431640625, "logps/ref_chosen": -64.67381286621094, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -302.58770751953125, "loss": 1.0289, "rewards/accuracies": 0.703125, "rewards/chosen": -1.7121202945709229, "rewards/margins": 0.8818143606185913, "rewards/rejected": -2.5939345359802246, "step": 356 }, { "epoch": 0.5242290748898678, "epsilon_dpo/beta": 0.011377277784049511, "epsilon_dpo/beta_margin_grad_mean": -0.25575509667396545, "epsilon_dpo/beta_margin_grad_std": 0.17103040218353271, "epsilon_dpo/beta_margin_mean": 1.3060200214385986, "epsilon_dpo/beta_margin_std": 1.1001414060592651, "epsilon_dpo/loss_margin_mean": 115.1030502319336, "grad_norm": 50.962833404541016, "kl/avg_steps": 0.8125, "kl/beta": 0.01146932877600193, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.7434463620546594e-07, "logits/chosen": 0.02392914518713951, "logits/rejected": 0.32534968852996826, "logps/chosen": -183.0228271484375, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -332.2412414550781, "loss": 0.6693, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4839692115783691, "rewards/margins": 1.3060200214385986, "rewards/rejected": -2.7899889945983887, "step": 357 }, { "epoch": 0.5256975036710719, "epsilon_dpo/beta": 0.011299805715680122, "epsilon_dpo/beta_margin_grad_mean": -0.28634849190711975, "epsilon_dpo/beta_margin_grad_std": 0.1681547611951828, "epsilon_dpo/beta_margin_mean": 1.1631741523742676, "epsilon_dpo/beta_margin_std": 1.1540768146514893, "epsilon_dpo/loss_margin_mean": 103.31038665771484, "grad_norm": 50.889225006103516, "kl/avg_steps": 0.6875, "kl/beta": 0.01137689221650362, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.006175622344017029, "logits/rejected": 0.0895409807562828, "logps/chosen": -187.41957092285156, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -315.8978271484375, "loss": 0.7395, "rewards/accuracies": 0.859375, "rewards/chosen": -1.404453992843628, "rewards/margins": 1.1631741523742676, "rewards/rejected": -2.5676283836364746, "step": 358 }, { "epoch": 0.527165932452276, "epsilon_dpo/beta": 0.011222649365663528, "epsilon_dpo/beta_margin_grad_mean": -0.27606135606765747, "epsilon_dpo/beta_margin_grad_std": 0.17397715151309967, "epsilon_dpo/beta_margin_mean": 1.2372063398361206, "epsilon_dpo/beta_margin_std": 1.1864436864852905, "epsilon_dpo/loss_margin_mean": 110.6605453491211, "grad_norm": 51.99663543701172, "kl/avg_steps": 0.6875, "kl/beta": 0.011299209669232368, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.717889356869146e-07, "logits/chosen": 0.12803149223327637, "logits/rejected": 0.40792912244796753, "logps/chosen": -197.83924865722656, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -334.3033447265625, "loss": 0.7173, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5897796154022217, "rewards/margins": 1.237206220626831, "rewards/rejected": -2.8269858360290527, "step": 359 }, { "epoch": 0.5286343612334802, "epsilon_dpo/beta": 0.011133727617561817, "epsilon_dpo/beta_margin_grad_mean": -0.291787713766098, "epsilon_dpo/beta_margin_grad_std": 0.15833166241645813, "epsilon_dpo/beta_margin_mean": 1.068414330482483, "epsilon_dpo/beta_margin_std": 0.9562557339668274, "epsilon_dpo/loss_margin_mean": 96.20708465576172, "grad_norm": 46.62980651855469, "kl/avg_steps": 0.796875, "kl/beta": 0.011222057975828648, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.7051020734928443e-07, "logits/chosen": 0.1601828932762146, "logits/rejected": 0.4318040609359741, "logps/chosen": -176.61279296875, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892822265625, "logps/rejected": -291.19842529296875, "loss": 0.7456, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3949909210205078, "rewards/margins": 1.068414330482483, "rewards/rejected": -2.463405132293701, "step": 360 }, { "epoch": 0.5301027900146843, "epsilon_dpo/beta": 0.011073543690145016, "epsilon_dpo/beta_margin_grad_mean": -0.31325334310531616, "epsilon_dpo/beta_margin_grad_std": 0.19027428328990936, "epsilon_dpo/beta_margin_mean": 0.9978066682815552, "epsilon_dpo/beta_margin_std": 1.0947188138961792, "epsilon_dpo/loss_margin_mean": 90.64729309082031, "grad_norm": 77.21360778808594, "kl/avg_steps": 0.546875, "kl/beta": 0.011133339256048203, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6923093854861593e-07, "logits/chosen": 0.16121695935726166, "logits/rejected": 0.22058174014091492, "logps/chosen": -185.80856323242188, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.76925659179688, "logps/rejected": -313.3555908203125, "loss": 0.8425, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4621036052703857, "rewards/margins": 0.9978066682815552, "rewards/rejected": -2.4599101543426514, "step": 361 }, { "epoch": 0.5315712187958884, "epsilon_dpo/beta": 0.010987376794219017, "epsilon_dpo/beta_margin_grad_mean": -0.25590401887893677, "epsilon_dpo/beta_margin_grad_std": 0.16981923580169678, "epsilon_dpo/beta_margin_mean": 1.3311046361923218, "epsilon_dpo/beta_margin_std": 1.1102828979492188, "epsilon_dpo/loss_margin_mean": 121.49525451660156, "grad_norm": 50.58975601196289, "kl/avg_steps": 0.78125, "kl/beta": 0.011072784662246704, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.036043211817741394, "logits/rejected": 0.011701637879014015, "logps/chosen": -174.71505737304688, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -343.1531982421875, "loss": 0.6548, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2764334678649902, "rewards/margins": 1.3311047554016113, "rewards/rejected": -2.6075382232666016, "step": 362 }, { "epoch": 0.5330396475770925, "epsilon_dpo/beta": 0.010891903191804886, "epsilon_dpo/beta_margin_grad_mean": -0.23322609066963196, "epsilon_dpo/beta_margin_grad_std": 0.15718205273151398, "epsilon_dpo/beta_margin_mean": 1.5058331489562988, "epsilon_dpo/beta_margin_std": 1.1756678819656372, "epsilon_dpo/loss_margin_mean": 138.46673583984375, "grad_norm": 44.88272476196289, "kl/avg_steps": 0.875, "kl/beta": 0.010986949317157269, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.6667091441120816e-07, "logits/chosen": 0.04104474559426308, "logits/rejected": 0.2909192144870758, "logps/chosen": -147.897216796875, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -316.50054931640625, "loss": 0.5798, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1261868476867676, "rewards/margins": 1.5058331489562988, "rewards/rejected": -2.6320199966430664, "step": 363 }, { "epoch": 0.5345080763582967, "epsilon_dpo/beta": 0.01082465797662735, "epsilon_dpo/beta_margin_grad_mean": -0.30966997146606445, "epsilon_dpo/beta_margin_grad_std": 0.18362931907176971, "epsilon_dpo/beta_margin_mean": 1.0146048069000244, "epsilon_dpo/beta_margin_std": 1.1044753789901733, "epsilon_dpo/loss_margin_mean": 94.23161315917969, "grad_norm": 53.01240921020508, "kl/avg_steps": 0.625, "kl/beta": 0.010891648009419441, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.6539022653348575e-07, "logits/chosen": 0.12989960610866547, "logits/rejected": 0.0695757195353508, "logps/chosen": -169.1478729248047, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -305.880615234375, "loss": 0.8271, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3043076992034912, "rewards/margins": 1.0146048069000244, "rewards/rejected": -2.3189125061035156, "step": 364 }, { "epoch": 0.5359765051395007, "epsilon_dpo/beta": 0.010743891820311546, "epsilon_dpo/beta_margin_grad_mean": -0.305558979511261, "epsilon_dpo/beta_margin_grad_std": 0.1742655485868454, "epsilon_dpo/beta_margin_mean": 1.0461580753326416, "epsilon_dpo/beta_margin_std": 1.1072211265563965, "epsilon_dpo/loss_margin_mean": 97.71156311035156, "grad_norm": 49.17789840698242, "kl/avg_steps": 0.75, "kl/beta": 0.010823997668921947, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.641091331089811e-07, "logits/chosen": 0.07260268926620483, "logits/rejected": 0.09923917800188065, "logps/chosen": -170.17611694335938, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -309.09661865234375, "loss": 0.7999, "rewards/accuracies": 0.875, "rewards/chosen": -1.2785989046096802, "rewards/margins": 1.0461580753326416, "rewards/rejected": -2.3247570991516113, "step": 365 }, { "epoch": 0.5374449339207048, "epsilon_dpo/beta": 0.010663913562893867, "epsilon_dpo/beta_margin_grad_mean": -0.2789558470249176, "epsilon_dpo/beta_margin_grad_std": 0.16324636340141296, "epsilon_dpo/beta_margin_mean": 1.1398682594299316, "epsilon_dpo/beta_margin_std": 0.9605399370193481, "epsilon_dpo/loss_margin_mean": 107.23007202148438, "grad_norm": 57.75393295288086, "kl/avg_steps": 0.75, "kl/beta": 0.01074342243373394, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.6282766789569736e-07, "logits/chosen": 0.18406611680984497, "logits/rejected": 0.2672308683395386, "logps/chosen": -154.07119750976562, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -299.89111328125, "loss": 0.7132, "rewards/accuracies": 0.875, "rewards/chosen": -1.167595624923706, "rewards/margins": 1.1398682594299316, "rewards/rejected": -2.307464122772217, "step": 366 }, { "epoch": 0.5389133627019089, "epsilon_dpo/beta": 0.010587861761450768, "epsilon_dpo/beta_margin_grad_mean": -0.3186591863632202, "epsilon_dpo/beta_margin_grad_std": 0.16735391318798065, "epsilon_dpo/beta_margin_mean": 0.9135677814483643, "epsilon_dpo/beta_margin_std": 0.9402992129325867, "epsilon_dpo/loss_margin_mean": 86.62907409667969, "grad_norm": 48.919776916503906, "kl/avg_steps": 0.71875, "kl/beta": 0.01066344603896141, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.003403276205062866, "logits/rejected": 0.19663584232330322, "logps/chosen": -173.90603637695312, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -278.88104248046875, "loss": 0.838, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2235524654388428, "rewards/margins": 0.9135677814483643, "rewards/rejected": -2.137120246887207, "step": 367 }, { "epoch": 0.540381791483113, "epsilon_dpo/beta": 0.010495759546756744, "epsilon_dpo/beta_margin_grad_mean": -0.2614386975765228, "epsilon_dpo/beta_margin_grad_std": 0.12906388938426971, "epsilon_dpo/beta_margin_mean": 1.203548550605774, "epsilon_dpo/beta_margin_std": 0.8884553909301758, "epsilon_dpo/loss_margin_mean": 114.82508087158203, "grad_norm": 77.68212890625, "kl/avg_steps": 0.875, "kl/beta": 0.010587349534034729, "kl/n_epsilon_steps": 0.0625, "kl/p_epsilon_steps": 0.9375, "learning_rate": 2.6026375718290083e-07, "logits/chosen": 0.05425267294049263, "logits/rejected": 0.1033497229218483, "logps/chosen": -151.20834350585938, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -320.13616943359375, "loss": 0.6394, "rewards/accuracies": 0.9375, "rewards/chosen": -1.120084524154663, "rewards/margins": 1.2035484313964844, "rewards/rejected": -2.3236331939697266, "step": 368 }, { "epoch": 0.5418502202643172, "epsilon_dpo/beta": 0.010434240102767944, "epsilon_dpo/beta_margin_grad_mean": -0.31353968381881714, "epsilon_dpo/beta_margin_grad_std": 0.17336703836917877, "epsilon_dpo/beta_margin_mean": 0.9541692733764648, "epsilon_dpo/beta_margin_std": 0.9706453084945679, "epsilon_dpo/loss_margin_mean": 91.92720794677734, "grad_norm": 68.10032653808594, "kl/avg_steps": 0.59375, "kl/beta": 0.010495513677597046, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.020310744643211365, "logits/rejected": 0.1938587725162506, "logps/chosen": -197.55783081054688, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -309.39935302734375, "loss": 0.8249, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3184516429901123, "rewards/margins": 0.9541692733764648, "rewards/rejected": -2.272620916366577, "step": 369 }, { "epoch": 0.5433186490455213, "epsilon_dpo/beta": 0.01039221789687872, "epsilon_dpo/beta_margin_grad_mean": -0.3402102291584015, "epsilon_dpo/beta_margin_grad_std": 0.1848820000886917, "epsilon_dpo/beta_margin_mean": 0.7936439514160156, "epsilon_dpo/beta_margin_std": 0.9488508701324463, "epsilon_dpo/loss_margin_mean": 76.98424530029297, "grad_norm": 61.3382453918457, "kl/avg_steps": 0.40625, "kl/beta": 0.010433564893901348, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.5769876463904263e-07, "logits/chosen": 0.026390574872493744, "logits/rejected": 0.21404683589935303, "logps/chosen": -217.964599609375, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -320.60113525390625, "loss": 0.9278, "rewards/accuracies": 0.765625, "rewards/chosen": -1.5233656167984009, "rewards/margins": 0.7936439514160156, "rewards/rejected": -2.317009449005127, "step": 370 }, { "epoch": 0.5447870778267254, "epsilon_dpo/beta": 0.010327436029911041, "epsilon_dpo/beta_margin_grad_mean": -0.31311824917793274, "epsilon_dpo/beta_margin_grad_std": 0.18779975175857544, "epsilon_dpo/beta_margin_mean": 0.9890053272247314, "epsilon_dpo/beta_margin_std": 1.0799168348312378, "epsilon_dpo/loss_margin_mean": 96.27989959716797, "grad_norm": 47.58628463745117, "kl/avg_steps": 0.625, "kl/beta": 0.010391349904239178, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.5641594716365744e-07, "logits/chosen": 0.1568828821182251, "logits/rejected": 0.2479465901851654, "logps/chosen": -214.4617156982422, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -340.499267578125, "loss": 0.8421, "rewards/accuracies": 0.796875, "rewards/chosen": -1.499226450920105, "rewards/margins": 0.9890053272247314, "rewards/rejected": -2.488231658935547, "step": 371 }, { "epoch": 0.5462555066079295, "epsilon_dpo/beta": 0.01025683619081974, "epsilon_dpo/beta_margin_grad_mean": -0.26346760988235474, "epsilon_dpo/beta_margin_grad_std": 0.18028797209262848, "epsilon_dpo/beta_margin_mean": 1.3152923583984375, "epsilon_dpo/beta_margin_std": 1.1791177988052368, "epsilon_dpo/loss_margin_mean": 128.73831176757812, "grad_norm": 50.93545913696289, "kl/avg_steps": 0.6875, "kl/beta": 0.010326807387173176, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.551329606220976e-07, "logits/chosen": 0.23591172695159912, "logits/rejected": 0.652916431427002, "logps/chosen": -206.0281982421875, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53949737548828, "logps/rejected": -351.48797607421875, "loss": 0.6862, "rewards/accuracies": 0.875, "rewards/chosen": -1.4801955223083496, "rewards/margins": 1.3152923583984375, "rewards/rejected": -2.795487880706787, "step": 372 }, { "epoch": 0.5477239353891337, "epsilon_dpo/beta": 0.010180390439927578, "epsilon_dpo/beta_margin_grad_mean": -0.27469882369041443, "epsilon_dpo/beta_margin_grad_std": 0.19345998764038086, "epsilon_dpo/beta_margin_mean": 1.2231866121292114, "epsilon_dpo/beta_margin_std": 1.177465558052063, "epsilon_dpo/loss_margin_mean": 120.59542846679688, "grad_norm": 63.232784271240234, "kl/avg_steps": 0.75, "kl/beta": 0.010256295092403889, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.538498388222517e-07, "logits/chosen": 0.1704447865486145, "logits/rejected": 0.6260873675346375, "logps/chosen": -221.25779724121094, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -363.595703125, "loss": 0.7512, "rewards/accuracies": 0.890625, "rewards/chosen": -1.601395606994629, "rewards/margins": 1.223186731338501, "rewards/rejected": -2.824582099914551, "step": 373 }, { "epoch": 0.5491923641703378, "epsilon_dpo/beta": 0.010104605928063393, "epsilon_dpo/beta_margin_grad_mean": -0.295759916305542, "epsilon_dpo/beta_margin_grad_std": 0.18086838722229004, "epsilon_dpo/beta_margin_mean": 1.1446397304534912, "epsilon_dpo/beta_margin_std": 1.1959160566329956, "epsilon_dpo/loss_margin_mean": 113.65790557861328, "grad_norm": 47.119110107421875, "kl/avg_steps": 0.75, "kl/beta": 0.010179945267736912, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.525666155755725e-07, "logits/chosen": 0.13347086310386658, "logits/rejected": 0.32731887698173523, "logps/chosen": -209.53182983398438, "logps/ref_chosen": -70.65017700195312, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -346.1797180175781, "loss": 0.7768, "rewards/accuracies": 0.875, "rewards/chosen": -1.4041626453399658, "rewards/margins": 1.1446397304534912, "rewards/rejected": -2.548802375793457, "step": 374 }, { "epoch": 0.5506607929515418, "epsilon_dpo/beta": 0.010035702027380466, "epsilon_dpo/beta_margin_grad_mean": -0.3037494719028473, "epsilon_dpo/beta_margin_grad_std": 0.21776020526885986, "epsilon_dpo/beta_margin_mean": 1.0716514587402344, "epsilon_dpo/beta_margin_std": 1.2841838598251343, "epsilon_dpo/loss_margin_mean": 107.4032211303711, "grad_norm": 85.46239471435547, "kl/avg_steps": 0.6875, "kl/beta": 0.010104164481163025, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.512833246961859e-07, "logits/chosen": 0.31618988513946533, "logits/rejected": 0.44418323040008545, "logps/chosen": -225.16519165039062, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -361.426513671875, "loss": 0.8852, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6596342325210571, "rewards/margins": 1.0716514587402344, "rewards/rejected": -2.731285810470581, "step": 375 }, { "epoch": 0.5521292217327459, "epsilon_dpo/beta": 0.009957768023014069, "epsilon_dpo/beta_margin_grad_mean": -0.2636283338069916, "epsilon_dpo/beta_margin_grad_std": 0.18567059934139252, "epsilon_dpo/beta_margin_mean": 1.3474147319793701, "epsilon_dpo/beta_margin_std": 1.2644068002700806, "epsilon_dpo/loss_margin_mean": 135.72604370117188, "grad_norm": 57.3476448059082, "kl/avg_steps": 0.78125, "kl/beta": 0.01003517210483551, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.5e-07, "logits/chosen": 0.33650436997413635, "logits/rejected": 0.52508544921875, "logps/chosen": -221.05125427246094, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.526611328125, "logps/rejected": -399.64361572265625, "loss": 0.6973, "rewards/accuracies": 0.875, "rewards/chosen": -1.579259991645813, "rewards/margins": 1.3474147319793701, "rewards/rejected": -2.9266748428344727, "step": 376 }, { "epoch": 0.55359765051395, "epsilon_dpo/beta": 0.00988991279155016, "epsilon_dpo/beta_margin_grad_mean": -0.27174854278564453, "epsilon_dpo/beta_margin_grad_std": 0.19310957193374634, "epsilon_dpo/beta_margin_mean": 1.2498548030853271, "epsilon_dpo/beta_margin_std": 1.1488646268844604, "epsilon_dpo/loss_margin_mean": 126.91059112548828, "grad_norm": 55.345767974853516, "kl/avg_steps": 0.6875, "kl/beta": 0.009957380592823029, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.487166753038141e-07, "logits/chosen": 0.593209981918335, "logits/rejected": 0.6385773420333862, "logps/chosen": -235.71920776367188, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -406.85443115234375, "loss": 0.7255, "rewards/accuracies": 0.84375, "rewards/chosen": -1.792405128479004, "rewards/margins": 1.2498548030853271, "rewards/rejected": -3.042259931564331, "step": 377 }, { "epoch": 0.5550660792951542, "epsilon_dpo/beta": 0.009819293394684792, "epsilon_dpo/beta_margin_grad_mean": -0.25957709550857544, "epsilon_dpo/beta_margin_grad_std": 0.17345236241817474, "epsilon_dpo/beta_margin_mean": 1.2708011865615845, "epsilon_dpo/beta_margin_std": 1.0593914985656738, "epsilon_dpo/loss_margin_mean": 129.88406372070312, "grad_norm": 51.589290618896484, "kl/avg_steps": 0.71875, "kl/beta": 0.009889391250908375, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.4743338442442754e-07, "logits/chosen": 0.762771725654602, "logits/rejected": 0.9840282201766968, "logps/chosen": -229.14559936523438, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -402.05609130859375, "loss": 0.6753, "rewards/accuracies": 0.90625, "rewards/chosen": -1.8094918727874756, "rewards/margins": 1.270801067352295, "rewards/rejected": -3.0802931785583496, "step": 378 }, { "epoch": 0.5565345080763583, "epsilon_dpo/beta": 0.009758426807820797, "epsilon_dpo/beta_margin_grad_mean": -0.27853649854660034, "epsilon_dpo/beta_margin_grad_std": 0.21985061466693878, "epsilon_dpo/beta_margin_mean": 1.2948954105377197, "epsilon_dpo/beta_margin_std": 1.3682698011398315, "epsilon_dpo/loss_margin_mean": 133.4654083251953, "grad_norm": 71.65619659423828, "kl/avg_steps": 0.625, "kl/beta": 0.009818818420171738, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.461501611777483e-07, "logits/chosen": 0.7221354246139526, "logits/rejected": 0.6323871612548828, "logps/chosen": -258.0139465332031, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.30015563964844, "logps/rejected": -452.597412109375, "loss": 0.783, "rewards/accuracies": 0.828125, "rewards/chosen": -2.002189874649048, "rewards/margins": 1.2948954105377197, "rewards/rejected": -3.2970852851867676, "step": 379 }, { "epoch": 0.5580029368575624, "epsilon_dpo/beta": 0.009688666090369225, "epsilon_dpo/beta_margin_grad_mean": -0.265032023191452, "epsilon_dpo/beta_margin_grad_std": 0.21453852951526642, "epsilon_dpo/beta_margin_mean": 1.363479733467102, "epsilon_dpo/beta_margin_std": 1.3592864274978638, "epsilon_dpo/loss_margin_mean": 141.36749267578125, "grad_norm": 55.80002212524414, "kl/avg_steps": 0.71875, "kl/beta": 0.009757831692695618, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.4486703937790243e-07, "logits/chosen": 0.773692786693573, "logits/rejected": 0.6141992807388306, "logps/chosen": -251.891845703125, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -446.0979919433594, "loss": 0.7465, "rewards/accuracies": 0.859375, "rewards/chosen": -1.944932460784912, "rewards/margins": 1.3634798526763916, "rewards/rejected": -3.3084120750427246, "step": 380 }, { "epoch": 0.5594713656387665, "epsilon_dpo/beta": 0.009625581093132496, "epsilon_dpo/beta_margin_grad_mean": -0.28838157653808594, "epsilon_dpo/beta_margin_grad_std": 0.21418903768062592, "epsilon_dpo/beta_margin_mean": 1.2413488626480103, "epsilon_dpo/beta_margin_std": 1.37782621383667, "epsilon_dpo/loss_margin_mean": 129.6344757080078, "grad_norm": 68.58415985107422, "kl/avg_steps": 0.65625, "kl/beta": 0.009688197635114193, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.435840528363426e-07, "logits/chosen": 0.5541897416114807, "logits/rejected": 1.0620379447937012, "logps/chosen": -253.16357421875, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -404.2143859863281, "loss": 0.8103, "rewards/accuracies": 0.828125, "rewards/chosen": -1.8841533660888672, "rewards/margins": 1.2413489818572998, "rewards/rejected": -3.125502109527588, "step": 381 }, { "epoch": 0.5609397944199707, "epsilon_dpo/beta": 0.009550793096423149, "epsilon_dpo/beta_margin_grad_mean": -0.2708124816417694, "epsilon_dpo/beta_margin_grad_std": 0.1647692173719406, "epsilon_dpo/beta_margin_mean": 1.1883574724197388, "epsilon_dpo/beta_margin_std": 0.994311511516571, "epsilon_dpo/loss_margin_mean": 124.78330993652344, "grad_norm": 51.3189811706543, "kl/avg_steps": 0.78125, "kl/beta": 0.009625033475458622, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.4230123536095745e-07, "logits/chosen": 0.19109681248664856, "logits/rejected": 0.26786884665489197, "logps/chosen": -245.92892456054688, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71015930175781, "logps/rejected": -415.402099609375, "loss": 0.697, "rewards/accuracies": 0.921875, "rewards/chosen": -1.720555305480957, "rewards/margins": 1.1883574724197388, "rewards/rejected": -2.9089126586914062, "step": 382 }, { "epoch": 0.5624082232011748, "epsilon_dpo/beta": 0.009485709480941296, "epsilon_dpo/beta_margin_grad_mean": -0.2903318703174591, "epsilon_dpo/beta_margin_grad_std": 0.19628944993019104, "epsilon_dpo/beta_margin_mean": 1.123332142829895, "epsilon_dpo/beta_margin_std": 1.1714568138122559, "epsilon_dpo/loss_margin_mean": 119.01622772216797, "grad_norm": 49.46955108642578, "kl/avg_steps": 0.6875, "kl/beta": 0.009550420567393303, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.4101862075518037e-07, "logits/chosen": 0.3420097827911377, "logits/rejected": 0.39168184995651245, "logps/chosen": -215.2762451171875, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -377.61688232421875, "loss": 0.799, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5670068264007568, "rewards/margins": 1.123332142829895, "rewards/rejected": -2.6903388500213623, "step": 383 }, { "epoch": 0.5638766519823789, "epsilon_dpo/beta": 0.009409083984792233, "epsilon_dpo/beta_margin_grad_mean": -0.29907065629959106, "epsilon_dpo/beta_margin_grad_std": 0.16258589923381805, "epsilon_dpo/beta_margin_mean": 1.0240944623947144, "epsilon_dpo/beta_margin_std": 0.9657134413719177, "epsilon_dpo/loss_margin_mean": 109.14332580566406, "grad_norm": 68.42369079589844, "kl/avg_steps": 0.8125, "kl/beta": 0.009485210292041302, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.397362428170992e-07, "logits/chosen": 0.18505258858203888, "logits/rejected": 0.20061829686164856, "logps/chosen": -193.0376434326172, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -335.895751953125, "loss": 0.7738, "rewards/accuracies": 0.875, "rewards/chosen": -1.3274247646331787, "rewards/margins": 1.0240944623947144, "rewards/rejected": -2.3515191078186035, "step": 384 }, { "epoch": 0.5653450807635829, "epsilon_dpo/beta": 0.009336191229522228, "epsilon_dpo/beta_margin_grad_mean": -0.281484991312027, "epsilon_dpo/beta_margin_grad_std": 0.14978937804698944, "epsilon_dpo/beta_margin_mean": 1.0686564445495605, "epsilon_dpo/beta_margin_std": 0.8089478015899658, "epsilon_dpo/loss_margin_mean": 114.78732299804688, "grad_norm": 69.06336975097656, "kl/avg_steps": 0.78125, "kl/beta": 0.009408763609826565, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.012512121349573135, "logits/rejected": 0.2826478183269501, "logps/chosen": -190.58724975585938, "logps/ref_chosen": -65.55216217041016, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -317.6503601074219, "loss": 0.7125, "rewards/accuracies": 0.90625, "rewards/chosen": -1.169029712677002, "rewards/margins": 1.0686564445495605, "rewards/rejected": -2.2376861572265625, "step": 385 }, { "epoch": 0.566813509544787, "epsilon_dpo/beta": 0.00927257165312767, "epsilon_dpo/beta_margin_grad_mean": -0.2851457893848419, "epsilon_dpo/beta_margin_grad_std": 0.17000393569469452, "epsilon_dpo/beta_margin_mean": 1.104270100593567, "epsilon_dpo/beta_margin_std": 0.9711874723434448, "epsilon_dpo/loss_margin_mean": 119.57515716552734, "grad_norm": 54.31593704223633, "kl/avg_steps": 0.6875, "kl/beta": 0.009335828013718128, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.3717233210430254e-07, "logits/chosen": 0.14980031549930573, "logits/rejected": 0.16187229752540588, "logps/chosen": -192.3538818359375, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -346.03460693359375, "loss": 0.7388, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2461304664611816, "rewards/margins": 1.1042702198028564, "rewards/rejected": -2.350400447845459, "step": 386 }, { "epoch": 0.5682819383259912, "epsilon_dpo/beta": 0.009197666309773922, "epsilon_dpo/beta_margin_grad_mean": -0.30550339818000793, "epsilon_dpo/beta_margin_grad_std": 0.1537167876958847, "epsilon_dpo/beta_margin_mean": 0.9292916059494019, "epsilon_dpo/beta_margin_std": 0.7956849336624146, "epsilon_dpo/loss_margin_mean": 101.3406753540039, "grad_norm": 54.8062744140625, "kl/avg_steps": 0.8125, "kl/beta": 0.009272081777453423, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.17502695322036743, "logits/rejected": 0.27621710300445557, "logps/chosen": -209.55714416503906, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -336.64752197265625, "loss": 0.79, "rewards/accuracies": 0.921875, "rewards/chosen": -1.317199945449829, "rewards/margins": 0.9292916059494019, "rewards/rejected": -2.2464914321899414, "step": 387 }, { "epoch": 0.5697503671071953, "epsilon_dpo/beta": 0.009132160805165768, "epsilon_dpo/beta_margin_grad_mean": -0.26968178153038025, "epsilon_dpo/beta_margin_grad_std": 0.17602618038654327, "epsilon_dpo/beta_margin_mean": 1.242363452911377, "epsilon_dpo/beta_margin_std": 1.1079589128494263, "epsilon_dpo/loss_margin_mean": 136.5463104248047, "grad_norm": 55.63298797607422, "kl/avg_steps": 0.71875, "kl/beta": 0.009197353385388851, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.3460977346651428e-07, "logits/chosen": 0.36801832914352417, "logits/rejected": 0.31209510564804077, "logps/chosen": -204.08885192871094, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -394.93878173828125, "loss": 0.7026, "rewards/accuracies": 0.875, "rewards/chosen": -1.4085330963134766, "rewards/margins": 1.242363452911377, "rewards/rejected": -2.6508965492248535, "step": 388 }, { "epoch": 0.5712187958883994, "epsilon_dpo/beta": 0.009072699584066868, "epsilon_dpo/beta_margin_grad_mean": -0.28545913100242615, "epsilon_dpo/beta_margin_grad_std": 0.18159131705760956, "epsilon_dpo/beta_margin_mean": 1.1541965007781982, "epsilon_dpo/beta_margin_std": 1.1000096797943115, "epsilon_dpo/loss_margin_mean": 127.77715301513672, "grad_norm": 69.67356872558594, "kl/avg_steps": 0.65625, "kl/beta": 0.0091317193582654, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.3332908558879177e-07, "logits/chosen": 0.4236670136451721, "logits/rejected": 0.7409931421279907, "logps/chosen": -235.445068359375, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -383.23016357421875, "loss": 0.7525, "rewards/accuracies": 0.875, "rewards/chosen": -1.6125736236572266, "rewards/margins": 1.1541965007781982, "rewards/rejected": -2.766770124435425, "step": 389 }, { "epoch": 0.5726872246696035, "epsilon_dpo/beta": 0.00901071261614561, "epsilon_dpo/beta_margin_grad_mean": -0.3042297661304474, "epsilon_dpo/beta_margin_grad_std": 0.20502284169197083, "epsilon_dpo/beta_margin_mean": 1.0643731355667114, "epsilon_dpo/beta_margin_std": 1.284492015838623, "epsilon_dpo/loss_margin_mean": 118.77655792236328, "grad_norm": 85.99386596679688, "kl/avg_steps": 0.6875, "kl/beta": 0.009072182700037956, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.320488370051681e-07, "logits/chosen": 0.5675963163375854, "logits/rejected": 0.8187364339828491, "logps/chosen": -228.89016723632812, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -383.968994140625, "loss": 0.8619, "rewards/accuracies": 0.875, "rewards/chosen": -1.6221460103988647, "rewards/margins": 1.0643731355667114, "rewards/rejected": -2.686519145965576, "step": 390 }, { "epoch": 0.5741556534508077, "epsilon_dpo/beta": 0.008957634679973125, "epsilon_dpo/beta_margin_grad_mean": -0.34145286679267883, "epsilon_dpo/beta_margin_grad_std": 0.17664246261119843, "epsilon_dpo/beta_margin_mean": 0.8340969681739807, "epsilon_dpo/beta_margin_std": 1.0772384405136108, "epsilon_dpo/loss_margin_mean": 93.67467498779297, "grad_norm": 82.07687377929688, "kl/avg_steps": 0.59375, "kl/beta": 0.009010237641632557, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3076906145138405e-07, "logits/chosen": 0.39004236459732056, "logits/rejected": 0.5632044076919556, "logps/chosen": -245.46572875976562, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -361.5489501953125, "loss": 0.9234, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6255539655685425, "rewards/margins": 0.8340969681739807, "rewards/rejected": -2.459650993347168, "step": 391 }, { "epoch": 0.5756240822320118, "epsilon_dpo/beta": 0.008885168470442295, "epsilon_dpo/beta_margin_grad_mean": -0.2806675434112549, "epsilon_dpo/beta_margin_grad_std": 0.16160036623477936, "epsilon_dpo/beta_margin_mean": 1.1598118543624878, "epsilon_dpo/beta_margin_std": 1.0425323247909546, "epsilon_dpo/loss_margin_mean": 130.8539276123047, "grad_norm": 53.46550369262695, "kl/avg_steps": 0.8125, "kl/beta": 0.008957055397331715, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.294897926507156e-07, "logits/chosen": 0.3719983696937561, "logits/rejected": 0.43438470363616943, "logps/chosen": -207.44383239746094, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34583282470703, "logps/rejected": -387.1396484375, "loss": 0.716, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3687458038330078, "rewards/margins": 1.1598117351531982, "rewards/rejected": -2.528557538986206, "step": 392 }, { "epoch": 0.5770925110132159, "epsilon_dpo/beta": 0.008838548325002193, "epsilon_dpo/beta_margin_grad_mean": -0.32830435037612915, "epsilon_dpo/beta_margin_grad_std": 0.18955926597118378, "epsilon_dpo/beta_margin_mean": 0.9347378015518188, "epsilon_dpo/beta_margin_std": 1.1401594877243042, "epsilon_dpo/loss_margin_mean": 106.420166015625, "grad_norm": 48.940673828125, "kl/avg_steps": 0.53125, "kl/beta": 0.008884865790605545, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.2821106431308543e-07, "logits/chosen": 0.5507162809371948, "logits/rejected": 0.8104848861694336, "logps/chosen": -205.63404846191406, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -337.54913330078125, "loss": 0.8873, "rewards/accuracies": 0.796875, "rewards/chosen": -1.4089500904083252, "rewards/margins": 0.9347378015518188, "rewards/rejected": -2.3436877727508545, "step": 393 }, { "epoch": 0.57856093979442, "epsilon_dpo/beta": 0.00877803098410368, "epsilon_dpo/beta_margin_grad_mean": -0.29959022998809814, "epsilon_dpo/beta_margin_grad_std": 0.19315236806869507, "epsilon_dpo/beta_margin_mean": 1.0850220918655396, "epsilon_dpo/beta_margin_std": 1.1473870277404785, "epsilon_dpo/loss_margin_mean": 124.20958709716797, "grad_norm": 52.52149200439453, "kl/avg_steps": 0.6875, "kl/beta": 0.00883791409432888, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.2693291013417452e-07, "logits/chosen": 0.3315991461277008, "logits/rejected": 0.5701587796211243, "logps/chosen": -207.61373901367188, "logps/ref_chosen": -52.91154479980469, "logps/ref_rejected": -90.82263946533203, "logps/rejected": -369.7344055175781, "loss": 0.8089, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3601274490356445, "rewards/margins": 1.085022211074829, "rewards/rejected": -2.4451494216918945, "step": 394 }, { "epoch": 0.580029368575624, "epsilon_dpo/beta": 0.008716708980500698, "epsilon_dpo/beta_margin_grad_mean": -0.2946094274520874, "epsilon_dpo/beta_margin_grad_std": 0.1813129335641861, "epsilon_dpo/beta_margin_mean": 1.102211594581604, "epsilon_dpo/beta_margin_std": 1.0907398462295532, "epsilon_dpo/loss_margin_mean": 126.95230102539062, "grad_norm": 48.824092864990234, "kl/avg_steps": 0.703125, "kl/beta": 0.008777568116784096, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.2565536379453404e-07, "logits/chosen": 0.3647058606147766, "logits/rejected": 0.5051198601722717, "logps/chosen": -211.43392944335938, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -359.62274169921875, "loss": 0.7749, "rewards/accuracies": 0.859375, "rewards/chosen": -1.29952073097229, "rewards/margins": 1.1022114753723145, "rewards/rejected": -2.4017324447631836, "step": 395 }, { "epoch": 0.5814977973568282, "epsilon_dpo/beta": 0.008659947663545609, "epsilon_dpo/beta_margin_grad_mean": -0.30411437153816223, "epsilon_dpo/beta_margin_grad_std": 0.16896657645702362, "epsilon_dpo/beta_margin_mean": 0.9894986152648926, "epsilon_dpo/beta_margin_std": 0.9693487286567688, "epsilon_dpo/loss_margin_mean": 114.83411407470703, "grad_norm": 43.686859130859375, "kl/avg_steps": 0.65625, "kl/beta": 0.008716282434761524, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.2437845895869825e-07, "logits/chosen": 0.11506186425685883, "logits/rejected": 0.3793829083442688, "logps/chosen": -221.6730499267578, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -356.1578674316406, "loss": 0.7989, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3240954875946045, "rewards/margins": 0.9894986152648926, "rewards/rejected": -2.313594102859497, "step": 396 }, { "epoch": 0.5829662261380323, "epsilon_dpo/beta": 0.008592660538852215, "epsilon_dpo/beta_margin_grad_mean": -0.2829607129096985, "epsilon_dpo/beta_margin_grad_std": 0.16558720171451569, "epsilon_dpo/beta_margin_mean": 1.1373881101608276, "epsilon_dpo/beta_margin_std": 1.0025265216827393, "epsilon_dpo/loss_margin_mean": 132.73634338378906, "grad_norm": 38.96381759643555, "kl/avg_steps": 0.78125, "kl/beta": 0.008659454062581062, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.2310222927429716e-07, "logits/chosen": 0.10061437636613846, "logits/rejected": 0.2683975398540497, "logps/chosen": -196.1708984375, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -370.7462158203125, "loss": 0.7248, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1615934371948242, "rewards/margins": 1.1373881101608276, "rewards/rejected": -2.2989816665649414, "step": 397 }, { "epoch": 0.5844346549192364, "epsilon_dpo/beta": 0.008528737351298332, "epsilon_dpo/beta_margin_grad_mean": -0.3027188777923584, "epsilon_dpo/beta_margin_grad_std": 0.16493533551692963, "epsilon_dpo/beta_margin_mean": 1.0099176168441772, "epsilon_dpo/beta_margin_std": 0.9829539656639099, "epsilon_dpo/loss_margin_mean": 118.81352996826172, "grad_norm": 42.015350341796875, "kl/avg_steps": 0.75, "kl/beta": 0.008592327125370502, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.0056856414303183556, "logits/rejected": 0.051807716488838196, "logps/chosen": -214.44459533691406, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -373.63018798828125, "loss": 0.787, "rewards/accuracies": 0.875, "rewards/chosen": -1.248372197151184, "rewards/margins": 1.0099174976348877, "rewards/rejected": -2.2582898139953613, "step": 398 }, { "epoch": 0.5859030837004405, "epsilon_dpo/beta": 0.008465247228741646, "epsilon_dpo/beta_margin_grad_mean": -0.2791239619255066, "epsilon_dpo/beta_margin_grad_std": 0.17180785536766052, "epsilon_dpo/beta_margin_mean": 1.1316139698028564, "epsilon_dpo/beta_margin_std": 0.964480996131897, "epsilon_dpo/loss_margin_mean": 134.1562042236328, "grad_norm": 38.74634552001953, "kl/avg_steps": 0.75, "kl/beta": 0.008528363890945911, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.2055192986047804e-07, "logits/chosen": 0.03982260078191757, "logits/rejected": 0.4843941926956177, "logps/chosen": -183.54969787597656, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.96558380126953, "logps/rejected": -334.78167724609375, "loss": 0.7258, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0403821468353271, "rewards/margins": 1.1316139698028564, "rewards/rejected": -2.1719961166381836, "step": 399 }, { "epoch": 0.5873715124816447, "epsilon_dpo/beta": 0.008412813767790794, "epsilon_dpo/beta_margin_grad_mean": -0.2643857002258301, "epsilon_dpo/beta_margin_grad_std": 0.18431194126605988, "epsilon_dpo/beta_margin_mean": 1.2925533056259155, "epsilon_dpo/beta_margin_std": 1.1452906131744385, "epsilon_dpo/loss_margin_mean": 154.34190368652344, "grad_norm": 49.45317077636719, "kl/avg_steps": 0.625, "kl/beta": 0.008464877493679523, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.192779273338215e-07, "logits/chosen": 0.012739516794681549, "logits/rejected": 0.17742590606212616, "logps/chosen": -184.56044006347656, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -380.51141357421875, "loss": 0.6919, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0188145637512207, "rewards/margins": 1.2925533056259155, "rewards/rejected": -2.311367988586426, "step": 400 }, { "epoch": 0.5873715124816447, "eval_epsilon_dpo/beta": 0.008376333862543106, "eval_epsilon_dpo/beta_margin_grad_mean": -0.37250053882598877, "eval_epsilon_dpo/beta_margin_grad_std": 0.1851673126220703, "eval_epsilon_dpo/beta_margin_mean": 0.6458921432495117, "eval_epsilon_dpo/beta_margin_std": 0.9830819964408875, "eval_epsilon_dpo/loss_margin_mean": 77.85261535644531, "eval_kl/n_epsilon_steps": 0.28082191944122314, "eval_kl/p_epsilon_steps": 0.7183219194412231, "eval_logits/chosen": 0.05544120445847511, "eval_logits/rejected": 0.2763102352619171, "eval_logps/chosen": -239.24766540527344, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -324.8471984863281, "eval_loss": 0.5233684778213501, "eval_rewards/accuracies": 0.7333047986030579, "eval_rewards/chosen": -1.3444827795028687, "eval_rewards/margins": 0.6458921432495117, "eval_rewards/rejected": -1.99037504196167, "eval_runtime": 43.1545, "eval_samples_per_second": 54.201, "eval_steps_per_second": 1.715, "step": 400 }, { "epoch": 0.5888399412628488, "epsilon_dpo/beta": 0.008373704738914967, "epsilon_dpo/beta_margin_grad_mean": -0.35600748658180237, "epsilon_dpo/beta_margin_grad_std": 0.218048557639122, "epsilon_dpo/beta_margin_mean": 0.8236421346664429, "epsilon_dpo/beta_margin_std": 1.309921145439148, "epsilon_dpo/loss_margin_mean": 99.26753997802734, "grad_norm": 50.62467575073242, "kl/avg_steps": 0.46875, "kl/beta": 0.008412300609052181, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.1800473436235136e-07, "logits/chosen": 0.2227509617805481, "logits/rejected": 0.32873934507369995, "logps/chosen": -199.46241760253906, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -325.35943603515625, "loss": 1.035, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1949478387832642, "rewards/margins": 0.8236421346664429, "rewards/rejected": -2.018589973449707, "step": 401 }, { "epoch": 0.5903083700440529, "epsilon_dpo/beta": 0.008303234353661537, "epsilon_dpo/beta_margin_grad_mean": -0.23055776953697205, "epsilon_dpo/beta_margin_grad_std": 0.1520979404449463, "epsilon_dpo/beta_margin_mean": 1.42051100730896, "epsilon_dpo/beta_margin_std": 0.9418905377388, "epsilon_dpo/loss_margin_mean": 171.4123077392578, "grad_norm": 27.302982330322266, "kl/avg_steps": 0.84375, "kl/beta": 0.008373051881790161, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.1673238449588665e-07, "logits/chosen": 0.1841059774160385, "logits/rejected": 0.4696100056171417, "logps/chosen": -155.729248046875, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -357.44720458984375, "loss": 0.5727, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8723675012588501, "rewards/margins": 1.42051100730896, "rewards/rejected": -2.2928786277770996, "step": 402 }, { "epoch": 0.591776798825257, "epsilon_dpo/beta": 0.00824414286762476, "epsilon_dpo/beta_margin_grad_mean": -0.29532960057258606, "epsilon_dpo/beta_margin_grad_std": 0.18958072364330292, "epsilon_dpo/beta_margin_mean": 1.0768463611602783, "epsilon_dpo/beta_margin_std": 1.0528734922409058, "epsilon_dpo/loss_margin_mean": 131.1674041748047, "grad_norm": 37.2651252746582, "kl/avg_steps": 0.71875, "kl/beta": 0.008302995935082436, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.154609112620295e-07, "logits/chosen": 0.3366158604621887, "logits/rejected": 0.5971213579177856, "logps/chosen": -170.4490509033203, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -331.73577880859375, "loss": 0.7888, "rewards/accuracies": 0.828125, "rewards/chosen": -1.0179142951965332, "rewards/margins": 1.0768463611602783, "rewards/rejected": -2.0947608947753906, "step": 403 }, { "epoch": 0.593245227606461, "epsilon_dpo/beta": 0.008187886327505112, "epsilon_dpo/beta_margin_grad_mean": -0.2925785481929779, "epsilon_dpo/beta_margin_grad_std": 0.18369947373867035, "epsilon_dpo/beta_margin_mean": 1.1043678522109985, "epsilon_dpo/beta_margin_std": 1.079614281654358, "epsilon_dpo/loss_margin_mean": 135.46914672851562, "grad_norm": 50.897254943847656, "kl/avg_steps": 0.6875, "kl/beta": 0.00824374333024025, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.1419034816528218e-07, "logits/chosen": 0.2518424093723297, "logits/rejected": 0.5586760640144348, "logps/chosen": -182.01962280273438, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -346.76849365234375, "loss": 0.7747, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1005568504333496, "rewards/margins": 1.104367971420288, "rewards/rejected": -2.2049245834350586, "step": 404 }, { "epoch": 0.5947136563876652, "epsilon_dpo/beta": 0.008149891160428524, "epsilon_dpo/beta_margin_grad_mean": -0.3321351110935211, "epsilon_dpo/beta_margin_grad_std": 0.19370493292808533, "epsilon_dpo/beta_margin_mean": 0.9106223583221436, "epsilon_dpo/beta_margin_std": 1.1335408687591553, "epsilon_dpo/loss_margin_mean": 112.54642486572266, "grad_norm": 48.55431365966797, "kl/avg_steps": 0.46875, "kl/beta": 0.008187455125153065, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.129207286861638e-07, "logits/chosen": 0.21004986763000488, "logits/rejected": 0.4672671854496002, "logps/chosen": -228.88922119140625, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -363.45953369140625, "loss": 0.9057, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3364191055297852, "rewards/margins": 0.9106223583221436, "rewards/rejected": -2.2470414638519287, "step": 405 }, { "epoch": 0.5961820851688693, "epsilon_dpo/beta": 0.008104225620627403, "epsilon_dpo/beta_margin_grad_mean": -0.3036099374294281, "epsilon_dpo/beta_margin_grad_std": 0.19255390763282776, "epsilon_dpo/beta_margin_mean": 1.0679973363876343, "epsilon_dpo/beta_margin_std": 1.1687884330749512, "epsilon_dpo/loss_margin_mean": 132.58038330078125, "grad_norm": 38.041080474853516, "kl/avg_steps": 0.5625, "kl/beta": 0.008149255067110062, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.1165208628032861e-07, "logits/chosen": 0.43436455726623535, "logits/rejected": 0.49039581418037415, "logps/chosen": -201.58392333984375, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -376.50213623046875, "loss": 0.8218, "rewards/accuracies": 0.796875, "rewards/chosen": -1.23288893699646, "rewards/margins": 1.0679974555969238, "rewards/rejected": -2.300886392593384, "step": 406 }, { "epoch": 0.5976505139500734, "epsilon_dpo/beta": 0.008051296696066856, "epsilon_dpo/beta_margin_grad_mean": -0.32882165908813477, "epsilon_dpo/beta_margin_grad_std": 0.19124306738376617, "epsilon_dpo/beta_margin_mean": 0.8842993974685669, "epsilon_dpo/beta_margin_std": 1.0626513957977295, "epsilon_dpo/loss_margin_mean": 110.4656753540039, "grad_norm": 80.59246826171875, "kl/avg_steps": 0.65625, "kl/beta": 0.008103672415018082, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.1038445437768375e-07, "logits/chosen": 0.39705413579940796, "logits/rejected": 0.7918181419372559, "logps/chosen": -231.44268798828125, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.5120849609375, "logps/rejected": -363.0897521972656, "loss": 0.9052, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4123256206512451, "rewards/margins": 0.8842993974685669, "rewards/rejected": -2.2966251373291016, "step": 407 }, { "epoch": 0.5991189427312775, "epsilon_dpo/beta": 0.008008869364857674, "epsilon_dpo/beta_margin_grad_mean": -0.34042710065841675, "epsilon_dpo/beta_margin_grad_std": 0.1824430227279663, "epsilon_dpo/beta_margin_mean": 0.7959102988243103, "epsilon_dpo/beta_margin_std": 0.9429380893707275, "epsilon_dpo/loss_margin_mean": 100.0704574584961, "grad_norm": 45.580322265625, "kl/avg_steps": 0.53125, "kl/beta": 0.008050838485360146, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0911786638150872e-07, "logits/chosen": 0.09460186958312988, "logits/rejected": 0.6105620265007019, "logps/chosen": -250.8351593017578, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -371.2132568359375, "loss": 0.9228, "rewards/accuracies": 0.75, "rewards/chosen": -1.4534653425216675, "rewards/margins": 0.7959102988243103, "rewards/rejected": -2.249375820159912, "step": 408 }, { "epoch": 0.6005873715124816, "epsilon_dpo/beta": 0.007951530627906322, "epsilon_dpo/beta_margin_grad_mean": -0.2959141135215759, "epsilon_dpo/beta_margin_grad_std": 0.16134285926818848, "epsilon_dpo/beta_margin_mean": 1.0359981060028076, "epsilon_dpo/beta_margin_std": 0.9457764029502869, "epsilon_dpo/loss_margin_mean": 130.778076171875, "grad_norm": 56.11100387573242, "kl/avg_steps": 0.71875, "kl/beta": 0.008008294738829136, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.0785235566757517e-07, "logits/chosen": 0.3741447925567627, "logits/rejected": 0.6117522716522217, "logps/chosen": -242.51821899414062, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -390.8831481933594, "loss": 0.7636, "rewards/accuracies": 0.875, "rewards/chosen": -1.3948218822479248, "rewards/margins": 1.0359981060028076, "rewards/rejected": -2.4308199882507324, "step": 409 }, { "epoch": 0.6020558002936858, "epsilon_dpo/beta": 0.007902241311967373, "epsilon_dpo/beta_margin_grad_mean": -0.3191192150115967, "epsilon_dpo/beta_margin_grad_std": 0.17301428318023682, "epsilon_dpo/beta_margin_mean": 0.9134294986724854, "epsilon_dpo/beta_margin_std": 0.9596654176712036, "epsilon_dpo/loss_margin_mean": 116.19683837890625, "grad_norm": 44.16848373413086, "kl/avg_steps": 0.625, "kl/beta": 0.007951145060360432, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.4321461319923401, "logits/rejected": 0.5176758766174316, "logps/chosen": -237.3287353515625, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -385.25921630859375, "loss": 0.8466, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4710114002227783, "rewards/margins": 0.9134294986724854, "rewards/rejected": -2.3844408988952637, "step": 410 }, { "epoch": 0.6035242290748899, "epsilon_dpo/beta": 0.007848219946026802, "epsilon_dpo/beta_margin_grad_mean": -0.3073974549770355, "epsilon_dpo/beta_margin_grad_std": 0.17039382457733154, "epsilon_dpo/beta_margin_mean": 1.0119707584381104, "epsilon_dpo/beta_margin_std": 1.0637365579605103, "epsilon_dpo/loss_margin_mean": 129.45492553710938, "grad_norm": 51.007633209228516, "kl/avg_steps": 0.6875, "kl/beta": 0.007901759818196297, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.0532469944670343e-07, "logits/chosen": 0.5662134885787964, "logits/rejected": 0.8120511770248413, "logps/chosen": -251.80374145507812, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -409.6463623046875, "loss": 0.8034, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5672738552093506, "rewards/margins": 1.0119706392288208, "rewards/rejected": -2.579244613647461, "step": 411 }, { "epoch": 0.604992657856094, "epsilon_dpo/beta": 0.00779463117942214, "epsilon_dpo/beta_margin_grad_mean": -0.3244841396808624, "epsilon_dpo/beta_margin_grad_std": 0.18310624361038208, "epsilon_dpo/beta_margin_mean": 0.8807581067085266, "epsilon_dpo/beta_margin_std": 0.9668706059455872, "epsilon_dpo/loss_margin_mean": 113.57080841064453, "grad_norm": 55.60865783691406, "kl/avg_steps": 0.6875, "kl/beta": 0.007847805507481098, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.0406262054585738e-07, "logits/chosen": 0.4706282317638397, "logits/rejected": 0.5481432676315308, "logps/chosen": -254.76193237304688, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.06080627441406, "logps/rejected": -415.2494201660156, "loss": 0.8768, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5729925632476807, "rewards/margins": 0.8807581067085266, "rewards/rejected": -2.4537506103515625, "step": 412 }, { "epoch": 0.6064610866372981, "epsilon_dpo/beta": 0.007729229982942343, "epsilon_dpo/beta_margin_grad_mean": -0.3099435269832611, "epsilon_dpo/beta_margin_grad_std": 0.16134311258792877, "epsilon_dpo/beta_margin_mean": 0.929050862789154, "epsilon_dpo/beta_margin_std": 0.8711987733840942, "epsilon_dpo/loss_margin_mean": 120.51229858398438, "grad_norm": 51.80922317504883, "kl/avg_steps": 0.84375, "kl/beta": 0.007794220466166735, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 2.0280175213768205e-07, "logits/chosen": 0.41288986802101135, "logits/rejected": 0.551357626914978, "logps/chosen": -275.64434814453125, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -434.048095703125, "loss": 0.8106, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6561181545257568, "rewards/margins": 0.9290508031845093, "rewards/rejected": -2.5851690769195557, "step": 413 }, { "epoch": 0.6079295154185022, "epsilon_dpo/beta": 0.007666975259780884, "epsilon_dpo/beta_margin_grad_mean": -0.27944430708885193, "epsilon_dpo/beta_margin_grad_std": 0.16479705274105072, "epsilon_dpo/beta_margin_mean": 1.1091288328170776, "epsilon_dpo/beta_margin_std": 0.9259233474731445, "epsilon_dpo/loss_margin_mean": 145.06768798828125, "grad_norm": 51.93803024291992, "kl/avg_steps": 0.8125, "kl/beta": 0.007729006931185722, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.0154212744723247e-07, "logits/chosen": 0.5738648176193237, "logits/rejected": 0.8426915407180786, "logps/chosen": -240.62554931640625, "logps/ref_chosen": -46.63148880004883, "logps/ref_rejected": -87.64652252197266, "logps/rejected": -426.708251953125, "loss": 0.7247, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4886877536773682, "rewards/margins": 1.1091289520263672, "rewards/rejected": -2.5978167057037354, "step": 414 }, { "epoch": 0.6093979441997063, "epsilon_dpo/beta": 0.007614767644554377, "epsilon_dpo/beta_margin_grad_mean": -0.32948988676071167, "epsilon_dpo/beta_margin_grad_std": 0.16906966269016266, "epsilon_dpo/beta_margin_mean": 0.8560322523117065, "epsilon_dpo/beta_margin_std": 0.9299007058143616, "epsilon_dpo/loss_margin_mean": 112.95114135742188, "grad_norm": 43.91434860229492, "kl/avg_steps": 0.6875, "kl/beta": 0.007666714955121279, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.002837796667909e-07, "logits/chosen": 0.23712509870529175, "logits/rejected": 0.4227851331233978, "logps/chosen": -294.5069580078125, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -429.3173522949219, "loss": 0.8712, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6471409797668457, "rewards/margins": 0.8560322523117065, "rewards/rejected": -2.5031731128692627, "step": 415 }, { "epoch": 0.6108663729809104, "epsilon_dpo/beta": 0.007555634714663029, "epsilon_dpo/beta_margin_grad_mean": -0.26847735047340393, "epsilon_dpo/beta_margin_grad_std": 0.16211014986038208, "epsilon_dpo/beta_margin_mean": 1.1980746984481812, "epsilon_dpo/beta_margin_std": 0.9696047306060791, "epsilon_dpo/loss_margin_mean": 158.9966583251953, "grad_norm": 66.8626708984375, "kl/avg_steps": 0.78125, "kl/beta": 0.007614366244524717, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.990267419549914e-07, "logits/chosen": 0.32131990790367126, "logits/rejected": 0.6265676021575928, "logps/chosen": -260.194091796875, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -451.4803466796875, "loss": 0.6837, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5257494449615479, "rewards/margins": 1.1980746984481812, "rewards/rejected": -2.7238240242004395, "step": 416 }, { "epoch": 0.6123348017621145, "epsilon_dpo/beta": 0.007487618364393711, "epsilon_dpo/beta_margin_grad_mean": -0.2677600085735321, "epsilon_dpo/beta_margin_grad_std": 0.141451895236969, "epsilon_dpo/beta_margin_mean": 1.1496177911758423, "epsilon_dpo/beta_margin_std": 0.8150914907455444, "epsilon_dpo/loss_margin_mean": 153.72714233398438, "grad_norm": 47.27936553955078, "kl/avg_steps": 0.90625, "kl/beta": 0.007555339951068163, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.9777104743594686e-07, "logits/chosen": 0.5093731880187988, "logits/rejected": 1.3290538787841797, "logps/chosen": -264.1747131347656, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -435.85498046875, "loss": 0.6667, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6028650999069214, "rewards/margins": 1.1496177911758423, "rewards/rejected": -2.7524828910827637, "step": 417 }, { "epoch": 0.6138032305433186, "epsilon_dpo/beta": 0.00744143221527338, "epsilon_dpo/beta_margin_grad_mean": -0.31553393602371216, "epsilon_dpo/beta_margin_grad_std": 0.17717210948467255, "epsilon_dpo/beta_margin_mean": 0.9751946926116943, "epsilon_dpo/beta_margin_std": 1.0679258108139038, "epsilon_dpo/loss_margin_mean": 131.72601318359375, "grad_norm": 49.26523971557617, "kl/avg_steps": 0.625, "kl/beta": 0.007487484719604254, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.965167291983757e-07, "logits/chosen": 0.11700999736785889, "logits/rejected": 0.5254498720169067, "logps/chosen": -300.4624328613281, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -454.90142822265625, "loss": 0.8347, "rewards/accuracies": 0.796875, "rewards/chosen": -1.627633810043335, "rewards/margins": 0.9751946926116943, "rewards/rejected": -2.6028285026550293, "step": 418 }, { "epoch": 0.6152716593245228, "epsilon_dpo/beta": 0.007385909557342529, "epsilon_dpo/beta_margin_grad_mean": -0.2904692590236664, "epsilon_dpo/beta_margin_grad_std": 0.17036795616149902, "epsilon_dpo/beta_margin_mean": 1.0811809301376343, "epsilon_dpo/beta_margin_std": 0.9915341734886169, "epsilon_dpo/loss_margin_mean": 146.87191772460938, "grad_norm": 46.66392135620117, "kl/avg_steps": 0.75, "kl/beta": 0.0074409786611795425, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.9526382029472988e-07, "logits/chosen": 0.3658443093299866, "logits/rejected": 0.5743440389633179, "logps/chosen": -245.95651245117188, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -431.462890625, "loss": 0.7559, "rewards/accuracies": 0.90625, "rewards/chosen": -1.427504062652588, "rewards/margins": 1.0811809301376343, "rewards/rejected": -2.5086851119995117, "step": 419 }, { "epoch": 0.6167400881057269, "epsilon_dpo/beta": 0.007344777230173349, "epsilon_dpo/beta_margin_grad_mean": -0.31808242201805115, "epsilon_dpo/beta_margin_grad_std": 0.17818176746368408, "epsilon_dpo/beta_margin_mean": 0.9353904724121094, "epsilon_dpo/beta_margin_std": 1.032126545906067, "epsilon_dpo/loss_margin_mean": 128.07164001464844, "grad_norm": 55.739376068115234, "kl/avg_steps": 0.5625, "kl/beta": 0.007385586854070425, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.9401235374032425e-07, "logits/chosen": 0.14379703998565674, "logits/rejected": 0.9165039658546448, "logps/chosen": -278.3549499511719, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -397.9765319824219, "loss": 0.8551, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4773731231689453, "rewards/margins": 0.9353904724121094, "rewards/rejected": -2.4127635955810547, "step": 420 }, { "epoch": 0.618208516886931, "epsilon_dpo/beta": 0.007305989041924477, "epsilon_dpo/beta_margin_grad_mean": -0.34329816699028015, "epsilon_dpo/beta_margin_grad_std": 0.1848863959312439, "epsilon_dpo/beta_margin_mean": 0.7727855443954468, "epsilon_dpo/beta_margin_std": 0.9297990798950195, "epsilon_dpo/loss_margin_mean": 106.55794525146484, "grad_norm": 60.28358840942383, "kl/avg_steps": 0.53125, "kl/beta": 0.007344275247305632, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.9276236251246653e-07, "logits/chosen": 0.2799881100654602, "logits/rejected": 0.49353882670402527, "logps/chosen": -237.02256774902344, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -379.0960998535156, "loss": 0.937, "rewards/accuracies": 0.734375, "rewards/chosen": -1.3415601253509521, "rewards/margins": 0.7727855443954468, "rewards/rejected": -2.1143455505371094, "step": 421 }, { "epoch": 0.6196769456681351, "epsilon_dpo/beta": 0.00725596584379673, "epsilon_dpo/beta_margin_grad_mean": -0.32523876428604126, "epsilon_dpo/beta_margin_grad_std": 0.17791172862052917, "epsilon_dpo/beta_margin_mean": 0.8585691452026367, "epsilon_dpo/beta_margin_std": 0.9232885837554932, "epsilon_dpo/loss_margin_mean": 118.95094299316406, "grad_norm": 49.03010559082031, "kl/avg_steps": 0.6875, "kl/beta": 0.007305465172976255, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.9151387954958792e-07, "logits/chosen": 0.20186124742031097, "logits/rejected": 0.5440545082092285, "logps/chosen": -262.7305603027344, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -400.9112548828125, "loss": 0.8759, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4108562469482422, "rewards/margins": 0.8585691452026367, "rewards/rejected": -2.269425392150879, "step": 422 }, { "epoch": 0.6211453744493393, "epsilon_dpo/beta": 0.0072086891159415245, "epsilon_dpo/beta_margin_grad_mean": -0.32009923458099365, "epsilon_dpo/beta_margin_grad_std": 0.1750791072845459, "epsilon_dpo/beta_margin_mean": 0.894817590713501, "epsilon_dpo/beta_margin_std": 0.9295312166213989, "epsilon_dpo/loss_margin_mean": 124.76802062988281, "grad_norm": 70.25402069091797, "kl/avg_steps": 0.65625, "kl/beta": 0.007255583070218563, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.902669377503756e-07, "logits/chosen": 0.34702229499816895, "logits/rejected": 0.37363946437835693, "logps/chosen": -236.0745849609375, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -392.15887451171875, "loss": 0.8532, "rewards/accuracies": 0.828125, "rewards/chosen": -1.307551622390747, "rewards/margins": 0.894817590713501, "rewards/rejected": -2.202369213104248, "step": 423 }, { "epoch": 0.6226138032305433, "epsilon_dpo/beta": 0.007159437518566847, "epsilon_dpo/beta_margin_grad_mean": -0.31579458713531494, "epsilon_dpo/beta_margin_grad_std": 0.15702416002750397, "epsilon_dpo/beta_margin_mean": 0.9235584139823914, "epsilon_dpo/beta_margin_std": 0.9231213331222534, "epsilon_dpo/loss_margin_mean": 129.53733825683594, "grad_norm": 69.93016815185547, "kl/avg_steps": 0.6875, "kl/beta": 0.007208278402686119, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.890215699729057e-07, "logits/chosen": 0.17603448033332825, "logits/rejected": 0.7972570657730103, "logps/chosen": -211.23085021972656, "logps/ref_chosen": -56.01191711425781, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -351.2352294921875, "loss": 0.8175, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1121392250061035, "rewards/margins": 0.9235584139823914, "rewards/rejected": -2.0356974601745605, "step": 424 }, { "epoch": 0.6240822320117474, "epsilon_dpo/beta": 0.007112789899110794, "epsilon_dpo/beta_margin_grad_mean": -0.3333708941936493, "epsilon_dpo/beta_margin_grad_std": 0.16707682609558105, "epsilon_dpo/beta_margin_mean": 0.8023787140846252, "epsilon_dpo/beta_margin_std": 0.8346377015113831, "epsilon_dpo/loss_margin_mean": 113.39553833007812, "grad_norm": 50.22777557373047, "kl/avg_steps": 0.65625, "kl/beta": 0.00715905986726284, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.8777780903377732e-07, "logits/chosen": 0.22520506381988525, "logits/rejected": 0.28665509819984436, "logps/chosen": -219.85743713378906, "logps/ref_chosen": -46.868995666503906, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -382.3094482421875, "loss": 0.884, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2333381175994873, "rewards/margins": 0.8023786544799805, "rewards/rejected": -2.0357167720794678, "step": 425 }, { "epoch": 0.6255506607929515, "epsilon_dpo/beta": 0.007057525217533112, "epsilon_dpo/beta_margin_grad_mean": -0.2971186637878418, "epsilon_dpo/beta_margin_grad_std": 0.16744133830070496, "epsilon_dpo/beta_margin_mean": 1.013198733329773, "epsilon_dpo/beta_margin_std": 0.9114711284637451, "epsilon_dpo/loss_margin_mean": 144.0508575439453, "grad_norm": 60.108436584472656, "kl/avg_steps": 0.78125, "kl/beta": 0.007112384773790836, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.021541226655244827, "logits/rejected": 0.46378132700920105, "logps/chosen": -232.7984619140625, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -381.5323486328125, "loss": 0.7755, "rewards/accuracies": 0.890625, "rewards/chosen": -1.1039516925811768, "rewards/margins": 1.013198733329773, "rewards/rejected": -2.1171505451202393, "step": 426 }, { "epoch": 0.6270190895741556, "epsilon_dpo/beta": 0.007016048766672611, "epsilon_dpo/beta_margin_grad_mean": -0.3493276834487915, "epsilon_dpo/beta_margin_grad_std": 0.15186835825443268, "epsilon_dpo/beta_margin_mean": 0.7190166711807251, "epsilon_dpo/beta_margin_std": 0.790506899356842, "epsilon_dpo/loss_margin_mean": 103.0052261352539, "grad_norm": 97.97515869140625, "kl/avg_steps": 0.59375, "kl/beta": 0.007057250011712313, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.8529523872436977e-07, "logits/chosen": 0.024022696539759636, "logits/rejected": 0.4438282549381256, "logps/chosen": -228.42538452148438, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.56600952148438, "logps/rejected": -345.14276123046875, "loss": 0.919, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1492910385131836, "rewards/margins": 0.7190166711807251, "rewards/rejected": -1.8683075904846191, "step": 427 }, { "epoch": 0.6284875183553598, "epsilon_dpo/beta": 0.006972445175051689, "epsilon_dpo/beta_margin_grad_mean": -0.3201913833618164, "epsilon_dpo/beta_margin_grad_std": 0.1774386167526245, "epsilon_dpo/beta_margin_mean": 0.8985069394111633, "epsilon_dpo/beta_margin_std": 0.9505627751350403, "epsilon_dpo/loss_margin_mean": 129.54954528808594, "grad_norm": 56.44048309326172, "kl/avg_steps": 0.625, "kl/beta": 0.0070155952125787735, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.8405649477212697e-07, "logits/chosen": 0.20027443766593933, "logits/rejected": 0.22859197854995728, "logps/chosen": -262.54144287109375, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28182220458984, "logps/rejected": -432.73614501953125, "loss": 0.8578, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3960375785827637, "rewards/margins": 0.8985069990158081, "rewards/rejected": -2.2945446968078613, "step": 428 }, { "epoch": 0.6299559471365639, "epsilon_dpo/beta": 0.006933495402336121, "epsilon_dpo/beta_margin_grad_mean": -0.32588937878608704, "epsilon_dpo/beta_margin_grad_std": 0.17860379815101624, "epsilon_dpo/beta_margin_mean": 0.8632559180259705, "epsilon_dpo/beta_margin_std": 0.9289405941963196, "epsilon_dpo/loss_margin_mean": 125.29979705810547, "grad_norm": 57.67830276489258, "kl/avg_steps": 0.5625, "kl/beta": 0.006972020026296377, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.828194884925749e-07, "logits/chosen": 0.008652932941913605, "logits/rejected": 0.5683068037033081, "logps/chosen": -284.10809326171875, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -419.96881103515625, "loss": 0.8753, "rewards/accuracies": 0.796875, "rewards/chosen": -1.4090235233306885, "rewards/margins": 0.8632558584213257, "rewards/rejected": -2.2722792625427246, "step": 429 }, { "epoch": 0.631424375917768, "epsilon_dpo/beta": 0.006888212636113167, "epsilon_dpo/beta_margin_grad_mean": -0.33034777641296387, "epsilon_dpo/beta_margin_grad_std": 0.1568160504102707, "epsilon_dpo/beta_margin_mean": 0.8251773118972778, "epsilon_dpo/beta_margin_std": 0.8462843298912048, "epsilon_dpo/loss_margin_mean": 120.34428405761719, "grad_norm": 52.97359848022461, "kl/avg_steps": 0.65625, "kl/beta": 0.006933021824806929, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.8158425248197928e-07, "logits/chosen": 0.1370442807674408, "logits/rejected": 0.24206362664699554, "logps/chosen": -252.5347442626953, "logps/ref_chosen": -60.92032241821289, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -416.38153076171875, "loss": 0.8641, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3209152221679688, "rewards/margins": 0.8251773118972778, "rewards/rejected": -2.146092414855957, "step": 430 }, { "epoch": 0.6328928046989721, "epsilon_dpo/beta": 0.006834692787379026, "epsilon_dpo/beta_margin_grad_mean": -0.29644519090652466, "epsilon_dpo/beta_margin_grad_std": 0.16910652816295624, "epsilon_dpo/beta_margin_mean": 1.0374089479446411, "epsilon_dpo/beta_margin_std": 0.9661343693733215, "epsilon_dpo/loss_margin_mean": 152.2759552001953, "grad_norm": 39.05802536010742, "kl/avg_steps": 0.78125, "kl/beta": 0.006887820549309254, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.8035081928995788e-07, "logits/chosen": 0.4554019570350647, "logits/rejected": 0.5327665209770203, "logps/chosen": -260.46075439453125, "logps/ref_chosen": -57.348751068115234, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -448.2281799316406, "loss": 0.7734, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3892830610275269, "rewards/margins": 1.0374089479446411, "rewards/rejected": -2.426692008972168, "step": 431 }, { "epoch": 0.6343612334801763, "epsilon_dpo/beta": 0.006779574789106846, "epsilon_dpo/beta_margin_grad_mean": -0.30400457978248596, "epsilon_dpo/beta_margin_grad_std": 0.15248362720012665, "epsilon_dpo/beta_margin_mean": 0.9762919545173645, "epsilon_dpo/beta_margin_std": 0.9004828333854675, "epsilon_dpo/loss_margin_mean": 144.37176513671875, "grad_norm": 32.576263427734375, "kl/avg_steps": 0.8125, "kl/beta": 0.006834426429122686, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.791192214186223e-07, "logits/chosen": 0.02793058566749096, "logits/rejected": 0.38804006576538086, "logps/chosen": -253.63150024414062, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -425.50799560546875, "loss": 0.7803, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2381863594055176, "rewards/margins": 0.9762920141220093, "rewards/rejected": -2.2144782543182373, "step": 432 }, { "epoch": 0.6358296622613803, "epsilon_dpo/beta": 0.006746122147887945, "epsilon_dpo/beta_margin_grad_mean": -0.35162287950515747, "epsilon_dpo/beta_margin_grad_std": 0.183163121342659, "epsilon_dpo/beta_margin_mean": 0.7110780477523804, "epsilon_dpo/beta_margin_std": 0.8855108618736267, "epsilon_dpo/loss_margin_mean": 106.28688049316406, "grad_norm": 44.58323287963867, "kl/avg_steps": 0.5, "kl/beta": 0.0067793442867696285, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7788949132172193e-07, "logits/chosen": 0.4049626588821411, "logits/rejected": 0.5291376113891602, "logps/chosen": -278.4117431640625, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -422.3763122558594, "loss": 0.9669, "rewards/accuracies": 0.765625, "rewards/chosen": -1.487854242324829, "rewards/margins": 0.7110780477523804, "rewards/rejected": -2.19893217086792, "step": 433 }, { "epoch": 0.6372980910425844, "epsilon_dpo/beta": 0.0066956933587789536, "epsilon_dpo/beta_margin_grad_mean": -0.3319990634918213, "epsilon_dpo/beta_margin_grad_std": 0.14507560431957245, "epsilon_dpo/beta_margin_mean": 0.816291868686676, "epsilon_dpo/beta_margin_std": 0.822716474533081, "epsilon_dpo/loss_margin_mean": 122.31977844238281, "grad_norm": 35.67780303955078, "kl/avg_steps": 0.75, "kl/beta": 0.006745615974068642, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.7666166140378853e-07, "logits/chosen": 0.21431401371955872, "logits/rejected": 0.6656895875930786, "logps/chosen": -254.0223388671875, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -392.8670349121094, "loss": 0.8566, "rewards/accuracies": 0.890625, "rewards/chosen": -1.28713059425354, "rewards/margins": 0.8162918090820312, "rewards/rejected": -2.1034226417541504, "step": 434 }, { "epoch": 0.6387665198237885, "epsilon_dpo/beta": 0.006650034803897142, "epsilon_dpo/beta_margin_grad_mean": -0.3107792139053345, "epsilon_dpo/beta_margin_grad_std": 0.1730233132839203, "epsilon_dpo/beta_margin_mean": 0.9402909874916077, "epsilon_dpo/beta_margin_std": 0.9170975685119629, "epsilon_dpo/loss_margin_mean": 142.02479553222656, "grad_norm": 33.875038146972656, "kl/avg_steps": 0.6875, "kl/beta": 0.006695400923490524, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.7543576401928218e-07, "logits/chosen": 0.3405795097351074, "logits/rejected": 0.5714382529258728, "logps/chosen": -229.49069213867188, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -407.580322265625, "loss": 0.8221, "rewards/accuracies": 0.890625, "rewards/chosen": -1.1853218078613281, "rewards/margins": 0.9402910470962524, "rewards/rejected": -2.125612735748291, "step": 435 }, { "epoch": 0.6402349486049926, "epsilon_dpo/beta": 0.006600471679121256, "epsilon_dpo/beta_margin_grad_mean": -0.30987975001335144, "epsilon_dpo/beta_margin_grad_std": 0.1436336487531662, "epsilon_dpo/beta_margin_mean": 0.9302107095718384, "epsilon_dpo/beta_margin_std": 0.8247028589248657, "epsilon_dpo/loss_margin_mean": 141.33209228515625, "grad_norm": 34.9669189453125, "kl/avg_steps": 0.75, "kl/beta": 0.0066496841609478, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.742118314717391e-07, "logits/chosen": 0.09514103829860687, "logits/rejected": 0.6070296168327332, "logps/chosen": -250.2595672607422, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -402.91571044921875, "loss": 0.7874, "rewards/accuracies": 0.890625, "rewards/chosen": -1.181276798248291, "rewards/margins": 0.9302107095718384, "rewards/rejected": -2.11148738861084, "step": 436 }, { "epoch": 0.6417033773861968, "epsilon_dpo/beta": 0.006555462256073952, "epsilon_dpo/beta_margin_grad_mean": -0.32474982738494873, "epsilon_dpo/beta_margin_grad_std": 0.15997685492038727, "epsilon_dpo/beta_margin_mean": 0.8452447652816772, "epsilon_dpo/beta_margin_std": 0.8253268599510193, "epsilon_dpo/loss_margin_mean": 129.53070068359375, "grad_norm": 41.57432174682617, "kl/avg_steps": 0.6875, "kl/beta": 0.006600182969123125, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.7298989601292036e-07, "logits/chosen": 0.26234185695648193, "logits/rejected": 0.6462544798851013, "logps/chosen": -242.2954864501953, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -389.12548828125, "loss": 0.8502, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1652759313583374, "rewards/margins": 0.8452447652816772, "rewards/rejected": -2.0105206966400146, "step": 437 }, { "epoch": 0.6431718061674009, "epsilon_dpo/beta": 0.006496360059827566, "epsilon_dpo/beta_margin_grad_mean": -0.2956306040287018, "epsilon_dpo/beta_margin_grad_std": 0.1379927545785904, "epsilon_dpo/beta_margin_mean": 1.0000540018081665, "epsilon_dpo/beta_margin_std": 0.804446816444397, "epsilon_dpo/loss_margin_mean": 154.13580322265625, "grad_norm": 32.169349670410156, "kl/avg_steps": 0.90625, "kl/beta": 0.006555116269737482, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.7176998984196144e-07, "logits/chosen": 0.2299380898475647, "logits/rejected": 0.6087510585784912, "logps/chosen": -233.89158630371094, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682037353516, "logps/rejected": -412.0855712890625, "loss": 0.7415, "rewards/accuracies": 0.953125, "rewards/chosen": -1.136362075805664, "rewards/margins": 1.0000540018081665, "rewards/rejected": -2.136415958404541, "step": 438 }, { "epoch": 0.644640234948605, "epsilon_dpo/beta": 0.00645222794264555, "epsilon_dpo/beta_margin_grad_mean": -0.34073659777641296, "epsilon_dpo/beta_margin_grad_std": 0.15491709113121033, "epsilon_dpo/beta_margin_mean": 0.7735282182693481, "epsilon_dpo/beta_margin_std": 0.8291470408439636, "epsilon_dpo/loss_margin_mean": 120.40505981445312, "grad_norm": 40.5095100402832, "kl/avg_steps": 0.6875, "kl/beta": 0.0064962441101670265, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.7055214510452458e-07, "logits/chosen": 0.3964412212371826, "logits/rejected": 0.430361270904541, "logps/chosen": -237.7843475341797, "logps/ref_chosen": -53.784080505371094, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -388.39080810546875, "loss": 0.8931, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1882553100585938, "rewards/margins": 0.7735282182693481, "rewards/rejected": -1.961783528327942, "step": 439 }, { "epoch": 0.6461086637298091, "epsilon_dpo/beta": 0.006414221134036779, "epsilon_dpo/beta_margin_grad_mean": -0.36195108294487, "epsilon_dpo/beta_margin_grad_std": 0.16643108427524567, "epsilon_dpo/beta_margin_mean": 0.6733620166778564, "epsilon_dpo/beta_margin_std": 0.8658587336540222, "epsilon_dpo/loss_margin_mean": 105.64379119873047, "grad_norm": 48.05801010131836, "kl/avg_steps": 0.59375, "kl/beta": 0.006451887544244528, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.6933639389195134e-07, "logits/chosen": 0.07137221843004227, "logits/rejected": 0.40858763456344604, "logps/chosen": -265.9714050292969, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -389.5462341308594, "loss": 0.9764, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2052370309829712, "rewards/margins": 0.6733620166778564, "rewards/rejected": -1.878598928451538, "step": 440 }, { "epoch": 0.6475770925110133, "epsilon_dpo/beta": 0.006367330439388752, "epsilon_dpo/beta_margin_grad_mean": -0.33204370737075806, "epsilon_dpo/beta_margin_grad_std": 0.16942457854747772, "epsilon_dpo/beta_margin_mean": 0.833185076713562, "epsilon_dpo/beta_margin_std": 0.9349945187568665, "epsilon_dpo/loss_margin_mean": 131.405517578125, "grad_norm": 43.710899353027344, "kl/avg_steps": 0.734375, "kl/beta": 0.006413805298507214, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.681227682404166e-07, "logits/chosen": 0.35447198152542114, "logits/rejected": 0.7814577221870422, "logps/chosen": -264.5113830566406, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -431.5632629394531, "loss": 0.8885, "rewards/accuracies": 0.875, "rewards/chosen": -1.2992668151855469, "rewards/margins": 0.833185076713562, "rewards/rejected": -2.1324520111083984, "step": 441 }, { "epoch": 0.6490455212922174, "epsilon_dpo/beta": 0.006313956808298826, "epsilon_dpo/beta_margin_grad_mean": -0.2994399964809418, "epsilon_dpo/beta_margin_grad_std": 0.1570899337530136, "epsilon_dpo/beta_margin_mean": 1.0064421892166138, "epsilon_dpo/beta_margin_std": 0.8888764977455139, "epsilon_dpo/loss_margin_mean": 159.73695373535156, "grad_norm": 36.07387161254883, "kl/avg_steps": 0.84375, "kl/beta": 0.006367047317326069, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.669113001300851e-07, "logits/chosen": 0.4353085160255432, "logits/rejected": 0.7528847455978394, "logps/chosen": -222.94937133789062, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -412.21435546875, "loss": 0.7663, "rewards/accuracies": 0.90625, "rewards/chosen": -1.111487865447998, "rewards/margins": 1.0064421892166138, "rewards/rejected": -2.1179301738739014, "step": 442 }, { "epoch": 0.6505139500734214, "epsilon_dpo/beta": 0.00627494091168046, "epsilon_dpo/beta_margin_grad_mean": -0.354687362909317, "epsilon_dpo/beta_margin_grad_std": 0.17831788957118988, "epsilon_dpo/beta_margin_mean": 0.7105193734169006, "epsilon_dpo/beta_margin_std": 0.9240910410881042, "epsilon_dpo/loss_margin_mean": 114.0117416381836, "grad_norm": 50.00312805175781, "kl/avg_steps": 0.625, "kl/beta": 0.006313774734735489, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6570202148426815e-07, "logits/chosen": 0.2745404839515686, "logits/rejected": 0.6156367063522339, "logps/chosen": -278.45794677734375, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -407.8765869140625, "loss": 0.9717, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3025455474853516, "rewards/margins": 0.7105194330215454, "rewards/rejected": -2.0130648612976074, "step": 443 }, { "epoch": 0.6519823788546255, "epsilon_dpo/beta": 0.0062300837598741055, "epsilon_dpo/beta_margin_grad_mean": -0.30968645215034485, "epsilon_dpo/beta_margin_grad_std": 0.16983701288700104, "epsilon_dpo/beta_margin_mean": 0.9751760959625244, "epsilon_dpo/beta_margin_std": 0.9998669028282166, "epsilon_dpo/loss_margin_mean": 157.15977478027344, "grad_norm": 41.11380386352539, "kl/avg_steps": 0.71875, "kl/beta": 0.006274559069424868, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.6449496416858282e-07, "logits/chosen": 0.35377830266952515, "logits/rejected": 0.5459920167922974, "logps/chosen": -262.89581298828125, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489044189453, "logps/rejected": -460.0967712402344, "loss": 0.8153, "rewards/accuracies": 0.875, "rewards/chosen": -1.282365322113037, "rewards/margins": 0.9751760959625244, "rewards/rejected": -2.2575414180755615, "step": 444 }, { "epoch": 0.6534508076358296, "epsilon_dpo/beta": 0.006189518608152866, "epsilon_dpo/beta_margin_grad_mean": -0.31064438819885254, "epsilon_dpo/beta_margin_grad_std": 0.1580774337053299, "epsilon_dpo/beta_margin_mean": 0.9533052444458008, "epsilon_dpo/beta_margin_std": 0.9005807042121887, "epsilon_dpo/loss_margin_mean": 154.62954711914062, "grad_norm": 45.95329666137695, "kl/avg_steps": 0.65625, "kl/beta": 0.006229782477021217, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.6329015999011182e-07, "logits/chosen": 0.3452759087085724, "logits/rejected": 0.6788507699966431, "logps/chosen": -262.902587890625, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267822265625, "logps/rejected": -442.9150390625, "loss": 0.7993, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2117838859558105, "rewards/margins": 0.9533052444458008, "rewards/rejected": -2.1650891304016113, "step": 445 }, { "epoch": 0.6549192364170338, "epsilon_dpo/beta": 0.006143361795693636, "epsilon_dpo/beta_margin_grad_mean": -0.30731505155563354, "epsilon_dpo/beta_margin_grad_std": 0.14560139179229736, "epsilon_dpo/beta_margin_mean": 0.9334458112716675, "epsilon_dpo/beta_margin_std": 0.7915002107620239, "epsilon_dpo/loss_margin_mean": 152.38681030273438, "grad_norm": 55.357357025146484, "kl/avg_steps": 0.75, "kl/beta": 0.006189166102558374, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.6208764069656578e-07, "logits/chosen": 0.3471897542476654, "logits/rejected": 0.3918067216873169, "logps/chosen": -256.1627197265625, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -450.7152404785156, "loss": 0.7814, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2111321687698364, "rewards/margins": 0.9334458112716675, "rewards/rejected": -2.144577980041504, "step": 446 }, { "epoch": 0.6563876651982379, "epsilon_dpo/beta": 0.006093789357692003, "epsilon_dpo/beta_margin_grad_mean": -0.3054165840148926, "epsilon_dpo/beta_margin_grad_std": 0.1465393453836441, "epsilon_dpo/beta_margin_mean": 0.9746509194374084, "epsilon_dpo/beta_margin_std": 0.9327902793884277, "epsilon_dpo/loss_margin_mean": 160.3368377685547, "grad_norm": 48.96641540527344, "kl/avg_steps": 0.8125, "kl/beta": 0.006143092643469572, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.608874379754465e-07, "logits/chosen": 0.4716407060623169, "logits/rejected": 0.4547463059425354, "logps/chosen": -268.1436767578125, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -471.0999450683594, "loss": 0.7784, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2925333976745605, "rewards/margins": 0.9746509790420532, "rewards/rejected": -2.267184257507324, "step": 447 }, { "epoch": 0.657856093979442, "epsilon_dpo/beta": 0.006052294280380011, "epsilon_dpo/beta_margin_grad_mean": -0.30193132162094116, "epsilon_dpo/beta_margin_grad_std": 0.15906290709972382, "epsilon_dpo/beta_margin_mean": 1.009904146194458, "epsilon_dpo/beta_margin_std": 0.9370668530464172, "epsilon_dpo/loss_margin_mean": 167.4809112548828, "grad_norm": 49.381710052490234, "kl/avg_steps": 0.6875, "kl/beta": 0.006093582604080439, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.5968958345321177e-07, "logits/chosen": 0.47561150789260864, "logits/rejected": 0.6322281360626221, "logps/chosen": -298.080322265625, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -507.822021484375, "loss": 0.7752, "rewards/accuracies": 0.875, "rewards/chosen": -1.4422485828399658, "rewards/margins": 1.009904146194458, "rewards/rejected": -2.452152729034424, "step": 448 }, { "epoch": 0.6593245227606461, "epsilon_dpo/beta": 0.006014752201735973, "epsilon_dpo/beta_margin_grad_mean": -0.3279159367084503, "epsilon_dpo/beta_margin_grad_std": 0.18189726769924164, "epsilon_dpo/beta_margin_mean": 0.888742983341217, "epsilon_dpo/beta_margin_std": 1.0735763311386108, "epsilon_dpo/loss_margin_mean": 148.5830078125, "grad_norm": 60.586238861083984, "kl/avg_steps": 0.625, "kl/beta": 0.006051975302398205, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.584941086944423e-07, "logits/chosen": 0.2962471544742584, "logits/rejected": 0.6988736391067505, "logps/chosen": -308.541259765625, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -478.1945495605469, "loss": 0.8983, "rewards/accuracies": 0.84375, "rewards/chosen": -1.452686071395874, "rewards/margins": 0.8887430429458618, "rewards/rejected": -2.3414292335510254, "step": 449 }, { "epoch": 0.6607929515418502, "epsilon_dpo/beta": 0.005967994686216116, "epsilon_dpo/beta_margin_grad_mean": -0.29214900732040405, "epsilon_dpo/beta_margin_grad_std": 0.1508917659521103, "epsilon_dpo/beta_margin_mean": 1.0216530561447144, "epsilon_dpo/beta_margin_std": 0.8822335600852966, "epsilon_dpo/loss_margin_mean": 171.69039916992188, "grad_norm": 50.2854118347168, "kl/avg_steps": 0.78125, "kl/beta": 0.006014385260641575, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.573010452010098e-07, "logits/chosen": 0.4698118567466736, "logits/rejected": 0.6411651372909546, "logps/chosen": -270.3196105957031, "logps/ref_chosen": -57.108116149902344, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -487.6568603515625, "loss": 0.7534, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2739126682281494, "rewards/margins": 1.0216530561447144, "rewards/rejected": -2.295565605163574, "step": 450 }, { "epoch": 0.6622613803230544, "epsilon_dpo/beta": 0.005929191131144762, "epsilon_dpo/beta_margin_grad_mean": -0.3303794860839844, "epsilon_dpo/beta_margin_grad_std": 0.1712116152048111, "epsilon_dpo/beta_margin_mean": 0.8015156388282776, "epsilon_dpo/beta_margin_std": 0.9112508893013, "epsilon_dpo/loss_margin_mean": 135.93467712402344, "grad_norm": 50.47534942626953, "kl/avg_steps": 0.65625, "kl/beta": 0.005967761855572462, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.5611042441124687e-07, "logits/chosen": 0.7546664476394653, "logits/rejected": 1.2638273239135742, "logps/chosen": -320.57232666015625, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -470.96759033203125, "loss": 0.9089, "rewards/accuracies": 0.859375, "rewards/chosen": -1.556694507598877, "rewards/margins": 0.8015156388282776, "rewards/rejected": -2.3582100868225098, "step": 451 }, { "epoch": 0.6637298091042585, "epsilon_dpo/beta": 0.00588312279433012, "epsilon_dpo/beta_margin_grad_mean": -0.32057738304138184, "epsilon_dpo/beta_margin_grad_std": 0.14430400729179382, "epsilon_dpo/beta_margin_mean": 0.8708633780479431, "epsilon_dpo/beta_margin_std": 0.8176453113555908, "epsilon_dpo/loss_margin_mean": 148.41787719726562, "grad_norm": 34.57151794433594, "kl/avg_steps": 0.78125, "kl/beta": 0.005928853992372751, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.549222776991186e-07, "logits/chosen": 0.7242465019226074, "logits/rejected": 0.6068210601806641, "logps/chosen": -266.37939453125, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77143096923828, "logps/rejected": -462.17816162109375, "loss": 0.8214, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2716758251190186, "rewards/margins": 0.8708633780479431, "rewards/rejected": -2.1425392627716064, "step": 452 }, { "epoch": 0.6651982378854625, "epsilon_dpo/beta": 0.0058448719792068005, "epsilon_dpo/beta_margin_grad_mean": -0.3328855633735657, "epsilon_dpo/beta_margin_grad_std": 0.15490888059139252, "epsilon_dpo/beta_margin_mean": 0.8247835636138916, "epsilon_dpo/beta_margin_std": 0.8746103644371033, "epsilon_dpo/loss_margin_mean": 141.74331665039062, "grad_norm": 33.32931137084961, "kl/avg_steps": 0.65625, "kl/beta": 0.005882893688976765, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.5373663637339584e-07, "logits/chosen": 0.35818058252334595, "logits/rejected": 0.8482306003570557, "logps/chosen": -291.4721984863281, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -457.70806884765625, "loss": 0.8671, "rewards/accuracies": 0.859375, "rewards/chosen": -1.368526816368103, "rewards/margins": 0.8247835636138916, "rewards/rejected": -2.193310260772705, "step": 453 }, { "epoch": 0.6666666666666666, "epsilon_dpo/beta": 0.0058012851513922215, "epsilon_dpo/beta_margin_grad_mean": -0.3133637309074402, "epsilon_dpo/beta_margin_grad_std": 0.1705373227596283, "epsilon_dpo/beta_margin_mean": 0.9566707611083984, "epsilon_dpo/beta_margin_std": 1.0051079988479614, "epsilon_dpo/loss_margin_mean": 165.53953552246094, "grad_norm": 43.49067306518555, "kl/avg_steps": 0.75, "kl/beta": 0.005844539031386375, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.5255353167683017e-07, "logits/chosen": 0.5292907357215881, "logits/rejected": 1.0574098825454712, "logps/chosen": -316.734375, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.9507827758789, "logps/rejected": -506.279052734375, "loss": 0.8274, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4860866069793701, "rewards/margins": 0.9566707611083984, "rewards/rejected": -2.4427573680877686, "step": 454 }, { "epoch": 0.6681350954478708, "epsilon_dpo/beta": 0.005758099257946014, "epsilon_dpo/beta_margin_grad_mean": -0.2917003035545349, "epsilon_dpo/beta_margin_grad_std": 0.16674098372459412, "epsilon_dpo/beta_margin_mean": 1.0676908493041992, "epsilon_dpo/beta_margin_std": 0.952242374420166, "epsilon_dpo/loss_margin_mean": 186.02780151367188, "grad_norm": 34.411067962646484, "kl/avg_steps": 0.75, "kl/beta": 0.005801031365990639, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.5137299478533064e-07, "logits/chosen": 0.43719807267189026, "logits/rejected": 0.41231751441955566, "logps/chosen": -267.48406982421875, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -523.9266357421875, "loss": 0.7524, "rewards/accuracies": 0.859375, "rewards/chosen": -1.282523512840271, "rewards/margins": 1.0676908493041992, "rewards/rejected": -2.3502144813537598, "step": 455 }, { "epoch": 0.6696035242290749, "epsilon_dpo/beta": 0.0057062371633946896, "epsilon_dpo/beta_margin_grad_mean": -0.2859190106391907, "epsilon_dpo/beta_margin_grad_std": 0.13566403090953827, "epsilon_dpo/beta_margin_mean": 1.066293716430664, "epsilon_dpo/beta_margin_std": 0.8393720984458923, "epsilon_dpo/loss_margin_mean": 187.0402374267578, "grad_norm": 32.09308624267578, "kl/avg_steps": 0.90625, "kl/beta": 0.005757847335189581, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.5019505680714232e-07, "logits/chosen": 0.39520812034606934, "logits/rejected": 0.4162057042121887, "logps/chosen": -275.6275939941406, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21783447265625, "logps/rejected": -510.848876953125, "loss": 0.71, "rewards/accuracies": 0.984375, "rewards/chosen": -1.247272253036499, "rewards/margins": 1.066293716430664, "rewards/rejected": -2.313566207885742, "step": 456 }, { "epoch": 0.671071953010279, "epsilon_dpo/beta": 0.0056639062240719795, "epsilon_dpo/beta_margin_grad_mean": -0.312649130821228, "epsilon_dpo/beta_margin_grad_std": 0.14824675023555756, "epsilon_dpo/beta_margin_mean": 0.8866687417030334, "epsilon_dpo/beta_margin_std": 0.7621463537216187, "epsilon_dpo/loss_margin_mean": 157.08285522460938, "grad_norm": 44.855472564697266, "kl/avg_steps": 0.75, "kl/beta": 0.0057061356492340565, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.4901974878202627e-07, "logits/chosen": 0.3958442211151123, "logits/rejected": 0.794312596321106, "logps/chosen": -268.1856689453125, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -456.13555908203125, "loss": 0.8047, "rewards/accuracies": 0.875, "rewards/chosen": -1.2122774124145508, "rewards/margins": 0.8866687417030334, "rewards/rejected": -2.0989460945129395, "step": 457 }, { "epoch": 0.6725403817914831, "epsilon_dpo/beta": 0.005625282879918814, "epsilon_dpo/beta_margin_grad_mean": -0.29639893770217896, "epsilon_dpo/beta_margin_grad_std": 0.15821236371994019, "epsilon_dpo/beta_margin_mean": 1.0232208967208862, "epsilon_dpo/beta_margin_std": 0.9355968832969666, "epsilon_dpo/loss_margin_mean": 182.63494873046875, "grad_norm": 38.89377212524414, "kl/avg_steps": 0.6875, "kl/beta": 0.0056636580266058445, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.4784710168044212e-07, "logits/chosen": 0.4888191819190979, "logits/rejected": 0.6527252793312073, "logps/chosen": -278.69854736328125, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -503.6078796386719, "loss": 0.7622, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2583105564117432, "rewards/margins": 1.0232208967208862, "rewards/rejected": -2.28153133392334, "step": 458 }, { "epoch": 0.6740088105726872, "epsilon_dpo/beta": 0.005585114937275648, "epsilon_dpo/beta_margin_grad_mean": -0.3197575509548187, "epsilon_dpo/beta_margin_grad_std": 0.15985752642154694, "epsilon_dpo/beta_margin_mean": 0.8780723810195923, "epsilon_dpo/beta_margin_std": 0.8582127690315247, "epsilon_dpo/loss_margin_mean": 157.82933044433594, "grad_norm": 44.090911865234375, "kl/avg_steps": 0.71875, "kl/beta": 0.005624986253678799, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.466771464027316e-07, "logits/chosen": 0.7341614961624146, "logits/rejected": 0.9279061555862427, "logps/chosen": -285.59722900390625, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -483.0376281738281, "loss": 0.8372, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3366007804870605, "rewards/margins": 0.8780723810195923, "rewards/rejected": -2.2146730422973633, "step": 459 }, { "epoch": 0.6754772393538914, "epsilon_dpo/beta": 0.005545258987694979, "epsilon_dpo/beta_margin_grad_mean": -0.30482274293899536, "epsilon_dpo/beta_margin_grad_std": 0.1670825034379959, "epsilon_dpo/beta_margin_mean": 0.9677952527999878, "epsilon_dpo/beta_margin_std": 0.9130998253822327, "epsilon_dpo/loss_margin_mean": 175.251220703125, "grad_norm": 38.01408386230469, "kl/avg_steps": 0.71875, "kl/beta": 0.005584845319390297, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4550991377830423e-07, "logits/chosen": 0.7233595848083496, "logits/rejected": 0.6271342039108276, "logps/chosen": -304.29058837890625, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -532.0263061523438, "loss": 0.8001, "rewards/accuracies": 0.875, "rewards/chosen": -1.403839349746704, "rewards/margins": 0.9677952527999878, "rewards/rejected": -2.3716347217559814, "step": 460 }, { "epoch": 0.6769456681350955, "epsilon_dpo/beta": 0.005510885734111071, "epsilon_dpo/beta_margin_grad_mean": -0.3535759449005127, "epsilon_dpo/beta_margin_grad_std": 0.16537989675998688, "epsilon_dpo/beta_margin_mean": 0.7148459553718567, "epsilon_dpo/beta_margin_std": 0.8677749037742615, "epsilon_dpo/loss_margin_mean": 130.45925903320312, "grad_norm": 38.21791076660156, "kl/avg_steps": 0.625, "kl/beta": 0.0055449907667934895, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.4434543456482518e-07, "logits/chosen": 0.7113257646560669, "logits/rejected": 0.8079010248184204, "logps/chosen": -325.3001708984375, "logps/ref_chosen": -55.18195343017578, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -487.05438232421875, "loss": 0.9471, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4901607036590576, "rewards/margins": 0.7148459553718567, "rewards/rejected": -2.2050065994262695, "step": 461 }, { "epoch": 0.6784140969162996, "epsilon_dpo/beta": 0.00547578651458025, "epsilon_dpo/beta_margin_grad_mean": -0.35789817571640015, "epsilon_dpo/beta_margin_grad_std": 0.14245951175689697, "epsilon_dpo/beta_margin_mean": 0.6690189242362976, "epsilon_dpo/beta_margin_std": 0.7433232069015503, "epsilon_dpo/loss_margin_mean": 122.77618408203125, "grad_norm": 42.94804763793945, "kl/avg_steps": 0.640625, "kl/beta": 0.0055105495266616344, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.4318373944740484e-07, "logits/chosen": 0.5266987681388855, "logits/rejected": 0.8050484657287598, "logps/chosen": -327.9981689453125, "logps/ref_chosen": -69.92803955078125, "logps/ref_rejected": -78.84111785888672, "logps/rejected": -459.68743896484375, "loss": 0.9378, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4143660068511963, "rewards/margins": 0.6690188646316528, "rewards/rejected": -2.0833849906921387, "step": 462 }, { "epoch": 0.6798825256975036, "epsilon_dpo/beta": 0.00544350640848279, "epsilon_dpo/beta_margin_grad_mean": -0.33866074681282043, "epsilon_dpo/beta_margin_grad_std": 0.17297625541687012, "epsilon_dpo/beta_margin_mean": 0.788625955581665, "epsilon_dpo/beta_margin_std": 0.8829845786094666, "epsilon_dpo/loss_margin_mean": 145.75604248046875, "grad_norm": 50.6060905456543, "kl/avg_steps": 0.59375, "kl/beta": 0.005475472658872604, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.4202485903778976e-07, "logits/chosen": 0.5106614828109741, "logits/rejected": 0.7114510536193848, "logps/chosen": -312.5292663574219, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -492.0359191894531, "loss": 0.9069, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4034281969070435, "rewards/margins": 0.788625955581665, "rewards/rejected": -2.192054271697998, "step": 463 }, { "epoch": 0.6813509544787077, "epsilon_dpo/beta": 0.00540287047624588, "epsilon_dpo/beta_margin_grad_mean": -0.2959914207458496, "epsilon_dpo/beta_margin_grad_std": 0.17414125800132751, "epsilon_dpo/beta_margin_mean": 1.0425310134887695, "epsilon_dpo/beta_margin_std": 0.9792246222496033, "epsilon_dpo/loss_margin_mean": 193.62644958496094, "grad_norm": 41.503604888916016, "kl/avg_steps": 0.75, "kl/beta": 0.005443153902888298, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.4086882387355658e-07, "logits/chosen": 0.6274369955062866, "logits/rejected": 0.5430445671081543, "logps/chosen": -302.8582763671875, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -548.061767578125, "loss": 0.7761, "rewards/accuracies": 0.875, "rewards/chosen": -1.3622095584869385, "rewards/margins": 1.0425310134887695, "rewards/rejected": -2.404740571975708, "step": 464 }, { "epoch": 0.6828193832599119, "epsilon_dpo/beta": 0.005362650379538536, "epsilon_dpo/beta_margin_grad_mean": -0.29354214668273926, "epsilon_dpo/beta_margin_grad_std": 0.1528787612915039, "epsilon_dpo/beta_margin_mean": 1.0141922235488892, "epsilon_dpo/beta_margin_std": 0.838722825050354, "epsilon_dpo/loss_margin_mean": 189.72882080078125, "grad_norm": 51.4481201171875, "kl/avg_steps": 0.75, "kl/beta": 0.005402633920311928, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.3971566441730714e-07, "logits/chosen": 0.38318532705307007, "logits/rejected": 0.6318896412849426, "logps/chosen": -298.53082275390625, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -542.0888061523438, "loss": 0.7489, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2803826332092285, "rewards/margins": 1.0141922235488892, "rewards/rejected": -2.294574737548828, "step": 465 }, { "epoch": 0.684287812041116, "epsilon_dpo/beta": 0.005329433362931013, "epsilon_dpo/beta_margin_grad_mean": -0.324601948261261, "epsilon_dpo/beta_margin_grad_std": 0.1643323004245758, "epsilon_dpo/beta_margin_mean": 0.8457480072975159, "epsilon_dpo/beta_margin_std": 0.8515949249267578, "epsilon_dpo/loss_margin_mean": 159.5610809326172, "grad_norm": 54.184085845947266, "kl/avg_steps": 0.625, "kl/beta": 0.005362415686249733, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.3856541105586545e-07, "logits/chosen": 0.6274415254592896, "logits/rejected": 0.779162585735321, "logps/chosen": -308.79168701171875, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -505.747314453125, "loss": 0.8586, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3661298751831055, "rewards/margins": 0.8457480669021606, "rewards/rejected": -2.2118778228759766, "step": 466 }, { "epoch": 0.6857562408223201, "epsilon_dpo/beta": 0.005297997035086155, "epsilon_dpo/beta_margin_grad_mean": -0.32604339718818665, "epsilon_dpo/beta_margin_grad_std": 0.18723338842391968, "epsilon_dpo/beta_margin_mean": 0.8774838447570801, "epsilon_dpo/beta_margin_std": 0.9864553809165955, "epsilon_dpo/loss_margin_mean": 166.6938934326172, "grad_norm": 60.39503479003906, "kl/avg_steps": 0.59375, "kl/beta": 0.00532910879701376, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3741809409947729e-07, "logits/chosen": 0.34069177508354187, "logits/rejected": 0.7749538421630859, "logps/chosen": -374.0175476074219, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -564.8558349609375, "loss": 0.8859, "rewards/accuracies": 0.796875, "rewards/chosen": -1.5679855346679688, "rewards/margins": 0.8774838447570801, "rewards/rejected": -2.445469379425049, "step": 467 }, { "epoch": 0.6872246696035242, "epsilon_dpo/beta": 0.005258447490632534, "epsilon_dpo/beta_margin_grad_mean": -0.3130618631839752, "epsilon_dpo/beta_margin_grad_std": 0.16425372660160065, "epsilon_dpo/beta_margin_mean": 0.9173241853713989, "epsilon_dpo/beta_margin_std": 0.8768501281738281, "epsilon_dpo/loss_margin_mean": 175.10275268554688, "grad_norm": 44.3262939453125, "kl/avg_steps": 0.75, "kl/beta": 0.0052976543083786964, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.362737437810114e-07, "logits/chosen": 0.3684792220592499, "logits/rejected": 0.5974057912826538, "logps/chosen": -310.4765625, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02881622314453, "logps/rejected": -516.6727905273438, "loss": 0.8196, "rewards/accuracies": 0.875, "rewards/chosen": -1.2665220499038696, "rewards/margins": 0.9173241853713989, "rewards/rejected": -2.1838462352752686, "step": 468 }, { "epoch": 0.6886930983847284, "epsilon_dpo/beta": 0.005218472797423601, "epsilon_dpo/beta_margin_grad_mean": -0.30890974402427673, "epsilon_dpo/beta_margin_grad_std": 0.15223819017410278, "epsilon_dpo/beta_margin_mean": 0.9334389567375183, "epsilon_dpo/beta_margin_std": 0.8322674632072449, "epsilon_dpo/loss_margin_mean": 179.397216796875, "grad_norm": 38.74735641479492, "kl/avg_steps": 0.765625, "kl/beta": 0.005258217453956604, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.351323902551631e-07, "logits/chosen": 0.49617844820022583, "logits/rejected": 0.7309253811836243, "logps/chosen": -338.1322021484375, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -554.191162109375, "loss": 0.7934, "rewards/accuracies": 0.875, "rewards/chosen": -1.4098811149597168, "rewards/margins": 0.9334390163421631, "rewards/rejected": -2.343319892883301, "step": 469 }, { "epoch": 0.6901615271659325, "epsilon_dpo/beta": 0.0051747532561421394, "epsilon_dpo/beta_margin_grad_mean": -0.29332658648490906, "epsilon_dpo/beta_margin_grad_std": 0.14796683192253113, "epsilon_dpo/beta_margin_mean": 1.0051977634429932, "epsilon_dpo/beta_margin_std": 0.8227752447128296, "epsilon_dpo/loss_margin_mean": 194.66285705566406, "grad_norm": 48.59160232543945, "kl/avg_steps": 0.84375, "kl/beta": 0.005218265112489462, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.339940635976592e-07, "logits/chosen": 0.5892876982688904, "logits/rejected": 0.8713769316673279, "logps/chosen": -263.25250244140625, "logps/ref_chosen": -43.79193115234375, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -496.8262939453125, "loss": 0.7491, "rewards/accuracies": 0.921875, "rewards/chosen": -1.136615514755249, "rewards/margins": 1.0051977634429932, "rewards/rejected": -2.141813278198242, "step": 470 }, { "epoch": 0.6916299559471366, "epsilon_dpo/beta": 0.005142777226865292, "epsilon_dpo/beta_margin_grad_mean": -0.3280228078365326, "epsilon_dpo/beta_margin_grad_std": 0.14275549352169037, "epsilon_dpo/beta_margin_mean": 0.7997382283210754, "epsilon_dpo/beta_margin_std": 0.7150464057922363, "epsilon_dpo/loss_margin_mean": 156.2308349609375, "grad_norm": 35.29491424560547, "kl/avg_steps": 0.625, "kl/beta": 0.005174604244530201, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.3285879380446563e-07, "logits/chosen": 0.4337306022644043, "logits/rejected": 0.9008173942565918, "logps/chosen": -309.1363525390625, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -485.63812255859375, "loss": 0.8468, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2650556564331055, "rewards/margins": 0.7997382879257202, "rewards/rejected": -2.064793825149536, "step": 471 }, { "epoch": 0.6930983847283406, "epsilon_dpo/beta": 0.005106013268232346, "epsilon_dpo/beta_margin_grad_mean": -0.3135577142238617, "epsilon_dpo/beta_margin_grad_std": 0.15046267211437225, "epsilon_dpo/beta_margin_mean": 0.9393625259399414, "epsilon_dpo/beta_margin_std": 0.907642126083374, "epsilon_dpo/loss_margin_mean": 184.59075927734375, "grad_norm": 32.15871047973633, "kl/avg_steps": 0.71875, "kl/beta": 0.0051424638368189335, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.0728834792971611, "logits/rejected": 0.19875231385231018, "logps/chosen": -318.1406555175781, "logps/ref_chosen": -83.66609954833984, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -536.2745361328125, "loss": 0.8026, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1983230113983154, "rewards/margins": 0.9393625259399414, "rewards/rejected": -2.137685537338257, "step": 472 }, { "epoch": 0.6945668135095447, "epsilon_dpo/beta": 0.005079149734228849, "epsilon_dpo/beta_margin_grad_mean": -0.3632872700691223, "epsilon_dpo/beta_margin_grad_std": 0.1751829981803894, "epsilon_dpo/beta_margin_mean": 0.6389003396034241, "epsilon_dpo/beta_margin_std": 0.8650482892990112, "epsilon_dpo/loss_margin_mean": 126.8731918334961, "grad_norm": 61.05051803588867, "kl/avg_steps": 0.53125, "kl/beta": 0.0051057664677500725, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.3059754439133002e-07, "logits/chosen": 0.35071080923080444, "logits/rejected": 0.8694913387298584, "logps/chosen": -335.890625, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -480.41339111328125, "loss": 1.0059, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3861662149429321, "rewards/margins": 0.6389002799987793, "rewards/rejected": -2.025066614151001, "step": 473 }, { "epoch": 0.6960352422907489, "epsilon_dpo/beta": 0.005053896456956863, "epsilon_dpo/beta_margin_grad_mean": -0.3590001165866852, "epsilon_dpo/beta_margin_grad_std": 0.16798219084739685, "epsilon_dpo/beta_margin_mean": 0.670434296131134, "epsilon_dpo/beta_margin_std": 0.8294155597686768, "epsilon_dpo/loss_margin_mean": 133.68316650390625, "grad_norm": 47.492401123046875, "kl/avg_steps": 0.5, "kl/beta": 0.0050787851214408875, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2947162435741277e-07, "logits/chosen": 0.5560064315795898, "logits/rejected": 0.6434218287467957, "logps/chosen": -292.8822021484375, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -464.0338439941406, "loss": 0.9708, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2170591354370117, "rewards/margins": 0.6704343557357788, "rewards/rejected": -1.8874934911727905, "step": 474 }, { "epoch": 0.697503671071953, "epsilon_dpo/beta": 0.005014538764953613, "epsilon_dpo/beta_margin_grad_mean": -0.3117504417896271, "epsilon_dpo/beta_margin_grad_std": 0.15037527680397034, "epsilon_dpo/beta_margin_mean": 0.9207746982574463, "epsilon_dpo/beta_margin_std": 0.8331746459007263, "epsilon_dpo/loss_margin_mean": 184.13772583007812, "grad_norm": 61.19434356689453, "kl/avg_steps": 0.78125, "kl/beta": 0.005053517874330282, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.2834888035828596e-07, "logits/chosen": 0.6205512881278992, "logits/rejected": 0.6674166321754456, "logps/chosen": -243.25057983398438, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06295013427734, "logps/rejected": -474.9560546875, "loss": 0.7992, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0072057247161865, "rewards/margins": 0.9207746982574463, "rewards/rejected": -1.9279804229736328, "step": 475 }, { "epoch": 0.6989720998531571, "epsilon_dpo/beta": 0.004972531925886869, "epsilon_dpo/beta_margin_grad_mean": -0.33235964179039, "epsilon_dpo/beta_margin_grad_std": 0.1353040486574173, "epsilon_dpo/beta_margin_mean": 0.7847320437431335, "epsilon_dpo/beta_margin_std": 0.702809751033783, "epsilon_dpo/loss_margin_mean": 158.13043212890625, "grad_norm": 40.11614990234375, "kl/avg_steps": 0.84375, "kl/beta": 0.005014343187212944, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.2722934197929802e-07, "logits/chosen": 0.5991215705871582, "logits/rejected": 0.9088248610496521, "logps/chosen": -258.5040283203125, "logps/ref_chosen": -42.949378967285156, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -447.39532470703125, "loss": 0.8502, "rewards/accuracies": 0.9375, "rewards/chosen": -1.072633981704712, "rewards/margins": 0.7847321033477783, "rewards/rejected": -1.8573660850524902, "step": 476 }, { "epoch": 0.7004405286343612, "epsilon_dpo/beta": 0.0049379123374819756, "epsilon_dpo/beta_margin_grad_mean": -0.334118515253067, "epsilon_dpo/beta_margin_grad_std": 0.1581631898880005, "epsilon_dpo/beta_margin_mean": 0.7720637321472168, "epsilon_dpo/beta_margin_std": 0.7845483422279358, "epsilon_dpo/loss_margin_mean": 157.0982208251953, "grad_norm": 36.1607551574707, "kl/avg_steps": 0.703125, "kl/beta": 0.004972388502210379, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.2611303872132631e-07, "logits/chosen": 0.23146602511405945, "logits/rejected": 0.8222414255142212, "logps/chosen": -303.25787353515625, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -465.7208557128906, "loss": 0.8885, "rewards/accuracies": 0.890625, "rewards/chosen": -1.149595022201538, "rewards/margins": 0.7720637321472168, "rewards/rejected": -1.9216588735580444, "step": 477 }, { "epoch": 0.7019089574155654, "epsilon_dpo/beta": 0.004896498750895262, "epsilon_dpo/beta_margin_grad_mean": -0.3210771977901459, "epsilon_dpo/beta_margin_grad_std": 0.1335819661617279, "epsilon_dpo/beta_margin_mean": 0.8631773591041565, "epsilon_dpo/beta_margin_std": 0.7796393036842346, "epsilon_dpo/loss_margin_mean": 176.6071014404297, "grad_norm": 36.4519157409668, "kl/avg_steps": 0.84375, "kl/beta": 0.004937670659273863, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 0.5271429419517517, "logits/rejected": 0.6856623888015747, "logps/chosen": -236.5076904296875, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -457.0362548828125, "loss": 0.8134, "rewards/accuracies": 0.90625, "rewards/chosen": -0.956280529499054, "rewards/margins": 0.8631774187088013, "rewards/rejected": -1.8194578886032104, "step": 478 }, { "epoch": 0.7033773861967695, "epsilon_dpo/beta": 0.004861651454120874, "epsilon_dpo/beta_margin_grad_mean": -0.3410286605358124, "epsilon_dpo/beta_margin_grad_std": 0.14748983085155487, "epsilon_dpo/beta_margin_mean": 0.7377583980560303, "epsilon_dpo/beta_margin_std": 0.7295092344284058, "epsilon_dpo/loss_margin_mean": 152.3760223388672, "grad_norm": 50.982177734375, "kl/avg_steps": 0.71875, "kl/beta": 0.0048963576555252075, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.2389025514492456e-07, "logits/chosen": 0.5171575546264648, "logits/rejected": 0.5576011538505554, "logps/chosen": -297.18328857421875, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -490.7677001953125, "loss": 0.8927, "rewards/accuracies": 0.875, "rewards/chosen": -1.1844937801361084, "rewards/margins": 0.7377583980560303, "rewards/rejected": -1.9222524166107178, "step": 479 }, { "epoch": 0.7048458149779736, "epsilon_dpo/beta": 0.004836073610931635, "epsilon_dpo/beta_margin_grad_mean": -0.36628028750419617, "epsilon_dpo/beta_margin_grad_std": 0.17329415678977966, "epsilon_dpo/beta_margin_mean": 0.6332051753997803, "epsilon_dpo/beta_margin_std": 0.8665231466293335, "epsilon_dpo/loss_margin_mean": 132.01724243164062, "grad_norm": 57.463008880615234, "kl/avg_steps": 0.53125, "kl/beta": 0.004861416295170784, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.227838333989088e-07, "logits/chosen": 0.33993053436279297, "logits/rejected": 0.8602235913276672, "logps/chosen": -318.82342529296875, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -475.0904541015625, "loss": 1.0101, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2613255977630615, "rewards/margins": 0.633205235004425, "rewards/rejected": -1.8945307731628418, "step": 480 }, { "epoch": 0.7063142437591777, "epsilon_dpo/beta": 0.004801449831575155, "epsilon_dpo/beta_margin_grad_mean": -0.33503931760787964, "epsilon_dpo/beta_margin_grad_std": 0.15022431313991547, "epsilon_dpo/beta_margin_mean": 0.7770059108734131, "epsilon_dpo/beta_margin_std": 0.7589020729064941, "epsilon_dpo/loss_margin_mean": 162.50860595703125, "grad_norm": 33.654972076416016, "kl/avg_steps": 0.71875, "kl/beta": 0.004835726227611303, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.2168076391719489e-07, "logits/chosen": 0.38666173815727234, "logits/rejected": 0.6381037831306458, "logps/chosen": -286.5435791015625, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -486.5083312988281, "loss": 0.8738, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1127684116363525, "rewards/margins": 0.7770059108734131, "rewards/rejected": -1.8897743225097656, "step": 481 }, { "epoch": 0.7077826725403817, "epsilon_dpo/beta": 0.004770186729729176, "epsilon_dpo/beta_margin_grad_mean": -0.3630225360393524, "epsilon_dpo/beta_margin_grad_std": 0.16115719079971313, "epsilon_dpo/beta_margin_mean": 0.6352172493934631, "epsilon_dpo/beta_margin_std": 0.8041839599609375, "epsilon_dpo/loss_margin_mean": 133.9978485107422, "grad_norm": 37.491363525390625, "kl/avg_steps": 0.65625, "kl/beta": 0.004801217466592789, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.2058107576668938e-07, "logits/chosen": 0.3698510527610779, "logits/rejected": 0.6439001560211182, "logps/chosen": -318.55609130859375, "logps/ref_chosen": -67.55347442626953, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -472.59002685546875, "loss": 0.9868, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1996984481811523, "rewards/margins": 0.6352172493934631, "rewards/rejected": -1.8349155187606812, "step": 482 }, { "epoch": 0.7092511013215859, "epsilon_dpo/beta": 0.004736104980111122, "epsilon_dpo/beta_margin_grad_mean": -0.31758615374565125, "epsilon_dpo/beta_margin_grad_std": 0.1468149721622467, "epsilon_dpo/beta_margin_mean": 0.8923901319503784, "epsilon_dpo/beta_margin_std": 0.8510040640830994, "epsilon_dpo/loss_margin_mean": 189.09866333007812, "grad_norm": 32.81733322143555, "kl/avg_steps": 0.71875, "kl/beta": 0.004769915249198675, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.194847979251979e-07, "logits/chosen": 0.19232258200645447, "logits/rejected": 0.6765154600143433, "logps/chosen": -306.38372802734375, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -527.9395751953125, "loss": 0.8153, "rewards/accuracies": 0.859375, "rewards/chosen": -1.15290105342865, "rewards/margins": 0.8923901319503784, "rewards/rejected": -2.0452911853790283, "step": 483 }, { "epoch": 0.71071953010279, "epsilon_dpo/beta": 0.004699346609413624, "epsilon_dpo/beta_margin_grad_mean": -0.3339075446128845, "epsilon_dpo/beta_margin_grad_std": 0.13252882659435272, "epsilon_dpo/beta_margin_mean": 0.7821731567382812, "epsilon_dpo/beta_margin_std": 0.7183220386505127, "epsilon_dpo/loss_margin_mean": 166.8756866455078, "grad_norm": 38.810420989990234, "kl/avg_steps": 0.78125, "kl/beta": 0.004735875874757767, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1839195928066101e-07, "logits/chosen": 0.4545806348323822, "logits/rejected": 0.8561559319496155, "logps/chosen": -287.86785888671875, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -479.97686767578125, "loss": 0.8527, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0752531290054321, "rewards/margins": 0.7821731567382812, "rewards/rejected": -1.857426404953003, "step": 484 }, { "epoch": 0.7121879588839941, "epsilon_dpo/beta": 0.004664386156946421, "epsilon_dpo/beta_margin_grad_mean": -0.3178929388523102, "epsilon_dpo/beta_margin_grad_std": 0.1579822152853012, "epsilon_dpo/beta_margin_mean": 0.8828170299530029, "epsilon_dpo/beta_margin_std": 0.8285797238349915, "epsilon_dpo/loss_margin_mean": 189.91624450683594, "grad_norm": 33.34450149536133, "kl/avg_steps": 0.75, "kl/beta": 0.004699163604527712, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.1730258863039347e-07, "logits/chosen": 0.3416202664375305, "logits/rejected": 0.5913619995117188, "logps/chosen": -262.7619934082031, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408996582031, "logps/rejected": -497.1927490234375, "loss": 0.8269, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9522556066513062, "rewards/margins": 0.8828170299530029, "rewards/rejected": -1.8350727558135986, "step": 485 }, { "epoch": 0.7136563876651982, "epsilon_dpo/beta": 0.004635494668036699, "epsilon_dpo/beta_margin_grad_mean": -0.33765965700149536, "epsilon_dpo/beta_margin_grad_std": 0.1793326586484909, "epsilon_dpo/beta_margin_mean": 0.78525710105896, "epsilon_dpo/beta_margin_std": 0.9002077579498291, "epsilon_dpo/loss_margin_mean": 170.508544921875, "grad_norm": 42.30367660522461, "kl/avg_steps": 0.625, "kl/beta": 0.00466418219730258, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1621671468032493e-07, "logits/chosen": 0.42141079902648926, "logits/rejected": 0.6843166351318359, "logps/chosen": -303.60137939453125, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -510.9896240234375, "loss": 0.9184, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1533944606781006, "rewards/margins": 0.78525710105896, "rewards/rejected": -1.9386515617370605, "step": 486 }, { "epoch": 0.7151248164464024, "epsilon_dpo/beta": 0.004598011262714863, "epsilon_dpo/beta_margin_grad_mean": -0.33197563886642456, "epsilon_dpo/beta_margin_grad_std": 0.1348419189453125, "epsilon_dpo/beta_margin_mean": 0.7602766156196594, "epsilon_dpo/beta_margin_std": 0.6450428366661072, "epsilon_dpo/loss_margin_mean": 165.81109619140625, "grad_norm": 39.9567756652832, "kl/avg_steps": 0.8125, "kl/beta": 0.004635212477296591, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.1513436604424378e-07, "logits/chosen": 0.5476217269897461, "logits/rejected": 0.7340171337127686, "logps/chosen": -302.55706787109375, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.4188232421875, "logps/rejected": -507.72369384765625, "loss": 0.856, "rewards/accuracies": 0.890625, "rewards/chosen": -1.1482787132263184, "rewards/margins": 0.7602766156196594, "rewards/rejected": -1.908555269241333, "step": 487 }, { "epoch": 0.7165932452276065, "epsilon_dpo/beta": 0.00456239003688097, "epsilon_dpo/beta_margin_grad_mean": -0.33869460225105286, "epsilon_dpo/beta_margin_grad_std": 0.12046254426240921, "epsilon_dpo/beta_margin_mean": 0.7283648252487183, "epsilon_dpo/beta_margin_std": 0.597629964351654, "epsilon_dpo/loss_margin_mean": 160.0612335205078, "grad_norm": 36.45921325683594, "kl/avg_steps": 0.78125, "kl/beta": 0.004597854800522327, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1405557124304335e-07, "logits/chosen": 0.4700571894645691, "logits/rejected": 0.8733597993850708, "logps/chosen": -283.22467041015625, "logps/ref_chosen": -52.228153228759766, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -475.06427001953125, "loss": 0.8617, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0542188882827759, "rewards/margins": 0.7283648252487183, "rewards/rejected": -1.7825837135314941, "step": 488 }, { "epoch": 0.7180616740088106, "epsilon_dpo/beta": 0.004525597207248211, "epsilon_dpo/beta_margin_grad_mean": -0.36369070410728455, "epsilon_dpo/beta_margin_grad_std": 0.13181878626346588, "epsilon_dpo/beta_margin_mean": 0.6385282874107361, "epsilon_dpo/beta_margin_std": 0.6957613825798035, "epsilon_dpo/loss_margin_mean": 141.3865203857422, "grad_norm": 58.11296081542969, "kl/avg_steps": 0.8125, "kl/beta": 0.004562212619930506, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.1298035870396985e-07, "logits/chosen": 0.3628634214401245, "logits/rejected": 0.7215201258659363, "logps/chosen": -279.34716796875, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39813232421875, "logps/rejected": -444.14215087890625, "loss": 0.9453, "rewards/accuracies": 0.921875, "rewards/chosen": -1.0114222764968872, "rewards/margins": 0.6385283470153809, "rewards/rejected": -1.6499505043029785, "step": 489 }, { "epoch": 0.7195301027900147, "epsilon_dpo/beta": 0.004493365529924631, "epsilon_dpo/beta_margin_grad_mean": -0.3462271988391876, "epsilon_dpo/beta_margin_grad_std": 0.15604344010353088, "epsilon_dpo/beta_margin_mean": 0.7303367257118225, "epsilon_dpo/beta_margin_std": 0.7860758304595947, "epsilon_dpo/loss_margin_mean": 163.24058532714844, "grad_norm": 31.600603103637695, "kl/avg_steps": 0.71875, "kl/beta": 0.004525443073362112, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.1190875675987355e-07, "logits/chosen": 0.5707692503929138, "logits/rejected": 0.488492488861084, "logps/chosen": -283.4697265625, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.40904998779297, "logps/rejected": -504.7529602050781, "loss": 0.9136, "rewards/accuracies": 0.875, "rewards/chosen": -1.039729118347168, "rewards/margins": 0.7303366661071777, "rewards/rejected": -1.7700657844543457, "step": 490 }, { "epoch": 0.7209985315712188, "epsilon_dpo/beta": 0.004472534172236919, "epsilon_dpo/beta_margin_grad_mean": -0.38271310925483704, "epsilon_dpo/beta_margin_grad_std": 0.14819550514221191, "epsilon_dpo/beta_margin_mean": 0.5458483099937439, "epsilon_dpo/beta_margin_std": 0.7266311645507812, "epsilon_dpo/loss_margin_mean": 123.00843048095703, "grad_norm": 62.742305755615234, "kl/avg_steps": 0.46875, "kl/beta": 0.004493148531764746, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1084079364846241e-07, "logits/chosen": 0.44595614075660706, "logits/rejected": 0.9600294828414917, "logps/chosen": -317.3819580078125, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -453.54693603515625, "loss": 1.0265, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1519925594329834, "rewards/margins": 0.5458483099937439, "rewards/rejected": -1.6978408098220825, "step": 491 }, { "epoch": 0.7224669603524229, "epsilon_dpo/beta": 0.004450269043445587, "epsilon_dpo/beta_margin_grad_mean": -0.39979806542396545, "epsilon_dpo/beta_margin_grad_std": 0.17615996301174164, "epsilon_dpo/beta_margin_mean": 0.46891486644744873, "epsilon_dpo/beta_margin_std": 0.8474681973457336, "epsilon_dpo/loss_margin_mean": 106.54131317138672, "grad_norm": 45.599552154541016, "kl/avg_steps": 0.5, "kl/beta": 0.004472185391932726, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.097764975115576e-07, "logits/chosen": 0.6272152662277222, "logits/rejected": 1.0693143606185913, "logps/chosen": -324.37713623046875, "logps/ref_chosen": -53.99418258666992, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -449.5838623046875, "loss": 1.1302, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2058026790618896, "rewards/margins": 0.46891486644744873, "rewards/rejected": -1.6747174263000488, "step": 492 }, { "epoch": 0.723935389133627, "epsilon_dpo/beta": 0.004423956852406263, "epsilon_dpo/beta_margin_grad_mean": -0.3644724488258362, "epsilon_dpo/beta_margin_grad_std": 0.139420285820961, "epsilon_dpo/beta_margin_mean": 0.605819046497345, "epsilon_dpo/beta_margin_std": 0.6427890658378601, "epsilon_dpo/loss_margin_mean": 137.73443603515625, "grad_norm": 57.87881088256836, "kl/avg_steps": 0.59375, "kl/beta": 0.004449935629963875, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.0871589639435203e-07, "logits/chosen": 0.34055036306381226, "logits/rejected": 0.8597230911254883, "logps/chosen": -344.1697082519531, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -493.72991943359375, "loss": 0.9628, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1892423629760742, "rewards/margins": 0.6058189868927002, "rewards/rejected": -1.7950613498687744, "step": 493 }, { "epoch": 0.7254038179148311, "epsilon_dpo/beta": 0.004388166591525078, "epsilon_dpo/beta_margin_grad_mean": -0.320593923330307, "epsilon_dpo/beta_margin_grad_std": 0.13530012965202332, "epsilon_dpo/beta_margin_mean": 0.8199219107627869, "epsilon_dpo/beta_margin_std": 0.6674307584762573, "epsilon_dpo/loss_margin_mean": 187.34080505371094, "grad_norm": 57.35707473754883, "kl/avg_steps": 0.8125, "kl/beta": 0.004423670005053282, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.0765901824467166e-07, "logits/chosen": 0.807897686958313, "logits/rejected": 0.8155907392501831, "logps/chosen": -282.2862548828125, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -514.359130859375, "loss": 0.8221, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0579063892364502, "rewards/margins": 0.8199218511581421, "rewards/rejected": -1.8778282403945923, "step": 494 }, { "epoch": 0.7268722466960352, "epsilon_dpo/beta": 0.004359656944870949, "epsilon_dpo/beta_margin_grad_mean": -0.3493143618106842, "epsilon_dpo/beta_margin_grad_std": 0.14684849977493286, "epsilon_dpo/beta_margin_mean": 0.7016583681106567, "epsilon_dpo/beta_margin_std": 0.729976236820221, "epsilon_dpo/loss_margin_mean": 161.7394256591797, "grad_norm": 43.34193420410156, "kl/avg_steps": 0.65625, "kl/beta": 0.004388017579913139, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.0660589091223854e-07, "logits/chosen": 0.3953931927680969, "logits/rejected": 0.7432557940483093, "logps/chosen": -341.39044189453125, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -531.0192260742188, "loss": 0.9158, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2132760286331177, "rewards/margins": 0.7016583681106567, "rewards/rejected": -1.9149343967437744, "step": 495 }, { "epoch": 0.7283406754772394, "epsilon_dpo/beta": 0.004335320554673672, "epsilon_dpo/beta_margin_grad_mean": -0.3704771101474762, "epsilon_dpo/beta_margin_grad_std": 0.12676607072353363, "epsilon_dpo/beta_margin_mean": 0.5778650045394897, "epsilon_dpo/beta_margin_std": 0.5912160873413086, "epsilon_dpo/loss_margin_mean": 134.03567504882812, "grad_norm": 47.66548156738281, "kl/avg_steps": 0.5625, "kl/beta": 0.004359408747404814, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0555654214793722e-07, "logits/chosen": 0.2811706066131592, "logits/rejected": 0.7893705368041992, "logps/chosen": -366.8771057128906, "logps/ref_chosen": -72.59192657470703, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -512.6502075195312, "loss": 0.9678, "rewards/accuracies": 0.828125, "rewards/chosen": -1.277395486831665, "rewards/margins": 0.577864944934845, "rewards/rejected": -1.8552603721618652, "step": 496 }, { "epoch": 0.7298091042584435, "epsilon_dpo/beta": 0.0043083615601062775, "epsilon_dpo/beta_margin_grad_mean": -0.38184332847595215, "epsilon_dpo/beta_margin_grad_std": 0.14038600027561188, "epsilon_dpo/beta_margin_mean": 0.5312784314155579, "epsilon_dpo/beta_margin_std": 0.6518020033836365, "epsilon_dpo/loss_margin_mean": 124.06751251220703, "grad_norm": 44.63701248168945, "kl/avg_steps": 0.625, "kl/beta": 0.004335024394094944, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.0451099960308374e-07, "logits/chosen": 0.5063110589981079, "logits/rejected": 0.9023481607437134, "logps/chosen": -363.6978454589844, "logps/ref_chosen": -58.593971252441406, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -505.45977783203125, "loss": 1.0195, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3173977136611938, "rewards/margins": 0.5312784314155579, "rewards/rejected": -1.8486762046813965, "step": 497 }, { "epoch": 0.7312775330396476, "epsilon_dpo/beta": 0.004284294322133064, "epsilon_dpo/beta_margin_grad_mean": -0.36809319257736206, "epsilon_dpo/beta_margin_grad_std": 0.14858005940914154, "epsilon_dpo/beta_margin_mean": 0.6082795858383179, "epsilon_dpo/beta_margin_std": 0.7224895358085632, "epsilon_dpo/loss_margin_mean": 142.9373016357422, "grad_norm": 52.54181671142578, "kl/avg_steps": 0.5625, "kl/beta": 0.004308098927140236, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0346929082869641e-07, "logits/chosen": 0.5228114724159241, "logits/rejected": 0.9948470592498779, "logps/chosen": -367.08575439453125, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -522.775390625, "loss": 0.9804, "rewards/accuracies": 0.765625, "rewards/chosen": -1.2692654132843018, "rewards/margins": 0.6082795858383179, "rewards/rejected": -1.8775451183319092, "step": 498 }, { "epoch": 0.7327459618208517, "epsilon_dpo/beta": 0.00425497442483902, "epsilon_dpo/beta_margin_grad_mean": -0.3490248918533325, "epsilon_dpo/beta_margin_grad_std": 0.15517489612102509, "epsilon_dpo/beta_margin_mean": 0.6933284997940063, "epsilon_dpo/beta_margin_std": 0.732176661491394, "epsilon_dpo/loss_margin_mean": 163.8168182373047, "grad_norm": 46.36258316040039, "kl/avg_steps": 0.6875, "kl/beta": 0.004284001421183348, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.0243144327477013e-07, "logits/chosen": 0.7224333882331848, "logits/rejected": 0.7317450046539307, "logps/chosen": -345.83740234375, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -559.477783203125, "loss": 0.9273, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2557456493377686, "rewards/margins": 0.6933284997940063, "rewards/rejected": -1.949074149131775, "step": 499 }, { "epoch": 0.7342143906020558, "epsilon_dpo/beta": 0.0042272512800991535, "epsilon_dpo/beta_margin_grad_mean": -0.3537231981754303, "epsilon_dpo/beta_margin_grad_std": 0.13802658021450043, "epsilon_dpo/beta_margin_mean": 0.6817148327827454, "epsilon_dpo/beta_margin_std": 0.7146762013435364, "epsilon_dpo/loss_margin_mean": 162.0128173828125, "grad_norm": 33.654090881347656, "kl/avg_steps": 0.65625, "kl/beta": 0.0042547499760985374, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.0139748428955333e-07, "logits/chosen": 0.6236047744750977, "logits/rejected": 0.7619317173957825, "logps/chosen": -375.20428466796875, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -574.1239013671875, "loss": 0.9214, "rewards/accuracies": 0.875, "rewards/chosen": -1.3463799953460693, "rewards/margins": 0.6817148923873901, "rewards/rejected": -2.02809476852417, "step": 500 }, { "epoch": 0.7342143906020558, "eval_epsilon_dpo/beta": 0.004208484664559364, "eval_epsilon_dpo/beta_margin_grad_mean": -0.40103766322135925, "eval_epsilon_dpo/beta_margin_grad_std": 0.14435508847236633, "eval_epsilon_dpo/beta_margin_mean": 0.45140042901039124, "eval_epsilon_dpo/beta_margin_std": 0.6817530989646912, "eval_epsilon_dpo/loss_margin_mean": 108.27748107910156, "eval_kl/n_epsilon_steps": 0.27525684237480164, "eval_kl/p_epsilon_steps": 0.7234588861465454, "eval_logits/chosen": 0.5316959023475647, "eval_logits/rejected": 0.8735014796257019, "eval_logps/chosen": -417.7781982421875, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -533.8025512695312, "eval_loss": 0.547046422958374, "eval_rewards/accuracies": 0.7487157583236694, "eval_rewards/chosen": -1.427492618560791, "eval_rewards/margins": 0.45140042901039124, "eval_rewards/rejected": -1.8788931369781494, "eval_runtime": 43.1754, "eval_samples_per_second": 54.174, "eval_steps_per_second": 1.714, "step": 500 }, { "epoch": 0.73568281938326, "epsilon_dpo/beta": 0.004198369104415178, "epsilon_dpo/beta_margin_grad_mean": -0.3420836925506592, "epsilon_dpo/beta_margin_grad_std": 0.1379937082529068, "epsilon_dpo/beta_margin_mean": 0.7321783900260925, "epsilon_dpo/beta_margin_std": 0.6965384483337402, "epsilon_dpo/loss_margin_mean": 175.11428833007812, "grad_norm": 43.227970123291016, "kl/avg_steps": 0.6875, "kl/beta": 0.004227010067552328, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.0036744111882672e-07, "logits/chosen": 0.7106126546859741, "logits/rejected": 1.0762202739715576, "logps/chosen": -354.31011962890625, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670776367188, "logps/rejected": -555.2215576171875, "loss": 0.8844, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2609002590179443, "rewards/margins": 0.7321783900260925, "rewards/rejected": -1.9930784702301025, "step": 501 }, { "epoch": 0.737151248164464, "epsilon_dpo/beta": 0.004163142293691635, "epsilon_dpo/beta_margin_grad_mean": -0.34105634689331055, "epsilon_dpo/beta_margin_grad_std": 0.11382236331701279, "epsilon_dpo/beta_margin_mean": 0.7036091685295105, "epsilon_dpo/beta_margin_std": 0.5351970195770264, "epsilon_dpo/loss_margin_mean": 169.35462951660156, "grad_norm": 31.223445892333984, "kl/avg_steps": 0.84375, "kl/beta": 0.004198147915303707, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 9.934134090518592e-08, "logits/chosen": 0.359226256608963, "logits/rejected": 1.0165369510650635, "logps/chosen": -348.9472961425781, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -533.6502075195312, "loss": 0.8659, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1718389987945557, "rewards/margins": 0.7036091685295105, "rewards/rejected": -1.875448226928711, "step": 502 }, { "epoch": 0.7386196769456681, "epsilon_dpo/beta": 0.004134814720600843, "epsilon_dpo/beta_margin_grad_mean": -0.3556686043739319, "epsilon_dpo/beta_margin_grad_std": 0.12179072201251984, "epsilon_dpo/beta_margin_mean": 0.6433125734329224, "epsilon_dpo/beta_margin_std": 0.5787558555603027, "epsilon_dpo/loss_margin_mean": 156.22250366210938, "grad_norm": 37.32900619506836, "kl/avg_steps": 0.6875, "kl/beta": 0.00416302215307951, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 9.831921068732571e-08, "logits/chosen": 0.47690650820732117, "logits/rejected": 1.0228471755981445, "logps/chosen": -361.281494140625, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -544.9310302734375, "loss": 0.917, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2666527032852173, "rewards/margins": 0.6433125734329224, "rewards/rejected": -1.9099652767181396, "step": 503 }, { "epoch": 0.7400881057268722, "epsilon_dpo/beta": 0.0041039977222681046, "epsilon_dpo/beta_margin_grad_mean": -0.33253493905067444, "epsilon_dpo/beta_margin_grad_std": 0.1312149465084076, "epsilon_dpo/beta_margin_mean": 0.7709100246429443, "epsilon_dpo/beta_margin_std": 0.6641334295272827, "epsilon_dpo/loss_margin_mean": 188.43588256835938, "grad_norm": 48.717227935791016, "kl/avg_steps": 0.75, "kl/beta": 0.004134596791118383, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 9.730107739932805e-08, "logits/chosen": 0.5325213670730591, "logits/rejected": 0.7064046859741211, "logps/chosen": -364.09124755859375, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76213073730469, "logps/rejected": -596.323486328125, "loss": 0.8501, "rewards/accuracies": 0.90625, "rewards/chosen": -1.2495099306106567, "rewards/margins": 0.7709100246429443, "rewards/rejected": -2.0204200744628906, "step": 504 }, { "epoch": 0.7415565345080763, "epsilon_dpo/beta": 0.004083707928657532, "epsilon_dpo/beta_margin_grad_mean": -0.3996689021587372, "epsilon_dpo/beta_margin_grad_std": 0.15897127985954285, "epsilon_dpo/beta_margin_mean": 0.4632645547389984, "epsilon_dpo/beta_margin_std": 0.7566088438034058, "epsilon_dpo/loss_margin_mean": 114.5339584350586, "grad_norm": 68.79544067382812, "kl/avg_steps": 0.5, "kl/beta": 0.004103818442672491, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.628696786995188e-08, "logits/chosen": 0.4053691029548645, "logits/rejected": 0.7741438746452332, "logps/chosen": -394.07562255859375, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -521.0400390625, "loss": 1.1027, "rewards/accuracies": 0.765625, "rewards/chosen": -1.300675630569458, "rewards/margins": 0.4632645845413208, "rewards/rejected": -1.7639403343200684, "step": 505 }, { "epoch": 0.7430249632892805, "epsilon_dpo/beta": 0.004057010170072317, "epsilon_dpo/beta_margin_grad_mean": -0.34152358770370483, "epsilon_dpo/beta_margin_grad_std": 0.14202556014060974, "epsilon_dpo/beta_margin_mean": 0.7302777171134949, "epsilon_dpo/beta_margin_std": 0.6926552653312683, "epsilon_dpo/loss_margin_mean": 180.8192596435547, "grad_norm": 32.2053337097168, "kl/avg_steps": 0.65625, "kl/beta": 0.004083401523530483, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 9.527690882192635e-08, "logits/chosen": 0.7665762901306152, "logits/rejected": 1.076411485671997, "logps/chosen": -328.1141357421875, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -538.387939453125, "loss": 0.8872, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1337469816207886, "rewards/margins": 0.7302776575088501, "rewards/rejected": -1.8640246391296387, "step": 506 }, { "epoch": 0.7444933920704846, "epsilon_dpo/beta": 0.004033094737678766, "epsilon_dpo/beta_margin_grad_mean": -0.3671947717666626, "epsilon_dpo/beta_margin_grad_std": 0.13613611459732056, "epsilon_dpo/beta_margin_mean": 0.6116988658905029, "epsilon_dpo/beta_margin_std": 0.679846465587616, "epsilon_dpo/loss_margin_mean": 152.4768524169922, "grad_norm": 32.728424072265625, "kl/avg_steps": 0.59375, "kl/beta": 0.004056778736412525, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 9.427092687124691e-08, "logits/chosen": 0.5450102686882019, "logits/rejected": 0.8309746980667114, "logps/chosen": -370.1054382324219, "logps/ref_chosen": -66.80150604248047, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -551.1536865234375, "loss": 0.963, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2244369983673096, "rewards/margins": 0.6116988658905029, "rewards/rejected": -1.8361358642578125, "step": 507 }, { "epoch": 0.7459618208516887, "epsilon_dpo/beta": 0.004010550212115049, "epsilon_dpo/beta_margin_grad_mean": -0.37403443455696106, "epsilon_dpo/beta_margin_grad_std": 0.14489853382110596, "epsilon_dpo/beta_margin_mean": 0.578895092010498, "epsilon_dpo/beta_margin_std": 0.7033603191375732, "epsilon_dpo/loss_margin_mean": 145.273681640625, "grad_norm": 34.49687194824219, "kl/avg_steps": 0.5625, "kl/beta": 0.004032833967357874, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.326904852647344e-08, "logits/chosen": 0.29630690813064575, "logits/rejected": 0.738036036491394, "logps/chosen": -350.77117919921875, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -520.368896484375, "loss": 0.9961, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1227812767028809, "rewards/margins": 0.578895092010498, "rewards/rejected": -1.701676368713379, "step": 508 }, { "epoch": 0.7474302496328928, "epsilon_dpo/beta": 0.003983103670179844, "epsilon_dpo/beta_margin_grad_mean": -0.37501460313796997, "epsilon_dpo/beta_margin_grad_std": 0.12634851038455963, "epsilon_dpo/beta_margin_mean": 0.5611936450004578, "epsilon_dpo/beta_margin_std": 0.6098694801330566, "epsilon_dpo/loss_margin_mean": 141.5440216064453, "grad_norm": 46.47676467895508, "kl/avg_steps": 0.6875, "kl/beta": 0.004010275937616825, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 9.227130018803195e-08, "logits/chosen": 0.38151389360427856, "logits/rejected": 0.6786462664604187, "logps/chosen": -309.4127502441406, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -470.394287109375, "loss": 0.983, "rewards/accuracies": 0.875, "rewards/chosen": -0.9797898530960083, "rewards/margins": 0.5611936450004578, "rewards/rejected": -1.5409835577011108, "step": 509 }, { "epoch": 0.748898678414097, "epsilon_dpo/beta": 0.003954662010073662, "epsilon_dpo/beta_margin_grad_mean": -0.33917155861854553, "epsilon_dpo/beta_margin_grad_std": 0.131233349442482, "epsilon_dpo/beta_margin_mean": 0.7268217206001282, "epsilon_dpo/beta_margin_std": 0.6267231702804565, "epsilon_dpo/loss_margin_mean": 184.49526977539062, "grad_norm": 38.41777801513672, "kl/avg_steps": 0.71875, "kl/beta": 0.003982893656939268, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.127770814751932e-08, "logits/chosen": 0.648572564125061, "logits/rejected": 0.7486324310302734, "logps/chosen": -314.7032165527344, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -550.085205078125, "loss": 0.8726, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0410048961639404, "rewards/margins": 0.726821780204773, "rewards/rejected": -1.7678265571594238, "step": 510 }, { "epoch": 0.750367107195301, "epsilon_dpo/beta": 0.003928912337869406, "epsilon_dpo/beta_margin_grad_mean": -0.35081177949905396, "epsilon_dpo/beta_margin_grad_std": 0.15038582682609558, "epsilon_dpo/beta_margin_mean": 0.6912986636161804, "epsilon_dpo/beta_margin_std": 0.7469455003738403, "epsilon_dpo/loss_margin_mean": 176.87680053710938, "grad_norm": 35.97317123413086, "kl/avg_steps": 0.65625, "kl/beta": 0.003954470623284578, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 9.028829858700973e-08, "logits/chosen": 0.4911497235298157, "logits/rejected": 0.7370046377182007, "logps/chosen": -314.3341064453125, "logps/ref_chosen": -60.23811340332031, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -523.8295288085938, "loss": 0.929, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9999300837516785, "rewards/margins": 0.6912987232208252, "rewards/rejected": -1.6912288665771484, "step": 511 }, { "epoch": 0.7518355359765051, "epsilon_dpo/beta": 0.0039008415769785643, "epsilon_dpo/beta_margin_grad_mean": -0.34543266892433167, "epsilon_dpo/beta_margin_grad_std": 0.12374412268400192, "epsilon_dpo/beta_margin_mean": 0.6969825625419617, "epsilon_dpo/beta_margin_std": 0.6002340912818909, "epsilon_dpo/loss_margin_mean": 179.2578125, "grad_norm": 42.282527923583984, "kl/avg_steps": 0.71875, "kl/beta": 0.003928688820451498, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 8.930309757836516e-08, "logits/chosen": 0.528344452381134, "logits/rejected": 0.9337077736854553, "logps/chosen": -295.787109375, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -502.0152893066406, "loss": 0.8845, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9405291080474854, "rewards/margins": 0.6969825625419617, "rewards/rejected": -1.6375117301940918, "step": 512 }, { "epoch": 0.7533039647577092, "epsilon_dpo/beta": 0.003876661416143179, "epsilon_dpo/beta_margin_grad_mean": -0.3744572401046753, "epsilon_dpo/beta_margin_grad_std": 0.13922473788261414, "epsilon_dpo/beta_margin_mean": 0.5541786551475525, "epsilon_dpo/beta_margin_std": 0.6370647549629211, "epsilon_dpo/loss_margin_mean": 143.83148193359375, "grad_norm": 52.009796142578125, "kl/avg_steps": 0.625, "kl/beta": 0.003900652751326561, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.832213108254863e-08, "logits/chosen": 0.4536152482032776, "logits/rejected": 0.9576371312141418, "logps/chosen": -326.5400390625, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -481.5175476074219, "loss": 0.9995, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0159627199172974, "rewards/margins": 0.5541786551475525, "rewards/rejected": -1.570141315460205, "step": 513 }, { "epoch": 0.7547723935389133, "epsilon_dpo/beta": 0.0038562172558158636, "epsilon_dpo/beta_margin_grad_mean": -0.3828768730163574, "epsilon_dpo/beta_margin_grad_std": 0.14796635508537292, "epsilon_dpo/beta_margin_mean": 0.5380199551582336, "epsilon_dpo/beta_margin_std": 0.7064852714538574, "epsilon_dpo/loss_margin_mean": 140.54808044433594, "grad_norm": 30.75905418395996, "kl/avg_steps": 0.53125, "kl/beta": 0.0038764250930398703, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.734542494893954e-08, "logits/chosen": 0.35837316513061523, "logits/rejected": 0.9016172885894775, "logps/chosen": -329.46221923828125, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -474.72625732421875, "loss": 1.0288, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9856797456741333, "rewards/margins": 0.5380199551582336, "rewards/rejected": -1.5236997604370117, "step": 514 }, { "epoch": 0.7562408223201175, "epsilon_dpo/beta": 0.0038298137951642275, "epsilon_dpo/beta_margin_grad_mean": -0.38730791211128235, "epsilon_dpo/beta_margin_grad_std": 0.13555346429347992, "epsilon_dpo/beta_margin_mean": 0.5017286539077759, "epsilon_dpo/beta_margin_std": 0.6221214532852173, "epsilon_dpo/loss_margin_mean": 131.72146606445312, "grad_norm": 34.91788864135742, "kl/avg_steps": 0.6875, "kl/beta": 0.003855940420180559, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 8.637300491465272e-08, "logits/chosen": 0.4837096333503723, "logits/rejected": 0.5631206035614014, "logps/chosen": -275.1760559082031, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -443.59368896484375, "loss": 1.0341, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8624446392059326, "rewards/margins": 0.5017286539077759, "rewards/rejected": -1.364173412322998, "step": 515 }, { "epoch": 0.7577092511013216, "epsilon_dpo/beta": 0.0038048606365919113, "epsilon_dpo/beta_margin_grad_mean": -0.3501274585723877, "epsilon_dpo/beta_margin_grad_std": 0.12381567806005478, "epsilon_dpo/beta_margin_mean": 0.6758091449737549, "epsilon_dpo/beta_margin_std": 0.6056503057479858, "epsilon_dpo/loss_margin_mean": 178.34332275390625, "grad_norm": 32.94475173950195, "kl/avg_steps": 0.65625, "kl/beta": 0.0038296119309961796, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.540489660386064e-08, "logits/chosen": 0.3618767261505127, "logits/rejected": 0.33820533752441406, "logps/chosen": -298.4083557128906, "logps/ref_chosen": -64.6495590209961, "logps/ref_rejected": -111.72238159179688, "logps/rejected": -523.8245239257812, "loss": 0.8997, "rewards/accuracies": 0.875, "rewards/chosen": -0.8899648189544678, "rewards/margins": 0.6758091449737549, "rewards/rejected": -1.5657739639282227, "step": 516 }, { "epoch": 0.7591776798825257, "epsilon_dpo/beta": 0.0037800539284944534, "epsilon_dpo/beta_margin_grad_mean": -0.3596791625022888, "epsilon_dpo/beta_margin_grad_std": 0.1358945220708847, "epsilon_dpo/beta_margin_mean": 0.6402126550674438, "epsilon_dpo/beta_margin_std": 0.665012776851654, "epsilon_dpo/loss_margin_mean": 170.1980438232422, "grad_norm": 34.73167419433594, "kl/avg_steps": 0.65625, "kl/beta": 0.0038046438712626696, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.444112552711752e-08, "logits/chosen": 0.3920656740665436, "logits/rejected": 0.7085820436477661, "logps/chosen": -308.80694580078125, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -507.1745300292969, "loss": 0.9403, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9386484026908875, "rewards/margins": 0.6402126550674438, "rewards/rejected": -1.5788609981536865, "step": 517 }, { "epoch": 0.7606461086637298, "epsilon_dpo/beta": 0.0037554092705249786, "epsilon_dpo/beta_margin_grad_mean": -0.3725910186767578, "epsilon_dpo/beta_margin_grad_std": 0.14076903462409973, "epsilon_dpo/beta_margin_mean": 0.5712500214576721, "epsilon_dpo/beta_margin_std": 0.6601026058197021, "epsilon_dpo/loss_margin_mean": 152.9922332763672, "grad_norm": 25.094013214111328, "kl/avg_steps": 0.65625, "kl/beta": 0.0037798387929797173, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.348171708068747e-08, "logits/chosen": 0.3267255127429962, "logits/rejected": 0.6061903834342957, "logps/chosen": -299.3568420410156, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -480.20587158203125, "loss": 0.9919, "rewards/accuracies": 0.859375, "rewards/chosen": -0.909741997718811, "rewards/margins": 0.5712500810623169, "rewards/rejected": -1.480992078781128, "step": 518 }, { "epoch": 0.762114537444934, "epsilon_dpo/beta": 0.0037320987321436405, "epsilon_dpo/beta_margin_grad_mean": -0.3851795792579651, "epsilon_dpo/beta_margin_grad_std": 0.13246044516563416, "epsilon_dpo/beta_margin_mean": 0.514685869216919, "epsilon_dpo/beta_margin_std": 0.6302955150604248, "epsilon_dpo/loss_margin_mean": 138.73207092285156, "grad_norm": 31.03791618347168, "kl/avg_steps": 0.625, "kl/beta": 0.0037551952991634607, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.25266965458755e-08, "logits/chosen": 0.2064821869134903, "logits/rejected": 0.36921072006225586, "logps/chosen": -309.9266357421875, "logps/ref_chosen": -74.06330871582031, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -479.0395812988281, "loss": 1.0242, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8822252750396729, "rewards/margins": 0.514685869216919, "rewards/rejected": -1.3969111442565918, "step": 519 }, { "epoch": 0.7635829662261381, "epsilon_dpo/beta": 0.003706584917381406, "epsilon_dpo/beta_margin_grad_mean": -0.3665812909603119, "epsilon_dpo/beta_margin_grad_std": 0.11654356867074966, "epsilon_dpo/beta_margin_mean": 0.5927385687828064, "epsilon_dpo/beta_margin_std": 0.5589181780815125, "epsilon_dpo/loss_margin_mean": 160.50839233398438, "grad_norm": 32.33449172973633, "kl/avg_steps": 0.6875, "kl/beta": 0.003731871023774147, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 8.15760890883607e-08, "logits/chosen": 0.2796686291694641, "logits/rejected": 0.533139705657959, "logps/chosen": -303.42431640625, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -493.6142578125, "loss": 0.9474, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8654706478118896, "rewards/margins": 0.5927386283874512, "rewards/rejected": -1.4582092761993408, "step": 520 }, { "epoch": 0.7650513950073421, "epsilon_dpo/beta": 0.003682434558868408, "epsilon_dpo/beta_margin_grad_mean": -0.3819310963153839, "epsilon_dpo/beta_margin_grad_std": 0.13134653866291046, "epsilon_dpo/beta_margin_mean": 0.5284645557403564, "epsilon_dpo/beta_margin_std": 0.6106793880462646, "epsilon_dpo/loss_margin_mean": 144.2578887939453, "grad_norm": 33.71346664428711, "kl/avg_steps": 0.65625, "kl/beta": 0.0037063895724713802, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.062991975753378e-08, "logits/chosen": 0.43052414059638977, "logits/rejected": 0.7658823132514954, "logps/chosen": -295.885498046875, "logps/ref_chosen": -58.14292907714844, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -465.2810974121094, "loss": 1.0096, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8762213587760925, "rewards/margins": 0.5284645557403564, "rewards/rejected": -1.4046858549118042, "step": 521 }, { "epoch": 0.7665198237885462, "epsilon_dpo/beta": 0.0036584262270480394, "epsilon_dpo/beta_margin_grad_mean": -0.3859768509864807, "epsilon_dpo/beta_margin_grad_std": 0.11610626429319382, "epsilon_dpo/beta_margin_mean": 0.49098727107048035, "epsilon_dpo/beta_margin_std": 0.5090169310569763, "epsilon_dpo/loss_margin_mean": 134.9163360595703, "grad_norm": 52.357242584228516, "kl/avg_steps": 0.65625, "kl/beta": 0.0036822250112891197, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.968821348583643e-08, "logits/chosen": 0.8381183743476868, "logits/rejected": 1.024916410446167, "logps/chosen": -297.3680114746094, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -451.7505798339844, "loss": 1.0151, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9188036918640137, "rewards/margins": 0.49098724126815796, "rewards/rejected": -1.4097909927368164, "step": 522 }, { "epoch": 0.7679882525697503, "epsilon_dpo/beta": 0.0036368609871715307, "epsilon_dpo/beta_margin_grad_mean": -0.3799547255039215, "epsilon_dpo/beta_margin_grad_std": 0.1328548938035965, "epsilon_dpo/beta_margin_mean": 0.5245332717895508, "epsilon_dpo/beta_margin_std": 0.5979333519935608, "epsilon_dpo/loss_margin_mean": 145.19387817382812, "grad_norm": 38.532352447509766, "kl/avg_steps": 0.59375, "kl/beta": 0.003658218076452613, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 7.875099508810484e-08, "logits/chosen": 0.4588143825531006, "logits/rejected": 0.9571596384048462, "logps/chosen": -311.2894592285156, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -478.47515869140625, "loss": 1.0114, "rewards/accuracies": 0.796875, "rewards/chosen": -0.908942699432373, "rewards/margins": 0.5245332717895508, "rewards/rejected": -1.4334759712219238, "step": 523 }, { "epoch": 0.7694566813509545, "epsilon_dpo/beta": 0.003615394700318575, "epsilon_dpo/beta_margin_grad_mean": -0.38006651401519775, "epsilon_dpo/beta_margin_grad_std": 0.12714146077632904, "epsilon_dpo/beta_margin_mean": 0.5202612280845642, "epsilon_dpo/beta_margin_std": 0.5618492960929871, "epsilon_dpo/loss_margin_mean": 144.8385772705078, "grad_norm": 38.41351318359375, "kl/avg_steps": 0.59375, "kl/beta": 0.0036366255953907967, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 7.781828926091535e-08, "logits/chosen": 0.23068641126155853, "logits/rejected": 0.8385107517242432, "logps/chosen": -329.2145690917969, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -477.2830810546875, "loss": 1.0058, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9094934463500977, "rewards/margins": 0.5202612280845642, "rewards/rejected": -1.4297547340393066, "step": 524 }, { "epoch": 0.7709251101321586, "epsilon_dpo/beta": 0.0035884054377675056, "epsilon_dpo/beta_margin_grad_mean": -0.34438762068748474, "epsilon_dpo/beta_margin_grad_std": 0.1204388365149498, "epsilon_dpo/beta_margin_mean": 0.6936529278755188, "epsilon_dpo/beta_margin_std": 0.5703473687171936, "epsilon_dpo/loss_margin_mean": 193.9362335205078, "grad_norm": 32.43462371826172, "kl/avg_steps": 0.75, "kl/beta": 0.003615160472691059, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 7.689012058193384e-08, "logits/chosen": 0.6722402572631836, "logits/rejected": 0.6254914402961731, "logps/chosen": -290.9559020996094, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05293273925781, "logps/rejected": -534.1171875, "loss": 0.8806, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8624576330184937, "rewards/margins": 0.6936529278755188, "rewards/rejected": -1.5561106204986572, "step": 525 }, { "epoch": 0.7723935389133627, "epsilon_dpo/beta": 0.0035628145560622215, "epsilon_dpo/beta_margin_grad_mean": -0.3525753617286682, "epsilon_dpo/beta_margin_grad_std": 0.11983883380889893, "epsilon_dpo/beta_margin_mean": 0.6575983762741089, "epsilon_dpo/beta_margin_std": 0.5771363377571106, "epsilon_dpo/loss_margin_mean": 185.22718811035156, "grad_norm": 31.187345504760742, "kl/avg_steps": 0.71875, "kl/beta": 0.0035882487427443266, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.596651350926836e-08, "logits/chosen": 0.3530668616294861, "logits/rejected": 0.8896932601928711, "logps/chosen": -307.4703063964844, "logps/ref_chosen": -63.167232513427734, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -515.839599609375, "loss": 0.9058, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8706945180892944, "rewards/margins": 0.6575983762741089, "rewards/rejected": -1.5282928943634033, "step": 526 }, { "epoch": 0.7738619676945668, "epsilon_dpo/beta": 0.0035385030787438154, "epsilon_dpo/beta_margin_grad_mean": -0.38446280360221863, "epsilon_dpo/beta_margin_grad_std": 0.11603199690580368, "epsilon_dpo/beta_margin_mean": 0.5095372200012207, "epsilon_dpo/beta_margin_std": 0.5438603162765503, "epsilon_dpo/loss_margin_mean": 144.64404296875, "grad_norm": 36.69266891479492, "kl/avg_steps": 0.6875, "kl/beta": 0.0035626422613859177, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.504749238082414e-08, "logits/chosen": 0.4142528772354126, "logits/rejected": 1.0028213262557983, "logps/chosen": -342.5270690917969, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -494.385009765625, "loss": 1.0064, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9618728756904602, "rewards/margins": 0.5095372200012207, "rewards/rejected": -1.4714101552963257, "step": 527 }, { "epoch": 0.775330396475771, "epsilon_dpo/beta": 0.0035187650937587023, "epsilon_dpo/beta_margin_grad_mean": -0.37635356187820435, "epsilon_dpo/beta_margin_grad_std": 0.13374869525432587, "epsilon_dpo/beta_margin_mean": 0.5589490532875061, "epsilon_dpo/beta_margin_std": 0.6314277648925781, "epsilon_dpo/loss_margin_mean": 159.76739501953125, "grad_norm": 32.5216064453125, "kl/avg_steps": 0.5625, "kl/beta": 0.003538316348567605, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 7.413308141366254e-08, "logits/chosen": 0.4960554242134094, "logits/rejected": 0.7435092926025391, "logps/chosen": -338.567138671875, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -524.1551513671875, "loss": 0.9917, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9529532194137573, "rewards/margins": 0.5589489936828613, "rewards/rejected": -1.5119023323059082, "step": 528 }, { "epoch": 0.7767988252569751, "epsilon_dpo/beta": 0.003498527454212308, "epsilon_dpo/beta_margin_grad_mean": -0.39533618092536926, "epsilon_dpo/beta_margin_grad_std": 0.12744130194187164, "epsilon_dpo/beta_margin_mean": 0.4569692611694336, "epsilon_dpo/beta_margin_std": 0.567453145980835, "epsilon_dpo/loss_margin_mean": 131.51893615722656, "grad_norm": 41.775115966796875, "kl/avg_steps": 0.578125, "kl/beta": 0.003518524579703808, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 7.322330470336313e-08, "logits/chosen": 0.7810444235801697, "logits/rejected": 0.8288546204566956, "logps/chosen": -379.33599853515625, "logps/ref_chosen": -55.5749626159668, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -544.489013671875, "loss": 1.0552, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1342432498931885, "rewards/margins": 0.4569692611694336, "rewards/rejected": -1.591212511062622, "step": 529 }, { "epoch": 0.7782672540381792, "epsilon_dpo/beta": 0.0034735032822936773, "epsilon_dpo/beta_margin_grad_mean": -0.3521674871444702, "epsilon_dpo/beta_margin_grad_std": 0.13671302795410156, "epsilon_dpo/beta_margin_mean": 0.6726905107498169, "epsilon_dpo/beta_margin_std": 0.6687734127044678, "epsilon_dpo/loss_margin_mean": 194.39688110351562, "grad_norm": 32.801971435546875, "kl/avg_steps": 0.71875, "kl/beta": 0.003498299978673458, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 7.231818622338822e-08, "logits/chosen": 0.7362005710601807, "logits/rejected": 1.0054600238800049, "logps/chosen": -305.640869140625, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -539.7208251953125, "loss": 0.9197, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8980590105056763, "rewards/margins": 0.6726905107498169, "rewards/rejected": -1.5707495212554932, "step": 530 }, { "epoch": 0.7797356828193832, "epsilon_dpo/beta": 0.0034530577249825, "epsilon_dpo/beta_margin_grad_mean": -0.3696640431880951, "epsilon_dpo/beta_margin_grad_std": 0.12576787173748016, "epsilon_dpo/beta_margin_mean": 0.5839786529541016, "epsilon_dpo/beta_margin_std": 0.5997346639633179, "epsilon_dpo/loss_margin_mean": 169.9185791015625, "grad_norm": 32.61705017089844, "kl/avg_steps": 0.59375, "kl/beta": 0.003473335411399603, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 7.141774982445147e-08, "logits/chosen": 0.6975569725036621, "logits/rejected": 1.2282212972640991, "logps/chosen": -340.595947265625, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -525.8744506835938, "loss": 0.9644, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9860022664070129, "rewards/margins": 0.5839786529541016, "rewards/rejected": -1.5699808597564697, "step": 531 }, { "epoch": 0.7812041116005873, "epsilon_dpo/beta": 0.003431597026064992, "epsilon_dpo/beta_margin_grad_mean": -0.36659470200538635, "epsilon_dpo/beta_margin_grad_std": 0.14631877839565277, "epsilon_dpo/beta_margin_mean": 0.6034601330757141, "epsilon_dpo/beta_margin_std": 0.6991419196128845, "epsilon_dpo/loss_margin_mean": 176.8496551513672, "grad_norm": 44.568336486816406, "kl/avg_steps": 0.625, "kl/beta": 0.0034528342075645924, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.052201923388953e-08, "logits/chosen": 0.5979117155075073, "logits/rejected": 1.1816649436950684, "logps/chosen": -386.58587646484375, "logps/ref_chosen": -70.28602600097656, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -579.7408447265625, "loss": 0.9791, "rewards/accuracies": 0.828125, "rewards/chosen": -1.087220311164856, "rewards/margins": 0.6034601330757141, "rewards/rejected": -1.6906805038452148, "step": 532 }, { "epoch": 0.7826725403817915, "epsilon_dpo/beta": 0.0034156448673456907, "epsilon_dpo/beta_margin_grad_mean": -0.38440099358558655, "epsilon_dpo/beta_margin_grad_std": 0.14013712108135223, "epsilon_dpo/beta_margin_mean": 0.5196799635887146, "epsilon_dpo/beta_margin_std": 0.6585808396339417, "epsilon_dpo/loss_margin_mean": 153.3245849609375, "grad_norm": 29.019819259643555, "kl/avg_steps": 0.46875, "kl/beta": 0.0034313879441469908, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.963101805503646e-08, "logits/chosen": 0.5976595878601074, "logits/rejected": 1.3575315475463867, "logps/chosen": -357.8514404296875, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -522.9089965820312, "loss": 1.0296, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0026906728744507, "rewards/margins": 0.5196799635887146, "rewards/rejected": -1.52237069606781, "step": 533 }, { "epoch": 0.7841409691629956, "epsilon_dpo/beta": 0.003390102181583643, "epsilon_dpo/beta_margin_grad_mean": -0.38129475712776184, "epsilon_dpo/beta_margin_grad_std": 0.1270858645439148, "epsilon_dpo/beta_margin_mean": 0.5227511525154114, "epsilon_dpo/beta_margin_std": 0.5960108041763306, "epsilon_dpo/loss_margin_mean": 154.84788513183594, "grad_norm": 39.05036926269531, "kl/avg_steps": 0.75, "kl/beta": 0.0034153785090893507, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 6.874476976660184e-08, "logits/chosen": 0.7569847106933594, "logits/rejected": 1.157335638999939, "logps/chosen": -370.6048583984375, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -543.8767700195312, "loss": 1.01, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0541493892669678, "rewards/margins": 0.5227511525154114, "rewards/rejected": -1.5769004821777344, "step": 534 }, { "epoch": 0.7856093979441997, "epsilon_dpo/beta": 0.003364865668118, "epsilon_dpo/beta_margin_grad_mean": -0.35946574807167053, "epsilon_dpo/beta_margin_grad_std": 0.12239540368318558, "epsilon_dpo/beta_margin_mean": 0.6262299418449402, "epsilon_dpo/beta_margin_std": 0.581615149974823, "epsilon_dpo/loss_margin_mean": 186.77011108398438, "grad_norm": 39.60608673095703, "kl/avg_steps": 0.75, "kl/beta": 0.0033899538684636354, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 6.786329772205246e-08, "logits/chosen": 0.7636107206344604, "logits/rejected": 0.9732518196105957, "logps/chosen": -337.1137390136719, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -565.8612060546875, "loss": 0.9298, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9524887204170227, "rewards/margins": 0.6262299418449402, "rewards/rejected": -1.578718662261963, "step": 535 }, { "epoch": 0.7870778267254038, "epsilon_dpo/beta": 0.003340868279337883, "epsilon_dpo/beta_margin_grad_mean": -0.34632307291030884, "epsilon_dpo/beta_margin_grad_std": 0.12324853241443634, "epsilon_dpo/beta_margin_mean": 0.7060097455978394, "epsilon_dpo/beta_margin_std": 0.6573695540428162, "epsilon_dpo/loss_margin_mean": 212.0374298095703, "grad_norm": 33.39285659790039, "kl/avg_steps": 0.71875, "kl/beta": 0.003364718286320567, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 6.698662514899638e-08, "logits/chosen": 0.6936272978782654, "logits/rejected": 1.0324440002441406, "logps/chosen": -315.45166015625, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -569.50634765625, "loss": 0.8867, "rewards/accuracies": 0.890625, "rewards/chosen": -0.897739052772522, "rewards/margins": 0.7060097455978394, "rewards/rejected": -1.6037487983703613, "step": 536 }, { "epoch": 0.788546255506608, "epsilon_dpo/beta": 0.003319115610793233, "epsilon_dpo/beta_margin_grad_mean": -0.37151262164115906, "epsilon_dpo/beta_margin_grad_std": 0.12973767518997192, "epsilon_dpo/beta_margin_mean": 0.5708633661270142, "epsilon_dpo/beta_margin_std": 0.6028314232826233, "epsilon_dpo/loss_margin_mean": 172.84255981445312, "grad_norm": 38.036041259765625, "kl/avg_steps": 0.65625, "kl/beta": 0.0033407071605324745, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.611477514857114e-08, "logits/chosen": 0.6901233196258545, "logits/rejected": 1.4581618309020996, "logps/chosen": -339.01446533203125, "logps/ref_chosen": -57.747474670410156, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -524.5479125976562, "loss": 0.9763, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9351907968521118, "rewards/margins": 0.5708633661270142, "rewards/rejected": -1.506054162979126, "step": 537 }, { "epoch": 0.7900146842878121, "epsilon_dpo/beta": 0.003297475865110755, "epsilon_dpo/beta_margin_grad_mean": -0.37028956413269043, "epsilon_dpo/beta_margin_grad_std": 0.1146416962146759, "epsilon_dpo/beta_margin_mean": 0.5704164505004883, "epsilon_dpo/beta_margin_std": 0.5363638401031494, "epsilon_dpo/loss_margin_mean": 173.74566650390625, "grad_norm": 40.7779541015625, "kl/avg_steps": 0.65625, "kl/beta": 0.003318926552310586, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.524777069483525e-08, "logits/chosen": 0.6868191957473755, "logits/rejected": 1.1349079608917236, "logps/chosen": -380.8131103515625, "logps/ref_chosen": -66.41593933105469, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -572.3709106445312, "loss": 0.9593, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0381038188934326, "rewards/margins": 0.5704164505004883, "rewards/rejected": -1.608520269393921, "step": 538 }, { "epoch": 0.7914831130690162, "epsilon_dpo/beta": 0.003277007956057787, "epsilon_dpo/beta_margin_grad_mean": -0.37952712178230286, "epsilon_dpo/beta_margin_grad_std": 0.1332114338874817, "epsilon_dpo/beta_margin_mean": 0.5355409383773804, "epsilon_dpo/beta_margin_std": 0.6017335057258606, "epsilon_dpo/loss_margin_mean": 164.367431640625, "grad_norm": 26.899112701416016, "kl/avg_steps": 0.625, "kl/beta": 0.00329728820361197, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.438563463416221e-08, "logits/chosen": 0.6796512603759766, "logits/rejected": 1.075002670288086, "logps/chosen": -358.3330383300781, "logps/ref_chosen": -58.49285125732422, "logps/ref_rejected": -91.85395812988281, "logps/rejected": -556.0615234375, "loss": 1.0036, "rewards/accuracies": 0.78125, "rewards/chosen": -0.98396897315979, "rewards/margins": 0.5355409383773804, "rewards/rejected": -1.5195097923278809, "step": 539 }, { "epoch": 0.7929515418502202, "epsilon_dpo/beta": 0.0032546056900173426, "epsilon_dpo/beta_margin_grad_mean": -0.3410680592060089, "epsilon_dpo/beta_margin_grad_std": 0.1335648149251938, "epsilon_dpo/beta_margin_mean": 0.7191876769065857, "epsilon_dpo/beta_margin_std": 0.6394471526145935, "epsilon_dpo/loss_margin_mean": 221.92811584472656, "grad_norm": 40.84651565551758, "kl/avg_steps": 0.6875, "kl/beta": 0.003276808187365532, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.352838968463919e-08, "logits/chosen": 0.4540940523147583, "logits/rejected": 0.7001627683639526, "logps/chosen": -345.60693359375, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.43000030517578, "logps/rejected": -620.4825439453125, "loss": 0.8811, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9193914532661438, "rewards/margins": 0.7191876769065857, "rewards/rejected": -1.6385791301727295, "step": 540 }, { "epoch": 0.7944199706314243, "epsilon_dpo/beta": 0.0032384854275733232, "epsilon_dpo/beta_margin_grad_mean": -0.39354854822158813, "epsilon_dpo/beta_margin_grad_std": 0.129751518368721, "epsilon_dpo/beta_margin_mean": 0.47249120473861694, "epsilon_dpo/beta_margin_std": 0.5960127711296082, "epsilon_dpo/loss_margin_mean": 147.02890014648438, "grad_norm": 40.98292541503906, "kl/avg_steps": 0.5, "kl/beta": 0.003254433861002326, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.267605843546767e-08, "logits/chosen": 0.5124396085739136, "logits/rejected": 0.808890163898468, "logps/chosen": -419.98681640625, "logps/ref_chosen": -78.28035736083984, "logps/ref_rejected": -103.273681640625, "logps/rejected": -592.009033203125, "loss": 1.0489, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1090798377990723, "rewards/margins": 0.47249120473861694, "rewards/rejected": -1.581571102142334, "step": 541 }, { "epoch": 0.7958883994126285, "epsilon_dpo/beta": 0.0032183255534619093, "epsilon_dpo/beta_margin_grad_mean": -0.3658785820007324, "epsilon_dpo/beta_margin_grad_std": 0.1336062103509903, "epsilon_dpo/beta_margin_mean": 0.5983901023864746, "epsilon_dpo/beta_margin_std": 0.6134870052337646, "epsilon_dpo/loss_margin_mean": 186.9104766845703, "grad_norm": 37.7127685546875, "kl/avg_steps": 0.625, "kl/beta": 0.0032382425852119923, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.182866334636888e-08, "logits/chosen": 0.8491047024726868, "logits/rejected": 0.9811897873878479, "logps/chosen": -382.3415222167969, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -608.2420654296875, "loss": 0.9599, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0470104217529297, "rewards/margins": 0.5983901023864746, "rewards/rejected": -1.6454004049301147, "step": 542 }, { "epoch": 0.7973568281938326, "epsilon_dpo/beta": 0.0032033645547926426, "epsilon_dpo/beta_margin_grad_mean": -0.40789729356765747, "epsilon_dpo/beta_margin_grad_std": 0.13657322525978088, "epsilon_dpo/beta_margin_mean": 0.4165812134742737, "epsilon_dpo/beta_margin_std": 0.641452431678772, "epsilon_dpo/loss_margin_mean": 131.22235107421875, "grad_norm": 37.1438102722168, "kl/avg_steps": 0.46875, "kl/beta": 0.003218129277229309, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.098622674699147e-08, "logits/chosen": 0.5867632627487183, "logits/rejected": 0.6698395013809204, "logps/chosen": -391.06011962890625, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -567.263916015625, "loss": 1.1045, "rewards/accuracies": 0.75, "rewards/chosen": -1.0601210594177246, "rewards/margins": 0.4165812134742737, "rewards/rejected": -1.476702332496643, "step": 543 }, { "epoch": 0.7988252569750367, "epsilon_dpo/beta": 0.0031774069648236036, "epsilon_dpo/beta_margin_grad_mean": -0.3559897243976593, "epsilon_dpo/beta_margin_grad_std": 0.1129704937338829, "epsilon_dpo/beta_margin_mean": 0.6417028307914734, "epsilon_dpo/beta_margin_std": 0.5646976828575134, "epsilon_dpo/loss_margin_mean": 202.46644592285156, "grad_norm": 44.15922164916992, "kl/avg_steps": 0.8125, "kl/beta": 0.003203114727512002, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 6.01487708363232e-08, "logits/chosen": 0.7706766128540039, "logits/rejected": 0.9171621799468994, "logps/chosen": -404.3023681640625, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -648.0811767578125, "loss": 0.9121, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0956295728683472, "rewards/margins": 0.6417028307914734, "rewards/rejected": -1.7373324632644653, "step": 544 }, { "epoch": 0.8002936857562408, "epsilon_dpo/beta": 0.003156262217089534, "epsilon_dpo/beta_margin_grad_mean": -0.36415934562683105, "epsilon_dpo/beta_margin_grad_std": 0.11954941600561142, "epsilon_dpo/beta_margin_mean": 0.5967934727668762, "epsilon_dpo/beta_margin_std": 0.5416412949562073, "epsilon_dpo/loss_margin_mean": 189.89364624023438, "grad_norm": 35.48500442504883, "kl/avg_steps": 0.671875, "kl/beta": 0.003177299164235592, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.9316317682106294e-08, "logits/chosen": 0.6420180201530457, "logits/rejected": 1.0094183683395386, "logps/chosen": -396.3116455078125, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -614.4647216796875, "loss": 0.9431, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0389024019241333, "rewards/margins": 0.5967934727668762, "rewards/rejected": -1.6356959342956543, "step": 545 }, { "epoch": 0.801762114537445, "epsilon_dpo/beta": 0.0031297774985432625, "epsilon_dpo/beta_margin_grad_mean": -0.3827662765979767, "epsilon_dpo/beta_margin_grad_std": 0.10833470523357391, "epsilon_dpo/beta_margin_mean": 0.5002341270446777, "epsilon_dpo/beta_margin_std": 0.4755844473838806, "epsilon_dpo/loss_margin_mean": 160.34983825683594, "grad_norm": 31.855754852294922, "kl/avg_steps": 0.84375, "kl/beta": 0.0031560941133648157, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 5.848888922025552e-08, "logits/chosen": 0.6947014927864075, "logits/rejected": 1.0679924488067627, "logps/chosen": -352.6844482421875, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -544.15625, "loss": 1.0004, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9460525512695312, "rewards/margins": 0.5002341270446777, "rewards/rejected": -1.446286678314209, "step": 546 }, { "epoch": 0.8032305433186491, "epsilon_dpo/beta": 0.0031143503729254007, "epsilon_dpo/beta_margin_grad_mean": -0.3933895230293274, "epsilon_dpo/beta_margin_grad_std": 0.13605713844299316, "epsilon_dpo/beta_margin_mean": 0.4776236116886139, "epsilon_dpo/beta_margin_std": 0.6284298300743103, "epsilon_dpo/loss_margin_mean": 154.54000854492188, "grad_norm": 44.819637298583984, "kl/avg_steps": 0.5, "kl/beta": 0.0031296873930841684, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.7666507254280265e-08, "logits/chosen": 0.5689883232116699, "logits/rejected": 1.1067110300064087, "logps/chosen": -430.132080078125, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -601.7457275390625, "loss": 1.0537, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1123816967010498, "rewards/margins": 0.4776236414909363, "rewards/rejected": -1.5900053977966309, "step": 547 }, { "epoch": 0.8046989720998532, "epsilon_dpo/beta": 0.003094471525400877, "epsilon_dpo/beta_margin_grad_mean": -0.3770330846309662, "epsilon_dpo/beta_margin_grad_std": 0.1237437054514885, "epsilon_dpo/beta_margin_mean": 0.5523515343666077, "epsilon_dpo/beta_margin_std": 0.6105153560638428, "epsilon_dpo/loss_margin_mean": 179.31539916992188, "grad_norm": 30.926790237426758, "kl/avg_steps": 0.640625, "kl/beta": 0.003114116843789816, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.684919345471029e-08, "logits/chosen": 0.6595076322555542, "logits/rejected": 0.9591635465621948, "logps/chosen": -409.63128662109375, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -617.8184204101562, "loss": 0.9884, "rewards/accuracies": 0.828125, "rewards/chosen": -1.066938877105713, "rewards/margins": 0.5523515343666077, "rewards/rejected": -1.6192903518676758, "step": 548 }, { "epoch": 0.8061674008810573, "epsilon_dpo/beta": 0.003075262298807502, "epsilon_dpo/beta_margin_grad_mean": -0.38762035965919495, "epsilon_dpo/beta_margin_grad_std": 0.12105090171098709, "epsilon_dpo/beta_margin_mean": 0.4936157763004303, "epsilon_dpo/beta_margin_std": 0.5551433563232422, "epsilon_dpo/loss_margin_mean": 161.42079162597656, "grad_norm": 41.4873046875, "kl/avg_steps": 0.625, "kl/beta": 0.0030942941084504128, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.603696935852426e-08, "logits/chosen": 0.9439055919647217, "logits/rejected": 1.4650170803070068, "logps/chosen": -403.68853759765625, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -589.8072509765625, "loss": 1.0222, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0920209884643555, "rewards/margins": 0.4936157763004303, "rewards/rejected": -1.585636854171753, "step": 549 }, { "epoch": 0.8076358296622613, "epsilon_dpo/beta": 0.0030513559468090534, "epsilon_dpo/beta_margin_grad_mean": -0.39734750986099243, "epsilon_dpo/beta_margin_grad_std": 0.11439433693885803, "epsilon_dpo/beta_margin_mean": 0.43822336196899414, "epsilon_dpo/beta_margin_std": 0.49982836842536926, "epsilon_dpo/loss_margin_mean": 144.28887939453125, "grad_norm": 47.187591552734375, "kl/avg_steps": 0.78125, "kl/beta": 0.0030750748701393604, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 5.5229856368582376e-08, "logits/chosen": 0.7585632801055908, "logits/rejected": 0.8104821443557739, "logps/chosen": -407.1593017578125, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -589.76708984375, "loss": 1.0542, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0703229904174805, "rewards/margins": 0.43822336196899414, "rewards/rejected": -1.5085463523864746, "step": 550 }, { "epoch": 0.8091042584434655, "epsilon_dpo/beta": 0.003028655657544732, "epsilon_dpo/beta_margin_grad_mean": -0.3529271185398102, "epsilon_dpo/beta_margin_grad_std": 0.12017516791820526, "epsilon_dpo/beta_margin_mean": 0.6479488611221313, "epsilon_dpo/beta_margin_std": 0.5548712015151978, "epsilon_dpo/loss_margin_mean": 214.7172393798828, "grad_norm": 36.151920318603516, "kl/avg_steps": 0.75, "kl/beta": 0.0030512369703501463, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 5.4427875753062734e-08, "logits/chosen": 0.6678426265716553, "logits/rejected": 0.7555434703826904, "logps/chosen": -406.9227294921875, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -674.2064208984375, "loss": 0.9094, "rewards/accuracies": 0.875, "rewards/chosen": -1.0539422035217285, "rewards/margins": 0.6479488611221313, "rewards/rejected": -1.7018909454345703, "step": 551 }, { "epoch": 0.8105726872246696, "epsilon_dpo/beta": 0.00300705642439425, "epsilon_dpo/beta_margin_grad_mean": -0.3421294391155243, "epsilon_dpo/beta_margin_grad_std": 0.1339496374130249, "epsilon_dpo/beta_margin_mean": 0.7299196124076843, "epsilon_dpo/beta_margin_std": 0.6780387163162231, "epsilon_dpo/loss_margin_mean": 243.58790588378906, "grad_norm": 31.587860107421875, "kl/avg_steps": 0.71875, "kl/beta": 0.003028523176908493, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.363104864490034e-08, "logits/chosen": 0.6872535943984985, "logits/rejected": 0.9237173795700073, "logps/chosen": -399.505126953125, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -685.300537109375, "loss": 0.8808, "rewards/accuracies": 0.875, "rewards/chosen": -1.0149754285812378, "rewards/margins": 0.7299196720123291, "rewards/rejected": -1.7448949813842773, "step": 552 }, { "epoch": 0.8120411160058737, "epsilon_dpo/beta": 0.002990296110510826, "epsilon_dpo/beta_margin_grad_mean": -0.38657346367836, "epsilon_dpo/beta_margin_grad_std": 0.12158539891242981, "epsilon_dpo/beta_margin_mean": 0.49795299768447876, "epsilon_dpo/beta_margin_std": 0.5614814162254333, "epsilon_dpo/loss_margin_mean": 167.6130828857422, "grad_norm": 42.0295295715332, "kl/avg_steps": 0.5625, "kl/beta": 0.0030069109052419662, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5.2839396041230415e-08, "logits/chosen": 0.7896755933761597, "logits/rejected": 1.1973187923431396, "logps/chosen": -426.24334716796875, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -623.6947631835938, "loss": 1.02, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0723387002944946, "rewards/margins": 0.49795299768447876, "rewards/rejected": -1.5702916383743286, "step": 553 }, { "epoch": 0.8135095447870778, "epsilon_dpo/beta": 0.002969831693917513, "epsilon_dpo/beta_margin_grad_mean": -0.36070722341537476, "epsilon_dpo/beta_margin_grad_std": 0.13217829167842865, "epsilon_dpo/beta_margin_mean": 0.615236759185791, "epsilon_dpo/beta_margin_std": 0.603321373462677, "epsilon_dpo/loss_margin_mean": 208.22732543945312, "grad_norm": 31.18020248413086, "kl/avg_steps": 0.6875, "kl/beta": 0.002990091685205698, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.205293880283551e-08, "logits/chosen": 0.4497915804386139, "logits/rejected": 1.1443912982940674, "logps/chosen": -418.65570068359375, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -648.7180786132812, "loss": 0.9457, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0432777404785156, "rewards/margins": 0.615236759185791, "rewards/rejected": -1.6585144996643066, "step": 554 }, { "epoch": 0.8149779735682819, "epsilon_dpo/beta": 0.002950481604784727, "epsilon_dpo/beta_margin_grad_mean": -0.36732053756713867, "epsilon_dpo/beta_margin_grad_std": 0.1373441517353058, "epsilon_dpo/beta_margin_mean": 0.6067565083503723, "epsilon_dpo/beta_margin_std": 0.6645219326019287, "epsilon_dpo/loss_margin_mean": 206.6040802001953, "grad_norm": 33.9810791015625, "kl/avg_steps": 0.65625, "kl/beta": 0.0029696752317249775, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.127169765359515e-08, "logits/chosen": 0.8876933455467224, "logits/rejected": 0.9146698713302612, "logps/chosen": -409.0677795410156, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -670.8207397460938, "loss": 0.965, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0504544973373413, "rewards/margins": 0.6067564487457275, "rewards/rejected": -1.6572110652923584, "step": 555 }, { "epoch": 0.8164464023494861, "epsilon_dpo/beta": 0.0029330896213650703, "epsilon_dpo/beta_margin_grad_mean": -0.3955794870853424, "epsilon_dpo/beta_margin_grad_std": 0.12321365624666214, "epsilon_dpo/beta_margin_mean": 0.4681977927684784, "epsilon_dpo/beta_margin_std": 0.5896342992782593, "epsilon_dpo/loss_margin_mean": 160.55120849609375, "grad_norm": 44.372745513916016, "kl/avg_steps": 0.59375, "kl/beta": 0.0029503137338906527, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.049569317994012e-08, "logits/chosen": 0.7680181860923767, "logits/rejected": 0.9773924350738525, "logps/chosen": -434.9501953125, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -638.1973266601562, "loss": 1.0484, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1052610874176025, "rewards/margins": 0.4681978225708008, "rewards/rejected": -1.5734589099884033, "step": 556 }, { "epoch": 0.8179148311306902, "epsilon_dpo/beta": 0.002912110649049282, "epsilon_dpo/beta_margin_grad_mean": -0.3733898401260376, "epsilon_dpo/beta_margin_grad_std": 0.1256464719772339, "epsilon_dpo/beta_margin_mean": 0.5663333535194397, "epsilon_dpo/beta_margin_std": 0.5976885557174683, "epsilon_dpo/loss_margin_mean": 195.2670135498047, "grad_norm": 49.433834075927734, "kl/avg_steps": 0.71875, "kl/beta": 0.002932899631559849, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.9724945830310144e-08, "logits/chosen": 0.6856008768081665, "logits/rejected": 0.8689923286437988, "logps/chosen": -452.6092224121094, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93966674804688, "logps/rejected": -689.9752197265625, "loss": 0.9768, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1214444637298584, "rewards/margins": 0.5663332939147949, "rewards/rejected": -1.6877777576446533, "step": 557 }, { "epoch": 0.8193832599118943, "epsilon_dpo/beta": 0.0028895088471472263, "epsilon_dpo/beta_margin_grad_mean": -0.3464803695678711, "epsilon_dpo/beta_margin_grad_std": 0.13288140296936035, "epsilon_dpo/beta_margin_mean": 0.6969226598739624, "epsilon_dpo/beta_margin_std": 0.6406014561653137, "epsilon_dpo/loss_margin_mean": 241.95303344726562, "grad_norm": 32.371253967285156, "kl/avg_steps": 0.78125, "kl/beta": 0.0029119697865098715, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.8959475914614554e-08, "logits/chosen": 0.6253798007965088, "logits/rejected": 0.9323130249977112, "logps/chosen": -407.76385498046875, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -689.5096435546875, "loss": 0.8958, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9991766214370728, "rewards/margins": 0.6969226598739624, "rewards/rejected": -1.6960992813110352, "step": 558 }, { "epoch": 0.8208516886930984, "epsilon_dpo/beta": 0.0028689156752079725, "epsilon_dpo/beta_margin_grad_mean": -0.38074734807014465, "epsilon_dpo/beta_margin_grad_std": 0.11717140674591064, "epsilon_dpo/beta_margin_mean": 0.5243265628814697, "epsilon_dpo/beta_margin_std": 0.5393105149269104, "epsilon_dpo/loss_margin_mean": 183.4691925048828, "grad_norm": 33.07487869262695, "kl/avg_steps": 0.71875, "kl/beta": 0.0028893963899463415, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.8199303603697614e-08, "logits/chosen": 0.6320044994354248, "logits/rejected": 1.1396799087524414, "logps/chosen": -447.97857666015625, "logps/ref_chosen": -60.75232696533203, "logps/ref_rejected": -93.4422836303711, "logps/rejected": -664.1376953125, "loss": 0.9949, "rewards/accuracies": 0.859375, "rewards/chosen": -1.111776351928711, "rewards/margins": 0.5243265628814697, "rewards/rejected": -1.6361029148101807, "step": 559 }, { "epoch": 0.8223201174743024, "epsilon_dpo/beta": 0.0028515763115137815, "epsilon_dpo/beta_margin_grad_mean": -0.41036349534988403, "epsilon_dpo/beta_margin_grad_std": 0.09816108644008636, "epsilon_dpo/beta_margin_mean": 0.38115882873535156, "epsilon_dpo/beta_margin_std": 0.42943939566612244, "epsilon_dpo/loss_margin_mean": 134.3894805908203, "grad_norm": 31.247947692871094, "kl/avg_steps": 0.609375, "kl/beta": 0.0028687771409749985, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.7444448928806615e-08, "logits/chosen": 0.7274857759475708, "logits/rejected": 1.293564796447754, "logps/chosen": -398.22711181640625, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -554.5040283203125, "loss": 1.0843, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9707115888595581, "rewards/margins": 0.38115882873535156, "rewards/rejected": -1.3518704175949097, "step": 560 }, { "epoch": 0.8237885462555066, "epsilon_dpo/beta": 0.0028329724445939064, "epsilon_dpo/beta_margin_grad_mean": -0.40248867869377136, "epsilon_dpo/beta_margin_grad_std": 0.11467036604881287, "epsilon_dpo/beta_margin_mean": 0.41967105865478516, "epsilon_dpo/beta_margin_std": 0.5235335826873779, "epsilon_dpo/loss_margin_mean": 148.99917602539062, "grad_norm": 31.213546752929688, "kl/avg_steps": 0.65625, "kl/beta": 0.0028514014557003975, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.669493178106432e-08, "logits/chosen": 1.0767673254013062, "logits/rejected": 1.1919467449188232, "logps/chosen": -434.6761169433594, "logps/ref_chosen": -50.91287612915039, "logps/ref_rejected": -99.06857299804688, "logps/rejected": -631.8309936523438, "loss": 1.0728, "rewards/accuracies": 0.828125, "rewards/chosen": -1.087891936302185, "rewards/margins": 0.41967105865478516, "rewards/rejected": -1.5075631141662598, "step": 561 }, { "epoch": 0.8252569750367107, "epsilon_dpo/beta": 0.0028145022224634886, "epsilon_dpo/beta_margin_grad_mean": -0.3710513412952423, "epsilon_dpo/beta_margin_grad_std": 0.1203143373131752, "epsilon_dpo/beta_margin_mean": 0.57718825340271, "epsilon_dpo/beta_margin_std": 0.5860868692398071, "epsilon_dpo/loss_margin_mean": 205.9770050048828, "grad_norm": 27.3686580657959, "kl/avg_steps": 0.65625, "kl/beta": 0.0028328110929578543, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.5950771910944596e-08, "logits/chosen": 0.7035528421401978, "logits/rejected": 1.2065534591674805, "logps/chosen": -408.2765808105469, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -651.3318481445312, "loss": 0.9645, "rewards/accuracies": 0.875, "rewards/chosen": -0.9833180904388428, "rewards/margins": 0.57718825340271, "rewards/rejected": -1.5605063438415527, "step": 562 }, { "epoch": 0.8267254038179148, "epsilon_dpo/beta": 0.0027987912762910128, "epsilon_dpo/beta_margin_grad_mean": -0.3996942639350891, "epsilon_dpo/beta_margin_grad_std": 0.12220650911331177, "epsilon_dpo/beta_margin_mean": 0.4326484203338623, "epsilon_dpo/beta_margin_std": 0.5346004962921143, "epsilon_dpo/loss_margin_mean": 155.6993408203125, "grad_norm": 31.659542083740234, "kl/avg_steps": 0.5625, "kl/beta": 0.0028143420349806547, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.521198892775202e-08, "logits/chosen": 0.9191850423812866, "logits/rejected": 1.1130412817001343, "logps/chosen": -437.1990966796875, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -626.8579711914062, "loss": 1.0668, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0559725761413574, "rewards/margins": 0.4326484203338623, "rewards/rejected": -1.4886209964752197, "step": 563 }, { "epoch": 0.8281938325991189, "epsilon_dpo/beta": 0.0027761387173086405, "epsilon_dpo/beta_margin_grad_mean": -0.3738451898097992, "epsilon_dpo/beta_margin_grad_std": 0.09065651148557663, "epsilon_dpo/beta_margin_mean": 0.5356013178825378, "epsilon_dpo/beta_margin_std": 0.400540828704834, "epsilon_dpo/loss_margin_mean": 193.3629608154297, "grad_norm": 25.625314712524414, "kl/avg_steps": 0.8125, "kl/beta": 0.0027985998895019293, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.447860229910544e-08, "logits/chosen": 0.6433418393135071, "logits/rejected": 1.1163069009780884, "logps/chosen": -418.558837890625, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.2381820678711, "logps/rejected": -630.8916015625, "loss": 0.9584, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9560706615447998, "rewards/margins": 0.5356012582778931, "rewards/rejected": -1.4916720390319824, "step": 564 }, { "epoch": 0.8296622613803231, "epsilon_dpo/beta": 0.0027624403592199087, "epsilon_dpo/beta_margin_grad_mean": -0.3863063454627991, "epsilon_dpo/beta_margin_grad_std": 0.13359439373016357, "epsilon_dpo/beta_margin_mean": 0.5184347033500671, "epsilon_dpo/beta_margin_std": 0.6451285481452942, "epsilon_dpo/loss_margin_mean": 188.94174194335938, "grad_norm": 25.23392105102539, "kl/avg_steps": 0.5, "kl/beta": 0.0027760444208979607, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.375063135042445e-08, "logits/chosen": 0.6726326942443848, "logits/rejected": 1.2626471519470215, "logps/chosen": -433.564453125, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -639.2651977539062, "loss": 1.0243, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0082712173461914, "rewards/margins": 0.5184347033500671, "rewards/rejected": -1.5267059803009033, "step": 565 }, { "epoch": 0.8311306901615272, "epsilon_dpo/beta": 0.0027512870728969574, "epsilon_dpo/beta_margin_grad_mean": -0.3903193771839142, "epsilon_dpo/beta_margin_grad_std": 0.1350727379322052, "epsilon_dpo/beta_margin_mean": 0.48427921533584595, "epsilon_dpo/beta_margin_std": 0.5979759097099304, "epsilon_dpo/loss_margin_mean": 177.55999755859375, "grad_norm": 34.45583724975586, "kl/avg_steps": 0.40625, "kl/beta": 0.0027622333727777004, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.3028095264420525e-08, "logits/chosen": 0.9276168942451477, "logits/rejected": 1.0409126281738281, "logps/chosen": -414.1277770996094, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86931610107422, "logps/rejected": -629.0117797851562, "loss": 1.0425, "rewards/accuracies": 0.71875, "rewards/chosen": -0.958074688911438, "rewards/margins": 0.48427921533584595, "rewards/rejected": -1.4423538446426392, "step": 566 }, { "epoch": 0.8325991189427313, "epsilon_dpo/beta": 0.0027367155998945236, "epsilon_dpo/beta_margin_grad_mean": -0.38993099331855774, "epsilon_dpo/beta_margin_grad_std": 0.12341343611478806, "epsilon_dpo/beta_margin_mean": 0.4824931025505066, "epsilon_dpo/beta_margin_std": 0.55397629737854, "epsilon_dpo/loss_margin_mean": 177.4494171142578, "grad_norm": 30.24207305908203, "kl/avg_steps": 0.53125, "kl/beta": 0.0027510570362210274, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.231101308059165e-08, "logits/chosen": 0.8174692392349243, "logits/rejected": 1.4295332431793213, "logps/chosen": -399.6371154785156, "logps/ref_chosen": -52.858299255371094, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -609.5991821289062, "loss": 1.0315, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9509602785110474, "rewards/margins": 0.4824931025505066, "rewards/rejected": -1.4334534406661987, "step": 567 }, { "epoch": 0.8340675477239354, "epsilon_dpo/beta": 0.002716267015784979, "epsilon_dpo/beta_margin_grad_mean": -0.3715296983718872, "epsilon_dpo/beta_margin_grad_std": 0.11285816133022308, "epsilon_dpo/beta_margin_mean": 0.558646559715271, "epsilon_dpo/beta_margin_std": 0.5174387693405151, "epsilon_dpo/loss_margin_mean": 206.4679718017578, "grad_norm": 25.43803596496582, "kl/avg_steps": 0.75, "kl/beta": 0.0027365193236619234, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.1599403694720145e-08, "logits/chosen": 1.0080702304840088, "logits/rejected": 1.1029735803604126, "logps/chosen": -359.6253967285156, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236145019531, "logps/rejected": -609.9932861328125, "loss": 0.9646, "rewards/accuracies": 0.875, "rewards/chosen": -0.8550735712051392, "rewards/margins": 0.558646559715271, "rewards/rejected": -1.4137201309204102, "step": 568 }, { "epoch": 0.8355359765051396, "epsilon_dpo/beta": 0.002696895506232977, "epsilon_dpo/beta_margin_grad_mean": -0.3962607979774475, "epsilon_dpo/beta_margin_grad_std": 0.12354662269353867, "epsilon_dpo/beta_margin_mean": 0.4480724334716797, "epsilon_dpo/beta_margin_std": 0.5613905191421509, "epsilon_dpo/loss_margin_mean": 167.05653381347656, "grad_norm": 40.66030502319336, "kl/avg_steps": 0.71875, "kl/beta": 0.0027161482721567154, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.089328585837512e-08, "logits/chosen": 0.8680237531661987, "logits/rejected": 1.4851438999176025, "logps/chosen": -424.75897216796875, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -607.1982421875, "loss": 1.06, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9750781059265137, "rewards/margins": 0.4480724334716797, "rewards/rejected": -1.4231505393981934, "step": 569 }, { "epoch": 0.8370044052863436, "epsilon_dpo/beta": 0.0026784928049892187, "epsilon_dpo/beta_margin_grad_mean": -0.3771604001522064, "epsilon_dpo/beta_margin_grad_std": 0.10708852857351303, "epsilon_dpo/beta_margin_mean": 0.5304939150810242, "epsilon_dpo/beta_margin_std": 0.47928735613822937, "epsilon_dpo/loss_margin_mean": 198.85386657714844, "grad_norm": 29.28664779663086, "kl/avg_steps": 0.6875, "kl/beta": 0.0026967651210725307, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.019267817841834e-08, "logits/chosen": 0.734595775604248, "logits/rejected": 1.25980544090271, "logps/chosen": -394.47503662109375, "logps/ref_chosen": -61.61454772949219, "logps/ref_rejected": -82.1418685913086, "logps/rejected": -613.856201171875, "loss": 0.9776, "rewards/accuracies": 0.875, "rewards/chosen": -0.8928371667861938, "rewards/margins": 0.530493974685669, "rewards/rejected": -1.4233310222625732, "step": 570 }, { "epoch": 0.8384728340675477, "epsilon_dpo/beta": 0.0026618780102580786, "epsilon_dpo/beta_margin_grad_mean": -0.3846544623374939, "epsilon_dpo/beta_margin_grad_std": 0.12741585075855255, "epsilon_dpo/beta_margin_mean": 0.5052838325500488, "epsilon_dpo/beta_margin_std": 0.578217089176178, "epsilon_dpo/loss_margin_mean": 190.95159912109375, "grad_norm": 30.714710235595703, "kl/avg_steps": 0.625, "kl/beta": 0.002678351476788521, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.9497599116513705e-08, "logits/chosen": 1.0484448671340942, "logits/rejected": 1.1895785331726074, "logps/chosen": -405.9598388671875, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -635.1942138671875, "loss": 1.0201, "rewards/accuracies": 0.84375, "rewards/chosen": -0.941373884677887, "rewards/margins": 0.5052838325500488, "rewards/rejected": -1.446657657623291, "step": 571 }, { "epoch": 0.8399412628487518, "epsilon_dpo/beta": 0.0026478401850908995, "epsilon_dpo/beta_margin_grad_mean": -0.39481183886528015, "epsilon_dpo/beta_margin_grad_std": 0.13346989452838898, "epsilon_dpo/beta_margin_mean": 0.47457355260849, "epsilon_dpo/beta_margin_std": 0.626340925693512, "epsilon_dpo/loss_margin_mean": 180.48153686523438, "grad_norm": 34.774078369140625, "kl/avg_steps": 0.53125, "kl/beta": 0.002661715727299452, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.880806698864086e-08, "logits/chosen": 1.2371206283569336, "logits/rejected": 1.321467399597168, "logps/chosen": -414.22698974609375, "logps/ref_chosen": -48.459285736083984, "logps/ref_rejected": -83.5570297241211, "logps/rejected": -629.8062744140625, "loss": 1.0546, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9693921804428101, "rewards/margins": 0.47457355260849, "rewards/rejected": -1.4439656734466553, "step": 572 }, { "epoch": 0.8414096916299559, "epsilon_dpo/beta": 0.002631365554407239, "epsilon_dpo/beta_margin_grad_mean": -0.38391563296318054, "epsilon_dpo/beta_margin_grad_std": 0.11443734169006348, "epsilon_dpo/beta_margin_mean": 0.5076600313186646, "epsilon_dpo/beta_margin_std": 0.523607611656189, "epsilon_dpo/loss_margin_mean": 193.87985229492188, "grad_norm": 23.62225914001465, "kl/avg_steps": 0.625, "kl/beta": 0.002647650195285678, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.812409996461275e-08, "logits/chosen": 0.9495484828948975, "logits/rejected": 1.3154916763305664, "logps/chosen": -382.74468994140625, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -610.326904296875, "loss": 1.0039, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8721754550933838, "rewards/margins": 0.5076599717140198, "rewards/rejected": -1.3798354864120483, "step": 573 }, { "epoch": 0.8428781204111601, "epsilon_dpo/beta": 0.0026125547010451555, "epsilon_dpo/beta_margin_grad_mean": -0.384895384311676, "epsilon_dpo/beta_margin_grad_std": 0.10996060073375702, "epsilon_dpo/beta_margin_mean": 0.49979686737060547, "epsilon_dpo/beta_margin_std": 0.4993724524974823, "epsilon_dpo/loss_margin_mean": 192.04933166503906, "grad_norm": 31.828584671020508, "kl/avg_steps": 0.71875, "kl/beta": 0.0026312051340937614, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.74457160675965e-08, "logits/chosen": 0.9036248922348022, "logits/rejected": 1.078535795211792, "logps/chosen": -383.24945068359375, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -617.060791015625, "loss": 1.0047, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8691877126693726, "rewards/margins": 0.49979686737060547, "rewards/rejected": -1.3689844608306885, "step": 574 }, { "epoch": 0.8443465491923642, "epsilon_dpo/beta": 0.00259554386138916, "epsilon_dpo/beta_margin_grad_mean": -0.38595253229141235, "epsilon_dpo/beta_margin_grad_std": 0.13600078225135803, "epsilon_dpo/beta_margin_mean": 0.4975544810295105, "epsilon_dpo/beta_margin_std": 0.604264497756958, "epsilon_dpo/loss_margin_mean": 192.96554565429688, "grad_norm": 32.43202590942383, "kl/avg_steps": 0.65625, "kl/beta": 0.0026124282740056515, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.677293317363864e-08, "logits/chosen": 0.7630248069763184, "logits/rejected": 1.0416980981826782, "logps/chosen": -435.67181396484375, "logps/ref_chosen": -71.79014587402344, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -652.2333984375, "loss": 1.0343, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9463493824005127, "rewards/margins": 0.4975544810295105, "rewards/rejected": -1.4439038038253784, "step": 575 }, { "epoch": 0.8458149779735683, "epsilon_dpo/beta": 0.0025802438613027334, "epsilon_dpo/beta_margin_grad_mean": -0.389384925365448, "epsilon_dpo/beta_margin_grad_std": 0.1267186403274536, "epsilon_dpo/beta_margin_mean": 0.4831611216068268, "epsilon_dpo/beta_margin_std": 0.5626578330993652, "epsilon_dpo/loss_margin_mean": 188.4665985107422, "grad_norm": 35.16240692138672, "kl/avg_steps": 0.59375, "kl/beta": 0.00259539601393044, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.6105769011194224e-08, "logits/chosen": 0.9789996147155762, "logits/rejected": 1.0455071926116943, "logps/chosen": -397.0924072265625, "logps/ref_chosen": -54.262969970703125, "logps/ref_rejected": -100.7542724609375, "logps/rejected": -632.0503540039062, "loss": 1.0339, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8859975934028625, "rewards/margins": 0.4831610918045044, "rewards/rejected": -1.3691587448120117, "step": 576 }, { "epoch": 0.8472834067547724, "epsilon_dpo/beta": 0.002560575259849429, "epsilon_dpo/beta_margin_grad_mean": -0.384562611579895, "epsilon_dpo/beta_margin_grad_std": 0.10579453408718109, "epsilon_dpo/beta_margin_mean": 0.4938971996307373, "epsilon_dpo/beta_margin_std": 0.4734703600406647, "epsilon_dpo/loss_margin_mean": 193.65318298339844, "grad_norm": 33.70735168457031, "kl/avg_steps": 0.765625, "kl/beta": 0.0025800769217312336, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.5444241160659304e-08, "logits/chosen": 0.6827180981636047, "logits/rejected": 1.1888189315795898, "logps/chosen": -371.88262939453125, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -587.69677734375, "loss": 1.0041, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7951531410217285, "rewards/margins": 0.4938971996307373, "rewards/rejected": -1.2890503406524658, "step": 577 }, { "epoch": 0.8487518355359766, "epsilon_dpo/beta": 0.002543124370276928, "epsilon_dpo/beta_margin_grad_mean": -0.388141393661499, "epsilon_dpo/beta_margin_grad_std": 0.09937335550785065, "epsilon_dpo/beta_margin_mean": 0.47608840465545654, "epsilon_dpo/beta_margin_std": 0.4362509548664093, "epsilon_dpo/loss_margin_mean": 187.98797607421875, "grad_norm": 22.605552673339844, "kl/avg_steps": 0.6875, "kl/beta": 0.0025604732800275087, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.478836705390808e-08, "logits/chosen": 1.0281192064285278, "logits/rejected": 1.186118721961975, "logps/chosen": -368.55120849609375, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.43626403808594, "logps/rejected": -590.7117309570312, "loss": 1.0104, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8131706714630127, "rewards/margins": 0.47608840465545654, "rewards/rejected": -1.2892590761184692, "step": 578 }, { "epoch": 0.8502202643171806, "epsilon_dpo/beta": 0.002526554511860013, "epsilon_dpo/beta_margin_grad_mean": -0.40730589628219604, "epsilon_dpo/beta_margin_grad_std": 0.09855158627033234, "epsilon_dpo/beta_margin_mean": 0.3938119113445282, "epsilon_dpo/beta_margin_std": 0.43337389826774597, "epsilon_dpo/loss_margin_mean": 156.7061309814453, "grad_norm": 40.55356216430664, "kl/avg_steps": 0.65625, "kl/beta": 0.0025429902598261833, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.41381639738331e-08, "logits/chosen": 0.8365261554718018, "logits/rejected": 0.9743883609771729, "logps/chosen": -383.7625427246094, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -576.3704833984375, "loss": 1.0747, "rewards/accuracies": 0.84375, "rewards/chosen": -0.821784257888794, "rewards/margins": 0.3938118815422058, "rewards/rejected": -1.2155961990356445, "step": 579 }, { "epoch": 0.8516886930983847, "epsilon_dpo/beta": 0.0025069238618016243, "epsilon_dpo/beta_margin_grad_mean": -0.3790266513824463, "epsilon_dpo/beta_margin_grad_std": 0.1196211501955986, "epsilon_dpo/beta_margin_mean": 0.5311000347137451, "epsilon_dpo/beta_margin_std": 0.559917688369751, "epsilon_dpo/loss_margin_mean": 212.6781768798828, "grad_norm": 20.837158203125, "kl/avg_steps": 0.78125, "kl/beta": 0.002526410622522235, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.349364905389032e-08, "logits/chosen": 0.8197837471961975, "logits/rejected": 1.0574215650558472, "logps/chosen": -326.85235595703125, "logps/ref_chosen": -48.70684051513672, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -572.5821533203125, "loss": 0.9942, "rewards/accuracies": 0.890625, "rewards/chosen": -0.698243260383606, "rewards/margins": 0.5311000347137451, "rewards/rejected": -1.2293434143066406, "step": 580 }, { "epoch": 0.8531571218795888, "epsilon_dpo/beta": 0.0024890571366995573, "epsilon_dpo/beta_margin_grad_mean": -0.40003034472465515, "epsilon_dpo/beta_margin_grad_std": 0.10129855573177338, "epsilon_dpo/beta_margin_mean": 0.4284476041793823, "epsilon_dpo/beta_margin_std": 0.4555365741252899, "epsilon_dpo/loss_margin_mean": 172.8662872314453, "grad_norm": 28.829118728637695, "kl/avg_steps": 0.71875, "kl/beta": 0.0025068260729312897, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.285483927764726e-08, "logits/chosen": 0.8677518963813782, "logits/rejected": 1.1416388750076294, "logps/chosen": -405.48883056640625, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -607.868408203125, "loss": 1.0508, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8552682399749756, "rewards/margins": 0.4284476041793823, "rewards/rejected": -1.283715844154358, "step": 581 }, { "epoch": 0.8546255506607929, "epsilon_dpo/beta": 0.002474406035616994, "epsilon_dpo/beta_margin_grad_mean": -0.38045981526374817, "epsilon_dpo/beta_margin_grad_std": 0.12385688722133636, "epsilon_dpo/beta_margin_mean": 0.5223144888877869, "epsilon_dpo/beta_margin_std": 0.559083104133606, "epsilon_dpo/loss_margin_mean": 212.2982177734375, "grad_norm": 28.197917938232422, "kl/avg_steps": 0.59375, "kl/beta": 0.0024889367632567883, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.222175147833556e-08, "logits/chosen": 0.9349786639213562, "logits/rejected": 0.860551118850708, "logps/chosen": -382.42120361328125, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -646.5603637695312, "loss": 1.0025, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8031030893325806, "rewards/margins": 0.5223144888877869, "rewards/rejected": -1.3254175186157227, "step": 582 }, { "epoch": 0.856093979441997, "epsilon_dpo/beta": 0.0024605742655694485, "epsilon_dpo/beta_margin_grad_mean": -0.4143773019313812, "epsilon_dpo/beta_margin_grad_std": 0.11367323249578476, "epsilon_dpo/beta_margin_mean": 0.3690873384475708, "epsilon_dpo/beta_margin_std": 0.502434253692627, "epsilon_dpo/loss_margin_mean": 151.1329803466797, "grad_norm": 31.24814224243164, "kl/avg_steps": 0.5625, "kl/beta": 0.00247424584813416, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.159440233840763e-08, "logits/chosen": 0.8064649105072021, "logits/rejected": 1.2051582336425781, "logps/chosen": -398.5650634765625, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -581.2390747070312, "loss": 1.1098, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8417296409606934, "rewards/margins": 0.3690873384475708, "rewards/rejected": -1.2108169794082642, "step": 583 }, { "epoch": 0.8575624082232012, "epsilon_dpo/beta": 0.002442197408527136, "epsilon_dpo/beta_margin_grad_mean": -0.3627620041370392, "epsilon_dpo/beta_margin_grad_std": 0.10778437554836273, "epsilon_dpo/beta_margin_mean": 0.5972607731819153, "epsilon_dpo/beta_margin_std": 0.4922870397567749, "epsilon_dpo/loss_margin_mean": 245.3231964111328, "grad_norm": 28.720809936523438, "kl/avg_steps": 0.75, "kl/beta": 0.0024604061618447304, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.0972808389096635e-08, "logits/chosen": 0.7612216472625732, "logits/rejected": 1.1449096202850342, "logps/chosen": -366.0911865234375, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -652.1497802734375, "loss": 0.9309, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7563260793685913, "rewards/margins": 0.5972607135772705, "rewards/rejected": -1.3535869121551514, "step": 584 }, { "epoch": 0.8590308370044053, "epsilon_dpo/beta": 0.0024301232770085335, "epsilon_dpo/beta_margin_grad_mean": -0.3972703516483307, "epsilon_dpo/beta_margin_grad_std": 0.11508132517337799, "epsilon_dpo/beta_margin_mean": 0.4482652544975281, "epsilon_dpo/beta_margin_std": 0.5208894610404968, "epsilon_dpo/loss_margin_mean": 185.71475219726562, "grad_norm": 34.63766098022461, "kl/avg_steps": 0.5, "kl/beta": 0.0024420905392616987, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.035698600998121e-08, "logits/chosen": 0.938701868057251, "logits/rejected": 1.331627607345581, "logps/chosen": -415.82305908203125, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -624.7333984375, "loss": 1.0495, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8638983964920044, "rewards/margins": 0.4482652544975281, "rewards/rejected": -1.3121635913848877, "step": 585 }, { "epoch": 0.8604992657856094, "epsilon_dpo/beta": 0.0024172733537852764, "epsilon_dpo/beta_margin_grad_mean": -0.4087361693382263, "epsilon_dpo/beta_margin_grad_std": 0.12433697283267975, "epsilon_dpo/beta_margin_mean": 0.39639025926589966, "epsilon_dpo/beta_margin_std": 0.5501787662506104, "epsilon_dpo/loss_margin_mean": 165.33468627929688, "grad_norm": 26.173227310180664, "kl/avg_steps": 0.53125, "kl/beta": 0.002429940737783909, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.974695142855388e-08, "logits/chosen": 1.2281886339187622, "logits/rejected": 1.4619064331054688, "logps/chosen": -428.53717041015625, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.8026123046875, "logps/rejected": -628.8189086914062, "loss": 1.0992, "rewards/accuracies": 0.75, "rewards/chosen": -0.9000787734985352, "rewards/margins": 0.39639025926589966, "rewards/rejected": -1.29646897315979, "step": 586 }, { "epoch": 0.8619676945668135, "epsilon_dpo/beta": 0.0024014776572585106, "epsilon_dpo/beta_margin_grad_mean": -0.3929090201854706, "epsilon_dpo/beta_margin_grad_std": 0.11319870501756668, "epsilon_dpo/beta_margin_mean": 0.46409526467323303, "epsilon_dpo/beta_margin_std": 0.5094618797302246, "epsilon_dpo/loss_margin_mean": 194.23648071289062, "grad_norm": 28.493549346923828, "kl/avg_steps": 0.65625, "kl/beta": 0.0024170998949557543, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.9142720719793122e-08, "logits/chosen": 1.1399567127227783, "logits/rejected": 1.4511924982070923, "logps/chosen": -356.84228515625, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -589.0110473632812, "loss": 1.0348, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7502772808074951, "rewards/margins": 0.46409526467323303, "rewards/rejected": -1.2143725156784058, "step": 587 }, { "epoch": 0.8634361233480177, "epsilon_dpo/beta": 0.0023895734921097755, "epsilon_dpo/beta_margin_grad_mean": -0.40370285511016846, "epsilon_dpo/beta_margin_grad_std": 0.10795515775680542, "epsilon_dpo/beta_margin_mean": 0.41405537724494934, "epsilon_dpo/beta_margin_std": 0.47634997963905334, "epsilon_dpo/loss_margin_mean": 174.47763061523438, "grad_norm": 31.45778465270996, "kl/avg_steps": 0.5, "kl/beta": 0.0024013412185013294, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.8544309805740018e-08, "logits/chosen": 1.0272313356399536, "logits/rejected": 1.047000527381897, "logps/chosen": -411.3370361328125, "logps/ref_chosen": -50.294952392578125, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -642.8896484375, "loss": 1.0674, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8642209768295288, "rewards/margins": 0.41405537724494934, "rewards/rejected": -1.2782764434814453, "step": 588 }, { "epoch": 0.8649045521292217, "epsilon_dpo/beta": 0.0023724576458334923, "epsilon_dpo/beta_margin_grad_mean": -0.3712667226791382, "epsilon_dpo/beta_margin_grad_std": 0.11630377918481827, "epsilon_dpo/beta_margin_mean": 0.5647463202476501, "epsilon_dpo/beta_margin_std": 0.5330010056495667, "epsilon_dpo/loss_margin_mean": 238.95262145996094, "grad_norm": 33.41949462890625, "kl/avg_steps": 0.71875, "kl/beta": 0.002389394212514162, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.7951734455078786e-08, "logits/chosen": 0.9147559404373169, "logits/rejected": 1.1535378694534302, "logps/chosen": -400.4263000488281, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -691.1043701171875, "loss": 0.9636, "rewards/accuracies": 0.875, "rewards/chosen": -0.8093938827514648, "rewards/margins": 0.5647463202476501, "rewards/rejected": -1.3741401433944702, "step": 589 }, { "epoch": 0.8663729809104258, "epsilon_dpo/beta": 0.002357010031118989, "epsilon_dpo/beta_margin_grad_mean": -0.3921390175819397, "epsilon_dpo/beta_margin_grad_std": 0.10695243626832962, "epsilon_dpo/beta_margin_mean": 0.46295467019081116, "epsilon_dpo/beta_margin_std": 0.47514328360557556, "epsilon_dpo/loss_margin_mean": 197.41427612304688, "grad_norm": 23.523006439208984, "kl/avg_steps": 0.65625, "kl/beta": 0.002372342860326171, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.736501028272095e-08, "logits/chosen": 0.8840131163597107, "logits/rejected": 0.8686351776123047, "logps/chosen": -389.12689208984375, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -636.794189453125, "loss": 1.0284, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7873044013977051, "rewards/margins": 0.46295467019081116, "rewards/rejected": -1.2502591609954834, "step": 590 }, { "epoch": 0.8678414096916299, "epsilon_dpo/beta": 0.002340169856324792, "epsilon_dpo/beta_margin_grad_mean": -0.3897119462490082, "epsilon_dpo/beta_margin_grad_std": 0.10302460938692093, "epsilon_dpo/beta_margin_mean": 0.47087207436561584, "epsilon_dpo/beta_margin_std": 0.45756813883781433, "epsilon_dpo/loss_margin_mean": 202.04428100585938, "grad_norm": 24.383817672729492, "kl/avg_steps": 0.71875, "kl/beta": 0.0023568759206682444, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.678415274939408e-08, "logits/chosen": 1.0149821043014526, "logits/rejected": 1.6015218496322632, "logps/chosen": -421.56243896484375, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -651.15234375, "loss": 1.0185, "rewards/accuracies": 0.875, "rewards/chosen": -0.8560632467269897, "rewards/margins": 0.47087207436561584, "rewards/rejected": -1.3269352912902832, "step": 591 }, { "epoch": 0.869309838472834, "epsilon_dpo/beta": 0.0023278577718883753, "epsilon_dpo/beta_margin_grad_mean": -0.3993397355079651, "epsilon_dpo/beta_margin_grad_std": 0.1315532922744751, "epsilon_dpo/beta_margin_mean": 0.4305296242237091, "epsilon_dpo/beta_margin_std": 0.5752761960029602, "epsilon_dpo/loss_margin_mean": 186.57310485839844, "grad_norm": 28.219043731689453, "kl/avg_steps": 0.53125, "kl/beta": 0.002340056700631976, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6209177161234442e-08, "logits/chosen": 1.4690220355987549, "logits/rejected": 1.865086555480957, "logps/chosen": -443.86004638671875, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -658.2257080078125, "loss": 1.0795, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9236236810684204, "rewards/margins": 0.4305296540260315, "rewards/rejected": -1.3541532754898071, "step": 592 }, { "epoch": 0.8707782672540382, "epsilon_dpo/beta": 0.002318466315045953, "epsilon_dpo/beta_margin_grad_mean": -0.42013755440711975, "epsilon_dpo/beta_margin_grad_std": 0.10402812063694, "epsilon_dpo/beta_margin_mean": 0.3387100100517273, "epsilon_dpo/beta_margin_std": 0.4469892084598541, "epsilon_dpo/loss_margin_mean": 147.3956756591797, "grad_norm": 26.93070411682129, "kl/avg_steps": 0.40625, "kl/beta": 0.0023276908323168755, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.564009866938349e-08, "logits/chosen": 1.260116457939148, "logits/rejected": 1.8658943176269531, "logps/chosen": -393.62982177734375, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800476074219, "logps/rejected": -553.2427368164062, "loss": 1.1235, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8011359572410583, "rewards/margins": 0.3387100100517273, "rewards/rejected": -1.1398459672927856, "step": 593 }, { "epoch": 0.8722466960352423, "epsilon_dpo/beta": 0.002306912327185273, "epsilon_dpo/beta_margin_grad_mean": -0.3927120566368103, "epsilon_dpo/beta_margin_grad_std": 0.12015083432197571, "epsilon_dpo/beta_margin_mean": 0.46669813990592957, "epsilon_dpo/beta_margin_std": 0.533369243144989, "epsilon_dpo/loss_margin_mean": 203.7169189453125, "grad_norm": 24.436321258544922, "kl/avg_steps": 0.5, "kl/beta": 0.00231827306561172, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.5076932269588708e-08, "logits/chosen": 0.9466878175735474, "logits/rejected": 1.4920939207077026, "logps/chosen": -377.2304382324219, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967041015625, "logps/rejected": -612.1121826171875, "loss": 1.0391, "rewards/accuracies": 0.796875, "rewards/chosen": -0.745464563369751, "rewards/margins": 0.46669816970825195, "rewards/rejected": -1.212162733078003, "step": 594 }, { "epoch": 0.8737151248164464, "epsilon_dpo/beta": 0.0022903885692358017, "epsilon_dpo/beta_margin_grad_mean": -0.40245962142944336, "epsilon_dpo/beta_margin_grad_std": 0.10169457644224167, "epsilon_dpo/beta_margin_mean": 0.4241439998149872, "epsilon_dpo/beta_margin_std": 0.4779883921146393, "epsilon_dpo/loss_margin_mean": 185.88308715820312, "grad_norm": 32.14240646362305, "kl/avg_steps": 0.71875, "kl/beta": 0.0023067393340170383, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.451969280180849e-08, "logits/chosen": 1.174853801727295, "logits/rejected": 1.582853078842163, "logps/chosen": -397.878662109375, "logps/ref_chosen": -49.42041778564453, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -614.9686279296875, "loss": 1.0575, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7989033460617065, "rewards/margins": 0.4241439700126648, "rewards/rejected": -1.2230473756790161, "step": 595 }, { "epoch": 0.8751835535976505, "epsilon_dpo/beta": 0.0022790543735027313, "epsilon_dpo/beta_margin_grad_mean": -0.41742995381355286, "epsilon_dpo/beta_margin_grad_std": 0.10700484365224838, "epsilon_dpo/beta_margin_mean": 0.35146239399909973, "epsilon_dpo/beta_margin_std": 0.4646219313144684, "epsilon_dpo/loss_margin_mean": 155.37747192382812, "grad_norm": 37.94215774536133, "kl/avg_steps": 0.5, "kl/beta": 0.002290277974680066, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.396839494982103e-08, "logits/chosen": 0.926213264465332, "logits/rejected": 1.491598129272461, "logps/chosen": -417.51220703125, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -593.1890869140625, "loss": 1.1165, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8174558877944946, "rewards/margins": 0.35146236419677734, "rewards/rejected": -1.168918251991272, "step": 596 }, { "epoch": 0.8766519823788547, "epsilon_dpo/beta": 0.002266291296109557, "epsilon_dpo/beta_margin_grad_mean": -0.39597126841545105, "epsilon_dpo/beta_margin_grad_std": 0.11562085896730423, "epsilon_dpo/beta_margin_mean": 0.4527498185634613, "epsilon_dpo/beta_margin_std": 0.5288181900978088, "epsilon_dpo/loss_margin_mean": 201.0142822265625, "grad_norm": 25.600309371948242, "kl/avg_steps": 0.5625, "kl/beta": 0.0022788834758102894, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.3423053240837514e-08, "logits/chosen": 1.0698202848434448, "logits/rejected": 1.1980940103530884, "logps/chosen": -419.4420166015625, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -663.8892822265625, "loss": 1.0476, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8222044706344604, "rewards/margins": 0.4527497887611389, "rewards/rejected": -1.2749543190002441, "step": 597 }, { "epoch": 0.8781204111600588, "epsilon_dpo/beta": 0.0022536148317158222, "epsilon_dpo/beta_margin_grad_mean": -0.4088347256183624, "epsilon_dpo/beta_margin_grad_std": 0.12910935282707214, "epsilon_dpo/beta_margin_mean": 0.3876650333404541, "epsilon_dpo/beta_margin_std": 0.5588559508323669, "epsilon_dpo/loss_margin_mean": 173.6048583984375, "grad_norm": 29.18787384033203, "kl/avg_steps": 0.5625, "kl/beta": 0.0022661364637315273, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.2883682045119062e-08, "logits/chosen": 1.1886159181594849, "logits/rejected": 1.447231650352478, "logps/chosen": -441.88616943359375, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -652.4163818359375, "loss": 1.11, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8795113563537598, "rewards/margins": 0.3876650333404541, "rewards/rejected": -1.2671763896942139, "step": 598 }, { "epoch": 0.8795888399412628, "epsilon_dpo/beta": 0.002237487817183137, "epsilon_dpo/beta_margin_grad_mean": -0.4056170880794525, "epsilon_dpo/beta_margin_grad_std": 0.087938092648983, "epsilon_dpo/beta_margin_mean": 0.39522066712379456, "epsilon_dpo/beta_margin_std": 0.37430721521377563, "epsilon_dpo/loss_margin_mean": 177.3351593017578, "grad_norm": 31.863269805908203, "kl/avg_steps": 0.71875, "kl/beta": 0.0022534606978297234, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.2350295575598367e-08, "logits/chosen": 1.1639418601989746, "logits/rejected": 1.354495644569397, "logps/chosen": -394.5838623046875, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -605.0948486328125, "loss": 1.0632, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7725363969802856, "rewards/margins": 0.39522066712379456, "rewards/rejected": -1.1677570343017578, "step": 599 }, { "epoch": 0.8810572687224669, "epsilon_dpo/beta": 0.002223964547738433, "epsilon_dpo/beta_margin_grad_mean": -0.4058476388454437, "epsilon_dpo/beta_margin_grad_std": 0.10309126228094101, "epsilon_dpo/beta_margin_mean": 0.4012675881385803, "epsilon_dpo/beta_margin_std": 0.4533129036426544, "epsilon_dpo/loss_margin_mean": 181.48497009277344, "grad_norm": 28.798608779907227, "kl/avg_steps": 0.609375, "kl/beta": 0.002237379550933838, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1822907887504932e-08, "logits/chosen": 0.8918631672859192, "logits/rejected": 1.333923101425171, "logps/chosen": -452.645751953125, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -653.15234375, "loss": 1.0729, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8606581687927246, "rewards/margins": 0.4012675881385803, "rewards/rejected": -1.2619256973266602, "step": 600 }, { "epoch": 0.8810572687224669, "eval_epsilon_dpo/beta": 0.0022153002209961414, "eval_epsilon_dpo/beta_margin_grad_mean": -0.4365132749080658, "eval_epsilon_dpo/beta_margin_grad_std": 0.10830235481262207, "eval_epsilon_dpo/beta_margin_mean": 0.2717970013618469, "eval_epsilon_dpo/beta_margin_std": 0.47190308570861816, "eval_epsilon_dpo/loss_margin_mean": 124.09112548828125, "eval_kl/n_epsilon_steps": 0.3022260367870331, "eval_kl/p_epsilon_steps": 0.695633590221405, "eval_logits/chosen": 0.9628068208694458, "eval_logits/rejected": 1.382341742515564, "eval_logps/chosen": -487.8938903808594, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -619.73193359375, "eval_loss": 0.5941485166549683, "eval_rewards/accuracies": 0.7183219194412231, "eval_rewards/chosen": -0.9072585701942444, "eval_rewards/margins": 0.2717970311641693, "eval_rewards/rejected": -1.1790555715560913, "eval_runtime": 43.142, "eval_samples_per_second": 54.216, "eval_steps_per_second": 1.715, "step": 600 }, { "epoch": 0.882525697503671, "epsilon_dpo/beta": 0.002211540238931775, "epsilon_dpo/beta_margin_grad_mean": -0.40351107716560364, "epsilon_dpo/beta_margin_grad_std": 0.10921073704957962, "epsilon_dpo/beta_margin_mean": 0.4135131537914276, "epsilon_dpo/beta_margin_std": 0.48056861758232117, "epsilon_dpo/loss_margin_mean": 188.20594787597656, "grad_norm": 25.276933670043945, "kl/avg_steps": 0.5625, "kl/beta": 0.0022238281089812517, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.1301532877994742e-08, "logits/chosen": 1.3206804990768433, "logits/rejected": 1.483794093132019, "logps/chosen": -459.1831970214844, "logps/ref_chosen": -59.13360595703125, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -682.9464111328125, "loss": 1.0689, "rewards/accuracies": 0.8125, "rewards/chosen": -0.886534571647644, "rewards/margins": 0.4135131239891052, "rewards/rejected": -1.300047755241394, "step": 601 }, { "epoch": 0.8839941262848752, "epsilon_dpo/beta": 0.002197787631303072, "epsilon_dpo/beta_margin_grad_mean": -0.3858841061592102, "epsilon_dpo/beta_margin_grad_std": 0.10464838147163391, "epsilon_dpo/beta_margin_mean": 0.4918578863143921, "epsilon_dpo/beta_margin_std": 0.47307488322257996, "epsilon_dpo/loss_margin_mean": 224.8507843017578, "grad_norm": 31.37278175354004, "kl/avg_steps": 0.625, "kl/beta": 0.0022113891318440437, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.0786184285784298e-08, "logits/chosen": 1.2354438304901123, "logits/rejected": 1.4991683959960938, "logps/chosen": -368.17822265625, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -632.10400390625, "loss": 1.005, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7031533122062683, "rewards/margins": 0.4918578863143921, "rewards/rejected": -1.1950111389160156, "step": 602 }, { "epoch": 0.8854625550660793, "epsilon_dpo/beta": 0.0021861973218619823, "epsilon_dpo/beta_margin_grad_mean": -0.4108930826187134, "epsilon_dpo/beta_margin_grad_std": 0.11496561765670776, "epsilon_dpo/beta_margin_mean": 0.3901022970676422, "epsilon_dpo/beta_margin_std": 0.5260531902313232, "epsilon_dpo/loss_margin_mean": 179.67196655273438, "grad_norm": 38.248104095458984, "kl/avg_steps": 0.53125, "kl/beta": 0.002197653753682971, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0276875690788204e-08, "logits/chosen": 0.6783925294876099, "logits/rejected": 1.2061731815338135, "logps/chosen": -432.961181640625, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32560729980469, "logps/rejected": -642.5441284179688, "loss": 1.0967, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7941126823425293, "rewards/margins": 0.3901022970676422, "rewards/rejected": -1.1842149496078491, "step": 603 }, { "epoch": 0.8869309838472834, "epsilon_dpo/beta": 0.0021719117648899555, "epsilon_dpo/beta_margin_grad_mean": -0.39027997851371765, "epsilon_dpo/beta_margin_grad_std": 0.10892639309167862, "epsilon_dpo/beta_margin_mean": 0.47051891684532166, "epsilon_dpo/beta_margin_std": 0.48104289174079895, "epsilon_dpo/loss_margin_mean": 217.738037109375, "grad_norm": 33.354339599609375, "kl/avg_steps": 0.65625, "kl/beta": 0.0021860403940081596, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.977362051376158e-08, "logits/chosen": 1.2945600748062134, "logits/rejected": 1.3630659580230713, "logps/chosen": -370.6407470703125, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -633.7752075195312, "loss": 1.0241, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7053238153457642, "rewards/margins": 0.47051894664764404, "rewards/rejected": -1.1758427619934082, "step": 604 }, { "epoch": 0.8883994126284875, "epsilon_dpo/beta": 0.0021577514708042145, "epsilon_dpo/beta_margin_grad_mean": -0.40043067932128906, "epsilon_dpo/beta_margin_grad_std": 0.10629180818796158, "epsilon_dpo/beta_margin_mean": 0.4359266459941864, "epsilon_dpo/beta_margin_std": 0.5075231194496155, "epsilon_dpo/loss_margin_mean": 202.94888305664062, "grad_norm": 30.262615203857422, "kl/avg_steps": 0.65625, "kl/beta": 0.0021717881318181753, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.9276432015946446e-08, "logits/chosen": 0.8817493319511414, "logits/rejected": 1.3649321794509888, "logps/chosen": -442.30731201171875, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -681.3118896484375, "loss": 1.0539, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8125105500221252, "rewards/margins": 0.4359266459941864, "rewards/rejected": -1.2484371662139893, "step": 605 }, { "epoch": 0.8898678414096917, "epsilon_dpo/beta": 0.002144357655197382, "epsilon_dpo/beta_margin_grad_mean": -0.3933078348636627, "epsilon_dpo/beta_margin_grad_std": 0.10445917397737503, "epsilon_dpo/beta_margin_mean": 0.4584818184375763, "epsilon_dpo/beta_margin_std": 0.4664647877216339, "epsilon_dpo/loss_margin_mean": 214.8511962890625, "grad_norm": 30.18663215637207, "kl/avg_steps": 0.625, "kl/beta": 0.0021576285362243652, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.8785323298722093e-08, "logits/chosen": 1.1324771642684937, "logits/rejected": 1.3250871896743774, "logps/chosen": -412.505126953125, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37147521972656, "logps/rejected": -670.90869140625, "loss": 1.0297, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7678847312927246, "rewards/margins": 0.4584817886352539, "rewards/rejected": -1.2263665199279785, "step": 606 }, { "epoch": 0.8913362701908958, "epsilon_dpo/beta": 0.0021323792170733213, "epsilon_dpo/beta_margin_grad_mean": -0.4152858853340149, "epsilon_dpo/beta_margin_grad_std": 0.08593456447124481, "epsilon_dpo/beta_margin_mean": 0.3553902506828308, "epsilon_dpo/beta_margin_std": 0.37019211053848267, "epsilon_dpo/loss_margin_mean": 167.57159423828125, "grad_norm": 27.074480056762695, "kl/avg_steps": 0.5625, "kl/beta": 0.002144227270036936, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8300307303259904e-08, "logits/chosen": 1.0228676795959473, "logits/rejected": 1.6952282190322876, "logps/chosen": -426.2294616699219, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -615.494140625, "loss": 1.0947, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7858235836029053, "rewards/margins": 0.3553902506828308, "rewards/rejected": -1.1412138938903809, "step": 607 }, { "epoch": 0.8928046989720999, "epsilon_dpo/beta": 0.002117785857990384, "epsilon_dpo/beta_margin_grad_mean": -0.4044342637062073, "epsilon_dpo/beta_margin_grad_std": 0.08454929292201996, "epsilon_dpo/beta_margin_mean": 0.39954301714897156, "epsilon_dpo/beta_margin_std": 0.361564576625824, "epsilon_dpo/loss_margin_mean": 189.41441345214844, "grad_norm": 36.970916748046875, "kl/avg_steps": 0.6875, "kl/beta": 0.0021322332322597504, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.7821396810182437e-08, "logits/chosen": 0.9039729833602905, "logits/rejected": 1.3142175674438477, "logps/chosen": -424.0223388671875, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -650.75927734375, "loss": 1.0573, "rewards/accuracies": 0.875, "rewards/chosen": -0.7769819498062134, "rewards/margins": 0.39954301714897156, "rewards/rejected": -1.1765248775482178, "step": 608 }, { "epoch": 0.8942731277533039, "epsilon_dpo/beta": 0.002102663740515709, "epsilon_dpo/beta_margin_grad_mean": -0.3947716951370239, "epsilon_dpo/beta_margin_grad_std": 0.09765525162220001, "epsilon_dpo/beta_margin_mean": 0.4464167058467865, "epsilon_dpo/beta_margin_std": 0.43577244877815247, "epsilon_dpo/loss_margin_mean": 213.13291931152344, "grad_norm": 31.127103805541992, "kl/avg_steps": 0.71875, "kl/beta": 0.002117674332112074, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.7348604439226617e-08, "logits/chosen": 1.02443528175354, "logits/rejected": 1.3518232107162476, "logps/chosen": -418.4666442871094, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -661.6102294921875, "loss": 1.033, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7576903104782104, "rewards/margins": 0.4464166760444641, "rewards/rejected": -1.2041070461273193, "step": 609 }, { "epoch": 0.895741556534508, "epsilon_dpo/beta": 0.002089630113914609, "epsilon_dpo/beta_margin_grad_mean": -0.4126569330692291, "epsilon_dpo/beta_margin_grad_std": 0.1021689847111702, "epsilon_dpo/beta_margin_mean": 0.36963143944740295, "epsilon_dpo/beta_margin_std": 0.44020283222198486, "epsilon_dpo/loss_margin_mean": 178.02821350097656, "grad_norm": 23.951202392578125, "kl/avg_steps": 0.625, "kl/beta": 0.002102562226355076, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6881942648911074e-08, "logits/chosen": 0.9360833168029785, "logits/rejected": 1.605763554573059, "logps/chosen": -417.2803955078125, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.4053955078125, "logps/rejected": -613.0189819335938, "loss": 1.0964, "rewards/accuracies": 0.8125, "rewards/chosen": -0.736303985118866, "rewards/margins": 0.36963140964508057, "rewards/rejected": -1.1059353351593018, "step": 610 }, { "epoch": 0.8972099853157122, "epsilon_dpo/beta": 0.002075997879728675, "epsilon_dpo/beta_margin_grad_mean": -0.40418821573257446, "epsilon_dpo/beta_margin_grad_std": 0.09916716814041138, "epsilon_dpo/beta_margin_mean": 0.40588513016700745, "epsilon_dpo/beta_margin_std": 0.4290505051612854, "epsilon_dpo/loss_margin_mean": 196.52256774902344, "grad_norm": 23.67366600036621, "kl/avg_steps": 0.65625, "kl/beta": 0.0020895027555525303, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.6421423736208e-08, "logits/chosen": 1.289412021636963, "logits/rejected": 1.4389369487762451, "logps/chosen": -427.18963623046875, "logps/ref_chosen": -52.59947204589844, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -657.4437255859375, "loss": 1.0646, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7785675525665283, "rewards/margins": 0.40588513016700745, "rewards/rejected": -1.1844526529312134, "step": 611 }, { "epoch": 0.8986784140969163, "epsilon_dpo/beta": 0.0020631118677556515, "epsilon_dpo/beta_margin_grad_mean": -0.3926045596599579, "epsilon_dpo/beta_margin_grad_std": 0.09865017980337143, "epsilon_dpo/beta_margin_mean": 0.4561316967010498, "epsilon_dpo/beta_margin_std": 0.4290175139904022, "epsilon_dpo/loss_margin_mean": 222.2028045654297, "grad_norm": 27.82708740234375, "kl/avg_steps": 0.625, "kl/beta": 0.0020758798345923424, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.5967059836219042e-08, "logits/chosen": 1.0549695491790771, "logits/rejected": 1.8483834266662598, "logps/chosen": -454.82489013671875, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -706.016357421875, "loss": 1.0247, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8178003430366516, "rewards/margins": 0.4561316967010498, "rewards/rejected": -1.2739319801330566, "step": 612 }, { "epoch": 0.9001468428781204, "epsilon_dpo/beta": 0.0020477185025811195, "epsilon_dpo/beta_margin_grad_mean": -0.4002331495285034, "epsilon_dpo/beta_margin_grad_std": 0.08480419963598251, "epsilon_dpo/beta_margin_mean": 0.4207998812198639, "epsilon_dpo/beta_margin_std": 0.3745913803577423, "epsilon_dpo/loss_margin_mean": 206.05389404296875, "grad_norm": 23.780893325805664, "kl/avg_steps": 0.75, "kl/beta": 0.0020629861392080784, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.551886292185553e-08, "logits/chosen": 1.0429415702819824, "logits/rejected": 1.2024908065795898, "logps/chosen": -412.083740234375, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10753631591797, "logps/rejected": -663.5151977539062, "loss": 1.0419, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7217282056808472, "rewards/margins": 0.4207998812198639, "rewards/rejected": -1.1425280570983887, "step": 613 }, { "epoch": 0.9016152716593245, "epsilon_dpo/beta": 0.0020369545090943575, "epsilon_dpo/beta_margin_grad_mean": -0.39440545439720154, "epsilon_dpo/beta_margin_grad_std": 0.10275840014219284, "epsilon_dpo/beta_margin_mean": 0.44816258549690247, "epsilon_dpo/beta_margin_std": 0.4428095519542694, "epsilon_dpo/loss_margin_mean": 221.35459899902344, "grad_norm": 31.241605758666992, "kl/avg_steps": 0.53125, "kl/beta": 0.002047628862783313, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.507684480352292e-08, "logits/chosen": 1.2069741487503052, "logits/rejected": 1.3134090900421143, "logps/chosen": -439.93914794921875, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -713.0341796875, "loss": 1.034, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7898073196411133, "rewards/margins": 0.44816261529922485, "rewards/rejected": -1.237969994544983, "step": 614 }, { "epoch": 0.9030837004405287, "epsilon_dpo/beta": 0.0020242806058377028, "epsilon_dpo/beta_margin_grad_mean": -0.42308035492897034, "epsilon_dpo/beta_margin_grad_std": 0.09278630465269089, "epsilon_dpo/beta_margin_mean": 0.328517347574234, "epsilon_dpo/beta_margin_std": 0.420515775680542, "epsilon_dpo/loss_margin_mean": 163.16390991210938, "grad_norm": 22.429214477539062, "kl/avg_steps": 0.625, "kl/beta": 0.002036808291450143, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.4641017128809801e-08, "logits/chosen": 1.0206799507141113, "logits/rejected": 1.1133084297180176, "logps/chosen": -436.15289306640625, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -628.677001953125, "loss": 1.1252, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7508677244186401, "rewards/margins": 0.328517347574234, "rewards/rejected": -1.0793850421905518, "step": 615 }, { "epoch": 0.9045521292217328, "epsilon_dpo/beta": 0.002012972952798009, "epsilon_dpo/beta_margin_grad_mean": -0.4355887174606323, "epsilon_dpo/beta_margin_grad_std": 0.09638901799917221, "epsilon_dpo/beta_margin_mean": 0.27304208278656006, "epsilon_dpo/beta_margin_std": 0.41767895221710205, "epsilon_dpo/loss_margin_mean": 136.7162322998047, "grad_norm": 30.985132217407227, "kl/avg_steps": 0.5625, "kl/beta": 0.00202415743842721, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.4211391382180637e-08, "logits/chosen": 1.1943162679672241, "logits/rejected": 1.915705919265747, "logps/chosen": -499.29022216796875, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -645.5740966796875, "loss": 1.1731, "rewards/accuracies": 0.828125, "rewards/chosen": -0.874975323677063, "rewards/margins": 0.27304205298423767, "rewards/rejected": -1.148017406463623, "step": 616 }, { "epoch": 0.9060205580029369, "epsilon_dpo/beta": 0.002005487447604537, "epsilon_dpo/beta_margin_grad_mean": -0.43639859557151794, "epsilon_dpo/beta_margin_grad_std": 0.08785145729780197, "epsilon_dpo/beta_margin_mean": 0.26643723249435425, "epsilon_dpo/beta_margin_std": 0.3760392665863037, "epsilon_dpo/loss_margin_mean": 134.0316925048828, "grad_norm": 37.784576416015625, "kl/avg_steps": 0.375, "kl/beta": 0.002012835117056966, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.378797888467345e-08, "logits/chosen": 1.0707861185073853, "logits/rejected": 1.959951400756836, "logps/chosen": -466.6313781738281, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -601.8916625976562, "loss": 1.1713, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8111328482627869, "rewards/margins": 0.26643723249435425, "rewards/rejected": -1.0775700807571411, "step": 617 }, { "epoch": 0.9074889867841409, "epsilon_dpo/beta": 0.001993608195334673, "epsilon_dpo/beta_margin_grad_mean": -0.40763044357299805, "epsilon_dpo/beta_margin_grad_std": 0.1053873673081398, "epsilon_dpo/beta_margin_mean": 0.39190471172332764, "epsilon_dpo/beta_margin_std": 0.4546877443790436, "epsilon_dpo/loss_margin_mean": 197.85704040527344, "grad_norm": 46.317352294921875, "kl/avg_steps": 0.59375, "kl/beta": 0.002005315385758877, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3370790793601371e-08, "logits/chosen": 1.0298012495040894, "logits/rejected": 1.473135232925415, "logps/chosen": -485.0655822753906, "logps/ref_chosen": -67.10135650634766, "logps/ref_rejected": -92.15339660644531, "logps/rejected": -707.9747314453125, "loss": 1.0813, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8346935510635376, "rewards/margins": 0.39190471172332764, "rewards/rejected": -1.2265982627868652, "step": 618 }, { "epoch": 0.908957415565345, "epsilon_dpo/beta": 0.001983709866181016, "epsilon_dpo/beta_margin_grad_mean": -0.4199216365814209, "epsilon_dpo/beta_margin_grad_std": 0.10553835332393646, "epsilon_dpo/beta_margin_mean": 0.3419099450111389, "epsilon_dpo/beta_margin_std": 0.45738500356674194, "epsilon_dpo/loss_margin_mean": 173.6983642578125, "grad_norm": 25.143455505371094, "kl/avg_steps": 0.5, "kl/beta": 0.001993478974327445, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.2959838102258535e-08, "logits/chosen": 1.0666018724441528, "logits/rejected": 1.462690830230713, "logps/chosen": -457.4405517578125, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -668.3460693359375, "loss": 1.1228, "rewards/accuracies": 0.75, "rewards/chosen": -0.7980378866195679, "rewards/margins": 0.3419099450111389, "rewards/rejected": -1.1399478912353516, "step": 619 }, { "epoch": 0.9104258443465492, "epsilon_dpo/beta": 0.001972600817680359, "epsilon_dpo/beta_margin_grad_mean": -0.4120746850967407, "epsilon_dpo/beta_margin_grad_std": 0.10483256727457047, "epsilon_dpo/beta_margin_mean": 0.37475821375846863, "epsilon_dpo/beta_margin_std": 0.4594890773296356, "epsilon_dpo/loss_margin_mean": 191.25567626953125, "grad_norm": 34.329288482666016, "kl/avg_steps": 0.5625, "kl/beta": 0.001983561087399721, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.2555131639630567e-08, "logits/chosen": 1.2032023668289185, "logits/rejected": 1.7684378623962402, "logps/chosen": -441.5105895996094, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -651.3795166015625, "loss": 1.0958, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7542145252227783, "rewards/margins": 0.374758243560791, "rewards/rejected": -1.1289727687835693, "step": 620 }, { "epoch": 0.9118942731277533, "epsilon_dpo/beta": 0.001958484761416912, "epsilon_dpo/beta_margin_grad_mean": -0.3918313682079315, "epsilon_dpo/beta_margin_grad_std": 0.09972291439771652, "epsilon_dpo/beta_margin_mean": 0.46101054549217224, "epsilon_dpo/beta_margin_std": 0.44019511342048645, "epsilon_dpo/loss_margin_mean": 236.33409118652344, "grad_norm": 31.05923843383789, "kl/avg_steps": 0.71875, "kl/beta": 0.0019724660087376833, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.2156682070109086e-08, "logits/chosen": 1.324573278427124, "logits/rejected": 1.5857019424438477, "logps/chosen": -433.49237060546875, "logps/ref_chosen": -53.933753967285156, "logps/ref_rejected": -88.36952209472656, "logps/rejected": -704.26220703125, "loss": 1.0227, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7442139387130737, "rewards/margins": 0.46101054549217224, "rewards/rejected": -1.2052245140075684, "step": 621 }, { "epoch": 0.9133627019089574, "epsilon_dpo/beta": 0.0019463448552414775, "epsilon_dpo/beta_margin_grad_mean": -0.41644492745399475, "epsilon_dpo/beta_margin_grad_std": 0.08895543962717056, "epsilon_dpo/beta_margin_mean": 0.35330408811569214, "epsilon_dpo/beta_margin_std": 0.3922674357891083, "epsilon_dpo/loss_margin_mean": 182.4312744140625, "grad_norm": 23.87201499938965, "kl/avg_steps": 0.625, "kl/beta": 0.0019583902321755886, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1764499893210878e-08, "logits/chosen": 0.9237732887268066, "logits/rejected": 1.5608757734298706, "logps/chosen": -425.43084716796875, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -633.0950317382812, "loss": 1.0998, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7119662761688232, "rewards/margins": 0.35330408811569214, "rewards/rejected": -1.0652704238891602, "step": 622 }, { "epoch": 0.9148311306901615, "epsilon_dpo/beta": 0.0019348639762029052, "epsilon_dpo/beta_margin_grad_mean": -0.4306987226009369, "epsilon_dpo/beta_margin_grad_std": 0.09799355268478394, "epsilon_dpo/beta_margin_mean": 0.2944139540195465, "epsilon_dpo/beta_margin_std": 0.4290635883808136, "epsilon_dpo/loss_margin_mean": 153.20993041992188, "grad_norm": 26.694618225097656, "kl/avg_steps": 0.59375, "kl/beta": 0.0019462262280285358, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.1378595443300998e-08, "logits/chosen": 1.110864520072937, "logits/rejected": 1.6331336498260498, "logps/chosen": -470.3033447265625, "logps/ref_chosen": -64.15696716308594, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -644.4393310546875, "loss": 1.1568, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7869951725006104, "rewards/margins": 0.2944139540195465, "rewards/rejected": -1.081409215927124, "step": 623 }, { "epoch": 0.9162995594713657, "epsilon_dpo/beta": 0.0019210248719900846, "epsilon_dpo/beta_margin_grad_mean": -0.3984293043613434, "epsilon_dpo/beta_margin_grad_std": 0.09685565531253815, "epsilon_dpo/beta_margin_mean": 0.4288451373577118, "epsilon_dpo/beta_margin_std": 0.41765204071998596, "epsilon_dpo/loss_margin_mean": 224.21632385253906, "grad_norm": 36.52444839477539, "kl/avg_steps": 0.71875, "kl/beta": 0.0019347387133166194, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.0998978889320582e-08, "logits/chosen": 0.7286983728408813, "logits/rejected": 1.4310073852539062, "logps/chosen": -467.0516357421875, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -716.4813232421875, "loss": 1.0441, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7600715160369873, "rewards/margins": 0.4288451075553894, "rewards/rejected": -1.1889166831970215, "step": 624 }, { "epoch": 0.9177679882525698, "epsilon_dpo/beta": 0.001906715682707727, "epsilon_dpo/beta_margin_grad_mean": -0.4110877215862274, "epsilon_dpo/beta_margin_grad_std": 0.08938928693532944, "epsilon_dpo/beta_margin_mean": 0.37681111693382263, "epsilon_dpo/beta_margin_std": 0.39676207304000854, "epsilon_dpo/loss_margin_mean": 198.28582763671875, "grad_norm": 29.51067543029785, "kl/avg_steps": 0.75, "kl/beta": 0.0019209319725632668, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.0625660234518913e-08, "logits/chosen": 1.1471667289733887, "logits/rejected": 1.666570782661438, "logps/chosen": -436.4429931640625, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -662.4771728515625, "loss": 1.0812, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7215262651443481, "rewards/margins": 0.37681108713150024, "rewards/rejected": -1.0983374118804932, "step": 625 }, { "epoch": 0.9192364170337739, "epsilon_dpo/beta": 0.0018984805792570114, "epsilon_dpo/beta_margin_grad_mean": -0.43188056349754333, "epsilon_dpo/beta_margin_grad_std": 0.09042949974536896, "epsilon_dpo/beta_margin_mean": 0.28581419587135315, "epsilon_dpo/beta_margin_std": 0.3867412805557251, "epsilon_dpo/loss_margin_mean": 151.79847717285156, "grad_norm": 33.83115005493164, "kl/avg_steps": 0.4375, "kl/beta": 0.0019066323293372989, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0258649316189721e-08, "logits/chosen": 0.8864705562591553, "logits/rejected": 1.213023066520691, "logps/chosen": -486.8077392578125, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.18872833251953, "logps/rejected": -662.682373046875, "loss": 1.1565, "rewards/accuracies": 0.75, "rewards/chosen": -0.7822318077087402, "rewards/margins": 0.28581416606903076, "rewards/rejected": -1.068045973777771, "step": 626 }, { "epoch": 0.920704845814978, "epsilon_dpo/beta": 0.0018884310266003013, "epsilon_dpo/beta_margin_grad_mean": -0.40042755007743835, "epsilon_dpo/beta_margin_grad_std": 0.1201905831694603, "epsilon_dpo/beta_margin_mean": 0.4418928921222687, "epsilon_dpo/beta_margin_std": 0.5634236335754395, "epsilon_dpo/loss_margin_mean": 235.53546142578125, "grad_norm": 30.784543991088867, "kl/avg_steps": 0.53125, "kl/beta": 0.0018983271438628435, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.897955805412e-09, "logits/chosen": 1.2974317073822021, "logits/rejected": 1.250240445137024, "logps/chosen": -396.13018798828125, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -690.677001953125, "loss": 1.0634, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6593598127365112, "rewards/margins": 0.4418928623199463, "rewards/rejected": -1.101252794265747, "step": 627 }, { "epoch": 0.922173274596182, "epsilon_dpo/beta": 0.0018784517887979746, "epsilon_dpo/beta_margin_grad_mean": -0.4034620523452759, "epsilon_dpo/beta_margin_grad_std": 0.0999772772192955, "epsilon_dpo/beta_margin_mean": 0.4092748761177063, "epsilon_dpo/beta_margin_std": 0.4315047264099121, "epsilon_dpo/loss_margin_mean": 219.25965881347656, "grad_norm": 32.134822845458984, "kl/avg_steps": 0.53125, "kl/beta": 0.0018882955191656947, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.543589206795238e-09, "logits/chosen": 1.1723408699035645, "logits/rejected": 1.2950220108032227, "logps/chosen": -457.1246032714844, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -717.7559814453125, "loss": 1.0624, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7467862367630005, "rewards/margins": 0.4092748761177063, "rewards/rejected": -1.1560611724853516, "step": 628 }, { "epoch": 0.9236417033773862, "epsilon_dpo/beta": 0.0018667641561478376, "epsilon_dpo/beta_margin_grad_mean": -0.4184776246547699, "epsilon_dpo/beta_margin_grad_std": 0.07906016707420349, "epsilon_dpo/beta_margin_mean": 0.3396104574203491, "epsilon_dpo/beta_margin_std": 0.3389366567134857, "epsilon_dpo/loss_margin_mean": 182.80221557617188, "grad_norm": 30.716339111328125, "kl/avg_steps": 0.625, "kl/beta": 0.001878316979855299, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 9.19555885822887e-09, "logits/chosen": 1.0403006076812744, "logits/rejected": 1.5764471292495728, "logps/chosen": -472.028076171875, "logps/ref_chosen": -64.21353912353516, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -682.2704467773438, "loss": 1.1026, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7622989416122437, "rewards/margins": 0.3396104574203491, "rewards/rejected": -1.1019093990325928, "step": 629 }, { "epoch": 0.9251101321585903, "epsilon_dpo/beta": 0.0018557527801021934, "epsilon_dpo/beta_margin_grad_mean": -0.43844300508499146, "epsilon_dpo/beta_margin_grad_std": 0.08784260600805283, "epsilon_dpo/beta_margin_mean": 0.2594304382801056, "epsilon_dpo/beta_margin_std": 0.38279202580451965, "epsilon_dpo/loss_margin_mean": 140.74285888671875, "grad_norm": 22.31944465637207, "kl/avg_steps": 0.59375, "kl/beta": 0.0018666504183784127, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.85387393063622e-09, "logits/chosen": 0.9522106647491455, "logits/rejected": 1.4870188236236572, "logps/chosen": -416.2958068847656, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -581.345947265625, "loss": 1.1782, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6638652086257935, "rewards/margins": 0.259430468082428, "rewards/rejected": -0.9232956171035767, "step": 630 }, { "epoch": 0.9265785609397944, "epsilon_dpo/beta": 0.0018459591083228588, "epsilon_dpo/beta_margin_grad_mean": -0.427374005317688, "epsilon_dpo/beta_margin_grad_std": 0.10752011835575104, "epsilon_dpo/beta_margin_mean": 0.31045976281166077, "epsilon_dpo/beta_margin_std": 0.4674583971500397, "epsilon_dpo/loss_margin_mean": 169.60186767578125, "grad_norm": 26.94605255126953, "kl/avg_steps": 0.53125, "kl/beta": 0.0018556325230747461, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.518543427732949e-09, "logits/chosen": 1.2326271533966064, "logits/rejected": 1.7843652963638306, "logps/chosen": -480.91925048828125, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95157623291016, "logps/rejected": -672.0191040039062, "loss": 1.1514, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7797050476074219, "rewards/margins": 0.31045979261398315, "rewards/rejected": -1.0901647806167603, "step": 631 }, { "epoch": 0.9280469897209985, "epsilon_dpo/beta": 0.0018362043192610145, "epsilon_dpo/beta_margin_grad_mean": -0.4185718297958374, "epsilon_dpo/beta_margin_grad_std": 0.10718811303377151, "epsilon_dpo/beta_margin_mean": 0.34240588545799255, "epsilon_dpo/beta_margin_std": 0.45832404494285583, "epsilon_dpo/loss_margin_mean": 188.03225708007812, "grad_norm": 23.3004207611084, "kl/avg_steps": 0.53125, "kl/beta": 0.0018458266276866198, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.189576185789637e-09, "logits/chosen": 1.1802589893341064, "logits/rejected": 1.6396427154541016, "logps/chosen": -470.71783447265625, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -683.5587158203125, "loss": 1.1232, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7532949447631836, "rewards/margins": 0.34240591526031494, "rewards/rejected": -1.0957008600234985, "step": 632 }, { "epoch": 0.9295154185022027, "epsilon_dpo/beta": 0.0018265009857714176, "epsilon_dpo/beta_margin_grad_mean": -0.44286853075027466, "epsilon_dpo/beta_margin_grad_std": 0.09384477883577347, "epsilon_dpo/beta_margin_mean": 0.2385793775320053, "epsilon_dpo/beta_margin_std": 0.39600229263305664, "epsilon_dpo/loss_margin_mean": 131.95301818847656, "grad_norm": 25.7509765625, "kl/avg_steps": 0.53125, "kl/beta": 0.0018360725371167064, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.866980873399015e-09, "logits/chosen": 1.1726946830749512, "logits/rejected": 1.3165276050567627, "logps/chosen": -478.14373779296875, "logps/ref_chosen": -57.278167724609375, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -644.4025268554688, "loss": 1.1998, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7698875665664673, "rewards/margins": 0.2385793775320053, "rewards/rejected": -1.0084669589996338, "step": 633 }, { "epoch": 0.9309838472834068, "epsilon_dpo/beta": 0.0018197029130533338, "epsilon_dpo/beta_margin_grad_mean": -0.43454012274742126, "epsilon_dpo/beta_margin_grad_std": 0.09465321153402328, "epsilon_dpo/beta_margin_mean": 0.2766806483268738, "epsilon_dpo/beta_margin_std": 0.4046916365623474, "epsilon_dpo/loss_margin_mean": 153.4633026123047, "grad_norm": 20.374540328979492, "kl/avg_steps": 0.375, "kl/beta": 0.0018263699021190405, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 7.550765991247654e-09, "logits/chosen": 1.1397655010223389, "logits/rejected": 1.5167808532714844, "logps/chosen": -490.03021240234375, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12565612792969, "logps/rejected": -684.000244140625, "loss": 1.1678, "rewards/accuracies": 0.75, "rewards/chosen": -0.7717798948287964, "rewards/margins": 0.2766806483268738, "rewards/rejected": -1.048460602760315, "step": 634 }, { "epoch": 0.9324522760646109, "epsilon_dpo/beta": 0.0018100612796843052, "epsilon_dpo/beta_margin_grad_mean": -0.43598106503486633, "epsilon_dpo/beta_margin_grad_std": 0.08887307345867157, "epsilon_dpo/beta_margin_mean": 0.2698087692260742, "epsilon_dpo/beta_margin_std": 0.3833208382129669, "epsilon_dpo/loss_margin_mean": 150.1700897216797, "grad_norm": 24.014848709106445, "kl/avg_steps": 0.53125, "kl/beta": 0.0018195465672761202, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.240939871891699e-09, "logits/chosen": 1.0775097608566284, "logits/rejected": 1.6850745677947998, "logps/chosen": -480.21636962890625, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -638.931396484375, "loss": 1.1695, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7364508509635925, "rewards/margins": 0.2698087692260742, "rewards/rejected": -1.0062596797943115, "step": 635 }, { "epoch": 0.933920704845815, "epsilon_dpo/beta": 0.0017982334829866886, "epsilon_dpo/beta_margin_grad_mean": -0.4176199436187744, "epsilon_dpo/beta_margin_grad_std": 0.08222676068544388, "epsilon_dpo/beta_margin_mean": 0.3438073992729187, "epsilon_dpo/beta_margin_std": 0.35238152742385864, "epsilon_dpo/loss_margin_mean": 192.16282653808594, "grad_norm": 24.99462127685547, "kl/avg_steps": 0.65625, "kl/beta": 0.0018099313601851463, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.937510679537628e-09, "logits/chosen": 1.1912219524383545, "logits/rejected": 1.6938612461090088, "logps/chosen": -446.11859130859375, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -660.63134765625, "loss": 1.1013, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6957628726959229, "rewards/margins": 0.3438073992729187, "rewards/rejected": -1.0395703315734863, "step": 636 }, { "epoch": 0.9353891336270191, "epsilon_dpo/beta": 0.001785947591997683, "epsilon_dpo/beta_margin_grad_mean": -0.40572530031204224, "epsilon_dpo/beta_margin_grad_std": 0.0916060358285904, "epsilon_dpo/beta_margin_mean": 0.397985577583313, "epsilon_dpo/beta_margin_std": 0.3974331319332123, "epsilon_dpo/loss_margin_mean": 223.77783203125, "grad_norm": 29.209657669067383, "kl/avg_steps": 0.6875, "kl/beta": 0.0017981311539188027, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 6.640486409826785e-09, "logits/chosen": 1.282531976699829, "logits/rejected": 1.531841516494751, "logps/chosen": -461.0699462890625, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -733.6002197265625, "loss": 1.0647, "rewards/accuracies": 0.859375, "rewards/chosen": -0.735283613204956, "rewards/margins": 0.397985577583313, "rewards/rejected": -1.1332693099975586, "step": 637 }, { "epoch": 0.9368575624082232, "epsilon_dpo/beta": 0.0017759855836629868, "epsilon_dpo/beta_margin_grad_mean": -0.418316513299942, "epsilon_dpo/beta_margin_grad_std": 0.08480395376682281, "epsilon_dpo/beta_margin_mean": 0.34098342061042786, "epsilon_dpo/beta_margin_std": 0.3596995174884796, "epsilon_dpo/loss_margin_mean": 193.12738037109375, "grad_norm": 24.57693862915039, "kl/avg_steps": 0.5625, "kl/beta": 0.001785853412002325, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 6.349874889624962e-09, "logits/chosen": 1.1940290927886963, "logits/rejected": 1.8498295545578003, "logps/chosen": -444.61712646484375, "logps/ref_chosen": -58.156646728515625, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -658.889404296875, "loss": 1.1051, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6871584057807922, "rewards/margins": 0.34098342061042786, "rewards/rejected": -1.028141736984253, "step": 638 }, { "epoch": 0.9383259911894273, "epsilon_dpo/beta": 0.0017677166033536196, "epsilon_dpo/beta_margin_grad_mean": -0.4446796178817749, "epsilon_dpo/beta_margin_grad_std": 0.09073272347450256, "epsilon_dpo/beta_margin_mean": 0.22914056479930878, "epsilon_dpo/beta_margin_std": 0.3779585361480713, "epsilon_dpo/loss_margin_mean": 131.0664825439453, "grad_norm": 35.84690856933594, "kl/avg_steps": 0.46875, "kl/beta": 0.0017758641624823213, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.065683776815933e-09, "logits/chosen": 0.9751791954040527, "logits/rejected": 2.1251561641693115, "logps/chosen": -530.078369140625, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -663.0965576171875, "loss": 1.205, "rewards/accuracies": 0.734375, "rewards/chosen": -0.810213029384613, "rewards/margins": 0.22914057970046997, "rewards/rejected": -1.039353609085083, "step": 639 }, { "epoch": 0.9397944199706314, "epsilon_dpo/beta": 0.0017561545828357339, "epsilon_dpo/beta_margin_grad_mean": -0.40443187952041626, "epsilon_dpo/beta_margin_grad_std": 0.09270329028367996, "epsilon_dpo/beta_margin_mean": 0.40909114480018616, "epsilon_dpo/beta_margin_std": 0.42611533403396606, "epsilon_dpo/loss_margin_mean": 233.86619567871094, "grad_norm": 22.977806091308594, "kl/avg_steps": 0.65625, "kl/beta": 0.0017675786511972547, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.7879205600998296e-09, "logits/chosen": 1.0889379978179932, "logits/rejected": 1.4330050945281982, "logps/chosen": -454.34246826171875, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -740.6744384765625, "loss": 1.0597, "rewards/accuracies": 0.875, "rewards/chosen": -0.7000018358230591, "rewards/margins": 0.40909114480018616, "rewards/rejected": -1.1090929508209229, "step": 640 }, { "epoch": 0.9412628487518355, "epsilon_dpo/beta": 0.0017463513650000095, "epsilon_dpo/beta_margin_grad_mean": -0.43330150842666626, "epsilon_dpo/beta_margin_grad_std": 0.09495817869901657, "epsilon_dpo/beta_margin_mean": 0.2847559452056885, "epsilon_dpo/beta_margin_std": 0.4316932260990143, "epsilon_dpo/loss_margin_mean": 164.40440368652344, "grad_norm": 22.582042694091797, "kl/avg_steps": 0.5625, "kl/beta": 0.0017560544656589627, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5.516592558795746e-09, "logits/chosen": 1.144837737083435, "logits/rejected": 1.7584877014160156, "logps/chosen": -524.75146484375, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -711.1512451171875, "loss": 1.1644, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8040879964828491, "rewards/margins": 0.2847559452056885, "rewards/rejected": -1.0888439416885376, "step": 641 }, { "epoch": 0.9427312775330396, "epsilon_dpo/beta": 0.0017393117304891348, "epsilon_dpo/beta_margin_grad_mean": -0.42638909816741943, "epsilon_dpo/beta_margin_grad_std": 0.10415295511484146, "epsilon_dpo/beta_margin_mean": 0.318130224943161, "epsilon_dpo/beta_margin_std": 0.47046786546707153, "epsilon_dpo/loss_margin_mean": 184.53005981445312, "grad_norm": 30.074098587036133, "kl/avg_steps": 0.40625, "kl/beta": 0.001746231922879815, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.251706922648868e-09, "logits/chosen": 1.054837942123413, "logits/rejected": 1.3688443899154663, "logps/chosen": -474.4324951171875, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -703.515380859375, "loss": 1.1442, "rewards/accuracies": 0.75, "rewards/chosen": -0.7122145891189575, "rewards/margins": 0.318130224943161, "rewards/rejected": -1.030344843864441, "step": 642 }, { "epoch": 0.9441997063142438, "epsilon_dpo/beta": 0.0017290131654590368, "epsilon_dpo/beta_margin_grad_mean": -0.4249967932701111, "epsilon_dpo/beta_margin_grad_std": 0.07674811780452728, "epsilon_dpo/beta_margin_mean": 0.3110256791114807, "epsilon_dpo/beta_margin_std": 0.3235742747783661, "epsilon_dpo/loss_margin_mean": 180.82168579101562, "grad_norm": 19.92149543762207, "kl/avg_steps": 0.59375, "kl/beta": 0.0017391665605828166, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.993270631642038e-09, "logits/chosen": 1.3465564250946045, "logits/rejected": 1.6335021257400513, "logps/chosen": -459.1343994140625, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -675.4744873046875, "loss": 1.1245, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7049366235733032, "rewards/margins": 0.3110256791114807, "rewards/rejected": -1.0159623622894287, "step": 643 }, { "epoch": 0.9456681350954479, "epsilon_dpo/beta": 0.0017204286996275187, "epsilon_dpo/beta_margin_grad_mean": -0.4298701286315918, "epsilon_dpo/beta_margin_grad_std": 0.09674103558063507, "epsilon_dpo/beta_margin_mean": 0.29447636008262634, "epsilon_dpo/beta_margin_std": 0.4145286977291107, "epsilon_dpo/loss_margin_mean": 172.5329132080078, "grad_norm": 24.23951530456543, "kl/avg_steps": 0.5, "kl/beta": 0.0017289011739194393, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.741290495811873e-09, "logits/chosen": 1.12656569480896, "logits/rejected": 1.5934898853302002, "logps/chosen": -467.9019470214844, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -668.5538330078125, "loss": 1.1544, "rewards/accuracies": 0.796875, "rewards/chosen": -0.704412579536438, "rewards/margins": 0.29447638988494873, "rewards/rejected": -0.9988888502120972, "step": 644 }, { "epoch": 0.947136563876652, "epsilon_dpo/beta": 0.001714020036160946, "epsilon_dpo/beta_margin_grad_mean": -0.44414132833480835, "epsilon_dpo/beta_margin_grad_std": 0.10007171332836151, "epsilon_dpo/beta_margin_mean": 0.23775628209114075, "epsilon_dpo/beta_margin_std": 0.4299026131629944, "epsilon_dpo/loss_margin_mean": 140.25137329101562, "grad_norm": 33.30598068237305, "kl/avg_steps": 0.375, "kl/beta": 0.0017202997114509344, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.495773155069299e-09, "logits/chosen": 1.328917145729065, "logits/rejected": 1.2759854793548584, "logps/chosen": -468.2938232421875, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -650.449951171875, "loss": 1.2067, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7081266641616821, "rewards/margins": 0.23775628209114075, "rewards/rejected": -0.9458829164505005, "step": 645 }, { "epoch": 0.9486049926578561, "epsilon_dpo/beta": 0.0017049381276592612, "epsilon_dpo/beta_margin_grad_mean": -0.43052205443382263, "epsilon_dpo/beta_margin_grad_std": 0.08512377738952637, "epsilon_dpo/beta_margin_mean": 0.2906644940376282, "epsilon_dpo/beta_margin_std": 0.3636190891265869, "epsilon_dpo/loss_margin_mean": 171.6541748046875, "grad_norm": 27.597734451293945, "kl/avg_steps": 0.53125, "kl/beta": 0.001713872654363513, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.256725079024553e-09, "logits/chosen": 1.1945412158966064, "logits/rejected": 1.949883222579956, "logps/chosen": -483.1705627441406, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -671.0547485351562, "loss": 1.1482, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7204253673553467, "rewards/margins": 0.2906644940376282, "rewards/rejected": -1.01108980178833, "step": 646 }, { "epoch": 0.9500734214390602, "epsilon_dpo/beta": 0.0016948629636317492, "epsilon_dpo/beta_margin_grad_mean": -0.4299209713935852, "epsilon_dpo/beta_margin_grad_std": 0.08932579308748245, "epsilon_dpo/beta_margin_mean": 0.2924911379814148, "epsilon_dpo/beta_margin_std": 0.37851130962371826, "epsilon_dpo/loss_margin_mean": 173.77426147460938, "grad_norm": 26.752613067626953, "kl/avg_steps": 0.59375, "kl/beta": 0.001704815891571343, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.024152566816791e-09, "logits/chosen": 1.186145544052124, "logits/rejected": 1.2575244903564453, "logps/chosen": -430.82135009765625, "logps/ref_chosen": -54.852413177490234, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -643.2625732421875, "loss": 1.1495, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6383639574050903, "rewards/margins": 0.2924911379814148, "rewards/rejected": -0.9308550953865051, "step": 647 }, { "epoch": 0.9515418502202643, "epsilon_dpo/beta": 0.001684859162196517, "epsilon_dpo/beta_margin_grad_mean": -0.41094687581062317, "epsilon_dpo/beta_margin_grad_std": 0.10572434961795807, "epsilon_dpo/beta_margin_mean": 0.3823206424713135, "epsilon_dpo/beta_margin_std": 0.47271671891212463, "epsilon_dpo/loss_margin_mean": 228.3726806640625, "grad_norm": 22.86119842529297, "kl/avg_steps": 0.59375, "kl/beta": 0.0016947533003985882, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.798061746947995e-09, "logits/chosen": 1.4940239191055298, "logits/rejected": 1.5447354316711426, "logps/chosen": -468.0401611328125, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.71279907226562, "logps/rejected": -740.9541625976562, "loss": 1.0918, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6989763975143433, "rewards/margins": 0.3823206424713135, "rewards/rejected": -1.0812969207763672, "step": 648 }, { "epoch": 0.9530102790014684, "epsilon_dpo/beta": 0.0016796531854197383, "epsilon_dpo/beta_margin_grad_mean": -0.44180822372436523, "epsilon_dpo/beta_margin_grad_std": 0.09041650593280792, "epsilon_dpo/beta_margin_mean": 0.24603573977947235, "epsilon_dpo/beta_margin_std": 0.3954623341560364, "epsilon_dpo/loss_margin_mean": 147.93875122070312, "grad_norm": 20.313772201538086, "kl/avg_steps": 0.3125, "kl/beta": 0.0016847500810399652, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 3.5784585771215235e-09, "logits/chosen": 1.1369266510009766, "logits/rejected": 1.797896146774292, "logps/chosen": -488.5963134765625, "logps/ref_chosen": -62.4803466796875, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -654.1318969726562, "loss": 1.1922, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7163372039794922, "rewards/margins": 0.24603573977947235, "rewards/rejected": -0.9623730182647705, "step": 649 }, { "epoch": 0.9544787077826725, "epsilon_dpo/beta": 0.0016707462491467595, "epsilon_dpo/beta_margin_grad_mean": -0.4147437810897827, "epsilon_dpo/beta_margin_grad_std": 0.09809747338294983, "epsilon_dpo/beta_margin_mean": 0.36171820759773254, "epsilon_dpo/beta_margin_std": 0.42442724108695984, "epsilon_dpo/loss_margin_mean": 217.962158203125, "grad_norm": 23.547075271606445, "kl/avg_steps": 0.53125, "kl/beta": 0.0016795016126707196, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.3653488440851253e-09, "logits/chosen": 1.4009277820587158, "logits/rejected": 1.5325489044189453, "logps/chosen": -468.0220947265625, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -728.15625, "loss": 1.0995, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6900794506072998, "rewards/margins": 0.36171823740005493, "rewards/rejected": -1.0517977476119995, "step": 650 }, { "epoch": 0.9559471365638766, "epsilon_dpo/beta": 0.0016598289366811514, "epsilon_dpo/beta_margin_grad_mean": -0.41106364130973816, "epsilon_dpo/beta_margin_grad_std": 0.08254638314247131, "epsilon_dpo/beta_margin_mean": 0.37262973189353943, "epsilon_dpo/beta_margin_std": 0.35590580105781555, "epsilon_dpo/loss_margin_mean": 225.4111328125, "grad_norm": 26.650527954101562, "kl/avg_steps": 0.65625, "kl/beta": 0.0016706264577805996, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.158738163478475e-09, "logits/chosen": 1.4459567070007324, "logits/rejected": 1.4230878353118896, "logps/chosen": -401.56536865234375, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.9579086303711, "logps/rejected": -683.5089111328125, "loss": 1.078, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5953332185745239, "rewards/margins": 0.37262973189353943, "rewards/rejected": -0.967962920665741, "step": 651 }, { "epoch": 0.9574155653450808, "epsilon_dpo/beta": 0.0016495260642841458, "epsilon_dpo/beta_margin_grad_mean": -0.42614489793777466, "epsilon_dpo/beta_margin_grad_std": 0.08484771847724915, "epsilon_dpo/beta_margin_mean": 0.31016284227371216, "epsilon_dpo/beta_margin_std": 0.36632609367370605, "epsilon_dpo/loss_margin_mean": 188.94003295898438, "grad_norm": 22.533578872680664, "kl/avg_steps": 0.625, "kl/beta": 0.001659734407439828, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9586319796851555e-09, "logits/chosen": 0.9193169474601746, "logits/rejected": 1.0220489501953125, "logps/chosen": -452.76849365234375, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -690.8995361328125, "loss": 1.1318, "rewards/accuracies": 0.875, "rewards/chosen": -0.6445059776306152, "rewards/margins": 0.31016284227371216, "rewards/rejected": -0.9546687602996826, "step": 652 }, { "epoch": 0.9588839941262849, "epsilon_dpo/beta": 0.0016382494941353798, "epsilon_dpo/beta_margin_grad_mean": -0.42025431990623474, "epsilon_dpo/beta_margin_grad_std": 0.08646851778030396, "epsilon_dpo/beta_margin_mean": 0.335557758808136, "epsilon_dpo/beta_margin_std": 0.37546736001968384, "epsilon_dpo/loss_margin_mean": 205.70590209960938, "grad_norm": 28.52092933654785, "kl/avg_steps": 0.6875, "kl/beta": 0.0016494254814460874, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.7650355656892166e-09, "logits/chosen": 1.114861011505127, "logits/rejected": 1.439771056175232, "logps/chosen": -489.69744873046875, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -737.5400390625, "loss": 1.1119, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7029273509979248, "rewards/margins": 0.335557758808136, "rewards/rejected": -1.038485050201416, "step": 653 }, { "epoch": 0.960352422907489, "epsilon_dpo/beta": 0.0016321832081303, "epsilon_dpo/beta_margin_grad_mean": -0.4364525079727173, "epsilon_dpo/beta_margin_grad_std": 0.09288126975297928, "epsilon_dpo/beta_margin_mean": 0.2680140733718872, "epsilon_dpo/beta_margin_std": 0.4029082953929901, "epsilon_dpo/loss_margin_mean": 165.78965759277344, "grad_norm": 23.749624252319336, "kl/avg_steps": 0.375, "kl/beta": 0.0016381631139665842, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.577954022936174e-09, "logits/chosen": 1.2121856212615967, "logits/rejected": 1.547875165939331, "logps/chosen": -492.3443908691406, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -695.1796875, "loss": 1.1746, "rewards/accuracies": 0.75, "rewards/chosen": -0.7043324112892151, "rewards/margins": 0.2680141031742096, "rewards/rejected": -0.9723464846611023, "step": 654 }, { "epoch": 0.9618208516886931, "epsilon_dpo/beta": 0.0016220049001276493, "epsilon_dpo/beta_margin_grad_mean": -0.42558589577674866, "epsilon_dpo/beta_margin_grad_std": 0.09087540209293365, "epsilon_dpo/beta_margin_mean": 0.31348636746406555, "epsilon_dpo/beta_margin_std": 0.3893417716026306, "epsilon_dpo/loss_margin_mean": 194.35455322265625, "grad_norm": 30.1119384765625, "kl/avg_steps": 0.625, "kl/beta": 0.0016320429276674986, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.397392281198729e-09, "logits/chosen": 1.3309032917022705, "logits/rejected": 1.3572744131088257, "logps/chosen": -450.25946044921875, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -693.3289794921875, "loss": 1.1333, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6509025692939758, "rewards/margins": 0.31348636746406555, "rewards/rejected": -0.9643889665603638, "step": 655 }, { "epoch": 0.9632892804698973, "epsilon_dpo/beta": 0.0016091398429125547, "epsilon_dpo/beta_margin_grad_mean": -0.387954443693161, "epsilon_dpo/beta_margin_grad_std": 0.07566812634468079, "epsilon_dpo/beta_margin_mean": 0.46876344084739685, "epsilon_dpo/beta_margin_std": 0.32952094078063965, "epsilon_dpo/loss_margin_mean": 291.9250793457031, "grad_norm": 34.51503372192383, "kl/avg_steps": 0.796875, "kl/beta": 0.0016219060635194182, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.223355098446622e-09, "logits/chosen": 1.2120656967163086, "logits/rejected": 1.238845944404602, "logps/chosen": -435.2579345703125, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -788.3082275390625, "loss": 0.9972, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6157395839691162, "rewards/margins": 0.46876341104507446, "rewards/rejected": -1.0845030546188354, "step": 656 }, { "epoch": 0.9647577092511013, "epsilon_dpo/beta": 0.0015976781724020839, "epsilon_dpo/beta_margin_grad_mean": -0.40962353348731995, "epsilon_dpo/beta_margin_grad_std": 0.08101543039083481, "epsilon_dpo/beta_margin_mean": 0.3786521255970001, "epsilon_dpo/beta_margin_std": 0.35267093777656555, "epsilon_dpo/loss_margin_mean": 237.8256378173828, "grad_norm": 32.31270980834961, "kl/avg_steps": 0.71875, "kl/beta": 0.0016090837307274342, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.055847060721566e-09, "logits/chosen": 1.4408583641052246, "logits/rejected": 1.516934871673584, "logps/chosen": -442.75128173828125, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -731.791259765625, "loss": 1.0724, "rewards/accuracies": 0.875, "rewards/chosen": -0.6334355473518372, "rewards/margins": 0.37865209579467773, "rewards/rejected": -1.0120875835418701, "step": 657 }, { "epoch": 0.9662261380323054, "epsilon_dpo/beta": 0.0015887732151895761, "epsilon_dpo/beta_margin_grad_mean": -0.428006112575531, "epsilon_dpo/beta_margin_grad_std": 0.07601499557495117, "epsilon_dpo/beta_margin_mean": 0.2968916893005371, "epsilon_dpo/beta_margin_std": 0.3177672028541565, "epsilon_dpo/loss_margin_mean": 188.05613708496094, "grad_norm": 29.608266830444336, "kl/avg_steps": 0.5625, "kl/beta": 0.0015976008726283908, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8948725820160662e-09, "logits/chosen": 1.3499019145965576, "logits/rejected": 1.6195529699325562, "logps/chosen": -495.99505615234375, "logps/ref_chosen": -60.958213806152344, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -719.032470703125, "loss": 1.1358, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6918191909790039, "rewards/margins": 0.2968916893005371, "rewards/rejected": -0.988710880279541, "step": 658 }, { "epoch": 0.9676945668135095, "epsilon_dpo/beta": 0.0015798864187672734, "epsilon_dpo/beta_margin_grad_mean": -0.4219341576099396, "epsilon_dpo/beta_margin_grad_std": 0.08217877149581909, "epsilon_dpo/beta_margin_mean": 0.32547417283058167, "epsilon_dpo/beta_margin_std": 0.35009095072746277, "epsilon_dpo/loss_margin_mean": 207.2586212158203, "grad_norm": 21.250402450561523, "kl/avg_steps": 0.5625, "kl/beta": 0.0015886647161096334, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.7404359041573723e-09, "logits/chosen": 0.7977874279022217, "logits/rejected": 1.7436567544937134, "logps/chosen": -492.78631591796875, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -710.77294921875, "loss": 1.1164, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6578041315078735, "rewards/margins": 0.32547420263290405, "rewards/rejected": -0.9832782745361328, "step": 659 }, { "epoch": 0.9691629955947136, "epsilon_dpo/beta": 0.0015685806283727288, "epsilon_dpo/beta_margin_grad_mean": -0.417694091796875, "epsilon_dpo/beta_margin_grad_std": 0.07718788832426071, "epsilon_dpo/beta_margin_mean": 0.3406737148761749, "epsilon_dpo/beta_margin_std": 0.32832127809524536, "epsilon_dpo/loss_margin_mean": 218.10269165039062, "grad_norm": 19.845855712890625, "kl/avg_steps": 0.71875, "kl/beta": 0.001579778385348618, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.592541096695571e-09, "logits/chosen": 1.3539650440216064, "logits/rejected": 2.1877787113189697, "logps/chosen": -476.62286376953125, "logps/ref_chosen": -59.047882080078125, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -711.6376953125, "loss": 1.1002, "rewards/accuracies": 0.875, "rewards/chosen": -0.6557613611221313, "rewards/margins": 0.3406737148761749, "rewards/rejected": -0.9964351058006287, "step": 660 }, { "epoch": 0.9706314243759178, "epsilon_dpo/beta": 0.0015568967210128903, "epsilon_dpo/beta_margin_grad_mean": -0.4247366786003113, "epsilon_dpo/beta_margin_grad_std": 0.08477568626403809, "epsilon_dpo/beta_margin_mean": 0.31514978408813477, "epsilon_dpo/beta_margin_std": 0.3677848279476166, "epsilon_dpo/loss_margin_mean": 203.2999267578125, "grad_norm": 21.829837799072266, "kl/avg_steps": 0.75, "kl/beta": 0.0015685048419982195, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.4511920567963908e-09, "logits/chosen": 1.1137815713882446, "logits/rejected": 1.8122587203979492, "logps/chosen": -437.73468017578125, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -676.3663330078125, "loss": 1.1278, "rewards/accuracies": 0.875, "rewards/chosen": -0.6036039590835571, "rewards/margins": 0.31514978408813477, "rewards/rejected": -0.9187537431716919, "step": 661 }, { "epoch": 0.9720998531571219, "epsilon_dpo/beta": 0.00154968595597893, "epsilon_dpo/beta_margin_grad_mean": -0.4349568486213684, "epsilon_dpo/beta_margin_grad_std": 0.08338849246501923, "epsilon_dpo/beta_margin_mean": 0.2698204517364502, "epsilon_dpo/beta_margin_std": 0.3489322364330292, "epsilon_dpo/loss_margin_mean": 175.56666564941406, "grad_norm": 19.571136474609375, "kl/avg_steps": 0.46875, "kl/beta": 0.0015568286180496216, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.3163925091384532e-09, "logits/chosen": 1.0110704898834229, "logits/rejected": 1.527016282081604, "logps/chosen": -493.9742431640625, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -689.3358154296875, "loss": 1.1641, "rewards/accuracies": 0.765625, "rewards/chosen": -0.6591126918792725, "rewards/margins": 0.2698204517364502, "rewards/rejected": -0.9289331436157227, "step": 662 }, { "epoch": 0.973568281938326, "epsilon_dpo/beta": 0.0015390656189993024, "epsilon_dpo/beta_margin_grad_mean": -0.42479458451271057, "epsilon_dpo/beta_margin_grad_std": 0.08391547948122025, "epsilon_dpo/beta_margin_mean": 0.31414592266082764, "epsilon_dpo/beta_margin_std": 0.36153239011764526, "epsilon_dpo/loss_margin_mean": 205.07363891601562, "grad_norm": 21.666109085083008, "kl/avg_steps": 0.6875, "kl/beta": 0.0015495650004595518, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.1881460058152382e-09, "logits/chosen": 0.893703043460846, "logits/rejected": 0.9986363053321838, "logps/chosen": -456.033447265625, "logps/ref_chosen": -64.87891387939453, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -710.153564453125, "loss": 1.1277, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6028608083724976, "rewards/margins": 0.31414592266082764, "rewards/rejected": -0.9170067310333252, "step": 663 }, { "epoch": 0.9750367107195301, "epsilon_dpo/beta": 0.001530480687506497, "epsilon_dpo/beta_margin_grad_mean": -0.4216407239437103, "epsilon_dpo/beta_margin_grad_std": 0.08829301595687866, "epsilon_dpo/beta_margin_mean": 0.3248541057109833, "epsilon_dpo/beta_margin_std": 0.37472057342529297, "epsilon_dpo/loss_margin_mean": 213.6553497314453, "grad_norm": 21.28402328491211, "kl/avg_steps": 0.5625, "kl/beta": 0.001538984477519989, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.066455926241383e-09, "logits/chosen": 1.216430902481079, "logits/rejected": 1.6328184604644775, "logps/chosen": -497.7940673828125, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -756.0826416015625, "loss": 1.1214, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6699653267860413, "rewards/margins": 0.3248541057109833, "rewards/rejected": -0.9948194622993469, "step": 664 }, { "epoch": 0.9765051395007343, "epsilon_dpo/beta": 0.0015195284504443407, "epsilon_dpo/beta_margin_grad_mean": -0.4235166311264038, "epsilon_dpo/beta_margin_grad_std": 0.07510911673307419, "epsilon_dpo/beta_margin_mean": 0.3179273307323456, "epsilon_dpo/beta_margin_std": 0.32217973470687866, "epsilon_dpo/loss_margin_mean": 210.031005859375, "grad_norm": 20.151309967041016, "kl/avg_steps": 0.71875, "kl/beta": 0.0015303761465474963, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.513254770636137e-10, "logits/chosen": 1.220696210861206, "logits/rejected": 1.80734384059906, "logps/chosen": -484.7784423828125, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.8088150024414, "logps/rejected": -719.0540771484375, "loss": 1.1182, "rewards/accuracies": 0.875, "rewards/chosen": -0.645055890083313, "rewards/margins": 0.3179273307323456, "rewards/rejected": -0.962983250617981, "step": 665 }, { "epoch": 0.9779735682819384, "epsilon_dpo/beta": 0.0015096345450729132, "epsilon_dpo/beta_margin_grad_mean": -0.4246806502342224, "epsilon_dpo/beta_margin_grad_std": 0.06764908134937286, "epsilon_dpo/beta_margin_mean": 0.30967044830322266, "epsilon_dpo/beta_margin_std": 0.2827000617980957, "epsilon_dpo/loss_margin_mean": 206.04354858398438, "grad_norm": 21.010101318359375, "kl/avg_steps": 0.65625, "kl/beta": 0.001519454992376268, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.427576920763956e-10, "logits/chosen": 1.0827593803405762, "logits/rejected": 1.5170735120773315, "logps/chosen": -465.0032958984375, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.89163208007812, "logps/rejected": -702.5185546875, "loss": 1.1198, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6052336692810059, "rewards/margins": 0.30967044830322266, "rewards/rejected": -0.9149041175842285, "step": 666 }, { "epoch": 0.9794419970631424, "epsilon_dpo/beta": 0.0014995539095252752, "epsilon_dpo/beta_margin_grad_mean": -0.430759996175766, "epsilon_dpo/beta_margin_grad_std": 0.07021576911211014, "epsilon_dpo/beta_margin_mean": 0.28509852290153503, "epsilon_dpo/beta_margin_std": 0.2928607165813446, "epsilon_dpo/loss_margin_mean": 191.0198516845703, "grad_norm": 27.546092987060547, "kl/avg_steps": 0.671875, "kl/beta": 0.001509548630565405, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.407554321417764e-10, "logits/chosen": 1.0864355564117432, "logits/rejected": 1.7724149227142334, "logps/chosen": -513.462890625, "logps/ref_chosen": -69.27703094482422, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -723.0411987304688, "loss": 1.1422, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6668318510055542, "rewards/margins": 0.28509852290153503, "rewards/rejected": -0.9519304037094116, "step": 667 }, { "epoch": 0.9809104258443465, "epsilon_dpo/beta": 0.001492125797085464, "epsilon_dpo/beta_margin_grad_mean": -0.44598451256752014, "epsilon_dpo/beta_margin_grad_std": 0.08418892323970795, "epsilon_dpo/beta_margin_mean": 0.22311602532863617, "epsilon_dpo/beta_margin_std": 0.3523963391780853, "epsilon_dpo/loss_margin_mean": 150.9296875, "grad_norm": 32.91999816894531, "kl/avg_steps": 0.5, "kl/beta": 0.001499474048614502, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 6.453213851142225e-10, "logits/chosen": 1.0455001592636108, "logits/rejected": 1.1969373226165771, "logps/chosen": -527.62158203125, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905181884766, "logps/rejected": -709.686279296875, "loss": 1.2058, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6804643869400024, "rewards/margins": 0.22311602532863617, "rewards/rejected": -0.903580367565155, "step": 668 }, { "epoch": 0.9823788546255506, "epsilon_dpo/beta": 0.0014805055689066648, "epsilon_dpo/beta_margin_grad_mean": -0.42099520564079285, "epsilon_dpo/beta_margin_grad_std": 0.08258918672800064, "epsilon_dpo/beta_margin_mean": 0.3277672231197357, "epsilon_dpo/beta_margin_std": 0.34995928406715393, "epsilon_dpo/loss_margin_mean": 222.26109313964844, "grad_norm": 18.236900329589844, "kl/avg_steps": 0.78125, "kl/beta": 0.0014920139219611883, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 5.564580657695939e-10, "logits/chosen": 1.1504340171813965, "logits/rejected": 1.864037275314331, "logps/chosen": -422.7698974609375, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -676.8389892578125, "loss": 1.1146, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5582731366157532, "rewards/margins": 0.3277672529220581, "rewards/rejected": -0.8860403299331665, "step": 669 }, { "epoch": 0.9838472834067548, "epsilon_dpo/beta": 0.0014731930568814278, "epsilon_dpo/beta_margin_grad_mean": -0.4170011579990387, "epsilon_dpo/beta_margin_grad_std": 0.0830448642373085, "epsilon_dpo/beta_margin_mean": 0.3461667001247406, "epsilon_dpo/beta_margin_std": 0.35282188653945923, "epsilon_dpo/loss_margin_mean": 236.43572998046875, "grad_norm": 17.821779251098633, "kl/avg_steps": 0.5, "kl/beta": 0.0014804479433223605, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.741678157389739e-10, "logits/chosen": 1.1266391277313232, "logits/rejected": 1.5027225017547607, "logps/chosen": -417.00958251953125, "logps/ref_chosen": -62.34575653076172, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -688.0401611328125, "loss": 1.0996, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5237119793891907, "rewards/margins": 0.3461666703224182, "rewards/rejected": -0.8698786497116089, "step": 670 }, { "epoch": 0.9853157121879589, "epsilon_dpo/beta": 0.001465403358452022, "epsilon_dpo/beta_margin_grad_mean": -0.4314085841178894, "epsilon_dpo/beta_margin_grad_std": 0.08191683143377304, "epsilon_dpo/beta_margin_mean": 0.28351274132728577, "epsilon_dpo/beta_margin_std": 0.3416160047054291, "epsilon_dpo/loss_margin_mean": 194.85296630859375, "grad_norm": 22.806631088256836, "kl/avg_steps": 0.53125, "kl/beta": 0.001473082578741014, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.9845280344705245e-10, "logits/chosen": 1.5543603897094727, "logits/rejected": 1.8255066871643066, "logps/chosen": -476.3839111328125, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -707.0560302734375, "loss": 1.1511, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6287015676498413, "rewards/margins": 0.28351277112960815, "rewards/rejected": -0.9122143387794495, "step": 671 }, { "epoch": 0.986784140969163, "epsilon_dpo/beta": 0.0014567435719072819, "epsilon_dpo/beta_margin_grad_mean": -0.4389883875846863, "epsilon_dpo/beta_margin_grad_std": 0.08010012656450272, "epsilon_dpo/beta_margin_mean": 0.2526932656764984, "epsilon_dpo/beta_margin_std": 0.336892694234848, "epsilon_dpo/loss_margin_mean": 174.6773223876953, "grad_norm": 22.33818244934082, "kl/avg_steps": 0.59375, "kl/beta": 0.0014652981190010905, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.293150240547549e-10, "logits/chosen": 1.262594223022461, "logits/rejected": 1.5828099250793457, "logps/chosen": -524.9901123046875, "logps/ref_chosen": -58.583290100097656, "logps/ref_rejected": -93.14014434814453, "logps/rejected": -734.224365234375, "loss": 1.1769, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6801042556762695, "rewards/margins": 0.2526932656764984, "rewards/rejected": -0.9327975511550903, "step": 672 }, { "epoch": 0.9882525697503671, "epsilon_dpo/beta": 0.0014463242841884494, "epsilon_dpo/beta_margin_grad_mean": -0.42500680685043335, "epsilon_dpo/beta_margin_grad_std": 0.06854859739542007, "epsilon_dpo/beta_margin_mean": 0.3083582818508148, "epsilon_dpo/beta_margin_std": 0.2860918939113617, "epsilon_dpo/loss_margin_mean": 214.06727600097656, "grad_norm": 25.738595962524414, "kl/avg_steps": 0.71875, "kl/beta": 0.001456649275496602, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.6675629940689504e-10, "logits/chosen": 1.4295220375061035, "logits/rejected": 1.8196861743927002, "logps/chosen": -452.23095703125, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -704.8712768554688, "loss": 1.1214, "rewards/accuracies": 0.875, "rewards/chosen": -0.5871830582618713, "rewards/margins": 0.30835825204849243, "rewards/rejected": -0.8955413699150085, "step": 673 }, { "epoch": 0.9897209985315712, "epsilon_dpo/beta": 0.0014355509774759412, "epsilon_dpo/beta_margin_grad_mean": -0.4180900752544403, "epsilon_dpo/beta_margin_grad_std": 0.07587840408086777, "epsilon_dpo/beta_margin_mean": 0.34153473377227783, "epsilon_dpo/beta_margin_std": 0.3308585286140442, "epsilon_dpo/loss_margin_mean": 238.66107177734375, "grad_norm": 17.829362869262695, "kl/avg_steps": 0.75, "kl/beta": 0.0014462543185800314, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.1077827798404725e-10, "logits/chosen": 1.5478763580322266, "logits/rejected": 2.0943801403045654, "logps/chosen": -419.6162109375, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -682.877685546875, "loss": 1.0994, "rewards/accuracies": 0.875, "rewards/chosen": -0.5376299619674683, "rewards/margins": 0.34153473377227783, "rewards/rejected": -0.8791646957397461, "step": 674 }, { "epoch": 0.9911894273127754, "epsilon_dpo/beta": 0.001426659058779478, "epsilon_dpo/beta_margin_grad_mean": -0.41543251276016235, "epsilon_dpo/beta_margin_grad_std": 0.0737193152308464, "epsilon_dpo/beta_margin_mean": 0.35156434774398804, "epsilon_dpo/beta_margin_std": 0.31687280535697937, "epsilon_dpo/loss_margin_mean": 247.4196014404297, "grad_norm": 16.485437393188477, "kl/avg_steps": 0.625, "kl/beta": 0.0014354882296174765, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6138243485910863e-10, "logits/chosen": 1.6052007675170898, "logits/rejected": 2.07883358001709, "logps/chosen": -425.02178955078125, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -702.3570556640625, "loss": 1.0892, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5436429977416992, "rewards/margins": 0.35156434774398804, "rewards/rejected": -0.8952072858810425, "step": 675 }, { "epoch": 0.9926578560939795, "epsilon_dpo/beta": 0.0014177978737279773, "epsilon_dpo/beta_margin_grad_mean": -0.42855557799339294, "epsilon_dpo/beta_margin_grad_std": 0.06745462119579315, "epsilon_dpo/beta_margin_mean": 0.29386886954307556, "epsilon_dpo/beta_margin_std": 0.2816160023212433, "epsilon_dpo/loss_margin_mean": 208.23379516601562, "grad_norm": 24.32298469543457, "kl/avg_steps": 0.625, "kl/beta": 0.0014265720965340734, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.1857007165852472e-10, "logits/chosen": 1.0184485912322998, "logits/rejected": 1.7093250751495361, "logps/chosen": -479.9848327636719, "logps/ref_chosen": -71.39852142333984, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -705.1788330078125, "loss": 1.1331, "rewards/accuracies": 0.875, "rewards/chosen": -0.5800386667251587, "rewards/margins": 0.2938688397407532, "rewards/rejected": -0.8739074468612671, "step": 676 }, { "epoch": 0.9941262848751835, "epsilon_dpo/beta": 0.0014094345970079303, "epsilon_dpo/beta_margin_grad_mean": -0.42703139781951904, "epsilon_dpo/beta_margin_grad_std": 0.06837636977434158, "epsilon_dpo/beta_margin_mean": 0.3009083867073059, "epsilon_dpo/beta_margin_std": 0.28873759508132935, "epsilon_dpo/loss_margin_mean": 214.5569610595703, "grad_norm": 15.692850112915039, "kl/avg_steps": 0.59375, "kl/beta": 0.0014177113771438599, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.23423165278725e-11, "logits/chosen": 1.3182166814804077, "logits/rejected": 2.0713021755218506, "logps/chosen": -478.5683288574219, "logps/ref_chosen": -56.52743911743164, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -714.8244018554688, "loss": 1.1279, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5956122875213623, "rewards/margins": 0.3009083867073059, "rewards/rejected": -0.8965206146240234, "step": 677 }, { "epoch": 0.9955947136563876, "epsilon_dpo/beta": 0.001400234643369913, "epsilon_dpo/beta_margin_grad_mean": -0.4241836965084076, "epsilon_dpo/beta_margin_grad_std": 0.07738611847162247, "epsilon_dpo/beta_margin_mean": 0.3137598931789398, "epsilon_dpo/beta_margin_std": 0.32407626509666443, "epsilon_dpo/loss_margin_mean": 225.1610870361328, "grad_norm": 23.71813201904297, "kl/avg_steps": 0.65625, "kl/beta": 0.0014093434438109398, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.270012410216185e-11, "logits/chosen": 1.5611693859100342, "logits/rejected": 1.884830117225647, "logps/chosen": -443.7493896484375, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -703.380615234375, "loss": 1.1224, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5572014451026917, "rewards/margins": 0.31375986337661743, "rewards/rejected": -0.8709613084793091, "step": 678 }, { "epoch": 0.9970631424375918, "epsilon_dpo/beta": 0.0013928557746112347, "epsilon_dpo/beta_margin_grad_mean": -0.4372449219226837, "epsilon_dpo/beta_margin_grad_std": 0.0773983970284462, "epsilon_dpo/beta_margin_mean": 0.2592182755470276, "epsilon_dpo/beta_margin_std": 0.32160359621047974, "epsilon_dpo/loss_margin_mean": 187.47393798828125, "grad_norm": 23.833972930908203, "kl/avg_steps": 0.53125, "kl/beta": 0.0014001548988744617, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.9644275480772416e-11, "logits/chosen": 1.2658469676971436, "logits/rejected": 1.8166303634643555, "logps/chosen": -455.281494140625, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -669.05859375, "loss": 1.169, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5649718046188354, "rewards/margins": 0.25921830534935, "rewards/rejected": -0.8241901397705078, "step": 679 }, { "epoch": 0.9985315712187959, "epsilon_dpo/beta": 0.001384624862112105, "epsilon_dpo/beta_margin_grad_mean": -0.42590072751045227, "epsilon_dpo/beta_margin_grad_std": 0.0837513729929924, "epsilon_dpo/beta_margin_mean": 0.3084065318107605, "epsilon_dpo/beta_margin_std": 0.3552207052707672, "epsilon_dpo/loss_margin_mean": 224.0642852783203, "grad_norm": 21.873979568481445, "kl/avg_steps": 0.59375, "kl/beta": 0.0013927558902651072, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.31753782067201e-11, "logits/chosen": 1.0442841053009033, "logits/rejected": 1.2980847358703613, "logps/chosen": -493.4278869628906, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -752.9612426757812, "loss": 1.1318, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5775318145751953, "rewards/margins": 0.3084065318107605, "rewards/rejected": -0.8859383463859558, "step": 680 }, { "epoch": 1.0, "epsilon_dpo/beta": 0.0013781830202788115, "epsilon_dpo/beta_margin_grad_mean": -0.4362422525882721, "epsilon_dpo/beta_margin_grad_std": 0.07315977662801743, "epsilon_dpo/beta_margin_mean": 0.26297879219055176, "epsilon_dpo/beta_margin_std": 0.3052116930484772, "epsilon_dpo/loss_margin_mean": 192.23939514160156, "grad_norm": 20.94184684753418, "kl/avg_steps": 0.46875, "kl/beta": 0.0013845352223142982, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.2938662507808745e-12, "logits/chosen": 1.2411224842071533, "logits/rejected": 1.6379847526550293, "logps/chosen": -490.7020263671875, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.5579833984375, "logps/rejected": -710.5421142578125, "loss": 1.1631, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5930610299110413, "rewards/margins": 0.26297879219055176, "rewards/rejected": -0.8560398817062378, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 0.8920053139482126, "train_runtime": 3235.1211, "train_samples_per_second": 13.476, "train_steps_per_second": 0.211 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }