{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014684287812041115, "grad_norm": 38.745460510253906, "kl/avg_steps": 0.09375, "kl/beta": 0.10000000149011612, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 0.0, "logits/chosen": -1.585817575454712, "logits/rejected": -0.5333532691001892, "logps/chosen": -85.37664031982422, "logps/ref_chosen": -85.43083190917969, "logps/ref_rejected": -79.90458679199219, "logps/rejected": -79.91163635253906, "loss": 1.381, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005238114856183529, "rewards/margins": 0.005787555128335953, "rewards/rejected": -0.0005494409706443548, "step": 1 }, { "epoch": 0.002936857562408223, "grad_norm": 29.78634262084961, "kl/avg_steps": -0.3125, "kl/beta": 0.09990634024143219, "kl/n_epsilon_steps": 0.65625, "kl/p_epsilon_steps": 0.34375, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.7526164054870605, "logits/rejected": -0.3610996603965759, "logps/chosen": -82.11383056640625, "logps/ref_chosen": -82.06892395019531, "logps/ref_rejected": -81.65457153320312, "logps/rejected": -81.57505798339844, "loss": 1.3995, "rewards/accuracies": 0.375, "rewards/chosen": -0.004650775343179703, "rewards/margins": -0.012771460227668285, "rewards/rejected": 0.008120683953166008, "step": 2 }, { "epoch": 0.004405286343612335, "grad_norm": 26.100635528564453, "kl/avg_steps": 0.09375, "kl/beta": 0.10021952539682388, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -1.002709150314331, "logits/rejected": -0.5633082985877991, "logps/chosen": -93.7535629272461, "logps/ref_chosen": -93.81098937988281, "logps/ref_rejected": -74.22950744628906, "logps/rejected": -74.23006439208984, "loss": 1.3813, "rewards/accuracies": 0.578125, "rewards/chosen": 0.005534623749554157, "rewards/margins": 0.00546031491830945, "rewards/rejected": 7.430883124470711e-05, "step": 3 }, { "epoch": 0.005873715124816446, "grad_norm": 32.92469024658203, "kl/avg_steps": 0.09375, "kl/beta": 0.10012565553188324, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.8497915267944336, "logits/rejected": -0.17156964540481567, "logps/chosen": -87.32073211669922, "logps/ref_chosen": -87.29246520996094, "logps/ref_rejected": -93.82425689697266, "logps/rejected": -93.79373168945312, "loss": 1.3929, "rewards/accuracies": 0.5625, "rewards/chosen": -0.003004699246957898, "rewards/margins": -0.006174374371767044, "rewards/rejected": 0.0031696748919785023, "step": 4 }, { "epoch": 0.007342143906020558, "grad_norm": 35.190330505371094, "kl/avg_steps": 0.03125, "kl/beta": 0.10003187507390976, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 2.898550724637681e-08, "logits/chosen": -1.187368392944336, "logits/rejected": -0.5294585227966309, "logps/chosen": -89.35664367675781, "logps/ref_chosen": -89.33675384521484, "logps/ref_rejected": -88.74783325195312, "logps/rejected": -88.785400390625, "loss": 1.3853, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0021656095050275326, "rewards/margins": 0.0014310609549283981, "rewards/rejected": -0.0035966699942946434, "step": 5 }, { "epoch": 0.00881057268722467, "grad_norm": 36.109169006347656, "kl/avg_steps": 0.109375, "kl/beta": 0.10000062733888626, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.546875, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -1.2944458723068237, "logits/rejected": -0.41752371191978455, "logps/chosen": -97.32476043701172, "logps/ref_chosen": -97.32147216796875, "logps/ref_rejected": -97.88345336914062, "logps/rejected": -97.89209747314453, "loss": 1.3866, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0005067111924290657, "rewards/margins": 0.00019457843154668808, "rewards/rejected": -0.0007012896239757538, "step": 6 }, { "epoch": 0.010279001468428781, "grad_norm": 37.86967086791992, "kl/avg_steps": 0.09375, "kl/beta": 0.09989137202501297, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.6984870433807373, "logits/rejected": -0.42031070590019226, "logps/chosen": -86.60205078125, "logps/ref_chosen": -86.64852905273438, "logps/ref_rejected": -109.61618041992188, "logps/rejected": -109.63433837890625, "loss": 1.3805, "rewards/accuracies": 0.53125, "rewards/chosen": 0.004504315089434385, "rewards/margins": 0.006173794623464346, "rewards/rejected": -0.0016694795340299606, "step": 7 }, { "epoch": 0.011747430249632892, "grad_norm": 32.797054290771484, "kl/avg_steps": -0.125, "kl/beta": 0.0997978076338768, "kl/n_epsilon_steps": 0.5625, "kl/p_epsilon_steps": 0.4375, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -1.414137363433838, "logits/rejected": -0.4588002562522888, "logps/chosen": -89.94332885742188, "logps/ref_chosen": -89.9236831665039, "logps/ref_rejected": -86.22803497314453, "logps/rejected": -86.1485366821289, "loss": 1.3968, "rewards/accuracies": 0.421875, "rewards/chosen": -0.002112824469804764, "rewards/margins": -0.010171930305659771, "rewards/rejected": 0.008059106767177582, "step": 8 }, { "epoch": 0.013215859030837005, "grad_norm": 35.72300338745117, "kl/avg_steps": 0.15625, "kl/beta": 0.09992270916700363, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.9759007692337036, "logits/rejected": -0.3913915753364563, "logps/chosen": -103.79417419433594, "logps/ref_chosen": -103.85713195800781, "logps/ref_rejected": -104.31932067871094, "logps/rejected": -104.34971618652344, "loss": 1.378, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006141543388366699, "rewards/margins": 0.008944813162088394, "rewards/rejected": -0.002803270472213626, "step": 9 }, { "epoch": 0.014684287812041116, "grad_norm": 33.81359100341797, "kl/avg_steps": -0.25, "kl/beta": 0.09976682811975479, "kl/n_epsilon_steps": 0.625, "kl/p_epsilon_steps": 0.375, "learning_rate": 6.521739130434782e-08, "logits/chosen": -1.2568838596343994, "logits/rejected": -0.34031057357788086, "logps/chosen": -76.20588684082031, "logps/ref_chosen": -76.20436096191406, "logps/ref_rejected": -87.15210723876953, "logps/rejected": -87.01283264160156, "loss": 1.401, "rewards/accuracies": 0.359375, "rewards/chosen": -0.00028783950256183743, "rewards/margins": -0.014337141066789627, "rewards/rejected": 0.014049299992620945, "step": 10 }, { "epoch": 0.016152716593245228, "grad_norm": 37.09693908691406, "kl/avg_steps": -0.015625, "kl/beta": 0.10001686960458755, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.484375, "learning_rate": 7.246376811594203e-08, "logits/chosen": -1.4052127599716187, "logits/rejected": -0.392024964094162, "logps/chosen": -82.30293273925781, "logps/ref_chosen": -82.36649322509766, "logps/ref_rejected": -94.26461791992188, "logps/rejected": -94.2509536743164, "loss": 1.3819, "rewards/accuracies": 0.515625, "rewards/chosen": 0.006203308701515198, "rewards/margins": 0.004709784872829914, "rewards/rejected": 0.0014935237122699618, "step": 11 }, { "epoch": 0.01762114537444934, "grad_norm": 39.88624572753906, "kl/avg_steps": 0.125, "kl/beta": 0.1000325009226799, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.7849316596984863, "logits/rejected": -0.267448753118515, "logps/chosen": -99.06604766845703, "logps/ref_chosen": -99.10549926757812, "logps/ref_rejected": -110.27140808105469, "logps/rejected": -110.31909942626953, "loss": 1.3785, "rewards/accuracies": 0.578125, "rewards/chosen": 0.003762049600481987, "rewards/margins": 0.008365876972675323, "rewards/rejected": -0.0046038273721933365, "step": 12 }, { "epoch": 0.01908957415565345, "grad_norm": 41.430633544921875, "kl/avg_steps": 0.15625, "kl/beta": 0.09990761429071426, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 8.695652173913042e-08, "logits/chosen": -1.5717546939849854, "logits/rejected": -0.7266464829444885, "logps/chosen": -90.52992248535156, "logps/ref_chosen": -90.55973052978516, "logps/ref_rejected": -93.69110107421875, "logps/rejected": -93.72262573242188, "loss": 1.3812, "rewards/accuracies": 0.578125, "rewards/chosen": 0.002787390723824501, "rewards/margins": 0.00573158822953701, "rewards/rejected": -0.0029441972728818655, "step": 13 }, { "epoch": 0.020558002936857563, "grad_norm": 35.816402435302734, "kl/avg_steps": 0.03125, "kl/beta": 0.09975175559520721, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.5540125370025635, "logits/rejected": -0.28503644466400146, "logps/chosen": -99.85889434814453, "logps/ref_chosen": -99.82717895507812, "logps/ref_rejected": -108.94200134277344, "logps/rejected": -108.9466552734375, "loss": 1.3898, "rewards/accuracies": 0.515625, "rewards/chosen": -0.003366068471223116, "rewards/margins": -0.0030267564579844475, "rewards/rejected": -0.00033931387588381767, "step": 14 }, { "epoch": 0.022026431718061675, "grad_norm": 31.496644973754883, "kl/avg_steps": -0.1875, "kl/beta": 0.09972058981657028, "kl/n_epsilon_steps": 0.59375, "kl/p_epsilon_steps": 0.40625, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.8750624060630798, "logits/rejected": -0.2879735827445984, "logps/chosen": -78.86597442626953, "logps/ref_chosen": -78.90997314453125, "logps/ref_rejected": -90.06234741210938, "logps/rejected": -90.09466552734375, "loss": 1.3795, "rewards/accuracies": 0.4375, "rewards/chosen": 0.004188378341495991, "rewards/margins": 0.007276091258972883, "rewards/rejected": -0.0030877136159688234, "step": 15 }, { "epoch": 0.023494860499265784, "grad_norm": 34.249393463134766, "kl/avg_steps": 0.15625, "kl/beta": 0.09990791976451874, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.9704724550247192, "logits/rejected": -0.0901266559958458, "logps/chosen": -97.3776626586914, "logps/ref_chosen": -97.42327880859375, "logps/ref_rejected": -90.59945678710938, "logps/rejected": -90.60440826416016, "loss": 1.3824, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0043681650422513485, "rewards/margins": 0.004663495346903801, "rewards/rejected": -0.0002953286748379469, "step": 16 }, { "epoch": 0.024963289280469897, "grad_norm": 35.82853698730469, "kl/avg_steps": 0.03125, "kl/beta": 0.09975205361843109, "kl/n_epsilon_steps": 0.484375, "kl/p_epsilon_steps": 0.515625, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -1.0136702060699463, "logits/rejected": -0.5759009122848511, "logps/chosen": -104.36908721923828, "logps/ref_chosen": -104.36431121826172, "logps/ref_rejected": -90.46772766113281, "logps/rejected": -90.47051239013672, "loss": 1.3875, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0006631199503317475, "rewards/margins": -0.0005828813882544637, "rewards/rejected": -8.023856207728386e-05, "step": 17 }, { "epoch": 0.02643171806167401, "grad_norm": 41.663047790527344, "kl/avg_steps": 0.0625, "kl/beta": 0.09972089529037476, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -1.795450210571289, "logits/rejected": -0.8455245494842529, "logps/chosen": -87.06967163085938, "logps/ref_chosen": -87.09195709228516, "logps/ref_rejected": -81.85072326660156, "logps/rejected": -81.87223052978516, "loss": 1.383, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0020006708800792694, "rewards/margins": 0.003985004499554634, "rewards/rejected": -0.001984333386644721, "step": 18 }, { "epoch": 0.027900146842878122, "grad_norm": 31.180570602416992, "kl/avg_steps": 0.0625, "kl/beta": 0.09965860843658447, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -1.0867879390716553, "logits/rejected": -0.03352098539471626, "logps/chosen": -105.85330963134766, "logps/ref_chosen": -105.87354278564453, "logps/ref_rejected": -96.93023681640625, "logps/rejected": -96.95452880859375, "loss": 1.3826, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0018369832541793585, "rewards/margins": 0.004121738485991955, "rewards/rejected": -0.0022847556974738836, "step": 19 }, { "epoch": 0.02936857562408223, "grad_norm": 32.30035400390625, "kl/avg_steps": 0.09375, "kl/beta": 0.09959635883569717, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -1.2128328084945679, "logits/rejected": -0.32826870679855347, "logps/chosen": -90.72392272949219, "logps/ref_chosen": -90.75811767578125, "logps/ref_rejected": -85.91232299804688, "logps/rejected": -85.96060180664062, "loss": 1.3791, "rewards/accuracies": 0.546875, "rewards/chosen": 0.003147183684632182, "rewards/margins": 0.007826998829841614, "rewards/rejected": -0.004679815378040075, "step": 20 }, { "epoch": 0.030837004405286344, "grad_norm": 31.352293014526367, "kl/avg_steps": 0.125, "kl/beta": 0.09950307011604309, "kl/n_epsilon_steps": 0.4375, "kl/p_epsilon_steps": 0.5625, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.9726300239562988, "logits/rejected": -0.411948561668396, "logps/chosen": -80.2506332397461, "logps/ref_chosen": -80.33346557617188, "logps/ref_rejected": -83.9337387084961, "logps/rejected": -83.91175842285156, "loss": 1.3808, "rewards/accuracies": 0.59375, "rewards/chosen": 0.008116335608065128, "rewards/margins": 0.005770478397607803, "rewards/rejected": 0.002345857210457325, "step": 21 }, { "epoch": 0.032305433186490456, "grad_norm": 42.61378860473633, "kl/avg_steps": 0.09375, "kl/beta": 0.09937884658575058, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.9675413370132446, "logits/rejected": -0.6590346693992615, "logps/chosen": -95.35507202148438, "logps/ref_chosen": -95.39530181884766, "logps/ref_rejected": -103.47351837158203, "logps/rejected": -103.48601531982422, "loss": 1.3819, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0037977853789925575, "rewards/margins": 0.004873580764979124, "rewards/rejected": -0.0010757955024018884, "step": 22 }, { "epoch": 0.033773861967694566, "grad_norm": 31.29548454284668, "kl/avg_steps": -0.09375, "kl/beta": 0.09928576648235321, "kl/n_epsilon_steps": 0.546875, "kl/p_epsilon_steps": 0.453125, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -1.156263828277588, "logits/rejected": -0.3607323169708252, "logps/chosen": -90.63298034667969, "logps/ref_chosen": -90.63751220703125, "logps/ref_rejected": -86.59425354003906, "logps/rejected": -86.54367065429688, "loss": 1.3916, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00023264711489900947, "rewards/margins": -0.004911544732749462, "rewards/rejected": 0.005144191440194845, "step": 23 }, { "epoch": 0.03524229074889868, "grad_norm": 44.2933464050293, "kl/avg_steps": -0.03125, "kl/beta": 0.09937893599271774, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.8095067143440247, "logits/rejected": -0.5244461297988892, "logps/chosen": -69.98039245605469, "logps/ref_chosen": -69.91728973388672, "logps/ref_rejected": -106.60990142822266, "logps/rejected": -106.59368133544922, "loss": 1.395, "rewards/accuracies": 0.453125, "rewards/chosen": -0.006438862532377243, "rewards/margins": -0.008207438513636589, "rewards/rejected": 0.0017685755155980587, "step": 24 }, { "epoch": 0.03671071953010279, "grad_norm": 36.52031326293945, "kl/avg_steps": 0.25, "kl/beta": 0.09941000491380692, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -1.2055686712265015, "logits/rejected": -0.4221525490283966, "logps/chosen": -80.82945251464844, "logps/ref_chosen": -80.82548522949219, "logps/ref_rejected": -95.95710754394531, "logps/rejected": -96.06101989746094, "loss": 1.3773, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0005520773120224476, "rewards/margins": 0.009578470140695572, "rewards/rejected": -0.010130547918379307, "step": 25 }, { "epoch": 0.0381791483113069, "grad_norm": 40.3887939453125, "kl/avg_steps": 0.15625, "kl/beta": 0.09916209429502487, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -1.4493508338928223, "logits/rejected": -0.5392433404922485, "logps/chosen": -88.89604949951172, "logps/ref_chosen": -88.90116882324219, "logps/ref_rejected": -109.82818603515625, "logps/rejected": -109.86863708496094, "loss": 1.3825, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00035169871989637613, "rewards/margins": 0.004207686521112919, "rewards/rejected": -0.0038559872191399336, "step": 26 }, { "epoch": 0.039647577092511016, "grad_norm": 43.48580551147461, "kl/avg_steps": 0.28125, "kl/beta": 0.09900739789009094, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -1.6640090942382812, "logits/rejected": -0.6294593811035156, "logps/chosen": -77.5752182006836, "logps/ref_chosen": -77.59600830078125, "logps/ref_rejected": -103.93850708007812, "logps/rejected": -104.07573699951172, "loss": 1.3714, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0019225336145609617, "rewards/margins": 0.01532358955591917, "rewards/rejected": -0.013401055708527565, "step": 27 }, { "epoch": 0.041116005873715125, "grad_norm": 35.73523712158203, "kl/avg_steps": 0.4375, "kl/beta": 0.09872972220182419, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.956201434135437, "logits/rejected": -0.39606085419654846, "logps/chosen": -102.17597961425781, "logps/ref_chosen": -102.22856140136719, "logps/ref_rejected": -96.9594955444336, "logps/rejected": -97.06103515625, "loss": 1.3723, "rewards/accuracies": 0.71875, "rewards/chosen": 0.004990905057638884, "rewards/margins": 0.014799000695347786, "rewards/rejected": -0.00980809610337019, "step": 28 }, { "epoch": 0.042584434654919234, "grad_norm": 41.57979965209961, "kl/avg_steps": 0.15625, "kl/beta": 0.09829965978860855, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.578125, "learning_rate": 2.028985507246377e-07, "logits/chosen": -1.4259010553359985, "logits/rejected": -0.6437522172927856, "logps/chosen": -88.62876892089844, "logps/ref_chosen": -88.64704895019531, "logps/ref_rejected": -102.96011352539062, "logps/rejected": -103.11316680908203, "loss": 1.3704, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0016324977623298764, "rewards/margins": 0.01648685149848461, "rewards/rejected": -0.014854353852570057, "step": 29 }, { "epoch": 0.04405286343612335, "grad_norm": 38.55412292480469, "kl/avg_steps": 0.0625, "kl/beta": 0.09814630448818207, "kl/n_epsilon_steps": 0.46875, "kl/p_epsilon_steps": 0.53125, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.9132494926452637, "logits/rejected": -0.49612629413604736, "logps/chosen": -88.3867416381836, "logps/ref_chosen": -88.38838958740234, "logps/ref_rejected": -102.31889343261719, "logps/rejected": -102.37272644042969, "loss": 1.3817, "rewards/accuracies": 0.578125, "rewards/chosen": 6.297486834228039e-06, "rewards/margins": 0.0051061781123280525, "rewards/rejected": -0.005099880509078503, "step": 30 }, { "epoch": 0.04552129221732746, "grad_norm": 30.63753890991211, "kl/avg_steps": 0.09375, "kl/beta": 0.09808500111103058, "kl/n_epsilon_steps": 0.453125, "kl/p_epsilon_steps": 0.546875, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.8628894090652466, "logits/rejected": -0.3569292426109314, "logps/chosen": -101.13359832763672, "logps/ref_chosen": -101.12565612792969, "logps/ref_rejected": -79.85842895507812, "logps/rejected": -79.95623779296875, "loss": 1.3782, "rewards/accuracies": 0.578125, "rewards/chosen": -0.0009215597528964281, "rewards/margins": 0.008493051864206791, "rewards/rejected": -0.009414611384272575, "step": 31 }, { "epoch": 0.04698972099853157, "grad_norm": 34.43489456176758, "kl/avg_steps": 0.0, "kl/beta": 0.09799313545227051, "kl/n_epsilon_steps": 0.5, "kl/p_epsilon_steps": 0.5, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.9892777800559998, "logits/rejected": -0.34080418944358826, "logps/chosen": -96.68499755859375, "logps/ref_chosen": -96.61703491210938, "logps/ref_rejected": -96.224365234375, "logps/rejected": -96.38025665283203, "loss": 1.3789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.006817132234573364, "rewards/margins": 0.008191294968128204, "rewards/rejected": -0.015008427202701569, "step": 32 }, { "epoch": 0.048458149779735685, "grad_norm": 32.965362548828125, "kl/avg_steps": 0.140625, "kl/beta": 0.09799313545227051, "kl/n_epsilon_steps": 0.421875, "kl/p_epsilon_steps": 0.5625, "learning_rate": 2.318840579710145e-07, "logits/chosen": -1.2965284585952759, "logits/rejected": -0.3348070979118347, "logps/chosen": -81.51568603515625, "logps/ref_chosen": -81.5210189819336, "logps/ref_rejected": -93.80595397949219, "logps/rejected": -93.91485595703125, "loss": 1.376, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0003811400383710861, "rewards/margins": 0.010841380804777145, "rewards/rejected": -0.01046024076640606, "step": 33 }, { "epoch": 0.049926578560939794, "grad_norm": 39.99357604980469, "kl/avg_steps": 0.234375, "kl/beta": 0.09785552322864532, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.609375, "learning_rate": 2.391304347826087e-07, "logits/chosen": -1.2087818384170532, "logits/rejected": -0.19416889548301697, "logps/chosen": -77.15280151367188, "logps/ref_chosen": -77.20204162597656, "logps/ref_rejected": -106.71875762939453, "logps/rejected": -106.84297943115234, "loss": 1.3705, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004645414184778929, "rewards/margins": 0.016573915258049965, "rewards/rejected": -0.011928501538932323, "step": 34 }, { "epoch": 0.0513950073421439, "grad_norm": 41.544822692871094, "kl/avg_steps": 0.34375, "kl/beta": 0.0976267158985138, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.463768115942029e-07, "logits/chosen": -1.575798749923706, "logits/rejected": -0.5509282350540161, "logps/chosen": -77.5616683959961, "logps/ref_chosen": -77.57035827636719, "logps/ref_rejected": -112.18855285644531, "logps/rejected": -112.379638671875, "loss": 1.3678, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0007331545930355787, "rewards/margins": 0.019134394824504852, "rewards/rejected": -0.018401240929961205, "step": 35 }, { "epoch": 0.05286343612334802, "grad_norm": 31.387723922729492, "kl/avg_steps": 0.28125, "kl/beta": 0.09729227423667908, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 2.536231884057971e-07, "logits/chosen": -1.432613730430603, "logits/rejected": -0.642814040184021, "logps/chosen": -83.33061218261719, "logps/ref_chosen": -83.28824615478516, "logps/ref_rejected": -91.75741577148438, "logps/rejected": -92.01959228515625, "loss": 1.3662, "rewards/accuracies": 0.625, "rewards/chosen": -0.0043023210018873215, "rewards/margins": 0.02096126601099968, "rewards/rejected": -0.025263587012887, "step": 36 }, { "epoch": 0.05433186490455213, "grad_norm": 34.76736831665039, "kl/avg_steps": -0.03125, "kl/beta": 0.09701940417289734, "kl/n_epsilon_steps": 0.515625, "kl/p_epsilon_steps": 0.484375, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.9882210493087769, "logits/rejected": -0.7192566990852356, "logps/chosen": -94.82408142089844, "logps/ref_chosen": -94.77108764648438, "logps/ref_rejected": -85.7172622680664, "logps/rejected": -85.97396087646484, "loss": 1.368, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005295893643051386, "rewards/margins": 0.01935383304953575, "rewards/rejected": -0.0246497243642807, "step": 37 }, { "epoch": 0.055800293685756244, "grad_norm": 35.520565032958984, "kl/avg_steps": 0.3125, "kl/beta": 0.09704973548650742, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.9708235263824463, "logits/rejected": -0.36027270555496216, "logps/chosen": -75.92189025878906, "logps/ref_chosen": -75.97850799560547, "logps/ref_rejected": -104.10401916503906, "logps/rejected": -104.33356475830078, "loss": 1.3601, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0053534312173724174, "rewards/margins": 0.027343016117811203, "rewards/rejected": -0.02198958396911621, "step": 38 }, { "epoch": 0.05726872246696035, "grad_norm": 34.98324966430664, "kl/avg_steps": 0.46875, "kl/beta": 0.09674739837646484, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.753623188405797e-07, "logits/chosen": -1.2482174634933472, "logits/rejected": -0.5844467878341675, "logps/chosen": -81.19099426269531, "logps/ref_chosen": -81.18577575683594, "logps/ref_rejected": -84.1959228515625, "logps/rejected": -84.48798370361328, "loss": 1.3596, "rewards/accuracies": 0.765625, "rewards/chosen": -0.0006124734645709395, "rewards/margins": 0.02736220322549343, "rewards/rejected": -0.027974674478173256, "step": 39 }, { "epoch": 0.05873715124816446, "grad_norm": 27.852684020996094, "kl/avg_steps": 0.40625, "kl/beta": 0.09629600495100021, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.8650610446929932, "logits/rejected": -0.20084291696548462, "logps/chosen": -83.25595092773438, "logps/ref_chosen": -83.33256530761719, "logps/ref_rejected": -80.25591278076172, "logps/rejected": -80.43058776855469, "loss": 1.363, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007215453311800957, "rewards/margins": 0.023813467472791672, "rewards/rejected": -0.016598014160990715, "step": 40 }, { "epoch": 0.06020558002936858, "grad_norm": 31.773061752319336, "kl/avg_steps": 0.34375, "kl/beta": 0.09590639173984528, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 2.898550724637681e-07, "logits/chosen": -1.026604413986206, "logits/rejected": -0.37627846002578735, "logps/chosen": -93.19059753417969, "logps/ref_chosen": -93.14866638183594, "logps/ref_rejected": -102.07920837402344, "logps/rejected": -102.35368347167969, "loss": 1.3656, "rewards/accuracies": 0.671875, "rewards/chosen": -0.004216345027089119, "rewards/margins": 0.021797288209199905, "rewards/rejected": -0.026013631373643875, "step": 41 }, { "epoch": 0.06167400881057269, "grad_norm": 38.71194839477539, "kl/avg_steps": 0.40625, "kl/beta": 0.09557784348726273, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.971014492753623e-07, "logits/chosen": -1.2612457275390625, "logits/rejected": -0.5760804414749146, "logps/chosen": -90.67225646972656, "logps/ref_chosen": -90.70162200927734, "logps/ref_rejected": -113.8646469116211, "logps/rejected": -114.30716705322266, "loss": 1.3432, "rewards/accuracies": 0.734375, "rewards/chosen": 0.002619321458041668, "rewards/margins": 0.044551268219947815, "rewards/rejected": -0.041931942105293274, "step": 42 }, { "epoch": 0.0631424375917768, "grad_norm": 35.74715805053711, "kl/avg_steps": 0.4375, "kl/beta": 0.09519112855195999, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.043478260869565e-07, "logits/chosen": -1.2501815557479858, "logits/rejected": -0.3675554692745209, "logps/chosen": -89.61614990234375, "logps/ref_chosen": -89.64402770996094, "logps/ref_rejected": -103.95185852050781, "logps/rejected": -104.35449981689453, "loss": 1.3476, "rewards/accuracies": 0.703125, "rewards/chosen": 0.002490551210939884, "rewards/margins": 0.0404253825545311, "rewards/rejected": -0.03793483227491379, "step": 43 }, { "epoch": 0.06461086637298091, "grad_norm": 32.69614028930664, "kl/avg_steps": 0.46875, "kl/beta": 0.09477648138999939, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.115942028985507e-07, "logits/chosen": -1.5350993871688843, "logits/rejected": -0.39478617906570435, "logps/chosen": -81.98731994628906, "logps/ref_chosen": -81.8783187866211, "logps/ref_rejected": -113.421630859375, "logps/rejected": -113.86365509033203, "loss": 1.3562, "rewards/accuracies": 0.734375, "rewards/chosen": -0.010416124947369099, "rewards/margins": 0.03108786605298519, "rewards/rejected": -0.041503991931676865, "step": 44 }, { "epoch": 0.06607929515418502, "grad_norm": 22.90825080871582, "kl/avg_steps": 0.40625, "kl/beta": 0.09433428943157196, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.9012535810470581, "logits/rejected": -0.24215500056743622, "logps/chosen": -77.44337463378906, "logps/ref_chosen": -77.34459686279297, "logps/ref_rejected": -84.24774169921875, "logps/rejected": -84.58972930908203, "loss": 1.3645, "rewards/accuracies": 0.75, "rewards/chosen": -0.009415511973202229, "rewards/margins": 0.022572031244635582, "rewards/rejected": -0.031987544149160385, "step": 45 }, { "epoch": 0.06754772393538913, "grad_norm": 31.163087844848633, "kl/avg_steps": 0.546875, "kl/beta": 0.09395260363817215, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.260869565217391e-07, "logits/chosen": -1.383570909500122, "logits/rejected": -0.5739269256591797, "logps/chosen": -90.39727020263672, "logps/ref_chosen": -90.3408203125, "logps/ref_rejected": -100.676513671875, "logps/rejected": -101.10578918457031, "loss": 1.3528, "rewards/accuracies": 0.75, "rewards/chosen": -0.005449555814266205, "rewards/margins": 0.03452453762292862, "rewards/rejected": -0.039974093437194824, "step": 46 }, { "epoch": 0.06901615271659324, "grad_norm": 31.565263748168945, "kl/avg_steps": 0.375, "kl/beta": 0.09344159811735153, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.9797345995903015, "logits/rejected": -0.36259299516677856, "logps/chosen": -104.41678619384766, "logps/ref_chosen": -104.41130065917969, "logps/ref_rejected": -101.25489807128906, "logps/rejected": -101.67523193359375, "loss": 1.3497, "rewards/accuracies": 0.671875, "rewards/chosen": -0.0006828177720308304, "rewards/margins": 0.03818219527602196, "rewards/rejected": -0.03886501491069794, "step": 47 }, { "epoch": 0.07048458149779736, "grad_norm": 35.98372268676758, "kl/avg_steps": 0.53125, "kl/beta": 0.09309250116348267, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -1.2902014255523682, "logits/rejected": -0.7823787331581116, "logps/chosen": -91.1424560546875, "logps/ref_chosen": -91.10027313232422, "logps/ref_rejected": -95.08057403564453, "logps/rejected": -95.68135070800781, "loss": 1.3375, "rewards/accuracies": 0.796875, "rewards/chosen": -0.004128246568143368, "rewards/margins": 0.05126403272151947, "rewards/rejected": -0.05539228022098541, "step": 48 }, { "epoch": 0.07195301027900147, "grad_norm": 36.75701904296875, "kl/avg_steps": 0.46875, "kl/beta": 0.09260056167840958, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.478260869565217e-07, "logits/chosen": -2.277641773223877, "logits/rejected": -0.8717272281646729, "logps/chosen": -92.91861724853516, "logps/ref_chosen": -93.00367736816406, "logps/ref_rejected": -91.74899291992188, "logps/rejected": -92.42121887207031, "loss": 1.3199, "rewards/accuracies": 0.75, "rewards/chosen": 0.007629199419170618, "rewards/margins": 0.06930971145629883, "rewards/rejected": -0.06168051436543465, "step": 49 }, { "epoch": 0.07342143906020558, "grad_norm": 30.533233642578125, "kl/avg_steps": 0.59375, "kl/beta": 0.09216851741075516, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -1.142214059829712, "logits/rejected": -0.34731245040893555, "logps/chosen": -94.70893859863281, "logps/ref_chosen": -94.62681579589844, "logps/ref_rejected": -103.57435607910156, "logps/rejected": -104.34310913085938, "loss": 1.3264, "rewards/accuracies": 0.796875, "rewards/chosen": -0.007727333344519138, "rewards/margins": 0.0625331848859787, "rewards/rejected": -0.07026051729917526, "step": 50 }, { "epoch": 0.07488986784140969, "grad_norm": 27.474159240722656, "kl/avg_steps": 0.53125, "kl/beta": 0.09162449836730957, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -1.5295517444610596, "logits/rejected": -1.121992826461792, "logps/chosen": -87.59027099609375, "logps/ref_chosen": -87.50727844238281, "logps/ref_rejected": -83.47235870361328, "logps/rejected": -84.15083312988281, "loss": 1.3355, "rewards/accuracies": 0.734375, "rewards/chosen": -0.007823294959962368, "rewards/margins": 0.053790926933288574, "rewards/rejected": -0.06161422282457352, "step": 51 }, { "epoch": 0.0763582966226138, "grad_norm": 29.224390029907227, "kl/avg_steps": 0.625, "kl/beta": 0.09114031493663788, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.695652173913043e-07, "logits/chosen": -1.5085875988006592, "logits/rejected": -0.8081971406936646, "logps/chosen": -90.67510986328125, "logps/ref_chosen": -90.63026428222656, "logps/ref_rejected": -87.0390625, "logps/rejected": -87.94956970214844, "loss": 1.3116, "rewards/accuracies": 0.8125, "rewards/chosen": -0.004274226725101471, "rewards/margins": 0.07802098244428635, "rewards/rejected": -0.08229520916938782, "step": 52 }, { "epoch": 0.07782672540381791, "grad_norm": 35.840335845947266, "kl/avg_steps": 0.59375, "kl/beta": 0.0905742272734642, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -1.548392415046692, "logits/rejected": -1.1408261060714722, "logps/chosen": -81.63391876220703, "logps/ref_chosen": -81.58306884765625, "logps/ref_rejected": -95.66152954101562, "logps/rejected": -96.76312255859375, "loss": 1.2987, "rewards/accuracies": 0.8125, "rewards/chosen": -0.004804985597729683, "rewards/margins": 0.09401369839906693, "rewards/rejected": -0.09881868213415146, "step": 53 }, { "epoch": 0.07929515418502203, "grad_norm": 35.994388580322266, "kl/avg_steps": 0.5, "kl/beta": 0.09003961831331253, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -1.3138582706451416, "logits/rejected": -0.6569335460662842, "logps/chosen": -88.93460083007812, "logps/ref_chosen": -88.91016387939453, "logps/ref_rejected": -99.1175537109375, "logps/rejected": -100.24466705322266, "loss": 1.2939, "rewards/accuracies": 0.765625, "rewards/chosen": -0.002394177485257387, "rewards/margins": 0.09816494584083557, "rewards/rejected": -0.10055913031101227, "step": 54 }, { "epoch": 0.08076358296622614, "grad_norm": 31.913665771484375, "kl/avg_steps": 0.46875, "kl/beta": 0.08959165960550308, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -1.6863343715667725, "logits/rejected": -0.6388131976127625, "logps/chosen": -92.33493041992188, "logps/ref_chosen": -92.45592498779297, "logps/ref_rejected": -92.97093963623047, "logps/rejected": -93.97943115234375, "loss": 1.2933, "rewards/accuracies": 0.765625, "rewards/chosen": 0.010466434992849827, "rewards/margins": 0.10004328191280365, "rewards/rejected": -0.0895768478512764, "step": 55 }, { "epoch": 0.08223201174743025, "grad_norm": 29.783920288085938, "kl/avg_steps": 0.5625, "kl/beta": 0.08917365968227386, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -1.9710514545440674, "logits/rejected": -0.6645182371139526, "logps/chosen": -87.32936096191406, "logps/ref_chosen": -87.23665618896484, "logps/ref_rejected": -100.64553833007812, "logps/rejected": -101.76547241210938, "loss": 1.3033, "rewards/accuracies": 0.8125, "rewards/chosen": -0.008556234650313854, "rewards/margins": 0.09043145924806595, "rewards/rejected": -0.09898769855499268, "step": 56 }, { "epoch": 0.08370044052863436, "grad_norm": 30.23967933654785, "kl/avg_steps": 0.5, "kl/beta": 0.08867485821247101, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.057971014492754e-07, "logits/chosen": -1.7121399641036987, "logits/rejected": -0.9029750227928162, "logps/chosen": -98.1844253540039, "logps/ref_chosen": -98.15074157714844, "logps/ref_rejected": -101.05284118652344, "logps/rejected": -102.23455047607422, "loss": 1.2927, "rewards/accuracies": 0.765625, "rewards/chosen": -0.003269542008638382, "rewards/margins": 0.10057489573955536, "rewards/rejected": -0.10384444147348404, "step": 57 }, { "epoch": 0.08516886930983847, "grad_norm": 33.85087966918945, "kl/avg_steps": 0.65625, "kl/beta": 0.08823369443416595, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -1.996552586555481, "logits/rejected": -1.058452844619751, "logps/chosen": -99.30267333984375, "logps/ref_chosen": -99.58097076416016, "logps/ref_rejected": -91.22227478027344, "logps/rejected": -92.50682067871094, "loss": 1.2623, "rewards/accuracies": 0.90625, "rewards/chosen": 0.02414235845208168, "rewards/margins": 0.13646608591079712, "rewards/rejected": -0.11232372373342514, "step": 58 }, { "epoch": 0.08663729809104258, "grad_norm": 30.440584182739258, "kl/avg_steps": 0.625, "kl/beta": 0.08765843510627747, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -1.5328807830810547, "logits/rejected": -0.9605180621147156, "logps/chosen": -89.82688903808594, "logps/ref_chosen": -89.80232238769531, "logps/ref_rejected": -94.27667236328125, "logps/rejected": -95.44361877441406, "loss": 1.2934, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0023305192589759827, "rewards/margins": 0.09902888536453247, "rewards/rejected": -0.10135940462350845, "step": 59 }, { "epoch": 0.0881057268722467, "grad_norm": 25.22243309020996, "kl/avg_steps": 0.40625, "kl/beta": 0.08711396902799606, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -1.692917823791504, "logits/rejected": -0.8258851170539856, "logps/chosen": -95.11837768554688, "logps/ref_chosen": -95.15571594238281, "logps/ref_rejected": -91.3724365234375, "logps/rejected": -92.52310180664062, "loss": 1.2904, "rewards/accuracies": 0.765625, "rewards/chosen": 0.002980598248541355, "rewards/margins": 0.10234124958515167, "rewards/rejected": -0.0993606448173523, "step": 60 }, { "epoch": 0.08957415565345081, "grad_norm": 25.07730484008789, "kl/avg_steps": 0.53125, "kl/beta": 0.08676150441169739, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -1.4720783233642578, "logits/rejected": -0.7469910979270935, "logps/chosen": -85.89845275878906, "logps/ref_chosen": -85.73231506347656, "logps/ref_rejected": -97.96575927734375, "logps/rejected": -99.31712341308594, "loss": 1.2933, "rewards/accuracies": 0.796875, "rewards/chosen": -0.014695134945213795, "rewards/margins": 0.101577028632164, "rewards/rejected": -0.11627216637134552, "step": 61 }, { "epoch": 0.09104258443465492, "grad_norm": 24.972980499267578, "kl/avg_steps": 0.65625, "kl/beta": 0.08630301803350449, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.420289855072464e-07, "logits/chosen": -1.7914103269577026, "logits/rejected": -0.8692626953125, "logps/chosen": -81.72990417480469, "logps/ref_chosen": -81.63538360595703, "logps/ref_rejected": -84.03831481933594, "logps/rejected": -85.38259887695312, "loss": 1.2896, "rewards/accuracies": 0.828125, "rewards/chosen": -0.008462773635983467, "rewards/margins": 0.10654792189598083, "rewards/rejected": -0.11501070111989975, "step": 62 }, { "epoch": 0.09251101321585903, "grad_norm": 30.612600326538086, "kl/avg_steps": 0.625, "kl/beta": 0.0857403501868248, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -1.5954476594924927, "logits/rejected": -0.8784996271133423, "logps/chosen": -103.65047454833984, "logps/ref_chosen": -103.62405395507812, "logps/ref_rejected": -103.40303039550781, "logps/rejected": -104.9129638671875, "loss": 1.2686, "rewards/accuracies": 0.8125, "rewards/chosen": -0.002559835556894541, "rewards/margins": 0.12582427263259888, "rewards/rejected": -0.12838411331176758, "step": 63 }, { "epoch": 0.09397944199706314, "grad_norm": 30.650426864624023, "kl/avg_steps": 0.5, "kl/beta": 0.08520779758691788, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -1.671600580215454, "logits/rejected": -0.9036776423454285, "logps/chosen": -87.17109680175781, "logps/ref_chosen": -87.0015869140625, "logps/ref_rejected": -100.5854721069336, "logps/rejected": -102.39601135253906, "loss": 1.2612, "rewards/accuracies": 0.75, "rewards/chosen": -0.014812503941357136, "rewards/margins": 0.1382252722978592, "rewards/rejected": -0.15303777158260345, "step": 64 }, { "epoch": 0.09544787077826726, "grad_norm": 33.42892837524414, "kl/avg_steps": 0.75, "kl/beta": 0.0847838819026947, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.63768115942029e-07, "logits/chosen": -1.976373314857483, "logits/rejected": -1.0799182653427124, "logps/chosen": -91.29652404785156, "logps/ref_chosen": -91.22191619873047, "logps/ref_rejected": -115.33553314208984, "logps/rejected": -117.1944351196289, "loss": 1.2492, "rewards/accuracies": 0.875, "rewards/chosen": -0.006513871252536774, "rewards/margins": 0.14960609376430511, "rewards/rejected": -0.15611997246742249, "step": 65 }, { "epoch": 0.09691629955947137, "grad_norm": 22.01787567138672, "kl/avg_steps": 0.5, "kl/beta": 0.08415273576974869, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -2.0008394718170166, "logits/rejected": -1.0472722053527832, "logps/chosen": -84.10005187988281, "logps/ref_chosen": -83.78422546386719, "logps/ref_rejected": -82.7520980834961, "logps/rejected": -84.29136657714844, "loss": 1.2937, "rewards/accuracies": 0.796875, "rewards/chosen": -0.026944037526845932, "rewards/margins": 0.10165668278932571, "rewards/rejected": -0.12860071659088135, "step": 66 }, { "epoch": 0.09838472834067548, "grad_norm": 20.735403060913086, "kl/avg_steps": 0.34375, "kl/beta": 0.08373406529426575, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.782608695652174e-07, "logits/chosen": -2.0153298377990723, "logits/rejected": -1.0666594505310059, "logps/chosen": -88.51578521728516, "logps/ref_chosen": -87.67295837402344, "logps/ref_rejected": -79.08674621582031, "logps/rejected": -81.0645751953125, "loss": 1.3088, "rewards/accuracies": 0.671875, "rewards/chosen": -0.07106294482946396, "rewards/margins": 0.09327295422554016, "rewards/rejected": -0.16433589160442352, "step": 67 }, { "epoch": 0.09985315712187959, "grad_norm": 24.614826202392578, "kl/avg_steps": 0.5625, "kl/beta": 0.08344721049070358, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.855072463768116e-07, "logits/chosen": -1.7102564573287964, "logits/rejected": -1.115820288658142, "logps/chosen": -97.353759765625, "logps/ref_chosen": -97.07884216308594, "logps/ref_rejected": -84.11872863769531, "logps/rejected": -86.50021362304688, "loss": 1.2341, "rewards/accuracies": 0.796875, "rewards/chosen": -0.02329547144472599, "rewards/margins": 0.1738092005252838, "rewards/rejected": -0.19710469245910645, "step": 68 }, { "epoch": 0.1013215859030837, "grad_norm": 26.68158531188965, "kl/avg_steps": 0.5625, "kl/beta": 0.0829804465174675, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.927536231884058e-07, "logits/chosen": -2.055922269821167, "logits/rejected": -1.1354026794433594, "logps/chosen": -86.03010559082031, "logps/ref_chosen": -85.71971130371094, "logps/ref_rejected": -109.4802017211914, "logps/rejected": -111.91256713867188, "loss": 1.2348, "rewards/accuracies": 0.84375, "rewards/chosen": -0.025998366996645927, "rewards/margins": 0.1740657240152359, "rewards/rejected": -0.20006409287452698, "step": 69 }, { "epoch": 0.1027900146842878, "grad_norm": 25.407556533813477, "kl/avg_steps": 0.53125, "kl/beta": 0.08251629024744034, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5e-07, "logits/chosen": -1.9641715288162231, "logits/rejected": -1.39949631690979, "logps/chosen": -95.62403869628906, "logps/ref_chosen": -95.00994873046875, "logps/ref_rejected": -96.21272277832031, "logps/rejected": -99.49327087402344, "loss": 1.201, "rewards/accuracies": 0.796875, "rewards/chosen": -0.05099921301007271, "rewards/margins": 0.21753652393817902, "rewards/rejected": -0.2685357332229614, "step": 70 }, { "epoch": 0.10425844346549193, "grad_norm": 30.39597511291504, "kl/avg_steps": 0.71875, "kl/beta": 0.08208024501800537, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.999967061337492e-07, "logits/chosen": -2.9308741092681885, "logits/rejected": -1.5702245235443115, "logps/chosen": -91.07780456542969, "logps/ref_chosen": -90.97735595703125, "logps/ref_rejected": -102.59103393554688, "logps/rejected": -106.28335571289062, "loss": 1.14, "rewards/accuracies": 0.90625, "rewards/chosen": -0.00880364328622818, "rewards/margins": 0.291946142911911, "rewards/rejected": -0.3007497787475586, "step": 71 }, { "epoch": 0.10572687224669604, "grad_norm": 25.40204620361328, "kl/avg_steps": 0.5625, "kl/beta": 0.08149450272321701, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.999868246217933e-07, "logits/chosen": -2.533379077911377, "logits/rejected": -1.5070923566818237, "logps/chosen": -98.47068786621094, "logps/ref_chosen": -97.89152526855469, "logps/ref_rejected": -100.19171142578125, "logps/rejected": -103.68728637695312, "loss": 1.1831, "rewards/accuracies": 0.8125, "rewards/chosen": -0.04744531959295273, "rewards/margins": 0.23507529497146606, "rewards/rejected": -0.282520592212677, "step": 72 }, { "epoch": 0.10719530102790015, "grad_norm": 24.58971405029297, "kl/avg_steps": 0.40625, "kl/beta": 0.08103866130113602, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.999703557245192e-07, "logits/chosen": -2.8086342811584473, "logits/rejected": -1.789698600769043, "logps/chosen": -96.5920181274414, "logps/ref_chosen": -95.7690200805664, "logps/ref_rejected": -95.93243408203125, "logps/rejected": -99.84471130371094, "loss": 1.1906, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06719671189785004, "rewards/margins": 0.24726057052612305, "rewards/rejected": -0.3144572973251343, "step": 73 }, { "epoch": 0.10866372980910426, "grad_norm": 27.501953125, "kl/avg_steps": 0.5, "kl/beta": 0.080710768699646, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.999472998758977e-07, "logits/chosen": -2.912767171859741, "logits/rejected": -1.9761888980865479, "logps/chosen": -79.79828643798828, "logps/ref_chosen": -78.80839538574219, "logps/ref_rejected": -101.64676666259766, "logps/rejected": -106.22779083251953, "loss": 1.1518, "rewards/accuracies": 0.765625, "rewards/chosen": -0.08012489974498749, "rewards/margins": 0.2865511476993561, "rewards/rejected": -0.3666760325431824, "step": 74 }, { "epoch": 0.11013215859030837, "grad_norm": 29.259719848632812, "kl/avg_steps": 0.53125, "kl/beta": 0.08030922710895538, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.999176576834721e-07, "logits/chosen": -2.8685462474823, "logits/rejected": -1.5099756717681885, "logps/chosen": -79.43289947509766, "logps/ref_chosen": -78.28185272216797, "logps/ref_rejected": -115.40311431884766, "logps/rejected": -121.06864929199219, "loss": 1.1034, "rewards/accuracies": 0.828125, "rewards/chosen": -0.09254170209169388, "rewards/margins": 0.35883650183677673, "rewards/rejected": -0.45137819647789, "step": 75 }, { "epoch": 0.11160058737151249, "grad_norm": 19.229440689086914, "kl/avg_steps": 0.375, "kl/beta": 0.079884834587574, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.998814299283415e-07, "logits/chosen": -3.326892375946045, "logits/rejected": -2.0114529132843018, "logps/chosen": -89.3749008178711, "logps/ref_chosen": -87.87714385986328, "logps/ref_rejected": -85.71968078613281, "logps/rejected": -90.23574829101562, "loss": 1.1945, "rewards/accuracies": 0.796875, "rewards/chosen": -0.12006325274705887, "rewards/margins": 0.23822948336601257, "rewards/rejected": -0.35829272866249084, "step": 76 }, { "epoch": 0.1130690161527166, "grad_norm": 22.950105667114258, "kl/avg_steps": 0.4375, "kl/beta": 0.07958638668060303, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.998386175651409e-07, "logits/chosen": -2.7979376316070557, "logits/rejected": -1.765808343887329, "logps/chosen": -101.35743713378906, "logps/ref_chosen": -99.70034790039062, "logps/ref_rejected": -98.20576477050781, "logps/rejected": -103.30995178222656, "loss": 1.1752, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1325981616973877, "rewards/margins": 0.2707356810569763, "rewards/rejected": -0.4033338725566864, "step": 77 }, { "epoch": 0.1145374449339207, "grad_norm": 22.01249885559082, "kl/avg_steps": 0.484375, "kl/beta": 0.07923971116542816, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.997892217220159e-07, "logits/chosen": -2.7685134410858154, "logits/rejected": -1.9960670471191406, "logps/chosen": -91.00945281982422, "logps/ref_chosen": -90.29670715332031, "logps/ref_rejected": -91.13772583007812, "logps/rejected": -95.54225158691406, "loss": 1.1464, "rewards/accuracies": 0.84375, "rewards/chosen": -0.05697726085782051, "rewards/margins": 0.2894305884838104, "rewards/rejected": -0.34640786051750183, "step": 78 }, { "epoch": 0.11600587371512482, "grad_norm": 24.0369815826416, "kl/avg_steps": 0.59375, "kl/beta": 0.07885774970054626, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.997332437005931e-07, "logits/chosen": -3.548121452331543, "logits/rejected": -2.20076847076416, "logps/chosen": -87.27758026123047, "logps/ref_chosen": -86.37832641601562, "logps/ref_rejected": -94.10777282714844, "logps/rejected": -99.85430908203125, "loss": 1.0944, "rewards/accuracies": 0.796875, "rewards/chosen": -0.0716433972120285, "rewards/margins": 0.37777939438819885, "rewards/rejected": -0.44942277669906616, "step": 79 }, { "epoch": 0.11747430249632893, "grad_norm": 22.031478881835938, "kl/avg_steps": 0.40625, "kl/beta": 0.07839228957891464, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.996706849759452e-07, "logits/chosen": -3.074854850769043, "logits/rejected": -2.0579161643981934, "logps/chosen": -95.61634826660156, "logps/ref_chosen": -93.97032165527344, "logps/ref_rejected": -92.57441711425781, "logps/rejected": -98.0230941772461, "loss": 1.1543, "rewards/accuracies": 0.8125, "rewards/chosen": -0.12929072976112366, "rewards/margins": 0.29479026794433594, "rewards/rejected": -0.424081027507782, "step": 80 }, { "epoch": 0.11894273127753303, "grad_norm": 24.160234451293945, "kl/avg_steps": 0.53125, "kl/beta": 0.078075110912323, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.996015471965529e-07, "logits/chosen": -3.137648105621338, "logits/rejected": -1.6095255613327026, "logps/chosen": -100.93592834472656, "logps/ref_chosen": -99.83012390136719, "logps/ref_rejected": -133.67245483398438, "logps/rejected": -140.25442504882812, "loss": 1.0766, "rewards/accuracies": 0.875, "rewards/chosen": -0.0865008607506752, "rewards/margins": 0.42291849851608276, "rewards/rejected": -0.509419322013855, "step": 81 }, { "epoch": 0.12041116005873716, "grad_norm": 21.8071346282959, "kl/avg_steps": 0.53125, "kl/beta": 0.07766252756118774, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.995258321842611e-07, "logits/chosen": -3.389235019683838, "logits/rejected": -2.2665112018585205, "logps/chosen": -85.08958435058594, "logps/ref_chosen": -83.04598236083984, "logps/ref_rejected": -94.52595520019531, "logps/rejected": -100.36309814453125, "loss": 1.1567, "rewards/accuracies": 0.796875, "rewards/chosen": -0.15859441459178925, "rewards/margins": 0.29103851318359375, "rewards/rejected": -0.4496329426765442, "step": 82 }, { "epoch": 0.12187958883994127, "grad_norm": 21.79849624633789, "kl/avg_steps": 0.28125, "kl/beta": 0.07725212723016739, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.994435419342304e-07, "logits/chosen": -3.9037327766418457, "logits/rejected": -2.2527427673339844, "logps/chosen": -94.64817810058594, "logps/ref_chosen": -92.17621612548828, "logps/ref_rejected": -107.74464416503906, "logps/rejected": -114.48917388916016, "loss": 1.1507, "rewards/accuracies": 0.703125, "rewards/chosen": -0.19215711951255798, "rewards/margins": 0.3255422115325928, "rewards/rejected": -0.5176993012428284, "step": 83 }, { "epoch": 0.12334801762114538, "grad_norm": 22.36842918395996, "kl/avg_steps": 0.3125, "kl/beta": 0.07703546434640884, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.993546786148857e-07, "logits/chosen": -3.4791769981384277, "logits/rejected": -2.393409490585327, "logps/chosen": -104.68441772460938, "logps/ref_chosen": -101.5264892578125, "logps/ref_rejected": -92.42608642578125, "logps/rejected": -98.19878387451172, "loss": 1.2405, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24369969964027405, "rewards/margins": 0.19829508662223816, "rewards/rejected": -0.4419947564601898, "step": 84 }, { "epoch": 0.12481644640234948, "grad_norm": 20.850772857666016, "kl/avg_steps": 0.25, "kl/beta": 0.07679548114538193, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.992592445678582e-07, "logits/chosen": -3.586297035217285, "logits/rejected": -2.9060792922973633, "logps/chosen": -98.8546371459961, "logps/ref_chosen": -96.12738037109375, "logps/ref_rejected": -85.05519104003906, "logps/rejected": -91.33653259277344, "loss": 1.1934, "rewards/accuracies": 0.734375, "rewards/chosen": -0.21086883544921875, "rewards/margins": 0.2689524292945862, "rewards/rejected": -0.4798212945461273, "step": 85 }, { "epoch": 0.1262848751835536, "grad_norm": 18.46939468383789, "kl/avg_steps": 0.40625, "kl/beta": 0.0766039714217186, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.991572423079235e-07, "logits/chosen": -3.4410600662231445, "logits/rejected": -2.2581920623779297, "logps/chosen": -83.9306640625, "logps/ref_chosen": -81.70426940917969, "logps/ref_rejected": -93.6554946899414, "logps/rejected": -100.36227416992188, "loss": 1.1471, "rewards/accuracies": 0.8125, "rewards/chosen": -0.17061173915863037, "rewards/margins": 0.3391328752040863, "rewards/rejected": -0.5097446441650391, "step": 86 }, { "epoch": 0.1277533039647577, "grad_norm": 21.282371520996094, "kl/avg_steps": 0.53125, "kl/beta": 0.07629402726888657, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.990486745229364e-07, "logits/chosen": -4.2626729011535645, "logits/rejected": -2.455996513366699, "logps/chosen": -95.24873352050781, "logps/ref_chosen": -92.68596649169922, "logps/ref_rejected": -102.91818237304688, "logps/rejected": -110.37586212158203, "loss": 1.137, "rewards/accuracies": 0.828125, "rewards/chosen": -0.19558626413345337, "rewards/margins": 0.36860454082489014, "rewards/rejected": -0.5641908049583435, "step": 87 }, { "epoch": 0.12922173274596183, "grad_norm": 19.734506607055664, "kl/avg_steps": 0.34375, "kl/beta": 0.07589085400104523, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.989335440737586e-07, "logits/chosen": -3.0765509605407715, "logits/rejected": -2.2642664909362793, "logps/chosen": -103.91253662109375, "logps/ref_chosen": -100.76298522949219, "logps/ref_rejected": -113.15037536621094, "logps/rejected": -120.1460189819336, "loss": 1.1828, "rewards/accuracies": 0.734375, "rewards/chosen": -0.239786297082901, "rewards/margins": 0.28768616914749146, "rewards/rejected": -0.5274724960327148, "step": 88 }, { "epoch": 0.13069016152716592, "grad_norm": 20.043725967407227, "kl/avg_steps": 0.34375, "kl/beta": 0.07563087344169617, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.988118539941847e-07, "logits/chosen": -3.5043540000915527, "logits/rejected": -2.6482343673706055, "logps/chosen": -92.20272827148438, "logps/ref_chosen": -89.69108581542969, "logps/ref_rejected": -88.56832885742188, "logps/rejected": -95.34589385986328, "loss": 1.1584, "rewards/accuracies": 0.75, "rewards/chosen": -0.19024960696697235, "rewards/margins": 0.3187325596809387, "rewards/rejected": -0.5089821815490723, "step": 89 }, { "epoch": 0.13215859030837004, "grad_norm": 22.877426147460938, "kl/avg_steps": 0.53125, "kl/beta": 0.07537178695201874, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.986836074908615e-07, "logits/chosen": -4.08919620513916, "logits/rejected": -2.59071683883667, "logps/chosen": -83.66722106933594, "logps/ref_chosen": -81.38255310058594, "logps/ref_rejected": -117.77714538574219, "logps/rejected": -125.76116180419922, "loss": 1.0847, "rewards/accuracies": 0.828125, "rewards/chosen": -0.1728699803352356, "rewards/margins": 0.4245910942554474, "rewards/rejected": -0.5974611043930054, "step": 90 }, { "epoch": 0.13362701908957417, "grad_norm": 21.175289154052734, "kl/avg_steps": 0.21875, "kl/beta": 0.07497348636388779, "kl/n_epsilon_steps": 0.390625, "kl/p_epsilon_steps": 0.609375, "learning_rate": 4.985488079432037e-07, "logits/chosen": -3.6395578384399414, "logits/rejected": -2.8058741092681885, "logps/chosen": -99.98194885253906, "logps/ref_chosen": -97.22188568115234, "logps/ref_rejected": -92.97674560546875, "logps/rejected": -100.44671630859375, "loss": 1.1414, "rewards/accuracies": 0.703125, "rewards/chosen": -0.20845842361450195, "rewards/margins": 0.3486502170562744, "rewards/rejected": -0.5571086406707764, "step": 91 }, { "epoch": 0.13509544787077826, "grad_norm": 19.441743850708008, "kl/avg_steps": 0.28125, "kl/beta": 0.07480984181165695, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.984074589033043e-07, "logits/chosen": -4.114226341247559, "logits/rejected": -3.050344944000244, "logps/chosen": -87.49899291992188, "logps/ref_chosen": -84.5302734375, "logps/ref_rejected": -84.5013198852539, "logps/rejected": -91.48877716064453, "loss": 1.1718, "rewards/accuracies": 0.703125, "rewards/chosen": -0.22265848517417908, "rewards/margins": 0.29672500491142273, "rewards/rejected": -0.5193834900856018, "step": 92 }, { "epoch": 0.13656387665198239, "grad_norm": 20.867734909057617, "kl/avg_steps": 0.28125, "kl/beta": 0.07460002601146698, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.982595640958425e-07, "logits/chosen": -3.9874632358551025, "logits/rejected": -2.371685266494751, "logps/chosen": -93.55166625976562, "logps/ref_chosen": -90.25043487548828, "logps/ref_rejected": -84.09422302246094, "logps/rejected": -91.32198333740234, "loss": 1.192, "rewards/accuracies": 0.71875, "rewards/chosen": -0.24706237018108368, "rewards/margins": 0.2884610891342163, "rewards/rejected": -0.535523533821106, "step": 93 }, { "epoch": 0.13803230543318648, "grad_norm": 22.021520614624023, "kl/avg_steps": 0.53125, "kl/beta": 0.07439080625772476, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.98105127417984e-07, "logits/chosen": -4.104582786560059, "logits/rejected": -2.78466796875, "logps/chosen": -95.84749603271484, "logps/ref_chosen": -92.4542236328125, "logps/ref_rejected": -105.24728393554688, "logps/rejected": -114.69227600097656, "loss": 1.0808, "rewards/accuracies": 0.8125, "rewards/chosen": -0.25266966223716736, "rewards/margins": 0.44442811608314514, "rewards/rejected": -0.6970977783203125, "step": 94 }, { "epoch": 0.1395007342143906, "grad_norm": 18.556907653808594, "kl/avg_steps": 0.34375, "kl/beta": 0.07399769127368927, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.979441529392784e-07, "logits/chosen": -4.07137393951416, "logits/rejected": -2.9404728412628174, "logps/chosen": -81.59260559082031, "logps/ref_chosen": -78.87370300292969, "logps/ref_rejected": -83.59121704101562, "logps/rejected": -91.0462875366211, "loss": 1.1379, "rewards/accuracies": 0.703125, "rewards/chosen": -0.201766699552536, "rewards/margins": 0.346103310585022, "rewards/rejected": -0.5478700399398804, "step": 95 }, { "epoch": 0.14096916299559473, "grad_norm": 19.24999237060547, "kl/avg_steps": 0.46875, "kl/beta": 0.07374419271945953, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.977766449015534e-07, "logits/chosen": -3.7676031589508057, "logits/rejected": -2.4514598846435547, "logps/chosen": -109.04351806640625, "logps/ref_chosen": -106.5921630859375, "logps/ref_rejected": -101.76802062988281, "logps/rejected": -110.20415496826172, "loss": 1.0757, "rewards/accuracies": 0.828125, "rewards/chosen": -0.18087545037269592, "rewards/margins": 0.43662530183792114, "rewards/rejected": -0.6175007224082947, "step": 96 }, { "epoch": 0.14243759177679882, "grad_norm": 21.76212501525879, "kl/avg_steps": 0.40625, "kl/beta": 0.07340013235807419, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.976026077188012e-07, "logits/chosen": -4.675760746002197, "logits/rejected": -3.565962791442871, "logps/chosen": -92.28569030761719, "logps/ref_chosen": -88.67988586425781, "logps/ref_rejected": -84.81229400634766, "logps/rejected": -93.66389465332031, "loss": 1.1006, "rewards/accuracies": 0.8125, "rewards/chosen": -0.26550614833831787, "rewards/margins": 0.380950003862381, "rewards/rejected": -0.6464561223983765, "step": 97 }, { "epoch": 0.14390602055800295, "grad_norm": 19.99883460998535, "kl/avg_steps": 0.53125, "kl/beta": 0.0731031522154808, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.974220459770639e-07, "logits/chosen": -3.876922607421875, "logits/rejected": -2.7262768745422363, "logps/chosen": -95.04853057861328, "logps/ref_chosen": -92.24249267578125, "logps/ref_rejected": -101.51948547363281, "logps/rejected": -110.45492553710938, "loss": 1.0488, "rewards/accuracies": 0.875, "rewards/chosen": -0.2048511505126953, "rewards/margins": 0.4435645043849945, "rewards/rejected": -0.6484156250953674, "step": 98 }, { "epoch": 0.14537444933920704, "grad_norm": 20.251489639282227, "kl/avg_steps": 0.53125, "kl/beta": 0.0727168396115303, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.972349644343108e-07, "logits/chosen": -4.384429454803467, "logits/rejected": -3.071768045425415, "logps/chosen": -74.27732849121094, "logps/ref_chosen": -72.18464660644531, "logps/ref_rejected": -91.88131713867188, "logps/rejected": -100.81095123291016, "loss": 1.0479, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15243776142597198, "rewards/margins": 0.49188998341560364, "rewards/rejected": -0.6443277597427368, "step": 99 }, { "epoch": 0.14684287812041116, "grad_norm": 17.54705047607422, "kl/avg_steps": 0.40625, "kl/beta": 0.07233257591724396, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.970413680203148e-07, "logits/chosen": -4.1084113121032715, "logits/rejected": -2.772047758102417, "logps/chosen": -92.06529235839844, "logps/ref_chosen": -89.51382446289062, "logps/ref_rejected": -81.21713256835938, "logps/rejected": -88.53147888183594, "loss": 1.1171, "rewards/accuracies": 0.828125, "rewards/chosen": -0.18433162569999695, "rewards/margins": 0.3410602807998657, "rewards/rejected": -0.5253919363021851, "step": 100 }, { "epoch": 0.14684287812041116, "eval_kl/n_epsilon_steps": 0.3857020437717438, "eval_kl/p_epsilon_steps": 0.6134417653083801, "eval_logits/chosen": -3.108599901199341, "eval_logits/rejected": -2.4245166778564453, "eval_logps/chosen": -104.82538604736328, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -101.79056549072266, "eval_loss": 0.6201064586639404, "eval_rewards/accuracies": 0.6840753555297852, "eval_rewards/chosen": -0.31309446692466736, "eval_rewards/margins": 0.24013973772525787, "eval_rewards/rejected": -0.5532342195510864, "eval_runtime": 48.5803, "eval_samples_per_second": 48.147, "eval_steps_per_second": 1.523, "step": 100 }, { "epoch": 0.14831130690161526, "grad_norm": 21.208911895751953, "kl/avg_steps": 0.34375, "kl/beta": 0.07203991711139679, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.968412618365215e-07, "logits/chosen": -3.8228065967559814, "logits/rejected": -3.2613918781280518, "logps/chosen": -98.40846252441406, "logps/ref_chosen": -94.4031982421875, "logps/ref_rejected": -89.91001892089844, "logps/rejected": -98.31871032714844, "loss": 1.1745, "rewards/accuracies": 0.75, "rewards/chosen": -0.2888886034488678, "rewards/margins": 0.3126002252101898, "rewards/rejected": -0.6014888286590576, "step": 101 }, { "epoch": 0.14977973568281938, "grad_norm": 26.08466148376465, "kl/avg_steps": 0.390625, "kl/beta": 0.07179312407970428, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.966346511559149e-07, "logits/chosen": -4.024471282958984, "logits/rejected": -3.1433403491973877, "logps/chosen": -96.39299774169922, "logps/ref_chosen": -91.60942840576172, "logps/ref_rejected": -75.76891326904297, "logps/rejected": -84.2006607055664, "loss": 1.2292, "rewards/accuracies": 0.75, "rewards/chosen": -0.3448712229728699, "rewards/margins": 0.2573922276496887, "rewards/rejected": -0.6022634506225586, "step": 102 }, { "epoch": 0.1512481644640235, "grad_norm": 20.780960083007812, "kl/avg_steps": 0.53125, "kl/beta": 0.07151377201080322, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.964215414228785e-07, "logits/chosen": -4.443181037902832, "logits/rejected": -3.242924928665161, "logps/chosen": -101.16336059570312, "logps/ref_chosen": -97.40238952636719, "logps/ref_rejected": -98.98168182373047, "logps/rejected": -109.21807861328125, "loss": 1.0673, "rewards/accuracies": 0.78125, "rewards/chosen": -0.26924505829811096, "rewards/margins": 0.4573027193546295, "rewards/rejected": -0.7265477776527405, "step": 103 }, { "epoch": 0.1527165932452276, "grad_norm": 17.77613639831543, "kl/avg_steps": 0.40625, "kl/beta": 0.07113586366176605, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.96201938253052e-07, "logits/chosen": -4.106165885925293, "logits/rejected": -3.3227481842041016, "logps/chosen": -97.89046478271484, "logps/ref_chosen": -94.85636138916016, "logps/ref_rejected": -97.02668762207031, "logps/rejected": -106.888427734375, "loss": 1.0609, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2158360481262207, "rewards/margins": 0.4804786741733551, "rewards/rejected": -0.6963146924972534, "step": 104 }, { "epoch": 0.15418502202643172, "grad_norm": 20.64962387084961, "kl/avg_steps": 0.53125, "kl/beta": 0.07084804773330688, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.959758474331832e-07, "logits/chosen": -4.99745512008667, "logits/rejected": -3.3221869468688965, "logps/chosen": -100.64320373535156, "logps/ref_chosen": -96.75790405273438, "logps/ref_rejected": -101.5774917602539, "logps/rejected": -112.40327453613281, "loss": 1.0544, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2757706940174103, "rewards/margins": 0.4858649969100952, "rewards/rejected": -0.7616356611251831, "step": 105 }, { "epoch": 0.15565345080763582, "grad_norm": 19.51259422302246, "kl/avg_steps": 0.46875, "kl/beta": 0.07047365605831146, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.957432749209755e-07, "logits/chosen": -4.652019500732422, "logits/rejected": -3.1131434440612793, "logps/chosen": -96.30393981933594, "logps/ref_chosen": -92.43791198730469, "logps/ref_rejected": -91.00969696044922, "logps/rejected": -101.36027526855469, "loss": 1.0813, "rewards/accuracies": 0.78125, "rewards/chosen": -0.27257949113845825, "rewards/margins": 0.4514220058917999, "rewards/rejected": -0.7240015268325806, "step": 106 }, { "epoch": 0.15712187958883994, "grad_norm": 19.533924102783203, "kl/avg_steps": 0.5, "kl/beta": 0.07014484703540802, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.955042268449307e-07, "logits/chosen": -3.9555153846740723, "logits/rejected": -2.785179615020752, "logps/chosen": -108.09788513183594, "logps/ref_chosen": -105.12500762939453, "logps/ref_rejected": -101.18250274658203, "logps/rejected": -111.86402130126953, "loss": 1.0554, "rewards/accuracies": 0.828125, "rewards/chosen": -0.20865142345428467, "rewards/margins": 0.5342296361923218, "rewards/rejected": -0.7428811192512512, "step": 107 }, { "epoch": 0.15859030837004406, "grad_norm": 19.818805694580078, "kl/avg_steps": 0.46875, "kl/beta": 0.0697958692908287, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.952587095041881e-07, "logits/chosen": -5.145913124084473, "logits/rejected": -3.347729206085205, "logps/chosen": -93.46691131591797, "logps/ref_chosen": -89.20936584472656, "logps/ref_rejected": -101.83784484863281, "logps/rejected": -113.4327163696289, "loss": 1.0533, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29724550247192383, "rewards/margins": 0.5062732100486755, "rewards/rejected": -0.8035186529159546, "step": 108 }, { "epoch": 0.16005873715124816, "grad_norm": 18.633304595947266, "kl/avg_steps": 0.375, "kl/beta": 0.06947022676467896, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.95006729368358e-07, "logits/chosen": -4.46284294128418, "logits/rejected": -3.8937301635742188, "logps/chosen": -103.74415588378906, "logps/ref_chosen": -100.18708801269531, "logps/ref_rejected": -103.53739929199219, "logps/rejected": -116.03982543945312, "loss": 0.9893, "rewards/accuracies": 0.765625, "rewards/chosen": -0.24741317331790924, "rewards/margins": 0.6145913004875183, "rewards/rejected": -0.8620044589042664, "step": 109 }, { "epoch": 0.16152716593245228, "grad_norm": 22.954145431518555, "kl/avg_steps": 0.3125, "kl/beta": 0.06921068578958511, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.947482930773511e-07, "logits/chosen": -4.040766716003418, "logits/rejected": -3.4033002853393555, "logps/chosen": -111.32714080810547, "logps/ref_chosen": -106.89572143554688, "logps/ref_rejected": -85.81120300292969, "logps/rejected": -95.40037536621094, "loss": 1.1493, "rewards/accuracies": 0.75, "rewards/chosen": -0.30703774094581604, "rewards/margins": 0.3521472215652466, "rewards/rejected": -0.6591849327087402, "step": 110 }, { "epoch": 0.16299559471365638, "grad_norm": 17.981557846069336, "kl/avg_steps": 0.5625, "kl/beta": 0.06899508088827133, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.944834074412042e-07, "logits/chosen": -4.594253063201904, "logits/rejected": -3.7376368045806885, "logps/chosen": -97.20205688476562, "logps/ref_chosen": -93.3293685913086, "logps/ref_rejected": -104.177490234375, "logps/rejected": -118.46589660644531, "loss": 0.9406, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2670831084251404, "rewards/margins": 0.7102484703063965, "rewards/rejected": -0.9773316383361816, "step": 111 }, { "epoch": 0.1644640234948605, "grad_norm": 19.202064514160156, "kl/avg_steps": 0.1875, "kl/beta": 0.06860914826393127, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.942120794399002e-07, "logits/chosen": -5.043312072753906, "logits/rejected": -4.008922576904297, "logps/chosen": -87.14385986328125, "logps/ref_chosen": -82.7470474243164, "logps/ref_rejected": -71.55845642089844, "logps/rejected": -81.11387634277344, "loss": 1.1448, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3032221496105194, "rewards/margins": 0.3490729331970215, "rewards/rejected": -0.6522951126098633, "step": 112 }, { "epoch": 0.16593245227606462, "grad_norm": 22.44894790649414, "kl/avg_steps": 0.40625, "kl/beta": 0.06848075240850449, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.939343162231841e-07, "logits/chosen": -4.6388139724731445, "logits/rejected": -3.630887031555176, "logps/chosen": -103.29286193847656, "logps/ref_chosen": -97.89949798583984, "logps/ref_rejected": -84.41960144042969, "logps/rejected": -95.4296875, "loss": 1.1132, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37001150846481323, "rewards/margins": 0.38021430373191833, "rewards/rejected": -0.7502257823944092, "step": 113 }, { "epoch": 0.16740088105726872, "grad_norm": 22.647991180419922, "kl/avg_steps": 0.34375, "kl/beta": 0.06820367276668549, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.936501251103751e-07, "logits/chosen": -5.111181259155273, "logits/rejected": -3.6170125007629395, "logps/chosen": -101.57315063476562, "logps/ref_chosen": -96.4951171875, "logps/ref_rejected": -92.01536560058594, "logps/rejected": -105.01116180419922, "loss": 1.0716, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34747976064682007, "rewards/margins": 0.533342719078064, "rewards/rejected": -0.880822479724884, "step": 114 }, { "epoch": 0.16886930983847284, "grad_norm": 20.238996505737305, "kl/avg_steps": 0.4375, "kl/beta": 0.06797002255916595, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.933595135901732e-07, "logits/chosen": -4.847101211547852, "logits/rejected": -3.5432467460632324, "logps/chosen": -108.37738800048828, "logps/ref_chosen": -103.18013000488281, "logps/ref_rejected": -105.71595764160156, "logps/rejected": -118.87684631347656, "loss": 1.0422, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3536038398742676, "rewards/margins": 0.5348291993141174, "rewards/rejected": -0.8884330987930298, "step": 115 }, { "epoch": 0.17033773861967694, "grad_norm": 17.519254684448242, "kl/avg_steps": 0.375, "kl/beta": 0.0676739513874054, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.930624893204624e-07, "logits/chosen": -5.042938232421875, "logits/rejected": -3.4571943283081055, "logps/chosen": -85.59718322753906, "logps/ref_chosen": -81.15837097167969, "logps/ref_rejected": -86.30586242675781, "logps/rejected": -97.48081970214844, "loss": 1.064, "rewards/accuracies": 0.78125, "rewards/chosen": -0.300454318523407, "rewards/margins": 0.45105841755867004, "rewards/rejected": -0.7515127062797546, "step": 116 }, { "epoch": 0.17180616740088106, "grad_norm": 29.290815353393555, "kl/avg_steps": 0.3125, "kl/beta": 0.06742112338542938, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.927590601281083e-07, "logits/chosen": -4.475282192230225, "logits/rejected": -3.8478426933288574, "logps/chosen": -102.51126098632812, "logps/ref_chosen": -97.0809555053711, "logps/ref_rejected": -75.1399154663086, "logps/rejected": -85.75031280517578, "loss": 1.1723, "rewards/accuracies": 0.75, "rewards/chosen": -0.36769068241119385, "rewards/margins": 0.34401029348373413, "rewards/rejected": -0.7117009162902832, "step": 117 }, { "epoch": 0.17327459618208516, "grad_norm": 16.853479385375977, "kl/avg_steps": 0.484375, "kl/beta": 0.0672110915184021, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.924492340087524e-07, "logits/chosen": -4.661401748657227, "logits/rejected": -3.8089747428894043, "logps/chosen": -90.43417358398438, "logps/ref_chosen": -86.5362319946289, "logps/ref_rejected": -82.93563842773438, "logps/rejected": -94.443603515625, "loss": 1.0216, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2619098126888275, "rewards/margins": 0.506035327911377, "rewards/rejected": -0.7679451107978821, "step": 118 }, { "epoch": 0.17474302496328928, "grad_norm": 20.742483139038086, "kl/avg_steps": 0.5, "kl/beta": 0.06688710302114487, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.92133019126601e-07, "logits/chosen": -5.028365135192871, "logits/rejected": -4.106209754943848, "logps/chosen": -110.28329467773438, "logps/ref_chosen": -104.60908508300781, "logps/ref_rejected": -109.1578140258789, "logps/rejected": -122.87940979003906, "loss": 1.0755, "rewards/accuracies": 0.796875, "rewards/chosen": -0.38021495938301086, "rewards/margins": 0.5309704542160034, "rewards/rejected": -0.9111853837966919, "step": 119 }, { "epoch": 0.1762114537444934, "grad_norm": 25.90955924987793, "kl/avg_steps": 0.59375, "kl/beta": 0.06655433028936386, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.918104238142103e-07, "logits/chosen": -4.781673431396484, "logits/rejected": -3.775351047515869, "logps/chosen": -119.28671264648438, "logps/ref_chosen": -114.68742370605469, "logps/ref_rejected": -111.18586730957031, "logps/rejected": -126.28087615966797, "loss": 1.0054, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30640918016433716, "rewards/margins": 0.6896119117736816, "rewards/rejected": -0.9960211515426636, "step": 120 }, { "epoch": 0.1776798825256975, "grad_norm": 18.882291793823242, "kl/avg_steps": 0.53125, "kl/beta": 0.06616149842739105, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.91481456572267e-07, "logits/chosen": -5.0012335777282715, "logits/rejected": -3.5916976928710938, "logps/chosen": -96.17632293701172, "logps/ref_chosen": -92.11666107177734, "logps/ref_rejected": -114.972900390625, "logps/rejected": -128.61309814453125, "loss": 0.9533, "rewards/accuracies": 0.828125, "rewards/chosen": -0.26827573776245117, "rewards/margins": 0.6271148920059204, "rewards/rejected": -0.8953906297683716, "step": 121 }, { "epoch": 0.17914831130690162, "grad_norm": 18.61724281311035, "kl/avg_steps": 0.5, "kl/beta": 0.0658118724822998, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.911461260693638e-07, "logits/chosen": -4.714388847351074, "logits/rejected": -3.9871459007263184, "logps/chosen": -89.19412994384766, "logps/ref_chosen": -84.87757110595703, "logps/ref_rejected": -111.0829849243164, "logps/rejected": -126.41008758544922, "loss": 0.9203, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2842973470687866, "rewards/margins": 0.716621994972229, "rewards/rejected": -1.0009193420410156, "step": 122 }, { "epoch": 0.18061674008810572, "grad_norm": 18.020599365234375, "kl/avg_steps": 0.3125, "kl/beta": 0.06548444926738739, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.908044411417711e-07, "logits/chosen": -4.689582824707031, "logits/rejected": -3.9507789611816406, "logps/chosen": -103.35702514648438, "logps/ref_chosen": -98.47874450683594, "logps/ref_rejected": -93.85417175292969, "logps/rejected": -106.42095947265625, "loss": 1.0567, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3202221989631653, "rewards/margins": 0.49699968099594116, "rewards/rejected": -0.8172218799591064, "step": 123 }, { "epoch": 0.18208516886930984, "grad_norm": 18.883220672607422, "kl/avg_steps": 0.375, "kl/beta": 0.06528045237064362, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.904564107932048e-07, "logits/chosen": -4.445045471191406, "logits/rejected": -3.482326030731201, "logps/chosen": -107.31734466552734, "logps/ref_chosen": -102.76290130615234, "logps/ref_rejected": -123.005615234375, "logps/rejected": -137.15272521972656, "loss": 1.0345, "rewards/accuracies": 0.703125, "rewards/chosen": -0.29794853925704956, "rewards/margins": 0.6185939908027649, "rewards/rejected": -0.9165425300598145, "step": 124 }, { "epoch": 0.18355359765051396, "grad_norm": 21.531150817871094, "kl/avg_steps": 0.28125, "kl/beta": 0.06503656506538391, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.90102044194588e-07, "logits/chosen": -4.837682247161865, "logits/rejected": -4.35373592376709, "logps/chosen": -96.40164947509766, "logps/ref_chosen": -90.42627716064453, "logps/ref_rejected": -89.24901580810547, "logps/rejected": -102.02460479736328, "loss": 1.1176, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3898157477378845, "rewards/margins": 0.43594902753829956, "rewards/rejected": -0.8257647752761841, "step": 125 }, { "epoch": 0.18502202643171806, "grad_norm": 18.599184036254883, "kl/avg_steps": 0.59375, "kl/beta": 0.06485415995121002, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.897413506838102e-07, "logits/chosen": -4.741695404052734, "logits/rejected": -3.933988094329834, "logps/chosen": -99.16655731201172, "logps/ref_chosen": -94.42288208007812, "logps/ref_rejected": -103.88156127929688, "logps/rejected": -118.57122039794922, "loss": 0.94, "rewards/accuracies": 0.875, "rewards/chosen": -0.30715566873550415, "rewards/margins": 0.6382361650466919, "rewards/rejected": -0.9453917741775513, "step": 126 }, { "epoch": 0.18649045521292218, "grad_norm": 20.271377563476562, "kl/avg_steps": 0.28125, "kl/beta": 0.06447136402130127, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 4.89374339765481e-07, "logits/chosen": -5.095927715301514, "logits/rejected": -3.9779820442199707, "logps/chosen": -89.53435516357422, "logps/ref_chosen": -84.75105285644531, "logps/ref_rejected": -83.54523468017578, "logps/rejected": -95.63729858398438, "loss": 1.0846, "rewards/accuracies": 0.75, "rewards/chosen": -0.3097778856754303, "rewards/margins": 0.4655241370201111, "rewards/rejected": -0.7753020524978638, "step": 127 }, { "epoch": 0.18795888399412627, "grad_norm": 22.813669204711914, "kl/avg_steps": 0.328125, "kl/beta": 0.06429054588079453, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.890010211106795e-07, "logits/chosen": -4.714390754699707, "logits/rejected": -3.6033260822296143, "logps/chosen": -95.26617431640625, "logps/ref_chosen": -90.61964416503906, "logps/ref_rejected": -83.10347747802734, "logps/rejected": -95.3511734008789, "loss": 1.1062, "rewards/accuracies": 0.75, "rewards/chosen": -0.29941076040267944, "rewards/margins": 0.482503741979599, "rewards/rejected": -0.7819145321846008, "step": 128 }, { "epoch": 0.1894273127753304, "grad_norm": 18.353992462158203, "kl/avg_steps": 0.25, "kl/beta": 0.06408028304576874, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.88621404556699e-07, "logits/chosen": -4.422687530517578, "logits/rejected": -3.368746757507324, "logps/chosen": -104.02716064453125, "logps/ref_chosen": -98.91870880126953, "logps/ref_rejected": -103.28419494628906, "logps/rejected": -116.28022003173828, "loss": 1.0793, "rewards/accuracies": 0.6875, "rewards/chosen": -0.32816505432128906, "rewards/margins": 0.4989137649536133, "rewards/rejected": -0.8270788192749023, "step": 129 }, { "epoch": 0.19089574155653452, "grad_norm": 16.212928771972656, "kl/avg_steps": 0.5625, "kl/beta": 0.06392048299312592, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.882355001067891e-07, "logits/chosen": -5.384788513183594, "logits/rejected": -4.525052070617676, "logps/chosen": -77.16912841796875, "logps/ref_chosen": -73.18368530273438, "logps/ref_rejected": -88.02154541015625, "logps/rejected": -102.19236755371094, "loss": 0.9753, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2548287510871887, "rewards/margins": 0.6435878276824951, "rewards/rejected": -0.8984166383743286, "step": 130 }, { "epoch": 0.19236417033773862, "grad_norm": 18.157773971557617, "kl/avg_steps": 0.5625, "kl/beta": 0.06356293708086014, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.878433179298909e-07, "logits/chosen": -5.078728675842285, "logits/rejected": -3.907397747039795, "logps/chosen": -77.2987289428711, "logps/ref_chosen": -73.9564437866211, "logps/ref_rejected": -94.45582580566406, "logps/rejected": -107.02981567382812, "loss": 1.0071, "rewards/accuracies": 0.765625, "rewards/chosen": -0.21306927502155304, "rewards/margins": 0.5797268748283386, "rewards/rejected": -0.7927961349487305, "step": 131 }, { "epoch": 0.19383259911894274, "grad_norm": 21.45513153076172, "kl/avg_steps": 0.59375, "kl/beta": 0.06320739537477493, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.874448683603694e-07, "logits/chosen": -5.31199836730957, "logits/rejected": -4.291113376617432, "logps/chosen": -97.30502319335938, "logps/ref_chosen": -92.43948364257812, "logps/ref_rejected": -94.13251495361328, "logps/rejected": -108.19319152832031, "loss": 1.0126, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3079445958137512, "rewards/margins": 0.5743677020072937, "rewards/rejected": -0.8823122978210449, "step": 132 }, { "epoch": 0.19530102790014683, "grad_norm": 17.602924346923828, "kl/avg_steps": 0.5, "kl/beta": 0.06283431500196457, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.870401618977415e-07, "logits/chosen": -5.221474647521973, "logits/rejected": -4.042792320251465, "logps/chosen": -97.0290298461914, "logps/ref_chosen": -93.11666107177734, "logps/ref_rejected": -102.09491729736328, "logps/rejected": -115.60842895507812, "loss": 0.9767, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24647247791290283, "rewards/margins": 0.5967481732368469, "rewards/rejected": -0.843220591545105, "step": 133 }, { "epoch": 0.19676945668135096, "grad_norm": 16.935972213745117, "kl/avg_steps": 0.4375, "kl/beta": 0.06252171099185944, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.866292092063986e-07, "logits/chosen": -5.074636936187744, "logits/rejected": -4.259199142456055, "logps/chosen": -87.7016830444336, "logps/ref_chosen": -83.55012512207031, "logps/ref_rejected": -93.8815689086914, "logps/rejected": -108.22308349609375, "loss": 0.9528, "rewards/accuracies": 0.828125, "rewards/chosen": -0.26006680727005005, "rewards/margins": 0.6304125785827637, "rewards/rejected": -0.8904794454574585, "step": 134 }, { "epoch": 0.19823788546255505, "grad_norm": 21.895069122314453, "kl/avg_steps": 0.5625, "kl/beta": 0.06224936991930008, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.862120211153265e-07, "logits/chosen": -4.999515533447266, "logits/rejected": -3.6168813705444336, "logps/chosen": -81.46526336669922, "logps/ref_chosen": -77.58262634277344, "logps/ref_rejected": -121.05648803710938, "logps/rejected": -134.99432373046875, "loss": 0.9797, "rewards/accuracies": 0.84375, "rewards/chosen": -0.24222984910011292, "rewards/margins": 0.6191209554672241, "rewards/rejected": -0.8613507747650146, "step": 135 }, { "epoch": 0.19970631424375918, "grad_norm": 17.361907958984375, "kl/avg_steps": 0.46875, "kl/beta": 0.06190117448568344, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.857886086178193e-07, "logits/chosen": -5.6198625564575195, "logits/rejected": -4.077550888061523, "logps/chosen": -98.3825454711914, "logps/ref_chosen": -94.1727294921875, "logps/ref_rejected": -103.26428985595703, "logps/rejected": -117.48695373535156, "loss": 0.9543, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2606146037578583, "rewards/margins": 0.613581120967865, "rewards/rejected": -0.8741957545280457, "step": 136 }, { "epoch": 0.2011747430249633, "grad_norm": 18.447507858276367, "kl/avg_steps": 0.8125, "kl/beta": 0.06161236763000488, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.853589828711902e-07, "logits/chosen": -5.679508686065674, "logits/rejected": -4.036385536193848, "logps/chosen": -81.5108642578125, "logps/ref_chosen": -78.11874389648438, "logps/ref_rejected": -117.92237091064453, "logps/rejected": -136.72076416015625, "loss": 0.8065, "rewards/accuracies": 0.921875, "rewards/chosen": -0.20868739485740662, "rewards/margins": 0.9392319321632385, "rewards/rejected": -1.1479192972183228, "step": 137 }, { "epoch": 0.2026431718061674, "grad_norm": 16.60499382019043, "kl/avg_steps": 0.5625, "kl/beta": 0.061115801334381104, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.849231551964771e-07, "logits/chosen": -4.7099151611328125, "logits/rejected": -4.268791198730469, "logps/chosen": -98.61029815673828, "logps/ref_chosen": -92.82994079589844, "logps/ref_rejected": -98.64839935302734, "logps/rejected": -113.98179626464844, "loss": 0.992, "rewards/accuracies": 0.828125, "rewards/chosen": -0.35308200120925903, "rewards/margins": 0.5774024724960327, "rewards/rejected": -0.9304844737052917, "step": 138 }, { "epoch": 0.20411160058737152, "grad_norm": 16.9788875579834, "kl/avg_steps": 0.65625, "kl/beta": 0.06077394634485245, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.844811370781446e-07, "logits/chosen": -5.217746734619141, "logits/rejected": -4.438928127288818, "logps/chosen": -82.84307861328125, "logps/ref_chosen": -79.00301361083984, "logps/ref_rejected": -85.82733154296875, "logps/rejected": -100.77876281738281, "loss": 0.9595, "rewards/accuracies": 0.859375, "rewards/chosen": -0.23357324302196503, "rewards/margins": 0.6675229072570801, "rewards/rejected": -0.9010961055755615, "step": 139 }, { "epoch": 0.2055800293685756, "grad_norm": 15.984687805175781, "kl/avg_steps": 0.5, "kl/beta": 0.06037771701812744, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.840329401637809e-07, "logits/chosen": -5.573225975036621, "logits/rejected": -4.482275485992432, "logps/chosen": -92.46583557128906, "logps/ref_chosen": -88.35244750976562, "logps/ref_rejected": -90.57477569580078, "logps/rejected": -104.81709289550781, "loss": 0.9655, "rewards/accuracies": 0.84375, "rewards/chosen": -0.24829351902008057, "rewards/margins": 0.6051296591758728, "rewards/rejected": -0.8534231781959534, "step": 140 }, { "epoch": 0.20704845814977973, "grad_norm": 22.27216911315918, "kl/avg_steps": 0.65625, "kl/beta": 0.06007733196020126, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.83578576263792e-07, "logits/chosen": -5.062009811401367, "logits/rejected": -4.248960018157959, "logps/chosen": -112.61961364746094, "logps/ref_chosen": -107.22016906738281, "logps/ref_rejected": -104.71223449707031, "logps/rejected": -121.64073181152344, "loss": 0.9503, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3238122761249542, "rewards/margins": 0.6848909854888916, "rewards/rejected": -1.0087032318115234, "step": 141 }, { "epoch": 0.20851688693098386, "grad_norm": 16.56783676147461, "kl/avg_steps": 0.5625, "kl/beta": 0.05968564376235008, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.83118057351089e-07, "logits/chosen": -5.5315375328063965, "logits/rejected": -4.753322601318359, "logps/chosen": -94.61272430419922, "logps/ref_chosen": -88.72894287109375, "logps/ref_rejected": -99.73038482666016, "logps/rejected": -116.76524353027344, "loss": 0.9498, "rewards/accuracies": 0.8125, "rewards/chosen": -0.34996697306632996, "rewards/margins": 0.6583119630813599, "rewards/rejected": -1.0082789659500122, "step": 142 }, { "epoch": 0.20998531571218795, "grad_norm": 16.937776565551758, "kl/avg_steps": 0.375, "kl/beta": 0.05935179069638252, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.826513955607734e-07, "logits/chosen": -5.914580345153809, "logits/rejected": -4.424531936645508, "logps/chosen": -89.47615051269531, "logps/ref_chosen": -85.50457763671875, "logps/ref_rejected": -85.8056640625, "logps/rejected": -99.1048583984375, "loss": 1.0174, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2356262505054474, "rewards/margins": 0.547943115234375, "rewards/rejected": -0.7835693359375, "step": 143 }, { "epoch": 0.21145374449339208, "grad_norm": 22.844528198242188, "kl/avg_steps": 0.4375, "kl/beta": 0.05913005396723747, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.821786031898176e-07, "logits/chosen": -5.4832234382629395, "logits/rejected": -4.616265296936035, "logps/chosen": -109.42718505859375, "logps/ref_chosen": -104.68708801269531, "logps/ref_rejected": -87.71908569335938, "logps/rejected": -101.95695495605469, "loss": 1.0494, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2811855673789978, "rewards/margins": 0.5548486113548279, "rewards/rejected": -0.8360341787338257, "step": 144 }, { "epoch": 0.21292217327459617, "grad_norm": 17.371044158935547, "kl/avg_steps": 0.5, "kl/beta": 0.058872487396001816, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.816996926967401e-07, "logits/chosen": -5.808310031890869, "logits/rejected": -4.368202209472656, "logps/chosen": -92.62870788574219, "logps/ref_chosen": -88.51988983154297, "logps/ref_rejected": -83.76513671875, "logps/rejected": -99.01193237304688, "loss": 0.948, "rewards/accuracies": 0.828125, "rewards/chosen": -0.24210438132286072, "rewards/margins": 0.6486882567405701, "rewards/rejected": -0.8907926678657532, "step": 145 }, { "epoch": 0.2143906020558003, "grad_norm": 17.89594841003418, "kl/avg_steps": 0.59375, "kl/beta": 0.05857958644628525, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.812146767012779e-07, "logits/chosen": -5.770585060119629, "logits/rejected": -4.961735725402832, "logps/chosen": -107.27189636230469, "logps/ref_chosen": -102.9058609008789, "logps/ref_rejected": -88.9493637084961, "logps/rejected": -105.66189575195312, "loss": 0.9083, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2555050253868103, "rewards/margins": 0.715686559677124, "rewards/rejected": -0.9711916446685791, "step": 146 }, { "epoch": 0.21585903083700442, "grad_norm": 16.04936408996582, "kl/avg_steps": 0.5625, "kl/beta": 0.058233823627233505, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.807235679840536e-07, "logits/chosen": -6.048250198364258, "logits/rejected": -5.216745853424072, "logps/chosen": -93.50482177734375, "logps/ref_chosen": -89.26219177246094, "logps/ref_rejected": -76.50011444091797, "logps/rejected": -93.5650405883789, "loss": 0.8803, "rewards/accuracies": 0.828125, "rewards/chosen": -0.2465764731168747, "rewards/margins": 0.7393079996109009, "rewards/rejected": -0.9858845472335815, "step": 147 }, { "epoch": 0.2173274596182085, "grad_norm": 17.10126304626465, "kl/avg_steps": 0.375, "kl/beta": 0.05790809169411659, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.802263794862384e-07, "logits/chosen": -5.5003767013549805, "logits/rejected": -3.983274459838867, "logps/chosen": -94.35877990722656, "logps/ref_chosen": -91.76287841796875, "logps/ref_rejected": -110.38552856445312, "logps/rejected": -124.3576889038086, "loss": 0.962, "rewards/accuracies": 0.796875, "rewards/chosen": -0.15071691572666168, "rewards/margins": 0.6519370079040527, "rewards/rejected": -0.8026539087295532, "step": 148 }, { "epoch": 0.21879588839941264, "grad_norm": 17.85907745361328, "kl/avg_steps": 0.4375, "kl/beta": 0.057691749185323715, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.797231243092118e-07, "logits/chosen": -5.346034049987793, "logits/rejected": -4.409138202667236, "logps/chosen": -98.8444595336914, "logps/ref_chosen": -93.4840087890625, "logps/ref_rejected": -105.64369201660156, "logps/rejected": -121.99969482421875, "loss": 0.9992, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3102826178073883, "rewards/margins": 0.6273058652877808, "rewards/rejected": -0.9375884532928467, "step": 149 }, { "epoch": 0.22026431718061673, "grad_norm": 15.73752498626709, "kl/avg_steps": 0.5625, "kl/beta": 0.05744044482707977, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.792138157142157e-07, "logits/chosen": -5.399765968322754, "logits/rejected": -4.2668867111206055, "logps/chosen": -79.96965026855469, "logps/ref_chosen": -75.7786865234375, "logps/ref_rejected": -90.02676391601562, "logps/rejected": -105.93204498291016, "loss": 0.9614, "rewards/accuracies": 0.875, "rewards/chosen": -0.24165549874305725, "rewards/margins": 0.6654115915298462, "rewards/rejected": -0.907067060470581, "step": 150 }, { "epoch": 0.22173274596182085, "grad_norm": 19.85856056213379, "kl/avg_steps": 0.5, "kl/beta": 0.05711914971470833, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.786984671220053e-07, "logits/chosen": -5.871485710144043, "logits/rejected": -4.4503984451293945, "logps/chosen": -108.49866485595703, "logps/ref_chosen": -102.73422241210938, "logps/ref_rejected": -106.03781127929688, "logps/rejected": -123.03112030029297, "loss": 0.9827, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3297105133533478, "rewards/margins": 0.633903980255127, "rewards/rejected": -0.9636145234107971, "step": 151 }, { "epoch": 0.22320117474302498, "grad_norm": 16.81842803955078, "kl/avg_steps": 0.59375, "kl/beta": 0.056834977120161057, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.78177092112495e-07, "logits/chosen": -4.908111572265625, "logits/rejected": -3.836758613586426, "logps/chosen": -100.57263946533203, "logps/ref_chosen": -97.59074401855469, "logps/ref_rejected": -110.49913024902344, "logps/rejected": -126.90365600585938, "loss": 0.8941, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1701817512512207, "rewards/margins": 0.7548890113830566, "rewards/rejected": -0.9250707626342773, "step": 152 }, { "epoch": 0.22466960352422907, "grad_norm": 17.356094360351562, "kl/avg_steps": 0.4375, "kl/beta": 0.05649951100349426, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.776497044244016e-07, "logits/chosen": -5.573543548583984, "logits/rejected": -4.830375671386719, "logps/chosen": -92.43611907958984, "logps/ref_chosen": -87.02560424804688, "logps/ref_rejected": -91.06853485107422, "logps/rejected": -106.13377380371094, "loss": 1.0443, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3063035011291504, "rewards/margins": 0.5388628244400024, "rewards/rejected": -0.8451663255691528, "step": 153 }, { "epoch": 0.2261380323054332, "grad_norm": 15.954586029052734, "kl/avg_steps": 0.46875, "kl/beta": 0.056253399699926376, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.771163179548808e-07, "logits/chosen": -5.686001300811768, "logits/rejected": -4.457275390625, "logps/chosen": -94.85427856445312, "logps/ref_chosen": -91.10566711425781, "logps/ref_rejected": -111.37238311767578, "logps/rejected": -130.63352966308594, "loss": 0.8199, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21082431077957153, "rewards/margins": 0.8645293116569519, "rewards/rejected": -1.0753536224365234, "step": 154 }, { "epoch": 0.2276064610866373, "grad_norm": 23.60336685180664, "kl/avg_steps": 0.4375, "kl/beta": 0.05599094182252884, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -5.547731399536133, "logits/rejected": -5.115335464477539, "logps/chosen": -100.05500793457031, "logps/ref_chosen": -94.56097412109375, "logps/ref_rejected": -84.77359008789062, "logps/rejected": -102.24840545654297, "loss": 1.0092, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3092682957649231, "rewards/margins": 0.6626745462417603, "rewards/rejected": -0.9719428420066833, "step": 155 }, { "epoch": 0.2290748898678414, "grad_norm": 19.952051162719727, "kl/avg_steps": 0.359375, "kl/beta": 0.055747050791978836, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -5.7792768478393555, "logits/rejected": -4.890432357788086, "logps/chosen": -101.51638793945312, "logps/ref_chosen": -94.65538787841797, "logps/ref_rejected": -84.41934204101562, "logps/rejected": -101.98014831542969, "loss": 1.0161, "rewards/accuracies": 0.75, "rewards/chosen": -0.38445067405700684, "rewards/margins": 0.5892693400382996, "rewards/rejected": -0.9737200736999512, "step": 156 }, { "epoch": 0.2305433186490455, "grad_norm": 17.198551177978516, "kl/avg_steps": 0.6875, "kl/beta": 0.055547427386045456, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -6.013505458831787, "logits/rejected": -4.734508037567139, "logps/chosen": -103.44956970214844, "logps/ref_chosen": -97.01239013671875, "logps/ref_rejected": -111.70133972167969, "logps/rejected": -132.464111328125, "loss": 0.8564, "rewards/accuracies": 0.875, "rewards/chosen": -0.35640445351600647, "rewards/margins": 0.7874443531036377, "rewards/rejected": -1.1438487768173218, "step": 157 }, { "epoch": 0.23201174743024963, "grad_norm": 16.71400260925293, "kl/avg_steps": 0.59375, "kl/beta": 0.05516814440488815, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -5.91513204574585, "logits/rejected": -4.732603073120117, "logps/chosen": -100.4566650390625, "logps/ref_chosen": -94.71295928955078, "logps/ref_rejected": -120.93733978271484, "logps/rejected": -141.5926971435547, "loss": 0.8512, "rewards/accuracies": 0.90625, "rewards/chosen": -0.316554993391037, "rewards/margins": 0.814699113368988, "rewards/rejected": -1.1312541961669922, "step": 158 }, { "epoch": 0.23348017621145375, "grad_norm": 15.837252616882324, "kl/avg_steps": 0.484375, "kl/beta": 0.0548425167798996, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.743599013306165e-07, "logits/chosen": -6.067984104156494, "logits/rejected": -5.724274158477783, "logps/chosen": -113.19328308105469, "logps/ref_chosen": -108.2182388305664, "logps/ref_rejected": -94.83901977539062, "logps/rejected": -113.71253204345703, "loss": 0.9148, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2735292613506317, "rewards/margins": 0.7541267275810242, "rewards/rejected": -1.0276559591293335, "step": 159 }, { "epoch": 0.23494860499265785, "grad_norm": 15.891210556030273, "kl/avg_steps": 0.53125, "kl/beta": 0.054578155279159546, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.737908228387656e-07, "logits/chosen": -6.017996788024902, "logits/rejected": -4.622193336486816, "logps/chosen": -102.32695770263672, "logps/ref_chosen": -98.16340637207031, "logps/ref_rejected": -102.2809829711914, "logps/rejected": -119.59290313720703, "loss": 0.9092, "rewards/accuracies": 0.828125, "rewards/chosen": -0.22743016481399536, "rewards/margins": 0.7104052305221558, "rewards/rejected": -0.9378353953361511, "step": 160 }, { "epoch": 0.23641703377386197, "grad_norm": 17.31453514099121, "kl/avg_steps": 0.4375, "kl/beta": 0.054289739578962326, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -6.470645904541016, "logits/rejected": -5.285548210144043, "logps/chosen": -92.63033294677734, "logps/ref_chosen": -86.59187316894531, "logps/ref_rejected": -89.01643371582031, "logps/rejected": -107.30801391601562, "loss": 0.9714, "rewards/accuracies": 0.78125, "rewards/chosen": -0.32883331179618835, "rewards/margins": 0.6572731733322144, "rewards/rejected": -0.9861065149307251, "step": 161 }, { "epoch": 0.23788546255506607, "grad_norm": 15.398876190185547, "kl/avg_steps": 0.5625, "kl/beta": 0.054053258150815964, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -5.194693088531494, "logits/rejected": -4.573456764221191, "logps/chosen": -102.532470703125, "logps/ref_chosen": -97.40288543701172, "logps/ref_rejected": -95.86923217773438, "logps/rejected": -114.22633361816406, "loss": 0.9167, "rewards/accuracies": 0.859375, "rewards/chosen": -0.278072714805603, "rewards/margins": 0.7081333994865417, "rewards/rejected": -0.9862060546875, "step": 162 }, { "epoch": 0.2393538913362702, "grad_norm": 13.848814964294434, "kl/avg_steps": 0.5625, "kl/beta": 0.053750909864902496, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.720482655449212e-07, "logits/chosen": -6.277469635009766, "logits/rejected": -5.564417839050293, "logps/chosen": -87.13270568847656, "logps/ref_chosen": -82.1753921508789, "logps/ref_rejected": -82.86473846435547, "logps/rejected": -103.97894287109375, "loss": 0.8482, "rewards/accuracies": 0.859375, "rewards/chosen": -0.26623615622520447, "rewards/margins": 0.8599917888641357, "rewards/rejected": -1.1262279748916626, "step": 163 }, { "epoch": 0.24082232011747431, "grad_norm": 17.024089813232422, "kl/avg_steps": 0.3125, "kl/beta": 0.05345025286078453, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 4.714556901942599e-07, "logits/chosen": -5.856521129608154, "logits/rejected": -4.963005542755127, "logps/chosen": -98.60881805419922, "logps/ref_chosen": -90.4742660522461, "logps/ref_rejected": -87.02809143066406, "logps/rejected": -106.09263610839844, "loss": 1.0038, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4347628355026245, "rewards/margins": 0.5785181522369385, "rewards/rejected": -1.013280987739563, "step": 164 }, { "epoch": 0.2422907488986784, "grad_norm": 14.025198936462402, "kl/avg_steps": 0.625, "kl/beta": 0.05328373983502388, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.708572792802069e-07, "logits/chosen": -5.403741359710693, "logits/rejected": -5.085051536560059, "logps/chosen": -92.47659301757812, "logps/ref_chosen": -87.313232421875, "logps/ref_rejected": -81.902587890625, "logps/rejected": -102.87800598144531, "loss": 0.857, "rewards/accuracies": 0.859375, "rewards/chosen": -0.27433332800865173, "rewards/margins": 0.8343935012817383, "rewards/rejected": -1.1087267398834229, "step": 165 }, { "epoch": 0.24375917767988253, "grad_norm": 15.63122844696045, "kl/avg_steps": 0.59375, "kl/beta": 0.052952785044908524, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.702530485714461e-07, "logits/chosen": -6.5487165451049805, "logits/rejected": -5.4730329513549805, "logps/chosen": -90.94509887695312, "logps/ref_chosen": -81.5860595703125, "logps/ref_rejected": -103.16517639160156, "logps/rejected": -127.57855224609375, "loss": 0.9066, "rewards/accuracies": 0.828125, "rewards/chosen": -0.49418699741363525, "rewards/margins": 0.7889156341552734, "rewards/rejected": -1.2831027507781982, "step": 166 }, { "epoch": 0.24522760646108663, "grad_norm": 14.752199172973633, "kl/avg_steps": 0.59375, "kl/beta": 0.05264023318886757, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -6.093404769897461, "logits/rejected": -5.437514305114746, "logps/chosen": -91.57254028320312, "logps/ref_chosen": -83.56167602539062, "logps/ref_rejected": -99.05305480957031, "logps/rejected": -123.61196899414062, "loss": 0.8129, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4190889000892639, "rewards/margins": 0.8631209135055542, "rewards/rejected": -1.282209873199463, "step": 167 }, { "epoch": 0.24669603524229075, "grad_norm": 16.830984115600586, "kl/avg_steps": 0.5625, "kl/beta": 0.05232952535152435, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.690271916109034e-07, "logits/chosen": -6.080756664276123, "logits/rejected": -5.517510414123535, "logps/chosen": -87.75483703613281, "logps/ref_chosen": -79.93124389648438, "logps/ref_rejected": -80.10216522216797, "logps/rejected": -102.02044677734375, "loss": 0.9032, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4084423780441284, "rewards/margins": 0.7297256588935852, "rewards/rejected": -1.1381680965423584, "step": 168 }, { "epoch": 0.24816446402349487, "grad_norm": 15.615654945373535, "kl/avg_steps": 0.625, "kl/beta": 0.052036818116903305, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -5.397192001342773, "logits/rejected": -5.175437927246094, "logps/chosen": -89.30410766601562, "logps/ref_chosen": -82.90370178222656, "logps/ref_rejected": -90.12191009521484, "logps/rejected": -112.40554809570312, "loss": 0.8642, "rewards/accuracies": 0.828125, "rewards/chosen": -0.33170753717422485, "rewards/margins": 0.8178310394287109, "rewards/rejected": -1.149538516998291, "step": 169 }, { "epoch": 0.24963289280469897, "grad_norm": 16.472681045532227, "kl/avg_steps": 0.5625, "kl/beta": 0.05171360820531845, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -5.991590976715088, "logits/rejected": -5.608396053314209, "logps/chosen": -83.11984252929688, "logps/ref_chosen": -74.98197174072266, "logps/ref_rejected": -82.16560363769531, "logps/rejected": -104.49899291992188, "loss": 0.8871, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4205918610095978, "rewards/margins": 0.7263666987419128, "rewards/rejected": -1.146958589553833, "step": 170 }, { "epoch": 0.2511013215859031, "grad_norm": 19.757944107055664, "kl/avg_steps": 0.46875, "kl/beta": 0.05142434686422348, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -5.451948165893555, "logits/rejected": -4.568760871887207, "logps/chosen": -128.7462158203125, "logps/ref_chosen": -118.97853088378906, "logps/ref_rejected": -115.7693099975586, "logps/rejected": -141.45156860351562, "loss": 0.9253, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5009539127349854, "rewards/margins": 0.8095153570175171, "rewards/rejected": -1.3104692697525024, "step": 171 }, { "epoch": 0.2525697503671072, "grad_norm": 14.202917098999023, "kl/avg_steps": 0.46875, "kl/beta": 0.05118441954255104, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -6.403740882873535, "logits/rejected": -5.487905025482178, "logps/chosen": -111.73493957519531, "logps/ref_chosen": -101.15379333496094, "logps/ref_rejected": -91.00567626953125, "logps/rejected": -116.28710174560547, "loss": 0.8464, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5407711267471313, "rewards/margins": 0.7465265989303589, "rewards/rejected": -1.2872977256774902, "step": 172 }, { "epoch": 0.2540381791483113, "grad_norm": 14.85136604309082, "kl/avg_steps": 0.46875, "kl/beta": 0.05094561353325844, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -5.471677780151367, "logits/rejected": -5.009877681732178, "logps/chosen": -109.14393615722656, "logps/ref_chosen": -99.05668640136719, "logps/ref_rejected": -83.8507308959961, "logps/rejected": -106.2504653930664, "loss": 0.9611, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5136967897415161, "rewards/margins": 0.620842456817627, "rewards/rejected": -1.1345391273498535, "step": 173 }, { "epoch": 0.2555066079295154, "grad_norm": 16.939958572387695, "kl/avg_steps": 0.625, "kl/beta": 0.050707921385765076, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.652116329460919e-07, "logits/chosen": -6.058362007141113, "logits/rejected": -5.3862104415893555, "logps/chosen": -90.09979248046875, "logps/ref_chosen": -80.40890502929688, "logps/ref_rejected": -108.20171356201172, "logps/rejected": -134.08053588867188, "loss": 0.8609, "rewards/accuracies": 0.859375, "rewards/chosen": -0.49034202098846436, "rewards/margins": 0.8122279644012451, "rewards/rejected": -1.3025699853897095, "step": 174 }, { "epoch": 0.25697503671071953, "grad_norm": 16.927989959716797, "kl/avg_steps": 0.78125, "kl/beta": 0.050392962992191315, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.645557588393406e-07, "logits/chosen": -6.3116278648376465, "logits/rejected": -5.949856758117676, "logps/chosen": -95.58826446533203, "logps/ref_chosen": -82.71649169921875, "logps/ref_rejected": -96.66712951660156, "logps/rejected": -125.78546142578125, "loss": 0.8423, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6450769305229187, "rewards/margins": 0.8101252317428589, "rewards/rejected": -1.4552021026611328, "step": 175 }, { "epoch": 0.25844346549192365, "grad_norm": 15.526463508605957, "kl/avg_steps": 0.53125, "kl/beta": 0.050002321600914, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.638942309888058e-07, "logits/chosen": -6.382523059844971, "logits/rejected": -5.265742778778076, "logps/chosen": -90.09957885742188, "logps/ref_chosen": -79.62930297851562, "logps/ref_rejected": -102.6885986328125, "logps/rejected": -129.64431762695312, "loss": 0.9033, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5223788619041443, "rewards/margins": 0.8148936629295349, "rewards/rejected": -1.3372724056243896, "step": 176 }, { "epoch": 0.2599118942731278, "grad_norm": 14.051129341125488, "kl/avg_steps": 0.65625, "kl/beta": 0.0497380867600441, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -5.991172790527344, "logits/rejected": -5.463325500488281, "logps/chosen": -103.93224334716797, "logps/ref_chosen": -92.56109619140625, "logps/ref_rejected": -103.33838653564453, "logps/rejected": -133.99789428710938, "loss": 0.807, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5626887083053589, "rewards/margins": 0.9501281380653381, "rewards/rejected": -1.5128169059753418, "step": 177 }, { "epoch": 0.26138032305433184, "grad_norm": 16.428043365478516, "kl/avg_steps": 0.75, "kl/beta": 0.049413807690143585, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -6.9524335861206055, "logits/rejected": -6.058289527893066, "logps/chosen": -94.86000061035156, "logps/ref_chosen": -84.26579284667969, "logps/ref_rejected": -110.79730987548828, "logps/rejected": -143.80020141601562, "loss": 0.7237, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5214236974716187, "rewards/margins": 1.096163272857666, "rewards/rejected": -1.6175870895385742, "step": 178 }, { "epoch": 0.26284875183553597, "grad_norm": 18.81154441833496, "kl/avg_steps": 0.4375, "kl/beta": 0.04904596507549286, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -6.702276229858398, "logits/rejected": -5.42424201965332, "logps/chosen": -106.93254089355469, "logps/ref_chosen": -93.4479751586914, "logps/ref_rejected": -94.25540924072266, "logps/rejected": -122.918212890625, "loss": 0.9762, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6620521545410156, "rewards/margins": 0.735353946685791, "rewards/rejected": -1.3974061012268066, "step": 179 }, { "epoch": 0.2643171806167401, "grad_norm": 19.117578506469727, "kl/avg_steps": 0.6875, "kl/beta": 0.04883232340216637, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.611919330113591e-07, "logits/chosen": -5.92899751663208, "logits/rejected": -5.633153915405273, "logps/chosen": -100.06759643554688, "logps/ref_chosen": -88.83554077148438, "logps/ref_rejected": -103.90696716308594, "logps/rejected": -133.78744506835938, "loss": 0.8081, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5459901094436646, "rewards/margins": 0.901505172252655, "rewards/rejected": -1.4474952220916748, "step": 180 }, { "epoch": 0.2657856093979442, "grad_norm": 16.675270080566406, "kl/avg_steps": 0.453125, "kl/beta": 0.04849889129400253, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.605024008834863e-07, "logits/chosen": -6.083570957183838, "logits/rejected": -5.588318824768066, "logps/chosen": -91.0196762084961, "logps/ref_chosen": -79.23396301269531, "logps/ref_rejected": -69.17442321777344, "logps/rejected": -95.4429931640625, "loss": 0.9401, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5707411766052246, "rewards/margins": 0.6956329345703125, "rewards/rejected": -1.266374111175537, "step": 181 }, { "epoch": 0.26725403817914833, "grad_norm": 15.468544006347656, "kl/avg_steps": 0.5625, "kl/beta": 0.04828012362122536, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.598073218215817e-07, "logits/chosen": -6.867135047912598, "logits/rejected": -6.089150428771973, "logps/chosen": -90.71196746826172, "logps/ref_chosen": -76.08218383789062, "logps/ref_rejected": -94.9686050415039, "logps/rejected": -128.43321228027344, "loss": 0.8622, "rewards/accuracies": 0.875, "rewards/chosen": -0.7034212350845337, "rewards/margins": 0.8995457887649536, "rewards/rejected": -1.6029670238494873, "step": 182 }, { "epoch": 0.2687224669603524, "grad_norm": 16.670425415039062, "kl/avg_steps": 0.59375, "kl/beta": 0.04801006615161896, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -7.112674713134766, "logits/rejected": -6.326689720153809, "logps/chosen": -108.50236511230469, "logps/ref_chosen": -93.90790557861328, "logps/ref_rejected": -83.56061553955078, "logps/rejected": -117.24481201171875, "loss": 0.8151, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6983296871185303, "rewards/margins": 0.9077068567276001, "rewards/rejected": -1.6060365438461304, "step": 183 }, { "epoch": 0.2701908957415565, "grad_norm": 17.39129066467285, "kl/avg_steps": 0.375, "kl/beta": 0.04772669076919556, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -5.867307662963867, "logits/rejected": -5.439145565032959, "logps/chosen": -100.19184875488281, "logps/ref_chosen": -86.86790466308594, "logps/ref_rejected": -84.28456115722656, "logps/rejected": -110.23704528808594, "loss": 1.0505, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6346135139465332, "rewards/margins": 0.5953242778778076, "rewards/rejected": -1.2299377918243408, "step": 184 }, { "epoch": 0.27165932452276065, "grad_norm": 19.491111755371094, "kl/avg_steps": 0.4375, "kl/beta": 0.047548383474349976, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -6.329668045043945, "logits/rejected": -5.343301773071289, "logps/chosen": -108.25900268554688, "logps/ref_chosen": -94.02142333984375, "logps/ref_rejected": -81.45668029785156, "logps/rejected": -111.11482238769531, "loss": 0.9684, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6773566007614136, "rewards/margins": 0.7247765064239502, "rewards/rejected": -1.4021331071853638, "step": 185 }, { "epoch": 0.27312775330396477, "grad_norm": 18.766021728515625, "kl/avg_steps": 0.46875, "kl/beta": 0.04734126478433609, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -6.936692237854004, "logits/rejected": -6.2103986740112305, "logps/chosen": -131.09014892578125, "logps/ref_chosen": -114.19168090820312, "logps/ref_rejected": -93.33912658691406, "logps/rejected": -127.2985610961914, "loss": 0.8852, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7987364530563354, "rewards/margins": 0.7991558313369751, "rewards/rejected": -1.5978922843933105, "step": 186 }, { "epoch": 0.2745961820851689, "grad_norm": 16.06877899169922, "kl/avg_steps": 0.53125, "kl/beta": 0.047120388597249985, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -6.3317413330078125, "logits/rejected": -5.483089447021484, "logps/chosen": -91.86463928222656, "logps/ref_chosen": -76.86955261230469, "logps/ref_rejected": -88.16494750976562, "logps/rejected": -118.53182983398438, "loss": 0.968, "rewards/accuracies": 0.828125, "rewards/chosen": -0.705847978591919, "rewards/margins": 0.7159624099731445, "rewards/rejected": -1.4218103885650635, "step": 187 }, { "epoch": 0.27606461086637296, "grad_norm": 15.785407066345215, "kl/avg_steps": 0.625, "kl/beta": 0.046871382743120193, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -6.618991851806641, "logits/rejected": -5.915453910827637, "logps/chosen": -90.27525329589844, "logps/ref_chosen": -79.19479370117188, "logps/ref_rejected": -96.82890319824219, "logps/rejected": -125.84371948242188, "loss": 0.8615, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5177306532859802, "rewards/margins": 0.8314113616943359, "rewards/rejected": -1.349142074584961, "step": 188 }, { "epoch": 0.2775330396475771, "grad_norm": 14.651037216186523, "kl/avg_steps": 0.34375, "kl/beta": 0.04658025875687599, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -6.843825817108154, "logits/rejected": -6.071965217590332, "logps/chosen": -98.51889038085938, "logps/ref_chosen": -83.8927993774414, "logps/ref_rejected": -79.46589660644531, "logps/rejected": -112.61798095703125, "loss": 0.8845, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6822503805160522, "rewards/margins": 0.8548742532730103, "rewards/rejected": -1.5371246337890625, "step": 189 }, { "epoch": 0.2790014684287812, "grad_norm": 15.543427467346191, "kl/avg_steps": 0.65625, "kl/beta": 0.046420685946941376, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -6.244542121887207, "logits/rejected": -5.1248064041137695, "logps/chosen": -114.46714782714844, "logps/ref_chosen": -103.44146728515625, "logps/ref_rejected": -109.21503448486328, "logps/rejected": -141.81443786621094, "loss": 0.7663, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5097974538803101, "rewards/margins": 0.991597056388855, "rewards/rejected": -1.501394510269165, "step": 190 }, { "epoch": 0.28046989720998533, "grad_norm": 16.704288482666016, "kl/avg_steps": 0.59375, "kl/beta": 0.04611803591251373, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -5.9573469161987305, "logits/rejected": -5.739363193511963, "logps/chosen": -110.06816864013672, "logps/ref_chosen": -99.78551483154297, "logps/ref_rejected": -116.00028228759766, "logps/rejected": -149.94467163085938, "loss": 0.7528, "rewards/accuracies": 0.890625, "rewards/chosen": -0.47380760312080383, "rewards/margins": 1.0805836915969849, "rewards/rejected": -1.5543913841247559, "step": 191 }, { "epoch": 0.28193832599118945, "grad_norm": 19.49271583557129, "kl/avg_steps": 0.53125, "kl/beta": 0.04584582895040512, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -6.447579860687256, "logits/rejected": -5.825846195220947, "logps/chosen": -119.15298461914062, "logps/ref_chosen": -100.86032104492188, "logps/ref_rejected": -109.44500732421875, "logps/rejected": -144.4326934814453, "loss": 0.9386, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8361877202987671, "rewards/margins": 0.757125973701477, "rewards/rejected": -1.5933136940002441, "step": 192 }, { "epoch": 0.2834067547723935, "grad_norm": 19.705093383789062, "kl/avg_steps": 0.25, "kl/beta": 0.04560355842113495, "kl/n_epsilon_steps": 0.375, "kl/p_epsilon_steps": 0.625, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -6.4181036949157715, "logits/rejected": -5.755351543426514, "logps/chosen": -107.38533020019531, "logps/ref_chosen": -92.93831634521484, "logps/ref_rejected": -83.10917663574219, "logps/rejected": -111.87471771240234, "loss": 1.05, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6607075929641724, "rewards/margins": 0.6442556381225586, "rewards/rejected": -1.304963231086731, "step": 193 }, { "epoch": 0.28487518355359764, "grad_norm": 14.911462783813477, "kl/avg_steps": 0.5625, "kl/beta": 0.0454898327589035, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.510405240853854e-07, "logits/chosen": -6.82811164855957, "logits/rejected": -6.5324788093566895, "logps/chosen": -79.15022277832031, "logps/ref_chosen": -63.79157257080078, "logps/ref_rejected": -68.72665405273438, "logps/rejected": -100.59823608398438, "loss": 0.9448, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6952458620071411, "rewards/margins": 0.7425950765609741, "rewards/rejected": -1.4378409385681152, "step": 194 }, { "epoch": 0.28634361233480177, "grad_norm": 15.496999740600586, "kl/avg_steps": 0.625, "kl/beta": 0.04523538425564766, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -6.802285194396973, "logits/rejected": -5.815672874450684, "logps/chosen": -98.27217102050781, "logps/ref_chosen": -83.88545989990234, "logps/ref_rejected": -104.00125122070312, "logps/rejected": -139.27413940429688, "loss": 0.847, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6492324471473694, "rewards/margins": 0.9344033002853394, "rewards/rejected": -1.583635687828064, "step": 195 }, { "epoch": 0.2878120411160059, "grad_norm": 13.717185974121094, "kl/avg_steps": 0.5625, "kl/beta": 0.04495441913604736, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.495043068200599e-07, "logits/chosen": -6.458456993103027, "logits/rejected": -5.85124397277832, "logps/chosen": -105.92664337158203, "logps/ref_chosen": -89.4013671875, "logps/ref_rejected": -83.34735107421875, "logps/rejected": -120.28113555908203, "loss": 0.8099, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7399119138717651, "rewards/margins": 0.908238410949707, "rewards/rejected": -1.6481503248214722, "step": 196 }, { "epoch": 0.28928046989721, "grad_norm": 16.101377487182617, "kl/avg_steps": 0.5625, "kl/beta": 0.04470296576619148, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -6.566864013671875, "logits/rejected": -5.954580783843994, "logps/chosen": -103.89794921875, "logps/ref_chosen": -86.8406982421875, "logps/ref_rejected": -82.79289245605469, "logps/rejected": -112.60846710205078, "loss": 1.0239, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7603064179420471, "rewards/margins": 0.5637940168380737, "rewards/rejected": -1.3241004943847656, "step": 197 }, { "epoch": 0.2907488986784141, "grad_norm": 18.52680206298828, "kl/avg_steps": 0.71875, "kl/beta": 0.04445291683077812, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.479470611971645e-07, "logits/chosen": -6.684327125549316, "logits/rejected": -5.88149881362915, "logps/chosen": -104.97913360595703, "logps/ref_chosen": -85.04824829101562, "logps/ref_rejected": -104.10365295410156, "logps/rejected": -143.3640594482422, "loss": 0.8573, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8809682130813599, "rewards/margins": 0.8499820232391357, "rewards/rejected": -1.730950117111206, "step": 198 }, { "epoch": 0.2922173274596182, "grad_norm": 15.971412658691406, "kl/avg_steps": 0.4375, "kl/beta": 0.044135693460702896, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.471606039587695e-07, "logits/chosen": -6.563417434692383, "logits/rejected": -6.149653434753418, "logps/chosen": -101.11582946777344, "logps/ref_chosen": -81.78669738769531, "logps/ref_rejected": -90.790771484375, "logps/rejected": -126.85987091064453, "loss": 0.9335, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8515444993972778, "rewards/margins": 0.7307862043380737, "rewards/rejected": -1.5823307037353516, "step": 199 }, { "epoch": 0.2936857562408223, "grad_norm": 14.888577461242676, "kl/avg_steps": 0.59375, "kl/beta": 0.043943438678979874, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -6.618537425994873, "logits/rejected": -5.955951690673828, "logps/chosen": -102.80973052978516, "logps/ref_chosen": -84.26809692382812, "logps/ref_rejected": -86.96786499023438, "logps/rejected": -125.29328918457031, "loss": 0.9014, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8119258284568787, "rewards/margins": 0.8592541813850403, "rewards/rejected": -1.6711798906326294, "step": 200 }, { "epoch": 0.2936857562408223, "eval_kl/n_epsilon_steps": 0.3069349229335785, "eval_kl/p_epsilon_steps": 0.690496563911438, "eval_logits/chosen": -5.910821914672852, "eval_logits/rejected": -5.565565586090088, "eval_logps/chosen": -122.01994323730469, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -130.100341796875, "eval_loss": 0.5126060247421265, "eval_rewards/accuracies": 0.7619863152503967, "eval_rewards/chosen": -0.9392141103744507, "eval_rewards/margins": 0.6259253621101379, "eval_rewards/rejected": -1.5651392936706543, "eval_runtime": 48.5322, "eval_samples_per_second": 48.195, "eval_steps_per_second": 1.525, "step": 200 }, { "epoch": 0.29515418502202645, "grad_norm": 16.643579483032227, "kl/avg_steps": 0.5, "kl/beta": 0.04368406534194946, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.455721242469372e-07, "logits/chosen": -6.567927360534668, "logits/rejected": -5.76133394241333, "logps/chosen": -127.57327270507812, "logps/ref_chosen": -107.77249145507812, "logps/ref_rejected": -119.79248046875, "logps/rejected": -158.6796875, "loss": 0.9275, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8622275590896606, "rewards/margins": 0.8243715763092041, "rewards/rejected": -1.6865991353988647, "step": 201 }, { "epoch": 0.2966226138032305, "grad_norm": 15.5465669631958, "kl/avg_steps": 0.5, "kl/beta": 0.043466731905937195, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -6.6011576652526855, "logits/rejected": -5.9799299240112305, "logps/chosen": -95.47408294677734, "logps/ref_chosen": -75.97245025634766, "logps/ref_rejected": -94.4599838256836, "logps/rejected": -130.75558471679688, "loss": 0.947, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8463935256004333, "rewards/margins": 0.7217241525650024, "rewards/rejected": -1.568117618560791, "step": 202 }, { "epoch": 0.29809104258443464, "grad_norm": 14.089229583740234, "kl/avg_steps": 0.5625, "kl/beta": 0.04325047880411148, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.439630306414758e-07, "logits/chosen": -6.752559661865234, "logits/rejected": -5.879279136657715, "logps/chosen": -111.82502746582031, "logps/ref_chosen": -94.96715545654297, "logps/ref_rejected": -92.8876724243164, "logps/rejected": -129.54229736328125, "loss": 0.8472, "rewards/accuracies": 0.875, "rewards/chosen": -0.7273061275482178, "rewards/margins": 0.8475444316864014, "rewards/rejected": -1.5748505592346191, "step": 203 }, { "epoch": 0.29955947136563876, "grad_norm": 17.907960891723633, "kl/avg_steps": 0.40625, "kl/beta": 0.043008554726839066, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.431508065452897e-07, "logits/chosen": -6.687747955322266, "logits/rejected": -5.982107162475586, "logps/chosen": -137.6444091796875, "logps/ref_chosen": -116.35719299316406, "logps/ref_rejected": -93.39759063720703, "logps/rejected": -131.52774047851562, "loss": 0.9609, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9145333170890808, "rewards/margins": 0.7152736186981201, "rewards/rejected": -1.6298069953918457, "step": 204 }, { "epoch": 0.3010279001468429, "grad_norm": 12.563446998596191, "kl/avg_steps": 0.75, "kl/beta": 0.04283454269170761, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -6.747779846191406, "logits/rejected": -6.664907932281494, "logps/chosen": -108.34004211425781, "logps/ref_chosen": -88.85934448242188, "logps/ref_rejected": -91.8544921875, "logps/rejected": -133.15499877929688, "loss": 0.7808, "rewards/accuracies": 0.875, "rewards/chosen": -0.8293547630310059, "rewards/margins": 0.9246822595596313, "rewards/rejected": -1.7540370225906372, "step": 205 }, { "epoch": 0.302496328928047, "grad_norm": 17.306621551513672, "kl/avg_steps": 0.78125, "kl/beta": 0.042515672743320465, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.415111107797445e-07, "logits/chosen": -6.731910705566406, "logits/rejected": -5.665759086608887, "logps/chosen": -96.40692138671875, "logps/ref_chosen": -76.54634857177734, "logps/ref_rejected": -102.95314025878906, "logps/rejected": -142.6748809814453, "loss": 0.8253, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8388286828994751, "rewards/margins": 0.8355581164360046, "rewards/rejected": -1.6743868589401245, "step": 206 }, { "epoch": 0.3039647577092511, "grad_norm": 19.9998722076416, "kl/avg_steps": 0.5, "kl/beta": 0.04218609631061554, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -7.154547214508057, "logits/rejected": -6.694057464599609, "logps/chosen": -107.92840576171875, "logps/ref_chosen": -86.23164367675781, "logps/ref_rejected": -90.65512084960938, "logps/rejected": -133.81866455078125, "loss": 0.9093, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9145298600196838, "rewards/margins": 0.8951901197433472, "rewards/rejected": -1.8097199201583862, "step": 207 }, { "epoch": 0.3054331864904552, "grad_norm": 15.674067497253418, "kl/avg_steps": 0.65625, "kl/beta": 0.041976213455200195, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.398512291636768e-07, "logits/chosen": -7.054866313934326, "logits/rejected": -6.2353129386901855, "logps/chosen": -115.6429672241211, "logps/ref_chosen": -94.1595458984375, "logps/ref_rejected": -100.96233367919922, "logps/rejected": -145.07806396484375, "loss": 0.7998, "rewards/accuracies": 0.921875, "rewards/chosen": -0.8975973725318909, "rewards/margins": 0.9404553771018982, "rewards/rejected": -1.838052749633789, "step": 208 }, { "epoch": 0.3069016152716593, "grad_norm": 15.056414604187012, "kl/avg_steps": 0.5, "kl/beta": 0.04170254245400429, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -7.251114845275879, "logits/rejected": -6.478391647338867, "logps/chosen": -102.76752471923828, "logps/ref_chosen": -84.17056274414062, "logps/ref_rejected": -87.61955261230469, "logps/rejected": -125.9391860961914, "loss": 0.8985, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7734057903289795, "rewards/margins": 0.8138402104377747, "rewards/rejected": -1.5872459411621094, "step": 209 }, { "epoch": 0.30837004405286345, "grad_norm": 13.82249641418457, "kl/avg_steps": 0.625, "kl/beta": 0.04149506613612175, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.381713366536311e-07, "logits/chosen": -7.367747783660889, "logits/rejected": -6.7037177085876465, "logps/chosen": -101.83394622802734, "logps/ref_chosen": -81.17117309570312, "logps/ref_rejected": -84.17478942871094, "logps/rejected": -123.829833984375, "loss": 0.8996, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8534192442893982, "rewards/margins": 0.7803501486778259, "rewards/rejected": -1.6337693929672241, "step": 210 }, { "epoch": 0.30983847283406757, "grad_norm": 16.27776527404785, "kl/avg_steps": 0.46875, "kl/beta": 0.04123733192682266, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.373239415645323e-07, "logits/chosen": -6.783376216888428, "logits/rejected": -6.2379889488220215, "logps/chosen": -131.35140991210938, "logps/ref_chosen": -108.71271514892578, "logps/ref_rejected": -93.55564880371094, "logps/rejected": -135.39334106445312, "loss": 0.9141, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9316354990005493, "rewards/margins": 0.7831017971038818, "rewards/rejected": -1.7147372961044312, "step": 211 }, { "epoch": 0.31130690161527164, "grad_norm": 13.126346588134766, "kl/avg_steps": 0.53125, "kl/beta": 0.0410449355840683, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -6.987194061279297, "logits/rejected": -6.6716461181640625, "logps/chosen": -113.453125, "logps/ref_chosen": -98.36194610595703, "logps/ref_rejected": -109.88999938964844, "logps/rejected": -151.75265502929688, "loss": 0.7363, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6175893545150757, "rewards/margins": 1.0888441801071167, "rewards/rejected": -1.706433653831482, "step": 212 }, { "epoch": 0.31277533039647576, "grad_norm": 19.004343032836914, "kl/avg_steps": 0.5625, "kl/beta": 0.04082803428173065, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -6.994483947753906, "logits/rejected": -6.408557891845703, "logps/chosen": -126.75518798828125, "logps/ref_chosen": -108.05531311035156, "logps/ref_rejected": -100.14414978027344, "logps/rejected": -140.40818786621094, "loss": 0.8605, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7606232166290283, "rewards/margins": 0.8708195686340332, "rewards/rejected": -1.6314427852630615, "step": 213 }, { "epoch": 0.3142437591776799, "grad_norm": 15.15538501739502, "kl/avg_steps": 0.4375, "kl/beta": 0.04059966281056404, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -6.964527130126953, "logits/rejected": -6.679478645324707, "logps/chosen": -98.05276489257812, "logps/ref_chosen": -77.80473327636719, "logps/ref_rejected": -84.27578735351562, "logps/rejected": -123.21199798583984, "loss": 0.9841, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8206988573074341, "rewards/margins": 0.749920129776001, "rewards/rejected": -1.570618987083435, "step": 214 }, { "epoch": 0.315712187958884, "grad_norm": 13.474177360534668, "kl/avg_steps": 0.6875, "kl/beta": 0.04042281210422516, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -7.422246932983398, "logits/rejected": -6.752878189086914, "logps/chosen": -101.85633087158203, "logps/ref_chosen": -85.1138916015625, "logps/ref_rejected": -96.86151885986328, "logps/rejected": -138.35052490234375, "loss": 0.7734, "rewards/accuracies": 0.875, "rewards/chosen": -0.6723982095718384, "rewards/margins": 0.9908286333084106, "rewards/rejected": -1.663226842880249, "step": 215 }, { "epoch": 0.31718061674008813, "grad_norm": 13.187141418457031, "kl/avg_steps": 0.53125, "kl/beta": 0.04014680162072182, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.330133748510036e-07, "logits/chosen": -6.856760501861572, "logits/rejected": -6.676458835601807, "logps/chosen": -99.6475830078125, "logps/ref_chosen": -80.5923080444336, "logps/ref_rejected": -81.41983795166016, "logps/rejected": -121.41978454589844, "loss": 0.8502, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7613588571548462, "rewards/margins": 0.832861065864563, "rewards/rejected": -1.5942199230194092, "step": 216 }, { "epoch": 0.3186490455212922, "grad_norm": 17.514558792114258, "kl/avg_steps": 0.46875, "kl/beta": 0.03993465006351471, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -7.025016784667969, "logits/rejected": -6.148534774780273, "logps/chosen": -115.89637756347656, "logps/ref_chosen": -93.47257995605469, "logps/ref_rejected": -103.488525390625, "logps/rejected": -144.48422241210938, "loss": 0.9483, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8944262266159058, "rewards/margins": 0.7329256534576416, "rewards/rejected": -1.6273518800735474, "step": 217 }, { "epoch": 0.3201174743024963, "grad_norm": 15.729193687438965, "kl/avg_steps": 0.453125, "kl/beta": 0.03974832966923714, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -7.055582046508789, "logits/rejected": -6.02012300491333, "logps/chosen": -110.00785827636719, "logps/ref_chosen": -89.05883026123047, "logps/ref_rejected": -94.30680847167969, "logps/rejected": -136.88497924804688, "loss": 0.8984, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8316381573677063, "rewards/margins": 0.8504889607429504, "rewards/rejected": -1.6821269989013672, "step": 218 }, { "epoch": 0.32158590308370044, "grad_norm": 19.58609390258789, "kl/avg_steps": 0.46875, "kl/beta": 0.039569031447172165, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.303689819449636e-07, "logits/chosen": -7.102532386779785, "logits/rejected": -6.606064796447754, "logps/chosen": -120.11994171142578, "logps/ref_chosen": -101.00733947753906, "logps/ref_rejected": -92.46794128417969, "logps/rejected": -131.8204803466797, "loss": 0.9475, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7558479309082031, "rewards/margins": 0.7920831441879272, "rewards/rejected": -1.5479310750961304, "step": 219 }, { "epoch": 0.32305433186490456, "grad_norm": 16.06678581237793, "kl/avg_steps": 0.46875, "kl/beta": 0.03938441723585129, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -6.876911163330078, "logits/rejected": -6.016563415527344, "logps/chosen": -115.22142028808594, "logps/ref_chosen": -95.53721618652344, "logps/ref_rejected": -94.30703735351562, "logps/rejected": -132.008544921875, "loss": 0.9298, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7732564210891724, "rewards/margins": 0.702507734298706, "rewards/rejected": -1.4757641553878784, "step": 220 }, { "epoch": 0.3245227606461087, "grad_norm": 14.65071964263916, "kl/avg_steps": 0.6875, "kl/beta": 0.039200663566589355, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.285822501755485e-07, "logits/chosen": -6.791396141052246, "logits/rejected": -5.949629783630371, "logps/chosen": -101.76954650878906, "logps/ref_chosen": -82.84486389160156, "logps/ref_rejected": -110.81179809570312, "logps/rejected": -156.64260864257812, "loss": 0.7544, "rewards/accuracies": 0.875, "rewards/chosen": -0.7377252578735352, "rewards/margins": 1.0444551706314087, "rewards/rejected": -1.7821805477142334, "step": 221 }, { "epoch": 0.32599118942731276, "grad_norm": 20.610990524291992, "kl/avg_steps": 0.53125, "kl/beta": 0.03893300145864487, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.276818137766118e-07, "logits/chosen": -7.152358055114746, "logits/rejected": -6.381607532501221, "logps/chosen": -118.88861083984375, "logps/ref_chosen": -95.14198303222656, "logps/ref_rejected": -106.80441284179688, "logps/rejected": -152.92138671875, "loss": 0.9338, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9227837920188904, "rewards/margins": 0.8608850240707397, "rewards/rejected": -1.7836687564849854, "step": 222 }, { "epoch": 0.3274596182085169, "grad_norm": 15.403647422790527, "kl/avg_steps": 0.5, "kl/beta": 0.03872726112604141, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -7.370273590087891, "logits/rejected": -6.7063889503479, "logps/chosen": -108.16079711914062, "logps/ref_chosen": -85.57511138916016, "logps/ref_rejected": -86.45238494873047, "logps/rejected": -126.88802337646484, "loss": 0.98, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8735491037368774, "rewards/margins": 0.6830679178237915, "rewards/rejected": -1.556617021560669, "step": 223 }, { "epoch": 0.328928046989721, "grad_norm": 15.034847259521484, "kl/avg_steps": 0.5, "kl/beta": 0.03853458911180496, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -7.058864593505859, "logits/rejected": -6.6526618003845215, "logps/chosen": -104.06519317626953, "logps/ref_chosen": -82.72380065917969, "logps/ref_rejected": -82.59538269042969, "logps/rejected": -126.1408462524414, "loss": 0.867, "rewards/accuracies": 0.828125, "rewards/chosen": -0.820472002029419, "rewards/margins": 0.8463258147239685, "rewards/rejected": -1.6667978763580322, "step": 224 }, { "epoch": 0.3303964757709251, "grad_norm": 15.859506607055664, "kl/avg_steps": 0.53125, "kl/beta": 0.03834287449717522, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.249525076191759e-07, "logits/chosen": -7.314513206481934, "logits/rejected": -6.8365983963012695, "logps/chosen": -119.15274810791016, "logps/ref_chosen": -95.67768096923828, "logps/ref_rejected": -105.09687805175781, "logps/rejected": -153.17977905273438, "loss": 0.8512, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8980911374092102, "rewards/margins": 0.9331176280975342, "rewards/rejected": -1.83120858669281, "step": 225 }, { "epoch": 0.33186490455212925, "grad_norm": 13.422561645507812, "kl/avg_steps": 0.625, "kl/beta": 0.03814025595784187, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -7.025399684906006, "logits/rejected": -6.684216499328613, "logps/chosen": -112.96669006347656, "logps/ref_chosen": -93.46092987060547, "logps/ref_rejected": -86.7017593383789, "logps/rejected": -128.74534606933594, "loss": 0.8457, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7408524751663208, "rewards/margins": 0.8516495227813721, "rewards/rejected": -1.5925018787384033, "step": 226 }, { "epoch": 0.3333333333333333, "grad_norm": 15.427087783813477, "kl/avg_steps": 0.5625, "kl/beta": 0.03790335729718208, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -7.346820831298828, "logits/rejected": -6.7165141105651855, "logps/chosen": -119.78524017333984, "logps/ref_chosen": -92.81427001953125, "logps/ref_rejected": -104.73692321777344, "logps/rejected": -154.31568908691406, "loss": 0.906, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0186035633087158, "rewards/margins": 0.8465676307678223, "rewards/rejected": -1.8651710748672485, "step": 227 }, { "epoch": 0.33480176211453744, "grad_norm": 15.720854759216309, "kl/avg_steps": 0.40625, "kl/beta": 0.03769134357571602, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -7.082067489624023, "logits/rejected": -6.396081924438477, "logps/chosen": -118.57168579101562, "logps/ref_chosen": -94.03712463378906, "logps/ref_rejected": -96.02151489257812, "logps/rejected": -138.6416015625, "loss": 0.962, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9233343601226807, "rewards/margins": 0.6742476224899292, "rewards/rejected": -1.5975819826126099, "step": 228 }, { "epoch": 0.33627019089574156, "grad_norm": 13.65562915802002, "kl/avg_steps": 0.5625, "kl/beta": 0.03753884509205818, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.212490049118951e-07, "logits/chosen": -7.236542224884033, "logits/rejected": -6.673727989196777, "logps/chosen": -116.85358428955078, "logps/ref_chosen": -95.57766723632812, "logps/ref_rejected": -89.17379760742188, "logps/rejected": -139.57473754882812, "loss": 0.7638, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7962503433227539, "rewards/margins": 1.0827782154083252, "rewards/rejected": -1.879028558731079, "step": 229 }, { "epoch": 0.3377386196769457, "grad_norm": 15.202421188354492, "kl/avg_steps": 0.78125, "kl/beta": 0.037328869104385376, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.203117865141635e-07, "logits/chosen": -7.399197578430176, "logits/rejected": -6.820221900939941, "logps/chosen": -86.61280822753906, "logps/ref_chosen": -63.713626861572266, "logps/ref_rejected": -91.9087142944336, "logps/rejected": -143.0418243408203, "loss": 0.7439, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8491979241371155, "rewards/margins": 1.0426983833312988, "rewards/rejected": -1.8918962478637695, "step": 230 }, { "epoch": 0.3392070484581498, "grad_norm": 13.081562995910645, "kl/avg_steps": 0.671875, "kl/beta": 0.03703949600458145, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -6.668990135192871, "logits/rejected": -6.165530681610107, "logps/chosen": -121.447998046875, "logps/ref_chosen": -95.45567321777344, "logps/ref_rejected": -80.95568084716797, "logps/rejected": -130.36236572265625, "loss": 0.8274, "rewards/accuracies": 0.859375, "rewards/chosen": -0.957029402256012, "rewards/margins": 0.8586543798446655, "rewards/rejected": -1.8156838417053223, "step": 231 }, { "epoch": 0.3406754772393539, "grad_norm": 15.00186538696289, "kl/avg_steps": 0.5, "kl/beta": 0.03679230064153671, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -6.989046096801758, "logits/rejected": -6.184296607971191, "logps/chosen": -122.46676635742188, "logps/ref_chosen": -96.89726257324219, "logps/ref_rejected": -89.76461791992188, "logps/rejected": -136.52182006835938, "loss": 0.9199, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9383091926574707, "rewards/margins": 0.7705283164978027, "rewards/rejected": -1.7088375091552734, "step": 232 }, { "epoch": 0.342143906020558, "grad_norm": 14.02349853515625, "kl/avg_steps": 0.625, "kl/beta": 0.03660925105214119, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.174733034541245e-07, "logits/chosen": -7.088686943054199, "logits/rejected": -6.224997520446777, "logps/chosen": -116.73382568359375, "logps/ref_chosen": -89.05032348632812, "logps/ref_rejected": -112.75917053222656, "logps/rejected": -169.069580078125, "loss": 0.7772, "rewards/accuracies": 0.828125, "rewards/chosen": -1.0092403888702393, "rewards/margins": 1.036684513092041, "rewards/rejected": -2.045924663543701, "step": 233 }, { "epoch": 0.3436123348017621, "grad_norm": 13.894185066223145, "kl/avg_steps": 0.625, "kl/beta": 0.03638186678290367, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.165182829193126e-07, "logits/chosen": -7.294958114624023, "logits/rejected": -6.297677040100098, "logps/chosen": -100.55630493164062, "logps/ref_chosen": -74.318359375, "logps/ref_rejected": -106.38758850097656, "logps/rejected": -162.41140747070312, "loss": 0.7907, "rewards/accuracies": 0.875, "rewards/chosen": -0.9510073661804199, "rewards/margins": 1.071781873703003, "rewards/rejected": -2.022789239883423, "step": 234 }, { "epoch": 0.34508076358296624, "grad_norm": 13.863997459411621, "kl/avg_steps": 0.53125, "kl/beta": 0.03615589067339897, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -6.75037956237793, "logits/rejected": -6.180291175842285, "logps/chosen": -129.61538696289062, "logps/ref_chosen": -98.217041015625, "logps/ref_rejected": -97.24677276611328, "logps/rejected": -151.29678344726562, "loss": 0.8641, "rewards/accuracies": 0.875, "rewards/chosen": -1.1314321756362915, "rewards/margins": 0.810757577419281, "rewards/rejected": -1.9421896934509277, "step": 235 }, { "epoch": 0.3465491923641703, "grad_norm": 14.032482147216797, "kl/avg_steps": 0.71875, "kl/beta": 0.03596482798457146, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -6.762874126434326, "logits/rejected": -6.526078224182129, "logps/chosen": -105.3929672241211, "logps/ref_chosen": -78.83773040771484, "logps/ref_rejected": -109.06343078613281, "logps/rejected": -163.83856201171875, "loss": 0.8264, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9487680196762085, "rewards/margins": 1.0044116973876953, "rewards/rejected": -1.9531795978546143, "step": 236 }, { "epoch": 0.34801762114537443, "grad_norm": 15.7308931350708, "kl/avg_steps": 0.46875, "kl/beta": 0.03570817783474922, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 4.136269950853473e-07, "logits/chosen": -7.501216888427734, "logits/rejected": -6.719086647033691, "logps/chosen": -115.78558349609375, "logps/ref_chosen": -85.21128845214844, "logps/ref_rejected": -99.90999603271484, "logps/rejected": -153.35638427734375, "loss": 0.9932, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0902290344238281, "rewards/margins": 0.8065685033798218, "rewards/rejected": -1.89679753780365, "step": 237 }, { "epoch": 0.34948604992657856, "grad_norm": 13.518217086791992, "kl/avg_steps": 0.625, "kl/beta": 0.03554157540202141, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.126545750510605e-07, "logits/chosen": -6.873219013214111, "logits/rejected": -5.906381607055664, "logps/chosen": -112.39120483398438, "logps/ref_chosen": -78.73123168945312, "logps/ref_rejected": -95.41840362548828, "logps/rejected": -153.98170471191406, "loss": 0.8772, "rewards/accuracies": 0.796875, "rewards/chosen": -1.190969705581665, "rewards/margins": 0.8749915361404419, "rewards/rejected": -2.0659611225128174, "step": 238 }, { "epoch": 0.3509544787077827, "grad_norm": 14.817635536193848, "kl/avg_steps": 0.5, "kl/beta": 0.035320818424224854, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.116778689174514e-07, "logits/chosen": -7.087869644165039, "logits/rejected": -6.191803932189941, "logps/chosen": -124.30506134033203, "logps/ref_chosen": -92.60093688964844, "logps/ref_rejected": -100.51769256591797, "logps/rejected": -155.46278381347656, "loss": 0.9094, "rewards/accuracies": 0.75, "rewards/chosen": -1.117043375968933, "rewards/margins": 0.8112146854400635, "rewards/rejected": -1.9282580614089966, "step": 239 }, { "epoch": 0.3524229074889868, "grad_norm": 17.138147354125977, "kl/avg_steps": 0.375, "kl/beta": 0.03514509275555611, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 4.106969024216348e-07, "logits/chosen": -7.254019260406494, "logits/rejected": -7.064877986907959, "logps/chosen": -118.43206787109375, "logps/ref_chosen": -86.15977478027344, "logps/ref_rejected": -80.45567321777344, "logps/rejected": -133.68617248535156, "loss": 0.9385, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1331913471221924, "rewards/margins": 0.7280229926109314, "rewards/rejected": -1.8612143993377686, "step": 240 }, { "epoch": 0.35389133627019087, "grad_norm": 13.10658073425293, "kl/avg_steps": 0.6875, "kl/beta": 0.03501379117369652, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.097117014129903e-07, "logits/chosen": -7.0530242919921875, "logits/rejected": -6.54688835144043, "logps/chosen": -128.86328125, "logps/ref_chosen": -101.04594421386719, "logps/ref_rejected": -94.04934692382812, "logps/rejected": -158.0999298095703, "loss": 0.6851, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9694840312004089, "rewards/margins": 1.2560572624206543, "rewards/rejected": -2.225541353225708, "step": 241 }, { "epoch": 0.355359765051395, "grad_norm": 14.712688446044922, "kl/avg_steps": 0.6875, "kl/beta": 0.034774716943502426, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.087222918524807e-07, "logits/chosen": -7.671010494232178, "logits/rejected": -7.0201215744018555, "logps/chosen": -129.89794921875, "logps/ref_chosen": -95.67266082763672, "logps/ref_rejected": -90.65454864501953, "logps/rejected": -150.88523864746094, "loss": 0.8351, "rewards/accuracies": 0.84375, "rewards/chosen": -1.184476613998413, "rewards/margins": 0.8943547606468201, "rewards/rejected": -2.078831434249878, "step": 242 }, { "epoch": 0.3568281938325991, "grad_norm": 14.254800796508789, "kl/avg_steps": 0.5625, "kl/beta": 0.03453727439045906, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.07728699811968e-07, "logits/chosen": -7.436939239501953, "logits/rejected": -7.066817283630371, "logps/chosen": -130.82611083984375, "logps/ref_chosen": -98.03140258789062, "logps/ref_rejected": -83.18806457519531, "logps/rejected": -141.62124633789062, "loss": 0.8947, "rewards/accuracies": 0.765625, "rewards/chosen": -1.1293984651565552, "rewards/margins": 0.8747692704200745, "rewards/rejected": -2.0041677951812744, "step": 243 }, { "epoch": 0.35829662261380324, "grad_norm": 14.085189819335938, "kl/avg_steps": 0.65625, "kl/beta": 0.03434408828616142, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.067309514735267e-07, "logits/chosen": -7.297283172607422, "logits/rejected": -6.48216438293457, "logps/chosen": -123.65585327148438, "logps/ref_chosen": -88.89391326904297, "logps/ref_rejected": -102.57278442382812, "logps/rejected": -163.68592834472656, "loss": 0.8616, "rewards/accuracies": 0.84375, "rewards/chosen": -1.187152624130249, "rewards/margins": 0.8951038122177124, "rewards/rejected": -2.082256317138672, "step": 244 }, { "epoch": 0.35976505139500736, "grad_norm": 13.635170936584473, "kl/avg_steps": 0.78125, "kl/beta": 0.03412017226219177, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.057290731287531e-07, "logits/chosen": -7.1143999099731445, "logits/rejected": -6.517737865447998, "logps/chosen": -133.73660278320312, "logps/ref_chosen": -104.19400024414062, "logps/ref_rejected": -92.65645599365234, "logps/rejected": -151.42526245117188, "loss": 0.777, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0023081302642822, "rewards/margins": 0.9868428707122803, "rewards/rejected": -1.9891510009765625, "step": 245 }, { "epoch": 0.36123348017621143, "grad_norm": 15.270983695983887, "kl/avg_steps": 0.53125, "kl/beta": 0.03385567665100098, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.047230911780736e-07, "logits/chosen": -6.8056464195251465, "logits/rejected": -6.480011940002441, "logps/chosen": -141.06942749023438, "logps/ref_chosen": -103.21904754638672, "logps/ref_rejected": -90.9922103881836, "logps/rejected": -151.3291015625, "loss": 0.9225, "rewards/accuracies": 0.8125, "rewards/chosen": -1.277728796005249, "rewards/margins": 0.7526397705078125, "rewards/rejected": -2.0303685665130615, "step": 246 }, { "epoch": 0.36270190895741555, "grad_norm": 12.317371368408203, "kl/avg_steps": 0.78125, "kl/beta": 0.03367676958441734, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -7.767994403839111, "logits/rejected": -7.044826507568359, "logps/chosen": -121.5998306274414, "logps/ref_chosen": -86.99436950683594, "logps/ref_rejected": -111.33802795410156, "logps/rejected": -179.18167114257812, "loss": 0.7459, "rewards/accuracies": 0.875, "rewards/chosen": -1.1581556797027588, "rewards/margins": 1.1077954769134521, "rewards/rejected": -2.265951156616211, "step": 247 }, { "epoch": 0.3641703377386197, "grad_norm": 15.562211990356445, "kl/avg_steps": 0.5625, "kl/beta": 0.03341570869088173, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -7.3634748458862305, "logits/rejected": -6.64450740814209, "logps/chosen": -113.81005859375, "logps/ref_chosen": -74.7855224609375, "logps/ref_rejected": -98.27689361572266, "logps/rejected": -160.82879638671875, "loss": 0.9066, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2984592914581299, "rewards/margins": 0.7774021625518799, "rewards/rejected": -2.0758614540100098, "step": 248 }, { "epoch": 0.3656387665198238, "grad_norm": 17.970985412597656, "kl/avg_steps": 0.4375, "kl/beta": 0.0332287959754467, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -7.139558792114258, "logits/rejected": -6.51041316986084, "logps/chosen": -137.87208557128906, "logps/ref_chosen": -95.70379638671875, "logps/ref_rejected": -87.14646911621094, "logps/rejected": -151.0142059326172, "loss": 0.9861, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3996614217758179, "rewards/margins": 0.7120518684387207, "rewards/rejected": -2.111713409423828, "step": 249 }, { "epoch": 0.3671071953010279, "grad_norm": 18.755630493164062, "kl/avg_steps": 0.5625, "kl/beta": 0.03308405354619026, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.006586590948141e-07, "logits/chosen": -7.073131561279297, "logits/rejected": -6.734502792358398, "logps/chosen": -153.10398864746094, "logps/ref_chosen": -114.05220794677734, "logps/ref_rejected": -81.08768463134766, "logps/rejected": -142.95217895507812, "loss": 0.9481, "rewards/accuracies": 0.765625, "rewards/chosen": -1.287900686264038, "rewards/margins": 0.7461932897567749, "rewards/rejected": -2.0340938568115234, "step": 250 }, { "epoch": 0.368575624082232, "grad_norm": 15.883668899536133, "kl/avg_steps": 0.28125, "kl/beta": 0.03289899602532387, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -7.458626747131348, "logits/rejected": -6.964792251586914, "logps/chosen": -135.24278259277344, "logps/ref_chosen": -97.71128845214844, "logps/ref_rejected": -83.52742004394531, "logps/rejected": -142.99111938476562, "loss": 0.9872, "rewards/accuracies": 0.75, "rewards/chosen": -1.2341444492340088, "rewards/margins": 0.7130569219589233, "rewards/rejected": -1.9472013711929321, "step": 251 }, { "epoch": 0.3700440528634361, "grad_norm": 16.21406364440918, "kl/avg_steps": 0.5, "kl/beta": 0.03280672803521156, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -7.022328853607178, "logits/rejected": -6.755680084228516, "logps/chosen": -145.88766479492188, "logps/ref_chosen": -108.9861068725586, "logps/ref_rejected": -91.56424713134766, "logps/rejected": -151.3180389404297, "loss": 0.93, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2076178789138794, "rewards/margins": 0.7408928871154785, "rewards/rejected": -1.948510766029358, "step": 252 }, { "epoch": 0.37151248164464024, "grad_norm": 15.019343376159668, "kl/avg_steps": 0.625, "kl/beta": 0.03264351189136505, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -6.749828815460205, "logits/rejected": -6.487143516540527, "logps/chosen": -140.8556365966797, "logps/ref_chosen": -100.21630859375, "logps/ref_rejected": -105.67670440673828, "logps/rejected": -168.1231689453125, "loss": 0.9468, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3202366828918457, "rewards/margins": 0.7031639218330383, "rewards/rejected": -2.0234005451202393, "step": 253 }, { "epoch": 0.37298091042584436, "grad_norm": 14.38194751739502, "kl/avg_steps": 0.359375, "kl/beta": 0.032440755516290665, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.965307091713037e-07, "logits/chosen": -7.187813758850098, "logits/rejected": -6.099554061889648, "logps/chosen": -138.41323852539062, "logps/ref_chosen": -98.73518371582031, "logps/ref_rejected": -93.73825073242188, "logps/rejected": -154.52410888671875, "loss": 0.971, "rewards/accuracies": 0.75, "rewards/chosen": -1.2845338582992554, "rewards/margins": 0.676956057548523, "rewards/rejected": -1.9614899158477783, "step": 254 }, { "epoch": 0.3744493392070485, "grad_norm": 14.393223762512207, "kl/avg_steps": 0.71875, "kl/beta": 0.03232458978891373, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.954890003969163e-07, "logits/chosen": -7.347094535827637, "logits/rejected": -6.871660232543945, "logps/chosen": -131.79598999023438, "logps/ref_chosen": -90.382568359375, "logps/ref_rejected": -97.07625579833984, "logps/rejected": -166.75144958496094, "loss": 0.8306, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3301056623458862, "rewards/margins": 0.9035694599151611, "rewards/rejected": -2.233675003051758, "step": 255 }, { "epoch": 0.37591776798825255, "grad_norm": 14.8760986328125, "kl/avg_steps": 0.65625, "kl/beta": 0.03209391236305237, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.944434578520628e-07, "logits/chosen": -7.483323097229004, "logits/rejected": -6.505797386169434, "logps/chosen": -126.28778076171875, "logps/ref_chosen": -88.7528076171875, "logps/ref_rejected": -98.49382781982422, "logps/rejected": -164.5001220703125, "loss": 0.8817, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1998153924942017, "rewards/margins": 0.9028624296188354, "rewards/rejected": -2.102677822113037, "step": 256 }, { "epoch": 0.37738619676945667, "grad_norm": 15.880992889404297, "kl/avg_steps": 0.5625, "kl/beta": 0.03188467025756836, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.933941090877615e-07, "logits/chosen": -7.26761531829834, "logits/rejected": -6.992372512817383, "logps/chosen": -122.74734497070312, "logps/ref_chosen": -82.80352783203125, "logps/ref_rejected": -85.8677978515625, "logps/rejected": -153.03878784179688, "loss": 0.8809, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2686723470687866, "rewards/margins": 0.8583362698554993, "rewards/rejected": -2.1270086765289307, "step": 257 }, { "epoch": 0.3788546255506608, "grad_norm": 14.805852890014648, "kl/avg_steps": 0.5625, "kl/beta": 0.031706321984529495, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.923409817553284e-07, "logits/chosen": -7.284233093261719, "logits/rejected": -6.5939788818359375, "logps/chosen": -129.7963409423828, "logps/ref_chosen": -90.187744140625, "logps/ref_rejected": -103.47068786621094, "logps/rejected": -169.39410400390625, "loss": 0.8631, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2512186765670776, "rewards/margins": 0.8250081539154053, "rewards/rejected": -2.0762269496917725, "step": 258 }, { "epoch": 0.3803230543318649, "grad_norm": 15.601424217224121, "kl/avg_steps": 0.5625, "kl/beta": 0.03152897208929062, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -6.8948163986206055, "logits/rejected": -6.679216384887695, "logps/chosen": -133.95770263671875, "logps/ref_chosen": -90.77254486083984, "logps/ref_rejected": -94.58816528320312, "logps/rejected": -159.41954040527344, "loss": 0.9968, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3569505214691162, "rewards/margins": 0.6738921403884888, "rewards/rejected": -2.0308425426483154, "step": 259 }, { "epoch": 0.38179148311306904, "grad_norm": 16.810916900634766, "kl/avg_steps": 0.5, "kl/beta": 0.03135261312127113, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -7.622888088226318, "logits/rejected": -6.941500186920166, "logps/chosen": -118.55705261230469, "logps/ref_chosen": -75.59269714355469, "logps/ref_rejected": -100.84554290771484, "logps/rejected": -164.71981811523438, "loss": 1.0562, "rewards/accuracies": 0.75, "rewards/chosen": -1.3429489135742188, "rewards/margins": 0.6456452012062073, "rewards/rejected": -1.9885942935943604, "step": 260 }, { "epoch": 0.3832599118942731, "grad_norm": 13.099295616149902, "kl/avg_steps": 0.65625, "kl/beta": 0.031196629628539085, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.891592063515376e-07, "logits/chosen": -7.312009811401367, "logits/rejected": -7.067169666290283, "logps/chosen": -129.3551025390625, "logps/ref_chosen": -94.25491333007812, "logps/ref_rejected": -93.65699768066406, "logps/rejected": -158.41415405273438, "loss": 0.8049, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0897305011749268, "rewards/margins": 0.9158782958984375, "rewards/rejected": -2.0056087970733643, "step": 261 }, { "epoch": 0.38472834067547723, "grad_norm": 13.834671020507812, "kl/avg_steps": 0.6875, "kl/beta": 0.03099323809146881, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.880912432401264e-07, "logits/chosen": -7.694709777832031, "logits/rejected": -6.795375823974609, "logps/chosen": -123.33172607421875, "logps/ref_chosen": -85.26730346679688, "logps/ref_rejected": -90.82609558105469, "logps/rejected": -156.57522583007812, "loss": 0.8587, "rewards/accuracies": 0.859375, "rewards/chosen": -1.1732381582260132, "rewards/margins": 0.8486050367355347, "rewards/rejected": -2.021843194961548, "step": 262 }, { "epoch": 0.38619676945668135, "grad_norm": 13.023946762084961, "kl/avg_steps": 0.75, "kl/beta": 0.030781613662838936, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.870196412960302e-07, "logits/chosen": -6.901250839233398, "logits/rejected": -6.722340106964111, "logps/chosen": -127.14653015136719, "logps/ref_chosen": -95.75790405273438, "logps/ref_rejected": -101.83377075195312, "logps/rejected": -169.84298706054688, "loss": 0.7435, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9609699845314026, "rewards/margins": 1.114938497543335, "rewards/rejected": -2.075908660888672, "step": 263 }, { "epoch": 0.3876651982378855, "grad_norm": 13.047411918640137, "kl/avg_steps": 0.5625, "kl/beta": 0.030552471056580544, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -7.175429344177246, "logits/rejected": -6.558530330657959, "logps/chosen": -124.76637268066406, "logps/ref_chosen": -90.6226577758789, "logps/ref_rejected": -100.32554626464844, "logps/rejected": -164.76083374023438, "loss": 0.8463, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0397236347198486, "rewards/margins": 0.9148914217948914, "rewards/rejected": -1.9546151161193848, "step": 264 }, { "epoch": 0.3891336270190896, "grad_norm": 12.617591857910156, "kl/avg_steps": 0.5625, "kl/beta": 0.030381573364138603, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.848656339557562e-07, "logits/chosen": -6.967649936676025, "logits/rejected": -6.795099258422852, "logps/chosen": -128.9173583984375, "logps/ref_chosen": -92.37232971191406, "logps/ref_rejected": -94.62757110595703, "logps/rejected": -160.58474731445312, "loss": 0.8792, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1058483123779297, "rewards/margins": 0.8837213516235352, "rewards/rejected": -1.9895697832107544, "step": 265 }, { "epoch": 0.39060205580029367, "grad_norm": 15.880788803100586, "kl/avg_steps": 0.34375, "kl/beta": 0.030211633071303368, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -7.422516345977783, "logits/rejected": -7.111500263214111, "logps/chosen": -143.80422973632812, "logps/ref_chosen": -102.20002746582031, "logps/ref_rejected": -74.36642456054688, "logps/rejected": -141.06625366210938, "loss": 0.9138, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2558131217956543, "rewards/margins": 0.7501237392425537, "rewards/rejected": -2.005937099456787, "step": 266 }, { "epoch": 0.3920704845814978, "grad_norm": 13.670655250549316, "kl/avg_steps": 0.65625, "kl/beta": 0.0301081370562315, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -7.119365692138672, "logits/rejected": -6.709759712219238, "logps/chosen": -139.10177612304688, "logps/ref_chosen": -104.28599548339844, "logps/ref_rejected": -95.98719024658203, "logps/rejected": -161.63311767578125, "loss": 0.7978, "rewards/accuracies": 0.875, "rewards/chosen": -1.04282808303833, "rewards/margins": 0.9189317226409912, "rewards/rejected": -1.9617595672607422, "step": 267 }, { "epoch": 0.3935389133627019, "grad_norm": 13.244879722595215, "kl/avg_steps": 0.75, "kl/beta": 0.029911840334534645, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -7.436177730560303, "logits/rejected": -6.697609901428223, "logps/chosen": -121.36919403076172, "logps/ref_chosen": -86.69622039794922, "logps/ref_rejected": -109.19183349609375, "logps/rejected": -176.08946228027344, "loss": 0.7894, "rewards/accuracies": 0.875, "rewards/chosen": -1.030745029449463, "rewards/margins": 0.9536941051483154, "rewards/rejected": -1.9844391345977783, "step": 268 }, { "epoch": 0.39500734214390604, "grad_norm": 13.288008689880371, "kl/avg_steps": 0.5625, "kl/beta": 0.02968917228281498, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -7.45705509185791, "logits/rejected": -6.695377826690674, "logps/chosen": -140.38177490234375, "logps/ref_chosen": -104.97181701660156, "logps/ref_rejected": -112.4764633178711, "logps/rejected": -181.06021118164062, "loss": 0.8031, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0468546152114868, "rewards/margins": 0.9742782115936279, "rewards/rejected": -2.0211329460144043, "step": 269 }, { "epoch": 0.3964757709251101, "grad_norm": 13.801987648010254, "kl/avg_steps": 0.5625, "kl/beta": 0.029523104429244995, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.794189242333106e-07, "logits/chosen": -7.553506851196289, "logits/rejected": -6.664368152618408, "logps/chosen": -139.99703979492188, "logps/ref_chosen": -101.07383728027344, "logps/ref_rejected": -117.75289916992188, "logps/rejected": -186.19866943359375, "loss": 0.9174, "rewards/accuracies": 0.75, "rewards/chosen": -1.1459414958953857, "rewards/margins": 0.8607551455497742, "rewards/rejected": -2.0066967010498047, "step": 270 }, { "epoch": 0.39794419970631423, "grad_norm": 12.916444778442383, "kl/avg_steps": 0.75, "kl/beta": 0.029357966035604477, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -6.938437461853027, "logits/rejected": -6.5058183670043945, "logps/chosen": -129.179443359375, "logps/ref_chosen": -96.72459411621094, "logps/ref_rejected": -98.5244140625, "logps/rejected": -166.06207275390625, "loss": 0.8123, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9474867582321167, "rewards/margins": 1.0177432298660278, "rewards/rejected": -1.9652299880981445, "step": 271 }, { "epoch": 0.39941262848751835, "grad_norm": 14.018041610717773, "kl/avg_steps": 0.625, "kl/beta": 0.02913942001760006, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.772161666010912e-07, "logits/chosen": -7.514509677886963, "logits/rejected": -7.041294574737549, "logps/chosen": -108.3265380859375, "logps/ref_chosen": -80.97721862792969, "logps/ref_rejected": -108.55535888671875, "logps/rejected": -169.4478759765625, "loss": 0.8282, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7937983870506287, "rewards/margins": 0.9667574167251587, "rewards/rejected": -1.7605557441711426, "step": 272 }, { "epoch": 0.4008810572687225, "grad_norm": 13.580171585083008, "kl/avg_steps": 0.59375, "kl/beta": 0.028958430513739586, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.761097448550755e-07, "logits/chosen": -7.261774063110352, "logits/rejected": -6.644618988037109, "logps/chosen": -128.26251220703125, "logps/ref_chosen": -92.22460174560547, "logps/ref_rejected": -97.3630599975586, "logps/rejected": -168.29635620117188, "loss": 0.8615, "rewards/accuracies": 0.828125, "rewards/chosen": -1.0409189462661743, "rewards/margins": 0.9985566139221191, "rewards/rejected": -2.039475440979004, "step": 273 }, { "epoch": 0.4023494860499266, "grad_norm": 15.386717796325684, "kl/avg_steps": 0.75, "kl/beta": 0.028787503018975258, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.75e-07, "logits/chosen": -7.7138991355896, "logits/rejected": -6.974665641784668, "logps/chosen": -119.94572448730469, "logps/ref_chosen": -82.2608871459961, "logps/ref_rejected": -83.87699127197266, "logps/rejected": -149.46990966796875, "loss": 0.8819, "rewards/accuracies": 0.875, "rewards/chosen": -1.0782254934310913, "rewards/margins": 0.7943964004516602, "rewards/rejected": -1.8726218938827515, "step": 274 }, { "epoch": 0.40381791483113066, "grad_norm": 15.320517539978027, "kl/avg_steps": 0.71875, "kl/beta": 0.028573205694556236, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.738869612786737e-07, "logits/chosen": -7.628604888916016, "logits/rejected": -6.829320907592773, "logps/chosen": -111.30049896240234, "logps/ref_chosen": -79.68695831298828, "logps/ref_rejected": -98.7509765625, "logps/rejected": -160.444580078125, "loss": 0.9047, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8999394178390503, "rewards/margins": 0.8488346338272095, "rewards/rejected": -1.7487740516662598, "step": 275 }, { "epoch": 0.4052863436123348, "grad_norm": 15.482325553894043, "kl/avg_steps": 0.53125, "kl/beta": 0.028369300067424774, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -7.228045463562012, "logits/rejected": -6.846656799316406, "logps/chosen": -126.2287826538086, "logps/ref_chosen": -86.53970336914062, "logps/ref_rejected": -77.85394287109375, "logps/rejected": -146.60427856445312, "loss": 0.9138, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1226071119308472, "rewards/margins": 0.8151148557662964, "rewards/rejected": -1.9377222061157227, "step": 276 }, { "epoch": 0.4067547723935389, "grad_norm": 15.835052490234375, "kl/avg_steps": 0.65625, "kl/beta": 0.02821938507258892, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.71651119641714e-07, "logits/chosen": -7.30094051361084, "logits/rejected": -6.272882461547852, "logps/chosen": -123.2606430053711, "logps/ref_chosen": -84.24411010742188, "logps/ref_rejected": -98.83421325683594, "logps/rejected": -164.99749755859375, "loss": 0.898, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0951545238494873, "rewards/margins": 0.7574270963668823, "rewards/rejected": -1.8525817394256592, "step": 277 }, { "epoch": 0.40822320117474303, "grad_norm": 13.882027626037598, "kl/avg_steps": 0.65625, "kl/beta": 0.028035402297973633, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.705283756425872e-07, "logits/chosen": -7.213944435119629, "logits/rejected": -6.768559455871582, "logps/chosen": -120.13731384277344, "logps/ref_chosen": -82.431884765625, "logps/ref_rejected": -97.85691833496094, "logps/rejected": -164.93141174316406, "loss": 0.9088, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0521965026855469, "rewards/margins": 0.8135128021240234, "rewards/rejected": -1.8657093048095703, "step": 278 }, { "epoch": 0.40969162995594716, "grad_norm": 13.638445854187012, "kl/avg_steps": 0.59375, "kl/beta": 0.02785261906683445, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.6940245560867e-07, "logits/chosen": -7.466989517211914, "logits/rejected": -7.357805252075195, "logps/chosen": -125.54033660888672, "logps/ref_chosen": -85.16799926757812, "logps/ref_rejected": -94.12664794921875, "logps/rejected": -165.55284118652344, "loss": 0.8719, "rewards/accuracies": 0.796875, "rewards/chosen": -1.119484305381775, "rewards/margins": 0.8550732135772705, "rewards/rejected": -1.9745573997497559, "step": 279 }, { "epoch": 0.4111600587371512, "grad_norm": 13.675751686096191, "kl/avg_steps": 0.625, "kl/beta": 0.027688222005963326, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -7.190787315368652, "logits/rejected": -6.597271919250488, "logps/chosen": -123.91621398925781, "logps/ref_chosen": -85.85641479492188, "logps/ref_rejected": -104.11859130859375, "logps/rejected": -177.7848663330078, "loss": 0.7947, "rewards/accuracies": 0.875, "rewards/chosen": -1.0486705303192139, "rewards/margins": 0.9753453731536865, "rewards/rejected": -2.0240159034729004, "step": 280 }, { "epoch": 0.41262848751835535, "grad_norm": 11.921332359313965, "kl/avg_steps": 0.6875, "kl/beta": 0.02751624397933483, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -7.7915472984313965, "logits/rejected": -6.586763381958008, "logps/chosen": -125.37873840332031, "logps/ref_chosen": -86.55081939697266, "logps/ref_rejected": -88.62866973876953, "logps/rejected": -164.1964874267578, "loss": 0.8278, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0626280307769775, "rewards/margins": 0.9996333718299866, "rewards/rejected": -2.0622613430023193, "step": 281 }, { "epoch": 0.41409691629955947, "grad_norm": 11.661370277404785, "kl/avg_steps": 0.8125, "kl/beta": 0.027328362688422203, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.660059364023408e-07, "logits/chosen": -7.430984973907471, "logits/rejected": -6.962902545928955, "logps/chosen": -143.371826171875, "logps/ref_chosen": -105.10511016845703, "logps/ref_rejected": -102.85336303710938, "logps/rejected": -183.70306396484375, "loss": 0.6931, "rewards/accuracies": 0.953125, "rewards/chosen": -1.037402868270874, "rewards/margins": 1.152178168296814, "rewards/rejected": -2.1895809173583984, "step": 282 }, { "epoch": 0.4155653450807636, "grad_norm": 17.020105361938477, "kl/avg_steps": 0.65625, "kl/beta": 0.02710810862481594, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -7.325111389160156, "logits/rejected": -6.981871604919434, "logps/chosen": -139.34780883789062, "logps/ref_chosen": -95.05259704589844, "logps/ref_rejected": -103.54454803466797, "logps/rejected": -177.7033233642578, "loss": 0.9734, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1950119733810425, "rewards/margins": 0.7993054389953613, "rewards/rejected": -1.9943175315856934, "step": 283 }, { "epoch": 0.4170337738619677, "grad_norm": 11.532405853271484, "kl/avg_steps": 0.78125, "kl/beta": 0.02693137153983116, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -7.086446762084961, "logits/rejected": -6.687787055969238, "logps/chosen": -124.68060302734375, "logps/ref_chosen": -87.6664810180664, "logps/ref_rejected": -98.75103759765625, "logps/rejected": -177.91339111328125, "loss": 0.6944, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9900509119033813, "rewards/margins": 1.1234304904937744, "rewards/rejected": -2.1134815216064453, "step": 284 }, { "epoch": 0.4185022026431718, "grad_norm": 18.86406898498535, "kl/avg_steps": 0.6875, "kl/beta": 0.026722600683569908, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.625819059005228e-07, "logits/chosen": -7.379509449005127, "logits/rejected": -7.0911407470703125, "logps/chosen": -137.629150390625, "logps/ref_chosen": -94.43303680419922, "logps/ref_rejected": -104.07194519042969, "logps/rejected": -176.97714233398438, "loss": 0.8825, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1481690406799316, "rewards/margins": 0.7845076322555542, "rewards/rejected": -1.9326767921447754, "step": 285 }, { "epoch": 0.4199706314243759, "grad_norm": 14.356172561645508, "kl/avg_steps": 0.5625, "kl/beta": 0.026540137827396393, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.614345889441346e-07, "logits/chosen": -7.472691535949707, "logits/rejected": -7.013616561889648, "logps/chosen": -149.8736572265625, "logps/ref_chosen": -103.72039794921875, "logps/ref_rejected": -96.25775909423828, "logps/rejected": -179.59671020507812, "loss": 0.8014, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2212910652160645, "rewards/margins": 0.9765151143074036, "rewards/rejected": -2.1978063583374023, "step": 286 }, { "epoch": 0.42143906020558003, "grad_norm": 15.217528343200684, "kl/avg_steps": 0.34375, "kl/beta": 0.026391685009002686, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -7.295309066772461, "logits/rejected": -6.699875831604004, "logps/chosen": -147.86685180664062, "logps/ref_chosen": -93.88988494873047, "logps/ref_rejected": -83.33365631103516, "logps/rejected": -162.0262451171875, "loss": 1.0332, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4231257438659668, "rewards/margins": 0.6437419652938843, "rewards/rejected": -2.0668678283691406, "step": 287 }, { "epoch": 0.42290748898678415, "grad_norm": 14.122811317443848, "kl/avg_steps": 0.53125, "kl/beta": 0.026301274076104164, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -7.697492599487305, "logits/rejected": -7.256779670715332, "logps/chosen": -139.82644653320312, "logps/ref_chosen": -88.15602111816406, "logps/ref_rejected": -93.28195190429688, "logps/rejected": -183.26959228515625, "loss": 0.816, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3554046154022217, "rewards/margins": 0.9966922998428345, "rewards/rejected": -2.3520970344543457, "step": 288 }, { "epoch": 0.4243759177679883, "grad_norm": 13.091045379638672, "kl/avg_steps": 0.59375, "kl/beta": 0.02616228722035885, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -7.527945518493652, "logits/rejected": -7.149833679199219, "logps/chosen": -126.64739227294922, "logps/ref_chosen": -75.39292907714844, "logps/ref_rejected": -93.15428161621094, "logps/rejected": -185.3885498046875, "loss": 0.8297, "rewards/accuracies": 0.796875, "rewards/chosen": -1.3352906703948975, "rewards/margins": 1.0608309507369995, "rewards/rejected": -2.3961217403411865, "step": 289 }, { "epoch": 0.42584434654919234, "grad_norm": 14.055940628051758, "kl/avg_steps": 0.8125, "kl/beta": 0.026007864624261856, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 3.568162605525952e-07, "logits/chosen": -7.388311386108398, "logits/rejected": -7.0829925537109375, "logps/chosen": -144.79820251464844, "logps/ref_chosen": -88.0419692993164, "logps/ref_rejected": -123.21215057373047, "logps/rejected": -221.4034423828125, "loss": 0.8033, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4665530920028687, "rewards/margins": 1.0655090808868408, "rewards/rejected": -2.53206205368042, "step": 290 }, { "epoch": 0.42731277533039647, "grad_norm": 14.552366256713867, "kl/avg_steps": 0.625, "kl/beta": 0.0257982537150383, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -7.654292106628418, "logits/rejected": -6.923820972442627, "logps/chosen": -148.91311645507812, "logps/ref_chosen": -94.09524536132812, "logps/ref_rejected": -96.05006408691406, "logps/rejected": -183.22817993164062, "loss": 0.9127, "rewards/accuracies": 0.859375, "rewards/chosen": -1.408342719078064, "rewards/margins": 0.8246374130249023, "rewards/rejected": -2.232980251312256, "step": 291 }, { "epoch": 0.4287812041116006, "grad_norm": 12.810431480407715, "kl/avg_steps": 0.6875, "kl/beta": 0.025638015940785408, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -7.4952850341796875, "logits/rejected": -7.096066951751709, "logps/chosen": -142.94708251953125, "logps/ref_chosen": -88.25041198730469, "logps/ref_rejected": -96.41764068603516, "logps/rejected": -189.70559692382812, "loss": 0.7522, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3947566747665405, "rewards/margins": 0.9789954423904419, "rewards/rejected": -2.3737521171569824, "step": 292 }, { "epoch": 0.4302496328928047, "grad_norm": 13.038102149963379, "kl/avg_steps": 0.53125, "kl/beta": 0.025462958961725235, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -7.2822585105896, "logits/rejected": -6.593290328979492, "logps/chosen": -146.07388305664062, "logps/ref_chosen": -87.37654876708984, "logps/ref_rejected": -85.75579833984375, "logps/rejected": -172.91517639160156, "loss": 0.9942, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4899965524673462, "rewards/margins": 0.7153399586677551, "rewards/rejected": -2.205336570739746, "step": 293 }, { "epoch": 0.43171806167400884, "grad_norm": 14.321362495422363, "kl/avg_steps": 0.53125, "kl/beta": 0.02532840147614479, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -6.959033966064453, "logits/rejected": -6.0898590087890625, "logps/chosen": -134.58004760742188, "logps/ref_chosen": -73.5079574584961, "logps/ref_rejected": -88.08877563476562, "logps/rejected": -174.16912841796875, "loss": 1.0529, "rewards/accuracies": 0.75, "rewards/chosen": -1.5407367944717407, "rewards/margins": 0.6243571043014526, "rewards/rejected": -2.1650938987731934, "step": 294 }, { "epoch": 0.4331864904552129, "grad_norm": 13.053278923034668, "kl/avg_steps": 0.59375, "kl/beta": 0.025194555521011353, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.509802512179737e-07, "logits/chosen": -7.271920680999756, "logits/rejected": -7.100852012634277, "logps/chosen": -136.8726043701172, "logps/ref_chosen": -77.76548767089844, "logps/ref_rejected": -94.24726867675781, "logps/rejected": -189.65672302246094, "loss": 0.8606, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4831101894378662, "rewards/margins": 0.9041118025779724, "rewards/rejected": -2.3872218132019043, "step": 295 }, { "epoch": 0.434654919236417, "grad_norm": 13.810120582580566, "kl/avg_steps": 0.5625, "kl/beta": 0.02504584565758705, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.498049431928577e-07, "logits/chosen": -7.517086505889893, "logits/rejected": -7.0501275062561035, "logps/chosen": -161.45822143554688, "logps/ref_chosen": -97.85641479492188, "logps/ref_rejected": -100.81631469726562, "logps/rejected": -203.63685607910156, "loss": 0.8194, "rewards/accuracies": 0.828125, "rewards/chosen": -1.586834192276001, "rewards/margins": 0.9714199304580688, "rewards/rejected": -2.5582542419433594, "step": 296 }, { "epoch": 0.43612334801762115, "grad_norm": 14.663503646850586, "kl/avg_steps": 0.5625, "kl/beta": 0.024905750527977943, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.486270052146694e-07, "logits/chosen": -7.523492813110352, "logits/rejected": -6.788856506347656, "logps/chosen": -154.54638671875, "logps/ref_chosen": -88.56583404541016, "logps/ref_rejected": -101.55656433105469, "logps/rejected": -200.7189178466797, "loss": 0.8867, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6361867189407349, "rewards/margins": 0.8170878887176514, "rewards/rejected": -2.453274726867676, "step": 297 }, { "epoch": 0.43759177679882527, "grad_norm": 16.382911682128906, "kl/avg_steps": 0.59375, "kl/beta": 0.02476643957197666, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.474464683231698e-07, "logits/chosen": -7.475191593170166, "logits/rejected": -6.408700942993164, "logps/chosen": -154.0912322998047, "logps/ref_chosen": -94.88043975830078, "logps/ref_rejected": -122.31101989746094, "logps/rejected": -221.15391540527344, "loss": 0.8289, "rewards/accuracies": 0.828125, "rewards/chosen": -1.460301160812378, "rewards/margins": 0.9708787202835083, "rewards/rejected": -2.431180000305176, "step": 298 }, { "epoch": 0.4390602055800294, "grad_norm": 12.90149974822998, "kl/avg_steps": 0.71875, "kl/beta": 0.024620257318019867, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.462633636266041e-07, "logits/chosen": -7.417859077453613, "logits/rejected": -6.741024971008301, "logps/chosen": -142.339111328125, "logps/ref_chosen": -80.40835571289062, "logps/ref_rejected": -89.53716278076172, "logps/rejected": -193.05116271972656, "loss": 0.785, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5161972045898438, "rewards/margins": 1.0127980709075928, "rewards/rejected": -2.5289955139160156, "step": 299 }, { "epoch": 0.44052863436123346, "grad_norm": 14.46535587310791, "kl/avg_steps": 0.625, "kl/beta": 0.024444561451673508, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -7.5312700271606445, "logits/rejected": -7.016364097595215, "logps/chosen": -153.97088623046875, "logps/ref_chosen": -88.15890502929688, "logps/ref_rejected": -100.93919372558594, "logps/rejected": -202.05059814453125, "loss": 0.912, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6017869710922241, "rewards/margins": 0.8526525497436523, "rewards/rejected": -2.454439640045166, "step": 300 }, { "epoch": 0.44052863436123346, "eval_kl/n_epsilon_steps": 0.24700342118740082, "eval_kl/p_epsilon_steps": 0.7508561611175537, "eval_logits/chosen": -7.007012367248535, "eval_logits/rejected": -6.721932888031006, "eval_logps/chosen": -167.77142333984375, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -191.32177734375, "eval_loss": 0.4874841272830963, "eval_rewards/accuracies": 0.7786815166473389, "eval_rewards/chosen": -1.6289423704147339, "eval_rewards/margins": 0.7190282940864563, "eval_rewards/rejected": -2.347970724105835, "eval_runtime": 48.5566, "eval_samples_per_second": 48.171, "eval_steps_per_second": 1.524, "step": 300 }, { "epoch": 0.4419970631424376, "grad_norm": 13.402172088623047, "kl/avg_steps": 0.609375, "kl/beta": 0.024292731657624245, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -7.663355827331543, "logits/rejected": -6.919793128967285, "logps/chosen": -146.97164916992188, "logps/ref_chosen": -85.50491333007812, "logps/ref_rejected": -98.9168472290039, "logps/rejected": -203.77188110351562, "loss": 0.7859, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4873508214950562, "rewards/margins": 1.0426502227783203, "rewards/rejected": -2.530000925064087, "step": 301 }, { "epoch": 0.4434654919236417, "grad_norm": 15.27524185180664, "kl/avg_steps": 0.625, "kl/beta": 0.024145595729351044, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.426989547989902e-07, "logits/chosen": -7.2973504066467285, "logits/rejected": -6.5177412033081055, "logps/chosen": -142.61880493164062, "logps/ref_chosen": -79.45040893554688, "logps/ref_rejected": -103.05909729003906, "logps/rejected": -201.1916046142578, "loss": 0.9129, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5182812213897705, "rewards/margins": 0.8340890407562256, "rewards/rejected": -2.352370262145996, "step": 302 }, { "epoch": 0.44493392070484583, "grad_norm": 12.639718055725098, "kl/avg_steps": 0.5625, "kl/beta": 0.023995622992515564, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -7.532470703125, "logits/rejected": -7.15334939956665, "logps/chosen": -152.356689453125, "logps/ref_chosen": -95.59829711914062, "logps/ref_rejected": -92.0645751953125, "logps/rejected": -189.04776000976562, "loss": 0.8158, "rewards/accuracies": 0.859375, "rewards/chosen": -1.356937289237976, "rewards/margins": 0.9546318650245667, "rewards/rejected": -2.3115692138671875, "step": 303 }, { "epoch": 0.44640234948604995, "grad_norm": 13.857175827026367, "kl/avg_steps": 0.59375, "kl/beta": 0.02386140264570713, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.403104165467883e-07, "logits/chosen": -7.706064224243164, "logits/rejected": -7.303454399108887, "logps/chosen": -156.93124389648438, "logps/ref_chosen": -99.4531478881836, "logps/ref_rejected": -98.08226013183594, "logps/rejected": -197.8302001953125, "loss": 0.7929, "rewards/accuracies": 0.84375, "rewards/chosen": -1.365452766418457, "rewards/margins": 0.9975064396858215, "rewards/rejected": -2.362959146499634, "step": 304 }, { "epoch": 0.447870778267254, "grad_norm": 13.020018577575684, "kl/avg_steps": 0.59375, "kl/beta": 0.02372056059539318, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.391125620245535e-07, "logits/chosen": -7.775443077087402, "logits/rejected": -6.882397174835205, "logps/chosen": -162.6031494140625, "logps/ref_chosen": -103.26339721679688, "logps/ref_rejected": -97.82897186279297, "logps/rejected": -199.2515869140625, "loss": 0.8001, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4017119407653809, "rewards/margins": 0.9873659610748291, "rewards/rejected": -2.38907790184021, "step": 305 }, { "epoch": 0.44933920704845814, "grad_norm": 12.147577285766602, "kl/avg_steps": 0.84375, "kl/beta": 0.023580551147460938, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -7.674917697906494, "logits/rejected": -7.210516452789307, "logps/chosen": -152.4990234375, "logps/ref_chosen": -100.3038330078125, "logps/ref_rejected": -92.23086547851562, "logps/rejected": -195.33135986328125, "loss": 0.6474, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2215442657470703, "rewards/margins": 1.1882264614105225, "rewards/rejected": -2.4097707271575928, "step": 306 }, { "epoch": 0.45080763582966227, "grad_norm": 14.133451461791992, "kl/avg_steps": 0.53125, "kl/beta": 0.023383256047964096, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.367098400098881e-07, "logits/chosen": -7.329036235809326, "logits/rejected": -7.163274765014648, "logps/chosen": -155.960205078125, "logps/ref_chosen": -96.64976501464844, "logps/ref_rejected": -93.34860229492188, "logps/rejected": -189.1453857421875, "loss": 0.9218, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3819317817687988, "rewards/margins": 0.8423360586166382, "rewards/rejected": -2.2242679595947266, "step": 307 }, { "epoch": 0.4522760646108664, "grad_norm": 15.209975242614746, "kl/avg_steps": 0.625, "kl/beta": 0.023259688168764114, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.355050358314172e-07, "logits/chosen": -7.033696174621582, "logits/rejected": -7.267401218414307, "logps/chosen": -146.19422912597656, "logps/ref_chosen": -85.72467041015625, "logps/ref_rejected": -86.42318725585938, "logps/rejected": -183.3817596435547, "loss": 0.9237, "rewards/accuracies": 0.796875, "rewards/chosen": -1.399101734161377, "rewards/margins": 0.8382473587989807, "rewards/rejected": -2.237349033355713, "step": 308 }, { "epoch": 0.45374449339207046, "grad_norm": 14.02563762664795, "kl/avg_steps": 0.59375, "kl/beta": 0.023115217685699463, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -7.613210201263428, "logits/rejected": -7.04583215713501, "logps/chosen": -148.60951232910156, "logps/ref_chosen": -83.15145874023438, "logps/ref_rejected": -86.58602142333984, "logps/rejected": -190.06564331054688, "loss": 0.8829, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5072109699249268, "rewards/margins": 0.8685282468795776, "rewards/rejected": -2.375739097595215, "step": 309 }, { "epoch": 0.4552129221732746, "grad_norm": 13.904105186462402, "kl/avg_steps": 0.53125, "kl/beta": 0.022978780791163445, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -7.28669548034668, "logits/rejected": -7.22123908996582, "logps/chosen": -148.03106689453125, "logps/ref_chosen": -89.73799133300781, "logps/ref_rejected": -87.07083129882812, "logps/rejected": -181.75921630859375, "loss": 0.9143, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3350245952606201, "rewards/margins": 0.826156735420227, "rewards/rejected": -2.1611814498901367, "step": 310 }, { "epoch": 0.4566813509544787, "grad_norm": 13.777924537658691, "kl/avg_steps": 0.65625, "kl/beta": 0.02285735122859478, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -7.765219688415527, "logits/rejected": -6.731596946716309, "logps/chosen": -159.7617645263672, "logps/ref_chosen": -88.52644348144531, "logps/ref_rejected": -82.26608276367188, "logps/rejected": -199.7523651123047, "loss": 0.7578, "rewards/accuracies": 0.890625, "rewards/chosen": -1.6187620162963867, "rewards/margins": 1.046243667602539, "rewards/rejected": -2.665005683898926, "step": 311 }, { "epoch": 0.4581497797356828, "grad_norm": 13.674013137817383, "kl/avg_steps": 0.6875, "kl/beta": 0.022708328440785408, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.306636061080487e-07, "logits/chosen": -7.522071838378906, "logits/rejected": -6.754973411560059, "logps/chosen": -146.90805053710938, "logps/ref_chosen": -80.40069580078125, "logps/ref_rejected": -82.4471435546875, "logps/rejected": -192.24771118164062, "loss": 0.8414, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5027642250061035, "rewards/margins": 0.9715795516967773, "rewards/rejected": -2.474343776702881, "step": 312 }, { "epoch": 0.45961820851688695, "grad_norm": 12.186444282531738, "kl/avg_steps": 0.5625, "kl/beta": 0.022553274407982826, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -7.390935897827148, "logits/rejected": -6.827203750610352, "logps/chosen": -146.6564483642578, "logps/ref_chosen": -84.14190673828125, "logps/ref_rejected": -93.42684936523438, "logps/rejected": -194.61727905273438, "loss": 0.9267, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4026517868041992, "rewards/margins": 0.8612587451934814, "rewards/rejected": -2.2639105319976807, "step": 313 }, { "epoch": 0.461086637298091, "grad_norm": 13.97519588470459, "kl/avg_steps": 0.625, "kl/beta": 0.02242712303996086, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -7.827272415161133, "logits/rejected": -7.375405311584473, "logps/chosen": -151.27520751953125, "logps/ref_chosen": -88.80972290039062, "logps/ref_rejected": -104.15892028808594, "logps/rejected": -207.26962280273438, "loss": 0.872, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3944957256317139, "rewards/margins": 0.9006831049919128, "rewards/rejected": -2.2951788902282715, "step": 314 }, { "epoch": 0.46255506607929514, "grad_norm": 14.756415367126465, "kl/avg_steps": 0.5625, "kl/beta": 0.022287823259830475, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.270101039870797e-07, "logits/chosen": -7.049970626831055, "logits/rejected": -6.526283264160156, "logps/chosen": -143.4265594482422, "logps/ref_chosen": -80.793701171875, "logps/ref_rejected": -92.13711547851562, "logps/rejected": -195.54171752929688, "loss": 0.9216, "rewards/accuracies": 0.78125, "rewards/chosen": -1.391645073890686, "rewards/margins": 0.8969128131866455, "rewards/rejected": -2.288558006286621, "step": 315 }, { "epoch": 0.46402349486049926, "grad_norm": 14.08152961730957, "kl/avg_steps": 0.59375, "kl/beta": 0.022163156419992447, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -7.215094089508057, "logits/rejected": -7.132869720458984, "logps/chosen": -150.78274536132812, "logps/ref_chosen": -84.18425750732422, "logps/ref_rejected": -105.6908187866211, "logps/rejected": -212.91693115234375, "loss": 0.8837, "rewards/accuracies": 0.84375, "rewards/chosen": -1.467881679534912, "rewards/margins": 0.8902303576469421, "rewards/rejected": -2.358112096786499, "step": 316 }, { "epoch": 0.4654919236417034, "grad_norm": 14.905621528625488, "kl/avg_steps": 0.65625, "kl/beta": 0.02203233912587166, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -7.734880447387695, "logits/rejected": -7.02924919128418, "logps/chosen": -152.81094360351562, "logps/ref_chosen": -87.87348937988281, "logps/ref_rejected": -108.26519775390625, "logps/rejected": -228.77639770507812, "loss": 0.7381, "rewards/accuracies": 0.875, "rewards/chosen": -1.4228007793426514, "rewards/margins": 1.2112901210784912, "rewards/rejected": -2.6340909004211426, "step": 317 }, { "epoch": 0.4669603524229075, "grad_norm": 11.828646659851074, "kl/avg_steps": 0.71875, "kl/beta": 0.021888693794608116, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.233383385962115e-07, "logits/chosen": -7.722089767456055, "logits/rejected": -7.081110000610352, "logps/chosen": -161.36456298828125, "logps/ref_chosen": -98.20553588867188, "logps/ref_rejected": -88.13629150390625, "logps/rejected": -199.14776611328125, "loss": 0.7892, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3737818002700806, "rewards/margins": 1.0358906984329224, "rewards/rejected": -2.409672498703003, "step": 318 }, { "epoch": 0.4684287812041116, "grad_norm": 13.124372482299805, "kl/avg_steps": 0.78125, "kl/beta": 0.021732492372393608, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -7.879185199737549, "logits/rejected": -7.01790714263916, "logps/chosen": -137.73611450195312, "logps/ref_chosen": -82.50337219238281, "logps/ref_rejected": -118.65068054199219, "logps/rejected": -224.25790405273438, "loss": 0.7808, "rewards/accuracies": 0.875, "rewards/chosen": -1.1925673484802246, "rewards/margins": 1.082844853401184, "rewards/rejected": -2.2754123210906982, "step": 319 }, { "epoch": 0.4698972099853157, "grad_norm": 12.478285789489746, "kl/avg_steps": 0.6875, "kl/beta": 0.021564023569226265, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.208807785813777e-07, "logits/chosen": -7.674609184265137, "logits/rejected": -7.378990173339844, "logps/chosen": -144.46971130371094, "logps/ref_chosen": -87.1372299194336, "logps/ref_rejected": -103.22412109375, "logps/rejected": -203.762451171875, "loss": 0.8435, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2286326885223389, "rewards/margins": 0.9210659861564636, "rewards/rejected": -2.149698495864868, "step": 320 }, { "epoch": 0.4713656387665198, "grad_norm": 13.07551097869873, "kl/avg_steps": 0.78125, "kl/beta": 0.021416783332824707, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -7.491329193115234, "logits/rejected": -7.139033317565918, "logps/chosen": -157.41146850585938, "logps/ref_chosen": -93.10466766357422, "logps/ref_rejected": -97.4681396484375, "logps/rejected": -210.389892578125, "loss": 0.8056, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3688414096832275, "rewards/margins": 1.0292859077453613, "rewards/rejected": -2.398127555847168, "step": 321 }, { "epoch": 0.47283406754772395, "grad_norm": 15.268094062805176, "kl/avg_steps": 0.65625, "kl/beta": 0.021250760182738304, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.184157475180207e-07, "logits/chosen": -7.795929908752441, "logits/rejected": -7.272746562957764, "logps/chosen": -152.91873168945312, "logps/ref_chosen": -92.19291687011719, "logps/ref_rejected": -100.89321899414062, "logps/rejected": -216.01556396484375, "loss": 0.7797, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2838683128356934, "rewards/margins": 1.1429917812347412, "rewards/rejected": -2.4268603324890137, "step": 322 }, { "epoch": 0.47430249632892807, "grad_norm": 12.563841819763184, "kl/avg_steps": 0.71875, "kl/beta": 0.021112211048603058, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 3.171805115074251e-07, "logits/chosen": -7.563092231750488, "logits/rejected": -7.294485092163086, "logps/chosen": -148.90322875976562, "logps/ref_chosen": -80.27210235595703, "logps/ref_rejected": -80.30335235595703, "logps/rejected": -197.24813842773438, "loss": 0.7955, "rewards/accuracies": 0.890625, "rewards/chosen": -1.440096139907837, "rewards/margins": 1.0090997219085693, "rewards/rejected": -2.449195623397827, "step": 323 }, { "epoch": 0.47577092511013214, "grad_norm": 13.184640884399414, "kl/avg_steps": 0.6875, "kl/beta": 0.020961550995707512, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -7.81708288192749, "logits/rejected": -6.640934944152832, "logps/chosen": -165.7332763671875, "logps/ref_chosen": -95.72935485839844, "logps/ref_rejected": -94.16924285888672, "logps/rejected": -216.46131896972656, "loss": 0.732, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4592244625091553, "rewards/margins": 1.0842043161392212, "rewards/rejected": -2.543428897857666, "step": 324 }, { "epoch": 0.47723935389133626, "grad_norm": 11.3136625289917, "kl/avg_steps": 0.65625, "kl/beta": 0.020818423479795456, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.147047612756302e-07, "logits/chosen": -7.779555320739746, "logits/rejected": -7.204484939575195, "logps/chosen": -164.93191528320312, "logps/ref_chosen": -98.70687103271484, "logps/ref_rejected": -76.40809631347656, "logps/rejected": -192.03103637695312, "loss": 0.7957, "rewards/accuracies": 0.84375, "rewards/chosen": -1.37131929397583, "rewards/margins": 1.0167847871780396, "rewards/rejected": -2.38810396194458, "step": 325 }, { "epoch": 0.4787077826725404, "grad_norm": 13.992250442504883, "kl/avg_steps": 0.625, "kl/beta": 0.020682694390416145, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.134643122927519e-07, "logits/chosen": -7.7820024490356445, "logits/rejected": -7.323631286621094, "logps/chosen": -180.78939819335938, "logps/ref_chosen": -104.66130065917969, "logps/ref_rejected": -86.82537841796875, "logps/rejected": -208.27098083496094, "loss": 0.8558, "rewards/accuracies": 0.828125, "rewards/chosen": -1.567757487297058, "rewards/margins": 0.9265294671058655, "rewards/rejected": -2.4942867755889893, "step": 326 }, { "epoch": 0.4801762114537445, "grad_norm": 13.753683090209961, "kl/avg_steps": 0.5625, "kl/beta": 0.020554229617118835, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -7.905759811401367, "logits/rejected": -7.24397611618042, "logps/chosen": -180.28341674804688, "logps/ref_chosen": -107.44024658203125, "logps/ref_rejected": -117.05323028564453, "logps/rejected": -237.6690216064453, "loss": 0.8317, "rewards/accuracies": 0.828125, "rewards/chosen": -1.491244912147522, "rewards/margins": 0.9712578058242798, "rewards/rejected": -2.4625027179718018, "step": 327 }, { "epoch": 0.48164464023494863, "grad_norm": 13.748778343200684, "kl/avg_steps": 0.6875, "kl/beta": 0.020439259707927704, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -7.3603315353393555, "logits/rejected": -6.873122215270996, "logps/chosen": -162.29519653320312, "logps/ref_chosen": -86.6106948852539, "logps/ref_rejected": -97.40188598632812, "logps/rejected": -229.42196655273438, "loss": 0.7407, "rewards/accuracies": 0.875, "rewards/chosen": -1.5384535789489746, "rewards/margins": 1.1390830278396606, "rewards/rejected": -2.6775364875793457, "step": 328 }, { "epoch": 0.4831130690161527, "grad_norm": 14.800250053405762, "kl/avg_steps": 0.6875, "kl/beta": 0.020299699157476425, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -7.575889587402344, "logits/rejected": -7.213668346405029, "logps/chosen": -178.1090850830078, "logps/ref_chosen": -100.61308288574219, "logps/ref_rejected": -104.95370483398438, "logps/rejected": -230.13726806640625, "loss": 0.8164, "rewards/accuracies": 0.828125, "rewards/chosen": -1.564449667930603, "rewards/margins": 0.9571825861930847, "rewards/rejected": -2.521632194519043, "step": 329 }, { "epoch": 0.4845814977973568, "grad_norm": 12.028066635131836, "kl/avg_steps": 0.6875, "kl/beta": 0.020161090418696404, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.084861204504122e-07, "logits/chosen": -7.624053955078125, "logits/rejected": -7.438431739807129, "logps/chosen": -147.8223876953125, "logps/ref_chosen": -76.95843505859375, "logps/ref_rejected": -92.81494140625, "logps/rejected": -214.4041290283203, "loss": 0.7539, "rewards/accuracies": 0.875, "rewards/chosen": -1.4205009937286377, "rewards/margins": 1.0114681720733643, "rewards/rejected": -2.431969165802002, "step": 330 }, { "epoch": 0.48604992657856094, "grad_norm": 16.355430603027344, "kl/avg_steps": 0.75, "kl/beta": 0.02002342976629734, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.072376374875335e-07, "logits/chosen": -7.7047834396362305, "logits/rejected": -7.323886394500732, "logps/chosen": -158.1719512939453, "logps/ref_chosen": -83.0594253540039, "logps/ref_rejected": -94.13113403320312, "logps/rejected": -222.07827758789062, "loss": 0.7566, "rewards/accuracies": 0.875, "rewards/chosen": -1.4940705299377441, "rewards/margins": 1.0462760925292969, "rewards/rejected": -2.540346622467041, "step": 331 }, { "epoch": 0.48751835535976507, "grad_norm": 13.534795761108398, "kl/avg_steps": 0.625, "kl/beta": 0.019874371588230133, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.059876462596758e-07, "logits/chosen": -7.760369300842285, "logits/rejected": -7.029866695404053, "logps/chosen": -159.60678100585938, "logps/ref_chosen": -81.93089294433594, "logps/ref_rejected": -82.84590148925781, "logps/rejected": -202.23422241210938, "loss": 0.9013, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5361754894256592, "rewards/margins": 0.8190126419067383, "rewards/rejected": -2.3551878929138184, "step": 332 }, { "epoch": 0.4889867841409692, "grad_norm": 13.899153709411621, "kl/avg_steps": 0.5625, "kl/beta": 0.01975092850625515, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -7.564597129821777, "logits/rejected": -7.067813396453857, "logps/chosen": -177.69625854492188, "logps/ref_chosen": -88.53257751464844, "logps/ref_rejected": -102.46601867675781, "logps/rejected": -244.46804809570312, "loss": 0.8778, "rewards/accuracies": 0.75, "rewards/chosen": -1.7536228895187378, "rewards/margins": 1.0312126874923706, "rewards/rejected": -2.7848353385925293, "step": 333 }, { "epoch": 0.49045521292217326, "grad_norm": 16.076087951660156, "kl/avg_steps": 0.6875, "kl/beta": 0.019640451297163963, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.034832708016243e-07, "logits/chosen": -7.2526021003723145, "logits/rejected": -7.094786643981934, "logps/chosen": -180.3138427734375, "logps/ref_chosen": -97.75778198242188, "logps/ref_rejected": -100.00584411621094, "logps/rejected": -230.16831970214844, "loss": 0.9017, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6136884689331055, "rewards/margins": 0.9234945774078369, "rewards/rejected": -2.5371832847595215, "step": 334 }, { "epoch": 0.4919236417033774, "grad_norm": 15.462376594543457, "kl/avg_steps": 0.53125, "kl/beta": 0.019506344571709633, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.022289525640531e-07, "logits/chosen": -7.538198471069336, "logits/rejected": -6.875842571258545, "logps/chosen": -179.6153106689453, "logps/ref_chosen": -96.37603759765625, "logps/ref_rejected": -94.92401123046875, "logps/rejected": -222.1100616455078, "loss": 0.9078, "rewards/accuracies": 0.796875, "rewards/chosen": -1.618840217590332, "rewards/margins": 0.8465075492858887, "rewards/rejected": -2.4653477668762207, "step": 335 }, { "epoch": 0.4933920704845815, "grad_norm": 14.706995010375977, "kl/avg_steps": 0.75, "kl/beta": 0.01940326578915119, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 3.009732580450086e-07, "logits/chosen": -7.663290500640869, "logits/rejected": -7.155807971954346, "logps/chosen": -170.34237670898438, "logps/ref_chosen": -88.52952575683594, "logps/ref_rejected": -109.55577850341797, "logps/rejected": -245.36219787597656, "loss": 0.7966, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5775220394134521, "rewards/margins": 1.0359567403793335, "rewards/rejected": -2.6134791374206543, "step": 336 }, { "epoch": 0.4948604992657856, "grad_norm": 15.293198585510254, "kl/avg_steps": 0.59375, "kl/beta": 0.019258825108408928, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -7.815160274505615, "logits/rejected": -7.3971452713012695, "logps/chosen": -179.1387939453125, "logps/ref_chosen": -97.31238555908203, "logps/ref_rejected": -110.13134765625, "logps/rejected": -241.39308166503906, "loss": 0.8903, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5696876049041748, "rewards/margins": 0.9406921863555908, "rewards/rejected": -2.5103797912597656, "step": 337 }, { "epoch": 0.49632892804698975, "grad_norm": 15.433863639831543, "kl/avg_steps": 0.625, "kl/beta": 0.019145149737596512, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.984578725527675e-07, "logits/chosen": -7.460289478302002, "logits/rejected": -7.160378456115723, "logps/chosen": -167.595947265625, "logps/ref_chosen": -83.63011932373047, "logps/ref_rejected": -93.69235229492188, "logps/rejected": -226.01394653320312, "loss": 0.8439, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5996140241622925, "rewards/margins": 0.9147864580154419, "rewards/rejected": -2.5144004821777344, "step": 338 }, { "epoch": 0.4977973568281938, "grad_norm": 14.1244535446167, "kl/avg_steps": 0.59375, "kl/beta": 0.01902623660862446, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -7.783293724060059, "logits/rejected": -7.273715496063232, "logps/chosen": -184.80462646484375, "logps/ref_chosen": -99.85182189941406, "logps/ref_rejected": -109.0743637084961, "logps/rejected": -246.49473571777344, "loss": 0.8648, "rewards/accuracies": 0.8125, "rewards/chosen": -1.609981656074524, "rewards/margins": 0.985941469669342, "rewards/rejected": -2.5959229469299316, "step": 339 }, { "epoch": 0.49926578560939794, "grad_norm": 17.34671974182129, "kl/avg_steps": 0.625, "kl/beta": 0.018913934007287025, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.959373794541426e-07, "logits/chosen": -7.614503383636475, "logits/rejected": -7.28403377532959, "logps/chosen": -177.3800811767578, "logps/ref_chosen": -84.97396087646484, "logps/ref_rejected": -78.80033874511719, "logps/rejected": -218.79473876953125, "loss": 0.8763, "rewards/accuracies": 0.828125, "rewards/chosen": -1.739735722541809, "rewards/margins": 0.8897112607955933, "rewards/rejected": -2.6294469833374023, "step": 340 }, { "epoch": 0.5007342143906021, "grad_norm": 16.43499755859375, "kl/avg_steps": 0.53125, "kl/beta": 0.01879645697772503, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.946753005532965e-07, "logits/chosen": -7.702646732330322, "logits/rejected": -6.986645698547363, "logps/chosen": -184.0854949951172, "logps/ref_chosen": -85.26399230957031, "logps/ref_rejected": -106.08595275878906, "logps/rejected": -250.41880798339844, "loss": 0.9302, "rewards/accuracies": 0.8125, "rewards/chosen": -1.850089430809021, "rewards/margins": 0.8445804119110107, "rewards/rejected": -2.694669723510742, "step": 341 }, { "epoch": 0.5022026431718062, "grad_norm": 14.287416458129883, "kl/avg_steps": 0.625, "kl/beta": 0.01869712769985199, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.934120444167326e-07, "logits/chosen": -7.695643424987793, "logits/rejected": -6.729315280914307, "logps/chosen": -178.91830444335938, "logps/ref_chosen": -101.03860473632812, "logps/ref_rejected": -98.89851379394531, "logps/rejected": -240.55776977539062, "loss": 0.7283, "rewards/accuracies": 0.875, "rewards/chosen": -1.4495775699615479, "rewards/margins": 1.179455280303955, "rewards/rejected": -2.629033088684082, "step": 342 }, { "epoch": 0.5036710719530103, "grad_norm": 15.677153587341309, "kl/avg_steps": 0.71875, "kl/beta": 0.018580997362732887, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -7.71726131439209, "logits/rejected": -7.144956588745117, "logps/chosen": -166.2335205078125, "logps/ref_chosen": -83.74736785888672, "logps/ref_rejected": -116.57908630371094, "logps/rejected": -253.29689025878906, "loss": 0.8041, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5235764980316162, "rewards/margins": 0.996279776096344, "rewards/rejected": -2.5198564529418945, "step": 343 }, { "epoch": 0.5051395007342144, "grad_norm": 14.323365211486816, "kl/avg_steps": 0.640625, "kl/beta": 0.018448399379849434, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -7.541003227233887, "logits/rejected": -7.094578266143799, "logps/chosen": -167.88778686523438, "logps/ref_chosen": -78.25396728515625, "logps/ref_rejected": -95.4287109375, "logps/rejected": -237.73025512695312, "loss": 0.8489, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6444833278656006, "rewards/margins": 0.9605084657669067, "rewards/rejected": -2.604991912841797, "step": 344 }, { "epoch": 0.5066079295154186, "grad_norm": 14.38456916809082, "kl/avg_steps": 0.71875, "kl/beta": 0.01833096705377102, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.896155456223163e-07, "logits/chosen": -7.9548492431640625, "logits/rejected": -7.6490678787231445, "logps/chosen": -184.28465270996094, "logps/ref_chosen": -98.0794906616211, "logps/ref_rejected": -104.03477478027344, "logps/rejected": -248.8415985107422, "loss": 0.7793, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5716257095336914, "rewards/margins": 1.0628182888031006, "rewards/rejected": -2.634444236755371, "step": 345 }, { "epoch": 0.5080763582966226, "grad_norm": 13.954184532165527, "kl/avg_steps": 0.53125, "kl/beta": 0.01820015348494053, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.883479137196714e-07, "logits/chosen": -7.322576522827148, "logits/rejected": -6.97154426574707, "logps/chosen": -179.28921508789062, "logps/ref_chosen": -80.68348693847656, "logps/ref_rejected": -85.89260864257812, "logps/rejected": -232.02542114257812, "loss": 0.9413, "rewards/accuracies": 0.78125, "rewards/chosen": -1.788151502609253, "rewards/margins": 0.8540264368057251, "rewards/rejected": -2.6421780586242676, "step": 346 }, { "epoch": 0.5095447870778267, "grad_norm": 14.986981391906738, "kl/avg_steps": 0.5625, "kl/beta": 0.01810397580265999, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -7.665702819824219, "logits/rejected": -7.370832920074463, "logps/chosen": -187.2706298828125, "logps/ref_chosen": -93.67938232421875, "logps/ref_rejected": -97.54251861572266, "logps/rejected": -237.28759765625, "loss": 0.9517, "rewards/accuracies": 0.765625, "rewards/chosen": -1.686805009841919, "rewards/margins": 0.8248369693756104, "rewards/rejected": -2.5116419792175293, "step": 347 }, { "epoch": 0.5110132158590308, "grad_norm": 13.239946365356445, "kl/avg_steps": 0.6875, "kl/beta": 0.01800270937383175, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.858096518347179e-07, "logits/chosen": -7.554695129394531, "logits/rejected": -7.127823829650879, "logps/chosen": -173.94375610351562, "logps/ref_chosen": -90.25224304199219, "logps/ref_rejected": -95.77027893066406, "logps/rejected": -228.76617431640625, "loss": 0.888, "rewards/accuracies": 0.828125, "rewards/chosen": -1.4998319149017334, "rewards/margins": 0.8769406676292419, "rewards/rejected": -2.376772403717041, "step": 348 }, { "epoch": 0.5124816446402349, "grad_norm": 13.889335632324219, "kl/avg_steps": 0.59375, "kl/beta": 0.0178797859698534, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.845390887379706e-07, "logits/chosen": -7.729681968688965, "logits/rejected": -7.328579902648926, "logps/chosen": -180.9906463623047, "logps/ref_chosen": -92.05809020996094, "logps/ref_rejected": -103.96527099609375, "logps/rejected": -241.1156463623047, "loss": 0.906, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5828135013580322, "rewards/margins": 0.8517956137657166, "rewards/rejected": -2.4346094131469727, "step": 349 }, { "epoch": 0.5139500734214391, "grad_norm": 15.606201171875, "kl/avg_steps": 0.59375, "kl/beta": 0.017774252220988274, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -7.431205749511719, "logits/rejected": -7.113109588623047, "logps/chosen": -183.2895050048828, "logps/ref_chosen": -87.05419921875, "logps/ref_rejected": -95.72193145751953, "logps/rejected": -232.35379028320312, "loss": 0.9483, "rewards/accuracies": 0.796875, "rewards/chosen": -1.7016650438308716, "rewards/margins": 0.7095808982849121, "rewards/rejected": -2.411245822906494, "step": 350 }, { "epoch": 0.5154185022026432, "grad_norm": 13.189620018005371, "kl/avg_steps": 0.65625, "kl/beta": 0.01766934059560299, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.819952656376487e-07, "logits/chosen": -8.007095336914062, "logits/rejected": -7.268853187561035, "logps/chosen": -181.0399169921875, "logps/ref_chosen": -89.16903686523438, "logps/ref_rejected": -108.29801940917969, "logps/rejected": -259.3538818359375, "loss": 0.8124, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6149550676345825, "rewards/margins": 1.0333154201507568, "rewards/rejected": -2.64827036857605, "step": 351 }, { "epoch": 0.5168869309838473, "grad_norm": 14.201725959777832, "kl/avg_steps": 0.5625, "kl/beta": 0.017554141581058502, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -7.791421890258789, "logits/rejected": -7.136171340942383, "logps/chosen": -183.04286193847656, "logps/ref_chosen": -96.81147766113281, "logps/ref_rejected": -84.40482330322266, "logps/rejected": -225.66738891601562, "loss": 0.8255, "rewards/accuracies": 0.828125, "rewards/chosen": -1.509531021118164, "rewards/margins": 0.955288290977478, "rewards/rejected": -2.4648194313049316, "step": 352 }, { "epoch": 0.5183553597650514, "grad_norm": 15.365931510925293, "kl/avg_steps": 0.40625, "kl/beta": 0.017455952242016792, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.794480701395219e-07, "logits/chosen": -7.713462829589844, "logits/rejected": -7.477531433105469, "logps/chosen": -183.22384643554688, "logps/ref_chosen": -85.36248016357422, "logps/ref_rejected": -87.36732482910156, "logps/rejected": -219.9366455078125, "loss": 1.0508, "rewards/accuracies": 0.734375, "rewards/chosen": -1.7041988372802734, "rewards/margins": 0.5970733761787415, "rewards/rejected": -2.301272392272949, "step": 353 }, { "epoch": 0.5198237885462555, "grad_norm": 12.326685905456543, "kl/avg_steps": 0.8125, "kl/beta": 0.017385324463248253, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.781732916288303e-07, "logits/chosen": -7.71798038482666, "logits/rejected": -7.087685585021973, "logps/chosen": -183.48825073242188, "logps/ref_chosen": -95.8011703491211, "logps/ref_rejected": -94.86614227294922, "logps/rejected": -236.8675537109375, "loss": 0.7853, "rewards/accuracies": 0.875, "rewards/chosen": -1.5137109756469727, "rewards/margins": 0.9342291951179504, "rewards/rejected": -2.4479401111602783, "step": 354 }, { "epoch": 0.5212922173274597, "grad_norm": 12.55382251739502, "kl/avg_steps": 0.6875, "kl/beta": 0.017245206981897354, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -7.723712921142578, "logits/rejected": -7.181628704071045, "logps/chosen": -172.11546325683594, "logps/ref_chosen": -86.185546875, "logps/ref_rejected": -88.4615707397461, "logps/rejected": -227.43336486816406, "loss": 0.8456, "rewards/accuracies": 0.828125, "rewards/chosen": -1.473813772201538, "rewards/margins": 0.904168426990509, "rewards/rejected": -2.3779821395874023, "step": 355 }, { "epoch": 0.5227606461086637, "grad_norm": 15.903926849365234, "kl/avg_steps": 0.40625, "kl/beta": 0.01712745428085327, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -7.718483924865723, "logits/rejected": -7.080849647521973, "logps/chosen": -191.98709106445312, "logps/ref_chosen": -90.818115234375, "logps/ref_rejected": -82.1273193359375, "logps/rejected": -222.52052307128906, "loss": 1.0336, "rewards/accuracies": 0.71875, "rewards/chosen": -1.7288204431533813, "rewards/margins": 0.6621589660644531, "rewards/rejected": -2.390979290008545, "step": 356 }, { "epoch": 0.5242290748898678, "grad_norm": 12.080792427062988, "kl/avg_steps": 0.65625, "kl/beta": 0.017058156430721283, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -7.890803337097168, "logits/rejected": -7.257773399353027, "logps/chosen": -171.85879516601562, "logps/ref_chosen": -87.50475311279297, "logps/ref_rejected": -92.24937438964844, "logps/rejected": -239.12313842773438, "loss": 0.7488, "rewards/accuracies": 0.875, "rewards/chosen": -1.4316484928131104, "rewards/margins": 1.055083990097046, "rewards/rejected": -2.4867324829101562, "step": 357 }, { "epoch": 0.5256975036710719, "grad_norm": 13.29261589050293, "kl/avg_steps": 0.546875, "kl/beta": 0.016946941614151, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.730670898658255e-07, "logits/chosen": -7.43376350402832, "logits/rejected": -6.966874122619629, "logps/chosen": -190.7314453125, "logps/ref_chosen": -95.05546569824219, "logps/ref_rejected": -96.47611999511719, "logps/rejected": -237.2618408203125, "loss": 0.9431, "rewards/accuracies": 0.765625, "rewards/chosen": -1.6134233474731445, "rewards/margins": 0.7554316520690918, "rewards/rejected": -2.3688549995422363, "step": 358 }, { "epoch": 0.527165932452276, "grad_norm": 15.180391311645508, "kl/avg_steps": 0.5, "kl/beta": 0.016854766756296158, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.717889356869146e-07, "logits/chosen": -7.312591552734375, "logits/rejected": -6.956157684326172, "logps/chosen": -192.33929443359375, "logps/ref_chosen": -87.37332916259766, "logps/ref_rejected": -88.77188110351562, "logps/rejected": -236.61764526367188, "loss": 1.0136, "rewards/accuracies": 0.71875, "rewards/chosen": -1.762831687927246, "rewards/margins": 0.7133128046989441, "rewards/rejected": -2.476144313812256, "step": 359 }, { "epoch": 0.5286343612334802, "grad_norm": 14.518013954162598, "kl/avg_steps": 0.65625, "kl/beta": 0.01677091233432293, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -7.531591415405273, "logits/rejected": -6.866611480712891, "logps/chosen": -175.69329833984375, "logps/ref_chosen": -75.22344970703125, "logps/ref_rejected": -76.093994140625, "logps/rejected": -229.7479248046875, "loss": 0.8437, "rewards/accuracies": 0.859375, "rewards/chosen": -1.6761372089385986, "rewards/margins": 0.8820022344589233, "rewards/rejected": -2.5581398010253906, "step": 360 }, { "epoch": 0.5301027900146843, "grad_norm": 14.980257987976074, "kl/avg_steps": 0.5625, "kl/beta": 0.016661571338772774, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -7.7249908447265625, "logits/rejected": -7.067028999328613, "logps/chosen": -183.44650268554688, "logps/ref_chosen": -81.95333099365234, "logps/ref_rejected": -96.67919921875, "logps/rejected": -245.79881286621094, "loss": 0.9577, "rewards/accuracies": 0.78125, "rewards/chosen": -1.684104561805725, "rewards/margins": 0.7838806509971619, "rewards/rejected": -2.467985153198242, "step": 361 }, { "epoch": 0.5315712187958884, "grad_norm": 12.824585914611816, "kl/avg_steps": 0.6875, "kl/beta": 0.016568373888731003, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.679511629948319e-07, "logits/chosen": -7.730887413024902, "logits/rejected": -7.379080772399902, "logps/chosen": -192.10179138183594, "logps/ref_chosen": -89.33971405029297, "logps/ref_rejected": -110.5019760131836, "logps/rejected": -265.67584228515625, "loss": 0.8735, "rewards/accuracies": 0.859375, "rewards/chosen": -1.6923720836639404, "rewards/margins": 0.8581586480140686, "rewards/rejected": -2.5505309104919434, "step": 362 }, { "epoch": 0.5330396475770925, "grad_norm": 12.172687530517578, "kl/avg_steps": 0.6875, "kl/beta": 0.01645524427294731, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -7.627842903137207, "logits/rejected": -7.226386070251465, "logps/chosen": -177.34544372558594, "logps/ref_chosen": -80.7750015258789, "logps/ref_rejected": -79.43267822265625, "logps/rejected": -243.47921752929688, "loss": 0.7656, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5798821449279785, "rewards/margins": 1.0982314348220825, "rewards/rejected": -2.6781134605407715, "step": 363 }, { "epoch": 0.5345080763582967, "grad_norm": 14.342798233032227, "kl/avg_steps": 0.625, "kl/beta": 0.01634288765490055, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -7.7837934494018555, "logits/rejected": -6.654484748840332, "logps/chosen": -174.86444091796875, "logps/ref_chosen": -76.52960205078125, "logps/ref_rejected": -97.56173706054688, "logps/rejected": -247.8596649169922, "loss": 0.8462, "rewards/accuracies": 0.796875, "rewards/chosen": -1.5991710424423218, "rewards/margins": 0.8397619724273682, "rewards/rejected": -2.4389328956604004, "step": 364 }, { "epoch": 0.5359765051395007, "grad_norm": 13.283844947814941, "kl/avg_steps": 0.65625, "kl/beta": 0.01624137908220291, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.641091331089811e-07, "logits/chosen": -7.504325866699219, "logits/rejected": -7.238739967346191, "logps/chosen": -178.31759643554688, "logps/ref_chosen": -80.54489135742188, "logps/ref_rejected": -99.32316589355469, "logps/rejected": -254.62168884277344, "loss": 0.8603, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5808205604553223, "rewards/margins": 0.9229766130447388, "rewards/rejected": -2.5037970542907715, "step": 365 }, { "epoch": 0.5374449339207048, "grad_norm": 12.054483413696289, "kl/avg_steps": 0.71875, "kl/beta": 0.01613548956811428, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -7.318140983581543, "logits/rejected": -6.966005325317383, "logps/chosen": -168.7039337158203, "logps/ref_chosen": -72.23806762695312, "logps/ref_rejected": -91.67182922363281, "logps/rejected": -242.06005859375, "loss": 0.86, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5466276407241821, "rewards/margins": 0.8613663911819458, "rewards/rejected": -2.407994031906128, "step": 366 }, { "epoch": 0.5389133627019089, "grad_norm": 12.747227668762207, "kl/avg_steps": 0.828125, "kl/beta": 0.016020342707633972, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.615458646614349e-07, "logits/chosen": -7.577718734741211, "logits/rejected": -7.4753642082214355, "logps/chosen": -179.722412109375, "logps/ref_chosen": -85.42201232910156, "logps/ref_rejected": -83.51779174804688, "logps/rejected": -234.3914031982422, "loss": 0.8254, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4997714757919312, "rewards/margins": 0.896426796913147, "rewards/rejected": -2.396198272705078, "step": 367 }, { "epoch": 0.540381791483113, "grad_norm": 14.037300109863281, "kl/avg_steps": 0.625, "kl/beta": 0.015888763591647148, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -7.514585494995117, "logits/rejected": -7.210457801818848, "logps/chosen": -165.94821166992188, "logps/ref_chosen": -73.84112548828125, "logps/ref_rejected": -104.69528198242188, "logps/rejected": -256.14404296875, "loss": 0.8403, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4564285278320312, "rewards/margins": 0.9321895241737366, "rewards/rejected": -2.388618230819702, "step": 368 }, { "epoch": 0.5418502202643172, "grad_norm": 13.097122192382812, "kl/avg_steps": 0.75, "kl/beta": 0.015790076926350594, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.589813792448196e-07, "logits/chosen": -7.481395721435547, "logits/rejected": -6.605921745300293, "logps/chosen": -196.92771911621094, "logps/ref_chosen": -98.46196746826172, "logps/ref_rejected": -99.15672302246094, "logps/rejected": -256.7950439453125, "loss": 0.7803, "rewards/accuracies": 0.875, "rewards/chosen": -1.545214056968689, "rewards/margins": 0.9250272512435913, "rewards/rejected": -2.4702415466308594, "step": 369 }, { "epoch": 0.5433186490455213, "grad_norm": 13.142091751098633, "kl/avg_steps": 0.40625, "kl/beta": 0.015672532841563225, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -7.735294342041016, "logits/rejected": -7.384430885314941, "logps/chosen": -204.67718505859375, "logps/ref_chosen": -96.77574157714844, "logps/ref_rejected": -102.74411010742188, "logps/rejected": -253.85340881347656, "loss": 1.002, "rewards/accuracies": 0.796875, "rewards/chosen": -1.687301754951477, "rewards/margins": 0.6685256361961365, "rewards/rejected": -2.3558273315429688, "step": 370 }, { "epoch": 0.5447870778267254, "grad_norm": 14.304271697998047, "kl/avg_steps": 0.46875, "kl/beta": 0.015609120018780231, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -7.651013374328613, "logits/rejected": -7.241464614868164, "logps/chosen": -206.90415954589844, "logps/ref_chosen": -102.99020385742188, "logps/ref_rejected": -105.35063171386719, "logps/rejected": -252.56182861328125, "loss": 1.0035, "rewards/accuracies": 0.734375, "rewards/chosen": -1.6160762310028076, "rewards/margins": 0.6670252084732056, "rewards/rejected": -2.2831013202667236, "step": 371 }, { "epoch": 0.5462555066079295, "grad_norm": 11.53231430053711, "kl/avg_steps": 0.8125, "kl/beta": 0.0155362943187356, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 2.551329606220976e-07, "logits/chosen": -7.6374592781066895, "logits/rejected": -7.359759330749512, "logps/chosen": -181.00729370117188, "logps/ref_chosen": -91.02623748779297, "logps/ref_rejected": -85.6102294921875, "logps/rejected": -244.81317138671875, "loss": 0.7449, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3880116939544678, "rewards/margins": 1.0640125274658203, "rewards/rejected": -2.452024221420288, "step": 372 }, { "epoch": 0.5477239353891337, "grad_norm": 14.019618034362793, "kl/avg_steps": 0.625, "kl/beta": 0.015411078929901123, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.538498388222517e-07, "logits/chosen": -7.477275848388672, "logits/rejected": -7.463842868804932, "logps/chosen": -201.16326904296875, "logps/ref_chosen": -100.07307434082031, "logps/ref_rejected": -91.11634826660156, "logps/rejected": -247.42393493652344, "loss": 0.8783, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5517187118530273, "rewards/margins": 0.8413230180740356, "rewards/rejected": -2.3930416107177734, "step": 373 }, { "epoch": 0.5491923641703378, "grad_norm": 11.548641204833984, "kl/avg_steps": 0.6875, "kl/beta": 0.015315357595682144, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.525666155755725e-07, "logits/chosen": -7.55961799621582, "logits/rejected": -7.308844089508057, "logps/chosen": -188.9845733642578, "logps/ref_chosen": -98.01234436035156, "logps/ref_rejected": -100.75224304199219, "logps/rejected": -249.3828582763672, "loss": 0.8132, "rewards/accuracies": 0.84375, "rewards/chosen": -1.3845758438110352, "rewards/margins": 0.8738114833831787, "rewards/rejected": -2.258387327194214, "step": 374 }, { "epoch": 0.5506607929515418, "grad_norm": 12.984651565551758, "kl/avg_steps": 0.71875, "kl/beta": 0.015210784040391445, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.512833246961859e-07, "logits/chosen": -7.9140520095825195, "logits/rejected": -7.584393501281738, "logps/chosen": -180.4878387451172, "logps/ref_chosen": -84.39627838134766, "logps/ref_rejected": -95.7188491821289, "logps/rejected": -254.66061401367188, "loss": 0.8142, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4533175230026245, "rewards/margins": 0.9453805685043335, "rewards/rejected": -2.398698091506958, "step": 375 }, { "epoch": 0.5521292217327459, "grad_norm": 12.451373100280762, "kl/avg_steps": 0.75, "kl/beta": 0.015102236531674862, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.5e-07, "logits/chosen": -7.593746662139893, "logits/rejected": -7.532279014587402, "logps/chosen": -187.0894775390625, "logps/ref_chosen": -95.73770141601562, "logps/ref_rejected": -111.97331237792969, "logps/rejected": -271.1904296875, "loss": 0.7789, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3709272146224976, "rewards/margins": 1.0137239694595337, "rewards/rejected": -2.3846511840820312, "step": 376 }, { "epoch": 0.55359765051395, "grad_norm": 14.200995445251465, "kl/avg_steps": 0.6875, "kl/beta": 0.014989812858402729, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.487166753038141e-07, "logits/chosen": -7.635054588317871, "logits/rejected": -7.0941009521484375, "logps/chosen": -178.06936645507812, "logps/ref_chosen": -78.92694854736328, "logps/ref_rejected": -105.89892578125, "logps/rejected": -275.2508544921875, "loss": 0.7926, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4772100448608398, "rewards/margins": 1.0408310890197754, "rewards/rejected": -2.5180411338806152, "step": 377 }, { "epoch": 0.5550660792951542, "grad_norm": 13.540968894958496, "kl/avg_steps": 0.6875, "kl/beta": 0.014887461438775063, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -7.783210754394531, "logits/rejected": -7.034327507019043, "logps/chosen": -172.66668701171875, "logps/ref_chosen": -75.37240600585938, "logps/ref_rejected": -93.70409393310547, "logps/rejected": -250.10272216796875, "loss": 0.8581, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4412063360214233, "rewards/margins": 0.869320273399353, "rewards/rejected": -2.3105268478393555, "step": 378 }, { "epoch": 0.5565345080763583, "grad_norm": 12.696690559387207, "kl/avg_steps": 0.53125, "kl/beta": 0.014785809442400932, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.461501611777483e-07, "logits/chosen": -7.8199920654296875, "logits/rejected": -6.8525214195251465, "logps/chosen": -195.7200927734375, "logps/ref_chosen": -88.85568237304688, "logps/ref_rejected": -121.38114929199219, "logps/rejected": -284.5279541015625, "loss": 0.9442, "rewards/accuracies": 0.75, "rewards/chosen": -1.5753943920135498, "rewards/margins": 0.8215144276618958, "rewards/rejected": -2.396908760070801, "step": 379 }, { "epoch": 0.5580029368575624, "grad_norm": 14.155634880065918, "kl/avg_steps": 0.65625, "kl/beta": 0.014707674272358418, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -7.855627059936523, "logits/rejected": -7.288210868835449, "logps/chosen": -197.644775390625, "logps/ref_chosen": -79.83321380615234, "logps/ref_rejected": -110.74475860595703, "logps/rejected": -291.2830810546875, "loss": 0.8692, "rewards/accuracies": 0.828125, "rewards/chosen": -1.7239048480987549, "rewards/margins": 0.9118264317512512, "rewards/rejected": -2.6357314586639404, "step": 380 }, { "epoch": 0.5594713656387665, "grad_norm": 14.164735794067383, "kl/avg_steps": 0.625, "kl/beta": 0.014611784368753433, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.435840528363426e-07, "logits/chosen": -7.805541038513184, "logits/rejected": -6.962313652038574, "logps/chosen": -190.10198974609375, "logps/ref_chosen": -86.56683349609375, "logps/ref_rejected": -84.416748046875, "logps/rejected": -243.78826904296875, "loss": 0.8887, "rewards/accuracies": 0.828125, "rewards/chosen": -1.5055629014968872, "rewards/margins": 0.8068733811378479, "rewards/rejected": -2.312436580657959, "step": 381 }, { "epoch": 0.5609397944199707, "grad_norm": 12.716059684753418, "kl/avg_steps": 0.59375, "kl/beta": 0.01452102791517973, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -7.854497909545898, "logits/rejected": -7.399716377258301, "logps/chosen": -207.37265014648438, "logps/ref_chosen": -104.46180725097656, "logps/ref_rejected": -114.60223388671875, "logps/rejected": -278.6050109863281, "loss": 0.8502, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4875770807266235, "rewards/margins": 0.8772394061088562, "rewards/rejected": -2.364816427230835, "step": 382 }, { "epoch": 0.5624082232011748, "grad_norm": 13.573335647583008, "kl/avg_steps": 0.4375, "kl/beta": 0.014435318298637867, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -7.561490058898926, "logits/rejected": -7.388404369354248, "logps/chosen": -192.78128051757812, "logps/ref_chosen": -82.11607360839844, "logps/ref_rejected": -98.77716064453125, "logps/rejected": -262.2416076660156, "loss": 0.927, "rewards/accuracies": 0.78125, "rewards/chosen": -1.593533992767334, "rewards/margins": 0.7530485987663269, "rewards/rejected": -2.3465826511383057, "step": 383 }, { "epoch": 0.5638766519823789, "grad_norm": 14.46293830871582, "kl/avg_steps": 0.625, "kl/beta": 0.014372439123690128, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.397362428170992e-07, "logits/chosen": -7.834491729736328, "logits/rejected": -7.562644004821777, "logps/chosen": -197.14675903320312, "logps/ref_chosen": -80.58525085449219, "logps/ref_rejected": -91.44789123535156, "logps/rejected": -262.77398681640625, "loss": 0.9637, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6679697036743164, "rewards/margins": 0.7766201496124268, "rewards/rejected": -2.444589853286743, "step": 384 }, { "epoch": 0.5653450807635829, "grad_norm": 13.538987159729004, "kl/avg_steps": 0.546875, "kl/beta": 0.01428316906094551, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -7.669012069702148, "logits/rejected": -7.240923881530762, "logps/chosen": -192.1517333984375, "logps/ref_chosen": -92.76016235351562, "logps/ref_rejected": -84.45997619628906, "logps/rejected": -247.5975341796875, "loss": 0.8706, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4141170978546143, "rewards/margins": 0.8999189138412476, "rewards/rejected": -2.3140358924865723, "step": 385 }, { "epoch": 0.566813509544787, "grad_norm": 11.718225479125977, "kl/avg_steps": 0.65625, "kl/beta": 0.01420548278838396, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -7.931042671203613, "logits/rejected": -7.277120590209961, "logps/chosen": -188.76788330078125, "logps/ref_chosen": -85.35896301269531, "logps/ref_rejected": -96.99065399169922, "logps/rejected": -262.10107421875, "loss": 0.8706, "rewards/accuracies": 0.84375, "rewards/chosen": -1.461796522140503, "rewards/margins": 0.8657737970352173, "rewards/rejected": -2.3275704383850098, "step": 386 }, { "epoch": 0.5682819383259912, "grad_norm": 12.556093215942383, "kl/avg_steps": 0.71875, "kl/beta": 0.014112867414951324, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -7.733986854553223, "logits/rejected": -6.640727519989014, "logps/chosen": -217.92800903320312, "logps/ref_chosen": -102.68814086914062, "logps/ref_rejected": -99.42858123779297, "logps/rejected": -274.15191650390625, "loss": 0.8604, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6170012950897217, "rewards/margins": 0.8299874067306519, "rewards/rejected": -2.446988582611084, "step": 387 }, { "epoch": 0.5697503671071953, "grad_norm": 12.432695388793945, "kl/avg_steps": 0.6875, "kl/beta": 0.014012155123054981, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -7.944313049316406, "logits/rejected": -7.454949855804443, "logps/chosen": -197.45010375976562, "logps/ref_chosen": -83.58523559570312, "logps/ref_rejected": -110.10037231445312, "logps/rejected": -292.4563293457031, "loss": 0.7893, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5865106582641602, "rewards/margins": 0.9495990872383118, "rewards/rejected": -2.5361099243164062, "step": 388 }, { "epoch": 0.5712187958883994, "grad_norm": 13.698887825012207, "kl/avg_steps": 0.53125, "kl/beta": 0.013916479423642159, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -7.64278507232666, "logits/rejected": -7.23593807220459, "logps/chosen": -209.8504180908203, "logps/ref_chosen": -89.822265625, "logps/ref_rejected": -84.85377502441406, "logps/rejected": -264.9825134277344, "loss": 0.9298, "rewards/accuracies": 0.78125, "rewards/chosen": -1.663960337638855, "rewards/margins": 0.8260251879692078, "rewards/rejected": -2.489985704421997, "step": 389 }, { "epoch": 0.5726872246696035, "grad_norm": 12.986654281616211, "kl/avg_steps": 0.46875, "kl/beta": 0.013842938467860222, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.320488370051681e-07, "logits/chosen": -7.480320453643799, "logits/rejected": -7.105579853057861, "logps/chosen": -198.42169189453125, "logps/ref_chosen": -77.53244018554688, "logps/ref_rejected": -92.58392333984375, "logps/rejected": -268.6641540527344, "loss": 0.971, "rewards/accuracies": 0.734375, "rewards/chosen": -1.6698600053787231, "rewards/margins": 0.7543532252311707, "rewards/rejected": -2.424213171005249, "step": 390 }, { "epoch": 0.5741556534508077, "grad_norm": 13.51944351196289, "kl/avg_steps": 0.59375, "kl/beta": 0.013778353109955788, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -7.788546085357666, "logits/rejected": -6.940990447998047, "logps/chosen": -198.9241943359375, "logps/ref_chosen": -88.2329330444336, "logps/ref_rejected": -93.00756072998047, "logps/rejected": -264.8871765136719, "loss": 0.9043, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5192664861679077, "rewards/margins": 0.8330180048942566, "rewards/rejected": -2.3522844314575195, "step": 391 }, { "epoch": 0.5756240822320118, "grad_norm": 13.640396118164062, "kl/avg_steps": 0.65625, "kl/beta": 0.013697026297450066, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.294897926507156e-07, "logits/chosen": -7.636374473571777, "logits/rejected": -7.049764156341553, "logps/chosen": -198.85385131835938, "logps/ref_chosen": -87.74441528320312, "logps/ref_rejected": -108.57071685791016, "logps/rejected": -287.69525146484375, "loss": 0.8565, "rewards/accuracies": 0.84375, "rewards/chosen": -1.5143227577209473, "rewards/margins": 0.9206820726394653, "rewards/rejected": -2.435004711151123, "step": 392 }, { "epoch": 0.5770925110132159, "grad_norm": 11.787603378295898, "kl/avg_steps": 0.625, "kl/beta": 0.013607725501060486, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -7.288888931274414, "logits/rejected": -6.870842933654785, "logps/chosen": -176.33932495117188, "logps/ref_chosen": -76.56381225585938, "logps/ref_rejected": -79.72618865966797, "logps/rejected": -242.21641540527344, "loss": 0.8682, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3515516519546509, "rewards/margins": 0.8435693979263306, "rewards/rejected": -2.1951210498809814, "step": 393 }, { "epoch": 0.57856093979442, "grad_norm": 12.555571556091309, "kl/avg_steps": 0.703125, "kl/beta": 0.013523206114768982, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -7.870112895965576, "logits/rejected": -7.220252513885498, "logps/chosen": -188.681640625, "logps/ref_chosen": -84.69866180419922, "logps/ref_rejected": -97.457763671875, "logps/rejected": -268.5348815917969, "loss": 0.8111, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3974303007125854, "rewards/margins": 0.8974814414978027, "rewards/rejected": -2.2949118614196777, "step": 394 }, { "epoch": 0.580029368575624, "grad_norm": 14.517809867858887, "kl/avg_steps": 0.5625, "kl/beta": 0.013428784906864166, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -7.730129718780518, "logits/rejected": -7.53689432144165, "logps/chosen": -195.88299560546875, "logps/ref_chosen": -88.88763427734375, "logps/ref_rejected": -91.0455322265625, "logps/rejected": -254.3079376220703, "loss": 1.0664, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4323430061340332, "rewards/margins": 0.7445914149284363, "rewards/rejected": -2.1769344806671143, "step": 395 }, { "epoch": 0.5814977973568282, "grad_norm": 16.962797164916992, "kl/avg_steps": 0.40625, "kl/beta": 0.01335367001593113, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -7.620447158813477, "logits/rejected": -7.031801223754883, "logps/chosen": -218.27194213867188, "logps/ref_chosen": -102.47132873535156, "logps/ref_rejected": -93.23478698730469, "logps/rejected": -254.623046875, "loss": 1.0318, "rewards/accuracies": 0.734375, "rewards/chosen": -1.542496919631958, "rewards/margins": 0.6003559827804565, "rewards/rejected": -2.142852783203125, "step": 396 }, { "epoch": 0.5829662261380323, "grad_norm": 13.833749771118164, "kl/avg_steps": 0.578125, "kl/beta": 0.013299640268087387, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -8.157541275024414, "logits/rejected": -7.536031246185303, "logps/chosen": -204.63792419433594, "logps/ref_chosen": -101.04441833496094, "logps/ref_rejected": -108.8511734008789, "logps/rejected": -296.58837890625, "loss": 0.794, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3734837770462036, "rewards/margins": 1.1063416004180908, "rewards/rejected": -2.479825496673584, "step": 397 }, { "epoch": 0.5844346549192364, "grad_norm": 13.908939361572266, "kl/avg_steps": 0.65625, "kl/beta": 0.01322319358587265, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -7.908001899719238, "logits/rejected": -7.516959190368652, "logps/chosen": -204.48220825195312, "logps/ref_chosen": -95.04922485351562, "logps/ref_rejected": -114.00563049316406, "logps/rejected": -286.8526611328125, "loss": 0.9283, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4388189315795898, "rewards/margins": 0.82853102684021, "rewards/rejected": -2.2673499584198, "step": 398 }, { "epoch": 0.5859030837004405, "grad_norm": 12.327967643737793, "kl/avg_steps": 0.71875, "kl/beta": 0.013136982917785645, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -7.728894233703613, "logits/rejected": -7.285490989685059, "logps/chosen": -196.3619384765625, "logps/ref_chosen": -96.63853454589844, "logps/ref_rejected": -83.53645324707031, "logps/rejected": -262.048828125, "loss": 0.7418, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3026535511016846, "rewards/margins": 1.0243358612060547, "rewards/rejected": -2.3269894123077393, "step": 399 }, { "epoch": 0.5873715124816447, "grad_norm": 14.875067710876465, "kl/avg_steps": 0.6875, "kl/beta": 0.013043234124779701, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.192779273338215e-07, "logits/chosen": -8.027582168579102, "logits/rejected": -7.236846923828125, "logps/chosen": -204.7100067138672, "logps/ref_chosen": -105.30314636230469, "logps/ref_rejected": -108.6555404663086, "logps/rejected": -291.34063720703125, "loss": 0.7868, "rewards/accuracies": 0.859375, "rewards/chosen": -1.2911700010299683, "rewards/margins": 1.073652744293213, "rewards/rejected": -2.3648228645324707, "step": 400 }, { "epoch": 0.5873715124816447, "eval_kl/n_epsilon_steps": 0.23416095972061157, "eval_kl/p_epsilon_steps": 0.7645547986030579, "eval_logits/chosen": -7.477985382080078, "eval_logits/rejected": -7.118831157684326, "eval_logps/chosen": -222.89894104003906, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -269.13983154296875, "eval_loss": 0.49379390478134155, "eval_rewards/accuracies": 0.7791095972061157, "eval_rewards/chosen": -1.5799260139465332, "eval_rewards/margins": 0.673462986946106, "eval_rewards/rejected": -2.2533888816833496, "eval_runtime": 48.5073, "eval_samples_per_second": 48.22, "eval_steps_per_second": 1.526, "step": 400 }, { "epoch": 0.5888399412628488, "grad_norm": 15.558944702148438, "kl/avg_steps": 0.375, "kl/beta": 0.012954174540936947, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -7.620119094848633, "logits/rejected": -7.406888961791992, "logps/chosen": -197.28976440429688, "logps/ref_chosen": -84.28916931152344, "logps/ref_rejected": -90.943115234375, "logps/rejected": -255.7905731201172, "loss": 1.0429, "rewards/accuracies": 0.703125, "rewards/chosen": -1.4610803127288818, "rewards/margins": 0.6627606749534607, "rewards/rejected": -2.123840808868408, "step": 401 }, { "epoch": 0.5903083700440529, "grad_norm": 12.638237953186035, "kl/avg_steps": 0.78125, "kl/beta": 0.012905777432024479, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -7.716352939605713, "logits/rejected": -7.249260902404785, "logps/chosen": -185.138916015625, "logps/ref_chosen": -83.59312438964844, "logps/ref_rejected": -87.81027221679688, "logps/rejected": -284.30706787109375, "loss": 0.6486, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3018509149551392, "rewards/margins": 1.2127395868301392, "rewards/rejected": -2.5145905017852783, "step": 402 }, { "epoch": 0.591776798825257, "grad_norm": 11.707761764526367, "kl/avg_steps": 0.71875, "kl/beta": 0.012805732898414135, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.154609112620295e-07, "logits/chosen": -7.694974899291992, "logits/rejected": -6.938326835632324, "logps/chosen": -179.35401916503906, "logps/ref_chosen": -73.75308227539062, "logps/ref_rejected": -83.92012786865234, "logps/rejected": -260.35247802734375, "loss": 0.8488, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3442716598510742, "rewards/margins": 0.8966612219810486, "rewards/rejected": -2.2409329414367676, "step": 403 }, { "epoch": 0.593245227606461, "grad_norm": 17.042625427246094, "kl/avg_steps": 0.59375, "kl/beta": 0.01271434873342514, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -7.690559387207031, "logits/rejected": -7.393105983734131, "logps/chosen": -203.4947509765625, "logps/ref_chosen": -79.67617797851562, "logps/ref_rejected": -84.280517578125, "logps/rejected": -271.59332275390625, "loss": 0.9664, "rewards/accuracies": 0.84375, "rewards/chosen": -1.567918300628662, "rewards/margins": 0.7968522906303406, "rewards/rejected": -2.3647704124450684, "step": 404 }, { "epoch": 0.5947136563876652, "grad_norm": 11.477922439575195, "kl/avg_steps": 0.65625, "kl/beta": 0.012639302760362625, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.129207286861638e-07, "logits/chosen": -7.6790571212768555, "logits/rejected": -7.126956462860107, "logps/chosen": -224.55743408203125, "logps/ref_chosen": -96.46195220947266, "logps/ref_rejected": -92.87071228027344, "logps/rejected": -283.11083984375, "loss": 0.8803, "rewards/accuracies": 0.84375, "rewards/chosen": -1.6104973554611206, "rewards/margins": 0.7770987749099731, "rewards/rejected": -2.3875961303710938, "step": 405 }, { "epoch": 0.5961820851688693, "grad_norm": 16.791955947875977, "kl/avg_steps": 0.59375, "kl/beta": 0.012556898407638073, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -7.518543720245361, "logits/rejected": -7.182583808898926, "logps/chosen": -199.13026428222656, "logps/ref_chosen": -78.13396453857422, "logps/ref_rejected": -98.28359985351562, "logps/rejected": -276.6565246582031, "loss": 0.9891, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5131518840789795, "rewards/margins": 0.7109516859054565, "rewards/rejected": -2.2241034507751465, "step": 406 }, { "epoch": 0.5976505139500734, "grad_norm": 14.285913467407227, "kl/avg_steps": 0.5625, "kl/beta": 0.012482781894505024, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -7.507756233215332, "logits/rejected": -6.921146392822266, "logps/chosen": -212.05181884765625, "logps/ref_chosen": -84.01283264160156, "logps/ref_rejected": -82.78103637695312, "logps/rejected": -264.0316162109375, "loss": 0.9939, "rewards/accuracies": 0.859375, "rewards/chosen": -1.5919959545135498, "rewards/margins": 0.6562482118606567, "rewards/rejected": -2.248244285583496, "step": 407 }, { "epoch": 0.5991189427312775, "grad_norm": 13.731730461120605, "kl/avg_steps": 0.421875, "kl/beta": 0.012412958778440952, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -7.530098915100098, "logits/rejected": -7.0652008056640625, "logps/chosen": -224.00033569335938, "logps/ref_chosen": -104.46175384521484, "logps/ref_rejected": -96.37218475341797, "logps/rejected": -270.20428466796875, "loss": 1.011, "rewards/accuracies": 0.765625, "rewards/chosen": -1.480905294418335, "rewards/margins": 0.6653140783309937, "rewards/rejected": -2.146219253540039, "step": 408 }, { "epoch": 0.6005873715124816, "grad_norm": 12.978145599365234, "kl/avg_steps": 0.5625, "kl/beta": 0.012360811233520508, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -7.490418434143066, "logits/rejected": -7.061441898345947, "logps/chosen": -229.72506713867188, "logps/ref_chosen": -97.66830444335938, "logps/ref_rejected": -90.04584503173828, "logps/rejected": -275.54852294921875, "loss": 1.0303, "rewards/accuracies": 0.828125, "rewards/chosen": -1.6262993812561035, "rewards/margins": 0.6515868306159973, "rewards/rejected": -2.277886152267456, "step": 409 }, { "epoch": 0.6020558002936858, "grad_norm": 12.724774360656738, "kl/avg_steps": 0.578125, "kl/beta": 0.01229167077690363, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.065879555832674e-07, "logits/chosen": -7.461033821105957, "logits/rejected": -7.218528747558594, "logps/chosen": -205.697265625, "logps/ref_chosen": -76.46923828125, "logps/ref_rejected": -88.64064025878906, "logps/rejected": -275.3063049316406, "loss": 0.9507, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5810291767120361, "rewards/margins": 0.697364091873169, "rewards/rejected": -2.278393268585205, "step": 410 }, { "epoch": 0.6035242290748899, "grad_norm": 11.978557586669922, "kl/avg_steps": 0.53125, "kl/beta": 0.012221017852425575, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -7.684117317199707, "logits/rejected": -7.11949348449707, "logps/chosen": -221.50474548339844, "logps/ref_chosen": -87.16630554199219, "logps/ref_rejected": -87.09603118896484, "logps/rejected": -281.2148742675781, "loss": 0.9201, "rewards/accuracies": 0.796875, "rewards/chosen": -1.6353148221969604, "rewards/margins": 0.7224369049072266, "rewards/rejected": -2.3577518463134766, "step": 411 }, { "epoch": 0.604992657856094, "grad_norm": 13.545778274536133, "kl/avg_steps": 0.46875, "kl/beta": 0.012156437151134014, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -7.71500301361084, "logits/rejected": -6.960906982421875, "logps/chosen": -205.3089599609375, "logps/ref_chosen": -78.94734191894531, "logps/ref_rejected": -106.10554504394531, "logps/rejected": -289.611572265625, "loss": 1.0053, "rewards/accuracies": 0.71875, "rewards/chosen": -1.5305845737457275, "rewards/margins": 0.6853532791137695, "rewards/rejected": -2.215937614440918, "step": 412 }, { "epoch": 0.6064610866372981, "grad_norm": 12.726061820983887, "kl/avg_steps": 0.71875, "kl/beta": 0.012099719606339931, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -7.195650100708008, "logits/rejected": -7.105502128601074, "logps/chosen": -232.97862243652344, "logps/ref_chosen": -95.69471740722656, "logps/ref_rejected": -107.96085357666016, "logps/rejected": -308.6137390136719, "loss": 0.9401, "rewards/accuracies": 0.84375, "rewards/chosen": -1.650810956954956, "rewards/margins": 0.7573432922363281, "rewards/rejected": -2.4081544876098633, "step": 413 }, { "epoch": 0.6079295154185022, "grad_norm": 16.81880760192871, "kl/avg_steps": 0.53125, "kl/beta": 0.012013372965157032, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -7.222269058227539, "logits/rejected": -6.936489105224609, "logps/chosen": -214.6531524658203, "logps/ref_chosen": -88.27667236328125, "logps/ref_rejected": -92.87004089355469, "logps/rejected": -278.8741455078125, "loss": 0.9978, "rewards/accuracies": 0.765625, "rewards/chosen": -1.5135799646377563, "rewards/margins": 0.7068269848823547, "rewards/rejected": -2.220407009124756, "step": 414 }, { "epoch": 0.6093979441997063, "grad_norm": 13.74764633178711, "kl/avg_steps": 0.5625, "kl/beta": 0.011949889361858368, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.002837796667909e-07, "logits/chosen": -7.991650104522705, "logits/rejected": -7.442322731018066, "logps/chosen": -242.48207092285156, "logps/ref_chosen": -108.91590118408203, "logps/ref_rejected": -107.47135925292969, "logps/rejected": -295.0639953613281, "loss": 1.0108, "rewards/accuracies": 0.796875, "rewards/chosen": -1.5901520252227783, "rewards/margins": 0.6369077563285828, "rewards/rejected": -2.227059841156006, "step": 415 }, { "epoch": 0.6108663729809104, "grad_norm": 11.695809364318848, "kl/avg_steps": 0.5625, "kl/beta": 0.01188304740935564, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.990267419549914e-07, "logits/chosen": -7.476058006286621, "logits/rejected": -7.21858024597168, "logps/chosen": -224.93905639648438, "logps/ref_chosen": -93.39888000488281, "logps/ref_rejected": -97.6729736328125, "logps/rejected": -302.91412353515625, "loss": 0.875, "rewards/accuracies": 0.8125, "rewards/chosen": -1.556352138519287, "rewards/margins": 0.8654446601867676, "rewards/rejected": -2.4217967987060547, "step": 416 }, { "epoch": 0.6123348017621145, "grad_norm": 12.070055961608887, "kl/avg_steps": 0.6875, "kl/beta": 0.011816578917205334, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -7.844758033752441, "logits/rejected": -6.70277214050293, "logps/chosen": -208.16903686523438, "logps/ref_chosen": -83.53533172607422, "logps/ref_rejected": -74.44184112548828, "logps/rejected": -266.1756591796875, "loss": 0.8896, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4651763439178467, "rewards/margins": 0.7836226224899292, "rewards/rejected": -2.2487990856170654, "step": 417 }, { "epoch": 0.6138032305433186, "grad_norm": 13.669163703918457, "kl/avg_steps": 0.84375, "kl/beta": 0.011735894717276096, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.965167291983757e-07, "logits/chosen": -7.78868293762207, "logits/rejected": -7.488645553588867, "logps/chosen": -225.50465393066406, "logps/ref_chosen": -108.22152709960938, "logps/ref_rejected": -111.8646469116211, "logps/rejected": -311.86407470703125, "loss": 0.7851, "rewards/accuracies": 0.921875, "rewards/chosen": -1.3652478456497192, "rewards/margins": 0.9605357646942139, "rewards/rejected": -2.3257837295532227, "step": 418 }, { "epoch": 0.6152716593245228, "grad_norm": 11.460182189941406, "kl/avg_steps": 0.71875, "kl/beta": 0.011637701652944088, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -7.678118705749512, "logits/rejected": -7.43233585357666, "logps/chosen": -223.0901641845703, "logps/ref_chosen": -97.18328094482422, "logps/ref_rejected": -98.18531799316406, "logps/rejected": -296.13623046875, "loss": 0.8759, "rewards/accuracies": 0.84375, "rewards/chosen": -1.4567546844482422, "rewards/margins": 0.8287121057510376, "rewards/rejected": -2.2854669094085693, "step": 419 }, { "epoch": 0.6167400881057269, "grad_norm": 14.639280319213867, "kl/avg_steps": 0.5625, "kl/beta": 0.011554652824997902, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -7.911190509796143, "logits/rejected": -6.9983086585998535, "logps/chosen": -243.3070831298828, "logps/ref_chosen": -114.30847930908203, "logps/ref_rejected": -75.68356323242188, "logps/rejected": -270.84930419921875, "loss": 0.9212, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4852039813995361, "rewards/margins": 0.7554365396499634, "rewards/rejected": -2.24064040184021, "step": 420 }, { "epoch": 0.618208516886931, "grad_norm": 11.352572441101074, "kl/avg_steps": 0.5625, "kl/beta": 0.011490020900964737, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -7.578913688659668, "logits/rejected": -6.917729377746582, "logps/chosen": -215.33929443359375, "logps/ref_chosen": -85.87985229492188, "logps/ref_rejected": -96.33648681640625, "logps/rejected": -295.12469482421875, "loss": 0.8958, "rewards/accuracies": 0.796875, "rewards/chosen": -1.4818620681762695, "rewards/margins": 0.7875877618789673, "rewards/rejected": -2.2694497108459473, "step": 421 }, { "epoch": 0.6196769456681351, "grad_norm": 11.291938781738281, "kl/avg_steps": 0.6875, "kl/beta": 0.011425751261413097, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -7.927703857421875, "logits/rejected": -7.495624542236328, "logps/chosen": -224.50631713867188, "logps/ref_chosen": -100.48060607910156, "logps/ref_rejected": -94.40821838378906, "logps/rejected": -286.848388671875, "loss": 0.8898, "rewards/accuracies": 0.84375, "rewards/chosen": -1.409227728843689, "rewards/margins": 0.7730013132095337, "rewards/rejected": -2.1822290420532227, "step": 422 }, { "epoch": 0.6211453744493393, "grad_norm": 12.197092056274414, "kl/avg_steps": 0.4375, "kl/beta": 0.011347736231982708, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.902669377503756e-07, "logits/chosen": -7.66268253326416, "logits/rejected": -7.711108207702637, "logps/chosen": -198.95823669433594, "logps/ref_chosen": -78.44993591308594, "logps/ref_rejected": -92.04652404785156, "logps/rejected": -276.6893310546875, "loss": 0.9664, "rewards/accuracies": 0.71875, "rewards/chosen": -1.364768624305725, "rewards/margins": 0.718655526638031, "rewards/rejected": -2.0834240913391113, "step": 423 }, { "epoch": 0.6226138032305433, "grad_norm": 11.235798835754395, "kl/avg_steps": 0.59375, "kl/beta": 0.011298305355012417, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.890215699729057e-07, "logits/chosen": -7.801157474517822, "logits/rejected": -6.635551452636719, "logps/chosen": -220.01161193847656, "logps/ref_chosen": -87.6423568725586, "logps/ref_rejected": -72.36566162109375, "logps/rejected": -272.5384521484375, "loss": 0.9078, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4883897304534912, "rewards/margins": 0.7573517560958862, "rewards/rejected": -2.245741367340088, "step": 424 }, { "epoch": 0.6240822320117474, "grad_norm": 13.440584182739258, "kl/avg_steps": 0.1875, "kl/beta": 0.011231618002057076, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -7.429314613342285, "logits/rejected": -7.256626605987549, "logps/chosen": -215.4922332763672, "logps/ref_chosen": -78.51979064941406, "logps/ref_rejected": -102.74864196777344, "logps/rejected": -285.57012939453125, "loss": 1.1393, "rewards/accuracies": 0.625, "rewards/chosen": -1.5395094156265259, "rewards/margins": 0.5070829391479492, "rewards/rejected": -2.0465922355651855, "step": 425 }, { "epoch": 0.6255506607929515, "grad_norm": 13.104548454284668, "kl/avg_steps": 0.5625, "kl/beta": 0.011210598051548004, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -7.60335636138916, "logits/rejected": -7.2415266036987305, "logps/chosen": -227.1387939453125, "logps/ref_chosen": -108.50582885742188, "logps/ref_rejected": -88.300048828125, "logps/rejected": -274.4301452636719, "loss": 0.8943, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3247995376586914, "rewards/margins": 0.7480140924453735, "rewards/rejected": -2.0728135108947754, "step": 426 }, { "epoch": 0.6270190895741556, "grad_norm": 10.724087715148926, "kl/avg_steps": 0.75, "kl/beta": 0.01114789117127657, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -7.953921794891357, "logits/rejected": -7.36940860748291, "logps/chosen": -200.22837829589844, "logps/ref_chosen": -99.12046813964844, "logps/ref_rejected": -85.724609375, "logps/rejected": -250.37628173828125, "loss": 0.9039, "rewards/accuracies": 0.875, "rewards/chosen": -1.119832158088684, "rewards/margins": 0.7005788087844849, "rewards/rejected": -1.820410966873169, "step": 427 }, { "epoch": 0.6284875183553598, "grad_norm": 12.104276657104492, "kl/avg_steps": 0.5625, "kl/beta": 0.011064904741942883, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -7.528921604156494, "logits/rejected": -7.260353088378906, "logps/chosen": -243.97320556640625, "logps/ref_chosen": -105.96925354003906, "logps/ref_rejected": -109.1021728515625, "logps/rejected": -297.7687683105469, "loss": 1.0721, "rewards/accuracies": 0.75, "rewards/chosen": -1.520464301109314, "rewards/margins": 0.5525679588317871, "rewards/rejected": -2.0730321407318115, "step": 428 }, { "epoch": 0.6299559471365639, "grad_norm": 13.0979642868042, "kl/avg_steps": 0.53125, "kl/beta": 0.011003012768924236, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.828194884925749e-07, "logits/chosen": -7.794681072235107, "logits/rejected": -7.093453407287598, "logps/chosen": -237.86270141601562, "logps/ref_chosen": -113.54486846923828, "logps/ref_rejected": -98.24201965332031, "logps/rejected": -292.80511474609375, "loss": 0.9249, "rewards/accuracies": 0.796875, "rewards/chosen": -1.363885760307312, "rewards/margins": 0.7638142108917236, "rewards/rejected": -2.127699851989746, "step": 429 }, { "epoch": 0.631424375917768, "grad_norm": 12.785799980163574, "kl/avg_steps": 0.4375, "kl/beta": 0.010944867506623268, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -7.379518032073975, "logits/rejected": -7.428651809692383, "logps/chosen": -216.0718994140625, "logps/ref_chosen": -91.31936645507812, "logps/ref_rejected": -110.1096420288086, "logps/rejected": -288.0528564453125, "loss": 1.0248, "rewards/accuracies": 0.765625, "rewards/chosen": -1.361114501953125, "rewards/margins": 0.5744451284408569, "rewards/rejected": -1.9355595111846924, "step": 430 }, { "epoch": 0.6328928046989721, "grad_norm": 11.655879974365234, "kl/avg_steps": 0.5, "kl/beta": 0.010897193104028702, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -7.7428998947143555, "logits/rejected": -7.222956657409668, "logps/chosen": -215.51998901367188, "logps/ref_chosen": -93.18122100830078, "logps/ref_rejected": -98.13226318359375, "logps/rejected": -283.9945983886719, "loss": 0.9558, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3294377326965332, "rewards/margins": 0.6835219860076904, "rewards/rejected": -2.0129597187042236, "step": 431 }, { "epoch": 0.6343612334801763, "grad_norm": 10.997950553894043, "kl/avg_steps": 0.625, "kl/beta": 0.010842978022992611, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.791192214186223e-07, "logits/chosen": -7.891312122344971, "logits/rejected": -7.409452438354492, "logps/chosen": -215.7032928466797, "logps/ref_chosen": -104.43478393554688, "logps/ref_rejected": -105.08955383300781, "logps/rejected": -292.071533203125, "loss": 0.8684, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2002313137054443, "rewards/margins": 0.8118045330047607, "rewards/rejected": -2.012035846710205, "step": 432 }, { "epoch": 0.6358296622613803, "grad_norm": 11.87765884399414, "kl/avg_steps": 0.46875, "kl/beta": 0.010775630362331867, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -7.532526969909668, "logits/rejected": -7.0036725997924805, "logps/chosen": -220.77711486816406, "logps/ref_chosen": -89.84322357177734, "logps/ref_rejected": -101.73345947265625, "logps/rejected": -292.6389465332031, "loss": 1.001, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4064010381698608, "rewards/margins": 0.6378804445266724, "rewards/rejected": -2.044281482696533, "step": 433 }, { "epoch": 0.6372980910425844, "grad_norm": 11.362030982971191, "kl/avg_steps": 0.5, "kl/beta": 0.0107253547757864, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -7.701730728149414, "logits/rejected": -7.350008010864258, "logps/chosen": -219.51129150390625, "logps/ref_chosen": -97.6925277709961, "logps/ref_rejected": -84.09130096435547, "logps/rejected": -268.685302734375, "loss": 0.984, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3009238243103027, "rewards/margins": 0.6648129820823669, "rewards/rejected": -1.9657368659973145, "step": 434 }, { "epoch": 0.6387665198237885, "grad_norm": 11.840472221374512, "kl/avg_steps": 0.65625, "kl/beta": 0.010671994648873806, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -7.7169880867004395, "logits/rejected": -7.244067192077637, "logps/chosen": -203.26406860351562, "logps/ref_chosen": -86.17192077636719, "logps/ref_rejected": -93.751708984375, "logps/rejected": -278.71820068359375, "loss": 0.9314, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2436943054199219, "rewards/margins": 0.7158711552619934, "rewards/rejected": -1.9595654010772705, "step": 435 }, { "epoch": 0.6402349486049926, "grad_norm": 11.141986846923828, "kl/avg_steps": 0.78125, "kl/beta": 0.010602416470646858, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.742118314717391e-07, "logits/chosen": -8.073395729064941, "logits/rejected": -6.879586219787598, "logps/chosen": -200.92079162597656, "logps/ref_chosen": -105.78710174560547, "logps/ref_rejected": -88.62471008300781, "logps/rejected": -263.154296875, "loss": 0.8215, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0024278163909912, "rewards/margins": 0.8327617645263672, "rewards/rejected": -1.8351895809173584, "step": 436 }, { "epoch": 0.6417033773861968, "grad_norm": 12.68950366973877, "kl/avg_steps": 0.53125, "kl/beta": 0.010520227253437042, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -7.771686553955078, "logits/rejected": -7.039131164550781, "logps/chosen": -210.84364318847656, "logps/ref_chosen": -96.06204223632812, "logps/ref_rejected": -89.01220703125, "logps/rejected": -258.4656677246094, "loss": 0.9969, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2023859024047852, "rewards/margins": 0.5679023265838623, "rewards/rejected": -1.770288109779358, "step": 437 }, { "epoch": 0.6431718061674009, "grad_norm": 11.616890907287598, "kl/avg_steps": 0.53125, "kl/beta": 0.01046463381499052, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -7.707596302032471, "logits/rejected": -7.402861595153809, "logps/chosen": -212.05523681640625, "logps/ref_chosen": -101.85537719726562, "logps/ref_rejected": -89.4476547241211, "logps/rejected": -262.99102783203125, "loss": 0.949, "rewards/accuracies": 0.78125, "rewards/chosen": -1.149721622467041, "rewards/margins": 0.6547319889068604, "rewards/rejected": -1.8044536113739014, "step": 438 }, { "epoch": 0.644640234948605, "grad_norm": 12.508461952209473, "kl/avg_steps": 0.625, "kl/beta": 0.010409334674477577, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -7.787407875061035, "logits/rejected": -7.194408416748047, "logps/chosen": -205.33200073242188, "logps/ref_chosen": -81.75563049316406, "logps/ref_rejected": -90.58635711669922, "logps/rejected": -273.9890441894531, "loss": 0.9918, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2794837951660156, "rewards/margins": 0.6150201559066772, "rewards/rejected": -1.8945040702819824, "step": 439 }, { "epoch": 0.6461086637298091, "grad_norm": 10.161545753479004, "kl/avg_steps": 0.625, "kl/beta": 0.010344680398702621, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -7.840778827667236, "logits/rejected": -7.435771465301514, "logps/chosen": -216.48208618164062, "logps/ref_chosen": -105.64108276367188, "logps/ref_rejected": -103.40100860595703, "logps/rejected": -284.5960388183594, "loss": 0.923, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1412960290908813, "rewards/margins": 0.7194013595581055, "rewards/rejected": -1.8606973886489868, "step": 440 }, { "epoch": 0.6475770925110133, "grad_norm": 10.330644607543945, "kl/avg_steps": 0.84375, "kl/beta": 0.010280427522957325, "kl/n_epsilon_steps": 0.078125, "kl/p_epsilon_steps": 0.921875, "learning_rate": 1.681227682404166e-07, "logits/chosen": -7.877140522003174, "logits/rejected": -6.723335266113281, "logps/chosen": -221.08547973632812, "logps/ref_chosen": -91.529541015625, "logps/ref_rejected": -103.619384765625, "logps/rejected": -306.0940856933594, "loss": 0.8764, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3216805458068848, "rewards/margins": 0.7420125007629395, "rewards/rejected": -2.063693046569824, "step": 441 }, { "epoch": 0.6490455212922174, "grad_norm": 11.554139137268066, "kl/avg_steps": 0.6875, "kl/beta": 0.010194412432610989, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.669113001300851e-07, "logits/chosen": -7.492166996002197, "logits/rejected": -6.9776811599731445, "logps/chosen": -216.06195068359375, "logps/ref_chosen": -84.77755737304688, "logps/ref_rejected": -83.82415008544922, "logps/rejected": -295.6746826171875, "loss": 0.8427, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3306344747543335, "rewards/margins": 0.8125271797180176, "rewards/rejected": -2.1431615352630615, "step": 442 }, { "epoch": 0.6505139500734214, "grad_norm": 11.208428382873535, "kl/avg_steps": 0.625, "kl/beta": 0.010124804452061653, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -7.71527099609375, "logits/rejected": -7.359766960144043, "logps/chosen": -227.578125, "logps/ref_chosen": -102.64927673339844, "logps/ref_rejected": -93.03807067871094, "logps/rejected": -277.5597229003906, "loss": 0.9816, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2582169771194458, "rewards/margins": 0.5959169864654541, "rewards/rejected": -1.8541338443756104, "step": 443 }, { "epoch": 0.6519823788546255, "grad_norm": 12.04550552368164, "kl/avg_steps": 0.4375, "kl/beta": 0.010061916895210743, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -7.471601963043213, "logits/rejected": -7.507413387298584, "logps/chosen": -221.87020874023438, "logps/ref_chosen": -87.91971588134766, "logps/ref_rejected": -103.32345581054688, "logps/rejected": -295.9997253417969, "loss": 1.0177, "rewards/accuracies": 0.78125, "rewards/chosen": -1.34379243850708, "rewards/margins": 0.5834265947341919, "rewards/rejected": -1.9272189140319824, "step": 444 }, { "epoch": 0.6534508076358296, "grad_norm": 10.32401180267334, "kl/avg_steps": 0.59375, "kl/beta": 0.010018087923526764, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -7.7422380447387695, "logits/rejected": -7.081811904907227, "logps/chosen": -216.97213745117188, "logps/ref_chosen": -101.40087127685547, "logps/ref_rejected": -99.03790283203125, "logps/rejected": -282.439208984375, "loss": 0.925, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1529053449630737, "rewards/margins": 0.6718940138816833, "rewards/rejected": -1.8247992992401123, "step": 445 }, { "epoch": 0.6549192364170338, "grad_norm": 12.130196571350098, "kl/avg_steps": 0.5625, "kl/beta": 0.009958956390619278, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -7.809151649475098, "logits/rejected": -7.551025390625, "logps/chosen": -199.480224609375, "logps/ref_chosen": -87.42234802246094, "logps/ref_rejected": -106.70075988769531, "logps/rejected": -281.7625732421875, "loss": 0.9615, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1098341941833496, "rewards/margins": 0.6203456521034241, "rewards/rejected": -1.7301799058914185, "step": 446 }, { "epoch": 0.6563876651982379, "grad_norm": 11.679533958435059, "kl/avg_steps": 0.65625, "kl/beta": 0.009903251193463802, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.608874379754465e-07, "logits/chosen": -7.548259735107422, "logits/rejected": -7.2763800621032715, "logps/chosen": -197.861083984375, "logps/ref_chosen": -83.6152572631836, "logps/ref_rejected": -104.91239929199219, "logps/rejected": -294.75396728515625, "loss": 0.9079, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1265549659729004, "rewards/margins": 0.7398759126663208, "rewards/rejected": -1.8664308786392212, "step": 447 }, { "epoch": 0.657856093979442, "grad_norm": 11.890493392944336, "kl/avg_steps": 0.5625, "kl/beta": 0.00983868446201086, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -7.886592864990234, "logits/rejected": -7.5247802734375, "logps/chosen": -214.25633239746094, "logps/ref_chosen": -92.5757827758789, "logps/ref_rejected": -107.68977355957031, "logps/rejected": -305.142578125, "loss": 0.9455, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1929302215576172, "rewards/margins": 0.7362387180328369, "rewards/rejected": -1.929168939590454, "step": 448 }, { "epoch": 0.6593245227606461, "grad_norm": 10.460795402526855, "kl/avg_steps": 0.53125, "kl/beta": 0.009783651679754257, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.584941086944423e-07, "logits/chosen": -7.6187238693237305, "logits/rejected": -7.113336086273193, "logps/chosen": -223.71267700195312, "logps/ref_chosen": -102.39893341064453, "logps/ref_rejected": -95.14886474609375, "logps/rejected": -293.65252685546875, "loss": 0.9078, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1828500032424927, "rewards/margins": 0.7462369799613953, "rewards/rejected": -1.9290869235992432, "step": 449 }, { "epoch": 0.6607929515418502, "grad_norm": 11.509246826171875, "kl/avg_steps": 0.71875, "kl/beta": 0.009731950238347054, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.573010452010098e-07, "logits/chosen": -8.000127792358398, "logits/rejected": -7.75508451461792, "logps/chosen": -191.9837646484375, "logps/ref_chosen": -86.99285888671875, "logps/ref_rejected": -108.53203582763672, "logps/rejected": -290.04888916015625, "loss": 0.9059, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0168712139129639, "rewards/margins": 0.7357749938964844, "rewards/rejected": -1.7526460886001587, "step": 450 }, { "epoch": 0.6622613803230544, "grad_norm": 16.418062210083008, "kl/avg_steps": 0.46875, "kl/beta": 0.009662501513957977, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -7.632309913635254, "logits/rejected": -7.299648761749268, "logps/chosen": -206.88548278808594, "logps/ref_chosen": -86.81128692626953, "logps/ref_rejected": -79.8555908203125, "logps/rejected": -262.9779357910156, "loss": 1.0328, "rewards/accuracies": 0.734375, "rewards/chosen": -1.157651424407959, "rewards/margins": 0.6012389659881592, "rewards/rejected": -1.7588902711868286, "step": 451 }, { "epoch": 0.6637298091042585, "grad_norm": 10.160775184631348, "kl/avg_steps": 0.6875, "kl/beta": 0.00961741991341114, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.549222776991186e-07, "logits/chosen": -7.714634895324707, "logits/rejected": -7.272608757019043, "logps/chosen": -185.05380249023438, "logps/ref_chosen": -79.379638671875, "logps/ref_rejected": -103.71539306640625, "logps/rejected": -277.22369384765625, "loss": 0.9212, "rewards/accuracies": 0.84375, "rewards/chosen": -1.0104700326919556, "rewards/margins": 0.6453733444213867, "rewards/rejected": -1.6558433771133423, "step": 452 }, { "epoch": 0.6651982378854625, "grad_norm": 10.804041862487793, "kl/avg_steps": 0.6875, "kl/beta": 0.009551751427352428, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -7.6690826416015625, "logits/rejected": -7.041682243347168, "logps/chosen": -213.16781616210938, "logps/ref_chosen": -87.6951904296875, "logps/ref_rejected": -90.0582275390625, "logps/rejected": -297.2564697265625, "loss": 0.8581, "rewards/accuracies": 0.875, "rewards/chosen": -1.191428303718567, "rewards/margins": 0.7726534605026245, "rewards/rejected": -1.964081883430481, "step": 453 }, { "epoch": 0.6666666666666666, "grad_norm": 11.586421966552734, "kl/avg_steps": 0.46875, "kl/beta": 0.009486531838774681, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -7.6572160720825195, "logits/rejected": -7.268322944641113, "logps/chosen": -224.0977325439453, "logps/ref_chosen": -89.56623840332031, "logps/ref_rejected": -92.92105102539062, "logps/rejected": -293.582275390625, "loss": 1.0254, "rewards/accuracies": 0.75, "rewards/chosen": -1.273888349533081, "rewards/margins": 0.6185543537139893, "rewards/rejected": -1.8924428224563599, "step": 454 }, { "epoch": 0.6681350954478708, "grad_norm": 10.893000602722168, "kl/avg_steps": 0.5, "kl/beta": 0.00944227073341608, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -7.797979354858398, "logits/rejected": -7.566769599914551, "logps/chosen": -199.83328247070312, "logps/ref_chosen": -77.6299819946289, "logps/ref_rejected": -118.97795104980469, "logps/rejected": -308.0276184082031, "loss": 0.9778, "rewards/accuracies": 0.796875, "rewards/chosen": -1.15059232711792, "rewards/margins": 0.6233835220336914, "rewards/rejected": -1.7739757299423218, "step": 455 }, { "epoch": 0.6696035242290749, "grad_norm": 12.390460014343262, "kl/avg_steps": 0.5625, "kl/beta": 0.009395293891429901, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -7.607115745544434, "logits/rejected": -7.611271858215332, "logps/chosen": -215.07440185546875, "logps/ref_chosen": -89.61686706542969, "logps/ref_rejected": -109.5597152709961, "logps/rejected": -298.5582580566406, "loss": 0.9811, "rewards/accuracies": 0.796875, "rewards/chosen": -1.1745624542236328, "rewards/margins": 0.5899935960769653, "rewards/rejected": -1.7645561695098877, "step": 456 }, { "epoch": 0.671071953010279, "grad_norm": 11.532391548156738, "kl/avg_steps": 0.8125, "kl/beta": 0.009342741221189499, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -7.829406261444092, "logits/rejected": -7.343839168548584, "logps/chosen": -193.81298828125, "logps/ref_chosen": -87.32168579101562, "logps/ref_rejected": -90.76660919189453, "logps/rejected": -280.0379638671875, "loss": 0.842, "rewards/accuracies": 0.921875, "rewards/chosen": -0.9881317615509033, "rewards/margins": 0.765188992023468, "rewards/rejected": -1.7533208131790161, "step": 457 }, { "epoch": 0.6725403817914831, "grad_norm": 11.981277465820312, "kl/avg_steps": 0.59375, "kl/beta": 0.009267443791031837, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -7.395938873291016, "logits/rejected": -7.573344707489014, "logps/chosen": -213.52423095703125, "logps/ref_chosen": -93.52044677734375, "logps/ref_rejected": -103.36898803710938, "logps/rejected": -286.9295654296875, "loss": 1.024, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1065789461135864, "rewards/margins": 0.581390380859375, "rewards/rejected": -1.687969446182251, "step": 458 }, { "epoch": 0.6740088105726872, "grad_norm": 9.090399742126465, "kl/avg_steps": 0.65625, "kl/beta": 0.009212742559611797, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.466771464027316e-07, "logits/chosen": -7.5142059326171875, "logits/rejected": -6.964491844177246, "logps/chosen": -190.7766571044922, "logps/ref_chosen": -75.68820190429688, "logps/ref_rejected": -92.17048645019531, "logps/rejected": -284.36572265625, "loss": 0.9252, "rewards/accuracies": 0.8125, "rewards/chosen": -1.05517578125, "rewards/margins": 0.7023875713348389, "rewards/rejected": -1.7575633525848389, "step": 459 }, { "epoch": 0.6754772393538914, "grad_norm": 12.827125549316406, "kl/avg_steps": 0.65625, "kl/beta": 0.009152678772807121, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -7.7618255615234375, "logits/rejected": -7.418609619140625, "logps/chosen": -201.06686401367188, "logps/ref_chosen": -81.11788940429688, "logps/ref_rejected": -110.31238555908203, "logps/rejected": -300.1006164550781, "loss": 0.9702, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0923173427581787, "rewards/margins": 0.6314802765846252, "rewards/rejected": -1.7237976789474487, "step": 460 }, { "epoch": 0.6769456681350955, "grad_norm": 11.000265121459961, "kl/avg_steps": 0.46875, "kl/beta": 0.009093005210161209, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -7.253688335418701, "logits/rejected": -6.825348854064941, "logps/chosen": -227.41180419921875, "logps/ref_chosen": -81.58352661132812, "logps/ref_rejected": -93.87710571289062, "logps/rejected": -295.16571044921875, "loss": 1.0504, "rewards/accuracies": 0.734375, "rewards/chosen": -1.3213638067245483, "rewards/margins": 0.4980071187019348, "rewards/rejected": -1.819370985031128, "step": 461 }, { "epoch": 0.6784140969162996, "grad_norm": 10.546554565429688, "kl/avg_steps": 0.71875, "kl/beta": 0.00905058067291975, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -7.895207405090332, "logits/rejected": -7.166084289550781, "logps/chosen": -205.47802734375, "logps/ref_chosen": -94.19855499267578, "logps/ref_rejected": -85.63162994384766, "logps/rejected": -272.15692138671875, "loss": 0.9154, "rewards/accuracies": 0.90625, "rewards/chosen": -1.00137460231781, "rewards/margins": 0.6736320853233337, "rewards/rejected": -1.675006628036499, "step": 462 }, { "epoch": 0.6798825256975036, "grad_norm": 12.333318710327148, "kl/avg_steps": 0.65625, "kl/beta": 0.008985994383692741, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -7.807669639587402, "logits/rejected": -7.118095397949219, "logps/chosen": -193.8429718017578, "logps/ref_chosen": -85.92474365234375, "logps/ref_rejected": -96.90184020996094, "logps/rejected": -292.2165222167969, "loss": 0.8797, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9655375480651855, "rewards/margins": 0.7759765386581421, "rewards/rejected": -1.7415142059326172, "step": 463 }, { "epoch": 0.6813509544787077, "grad_norm": 14.331416130065918, "kl/avg_steps": 0.71875, "kl/beta": 0.008927407674491405, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -7.688302993774414, "logits/rejected": -7.788723945617676, "logps/chosen": -212.67556762695312, "logps/ref_chosen": -79.68920135498047, "logps/ref_rejected": -107.29232025146484, "logps/rejected": -310.6097717285156, "loss": 0.9686, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1799356937408447, "rewards/margins": 0.6203708648681641, "rewards/rejected": -1.8003066778182983, "step": 464 }, { "epoch": 0.6828193832599119, "grad_norm": 11.430578231811523, "kl/avg_steps": 0.71875, "kl/beta": 0.008863699622452259, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -7.3866472244262695, "logits/rejected": -7.172554016113281, "logps/chosen": -213.5987548828125, "logps/ref_chosen": -91.8602294921875, "logps/ref_rejected": -118.71000671386719, "logps/rejected": -325.18511962890625, "loss": 0.8775, "rewards/accuracies": 0.875, "rewards/chosen": -1.073095440864563, "rewards/margins": 0.7431429624557495, "rewards/rejected": -1.8162384033203125, "step": 465 }, { "epoch": 0.684287812041116, "grad_norm": 8.964788436889648, "kl/avg_steps": 0.71875, "kl/beta": 0.0088004469871521, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -7.547239780426025, "logits/rejected": -7.197183609008789, "logps/chosen": -195.26803588867188, "logps/ref_chosen": -84.70140075683594, "logps/ref_rejected": -96.05084228515625, "logps/rejected": -294.0559387207031, "loss": 0.8759, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9667361974716187, "rewards/margins": 0.7610065937042236, "rewards/rejected": -1.7277427911758423, "step": 466 }, { "epoch": 0.6857562408223201, "grad_norm": 11.361374855041504, "kl/avg_steps": 0.5625, "kl/beta": 0.008737645111978054, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -8.130990028381348, "logits/rejected": -7.5615644454956055, "logps/chosen": -244.66354370117188, "logps/ref_chosen": -109.29832458496094, "logps/ref_rejected": -108.8436508178711, "logps/rejected": -320.74517822265625, "loss": 0.9698, "rewards/accuracies": 0.828125, "rewards/chosen": -1.1772236824035645, "rewards/margins": 0.6603525876998901, "rewards/rejected": -1.8375762701034546, "step": 467 }, { "epoch": 0.6872246696035242, "grad_norm": 11.03893756866455, "kl/avg_steps": 0.4375, "kl/beta": 0.008688771165907383, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.362737437810114e-07, "logits/chosen": -7.599516868591309, "logits/rejected": -7.804562091827393, "logps/chosen": -210.38668823242188, "logps/ref_chosen": -98.32164764404297, "logps/ref_rejected": -106.68048095703125, "logps/rejected": -282.5191345214844, "loss": 1.0436, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9716976881027222, "rewards/margins": 0.5471584796905518, "rewards/rejected": -1.518856167793274, "step": 468 }, { "epoch": 0.6886930983847284, "grad_norm": 12.710182189941406, "kl/avg_steps": 0.6875, "kl/beta": 0.008650923147797585, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.351323902551631e-07, "logits/chosen": -7.760175704956055, "logits/rejected": -7.189078330993652, "logps/chosen": -229.18951416015625, "logps/ref_chosen": -96.76420593261719, "logps/ref_rejected": -109.59500885009766, "logps/rejected": -324.5375061035156, "loss": 0.9145, "rewards/accuracies": 0.84375, "rewards/chosen": -1.1398154497146606, "rewards/margins": 0.7056005597114563, "rewards/rejected": -1.8454160690307617, "step": 469 }, { "epoch": 0.6901615271659325, "grad_norm": 11.018048286437988, "kl/avg_steps": 0.78125, "kl/beta": 0.008591854013502598, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.339940635976592e-07, "logits/chosen": -7.906304359436035, "logits/rejected": -7.4226555824279785, "logps/chosen": -208.847900390625, "logps/ref_chosen": -83.49665832519531, "logps/ref_rejected": -88.48578643798828, "logps/rejected": -289.9870300292969, "loss": 0.9357, "rewards/accuracies": 0.875, "rewards/chosen": -1.0692801475524902, "rewards/margins": 0.6468397378921509, "rewards/rejected": -1.7161200046539307, "step": 470 }, { "epoch": 0.6916299559471366, "grad_norm": 10.446932792663574, "kl/avg_steps": 0.6875, "kl/beta": 0.008525250479578972, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -7.087899208068848, "logits/rejected": -6.836267948150635, "logps/chosen": -233.13998413085938, "logps/ref_chosen": -88.47430419921875, "logps/ref_rejected": -90.48171997070312, "logps/rejected": -307.9644470214844, "loss": 0.9497, "rewards/accuracies": 0.84375, "rewards/chosen": -1.2270324230194092, "rewards/margins": 0.6135708093643188, "rewards/rejected": -1.8406031131744385, "step": 471 }, { "epoch": 0.6930983847283406, "grad_norm": 9.13286304473877, "kl/avg_steps": 0.71875, "kl/beta": 0.008467039093375206, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.317266107909975e-07, "logits/chosen": -7.951723098754883, "logits/rejected": -7.220186233520508, "logps/chosen": -254.59384155273438, "logps/ref_chosen": -125.23369598388672, "logps/ref_rejected": -121.05349731445312, "logps/rejected": -340.85113525390625, "loss": 0.8455, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0882880687713623, "rewards/margins": 0.757997989654541, "rewards/rejected": -1.8462860584259033, "step": 472 }, { "epoch": 0.6945668135095447, "grad_norm": 11.645308494567871, "kl/avg_steps": 0.40625, "kl/beta": 0.008406616747379303, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -7.586986064910889, "logits/rejected": -7.300940990447998, "logps/chosen": -243.3128662109375, "logps/ref_chosen": -95.61137390136719, "logps/ref_rejected": -88.15115356445312, "logps/rejected": -292.8554992675781, "loss": 1.0855, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2393798828125, "rewards/margins": 0.4723225235939026, "rewards/rejected": -1.711702585220337, "step": 473 }, { "epoch": 0.6960352422907489, "grad_norm": 12.829643249511719, "kl/avg_steps": 0.4375, "kl/beta": 0.008372602984309196, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -7.622153282165527, "logits/rejected": -7.345100402832031, "logps/chosen": -225.4574737548828, "logps/ref_chosen": -81.47975158691406, "logps/ref_rejected": -96.46562957763672, "logps/rejected": -293.2532958984375, "loss": 1.1116, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2031913995742798, "rewards/margins": 0.43547606468200684, "rewards/rejected": -1.6386675834655762, "step": 474 }, { "epoch": 0.697503671071953, "grad_norm": 9.961170196533203, "kl/avg_steps": 0.625, "kl/beta": 0.008336132392287254, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -7.716796875, "logits/rejected": -7.385658264160156, "logps/chosen": -182.11282348632812, "logps/ref_chosen": -74.19598388671875, "logps/ref_rejected": -94.69242095947266, "logps/rejected": -285.2503662109375, "loss": 0.9232, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8945390582084656, "rewards/margins": 0.6813703775405884, "rewards/rejected": -1.5759094953536987, "step": 475 }, { "epoch": 0.6989720998531571, "grad_norm": 12.75778579711914, "kl/avg_steps": 0.59375, "kl/beta": 0.008284355513751507, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -7.788543224334717, "logits/rejected": -7.032910346984863, "logps/chosen": -201.364013671875, "logps/ref_chosen": -71.97109985351562, "logps/ref_rejected": -80.26224517822266, "logps/rejected": -281.6759033203125, "loss": 0.9888, "rewards/accuracies": 0.78125, "rewards/chosen": -1.067098617553711, "rewards/margins": 0.5897184610366821, "rewards/rejected": -1.656817078590393, "step": 476 }, { "epoch": 0.7004405286343612, "grad_norm": 10.86681079864502, "kl/avg_steps": 0.6875, "kl/beta": 0.008235457353293896, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -7.685525894165039, "logits/rejected": -7.131807327270508, "logps/chosen": -231.5677032470703, "logps/ref_chosen": -105.00555419921875, "logps/ref_rejected": -81.87843322753906, "logps/rejected": -285.14166259765625, "loss": 0.9673, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0373156070709229, "rewards/margins": 0.6240187883377075, "rewards/rejected": -1.6613343954086304, "step": 477 }, { "epoch": 0.7019089574155654, "grad_norm": 9.899713516235352, "kl/avg_steps": 0.75, "kl/beta": 0.008179225027561188, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -7.471836090087891, "logits/rejected": -7.25750732421875, "logps/chosen": -198.8499755859375, "logps/ref_chosen": -76.7882080078125, "logps/ref_rejected": -90.43994140625, "logps/rejected": -291.9015808105469, "loss": 0.9417, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9919052124023438, "rewards/margins": 0.6421196460723877, "rewards/rejected": -1.6340248584747314, "step": 478 }, { "epoch": 0.7033773861967695, "grad_norm": 12.096019744873047, "kl/avg_steps": 0.625, "kl/beta": 0.008118337951600552, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -7.843846321105957, "logits/rejected": -7.429350852966309, "logps/chosen": -211.1195526123047, "logps/ref_chosen": -81.3623046875, "logps/ref_rejected": -101.09114074707031, "logps/rejected": -299.9999694824219, "loss": 0.9958, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0483417510986328, "rewards/margins": 0.5544079542160034, "rewards/rejected": -1.6027498245239258, "step": 479 }, { "epoch": 0.7048458149779736, "grad_norm": 12.593810081481934, "kl/avg_steps": 0.5, "kl/beta": 0.008067913353443146, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.227838333989088e-07, "logits/chosen": -8.081748962402344, "logits/rejected": -7.16196346282959, "logps/chosen": -240.56443786621094, "logps/ref_chosen": -96.7739028930664, "logps/ref_rejected": -86.40473937988281, "logps/rejected": -296.2643127441406, "loss": 1.0276, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1555697917938232, "rewards/margins": 0.5263204574584961, "rewards/rejected": -1.6818903684616089, "step": 480 }, { "epoch": 0.7063142437591777, "grad_norm": 11.46150016784668, "kl/avg_steps": 0.59375, "kl/beta": 0.008027774281799793, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -7.622033596038818, "logits/rejected": -6.947890281677246, "logps/chosen": -233.6854705810547, "logps/ref_chosen": -91.670166015625, "logps/ref_rejected": -98.69490051269531, "logps/rejected": -314.57574462890625, "loss": 0.99, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1354894638061523, "rewards/margins": 0.5857840180397034, "rewards/rejected": -1.721273422241211, "step": 481 }, { "epoch": 0.7077826725403817, "grad_norm": 11.263471603393555, "kl/avg_steps": 0.59375, "kl/beta": 0.007980390451848507, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -7.860917091369629, "logits/rejected": -7.225180625915527, "logps/chosen": -235.95123291015625, "logps/ref_chosen": -98.52011108398438, "logps/ref_rejected": -94.8294448852539, "logps/rejected": -304.33599853515625, "loss": 0.9871, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0913074016571045, "rewards/margins": 0.5683927536010742, "rewards/rejected": -1.6596999168395996, "step": 482 }, { "epoch": 0.7092511013215859, "grad_norm": 9.014310836791992, "kl/avg_steps": 0.75, "kl/beta": 0.00793328694999218, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 1.194847979251979e-07, "logits/chosen": -8.008612632751465, "logits/rejected": -7.4816975593566895, "logps/chosen": -248.36256408691406, "logps/ref_chosen": -107.11860656738281, "logps/ref_rejected": -101.11499786376953, "logps/rejected": -319.85467529296875, "loss": 0.9526, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1138007640838623, "rewards/margins": 0.6078551411628723, "rewards/rejected": -1.7216558456420898, "step": 483 }, { "epoch": 0.71071953010279, "grad_norm": 8.998906135559082, "kl/avg_steps": 0.6875, "kl/beta": 0.007874229922890663, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -7.730357646942139, "logits/rejected": -6.921565055847168, "logps/chosen": -220.87344360351562, "logps/ref_chosen": -86.97991943359375, "logps/ref_rejected": -90.72367095947266, "logps/rejected": -307.90826416015625, "loss": 0.9462, "rewards/accuracies": 0.828125, "rewards/chosen": -1.049012303352356, "rewards/margins": 0.6481724381446838, "rewards/rejected": -1.6971848011016846, "step": 484 }, { "epoch": 0.7121879588839941, "grad_norm": 9.495361328125, "kl/avg_steps": 0.6875, "kl/beta": 0.007820464670658112, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -7.978307247161865, "logits/rejected": -7.098984241485596, "logps/chosen": -215.27700805664062, "logps/ref_chosen": -94.05874633789062, "logps/ref_rejected": -108.56297302246094, "logps/rejected": -324.5331726074219, "loss": 0.8722, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9427545070648193, "rewards/margins": 0.733292281627655, "rewards/rejected": -1.6760468482971191, "step": 485 }, { "epoch": 0.7136563876651982, "grad_norm": 11.05057144165039, "kl/avg_steps": 0.53125, "kl/beta": 0.007767065893858671, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -7.851730823516846, "logits/rejected": -7.302485466003418, "logps/chosen": -233.9942626953125, "logps/ref_chosen": -93.74588012695312, "logps/ref_rejected": -98.07064819335938, "logps/rejected": -322.165771484375, "loss": 0.9519, "rewards/accuracies": 0.765625, "rewards/chosen": -1.0858381986618042, "rewards/margins": 0.643623411655426, "rewards/rejected": -1.7294615507125854, "step": 486 }, { "epoch": 0.7151248164464024, "grad_norm": 10.726838111877441, "kl/avg_steps": 0.78125, "kl/beta": 0.0077260215766727924, "kl/n_epsilon_steps": 0.109375, "kl/p_epsilon_steps": 0.890625, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -7.636082649230957, "logits/rejected": -7.332340240478516, "logps/chosen": -224.526123046875, "logps/ref_chosen": -88.0335693359375, "logps/ref_rejected": -98.47209930419922, "logps/rejected": -311.92681884765625, "loss": 0.9663, "rewards/accuracies": 0.875, "rewards/chosen": -1.047347068786621, "rewards/margins": 0.5880845785140991, "rewards/rejected": -1.6354316473007202, "step": 487 }, { "epoch": 0.7165932452276065, "grad_norm": 11.892740249633789, "kl/avg_steps": 0.65625, "kl/beta": 0.007666129618883133, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -7.434564590454102, "logits/rejected": -6.9452362060546875, "logps/chosen": -213.1976776123047, "logps/ref_chosen": -84.78964233398438, "logps/ref_rejected": -90.2734603881836, "logps/rejected": -299.0603942871094, "loss": 0.935, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9786466360092163, "rewards/margins": 0.6095430850982666, "rewards/rejected": -1.5881898403167725, "step": 488 }, { "epoch": 0.7180616740088106, "grad_norm": 9.741364479064941, "kl/avg_steps": 0.53125, "kl/beta": 0.007616148795932531, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -7.642593860626221, "logits/rejected": -7.163801670074463, "logps/chosen": -229.92735290527344, "logps/ref_chosen": -90.46929931640625, "logps/ref_rejected": -86.39761352539062, "logps/rejected": -301.63934326171875, "loss": 0.982, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0582023859024048, "rewards/margins": 0.5706866979598999, "rewards/rejected": -1.6288890838623047, "step": 489 }, { "epoch": 0.7195301027900147, "grad_norm": 10.637678146362305, "kl/avg_steps": 0.453125, "kl/beta": 0.0075759016908705235, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -7.2537336349487305, "logits/rejected": -7.196831226348877, "logps/chosen": -230.93482971191406, "logps/ref_chosen": -85.32012939453125, "logps/ref_rejected": -115.99385070800781, "logps/rejected": -327.2318115234375, "loss": 1.094, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1003369092941284, "rewards/margins": 0.48968762159347534, "rewards/rejected": -1.590024471282959, "step": 490 }, { "epoch": 0.7209985315712188, "grad_norm": 9.847221374511719, "kl/avg_steps": 0.5, "kl/beta": 0.00754172820597887, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -7.813044548034668, "logits/rejected": -7.093764305114746, "logps/chosen": -229.64122009277344, "logps/ref_chosen": -86.14351654052734, "logps/ref_rejected": -80.67945861816406, "logps/rejected": -291.98773193359375, "loss": 1.0378, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0794801712036133, "rewards/margins": 0.5048484802246094, "rewards/rejected": -1.5843286514282227, "step": 491 }, { "epoch": 0.7224669603524229, "grad_norm": 13.724920272827148, "kl/avg_steps": 0.46875, "kl/beta": 0.007504207547754049, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.097764975115576e-07, "logits/chosen": -7.806938171386719, "logits/rejected": -7.273170471191406, "logps/chosen": -213.5839080810547, "logps/ref_chosen": -81.10757446289062, "logps/ref_rejected": -80.75199890136719, "logps/rejected": -268.00067138671875, "loss": 1.151, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9921280145645142, "rewards/margins": 0.40435731410980225, "rewards/rejected": -1.3964853286743164, "step": 492 }, { "epoch": 0.723935389133627, "grad_norm": 11.94544506072998, "kl/avg_steps": 0.5625, "kl/beta": 0.007469195406883955, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -7.833661079406738, "logits/rejected": -6.987953186035156, "logps/chosen": -245.13128662109375, "logps/ref_chosen": -112.20733642578125, "logps/ref_rejected": -93.60719299316406, "logps/rejected": -293.9502868652344, "loss": 1.0192, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9897406697273254, "rewards/margins": 0.49745315313339233, "rewards/rejected": -1.4871938228607178, "step": 493 }, { "epoch": 0.7254038179148311, "grad_norm": 10.316105842590332, "kl/avg_steps": 0.71875, "kl/beta": 0.007427416276186705, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -7.544450759887695, "logits/rejected": -7.313390731811523, "logps/chosen": -200.50823974609375, "logps/ref_chosen": -73.11489868164062, "logps/ref_rejected": -92.16300201416016, "logps/rejected": -312.24517822265625, "loss": 0.9054, "rewards/accuracies": 0.890625, "rewards/chosen": -0.940959632396698, "rewards/margins": 0.6807957291603088, "rewards/rejected": -1.6217553615570068, "step": 494 }, { "epoch": 0.7268722466960352, "grad_norm": 10.488041877746582, "kl/avg_steps": 0.625, "kl/beta": 0.0073744128458201885, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -7.922597408294678, "logits/rejected": -7.498479843139648, "logps/chosen": -228.8547821044922, "logps/ref_chosen": -99.52032470703125, "logps/ref_rejected": -97.93089294433594, "logps/rejected": -302.8748779296875, "loss": 1.0093, "rewards/accuracies": 0.8125, "rewards/chosen": -0.950130820274353, "rewards/margins": 0.5504207611083984, "rewards/rejected": -1.5005515813827515, "step": 495 }, { "epoch": 0.7283406754772394, "grad_norm": 10.726445198059082, "kl/avg_steps": 0.4375, "kl/beta": 0.007328609004616737, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -7.784862995147705, "logits/rejected": -7.454352378845215, "logps/chosen": -250.06234741210938, "logps/ref_chosen": -107.85675048828125, "logps/ref_rejected": -92.77056121826172, "logps/rejected": -301.3846740722656, "loss": 1.0384, "rewards/accuracies": 0.765625, "rewards/chosen": -1.039567232131958, "rewards/margins": 0.48080918192863464, "rewards/rejected": -1.520376443862915, "step": 496 }, { "epoch": 0.7298091042584435, "grad_norm": 10.815900802612305, "kl/avg_steps": 0.3125, "kl/beta": 0.007296686060726643, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -7.584544658660889, "logits/rejected": -7.411238670349121, "logps/chosen": -249.58370971679688, "logps/ref_chosen": -92.08322143554688, "logps/ref_rejected": -81.79503631591797, "logps/rejected": -286.0672607421875, "loss": 1.1655, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1472631692886353, "rewards/margins": 0.33593475818634033, "rewards/rejected": -1.483197808265686, "step": 497 }, { "epoch": 0.7312775330396476, "grad_norm": 10.606035232543945, "kl/avg_steps": 0.6875, "kl/beta": 0.007273954804986715, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -7.672981262207031, "logits/rejected": -7.160597801208496, "logps/chosen": -226.19503784179688, "logps/ref_chosen": -98.19436645507812, "logps/ref_rejected": -90.68746185302734, "logps/rejected": -293.2069396972656, "loss": 0.9923, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9259578585624695, "rewards/margins": 0.5358661413192749, "rewards/rejected": -1.4618239402770996, "step": 498 }, { "epoch": 0.7327459618208517, "grad_norm": 13.12460708618164, "kl/avg_steps": 0.625, "kl/beta": 0.007224287837743759, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -7.342049598693848, "logits/rejected": -7.139578342437744, "logps/chosen": -219.22235107421875, "logps/ref_chosen": -81.0399169921875, "logps/ref_rejected": -106.92170715332031, "logps/rejected": -319.46099853515625, "loss": 1.0261, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9939165711402893, "rewards/margins": 0.5303443074226379, "rewards/rejected": -1.5242608785629272, "step": 499 }, { "epoch": 0.7342143906020558, "grad_norm": 11.147185325622559, "kl/avg_steps": 0.4375, "kl/beta": 0.007179416250437498, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -7.811801433563232, "logits/rejected": -7.598600387573242, "logps/chosen": -237.38864135742188, "logps/ref_chosen": -89.248046875, "logps/ref_rejected": -100.41021728515625, "logps/rejected": -317.2994079589844, "loss": 1.0563, "rewards/accuracies": 0.734375, "rewards/chosen": -1.0610756874084473, "rewards/margins": 0.48731058835983276, "rewards/rejected": -1.5483863353729248, "step": 500 }, { "epoch": 0.7342143906020558, "eval_kl/n_epsilon_steps": 0.25042808055877686, "eval_kl/p_epsilon_steps": 0.7491438388824463, "eval_logits/chosen": -7.587223529815674, "eval_logits/rejected": -7.184981822967529, "eval_logps/chosen": -242.73046875, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -299.77655029296875, "eval_loss": 0.5370330810546875, "eval_rewards/accuracies": 0.7666952013969421, "eval_rewards/chosen": -1.0134860277175903, "eval_rewards/margins": 0.447692334651947, "eval_rewards/rejected": -1.4611784219741821, "eval_runtime": 48.5908, "eval_samples_per_second": 48.137, "eval_steps_per_second": 1.523, "step": 500 }, { "epoch": 0.73568281938326, "grad_norm": 11.189581871032715, "kl/avg_steps": 0.46875, "kl/beta": 0.007148143369704485, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -7.5872368812561035, "logits/rejected": -7.235992431640625, "logps/chosen": -220.14817810058594, "logps/ref_chosen": -90.596923828125, "logps/ref_rejected": -85.2972640991211, "logps/rejected": -288.4246520996094, "loss": 1.0588, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9249030947685242, "rewards/margins": 0.5188544392585754, "rewards/rejected": -1.4437575340270996, "step": 501 }, { "epoch": 0.737151248164464, "grad_norm": 10.501534461975098, "kl/avg_steps": 0.5, "kl/beta": 0.007114792708307505, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 9.934134090518592e-08, "logits/chosen": -8.039237976074219, "logits/rejected": -6.9175286293029785, "logps/chosen": -235.90652465820312, "logps/ref_chosen": -103.47073364257812, "logps/ref_rejected": -89.04415893554688, "logps/rejected": -287.358642578125, "loss": 1.0789, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9396613836288452, "rewards/margins": 0.46196457743644714, "rewards/rejected": -1.4016259908676147, "step": 502 }, { "epoch": 0.7386196769456681, "grad_norm": 9.694924354553223, "kl/avg_steps": 0.4375, "kl/beta": 0.007079395931214094, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 9.831921068732571e-08, "logits/chosen": -7.841567039489746, "logits/rejected": -7.385000228881836, "logps/chosen": -222.0538787841797, "logps/ref_chosen": -93.72999572753906, "logps/ref_rejected": -87.60896301269531, "logps/rejected": -291.72247314453125, "loss": 1.0182, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9058637619018555, "rewards/margins": 0.5300005674362183, "rewards/rejected": -1.4358644485473633, "step": 503 }, { "epoch": 0.7400881057268722, "grad_norm": 12.8939208984375, "kl/avg_steps": 0.75, "kl/beta": 0.007048558443784714, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 9.730107739932805e-08, "logits/chosen": -8.119867324829102, "logits/rejected": -7.42840576171875, "logps/chosen": -221.08261108398438, "logps/ref_chosen": -92.2918701171875, "logps/ref_rejected": -109.03765106201172, "logps/rejected": -328.2010498046875, "loss": 0.9324, "rewards/accuracies": 0.890625, "rewards/chosen": -0.9024004340171814, "rewards/margins": 0.629888117313385, "rewards/rejected": -1.5322885513305664, "step": 504 }, { "epoch": 0.7415565345080763, "grad_norm": 9.646526336669922, "kl/avg_steps": 0.53125, "kl/beta": 0.006996087729930878, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.628696786995188e-08, "logits/chosen": -7.906380653381348, "logits/rejected": -7.302044868469238, "logps/chosen": -250.1861572265625, "logps/ref_chosen": -106.27693176269531, "logps/ref_rejected": -94.20611572265625, "logps/rejected": -310.4810791015625, "loss": 1.0176, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0034873485565186, "rewards/margins": 0.5002511143684387, "rewards/rejected": -1.5037386417388916, "step": 505 }, { "epoch": 0.7430249632892805, "grad_norm": 9.696473121643066, "kl/avg_steps": 0.65625, "kl/beta": 0.006959117483347654, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 9.527690882192635e-08, "logits/chosen": -7.563765525817871, "logits/rejected": -7.289196014404297, "logps/chosen": -221.6510009765625, "logps/ref_chosen": -81.44086456298828, "logps/ref_rejected": -85.66439819335938, "logps/rejected": -301.66949462890625, "loss": 0.9974, "rewards/accuracies": 0.875, "rewards/chosen": -0.9702832698822021, "rewards/margins": 0.5212363600730896, "rewards/rejected": -1.4915196895599365, "step": 506 }, { "epoch": 0.7444933920704846, "grad_norm": 8.90442943572998, "kl/avg_steps": 0.71875, "kl/beta": 0.006913745775818825, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.427092687124691e-08, "logits/chosen": -7.426417827606201, "logits/rejected": -7.099169731140137, "logps/chosen": -245.88427734375, "logps/ref_chosen": -99.42694854736328, "logps/ref_rejected": -102.25296020507812, "logps/rejected": -328.4168701171875, "loss": 1.0131, "rewards/accuracies": 0.859375, "rewards/chosen": -1.0063550472259521, "rewards/margins": 0.5441376566886902, "rewards/rejected": -1.550492763519287, "step": 507 }, { "epoch": 0.7459618208516887, "grad_norm": 10.534308433532715, "kl/avg_steps": 0.46875, "kl/beta": 0.006864408031105995, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 9.326904852647344e-08, "logits/chosen": -7.581225395202637, "logits/rejected": -7.184242248535156, "logps/chosen": -241.60638427734375, "logps/ref_chosen": -97.9156494140625, "logps/ref_rejected": -103.79782104492188, "logps/rejected": -314.3401794433594, "loss": 1.0875, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9846047759056091, "rewards/margins": 0.4526156783103943, "rewards/rejected": -1.4372204542160034, "step": 508 }, { "epoch": 0.7474302496328928, "grad_norm": 12.450897216796875, "kl/avg_steps": 0.625, "kl/beta": 0.006832381244748831, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 9.227130018803195e-08, "logits/chosen": -7.861898422241211, "logits/rejected": -7.100181579589844, "logps/chosen": -237.10333251953125, "logps/ref_chosen": -100.72984313964844, "logps/ref_rejected": -86.943359375, "logps/rejected": -287.923095703125, "loss": 1.0779, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9274278879165649, "rewards/margins": 0.43540412187576294, "rewards/rejected": -1.3628320693969727, "step": 509 }, { "epoch": 0.748898678414097, "grad_norm": 10.103314399719238, "kl/avg_steps": 0.578125, "kl/beta": 0.006789944134652615, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 9.127770814751932e-08, "logits/chosen": -7.755180358886719, "logits/rejected": -7.310698986053467, "logps/chosen": -248.5462646484375, "logps/ref_chosen": -89.16938781738281, "logps/ref_rejected": -107.91940307617188, "logps/rejected": -342.783447265625, "loss": 1.0245, "rewards/accuracies": 0.796875, "rewards/chosen": -1.0782501697540283, "rewards/margins": 0.5061511993408203, "rewards/rejected": -1.5844013690948486, "step": 510 }, { "epoch": 0.750367107195301, "grad_norm": 9.313636779785156, "kl/avg_steps": 0.6875, "kl/beta": 0.006750915199518204, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 9.028829858700973e-08, "logits/chosen": -7.98992919921875, "logits/rejected": -7.07249641418457, "logps/chosen": -233.65809631347656, "logps/ref_chosen": -89.74979400634766, "logps/ref_rejected": -97.73127746582031, "logps/rejected": -322.42425537109375, "loss": 0.998, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9661585092544556, "rewards/margins": 0.539162278175354, "rewards/rejected": -1.5053207874298096, "step": 511 }, { "epoch": 0.7518355359765051, "grad_norm": 11.483407974243164, "kl/avg_steps": 0.625, "kl/beta": 0.006704819854348898, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.930309757836516e-08, "logits/chosen": -7.597411155700684, "logits/rejected": -7.131270408630371, "logps/chosen": -221.8423614501953, "logps/ref_chosen": -85.73515319824219, "logps/ref_rejected": -87.6248779296875, "logps/rejected": -300.55499267578125, "loss": 1.032, "rewards/accuracies": 0.828125, "rewards/chosen": -0.9088408350944519, "rewards/margins": 0.5085346698760986, "rewards/rejected": -1.4173755645751953, "step": 512 }, { "epoch": 0.7533039647577092, "grad_norm": 10.936783790588379, "kl/avg_steps": 0.6875, "kl/beta": 0.006663174834102392, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 8.832213108254863e-08, "logits/chosen": -7.599587917327881, "logits/rejected": -7.238034725189209, "logps/chosen": -240.4691162109375, "logps/ref_chosen": -96.19051361083984, "logps/ref_rejected": -82.91773986816406, "logps/rejected": -295.57098388671875, "loss": 1.0548, "rewards/accuracies": 0.84375, "rewards/chosen": -0.956098198890686, "rewards/margins": 0.44991129636764526, "rewards/rejected": -1.4060094356536865, "step": 513 }, { "epoch": 0.7547723935389133, "grad_norm": 10.800374031066895, "kl/avg_steps": 0.46875, "kl/beta": 0.006617678329348564, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.734542494893954e-08, "logits/chosen": -7.524253845214844, "logits/rejected": -7.1176605224609375, "logps/chosen": -235.91629028320312, "logps/ref_chosen": -97.82363891601562, "logps/ref_rejected": -85.36982727050781, "logps/rejected": -294.8122253417969, "loss": 1.0464, "rewards/accuracies": 0.75, "rewards/chosen": -0.9116629958152771, "rewards/margins": 0.46637022495269775, "rewards/rejected": -1.37803316116333, "step": 514 }, { "epoch": 0.7562408223201175, "grad_norm": 13.166977882385254, "kl/avg_steps": 0.59375, "kl/beta": 0.0065868026576936245, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.637300491465272e-08, "logits/chosen": -7.724981307983398, "logits/rejected": -7.439934730529785, "logps/chosen": -219.55661010742188, "logps/ref_chosen": -81.93355560302734, "logps/ref_rejected": -93.5765609741211, "logps/rejected": -294.6715087890625, "loss": 1.091, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9028570652008057, "rewards/margins": 0.41230061650276184, "rewards/rejected": -1.3151576519012451, "step": 515 }, { "epoch": 0.7577092511013216, "grad_norm": 9.387166023254395, "kl/avg_steps": 0.5, "kl/beta": 0.006547924131155014, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 8.540489660386064e-08, "logits/chosen": -7.62106990814209, "logits/rejected": -7.103853225708008, "logps/chosen": -236.6641845703125, "logps/ref_chosen": -97.09166717529297, "logps/ref_rejected": -118.55667114257812, "logps/rejected": -326.1341552734375, "loss": 1.1026, "rewards/accuracies": 0.75, "rewards/chosen": -0.9115662574768066, "rewards/margins": 0.43881756067276, "rewards/rejected": -1.3503837585449219, "step": 516 }, { "epoch": 0.7591776798825257, "grad_norm": 9.433574676513672, "kl/avg_steps": 0.5625, "kl/beta": 0.006515347398817539, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.444112552711752e-08, "logits/chosen": -7.664484024047852, "logits/rejected": -7.389780521392822, "logps/chosen": -241.97000122070312, "logps/ref_chosen": -93.18656921386719, "logps/ref_rejected": -95.5927963256836, "logps/rejected": -327.4776611328125, "loss": 1.0038, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9653780460357666, "rewards/margins": 0.534911036491394, "rewards/rejected": -1.500288963317871, "step": 517 }, { "epoch": 0.7606461086637298, "grad_norm": 11.14731502532959, "kl/avg_steps": 0.5625, "kl/beta": 0.006478903815150261, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 8.348171708068747e-08, "logits/chosen": -7.792545318603516, "logits/rejected": -7.39441442489624, "logps/chosen": -241.240478515625, "logps/ref_chosen": -87.0213623046875, "logps/ref_rejected": -91.24455261230469, "logps/rejected": -307.2711486816406, "loss": 1.1116, "rewards/accuracies": 0.796875, "rewards/chosen": -0.9949862360954285, "rewards/margins": 0.394919216632843, "rewards/rejected": -1.3899054527282715, "step": 518 }, { "epoch": 0.762114537444934, "grad_norm": 9.395498275756836, "kl/avg_steps": 0.65625, "kl/beta": 0.006442663725465536, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.25266965458755e-08, "logits/chosen": -7.722479343414307, "logits/rejected": -7.366518974304199, "logps/chosen": -244.36495971679688, "logps/ref_chosen": -100.68755340576172, "logps/ref_rejected": -109.03042602539062, "logps/rejected": -321.7382507324219, "loss": 1.0488, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9199578762054443, "rewards/margins": 0.43974611163139343, "rewards/rejected": -1.3597040176391602, "step": 519 }, { "epoch": 0.7635829662261381, "grad_norm": 9.159811973571777, "kl/avg_steps": 0.625, "kl/beta": 0.006400659214705229, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 8.15760890883607e-08, "logits/chosen": -7.769089221954346, "logits/rejected": -7.011660575866699, "logps/chosen": -229.7172393798828, "logps/ref_chosen": -98.77762603759766, "logps/ref_rejected": -106.9457015991211, "logps/rejected": -318.0968322753906, "loss": 0.9971, "rewards/accuracies": 0.859375, "rewards/chosen": -0.834189772605896, "rewards/margins": 0.507774829864502, "rewards/rejected": -1.341964602470398, "step": 520 }, { "epoch": 0.7650513950073421, "grad_norm": 9.642518043518066, "kl/avg_steps": 0.65625, "kl/beta": 0.006360903847962618, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 8.062991975753378e-08, "logits/chosen": -7.651534080505371, "logits/rejected": -7.242176055908203, "logps/chosen": -224.09951782226562, "logps/ref_chosen": -86.35108947753906, "logps/ref_rejected": -90.39317321777344, "logps/rejected": -306.9713134765625, "loss": 1.0088, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8716791868209839, "rewards/margins": 0.4956836700439453, "rewards/rejected": -1.3673629760742188, "step": 521 }, { "epoch": 0.7665198237885462, "grad_norm": 9.185672760009766, "kl/avg_steps": 0.46875, "kl/beta": 0.006319432519376278, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.968821348583643e-08, "logits/chosen": -7.3879499435424805, "logits/rejected": -6.970926284790039, "logps/chosen": -215.34011840820312, "logps/ref_chosen": -76.008056640625, "logps/ref_rejected": -74.44125366210938, "logps/rejected": -289.88787841796875, "loss": 1.0429, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8779767751693726, "rewards/margins": 0.47517913579940796, "rewards/rejected": -1.3531558513641357, "step": 522 }, { "epoch": 0.7679882525697503, "grad_norm": 11.464506149291992, "kl/avg_steps": 0.4375, "kl/beta": 0.006289948243647814, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 7.875099508810484e-08, "logits/chosen": -7.647948741912842, "logits/rejected": -6.6883955001831055, "logps/chosen": -255.38108825683594, "logps/ref_chosen": -96.49627685546875, "logps/ref_rejected": -91.0152816772461, "logps/rejected": -302.5433349609375, "loss": 1.1708, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9972683787345886, "rewards/margins": 0.3256107568740845, "rewards/rejected": -1.3228791952133179, "step": 523 }, { "epoch": 0.7694566813509545, "grad_norm": 10.063695907592773, "kl/avg_steps": 0.46875, "kl/beta": 0.006262549664825201, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.781828926091535e-08, "logits/chosen": -8.02735710144043, "logits/rejected": -7.3874831199646, "logps/chosen": -248.78433227539062, "logps/ref_chosen": -103.40550231933594, "logps/ref_rejected": -87.91015625, "logps/rejected": -306.3849792480469, "loss": 1.0526, "rewards/accuracies": 0.828125, "rewards/chosen": -0.908073902130127, "rewards/margins": 0.45238441228866577, "rewards/rejected": -1.3604583740234375, "step": 524 }, { "epoch": 0.7709251101321586, "grad_norm": 8.88698673248291, "kl/avg_steps": 0.5625, "kl/beta": 0.006233330816030502, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 7.689012058193384e-08, "logits/chosen": -7.746663570404053, "logits/rejected": -7.510030746459961, "logps/chosen": -218.09078979492188, "logps/ref_chosen": -84.20648193359375, "logps/ref_rejected": -106.26158905029297, "logps/rejected": -323.40283203125, "loss": 1.0194, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8309329748153687, "rewards/margins": 0.513077974319458, "rewards/rejected": -1.344010829925537, "step": 525 }, { "epoch": 0.7723935389133627, "grad_norm": 8.236015319824219, "kl/avg_steps": 0.5, "kl/beta": 0.006198464427143335, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.596651350926836e-08, "logits/chosen": -7.848876476287842, "logits/rejected": -7.135056495666504, "logps/chosen": -246.24072265625, "logps/ref_chosen": -102.85411071777344, "logps/ref_rejected": -93.65145874023438, "logps/rejected": -315.64605712890625, "loss": 1.0393, "rewards/accuracies": 0.765625, "rewards/chosen": -0.8863099813461304, "rewards/margins": 0.4812777638435364, "rewards/rejected": -1.3675878047943115, "step": 526 }, { "epoch": 0.7738619676945668, "grad_norm": 8.652771949768066, "kl/avg_steps": 0.6875, "kl/beta": 0.006167626474052668, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.504749238082414e-08, "logits/chosen": -8.085288047790527, "logits/rejected": -7.213015556335449, "logps/chosen": -236.21170043945312, "logps/ref_chosen": -96.76543426513672, "logps/ref_rejected": -85.39518737792969, "logps/rejected": -304.16583251953125, "loss": 1.0093, "rewards/accuracies": 0.875, "rewards/chosen": -0.8550451993942261, "rewards/margins": 0.4838915765285492, "rewards/rejected": -1.3389368057250977, "step": 527 }, { "epoch": 0.775330396475771, "grad_norm": 8.880172729492188, "kl/avg_steps": 0.6875, "kl/beta": 0.006125513464212418, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.413308141366254e-08, "logits/chosen": -8.002275466918945, "logits/rejected": -7.597675800323486, "logps/chosen": -225.98036193847656, "logps/ref_chosen": -94.1649169921875, "logps/ref_rejected": -100.372314453125, "logps/rejected": -312.89617919921875, "loss": 1.0225, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8029056191444397, "rewards/margins": 0.48855090141296387, "rewards/rejected": -1.2914564609527588, "step": 528 }, { "epoch": 0.7767988252569751, "grad_norm": 10.054601669311523, "kl/avg_steps": 0.5, "kl/beta": 0.006083688233047724, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 7.322330470336313e-08, "logits/chosen": -7.928413391113281, "logits/rejected": -7.295429229736328, "logps/chosen": -234.24697875976562, "logps/ref_chosen": -80.77679443359375, "logps/ref_rejected": -97.191162109375, "logps/rejected": -321.6305847167969, "loss": 1.0863, "rewards/accuracies": 0.75, "rewards/chosen": -0.9308747053146362, "rewards/margins": 0.42603152990341187, "rewards/rejected": -1.3569061756134033, "step": 529 }, { "epoch": 0.7782672540381792, "grad_norm": 8.395234107971191, "kl/avg_steps": 0.65625, "kl/beta": 0.006053421180695295, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 7.231818622338822e-08, "logits/chosen": -7.8860321044921875, "logits/rejected": -7.2712297439575195, "logps/chosen": -221.32212829589844, "logps/ref_chosen": -79.7757339477539, "logps/ref_rejected": -92.05390167236328, "logps/rejected": -306.6104736328125, "loss": 1.0424, "rewards/accuracies": 0.84375, "rewards/chosen": -0.852878212928772, "rewards/margins": 0.43727239966392517, "rewards/rejected": -1.2901506423950195, "step": 530 }, { "epoch": 0.7797356828193832, "grad_norm": 8.446939468383789, "kl/avg_steps": 0.6875, "kl/beta": 0.006013954523950815, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 7.141774982445147e-08, "logits/chosen": -7.656224250793457, "logits/rejected": -6.923822402954102, "logps/chosen": -217.52377319335938, "logps/ref_chosen": -79.31919860839844, "logps/ref_rejected": -78.10832214355469, "logps/rejected": -304.80413818359375, "loss": 0.9847, "rewards/accuracies": 0.859375, "rewards/chosen": -0.8268610239028931, "rewards/margins": 0.5261213183403015, "rewards/rejected": -1.3529822826385498, "step": 531 }, { "epoch": 0.7812041116005873, "grad_norm": 9.200221061706543, "kl/avg_steps": 0.625, "kl/beta": 0.005972891114652157, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 7.052201923388953e-08, "logits/chosen": -7.909399032592773, "logits/rejected": -7.158049583435059, "logps/chosen": -246.2357177734375, "logps/ref_chosen": -101.63691711425781, "logps/ref_rejected": -93.34539031982422, "logps/rejected": -336.9823913574219, "loss": 0.9473, "rewards/accuracies": 0.828125, "rewards/chosen": -0.859551191329956, "rewards/margins": 0.5850256681442261, "rewards/rejected": -1.4445768594741821, "step": 532 }, { "epoch": 0.7826725403817915, "grad_norm": 9.506068229675293, "kl/avg_steps": 0.4375, "kl/beta": 0.0059357923455536366, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 6.963101805503646e-08, "logits/chosen": -7.598127365112305, "logits/rejected": -6.821909427642822, "logps/chosen": -232.10377502441406, "logps/ref_chosen": -90.74664306640625, "logps/ref_rejected": -82.88848876953125, "logps/rejected": -282.5700378417969, "loss": 1.1378, "rewards/accuracies": 0.75, "rewards/chosen": -0.8360397219657898, "rewards/margins": 0.3414258658885956, "rewards/rejected": -1.177465558052063, "step": 533 }, { "epoch": 0.7841409691629956, "grad_norm": 8.0197114944458, "kl/avg_steps": 0.71875, "kl/beta": 0.005909936036914587, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 6.874476976660184e-08, "logits/chosen": -7.659628391265869, "logits/rejected": -7.227793216705322, "logps/chosen": -214.4866943359375, "logps/ref_chosen": -86.63084411621094, "logps/ref_rejected": -84.87196350097656, "logps/rejected": -296.1422424316406, "loss": 1.0025, "rewards/accuracies": 0.875, "rewards/chosen": -0.7507482767105103, "rewards/margins": 0.4875665009021759, "rewards/rejected": -1.2383147478103638, "step": 534 }, { "epoch": 0.7856093979441997, "grad_norm": 7.752889633178711, "kl/avg_steps": 0.59375, "kl/beta": 0.005867761559784412, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.786329772205246e-08, "logits/chosen": -7.62947940826416, "logits/rejected": -7.0962138175964355, "logps/chosen": -216.57179260253906, "logps/ref_chosen": -90.34539031982422, "logps/ref_rejected": -100.7779769897461, "logps/rejected": -315.5731201171875, "loss": 1.0009, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7379162907600403, "rewards/margins": 0.5136837363243103, "rewards/rejected": -1.2516000270843506, "step": 535 }, { "epoch": 0.7870778267254038, "grad_norm": 8.29379940032959, "kl/avg_steps": 0.625, "kl/beta": 0.005833127535879612, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.698662514899638e-08, "logits/chosen": -7.761280059814453, "logits/rejected": -7.13836669921875, "logps/chosen": -211.44839477539062, "logps/ref_chosen": -78.95956420898438, "logps/ref_rejected": -93.08779907226562, "logps/rejected": -308.13739013671875, "loss": 1.0251, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7697625756263733, "rewards/margins": 0.47593456506729126, "rewards/rejected": -1.245697021484375, "step": 536 }, { "epoch": 0.788546255506608, "grad_norm": 7.392307758331299, "kl/avg_steps": 0.625, "kl/beta": 0.005796896759420633, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.611477514857114e-08, "logits/chosen": -7.517266273498535, "logits/rejected": -6.534526824951172, "logps/chosen": -200.18478393554688, "logps/ref_chosen": -86.70939636230469, "logps/ref_rejected": -78.90767669677734, "logps/rejected": -286.97747802734375, "loss": 0.9757, "rewards/accuracies": 0.875, "rewards/chosen": -0.654714047908783, "rewards/margins": 0.5424842834472656, "rewards/rejected": -1.1971983909606934, "step": 537 }, { "epoch": 0.7900146842878121, "grad_norm": 8.975393295288086, "kl/avg_steps": 0.53125, "kl/beta": 0.005760891363024712, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.524777069483525e-08, "logits/chosen": -7.621641159057617, "logits/rejected": -7.222417831420898, "logps/chosen": -250.26470947265625, "logps/ref_chosen": -98.44214630126953, "logps/ref_rejected": -91.09465026855469, "logps/rejected": -319.07464599609375, "loss": 1.0446, "rewards/accuracies": 0.875, "rewards/chosen": -0.8716259002685547, "rewards/margins": 0.43390387296676636, "rewards/rejected": -1.3055297136306763, "step": 538 }, { "epoch": 0.7914831130690162, "grad_norm": 7.8592424392700195, "kl/avg_steps": 0.59375, "kl/beta": 0.005730448290705681, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.438563463416221e-08, "logits/chosen": -7.682827949523926, "logits/rejected": -7.03498649597168, "logps/chosen": -227.2099609375, "logps/ref_chosen": -90.93212127685547, "logps/ref_rejected": -97.44546508789062, "logps/rejected": -312.05865478515625, "loss": 1.0608, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7778847217559814, "rewards/margins": 0.44326433539390564, "rewards/rejected": -1.2211490869522095, "step": 539 }, { "epoch": 0.7929515418502202, "grad_norm": 9.220181465148926, "kl/avg_steps": 0.71875, "kl/beta": 0.005696624517440796, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 6.352838968463919e-08, "logits/chosen": -8.04260540008545, "logits/rejected": -7.764565467834473, "logps/chosen": -247.24325561523438, "logps/ref_chosen": -107.8698959350586, "logps/ref_rejected": -121.39349365234375, "logps/rejected": -353.7125549316406, "loss": 1.0036, "rewards/accuracies": 0.890625, "rewards/chosen": -0.789432942867279, "rewards/margins": 0.5233358144760132, "rewards/rejected": -1.3127686977386475, "step": 540 }, { "epoch": 0.7944199706314243, "grad_norm": 8.543745040893555, "kl/avg_steps": 0.53125, "kl/beta": 0.00565597228705883, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.267605843546767e-08, "logits/chosen": -7.653639793395996, "logits/rejected": -7.352320671081543, "logps/chosen": -264.3243713378906, "logps/ref_chosen": -106.56748962402344, "logps/ref_rejected": -109.06633758544922, "logps/rejected": -329.9075927734375, "loss": 1.114, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8885458111763, "rewards/margins": 0.3523477613925934, "rewards/rejected": -1.2408936023712158, "step": 541 }, { "epoch": 0.7958883994126285, "grad_norm": 8.278136253356934, "kl/avg_steps": 0.65625, "kl/beta": 0.005626083817332983, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 6.182866334636888e-08, "logits/chosen": -7.408226013183594, "logits/rejected": -7.1460490226745605, "logps/chosen": -218.92977905273438, "logps/ref_chosen": -87.36929321289062, "logps/ref_rejected": -102.82034301757812, "logps/rejected": -326.39044189453125, "loss": 0.9943, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7365444302558899, "rewards/margins": 0.5116760730743408, "rewards/rejected": -1.248220443725586, "step": 542 }, { "epoch": 0.7973568281938326, "grad_norm": 9.519865989685059, "kl/avg_steps": 0.375, "kl/beta": 0.005589403212070465, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 6.098622674699147e-08, "logits/chosen": -7.630051612854004, "logits/rejected": -7.440648078918457, "logps/chosen": -244.11569213867188, "logps/ref_chosen": -87.152587890625, "logps/ref_rejected": -111.53172302246094, "logps/rejected": -317.44921875, "loss": 1.2137, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8755909204483032, "rewards/margins": 0.2687823474407196, "rewards/rejected": -1.1443732976913452, "step": 543 }, { "epoch": 0.7988252569750367, "grad_norm": 10.596867561340332, "kl/avg_steps": 0.46875, "kl/beta": 0.005568521562963724, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 6.01487708363232e-08, "logits/chosen": -7.832944869995117, "logits/rejected": -7.403214931488037, "logps/chosen": -244.70623779296875, "logps/ref_chosen": -86.14691162109375, "logps/ref_rejected": -106.3939208984375, "logps/rejected": -325.4195861816406, "loss": 1.1329, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8795119524002075, "rewards/margins": 0.33259445428848267, "rewards/rejected": -1.212106466293335, "step": 544 }, { "epoch": 0.8002936857562408, "grad_norm": 8.951570510864258, "kl/avg_steps": 0.59375, "kl/beta": 0.005542540457099676, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -7.671846389770508, "logits/rejected": -7.305430889129639, "logps/chosen": -242.76431274414062, "logps/ref_chosen": -96.42424011230469, "logps/ref_rejected": -100.73405456542969, "logps/rejected": -321.11676025390625, "loss": 1.0893, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8078324794769287, "rewards/margins": 0.4048755168914795, "rewards/rejected": -1.2127079963684082, "step": 545 }, { "epoch": 0.801762114537445, "grad_norm": 8.22799301147461, "kl/avg_steps": 0.5625, "kl/beta": 0.005509825889021158, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 5.848888922025552e-08, "logits/chosen": -7.779958724975586, "logits/rejected": -7.305843353271484, "logps/chosen": -233.88308715820312, "logps/ref_chosen": -86.02112579345703, "logps/ref_rejected": -87.37263488769531, "logps/rejected": -306.00103759765625, "loss": 1.0982, "rewards/accuracies": 0.796875, "rewards/chosen": -0.8111412525177002, "rewards/margins": 0.3848496675491333, "rewards/rejected": -1.195991039276123, "step": 546 }, { "epoch": 0.8032305433186491, "grad_norm": 8.054706573486328, "kl/avg_steps": 0.65625, "kl/beta": 0.005479006562381983, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -7.915996074676514, "logits/rejected": -7.356800556182861, "logps/chosen": -244.9899444580078, "logps/ref_chosen": -100.65100860595703, "logps/ref_rejected": -97.9117431640625, "logps/rejected": -322.5784912109375, "loss": 1.0372, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7865848541259766, "rewards/margins": 0.43519890308380127, "rewards/rejected": -1.2217838764190674, "step": 547 }, { "epoch": 0.8046989720998532, "grad_norm": 9.502042770385742, "kl/avg_steps": 0.5, "kl/beta": 0.005443285219371319, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.684919345471029e-08, "logits/chosen": -7.732963562011719, "logits/rejected": -7.511667251586914, "logps/chosen": -241.53533935546875, "logps/ref_chosen": -96.03010559082031, "logps/ref_rejected": -98.41322326660156, "logps/rejected": -305.8096618652344, "loss": 1.1549, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7902284860610962, "rewards/margins": 0.3315829634666443, "rewards/rejected": -1.1218115091323853, "step": 548 }, { "epoch": 0.8061674008810573, "grad_norm": 9.200112342834473, "kl/avg_steps": 0.625, "kl/beta": 0.00541620422154665, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 5.603696935852426e-08, "logits/chosen": -7.810609340667725, "logits/rejected": -7.074420928955078, "logps/chosen": -215.589111328125, "logps/ref_chosen": -83.45096588134766, "logps/ref_rejected": -81.38023376464844, "logps/rejected": -288.0585632324219, "loss": 1.0911, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7132003307342529, "rewards/margins": 0.39838820695877075, "rewards/rejected": -1.111588716506958, "step": 549 }, { "epoch": 0.8076358296622613, "grad_norm": 8.178189277648926, "kl/avg_steps": 0.65625, "kl/beta": 0.00538256298750639, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -7.702338218688965, "logits/rejected": -7.392082214355469, "logps/chosen": -239.71810913085938, "logps/ref_chosen": -89.59434509277344, "logps/ref_rejected": -101.31549835205078, "logps/rejected": -320.63720703125, "loss": 1.0957, "rewards/accuracies": 0.875, "rewards/chosen": -0.8037484884262085, "rewards/margins": 0.36810043454170227, "rewards/rejected": -1.171849012374878, "step": 550 }, { "epoch": 0.8091042584434655, "grad_norm": 8.914412498474121, "kl/avg_steps": 0.71875, "kl/beta": 0.005347470287233591, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -7.659028053283691, "logits/rejected": -7.469613075256348, "logps/chosen": -226.66494750976562, "logps/ref_chosen": -95.08216094970703, "logps/ref_rejected": -117.7192611694336, "logps/rejected": -333.44720458984375, "loss": 1.0428, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6989625692367554, "rewards/margins": 0.4451354146003723, "rewards/rejected": -1.144097924232483, "step": 551 }, { "epoch": 0.8105726872246696, "grad_norm": 7.542394638061523, "kl/avg_steps": 0.6875, "kl/beta": 0.005309309810400009, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 5.363104864490034e-08, "logits/chosen": -8.226886749267578, "logits/rejected": -7.813612937927246, "logps/chosen": -229.49818420410156, "logps/ref_chosen": -101.40918731689453, "logps/ref_rejected": -110.83843994140625, "logps/rejected": -349.8272705078125, "loss": 0.9501, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6762351989746094, "rewards/margins": 0.582294762134552, "rewards/rejected": -1.2585299015045166, "step": 552 }, { "epoch": 0.8120411160058737, "grad_norm": 8.038527488708496, "kl/avg_steps": 0.5, "kl/beta": 0.005273057147860527, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -7.498537540435791, "logits/rejected": -6.907330513000488, "logps/chosen": -237.8641815185547, "logps/ref_chosen": -97.96388244628906, "logps/ref_rejected": -103.57719421386719, "logps/rejected": -313.3408203125, "loss": 1.1097, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7355344295501709, "rewards/margins": 0.3635327219963074, "rewards/rejected": -1.0990670919418335, "step": 553 }, { "epoch": 0.8135095447870778, "grad_norm": 7.692688941955566, "kl/avg_steps": 0.40625, "kl/beta": 0.005246823187917471, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 5.205293880283551e-08, "logits/chosen": -7.834980487823486, "logits/rejected": -7.495749473571777, "logps/chosen": -251.03372192382812, "logps/ref_chosen": -103.68680572509766, "logps/ref_rejected": -92.99210357666016, "logps/rejected": -316.3883056640625, "loss": 1.1078, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7710244655609131, "rewards/margins": 0.39337316155433655, "rewards/rejected": -1.1643975973129272, "step": 554 }, { "epoch": 0.8149779735682819, "grad_norm": 8.527905464172363, "kl/avg_steps": 0.53125, "kl/beta": 0.005225594155490398, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.127169765359515e-08, "logits/chosen": -7.357694149017334, "logits/rejected": -7.210930824279785, "logps/chosen": -241.05128479003906, "logps/ref_chosen": -86.20362854003906, "logps/ref_rejected": -114.54719543457031, "logps/rejected": -343.023193359375, "loss": 1.1068, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8064323663711548, "rewards/margins": 0.37958627939224243, "rewards/rejected": -1.1860185861587524, "step": 555 }, { "epoch": 0.8164464023494861, "grad_norm": 7.89186954498291, "kl/avg_steps": 0.53125, "kl/beta": 0.005197979975491762, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.049569317994012e-08, "logits/chosen": -7.9103193283081055, "logits/rejected": -6.936588287353516, "logps/chosen": -226.0050811767578, "logps/ref_chosen": -88.40348815917969, "logps/ref_rejected": -106.63912200927734, "logps/rejected": -323.8564147949219, "loss": 1.0752, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7129172086715698, "rewards/margins": 0.40871816873550415, "rewards/rejected": -1.1216354370117188, "step": 556 }, { "epoch": 0.8179148311306902, "grad_norm": 6.813101291656494, "kl/avg_steps": 0.5625, "kl/beta": 0.005170511547476053, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -7.78956413269043, "logits/rejected": -7.387689590454102, "logps/chosen": -245.75643920898438, "logps/ref_chosen": -98.83523559570312, "logps/ref_rejected": -115.64888763427734, "logps/rejected": -350.8326110839844, "loss": 1.0491, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7568396925926208, "rewards/margins": 0.45094895362854004, "rewards/rejected": -1.2077887058258057, "step": 557 }, { "epoch": 0.8193832599118943, "grad_norm": 8.840348243713379, "kl/avg_steps": 0.8125, "kl/beta": 0.005141590256243944, "kl/n_epsilon_steps": 0.09375, "kl/p_epsilon_steps": 0.90625, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -8.046621322631836, "logits/rejected": -7.308237075805664, "logps/chosen": -254.44580078125, "logps/ref_chosen": -106.01183319091797, "logps/ref_rejected": -108.39131164550781, "logps/rejected": -347.26983642578125, "loss": 1.0219, "rewards/accuracies": 0.890625, "rewards/chosen": -0.7578614950180054, "rewards/margins": 0.45999467372894287, "rewards/rejected": -1.2178561687469482, "step": 558 }, { "epoch": 0.8208516886930984, "grad_norm": 9.36201000213623, "kl/avg_steps": 0.59375, "kl/beta": 0.005100151523947716, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -7.8494062423706055, "logits/rejected": -7.495527267456055, "logps/chosen": -257.96856689453125, "logps/ref_chosen": -100.42474365234375, "logps/ref_rejected": -100.41270446777344, "logps/rejected": -334.3748474121094, "loss": 1.0914, "rewards/accuracies": 0.8125, "rewards/chosen": -0.79989093542099, "rewards/margins": 0.38467034697532654, "rewards/rejected": -1.1845612525939941, "step": 559 }, { "epoch": 0.8223201174743024, "grad_norm": 10.975285530090332, "kl/avg_steps": 0.53125, "kl/beta": 0.005070047918707132, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -7.894330978393555, "logits/rejected": -7.052946090698242, "logps/chosen": -242.33444213867188, "logps/ref_chosen": -87.63551330566406, "logps/ref_rejected": -87.37985229492188, "logps/rejected": -313.7993469238281, "loss": 1.1114, "rewards/accuracies": 0.796875, "rewards/chosen": -0.781200647354126, "rewards/margins": 0.35893329977989197, "rewards/rejected": -1.1401338577270508, "step": 560 }, { "epoch": 0.8237885462555066, "grad_norm": 7.341772556304932, "kl/avg_steps": 0.65625, "kl/beta": 0.005043255630880594, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 4.669493178106432e-08, "logits/chosen": -7.702178001403809, "logits/rejected": -7.204655647277832, "logps/chosen": -235.06967163085938, "logps/ref_chosen": -74.88140106201172, "logps/ref_rejected": -106.04796600341797, "logps/rejected": -336.55767822265625, "loss": 1.104, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8038696050643921, "rewards/margins": 0.35057592391967773, "rewards/rejected": -1.1544455289840698, "step": 561 }, { "epoch": 0.8252569750367107, "grad_norm": 8.39021110534668, "kl/avg_steps": 0.5, "kl/beta": 0.005010375287383795, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -7.776498794555664, "logits/rejected": -7.434757232666016, "logps/chosen": -246.33297729492188, "logps/ref_chosen": -100.66322326660156, "logps/ref_rejected": -102.24087524414062, "logps/rejected": -314.5208740234375, "loss": 1.1451, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7275943756103516, "rewards/margins": 0.32898658514022827, "rewards/rejected": -1.056580901145935, "step": 562 }, { "epoch": 0.8267254038179148, "grad_norm": 8.17624568939209, "kl/avg_steps": 0.5, "kl/beta": 0.004985447973012924, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 4.521198892775202e-08, "logits/chosen": -7.574672698974609, "logits/rejected": -7.067543983459473, "logps/chosen": -251.52645874023438, "logps/ref_chosen": -96.25921630859375, "logps/ref_rejected": -101.20503997802734, "logps/rejected": -318.0636901855469, "loss": 1.151, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7714153528213501, "rewards/margins": 0.30289867520332336, "rewards/rejected": -1.0743141174316406, "step": 563 }, { "epoch": 0.8281938325991189, "grad_norm": 8.074674606323242, "kl/avg_steps": 0.578125, "kl/beta": 0.0049606445245444775, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.447860229910544e-08, "logits/chosen": -7.528448104858398, "logits/rejected": -7.25858736038208, "logps/chosen": -263.2978210449219, "logps/ref_chosen": -112.96040344238281, "logps/ref_rejected": -99.52803802490234, "logps/rejected": -316.2669982910156, "loss": 1.1213, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7416489124298096, "rewards/margins": 0.32538917660713196, "rewards/rejected": -1.0670380592346191, "step": 564 }, { "epoch": 0.8296622613803231, "grad_norm": 7.512981414794922, "kl/avg_steps": 0.53125, "kl/beta": 0.004932130686938763, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 4.375063135042445e-08, "logits/chosen": -7.909860610961914, "logits/rejected": -7.201497554779053, "logps/chosen": -239.09127807617188, "logps/ref_chosen": -95.36558532714844, "logps/ref_rejected": -92.21737670898438, "logps/rejected": -322.09124755859375, "loss": 1.0586, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7062567472457886, "rewards/margins": 0.41997939348220825, "rewards/rejected": -1.1262362003326416, "step": 565 }, { "epoch": 0.8311306901615272, "grad_norm": 6.797919750213623, "kl/avg_steps": 0.625, "kl/beta": 0.004906067159026861, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -7.698864936828613, "logits/rejected": -7.01510763168335, "logps/chosen": -248.45718383789062, "logps/ref_chosen": -98.97320556640625, "logps/ref_rejected": -110.82090759277344, "logps/rejected": -340.752197265625, "loss": 1.0825, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7296649217605591, "rewards/margins": 0.3899438977241516, "rewards/rejected": -1.1196088790893555, "step": 566 }, { "epoch": 0.8325991189427313, "grad_norm": 8.844162940979004, "kl/avg_steps": 0.578125, "kl/beta": 0.00487559475004673, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.231101308059165e-08, "logits/chosen": -7.589968681335449, "logits/rejected": -6.949653625488281, "logps/chosen": -247.43397521972656, "logps/ref_chosen": -92.7035903930664, "logps/ref_rejected": -91.22431945800781, "logps/rejected": -315.1552734375, "loss": 1.1228, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7517495155334473, "rewards/margins": 0.3330436944961548, "rewards/rejected": -1.084793210029602, "step": 567 }, { "epoch": 0.8340675477239354, "grad_norm": 7.321277618408203, "kl/avg_steps": 0.59375, "kl/beta": 0.004847569856792688, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -7.3952226638793945, "logits/rejected": -7.033419609069824, "logps/chosen": -219.06063842773438, "logps/ref_chosen": -73.13226318359375, "logps/ref_rejected": -94.95265197753906, "logps/rejected": -326.4876708984375, "loss": 1.0596, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7039542198181152, "rewards/margins": 0.4104452133178711, "rewards/rejected": -1.1143994331359863, "step": 568 }, { "epoch": 0.8355359765051396, "grad_norm": 10.243264198303223, "kl/avg_steps": 0.625, "kl/beta": 0.004818957298994064, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 4.089328585837512e-08, "logits/chosen": -7.595402240753174, "logits/rejected": -7.113461494445801, "logps/chosen": -243.99252319335938, "logps/ref_chosen": -89.21708679199219, "logps/ref_rejected": -86.41105651855469, "logps/rejected": -312.2991943359375, "loss": 1.1345, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7422770261764526, "rewards/margins": 0.3377072811126709, "rewards/rejected": -1.0799843072891235, "step": 569 }, { "epoch": 0.8370044052863436, "grad_norm": 7.365958213806152, "kl/avg_steps": 0.6875, "kl/beta": 0.004789025988429785, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 4.019267817841834e-08, "logits/chosen": -7.986109733581543, "logits/rejected": -7.3613433837890625, "logps/chosen": -233.00393676757812, "logps/ref_chosen": -94.76277160644531, "logps/ref_rejected": -88.84693145751953, "logps/rejected": -320.5428466796875, "loss": 1.029, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6582140922546387, "rewards/margins": 0.4426998496055603, "rewards/rejected": -1.1009138822555542, "step": 570 }, { "epoch": 0.8384728340675477, "grad_norm": 9.40793514251709, "kl/avg_steps": 0.59375, "kl/beta": 0.004756326321512461, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -7.549665451049805, "logits/rejected": -7.4702606201171875, "logps/chosen": -221.9232177734375, "logps/ref_chosen": -80.8212890625, "logps/ref_rejected": -97.01528930664062, "logps/rejected": -323.5943603515625, "loss": 1.0783, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6683664917945862, "rewards/margins": 0.40152615308761597, "rewards/rejected": -1.0698926448822021, "step": 571 }, { "epoch": 0.8399412628487518, "grad_norm": 7.180628776550293, "kl/avg_steps": 0.5625, "kl/beta": 0.004728252068161964, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.880806698864086e-08, "logits/chosen": -7.4798078536987305, "logits/rejected": -7.4275007247924805, "logps/chosen": -221.84637451171875, "logps/ref_chosen": -77.37992858886719, "logps/ref_rejected": -90.56597900390625, "logps/rejected": -319.32598876953125, "loss": 1.0794, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6801784038543701, "rewards/margins": 0.39393994212150574, "rewards/rejected": -1.0741183757781982, "step": 572 }, { "epoch": 0.8414096916299559, "grad_norm": 7.156675338745117, "kl/avg_steps": 0.65625, "kl/beta": 0.004701804369688034, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.812409996461275e-08, "logits/chosen": -7.6273932456970215, "logits/rejected": -7.473799228668213, "logps/chosen": -224.49549865722656, "logps/ref_chosen": -83.7060775756836, "logps/ref_rejected": -90.69746398925781, "logps/rejected": -310.0919494628906, "loss": 1.1016, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6590225100517273, "rewards/margins": 0.3648913502693176, "rewards/rejected": -1.023913860321045, "step": 573 }, { "epoch": 0.8428781204111601, "grad_norm": 8.27857494354248, "kl/avg_steps": 0.5625, "kl/beta": 0.004671149887144566, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.74457160675965e-08, "logits/chosen": -7.626450538635254, "logits/rejected": -7.274733066558838, "logps/chosen": -234.46607971191406, "logps/ref_chosen": -85.12948608398438, "logps/ref_rejected": -99.05403137207031, "logps/rejected": -330.6099548339844, "loss": 1.0924, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6954778432846069, "rewards/margins": 0.37927812337875366, "rewards/rejected": -1.074756145477295, "step": 574 }, { "epoch": 0.8443465491923642, "grad_norm": 9.878924369812012, "kl/avg_steps": 0.59375, "kl/beta": 0.004645021632313728, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 3.677293317363864e-08, "logits/chosen": -7.789747714996338, "logits/rejected": -7.52932071685791, "logps/chosen": -258.09442138671875, "logps/ref_chosen": -105.77084350585938, "logps/ref_rejected": -100.94944763183594, "logps/rejected": -335.0401306152344, "loss": 1.0999, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7044973969459534, "rewards/margins": 0.37485966086387634, "rewards/rejected": -1.0793570280075073, "step": 575 }, { "epoch": 0.8458149779735683, "grad_norm": 7.550034046173096, "kl/avg_steps": 0.53125, "kl/beta": 0.004617604892700911, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -7.752475738525391, "logits/rejected": -7.319489479064941, "logps/chosen": -234.95172119140625, "logps/ref_chosen": -81.20452880859375, "logps/ref_rejected": -105.40940856933594, "logps/rejected": -339.11627197265625, "loss": 1.1064, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7074535489082336, "rewards/margins": 0.36441028118133545, "rewards/rejected": -1.0718638896942139, "step": 576 }, { "epoch": 0.8472834067547724, "grad_norm": 6.790652275085449, "kl/avg_steps": 0.65625, "kl/beta": 0.00459320330992341, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -7.78272819519043, "logits/rejected": -7.093866348266602, "logps/chosen": -238.08453369140625, "logps/ref_chosen": -101.46419525146484, "logps/ref_rejected": -89.32637023925781, "logps/rejected": -322.09918212890625, "loss": 1.0486, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6245477199554443, "rewards/margins": 0.43618321418762207, "rewards/rejected": -1.0607309341430664, "step": 577 }, { "epoch": 0.8487518355359766, "grad_norm": 10.109602928161621, "kl/avg_steps": 0.53125, "kl/beta": 0.004563257098197937, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.478836705390808e-08, "logits/chosen": -7.468972682952881, "logits/rejected": -7.147921562194824, "logps/chosen": -239.65093994140625, "logps/ref_chosen": -77.29241943359375, "logps/ref_rejected": -88.24701690673828, "logps/rejected": -317.5960693359375, "loss": 1.1523, "rewards/accuracies": 0.765625, "rewards/chosen": -0.7381951808929443, "rewards/margins": 0.3014621138572693, "rewards/rejected": -1.0396572351455688, "step": 578 }, { "epoch": 0.8502202643171806, "grad_norm": 7.376382350921631, "kl/avg_steps": 0.53125, "kl/beta": 0.004539142828434706, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.41381639738331e-08, "logits/chosen": -7.745573997497559, "logits/rejected": -6.99274206161499, "logps/chosen": -232.49017333984375, "logps/ref_chosen": -87.34305572509766, "logps/ref_rejected": -102.12100982666016, "logps/rejected": -319.19024658203125, "loss": 1.13, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6563674211502075, "rewards/margins": 0.3224019408226013, "rewards/rejected": -0.9787693023681641, "step": 579 }, { "epoch": 0.8516886930983847, "grad_norm": 7.219406604766846, "kl/avg_steps": 0.5, "kl/beta": 0.004515156149864197, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 3.349364905389032e-08, "logits/chosen": -7.58662223815918, "logits/rejected": -7.382083892822266, "logps/chosen": -229.9556121826172, "logps/ref_chosen": -88.92879486083984, "logps/ref_rejected": -88.69661712646484, "logps/rejected": -315.407958984375, "loss": 1.0857, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6354615092277527, "rewards/margins": 0.3822898864746094, "rewards/rejected": -1.0177514553070068, "step": 580 }, { "epoch": 0.8531571218795888, "grad_norm": 8.329227447509766, "kl/avg_steps": 0.4375, "kl/beta": 0.0044926926493644714, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.285483927764726e-08, "logits/chosen": -7.649691581726074, "logits/rejected": -7.227565765380859, "logps/chosen": -242.04396057128906, "logps/ref_chosen": -92.22323608398438, "logps/ref_rejected": -99.38943481445312, "logps/rejected": -324.2420654296875, "loss": 1.1356, "rewards/accuracies": 0.734375, "rewards/chosen": -0.671623945236206, "rewards/margins": 0.33214688301086426, "rewards/rejected": -1.0037708282470703, "step": 581 }, { "epoch": 0.8546255506607929, "grad_norm": 6.972781181335449, "kl/avg_steps": 0.5625, "kl/beta": 0.004473122768104076, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.222175147833556e-08, "logits/chosen": -7.825651168823242, "logits/rejected": -7.724842548370361, "logps/chosen": -217.5670928955078, "logps/ref_chosen": -87.33561706542969, "logps/ref_rejected": -115.1544189453125, "logps/rejected": -317.5841064453125, "loss": 1.1379, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5807515382766724, "rewards/margins": 0.3188899755477905, "rewards/rejected": -0.8996415138244629, "step": 582 }, { "epoch": 0.856093979441997, "grad_norm": 7.694782733917236, "kl/avg_steps": 0.4375, "kl/beta": 0.004448102321475744, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.159440233840763e-08, "logits/chosen": -7.3343915939331055, "logits/rejected": -7.080375671386719, "logps/chosen": -240.53501892089844, "logps/ref_chosen": -83.71707153320312, "logps/ref_rejected": -94.58465576171875, "logps/rejected": -328.16815185546875, "loss": 1.12, "rewards/accuracies": 0.75, "rewards/chosen": -0.695499837398529, "rewards/margins": 0.337327241897583, "rewards/rejected": -1.0328271389007568, "step": 583 }, { "epoch": 0.8575624082232012, "grad_norm": 7.240791320800781, "kl/avg_steps": 0.53125, "kl/beta": 0.0044287266209721565, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -7.599449157714844, "logits/rejected": -7.185504913330078, "logps/chosen": -232.9828338623047, "logps/ref_chosen": -94.25955963134766, "logps/ref_rejected": -104.48269653320312, "logps/rejected": -325.73150634765625, "loss": 1.104, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6124042272567749, "rewards/margins": 0.3608018755912781, "rewards/rejected": -0.9732060432434082, "step": 584 }, { "epoch": 0.8590308370044053, "grad_norm": 6.849031925201416, "kl/avg_steps": 0.6875, "kl/beta": 0.004405323415994644, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.035698600998121e-08, "logits/chosen": -7.737574577331543, "logits/rejected": -7.5866241455078125, "logps/chosen": -234.18655395507812, "logps/ref_chosen": -85.63675689697266, "logps/ref_rejected": -90.28547668457031, "logps/rejected": -332.9254150390625, "loss": 1.0747, "rewards/accuracies": 0.859375, "rewards/chosen": -0.6510626077651978, "rewards/margins": 0.4093397259712219, "rewards/rejected": -1.0604023933410645, "step": 585 }, { "epoch": 0.8604992657856094, "grad_norm": 7.319284439086914, "kl/avg_steps": 0.5, "kl/beta": 0.0043752435594797134, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.974695142855388e-08, "logits/chosen": -7.4545087814331055, "logits/rejected": -6.863378524780273, "logps/chosen": -246.67648315429688, "logps/ref_chosen": -83.68115234375, "logps/ref_rejected": -98.35916137695312, "logps/rejected": -325.6482238769531, "loss": 1.1595, "rewards/accuracies": 0.796875, "rewards/chosen": -0.7111270427703857, "rewards/margins": 0.2777676582336426, "rewards/rejected": -0.9888947606086731, "step": 586 }, { "epoch": 0.8619676945668135, "grad_norm": 6.929393291473389, "kl/avg_steps": 0.625, "kl/beta": 0.004353476222604513, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -7.802062511444092, "logits/rejected": -6.934123992919922, "logps/chosen": -210.05108642578125, "logps/ref_chosen": -77.5775146484375, "logps/ref_rejected": -89.3203353881836, "logps/rejected": -292.1748352050781, "loss": 1.1418, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5736163258552551, "rewards/margins": 0.30252429842948914, "rewards/rejected": -0.8761405944824219, "step": 587 }, { "epoch": 0.8634361233480177, "grad_norm": 7.931462287902832, "kl/avg_steps": 0.5625, "kl/beta": 0.00432643573731184, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -7.959348678588867, "logits/rejected": -7.665192604064941, "logps/chosen": -227.4091796875, "logps/ref_chosen": -83.80426025390625, "logps/ref_rejected": -113.11943054199219, "logps/rejected": -323.88861083984375, "loss": 1.168, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6190832257270813, "rewards/margins": 0.28650587797164917, "rewards/rejected": -0.9055891036987305, "step": 588 }, { "epoch": 0.8649045521292217, "grad_norm": 6.9884934425354, "kl/avg_steps": 0.625, "kl/beta": 0.004302235785871744, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -7.721344947814941, "logits/rejected": -6.941378593444824, "logps/chosen": -250.2454376220703, "logps/ref_chosen": -96.46501922607422, "logps/ref_rejected": -116.03719329833984, "logps/rejected": -356.3966064453125, "loss": 1.0798, "rewards/accuracies": 0.875, "rewards/chosen": -0.65859454870224, "rewards/margins": 0.3684034049510956, "rewards/rejected": -1.0269978046417236, "step": 589 }, { "epoch": 0.8663729809104258, "grad_norm": 8.866425514221191, "kl/avg_steps": 0.75, "kl/beta": 0.0042755138128995895, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 2.736501028272095e-08, "logits/chosen": -7.883823394775391, "logits/rejected": -7.474784851074219, "logps/chosen": -230.829345703125, "logps/ref_chosen": -88.4278564453125, "logps/ref_rejected": -110.19876861572266, "logps/rejected": -336.062744140625, "loss": 1.0942, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6050465106964111, "rewards/margins": 0.3528992533683777, "rewards/rejected": -0.957945704460144, "step": 590 }, { "epoch": 0.8678414096916299, "grad_norm": 5.826635360717773, "kl/avg_steps": 0.59375, "kl/beta": 0.004243686329573393, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.678415274939408e-08, "logits/chosen": -7.728172302246094, "logits/rejected": -6.94543981552124, "logps/chosen": -224.34735107421875, "logps/ref_chosen": -87.84968566894531, "logps/ref_rejected": -89.57919311523438, "logps/rejected": -317.07794189453125, "loss": 1.0725, "rewards/accuracies": 0.84375, "rewards/chosen": -0.577212929725647, "rewards/margins": 0.3819907307624817, "rewards/rejected": -0.9592036604881287, "step": 591 }, { "epoch": 0.869309838472834, "grad_norm": 7.418654441833496, "kl/avg_steps": 0.53125, "kl/beta": 0.004218637943267822, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -7.696051597595215, "logits/rejected": -7.094331741333008, "logps/chosen": -235.08880615234375, "logps/ref_chosen": -74.97573852539062, "logps/ref_rejected": -81.55262756347656, "logps/rejected": -313.0411376953125, "loss": 1.1533, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6733924150466919, "rewards/margins": 0.2968708872795105, "rewards/rejected": -0.9702633619308472, "step": 592 }, { "epoch": 0.8707782672540382, "grad_norm": 10.643902778625488, "kl/avg_steps": 0.5, "kl/beta": 0.0041963448747992516, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 2.564009866938349e-08, "logits/chosen": -7.23436164855957, "logits/rejected": -6.888458251953125, "logps/chosen": -221.35968017578125, "logps/ref_chosen": -72.23902893066406, "logps/ref_rejected": -70.09652709960938, "logps/rejected": -280.5572509765625, "loss": 1.1869, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6236478090286255, "rewards/margins": 0.2537292242050171, "rewards/rejected": -0.8773769736289978, "step": 593 }, { "epoch": 0.8722466960352423, "grad_norm": 6.839757442474365, "kl/avg_steps": 0.625, "kl/beta": 0.004175467416644096, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -7.658824920654297, "logits/rejected": -7.013692378997803, "logps/chosen": -226.31228637695312, "logps/ref_chosen": -90.43771362304688, "logps/ref_rejected": -92.267578125, "logps/rejected": -318.3495788574219, "loss": 1.0774, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5646122694015503, "rewards/margins": 0.37262603640556335, "rewards/rejected": -0.937238335609436, "step": 594 }, { "epoch": 0.8737151248164464, "grad_norm": 7.265661239624023, "kl/avg_steps": 0.40625, "kl/beta": 0.004149532876908779, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 2.451969280180849e-08, "logits/chosen": -7.426346778869629, "logits/rejected": -6.822681427001953, "logps/chosen": -224.98826599121094, "logps/ref_chosen": -81.51480865478516, "logps/ref_rejected": -86.52047729492188, "logps/rejected": -306.0340576171875, "loss": 1.1373, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5944833159446716, "rewards/margins": 0.31153303384780884, "rewards/rejected": -0.9060162901878357, "step": 595 }, { "epoch": 0.8751835535976505, "grad_norm": 7.548264503479004, "kl/avg_steps": 0.53125, "kl/beta": 0.004132743459194899, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.396839494982103e-08, "logits/chosen": -7.338626384735107, "logits/rejected": -6.9880876541137695, "logps/chosen": -234.91661071777344, "logps/ref_chosen": -88.20805358886719, "logps/ref_rejected": -86.09375, "logps/rejected": -299.5952453613281, "loss": 1.1748, "rewards/accuracies": 0.75, "rewards/chosen": -0.6046238541603088, "rewards/margins": 0.27212288975715637, "rewards/rejected": -0.8767467737197876, "step": 596 }, { "epoch": 0.8766519823788547, "grad_norm": 6.239217758178711, "kl/avg_steps": 0.71875, "kl/beta": 0.004110904410481453, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -7.8197503089904785, "logits/rejected": -7.281065940856934, "logps/chosen": -227.93858337402344, "logps/ref_chosen": -83.58998107910156, "logps/ref_rejected": -106.94439697265625, "logps/rejected": -340.60699462890625, "loss": 1.0937, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5899473428726196, "rewards/margins": 0.36288982629776, "rewards/rejected": -0.9528372287750244, "step": 597 }, { "epoch": 0.8781204111600588, "grad_norm": 6.588911533355713, "kl/avg_steps": 0.5625, "kl/beta": 0.004081568215042353, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -7.641655921936035, "logits/rejected": -7.323144912719727, "logps/chosen": -227.22210693359375, "logps/ref_chosen": -90.18262481689453, "logps/ref_rejected": -94.7985610961914, "logps/rejected": -304.92047119140625, "loss": 1.1493, "rewards/accuracies": 0.765625, "rewards/chosen": -0.557396411895752, "rewards/margins": 0.2942042946815491, "rewards/rejected": -0.8516007661819458, "step": 598 }, { "epoch": 0.8795888399412628, "grad_norm": 6.382659435272217, "kl/avg_steps": 0.375, "kl/beta": 0.004058737773448229, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -7.610280990600586, "logits/rejected": -7.40827751159668, "logps/chosen": -218.54464721679688, "logps/ref_chosen": -77.84227752685547, "logps/ref_rejected": -89.1976318359375, "logps/rejected": -296.90069580078125, "loss": 1.1718, "rewards/accuracies": 0.703125, "rewards/chosen": -0.570677638053894, "rewards/margins": 0.2681824266910553, "rewards/rejected": -0.838860034942627, "step": 599 }, { "epoch": 0.8810572687224669, "grad_norm": 5.930601119995117, "kl/avg_steps": 0.5625, "kl/beta": 0.004043574444949627, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -7.736866474151611, "logits/rejected": -7.516294956207275, "logps/chosen": -242.80270385742188, "logps/ref_chosen": -92.18781280517578, "logps/ref_rejected": -92.95429992675781, "logps/rejected": -322.2016906738281, "loss": 1.1296, "rewards/accuracies": 0.796875, "rewards/chosen": -0.6067217588424683, "rewards/margins": 0.31417691707611084, "rewards/rejected": -0.9208986759185791, "step": 600 }, { "epoch": 0.8810572687224669, "eval_kl/n_epsilon_steps": 0.252996563911438, "eval_kl/p_epsilon_steps": 0.7465753555297852, "eval_logits/chosen": -7.617927551269531, "eval_logits/rejected": -7.201268672943115, "eval_logps/chosen": -250.90887451171875, "eval_logps/ref_chosen": -100.49356842041016, "eval_logps/ref_rejected": -94.06775665283203, "eval_logps/rejected": -312.31573486328125, "eval_loss": 0.5845144987106323, "eval_rewards/accuracies": 0.7568492889404297, "eval_rewards/chosen": -0.6029341220855713, "eval_rewards/margins": 0.2691001296043396, "eval_rewards/rejected": -0.8720343112945557, "eval_runtime": 48.5484, "eval_samples_per_second": 48.179, "eval_steps_per_second": 1.524, "step": 600 }, { "epoch": 0.882525697503671, "grad_norm": 6.323831081390381, "kl/avg_steps": 0.59375, "kl/beta": 0.004020956344902515, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -7.606431007385254, "logits/rejected": -7.300946235656738, "logps/chosen": -244.5936279296875, "logps/ref_chosen": -89.21614074707031, "logps/ref_rejected": -100.17054748535156, "logps/rejected": -336.119873046875, "loss": 1.1196, "rewards/accuracies": 0.828125, "rewards/chosen": -0.6223502159118652, "rewards/margins": 0.32026663422584534, "rewards/rejected": -0.9426168203353882, "step": 601 }, { "epoch": 0.8839941262848752, "grad_norm": 7.207840919494629, "kl/avg_steps": 0.65625, "kl/beta": 0.003997222986072302, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -7.25759744644165, "logits/rejected": -7.028449058532715, "logps/chosen": -202.654296875, "logps/ref_chosen": -80.05760192871094, "logps/ref_rejected": -93.197509765625, "logps/rejected": -315.4969482421875, "loss": 1.0711, "rewards/accuracies": 0.828125, "rewards/chosen": -0.48817095160484314, "rewards/margins": 0.3939117193222046, "rewards/rejected": -0.8820826411247253, "step": 602 }, { "epoch": 0.8854625550660793, "grad_norm": 6.062160968780518, "kl/avg_steps": 0.5625, "kl/beta": 0.003971162252128124, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -7.96243953704834, "logits/rejected": -7.082343101501465, "logps/chosen": -235.39010620117188, "logps/ref_chosen": -102.30957794189453, "logps/ref_rejected": -108.06884765625, "logps/rejected": -326.241455078125, "loss": 1.1139, "rewards/accuracies": 0.78125, "rewards/chosen": -0.526463508605957, "rewards/margins": 0.33385905623435974, "rewards/rejected": -0.8603225946426392, "step": 603 }, { "epoch": 0.8869309838472834, "grad_norm": 6.03358268737793, "kl/avg_steps": 0.6875, "kl/beta": 0.0039489492774009705, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.977362051376158e-08, "logits/chosen": -7.427624702453613, "logits/rejected": -7.069514274597168, "logps/chosen": -216.16348266601562, "logps/ref_chosen": -78.17408752441406, "logps/ref_rejected": -99.4961166381836, "logps/rejected": -323.61181640625, "loss": 1.1058, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5420645475387573, "rewards/margins": 0.33607620000839233, "rewards/rejected": -0.8781407475471497, "step": 604 }, { "epoch": 0.8883994126284875, "grad_norm": 6.287881374359131, "kl/avg_steps": 0.46875, "kl/beta": 0.003921985626220703, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -7.781911373138428, "logits/rejected": -7.073343276977539, "logps/chosen": -242.73826599121094, "logps/ref_chosen": -94.77333068847656, "logps/ref_rejected": -107.30490112304688, "logps/rejected": -327.88031005859375, "loss": 1.1539, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5785641074180603, "rewards/margins": 0.2811967134475708, "rewards/rejected": -0.8597608208656311, "step": 605 }, { "epoch": 0.8898678414096917, "grad_norm": 6.576826572418213, "kl/avg_steps": 0.65625, "kl/beta": 0.003903687233105302, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -7.757646083831787, "logits/rejected": -6.858328342437744, "logps/chosen": -232.45693969726562, "logps/ref_chosen": -87.7533950805664, "logps/ref_rejected": -104.2422103881836, "logps/rejected": -326.3280944824219, "loss": 1.1412, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5615659952163696, "rewards/margins": 0.298251748085022, "rewards/rejected": -0.8598177433013916, "step": 606 }, { "epoch": 0.8913362701908958, "grad_norm": 6.907278060913086, "kl/avg_steps": 0.5, "kl/beta": 0.0038782362826168537, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -7.828334808349609, "logits/rejected": -6.711248397827148, "logps/chosen": -229.9647216796875, "logps/ref_chosen": -88.32904815673828, "logps/ref_rejected": -86.76811218261719, "logps/rejected": -314.5696716308594, "loss": 1.1314, "rewards/accuracies": 0.75, "rewards/chosen": -0.5482698082923889, "rewards/margins": 0.3295517861843109, "rewards/rejected": -0.8778215646743774, "step": 607 }, { "epoch": 0.8928046989720999, "grad_norm": 7.614426612854004, "kl/avg_steps": 0.65625, "kl/beta": 0.0038589416071772575, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -7.646144866943359, "logits/rejected": -7.090588569641113, "logps/chosen": -215.21920776367188, "logps/ref_chosen": -85.76937103271484, "logps/ref_rejected": -100.23281860351562, "logps/rejected": -319.41949462890625, "loss": 1.1084, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4974094033241272, "rewards/margins": 0.34195607900619507, "rewards/rejected": -0.8393654823303223, "step": 608 }, { "epoch": 0.8942731277533039, "grad_norm": 6.746026992797852, "kl/avg_steps": 0.625, "kl/beta": 0.0038337823934853077, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -7.802424907684326, "logits/rejected": -7.451755523681641, "logps/chosen": -232.28611755371094, "logps/ref_chosen": -92.96656799316406, "logps/ref_rejected": -95.91818237304688, "logps/rejected": -322.77618408203125, "loss": 1.1163, "rewards/accuracies": 0.84375, "rewards/chosen": -0.531781792640686, "rewards/margins": 0.3316161632537842, "rewards/rejected": -0.8633979558944702, "step": 609 }, { "epoch": 0.895741556534508, "grad_norm": 6.536015510559082, "kl/avg_steps": 0.65625, "kl/beta": 0.0038099701050668955, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -7.566076278686523, "logits/rejected": -7.088051795959473, "logps/chosen": -228.5191650390625, "logps/ref_chosen": -94.70028686523438, "logps/ref_rejected": -89.68739318847656, "logps/rejected": -314.64813232421875, "loss": 1.1011, "rewards/accuracies": 0.875, "rewards/chosen": -0.5073626637458801, "rewards/margins": 0.3432729244232178, "rewards/rejected": -0.8506356477737427, "step": 610 }, { "epoch": 0.8972099853157122, "grad_norm": 7.131064414978027, "kl/avg_steps": 0.5625, "kl/beta": 0.003785130102187395, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.6421423736208e-08, "logits/chosen": -7.605221748352051, "logits/rejected": -7.5947184562683105, "logps/chosen": -224.7576446533203, "logps/ref_chosen": -86.78334045410156, "logps/ref_rejected": -89.84307861328125, "logps/rejected": -316.98431396484375, "loss": 1.1105, "rewards/accuracies": 0.828125, "rewards/chosen": -0.52030348777771, "rewards/margins": 0.3337860703468323, "rewards/rejected": -0.8540895581245422, "step": 611 }, { "epoch": 0.8986784140969163, "grad_norm": 6.337128639221191, "kl/avg_steps": 0.6875, "kl/beta": 0.003763957880437374, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -7.775234699249268, "logits/rejected": -6.849664688110352, "logps/chosen": -251.38430786132812, "logps/ref_chosen": -101.02015686035156, "logps/ref_rejected": -93.78302764892578, "logps/rejected": -335.0631103515625, "loss": 1.1023, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5628792643547058, "rewards/margins": 0.33847683668136597, "rewards/rejected": -0.9013561010360718, "step": 612 }, { "epoch": 0.9001468428781204, "grad_norm": 6.324493885040283, "kl/avg_steps": 0.90625, "kl/beta": 0.003738257335498929, "kl/n_epsilon_steps": 0.046875, "kl/p_epsilon_steps": 0.953125, "learning_rate": 1.551886292185553e-08, "logits/chosen": -8.078614234924316, "logits/rejected": -7.592404365539551, "logps/chosen": -213.39401245117188, "logps/ref_chosen": -88.9886245727539, "logps/ref_rejected": -109.99551391601562, "logps/rejected": -333.4338073730469, "loss": 1.0762, "rewards/accuracies": 0.921875, "rewards/chosen": -0.46113917231559753, "rewards/margins": 0.3663785457611084, "rewards/rejected": -0.8275177478790283, "step": 613 }, { "epoch": 0.9016152716593245, "grad_norm": 7.8051652908325195, "kl/avg_steps": 0.5625, "kl/beta": 0.0037046836223453283, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.507684480352292e-08, "logits/chosen": -7.316326141357422, "logits/rejected": -7.080374240875244, "logps/chosen": -230.80239868164062, "logps/ref_chosen": -80.20005798339844, "logps/ref_rejected": -109.86239624023438, "logps/rejected": -340.70416259765625, "loss": 1.1461, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5560883283615112, "rewards/margins": 0.29344964027404785, "rewards/rejected": -0.8495379686355591, "step": 614 }, { "epoch": 0.9030837004405287, "grad_norm": 6.5305585861206055, "kl/avg_steps": 0.59375, "kl/beta": 0.0036839614622294903, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -7.641972541809082, "logits/rejected": -6.816801071166992, "logps/chosen": -243.8656463623047, "logps/ref_chosen": -100.43526458740234, "logps/ref_rejected": -101.1800537109375, "logps/rejected": -315.3383483886719, "loss": 1.1669, "rewards/accuracies": 0.796875, "rewards/chosen": -0.525856077671051, "rewards/margins": 0.2574092149734497, "rewards/rejected": -0.7832653522491455, "step": 615 }, { "epoch": 0.9045521292217328, "grad_norm": 5.881459712982178, "kl/avg_steps": 0.625, "kl/beta": 0.003662216942757368, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -7.507961273193359, "logits/rejected": -7.136435508728027, "logps/chosen": -236.98391723632812, "logps/ref_chosen": -92.49292755126953, "logps/ref_rejected": -82.06065368652344, "logps/rejected": -318.6792297363281, "loss": 1.1124, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5267339944839478, "rewards/margins": 0.3333369195461273, "rewards/rejected": -0.8600709438323975, "step": 616 }, { "epoch": 0.9060205580029369, "grad_norm": 6.169250011444092, "kl/avg_steps": 0.59375, "kl/beta": 0.0036394703201949596, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.378797888467345e-08, "logits/chosen": -7.898214817047119, "logits/rejected": -7.082752227783203, "logps/chosen": -241.96456909179688, "logps/ref_chosen": -91.09699249267578, "logps/ref_rejected": -70.41004943847656, "logps/rejected": -294.9502868652344, "loss": 1.1664, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5471093654632568, "rewards/margins": 0.26464053988456726, "rewards/rejected": -0.8117498755455017, "step": 617 }, { "epoch": 0.9074889867841409, "grad_norm": 7.049670696258545, "kl/avg_steps": 0.5625, "kl/beta": 0.003617988433688879, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -7.613478660583496, "logits/rejected": -7.167911052703857, "logps/chosen": -252.33189392089844, "logps/ref_chosen": -102.02059936523438, "logps/ref_rejected": -99.80119323730469, "logps/rejected": -331.26397705078125, "loss": 1.1462, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5415487289428711, "rewards/margins": 0.2898721694946289, "rewards/rejected": -0.8314208388328552, "step": 618 }, { "epoch": 0.908957415565345, "grad_norm": 6.142784118652344, "kl/avg_steps": 0.375, "kl/beta": 0.003597751259803772, "kl/n_epsilon_steps": 0.3125, "kl/p_epsilon_steps": 0.6875, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -7.600045204162598, "logits/rejected": -7.366483688354492, "logps/chosen": -234.5894775390625, "logps/ref_chosen": -89.74136352539062, "logps/ref_rejected": -99.90138244628906, "logps/rejected": -319.714111328125, "loss": 1.1708, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5201643705368042, "rewards/margins": 0.26622116565704346, "rewards/rejected": -0.7863855361938477, "step": 619 }, { "epoch": 0.9104258443465492, "grad_norm": 5.838797092437744, "kl/avg_steps": 0.40625, "kl/beta": 0.003584309946745634, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -8.057722091674805, "logits/rejected": -7.399764060974121, "logps/chosen": -225.7647247314453, "logps/ref_chosen": -85.12431335449219, "logps/ref_rejected": -85.41253662109375, "logps/rejected": -307.8377380371094, "loss": 1.1526, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5036029815673828, "rewards/margins": 0.28931373357772827, "rewards/rejected": -0.7929166555404663, "step": 620 }, { "epoch": 0.9118942731277533, "grad_norm": 5.572709083557129, "kl/avg_steps": 0.46875, "kl/beta": 0.0035698076244443655, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -7.758251190185547, "logits/rejected": -7.50982666015625, "logps/chosen": -229.43209838867188, "logps/ref_chosen": -89.24842071533203, "logps/ref_rejected": -95.46463775634766, "logps/rejected": -315.90655517578125, "loss": 1.1604, "rewards/accuracies": 0.765625, "rewards/chosen": -0.49918726086616516, "rewards/margins": 0.28264692425727844, "rewards/rejected": -0.7818341851234436, "step": 621 }, { "epoch": 0.9133627019089574, "grad_norm": 6.386410713195801, "kl/avg_steps": 0.5625, "kl/beta": 0.0035531523171812296, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -7.883831977844238, "logits/rejected": -6.893963813781738, "logps/chosen": -242.18194580078125, "logps/ref_chosen": -99.79413604736328, "logps/ref_rejected": -90.82821655273438, "logps/rejected": -317.67083740234375, "loss": 1.1355, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5042383670806885, "rewards/margins": 0.29660987854003906, "rewards/rejected": -0.8008482456207275, "step": 622 }, { "epoch": 0.9148311306901615, "grad_norm": 7.438137531280518, "kl/avg_steps": 0.40625, "kl/beta": 0.0035332776606082916, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -7.771790504455566, "logits/rejected": -7.093353748321533, "logps/chosen": -236.72573852539062, "logps/ref_chosen": -90.45555114746094, "logps/ref_rejected": -91.32276916503906, "logps/rejected": -297.7119445800781, "loss": 1.2219, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5164909958839417, "rewards/margins": 0.20907816290855408, "rewards/rejected": -0.7255691289901733, "step": 623 }, { "epoch": 0.9162995594713657, "grad_norm": 6.896797180175781, "kl/avg_steps": 0.5625, "kl/beta": 0.0035189816262573004, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -8.146064758300781, "logits/rejected": -7.289480209350586, "logps/chosen": -258.7445373535156, "logps/ref_chosen": -109.87522888183594, "logps/ref_rejected": -104.77320861816406, "logps/rejected": -344.84759521484375, "loss": 1.1213, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5218902826309204, "rewards/margins": 0.3172043263912201, "rewards/rejected": -0.8390946388244629, "step": 624 }, { "epoch": 0.9177679882525698, "grad_norm": 5.683692455291748, "kl/avg_steps": 0.5, "kl/beta": 0.0034992981236428022, "kl/n_epsilon_steps": 0.25, "kl/p_epsilon_steps": 0.75, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -7.770914554595947, "logits/rejected": -7.289626121520996, "logps/chosen": -243.88046264648438, "logps/ref_chosen": -87.16815948486328, "logps/ref_rejected": -91.86148071289062, "logps/rejected": -318.0606994628906, "loss": 1.1839, "rewards/accuracies": 0.75, "rewards/chosen": -0.5464307069778442, "rewards/margins": 0.24018320441246033, "rewards/rejected": -0.786613941192627, "step": 625 }, { "epoch": 0.9192364170337739, "grad_norm": 7.084156513214111, "kl/avg_steps": 0.40625, "kl/beta": 0.0034818886779248714, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -7.743939399719238, "logits/rejected": -7.482439994812012, "logps/chosen": -254.01075744628906, "logps/ref_chosen": -104.22421264648438, "logps/ref_rejected": -104.1774673461914, "logps/rejected": -321.1068115234375, "loss": 1.2005, "rewards/accuracies": 0.75, "rewards/chosen": -0.5199881792068481, "rewards/margins": 0.23050320148468018, "rewards/rejected": -0.7504914402961731, "step": 626 }, { "epoch": 0.920704845814978, "grad_norm": 5.33841609954834, "kl/avg_steps": 0.625, "kl/beta": 0.0034678007941693068, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 9.897955805412e-09, "logits/chosen": -7.591939926147461, "logits/rejected": -7.3537187576293945, "logps/chosen": -208.89431762695312, "logps/ref_chosen": -74.93461608886719, "logps/ref_rejected": -112.57289123535156, "logps/rejected": -330.51983642578125, "loss": 1.1507, "rewards/accuracies": 0.828125, "rewards/chosen": -0.46203625202178955, "rewards/margins": 0.2876952886581421, "rewards/rejected": -0.7497316002845764, "step": 627 }, { "epoch": 0.922173274596182, "grad_norm": 6.301426410675049, "kl/avg_steps": 0.6875, "kl/beta": 0.0034462616313248873, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 9.543589206795238e-09, "logits/chosen": -7.962158679962158, "logits/rejected": -7.421542644500732, "logps/chosen": -231.39364624023438, "logps/ref_chosen": -93.69107818603516, "logps/ref_rejected": -107.34395599365234, "logps/rejected": -335.9172668457031, "loss": 1.1251, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4717589020729065, "rewards/margins": 0.30952274799346924, "rewards/rejected": -0.7812816500663757, "step": 628 }, { "epoch": 0.9236417033773862, "grad_norm": 5.8255295753479, "kl/avg_steps": 0.53125, "kl/beta": 0.0034227303694933653, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 9.19555885822887e-09, "logits/chosen": -7.680126190185547, "logits/rejected": -7.379844665527344, "logps/chosen": -253.8419952392578, "logps/ref_chosen": -103.23037719726562, "logps/ref_rejected": -97.16841888427734, "logps/rejected": -326.94293212890625, "loss": 1.1605, "rewards/accuracies": 0.828125, "rewards/chosen": -0.5139710903167725, "rewards/margins": 0.2675333023071289, "rewards/rejected": -0.7815043926239014, "step": 629 }, { "epoch": 0.9251101321585903, "grad_norm": 7.221403121948242, "kl/avg_steps": 0.46875, "kl/beta": 0.003404643153771758, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 8.85387393063622e-09, "logits/chosen": -7.831258773803711, "logits/rejected": -7.15199089050293, "logps/chosen": -232.68309020996094, "logps/ref_chosen": -93.89755249023438, "logps/ref_rejected": -89.3743896484375, "logps/rejected": -283.4253234863281, "loss": 1.2339, "rewards/accuracies": 0.75, "rewards/chosen": -0.4714937210083008, "rewards/margins": 0.1851492077112198, "rewards/rejected": -0.6566429138183594, "step": 630 }, { "epoch": 0.9265785609397944, "grad_norm": 5.580920219421387, "kl/avg_steps": 0.53125, "kl/beta": 0.003388758283108473, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.518543427732949e-09, "logits/chosen": -7.815197944641113, "logits/rejected": -7.3377485275268555, "logps/chosen": -220.7677764892578, "logps/ref_chosen": -87.77082061767578, "logps/ref_rejected": -88.68241882324219, "logps/rejected": -300.57403564453125, "loss": 1.1612, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4492769241333008, "rewards/margins": 0.2641494572162628, "rewards/rejected": -0.7134263515472412, "step": 631 }, { "epoch": 0.9280469897209985, "grad_norm": 6.467636585235596, "kl/avg_steps": 0.53125, "kl/beta": 0.0033708508126437664, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.189576185789637e-09, "logits/chosen": -7.605412483215332, "logits/rejected": -7.2681169509887695, "logps/chosen": -221.4307861328125, "logps/ref_chosen": -88.62652587890625, "logps/ref_rejected": -91.45091247558594, "logps/rejected": -310.26220703125, "loss": 1.1516, "rewards/accuracies": 0.765625, "rewards/chosen": -0.44607216119766235, "rewards/margins": 0.2862231731414795, "rewards/rejected": -0.7322953343391418, "step": 632 }, { "epoch": 0.9295154185022027, "grad_norm": 6.414995193481445, "kl/avg_steps": 0.28125, "kl/beta": 0.00335303763858974, "kl/n_epsilon_steps": 0.359375, "kl/p_epsilon_steps": 0.640625, "learning_rate": 7.866980873399015e-09, "logits/chosen": -7.696202278137207, "logits/rejected": -7.264307498931885, "logps/chosen": -247.08786010742188, "logps/ref_chosen": -81.37442016601562, "logps/ref_rejected": -98.62571716308594, "logps/rejected": -316.35614013671875, "loss": 1.2589, "rewards/accuracies": 0.671875, "rewards/chosen": -0.5550001859664917, "rewards/margins": 0.17108407616615295, "rewards/rejected": -0.7260842323303223, "step": 633 }, { "epoch": 0.9309838472834068, "grad_norm": 6.248257637023926, "kl/avg_steps": 0.53125, "kl/beta": 0.0033436338417232037, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 7.550765991247654e-09, "logits/chosen": -7.881155490875244, "logits/rejected": -7.148515701293945, "logps/chosen": -250.07144165039062, "logps/ref_chosen": -96.12284851074219, "logps/ref_rejected": -112.84780883789062, "logps/rejected": -334.539306640625, "loss": 1.1935, "rewards/accuracies": 0.75, "rewards/chosen": -0.5128281712532043, "rewards/margins": 0.22365406155586243, "rewards/rejected": -0.7364822030067444, "step": 634 }, { "epoch": 0.9324522760646109, "grad_norm": 6.869739055633545, "kl/avg_steps": 0.46875, "kl/beta": 0.003325964557006955, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 7.240939871891699e-09, "logits/chosen": -7.332803249359131, "logits/rejected": -6.858902931213379, "logps/chosen": -258.06280517578125, "logps/ref_chosen": -98.68411254882812, "logps/ref_rejected": -89.8991928100586, "logps/rejected": -301.9739990234375, "loss": 1.2457, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5283612012863159, "rewards/margins": 0.17254400253295898, "rewards/rejected": -0.7009052634239197, "step": 635 }, { "epoch": 0.933920704845815, "grad_norm": 5.351519584655762, "kl/avg_steps": 0.625, "kl/beta": 0.003310446860268712, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 6.937510679537628e-09, "logits/chosen": -7.650508880615234, "logits/rejected": -6.78857946395874, "logps/chosen": -234.77163696289062, "logps/ref_chosen": -90.41796112060547, "logps/ref_rejected": -87.70687866210938, "logps/rejected": -316.6680603027344, "loss": 1.1499, "rewards/accuracies": 0.828125, "rewards/chosen": -0.4755138158798218, "rewards/margins": 0.27691105008125305, "rewards/rejected": -0.7524248361587524, "step": 636 }, { "epoch": 0.9353891336270191, "grad_norm": 5.5197577476501465, "kl/avg_steps": 0.59375, "kl/beta": 0.0032898851204663515, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.640486409826785e-09, "logits/chosen": -7.380791664123535, "logits/rejected": -7.242560386657715, "logps/chosen": -227.92446899414062, "logps/ref_chosen": -82.44971466064453, "logps/ref_rejected": -104.02860260009766, "logps/rejected": -339.26043701171875, "loss": 1.1395, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4761905372142792, "rewards/margins": 0.29185453057289124, "rewards/rejected": -0.7680450677871704, "step": 637 }, { "epoch": 0.9368575624082232, "grad_norm": 5.237280368804932, "kl/avg_steps": 0.59375, "kl/beta": 0.0032704665791243315, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 6.349874889624962e-09, "logits/chosen": -7.648863792419434, "logits/rejected": -6.924067497253418, "logps/chosen": -226.06134033203125, "logps/ref_chosen": -91.92498779296875, "logps/ref_rejected": -86.28703308105469, "logps/rejected": -310.4959716796875, "loss": 1.1318, "rewards/accuracies": 0.84375, "rewards/chosen": -0.43678027391433716, "rewards/margins": 0.2914542555809021, "rewards/rejected": -0.7282345294952393, "step": 638 }, { "epoch": 0.9383259911894273, "grad_norm": 5.765613079071045, "kl/avg_steps": 0.34375, "kl/beta": 0.003251162823289633, "kl/n_epsilon_steps": 0.328125, "kl/p_epsilon_steps": 0.671875, "learning_rate": 6.065683776815933e-09, "logits/chosen": -7.779665470123291, "logits/rejected": -6.977321624755859, "logps/chosen": -281.9426574707031, "logps/ref_chosen": -104.52755737304688, "logps/ref_rejected": -81.4803466796875, "logps/rejected": -305.64599609375, "loss": 1.2695, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5761454105377197, "rewards/margins": 0.1490968018770218, "rewards/rejected": -0.7252421975135803, "step": 639 }, { "epoch": 0.9397944199706314, "grad_norm": 5.791350364685059, "kl/avg_steps": 0.71875, "kl/beta": 0.00324002536945045, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -7.959763526916504, "logits/rejected": -7.383049964904785, "logps/chosen": -245.22933959960938, "logps/ref_chosen": -97.88526916503906, "logps/ref_rejected": -112.70501708984375, "logps/rejected": -343.93792724609375, "loss": 1.1573, "rewards/accuracies": 0.859375, "rewards/chosen": -0.47469913959503174, "rewards/margins": 0.26852816343307495, "rewards/rejected": -0.7432272434234619, "step": 640 }, { "epoch": 0.9412628487518355, "grad_norm": 5.238857269287109, "kl/avg_steps": 0.3125, "kl/beta": 0.0032169038895517588, "kl/n_epsilon_steps": 0.34375, "kl/p_epsilon_steps": 0.65625, "learning_rate": 5.516592558795746e-09, "logits/chosen": -7.708554267883301, "logits/rejected": -7.541186332702637, "logps/chosen": -232.91995239257812, "logps/ref_chosen": -96.4456787109375, "logps/ref_rejected": -95.13568878173828, "logps/rejected": -317.35809326171875, "loss": 1.1614, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4384271800518036, "rewards/margins": 0.27239149808883667, "rewards/rejected": -0.7108187079429626, "step": 641 }, { "epoch": 0.9427312775330396, "grad_norm": 5.874085903167725, "kl/avg_steps": 0.53125, "kl/beta": 0.003206882392987609, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 5.251706922648868e-09, "logits/chosen": -7.545126914978027, "logits/rejected": -7.102571487426758, "logps/chosen": -256.21673583984375, "logps/ref_chosen": -100.75984954833984, "logps/ref_rejected": -114.70763397216797, "logps/rejected": -347.91558837890625, "loss": 1.1825, "rewards/accuracies": 0.78125, "rewards/chosen": -0.497159868478775, "rewards/margins": 0.24616217613220215, "rewards/rejected": -0.7433220148086548, "step": 642 }, { "epoch": 0.9441997063142438, "grad_norm": 6.670008659362793, "kl/avg_steps": 0.5625, "kl/beta": 0.003189935814589262, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.993270631642038e-09, "logits/chosen": -7.875416278839111, "logits/rejected": -7.243396759033203, "logps/chosen": -229.69638061523438, "logps/ref_chosen": -84.74365997314453, "logps/ref_rejected": -94.31842041015625, "logps/rejected": -315.35858154296875, "loss": 1.1822, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4609188437461853, "rewards/margins": 0.2396201640367508, "rewards/rejected": -0.7005389928817749, "step": 643 }, { "epoch": 0.9456681350954479, "grad_norm": 6.133624076843262, "kl/avg_steps": 0.5625, "kl/beta": 0.0031720928382128477, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 4.741290495811873e-09, "logits/chosen": -7.913561820983887, "logits/rejected": -7.103449821472168, "logps/chosen": -230.85415649414062, "logps/ref_chosen": -85.32275390625, "logps/ref_rejected": -94.60861206054688, "logps/rejected": -304.661865234375, "loss": 1.2111, "rewards/accuracies": 0.796875, "rewards/chosen": -0.45997491478919983, "rewards/margins": 0.20199283957481384, "rewards/rejected": -0.6619677543640137, "step": 644 }, { "epoch": 0.947136563876652, "grad_norm": 5.844942092895508, "kl/avg_steps": 0.1875, "kl/beta": 0.0031543495133519173, "kl/n_epsilon_steps": 0.40625, "kl/p_epsilon_steps": 0.59375, "learning_rate": 4.495773155069299e-09, "logits/chosen": -7.571622371673584, "logits/rejected": -7.209532737731934, "logps/chosen": -241.77206420898438, "logps/ref_chosen": -82.59024047851562, "logps/ref_rejected": -103.00375366210938, "logps/rejected": -306.9005432128906, "loss": 1.2771, "rewards/accuracies": 0.625, "rewards/chosen": -0.5023009777069092, "rewards/margins": 0.13825571537017822, "rewards/rejected": -0.6405566930770874, "step": 645 }, { "epoch": 0.9486049926578561, "grad_norm": 5.194578647613525, "kl/avg_steps": 0.59375, "kl/beta": 0.0031484460923820734, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 4.256725079024553e-09, "logits/chosen": -7.935370922088623, "logits/rejected": -7.05921745300293, "logps/chosen": -229.54640197753906, "logps/ref_chosen": -91.1920394897461, "logps/ref_rejected": -83.77833557128906, "logps/rejected": -303.084716796875, "loss": 1.1663, "rewards/accuracies": 0.796875, "rewards/chosen": -0.43383896350860596, "rewards/margins": 0.2520383596420288, "rewards/rejected": -0.6858773231506348, "step": 646 }, { "epoch": 0.9500734214390602, "grad_norm": 5.19912576675415, "kl/avg_steps": 0.40625, "kl/beta": 0.00312986271455884, "kl/n_epsilon_steps": 0.296875, "kl/p_epsilon_steps": 0.703125, "learning_rate": 4.024152566816791e-09, "logits/chosen": -7.209627151489258, "logits/rejected": -7.173065185546875, "logps/chosen": -242.48236083984375, "logps/ref_chosen": -88.84446716308594, "logps/ref_rejected": -99.49832916259766, "logps/rejected": -322.8554382324219, "loss": 1.2089, "rewards/accuracies": 0.703125, "rewards/chosen": -0.47964316606521606, "rewards/margins": 0.21512295305728912, "rewards/rejected": -0.694766104221344, "step": 647 }, { "epoch": 0.9515418502202643, "grad_norm": 5.339207172393799, "kl/avg_steps": 0.6875, "kl/beta": 0.003117199055850506, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 3.798061746947995e-09, "logits/chosen": -7.534468173980713, "logits/rejected": -7.395105838775635, "logps/chosen": -227.42025756835938, "logps/ref_chosen": -87.84810638427734, "logps/ref_rejected": -104.67005920410156, "logps/rejected": -343.5663146972656, "loss": 1.127, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4329220652580261, "rewards/margins": 0.30612409114837646, "rewards/rejected": -0.7390461564064026, "step": 648 }, { "epoch": 0.9530102790014684, "grad_norm": 5.596925258636475, "kl/avg_steps": 0.5625, "kl/beta": 0.0030959146097302437, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -7.8632001876831055, "logits/rejected": -7.287814140319824, "logps/chosen": -217.3938446044922, "logps/ref_chosen": -89.6925048828125, "logps/ref_rejected": -88.70658111572266, "logps/rejected": -297.97711181640625, "loss": 1.1676, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39361506700515747, "rewards/margins": 0.24976572394371033, "rewards/rejected": -0.6433808207511902, "step": 649 }, { "epoch": 0.9544787077826725, "grad_norm": 5.370362281799316, "kl/avg_steps": 0.625, "kl/beta": 0.0030785975977778435, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -7.673471450805664, "logits/rejected": -7.417351722717285, "logps/chosen": -241.04432678222656, "logps/ref_chosen": -89.93060302734375, "logps/ref_rejected": -102.61282348632812, "logps/rejected": -332.0096435546875, "loss": 1.1869, "rewards/accuracies": 0.796875, "rewards/chosen": -0.463270366191864, "rewards/margins": 0.23782768845558167, "rewards/rejected": -0.7010980844497681, "step": 650 }, { "epoch": 0.9559471365638766, "grad_norm": 5.453701972961426, "kl/avg_steps": 0.65625, "kl/beta": 0.0030594756826758385, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.158738163478475e-09, "logits/chosen": -7.594956398010254, "logits/rejected": -6.981936454772949, "logps/chosen": -208.45730590820312, "logps/ref_chosen": -79.18731689453125, "logps/ref_rejected": -105.93333435058594, "logps/rejected": -328.37841796875, "loss": 1.1412, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3932613134384155, "rewards/margins": 0.2819024920463562, "rewards/rejected": -0.675163745880127, "step": 651 }, { "epoch": 0.9574155653450808, "grad_norm": 4.904377460479736, "kl/avg_steps": 0.6875, "kl/beta": 0.003039528848603368, "kl/n_epsilon_steps": 0.15625, "kl/p_epsilon_steps": 0.84375, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -7.993418216705322, "logits/rejected": -7.6948065757751465, "logps/chosen": -234.99612426757812, "logps/ref_chosen": -101.79022979736328, "logps/ref_rejected": -116.3245849609375, "logps/rejected": -330.2041015625, "loss": 1.1713, "rewards/accuracies": 0.859375, "rewards/chosen": -0.40263664722442627, "rewards/margins": 0.24258247017860413, "rewards/rejected": -0.6452191472053528, "step": 652 }, { "epoch": 0.9588839941262849, "grad_norm": 5.360039234161377, "kl/avg_steps": 0.59375, "kl/beta": 0.003018774790689349, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -7.770810127258301, "logits/rejected": -7.401587963104248, "logps/chosen": -243.21240234375, "logps/ref_chosen": -93.35359191894531, "logps/ref_rejected": -109.12324523925781, "logps/rejected": -335.51324462890625, "loss": 1.1885, "rewards/accuracies": 0.796875, "rewards/chosen": -0.45035725831985474, "rewards/margins": 0.22805194556713104, "rewards/rejected": -0.678409218788147, "step": 653 }, { "epoch": 0.960352422907489, "grad_norm": 7.331425666809082, "kl/avg_steps": 0.53125, "kl/beta": 0.003000956494361162, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.577954022936174e-09, "logits/chosen": -7.537060260772705, "logits/rejected": -7.259842872619629, "logps/chosen": -241.98892211914062, "logps/ref_chosen": -89.11553955078125, "logps/ref_rejected": -104.91995239257812, "logps/rejected": -323.5856628417969, "loss": 1.2262, "rewards/accuracies": 0.75, "rewards/chosen": -0.45708924531936646, "rewards/margins": 0.1945749968290329, "rewards/rejected": -0.6516642570495605, "step": 654 }, { "epoch": 0.9618208516886931, "grad_norm": 5.144998550415039, "kl/avg_steps": 0.53125, "kl/beta": 0.0029850981663912535, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.397392281198729e-09, "logits/chosen": -7.6725006103515625, "logits/rejected": -7.398637771606445, "logps/chosen": -220.708984375, "logps/ref_chosen": -81.03610229492188, "logps/ref_rejected": -102.80233764648438, "logps/rejected": -322.0316162109375, "loss": 1.1856, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4156780540943146, "rewards/margins": 0.23453345894813538, "rewards/rejected": -0.65021151304245, "step": 655 }, { "epoch": 0.9632892804698973, "grad_norm": 5.95790958404541, "kl/avg_steps": 0.71875, "kl/beta": 0.00296932365745306, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.223355098446622e-09, "logits/chosen": -7.680576324462891, "logits/rejected": -7.717093467712402, "logps/chosen": -223.98876953125, "logps/ref_chosen": -85.32534790039062, "logps/ref_rejected": -118.33866882324219, "logps/rejected": -357.14013671875, "loss": 1.1311, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4093048572540283, "rewards/margins": 0.294216513633728, "rewards/rejected": -0.7035213708877563, "step": 656 }, { "epoch": 0.9647577092511013, "grad_norm": 6.30833625793457, "kl/avg_steps": 0.59375, "kl/beta": 0.002948133973404765, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 2.055847060721566e-09, "logits/chosen": -7.752594947814941, "logits/rejected": -7.609277725219727, "logps/chosen": -226.81549072265625, "logps/ref_chosen": -80.19772338867188, "logps/ref_rejected": -102.581298828125, "logps/rejected": -330.3614501953125, "loss": 1.1867, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4302343726158142, "rewards/margins": 0.23619841039180756, "rewards/rejected": -0.666432797908783, "step": 657 }, { "epoch": 0.9662261380323054, "grad_norm": 5.188457012176514, "kl/avg_steps": 0.65625, "kl/beta": 0.002930732909590006, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -7.566948890686035, "logits/rejected": -7.049898147583008, "logps/chosen": -248.58145141601562, "logps/ref_chosen": -94.634521484375, "logps/ref_rejected": -101.63162231445312, "logps/rejected": -325.5663146972656, "loss": 1.2098, "rewards/accuracies": 0.84375, "rewards/chosen": -0.44880080223083496, "rewards/margins": 0.202609121799469, "rewards/rejected": -0.6514099836349487, "step": 658 }, { "epoch": 0.9676945668135095, "grad_norm": 4.891656875610352, "kl/avg_steps": 0.5625, "kl/beta": 0.0029116251971572638, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -8.355998039245605, "logits/rejected": -7.4629597663879395, "logps/chosen": -246.71299743652344, "logps/ref_chosen": -112.55587005615234, "logps/ref_rejected": -93.9216079711914, "logps/rejected": -315.93914794921875, "loss": 1.1774, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3891860246658325, "rewards/margins": 0.25232329964637756, "rewards/rejected": -0.6415092945098877, "step": 659 }, { "epoch": 0.9691629955947136, "grad_norm": 7.11164665222168, "kl/avg_steps": 0.59375, "kl/beta": 0.0028953389264643192, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.592541096695571e-09, "logits/chosen": -7.641197681427002, "logits/rejected": -7.153841972351074, "logps/chosen": -234.69586181640625, "logps/ref_chosen": -93.37742614746094, "logps/ref_rejected": -81.39482116699219, "logps/rejected": -311.2170715332031, "loss": 1.1675, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4076111614704132, "rewards/margins": 0.25319740176200867, "rewards/rejected": -0.6608085632324219, "step": 660 }, { "epoch": 0.9706314243759178, "grad_norm": 4.977914333343506, "kl/avg_steps": 0.65625, "kl/beta": 0.002878249390050769, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -7.581234931945801, "logits/rejected": -6.8354692459106445, "logps/chosen": -233.6283416748047, "logps/ref_chosen": -87.85516357421875, "logps/ref_rejected": -92.40330505371094, "logps/rejected": -332.63751220703125, "loss": 1.1572, "rewards/accuracies": 0.859375, "rewards/chosen": -0.41780516505241394, "rewards/margins": 0.26863789558410645, "rewards/rejected": -0.6864430904388428, "step": 661 }, { "epoch": 0.9720998531571219, "grad_norm": 4.641286373138428, "kl/avg_steps": 0.59375, "kl/beta": 0.002859483938664198, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -7.912067413330078, "logits/rejected": -7.321425437927246, "logps/chosen": -243.25115966796875, "logps/ref_chosen": -102.77980041503906, "logps/ref_rejected": -95.22531127929688, "logps/rejected": -322.36285400390625, "loss": 1.1704, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4002068340778351, "rewards/margins": 0.24505510926246643, "rewards/rejected": -0.6452619433403015, "step": 662 }, { "epoch": 0.973568281938326, "grad_norm": 5.062544822692871, "kl/avg_steps": 0.46875, "kl/beta": 0.002842606045305729, "kl/n_epsilon_steps": 0.265625, "kl/p_epsilon_steps": 0.734375, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -7.826132774353027, "logits/rejected": -7.58699369430542, "logps/chosen": -235.74203491210938, "logps/ref_chosen": -96.34658813476562, "logps/ref_rejected": -120.52645111083984, "logps/rejected": -340.11309814453125, "loss": 1.1945, "rewards/accuracies": 0.75, "rewards/chosen": -0.39547061920166016, "rewards/margins": 0.2249811291694641, "rewards/rejected": -0.6204517483711243, "step": 663 }, { "epoch": 0.9750367107195301, "grad_norm": 5.462507724761963, "kl/avg_steps": 0.53125, "kl/beta": 0.002829343546181917, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.066455926241383e-09, "logits/chosen": -7.908196449279785, "logits/rejected": -7.449959754943848, "logps/chosen": -245.00120544433594, "logps/ref_chosen": -91.84242248535156, "logps/ref_rejected": -111.83668518066406, "logps/rejected": -338.63543701171875, "loss": 1.2061, "rewards/accuracies": 0.78125, "rewards/chosen": -0.43186432123184204, "rewards/margins": 0.20575222373008728, "rewards/rejected": -0.6376165151596069, "step": 664 }, { "epoch": 0.9765051395007343, "grad_norm": 4.550708293914795, "kl/avg_steps": 0.71875, "kl/beta": 0.0028143920935690403, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 9.513254770636137e-10, "logits/chosen": -7.470752716064453, "logits/rejected": -7.1154465675354, "logps/chosen": -217.97030639648438, "logps/ref_chosen": -88.18618774414062, "logps/ref_rejected": -91.9120101928711, "logps/rejected": -303.5909423828125, "loss": 1.1848, "rewards/accuracies": 0.890625, "rewards/chosen": -0.36321356892585754, "rewards/margins": 0.22789615392684937, "rewards/rejected": -0.5911097526550293, "step": 665 }, { "epoch": 0.9779735682819384, "grad_norm": 6.450153350830078, "kl/avg_steps": 0.53125, "kl/beta": 0.002794307889416814, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 8.427576920763956e-10, "logits/chosen": -7.702958583831787, "logits/rejected": -7.515384674072266, "logps/chosen": -249.63475036621094, "logps/ref_chosen": -100.97460174560547, "logps/ref_rejected": -101.24992370605469, "logps/rejected": -327.71697998046875, "loss": 1.2025, "rewards/accuracies": 0.765625, "rewards/chosen": -0.41387584805488586, "rewards/margins": 0.2145766168832779, "rewards/rejected": -0.628452479839325, "step": 666 }, { "epoch": 0.9794419970631424, "grad_norm": 4.641995429992676, "kl/avg_steps": 0.59375, "kl/beta": 0.002779541537165642, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 7.407554321417764e-10, "logits/chosen": -7.869577884674072, "logits/rejected": -7.049417495727539, "logps/chosen": -256.4490661621094, "logps/ref_chosen": -97.5711669921875, "logps/ref_rejected": -93.58476257324219, "logps/rejected": -339.6527404785156, "loss": 1.1737, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4399784505367279, "rewards/margins": 0.23969195783138275, "rewards/rejected": -0.6796703934669495, "step": 667 }, { "epoch": 0.9809104258443465, "grad_norm": 6.144704341888428, "kl/avg_steps": 0.53125, "kl/beta": 0.002763135591521859, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 6.453213851142225e-10, "logits/chosen": -8.091211318969727, "logits/rejected": -7.701225280761719, "logps/chosen": -242.12380981445312, "logps/ref_chosen": -102.5750503540039, "logps/ref_rejected": -108.81768798828125, "logps/rejected": -345.40899658203125, "loss": 1.1573, "rewards/accuracies": 0.828125, "rewards/chosen": -0.38403064012527466, "rewards/margins": 0.2652904689311981, "rewards/rejected": -0.6493211388587952, "step": 668 }, { "epoch": 0.9823788546255506, "grad_norm": 5.321188926696777, "kl/avg_steps": 0.59375, "kl/beta": 0.0027485338505357504, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 5.564580657695939e-10, "logits/chosen": -7.704500198364258, "logits/rejected": -6.9445672035217285, "logps/chosen": -217.85406494140625, "logps/ref_chosen": -89.49478149414062, "logps/ref_rejected": -82.51950073242188, "logps/rejected": -305.6316223144531, "loss": 1.1609, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3517535924911499, "rewards/margins": 0.2574951946735382, "rewards/rejected": -0.6092487573623657, "step": 669 }, { "epoch": 0.9838472834067548, "grad_norm": 4.645162582397461, "kl/avg_steps": 0.75, "kl/beta": 0.0027323109097778797, "kl/n_epsilon_steps": 0.125, "kl/p_epsilon_steps": 0.875, "learning_rate": 4.741678157389739e-10, "logits/chosen": -7.779049873352051, "logits/rejected": -7.034740447998047, "logps/chosen": -223.52005004882812, "logps/ref_chosen": -95.45459747314453, "logps/ref_rejected": -101.53292846679688, "logps/rejected": -332.3464050292969, "loss": 1.1435, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3478962182998657, "rewards/margins": 0.2775384485721588, "rewards/rejected": -0.6254346370697021, "step": 670 }, { "epoch": 0.9853157121879589, "grad_norm": 4.930266380310059, "kl/avg_steps": 0.5625, "kl/beta": 0.0027119710575789213, "kl/n_epsilon_steps": 0.21875, "kl/p_epsilon_steps": 0.78125, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -7.417923927307129, "logits/rejected": -7.285045623779297, "logps/chosen": -239.85293579101562, "logps/ref_chosen": -82.12312316894531, "logps/ref_rejected": -90.21969604492188, "logps/rejected": -317.8214111328125, "loss": 1.2244, "rewards/accuracies": 0.796875, "rewards/chosen": -0.42587053775787354, "rewards/margins": 0.18708932399749756, "rewards/rejected": -0.6129598617553711, "step": 671 }, { "epoch": 0.986784140969163, "grad_norm": 5.901381969451904, "kl/avg_steps": 0.4375, "kl/beta": 0.0026968014426529408, "kl/n_epsilon_steps": 0.28125, "kl/p_epsilon_steps": 0.71875, "learning_rate": 3.293150240547549e-10, "logits/chosen": -7.819128513336182, "logits/rejected": -7.268359184265137, "logps/chosen": -247.38026428222656, "logps/ref_chosen": -90.0619125366211, "logps/ref_rejected": -100.45323181152344, "logps/rejected": -320.5558166503906, "loss": 1.2405, "rewards/accuracies": 0.734375, "rewards/chosen": -0.423230916261673, "rewards/margins": 0.1669987291097641, "rewards/rejected": -0.5902296304702759, "step": 672 }, { "epoch": 0.9882525697503671, "grad_norm": 5.085114479064941, "kl/avg_steps": 0.53125, "kl/beta": 0.002685054438188672, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -7.903676986694336, "logits/rejected": -7.309889793395996, "logps/chosen": -218.5710906982422, "logps/ref_chosen": -79.26315307617188, "logps/ref_rejected": -91.34925079345703, "logps/rejected": -302.2262268066406, "loss": 1.2187, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3727704882621765, "rewards/margins": 0.18984678387641907, "rewards/rejected": -0.562617301940918, "step": 673 }, { "epoch": 0.9897209985315712, "grad_norm": 4.563065052032471, "kl/avg_steps": 0.71875, "kl/beta": 0.0026708655059337616, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -7.53197717666626, "logits/rejected": -7.05259895324707, "logps/chosen": -206.57798767089844, "logps/ref_chosen": -75.45831298828125, "logps/ref_rejected": -76.20362854003906, "logps/rejected": -302.875, "loss": 1.1642, "rewards/accuracies": 0.890625, "rewards/chosen": -0.34820300340652466, "rewards/margins": 0.2524449825286865, "rewards/rejected": -0.600648045539856, "step": 674 }, { "epoch": 0.9911894273127754, "grad_norm": 5.335831165313721, "kl/avg_steps": 0.53125, "kl/beta": 0.0026518055237829685, "kl/n_epsilon_steps": 0.234375, "kl/p_epsilon_steps": 0.765625, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -7.707300186157227, "logits/rejected": -7.3889970779418945, "logps/chosen": -227.656494140625, "logps/ref_chosen": -79.90953063964844, "logps/ref_rejected": -81.21824645996094, "logps/rejected": -313.17236328125, "loss": 1.1993, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3907131552696228, "rewards/margins": 0.2203899621963501, "rewards/rejected": -0.6111031174659729, "step": 675 }, { "epoch": 0.9926578560939795, "grad_norm": 4.61472749710083, "kl/avg_steps": 0.59375, "kl/beta": 0.0026377923786640167, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -8.043817520141602, "logits/rejected": -7.283336639404297, "logps/chosen": -254.79995727539062, "logps/ref_chosen": -98.17111206054688, "logps/ref_rejected": -95.024658203125, "logps/rejected": -336.66851806640625, "loss": 1.1933, "rewards/accuracies": 0.796875, "rewards/chosen": -0.41136687994003296, "rewards/margins": 0.2214387059211731, "rewards/rejected": -0.632805585861206, "step": 676 }, { "epoch": 0.9941262848751835, "grad_norm": 4.470887660980225, "kl/avg_steps": 0.59375, "kl/beta": 0.002622222760692239, "kl/n_epsilon_steps": 0.203125, "kl/p_epsilon_steps": 0.796875, "learning_rate": 8.23423165278725e-11, "logits/chosen": -7.727552890777588, "logits/rejected": -6.984161376953125, "logps/chosen": -230.60610961914062, "logps/ref_chosen": -91.37928009033203, "logps/ref_rejected": -82.87776947021484, "logps/rejected": -308.8211669921875, "loss": 1.186, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3634149134159088, "rewards/margins": 0.22486907243728638, "rewards/rejected": -0.5882839560508728, "step": 677 }, { "epoch": 0.9955947136563876, "grad_norm": 4.6202616691589355, "kl/avg_steps": 0.71875, "kl/beta": 0.002606745343655348, "kl/n_epsilon_steps": 0.140625, "kl/p_epsilon_steps": 0.859375, "learning_rate": 5.270012410216185e-11, "logits/chosen": -7.629189491271973, "logits/rejected": -6.984808921813965, "logps/chosen": -208.89874267578125, "logps/ref_chosen": -75.64586639404297, "logps/ref_rejected": -86.96611022949219, "logps/rejected": -323.3565979003906, "loss": 1.1519, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3453512489795685, "rewards/margins": 0.265934556722641, "rewards/rejected": -0.6112858057022095, "step": 678 }, { "epoch": 0.9970631424375918, "grad_norm": 4.485384464263916, "kl/avg_steps": 0.65625, "kl/beta": 0.0025881431065499783, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -7.64573860168457, "logits/rejected": -6.9826436042785645, "logps/chosen": -232.62774658203125, "logps/ref_chosen": -80.77344512939453, "logps/ref_rejected": -82.87850189208984, "logps/rejected": -317.65594482421875, "loss": 1.1954, "rewards/accuracies": 0.875, "rewards/chosen": -0.39101141691207886, "rewards/margins": 0.2123621702194214, "rewards/rejected": -0.6033735871315002, "step": 679 }, { "epoch": 0.9985315712187959, "grad_norm": 4.077805042266846, "kl/avg_steps": 0.625, "kl/beta": 0.0025712691713124514, "kl/n_epsilon_steps": 0.1875, "kl/p_epsilon_steps": 0.8125, "learning_rate": 1.31753782067201e-11, "logits/chosen": -7.682188987731934, "logits/rejected": -7.54879093170166, "logps/chosen": -261.8695373535156, "logps/ref_chosen": -107.68292999267578, "logps/ref_rejected": -116.09486389160156, "logps/rejected": -359.0973815917969, "loss": 1.1887, "rewards/accuracies": 0.859375, "rewards/chosen": -0.3943979740142822, "rewards/margins": 0.22572645545005798, "rewards/rejected": -0.6201244592666626, "step": 680 }, { "epoch": 1.0, "grad_norm": 4.475795269012451, "kl/avg_steps": 0.65625, "kl/beta": 0.002555298386141658, "kl/n_epsilon_steps": 0.171875, "kl/p_epsilon_steps": 0.828125, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -7.837874412536621, "logits/rejected": -7.355884552001953, "logps/chosen": -231.38394165039062, "logps/ref_chosen": -93.01106262207031, "logps/ref_rejected": -94.82217407226562, "logps/rejected": -311.75775146484375, "loss": 1.2117, "rewards/accuracies": 0.828125, "rewards/chosen": -0.3517065942287445, "rewards/margins": 0.19827213883399963, "rewards/rejected": -0.5499787330627441, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 1.0212064078200755, "train_runtime": 3308.3388, "train_samples_per_second": 13.178, "train_steps_per_second": 0.206 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }