{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004694835680751174, "grad_norm": 936.4916883988651, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -0.540820300579071, "logits/rejected": -0.3045898377895355, "logps/chosen": -465.6000061035156, "logps/rejected": -556.4000244140625, "loss": 2.3114, "rewards/accuracies": 0.18437500298023224, "rewards/chosen": -0.3614257872104645, "rewards/margins": -0.0269775390625, "rewards/rejected": -0.33447265625, "step": 10 }, { "epoch": 0.009389671361502348, "grad_norm": 953.4474139142043, "learning_rate": 4.460093896713615e-08, "logits/chosen": -0.626171886920929, "logits/rejected": -0.34306639432907104, "logps/chosen": -488.3999938964844, "logps/rejected": -564.2000122070312, "loss": 3.0435, "rewards/accuracies": 0.29296875, "rewards/chosen": 0.03300781175494194, "rewards/margins": 0.11865234375, "rewards/rejected": -0.08632812649011612, "step": 20 }, { "epoch": 0.014084507042253521, "grad_norm": 770.2453757778413, "learning_rate": 6.807511737089202e-08, "logits/chosen": -0.539257824420929, "logits/rejected": -0.2958007752895355, "logps/chosen": -524.2000122070312, "logps/rejected": -620.4000244140625, "loss": 3.2779, "rewards/accuracies": 0.3031249940395355, "rewards/chosen": -0.03046875074505806, "rewards/margins": 0.17373046278953552, "rewards/rejected": -0.20371094346046448, "step": 30 }, { "epoch": 0.018779342723004695, "grad_norm": 1180.5671607925733, "learning_rate": 9.154929577464789e-08, "logits/chosen": -0.557421863079071, "logits/rejected": -0.3057617247104645, "logps/chosen": -509.6000061035156, "logps/rejected": -636.7999877929688, "loss": 3.3844, "rewards/accuracies": 0.30078125, "rewards/chosen": -0.671191394329071, "rewards/margins": -0.0002929687616415322, "rewards/rejected": -0.6700195074081421, "step": 40 }, { "epoch": 0.023474178403755867, "grad_norm": 964.074666416198, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -0.559374988079071, "logits/rejected": -0.2877563536167145, "logps/chosen": -493.79998779296875, "logps/rejected": -577.5999755859375, "loss": 3.5928, "rewards/accuracies": 0.27734375, "rewards/chosen": -0.36137694120407104, "rewards/margins": -0.6417480707168579, "rewards/rejected": 0.2801757752895355, "step": 50 }, { "epoch": 0.028169014084507043, "grad_norm": 867.7361371491062, "learning_rate": 1.384976525821596e-07, "logits/chosen": -0.556640625, "logits/rejected": -0.25772398710250854, "logps/chosen": -505.20001220703125, "logps/rejected": -624.5999755859375, "loss": 3.726, "rewards/accuracies": 0.2789062559604645, "rewards/chosen": -0.02421874925494194, "rewards/margins": -0.17329101264476776, "rewards/rejected": 0.14902344346046448, "step": 60 }, { "epoch": 0.03286384976525822, "grad_norm": 890.6960891622597, "learning_rate": 1.619718309859155e-07, "logits/chosen": -0.5816406011581421, "logits/rejected": -0.3013916015625, "logps/chosen": -484.20001220703125, "logps/rejected": -578.4000244140625, "loss": 6.7057, "rewards/accuracies": 0.29296875, "rewards/chosen": -0.3755859434604645, "rewards/margins": -2.995410203933716, "rewards/rejected": 2.6173338890075684, "step": 70 }, { "epoch": 0.03755868544600939, "grad_norm": 709.4406154704218, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -0.48359376192092896, "logits/rejected": -0.23949584364891052, "logps/chosen": -491.20001220703125, "logps/rejected": -602.2000122070312, "loss": 3.1643, "rewards/accuracies": 0.27734375, "rewards/chosen": 0.33867186307907104, "rewards/margins": 0.06064452975988388, "rewards/rejected": 0.2789062559604645, "step": 80 }, { "epoch": 0.04225352112676056, "grad_norm": 824.7413516341182, "learning_rate": 2.089201877934272e-07, "logits/chosen": -0.4554687440395355, "logits/rejected": -0.25, "logps/chosen": -479.0, "logps/rejected": -555.5999755859375, "loss": 2.8996, "rewards/accuracies": 0.3140625059604645, "rewards/chosen": 0.49951171875, "rewards/margins": 0.3792480528354645, "rewards/rejected": 0.12050781399011612, "step": 90 }, { "epoch": 0.046948356807511735, "grad_norm": 1653.1079391474298, "learning_rate": 2.323943661971831e-07, "logits/chosen": -0.508007824420929, "logits/rejected": -0.29736328125, "logps/chosen": -507.6000061035156, "logps/rejected": -547.7999877929688, "loss": 3.1115, "rewards/accuracies": 0.3023437559604645, "rewards/chosen": 0.4888671934604645, "rewards/margins": 0.255859375, "rewards/rejected": 0.23378905653953552, "step": 100 }, { "epoch": 0.051643192488262914, "grad_norm": 895.3891335741198, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -0.5640624761581421, "logits/rejected": -0.2915283143520355, "logps/chosen": -491.6000061035156, "logps/rejected": -602.0, "loss": 3.249, "rewards/accuracies": 0.2945312559604645, "rewards/chosen": 0.784130871295929, "rewards/margins": 0.3072753846645355, "rewards/rejected": 0.4765625, "step": 110 }, { "epoch": 0.056338028169014086, "grad_norm": 891.1988315206388, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -0.519726574420929, "logits/rejected": -0.38164061307907104, "logps/chosen": -502.3999938964844, "logps/rejected": -587.4000244140625, "loss": 3.2137, "rewards/accuracies": 0.32109373807907104, "rewards/chosen": 1.2775390148162842, "rewards/margins": 1.161523461341858, "rewards/rejected": 0.11464843899011612, "step": 120 }, { "epoch": 0.06103286384976526, "grad_norm": 970.3541688922608, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -0.549023449420929, "logits/rejected": -0.291748046875, "logps/chosen": -507.6000061035156, "logps/rejected": -601.5999755859375, "loss": 2.8365, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 1.3640625476837158, "rewards/margins": 1.09375, "rewards/rejected": 0.26953125, "step": 130 }, { "epoch": 0.06572769953051644, "grad_norm": 886.8802773140834, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -0.5814453363418579, "logits/rejected": -0.2689453065395355, "logps/chosen": -498.0, "logps/rejected": -610.0, "loss": 2.7553, "rewards/accuracies": 0.3539062440395355, "rewards/chosen": 2.12109375, "rewards/margins": 2.042187452316284, "rewards/rejected": 0.08017577975988388, "step": 140 }, { "epoch": 0.07042253521126761, "grad_norm": 903.8202407742016, "learning_rate": 3.497652582159624e-07, "logits/chosen": -0.560351550579071, "logits/rejected": -0.2798828184604645, "logps/chosen": -497.0, "logps/rejected": -609.7999877929688, "loss": 3.1926, "rewards/accuracies": 0.3570312559604645, "rewards/chosen": 2.3460936546325684, "rewards/margins": 0.903613269329071, "rewards/rejected": 1.4416992664337158, "step": 150 }, { "epoch": 0.07511737089201878, "grad_norm": 983.5383487663207, "learning_rate": 3.732394366197183e-07, "logits/chosen": -0.5279296636581421, "logits/rejected": -0.22114257514476776, "logps/chosen": -512.4000244140625, "logps/rejected": -631.2000122070312, "loss": 3.9087, "rewards/accuracies": 0.38671875, "rewards/chosen": 2.4429688453674316, "rewards/margins": 1.255468726158142, "rewards/rejected": 1.18212890625, "step": 160 }, { "epoch": 0.07981220657276995, "grad_norm": 818.9213208546128, "learning_rate": 3.967136150234742e-07, "logits/chosen": -0.544726550579071, "logits/rejected": -0.22911377251148224, "logps/chosen": -472.0, "logps/rejected": -597.0, "loss": 2.9613, "rewards/accuracies": 0.4007812440395355, "rewards/chosen": 3.25, "rewards/margins": 2.4775390625, "rewards/rejected": 0.777539074420929, "step": 170 }, { "epoch": 0.08450704225352113, "grad_norm": 943.9390967074684, "learning_rate": 4.2018779342723e-07, "logits/chosen": -0.606249988079071, "logits/rejected": -0.3448730409145355, "logps/chosen": -457.79998779296875, "logps/rejected": -574.2000122070312, "loss": 2.8498, "rewards/accuracies": 0.41015625, "rewards/chosen": 3.7109375, "rewards/margins": 1.94287109375, "rewards/rejected": 1.769921898841858, "step": 180 }, { "epoch": 0.0892018779342723, "grad_norm": 842.17616267613, "learning_rate": 4.436619718309859e-07, "logits/chosen": -0.541210949420929, "logits/rejected": -0.26118165254592896, "logps/chosen": -491.20001220703125, "logps/rejected": -571.5999755859375, "loss": 3.1334, "rewards/accuracies": 0.42578125, "rewards/chosen": 4.412499904632568, "rewards/margins": 2.990039110183716, "rewards/rejected": 1.426367163658142, "step": 190 }, { "epoch": 0.09389671361502347, "grad_norm": 731.4315487758088, "learning_rate": 4.671361502347418e-07, "logits/chosen": -0.591796875, "logits/rejected": -0.31083983182907104, "logps/chosen": -478.20001220703125, "logps/rejected": -576.4000244140625, "loss": 3.0793, "rewards/accuracies": 0.44140625, "rewards/chosen": 4.354687690734863, "rewards/margins": 3.1507811546325684, "rewards/rejected": 1.207617163658142, "step": 200 }, { "epoch": 0.09859154929577464, "grad_norm": 786.8624304594038, "learning_rate": 4.906103286384976e-07, "logits/chosen": -0.5687500238418579, "logits/rejected": -0.2825683653354645, "logps/chosen": -492.0, "logps/rejected": -608.4000244140625, "loss": 2.9854, "rewards/accuracies": 0.44843751192092896, "rewards/chosen": 4.9921875, "rewards/margins": 3.9820313453674316, "rewards/rejected": 1.012109398841858, "step": 210 }, { "epoch": 0.10328638497652583, "grad_norm": 853.881725612068, "learning_rate": 4.984350547730829e-07, "logits/chosen": -0.575976550579071, "logits/rejected": -0.26899415254592896, "logps/chosen": -503.3999938964844, "logps/rejected": -587.5999755859375, "loss": 2.8506, "rewards/accuracies": 0.453125, "rewards/chosen": 5.478125095367432, "rewards/margins": 3.817187547683716, "rewards/rejected": 1.6613280773162842, "step": 220 }, { "epoch": 0.107981220657277, "grad_norm": 684.3825590042018, "learning_rate": 4.958268127282212e-07, "logits/chosen": -0.6468750238418579, "logits/rejected": -0.39521485567092896, "logps/chosen": -493.0, "logps/rejected": -623.4000244140625, "loss": 3.0047, "rewards/accuracies": 0.453125, "rewards/chosen": 6.118750095367432, "rewards/margins": 3.5355467796325684, "rewards/rejected": 2.578906297683716, "step": 230 }, { "epoch": 0.11267605633802817, "grad_norm": 809.34644578425, "learning_rate": 4.932185706833594e-07, "logits/chosen": -0.614062488079071, "logits/rejected": -0.34912109375, "logps/chosen": -448.3999938964844, "logps/rejected": -511.20001220703125, "loss": 3.1082, "rewards/accuracies": 0.46171873807907104, "rewards/chosen": 5.934374809265137, "rewards/margins": 3.6328125, "rewards/rejected": 2.305859327316284, "step": 240 }, { "epoch": 0.11737089201877934, "grad_norm": 732.5110357800185, "learning_rate": 4.906103286384976e-07, "logits/chosen": -0.605273425579071, "logits/rejected": -0.298828125, "logps/chosen": -491.6000061035156, "logps/rejected": -626.7999877929688, "loss": 2.9504, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 6.409375190734863, "rewards/margins": 5.34375, "rewards/rejected": 1.067968726158142, "step": 250 }, { "epoch": 0.12206572769953052, "grad_norm": 840.1466180792445, "learning_rate": 4.880020865936358e-07, "logits/chosen": -0.5787109136581421, "logits/rejected": -0.2738281190395355, "logps/chosen": -461.3999938964844, "logps/rejected": -568.4000244140625, "loss": 2.7264, "rewards/accuracies": 0.47343748807907104, "rewards/chosen": 7.290625095367432, "rewards/margins": 4.998437404632568, "rewards/rejected": 2.293164014816284, "step": 260 }, { "epoch": 0.1267605633802817, "grad_norm": 906.1804152285723, "learning_rate": 4.853938445487741e-07, "logits/chosen": -0.580273449420929, "logits/rejected": -0.24868163466453552, "logps/chosen": -488.3999938964844, "logps/rejected": -596.7999877929688, "loss": 2.9914, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": 7.703125, "rewards/margins": 5.434374809265137, "rewards/rejected": 2.2734375, "step": 270 }, { "epoch": 0.13145539906103287, "grad_norm": 674.0579049267843, "learning_rate": 4.827856025039123e-07, "logits/chosen": -0.610546886920929, "logits/rejected": -0.2940429747104645, "logps/chosen": -487.20001220703125, "logps/rejected": -602.4000244140625, "loss": 2.8809, "rewards/accuracies": 0.4703125059604645, "rewards/chosen": 7.712500095367432, "rewards/margins": 5.581250190734863, "rewards/rejected": 2.130078077316284, "step": 280 }, { "epoch": 0.13615023474178403, "grad_norm": 768.8196108071818, "learning_rate": 4.801773604590506e-07, "logits/chosen": -0.5960937738418579, "logits/rejected": -0.2982421815395355, "logps/chosen": -524.2000122070312, "logps/rejected": -598.7999877929688, "loss": 3.3368, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 7.853125095367432, "rewards/margins": 4.871874809265137, "rewards/rejected": 2.9761719703674316, "step": 290 }, { "epoch": 0.14084507042253522, "grad_norm": 630.2246279947489, "learning_rate": 4.775691184141888e-07, "logits/chosen": -0.6265624761581421, "logits/rejected": -0.2874511778354645, "logps/chosen": -507.20001220703125, "logps/rejected": -583.4000244140625, "loss": 3.0864, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 8.265625, "rewards/margins": 5.46875, "rewards/rejected": 2.8021483421325684, "step": 300 }, { "epoch": 0.14553990610328638, "grad_norm": 646.7126703035038, "learning_rate": 4.749608763693271e-07, "logits/chosen": -0.5462890863418579, "logits/rejected": -0.2920898497104645, "logps/chosen": -483.0, "logps/rejected": -581.0, "loss": 2.8642, "rewards/accuracies": 0.504687488079071, "rewards/chosen": 8.25, "rewards/margins": 5.935937404632568, "rewards/rejected": 2.321093797683716, "step": 310 }, { "epoch": 0.15023474178403756, "grad_norm": 810.7754152436736, "learning_rate": 4.7235263432446533e-07, "logits/chosen": -0.5806640386581421, "logits/rejected": -0.36542969942092896, "logps/chosen": -483.0, "logps/rejected": -595.5999755859375, "loss": 2.923, "rewards/accuracies": 0.49296873807907104, "rewards/chosen": 8.478124618530273, "rewards/margins": 5.542187690734863, "rewards/rejected": 2.932812452316284, "step": 320 }, { "epoch": 0.15492957746478872, "grad_norm": 810.8418878441339, "learning_rate": 4.6974439227960353e-07, "logits/chosen": -0.532421886920929, "logits/rejected": -0.2914062440395355, "logps/chosen": -498.79998779296875, "logps/rejected": -596.4000244140625, "loss": 3.3479, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 8.653124809265137, "rewards/margins": 6.704687595367432, "rewards/rejected": 1.9441406726837158, "step": 330 }, { "epoch": 0.1596244131455399, "grad_norm": 836.959000847699, "learning_rate": 4.671361502347418e-07, "logits/chosen": -0.573437511920929, "logits/rejected": -0.3017822206020355, "logps/chosen": -474.6000061035156, "logps/rejected": -535.7999877929688, "loss": 3.1531, "rewards/accuracies": 0.507031261920929, "rewards/chosen": 9.303125381469727, "rewards/margins": 5.779687404632568, "rewards/rejected": 3.534374952316284, "step": 340 }, { "epoch": 0.1643192488262911, "grad_norm": 1681.2151711740987, "learning_rate": 4.6452790818988004e-07, "logits/chosen": -0.588671863079071, "logits/rejected": -0.30296629667282104, "logps/chosen": -509.20001220703125, "logps/rejected": -602.0, "loss": 2.768, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 9.359375, "rewards/margins": 8.75, "rewards/rejected": 0.623828113079071, "step": 350 }, { "epoch": 0.16901408450704225, "grad_norm": 711.6938609230214, "learning_rate": 4.6191966614501824e-07, "logits/chosen": -0.5794922113418579, "logits/rejected": -0.3939453065395355, "logps/chosen": -478.79998779296875, "logps/rejected": -571.7999877929688, "loss": 3.1462, "rewards/accuracies": 0.5023437738418579, "rewards/chosen": 9.149999618530273, "rewards/margins": 6.046875, "rewards/rejected": 3.1070313453674316, "step": 360 }, { "epoch": 0.17370892018779344, "grad_norm": 718.2678942111543, "learning_rate": 4.593114241001565e-07, "logits/chosen": -0.6226562261581421, "logits/rejected": -0.2999023497104645, "logps/chosen": -513.0, "logps/rejected": -597.4000244140625, "loss": 2.9787, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 9.300000190734863, "rewards/margins": 7.306250095367432, "rewards/rejected": 1.9919922351837158, "step": 370 }, { "epoch": 0.1784037558685446, "grad_norm": 744.8106892125453, "learning_rate": 4.5670318205529474e-07, "logits/chosen": -0.57421875, "logits/rejected": -0.2909179627895355, "logps/chosen": -489.79998779296875, "logps/rejected": -596.5999755859375, "loss": 3.2521, "rewards/accuracies": 0.4898437559604645, "rewards/chosen": 8.550000190734863, "rewards/margins": 6.270312309265137, "rewards/rejected": 2.27783203125, "step": 380 }, { "epoch": 0.18309859154929578, "grad_norm": 691.9149010825635, "learning_rate": 4.54094940010433e-07, "logits/chosen": -0.5582031011581421, "logits/rejected": -0.25493162870407104, "logps/chosen": -499.20001220703125, "logps/rejected": -608.2000122070312, "loss": 3.2335, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 8.949999809265137, "rewards/margins": 7.159375190734863, "rewards/rejected": 1.795312523841858, "step": 390 }, { "epoch": 0.18779342723004694, "grad_norm": 652.7177331591703, "learning_rate": 4.514866979655712e-07, "logits/chosen": -0.533398449420929, "logits/rejected": -0.3169799745082855, "logps/chosen": -489.0, "logps/rejected": -566.0, "loss": 3.1198, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 10.037500381469727, "rewards/margins": 6.987500190734863, "rewards/rejected": 3.0455079078674316, "step": 400 }, { "epoch": 0.19248826291079812, "grad_norm": 839.8790315464779, "learning_rate": 4.4887845592070945e-07, "logits/chosen": -0.5882812738418579, "logits/rejected": -0.26453858613967896, "logps/chosen": -475.20001220703125, "logps/rejected": -622.0, "loss": 3.2332, "rewards/accuracies": 0.504687488079071, "rewards/chosen": 9.350000381469727, "rewards/margins": 7.021874904632568, "rewards/rejected": 2.324658155441284, "step": 410 }, { "epoch": 0.19718309859154928, "grad_norm": 747.1514500184338, "learning_rate": 4.462702138758477e-07, "logits/chosen": -0.5171874761581421, "logits/rejected": -0.2999816834926605, "logps/chosen": -509.0, "logps/rejected": -584.5999755859375, "loss": 3.2621, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 9.90625, "rewards/margins": 8.050000190734863, "rewards/rejected": 1.864843726158142, "step": 420 }, { "epoch": 0.20187793427230047, "grad_norm": 967.391287502094, "learning_rate": 4.436619718309859e-07, "logits/chosen": -0.59765625, "logits/rejected": -0.31640625, "logps/chosen": -490.20001220703125, "logps/rejected": -620.4000244140625, "loss": 3.2472, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 9.818750381469727, "rewards/margins": 7.989062309265137, "rewards/rejected": 1.83056640625, "step": 430 }, { "epoch": 0.20657276995305165, "grad_norm": 803.2838624577214, "learning_rate": 4.4105372978612415e-07, "logits/chosen": -0.5003906488418579, "logits/rejected": -0.2442626953125, "logps/chosen": -510.79998779296875, "logps/rejected": -611.2000122070312, "loss": 3.0099, "rewards/accuracies": 0.5453125238418579, "rewards/chosen": 9.537500381469727, "rewards/margins": 8.996874809265137, "rewards/rejected": 0.5492187738418579, "step": 440 }, { "epoch": 0.2112676056338028, "grad_norm": 692.8482089343754, "learning_rate": 4.384454877412624e-07, "logits/chosen": -0.4974609315395355, "logits/rejected": -0.23740234971046448, "logps/chosen": -499.79998779296875, "logps/rejected": -666.5999755859375, "loss": 2.9774, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 9.087499618530273, "rewards/margins": 8.971875190734863, "rewards/rejected": 0.10976562649011612, "step": 450 }, { "epoch": 0.215962441314554, "grad_norm": 725.2203687920032, "learning_rate": 4.358372456964006e-07, "logits/chosen": -0.556640625, "logits/rejected": -0.312490850687027, "logps/chosen": -488.79998779296875, "logps/rejected": -595.2000122070312, "loss": 3.0113, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 9.759374618530273, "rewards/margins": 8.003125190734863, "rewards/rejected": 1.759765625, "step": 460 }, { "epoch": 0.22065727699530516, "grad_norm": 715.4109570999155, "learning_rate": 4.3322900365153886e-07, "logits/chosen": -0.4730468690395355, "logits/rejected": -0.3287109434604645, "logps/chosen": -532.4000244140625, "logps/rejected": -599.0, "loss": 3.768, "rewards/accuracies": 0.526562511920929, "rewards/chosen": 8.321874618530273, "rewards/margins": 7.1484375, "rewards/rejected": 1.173242211341858, "step": 470 }, { "epoch": 0.22535211267605634, "grad_norm": 635.7645892418309, "learning_rate": 4.306207616066771e-07, "logits/chosen": -0.574023425579071, "logits/rejected": -0.29472655057907104, "logps/chosen": -483.0, "logps/rejected": -573.7999877929688, "loss": 2.9334, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 8.762499809265137, "rewards/margins": 6.537499904632568, "rewards/rejected": 2.2237305641174316, "step": 480 }, { "epoch": 0.2300469483568075, "grad_norm": 772.5807591537028, "learning_rate": 4.280125195618153e-07, "logits/chosen": -0.606249988079071, "logits/rejected": -0.36083984375, "logps/chosen": -488.0, "logps/rejected": -610.0, "loss": 3.1291, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 8.184374809265137, "rewards/margins": 8.665624618530273, "rewards/rejected": -0.47392576932907104, "step": 490 }, { "epoch": 0.2347417840375587, "grad_norm": 817.269768586059, "learning_rate": 4.2540427751695357e-07, "logits/chosen": -0.564648449420929, "logits/rejected": -0.27338868379592896, "logps/chosen": -498.20001220703125, "logps/rejected": -576.7999877929688, "loss": 3.7429, "rewards/accuracies": 0.5078125, "rewards/chosen": 7.909375190734863, "rewards/margins": 7.087500095367432, "rewards/rejected": 0.8238281011581421, "step": 500 }, { "epoch": 0.23943661971830985, "grad_norm": 738.8661445149991, "learning_rate": 4.227960354720918e-07, "logits/chosen": -0.5765625238418579, "logits/rejected": -0.32447510957717896, "logps/chosen": -484.0, "logps/rejected": -612.2000122070312, "loss": 2.9784, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": 9.024999618530273, "rewards/margins": 8.443750381469727, "rewards/rejected": 0.576367199420929, "step": 510 }, { "epoch": 0.24413145539906103, "grad_norm": 698.5030693902871, "learning_rate": 4.2018779342723e-07, "logits/chosen": -0.603710949420929, "logits/rejected": -0.3518310487270355, "logps/chosen": -499.20001220703125, "logps/rejected": -576.4000244140625, "loss": 3.2801, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 7.582812309265137, "rewards/margins": 7.228125095367432, "rewards/rejected": 0.3636718690395355, "step": 520 }, { "epoch": 0.24882629107981222, "grad_norm": 1250.986077743337, "learning_rate": 4.1757955138236827e-07, "logits/chosen": -0.503125011920929, "logits/rejected": -0.27568358182907104, "logps/chosen": -486.79998779296875, "logps/rejected": -600.5999755859375, "loss": 3.1068, "rewards/accuracies": 0.5546875, "rewards/chosen": 7.990624904632568, "rewards/margins": 9.068750381469727, "rewards/rejected": -1.0732421875, "step": 530 }, { "epoch": 0.2535211267605634, "grad_norm": 782.4662096269492, "learning_rate": 4.149713093375065e-07, "logits/chosen": -0.5757812261581421, "logits/rejected": -0.36601561307907104, "logps/chosen": -484.6000061035156, "logps/rejected": -553.5999755859375, "loss": 2.987, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 8.584375381469727, "rewards/margins": 10.290624618530273, "rewards/rejected": -1.7034180164337158, "step": 540 }, { "epoch": 0.25821596244131456, "grad_norm": 717.9304720964192, "learning_rate": 4.123630672926447e-07, "logits/chosen": -0.583984375, "logits/rejected": -0.3319335877895355, "logps/chosen": -470.20001220703125, "logps/rejected": -582.5999755859375, "loss": 2.9565, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": 8.537500381469727, "rewards/margins": 7.740624904632568, "rewards/rejected": 0.803027331829071, "step": 550 }, { "epoch": 0.26291079812206575, "grad_norm": 616.0665175630934, "learning_rate": 4.09754825247783e-07, "logits/chosen": -0.5181640386581421, "logits/rejected": -0.2769775390625, "logps/chosen": -494.20001220703125, "logps/rejected": -580.0, "loss": 2.983, "rewards/accuracies": 0.535937488079071, "rewards/chosen": 9.184374809265137, "rewards/margins": 10.193750381469727, "rewards/rejected": -1.0084960460662842, "step": 560 }, { "epoch": 0.2676056338028169, "grad_norm": 600.2053461131181, "learning_rate": 4.0714658320292123e-07, "logits/chosen": -0.5078125, "logits/rejected": -0.29139405488967896, "logps/chosen": -490.6000061035156, "logps/rejected": -588.4000244140625, "loss": 2.5187, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 9.706250190734863, "rewards/margins": 10.293749809265137, "rewards/rejected": -0.587109386920929, "step": 570 }, { "epoch": 0.27230046948356806, "grad_norm": 636.8463471544266, "learning_rate": 4.045383411580595e-07, "logits/chosen": -0.553515613079071, "logits/rejected": -0.28217774629592896, "logps/chosen": -472.0, "logps/rejected": -592.0, "loss": 3.0021, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 9.037500381469727, "rewards/margins": 9.168749809265137, "rewards/rejected": -0.13486328721046448, "step": 580 }, { "epoch": 0.27699530516431925, "grad_norm": 677.8079168374967, "learning_rate": 4.019300991131977e-07, "logits/chosen": -0.595703125, "logits/rejected": -0.35400390625, "logps/chosen": -477.20001220703125, "logps/rejected": -583.0, "loss": 3.0265, "rewards/accuracies": 0.561718761920929, "rewards/chosen": 8.800000190734863, "rewards/margins": 8.959375381469727, "rewards/rejected": -0.15244141221046448, "step": 590 }, { "epoch": 0.28169014084507044, "grad_norm": 692.7414767184621, "learning_rate": 3.9932185706833594e-07, "logits/chosen": -0.49003905057907104, "logits/rejected": -0.3021484315395355, "logps/chosen": -500.3999938964844, "logps/rejected": -580.0, "loss": 3.0173, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": 8.512499809265137, "rewards/margins": 7.926562309265137, "rewards/rejected": 0.5816406011581421, "step": 600 }, { "epoch": 0.2863849765258216, "grad_norm": 735.8257845166838, "learning_rate": 3.967136150234742e-07, "logits/chosen": -0.5347656011581421, "logits/rejected": -0.3173828125, "logps/chosen": -495.6000061035156, "logps/rejected": -592.2000122070312, "loss": 3.0775, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 8.762499809265137, "rewards/margins": 8.368749618530273, "rewards/rejected": 0.3892578184604645, "step": 610 }, { "epoch": 0.29107981220657275, "grad_norm": 704.0548430051728, "learning_rate": 3.941053729786124e-07, "logits/chosen": -0.5884765386581421, "logits/rejected": -0.2982421815395355, "logps/chosen": -500.20001220703125, "logps/rejected": -591.0, "loss": 3.288, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 8.615625381469727, "rewards/margins": 9.612500190734863, "rewards/rejected": -0.9984375238418579, "step": 620 }, { "epoch": 0.29577464788732394, "grad_norm": 703.2533417756349, "learning_rate": 3.9149713093375064e-07, "logits/chosen": -0.5048828125, "logits/rejected": -0.23325195908546448, "logps/chosen": -490.0, "logps/rejected": -611.0, "loss": 3.05, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 7.900000095367432, "rewards/margins": 8.190625190734863, "rewards/rejected": -0.29443359375, "step": 630 }, { "epoch": 0.3004694835680751, "grad_norm": 704.887168465974, "learning_rate": 3.888888888888889e-07, "logits/chosen": -0.589062511920929, "logits/rejected": -0.23969726264476776, "logps/chosen": -483.0, "logps/rejected": -624.4000244140625, "loss": 2.6343, "rewards/accuracies": 0.5390625, "rewards/chosen": 8.25, "rewards/margins": 10.178125381469727, "rewards/rejected": -1.9357421398162842, "step": 640 }, { "epoch": 0.3051643192488263, "grad_norm": 651.8175500689835, "learning_rate": 3.862806468440271e-07, "logits/chosen": -0.540234386920929, "logits/rejected": -0.3387695252895355, "logps/chosen": -458.0, "logps/rejected": -556.4000244140625, "loss": 3.0457, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 7.831250190734863, "rewards/margins": 8.221875190734863, "rewards/rejected": -0.38457030057907104, "step": 650 }, { "epoch": 0.30985915492957744, "grad_norm": 590.2718180590139, "learning_rate": 3.8367240479916535e-07, "logits/chosen": -0.6089843511581421, "logits/rejected": -0.2800048887729645, "logps/chosen": -481.0, "logps/rejected": -553.4000244140625, "loss": 2.8161, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": 7.596875190734863, "rewards/margins": 8.6015625, "rewards/rejected": -1.0144531726837158, "step": 660 }, { "epoch": 0.3145539906103286, "grad_norm": 736.5249123798675, "learning_rate": 3.810641627543036e-07, "logits/chosen": -0.6089843511581421, "logits/rejected": -0.364990234375, "logps/chosen": -490.79998779296875, "logps/rejected": -581.5999755859375, "loss": 2.8618, "rewards/accuracies": 0.546093761920929, "rewards/chosen": 8.028124809265137, "rewards/margins": 9.521875381469727, "rewards/rejected": -1.5031249523162842, "step": 670 }, { "epoch": 0.3192488262910798, "grad_norm": 612.0181234702416, "learning_rate": 3.784559207094418e-07, "logits/chosen": -0.509570300579071, "logits/rejected": -0.2801757752895355, "logps/chosen": -508.3999938964844, "logps/rejected": -581.0, "loss": 3.1478, "rewards/accuracies": 0.53125, "rewards/chosen": 7.815625190734863, "rewards/margins": 7.731249809265137, "rewards/rejected": 0.0771484375, "step": 680 }, { "epoch": 0.323943661971831, "grad_norm": 693.6194407690226, "learning_rate": 3.7584767866458005e-07, "logits/chosen": -0.561718761920929, "logits/rejected": -0.3109374940395355, "logps/chosen": -497.79998779296875, "logps/rejected": -559.4000244140625, "loss": 3.1098, "rewards/accuracies": 0.516406238079071, "rewards/chosen": 8.34375, "rewards/margins": 8.028124809265137, "rewards/rejected": 0.3042968809604645, "step": 690 }, { "epoch": 0.3286384976525822, "grad_norm": 980.6069308740243, "learning_rate": 3.732394366197183e-07, "logits/chosen": -0.564648449420929, "logits/rejected": -0.29863280057907104, "logps/chosen": -521.7999877929688, "logps/rejected": -562.2000122070312, "loss": 3.4979, "rewards/accuracies": 0.5390625, "rewards/chosen": 8.590624809265137, "rewards/margins": 8.746874809265137, "rewards/rejected": -0.16035155951976776, "step": 700 }, { "epoch": 0.3333333333333333, "grad_norm": 886.5994980308456, "learning_rate": 3.706311945748565e-07, "logits/chosen": -0.5289062261581421, "logits/rejected": -0.33574217557907104, "logps/chosen": -535.7999877929688, "logps/rejected": -610.2000122070312, "loss": 3.761, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 7.612500190734863, "rewards/margins": 9.631250381469727, "rewards/rejected": -2.0287108421325684, "step": 710 }, { "epoch": 0.3380281690140845, "grad_norm": 718.3692526731336, "learning_rate": 3.6802295252999476e-07, "logits/chosen": -0.5394531488418579, "logits/rejected": -0.3095703125, "logps/chosen": -507.6000061035156, "logps/rejected": -638.4000244140625, "loss": 2.6471, "rewards/accuracies": 0.530468761920929, "rewards/chosen": 8.315625190734863, "rewards/margins": 12.821874618530273, "rewards/rejected": -4.528515815734863, "step": 720 }, { "epoch": 0.3427230046948357, "grad_norm": 694.2287406311675, "learning_rate": 3.65414710485133e-07, "logits/chosen": -0.582812488079071, "logits/rejected": -0.34257811307907104, "logps/chosen": -501.0, "logps/rejected": -563.4000244140625, "loss": 3.1621, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.871874809265137, "rewards/margins": 9.159375190734863, "rewards/rejected": -1.2830078601837158, "step": 730 }, { "epoch": 0.3474178403755869, "grad_norm": 651.1695511074623, "learning_rate": 3.6280646844027127e-07, "logits/chosen": -0.5687500238418579, "logits/rejected": -0.25281983613967896, "logps/chosen": -483.20001220703125, "logps/rejected": -628.7999877929688, "loss": 2.7521, "rewards/accuracies": 0.553906261920929, "rewards/chosen": 8.071874618530273, "rewards/margins": 9.237500190734863, "rewards/rejected": -1.15771484375, "step": 740 }, { "epoch": 0.352112676056338, "grad_norm": 778.4278897957579, "learning_rate": 3.6019822639540947e-07, "logits/chosen": -0.5560547113418579, "logits/rejected": -0.32861328125, "logps/chosen": -509.20001220703125, "logps/rejected": -556.7999877929688, "loss": 3.3031, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 7.900000095367432, "rewards/margins": 8.396875381469727, "rewards/rejected": -0.4990234375, "step": 750 }, { "epoch": 0.3568075117370892, "grad_norm": 660.4340824321005, "learning_rate": 3.575899843505477e-07, "logits/chosen": -0.516796886920929, "logits/rejected": -0.33613282442092896, "logps/chosen": -492.79998779296875, "logps/rejected": -550.4000244140625, "loss": 3.0992, "rewards/accuracies": 0.555468738079071, "rewards/chosen": 7.932812690734863, "rewards/margins": 9.582812309265137, "rewards/rejected": -1.649316430091858, "step": 760 }, { "epoch": 0.3615023474178404, "grad_norm": 788.9971388182393, "learning_rate": 3.5498174230568597e-07, "logits/chosen": -0.532031238079071, "logits/rejected": -0.2958984375, "logps/chosen": -488.20001220703125, "logps/rejected": -582.7999877929688, "loss": 2.6459, "rewards/accuracies": 0.54296875, "rewards/chosen": 7.896874904632568, "rewards/margins": 10.399999618530273, "rewards/rejected": -2.5047850608825684, "step": 770 }, { "epoch": 0.36619718309859156, "grad_norm": 1790.3438554630125, "learning_rate": 3.5237350026082417e-07, "logits/chosen": -0.594921886920929, "logits/rejected": -0.42949217557907104, "logps/chosen": -501.6000061035156, "logps/rejected": -546.5999755859375, "loss": 3.4257, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 7.793749809265137, "rewards/margins": 7.753125190734863, "rewards/rejected": 0.04624023288488388, "step": 780 }, { "epoch": 0.37089201877934275, "grad_norm": 849.6966910228913, "learning_rate": 3.497652582159624e-07, "logits/chosen": -0.588085949420929, "logits/rejected": -0.30078125, "logps/chosen": -493.20001220703125, "logps/rejected": -605.7999877929688, "loss": 3.1886, "rewards/accuracies": 0.53515625, "rewards/chosen": 7.821875095367432, "rewards/margins": 11.018750190734863, "rewards/rejected": -3.197705030441284, "step": 790 }, { "epoch": 0.3755868544600939, "grad_norm": 706.8069372811456, "learning_rate": 3.471570161711007e-07, "logits/chosen": -0.510546863079071, "logits/rejected": -0.2950683534145355, "logps/chosen": -528.4000244140625, "logps/rejected": -613.4000244140625, "loss": 3.5888, "rewards/accuracies": 0.5234375, "rewards/chosen": 6.746874809265137, "rewards/margins": 9.296875, "rewards/rejected": -2.55078125, "step": 800 }, { "epoch": 0.38028169014084506, "grad_norm": 693.8663419803147, "learning_rate": 3.445487741262389e-07, "logits/chosen": -0.506640613079071, "logits/rejected": -0.21652832627296448, "logps/chosen": -483.0, "logps/rejected": -613.7999877929688, "loss": 2.6531, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 7.65625, "rewards/margins": 9.778124809265137, "rewards/rejected": -2.1187500953674316, "step": 810 }, { "epoch": 0.38497652582159625, "grad_norm": 780.2407805065338, "learning_rate": 3.4194053208137713e-07, "logits/chosen": -0.537109375, "logits/rejected": -0.33056640625, "logps/chosen": -489.79998779296875, "logps/rejected": -560.0, "loss": 2.7854, "rewards/accuracies": 0.539843738079071, "rewards/chosen": 7.765625, "rewards/margins": 9.600000381469727, "rewards/rejected": -1.828027367591858, "step": 820 }, { "epoch": 0.38967136150234744, "grad_norm": 957.2505990732736, "learning_rate": 3.393322900365154e-07, "logits/chosen": -0.579296886920929, "logits/rejected": -0.3182617127895355, "logps/chosen": -467.79998779296875, "logps/rejected": -560.4000244140625, "loss": 3.5009, "rewards/accuracies": 0.538281261920929, "rewards/chosen": 6.53125, "rewards/margins": 7.952343940734863, "rewards/rejected": -1.4187500476837158, "step": 830 }, { "epoch": 0.39436619718309857, "grad_norm": 724.0153774821662, "learning_rate": 3.367240479916536e-07, "logits/chosen": -0.6166015863418579, "logits/rejected": -0.3013671934604645, "logps/chosen": -521.0, "logps/rejected": -594.7999877929688, "loss": 3.4961, "rewards/accuracies": 0.563281238079071, "rewards/chosen": 6.4375, "rewards/margins": 8.871874809265137, "rewards/rejected": -2.442578077316284, "step": 840 }, { "epoch": 0.39906103286384975, "grad_norm": 715.9626311103833, "learning_rate": 3.3411580594679184e-07, "logits/chosen": -0.6275390386581421, "logits/rejected": -0.29509276151657104, "logps/chosen": -480.20001220703125, "logps/rejected": -585.7999877929688, "loss": 2.8731, "rewards/accuracies": 0.539843738079071, "rewards/chosen": 6.220312595367432, "rewards/margins": 9.346875190734863, "rewards/rejected": -3.1253905296325684, "step": 850 }, { "epoch": 0.40375586854460094, "grad_norm": 1949.4952193537345, "learning_rate": 3.315075639019301e-07, "logits/chosen": -0.583984375, "logits/rejected": -0.19003906846046448, "logps/chosen": -477.3999938964844, "logps/rejected": -579.5999755859375, "loss": 2.8271, "rewards/accuracies": 0.5390625, "rewards/chosen": 6.78125, "rewards/margins": 9.840624809265137, "rewards/rejected": -3.061816453933716, "step": 860 }, { "epoch": 0.4084507042253521, "grad_norm": 1202.8802856820173, "learning_rate": 3.288993218570683e-07, "logits/chosen": -0.546093761920929, "logits/rejected": -0.3382812440395355, "logps/chosen": -484.20001220703125, "logps/rejected": -554.7999877929688, "loss": 3.2188, "rewards/accuracies": 0.5445312261581421, "rewards/chosen": 7.199999809265137, "rewards/margins": 8.274999618530273, "rewards/rejected": -1.068017601966858, "step": 870 }, { "epoch": 0.4131455399061033, "grad_norm": 673.648231886372, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -0.560351550579071, "logits/rejected": -0.30546873807907104, "logps/chosen": -497.6000061035156, "logps/rejected": -573.2000122070312, "loss": 3.482, "rewards/accuracies": 0.539843738079071, "rewards/chosen": 6.609375, "rewards/margins": 9.037500381469727, "rewards/rejected": -2.418261766433716, "step": 880 }, { "epoch": 0.41784037558685444, "grad_norm": 599.8760436395402, "learning_rate": 3.236828377673448e-07, "logits/chosen": -0.6011718511581421, "logits/rejected": -0.3213867247104645, "logps/chosen": -520.4000244140625, "logps/rejected": -569.5999755859375, "loss": 3.5229, "rewards/accuracies": 0.5445312261581421, "rewards/chosen": 6.346875190734863, "rewards/margins": 9.337499618530273, "rewards/rejected": -2.9823241233825684, "step": 890 }, { "epoch": 0.4225352112676056, "grad_norm": 623.0480606601192, "learning_rate": 3.2107459572248305e-07, "logits/chosen": -0.5830078125, "logits/rejected": -0.3199218809604645, "logps/chosen": -501.0, "logps/rejected": -582.7999877929688, "loss": 2.9044, "rewards/accuracies": 0.5492187738418579, "rewards/chosen": 6.271874904632568, "rewards/margins": 9.278124809265137, "rewards/rejected": -3.008593797683716, "step": 900 }, { "epoch": 0.4272300469483568, "grad_norm": 1027.4154087417915, "learning_rate": 3.1846635367762125e-07, "logits/chosen": -0.5775390863418579, "logits/rejected": -0.28276365995407104, "logps/chosen": -499.0, "logps/rejected": -617.2000122070312, "loss": 3.6288, "rewards/accuracies": 0.5570312738418579, "rewards/chosen": 6.009375095367432, "rewards/margins": 10.96875, "rewards/rejected": -4.965039253234863, "step": 910 }, { "epoch": 0.431924882629108, "grad_norm": 711.6853424661571, "learning_rate": 3.158581116327595e-07, "logits/chosen": -0.5824218988418579, "logits/rejected": -0.28779298067092896, "logps/chosen": -500.3999938964844, "logps/rejected": -640.4000244140625, "loss": 3.0375, "rewards/accuracies": 0.5601562261581421, "rewards/chosen": 6.474999904632568, "rewards/margins": 10.706250190734863, "rewards/rejected": -4.226171970367432, "step": 920 }, { "epoch": 0.43661971830985913, "grad_norm": 690.9363933044156, "learning_rate": 3.1324986958789775e-07, "logits/chosen": -0.6468750238418579, "logits/rejected": -0.4146484434604645, "logps/chosen": -469.20001220703125, "logps/rejected": -558.4000244140625, "loss": 3.3928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 6.618750095367432, "rewards/margins": 8.659375190734863, "rewards/rejected": -2.046875, "step": 930 }, { "epoch": 0.4413145539906103, "grad_norm": 685.6144137924031, "learning_rate": 3.1064162754303595e-07, "logits/chosen": -0.5501953363418579, "logits/rejected": -0.3304687440395355, "logps/chosen": -501.6000061035156, "logps/rejected": -609.2000122070312, "loss": 3.5258, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 6.0546875, "rewards/margins": 9.787500381469727, "rewards/rejected": -3.734375, "step": 940 }, { "epoch": 0.4460093896713615, "grad_norm": 673.1261882678233, "learning_rate": 3.080333854981742e-07, "logits/chosen": -0.60546875, "logits/rejected": -0.34565430879592896, "logps/chosen": -488.3999938964844, "logps/rejected": -560.7999877929688, "loss": 3.1799, "rewards/accuracies": 0.51953125, "rewards/chosen": 6.353125095367432, "rewards/margins": 7.509375095367432, "rewards/rejected": -1.1597168445587158, "step": 950 }, { "epoch": 0.4507042253521127, "grad_norm": 723.4112298867857, "learning_rate": 3.0542514345331246e-07, "logits/chosen": -0.596484363079071, "logits/rejected": -0.306640625, "logps/chosen": -496.79998779296875, "logps/rejected": -631.0, "loss": 3.4013, "rewards/accuracies": 0.53125, "rewards/chosen": 6.764062404632568, "rewards/margins": 9.057812690734863, "rewards/rejected": -2.2943358421325684, "step": 960 }, { "epoch": 0.45539906103286387, "grad_norm": 900.144659160139, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -0.6083984375, "logits/rejected": -0.3916015625, "logps/chosen": -525.4000244140625, "logps/rejected": -600.0, "loss": 3.527, "rewards/accuracies": 0.546875, "rewards/chosen": 6.996874809265137, "rewards/margins": 8.2890625, "rewards/rejected": -1.2966797351837158, "step": 970 }, { "epoch": 0.460093896713615, "grad_norm": 598.809032929382, "learning_rate": 3.002086593635889e-07, "logits/chosen": -0.5337890386581421, "logits/rejected": -0.24836425483226776, "logps/chosen": -493.6000061035156, "logps/rejected": -609.0, "loss": 3.3099, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 6.839062690734863, "rewards/margins": 10.178125381469727, "rewards/rejected": -3.3421874046325684, "step": 980 }, { "epoch": 0.4647887323943662, "grad_norm": 599.9700628960113, "learning_rate": 2.9760041731872716e-07, "logits/chosen": -0.582812488079071, "logits/rejected": -0.3437255918979645, "logps/chosen": -458.20001220703125, "logps/rejected": -572.4000244140625, "loss": 2.5221, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 7.784375190734863, "rewards/margins": 9.78125, "rewards/rejected": -1.9953124523162842, "step": 990 }, { "epoch": 0.4694835680751174, "grad_norm": 681.2360129847848, "learning_rate": 2.9499217527386536e-07, "logits/chosen": -0.6041015386581421, "logits/rejected": -0.2806640565395355, "logps/chosen": -484.6000061035156, "logps/rejected": -585.5999755859375, "loss": 3.1059, "rewards/accuracies": 0.538281261920929, "rewards/chosen": 7.568749904632568, "rewards/margins": 9.853124618530273, "rewards/rejected": -2.295703172683716, "step": 1000 }, { "epoch": 0.47417840375586856, "grad_norm": 643.1899678460082, "learning_rate": 2.923839332290036e-07, "logits/chosen": -0.6207031011581421, "logits/rejected": -0.3514648377895355, "logps/chosen": -473.6000061035156, "logps/rejected": -592.5999755859375, "loss": 2.8973, "rewards/accuracies": 0.5445312261581421, "rewards/chosen": 7.309374809265137, "rewards/margins": 10.149999618530273, "rewards/rejected": -2.8369140625, "step": 1010 }, { "epoch": 0.4788732394366197, "grad_norm": 838.8064846810831, "learning_rate": 2.8977569118414187e-07, "logits/chosen": -0.5833984613418579, "logits/rejected": -0.32011717557907104, "logps/chosen": -475.20001220703125, "logps/rejected": -572.4000244140625, "loss": 2.9685, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 7.699999809265137, "rewards/margins": 10.309374809265137, "rewards/rejected": -2.6146483421325684, "step": 1020 }, { "epoch": 0.4835680751173709, "grad_norm": 627.4869165594788, "learning_rate": 2.8716744913928007e-07, "logits/chosen": -0.552734375, "logits/rejected": -0.32099610567092896, "logps/chosen": -493.0, "logps/rejected": -566.5999755859375, "loss": 2.7791, "rewards/accuracies": 0.5625, "rewards/chosen": 7.824999809265137, "rewards/margins": 8.868749618530273, "rewards/rejected": -1.0486328601837158, "step": 1030 }, { "epoch": 0.48826291079812206, "grad_norm": 550.4410761224154, "learning_rate": 2.845592070944183e-07, "logits/chosen": -0.5205078125, "logits/rejected": -0.23173828423023224, "logps/chosen": -488.79998779296875, "logps/rejected": -613.2000122070312, "loss": 2.9664, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 8.243749618530273, "rewards/margins": 10.134374618530273, "rewards/rejected": -1.8917968273162842, "step": 1040 }, { "epoch": 0.49295774647887325, "grad_norm": 631.5685713710841, "learning_rate": 2.819509650495566e-07, "logits/chosen": -0.5146484375, "logits/rejected": -0.29707032442092896, "logps/chosen": -483.79998779296875, "logps/rejected": -566.0, "loss": 2.8197, "rewards/accuracies": 0.56640625, "rewards/chosen": 8.225000381469727, "rewards/margins": 9.856249809265137, "rewards/rejected": -1.6281249523162842, "step": 1050 }, { "epoch": 0.49765258215962443, "grad_norm": 593.4267147051244, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -0.560351550579071, "logits/rejected": -0.4249023497104645, "logps/chosen": -513.0, "logps/rejected": -568.2000122070312, "loss": 2.9353, "rewards/accuracies": 0.553906261920929, "rewards/chosen": 8.856249809265137, "rewards/margins": 10.890625, "rewards/rejected": -2.0306639671325684, "step": 1060 }, { "epoch": 0.5023474178403756, "grad_norm": 593.1742320142328, "learning_rate": 2.7673448095983303e-07, "logits/chosen": -0.519726574420929, "logits/rejected": -0.3246093690395355, "logps/chosen": -495.20001220703125, "logps/rejected": -569.5999755859375, "loss": 3.1332, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.34375, "rewards/margins": 8.553125381469727, "rewards/rejected": -1.2117187976837158, "step": 1070 }, { "epoch": 0.5070422535211268, "grad_norm": 735.2639718496534, "learning_rate": 2.741262389149713e-07, "logits/chosen": -0.552929699420929, "logits/rejected": -0.3033203184604645, "logps/chosen": -494.3999938964844, "logps/rejected": -589.0, "loss": 3.407, "rewards/accuracies": 0.5570312738418579, "rewards/chosen": 7.065625190734863, "rewards/margins": 9.556249618530273, "rewards/rejected": -2.496875047683716, "step": 1080 }, { "epoch": 0.5117370892018779, "grad_norm": 798.1422062507208, "learning_rate": 2.7151799687010953e-07, "logits/chosen": -0.550000011920929, "logits/rejected": -0.29638671875, "logps/chosen": -497.6000061035156, "logps/rejected": -587.0, "loss": 3.3928, "rewards/accuracies": 0.55078125, "rewards/chosen": 7.121874809265137, "rewards/margins": 8.756250381469727, "rewards/rejected": -1.6281249523162842, "step": 1090 }, { "epoch": 0.5164319248826291, "grad_norm": 1201.4511157708635, "learning_rate": 2.6890975482524773e-07, "logits/chosen": -0.5296875238418579, "logits/rejected": -0.3097167909145355, "logps/chosen": -516.5999755859375, "logps/rejected": -585.7999877929688, "loss": 3.0338, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 6.7109375, "rewards/margins": 10.074999809265137, "rewards/rejected": -3.3666014671325684, "step": 1100 }, { "epoch": 0.5211267605633803, "grad_norm": 724.5873116169236, "learning_rate": 2.66301512780386e-07, "logits/chosen": -0.549023449420929, "logits/rejected": -0.35810548067092896, "logps/chosen": -472.3999938964844, "logps/rejected": -591.2000122070312, "loss": 2.6868, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 7.090624809265137, "rewards/margins": 10.871874809265137, "rewards/rejected": -3.782763719558716, "step": 1110 }, { "epoch": 0.5258215962441315, "grad_norm": 586.4937796454979, "learning_rate": 2.6369327073552424e-07, "logits/chosen": -0.5439453125, "logits/rejected": -0.35205078125, "logps/chosen": -477.3999938964844, "logps/rejected": -561.4000244140625, "loss": 2.6832, "rewards/accuracies": 0.5570312738418579, "rewards/chosen": 7.728125095367432, "rewards/margins": 9.984375, "rewards/rejected": -2.2554688453674316, "step": 1120 }, { "epoch": 0.5305164319248826, "grad_norm": 630.9074770381483, "learning_rate": 2.6108502869066244e-07, "logits/chosen": -0.58544921875, "logits/rejected": -0.36494141817092896, "logps/chosen": -509.3999938964844, "logps/rejected": -645.5999755859375, "loss": 3.6539, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 6.574999809265137, "rewards/margins": 12.157812118530273, "rewards/rejected": -5.588281154632568, "step": 1130 }, { "epoch": 0.5352112676056338, "grad_norm": 599.0270304457341, "learning_rate": 2.584767866458007e-07, "logits/chosen": -0.529101550579071, "logits/rejected": -0.32512205839157104, "logps/chosen": -487.0, "logps/rejected": -579.4000244140625, "loss": 2.7899, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 7.112500190734863, "rewards/margins": 10.3125, "rewards/rejected": -3.1884765625, "step": 1140 }, { "epoch": 0.539906103286385, "grad_norm": 564.1910288820185, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -0.5804687738418579, "logits/rejected": -0.2655273377895355, "logps/chosen": -495.3999938964844, "logps/rejected": -597.5999755859375, "loss": 2.9811, "rewards/accuracies": 0.563281238079071, "rewards/chosen": 7.081250190734863, "rewards/margins": 9.846875190734863, "rewards/rejected": -2.762890577316284, "step": 1150 }, { "epoch": 0.5446009389671361, "grad_norm": 971.8421015449942, "learning_rate": 2.5326030255607715e-07, "logits/chosen": -0.5316406488418579, "logits/rejected": -0.28593748807907104, "logps/chosen": -497.6000061035156, "logps/rejected": -585.5999755859375, "loss": 3.5082, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 7.487500190734863, "rewards/margins": 9.234375, "rewards/rejected": -1.745019555091858, "step": 1160 }, { "epoch": 0.5492957746478874, "grad_norm": 643.1228028490086, "learning_rate": 2.506520605112154e-07, "logits/chosen": -0.521679699420929, "logits/rejected": -0.2811523377895355, "logps/chosen": -490.79998779296875, "logps/rejected": -583.2000122070312, "loss": 3.6691, "rewards/accuracies": 0.5335937738418579, "rewards/chosen": 7.021874904632568, "rewards/margins": 8.143750190734863, "rewards/rejected": -1.1179687976837158, "step": 1170 }, { "epoch": 0.5539906103286385, "grad_norm": 809.4106379432997, "learning_rate": 2.4804381846635365e-07, "logits/chosen": -0.48847657442092896, "logits/rejected": -0.28955078125, "logps/chosen": -509.79998779296875, "logps/rejected": -581.7999877929688, "loss": 3.4789, "rewards/accuracies": 0.51953125, "rewards/chosen": 6.659375190734863, "rewards/margins": 9.018750190734863, "rewards/rejected": -2.35546875, "step": 1180 }, { "epoch": 0.5586854460093896, "grad_norm": 602.6839782157416, "learning_rate": 2.454355764214919e-07, "logits/chosen": -0.521484375, "logits/rejected": -0.31660157442092896, "logps/chosen": -473.79998779296875, "logps/rejected": -610.5999755859375, "loss": 2.9022, "rewards/accuracies": 0.5523437261581421, "rewards/chosen": 7.375, "rewards/margins": 9.806249618530273, "rewards/rejected": -2.421679735183716, "step": 1190 }, { "epoch": 0.5633802816901409, "grad_norm": 742.5582620860798, "learning_rate": 2.4282733437663016e-07, "logits/chosen": -0.5972656011581421, "logits/rejected": -0.34443360567092896, "logps/chosen": -481.79998779296875, "logps/rejected": -568.0, "loss": 2.9105, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.940625190734863, "rewards/margins": 9.053125381469727, "rewards/rejected": -1.1140625476837158, "step": 1200 }, { "epoch": 0.568075117370892, "grad_norm": 673.876604269412, "learning_rate": 2.4021909233176836e-07, "logits/chosen": -0.5361328125, "logits/rejected": -0.23430176079273224, "logps/chosen": -470.0, "logps/rejected": -587.2000122070312, "loss": 2.8525, "rewards/accuracies": 0.5492187738418579, "rewards/chosen": 7.490624904632568, "rewards/margins": 10.234375, "rewards/rejected": -2.7408204078674316, "step": 1210 }, { "epoch": 0.5727699530516432, "grad_norm": 665.4142859510565, "learning_rate": 2.376108502869066e-07, "logits/chosen": -0.545703113079071, "logits/rejected": -0.264089971780777, "logps/chosen": -502.20001220703125, "logps/rejected": -613.0, "loss": 3.0669, "rewards/accuracies": 0.5390625, "rewards/chosen": 6.918749809265137, "rewards/margins": 9.303125381469727, "rewards/rejected": -2.3788084983825684, "step": 1220 }, { "epoch": 0.5774647887323944, "grad_norm": 662.7524915927413, "learning_rate": 2.3500260824204484e-07, "logits/chosen": -0.5287109613418579, "logits/rejected": -0.335205078125, "logps/chosen": -527.0, "logps/rejected": -610.5999755859375, "loss": 3.1557, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.15625, "rewards/margins": 10.515625, "rewards/rejected": -3.3568358421325684, "step": 1230 }, { "epoch": 0.5821596244131455, "grad_norm": 539.6673917005603, "learning_rate": 2.323943661971831e-07, "logits/chosen": -0.5914062261581421, "logits/rejected": -0.2911132872104645, "logps/chosen": -491.6000061035156, "logps/rejected": -587.0, "loss": 3.0982, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 7.606249809265137, "rewards/margins": 9.118749618530273, "rewards/rejected": -1.509374976158142, "step": 1240 }, { "epoch": 0.5868544600938967, "grad_norm": 651.1539552229518, "learning_rate": 2.2978612415232132e-07, "logits/chosen": -0.562304675579071, "logits/rejected": -0.275634765625, "logps/chosen": -519.7999877929688, "logps/rejected": -617.7999877929688, "loss": 3.1728, "rewards/accuracies": 0.535937488079071, "rewards/chosen": 7.771874904632568, "rewards/margins": 10.09375, "rewards/rejected": -2.309375047683716, "step": 1250 }, { "epoch": 0.5915492957746479, "grad_norm": 566.8947632352489, "learning_rate": 2.2717788210745957e-07, "logits/chosen": -0.5503906011581421, "logits/rejected": -0.30223387479782104, "logps/chosen": -490.0, "logps/rejected": -586.2000122070312, "loss": 3.2256, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 6.315234184265137, "rewards/margins": 10.145312309265137, "rewards/rejected": -3.8343749046325684, "step": 1260 }, { "epoch": 0.596244131455399, "grad_norm": 971.4050267076251, "learning_rate": 2.245696400625978e-07, "logits/chosen": -0.552734375, "logits/rejected": -0.2924560606479645, "logps/chosen": -513.0, "logps/rejected": -592.5999755859375, "loss": 3.741, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 6.800000190734863, "rewards/margins": 9.440625190734863, "rewards/rejected": -2.6376953125, "step": 1270 }, { "epoch": 0.6009389671361502, "grad_norm": 610.9080527114611, "learning_rate": 2.2196139801773602e-07, "logits/chosen": -0.5589843988418579, "logits/rejected": -0.40947264432907104, "logps/chosen": -482.20001220703125, "logps/rejected": -557.0, "loss": 3.0901, "rewards/accuracies": 0.546093761920929, "rewards/chosen": 6.987500190734863, "rewards/margins": 8.787500381469727, "rewards/rejected": -1.8064453601837158, "step": 1280 }, { "epoch": 0.6056338028169014, "grad_norm": 1084.4740676288952, "learning_rate": 2.1935315597287428e-07, "logits/chosen": -0.616015613079071, "logits/rejected": -0.3415771424770355, "logps/chosen": -500.0, "logps/rejected": -563.4000244140625, "loss": 3.1714, "rewards/accuracies": 0.567187488079071, "rewards/chosen": 7.137499809265137, "rewards/margins": 9.493749618530273, "rewards/rejected": -2.3480467796325684, "step": 1290 }, { "epoch": 0.6103286384976526, "grad_norm": 598.8893568585898, "learning_rate": 2.167449139280125e-07, "logits/chosen": -0.54296875, "logits/rejected": -0.31254881620407104, "logps/chosen": -499.3999938964844, "logps/rejected": -595.4000244140625, "loss": 3.1184, "rewards/accuracies": 0.5492187738418579, "rewards/chosen": 7.768750190734863, "rewards/margins": 9.290624618530273, "rewards/rejected": -1.50732421875, "step": 1300 }, { "epoch": 0.6150234741784038, "grad_norm": 626.8468418711121, "learning_rate": 2.1413667188315073e-07, "logits/chosen": -0.612109363079071, "logits/rejected": -0.3144287168979645, "logps/chosen": -474.20001220703125, "logps/rejected": -554.5999755859375, "loss": 3.3531, "rewards/accuracies": 0.567187488079071, "rewards/chosen": 7.487500190734863, "rewards/margins": 9.209375381469727, "rewards/rejected": -1.729101538658142, "step": 1310 }, { "epoch": 0.6197183098591549, "grad_norm": 750.372286172025, "learning_rate": 2.1152842983828898e-07, "logits/chosen": -0.5888671875, "logits/rejected": -0.37226563692092896, "logps/chosen": -499.6000061035156, "logps/rejected": -571.7999877929688, "loss": 3.6354, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 6.671875, "rewards/margins": 8.178125381469727, "rewards/rejected": -1.51171875, "step": 1320 }, { "epoch": 0.6244131455399061, "grad_norm": 657.6500886241043, "learning_rate": 2.089201877934272e-07, "logits/chosen": -0.509570300579071, "logits/rejected": -0.3338867127895355, "logps/chosen": -515.4000244140625, "logps/rejected": -615.7999877929688, "loss": 2.7699, "rewards/accuracies": 0.567187488079071, "rewards/chosen": 7.417187690734863, "rewards/margins": 11.631250381469727, "rewards/rejected": -4.208593845367432, "step": 1330 }, { "epoch": 0.6291079812206573, "grad_norm": 613.3546155687558, "learning_rate": 2.0631194574856543e-07, "logits/chosen": -0.5941406488418579, "logits/rejected": -0.3384765684604645, "logps/chosen": -477.79998779296875, "logps/rejected": -535.5999755859375, "loss": 3.1904, "rewards/accuracies": 0.530468761920929, "rewards/chosen": 7.521874904632568, "rewards/margins": 8.675000190734863, "rewards/rejected": -1.151757836341858, "step": 1340 }, { "epoch": 0.6338028169014085, "grad_norm": 621.6320418211213, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -0.5435546636581421, "logits/rejected": -0.3436523377895355, "logps/chosen": -494.0, "logps/rejected": -635.0, "loss": 2.8662, "rewards/accuracies": 0.551562488079071, "rewards/chosen": 7.546875, "rewards/margins": 11.596875190734863, "rewards/rejected": -4.063281059265137, "step": 1350 }, { "epoch": 0.6384976525821596, "grad_norm": 617.4020625936793, "learning_rate": 2.010954616588419e-07, "logits/chosen": -0.6195312738418579, "logits/rejected": -0.3753906190395355, "logps/chosen": -485.6000061035156, "logps/rejected": -558.4000244140625, "loss": 3.091, "rewards/accuracies": 0.55859375, "rewards/chosen": 7.556250095367432, "rewards/margins": 9.003125190734863, "rewards/rejected": -1.450097680091858, "step": 1360 }, { "epoch": 0.6431924882629108, "grad_norm": 672.6037754684576, "learning_rate": 1.9848721961398017e-07, "logits/chosen": -0.547070324420929, "logits/rejected": -0.3144775331020355, "logps/chosen": -496.3999938964844, "logps/rejected": -560.2000122070312, "loss": 2.7423, "rewards/accuracies": 0.51953125, "rewards/chosen": 7.934374809265137, "rewards/margins": 9.506250381469727, "rewards/rejected": -1.570703148841858, "step": 1370 }, { "epoch": 0.647887323943662, "grad_norm": 699.0543578950155, "learning_rate": 1.958789775691184e-07, "logits/chosen": -0.568554699420929, "logits/rejected": -0.3368164002895355, "logps/chosen": -476.6000061035156, "logps/rejected": -571.2000122070312, "loss": 2.8709, "rewards/accuracies": 0.55859375, "rewards/chosen": 7.946875095367432, "rewards/margins": 10.612500190734863, "rewards/rejected": -2.660937547683716, "step": 1380 }, { "epoch": 0.6525821596244131, "grad_norm": 668.0934696573247, "learning_rate": 1.9327073552425662e-07, "logits/chosen": -0.5560547113418579, "logits/rejected": -0.310302734375, "logps/chosen": -473.79998779296875, "logps/rejected": -561.0, "loss": 3.0307, "rewards/accuracies": 0.55859375, "rewards/chosen": 8.024999618530273, "rewards/margins": 10.615625381469727, "rewards/rejected": -2.5926756858825684, "step": 1390 }, { "epoch": 0.6572769953051644, "grad_norm": 729.6161780072857, "learning_rate": 1.906624934793949e-07, "logits/chosen": -0.5703125, "logits/rejected": -0.2599121034145355, "logps/chosen": -492.0, "logps/rejected": -609.7999877929688, "loss": 3.0586, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 7.440625190734863, "rewards/margins": 11.140625, "rewards/rejected": -3.711718797683716, "step": 1400 }, { "epoch": 0.6619718309859155, "grad_norm": 715.2542453381328, "learning_rate": 1.8805425143453312e-07, "logits/chosen": -0.5843750238418579, "logits/rejected": -0.2657714784145355, "logps/chosen": -473.0, "logps/rejected": -633.2000122070312, "loss": 2.8905, "rewards/accuracies": 0.571093738079071, "rewards/chosen": 7.834374904632568, "rewards/margins": 11.143750190734863, "rewards/rejected": -3.296435594558716, "step": 1410 }, { "epoch": 0.6666666666666666, "grad_norm": 693.6691945075337, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -0.615234375, "logits/rejected": -0.32111817598342896, "logps/chosen": -484.79998779296875, "logps/rejected": -564.0, "loss": 2.99, "rewards/accuracies": 0.5492187738418579, "rewards/chosen": 7.831250190734863, "rewards/margins": 8.740625381469727, "rewards/rejected": -0.92041015625, "step": 1420 }, { "epoch": 0.6713615023474179, "grad_norm": 622.704864552459, "learning_rate": 1.828377673448096e-07, "logits/chosen": -0.5277343988418579, "logits/rejected": -0.33271485567092896, "logps/chosen": -518.0, "logps/rejected": -615.5999755859375, "loss": 3.7651, "rewards/accuracies": 0.569531261920929, "rewards/chosen": 6.666406154632568, "rewards/margins": 10.696874618530273, "rewards/rejected": -4.035546779632568, "step": 1430 }, { "epoch": 0.676056338028169, "grad_norm": 557.6060250533635, "learning_rate": 1.8022952529994783e-07, "logits/chosen": -0.5562499761581421, "logits/rejected": -0.34648436307907104, "logps/chosen": -500.20001220703125, "logps/rejected": -568.0, "loss": 3.0349, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 7.709374904632568, "rewards/margins": 10.068750381469727, "rewards/rejected": -2.3662109375, "step": 1440 }, { "epoch": 0.6807511737089202, "grad_norm": 832.7173598348687, "learning_rate": 1.7762128325508608e-07, "logits/chosen": -0.5042968988418579, "logits/rejected": -0.23063965141773224, "logps/chosen": -510.79998779296875, "logps/rejected": -639.7999877929688, "loss": 4.0965, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.543749809265137, "rewards/margins": 13.678125381469727, "rewards/rejected": -6.125390529632568, "step": 1450 }, { "epoch": 0.6854460093896714, "grad_norm": 727.8863117189119, "learning_rate": 1.750130412102243e-07, "logits/chosen": -0.560351550579071, "logits/rejected": -0.3172607421875, "logps/chosen": -532.7999877929688, "logps/rejected": -592.4000244140625, "loss": 3.4532, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": 8.034375190734863, "rewards/margins": 9.540624618530273, "rewards/rejected": -1.5104491710662842, "step": 1460 }, { "epoch": 0.6901408450704225, "grad_norm": 665.98507573463, "learning_rate": 1.7240479916536254e-07, "logits/chosen": -0.5625, "logits/rejected": -0.2787109315395355, "logps/chosen": -482.3999938964844, "logps/rejected": -606.4000244140625, "loss": 2.8532, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 8.421875, "rewards/margins": 10.109375, "rewards/rejected": -1.688085913658142, "step": 1470 }, { "epoch": 0.6948356807511737, "grad_norm": 686.2735224868583, "learning_rate": 1.697965571205008e-07, "logits/chosen": -0.552539050579071, "logits/rejected": -0.33916014432907104, "logps/chosen": -485.20001220703125, "logps/rejected": -558.0, "loss": 3.0797, "rewards/accuracies": 0.563281238079071, "rewards/chosen": 7.681250095367432, "rewards/margins": 9.212499618530273, "rewards/rejected": -1.525781273841858, "step": 1480 }, { "epoch": 0.6995305164319249, "grad_norm": 823.9255307150204, "learning_rate": 1.6718831507563902e-07, "logits/chosen": -0.529101550579071, "logits/rejected": -0.30976563692092896, "logps/chosen": -494.0, "logps/rejected": -552.5999755859375, "loss": 2.8754, "rewards/accuracies": 0.55859375, "rewards/chosen": 7.987500190734863, "rewards/margins": 9.006250381469727, "rewards/rejected": -1.023828148841858, "step": 1490 }, { "epoch": 0.704225352112676, "grad_norm": 657.5041925092606, "learning_rate": 1.6458007303077727e-07, "logits/chosen": -0.5777343511581421, "logits/rejected": -0.3371337950229645, "logps/chosen": -479.6000061035156, "logps/rejected": -560.0, "loss": 3.2957, "rewards/accuracies": 0.5640624761581421, "rewards/chosen": 7.734375, "rewards/margins": 8.425000190734863, "rewards/rejected": -0.687304675579071, "step": 1500 }, { "epoch": 0.7089201877934272, "grad_norm": 699.8025269732028, "learning_rate": 1.619718309859155e-07, "logits/chosen": -0.558789074420929, "logits/rejected": -0.379150390625, "logps/chosen": -496.79998779296875, "logps/rejected": -599.7999877929688, "loss": 3.3295, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.3125, "rewards/margins": 10.606249809265137, "rewards/rejected": -3.288281202316284, "step": 1510 }, { "epoch": 0.7136150234741784, "grad_norm": 894.261170355261, "learning_rate": 1.5936358894105372e-07, "logits/chosen": -0.6714843511581421, "logits/rejected": -0.39140623807907104, "logps/chosen": -495.6000061035156, "logps/rejected": -583.4000244140625, "loss": 2.7382, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 7.746874809265137, "rewards/margins": 9.425000190734863, "rewards/rejected": -1.673828125, "step": 1520 }, { "epoch": 0.7183098591549296, "grad_norm": 666.7376469260297, "learning_rate": 1.5675534689619197e-07, "logits/chosen": -0.543749988079071, "logits/rejected": -0.27685546875, "logps/chosen": -499.3999938964844, "logps/rejected": -551.2000122070312, "loss": 2.9828, "rewards/accuracies": 0.563281238079071, "rewards/chosen": 7.609375, "rewards/margins": 9.518750190734863, "rewards/rejected": -1.917578101158142, "step": 1530 }, { "epoch": 0.7230046948356808, "grad_norm": 662.0664844198171, "learning_rate": 1.541471048513302e-07, "logits/chosen": -0.546093761920929, "logits/rejected": -0.32329100370407104, "logps/chosen": -500.0, "logps/rejected": -583.7999877929688, "loss": 3.1827, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 7.731249809265137, "rewards/margins": 9.662500381469727, "rewards/rejected": -1.9306151866912842, "step": 1540 }, { "epoch": 0.7276995305164319, "grad_norm": 671.8667777196581, "learning_rate": 1.5153886280646843e-07, "logits/chosen": -0.5718749761581421, "logits/rejected": -0.30622559785842896, "logps/chosen": -467.6000061035156, "logps/rejected": -542.0, "loss": 2.9899, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.456250190734863, "rewards/margins": 9.028124809265137, "rewards/rejected": -1.575927734375, "step": 1550 }, { "epoch": 0.7323943661971831, "grad_norm": 687.5329556398598, "learning_rate": 1.4893062076160668e-07, "logits/chosen": -0.5625, "logits/rejected": -0.28461915254592896, "logps/chosen": -492.20001220703125, "logps/rejected": -596.0, "loss": 2.9629, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 7.590624809265137, "rewards/margins": 10.199999809265137, "rewards/rejected": -2.60546875, "step": 1560 }, { "epoch": 0.7370892018779343, "grad_norm": 539.8559941198413, "learning_rate": 1.463223787167449e-07, "logits/chosen": -0.523632824420929, "logits/rejected": -0.31696778535842896, "logps/chosen": -498.79998779296875, "logps/rejected": -570.4000244140625, "loss": 2.9552, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 7.715624809265137, "rewards/margins": 9.934374809265137, "rewards/rejected": -2.222851514816284, "step": 1570 }, { "epoch": 0.7417840375586855, "grad_norm": 794.5687090711544, "learning_rate": 1.4371413667188313e-07, "logits/chosen": -0.5874999761581421, "logits/rejected": -0.2868408262729645, "logps/chosen": -486.0, "logps/rejected": -613.5999755859375, "loss": 2.7866, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": 8.471875190734863, "rewards/margins": 10.846875190734863, "rewards/rejected": -2.3687500953674316, "step": 1580 }, { "epoch": 0.7464788732394366, "grad_norm": 696.9612541146422, "learning_rate": 1.4110589462702139e-07, "logits/chosen": -0.541015625, "logits/rejected": -0.2994628846645355, "logps/chosen": -485.3999938964844, "logps/rejected": -556.7999877929688, "loss": 3.0187, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 7.978125095367432, "rewards/margins": 8.665624618530273, "rewards/rejected": -0.690478503704071, "step": 1590 }, { "epoch": 0.7511737089201878, "grad_norm": 602.4749389689824, "learning_rate": 1.384976525821596e-07, "logits/chosen": -0.517773449420929, "logits/rejected": -0.2635253965854645, "logps/chosen": -500.3999938964844, "logps/rejected": -635.2000122070312, "loss": 7.2381, "rewards/accuracies": 0.5523437261581421, "rewards/chosen": 8.068750381469727, "rewards/margins": 6.724999904632568, "rewards/rejected": 1.361572265625, "step": 1600 }, { "epoch": 0.755868544600939, "grad_norm": 643.0174875130219, "learning_rate": 1.3588941053729787e-07, "logits/chosen": -0.513867199420929, "logits/rejected": -0.22817382216453552, "logps/chosen": -505.3999938964844, "logps/rejected": -604.5999755859375, "loss": 2.7863, "rewards/accuracies": 0.5414062738418579, "rewards/chosen": 8.46875, "rewards/margins": 10.243749618530273, "rewards/rejected": -1.774072289466858, "step": 1610 }, { "epoch": 0.7605633802816901, "grad_norm": 682.4161556288692, "learning_rate": 1.332811684924361e-07, "logits/chosen": -0.516406238079071, "logits/rejected": -0.33183592557907104, "logps/chosen": -510.3999938964844, "logps/rejected": -582.7999877929688, "loss": 3.3439, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 8.012499809265137, "rewards/margins": 9.274999618530273, "rewards/rejected": -1.268457055091858, "step": 1620 }, { "epoch": 0.7652582159624414, "grad_norm": 606.0021332100204, "learning_rate": 1.3067292644757432e-07, "logits/chosen": -0.595507800579071, "logits/rejected": -0.2540283203125, "logps/chosen": -506.0, "logps/rejected": -616.7999877929688, "loss": 3.0503, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 7.521874904632568, "rewards/margins": 9.475000381469727, "rewards/rejected": -1.956445336341858, "step": 1630 }, { "epoch": 0.7699530516431925, "grad_norm": 792.1113238235391, "learning_rate": 1.2806468440271257e-07, "logits/chosen": -0.559765636920929, "logits/rejected": -0.28925782442092896, "logps/chosen": -481.20001220703125, "logps/rejected": -573.5999755859375, "loss": 2.8232, "rewards/accuracies": 0.5546875, "rewards/chosen": 7.978125095367432, "rewards/margins": 9.78125, "rewards/rejected": -1.8105957508087158, "step": 1640 }, { "epoch": 0.7746478873239436, "grad_norm": 700.3545013383844, "learning_rate": 1.254564423578508e-07, "logits/chosen": -0.539257824420929, "logits/rejected": -0.3470703065395355, "logps/chosen": -487.0, "logps/rejected": -578.4000244140625, "loss": 2.4581, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 8.540624618530273, "rewards/margins": 10.103124618530273, "rewards/rejected": -1.558203101158142, "step": 1650 }, { "epoch": 0.7793427230046949, "grad_norm": 809.245201865097, "learning_rate": 1.2284820031298902e-07, "logits/chosen": -0.5126953125, "logits/rejected": -0.3380371034145355, "logps/chosen": -521.4000244140625, "logps/rejected": -605.7999877929688, "loss": 2.9874, "rewards/accuracies": 0.5367187261581421, "rewards/chosen": 7.737500190734863, "rewards/margins": 10.865625381469727, "rewards/rejected": -3.1166014671325684, "step": 1660 }, { "epoch": 0.784037558685446, "grad_norm": 665.2570576680542, "learning_rate": 1.2023995826812728e-07, "logits/chosen": -0.5718749761581421, "logits/rejected": -0.3250976502895355, "logps/chosen": -490.20001220703125, "logps/rejected": -591.0, "loss": 2.9369, "rewards/accuracies": 0.5523437261581421, "rewards/chosen": 8.446874618530273, "rewards/margins": 9.846875190734863, "rewards/rejected": -1.400488257408142, "step": 1670 }, { "epoch": 0.7887323943661971, "grad_norm": 752.696108586667, "learning_rate": 1.176317162232655e-07, "logits/chosen": -0.6128906011581421, "logits/rejected": -0.3177734315395355, "logps/chosen": -480.0, "logps/rejected": -599.0, "loss": 2.9677, "rewards/accuracies": 0.5601562261581421, "rewards/chosen": 8.143750190734863, "rewards/margins": 10.262499809265137, "rewards/rejected": -2.1097655296325684, "step": 1680 }, { "epoch": 0.7934272300469484, "grad_norm": 886.444334024518, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -0.5474609136581421, "logits/rejected": -0.2578125, "logps/chosen": -488.3999938964844, "logps/rejected": -577.2000122070312, "loss": 3.6857, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.428124904632568, "rewards/margins": 8.993749618530273, "rewards/rejected": -1.5764648914337158, "step": 1690 }, { "epoch": 0.7981220657276995, "grad_norm": 690.3418757428329, "learning_rate": 1.1241523213354198e-07, "logits/chosen": -0.566601574420929, "logits/rejected": -0.34394532442092896, "logps/chosen": -488.6000061035156, "logps/rejected": -599.4000244140625, "loss": 2.9509, "rewards/accuracies": 0.5804687738418579, "rewards/chosen": 8.909375190734863, "rewards/margins": 11.543749809265137, "rewards/rejected": -2.6484375, "step": 1700 }, { "epoch": 0.8028169014084507, "grad_norm": 951.879202418882, "learning_rate": 1.0980699008868022e-07, "logits/chosen": -0.585742175579071, "logits/rejected": -0.37602537870407104, "logps/chosen": -500.3999938964844, "logps/rejected": -568.5999755859375, "loss": 3.3134, "rewards/accuracies": 0.5570312738418579, "rewards/chosen": 8.037500381469727, "rewards/margins": 9.440625190734863, "rewards/rejected": -1.4011719226837158, "step": 1710 }, { "epoch": 0.8075117370892019, "grad_norm": 629.9148965982773, "learning_rate": 1.0719874804381846e-07, "logits/chosen": -0.649609386920929, "logits/rejected": -0.38530272245407104, "logps/chosen": -487.0, "logps/rejected": -542.4000244140625, "loss": 2.7406, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": 8.259374618530273, "rewards/margins": 8.681249618530273, "rewards/rejected": -0.4283203184604645, "step": 1720 }, { "epoch": 0.812206572769953, "grad_norm": 569.9439703406143, "learning_rate": 1.045905059989567e-07, "logits/chosen": -0.5960937738418579, "logits/rejected": -0.31416016817092896, "logps/chosen": -463.3999938964844, "logps/rejected": -533.5999755859375, "loss": 2.6339, "rewards/accuracies": 0.5679687261581421, "rewards/chosen": 8.46875, "rewards/margins": 9.65625, "rewards/rejected": -1.185937523841858, "step": 1730 }, { "epoch": 0.8169014084507042, "grad_norm": 848.3734185339168, "learning_rate": 1.0198226395409494e-07, "logits/chosen": -0.550585925579071, "logits/rejected": -0.34638673067092896, "logps/chosen": -469.20001220703125, "logps/rejected": -555.0, "loss": 3.3004, "rewards/accuracies": 0.5367187261581421, "rewards/chosen": 7.662499904632568, "rewards/margins": 9.521875381469727, "rewards/rejected": -1.86328125, "step": 1740 }, { "epoch": 0.8215962441314554, "grad_norm": 600.4772981814895, "learning_rate": 9.937402190923318e-08, "logits/chosen": -0.5947265625, "logits/rejected": -0.3211914002895355, "logps/chosen": -460.20001220703125, "logps/rejected": -591.5999755859375, "loss": 2.7497, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 8.268750190734863, "rewards/margins": 11.928125381469727, "rewards/rejected": -3.668750047683716, "step": 1750 }, { "epoch": 0.8262910798122066, "grad_norm": 611.1577078731616, "learning_rate": 9.676577986437141e-08, "logits/chosen": -0.5748046636581421, "logits/rejected": -0.33642578125, "logps/chosen": -510.3999938964844, "logps/rejected": -570.7999877929688, "loss": 3.5791, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 7.609375, "rewards/margins": 9.134374618530273, "rewards/rejected": -1.529296875, "step": 1760 }, { "epoch": 0.8309859154929577, "grad_norm": 648.8690168695862, "learning_rate": 9.415753781950965e-08, "logits/chosen": -0.572070300579071, "logits/rejected": -0.32463377714157104, "logps/chosen": -479.6000061035156, "logps/rejected": -569.4000244140625, "loss": 2.947, "rewards/accuracies": 0.563281238079071, "rewards/chosen": 8.268750190734863, "rewards/margins": 10.040624618530273, "rewards/rejected": -1.7820312976837158, "step": 1770 }, { "epoch": 0.8356807511737089, "grad_norm": 709.0062176328063, "learning_rate": 9.154929577464789e-08, "logits/chosen": -0.553515613079071, "logits/rejected": -0.2647460997104645, "logps/chosen": -489.20001220703125, "logps/rejected": -613.4000244140625, "loss": 2.5187, "rewards/accuracies": 0.56640625, "rewards/chosen": 8.03125, "rewards/margins": 16.75, "rewards/rejected": -8.693554878234863, "step": 1780 }, { "epoch": 0.8403755868544601, "grad_norm": 640.0288073373737, "learning_rate": 8.894105372978613e-08, "logits/chosen": -0.569531261920929, "logits/rejected": -0.28093260526657104, "logps/chosen": -500.6000061035156, "logps/rejected": -582.7999877929688, "loss": 3.0297, "rewards/accuracies": 0.5625, "rewards/chosen": 8.846875190734863, "rewards/margins": 10.596875190734863, "rewards/rejected": -1.740625023841858, "step": 1790 }, { "epoch": 0.8450704225352113, "grad_norm": 657.6727394519957, "learning_rate": 8.633281168492435e-08, "logits/chosen": -0.577343761920929, "logits/rejected": -0.27021485567092896, "logps/chosen": -486.20001220703125, "logps/rejected": -568.2000122070312, "loss": 2.8612, "rewards/accuracies": 0.5679687261581421, "rewards/chosen": 7.731249809265137, "rewards/margins": 10.253125190734863, "rewards/rejected": -2.5238280296325684, "step": 1800 }, { "epoch": 0.8497652582159625, "grad_norm": 595.9651350595409, "learning_rate": 8.372456964006259e-08, "logits/chosen": -0.552734375, "logits/rejected": -0.3524414002895355, "logps/chosen": -487.6000061035156, "logps/rejected": -570.5999755859375, "loss": 3.4585, "rewards/accuracies": 0.555468738079071, "rewards/chosen": 7.903124809265137, "rewards/margins": 10.243749618530273, "rewards/rejected": -2.342968702316284, "step": 1810 }, { "epoch": 0.8544600938967136, "grad_norm": 888.4311175065893, "learning_rate": 8.111632759520083e-08, "logits/chosen": -0.48945313692092896, "logits/rejected": -0.23842772841453552, "logps/chosen": -524.7999877929688, "logps/rejected": -584.2000122070312, "loss": 2.8415, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 7.331250190734863, "rewards/margins": 10.21875, "rewards/rejected": -2.8828125, "step": 1820 }, { "epoch": 0.8591549295774648, "grad_norm": 623.0755665468133, "learning_rate": 7.850808555033907e-08, "logits/chosen": -0.523242175579071, "logits/rejected": -0.337890625, "logps/chosen": -492.6000061035156, "logps/rejected": -597.2000122070312, "loss": 2.3676, "rewards/accuracies": 0.582812488079071, "rewards/chosen": 8.121874809265137, "rewards/margins": 11.412500381469727, "rewards/rejected": -3.29638671875, "step": 1830 }, { "epoch": 0.863849765258216, "grad_norm": 609.2189612705729, "learning_rate": 7.58998435054773e-08, "logits/chosen": -0.58203125, "logits/rejected": -0.30668944120407104, "logps/chosen": -503.79998779296875, "logps/rejected": -583.2000122070312, "loss": 3.2083, "rewards/accuracies": 0.561718761920929, "rewards/chosen": 8.015625, "rewards/margins": 9.40625, "rewards/rejected": -1.3894531726837158, "step": 1840 }, { "epoch": 0.8685446009389671, "grad_norm": 790.3997402940676, "learning_rate": 7.329160146061554e-08, "logits/chosen": -0.577343761920929, "logits/rejected": -0.29130858182907104, "logps/chosen": -478.3999938964844, "logps/rejected": -574.7999877929688, "loss": 3.0896, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 8.496874809265137, "rewards/margins": 9.256250381469727, "rewards/rejected": -0.758984386920929, "step": 1850 }, { "epoch": 0.8732394366197183, "grad_norm": 739.4402772560634, "learning_rate": 7.068335941575378e-08, "logits/chosen": -0.5376952886581421, "logits/rejected": -0.3302246034145355, "logps/chosen": -482.0, "logps/rejected": -557.7999877929688, "loss": 3.4079, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": 7.412499904632568, "rewards/margins": 9.15625, "rewards/rejected": -1.7509765625, "step": 1860 }, { "epoch": 0.8779342723004695, "grad_norm": 778.9822885285478, "learning_rate": 6.807511737089202e-08, "logits/chosen": -0.601757824420929, "logits/rejected": -0.262939453125, "logps/chosen": -481.6000061035156, "logps/rejected": -583.4000244140625, "loss": 3.3559, "rewards/accuracies": 0.551562488079071, "rewards/chosen": 7.5703125, "rewards/margins": 9.300000190734863, "rewards/rejected": -1.7374999523162842, "step": 1870 }, { "epoch": 0.8826291079812206, "grad_norm": 680.5021603159538, "learning_rate": 6.546687532603024e-08, "logits/chosen": -0.48906248807907104, "logits/rejected": -0.23833008110523224, "logps/chosen": -487.3999938964844, "logps/rejected": -583.5999755859375, "loss": 3.3205, "rewards/accuracies": 0.546093761920929, "rewards/chosen": 7.859375, "rewards/margins": 9.256250381469727, "rewards/rejected": -1.3935546875, "step": 1880 }, { "epoch": 0.8873239436619719, "grad_norm": 556.4744995981142, "learning_rate": 6.285863328116848e-08, "logits/chosen": -0.5693359375, "logits/rejected": -0.2549804747104645, "logps/chosen": -444.3999938964844, "logps/rejected": -600.7999877929688, "loss": 2.7209, "rewards/accuracies": 0.57421875, "rewards/chosen": 7.984375, "rewards/margins": 10.918749809265137, "rewards/rejected": -2.921093702316284, "step": 1890 }, { "epoch": 0.892018779342723, "grad_norm": 708.4120686929201, "learning_rate": 6.025039123630672e-08, "logits/chosen": -0.573046863079071, "logits/rejected": -0.38251954317092896, "logps/chosen": -499.6000061035156, "logps/rejected": -551.5999755859375, "loss": 3.0181, "rewards/accuracies": 0.553906261920929, "rewards/chosen": 8.709375381469727, "rewards/margins": 9.912500381469727, "rewards/rejected": -1.2111327648162842, "step": 1900 }, { "epoch": 0.8967136150234741, "grad_norm": 673.7113524318612, "learning_rate": 5.764214919144496e-08, "logits/chosen": -0.5718749761581421, "logits/rejected": -0.3662109375, "logps/chosen": -501.6000061035156, "logps/rejected": -550.2000122070312, "loss": 3.5332, "rewards/accuracies": 0.538281261920929, "rewards/chosen": 8.015625, "rewards/margins": 8.606249809265137, "rewards/rejected": -0.5980468988418579, "step": 1910 }, { "epoch": 0.9014084507042254, "grad_norm": 943.6234053280361, "learning_rate": 5.50339071465832e-08, "logits/chosen": -0.5824218988418579, "logits/rejected": -0.37031251192092896, "logps/chosen": -496.20001220703125, "logps/rejected": -543.0, "loss": 3.1724, "rewards/accuracies": 0.5648437738418579, "rewards/chosen": 8.034375190734863, "rewards/margins": 9.446874618530273, "rewards/rejected": -1.422216773033142, "step": 1920 }, { "epoch": 0.9061032863849765, "grad_norm": 761.7087708355676, "learning_rate": 5.2425665101721436e-08, "logits/chosen": -0.59375, "logits/rejected": -0.35273438692092896, "logps/chosen": -500.6000061035156, "logps/rejected": -618.5999755859375, "loss": 2.8534, "rewards/accuracies": 0.5640624761581421, "rewards/chosen": 8.387499809265137, "rewards/margins": 10.953125, "rewards/rejected": -2.5625977516174316, "step": 1930 }, { "epoch": 0.9107981220657277, "grad_norm": 581.5326460490315, "learning_rate": 4.9817423056859675e-08, "logits/chosen": -0.5755859613418579, "logits/rejected": -0.34931641817092896, "logps/chosen": -501.0, "logps/rejected": -593.5999755859375, "loss": 3.0832, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 8.037500381469727, "rewards/margins": 11.181249618530273, "rewards/rejected": -3.1435546875, "step": 1940 }, { "epoch": 0.9154929577464789, "grad_norm": 755.7301536856177, "learning_rate": 4.720918101199791e-08, "logits/chosen": -0.5933593511581421, "logits/rejected": -0.29145509004592896, "logps/chosen": -502.6000061035156, "logps/rejected": -641.4000244140625, "loss": 2.6221, "rewards/accuracies": 0.5546875, "rewards/chosen": 8.018750190734863, "rewards/margins": 12.774999618530273, "rewards/rejected": -4.751757621765137, "step": 1950 }, { "epoch": 0.92018779342723, "grad_norm": 793.7216932418602, "learning_rate": 4.460093896713615e-08, "logits/chosen": -0.565625011920929, "logits/rejected": -0.30351561307907104, "logps/chosen": -491.79998779296875, "logps/rejected": -596.0, "loss": 3.5352, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 7.403124809265137, "rewards/margins": 8.171875, "rewards/rejected": -0.7734375, "step": 1960 }, { "epoch": 0.9248826291079812, "grad_norm": 631.8380870069659, "learning_rate": 4.199269692227438e-08, "logits/chosen": -0.591796875, "logits/rejected": -0.33183592557907104, "logps/chosen": -483.20001220703125, "logps/rejected": -546.0, "loss": 2.8773, "rewards/accuracies": 0.571093738079071, "rewards/chosen": 8.381250381469727, "rewards/margins": 8.940625190734863, "rewards/rejected": -0.5601562261581421, "step": 1970 }, { "epoch": 0.9295774647887324, "grad_norm": 568.1185058633969, "learning_rate": 3.938445487741262e-08, "logits/chosen": -0.5921875238418579, "logits/rejected": -0.3631591796875, "logps/chosen": -507.20001220703125, "logps/rejected": -585.0, "loss": 2.6661, "rewards/accuracies": 0.5546875, "rewards/chosen": 7.837500095367432, "rewards/margins": 12.659375190734863, "rewards/rejected": -4.811132907867432, "step": 1980 }, { "epoch": 0.9342723004694836, "grad_norm": 616.9978117440941, "learning_rate": 3.677621283255086e-08, "logits/chosen": -0.558789074420929, "logits/rejected": -0.17104491591453552, "logps/chosen": -505.79998779296875, "logps/rejected": -577.7999877929688, "loss": 2.9304, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 8.178125381469727, "rewards/margins": 10.162500381469727, "rewards/rejected": -1.98828125, "step": 1990 }, { "epoch": 0.9389671361502347, "grad_norm": 618.6914385245799, "learning_rate": 3.41679707876891e-08, "logits/chosen": -0.539843738079071, "logits/rejected": -0.28044432401657104, "logps/chosen": -493.20001220703125, "logps/rejected": -634.4000244140625, "loss": 3.4985, "rewards/accuracies": 0.542187511920929, "rewards/chosen": 7.321875095367432, "rewards/margins": 10.649999618530273, "rewards/rejected": -3.329882860183716, "step": 2000 }, { "epoch": 0.9436619718309859, "grad_norm": 678.935108178829, "learning_rate": 3.155972874282733e-08, "logits/chosen": -0.5634765625, "logits/rejected": -0.23886719346046448, "logps/chosen": -474.6000061035156, "logps/rejected": -572.0, "loss": 2.9062, "rewards/accuracies": 0.55078125, "rewards/chosen": 8.206250190734863, "rewards/margins": 8.7890625, "rewards/rejected": -0.579785168170929, "step": 2010 }, { "epoch": 0.9483568075117371, "grad_norm": 744.6802447297364, "learning_rate": 2.8951486697965573e-08, "logits/chosen": -0.5882812738418579, "logits/rejected": -0.40800780057907104, "logps/chosen": -501.6000061035156, "logps/rejected": -584.0, "loss": 2.9893, "rewards/accuracies": 0.55078125, "rewards/chosen": 7.824999809265137, "rewards/margins": 9.381250381469727, "rewards/rejected": -1.55908203125, "step": 2020 }, { "epoch": 0.9530516431924883, "grad_norm": 716.6062801310147, "learning_rate": 2.634324465310381e-08, "logits/chosen": -0.587109386920929, "logits/rejected": -0.3207031190395355, "logps/chosen": -479.3999938964844, "logps/rejected": -582.0, "loss": 3.0668, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 8.390625, "rewards/margins": 9.915624618530273, "rewards/rejected": -1.5211913585662842, "step": 2030 }, { "epoch": 0.9577464788732394, "grad_norm": 741.9040324330703, "learning_rate": 2.3735002608242045e-08, "logits/chosen": -0.5171874761581421, "logits/rejected": -0.28645020723342896, "logps/chosen": -504.0, "logps/rejected": -553.2000122070312, "loss": 3.1581, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": 7.660937309265137, "rewards/margins": 9.365625381469727, "rewards/rejected": -1.7068359851837158, "step": 2040 }, { "epoch": 0.9624413145539906, "grad_norm": 657.7087990825831, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -0.5771484375, "logits/rejected": -0.30634766817092896, "logps/chosen": -487.0, "logps/rejected": -535.4000244140625, "loss": 2.7019, "rewards/accuracies": 0.5484374761581421, "rewards/chosen": 8.225000381469727, "rewards/margins": 8.728124618530273, "rewards/rejected": -0.502246081829071, "step": 2050 }, { "epoch": 0.9671361502347418, "grad_norm": 597.3107773889226, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -0.52734375, "logits/rejected": -0.28852540254592896, "logps/chosen": -506.0, "logps/rejected": -575.7999877929688, "loss": 3.4987, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 7.290625095367432, "rewards/margins": 9.462499618530273, "rewards/rejected": -2.174609422683716, "step": 2060 }, { "epoch": 0.971830985915493, "grad_norm": 653.7306608538205, "learning_rate": 1.5910276473656755e-08, "logits/chosen": -0.5357421636581421, "logits/rejected": -0.39794921875, "logps/chosen": -484.0, "logps/rejected": -574.0, "loss": 2.9408, "rewards/accuracies": 0.577343761920929, "rewards/chosen": 7.903124809265137, "rewards/margins": 10.120312690734863, "rewards/rejected": -2.2171874046325684, "step": 2070 }, { "epoch": 0.9765258215962441, "grad_norm": 536.0583557889304, "learning_rate": 1.3302034428794991e-08, "logits/chosen": -0.5298827886581421, "logits/rejected": -0.28955078125, "logps/chosen": -468.20001220703125, "logps/rejected": -542.7999877929688, "loss": 2.6625, "rewards/accuracies": 0.56640625, "rewards/chosen": 8.590624809265137, "rewards/margins": 9.934374809265137, "rewards/rejected": -1.342187523841858, "step": 2080 }, { "epoch": 0.9812206572769953, "grad_norm": 599.9590872843921, "learning_rate": 1.0693792383933229e-08, "logits/chosen": -0.5982421636581421, "logits/rejected": -0.2824951112270355, "logps/chosen": -496.79998779296875, "logps/rejected": -596.7999877929688, "loss": 3.185, "rewards/accuracies": 0.532031238079071, "rewards/chosen": 7.962500095367432, "rewards/margins": 9.503125190734863, "rewards/rejected": -1.5427734851837158, "step": 2090 }, { "epoch": 0.9859154929577465, "grad_norm": 759.5439873702159, "learning_rate": 8.085550339071465e-09, "logits/chosen": -0.630859375, "logits/rejected": -0.30701905488967896, "logps/chosen": -506.20001220703125, "logps/rejected": -614.0, "loss": 3.1632, "rewards/accuracies": 0.5679687261581421, "rewards/chosen": 8.515625, "rewards/margins": 10.203125, "rewards/rejected": -1.6890137195587158, "step": 2100 }, { "epoch": 0.9906103286384976, "grad_norm": 918.6341180673062, "learning_rate": 5.4773082942097025e-09, "logits/chosen": -0.586132824420929, "logits/rejected": -0.38066405057907104, "logps/chosen": -482.79998779296875, "logps/rejected": -553.5999755859375, "loss": 2.7837, "rewards/accuracies": 0.55078125, "rewards/chosen": 7.96875, "rewards/margins": 9.568750381469727, "rewards/rejected": -1.5974609851837158, "step": 2110 }, { "epoch": 0.9953051643192489, "grad_norm": 722.4425099651021, "learning_rate": 2.8690662493479393e-09, "logits/chosen": -0.549023449420929, "logits/rejected": -0.37397462129592896, "logps/chosen": -488.20001220703125, "logps/rejected": -563.7999877929688, "loss": 3.184, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 7.65625, "rewards/margins": 9.509374618530273, "rewards/rejected": -1.8437988758087158, "step": 2120 }, { "epoch": 1.0, "grad_norm": 628.7749726185859, "learning_rate": 2.608242044861763e-10, "logits/chosen": -0.5673828125, "logits/rejected": -0.2880493104457855, "logps/chosen": -482.0, "logps/rejected": -588.4000244140625, "loss": 2.7861, "rewards/accuracies": 0.5825449228286743, "rewards/chosen": 8.643750190734863, "rewards/margins": 11.34375, "rewards/rejected": -2.6859374046325684, "step": 2130 }, { "epoch": 1.0, "step": 2130, "total_flos": 0.0, "train_loss": 3.126477795811326, "train_runtime": 14505.8392, "train_samples_per_second": 18.791, "train_steps_per_second": 0.147 } ], "logging_steps": 10, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }