{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004694835680751174, "grad_norm": 349.5201090470861, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -4.224999904632568, "logits/rejected": -4.053124904632568, "logps/chosen": -546.2000122070312, "logps/rejected": -632.0, "loss": 2.2046, "rewards/accuracies": 0.16718749701976776, "rewards/chosen": -9.765625145519152e-05, "rewards/margins": -0.09228515625, "rewards/rejected": 0.09267578274011612, "step": 10 }, { "epoch": 0.009389671361502348, "grad_norm": 362.6757163538109, "learning_rate": 4.460093896713615e-08, "logits/chosen": -4.125, "logits/rejected": -4.064062595367432, "logps/chosen": -575.5999755859375, "logps/rejected": -634.0, "loss": 3.0842, "rewards/accuracies": 0.24375000596046448, "rewards/chosen": -0.13261719048023224, "rewards/margins": -0.29277342557907104, "rewards/rejected": 0.16044922173023224, "step": 20 }, { "epoch": 0.014084507042253521, "grad_norm": 292.98498991781304, "learning_rate": 6.807511737089202e-08, "logits/chosen": -4.129687309265137, "logits/rejected": -3.9625000953674316, "logps/chosen": -608.4000244140625, "logps/rejected": -700.0, "loss": 2.7602, "rewards/accuracies": 0.25703126192092896, "rewards/chosen": 0.4609375, "rewards/margins": 0.20703125, "rewards/rejected": 0.25341796875, "step": 30 }, { "epoch": 0.018779342723004695, "grad_norm": 335.938627794309, "learning_rate": 9.154929577464789e-08, "logits/chosen": -4.167187690734863, "logits/rejected": -4.0390625, "logps/chosen": -599.2000122070312, "logps/rejected": -716.7999877929688, "loss": 3.2419, "rewards/accuracies": 0.25390625, "rewards/chosen": -0.49101561307907104, "rewards/margins": -0.22670897841453552, "rewards/rejected": -0.2652343809604645, "step": 40 }, { "epoch": 0.023474178403755867, "grad_norm": 383.05083847560314, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -4.123437404632568, "logits/rejected": -4.067187309265137, "logps/chosen": -576.0, "logps/rejected": -640.7999877929688, "loss": 3.2698, "rewards/accuracies": 0.26484376192092896, "rewards/chosen": -0.32695311307907104, "rewards/margins": -0.0634765625, "rewards/rejected": -0.263671875, "step": 50 }, { "epoch": 0.028169014084507043, "grad_norm": 349.1965000442969, "learning_rate": 1.384976525821596e-07, "logits/chosen": -4.199999809265137, "logits/rejected": -4.032812595367432, "logps/chosen": -590.7999877929688, "logps/rejected": -697.5999755859375, "loss": 2.7531, "rewards/accuracies": 0.23593750596046448, "rewards/chosen": -0.3087402284145355, "rewards/margins": -0.02490234375, "rewards/rejected": -0.283935546875, "step": 60 }, { "epoch": 0.03286384976525822, "grad_norm": 433.25242575117915, "learning_rate": 1.619718309859155e-07, "logits/chosen": -4.092187404632568, "logits/rejected": -4.006249904632568, "logps/chosen": -566.2000122070312, "logps/rejected": -628.0, "loss": 2.7881, "rewards/accuracies": 0.22578124701976776, "rewards/chosen": 0.04750976711511612, "rewards/margins": -0.10000000149011612, "rewards/rejected": 0.1484375, "step": 70 }, { "epoch": 0.03755868544600939, "grad_norm": 323.0287405163464, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -4.199999809265137, "logits/rejected": -4.112500190734863, "logps/chosen": -578.0, "logps/rejected": -670.0, "loss": 3.2578, "rewards/accuracies": 0.23984375596046448, "rewards/chosen": -0.2548828125, "rewards/margins": -0.34467774629592896, "rewards/rejected": 0.08955077826976776, "step": 80 }, { "epoch": 0.04225352112676056, "grad_norm": 606.9294632383828, "learning_rate": 2.089201877934272e-07, "logits/chosen": -4.221875190734863, "logits/rejected": -4.129687309265137, "logps/chosen": -552.2000122070312, "logps/rejected": -619.0, "loss": 2.8603, "rewards/accuracies": 0.24765625596046448, "rewards/chosen": -0.15292969346046448, "rewards/margins": 0.014575195498764515, "rewards/rejected": -0.16718749701976776, "step": 90 }, { "epoch": 0.046948356807511735, "grad_norm": 373.77763598016054, "learning_rate": 2.323943661971831e-07, "logits/chosen": -4.137499809265137, "logits/rejected": -4.084374904632568, "logps/chosen": -595.2000122070312, "logps/rejected": -623.2000122070312, "loss": 2.7484, "rewards/accuracies": 0.25078123807907104, "rewards/chosen": 0.10239257663488388, "rewards/margins": 0.43437498807907104, "rewards/rejected": -0.33222657442092896, "step": 100 }, { "epoch": 0.051643192488262914, "grad_norm": 379.39664695274985, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -4.190625190734863, "logits/rejected": -4.050000190734863, "logps/chosen": -575.2000122070312, "logps/rejected": -670.4000244140625, "loss": 3.2888, "rewards/accuracies": 0.24843749403953552, "rewards/chosen": 0.02099609375, "rewards/margins": 0.03076171875, "rewards/rejected": -0.00927734375, "step": 110 }, { "epoch": 0.056338028169014086, "grad_norm": 332.1539670626537, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -4.198437690734863, "logits/rejected": -4.051562309265137, "logps/chosen": -586.4000244140625, "logps/rejected": -657.2000122070312, "loss": 2.9951, "rewards/accuracies": 0.2515625059604645, "rewards/chosen": 0.09458007663488388, "rewards/margins": -0.36796873807907104, "rewards/rejected": 0.4627929627895355, "step": 120 }, { "epoch": 0.06103286384976526, "grad_norm": 358.02210765226994, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -4.1640625, "logits/rejected": -4.056250095367432, "logps/chosen": -593.0, "logps/rejected": -676.0, "loss": 2.2491, "rewards/accuracies": 0.26171875, "rewards/chosen": 0.8482910394668579, "rewards/margins": 0.8780273199081421, "rewards/rejected": -0.0302734375, "step": 130 }, { "epoch": 0.06572769953051644, "grad_norm": 426.7438636061182, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -4.128125190734863, "logits/rejected": -4.034375190734863, "logps/chosen": -581.4000244140625, "logps/rejected": -691.5999755859375, "loss": 2.896, "rewards/accuracies": 0.24687500298023224, "rewards/chosen": 0.449462890625, "rewards/margins": -0.0478515625, "rewards/rejected": 0.4970703125, "step": 140 }, { "epoch": 0.07042253521126761, "grad_norm": 318.17414183804203, "learning_rate": 3.497652582159624e-07, "logits/chosen": -4.196875095367432, "logits/rejected": -4.153124809265137, "logps/chosen": -581.4000244140625, "logps/rejected": -678.0, "loss": 2.4536, "rewards/accuracies": 0.28515625, "rewards/chosen": 1.0592772960662842, "rewards/margins": 0.843554675579071, "rewards/rejected": 0.21484375, "step": 150 }, { "epoch": 0.07511737089201878, "grad_norm": 402.2068723059072, "learning_rate": 3.732394366197183e-07, "logits/chosen": -4.159375190734863, "logits/rejected": -4.053124904632568, "logps/chosen": -600.4000244140625, "logps/rejected": -680.2000122070312, "loss": 2.5593, "rewards/accuracies": 0.2679687440395355, "rewards/chosen": 1.2078125476837158, "rewards/margins": 0.7469238042831421, "rewards/rejected": 0.4620117247104645, "step": 160 }, { "epoch": 0.07981220657276995, "grad_norm": 329.38260711999607, "learning_rate": 3.967136150234742e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.025000095367432, "logps/chosen": -551.4000244140625, "logps/rejected": -658.7999877929688, "loss": 2.7635, "rewards/accuracies": 0.2734375, "rewards/chosen": 1.1056640148162842, "rewards/margins": 0.679980456829071, "rewards/rejected": 0.4246582090854645, "step": 170 }, { "epoch": 0.08450704225352113, "grad_norm": 312.91437090204977, "learning_rate": 4.2018779342723e-07, "logits/chosen": -4.143750190734863, "logits/rejected": -4.015625, "logps/chosen": -538.5999755859375, "logps/rejected": -649.5999755859375, "loss": 2.0002, "rewards/accuracies": 0.30156248807907104, "rewards/chosen": 1.570703148841858, "rewards/margins": 1.3400390148162842, "rewards/rejected": 0.23125000298023224, "step": 180 }, { "epoch": 0.0892018779342723, "grad_norm": 326.6984379520781, "learning_rate": 4.436619718309859e-07, "logits/chosen": -4.1484375, "logits/rejected": -4.0859375, "logps/chosen": -577.4000244140625, "logps/rejected": -635.5999755859375, "loss": 2.5892, "rewards/accuracies": 0.31718748807907104, "rewards/chosen": 1.8718750476837158, "rewards/margins": 1.2676270008087158, "rewards/rejected": 0.604296863079071, "step": 190 }, { "epoch": 0.09389671361502347, "grad_norm": 268.53014556032116, "learning_rate": 4.671361502347418e-07, "logits/chosen": -4.123437404632568, "logits/rejected": -4.025000095367432, "logps/chosen": -561.2000122070312, "logps/rejected": -644.7999877929688, "loss": 2.4101, "rewards/accuracies": 0.33515626192092896, "rewards/chosen": 2.66015625, "rewards/margins": 1.9445312023162842, "rewards/rejected": 0.717578113079071, "step": 200 }, { "epoch": 0.09859154929577464, "grad_norm": 283.1829513051546, "learning_rate": 4.906103286384976e-07, "logits/chosen": -4.173437595367432, "logits/rejected": -4.040625095367432, "logps/chosen": -571.0, "logps/rejected": -677.2000122070312, "loss": 2.9838, "rewards/accuracies": 0.33515626192092896, "rewards/chosen": 3.332812547683716, "rewards/margins": 1.3986327648162842, "rewards/rejected": 1.933203101158142, "step": 210 }, { "epoch": 0.10328638497652583, "grad_norm": 304.4791346501985, "learning_rate": 4.984350547730829e-07, "logits/chosen": -4.25, "logits/rejected": -4.099999904632568, "logps/chosen": -584.2000122070312, "logps/rejected": -656.0, "loss": 2.7172, "rewards/accuracies": 0.3515625, "rewards/chosen": 3.3765625953674316, "rewards/margins": 1.972070336341858, "rewards/rejected": 1.4074218273162842, "step": 220 }, { "epoch": 0.107981220657277, "grad_norm": 321.7772351477239, "learning_rate": 4.958268127282212e-07, "logits/chosen": -4.199999809265137, "logits/rejected": -4.087500095367432, "logps/chosen": -573.5999755859375, "logps/rejected": -684.7999877929688, "loss": 2.6311, "rewards/accuracies": 0.36796873807907104, "rewards/chosen": 3.7828125953674316, "rewards/margins": 2.497265577316284, "rewards/rejected": 1.2843749523162842, "step": 230 }, { "epoch": 0.11267605633802817, "grad_norm": 377.608437816389, "learning_rate": 4.932185706833594e-07, "logits/chosen": -4.131249904632568, "logits/rejected": -4.078125, "logps/chosen": -528.4000244140625, "logps/rejected": -572.0, "loss": 2.7644, "rewards/accuracies": 0.36015623807907104, "rewards/chosen": 3.546875, "rewards/margins": 1.503320336341858, "rewards/rejected": 2.043750047683716, "step": 240 }, { "epoch": 0.11737089201877934, "grad_norm": 309.5342207630007, "learning_rate": 4.906103286384976e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.107812404632568, "logps/chosen": -574.4000244140625, "logps/rejected": -689.4000244140625, "loss": 3.0576, "rewards/accuracies": 0.35078126192092896, "rewards/chosen": 3.879687547683716, "rewards/margins": 2.080127000808716, "rewards/rejected": 1.8014647960662842, "step": 250 }, { "epoch": 0.12206572769953052, "grad_norm": 408.1206998275862, "learning_rate": 4.880020865936358e-07, "logits/chosen": -4.137499809265137, "logits/rejected": -4.067187309265137, "logps/chosen": -540.7999877929688, "logps/rejected": -636.0, "loss": 2.6253, "rewards/accuracies": 0.37890625, "rewards/chosen": 4.121874809265137, "rewards/margins": 1.921875, "rewards/rejected": 2.203906297683716, "step": 260 }, { "epoch": 0.1267605633802817, "grad_norm": 295.9696657695239, "learning_rate": 4.853938445487741e-07, "logits/chosen": -4.078125, "logits/rejected": -4.003125190734863, "logps/chosen": -573.2000122070312, "logps/rejected": -664.0, "loss": 2.9412, "rewards/accuracies": 0.3921875059604645, "rewards/chosen": 4.870312690734863, "rewards/margins": 3.128613233566284, "rewards/rejected": 1.746484398841858, "step": 270 }, { "epoch": 0.13145539906103287, "grad_norm": 343.11140208524813, "learning_rate": 4.827856025039123e-07, "logits/chosen": -4.178124904632568, "logits/rejected": -4.099999904632568, "logps/chosen": -570.2000122070312, "logps/rejected": -670.4000244140625, "loss": 3.3178, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 5.498437404632568, "rewards/margins": 3.4000000953674316, "rewards/rejected": 2.1011719703674316, "step": 280 }, { "epoch": 0.13615023474178403, "grad_norm": 304.3885220361387, "learning_rate": 4.801773604590506e-07, "logits/chosen": -4.135937690734863, "logits/rejected": -4.057812690734863, "logps/chosen": -612.7999877929688, "logps/rejected": -667.5999755859375, "loss": 3.2102, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 5.34375, "rewards/margins": 2.6572265625, "rewards/rejected": 2.6861329078674316, "step": 290 }, { "epoch": 0.14084507042253522, "grad_norm": 284.5528315210802, "learning_rate": 4.775691184141888e-07, "logits/chosen": -4.146874904632568, "logits/rejected": -4.084374904632568, "logps/chosen": -590.4000244140625, "logps/rejected": -663.5999755859375, "loss": 3.0293, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 6.224999904632568, "rewards/margins": 3.686718702316284, "rewards/rejected": 2.537890672683716, "step": 300 }, { "epoch": 0.14553990610328638, "grad_norm": 340.14707313956575, "learning_rate": 4.749608763693271e-07, "logits/chosen": -4.1875, "logits/rejected": -4.079687595367432, "logps/chosen": -565.2000122070312, "logps/rejected": -650.0, "loss": 3.0758, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 6.237500190734863, "rewards/margins": 4.151562690734863, "rewards/rejected": 2.0859375, "step": 310 }, { "epoch": 0.15023474178403756, "grad_norm": 357.8195713232441, "learning_rate": 4.7235263432446533e-07, "logits/chosen": -4.1875, "logits/rejected": -4.046875, "logps/chosen": -565.7999877929688, "logps/rejected": -641.2000122070312, "loss": 3.2169, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 6.146874904632568, "rewards/margins": 3.5914063453674316, "rewards/rejected": 2.5484375953674316, "step": 320 }, { "epoch": 0.15492957746478872, "grad_norm": 387.3224371650379, "learning_rate": 4.6974439227960353e-07, "logits/chosen": -4.178124904632568, "logits/rejected": -4.010937690734863, "logps/chosen": -584.2000122070312, "logps/rejected": -670.4000244140625, "loss": 3.0496, "rewards/accuracies": 0.45781248807907104, "rewards/chosen": 6.340624809265137, "rewards/margins": 4.669140815734863, "rewards/rejected": 1.6662108898162842, "step": 330 }, { "epoch": 0.1596244131455399, "grad_norm": 378.7521561914789, "learning_rate": 4.671361502347418e-07, "logits/chosen": -4.203125, "logits/rejected": -4.118750095367432, "logps/chosen": -555.7999877929688, "logps/rejected": -601.4000244140625, "loss": 3.6385, "rewards/accuracies": 0.453125, "rewards/chosen": 6.518750190734863, "rewards/margins": 3.078320264816284, "rewards/rejected": 3.4437499046325684, "step": 340 }, { "epoch": 0.1643192488262911, "grad_norm": 280.186323729001, "learning_rate": 4.6452790818988004e-07, "logits/chosen": -4.214062690734863, "logits/rejected": -4.126562595367432, "logps/chosen": -593.2000122070312, "logps/rejected": -669.2000122070312, "loss": 3.508, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 6.784375190734863, "rewards/margins": 4.580859184265137, "rewards/rejected": 2.203418016433716, "step": 350 }, { "epoch": 0.16901408450704225, "grad_norm": 289.7400713232728, "learning_rate": 4.6191966614501824e-07, "logits/chosen": -4.173437595367432, "logits/rejected": -4.074999809265137, "logps/chosen": -560.5999755859375, "logps/rejected": -643.7999877929688, "loss": 3.0314, "rewards/accuracies": 0.46406251192092896, "rewards/chosen": 7.434374809265137, "rewards/margins": 4.764843940734863, "rewards/rejected": 2.671093702316284, "step": 360 }, { "epoch": 0.17370892018779344, "grad_norm": 305.1857644992596, "learning_rate": 4.593114241001565e-07, "logits/chosen": -4.115624904632568, "logits/rejected": -4.074999809265137, "logps/chosen": -602.4000244140625, "logps/rejected": -672.0, "loss": 3.0562, "rewards/accuracies": 0.4429687559604645, "rewards/chosen": 7.678124904632568, "rewards/margins": 5.004687309265137, "rewards/rejected": 2.671093702316284, "step": 370 }, { "epoch": 0.1784037558685446, "grad_norm": 281.3726377297068, "learning_rate": 4.5670318205529474e-07, "logits/chosen": -4.146874904632568, "logits/rejected": -4.026562690734863, "logps/chosen": -570.7999877929688, "logps/rejected": -666.4000244140625, "loss": 3.4014, "rewards/accuracies": 0.44453126192092896, "rewards/chosen": 6.846875190734863, "rewards/margins": 3.7099609375, "rewards/rejected": 3.1382813453674316, "step": 380 }, { "epoch": 0.18309859154929578, "grad_norm": 318.3140273505768, "learning_rate": 4.54094940010433e-07, "logits/chosen": -4.168749809265137, "logits/rejected": -4.053124904632568, "logps/chosen": -581.2000122070312, "logps/rejected": -679.5999755859375, "loss": 3.165, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": 7.240624904632568, "rewards/margins": 4.939062595367432, "rewards/rejected": 2.298828125, "step": 390 }, { "epoch": 0.18779342723004694, "grad_norm": 266.65165381199415, "learning_rate": 4.514866979655712e-07, "logits/chosen": -4.181250095367432, "logits/rejected": -4.048437595367432, "logps/chosen": -572.0, "logps/rejected": -645.5999755859375, "loss": 2.8849, "rewards/accuracies": 0.46171873807907104, "rewards/chosen": 7.662499904632568, "rewards/margins": 5.106249809265137, "rewards/rejected": 2.54736328125, "step": 400 }, { "epoch": 0.19248826291079812, "grad_norm": 267.9080238940424, "learning_rate": 4.4887845592070945e-07, "logits/chosen": -4.109375, "logits/rejected": -3.995312452316284, "logps/chosen": -559.5999755859375, "logps/rejected": -688.4000244140625, "loss": 3.2426, "rewards/accuracies": 0.46406251192092896, "rewards/chosen": 7.618750095367432, "rewards/margins": 4.5625, "rewards/rejected": 3.0562500953674316, "step": 410 }, { "epoch": 0.19718309859154928, "grad_norm": 312.69487450228394, "learning_rate": 4.462702138758477e-07, "logits/chosen": -4.121874809265137, "logits/rejected": -4.03125, "logps/chosen": -597.5999755859375, "logps/rejected": -653.2000122070312, "loss": 3.7578, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 7.662499904632568, "rewards/margins": 3.947070360183716, "rewards/rejected": 3.7164063453674316, "step": 420 }, { "epoch": 0.20187793427230047, "grad_norm": 323.8865547251275, "learning_rate": 4.436619718309859e-07, "logits/chosen": -4.103125095367432, "logits/rejected": -3.9546875953674316, "logps/chosen": -581.4000244140625, "logps/rejected": -698.7999877929688, "loss": 3.3093, "rewards/accuracies": 0.4632812440395355, "rewards/chosen": 7.925000190734863, "rewards/margins": 5.646874904632568, "rewards/rejected": 2.2835936546325684, "step": 430 }, { "epoch": 0.20657276995305165, "grad_norm": 393.9159806692629, "learning_rate": 4.4105372978612415e-07, "logits/chosen": -4.212500095367432, "logits/rejected": -4.090624809265137, "logps/chosen": -597.2000122070312, "logps/rejected": -683.2000122070312, "loss": 3.6426, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 7.912499904632568, "rewards/margins": 5.670312404632568, "rewards/rejected": 2.243359327316284, "step": 440 }, { "epoch": 0.2112676056338028, "grad_norm": 373.9652792696506, "learning_rate": 4.384454877412624e-07, "logits/chosen": -4.189062595367432, "logits/rejected": -4.051562309265137, "logps/chosen": -582.5999755859375, "logps/rejected": -746.7999877929688, "loss": 3.6727, "rewards/accuracies": 0.4554687440395355, "rewards/chosen": 7.740624904632568, "rewards/margins": 4.469531059265137, "rewards/rejected": 3.26953125, "step": 450 }, { "epoch": 0.215962441314554, "grad_norm": 330.13898710319694, "learning_rate": 4.358372456964006e-07, "logits/chosen": -4.1328125, "logits/rejected": -4.0703125, "logps/chosen": -574.4000244140625, "logps/rejected": -666.7999877929688, "loss": 3.2602, "rewards/accuracies": 0.4765625, "rewards/chosen": 8.378125190734863, "rewards/margins": 5.387499809265137, "rewards/rejected": 2.9869141578674316, "step": 460 }, { "epoch": 0.22065727699530516, "grad_norm": 293.6213926149627, "learning_rate": 4.3322900365153886e-07, "logits/chosen": -4.25, "logits/rejected": -4.096875190734863, "logps/chosen": -612.0, "logps/rejected": -669.5999755859375, "loss": 3.3312, "rewards/accuracies": 0.46953123807907104, "rewards/chosen": 8.553125381469727, "rewards/margins": 5.612500190734863, "rewards/rejected": 2.9410157203674316, "step": 470 }, { "epoch": 0.22535211267605634, "grad_norm": 295.2353414406269, "learning_rate": 4.306207616066771e-07, "logits/chosen": -4.209374904632568, "logits/rejected": -4.096875190734863, "logps/chosen": -562.2000122070312, "logps/rejected": -645.2000122070312, "loss": 3.1204, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 8.978124618530273, "rewards/margins": 5.035937309265137, "rewards/rejected": 3.94140625, "step": 480 }, { "epoch": 0.2300469483568075, "grad_norm": 254.67708489014822, "learning_rate": 4.280125195618153e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -4.010937690734863, "logps/chosen": -573.0, "logps/rejected": -675.4000244140625, "loss": 2.9846, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": 9.565625190734863, "rewards/margins": 6.709374904632568, "rewards/rejected": 2.8531250953674316, "step": 490 }, { "epoch": 0.2347417840375587, "grad_norm": 274.79861750540084, "learning_rate": 4.2540427751695357e-07, "logits/chosen": -4.146874904632568, "logits/rejected": -4.056250095367432, "logps/chosen": -580.0, "logps/rejected": -646.4000244140625, "loss": 3.157, "rewards/accuracies": 0.47968751192092896, "rewards/chosen": 8.712499618530273, "rewards/margins": 5.418749809265137, "rewards/rejected": 3.296875, "step": 500 }, { "epoch": 0.23943661971830985, "grad_norm": 250.05393627156735, "learning_rate": 4.227960354720918e-07, "logits/chosen": -4.221875190734863, "logits/rejected": -4.1171875, "logps/chosen": -566.5999755859375, "logps/rejected": -685.2000122070312, "loss": 2.8452, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 9.324999809265137, "rewards/margins": 7.099999904632568, "rewards/rejected": 2.210742235183716, "step": 510 }, { "epoch": 0.24413145539906103, "grad_norm": 289.17642513775206, "learning_rate": 4.2018779342723e-07, "logits/chosen": -4.150000095367432, "logits/rejected": -4.079687595367432, "logps/chosen": -582.4000244140625, "logps/rejected": -637.2000122070312, "loss": 3.1881, "rewards/accuracies": 0.47734373807907104, "rewards/chosen": 9.550000190734863, "rewards/margins": 5.614062309265137, "rewards/rejected": 3.934765577316284, "step": 520 }, { "epoch": 0.24882629107981222, "grad_norm": 301.4772408122623, "learning_rate": 4.1757955138236827e-07, "logits/chosen": -4.21875, "logits/rejected": -4.059374809265137, "logps/chosen": -565.4000244140625, "logps/rejected": -671.2000122070312, "loss": 3.2645, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 9.428125381469727, "rewards/margins": 7.178124904632568, "rewards/rejected": 2.2474608421325684, "step": 530 }, { "epoch": 0.2535211267605634, "grad_norm": 291.7949545654684, "learning_rate": 4.149713093375065e-07, "logits/chosen": -4.193749904632568, "logits/rejected": -4.078125, "logps/chosen": -566.4000244140625, "logps/rejected": -618.7999877929688, "loss": 3.2226, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 9.790624618530273, "rewards/margins": 5.895312309265137, "rewards/rejected": 3.8968749046325684, "step": 540 }, { "epoch": 0.25821596244131456, "grad_norm": 295.46300897462413, "learning_rate": 4.123630672926447e-07, "logits/chosen": -4.189062595367432, "logits/rejected": -4.09375, "logps/chosen": -555.4000244140625, "logps/rejected": -664.5999755859375, "loss": 3.3349, "rewards/accuracies": 0.4828124940395355, "rewards/chosen": 9.003125190734863, "rewards/margins": 5.984375, "rewards/rejected": 3.02392578125, "step": 550 }, { "epoch": 0.26291079812206575, "grad_norm": 282.55243937448216, "learning_rate": 4.09754825247783e-07, "logits/chosen": -4.168749809265137, "logits/rejected": -4.034375190734863, "logps/chosen": -575.2000122070312, "logps/rejected": -645.5999755859375, "loss": 3.3186, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 9.703125, "rewards/margins": 6.403124809265137, "rewards/rejected": 3.303906202316284, "step": 560 }, { "epoch": 0.2676056338028169, "grad_norm": 243.28520451102895, "learning_rate": 4.0714658320292123e-07, "logits/chosen": -4.217187404632568, "logits/rejected": -4.068749904632568, "logps/chosen": -574.0, "logps/rejected": -659.5999755859375, "loss": 3.5655, "rewards/accuracies": 0.4820312559604645, "rewards/chosen": 8.893750190734863, "rewards/margins": 6.009375095367432, "rewards/rejected": 2.874218702316284, "step": 570 }, { "epoch": 0.27230046948356806, "grad_norm": 290.7842698495749, "learning_rate": 4.045383411580595e-07, "logits/chosen": -4.215624809265137, "logits/rejected": -4.096875190734863, "logps/chosen": -555.0, "logps/rejected": -658.4000244140625, "loss": 2.9852, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 10.081250190734863, "rewards/margins": 7.290625095367432, "rewards/rejected": 2.79833984375, "step": 580 }, { "epoch": 0.27699530516431925, "grad_norm": 291.2604745601705, "learning_rate": 4.019300991131977e-07, "logits/chosen": -4.140625, "logits/rejected": -4.015625, "logps/chosen": -557.0, "logps/rejected": -648.7999877929688, "loss": 3.018, "rewards/accuracies": 0.5023437738418579, "rewards/chosen": 10.199999809265137, "rewards/margins": 6.995312690734863, "rewards/rejected": 3.2113280296325684, "step": 590 }, { "epoch": 0.28169014084507044, "grad_norm": 227.7522974477189, "learning_rate": 3.9932185706833594e-07, "logits/chosen": -4.206250190734863, "logits/rejected": -4.068749904632568, "logps/chosen": -583.2000122070312, "logps/rejected": -650.0, "loss": 3.2984, "rewards/accuracies": 0.51171875, "rewards/chosen": 10.106249809265137, "rewards/margins": 6.421875, "rewards/rejected": 3.671093702316284, "step": 600 }, { "epoch": 0.2863849765258216, "grad_norm": 287.0075813999204, "learning_rate": 3.967136150234742e-07, "logits/chosen": -4.151562690734863, "logits/rejected": -4.059374809265137, "logps/chosen": -584.2000122070312, "logps/rejected": -664.7999877929688, "loss": 3.377, "rewards/accuracies": 0.4828124940395355, "rewards/chosen": 10.190625190734863, "rewards/margins": 6.734375, "rewards/rejected": 3.453906297683716, "step": 610 }, { "epoch": 0.29107981220657275, "grad_norm": 280.2718266050094, "learning_rate": 3.941053729786124e-07, "logits/chosen": -4.154687404632568, "logits/rejected": -4.051562309265137, "logps/chosen": -587.4000244140625, "logps/rejected": -661.5999755859375, "loss": 3.1514, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 9.859375, "rewards/margins": 7.278124809265137, "rewards/rejected": 2.56640625, "step": 620 }, { "epoch": 0.29577464788732394, "grad_norm": 324.67176703959694, "learning_rate": 3.9149713093375064e-07, "logits/chosen": -4.1015625, "logits/rejected": -3.957812547683716, "logps/chosen": -580.0, "logps/rejected": -683.5999755859375, "loss": 3.7381, "rewards/accuracies": 0.504687488079071, "rewards/chosen": 9.53125, "rewards/margins": 6.6875, "rewards/rejected": 2.8443360328674316, "step": 630 }, { "epoch": 0.3004694835680751, "grad_norm": 316.9452197327162, "learning_rate": 3.888888888888889e-07, "logits/chosen": -4.185937404632568, "logits/rejected": -4.026562690734863, "logps/chosen": -568.5999755859375, "logps/rejected": -696.0, "loss": 3.5189, "rewards/accuracies": 0.4765625, "rewards/chosen": 9.512499809265137, "rewards/margins": 6.237500190734863, "rewards/rejected": 3.28076171875, "step": 640 }, { "epoch": 0.3051643192488263, "grad_norm": 297.1688628970196, "learning_rate": 3.862806468440271e-07, "logits/chosen": -4.284375190734863, "logits/rejected": -4.184374809265137, "logps/chosen": -527.0, "logps/rejected": -620.0, "loss": 3.1752, "rewards/accuracies": 0.4945312440395355, "rewards/chosen": 9.568750381469727, "rewards/margins": 5.458984375, "rewards/rejected": 4.130053520202637, "step": 650 }, { "epoch": 0.30985915492957744, "grad_norm": 251.23055282468778, "learning_rate": 3.8367240479916535e-07, "logits/chosen": -4.192187309265137, "logits/rejected": -4.151562690734863, "logps/chosen": -560.4000244140625, "logps/rejected": -623.0, "loss": 3.2523, "rewards/accuracies": 0.48046875, "rewards/chosen": 9.712499618530273, "rewards/margins": 5.959374904632568, "rewards/rejected": 3.7593750953674316, "step": 660 }, { "epoch": 0.3145539906103286, "grad_norm": 322.7495965594595, "learning_rate": 3.810641627543036e-07, "logits/chosen": -4.196875095367432, "logits/rejected": -4.09375, "logps/chosen": -576.4000244140625, "logps/rejected": -658.2000122070312, "loss": 3.4558, "rewards/accuracies": 0.49609375, "rewards/chosen": 9.318750381469727, "rewards/margins": 6.964062690734863, "rewards/rejected": 2.3499999046325684, "step": 670 }, { "epoch": 0.3192488262910798, "grad_norm": 304.2542609024998, "learning_rate": 3.784559207094418e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.074999809265137, "logps/chosen": -595.2000122070312, "logps/rejected": -651.2000122070312, "loss": 3.9372, "rewards/accuracies": 0.484375, "rewards/chosen": 9.699999809265137, "rewards/margins": 5.493750095367432, "rewards/rejected": 4.212500095367432, "step": 680 }, { "epoch": 0.323943661971831, "grad_norm": 242.7782318272521, "learning_rate": 3.7584767866458005e-07, "logits/chosen": -4.162499904632568, "logits/rejected": -4.042187690734863, "logps/chosen": -578.4000244140625, "logps/rejected": -621.7999877929688, "loss": 3.8324, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 8.943750381469727, "rewards/margins": 6.134375095367432, "rewards/rejected": 2.81640625, "step": 690 }, { "epoch": 0.3286384976525822, "grad_norm": 387.5093883157667, "learning_rate": 3.732394366197183e-07, "logits/chosen": -4.212500095367432, "logits/rejected": -4.109375, "logps/chosen": -606.5999755859375, "logps/rejected": -635.5999755859375, "loss": 4.5264, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 8.987500190734863, "rewards/margins": 5.770312309265137, "rewards/rejected": 3.219531297683716, "step": 700 }, { "epoch": 0.3333333333333333, "grad_norm": 357.2057827892341, "learning_rate": 3.706311945748565e-07, "logits/chosen": -4.168749809265137, "logits/rejected": -4.051562309265137, "logps/chosen": -623.5999755859375, "logps/rejected": -645.5999755859375, "loss": 3.9486, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 9.634374618530273, "rewards/margins": 6.546875, "rewards/rejected": 3.098437547683716, "step": 710 }, { "epoch": 0.3380281690140845, "grad_norm": 289.48952564262845, "learning_rate": 3.6802295252999476e-07, "logits/chosen": -4.15625, "logits/rejected": -3.9703125953674316, "logps/chosen": -590.7999877929688, "logps/rejected": -686.5999755859375, "loss": 3.6266, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 9.737500190734863, "rewards/margins": 7.662499904632568, "rewards/rejected": 2.065869092941284, "step": 720 }, { "epoch": 0.3427230046948357, "grad_norm": 322.44572436537794, "learning_rate": 3.65414710485133e-07, "logits/chosen": -4.159375190734863, "logits/rejected": -4.09375, "logps/chosen": -585.2000122070312, "logps/rejected": -628.5999755859375, "loss": 3.707, "rewards/accuracies": 0.515625, "rewards/chosen": 9.618749618530273, "rewards/margins": 7.190625190734863, "rewards/rejected": 2.4375, "step": 730 }, { "epoch": 0.3474178403755869, "grad_norm": 327.4047879299755, "learning_rate": 3.6280646844027127e-07, "logits/chosen": -4.135937690734863, "logits/rejected": -4.040625095367432, "logps/chosen": -564.0, "logps/rejected": -702.0, "loss": 2.9845, "rewards/accuracies": 0.5257812738418579, "rewards/chosen": 9.962499618530273, "rewards/margins": 8.574999809265137, "rewards/rejected": 1.39453125, "step": 740 }, { "epoch": 0.352112676056338, "grad_norm": 268.40285966149014, "learning_rate": 3.6019822639540947e-07, "logits/chosen": -4.150000095367432, "logits/rejected": -4.042187690734863, "logps/chosen": -599.7999877929688, "logps/rejected": -627.2000122070312, "loss": 2.9559, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": 10.800000190734863, "rewards/margins": 7.465624809265137, "rewards/rejected": 3.3382811546325684, "step": 750 }, { "epoch": 0.3568075117370892, "grad_norm": 228.49576724117685, "learning_rate": 3.575899843505477e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.071875095367432, "logps/chosen": -571.5999755859375, "logps/rejected": -611.5999755859375, "loss": 3.3682, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 9.946874618530273, "rewards/margins": 6.965234279632568, "rewards/rejected": 2.9810547828674316, "step": 760 }, { "epoch": 0.3615023474178404, "grad_norm": 339.19249872693797, "learning_rate": 3.5498174230568597e-07, "logits/chosen": -4.110937595367432, "logits/rejected": -4.0234375, "logps/chosen": -570.4000244140625, "logps/rejected": -650.7999877929688, "loss": 2.9888, "rewards/accuracies": 0.51953125, "rewards/chosen": 9.78125, "rewards/margins": 8.318750381469727, "rewards/rejected": 1.464257836341858, "step": 770 }, { "epoch": 0.36619718309859156, "grad_norm": 307.4418590636236, "learning_rate": 3.5237350026082417e-07, "logits/chosen": -4.125, "logits/rejected": -4.046875, "logps/chosen": -578.4000244140625, "logps/rejected": -609.2000122070312, "loss": 3.5426, "rewards/accuracies": 0.49140626192092896, "rewards/chosen": 9.300000190734863, "rewards/margins": 6.425000190734863, "rewards/rejected": 2.8755860328674316, "step": 780 }, { "epoch": 0.37089201877934275, "grad_norm": 304.8766546196448, "learning_rate": 3.497652582159624e-07, "logits/chosen": -4.140625, "logits/rejected": -4.051562309265137, "logps/chosen": -582.7999877929688, "logps/rejected": -674.0, "loss": 3.1521, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 9.399999618530273, "rewards/margins": 7.859375, "rewards/rejected": 1.552148461341858, "step": 790 }, { "epoch": 0.3755868544600939, "grad_norm": 253.18672796971052, "learning_rate": 3.471570161711007e-07, "logits/chosen": -4.168749809265137, "logits/rejected": -4.042187690734863, "logps/chosen": -609.7999877929688, "logps/rejected": -685.7999877929688, "loss": 3.4785, "rewards/accuracies": 0.51171875, "rewards/chosen": 9.543749809265137, "rewards/margins": 8.240625381469727, "rewards/rejected": 1.3092772960662842, "step": 800 }, { "epoch": 0.38028169014084506, "grad_norm": 267.7610739280651, "learning_rate": 3.445487741262389e-07, "logits/chosen": -4.184374809265137, "logits/rejected": -4.103125095367432, "logps/chosen": -561.5999755859375, "logps/rejected": -647.5999755859375, "loss": 3.9487, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 9.259374618530273, "rewards/margins": 7.068749904632568, "rewards/rejected": 2.196093797683716, "step": 810 }, { "epoch": 0.38497652582159625, "grad_norm": 278.98516681627376, "learning_rate": 3.4194053208137713e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -4.051562309265137, "logps/chosen": -571.2000122070312, "logps/rejected": -622.0, "loss": 3.2654, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 9.9375, "rewards/margins": 7.559374809265137, "rewards/rejected": 2.372119188308716, "step": 820 }, { "epoch": 0.38967136150234744, "grad_norm": 263.29366918290094, "learning_rate": 3.393322900365154e-07, "logits/chosen": -4.171875, "logits/rejected": -4.009375095367432, "logps/chosen": -540.5999755859375, "logps/rejected": -624.0, "loss": 3.855, "rewards/accuracies": 0.51171875, "rewards/chosen": 8.768750190734863, "rewards/margins": 6.830078125, "rewards/rejected": 1.9441406726837158, "step": 830 }, { "epoch": 0.39436619718309857, "grad_norm": 268.93242807228216, "learning_rate": 3.367240479916536e-07, "logits/chosen": -4.162499904632568, "logits/rejected": -4.026562690734863, "logps/chosen": -606.0, "logps/rejected": -673.5999755859375, "loss": 4.048, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 8.678125381469727, "rewards/margins": 6.693749904632568, "rewards/rejected": 1.9873046875, "step": 840 }, { "epoch": 0.39906103286384975, "grad_norm": 312.88147844071045, "learning_rate": 3.3411580594679184e-07, "logits/chosen": -4.2109375, "logits/rejected": -4.068749904632568, "logps/chosen": -561.7999877929688, "logps/rejected": -646.7999877929688, "loss": 3.2428, "rewards/accuracies": 0.5054687261581421, "rewards/chosen": 9.328125, "rewards/margins": 7.162499904632568, "rewards/rejected": 2.1823487281799316, "step": 850 }, { "epoch": 0.40375586854460094, "grad_norm": 260.7141136621247, "learning_rate": 3.315075639019301e-07, "logits/chosen": -4.173437595367432, "logits/rejected": -4.095312595367432, "logps/chosen": -555.5999755859375, "logps/rejected": -642.0, "loss": 2.9281, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 9.393750190734863, "rewards/margins": 8.515625, "rewards/rejected": 0.88330078125, "step": 860 }, { "epoch": 0.4084507042253521, "grad_norm": 296.8891743971074, "learning_rate": 3.288993218570683e-07, "logits/chosen": -4.199999809265137, "logits/rejected": -4.040625095367432, "logps/chosen": -562.7999877929688, "logps/rejected": -628.0, "loss": 3.5136, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 9.068750381469727, "rewards/margins": 6.717187404632568, "rewards/rejected": 2.3675780296325684, "step": 870 }, { "epoch": 0.4131455399061033, "grad_norm": 265.0685724449779, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -4.170312404632568, "logits/rejected": -3.996875047683716, "logps/chosen": -586.2000122070312, "logps/rejected": -651.0, "loss": 3.7241, "rewards/accuracies": 0.48906248807907104, "rewards/chosen": 8.490625381469727, "rewards/margins": 5.989062309265137, "rewards/rejected": 2.502978563308716, "step": 880 }, { "epoch": 0.41784037558685444, "grad_norm": 290.79755612144265, "learning_rate": 3.236828377673448e-07, "logits/chosen": -4.107812404632568, "logits/rejected": -4.0703125, "logps/chosen": -609.2000122070312, "logps/rejected": -644.0, "loss": 4.4678, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 8.574999809265137, "rewards/margins": 5.985937595367432, "rewards/rejected": 2.5953125953674316, "step": 890 }, { "epoch": 0.4225352112676056, "grad_norm": 261.2611410206578, "learning_rate": 3.2107459572248305e-07, "logits/chosen": -4.118750095367432, "logits/rejected": -4.043749809265137, "logps/chosen": -587.2000122070312, "logps/rejected": -651.2000122070312, "loss": 2.8678, "rewards/accuracies": 0.5335937738418579, "rewards/chosen": 9.399999618530273, "rewards/margins": 8.534375190734863, "rewards/rejected": 0.861328125, "step": 900 }, { "epoch": 0.4272300469483568, "grad_norm": 362.8472585991743, "learning_rate": 3.1846635367762125e-07, "logits/chosen": -4.131249904632568, "logits/rejected": -4.028124809265137, "logps/chosen": -587.4000244140625, "logps/rejected": -677.7999877929688, "loss": 3.9947, "rewards/accuracies": 0.4921875, "rewards/chosen": 9.056249618530273, "rewards/margins": 7.450781345367432, "rewards/rejected": 1.5958983898162842, "step": 910 }, { "epoch": 0.431924882629108, "grad_norm": 428.9226775378459, "learning_rate": 3.158581116327595e-07, "logits/chosen": -4.15625, "logits/rejected": -4.059374809265137, "logps/chosen": -586.0, "logps/rejected": -707.5999755859375, "loss": 3.3051, "rewards/accuracies": 0.515625, "rewards/chosen": 9.362500190734863, "rewards/margins": 8.137499809265137, "rewards/rejected": 1.2257812023162842, "step": 920 }, { "epoch": 0.43661971830985913, "grad_norm": 363.0151726164607, "learning_rate": 3.1324986958789775e-07, "logits/chosen": -4.171875, "logits/rejected": -4.067187309265137, "logps/chosen": -547.4000244140625, "logps/rejected": -624.4000244140625, "loss": 3.1743, "rewards/accuracies": 0.4898437559604645, "rewards/chosen": 9.231249809265137, "rewards/margins": 6.953125, "rewards/rejected": 2.28125, "step": 930 }, { "epoch": 0.4413145539906103, "grad_norm": 300.234697004928, "learning_rate": 3.1064162754303595e-07, "logits/chosen": -4.140625, "logits/rejected": -3.9921875, "logps/chosen": -585.0, "logps/rejected": -686.0, "loss": 3.6555, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 8.868749618530273, "rewards/margins": 8.421875, "rewards/rejected": 0.443359375, "step": 940 }, { "epoch": 0.4460093896713615, "grad_norm": 307.2370185290697, "learning_rate": 3.080333854981742e-07, "logits/chosen": -4.131249904632568, "logits/rejected": -4.017187595367432, "logps/chosen": -573.0, "logps/rejected": -626.0, "loss": 3.6289, "rewards/accuracies": 0.520312488079071, "rewards/chosen": 9.081250190734863, "rewards/margins": 6.7421875, "rewards/rejected": 2.350781202316284, "step": 950 }, { "epoch": 0.4507042253521127, "grad_norm": 315.1420548177452, "learning_rate": 3.0542514345331246e-07, "logits/chosen": -4.159375190734863, "logits/rejected": -4.032812595367432, "logps/chosen": -582.5999755859375, "logps/rejected": -710.0, "loss": 3.708, "rewards/accuracies": 0.4828124940395355, "rewards/chosen": 8.524999618530273, "rewards/margins": 7.0859375, "rewards/rejected": 1.443359375, "step": 960 }, { "epoch": 0.45539906103286387, "grad_norm": 292.1652064232398, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -4.103125095367432, "logits/rejected": -4.059374809265137, "logps/chosen": -611.2000122070312, "logps/rejected": -673.4000244140625, "loss": 3.8467, "rewards/accuracies": 0.48828125, "rewards/chosen": 8.893750190734863, "rewards/margins": 6.938281059265137, "rewards/rejected": 1.955664038658142, "step": 970 }, { "epoch": 0.460093896713615, "grad_norm": 305.7034108659845, "learning_rate": 3.002086593635889e-07, "logits/chosen": -4.206250190734863, "logits/rejected": -4.035937309265137, "logps/chosen": -576.0, "logps/rejected": -676.7999877929688, "loss": 2.8016, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 9.1875, "rewards/margins": 9.334375381469727, "rewards/rejected": -0.13544921576976776, "step": 980 }, { "epoch": 0.4647887323943662, "grad_norm": 276.83804710650116, "learning_rate": 2.9760041731872716e-07, "logits/chosen": -4.193749904632568, "logits/rejected": -4.089062690734863, "logps/chosen": -538.5999755859375, "logps/rejected": -639.2000122070312, "loss": 2.9199, "rewards/accuracies": 0.535937488079071, "rewards/chosen": 9.056249618530273, "rewards/margins": 8.396875381469727, "rewards/rejected": 0.6742187738418579, "step": 990 }, { "epoch": 0.4694835680751174, "grad_norm": 994.2333919263641, "learning_rate": 2.9499217527386536e-07, "logits/chosen": -4.178124904632568, "logits/rejected": -4.0859375, "logps/chosen": -563.7999877929688, "logps/rejected": -655.2000122070312, "loss": 2.8529, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 9.096875190734863, "rewards/margins": 8.024999618530273, "rewards/rejected": 1.073144555091858, "step": 1000 }, { "epoch": 0.47417840375586856, "grad_norm": 263.164923515219, "learning_rate": 2.923839332290036e-07, "logits/chosen": -4.123437404632568, "logits/rejected": -4.012499809265137, "logps/chosen": -560.0, "logps/rejected": -667.2000122070312, "loss": 3.148, "rewards/accuracies": 0.49921876192092896, "rewards/chosen": 8.240625381469727, "rewards/margins": 7.603125095367432, "rewards/rejected": 0.6279296875, "step": 1010 }, { "epoch": 0.4788732394366197, "grad_norm": 318.0813412687802, "learning_rate": 2.8977569118414187e-07, "logits/chosen": -4.25, "logits/rejected": -4.096875190734863, "logps/chosen": -549.4000244140625, "logps/rejected": -635.7999877929688, "loss": 3.1516, "rewards/accuracies": 0.526562511920929, "rewards/chosen": 8.353124618530273, "rewards/margins": 7.503125190734863, "rewards/rejected": 0.833789050579071, "step": 1020 }, { "epoch": 0.4835680751173709, "grad_norm": 277.03569972426686, "learning_rate": 2.8716744913928007e-07, "logits/chosen": -4.214062690734863, "logits/rejected": -4.106249809265137, "logps/chosen": -572.7999877929688, "logps/rejected": -632.2000122070312, "loss": 2.8036, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": 9.274999618530273, "rewards/margins": 8.546875, "rewards/rejected": 0.735546886920929, "step": 1030 }, { "epoch": 0.48826291079812206, "grad_norm": 211.8844280559408, "learning_rate": 2.845592070944183e-07, "logits/chosen": -4.206250190734863, "logits/rejected": -4.074999809265137, "logps/chosen": -570.5999755859375, "logps/rejected": -683.5999755859375, "loss": 3.0776, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 9.168749809265137, "rewards/margins": 7.628125190734863, "rewards/rejected": 1.5320312976837158, "step": 1040 }, { "epoch": 0.49295774647887325, "grad_norm": 1084.9538807947201, "learning_rate": 2.819509650495566e-07, "logits/chosen": -4.1875, "logits/rejected": -4.025000095367432, "logps/chosen": -562.0, "logps/rejected": -629.0, "loss": 3.2087, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 8.762499809265137, "rewards/margins": 7.685937404632568, "rewards/rejected": 1.0810546875, "step": 1050 }, { "epoch": 0.49765258215962443, "grad_norm": 307.62081588002684, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -4.134375095367432, "logits/rejected": -4.035937309265137, "logps/chosen": -598.0, "logps/rejected": -641.0, "loss": 3.1239, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 9.178125381469727, "rewards/margins": 7.729687690734863, "rewards/rejected": 1.441308617591858, "step": 1060 }, { "epoch": 0.5023474178403756, "grad_norm": 291.91293524568334, "learning_rate": 2.7673448095983303e-07, "logits/chosen": -4.1484375, "logits/rejected": -4.057812690734863, "logps/chosen": -578.0, "logps/rejected": -640.4000244140625, "loss": 3.6736, "rewards/accuracies": 0.4976562559604645, "rewards/chosen": 8.696874618530273, "rewards/margins": 7.473437309265137, "rewards/rejected": 1.2189452648162842, "step": 1070 }, { "epoch": 0.5070422535211268, "grad_norm": 1044.1991089251253, "learning_rate": 2.741262389149713e-07, "logits/chosen": -4.154687404632568, "logits/rejected": -3.9671874046325684, "logps/chosen": -578.0, "logps/rejected": -662.0, "loss": 3.184, "rewards/accuracies": 0.535937488079071, "rewards/chosen": 8.759374618530273, "rewards/margins": 8.140625, "rewards/rejected": 0.632031261920929, "step": 1080 }, { "epoch": 0.5117370892018779, "grad_norm": 268.621353868072, "learning_rate": 2.7151799687010953e-07, "logits/chosen": -4.1875, "logits/rejected": -4.029687404632568, "logps/chosen": -584.2000122070312, "logps/rejected": -665.5999755859375, "loss": 3.0666, "rewards/accuracies": 0.530468761920929, "rewards/chosen": 8.821874618530273, "rewards/margins": 9.159375190734863, "rewards/rejected": -0.3453125059604645, "step": 1090 }, { "epoch": 0.5164319248826291, "grad_norm": 275.9837003696594, "learning_rate": 2.6890975482524773e-07, "logits/chosen": -4.082812309265137, "logits/rejected": -3.9781250953674316, "logps/chosen": -603.0, "logps/rejected": -655.2000122070312, "loss": 3.5672, "rewards/accuracies": 0.53125, "rewards/chosen": 9.087499618530273, "rewards/margins": 8.928125381469727, "rewards/rejected": 0.14448241889476776, "step": 1100 }, { "epoch": 0.5211267605633803, "grad_norm": 331.47009226887195, "learning_rate": 2.66301512780386e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -3.9859375953674316, "logps/chosen": -552.2000122070312, "logps/rejected": -645.5999755859375, "loss": 3.3857, "rewards/accuracies": 0.51171875, "rewards/chosen": 8.665624618530273, "rewards/margins": 8.559374809265137, "rewards/rejected": 0.111328125, "step": 1110 }, { "epoch": 0.5258215962441315, "grad_norm": 317.89046309154855, "learning_rate": 2.6369327073552424e-07, "logits/chosen": -4.203125, "logits/rejected": -4.082812309265137, "logps/chosen": -560.5999755859375, "logps/rejected": -618.7999877929688, "loss": 2.9995, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 8.984375, "rewards/margins": 7.115624904632568, "rewards/rejected": 1.881250023841858, "step": 1120 }, { "epoch": 0.5305164319248826, "grad_norm": 309.94253870846245, "learning_rate": 2.6108502869066244e-07, "logits/chosen": -4.162499904632568, "logits/rejected": -4.0546875, "logps/chosen": -593.5999755859375, "logps/rejected": -688.7999877929688, "loss": 3.2947, "rewards/accuracies": 0.5234375, "rewards/chosen": 9.215624809265137, "rewards/margins": 8.375, "rewards/rejected": 0.826953113079071, "step": 1130 }, { "epoch": 0.5352112676056338, "grad_norm": 366.55680036700505, "learning_rate": 2.584767866458007e-07, "logits/chosen": -4.217187404632568, "logits/rejected": -4.059374809265137, "logps/chosen": -568.4000244140625, "logps/rejected": -646.7999877929688, "loss": 3.5653, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 9.065625190734863, "rewards/margins": 7.668749809265137, "rewards/rejected": 1.3982422351837158, "step": 1140 }, { "epoch": 0.539906103286385, "grad_norm": 272.5639470143928, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -4.146874904632568, "logits/rejected": -4.025000095367432, "logps/chosen": -578.5999755859375, "logps/rejected": -671.2000122070312, "loss": 3.3055, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 8.943750381469727, "rewards/margins": 8.596875190734863, "rewards/rejected": 0.3472656309604645, "step": 1150 }, { "epoch": 0.5446009389671361, "grad_norm": 249.74677763574044, "learning_rate": 2.5326030255607715e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.057812690734863, "logps/chosen": -584.0, "logps/rejected": -654.7999877929688, "loss": 3.5447, "rewards/accuracies": 0.51171875, "rewards/chosen": 9.278124809265137, "rewards/margins": 8.045312881469727, "rewards/rejected": 1.235937476158142, "step": 1160 }, { "epoch": 0.5492957746478874, "grad_norm": 259.21404711583165, "learning_rate": 2.506520605112154e-07, "logits/chosen": -4.1640625, "logits/rejected": -4.081250190734863, "logps/chosen": -576.2000122070312, "logps/rejected": -656.0, "loss": 3.4939, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 8.59375, "rewards/margins": 6.532812595367432, "rewards/rejected": 2.0582032203674316, "step": 1170 }, { "epoch": 0.5539906103286385, "grad_norm": 262.41014591871937, "learning_rate": 2.4804381846635365e-07, "logits/chosen": -4.206250190734863, "logits/rejected": -4.109375, "logps/chosen": -593.4000244140625, "logps/rejected": -650.5999755859375, "loss": 3.4565, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 8.637499809265137, "rewards/margins": 7.8359375, "rewards/rejected": 0.8057616949081421, "step": 1180 }, { "epoch": 0.5586854460093896, "grad_norm": 285.70918665861836, "learning_rate": 2.454355764214919e-07, "logits/chosen": -4.178124904632568, "logits/rejected": -4.089062690734863, "logps/chosen": -554.5999755859375, "logps/rejected": -676.5999755859375, "loss": 3.6881, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 8.412500381469727, "rewards/margins": 7.448437690734863, "rewards/rejected": 0.95751953125, "step": 1190 }, { "epoch": 0.5633802816901409, "grad_norm": 2395.3317758278217, "learning_rate": 2.4282733437663016e-07, "logits/chosen": -4.190625190734863, "logits/rejected": -4.140625, "logps/chosen": -583.4000244140625, "logps/rejected": -633.2000122070312, "loss": 3.8918, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 8.962499618530273, "rewards/margins": 6.510937690734863, "rewards/rejected": 2.446093797683716, "step": 1200 }, { "epoch": 0.568075117370892, "grad_norm": 291.79485364660576, "learning_rate": 2.4021909233176836e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -4.067187309265137, "logps/chosen": -548.5999755859375, "logps/rejected": -653.2000122070312, "loss": 3.6139, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 8.703125, "rewards/margins": 6.978906154632568, "rewards/rejected": 1.7251465320587158, "step": 1210 }, { "epoch": 0.5727699530516432, "grad_norm": 293.112373801934, "learning_rate": 2.376108502869066e-07, "logits/chosen": -4.181250095367432, "logits/rejected": -4.125, "logps/chosen": -585.7999877929688, "logps/rejected": -678.4000244140625, "loss": 3.2837, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 8.96875, "rewards/margins": 7.993750095367432, "rewards/rejected": 0.983593761920929, "step": 1220 }, { "epoch": 0.5774647887323944, "grad_norm": 409.90103938295374, "learning_rate": 2.3500260824204484e-07, "logits/chosen": -4.129687309265137, "logits/rejected": -4.009375095367432, "logps/chosen": -613.2000122070312, "logps/rejected": -688.0, "loss": 3.2107, "rewards/accuracies": 0.5078125, "rewards/chosen": 8.831250190734863, "rewards/margins": 9.434374809265137, "rewards/rejected": -0.60546875, "step": 1230 }, { "epoch": 0.5821596244131455, "grad_norm": 262.57777744316803, "learning_rate": 2.323943661971831e-07, "logits/chosen": -4.221875190734863, "logits/rejected": -4.092187404632568, "logps/chosen": -573.4000244140625, "logps/rejected": -648.7999877929688, "loss": 3.164, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 9.140625, "rewards/margins": 8.3203125, "rewards/rejected": 0.8245605230331421, "step": 1240 }, { "epoch": 0.5868544600938967, "grad_norm": 267.13392657233817, "learning_rate": 2.2978612415232132e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.074999809265137, "logps/chosen": -605.5999755859375, "logps/rejected": -678.0, "loss": 3.06, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 9.199999809265137, "rewards/margins": 8.590624809265137, "rewards/rejected": 0.608593761920929, "step": 1250 }, { "epoch": 0.5915492957746479, "grad_norm": 365.075647857907, "learning_rate": 2.2717788210745957e-07, "logits/chosen": -4.171875, "logits/rejected": -4.065625190734863, "logps/chosen": -568.2000122070312, "logps/rejected": -661.5999755859375, "loss": 3.8677, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 8.6328125, "rewards/margins": 8.237500190734863, "rewards/rejected": 0.39375001192092896, "step": 1260 }, { "epoch": 0.596244131455399, "grad_norm": 258.1348728222688, "learning_rate": 2.245696400625978e-07, "logits/chosen": -4.1875, "logits/rejected": -4.073437690734863, "logps/chosen": -597.4000244140625, "logps/rejected": -670.4000244140625, "loss": 3.3881, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 8.578125, "rewards/margins": 8.340624809265137, "rewards/rejected": 0.23769530653953552, "step": 1270 }, { "epoch": 0.6009389671361502, "grad_norm": 253.69792112851948, "learning_rate": 2.2196139801773602e-07, "logits/chosen": -4.153124809265137, "logits/rejected": -4.0078125, "logps/chosen": -562.2000122070312, "logps/rejected": -628.7999877929688, "loss": 3.3145, "rewards/accuracies": 0.48359376192092896, "rewards/chosen": 8.212499618530273, "rewards/margins": 7.349999904632568, "rewards/rejected": 0.857617199420929, "step": 1280 }, { "epoch": 0.6056338028169014, "grad_norm": 335.2679465369597, "learning_rate": 2.1935315597287428e-07, "logits/chosen": -4.140625, "logits/rejected": -4.045312404632568, "logps/chosen": -582.0, "logps/rejected": -628.7999877929688, "loss": 3.2686, "rewards/accuracies": 0.53125, "rewards/chosen": 8.487500190734863, "rewards/margins": 7.296875, "rewards/rejected": 1.1824219226837158, "step": 1290 }, { "epoch": 0.6103286384976526, "grad_norm": 231.44130498480763, "learning_rate": 2.167449139280125e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.056250095367432, "logps/chosen": -588.2000122070312, "logps/rejected": -672.0, "loss": 2.9305, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 9.725000381469727, "rewards/margins": 9.03125, "rewards/rejected": 0.7081054449081421, "step": 1300 }, { "epoch": 0.6150234741784038, "grad_norm": 266.1692522688036, "learning_rate": 2.1413667188315073e-07, "logits/chosen": -4.237500190734863, "logits/rejected": -4.192187309265137, "logps/chosen": -556.7999877929688, "logps/rejected": -617.2000122070312, "loss": 3.648, "rewards/accuracies": 0.5078125, "rewards/chosen": 8.921875, "rewards/margins": 8.4375, "rewards/rejected": 0.472412109375, "step": 1310 }, { "epoch": 0.6197183098591549, "grad_norm": 391.06496825559407, "learning_rate": 2.1152842983828898e-07, "logits/chosen": -4.126562595367432, "logits/rejected": -4.03125, "logps/chosen": -588.2000122070312, "logps/rejected": -645.2000122070312, "loss": 4.0506, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 8.337499618530273, "rewards/margins": 7.263281345367432, "rewards/rejected": 1.096093773841858, "step": 1320 }, { "epoch": 0.6244131455399061, "grad_norm": 278.7704596502006, "learning_rate": 2.089201877934272e-07, "logits/chosen": -4.128125190734863, "logits/rejected": -4.021874904632568, "logps/chosen": -601.5999755859375, "logps/rejected": -694.7999877929688, "loss": 3.4997, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 8.03125, "rewards/margins": 8.348437309265137, "rewards/rejected": -0.3167968690395355, "step": 1330 }, { "epoch": 0.6291079812206573, "grad_norm": 250.9766144500523, "learning_rate": 2.0631194574856543e-07, "logits/chosen": -4.224999904632568, "logits/rejected": -4.146874904632568, "logps/chosen": -557.0, "logps/rejected": -599.7999877929688, "loss": 3.2297, "rewards/accuracies": 0.50390625, "rewards/chosen": 8.934374809265137, "rewards/margins": 6.598437309265137, "rewards/rejected": 2.334765672683716, "step": 1340 }, { "epoch": 0.6338028169014085, "grad_norm": 286.2932148393472, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -4.114062309265137, "logits/rejected": -4.025000095367432, "logps/chosen": -581.5999755859375, "logps/rejected": -696.0, "loss": 3.2912, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 8.237500190734863, "rewards/margins": 7.896874904632568, "rewards/rejected": 0.33769530057907104, "step": 1350 }, { "epoch": 0.6384976525821596, "grad_norm": 254.91504224414876, "learning_rate": 2.010954616588419e-07, "logits/chosen": -4.196875095367432, "logits/rejected": -4.03125, "logps/chosen": -569.7999877929688, "logps/rejected": -640.7999877929688, "loss": 3.3697, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 8.440625190734863, "rewards/margins": 7.165625095367432, "rewards/rejected": 1.27001953125, "step": 1360 }, { "epoch": 0.6431924882629108, "grad_norm": 414.74835013532675, "learning_rate": 1.9848721961398017e-07, "logits/chosen": -4.209374904632568, "logits/rejected": -4.139062404632568, "logps/chosen": -578.4000244140625, "logps/rejected": -629.5999755859375, "loss": 3.3279, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 9.503125190734863, "rewards/margins": 8.378125190734863, "rewards/rejected": 1.1222655773162842, "step": 1370 }, { "epoch": 0.647887323943662, "grad_norm": 283.32788354346286, "learning_rate": 1.958789775691184e-07, "logits/chosen": -4.165625095367432, "logits/rejected": -4.026562690734863, "logps/chosen": -563.5999755859375, "logps/rejected": -648.4000244140625, "loss": 3.8283, "rewards/accuracies": 0.51953125, "rewards/chosen": 8.606249809265137, "rewards/margins": 8.185937881469727, "rewards/rejected": 0.4126953184604645, "step": 1380 }, { "epoch": 0.6525821596244131, "grad_norm": 231.57307044550026, "learning_rate": 1.9327073552425662e-07, "logits/chosen": -4.15625, "logits/rejected": -4.050000190734863, "logps/chosen": -555.5999755859375, "logps/rejected": -622.2000122070312, "loss": 3.011, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 9.303125381469727, "rewards/margins": 9.512499809265137, "rewards/rejected": -0.21181640028953552, "step": 1390 }, { "epoch": 0.6572769953051644, "grad_norm": 245.7010459859998, "learning_rate": 1.906624934793949e-07, "logits/chosen": -4.159375190734863, "logits/rejected": -4.084374904632568, "logps/chosen": -575.0, "logps/rejected": -676.0, "loss": 3.4527, "rewards/accuracies": 0.538281261920929, "rewards/chosen": 9.899999618530273, "rewards/margins": 9.184374809265137, "rewards/rejected": 0.704882800579071, "step": 1400 }, { "epoch": 0.6619718309859155, "grad_norm": 292.70988132691616, "learning_rate": 1.8805425143453312e-07, "logits/chosen": -4.193749904632568, "logits/rejected": -3.9703125953674316, "logps/chosen": -553.5999755859375, "logps/rejected": -707.5999755859375, "loss": 3.2431, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 8.978124618530273, "rewards/margins": 7.946875095367432, "rewards/rejected": 1.0339844226837158, "step": 1410 }, { "epoch": 0.6666666666666666, "grad_norm": 289.38318966753934, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -4.167187690734863, "logits/rejected": -4.0625, "logps/chosen": -571.7999877929688, "logps/rejected": -632.2000122070312, "loss": 3.3539, "rewards/accuracies": 0.515625, "rewards/chosen": 8.84375, "rewards/margins": 7.934374809265137, "rewards/rejected": 0.91796875, "step": 1420 }, { "epoch": 0.6713615023474179, "grad_norm": 222.43533121990208, "learning_rate": 1.828377673448096e-07, "logits/chosen": -4.150000095367432, "logits/rejected": -4.068749904632568, "logps/chosen": -607.0, "logps/rejected": -652.7999877929688, "loss": 3.7237, "rewards/accuracies": 0.5234375, "rewards/chosen": 8.765625, "rewards/margins": 8.543749809265137, "rewards/rejected": 0.21718749403953552, "step": 1430 }, { "epoch": 0.676056338028169, "grad_norm": 216.86086222140142, "learning_rate": 1.8022952529994783e-07, "logits/chosen": -4.184374809265137, "logits/rejected": -4.057812690734863, "logps/chosen": -583.7999877929688, "logps/rejected": -644.4000244140625, "loss": 2.99, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 8.912500381469727, "rewards/margins": 9.4921875, "rewards/rejected": -0.573046863079071, "step": 1440 }, { "epoch": 0.6807511737089202, "grad_norm": 4657.860857885418, "learning_rate": 1.7762128325508608e-07, "logits/chosen": -4.199999809265137, "logits/rejected": -4.095312595367432, "logps/chosen": -592.7999877929688, "logps/rejected": -685.2000122070312, "loss": 3.8062, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 9.371874809265137, "rewards/margins": 8.243749618530273, "rewards/rejected": 1.1334960460662842, "step": 1450 }, { "epoch": 0.6854460093896714, "grad_norm": 304.75248514169795, "learning_rate": 1.750130412102243e-07, "logits/chosen": -4.087500095367432, "logits/rejected": -4.010937690734863, "logps/chosen": -622.4000244140625, "logps/rejected": -668.4000244140625, "loss": 3.8537, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 9.203125, "rewards/margins": 8.846875190734863, "rewards/rejected": 0.35283201932907104, "step": 1460 }, { "epoch": 0.6901408450704225, "grad_norm": 250.6636699927517, "learning_rate": 1.7240479916536254e-07, "logits/chosen": -4.084374904632568, "logits/rejected": -4.014062404632568, "logps/chosen": -561.4000244140625, "logps/rejected": -684.0, "loss": 3.4143, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 8.928125381469727, "rewards/margins": 8.459375381469727, "rewards/rejected": 0.4691406190395355, "step": 1470 }, { "epoch": 0.6948356807511737, "grad_norm": 248.50954893198326, "learning_rate": 1.697965571205008e-07, "logits/chosen": -4.212500095367432, "logits/rejected": -4.114062309265137, "logps/chosen": -569.5999755859375, "logps/rejected": -625.0, "loss": 3.9249, "rewards/accuracies": 0.5335937738418579, "rewards/chosen": 8.978124618530273, "rewards/margins": 7.28125, "rewards/rejected": 1.696679711341858, "step": 1480 }, { "epoch": 0.6995305164319249, "grad_norm": 259.50616073958525, "learning_rate": 1.6718831507563902e-07, "logits/chosen": -4.181250095367432, "logits/rejected": -4.120312690734863, "logps/chosen": -576.4000244140625, "logps/rejected": -615.4000244140625, "loss": 3.0162, "rewards/accuracies": 0.5, "rewards/chosen": 9.290624618530273, "rewards/margins": 6.721875190734863, "rewards/rejected": 2.567578077316284, "step": 1490 }, { "epoch": 0.704225352112676, "grad_norm": 237.4618737278568, "learning_rate": 1.6458007303077727e-07, "logits/chosen": -4.15625, "logits/rejected": -4.071875095367432, "logps/chosen": -564.0, "logps/rejected": -628.4000244140625, "loss": 3.7618, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 8.762499809265137, "rewards/margins": 7.009375095367432, "rewards/rejected": 1.7527344226837158, "step": 1500 }, { "epoch": 0.7089201877934272, "grad_norm": 304.6021709201427, "learning_rate": 1.619718309859155e-07, "logits/chosen": -4.176562309265137, "logits/rejected": -4.03125, "logps/chosen": -583.4000244140625, "logps/rejected": -673.5999755859375, "loss": 3.2058, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 9.234375, "rewards/margins": 8.543749809265137, "rewards/rejected": 0.6929687261581421, "step": 1510 }, { "epoch": 0.7136150234741784, "grad_norm": 288.23056226017525, "learning_rate": 1.5936358894105372e-07, "logits/chosen": -4.196875095367432, "logits/rejected": -4.078125, "logps/chosen": -584.5999755859375, "logps/rejected": -664.0, "loss": 3.2153, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 9.600000381469727, "rewards/margins": 8.106249809265137, "rewards/rejected": 1.49609375, "step": 1520 }, { "epoch": 0.7183098591549296, "grad_norm": 263.20042200814987, "learning_rate": 1.5675534689619197e-07, "logits/chosen": -4.184374809265137, "logits/rejected": -4.112500190734863, "logps/chosen": -583.5999755859375, "logps/rejected": -611.5999755859375, "loss": 3.699, "rewards/accuracies": 0.51171875, "rewards/chosen": 8.365625381469727, "rewards/margins": 6.4921875, "rewards/rejected": 1.866796851158142, "step": 1530 }, { "epoch": 0.7230046948356808, "grad_norm": 256.34527394439255, "learning_rate": 1.541471048513302e-07, "logits/chosen": -4.1484375, "logits/rejected": -4.0625, "logps/chosen": -584.0, "logps/rejected": -654.4000244140625, "loss": 3.3438, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 9.071874618530273, "rewards/margins": 8.068750381469727, "rewards/rejected": 1.000878930091858, "step": 1540 }, { "epoch": 0.7276995305164319, "grad_norm": 247.38632157274768, "learning_rate": 1.5153886280646843e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.109375, "logps/chosen": -541.0, "logps/rejected": -602.5999755859375, "loss": 2.9387, "rewards/accuracies": 0.5453125238418579, "rewards/chosen": 9.828125, "rewards/margins": 8.837499618530273, "rewards/rejected": 0.986523449420929, "step": 1550 }, { "epoch": 0.7323943661971831, "grad_norm": 249.28526659037962, "learning_rate": 1.4893062076160668e-07, "logits/chosen": -4.184374809265137, "logits/rejected": -4.131249904632568, "logps/chosen": -572.7999877929688, "logps/rejected": -662.7999877929688, "loss": 2.9065, "rewards/accuracies": 0.547656238079071, "rewards/chosen": 10.112500190734863, "rewards/margins": 9.84375, "rewards/rejected": 0.25664061307907104, "step": 1560 }, { "epoch": 0.7370892018779343, "grad_norm": 220.7522120460472, "learning_rate": 1.463223787167449e-07, "logits/chosen": -4.1875, "logits/rejected": -4.081250190734863, "logps/chosen": -585.2000122070312, "logps/rejected": -642.0, "loss": 3.5233, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 8.915624618530273, "rewards/margins": 6.90625, "rewards/rejected": 1.998046875, "step": 1570 }, { "epoch": 0.7417840375586855, "grad_norm": 282.38518483309315, "learning_rate": 1.4371413667188313e-07, "logits/chosen": -4.215624809265137, "logits/rejected": -4.0703125, "logps/chosen": -568.4000244140625, "logps/rejected": -680.4000244140625, "loss": 3.2953, "rewards/accuracies": 0.5414062738418579, "rewards/chosen": 9.340624809265137, "rewards/margins": 9.096875190734863, "rewards/rejected": 0.23681640625, "step": 1580 }, { "epoch": 0.7464788732394366, "grad_norm": 266.0114406524382, "learning_rate": 1.4110589462702139e-07, "logits/chosen": -4.237500190734863, "logits/rejected": -4.134375095367432, "logps/chosen": -567.5999755859375, "logps/rejected": -626.2000122070312, "loss": 3.4592, "rewards/accuracies": 0.530468761920929, "rewards/chosen": 9.65625, "rewards/margins": 7.831250190734863, "rewards/rejected": 1.8234374523162842, "step": 1590 }, { "epoch": 0.7511737089201878, "grad_norm": 293.3811924171183, "learning_rate": 1.384976525821596e-07, "logits/chosen": -4.259375095367432, "logits/rejected": -4.076562404632568, "logps/chosen": -585.7999877929688, "logps/rejected": -671.5999755859375, "loss": 3.4814, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 9.199999809265137, "rewards/margins": 9.306249618530273, "rewards/rejected": -0.10209961235523224, "step": 1600 }, { "epoch": 0.755868544600939, "grad_norm": 299.3190882040113, "learning_rate": 1.3588941053729787e-07, "logits/chosen": -4.157812595367432, "logits/rejected": -4.082812309265137, "logps/chosen": -586.5999755859375, "logps/rejected": -672.2000122070312, "loss": 3.2615, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 9.368749618530273, "rewards/margins": 9.240625381469727, "rewards/rejected": 0.13740234076976776, "step": 1610 }, { "epoch": 0.7605633802816901, "grad_norm": 292.3874233512557, "learning_rate": 1.332811684924361e-07, "logits/chosen": -4.168749809265137, "logits/rejected": -4.068749904632568, "logps/chosen": -594.7999877929688, "logps/rejected": -650.0, "loss": 3.265, "rewards/accuracies": 0.484375, "rewards/chosen": 8.831250190734863, "rewards/margins": 7.465624809265137, "rewards/rejected": 1.368749976158142, "step": 1620 }, { "epoch": 0.7652582159624414, "grad_norm": 257.6034386681777, "learning_rate": 1.3067292644757432e-07, "logits/chosen": -4.090624809265137, "logits/rejected": -4.003125190734863, "logps/chosen": -597.2000122070312, "logps/rejected": -692.7999877929688, "loss": 3.3718, "rewards/accuracies": 0.5234375, "rewards/chosen": 8.643750190734863, "rewards/margins": 8.181249618530273, "rewards/rejected": 0.46630859375, "step": 1630 }, { "epoch": 0.7699530516431925, "grad_norm": 351.04743008203644, "learning_rate": 1.2806468440271257e-07, "logits/chosen": -4.171875, "logits/rejected": -4.056250095367432, "logps/chosen": -560.2000122070312, "logps/rejected": -632.5999755859375, "loss": 2.8387, "rewards/accuracies": 0.52734375, "rewards/chosen": 9.603124618530273, "rewards/margins": 8.171875, "rewards/rejected": 1.4375, "step": 1640 }, { "epoch": 0.7746478873239436, "grad_norm": 297.5853232790701, "learning_rate": 1.254564423578508e-07, "logits/chosen": -4.178124904632568, "logits/rejected": -4.059374809265137, "logps/chosen": -567.4000244140625, "logps/rejected": -644.7999877929688, "loss": 3.0734, "rewards/accuracies": 0.5171874761581421, "rewards/chosen": 9.106249809265137, "rewards/margins": 8.315625190734863, "rewards/rejected": 0.797656238079071, "step": 1650 }, { "epoch": 0.7793427230046949, "grad_norm": 258.1808056870632, "learning_rate": 1.2284820031298902e-07, "logits/chosen": -4.165625095367432, "logits/rejected": -4.040625095367432, "logps/chosen": -606.4000244140625, "logps/rejected": -671.4000244140625, "loss": 3.1496, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 9.106249809265137, "rewards/margins": 9.184374809265137, "rewards/rejected": -0.07441405951976776, "step": 1660 }, { "epoch": 0.784037558685446, "grad_norm": 315.1161588076266, "learning_rate": 1.2023995826812728e-07, "logits/chosen": -4.170312404632568, "logits/rejected": -4.03125, "logps/chosen": -569.5999755859375, "logps/rejected": -659.2000122070312, "loss": 3.6683, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 8.653124809265137, "rewards/margins": 8.026562690734863, "rewards/rejected": 0.627685546875, "step": 1670 }, { "epoch": 0.7887323943661971, "grad_norm": 339.45568377970034, "learning_rate": 1.176317162232655e-07, "logits/chosen": -4.175000190734863, "logits/rejected": -4.090624809265137, "logps/chosen": -565.0, "logps/rejected": -670.2000122070312, "loss": 3.4659, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 9.118749618530273, "rewards/margins": 8.050000190734863, "rewards/rejected": 1.072265625, "step": 1680 }, { "epoch": 0.7934272300469484, "grad_norm": 309.46428057578476, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -4.143750190734863, "logits/rejected": -4.042187690734863, "logps/chosen": -572.5999755859375, "logps/rejected": -650.7999877929688, "loss": 3.3681, "rewards/accuracies": 0.526562511920929, "rewards/chosen": 9.600000381469727, "rewards/margins": 8.931249618530273, "rewards/rejected": 0.663281261920929, "step": 1690 }, { "epoch": 0.7981220657276995, "grad_norm": 276.7849077382129, "learning_rate": 1.1241523213354198e-07, "logits/chosen": -4.203125, "logits/rejected": -4.059374809265137, "logps/chosen": -574.4000244140625, "logps/rejected": -672.7999877929688, "loss": 3.2875, "rewards/accuracies": 0.539843738079071, "rewards/chosen": 9.787500381469727, "rewards/margins": 9.084375381469727, "rewards/rejected": 0.699414074420929, "step": 1700 }, { "epoch": 0.8028169014084507, "grad_norm": 286.84592205572187, "learning_rate": 1.0980699008868022e-07, "logits/chosen": -4.162499904632568, "logits/rejected": -4.0703125, "logps/chosen": -586.4000244140625, "logps/rejected": -637.2000122070312, "loss": 3.9705, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 9.065625190734863, "rewards/margins": 7.614062309265137, "rewards/rejected": 1.4560546875, "step": 1710 }, { "epoch": 0.8075117370892019, "grad_norm": 304.19812578369937, "learning_rate": 1.0719874804381846e-07, "logits/chosen": -4.154687404632568, "logits/rejected": -4.0859375, "logps/chosen": -568.4000244140625, "logps/rejected": -603.4000244140625, "loss": 2.9644, "rewards/accuracies": 0.5023437738418579, "rewards/chosen": 9.259374618530273, "rewards/margins": 6.884375095367432, "rewards/rejected": 2.369335889816284, "step": 1720 }, { "epoch": 0.812206572769953, "grad_norm": 277.7578193166386, "learning_rate": 1.045905059989567e-07, "logits/chosen": -4.209374904632568, "logits/rejected": -4.151562690734863, "logps/chosen": -542.7999877929688, "logps/rejected": -594.5999755859375, "loss": 3.0375, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 9.524999618530273, "rewards/margins": 7.068749904632568, "rewards/rejected": 2.4613280296325684, "step": 1730 }, { "epoch": 0.8169014084507042, "grad_norm": 300.59459600097534, "learning_rate": 1.0198226395409494e-07, "logits/chosen": -4.159375190734863, "logits/rejected": -4.029687404632568, "logps/chosen": -551.4000244140625, "logps/rejected": -619.4000244140625, "loss": 3.3825, "rewards/accuracies": 0.5210937261581421, "rewards/chosen": 8.734375, "rewards/margins": 7.996874809265137, "rewards/rejected": 0.744140625, "step": 1740 }, { "epoch": 0.8215962441314554, "grad_norm": 242.278341686424, "learning_rate": 9.937402190923318e-08, "logits/chosen": -4.128125190734863, "logits/rejected": -4.021874904632568, "logps/chosen": -541.2000122070312, "logps/rejected": -658.7999877929688, "loss": 3.2527, "rewards/accuracies": 0.49296873807907104, "rewards/chosen": 8.490625381469727, "rewards/margins": 8.984375, "rewards/rejected": -0.4996093809604645, "step": 1750 }, { "epoch": 0.8262910798122066, "grad_norm": 264.74847983452094, "learning_rate": 9.676577986437141e-08, "logits/chosen": -4.1171875, "logits/rejected": -3.981250047683716, "logps/chosen": -601.5999755859375, "logps/rejected": -652.4000244140625, "loss": 3.3588, "rewards/accuracies": 0.5140625238418579, "rewards/chosen": 8.646875381469727, "rewards/margins": 8.09375, "rewards/rejected": 0.553759753704071, "step": 1760 }, { "epoch": 0.8309859154929577, "grad_norm": 252.43600024697975, "learning_rate": 9.415753781950965e-08, "logits/chosen": -4.193749904632568, "logits/rejected": -4.0859375, "logps/chosen": -554.0, "logps/rejected": -640.0, "loss": 2.8763, "rewards/accuracies": 0.546093761920929, "rewards/chosen": 9.568750381469727, "rewards/margins": 9.568750381469727, "rewards/rejected": 0.013867187313735485, "step": 1770 }, { "epoch": 0.8356807511737089, "grad_norm": 253.5525987973917, "learning_rate": 9.154929577464789e-08, "logits/chosen": -4.181250095367432, "logits/rejected": -4.043749809265137, "logps/chosen": -572.7999877929688, "logps/rejected": -644.0, "loss": 3.2664, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 8.684374809265137, "rewards/margins": 8.540624618530273, "rewards/rejected": 0.15009765326976776, "step": 1780 }, { "epoch": 0.8403755868544601, "grad_norm": 250.62939288036958, "learning_rate": 8.894105372978613e-08, "logits/chosen": -4.140625, "logits/rejected": -4.053124904632568, "logps/chosen": -590.7999877929688, "logps/rejected": -653.0, "loss": 3.257, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 9.178125381469727, "rewards/margins": 9.496874809265137, "rewards/rejected": -0.32539063692092896, "step": 1790 }, { "epoch": 0.8450704225352113, "grad_norm": 276.0547109194086, "learning_rate": 8.633281168492435e-08, "logits/chosen": -4.193749904632568, "logits/rejected": -4.125, "logps/chosen": -569.2000122070312, "logps/rejected": -633.4000244140625, "loss": 3.2604, "rewards/accuracies": 0.546875, "rewards/chosen": 8.996874809265137, "rewards/margins": 8.640625, "rewards/rejected": 0.34589844942092896, "step": 1800 }, { "epoch": 0.8497652582159625, "grad_norm": 315.86906591823214, "learning_rate": 8.372456964006259e-08, "logits/chosen": -4.120312690734863, "logits/rejected": -4.009375095367432, "logps/chosen": -564.7999877929688, "logps/rejected": -642.0, "loss": 3.4094, "rewards/accuracies": 0.538281261920929, "rewards/chosen": 9.053125381469727, "rewards/margins": 8.796875, "rewards/rejected": 0.25703126192092896, "step": 1810 }, { "epoch": 0.8544600938967136, "grad_norm": 299.3337185435239, "learning_rate": 8.111632759520083e-08, "logits/chosen": -4.123437404632568, "logits/rejected": -4.045312404632568, "logps/chosen": -613.7999877929688, "logps/rejected": -664.4000244140625, "loss": 3.5758, "rewards/accuracies": 0.5289062261581421, "rewards/chosen": 8.653124809265137, "rewards/margins": 8.795312881469727, "rewards/rejected": -0.14658203721046448, "step": 1820 }, { "epoch": 0.8591549295774648, "grad_norm": 305.90788242817615, "learning_rate": 7.850808555033907e-08, "logits/chosen": -4.15625, "logits/rejected": -4.009375095367432, "logps/chosen": -573.2000122070312, "logps/rejected": -666.7999877929688, "loss": 3.1492, "rewards/accuracies": 0.526562511920929, "rewards/chosen": 9.293749809265137, "rewards/margins": 9.759374618530273, "rewards/rejected": -0.47089844942092896, "step": 1830 }, { "epoch": 0.863849765258216, "grad_norm": 283.60957647674786, "learning_rate": 7.58998435054773e-08, "logits/chosen": -4.159375190734863, "logits/rejected": -4.012499809265137, "logps/chosen": -592.0, "logps/rejected": -664.4000244140625, "loss": 3.4984, "rewards/accuracies": 0.5015624761581421, "rewards/chosen": 9.246874809265137, "rewards/margins": 7.703125, "rewards/rejected": 1.5480468273162842, "step": 1840 }, { "epoch": 0.8685446009389671, "grad_norm": 264.6730900596976, "learning_rate": 7.329160146061554e-08, "logits/chosen": -4.206250190734863, "logits/rejected": -4.190625190734863, "logps/chosen": -561.5999755859375, "logps/rejected": -650.0, "loss": 2.9436, "rewards/accuracies": 0.522656261920929, "rewards/chosen": 9.556249618530273, "rewards/margins": 8.175000190734863, "rewards/rejected": 1.376953125, "step": 1850 }, { "epoch": 0.8732394366197183, "grad_norm": 315.0386051657591, "learning_rate": 7.068335941575378e-08, "logits/chosen": -4.125, "logits/rejected": -4.079687595367432, "logps/chosen": -563.2000122070312, "logps/rejected": -622.2000122070312, "loss": 3.8326, "rewards/accuracies": 0.5132812261581421, "rewards/chosen": 8.432812690734863, "rewards/margins": 8.271875381469727, "rewards/rejected": 0.1669921875, "step": 1860 }, { "epoch": 0.8779342723004695, "grad_norm": 347.82291763141683, "learning_rate": 6.807511737089202e-08, "logits/chosen": -4.240624904632568, "logits/rejected": -4.107812404632568, "logps/chosen": -559.5999755859375, "logps/rejected": -644.7999877929688, "loss": 3.2637, "rewards/accuracies": 0.508593738079071, "rewards/chosen": 9.253125190734863, "rewards/margins": 8.503125190734863, "rewards/rejected": 0.744335949420929, "step": 1870 }, { "epoch": 0.8826291079812206, "grad_norm": 370.90610454742574, "learning_rate": 6.546687532603024e-08, "logits/chosen": -4.181250095367432, "logits/rejected": -4.057812690734863, "logps/chosen": -566.5999755859375, "logps/rejected": -654.4000244140625, "loss": 3.2342, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 8.434374809265137, "rewards/margins": 7.290625095367432, "rewards/rejected": 1.1453125476837158, "step": 1880 }, { "epoch": 0.8873239436619719, "grad_norm": 240.62246133414592, "learning_rate": 6.285863328116848e-08, "logits/chosen": -4.1875, "logits/rejected": -4.074999809265137, "logps/chosen": -521.0, "logps/rejected": -667.5999755859375, "loss": 3.2679, "rewards/accuracies": 0.5234375, "rewards/chosen": 8.834375381469727, "rewards/margins": 8.953125, "rewards/rejected": -0.13417968153953552, "step": 1890 }, { "epoch": 0.892018779342723, "grad_norm": 269.3744968930516, "learning_rate": 6.025039123630672e-08, "logits/chosen": -4.15625, "logits/rejected": -4.073437690734863, "logps/chosen": -584.4000244140625, "logps/rejected": -622.4000244140625, "loss": 3.6584, "rewards/accuracies": 0.516406238079071, "rewards/chosen": 9.231249809265137, "rewards/margins": 8.609375, "rewards/rejected": 0.62744140625, "step": 1900 }, { "epoch": 0.8967136150234741, "grad_norm": 286.7976408001139, "learning_rate": 5.764214919144496e-08, "logits/chosen": -4.184374809265137, "logits/rejected": -4.106249809265137, "logps/chosen": -588.7999877929688, "logps/rejected": -616.5999755859375, "loss": 3.2276, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": 8.837499618530273, "rewards/margins": 6.567187309265137, "rewards/rejected": 2.2730469703674316, "step": 1910 }, { "epoch": 0.9014084507042254, "grad_norm": 350.40438971050554, "learning_rate": 5.50339071465832e-08, "logits/chosen": -4.173437595367432, "logits/rejected": -4.115624904632568, "logps/chosen": -580.2000122070312, "logps/rejected": -604.0, "loss": 3.4644, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 8.759374618530273, "rewards/margins": 7.09375, "rewards/rejected": 1.672216773033142, "step": 1920 }, { "epoch": 0.9061032863849765, "grad_norm": 324.283448378128, "learning_rate": 5.2425665101721436e-08, "logits/chosen": -4.135937690734863, "logits/rejected": -3.984375, "logps/chosen": -590.5999755859375, "logps/rejected": -693.2000122070312, "loss": 3.1065, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 8.921875, "rewards/margins": 8.193750381469727, "rewards/rejected": 0.7367187738418579, "step": 1930 }, { "epoch": 0.9107981220657277, "grad_norm": 273.11521435297766, "learning_rate": 4.9817423056859675e-08, "logits/chosen": -4.193749904632568, "logits/rejected": -4.067187309265137, "logps/chosen": -586.0, "logps/rejected": -665.2000122070312, "loss": 3.2885, "rewards/accuracies": 0.514843761920929, "rewards/chosen": 9.646875381469727, "rewards/margins": 9.609375, "rewards/rejected": 0.03242187574505806, "step": 1940 }, { "epoch": 0.9154929577464789, "grad_norm": 272.6089128679087, "learning_rate": 4.720918101199791e-08, "logits/chosen": -4.089062690734863, "logits/rejected": -3.942187547683716, "logps/chosen": -591.5999755859375, "logps/rejected": -718.4000244140625, "loss": 3.6842, "rewards/accuracies": 0.49531251192092896, "rewards/chosen": 8.921875, "rewards/margins": 9.946874618530273, "rewards/rejected": -1.0234375, "step": 1950 }, { "epoch": 0.92018779342723, "grad_norm": 243.55218528138548, "learning_rate": 4.460093896713615e-08, "logits/chosen": -4.217187404632568, "logits/rejected": -4.056250095367432, "logps/chosen": -571.5999755859375, "logps/rejected": -671.5999755859375, "loss": 3.64, "rewards/accuracies": 0.5101562738418579, "rewards/chosen": 9.643750190734863, "rewards/margins": 8.115625381469727, "rewards/rejected": 1.5261719226837158, "step": 1960 }, { "epoch": 0.9248826291079812, "grad_norm": 262.26311139456277, "learning_rate": 4.199269692227438e-08, "logits/chosen": -4.181250095367432, "logits/rejected": -4.112500190734863, "logps/chosen": -562.5999755859375, "logps/rejected": -610.7999877929688, "loss": 3.6316, "rewards/accuracies": 0.4898437559604645, "rewards/chosen": 8.578125, "rewards/margins": 7.546875, "rewards/rejected": 1.0242187976837158, "step": 1970 }, { "epoch": 0.9295774647887324, "grad_norm": 301.82634449968526, "learning_rate": 3.938445487741262e-08, "logits/chosen": -4.112500190734863, "logits/rejected": -3.9546875953674316, "logps/chosen": -588.7999877929688, "logps/rejected": -642.7999877929688, "loss": 3.1398, "rewards/accuracies": 0.500781238079071, "rewards/chosen": 8.509374618530273, "rewards/margins": 7.971875190734863, "rewards/rejected": 0.547656238079071, "step": 1980 }, { "epoch": 0.9342723004694836, "grad_norm": 256.70883534671344, "learning_rate": 3.677621283255086e-08, "logits/chosen": -4.206250190734863, "logits/rejected": -4.120312690734863, "logps/chosen": -596.0, "logps/rejected": -644.0, "loss": 3.2795, "rewards/accuracies": 0.5328124761581421, "rewards/chosen": 9.756250381469727, "rewards/margins": 8.909375190734863, "rewards/rejected": 0.83203125, "step": 1990 }, { "epoch": 0.9389671361502347, "grad_norm": 282.63915129606966, "learning_rate": 3.41679707876891e-08, "logits/chosen": -4.154687404632568, "logits/rejected": -4.0234375, "logps/chosen": -578.2000122070312, "logps/rejected": -702.4000244140625, "loss": 3.399, "rewards/accuracies": 0.52734375, "rewards/chosen": 8.784375190734863, "rewards/margins": 9.846875190734863, "rewards/rejected": -1.062890648841858, "step": 2000 }, { "epoch": 0.9436619718309859, "grad_norm": 281.2956650232833, "learning_rate": 3.155972874282733e-08, "logits/chosen": -4.215624809265137, "logits/rejected": -4.079687595367432, "logps/chosen": -555.5999755859375, "logps/rejected": -639.5999755859375, "loss": 3.2535, "rewards/accuracies": 0.535937488079071, "rewards/chosen": 9.4375, "rewards/margins": 8.964062690734863, "rewards/rejected": 0.468017578125, "step": 2010 }, { "epoch": 0.9483568075117371, "grad_norm": 281.4878426700789, "learning_rate": 2.8951486697965573e-08, "logits/chosen": -4.139062404632568, "logits/rejected": -4.048437595367432, "logps/chosen": -589.4000244140625, "logps/rejected": -659.5999755859375, "loss": 3.4492, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 9.112500190734863, "rewards/margins": 8.853124618530273, "rewards/rejected": 0.255859375, "step": 2020 }, { "epoch": 0.9530516431924883, "grad_norm": 296.87658522208113, "learning_rate": 2.634324465310381e-08, "logits/chosen": -4.185937404632568, "logits/rejected": -4.042187690734863, "logps/chosen": -564.2000122070312, "logps/rejected": -644.7999877929688, "loss": 3.4496, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 8.821874618530273, "rewards/margins": 8.653124809265137, "rewards/rejected": 0.16904297471046448, "step": 2030 }, { "epoch": 0.9577464788732394, "grad_norm": 252.64030204060066, "learning_rate": 2.3735002608242045e-08, "logits/chosen": -4.168749809265137, "logits/rejected": -4.056250095367432, "logps/chosen": -584.2000122070312, "logps/rejected": -617.2000122070312, "loss": 2.8264, "rewards/accuracies": 0.53515625, "rewards/chosen": 9.75, "rewards/margins": 8.509374618530273, "rewards/rejected": 1.2374999523162842, "step": 2040 }, { "epoch": 0.9624413145539906, "grad_norm": 274.4937616307003, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -4.181250095367432, "logits/rejected": -4.096875190734863, "logps/chosen": -569.7999877929688, "logps/rejected": -599.2000122070312, "loss": 3.5493, "rewards/accuracies": 0.510937511920929, "rewards/chosen": 8.875, "rewards/margins": 6.400000095367432, "rewards/rejected": 2.4742188453674316, "step": 2050 }, { "epoch": 0.9671361502347418, "grad_norm": 239.72102282540757, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -4.162499904632568, "logits/rejected": -4.0234375, "logps/chosen": -595.2000122070312, "logps/rejected": -648.7999877929688, "loss": 3.7638, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 8.953125, "rewards/margins": 7.721875190734863, "rewards/rejected": 1.220703125, "step": 2060 }, { "epoch": 0.971830985915493, "grad_norm": 266.3637050395917, "learning_rate": 1.5910276473656755e-08, "logits/chosen": -4.143750190734863, "logits/rejected": -3.9937500953674316, "logps/chosen": -570.2000122070312, "logps/rejected": -643.5999755859375, "loss": 2.7844, "rewards/accuracies": 0.520312488079071, "rewards/chosen": 9.15625, "rewards/margins": 9.082812309265137, "rewards/rejected": 0.07871093600988388, "step": 2070 }, { "epoch": 0.9765258215962441, "grad_norm": 244.26802244450215, "learning_rate": 1.3302034428794991e-08, "logits/chosen": -4.215624809265137, "logits/rejected": -4.153124809265137, "logps/chosen": -549.5999755859375, "logps/rejected": -610.4000244140625, "loss": 3.382, "rewards/accuracies": 0.5179687738418579, "rewards/chosen": 8.71875, "rewards/margins": 8.443750381469727, "rewards/rejected": 0.2803710997104645, "step": 2080 }, { "epoch": 0.9812206572769953, "grad_norm": 273.08298980869785, "learning_rate": 1.0693792383933229e-08, "logits/chosen": -4.212500095367432, "logits/rejected": -4.15625, "logps/chosen": -582.0, "logps/rejected": -658.4000244140625, "loss": 3.4493, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 9.546875, "rewards/margins": 8.981249809265137, "rewards/rejected": 0.56201171875, "step": 2090 }, { "epoch": 0.9859154929577465, "grad_norm": 247.97793524019707, "learning_rate": 8.085550339071465e-09, "logits/chosen": -4.159375190734863, "logits/rejected": -4.059374809265137, "logps/chosen": -597.0, "logps/rejected": -688.5999755859375, "loss": 3.6896, "rewards/accuracies": 0.52734375, "rewards/chosen": 8.565625190734863, "rewards/margins": 8.681249618530273, "rewards/rejected": -0.12148437649011612, "step": 2100 }, { "epoch": 0.9906103286384976, "grad_norm": 355.78181353320775, "learning_rate": 5.4773082942097025e-09, "logits/chosen": -4.178124904632568, "logits/rejected": -4.104687690734863, "logps/chosen": -565.4000244140625, "logps/rejected": -622.2000122070312, "loss": 3.1718, "rewards/accuracies": 0.5234375, "rewards/chosen": 9.175000190734863, "rewards/margins": 8.899999618530273, "rewards/rejected": 0.26640623807907104, "step": 2110 }, { "epoch": 0.9953051643192489, "grad_norm": 302.36578049371593, "learning_rate": 2.8690662493479393e-09, "logits/chosen": -4.231249809265137, "logits/rejected": -4.035937309265137, "logps/chosen": -568.0, "logps/rejected": -635.2000122070312, "loss": 3.9098, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 9.043749809265137, "rewards/margins": 7.417578220367432, "rewards/rejected": 1.6320312023162842, "step": 2120 }, { "epoch": 1.0, "grad_norm": 246.1305241577918, "learning_rate": 2.608242044861763e-10, "logits/chosen": -4.165625095367432, "logits/rejected": -4.1171875, "logps/chosen": -566.0, "logps/rejected": -658.0, "loss": 3.576, "rewards/accuracies": 0.5049871206283569, "rewards/chosen": 9.1875, "rewards/margins": 9.4375, "rewards/rejected": -0.25664061307907104, "step": 2130 }, { "epoch": 1.0, "step": 2130, "total_flos": 0.0, "train_loss": 3.2928951639524646, "train_runtime": 18574.6633, "train_samples_per_second": 14.675, "train_steps_per_second": 0.115 } ], "logging_steps": 10, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }