{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004694835680751174, "grad_norm": 520.142591451342, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -2.4390625953674316, "logits/rejected": -2.137500047683716, "logps/chosen": -696.7999877929688, "logps/rejected": -814.0, "loss": 1.997, "rewards/accuracies": 0.09296874701976776, "rewards/chosen": -0.338623046875, "rewards/margins": -0.05322265625, "rewards/rejected": -0.2852539122104645, "step": 10 }, { "epoch": 0.009389671361502348, "grad_norm": 718.6985853103332, "learning_rate": 4.460093896713615e-08, "logits/chosen": -2.362499952316284, "logits/rejected": -2.2249999046325684, "logps/chosen": -732.4000244140625, "logps/rejected": -813.2000122070312, "loss": 3.6606, "rewards/accuracies": 0.23906250298023224, "rewards/chosen": 0.20595702528953552, "rewards/margins": 0.1640625, "rewards/rejected": 0.04326171800494194, "step": 20 }, { "epoch": 0.014084507042253521, "grad_norm": 1552.2132968157528, "learning_rate": 6.807511737089202e-08, "logits/chosen": -2.2523436546325684, "logits/rejected": -2.0, "logps/chosen": -778.7999877929688, "logps/rejected": -908.7999877929688, "loss": 4.2634, "rewards/accuracies": 0.26171875, "rewards/chosen": -0.35332030057907104, "rewards/margins": -0.07529296725988388, "rewards/rejected": -0.2796874940395355, "step": 30 }, { "epoch": 0.018779342723004695, "grad_norm": 8657.18713257464, "learning_rate": 9.154929577464789e-08, "logits/chosen": -2.339062452316284, "logits/rejected": -2.137500047683716, "logps/chosen": -770.4000244140625, "logps/rejected": -930.7999877929688, "loss": 3.8689, "rewards/accuracies": 0.28515625, "rewards/chosen": 0.4447265565395355, "rewards/margins": 0.9095703363418579, "rewards/rejected": -0.46601563692092896, "step": 40 }, { "epoch": 0.023474178403755867, "grad_norm": 565.9183355419589, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -2.268749952316284, "logits/rejected": -2.1328125, "logps/chosen": -753.2000122070312, "logps/rejected": -833.5999755859375, "loss": 4.4475, "rewards/accuracies": 0.2718749940395355, "rewards/chosen": -0.08486328274011612, "rewards/margins": -0.04443359375, "rewards/rejected": -0.04072265699505806, "step": 50 }, { "epoch": 0.028169014084507043, "grad_norm": 697.3566349630747, "learning_rate": 1.384976525821596e-07, "logits/chosen": -2.2906250953674316, "logits/rejected": -2.1039061546325684, "logps/chosen": -779.5999755859375, "logps/rejected": -908.4000244140625, "loss": 4.4893, "rewards/accuracies": 0.26171875, "rewards/chosen": -0.203125, "rewards/margins": -0.654125988483429, "rewards/rejected": 0.4522460997104645, "step": 60 }, { "epoch": 0.03286384976525822, "grad_norm": 10766.834300261338, "learning_rate": 1.619718309859155e-07, "logits/chosen": -2.1617188453674316, "logits/rejected": -1.9695312976837158, "logps/chosen": -745.5999755859375, "logps/rejected": -816.7999877929688, "loss": 4.8299, "rewards/accuracies": 0.2867187559604645, "rewards/chosen": 0.15312500298023224, "rewards/margins": -0.07851562649011612, "rewards/rejected": 0.23310546576976776, "step": 70 }, { "epoch": 0.03755868544600939, "grad_norm": 505.2891052099527, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -2.3765625953674316, "logits/rejected": -2.1953125, "logps/chosen": -740.7999877929688, "logps/rejected": -854.7999877929688, "loss": 4.1491, "rewards/accuracies": 0.2718749940395355, "rewards/chosen": 0.2137451171875, "rewards/margins": 0.07695312798023224, "rewards/rejected": 0.13845214247703552, "step": 80 }, { "epoch": 0.04225352112676056, "grad_norm": 668.2554945973214, "learning_rate": 2.089201877934272e-07, "logits/chosen": -2.3890624046325684, "logits/rejected": -2.2265625, "logps/chosen": -718.7999877929688, "logps/rejected": -816.4000244140625, "loss": 3.908, "rewards/accuracies": 0.2578125, "rewards/chosen": 0.4122680723667145, "rewards/margins": -0.19423827528953552, "rewards/rejected": 0.6078125238418579, "step": 90 }, { "epoch": 0.046948356807511735, "grad_norm": 776.6929187550726, "learning_rate": 2.323943661971831e-07, "logits/chosen": -2.307812452316284, "logits/rejected": -2.211718797683716, "logps/chosen": -764.4000244140625, "logps/rejected": -798.4000244140625, "loss": 3.9806, "rewards/accuracies": 0.2632812559604645, "rewards/chosen": -0.03876953199505806, "rewards/margins": -0.2451171875, "rewards/rejected": 0.20556640625, "step": 100 }, { "epoch": 0.051643192488262914, "grad_norm": 717.5365230161353, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -2.4203124046325684, "logits/rejected": -2.2671875953674316, "logps/chosen": -742.0, "logps/rejected": -870.4000244140625, "loss": 4.6294, "rewards/accuracies": 0.27812498807907104, "rewards/chosen": 0.15576171875, "rewards/margins": -0.35624998807907104, "rewards/rejected": 0.5127929449081421, "step": 110 }, { "epoch": 0.056338028169014086, "grad_norm": 2977.9502504703755, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -2.2554688453674316, "logits/rejected": -2.1578125953674316, "logps/chosen": -768.0, "logps/rejected": -862.4000244140625, "loss": 4.0131, "rewards/accuracies": 0.28125, "rewards/chosen": 0.18720702826976776, "rewards/margins": -0.06640625, "rewards/rejected": 0.25371092557907104, "step": 120 }, { "epoch": 0.06103286384976526, "grad_norm": 2509.124943319802, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -2.235156297683716, "logits/rejected": -2.1156249046325684, "logps/chosen": -781.2000122070312, "logps/rejected": -900.7999877929688, "loss": 6.0719, "rewards/accuracies": 0.27421873807907104, "rewards/chosen": -0.826171875, "rewards/margins": -1.742456078529358, "rewards/rejected": 0.917285144329071, "step": 130 }, { "epoch": 0.06572769953051644, "grad_norm": 690.7196142418027, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -2.234375, "logits/rejected": -2.11328125, "logps/chosen": -766.7999877929688, "logps/rejected": -898.0, "loss": 5.058, "rewards/accuracies": 0.25703126192092896, "rewards/chosen": -0.03217773512005806, "rewards/margins": -0.32109373807907104, "rewards/rejected": 0.2876953184604645, "step": 140 }, { "epoch": 0.07042253521126761, "grad_norm": 669.3896973423242, "learning_rate": 3.497652582159624e-07, "logits/chosen": -2.305468797683716, "logits/rejected": -2.282031297683716, "logps/chosen": -746.7999877929688, "logps/rejected": -861.5999755859375, "loss": 3.5748, "rewards/accuracies": 0.2757812440395355, "rewards/chosen": 0.6036132574081421, "rewards/margins": 0.8902343511581421, "rewards/rejected": -0.28557127714157104, "step": 150 }, { "epoch": 0.07511737089201878, "grad_norm": 721.4120567673536, "learning_rate": 3.732394366197183e-07, "logits/chosen": -2.2750000953674316, "logits/rejected": -2.120312452316284, "logps/chosen": -770.7999877929688, "logps/rejected": -874.4000244140625, "loss": 4.8792, "rewards/accuracies": 0.29296875, "rewards/chosen": -0.11386718600988388, "rewards/margins": 0.2789062559604645, "rewards/rejected": -0.38984376192092896, "step": 160 }, { "epoch": 0.07981220657276995, "grad_norm": 1753.633484672201, "learning_rate": 3.967136150234742e-07, "logits/chosen": -2.2515625953674316, "logits/rejected": -2.065624952316284, "logps/chosen": -728.7999877929688, "logps/rejected": -867.5999755859375, "loss": 5.735, "rewards/accuracies": 0.29609376192092896, "rewards/chosen": 0.771484375, "rewards/margins": -0.755175769329071, "rewards/rejected": 1.525048851966858, "step": 170 }, { "epoch": 0.08450704225352113, "grad_norm": 778.6604227743157, "learning_rate": 4.2018779342723e-07, "logits/chosen": -2.2890625, "logits/rejected": -2.090625047683716, "logps/chosen": -700.0, "logps/rejected": -869.2000122070312, "loss": 3.5314, "rewards/accuracies": 0.2757812440395355, "rewards/chosen": 0.5699707269668579, "rewards/margins": 0.905102550983429, "rewards/rejected": -0.3370117247104645, "step": 180 }, { "epoch": 0.0892018779342723, "grad_norm": 742.4443494266026, "learning_rate": 4.436619718309859e-07, "logits/chosen": -2.21875, "logits/rejected": -2.149218797683716, "logps/chosen": -752.0, "logps/rejected": -814.4000244140625, "loss": 4.7805, "rewards/accuracies": 0.28828126192092896, "rewards/chosen": 0.2789062559604645, "rewards/margins": -0.45097655057907104, "rewards/rejected": 0.729296863079071, "step": 190 }, { "epoch": 0.09389671361502347, "grad_norm": 542.7728573196438, "learning_rate": 4.671361502347418e-07, "logits/chosen": -2.2554688453674316, "logits/rejected": -2.0648436546325684, "logps/chosen": -729.5999755859375, "logps/rejected": -824.4000244140625, "loss": 3.6234, "rewards/accuracies": 0.27812498807907104, "rewards/chosen": 0.767382800579071, "rewards/margins": 1.3642578125, "rewards/rejected": -0.5965331792831421, "step": 200 }, { "epoch": 0.09859154929577464, "grad_norm": 1461.4933077149913, "learning_rate": 4.906103286384976e-07, "logits/chosen": -2.2671875953674316, "logits/rejected": -2.1195311546325684, "logps/chosen": -748.7999877929688, "logps/rejected": -891.5999755859375, "loss": 3.7303, "rewards/accuracies": 0.2757812440395355, "rewards/chosen": 0.4224853515625, "rewards/margins": 0.23105469346046448, "rewards/rejected": 0.19394531846046448, "step": 210 }, { "epoch": 0.10328638497652583, "grad_norm": 756.737522385797, "learning_rate": 4.984350547730829e-07, "logits/chosen": -2.3921875953674316, "logits/rejected": -2.1703124046325684, "logps/chosen": -779.2000122070312, "logps/rejected": -852.7999877929688, "loss": 4.1088, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": 0.364013671875, "rewards/margins": 0.3695312440395355, "rewards/rejected": -0.00791015662252903, "step": 220 }, { "epoch": 0.107981220657277, "grad_norm": 2242.980863525319, "learning_rate": 4.958268127282212e-07, "logits/chosen": -2.2867188453674316, "logits/rejected": -2.1851563453674316, "logps/chosen": -755.2000122070312, "logps/rejected": -878.7999877929688, "loss": 3.9552, "rewards/accuracies": 0.3062500059604645, "rewards/chosen": 1.0666015148162842, "rewards/margins": 2.1900391578674316, "rewards/rejected": -1.1203124523162842, "step": 230 }, { "epoch": 0.11267605633802817, "grad_norm": 455.8500781083609, "learning_rate": 4.932185706833594e-07, "logits/chosen": -2.2164063453674316, "logits/rejected": -2.168750047683716, "logps/chosen": -693.5999755859375, "logps/rejected": -736.4000244140625, "loss": 4.3035, "rewards/accuracies": 0.29609376192092896, "rewards/chosen": 0.21464844048023224, "rewards/margins": 0.4082275331020355, "rewards/rejected": -0.19091796875, "step": 240 }, { "epoch": 0.11737089201877934, "grad_norm": 9707.277894991463, "learning_rate": 4.906103286384976e-07, "logits/chosen": -2.317187547683716, "logits/rejected": -2.1796875, "logps/chosen": -742.4000244140625, "logps/rejected": -878.0, "loss": 5.258, "rewards/accuracies": 0.28593748807907104, "rewards/chosen": -1.098413109779358, "rewards/margins": -0.5367676019668579, "rewards/rejected": -0.5621093511581421, "step": 250 }, { "epoch": 0.12206572769953052, "grad_norm": 636.3943955298223, "learning_rate": 4.880020865936358e-07, "logits/chosen": -2.2265625, "logits/rejected": -2.200000047683716, "logps/chosen": -700.7999877929688, "logps/rejected": -814.4000244140625, "loss": 4.1654, "rewards/accuracies": 0.296875, "rewards/chosen": 0.925488293170929, "rewards/margins": 0.37675780057907104, "rewards/rejected": 0.548291027545929, "step": 260 }, { "epoch": 0.1267605633802817, "grad_norm": 628.6083377328596, "learning_rate": 4.853938445487741e-07, "logits/chosen": -2.1937499046325684, "logits/rejected": -1.8914062976837158, "logps/chosen": -742.4000244140625, "logps/rejected": -864.7999877929688, "loss": 4.3146, "rewards/accuracies": 0.3046875, "rewards/chosen": 0.3232421875, "rewards/margins": 0.791210949420929, "rewards/rejected": -0.46650391817092896, "step": 270 }, { "epoch": 0.13145539906103287, "grad_norm": 518.911626956217, "learning_rate": 4.827856025039123e-07, "logits/chosen": -2.3062500953674316, "logits/rejected": -2.207812547683716, "logps/chosen": -752.0, "logps/rejected": -876.7999877929688, "loss": 4.545, "rewards/accuracies": 0.3031249940395355, "rewards/chosen": 1.2041015625, "rewards/margins": 1.7902343273162842, "rewards/rejected": -0.5816406011581421, "step": 280 }, { "epoch": 0.13615023474178403, "grad_norm": 466.3217272893134, "learning_rate": 4.801773604590506e-07, "logits/chosen": -2.221874952316284, "logits/rejected": -2.1812500953674316, "logps/chosen": -797.2000122070312, "logps/rejected": -874.7999877929688, "loss": 3.4628, "rewards/accuracies": 0.2992187440395355, "rewards/chosen": 0.5323241949081421, "rewards/margins": 1.7703125476837158, "rewards/rejected": -1.240966796875, "step": 290 }, { "epoch": 0.14084507042253522, "grad_norm": 1099.1881265876816, "learning_rate": 4.775691184141888e-07, "logits/chosen": -2.3578124046325684, "logits/rejected": -2.2203125953674316, "logps/chosen": -760.4000244140625, "logps/rejected": -860.0, "loss": 4.6967, "rewards/accuracies": 0.2945312559604645, "rewards/chosen": 0.25957030057907104, "rewards/margins": 3.1707520484924316, "rewards/rejected": -2.9110350608825684, "step": 300 }, { "epoch": 0.14553990610328638, "grad_norm": 967.9292035390623, "learning_rate": 4.749608763693271e-07, "logits/chosen": -2.3109374046325684, "logits/rejected": -2.1226563453674316, "logps/chosen": -725.2000122070312, "logps/rejected": -836.4000244140625, "loss": 3.5246, "rewards/accuracies": 0.3148437440395355, "rewards/chosen": 1.117578148841858, "rewards/margins": 1.4765625, "rewards/rejected": -0.36152344942092896, "step": 310 }, { "epoch": 0.15023474178403756, "grad_norm": 798.8789598637936, "learning_rate": 4.7235263432446533e-07, "logits/chosen": -2.3187499046325684, "logits/rejected": -2.12890625, "logps/chosen": -739.5999755859375, "logps/rejected": -856.7999877929688, "loss": 3.8969, "rewards/accuracies": 0.3179687559604645, "rewards/chosen": 0.737060546875, "rewards/margins": 2.250781297683716, "rewards/rejected": -1.516015648841858, "step": 320 }, { "epoch": 0.15492957746478872, "grad_norm": 718.0201290330298, "learning_rate": 4.6974439227960353e-07, "logits/chosen": -2.28125, "logits/rejected": -2.059375047683716, "logps/chosen": -770.0, "logps/rejected": -887.2000122070312, "loss": 3.9144, "rewards/accuracies": 0.30546873807907104, "rewards/chosen": 1.103124976158142, "rewards/margins": 1.5705077648162842, "rewards/rejected": -0.46699219942092896, "step": 330 }, { "epoch": 0.1596244131455399, "grad_norm": 519.8108487600896, "learning_rate": 4.671361502347418e-07, "logits/chosen": -2.3984375, "logits/rejected": -2.2796874046325684, "logps/chosen": -720.7999877929688, "logps/rejected": -777.2000122070312, "loss": 5.8592, "rewards/accuracies": 0.30859375, "rewards/chosen": 0.3306640684604645, "rewards/margins": -0.9193359613418579, "rewards/rejected": 1.249609351158142, "step": 340 }, { "epoch": 0.1643192488262911, "grad_norm": 619.4060279377454, "learning_rate": 4.6452790818988004e-07, "logits/chosen": -2.3335938453674316, "logits/rejected": -2.171093702316284, "logps/chosen": -773.2000122070312, "logps/rejected": -869.5999755859375, "loss": 3.5465, "rewards/accuracies": 0.3148437440395355, "rewards/chosen": 0.9961913824081421, "rewards/margins": 1.6785156726837158, "rewards/rejected": -0.6844726800918579, "step": 350 }, { "epoch": 0.16901408450704225, "grad_norm": 10870.44571613044, "learning_rate": 4.6191966614501824e-07, "logits/chosen": -2.31640625, "logits/rejected": -2.110156297683716, "logps/chosen": -731.2000122070312, "logps/rejected": -841.5999755859375, "loss": 6.2137, "rewards/accuracies": 0.32343751192092896, "rewards/chosen": 0.8446289300918579, "rewards/margins": 0.35224610567092896, "rewards/rejected": 0.49506837129592896, "step": 360 }, { "epoch": 0.17370892018779344, "grad_norm": 6196.223136121422, "learning_rate": 4.593114241001565e-07, "logits/chosen": -2.2249999046325684, "logits/rejected": -2.133593797683716, "logps/chosen": -795.2000122070312, "logps/rejected": -871.2000122070312, "loss": 4.3763, "rewards/accuracies": 0.30937498807907104, "rewards/chosen": 0.8076171875, "rewards/margins": 2.6605467796325684, "rewards/rejected": -1.8560059070587158, "step": 370 }, { "epoch": 0.1784037558685446, "grad_norm": 956.6319306542155, "learning_rate": 4.5670318205529474e-07, "logits/chosen": -2.2015624046325684, "logits/rejected": -2.020312547683716, "logps/chosen": -750.7999877929688, "logps/rejected": -871.5999755859375, "loss": 3.8167, "rewards/accuracies": 0.3304687440395355, "rewards/chosen": 1.7366943359375, "rewards/margins": 3.37890625, "rewards/rejected": -1.638818383216858, "step": 380 }, { "epoch": 0.18309859154929578, "grad_norm": 8776.661255480909, "learning_rate": 4.54094940010433e-07, "logits/chosen": -2.2578125, "logits/rejected": -2.11328125, "logps/chosen": -763.5999755859375, "logps/rejected": -902.4000244140625, "loss": 4.024, "rewards/accuracies": 0.3125, "rewards/chosen": 0.804882824420929, "rewards/margins": 2.1529297828674316, "rewards/rejected": -1.345312476158142, "step": 390 }, { "epoch": 0.18779342723004694, "grad_norm": 719.0136384818551, "learning_rate": 4.514866979655712e-07, "logits/chosen": -2.3140625953674316, "logits/rejected": -2.1226563453674316, "logps/chosen": -755.5999755859375, "logps/rejected": -854.0, "loss": 3.6524, "rewards/accuracies": 0.3304687440395355, "rewards/chosen": 1.326562523841858, "rewards/margins": 5.921875, "rewards/rejected": -4.58935546875, "step": 400 }, { "epoch": 0.19248826291079812, "grad_norm": 642.6361751965826, "learning_rate": 4.4887845592070945e-07, "logits/chosen": -2.2421875, "logits/rejected": -2.06640625, "logps/chosen": -722.4000244140625, "logps/rejected": -879.2000122070312, "loss": 3.4391, "rewards/accuracies": 0.3382812440395355, "rewards/chosen": 1.3244140148162842, "rewards/margins": 2.5455079078674316, "rewards/rejected": -1.2185547351837158, "step": 410 }, { "epoch": 0.19718309859154928, "grad_norm": 497.8577583794182, "learning_rate": 4.462702138758477e-07, "logits/chosen": -2.20703125, "logits/rejected": -2.0960936546325684, "logps/chosen": -778.4000244140625, "logps/rejected": -868.0, "loss": 2.8947, "rewards/accuracies": 0.34062498807907104, "rewards/chosen": 1.898046851158142, "rewards/margins": 5.805468559265137, "rewards/rejected": -3.9066405296325684, "step": 420 }, { "epoch": 0.20187793427230047, "grad_norm": 890.313950643353, "learning_rate": 4.436619718309859e-07, "logits/chosen": -2.2109375, "logits/rejected": -1.9265625476837158, "logps/chosen": -750.0, "logps/rejected": -900.7999877929688, "loss": 3.8911, "rewards/accuracies": 0.3515625, "rewards/chosen": 1.587011694908142, "rewards/margins": 2.849804639816284, "rewards/rejected": -1.2580077648162842, "step": 430 }, { "epoch": 0.20657276995305165, "grad_norm": 2009.4647327356392, "learning_rate": 4.4105372978612415e-07, "logits/chosen": -2.3203125, "logits/rejected": -2.2328124046325684, "logps/chosen": -769.2000122070312, "logps/rejected": -882.4000244140625, "loss": 4.6993, "rewards/accuracies": 0.33281248807907104, "rewards/chosen": 0.58251953125, "rewards/margins": 1.8867676258087158, "rewards/rejected": -1.3054687976837158, "step": 440 }, { "epoch": 0.2112676056338028, "grad_norm": 989.3531885201104, "learning_rate": 4.384454877412624e-07, "logits/chosen": -2.38671875, "logits/rejected": -2.067187547683716, "logps/chosen": -738.0, "logps/rejected": -975.2000122070312, "loss": 3.7969, "rewards/accuracies": 0.3414062559604645, "rewards/chosen": 1.134667992591858, "rewards/margins": 3.767578125, "rewards/rejected": -2.6279296875, "step": 450 }, { "epoch": 0.215962441314554, "grad_norm": 545.8369253651695, "learning_rate": 4.358372456964006e-07, "logits/chosen": -2.260937452316284, "logits/rejected": -2.1070313453674316, "logps/chosen": -742.0, "logps/rejected": -873.5999755859375, "loss": 3.7957, "rewards/accuracies": 0.3453125059604645, "rewards/chosen": 1.5535156726837158, "rewards/margins": 3.942187547683716, "rewards/rejected": -2.3919677734375, "step": 460 }, { "epoch": 0.22065727699530516, "grad_norm": 527.7080770261233, "learning_rate": 4.3322900365153886e-07, "logits/chosen": -2.370312452316284, "logits/rejected": -2.196093797683716, "logps/chosen": -799.2000122070312, "logps/rejected": -863.2000122070312, "loss": 4.7648, "rewards/accuracies": 0.3265624940395355, "rewards/chosen": 0.34296876192092896, "rewards/margins": 2.1605467796325684, "rewards/rejected": -1.819580078125, "step": 470 }, { "epoch": 0.22535211267605634, "grad_norm": 406.09742225980256, "learning_rate": 4.306207616066771e-07, "logits/chosen": -2.359375, "logits/rejected": -2.190624952316284, "logps/chosen": -735.2000122070312, "logps/rejected": -839.5999755859375, "loss": 4.1495, "rewards/accuracies": 0.33515626192092896, "rewards/chosen": 1.804296851158142, "rewards/margins": 2.5394530296325684, "rewards/rejected": -0.736889660358429, "step": 480 }, { "epoch": 0.2300469483568075, "grad_norm": 5490.36357087384, "learning_rate": 4.280125195618153e-07, "logits/chosen": -2.325000047683716, "logits/rejected": -2.061718702316284, "logps/chosen": -743.5999755859375, "logps/rejected": -869.5999755859375, "loss": 3.8629, "rewards/accuracies": 0.3382812440395355, "rewards/chosen": 0.9203125238418579, "rewards/margins": 2.908886671066284, "rewards/rejected": -1.987890601158142, "step": 490 }, { "epoch": 0.2347417840375587, "grad_norm": 997.4028144857181, "learning_rate": 4.2540427751695357e-07, "logits/chosen": -2.292187452316284, "logits/rejected": -2.1773438453674316, "logps/chosen": -766.7999877929688, "logps/rejected": -850.7999877929688, "loss": 4.0899, "rewards/accuracies": 0.33515626192092896, "rewards/chosen": 1.191796898841858, "rewards/margins": 2.816601514816284, "rewards/rejected": -1.6278808116912842, "step": 500 }, { "epoch": 0.23943661971830985, "grad_norm": 561.7245894831859, "learning_rate": 4.227960354720918e-07, "logits/chosen": -2.3539061546325684, "logits/rejected": -2.186718702316284, "logps/chosen": -741.2000122070312, "logps/rejected": -897.5999755859375, "loss": 3.5953, "rewards/accuracies": 0.3765625059604645, "rewards/chosen": 2.7421875, "rewards/margins": 4.625, "rewards/rejected": -1.880468726158142, "step": 510 }, { "epoch": 0.24413145539906103, "grad_norm": 499.4831548868703, "learning_rate": 4.2018779342723e-07, "logits/chosen": -2.2593750953674316, "logits/rejected": -2.258593797683716, "logps/chosen": -762.7999877929688, "logps/rejected": -837.2000122070312, "loss": 3.7502, "rewards/accuracies": 0.3492187559604645, "rewards/chosen": 1.6145508289337158, "rewards/margins": 3.1888670921325684, "rewards/rejected": -1.576171875, "step": 520 }, { "epoch": 0.24882629107981222, "grad_norm": 455.056965245539, "learning_rate": 4.1757955138236827e-07, "logits/chosen": -2.319531202316284, "logits/rejected": -2.0843749046325684, "logps/chosen": -731.5999755859375, "logps/rejected": -871.5999755859375, "loss": 3.7579, "rewards/accuracies": 0.3609375059604645, "rewards/chosen": 1.85546875, "rewards/margins": 4.625, "rewards/rejected": -2.7668700218200684, "step": 530 }, { "epoch": 0.2535211267605634, "grad_norm": 446.0832378408931, "learning_rate": 4.149713093375065e-07, "logits/chosen": -2.375, "logits/rejected": -2.121875047683716, "logps/chosen": -734.7999877929688, "logps/rejected": -792.0, "loss": 3.6603, "rewards/accuracies": 0.3453125059604645, "rewards/chosen": 1.8771483898162842, "rewards/margins": 3.235156297683716, "rewards/rejected": -1.357031226158142, "step": 540 }, { "epoch": 0.25821596244131456, "grad_norm": 313.2168391651203, "learning_rate": 4.123630672926447e-07, "logits/chosen": -2.362499952316284, "logits/rejected": -2.19140625, "logps/chosen": -717.2000122070312, "logps/rejected": -854.4000244140625, "loss": 3.8799, "rewards/accuracies": 0.37421876192092896, "rewards/chosen": 2.354687452316284, "rewards/margins": 5.006249904632568, "rewards/rejected": -2.6578125953674316, "step": 550 }, { "epoch": 0.26291079812206575, "grad_norm": 380.2190323980337, "learning_rate": 4.09754825247783e-07, "logits/chosen": -2.284374952316284, "logits/rejected": -2.0414061546325684, "logps/chosen": -749.2000122070312, "logps/rejected": -858.0, "loss": 3.701, "rewards/accuracies": 0.3617187440395355, "rewards/chosen": 2.235156297683716, "rewards/margins": 4.295312404632568, "rewards/rejected": -2.0572266578674316, "step": 560 }, { "epoch": 0.2676056338028169, "grad_norm": 414.74954225225156, "learning_rate": 4.0714658320292123e-07, "logits/chosen": -2.448437452316284, "logits/rejected": -2.260937452316284, "logps/chosen": -726.0, "logps/rejected": -850.4000244140625, "loss": 3.8754, "rewards/accuracies": 0.37968748807907104, "rewards/chosen": 1.817968726158142, "rewards/margins": 5.8125, "rewards/rejected": -3.9976563453674316, "step": 570 }, { "epoch": 0.27230046948356806, "grad_norm": 556.6860073670566, "learning_rate": 4.045383411580595e-07, "logits/chosen": -2.3218750953674316, "logits/rejected": -2.110156297683716, "logps/chosen": -725.2000122070312, "logps/rejected": -862.0, "loss": 4.1742, "rewards/accuracies": 0.3656249940395355, "rewards/chosen": 2.0453124046325684, "rewards/margins": 4.602734565734863, "rewards/rejected": -2.5546875, "step": 580 }, { "epoch": 0.27699530516431925, "grad_norm": 514.6027456989814, "learning_rate": 4.019300991131977e-07, "logits/chosen": -2.190624952316284, "logits/rejected": -2.106250047683716, "logps/chosen": -732.7999877929688, "logps/rejected": -848.7999877929688, "loss": 3.3683, "rewards/accuracies": 0.37890625, "rewards/chosen": 2.628124952316284, "rewards/margins": 4.798437595367432, "rewards/rejected": -2.1703124046325684, "step": 590 }, { "epoch": 0.28169014084507044, "grad_norm": 346.10997799515474, "learning_rate": 3.9932185706833594e-07, "logits/chosen": -2.315624952316284, "logits/rejected": -2.0875000953674316, "logps/chosen": -760.0, "logps/rejected": -837.2000122070312, "loss": 3.4386, "rewards/accuracies": 0.37031251192092896, "rewards/chosen": 2.828906297683716, "rewards/margins": 5.220703125, "rewards/rejected": -2.400097608566284, "step": 600 }, { "epoch": 0.2863849765258216, "grad_norm": 499.7878762613954, "learning_rate": 3.967136150234742e-07, "logits/chosen": -2.225781202316284, "logits/rejected": -2.104687452316284, "logps/chosen": -771.5999755859375, "logps/rejected": -854.0, "loss": 4.0253, "rewards/accuracies": 0.38203126192092896, "rewards/chosen": 1.9755859375, "rewards/margins": 4.127343654632568, "rewards/rejected": -2.1502928733825684, "step": 610 }, { "epoch": 0.29107981220657275, "grad_norm": 443.15171808264427, "learning_rate": 3.941053729786124e-07, "logits/chosen": -2.253124952316284, "logits/rejected": -2.0484375953674316, "logps/chosen": -768.7999877929688, "logps/rejected": -872.7999877929688, "loss": 3.6086, "rewards/accuracies": 0.3984375, "rewards/chosen": 2.6519532203674316, "rewards/margins": 6.217187404632568, "rewards/rejected": -3.5660157203674316, "step": 620 }, { "epoch": 0.29577464788732394, "grad_norm": 848.1849647479538, "learning_rate": 3.9149713093375064e-07, "logits/chosen": -2.229687452316284, "logits/rejected": -2.05078125, "logps/chosen": -748.4000244140625, "logps/rejected": -893.5999755859375, "loss": 3.7016, "rewards/accuracies": 0.37968748807907104, "rewards/chosen": 2.3148436546325684, "rewards/margins": 6.423437595367432, "rewards/rejected": -4.104687690734863, "step": 630 }, { "epoch": 0.3004694835680751, "grad_norm": 1575.8528024836098, "learning_rate": 3.888888888888889e-07, "logits/chosen": -2.2953124046325684, "logits/rejected": -2.086718797683716, "logps/chosen": -732.0, "logps/rejected": -898.0, "loss": 4.2902, "rewards/accuracies": 0.3929687440395355, "rewards/chosen": 2.0843749046325684, "rewards/margins": 6.378125190734863, "rewards/rejected": -4.292578220367432, "step": 640 }, { "epoch": 0.3051643192488263, "grad_norm": 537.2576474944893, "learning_rate": 3.862806468440271e-07, "logits/chosen": -2.430468797683716, "logits/rejected": -2.3203125, "logps/chosen": -684.7999877929688, "logps/rejected": -791.5999755859375, "loss": 3.7996, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 1.9363281726837158, "rewards/margins": 4.251562595367432, "rewards/rejected": -2.314648389816284, "step": 650 }, { "epoch": 0.30985915492957744, "grad_norm": 3342.812121912648, "learning_rate": 3.8367240479916535e-07, "logits/chosen": -2.3046875, "logits/rejected": -2.260937452316284, "logps/chosen": -733.2000122070312, "logps/rejected": -809.5999755859375, "loss": 3.9199, "rewards/accuracies": 0.38359373807907104, "rewards/chosen": 2.831249952316284, "rewards/margins": 4.091406345367432, "rewards/rejected": -1.263085961341858, "step": 660 }, { "epoch": 0.3145539906103286, "grad_norm": 464.3621079865818, "learning_rate": 3.810641627543036e-07, "logits/chosen": -2.4375, "logits/rejected": -2.171093702316284, "logps/chosen": -734.4000244140625, "logps/rejected": -854.4000244140625, "loss": 3.6537, "rewards/accuracies": 0.39453125, "rewards/chosen": 2.4437499046325684, "rewards/margins": 6.0390625, "rewards/rejected": -3.594531297683716, "step": 670 }, { "epoch": 0.3192488262910798, "grad_norm": 433.46451540511623, "learning_rate": 3.784559207094418e-07, "logits/chosen": -2.339062452316284, "logits/rejected": -2.167187452316284, "logps/chosen": -758.4000244140625, "logps/rejected": -833.5999755859375, "loss": 4.2747, "rewards/accuracies": 0.38203126192092896, "rewards/chosen": 2.720703125, "rewards/margins": 4.618750095367432, "rewards/rejected": -1.903173804283142, "step": 680 }, { "epoch": 0.323943661971831, "grad_norm": 361.46426282793567, "learning_rate": 3.7584767866458005e-07, "logits/chosen": -2.2890625, "logits/rejected": -2.128124952316284, "logps/chosen": -749.2000122070312, "logps/rejected": -830.0, "loss": 4.0208, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 2.555859327316284, "rewards/margins": 5.395703315734863, "rewards/rejected": -2.8382811546325684, "step": 690 }, { "epoch": 0.3286384976525822, "grad_norm": 485.60510363341695, "learning_rate": 3.732394366197183e-07, "logits/chosen": -2.342968702316284, "logits/rejected": -2.120312452316284, "logps/chosen": -780.7999877929688, "logps/rejected": -820.7999877929688, "loss": 4.662, "rewards/accuracies": 0.40703123807907104, "rewards/chosen": 2.749682664871216, "rewards/margins": 5.974999904632568, "rewards/rejected": -3.227343797683716, "step": 700 }, { "epoch": 0.3333333333333333, "grad_norm": 361.06030358227264, "learning_rate": 3.706311945748565e-07, "logits/chosen": -2.3265624046325684, "logits/rejected": -2.0859375, "logps/chosen": -796.7999877929688, "logps/rejected": -838.4000244140625, "loss": 3.8436, "rewards/accuracies": 0.39921873807907104, "rewards/chosen": 2.5707030296325684, "rewards/margins": 5.237109184265137, "rewards/rejected": -2.668652296066284, "step": 710 }, { "epoch": 0.3380281690140845, "grad_norm": 805.4099665248119, "learning_rate": 3.6802295252999476e-07, "logits/chosen": -2.25, "logits/rejected": -1.954687476158142, "logps/chosen": -771.5999755859375, "logps/rejected": -909.5999755859375, "loss": 4.5502, "rewards/accuracies": 0.40546876192092896, "rewards/chosen": 1.332421898841858, "rewards/margins": 5.862500190734863, "rewards/rejected": -4.526123046875, "step": 720 }, { "epoch": 0.3427230046948357, "grad_norm": 836.6120700173155, "learning_rate": 3.65414710485133e-07, "logits/chosen": -2.3046875, "logits/rejected": -2.2046875953674316, "logps/chosen": -765.5999755859375, "logps/rejected": -828.7999877929688, "loss": 4.277, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 1.676367163658142, "rewards/margins": 5.441601753234863, "rewards/rejected": -3.758007764816284, "step": 730 }, { "epoch": 0.3474178403755869, "grad_norm": 495.30095761214017, "learning_rate": 3.6280646844027127e-07, "logits/chosen": -2.2750000953674316, "logits/rejected": -2.1953125, "logps/chosen": -744.4000244140625, "logps/rejected": -912.4000244140625, "loss": 3.7158, "rewards/accuracies": 0.4039062559604645, "rewards/chosen": 2.78515625, "rewards/margins": 6.276562690734863, "rewards/rejected": -3.493945360183716, "step": 740 }, { "epoch": 0.352112676056338, "grad_norm": 505.136183106453, "learning_rate": 3.6019822639540947e-07, "logits/chosen": -2.2421875, "logits/rejected": -2.035937547683716, "logps/chosen": -778.7999877929688, "logps/rejected": -818.4000244140625, "loss": 5.3975, "rewards/accuracies": 0.40625, "rewards/chosen": 0.9281250238418579, "rewards/margins": 5.040625095367432, "rewards/rejected": -4.099218845367432, "step": 750 }, { "epoch": 0.3568075117370892, "grad_norm": 660.5281059191414, "learning_rate": 3.575899843505477e-07, "logits/chosen": -2.260937452316284, "logits/rejected": -2.120312452316284, "logps/chosen": -749.2000122070312, "logps/rejected": -799.5999755859375, "loss": 4.2791, "rewards/accuracies": 0.4085937440395355, "rewards/chosen": 2.5106444358825684, "rewards/margins": 5.396874904632568, "rewards/rejected": -2.874706983566284, "step": 760 }, { "epoch": 0.3615023474178404, "grad_norm": 625.8963095469089, "learning_rate": 3.5498174230568597e-07, "logits/chosen": -2.2328124046325684, "logits/rejected": -2.08984375, "logps/chosen": -735.2000122070312, "logps/rejected": -838.7999877929688, "loss": 3.6004, "rewards/accuracies": 0.36640626192092896, "rewards/chosen": 2.1490235328674316, "rewards/margins": 5.450781345367432, "rewards/rejected": -3.302734375, "step": 770 }, { "epoch": 0.36619718309859156, "grad_norm": 2236.100732457622, "learning_rate": 3.5237350026082417e-07, "logits/chosen": -2.3499999046325684, "logits/rejected": -2.1421875953674316, "logps/chosen": -751.2000122070312, "logps/rejected": -796.7999877929688, "loss": 4.5821, "rewards/accuracies": 0.39140623807907104, "rewards/chosen": 1.852148413658142, "rewards/margins": 4.368750095367432, "rewards/rejected": -2.520703077316284, "step": 780 }, { "epoch": 0.37089201877934275, "grad_norm": 10858.709315184346, "learning_rate": 3.497652582159624e-07, "logits/chosen": -2.305468797683716, "logits/rejected": -2.104687452316284, "logps/chosen": -743.2000122070312, "logps/rejected": -882.0, "loss": 4.5831, "rewards/accuracies": 0.40546876192092896, "rewards/chosen": 3.220703125, "rewards/margins": 5.546875, "rewards/rejected": -2.3187499046325684, "step": 790 }, { "epoch": 0.3755868544600939, "grad_norm": 521.9508389090934, "learning_rate": 3.471570161711007e-07, "logits/chosen": -2.2562499046325684, "logits/rejected": -2.039843797683716, "logps/chosen": -792.4000244140625, "logps/rejected": -906.7999877929688, "loss": 4.3751, "rewards/accuracies": 0.3929687440395355, "rewards/chosen": 2.5406250953674316, "rewards/margins": 7.356249809265137, "rewards/rejected": -4.815625190734863, "step": 800 }, { "epoch": 0.38028169014084506, "grad_norm": 10833.720938010409, "learning_rate": 3.445487741262389e-07, "logits/chosen": -2.3421874046325684, "logits/rejected": -2.176562547683716, "logps/chosen": -728.7999877929688, "logps/rejected": -824.4000244140625, "loss": 4.7887, "rewards/accuracies": 0.4085937440395355, "rewards/chosen": 2.8812499046325684, "rewards/margins": 6.7578125, "rewards/rejected": -3.8828125, "step": 810 }, { "epoch": 0.38497652582159625, "grad_norm": 486.2322033900268, "learning_rate": 3.4194053208137713e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.203125, "logps/chosen": -741.2000122070312, "logps/rejected": -824.4000244140625, "loss": 3.8345, "rewards/accuracies": 0.39921873807907104, "rewards/chosen": 2.8296875953674316, "rewards/margins": 6.026171684265137, "rewards/rejected": -3.198046922683716, "step": 820 }, { "epoch": 0.38967136150234744, "grad_norm": 549.5772895819349, "learning_rate": 3.393322900365154e-07, "logits/chosen": -2.284374952316284, "logits/rejected": -2.09375, "logps/chosen": -696.0, "logps/rejected": -800.4000244140625, "loss": 3.6164, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 2.98046875, "rewards/margins": 6.065625190734863, "rewards/rejected": -3.092578172683716, "step": 830 }, { "epoch": 0.39436619718309857, "grad_norm": 471.91907103487586, "learning_rate": 3.367240479916536e-07, "logits/chosen": -2.3304686546325684, "logits/rejected": -2.128124952316284, "logps/chosen": -785.5999755859375, "logps/rejected": -877.5999755859375, "loss": 4.2607, "rewards/accuracies": 0.4156250059604645, "rewards/chosen": 3.482617139816284, "rewards/margins": 6.278124809265137, "rewards/rejected": -2.8047852516174316, "step": 840 }, { "epoch": 0.39906103286384975, "grad_norm": 416.94331200444964, "learning_rate": 3.3411580594679184e-07, "logits/chosen": -2.340625047683716, "logits/rejected": -2.15625, "logps/chosen": -728.7999877929688, "logps/rejected": -836.0, "loss": 4.2115, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 2.883984327316284, "rewards/margins": 5.96484375, "rewards/rejected": -3.0833983421325684, "step": 850 }, { "epoch": 0.40375586854460094, "grad_norm": 410.68613216293846, "learning_rate": 3.315075639019301e-07, "logits/chosen": -2.2671875953674316, "logits/rejected": -2.159374952316284, "logps/chosen": -725.2000122070312, "logps/rejected": -842.0, "loss": 3.8122, "rewards/accuracies": 0.42656248807907104, "rewards/chosen": 3.440624952316284, "rewards/margins": 7.651562690734863, "rewards/rejected": -4.215624809265137, "step": 860 }, { "epoch": 0.4084507042253521, "grad_norm": 348.24347301593184, "learning_rate": 3.288993218570683e-07, "logits/chosen": -2.266406297683716, "logits/rejected": -2.034374952316284, "logps/chosen": -738.4000244140625, "logps/rejected": -829.5999755859375, "loss": 4.2795, "rewards/accuracies": 0.4164062440395355, "rewards/chosen": 2.887500047683716, "rewards/margins": 5.710156440734863, "rewards/rejected": -2.8257813453674316, "step": 870 }, { "epoch": 0.4131455399061033, "grad_norm": 462.9151988995226, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -2.32421875, "logits/rejected": -2.0875000953674316, "logps/chosen": -764.4000244140625, "logps/rejected": -850.4000244140625, "loss": 3.855, "rewards/accuracies": 0.4273437559604645, "rewards/chosen": 2.641796827316284, "rewards/margins": 7.3359375, "rewards/rejected": -4.700585842132568, "step": 880 }, { "epoch": 0.41784037558685444, "grad_norm": 633.2921618059665, "learning_rate": 3.236828377673448e-07, "logits/chosen": -2.2945313453674316, "logits/rejected": -2.19921875, "logps/chosen": -793.5999755859375, "logps/rejected": -836.0, "loss": 5.2967, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 2.6263670921325684, "rewards/margins": 6.115624904632568, "rewards/rejected": -3.4859375953674316, "step": 890 }, { "epoch": 0.4225352112676056, "grad_norm": 370.9120195694783, "learning_rate": 3.2107459572248305e-07, "logits/chosen": -2.280468702316284, "logits/rejected": -2.1578125953674316, "logps/chosen": -759.5999755859375, "logps/rejected": -836.7999877929688, "loss": 3.9937, "rewards/accuracies": 0.39921873807907104, "rewards/chosen": 2.9664063453674316, "rewards/margins": 6.659375190734863, "rewards/rejected": -3.6904296875, "step": 900 }, { "epoch": 0.4272300469483568, "grad_norm": 420.0254816718112, "learning_rate": 3.1846635367762125e-07, "logits/chosen": -2.19140625, "logits/rejected": -2.0648436546325684, "logps/chosen": -762.0, "logps/rejected": -865.2000122070312, "loss": 4.0513, "rewards/accuracies": 0.41328126192092896, "rewards/chosen": 2.8734374046325684, "rewards/margins": 7.330859184265137, "rewards/rejected": -4.443359375, "step": 910 }, { "epoch": 0.431924882629108, "grad_norm": 516.9297080870588, "learning_rate": 3.158581116327595e-07, "logits/chosen": -2.203906297683716, "logits/rejected": -2.1617188453674316, "logps/chosen": -769.2000122070312, "logps/rejected": -916.0, "loss": 4.3426, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 3.044140577316284, "rewards/margins": 7.994531154632568, "rewards/rejected": -4.952343940734863, "step": 920 }, { "epoch": 0.43661971830985913, "grad_norm": 309.58605122753823, "learning_rate": 3.1324986958789775e-07, "logits/chosen": -2.3031249046325684, "logits/rejected": -2.19140625, "logps/chosen": -708.0, "logps/rejected": -827.2000122070312, "loss": 3.6086, "rewards/accuracies": 0.4203124940395355, "rewards/chosen": 2.8515625, "rewards/margins": 7.5078125, "rewards/rejected": -4.663177490234375, "step": 930 }, { "epoch": 0.4413145539906103, "grad_norm": 618.6834777646845, "learning_rate": 3.1064162754303595e-07, "logits/chosen": -2.223437547683716, "logits/rejected": -2.096874952316284, "logps/chosen": -779.2000122070312, "logps/rejected": -881.2000122070312, "loss": 4.2693, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 2.9593749046325684, "rewards/margins": 7.512499809265137, "rewards/rejected": -4.546875, "step": 940 }, { "epoch": 0.4460093896713615, "grad_norm": 368.69666501852373, "learning_rate": 3.080333854981742e-07, "logits/chosen": -2.276562452316284, "logits/rejected": -2.1421875953674316, "logps/chosen": -751.2000122070312, "logps/rejected": -802.7999877929688, "loss": 4.4111, "rewards/accuracies": 0.4039062559604645, "rewards/chosen": 3.410937547683716, "rewards/margins": 4.789843559265137, "rewards/rejected": -1.383203148841858, "step": 950 }, { "epoch": 0.4507042253521127, "grad_norm": 746.5695982997314, "learning_rate": 3.0542514345331246e-07, "logits/chosen": -2.2640624046325684, "logits/rejected": -2.0609374046325684, "logps/chosen": -750.7999877929688, "logps/rejected": -913.5999755859375, "loss": 4.3995, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 3.2171874046325684, "rewards/margins": 7.142578125, "rewards/rejected": -3.922656297683716, "step": 960 }, { "epoch": 0.45539906103286387, "grad_norm": 760.3386454959006, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -2.1781249046325684, "logits/rejected": -2.125, "logps/chosen": -806.7999877929688, "logps/rejected": -877.5999755859375, "loss": 4.693, "rewards/accuracies": 0.42890626192092896, "rewards/chosen": 3.000781297683716, "rewards/margins": 5.631249904632568, "rewards/rejected": -2.630859375, "step": 970 }, { "epoch": 0.460093896713615, "grad_norm": 1085.1427603923041, "learning_rate": 3.002086593635889e-07, "logits/chosen": -2.3531250953674316, "logits/rejected": -2.1343750953674316, "logps/chosen": -760.4000244140625, "logps/rejected": -890.7999877929688, "loss": 4.008, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 3.5023436546325684, "rewards/margins": 8.532812118530273, "rewards/rejected": -5.0234375, "step": 980 }, { "epoch": 0.4647887323943662, "grad_norm": 573.0020487487674, "learning_rate": 2.9760041731872716e-07, "logits/chosen": -2.4312500953674316, "logits/rejected": -2.2718749046325684, "logps/chosen": -692.4000244140625, "logps/rejected": -826.7999877929688, "loss": 3.5181, "rewards/accuracies": 0.43046873807907104, "rewards/chosen": 3.774218797683716, "rewards/margins": 7.478125095367432, "rewards/rejected": -3.699023485183716, "step": 990 }, { "epoch": 0.4694835680751174, "grad_norm": 532.002820648586, "learning_rate": 2.9499217527386536e-07, "logits/chosen": -2.2906250953674316, "logits/rejected": -2.1890625953674316, "logps/chosen": -736.4000244140625, "logps/rejected": -849.2000122070312, "loss": 4.0951, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 2.7989745140075684, "rewards/margins": 5.766797065734863, "rewards/rejected": -2.973828077316284, "step": 1000 }, { "epoch": 0.47417840375586856, "grad_norm": 747.5917247217351, "learning_rate": 2.923839332290036e-07, "logits/chosen": -2.25390625, "logits/rejected": -2.061718702316284, "logps/chosen": -732.0, "logps/rejected": -876.0, "loss": 3.9824, "rewards/accuracies": 0.40703123807907104, "rewards/chosen": 2.9765625, "rewards/margins": 9.293749809265137, "rewards/rejected": -6.31884765625, "step": 1010 }, { "epoch": 0.4788732394366197, "grad_norm": 426.966215090325, "learning_rate": 2.8977569118414187e-07, "logits/chosen": -2.401562452316284, "logits/rejected": -2.25390625, "logps/chosen": -714.0, "logps/rejected": -848.7999877929688, "loss": 3.8824, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 2.970263719558716, "rewards/margins": 9.743749618530273, "rewards/rejected": -6.774316310882568, "step": 1020 }, { "epoch": 0.4835680751173709, "grad_norm": 365.5440615260703, "learning_rate": 2.8716744913928007e-07, "logits/chosen": -2.3062500953674316, "logits/rejected": -2.0921874046325684, "logps/chosen": -740.4000244140625, "logps/rejected": -818.0, "loss": 4.139, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.978710889816284, "rewards/margins": 6.871874809265137, "rewards/rejected": -3.887500047683716, "step": 1030 }, { "epoch": 0.48826291079812206, "grad_norm": 447.88692510897215, "learning_rate": 2.845592070944183e-07, "logits/chosen": -2.2953124046325684, "logits/rejected": -2.1304688453674316, "logps/chosen": -746.4000244140625, "logps/rejected": -881.5999755859375, "loss": 3.8399, "rewards/accuracies": 0.43828123807907104, "rewards/chosen": 4.042187690734863, "rewards/margins": 7.153124809265137, "rewards/rejected": -3.12255859375, "step": 1040 }, { "epoch": 0.49295774647887325, "grad_norm": 6937.32676845602, "learning_rate": 2.819509650495566e-07, "logits/chosen": -2.3499999046325684, "logits/rejected": -2.0999999046325684, "logps/chosen": -725.5999755859375, "logps/rejected": -826.4000244140625, "loss": 4.1211, "rewards/accuracies": 0.42890626192092896, "rewards/chosen": 3.339062452316284, "rewards/margins": 7.1875, "rewards/rejected": -3.845410108566284, "step": 1050 }, { "epoch": 0.49765258215962443, "grad_norm": 354.53169803091913, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -2.328125, "logits/rejected": -2.065624952316284, "logps/chosen": -773.5999755859375, "logps/rejected": -839.5999755859375, "loss": 3.6095, "rewards/accuracies": 0.44453126192092896, "rewards/chosen": 3.508593797683716, "rewards/margins": 9.635937690734863, "rewards/rejected": -6.143359184265137, "step": 1060 }, { "epoch": 0.5023474178403756, "grad_norm": 456.8381360686394, "learning_rate": 2.7673448095983303e-07, "logits/chosen": -2.2671875953674316, "logits/rejected": -2.109375, "logps/chosen": -742.0, "logps/rejected": -838.7999877929688, "loss": 4.201, "rewards/accuracies": 0.43671876192092896, "rewards/chosen": 3.332812547683716, "rewards/margins": 6.599023342132568, "rewards/rejected": -3.263964891433716, "step": 1070 }, { "epoch": 0.5070422535211268, "grad_norm": 418.36525536731284, "learning_rate": 2.741262389149713e-07, "logits/chosen": -2.2281250953674316, "logits/rejected": -1.998437523841858, "logps/chosen": -765.5999755859375, "logps/rejected": -866.0, "loss": 3.8887, "rewards/accuracies": 0.44921875, "rewards/chosen": 3.53125, "rewards/margins": 8.162500381469727, "rewards/rejected": -4.622656345367432, "step": 1080 }, { "epoch": 0.5117370892018779, "grad_norm": 2210.2364862666204, "learning_rate": 2.7151799687010953e-07, "logits/chosen": -2.2640624046325684, "logits/rejected": -1.997656226158142, "logps/chosen": -756.0, "logps/rejected": -870.0, "loss": 4.5425, "rewards/accuracies": 0.4242187440395355, "rewards/chosen": 2.4625000953674316, "rewards/margins": 7.379687309265137, "rewards/rejected": -4.918749809265137, "step": 1090 }, { "epoch": 0.5164319248826291, "grad_norm": 443.0955421475496, "learning_rate": 2.6890975482524773e-07, "logits/chosen": -2.246875047683716, "logits/rejected": -2.121875047683716, "logps/chosen": -792.4000244140625, "logps/rejected": -862.7999877929688, "loss": 4.1742, "rewards/accuracies": 0.41484373807907104, "rewards/chosen": 2.030078172683716, "rewards/margins": 7.120312690734863, "rewards/rejected": -5.096289157867432, "step": 1100 }, { "epoch": 0.5211267605633803, "grad_norm": 518.8473672890916, "learning_rate": 2.66301512780386e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.0015625953674316, "logps/chosen": -702.0, "logps/rejected": -856.4000244140625, "loss": 4.1225, "rewards/accuracies": 0.41796875, "rewards/chosen": 2.481640577316284, "rewards/margins": 9.381250381469727, "rewards/rejected": -6.911913871765137, "step": 1110 }, { "epoch": 0.5258215962441315, "grad_norm": 363.22129286758627, "learning_rate": 2.6369327073552424e-07, "logits/chosen": -2.26953125, "logits/rejected": -2.1773438453674316, "logps/chosen": -738.0, "logps/rejected": -804.7999877929688, "loss": 3.7051, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 3.3125, "rewards/margins": 6.903124809265137, "rewards/rejected": -3.5972657203674316, "step": 1120 }, { "epoch": 0.5305164319248826, "grad_norm": 536.6899668393712, "learning_rate": 2.6108502869066244e-07, "logits/chosen": -2.268749952316284, "logits/rejected": -2.147656202316284, "logps/chosen": -779.5999755859375, "logps/rejected": -889.2000122070312, "loss": 5.0237, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 2.6285157203674316, "rewards/margins": 9.725000381469727, "rewards/rejected": -7.099999904632568, "step": 1130 }, { "epoch": 0.5352112676056338, "grad_norm": 424.48547696453915, "learning_rate": 2.584767866458007e-07, "logits/chosen": -2.3359375, "logits/rejected": -2.114062547683716, "logps/chosen": -741.5999755859375, "logps/rejected": -851.2000122070312, "loss": 4.3225, "rewards/accuracies": 0.43828123807907104, "rewards/chosen": 3.164843797683716, "rewards/margins": 8.550000190734863, "rewards/rejected": -5.394384860992432, "step": 1140 }, { "epoch": 0.539906103286385, "grad_norm": 335.36613670796896, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -2.2421875, "logits/rejected": -2.1343750953674316, "logps/chosen": -756.4000244140625, "logps/rejected": -864.0, "loss": 3.6961, "rewards/accuracies": 0.43671876192092896, "rewards/chosen": 3.1851563453674316, "rewards/margins": 7.917187690734863, "rewards/rejected": -4.73046875, "step": 1150 }, { "epoch": 0.5446009389671361, "grad_norm": 423.8242216632141, "learning_rate": 2.5326030255607715e-07, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.198437452316284, "logps/chosen": -748.7999877929688, "logps/rejected": -840.4000244140625, "loss": 4.3461, "rewards/accuracies": 0.4242187440395355, "rewards/chosen": 3.552734375, "rewards/margins": 7.80859375, "rewards/rejected": -4.260156154632568, "step": 1160 }, { "epoch": 0.5492957746478874, "grad_norm": 421.6433201369265, "learning_rate": 2.506520605112154e-07, "logits/chosen": -2.2874999046325684, "logits/rejected": -2.171875, "logps/chosen": -746.4000244140625, "logps/rejected": -847.2000122070312, "loss": 4.2687, "rewards/accuracies": 0.4195312559604645, "rewards/chosen": 3.234375, "rewards/margins": 8.793749809265137, "rewards/rejected": -5.568749904632568, "step": 1170 }, { "epoch": 0.5539906103286385, "grad_norm": 360.7665284965791, "learning_rate": 2.4804381846635365e-07, "logits/chosen": -2.276562452316284, "logits/rejected": -2.160937547683716, "logps/chosen": -773.5999755859375, "logps/rejected": -845.5999755859375, "loss": 4.4898, "rewards/accuracies": 0.4476562440395355, "rewards/chosen": 2.789904832839966, "rewards/margins": 7.818749904632568, "rewards/rejected": -5.0234375, "step": 1180 }, { "epoch": 0.5586854460093896, "grad_norm": 324.5642211557035, "learning_rate": 2.454355764214919e-07, "logits/chosen": -2.1953125, "logits/rejected": -2.098437547683716, "logps/chosen": -728.4000244140625, "logps/rejected": -878.7999877929688, "loss": 4.0787, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 3.4945311546325684, "rewards/margins": 7.643750190734863, "rewards/rejected": -4.148193359375, "step": 1190 }, { "epoch": 0.5633802816901409, "grad_norm": 405.78556055418204, "learning_rate": 2.4282733437663016e-07, "logits/chosen": -2.28515625, "logits/rejected": -2.2640624046325684, "logps/chosen": -752.4000244140625, "logps/rejected": -815.5999755859375, "loss": 3.5846, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 3.9476561546325684, "rewards/margins": 7.10546875, "rewards/rejected": -3.1617188453674316, "step": 1200 }, { "epoch": 0.568075117370892, "grad_norm": 373.50694707333025, "learning_rate": 2.4021909233176836e-07, "logits/chosen": -2.249218702316284, "logits/rejected": -2.0703125, "logps/chosen": -714.4000244140625, "logps/rejected": -846.4000244140625, "loss": 3.9689, "rewards/accuracies": 0.4234375059604645, "rewards/chosen": 3.852343797683716, "rewards/margins": 8.003125190734863, "rewards/rejected": -4.151757717132568, "step": 1210 }, { "epoch": 0.5727699530516432, "grad_norm": 361.61775798266564, "learning_rate": 2.376108502869066e-07, "logits/chosen": -2.3921875953674316, "logits/rejected": -2.3203125, "logps/chosen": -757.2000122070312, "logps/rejected": -862.7999877929688, "loss": 5.3978, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 3.139453172683716, "rewards/margins": 7.23046875, "rewards/rejected": -4.089648246765137, "step": 1220 }, { "epoch": 0.5774647887323944, "grad_norm": 420.8433662870921, "learning_rate": 2.3500260824204484e-07, "logits/chosen": -2.2914061546325684, "logits/rejected": -2.1343750953674316, "logps/chosen": -790.0, "logps/rejected": -889.5999755859375, "loss": 5.1055, "rewards/accuracies": 0.44453126192092896, "rewards/chosen": 2.4339842796325684, "rewards/margins": 7.3828125, "rewards/rejected": -4.946093559265137, "step": 1230 }, { "epoch": 0.5821596244131455, "grad_norm": 382.45656168082024, "learning_rate": 2.323943661971831e-07, "logits/chosen": -2.385937452316284, "logits/rejected": -2.1976561546325684, "logps/chosen": -747.5999755859375, "logps/rejected": -843.5999755859375, "loss": 4.4922, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.762500047683716, "rewards/margins": 6.120312690734863, "rewards/rejected": -3.3580079078674316, "step": 1240 }, { "epoch": 0.5868544600938967, "grad_norm": 515.3933772313463, "learning_rate": 2.2978612415232132e-07, "logits/chosen": -2.332812547683716, "logits/rejected": -2.149218797683716, "logps/chosen": -803.5999755859375, "logps/rejected": -885.2000122070312, "loss": 3.8058, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 3.7613282203674316, "rewards/margins": 9.2890625, "rewards/rejected": -5.537499904632568, "step": 1250 }, { "epoch": 0.5915492957746479, "grad_norm": 511.2143919673897, "learning_rate": 2.2717788210745957e-07, "logits/chosen": -2.229687452316284, "logits/rejected": -2.0062499046325684, "logps/chosen": -753.5999755859375, "logps/rejected": -888.7999877929688, "loss": 4.3063, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 2.5609374046325684, "rewards/margins": 8.8359375, "rewards/rejected": -6.276953220367432, "step": 1260 }, { "epoch": 0.596244131455399, "grad_norm": 644.6610888055669, "learning_rate": 2.245696400625978e-07, "logits/chosen": -2.299999952316284, "logits/rejected": -2.1695313453674316, "logps/chosen": -763.5999755859375, "logps/rejected": -851.2000122070312, "loss": 5.2695, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 2.975390672683716, "rewards/margins": 5.928124904632568, "rewards/rejected": -2.943554639816284, "step": 1270 }, { "epoch": 0.6009389671361502, "grad_norm": 321.7455984861024, "learning_rate": 2.2196139801773602e-07, "logits/chosen": -2.3125, "logits/rejected": -2.089062452316284, "logps/chosen": -718.0, "logps/rejected": -816.7999877929688, "loss": 3.8524, "rewards/accuracies": 0.45390623807907104, "rewards/chosen": 3.815624952316284, "rewards/margins": 9.934374809265137, "rewards/rejected": -6.112500190734863, "step": 1280 }, { "epoch": 0.6056338028169014, "grad_norm": 590.758641860248, "learning_rate": 2.1935315597287428e-07, "logits/chosen": -2.2054686546325684, "logits/rejected": -2.067187547683716, "logps/chosen": -763.5999755859375, "logps/rejected": -828.0, "loss": 4.7133, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 3.06591796875, "rewards/margins": 6.80078125, "rewards/rejected": -3.7359375953674316, "step": 1290 }, { "epoch": 0.6103286384976526, "grad_norm": 461.2463674049622, "learning_rate": 2.167449139280125e-07, "logits/chosen": -2.2320313453674316, "logits/rejected": -2.065624952316284, "logps/chosen": -764.7999877929688, "logps/rejected": -869.2000122070312, "loss": 4.3558, "rewards/accuracies": 0.4085937440395355, "rewards/chosen": 3.3890624046325684, "rewards/margins": 7.753125190734863, "rewards/rejected": -4.361132621765137, "step": 1300 }, { "epoch": 0.6150234741784038, "grad_norm": 280.5584514541023, "learning_rate": 2.1413667188315073e-07, "logits/chosen": -2.3765625953674316, "logits/rejected": -2.3765625953674316, "logps/chosen": -736.4000244140625, "logps/rejected": -794.7999877929688, "loss": 4.0469, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 3.831249952316284, "rewards/margins": 7.349218845367432, "rewards/rejected": -3.514453172683716, "step": 1310 }, { "epoch": 0.6197183098591549, "grad_norm": 495.67213017482453, "learning_rate": 2.1152842983828898e-07, "logits/chosen": -2.307812452316284, "logits/rejected": -2.1429686546325684, "logps/chosen": -758.0, "logps/rejected": -843.2000122070312, "loss": 4.2716, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 3.620312452316284, "rewards/margins": 6.261328220367432, "rewards/rejected": -2.6399903297424316, "step": 1320 }, { "epoch": 0.6244131455399061, "grad_norm": 608.4249803472775, "learning_rate": 2.089201877934272e-07, "logits/chosen": -2.249218702316284, "logits/rejected": -1.9968750476837158, "logps/chosen": -788.0, "logps/rejected": -903.5999755859375, "loss": 4.9686, "rewards/accuracies": 0.44843751192092896, "rewards/chosen": 2.845898389816284, "rewards/margins": 8.354687690734863, "rewards/rejected": -5.509521484375, "step": 1330 }, { "epoch": 0.6291079812206573, "grad_norm": 299.32455814618487, "learning_rate": 2.0631194574856543e-07, "logits/chosen": -2.4124999046325684, "logits/rejected": -2.229687452316284, "logps/chosen": -720.0, "logps/rejected": -778.4000244140625, "loss": 4.9123, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 3.137500047683716, "rewards/margins": 7.143359184265137, "rewards/rejected": -4.014062404632568, "step": 1340 }, { "epoch": 0.6338028169014085, "grad_norm": 452.9560817569427, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -2.190624952316284, "logits/rejected": -2.05078125, "logps/chosen": -761.5999755859375, "logps/rejected": -901.5999755859375, "loss": 4.2726, "rewards/accuracies": 0.42890626192092896, "rewards/chosen": 3.0523438453674316, "rewards/margins": 8.350781440734863, "rewards/rejected": -5.297436714172363, "step": 1350 }, { "epoch": 0.6384976525821596, "grad_norm": 449.42369709396064, "learning_rate": 2.010954616588419e-07, "logits/chosen": -2.3203125, "logits/rejected": -2.125, "logps/chosen": -747.5999755859375, "logps/rejected": -842.0, "loss": 3.7139, "rewards/accuracies": 0.45234376192092896, "rewards/chosen": 4.084374904632568, "rewards/margins": 7.849999904632568, "rewards/rejected": -3.768749952316284, "step": 1360 }, { "epoch": 0.6431924882629108, "grad_norm": 332.49243374349845, "learning_rate": 1.9848721961398017e-07, "logits/chosen": -2.3515625, "logits/rejected": -2.2421875, "logps/chosen": -755.2000122070312, "logps/rejected": -823.5999755859375, "loss": 4.3203, "rewards/accuracies": 0.42265623807907104, "rewards/chosen": 3.0523438453674316, "rewards/margins": 8.746874809265137, "rewards/rejected": -5.705468654632568, "step": 1370 }, { "epoch": 0.647887323943662, "grad_norm": 475.9502903148322, "learning_rate": 1.958789775691184e-07, "logits/chosen": -2.3062500953674316, "logits/rejected": -2.1273436546325684, "logps/chosen": -734.7999877929688, "logps/rejected": -838.4000244140625, "loss": 4.0781, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 3.0570311546325684, "rewards/margins": 8.162500381469727, "rewards/rejected": -5.091864109039307, "step": 1380 }, { "epoch": 0.6525821596244131, "grad_norm": 5461.133518431073, "learning_rate": 1.9327073552425662e-07, "logits/chosen": -2.340625047683716, "logits/rejected": -2.125, "logps/chosen": -716.4000244140625, "logps/rejected": -803.5999755859375, "loss": 3.605, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 3.429394483566284, "rewards/margins": 8.985156059265137, "rewards/rejected": -5.564843654632568, "step": 1390 }, { "epoch": 0.6572769953051644, "grad_norm": 6885.443450742423, "learning_rate": 1.906624934793949e-07, "logits/chosen": -2.239062547683716, "logits/rejected": -2.196093797683716, "logps/chosen": -748.0, "logps/rejected": -885.5999755859375, "loss": 4.3522, "rewards/accuracies": 0.4664062559604645, "rewards/chosen": 2.715625047683716, "rewards/margins": 9.5234375, "rewards/rejected": -6.817968845367432, "step": 1400 }, { "epoch": 0.6619718309859155, "grad_norm": 423.958432658112, "learning_rate": 1.8805425143453312e-07, "logits/chosen": -2.364062547683716, "logits/rejected": -2.0406250953674316, "logps/chosen": -725.2000122070312, "logps/rejected": -925.5999755859375, "loss": 3.816, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 3.6527342796325684, "rewards/margins": 9.598437309265137, "rewards/rejected": -5.954297065734863, "step": 1410 }, { "epoch": 0.6666666666666666, "grad_norm": 452.36410328926013, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -2.3226561546325684, "logits/rejected": -2.225781202316284, "logps/chosen": -734.7999877929688, "logps/rejected": -816.0, "loss": 3.5225, "rewards/accuracies": 0.4375, "rewards/chosen": 2.813281297683716, "rewards/margins": 7.532812595367432, "rewards/rejected": -4.720312595367432, "step": 1420 }, { "epoch": 0.6713615023474179, "grad_norm": 361.3010901392399, "learning_rate": 1.828377673448096e-07, "logits/chosen": -2.2210936546325684, "logits/rejected": -2.192187547683716, "logps/chosen": -785.2000122070312, "logps/rejected": -850.7999877929688, "loss": 4.9125, "rewards/accuracies": 0.4515624940395355, "rewards/chosen": 1.88671875, "rewards/margins": 7.845312595367432, "rewards/rejected": -5.946875095367432, "step": 1430 }, { "epoch": 0.676056338028169, "grad_norm": 426.6786872090034, "learning_rate": 1.8022952529994783e-07, "logits/chosen": -2.4078125953674316, "logits/rejected": -2.2093749046325684, "logps/chosen": -738.4000244140625, "logps/rejected": -838.4000244140625, "loss": 4.1243, "rewards/accuracies": 0.43359375, "rewards/chosen": 3.356250047683716, "rewards/margins": 10.160937309265137, "rewards/rejected": -6.804931640625, "step": 1440 }, { "epoch": 0.6807511737089202, "grad_norm": 399.76098537479686, "learning_rate": 1.7762128325508608e-07, "logits/chosen": -2.354687452316284, "logits/rejected": -2.19140625, "logps/chosen": -769.2000122070312, "logps/rejected": -922.4000244140625, "loss": 4.9205, "rewards/accuracies": 0.43828123807907104, "rewards/chosen": 2.5992188453674316, "rewards/margins": 9.056249618530273, "rewards/rejected": -6.466015815734863, "step": 1450 }, { "epoch": 0.6854460093896714, "grad_norm": 479.2473483667506, "learning_rate": 1.750130412102243e-07, "logits/chosen": -2.1734375953674316, "logits/rejected": -2.106250047683716, "logps/chosen": -813.5999755859375, "logps/rejected": -881.5999755859375, "loss": 5.3232, "rewards/accuracies": 0.43671876192092896, "rewards/chosen": 2.576171875, "rewards/margins": 8.268750190734863, "rewards/rejected": -5.686718940734863, "step": 1460 }, { "epoch": 0.6901408450704225, "grad_norm": 415.9604208306485, "learning_rate": 1.7240479916536254e-07, "logits/chosen": -2.203125, "logits/rejected": -2.145312547683716, "logps/chosen": -739.5999755859375, "logps/rejected": -874.7999877929688, "loss": 4.774, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.839062452316284, "rewards/margins": 10.199999809265137, "rewards/rejected": -7.369531154632568, "step": 1470 }, { "epoch": 0.6948356807511737, "grad_norm": 297.0991105900083, "learning_rate": 1.697965571205008e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.2562499046325684, "logps/chosen": -732.0, "logps/rejected": -794.7999877929688, "loss": 3.9867, "rewards/accuracies": 0.45390623807907104, "rewards/chosen": 4.060156345367432, "rewards/margins": 9.248437881469727, "rewards/rejected": -5.195703029632568, "step": 1480 }, { "epoch": 0.6995305164319249, "grad_norm": 559.399278140405, "learning_rate": 1.6718831507563902e-07, "logits/chosen": -2.364062547683716, "logits/rejected": -2.270312547683716, "logps/chosen": -746.4000244140625, "logps/rejected": -792.0, "loss": 3.8338, "rewards/accuracies": 0.4296875, "rewards/chosen": 3.2894530296325684, "rewards/margins": 6.504687309265137, "rewards/rejected": -3.2171874046325684, "step": 1490 }, { "epoch": 0.704225352112676, "grad_norm": 318.92530656132584, "learning_rate": 1.6458007303077727e-07, "logits/chosen": -2.3125, "logits/rejected": -2.234375, "logps/chosen": -727.2000122070312, "logps/rejected": -800.0, "loss": 4.4147, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": 3.581249952316284, "rewards/margins": 7.589062690734863, "rewards/rejected": -4.009961128234863, "step": 1500 }, { "epoch": 0.7089201877934272, "grad_norm": 389.24309598289756, "learning_rate": 1.619718309859155e-07, "logits/chosen": -2.3890624046325684, "logits/rejected": -2.116406202316284, "logps/chosen": -756.7999877929688, "logps/rejected": -879.5999755859375, "loss": 4.2559, "rewards/accuracies": 0.4585937559604645, "rewards/chosen": 3.3070311546325684, "rewards/margins": 9.251562118530273, "rewards/rejected": -5.9453125, "step": 1510 }, { "epoch": 0.7136150234741784, "grad_norm": 747.4741107805338, "learning_rate": 1.5936358894105372e-07, "logits/chosen": -2.3656249046325684, "logits/rejected": -2.2109375, "logps/chosen": -752.0, "logps/rejected": -842.7999877929688, "loss": 4.1086, "rewards/accuracies": 0.46171873807907104, "rewards/chosen": 4.715624809265137, "rewards/margins": 8.232812881469727, "rewards/rejected": -3.5199217796325684, "step": 1520 }, { "epoch": 0.7183098591549296, "grad_norm": 1212.1237860042627, "learning_rate": 1.5675534689619197e-07, "logits/chosen": -2.370312452316284, "logits/rejected": -2.2523436546325684, "logps/chosen": -760.4000244140625, "logps/rejected": -783.5999755859375, "loss": 4.6827, "rewards/accuracies": 0.421875, "rewards/chosen": 2.787304639816284, "rewards/margins": 5.881249904632568, "rewards/rejected": -3.096874952316284, "step": 1530 }, { "epoch": 0.7230046948356808, "grad_norm": 404.49052723472374, "learning_rate": 1.541471048513302e-07, "logits/chosen": -2.2578125, "logits/rejected": -2.141406297683716, "logps/chosen": -761.5999755859375, "logps/rejected": -847.2000122070312, "loss": 3.9635, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 4.581250190734863, "rewards/margins": 8.944531440734863, "rewards/rejected": -4.373242378234863, "step": 1540 }, { "epoch": 0.7276995305164319, "grad_norm": 368.63816383458794, "learning_rate": 1.5153886280646843e-07, "logits/chosen": -2.265625, "logits/rejected": -2.12109375, "logps/chosen": -718.0, "logps/rejected": -819.2000122070312, "loss": 3.908, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 3.5171875953674316, "rewards/margins": 7.840624809265137, "rewards/rejected": -4.328515529632568, "step": 1550 }, { "epoch": 0.7323943661971831, "grad_norm": 370.59996660930864, "learning_rate": 1.4893062076160668e-07, "logits/chosen": -2.4234375953674316, "logits/rejected": -2.3453125953674316, "logps/chosen": -737.2000122070312, "logps/rejected": -839.2000122070312, "loss": 4.1557, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 3.2367186546325684, "rewards/margins": 7.862500190734863, "rewards/rejected": -4.623046875, "step": 1560 }, { "epoch": 0.7370892018779343, "grad_norm": 387.72493562249076, "learning_rate": 1.463223787167449e-07, "logits/chosen": -2.3046875, "logits/rejected": -2.178906202316284, "logps/chosen": -767.5999755859375, "logps/rejected": -842.4000244140625, "loss": 4.2895, "rewards/accuracies": 0.44140625, "rewards/chosen": 3.366406202316284, "rewards/margins": 7.634375095367432, "rewards/rejected": -4.2734375, "step": 1570 }, { "epoch": 0.7417840375586855, "grad_norm": 498.16996435367645, "learning_rate": 1.4371413667188313e-07, "logits/chosen": -2.3984375, "logits/rejected": -2.241406202316284, "logps/chosen": -738.0, "logps/rejected": -871.5999755859375, "loss": 4.1729, "rewards/accuracies": 0.4585937559604645, "rewards/chosen": 4.318749904632568, "rewards/margins": 8.706250190734863, "rewards/rejected": -4.389062404632568, "step": 1580 }, { "epoch": 0.7464788732394366, "grad_norm": 422.9669286102344, "learning_rate": 1.4110589462702139e-07, "logits/chosen": -2.4140625, "logits/rejected": -2.2953124046325684, "logps/chosen": -741.5999755859375, "logps/rejected": -805.2000122070312, "loss": 4.2679, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 2.90234375, "rewards/margins": 6.705468654632568, "rewards/rejected": -3.8003907203674316, "step": 1590 }, { "epoch": 0.7511737089201878, "grad_norm": 334.8000258754459, "learning_rate": 1.384976525821596e-07, "logits/chosen": -2.4234375953674316, "logits/rejected": -2.1500000953674316, "logps/chosen": -754.7999877929688, "logps/rejected": -885.5999755859375, "loss": 3.4982, "rewards/accuracies": 0.4585937559604645, "rewards/chosen": 4.109375, "rewards/margins": 10.159375190734863, "rewards/rejected": -6.046093940734863, "step": 1600 }, { "epoch": 0.755868544600939, "grad_norm": 450.17417017313414, "learning_rate": 1.3588941053729787e-07, "logits/chosen": -2.2203125953674316, "logits/rejected": -2.180468797683716, "logps/chosen": -767.2000122070312, "logps/rejected": -880.4000244140625, "loss": 4.1285, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 3.856250047683716, "rewards/margins": 8.818750381469727, "rewards/rejected": -4.962500095367432, "step": 1610 }, { "epoch": 0.7605633802816901, "grad_norm": 394.65988882766004, "learning_rate": 1.332811684924361e-07, "logits/chosen": -2.2828125953674316, "logits/rejected": -2.1703124046325684, "logps/chosen": -784.0, "logps/rejected": -851.5999755859375, "loss": 4.6866, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 3.389843702316284, "rewards/margins": 8.590624809265137, "rewards/rejected": -5.199999809265137, "step": 1620 }, { "epoch": 0.7652582159624414, "grad_norm": 424.3996351789177, "learning_rate": 1.3067292644757432e-07, "logits/chosen": -2.051562547683716, "logits/rejected": -1.9343750476837158, "logps/chosen": -787.5999755859375, "logps/rejected": -906.4000244140625, "loss": 3.7363, "rewards/accuracies": 0.44921875, "rewards/chosen": 4.043749809265137, "rewards/margins": 9.721875190734863, "rewards/rejected": -5.680468559265137, "step": 1630 }, { "epoch": 0.7699530516431925, "grad_norm": 399.16976696133713, "learning_rate": 1.2806468440271257e-07, "logits/chosen": -2.301562547683716, "logits/rejected": -2.171093702316284, "logps/chosen": -730.4000244140625, "logps/rejected": -824.4000244140625, "loss": 3.8615, "rewards/accuracies": 0.43281251192092896, "rewards/chosen": 3.6468749046325684, "rewards/margins": 7.654687404632568, "rewards/rejected": -4.010156154632568, "step": 1640 }, { "epoch": 0.7746478873239436, "grad_norm": 394.3409840047376, "learning_rate": 1.254564423578508e-07, "logits/chosen": -2.2992186546325684, "logits/rejected": -2.0835938453674316, "logps/chosen": -746.4000244140625, "logps/rejected": -870.4000244140625, "loss": 4.1122, "rewards/accuracies": 0.4453125, "rewards/chosen": 3.388671875, "rewards/margins": 8.792187690734863, "rewards/rejected": -5.399804592132568, "step": 1650 }, { "epoch": 0.7793427230046949, "grad_norm": 409.67174776858445, "learning_rate": 1.2284820031298902e-07, "logits/chosen": -2.274218797683716, "logits/rejected": -2.11328125, "logps/chosen": -789.5999755859375, "logps/rejected": -873.2000122070312, "loss": 4.4931, "rewards/accuracies": 0.48515623807907104, "rewards/chosen": 3.9164061546325684, "rewards/margins": 11.112500190734863, "rewards/rejected": -7.189062595367432, "step": 1660 }, { "epoch": 0.784037558685446, "grad_norm": 456.6688094864312, "learning_rate": 1.2023995826812728e-07, "logits/chosen": -2.34765625, "logits/rejected": -2.1875, "logps/chosen": -746.7999877929688, "logps/rejected": -847.2000122070312, "loss": 4.8528, "rewards/accuracies": 0.44843751192092896, "rewards/chosen": 3.4117188453674316, "rewards/margins": 7.196875095367432, "rewards/rejected": -3.789843797683716, "step": 1670 }, { "epoch": 0.7887323943661971, "grad_norm": 1310.523103957443, "learning_rate": 1.176317162232655e-07, "logits/chosen": -2.3968749046325684, "logits/rejected": -2.278125047683716, "logps/chosen": -735.5999755859375, "logps/rejected": -870.0, "loss": 3.953, "rewards/accuracies": 0.43359375, "rewards/chosen": 3.5853514671325684, "rewards/margins": 8.796875, "rewards/rejected": -5.217187404632568, "step": 1680 }, { "epoch": 0.7934272300469484, "grad_norm": 1724.0070361409773, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -2.2890625, "logits/rejected": -2.073437452316284, "logps/chosen": -737.5999755859375, "logps/rejected": -845.2000122070312, "loss": 4.1449, "rewards/accuracies": 0.4398437440395355, "rewards/chosen": 3.5546875, "rewards/margins": 8.342187881469727, "rewards/rejected": -4.786035060882568, "step": 1690 }, { "epoch": 0.7981220657276995, "grad_norm": 622.6768320134224, "learning_rate": 1.1241523213354198e-07, "logits/chosen": -2.3343749046325684, "logits/rejected": -2.203906297683716, "logps/chosen": -745.5999755859375, "logps/rejected": -874.0, "loss": 4.8334, "rewards/accuracies": 0.40546876192092896, "rewards/chosen": 3.487499952316284, "rewards/margins": 7.410937309265137, "rewards/rejected": -3.9242186546325684, "step": 1700 }, { "epoch": 0.8028169014084507, "grad_norm": 439.70596258416913, "learning_rate": 1.0980699008868022e-07, "logits/chosen": -2.21484375, "logits/rejected": -2.069531202316284, "logps/chosen": -784.7999877929688, "logps/rejected": -828.4000244140625, "loss": 4.4736, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.723437547683716, "rewards/margins": 6.446875095367432, "rewards/rejected": -3.7225584983825684, "step": 1710 }, { "epoch": 0.8075117370892019, "grad_norm": 564.5328064267883, "learning_rate": 1.0719874804381846e-07, "logits/chosen": -2.3734374046325684, "logits/rejected": -2.2874999046325684, "logps/chosen": -732.7999877929688, "logps/rejected": -771.5999755859375, "loss": 4.1398, "rewards/accuracies": 0.43671876192092896, "rewards/chosen": 3.6968750953674316, "rewards/margins": 5.946875095367432, "rewards/rejected": -2.2494139671325684, "step": 1720 }, { "epoch": 0.812206572769953, "grad_norm": 409.65102881247816, "learning_rate": 1.045905059989567e-07, "logits/chosen": -2.336718797683716, "logits/rejected": -2.2398438453674316, "logps/chosen": -711.2000122070312, "logps/rejected": -772.4000244140625, "loss": 4.5148, "rewards/accuracies": 0.46875, "rewards/chosen": 3.5531249046325684, "rewards/margins": 9.015625, "rewards/rejected": -5.461718559265137, "step": 1730 }, { "epoch": 0.8169014084507042, "grad_norm": 748.089847869853, "learning_rate": 1.0198226395409494e-07, "logits/chosen": -2.176562547683716, "logits/rejected": -2.0570311546325684, "logps/chosen": -722.7999877929688, "logps/rejected": -810.0, "loss": 4.262, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 3.4253907203674316, "rewards/margins": 7.412499904632568, "rewards/rejected": -3.9896483421325684, "step": 1740 }, { "epoch": 0.8215962441314554, "grad_norm": 328.6171942963381, "learning_rate": 9.937402190923318e-08, "logits/chosen": -2.225781202316284, "logits/rejected": -2.0328125953674316, "logps/chosen": -720.7999877929688, "logps/rejected": -867.2000122070312, "loss": 3.3953, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 4.143750190734863, "rewards/margins": 10.809374809265137, "rewards/rejected": -6.6640625, "step": 1750 }, { "epoch": 0.8262910798122066, "grad_norm": 396.428360485738, "learning_rate": 9.676577986437141e-08, "logits/chosen": -2.2328124046325684, "logits/rejected": -2.0335936546325684, "logps/chosen": -781.5999755859375, "logps/rejected": -849.5999755859375, "loss": 4.593, "rewards/accuracies": 0.4671874940395355, "rewards/chosen": 3.284960985183716, "rewards/margins": 9.949999809265137, "rewards/rejected": -6.658715724945068, "step": 1760 }, { "epoch": 0.8309859154929577, "grad_norm": 707.3465488581495, "learning_rate": 9.415753781950965e-08, "logits/chosen": -2.3203125, "logits/rejected": -2.129687547683716, "logps/chosen": -727.5999755859375, "logps/rejected": -838.0, "loss": 3.6212, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 4.284375190734863, "rewards/margins": 9.942187309265137, "rewards/rejected": -5.672265529632568, "step": 1770 }, { "epoch": 0.8356807511737089, "grad_norm": 395.9535365243982, "learning_rate": 9.154929577464789e-08, "logits/chosen": -2.340625047683716, "logits/rejected": -2.1484375, "logps/chosen": -744.7999877929688, "logps/rejected": -850.0, "loss": 3.3361, "rewards/accuracies": 0.4476562440395355, "rewards/chosen": 3.8828125, "rewards/margins": 12.431249618530273, "rewards/rejected": -8.540624618530273, "step": 1780 }, { "epoch": 0.8403755868544601, "grad_norm": 330.81263976850016, "learning_rate": 8.894105372978613e-08, "logits/chosen": -2.2320313453674316, "logits/rejected": -2.2249999046325684, "logps/chosen": -781.5999755859375, "logps/rejected": -858.0, "loss": 3.7592, "rewards/accuracies": 0.45703125, "rewards/chosen": 3.5062499046325684, "rewards/margins": 8.868749618530273, "rewards/rejected": -5.375, "step": 1790 }, { "epoch": 0.8450704225352113, "grad_norm": 499.877437455524, "learning_rate": 8.633281168492435e-08, "logits/chosen": -2.3375000953674316, "logits/rejected": -2.1929688453674316, "logps/chosen": -744.4000244140625, "logps/rejected": -806.7999877929688, "loss": 4.4666, "rewards/accuracies": 0.44843751192092896, "rewards/chosen": 3.653515577316284, "rewards/margins": 7.83203125, "rewards/rejected": -4.1796875, "step": 1800 }, { "epoch": 0.8497652582159625, "grad_norm": 396.3397631474569, "learning_rate": 8.372456964006259e-08, "logits/chosen": -2.2320313453674316, "logits/rejected": -2.073437452316284, "logps/chosen": -748.4000244140625, "logps/rejected": -866.4000244140625, "loss": 4.7405, "rewards/accuracies": 0.4632812440395355, "rewards/chosen": 2.959277391433716, "rewards/margins": 9.856249809265137, "rewards/rejected": -6.900000095367432, "step": 1810 }, { "epoch": 0.8544600938967136, "grad_norm": 530.220629633677, "learning_rate": 8.111632759520083e-08, "logits/chosen": -2.200000047683716, "logits/rejected": -2.078125, "logps/chosen": -801.5999755859375, "logps/rejected": -862.0, "loss": 4.4322, "rewards/accuracies": 0.46015626192092896, "rewards/chosen": 3.389453172683716, "rewards/margins": 7.340624809265137, "rewards/rejected": -3.950000047683716, "step": 1820 }, { "epoch": 0.8591549295774648, "grad_norm": 440.65956743164475, "learning_rate": 7.850808555033907e-08, "logits/chosen": -2.3421874046325684, "logits/rejected": -2.1929688453674316, "logps/chosen": -743.5999755859375, "logps/rejected": -870.0, "loss": 3.9501, "rewards/accuracies": 0.44921875, "rewards/chosen": 3.938281297683716, "rewards/margins": 10.178125381469727, "rewards/rejected": -6.231249809265137, "step": 1830 }, { "epoch": 0.863849765258216, "grad_norm": 442.2402627617055, "learning_rate": 7.58998435054773e-08, "logits/chosen": -2.332812547683716, "logits/rejected": -2.0648436546325684, "logps/chosen": -751.2000122070312, "logps/rejected": -862.7999877929688, "loss": 4.3231, "rewards/accuracies": 0.4476562440395355, "rewards/chosen": 3.4214844703674316, "rewards/margins": 9.412500381469727, "rewards/rejected": -6.0, "step": 1840 }, { "epoch": 0.8685446009389671, "grad_norm": 10384.624792404133, "learning_rate": 7.329160146061554e-08, "logits/chosen": -2.370312452316284, "logits/rejected": -2.3296875953674316, "logps/chosen": -734.0, "logps/rejected": -836.7999877929688, "loss": 4.7064, "rewards/accuracies": 0.4710937440395355, "rewards/chosen": 4.026562690734863, "rewards/margins": 8.247655868530273, "rewards/rejected": -4.221875190734863, "step": 1850 }, { "epoch": 0.8732394366197183, "grad_norm": 673.9242811565833, "learning_rate": 7.068335941575378e-08, "logits/chosen": -2.1734375953674316, "logits/rejected": -2.153125047683716, "logps/chosen": -740.0, "logps/rejected": -806.7999877929688, "loss": 4.3453, "rewards/accuracies": 0.45234376192092896, "rewards/chosen": 3.1578125953674316, "rewards/margins": 7.625, "rewards/rejected": -4.46240234375, "step": 1860 }, { "epoch": 0.8779342723004695, "grad_norm": 607.3789650567315, "learning_rate": 6.807511737089202e-08, "logits/chosen": -2.4046874046325684, "logits/rejected": -2.196093797683716, "logps/chosen": -739.2000122070312, "logps/rejected": -842.0, "loss": 4.1078, "rewards/accuracies": 0.44921875, "rewards/chosen": 3.664843797683716, "rewards/margins": 9.100000381469727, "rewards/rejected": -5.440625190734863, "step": 1870 }, { "epoch": 0.8826291079812206, "grad_norm": 381.2094754272133, "learning_rate": 6.546687532603024e-08, "logits/chosen": -2.278125047683716, "logits/rejected": -2.155468702316284, "logps/chosen": -735.2000122070312, "logps/rejected": -841.5999755859375, "loss": 4.8184, "rewards/accuracies": 0.45234376192092896, "rewards/chosen": 3.410937547683716, "rewards/margins": 7.385937690734863, "rewards/rejected": -3.982617139816284, "step": 1880 }, { "epoch": 0.8873239436619719, "grad_norm": 299.44549442697485, "learning_rate": 6.285863328116848e-08, "logits/chosen": -2.3125, "logits/rejected": -2.11328125, "logps/chosen": -682.0, "logps/rejected": -856.7999877929688, "loss": 3.7071, "rewards/accuracies": 0.46875, "rewards/chosen": 4.295312404632568, "rewards/margins": 10.567187309265137, "rewards/rejected": -6.278124809265137, "step": 1890 }, { "epoch": 0.892018779342723, "grad_norm": 398.4727420176694, "learning_rate": 6.025039123630672e-08, "logits/chosen": -2.3140625953674316, "logits/rejected": -2.067187547683716, "logps/chosen": -762.0, "logps/rejected": -814.0, "loss": 3.5143, "rewards/accuracies": 0.453125, "rewards/chosen": 4.451562404632568, "rewards/margins": 9.012499809265137, "rewards/rejected": -4.559668064117432, "step": 1900 }, { "epoch": 0.8967136150234741, "grad_norm": 337.5599153360449, "learning_rate": 5.764214919144496e-08, "logits/chosen": -2.35546875, "logits/rejected": -2.22265625, "logps/chosen": -770.7999877929688, "logps/rejected": -814.4000244140625, "loss": 3.8246, "rewards/accuracies": 0.43828123807907104, "rewards/chosen": 3.7601561546325684, "rewards/margins": 8.521875381469727, "rewards/rejected": -4.768456935882568, "step": 1910 }, { "epoch": 0.9014084507042254, "grad_norm": 505.2498825423291, "learning_rate": 5.50339071465832e-08, "logits/chosen": -2.284374952316284, "logits/rejected": -2.3359375, "logps/chosen": -759.2000122070312, "logps/rejected": -774.0, "loss": 3.8311, "rewards/accuracies": 0.4320312440395355, "rewards/chosen": 3.6195311546325684, "rewards/margins": 7.840624809265137, "rewards/rejected": -4.220410346984863, "step": 1920 }, { "epoch": 0.9061032863849765, "grad_norm": 327.8189786562515, "learning_rate": 5.2425665101721436e-08, "logits/chosen": -2.328125, "logits/rejected": -2.1312499046325684, "logps/chosen": -751.5999755859375, "logps/rejected": -880.0, "loss": 4.4301, "rewards/accuracies": 0.4453125, "rewards/chosen": 3.4945311546325684, "rewards/margins": 9.09375, "rewards/rejected": -5.605273246765137, "step": 1930 }, { "epoch": 0.9107981220657277, "grad_norm": 347.13831647070737, "learning_rate": 4.9817423056859675e-08, "logits/chosen": -2.3531250953674316, "logits/rejected": -2.1273436546325684, "logps/chosen": -756.7999877929688, "logps/rejected": -864.4000244140625, "loss": 4.8472, "rewards/accuracies": 0.44140625, "rewards/chosen": 3.235058546066284, "rewards/margins": 8.7421875, "rewards/rejected": -5.513281345367432, "step": 1940 }, { "epoch": 0.9154929577464789, "grad_norm": 298.01909557922994, "learning_rate": 4.720918101199791e-08, "logits/chosen": -2.139843702316284, "logits/rejected": -1.9132812023162842, "logps/chosen": -776.0, "logps/rejected": -940.4000244140625, "loss": 3.9666, "rewards/accuracies": 0.4859375059604645, "rewards/chosen": 4.353125095367432, "rewards/margins": 11.556249618530273, "rewards/rejected": -7.199999809265137, "step": 1950 }, { "epoch": 0.92018779342723, "grad_norm": 294.48518836380975, "learning_rate": 4.460093896713615e-08, "logits/chosen": -2.3609375953674316, "logits/rejected": -2.0921874046325684, "logps/chosen": -745.2000122070312, "logps/rejected": -877.2000122070312, "loss": 4.4285, "rewards/accuracies": 0.4554687440395355, "rewards/chosen": 4.26953125, "rewards/margins": 8.362500190734863, "rewards/rejected": -4.088281154632568, "step": 1960 }, { "epoch": 0.9248826291079812, "grad_norm": 445.20212474569666, "learning_rate": 4.199269692227438e-08, "logits/chosen": -2.3031249046325684, "logits/rejected": -2.2281250953674316, "logps/chosen": -743.5999755859375, "logps/rejected": -786.7999877929688, "loss": 4.7986, "rewards/accuracies": 0.43281251192092896, "rewards/chosen": 3.094531297683716, "rewards/margins": 7.203125, "rewards/rejected": -4.1083984375, "step": 1970 }, { "epoch": 0.9295774647887324, "grad_norm": 454.23834025841336, "learning_rate": 3.938445487741262e-08, "logits/chosen": -2.1875, "logits/rejected": -1.9148437976837158, "logps/chosen": -774.0, "logps/rejected": -861.2000122070312, "loss": 4.2181, "rewards/accuracies": 0.44453126192092896, "rewards/chosen": 3.535937547683716, "rewards/margins": 9.074999809265137, "rewards/rejected": -5.548047065734863, "step": 1980 }, { "epoch": 0.9342723004694836, "grad_norm": 392.17307200423585, "learning_rate": 3.677621283255086e-08, "logits/chosen": -2.4078125953674316, "logits/rejected": -2.2710938453674316, "logps/chosen": -766.7999877929688, "logps/rejected": -832.7999877929688, "loss": 3.8059, "rewards/accuracies": 0.4429687559604645, "rewards/chosen": 3.9078125953674316, "rewards/margins": 7.943749904632568, "rewards/rejected": -4.034375190734863, "step": 1990 }, { "epoch": 0.9389671361502347, "grad_norm": 339.44226418791203, "learning_rate": 3.41679707876891e-08, "logits/chosen": -2.3687500953674316, "logits/rejected": -2.2007813453674316, "logps/chosen": -732.0, "logps/rejected": -894.7999877929688, "loss": 3.7702, "rewards/accuracies": 0.45703125, "rewards/chosen": 3.6273436546325684, "rewards/margins": 11.837499618530273, "rewards/rejected": -8.2109375, "step": 2000 }, { "epoch": 0.9436619718309859, "grad_norm": 390.26770671950203, "learning_rate": 3.155972874282733e-08, "logits/chosen": -2.348437547683716, "logits/rejected": -2.1382813453674316, "logps/chosen": -724.0, "logps/rejected": -818.4000244140625, "loss": 4.599, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 3.7381348609924316, "rewards/margins": 6.074999809265137, "rewards/rejected": -2.3402342796325684, "step": 2010 }, { "epoch": 0.9483568075117371, "grad_norm": 445.6413313616932, "learning_rate": 2.8951486697965573e-08, "logits/chosen": -2.297656297683716, "logits/rejected": -2.19140625, "logps/chosen": -767.5999755859375, "logps/rejected": -858.4000244140625, "loss": 4.0664, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": 3.754687547683716, "rewards/margins": 8.571874618530273, "rewards/rejected": -4.814745903015137, "step": 2020 }, { "epoch": 0.9530516431924883, "grad_norm": 443.53300480170367, "learning_rate": 2.634324465310381e-08, "logits/chosen": -2.375, "logits/rejected": -2.143749952316284, "logps/chosen": -735.2000122070312, "logps/rejected": -838.0, "loss": 3.9934, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": 3.706249952316284, "rewards/margins": 9.168749809265137, "rewards/rejected": -5.470312595367432, "step": 2030 }, { "epoch": 0.9577464788732394, "grad_norm": 548.3499825235165, "learning_rate": 2.3735002608242045e-08, "logits/chosen": -2.2359375953674316, "logits/rejected": -2.0875000953674316, "logps/chosen": -777.2000122070312, "logps/rejected": -815.2000122070312, "loss": 5.05, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": 3.278515577316284, "rewards/margins": 8.28125, "rewards/rejected": -5.006640434265137, "step": 2040 }, { "epoch": 0.9624413145539906, "grad_norm": 407.4024447296721, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -2.3578124046325684, "logits/rejected": -2.2281250953674316, "logps/chosen": -746.7999877929688, "logps/rejected": -779.5999755859375, "loss": 4.1102, "rewards/accuracies": 0.45703125, "rewards/chosen": 3.909374952316284, "rewards/margins": 8.640625, "rewards/rejected": -4.732226371765137, "step": 2050 }, { "epoch": 0.9671361502347418, "grad_norm": 332.7928982783245, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -2.28515625, "logits/rejected": -2.0492186546325684, "logps/chosen": -773.2000122070312, "logps/rejected": -853.5999755859375, "loss": 4.4431, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 3.1953125, "rewards/margins": 8.565625190734863, "rewards/rejected": -5.377343654632568, "step": 2060 }, { "epoch": 0.971830985915493, "grad_norm": 363.9987243736904, "learning_rate": 1.5910276473656755e-08, "logits/chosen": -2.2718749046325684, "logits/rejected": -2.030468702316284, "logps/chosen": -745.5999755859375, "logps/rejected": -845.5999755859375, "loss": 4.1346, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": 4.180468559265137, "rewards/margins": 8.756250381469727, "rewards/rejected": -4.577734470367432, "step": 2070 }, { "epoch": 0.9765258215962441, "grad_norm": 282.5830371548437, "learning_rate": 1.3302034428794991e-08, "logits/chosen": -2.4046874046325684, "logits/rejected": -2.3187499046325684, "logps/chosen": -708.0, "logps/rejected": -786.0, "loss": 4.9213, "rewards/accuracies": 0.47265625, "rewards/chosen": 3.958203077316284, "rewards/margins": 8.301562309265137, "rewards/rejected": -4.33984375, "step": 2080 }, { "epoch": 0.9812206572769953, "grad_norm": 454.09493922137517, "learning_rate": 1.0693792383933229e-08, "logits/chosen": -2.344531297683716, "logits/rejected": -2.282031297683716, "logps/chosen": -765.2000122070312, "logps/rejected": -849.2000122070312, "loss": 4.1305, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 4.190625190734863, "rewards/margins": 8.904687881469727, "rewards/rejected": -4.71044921875, "step": 2090 }, { "epoch": 0.9859154929577465, "grad_norm": 978.3858800511712, "learning_rate": 8.085550339071465e-09, "logits/chosen": -2.28125, "logits/rejected": -2.147656202316284, "logps/chosen": -782.7999877929688, "logps/rejected": -878.7999877929688, "loss": 4.6955, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 2.861328125, "rewards/margins": 7.328125, "rewards/rejected": -4.464062690734863, "step": 2100 }, { "epoch": 0.9906103286384976, "grad_norm": 3846.6990654292445, "learning_rate": 5.4773082942097025e-09, "logits/chosen": -2.296875, "logits/rejected": -2.1875, "logps/chosen": -728.7999877929688, "logps/rejected": -802.0, "loss": 4.1061, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 3.919140577316284, "rewards/margins": 10.006250381469727, "rewards/rejected": -6.084741115570068, "step": 2110 }, { "epoch": 0.9953051643192489, "grad_norm": 435.0445407316125, "learning_rate": 2.8690662493479393e-09, "logits/chosen": -2.2750000953674316, "logits/rejected": -2.01953125, "logps/chosen": -753.5999755859375, "logps/rejected": -842.4000244140625, "loss": 4.6775, "rewards/accuracies": 0.4554687440395355, "rewards/chosen": 3.551562547683716, "rewards/margins": 8.192187309265137, "rewards/rejected": -4.649218559265137, "step": 2120 }, { "epoch": 1.0, "grad_norm": 379.7605837883577, "learning_rate": 2.608242044861763e-10, "logits/chosen": -2.31640625, "logits/rejected": -2.184375047683716, "logps/chosen": -742.7999877929688, "logps/rejected": -854.7999877929688, "loss": 3.895, "rewards/accuracies": 0.47061213850975037, "rewards/chosen": 4.150390625, "rewards/margins": 12.193750381469727, "rewards/rejected": -8.0390625, "step": 2130 }, { "epoch": 1.0, "step": 2130, "total_flos": 0.0, "train_loss": 4.194636241930751, "train_runtime": 7092.6143, "train_samples_per_second": 38.432, "train_steps_per_second": 0.3 } ], "logging_steps": 10, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }