{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2130, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004694835680751174, "grad_norm": 565.6911667702249, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -2.667187452316284, "logits/rejected": -2.3421874046325684, "logps/chosen": -706.7999877929688, "logps/rejected": -824.0, "loss": 1.7469, "rewards/accuracies": 0.06562499701976776, "rewards/chosen": -0.21103516221046448, "rewards/margins": -0.12214355170726776, "rewards/rejected": -0.08945312350988388, "step": 10 }, { "epoch": 0.009389671361502348, "grad_norm": 795.4360405196873, "learning_rate": 4.460093896713615e-08, "logits/chosen": -2.582812547683716, "logits/rejected": -2.44140625, "logps/chosen": -743.5999755859375, "logps/rejected": -823.5999755859375, "loss": 3.3411, "rewards/accuracies": 0.2515625059604645, "rewards/chosen": 0.2962890565395355, "rewards/margins": 0.4253906309604645, "rewards/rejected": -0.13002929091453552, "step": 20 }, { "epoch": 0.014084507042253521, "grad_norm": 494.5616113800972, "learning_rate": 6.807511737089202e-08, "logits/chosen": -2.4749999046325684, "logits/rejected": -2.2109375, "logps/chosen": -790.0, "logps/rejected": -916.7999877929688, "loss": 4.3715, "rewards/accuracies": 0.24531249701976776, "rewards/chosen": 0.011181640438735485, "rewards/margins": -0.4087890684604645, "rewards/rejected": 0.419921875, "step": 30 }, { "epoch": 0.018779342723004695, "grad_norm": 29663.688045294435, "learning_rate": 9.154929577464789e-08, "logits/chosen": -2.5609374046325684, "logits/rejected": -2.3414063453674316, "logps/chosen": -781.5999755859375, "logps/rejected": -950.0, "loss": 4.017, "rewards/accuracies": 0.2640624940395355, "rewards/chosen": -0.0010253905784338713, "rewards/margins": 0.37373048067092896, "rewards/rejected": -0.3775390684604645, "step": 40 }, { "epoch": 0.023474178403755867, "grad_norm": 1866.6692324977785, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -2.489062547683716, "logits/rejected": -2.3460936546325684, "logps/chosen": -762.7999877929688, "logps/rejected": -847.5999755859375, "loss": 3.7117, "rewards/accuracies": 0.25078123807907104, "rewards/chosen": 0.4264160096645355, "rewards/margins": 0.31297606229782104, "rewards/rejected": 0.11249999701976776, "step": 50 }, { "epoch": 0.028169014084507043, "grad_norm": 781.8139207708915, "learning_rate": 1.384976525821596e-07, "logits/chosen": -2.515625, "logits/rejected": -2.3265624046325684, "logps/chosen": -790.7999877929688, "logps/rejected": -919.5999755859375, "loss": 4.0642, "rewards/accuracies": 0.25703126192092896, "rewards/chosen": 0.09511718899011612, "rewards/margins": -0.09335937350988388, "rewards/rejected": 0.189208984375, "step": 60 }, { "epoch": 0.03286384976525822, "grad_norm": 535.2796307281731, "learning_rate": 1.619718309859155e-07, "logits/chosen": -2.3812499046325684, "logits/rejected": -2.180468797683716, "logps/chosen": -758.0, "logps/rejected": -826.4000244140625, "loss": 3.9391, "rewards/accuracies": 0.265625, "rewards/chosen": -0.20917968451976776, "rewards/margins": -0.04550781100988388, "rewards/rejected": -0.1646728515625, "step": 70 }, { "epoch": 0.03755868544600939, "grad_norm": 446.8209311523151, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -2.5953125953674316, "logits/rejected": -2.4140625, "logps/chosen": -751.2000122070312, "logps/rejected": -864.0, "loss": 5.0561, "rewards/accuracies": 0.24921874701976776, "rewards/chosen": -0.4513183534145355, "rewards/margins": -1.0255858898162842, "rewards/rejected": 0.5743163824081421, "step": 80 }, { "epoch": 0.04225352112676056, "grad_norm": 621.4110929271666, "learning_rate": 2.089201877934272e-07, "logits/chosen": -2.620312452316284, "logits/rejected": -2.457812547683716, "logps/chosen": -730.7999877929688, "logps/rejected": -827.2000122070312, "loss": 3.6887, "rewards/accuracies": 0.2632812559604645, "rewards/chosen": -0.20273438096046448, "rewards/margins": 0.530468761920929, "rewards/rejected": -0.7367187738418579, "step": 90 }, { "epoch": 0.046948356807511735, "grad_norm": 552.5703654191287, "learning_rate": 2.323943661971831e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.426562547683716, "logps/chosen": -774.0, "logps/rejected": -808.0, "loss": 4.0339, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": 0.06601562350988388, "rewards/margins": 0.03476562350988388, "rewards/rejected": 0.03028564527630806, "step": 100 }, { "epoch": 0.051643192488262914, "grad_norm": 497.9981942342969, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -2.645312547683716, "logits/rejected": -2.4828124046325684, "logps/chosen": -752.4000244140625, "logps/rejected": -885.2000122070312, "loss": 4.0214, "rewards/accuracies": 0.27265626192092896, "rewards/chosen": 0.2857910096645355, "rewards/margins": 0.7242187261581421, "rewards/rejected": -0.43608397245407104, "step": 110 }, { "epoch": 0.056338028169014086, "grad_norm": 2839.9771056846735, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -2.4820313453674316, "logits/rejected": -2.370312452316284, "logps/chosen": -778.7999877929688, "logps/rejected": -880.0, "loss": 3.8928, "rewards/accuracies": 0.2679687440395355, "rewards/chosen": 0.5595703125, "rewards/margins": 3.4248046875, "rewards/rejected": -2.882031202316284, "step": 120 }, { "epoch": 0.06103286384976526, "grad_norm": 2321.4271258065637, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -2.450000047683716, "logits/rejected": -2.328906297683716, "logps/chosen": -794.4000244140625, "logps/rejected": -914.0, "loss": 3.8254, "rewards/accuracies": 0.28125, "rewards/chosen": 0.07355956733226776, "rewards/margins": 0.49262696504592896, "rewards/rejected": -0.4170898497104645, "step": 130 }, { "epoch": 0.06572769953051644, "grad_norm": 1567.2456749763714, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -2.4593749046325684, "logits/rejected": -2.335156202316284, "logps/chosen": -778.4000244140625, "logps/rejected": -908.7999877929688, "loss": 3.4053, "rewards/accuracies": 0.24765625596046448, "rewards/chosen": 0.13657227158546448, "rewards/margins": 0.7059570550918579, "rewards/rejected": -0.570019543170929, "step": 140 }, { "epoch": 0.07042253521126761, "grad_norm": 708.2125750233134, "learning_rate": 3.497652582159624e-07, "logits/chosen": -2.526562452316284, "logits/rejected": -2.5093750953674316, "logps/chosen": -755.5999755859375, "logps/rejected": -870.7999877929688, "loss": 3.5788, "rewards/accuracies": 0.2562499940395355, "rewards/chosen": 0.05146484449505806, "rewards/margins": 0.5595703125, "rewards/rejected": -0.5091797113418579, "step": 150 }, { "epoch": 0.07511737089201878, "grad_norm": 631.5400236296861, "learning_rate": 3.732394366197183e-07, "logits/chosen": -2.503124952316284, "logits/rejected": -2.3382811546325684, "logps/chosen": -781.5999755859375, "logps/rejected": -887.5999755859375, "loss": 3.9919, "rewards/accuracies": 0.2671875059604645, "rewards/chosen": 0.19023437798023224, "rewards/margins": 3.1605467796325684, "rewards/rejected": -2.96484375, "step": 160 }, { "epoch": 0.07981220657276995, "grad_norm": 785.8584783654784, "learning_rate": 3.967136150234742e-07, "logits/chosen": -2.47265625, "logits/rejected": -2.289843797683716, "logps/chosen": -739.2000122070312, "logps/rejected": -877.5999755859375, "loss": 4.1042, "rewards/accuracies": 0.28437501192092896, "rewards/chosen": 0.174560546875, "rewards/margins": 2.25146484375, "rewards/rejected": -2.075000047683716, "step": 170 }, { "epoch": 0.08450704225352113, "grad_norm": 763.3628562607071, "learning_rate": 4.2018779342723e-07, "logits/chosen": -2.5140624046325684, "logits/rejected": -2.2992186546325684, "logps/chosen": -709.5999755859375, "logps/rejected": -887.5999755859375, "loss": 4.1191, "rewards/accuracies": 0.25, "rewards/chosen": -0.3524414002895355, "rewards/margins": -0.336669921875, "rewards/rejected": -0.01523437537252903, "step": 180 }, { "epoch": 0.0892018779342723, "grad_norm": 630.9536910315088, "learning_rate": 4.436619718309859e-07, "logits/chosen": -2.442187547683716, "logits/rejected": -2.358593702316284, "logps/chosen": -761.5999755859375, "logps/rejected": -824.4000244140625, "loss": 4.1129, "rewards/accuracies": 0.2640624940395355, "rewards/chosen": -0.07143554836511612, "rewards/margins": 0.3963256776332855, "rewards/rejected": -0.4695068299770355, "step": 190 }, { "epoch": 0.09389671361502347, "grad_norm": 469.0066969575735, "learning_rate": 4.671361502347418e-07, "logits/chosen": -2.479687452316284, "logits/rejected": -2.2890625, "logps/chosen": -740.4000244140625, "logps/rejected": -832.0, "loss": 3.8034, "rewards/accuracies": 0.28046876192092896, "rewards/chosen": 0.02991943433880806, "rewards/margins": 1.4141113758087158, "rewards/rejected": -1.3859374523162842, "step": 200 }, { "epoch": 0.09859154929577464, "grad_norm": 1323.8265146826134, "learning_rate": 4.906103286384976e-07, "logits/chosen": -2.496875047683716, "logits/rejected": -2.348437547683716, "logps/chosen": -759.2000122070312, "logps/rejected": -899.5999755859375, "loss": 3.5549, "rewards/accuracies": 0.26484376192092896, "rewards/chosen": 0.729785144329071, "rewards/margins": 3.080859422683716, "rewards/rejected": -2.3451905250549316, "step": 210 }, { "epoch": 0.10328638497652583, "grad_norm": 554.2134102745014, "learning_rate": 4.984350547730829e-07, "logits/chosen": -2.6234374046325684, "logits/rejected": -2.390625, "logps/chosen": -790.4000244140625, "logps/rejected": -863.5999755859375, "loss": 4.6254, "rewards/accuracies": 0.2679687440395355, "rewards/chosen": -0.20195312798023224, "rewards/margins": 0.504638671875, "rewards/rejected": -0.7046874761581421, "step": 220 }, { "epoch": 0.107981220657277, "grad_norm": 1064.8691121841825, "learning_rate": 4.958268127282212e-07, "logits/chosen": -2.5171875953674316, "logits/rejected": -2.399218797683716, "logps/chosen": -767.2000122070312, "logps/rejected": -889.2000122070312, "loss": 4.0596, "rewards/accuracies": 0.27812498807907104, "rewards/chosen": -0.05839843675494194, "rewards/margins": 1.6222655773162842, "rewards/rejected": -1.6826293468475342, "step": 230 }, { "epoch": 0.11267605633802817, "grad_norm": 437.35088415956835, "learning_rate": 4.932185706833594e-07, "logits/chosen": -2.442187547683716, "logits/rejected": -2.3890624046325684, "logps/chosen": -704.4000244140625, "logps/rejected": -750.7999877929688, "loss": 3.8325, "rewards/accuracies": 0.2945312559604645, "rewards/chosen": 0.12875977158546448, "rewards/margins": 0.897265613079071, "rewards/rejected": -0.7696533203125, "step": 240 }, { "epoch": 0.11737089201877934, "grad_norm": 10472.598448354647, "learning_rate": 4.906103286384976e-07, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.4007811546325684, "logps/chosen": -752.7999877929688, "logps/rejected": -888.4000244140625, "loss": 5.351, "rewards/accuracies": 0.25859373807907104, "rewards/chosen": -1.69580078125, "rewards/margins": -0.614062488079071, "rewards/rejected": -1.078222632408142, "step": 250 }, { "epoch": 0.12206572769953052, "grad_norm": 502.4444253016428, "learning_rate": 4.880020865936358e-07, "logits/chosen": -2.450000047683716, "logits/rejected": -2.42578125, "logps/chosen": -712.0, "logps/rejected": -824.0, "loss": 3.2333, "rewards/accuracies": 0.2632812559604645, "rewards/chosen": 0.04560546949505806, "rewards/margins": 1.2366211414337158, "rewards/rejected": -1.19287109375, "step": 260 }, { "epoch": 0.1267605633802817, "grad_norm": 739.3258331357866, "learning_rate": 4.853938445487741e-07, "logits/chosen": -2.4140625, "logits/rejected": -2.1070313453674316, "logps/chosen": -752.7999877929688, "logps/rejected": -875.5999755859375, "loss": 4.3557, "rewards/accuracies": 0.2789062559604645, "rewards/chosen": 0.46367186307907104, "rewards/margins": 0.6446288824081421, "rewards/rejected": -0.17988280951976776, "step": 270 }, { "epoch": 0.13145539906103287, "grad_norm": 732.1575140816506, "learning_rate": 4.827856025039123e-07, "logits/chosen": -2.53125, "logits/rejected": -2.4312500953674316, "logps/chosen": -764.0, "logps/rejected": -890.7999877929688, "loss": 3.9307, "rewards/accuracies": 0.2828125059604645, "rewards/chosen": 0.535937488079071, "rewards/margins": 5.005663871765137, "rewards/rejected": -4.469336032867432, "step": 280 }, { "epoch": 0.13615023474178403, "grad_norm": 680.260964177763, "learning_rate": 4.801773604590506e-07, "logits/chosen": -2.448437452316284, "logits/rejected": -2.4046874046325684, "logps/chosen": -808.0, "logps/rejected": -883.5999755859375, "loss": 4.015, "rewards/accuracies": 0.28437501192092896, "rewards/chosen": -0.513378918170929, "rewards/margins": 0.6968017816543579, "rewards/rejected": -1.2082030773162842, "step": 290 }, { "epoch": 0.14084507042253522, "grad_norm": 484.15589372431845, "learning_rate": 4.775691184141888e-07, "logits/chosen": -2.5796875953674316, "logits/rejected": -2.43359375, "logps/chosen": -771.2000122070312, "logps/rejected": -871.5999755859375, "loss": 3.4621, "rewards/accuracies": 0.2671875059604645, "rewards/chosen": 0.3833984434604645, "rewards/margins": 3.655712842941284, "rewards/rejected": -3.2718749046325684, "step": 300 }, { "epoch": 0.14553990610328638, "grad_norm": 1063.1094328134461, "learning_rate": 4.749608763693271e-07, "logits/chosen": -2.5390625, "logits/rejected": -2.3296875953674316, "logps/chosen": -734.4000244140625, "logps/rejected": -846.4000244140625, "loss": 3.8768, "rewards/accuracies": 0.2992187440395355, "rewards/chosen": -0.05756836012005806, "rewards/margins": 1.36328125, "rewards/rejected": -1.421289086341858, "step": 310 }, { "epoch": 0.15023474178403756, "grad_norm": 597.4164022776029, "learning_rate": 4.7235263432446533e-07, "logits/chosen": -2.5453124046325684, "logits/rejected": -2.3414063453674316, "logps/chosen": -752.4000244140625, "logps/rejected": -877.5999755859375, "loss": 3.6313, "rewards/accuracies": 0.2914062440395355, "rewards/chosen": 0.23031005263328552, "rewards/margins": 2.939208984375, "rewards/rejected": -2.709033250808716, "step": 320 }, { "epoch": 0.15492957746478872, "grad_norm": 836.9926173033505, "learning_rate": 4.6974439227960353e-07, "logits/chosen": -2.504687547683716, "logits/rejected": -2.2718749046325684, "logps/chosen": -782.7999877929688, "logps/rejected": -899.5999755859375, "loss": 3.8357, "rewards/accuracies": 0.30859375, "rewards/chosen": 0.248046875, "rewards/margins": 2.1177735328674316, "rewards/rejected": -1.8639647960662842, "step": 330 }, { "epoch": 0.1596244131455399, "grad_norm": 553.0501295303345, "learning_rate": 4.671361502347418e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.515625, "logps/chosen": -730.4000244140625, "logps/rejected": -787.2000122070312, "loss": 3.4912, "rewards/accuracies": 0.2992187440395355, "rewards/chosen": 0.689379870891571, "rewards/margins": 4.359765529632568, "rewards/rejected": -3.661425828933716, "step": 340 }, { "epoch": 0.1643192488262911, "grad_norm": 2669.757558666913, "learning_rate": 4.6452790818988004e-07, "logits/chosen": -2.5703125, "logits/rejected": -2.387500047683716, "logps/chosen": -785.2000122070312, "logps/rejected": -882.0, "loss": 3.3999, "rewards/accuracies": 0.2984375059604645, "rewards/chosen": 1.208593726158142, "rewards/margins": 1.7898437976837158, "rewards/rejected": -0.5843750238418579, "step": 350 }, { "epoch": 0.16901408450704225, "grad_norm": 740.4010783124489, "learning_rate": 4.6191966614501824e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.332812547683716, "logps/chosen": -743.2000122070312, "logps/rejected": -851.5999755859375, "loss": 3.4666, "rewards/accuracies": 0.3218750059604645, "rewards/chosen": 0.512890636920929, "rewards/margins": 6.828515529632568, "rewards/rejected": -6.310327053070068, "step": 360 }, { "epoch": 0.17370892018779344, "grad_norm": 2942.807860511629, "learning_rate": 4.593114241001565e-07, "logits/chosen": -2.448437452316284, "logits/rejected": -2.3531250953674316, "logps/chosen": -805.5999755859375, "logps/rejected": -886.0, "loss": 3.7938, "rewards/accuracies": 0.30859375, "rewards/chosen": -0.04311523586511612, "rewards/margins": 2.0357422828674316, "rewards/rejected": -2.08056640625, "step": 370 }, { "epoch": 0.1784037558685446, "grad_norm": 592.1991244589053, "learning_rate": 4.5670318205529474e-07, "logits/chosen": -2.4234375953674316, "logits/rejected": -2.2164063453674316, "logps/chosen": -761.5999755859375, "logps/rejected": -885.5999755859375, "loss": 4.5918, "rewards/accuracies": 0.30859375, "rewards/chosen": -0.115234375, "rewards/margins": 1.762792944908142, "rewards/rejected": -1.8748047351837158, "step": 380 }, { "epoch": 0.18309859154929578, "grad_norm": 4553.100945645485, "learning_rate": 4.54094940010433e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.30859375, "logps/chosen": -775.5999755859375, "logps/rejected": -920.4000244140625, "loss": 3.4011, "rewards/accuracies": 0.31640625, "rewards/chosen": 0.701953113079071, "rewards/margins": 3.6348633766174316, "rewards/rejected": -2.9296875, "step": 390 }, { "epoch": 0.18779342723004694, "grad_norm": 661.379018707722, "learning_rate": 4.514866979655712e-07, "logits/chosen": -2.5296874046325684, "logits/rejected": -2.33203125, "logps/chosen": -767.5999755859375, "logps/rejected": -868.0, "loss": 3.7633, "rewards/accuracies": 0.3226562440395355, "rewards/chosen": 0.591015636920929, "rewards/margins": 4.783984184265137, "rewards/rejected": -4.193957328796387, "step": 400 }, { "epoch": 0.19248826291079812, "grad_norm": 1045.15590730633, "learning_rate": 4.4887845592070945e-07, "logits/chosen": -2.4703125953674316, "logits/rejected": -2.278125047683716, "logps/chosen": -732.7999877929688, "logps/rejected": -890.0, "loss": 3.482, "rewards/accuracies": 0.3335937559604645, "rewards/chosen": 0.6034179925918579, "rewards/margins": 2.454296827316284, "rewards/rejected": -1.852148413658142, "step": 410 }, { "epoch": 0.19718309859154928, "grad_norm": 729.3198056456259, "learning_rate": 4.462702138758477e-07, "logits/chosen": -2.4296875, "logits/rejected": -2.3187499046325684, "logps/chosen": -789.2000122070312, "logps/rejected": -878.0, "loss": 4.0816, "rewards/accuracies": 0.3125, "rewards/chosen": 0.571093738079071, "rewards/margins": 3.8039307594299316, "rewards/rejected": -3.2376952171325684, "step": 420 }, { "epoch": 0.20187793427230047, "grad_norm": 705.1605522022758, "learning_rate": 4.436619718309859e-07, "logits/chosen": -2.43359375, "logits/rejected": -2.1351561546325684, "logps/chosen": -760.7999877929688, "logps/rejected": -910.4000244140625, "loss": 3.596, "rewards/accuracies": 0.32421875, "rewards/chosen": 0.577343761920929, "rewards/margins": 3.344799757003784, "rewards/rejected": -2.7671875953674316, "step": 430 }, { "epoch": 0.20657276995305165, "grad_norm": 557.8632666657721, "learning_rate": 4.4105372978612415e-07, "logits/chosen": -2.5609374046325684, "logits/rejected": -2.464062452316284, "logps/chosen": -780.7999877929688, "logps/rejected": -896.7999877929688, "loss": 3.7354, "rewards/accuracies": 0.36015623807907104, "rewards/chosen": 1.197607398033142, "rewards/margins": 6.650000095367432, "rewards/rejected": -5.451562404632568, "step": 440 }, { "epoch": 0.2112676056338028, "grad_norm": 1032.671415110097, "learning_rate": 4.384454877412624e-07, "logits/chosen": -2.6171875, "logits/rejected": -2.2867188453674316, "logps/chosen": -747.2000122070312, "logps/rejected": -987.2000122070312, "loss": 3.4085, "rewards/accuracies": 0.35546875, "rewards/chosen": 1.0007812976837158, "rewards/margins": 5.028124809265137, "rewards/rejected": -4.025781154632568, "step": 450 }, { "epoch": 0.215962441314554, "grad_norm": 655.9631561504643, "learning_rate": 4.358372456964006e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.3265624046325684, "logps/chosen": -752.0, "logps/rejected": -885.5999755859375, "loss": 2.9593, "rewards/accuracies": 0.3492187559604645, "rewards/chosen": 1.330468773841858, "rewards/margins": 4.120312690734863, "rewards/rejected": -2.7900390625, "step": 460 }, { "epoch": 0.22065727699530516, "grad_norm": 432.09840432611537, "learning_rate": 4.3322900365153886e-07, "logits/chosen": -2.604687452316284, "logits/rejected": -2.417187452316284, "logps/chosen": -810.0, "logps/rejected": -873.5999755859375, "loss": 5.1664, "rewards/accuracies": 0.3382812440395355, "rewards/chosen": -0.9012695550918579, "rewards/margins": 1.2035400867462158, "rewards/rejected": -2.1019530296325684, "step": 470 }, { "epoch": 0.22535211267605634, "grad_norm": 452.9827108309734, "learning_rate": 4.306207616066771e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.409374952316284, "logps/chosen": -746.7999877929688, "logps/rejected": -850.4000244140625, "loss": 3.9037, "rewards/accuracies": 0.33515626192092896, "rewards/chosen": 0.18442383408546448, "rewards/margins": 2.8453125953674316, "rewards/rejected": -2.657421827316284, "step": 480 }, { "epoch": 0.2300469483568075, "grad_norm": 5492.5122378169, "learning_rate": 4.280125195618153e-07, "logits/chosen": -2.5453124046325684, "logits/rejected": -2.2734375, "logps/chosen": -754.0, "logps/rejected": -880.0, "loss": 3.8738, "rewards/accuracies": 0.33671873807907104, "rewards/chosen": 0.385986328125, "rewards/margins": 3.467968702316284, "rewards/rejected": -3.083984375, "step": 490 }, { "epoch": 0.2347417840375587, "grad_norm": 1167.4501255643606, "learning_rate": 4.2540427751695357e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.3843750953674316, "logps/chosen": -779.5999755859375, "logps/rejected": -866.7999877929688, "loss": 4.1098, "rewards/accuracies": 0.3257812559604645, "rewards/chosen": 0.09238281100988388, "rewards/margins": 3.05029296875, "rewards/rejected": -2.9576172828674316, "step": 500 }, { "epoch": 0.23943661971830985, "grad_norm": 332.63817888625744, "learning_rate": 4.227960354720918e-07, "logits/chosen": -2.5765624046325684, "logits/rejected": -2.3921875953674316, "logps/chosen": -752.7999877929688, "logps/rejected": -910.4000244140625, "loss": 3.7576, "rewards/accuracies": 0.34375, "rewards/chosen": 1.048242211341858, "rewards/margins": 3.6106934547424316, "rewards/rejected": -2.5645508766174316, "step": 510 }, { "epoch": 0.24413145539906103, "grad_norm": 502.03856699077505, "learning_rate": 4.2018779342723e-07, "logits/chosen": -2.479687452316284, "logits/rejected": -2.4765625, "logps/chosen": -772.7999877929688, "logps/rejected": -847.2000122070312, "loss": 3.5195, "rewards/accuracies": 0.34062498807907104, "rewards/chosen": 0.878857433795929, "rewards/margins": 5.823339939117432, "rewards/rejected": -4.945703029632568, "step": 520 }, { "epoch": 0.24882629107981222, "grad_norm": 771.1277054977296, "learning_rate": 4.1757955138236827e-07, "logits/chosen": -2.5390625, "logits/rejected": -2.2914061546325684, "logps/chosen": -742.4000244140625, "logps/rejected": -880.0, "loss": 3.6895, "rewards/accuracies": 0.359375, "rewards/chosen": 0.79052734375, "rewards/margins": 4.889843940734863, "rewards/rejected": -4.102880954742432, "step": 530 }, { "epoch": 0.2535211267605634, "grad_norm": 530.6892827947637, "learning_rate": 4.149713093375065e-07, "logits/chosen": -2.6015625, "logits/rejected": -2.340625047683716, "logps/chosen": -746.0, "logps/rejected": -801.5999755859375, "loss": 3.3273, "rewards/accuracies": 0.3765625059604645, "rewards/chosen": 1.501074194908142, "rewards/margins": 3.7425780296325684, "rewards/rejected": -2.2427735328674316, "step": 540 }, { "epoch": 0.25821596244131456, "grad_norm": 973.8686718903007, "learning_rate": 4.123630672926447e-07, "logits/chosen": -2.5796875953674316, "logits/rejected": -2.391406297683716, "logps/chosen": -726.7999877929688, "logps/rejected": -867.2000122070312, "loss": 3.529, "rewards/accuracies": 0.3421874940395355, "rewards/chosen": 0.7134765386581421, "rewards/margins": 5.2177734375, "rewards/rejected": -4.508984565734863, "step": 550 }, { "epoch": 0.26291079812206575, "grad_norm": 407.83342420314153, "learning_rate": 4.09754825247783e-07, "logits/chosen": -2.5140624046325684, "logits/rejected": -2.25390625, "logps/chosen": -760.0, "logps/rejected": -869.5999755859375, "loss": 3.2248, "rewards/accuracies": 0.37187498807907104, "rewards/chosen": 1.899999976158142, "rewards/margins": 4.778124809265137, "rewards/rejected": -2.877002000808716, "step": 560 }, { "epoch": 0.2676056338028169, "grad_norm": 302.72908809116825, "learning_rate": 4.0714658320292123e-07, "logits/chosen": -2.6812500953674316, "logits/rejected": -2.473437547683716, "logps/chosen": -735.5999755859375, "logps/rejected": -859.5999755859375, "loss": 4.9141, "rewards/accuracies": 0.359375, "rewards/chosen": -0.07233886420726776, "rewards/margins": 3.760546922683716, "rewards/rejected": -3.8359375, "step": 570 }, { "epoch": 0.27230046948356806, "grad_norm": 669.0778365945856, "learning_rate": 4.045383411580595e-07, "logits/chosen": -2.546875, "logits/rejected": -2.332812547683716, "logps/chosen": -736.4000244140625, "logps/rejected": -873.2000122070312, "loss": 4.09, "rewards/accuracies": 0.3617187440395355, "rewards/chosen": 0.733691394329071, "rewards/margins": 3.9351563453674316, "rewards/rejected": -3.203906297683716, "step": 580 }, { "epoch": 0.27699530516431925, "grad_norm": 388.3348301215674, "learning_rate": 4.019300991131977e-07, "logits/chosen": -2.417187452316284, "logits/rejected": -2.3265624046325684, "logps/chosen": -742.0, "logps/rejected": -861.5999755859375, "loss": 3.6498, "rewards/accuracies": 0.3734374940395355, "rewards/chosen": 1.2927734851837158, "rewards/margins": 5.004687309265137, "rewards/rejected": -3.7132811546325684, "step": 590 }, { "epoch": 0.28169014084507044, "grad_norm": 776.7173779793094, "learning_rate": 3.9932185706833594e-07, "logits/chosen": -2.535937547683716, "logits/rejected": -2.299999952316284, "logps/chosen": -771.5999755859375, "logps/rejected": -848.4000244140625, "loss": 3.9873, "rewards/accuracies": 0.35859376192092896, "rewards/chosen": 1.2136719226837158, "rewards/margins": 3.6937499046325684, "rewards/rejected": -2.4820313453674316, "step": 600 }, { "epoch": 0.2863849765258216, "grad_norm": 550.6462713949477, "learning_rate": 3.967136150234742e-07, "logits/chosen": -2.442187547683716, "logits/rejected": -2.30859375, "logps/chosen": -783.5999755859375, "logps/rejected": -866.0, "loss": 3.6671, "rewards/accuracies": 0.34843748807907104, "rewards/chosen": 1.035253882408142, "rewards/margins": 3.5406250953674316, "rewards/rejected": -2.5028319358825684, "step": 610 }, { "epoch": 0.29107981220657275, "grad_norm": 619.879333340021, "learning_rate": 3.941053729786124e-07, "logits/chosen": -2.481250047683716, "logits/rejected": -2.2734375, "logps/chosen": -780.7999877929688, "logps/rejected": -882.0, "loss": 3.2713, "rewards/accuracies": 0.3578124940395355, "rewards/chosen": 1.43115234375, "rewards/margins": 4.9141845703125, "rewards/rejected": -3.477832078933716, "step": 620 }, { "epoch": 0.29577464788732394, "grad_norm": 405.60589163435407, "learning_rate": 3.9149713093375064e-07, "logits/chosen": -2.4515624046325684, "logits/rejected": -2.2578125, "logps/chosen": -757.5999755859375, "logps/rejected": -907.2000122070312, "loss": 4.1245, "rewards/accuracies": 0.375, "rewards/chosen": 0.6396484375, "rewards/margins": 6.715624809265137, "rewards/rejected": -6.079687595367432, "step": 630 }, { "epoch": 0.3004694835680751, "grad_norm": 1366.714756802284, "learning_rate": 3.888888888888889e-07, "logits/chosen": -2.5140624046325684, "logits/rejected": -2.289843797683716, "logps/chosen": -742.0, "logps/rejected": -914.7999877929688, "loss": 3.5693, "rewards/accuracies": 0.3929687440395355, "rewards/chosen": 0.6029297113418579, "rewards/margins": 7.671875, "rewards/rejected": -7.057812690734863, "step": 640 }, { "epoch": 0.3051643192488263, "grad_norm": 499.4118498197771, "learning_rate": 3.862806468440271e-07, "logits/chosen": -2.6734375953674316, "logits/rejected": -2.557812452316284, "logps/chosen": -695.2000122070312, "logps/rejected": -801.5999755859375, "loss": 3.6225, "rewards/accuracies": 0.37421876192092896, "rewards/chosen": 1.0159912109375, "rewards/margins": 4.239843845367432, "rewards/rejected": -3.223828077316284, "step": 650 }, { "epoch": 0.30985915492957744, "grad_norm": 2923.9992019418596, "learning_rate": 3.8367240479916535e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.4749999046325684, "logps/chosen": -744.4000244140625, "logps/rejected": -822.7999877929688, "loss": 3.6278, "rewards/accuracies": 0.3695312440395355, "rewards/chosen": 0.9283202886581421, "rewards/margins": 3.9336915016174316, "rewards/rejected": -3.004687547683716, "step": 660 }, { "epoch": 0.3145539906103286, "grad_norm": 569.6849507966, "learning_rate": 3.810641627543036e-07, "logits/chosen": -2.6546874046325684, "logits/rejected": -2.389843702316284, "logps/chosen": -744.4000244140625, "logps/rejected": -864.7999877929688, "loss": 3.4553, "rewards/accuracies": 0.39140623807907104, "rewards/chosen": 1.55126953125, "rewards/margins": 9.740625381469727, "rewards/rejected": -8.171483993530273, "step": 670 }, { "epoch": 0.3192488262910798, "grad_norm": 534.6767062419108, "learning_rate": 3.784559207094418e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.375, "logps/chosen": -768.4000244140625, "logps/rejected": -847.5999755859375, "loss": 4.1308, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 1.5206298828125, "rewards/margins": 5.283203125, "rewards/rejected": -3.764843702316284, "step": 680 }, { "epoch": 0.323943661971831, "grad_norm": 421.2227352043194, "learning_rate": 3.7584767866458005e-07, "logits/chosen": -2.510937452316284, "logits/rejected": -2.3343749046325684, "logps/chosen": -760.7999877929688, "logps/rejected": -841.2000122070312, "loss": 4.6036, "rewards/accuracies": 0.38203126192092896, "rewards/chosen": 1.15625, "rewards/margins": 3.3667969703674316, "rewards/rejected": -2.2106690406799316, "step": 690 }, { "epoch": 0.3286384976525822, "grad_norm": 550.222250946523, "learning_rate": 3.732394366197183e-07, "logits/chosen": -2.573437452316284, "logits/rejected": -2.339062452316284, "logps/chosen": -789.5999755859375, "logps/rejected": -830.7999877929688, "loss": 4.4506, "rewards/accuracies": 0.37421876192092896, "rewards/chosen": 0.47871094942092896, "rewards/margins": 5.626367092132568, "rewards/rejected": -5.14599609375, "step": 700 }, { "epoch": 0.3333333333333333, "grad_norm": 464.88793773987993, "learning_rate": 3.706311945748565e-07, "logits/chosen": -2.551562547683716, "logits/rejected": -2.305468797683716, "logps/chosen": -808.0, "logps/rejected": -849.5999755859375, "loss": 4.5459, "rewards/accuracies": 0.37968748807907104, "rewards/chosen": 0.10097656399011612, "rewards/margins": 5.934423923492432, "rewards/rejected": -5.839062690734863, "step": 710 }, { "epoch": 0.3380281690140845, "grad_norm": 813.4903124605913, "learning_rate": 3.6802295252999476e-07, "logits/chosen": -2.4703125953674316, "logits/rejected": -2.167187452316284, "logps/chosen": -782.4000244140625, "logps/rejected": -920.7999877929688, "loss": 4.2721, "rewards/accuracies": 0.3851562440395355, "rewards/chosen": 1.151953101158142, "rewards/margins": 6.137499809265137, "rewards/rejected": -4.985937595367432, "step": 720 }, { "epoch": 0.3427230046948357, "grad_norm": 956.1257350396455, "learning_rate": 3.65414710485133e-07, "logits/chosen": -2.5296874046325684, "logits/rejected": -2.417187452316284, "logps/chosen": -777.5999755859375, "logps/rejected": -843.2000122070312, "loss": 3.961, "rewards/accuracies": 0.39140623807907104, "rewards/chosen": 1.1710937023162842, "rewards/margins": 6.31640625, "rewards/rejected": -5.139062404632568, "step": 730 }, { "epoch": 0.3474178403755869, "grad_norm": 565.641108432113, "learning_rate": 3.6280646844027127e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.4117188453674316, "logps/chosen": -756.0, "logps/rejected": -922.7999877929688, "loss": 4.1313, "rewards/accuracies": 0.38359373807907104, "rewards/chosen": 1.1298828125, "rewards/margins": 4.427734375, "rewards/rejected": -3.298046827316284, "step": 740 }, { "epoch": 0.352112676056338, "grad_norm": 750.2292394754626, "learning_rate": 3.6019822639540947e-07, "logits/chosen": -2.464062452316284, "logits/rejected": -2.2359375953674316, "logps/chosen": -788.0, "logps/rejected": -832.4000244140625, "loss": 3.4955, "rewards/accuracies": 0.3921875059604645, "rewards/chosen": 1.6628906726837158, "rewards/margins": 7.282812595367432, "rewards/rejected": -5.615624904632568, "step": 750 }, { "epoch": 0.3568075117370892, "grad_norm": 557.4905266376433, "learning_rate": 3.575899843505477e-07, "logits/chosen": -2.48046875, "logits/rejected": -2.335156202316284, "logps/chosen": -760.4000244140625, "logps/rejected": -810.4000244140625, "loss": 3.7194, "rewards/accuracies": 0.38203126192092896, "rewards/chosen": 1.4665038585662842, "rewards/margins": 5.578125, "rewards/rejected": -4.11328125, "step": 760 }, { "epoch": 0.3615023474178404, "grad_norm": 433.8262978893309, "learning_rate": 3.5498174230568597e-07, "logits/chosen": -2.450000047683716, "logits/rejected": -2.30859375, "logps/chosen": -745.5999755859375, "logps/rejected": -848.4000244140625, "loss": 3.7391, "rewards/accuracies": 0.3773437440395355, "rewards/chosen": 1.3266112804412842, "rewards/margins": 5.746874809265137, "rewards/rejected": -4.421484470367432, "step": 770 }, { "epoch": 0.36619718309859156, "grad_norm": 2071.8092091562908, "learning_rate": 3.5237350026082417e-07, "logits/chosen": -2.5703125, "logits/rejected": -2.3531250953674316, "logps/chosen": -762.4000244140625, "logps/rejected": -808.4000244140625, "loss": 4.2241, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 0.672656238079071, "rewards/margins": 4.354687690734863, "rewards/rejected": -3.6761717796325684, "step": 780 }, { "epoch": 0.37089201877934275, "grad_norm": 7457.507699465494, "learning_rate": 3.497652582159624e-07, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.3296875953674316, "logps/chosen": -753.5999755859375, "logps/rejected": -892.4000244140625, "loss": 3.9503, "rewards/accuracies": 0.40312498807907104, "rewards/chosen": 1.294921875, "rewards/margins": 7.037499904632568, "rewards/rejected": -5.741406440734863, "step": 790 }, { "epoch": 0.3755868544600939, "grad_norm": 1445.875583313148, "learning_rate": 3.471570161711007e-07, "logits/chosen": -2.4859375953674316, "logits/rejected": -2.253124952316284, "logps/chosen": -804.7999877929688, "logps/rejected": -920.4000244140625, "loss": 4.3674, "rewards/accuracies": 0.4046874940395355, "rewards/chosen": 1.0244140625, "rewards/margins": 5.954687595367432, "rewards/rejected": -4.928515434265137, "step": 800 }, { "epoch": 0.38028169014084506, "grad_norm": 376.19559644235187, "learning_rate": 3.445487741262389e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.391406297683716, "logps/chosen": -739.5999755859375, "logps/rejected": -834.4000244140625, "loss": 3.8772, "rewards/accuracies": 0.3828125, "rewards/chosen": 0.706835925579071, "rewards/margins": 6.7578125, "rewards/rejected": -6.046875, "step": 810 }, { "epoch": 0.38497652582159625, "grad_norm": 399.8943174401036, "learning_rate": 3.4194053208137713e-07, "logits/chosen": -2.565624952316284, "logits/rejected": -2.4164061546325684, "logps/chosen": -752.4000244140625, "logps/rejected": -839.5999755859375, "loss": 4.4075, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 1.2795898914337158, "rewards/margins": 7.703125, "rewards/rejected": -6.4375, "step": 820 }, { "epoch": 0.38967136150234744, "grad_norm": 540.1545873421608, "learning_rate": 3.393322900365154e-07, "logits/chosen": -2.5078125, "logits/rejected": -2.313281297683716, "logps/chosen": -705.5999755859375, "logps/rejected": -811.5999755859375, "loss": 3.9295, "rewards/accuracies": 0.39140623807907104, "rewards/chosen": 1.4328124523162842, "rewards/margins": 7.010937690734863, "rewards/rejected": -5.579980373382568, "step": 830 }, { "epoch": 0.39436619718309857, "grad_norm": 398.95949972688743, "learning_rate": 3.367240479916536e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.3499999046325684, "logps/chosen": -797.5999755859375, "logps/rejected": -887.5999755859375, "loss": 4.1677, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 1.8445312976837158, "rewards/margins": 6.669335842132568, "rewards/rejected": -4.823828220367432, "step": 840 }, { "epoch": 0.39906103286384975, "grad_norm": 378.67988404553887, "learning_rate": 3.3411580594679184e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.370312452316284, "logps/chosen": -739.5999755859375, "logps/rejected": -846.4000244140625, "loss": 3.7494, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 1.6320312023162842, "rewards/margins": 6.028124809265137, "rewards/rejected": -4.400000095367432, "step": 850 }, { "epoch": 0.40375586854460094, "grad_norm": 656.500868911795, "learning_rate": 3.315075639019301e-07, "logits/chosen": -2.489062547683716, "logits/rejected": -2.3843750953674316, "logps/chosen": -735.5999755859375, "logps/rejected": -852.4000244140625, "loss": 3.8746, "rewards/accuracies": 0.40546876192092896, "rewards/chosen": 1.168359398841858, "rewards/margins": 6.262499809265137, "rewards/rejected": -5.090624809265137, "step": 860 }, { "epoch": 0.4084507042253521, "grad_norm": 625.0679463603142, "learning_rate": 3.288993218570683e-07, "logits/chosen": -2.503124952316284, "logits/rejected": -2.2523436546325684, "logps/chosen": -750.4000244140625, "logps/rejected": -840.7999877929688, "loss": 4.3809, "rewards/accuracies": 0.3804687559604645, "rewards/chosen": 1.46435546875, "rewards/margins": 4.899023532867432, "rewards/rejected": -3.442578077316284, "step": 870 }, { "epoch": 0.4131455399061033, "grad_norm": 839.3821506173657, "learning_rate": 3.2629107981220654e-07, "logits/chosen": -2.5374999046325684, "logits/rejected": -2.3062500953674316, "logps/chosen": -775.2000122070312, "logps/rejected": -860.0, "loss": 3.6519, "rewards/accuracies": 0.4046874940395355, "rewards/chosen": 2.0689454078674316, "rewards/margins": 7.540625095367432, "rewards/rejected": -5.477343559265137, "step": 880 }, { "epoch": 0.41784037558685444, "grad_norm": 519.695619682195, "learning_rate": 3.236828377673448e-07, "logits/chosen": -2.5140624046325684, "logits/rejected": -2.421875, "logps/chosen": -804.7999877929688, "logps/rejected": -850.4000244140625, "loss": 6.8044, "rewards/accuracies": 0.3890624940395355, "rewards/chosen": -1.004296898841858, "rewards/margins": 5.70703125, "rewards/rejected": -6.724999904632568, "step": 890 }, { "epoch": 0.4225352112676056, "grad_norm": 493.47752227203154, "learning_rate": 3.2107459572248305e-07, "logits/chosen": -2.504687547683716, "logits/rejected": -2.3734374046325684, "logps/chosen": -770.4000244140625, "logps/rejected": -847.2000122070312, "loss": 3.45, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 1.75, "rewards/margins": 5.90625, "rewards/rejected": -4.16796875, "step": 900 }, { "epoch": 0.4272300469483568, "grad_norm": 10593.943476212193, "learning_rate": 3.1846635367762125e-07, "logits/chosen": -2.410937547683716, "logits/rejected": -2.2828125953674316, "logps/chosen": -774.4000244140625, "logps/rejected": -874.4000244140625, "loss": 3.9206, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": 1.322363257408142, "rewards/margins": 6.353515625, "rewards/rejected": -5.038476467132568, "step": 910 }, { "epoch": 0.431924882629108, "grad_norm": 479.1515863081533, "learning_rate": 3.158581116327595e-07, "logits/chosen": -2.4203124046325684, "logits/rejected": -2.38671875, "logps/chosen": -780.4000244140625, "logps/rejected": -925.5999755859375, "loss": 3.9953, "rewards/accuracies": 0.41484373807907104, "rewards/chosen": 1.800195336341858, "rewards/margins": 8.231249809265137, "rewards/rejected": -6.442187309265137, "step": 920 }, { "epoch": 0.43661971830985913, "grad_norm": 309.4993820468666, "learning_rate": 3.1324986958789775e-07, "logits/chosen": -2.5296874046325684, "logits/rejected": -2.4164061546325684, "logps/chosen": -717.2000122070312, "logps/rejected": -838.7999877929688, "loss": 3.454, "rewards/accuracies": 0.40625, "rewards/chosen": 1.9074218273162842, "rewards/margins": 7.676562309265137, "rewards/rejected": -5.77734375, "step": 930 }, { "epoch": 0.4413145539906103, "grad_norm": 449.457169211395, "learning_rate": 3.1064162754303595e-07, "logits/chosen": -2.4390625953674316, "logits/rejected": -2.3140625953674316, "logps/chosen": -791.2000122070312, "logps/rejected": -891.2000122070312, "loss": 3.8918, "rewards/accuracies": 0.4039062559604645, "rewards/chosen": 1.931249976158142, "rewards/margins": 9.203125, "rewards/rejected": -7.2734375, "step": 940 }, { "epoch": 0.4460093896713615, "grad_norm": 359.67755776356705, "learning_rate": 3.080333854981742e-07, "logits/chosen": -2.492968797683716, "logits/rejected": -2.356250047683716, "logps/chosen": -762.0, "logps/rejected": -812.0, "loss": 4.991, "rewards/accuracies": 0.39921873807907104, "rewards/chosen": 1.846289038658142, "rewards/margins": 4.626562595367432, "rewards/rejected": -2.783740282058716, "step": 950 }, { "epoch": 0.4507042253521127, "grad_norm": 527.6209037583344, "learning_rate": 3.0542514345331246e-07, "logits/chosen": -2.4859375953674316, "logits/rejected": -2.2750000953674316, "logps/chosen": -761.5999755859375, "logps/rejected": -926.7999877929688, "loss": 5.4318, "rewards/accuracies": 0.4085937440395355, "rewards/chosen": -0.19394531846046448, "rewards/margins": 8.131250381469727, "rewards/rejected": -8.321874618530273, "step": 960 }, { "epoch": 0.45539906103286387, "grad_norm": 638.1379874447194, "learning_rate": 3.0281690140845066e-07, "logits/chosen": -2.3921875953674316, "logits/rejected": -2.3375000953674316, "logps/chosen": -820.0, "logps/rejected": -888.0, "loss": 4.226, "rewards/accuracies": 0.40312498807907104, "rewards/chosen": 1.9226562976837158, "rewards/margins": 5.69921875, "rewards/rejected": -3.766796827316284, "step": 970 }, { "epoch": 0.460093896713615, "grad_norm": 942.6247775437032, "learning_rate": 3.002086593635889e-07, "logits/chosen": -2.5859375, "logits/rejected": -2.354687452316284, "logps/chosen": -771.5999755859375, "logps/rejected": -899.2000122070312, "loss": 4.0496, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 1.5322265625, "rewards/margins": 7.767187595367432, "rewards/rejected": -6.242968559265137, "step": 980 }, { "epoch": 0.4647887323943662, "grad_norm": 3626.1298158125637, "learning_rate": 2.9760041731872716e-07, "logits/chosen": -2.65625, "logits/rejected": -2.4742188453674316, "logps/chosen": -702.0, "logps/rejected": -838.4000244140625, "loss": 4.7111, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 1.496484398841858, "rewards/margins": 5.602343559265137, "rewards/rejected": -4.102978706359863, "step": 990 }, { "epoch": 0.4694835680751174, "grad_norm": 406.49649089797106, "learning_rate": 2.9499217527386536e-07, "logits/chosen": -2.503124952316284, "logits/rejected": -2.4046874046325684, "logps/chosen": -747.5999755859375, "logps/rejected": -860.4000244140625, "loss": 4.0069, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 1.5046875476837158, "rewards/margins": 6.682031154632568, "rewards/rejected": -5.184374809265137, "step": 1000 }, { "epoch": 0.47417840375586856, "grad_norm": 506.72006818340515, "learning_rate": 2.923839332290036e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.280468702316284, "logps/chosen": -742.0, "logps/rejected": -886.7999877929688, "loss": 4.3592, "rewards/accuracies": 0.42890626192092896, "rewards/chosen": 1.080712914466858, "rewards/margins": 8.725000381469727, "rewards/rejected": -7.650781154632568, "step": 1010 }, { "epoch": 0.4788732394366197, "grad_norm": 331.6698829751223, "learning_rate": 2.8977569118414187e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.4859375953674316, "logps/chosen": -724.7999877929688, "logps/rejected": -861.5999755859375, "loss": 3.3359, "rewards/accuracies": 0.421875, "rewards/chosen": 1.74072265625, "rewards/margins": 8.039843559265137, "rewards/rejected": -6.299218654632568, "step": 1020 }, { "epoch": 0.4835680751173709, "grad_norm": 724.8297816261552, "learning_rate": 2.8716744913928007e-07, "logits/chosen": -2.53125, "logits/rejected": -2.3109374046325684, "logps/chosen": -750.0, "logps/rejected": -828.4000244140625, "loss": 4.0032, "rewards/accuracies": 0.41484373807907104, "rewards/chosen": 1.715234398841858, "rewards/margins": 6.314062595367432, "rewards/rejected": -4.590624809265137, "step": 1030 }, { "epoch": 0.48826291079812206, "grad_norm": 327.79228392344027, "learning_rate": 2.845592070944183e-07, "logits/chosen": -2.5218749046325684, "logits/rejected": -2.3343749046325684, "logps/chosen": -757.2000122070312, "logps/rejected": -891.5999755859375, "loss": 3.2396, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 2.3446288108825684, "rewards/margins": 7.74609375, "rewards/rejected": -5.407812595367432, "step": 1040 }, { "epoch": 0.49295774647887325, "grad_norm": 6608.484232586509, "learning_rate": 2.819509650495566e-07, "logits/chosen": -2.567187547683716, "logits/rejected": -2.30859375, "logps/chosen": -736.4000244140625, "logps/rejected": -837.5999755859375, "loss": 3.9455, "rewards/accuracies": 0.43359375, "rewards/chosen": 2.247363328933716, "rewards/margins": 7.001562595367432, "rewards/rejected": -4.753125190734863, "step": 1050 }, { "epoch": 0.49765258215962443, "grad_norm": 349.943533688359, "learning_rate": 2.7934272300469483e-07, "logits/chosen": -2.549999952316284, "logits/rejected": -2.2757811546325684, "logps/chosen": -785.2000122070312, "logps/rejected": -850.7999877929688, "loss": 5.3839, "rewards/accuracies": 0.4140625, "rewards/chosen": 0.3746093809604645, "rewards/margins": 7.790625095367432, "rewards/rejected": -7.41796875, "step": 1060 }, { "epoch": 0.5023474178403756, "grad_norm": 434.98130349645817, "learning_rate": 2.7673448095983303e-07, "logits/chosen": -2.4921875, "logits/rejected": -2.3140625953674316, "logps/chosen": -751.2000122070312, "logps/rejected": -847.2000122070312, "loss": 3.5818, "rewards/accuracies": 0.40625, "rewards/chosen": 1.982031226158142, "rewards/margins": 6.810937404632568, "rewards/rejected": -4.827538967132568, "step": 1070 }, { "epoch": 0.5070422535211268, "grad_norm": 430.3282615170605, "learning_rate": 2.741262389149713e-07, "logits/chosen": -2.4437499046325684, "logits/rejected": -2.2054686546325684, "logps/chosen": -777.2000122070312, "logps/rejected": -877.5999755859375, "loss": 4.0664, "rewards/accuracies": 0.4085937440395355, "rewards/chosen": 2.3507323265075684, "rewards/margins": 7.624218940734863, "rewards/rejected": -5.269921779632568, "step": 1080 }, { "epoch": 0.5117370892018779, "grad_norm": 826.0539068917697, "learning_rate": 2.7151799687010953e-07, "logits/chosen": -2.492968797683716, "logits/rejected": -2.2109375, "logps/chosen": -766.4000244140625, "logps/rejected": -879.5999755859375, "loss": 4.054, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.838574230670929, "rewards/margins": 7.073437690734863, "rewards/rejected": -6.241406440734863, "step": 1090 }, { "epoch": 0.5164319248826291, "grad_norm": 471.53478532936106, "learning_rate": 2.6890975482524773e-07, "logits/chosen": -2.467968702316284, "logits/rejected": -2.350781202316284, "logps/chosen": -803.2000122070312, "logps/rejected": -872.4000244140625, "loss": 4.0094, "rewards/accuracies": 0.41015625, "rewards/chosen": 1.1484375, "rewards/margins": 6.334374904632568, "rewards/rejected": -5.192187309265137, "step": 1100 }, { "epoch": 0.5211267605633803, "grad_norm": 422.19347230345033, "learning_rate": 2.66301512780386e-07, "logits/chosen": -2.5703125, "logits/rejected": -2.211718797683716, "logps/chosen": -711.5999755859375, "logps/rejected": -870.4000244140625, "loss": 4.6858, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 1.419921875, "rewards/margins": 6.921093940734863, "rewards/rejected": -5.49658203125, "step": 1110 }, { "epoch": 0.5258215962441315, "grad_norm": 417.46095170062046, "learning_rate": 2.6369327073552424e-07, "logits/chosen": -2.4984374046325684, "logits/rejected": -2.401562452316284, "logps/chosen": -750.0, "logps/rejected": -815.5999755859375, "loss": 3.5642, "rewards/accuracies": 0.40625, "rewards/chosen": 2.4632811546325684, "rewards/margins": 7.137499809265137, "rewards/rejected": -4.66796875, "step": 1120 }, { "epoch": 0.5305164319248826, "grad_norm": 544.6941773081793, "learning_rate": 2.6108502869066244e-07, "logits/chosen": -2.4921875, "logits/rejected": -2.3539061546325684, "logps/chosen": -790.7999877929688, "logps/rejected": -902.4000244140625, "loss": 4.8063, "rewards/accuracies": 0.4078125059604645, "rewards/chosen": 1.3991210460662842, "rewards/margins": 8.399218559265137, "rewards/rejected": -6.991796970367432, "step": 1130 }, { "epoch": 0.5352112676056338, "grad_norm": 420.67957655658216, "learning_rate": 2.584767866458007e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.32421875, "logps/chosen": -752.7999877929688, "logps/rejected": -860.4000244140625, "loss": 4.1379, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 1.276464819908142, "rewards/margins": 6.489062309265137, "rewards/rejected": -5.220312595367432, "step": 1140 }, { "epoch": 0.539906103286385, "grad_norm": 699.4986528073072, "learning_rate": 2.5586854460093895e-07, "logits/chosen": -2.473437547683716, "logits/rejected": -2.3687500953674316, "logps/chosen": -767.2000122070312, "logps/rejected": -874.0, "loss": 3.892, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 1.6943359375, "rewards/margins": 8.603124618530273, "rewards/rejected": -6.907031059265137, "step": 1150 }, { "epoch": 0.5446009389671361, "grad_norm": 465.8298608956407, "learning_rate": 2.5326030255607715e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.4234375953674316, "logps/chosen": -758.7999877929688, "logps/rejected": -850.4000244140625, "loss": 4.4265, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 1.349218726158142, "rewards/margins": 7.001757621765137, "rewards/rejected": -5.646874904632568, "step": 1160 }, { "epoch": 0.5492957746478874, "grad_norm": 398.61046399043613, "learning_rate": 2.506520605112154e-07, "logits/chosen": -2.5078125, "logits/rejected": -2.393749952316284, "logps/chosen": -758.7999877929688, "logps/rejected": -860.0, "loss": 5.0062, "rewards/accuracies": 0.4007812440395355, "rewards/chosen": 0.739501953125, "rewards/margins": 5.525000095367432, "rewards/rejected": -4.789843559265137, "step": 1170 }, { "epoch": 0.5539906103286385, "grad_norm": 323.4324114923158, "learning_rate": 2.4804381846635365e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.38671875, "logps/chosen": -783.5999755859375, "logps/rejected": -855.2000122070312, "loss": 4.7812, "rewards/accuracies": 0.43359375, "rewards/chosen": 1.3312499523162842, "rewards/margins": 7.223437309265137, "rewards/rejected": -5.905468940734863, "step": 1180 }, { "epoch": 0.5586854460093896, "grad_norm": 506.18135811725455, "learning_rate": 2.454355764214919e-07, "logits/chosen": -2.417187452316284, "logits/rejected": -2.3125, "logps/chosen": -738.4000244140625, "logps/rejected": -892.0, "loss": 3.8514, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 1.822265625, "rewards/margins": 7.956250190734863, "rewards/rejected": -6.136328220367432, "step": 1190 }, { "epoch": 0.5633802816901409, "grad_norm": 386.1224858207139, "learning_rate": 2.4282733437663016e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.487499952316284, "logps/chosen": -764.4000244140625, "logps/rejected": -826.0, "loss": 3.9235, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 2.237499952316284, "rewards/margins": 6.0703125, "rewards/rejected": -3.822265625, "step": 1200 }, { "epoch": 0.568075117370892, "grad_norm": 621.2564817580269, "learning_rate": 2.4021909233176836e-07, "logits/chosen": -2.4828124046325684, "logits/rejected": -2.2953124046325684, "logps/chosen": -724.4000244140625, "logps/rejected": -858.0, "loss": 4.0181, "rewards/accuracies": 0.4039062559604645, "rewards/chosen": 2.3140625953674316, "rewards/margins": 7.865624904632568, "rewards/rejected": -5.55078125, "step": 1210 }, { "epoch": 0.5727699530516432, "grad_norm": 378.7437394723237, "learning_rate": 2.376108502869066e-07, "logits/chosen": -2.628124952316284, "logits/rejected": -2.557812452316284, "logps/chosen": -767.5999755859375, "logps/rejected": -874.4000244140625, "loss": 4.2021, "rewards/accuracies": 0.4164062440395355, "rewards/chosen": 1.8992187976837158, "rewards/margins": 7.783593654632568, "rewards/rejected": -5.893359184265137, "step": 1220 }, { "epoch": 0.5774647887323944, "grad_norm": 466.7533524196612, "learning_rate": 2.3500260824204484e-07, "logits/chosen": -2.512500047683716, "logits/rejected": -2.356250047683716, "logps/chosen": -803.2000122070312, "logps/rejected": -901.5999755859375, "loss": 4.0792, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 1.685156226158142, "rewards/margins": 10.629687309265137, "rewards/rejected": -8.962109565734863, "step": 1230 }, { "epoch": 0.5821596244131455, "grad_norm": 427.4222059466879, "learning_rate": 2.323943661971831e-07, "logits/chosen": -2.612499952316284, "logits/rejected": -2.4124999046325684, "logps/chosen": -759.5999755859375, "logps/rejected": -857.2000122070312, "loss": 4.2713, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 1.312158226966858, "rewards/margins": 7.184374809265137, "rewards/rejected": -5.859375, "step": 1240 }, { "epoch": 0.5868544600938967, "grad_norm": 650.8385347996614, "learning_rate": 2.2978612415232132e-07, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.3617186546325684, "logps/chosen": -816.4000244140625, "logps/rejected": -896.7999877929688, "loss": 4.212, "rewards/accuracies": 0.42265623807907104, "rewards/chosen": 1.7316405773162842, "rewards/margins": 6.887499809265137, "rewards/rejected": -5.161328315734863, "step": 1250 }, { "epoch": 0.5915492957746479, "grad_norm": 482.610616264509, "learning_rate": 2.2717788210745957e-07, "logits/chosen": -2.453125, "logits/rejected": -2.2249999046325684, "logps/chosen": -765.2000122070312, "logps/rejected": -904.4000244140625, "loss": 4.3097, "rewards/accuracies": 0.44218748807907104, "rewards/chosen": 2.155468702316284, "rewards/margins": 10.750781059265137, "rewards/rejected": -8.612500190734863, "step": 1260 }, { "epoch": 0.596244131455399, "grad_norm": 422.4832760195976, "learning_rate": 2.245696400625978e-07, "logits/chosen": -2.526562452316284, "logits/rejected": -2.4046874046325684, "logps/chosen": -773.2000122070312, "logps/rejected": -860.7999877929688, "loss": 4.0265, "rewards/accuracies": 0.40937501192092896, "rewards/chosen": 1.829980492591858, "rewards/margins": 9.487500190734863, "rewards/rejected": -7.626953125, "step": 1270 }, { "epoch": 0.6009389671361502, "grad_norm": 380.6754834257087, "learning_rate": 2.2196139801773602e-07, "logits/chosen": -2.543750047683716, "logits/rejected": -2.3109374046325684, "logps/chosen": -728.4000244140625, "logps/rejected": -830.7999877929688, "loss": 3.7522, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 1.5955078601837158, "rewards/margins": 9.362500190734863, "rewards/rejected": -7.762499809265137, "step": 1280 }, { "epoch": 0.6056338028169014, "grad_norm": 617.5125468445888, "learning_rate": 2.1935315597287428e-07, "logits/chosen": -2.4281249046325684, "logits/rejected": -2.278125047683716, "logps/chosen": -775.5999755859375, "logps/rejected": -839.5999755859375, "loss": 3.6877, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 2.0220704078674316, "rewards/margins": 7.323437690734863, "rewards/rejected": -5.301562309265137, "step": 1290 }, { "epoch": 0.6103286384976526, "grad_norm": 377.8792864453347, "learning_rate": 2.167449139280125e-07, "logits/chosen": -2.456249952316284, "logits/rejected": -2.2796874046325684, "logps/chosen": -774.7999877929688, "logps/rejected": -877.2000122070312, "loss": 3.8299, "rewards/accuracies": 0.40546876192092896, "rewards/chosen": 2.3003907203674316, "rewards/margins": 6.846875190734863, "rewards/rejected": -4.547656059265137, "step": 1300 }, { "epoch": 0.6150234741784038, "grad_norm": 304.34047436814666, "learning_rate": 2.1413667188315073e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.598437547683716, "logps/chosen": -748.0, "logps/rejected": -806.0, "loss": 3.989, "rewards/accuracies": 0.39921873807907104, "rewards/chosen": 1.685449242591858, "rewards/margins": 5.956250190734863, "rewards/rejected": -4.268750190734863, "step": 1310 }, { "epoch": 0.6197183098591549, "grad_norm": 413.641434738981, "learning_rate": 2.1152842983828898e-07, "logits/chosen": -2.5234375, "logits/rejected": -2.346874952316284, "logps/chosen": -770.4000244140625, "logps/rejected": -858.0, "loss": 4.7325, "rewards/accuracies": 0.3921875059604645, "rewards/chosen": 1.538476586341858, "rewards/margins": 5.655859470367432, "rewards/rejected": -4.113671779632568, "step": 1320 }, { "epoch": 0.6244131455399061, "grad_norm": 1379.025094348344, "learning_rate": 2.089201877934272e-07, "logits/chosen": -2.4625000953674316, "logits/rejected": -2.196093797683716, "logps/chosen": -800.4000244140625, "logps/rejected": -918.4000244140625, "loss": 4.4263, "rewards/accuracies": 0.44218748807907104, "rewards/chosen": 1.782373070716858, "rewards/margins": 8.328906059265137, "rewards/rejected": -6.541406154632568, "step": 1330 }, { "epoch": 0.6291079812206573, "grad_norm": 399.4508285042661, "learning_rate": 2.0631194574856543e-07, "logits/chosen": -2.6468749046325684, "logits/rejected": -2.4468750953674316, "logps/chosen": -730.0, "logps/rejected": -790.7999877929688, "loss": 4.5796, "rewards/accuracies": 0.421875, "rewards/chosen": 1.739843726158142, "rewards/margins": 7.872656345367432, "rewards/rejected": -6.1328125, "step": 1340 }, { "epoch": 0.6338028169014085, "grad_norm": 338.4272660340767, "learning_rate": 2.0370370370370369e-07, "logits/chosen": -2.4039063453674316, "logits/rejected": -2.266406297683716, "logps/chosen": -773.5999755859375, "logps/rejected": -914.7999877929688, "loss": 4.0211, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.1039061546325684, "rewards/margins": 10.896875381469727, "rewards/rejected": -8.801562309265137, "step": 1350 }, { "epoch": 0.6384976525821596, "grad_norm": 372.84283303889856, "learning_rate": 2.010954616588419e-07, "logits/chosen": -2.551562547683716, "logits/rejected": -2.3375000953674316, "logps/chosen": -760.0, "logps/rejected": -857.5999755859375, "loss": 4.1396, "rewards/accuracies": 0.4398437440395355, "rewards/chosen": 2.0943360328674316, "rewards/margins": 7.762499809265137, "rewards/rejected": -5.674023628234863, "step": 1360 }, { "epoch": 0.6431924882629108, "grad_norm": 371.56207049513785, "learning_rate": 1.9848721961398017e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.4554686546325684, "logps/chosen": -766.7999877929688, "logps/rejected": -840.0, "loss": 3.2361, "rewards/accuracies": 0.42578125, "rewards/chosen": 2.659374952316284, "rewards/margins": 10.098437309265137, "rewards/rejected": -7.450781345367432, "step": 1370 }, { "epoch": 0.647887323943662, "grad_norm": 456.8266245239595, "learning_rate": 1.958789775691184e-07, "logits/chosen": -2.518749952316284, "logits/rejected": -2.3343749046325684, "logps/chosen": -746.4000244140625, "logps/rejected": -848.7999877929688, "loss": 4.1247, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 1.48974609375, "rewards/margins": 8.231249809265137, "rewards/rejected": -6.737109184265137, "step": 1380 }, { "epoch": 0.6525821596244131, "grad_norm": 10588.701835458416, "learning_rate": 1.9327073552425662e-07, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.33984375, "logps/chosen": -727.5999755859375, "logps/rejected": -816.4000244140625, "loss": 3.724, "rewards/accuracies": 0.421875, "rewards/chosen": 2.4072265625, "rewards/margins": 8.176562309265137, "rewards/rejected": -5.768750190734863, "step": 1390 }, { "epoch": 0.6572769953051644, "grad_norm": 6617.2501954335, "learning_rate": 1.906624934793949e-07, "logits/chosen": -2.4781250953674316, "logits/rejected": -2.4234375953674316, "logps/chosen": -757.2000122070312, "logps/rejected": -895.5999755859375, "loss": 4.1209, "rewards/accuracies": 0.42656248807907104, "rewards/chosen": 1.1183593273162842, "rewards/margins": 8.98046875, "rewards/rejected": -7.872656345367432, "step": 1400 }, { "epoch": 0.6619718309859155, "grad_norm": 341.1578733158136, "learning_rate": 1.8805425143453312e-07, "logits/chosen": -2.5875000953674316, "logits/rejected": -2.2484374046325684, "logps/chosen": -737.5999755859375, "logps/rejected": -941.2000122070312, "loss": 3.7639, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 2.3257813453674316, "rewards/margins": 12.199999809265137, "rewards/rejected": -9.868749618530273, "step": 1410 }, { "epoch": 0.6666666666666666, "grad_norm": 388.862663254699, "learning_rate": 1.8544600938967138e-07, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.440624952316284, "logps/chosen": -745.5999755859375, "logps/rejected": -826.4000244140625, "loss": 3.7476, "rewards/accuracies": 0.44140625, "rewards/chosen": 2.389233350753784, "rewards/margins": 7.125, "rewards/rejected": -4.7294921875, "step": 1420 }, { "epoch": 0.6713615023474179, "grad_norm": 272.7682525749342, "learning_rate": 1.828377673448096e-07, "logits/chosen": -2.4390625953674316, "logits/rejected": -2.3984375, "logps/chosen": -796.4000244140625, "logps/rejected": -863.2000122070312, "loss": 4.7793, "rewards/accuracies": 0.45234376192092896, "rewards/chosen": 1.3416016101837158, "rewards/margins": 8.584375381469727, "rewards/rejected": -7.237500190734863, "step": 1430 }, { "epoch": 0.676056338028169, "grad_norm": 349.4069381240554, "learning_rate": 1.8022952529994783e-07, "logits/chosen": -2.6343750953674316, "logits/rejected": -2.403125047683716, "logps/chosen": -748.0, "logps/rejected": -852.7999877929688, "loss": 3.6646, "rewards/accuracies": 0.4156250059604645, "rewards/chosen": 1.7000000476837158, "rewards/margins": 9.232812881469727, "rewards/rejected": -7.525781154632568, "step": 1440 }, { "epoch": 0.6807511737089202, "grad_norm": 411.27398877615707, "learning_rate": 1.7762128325508608e-07, "logits/chosen": -2.5921874046325684, "logits/rejected": -2.418750047683716, "logps/chosen": -780.7999877929688, "logps/rejected": -937.5999755859375, "loss": 5.1075, "rewards/accuracies": 0.4117187559604645, "rewards/chosen": 1.0319335460662842, "rewards/margins": 9.723437309265137, "rewards/rejected": -8.700780868530273, "step": 1450 }, { "epoch": 0.6854460093896714, "grad_norm": 408.6167188032052, "learning_rate": 1.750130412102243e-07, "logits/chosen": -2.379687547683716, "logits/rejected": -2.3125, "logps/chosen": -824.4000244140625, "logps/rejected": -893.5999755859375, "loss": 4.5059, "rewards/accuracies": 0.4234375059604645, "rewards/chosen": 1.7265625, "rewards/margins": 9.5234375, "rewards/rejected": -7.801562309265137, "step": 1460 }, { "epoch": 0.6901408450704225, "grad_norm": 388.9376139954803, "learning_rate": 1.7240479916536254e-07, "logits/chosen": -2.421875, "logits/rejected": -2.3671875, "logps/chosen": -752.4000244140625, "logps/rejected": -884.0, "loss": 4.2261, "rewards/accuracies": 0.4195312559604645, "rewards/chosen": 2.1416993141174316, "rewards/margins": 10.865625381469727, "rewards/rejected": -8.707616806030273, "step": 1470 }, { "epoch": 0.6948356807511737, "grad_norm": 378.524423691828, "learning_rate": 1.697965571205008e-07, "logits/chosen": -2.653125047683716, "logits/rejected": -2.4710936546325684, "logps/chosen": -743.2000122070312, "logps/rejected": -805.2000122070312, "loss": 4.051, "rewards/accuracies": 0.4140625, "rewards/chosen": 1.4373047351837158, "rewards/margins": 5.876562595367432, "rewards/rejected": -4.446484565734863, "step": 1480 }, { "epoch": 0.6995305164319249, "grad_norm": 550.5852865233995, "learning_rate": 1.6718831507563902e-07, "logits/chosen": -2.5953125953674316, "logits/rejected": -2.4921875, "logps/chosen": -757.5999755859375, "logps/rejected": -801.5999755859375, "loss": 3.7732, "rewards/accuracies": 0.42109376192092896, "rewards/chosen": 2.1136717796325684, "rewards/margins": 6.278124809265137, "rewards/rejected": -4.1552734375, "step": 1490 }, { "epoch": 0.704225352112676, "grad_norm": 427.18588926267864, "learning_rate": 1.6458007303077727e-07, "logits/chosen": -2.53125, "logits/rejected": -2.448437452316284, "logps/chosen": -736.4000244140625, "logps/rejected": -808.0, "loss": 4.0702, "rewards/accuracies": 0.42656248807907104, "rewards/chosen": 2.1546874046325684, "rewards/margins": 6.940625190734863, "rewards/rejected": -4.786718845367432, "step": 1500 }, { "epoch": 0.7089201877934272, "grad_norm": 599.1394174500863, "learning_rate": 1.619718309859155e-07, "logits/chosen": -2.606250047683716, "logits/rejected": -2.323437452316284, "logps/chosen": -767.5999755859375, "logps/rejected": -891.5999755859375, "loss": 3.8417, "rewards/accuracies": 0.4507812559604645, "rewards/chosen": 1.878515601158142, "rewards/margins": 9.865625381469727, "rewards/rejected": -7.992773532867432, "step": 1510 }, { "epoch": 0.7136150234741784, "grad_norm": 536.4767435258981, "learning_rate": 1.5936358894105372e-07, "logits/chosen": -2.5859375, "logits/rejected": -2.4312500953674316, "logps/chosen": -764.0, "logps/rejected": -856.0, "loss": 3.6968, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 2.6656250953674316, "rewards/margins": 10.020312309265137, "rewards/rejected": -7.350781440734863, "step": 1520 }, { "epoch": 0.7183098591549296, "grad_norm": 1229.6023414752588, "learning_rate": 1.5675534689619197e-07, "logits/chosen": -2.5953125953674316, "logits/rejected": -2.484375, "logps/chosen": -770.7999877929688, "logps/rejected": -790.7999877929688, "loss": 4.2477, "rewards/accuracies": 0.4140625, "rewards/chosen": 1.7410156726837158, "rewards/margins": 7.956445217132568, "rewards/rejected": -6.218359470367432, "step": 1530 }, { "epoch": 0.7230046948356808, "grad_norm": 536.6568789146753, "learning_rate": 1.541471048513302e-07, "logits/chosen": -2.479687452316284, "logits/rejected": -2.3609375953674316, "logps/chosen": -770.7999877929688, "logps/rejected": -857.2000122070312, "loss": 3.3124, "rewards/accuracies": 0.42578125, "rewards/chosen": 3.4242186546325684, "rewards/margins": 8.21875, "rewards/rejected": -4.804101467132568, "step": 1540 }, { "epoch": 0.7276995305164319, "grad_norm": 470.8839638867676, "learning_rate": 1.5153886280646843e-07, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.331249952316284, "logps/chosen": -729.5999755859375, "logps/rejected": -836.0, "loss": 4.0775, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 1.929296851158142, "rewards/margins": 8.8515625, "rewards/rejected": -6.932812690734863, "step": 1550 }, { "epoch": 0.7323943661971831, "grad_norm": 440.8105351379023, "learning_rate": 1.4893062076160668e-07, "logits/chosen": -2.6500000953674316, "logits/rejected": -2.5718750953674316, "logps/chosen": -748.0, "logps/rejected": -852.4000244140625, "loss": 3.4896, "rewards/accuracies": 0.4375, "rewards/chosen": 2.1708984375, "rewards/margins": 10.100000381469727, "rewards/rejected": -7.9375, "step": 1560 }, { "epoch": 0.7370892018779343, "grad_norm": 358.7075675203654, "learning_rate": 1.463223787167449e-07, "logits/chosen": -2.53125, "logits/rejected": -2.395312547683716, "logps/chosen": -780.0, "logps/rejected": -855.5999755859375, "loss": 4.6828, "rewards/accuracies": 0.4351562559604645, "rewards/chosen": 1.6946532726287842, "rewards/margins": 6.732031345367432, "rewards/rejected": -5.04296875, "step": 1570 }, { "epoch": 0.7417840375586855, "grad_norm": 630.1006096089187, "learning_rate": 1.4371413667188313e-07, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.4585938453674316, "logps/chosen": -749.5999755859375, "logps/rejected": -878.7999877929688, "loss": 4.1201, "rewards/accuracies": 0.4140625, "rewards/chosen": 1.547265648841858, "rewards/margins": 7.09375, "rewards/rejected": -5.534375190734863, "step": 1580 }, { "epoch": 0.7464788732394366, "grad_norm": 362.6335972225942, "learning_rate": 1.4110589462702139e-07, "logits/chosen": -2.6390624046325684, "logits/rejected": -2.504687547683716, "logps/chosen": -752.7999877929688, "logps/rejected": -816.0, "loss": 4.4723, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 1.8791015148162842, "rewards/margins": 6.903124809265137, "rewards/rejected": -5.0234375, "step": 1590 }, { "epoch": 0.7511737089201878, "grad_norm": 363.5249971959507, "learning_rate": 1.384976525821596e-07, "logits/chosen": -2.651562452316284, "logits/rejected": -2.362499952316284, "logps/chosen": -764.7999877929688, "logps/rejected": -894.7999877929688, "loss": 4.0403, "rewards/accuracies": 0.45703125, "rewards/chosen": 2.082226514816284, "rewards/margins": 9.690625190734863, "rewards/rejected": -7.603906154632568, "step": 1600 }, { "epoch": 0.755868544600939, "grad_norm": 490.924017816434, "learning_rate": 1.3588941053729787e-07, "logits/chosen": -2.442187547683716, "logits/rejected": -2.387500047683716, "logps/chosen": -778.0, "logps/rejected": -890.7999877929688, "loss": 4.5292, "rewards/accuracies": 0.4242187440395355, "rewards/chosen": 1.9943115711212158, "rewards/margins": 7.2109375, "rewards/rejected": -5.216271877288818, "step": 1610 }, { "epoch": 0.7605633802816901, "grad_norm": 334.9934735886465, "learning_rate": 1.332811684924361e-07, "logits/chosen": -2.5140624046325684, "logits/rejected": -2.3921875953674316, "logps/chosen": -795.5999755859375, "logps/rejected": -862.4000244140625, "loss": 4.8928, "rewards/accuracies": 0.43046873807907104, "rewards/chosen": 1.53955078125, "rewards/margins": 7.959374904632568, "rewards/rejected": -6.420312404632568, "step": 1620 }, { "epoch": 0.7652582159624414, "grad_norm": 381.4998335805573, "learning_rate": 1.3067292644757432e-07, "logits/chosen": -2.2718749046325684, "logits/rejected": -2.129687547683716, "logps/chosen": -799.5999755859375, "logps/rejected": -920.4000244140625, "loss": 3.8557, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": 1.784765601158142, "rewards/margins": 8.512499809265137, "rewards/rejected": -6.728125095367432, "step": 1630 }, { "epoch": 0.7699530516431925, "grad_norm": 303.586260752689, "learning_rate": 1.2806468440271257e-07, "logits/chosen": -2.528125047683716, "logits/rejected": -2.3929686546325684, "logps/chosen": -742.0, "logps/rejected": -833.2000122070312, "loss": 4.1733, "rewards/accuracies": 0.42578125, "rewards/chosen": 2.0132813453674316, "rewards/margins": 6.892187595367432, "rewards/rejected": -4.879687309265137, "step": 1640 }, { "epoch": 0.7746478873239436, "grad_norm": 432.801124389805, "learning_rate": 1.254564423578508e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.288281202316284, "logps/chosen": -759.2000122070312, "logps/rejected": -884.4000244140625, "loss": 3.5372, "rewards/accuracies": 0.4398437440395355, "rewards/chosen": 2.962890625, "rewards/margins": 8.556249618530273, "rewards/rejected": -5.588281154632568, "step": 1650 }, { "epoch": 0.7793427230046949, "grad_norm": 346.4487001972693, "learning_rate": 1.2284820031298902e-07, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.3343749046325684, "logps/chosen": -802.0, "logps/rejected": -886.0, "loss": 3.8265, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": 1.944921851158142, "rewards/margins": 12.169921875, "rewards/rejected": -10.222949028015137, "step": 1660 }, { "epoch": 0.784037558685446, "grad_norm": 410.86570464485783, "learning_rate": 1.2023995826812728e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.403125047683716, "logps/chosen": -758.4000244140625, "logps/rejected": -860.7999877929688, "loss": 4.157, "rewards/accuracies": 0.4476562440395355, "rewards/chosen": 2.3128905296325684, "rewards/margins": 9.828125, "rewards/rejected": -7.517187595367432, "step": 1670 }, { "epoch": 0.7887323943661971, "grad_norm": 2136.642881743141, "learning_rate": 1.176317162232655e-07, "logits/chosen": -2.6234374046325684, "logits/rejected": -2.5, "logps/chosen": -746.0, "logps/rejected": -884.4000244140625, "loss": 4.1253, "rewards/accuracies": 0.4359374940395355, "rewards/chosen": 2.102734327316284, "rewards/margins": 11.690625190734863, "rewards/rejected": -9.579687118530273, "step": 1680 }, { "epoch": 0.7934272300469484, "grad_norm": 630.6795631117106, "learning_rate": 1.1502347417840374e-07, "logits/chosen": -2.5093750953674316, "logits/rejected": -2.2906250953674316, "logps/chosen": -747.2000122070312, "logps/rejected": -857.2000122070312, "loss": 4.2829, "rewards/accuracies": 0.41484373807907104, "rewards/chosen": 2.1099610328674316, "rewards/margins": 7.543749809265137, "rewards/rejected": -5.440197944641113, "step": 1690 }, { "epoch": 0.7981220657276995, "grad_norm": 465.23589968375455, "learning_rate": 1.1241523213354198e-07, "logits/chosen": -2.5640625953674316, "logits/rejected": -2.426562547683716, "logps/chosen": -756.4000244140625, "logps/rejected": -885.2000122070312, "loss": 3.3596, "rewards/accuracies": 0.4203124940395355, "rewards/chosen": 2.2164063453674316, "rewards/margins": 8.035937309265137, "rewards/rejected": -5.818945407867432, "step": 1700 }, { "epoch": 0.8028169014084507, "grad_norm": 441.8906632519451, "learning_rate": 1.0980699008868022e-07, "logits/chosen": -2.4359374046325684, "logits/rejected": -2.2757811546325684, "logps/chosen": -796.7999877929688, "logps/rejected": -839.5999755859375, "loss": 4.3753, "rewards/accuracies": 0.43359375, "rewards/chosen": 2.609375, "rewards/margins": 9.723437309265137, "rewards/rejected": -7.110156059265137, "step": 1710 }, { "epoch": 0.8075117370892019, "grad_norm": 564.7915107361899, "learning_rate": 1.0719874804381846e-07, "logits/chosen": -2.5999999046325684, "logits/rejected": -2.503124952316284, "logps/chosen": -744.0, "logps/rejected": -781.5999755859375, "loss": 4.1198, "rewards/accuracies": 0.40703123807907104, "rewards/chosen": 2.150585889816284, "rewards/margins": 5.887499809265137, "rewards/rejected": -3.741455078125, "step": 1720 }, { "epoch": 0.812206572769953, "grad_norm": 320.8212970469599, "learning_rate": 1.045905059989567e-07, "logits/chosen": -2.5687499046325684, "logits/rejected": -2.4390625953674316, "logps/chosen": -722.4000244140625, "logps/rejected": -790.7999877929688, "loss": 4.7643, "rewards/accuracies": 0.4195312559604645, "rewards/chosen": 1.0138671398162842, "rewards/margins": 9.967187881469727, "rewards/rejected": -8.94140625, "step": 1730 }, { "epoch": 0.8169014084507042, "grad_norm": 713.6258367327782, "learning_rate": 1.0198226395409494e-07, "logits/chosen": -2.4000000953674316, "logits/rejected": -2.2796874046325684, "logps/chosen": -733.5999755859375, "logps/rejected": -817.2000122070312, "loss": 4.4454, "rewards/accuracies": 0.4046874940395355, "rewards/chosen": 1.625, "rewards/margins": 6.20703125, "rewards/rejected": -4.579687595367432, "step": 1740 }, { "epoch": 0.8215962441314554, "grad_norm": 279.8866152039485, "learning_rate": 9.937402190923318e-08, "logits/chosen": -2.440624952316284, "logits/rejected": -2.2328124046325684, "logps/chosen": -732.7999877929688, "logps/rejected": -878.7999877929688, "loss": 4.1971, "rewards/accuracies": 0.4546875059604645, "rewards/chosen": 2.5145506858825684, "rewards/margins": 8.526562690734863, "rewards/rejected": -6.001562595367432, "step": 1750 }, { "epoch": 0.8262910798122066, "grad_norm": 382.2986812231122, "learning_rate": 9.676577986437141e-08, "logits/chosen": -2.4546875953674316, "logits/rejected": -2.235156297683716, "logps/chosen": -792.0, "logps/rejected": -860.7999877929688, "loss": 4.5396, "rewards/accuracies": 0.45390623807907104, "rewards/chosen": 1.724023461341858, "rewards/margins": 9.040624618530273, "rewards/rejected": -7.306250095367432, "step": 1760 }, { "epoch": 0.8309859154929577, "grad_norm": 703.9331919204005, "learning_rate": 9.415753781950965e-08, "logits/chosen": -2.549999952316284, "logits/rejected": -2.34375, "logps/chosen": -740.4000244140625, "logps/rejected": -850.0, "loss": 3.3349, "rewards/accuracies": 0.41484373807907104, "rewards/chosen": 2.344531297683716, "rewards/margins": 7.818749904632568, "rewards/rejected": -5.477343559265137, "step": 1770 }, { "epoch": 0.8356807511737089, "grad_norm": 433.06690799996636, "learning_rate": 9.154929577464789e-08, "logits/chosen": -2.5562500953674316, "logits/rejected": -2.3726563453674316, "logps/chosen": -756.0, "logps/rejected": -866.7999877929688, "loss": 3.7646, "rewards/accuracies": 0.44140625, "rewards/chosen": 2.4619140625, "rewards/margins": 12.078125, "rewards/rejected": -9.592187881469727, "step": 1780 }, { "epoch": 0.8403755868544601, "grad_norm": 408.1750732624494, "learning_rate": 8.894105372978613e-08, "logits/chosen": -2.4453125, "logits/rejected": -2.421875, "logps/chosen": -794.7999877929688, "logps/rejected": -871.7999877929688, "loss": 4.391, "rewards/accuracies": 0.41093748807907104, "rewards/chosen": 2.060742139816284, "rewards/margins": 7.740624904632568, "rewards/rejected": -5.689062595367432, "step": 1790 }, { "epoch": 0.8450704225352113, "grad_norm": 366.3144865147577, "learning_rate": 8.633281168492435e-08, "logits/chosen": -2.5625, "logits/rejected": -2.4046874046325684, "logps/chosen": -756.0, "logps/rejected": -821.2000122070312, "loss": 3.7373, "rewards/accuracies": 0.453125, "rewards/chosen": 2.2708983421325684, "rewards/margins": 11.6796875, "rewards/rejected": -9.404687881469727, "step": 1800 }, { "epoch": 0.8497652582159625, "grad_norm": 388.6020923869252, "learning_rate": 8.372456964006259e-08, "logits/chosen": -2.4437499046325684, "logits/rejected": -2.2796874046325684, "logps/chosen": -761.2000122070312, "logps/rejected": -878.0, "loss": 4.1477, "rewards/accuracies": 0.4320312440395355, "rewards/chosen": 1.795312523841858, "rewards/margins": 7.658593654632568, "rewards/rejected": -5.867968559265137, "step": 1810 }, { "epoch": 0.8544600938967136, "grad_norm": 512.5397402097874, "learning_rate": 8.111632759520083e-08, "logits/chosen": -2.426562547683716, "logits/rejected": -2.282031297683716, "logps/chosen": -812.4000244140625, "logps/rejected": -874.0, "loss": 4.547, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 1.595556616783142, "rewards/margins": 7.388281345367432, "rewards/rejected": -5.796875, "step": 1820 }, { "epoch": 0.8591549295774648, "grad_norm": 340.7283920774716, "learning_rate": 7.850808555033907e-08, "logits/chosen": -2.5609374046325684, "logits/rejected": -2.4124999046325684, "logps/chosen": -755.5999755859375, "logps/rejected": -883.2000122070312, "loss": 3.4809, "rewards/accuracies": 0.4453125, "rewards/chosen": 2.485546827316284, "rewards/margins": 9.396875381469727, "rewards/rejected": -6.90625, "step": 1830 }, { "epoch": 0.863849765258216, "grad_norm": 367.04348311423695, "learning_rate": 7.58998435054773e-08, "logits/chosen": -2.549999952316284, "logits/rejected": -2.274218797683716, "logps/chosen": -762.7999877929688, "logps/rejected": -876.7999877929688, "loss": 4.6239, "rewards/accuracies": 0.44218748807907104, "rewards/chosen": 2.080273389816284, "rewards/margins": 10.565625190734863, "rewards/rejected": -8.48046875, "step": 1840 }, { "epoch": 0.8685446009389671, "grad_norm": 509.58132237696515, "learning_rate": 7.329160146061554e-08, "logits/chosen": -2.6015625, "logits/rejected": -2.565624952316284, "logps/chosen": -745.5999755859375, "logps/rejected": -848.7999877929688, "loss": 4.3797, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 3.035351514816284, "rewards/margins": 11.510937690734863, "rewards/rejected": -8.481249809265137, "step": 1850 }, { "epoch": 0.8732394366197183, "grad_norm": 758.4254032746667, "learning_rate": 7.068335941575378e-08, "logits/chosen": -2.3921875953674316, "logits/rejected": -2.374218702316284, "logps/chosen": -750.0, "logps/rejected": -816.0, "loss": 4.3346, "rewards/accuracies": 0.4390625059604645, "rewards/chosen": 2.27734375, "rewards/margins": 7.941992282867432, "rewards/rejected": -5.670702934265137, "step": 1860 }, { "epoch": 0.8779342723004695, "grad_norm": 976.6318104706936, "learning_rate": 6.807511737089202e-08, "logits/chosen": -2.6390624046325684, "logits/rejected": -2.4156250953674316, "logps/chosen": -750.7999877929688, "logps/rejected": -858.4000244140625, "loss": 3.5868, "rewards/accuracies": 0.421875, "rewards/chosen": 2.016406297683716, "rewards/margins": 10.040624618530273, "rewards/rejected": -8.024999618530273, "step": 1870 }, { "epoch": 0.8826291079812206, "grad_norm": 474.75167930245186, "learning_rate": 6.546687532603024e-08, "logits/chosen": -2.4984374046325684, "logits/rejected": -2.375781297683716, "logps/chosen": -745.5999755859375, "logps/rejected": -852.7999877929688, "loss": 4.5859, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 1.3720703125, "rewards/margins": 7.1875, "rewards/rejected": -5.814062595367432, "step": 1880 }, { "epoch": 0.8873239436619719, "grad_norm": 474.0294259726967, "learning_rate": 6.285863328116848e-08, "logits/chosen": -2.5406250953674316, "logits/rejected": -2.328125, "logps/chosen": -693.5999755859375, "logps/rejected": -867.5999755859375, "loss": 3.5894, "rewards/accuracies": 0.43359375, "rewards/chosen": 2.319140672683716, "rewards/margins": 9.146875381469727, "rewards/rejected": -6.824999809265137, "step": 1890 }, { "epoch": 0.892018779342723, "grad_norm": 460.4751878306455, "learning_rate": 6.025039123630672e-08, "logits/chosen": -2.534374952316284, "logits/rejected": -2.268749952316284, "logps/chosen": -772.7999877929688, "logps/rejected": -828.0, "loss": 4.1889, "rewards/accuracies": 0.421875, "rewards/chosen": 2.466796875, "rewards/margins": 8.559374809265137, "rewards/rejected": -6.09375, "step": 1900 }, { "epoch": 0.8967136150234741, "grad_norm": 490.0525850202359, "learning_rate": 5.764214919144496e-08, "logits/chosen": -2.5718750953674316, "logits/rejected": -2.4437499046325684, "logps/chosen": -782.7999877929688, "logps/rejected": -824.4000244140625, "loss": 4.515, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 1.664575219154358, "rewards/margins": 6.497460842132568, "rewards/rejected": -4.832665920257568, "step": 1910 }, { "epoch": 0.9014084507042254, "grad_norm": 866.3318927072091, "learning_rate": 5.50339071465832e-08, "logits/chosen": -2.515625, "logits/rejected": -2.567187547683716, "logps/chosen": -770.4000244140625, "logps/rejected": -782.7999877929688, "loss": 3.6488, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 2.9683594703674316, "rewards/margins": 7.34375, "rewards/rejected": -4.371142387390137, "step": 1920 }, { "epoch": 0.9061032863849765, "grad_norm": 488.9619307108808, "learning_rate": 5.2425665101721436e-08, "logits/chosen": -2.5484375953674316, "logits/rejected": -2.3492188453674316, "logps/chosen": -762.4000244140625, "logps/rejected": -890.7999877929688, "loss": 4.0203, "rewards/accuracies": 0.4375, "rewards/chosen": 2.850781202316284, "rewards/margins": 7.865624904632568, "rewards/rejected": -5.0107421875, "step": 1930 }, { "epoch": 0.9107981220657277, "grad_norm": 411.04003251078433, "learning_rate": 4.9817423056859675e-08, "logits/chosen": -2.581249952316284, "logits/rejected": -2.3499999046325684, "logps/chosen": -767.5999755859375, "logps/rejected": -875.5999755859375, "loss": 4.3625, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 2.0251708030700684, "rewards/margins": 8.865625381469727, "rewards/rejected": -6.841406345367432, "step": 1940 }, { "epoch": 0.9154929577464789, "grad_norm": 384.2455186003981, "learning_rate": 4.720918101199791e-08, "logits/chosen": -2.350781202316284, "logits/rejected": -2.1187500953674316, "logps/chosen": -786.4000244140625, "logps/rejected": -952.7999877929688, "loss": 3.7473, "rewards/accuracies": 0.4203124940395355, "rewards/chosen": 2.0298829078674316, "rewards/margins": 9.115625381469727, "rewards/rejected": -7.082812309265137, "step": 1950 }, { "epoch": 0.92018779342723, "grad_norm": 304.4130325346357, "learning_rate": 4.460093896713615e-08, "logits/chosen": -2.5859375, "logits/rejected": -2.307812452316284, "logps/chosen": -756.7999877929688, "logps/rejected": -888.4000244140625, "loss": 4.4148, "rewards/accuracies": 0.4320312440395355, "rewards/chosen": 2.321972608566284, "rewards/margins": 9.120312690734863, "rewards/rejected": -6.80859375, "step": 1960 }, { "epoch": 0.9248826291079812, "grad_norm": 401.47706217712914, "learning_rate": 4.199269692227438e-08, "logits/chosen": -2.534374952316284, "logits/rejected": -2.4546875953674316, "logps/chosen": -754.4000244140625, "logps/rejected": -795.2000122070312, "loss": 4.2677, "rewards/accuracies": 0.4296875, "rewards/chosen": 1.928979516029358, "rewards/margins": 9.821874618530273, "rewards/rejected": -7.884375095367432, "step": 1970 }, { "epoch": 0.9295774647887324, "grad_norm": 723.7170404352454, "learning_rate": 3.938445487741262e-08, "logits/chosen": -2.40625, "logits/rejected": -2.109375, "logps/chosen": -784.7999877929688, "logps/rejected": -877.2000122070312, "loss": 4.5287, "rewards/accuracies": 0.41718751192092896, "rewards/chosen": 0.9930664300918579, "rewards/margins": 8.557031631469727, "rewards/rejected": -7.555468559265137, "step": 1980 }, { "epoch": 0.9342723004694836, "grad_norm": 654.9120455263484, "learning_rate": 3.677621283255086e-08, "logits/chosen": -2.6312499046325684, "logits/rejected": -2.495312452316284, "logps/chosen": -777.5999755859375, "logps/rejected": -844.7999877929688, "loss": 3.714, "rewards/accuracies": 0.46406251192092896, "rewards/chosen": 3.069140672683716, "rewards/margins": 9.065625190734863, "rewards/rejected": -5.993750095367432, "step": 1990 }, { "epoch": 0.9389671361502347, "grad_norm": 311.59037152518556, "learning_rate": 3.41679707876891e-08, "logits/chosen": -2.5859375, "logits/rejected": -2.4140625, "logps/chosen": -742.7999877929688, "logps/rejected": -906.0, "loss": 3.6833, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 2.420605421066284, "rewards/margins": 9.184374809265137, "rewards/rejected": -6.758593559265137, "step": 2000 }, { "epoch": 0.9436619718309859, "grad_norm": 412.4287903893836, "learning_rate": 3.155972874282733e-08, "logits/chosen": -2.573437452316284, "logits/rejected": -2.3617186546325684, "logps/chosen": -736.0, "logps/rejected": -828.7999877929688, "loss": 4.4662, "rewards/accuracies": 0.41328126192092896, "rewards/chosen": 3.070996046066284, "rewards/margins": 6.551562309265137, "rewards/rejected": -3.4789061546325684, "step": 2010 }, { "epoch": 0.9483568075117371, "grad_norm": 349.9846262717242, "learning_rate": 2.8951486697965573e-08, "logits/chosen": -2.5250000953674316, "logits/rejected": -2.407031297683716, "logps/chosen": -780.0, "logps/rejected": -870.0, "loss": 4.2449, "rewards/accuracies": 0.45781248807907104, "rewards/chosen": 2.4232420921325684, "rewards/margins": 8.9140625, "rewards/rejected": -6.493750095367432, "step": 2020 }, { "epoch": 0.9530516431924883, "grad_norm": 411.30259984389926, "learning_rate": 2.634324465310381e-08, "logits/chosen": -2.5953125953674316, "logits/rejected": -2.3671875, "logps/chosen": -745.2000122070312, "logps/rejected": -847.5999755859375, "loss": 4.5732, "rewards/accuracies": 0.42890626192092896, "rewards/chosen": 2.0653319358825684, "rewards/margins": 7.204687595367432, "rewards/rejected": -5.143359184265137, "step": 2030 }, { "epoch": 0.9577464788732394, "grad_norm": 398.25354769825475, "learning_rate": 2.3735002608242045e-08, "logits/chosen": -2.456249952316284, "logits/rejected": -2.292187452316284, "logps/chosen": -789.5999755859375, "logps/rejected": -833.2000122070312, "loss": 3.924, "rewards/accuracies": 0.4554687440395355, "rewards/chosen": 2.1742186546325684, "rewards/margins": 12.118749618530273, "rewards/rejected": -9.961718559265137, "step": 2040 }, { "epoch": 0.9624413145539906, "grad_norm": 366.86602945403564, "learning_rate": 2.1126760563380282e-08, "logits/chosen": -2.582812547683716, "logits/rejected": -2.446093797683716, "logps/chosen": -758.4000244140625, "logps/rejected": -793.5999755859375, "loss": 4.2615, "rewards/accuracies": 0.44218748807907104, "rewards/chosen": 1.9796874523162842, "rewards/margins": 8.4296875, "rewards/rejected": -6.447656154632568, "step": 2050 }, { "epoch": 0.9671361502347418, "grad_norm": 414.3553510572445, "learning_rate": 1.8518518518518518e-08, "logits/chosen": -2.5078125, "logits/rejected": -2.270312547683716, "logps/chosen": -783.2000122070312, "logps/rejected": -862.0, "loss": 4.2096, "rewards/accuracies": 0.4164062440395355, "rewards/chosen": 2.216796875, "rewards/margins": 7.948437690734863, "rewards/rejected": -5.735156059265137, "step": 2060 }, { "epoch": 0.971830985915493, "grad_norm": 375.38516798549716, "learning_rate": 1.5910276473656755e-08, "logits/chosen": -2.4906249046325684, "logits/rejected": -2.2445311546325684, "logps/chosen": -756.0, "logps/rejected": -857.5999755859375, "loss": 4.3646, "rewards/accuracies": 0.4195312559604645, "rewards/chosen": 1.678613305091858, "rewards/margins": 6.6708984375, "rewards/rejected": -4.986425876617432, "step": 2070 }, { "epoch": 0.9765258215962441, "grad_norm": 404.1144560326498, "learning_rate": 1.3302034428794991e-08, "logits/chosen": -2.6328125, "logits/rejected": -2.5484375953674316, "logps/chosen": -718.7999877929688, "logps/rejected": -794.7999877929688, "loss": 3.7124, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 2.765625, "rewards/margins": 11.140625, "rewards/rejected": -8.370312690734863, "step": 2080 }, { "epoch": 0.9812206572769953, "grad_norm": 403.513179459941, "learning_rate": 1.0693792383933229e-08, "logits/chosen": -2.5703125, "logits/rejected": -2.5062499046325684, "logps/chosen": -776.7999877929688, "logps/rejected": -860.4000244140625, "loss": 3.5547, "rewards/accuracies": 0.4296875, "rewards/chosen": 2.987499952316284, "rewards/margins": 9.940625190734863, "rewards/rejected": -6.948437690734863, "step": 2090 }, { "epoch": 0.9859154929577465, "grad_norm": 926.0940749908417, "learning_rate": 8.085550339071465e-09, "logits/chosen": -2.5, "logits/rejected": -2.3609375953674316, "logps/chosen": -795.5999755859375, "logps/rejected": -888.7999877929688, "loss": 4.5348, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 2.0, "rewards/margins": 6.564062595367432, "rewards/rejected": -4.567968845367432, "step": 2100 }, { "epoch": 0.9906103286384976, "grad_norm": 4480.350164168996, "learning_rate": 5.4773082942097025e-09, "logits/chosen": -2.534374952316284, "logits/rejected": -2.4156250953674316, "logps/chosen": -738.7999877929688, "logps/rejected": -812.0, "loss": 4.1578, "rewards/accuracies": 0.453125, "rewards/chosen": 2.116406202316284, "rewards/margins": 8.359375, "rewards/rejected": -6.248437404632568, "step": 2110 }, { "epoch": 0.9953051643192489, "grad_norm": 480.41107911126227, "learning_rate": 2.8690662493479393e-09, "logits/chosen": -2.4937500953674316, "logits/rejected": -2.2406249046325684, "logps/chosen": -764.7999877929688, "logps/rejected": -853.2000122070312, "loss": 4.2772, "rewards/accuracies": 0.43281251192092896, "rewards/chosen": 1.509863257408142, "rewards/margins": 8.040624618530273, "rewards/rejected": -6.537499904632568, "step": 2120 }, { "epoch": 1.0, "grad_norm": 741.7895726615369, "learning_rate": 2.608242044861763e-10, "logits/chosen": -2.534374952316284, "logits/rejected": -2.393749952316284, "logps/chosen": -753.5999755859375, "logps/rejected": -868.7999877929688, "loss": 4.0373, "rewards/accuracies": 0.4348672926425934, "rewards/chosen": 2.864062547683716, "rewards/margins": 11.854687690734863, "rewards/rejected": -8.991406440734863, "step": 2130 }, { "epoch": 1.0, "step": 2130, "total_flos": 0.0, "train_loss": 4.036165593823357, "train_runtime": 7155.9863, "train_samples_per_second": 38.092, "train_steps_per_second": 0.298 } ], "logging_steps": 10, "max_steps": 2130, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }