{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1911, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 136.4956817626953, "learning_rate": 0.0, "logits/chosen": -0.40118408203125, "logits/rejected": -0.41802978515625, "logps/chosen": -297.609375, "logps/rejected": -247.84375, "logps/weighted_chosen": -4.50634765625, "logps/weighted_rejected": -3.43408203125, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 269.8279113769531, "learning_rate": 4.6875e-08, "logits/chosen": -0.3176456093788147, "logits/rejected": -0.3530849814414978, "logps/chosen": -275.5694580078125, "logps/rejected": -255.875, "logps/weighted_chosen": -2.478325843811035, "logps/weighted_rejected": -2.635009765625, "loss": 0.6911, "rewards/accuracies": 0.2604166567325592, "rewards/chosen": -0.0588107630610466, "rewards/margins": -0.0334201380610466, "rewards/rejected": -0.025390625, "rewards/weighted_accuracies": 0.3229166567325592, "rewards/weighted_chosen": 0.0010899438057094812, "rewards/weighted_margins": 0.0014837053604424, "rewards/weighted_rejected": -0.00039333768654614687, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 12.950738906860352, "learning_rate": 9.895833333333332e-08, "logits/chosen": -0.2977302670478821, "logits/rejected": -0.3139175474643707, "logps/chosen": -294.37579345703125, "logps/rejected": -272.7203063964844, "logps/weighted_chosen": -2.30804443359375, "logps/weighted_rejected": -2.247570753097534, "loss": 0.692, "rewards/accuracies": 0.296875, "rewards/chosen": -0.05908203125, "rewards/margins": -0.01513671875, "rewards/rejected": -0.0439453125, "rewards/weighted_accuracies": 0.34687501192092896, "rewards/weighted_chosen": 7.171630568336695e-05, "rewards/weighted_margins": 3.662109520519152e-05, "rewards/weighted_rejected": 3.509521411615424e-05, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 65.14871215820312, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -0.2917991578578949, "logits/rejected": -0.3373458981513977, "logps/chosen": -297.9375, "logps/rejected": -268.0062561035156, "logps/weighted_chosen": -2.3156495094299316, "logps/weighted_rejected": -2.529247999191284, "loss": 0.693, "rewards/accuracies": 0.28125, "rewards/chosen": 0.02177734300494194, "rewards/margins": -0.06005859375, "rewards/rejected": 0.08183594048023224, "rewards/weighted_accuracies": 0.34062498807907104, "rewards/weighted_chosen": 0.0012060165172442794, "rewards/weighted_margins": -0.0021120072342455387, "rewards/weighted_rejected": 0.0033180236350744963, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 30.699859619140625, "learning_rate": 2.03125e-07, "logits/chosen": -0.301095575094223, "logits/rejected": -0.3433547914028168, "logps/chosen": -278.7124938964844, "logps/rejected": -253.84530639648438, "logps/weighted_chosen": -2.3508667945861816, "logps/weighted_rejected": -2.55126953125, "loss": 0.6909, "rewards/accuracies": 0.2718749940395355, "rewards/chosen": -0.014892578125, "rewards/margins": -0.105224609375, "rewards/rejected": 0.09033203125, "rewards/weighted_accuracies": 0.359375, "rewards/weighted_chosen": 0.0031726837623864412, "rewards/weighted_margins": 0.0010789871448650956, "rewards/weighted_rejected": 0.002093696501106024, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 15.275498390197754, "learning_rate": 2.552083333333333e-07, "logits/chosen": -0.28184205293655396, "logits/rejected": -0.32042425870895386, "logps/chosen": -280.19140625, "logps/rejected": -267.4398498535156, "logps/weighted_chosen": -2.281604051589966, "logps/weighted_rejected": -2.278125047683716, "loss": 0.691, "rewards/accuracies": 0.3125, "rewards/chosen": 0.009082031436264515, "rewards/margins": -0.11552734673023224, "rewards/rejected": 0.12460937350988388, "rewards/weighted_accuracies": 0.375, "rewards/weighted_chosen": 0.0036834715865552425, "rewards/weighted_margins": 0.0022117614280432463, "rewards/weighted_rejected": 0.0014717101585119963, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 42.36075973510742, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -0.3147949278354645, "logits/rejected": -0.3081672787666321, "logps/chosen": -277.5492248535156, "logps/rejected": -261.63201904296875, "logps/weighted_chosen": -2.6925292015075684, "logps/weighted_rejected": -2.5440917015075684, "loss": 0.6888, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 0.15625, "rewards/margins": 0.06132812425494194, "rewards/rejected": 0.09492187201976776, "rewards/weighted_accuracies": 0.43437498807907104, "rewards/weighted_chosen": 0.0014663696056231856, "rewards/weighted_margins": 0.0070289610885083675, "rewards/weighted_rejected": -0.005612182430922985, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 70.38953399658203, "learning_rate": 3.59375e-07, "logits/chosen": -0.31798094511032104, "logits/rejected": -0.32500457763671875, "logps/chosen": -289.6695251464844, "logps/rejected": -244.7351531982422, "logps/weighted_chosen": -2.19110107421875, "logps/weighted_rejected": -2.427661180496216, "loss": 0.6869, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36865234375, "rewards/margins": 0.34619140625, "rewards/rejected": 0.0224609375, "rewards/weighted_accuracies": 0.5375000238418579, "rewards/weighted_chosen": 0.02108154259622097, "rewards/weighted_margins": 0.011628913693130016, "rewards/weighted_rejected": 0.009454727172851562, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 54.81600570678711, "learning_rate": 4.114583333333333e-07, "logits/chosen": -0.2848251461982727, "logits/rejected": -0.3061889708042145, "logps/chosen": -289.08203125, "logps/rejected": -269.95782470703125, "logps/weighted_chosen": -2.3393311500549316, "logps/weighted_rejected": -2.5093994140625, "loss": 0.6827, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.679980456829071, "rewards/margins": 0.3672851622104645, "rewards/rejected": 0.31269532442092896, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.048927437514066696, "rewards/weighted_margins": 0.025808846578001976, "rewards/weighted_rejected": 0.02311401441693306, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 30.603567123413086, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -0.312631219625473, "logits/rejected": -0.30586013197898865, "logps/chosen": -279.92578125, "logps/rejected": -257.5234375, "logps/weighted_chosen": -2.5583739280700684, "logps/weighted_rejected": -2.6037840843200684, "loss": 0.6752, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.757617175579071, "rewards/margins": 0.5347656011581421, "rewards/rejected": 0.22285155951976776, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": 0.08651771396398544, "rewards/weighted_margins": 0.05735473707318306, "rewards/weighted_rejected": 0.02917785570025444, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 99.52488708496094, "learning_rate": 5.156249999999999e-07, "logits/chosen": -0.2793869078159332, "logits/rejected": -0.3509994447231293, "logps/chosen": -290.19061279296875, "logps/rejected": -285.94842529296875, "logps/weighted_chosen": -2.013537645339966, "logps/weighted_rejected": -2.5231690406799316, "loss": 0.6984, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.72607421875, "rewards/margins": 0.80712890625, "rewards/rejected": -0.0810546875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.07289886474609375, "rewards/weighted_margins": 0.019171524792909622, "rewards/weighted_rejected": 0.05372200161218643, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 55.06239318847656, "learning_rate": 5.677083333333333e-07, "logits/chosen": -0.3277145326137543, "logits/rejected": -0.3189544677734375, "logps/chosen": -296.17889404296875, "logps/rejected": -261.51953125, "logps/weighted_chosen": -2.647631883621216, "logps/weighted_rejected": -2.6458740234375, "loss": 0.6844, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.1376953125, "rewards/margins": 0.789746105670929, "rewards/rejected": -0.6522461175918579, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": 0.05432138592004776, "rewards/weighted_margins": 0.03696594387292862, "rewards/weighted_rejected": 0.01735992357134819, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 42.45036315917969, "learning_rate": 6.197916666666666e-07, "logits/chosen": -0.33232802152633667, "logits/rejected": -0.35865744948387146, "logps/chosen": -294.6078186035156, "logps/rejected": -255.01327514648438, "logps/weighted_chosen": -1.970422387123108, "logps/weighted_rejected": -2.1345458030700684, "loss": 0.6837, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0013671874767169356, "rewards/margins": 1.2595703601837158, "rewards/rejected": -1.258203148841858, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": 0.048947714269161224, "rewards/weighted_margins": 0.03674011304974556, "rewards/weighted_rejected": 0.012198830023407936, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 28.637327194213867, "learning_rate": 6.718749999999999e-07, "logits/chosen": -0.2933105528354645, "logits/rejected": -0.29020920395851135, "logps/chosen": -305.86602783203125, "logps/rejected": -279.09844970703125, "logps/weighted_chosen": -1.9946777820587158, "logps/weighted_rejected": -2.2933349609375, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.285400390625, "rewards/margins": 1.7852051258087158, "rewards/rejected": -1.4998047351837158, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.06223602220416069, "rewards/weighted_margins": 0.05499267578125, "rewards/weighted_rejected": 0.0072036744095385075, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 20.866477966308594, "learning_rate": 7.239583333333333e-07, "logits/chosen": -0.33005982637405396, "logits/rejected": -0.34638214111328125, "logps/chosen": -300.7281188964844, "logps/rejected": -276.9156188964844, "logps/weighted_chosen": -2.294506788253784, "logps/weighted_rejected": -2.561084032058716, "loss": 0.6606, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -1.998437523841858, "rewards/margins": 2.3173828125, "rewards/rejected": -4.315820217132568, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": 0.04616241529583931, "rewards/weighted_margins": 0.08864898979663849, "rewards/weighted_rejected": -0.04253540188074112, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 43.497623443603516, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.2890525758266449, "logits/rejected": -0.30130767822265625, "logps/chosen": -287.3109436035156, "logps/rejected": -256.72967529296875, "logps/weighted_chosen": -2.883862257003784, "logps/weighted_rejected": -2.796630859375, "loss": 0.664, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -3.108203172683716, "rewards/margins": 2.7275390625, "rewards/rejected": -5.835741996765137, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": 0.04738159105181694, "rewards/weighted_margins": 0.09773864597082138, "rewards/weighted_rejected": -0.05034027248620987, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 12.989988327026367, "learning_rate": 8.28125e-07, "logits/chosen": -0.3334007263183594, "logits/rejected": -0.3831237852573395, "logps/chosen": -308.2320251464844, "logps/rejected": -281.5335998535156, "logps/weighted_chosen": -2.378857374191284, "logps/weighted_rejected": -2.3824095726013184, "loss": 0.6464, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.347460746765137, "rewards/margins": 3.98388671875, "rewards/rejected": -8.331347465515137, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": 0.04017028957605362, "rewards/weighted_margins": 0.15716858208179474, "rewards/weighted_rejected": -0.11710510402917862, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 691.15625, "learning_rate": 8.802083333333333e-07, "logits/chosen": -0.37783128023147583, "logits/rejected": -0.37789613008499146, "logps/chosen": -310.76953125, "logps/rejected": -270.8374938964844, "logps/weighted_chosen": -2.6180419921875, "logps/weighted_rejected": -2.8051514625549316, "loss": 0.683, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -7.780859470367432, "rewards/margins": 4.9013671875, "rewards/rejected": -12.683398246765137, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": 0.06108245998620987, "rewards/weighted_margins": 0.17230224609375, "rewards/weighted_rejected": -0.11125946044921875, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 65.56312561035156, "learning_rate": 9.322916666666666e-07, "logits/chosen": -0.3433074951171875, "logits/rejected": -0.363290399312973, "logps/chosen": -285.8812561035156, "logps/rejected": -274.94842529296875, "logps/weighted_chosen": -2.50494384765625, "logps/weighted_rejected": -2.9246826171875, "loss": 0.6821, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -12.179101943969727, "rewards/margins": 6.432421684265137, "rewards/rejected": -18.608592987060547, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.0216827392578125, "rewards/weighted_margins": 0.19813232123851776, "rewards/weighted_rejected": -0.2197723388671875, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 26.828750610351562, "learning_rate": 9.84375e-07, "logits/chosen": -0.36560744047164917, "logits/rejected": -0.40380173921585083, "logps/chosen": -321.97967529296875, "logps/rejected": -287.5570373535156, "logps/weighted_chosen": -2.3037109375, "logps/weighted_rejected": -2.64697265625, "loss": 0.6489, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -16.246679306030273, "rewards/margins": 7.440039157867432, "rewards/rejected": -23.685937881469727, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.11536254733800888, "rewards/weighted_margins": 0.19368895888328552, "rewards/weighted_rejected": -0.309121698141098, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 117.84883880615234, "learning_rate": 9.99959085414323e-07, "logits/chosen": -0.4036270081996918, "logits/rejected": -0.43666380643844604, "logps/chosen": -327.8265686035156, "logps/rejected": -284.4234313964844, "logps/weighted_chosen": -2.6368165016174316, "logps/weighted_rejected": -3.0156006813049316, "loss": 0.6383, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -18.538379669189453, "rewards/margins": 8.212695121765137, "rewards/rejected": -26.748046875, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.12391586601734161, "rewards/weighted_margins": 0.28039854764938354, "rewards/weighted_rejected": -0.404205322265625, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 15.525572776794434, "learning_rate": 9.997587035630105e-07, "logits/chosen": -0.41632765531539917, "logits/rejected": -0.4597106873989105, "logps/chosen": -308.75469970703125, "logps/rejected": -316.69842529296875, "logps/weighted_chosen": -2.439282178878784, "logps/weighted_rejected": -3.152050733566284, "loss": 0.6437, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -21.632617950439453, "rewards/margins": 10.560155868530273, "rewards/rejected": -32.18867111206055, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.2110244780778885, "rewards/weighted_margins": 0.23976440727710724, "rewards/weighted_rejected": -0.4508819580078125, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 31.583986282348633, "learning_rate": 9.99391406364405e-07, "logits/chosen": -0.37994080781936646, "logits/rejected": -0.384744256734848, "logps/chosen": -318.63592529296875, "logps/rejected": -305.0, "logps/weighted_chosen": -2.9573731422424316, "logps/weighted_rejected": -3.207348585128784, "loss": 0.7259, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -26.522266387939453, "rewards/margins": 9.657129287719727, "rewards/rejected": -36.18437576293945, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.3494918942451477, "rewards/weighted_margins": 0.17980042099952698, "rewards/weighted_rejected": -0.5295776128768921, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 611.2006225585938, "learning_rate": 9.988573164927884e-07, "logits/chosen": -0.3254447877407074, "logits/rejected": -0.36442261934280396, "logps/chosen": -297.16717529296875, "logps/rejected": -296.48907470703125, "logps/weighted_chosen": -2.35888671875, "logps/weighted_rejected": -2.804126024246216, "loss": 0.6637, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -28.293750762939453, "rewards/margins": 16.522851943969727, "rewards/rejected": -44.817970275878906, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -0.1478603333234787, "rewards/weighted_margins": 0.3882904052734375, "rewards/weighted_rejected": -0.5360336303710938, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 34.738059997558594, "learning_rate": 9.98156612329838e-07, "logits/chosen": -0.3876676559448242, "logits/rejected": -0.4364303648471832, "logps/chosen": -312.5921936035156, "logps/rejected": -348.60626220703125, "logps/weighted_chosen": -2.594775438308716, "logps/weighted_rejected": -3.1068115234375, "loss": 0.7482, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -47.75507736206055, "rewards/margins": 18.762109756469727, "rewards/rejected": -66.5289077758789, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.319406121969223, "rewards/weighted_margins": 0.204498291015625, "rewards/weighted_rejected": -0.5241454839706421, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 141.2784423828125, "learning_rate": 9.97289527905053e-07, "logits/chosen": -0.4159797728061676, "logits/rejected": -0.4382568299770355, "logps/chosen": -308.44842529296875, "logps/rejected": -313.49530029296875, "logps/weighted_chosen": -2.802807569503784, "logps/weighted_rejected": -2.9874024391174316, "loss": 0.7583, "rewards/accuracies": 0.609375, "rewards/chosen": -44.16621017456055, "rewards/margins": 13.382226943969727, "rewards/rejected": -57.54296875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.20551452040672302, "rewards/weighted_margins": 0.22150878608226776, "rewards/weighted_rejected": -0.42730408906936646, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 75.63247680664062, "learning_rate": 9.962563528175875e-07, "logits/chosen": -0.4156021177768707, "logits/rejected": -0.4493362307548523, "logps/chosen": -342.1312561035156, "logps/rejected": -317.84063720703125, "logps/weighted_chosen": -2.56585693359375, "logps/weighted_rejected": -3.4276366233825684, "loss": 0.6673, "rewards/accuracies": 0.5625, "rewards/chosen": -43.44648361206055, "rewards/margins": 13.459375381469727, "rewards/rejected": -56.90898513793945, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.310464471578598, "rewards/weighted_margins": 0.32973939180374146, "rewards/weighted_rejected": -0.6404876708984375, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 32.991607666015625, "learning_rate": 9.950574321395277e-07, "logits/chosen": -0.507153332233429, "logits/rejected": -0.5264068841934204, "logps/chosen": -317.0335998535156, "logps/rejected": -300.5718688964844, "logps/weighted_chosen": -2.669018507003784, "logps/weighted_rejected": -2.8846678733825684, "loss": 0.6774, "rewards/accuracies": 0.609375, "rewards/chosen": -27.629491806030273, "rewards/margins": 9.762109756469727, "rewards/rejected": -37.39765548706055, "rewards/weighted_accuracies": 0.6000000238418579, "rewards/weighted_chosen": -0.23316803574562073, "rewards/weighted_margins": 0.2321929931640625, "rewards/weighted_rejected": -0.46527099609375, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 41.87471389770508, "learning_rate": 9.936931663006413e-07, "logits/chosen": -0.5015045404434204, "logits/rejected": -0.49525564908981323, "logps/chosen": -319.9296875, "logps/rejected": -312.28826904296875, "logps/weighted_chosen": -2.667797803878784, "logps/weighted_rejected": -3.121386766433716, "loss": 0.6458, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -17.077342987060547, "rewards/margins": 15.751367568969727, "rewards/rejected": -32.82636642456055, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -0.10239715874195099, "rewards/weighted_margins": 0.3067169189453125, "rewards/weighted_rejected": -0.409027099609375, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 22.655261993408203, "learning_rate": 9.921640109546357e-07, "logits/chosen": -0.4936012327671051, "logits/rejected": -0.5640270113945007, "logps/chosen": -286.4117126464844, "logps/rejected": -286.15313720703125, "logps/weighted_chosen": -2.640380859375, "logps/weighted_rejected": -3.774829149246216, "loss": 0.6369, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -19.629491806030273, "rewards/margins": 14.98291015625, "rewards/rejected": -34.61503982543945, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.21005859971046448, "rewards/weighted_margins": 0.3854110836982727, "rewards/weighted_rejected": -0.5955780148506165, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 33.83458709716797, "learning_rate": 9.90470476826975e-07, "logits/chosen": -0.5687958002090454, "logits/rejected": -0.5700439214706421, "logps/chosen": -298.06561279296875, "logps/rejected": -311.8359375, "logps/weighted_chosen": -2.555248975753784, "logps/weighted_rejected": -3.0333251953125, "loss": 0.6326, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -28.754297256469727, "rewards/margins": 15.813085556030273, "rewards/rejected": -44.567771911621094, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.25576478242874146, "rewards/weighted_margins": 0.3327178955078125, "rewards/weighted_rejected": -0.5886474847793579, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 29.578954696655273, "learning_rate": 9.886131295443002e-07, "logits/chosen": -0.6218292117118835, "logits/rejected": -0.6808746457099915, "logps/chosen": -313.44061279296875, "logps/rejected": -297.9156188964844, "logps/weighted_chosen": -2.719403028488159, "logps/weighted_rejected": -3.1142334938049316, "loss": 0.6221, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -32.1875, "rewards/margins": 15.571874618530273, "rewards/rejected": -47.7666015625, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.19769592583179474, "rewards/weighted_margins": 0.521374523639679, "rewards/weighted_rejected": -0.7188171148300171, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 15.948070526123047, "learning_rate": 9.865925894455166e-07, "logits/chosen": -0.6636382937431335, "logits/rejected": -0.6817939877510071, "logps/chosen": -336.1507873535156, "logps/rejected": -306.62890625, "logps/weighted_chosen": -2.83941650390625, "logps/weighted_rejected": -3.273486375808716, "loss": 0.7046, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -36.21074295043945, "rewards/margins": 12.9765625, "rewards/rejected": -49.17656326293945, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.3679946959018707, "rewards/weighted_margins": 0.278738409280777, "rewards/weighted_rejected": -0.6464904546737671, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 53.10931396484375, "learning_rate": 9.84409531374603e-07, "logits/chosen": -0.6222923398017883, "logits/rejected": -0.607867419719696, "logps/chosen": -344.19061279296875, "logps/rejected": -315.84686279296875, "logps/weighted_chosen": -2.915234327316284, "logps/weighted_rejected": -3.346630811691284, "loss": 0.6334, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -39.39531326293945, "rewards/margins": 13.606640815734863, "rewards/rejected": -53.00273513793945, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -0.33662718534469604, "rewards/weighted_margins": 0.3510375916957855, "rewards/weighted_rejected": -0.6879855990409851, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 13.994344711303711, "learning_rate": 9.820646844552219e-07, "logits/chosen": -0.6615959405899048, "logits/rejected": -0.714038074016571, "logps/chosen": -312.7328186035156, "logps/rejected": -323.3359375, "logps/weighted_chosen": -2.910736083984375, "logps/weighted_rejected": -3.2298340797424316, "loss": 0.6376, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -36.327537536621094, "rewards/margins": 21.465625762939453, "rewards/rejected": -57.79179763793945, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.34371185302734375, "rewards/weighted_margins": 0.390716552734375, "rewards/weighted_rejected": -0.734301745891571, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 13.682384490966797, "learning_rate": 9.795588318471964e-07, "logits/chosen": -0.774444580078125, "logits/rejected": -0.7887184023857117, "logps/chosen": -302.76251220703125, "logps/rejected": -336.90313720703125, "logps/weighted_chosen": -2.718432664871216, "logps/weighted_rejected": -3.0622315406799316, "loss": 0.6451, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -43.29413986206055, "rewards/margins": 17.229297637939453, "rewards/rejected": -60.52812576293945, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -0.33645325899124146, "rewards/weighted_margins": 0.33460694551467896, "rewards/weighted_rejected": -0.6709426641464233, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 19.35496711730957, "learning_rate": 9.768928104849415e-07, "logits/chosen": -0.815447986125946, "logits/rejected": -0.8216186761856079, "logps/chosen": -330.66015625, "logps/rejected": -313.8687438964844, "logps/weighted_chosen": -2.970752000808716, "logps/weighted_rejected": -3.1889405250549316, "loss": 0.6763, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -47.90937423706055, "rewards/margins": 17.064844131469727, "rewards/rejected": -64.98515319824219, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.3897460997104645, "rewards/weighted_margins": 0.384918212890625, "rewards/weighted_rejected": -0.7745300531387329, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 21.70383071899414, "learning_rate": 9.740675107979355e-07, "logits/chosen": -0.7939224243164062, "logits/rejected": -0.827624499797821, "logps/chosen": -373.0406188964844, "logps/rejected": -349.3343811035156, "logps/weighted_chosen": -2.506054639816284, "logps/weighted_rejected": -3.43896484375, "loss": 0.6215, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -58.082420349121094, "rewards/margins": 16.594335556030273, "rewards/rejected": -74.671875, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.48696595430374146, "rewards/weighted_margins": 0.412750244140625, "rewards/weighted_rejected": -0.90020751953125, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 21.682790756225586, "learning_rate": 9.71083876413323e-07, "logits/chosen": -0.77825927734375, "logits/rejected": -0.7921401858329773, "logps/chosen": -367.0062561035156, "logps/rejected": -355.78594970703125, "logps/weighted_chosen": -2.644775390625, "logps/weighted_rejected": -3.5374755859375, "loss": 0.6609, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -64.59492492675781, "rewards/margins": 21.614843368530273, "rewards/rejected": -86.1988296508789, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.5685027837753296, "rewards/weighted_margins": 0.3586792051792145, "rewards/weighted_rejected": -0.9269439578056335, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 30.26679229736328, "learning_rate": 9.67942903840751e-07, "logits/chosen": -0.8401764035224915, "logits/rejected": -0.898754894733429, "logps/chosen": -362.0328063964844, "logps/rejected": -362.3687438964844, "logps/weighted_chosen": -2.8179688453674316, "logps/weighted_rejected": -3.4515624046325684, "loss": 0.5992, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -57.87226486206055, "rewards/margins": 30.196680068969727, "rewards/rejected": -88.0804672241211, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -0.456369012594223, "rewards/weighted_margins": 0.495849609375, "rewards/weighted_rejected": -0.951983630657196, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 37.81236267089844, "learning_rate": 9.646456421395447e-07, "logits/chosen": -0.8902359008789062, "logits/rejected": -0.9107666015625, "logps/chosen": -380.6343688964844, "logps/rejected": -399.7281188964844, "logps/weighted_chosen": -2.709228515625, "logps/weighted_rejected": -3.455004930496216, "loss": 0.6557, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -62.43437576293945, "rewards/margins": 33.634765625, "rewards/rejected": -96.0941390991211, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.4688262939453125, "rewards/weighted_margins": 0.3546081483364105, "rewards/weighted_rejected": -0.8232513666152954, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 21.761716842651367, "learning_rate": 9.611931925683266e-07, "logits/chosen": -0.849993109703064, "logits/rejected": -0.8886367678642273, "logps/chosen": -368.8374938964844, "logps/rejected": -353.73126220703125, "logps/weighted_chosen": -2.533374071121216, "logps/weighted_rejected": -3.233935594558716, "loss": 0.591, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -62.575782775878906, "rewards/margins": 27.423242568969727, "rewards/rejected": -90.0132827758789, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.4288879334926605, "rewards/weighted_margins": 0.507708728313446, "rewards/weighted_rejected": -0.9364013671875, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 28.842378616333008, "learning_rate": 9.575867082172085e-07, "logits/chosen": -0.836225152015686, "logits/rejected": -0.8854034543037415, "logps/chosen": -371.90625, "logps/rejected": -369.4125061035156, "logps/weighted_chosen": -2.9190917015075684, "logps/weighted_rejected": -3.118579149246216, "loss": 0.6198, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -66.98905944824219, "rewards/margins": 32.337890625, "rewards/rejected": -99.3402328491211, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.4911743104457855, "rewards/weighted_margins": 0.5335143804550171, "rewards/weighted_rejected": -1.0242736339569092, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 16.805343627929688, "learning_rate": 9.538273936226673e-07, "logits/chosen": -0.8646026849746704, "logits/rejected": -0.898516833782196, "logps/chosen": -328.94921875, "logps/rejected": -353.66717529296875, "logps/weighted_chosen": -3.1983399391174316, "logps/weighted_rejected": -3.6509766578674316, "loss": 0.6173, "rewards/accuracies": 0.625, "rewards/chosen": -63.52910232543945, "rewards/margins": 25.6201171875, "rewards/rejected": -89.1246109008789, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.436279296875, "rewards/weighted_margins": 0.438516229391098, "rewards/weighted_rejected": -0.8747711181640625, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 23.78999137878418, "learning_rate": 9.499165043652391e-07, "logits/chosen": -0.8759521245956421, "logits/rejected": -0.8865814208984375, "logps/chosen": -358.69140625, "logps/rejected": -358.76251220703125, "logps/weighted_chosen": -3.2625732421875, "logps/weighted_rejected": -3.359301805496216, "loss": 0.6366, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -67.4326171875, "rewards/margins": 24.610157012939453, "rewards/rejected": -92.03788757324219, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.5896576046943665, "rewards/weighted_margins": 0.34770506620407104, "rewards/weighted_rejected": -0.9376128911972046, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 49.646263122558594, "learning_rate": 9.458553466501665e-07, "logits/chosen": -0.9158996343612671, "logits/rejected": -0.9590820074081421, "logps/chosen": -355.3109436035156, "logps/rejected": -341.9765625, "logps/weighted_chosen": -3.231823682785034, "logps/weighted_rejected": -3.4568848609924316, "loss": 0.657, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -69.14453125, "rewards/margins": 28.884180068969727, "rewards/rejected": -98.0492172241211, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.6470184326171875, "rewards/weighted_margins": 0.4533935487270355, "rewards/weighted_rejected": -1.100683569908142, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 38.45325469970703, "learning_rate": 9.416452768711366e-07, "logits/chosen": -0.9281005859375, "logits/rejected": -0.9545837640762329, "logps/chosen": -375.6390686035156, "logps/rejected": -370.203125, "logps/weighted_chosen": -2.9860596656799316, "logps/weighted_rejected": -3.635498046875, "loss": 0.6457, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -83.3871078491211, "rewards/margins": 26.872264862060547, "rewards/rejected": -110.26094055175781, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.6367782354354858, "rewards/weighted_margins": 0.515301525592804, "rewards/weighted_rejected": -1.152197241783142, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 59.67642593383789, "learning_rate": 9.372877011572557e-07, "logits/chosen": -0.8797394037246704, "logits/rejected": -0.904949963092804, "logps/chosen": -398.8531188964844, "logps/rejected": -388.22186279296875, "logps/weighted_chosen": -3.0103516578674316, "logps/weighted_rejected": -3.409374952316284, "loss": 0.6181, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -88.2437515258789, "rewards/margins": 25.542577743530273, "rewards/rejected": -113.7890625, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.6517273187637329, "rewards/weighted_margins": 0.482818603515625, "rewards/weighted_rejected": -1.1340011358261108, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 27.465656280517578, "learning_rate": 9.327840749034141e-07, "logits/chosen": -0.910723865032196, "logits/rejected": -0.9439147710800171, "logps/chosen": -364.4750061035156, "logps/rejected": -392.03436279296875, "logps/weighted_chosen": -2.8880372047424316, "logps/weighted_rejected": -4.069311618804932, "loss": 0.6276, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -77.44355773925781, "rewards/margins": 38.147850036621094, "rewards/rejected": -115.59648132324219, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.5768798589706421, "rewards/weighted_margins": 0.541027843952179, "rewards/weighted_rejected": -1.1178162097930908, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 34.35914993286133, "learning_rate": 9.281359022841965e-07, "logits/chosen": -0.80584716796875, "logits/rejected": -0.8291991949081421, "logps/chosen": -349.86798095703125, "logps/rejected": -354.75701904296875, "logps/weighted_chosen": -2.914257764816284, "logps/weighted_rejected": -4.178680419921875, "loss": 0.63, "rewards/accuracies": 0.625, "rewards/chosen": -74.94511413574219, "rewards/margins": 34.957618713378906, "rewards/rejected": -109.90644836425781, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.564099133014679, "rewards/weighted_margins": 0.658795177936554, "rewards/weighted_rejected": -1.2229950428009033, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 41.01088333129883, "learning_rate": 9.233447357514989e-07, "logits/chosen": -0.775347888469696, "logits/rejected": -0.8226684331893921, "logps/chosen": -369.1148376464844, "logps/rejected": -373.8374938964844, "logps/weighted_chosen": -3.288012742996216, "logps/weighted_rejected": -3.7696776390075684, "loss": 0.6225, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -71.1283187866211, "rewards/margins": 32.867576599121094, "rewards/rejected": -104.0218734741211, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.5540069341659546, "rewards/weighted_margins": 0.598828136920929, "rewards/weighted_rejected": -1.1535155773162842, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.8767929673194885, "eval_logits/rejected": -0.8968105316162109, "eval_logps/chosen": -369.21099853515625, "eval_logps/rejected": -379.3609924316406, "eval_logps/weighted_chosen": -2.957455635070801, "eval_logps/weighted_rejected": -3.6658689975738525, "eval_loss": 0.6115986108779907, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -80.43468475341797, "eval_rewards/margins": 31.297624588012695, "eval_rewards/rejected": -111.73987579345703, "eval_rewards/weighted_accuracies": 0.6759999990463257, "eval_rewards/weighted_chosen": -0.5706117153167725, "eval_rewards/weighted_margins": 0.5400464534759521, "eval_rewards/weighted_rejected": -1.1106581687927246, "eval_runtime": 1361.4222, "eval_samples_per_second": 1.469, "eval_steps_per_second": 0.367, "step": 500 }, { "epoch": 0.2669458257000785, "grad_norm": 37.983856201171875, "learning_rate": 9.184121755160232e-07, "logits/chosen": -0.8435028195381165, "logits/rejected": -0.8772613406181335, "logps/chosen": -378.20782470703125, "logps/rejected": -406.3187561035156, "logps/weighted_chosen": -3.217089891433716, "logps/weighted_rejected": -3.7809815406799316, "loss": 0.6028, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -77.0746078491211, "rewards/margins": 40.68242263793945, "rewards/rejected": -117.759765625, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.5556915402412415, "rewards/weighted_margins": 0.5749145746231079, "rewards/weighted_rejected": -1.130804419517517, "step": 510 }, { "epoch": 0.2721800575765506, "grad_norm": 46.08055877685547, "learning_rate": 9.133398690128193e-07, "logits/chosen": -0.894885241985321, "logits/rejected": -0.9157882928848267, "logps/chosen": -405.5679626464844, "logps/rejected": -415.10076904296875, "logps/weighted_chosen": -3.0770506858825684, "logps/weighted_rejected": -3.949780225753784, "loss": 0.5762, "rewards/accuracies": 0.65625, "rewards/chosen": -84.00956726074219, "rewards/margins": 50.470703125, "rewards/rejected": -134.46737670898438, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -0.49931639432907104, "rewards/weighted_margins": 0.658154308795929, "rewards/weighted_rejected": -1.1574280261993408, "step": 520 }, { "epoch": 0.27741428945302277, "grad_norm": 40.89746856689453, "learning_rate": 9.081295103510554e-07, "logits/chosen": -0.8876678347587585, "logits/rejected": -0.9430938959121704, "logps/chosen": -383.52734375, "logps/rejected": -413.60626220703125, "logps/weighted_chosen": -2.752880811691284, "logps/weighted_rejected": -4.01953125, "loss": 0.5471, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -99.4878921508789, "rewards/margins": 44.412109375, "rewards/rejected": -143.859375, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -0.6086364984512329, "rewards/weighted_margins": 0.8298507928848267, "rewards/weighted_rejected": -1.438207983970642, "step": 530 }, { "epoch": 0.2826485213294949, "grad_norm": 27.849748611450195, "learning_rate": 9.027828397481989e-07, "logits/chosen": -0.8531036376953125, "logits/rejected": -0.885467529296875, "logps/chosen": -354.4359436035156, "logps/rejected": -387.6328125, "logps/weighted_chosen": -3.355029344558716, "logps/weighted_rejected": -4.168432712554932, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": -93.89375305175781, "rewards/margins": 38.216407775878906, "rewards/rejected": -132.11776733398438, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.6640685796737671, "rewards/weighted_margins": 0.6357482671737671, "rewards/weighted_rejected": -1.2999756336212158, "step": 540 }, { "epoch": 0.287882753205967, "grad_norm": 22.531660079956055, "learning_rate": 8.973016429487988e-07, "logits/chosen": -0.8864593505859375, "logits/rejected": -0.9035919308662415, "logps/chosen": -375.8890686035156, "logps/rejected": -396.171875, "logps/weighted_chosen": -3.2917237281799316, "logps/weighted_rejected": -3.580859422683716, "loss": 0.6356, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -94.04902648925781, "rewards/margins": 47.923240661621094, "rewards/rejected": -141.9646453857422, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -0.7129974365234375, "rewards/weighted_margins": 0.53741455078125, "rewards/weighted_rejected": -1.2503478527069092, "step": 550 }, { "epoch": 0.29311698508243916, "grad_norm": 36.68463897705078, "learning_rate": 8.916877506280601e-07, "logits/chosen": -0.9174438714981079, "logits/rejected": -0.9228302240371704, "logps/chosen": -387.2203063964844, "logps/rejected": -405.38592529296875, "logps/weighted_chosen": -3.4023194313049316, "logps/weighted_rejected": -3.6703858375549316, "loss": 0.5995, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -106.43359375, "rewards/margins": 44.87871170043945, "rewards/rejected": -151.3136749267578, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.7533203363418579, "rewards/weighted_margins": 0.627819836139679, "rewards/weighted_rejected": -1.3810546398162842, "step": 560 }, { "epoch": 0.29835121695891126, "grad_norm": 14.758890151977539, "learning_rate": 8.85943037780415e-07, "logits/chosen": -0.9987869262695312, "logits/rejected": -1.0033996105194092, "logps/chosen": -392.72344970703125, "logps/rejected": -380.15625, "logps/weighted_chosen": -3.2074217796325684, "logps/weighted_rejected": -3.7137207984924316, "loss": 0.6343, "rewards/accuracies": 0.609375, "rewards/chosen": -106.4300765991211, "rewards/margins": 32.053321838378906, "rewards/rejected": -138.525390625, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -0.744158923625946, "rewards/weighted_margins": 0.547467052936554, "rewards/weighted_rejected": -1.291876196861267, "step": 570 }, { "epoch": 0.3035854488353834, "grad_norm": 18.140382766723633, "learning_rate": 8.800694230932884e-07, "logits/chosen": -0.958880603313446, "logits/rejected": -0.9720214605331421, "logps/chosen": -390.57342529296875, "logps/rejected": -398.5562438964844, "logps/weighted_chosen": -2.86993408203125, "logps/weighted_rejected": -3.4458250999450684, "loss": 0.624, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -100.91777038574219, "rewards/margins": 32.43359375, "rewards/rejected": -133.35546875, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -0.558367908000946, "rewards/weighted_margins": 0.4340454041957855, "rewards/weighted_rejected": -0.992688000202179, "step": 580 }, { "epoch": 0.30881968071185556, "grad_norm": 16.685937881469727, "learning_rate": 8.740688683062723e-07, "logits/chosen": -1.0655395984649658, "logits/rejected": -1.0949890613555908, "logps/chosen": -449.0078125, "logps/rejected": -423.9593811035156, "logps/weighted_chosen": -2.912792921066284, "logps/weighted_rejected": -3.656201124191284, "loss": 0.6344, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -119.9111328125, "rewards/margins": 28.247461318969727, "rewards/rejected": -148.1476593017578, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -0.5903121829032898, "rewards/weighted_margins": 0.525067150592804, "rewards/weighted_rejected": -1.1153564453125, "step": 590 }, { "epoch": 0.31405391258832765, "grad_norm": 20.500003814697266, "learning_rate": 8.679433775559215e-07, "logits/chosen": -1.0546143054962158, "logits/rejected": -1.1019103527069092, "logps/chosen": -473.6171875, "logps/rejected": -477.95001220703125, "logps/weighted_chosen": -2.896728515625, "logps/weighted_rejected": -3.985156297683716, "loss": 0.5741, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -148.9021453857422, "rewards/margins": 43.50390625, "rewards/rejected": -192.4656219482422, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -0.7574096918106079, "rewards/weighted_margins": 0.587969958782196, "rewards/weighted_rejected": -1.345495581626892, "step": 600 }, { "epoch": 0.3192881444647998, "grad_norm": 50.760562896728516, "learning_rate": 8.616949967063871e-07, "logits/chosen": -1.007971167564392, "logits/rejected": -1.0581481456756592, "logps/chosen": -419.1000061035156, "logps/rejected": -460.9624938964844, "logps/weighted_chosen": -3.3387207984924316, "logps/weighted_rejected": -3.981640577316284, "loss": 0.5971, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -150.87344360351562, "rewards/margins": 49.451171875, "rewards/rejected": -200.21444702148438, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.853253185749054, "rewards/weighted_margins": 0.644561767578125, "rewards/weighted_rejected": -1.497961401939392, "step": 610 }, { "epoch": 0.3245223763412719, "grad_norm": 16.369630813598633, "learning_rate": 8.553258126661154e-07, "logits/chosen": -1.0560119152069092, "logits/rejected": -1.0674712657928467, "logps/chosen": -442.5859375, "logps/rejected": -463.5718688964844, "logps/weighted_chosen": -3.429705858230591, "logps/weighted_rejected": -4.259472846984863, "loss": 0.6405, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -162.7121124267578, "rewards/margins": 44.754295349121094, "rewards/rejected": -207.43008422851562, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.9004287719726562, "rewards/weighted_margins": 0.6113220453262329, "rewards/weighted_rejected": -1.511694312095642, "step": 620 }, { "epoch": 0.32975660821774405, "grad_norm": 31.255151748657227, "learning_rate": 8.488379526908368e-07, "logits/chosen": -1.0301024913787842, "logits/rejected": -1.0498230457305908, "logps/chosen": -465.5859375, "logps/rejected": -505.28125, "logps/weighted_chosen": -3.3856444358825684, "logps/weighted_rejected": -4.028759956359863, "loss": 0.6384, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -174.98046875, "rewards/margins": 61.357032775878906, "rewards/rejected": -236.38906860351562, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.969250500202179, "rewards/weighted_margins": 0.5812011957168579, "rewards/weighted_rejected": -1.5499999523162842, "step": 630 }, { "epoch": 0.33499084009421615, "grad_norm": 23.673486709594727, "learning_rate": 8.422335836730802e-07, "logits/chosen": -1.0135101079940796, "logits/rejected": -1.0169677734375, "logps/chosen": -426.2875061035156, "logps/rejected": -497.64373779296875, "logps/weighted_chosen": -3.098522901535034, "logps/weighted_rejected": -3.898510694503784, "loss": 0.6321, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -152.58749389648438, "rewards/margins": 67.427734375, "rewards/rejected": -220.0578155517578, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -0.878277599811554, "rewards/weighted_margins": 0.590301513671875, "rewards/weighted_rejected": -1.468664526939392, "step": 640 }, { "epoch": 0.3402250719706883, "grad_norm": 20.44074821472168, "learning_rate": 8.355149114184485e-07, "logits/chosen": -1.105963110923767, "logits/rejected": -1.095733642578125, "logps/chosen": -474.7554626464844, "logps/rejected": -523.0843505859375, "logps/weighted_chosen": -3.1939697265625, "logps/weighted_rejected": -3.786669969558716, "loss": 0.5634, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -167.33828735351562, "rewards/margins": 73.52656555175781, "rewards/rejected": -240.85702514648438, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.8172271847724915, "rewards/weighted_margins": 0.6973236203193665, "rewards/weighted_rejected": -1.514801025390625, "step": 650 }, { "epoch": 0.34545930384716045, "grad_norm": 25.958847045898438, "learning_rate": 8.286841799088963e-07, "logits/chosen": -1.1746642589569092, "logits/rejected": -1.179473876953125, "logps/chosen": -480.34063720703125, "logps/rejected": -518.9437255859375, "logps/weighted_chosen": -2.8995361328125, "logps/weighted_rejected": -3.656787157058716, "loss": 0.6073, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -196.3136749267578, "rewards/margins": 59.513671875, "rewards/rejected": -255.85897827148438, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.8131744265556335, "rewards/weighted_margins": 0.576770007610321, "rewards/weighted_rejected": -1.3900268077850342, "step": 660 }, { "epoch": 0.35069353572363254, "grad_norm": 15.336409568786621, "learning_rate": 8.217436705532599e-07, "logits/chosen": -1.1510436534881592, "logits/rejected": -1.173730492591858, "logps/chosen": -508.78594970703125, "logps/rejected": -500.79217529296875, "logps/weighted_chosen": -3.0011963844299316, "logps/weighted_rejected": -3.7521729469299316, "loss": 0.578, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -205.71914672851562, "rewards/margins": 31.244531631469727, "rewards/rejected": -237.14453125, "rewards/weighted_accuracies": 0.721875011920929, "rewards/weighted_chosen": -0.8121887445449829, "rewards/weighted_margins": 0.688189685344696, "rewards/weighted_rejected": -1.500115990638733, "step": 670 }, { "epoch": 0.3559277676001047, "grad_norm": 19.029943466186523, "learning_rate": 8.14695701425284e-07, "logits/chosen": -1.079345703125, "logits/rejected": -1.125024437904358, "logps/chosen": -466.38360595703125, "logps/rejected": -470.4234313964844, "logps/weighted_chosen": -2.9954833984375, "logps/weighted_rejected": -3.8232421875, "loss": 0.5882, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -160.8718719482422, "rewards/margins": 47.43242263793945, "rewards/rejected": -208.275390625, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.7394653558731079, "rewards/weighted_margins": 0.627307116985321, "rewards/weighted_rejected": -1.3663330078125, "step": 680 }, { "epoch": 0.3611619994765768, "grad_norm": 13.842121124267578, "learning_rate": 8.075426264894046e-07, "logits/chosen": -1.013006567955017, "logits/rejected": -1.0405089855194092, "logps/chosen": -472.03436279296875, "logps/rejected": -501.09375, "logps/weighted_chosen": -3.1174073219299316, "logps/weighted_rejected": -4.429980278015137, "loss": 0.5163, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -166.5642547607422, "rewards/margins": 56.965431213378906, "rewards/rejected": -223.5593719482422, "rewards/weighted_accuracies": 0.778124988079071, "rewards/weighted_chosen": -0.7782195806503296, "rewards/weighted_margins": 0.887194812297821, "rewards/weighted_rejected": -1.665307641029358, "step": 690 }, { "epoch": 0.36639623135304894, "grad_norm": 31.751785278320312, "learning_rate": 8.002868348145435e-07, "logits/chosen": -1.0329437255859375, "logits/rejected": -1.0401580333709717, "logps/chosen": -535.6437377929688, "logps/rejected": -564.8609619140625, "logps/weighted_chosen": -3.4560546875, "logps/weighted_rejected": -3.8817505836486816, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": -235.3767547607422, "rewards/margins": 61.516014099121094, "rewards/rejected": -296.88360595703125, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -1.1499541997909546, "rewards/weighted_margins": 0.6718994379043579, "rewards/weighted_rejected": -1.8215301036834717, "step": 700 }, { "epoch": 0.3716304632295211, "grad_norm": 32.05786895751953, "learning_rate": 7.92930749776179e-07, "logits/chosen": -1.0492126941680908, "logits/rejected": -1.092248558998108, "logps/chosen": -564.2515869140625, "logps/rejected": -602.4500122070312, "logps/weighted_chosen": -3.8183350563049316, "logps/weighted_rejected": -4.569482326507568, "loss": 0.652, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -292.5419921875, "rewards/margins": 49.53203201293945, "rewards/rejected": -342.0179748535156, "rewards/weighted_accuracies": 0.6656249761581421, "rewards/weighted_chosen": -1.244970679283142, "rewards/weighted_margins": 0.630566418170929, "rewards/weighted_rejected": -1.8747985363006592, "step": 710 }, { "epoch": 0.3768646951059932, "grad_norm": 23.8446044921875, "learning_rate": 7.854768282469582e-07, "logits/chosen": -1.0707213878631592, "logits/rejected": -1.126983642578125, "logps/chosen": -462.90155029296875, "logps/rejected": -542.5218505859375, "logps/weighted_chosen": -3.208447217941284, "logps/weighted_rejected": -4.043383598327637, "loss": 0.6115, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -185.4597625732422, "rewards/margins": 79.3746109008789, "rewards/rejected": -264.8812561035156, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.8762931823730469, "rewards/weighted_margins": 0.711090087890625, "rewards/weighted_rejected": -1.5873870849609375, "step": 720 }, { "epoch": 0.38209892698246534, "grad_norm": 17.816680908203125, "learning_rate": 7.779275597761215e-07, "logits/chosen": -1.036431908607483, "logits/rejected": -1.105950951576233, "logps/chosen": -471.43280029296875, "logps/rejected": -532.6843872070312, "logps/weighted_chosen": -3.378674268722534, "logps/weighted_rejected": -4.096581935882568, "loss": 0.5442, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -195.43984985351562, "rewards/margins": 68.92460632324219, "rewards/rejected": -264.296875, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -0.847393810749054, "rewards/weighted_margins": 0.777783215045929, "rewards/weighted_rejected": -1.625451683998108, "step": 730 }, { "epoch": 0.38733315885893743, "grad_norm": 29.750192642211914, "learning_rate": 7.702854657580126e-07, "logits/chosen": -1.0928223133087158, "logits/rejected": -1.1139404773712158, "logps/chosen": -507.796875, "logps/rejected": -523.7952880859375, "logps/weighted_chosen": -3.3760986328125, "logps/weighted_rejected": -4.447509765625, "loss": 0.5597, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -213.2277374267578, "rewards/margins": 56.09648513793945, "rewards/rejected": -269.4203186035156, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -0.927258312702179, "rewards/weighted_margins": 0.8479980230331421, "rewards/weighted_rejected": -1.7749145030975342, "step": 740 }, { "epoch": 0.3925673907354096, "grad_norm": 38.97298812866211, "learning_rate": 7.625530985899547e-07, "logits/chosen": -1.0656921863555908, "logits/rejected": -1.0953140258789062, "logps/chosen": -491.4609375, "logps/rejected": -545.0203247070312, "logps/weighted_chosen": -3.2592406272888184, "logps/weighted_rejected": -4.605615139007568, "loss": 0.5776, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -224.6374969482422, "rewards/margins": 75.23710632324219, "rewards/rejected": -300.0316467285156, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -1.093621850013733, "rewards/weighted_margins": 0.852429211139679, "rewards/weighted_rejected": -1.9456908702850342, "step": 750 }, { "epoch": 0.39780162261188173, "grad_norm": 22.909940719604492, "learning_rate": 7.547330408197694e-07, "logits/chosen": -1.0663268566131592, "logits/rejected": -1.1003234386444092, "logps/chosen": -572.404296875, "logps/rejected": -556.4031372070312, "logps/weighted_chosen": -3.372363328933716, "logps/weighted_rejected": -4.400781154632568, "loss": 0.5943, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -276.7529296875, "rewards/margins": 23.257421493530273, "rewards/rejected": -300.1558532714844, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -1.1045868396759033, "rewards/weighted_margins": 0.7262054681777954, "rewards/weighted_rejected": -1.8311798572540283, "step": 760 }, { "epoch": 0.40303585448835383, "grad_norm": 107.9198989868164, "learning_rate": 7.468279042832271e-07, "logits/chosen": -1.067718505859375, "logits/rejected": -1.095727562904358, "logps/chosen": -467.5, "logps/rejected": -559.7703247070312, "logps/weighted_chosen": -3.0786986351013184, "logps/weighted_rejected": -3.8573241233825684, "loss": 0.5995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -187.9324188232422, "rewards/margins": 82.28340148925781, "rewards/rejected": -270.2611389160156, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.867828369140625, "rewards/weighted_margins": 0.625274658203125, "rewards/weighted_rejected": -1.493688941001892, "step": 770 }, { "epoch": 0.408270086364826, "grad_norm": 18.453107833862305, "learning_rate": 7.388403292317154e-07, "logits/chosen": -1.0655944347381592, "logits/rejected": -1.121740698814392, "logps/chosen": -503.8828125, "logps/rejected": -519.8375244140625, "logps/weighted_chosen": -3.039843797683716, "logps/weighted_rejected": -3.848388671875, "loss": 0.5822, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -202.9210968017578, "rewards/margins": 50.95390701293945, "rewards/rejected": -253.869140625, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -0.873791515827179, "rewards/weighted_margins": 0.6851806640625, "rewards/weighted_rejected": -1.5595581531524658, "step": 780 }, { "epoch": 0.4135043182412981, "grad_norm": 24.38869285583496, "learning_rate": 7.307729834504154e-07, "logits/chosen": -1.090570092201233, "logits/rejected": -1.1371276378631592, "logps/chosen": -490.86407470703125, "logps/rejected": -537.7296752929688, "logps/weighted_chosen": -3.041918992996216, "logps/weighted_rejected": -4.070898532867432, "loss": 0.6125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -209.13241577148438, "rewards/margins": 62.44609451293945, "rewards/rejected": -271.58203125, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.947405993938446, "rewards/weighted_margins": 0.609375, "rewards/weighted_rejected": -1.556726098060608, "step": 790 }, { "epoch": 0.4187385501177702, "grad_norm": 26.962316513061523, "learning_rate": 7.226285613672847e-07, "logits/chosen": -1.03680419921875, "logits/rejected": -1.0742919445037842, "logps/chosen": -500.84295654296875, "logps/rejected": -603.6422119140625, "logps/weighted_chosen": -3.154052734375, "logps/weighted_rejected": -4.342724800109863, "loss": 0.5564, "rewards/accuracies": 0.671875, "rewards/chosen": -223.51171875, "rewards/margins": 101.2464828491211, "rewards/rejected": -324.720703125, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -0.996997058391571, "rewards/weighted_margins": 0.755480945110321, "rewards/weighted_rejected": -1.752661108970642, "step": 800 }, { "epoch": 0.4239727819942423, "grad_norm": 15.780332565307617, "learning_rate": 7.144097831531398e-07, "logits/chosen": -1.0250060558319092, "logits/rejected": -1.052709937095642, "logps/chosen": -517.7921752929688, "logps/rejected": -600.5421752929688, "logps/weighted_chosen": -3.318652391433716, "logps/weighted_rejected": -4.39453125, "loss": 0.5695, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -247.40664672851562, "rewards/margins": 88.87187194824219, "rewards/rejected": -336.234375, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -1.082605004310608, "rewards/weighted_margins": 0.89263916015625, "rewards/weighted_rejected": -1.974450707435608, "step": 810 }, { "epoch": 0.42920701387071447, "grad_norm": 24.66672706604004, "learning_rate": 7.061193938131396e-07, "logits/chosen": -0.9916015863418579, "logits/rejected": -1.042016625404358, "logps/chosen": -607.3375244140625, "logps/rejected": -642.5671997070312, "logps/weighted_chosen": -3.689379930496216, "logps/weighted_rejected": -4.398633003234863, "loss": 0.5654, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -307.7925720214844, "rewards/margins": 69.5777359008789, "rewards/rejected": -377.3050842285156, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.1716430187225342, "rewards/weighted_margins": 0.9023987054824829, "rewards/weighted_rejected": -2.074475049972534, "step": 820 }, { "epoch": 0.4344412457471866, "grad_norm": 20.459196090698242, "learning_rate": 6.977601622699789e-07, "logits/chosen": -0.9984985589981079, "logits/rejected": -1.079248070716858, "logps/chosen": -471.5625, "logps/rejected": -574.8695068359375, "logps/weighted_chosen": -3.233837842941284, "logps/weighted_rejected": -4.211133003234863, "loss": 0.5097, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -186.7394561767578, "rewards/margins": 109.28633117675781, "rewards/rejected": -296.13555908203125, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -0.760040283203125, "rewards/weighted_margins": 0.9420166015625, "rewards/weighted_rejected": -1.7019164562225342, "step": 830 }, { "epoch": 0.4396754776236587, "grad_norm": 89.301025390625, "learning_rate": 6.893348804390882e-07, "logits/chosen": -1.126031517982483, "logits/rejected": -1.1464111804962158, "logps/chosen": -541.5546875, "logps/rejected": -580.4109497070312, "logps/weighted_chosen": -3.6358885765075684, "logps/weighted_rejected": -4.262304782867432, "loss": 0.566, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -246.6085968017578, "rewards/margins": 74.0562515258789, "rewards/rejected": -320.61328125, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -1.0077393054962158, "rewards/weighted_margins": 0.7990967035293579, "rewards/weighted_rejected": -1.806396484375, "step": 840 }, { "epoch": 0.44490970950013087, "grad_norm": 20.6274471282959, "learning_rate": 6.808463622961578e-07, "logits/chosen": -1.151123046875, "logits/rejected": -1.2046630382537842, "logps/chosen": -621.8390502929688, "logps/rejected": -716.7249755859375, "logps/weighted_chosen": -3.6482911109924316, "logps/weighted_rejected": -4.6015625, "loss": 0.5759, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -328.931640625, "rewards/margins": 106.0474624633789, "rewards/rejected": -435.07403564453125, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.2104400396347046, "rewards/weighted_margins": 0.9383910894393921, "rewards/weighted_rejected": -2.1482911109924316, "step": 850 }, { "epoch": 0.45014394137660296, "grad_norm": 20.91545295715332, "learning_rate": 6.722974429372925e-07, "logits/chosen": -1.1297729015350342, "logits/rejected": -1.170617699623108, "logps/chosen": -697.5484619140625, "logps/rejected": -730.65625, "logps/weighted_chosen": -3.5313477516174316, "logps/weighted_rejected": -5.184765815734863, "loss": 0.5217, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -393.822265625, "rewards/margins": 74.92500305175781, "rewards/rejected": -468.8492126464844, "rewards/weighted_accuracies": 0.7718750238418579, "rewards/weighted_chosen": -1.393365502357483, "rewards/weighted_margins": 1.1208374500274658, "rewards/weighted_rejected": -2.514172315597534, "step": 860 }, { "epoch": 0.4553781732530751, "grad_norm": 42.88610076904297, "learning_rate": 6.636909776321128e-07, "logits/chosen": -1.1412475109100342, "logits/rejected": -1.1436645984649658, "logps/chosen": -549.9671630859375, "logps/rejected": -677.8781127929688, "logps/weighted_chosen": -3.683642625808716, "logps/weighted_rejected": -4.7724609375, "loss": 0.5195, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -288.21484375, "rewards/margins": 124.931640625, "rewards/rejected": -413.17266845703125, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.2497345209121704, "rewards/weighted_margins": 0.971630871295929, "rewards/weighted_rejected": -2.2208008766174316, "step": 870 }, { "epoch": 0.46061240512954726, "grad_norm": 29.938108444213867, "learning_rate": 6.550298408701174e-07, "logits/chosen": -1.0770995616912842, "logits/rejected": -1.134680151939392, "logps/chosen": -574.3234252929688, "logps/rejected": -675.2101440429688, "logps/weighted_chosen": -3.807568311691284, "logps/weighted_rejected": -5.117138862609863, "loss": 0.5413, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -287.63494873046875, "rewards/margins": 104.92851257324219, "rewards/rejected": -392.5396423339844, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.1316711902618408, "rewards/weighted_margins": 0.954119861125946, "rewards/weighted_rejected": -2.0862364768981934, "step": 880 }, { "epoch": 0.46584663700601936, "grad_norm": 27.375259399414062, "learning_rate": 6.463169254006276e-07, "logits/chosen": -1.10406494140625, "logits/rejected": -1.156030297279358, "logps/chosen": -541.8601684570312, "logps/rejected": -578.5452880859375, "logps/weighted_chosen": -3.4522461891174316, "logps/weighted_rejected": -4.527783393859863, "loss": 0.5321, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -260.65704345703125, "rewards/margins": 71.1494140625, "rewards/rejected": -331.875, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -0.977038562297821, "rewards/weighted_margins": 1.022253394126892, "rewards/weighted_rejected": -1.999292016029358, "step": 890 }, { "epoch": 0.4710808688824915, "grad_norm": 32.590675354003906, "learning_rate": 6.375551412666326e-07, "logits/chosen": -1.0722472667694092, "logits/rejected": -1.1129913330078125, "logps/chosen": -499.8031311035156, "logps/rejected": -543.9796752929688, "logps/weighted_chosen": -3.171948194503784, "logps/weighted_rejected": -4.273193359375, "loss": 0.6035, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -225.7996063232422, "rewards/margins": 61.669532775878906, "rewards/rejected": -287.5015563964844, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -1.0042235851287842, "rewards/weighted_margins": 0.7809082269668579, "rewards/weighted_rejected": -1.784277319908142, "step": 900 }, { "epoch": 0.4763151007589636, "grad_norm": 37.88996887207031, "learning_rate": 6.287474148328583e-07, "logits/chosen": -1.0318695306777954, "logits/rejected": -1.0187164545059204, "logps/chosen": -491.97186279296875, "logps/rejected": -540.4202880859375, "logps/weighted_chosen": -3.5292601585388184, "logps/weighted_rejected": -5.02734375, "loss": 0.559, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -221.07968139648438, "rewards/margins": 72.57441711425781, "rewards/rejected": -293.56561279296875, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -1.0068145990371704, "rewards/weighted_margins": 0.84521484375, "rewards/weighted_rejected": -1.851770043373108, "step": 910 }, { "epoch": 0.48154933263543576, "grad_norm": 25.340351104736328, "learning_rate": 6.198966878083857e-07, "logits/chosen": -1.0930664539337158, "logits/rejected": -1.12109375, "logps/chosen": -540.0, "logps/rejected": -623.390625, "logps/weighted_chosen": -3.68231201171875, "logps/weighted_rejected": -4.464990139007568, "loss": 0.5911, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -268.6884765625, "rewards/margins": 87.7593765258789, "rewards/rejected": -356.4453125, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.085455298423767, "rewards/weighted_margins": 0.79486083984375, "rewards/weighted_rejected": -1.880859375, "step": 920 }, { "epoch": 0.48678356451190785, "grad_norm": 47.73952865600586, "learning_rate": 6.110059162641439e-07, "logits/chosen": -1.1020386219024658, "logits/rejected": -1.1357238292694092, "logps/chosen": -533.7390747070312, "logps/rejected": -599.3781127929688, "logps/weighted_chosen": -2.9868531227111816, "logps/weighted_rejected": -4.054858207702637, "loss": 0.5458, "rewards/accuracies": 0.625, "rewards/chosen": -251.13125610351562, "rewards/margins": 80.9544906616211, "rewards/rejected": -332.2699279785156, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -0.8615478277206421, "rewards/weighted_margins": 0.8154662847518921, "rewards/weighted_rejected": -1.6761963367462158, "step": 930 }, { "epoch": 0.49201779638838, "grad_norm": 36.5290412902832, "learning_rate": 6.020780696456059e-07, "logits/chosen": -1.133764624595642, "logits/rejected": -1.177819848060608, "logps/chosen": -515.2952880859375, "logps/rejected": -611.4953002929688, "logps/weighted_chosen": -3.0075440406799316, "logps/weighted_rejected": -4.250219821929932, "loss": 0.5361, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -244.7570343017578, "rewards/margins": 99.73710632324219, "rewards/rejected": -344.3531188964844, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.0392029285430908, "rewards/weighted_margins": 0.8944336175918579, "rewards/weighted_rejected": -1.93341064453125, "step": 940 }, { "epoch": 0.49725202826485215, "grad_norm": 36.72229766845703, "learning_rate": 5.931161297810185e-07, "logits/chosen": -1.2070038318634033, "logits/rejected": -1.23931884765625, "logps/chosen": -550.4031372070312, "logps/rejected": -615.3375244140625, "logps/weighted_chosen": -3.748730421066284, "logps/weighted_rejected": -4.64697265625, "loss": 0.5726, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -277.07733154296875, "rewards/margins": 77.06562805175781, "rewards/rejected": -354.22381591796875, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -1.09320068359375, "rewards/weighted_margins": 0.812207043170929, "rewards/weighted_rejected": -1.906347632408142, "step": 950 }, { "epoch": 0.5024862601413242, "grad_norm": 39.58312225341797, "learning_rate": 5.841230898854959e-07, "logits/chosen": -1.1735107898712158, "logits/rejected": -1.2161743640899658, "logps/chosen": -684.3687744140625, "logps/rejected": -794.3922119140625, "logps/weighted_chosen": -3.8369140625, "logps/weighted_rejected": -4.986083984375, "loss": 0.6144, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -384.17852783203125, "rewards/margins": 148.6707000732422, "rewards/rejected": -532.8191528320312, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -1.5656402111053467, "rewards/weighted_margins": 0.9439452886581421, "rewards/weighted_rejected": -2.5091795921325684, "step": 960 }, { "epoch": 0.5077204920177963, "grad_norm": 24.286752700805664, "learning_rate": 5.751019535613102e-07, "logits/chosen": -1.0753357410430908, "logits/rejected": -1.104547142982483, "logps/chosen": -550.9773559570312, "logps/rejected": -653.6687622070312, "logps/weighted_chosen": -3.5946044921875, "logps/weighted_rejected": -5.038866996765137, "loss": 0.5266, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -291.25390625, "rewards/margins": 115.4117202758789, "rewards/rejected": -406.62811279296875, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.2581603527069092, "rewards/weighted_margins": 1.127111792564392, "rewards/weighted_rejected": -2.3849730491638184, "step": 970 }, { "epoch": 0.5129547238942685, "grad_norm": 22.55328941345215, "learning_rate": 5.660557337947117e-07, "logits/chosen": -1.0788085460662842, "logits/rejected": -1.109989881515503, "logps/chosen": -594.8406372070312, "logps/rejected": -653.6624755859375, "logps/weighted_chosen": -3.10791015625, "logps/weighted_rejected": -4.297143459320068, "loss": 0.5361, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -300.37872314453125, "rewards/margins": 95.828125, "rewards/rejected": -396.203125, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -1.0415496826171875, "rewards/weighted_margins": 0.88812255859375, "rewards/weighted_rejected": -1.929406762123108, "step": 980 }, { "epoch": 0.5181889557707406, "grad_norm": 53.25763702392578, "learning_rate": 5.569874519496174e-07, "logits/chosen": -1.055578589439392, "logits/rejected": -1.122521996498108, "logps/chosen": -497.3999938964844, "logps/rejected": -579.3656005859375, "logps/weighted_chosen": -3.393261671066284, "logps/weighted_rejected": -4.673925876617432, "loss": 0.5302, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -222.11483764648438, "rewards/margins": 90.296875, "rewards/rejected": -312.42108154296875, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -0.8590606451034546, "rewards/weighted_margins": 0.879437267780304, "rewards/weighted_rejected": -1.7384154796600342, "step": 990 }, { "epoch": 0.5234231876472127, "grad_norm": 39.35697555541992, "learning_rate": 5.47900136758499e-07, "logits/chosen": -1.0391693115234375, "logits/rejected": -1.1183960437774658, "logps/chosen": -589.5906372070312, "logps/rejected": -653.09375, "logps/weighted_chosen": -3.634228467941284, "logps/weighted_rejected": -4.7723388671875, "loss": 0.5255, "rewards/accuracies": 0.59375, "rewards/chosen": -321.95623779296875, "rewards/margins": 89.08320617675781, "rewards/rejected": -410.8277282714844, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.187219262123108, "rewards/weighted_margins": 0.9879394769668579, "rewards/weighted_rejected": -2.1756300926208496, "step": 1000 }, { "epoch": 0.5234231876472127, "eval_logits/chosen": -1.194671869277954, "eval_logits/rejected": -1.2200820446014404, "eval_logps/chosen": -645.4445190429688, "eval_logps/rejected": -726.552001953125, "eval_logps/weighted_chosen": -3.7153730392456055, "eval_logps/weighted_rejected": -4.821405410766602, "eval_loss": 0.5563206672668457, "eval_rewards/accuracies": 0.5899999737739563, "eval_rewards/chosen": -356.71826171875, "eval_rewards/margins": 102.17925262451172, "eval_rewards/rejected": -458.8554992675781, "eval_rewards/weighted_accuracies": 0.7135000228881836, "eval_rewards/weighted_chosen": -1.3285291194915771, "eval_rewards/weighted_margins": 0.9376659989356995, "eval_rewards/weighted_rejected": -2.266195297241211, "eval_runtime": 1365.9873, "eval_samples_per_second": 1.464, "eval_steps_per_second": 0.366, "step": 1000 }, { "epoch": 0.528657419523685, "grad_norm": 45.573585510253906, "learning_rate": 5.387968233108113e-07, "logits/chosen": -1.087103247642517, "logits/rejected": -1.085015892982483, "logps/chosen": -669.9202880859375, "logps/rejected": -781.1453247070312, "logps/weighted_chosen": -4.16748046875, "logps/weighted_rejected": -5.289355278015137, "loss": 0.5551, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -385.9546813964844, "rewards/margins": 140.77304077148438, "rewards/rejected": -526.48046875, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -1.512841820716858, "rewards/weighted_margins": 1.02880859375, "rewards/weighted_rejected": -2.542529344558716, "step": 1010 }, { "epoch": 0.533891651400157, "grad_norm": 304.9451904296875, "learning_rate": 5.296805520392962e-07, "logits/chosen": -1.160436987876892, "logits/rejected": -1.2073547840118408, "logps/chosen": -662.7718505859375, "logps/rejected": -731.2015380859375, "logps/weighted_chosen": -3.366748094558716, "logps/weighted_rejected": -4.887963771820068, "loss": 0.6053, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -350.32733154296875, "rewards/margins": 98.02656555175781, "rewards/rejected": -448.2242126464844, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -1.51910400390625, "rewards/weighted_margins": 0.8489990234375, "rewards/weighted_rejected": -2.3676390647888184, "step": 1020 }, { "epoch": 0.5391258832766291, "grad_norm": 186.6497344970703, "learning_rate": 5.205543677045049e-07, "logits/chosen": -1.0708801746368408, "logits/rejected": -1.114501953125, "logps/chosen": -556.8343505859375, "logps/rejected": -603.2718505859375, "logps/weighted_chosen": -3.673095703125, "logps/weighted_rejected": -4.930810451507568, "loss": 0.5157, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -282.62188720703125, "rewards/margins": 75.29374694824219, "rewards/rejected": -357.86053466796875, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -1.1851806640625, "rewards/weighted_margins": 1.0668213367462158, "rewards/weighted_rejected": -2.2524254322052, "step": 1030 }, { "epoch": 0.5443601151531012, "grad_norm": 88.95674896240234, "learning_rate": 5.114213183778697e-07, "logits/chosen": -1.123449683189392, "logits/rejected": -1.1691772937774658, "logps/chosen": -555.5703125, "logps/rejected": -638.265625, "logps/weighted_chosen": -4.111181735992432, "logps/weighted_rejected": -4.94482421875, "loss": 0.5669, "rewards/accuracies": 0.625, "rewards/chosen": -275.11407470703125, "rewards/margins": 106.10546875, "rewards/rejected": -381.244140625, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.225732445716858, "rewards/weighted_margins": 0.956982433795929, "rewards/weighted_rejected": -2.182055711746216, "step": 1040 }, { "epoch": 0.5495943470295734, "grad_norm": 227.4050750732422, "learning_rate": 5.022844544236754e-07, "logits/chosen": -1.054589867591858, "logits/rejected": -1.088189721107483, "logps/chosen": -567.5890502929688, "logps/rejected": -656.4656372070312, "logps/weighted_chosen": -3.9198241233825684, "logps/weighted_rejected": -4.956591606140137, "loss": 0.572, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -287.54962158203125, "rewards/margins": 113.6656265258789, "rewards/rejected": -401.166015625, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.267511010169983, "rewards/weighted_margins": 0.9030395746231079, "rewards/weighted_rejected": -2.170910596847534, "step": 1050 }, { "epoch": 0.5548285789060455, "grad_norm": 1079.8702392578125, "learning_rate": 4.931468274802608e-07, "logits/chosen": -1.070989966392517, "logits/rejected": -1.1126830577850342, "logps/chosen": -568.2453002929688, "logps/rejected": -670.5187377929688, "logps/weighted_chosen": -3.2323241233825684, "logps/weighted_rejected": -4.560888767242432, "loss": 0.5383, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -285.74139404296875, "rewards/margins": 123.85664367675781, "rewards/rejected": -409.6675720214844, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.198522925376892, "rewards/weighted_margins": 0.938366711139679, "rewards/weighted_rejected": -2.136554002761841, "step": 1060 }, { "epoch": 0.5600628107825176, "grad_norm": 56.90155792236328, "learning_rate": 4.840114894407974e-07, "logits/chosen": -1.10162353515625, "logits/rejected": -1.1522216796875, "logps/chosen": -528.5078125, "logps/rejected": -583.03125, "logps/weighted_chosen": -3.598315477371216, "logps/weighted_rejected": -4.284228324890137, "loss": 0.5674, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -252.3361358642578, "rewards/margins": 79.1089859008789, "rewards/rejected": -331.48748779296875, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.137750267982483, "rewards/weighted_margins": 0.870159924030304, "rewards/weighted_rejected": -2.008807420730591, "step": 1070 }, { "epoch": 0.5652970426589898, "grad_norm": 46.219932556152344, "learning_rate": 4.748814914339811e-07, "logits/chosen": -1.0657470226287842, "logits/rejected": -1.1109619140625, "logps/chosen": -569.3312377929688, "logps/rejected": -623.2874755859375, "logps/weighted_chosen": -3.5493407249450684, "logps/weighted_rejected": -4.359619140625, "loss": 0.5449, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -284.59649658203125, "rewards/margins": 83.34257507324219, "rewards/rejected": -368.00311279296875, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.1415588855743408, "rewards/weighted_margins": 1.012670874595642, "rewards/weighted_rejected": -2.1540770530700684, "step": 1080 }, { "epoch": 0.5705312745354619, "grad_norm": 86.65474700927734, "learning_rate": 4.657598828049801e-07, "logits/chosen": -1.1116211414337158, "logits/rejected": -1.1939513683319092, "logps/chosen": -578.01953125, "logps/rejected": -681.3382568359375, "logps/weighted_chosen": -3.48370361328125, "logps/weighted_rejected": -4.3099365234375, "loss": 0.5293, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -287.9331970214844, "rewards/margins": 112.53828430175781, "rewards/rejected": -400.390625, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -1.1633422374725342, "rewards/weighted_margins": 0.9815673828125, "rewards/weighted_rejected": -2.144360303878784, "step": 1090 }, { "epoch": 0.575765506411934, "grad_norm": 419.50701904296875, "learning_rate": 4.566497100969792e-07, "logits/chosen": -1.1083495616912842, "logits/rejected": -1.1515319347381592, "logps/chosen": -681.3523559570312, "logps/rejected": -761.9328002929688, "logps/weighted_chosen": -3.8998045921325684, "logps/weighted_rejected": -4.931884765625, "loss": 0.5762, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -358.6919860839844, "rewards/margins": 104.8218765258789, "rewards/rejected": -463.4956970214844, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.351904273033142, "rewards/weighted_margins": 1.048761010169983, "rewards/weighted_rejected": -2.40142822265625, "step": 1100 }, { "epoch": 0.5809997382884062, "grad_norm": 56.882720947265625, "learning_rate": 4.475540160336576e-07, "logits/chosen": -1.1287415027618408, "logits/rejected": -1.183984398841858, "logps/chosen": -593.2249755859375, "logps/rejected": -657.9328002929688, "logps/weighted_chosen": -3.99072265625, "logps/weighted_rejected": -5.168993949890137, "loss": 0.5249, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -294.9878845214844, "rewards/margins": 104.7953109741211, "rewards/rejected": -399.80743408203125, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.250451683998108, "rewards/weighted_margins": 1.0572631359100342, "rewards/weighted_rejected": -2.306506395339966, "step": 1110 }, { "epoch": 0.5862339701648783, "grad_norm": 633.436279296875, "learning_rate": 4.3847583850294565e-07, "logits/chosen": -1.109594702720642, "logits/rejected": -1.14373779296875, "logps/chosen": -646.6163940429688, "logps/rejected": -697.4781494140625, "logps/weighted_chosen": -4.158276557922363, "logps/weighted_rejected": -4.803124904632568, "loss": 0.5622, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -355.50311279296875, "rewards/margins": 82.48612976074219, "rewards/rejected": -437.83984375, "rewards/weighted_accuracies": 0.7406250238418579, "rewards/weighted_chosen": -1.350854516029358, "rewards/weighted_margins": 1.000707983970642, "rewards/weighted_rejected": -2.352795362472534, "step": 1120 }, { "epoch": 0.5914682020413504, "grad_norm": 512.849853515625, "learning_rate": 4.294182095423934e-07, "logits/chosen": -1.0823333263397217, "logits/rejected": -1.1519775390625, "logps/chosen": -601.3250122070312, "logps/rejected": -686.8515625, "logps/weighted_chosen": -3.522167921066284, "logps/weighted_rejected": -4.769873142242432, "loss": 0.5542, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -315.3882751464844, "rewards/margins": 110.39375305175781, "rewards/rejected": -425.638671875, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -1.290283203125, "rewards/weighted_margins": 1.020105004310608, "rewards/weighted_rejected": -2.3109498023986816, "step": 1130 }, { "epoch": 0.5967024339178225, "grad_norm": 65.05388641357422, "learning_rate": 4.20384154326496e-07, "logits/chosen": -1.0805175304412842, "logits/rejected": -1.14312744140625, "logps/chosen": -516.7843627929688, "logps/rejected": -553.4617309570312, "logps/weighted_chosen": -3.43231201171875, "logps/weighted_rejected": -4.573815822601318, "loss": 0.5682, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -256.2328186035156, "rewards/margins": 61.6025390625, "rewards/rejected": -317.79412841796875, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.086877465248108, "rewards/weighted_margins": 0.8528991937637329, "rewards/weighted_rejected": -1.9411132335662842, "step": 1140 }, { "epoch": 0.6019366657942947, "grad_norm": 75.07366943359375, "learning_rate": 4.1137669015630863e-07, "logits/chosen": -1.077880859375, "logits/rejected": -1.1405029296875, "logps/chosen": -537.2109375, "logps/rejected": -624.4749755859375, "logps/weighted_chosen": -3.0843262672424316, "logps/weighted_rejected": -4.2431640625, "loss": 0.5219, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -241.1707000732422, "rewards/margins": 99.6703109741211, "rewards/rejected": -340.8062438964844, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -0.9316955804824829, "rewards/weighted_margins": 0.957165539264679, "rewards/weighted_rejected": -1.889074683189392, "step": 1150 }, { "epoch": 0.6071708976707668, "grad_norm": 53.81990432739258, "learning_rate": 4.023988254516943e-07, "logits/chosen": -1.059973120689392, "logits/rejected": -1.120214819908142, "logps/chosen": -530.0851440429688, "logps/rejected": -566.734375, "logps/weighted_chosen": -3.7030029296875, "logps/weighted_rejected": -4.251660346984863, "loss": 0.5091, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -229.01113891601562, "rewards/margins": 70.63789367675781, "rewards/rejected": -299.630859375, "rewards/weighted_accuracies": 0.75, "rewards/weighted_chosen": -0.8327713012695312, "rewards/weighted_margins": 0.8830932378768921, "rewards/weighted_rejected": -1.715429663658142, "step": 1160 }, { "epoch": 0.6124051295472389, "grad_norm": 50.65428161621094, "learning_rate": 3.9345355874653366e-07, "logits/chosen": -1.058905005455017, "logits/rejected": -1.0953247547149658, "logps/chosen": -535.8616943359375, "logps/rejected": -544.9773559570312, "logps/weighted_chosen": -3.3579344749450684, "logps/weighted_rejected": -4.205273628234863, "loss": 0.5906, "rewards/accuracies": 0.609375, "rewards/chosen": -238.26016235351562, "rewards/margins": 56.61601638793945, "rewards/rejected": -294.7398376464844, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -0.988323986530304, "rewards/weighted_margins": 0.763989269733429, "rewards/weighted_rejected": -1.7522430419921875, "step": 1170 }, { "epoch": 0.6176393614237111, "grad_norm": 44.16736602783203, "learning_rate": 3.8454387768724157e-07, "logits/chosen": -1.097143530845642, "logits/rejected": -1.1111571788787842, "logps/chosen": -475.8023376464844, "logps/rejected": -494.05078125, "logps/weighted_chosen": -3.5390625, "logps/weighted_rejected": -4.489306449890137, "loss": 0.5337, "rewards/accuracies": 0.578125, "rewards/chosen": -202.70858764648438, "rewards/margins": 60.640235900878906, "rewards/rejected": -263.4339904785156, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -0.8571746945381165, "rewards/weighted_margins": 0.9136596918106079, "rewards/weighted_rejected": -1.7705810070037842, "step": 1180 }, { "epoch": 0.6228735933001832, "grad_norm": 402.7128601074219, "learning_rate": 3.7567275803491525e-07, "logits/chosen": -1.1082000732421875, "logits/rejected": -1.155554175376892, "logps/chosen": -559.3796997070312, "logps/rejected": -586.2781372070312, "logps/weighted_chosen": -3.2266845703125, "logps/weighted_rejected": -4.550146579742432, "loss": 0.5018, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -257.08673095703125, "rewards/margins": 75.61640930175781, "rewards/rejected": -332.78204345703125, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -0.8839172124862671, "rewards/weighted_margins": 1.0030639171600342, "rewards/weighted_rejected": -1.886468529701233, "step": 1190 }, { "epoch": 0.6281078251766553, "grad_norm": 393.51788330078125, "learning_rate": 3.66843162671456e-07, "logits/chosen": -1.1154296398162842, "logits/rejected": -1.1315186023712158, "logps/chosen": -552.2093505859375, "logps/rejected": -680.3843994140625, "logps/weighted_chosen": -3.9300293922424316, "logps/weighted_rejected": -4.456152439117432, "loss": 0.5959, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -278.14764404296875, "rewards/margins": 130.2062530517578, "rewards/rejected": -408.24530029296875, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -1.1631286144256592, "rewards/weighted_margins": 0.8199828863143921, "rewards/weighted_rejected": -1.9827392101287842, "step": 1200 }, { "epoch": 0.6333420570531274, "grad_norm": 915.16943359375, "learning_rate": 3.5805804060998924e-07, "logits/chosen": -1.1075561046600342, "logits/rejected": -1.1484863758087158, "logps/chosen": -600.2609252929688, "logps/rejected": -717.1453247070312, "logps/weighted_chosen": -3.2332520484924316, "logps/weighted_rejected": -4.710791110992432, "loss": 0.499, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -299.421875, "rewards/margins": 147.1335906982422, "rewards/rejected": -446.451171875, "rewards/weighted_accuracies": 0.75, "rewards/weighted_chosen": -1.162207007408142, "rewards/weighted_margins": 1.1646606922149658, "rewards/weighted_rejected": -2.326342821121216, "step": 1210 }, { "epoch": 0.6385762889295996, "grad_norm": 685.1804809570312, "learning_rate": 3.493203260099197e-07, "logits/chosen": -1.1248352527618408, "logits/rejected": -1.195556640625, "logps/chosen": -622.3953247070312, "logps/rejected": -694.5250244140625, "logps/weighted_chosen": -3.321972608566284, "logps/weighted_rejected": -4.685571193695068, "loss": 0.5497, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -334.587890625, "rewards/margins": 87.31562805175781, "rewards/rejected": -421.9072265625, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -1.299523949623108, "rewards/weighted_margins": 0.9338439702987671, "rewards/weighted_rejected": -2.2326292991638184, "step": 1220 }, { "epoch": 0.6438105208060717, "grad_norm": 740.0750732421875, "learning_rate": 3.4063293719694407e-07, "logits/chosen": -1.126245141029358, "logits/rejected": -1.180517554283142, "logps/chosen": -567.4812622070312, "logps/rejected": -648.1539306640625, "logps/weighted_chosen": -3.639111280441284, "logps/weighted_rejected": -4.597058296203613, "loss": 0.6, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -290.95684814453125, "rewards/margins": 99.4990234375, "rewards/rejected": -390.62872314453125, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -1.1769592761993408, "rewards/weighted_margins": 0.8424926996231079, "rewards/weighted_rejected": -2.0197691917419434, "step": 1230 }, { "epoch": 0.6490447526825438, "grad_norm": 342.7829895019531, "learning_rate": 3.319987756883559e-07, "logits/chosen": -1.156945824623108, "logits/rejected": -1.198333740234375, "logps/chosen": -572.3328247070312, "logps/rejected": -671.7633056640625, "logps/weighted_chosen": -3.338793992996216, "logps/weighted_rejected": -4.542822360992432, "loss": 0.5046, "rewards/accuracies": 0.65625, "rewards/chosen": -294.57342529296875, "rewards/margins": 108.07109069824219, "rewards/rejected": -402.66680908203125, "rewards/weighted_accuracies": 0.746874988079071, "rewards/weighted_chosen": -1.004339575767517, "rewards/weighted_margins": 1.094323754310608, "rewards/weighted_rejected": -2.0980224609375, "step": 1240 }, { "epoch": 0.654278984559016, "grad_norm": 406.2123107910156, "learning_rate": 3.234207252239607e-07, "logits/chosen": -1.1353332996368408, "logits/rejected": -1.184594750404358, "logps/chosen": -643.1171875, "logps/rejected": -680.65625, "logps/weighted_chosen": -3.9437499046325684, "logps/weighted_rejected": -4.764257907867432, "loss": 0.5738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -350.52264404296875, "rewards/margins": 77.30488586425781, "rewards/rejected": -428.01641845703125, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -1.3293838500976562, "rewards/weighted_margins": 0.863757312297821, "rewards/weighted_rejected": -2.193603515625, "step": 1250 }, { "epoch": 0.6595132164354881, "grad_norm": 518.19775390625, "learning_rate": 3.1490165080293175e-07, "logits/chosen": -1.1296905279159546, "logits/rejected": -1.1927978992462158, "logps/chosen": -573.6226806640625, "logps/rejected": -675.9562377929688, "logps/weighted_chosen": -3.474902391433716, "logps/weighted_rejected": -4.43798828125, "loss": 0.5317, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -306.15155029296875, "rewards/margins": 106.2289047241211, "rewards/rejected": -412.40234375, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.2220337390899658, "rewards/weighted_margins": 0.9764038324356079, "rewards/weighted_rejected": -2.197509765625, "step": 1260 }, { "epoch": 0.6647474483119602, "grad_norm": 152.93478393554688, "learning_rate": 3.06444397726922e-07, "logits/chosen": -1.082611083984375, "logits/rejected": -1.1652100086212158, "logps/chosen": -609.8375244140625, "logps/rejected": -677.9749755859375, "logps/weighted_chosen": -3.4163575172424316, "logps/weighted_rejected": -5.1240234375, "loss": 0.5052, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -314.2923889160156, "rewards/margins": 101.6371078491211, "rewards/rejected": -415.8968811035156, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -1.135400414466858, "rewards/weighted_margins": 1.23980712890625, "rewards/weighted_rejected": -2.375744581222534, "step": 1270 }, { "epoch": 0.6699816801884323, "grad_norm": 285.5711669921875, "learning_rate": 2.980517906497586e-07, "logits/chosen": -1.1207764148712158, "logits/rejected": -1.1819336414337158, "logps/chosen": -597.5718994140625, "logps/rejected": -703.9953002929688, "logps/weighted_chosen": -3.632373094558716, "logps/weighted_rejected": -4.974999904632568, "loss": 0.5807, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -304.43048095703125, "rewards/margins": 127.4535140991211, "rewards/rejected": -431.8851623535156, "rewards/weighted_accuracies": 0.721875011920929, "rewards/weighted_chosen": -1.202392578125, "rewards/weighted_margins": 0.9914306402206421, "rewards/weighted_rejected": -2.193042039871216, "step": 1280 }, { "epoch": 0.6752159120649045, "grad_norm": 82.95504760742188, "learning_rate": 2.89726632634029e-07, "logits/chosen": -1.113470435142517, "logits/rejected": -1.1529052257537842, "logps/chosen": -599.0562744140625, "logps/rejected": -677.3453369140625, "logps/weighted_chosen": -3.367846727371216, "logps/weighted_rejected": -4.500195503234863, "loss": 0.574, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -324.748046875, "rewards/margins": 97.44921875, "rewards/rejected": -422.11798095703125, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -1.2255462408065796, "rewards/weighted_margins": 0.875323474407196, "rewards/weighted_rejected": -2.1007933616638184, "step": 1290 }, { "epoch": 0.6804501439413766, "grad_norm": 2586.47119140625, "learning_rate": 2.814717042148827e-07, "logits/chosen": -1.131799340248108, "logits/rejected": -1.1629149913787842, "logps/chosen": -566.9031372070312, "logps/rejected": -659.9281005859375, "logps/weighted_chosen": -4.212353706359863, "logps/weighted_rejected": -4.885107517242432, "loss": 0.606, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -292.85858154296875, "rewards/margins": 106.29667663574219, "rewards/rejected": -399.13201904296875, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -1.2742125988006592, "rewards/weighted_margins": 0.8754211664199829, "rewards/weighted_rejected": -2.149639844894409, "step": 1300 }, { "epoch": 0.6856843758178487, "grad_norm": 20.005109786987305, "learning_rate": 2.7328976247135416e-07, "logits/chosen": -1.156762719154358, "logits/rejected": -1.1973998546600342, "logps/chosen": -577.4781494140625, "logps/rejected": -642.1609497070312, "logps/weighted_chosen": -3.5555663108825684, "logps/weighted_rejected": -4.547216892242432, "loss": 0.6037, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -287.97772216796875, "rewards/margins": 101.68867492675781, "rewards/rejected": -389.75079345703125, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -1.2163574695587158, "rewards/weighted_margins": 0.806256115436554, "rewards/weighted_rejected": -2.022289991378784, "step": 1310 }, { "epoch": 0.6909186076943209, "grad_norm": 151.74427795410156, "learning_rate": 2.651835401055217e-07, "logits/chosen": -1.149621605873108, "logits/rejected": -1.183782935142517, "logps/chosen": -597.9265747070312, "logps/rejected": -678.625, "logps/weighted_chosen": -3.490771532058716, "logps/weighted_rejected": -4.412158012390137, "loss": 0.5855, "rewards/accuracies": 0.609375, "rewards/chosen": -320.44451904296875, "rewards/margins": 92.2328109741211, "rewards/rejected": -412.76953125, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.213354468345642, "rewards/weighted_margins": 0.8388611078262329, "rewards/weighted_rejected": -2.0516357421875, "step": 1320 }, { "epoch": 0.696152839570793, "grad_norm": 47.340484619140625, "learning_rate": 2.571557445298055e-07, "logits/chosen": -1.135412573814392, "logits/rejected": -1.1872680187225342, "logps/chosen": -558.5109252929688, "logps/rejected": -635.0374755859375, "logps/weighted_chosen": -3.464648485183716, "logps/weighted_rejected": -4.4168701171875, "loss": 0.5666, "rewards/accuracies": 0.625, "rewards/chosen": -280.0796813964844, "rewards/margins": 90.732421875, "rewards/rejected": -370.8531188964844, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -1.1060791015625, "rewards/weighted_margins": 0.821246325969696, "rewards/weighted_rejected": -1.9268798828125, "step": 1330 }, { "epoch": 0.7013870714472651, "grad_norm": 49.26417922973633, "learning_rate": 2.49209056962716e-07, "logits/chosen": -1.146185278892517, "logits/rejected": -1.186279296875, "logps/chosen": -612.96875, "logps/rejected": -650.0765380859375, "logps/weighted_chosen": -3.656054735183716, "logps/weighted_rejected": -4.784960746765137, "loss": 0.5627, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -298.53436279296875, "rewards/margins": 84.52030944824219, "rewards/rejected": -383.0699157714844, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -1.108679175376892, "rewards/weighted_margins": 0.8104248046875, "rewards/weighted_rejected": -1.9180908203125, "step": 1340 }, { "epoch": 0.7066213033237373, "grad_norm": 43.23517990112305, "learning_rate": 2.41346131533347e-07, "logits/chosen": -1.1938354969024658, "logits/rejected": -1.217199683189392, "logps/chosen": -628.8203125, "logps/rejected": -686.6749877929688, "logps/weighted_chosen": -3.1535401344299316, "logps/weighted_rejected": -4.464306831359863, "loss": 0.546, "rewards/accuracies": 0.625, "rewards/chosen": -322.18829345703125, "rewards/margins": 89.7154312133789, "rewards/rejected": -411.8656311035156, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -1.133204698562622, "rewards/weighted_margins": 0.804534912109375, "rewards/weighted_rejected": -1.9378173351287842, "step": 1350 }, { "epoch": 0.7118555352002094, "grad_norm": 34.37676239013672, "learning_rate": 2.3356959439491898e-07, "logits/chosen": -1.1357300281524658, "logits/rejected": -1.2062256336212158, "logps/chosen": -595.59375, "logps/rejected": -672.6422119140625, "logps/weighted_chosen": -3.9969725608825684, "logps/weighted_rejected": -4.689795017242432, "loss": 0.5808, "rewards/accuracies": 0.625, "rewards/chosen": -325.7347717285156, "rewards/margins": 104.75312805175781, "rewards/rejected": -430.67108154296875, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -1.2155777215957642, "rewards/weighted_margins": 0.9368530511856079, "rewards/weighted_rejected": -2.152294874191284, "step": 1360 }, { "epoch": 0.7170897670766815, "grad_norm": 51.87490463256836, "learning_rate": 2.258820428476645e-07, "logits/chosen": -1.1657593250274658, "logits/rejected": -1.2217528820037842, "logps/chosen": -631.8890380859375, "logps/rejected": -746.7828369140625, "logps/weighted_chosen": -3.470263719558716, "logps/weighted_rejected": -4.346484184265137, "loss": 0.5501, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -352.91015625, "rewards/margins": 124.40898132324219, "rewards/rejected": -477.31951904296875, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.186987280845642, "rewards/weighted_margins": 0.836499035358429, "rewards/weighted_rejected": -2.023852586746216, "step": 1370 }, { "epoch": 0.7223239989531536, "grad_norm": 416.21136474609375, "learning_rate": 2.1828604447135245e-07, "logits/chosen": -1.11541748046875, "logits/rejected": -1.1730468273162842, "logps/chosen": -694.0234375, "logps/rejected": -737.5452880859375, "logps/weighted_chosen": -3.9507813453674316, "logps/weighted_rejected": -5.173535346984863, "loss": 0.5413, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -423.78436279296875, "rewards/margins": 56.84648513793945, "rewards/rejected": -480.90936279296875, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -1.29058837890625, "rewards/weighted_margins": 0.98492431640625, "rewards/weighted_rejected": -2.275256395339966, "step": 1380 }, { "epoch": 0.7275582308296258, "grad_norm": 68.63409423828125, "learning_rate": 2.1078413626773545e-07, "logits/chosen": -1.174198865890503, "logits/rejected": -1.2014648914337158, "logps/chosen": -640.5343627929688, "logps/rejected": -758.4734497070312, "logps/weighted_chosen": -3.5299744606018066, "logps/weighted_rejected": -5.219091892242432, "loss": 0.5755, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -360.80780029296875, "rewards/margins": 128.4132843017578, "rewards/rejected": -489.3277282714844, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.248020887374878, "rewards/weighted_margins": 0.9331451654434204, "rewards/weighted_rejected": -2.181262254714966, "step": 1390 }, { "epoch": 0.7327924627060979, "grad_norm": 323.93890380859375, "learning_rate": 2.0337882381321347e-07, "logits/chosen": -1.1679198741912842, "logits/rejected": -1.1945312023162842, "logps/chosen": -653.7203369140625, "logps/rejected": -707.4640502929688, "logps/weighted_chosen": -3.5541749000549316, "logps/weighted_rejected": -4.433495998382568, "loss": 0.5551, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -362.28399658203125, "rewards/margins": 91.1234359741211, "rewards/rejected": -453.5562438964844, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.336692452430725, "rewards/weighted_margins": 0.8907715082168579, "rewards/weighted_rejected": -2.227795362472534, "step": 1400 }, { "epoch": 0.73802669458257, "grad_norm": 74.13733673095703, "learning_rate": 1.960725804219905e-07, "logits/chosen": -1.149194359779358, "logits/rejected": -1.2033812999725342, "logps/chosen": -661.9656372070312, "logps/rejected": -768.3421630859375, "logps/weighted_chosen": -4.091479301452637, "logps/weighted_rejected": -4.242919921875, "loss": 0.5754, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -362.45428466796875, "rewards/margins": 122.5601577758789, "rewards/rejected": -484.8101501464844, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -1.2423064708709717, "rewards/weighted_margins": 0.8276306390762329, "rewards/weighted_rejected": -2.069873094558716, "step": 1410 }, { "epoch": 0.7432609264590422, "grad_norm": 237.26011657714844, "learning_rate": 1.8886784632000824e-07, "logits/chosen": -1.1620361804962158, "logits/rejected": -1.206933617591858, "logps/chosen": -640.3624877929688, "logps/rejected": -783.7437744140625, "logps/weighted_chosen": -3.4447264671325684, "logps/weighted_rejected": -4.929491996765137, "loss": 0.5398, "rewards/accuracies": 0.65625, "rewards/chosen": -354.3374938964844, "rewards/margins": 162.05233764648438, "rewards/rejected": -516.314453125, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.1836121082305908, "rewards/weighted_margins": 1.072717308998108, "rewards/weighted_rejected": -2.256579637527466, "step": 1420 }, { "epoch": 0.7484951583355143, "grad_norm": 482.7161560058594, "learning_rate": 1.8176702782993025e-07, "logits/chosen": -1.166711449623108, "logits/rejected": -1.185546875, "logps/chosen": -614.01953125, "logps/rejected": -700.3484497070312, "logps/weighted_chosen": -3.3925538063049316, "logps/weighted_rejected": -4.749658107757568, "loss": 0.5436, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -352.78125, "rewards/margins": 93.58476257324219, "rewards/rejected": -446.29296875, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -1.2850220203399658, "rewards/weighted_margins": 0.9663940668106079, "rewards/weighted_rejected": -2.2514891624450684, "step": 1430 }, { "epoch": 0.7537293902119864, "grad_norm": 48.64420700073242, "learning_rate": 1.7477249656745034e-07, "logits/chosen": -1.1055176258087158, "logits/rejected": -1.157617211341858, "logps/chosen": -559.9140625, "logps/rejected": -613.4656372070312, "logps/weighted_chosen": -3.6603026390075684, "logps/weighted_rejected": -4.916943550109863, "loss": 0.5473, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -309.0234375, "rewards/margins": 77.943359375, "rewards/rejected": -387.1015625, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.15887451171875, "rewards/weighted_margins": 1.0418579578399658, "rewards/weighted_rejected": -2.2011961936950684, "step": 1440 }, { "epoch": 0.7589636220884585, "grad_norm": 61.97262954711914, "learning_rate": 1.6788658864919118e-07, "logits/chosen": -1.111364722251892, "logits/rejected": -1.2080078125, "logps/chosen": -681.9781494140625, "logps/rejected": -766.1875, "logps/weighted_chosen": -3.5382080078125, "logps/weighted_rejected": -4.320166110992432, "loss": 0.5212, "rewards/accuracies": 0.625, "rewards/chosen": -367.3251953125, "rewards/margins": 115.8160171508789, "rewards/rejected": -482.9632873535156, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.1493957042694092, "rewards/weighted_margins": 1.0002562999725342, "rewards/weighted_rejected": -2.149792432785034, "step": 1450 }, { "epoch": 0.7641978539649307, "grad_norm": 33.67172622680664, "learning_rate": 1.611116039124613e-07, "logits/chosen": -1.0988738536834717, "logits/rejected": -1.160314917564392, "logps/chosen": -592.9827880859375, "logps/rejected": -634.5718994140625, "logps/weighted_chosen": -3.8994140625, "logps/weighted_rejected": -4.845922946929932, "loss": 0.5548, "rewards/accuracies": 0.609375, "rewards/chosen": -330.28790283203125, "rewards/margins": 74.87812805175781, "rewards/rejected": -405.23126220703125, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -1.151556372642517, "rewards/weighted_margins": 0.903613269329071, "rewards/weighted_rejected": -2.055248975753784, "step": 1460 }, { "epoch": 0.7694320858414028, "grad_norm": 144.79293823242188, "learning_rate": 1.5444980514712723e-07, "logits/chosen": -1.196630835533142, "logits/rejected": -1.222875952720642, "logps/chosen": -668.71875, "logps/rejected": -797.6375122070312, "logps/weighted_chosen": -3.661572217941284, "logps/weighted_rejected": -4.466870307922363, "loss": 0.6131, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -357.33673095703125, "rewards/margins": 134.50155639648438, "rewards/rejected": -491.7562561035156, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -1.314672827720642, "rewards/weighted_margins": 0.78485107421875, "rewards/weighted_rejected": -2.099261522293091, "step": 1470 }, { "epoch": 0.7746663177178749, "grad_norm": 40.60256576538086, "learning_rate": 1.4790341733986083e-07, "logits/chosen": -1.150146484375, "logits/rejected": -1.1887328624725342, "logps/chosen": -620.7406005859375, "logps/rejected": -698.5499877929688, "logps/weighted_chosen": -3.9625487327575684, "logps/weighted_rejected": -4.390429496765137, "loss": 0.5719, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -333.4839782714844, "rewards/margins": 102.4716796875, "rewards/rejected": -436.12420654296875, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.234228491783142, "rewards/weighted_margins": 0.851452648639679, "rewards/weighted_rejected": -2.085034132003784, "step": 1480 }, { "epoch": 0.7799005495943471, "grad_norm": 217.96365356445312, "learning_rate": 1.4147462693101108e-07, "logits/chosen": -1.131872534751892, "logits/rejected": -1.179223656654358, "logps/chosen": -645.7999877929688, "logps/rejected": -766.2390747070312, "logps/weighted_chosen": -3.494384765625, "logps/weighted_rejected": -4.757568359375, "loss": 0.5302, "rewards/accuracies": 0.640625, "rewards/chosen": -358.8677673339844, "rewards/margins": 136.17617797851562, "rewards/rejected": -495.09765625, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.171356201171875, "rewards/weighted_margins": 1.06304931640625, "rewards/weighted_rejected": -2.235211133956909, "step": 1490 }, { "epoch": 0.7851347814708192, "grad_norm": 655.3441162109375, "learning_rate": 1.3516558108435177e-07, "logits/chosen": -1.133544921875, "logits/rejected": -1.154302954673767, "logps/chosen": -594.9124755859375, "logps/rejected": -729.9781494140625, "logps/weighted_chosen": -3.306835889816284, "logps/weighted_rejected": -4.992211818695068, "loss": 0.508, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -315.57147216796875, "rewards/margins": 149.4695281982422, "rewards/rejected": -465.1382751464844, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -1.0454895496368408, "rewards/weighted_margins": 1.0130126476287842, "rewards/weighted_rejected": -2.058490037918091, "step": 1500 }, { "epoch": 0.7851347814708192, "eval_logits/chosen": -1.2306523323059082, "eval_logits/rejected": -1.26123046875, "eval_logps/chosen": -633.6630249023438, "eval_logps/rejected": -713.4450073242188, "eval_logps/weighted_chosen": -3.545902967453003, "eval_logps/weighted_rejected": -4.605466842651367, "eval_loss": 0.5521173477172852, "eval_rewards/accuracies": 0.593999981880188, "eval_rewards/chosen": -344.9159851074219, "eval_rewards/margins": 100.7760009765625, "eval_rewards/rejected": -445.6925048828125, "eval_rewards/weighted_accuracies": 0.7149999737739563, "eval_rewards/weighted_chosen": -1.159058928489685, "eval_rewards/weighted_margins": 0.8911977410316467, "eval_rewards/weighted_rejected": -2.0502567291259766, "eval_runtime": 1316.3607, "eval_samples_per_second": 1.519, "eval_steps_per_second": 0.38, "step": 1500 }, { "epoch": 0.7903690133472913, "grad_norm": 32.490413665771484, "learning_rate": 1.2897838696994505e-07, "logits/chosen": -1.1297607421875, "logits/rejected": -1.168298363685608, "logps/chosen": -585.390625, "logps/rejected": -678.625, "logps/weighted_chosen": -3.9292969703674316, "logps/weighted_rejected": -4.704247951507568, "loss": 0.5992, "rewards/accuracies": 0.671875, "rewards/chosen": -313.0980529785156, "rewards/margins": 103.3773422241211, "rewards/rejected": -416.39923095703125, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.228363037109375, "rewards/weighted_margins": 0.8051391839981079, "rewards/weighted_rejected": -2.0337891578674316, "step": 1510 }, { "epoch": 0.7956032452237635, "grad_norm": 103.49887084960938, "learning_rate": 1.229151110603649e-07, "logits/chosen": -1.1730270385742188, "logits/rejected": -1.2250702381134033, "logps/chosen": -639.2672119140625, "logps/rejected": -710.4703369140625, "logps/weighted_chosen": -3.5760254859924316, "logps/weighted_rejected": -4.558545112609863, "loss": 0.6109, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -342.1875, "rewards/margins": 95.5367202758789, "rewards/rejected": -437.6800842285156, "rewards/weighted_accuracies": 0.721875011920929, "rewards/weighted_chosen": -1.242523193359375, "rewards/weighted_margins": 0.7712646722793579, "rewards/weighted_rejected": -2.013842821121216, "step": 1520 }, { "epoch": 0.8008374771002356, "grad_norm": 26.20205307006836, "learning_rate": 1.1697777844051104e-07, "logits/chosen": -1.1935913562774658, "logits/rejected": -1.216577172279358, "logps/chosen": -663.4562377929688, "logps/rejected": -768.6937255859375, "logps/weighted_chosen": -3.4478759765625, "logps/weighted_rejected": -4.542138576507568, "loss": 0.5215, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -360.3539123535156, "rewards/margins": 126.4839859008789, "rewards/rejected": -486.7699279785156, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -1.057836890220642, "rewards/weighted_margins": 0.954821765422821, "rewards/weighted_rejected": -2.012655735015869, "step": 1530 }, { "epoch": 0.8060717089767077, "grad_norm": 158.38113403320312, "learning_rate": 1.111683721312477e-07, "logits/chosen": -1.1692626476287842, "logits/rejected": -1.199652075767517, "logps/chosen": -622.1984252929688, "logps/rejected": -711.6531372070312, "logps/weighted_chosen": -3.277587890625, "logps/weighted_rejected": -3.949267625808716, "loss": 0.5668, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -346.7880859375, "rewards/margins": 104.9453125, "rewards/rejected": -451.7046813964844, "rewards/weighted_accuracies": 0.706250011920929, "rewards/weighted_chosen": -1.1675598621368408, "rewards/weighted_margins": 0.843762218952179, "rewards/weighted_rejected": -2.0111327171325684, "step": 1540 }, { "epoch": 0.8113059408531798, "grad_norm": 189.9469757080078, "learning_rate": 1.0548883242709033e-07, "logits/chosen": -1.106359839439392, "logits/rejected": -1.178131103515625, "logps/chosen": -629.1640625, "logps/rejected": -776.9000244140625, "logps/weighted_chosen": -4.056591987609863, "logps/weighted_rejected": -4.928466796875, "loss": 0.4908, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -344.5941467285156, "rewards/margins": 162.8507843017578, "rewards/rejected": -507.3597717285156, "rewards/weighted_accuracies": 0.7718750238418579, "rewards/weighted_chosen": -1.1156005859375, "rewards/weighted_margins": 1.0612304210662842, "rewards/weighted_rejected": -2.17724609375, "step": 1550 }, { "epoch": 0.816540172729652, "grad_norm": 301.8493957519531, "learning_rate": 9.994105624816379e-08, "logits/chosen": -1.164636254310608, "logits/rejected": -1.2321898937225342, "logps/chosen": -670.3968505859375, "logps/rejected": -756.6937255859375, "logps/weighted_chosen": -3.8463377952575684, "logps/weighted_rejected": -5.091845512390137, "loss": 0.5505, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -368.4644470214844, "rewards/margins": 105.59648132324219, "rewards/rejected": -473.984375, "rewards/weighted_accuracies": 0.6968749761581421, "rewards/weighted_chosen": -1.300726294517517, "rewards/weighted_margins": 0.9333862066268921, "rewards/weighted_rejected": -2.2337646484375, "step": 1560 }, { "epoch": 0.821774404606124, "grad_norm": 212.7559051513672, "learning_rate": 9.452689650664514e-08, "logits/chosen": -1.166418433189392, "logits/rejected": -1.244348168373108, "logps/chosen": -682.6515502929688, "logps/rejected": -709.1500244140625, "logps/weighted_chosen": -3.459155321121216, "logps/weighted_rejected": -4.23291015625, "loss": 0.5405, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -386.58514404296875, "rewards/margins": 66.9886703491211, "rewards/rejected": -453.6781311035156, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.2503662109375, "rewards/weighted_margins": 0.901043713092804, "rewards/weighted_rejected": -2.1515135765075684, "step": 1570 }, { "epoch": 0.8270086364825961, "grad_norm": 280.51715087890625, "learning_rate": 8.924816148790748e-08, "logits/chosen": -1.174096703529358, "logits/rejected": -1.233984351158142, "logps/chosen": -679.6077880859375, "logps/rejected": -780.9749755859375, "logps/weighted_chosen": -3.776562452316284, "logps/weighted_rejected": -4.682421684265137, "loss": 0.5356, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -385.0078125, "rewards/margins": 117.716796875, "rewards/rejected": -502.8359375, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -1.1958191394805908, "rewards/weighted_margins": 1.0562187433242798, "rewards/weighted_rejected": -2.251452684402466, "step": 1580 }, { "epoch": 0.8322428683590684, "grad_norm": 39.72044372558594, "learning_rate": 8.410661424656607e-08, "logits/chosen": -1.203576683998108, "logits/rejected": -1.232690453529358, "logps/chosen": -671.4718627929688, "logps/rejected": -761.9312744140625, "logps/weighted_chosen": -3.552441358566284, "logps/weighted_rejected": -4.740527153015137, "loss": 0.5263, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -377.9898376464844, "rewards/margins": 124.10469055175781, "rewards/rejected": -501.8984375, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.308996558189392, "rewards/weighted_margins": 1.106896996498108, "rewards/weighted_rejected": -2.4161620140075684, "step": 1590 }, { "epoch": 0.8374771002355405, "grad_norm": 98.31045532226562, "learning_rate": 7.910397201763308e-08, "logits/chosen": -1.165185570716858, "logits/rejected": -1.1953125, "logps/chosen": -686.2750244140625, "logps/rejected": -773.1312255859375, "logps/weighted_chosen": -3.680419921875, "logps/weighted_rejected": -4.370849609375, "loss": 0.6126, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -419.4292907714844, "rewards/margins": 94.01953125, "rewards/rejected": -513.4410400390625, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.337377905845642, "rewards/weighted_margins": 0.9386047124862671, "rewards/weighted_rejected": -2.2768921852111816, "step": 1600 }, { "epoch": 0.8427113321120125, "grad_norm": 127.34908294677734, "learning_rate": 7.424190564297489e-08, "logits/chosen": -1.200341820716858, "logits/rejected": -1.2474243640899658, "logps/chosen": -723.3695068359375, "logps/rejected": -836.8171997070312, "logps/weighted_chosen": -3.5196776390075684, "logps/weighted_rejected": -4.542773246765137, "loss": 0.5396, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -442.2417907714844, "rewards/margins": 119.1753921508789, "rewards/rejected": -561.5078125, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -1.3686096668243408, "rewards/weighted_margins": 1.0560470819473267, "rewards/weighted_rejected": -2.424060106277466, "step": 1610 }, { "epoch": 0.8479455639884846, "grad_norm": 25.85185432434082, "learning_rate": 6.952203901326464e-08, "logits/chosen": -1.1488158702850342, "logits/rejected": -1.197351098060608, "logps/chosen": -701.828125, "logps/rejected": -791.3828125, "logps/weighted_chosen": -3.785595655441284, "logps/weighted_rejected": -5.068505764007568, "loss": 0.5519, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -412.7074279785156, "rewards/margins": 108.6156234741211, "rewards/rejected": -521.37109375, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.265100121498108, "rewards/weighted_margins": 1.0268433094024658, "rewards/weighted_rejected": -2.291638135910034, "step": 1620 }, { "epoch": 0.8531797958649568, "grad_norm": 111.40572357177734, "learning_rate": 6.494594852561558e-08, "logits/chosen": -1.1355712413787842, "logits/rejected": -1.18328857421875, "logps/chosen": -710.8062744140625, "logps/rejected": -790.7976684570312, "logps/weighted_chosen": -3.600848436355591, "logps/weighted_rejected": -4.999890327453613, "loss": 0.5333, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -419.97344970703125, "rewards/margins": 99.8208999633789, "rewards/rejected": -519.9085693359375, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.1865936517715454, "rewards/weighted_margins": 1.0258972644805908, "rewards/weighted_rejected": -2.2139830589294434, "step": 1630 }, { "epoch": 0.8584140277414289, "grad_norm": 33.190921783447266, "learning_rate": 6.051516255707773e-08, "logits/chosen": -1.163732886314392, "logits/rejected": -1.213415503501892, "logps/chosen": -656.5734252929688, "logps/rejected": -773.390625, "logps/weighted_chosen": -3.6455078125, "logps/weighted_rejected": -4.722460746765137, "loss": 0.539, "rewards/accuracies": 0.609375, "rewards/chosen": -383.56640625, "rewards/margins": 137.61328125, "rewards/rejected": -520.9671630859375, "rewards/weighted_accuracies": 0.7437499761581421, "rewards/weighted_chosen": -1.2532958984375, "rewards/weighted_margins": 1.098425269126892, "rewards/weighted_rejected": -2.3517823219299316, "step": 1640 }, { "epoch": 0.863648259617901, "grad_norm": 214.31565856933594, "learning_rate": 5.6231160954171796e-08, "logits/chosen": -1.112097144126892, "logits/rejected": -1.198706030845642, "logps/chosen": -665.1546630859375, "logps/rejected": -820.2890625, "logps/weighted_chosen": -3.9293456077575684, "logps/weighted_rejected": -4.075634956359863, "loss": 0.5607, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -395.1675720214844, "rewards/margins": 151.1287078857422, "rewards/rejected": -546.2628784179688, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.2908935546875, "rewards/weighted_margins": 0.8940979242324829, "rewards/weighted_rejected": -2.184948682785034, "step": 1650 }, { "epoch": 0.8688824914943732, "grad_norm": 40.810096740722656, "learning_rate": 5.209537453863289e-08, "logits/chosen": -1.177636742591858, "logits/rejected": -1.1999084949493408, "logps/chosen": -580.1390380859375, "logps/rejected": -695.2312622070312, "logps/weighted_chosen": -3.4837646484375, "logps/weighted_rejected": -4.563989162445068, "loss": 0.5084, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -310.1695251464844, "rewards/margins": 135.10391235351562, "rewards/rejected": -445.3179626464844, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -1.1034972667694092, "rewards/weighted_margins": 1.097009301185608, "rewards/weighted_rejected": -2.2013213634490967, "step": 1660 }, { "epoch": 0.8741167233708453, "grad_norm": 198.37266540527344, "learning_rate": 4.8109184629527344e-08, "logits/chosen": -1.1480286121368408, "logits/rejected": -1.174829125404358, "logps/chosen": -629.2374877929688, "logps/rejected": -765.0234375, "logps/weighted_chosen": -3.574951171875, "logps/weighted_rejected": -4.886132717132568, "loss": 0.5108, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -358.5101623535156, "rewards/margins": 148.86758422851562, "rewards/rejected": -507.44061279296875, "rewards/weighted_accuracies": 0.737500011920929, "rewards/weighted_chosen": -1.147027611732483, "rewards/weighted_margins": 1.134619116783142, "rewards/weighted_rejected": -2.281726121902466, "step": 1670 }, { "epoch": 0.8793509552473174, "grad_norm": 21.939449310302734, "learning_rate": 4.427392258190399e-08, "logits/chosen": -1.1966369152069092, "logits/rejected": -1.217309594154358, "logps/chosen": -674.2453002929688, "logps/rejected": -737.453125, "logps/weighted_chosen": -3.573333740234375, "logps/weighted_rejected": -5.062792778015137, "loss": 0.5187, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -384.4058532714844, "rewards/margins": 92.5804672241211, "rewards/rejected": -477.02655029296875, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -1.1649169921875, "rewards/weighted_margins": 0.9775329828262329, "rewards/weighted_rejected": -2.1424317359924316, "step": 1680 }, { "epoch": 0.8845851871237895, "grad_norm": 72.72052764892578, "learning_rate": 4.059086934213141e-08, "logits/chosen": -1.1529357433319092, "logits/rejected": -1.1970093250274658, "logps/chosen": -670.0140380859375, "logps/rejected": -724.27734375, "logps/weighted_chosen": -3.518237352371216, "logps/weighted_rejected": -4.640625, "loss": 0.5218, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -378.71990966796875, "rewards/margins": 95.2548828125, "rewards/rejected": -474.0453186035156, "rewards/weighted_accuracies": 0.7437499761581421, "rewards/weighted_chosen": -1.298101782798767, "rewards/weighted_margins": 1.04913330078125, "rewards/weighted_rejected": -2.347949266433716, "step": 1690 }, { "epoch": 0.8898194190002617, "grad_norm": 320.4640197753906, "learning_rate": 3.7061255020073346e-08, "logits/chosen": -1.221380591392517, "logits/rejected": -1.2476806640625, "logps/chosen": -701.5734252929688, "logps/rejected": -770.7281494140625, "logps/weighted_chosen": -3.537548780441284, "logps/weighted_rejected": -4.314111232757568, "loss": 0.5509, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -415.10858154296875, "rewards/margins": 93.888671875, "rewards/rejected": -509.1382751464844, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -1.25872802734375, "rewards/weighted_margins": 0.9871581792831421, "rewards/weighted_rejected": -2.246020555496216, "step": 1700 }, { "epoch": 0.8950536508767338, "grad_norm": 21.794382095336914, "learning_rate": 3.3686258478241027e-08, "logits/chosen": -1.217187523841858, "logits/rejected": -1.2429320812225342, "logps/chosen": -727.0828247070312, "logps/rejected": -782.7093505859375, "logps/weighted_chosen": -3.617626905441284, "logps/weighted_rejected": -4.555224418640137, "loss": 0.5477, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -415.97637939453125, "rewards/margins": 87.7699203491211, "rewards/rejected": -503.55938720703125, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -1.3385436534881592, "rewards/weighted_margins": 0.872027575969696, "rewards/weighted_rejected": -2.211010694503784, "step": 1710 }, { "epoch": 0.9002878827532059, "grad_norm": 157.28964233398438, "learning_rate": 3.0467006938063366e-08, "logits/chosen": -1.1905517578125, "logits/rejected": -1.2344787120819092, "logps/chosen": -669.9632568359375, "logps/rejected": -746.0343627929688, "logps/weighted_chosen": -3.239794969558716, "logps/weighted_rejected": -4.668139457702637, "loss": 0.5525, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -397.27459716796875, "rewards/margins": 89.14570617675781, "rewards/rejected": -486.55157470703125, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -1.291589379310608, "rewards/weighted_margins": 1.105200171470642, "rewards/weighted_rejected": -2.396716356277466, "step": 1720 }, { "epoch": 0.9055221146296781, "grad_norm": 295.63861083984375, "learning_rate": 2.7404575603403646e-08, "logits/chosen": -1.1640503406524658, "logits/rejected": -1.215722680091858, "logps/chosen": -654.3117065429688, "logps/rejected": -804.9749755859375, "logps/weighted_chosen": -3.811474561691284, "logps/weighted_rejected": -5.056396484375, "loss": 0.4448, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -380.11505126953125, "rewards/margins": 155.30078125, "rewards/rejected": -535.3187255859375, "rewards/weighted_accuracies": 0.778124988079071, "rewards/weighted_chosen": -1.1834747791290283, "rewards/weighted_margins": 1.2722076177597046, "rewards/weighted_rejected": -2.4549317359924316, "step": 1730 }, { "epoch": 0.9107563465061502, "grad_norm": 85.96134948730469, "learning_rate": 2.4499987301450698e-08, "logits/chosen": -1.1945922374725342, "logits/rejected": -1.2708861827850342, "logps/chosen": -713.2406005859375, "logps/rejected": -868.5015869140625, "logps/weighted_chosen": -3.6737060546875, "logps/weighted_rejected": -4.847460746765137, "loss": 0.4624, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -393.1859436035156, "rewards/margins": 189.7375030517578, "rewards/rejected": -583.0999755859375, "rewards/weighted_accuracies": 0.768750011920929, "rewards/weighted_chosen": -1.1947753429412842, "rewards/weighted_margins": 1.2012450695037842, "rewards/weighted_rejected": -2.3964600563049316, "step": 1740 }, { "epoch": 0.9159905783826223, "grad_norm": 99.16748046875, "learning_rate": 2.1754212141102347e-08, "logits/chosen": -1.1943480968475342, "logits/rejected": -1.211828589439392, "logps/chosen": -664.3336181640625, "logps/rejected": -769.375, "logps/weighted_chosen": -4.12176513671875, "logps/weighted_rejected": -5.160546779632568, "loss": 0.5123, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -385.3828125, "rewards/margins": 114.51953125, "rewards/rejected": -500.05859375, "rewards/weighted_accuracies": 0.762499988079071, "rewards/weighted_chosen": -1.272180199623108, "rewards/weighted_margins": 1.085974097251892, "rewards/weighted_rejected": -2.3573365211486816, "step": 1750 }, { "epoch": 0.9212248102590945, "grad_norm": 249.57980346679688, "learning_rate": 1.9168167188957586e-08, "logits/chosen": -1.141473412513733, "logits/rejected": -1.218017578125, "logps/chosen": -658.53125, "logps/rejected": -720.2249755859375, "logps/weighted_chosen": -3.7562012672424316, "logps/weighted_rejected": -4.617724418640137, "loss": 0.5498, "rewards/accuracies": 0.59375, "rewards/chosen": -377.919921875, "rewards/margins": 95.85078430175781, "rewards/rejected": -473.56719970703125, "rewards/weighted_accuracies": 0.7124999761581421, "rewards/weighted_chosen": -1.2995789051055908, "rewards/weighted_margins": 1.051355004310608, "rewards/weighted_rejected": -2.3507447242736816, "step": 1760 }, { "epoch": 0.9264590421355666, "grad_norm": 64.01028442382812, "learning_rate": 1.6742716163022865e-08, "logits/chosen": -1.20599365234375, "logits/rejected": -1.241949439048767, "logps/chosen": -681.2359619140625, "logps/rejected": -812.1702880859375, "logps/weighted_chosen": -3.519580125808716, "logps/weighted_rejected": -4.841210842132568, "loss": 0.5064, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -385.01171875, "rewards/margins": 150.9167938232422, "rewards/rejected": -535.8961181640625, "rewards/weighted_accuracies": 0.784375011920929, "rewards/weighted_chosen": -1.3197021484375, "rewards/weighted_margins": 1.097833275794983, "rewards/weighted_rejected": -2.417858839035034, "step": 1770 }, { "epoch": 0.9316932740120387, "grad_norm": 157.51995849609375, "learning_rate": 1.4478669144238343e-08, "logits/chosen": -1.164239525794983, "logits/rejected": -1.182275414466858, "logps/chosen": -622.8038940429688, "logps/rejected": -767.3624877929688, "logps/weighted_chosen": -3.6578125953674316, "logps/weighted_rejected": -5.080908298492432, "loss": 0.5548, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -359.64569091796875, "rewards/margins": 152.6183624267578, "rewards/rejected": -512.1964721679688, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.2336914539337158, "rewards/weighted_margins": 1.092431664466858, "rewards/weighted_rejected": -2.3256287574768066, "step": 1780 }, { "epoch": 0.9369275058885108, "grad_norm": 84.96482849121094, "learning_rate": 1.23767823059166e-08, "logits/chosen": -1.175811767578125, "logits/rejected": -1.2118408679962158, "logps/chosen": -703.2109375, "logps/rejected": -821.1812744140625, "logps/weighted_chosen": -3.453997850418091, "logps/weighted_rejected": -4.739892482757568, "loss": 0.5076, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -412.1927795410156, "rewards/margins": 128.1863250732422, "rewards/rejected": -540.3038940429688, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -1.3178520202636719, "rewards/weighted_margins": 1.1170654296875, "rewards/weighted_rejected": -2.4344725608825684, "step": 1790 }, { "epoch": 0.942161737764983, "grad_norm": 24.094118118286133, "learning_rate": 1.0437757661187486e-08, "logits/chosen": -1.1822021007537842, "logits/rejected": -1.2230346202850342, "logps/chosen": -731.1359252929688, "logps/rejected": -840.8406372070312, "logps/weighted_chosen": -4.1541748046875, "logps/weighted_rejected": -4.836865425109863, "loss": 0.504, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -415.04376220703125, "rewards/margins": 144.8796844482422, "rewards/rejected": -559.7952880859375, "rewards/weighted_accuracies": 0.731249988079071, "rewards/weighted_chosen": -1.2489135265350342, "rewards/weighted_margins": 1.05224609375, "rewards/weighted_rejected": -2.301586866378784, "step": 1800 }, { "epoch": 0.9473959696414551, "grad_norm": 365.9298400878906, "learning_rate": 8.662242828530953e-09, "logits/chosen": -1.1748535633087158, "logits/rejected": -1.2249877452850342, "logps/chosen": -639.7000122070312, "logps/rejected": -771.2734375, "logps/weighted_chosen": -4.110595703125, "logps/weighted_rejected": -5.005419731140137, "loss": 0.5637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -378.4296875, "rewards/margins": 139.9353485107422, "rewards/rejected": -518.2789306640625, "rewards/weighted_accuracies": 0.721875011920929, "rewards/weighted_chosen": -1.3699219226837158, "rewards/weighted_margins": 0.996142566204071, "rewards/weighted_rejected": -2.3658814430236816, "step": 1810 }, { "epoch": 0.9526302015179272, "grad_norm": 38.30413818359375, "learning_rate": 7.050830815478082e-09, "logits/chosen": -1.1999022960662842, "logits/rejected": -1.227929711341858, "logps/chosen": -652.6156005859375, "logps/rejected": -764.3577880859375, "logps/weighted_chosen": -3.982666015625, "logps/weighted_rejected": -4.869433403015137, "loss": 0.5582, "rewards/accuracies": 0.609375, "rewards/chosen": -392.23748779296875, "rewards/margins": 127.1484375, "rewards/rejected": -519.095703125, "rewards/weighted_accuracies": 0.7281249761581421, "rewards/weighted_chosen": -1.359582543373108, "rewards/weighted_margins": 0.993237316608429, "rewards/weighted_rejected": -2.352398633956909, "step": 1820 }, { "epoch": 0.9578644333943994, "grad_norm": 314.2492980957031, "learning_rate": 5.604059820551177e-09, "logits/chosen": -1.1942627429962158, "logits/rejected": -1.217626929283142, "logps/chosen": -691.1734619140625, "logps/rejected": -792.9508056640625, "logps/weighted_chosen": -3.8807616233825684, "logps/weighted_rejected": -4.9736328125, "loss": 0.5307, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -392.8941345214844, "rewards/margins": 139.66659545898438, "rewards/rejected": -532.2779541015625, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.2794067859649658, "rewards/weighted_margins": 1.0887451171875, "rewards/weighted_rejected": -2.3676390647888184, "step": 1830 }, { "epoch": 0.9630986652708715, "grad_norm": 29.67596435546875, "learning_rate": 4.322413053509943e-09, "logits/chosen": -1.1745643615722656, "logits/rejected": -1.2179076671600342, "logps/chosen": -716.4593505859375, "logps/rejected": -808.65625, "logps/weighted_chosen": -3.5707030296325684, "logps/weighted_rejected": -4.941259860992432, "loss": 0.5277, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -415.00274658203125, "rewards/margins": 121.33085632324219, "rewards/rejected": -536.318359375, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.2857787609100342, "rewards/weighted_margins": 1.0569336414337158, "rewards/weighted_rejected": -2.3438477516174316, "step": 1840 }, { "epoch": 0.9683328971473436, "grad_norm": 43.18519592285156, "learning_rate": 3.206318573963418e-09, "logits/chosen": -1.18829345703125, "logits/rejected": -1.2353515625, "logps/chosen": -673.609375, "logps/rejected": -773.1687622070312, "logps/weighted_chosen": -3.8472657203674316, "logps/weighted_rejected": -4.793212890625, "loss": 0.5458, "rewards/accuracies": 0.59375, "rewards/chosen": -387.5249938964844, "rewards/margins": 126.17655944824219, "rewards/rejected": -513.7249755859375, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -1.34063720703125, "rewards/weighted_margins": 0.9600464105606079, "rewards/weighted_rejected": -2.3008790016174316, "step": 1850 }, { "epoch": 0.9735671290238157, "grad_norm": 98.20772552490234, "learning_rate": 2.256149148401387e-09, "logits/chosen": -1.193994164466858, "logits/rejected": -1.213659644126892, "logps/chosen": -663.5203247070312, "logps/rejected": -831.8812255859375, "logps/weighted_chosen": -3.932324171066284, "logps/weighted_rejected": -4.665478706359863, "loss": 0.5257, "rewards/accuracies": 0.625, "rewards/chosen": -390.19647216796875, "rewards/margins": 177.13241577148438, "rewards/rejected": -567.4085693359375, "rewards/weighted_accuracies": 0.7718750238418579, "rewards/weighted_chosen": -1.37353515625, "rewards/weighted_margins": 0.999591052532196, "rewards/weighted_rejected": -2.3727660179138184, "step": 1860 }, { "epoch": 0.9788013609002879, "grad_norm": 156.7181854248047, "learning_rate": 1.4722221256933676e-09, "logits/chosen": -1.210534691810608, "logits/rejected": -1.2340819835662842, "logps/chosen": -682.4734497070312, "logps/rejected": -756.2828369140625, "logps/weighted_chosen": -3.5858397483825684, "logps/weighted_rejected": -5.135632514953613, "loss": 0.6143, "rewards/accuracies": 0.578125, "rewards/chosen": -411.08319091796875, "rewards/margins": 92.4906234741211, "rewards/rejected": -503.4302673339844, "rewards/weighted_accuracies": 0.7093750238418579, "rewards/weighted_chosen": -1.425439476966858, "rewards/weighted_margins": 0.88873291015625, "rewards/weighted_rejected": -2.314349412918091, "step": 1870 }, { "epoch": 0.98403559277676, "grad_norm": 43.11684799194336, "learning_rate": 8.547993310970003e-10, "logits/chosen": -1.1872589588165283, "logits/rejected": -1.232177734375, "logps/chosen": -659.8531494140625, "logps/rejected": -765.3453369140625, "logps/weighted_chosen": -4.0516357421875, "logps/weighted_rejected": -4.990136623382568, "loss": 0.6088, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -378.859375, "rewards/margins": 118.80351257324219, "rewards/rejected": -497.828125, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -1.2618834972381592, "rewards/weighted_margins": 0.9063965082168579, "rewards/weighted_rejected": -2.1686768531799316, "step": 1880 }, { "epoch": 0.9892698246532321, "grad_norm": 290.6236877441406, "learning_rate": 4.040869788100032e-10, "logits/chosen": -1.1602783203125, "logits/rejected": -1.1969115734100342, "logps/chosen": -663.7374877929688, "logps/rejected": -736.7015380859375, "logps/weighted_chosen": -3.6440186500549316, "logps/weighted_rejected": -4.590576171875, "loss": 0.5181, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -384.9828186035156, "rewards/margins": 106.361328125, "rewards/rejected": -491.2445373535156, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.2502593994140625, "rewards/weighted_margins": 1.019628882408142, "rewards/weighted_rejected": -2.2702393531799316, "step": 1890 }, { "epoch": 0.9945040565297043, "grad_norm": 531.091064453125, "learning_rate": 1.202356030968743e-10, "logits/chosen": -1.2035033702850342, "logits/rejected": -1.2423584461212158, "logps/chosen": -708.1984252929688, "logps/rejected": -793.2374877929688, "logps/weighted_chosen": -3.401611328125, "logps/weighted_rejected": -4.605420112609863, "loss": 0.5386, "rewards/accuracies": 0.59375, "rewards/chosen": -401.53399658203125, "rewards/margins": 125.5843734741211, "rewards/rejected": -527.2421875, "rewards/weighted_accuracies": 0.734375, "rewards/weighted_chosen": -1.2851684093475342, "rewards/weighted_margins": 1.1517822742462158, "rewards/weighted_rejected": -2.436840772628784, "step": 1900 }, { "epoch": 0.9997382884061764, "grad_norm": 248.43289184570312, "learning_rate": 3.3400080112211405e-12, "logits/chosen": -1.171716332435608, "logits/rejected": -1.211022973060608, "logps/chosen": -732.8812255859375, "logps/rejected": -844.5875244140625, "logps/weighted_chosen": -3.752758741378784, "logps/weighted_rejected": -4.8924560546875, "loss": 0.4947, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -432.294921875, "rewards/margins": 132.95858764648438, "rewards/rejected": -565.1109619140625, "rewards/weighted_accuracies": 0.778124988079071, "rewards/weighted_chosen": -1.2966216802597046, "rewards/weighted_margins": 1.052240014076233, "rewards/weighted_rejected": -2.348583936691284, "step": 1910 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }