{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.26171159382360637, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005234231876472127, "grad_norm": 136.4956817626953, "learning_rate": 0.0, "logits/chosen": -0.40118408203125, "logits/rejected": -0.41802978515625, "logps/chosen": -297.609375, "logps/rejected": -247.84375, "logps/weighted_chosen": -4.50634765625, "logps/weighted_rejected": -3.43408203125, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.005234231876472127, "grad_norm": 269.8279113769531, "learning_rate": 4.6875e-08, "logits/chosen": -0.3176456093788147, "logits/rejected": -0.3530849814414978, "logps/chosen": -275.5694580078125, "logps/rejected": -255.875, "logps/weighted_chosen": -2.478325843811035, "logps/weighted_rejected": -2.635009765625, "loss": 0.6911, "rewards/accuracies": 0.2604166567325592, "rewards/chosen": -0.0588107630610466, "rewards/margins": -0.0334201380610466, "rewards/rejected": -0.025390625, "rewards/weighted_accuracies": 0.3229166567325592, "rewards/weighted_chosen": 0.0010899438057094812, "rewards/weighted_margins": 0.0014837053604424, "rewards/weighted_rejected": -0.00039333768654614687, "step": 10 }, { "epoch": 0.010468463752944255, "grad_norm": 12.950738906860352, "learning_rate": 9.895833333333332e-08, "logits/chosen": -0.2977302670478821, "logits/rejected": -0.3139175474643707, "logps/chosen": -294.37579345703125, "logps/rejected": -272.7203063964844, "logps/weighted_chosen": -2.30804443359375, "logps/weighted_rejected": -2.247570753097534, "loss": 0.692, "rewards/accuracies": 0.296875, "rewards/chosen": -0.05908203125, "rewards/margins": -0.01513671875, "rewards/rejected": -0.0439453125, "rewards/weighted_accuracies": 0.34687501192092896, "rewards/weighted_chosen": 7.171630568336695e-05, "rewards/weighted_margins": 3.662109520519152e-05, "rewards/weighted_rejected": 3.509521411615424e-05, "step": 20 }, { "epoch": 0.015702695629416383, "grad_norm": 65.14871215820312, "learning_rate": 1.5104166666666664e-07, "logits/chosen": -0.2917991578578949, "logits/rejected": -0.3373458981513977, "logps/chosen": -297.9375, "logps/rejected": -268.0062561035156, "logps/weighted_chosen": -2.3156495094299316, "logps/weighted_rejected": -2.529247999191284, "loss": 0.693, "rewards/accuracies": 0.28125, "rewards/chosen": 0.02177734300494194, "rewards/margins": -0.06005859375, "rewards/rejected": 0.08183594048023224, "rewards/weighted_accuracies": 0.34062498807907104, "rewards/weighted_chosen": 0.0012060165172442794, "rewards/weighted_margins": -0.0021120072342455387, "rewards/weighted_rejected": 0.0033180236350744963, "step": 30 }, { "epoch": 0.02093692750588851, "grad_norm": 30.699859619140625, "learning_rate": 2.03125e-07, "logits/chosen": -0.301095575094223, "logits/rejected": -0.3433547914028168, "logps/chosen": -278.7124938964844, "logps/rejected": -253.84530639648438, "logps/weighted_chosen": -2.3508667945861816, "logps/weighted_rejected": -2.55126953125, "loss": 0.6909, "rewards/accuracies": 0.2718749940395355, "rewards/chosen": -0.014892578125, "rewards/margins": -0.105224609375, "rewards/rejected": 0.09033203125, "rewards/weighted_accuracies": 0.359375, "rewards/weighted_chosen": 0.0031726837623864412, "rewards/weighted_margins": 0.0010789871448650956, "rewards/weighted_rejected": 0.002093696501106024, "step": 40 }, { "epoch": 0.02617115938236064, "grad_norm": 15.275498390197754, "learning_rate": 2.552083333333333e-07, "logits/chosen": -0.28184205293655396, "logits/rejected": -0.32042425870895386, "logps/chosen": -280.19140625, "logps/rejected": -267.4398498535156, "logps/weighted_chosen": -2.281604051589966, "logps/weighted_rejected": -2.278125047683716, "loss": 0.691, "rewards/accuracies": 0.3125, "rewards/chosen": 0.009082031436264515, "rewards/margins": -0.11552734673023224, "rewards/rejected": 0.12460937350988388, "rewards/weighted_accuracies": 0.375, "rewards/weighted_chosen": 0.0036834715865552425, "rewards/weighted_margins": 0.0022117614280432463, "rewards/weighted_rejected": 0.0014717101585119963, "step": 50 }, { "epoch": 0.031405391258832765, "grad_norm": 42.36075973510742, "learning_rate": 3.0729166666666665e-07, "logits/chosen": -0.3147949278354645, "logits/rejected": -0.3081672787666321, "logps/chosen": -277.5492248535156, "logps/rejected": -261.63201904296875, "logps/weighted_chosen": -2.6925292015075684, "logps/weighted_rejected": -2.5440917015075684, "loss": 0.6888, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": 0.15625, "rewards/margins": 0.06132812425494194, "rewards/rejected": 0.09492187201976776, "rewards/weighted_accuracies": 0.43437498807907104, "rewards/weighted_chosen": 0.0014663696056231856, "rewards/weighted_margins": 0.0070289610885083675, "rewards/weighted_rejected": -0.005612182430922985, "step": 60 }, { "epoch": 0.036639623135304895, "grad_norm": 70.38953399658203, "learning_rate": 3.59375e-07, "logits/chosen": -0.31798094511032104, "logits/rejected": -0.32500457763671875, "logps/chosen": -289.6695251464844, "logps/rejected": -244.7351531982422, "logps/weighted_chosen": -2.19110107421875, "logps/weighted_rejected": -2.427661180496216, "loss": 0.6869, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36865234375, "rewards/margins": 0.34619140625, "rewards/rejected": 0.0224609375, "rewards/weighted_accuracies": 0.5375000238418579, "rewards/weighted_chosen": 0.02108154259622097, "rewards/weighted_margins": 0.011628913693130016, "rewards/weighted_rejected": 0.009454727172851562, "step": 70 }, { "epoch": 0.04187385501177702, "grad_norm": 54.81600570678711, "learning_rate": 4.114583333333333e-07, "logits/chosen": -0.2848251461982727, "logits/rejected": -0.3061889708042145, "logps/chosen": -289.08203125, "logps/rejected": -269.95782470703125, "logps/weighted_chosen": -2.3393311500549316, "logps/weighted_rejected": -2.5093994140625, "loss": 0.6827, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.679980456829071, "rewards/margins": 0.3672851622104645, "rewards/rejected": 0.31269532442092896, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.048927437514066696, "rewards/weighted_margins": 0.025808846578001976, "rewards/weighted_rejected": 0.02311401441693306, "step": 80 }, { "epoch": 0.04710808688824915, "grad_norm": 30.603567123413086, "learning_rate": 4.6354166666666664e-07, "logits/chosen": -0.312631219625473, "logits/rejected": -0.30586013197898865, "logps/chosen": -279.92578125, "logps/rejected": -257.5234375, "logps/weighted_chosen": -2.5583739280700684, "logps/weighted_rejected": -2.6037840843200684, "loss": 0.6752, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.757617175579071, "rewards/margins": 0.5347656011581421, "rewards/rejected": 0.22285155951976776, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": 0.08651771396398544, "rewards/weighted_margins": 0.05735473707318306, "rewards/weighted_rejected": 0.02917785570025444, "step": 90 }, { "epoch": 0.05234231876472128, "grad_norm": 99.52488708496094, "learning_rate": 5.156249999999999e-07, "logits/chosen": -0.2793869078159332, "logits/rejected": -0.3509994447231293, "logps/chosen": -290.19061279296875, "logps/rejected": -285.94842529296875, "logps/weighted_chosen": -2.013537645339966, "logps/weighted_rejected": -2.5231690406799316, "loss": 0.6984, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.72607421875, "rewards/margins": 0.80712890625, "rewards/rejected": -0.0810546875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.07289886474609375, "rewards/weighted_margins": 0.019171524792909622, "rewards/weighted_rejected": 0.05372200161218643, "step": 100 }, { "epoch": 0.05757655064119341, "grad_norm": 55.06239318847656, "learning_rate": 5.677083333333333e-07, "logits/chosen": -0.3277145326137543, "logits/rejected": -0.3189544677734375, "logps/chosen": -296.17889404296875, "logps/rejected": -261.51953125, "logps/weighted_chosen": -2.647631883621216, "logps/weighted_rejected": -2.6458740234375, "loss": 0.6844, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.1376953125, "rewards/margins": 0.789746105670929, "rewards/rejected": -0.6522461175918579, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": 0.05432138592004776, "rewards/weighted_margins": 0.03696594387292862, "rewards/weighted_rejected": 0.01735992357134819, "step": 110 }, { "epoch": 0.06281078251766553, "grad_norm": 42.45036315917969, "learning_rate": 6.197916666666666e-07, "logits/chosen": -0.33232802152633667, "logits/rejected": -0.35865744948387146, "logps/chosen": -294.6078186035156, "logps/rejected": -255.01327514648438, "logps/weighted_chosen": -1.970422387123108, "logps/weighted_rejected": -2.1345458030700684, "loss": 0.6837, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0013671874767169356, "rewards/margins": 1.2595703601837158, "rewards/rejected": -1.258203148841858, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": 0.048947714269161224, "rewards/weighted_margins": 0.03674011304974556, "rewards/weighted_rejected": 0.012198830023407936, "step": 120 }, { "epoch": 0.06804501439413765, "grad_norm": 28.637327194213867, "learning_rate": 6.718749999999999e-07, "logits/chosen": -0.2933105528354645, "logits/rejected": -0.29020920395851135, "logps/chosen": -305.86602783203125, "logps/rejected": -279.09844970703125, "logps/weighted_chosen": -1.9946777820587158, "logps/weighted_rejected": -2.2933349609375, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.285400390625, "rewards/margins": 1.7852051258087158, "rewards/rejected": -1.4998047351837158, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": 0.06223602220416069, "rewards/weighted_margins": 0.05499267578125, "rewards/weighted_rejected": 0.0072036744095385075, "step": 130 }, { "epoch": 0.07327924627060979, "grad_norm": 20.866477966308594, "learning_rate": 7.239583333333333e-07, "logits/chosen": -0.33005982637405396, "logits/rejected": -0.34638214111328125, "logps/chosen": -300.7281188964844, "logps/rejected": -276.9156188964844, "logps/weighted_chosen": -2.294506788253784, "logps/weighted_rejected": -2.561084032058716, "loss": 0.6606, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -1.998437523841858, "rewards/margins": 2.3173828125, "rewards/rejected": -4.315820217132568, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": 0.04616241529583931, "rewards/weighted_margins": 0.08864898979663849, "rewards/weighted_rejected": -0.04253540188074112, "step": 140 }, { "epoch": 0.07851347814708191, "grad_norm": 43.497623443603516, "learning_rate": 7.760416666666666e-07, "logits/chosen": -0.2890525758266449, "logits/rejected": -0.30130767822265625, "logps/chosen": -287.3109436035156, "logps/rejected": -256.72967529296875, "logps/weighted_chosen": -2.883862257003784, "logps/weighted_rejected": -2.796630859375, "loss": 0.664, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -3.108203172683716, "rewards/margins": 2.7275390625, "rewards/rejected": -5.835741996765137, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": 0.04738159105181694, "rewards/weighted_margins": 0.09773864597082138, "rewards/weighted_rejected": -0.05034027248620987, "step": 150 }, { "epoch": 0.08374771002355404, "grad_norm": 12.989988327026367, "learning_rate": 8.28125e-07, "logits/chosen": -0.3334007263183594, "logits/rejected": -0.3831237852573395, "logps/chosen": -308.2320251464844, "logps/rejected": -281.5335998535156, "logps/weighted_chosen": -2.378857374191284, "logps/weighted_rejected": -2.3824095726013184, "loss": 0.6464, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -4.347460746765137, "rewards/margins": 3.98388671875, "rewards/rejected": -8.331347465515137, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": 0.04017028957605362, "rewards/weighted_margins": 0.15716858208179474, "rewards/weighted_rejected": -0.11710510402917862, "step": 160 }, { "epoch": 0.08898194190002617, "grad_norm": 691.15625, "learning_rate": 8.802083333333333e-07, "logits/chosen": -0.37783128023147583, "logits/rejected": -0.37789613008499146, "logps/chosen": -310.76953125, "logps/rejected": -270.8374938964844, "logps/weighted_chosen": -2.6180419921875, "logps/weighted_rejected": -2.8051514625549316, "loss": 0.683, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -7.780859470367432, "rewards/margins": 4.9013671875, "rewards/rejected": -12.683398246765137, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": 0.06108245998620987, "rewards/weighted_margins": 0.17230224609375, "rewards/weighted_rejected": -0.11125946044921875, "step": 170 }, { "epoch": 0.0942161737764983, "grad_norm": 65.56312561035156, "learning_rate": 9.322916666666666e-07, "logits/chosen": -0.3433074951171875, "logits/rejected": -0.363290399312973, "logps/chosen": -285.8812561035156, "logps/rejected": -274.94842529296875, "logps/weighted_chosen": -2.50494384765625, "logps/weighted_rejected": -2.9246826171875, "loss": 0.6821, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -12.179101943969727, "rewards/margins": 6.432421684265137, "rewards/rejected": -18.608592987060547, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.0216827392578125, "rewards/weighted_margins": 0.19813232123851776, "rewards/weighted_rejected": -0.2197723388671875, "step": 180 }, { "epoch": 0.09945040565297043, "grad_norm": 26.828750610351562, "learning_rate": 9.84375e-07, "logits/chosen": -0.36560744047164917, "logits/rejected": -0.40380173921585083, "logps/chosen": -321.97967529296875, "logps/rejected": -287.5570373535156, "logps/weighted_chosen": -2.3037109375, "logps/weighted_rejected": -2.64697265625, "loss": 0.6489, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -16.246679306030273, "rewards/margins": 7.440039157867432, "rewards/rejected": -23.685937881469727, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -0.11536254733800888, "rewards/weighted_margins": 0.19368895888328552, "rewards/weighted_rejected": -0.309121698141098, "step": 190 }, { "epoch": 0.10468463752944256, "grad_norm": 117.84883880615234, "learning_rate": 9.99959085414323e-07, "logits/chosen": -0.4036270081996918, "logits/rejected": -0.43666380643844604, "logps/chosen": -327.8265686035156, "logps/rejected": -284.4234313964844, "logps/weighted_chosen": -2.6368165016174316, "logps/weighted_rejected": -3.0156006813049316, "loss": 0.6383, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -18.538379669189453, "rewards/margins": 8.212695121765137, "rewards/rejected": -26.748046875, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.12391586601734161, "rewards/weighted_margins": 0.28039854764938354, "rewards/weighted_rejected": -0.404205322265625, "step": 200 }, { "epoch": 0.10991886940591468, "grad_norm": 15.525572776794434, "learning_rate": 9.997587035630105e-07, "logits/chosen": -0.41632765531539917, "logits/rejected": -0.4597106873989105, "logps/chosen": -308.75469970703125, "logps/rejected": -316.69842529296875, "logps/weighted_chosen": -2.439282178878784, "logps/weighted_rejected": -3.152050733566284, "loss": 0.6437, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -21.632617950439453, "rewards/margins": 10.560155868530273, "rewards/rejected": -32.18867111206055, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.2110244780778885, "rewards/weighted_margins": 0.23976440727710724, "rewards/weighted_rejected": -0.4508819580078125, "step": 210 }, { "epoch": 0.11515310128238682, "grad_norm": 31.583986282348633, "learning_rate": 9.99391406364405e-07, "logits/chosen": -0.37994080781936646, "logits/rejected": -0.384744256734848, "logps/chosen": -318.63592529296875, "logps/rejected": -305.0, "logps/weighted_chosen": -2.9573731422424316, "logps/weighted_rejected": -3.207348585128784, "loss": 0.7259, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -26.522266387939453, "rewards/margins": 9.657129287719727, "rewards/rejected": -36.18437576293945, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.3494918942451477, "rewards/weighted_margins": 0.17980042099952698, "rewards/weighted_rejected": -0.5295776128768921, "step": 220 }, { "epoch": 0.12038733315885894, "grad_norm": 611.2006225585938, "learning_rate": 9.988573164927884e-07, "logits/chosen": -0.3254447877407074, "logits/rejected": -0.36442261934280396, "logps/chosen": -297.16717529296875, "logps/rejected": -296.48907470703125, "logps/weighted_chosen": -2.35888671875, "logps/weighted_rejected": -2.804126024246216, "loss": 0.6637, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -28.293750762939453, "rewards/margins": 16.522851943969727, "rewards/rejected": -44.817970275878906, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -0.1478603333234787, "rewards/weighted_margins": 0.3882904052734375, "rewards/weighted_rejected": -0.5360336303710938, "step": 230 }, { "epoch": 0.12562156503533106, "grad_norm": 34.738059997558594, "learning_rate": 9.98156612329838e-07, "logits/chosen": -0.3876676559448242, "logits/rejected": -0.4364303648471832, "logps/chosen": -312.5921936035156, "logps/rejected": -348.60626220703125, "logps/weighted_chosen": -2.594775438308716, "logps/weighted_rejected": -3.1068115234375, "loss": 0.7482, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -47.75507736206055, "rewards/margins": 18.762109756469727, "rewards/rejected": -66.5289077758789, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.319406121969223, "rewards/weighted_margins": 0.204498291015625, "rewards/weighted_rejected": -0.5241454839706421, "step": 240 }, { "epoch": 0.13085579691180318, "grad_norm": 141.2784423828125, "learning_rate": 9.97289527905053e-07, "logits/chosen": -0.4159797728061676, "logits/rejected": -0.4382568299770355, "logps/chosen": -308.44842529296875, "logps/rejected": -313.49530029296875, "logps/weighted_chosen": -2.802807569503784, "logps/weighted_rejected": -2.9874024391174316, "loss": 0.7583, "rewards/accuracies": 0.609375, "rewards/chosen": -44.16621017456055, "rewards/margins": 13.382226943969727, "rewards/rejected": -57.54296875, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -0.20551452040672302, "rewards/weighted_margins": 0.22150878608226776, "rewards/weighted_rejected": -0.42730408906936646, "step": 250 }, { "epoch": 0.1360900287882753, "grad_norm": 75.63247680664062, "learning_rate": 9.962563528175875e-07, "logits/chosen": -0.4156021177768707, "logits/rejected": -0.4493362307548523, "logps/chosen": -342.1312561035156, "logps/rejected": -317.84063720703125, "logps/weighted_chosen": -2.56585693359375, "logps/weighted_rejected": -3.4276366233825684, "loss": 0.6673, "rewards/accuracies": 0.5625, "rewards/chosen": -43.44648361206055, "rewards/margins": 13.459375381469727, "rewards/rejected": -56.90898513793945, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -0.310464471578598, "rewards/weighted_margins": 0.32973939180374146, "rewards/weighted_rejected": -0.6404876708984375, "step": 260 }, { "epoch": 0.14132426066474746, "grad_norm": 32.991607666015625, "learning_rate": 9.950574321395277e-07, "logits/chosen": -0.507153332233429, "logits/rejected": -0.5264068841934204, "logps/chosen": -317.0335998535156, "logps/rejected": -300.5718688964844, "logps/weighted_chosen": -2.669018507003784, "logps/weighted_rejected": -2.8846678733825684, "loss": 0.6774, "rewards/accuracies": 0.609375, "rewards/chosen": -27.629491806030273, "rewards/margins": 9.762109756469727, "rewards/rejected": -37.39765548706055, "rewards/weighted_accuracies": 0.6000000238418579, "rewards/weighted_chosen": -0.23316803574562073, "rewards/weighted_margins": 0.2321929931640625, "rewards/weighted_rejected": -0.46527099609375, "step": 270 }, { "epoch": 0.14655849254121958, "grad_norm": 41.87471389770508, "learning_rate": 9.936931663006413e-07, "logits/chosen": -0.5015045404434204, "logits/rejected": -0.49525564908981323, "logps/chosen": -319.9296875, "logps/rejected": -312.28826904296875, "logps/weighted_chosen": -2.667797803878784, "logps/weighted_rejected": -3.121386766433716, "loss": 0.6458, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -17.077342987060547, "rewards/margins": 15.751367568969727, "rewards/rejected": -32.82636642456055, "rewards/weighted_accuracies": 0.703125, "rewards/weighted_chosen": -0.10239715874195099, "rewards/weighted_margins": 0.3067169189453125, "rewards/weighted_rejected": -0.409027099609375, "step": 280 }, { "epoch": 0.1517927244176917, "grad_norm": 22.655261993408203, "learning_rate": 9.921640109546357e-07, "logits/chosen": -0.4936012327671051, "logits/rejected": -0.5640270113945007, "logps/chosen": -286.4117126464844, "logps/rejected": -286.15313720703125, "logps/weighted_chosen": -2.640380859375, "logps/weighted_rejected": -3.774829149246216, "loss": 0.6369, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -19.629491806030273, "rewards/margins": 14.98291015625, "rewards/rejected": -34.61503982543945, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.21005859971046448, "rewards/weighted_margins": 0.3854110836982727, "rewards/weighted_rejected": -0.5955780148506165, "step": 290 }, { "epoch": 0.15702695629416383, "grad_norm": 33.83458709716797, "learning_rate": 9.90470476826975e-07, "logits/chosen": -0.5687958002090454, "logits/rejected": -0.5700439214706421, "logps/chosen": -298.06561279296875, "logps/rejected": -311.8359375, "logps/weighted_chosen": -2.555248975753784, "logps/weighted_rejected": -3.0333251953125, "loss": 0.6326, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -28.754297256469727, "rewards/margins": 15.813085556030273, "rewards/rejected": -44.567771911621094, "rewards/weighted_accuracies": 0.6468750238418579, "rewards/weighted_chosen": -0.25576478242874146, "rewards/weighted_margins": 0.3327178955078125, "rewards/weighted_rejected": -0.5886474847793579, "step": 300 }, { "epoch": 0.16226118817063595, "grad_norm": 29.578954696655273, "learning_rate": 9.886131295443002e-07, "logits/chosen": -0.6218292117118835, "logits/rejected": -0.6808746457099915, "logps/chosen": -313.44061279296875, "logps/rejected": -297.9156188964844, "logps/weighted_chosen": -2.719403028488159, "logps/weighted_rejected": -3.1142334938049316, "loss": 0.6221, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -32.1875, "rewards/margins": 15.571874618530273, "rewards/rejected": -47.7666015625, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.19769592583179474, "rewards/weighted_margins": 0.521374523639679, "rewards/weighted_rejected": -0.7188171148300171, "step": 310 }, { "epoch": 0.16749542004710807, "grad_norm": 15.948070526123047, "learning_rate": 9.865925894455166e-07, "logits/chosen": -0.6636382937431335, "logits/rejected": -0.6817939877510071, "logps/chosen": -336.1507873535156, "logps/rejected": -306.62890625, "logps/weighted_chosen": -2.83941650390625, "logps/weighted_rejected": -3.273486375808716, "loss": 0.7046, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -36.21074295043945, "rewards/margins": 12.9765625, "rewards/rejected": -49.17656326293945, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.3679946959018707, "rewards/weighted_margins": 0.278738409280777, "rewards/weighted_rejected": -0.6464904546737671, "step": 320 }, { "epoch": 0.17272965192358022, "grad_norm": 53.10931396484375, "learning_rate": 9.84409531374603e-07, "logits/chosen": -0.6222923398017883, "logits/rejected": -0.607867419719696, "logps/chosen": -344.19061279296875, "logps/rejected": -315.84686279296875, "logps/weighted_chosen": -2.915234327316284, "logps/weighted_rejected": -3.346630811691284, "loss": 0.6334, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -39.39531326293945, "rewards/margins": 13.606640815734863, "rewards/rejected": -53.00273513793945, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -0.33662718534469604, "rewards/weighted_margins": 0.3510375916957855, "rewards/weighted_rejected": -0.6879855990409851, "step": 330 }, { "epoch": 0.17796388380005235, "grad_norm": 13.994344711303711, "learning_rate": 9.820646844552219e-07, "logits/chosen": -0.6615959405899048, "logits/rejected": -0.714038074016571, "logps/chosen": -312.7328186035156, "logps/rejected": -323.3359375, "logps/weighted_chosen": -2.910736083984375, "logps/weighted_rejected": -3.2298340797424316, "loss": 0.6376, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -36.327537536621094, "rewards/margins": 21.465625762939453, "rewards/rejected": -57.79179763793945, "rewards/weighted_accuracies": 0.675000011920929, "rewards/weighted_chosen": -0.34371185302734375, "rewards/weighted_margins": 0.390716552734375, "rewards/weighted_rejected": -0.734301745891571, "step": 340 }, { "epoch": 0.18319811567652447, "grad_norm": 13.682384490966797, "learning_rate": 9.795588318471964e-07, "logits/chosen": -0.774444580078125, "logits/rejected": -0.7887184023857117, "logps/chosen": -302.76251220703125, "logps/rejected": -336.90313720703125, "logps/weighted_chosen": -2.718432664871216, "logps/weighted_rejected": -3.0622315406799316, "loss": 0.6451, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -43.29413986206055, "rewards/margins": 17.229297637939453, "rewards/rejected": -60.52812576293945, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -0.33645325899124146, "rewards/weighted_margins": 0.33460694551467896, "rewards/weighted_rejected": -0.6709426641464233, "step": 350 }, { "epoch": 0.1884323475529966, "grad_norm": 19.35496711730957, "learning_rate": 9.768928104849415e-07, "logits/chosen": -0.815447986125946, "logits/rejected": -0.8216186761856079, "logps/chosen": -330.66015625, "logps/rejected": -313.8687438964844, "logps/weighted_chosen": -2.970752000808716, "logps/weighted_rejected": -3.1889405250549316, "loss": 0.6763, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -47.90937423706055, "rewards/margins": 17.064844131469727, "rewards/rejected": -64.98515319824219, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.3897460997104645, "rewards/weighted_margins": 0.384918212890625, "rewards/weighted_rejected": -0.7745300531387329, "step": 360 }, { "epoch": 0.19366657942946872, "grad_norm": 21.70383071899414, "learning_rate": 9.740675107979355e-07, "logits/chosen": -0.7939224243164062, "logits/rejected": -0.827624499797821, "logps/chosen": -373.0406188964844, "logps/rejected": -349.3343811035156, "logps/weighted_chosen": -2.506054639816284, "logps/weighted_rejected": -3.43896484375, "loss": 0.6215, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -58.082420349121094, "rewards/margins": 16.594335556030273, "rewards/rejected": -74.671875, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -0.48696595430374146, "rewards/weighted_margins": 0.412750244140625, "rewards/weighted_rejected": -0.90020751953125, "step": 370 }, { "epoch": 0.19890081130594087, "grad_norm": 21.682790756225586, "learning_rate": 9.71083876413323e-07, "logits/chosen": -0.77825927734375, "logits/rejected": -0.7921401858329773, "logps/chosen": -367.0062561035156, "logps/rejected": -355.78594970703125, "logps/weighted_chosen": -2.644775390625, "logps/weighted_rejected": -3.5374755859375, "loss": 0.6609, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -64.59492492675781, "rewards/margins": 21.614843368530273, "rewards/rejected": -86.1988296508789, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.5685027837753296, "rewards/weighted_margins": 0.3586792051792145, "rewards/weighted_rejected": -0.9269439578056335, "step": 380 }, { "epoch": 0.204135043182413, "grad_norm": 30.26679229736328, "learning_rate": 9.67942903840751e-07, "logits/chosen": -0.8401764035224915, "logits/rejected": -0.898754894733429, "logps/chosen": -362.0328063964844, "logps/rejected": -362.3687438964844, "logps/weighted_chosen": -2.8179688453674316, "logps/weighted_rejected": -3.4515624046325684, "loss": 0.5992, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -57.87226486206055, "rewards/margins": 30.196680068969727, "rewards/rejected": -88.0804672241211, "rewards/weighted_accuracies": 0.715624988079071, "rewards/weighted_chosen": -0.456369012594223, "rewards/weighted_margins": 0.495849609375, "rewards/weighted_rejected": -0.951983630657196, "step": 390 }, { "epoch": 0.2093692750588851, "grad_norm": 37.81236267089844, "learning_rate": 9.646456421395447e-07, "logits/chosen": -0.8902359008789062, "logits/rejected": -0.9107666015625, "logps/chosen": -380.6343688964844, "logps/rejected": -399.7281188964844, "logps/weighted_chosen": -2.709228515625, "logps/weighted_rejected": -3.455004930496216, "loss": 0.6557, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -62.43437576293945, "rewards/margins": 33.634765625, "rewards/rejected": -96.0941390991211, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -0.4688262939453125, "rewards/weighted_margins": 0.3546081483364105, "rewards/weighted_rejected": -0.8232513666152954, "step": 400 }, { "epoch": 0.21460350693535724, "grad_norm": 21.761716842651367, "learning_rate": 9.611931925683266e-07, "logits/chosen": -0.849993109703064, "logits/rejected": -0.8886367678642273, "logps/chosen": -368.8374938964844, "logps/rejected": -353.73126220703125, "logps/weighted_chosen": -2.533374071121216, "logps/weighted_rejected": -3.233935594558716, "loss": 0.591, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -62.575782775878906, "rewards/margins": 27.423242568969727, "rewards/rejected": -90.0132827758789, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.4288879334926605, "rewards/weighted_margins": 0.507708728313446, "rewards/weighted_rejected": -0.9364013671875, "step": 410 }, { "epoch": 0.21983773881182936, "grad_norm": 28.842378616333008, "learning_rate": 9.575867082172085e-07, "logits/chosen": -0.836225152015686, "logits/rejected": -0.8854034543037415, "logps/chosen": -371.90625, "logps/rejected": -369.4125061035156, "logps/weighted_chosen": -2.9190917015075684, "logps/weighted_rejected": -3.118579149246216, "loss": 0.6198, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -66.98905944824219, "rewards/margins": 32.337890625, "rewards/rejected": -99.3402328491211, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.4911743104457855, "rewards/weighted_margins": 0.5335143804550171, "rewards/weighted_rejected": -1.0242736339569092, "step": 420 }, { "epoch": 0.22507197068830148, "grad_norm": 16.805343627929688, "learning_rate": 9.538273936226673e-07, "logits/chosen": -0.8646026849746704, "logits/rejected": -0.898516833782196, "logps/chosen": -328.94921875, "logps/rejected": -353.66717529296875, "logps/weighted_chosen": -3.1983399391174316, "logps/weighted_rejected": -3.6509766578674316, "loss": 0.6173, "rewards/accuracies": 0.625, "rewards/chosen": -63.52910232543945, "rewards/margins": 25.6201171875, "rewards/rejected": -89.1246109008789, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -0.436279296875, "rewards/weighted_margins": 0.438516229391098, "rewards/weighted_rejected": -0.8747711181640625, "step": 430 }, { "epoch": 0.23030620256477363, "grad_norm": 23.78999137878418, "learning_rate": 9.499165043652391e-07, "logits/chosen": -0.8759521245956421, "logits/rejected": -0.8865814208984375, "logps/chosen": -358.69140625, "logps/rejected": -358.76251220703125, "logps/weighted_chosen": -3.2625732421875, "logps/weighted_rejected": -3.359301805496216, "loss": 0.6366, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -67.4326171875, "rewards/margins": 24.610157012939453, "rewards/rejected": -92.03788757324219, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -0.5896576046943665, "rewards/weighted_margins": 0.34770506620407104, "rewards/weighted_rejected": -0.9376128911972046, "step": 440 }, { "epoch": 0.23554043444124576, "grad_norm": 49.646263122558594, "learning_rate": 9.458553466501665e-07, "logits/chosen": -0.9158996343612671, "logits/rejected": -0.9590820074081421, "logps/chosen": -355.3109436035156, "logps/rejected": -341.9765625, "logps/weighted_chosen": -3.231823682785034, "logps/weighted_rejected": -3.4568848609924316, "loss": 0.657, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -69.14453125, "rewards/margins": 28.884180068969727, "rewards/rejected": -98.0492172241211, "rewards/weighted_accuracies": 0.6875, "rewards/weighted_chosen": -0.6470184326171875, "rewards/weighted_margins": 0.4533935487270355, "rewards/weighted_rejected": -1.100683569908142, "step": 450 }, { "epoch": 0.24077466631771788, "grad_norm": 38.45325469970703, "learning_rate": 9.416452768711366e-07, "logits/chosen": -0.9281005859375, "logits/rejected": -0.9545837640762329, "logps/chosen": -375.6390686035156, "logps/rejected": -370.203125, "logps/weighted_chosen": -2.9860596656799316, "logps/weighted_rejected": -3.635498046875, "loss": 0.6457, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -83.3871078491211, "rewards/margins": 26.872264862060547, "rewards/rejected": -110.26094055175781, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -0.6367782354354858, "rewards/weighted_margins": 0.515301525592804, "rewards/weighted_rejected": -1.152197241783142, "step": 460 }, { "epoch": 0.24600889819419, "grad_norm": 59.67642593383789, "learning_rate": 9.372877011572557e-07, "logits/chosen": -0.8797394037246704, "logits/rejected": -0.904949963092804, "logps/chosen": -398.8531188964844, "logps/rejected": -388.22186279296875, "logps/weighted_chosen": -3.0103516578674316, "logps/weighted_rejected": -3.409374952316284, "loss": 0.6181, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -88.2437515258789, "rewards/margins": 25.542577743530273, "rewards/rejected": -113.7890625, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -0.6517273187637329, "rewards/weighted_margins": 0.482818603515625, "rewards/weighted_rejected": -1.1340011358261108, "step": 470 }, { "epoch": 0.2512431300706621, "grad_norm": 27.465656280517578, "learning_rate": 9.327840749034141e-07, "logits/chosen": -0.910723865032196, "logits/rejected": -0.9439147710800171, "logps/chosen": -364.4750061035156, "logps/rejected": -392.03436279296875, "logps/weighted_chosen": -2.8880372047424316, "logps/weighted_rejected": -4.069311618804932, "loss": 0.6276, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -77.44355773925781, "rewards/margins": 38.147850036621094, "rewards/rejected": -115.59648132324219, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -0.5768798589706421, "rewards/weighted_margins": 0.541027843952179, "rewards/weighted_rejected": -1.1178162097930908, "step": 480 }, { "epoch": 0.2564773619471343, "grad_norm": 34.35914993286133, "learning_rate": 9.281359022841965e-07, "logits/chosen": -0.80584716796875, "logits/rejected": -0.8291991949081421, "logps/chosen": -349.86798095703125, "logps/rejected": -354.75701904296875, "logps/weighted_chosen": -2.914257764816284, "logps/weighted_rejected": -4.178680419921875, "loss": 0.63, "rewards/accuracies": 0.625, "rewards/chosen": -74.94511413574219, "rewards/margins": 34.957618713378906, "rewards/rejected": -109.90644836425781, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -0.564099133014679, "rewards/weighted_margins": 0.658795177936554, "rewards/weighted_rejected": -1.2229950428009033, "step": 490 }, { "epoch": 0.26171159382360637, "grad_norm": 41.01088333129883, "learning_rate": 9.233447357514989e-07, "logits/chosen": -0.775347888469696, "logits/rejected": -0.8226684331893921, "logps/chosen": -369.1148376464844, "logps/rejected": -373.8374938964844, "logps/weighted_chosen": -3.288012742996216, "logps/weighted_rejected": -3.7696776390075684, "loss": 0.6225, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -71.1283187866211, "rewards/margins": 32.867576599121094, "rewards/rejected": -104.0218734741211, "rewards/weighted_accuracies": 0.6937500238418579, "rewards/weighted_chosen": -0.5540069341659546, "rewards/weighted_margins": 0.598828136920929, "rewards/weighted_rejected": -1.1535155773162842, "step": 500 }, { "epoch": 0.26171159382360637, "eval_logits/chosen": -0.8767929673194885, "eval_logits/rejected": -0.8968105316162109, "eval_logps/chosen": -369.21099853515625, "eval_logps/rejected": -379.3609924316406, "eval_logps/weighted_chosen": -2.957455635070801, "eval_logps/weighted_rejected": -3.6658689975738525, "eval_loss": 0.6115986108779907, "eval_rewards/accuracies": 0.6294999718666077, "eval_rewards/chosen": -80.43468475341797, "eval_rewards/margins": 31.297624588012695, "eval_rewards/rejected": -111.73987579345703, "eval_rewards/weighted_accuracies": 0.6759999990463257, "eval_rewards/weighted_chosen": -0.5706117153167725, "eval_rewards/weighted_margins": 0.5400464534759521, "eval_rewards/weighted_rejected": -1.1106581687927246, "eval_runtime": 1361.4222, "eval_samples_per_second": 1.469, "eval_steps_per_second": 0.367, "step": 500 } ], "logging_steps": 10, "max_steps": 1911, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }