{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.535833891493637, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005358338914936369, "grad_norm": 29.978843688964844, "learning_rate": 0.0, "logits/chosen": -2.52783203125, "logits/rejected": -2.4970703125, "logps/chosen": -277.75, "logps/rejected": -223.0625, "logps/weighted_chosen": -0.4293212890625, "logps/weighted_rejected": -0.3968505859375, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.0053583389149363695, "grad_norm": 28.513858795166016, "learning_rate": 4.81283422459893e-08, "logits/chosen": -2.42626953125, "logits/rejected": -2.397569417953491, "logps/chosen": -195.53038024902344, "logps/rejected": -195.91839599609375, "logps/weighted_chosen": -0.4505276083946228, "logps/weighted_rejected": -0.4737955629825592, "loss": 0.6926, "rewards/accuracies": 0.2743055522441864, "rewards/chosen": 0.0321180559694767, "rewards/margins": -0.0375434011220932, "rewards/rejected": 0.0696614608168602, "rewards/weighted_accuracies": 0.3402777910232544, "rewards/weighted_chosen": 5.679660534951836e-05, "rewards/weighted_margins": 3.984239447163418e-05, "rewards/weighted_rejected": 1.695421087788418e-05, "step": 10 }, { "epoch": 0.010716677829872739, "grad_norm": 20.803266525268555, "learning_rate": 1.0160427807486631e-07, "logits/chosen": -2.4007811546325684, "logits/rejected": -2.3998780250549316, "logps/chosen": -199.92343139648438, "logps/rejected": -202.2101593017578, "logps/weighted_chosen": -0.4370788633823395, "logps/weighted_rejected": -0.464111328125, "loss": 0.6927, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.02207031287252903, "rewards/margins": 0.04130859300494194, "rewards/rejected": -0.06337890774011612, "rewards/weighted_accuracies": 0.36250001192092896, "rewards/weighted_chosen": 0.0006473541143350303, "rewards/weighted_margins": 7.362365431617945e-05, "rewards/weighted_rejected": 0.0005735397571697831, "step": 20 }, { "epoch": 0.016075016744809108, "grad_norm": 33.961883544921875, "learning_rate": 1.5508021390374333e-07, "logits/chosen": -2.32586669921875, "logits/rejected": -2.300219774246216, "logps/chosen": -198.45858764648438, "logps/rejected": -212.0234375, "logps/weighted_chosen": -0.45709228515625, "logps/weighted_rejected": -0.45776671171188354, "loss": 0.6925, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.1083984375, "rewards/margins": 0.1162109375, "rewards/rejected": -0.0078125, "rewards/weighted_accuracies": 0.4312500059604645, "rewards/weighted_chosen": 0.0002254486025776714, "rewards/weighted_margins": 0.0002849578741006553, "rewards/weighted_rejected": -5.970001075183973e-05, "step": 30 }, { "epoch": 0.021433355659745478, "grad_norm": 44.94234848022461, "learning_rate": 2.085561497326203e-07, "logits/chosen": -2.3628907203674316, "logits/rejected": -2.3779540061950684, "logps/chosen": -192.63125610351562, "logps/rejected": -207.24765014648438, "logps/weighted_chosen": -0.45967406034469604, "logps/weighted_rejected": -0.4842773377895355, "loss": 0.6926, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.01308593712747097, "rewards/margins": 0.04453124850988388, "rewards/rejected": -0.0576171875, "rewards/weighted_accuracies": 0.40937501192092896, "rewards/weighted_chosen": 0.0007827758672647178, "rewards/weighted_margins": 0.00011138916306663305, "rewards/weighted_rejected": 0.00067138671875, "step": 40 }, { "epoch": 0.02679169457468185, "grad_norm": 58.061580657958984, "learning_rate": 2.620320855614973e-07, "logits/chosen": -2.402636766433716, "logits/rejected": -2.3846192359924316, "logps/chosen": -213.23672485351562, "logps/rejected": -228.9015655517578, "logps/weighted_chosen": -0.4499572813510895, "logps/weighted_rejected": -0.4646240174770355, "loss": 0.6926, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.09589843451976776, "rewards/margins": 0.0673828125, "rewards/rejected": -0.16328124701976776, "rewards/weighted_accuracies": 0.44999998807907104, "rewards/weighted_chosen": 0.0027015686500817537, "rewards/weighted_margins": 0.0005447387811727822, "rewards/weighted_rejected": 0.0021568299271166325, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 30.78350830078125, "learning_rate": 3.155080213903743e-07, "logits/chosen": -2.349609375, "logits/rejected": -2.3248534202575684, "logps/chosen": -211.83438110351562, "logps/rejected": -218.11563110351562, "logps/weighted_chosen": -0.44114989042282104, "logps/weighted_rejected": -0.4686218202114105, "loss": 0.6918, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0025390624068677425, "rewards/margins": 0.06503906100988388, "rewards/rejected": -0.06757812201976776, "rewards/weighted_accuracies": 0.48750001192092896, "rewards/weighted_chosen": 0.005175781436264515, "rewards/weighted_margins": 0.0019653320778161287, "rewards/weighted_rejected": 0.0032104491256177425, "step": 60 }, { "epoch": 0.03750837240455459, "grad_norm": 23.173866271972656, "learning_rate": 3.689839572192513e-07, "logits/chosen": -2.458251953125, "logits/rejected": -2.463671922683716, "logps/chosen": -194.55624389648438, "logps/rejected": -200.5437469482422, "logps/weighted_chosen": -0.4099182188510895, "logps/weighted_rejected": -0.4346679747104645, "loss": 0.6923, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.16562500596046448, "rewards/margins": 0.203125, "rewards/rejected": -0.3687500059604645, "rewards/weighted_accuracies": 0.503125011920929, "rewards/weighted_chosen": 0.0095977783203125, "rewards/weighted_margins": 0.0012573242420330644, "rewards/weighted_rejected": 0.00834045372903347, "step": 70 }, { "epoch": 0.042866711319490956, "grad_norm": 27.169227600097656, "learning_rate": 4.2245989304812833e-07, "logits/chosen": -2.44384765625, "logits/rejected": -2.4704346656799316, "logps/chosen": -219.31405639648438, "logps/rejected": -240.16641235351562, "logps/weighted_chosen": -0.4397827088832855, "logps/weighted_rejected": -0.46443480253219604, "loss": 0.6915, "rewards/accuracies": 0.4375, "rewards/chosen": 0.02402343787252903, "rewards/margins": 0.2919921875, "rewards/rejected": -0.2679687440395355, "rewards/weighted_accuracies": 0.47187501192092896, "rewards/weighted_chosen": 0.012739372439682484, "rewards/weighted_margins": 0.003216362092643976, "rewards/weighted_rejected": 0.00952300988137722, "step": 80 }, { "epoch": 0.04822505023442733, "grad_norm": 24.797061920166016, "learning_rate": 4.7593582887700533e-07, "logits/chosen": -2.46044921875, "logits/rejected": -2.4736571311950684, "logps/chosen": -195.90390014648438, "logps/rejected": -208.1320343017578, "logps/weighted_chosen": -0.42793577909469604, "logps/weighted_rejected": -0.447784423828125, "loss": 0.6922, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.599609375, "rewards/margins": 0.3626953065395355, "rewards/rejected": -0.9623047113418579, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.00842132605612278, "rewards/weighted_margins": 0.0021730423904955387, "rewards/weighted_rejected": 0.00624923687428236, "step": 90 }, { "epoch": 0.0535833891493637, "grad_norm": 13.923910140991211, "learning_rate": 5.294117647058823e-07, "logits/chosen": -2.4405760765075684, "logits/rejected": -2.425537109375, "logps/chosen": -224.88516235351562, "logps/rejected": -229.6984405517578, "logps/weighted_chosen": -0.4417480528354645, "logps/weighted_rejected": -0.454376220703125, "loss": 0.6949, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.443750023841858, "rewards/margins": 0.4691406190395355, "rewards/rejected": -1.9128906726837158, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": 0.001766204833984375, "rewards/weighted_margins": -0.0024765015114098787, "rewards/weighted_rejected": 0.0042396546341478825, "step": 100 }, { "epoch": 0.058941728064300064, "grad_norm": 43.69069290161133, "learning_rate": 5.828877005347593e-07, "logits/chosen": -2.4117431640625, "logits/rejected": -2.413012742996216, "logps/chosen": -217.89453125, "logps/rejected": -225.7703094482422, "logps/weighted_chosen": -0.45955199003219604, "logps/weighted_rejected": -0.4866271913051605, "loss": 0.6916, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -2.8353514671325684, "rewards/margins": 0.3775390684604645, "rewards/rejected": -3.212890625, "rewards/weighted_accuracies": 0.5093749761581421, "rewards/weighted_chosen": 0.0031097412575036287, "rewards/weighted_margins": 0.004855346865952015, "rewards/weighted_rejected": -0.0017486572032794356, "step": 110 }, { "epoch": 0.06430006697923643, "grad_norm": 24.36446762084961, "learning_rate": 6.363636363636363e-07, "logits/chosen": -2.4598631858825684, "logits/rejected": -2.4720215797424316, "logps/chosen": -188.3484344482422, "logps/rejected": -226.6437530517578, "logps/weighted_chosen": -0.4706787168979645, "logps/weighted_rejected": -0.551287829875946, "loss": 0.6893, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -4.0625, "rewards/margins": 0.947460949420929, "rewards/rejected": -5.010156154632568, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.018326569348573685, "rewards/weighted_margins": 0.010852813720703125, "rewards/weighted_rejected": -0.02917938306927681, "step": 120 }, { "epoch": 0.06965840589417281, "grad_norm": 35.46619415283203, "learning_rate": 6.898395721925134e-07, "logits/chosen": -2.566601514816284, "logits/rejected": -2.5640625953674316, "logps/chosen": -233.8390655517578, "logps/rejected": -243.31875610351562, "logps/weighted_chosen": -0.46611326932907104, "logps/weighted_rejected": -0.48984986543655396, "loss": 0.7012, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.547656059265137, "rewards/margins": 1.3078124523162842, "rewards/rejected": -6.855859279632568, "rewards/weighted_accuracies": 0.49687498807907104, "rewards/weighted_chosen": -0.01891174353659153, "rewards/weighted_margins": -0.0069145201705396175, "rewards/weighted_rejected": -0.012065887451171875, "step": 130 }, { "epoch": 0.07501674480910918, "grad_norm": 18.125104904174805, "learning_rate": 7.433155080213903e-07, "logits/chosen": -2.602294921875, "logits/rejected": -2.599804639816284, "logps/chosen": -202.98281860351562, "logps/rejected": -215.4796905517578, "logps/weighted_chosen": -0.4594665467739105, "logps/weighted_rejected": -0.5065857172012329, "loss": 0.6858, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -8.157422065734863, "rewards/margins": 2.392578125, "rewards/rejected": -10.550000190734863, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.02419433556497097, "rewards/weighted_margins": 0.01836395263671875, "rewards/weighted_rejected": -0.04254303127527237, "step": 140 }, { "epoch": 0.08037508372404555, "grad_norm": 44.02063751220703, "learning_rate": 7.967914438502673e-07, "logits/chosen": -2.6329102516174316, "logits/rejected": -2.630322217941284, "logps/chosen": -221.5109405517578, "logps/rejected": -245.00155639648438, "logps/weighted_chosen": -0.522930920124054, "logps/weighted_rejected": -0.535870373249054, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": -11.2197265625, "rewards/margins": 2.375, "rewards/rejected": -13.596288681030273, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": -0.07070770114660263, "rewards/weighted_margins": 0.00907135009765625, "rewards/weighted_rejected": -0.07977227866649628, "step": 150 }, { "epoch": 0.08573342263898191, "grad_norm": 30.984752655029297, "learning_rate": 8.502673796791443e-07, "logits/chosen": -2.552539110183716, "logits/rejected": -2.5354247093200684, "logps/chosen": -207.53280639648438, "logps/rejected": -236.8117218017578, "logps/weighted_chosen": -0.5689147710800171, "logps/weighted_rejected": -0.5829712152481079, "loss": 0.6936, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -13.640625, "rewards/margins": 1.2478516101837158, "rewards/rejected": -14.890039443969727, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.11262588202953339, "rewards/weighted_margins": 0.01301422156393528, "rewards/weighted_rejected": -0.12571564316749573, "step": 160 }, { "epoch": 0.09109176155391828, "grad_norm": 29.657917022705078, "learning_rate": 9.037433155080213e-07, "logits/chosen": -2.5386719703674316, "logits/rejected": -2.550463914871216, "logps/chosen": -213.2921905517578, "logps/rejected": -224.1906280517578, "logps/weighted_chosen": -0.561798095703125, "logps/weighted_rejected": -0.5876098871231079, "loss": 0.6873, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -15.824999809265137, "rewards/margins": 3.636914014816284, "rewards/rejected": -19.457422256469727, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.11036185920238495, "rewards/weighted_margins": 0.027300262823700905, "rewards/weighted_rejected": -0.13774414360523224, "step": 170 }, { "epoch": 0.09645010046885466, "grad_norm": 16.24775505065918, "learning_rate": 9.572192513368984e-07, "logits/chosen": -2.5972657203674316, "logits/rejected": -2.5906739234924316, "logps/chosen": -229.1453094482422, "logps/rejected": -249.83749389648438, "logps/weighted_chosen": -0.607281506061554, "logps/weighted_rejected": -0.6552734375, "loss": 0.688, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -19.492773056030273, "rewards/margins": 5.0849609375, "rewards/rejected": -24.581249237060547, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.16547851264476776, "rewards/weighted_margins": 0.02171630784869194, "rewards/weighted_rejected": -0.1871211975812912, "step": 180 }, { "epoch": 0.10180843938379103, "grad_norm": 22.05425262451172, "learning_rate": 9.999965031204306e-07, "logits/chosen": -2.589916944503784, "logits/rejected": -2.578906297683716, "logps/chosen": -224.77499389648438, "logps/rejected": -224.9296875, "logps/weighted_chosen": -0.6605224609375, "logps/weighted_rejected": -0.7260376214981079, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.936328887939453, "rewards/margins": 2.9306640625, "rewards/rejected": -26.86328125, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": -0.21764373779296875, "rewards/weighted_margins": 0.04151611402630806, "rewards/weighted_rejected": -0.25913238525390625, "step": 190 }, { "epoch": 0.1071667782987274, "grad_norm": 23.82738494873047, "learning_rate": 9.998741174712533e-07, "logits/chosen": -2.669238328933716, "logits/rejected": -2.6612305641174316, "logps/chosen": -255.4250030517578, "logps/rejected": -270.40313720703125, "logps/weighted_chosen": -0.611083984375, "logps/weighted_rejected": -0.6286376714706421, "loss": 0.7003, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -27.870702743530273, "rewards/margins": 2.9964842796325684, "rewards/rejected": -30.857812881469727, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": -0.15414123237133026, "rewards/weighted_margins": 0.0028816224075853825, "rewards/weighted_rejected": -0.15706482529640198, "step": 200 }, { "epoch": 0.11252511721366376, "grad_norm": 28.775007247924805, "learning_rate": 9.995769367531952e-07, "logits/chosen": -2.7216796875, "logits/rejected": -2.700000047683716, "logps/chosen": -248.61093139648438, "logps/rejected": -247.4812469482422, "logps/weighted_chosen": -0.5626465082168579, "logps/weighted_rejected": -0.5739685297012329, "loss": 0.7034, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -30.315235137939453, "rewards/margins": 1.091210961341858, "rewards/rejected": -31.409765243530273, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.14006805419921875, "rewards/weighted_margins": -0.0037292479537427425, "rewards/weighted_rejected": -0.13644561171531677, "step": 210 }, { "epoch": 0.11788345612860013, "grad_norm": 32.326786041259766, "learning_rate": 9.991050648838675e-07, "logits/chosen": -2.8436036109924316, "logits/rejected": -2.8392577171325684, "logps/chosen": -242.8984375, "logps/rejected": -257.7124938964844, "logps/weighted_chosen": -0.616424560546875, "logps/weighted_rejected": -0.65435791015625, "loss": 0.6882, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -38.271095275878906, "rewards/margins": 1.724218726158142, "rewards/rejected": -39.98163986206055, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": -0.15750885009765625, "rewards/weighted_margins": 0.02539825439453125, "rewards/weighted_rejected": -0.18288879096508026, "step": 220 }, { "epoch": 0.12324179504353651, "grad_norm": 19.367631912231445, "learning_rate": 9.98458666866564e-07, "logits/chosen": -2.869873046875, "logits/rejected": -2.870898485183716, "logps/chosen": -243.66561889648438, "logps/rejected": -263.89373779296875, "logps/weighted_chosen": -0.7010498046875, "logps/weighted_rejected": -0.750701904296875, "loss": 0.6841, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -47.01171875, "rewards/margins": 7.957421779632568, "rewards/rejected": -54.978515625, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.24614867568016052, "rewards/weighted_margins": 0.04576721042394638, "rewards/weighted_rejected": -0.2919418215751648, "step": 230 }, { "epoch": 0.12860013395847286, "grad_norm": 13.939057350158691, "learning_rate": 9.97637968732563e-07, "logits/chosen": -2.9676756858825684, "logits/rejected": -2.950244188308716, "logps/chosen": -248.04843139648438, "logps/rejected": -263.4750061035156, "logps/weighted_chosen": -0.6573241949081421, "logps/weighted_rejected": -0.648510754108429, "loss": 0.7232, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -43.94921875, "rewards/margins": 5.845312595367432, "rewards/rejected": -49.795310974121094, "rewards/weighted_accuracies": 0.512499988079071, "rewards/weighted_chosen": -0.22567901015281677, "rewards/weighted_margins": -0.026827240362763405, "rewards/weighted_rejected": -0.19892653822898865, "step": 240 }, { "epoch": 0.13395847287340926, "grad_norm": 12.663073539733887, "learning_rate": 9.966432574620906e-07, "logits/chosen": -3.0162110328674316, "logits/rejected": -3.0179200172424316, "logps/chosen": -242.93905639648438, "logps/rejected": -270.6875, "logps/weighted_chosen": -0.628466784954071, "logps/weighted_rejected": -0.67413330078125, "loss": 0.696, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -48.034568786621094, "rewards/margins": 3.3388671875, "rewards/rejected": -51.372657775878906, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.21130982041358948, "rewards/weighted_margins": 0.009045409969985485, "rewards/weighted_rejected": -0.22044983506202698, "step": 250 }, { "epoch": 0.13931681178834562, "grad_norm": 14.56610107421875, "learning_rate": 9.954748808839674e-07, "logits/chosen": -2.9679198265075684, "logits/rejected": -2.976855516433716, "logps/chosen": -277.53436279296875, "logps/rejected": -275.7718811035156, "logps/weighted_chosen": -0.63873291015625, "logps/weighted_rejected": -0.6825927495956421, "loss": 0.6844, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -50.900779724121094, "rewards/margins": 4.017773628234863, "rewards/rejected": -54.93476486206055, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.18876799941062927, "rewards/weighted_margins": 0.03838195651769638, "rewards/weighted_rejected": -0.2271774262189865, "step": 260 }, { "epoch": 0.144675150703282, "grad_norm": 12.570405006408691, "learning_rate": 9.941332475539824e-07, "logits/chosen": -3.037890672683716, "logits/rejected": -3.046337842941284, "logps/chosen": -263.0531311035156, "logps/rejected": -299.34375, "logps/weighted_chosen": -0.689990222454071, "logps/weighted_rejected": -0.7867187261581421, "loss": 0.6742, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -52.885154724121094, "rewards/margins": 9.96875, "rewards/rejected": -62.86054611206055, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.21307983994483948, "rewards/weighted_margins": 0.06503143161535263, "rewards/weighted_rejected": -0.2780609130859375, "step": 270 }, { "epoch": 0.15003348961821836, "grad_norm": 17.126554489135742, "learning_rate": 9.926188266120295e-07, "logits/chosen": -3.169921875, "logits/rejected": -3.1683592796325684, "logps/chosen": -246.58438110351562, "logps/rejected": -262.4078063964844, "logps/weighted_chosen": -0.681854248046875, "logps/weighted_rejected": -0.746777355670929, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": -50.88359451293945, "rewards/margins": 8.079882621765137, "rewards/rejected": -58.97187423706055, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": -0.2380828857421875, "rewards/weighted_margins": 0.05399169772863388, "rewards/weighted_rejected": -0.291983038187027, "step": 280 }, { "epoch": 0.15539182853315472, "grad_norm": 17.43288230895996, "learning_rate": 9.909321476180591e-07, "logits/chosen": -3.285205125808716, "logits/rejected": -3.2626953125, "logps/chosen": -269.65155029296875, "logps/rejected": -298.9703063964844, "logps/weighted_chosen": -0.7649291753768921, "logps/weighted_rejected": -0.828442394733429, "loss": 0.6972, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -66.34883117675781, "rewards/margins": 6.403515815734863, "rewards/rejected": -72.75663757324219, "rewards/weighted_accuracies": 0.5093749761581421, "rewards/weighted_chosen": -0.307464599609375, "rewards/weighted_margins": 0.0262603759765625, "rewards/weighted_rejected": -0.333578497171402, "step": 290 }, { "epoch": 0.1607501674480911, "grad_norm": 16.17060089111328, "learning_rate": 9.890738003669027e-07, "logits/chosen": -3.3641114234924316, "logits/rejected": -3.360107421875, "logps/chosen": -260.75, "logps/rejected": -282.5093688964844, "logps/weighted_chosen": -0.743237316608429, "logps/weighted_rejected": -0.8062988519668579, "loss": 0.6858, "rewards/accuracies": 0.609375, "rewards/chosen": -65.58906555175781, "rewards/margins": 8.375, "rewards/rejected": -73.9820327758789, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.294107049703598, "rewards/weighted_margins": 0.04635162279009819, "rewards/weighted_rejected": -0.34059447050094604, "step": 300 }, { "epoch": 0.16610850636302746, "grad_norm": 19.679088592529297, "learning_rate": 9.870444346820348e-07, "logits/chosen": -3.3017578125, "logits/rejected": -3.2752928733825684, "logps/chosen": -277.9906311035156, "logps/rejected": -299.7265625, "logps/weighted_chosen": -0.76312255859375, "logps/weighted_rejected": -0.793139636516571, "loss": 0.6994, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -64.0882797241211, "rewards/margins": 10.8203125, "rewards/rejected": -74.90898132324219, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.310202032327652, "rewards/weighted_margins": 0.007830810733139515, "rewards/weighted_rejected": -0.3179565370082855, "step": 310 }, { "epoch": 0.17146684527796383, "grad_norm": 18.09251594543457, "learning_rate": 9.848447601883433e-07, "logits/chosen": -3.251171827316284, "logits/rejected": -3.2337403297424316, "logps/chosen": -255.09530639648438, "logps/rejected": -293.9375, "logps/weighted_chosen": -0.7666381597518921, "logps/weighted_rejected": -0.833050549030304, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -58.306251525878906, "rewards/margins": 15.045312881469727, "rewards/rejected": -73.3515625, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.332742303609848, "rewards/weighted_margins": 0.03985900804400444, "rewards/weighted_rejected": -0.37254637479782104, "step": 320 }, { "epoch": 0.1768251841929002, "grad_norm": 25.393939971923828, "learning_rate": 9.824755460639899e-07, "logits/chosen": -3.203418016433716, "logits/rejected": -3.19873046875, "logps/chosen": -261.8515625, "logps/rejected": -305.0328063964844, "logps/weighted_chosen": -0.838146984577179, "logps/weighted_rejected": -0.897717297077179, "loss": 0.6862, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -70.07734680175781, "rewards/margins": 16.4833984375, "rewards/rejected": -86.54609680175781, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.4094604551792145, "rewards/weighted_margins": 0.05315857008099556, "rewards/weighted_rejected": -0.4626617431640625, "step": 330 }, { "epoch": 0.18218352310783656, "grad_norm": 15.981916427612305, "learning_rate": 9.799376207714444e-07, "logits/chosen": -3.241992235183716, "logits/rejected": -3.2372069358825684, "logps/chosen": -283.8984375, "logps/rejected": -301.45782470703125, "logps/weighted_chosen": -0.940869152545929, "logps/weighted_rejected": -1.0183837413787842, "loss": 0.6807, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -82.82109069824219, "rewards/margins": 8.819140434265137, "rewards/rejected": -91.63594055175781, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.47734373807907104, "rewards/weighted_margins": 0.06443405151367188, "rewards/weighted_rejected": -0.5418640375137329, "step": 340 }, { "epoch": 0.18754186202277295, "grad_norm": 11.714133262634277, "learning_rate": 9.772318717677903e-07, "logits/chosen": -3.29248046875, "logits/rejected": -3.2901368141174316, "logps/chosen": -281.28125, "logps/rejected": -298.8671875, "logps/weighted_chosen": -0.8809448480606079, "logps/weighted_rejected": -0.9219726324081421, "loss": 0.6935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -81.07421875, "rewards/margins": 11.681249618530273, "rewards/rejected": -92.7164077758789, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": -0.4380859434604645, "rewards/weighted_margins": 0.02567749097943306, "rewards/weighted_rejected": -0.46390992403030396, "step": 350 }, { "epoch": 0.19290020093770932, "grad_norm": 17.111141204833984, "learning_rate": 9.743592451943998e-07, "logits/chosen": -3.371386766433716, "logits/rejected": -3.3677735328674316, "logps/chosen": -301.2203063964844, "logps/rejected": -326.7718811035156, "logps/weighted_chosen": -0.884228527545929, "logps/weighted_rejected": -0.8854614496231079, "loss": 0.6983, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -89.6175765991211, "rewards/margins": 10.710546493530273, "rewards/rejected": -100.3335952758789, "rewards/weighted_accuracies": 0.5562499761581421, "rewards/weighted_chosen": -0.4132934510707855, "rewards/weighted_margins": 0.013682556338608265, "rewards/weighted_rejected": -0.427154541015625, "step": 360 }, { "epoch": 0.1982585398526457, "grad_norm": 13.4055814743042, "learning_rate": 9.713207455460892e-07, "logits/chosen": -3.321484327316284, "logits/rejected": -3.3426756858825684, "logps/chosen": -296.109375, "logps/rejected": -313.0843811035156, "logps/weighted_chosen": -0.8559204339981079, "logps/weighted_rejected": -0.9296630620956421, "loss": 0.6728, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -90.3265609741211, "rewards/margins": 7.080859184265137, "rewards/rejected": -97.3773422241211, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -0.365234375, "rewards/weighted_margins": 0.06043701246380806, "rewards/weighted_rejected": -0.4256530702114105, "step": 370 }, { "epoch": 0.20361687876758205, "grad_norm": 12.659865379333496, "learning_rate": 9.681174353198686e-07, "logits/chosen": -3.418164014816284, "logits/rejected": -3.4073243141174316, "logps/chosen": -303.66717529296875, "logps/rejected": -334.4281311035156, "logps/weighted_chosen": -0.7992187738418579, "logps/weighted_rejected": -0.8946533203125, "loss": 0.6715, "rewards/accuracies": 0.59375, "rewards/chosen": -92.99609375, "rewards/margins": 13.2685546875, "rewards/rejected": -106.234375, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.362588495016098, "rewards/weighted_margins": 0.07253112643957138, "rewards/weighted_rejected": -0.4350738525390625, "step": 380 }, { "epoch": 0.20897521768251842, "grad_norm": 33.69499969482422, "learning_rate": 9.647504346434103e-07, "logits/chosen": -3.559375047683716, "logits/rejected": -3.565234422683716, "logps/chosen": -328.44219970703125, "logps/rejected": -352.2484436035156, "logps/weighted_chosen": -0.8904174566268921, "logps/weighted_rejected": -0.944140613079071, "loss": 0.688, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -107.42109680175781, "rewards/margins": 25.632421493530273, "rewards/rejected": -133.03515625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.430419921875, "rewards/weighted_margins": 0.04887695237994194, "rewards/weighted_rejected": -0.47917479276657104, "step": 390 }, { "epoch": 0.2143335565974548, "grad_norm": 21.384435653686523, "learning_rate": 9.612209208833646e-07, "logits/chosen": -3.5692381858825684, "logits/rejected": -3.5658202171325684, "logps/chosen": -310.74530029296875, "logps/rejected": -319.8671875, "logps/weighted_chosen": -0.885510265827179, "logps/weighted_rejected": -0.952441394329071, "loss": 0.6817, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -110.94140625, "rewards/margins": 7.463281154632568, "rewards/rejected": -118.3929672241211, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.43572998046875, "rewards/weighted_margins": 0.05572357028722763, "rewards/weighted_rejected": -0.4914703369140625, "step": 400 }, { "epoch": 0.21969189551239116, "grad_norm": 41.982784271240234, "learning_rate": 9.5753012823366e-07, "logits/chosen": -3.6553711891174316, "logits/rejected": -3.646191358566284, "logps/chosen": -335.3968811035156, "logps/rejected": -370.7281188964844, "logps/weighted_chosen": -0.857287585735321, "logps/weighted_rejected": -0.8953613042831421, "loss": 0.6959, "rewards/accuracies": 0.609375, "rewards/chosen": -123.1953125, "rewards/margins": 24.586328506469727, "rewards/rejected": -147.77499389648438, "rewards/weighted_accuracies": 0.5874999761581421, "rewards/weighted_chosen": -0.421051025390625, "rewards/weighted_margins": 0.0330963134765625, "rewards/weighted_rejected": -0.4541381895542145, "step": 410 }, { "epoch": 0.22505023442732752, "grad_norm": 30.137876510620117, "learning_rate": 9.536793472839324e-07, "logits/chosen": -3.714648485183716, "logits/rejected": -3.7328124046325684, "logps/chosen": -386.20001220703125, "logps/rejected": -426.3656311035156, "logps/weighted_chosen": -0.8975464105606079, "logps/weighted_rejected": -0.970263659954071, "loss": 0.6699, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -155.86874389648438, "rewards/margins": 28.117578506469727, "rewards/rejected": -183.94296264648438, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.43191832304000854, "rewards/weighted_margins": 0.07274474948644638, "rewards/weighted_rejected": -0.5046783685684204, "step": 420 }, { "epoch": 0.2304085733422639, "grad_norm": 27.828502655029297, "learning_rate": 9.496699245682351e-07, "logits/chosen": -3.798535108566284, "logits/rejected": -3.790332078933716, "logps/chosen": -396.24688720703125, "logps/rejected": -460.65625, "logps/weighted_chosen": -0.987683117389679, "logps/weighted_rejected": -1.0972778797149658, "loss": 0.6717, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -175.32421875, "rewards/margins": 44.544532775878906, "rewards/rejected": -219.8874969482422, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.5299133062362671, "rewards/weighted_margins": 0.08442840725183487, "rewards/weighted_rejected": -0.6142212152481079, "step": 430 }, { "epoch": 0.23576691225720026, "grad_norm": 25.154258728027344, "learning_rate": 9.455032620941839e-07, "logits/chosen": -3.865527391433716, "logits/rejected": -3.831738233566284, "logps/chosen": -364.0687561035156, "logps/rejected": -457.44061279296875, "logps/weighted_chosen": -1.0288207530975342, "logps/weighted_rejected": -1.1377685070037842, "loss": 0.6717, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -169.9953155517578, "rewards/margins": 61.437110900878906, "rewards/rejected": -231.46328735351562, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -0.5935516357421875, "rewards/weighted_margins": 0.09948424994945526, "rewards/weighted_rejected": -0.692797839641571, "step": 440 }, { "epoch": 0.24112525117213665, "grad_norm": 56.727012634277344, "learning_rate": 9.411808168527066e-07, "logits/chosen": -3.862499952316284, "logits/rejected": -3.863964796066284, "logps/chosen": -430.85626220703125, "logps/rejected": -484.0703125, "logps/weighted_chosen": -1.107214331626892, "logps/weighted_rejected": -1.182470679283142, "loss": 0.6926, "rewards/accuracies": 0.609375, "rewards/chosen": -225.39218139648438, "rewards/margins": 39.25664138793945, "rewards/rejected": -264.46405029296875, "rewards/weighted_accuracies": 0.550000011920929, "rewards/weighted_chosen": -0.6642822027206421, "rewards/weighted_margins": 0.06092224270105362, "rewards/weighted_rejected": -0.725268542766571, "step": 450 }, { "epoch": 0.24648359008707302, "grad_norm": 22.511943817138672, "learning_rate": 9.367041003085648e-07, "logits/chosen": -3.80322265625, "logits/rejected": -3.7904295921325684, "logps/chosen": -523.4312744140625, "logps/rejected": -489.375, "logps/weighted_chosen": -1.161865234375, "logps/weighted_rejected": -1.247900366783142, "loss": 0.6807, "rewards/accuracies": 0.546875, "rewards/chosen": -304.58203125, "rewards/margins": -29.098827362060547, "rewards/rejected": -275.4828186035156, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.703125, "rewards/weighted_margins": 0.069793701171875, "rewards/weighted_rejected": -0.7724975347518921, "step": 460 }, { "epoch": 0.25184192900200936, "grad_norm": 26.90446662902832, "learning_rate": 9.320746778718274e-07, "logits/chosen": -3.8857421875, "logits/rejected": -3.8814454078674316, "logps/chosen": -421.92498779296875, "logps/rejected": -482.390625, "logps/weighted_chosen": -1.198461890220642, "logps/weighted_rejected": -1.3473632335662842, "loss": 0.6521, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -234.4382781982422, "rewards/margins": 44.27734375, "rewards/rejected": -278.60626220703125, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.731597900390625, "rewards/weighted_margins": 0.13102111220359802, "rewards/weighted_rejected": -0.862780749797821, "step": 470 }, { "epoch": 0.2572002679169457, "grad_norm": 39.92988586425781, "learning_rate": 9.272941683504808e-07, "logits/chosen": -3.947265625, "logits/rejected": -3.962695360183716, "logps/chosen": -564.7000122070312, "logps/rejected": -586.8125, "logps/weighted_chosen": -1.2533447742462158, "logps/weighted_rejected": -1.4404296875, "loss": 0.6616, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -346.68280029296875, "rewards/margins": 19.563282012939453, "rewards/rejected": -366.34686279296875, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.810443103313446, "rewards/weighted_margins": 0.157379150390625, "rewards/weighted_rejected": -0.967883288860321, "step": 480 }, { "epoch": 0.2625586068318821, "grad_norm": 41.0818977355957, "learning_rate": 9.223642433843679e-07, "logits/chosen": -4.0751953125, "logits/rejected": -4.055956840515137, "logps/chosen": -527.2843627929688, "logps/rejected": -571.0406494140625, "logps/weighted_chosen": -1.251977562904358, "logps/weighted_rejected": -1.4073975086212158, "loss": 0.6527, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -325.359375, "rewards/margins": 38.974220275878906, "rewards/rejected": -364.4312438964844, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.7925049066543579, "rewards/weighted_margins": 0.14805908501148224, "rewards/weighted_rejected": -0.94061279296875, "step": 490 }, { "epoch": 0.2679169457468185, "grad_norm": 27.582181930541992, "learning_rate": 9.172866268606513e-07, "logits/chosen": -4.211133003234863, "logits/rejected": -4.199023246765137, "logps/chosen": -616.1749877929688, "logps/rejected": -701.46875, "logps/weighted_chosen": -1.4478759765625, "logps/weighted_rejected": -1.623193383216858, "loss": 0.6735, "rewards/accuracies": 0.609375, "rewards/chosen": -401.2093811035156, "rewards/margins": 72.16484069824219, "rewards/rejected": -473.28436279296875, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.987841784954071, "rewards/weighted_margins": 0.13963623344898224, "rewards/weighted_rejected": -1.127478003501892, "step": 500 }, { "epoch": 0.2679169457468185, "eval_logits/chosen": -4.276368618011475, "eval_logits/rejected": -4.278684139251709, "eval_logps/chosen": -607.260986328125, "eval_logps/rejected": -683.6949462890625, "eval_logps/weighted_chosen": -1.5014480352401733, "eval_logps/weighted_rejected": -1.661415457725525, "eval_loss": 0.6799212694168091, "eval_rewards/accuracies": 0.5794392228126526, "eval_rewards/chosen": -404.2022705078125, "eval_rewards/margins": 63.045310974121094, "eval_rewards/rejected": -467.23162841796875, "eval_rewards/weighted_accuracies": 0.5817757248878479, "eval_rewards/weighted_chosen": -1.0436044931411743, "eval_rewards/weighted_margins": 0.14398153126239777, "eval_rewards/weighted_rejected": -1.1875860691070557, "eval_runtime": 2401.7026, "eval_samples_per_second": 1.247, "eval_steps_per_second": 0.312, "step": 500 }, { "epoch": 0.2732752846617549, "grad_norm": 54.40583801269531, "learning_rate": 9.120630943110077e-07, "logits/chosen": -4.304296970367432, "logits/rejected": -4.307519435882568, "logps/chosen": -611.9562377929688, "logps/rejected": -747.1343994140625, "logps/weighted_chosen": -1.560327172279358, "logps/weighted_rejected": -1.7308349609375, "loss": 0.6867, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -421.84844970703125, "rewards/margins": 114.833984375, "rewards/rejected": -536.4578247070312, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": -1.1248657703399658, "rewards/weighted_margins": 0.15375670790672302, "rewards/weighted_rejected": -1.278204321861267, "step": 510 }, { "epoch": 0.27863362357669125, "grad_norm": 34.89970397949219, "learning_rate": 9.066954722907638e-07, "logits/chosen": -4.209668159484863, "logits/rejected": -4.206738471984863, "logps/chosen": -670.203125, "logps/rejected": -714.546875, "logps/weighted_chosen": -1.442846655845642, "logps/weighted_rejected": -1.5417969226837158, "loss": 0.6897, "rewards/accuracies": 0.59375, "rewards/chosen": -450.8140563964844, "rewards/margins": 37.505859375, "rewards/rejected": -488.6499938964844, "rewards/weighted_accuracies": 0.5562499761581421, "rewards/weighted_chosen": -0.9898437261581421, "rewards/weighted_margins": 0.09036865085363388, "rewards/weighted_rejected": -1.080297827720642, "step": 520 }, { "epoch": 0.2839919624916276, "grad_norm": 23.328060150146484, "learning_rate": 9.01185637740189e-07, "logits/chosen": -4.20751953125, "logits/rejected": -4.211133003234863, "logps/chosen": -556.3812255859375, "logps/rejected": -589.7468872070312, "logps/weighted_chosen": -1.385229468345642, "logps/weighted_rejected": -1.45050048828125, "loss": 0.7116, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -350.5453186035156, "rewards/margins": 26.184375762939453, "rewards/rejected": -376.70001220703125, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.9394775629043579, "rewards/weighted_margins": 0.040618896484375, "rewards/weighted_rejected": -0.9805663824081421, "step": 530 }, { "epoch": 0.289350301406564, "grad_norm": 16.991273880004883, "learning_rate": 8.955355173281707e-07, "logits/chosen": -4.077343940734863, "logits/rejected": -4.068749904632568, "logps/chosen": -523.546875, "logps/rejected": -587.75, "logps/weighted_chosen": -1.2385742664337158, "logps/weighted_rejected": -1.4024169445037842, "loss": 0.664, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -325.93438720703125, "rewards/margins": 49.041404724121094, "rewards/rejected": -374.98748779296875, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.800976574420929, "rewards/weighted_margins": 0.12630614638328552, "rewards/weighted_rejected": -0.927136242389679, "step": 540 }, { "epoch": 0.29470864032150035, "grad_norm": 27.356420516967773, "learning_rate": 8.897470867785002e-07, "logits/chosen": -3.99658203125, "logits/rejected": -3.965039014816284, "logps/chosen": -568.8343505859375, "logps/rejected": -617.6062622070312, "logps/weighted_chosen": -1.497778296470642, "logps/weighted_rejected": -1.623876929283142, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -357.1234436035156, "rewards/margins": 26.792577743530273, "rewards/rejected": -383.796875, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -1.0353882312774658, "rewards/weighted_margins": 0.11194457858800888, "rewards/weighted_rejected": -1.147424340248108, "step": 550 }, { "epoch": 0.3000669792364367, "grad_norm": 25.5198974609375, "learning_rate": 8.838223701790055e-07, "logits/chosen": -4.065625190734863, "logits/rejected": -4.058691501617432, "logps/chosen": -624.9249877929688, "logps/rejected": -750.0968627929688, "logps/weighted_chosen": -1.3964111804962158, "logps/weighted_rejected": -1.536376953125, "loss": 0.6737, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -375.62188720703125, "rewards/margins": 91.9671859741211, "rewards/rejected": -467.7281188964844, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.920971691608429, "rewards/weighted_margins": 0.10718383640050888, "rewards/weighted_rejected": -1.027990698814392, "step": 560 }, { "epoch": 0.3054253181513731, "grad_norm": 25.556983947753906, "learning_rate": 8.777634392737718e-07, "logits/chosen": -4.264452934265137, "logits/rejected": -4.25390625, "logps/chosen": -525.6281127929688, "logps/rejected": -586.3875122070312, "logps/weighted_chosen": -1.3518555164337158, "logps/weighted_rejected": -1.4489257335662842, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": -333.21563720703125, "rewards/margins": 48.41523361206055, "rewards/rejected": -381.6578063964844, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -0.872875988483429, "rewards/weighted_margins": 0.09407959133386612, "rewards/weighted_rejected": -0.966778576374054, "step": 570 }, { "epoch": 0.31078365706630945, "grad_norm": 24.922456741333008, "learning_rate": 8.71572412738697e-07, "logits/chosen": -4.380566596984863, "logits/rejected": -4.354784965515137, "logps/chosen": -615.390625, "logps/rejected": -714.2687377929688, "logps/weighted_chosen": -1.4163818359375, "logps/weighted_rejected": -1.651708960533142, "loss": 0.6381, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -407.7984313964844, "rewards/margins": 89.12968444824219, "rewards/rejected": -497.0406188964844, "rewards/weighted_accuracies": 0.59375, "rewards/weighted_chosen": -0.986492931842804, "rewards/weighted_margins": 0.215484619140625, "rewards/weighted_rejected": -1.201416015625, "step": 580 }, { "epoch": 0.3161419959812458, "grad_norm": 33.28370666503906, "learning_rate": 8.652514554406387e-07, "logits/chosen": -4.452832221984863, "logits/rejected": -4.452929496765137, "logps/chosen": -663.5750122070312, "logps/rejected": -756.0187377929688, "logps/weighted_chosen": -1.5756347179412842, "logps/weighted_rejected": -1.753076195716858, "loss": 0.6673, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -458.00469970703125, "rewards/margins": 74.2398452758789, "rewards/rejected": -532.1781005859375, "rewards/weighted_accuracies": 0.590624988079071, "rewards/weighted_chosen": -1.1391479969024658, "rewards/weighted_margins": 0.15434570610523224, "rewards/weighted_rejected": -1.294091820716858, "step": 590 }, { "epoch": 0.3215003348961822, "grad_norm": 35.64254379272461, "learning_rate": 8.588027776804058e-07, "logits/chosen": -4.769824028015137, "logits/rejected": -4.750781059265137, "logps/chosen": -647.7312622070312, "logps/rejected": -909.8531494140625, "logps/weighted_chosen": -1.617163062095642, "logps/weighted_rejected": -1.894140601158142, "loss": 0.6271, "rewards/accuracies": 0.625, "rewards/chosen": -460.3999938964844, "rewards/margins": 230.3953094482422, "rewards/rejected": -690.9593505859375, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -1.1735718250274658, "rewards/weighted_margins": 0.250540167093277, "rewards/weighted_rejected": -1.423791527748108, "step": 600 }, { "epoch": 0.32685867381111855, "grad_norm": 40.923988342285156, "learning_rate": 8.522286344198657e-07, "logits/chosen": -4.929883003234863, "logits/rejected": -4.950390815734863, "logps/chosen": -829.8375244140625, "logps/rejected": -974.1187744140625, "logps/weighted_chosen": -1.932958960533142, "logps/weighted_rejected": -2.1434082984924316, "loss": 0.6941, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -633.2562255859375, "rewards/margins": 127.22265625, "rewards/rejected": -760.5562744140625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -1.479040503501892, "rewards/weighted_margins": 0.19188232719898224, "rewards/weighted_rejected": -1.670751929283142, "step": 610 }, { "epoch": 0.3322170127260549, "grad_norm": 36.736900329589844, "learning_rate": 8.455313244934324e-07, "logits/chosen": -4.833398342132568, "logits/rejected": -4.824902534484863, "logps/chosen": -797.4406127929688, "logps/rejected": -925.0250244140625, "logps/weighted_chosen": -1.9183349609375, "logps/weighted_rejected": -2.187915086746216, "loss": 0.6334, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -607.0437622070312, "rewards/margins": 120.53047180175781, "rewards/rejected": -727.6281127929688, "rewards/weighted_accuracies": 0.640625, "rewards/weighted_chosen": -1.450830101966858, "rewards/weighted_margins": 0.269805908203125, "rewards/weighted_rejected": -1.720971703529358, "step": 620 }, { "epoch": 0.3375753516409913, "grad_norm": 31.050582885742188, "learning_rate": 8.38713189804215e-07, "logits/chosen": -4.622754096984863, "logits/rejected": -4.619921684265137, "logps/chosen": -849.8062744140625, "logps/rejected": -945.5875244140625, "logps/weighted_chosen": -1.9373047351837158, "logps/weighted_rejected": -2.010424852371216, "loss": 0.7012, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -641.3187255859375, "rewards/margins": 92.91719055175781, "rewards/rejected": -734.2515869140625, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -1.4818847179412842, "rewards/weighted_margins": 0.08740844577550888, "rewards/weighted_rejected": -1.569421410560608, "step": 630 }, { "epoch": 0.34293369055592765, "grad_norm": 40.31166076660156, "learning_rate": 8.317766145051057e-07, "logits/chosen": -4.409081935882568, "logits/rejected": -4.413866996765137, "logps/chosen": -700.421875, "logps/rejected": -798.5531005859375, "logps/weighted_chosen": -1.5460937023162842, "logps/weighted_rejected": -1.7400634288787842, "loss": 0.6505, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -500.640625, "rewards/margins": 82.5933609008789, "rewards/rejected": -583.515625, "rewards/weighted_accuracies": 0.6031249761581421, "rewards/weighted_chosen": -1.10516357421875, "rewards/weighted_margins": 0.16577759385108948, "rewards/weighted_rejected": -1.271386742591858, "step": 640 }, { "epoch": 0.348292029470864, "grad_norm": 48.715423583984375, "learning_rate": 8.247240241650917e-07, "logits/chosen": -4.5068359375, "logits/rejected": -4.5087890625, "logps/chosen": -747.8687744140625, "logps/rejected": -850.5374755859375, "logps/weighted_chosen": -1.6713135242462158, "logps/weighted_rejected": -1.8655273914337158, "loss": 0.6506, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -539.6968994140625, "rewards/margins": 100.546875, "rewards/rejected": -640.5562744140625, "rewards/weighted_accuracies": 0.621874988079071, "rewards/weighted_chosen": -1.21282958984375, "rewards/weighted_margins": 0.18698731064796448, "rewards/weighted_rejected": -1.400488257408142, "step": 650 }, { "epoch": 0.3536503683858004, "grad_norm": 46.12395477294922, "learning_rate": 8.175578849210894e-07, "logits/chosen": -4.743945121765137, "logits/rejected": -4.784277439117432, "logps/chosen": -885.8312377929688, "logps/rejected": -1061.4625244140625, "logps/weighted_chosen": -1.837182641029358, "logps/weighted_rejected": -2.1468505859375, "loss": 0.6184, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -676.6203002929688, "rewards/margins": 169.16796875, "rewards/rejected": -845.6687622070312, "rewards/weighted_accuracies": 0.668749988079071, "rewards/weighted_chosen": -1.385009765625, "rewards/weighted_margins": 0.30499267578125, "rewards/weighted_rejected": -1.688879370689392, "step": 660 }, { "epoch": 0.35900870730073675, "grad_norm": 135.67051696777344, "learning_rate": 8.102807026155873e-07, "logits/chosen": -4.988379001617432, "logits/rejected": -5.020312309265137, "logps/chosen": -1107.8218994140625, "logps/rejected": -1233.659423828125, "logps/weighted_chosen": -2.1700682640075684, "logps/weighted_rejected": -2.523388624191284, "loss": 0.6695, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -899.3546752929688, "rewards/margins": 124.86640930175781, "rewards/rejected": -1024.395263671875, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -1.724951148033142, "rewards/weighted_margins": 0.35167235136032104, "rewards/weighted_rejected": -2.076953172683716, "step": 670 }, { "epoch": 0.3643670462156731, "grad_norm": 35.317413330078125, "learning_rate": 8.028950219204099e-07, "logits/chosen": -4.710644721984863, "logits/rejected": -4.72119140625, "logps/chosen": -753.5906372070312, "logps/rejected": -938.3656005859375, "logps/weighted_chosen": -1.5244872570037842, "logps/weighted_rejected": -1.7093017101287842, "loss": 0.6609, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -544.7249755859375, "rewards/margins": 165.43359375, "rewards/rejected": -710.1968994140625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -1.093054175376892, "rewards/weighted_margins": 0.16083984076976776, "rewards/weighted_rejected": -1.254003882408142, "step": 680 }, { "epoch": 0.3697253851306095, "grad_norm": 32.520843505859375, "learning_rate": 7.954034254469e-07, "logits/chosen": -4.69140625, "logits/rejected": -4.702832221984863, "logps/chosen": -823.9593505859375, "logps/rejected": -1088.659423828125, "logps/weighted_chosen": -1.5863525867462158, "logps/weighted_rejected": -1.8434326648712158, "loss": 0.6454, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -615.2468872070312, "rewards/margins": 252.4890594482422, "rewards/rejected": -867.453125, "rewards/weighted_accuracies": 0.637499988079071, "rewards/weighted_chosen": -1.138574242591858, "rewards/weighted_margins": 0.23261718451976776, "rewards/weighted_rejected": -1.371545433998108, "step": 690 }, { "epoch": 0.3750837240455459, "grad_norm": 46.228511810302734, "learning_rate": 7.878085328428368e-07, "logits/chosen": -4.722070217132568, "logits/rejected": -4.739160060882568, "logps/chosen": -1027.971923828125, "logps/rejected": -1167.421875, "logps/weighted_chosen": -1.942407250404358, "logps/weighted_rejected": -2.2004151344299316, "loss": 0.648, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -807.4937744140625, "rewards/margins": 135.06015014648438, "rewards/rejected": -942.3468627929688, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -1.4796142578125, "rewards/weighted_margins": 0.23435059189796448, "rewards/weighted_rejected": -1.7139403820037842, "step": 700 }, { "epoch": 0.3804420629604823, "grad_norm": 78.81863403320312, "learning_rate": 7.801129998764014e-07, "logits/chosen": -5.2890625, "logits/rejected": -5.337304592132568, "logps/chosen": -1257.909423828125, "logps/rejected": -1334.559326171875, "logps/weighted_chosen": -2.3622069358825684, "logps/weighted_rejected": -2.719970703125, "loss": 0.6136, "rewards/accuracies": 0.5625, "rewards/chosen": -1042.5062255859375, "rewards/margins": 75.5640640258789, "rewards/rejected": -1118.1546630859375, "rewards/weighted_accuracies": 0.6312500238418579, "rewards/weighted_chosen": -1.926733374595642, "rewards/weighted_margins": 0.32972413301467896, "rewards/weighted_rejected": -2.256054639816284, "step": 710 }, { "epoch": 0.38580040187541864, "grad_norm": 61.859046936035156, "learning_rate": 7.723195175075135e-07, "logits/chosen": -5.721093654632568, "logits/rejected": -5.796093940734863, "logps/chosen": -1532.6312255859375, "logps/rejected": -1961.4312744140625, "logps/weighted_chosen": -3.167187452316284, "logps/weighted_rejected": -3.689257860183716, "loss": 0.6426, "rewards/accuracies": 0.640625, "rewards/chosen": -1346.1624755859375, "rewards/margins": 397.6070251464844, "rewards/rejected": -1743.856201171875, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -2.710205078125, "rewards/weighted_margins": 0.50091552734375, "rewards/weighted_rejected": -3.210693359375, "step": 720 }, { "epoch": 0.391158740790355, "grad_norm": 41.49199295043945, "learning_rate": 7.644308109468609e-07, "logits/chosen": -4.699120998382568, "logits/rejected": -4.692578315734863, "logps/chosen": -1019.1875, "logps/rejected": -1183.715576171875, "logps/weighted_chosen": -2.080761671066284, "logps/weighted_rejected": -2.3409667015075684, "loss": 0.6433, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -815.9515380859375, "rewards/margins": 163.99960327148438, "rewards/rejected": -979.8297119140625, "rewards/weighted_accuracies": 0.65625, "rewards/weighted_chosen": -1.63037109375, "rewards/weighted_margins": 0.25758057832717896, "rewards/weighted_rejected": -1.888208031654358, "step": 730 }, { "epoch": 0.3965170797052914, "grad_norm": 42.90148162841797, "learning_rate": 7.564496387029531e-07, "logits/chosen": -4.489648342132568, "logits/rejected": -4.511328220367432, "logps/chosen": -835.9718627929688, "logps/rejected": -982.0, "logps/weighted_chosen": -1.939965844154358, "logps/weighted_rejected": -2.176513671875, "loss": 0.6367, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -630.2437744140625, "rewards/margins": 136.02578735351562, "rewards/rejected": -766.3656005859375, "rewards/weighted_accuracies": 0.6187499761581421, "rewards/weighted_chosen": -1.4935791492462158, "rewards/weighted_margins": 0.223297119140625, "rewards/weighted_rejected": -1.7169189453125, "step": 740 }, { "epoch": 0.40187541862022774, "grad_norm": 59.750553131103516, "learning_rate": 7.483787916175306e-07, "logits/chosen": -4.936230659484863, "logits/rejected": -4.978906154632568, "logps/chosen": -1336.675048828125, "logps/rejected": -1601.0625, "logps/weighted_chosen": -2.7215819358825684, "logps/weighted_rejected": -3.0896973609924316, "loss": 0.6144, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -1120.268798828125, "rewards/margins": 255.3640594482422, "rewards/rejected": -1375.2125244140625, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -2.2766356468200684, "rewards/weighted_margins": 0.36883544921875, "rewards/weighted_rejected": -2.64599609375, "step": 750 }, { "epoch": 0.4072337575351641, "grad_norm": 46.1683235168457, "learning_rate": 7.402210918896689e-07, "logits/chosen": -5.019238471984863, "logits/rejected": -5.020898342132568, "logps/chosen": -1522.1875, "logps/rejected": -1768.2249755859375, "logps/weighted_chosen": -3.219921827316284, "logps/weighted_rejected": -3.6636719703674316, "loss": 0.6513, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -1321.1937255859375, "rewards/margins": 242.9265594482422, "rewards/rejected": -1564.503173828125, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -2.7684082984924316, "rewards/weighted_margins": 0.41920167207717896, "rewards/weighted_rejected": -3.188183546066284, "step": 760 }, { "epoch": 0.4125920964501005, "grad_norm": 63.735076904296875, "learning_rate": 7.31979392088917e-07, "logits/chosen": -4.508008003234863, "logits/rejected": -4.523144721984863, "logps/chosen": -1159.2125244140625, "logps/rejected": -1374.90625, "logps/weighted_chosen": -2.1778321266174316, "logps/weighted_rejected": -2.497607469558716, "loss": 0.6347, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -934.84375, "rewards/margins": 200.5187530517578, "rewards/rejected": -1135.8812255859375, "rewards/weighted_accuracies": 0.643750011920929, "rewards/weighted_chosen": -1.7113769054412842, "rewards/weighted_margins": 0.29365235567092896, "rewards/weighted_rejected": -2.00531005859375, "step": 770 }, { "epoch": 0.41795043536503684, "grad_norm": 59.270362854003906, "learning_rate": 7.236565741578162e-07, "logits/chosen": -4.526562690734863, "logits/rejected": -4.559374809265137, "logps/chosen": -1011.7062377929688, "logps/rejected": -1285.487548828125, "logps/weighted_chosen": -2.357372999191284, "logps/weighted_rejected": -2.668701171875, "loss": 0.6262, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -831.0437622070312, "rewards/margins": 246.8093719482422, "rewards/rejected": -1078.125, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -1.8983886241912842, "rewards/weighted_margins": 0.29566651582717896, "rewards/weighted_rejected": -2.19390869140625, "step": 780 }, { "epoch": 0.4233087742799732, "grad_norm": 52.86665725708008, "learning_rate": 7.152555484041475e-07, "logits/chosen": -4.317285060882568, "logits/rejected": -4.292870998382568, "logps/chosen": -1603.3062744140625, "logps/rejected": -1803.6937255859375, "logps/weighted_chosen": -3.095752000808716, "logps/weighted_rejected": -3.362353563308716, "loss": 0.6784, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -1380.3187255859375, "rewards/margins": 201.5476531982422, "rewards/rejected": -1581.324951171875, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -2.6065917015075684, "rewards/weighted_margins": 0.2591919004917145, "rewards/weighted_rejected": -2.8663573265075684, "step": 790 }, { "epoch": 0.4286671131949096, "grad_norm": 62.58599853515625, "learning_rate": 7.067792524832603e-07, "logits/chosen": -3.805224657058716, "logits/rejected": -3.7125487327575684, "logps/chosen": -1634.5562744140625, "logps/rejected": -2077.981201171875, "logps/weighted_chosen": -3.627148389816284, "logps/weighted_rejected": -4.034033298492432, "loss": 0.615, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1447.7249755859375, "rewards/margins": 420.54376220703125, "rewards/rejected": -1869.34375, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -3.1629881858825684, "rewards/weighted_margins": 0.39582520723342896, "rewards/weighted_rejected": -3.559765577316284, "step": 800 }, { "epoch": 0.43402545210984594, "grad_norm": 50.020774841308594, "learning_rate": 6.982306503708387e-07, "logits/chosen": -3.9469237327575684, "logits/rejected": -3.803515672683716, "logps/chosen": -2404.94384765625, "logps/rejected": -2957.58740234375, "logps/weighted_chosen": -4.365234375, "logps/weighted_rejected": -4.791406154632568, "loss": 0.6656, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2192.418701171875, "rewards/margins": 530.7859497070312, "rewards/rejected": -2723.253173828125, "rewards/weighted_accuracies": 0.6625000238418579, "rewards/weighted_chosen": -3.917187452316284, "rewards/weighted_margins": 0.39775389432907104, "rewards/weighted_rejected": -4.31494140625, "step": 810 }, { "epoch": 0.4393837910247823, "grad_norm": 58.94343948364258, "learning_rate": 6.896127313264642e-07, "logits/chosen": -3.968212842941284, "logits/rejected": -3.8505859375, "logps/chosen": -2033.018798828125, "logps/rejected": -2514.77490234375, "logps/weighted_chosen": -3.811718702316284, "logps/weighted_rejected": -4.425683498382568, "loss": 0.593, "rewards/accuracies": 0.59375, "rewards/chosen": -1811.596923828125, "rewards/margins": 460.2734375, "rewards/rejected": -2272.324951171875, "rewards/weighted_accuracies": 0.699999988079071, "rewards/weighted_chosen": -3.3623046875, "rewards/weighted_margins": 0.5855957269668579, "rewards/weighted_rejected": -3.9478516578674316, "step": 820 }, { "epoch": 0.4447421299397187, "grad_norm": 125.08843994140625, "learning_rate": 6.809285088483361e-07, "logits/chosen": -3.9242186546325684, "logits/rejected": -3.902148485183716, "logps/chosen": -1946.675048828125, "logps/rejected": -2531.72509765625, "logps/weighted_chosen": -4.324951171875, "logps/weighted_rejected": -4.77001953125, "loss": 0.6535, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1747.137451171875, "rewards/margins": 569.9296875, "rewards/rejected": -2317.175048828125, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -3.8677978515625, "rewards/weighted_margins": 0.4211181700229645, "rewards/weighted_rejected": -4.289404392242432, "step": 830 }, { "epoch": 0.45010046885465504, "grad_norm": 139.7412109375, "learning_rate": 6.721810196195174e-07, "logits/chosen": -4.127343654632568, "logits/rejected": -4.019824028015137, "logps/chosen": -1883.1812744140625, "logps/rejected": -2426.175048828125, "logps/weighted_chosen": -4.117968559265137, "logps/weighted_rejected": -4.751269340515137, "loss": 0.5946, "rewards/accuracies": 0.609375, "rewards/chosen": -1689.3062744140625, "rewards/margins": 523.2828369140625, "rewards/rejected": -2212.34375, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -3.679248094558716, "rewards/weighted_margins": 0.62701416015625, "rewards/weighted_rejected": -4.307226657867432, "step": 840 }, { "epoch": 0.4554588077695914, "grad_norm": 52.51661682128906, "learning_rate": 6.633733224460737e-07, "logits/chosen": -4.39306640625, "logits/rejected": -4.395898342132568, "logps/chosen": -1563.7125244140625, "logps/rejected": -2087.175048828125, "logps/weighted_chosen": -3.411572217941284, "logps/weighted_rejected": -3.908007860183716, "loss": 0.5817, "rewards/accuracies": 0.640625, "rewards/chosen": -1360.9937744140625, "rewards/margins": 503.0453186035156, "rewards/rejected": -1863.5687255859375, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -2.9693846702575684, "rewards/weighted_margins": 0.48114013671875, "rewards/weighted_rejected": -3.4501953125, "step": 850 }, { "epoch": 0.4608171466845278, "grad_norm": 75.17611694335938, "learning_rate": 6.545084971874736e-07, "logits/chosen": -4.860058784484863, "logits/rejected": -4.897070407867432, "logps/chosen": -2070.018798828125, "logps/rejected": -2581.47509765625, "logps/weighted_chosen": -4.260986328125, "logps/weighted_rejected": -4.812890529632568, "loss": 0.6117, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1878.949951171875, "rewards/margins": 492.6968688964844, "rewards/rejected": -2372.15625, "rewards/weighted_accuracies": 0.6499999761581421, "rewards/weighted_chosen": -3.812451124191284, "rewards/weighted_margins": 0.538012683391571, "rewards/weighted_rejected": -4.350634574890137, "step": 860 }, { "epoch": 0.46617548559946415, "grad_norm": 61.795143127441406, "learning_rate": 6.455896436796313e-07, "logits/chosen": -4.861230373382568, "logits/rejected": -4.895312309265137, "logps/chosen": -2298.987548828125, "logps/rejected": -2865.375, "logps/weighted_chosen": -4.515429496765137, "logps/weighted_rejected": -5.017285346984863, "loss": 0.6405, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2101.11865234375, "rewards/margins": 552.28125, "rewards/rejected": -2653.13134765625, "rewards/weighted_accuracies": 0.625, "rewards/weighted_chosen": -4.060351371765137, "rewards/weighted_margins": 0.4820556640625, "rewards/weighted_rejected": -4.539941310882568, "step": 870 }, { "epoch": 0.4715338245144005, "grad_norm": 58.134342193603516, "learning_rate": 6.3661988065096e-07, "logits/chosen": -4.554785251617432, "logits/rejected": -4.533789157867432, "logps/chosen": -1841.206298828125, "logps/rejected": -2302.1875, "logps/weighted_chosen": -3.680468797683716, "logps/weighted_rejected": -4.158984184265137, "loss": 0.6052, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1644.3125, "rewards/margins": 452.23748779296875, "rewards/rejected": -2097.24365234375, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -3.228564500808716, "rewards/weighted_margins": 0.45640867948532104, "rewards/weighted_rejected": -3.6844725608825684, "step": 880 }, { "epoch": 0.4768921634293369, "grad_norm": 47.12223815917969, "learning_rate": 6.276023446318213e-07, "logits/chosen": -4.478906154632568, "logits/rejected": -4.466796875, "logps/chosen": -2029.737548828125, "logps/rejected": -2337.10009765625, "logps/weighted_chosen": -3.8642578125, "logps/weighted_rejected": -4.384863376617432, "loss": 0.5771, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -1830.9156494140625, "rewards/margins": 296.99530029296875, "rewards/rejected": -2128.512451171875, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -3.408935546875, "rewards/weighted_margins": 0.4983886778354645, "rewards/weighted_rejected": -3.9090819358825684, "step": 890 }, { "epoch": 0.4822505023442733, "grad_norm": 67.17890930175781, "learning_rate": 6.185401888577487e-07, "logits/chosen": -4.597363471984863, "logits/rejected": -4.523828029632568, "logps/chosen": -2203.487548828125, "logps/rejected": -3171.262451171875, "logps/weighted_chosen": -4.45849609375, "logps/weighted_rejected": -5.061327934265137, "loss": 0.5889, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -2014.4625244140625, "rewards/margins": 938.859375, "rewards/rejected": -2952.96875, "rewards/weighted_accuracies": 0.653124988079071, "rewards/weighted_chosen": -4.013134956359863, "rewards/weighted_margins": 0.587145984172821, "rewards/weighted_rejected": -4.601904392242432, "step": 900 }, { "epoch": 0.48760884125920967, "grad_norm": 56.467411041259766, "learning_rate": 6.094365821668307e-07, "logits/chosen": -4.489843845367432, "logits/rejected": -4.480322360992432, "logps/chosen": -1832.862548828125, "logps/rejected": -2382.074951171875, "logps/weighted_chosen": -3.7759766578674316, "logps/weighted_rejected": -4.449414253234863, "loss": 0.5499, "rewards/accuracies": 0.65625, "rewards/chosen": -1650.199951171875, "rewards/margins": 529.1953125, "rewards/rejected": -2178.621826171875, "rewards/weighted_accuracies": 0.7437499761581421, "rewards/weighted_chosen": -3.3441405296325684, "rewards/weighted_margins": 0.637280285358429, "rewards/weighted_rejected": -3.982177734375, "step": 910 }, { "epoch": 0.49296718017414604, "grad_norm": 53.93306350708008, "learning_rate": 6.002947078916364e-07, "logits/chosen": -4.535009860992432, "logits/rejected": -4.543847560882568, "logps/chosen": -1841.824951171875, "logps/rejected": -2362.90625, "logps/weighted_chosen": -3.4969725608825684, "logps/weighted_rejected": -3.9888672828674316, "loss": 0.6163, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -1628.5625, "rewards/margins": 496.4624938964844, "rewards/rejected": -2125.612548828125, "rewards/weighted_accuracies": 0.6781250238418579, "rewards/weighted_chosen": -3.064746141433716, "rewards/weighted_margins": 0.46923828125, "rewards/weighted_rejected": -3.53564453125, "step": 920 }, { "epoch": 0.4983255190890824, "grad_norm": 38.40058898925781, "learning_rate": 5.911177627460738e-07, "logits/chosen": -4.379052639007568, "logits/rejected": -4.362548828125, "logps/chosen": -1661.074951171875, "logps/rejected": -2101.175048828125, "logps/weighted_chosen": -3.29833984375, "logps/weighted_rejected": -3.7914061546325684, "loss": 0.6162, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1442.7750244140625, "rewards/margins": 429.57342529296875, "rewards/rejected": -1872.356201171875, "rewards/weighted_accuracies": 0.659375011920929, "rewards/weighted_chosen": -2.8355956077575684, "rewards/weighted_margins": 0.46900635957717896, "rewards/weighted_rejected": -3.3045411109924316, "step": 930 }, { "epoch": 0.5036838580040187, "grad_norm": 62.751251220703125, "learning_rate": 5.819089557075688e-07, "logits/chosen": -4.216992378234863, "logits/rejected": -4.262597560882568, "logps/chosen": -1618.6812744140625, "logps/rejected": -2067.5, "logps/weighted_chosen": -3.3895020484924316, "logps/weighted_rejected": -4.025586128234863, "loss": 0.5534, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1416.8843994140625, "rewards/margins": 434.48748779296875, "rewards/rejected": -1851.956298828125, "rewards/weighted_accuracies": 0.7250000238418579, "rewards/weighted_chosen": -2.9775390625, "rewards/weighted_margins": 0.608203113079071, "rewards/weighted_rejected": -3.585742235183716, "step": 940 }, { "epoch": 0.5090421969189551, "grad_norm": 129.74171447753906, "learning_rate": 5.726715068949564e-07, "logits/chosen": -4.279882907867432, "logits/rejected": -4.284081935882568, "logps/chosen": -2506.074951171875, "logps/rejected": -2878.94384765625, "logps/weighted_chosen": -5.016894340515137, "logps/weighted_rejected": -5.711621284484863, "loss": 0.5989, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -2290.46875, "rewards/margins": 367.1187438964844, "rewards/rejected": -2656.796875, "rewards/weighted_accuracies": 0.6812499761581421, "rewards/weighted_chosen": -4.538281440734863, "rewards/weighted_margins": 0.670092761516571, "rewards/weighted_rejected": -5.206250190734863, "step": 950 }, { "epoch": 0.5144005358338914, "grad_norm": 55.60570526123047, "learning_rate": 5.634086464424742e-07, "logits/chosen": -4.012402534484863, "logits/rejected": -3.98974609375, "logps/chosen": -2225.10009765625, "logps/rejected": -2796.268798828125, "logps/weighted_chosen": -4.904882907867432, "logps/weighted_rejected": -5.526464939117432, "loss": 0.5626, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2037.143798828125, "rewards/margins": 551.8984375, "rewards/rejected": -2589.043701171875, "rewards/weighted_accuracies": 0.71875, "rewards/weighted_chosen": -4.45361328125, "rewards/weighted_margins": 0.5914306640625, "rewards/weighted_rejected": -5.044140815734863, "step": 960 }, { "epoch": 0.5197588747488279, "grad_norm": 47.078128814697266, "learning_rate": 5.54123613370256e-07, "logits/chosen": -3.900390625, "logits/rejected": -3.8998045921325684, "logps/chosen": -1623.7125244140625, "logps/rejected": -1898.550048828125, "logps/weighted_chosen": -3.878124952316284, "logps/weighted_rejected": -4.330664157867432, "loss": 0.582, "rewards/accuracies": 0.609375, "rewards/chosen": -1428.956298828125, "rewards/margins": 268.56561279296875, "rewards/rejected": -1697.0875244140625, "rewards/weighted_accuracies": 0.684374988079071, "rewards/weighted_chosen": -3.447314500808716, "rewards/weighted_margins": 0.4397216737270355, "rewards/weighted_rejected": -3.887988328933716, "step": 970 }, { "epoch": 0.5251172136637642, "grad_norm": 67.22262573242188, "learning_rate": 5.448196544517167e-07, "logits/chosen": -3.939257860183716, "logits/rejected": -3.950000047683716, "logps/chosen": -1683.112548828125, "logps/rejected": -2141.637451171875, "logps/weighted_chosen": -4.157031059265137, "logps/weighted_rejected": -4.748242378234863, "loss": 0.6054, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -1484.3062744140625, "rewards/margins": 448.54998779296875, "rewards/rejected": -1932.387451171875, "rewards/weighted_accuracies": 0.6156250238418579, "rewards/weighted_chosen": -3.728564500808716, "rewards/weighted_margins": 0.5644286870956421, "rewards/weighted_rejected": -4.292675971984863, "step": 980 }, { "epoch": 0.5304755525787006, "grad_norm": 62.878604888916016, "learning_rate": 5.355000230782267e-07, "logits/chosen": -3.724902391433716, "logits/rejected": -3.7308592796325684, "logps/chosen": -1693.425048828125, "logps/rejected": -2090.09375, "logps/weighted_chosen": -4.06298828125, "logps/weighted_rejected": -4.559179782867432, "loss": 0.6163, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1488.6500244140625, "rewards/margins": 378.28125, "rewards/rejected": -1866.75, "rewards/weighted_accuracies": 0.671875, "rewards/weighted_chosen": -3.603222608566284, "rewards/weighted_margins": 0.4825805723667145, "rewards/weighted_rejected": -4.086816310882568, "step": 990 }, { "epoch": 0.535833891493637, "grad_norm": 132.68304443359375, "learning_rate": 5.26167978121472e-07, "logits/chosen": -3.9014649391174316, "logits/rejected": -3.8974609375, "logps/chosen": -1748.393798828125, "logps/rejected": -2370.02490234375, "logps/weighted_chosen": -3.965136766433716, "logps/weighted_rejected": -4.65234375, "loss": 0.5721, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1539.40625, "rewards/margins": 598.4609375, "rewards/rejected": -2137.625, "rewards/weighted_accuracies": 0.690625011920929, "rewards/weighted_chosen": -3.532958984375, "rewards/weighted_margins": 0.640454113483429, "rewards/weighted_rejected": -4.173095703125, "step": 1000 }, { "epoch": 0.535833891493637, "eval_logits/chosen": -4.246370315551758, "eval_logits/rejected": -4.254964828491211, "eval_logps/chosen": -2158.052001953125, "eval_logps/rejected": -2603.586181640625, "eval_logps/weighted_chosen": -4.515809059143066, "eval_logps/weighted_rejected": -5.166130542755127, "eval_loss": 0.6049003601074219, "eval_rewards/accuracies": 0.6298397779464722, "eval_rewards/chosen": -1955.2657470703125, "eval_rewards/margins": 431.65203857421875, "eval_rewards/rejected": -2386.94921875, "eval_rewards/weighted_accuracies": 0.6779038906097412, "eval_rewards/weighted_chosen": -4.057965278625488, "eval_rewards/weighted_margins": 0.6343361735343933, "eval_rewards/weighted_rejected": -4.692301273345947, "eval_runtime": 2385.1831, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.314, "step": 1000 } ], "logging_steps": 10, "max_steps": 1867, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }