{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2679169457468185, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005358338914936369, "grad_norm": 29.978843688964844, "learning_rate": 0.0, "logits/chosen": -2.52783203125, "logits/rejected": -2.4970703125, "logps/chosen": -277.75, "logps/rejected": -223.0625, "logps/weighted_chosen": -0.4293212890625, "logps/weighted_rejected": -0.3968505859375, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/weighted_accuracies": 0.0, "rewards/weighted_chosen": 0.0, "rewards/weighted_margins": 0.0, "rewards/weighted_rejected": 0.0, "step": 1 }, { "epoch": 0.0053583389149363695, "grad_norm": 28.513858795166016, "learning_rate": 4.81283422459893e-08, "logits/chosen": -2.42626953125, "logits/rejected": -2.397569417953491, "logps/chosen": -195.53038024902344, "logps/rejected": -195.91839599609375, "logps/weighted_chosen": -0.4505276083946228, "logps/weighted_rejected": -0.4737955629825592, "loss": 0.6926, "rewards/accuracies": 0.2743055522441864, "rewards/chosen": 0.0321180559694767, "rewards/margins": -0.0375434011220932, "rewards/rejected": 0.0696614608168602, "rewards/weighted_accuracies": 0.3402777910232544, "rewards/weighted_chosen": 5.679660534951836e-05, "rewards/weighted_margins": 3.984239447163418e-05, "rewards/weighted_rejected": 1.695421087788418e-05, "step": 10 }, { "epoch": 0.010716677829872739, "grad_norm": 20.803266525268555, "learning_rate": 1.0160427807486631e-07, "logits/chosen": -2.4007811546325684, "logits/rejected": -2.3998780250549316, "logps/chosen": -199.92343139648438, "logps/rejected": -202.2101593017578, "logps/weighted_chosen": -0.4370788633823395, "logps/weighted_rejected": -0.464111328125, "loss": 0.6927, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.02207031287252903, "rewards/margins": 0.04130859300494194, "rewards/rejected": -0.06337890774011612, "rewards/weighted_accuracies": 0.36250001192092896, "rewards/weighted_chosen": 0.0006473541143350303, "rewards/weighted_margins": 7.362365431617945e-05, "rewards/weighted_rejected": 0.0005735397571697831, "step": 20 }, { "epoch": 0.016075016744809108, "grad_norm": 33.961883544921875, "learning_rate": 1.5508021390374333e-07, "logits/chosen": -2.32586669921875, "logits/rejected": -2.300219774246216, "logps/chosen": -198.45858764648438, "logps/rejected": -212.0234375, "logps/weighted_chosen": -0.45709228515625, "logps/weighted_rejected": -0.45776671171188354, "loss": 0.6925, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.1083984375, "rewards/margins": 0.1162109375, "rewards/rejected": -0.0078125, "rewards/weighted_accuracies": 0.4312500059604645, "rewards/weighted_chosen": 0.0002254486025776714, "rewards/weighted_margins": 0.0002849578741006553, "rewards/weighted_rejected": -5.970001075183973e-05, "step": 30 }, { "epoch": 0.021433355659745478, "grad_norm": 44.94234848022461, "learning_rate": 2.085561497326203e-07, "logits/chosen": -2.3628907203674316, "logits/rejected": -2.3779540061950684, "logps/chosen": -192.63125610351562, "logps/rejected": -207.24765014648438, "logps/weighted_chosen": -0.45967406034469604, "logps/weighted_rejected": -0.4842773377895355, "loss": 0.6926, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.01308593712747097, "rewards/margins": 0.04453124850988388, "rewards/rejected": -0.0576171875, "rewards/weighted_accuracies": 0.40937501192092896, "rewards/weighted_chosen": 0.0007827758672647178, "rewards/weighted_margins": 0.00011138916306663305, "rewards/weighted_rejected": 0.00067138671875, "step": 40 }, { "epoch": 0.02679169457468185, "grad_norm": 58.061580657958984, "learning_rate": 2.620320855614973e-07, "logits/chosen": -2.402636766433716, "logits/rejected": -2.3846192359924316, "logps/chosen": -213.23672485351562, "logps/rejected": -228.9015655517578, "logps/weighted_chosen": -0.4499572813510895, "logps/weighted_rejected": -0.4646240174770355, "loss": 0.6926, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": -0.09589843451976776, "rewards/margins": 0.0673828125, "rewards/rejected": -0.16328124701976776, "rewards/weighted_accuracies": 0.44999998807907104, "rewards/weighted_chosen": 0.0027015686500817537, "rewards/weighted_margins": 0.0005447387811727822, "rewards/weighted_rejected": 0.0021568299271166325, "step": 50 }, { "epoch": 0.032150033489618215, "grad_norm": 30.78350830078125, "learning_rate": 3.155080213903743e-07, "logits/chosen": -2.349609375, "logits/rejected": -2.3248534202575684, "logps/chosen": -211.83438110351562, "logps/rejected": -218.11563110351562, "logps/weighted_chosen": -0.44114989042282104, "logps/weighted_rejected": -0.4686218202114105, "loss": 0.6918, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.0025390624068677425, "rewards/margins": 0.06503906100988388, "rewards/rejected": -0.06757812201976776, "rewards/weighted_accuracies": 0.48750001192092896, "rewards/weighted_chosen": 0.005175781436264515, "rewards/weighted_margins": 0.0019653320778161287, "rewards/weighted_rejected": 0.0032104491256177425, "step": 60 }, { "epoch": 0.03750837240455459, "grad_norm": 23.173866271972656, "learning_rate": 3.689839572192513e-07, "logits/chosen": -2.458251953125, "logits/rejected": -2.463671922683716, "logps/chosen": -194.55624389648438, "logps/rejected": -200.5437469482422, "logps/weighted_chosen": -0.4099182188510895, "logps/weighted_rejected": -0.4346679747104645, "loss": 0.6923, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.16562500596046448, "rewards/margins": 0.203125, "rewards/rejected": -0.3687500059604645, "rewards/weighted_accuracies": 0.503125011920929, "rewards/weighted_chosen": 0.0095977783203125, "rewards/weighted_margins": 0.0012573242420330644, "rewards/weighted_rejected": 0.00834045372903347, "step": 70 }, { "epoch": 0.042866711319490956, "grad_norm": 27.169227600097656, "learning_rate": 4.2245989304812833e-07, "logits/chosen": -2.44384765625, "logits/rejected": -2.4704346656799316, "logps/chosen": -219.31405639648438, "logps/rejected": -240.16641235351562, "logps/weighted_chosen": -0.4397827088832855, "logps/weighted_rejected": -0.46443480253219604, "loss": 0.6915, "rewards/accuracies": 0.4375, "rewards/chosen": 0.02402343787252903, "rewards/margins": 0.2919921875, "rewards/rejected": -0.2679687440395355, "rewards/weighted_accuracies": 0.47187501192092896, "rewards/weighted_chosen": 0.012739372439682484, "rewards/weighted_margins": 0.003216362092643976, "rewards/weighted_rejected": 0.00952300988137722, "step": 80 }, { "epoch": 0.04822505023442733, "grad_norm": 24.797061920166016, "learning_rate": 4.7593582887700533e-07, "logits/chosen": -2.46044921875, "logits/rejected": -2.4736571311950684, "logps/chosen": -195.90390014648438, "logps/rejected": -208.1320343017578, "logps/weighted_chosen": -0.42793577909469604, "logps/weighted_rejected": -0.447784423828125, "loss": 0.6922, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.599609375, "rewards/margins": 0.3626953065395355, "rewards/rejected": -0.9623047113418579, "rewards/weighted_accuracies": 0.5062500238418579, "rewards/weighted_chosen": 0.00842132605612278, "rewards/weighted_margins": 0.0021730423904955387, "rewards/weighted_rejected": 0.00624923687428236, "step": 90 }, { "epoch": 0.0535833891493637, "grad_norm": 13.923910140991211, "learning_rate": 5.294117647058823e-07, "logits/chosen": -2.4405760765075684, "logits/rejected": -2.425537109375, "logps/chosen": -224.88516235351562, "logps/rejected": -229.6984405517578, "logps/weighted_chosen": -0.4417480528354645, "logps/weighted_rejected": -0.454376220703125, "loss": 0.6949, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.443750023841858, "rewards/margins": 0.4691406190395355, "rewards/rejected": -1.9128906726837158, "rewards/weighted_accuracies": 0.518750011920929, "rewards/weighted_chosen": 0.001766204833984375, "rewards/weighted_margins": -0.0024765015114098787, "rewards/weighted_rejected": 0.0042396546341478825, "step": 100 }, { "epoch": 0.058941728064300064, "grad_norm": 43.69069290161133, "learning_rate": 5.828877005347593e-07, "logits/chosen": -2.4117431640625, "logits/rejected": -2.413012742996216, "logps/chosen": -217.89453125, "logps/rejected": -225.7703094482422, "logps/weighted_chosen": -0.45955199003219604, "logps/weighted_rejected": -0.4866271913051605, "loss": 0.6916, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -2.8353514671325684, "rewards/margins": 0.3775390684604645, "rewards/rejected": -3.212890625, "rewards/weighted_accuracies": 0.5093749761581421, "rewards/weighted_chosen": 0.0031097412575036287, "rewards/weighted_margins": 0.004855346865952015, "rewards/weighted_rejected": -0.0017486572032794356, "step": 110 }, { "epoch": 0.06430006697923643, "grad_norm": 24.36446762084961, "learning_rate": 6.363636363636363e-07, "logits/chosen": -2.4598631858825684, "logits/rejected": -2.4720215797424316, "logps/chosen": -188.3484344482422, "logps/rejected": -226.6437530517578, "logps/weighted_chosen": -0.4706787168979645, "logps/weighted_rejected": -0.551287829875946, "loss": 0.6893, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -4.0625, "rewards/margins": 0.947460949420929, "rewards/rejected": -5.010156154632568, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.018326569348573685, "rewards/weighted_margins": 0.010852813720703125, "rewards/weighted_rejected": -0.02917938306927681, "step": 120 }, { "epoch": 0.06965840589417281, "grad_norm": 35.46619415283203, "learning_rate": 6.898395721925134e-07, "logits/chosen": -2.566601514816284, "logits/rejected": -2.5640625953674316, "logps/chosen": -233.8390655517578, "logps/rejected": -243.31875610351562, "logps/weighted_chosen": -0.46611326932907104, "logps/weighted_rejected": -0.48984986543655396, "loss": 0.7012, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -5.547656059265137, "rewards/margins": 1.3078124523162842, "rewards/rejected": -6.855859279632568, "rewards/weighted_accuracies": 0.49687498807907104, "rewards/weighted_chosen": -0.01891174353659153, "rewards/weighted_margins": -0.0069145201705396175, "rewards/weighted_rejected": -0.012065887451171875, "step": 130 }, { "epoch": 0.07501674480910918, "grad_norm": 18.125104904174805, "learning_rate": 7.433155080213903e-07, "logits/chosen": -2.602294921875, "logits/rejected": -2.599804639816284, "logps/chosen": -202.98281860351562, "logps/rejected": -215.4796905517578, "logps/weighted_chosen": -0.4594665467739105, "logps/weighted_rejected": -0.5065857172012329, "loss": 0.6858, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -8.157422065734863, "rewards/margins": 2.392578125, "rewards/rejected": -10.550000190734863, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.02419433556497097, "rewards/weighted_margins": 0.01836395263671875, "rewards/weighted_rejected": -0.04254303127527237, "step": 140 }, { "epoch": 0.08037508372404555, "grad_norm": 44.02063751220703, "learning_rate": 7.967914438502673e-07, "logits/chosen": -2.6329102516174316, "logits/rejected": -2.630322217941284, "logps/chosen": -221.5109405517578, "logps/rejected": -245.00155639648438, "logps/weighted_chosen": -0.522930920124054, "logps/weighted_rejected": -0.535870373249054, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": -11.2197265625, "rewards/margins": 2.375, "rewards/rejected": -13.596288681030273, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": -0.07070770114660263, "rewards/weighted_margins": 0.00907135009765625, "rewards/weighted_rejected": -0.07977227866649628, "step": 150 }, { "epoch": 0.08573342263898191, "grad_norm": 30.984752655029297, "learning_rate": 8.502673796791443e-07, "logits/chosen": -2.552539110183716, "logits/rejected": -2.5354247093200684, "logps/chosen": -207.53280639648438, "logps/rejected": -236.8117218017578, "logps/weighted_chosen": -0.5689147710800171, "logps/weighted_rejected": -0.5829712152481079, "loss": 0.6936, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -13.640625, "rewards/margins": 1.2478516101837158, "rewards/rejected": -14.890039443969727, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.11262588202953339, "rewards/weighted_margins": 0.01301422156393528, "rewards/weighted_rejected": -0.12571564316749573, "step": 160 }, { "epoch": 0.09109176155391828, "grad_norm": 29.657917022705078, "learning_rate": 9.037433155080213e-07, "logits/chosen": -2.5386719703674316, "logits/rejected": -2.550463914871216, "logps/chosen": -213.2921905517578, "logps/rejected": -224.1906280517578, "logps/weighted_chosen": -0.561798095703125, "logps/weighted_rejected": -0.5876098871231079, "loss": 0.6873, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -15.824999809265137, "rewards/margins": 3.636914014816284, "rewards/rejected": -19.457422256469727, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.11036185920238495, "rewards/weighted_margins": 0.027300262823700905, "rewards/weighted_rejected": -0.13774414360523224, "step": 170 }, { "epoch": 0.09645010046885466, "grad_norm": 16.24775505065918, "learning_rate": 9.572192513368984e-07, "logits/chosen": -2.5972657203674316, "logits/rejected": -2.5906739234924316, "logps/chosen": -229.1453094482422, "logps/rejected": -249.83749389648438, "logps/weighted_chosen": -0.607281506061554, "logps/weighted_rejected": -0.6552734375, "loss": 0.688, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -19.492773056030273, "rewards/margins": 5.0849609375, "rewards/rejected": -24.581249237060547, "rewards/weighted_accuracies": 0.528124988079071, "rewards/weighted_chosen": -0.16547851264476776, "rewards/weighted_margins": 0.02171630784869194, "rewards/weighted_rejected": -0.1871211975812912, "step": 180 }, { "epoch": 0.10180843938379103, "grad_norm": 22.05425262451172, "learning_rate": 9.999965031204306e-07, "logits/chosen": -2.589916944503784, "logits/rejected": -2.578906297683716, "logps/chosen": -224.77499389648438, "logps/rejected": -224.9296875, "logps/weighted_chosen": -0.6605224609375, "logps/weighted_rejected": -0.7260376214981079, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -23.936328887939453, "rewards/margins": 2.9306640625, "rewards/rejected": -26.86328125, "rewards/weighted_accuracies": 0.5406249761581421, "rewards/weighted_chosen": -0.21764373779296875, "rewards/weighted_margins": 0.04151611402630806, "rewards/weighted_rejected": -0.25913238525390625, "step": 190 }, { "epoch": 0.1071667782987274, "grad_norm": 23.82738494873047, "learning_rate": 9.998741174712533e-07, "logits/chosen": -2.669238328933716, "logits/rejected": -2.6612305641174316, "logps/chosen": -255.4250030517578, "logps/rejected": -270.40313720703125, "logps/weighted_chosen": -0.611083984375, "logps/weighted_rejected": -0.6286376714706421, "loss": 0.7003, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -27.870702743530273, "rewards/margins": 2.9964842796325684, "rewards/rejected": -30.857812881469727, "rewards/weighted_accuracies": 0.5, "rewards/weighted_chosen": -0.15414123237133026, "rewards/weighted_margins": 0.0028816224075853825, "rewards/weighted_rejected": -0.15706482529640198, "step": 200 }, { "epoch": 0.11252511721366376, "grad_norm": 28.775007247924805, "learning_rate": 9.995769367531952e-07, "logits/chosen": -2.7216796875, "logits/rejected": -2.700000047683716, "logps/chosen": -248.61093139648438, "logps/rejected": -247.4812469482422, "logps/weighted_chosen": -0.5626465082168579, "logps/weighted_rejected": -0.5739685297012329, "loss": 0.7034, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -30.315235137939453, "rewards/margins": 1.091210961341858, "rewards/rejected": -31.409765243530273, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.14006805419921875, "rewards/weighted_margins": -0.0037292479537427425, "rewards/weighted_rejected": -0.13644561171531677, "step": 210 }, { "epoch": 0.11788345612860013, "grad_norm": 32.326786041259766, "learning_rate": 9.991050648838675e-07, "logits/chosen": -2.8436036109924316, "logits/rejected": -2.8392577171325684, "logps/chosen": -242.8984375, "logps/rejected": -257.7124938964844, "logps/weighted_chosen": -0.616424560546875, "logps/weighted_rejected": -0.65435791015625, "loss": 0.6882, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -38.271095275878906, "rewards/margins": 1.724218726158142, "rewards/rejected": -39.98163986206055, "rewards/weighted_accuracies": 0.546875, "rewards/weighted_chosen": -0.15750885009765625, "rewards/weighted_margins": 0.02539825439453125, "rewards/weighted_rejected": -0.18288879096508026, "step": 220 }, { "epoch": 0.12324179504353651, "grad_norm": 19.367631912231445, "learning_rate": 9.98458666866564e-07, "logits/chosen": -2.869873046875, "logits/rejected": -2.870898485183716, "logps/chosen": -243.66561889648438, "logps/rejected": -263.89373779296875, "logps/weighted_chosen": -0.7010498046875, "logps/weighted_rejected": -0.750701904296875, "loss": 0.6841, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -47.01171875, "rewards/margins": 7.957421779632568, "rewards/rejected": -54.978515625, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.24614867568016052, "rewards/weighted_margins": 0.04576721042394638, "rewards/weighted_rejected": -0.2919418215751648, "step": 230 }, { "epoch": 0.12860013395847286, "grad_norm": 13.939057350158691, "learning_rate": 9.97637968732563e-07, "logits/chosen": -2.9676756858825684, "logits/rejected": -2.950244188308716, "logps/chosen": -248.04843139648438, "logps/rejected": -263.4750061035156, "logps/weighted_chosen": -0.6573241949081421, "logps/weighted_rejected": -0.648510754108429, "loss": 0.7232, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -43.94921875, "rewards/margins": 5.845312595367432, "rewards/rejected": -49.795310974121094, "rewards/weighted_accuracies": 0.512499988079071, "rewards/weighted_chosen": -0.22567901015281677, "rewards/weighted_margins": -0.026827240362763405, "rewards/weighted_rejected": -0.19892653822898865, "step": 240 }, { "epoch": 0.13395847287340926, "grad_norm": 12.663073539733887, "learning_rate": 9.966432574620906e-07, "logits/chosen": -3.0162110328674316, "logits/rejected": -3.0179200172424316, "logps/chosen": -242.93905639648438, "logps/rejected": -270.6875, "logps/weighted_chosen": -0.628466784954071, "logps/weighted_rejected": -0.67413330078125, "loss": 0.696, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -48.034568786621094, "rewards/margins": 3.3388671875, "rewards/rejected": -51.372657775878906, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.21130982041358948, "rewards/weighted_margins": 0.009045409969985485, "rewards/weighted_rejected": -0.22044983506202698, "step": 250 }, { "epoch": 0.13931681178834562, "grad_norm": 14.56610107421875, "learning_rate": 9.954748808839674e-07, "logits/chosen": -2.9679198265075684, "logits/rejected": -2.976855516433716, "logps/chosen": -277.53436279296875, "logps/rejected": -275.7718811035156, "logps/weighted_chosen": -0.63873291015625, "logps/weighted_rejected": -0.6825927495956421, "loss": 0.6844, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -50.900779724121094, "rewards/margins": 4.017773628234863, "rewards/rejected": -54.93476486206055, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.18876799941062927, "rewards/weighted_margins": 0.03838195651769638, "rewards/weighted_rejected": -0.2271774262189865, "step": 260 }, { "epoch": 0.144675150703282, "grad_norm": 12.570405006408691, "learning_rate": 9.941332475539824e-07, "logits/chosen": -3.037890672683716, "logits/rejected": -3.046337842941284, "logps/chosen": -263.0531311035156, "logps/rejected": -299.34375, "logps/weighted_chosen": -0.689990222454071, "logps/weighted_rejected": -0.7867187261581421, "loss": 0.6742, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -52.885154724121094, "rewards/margins": 9.96875, "rewards/rejected": -62.86054611206055, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.21307983994483948, "rewards/weighted_margins": 0.06503143161535263, "rewards/weighted_rejected": -0.2780609130859375, "step": 270 }, { "epoch": 0.15003348961821836, "grad_norm": 17.126554489135742, "learning_rate": 9.926188266120295e-07, "logits/chosen": -3.169921875, "logits/rejected": -3.1683592796325684, "logps/chosen": -246.58438110351562, "logps/rejected": -262.4078063964844, "logps/weighted_chosen": -0.681854248046875, "logps/weighted_rejected": -0.746777355670929, "loss": 0.6785, "rewards/accuracies": 0.59375, "rewards/chosen": -50.88359451293945, "rewards/margins": 8.079882621765137, "rewards/rejected": -58.97187423706055, "rewards/weighted_accuracies": 0.53125, "rewards/weighted_chosen": -0.2380828857421875, "rewards/weighted_margins": 0.05399169772863388, "rewards/weighted_rejected": -0.291983038187027, "step": 280 }, { "epoch": 0.15539182853315472, "grad_norm": 17.43288230895996, "learning_rate": 9.909321476180591e-07, "logits/chosen": -3.285205125808716, "logits/rejected": -3.2626953125, "logps/chosen": -269.65155029296875, "logps/rejected": -298.9703063964844, "logps/weighted_chosen": -0.7649291753768921, "logps/weighted_rejected": -0.828442394733429, "loss": 0.6972, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -66.34883117675781, "rewards/margins": 6.403515815734863, "rewards/rejected": -72.75663757324219, "rewards/weighted_accuracies": 0.5093749761581421, "rewards/weighted_chosen": -0.307464599609375, "rewards/weighted_margins": 0.0262603759765625, "rewards/weighted_rejected": -0.333578497171402, "step": 290 }, { "epoch": 0.1607501674480911, "grad_norm": 16.17060089111328, "learning_rate": 9.890738003669027e-07, "logits/chosen": -3.3641114234924316, "logits/rejected": -3.360107421875, "logps/chosen": -260.75, "logps/rejected": -282.5093688964844, "logps/weighted_chosen": -0.743237316608429, "logps/weighted_rejected": -0.8062988519668579, "loss": 0.6858, "rewards/accuracies": 0.609375, "rewards/chosen": -65.58906555175781, "rewards/margins": 8.375, "rewards/rejected": -73.9820327758789, "rewards/weighted_accuracies": 0.534375011920929, "rewards/weighted_chosen": -0.294107049703598, "rewards/weighted_margins": 0.04635162279009819, "rewards/weighted_rejected": -0.34059447050094604, "step": 300 }, { "epoch": 0.16610850636302746, "grad_norm": 19.679088592529297, "learning_rate": 9.870444346820348e-07, "logits/chosen": -3.3017578125, "logits/rejected": -3.2752928733825684, "logps/chosen": -277.9906311035156, "logps/rejected": -299.7265625, "logps/weighted_chosen": -0.76312255859375, "logps/weighted_rejected": -0.793139636516571, "loss": 0.6994, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -64.0882797241211, "rewards/margins": 10.8203125, "rewards/rejected": -74.90898132324219, "rewards/weighted_accuracies": 0.5218750238418579, "rewards/weighted_chosen": -0.310202032327652, "rewards/weighted_margins": 0.007830810733139515, "rewards/weighted_rejected": -0.3179565370082855, "step": 310 }, { "epoch": 0.17146684527796383, "grad_norm": 18.09251594543457, "learning_rate": 9.848447601883433e-07, "logits/chosen": -3.251171827316284, "logits/rejected": -3.2337403297424316, "logps/chosen": -255.09530639648438, "logps/rejected": -293.9375, "logps/weighted_chosen": -0.7666381597518921, "logps/weighted_rejected": -0.833050549030304, "loss": 0.6918, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -58.306251525878906, "rewards/margins": 15.045312881469727, "rewards/rejected": -73.3515625, "rewards/weighted_accuracies": 0.565625011920929, "rewards/weighted_chosen": -0.332742303609848, "rewards/weighted_margins": 0.03985900804400444, "rewards/weighted_rejected": -0.37254637479782104, "step": 320 }, { "epoch": 0.1768251841929002, "grad_norm": 25.393939971923828, "learning_rate": 9.824755460639899e-07, "logits/chosen": -3.203418016433716, "logits/rejected": -3.19873046875, "logps/chosen": -261.8515625, "logps/rejected": -305.0328063964844, "logps/weighted_chosen": -0.838146984577179, "logps/weighted_rejected": -0.897717297077179, "loss": 0.6862, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -70.07734680175781, "rewards/margins": 16.4833984375, "rewards/rejected": -86.54609680175781, "rewards/weighted_accuracies": 0.5531250238418579, "rewards/weighted_chosen": -0.4094604551792145, "rewards/weighted_margins": 0.05315857008099556, "rewards/weighted_rejected": -0.4626617431640625, "step": 330 }, { "epoch": 0.18218352310783656, "grad_norm": 15.981916427612305, "learning_rate": 9.799376207714444e-07, "logits/chosen": -3.241992235183716, "logits/rejected": -3.2372069358825684, "logps/chosen": -283.8984375, "logps/rejected": -301.45782470703125, "logps/weighted_chosen": -0.940869152545929, "logps/weighted_rejected": -1.0183837413787842, "loss": 0.6807, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -82.82109069824219, "rewards/margins": 8.819140434265137, "rewards/rejected": -91.63594055175781, "rewards/weighted_accuracies": 0.5625, "rewards/weighted_chosen": -0.47734373807907104, "rewards/weighted_margins": 0.06443405151367188, "rewards/weighted_rejected": -0.5418640375137329, "step": 340 }, { "epoch": 0.18754186202277295, "grad_norm": 11.714133262634277, "learning_rate": 9.772318717677903e-07, "logits/chosen": -3.29248046875, "logits/rejected": -3.2901368141174316, "logps/chosen": -281.28125, "logps/rejected": -298.8671875, "logps/weighted_chosen": -0.8809448480606079, "logps/weighted_rejected": -0.9219726324081421, "loss": 0.6935, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -81.07421875, "rewards/margins": 11.681249618530273, "rewards/rejected": -92.7164077758789, "rewards/weighted_accuracies": 0.574999988079071, "rewards/weighted_chosen": -0.4380859434604645, "rewards/weighted_margins": 0.02567749097943306, "rewards/weighted_rejected": -0.46390992403030396, "step": 350 }, { "epoch": 0.19290020093770932, "grad_norm": 17.111141204833984, "learning_rate": 9.743592451943998e-07, "logits/chosen": -3.371386766433716, "logits/rejected": -3.3677735328674316, "logps/chosen": -301.2203063964844, "logps/rejected": -326.7718811035156, "logps/weighted_chosen": -0.884228527545929, "logps/weighted_rejected": -0.8854614496231079, "loss": 0.6983, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -89.6175765991211, "rewards/margins": 10.710546493530273, "rewards/rejected": -100.3335952758789, "rewards/weighted_accuracies": 0.5562499761581421, "rewards/weighted_chosen": -0.4132934510707855, "rewards/weighted_margins": 0.013682556338608265, "rewards/weighted_rejected": -0.427154541015625, "step": 360 }, { "epoch": 0.1982585398526457, "grad_norm": 13.4055814743042, "learning_rate": 9.713207455460892e-07, "logits/chosen": -3.321484327316284, "logits/rejected": -3.3426756858825684, "logps/chosen": -296.109375, "logps/rejected": -313.0843811035156, "logps/weighted_chosen": -0.8559204339981079, "logps/weighted_rejected": -0.9296630620956421, "loss": 0.6728, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -90.3265609741211, "rewards/margins": 7.080859184265137, "rewards/rejected": -97.3773422241211, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -0.365234375, "rewards/weighted_margins": 0.06043701246380806, "rewards/weighted_rejected": -0.4256530702114105, "step": 370 }, { "epoch": 0.20361687876758205, "grad_norm": 12.659865379333496, "learning_rate": 9.681174353198686e-07, "logits/chosen": -3.418164014816284, "logits/rejected": -3.4073243141174316, "logps/chosen": -303.66717529296875, "logps/rejected": -334.4281311035156, "logps/weighted_chosen": -0.7992187738418579, "logps/weighted_rejected": -0.8946533203125, "loss": 0.6715, "rewards/accuracies": 0.59375, "rewards/chosen": -92.99609375, "rewards/margins": 13.2685546875, "rewards/rejected": -106.234375, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.362588495016098, "rewards/weighted_margins": 0.07253112643957138, "rewards/weighted_rejected": -0.4350738525390625, "step": 380 }, { "epoch": 0.20897521768251842, "grad_norm": 33.69499969482422, "learning_rate": 9.647504346434103e-07, "logits/chosen": -3.559375047683716, "logits/rejected": -3.565234422683716, "logps/chosen": -328.44219970703125, "logps/rejected": -352.2484436035156, "logps/weighted_chosen": -0.8904174566268921, "logps/weighted_rejected": -0.944140613079071, "loss": 0.688, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -107.42109680175781, "rewards/margins": 25.632421493530273, "rewards/rejected": -133.03515625, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.430419921875, "rewards/weighted_margins": 0.04887695237994194, "rewards/weighted_rejected": -0.47917479276657104, "step": 390 }, { "epoch": 0.2143335565974548, "grad_norm": 21.384435653686523, "learning_rate": 9.612209208833646e-07, "logits/chosen": -3.5692381858825684, "logits/rejected": -3.5658202171325684, "logps/chosen": -310.74530029296875, "logps/rejected": -319.8671875, "logps/weighted_chosen": -0.885510265827179, "logps/weighted_rejected": -0.952441394329071, "loss": 0.6817, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -110.94140625, "rewards/margins": 7.463281154632568, "rewards/rejected": -118.3929672241211, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.43572998046875, "rewards/weighted_margins": 0.05572357028722763, "rewards/weighted_rejected": -0.4914703369140625, "step": 400 }, { "epoch": 0.21969189551239116, "grad_norm": 41.982784271240234, "learning_rate": 9.5753012823366e-07, "logits/chosen": -3.6553711891174316, "logits/rejected": -3.646191358566284, "logps/chosen": -335.3968811035156, "logps/rejected": -370.7281188964844, "logps/weighted_chosen": -0.857287585735321, "logps/weighted_rejected": -0.8953613042831421, "loss": 0.6959, "rewards/accuracies": 0.609375, "rewards/chosen": -123.1953125, "rewards/margins": 24.586328506469727, "rewards/rejected": -147.77499389648438, "rewards/weighted_accuracies": 0.5874999761581421, "rewards/weighted_chosen": -0.421051025390625, "rewards/weighted_margins": 0.0330963134765625, "rewards/weighted_rejected": -0.4541381895542145, "step": 410 }, { "epoch": 0.22505023442732752, "grad_norm": 30.137876510620117, "learning_rate": 9.536793472839324e-07, "logits/chosen": -3.714648485183716, "logits/rejected": -3.7328124046325684, "logps/chosen": -386.20001220703125, "logps/rejected": -426.3656311035156, "logps/weighted_chosen": -0.8975464105606079, "logps/weighted_rejected": -0.970263659954071, "loss": 0.6699, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -155.86874389648438, "rewards/margins": 28.117578506469727, "rewards/rejected": -183.94296264648438, "rewards/weighted_accuracies": 0.5687500238418579, "rewards/weighted_chosen": -0.43191832304000854, "rewards/weighted_margins": 0.07274474948644638, "rewards/weighted_rejected": -0.5046783685684204, "step": 420 }, { "epoch": 0.2304085733422639, "grad_norm": 27.828502655029297, "learning_rate": 9.496699245682351e-07, "logits/chosen": -3.798535108566284, "logits/rejected": -3.790332078933716, "logps/chosen": -396.24688720703125, "logps/rejected": -460.65625, "logps/weighted_chosen": -0.987683117389679, "logps/weighted_rejected": -1.0972778797149658, "loss": 0.6717, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -175.32421875, "rewards/margins": 44.544532775878906, "rewards/rejected": -219.8874969482422, "rewards/weighted_accuracies": 0.5718749761581421, "rewards/weighted_chosen": -0.5299133062362671, "rewards/weighted_margins": 0.08442840725183487, "rewards/weighted_rejected": -0.6142212152481079, "step": 430 }, { "epoch": 0.23576691225720026, "grad_norm": 25.154258728027344, "learning_rate": 9.455032620941839e-07, "logits/chosen": -3.865527391433716, "logits/rejected": -3.831738233566284, "logps/chosen": -364.0687561035156, "logps/rejected": -457.44061279296875, "logps/weighted_chosen": -1.0288207530975342, "logps/weighted_rejected": -1.1377685070037842, "loss": 0.6717, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -169.9953155517578, "rewards/margins": 61.437110900878906, "rewards/rejected": -231.46328735351562, "rewards/weighted_accuracies": 0.5843750238418579, "rewards/weighted_chosen": -0.5935516357421875, "rewards/weighted_margins": 0.09948424994945526, "rewards/weighted_rejected": -0.692797839641571, "step": 440 }, { "epoch": 0.24112525117213665, "grad_norm": 56.727012634277344, "learning_rate": 9.411808168527066e-07, "logits/chosen": -3.862499952316284, "logits/rejected": -3.863964796066284, "logps/chosen": -430.85626220703125, "logps/rejected": -484.0703125, "logps/weighted_chosen": -1.107214331626892, "logps/weighted_rejected": -1.182470679283142, "loss": 0.6926, "rewards/accuracies": 0.609375, "rewards/chosen": -225.39218139648438, "rewards/margins": 39.25664138793945, "rewards/rejected": -264.46405029296875, "rewards/weighted_accuracies": 0.550000011920929, "rewards/weighted_chosen": -0.6642822027206421, "rewards/weighted_margins": 0.06092224270105362, "rewards/weighted_rejected": -0.725268542766571, "step": 450 }, { "epoch": 0.24648359008707302, "grad_norm": 22.511943817138672, "learning_rate": 9.367041003085648e-07, "logits/chosen": -3.80322265625, "logits/rejected": -3.7904295921325684, "logps/chosen": -523.4312744140625, "logps/rejected": -489.375, "logps/weighted_chosen": -1.161865234375, "logps/weighted_rejected": -1.247900366783142, "loss": 0.6807, "rewards/accuracies": 0.546875, "rewards/chosen": -304.58203125, "rewards/margins": -29.098827362060547, "rewards/rejected": -275.4828186035156, "rewards/weighted_accuracies": 0.559374988079071, "rewards/weighted_chosen": -0.703125, "rewards/weighted_margins": 0.069793701171875, "rewards/weighted_rejected": -0.7724975347518921, "step": 460 }, { "epoch": 0.25184192900200936, "grad_norm": 26.90446662902832, "learning_rate": 9.320746778718274e-07, "logits/chosen": -3.8857421875, "logits/rejected": -3.8814454078674316, "logps/chosen": -421.92498779296875, "logps/rejected": -482.390625, "logps/weighted_chosen": -1.198461890220642, "logps/weighted_rejected": -1.3473632335662842, "loss": 0.6521, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -234.4382781982422, "rewards/margins": 44.27734375, "rewards/rejected": -278.60626220703125, "rewards/weighted_accuracies": 0.578125, "rewards/weighted_chosen": -0.731597900390625, "rewards/weighted_margins": 0.13102111220359802, "rewards/weighted_rejected": -0.862780749797821, "step": 470 }, { "epoch": 0.2572002679169457, "grad_norm": 39.92988586425781, "learning_rate": 9.272941683504808e-07, "logits/chosen": -3.947265625, "logits/rejected": -3.962695360183716, "logps/chosen": -564.7000122070312, "logps/rejected": -586.8125, "logps/weighted_chosen": -1.2533447742462158, "logps/weighted_rejected": -1.4404296875, "loss": 0.6616, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -346.68280029296875, "rewards/margins": 19.563282012939453, "rewards/rejected": -366.34686279296875, "rewards/weighted_accuracies": 0.612500011920929, "rewards/weighted_chosen": -0.810443103313446, "rewards/weighted_margins": 0.157379150390625, "rewards/weighted_rejected": -0.967883288860321, "step": 480 }, { "epoch": 0.2625586068318821, "grad_norm": 41.0818977355957, "learning_rate": 9.223642433843679e-07, "logits/chosen": -4.0751953125, "logits/rejected": -4.055956840515137, "logps/chosen": -527.2843627929688, "logps/rejected": -571.0406494140625, "logps/weighted_chosen": -1.251977562904358, "logps/weighted_rejected": -1.4073975086212158, "loss": 0.6527, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -325.359375, "rewards/margins": 38.974220275878906, "rewards/rejected": -364.4312438964844, "rewards/weighted_accuracies": 0.6343749761581421, "rewards/weighted_chosen": -0.7925049066543579, "rewards/weighted_margins": 0.14805908501148224, "rewards/weighted_rejected": -0.94061279296875, "step": 490 }, { "epoch": 0.2679169457468185, "grad_norm": 27.582181930541992, "learning_rate": 9.172866268606513e-07, "logits/chosen": -4.211133003234863, "logits/rejected": -4.199023246765137, "logps/chosen": -616.1749877929688, "logps/rejected": -701.46875, "logps/weighted_chosen": -1.4478759765625, "logps/weighted_rejected": -1.623193383216858, "loss": 0.6735, "rewards/accuracies": 0.609375, "rewards/chosen": -401.2093811035156, "rewards/margins": 72.16484069824219, "rewards/rejected": -473.28436279296875, "rewards/weighted_accuracies": 0.581250011920929, "rewards/weighted_chosen": -0.987841784954071, "rewards/weighted_margins": 0.13963623344898224, "rewards/weighted_rejected": -1.127478003501892, "step": 500 }, { "epoch": 0.2679169457468185, "eval_logits/chosen": -4.276368618011475, "eval_logits/rejected": -4.278684139251709, "eval_logps/chosen": -607.260986328125, "eval_logps/rejected": -683.6949462890625, "eval_logps/weighted_chosen": -1.5014480352401733, "eval_logps/weighted_rejected": -1.661415457725525, "eval_loss": 0.6799212694168091, "eval_rewards/accuracies": 0.5794392228126526, "eval_rewards/chosen": -404.2022705078125, "eval_rewards/margins": 63.045310974121094, "eval_rewards/rejected": -467.23162841796875, "eval_rewards/weighted_accuracies": 0.5817757248878479, "eval_rewards/weighted_chosen": -1.0436044931411743, "eval_rewards/weighted_margins": 0.14398153126239777, "eval_rewards/weighted_rejected": -1.1875860691070557, "eval_runtime": 2401.7026, "eval_samples_per_second": 1.247, "eval_steps_per_second": 0.312, "step": 500 } ], "logging_steps": 10, "max_steps": 1867, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }