{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989528795811519, "eval_steps": 200, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020942408376963353, "grad_norm": 14.617609977722168, "learning_rate": 0.0, "logits/chosen": 2.203179359436035, "logits/rejected": 2.035616397857666, "logps/chosen": -257.4821472167969, "logps/ref_chosen": -257.55841064453125, "logps/ref_rejected": -199.84764099121094, "logps/rejected": -199.93338012695312, "loss": 5.5446, "margin_dpo/margin_mean": 0.16199058294296265, "margin_dpo/margin_std": 0.6907856464385986, "step": 1 }, { "epoch": 0.004188481675392671, "grad_norm": 15.140374183654785, "learning_rate": 1.0416666666666666e-08, "logits/chosen": 2.1704792976379395, "logits/rejected": 2.0754430294036865, "logps/chosen": -224.03538513183594, "logps/ref_chosen": -224.12454223632812, "logps/ref_rejected": -182.62721252441406, "logps/rejected": -182.67271423339844, "loss": 5.5417, "margin_dpo/margin_mean": 0.13464844226837158, "margin_dpo/margin_std": 0.5429617166519165, "step": 2 }, { "epoch": 0.0062827225130890054, "grad_norm": 14.625223159790039, "learning_rate": 2.083333333333333e-08, "logits/chosen": 2.4683523178100586, "logits/rejected": 2.463977098464966, "logps/chosen": -312.9666748046875, "logps/ref_chosen": -312.8153991699219, "logps/ref_rejected": -291.1138916015625, "logps/rejected": -291.2332763671875, "loss": 5.5426, "margin_dpo/margin_mean": -0.03191244602203369, "margin_dpo/margin_std": 0.6326964497566223, "step": 3 }, { "epoch": 0.008376963350785341, "grad_norm": 15.790285110473633, "learning_rate": 3.125e-08, "logits/chosen": 1.5894497632980347, "logits/rejected": 1.4774465560913086, "logps/chosen": -310.7625427246094, "logps/ref_chosen": -310.8699645996094, "logps/ref_rejected": -323.95556640625, "logps/rejected": -323.9718933105469, "loss": 5.5437, "margin_dpo/margin_mean": 0.12377279996871948, "margin_dpo/margin_std": 0.9771984815597534, "step": 4 }, { "epoch": 0.010471204188481676, "grad_norm": 15.793586730957031, "learning_rate": 4.166666666666666e-08, "logits/chosen": 1.5695815086364746, "logits/rejected": 1.5709682703018188, "logps/chosen": -303.8356628417969, "logps/ref_chosen": -303.7280578613281, "logps/ref_rejected": -262.055419921875, "logps/rejected": -261.8935546875, "loss": 5.548, "margin_dpo/margin_mean": -0.26944446563720703, "margin_dpo/margin_std": 0.66167151927948, "step": 5 }, { "epoch": 0.012565445026178011, "grad_norm": 15.511699676513672, "learning_rate": 5.208333333333333e-08, "logits/chosen": 2.0192410945892334, "logits/rejected": 1.9741183519363403, "logps/chosen": -252.2058563232422, "logps/ref_chosen": -252.3014373779297, "logps/ref_rejected": -214.40451049804688, "logps/rejected": -214.4804229736328, "loss": 5.5507, "margin_dpo/margin_mean": 0.1714714765548706, "margin_dpo/margin_std": 0.6865968108177185, "step": 6 }, { "epoch": 0.014659685863874346, "grad_norm": 15.63283634185791, "learning_rate": 6.25e-08, "logits/chosen": 2.191936492919922, "logits/rejected": 2.0201575756073, "logps/chosen": -248.16464233398438, "logps/ref_chosen": -248.10345458984375, "logps/ref_rejected": -204.55133056640625, "logps/rejected": -204.63514709472656, "loss": 5.5465, "margin_dpo/margin_mean": 0.022650957107543945, "margin_dpo/margin_std": 0.7195451855659485, "step": 7 }, { "epoch": 0.016753926701570682, "grad_norm": 15.911747932434082, "learning_rate": 7.291666666666667e-08, "logits/chosen": 2.4633631706237793, "logits/rejected": 2.229030132293701, "logps/chosen": -446.24395751953125, "logps/ref_chosen": -446.1068115234375, "logps/ref_rejected": -316.3032531738281, "logps/rejected": -316.33001708984375, "loss": 5.5447, "margin_dpo/margin_mean": -0.11035525798797607, "margin_dpo/margin_std": 0.8465025424957275, "step": 8 }, { "epoch": 0.018848167539267015, "grad_norm": 14.066997528076172, "learning_rate": 8.333333333333333e-08, "logits/chosen": 1.9973905086517334, "logits/rejected": 1.8876209259033203, "logps/chosen": -291.28857421875, "logps/ref_chosen": -291.0896911621094, "logps/ref_rejected": -298.3818054199219, "logps/rejected": -298.3582763671875, "loss": 5.5483, "margin_dpo/margin_mean": -0.22240149974822998, "margin_dpo/margin_std": 0.7139020562171936, "step": 9 }, { "epoch": 0.020942408376963352, "grad_norm": 14.026876449584961, "learning_rate": 9.375e-08, "logits/chosen": 1.6050350666046143, "logits/rejected": 1.755211591720581, "logps/chosen": -221.44143676757812, "logps/ref_chosen": -221.42408752441406, "logps/ref_rejected": -210.35684204101562, "logps/rejected": -210.39434814453125, "loss": 5.544, "margin_dpo/margin_mean": 0.02016240358352661, "margin_dpo/margin_std": 0.5195479989051819, "step": 10 }, { "epoch": 0.023036649214659685, "grad_norm": 15.404158592224121, "learning_rate": 1.0416666666666667e-07, "logits/chosen": 1.8669978380203247, "logits/rejected": 1.7889609336853027, "logps/chosen": -307.2198181152344, "logps/ref_chosen": -307.2149658203125, "logps/ref_rejected": -264.55902099609375, "logps/rejected": -264.7065734863281, "loss": 5.5427, "margin_dpo/margin_mean": 0.1427026391029358, "margin_dpo/margin_std": 0.9485504627227783, "step": 11 }, { "epoch": 0.025130890052356022, "grad_norm": 14.81792163848877, "learning_rate": 1.1458333333333332e-07, "logits/chosen": 1.494691014289856, "logits/rejected": 1.6338729858398438, "logps/chosen": -273.935302734375, "logps/ref_chosen": -273.97259521484375, "logps/ref_rejected": -312.4557189941406, "logps/rejected": -312.26611328125, "loss": 5.5513, "margin_dpo/margin_mean": -0.15232467651367188, "margin_dpo/margin_std": 0.7628190517425537, "step": 12 }, { "epoch": 0.027225130890052355, "grad_norm": 14.786741256713867, "learning_rate": 1.25e-07, "logits/chosen": 1.8189257383346558, "logits/rejected": 1.8658004999160767, "logps/chosen": -264.774658203125, "logps/ref_chosen": -264.722412109375, "logps/ref_rejected": -264.62823486328125, "logps/rejected": -264.7838134765625, "loss": 5.5457, "margin_dpo/margin_mean": 0.10335606336593628, "margin_dpo/margin_std": 0.7768966555595398, "step": 13 }, { "epoch": 0.02931937172774869, "grad_norm": 15.321511268615723, "learning_rate": 1.3541666666666666e-07, "logits/chosen": 1.8423357009887695, "logits/rejected": 1.6009153127670288, "logps/chosen": -357.5430603027344, "logps/ref_chosen": -357.3697509765625, "logps/ref_rejected": -231.3351287841797, "logps/rejected": -231.34188842773438, "loss": 5.5436, "margin_dpo/margin_mean": -0.16655707359313965, "margin_dpo/margin_std": 0.6755635738372803, "step": 14 }, { "epoch": 0.031413612565445025, "grad_norm": 16.096477508544922, "learning_rate": 1.4583333333333335e-07, "logits/chosen": 2.050579071044922, "logits/rejected": 1.9528357982635498, "logps/chosen": -282.3099670410156, "logps/ref_chosen": -282.4208984375, "logps/ref_rejected": -193.90872192382812, "logps/rejected": -193.78834533691406, "loss": 5.5457, "margin_dpo/margin_mean": -0.009424567222595215, "margin_dpo/margin_std": 0.5681266784667969, "step": 15 }, { "epoch": 0.033507853403141365, "grad_norm": 16.60857391357422, "learning_rate": 1.5624999999999999e-07, "logits/chosen": 2.2264082431793213, "logits/rejected": 1.9722710847854614, "logps/chosen": -291.3759460449219, "logps/ref_chosen": -291.56591796875, "logps/ref_rejected": -252.4170684814453, "logps/rejected": -252.54373168945312, "loss": 5.54, "margin_dpo/margin_mean": 0.31664133071899414, "margin_dpo/margin_std": 0.7804574370384216, "step": 16 }, { "epoch": 0.0356020942408377, "grad_norm": 15.15626049041748, "learning_rate": 1.6666666666666665e-07, "logits/chosen": 1.9703552722930908, "logits/rejected": 1.9993352890014648, "logps/chosen": -343.3455505371094, "logps/ref_chosen": -343.4768981933594, "logps/ref_rejected": -338.89654541015625, "logps/rejected": -338.8592224121094, "loss": 5.5409, "margin_dpo/margin_mean": 0.09399676322937012, "margin_dpo/margin_std": 0.5367782115936279, "step": 17 }, { "epoch": 0.03769633507853403, "grad_norm": 15.167213439941406, "learning_rate": 1.7708333333333334e-07, "logits/chosen": 1.8425214290618896, "logits/rejected": 1.8331950902938843, "logps/chosen": -213.01934814453125, "logps/ref_chosen": -213.05694580078125, "logps/ref_rejected": -211.70962524414062, "logps/rejected": -211.76414489746094, "loss": 5.5491, "margin_dpo/margin_mean": 0.09212470054626465, "margin_dpo/margin_std": 0.6411672234535217, "step": 18 }, { "epoch": 0.039790575916230364, "grad_norm": 14.854358673095703, "learning_rate": 1.875e-07, "logits/chosen": 2.0766916275024414, "logits/rejected": 2.0941522121429443, "logps/chosen": -240.00901794433594, "logps/ref_chosen": -240.0670928955078, "logps/ref_rejected": -246.15377807617188, "logps/rejected": -246.24050903320312, "loss": 5.5489, "margin_dpo/margin_mean": 0.14478152990341187, "margin_dpo/margin_std": 0.584217369556427, "step": 19 }, { "epoch": 0.041884816753926704, "grad_norm": 15.4912748336792, "learning_rate": 1.9791666666666664e-07, "logits/chosen": 2.1966586112976074, "logits/rejected": 1.9358861446380615, "logps/chosen": -315.5570983886719, "logps/ref_chosen": -315.71331787109375, "logps/ref_rejected": -230.0822296142578, "logps/rejected": -230.0750732421875, "loss": 5.5455, "margin_dpo/margin_mean": 0.14912045001983643, "margin_dpo/margin_std": 0.5315914750099182, "step": 20 }, { "epoch": 0.04397905759162304, "grad_norm": 15.429500579833984, "learning_rate": 2.0833333333333333e-07, "logits/chosen": 2.09773588180542, "logits/rejected": 2.0702552795410156, "logps/chosen": -279.3077697753906, "logps/ref_chosen": -279.2261657714844, "logps/ref_rejected": -300.1985168457031, "logps/rejected": -300.22119140625, "loss": 5.5468, "margin_dpo/margin_mean": -0.05891883373260498, "margin_dpo/margin_std": 0.8325001001358032, "step": 21 }, { "epoch": 0.04607329842931937, "grad_norm": 13.734630584716797, "learning_rate": 2.1875e-07, "logits/chosen": 1.8216187953948975, "logits/rejected": 1.9799120426177979, "logps/chosen": -225.4229736328125, "logps/ref_chosen": -225.4801788330078, "logps/ref_rejected": -236.63134765625, "logps/rejected": -236.60411071777344, "loss": 5.5409, "margin_dpo/margin_mean": 0.029949307441711426, "margin_dpo/margin_std": 0.5145200490951538, "step": 22 }, { "epoch": 0.048167539267015703, "grad_norm": 15.402115821838379, "learning_rate": 2.2916666666666663e-07, "logits/chosen": 1.9867033958435059, "logits/rejected": 1.8609161376953125, "logps/chosen": -340.4596862792969, "logps/ref_chosen": -340.510986328125, "logps/ref_rejected": -273.1431579589844, "logps/rejected": -273.184814453125, "loss": 5.5456, "margin_dpo/margin_mean": 0.09300780296325684, "margin_dpo/margin_std": 0.5188795924186707, "step": 23 }, { "epoch": 0.050261780104712044, "grad_norm": 16.485750198364258, "learning_rate": 2.3958333333333335e-07, "logits/chosen": 1.7313284873962402, "logits/rejected": 1.6817138195037842, "logps/chosen": -274.0079040527344, "logps/ref_chosen": -273.9709777832031, "logps/ref_rejected": -269.8603210449219, "logps/rejected": -269.9830017089844, "loss": 5.5462, "margin_dpo/margin_mean": 0.08572280406951904, "margin_dpo/margin_std": 0.4962030053138733, "step": 24 }, { "epoch": 0.05235602094240838, "grad_norm": 14.515819549560547, "learning_rate": 2.5e-07, "logits/chosen": 1.7567241191864014, "logits/rejected": 1.772882342338562, "logps/chosen": -245.420654296875, "logps/ref_chosen": -245.38388061523438, "logps/ref_rejected": -251.77703857421875, "logps/rejected": -251.8808135986328, "loss": 5.5402, "margin_dpo/margin_mean": 0.06698936223983765, "margin_dpo/margin_std": 0.7465457916259766, "step": 25 }, { "epoch": 0.05445026178010471, "grad_norm": 15.561816215515137, "learning_rate": 2.604166666666667e-07, "logits/chosen": 1.6602405309677124, "logits/rejected": 1.611204743385315, "logps/chosen": -245.07839965820312, "logps/ref_chosen": -245.162109375, "logps/ref_rejected": -167.06671142578125, "logps/rejected": -166.95631408691406, "loss": 5.5441, "margin_dpo/margin_mean": -0.026699483394622803, "margin_dpo/margin_std": 0.7909866571426392, "step": 26 }, { "epoch": 0.05654450261780105, "grad_norm": 15.185941696166992, "learning_rate": 2.708333333333333e-07, "logits/chosen": 2.148705244064331, "logits/rejected": 1.9048577547073364, "logps/chosen": -309.2626037597656, "logps/ref_chosen": -309.4706115722656, "logps/ref_rejected": -200.16006469726562, "logps/rejected": -200.23269653320312, "loss": 5.5469, "margin_dpo/margin_mean": 0.2806363105773926, "margin_dpo/margin_std": 0.6260923147201538, "step": 27 }, { "epoch": 0.05863874345549738, "grad_norm": 15.434507369995117, "learning_rate": 2.8125e-07, "logits/chosen": 1.9996970891952515, "logits/rejected": 2.1089255809783936, "logps/chosen": -203.73443603515625, "logps/ref_chosen": -203.72039794921875, "logps/ref_rejected": -228.1062469482422, "logps/rejected": -228.02944946289062, "loss": 5.5409, "margin_dpo/margin_mean": -0.09086447954177856, "margin_dpo/margin_std": 0.3806726932525635, "step": 28 }, { "epoch": 0.060732984293193716, "grad_norm": 14.699873924255371, "learning_rate": 2.916666666666667e-07, "logits/chosen": 2.243607997894287, "logits/rejected": 1.9699711799621582, "logps/chosen": -341.47991943359375, "logps/ref_chosen": -341.7933349609375, "logps/ref_rejected": -323.7848815917969, "logps/rejected": -323.83416748046875, "loss": 5.5414, "margin_dpo/margin_mean": 0.3627087473869324, "margin_dpo/margin_std": 0.9482086896896362, "step": 29 }, { "epoch": 0.06282722513089005, "grad_norm": 14.436098098754883, "learning_rate": 3.020833333333333e-07, "logits/chosen": 1.4743300676345825, "logits/rejected": 1.4441381692886353, "logps/chosen": -239.34152221679688, "logps/ref_chosen": -239.4767303466797, "logps/ref_rejected": -228.0832977294922, "logps/rejected": -228.0165252685547, "loss": 5.5418, "margin_dpo/margin_mean": 0.06841355562210083, "margin_dpo/margin_std": 0.7110106348991394, "step": 30 }, { "epoch": 0.06492146596858639, "grad_norm": 13.857452392578125, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 1.6719400882720947, "logits/rejected": 1.52069091796875, "logps/chosen": -268.8196105957031, "logps/ref_chosen": -268.9744567871094, "logps/ref_rejected": -221.5098114013672, "logps/rejected": -221.68231201171875, "loss": 5.5392, "margin_dpo/margin_mean": 0.3273264765739441, "margin_dpo/margin_std": 0.8021472692489624, "step": 31 }, { "epoch": 0.06701570680628273, "grad_norm": 15.621495246887207, "learning_rate": 3.2291666666666666e-07, "logits/chosen": 1.6164491176605225, "logits/rejected": 1.4590275287628174, "logps/chosen": -236.6236572265625, "logps/ref_chosen": -236.76123046875, "logps/ref_rejected": -191.0041046142578, "logps/rejected": -190.91786193847656, "loss": 5.5383, "margin_dpo/margin_mean": 0.051319420337677, "margin_dpo/margin_std": 0.562064528465271, "step": 32 }, { "epoch": 0.06910994764397906, "grad_norm": 14.935791015625, "learning_rate": 3.333333333333333e-07, "logits/chosen": 1.937072515487671, "logits/rejected": 1.866725206375122, "logps/chosen": -258.4335021972656, "logps/ref_chosen": -258.6623840332031, "logps/ref_rejected": -233.15805053710938, "logps/rejected": -233.19522094726562, "loss": 5.5401, "margin_dpo/margin_mean": 0.26607221364974976, "margin_dpo/margin_std": 0.9210672974586487, "step": 33 }, { "epoch": 0.0712041884816754, "grad_norm": 17.70219612121582, "learning_rate": 3.4375e-07, "logits/chosen": 2.076815128326416, "logits/rejected": 2.0185177326202393, "logps/chosen": -380.03729248046875, "logps/ref_chosen": -380.25201416015625, "logps/ref_rejected": -315.8236389160156, "logps/rejected": -315.7915954589844, "loss": 5.5395, "margin_dpo/margin_mean": 0.18271714448928833, "margin_dpo/margin_std": 0.7701175212860107, "step": 34 }, { "epoch": 0.07329842931937172, "grad_norm": 13.645162582397461, "learning_rate": 3.541666666666667e-07, "logits/chosen": 1.5646406412124634, "logits/rejected": 1.7504596710205078, "logps/chosen": -245.80335998535156, "logps/ref_chosen": -246.0772705078125, "logps/ref_rejected": -317.1019592285156, "logps/rejected": -317.00274658203125, "loss": 5.54, "margin_dpo/margin_mean": 0.17473018169403076, "margin_dpo/margin_std": 0.7614114284515381, "step": 35 }, { "epoch": 0.07539267015706806, "grad_norm": 17.520965576171875, "learning_rate": 3.645833333333333e-07, "logits/chosen": 1.7731884717941284, "logits/rejected": 1.8305914402008057, "logps/chosen": -343.9805908203125, "logps/ref_chosen": -344.1368408203125, "logps/ref_rejected": -343.6894836425781, "logps/rejected": -343.47882080078125, "loss": 5.5342, "margin_dpo/margin_mean": -0.05438530445098877, "margin_dpo/margin_std": 0.7112289071083069, "step": 36 }, { "epoch": 0.0774869109947644, "grad_norm": 15.14476203918457, "learning_rate": 3.75e-07, "logits/chosen": 1.9591785669326782, "logits/rejected": 1.9149752855300903, "logps/chosen": -310.9266357421875, "logps/ref_chosen": -311.3376770019531, "logps/ref_rejected": -278.5052185058594, "logps/rejected": -278.489990234375, "loss": 5.5375, "margin_dpo/margin_mean": 0.3958609700202942, "margin_dpo/margin_std": 0.6456325054168701, "step": 37 }, { "epoch": 0.07958115183246073, "grad_norm": 15.079659461975098, "learning_rate": 3.8541666666666665e-07, "logits/chosen": 2.1111977100372314, "logits/rejected": 2.3584368228912354, "logps/chosen": -193.07827758789062, "logps/ref_chosen": -193.3851318359375, "logps/ref_rejected": -234.6280975341797, "logps/rejected": -234.42193603515625, "loss": 5.5401, "margin_dpo/margin_mean": 0.10068202018737793, "margin_dpo/margin_std": 0.5997118353843689, "step": 38 }, { "epoch": 0.08167539267015707, "grad_norm": 15.749566078186035, "learning_rate": 3.958333333333333e-07, "logits/chosen": 1.7943568229675293, "logits/rejected": 1.8780990839004517, "logps/chosen": -290.79742431640625, "logps/ref_chosen": -291.5687255859375, "logps/ref_rejected": -317.7392578125, "logps/rejected": -317.748779296875, "loss": 5.5255, "margin_dpo/margin_mean": 0.7807860374450684, "margin_dpo/margin_std": 1.0324419736862183, "step": 39 }, { "epoch": 0.08376963350785341, "grad_norm": 15.053966522216797, "learning_rate": 4.0625e-07, "logits/chosen": 1.7152007818222046, "logits/rejected": 1.685285210609436, "logps/chosen": -211.45947265625, "logps/ref_chosen": -211.951904296875, "logps/ref_rejected": -166.82864379882812, "logps/rejected": -166.58428955078125, "loss": 5.528, "margin_dpo/margin_mean": 0.24808716773986816, "margin_dpo/margin_std": 0.9657536745071411, "step": 40 }, { "epoch": 0.08586387434554973, "grad_norm": 15.568541526794434, "learning_rate": 4.1666666666666667e-07, "logits/chosen": 1.968687653541565, "logits/rejected": 1.8429195880889893, "logps/chosen": -300.13665771484375, "logps/ref_chosen": -300.6400146484375, "logps/ref_rejected": -224.77317810058594, "logps/rejected": -224.72613525390625, "loss": 5.535, "margin_dpo/margin_mean": 0.45627307891845703, "margin_dpo/margin_std": 0.7364793419837952, "step": 41 }, { "epoch": 0.08795811518324607, "grad_norm": 14.581147193908691, "learning_rate": 4.270833333333333e-07, "logits/chosen": 2.1621668338775635, "logits/rejected": 2.195481061935425, "logps/chosen": -291.0929870605469, "logps/ref_chosen": -291.4709167480469, "logps/ref_rejected": -285.62982177734375, "logps/rejected": -285.6851501464844, "loss": 5.5294, "margin_dpo/margin_mean": 0.43321943283081055, "margin_dpo/margin_std": 1.0551120042800903, "step": 42 }, { "epoch": 0.09005235602094241, "grad_norm": 15.62813663482666, "learning_rate": 4.375e-07, "logits/chosen": 1.9382034540176392, "logits/rejected": 1.9245309829711914, "logps/chosen": -313.7782897949219, "logps/ref_chosen": -314.3768615722656, "logps/ref_rejected": -246.80313110351562, "logps/rejected": -246.7808380126953, "loss": 5.5273, "margin_dpo/margin_mean": 0.5762431621551514, "margin_dpo/margin_std": 0.9067457914352417, "step": 43 }, { "epoch": 0.09214659685863874, "grad_norm": 15.793681144714355, "learning_rate": 4.479166666666667e-07, "logits/chosen": 1.7762880325317383, "logits/rejected": 1.7065317630767822, "logps/chosen": -209.00802612304688, "logps/ref_chosen": -209.8181915283203, "logps/ref_rejected": -246.21340942382812, "logps/rejected": -246.0368194580078, "loss": 5.5223, "margin_dpo/margin_mean": 0.633570671081543, "margin_dpo/margin_std": 1.1882718801498413, "step": 44 }, { "epoch": 0.09424083769633508, "grad_norm": 16.929059982299805, "learning_rate": 4.5833333333333327e-07, "logits/chosen": 1.7606732845306396, "logits/rejected": 1.5792968273162842, "logps/chosen": -308.1605224609375, "logps/ref_chosen": -309.0930480957031, "logps/ref_rejected": -269.3559265136719, "logps/rejected": -268.9593200683594, "loss": 5.5261, "margin_dpo/margin_mean": 0.535961389541626, "margin_dpo/margin_std": 1.0746005773544312, "step": 45 }, { "epoch": 0.09633507853403141, "grad_norm": 16.489280700683594, "learning_rate": 4.6874999999999996e-07, "logits/chosen": 1.9223171472549438, "logits/rejected": 1.9758403301239014, "logps/chosen": -298.0412292480469, "logps/ref_chosen": -298.72467041015625, "logps/ref_rejected": -309.87786865234375, "logps/rejected": -309.4717712402344, "loss": 5.5236, "margin_dpo/margin_mean": 0.277274489402771, "margin_dpo/margin_std": 0.9340643882751465, "step": 46 }, { "epoch": 0.09842931937172775, "grad_norm": 13.506661415100098, "learning_rate": 4.791666666666667e-07, "logits/chosen": 1.6691988706588745, "logits/rejected": 2.0380465984344482, "logps/chosen": -215.84332275390625, "logps/ref_chosen": -216.43553161621094, "logps/ref_rejected": -292.6329345703125, "logps/rejected": -291.96148681640625, "loss": 5.5293, "margin_dpo/margin_mean": -0.0792464017868042, "margin_dpo/margin_std": 1.1294535398483276, "step": 47 }, { "epoch": 0.10052356020942409, "grad_norm": 14.86147403717041, "learning_rate": 4.895833333333333e-07, "logits/chosen": 2.211613178253174, "logits/rejected": 2.186110496520996, "logps/chosen": -234.05947875976562, "logps/ref_chosen": -234.77496337890625, "logps/ref_rejected": -240.41433715820312, "logps/rejected": -240.24525451660156, "loss": 5.5203, "margin_dpo/margin_mean": 0.5463833212852478, "margin_dpo/margin_std": 1.3220971822738647, "step": 48 }, { "epoch": 0.10261780104712041, "grad_norm": 15.43526840209961, "learning_rate": 5e-07, "logits/chosen": 1.7962108850479126, "logits/rejected": 1.9277849197387695, "logps/chosen": -245.73326110839844, "logps/ref_chosen": -246.7688446044922, "logps/ref_rejected": -253.47378540039062, "logps/rejected": -253.3438720703125, "loss": 5.5203, "margin_dpo/margin_mean": 0.9057276248931885, "margin_dpo/margin_std": 1.4221221208572388, "step": 49 }, { "epoch": 0.10471204188481675, "grad_norm": 15.111068725585938, "learning_rate": 4.999932966293553e-07, "logits/chosen": 2.2119665145874023, "logits/rejected": 2.335810422897339, "logps/chosen": -281.3116760253906, "logps/ref_chosen": -282.61981201171875, "logps/ref_rejected": -340.8515625, "logps/rejected": -340.31781005859375, "loss": 5.5284, "margin_dpo/margin_mean": 0.7743173837661743, "margin_dpo/margin_std": 1.4886585474014282, "step": 50 }, { "epoch": 0.1068062827225131, "grad_norm": 14.817649841308594, "learning_rate": 4.999731868769026e-07, "logits/chosen": 1.637377381324768, "logits/rejected": 1.7862030267715454, "logps/chosen": -244.794677734375, "logps/ref_chosen": -245.87562561035156, "logps/ref_rejected": -309.7420654296875, "logps/rejected": -309.6230773925781, "loss": 5.5202, "margin_dpo/margin_mean": 0.961925208568573, "margin_dpo/margin_std": 1.9343087673187256, "step": 51 }, { "epoch": 0.10890052356020942, "grad_norm": 17.035507202148438, "learning_rate": 4.99939671821067e-07, "logits/chosen": 1.8847734928131104, "logits/rejected": 2.039155960083008, "logps/chosen": -276.9980163574219, "logps/ref_chosen": -278.3123474121094, "logps/ref_rejected": -320.58203125, "logps/rejected": -319.9336853027344, "loss": 5.5067, "margin_dpo/margin_mean": 0.6659917235374451, "margin_dpo/margin_std": 1.508874773979187, "step": 52 }, { "epoch": 0.11099476439790576, "grad_norm": 15.945631980895996, "learning_rate": 4.998927532591591e-07, "logits/chosen": 2.085860013961792, "logits/rejected": 2.0801711082458496, "logps/chosen": -331.2710266113281, "logps/ref_chosen": -332.776123046875, "logps/ref_rejected": -325.1794128417969, "logps/rejected": -324.69622802734375, "loss": 5.5144, "margin_dpo/margin_mean": 1.0218517780303955, "margin_dpo/margin_std": 1.590319037437439, "step": 53 }, { "epoch": 0.1130890052356021, "grad_norm": 14.57484245300293, "learning_rate": 4.998324337072792e-07, "logits/chosen": 1.3913365602493286, "logits/rejected": 1.4456019401550293, "logps/chosen": -294.7577819824219, "logps/ref_chosen": -296.2243347167969, "logps/ref_rejected": -267.64251708984375, "logps/rejected": -267.3682861328125, "loss": 5.5131, "margin_dpo/margin_mean": 1.1922770738601685, "margin_dpo/margin_std": 1.47157621383667, "step": 54 }, { "epoch": 0.11518324607329843, "grad_norm": 12.808218955993652, "learning_rate": 4.997587164001815e-07, "logits/chosen": 1.8794647455215454, "logits/rejected": 1.8777508735656738, "logps/chosen": -197.05091857910156, "logps/ref_chosen": -198.1138916015625, "logps/ref_rejected": -185.93772888183594, "logps/rejected": -185.5297088623047, "loss": 5.522, "margin_dpo/margin_mean": 0.6549429893493652, "margin_dpo/margin_std": 1.0751301050186157, "step": 55 }, { "epoch": 0.11727748691099477, "grad_norm": 14.403154373168945, "learning_rate": 4.996716052911017e-07, "logits/chosen": 2.004265785217285, "logits/rejected": 1.965603232383728, "logps/chosen": -267.2569580078125, "logps/ref_chosen": -268.8618469238281, "logps/ref_rejected": -245.21348571777344, "logps/rejected": -244.97555541992188, "loss": 5.5085, "margin_dpo/margin_mean": 1.3669579029083252, "margin_dpo/margin_std": 1.768923282623291, "step": 56 }, { "epoch": 0.1193717277486911, "grad_norm": 17.60223960876465, "learning_rate": 4.99571105051544e-07, "logits/chosen": 2.1416828632354736, "logits/rejected": 1.8643951416015625, "logps/chosen": -286.81622314453125, "logps/ref_chosen": -288.4784851074219, "logps/ref_rejected": -239.400146484375, "logps/rejected": -238.46566772460938, "loss": 5.4919, "margin_dpo/margin_mean": 0.7277634739875793, "margin_dpo/margin_std": 1.487809658050537, "step": 57 }, { "epoch": 0.12146596858638743, "grad_norm": 14.885680198669434, "learning_rate": 4.994572210710314e-07, "logits/chosen": 1.8542314767837524, "logits/rejected": 1.8795952796936035, "logps/chosen": -276.6270446777344, "logps/ref_chosen": -278.2837219238281, "logps/ref_rejected": -262.5280456542969, "logps/rejected": -262.4251708984375, "loss": 5.5076, "margin_dpo/margin_mean": 1.5538146495819092, "margin_dpo/margin_std": 1.7451914548873901, "step": 58 }, { "epoch": 0.12356020942408377, "grad_norm": 15.256675720214844, "learning_rate": 4.993299594568162e-07, "logits/chosen": 1.695129632949829, "logits/rejected": 1.76768159866333, "logps/chosen": -231.92245483398438, "logps/ref_chosen": -232.77662658691406, "logps/ref_rejected": -226.2711181640625, "logps/rejected": -225.69354248046875, "loss": 5.5103, "margin_dpo/margin_mean": 0.27659308910369873, "margin_dpo/margin_std": 1.5459599494934082, "step": 59 }, { "epoch": 0.1256544502617801, "grad_norm": 14.66773509979248, "learning_rate": 4.991893270335525e-07, "logits/chosen": 1.8791348934173584, "logits/rejected": 1.587320327758789, "logps/chosen": -314.26800537109375, "logps/ref_chosen": -315.6903991699219, "logps/ref_rejected": -190.40899658203125, "logps/rejected": -189.97193908691406, "loss": 5.4951, "margin_dpo/margin_mean": 0.9853799939155579, "margin_dpo/margin_std": 1.7359843254089355, "step": 60 }, { "epoch": 0.12774869109947645, "grad_norm": 15.011839866638184, "learning_rate": 4.990353313429303e-07, "logits/chosen": 2.034388542175293, "logits/rejected": 2.065513849258423, "logps/chosen": -249.50173950195312, "logps/ref_chosen": -251.527099609375, "logps/ref_rejected": -261.0340270996094, "logps/rejected": -259.8365173339844, "loss": 5.503, "margin_dpo/margin_mean": 0.8278936147689819, "margin_dpo/margin_std": 1.6765094995498657, "step": 61 }, { "epoch": 0.12984293193717278, "grad_norm": 14.675267219543457, "learning_rate": 4.988679806432711e-07, "logits/chosen": 1.8862786293029785, "logits/rejected": 1.8161289691925049, "logps/chosen": -255.54144287109375, "logps/ref_chosen": -257.3919982910156, "logps/ref_rejected": -282.1814880371094, "logps/rejected": -281.52008056640625, "loss": 5.4967, "margin_dpo/margin_mean": 1.189134955406189, "margin_dpo/margin_std": 2.0515801906585693, "step": 62 }, { "epoch": 0.1319371727748691, "grad_norm": 14.802937507629395, "learning_rate": 4.986872839090852e-07, "logits/chosen": 2.1043763160705566, "logits/rejected": 2.2130091190338135, "logps/chosen": -320.39398193359375, "logps/ref_chosen": -322.24725341796875, "logps/ref_rejected": -327.70892333984375, "logps/rejected": -326.7320251464844, "loss": 5.4938, "margin_dpo/margin_mean": 0.876427948474884, "margin_dpo/margin_std": 2.8320491313934326, "step": 63 }, { "epoch": 0.13403141361256546, "grad_norm": 15.675727844238281, "learning_rate": 4.9849325083059e-07, "logits/chosen": 1.840427279472351, "logits/rejected": 2.098795175552368, "logps/chosen": -333.1174621582031, "logps/ref_chosen": -335.7379455566406, "logps/ref_rejected": -337.8742980957031, "logps/rejected": -337.04913330078125, "loss": 5.4794, "margin_dpo/margin_mean": 1.795319676399231, "margin_dpo/margin_std": 3.2046210765838623, "step": 64 }, { "epoch": 0.13612565445026178, "grad_norm": 14.844298362731934, "learning_rate": 4.982858918131906e-07, "logits/chosen": 1.8770238161087036, "logits/rejected": 1.9210056066513062, "logps/chosen": -309.9880676269531, "logps/ref_chosen": -312.36358642578125, "logps/ref_rejected": -300.220947265625, "logps/rejected": -298.6002502441406, "loss": 5.5002, "margin_dpo/margin_mean": 0.754831075668335, "margin_dpo/margin_std": 2.1830434799194336, "step": 65 }, { "epoch": 0.1382198952879581, "grad_norm": 14.79240894317627, "learning_rate": 4.980652179769217e-07, "logits/chosen": 1.8671329021453857, "logits/rejected": 2.058912515640259, "logps/chosen": -195.7026824951172, "logps/ref_chosen": -198.186767578125, "logps/ref_rejected": -248.18748474121094, "logps/rejected": -247.15882873535156, "loss": 5.4918, "margin_dpo/margin_mean": 1.4554123878479004, "margin_dpo/margin_std": 2.3160204887390137, "step": 66 }, { "epoch": 0.14031413612565444, "grad_norm": 14.211318969726562, "learning_rate": 4.978312411558517e-07, "logits/chosen": 2.104246139526367, "logits/rejected": 2.138582706451416, "logps/chosen": -289.1455078125, "logps/ref_chosen": -291.9940490722656, "logps/ref_rejected": -269.945068359375, "logps/rejected": -268.6217346191406, "loss": 5.4991, "margin_dpo/margin_mean": 1.5252022743225098, "margin_dpo/margin_std": 2.973376989364624, "step": 67 }, { "epoch": 0.1424083769633508, "grad_norm": 14.742173194885254, "learning_rate": 4.975839738974473e-07, "logits/chosen": 1.597095012664795, "logits/rejected": 1.4467945098876953, "logps/chosen": -287.52056884765625, "logps/ref_chosen": -289.9323425292969, "logps/ref_rejected": -225.7897491455078, "logps/rejected": -225.21658325195312, "loss": 5.4771, "margin_dpo/margin_mean": 1.8386340141296387, "margin_dpo/margin_std": 3.168964385986328, "step": 68 }, { "epoch": 0.14450261780104712, "grad_norm": 14.997187614440918, "learning_rate": 4.97323429461901e-07, "logits/chosen": 2.1530022621154785, "logits/rejected": 2.05501651763916, "logps/chosen": -263.3707580566406, "logps/ref_chosen": -266.7104797363281, "logps/ref_rejected": -229.5946502685547, "logps/rejected": -228.25326538085938, "loss": 5.4597, "margin_dpo/margin_mean": 1.9983257055282593, "margin_dpo/margin_std": 3.47263503074646, "step": 69 }, { "epoch": 0.14659685863874344, "grad_norm": 15.571727752685547, "learning_rate": 4.970496218214204e-07, "logits/chosen": 2.235600471496582, "logits/rejected": 2.3245067596435547, "logps/chosen": -265.7595520019531, "logps/ref_chosen": -268.6711120605469, "logps/ref_rejected": -261.61273193359375, "logps/rejected": -260.5652160644531, "loss": 5.4636, "margin_dpo/margin_mean": 1.864060878753662, "margin_dpo/margin_std": 2.7824549674987793, "step": 70 }, { "epoch": 0.1486910994764398, "grad_norm": 14.973682403564453, "learning_rate": 4.967625656594781e-07, "logits/chosen": 1.9091215133666992, "logits/rejected": 1.9661266803741455, "logps/chosen": -241.69586181640625, "logps/ref_chosen": -244.97821044921875, "logps/ref_rejected": -263.7174377441406, "logps/rejected": -262.50848388671875, "loss": 5.469, "margin_dpo/margin_mean": 2.0734434127807617, "margin_dpo/margin_std": 4.373683929443359, "step": 71 }, { "epoch": 0.15078534031413612, "grad_norm": 14.649141311645508, "learning_rate": 4.964622763700252e-07, "logits/chosen": 1.8277561664581299, "logits/rejected": 1.8917427062988281, "logps/chosen": -276.90264892578125, "logps/ref_chosen": -280.0353698730469, "logps/ref_rejected": -291.1289367675781, "logps/rejected": -289.7200927734375, "loss": 5.4668, "margin_dpo/margin_mean": 1.7238655090332031, "margin_dpo/margin_std": 3.216670274734497, "step": 72 }, { "epoch": 0.15287958115183245, "grad_norm": 14.681085586547852, "learning_rate": 4.961487700566646e-07, "logits/chosen": 2.040257453918457, "logits/rejected": 2.0101261138916016, "logps/chosen": -237.6248321533203, "logps/ref_chosen": -241.37384033203125, "logps/ref_rejected": -227.28871154785156, "logps/rejected": -224.7103271484375, "loss": 5.4684, "margin_dpo/margin_mean": 1.1706353425979614, "margin_dpo/margin_std": 2.684138774871826, "step": 73 }, { "epoch": 0.1549738219895288, "grad_norm": 16.057296752929688, "learning_rate": 4.958220635317885e-07, "logits/chosen": 1.7149076461791992, "logits/rejected": 1.616335153579712, "logps/chosen": -427.9046630859375, "logps/ref_chosen": -432.6361389160156, "logps/ref_rejected": -408.990478515625, "logps/rejected": -406.4610595703125, "loss": 5.4703, "margin_dpo/margin_mean": 2.2020528316497803, "margin_dpo/margin_std": 3.4136807918548584, "step": 74 }, { "epoch": 0.15706806282722513, "grad_norm": 15.3256254196167, "learning_rate": 4.954821743156767e-07, "logits/chosen": 1.8307483196258545, "logits/rejected": 1.8694071769714355, "logps/chosen": -277.4227294921875, "logps/ref_chosen": -282.2913513183594, "logps/ref_rejected": -227.30093383789062, "logps/rejected": -225.89971923828125, "loss": 5.4384, "margin_dpo/margin_mean": 3.4673845767974854, "margin_dpo/margin_std": 3.8357293605804443, "step": 75 }, { "epoch": 0.15916230366492146, "grad_norm": 16.74871253967285, "learning_rate": 4.951291206355559e-07, "logits/chosen": 1.9061857461929321, "logits/rejected": 1.6594858169555664, "logps/chosen": -272.63018798828125, "logps/ref_chosen": -277.90081787109375, "logps/ref_rejected": -214.1353302001953, "logps/rejected": -211.89590454101562, "loss": 5.431, "margin_dpo/margin_mean": 3.031224250793457, "margin_dpo/margin_std": 3.183443546295166, "step": 76 }, { "epoch": 0.1612565445026178, "grad_norm": 18.260398864746094, "learning_rate": 4.947629214246236e-07, "logits/chosen": 2.142491102218628, "logits/rejected": 2.1160454750061035, "logps/chosen": -278.9680480957031, "logps/ref_chosen": -283.3741455078125, "logps/ref_rejected": -239.51246643066406, "logps/rejected": -237.45001220703125, "loss": 5.4527, "margin_dpo/margin_mean": 2.3436222076416016, "margin_dpo/margin_std": 3.3090319633483887, "step": 77 }, { "epoch": 0.16335078534031414, "grad_norm": 14.633062362670898, "learning_rate": 4.943835963210323e-07, "logits/chosen": 1.6990811824798584, "logits/rejected": 1.6937521696090698, "logps/chosen": -202.76388549804688, "logps/ref_chosen": -207.1702423095703, "logps/ref_rejected": -196.26866149902344, "logps/rejected": -194.35032653808594, "loss": 5.4294, "margin_dpo/margin_mean": 2.4880149364471436, "margin_dpo/margin_std": 3.424355983734131, "step": 78 }, { "epoch": 0.16544502617801046, "grad_norm": 16.774738311767578, "learning_rate": 4.939911656668361e-07, "logits/chosen": 1.9445700645446777, "logits/rejected": 2.229759454727173, "logps/chosen": -208.6917724609375, "logps/ref_chosen": -212.90396118164062, "logps/ref_rejected": -242.32528686523438, "logps/rejected": -239.5742950439453, "loss": 5.4268, "margin_dpo/margin_mean": 1.461201786994934, "margin_dpo/margin_std": 4.005527973175049, "step": 79 }, { "epoch": 0.16753926701570682, "grad_norm": 15.637154579162598, "learning_rate": 4.935856505068998e-07, "logits/chosen": 1.3914299011230469, "logits/rejected": 1.5492628812789917, "logps/chosen": -251.85031127929688, "logps/ref_chosen": -257.9057312011719, "logps/ref_rejected": -246.391845703125, "logps/rejected": -243.01177978515625, "loss": 5.4504, "margin_dpo/margin_mean": 2.6753690242767334, "margin_dpo/margin_std": 3.9176571369171143, "step": 80 }, { "epoch": 0.16963350785340314, "grad_norm": 14.57850456237793, "learning_rate": 4.93167072587771e-07, "logits/chosen": 2.009546995162964, "logits/rejected": 2.2237491607666016, "logps/chosen": -220.03717041015625, "logps/ref_chosen": -226.68576049804688, "logps/ref_rejected": -215.2713623046875, "logps/rejected": -212.5276336669922, "loss": 5.4326, "margin_dpo/margin_mean": 3.9048891067504883, "margin_dpo/margin_std": 3.982060432434082, "step": 81 }, { "epoch": 0.17172774869109947, "grad_norm": 15.640838623046875, "learning_rate": 4.92735454356513e-07, "logits/chosen": 1.8449329137802124, "logits/rejected": 1.773772954940796, "logps/chosen": -290.084228515625, "logps/ref_chosen": -296.12799072265625, "logps/ref_rejected": -261.3748474121094, "logps/rejected": -258.4228515625, "loss": 5.4297, "margin_dpo/margin_mean": 3.091761589050293, "margin_dpo/margin_std": 4.757362365722656, "step": 82 }, { "epoch": 0.17382198952879582, "grad_norm": 15.583915710449219, "learning_rate": 4.922908189595017e-07, "logits/chosen": 1.8198847770690918, "logits/rejected": 1.8020501136779785, "logps/chosen": -255.5862274169922, "logps/ref_chosen": -261.39862060546875, "logps/ref_rejected": -279.9942626953125, "logps/rejected": -276.3531799316406, "loss": 5.4115, "margin_dpo/margin_mean": 2.1713221073150635, "margin_dpo/margin_std": 4.908409118652344, "step": 83 }, { "epoch": 0.17591623036649215, "grad_norm": 15.165980339050293, "learning_rate": 4.918331902411841e-07, "logits/chosen": 2.0596227645874023, "logits/rejected": 1.9471057653427124, "logps/chosen": -385.02862548828125, "logps/ref_chosen": -392.54547119140625, "logps/ref_rejected": -342.066162109375, "logps/rejected": -336.90234375, "loss": 5.4311, "margin_dpo/margin_mean": 2.353001356124878, "margin_dpo/margin_std": 5.491085052490234, "step": 84 }, { "epoch": 0.17801047120418848, "grad_norm": 13.898218154907227, "learning_rate": 4.913625927427995e-07, "logits/chosen": 1.505142331123352, "logits/rejected": 1.6710948944091797, "logps/chosen": -186.115478515625, "logps/ref_chosen": -192.9306640625, "logps/ref_rejected": -231.5825653076172, "logps/rejected": -227.84988403320312, "loss": 5.4719, "margin_dpo/margin_mean": 3.082519054412842, "margin_dpo/margin_std": 4.139708995819092, "step": 85 }, { "epoch": 0.18010471204188483, "grad_norm": 16.21003532409668, "learning_rate": 4.908790517010636e-07, "logits/chosen": 1.800257682800293, "logits/rejected": 1.8198274374008179, "logps/chosen": -306.5592346191406, "logps/ref_chosen": -313.5525207519531, "logps/ref_rejected": -285.59228515625, "logps/rejected": -280.493896484375, "loss": 5.4098, "margin_dpo/margin_mean": 1.8949062824249268, "margin_dpo/margin_std": 6.387627601623535, "step": 86 }, { "epoch": 0.18219895287958116, "grad_norm": 15.313575744628906, "learning_rate": 4.903825930468148e-07, "logits/chosen": 1.5017904043197632, "logits/rejected": 1.4683315753936768, "logps/chosen": -227.59046936035156, "logps/ref_chosen": -236.03445434570312, "logps/ref_rejected": -225.67410278320312, "logps/rejected": -221.80938720703125, "loss": 5.3899, "margin_dpo/margin_mean": 4.5792436599731445, "margin_dpo/margin_std": 6.2218732833862305, "step": 87 }, { "epoch": 0.18429319371727748, "grad_norm": 14.063505172729492, "learning_rate": 4.898732434036243e-07, "logits/chosen": 1.7088102102279663, "logits/rejected": 1.6172915697097778, "logps/chosen": -273.8514709472656, "logps/ref_chosen": -280.1703186035156, "logps/ref_rejected": -219.1881103515625, "logps/rejected": -216.62831115722656, "loss": 5.4232, "margin_dpo/margin_mean": 3.7590301036834717, "margin_dpo/margin_std": 6.66114616394043, "step": 88 }, { "epoch": 0.18638743455497384, "grad_norm": 15.837873458862305, "learning_rate": 4.893510300863676e-07, "logits/chosen": 2.1236486434936523, "logits/rejected": 2.1029891967773438, "logps/chosen": -202.62815856933594, "logps/ref_chosen": -211.3966827392578, "logps/ref_rejected": -171.04954528808594, "logps/rejected": -165.4285430908203, "loss": 5.3997, "margin_dpo/margin_mean": 3.1475319862365723, "margin_dpo/margin_std": 4.286096572875977, "step": 89 }, { "epoch": 0.18848167539267016, "grad_norm": 15.2637357711792, "learning_rate": 4.8881598109976e-07, "logits/chosen": 2.190295696258545, "logits/rejected": 2.0822367668151855, "logps/chosen": -271.2816467285156, "logps/ref_chosen": -280.9217834472656, "logps/ref_rejected": -245.75814819335938, "logps/rejected": -239.31825256347656, "loss": 5.4166, "margin_dpo/margin_mean": 3.2002599239349365, "margin_dpo/margin_std": 5.397155284881592, "step": 90 }, { "epoch": 0.1905759162303665, "grad_norm": 14.667741775512695, "learning_rate": 4.882681251368548e-07, "logits/chosen": 1.3757317066192627, "logits/rejected": 1.691314697265625, "logps/chosen": -121.55278778076172, "logps/ref_chosen": -130.23472595214844, "logps/ref_rejected": -177.76895141601562, "logps/rejected": -172.3560028076172, "loss": 5.4142, "margin_dpo/margin_mean": 3.2689881324768066, "margin_dpo/margin_std": 4.278058052062988, "step": 91 }, { "epoch": 0.19267015706806281, "grad_norm": 15.648965835571289, "learning_rate": 4.877074915775048e-07, "logits/chosen": 1.6639858484268188, "logits/rejected": 1.4799772500991821, "logps/chosen": -334.3116455078125, "logps/ref_chosen": -344.4306335449219, "logps/ref_rejected": -276.291748046875, "logps/rejected": -270.984375, "loss": 5.4004, "margin_dpo/margin_mean": 4.811601161956787, "margin_dpo/margin_std": 5.4291887283325195, "step": 92 }, { "epoch": 0.19476439790575917, "grad_norm": 14.095191955566406, "learning_rate": 4.871341104867864e-07, "logits/chosen": 1.9756680727005005, "logits/rejected": 1.923811674118042, "logps/chosen": -196.8525390625, "logps/ref_chosen": -206.1533660888672, "logps/ref_rejected": -231.759033203125, "logps/rejected": -227.26364135742188, "loss": 5.408, "margin_dpo/margin_mean": 4.805446147918701, "margin_dpo/margin_std": 6.056692600250244, "step": 93 }, { "epoch": 0.1968586387434555, "grad_norm": 15.55691909790039, "learning_rate": 4.865480126133871e-07, "logits/chosen": 1.7521295547485352, "logits/rejected": 1.8241287469863892, "logps/chosen": -250.36639404296875, "logps/ref_chosen": -261.2528381347656, "logps/ref_rejected": -269.2928771972656, "logps/rejected": -263.65771484375, "loss": 5.3919, "margin_dpo/margin_mean": 5.2512712478637695, "margin_dpo/margin_std": 7.996842384338379, "step": 94 }, { "epoch": 0.19895287958115182, "grad_norm": 16.091550827026367, "learning_rate": 4.859492293879573e-07, "logits/chosen": 1.884320855140686, "logits/rejected": 1.6460635662078857, "logps/chosen": -334.9806823730469, "logps/ref_chosen": -345.480224609375, "logps/ref_rejected": -294.0064697265625, "logps/rejected": -288.2855529785156, "loss": 5.3719, "margin_dpo/margin_mean": 4.778585433959961, "margin_dpo/margin_std": 8.979715347290039, "step": 95 }, { "epoch": 0.20104712041884817, "grad_norm": 15.211227416992188, "learning_rate": 4.853377929214243e-07, "logits/chosen": 1.442068099975586, "logits/rejected": 1.3402963876724243, "logps/chosen": -239.22625732421875, "logps/ref_chosen": -249.85205078125, "logps/ref_rejected": -274.1024169921875, "logps/rejected": -266.5990295410156, "loss": 5.3763, "margin_dpo/margin_mean": 3.1224074363708496, "margin_dpo/margin_std": 6.072546482086182, "step": 96 }, { "epoch": 0.2031413612565445, "grad_norm": 15.3523588180542, "learning_rate": 4.847137360032699e-07, "logits/chosen": 1.682770013809204, "logits/rejected": 1.789080262184143, "logps/chosen": -224.3016815185547, "logps/ref_chosen": -233.62025451660156, "logps/ref_rejected": -258.32647705078125, "logps/rejected": -253.4158935546875, "loss": 5.3694, "margin_dpo/margin_mean": 4.407979965209961, "margin_dpo/margin_std": 6.528227806091309, "step": 97 }, { "epoch": 0.20523560209424083, "grad_norm": 15.678691864013672, "learning_rate": 4.84077092099773e-07, "logits/chosen": 1.9161100387573242, "logits/rejected": 2.1308605670928955, "logps/chosen": -256.5081787109375, "logps/ref_chosen": -267.27911376953125, "logps/ref_rejected": -335.98284912109375, "logps/rejected": -327.7537841796875, "loss": 5.3668, "margin_dpo/margin_mean": 2.5418522357940674, "margin_dpo/margin_std": 7.70307731628418, "step": 98 }, { "epoch": 0.20732984293193718, "grad_norm": 15.061153411865234, "learning_rate": 4.834278953522137e-07, "logits/chosen": 1.8618698120117188, "logits/rejected": 1.8229163885116577, "logps/chosen": -275.3375244140625, "logps/ref_chosen": -285.90435791015625, "logps/ref_rejected": -278.0072021484375, "logps/rejected": -271.364013671875, "loss": 5.3595, "margin_dpo/margin_mean": 3.9236538410186768, "margin_dpo/margin_std": 9.717151641845703, "step": 99 }, { "epoch": 0.2094240837696335, "grad_norm": 15.537282943725586, "learning_rate": 4.827661805750437e-07, "logits/chosen": 1.6382941007614136, "logits/rejected": 1.5340853929519653, "logps/chosen": -327.0155944824219, "logps/ref_chosen": -335.2471008300781, "logps/ref_rejected": -304.7597351074219, "logps/rejected": -300.1502380371094, "loss": 5.3785, "margin_dpo/margin_mean": 3.622096061706543, "margin_dpo/margin_std": 7.071560859680176, "step": 100 }, { "epoch": 0.21151832460732983, "grad_norm": 15.370798110961914, "learning_rate": 4.820919832540181e-07, "logits/chosen": 1.507110357284546, "logits/rejected": 1.7560882568359375, "logps/chosen": -262.4298400878906, "logps/ref_chosen": -272.9364318847656, "logps/ref_rejected": -271.82366943359375, "logps/rejected": -268.6051330566406, "loss": 5.3734, "margin_dpo/margin_mean": 7.288076400756836, "margin_dpo/margin_std": 8.229599952697754, "step": 101 }, { "epoch": 0.2136125654450262, "grad_norm": 15.161721229553223, "learning_rate": 4.814053395442932e-07, "logits/chosen": 1.76149582862854, "logits/rejected": 1.8778889179229736, "logps/chosen": -151.35018920898438, "logps/ref_chosen": -159.15536499023438, "logps/ref_rejected": -191.47312927246094, "logps/rejected": -188.49746704101562, "loss": 5.305, "margin_dpo/margin_mean": 4.829489707946777, "margin_dpo/margin_std": 7.208276271820068, "step": 102 }, { "epoch": 0.2157068062827225, "grad_norm": 15.669957160949707, "learning_rate": 4.807062862684873e-07, "logits/chosen": 2.092226505279541, "logits/rejected": 2.202396869659424, "logps/chosen": -291.3448486328125, "logps/ref_chosen": -301.0699768066406, "logps/ref_rejected": -306.12469482421875, "logps/rejected": -298.96844482421875, "loss": 5.3684, "margin_dpo/margin_mean": 2.5688788890838623, "margin_dpo/margin_std": 8.519815444946289, "step": 103 }, { "epoch": 0.21780104712041884, "grad_norm": 14.514609336853027, "learning_rate": 4.799948609147061e-07, "logits/chosen": 1.875953197479248, "logits/rejected": 1.7595632076263428, "logps/chosen": -308.81158447265625, "logps/ref_chosen": -316.44036865234375, "logps/ref_rejected": -245.41790771484375, "logps/rejected": -242.4790496826172, "loss": 5.3826, "margin_dpo/margin_mean": 4.689910411834717, "margin_dpo/margin_std": 10.412727355957031, "step": 104 }, { "epoch": 0.2198952879581152, "grad_norm": 17.934940338134766, "learning_rate": 4.792711016345321e-07, "logits/chosen": 1.8088258504867554, "logits/rejected": 1.6915315389633179, "logps/chosen": -253.7894744873047, "logps/ref_chosen": -264.70599365234375, "logps/ref_rejected": -232.14236450195312, "logps/rejected": -229.86798095703125, "loss": 5.241, "margin_dpo/margin_mean": 8.642158508300781, "margin_dpo/margin_std": 9.996341705322266, "step": 105 }, { "epoch": 0.22198952879581152, "grad_norm": 16.461326599121094, "learning_rate": 4.785350472409791e-07, "logits/chosen": 1.8355200290679932, "logits/rejected": 2.0365209579467773, "logps/chosen": -274.2940673828125, "logps/ref_chosen": -280.6784973144531, "logps/ref_rejected": -353.0090026855469, "logps/rejected": -352.0072021484375, "loss": 5.3413, "margin_dpo/margin_mean": 5.382654666900635, "margin_dpo/margin_std": 9.296738624572754, "step": 106 }, { "epoch": 0.22408376963350785, "grad_norm": 16.5199031829834, "learning_rate": 4.777867372064105e-07, "logits/chosen": 1.6165478229522705, "logits/rejected": 1.5343804359436035, "logps/chosen": -327.95794677734375, "logps/ref_chosen": -336.91058349609375, "logps/ref_rejected": -280.02325439453125, "logps/rejected": -277.4742431640625, "loss": 5.2697, "margin_dpo/margin_mean": 6.403599262237549, "margin_dpo/margin_std": 9.143040657043457, "step": 107 }, { "epoch": 0.2261780104712042, "grad_norm": 16.170669555664062, "learning_rate": 4.770262116604223e-07, "logits/chosen": 1.8304221630096436, "logits/rejected": 2.0288898944854736, "logps/chosen": -224.6934356689453, "logps/ref_chosen": -232.04891967773438, "logps/ref_rejected": -248.3793487548828, "logps/rejected": -246.96351623535156, "loss": 5.2351, "margin_dpo/margin_mean": 5.9396281242370605, "margin_dpo/margin_std": 9.726874351501465, "step": 108 }, { "epoch": 0.22827225130890053, "grad_norm": 17.59023094177246, "learning_rate": 4.7625351138769166e-07, "logits/chosen": 1.8960250616073608, "logits/rejected": 1.919461965560913, "logps/chosen": -236.6331329345703, "logps/ref_chosen": -243.42401123046875, "logps/ref_rejected": -276.1861877441406, "logps/rejected": -274.469482421875, "loss": 5.2323, "margin_dpo/margin_mean": 5.074185371398926, "margin_dpo/margin_std": 8.536012649536133, "step": 109 }, { "epoch": 0.23036649214659685, "grad_norm": 15.948025703430176, "learning_rate": 4.75468677825789e-07, "logits/chosen": 1.6093004941940308, "logits/rejected": 1.6397433280944824, "logps/chosen": -234.94406127929688, "logps/ref_chosen": -242.5493621826172, "logps/ref_rejected": -195.59750366210938, "logps/rejected": -193.1940155029297, "loss": 5.246, "margin_dpo/margin_mean": 5.201825141906738, "margin_dpo/margin_std": 10.898710250854492, "step": 110 }, { "epoch": 0.2324607329842932, "grad_norm": 18.407573699951172, "learning_rate": 4.7467175306295647e-07, "logits/chosen": 1.6897281408309937, "logits/rejected": 1.7771556377410889, "logps/chosen": -272.2618408203125, "logps/ref_chosen": -279.930908203125, "logps/ref_rejected": -281.9147644042969, "logps/rejected": -282.97882080078125, "loss": 5.223, "margin_dpo/margin_mean": 8.733101844787598, "margin_dpo/margin_std": 11.250627517700195, "step": 111 }, { "epoch": 0.23455497382198953, "grad_norm": 14.606575012207031, "learning_rate": 4.7386277983585053e-07, "logits/chosen": 1.776769757270813, "logits/rejected": 1.8782975673675537, "logps/chosen": -243.624755859375, "logps/ref_chosen": -246.89129638671875, "logps/ref_rejected": -266.50506591796875, "logps/rejected": -265.4157409667969, "loss": 5.395, "margin_dpo/margin_mean": 2.177186965942383, "margin_dpo/margin_std": 10.55683422088623, "step": 112 }, { "epoch": 0.23664921465968586, "grad_norm": 16.843202590942383, "learning_rate": 4.7304180152725024e-07, "logits/chosen": 1.4505597352981567, "logits/rejected": 1.5904918909072876, "logps/chosen": -269.72711181640625, "logps/ref_chosen": -276.4613342285156, "logps/ref_rejected": -341.7659912109375, "logps/rejected": -342.48956298828125, "loss": 5.1716, "margin_dpo/margin_mean": 7.457816123962402, "margin_dpo/margin_std": 12.05935287475586, "step": 113 }, { "epoch": 0.2387434554973822, "grad_norm": 16.080974578857422, "learning_rate": 4.7220886216373085e-07, "logits/chosen": 1.6391416788101196, "logits/rejected": 1.5624871253967285, "logps/chosen": -247.58502197265625, "logps/ref_chosen": -251.4463653564453, "logps/ref_rejected": -210.03152465820312, "logps/rejected": -213.17721557617188, "loss": 5.3559, "margin_dpo/margin_mean": 7.00706672668457, "margin_dpo/margin_std": 8.853893280029297, "step": 114 }, { "epoch": 0.24083769633507854, "grad_norm": 15.944089889526367, "learning_rate": 4.7136400641330245e-07, "logits/chosen": 1.8735270500183105, "logits/rejected": 1.6029636859893799, "logps/chosen": -253.3223876953125, "logps/ref_chosen": -257.82574462890625, "logps/ref_rejected": -192.41648864746094, "logps/rejected": -191.51156616210938, "loss": 5.3403, "margin_dpo/margin_mean": 3.5983924865722656, "margin_dpo/margin_std": 8.953197479248047, "step": 115 }, { "epoch": 0.24293193717277486, "grad_norm": 16.291976928710938, "learning_rate": 4.70507279583015e-07, "logits/chosen": 1.695469856262207, "logits/rejected": 1.8081481456756592, "logps/chosen": -242.69943237304688, "logps/ref_chosen": -248.17518615722656, "logps/ref_rejected": -274.10870361328125, "logps/rejected": -276.470703125, "loss": 5.2459, "margin_dpo/margin_mean": 7.837741851806641, "margin_dpo/margin_std": 9.762773513793945, "step": 116 }, { "epoch": 0.2450261780104712, "grad_norm": 16.642024993896484, "learning_rate": 4.6963872761652834e-07, "logits/chosen": 1.6590253114700317, "logits/rejected": 1.4430992603302002, "logps/chosen": -229.59909057617188, "logps/ref_chosen": -235.29620361328125, "logps/ref_rejected": -190.87095642089844, "logps/rejected": -194.7079620361328, "loss": 5.2344, "margin_dpo/margin_mean": 9.53414535522461, "margin_dpo/margin_std": 9.101675033569336, "step": 117 }, { "epoch": 0.24712041884816754, "grad_norm": 20.776172637939453, "learning_rate": 4.687583970916486e-07, "logits/chosen": 1.6007872819900513, "logits/rejected": 1.6555330753326416, "logps/chosen": -256.0022277832031, "logps/ref_chosen": -260.44781494140625, "logps/ref_rejected": -308.3326416015625, "logps/rejected": -313.2330627441406, "loss": 5.2168, "margin_dpo/margin_mean": 9.346000671386719, "margin_dpo/margin_std": 13.443426132202148, "step": 118 }, { "epoch": 0.24921465968586387, "grad_norm": 15.843025207519531, "learning_rate": 4.6786633521783005e-07, "logits/chosen": 1.9080662727355957, "logits/rejected": 2.017760753631592, "logps/chosen": -282.1200866699219, "logps/ref_chosen": -286.9692687988281, "logps/ref_rejected": -331.7510986328125, "logps/rejected": -331.0477600097656, "loss": 5.2789, "margin_dpo/margin_mean": 4.145843982696533, "margin_dpo/margin_std": 13.910889625549316, "step": 119 }, { "epoch": 0.2513089005235602, "grad_norm": 16.17180061340332, "learning_rate": 4.669625898336438e-07, "logits/chosen": 1.9562854766845703, "logits/rejected": 1.8660322427749634, "logps/chosen": -278.0622253417969, "logps/ref_chosen": -281.98077392578125, "logps/ref_rejected": -283.52679443359375, "logps/rejected": -288.8341369628906, "loss": 5.2221, "margin_dpo/margin_mean": 9.225922584533691, "margin_dpo/margin_std": 13.011504173278809, "step": 120 }, { "epoch": 0.2534031413612565, "grad_norm": 14.78394889831543, "learning_rate": 4.6604720940421207e-07, "logits/chosen": 1.1911287307739258, "logits/rejected": 1.5024229288101196, "logps/chosen": -144.0179443359375, "logps/ref_chosen": -145.69662475585938, "logps/ref_rejected": -195.21612548828125, "logps/rejected": -199.57223510742188, "loss": 5.3557, "margin_dpo/margin_mean": 6.034780025482178, "margin_dpo/margin_std": 9.371333122253418, "step": 121 }, { "epoch": 0.2554973821989529, "grad_norm": 15.84109115600586, "learning_rate": 4.651202430186092e-07, "logits/chosen": 1.7703770399093628, "logits/rejected": 2.0983457565307617, "logps/chosen": -245.3528289794922, "logps/ref_chosen": -252.1569366455078, "logps/ref_rejected": -309.68548583984375, "logps/rejected": -306.59942626953125, "loss": 5.2869, "margin_dpo/margin_mean": 3.718092441558838, "margin_dpo/margin_std": 17.503427505493164, "step": 122 }, { "epoch": 0.25759162303664923, "grad_norm": 18.21240997314453, "learning_rate": 4.6418174038722924e-07, "logits/chosen": 1.6439917087554932, "logits/rejected": 1.4977948665618896, "logps/chosen": -358.5697326660156, "logps/ref_chosen": -366.5253601074219, "logps/ref_rejected": -285.2503662109375, "logps/rejected": -286.8184814453125, "loss": 5.1686, "margin_dpo/margin_mean": 9.523737907409668, "margin_dpo/margin_std": 12.473346710205078, "step": 123 }, { "epoch": 0.25968586387434556, "grad_norm": 16.816696166992188, "learning_rate": 4.6323175183912023e-07, "logits/chosen": 1.4895159006118774, "logits/rejected": 1.6259382963180542, "logps/chosen": -244.55775451660156, "logps/ref_chosen": -251.4420623779297, "logps/ref_rejected": -231.0302734375, "logps/rejected": -230.74337768554688, "loss": 5.2019, "margin_dpo/margin_mean": 6.597394943237305, "margin_dpo/margin_std": 15.111127853393555, "step": 124 }, { "epoch": 0.2617801047120419, "grad_norm": 16.386260986328125, "learning_rate": 4.6227032831928483e-07, "logits/chosen": 1.6032323837280273, "logits/rejected": 1.598075032234192, "logps/chosen": -242.79583740234375, "logps/ref_chosen": -248.3984375, "logps/ref_rejected": -307.77557373046875, "logps/rejected": -308.05059814453125, "loss": 5.2189, "margin_dpo/margin_mean": 5.877676010131836, "margin_dpo/margin_std": 13.811307907104492, "step": 125 }, { "epoch": 0.2638743455497382, "grad_norm": 16.69825553894043, "learning_rate": 4.612975213859487e-07, "logits/chosen": 1.7347309589385986, "logits/rejected": 1.9158421754837036, "logps/chosen": -291.75244140625, "logps/ref_chosen": -295.82366943359375, "logps/ref_rejected": -295.2666931152344, "logps/rejected": -299.0240783691406, "loss": 5.1658, "margin_dpo/margin_mean": 7.8285651206970215, "margin_dpo/margin_std": 13.573448181152344, "step": 126 }, { "epoch": 0.26596858638743454, "grad_norm": 16.867650985717773, "learning_rate": 4.603133832077953e-07, "logits/chosen": 1.1577448844909668, "logits/rejected": 1.106504201889038, "logps/chosen": -273.8604736328125, "logps/ref_chosen": -279.496337890625, "logps/ref_rejected": -278.802978515625, "logps/rejected": -282.5022277832031, "loss": 5.1019, "margin_dpo/margin_mean": 9.335136413574219, "margin_dpo/margin_std": 13.274786949157715, "step": 127 }, { "epoch": 0.2680628272251309, "grad_norm": 16.078149795532227, "learning_rate": 4.5931796656116837e-07, "logits/chosen": 1.4048773050308228, "logits/rejected": 1.4003376960754395, "logps/chosen": -258.86883544921875, "logps/ref_chosen": -264.52252197265625, "logps/ref_rejected": -239.76937866210938, "logps/rejected": -247.31756591796875, "loss": 5.057, "margin_dpo/margin_mean": 13.201865196228027, "margin_dpo/margin_std": 13.145772933959961, "step": 128 }, { "epoch": 0.27015706806282724, "grad_norm": 16.511791229248047, "learning_rate": 4.5831132482724193e-07, "logits/chosen": 1.5169470310211182, "logits/rejected": 1.6699875593185425, "logps/chosen": -290.08258056640625, "logps/ref_chosen": -296.95233154296875, "logps/ref_rejected": -260.0984802246094, "logps/rejected": -267.3520812988281, "loss": 5.1034, "margin_dpo/margin_mean": 14.12340259552002, "margin_dpo/margin_std": 17.277435302734375, "step": 129 }, { "epoch": 0.27225130890052357, "grad_norm": 21.19081687927246, "learning_rate": 4.5729351198915705e-07, "logits/chosen": 1.5894699096679688, "logits/rejected": 1.8434865474700928, "logps/chosen": -263.253173828125, "logps/ref_chosen": -274.7286682128906, "logps/ref_rejected": -325.187255859375, "logps/rejected": -327.59503173828125, "loss": 5.1333, "margin_dpo/margin_mean": 13.883302688598633, "margin_dpo/margin_std": 18.286108016967773, "step": 130 }, { "epoch": 0.2743455497382199, "grad_norm": 17.56028175354004, "learning_rate": 4.5626458262912735e-07, "logits/chosen": 1.44374680519104, "logits/rejected": 1.400553822517395, "logps/chosen": -270.84796142578125, "logps/ref_chosen": -279.3233642578125, "logps/ref_rejected": -299.2681884765625, "logps/rejected": -304.762451171875, "loss": 5.2149, "margin_dpo/margin_mean": 13.969644546508789, "margin_dpo/margin_std": 20.493703842163086, "step": 131 }, { "epoch": 0.2764397905759162, "grad_norm": 17.945659637451172, "learning_rate": 4.5522459192551166e-07, "logits/chosen": 1.5821537971496582, "logits/rejected": 1.6266090869903564, "logps/chosen": -281.5635681152344, "logps/ref_chosen": -291.3346862792969, "logps/ref_rejected": -283.13311767578125, "logps/rejected": -291.1026306152344, "loss": 5.1007, "margin_dpo/margin_mean": 17.740650177001953, "margin_dpo/margin_std": 17.511295318603516, "step": 132 }, { "epoch": 0.27853403141361255, "grad_norm": 16.447092056274414, "learning_rate": 4.541735956498554e-07, "logits/chosen": 1.6146799325942993, "logits/rejected": 1.5560858249664307, "logps/chosen": -223.23194885253906, "logps/ref_chosen": -233.71875, "logps/ref_rejected": -216.53781127929688, "logps/rejected": -223.18417358398438, "loss": 5.0683, "margin_dpo/margin_mean": 17.133150100708008, "margin_dpo/margin_std": 14.197755813598633, "step": 133 }, { "epoch": 0.2806282722513089, "grad_norm": 20.631309509277344, "learning_rate": 4.5311165016389914e-07, "logits/chosen": 1.920145869255066, "logits/rejected": 1.981586217880249, "logps/chosen": -348.9212951660156, "logps/ref_chosen": -348.29547119140625, "logps/ref_rejected": -343.04510498046875, "logps/rejected": -351.0985412597656, "loss": 5.22, "margin_dpo/margin_mean": 7.427634239196777, "margin_dpo/margin_std": 17.99872589111328, "step": 134 }, { "epoch": 0.28272251308900526, "grad_norm": 17.117826461791992, "learning_rate": 4.520388124165564e-07, "logits/chosen": 1.1408627033233643, "logits/rejected": 0.9298585057258606, "logps/chosen": -226.50486755371094, "logps/ref_chosen": -232.59129333496094, "logps/ref_rejected": -175.74066162109375, "logps/rejected": -181.21482849121094, "loss": 5.0762, "margin_dpo/margin_mean": 11.560598373413086, "margin_dpo/margin_std": 14.716351509094238, "step": 135 }, { "epoch": 0.2848167539267016, "grad_norm": 19.100990295410156, "learning_rate": 4.5095513994085974e-07, "logits/chosen": 1.0842311382293701, "logits/rejected": 1.3095265626907349, "logps/chosen": -183.53028869628906, "logps/ref_chosen": -189.21795654296875, "logps/ref_rejected": -191.75979614257812, "logps/rejected": -200.99093627929688, "loss": 5.1055, "margin_dpo/margin_mean": 14.91882038116455, "margin_dpo/margin_std": 16.662824630737305, "step": 136 }, { "epoch": 0.2869109947643979, "grad_norm": 17.85831642150879, "learning_rate": 4.498606908508753e-07, "logits/chosen": 1.86328125, "logits/rejected": 1.6945784091949463, "logps/chosen": -356.1871032714844, "logps/ref_chosen": -358.9820861816406, "logps/ref_rejected": -277.2926330566406, "logps/rejected": -286.5790710449219, "loss": 5.1522, "margin_dpo/margin_mean": 12.081487655639648, "margin_dpo/margin_std": 15.93864631652832, "step": 137 }, { "epoch": 0.28900523560209423, "grad_norm": 16.747636795043945, "learning_rate": 4.487555238385862e-07, "logits/chosen": 1.8756850957870483, "logits/rejected": 1.9149004220962524, "logps/chosen": -284.18572998046875, "logps/ref_chosen": -283.7969055175781, "logps/ref_rejected": -269.28643798828125, "logps/rejected": -280.577880859375, "loss": 5.1753, "margin_dpo/margin_mean": 10.902585983276367, "margin_dpo/margin_std": 22.474666595458984, "step": 138 }, { "epoch": 0.29109947643979056, "grad_norm": 16.213245391845703, "learning_rate": 4.476396981707453e-07, "logits/chosen": 1.4325203895568848, "logits/rejected": 1.5791009664535522, "logps/chosen": -218.7724609375, "logps/ref_chosen": -221.46124267578125, "logps/ref_rejected": -234.3295440673828, "logps/rejected": -236.3891143798828, "loss": 5.2792, "margin_dpo/margin_mean": 4.748367786407471, "margin_dpo/margin_std": 21.647659301757812, "step": 139 }, { "epoch": 0.2931937172774869, "grad_norm": 25.28333854675293, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 1.5151917934417725, "logits/rejected": 1.5757999420166016, "logps/chosen": -237.8951873779297, "logps/ref_chosen": -246.27151489257812, "logps/ref_rejected": -255.00428771972656, "logps/rejected": -261.1990661621094, "loss": 5.0124, "margin_dpo/margin_mean": 14.571114540100098, "margin_dpo/margin_std": 15.991363525390625, "step": 140 }, { "epoch": 0.29528795811518327, "grad_norm": 23.41806411743164, "learning_rate": 4.453763107901675e-07, "logits/chosen": 1.5907336473464966, "logits/rejected": 1.6988296508789062, "logps/chosen": -264.4815979003906, "logps/ref_chosen": -267.79345703125, "logps/ref_rejected": -295.4119873046875, "logps/rejected": -308.6583557128906, "loss": 5.0535, "margin_dpo/margin_mean": 16.5582332611084, "margin_dpo/margin_std": 21.787641525268555, "step": 141 }, { "epoch": 0.2973821989528796, "grad_norm": 17.388578414916992, "learning_rate": 4.4422887045602674e-07, "logits/chosen": 1.8900976181030273, "logits/rejected": 1.6236388683319092, "logps/chosen": -341.6228942871094, "logps/ref_chosen": -352.8658752441406, "logps/ref_rejected": -219.5095672607422, "logps/rejected": -223.31808471679688, "loss": 5.0641, "margin_dpo/margin_mean": 15.05146598815918, "margin_dpo/margin_std": 16.49203109741211, "step": 142 }, { "epoch": 0.2994764397905759, "grad_norm": 18.831209182739258, "learning_rate": 4.4307101421701755e-07, "logits/chosen": 1.4374781847000122, "logits/rejected": 1.3258070945739746, "logps/chosen": -327.2297058105469, "logps/ref_chosen": -336.38482666015625, "logps/ref_rejected": -213.85707092285156, "logps/rejected": -229.42831420898438, "loss": 4.9898, "margin_dpo/margin_mean": 24.726341247558594, "margin_dpo/margin_std": 23.126943588256836, "step": 143 }, { "epoch": 0.30157068062827225, "grad_norm": 19.49022674560547, "learning_rate": 4.419028041654559e-07, "logits/chosen": 1.4996888637542725, "logits/rejected": 1.4556653499603271, "logps/chosen": -264.51727294921875, "logps/ref_chosen": -274.0345458984375, "logps/ref_rejected": -274.5603942871094, "logps/rejected": -273.80316162109375, "loss": 5.0793, "margin_dpo/margin_mean": 8.760041236877441, "margin_dpo/margin_std": 19.655494689941406, "step": 144 }, { "epoch": 0.3036649214659686, "grad_norm": 16.73856544494629, "learning_rate": 4.4072430294890166e-07, "logits/chosen": 1.6811779737472534, "logits/rejected": 1.7291994094848633, "logps/chosen": -269.10491943359375, "logps/ref_chosen": -274.1513366699219, "logps/ref_rejected": -226.63064575195312, "logps/rejected": -239.5631866455078, "loss": 4.9405, "margin_dpo/margin_mean": 17.978939056396484, "margin_dpo/margin_std": 23.945186614990234, "step": 145 }, { "epoch": 0.3057591623036649, "grad_norm": 29.0201358795166, "learning_rate": 4.395355737667985e-07, "logits/chosen": 1.4950841665267944, "logits/rejected": 1.7295074462890625, "logps/chosen": -227.69259643554688, "logps/ref_chosen": -229.48269653320312, "logps/ref_rejected": -249.7940216064453, "logps/rejected": -259.9455261230469, "loss": 5.0665, "margin_dpo/margin_mean": 11.941591262817383, "margin_dpo/margin_std": 17.751564025878906, "step": 146 }, { "epoch": 0.3078534031413613, "grad_norm": 16.784557342529297, "learning_rate": 4.3833668036708483e-07, "logits/chosen": 1.4946480989456177, "logits/rejected": 1.4648349285125732, "logps/chosen": -284.0057067871094, "logps/ref_chosen": -290.8128356933594, "logps/ref_rejected": -218.97787475585938, "logps/rejected": -229.5756072998047, "loss": 5.1681, "margin_dpo/margin_mean": 17.40483856201172, "margin_dpo/margin_std": 24.006677627563477, "step": 147 }, { "epoch": 0.3099476439790576, "grad_norm": 16.890840530395508, "learning_rate": 4.3712768704277524e-07, "logits/chosen": 1.5196683406829834, "logits/rejected": 1.509922742843628, "logps/chosen": -261.50762939453125, "logps/ref_chosen": -263.70001220703125, "logps/ref_rejected": -262.095703125, "logps/rejected": -272.2943115234375, "loss": 5.1353, "margin_dpo/margin_mean": 12.390965461730957, "margin_dpo/margin_std": 21.307296752929688, "step": 148 }, { "epoch": 0.31204188481675393, "grad_norm": 18.739173889160156, "learning_rate": 4.3590865862851263e-07, "logits/chosen": 1.9116061925888062, "logits/rejected": 1.7204910516738892, "logps/chosen": -344.4569396972656, "logps/ref_chosen": -350.6168518066406, "logps/ref_rejected": -277.2320251464844, "logps/rejected": -288.07904052734375, "loss": 5.0166, "margin_dpo/margin_mean": 17.006885528564453, "margin_dpo/margin_std": 17.385805130004883, "step": 149 }, { "epoch": 0.31413612565445026, "grad_norm": 17.36412811279297, "learning_rate": 4.346796604970912e-07, "logits/chosen": 1.934645652770996, "logits/rejected": 1.848956823348999, "logps/chosen": -261.2005920410156, "logps/ref_chosen": -264.05096435546875, "logps/ref_rejected": -286.02313232421875, "logps/rejected": -298.2835693359375, "loss": 5.0369, "margin_dpo/margin_mean": 15.110857963562012, "margin_dpo/margin_std": 18.34941291809082, "step": 150 }, { "epoch": 0.3162303664921466, "grad_norm": 20.5943546295166, "learning_rate": 4.3344075855595097e-07, "logits/chosen": 1.3573246002197266, "logits/rejected": 1.373565673828125, "logps/chosen": -254.92498779296875, "logps/ref_chosen": -257.74664306640625, "logps/ref_rejected": -256.2339172363281, "logps/rejected": -267.41278076171875, "loss": 4.7848, "margin_dpo/margin_mean": 14.000543594360352, "margin_dpo/margin_std": 23.76491928100586, "step": 151 }, { "epoch": 0.3183246073298429, "grad_norm": 21.099018096923828, "learning_rate": 4.3219201924364323e-07, "logits/chosen": 1.4018583297729492, "logits/rejected": 1.803174376487732, "logps/chosen": -245.9750213623047, "logps/ref_chosen": -250.47512817382812, "logps/ref_rejected": -322.36474609375, "logps/rejected": -333.2466735839844, "loss": 4.9817, "margin_dpo/margin_mean": 15.381957054138184, "margin_dpo/margin_std": 22.273956298828125, "step": 152 }, { "epoch": 0.3204188481675393, "grad_norm": 22.535673141479492, "learning_rate": 4.309335095262675e-07, "logits/chosen": 1.5490094423294067, "logits/rejected": 1.5208497047424316, "logps/chosen": -235.2023162841797, "logps/ref_chosen": -238.36544799804688, "logps/ref_rejected": -215.78970336914062, "logps/rejected": -236.40200805664062, "loss": 4.6931, "margin_dpo/margin_mean": 23.775440216064453, "margin_dpo/margin_std": 23.90810775756836, "step": 153 }, { "epoch": 0.3225130890052356, "grad_norm": 19.634624481201172, "learning_rate": 4.2966529689388064e-07, "logits/chosen": 1.213348627090454, "logits/rejected": 1.2180352210998535, "logps/chosen": -264.2608337402344, "logps/ref_chosen": -259.7012939453125, "logps/ref_rejected": -255.74172973632812, "logps/rejected": -272.3033142089844, "loss": 4.9699, "margin_dpo/margin_mean": 12.002017974853516, "margin_dpo/margin_std": 28.510345458984375, "step": 154 }, { "epoch": 0.32460732984293195, "grad_norm": 19.311044692993164, "learning_rate": 4.2838744935687716e-07, "logits/chosen": 1.4307262897491455, "logits/rejected": 1.4176700115203857, "logps/chosen": -324.7783203125, "logps/ref_chosen": -325.11517333984375, "logps/ref_rejected": -288.08380126953125, "logps/rejected": -307.0673828125, "loss": 5.1244, "margin_dpo/margin_mean": 19.320411682128906, "margin_dpo/margin_std": 28.002426147460938, "step": 155 }, { "epoch": 0.3267015706806283, "grad_norm": 19.045074462890625, "learning_rate": 4.271000354423425e-07, "logits/chosen": 1.6414060592651367, "logits/rejected": 1.486697793006897, "logps/chosen": -260.87078857421875, "logps/ref_chosen": -263.62353515625, "logps/ref_rejected": -183.94119262695312, "logps/rejected": -202.97857666015625, "loss": 4.8187, "margin_dpo/margin_mean": 21.790143966674805, "margin_dpo/margin_std": 19.9763240814209, "step": 156 }, { "epoch": 0.3287958115183246, "grad_norm": 24.014020919799805, "learning_rate": 4.258031241903777e-07, "logits/chosen": 1.4358762502670288, "logits/rejected": 1.5518109798431396, "logps/chosen": -248.9981231689453, "logps/ref_chosen": -237.6883087158203, "logps/ref_rejected": -232.87484741210938, "logps/rejected": -254.34902954101562, "loss": 5.0507, "margin_dpo/margin_mean": 10.164348602294922, "margin_dpo/margin_std": 22.118553161621094, "step": 157 }, { "epoch": 0.3308900523560209, "grad_norm": 19.902008056640625, "learning_rate": 4.2449678515039743e-07, "logits/chosen": 1.7699273824691772, "logits/rejected": 1.8686857223510742, "logps/chosen": -284.7595520019531, "logps/ref_chosen": -279.62335205078125, "logps/ref_rejected": -267.80615234375, "logps/rejected": -285.53924560546875, "loss": 4.9867, "margin_dpo/margin_mean": 12.59688663482666, "margin_dpo/margin_std": 24.365018844604492, "step": 158 }, { "epoch": 0.33298429319371725, "grad_norm": 22.825597763061523, "learning_rate": 4.2318108837739986e-07, "logits/chosen": 1.553140640258789, "logits/rejected": 1.439896583557129, "logps/chosen": -303.68487548828125, "logps/ref_chosen": -301.5324401855469, "logps/ref_rejected": -263.529541015625, "logps/rejected": -274.5115966796875, "loss": 5.1446, "margin_dpo/margin_mean": 8.82960319519043, "margin_dpo/margin_std": 30.41036605834961, "step": 159 }, { "epoch": 0.33507853403141363, "grad_norm": 20.597837448120117, "learning_rate": 4.218561044282098e-07, "logits/chosen": 1.9710590839385986, "logits/rejected": 1.699224829673767, "logps/chosen": -311.9967041015625, "logps/ref_chosen": -314.1754455566406, "logps/ref_rejected": -241.1903076171875, "logps/rejected": -267.9695129394531, "loss": 4.8583, "margin_dpo/margin_mean": 28.957944869995117, "margin_dpo/margin_std": 29.692768096923828, "step": 160 }, { "epoch": 0.33717277486910996, "grad_norm": 25.702106475830078, "learning_rate": 4.2052190435769554e-07, "logits/chosen": 1.345297932624817, "logits/rejected": 1.2094160318374634, "logps/chosen": -268.9353942871094, "logps/ref_chosen": -271.0775451660156, "logps/ref_rejected": -212.71853637695312, "logps/rejected": -228.42201232910156, "loss": 4.8254, "margin_dpo/margin_mean": 17.845624923706055, "margin_dpo/margin_std": 25.553813934326172, "step": 161 }, { "epoch": 0.3392670157068063, "grad_norm": 26.607454299926758, "learning_rate": 4.1917855971495763e-07, "logits/chosen": 1.6022093296051025, "logits/rejected": 1.4934312105178833, "logps/chosen": -293.98974609375, "logps/ref_chosen": -296.7241516113281, "logps/ref_rejected": -222.9241485595703, "logps/rejected": -236.45635986328125, "loss": 4.8946, "margin_dpo/margin_mean": 16.266626358032227, "margin_dpo/margin_std": 23.54033660888672, "step": 162 }, { "epoch": 0.3413612565445026, "grad_norm": 30.976999282836914, "learning_rate": 4.1782614253949255e-07, "logits/chosen": 1.7216789722442627, "logits/rejected": 1.7484244108200073, "logps/chosen": -246.4569549560547, "logps/ref_chosen": -249.64366149902344, "logps/ref_rejected": -244.58258056640625, "logps/rejected": -260.55218505859375, "loss": 4.7557, "margin_dpo/margin_mean": 19.156299591064453, "margin_dpo/margin_std": 21.998430252075195, "step": 163 }, { "epoch": 0.34345549738219894, "grad_norm": 22.384260177612305, "learning_rate": 4.164647253573289e-07, "logits/chosen": 1.4122521877288818, "logits/rejected": 1.5924245119094849, "logps/chosen": -214.8105926513672, "logps/ref_chosen": -203.6176300048828, "logps/ref_rejected": -216.5535888671875, "logps/rejected": -240.29396057128906, "loss": 4.8891, "margin_dpo/margin_mean": 12.547422409057617, "margin_dpo/margin_std": 23.612470626831055, "step": 164 }, { "epoch": 0.34554973821989526, "grad_norm": 28.401796340942383, "learning_rate": 4.1509438117713863e-07, "logits/chosen": 2.1252005100250244, "logits/rejected": 2.1613521575927734, "logps/chosen": -350.52978515625, "logps/ref_chosen": -344.1730651855469, "logps/ref_rejected": -304.00128173828125, "logps/rejected": -327.9066467285156, "loss": 4.9931, "margin_dpo/margin_mean": 17.548656463623047, "margin_dpo/margin_std": 27.731534957885742, "step": 165 }, { "epoch": 0.34764397905759165, "grad_norm": 19.331459045410156, "learning_rate": 4.137151834863213e-07, "logits/chosen": 1.646728277206421, "logits/rejected": 1.640990138053894, "logps/chosen": -242.68841552734375, "logps/ref_chosen": -233.72891235351562, "logps/ref_rejected": -208.29397583007812, "logps/rejected": -224.4010467529297, "loss": 5.0854, "margin_dpo/margin_mean": 7.147580146789551, "margin_dpo/margin_std": 26.665321350097656, "step": 166 }, { "epoch": 0.34973821989528797, "grad_norm": 19.852638244628906, "learning_rate": 4.123272062470633e-07, "logits/chosen": 1.6361035108566284, "logits/rejected": 1.411129117012024, "logps/chosen": -327.1979064941406, "logps/ref_chosen": -326.10198974609375, "logps/ref_rejected": -232.0992889404297, "logps/rejected": -256.6214294433594, "loss": 4.9473, "margin_dpo/margin_mean": 23.426191329956055, "margin_dpo/margin_std": 28.530874252319336, "step": 167 }, { "epoch": 0.3518324607329843, "grad_norm": 21.564373016357422, "learning_rate": 4.1093052389237174e-07, "logits/chosen": 1.3292641639709473, "logits/rejected": 1.2153077125549316, "logps/chosen": -246.62283325195312, "logps/ref_chosen": -247.4376983642578, "logps/ref_rejected": -216.68064880371094, "logps/rejected": -241.31011962890625, "loss": 4.7589, "margin_dpo/margin_mean": 25.444313049316406, "margin_dpo/margin_std": 17.723526000976562, "step": 168 }, { "epoch": 0.3539267015706806, "grad_norm": 19.15829086303711, "learning_rate": 4.0952521132208267e-07, "logits/chosen": 1.6247047185897827, "logits/rejected": 1.7833751440048218, "logps/chosen": -281.30078125, "logps/ref_chosen": -285.1272277832031, "logps/ref_rejected": -279.10943603515625, "logps/rejected": -302.5621032714844, "loss": 4.632, "margin_dpo/margin_mean": 27.279136657714844, "margin_dpo/margin_std": 23.28731346130371, "step": 169 }, { "epoch": 0.35602094240837695, "grad_norm": 24.02274513244629, "learning_rate": 4.081113438988443e-07, "logits/chosen": 1.5731761455535889, "logits/rejected": 1.4810683727264404, "logps/chosen": -357.42327880859375, "logps/ref_chosen": -358.3712463378906, "logps/ref_rejected": -245.13316345214844, "logps/rejected": -264.8816223144531, "loss": 4.7427, "margin_dpo/margin_mean": 20.696434020996094, "margin_dpo/margin_std": 30.21214485168457, "step": 170 }, { "epoch": 0.3581151832460733, "grad_norm": 23.137300491333008, "learning_rate": 4.0668899744407567e-07, "logits/chosen": 1.5857964754104614, "logits/rejected": 1.465027093887329, "logps/chosen": -269.0282897949219, "logps/ref_chosen": -273.9371337890625, "logps/ref_rejected": -241.6103515625, "logps/rejected": -259.7757263183594, "loss": 4.7322, "margin_dpo/margin_mean": 23.074222564697266, "margin_dpo/margin_std": 30.494476318359375, "step": 171 }, { "epoch": 0.36020942408376966, "grad_norm": 22.266416549682617, "learning_rate": 4.0525824823390043e-07, "logits/chosen": 1.6551828384399414, "logits/rejected": 1.8315401077270508, "logps/chosen": -254.328369140625, "logps/ref_chosen": -255.1793975830078, "logps/ref_rejected": -279.3556213378906, "logps/rejected": -293.8561706542969, "loss": 5.039, "margin_dpo/margin_mean": 15.351570129394531, "margin_dpo/margin_std": 22.528553009033203, "step": 172 }, { "epoch": 0.362303664921466, "grad_norm": 24.233097076416016, "learning_rate": 4.0381917299505686e-07, "logits/chosen": 1.626520037651062, "logits/rejected": 1.3303242921829224, "logps/chosen": -338.2034606933594, "logps/ref_chosen": -333.66375732421875, "logps/ref_rejected": -275.1485290527344, "logps/rejected": -300.3768310546875, "loss": 4.8793, "margin_dpo/margin_mean": 20.68860626220703, "margin_dpo/margin_std": 28.740659713745117, "step": 173 }, { "epoch": 0.3643979057591623, "grad_norm": 22.030344009399414, "learning_rate": 4.0237184890078243e-07, "logits/chosen": 1.9243297576904297, "logits/rejected": 1.6874244213104248, "logps/chosen": -354.771484375, "logps/ref_chosen": -362.5843505859375, "logps/ref_rejected": -250.0384521484375, "logps/rejected": -277.5650939941406, "loss": 4.6927, "margin_dpo/margin_mean": 35.33952713012695, "margin_dpo/margin_std": 33.1160774230957, "step": 174 }, { "epoch": 0.36649214659685864, "grad_norm": 35.58210754394531, "learning_rate": 4.00916353566676e-07, "logits/chosen": 1.5620782375335693, "logits/rejected": 1.598710536956787, "logps/chosen": -242.1133270263672, "logps/ref_chosen": -231.65187072753906, "logps/ref_rejected": -264.08526611328125, "logps/rejected": -294.81854248046875, "loss": 4.8994, "margin_dpo/margin_mean": 20.271865844726562, "margin_dpo/margin_std": 31.47553062438965, "step": 175 }, { "epoch": 0.36858638743455496, "grad_norm": 23.747568130493164, "learning_rate": 3.994527650465352e-07, "logits/chosen": 1.3475306034088135, "logits/rejected": 1.4316266775131226, "logps/chosen": -278.8919372558594, "logps/ref_chosen": -271.37152099609375, "logps/ref_rejected": -281.20074462890625, "logps/rejected": -299.3853759765625, "loss": 5.067, "margin_dpo/margin_mean": 10.664226531982422, "margin_dpo/margin_std": 33.30470657348633, "step": 176 }, { "epoch": 0.3706806282722513, "grad_norm": 21.642898559570312, "learning_rate": 3.979811618281705e-07, "logits/chosen": 1.6163108348846436, "logits/rejected": 1.4132215976715088, "logps/chosen": -270.68548583984375, "logps/ref_chosen": -266.7376403808594, "logps/ref_rejected": -217.114990234375, "logps/rejected": -240.8184356689453, "loss": 5.1559, "margin_dpo/margin_mean": 19.755634307861328, "margin_dpo/margin_std": 33.71812438964844, "step": 177 }, { "epoch": 0.37277486910994767, "grad_norm": 22.430517196655273, "learning_rate": 3.9650162282919654e-07, "logits/chosen": 1.463651180267334, "logits/rejected": 1.5219404697418213, "logps/chosen": -230.6317138671875, "logps/ref_chosen": -230.67471313476562, "logps/ref_rejected": -185.40577697753906, "logps/rejected": -219.8003387451172, "loss": 4.7678, "margin_dpo/margin_mean": 34.437557220458984, "margin_dpo/margin_std": 34.62123107910156, "step": 178 }, { "epoch": 0.374869109947644, "grad_norm": 30.32175064086914, "learning_rate": 3.9501422739279953e-07, "logits/chosen": 1.3042542934417725, "logits/rejected": 1.3105361461639404, "logps/chosen": -272.2032470703125, "logps/ref_chosen": -267.849853515625, "logps/ref_rejected": -270.5272521972656, "logps/rejected": -286.4674987792969, "loss": 4.9431, "margin_dpo/margin_mean": 11.586915969848633, "margin_dpo/margin_std": 27.10576629638672, "step": 179 }, { "epoch": 0.3769633507853403, "grad_norm": 41.640106201171875, "learning_rate": 3.935190552834828e-07, "logits/chosen": 1.7063815593719482, "logits/rejected": 1.6488580703735352, "logps/chosen": -302.65380859375, "logps/ref_chosen": -296.4002685546875, "logps/ref_rejected": -224.35203552246094, "logps/rejected": -253.70095825195312, "loss": 4.7457, "margin_dpo/margin_mean": 23.0954647064209, "margin_dpo/margin_std": 34.00359344482422, "step": 180 }, { "epoch": 0.37905759162303665, "grad_norm": 30.104999542236328, "learning_rate": 3.920161866827889e-07, "logits/chosen": 1.1796238422393799, "logits/rejected": 1.132272481918335, "logps/chosen": -241.59747314453125, "logps/ref_chosen": -243.10891723632812, "logps/ref_rejected": -231.96902465820312, "logps/rejected": -256.28497314453125, "loss": 4.8516, "margin_dpo/margin_mean": 25.827394485473633, "margin_dpo/margin_std": 33.010704040527344, "step": 181 }, { "epoch": 0.381151832460733, "grad_norm": 28.976200103759766, "learning_rate": 3.90505702185e-07, "logits/chosen": 1.584215521812439, "logits/rejected": 1.6100966930389404, "logps/chosen": -264.54351806640625, "logps/ref_chosen": -263.5075988769531, "logps/ref_rejected": -254.4083709716797, "logps/rejected": -296.13641357421875, "loss": 4.5773, "margin_dpo/margin_mean": 40.692108154296875, "margin_dpo/margin_std": 22.797809600830078, "step": 182 }, { "epoch": 0.3832460732984293, "grad_norm": 36.45281982421875, "learning_rate": 3.889876827928156e-07, "logits/chosen": 1.0563912391662598, "logits/rejected": 1.159271001815796, "logps/chosen": -233.31439208984375, "logps/ref_chosen": -220.9555206298828, "logps/ref_rejected": -224.3114471435547, "logps/rejected": -247.57742309570312, "loss": 4.7918, "margin_dpo/margin_mean": 10.90709114074707, "margin_dpo/margin_std": 35.502681732177734, "step": 183 }, { "epoch": 0.38534031413612563, "grad_norm": 24.624616622924805, "learning_rate": 3.874622099130087e-07, "logits/chosen": 1.666105031967163, "logits/rejected": 1.6945122480392456, "logps/chosen": -290.854736328125, "logps/ref_chosen": -285.35125732421875, "logps/ref_rejected": -282.2647705078125, "logps/rejected": -324.21044921875, "loss": 4.4107, "margin_dpo/margin_mean": 36.442237854003906, "margin_dpo/margin_std": 38.43600082397461, "step": 184 }, { "epoch": 0.387434554973822, "grad_norm": 29.312538146972656, "learning_rate": 3.859293653520604e-07, "logits/chosen": 1.6671961545944214, "logits/rejected": 1.7346235513687134, "logps/chosen": -326.77490234375, "logps/ref_chosen": -324.6773986816406, "logps/ref_rejected": -275.9365539550781, "logps/rejected": -308.9313659667969, "loss": 4.8262, "margin_dpo/margin_mean": 30.89735221862793, "margin_dpo/margin_std": 34.63751220703125, "step": 185 }, { "epoch": 0.38952879581151834, "grad_norm": 34.26539611816406, "learning_rate": 3.8438923131177237e-07, "logits/chosen": 1.594333291053772, "logits/rejected": 1.5077753067016602, "logps/chosen": -304.6080017089844, "logps/ref_chosen": -287.4004211425781, "logps/ref_rejected": -222.46803283691406, "logps/rejected": -260.6602783203125, "loss": 4.8356, "margin_dpo/margin_mean": 20.984676361083984, "margin_dpo/margin_std": 20.14511489868164, "step": 186 }, { "epoch": 0.39162303664921466, "grad_norm": 25.851015090942383, "learning_rate": 3.828418903848593e-07, "logits/chosen": 1.441859245300293, "logits/rejected": 1.5745567083358765, "logps/chosen": -401.31182861328125, "logps/ref_chosen": -378.8255310058594, "logps/ref_rejected": -319.38116455078125, "logps/rejected": -365.0159912109375, "loss": 4.8863, "margin_dpo/margin_mean": 23.14852523803711, "margin_dpo/margin_std": 45.19081115722656, "step": 187 }, { "epoch": 0.393717277486911, "grad_norm": 41.554141998291016, "learning_rate": 3.812874255505191e-07, "logits/chosen": 1.360278844833374, "logits/rejected": 1.1752986907958984, "logps/chosen": -250.5333251953125, "logps/ref_chosen": -246.3994903564453, "logps/ref_rejected": -204.85589599609375, "logps/rejected": -239.05686950683594, "loss": 4.8302, "margin_dpo/margin_mean": 30.06714630126953, "margin_dpo/margin_std": 34.51936340332031, "step": 188 }, { "epoch": 0.3958115183246073, "grad_norm": 38.48931884765625, "learning_rate": 3.797259201699833e-07, "logits/chosen": 1.4543706178665161, "logits/rejected": 1.5096098184585571, "logps/chosen": -264.8511047363281, "logps/ref_chosen": -264.7483825683594, "logps/ref_rejected": -292.3799743652344, "logps/rejected": -328.85107421875, "loss": 4.6022, "margin_dpo/margin_mean": 36.36838912963867, "margin_dpo/margin_std": 26.913986206054688, "step": 189 }, { "epoch": 0.39790575916230364, "grad_norm": 24.164396286010742, "learning_rate": 3.781574579820464e-07, "logits/chosen": 0.8813581466674805, "logits/rejected": 0.9559296369552612, "logps/chosen": -223.26422119140625, "logps/ref_chosen": -211.2392120361328, "logps/ref_rejected": -204.55384826660156, "logps/rejected": -233.70541381835938, "loss": 4.6669, "margin_dpo/margin_mean": 17.12653350830078, "margin_dpo/margin_std": 40.82097625732422, "step": 190 }, { "epoch": 0.4, "grad_norm": 28.65985107421875, "learning_rate": 3.765821230985757e-07, "logits/chosen": 1.188876748085022, "logits/rejected": 1.3144832849502563, "logps/chosen": -177.3275604248047, "logps/ref_chosen": -175.97952270507812, "logps/ref_rejected": -206.85325622558594, "logps/rejected": -228.22000122070312, "loss": 4.7686, "margin_dpo/margin_mean": 20.018733978271484, "margin_dpo/margin_std": 31.200864791870117, "step": 191 }, { "epoch": 0.40209424083769635, "grad_norm": 30.982559204101562, "learning_rate": 3.75e-07, "logits/chosen": 1.6706230640411377, "logits/rejected": 1.854614496231079, "logps/chosen": -253.131103515625, "logps/ref_chosen": -241.5125732421875, "logps/ref_rejected": -285.0710144042969, "logps/rejected": -313.1866149902344, "loss": 4.972, "margin_dpo/margin_mean": 16.49706268310547, "margin_dpo/margin_std": 54.19124221801758, "step": 192 }, { "epoch": 0.4041884816753927, "grad_norm": 28.91083526611328, "learning_rate": 3.734111735307796e-07, "logits/chosen": 1.7183902263641357, "logits/rejected": 1.5575065612792969, "logps/chosen": -255.6170196533203, "logps/ref_chosen": -247.06581115722656, "logps/ref_rejected": -221.4132537841797, "logps/rejected": -248.817138671875, "loss": 4.8338, "margin_dpo/margin_mean": 18.852684020996094, "margin_dpo/margin_std": 29.74808120727539, "step": 193 }, { "epoch": 0.406282722513089, "grad_norm": 35.922122955322266, "learning_rate": 3.7181572889485623e-07, "logits/chosen": 1.3995440006256104, "logits/rejected": 1.4992262125015259, "logps/chosen": -216.14686584472656, "logps/ref_chosen": -208.60263061523438, "logps/ref_rejected": -189.4849090576172, "logps/rejected": -212.74192810058594, "loss": 5.0447, "margin_dpo/margin_mean": 15.712799072265625, "margin_dpo/margin_std": 31.20469093322754, "step": 194 }, { "epoch": 0.4083769633507853, "grad_norm": 31.717365264892578, "learning_rate": 3.7021375165108377e-07, "logits/chosen": 1.464687466621399, "logits/rejected": 1.4596346616744995, "logps/chosen": -287.36065673828125, "logps/ref_chosen": -278.51275634765625, "logps/ref_rejected": -298.09185791015625, "logps/rejected": -318.23797607421875, "loss": 5.0759, "margin_dpo/margin_mean": 11.298222541809082, "margin_dpo/margin_std": 27.541015625, "step": 195 }, { "epoch": 0.41047120418848165, "grad_norm": 25.350812911987305, "learning_rate": 3.6860532770864005e-07, "logits/chosen": 1.1140937805175781, "logits/rejected": 1.2928128242492676, "logps/chosen": -213.8653564453125, "logps/ref_chosen": -213.48568725585938, "logps/ref_rejected": -216.8994903564453, "logps/rejected": -244.5856170654297, "loss": 4.6056, "margin_dpo/margin_mean": 27.30645751953125, "margin_dpo/margin_std": 27.342227935791016, "step": 196 }, { "epoch": 0.41256544502617803, "grad_norm": 26.07918357849121, "learning_rate": 3.6699054332241985e-07, "logits/chosen": 1.4508588314056396, "logits/rejected": 1.4278172254562378, "logps/chosen": -255.36036682128906, "logps/ref_chosen": -256.396728515625, "logps/ref_rejected": -185.2763671875, "logps/rejected": -232.59405517578125, "loss": 4.3517, "margin_dpo/margin_mean": 48.35406494140625, "margin_dpo/margin_std": 30.625558853149414, "step": 197 }, { "epoch": 0.41465968586387436, "grad_norm": 29.263551712036133, "learning_rate": 3.653694850884091e-07, "logits/chosen": 1.842124342918396, "logits/rejected": 1.9477362632751465, "logps/chosen": -362.33245849609375, "logps/ref_chosen": -366.5196838378906, "logps/ref_rejected": -361.7866516113281, "logps/rejected": -392.13189697265625, "loss": 4.6993, "margin_dpo/margin_mean": 34.532501220703125, "margin_dpo/margin_std": 40.5759162902832, "step": 198 }, { "epoch": 0.4167539267015707, "grad_norm": 32.77777862548828, "learning_rate": 3.6374223993904124e-07, "logits/chosen": 0.9073523879051208, "logits/rejected": 0.9206515550613403, "logps/chosen": -210.94586181640625, "logps/ref_chosen": -207.86968994140625, "logps/ref_rejected": -184.52076721191406, "logps/rejected": -226.15948486328125, "loss": 4.7308, "margin_dpo/margin_mean": 38.562538146972656, "margin_dpo/margin_std": 28.093774795532227, "step": 199 }, { "epoch": 0.418848167539267, "grad_norm": 28.612226486206055, "learning_rate": 3.621088951385353e-07, "logits/chosen": 1.3658336400985718, "logits/rejected": 1.3859562873840332, "logps/chosen": -281.68792724609375, "logps/ref_chosen": -276.4098205566406, "logps/ref_rejected": -252.23086547851562, "logps/rejected": -272.3934326171875, "loss": 4.9383, "margin_dpo/margin_mean": 14.884419441223145, "margin_dpo/margin_std": 53.33942794799805, "step": 200 }, { "epoch": 0.418848167539267, "eval_logits/chosen": 1.4300137758255005, "eval_logits/rejected": 1.4696903228759766, "eval_logps/chosen": -287.022705078125, "eval_logps/ref_chosen": -281.4588928222656, "eval_logps/ref_rejected": -261.84954833984375, "eval_logps/rejected": -295.6717529296875, "eval_loss": 0.597048819065094, "eval_margin_dpo/margin_mean": 28.258426666259766, "eval_margin_dpo/margin_std": 39.02444076538086, "eval_runtime": 93.548, "eval_samples_per_second": 21.379, "eval_steps_per_second": 1.336, "step": 200 }, { "epoch": 0.42094240837696334, "grad_norm": 32.25981903076172, "learning_rate": 3.604695382782159e-07, "logits/chosen": 1.2966924905776978, "logits/rejected": 1.4700032472610474, "logps/chosen": -266.32220458984375, "logps/ref_chosen": -265.32904052734375, "logps/ref_rejected": -255.19529724121094, "logps/rejected": -297.01544189453125, "loss": 4.6702, "margin_dpo/margin_mean": 40.82699203491211, "margin_dpo/margin_std": 30.352630615234375, "step": 201 }, { "epoch": 0.42303664921465967, "grad_norm": 35.75096130371094, "learning_rate": 3.588242572718162e-07, "logits/chosen": 1.6444792747497559, "logits/rejected": 1.5493888854980469, "logps/chosen": -278.0890197753906, "logps/ref_chosen": -274.6075439453125, "logps/ref_rejected": -219.9969940185547, "logps/rejected": -251.83348083496094, "loss": 4.7312, "margin_dpo/margin_mean": 28.354969024658203, "margin_dpo/margin_std": 44.327239990234375, "step": 202 }, { "epoch": 0.42513089005235605, "grad_norm": 34.300376892089844, "learning_rate": 3.571731403507635e-07, "logits/chosen": 1.4360129833221436, "logits/rejected": 1.3600637912750244, "logps/chosen": -302.18707275390625, "logps/ref_chosen": -295.6935119628906, "logps/ref_rejected": -241.4007568359375, "logps/rejected": -266.06201171875, "loss": 4.8634, "margin_dpo/margin_mean": 18.167736053466797, "margin_dpo/margin_std": 24.508058547973633, "step": 203 }, { "epoch": 0.4272251308900524, "grad_norm": 28.439380645751953, "learning_rate": 3.5551627605944746e-07, "logits/chosen": 2.044978141784668, "logits/rejected": 1.9398431777954102, "logps/chosen": -398.93328857421875, "logps/ref_chosen": -392.3414611816406, "logps/ref_rejected": -291.4375915527344, "logps/rejected": -327.0179443359375, "loss": 4.5824, "margin_dpo/margin_mean": 28.98847007751465, "margin_dpo/margin_std": 38.196693420410156, "step": 204 }, { "epoch": 0.4293193717277487, "grad_norm": 27.83588981628418, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 1.3867862224578857, "logits/rejected": 1.657343864440918, "logps/chosen": -191.40664672851562, "logps/ref_chosen": -190.1780242919922, "logps/ref_rejected": -275.8878479003906, "logps/rejected": -311.2901306152344, "loss": 4.6413, "margin_dpo/margin_mean": 34.17367935180664, "margin_dpo/margin_std": 32.90446090698242, "step": 205 }, { "epoch": 0.431413612565445, "grad_norm": 30.838573455810547, "learning_rate": 3.5218566107988867e-07, "logits/chosen": 0.9244284629821777, "logits/rejected": 1.1862902641296387, "logps/chosen": -277.1954040527344, "logps/ref_chosen": -278.95977783203125, "logps/ref_rejected": -296.458984375, "logps/rejected": -318.1335144042969, "loss": 4.6764, "margin_dpo/margin_mean": 23.43887710571289, "margin_dpo/margin_std": 38.043148040771484, "step": 206 }, { "epoch": 0.43350785340314135, "grad_norm": 31.335426330566406, "learning_rate": 3.505120890024195e-07, "logits/chosen": 1.658402681350708, "logits/rejected": 1.8528728485107422, "logps/chosen": -222.63587951660156, "logps/ref_chosen": -219.367919921875, "logps/ref_rejected": -231.6876678466797, "logps/rejected": -260.1963195800781, "loss": 4.9355, "margin_dpo/margin_mean": 25.240697860717773, "margin_dpo/margin_std": 46.75006866455078, "step": 207 }, { "epoch": 0.4356020942408377, "grad_norm": 34.799400329589844, "learning_rate": 3.4883312676665534e-07, "logits/chosen": 1.4422191381454468, "logits/rejected": 1.4274728298187256, "logps/chosen": -308.7106628417969, "logps/ref_chosen": -303.848388671875, "logps/ref_rejected": -252.1853485107422, "logps/rejected": -288.1015625, "loss": 4.7313, "margin_dpo/margin_mean": 31.05390739440918, "margin_dpo/margin_std": 41.717647552490234, "step": 208 }, { "epoch": 0.437696335078534, "grad_norm": 32.08354949951172, "learning_rate": 3.4714886441024573e-07, "logits/chosen": 1.48392915725708, "logits/rejected": 1.2658922672271729, "logps/chosen": -353.7491760253906, "logps/ref_chosen": -347.6343688964844, "logps/ref_rejected": -240.31988525390625, "logps/rejected": -270.4415283203125, "loss": 4.891, "margin_dpo/margin_mean": 24.006847381591797, "margin_dpo/margin_std": 38.43374252319336, "step": 209 }, { "epoch": 0.4397905759162304, "grad_norm": 41.964515686035156, "learning_rate": 3.454593922550693e-07, "logits/chosen": 1.7486904859542847, "logits/rejected": 1.899224042892456, "logps/chosen": -230.97503662109375, "logps/ref_chosen": -236.3311767578125, "logps/ref_rejected": -289.6016845703125, "logps/rejected": -317.5093688964844, "loss": 4.7272, "margin_dpo/margin_mean": 33.26382064819336, "margin_dpo/margin_std": 33.677276611328125, "step": 210 }, { "epoch": 0.4418848167539267, "grad_norm": 31.20326805114746, "learning_rate": 3.4376480090239047e-07, "logits/chosen": 1.3021446466445923, "logits/rejected": 1.3574562072753906, "logps/chosen": -205.39637756347656, "logps/ref_chosen": -204.38107299804688, "logps/ref_rejected": -212.449462890625, "logps/rejected": -242.7283172607422, "loss": 4.3731, "margin_dpo/margin_mean": 29.263545989990234, "margin_dpo/margin_std": 35.2406005859375, "step": 211 }, { "epoch": 0.44397905759162304, "grad_norm": 35.141807556152344, "learning_rate": 3.4206518122800055e-07, "logits/chosen": 1.1048368215560913, "logits/rejected": 1.1881780624389648, "logps/chosen": -241.2005615234375, "logps/ref_chosen": -231.28570556640625, "logps/ref_rejected": -222.65725708007812, "logps/rejected": -248.41322326660156, "loss": 4.7702, "margin_dpo/margin_mean": 15.841072082519531, "margin_dpo/margin_std": 39.27621841430664, "step": 212 }, { "epoch": 0.44607329842931936, "grad_norm": 27.873577117919922, "learning_rate": 3.403606243773448e-07, "logits/chosen": 1.5394573211669922, "logits/rejected": 1.6554535627365112, "logps/chosen": -336.94354248046875, "logps/ref_chosen": -332.35968017578125, "logps/ref_rejected": -329.94830322265625, "logps/rejected": -365.0585021972656, "loss": 4.7341, "margin_dpo/margin_mean": 30.526296615600586, "margin_dpo/margin_std": 37.60173034667969, "step": 213 }, { "epoch": 0.4481675392670157, "grad_norm": 33.36276626586914, "learning_rate": 3.3865122176063385e-07, "logits/chosen": 1.8208783864974976, "logits/rejected": 1.9227497577667236, "logps/chosen": -320.77886962890625, "logps/ref_chosen": -303.07257080078125, "logps/ref_rejected": -310.52001953125, "logps/rejected": -347.1795654296875, "loss": 4.8278, "margin_dpo/margin_mean": 18.953208923339844, "margin_dpo/margin_std": 34.65628433227539, "step": 214 }, { "epoch": 0.450261780104712, "grad_norm": 31.66336441040039, "learning_rate": 3.3693706504794243e-07, "logits/chosen": 2.0376367568969727, "logits/rejected": 2.075817584991455, "logps/chosen": -283.66595458984375, "logps/ref_chosen": -286.654296875, "logps/ref_rejected": -272.1281433105469, "logps/rejected": -317.6420593261719, "loss": 4.8219, "margin_dpo/margin_mean": 48.50217819213867, "margin_dpo/margin_std": 41.356727600097656, "step": 215 }, { "epoch": 0.4523560209424084, "grad_norm": 51.738441467285156, "learning_rate": 3.3521824616429284e-07, "logits/chosen": 1.378481149673462, "logits/rejected": 1.2707972526550293, "logps/chosen": -364.43603515625, "logps/ref_chosen": -351.34417724609375, "logps/ref_rejected": -290.5171813964844, "logps/rejected": -327.94488525390625, "loss": 4.7106, "margin_dpo/margin_mean": 24.335878372192383, "margin_dpo/margin_std": 42.64997100830078, "step": 216 }, { "epoch": 0.4544502617801047, "grad_norm": 35.788352966308594, "learning_rate": 3.334948572847253e-07, "logits/chosen": 1.5968372821807861, "logits/rejected": 1.7124537229537964, "logps/chosen": -279.4749755859375, "logps/ref_chosen": -273.76788330078125, "logps/ref_rejected": -286.2580261230469, "logps/rejected": -343.501953125, "loss": 4.4028, "margin_dpo/margin_mean": 51.53682327270508, "margin_dpo/margin_std": 38.95985412597656, "step": 217 }, { "epoch": 0.45654450261780105, "grad_norm": 41.920623779296875, "learning_rate": 3.317669908293554e-07, "logits/chosen": 1.5484070777893066, "logits/rejected": 1.8093584775924683, "logps/chosen": -235.32321166992188, "logps/ref_chosen": -219.74948120117188, "logps/ref_rejected": -308.801025390625, "logps/rejected": -354.8564147949219, "loss": 4.5987, "margin_dpo/margin_mean": 30.481698989868164, "margin_dpo/margin_std": 39.370521545410156, "step": 218 }, { "epoch": 0.4586387434554974, "grad_norm": 30.652027130126953, "learning_rate": 3.300347394584172e-07, "logits/chosen": 1.3850374221801758, "logits/rejected": 1.4719693660736084, "logps/chosen": -282.8400573730469, "logps/ref_chosen": -264.65374755859375, "logps/ref_rejected": -233.9711151123047, "logps/rejected": -276.5810546875, "loss": 4.6628, "margin_dpo/margin_mean": 24.423654556274414, "margin_dpo/margin_std": 40.1912841796875, "step": 219 }, { "epoch": 0.4607329842931937, "grad_norm": 45.71303939819336, "learning_rate": 3.2829819606729477e-07, "logits/chosen": 1.9091517925262451, "logits/rejected": 1.744594931602478, "logps/chosen": -315.3508605957031, "logps/ref_chosen": -295.8961486816406, "logps/ref_rejected": -219.56228637695312, "logps/rejected": -273.97418212890625, "loss": 4.5451, "margin_dpo/margin_mean": 34.957191467285156, "margin_dpo/margin_std": 39.46710968017578, "step": 220 }, { "epoch": 0.46282722513089003, "grad_norm": 29.89345932006836, "learning_rate": 3.265574537815398e-07, "logits/chosen": 1.0531387329101562, "logits/rejected": 1.2520170211791992, "logps/chosen": -302.5050048828125, "logps/ref_chosen": -284.9080810546875, "logps/ref_rejected": -310.0538330078125, "logps/rejected": -356.16510009765625, "loss": 4.8325, "margin_dpo/margin_mean": 28.514326095581055, "margin_dpo/margin_std": 32.273284912109375, "step": 221 }, { "epoch": 0.4649214659685864, "grad_norm": 47.45641326904297, "learning_rate": 3.248126059518784e-07, "logits/chosen": 1.3722844123840332, "logits/rejected": 1.2933783531188965, "logps/chosen": -329.0756530761719, "logps/ref_chosen": -308.44622802734375, "logps/ref_rejected": -254.99667358398438, "logps/rejected": -303.69677734375, "loss": 4.5312, "margin_dpo/margin_mean": 28.07069969177246, "margin_dpo/margin_std": 30.664283752441406, "step": 222 }, { "epoch": 0.46701570680628274, "grad_norm": 36.27118682861328, "learning_rate": 3.230637461492043e-07, "logits/chosen": 1.214386224746704, "logits/rejected": 1.193519115447998, "logps/chosen": -283.5272521972656, "logps/ref_chosen": -258.5130310058594, "logps/ref_rejected": -231.13885498046875, "logps/rejected": -290.85296630859375, "loss": 4.4404, "margin_dpo/margin_mean": 34.699886322021484, "margin_dpo/margin_std": 42.52618408203125, "step": 223 }, { "epoch": 0.46910994764397906, "grad_norm": 36.553733825683594, "learning_rate": 3.213109681595612e-07, "logits/chosen": 1.2857964038848877, "logits/rejected": 1.433445692062378, "logps/chosen": -248.49815368652344, "logps/ref_chosen": -234.55177307128906, "logps/ref_rejected": -208.4610595703125, "logps/rejected": -271.1744689941406, "loss": 4.3795, "margin_dpo/margin_mean": 48.76702117919922, "margin_dpo/margin_std": 35.48724365234375, "step": 224 }, { "epoch": 0.4712041884816754, "grad_norm": 40.446937561035156, "learning_rate": 3.1955436597911315e-07, "logits/chosen": 1.6013773679733276, "logits/rejected": 1.722807765007019, "logps/chosen": -360.8241882324219, "logps/ref_chosen": -339.7688903808594, "logps/ref_rejected": -347.96112060546875, "logps/rejected": -397.2833251953125, "loss": 4.9012, "margin_dpo/margin_mean": 28.266937255859375, "margin_dpo/margin_std": 49.008575439453125, "step": 225 }, { "epoch": 0.4732984293193717, "grad_norm": 37.459991455078125, "learning_rate": 3.1779403380910425e-07, "logits/chosen": 0.7877386808395386, "logits/rejected": 1.0014675855636597, "logps/chosen": -225.65472412109375, "logps/ref_chosen": -209.56515502929688, "logps/ref_rejected": -207.83871459960938, "logps/rejected": -261.1890563964844, "loss": 4.8962, "margin_dpo/margin_mean": 37.260780334472656, "margin_dpo/margin_std": 42.05327606201172, "step": 226 }, { "epoch": 0.47539267015706804, "grad_norm": 29.621501922607422, "learning_rate": 3.160300660508064e-07, "logits/chosen": 1.4312937259674072, "logits/rejected": 1.644526481628418, "logps/chosen": -278.422607421875, "logps/ref_chosen": -252.69004821777344, "logps/ref_rejected": -252.89427185058594, "logps/rejected": -317.8539733886719, "loss": 4.4608, "margin_dpo/margin_mean": 39.22712707519531, "margin_dpo/margin_std": 55.3231086730957, "step": 227 }, { "epoch": 0.4774869109947644, "grad_norm": 46.8765869140625, "learning_rate": 3.1426255730045695e-07, "logits/chosen": 1.5183122158050537, "logits/rejected": 1.6018824577331543, "logps/chosen": -235.6670684814453, "logps/ref_chosen": -210.62913513183594, "logps/ref_rejected": -174.08975219726562, "logps/rejected": -226.82781982421875, "loss": 4.4722, "margin_dpo/margin_mean": 27.70014762878418, "margin_dpo/margin_std": 42.50754165649414, "step": 228 }, { "epoch": 0.47958115183246075, "grad_norm": 39.29153823852539, "learning_rate": 3.1249160234418644e-07, "logits/chosen": 1.3761322498321533, "logits/rejected": 1.3572083711624146, "logps/chosen": -336.7579345703125, "logps/ref_chosen": -315.1896057128906, "logps/ref_rejected": -265.8664855957031, "logps/rejected": -330.8331298828125, "loss": 4.1991, "margin_dpo/margin_mean": 43.39836883544922, "margin_dpo/margin_std": 46.784549713134766, "step": 229 }, { "epoch": 0.4816753926701571, "grad_norm": 39.0309944152832, "learning_rate": 3.1071729615293424e-07, "logits/chosen": 0.9634809494018555, "logits/rejected": 0.9632886648178101, "logps/chosen": -258.40020751953125, "logps/ref_chosen": -240.54244995117188, "logps/ref_rejected": -262.5657043457031, "logps/rejected": -319.0147705078125, "loss": 4.5649, "margin_dpo/margin_mean": 38.59132385253906, "margin_dpo/margin_std": 50.162498474121094, "step": 230 }, { "epoch": 0.4837696335078534, "grad_norm": 55.88585662841797, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 1.1088343858718872, "logits/rejected": 1.1820017099380493, "logps/chosen": -326.6155700683594, "logps/ref_chosen": -290.8667907714844, "logps/ref_rejected": -277.01739501953125, "logps/rejected": -330.43194580078125, "loss": 4.7561, "margin_dpo/margin_mean": 17.66571807861328, "margin_dpo/margin_std": 42.80667495727539, "step": 231 }, { "epoch": 0.48586387434554973, "grad_norm": 43.03524398803711, "learning_rate": 3.071590108427243e-07, "logits/chosen": 1.3966903686523438, "logits/rejected": 1.5778224468231201, "logps/chosen": -285.2840270996094, "logps/ref_chosen": -260.0438232421875, "logps/ref_rejected": -261.63507080078125, "logps/rejected": -320.8078308105469, "loss": 4.5309, "margin_dpo/margin_mean": 33.93254089355469, "margin_dpo/margin_std": 37.87663269042969, "step": 232 }, { "epoch": 0.48795811518324606, "grad_norm": 53.92399597167969, "learning_rate": 3.05375222543809e-07, "logits/chosen": 0.838955283164978, "logits/rejected": 0.9410269856452942, "logps/chosen": -240.427734375, "logps/ref_chosen": -221.6608123779297, "logps/ref_rejected": -261.16839599609375, "logps/rejected": -328.1685485839844, "loss": 4.5981, "margin_dpo/margin_mean": 48.23321533203125, "margin_dpo/margin_std": 38.929080963134766, "step": 233 }, { "epoch": 0.4900523560209424, "grad_norm": 33.82901382446289, "learning_rate": 3.035884646397637e-07, "logits/chosen": 1.1958811283111572, "logits/rejected": 1.237013816833496, "logps/chosen": -297.06036376953125, "logps/ref_chosen": -281.4861145019531, "logps/ref_rejected": -276.58441162109375, "logps/rejected": -340.1748046875, "loss": 4.5243, "margin_dpo/margin_mean": 48.01616668701172, "margin_dpo/margin_std": 41.946895599365234, "step": 234 }, { "epoch": 0.49214659685863876, "grad_norm": 44.3351936340332, "learning_rate": 3.017988329489923e-07, "logits/chosen": 1.6281356811523438, "logits/rejected": 1.5668871402740479, "logps/chosen": -301.54229736328125, "logps/ref_chosen": -300.5598449707031, "logps/ref_rejected": -259.905029296875, "logps/rejected": -302.8565368652344, "loss": 4.6976, "margin_dpo/margin_mean": 41.96904754638672, "margin_dpo/margin_std": 42.275962829589844, "step": 235 }, { "epoch": 0.4942408376963351, "grad_norm": 32.59733963012695, "learning_rate": 3.000064234440111e-07, "logits/chosen": 1.2152283191680908, "logits/rejected": 1.2443571090698242, "logps/chosen": -282.0902099609375, "logps/ref_chosen": -270.4844665527344, "logps/ref_rejected": -231.67613220214844, "logps/rejected": -282.9165954589844, "loss": 4.4441, "margin_dpo/margin_mean": 39.63469696044922, "margin_dpo/margin_std": 43.977577209472656, "step": 236 }, { "epoch": 0.4963350785340314, "grad_norm": 47.38420486450195, "learning_rate": 2.9821133224630223e-07, "logits/chosen": 1.266021490097046, "logits/rejected": 1.4988112449645996, "logps/chosen": -219.76870727539062, "logps/ref_chosen": -194.99342346191406, "logps/ref_rejected": -243.12779235839844, "logps/rejected": -310.861083984375, "loss": 4.5863, "margin_dpo/margin_mean": 42.9580078125, "margin_dpo/margin_std": 39.53506851196289, "step": 237 }, { "epoch": 0.49842931937172774, "grad_norm": 39.1376838684082, "learning_rate": 2.964136556211588e-07, "logits/chosen": 1.1324211359024048, "logits/rejected": 1.0826090574264526, "logps/chosen": -261.50543212890625, "logps/ref_chosen": -240.9060516357422, "logps/ref_rejected": -205.97012329101562, "logps/rejected": -254.07977294921875, "loss": 4.446, "margin_dpo/margin_mean": 27.510299682617188, "margin_dpo/margin_std": 46.14052200317383, "step": 238 }, { "epoch": 0.5005235602094241, "grad_norm": 40.104766845703125, "learning_rate": 2.946134899725226e-07, "logits/chosen": 1.3846328258514404, "logits/rejected": 1.5582243204116821, "logps/chosen": -303.406005859375, "logps/ref_chosen": -277.0447998046875, "logps/ref_rejected": -284.7602233886719, "logps/rejected": -329.855712890625, "loss": 4.701, "margin_dpo/margin_mean": 18.734315872192383, "margin_dpo/margin_std": 59.868492126464844, "step": 239 }, { "epoch": 0.5026178010471204, "grad_norm": 38.7380485534668, "learning_rate": 2.9281093183781403e-07, "logits/chosen": 1.1157575845718384, "logits/rejected": 1.0506106615066528, "logps/chosen": -290.3915710449219, "logps/ref_chosen": -285.29144287109375, "logps/ref_rejected": -212.11915588378906, "logps/rejected": -265.2477111816406, "loss": 4.3702, "margin_dpo/margin_mean": 48.02838134765625, "margin_dpo/margin_std": 42.991214752197266, "step": 240 }, { "epoch": 0.5047120418848168, "grad_norm": 41.1712646484375, "learning_rate": 2.910060778827554e-07, "logits/chosen": 1.3538631200790405, "logits/rejected": 1.4909546375274658, "logps/chosen": -260.87939453125, "logps/ref_chosen": -254.9442901611328, "logps/ref_rejected": -278.5121154785156, "logps/rejected": -332.14483642578125, "loss": 4.7755, "margin_dpo/margin_mean": 47.697601318359375, "margin_dpo/margin_std": 46.98115539550781, "step": 241 }, { "epoch": 0.506806282722513, "grad_norm": 33.75428771972656, "learning_rate": 2.891990248961871e-07, "logits/chosen": 2.038219690322876, "logits/rejected": 1.9361552000045776, "logps/chosen": -274.6824645996094, "logps/ref_chosen": -264.16876220703125, "logps/ref_rejected": -215.627197265625, "logps/rejected": -269.078125, "loss": 4.4651, "margin_dpo/margin_mean": 42.937225341796875, "margin_dpo/margin_std": 52.65818786621094, "step": 242 }, { "epoch": 0.5089005235602094, "grad_norm": 45.30524826049805, "learning_rate": 2.873898697848762e-07, "logits/chosen": 1.3483995199203491, "logits/rejected": 1.3744844198226929, "logps/chosen": -322.53204345703125, "logps/ref_chosen": -313.7347106933594, "logps/ref_rejected": -357.50054931640625, "logps/rejected": -403.6512756347656, "loss": 4.1712, "margin_dpo/margin_mean": 37.353302001953125, "margin_dpo/margin_std": 46.82036590576172, "step": 243 }, { "epoch": 0.5109947643979058, "grad_norm": 35.50245666503906, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 1.1742347478866577, "logits/rejected": 0.9833606481552124, "logps/chosen": -291.64044189453125, "logps/ref_chosen": -265.0720520019531, "logps/ref_rejected": -235.29541015625, "logps/rejected": -293.6773376464844, "loss": 4.2395, "margin_dpo/margin_mean": 31.813528060913086, "margin_dpo/margin_std": 38.89430236816406, "step": 244 }, { "epoch": 0.5130890052356021, "grad_norm": 55.53532791137695, "learning_rate": 2.837656413735479e-07, "logits/chosen": 1.9007817506790161, "logits/rejected": 1.6215357780456543, "logps/chosen": -346.5862731933594, "logps/ref_chosen": -338.6529235839844, "logps/ref_rejected": -259.6473693847656, "logps/rejected": -305.2576904296875, "loss": 4.2236, "margin_dpo/margin_mean": 37.676998138427734, "margin_dpo/margin_std": 33.821868896484375, "step": 245 }, { "epoch": 0.5151832460732985, "grad_norm": 36.23422622680664, "learning_rate": 2.8195076242990116e-07, "logits/chosen": 1.154848337173462, "logits/rejected": 1.1089671850204468, "logps/chosen": -273.8831787109375, "logps/ref_chosen": -254.98756408691406, "logps/ref_rejected": -201.20782470703125, "logps/rejected": -253.29745483398438, "loss": 4.8429, "margin_dpo/margin_mean": 33.19401550292969, "margin_dpo/margin_std": 46.2290153503418, "step": 246 }, { "epoch": 0.5172774869109947, "grad_norm": 41.20656967163086, "learning_rate": 2.801341700638307e-07, "logits/chosen": 1.237385630607605, "logits/rejected": 1.1006180047988892, "logps/chosen": -284.4879455566406, "logps/ref_chosen": -276.70361328125, "logps/ref_rejected": -209.83523559570312, "logps/rejected": -266.49542236328125, "loss": 4.4147, "margin_dpo/margin_mean": 48.87584686279297, "margin_dpo/margin_std": 42.81962966918945, "step": 247 }, { "epoch": 0.5193717277486911, "grad_norm": 57.703697204589844, "learning_rate": 2.7831596169367227e-07, "logits/chosen": 1.0914226770401, "logits/rejected": 1.1984620094299316, "logps/chosen": -258.7278747558594, "logps/ref_chosen": -249.7368621826172, "logps/ref_rejected": -230.7808837890625, "logps/rejected": -274.591552734375, "loss": 4.7502, "margin_dpo/margin_mean": 34.819644927978516, "margin_dpo/margin_std": 40.67426300048828, "step": 248 }, { "epoch": 0.5214659685863874, "grad_norm": 47.22350311279297, "learning_rate": 2.7649623482442274e-07, "logits/chosen": 1.0606677532196045, "logits/rejected": 1.1147487163543701, "logps/chosen": -266.5928649902344, "logps/ref_chosen": -229.43399047851562, "logps/ref_rejected": -242.59182739257812, "logps/rejected": -302.368896484375, "loss": 4.5617, "margin_dpo/margin_mean": 22.618181228637695, "margin_dpo/margin_std": 44.011497497558594, "step": 249 }, { "epoch": 0.5235602094240838, "grad_norm": 34.5158576965332, "learning_rate": 2.7467508704251135e-07, "logits/chosen": 1.6137490272521973, "logits/rejected": 1.7355223894119263, "logps/chosen": -386.4211120605469, "logps/ref_chosen": -374.47015380859375, "logps/ref_rejected": -397.1805114746094, "logps/rejected": -455.82952880859375, "loss": 4.6106, "margin_dpo/margin_mean": 46.698097229003906, "margin_dpo/margin_std": 53.3001823425293, "step": 250 }, { "epoch": 0.5256544502617801, "grad_norm": 44.91852569580078, "learning_rate": 2.7285261601056697e-07, "logits/chosen": 1.0645577907562256, "logits/rejected": 0.8425718545913696, "logps/chosen": -355.9337463378906, "logps/ref_chosen": -340.28240966796875, "logps/ref_rejected": -255.56735229492188, "logps/rejected": -305.7314147949219, "loss": 4.5346, "margin_dpo/margin_mean": 34.51277160644531, "margin_dpo/margin_std": 47.6010627746582, "step": 251 }, { "epoch": 0.5277486910994764, "grad_norm": 30.532474517822266, "learning_rate": 2.7102891946217994e-07, "logits/chosen": 1.4391117095947266, "logits/rejected": 1.4691420793533325, "logps/chosen": -215.19662475585938, "logps/ref_chosen": -198.7939453125, "logps/ref_rejected": -212.86849975585938, "logps/rejected": -271.3706359863281, "loss": 4.5578, "margin_dpo/margin_mean": 42.09947967529297, "margin_dpo/margin_std": 46.31159973144531, "step": 252 }, { "epoch": 0.5298429319371728, "grad_norm": 45.59535598754883, "learning_rate": 2.692040951966617e-07, "logits/chosen": 1.448297142982483, "logits/rejected": 1.3689329624176025, "logps/chosen": -370.5470275878906, "logps/ref_chosen": -343.3220520019531, "logps/ref_rejected": -258.52044677734375, "logps/rejected": -316.4342041015625, "loss": 4.7989, "margin_dpo/margin_mean": 30.688785552978516, "margin_dpo/margin_std": 51.24434280395508, "step": 253 }, { "epoch": 0.5319371727748691, "grad_norm": 37.76780700683594, "learning_rate": 2.6737824107379947e-07, "logits/chosen": 1.4605791568756104, "logits/rejected": 1.3956043720245361, "logps/chosen": -326.6246337890625, "logps/ref_chosen": -300.8880310058594, "logps/ref_rejected": -288.5895690917969, "logps/rejected": -342.62518310546875, "loss": 4.4466, "margin_dpo/margin_mean": 28.299026489257812, "margin_dpo/margin_std": 43.362815856933594, "step": 254 }, { "epoch": 0.5340314136125655, "grad_norm": 38.82050323486328, "learning_rate": 2.655514550086086e-07, "logits/chosen": 1.3760805130004883, "logits/rejected": 1.3785066604614258, "logps/chosen": -309.2912902832031, "logps/ref_chosen": -283.4182434082031, "logps/ref_rejected": -317.677978515625, "logps/rejected": -381.75701904296875, "loss": 4.3792, "margin_dpo/margin_mean": 38.20598220825195, "margin_dpo/margin_std": 58.638553619384766, "step": 255 }, { "epoch": 0.5361256544502618, "grad_norm": 37.75017166137695, "learning_rate": 2.6372383496608186e-07, "logits/chosen": 1.3811287879943848, "logits/rejected": 1.417975902557373, "logps/chosen": -352.6160583496094, "logps/ref_chosen": -333.6951599121094, "logps/ref_rejected": -302.9135437011719, "logps/rejected": -374.9257507324219, "loss": 4.4641, "margin_dpo/margin_mean": 53.091312408447266, "margin_dpo/margin_std": 62.08375930786133, "step": 256 }, { "epoch": 0.5382198952879581, "grad_norm": 39.30986022949219, "learning_rate": 2.618954789559356e-07, "logits/chosen": 1.3786240816116333, "logits/rejected": 1.5010215044021606, "logps/chosen": -297.0694885253906, "logps/ref_chosen": -269.2105712890625, "logps/ref_rejected": -282.474365234375, "logps/rejected": -354.8011474609375, "loss": 4.3795, "margin_dpo/margin_mean": 44.46790313720703, "margin_dpo/margin_std": 52.83469772338867, "step": 257 }, { "epoch": 0.5403141361256545, "grad_norm": 64.80396270751953, "learning_rate": 2.600664850273538e-07, "logits/chosen": 1.1780776977539062, "logits/rejected": 1.358864665031433, "logps/chosen": -304.0466613769531, "logps/ref_chosen": -274.53314208984375, "logps/ref_rejected": -284.3149108886719, "logps/rejected": -365.9967346191406, "loss": 4.2167, "margin_dpo/margin_mean": 52.16826248168945, "margin_dpo/margin_std": 58.13621139526367, "step": 258 }, { "epoch": 0.5424083769633508, "grad_norm": 57.088375091552734, "learning_rate": 2.582369512637302e-07, "logits/chosen": 1.1932607889175415, "logits/rejected": 1.141600489616394, "logps/chosen": -255.16656494140625, "logps/ref_chosen": -235.41139221191406, "logps/ref_rejected": -217.746826171875, "logps/rejected": -282.0968933105469, "loss": 4.5131, "margin_dpo/margin_mean": 44.59490966796875, "margin_dpo/margin_std": 53.4549674987793, "step": 259 }, { "epoch": 0.5445026178010471, "grad_norm": 65.07582092285156, "learning_rate": 2.5640697577740815e-07, "logits/chosen": 0.8414401412010193, "logits/rejected": 0.9391928911209106, "logps/chosen": -242.9241943359375, "logps/ref_chosen": -224.4993133544922, "logps/ref_rejected": -215.19839477539062, "logps/rejected": -268.3463439941406, "loss": 5.2773, "margin_dpo/margin_mean": 34.72306442260742, "margin_dpo/margin_std": 62.74570083618164, "step": 260 }, { "epoch": 0.5465968586387434, "grad_norm": 62.26913070678711, "learning_rate": 2.5457665670441937e-07, "logits/chosen": 0.8039923906326294, "logits/rejected": 0.6551789045333862, "logps/chosen": -289.4314270019531, "logps/ref_chosen": -251.2598114013672, "logps/ref_rejected": -205.53323364257812, "logps/rejected": -263.32464599609375, "loss": 4.7681, "margin_dpo/margin_mean": 19.61980438232422, "margin_dpo/margin_std": 53.51505661010742, "step": 261 }, { "epoch": 0.5486910994764398, "grad_norm": 57.36088562011719, "learning_rate": 2.527460921992209e-07, "logits/chosen": 1.5565768480300903, "logits/rejected": 1.5727018117904663, "logps/chosen": -370.4512634277344, "logps/ref_chosen": -347.8548889160156, "logps/ref_rejected": -309.43011474609375, "logps/rejected": -387.045166015625, "loss": 4.4059, "margin_dpo/margin_mean": 55.01873016357422, "margin_dpo/margin_std": 52.59794235229492, "step": 262 }, { "epoch": 0.5507853403141362, "grad_norm": 80.06723022460938, "learning_rate": 2.509153804294318e-07, "logits/chosen": 1.2121027708053589, "logits/rejected": 1.3596720695495605, "logps/chosen": -301.9384765625, "logps/ref_chosen": -261.0179443359375, "logps/ref_rejected": -295.4287109375, "logps/rejected": -357.9337463378906, "loss": 4.7515, "margin_dpo/margin_mean": 21.584484100341797, "margin_dpo/margin_std": 49.02084732055664, "step": 263 }, { "epoch": 0.5528795811518324, "grad_norm": 65.33440399169922, "learning_rate": 2.4908461957056825e-07, "logits/chosen": 1.4055628776550293, "logits/rejected": 1.2200889587402344, "logps/chosen": -321.2078857421875, "logps/ref_chosen": -297.6844482421875, "logps/ref_rejected": -205.72137451171875, "logps/rejected": -284.78070068359375, "loss": 4.2315, "margin_dpo/margin_mean": 55.53590393066406, "margin_dpo/margin_std": 46.41938018798828, "step": 264 }, { "epoch": 0.5549738219895288, "grad_norm": 51.22663879394531, "learning_rate": 2.4725390780077905e-07, "logits/chosen": 1.3676010370254517, "logits/rejected": 1.380630612373352, "logps/chosen": -306.2537536621094, "logps/ref_chosen": -285.8244323730469, "logps/ref_rejected": -275.6885681152344, "logps/rejected": -362.411865234375, "loss": 4.4685, "margin_dpo/margin_mean": 66.2939682006836, "margin_dpo/margin_std": 51.13595199584961, "step": 265 }, { "epoch": 0.5570680628272251, "grad_norm": 56.91117477416992, "learning_rate": 2.454233432955807e-07, "logits/chosen": 1.253815770149231, "logits/rejected": 1.3310532569885254, "logps/chosen": -280.5023193359375, "logps/ref_chosen": -273.0467834472656, "logps/ref_rejected": -291.18133544921875, "logps/rejected": -342.8387145996094, "loss": 4.314, "margin_dpo/margin_mean": 44.201820373535156, "margin_dpo/margin_std": 46.703495025634766, "step": 266 }, { "epoch": 0.5591623036649215, "grad_norm": 44.81831741333008, "learning_rate": 2.435930242225919e-07, "logits/chosen": 1.1857373714447021, "logits/rejected": 1.3162403106689453, "logps/chosen": -294.0497131347656, "logps/ref_chosen": -272.337890625, "logps/ref_rejected": -279.97076416015625, "logps/rejected": -351.3126525878906, "loss": 4.6021, "margin_dpo/margin_mean": 49.630027770996094, "margin_dpo/margin_std": 56.707557678222656, "step": 267 }, { "epoch": 0.5612565445026177, "grad_norm": 65.94684600830078, "learning_rate": 2.4176304873626984e-07, "logits/chosen": 1.1858967542648315, "logits/rejected": 1.2295567989349365, "logps/chosen": -257.2110900878906, "logps/ref_chosen": -235.03692626953125, "logps/ref_rejected": -245.3459014892578, "logps/rejected": -307.6155090332031, "loss": 4.4782, "margin_dpo/margin_mean": 40.09546661376953, "margin_dpo/margin_std": 52.132469177246094, "step": 268 }, { "epoch": 0.5633507853403141, "grad_norm": 40.701568603515625, "learning_rate": 2.399335149726463e-07, "logits/chosen": 1.1352908611297607, "logits/rejected": 1.3305590152740479, "logps/chosen": -262.03607177734375, "logps/ref_chosen": -240.3035430908203, "logps/ref_rejected": -233.82675170898438, "logps/rejected": -302.03057861328125, "loss": 4.6077, "margin_dpo/margin_mean": 46.471290588378906, "margin_dpo/margin_std": 58.428775787353516, "step": 269 }, { "epoch": 0.5654450261780105, "grad_norm": 73.9042739868164, "learning_rate": 2.381045210440644e-07, "logits/chosen": 1.550492286682129, "logits/rejected": 1.8568247556686401, "logps/chosen": -273.93243408203125, "logps/ref_chosen": -249.420166015625, "logps/ref_rejected": -279.5133972167969, "logps/rejected": -334.2721862792969, "loss": 4.5035, "margin_dpo/margin_mean": 30.24651336669922, "margin_dpo/margin_std": 66.87718200683594, "step": 270 }, { "epoch": 0.5675392670157068, "grad_norm": 64.23003387451172, "learning_rate": 2.3627616503391812e-07, "logits/chosen": 1.003787636756897, "logits/rejected": 1.0502243041992188, "logps/chosen": -243.3548126220703, "logps/ref_chosen": -227.45108032226562, "logps/ref_rejected": -183.29275512695312, "logps/rejected": -236.02288818359375, "loss": 4.2933, "margin_dpo/margin_mean": 36.826351165771484, "margin_dpo/margin_std": 40.91915512084961, "step": 271 }, { "epoch": 0.5696335078534032, "grad_norm": 54.780860900878906, "learning_rate": 2.344485449913914e-07, "logits/chosen": 1.5306094884872437, "logits/rejected": 1.417227029800415, "logps/chosen": -370.7244873046875, "logps/ref_chosen": -360.17462158203125, "logps/ref_rejected": -241.59568786621094, "logps/rejected": -302.3287658691406, "loss": 4.4468, "margin_dpo/margin_mean": 50.18324661254883, "margin_dpo/margin_std": 52.38308334350586, "step": 272 }, { "epoch": 0.5717277486910994, "grad_norm": 76.68623352050781, "learning_rate": 2.3262175892620062e-07, "logits/chosen": 1.463561773300171, "logits/rejected": 1.513543725013733, "logps/chosen": -323.9863586425781, "logps/ref_chosen": -309.366455078125, "logps/ref_rejected": -271.2337951660156, "logps/rejected": -335.74420166015625, "loss": 4.3448, "margin_dpo/margin_mean": 49.89052963256836, "margin_dpo/margin_std": 69.81361389160156, "step": 273 }, { "epoch": 0.5738219895287958, "grad_norm": 40.13818359375, "learning_rate": 2.3079590480333827e-07, "logits/chosen": 1.596007227897644, "logits/rejected": 1.7439165115356445, "logps/chosen": -304.8982238769531, "logps/ref_chosen": -295.56866455078125, "logps/ref_rejected": -253.984130859375, "logps/rejected": -311.5342102050781, "loss": 4.2593, "margin_dpo/margin_mean": 48.22050476074219, "margin_dpo/margin_std": 47.02198791503906, "step": 274 }, { "epoch": 0.5759162303664922, "grad_norm": 51.986778259277344, "learning_rate": 2.2897108053782e-07, "logits/chosen": 0.9961601495742798, "logits/rejected": 1.0950078964233398, "logps/chosen": -251.74990844726562, "logps/ref_chosen": -235.93154907226562, "logps/ref_rejected": -230.19454956054688, "logps/rejected": -288.5270080566406, "loss": 4.061, "margin_dpo/margin_mean": 42.51408386230469, "margin_dpo/margin_std": 56.936180114746094, "step": 275 }, { "epoch": 0.5780104712041885, "grad_norm": 51.089576721191406, "learning_rate": 2.2714738398943308e-07, "logits/chosen": 1.7104884386062622, "logits/rejected": 1.6105390787124634, "logps/chosen": -365.57635498046875, "logps/ref_chosen": -357.3829650878906, "logps/ref_rejected": -273.025146484375, "logps/rejected": -322.5356140136719, "loss": 4.1898, "margin_dpo/margin_mean": 41.317039489746094, "margin_dpo/margin_std": 46.224205017089844, "step": 276 }, { "epoch": 0.5801047120418849, "grad_norm": 59.21977233886719, "learning_rate": 2.2532491295748865e-07, "logits/chosen": 1.0496208667755127, "logits/rejected": 1.255394697189331, "logps/chosen": -316.8267822265625, "logps/ref_chosen": -289.98040771484375, "logps/ref_rejected": -310.3972473144531, "logps/rejected": -371.3232727050781, "loss": 4.7638, "margin_dpo/margin_mean": 34.079654693603516, "margin_dpo/margin_std": 57.57683563232422, "step": 277 }, { "epoch": 0.5821989528795811, "grad_norm": 52.5463981628418, "learning_rate": 2.2350376517557726e-07, "logits/chosen": 0.8676056861877441, "logits/rejected": 0.8314589262008667, "logps/chosen": -256.13165283203125, "logps/ref_chosen": -237.13531494140625, "logps/ref_rejected": -232.33502197265625, "logps/rejected": -290.1609802246094, "loss": 4.9549, "margin_dpo/margin_mean": 38.829627990722656, "margin_dpo/margin_std": 55.37847900390625, "step": 278 }, { "epoch": 0.5842931937172775, "grad_norm": 45.49783706665039, "learning_rate": 2.2168403830632769e-07, "logits/chosen": 1.3541253805160522, "logits/rejected": 1.4431573152542114, "logps/chosen": -361.9897766113281, "logps/ref_chosen": -354.13311767578125, "logps/ref_rejected": -305.6336975097656, "logps/rejected": -358.853271484375, "loss": 4.1861, "margin_dpo/margin_mean": 45.362972259521484, "margin_dpo/margin_std": 40.74034118652344, "step": 279 }, { "epoch": 0.5863874345549738, "grad_norm": 45.03684997558594, "learning_rate": 2.1986582993616925e-07, "logits/chosen": 1.337092638015747, "logits/rejected": 1.3559750318527222, "logps/chosen": -274.98260498046875, "logps/ref_chosen": -268.2659912109375, "logps/ref_rejected": -232.44114685058594, "logps/rejected": -289.997314453125, "loss": 4.4116, "margin_dpo/margin_mean": 50.83951950073242, "margin_dpo/margin_std": 63.36719512939453, "step": 280 }, { "epoch": 0.5884816753926702, "grad_norm": 40.450538635253906, "learning_rate": 2.1804923757009882e-07, "logits/chosen": 1.3821660280227661, "logits/rejected": 1.3343546390533447, "logps/chosen": -287.6787414550781, "logps/ref_chosen": -257.0721740722656, "logps/ref_rejected": -248.18264770507812, "logps/rejected": -319.9836730957031, "loss": 4.3178, "margin_dpo/margin_mean": 41.19449234008789, "margin_dpo/margin_std": 56.606285095214844, "step": 281 }, { "epoch": 0.5905759162303665, "grad_norm": 57.53801345825195, "learning_rate": 2.1623435862645205e-07, "logits/chosen": 1.5275428295135498, "logits/rejected": 1.6021305322647095, "logps/chosen": -293.01007080078125, "logps/ref_chosen": -269.2411804199219, "logps/ref_rejected": -323.8949279785156, "logps/rejected": -384.69775390625, "loss": 4.58, "margin_dpo/margin_mean": 37.03391647338867, "margin_dpo/margin_std": 60.21036148071289, "step": 282 }, { "epoch": 0.5926701570680628, "grad_norm": 48.43301773071289, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 1.0364420413970947, "logits/rejected": 1.2966769933700562, "logps/chosen": -279.5700378417969, "logps/ref_chosen": -257.61688232421875, "logps/ref_rejected": -234.8463134765625, "logps/rejected": -297.4212646484375, "loss": 4.4572, "margin_dpo/margin_mean": 40.621826171875, "margin_dpo/margin_std": 61.95793533325195, "step": 283 }, { "epoch": 0.5947643979057592, "grad_norm": 77.20549011230469, "learning_rate": 2.1261013021512378e-07, "logits/chosen": 1.3976938724517822, "logits/rejected": 1.3549877405166626, "logps/chosen": -252.79287719726562, "logps/ref_chosen": -228.94891357421875, "logps/ref_rejected": -288.43804931640625, "logps/rejected": -346.0372009277344, "loss": 4.6112, "margin_dpo/margin_mean": 33.755226135253906, "margin_dpo/margin_std": 55.99586486816406, "step": 284 }, { "epoch": 0.5968586387434555, "grad_norm": 57.3791389465332, "learning_rate": 2.1080097510381294e-07, "logits/chosen": 1.5715055465698242, "logits/rejected": 1.4658746719360352, "logps/chosen": -386.9960632324219, "logps/ref_chosen": -364.84332275390625, "logps/ref_rejected": -306.4946594238281, "logps/rejected": -359.02520751953125, "loss": 4.7905, "margin_dpo/margin_mean": 30.377866744995117, "margin_dpo/margin_std": 52.86358642578125, "step": 285 }, { "epoch": 0.5989528795811518, "grad_norm": 37.026641845703125, "learning_rate": 2.089939221172446e-07, "logits/chosen": 1.36098051071167, "logits/rejected": 1.4245069026947021, "logps/chosen": -299.2890625, "logps/ref_chosen": -269.2027893066406, "logps/ref_rejected": -286.9102478027344, "logps/rejected": -346.26043701171875, "loss": 4.6962, "margin_dpo/margin_mean": 29.26395034790039, "margin_dpo/margin_std": 46.75328063964844, "step": 286 }, { "epoch": 0.6010471204188481, "grad_norm": 58.61341094970703, "learning_rate": 2.0718906816218595e-07, "logits/chosen": 1.219170093536377, "logits/rejected": 1.3217523097991943, "logps/chosen": -259.4914855957031, "logps/ref_chosen": -233.5873565673828, "logps/ref_rejected": -230.03646850585938, "logps/rejected": -291.0602722167969, "loss": 4.594, "margin_dpo/margin_mean": 35.11963653564453, "margin_dpo/margin_std": 54.2941780090332, "step": 287 }, { "epoch": 0.6031413612565445, "grad_norm": 46.509056091308594, "learning_rate": 2.053865100274774e-07, "logits/chosen": 1.5584362745285034, "logits/rejected": 1.3865753412246704, "logps/chosen": -412.9979553222656, "logps/ref_chosen": -378.4530029296875, "logps/ref_rejected": -302.7226257324219, "logps/rejected": -366.6625061035156, "loss": 4.5476, "margin_dpo/margin_mean": 29.394969940185547, "margin_dpo/margin_std": 42.88922119140625, "step": 288 }, { "epoch": 0.6052356020942409, "grad_norm": 42.84352493286133, "learning_rate": 2.035863443788411e-07, "logits/chosen": 1.5689976215362549, "logits/rejected": 1.5198795795440674, "logps/chosen": -373.4714660644531, "logps/ref_chosen": -342.27532958984375, "logps/ref_rejected": -317.79638671875, "logps/rejected": -370.5872802734375, "loss": 4.944, "margin_dpo/margin_mean": 21.594791412353516, "margin_dpo/margin_std": 44.652915954589844, "step": 289 }, { "epoch": 0.6073298429319371, "grad_norm": 58.85520935058594, "learning_rate": 2.0178866775369774e-07, "logits/chosen": 1.3218892812728882, "logits/rejected": 1.2929930686950684, "logps/chosen": -374.0101623535156, "logps/ref_chosen": -348.39788818359375, "logps/ref_rejected": -349.3028564453125, "logps/rejected": -415.2039489746094, "loss": 4.6543, "margin_dpo/margin_mean": 40.28877258300781, "margin_dpo/margin_std": 68.06195831298828, "step": 290 }, { "epoch": 0.6094240837696335, "grad_norm": 39.78348922729492, "learning_rate": 1.9999357655598891e-07, "logits/chosen": 1.0092355012893677, "logits/rejected": 1.146081805229187, "logps/chosen": -268.5143737792969, "logps/ref_chosen": -250.70835876464844, "logps/ref_rejected": -240.2347869873047, "logps/rejected": -312.80255126953125, "loss": 4.2499, "margin_dpo/margin_mean": 54.76176834106445, "margin_dpo/margin_std": 55.031982421875, "step": 291 }, { "epoch": 0.6115183246073298, "grad_norm": 56.82017135620117, "learning_rate": 1.9820116705100775e-07, "logits/chosen": 1.0279195308685303, "logits/rejected": 1.0469276905059814, "logps/chosen": -285.55364990234375, "logps/ref_chosen": -277.9742431640625, "logps/ref_rejected": -260.510986328125, "logps/rejected": -321.3108825683594, "loss": 4.4704, "margin_dpo/margin_mean": 53.22050094604492, "margin_dpo/margin_std": 38.52600860595703, "step": 292 }, { "epoch": 0.6136125654450262, "grad_norm": 82.68972778320312, "learning_rate": 1.9641153536023642e-07, "logits/chosen": 1.8253206014633179, "logits/rejected": 1.6558302640914917, "logps/chosen": -322.6553039550781, "logps/ref_chosen": -300.9186096191406, "logps/ref_rejected": -257.6700439453125, "logps/rejected": -320.5356750488281, "loss": 4.4861, "margin_dpo/margin_mean": 41.12899398803711, "margin_dpo/margin_std": 47.92676544189453, "step": 293 }, { "epoch": 0.6157068062827226, "grad_norm": 59.99162673950195, "learning_rate": 1.9462477745619106e-07, "logits/chosen": 1.151703953742981, "logits/rejected": 1.2953401803970337, "logps/chosen": -282.59271240234375, "logps/ref_chosen": -266.8080139160156, "logps/ref_rejected": -283.26959228515625, "logps/rejected": -355.72576904296875, "loss": 4.5619, "margin_dpo/margin_mean": 56.671478271484375, "margin_dpo/margin_std": 48.01198959350586, "step": 294 }, { "epoch": 0.6178010471204188, "grad_norm": 43.26309585571289, "learning_rate": 1.928409891572757e-07, "logits/chosen": 1.2075080871582031, "logits/rejected": 1.226868748664856, "logps/chosen": -282.1271667480469, "logps/ref_chosen": -240.19598388671875, "logps/ref_rejected": -214.87818908691406, "logps/rejected": -267.48272705078125, "loss": 4.5008, "margin_dpo/margin_mean": 10.673352241516113, "margin_dpo/margin_std": 68.71654510498047, "step": 295 }, { "epoch": 0.6198952879581152, "grad_norm": 41.60967254638672, "learning_rate": 1.9106026612264315e-07, "logits/chosen": 1.4175227880477905, "logits/rejected": 1.595857858657837, "logps/chosen": -236.48651123046875, "logps/ref_chosen": -227.85513305664062, "logps/ref_rejected": -256.476318359375, "logps/rejected": -316.8334655761719, "loss": 4.0503, "margin_dpo/margin_mean": 51.72578430175781, "margin_dpo/margin_std": 55.78533935546875, "step": 296 }, { "epoch": 0.6219895287958115, "grad_norm": 49.98090744018555, "learning_rate": 1.8928270384706582e-07, "logits/chosen": 1.33966863155365, "logits/rejected": 1.5027199983596802, "logps/chosen": -248.77769470214844, "logps/ref_chosen": -220.73609924316406, "logps/ref_rejected": -272.24017333984375, "logps/rejected": -329.5039367675781, "loss": 4.359, "margin_dpo/margin_mean": 29.222198486328125, "margin_dpo/margin_std": 50.361183166503906, "step": 297 }, { "epoch": 0.6240837696335079, "grad_norm": 70.29344940185547, "learning_rate": 1.875083976558136e-07, "logits/chosen": 1.4000345468521118, "logits/rejected": 1.3036651611328125, "logps/chosen": -363.42401123046875, "logps/ref_chosen": -346.2327880859375, "logps/ref_rejected": -285.7917785644531, "logps/rejected": -350.0803527832031, "loss": 4.2786, "margin_dpo/margin_mean": 47.0973014831543, "margin_dpo/margin_std": 58.554161071777344, "step": 298 }, { "epoch": 0.6261780104712041, "grad_norm": 68.31159973144531, "learning_rate": 1.8573744269954297e-07, "logits/chosen": 1.3761045932769775, "logits/rejected": 1.3730204105377197, "logps/chosen": -297.95330810546875, "logps/ref_chosen": -266.99658203125, "logps/ref_rejected": -262.5125427246094, "logps/rejected": -327.41265869140625, "loss": 4.5174, "margin_dpo/margin_mean": 33.94337463378906, "margin_dpo/margin_std": 67.46819305419922, "step": 299 }, { "epoch": 0.6282722513089005, "grad_norm": 41.25592803955078, "learning_rate": 1.839699339491937e-07, "logits/chosen": 1.0598005056381226, "logits/rejected": 1.1422343254089355, "logps/chosen": -306.2508239746094, "logps/ref_chosen": -281.19525146484375, "logps/ref_rejected": -288.6803894042969, "logps/rejected": -364.2799377441406, "loss": 4.5063, "margin_dpo/margin_mean": 50.54399490356445, "margin_dpo/margin_std": 57.26484298706055, "step": 300 }, { "epoch": 0.6303664921465969, "grad_norm": 80.2366943359375, "learning_rate": 1.8220596619089573e-07, "logits/chosen": 1.5710808038711548, "logits/rejected": 1.6003844738006592, "logps/chosen": -304.22662353515625, "logps/ref_chosen": -289.8253173828125, "logps/ref_rejected": -327.8699645996094, "logps/rejected": -389.2873840332031, "loss": 4.5827, "margin_dpo/margin_mean": 47.01613235473633, "margin_dpo/margin_std": 53.770626068115234, "step": 301 }, { "epoch": 0.6324607329842932, "grad_norm": 45.32533645629883, "learning_rate": 1.8044563402088682e-07, "logits/chosen": 1.390702486038208, "logits/rejected": 1.5861480236053467, "logps/chosen": -341.7214660644531, "logps/ref_chosen": -307.1119079589844, "logps/ref_rejected": -296.61785888671875, "logps/rejected": -385.1257019042969, "loss": 4.3462, "margin_dpo/margin_mean": 53.89823532104492, "margin_dpo/margin_std": 60.43596267700195, "step": 302 }, { "epoch": 0.6345549738219896, "grad_norm": 35.210235595703125, "learning_rate": 1.7868903184043885e-07, "logits/chosen": 1.0414403676986694, "logits/rejected": 1.2032232284545898, "logps/chosen": -287.9827575683594, "logps/ref_chosen": -261.281982421875, "logps/ref_rejected": -287.9131164550781, "logps/rejected": -370.0382385253906, "loss": 4.4312, "margin_dpo/margin_mean": 55.42430114746094, "margin_dpo/margin_std": 58.83437728881836, "step": 303 }, { "epoch": 0.6366492146596858, "grad_norm": 53.23714065551758, "learning_rate": 1.7693625385079574e-07, "logits/chosen": 1.1299974918365479, "logits/rejected": 1.1754674911499023, "logps/chosen": -317.24932861328125, "logps/ref_chosen": -276.4831848144531, "logps/ref_rejected": -257.2686462402344, "logps/rejected": -332.5347595214844, "loss": 4.6016, "margin_dpo/margin_mean": 34.49999237060547, "margin_dpo/margin_std": 42.753028869628906, "step": 304 }, { "epoch": 0.6387434554973822, "grad_norm": 50.1977653503418, "learning_rate": 1.7518739404812155e-07, "logits/chosen": 1.1471208333969116, "logits/rejected": 1.1764111518859863, "logps/chosen": -272.23565673828125, "logps/ref_chosen": -253.3165283203125, "logps/ref_rejected": -225.20468139648438, "logps/rejected": -278.5679931640625, "loss": 4.0448, "margin_dpo/margin_mean": 34.444190979003906, "margin_dpo/margin_std": 46.56166076660156, "step": 305 }, { "epoch": 0.6408376963350786, "grad_norm": 51.97341537475586, "learning_rate": 1.7344254621846017e-07, "logits/chosen": 1.2266101837158203, "logits/rejected": 1.1241114139556885, "logps/chosen": -338.9609069824219, "logps/ref_chosen": -324.57122802734375, "logps/ref_rejected": -299.1585693359375, "logps/rejected": -352.9222412109375, "loss": 4.306, "margin_dpo/margin_mean": 39.374000549316406, "margin_dpo/margin_std": 74.34258270263672, "step": 306 }, { "epoch": 0.6429319371727749, "grad_norm": 45.46051025390625, "learning_rate": 1.717018039327053e-07, "logits/chosen": 1.1175193786621094, "logits/rejected": 1.2578641176223755, "logps/chosen": -320.50177001953125, "logps/ref_chosen": -289.5794372558594, "logps/ref_rejected": -262.92510986328125, "logps/rejected": -347.1852722167969, "loss": 4.1751, "margin_dpo/margin_mean": 53.3377685546875, "margin_dpo/margin_std": 65.59990692138672, "step": 307 }, { "epoch": 0.6450261780104712, "grad_norm": 34.76543045043945, "learning_rate": 1.699652605415828e-07, "logits/chosen": 1.338348388671875, "logits/rejected": 1.3202842473983765, "logps/chosen": -348.8934020996094, "logps/ref_chosen": -305.04351806640625, "logps/ref_rejected": -305.0120849609375, "logps/rejected": -384.90301513671875, "loss": 4.6399, "margin_dpo/margin_mean": 36.04100036621094, "margin_dpo/margin_std": 60.608455657958984, "step": 308 }, { "epoch": 0.6471204188481675, "grad_norm": 66.8311996459961, "learning_rate": 1.6823300917064458e-07, "logits/chosen": 1.619686484336853, "logits/rejected": 1.3856267929077148, "logps/chosen": -354.4423522949219, "logps/ref_chosen": -316.80303955078125, "logps/ref_rejected": -240.09307861328125, "logps/rejected": -317.1116027832031, "loss": 4.284, "margin_dpo/margin_mean": 39.3791389465332, "margin_dpo/margin_std": 59.34083938598633, "step": 309 }, { "epoch": 0.6492146596858639, "grad_norm": 61.75398254394531, "learning_rate": 1.6650514271527465e-07, "logits/chosen": 1.2451605796813965, "logits/rejected": 1.5067654848098755, "logps/chosen": -289.0413818359375, "logps/ref_chosen": -240.17652893066406, "logps/ref_rejected": -242.7730712890625, "logps/rejected": -332.27880859375, "loss": 4.4389, "margin_dpo/margin_mean": 40.6408576965332, "margin_dpo/margin_std": 47.202674865722656, "step": 310 }, { "epoch": 0.6513089005235602, "grad_norm": 44.972686767578125, "learning_rate": 1.647817538357072e-07, "logits/chosen": 1.1516824960708618, "logits/rejected": 1.3081568479537964, "logps/chosen": -300.0003662109375, "logps/ref_chosen": -257.53515625, "logps/ref_rejected": -249.1999053955078, "logps/rejected": -334.6231689453125, "loss": 4.4418, "margin_dpo/margin_mean": 42.95802307128906, "margin_dpo/margin_std": 56.176063537597656, "step": 311 }, { "epoch": 0.6534031413612565, "grad_norm": 70.0779037475586, "learning_rate": 1.6306293495205755e-07, "logits/chosen": 1.374745488166809, "logits/rejected": 1.4253482818603516, "logps/chosen": -301.5932922363281, "logps/ref_chosen": -261.98828125, "logps/ref_rejected": -238.6123504638672, "logps/rejected": -317.7529296875, "loss": 4.3747, "margin_dpo/margin_mean": 39.53556442260742, "margin_dpo/margin_std": 68.29449462890625, "step": 312 }, { "epoch": 0.6554973821989529, "grad_norm": 56.74006271362305, "learning_rate": 1.6134877823936607e-07, "logits/chosen": 1.480233073234558, "logits/rejected": 1.6010148525238037, "logps/chosen": -417.88751220703125, "logps/ref_chosen": -380.5164794921875, "logps/ref_rejected": -340.59722900390625, "logps/rejected": -436.11846923828125, "loss": 4.5172, "margin_dpo/margin_mean": 58.15019607543945, "margin_dpo/margin_std": 64.69535064697266, "step": 313 }, { "epoch": 0.6575916230366492, "grad_norm": 52.93495559692383, "learning_rate": 1.5963937562265522e-07, "logits/chosen": 1.3546419143676758, "logits/rejected": 1.3760360479354858, "logps/chosen": -288.9587707519531, "logps/ref_chosen": -254.8392791748047, "logps/ref_rejected": -233.38494873046875, "logps/rejected": -312.0257263183594, "loss": 4.4718, "margin_dpo/margin_mean": 44.52123260498047, "margin_dpo/margin_std": 63.62104415893555, "step": 314 }, { "epoch": 0.6596858638743456, "grad_norm": 41.35818862915039, "learning_rate": 1.5793481877199943e-07, "logits/chosen": 1.7810715436935425, "logits/rejected": 1.7476561069488525, "logps/chosen": -315.27471923828125, "logps/ref_chosen": -287.1436767578125, "logps/ref_rejected": -245.744873046875, "logps/rejected": -311.196044921875, "loss": 4.1818, "margin_dpo/margin_mean": 37.32012939453125, "margin_dpo/margin_std": 46.346778869628906, "step": 315 }, { "epoch": 0.6617801047120419, "grad_norm": 60.255733489990234, "learning_rate": 1.562351990976095e-07, "logits/chosen": 0.9633012413978577, "logits/rejected": 1.0967074632644653, "logps/chosen": -310.6409912109375, "logps/ref_chosen": -278.97003173828125, "logps/ref_rejected": -268.5596618652344, "logps/rejected": -364.9547119140625, "loss": 4.1869, "margin_dpo/margin_mean": 64.72406005859375, "margin_dpo/margin_std": 64.02919006347656, "step": 316 }, { "epoch": 0.6638743455497382, "grad_norm": 63.166786193847656, "learning_rate": 1.5454060774493065e-07, "logits/chosen": 1.2942625284194946, "logits/rejected": 1.294532060623169, "logps/chosen": -277.548095703125, "logps/ref_chosen": -252.86656188964844, "logps/ref_rejected": -236.70155334472656, "logps/rejected": -304.314208984375, "loss": 4.3267, "margin_dpo/margin_mean": 42.931148529052734, "margin_dpo/margin_std": 60.30817413330078, "step": 317 }, { "epoch": 0.6659685863874345, "grad_norm": 59.5412712097168, "learning_rate": 1.5285113558975427e-07, "logits/chosen": 1.2442307472229004, "logits/rejected": 1.4497402906417847, "logps/chosen": -252.0952606201172, "logps/ref_chosen": -217.34515380859375, "logps/ref_rejected": -243.4803009033203, "logps/rejected": -328.711669921875, "loss": 4.2442, "margin_dpo/margin_mean": 50.481266021728516, "margin_dpo/margin_std": 49.45831298828125, "step": 318 }, { "epoch": 0.6680628272251309, "grad_norm": 34.23768615722656, "learning_rate": 1.5116687323334464e-07, "logits/chosen": 1.0364283323287964, "logits/rejected": 1.2878518104553223, "logps/chosen": -290.0143737792969, "logps/ref_chosen": -268.8816833496094, "logps/ref_rejected": -275.4843444824219, "logps/rejected": -347.4952697753906, "loss": 4.0379, "margin_dpo/margin_mean": 50.878273010253906, "margin_dpo/margin_std": 42.63667678833008, "step": 319 }, { "epoch": 0.6701570680628273, "grad_norm": 55.01826477050781, "learning_rate": 1.4948791099758052e-07, "logits/chosen": 1.7030011415481567, "logits/rejected": 1.6658614873886108, "logps/chosen": -328.2850341796875, "logps/ref_chosen": -307.4996337890625, "logps/ref_rejected": -251.08456420898438, "logps/rejected": -320.5891418457031, "loss": 4.3706, "margin_dpo/margin_mean": 48.71923065185547, "margin_dpo/margin_std": 54.48359680175781, "step": 320 }, { "epoch": 0.6722513089005235, "grad_norm": 36.16436004638672, "learning_rate": 1.478143389201113e-07, "logits/chosen": 1.5252739191055298, "logits/rejected": 1.3168452978134155, "logps/chosen": -343.71514892578125, "logps/ref_chosen": -309.8309631347656, "logps/ref_rejected": -248.75213623046875, "logps/rejected": -326.5061340332031, "loss": 4.4452, "margin_dpo/margin_mean": 43.869834899902344, "margin_dpo/margin_std": 63.80504608154297, "step": 321 }, { "epoch": 0.6743455497382199, "grad_norm": 43.79127502441406, "learning_rate": 1.461462467495284e-07, "logits/chosen": 0.9715927243232727, "logits/rejected": 1.0002247095108032, "logps/chosen": -323.4966735839844, "logps/ref_chosen": -291.58843994140625, "logps/ref_rejected": -265.43023681640625, "logps/rejected": -339.85174560546875, "loss": 4.1918, "margin_dpo/margin_mean": 42.5133056640625, "margin_dpo/margin_std": 48.27851486206055, "step": 322 }, { "epoch": 0.6764397905759162, "grad_norm": 47.61728286743164, "learning_rate": 1.4448372394055246e-07, "logits/chosen": 1.0764468908309937, "logits/rejected": 0.8316705822944641, "logps/chosen": -385.0590515136719, "logps/ref_chosen": -343.968017578125, "logps/ref_rejected": -254.12161254882812, "logps/rejected": -329.58868408203125, "loss": 4.879, "margin_dpo/margin_mean": 34.37602996826172, "margin_dpo/margin_std": 67.8411636352539, "step": 323 }, { "epoch": 0.6785340314136126, "grad_norm": 35.46821975708008, "learning_rate": 1.428268596492364e-07, "logits/chosen": 1.5448215007781982, "logits/rejected": 1.5194947719573975, "logps/chosen": -213.710693359375, "logps/ref_chosen": -206.94500732421875, "logps/ref_rejected": -262.6962890625, "logps/rejected": -316.6614990234375, "loss": 3.9753, "margin_dpo/margin_mean": 47.19957733154297, "margin_dpo/margin_std": 56.4971923828125, "step": 324 }, { "epoch": 0.680628272251309, "grad_norm": 40.15785598754883, "learning_rate": 1.4117574272818386e-07, "logits/chosen": 1.4091088771820068, "logits/rejected": 1.5492061376571655, "logps/chosen": -311.66400146484375, "logps/ref_chosen": -301.9862060546875, "logps/ref_rejected": -333.42236328125, "logps/rejected": -399.2699890136719, "loss": 4.5302, "margin_dpo/margin_mean": 56.1698112487793, "margin_dpo/margin_std": 54.43414306640625, "step": 325 }, { "epoch": 0.6827225130890052, "grad_norm": 52.91168975830078, "learning_rate": 1.3953046172178413e-07, "logits/chosen": 0.951869785785675, "logits/rejected": 1.228202223777771, "logps/chosen": -177.7034912109375, "logps/ref_chosen": -164.46109008789062, "logps/ref_rejected": -249.89413452148438, "logps/rejected": -324.3243713378906, "loss": 4.3905, "margin_dpo/margin_mean": 61.18782043457031, "margin_dpo/margin_std": 53.454124450683594, "step": 326 }, { "epoch": 0.6848167539267016, "grad_norm": 42.79719543457031, "learning_rate": 1.3789110486146468e-07, "logits/chosen": 1.5188686847686768, "logits/rejected": 1.4478236436843872, "logps/chosen": -259.4933166503906, "logps/ref_chosen": -246.3433837890625, "logps/ref_rejected": -229.85508728027344, "logps/rejected": -297.393798828125, "loss": 4.1578, "margin_dpo/margin_mean": 54.388763427734375, "margin_dpo/margin_std": 67.07605743408203, "step": 327 }, { "epoch": 0.6869109947643979, "grad_norm": 62.9756965637207, "learning_rate": 1.362577600609588e-07, "logits/chosen": 0.8666256666183472, "logits/rejected": 0.9310898780822754, "logps/chosen": -325.38824462890625, "logps/ref_chosen": -305.82012939453125, "logps/ref_rejected": -273.3159484863281, "logps/rejected": -348.3529357910156, "loss": 4.3427, "margin_dpo/margin_mean": 55.4688720703125, "margin_dpo/margin_std": 43.20487594604492, "step": 328 }, { "epoch": 0.6890052356020943, "grad_norm": 51.00785827636719, "learning_rate": 1.3463051491159093e-07, "logits/chosen": 1.4473413228988647, "logits/rejected": 1.7869523763656616, "logps/chosen": -283.1918029785156, "logps/ref_chosen": -258.7630615234375, "logps/ref_rejected": -284.41131591796875, "logps/rejected": -350.9872131347656, "loss": 4.7045, "margin_dpo/margin_mean": 42.1472053527832, "margin_dpo/margin_std": 59.915836334228516, "step": 329 }, { "epoch": 0.6910994764397905, "grad_norm": 47.54462432861328, "learning_rate": 1.3300945667758012e-07, "logits/chosen": 1.554024338722229, "logits/rejected": 1.5038138628005981, "logps/chosen": -360.6363830566406, "logps/ref_chosen": -330.3982238769531, "logps/ref_rejected": -274.9824523925781, "logps/rejected": -335.7353515625, "loss": 4.7699, "margin_dpo/margin_mean": 30.514692306518555, "margin_dpo/margin_std": 57.54186248779297, "step": 330 }, { "epoch": 0.6931937172774869, "grad_norm": 43.25348663330078, "learning_rate": 1.3139467229135998e-07, "logits/chosen": 1.2080715894699097, "logits/rejected": 1.1073570251464844, "logps/chosen": -306.7835693359375, "logps/ref_chosen": -279.2760009765625, "logps/ref_rejected": -220.27761840820312, "logps/rejected": -285.9614562988281, "loss": 4.4308, "margin_dpo/margin_mean": 38.1762580871582, "margin_dpo/margin_std": 54.596893310546875, "step": 331 }, { "epoch": 0.6952879581151833, "grad_norm": 40.57438278198242, "learning_rate": 1.2978624834891626e-07, "logits/chosen": 1.1713310480117798, "logits/rejected": 1.2179394960403442, "logps/chosen": -245.34036254882812, "logps/ref_chosen": -226.70223999023438, "logps/ref_rejected": -205.92601013183594, "logps/rejected": -280.2979736328125, "loss": 4.2023, "margin_dpo/margin_mean": 55.733863830566406, "margin_dpo/margin_std": 48.06736755371094, "step": 332 }, { "epoch": 0.6973821989528796, "grad_norm": 51.89149475097656, "learning_rate": 1.281842711051438e-07, "logits/chosen": 1.1539713144302368, "logits/rejected": 1.0611777305603027, "logps/chosen": -303.71087646484375, "logps/ref_chosen": -280.1510009765625, "logps/ref_rejected": -231.2144012451172, "logps/rejected": -309.7812194824219, "loss": 4.5953, "margin_dpo/margin_mean": 55.00693130493164, "margin_dpo/margin_std": 62.069034576416016, "step": 333 }, { "epoch": 0.6994764397905759, "grad_norm": 38.94709014892578, "learning_rate": 1.2658882646922033e-07, "logits/chosen": 1.1479655504226685, "logits/rejected": 1.1974003314971924, "logps/chosen": -290.0271301269531, "logps/ref_chosen": -269.64227294921875, "logps/ref_rejected": -260.0500793457031, "logps/rejected": -314.5991516113281, "loss": 4.4026, "margin_dpo/margin_mean": 34.16423416137695, "margin_dpo/margin_std": 44.75538635253906, "step": 334 }, { "epoch": 0.7015706806282722, "grad_norm": 49.46840286254883, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 1.2762084007263184, "logits/rejected": 1.3634967803955078, "logps/chosen": -351.87103271484375, "logps/ref_chosen": -304.7079162597656, "logps/ref_rejected": -269.1751403808594, "logps/rejected": -330.24505615234375, "loss": 4.5817, "margin_dpo/margin_mean": 13.906787872314453, "margin_dpo/margin_std": 66.35944366455078, "step": 335 }, { "epoch": 0.7036649214659686, "grad_norm": 34.12943649291992, "learning_rate": 1.2341787690142435e-07, "logits/chosen": 1.5096263885498047, "logits/rejected": 1.792067289352417, "logps/chosen": -218.86050415039062, "logps/ref_chosen": -210.38368225097656, "logps/ref_rejected": -229.12037658691406, "logps/rejected": -289.0048522949219, "loss": 4.5624, "margin_dpo/margin_mean": 51.40761947631836, "margin_dpo/margin_std": 47.56304931640625, "step": 336 }, { "epoch": 0.7057591623036649, "grad_norm": 52.81936264038086, "learning_rate": 1.2184254201795363e-07, "logits/chosen": 0.93461012840271, "logits/rejected": 0.8486602306365967, "logps/chosen": -299.5815734863281, "logps/ref_chosen": -257.2767639160156, "logps/ref_rejected": -297.5929260253906, "logps/rejected": -374.0364074707031, "loss": 4.4007, "margin_dpo/margin_mean": 34.13860321044922, "margin_dpo/margin_std": 52.06587219238281, "step": 337 }, { "epoch": 0.7078534031413612, "grad_norm": 37.59018325805664, "learning_rate": 1.202740798300168e-07, "logits/chosen": 1.5364826917648315, "logits/rejected": 1.5837008953094482, "logps/chosen": -274.78564453125, "logps/ref_chosen": -257.8255310058594, "logps/ref_rejected": -216.51162719726562, "logps/rejected": -298.27276611328125, "loss": 4.1313, "margin_dpo/margin_mean": 64.8010482788086, "margin_dpo/margin_std": 52.778377532958984, "step": 338 }, { "epoch": 0.7099476439790576, "grad_norm": 43.429386138916016, "learning_rate": 1.1871257444948096e-07, "logits/chosen": 1.5933047533035278, "logits/rejected": 1.5398043394088745, "logps/chosen": -267.3699645996094, "logps/ref_chosen": -240.76815795898438, "logps/ref_rejected": -244.97377014160156, "logps/rejected": -315.84185791015625, "loss": 4.1606, "margin_dpo/margin_mean": 44.2662467956543, "margin_dpo/margin_std": 55.80255126953125, "step": 339 }, { "epoch": 0.7120418848167539, "grad_norm": 35.77497482299805, "learning_rate": 1.1715810961514072e-07, "logits/chosen": 0.9345113039016724, "logits/rejected": 1.0999596118927002, "logps/chosen": -204.77218627929688, "logps/ref_chosen": -187.35751342773438, "logps/ref_rejected": -232.0410614013672, "logps/rejected": -292.81396484375, "loss": 4.428, "margin_dpo/margin_mean": 43.358245849609375, "margin_dpo/margin_std": 77.42390441894531, "step": 340 }, { "epoch": 0.7141361256544503, "grad_norm": 60.169677734375, "learning_rate": 1.1561076868822755e-07, "logits/chosen": 1.5765248537063599, "logits/rejected": 1.8391637802124023, "logps/chosen": -314.5309753417969, "logps/ref_chosen": -283.4117736816406, "logps/ref_rejected": -302.2451171875, "logps/rejected": -380.4033203125, "loss": 4.801, "margin_dpo/margin_mean": 47.03903579711914, "margin_dpo/margin_std": 53.40591049194336, "step": 341 }, { "epoch": 0.7162303664921466, "grad_norm": 45.74888610839844, "learning_rate": 1.1407063464793965e-07, "logits/chosen": 1.175806999206543, "logits/rejected": 1.3320945501327515, "logps/chosen": -249.10618591308594, "logps/ref_chosen": -221.50335693359375, "logps/ref_rejected": -244.48382568359375, "logps/rejected": -306.7912292480469, "loss": 4.3427, "margin_dpo/margin_mean": 34.704566955566406, "margin_dpo/margin_std": 38.48919677734375, "step": 342 }, { "epoch": 0.7183246073298429, "grad_norm": 33.789772033691406, "learning_rate": 1.125377900869913e-07, "logits/chosen": 1.5595121383666992, "logits/rejected": 1.44582200050354, "logps/chosen": -346.7395935058594, "logps/ref_chosen": -340.46466064453125, "logps/ref_rejected": -267.65313720703125, "logps/rejected": -321.6587829589844, "loss": 4.5708, "margin_dpo/margin_mean": 47.7307243347168, "margin_dpo/margin_std": 60.58958435058594, "step": 343 }, { "epoch": 0.7204188481675393, "grad_norm": 31.717029571533203, "learning_rate": 1.110123172071844e-07, "logits/chosen": 1.2925912141799927, "logits/rejected": 1.426027774810791, "logps/chosen": -333.0621643066406, "logps/ref_chosen": -310.25018310546875, "logps/ref_rejected": -281.16302490234375, "logps/rejected": -352.4907531738281, "loss": 4.3502, "margin_dpo/margin_mean": 48.51573181152344, "margin_dpo/margin_std": 64.95976257324219, "step": 344 }, { "epoch": 0.7225130890052356, "grad_norm": 51.54704284667969, "learning_rate": 1.09494297815e-07, "logits/chosen": 1.4484617710113525, "logits/rejected": 1.585009217262268, "logps/chosen": -307.612548828125, "logps/ref_chosen": -284.6531066894531, "logps/ref_rejected": -304.72369384765625, "logps/rejected": -358.7782897949219, "loss": 4.51, "margin_dpo/margin_mean": 31.09515953063965, "margin_dpo/margin_std": 56.86581039428711, "step": 345 }, { "epoch": 0.724607329842932, "grad_norm": 65.52984619140625, "learning_rate": 1.0798381331721107e-07, "logits/chosen": 0.9799513220787048, "logits/rejected": 1.088678002357483, "logps/chosen": -310.8472595214844, "logps/ref_chosen": -255.6278076171875, "logps/ref_rejected": -237.61305236816406, "logps/rejected": -325.5954284667969, "loss": 4.375, "margin_dpo/margin_mean": 32.76295471191406, "margin_dpo/margin_std": 52.9566535949707, "step": 346 }, { "epoch": 0.7267015706806282, "grad_norm": 57.053504943847656, "learning_rate": 1.0648094471651722e-07, "logits/chosen": 1.3673675060272217, "logits/rejected": 1.3742492198944092, "logps/chosen": -315.336181640625, "logps/ref_chosen": -287.71807861328125, "logps/ref_rejected": -276.2634582519531, "logps/rejected": -353.022705078125, "loss": 4.5065, "margin_dpo/margin_mean": 49.14113998413086, "margin_dpo/margin_std": 64.81655883789062, "step": 347 }, { "epoch": 0.7287958115183246, "grad_norm": 41.55851745605469, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 1.4351952075958252, "logits/rejected": 1.5722769498825073, "logps/chosen": -296.4559020996094, "logps/ref_chosen": -285.63232421875, "logps/ref_rejected": -264.0018615722656, "logps/rejected": -319.1177978515625, "loss": 4.6334, "margin_dpo/margin_mean": 44.29237365722656, "margin_dpo/margin_std": 51.82339096069336, "step": 348 }, { "epoch": 0.7308900523560209, "grad_norm": 34.51556396484375, "learning_rate": 1.0349837717080347e-07, "logits/chosen": 1.2128405570983887, "logits/rejected": 1.2827637195587158, "logps/chosen": -378.03369140625, "logps/ref_chosen": -347.98370361328125, "logps/ref_rejected": -328.8855895996094, "logps/rejected": -408.9764709472656, "loss": 4.2865, "margin_dpo/margin_mean": 50.040870666503906, "margin_dpo/margin_std": 62.83095169067383, "step": 349 }, { "epoch": 0.7329842931937173, "grad_norm": 51.1704216003418, "learning_rate": 1.0201883817182949e-07, "logits/chosen": 1.6027448177337646, "logits/rejected": 1.452850103378296, "logps/chosen": -300.945556640625, "logps/ref_chosen": -292.4501647949219, "logps/ref_rejected": -188.39346313476562, "logps/rejected": -268.33087158203125, "loss": 4.4481, "margin_dpo/margin_mean": 71.44198608398438, "margin_dpo/margin_std": 57.31879806518555, "step": 350 }, { "epoch": 0.7350785340314137, "grad_norm": 54.72409439086914, "learning_rate": 1.0054723495346482e-07, "logits/chosen": 1.2379735708236694, "logits/rejected": 1.2731664180755615, "logps/chosen": -284.1971740722656, "logps/ref_chosen": -267.4852294921875, "logps/ref_rejected": -223.13552856445312, "logps/rejected": -285.99163818359375, "loss": 4.9619, "margin_dpo/margin_mean": 46.14418029785156, "margin_dpo/margin_std": 70.45701599121094, "step": 351 }, { "epoch": 0.7371727748691099, "grad_norm": 65.03557586669922, "learning_rate": 9.908364643332398e-08, "logits/chosen": 1.2324098348617554, "logits/rejected": 1.4873018264770508, "logps/chosen": -282.592529296875, "logps/ref_chosen": -257.07952880859375, "logps/ref_rejected": -294.4090881347656, "logps/rejected": -372.9225158691406, "loss": 4.2427, "margin_dpo/margin_mean": 53.00044631958008, "margin_dpo/margin_std": 67.86283111572266, "step": 352 }, { "epoch": 0.7392670157068063, "grad_norm": 39.74465560913086, "learning_rate": 9.76281510992176e-08, "logits/chosen": 1.1475257873535156, "logits/rejected": 1.180965781211853, "logps/chosen": -321.6578674316406, "logps/ref_chosen": -290.9927062988281, "logps/ref_rejected": -263.1128845214844, "logps/rejected": -340.19183349609375, "loss": 4.2739, "margin_dpo/margin_mean": 46.413780212402344, "margin_dpo/margin_std": 61.069339752197266, "step": 353 }, { "epoch": 0.7413612565445026, "grad_norm": 48.98942184448242, "learning_rate": 9.618082700494318e-08, "logits/chosen": 1.084821343421936, "logits/rejected": 1.1856131553649902, "logps/chosen": -224.8162384033203, "logps/ref_chosen": -196.65435791015625, "logps/ref_rejected": -193.15533447265625, "logps/rejected": -250.77032470703125, "loss": 4.9672, "margin_dpo/margin_mean": 29.453073501586914, "margin_dpo/margin_std": 54.51377487182617, "step": 354 }, { "epoch": 0.743455497382199, "grad_norm": 52.5598030090332, "learning_rate": 9.474175176609956e-08, "logits/chosen": 1.538980484008789, "logits/rejected": 1.710750937461853, "logps/chosen": -305.9091491699219, "logps/ref_chosen": -277.7572937011719, "logps/ref_rejected": -296.24908447265625, "logps/rejected": -365.863525390625, "loss": 4.304, "margin_dpo/margin_mean": 41.46260452270508, "margin_dpo/margin_std": 60.0238151550293, "step": 355 }, { "epoch": 0.7455497382198953, "grad_norm": 38.38217544555664, "learning_rate": 9.331100255592436e-08, "logits/chosen": 1.1549817323684692, "logits/rejected": 1.270320177078247, "logps/chosen": -250.93751525878906, "logps/ref_chosen": -228.735595703125, "logps/ref_rejected": -288.4073486328125, "logps/rejected": -340.65838623046875, "loss": 4.4604, "margin_dpo/margin_mean": 30.04913902282715, "margin_dpo/margin_std": 50.18395233154297, "step": 356 }, { "epoch": 0.7476439790575916, "grad_norm": 48.77327346801758, "learning_rate": 9.18886561011557e-08, "logits/chosen": 1.2281720638275146, "logits/rejected": 1.2340593338012695, "logps/chosen": -345.0635986328125, "logps/ref_chosen": -327.5565185546875, "logps/ref_rejected": -286.9888610839844, "logps/rejected": -364.4182434082031, "loss": 4.4785, "margin_dpo/margin_mean": 59.92234802246094, "margin_dpo/margin_std": 68.7721939086914, "step": 357 }, { "epoch": 0.749738219895288, "grad_norm": 33.63121795654297, "learning_rate": 9.047478867791731e-08, "logits/chosen": 1.2939711809158325, "logits/rejected": 1.3026853799819946, "logps/chosen": -300.43011474609375, "logps/ref_chosen": -275.9919738769531, "logps/ref_rejected": -226.95779418945312, "logps/rejected": -304.18017578125, "loss": 4.326, "margin_dpo/margin_mean": 52.78423309326172, "margin_dpo/margin_std": 65.29212951660156, "step": 358 }, { "epoch": 0.7518324607329843, "grad_norm": 41.51667022705078, "learning_rate": 8.906947610762825e-08, "logits/chosen": 1.193036675453186, "logits/rejected": 1.3061870336532593, "logps/chosen": -288.68792724609375, "logps/ref_chosen": -265.4796447753906, "logps/ref_rejected": -269.9594421386719, "logps/rejected": -336.7837219238281, "loss": 4.2962, "margin_dpo/margin_mean": 43.615962982177734, "margin_dpo/margin_std": 57.33641052246094, "step": 359 }, { "epoch": 0.7539267015706806, "grad_norm": 41.91855239868164, "learning_rate": 8.76727937529367e-08, "logits/chosen": 1.4477754831314087, "logits/rejected": 1.4021023511886597, "logps/chosen": -364.541015625, "logps/ref_chosen": -336.95709228515625, "logps/ref_rejected": -275.51239013671875, "logps/rejected": -352.3710632324219, "loss": 4.4654, "margin_dpo/margin_mean": 49.27482604980469, "margin_dpo/margin_std": 65.66383361816406, "step": 360 }, { "epoch": 0.7560209424083769, "grad_norm": 37.49740219116211, "learning_rate": 8.628481651367875e-08, "logits/chosen": 1.1130855083465576, "logits/rejected": 1.3169794082641602, "logps/chosen": -233.5477294921875, "logps/ref_chosen": -223.0279541015625, "logps/ref_rejected": -233.6653289794922, "logps/rejected": -294.90289306640625, "loss": 4.1403, "margin_dpo/margin_mean": 50.717796325683594, "margin_dpo/margin_std": 58.103668212890625, "step": 361 }, { "epoch": 0.7581151832460733, "grad_norm": 75.54791259765625, "learning_rate": 8.490561882286135e-08, "logits/chosen": 1.167074203491211, "logits/rejected": 1.180873155593872, "logps/chosen": -337.9937744140625, "logps/ref_chosen": -298.1035461425781, "logps/ref_rejected": -230.74783325195312, "logps/rejected": -312.3794250488281, "loss": 4.3668, "margin_dpo/margin_mean": 41.741363525390625, "margin_dpo/margin_std": 74.15339660644531, "step": 362 }, { "epoch": 0.7602094240837697, "grad_norm": 69.05213928222656, "learning_rate": 8.353527464267104e-08, "logits/chosen": 1.4149866104125977, "logits/rejected": 1.3386046886444092, "logps/chosen": -324.6624450683594, "logps/ref_chosen": -315.25506591796875, "logps/ref_rejected": -276.456298828125, "logps/rejected": -352.84759521484375, "loss": 4.1568, "margin_dpo/margin_mean": 66.9839096069336, "margin_dpo/margin_std": 64.92231750488281, "step": 363 }, { "epoch": 0.762303664921466, "grad_norm": 31.97148895263672, "learning_rate": 8.217385746050742e-08, "logits/chosen": 1.5564781427383423, "logits/rejected": 1.3156113624572754, "logps/chosen": -380.2980041503906, "logps/ref_chosen": -336.43798828125, "logps/ref_rejected": -259.9676818847656, "logps/rejected": -339.92596435546875, "loss": 4.5886, "margin_dpo/margin_mean": 36.0982666015625, "margin_dpo/margin_std": 53.2963752746582, "step": 364 }, { "epoch": 0.7643979057591623, "grad_norm": 53.09539794921875, "learning_rate": 8.082144028504231e-08, "logits/chosen": 1.0552072525024414, "logits/rejected": 1.2801932096481323, "logps/chosen": -227.60394287109375, "logps/ref_chosen": -209.7356719970703, "logps/ref_rejected": -294.5636291503906, "logps/rejected": -367.4307861328125, "loss": 4.2219, "margin_dpo/margin_mean": 54.99891662597656, "margin_dpo/margin_std": 50.34431457519531, "step": 365 }, { "epoch": 0.7664921465968586, "grad_norm": 56.880043029785156, "learning_rate": 7.947809564230445e-08, "logits/chosen": 1.2870928049087524, "logits/rejected": 1.2019919157028198, "logps/chosen": -343.0006408691406, "logps/ref_chosen": -312.77142333984375, "logps/ref_rejected": -273.8427734375, "logps/rejected": -375.21856689453125, "loss": 4.2553, "margin_dpo/margin_mean": 71.1466293334961, "margin_dpo/margin_std": 62.22167205810547, "step": 366 }, { "epoch": 0.768586387434555, "grad_norm": 47.9551887512207, "learning_rate": 7.814389557179016e-08, "logits/chosen": 1.7175925970077515, "logits/rejected": 1.4891177415847778, "logps/chosen": -329.87542724609375, "logps/ref_chosen": -284.1925964355469, "logps/ref_rejected": -208.8526611328125, "logps/rejected": -294.8234558105469, "loss": 4.1368, "margin_dpo/margin_mean": 40.28790283203125, "margin_dpo/margin_std": 68.52880859375, "step": 367 }, { "epoch": 0.7706806282722513, "grad_norm": 32.45009231567383, "learning_rate": 7.681891162260015e-08, "logits/chosen": 1.7018953561782837, "logits/rejected": 1.5844436883926392, "logps/chosen": -376.4222412109375, "logps/ref_chosen": -360.64459228515625, "logps/ref_rejected": -297.281005859375, "logps/rejected": -367.4183654785156, "loss": 3.9894, "margin_dpo/margin_mean": 54.35973358154297, "margin_dpo/margin_std": 49.44243621826172, "step": 368 }, { "epoch": 0.7727748691099476, "grad_norm": 85.05940246582031, "learning_rate": 7.550321484960251e-08, "logits/chosen": 1.4974400997161865, "logits/rejected": 1.5112247467041016, "logps/chosen": -364.4231872558594, "logps/ref_chosen": -340.94610595703125, "logps/ref_rejected": -285.1484069824219, "logps/rejected": -367.0999450683594, "loss": 4.5603, "margin_dpo/margin_mean": 58.47446060180664, "margin_dpo/margin_std": 59.77842712402344, "step": 369 }, { "epoch": 0.774869109947644, "grad_norm": 36.800376892089844, "learning_rate": 7.419687580962222e-08, "logits/chosen": 1.2962076663970947, "logits/rejected": 1.5088801383972168, "logps/chosen": -313.3735046386719, "logps/ref_chosen": -276.9629211425781, "logps/ref_rejected": -274.93865966796875, "logps/rejected": -353.8816223144531, "loss": 4.1904, "margin_dpo/margin_mean": 42.532344818115234, "margin_dpo/margin_std": 52.544273376464844, "step": 370 }, { "epoch": 0.7769633507853403, "grad_norm": 79.35717010498047, "learning_rate": 7.289996455765748e-08, "logits/chosen": 0.7718454599380493, "logits/rejected": 1.0238559246063232, "logps/chosen": -365.87713623046875, "logps/ref_chosen": -323.23980712890625, "logps/ref_rejected": -317.0935363769531, "logps/rejected": -390.978759765625, "loss": 4.4027, "margin_dpo/margin_mean": 31.247907638549805, "margin_dpo/margin_std": 54.091983795166016, "step": 371 }, { "epoch": 0.7790575916230367, "grad_norm": 33.439125061035156, "learning_rate": 7.161255064312283e-08, "logits/chosen": 1.3146398067474365, "logits/rejected": 1.290389060974121, "logps/chosen": -338.93792724609375, "logps/ref_chosen": -303.75262451171875, "logps/ref_rejected": -205.62069702148438, "logps/rejected": -309.5386962890625, "loss": 4.0137, "margin_dpo/margin_mean": 68.732666015625, "margin_dpo/margin_std": 57.33296585083008, "step": 372 }, { "epoch": 0.7811518324607329, "grad_norm": 45.49492263793945, "learning_rate": 7.033470310611945e-08, "logits/chosen": 1.4074095487594604, "logits/rejected": 1.1533772945404053, "logps/chosen": -377.86865234375, "logps/ref_chosen": -346.5982666015625, "logps/ref_rejected": -253.89280700683594, "logps/rejected": -333.4144287109375, "loss": 4.1968, "margin_dpo/margin_mean": 48.25126266479492, "margin_dpo/margin_std": 56.07286834716797, "step": 373 }, { "epoch": 0.7832460732984293, "grad_norm": 57.437164306640625, "learning_rate": 6.906649047373245e-08, "logits/chosen": 1.4347639083862305, "logits/rejected": 1.5623588562011719, "logps/chosen": -292.1949768066406, "logps/ref_chosen": -252.59971618652344, "logps/ref_rejected": -249.61476135253906, "logps/rejected": -319.74609375, "loss": 4.7414, "margin_dpo/margin_mean": 30.536096572875977, "margin_dpo/margin_std": 58.64886474609375, "step": 374 }, { "epoch": 0.7853403141361257, "grad_norm": 67.63172149658203, "learning_rate": 6.780798075635675e-08, "logits/chosen": 1.1573803424835205, "logits/rejected": 1.0170570611953735, "logps/chosen": -274.2476806640625, "logps/ref_chosen": -247.3214569091797, "logps/ref_rejected": -188.48236083984375, "logps/rejected": -260.9528503417969, "loss": 4.4518, "margin_dpo/margin_mean": 45.54425811767578, "margin_dpo/margin_std": 53.55020523071289, "step": 375 }, { "epoch": 0.787434554973822, "grad_norm": 43.886695861816406, "learning_rate": 6.655924144404906e-08, "logits/chosen": 1.1326963901519775, "logits/rejected": 1.3887975215911865, "logps/chosen": -327.3310852050781, "logps/ref_chosen": -272.513916015625, "logps/ref_rejected": -305.7491760253906, "logps/rejected": -412.55767822265625, "loss": 4.5376, "margin_dpo/margin_mean": 51.99131393432617, "margin_dpo/margin_std": 69.22034454345703, "step": 376 }, { "epoch": 0.7895287958115184, "grad_norm": 47.22002029418945, "learning_rate": 6.532033950290885e-08, "logits/chosen": 1.356651782989502, "logits/rejected": 1.438635230064392, "logps/chosen": -323.85699462890625, "logps/ref_chosen": -298.3796081542969, "logps/ref_rejected": -273.41839599609375, "logps/rejected": -348.69940185546875, "loss": 4.5118, "margin_dpo/margin_mean": 49.80352020263672, "margin_dpo/margin_std": 56.26287841796875, "step": 377 }, { "epoch": 0.7916230366492146, "grad_norm": 43.87843704223633, "learning_rate": 6.409134137148736e-08, "logits/chosen": 1.4002658128738403, "logits/rejected": 1.4882569313049316, "logps/chosen": -305.824462890625, "logps/ref_chosen": -286.3173522949219, "logps/ref_rejected": -271.7178955078125, "logps/rejected": -340.6441345214844, "loss": 4.5644, "margin_dpo/margin_mean": 49.41912078857422, "margin_dpo/margin_std": 56.88557434082031, "step": 378 }, { "epoch": 0.793717277486911, "grad_norm": 57.02334213256836, "learning_rate": 6.28723129572247e-08, "logits/chosen": 1.439416527748108, "logits/rejected": 1.3806058168411255, "logps/chosen": -271.30230712890625, "logps/ref_chosen": -233.71743774414062, "logps/ref_rejected": -206.68927001953125, "logps/rejected": -284.8748474121094, "loss": 4.5262, "margin_dpo/margin_mean": 40.600711822509766, "margin_dpo/margin_std": 50.349300384521484, "step": 379 }, { "epoch": 0.7958115183246073, "grad_norm": 72.12355041503906, "learning_rate": 6.166331963291519e-08, "logits/chosen": 1.7105507850646973, "logits/rejected": 1.5248544216156006, "logps/chosen": -387.1551513671875, "logps/ref_chosen": -356.8863525390625, "logps/ref_rejected": -354.776123046875, "logps/rejected": -424.4413757324219, "loss": 4.4649, "margin_dpo/margin_mean": 39.396446228027344, "margin_dpo/margin_std": 53.38307189941406, "step": 380 }, { "epoch": 0.7979057591623037, "grad_norm": 97.61827850341797, "learning_rate": 6.046442623320145e-08, "logits/chosen": 0.9507350325584412, "logits/rejected": 1.0006842613220215, "logps/chosen": -260.0545654296875, "logps/ref_chosen": -235.81100463867188, "logps/ref_rejected": -237.37062072753906, "logps/rejected": -337.2359313964844, "loss": 4.1799, "margin_dpo/margin_mean": 75.62174987792969, "margin_dpo/margin_std": 58.47461700439453, "step": 381 }, { "epoch": 0.8, "grad_norm": 42.927345275878906, "learning_rate": 5.9275697051098275e-08, "logits/chosen": 1.312659740447998, "logits/rejected": 1.322435975074768, "logps/chosen": -294.5480651855469, "logps/ref_chosen": -259.17388916015625, "logps/ref_rejected": -230.83482360839844, "logps/rejected": -323.2818298339844, "loss": 3.9698, "margin_dpo/margin_mean": 57.07281494140625, "margin_dpo/margin_std": 65.82691955566406, "step": 382 }, { "epoch": 0.8020942408376963, "grad_norm": 60.843833923339844, "learning_rate": 5.809719583454414e-08, "logits/chosen": 1.0891731977462769, "logits/rejected": 1.3433376550674438, "logps/chosen": -316.5683898925781, "logps/ref_chosen": -269.8660583496094, "logps/ref_rejected": -320.0415954589844, "logps/rejected": -390.5964660644531, "loss": 4.4135, "margin_dpo/margin_mean": 23.852542877197266, "margin_dpo/margin_std": 56.356651306152344, "step": 383 }, { "epoch": 0.8041884816753927, "grad_norm": 70.86746978759766, "learning_rate": 5.6928985782982524e-08, "logits/chosen": 1.2213385105133057, "logits/rejected": 1.5940483808517456, "logps/chosen": -313.1070251464844, "logps/ref_chosen": -280.7498779296875, "logps/ref_rejected": -324.9134216308594, "logps/rejected": -393.7688293457031, "loss": 4.9522, "margin_dpo/margin_mean": 36.498287200927734, "margin_dpo/margin_std": 67.0685043334961, "step": 384 }, { "epoch": 0.806282722513089, "grad_norm": 41.98042678833008, "learning_rate": 5.57711295439732e-08, "logits/chosen": 1.4527329206466675, "logits/rejected": 1.5307879447937012, "logps/chosen": -346.65673828125, "logps/ref_chosen": -313.2212829589844, "logps/ref_rejected": -256.9848937988281, "logps/rejected": -341.84088134765625, "loss": 4.4431, "margin_dpo/margin_mean": 51.4205322265625, "margin_dpo/margin_std": 66.022705078125, "step": 385 }, { "epoch": 0.8083769633507853, "grad_norm": 48.37839126586914, "learning_rate": 5.4623689209832484e-08, "logits/chosen": 1.6142921447753906, "logits/rejected": 1.77769935131073, "logps/chosen": -376.63311767578125, "logps/ref_chosen": -342.4034423828125, "logps/ref_rejected": -319.1665954589844, "logps/rejected": -405.9571838378906, "loss": 4.1689, "margin_dpo/margin_mean": 52.56089401245117, "margin_dpo/margin_std": 56.50434875488281, "step": 386 }, { "epoch": 0.8104712041884817, "grad_norm": 37.86534118652344, "learning_rate": 5.3486726314303175e-08, "logits/chosen": 1.3724387884140015, "logits/rejected": 1.450311303138733, "logps/chosen": -249.56488037109375, "logps/ref_chosen": -209.16738891601562, "logps/ref_rejected": -225.61949157714844, "logps/rejected": -305.92340087890625, "loss": 4.3007, "margin_dpo/margin_mean": 39.906436920166016, "margin_dpo/margin_std": 77.46627807617188, "step": 387 }, { "epoch": 0.812565445026178, "grad_norm": 86.58211517333984, "learning_rate": 5.2360301829254745e-08, "logits/chosen": 1.78019118309021, "logits/rejected": 1.7659128904342651, "logps/chosen": -381.0487365722656, "logps/ref_chosen": -342.5128173828125, "logps/ref_rejected": -296.8653564453125, "logps/rejected": -390.75970458984375, "loss": 4.5827, "margin_dpo/margin_mean": 55.35844421386719, "margin_dpo/margin_std": 75.9105224609375, "step": 388 }, { "epoch": 0.8146596858638744, "grad_norm": 73.34844207763672, "learning_rate": 5.1244476161413806e-08, "logits/chosen": 1.6132086515426636, "logits/rejected": 1.4335089921951294, "logps/chosen": -354.9220886230469, "logps/ref_chosen": -336.53912353515625, "logps/ref_rejected": -237.36383056640625, "logps/rejected": -318.2511901855469, "loss": 4.2202, "margin_dpo/margin_mean": 62.504356384277344, "margin_dpo/margin_std": 76.5676498413086, "step": 389 }, { "epoch": 0.8167539267015707, "grad_norm": 72.32861328125, "learning_rate": 5.013930914912476e-08, "logits/chosen": 1.4496684074401855, "logits/rejected": 1.6118597984313965, "logps/chosen": -313.1777648925781, "logps/ref_chosen": -275.41680908203125, "logps/ref_rejected": -300.94329833984375, "logps/rejected": -397.3767395019531, "loss": 4.4604, "margin_dpo/margin_mean": 58.67253112792969, "margin_dpo/margin_std": 57.53920364379883, "step": 390 }, { "epoch": 0.818848167539267, "grad_norm": 38.00588607788086, "learning_rate": 4.904486005914027e-08, "logits/chosen": 1.421841025352478, "logits/rejected": 1.3273773193359375, "logps/chosen": -301.99920654296875, "logps/ref_chosen": -249.42276000976562, "logps/ref_rejected": -187.71572875976562, "logps/rejected": -270.2522888183594, "loss": 4.2838, "margin_dpo/margin_mean": 29.960115432739258, "margin_dpo/margin_std": 43.918495178222656, "step": 391 }, { "epoch": 0.8209424083769633, "grad_norm": 45.547096252441406, "learning_rate": 4.796118758344353e-08, "logits/chosen": 1.0780835151672363, "logits/rejected": 1.0823677778244019, "logps/chosen": -326.24456787109375, "logps/ref_chosen": -290.30438232421875, "logps/ref_rejected": -262.2787780761719, "logps/rejected": -357.97216796875, "loss": 3.8296, "margin_dpo/margin_mean": 59.75324249267578, "margin_dpo/margin_std": 60.47087860107422, "step": 392 }, { "epoch": 0.8230366492146597, "grad_norm": 53.403141021728516, "learning_rate": 4.688834983610082e-08, "logits/chosen": 1.3256309032440186, "logits/rejected": 1.1313904523849487, "logps/chosen": -356.47021484375, "logps/ref_chosen": -317.2633972167969, "logps/ref_rejected": -237.91380310058594, "logps/rejected": -326.9722900390625, "loss": 4.5399, "margin_dpo/margin_mean": 49.85166931152344, "margin_dpo/margin_std": 60.655738830566406, "step": 393 }, { "epoch": 0.8251308900523561, "grad_norm": 44.783817291259766, "learning_rate": 4.582640435014459e-08, "logits/chosen": 1.5566421747207642, "logits/rejected": 1.6724714040756226, "logps/chosen": -406.4788818359375, "logps/ref_chosen": -377.4843444824219, "logps/ref_rejected": -298.2265319824219, "logps/rejected": -381.8688659667969, "loss": 4.4044, "margin_dpo/margin_mean": 54.6478271484375, "margin_dpo/margin_std": 60.59336853027344, "step": 394 }, { "epoch": 0.8272251308900523, "grad_norm": 43.559539794921875, "learning_rate": 4.477540807448832e-08, "logits/chosen": 1.251692771911621, "logits/rejected": 1.2946186065673828, "logps/chosen": -310.1583557128906, "logps/ref_chosen": -281.3030090332031, "logps/ref_rejected": -272.98968505859375, "logps/rejected": -343.1010437011719, "loss": 4.3814, "margin_dpo/margin_mean": 41.256004333496094, "margin_dpo/margin_std": 55.48289108276367, "step": 395 }, { "epoch": 0.8293193717277487, "grad_norm": 173.74839782714844, "learning_rate": 4.373541737087263e-08, "logits/chosen": 1.4800870418548584, "logits/rejected": 1.4773489236831665, "logps/chosen": -338.620849609375, "logps/ref_chosen": -295.05364990234375, "logps/ref_rejected": -255.04061889648438, "logps/rejected": -347.4020080566406, "loss": 4.402, "margin_dpo/margin_mean": 48.79420471191406, "margin_dpo/margin_std": 69.13119506835938, "step": 396 }, { "epoch": 0.831413612565445, "grad_norm": 48.23088836669922, "learning_rate": 4.270648801084295e-08, "logits/chosen": 1.4665961265563965, "logits/rejected": 1.5797699689865112, "logps/chosen": -311.6304016113281, "logps/ref_chosen": -288.0824890136719, "logps/ref_rejected": -270.2839050292969, "logps/rejected": -332.85223388671875, "loss": 4.47, "margin_dpo/margin_mean": 39.020355224609375, "margin_dpo/margin_std": 48.824378967285156, "step": 397 }, { "epoch": 0.8335078534031414, "grad_norm": 92.45420837402344, "learning_rate": 4.168867517275806e-08, "logits/chosen": 1.2100859880447388, "logits/rejected": 1.4831223487854004, "logps/chosen": -297.94940185546875, "logps/ref_chosen": -252.48330688476562, "logps/ref_rejected": -273.87139892578125, "logps/rejected": -354.4050598144531, "loss": 4.9205, "margin_dpo/margin_mean": 35.06761932373047, "margin_dpo/margin_std": 80.69606018066406, "step": 398 }, { "epoch": 0.8356020942408376, "grad_norm": 82.78327941894531, "learning_rate": 4.0682033438831584e-08, "logits/chosen": 1.3988953828811646, "logits/rejected": 1.5033973455429077, "logps/chosen": -330.5037536621094, "logps/ref_chosen": -277.4305419921875, "logps/ref_rejected": -271.1197204589844, "logps/rejected": -356.5560607910156, "loss": 4.4852, "margin_dpo/margin_mean": 32.36311340332031, "margin_dpo/margin_std": 66.02288818359375, "step": 399 }, { "epoch": 0.837696335078534, "grad_norm": 37.495018005371094, "learning_rate": 3.968661679220467e-08, "logits/chosen": 1.2931967973709106, "logits/rejected": 1.240352988243103, "logps/chosen": -299.0211181640625, "logps/ref_chosen": -266.20025634765625, "logps/ref_rejected": -217.865966796875, "logps/rejected": -301.7166748046875, "loss": 4.2739, "margin_dpo/margin_mean": 51.02983093261719, "margin_dpo/margin_std": 63.865108489990234, "step": 400 }, { "epoch": 0.837696335078534, "eval_logits/chosen": 1.193334937095642, "eval_logits/rejected": 1.2366639375686646, "eval_logps/chosen": -316.0413513183594, "eval_logps/ref_chosen": -281.4588928222656, "eval_logps/ref_rejected": -261.84954833984375, "eval_logps/rejected": -345.1450500488281, "eval_loss": 0.5601758360862732, "eval_margin_dpo/margin_mean": 48.71305465698242, "eval_margin_dpo/margin_std": 68.15460205078125, "eval_runtime": 92.6746, "eval_samples_per_second": 21.581, "eval_steps_per_second": 1.349, "step": 400 }, { "epoch": 0.8397905759162304, "grad_norm": 42.00017547607422, "learning_rate": 3.8702478614051345e-08, "logits/chosen": 1.1790591478347778, "logits/rejected": 1.3527344465255737, "logps/chosen": -331.7237854003906, "logps/ref_chosen": -293.3692932128906, "logps/ref_rejected": -299.9609069824219, "logps/rejected": -400.61529541015625, "loss": 3.9949, "margin_dpo/margin_mean": 62.299903869628906, "margin_dpo/margin_std": 44.7177619934082, "step": 401 }, { "epoch": 0.8418848167539267, "grad_norm": 58.57191848754883, "learning_rate": 3.772967168071517e-08, "logits/chosen": 1.4228373765945435, "logits/rejected": 1.3410090208053589, "logps/chosen": -332.11737060546875, "logps/ref_chosen": -279.55889892578125, "logps/ref_rejected": -259.6942138671875, "logps/rejected": -331.76983642578125, "loss": 4.3682, "margin_dpo/margin_mean": 19.51715660095215, "margin_dpo/margin_std": 81.67544555664062, "step": 402 }, { "epoch": 0.8439790575916231, "grad_norm": 46.684837341308594, "learning_rate": 3.676824816087978e-08, "logits/chosen": 1.5173556804656982, "logits/rejected": 1.6118801832199097, "logps/chosen": -406.491455078125, "logps/ref_chosen": -372.2436828613281, "logps/ref_rejected": -285.5693359375, "logps/rejected": -390.88140869140625, "loss": 3.7148, "margin_dpo/margin_mean": 71.0643081665039, "margin_dpo/margin_std": 51.123374938964844, "step": 403 }, { "epoch": 0.8460732984293193, "grad_norm": 38.361358642578125, "learning_rate": 3.581825961277074e-08, "logits/chosen": 1.4560322761535645, "logits/rejected": 1.349548578262329, "logps/chosen": -372.926513671875, "logps/ref_chosen": -328.00860595703125, "logps/ref_rejected": -278.35009765625, "logps/rejected": -352.3278503417969, "loss": 4.4734, "margin_dpo/margin_mean": 29.05979347229004, "margin_dpo/margin_std": 75.3277816772461, "step": 404 }, { "epoch": 0.8481675392670157, "grad_norm": 50.2591438293457, "learning_rate": 3.487975698139084e-08, "logits/chosen": 1.3894712924957275, "logits/rejected": 1.5616024732589722, "logps/chosen": -270.36468505859375, "logps/ref_chosen": -228.44178771972656, "logps/ref_rejected": -244.33395385742188, "logps/rejected": -330.79925537109375, "loss": 4.2189, "margin_dpo/margin_mean": 44.542388916015625, "margin_dpo/margin_std": 59.48860168457031, "step": 405 }, { "epoch": 0.8502617801047121, "grad_norm": 63.63072204589844, "learning_rate": 3.3952790595787986e-08, "logits/chosen": 1.3200132846832275, "logits/rejected": 1.2781833410263062, "logps/chosen": -353.1100158691406, "logps/ref_chosen": -321.44598388671875, "logps/ref_rejected": -268.0445861816406, "logps/rejected": -349.9507751464844, "loss": 4.6261, "margin_dpo/margin_mean": 50.24217987060547, "margin_dpo/margin_std": 67.54312896728516, "step": 406 }, { "epoch": 0.8523560209424084, "grad_norm": 53.97494888305664, "learning_rate": 3.303741016635614e-08, "logits/chosen": 1.3076931238174438, "logits/rejected": 1.1266769170761108, "logps/chosen": -309.05596923828125, "logps/ref_chosen": -247.0522918701172, "logps/ref_rejected": -186.8645782470703, "logps/rejected": -270.8433837890625, "loss": 4.3225, "margin_dpo/margin_mean": 21.975143432617188, "margin_dpo/margin_std": 54.5270881652832, "step": 407 }, { "epoch": 0.8544502617801047, "grad_norm": 53.08890151977539, "learning_rate": 3.2133664782169944e-08, "logits/chosen": 1.1703412532806396, "logits/rejected": 1.2470866441726685, "logps/chosen": -253.00314331054688, "logps/ref_chosen": -213.5513458251953, "logps/ref_rejected": -267.9826354980469, "logps/rejected": -361.67535400390625, "loss": 4.2575, "margin_dpo/margin_mean": 54.240936279296875, "margin_dpo/margin_std": 45.82322692871094, "step": 408 }, { "epoch": 0.856544502617801, "grad_norm": 48.80266571044922, "learning_rate": 3.12416029083514e-08, "logits/chosen": 1.4997256994247437, "logits/rejected": 1.6531615257263184, "logps/chosen": -320.41851806640625, "logps/ref_chosen": -280.0785217285156, "logps/ref_rejected": -304.26910400390625, "logps/rejected": -392.4356994628906, "loss": 4.3055, "margin_dpo/margin_mean": 47.82660675048828, "margin_dpo/margin_std": 75.04901885986328, "step": 409 }, { "epoch": 0.8586387434554974, "grad_norm": 63.39336013793945, "learning_rate": 3.036127238347164e-08, "logits/chosen": 1.5989128351211548, "logits/rejected": 1.5641684532165527, "logps/chosen": -292.51336669921875, "logps/ref_chosen": -260.9378662109375, "logps/ref_rejected": -296.7695007324219, "logps/rejected": -368.9515380859375, "loss": 4.4542, "margin_dpo/margin_mean": 40.606529235839844, "margin_dpo/margin_std": 67.39288330078125, "step": 410 }, { "epoch": 0.8607329842931937, "grad_norm": 40.665950775146484, "learning_rate": 2.9492720416985e-08, "logits/chosen": 1.3025436401367188, "logits/rejected": 1.4121453762054443, "logps/chosen": -354.9593200683594, "logps/ref_chosen": -330.0611877441406, "logps/ref_rejected": -310.21368408203125, "logps/rejected": -398.6880187988281, "loss": 4.1892, "margin_dpo/margin_mean": 63.576229095458984, "margin_dpo/margin_std": 51.235904693603516, "step": 411 }, { "epoch": 0.86282722513089, "grad_norm": 55.58880615234375, "learning_rate": 2.863599358669755e-08, "logits/chosen": 1.1845945119857788, "logits/rejected": 1.3782296180725098, "logps/chosen": -288.62408447265625, "logps/ref_chosen": -254.76255798339844, "logps/ref_rejected": -315.16156005859375, "logps/rejected": -383.30535888671875, "loss": 4.4912, "margin_dpo/margin_mean": 34.28225326538086, "margin_dpo/margin_std": 53.833274841308594, "step": 412 }, { "epoch": 0.8649214659685864, "grad_norm": 81.09705352783203, "learning_rate": 2.7791137836269158e-08, "logits/chosen": 1.3580859899520874, "logits/rejected": 1.3456140756607056, "logps/chosen": -294.88671875, "logps/ref_chosen": -260.4962463378906, "logps/ref_rejected": -250.442626953125, "logps/rejected": -319.46563720703125, "loss": 4.5037, "margin_dpo/margin_mean": 34.632568359375, "margin_dpo/margin_std": 63.56281280517578, "step": 413 }, { "epoch": 0.8670157068062827, "grad_norm": 51.592769622802734, "learning_rate": 2.6958198472749717e-08, "logits/chosen": 1.6022746562957764, "logits/rejected": 1.5306450128555298, "logps/chosen": -428.2533264160156, "logps/ref_chosen": -391.94610595703125, "logps/ref_rejected": -287.8221130371094, "logps/rejected": -345.431396484375, "loss": 4.3943, "margin_dpo/margin_mean": 21.30208396911621, "margin_dpo/margin_std": 81.18660736083984, "step": 414 }, { "epoch": 0.8691099476439791, "grad_norm": 49.326297760009766, "learning_rate": 2.613722016414943e-08, "logits/chosen": 0.8005275726318359, "logits/rejected": 0.8713321685791016, "logps/chosen": -245.8915557861328, "logps/ref_chosen": -223.82870483398438, "logps/ref_rejected": -206.082763671875, "logps/rejected": -291.24188232421875, "loss": 4.0595, "margin_dpo/margin_mean": 63.09626007080078, "margin_dpo/margin_std": 63.13758087158203, "step": 415 }, { "epoch": 0.8712041884816754, "grad_norm": 48.53297424316406, "learning_rate": 2.5328246937043525e-08, "logits/chosen": 1.264346718788147, "logits/rejected": 1.331539273262024, "logps/chosen": -316.0765686035156, "logps/ref_chosen": -303.91656494140625, "logps/ref_rejected": -267.0210266113281, "logps/rejected": -343.7052917480469, "loss": 4.154, "margin_dpo/margin_mean": 64.5242919921875, "margin_dpo/margin_std": 58.65679168701172, "step": 416 }, { "epoch": 0.8732984293193717, "grad_norm": 43.959312438964844, "learning_rate": 2.4531322174210973e-08, "logits/chosen": 1.207326054573059, "logits/rejected": 1.3471075296401978, "logps/chosen": -298.7107849121094, "logps/ref_chosen": -249.8788604736328, "logps/ref_rejected": -210.48683166503906, "logps/rejected": -290.503662109375, "loss": 4.4556, "margin_dpo/margin_mean": 31.184890747070312, "margin_dpo/margin_std": 68.83541107177734, "step": 417 }, { "epoch": 0.875392670157068, "grad_norm": 57.87160110473633, "learning_rate": 2.3746488612308295e-08, "logits/chosen": 1.1209101676940918, "logits/rejected": 1.000208854675293, "logps/chosen": -411.3262023925781, "logps/ref_chosen": -355.4482421875, "logps/ref_rejected": -344.9490661621094, "logps/rejected": -433.7691650390625, "loss": 4.5932, "margin_dpo/margin_mean": 32.94209671020508, "margin_dpo/margin_std": 72.00234985351562, "step": 418 }, { "epoch": 0.8774869109947644, "grad_norm": 55.62238311767578, "learning_rate": 2.297378833957761e-08, "logits/chosen": 1.6920902729034424, "logits/rejected": 1.618727445602417, "logps/chosen": -433.79205322265625, "logps/ref_chosen": -381.6947021484375, "logps/ref_rejected": -322.436767578125, "logps/rejected": -426.5209045410156, "loss": 4.2079, "margin_dpo/margin_mean": 51.986717224121094, "margin_dpo/margin_std": 64.11946868896484, "step": 419 }, { "epoch": 0.8795811518324608, "grad_norm": 42.748905181884766, "learning_rate": 2.2213262793589482e-08, "logits/chosen": 1.0813815593719482, "logits/rejected": 1.1282932758331299, "logps/chosen": -290.3770446777344, "logps/ref_chosen": -255.09539794921875, "logps/ref_rejected": -254.58306884765625, "logps/rejected": -328.9988708496094, "loss": 4.2003, "margin_dpo/margin_mean": 39.13414001464844, "margin_dpo/margin_std": 68.49993896484375, "step": 420 }, { "epoch": 0.881675392670157, "grad_norm": 62.450592041015625, "learning_rate": 2.1464952759020856e-08, "logits/chosen": 1.2170071601867676, "logits/rejected": 1.0278328657150269, "logps/chosen": -303.1274719238281, "logps/ref_chosen": -280.7524719238281, "logps/ref_rejected": -187.39218139648438, "logps/rejected": -256.837646484375, "loss": 4.0246, "margin_dpo/margin_mean": 47.07042694091797, "margin_dpo/margin_std": 56.84251403808594, "step": 421 }, { "epoch": 0.8837696335078534, "grad_norm": 56.18370056152344, "learning_rate": 2.07288983654679e-08, "logits/chosen": 1.3475362062454224, "logits/rejected": 1.4008029699325562, "logps/chosen": -307.1512145996094, "logps/ref_chosen": -278.18890380859375, "logps/ref_rejected": -250.71591186523438, "logps/rejected": -318.3471984863281, "loss": 4.6777, "margin_dpo/margin_mean": 38.6689453125, "margin_dpo/margin_std": 72.85610961914062, "step": 422 }, { "epoch": 0.8858638743455497, "grad_norm": 64.5829849243164, "learning_rate": 2.0005139085293942e-08, "logits/chosen": 1.3403944969177246, "logits/rejected": 1.4666098356246948, "logps/chosen": -301.6260986328125, "logps/ref_chosen": -281.21820068359375, "logps/ref_rejected": -296.73907470703125, "logps/rejected": -367.79888916015625, "loss": 4.1139, "margin_dpo/margin_mean": 50.65192413330078, "margin_dpo/margin_std": 72.86007690429688, "step": 423 }, { "epoch": 0.8879581151832461, "grad_norm": 43.40376281738281, "learning_rate": 1.9293713731512673e-08, "logits/chosen": 1.1475857496261597, "logits/rejected": 1.005727767944336, "logps/chosen": -361.1445617675781, "logps/ref_chosen": -339.32550048828125, "logps/ref_rejected": -274.3222351074219, "logps/rejected": -354.5352478027344, "loss": 4.1116, "margin_dpo/margin_mean": 58.39393615722656, "margin_dpo/margin_std": 50.71818542480469, "step": 424 }, { "epoch": 0.8900523560209425, "grad_norm": 46.304969787597656, "learning_rate": 1.8594660455706763e-08, "logits/chosen": 1.2751210927963257, "logits/rejected": 1.5024828910827637, "logps/chosen": -283.94232177734375, "logps/ref_chosen": -254.7490997314453, "logps/ref_rejected": -266.1432800292969, "logps/rejected": -349.154541015625, "loss": 4.3925, "margin_dpo/margin_mean": 53.81803894042969, "margin_dpo/margin_std": 49.791236877441406, "step": 425 }, { "epoch": 0.8921465968586387, "grad_norm": 41.32174301147461, "learning_rate": 1.7908016745981856e-08, "logits/chosen": 0.896172285079956, "logits/rejected": 1.0513912439346313, "logps/chosen": -284.55767822265625, "logps/ref_chosen": -264.97216796875, "logps/ref_rejected": -244.05935668945312, "logps/rejected": -316.9269714355469, "loss": 4.2069, "margin_dpo/margin_mean": 53.282081604003906, "margin_dpo/margin_std": 63.170196533203125, "step": 426 }, { "epoch": 0.8942408376963351, "grad_norm": 42.068763732910156, "learning_rate": 1.7233819424956247e-08, "logits/chosen": 1.2293376922607422, "logits/rejected": 1.2114195823669434, "logps/chosen": -329.5334167480469, "logps/ref_chosen": -301.6879577636719, "logps/ref_rejected": -265.6783752441406, "logps/rejected": -380.6156311035156, "loss": 3.7144, "margin_dpo/margin_mean": 87.09181213378906, "margin_dpo/margin_std": 52.67485046386719, "step": 427 }, { "epoch": 0.8963350785340314, "grad_norm": 55.713130950927734, "learning_rate": 1.6572104647786245e-08, "logits/chosen": 1.5100287199020386, "logits/rejected": 1.70786714553833, "logps/chosen": -415.50616455078125, "logps/ref_chosen": -376.43316650390625, "logps/ref_rejected": -334.24676513671875, "logps/rejected": -440.6484375, "loss": 3.7971, "margin_dpo/margin_mean": 67.32867431640625, "margin_dpo/margin_std": 66.57231140136719, "step": 428 }, { "epoch": 0.8984293193717278, "grad_norm": 70.71985626220703, "learning_rate": 1.5922907900227017e-08, "logits/chosen": 1.4469355344772339, "logits/rejected": 1.4792401790618896, "logps/chosen": -247.02073669433594, "logps/ref_chosen": -218.89503479003906, "logps/ref_rejected": -236.546630859375, "logps/rejected": -322.55419921875, "loss": 4.4589, "margin_dpo/margin_mean": 57.881866455078125, "margin_dpo/margin_std": 76.20121002197266, "step": 429 }, { "epoch": 0.900523560209424, "grad_norm": 92.4472427368164, "learning_rate": 1.5286263996730026e-08, "logits/chosen": 1.3988643884658813, "logits/rejected": 1.5516958236694336, "logps/chosen": -318.7999267578125, "logps/ref_chosen": -281.9652099609375, "logps/ref_rejected": -266.40411376953125, "logps/rejected": -329.5230712890625, "loss": 4.4918, "margin_dpo/margin_mean": 26.284305572509766, "margin_dpo/margin_std": 64.46965789794922, "step": 430 }, { "epoch": 0.9026178010471204, "grad_norm": 42.31532287597656, "learning_rate": 1.4662207078575684e-08, "logits/chosen": 1.7694166898727417, "logits/rejected": 1.8366968631744385, "logps/chosen": -332.827880859375, "logps/ref_chosen": -286.10888671875, "logps/ref_rejected": -274.0017395019531, "logps/rejected": -361.9835510253906, "loss": 4.5107, "margin_dpo/margin_mean": 41.2628173828125, "margin_dpo/margin_std": 59.571441650390625, "step": 431 }, { "epoch": 0.9047120418848168, "grad_norm": 37.992881774902344, "learning_rate": 1.40507706120426e-08, "logits/chosen": 1.4250589609146118, "logits/rejected": 1.6473501920700073, "logps/chosen": -348.860595703125, "logps/ref_chosen": -316.9443359375, "logps/ref_rejected": -358.3585510253906, "logps/rejected": -438.4804382324219, "loss": 4.1495, "margin_dpo/margin_mean": 48.20561981201172, "margin_dpo/margin_std": 72.83356475830078, "step": 432 }, { "epoch": 0.9068062827225131, "grad_norm": 56.42523193359375, "learning_rate": 1.345198738661285e-08, "logits/chosen": 1.288971185684204, "logits/rejected": 1.2402310371398926, "logps/chosen": -312.3082275390625, "logps/ref_chosen": -282.2297668457031, "logps/ref_rejected": -258.5012512207031, "logps/rejected": -328.95513916015625, "loss": 4.5137, "margin_dpo/margin_mean": 40.375389099121094, "margin_dpo/margin_std": 55.306190490722656, "step": 433 }, { "epoch": 0.9089005235602095, "grad_norm": 34.96892547607422, "learning_rate": 1.2865889513213628e-08, "logits/chosen": 1.7188708782196045, "logits/rejected": 1.7233338356018066, "logps/chosen": -339.9571533203125, "logps/ref_chosen": -313.5975646972656, "logps/ref_rejected": -293.44097900390625, "logps/rejected": -374.973876953125, "loss": 4.2523, "margin_dpo/margin_mean": 55.173301696777344, "margin_dpo/margin_std": 61.733543395996094, "step": 434 }, { "epoch": 0.9109947643979057, "grad_norm": 84.76171875, "learning_rate": 1.2292508422495157e-08, "logits/chosen": 1.584928035736084, "logits/rejected": 1.718322515487671, "logps/chosen": -211.07582092285156, "logps/ref_chosen": -191.58889770507812, "logps/ref_rejected": -206.38133239746094, "logps/rejected": -269.05389404296875, "loss": 4.3653, "margin_dpo/margin_mean": 43.18561553955078, "margin_dpo/margin_std": 55.161441802978516, "step": 435 }, { "epoch": 0.9130890052356021, "grad_norm": 57.35254669189453, "learning_rate": 1.1731874863145142e-08, "logits/chosen": 1.098213791847229, "logits/rejected": 1.1476788520812988, "logps/chosen": -349.2528076171875, "logps/ref_chosen": -329.4399719238281, "logps/ref_rejected": -310.59783935546875, "logps/rejected": -374.02459716796875, "loss": 4.8351, "margin_dpo/margin_mean": 43.61388397216797, "margin_dpo/margin_std": 60.17463302612305, "step": 436 }, { "epoch": 0.9151832460732985, "grad_norm": 33.499061584472656, "learning_rate": 1.118401890024001e-08, "logits/chosen": 1.4528710842132568, "logits/rejected": 1.5862656831741333, "logps/chosen": -284.3026123046875, "logps/ref_chosen": -245.99761962890625, "logps/ref_rejected": -340.40283203125, "logps/rejected": -442.78497314453125, "loss": 4.1697, "margin_dpo/margin_mean": 64.07716369628906, "margin_dpo/margin_std": 62.12276077270508, "step": 437 }, { "epoch": 0.9172774869109948, "grad_norm": 139.29515075683594, "learning_rate": 1.06489699136324e-08, "logits/chosen": 1.1451233625411987, "logits/rejected": 1.288071870803833, "logps/chosen": -289.2056884765625, "logps/ref_chosen": -264.5708923339844, "logps/ref_rejected": -263.8647155761719, "logps/rejected": -315.4420471191406, "loss": 5.1452, "margin_dpo/margin_mean": 26.942535400390625, "margin_dpo/margin_std": 64.0194320678711, "step": 438 }, { "epoch": 0.9193717277486911, "grad_norm": 45.723548889160156, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 1.449920892715454, "logits/rejected": 1.4370176792144775, "logps/chosen": -275.9388427734375, "logps/ref_chosen": -236.2272491455078, "logps/ref_rejected": -260.26531982421875, "logps/rejected": -337.4900207519531, "loss": 4.4005, "margin_dpo/margin_mean": 37.513099670410156, "margin_dpo/margin_std": 69.3291244506836, "step": 439 }, { "epoch": 0.9214659685863874, "grad_norm": 51.19523620605469, "learning_rate": 9.617406953185136e-09, "logits/chosen": 1.4269630908966064, "logits/rejected": 1.2528316974639893, "logps/chosen": -450.7151794433594, "logps/ref_chosen": -402.7833557128906, "logps/ref_rejected": -285.3439636230469, "logps/rejected": -360.1721496582031, "loss": 4.77, "margin_dpo/margin_mean": 26.89635467529297, "margin_dpo/margin_std": 57.31180191040039, "step": 440 }, { "epoch": 0.9235602094240838, "grad_norm": 45.438751220703125, "learning_rate": 9.12094829893642e-09, "logits/chosen": 1.4254412651062012, "logits/rejected": 1.6387895345687866, "logps/chosen": -382.6518859863281, "logps/ref_chosen": -348.18212890625, "logps/ref_rejected": -375.537353515625, "logps/rejected": -471.38824462890625, "loss": 4.0819, "margin_dpo/margin_mean": 61.381141662597656, "margin_dpo/margin_std": 54.50360870361328, "step": 441 }, { "epoch": 0.9256544502617801, "grad_norm": 53.282371520996094, "learning_rate": 8.637407257200496e-09, "logits/chosen": 1.2402985095977783, "logits/rejected": 1.3638098239898682, "logps/chosen": -275.3695373535156, "logps/ref_chosen": -232.696044921875, "logps/ref_rejected": -204.60752868652344, "logps/rejected": -279.0164489746094, "loss": 4.6929, "margin_dpo/margin_mean": 31.735416412353516, "margin_dpo/margin_std": 70.92927551269531, "step": 442 }, { "epoch": 0.9277486910994764, "grad_norm": 41.01181411743164, "learning_rate": 8.166809758815895e-09, "logits/chosen": 1.3078192472457886, "logits/rejected": 1.3253602981567383, "logps/chosen": -330.3334655761719, "logps/ref_chosen": -275.13873291015625, "logps/ref_rejected": -264.1988830566406, "logps/rejected": -356.9568176269531, "loss": 4.5058, "margin_dpo/margin_mean": 37.56320571899414, "margin_dpo/margin_std": 54.74559783935547, "step": 443 }, { "epoch": 0.9298429319371728, "grad_norm": 51.980506896972656, "learning_rate": 7.709181040498253e-09, "logits/chosen": 0.7922985553741455, "logits/rejected": 0.9772998690605164, "logps/chosen": -340.13336181640625, "logps/ref_chosen": -305.7708740234375, "logps/ref_rejected": -292.68731689453125, "logps/rejected": -366.3361511230469, "loss": 4.3732, "margin_dpo/margin_mean": 39.28638458251953, "margin_dpo/margin_std": 50.5058479309082, "step": 444 }, { "epoch": 0.9319371727748691, "grad_norm": 50.610992431640625, "learning_rate": 7.2645456434869965e-09, "logits/chosen": 1.336693286895752, "logits/rejected": 1.4031049013137817, "logps/chosen": -257.5855712890625, "logps/ref_chosen": -240.496337890625, "logps/ref_rejected": -226.0730743408203, "logps/rejected": -288.43756103515625, "loss": 4.6998, "margin_dpo/margin_mean": 45.27527618408203, "margin_dpo/margin_std": 74.52619934082031, "step": 445 }, { "epoch": 0.9340314136125655, "grad_norm": 62.949615478515625, "learning_rate": 6.832927412229017e-09, "logits/chosen": 1.3089869022369385, "logits/rejected": 1.283385992050171, "logps/chosen": -267.8168029785156, "logps/ref_chosen": -244.18284606933594, "logps/ref_rejected": -211.3776397705078, "logps/rejected": -278.58465576171875, "loss": 4.3051, "margin_dpo/margin_mean": 43.57305145263672, "margin_dpo/margin_std": 57.471893310546875, "step": 446 }, { "epoch": 0.9361256544502617, "grad_norm": 52.591312408447266, "learning_rate": 6.414349493100129e-09, "logits/chosen": 1.3533639907836914, "logits/rejected": 1.4261730909347534, "logps/chosen": -258.5954895019531, "logps/ref_chosen": -237.29592895507812, "logps/ref_rejected": -246.57034301757812, "logps/rejected": -316.0662841796875, "loss": 4.0452, "margin_dpo/margin_mean": 48.1964225769043, "margin_dpo/margin_std": 65.75294494628906, "step": 447 }, { "epoch": 0.9382198952879581, "grad_norm": 46.954933166503906, "learning_rate": 6.0088343331638756e-09, "logits/chosen": 1.6584538221359253, "logits/rejected": 1.7075550556182861, "logps/chosen": -336.6059265136719, "logps/ref_chosen": -308.6024475097656, "logps/ref_rejected": -277.87921142578125, "logps/rejected": -346.0470275878906, "loss": 4.3113, "margin_dpo/margin_mean": 40.164310455322266, "margin_dpo/margin_std": 57.308650970458984, "step": 448 }, { "epoch": 0.9403141361256544, "grad_norm": 54.72856140136719, "learning_rate": 5.616403678967624e-09, "logits/chosen": 1.8703218698501587, "logits/rejected": 1.5918076038360596, "logps/chosen": -386.6697998046875, "logps/ref_chosen": -376.94281005859375, "logps/ref_rejected": -267.11236572265625, "logps/rejected": -342.6089782714844, "loss": 4.1562, "margin_dpo/margin_mean": 65.76960754394531, "margin_dpo/margin_std": 61.83976745605469, "step": 449 }, { "epoch": 0.9424083769633508, "grad_norm": 65.85668182373047, "learning_rate": 5.2370785753763356e-09, "logits/chosen": 1.6628509759902954, "logits/rejected": 1.4311879873275757, "logps/chosen": -327.4361572265625, "logps/ref_chosen": -312.619384765625, "logps/ref_rejected": -215.62857055664062, "logps/rejected": -281.2243957519531, "loss": 4.5566, "margin_dpo/margin_mean": 50.77903366088867, "margin_dpo/margin_std": 56.17271041870117, "step": 450 }, { "epoch": 0.9445026178010472, "grad_norm": 55.37830352783203, "learning_rate": 4.8708793644441086e-09, "logits/chosen": 1.4510207176208496, "logits/rejected": 1.5802251100540161, "logps/chosen": -330.12811279296875, "logps/ref_chosen": -296.6983947753906, "logps/ref_rejected": -312.4747619628906, "logps/rejected": -392.5194396972656, "loss": 4.3406, "margin_dpo/margin_mean": 46.6149787902832, "margin_dpo/margin_std": 63.69156265258789, "step": 451 }, { "epoch": 0.9465968586387434, "grad_norm": 50.63416290283203, "learning_rate": 4.517825684323323e-09, "logits/chosen": 1.3996615409851074, "logits/rejected": 1.581308364868164, "logps/chosen": -329.9542541503906, "logps/ref_chosen": -294.50958251953125, "logps/ref_rejected": -295.56097412109375, "logps/rejected": -362.0646667480469, "loss": 4.3711, "margin_dpo/margin_mean": 31.059064865112305, "margin_dpo/margin_std": 63.657875061035156, "step": 452 }, { "epoch": 0.9486910994764398, "grad_norm": 44.21456527709961, "learning_rate": 4.1779364682113794e-09, "logits/chosen": 1.4765185117721558, "logits/rejected": 1.6153349876403809, "logps/chosen": -341.7298278808594, "logps/ref_chosen": -308.21917724609375, "logps/ref_rejected": -328.357421875, "logps/rejected": -414.666748046875, "loss": 4.1702, "margin_dpo/margin_mean": 52.798675537109375, "margin_dpo/margin_std": 59.44575881958008, "step": 453 }, { "epoch": 0.9507853403141361, "grad_norm": 36.86773681640625, "learning_rate": 3.851229943335393e-09, "logits/chosen": 1.8053613901138306, "logits/rejected": 1.7590644359588623, "logps/chosen": -365.978759765625, "logps/ref_chosen": -332.5453796386719, "logps/ref_rejected": -267.130615234375, "logps/rejected": -339.3133544921875, "loss": 4.3145, "margin_dpo/margin_mean": 38.749393463134766, "margin_dpo/margin_std": 65.05109405517578, "step": 454 }, { "epoch": 0.9528795811518325, "grad_norm": 49.20968246459961, "learning_rate": 3.5377236299748147e-09, "logits/chosen": 1.3919376134872437, "logits/rejected": 1.5252500772476196, "logps/chosen": -261.4302978515625, "logps/ref_chosen": -240.13719177246094, "logps/ref_rejected": -233.28451538085938, "logps/rejected": -313.81097412109375, "loss": 4.5531, "margin_dpo/margin_mean": 59.233402252197266, "margin_dpo/margin_std": 58.3806037902832, "step": 455 }, { "epoch": 0.9549738219895288, "grad_norm": 42.70205307006836, "learning_rate": 3.2374343405217884e-09, "logits/chosen": 1.635735034942627, "logits/rejected": 1.8064732551574707, "logps/chosen": -370.0901184082031, "logps/ref_chosen": -334.82666015625, "logps/ref_rejected": -313.20404052734375, "logps/rejected": -386.83837890625, "loss": 4.1309, "margin_dpo/margin_mean": 38.370887756347656, "margin_dpo/margin_std": 76.44358825683594, "step": 456 }, { "epoch": 0.9570680628272251, "grad_norm": 44.188438415527344, "learning_rate": 2.9503781785795713e-09, "logits/chosen": 1.4283052682876587, "logits/rejected": 1.370866298675537, "logps/chosen": -319.96136474609375, "logps/ref_chosen": -299.60650634765625, "logps/ref_rejected": -263.5287780761719, "logps/rejected": -360.9966125488281, "loss": 4.2728, "margin_dpo/margin_mean": 77.11296844482422, "margin_dpo/margin_std": 65.07032775878906, "step": 457 }, { "epoch": 0.9591623036649215, "grad_norm": 53.56594467163086, "learning_rate": 2.6765705380989432e-09, "logits/chosen": 1.6065071821212769, "logits/rejected": 1.4925872087478638, "logps/chosen": -309.9053955078125, "logps/ref_chosen": -272.7044372558594, "logps/ref_rejected": -235.6636962890625, "logps/rejected": -300.68353271484375, "loss": 4.3749, "margin_dpo/margin_mean": 27.818876266479492, "margin_dpo/margin_std": 55.92146301269531, "step": 458 }, { "epoch": 0.9612565445026178, "grad_norm": 38.86277389526367, "learning_rate": 2.416026102552732e-09, "logits/chosen": 1.428498387336731, "logits/rejected": 1.3140032291412354, "logps/chosen": -328.8189697265625, "logps/ref_chosen": -280.32196044921875, "logps/ref_rejected": -217.7216339111328, "logps/rejected": -297.50244140625, "loss": 4.8445, "margin_dpo/margin_mean": 31.283798217773438, "margin_dpo/margin_std": 54.594871520996094, "step": 459 }, { "epoch": 0.9633507853403142, "grad_norm": 91.57032012939453, "learning_rate": 2.168758844148272e-09, "logits/chosen": 1.443167805671692, "logits/rejected": 1.4732171297073364, "logps/chosen": -426.33984375, "logps/ref_chosen": -387.5949401855469, "logps/ref_rejected": -289.61505126953125, "logps/rejected": -362.8568115234375, "loss": 4.8551, "margin_dpo/margin_mean": 34.49686050415039, "margin_dpo/margin_std": 72.81037902832031, "step": 460 }, { "epoch": 0.9654450261780104, "grad_norm": 57.414451599121094, "learning_rate": 1.9347820230782295e-09, "logits/chosen": 1.5660542249679565, "logits/rejected": 1.520784616470337, "logps/chosen": -268.8662414550781, "logps/ref_chosen": -247.67520141601562, "logps/ref_rejected": -227.18458557128906, "logps/rejected": -311.7144775390625, "loss": 4.5036, "margin_dpo/margin_mean": 63.33882141113281, "margin_dpo/margin_std": 73.68379974365234, "step": 461 }, { "epoch": 0.9675392670157068, "grad_norm": 49.87493896484375, "learning_rate": 1.7141081868094209e-09, "logits/chosen": 1.4014174938201904, "logits/rejected": 1.3472931385040283, "logps/chosen": -378.84088134765625, "logps/ref_chosen": -350.8253173828125, "logps/ref_rejected": -262.5743713378906, "logps/rejected": -355.7335205078125, "loss": 4.116, "margin_dpo/margin_mean": 65.14358520507812, "margin_dpo/margin_std": 64.9459457397461, "step": 462 }, { "epoch": 0.9696335078534032, "grad_norm": 84.11515808105469, "learning_rate": 1.5067491694100153e-09, "logits/chosen": 1.2984429597854614, "logits/rejected": 1.3639535903930664, "logps/chosen": -271.6904602050781, "logps/ref_chosen": -229.31683349609375, "logps/ref_rejected": -227.2904815673828, "logps/rejected": -293.2787170410156, "loss": 4.4943, "margin_dpo/margin_mean": 23.614595413208008, "margin_dpo/margin_std": 70.91625213623047, "step": 463 }, { "epoch": 0.9717277486910995, "grad_norm": 51.687442779541016, "learning_rate": 1.3127160909147672e-09, "logits/chosen": 1.7068381309509277, "logits/rejected": 1.6888086795806885, "logps/chosen": -248.65789794921875, "logps/ref_chosen": -226.55776977539062, "logps/ref_rejected": -208.3471221923828, "logps/rejected": -285.44244384765625, "loss": 4.4572, "margin_dpo/margin_mean": 54.995182037353516, "margin_dpo/margin_std": 61.694210052490234, "step": 464 }, { "epoch": 0.9738219895287958, "grad_norm": 37.04083251953125, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 1.1953487396240234, "logits/rejected": 1.2031378746032715, "logps/chosen": -311.39080810546875, "logps/ref_chosen": -287.9401550292969, "logps/ref_rejected": -305.5926818847656, "logps/rejected": -382.921630859375, "loss": 4.0925, "margin_dpo/margin_mean": 53.878334045410156, "margin_dpo/margin_std": 47.030704498291016, "step": 465 }, { "epoch": 0.9759162303664921, "grad_norm": 40.89207458496094, "learning_rate": 9.64668657069706e-10, "logits/chosen": 1.327850580215454, "logits/rejected": 1.417038917541504, "logps/chosen": -233.49664306640625, "logps/ref_chosen": -224.32131958007812, "logps/ref_rejected": -205.79212951660156, "logps/rejected": -285.48529052734375, "loss": 3.8998, "margin_dpo/margin_mean": 70.51785278320312, "margin_dpo/margin_std": 54.0175895690918, "step": 466 }, { "epoch": 0.9780104712041885, "grad_norm": 66.9071273803711, "learning_rate": 8.106729664475176e-10, "logits/chosen": 0.6929614543914795, "logits/rejected": 0.9394963979721069, "logps/chosen": -260.9166259765625, "logps/ref_chosen": -227.0828094482422, "logps/ref_rejected": -285.5081481933594, "logps/rejected": -364.97393798828125, "loss": 4.4527, "margin_dpo/margin_mean": 45.63193893432617, "margin_dpo/margin_std": 61.40578079223633, "step": 467 }, { "epoch": 0.9801047120418848, "grad_norm": 43.40538787841797, "learning_rate": 6.700405431837585e-10, "logits/chosen": 1.376441478729248, "logits/rejected": 1.1604515314102173, "logps/chosen": -349.08599853515625, "logps/ref_chosen": -314.6758117675781, "logps/ref_rejected": -290.43023681640625, "logps/rejected": -357.7455749511719, "loss": 4.5586, "margin_dpo/margin_mean": 32.90521240234375, "margin_dpo/margin_std": 61.592552185058594, "step": 468 }, { "epoch": 0.9821989528795811, "grad_norm": 48.64304733276367, "learning_rate": 5.427789289685347e-10, "logits/chosen": 1.2290468215942383, "logits/rejected": 1.205275058746338, "logps/chosen": -290.6042175292969, "logps/ref_chosen": -269.7442321777344, "logps/ref_rejected": -237.1964874267578, "logps/rejected": -307.8065185546875, "loss": 4.2722, "margin_dpo/margin_mean": 49.750064849853516, "margin_dpo/margin_std": 63.16607666015625, "step": 469 }, { "epoch": 0.9842931937172775, "grad_norm": 48.205284118652344, "learning_rate": 4.288949484559934e-10, "logits/chosen": 0.8344168066978455, "logits/rejected": 0.8336673378944397, "logps/chosen": -342.15057373046875, "logps/ref_chosen": -326.9454650878906, "logps/ref_rejected": -283.81768798828125, "logps/rejected": -360.6470031738281, "loss": 4.0516, "margin_dpo/margin_mean": 61.624237060546875, "margin_dpo/margin_std": 61.775753021240234, "step": 470 }, { "epoch": 0.9863874345549738, "grad_norm": 58.954227447509766, "learning_rate": 3.2839470889836627e-10, "logits/chosen": 1.4265129566192627, "logits/rejected": 1.394487977027893, "logps/chosen": -337.0513000488281, "logps/ref_chosen": -309.4604797363281, "logps/ref_rejected": -292.0646057128906, "logps/rejected": -356.0199890136719, "loss": 4.5083, "margin_dpo/margin_mean": 36.36451721191406, "margin_dpo/margin_std": 59.396270751953125, "step": 471 }, { "epoch": 0.9884816753926702, "grad_norm": 61.63766098022461, "learning_rate": 2.412835998185092e-10, "logits/chosen": 1.1254520416259766, "logits/rejected": 1.199691891670227, "logps/chosen": -210.82077026367188, "logps/ref_chosen": -185.00701904296875, "logps/ref_rejected": -208.1576385498047, "logps/rejected": -272.89483642578125, "loss": 4.1475, "margin_dpo/margin_mean": 38.92341613769531, "margin_dpo/margin_std": 71.17210388183594, "step": 472 }, { "epoch": 0.9905759162303664, "grad_norm": 56.59166717529297, "learning_rate": 1.6756629272085544e-10, "logits/chosen": 1.3166826963424683, "logits/rejected": 1.1217594146728516, "logps/chosen": -350.744140625, "logps/ref_chosen": -330.0291442871094, "logps/ref_rejected": -224.92051696777344, "logps/rejected": -302.14898681640625, "loss": 4.1337, "margin_dpo/margin_mean": 56.51344299316406, "margin_dpo/margin_std": 51.90734100341797, "step": 473 }, { "epoch": 0.9926701570680628, "grad_norm": 59.311004638671875, "learning_rate": 1.072467408408384e-10, "logits/chosen": 1.2557741403579712, "logits/rejected": 1.3725204467773438, "logps/chosen": -355.0524597167969, "logps/ref_chosen": -315.9046936035156, "logps/ref_rejected": -340.7234191894531, "logps/rejected": -411.844482421875, "loss": 4.5535, "margin_dpo/margin_mean": 31.97334861755371, "margin_dpo/margin_std": 64.91607666015625, "step": 474 }, { "epoch": 0.9947643979057592, "grad_norm": 54.42702102661133, "learning_rate": 6.032817893297793e-11, "logits/chosen": 0.872796356678009, "logits/rejected": 0.9358187913894653, "logps/chosen": -227.73483276367188, "logps/ref_chosen": -202.84310913085938, "logps/ref_rejected": -175.70704650878906, "logps/rejected": -250.8938751220703, "loss": 4.4903, "margin_dpo/margin_mean": 50.29510498046875, "margin_dpo/margin_std": 74.1104965209961, "step": 475 }, { "epoch": 0.9968586387434555, "grad_norm": 50.788307189941406, "learning_rate": 2.6813123097352287e-11, "logits/chosen": 1.07370924949646, "logits/rejected": 1.213090181350708, "logps/chosen": -292.42132568359375, "logps/ref_chosen": -276.843505859375, "logps/ref_rejected": -309.07757568359375, "logps/rejected": -368.9504089355469, "loss": 4.2622, "margin_dpo/margin_mean": 44.29502868652344, "margin_dpo/margin_std": 54.164615631103516, "step": 476 }, { "epoch": 0.9989528795811519, "grad_norm": 40.021400451660156, "learning_rate": 6.7033706447061635e-12, "logits/chosen": 0.8247851729393005, "logits/rejected": 0.9125658869743347, "logps/chosen": -297.199951171875, "logps/ref_chosen": -262.76971435546875, "logps/ref_rejected": -272.2499694824219, "logps/rejected": -356.56658935546875, "loss": 4.216, "margin_dpo/margin_mean": 49.88636779785156, "margin_dpo/margin_std": 77.48868560791016, "step": 477 }, { "epoch": 0.9989528795811519, "step": 477, "total_flos": 0.0, "train_loss": 4.779813265150698, "train_runtime": 7822.2821, "train_samples_per_second": 7.815, "train_steps_per_second": 0.061 } ], "logging_steps": 1, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }