{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0029411764705882353, "grad_norm": 23.717201232910156, "learning_rate": 0.0, "logits/chosen": -0.4739703834056854, "logits/rejected": -0.44689586758613586, "logps/chosen": -72.44038391113281, "logps/ref_chosen": -72.42105865478516, "logps/ref_rejected": -71.02362823486328, "logps/rejected": -70.95858764648438, "loss": 0.6938, "margin_dpo/margin_mean": -0.0843656063079834, "margin_dpo/margin_std": 0.20181308686733246, "step": 1 }, { "epoch": 0.014705882352941176, "grad_norm": 24.15522003173828, "learning_rate": 5.88235294117647e-08, "logits/chosen": -0.5054930448532104, "logits/rejected": -0.4999650716781616, "logps/chosen": -76.55665588378906, "logps/ref_chosen": -76.4837875366211, "logps/ref_rejected": -71.7144775390625, "logps/rejected": -71.69610595703125, "loss": 0.6943, "margin_dpo/margin_mean": -0.0912436842918396, "margin_dpo/margin_std": 0.36911237239837646, "step": 5 }, { "epoch": 0.029411764705882353, "grad_norm": 23.068735122680664, "learning_rate": 1.3235294117647057e-07, "logits/chosen": -0.5124594569206238, "logits/rejected": -0.49317699670791626, "logps/chosen": -76.17481994628906, "logps/ref_chosen": -76.15269470214844, "logps/ref_rejected": -73.87877655029297, "logps/rejected": -73.90404510498047, "loss": 0.6933, "margin_dpo/margin_mean": 0.0031534195877611637, "margin_dpo/margin_std": 0.3234597444534302, "step": 10 }, { "epoch": 0.04411764705882353, "grad_norm": 28.796030044555664, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -0.5413268208503723, "logits/rejected": -0.5226410031318665, "logps/chosen": -67.05145263671875, "logps/ref_chosen": -67.0902099609375, "logps/ref_rejected": -73.005859375, "logps/rejected": -73.06277465820312, "loss": 0.6898, "margin_dpo/margin_mean": 0.09566803276538849, "margin_dpo/margin_std": 0.3500857353210449, "step": 15 }, { "epoch": 0.058823529411764705, "grad_norm": 20.94307518005371, "learning_rate": 2.7941176470588235e-07, "logits/chosen": -0.5276651382446289, "logits/rejected": -0.5001177787780762, "logps/chosen": -73.87080383300781, "logps/ref_chosen": -73.9133071899414, "logps/ref_rejected": -80.46495056152344, "logps/rejected": -80.62101745605469, "loss": 0.6824, "margin_dpo/margin_mean": 0.19857604801654816, "margin_dpo/margin_std": 0.378338098526001, "step": 20 }, { "epoch": 0.07352941176470588, "grad_norm": 24.610126495361328, "learning_rate": 3.529411764705882e-07, "logits/chosen": -0.5061219930648804, "logits/rejected": -0.5009726285934448, "logps/chosen": -60.977256774902344, "logps/ref_chosen": -61.014869689941406, "logps/ref_rejected": -74.33148193359375, "logps/rejected": -74.73905181884766, "loss": 0.6642, "margin_dpo/margin_mean": 0.44518008828163147, "margin_dpo/margin_std": 0.6063351631164551, "step": 25 }, { "epoch": 0.08823529411764706, "grad_norm": 21.515533447265625, "learning_rate": 4.264705882352941e-07, "logits/chosen": -0.5904145240783691, "logits/rejected": -0.5685775279998779, "logps/chosen": -78.83164978027344, "logps/ref_chosen": -78.80770111083984, "logps/ref_rejected": -81.50379943847656, "logps/rejected": -83.10078430175781, "loss": 0.6294, "margin_dpo/margin_mean": 1.5730347633361816, "margin_dpo/margin_std": 1.7553781270980835, "step": 30 }, { "epoch": 0.10294117647058823, "grad_norm": 19.351747512817383, "learning_rate": 5e-07, "logits/chosen": -0.5566071271896362, "logits/rejected": -0.5428273677825928, "logps/chosen": -86.93069458007812, "logps/ref_chosen": -86.67269134521484, "logps/ref_rejected": -86.13935852050781, "logps/rejected": -88.55570220947266, "loss": 0.6028, "margin_dpo/margin_mean": 2.158336877822876, "margin_dpo/margin_std": 2.8764147758483887, "step": 35 }, { "epoch": 0.11764705882352941, "grad_norm": 18.829681396484375, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.6493271589279175, "logits/rejected": -0.6133594512939453, "logps/chosen": -71.7585220336914, "logps/ref_chosen": -69.31690216064453, "logps/ref_rejected": -83.9319076538086, "logps/rejected": -91.31529235839844, "loss": 0.5446, "margin_dpo/margin_mean": 4.941764831542969, "margin_dpo/margin_std": 8.191742897033691, "step": 40 }, { "epoch": 0.1323529411764706, "grad_norm": 23.498613357543945, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6821354627609253, "logits/rejected": -0.6494560837745667, "logps/chosen": -73.5013427734375, "logps/ref_chosen": -69.97550964355469, "logps/ref_rejected": -100.10908508300781, "logps/rejected": -108.92988586425781, "loss": 0.553, "margin_dpo/margin_mean": 5.294968128204346, "margin_dpo/margin_std": 6.769883632659912, "step": 45 }, { "epoch": 0.14705882352941177, "grad_norm": 30.29952621459961, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6595835089683533, "logits/rejected": -0.6233135461807251, "logps/chosen": -78.32559967041016, "logps/ref_chosen": -72.90187072753906, "logps/ref_rejected": -85.52653503417969, "logps/rejected": -95.23252868652344, "loss": 0.5518, "margin_dpo/margin_mean": 4.282275199890137, "margin_dpo/margin_std": 7.439302921295166, "step": 50 }, { "epoch": 0.16176470588235295, "grad_norm": 23.780656814575195, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.7151781916618347, "logits/rejected": -0.6897321939468384, "logps/chosen": -91.6336898803711, "logps/ref_chosen": -87.45826721191406, "logps/ref_rejected": -97.73722076416016, "logps/rejected": -109.0378646850586, "loss": 0.5112, "margin_dpo/margin_mean": 7.125207424163818, "margin_dpo/margin_std": 9.734245300292969, "step": 55 }, { "epoch": 0.17647058823529413, "grad_norm": 20.72915267944336, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.6631725430488586, "logits/rejected": -0.6214786767959595, "logps/chosen": -110.2301254272461, "logps/ref_chosen": -106.60343933105469, "logps/ref_rejected": -89.84490203857422, "logps/rejected": -99.53703308105469, "loss": 0.5286, "margin_dpo/margin_mean": 6.065438747406006, "margin_dpo/margin_std": 10.341069221496582, "step": 60 }, { "epoch": 0.19117647058823528, "grad_norm": 16.05661392211914, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.6507592797279358, "logits/rejected": -0.6253207921981812, "logps/chosen": -79.79920959472656, "logps/ref_chosen": -76.7091064453125, "logps/ref_rejected": -84.54231262207031, "logps/rejected": -93.5802001953125, "loss": 0.4746, "margin_dpo/margin_mean": 5.947785377502441, "margin_dpo/margin_std": 7.2523908615112305, "step": 65 }, { "epoch": 0.20588235294117646, "grad_norm": 16.453359603881836, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.698811411857605, "logits/rejected": -0.6621960401535034, "logps/chosen": -74.00252532958984, "logps/ref_chosen": -70.0877914428711, "logps/ref_rejected": -91.75868225097656, "logps/rejected": -103.95845031738281, "loss": 0.4662, "margin_dpo/margin_mean": 8.28502082824707, "margin_dpo/margin_std": 8.248537063598633, "step": 70 }, { "epoch": 0.22058823529411764, "grad_norm": 17.00535011291504, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.6827956438064575, "logits/rejected": -0.6566829681396484, "logps/chosen": -78.68012237548828, "logps/ref_chosen": -74.91792297363281, "logps/ref_rejected": -85.64566802978516, "logps/rejected": -97.5809555053711, "loss": 0.4863, "margin_dpo/margin_mean": 8.173115730285645, "margin_dpo/margin_std": 8.817681312561035, "step": 75 }, { "epoch": 0.23529411764705882, "grad_norm": 21.13958168029785, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.7372442483901978, "logits/rejected": -0.689995288848877, "logps/chosen": -102.5855941772461, "logps/ref_chosen": -97.75636291503906, "logps/ref_rejected": -92.88613891601562, "logps/rejected": -105.6670150756836, "loss": 0.451, "margin_dpo/margin_mean": 7.951646327972412, "margin_dpo/margin_std": 8.248537063598633, "step": 80 }, { "epoch": 0.25, "grad_norm": 18.165218353271484, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.6671745777130127, "logits/rejected": -0.6385531425476074, "logps/chosen": -85.70280456542969, "logps/ref_chosen": -78.9326171875, "logps/ref_rejected": -88.00363159179688, "logps/rejected": -101.9955825805664, "loss": 0.4569, "margin_dpo/margin_mean": 7.221736907958984, "margin_dpo/margin_std": 8.439001083374023, "step": 85 }, { "epoch": 0.2647058823529412, "grad_norm": 20.739215850830078, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.6510001420974731, "logits/rejected": -0.629525899887085, "logps/chosen": -84.86643981933594, "logps/ref_chosen": -78.78388214111328, "logps/ref_rejected": -90.2783203125, "logps/rejected": -105.78071594238281, "loss": 0.4419, "margin_dpo/margin_mean": 9.419827461242676, "margin_dpo/margin_std": 9.238184928894043, "step": 90 }, { "epoch": 0.27941176470588236, "grad_norm": 17.511486053466797, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.6517031788825989, "logits/rejected": -0.6104840040206909, "logps/chosen": -74.32402038574219, "logps/ref_chosen": -65.91403198242188, "logps/ref_rejected": -62.45396041870117, "logps/rejected": -78.22425842285156, "loss": 0.4514, "margin_dpo/margin_mean": 7.360299587249756, "margin_dpo/margin_std": 11.319549560546875, "step": 95 }, { "epoch": 0.29411764705882354, "grad_norm": 18.769145965576172, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.6338332295417786, "logits/rejected": -0.6123248338699341, "logps/chosen": -84.81422424316406, "logps/ref_chosen": -77.24075317382812, "logps/ref_rejected": -93.24552917480469, "logps/rejected": -110.46153259277344, "loss": 0.4265, "margin_dpo/margin_mean": 9.642545700073242, "margin_dpo/margin_std": 11.237717628479004, "step": 100 }, { "epoch": 0.29411764705882354, "eval_logits/chosen": -0.6361338496208191, "eval_logits/rejected": -0.6085699200630188, "eval_logps/chosen": -107.19888305664062, "eval_logps/ref_chosen": -97.0617446899414, "eval_logps/ref_rejected": -80.18183135986328, "eval_logps/rejected": -95.6607437133789, "eval_loss": 0.5427329540252686, "eval_margin_dpo/margin_mean": 5.341787338256836, "eval_margin_dpo/margin_std": 10.061349868774414, "eval_runtime": 20.4041, "eval_samples_per_second": 114.634, "eval_steps_per_second": 0.931, "step": 100 }, { "epoch": 0.3088235294117647, "grad_norm": 17.255924224853516, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.6774856448173523, "logits/rejected": -0.6355584263801575, "logps/chosen": -76.29129791259766, "logps/ref_chosen": -70.76807403564453, "logps/ref_rejected": -74.71427917480469, "logps/rejected": -92.5668716430664, "loss": 0.427, "margin_dpo/margin_mean": 12.32937240600586, "margin_dpo/margin_std": 11.30049991607666, "step": 105 }, { "epoch": 0.3235294117647059, "grad_norm": 18.394851684570312, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.6861704587936401, "logits/rejected": -0.6574342846870422, "logps/chosen": -89.50286102294922, "logps/ref_chosen": -81.14533996582031, "logps/ref_rejected": -89.10765838623047, "logps/rejected": -109.15755462646484, "loss": 0.4145, "margin_dpo/margin_mean": 11.69237995147705, "margin_dpo/margin_std": 12.493224143981934, "step": 110 }, { "epoch": 0.3382352941176471, "grad_norm": 18.24220848083496, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.6698350310325623, "logits/rejected": -0.6532580256462097, "logps/chosen": -76.1394271850586, "logps/ref_chosen": -64.77717590332031, "logps/ref_rejected": -99.79936218261719, "logps/rejected": -122.98934173583984, "loss": 0.4011, "margin_dpo/margin_mean": 11.827718734741211, "margin_dpo/margin_std": 12.786788940429688, "step": 115 }, { "epoch": 0.35294117647058826, "grad_norm": 22.541603088378906, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.6674679517745972, "logits/rejected": -0.6289718151092529, "logps/chosen": -86.44108581542969, "logps/ref_chosen": -77.35191345214844, "logps/ref_rejected": -82.3753433227539, "logps/rejected": -104.9658203125, "loss": 0.4154, "margin_dpo/margin_mean": 13.501307487487793, "margin_dpo/margin_std": 12.104052543640137, "step": 120 }, { "epoch": 0.36764705882352944, "grad_norm": 18.34996223449707, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.6953171491622925, "logits/rejected": -0.6653636693954468, "logps/chosen": -84.34068298339844, "logps/ref_chosen": -74.56766510009766, "logps/ref_rejected": -87.71104431152344, "logps/rejected": -109.58891296386719, "loss": 0.3871, "margin_dpo/margin_mean": 12.104842185974121, "margin_dpo/margin_std": 12.706830978393555, "step": 125 }, { "epoch": 0.38235294117647056, "grad_norm": 19.588794708251953, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.6832663416862488, "logits/rejected": -0.6475099325180054, "logps/chosen": -92.37910461425781, "logps/ref_chosen": -79.86932373046875, "logps/ref_rejected": -92.48243713378906, "logps/rejected": -118.1786117553711, "loss": 0.4128, "margin_dpo/margin_mean": 13.186391830444336, "margin_dpo/margin_std": 16.62637710571289, "step": 130 }, { "epoch": 0.39705882352941174, "grad_norm": 19.383163452148438, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.6862474679946899, "logits/rejected": -0.6508306264877319, "logps/chosen": -93.94104766845703, "logps/ref_chosen": -82.55046081542969, "logps/ref_rejected": -91.73478698730469, "logps/rejected": -115.92558288574219, "loss": 0.3855, "margin_dpo/margin_mean": 12.800195693969727, "margin_dpo/margin_std": 15.272809028625488, "step": 135 }, { "epoch": 0.4117647058823529, "grad_norm": 15.547196388244629, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.6315192580223083, "logits/rejected": -0.5951318740844727, "logps/chosen": -86.93388366699219, "logps/ref_chosen": -76.40785217285156, "logps/ref_rejected": -88.25675964355469, "logps/rejected": -111.50956726074219, "loss": 0.3782, "margin_dpo/margin_mean": 12.726763725280762, "margin_dpo/margin_std": 12.062446594238281, "step": 140 }, { "epoch": 0.4264705882352941, "grad_norm": 16.376129150390625, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.694092869758606, "logits/rejected": -0.6596013307571411, "logps/chosen": -90.28238677978516, "logps/ref_chosen": -79.43595123291016, "logps/ref_rejected": -80.57792663574219, "logps/rejected": -105.7525863647461, "loss": 0.3722, "margin_dpo/margin_mean": 14.328218460083008, "margin_dpo/margin_std": 13.251609802246094, "step": 145 }, { "epoch": 0.4411764705882353, "grad_norm": 19.48674201965332, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.6205201745033264, "logits/rejected": -0.5989262461662292, "logps/chosen": -82.92797088623047, "logps/ref_chosen": -69.55223846435547, "logps/ref_rejected": -76.5206298828125, "logps/rejected": -99.82804870605469, "loss": 0.4063, "margin_dpo/margin_mean": 9.931692123413086, "margin_dpo/margin_std": 11.23712158203125, "step": 150 }, { "epoch": 0.45588235294117646, "grad_norm": 18.904706954956055, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.6716780662536621, "logits/rejected": -0.6312578320503235, "logps/chosen": -98.33650207519531, "logps/ref_chosen": -83.78580474853516, "logps/ref_rejected": -79.48396301269531, "logps/rejected": -106.65342712402344, "loss": 0.3818, "margin_dpo/margin_mean": 12.618766784667969, "margin_dpo/margin_std": 14.547628402709961, "step": 155 }, { "epoch": 0.47058823529411764, "grad_norm": 16.047494888305664, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.6216621994972229, "logits/rejected": -0.5977298617362976, "logps/chosen": -85.62313079833984, "logps/ref_chosen": -71.89569091796875, "logps/ref_rejected": -95.74468231201172, "logps/rejected": -123.3752212524414, "loss": 0.3508, "margin_dpo/margin_mean": 13.903097152709961, "margin_dpo/margin_std": 10.593317031860352, "step": 160 }, { "epoch": 0.4852941176470588, "grad_norm": 17.789417266845703, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.6328192949295044, "logits/rejected": -0.5899003148078918, "logps/chosen": -91.7447738647461, "logps/ref_chosen": -77.03978729248047, "logps/ref_rejected": -88.47887420654297, "logps/rejected": -120.28157806396484, "loss": 0.3544, "margin_dpo/margin_mean": 17.097713470458984, "margin_dpo/margin_std": 14.805742263793945, "step": 165 }, { "epoch": 0.5, "grad_norm": 19.674264907836914, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.6691595911979675, "logits/rejected": -0.6374617218971252, "logps/chosen": -88.34684753417969, "logps/ref_chosen": -71.93138122558594, "logps/ref_rejected": -88.34697723388672, "logps/rejected": -119.37635803222656, "loss": 0.3454, "margin_dpo/margin_mean": 14.613912582397461, "margin_dpo/margin_std": 12.491094589233398, "step": 170 }, { "epoch": 0.5147058823529411, "grad_norm": 20.303539276123047, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.6473700404167175, "logits/rejected": -0.6196728944778442, "logps/chosen": -86.78947448730469, "logps/ref_chosen": -68.0127182006836, "logps/ref_rejected": -92.58775329589844, "logps/rejected": -123.58447265625, "loss": 0.3713, "margin_dpo/margin_mean": 12.219950675964355, "margin_dpo/margin_std": 13.678237915039062, "step": 175 }, { "epoch": 0.5294117647058824, "grad_norm": 17.388011932373047, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.611466646194458, "logits/rejected": -0.587906002998352, "logps/chosen": -80.26910400390625, "logps/ref_chosen": -61.942466735839844, "logps/ref_rejected": -87.44703674316406, "logps/rejected": -122.92547607421875, "loss": 0.3585, "margin_dpo/margin_mean": 17.15180778503418, "margin_dpo/margin_std": 14.575396537780762, "step": 180 }, { "epoch": 0.5441176470588235, "grad_norm": 19.291353225708008, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.6199885606765747, "logits/rejected": -0.5656689405441284, "logps/chosen": -87.93196105957031, "logps/ref_chosen": -72.35160064697266, "logps/ref_rejected": -69.03958129882812, "logps/rejected": -99.19012451171875, "loss": 0.3501, "margin_dpo/margin_mean": 14.570175170898438, "margin_dpo/margin_std": 14.043818473815918, "step": 185 }, { "epoch": 0.5588235294117647, "grad_norm": 18.855066299438477, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.6612949967384338, "logits/rejected": -0.6132839322090149, "logps/chosen": -100.40862274169922, "logps/ref_chosen": -79.45222473144531, "logps/ref_rejected": -71.31239318847656, "logps/rejected": -106.55586242675781, "loss": 0.3605, "margin_dpo/margin_mean": 14.287073135375977, "margin_dpo/margin_std": 14.959236145019531, "step": 190 }, { "epoch": 0.5735294117647058, "grad_norm": 20.077083587646484, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.667130172252655, "logits/rejected": -0.6179927587509155, "logps/chosen": -89.4631118774414, "logps/ref_chosen": -71.20511627197266, "logps/ref_rejected": -84.8467025756836, "logps/rejected": -121.50825500488281, "loss": 0.3429, "margin_dpo/margin_mean": 18.4035587310791, "margin_dpo/margin_std": 15.252446174621582, "step": 195 }, { "epoch": 0.5882352941176471, "grad_norm": 17.699968338012695, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.6087943911552429, "logits/rejected": -0.5693117380142212, "logps/chosen": -89.16279602050781, "logps/ref_chosen": -71.31782531738281, "logps/ref_rejected": -70.8514404296875, "logps/rejected": -104.97953796386719, "loss": 0.3411, "margin_dpo/margin_mean": 16.283123016357422, "margin_dpo/margin_std": 15.363842964172363, "step": 200 }, { "epoch": 0.5882352941176471, "eval_logits/chosen": -0.602095365524292, "eval_logits/rejected": -0.5640405416488647, "eval_logps/chosen": -119.31637573242188, "eval_logps/ref_chosen": -97.0617446899414, "eval_logps/ref_rejected": -80.18183135986328, "eval_logps/rejected": -112.73600769042969, "eval_loss": 0.4754122495651245, "eval_margin_dpo/margin_mean": 10.299551010131836, "eval_margin_dpo/margin_std": 14.652626991271973, "eval_runtime": 20.3073, "eval_samples_per_second": 115.18, "eval_steps_per_second": 0.936, "step": 200 }, { "epoch": 0.6029411764705882, "grad_norm": 19.535417556762695, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.5760528445243835, "logits/rejected": -0.5279114842414856, "logps/chosen": -104.2248764038086, "logps/ref_chosen": -84.44103240966797, "logps/ref_rejected": -71.78230285644531, "logps/rejected": -104.95343017578125, "loss": 0.3792, "margin_dpo/margin_mean": 13.387273788452148, "margin_dpo/margin_std": 14.807754516601562, "step": 205 }, { "epoch": 0.6176470588235294, "grad_norm": 17.17575454711914, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.6245664358139038, "logits/rejected": -0.5699684619903564, "logps/chosen": -101.36656188964844, "logps/ref_chosen": -83.94493103027344, "logps/ref_rejected": -76.44892120361328, "logps/rejected": -108.5728988647461, "loss": 0.3251, "margin_dpo/margin_mean": 14.702362060546875, "margin_dpo/margin_std": 16.377933502197266, "step": 210 }, { "epoch": 0.6323529411764706, "grad_norm": 20.044084548950195, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5605936050415039, "logits/rejected": -0.5190353393554688, "logps/chosen": -102.8707275390625, "logps/ref_chosen": -82.23881530761719, "logps/ref_rejected": -85.1430892944336, "logps/rejected": -122.053955078125, "loss": 0.3633, "margin_dpo/margin_mean": 16.278963088989258, "margin_dpo/margin_std": 19.206457138061523, "step": 215 }, { "epoch": 0.6470588235294118, "grad_norm": 21.036956787109375, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.621160626411438, "logits/rejected": -0.585429310798645, "logps/chosen": -97.38944244384766, "logps/ref_chosen": -76.5594482421875, "logps/ref_rejected": -84.79225158691406, "logps/rejected": -117.23432922363281, "loss": 0.3587, "margin_dpo/margin_mean": 11.612079620361328, "margin_dpo/margin_std": 14.565820693969727, "step": 220 }, { "epoch": 0.6617647058823529, "grad_norm": 21.023571014404297, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6097210049629211, "logits/rejected": -0.6041680574417114, "logps/chosen": -87.20682525634766, "logps/ref_chosen": -68.70957946777344, "logps/ref_rejected": -95.65819549560547, "logps/rejected": -132.78231811523438, "loss": 0.3385, "margin_dpo/margin_mean": 18.626880645751953, "margin_dpo/margin_std": 18.950374603271484, "step": 225 }, { "epoch": 0.6764705882352942, "grad_norm": 19.34729766845703, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.6367233395576477, "logits/rejected": -0.5984948873519897, "logps/chosen": -92.71955871582031, "logps/ref_chosen": -76.04148864746094, "logps/ref_rejected": -98.15973663330078, "logps/rejected": -129.41712951660156, "loss": 0.3269, "margin_dpo/margin_mean": 14.579324722290039, "margin_dpo/margin_std": 14.860456466674805, "step": 230 }, { "epoch": 0.6911764705882353, "grad_norm": 18.263099670410156, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.6155376434326172, "logits/rejected": -0.5955866575241089, "logps/chosen": -88.53390502929688, "logps/ref_chosen": -70.64253997802734, "logps/ref_rejected": -90.60277557373047, "logps/rejected": -127.80912780761719, "loss": 0.3347, "margin_dpo/margin_mean": 19.314985275268555, "margin_dpo/margin_std": 15.413273811340332, "step": 235 }, { "epoch": 0.7058823529411765, "grad_norm": 21.18890380859375, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.5532498955726624, "logits/rejected": -0.5167180299758911, "logps/chosen": -94.69210052490234, "logps/ref_chosen": -75.90282440185547, "logps/ref_rejected": -70.22077178955078, "logps/rejected": -106.57359313964844, "loss": 0.3433, "margin_dpo/margin_mean": 17.56354331970215, "margin_dpo/margin_std": 16.671550750732422, "step": 240 }, { "epoch": 0.7205882352941176, "grad_norm": 19.42283058166504, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.5711519122123718, "logits/rejected": -0.5506427884101868, "logps/chosen": -87.87870788574219, "logps/ref_chosen": -68.88108825683594, "logps/ref_rejected": -102.547119140625, "logps/rejected": -142.7686767578125, "loss": 0.3073, "margin_dpo/margin_mean": 21.223926544189453, "margin_dpo/margin_std": 16.53793716430664, "step": 245 }, { "epoch": 0.7352941176470589, "grad_norm": 21.975610733032227, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.63815838098526, "logits/rejected": -0.5797184705734253, "logps/chosen": -104.53717041015625, "logps/ref_chosen": -88.11860656738281, "logps/ref_rejected": -85.85978698730469, "logps/rejected": -118.47982025146484, "loss": 0.4138, "margin_dpo/margin_mean": 16.201473236083984, "margin_dpo/margin_std": 15.055798530578613, "step": 250 }, { "epoch": 0.75, "grad_norm": 21.86973762512207, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.5965814590454102, "logits/rejected": -0.5407648682594299, "logps/chosen": -113.81512451171875, "logps/ref_chosen": -93.02457427978516, "logps/ref_rejected": -86.20562744140625, "logps/rejected": -123.86918640136719, "loss": 0.3314, "margin_dpo/margin_mean": 16.87302017211914, "margin_dpo/margin_std": 16.191524505615234, "step": 255 }, { "epoch": 0.7647058823529411, "grad_norm": 20.748577117919922, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.5860427618026733, "logits/rejected": -0.5433794856071472, "logps/chosen": -98.30900573730469, "logps/ref_chosen": -79.27108001708984, "logps/ref_rejected": -94.08381652832031, "logps/rejected": -133.5509796142578, "loss": 0.3414, "margin_dpo/margin_mean": 20.42922592163086, "margin_dpo/margin_std": 16.98196029663086, "step": 260 }, { "epoch": 0.7794117647058824, "grad_norm": 20.377286911010742, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.5678300857543945, "logits/rejected": -0.5425071120262146, "logps/chosen": -99.11347198486328, "logps/ref_chosen": -79.24869537353516, "logps/ref_rejected": -92.03797912597656, "logps/rejected": -126.92435455322266, "loss": 0.3493, "margin_dpo/margin_mean": 15.021594047546387, "margin_dpo/margin_std": 12.837465286254883, "step": 265 }, { "epoch": 0.7941176470588235, "grad_norm": 17.822444915771484, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.606745719909668, "logits/rejected": -0.5473134517669678, "logps/chosen": -97.48078918457031, "logps/ref_chosen": -80.15914154052734, "logps/ref_rejected": -82.13599395751953, "logps/rejected": -116.37190246582031, "loss": 0.332, "margin_dpo/margin_mean": 16.91426658630371, "margin_dpo/margin_std": 14.53496265411377, "step": 270 }, { "epoch": 0.8088235294117647, "grad_norm": 20.570648193359375, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.6010477542877197, "logits/rejected": -0.5661951899528503, "logps/chosen": -99.41848754882812, "logps/ref_chosen": -78.87225341796875, "logps/ref_rejected": -84.97318267822266, "logps/rejected": -122.4229965209961, "loss": 0.3348, "margin_dpo/margin_mean": 16.90357780456543, "margin_dpo/margin_std": 20.21615219116211, "step": 275 }, { "epoch": 0.8235294117647058, "grad_norm": 18.737754821777344, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.5662145018577576, "logits/rejected": -0.525722324848175, "logps/chosen": -117.15876770019531, "logps/ref_chosen": -96.47113800048828, "logps/ref_rejected": -113.1217041015625, "logps/rejected": -154.00479125976562, "loss": 0.3329, "margin_dpo/margin_mean": 20.195457458496094, "margin_dpo/margin_std": 19.39859390258789, "step": 280 }, { "epoch": 0.8382352941176471, "grad_norm": 21.463726043701172, "learning_rate": 4.019267817841834e-08, "logits/chosen": -0.630197286605835, "logits/rejected": -0.5674210786819458, "logps/chosen": -111.90663146972656, "logps/ref_chosen": -91.53522491455078, "logps/ref_rejected": -76.2660140991211, "logps/rejected": -114.01655578613281, "loss": 0.3382, "margin_dpo/margin_mean": 17.379127502441406, "margin_dpo/margin_std": 17.829914093017578, "step": 285 }, { "epoch": 0.8529411764705882, "grad_norm": 18.62375831604004, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.5863774418830872, "logits/rejected": -0.5456980466842651, "logps/chosen": -98.92496490478516, "logps/ref_chosen": -78.96186828613281, "logps/ref_rejected": -78.63177490234375, "logps/rejected": -117.43675231933594, "loss": 0.3409, "margin_dpo/margin_mean": 18.841894149780273, "margin_dpo/margin_std": 18.295745849609375, "step": 290 }, { "epoch": 0.8676470588235294, "grad_norm": 16.586910247802734, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.5259509086608887, "logits/rejected": -0.5359938144683838, "logps/chosen": -85.10719299316406, "logps/ref_chosen": -64.14302825927734, "logps/ref_rejected": -98.70811462402344, "logps/rejected": -135.39389038085938, "loss": 0.3351, "margin_dpo/margin_mean": 15.721613883972168, "margin_dpo/margin_std": 16.5610294342041, "step": 295 }, { "epoch": 0.8823529411764706, "grad_norm": 19.39561653137207, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -0.5196036696434021, "logits/rejected": -0.5250274538993835, "logps/chosen": -80.19596099853516, "logps/ref_chosen": -59.2784423828125, "logps/ref_rejected": -91.62141418457031, "logps/rejected": -130.80763244628906, "loss": 0.3552, "margin_dpo/margin_mean": 18.2686824798584, "margin_dpo/margin_std": 16.341278076171875, "step": 300 }, { "epoch": 0.8823529411764706, "eval_logits/chosen": -0.5876314640045166, "eval_logits/rejected": -0.5494834184646606, "eval_logps/chosen": -119.7147216796875, "eval_logps/ref_chosen": -97.0617446899414, "eval_logps/ref_rejected": -80.18183135986328, "eval_logps/rejected": -113.95352935791016, "eval_loss": 0.4588142931461334, "eval_margin_dpo/margin_mean": 11.118718147277832, "eval_margin_dpo/margin_std": 15.069600105285645, "eval_runtime": 20.3107, "eval_samples_per_second": 115.161, "eval_steps_per_second": 0.935, "step": 300 }, { "epoch": 0.8970588235294118, "grad_norm": 20.72559356689453, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.6059945821762085, "logits/rejected": -0.5594589710235596, "logps/chosen": -110.14324951171875, "logps/ref_chosen": -90.05252838134766, "logps/ref_rejected": -93.02938842773438, "logps/rejected": -134.03268432617188, "loss": 0.3241, "margin_dpo/margin_mean": 20.912582397460938, "margin_dpo/margin_std": 15.790578842163086, "step": 305 }, { "epoch": 0.9117647058823529, "grad_norm": 18.592208862304688, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -0.5199320316314697, "logits/rejected": -0.48348456621170044, "logps/chosen": -99.32337951660156, "logps/ref_chosen": -76.26285552978516, "logps/ref_rejected": -81.56607055664062, "logps/rejected": -121.98432922363281, "loss": 0.3336, "margin_dpo/margin_mean": 17.357715606689453, "margin_dpo/margin_std": 17.407108306884766, "step": 310 }, { "epoch": 0.9264705882352942, "grad_norm": 19.586881637573242, "learning_rate": 8.85387393063622e-09, "logits/chosen": -0.5956140160560608, "logits/rejected": -0.5609453916549683, "logps/chosen": -108.92083740234375, "logps/ref_chosen": -89.47105407714844, "logps/ref_rejected": -92.69927215576172, "logps/rejected": -129.36099243164062, "loss": 0.3444, "margin_dpo/margin_mean": 17.211929321289062, "margin_dpo/margin_std": 18.306108474731445, "step": 315 }, { "epoch": 0.9411764705882353, "grad_norm": 21.697298049926758, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.6173444986343384, "logits/rejected": -0.5614223480224609, "logps/chosen": -98.2002182006836, "logps/ref_chosen": -76.45301818847656, "logps/ref_rejected": -65.2257308959961, "logps/rejected": -102.35930633544922, "loss": 0.3732, "margin_dpo/margin_mean": 15.386384963989258, "margin_dpo/margin_std": 15.031097412109375, "step": 320 }, { "epoch": 0.9558823529411765, "grad_norm": 22.468570709228516, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -0.5936331152915955, "logits/rejected": -0.5392800569534302, "logps/chosen": -89.80387878417969, "logps/ref_chosen": -71.98212432861328, "logps/ref_rejected": -68.71195983886719, "logps/rejected": -102.19793701171875, "loss": 0.3374, "margin_dpo/margin_mean": 15.664227485656738, "margin_dpo/margin_std": 12.905950546264648, "step": 325 }, { "epoch": 0.9705882352941176, "grad_norm": 18.491226196289062, "learning_rate": 1.592541096695571e-09, "logits/chosen": -0.5897213816642761, "logits/rejected": -0.5493496656417847, "logps/chosen": -95.72080993652344, "logps/ref_chosen": -77.13968658447266, "logps/ref_rejected": -93.0115737915039, "logps/rejected": -132.95394897460938, "loss": 0.3212, "margin_dpo/margin_mean": 21.361230850219727, "margin_dpo/margin_std": 19.999116897583008, "step": 330 }, { "epoch": 0.9852941176470589, "grad_norm": 17.843168258666992, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.5449101328849792, "logits/rejected": -0.506639301776886, "logps/chosen": -97.75109100341797, "logps/ref_chosen": -78.12508392333984, "logps/ref_rejected": -73.1583480834961, "logps/rejected": -106.00955963134766, "loss": 0.3225, "margin_dpo/margin_mean": 13.225196838378906, "margin_dpo/margin_std": 12.341458320617676, "step": 335 }, { "epoch": 1.0, "grad_norm": 19.59518051147461, "learning_rate": 1.31753782067201e-11, "logits/chosen": -0.5786937475204468, "logits/rejected": -0.544124186038971, "logps/chosen": -85.4710922241211, "logps/ref_chosen": -64.36441802978516, "logps/ref_rejected": -73.83573913574219, "logps/rejected": -113.6316146850586, "loss": 0.3138, "margin_dpo/margin_mean": 18.689212799072266, "margin_dpo/margin_std": 18.127058029174805, "step": 340 }, { "epoch": 1.0, "step": 340, "total_flos": 0.0, "train_loss": 0.4133688477908864, "train_runtime": 1436.8705, "train_samples_per_second": 30.342, "train_steps_per_second": 0.237 } ], "logging_steps": 5, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }