{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014684287812041115, "grad_norm": 83.525146484375, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.389, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5005706548690796, "margin_dpo/beta_margin_grad_std": 0.010499694384634495, "margin_dpo/beta_margin_mean": -0.0022870064713060856, "margin_dpo/beta_margin_std": 0.0420234240591526, "margin_dpo/loss_margin_mean": -0.02287006378173828, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "epoch": 0.002936857562408223, "grad_norm": 72.20420837402344, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.4953641891479492, "logits/rejected": -0.4594460129737854, "logps/chosen": -52.65569305419922, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.3932, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5016425848007202, "margin_dpo/beta_margin_grad_std": 0.008806563913822174, "margin_dpo/beta_margin_mean": -0.006572261452674866, "margin_dpo/beta_margin_std": 0.03523966670036316, "margin_dpo/loss_margin_mean": -0.06572261452674866, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "epoch": 0.004405286343612335, "grad_norm": 70.93851470947266, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.4816606044769287, "logits/rejected": -0.44218793511390686, "logps/chosen": -60.9985466003418, "logps/ref_chosen": -60.98159408569336, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.67314147949219, "loss": 1.3882, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5004101395606995, "margin_dpo/beta_margin_grad_std": 0.008285283111035824, "margin_dpo/beta_margin_mean": -0.001640463131479919, "margin_dpo/beta_margin_std": 0.03315068036317825, "margin_dpo/loss_margin_mean": -0.01640462875366211, "margin_dpo/margin_mean": -0.01640373468399048, "margin_dpo/margin_std": 0.33020099997520447, "step": 3 }, { "epoch": 0.005873715124816446, "grad_norm": 71.9634780883789, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.4688633680343628, "logits/rejected": -0.4411826729774475, "logps/chosen": -56.74000930786133, "logps/ref_chosen": -56.76771545410156, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.62959289550781, "loss": 1.3857, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49974533915519714, "margin_dpo/beta_margin_grad_std": 0.010213336907327175, "margin_dpo/beta_margin_mean": 0.0010185746941715479, "margin_dpo/beta_margin_std": 0.04087061062455177, "margin_dpo/loss_margin_mean": 0.01018574833869934, "margin_dpo/margin_mean": 0.0101853609085083, "margin_dpo/margin_std": 0.40629148483276367, "step": 4 }, { "epoch": 0.007342143906020558, "grad_norm": 89.44969940185547, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.5144953727722168, "logits/rejected": -0.4707370400428772, "logps/chosen": -53.81106185913086, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.13066864013672, "loss": 1.3838, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49925631284713745, "margin_dpo/beta_margin_grad_std": 0.010840461589396, "margin_dpo/beta_margin_mean": 0.0029798836912959814, "margin_dpo/beta_margin_std": 0.043392810970544815, "margin_dpo/loss_margin_mean": 0.029798835515975952, "margin_dpo/margin_mean": 0.02979910373687744, "margin_dpo/margin_std": 0.4284527897834778, "step": 5 }, { "epoch": 0.00881057268722467, "grad_norm": 91.85087585449219, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.5226503610610962, "logits/rejected": -0.48189258575439453, "logps/chosen": -63.01681137084961, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.65907287597656, "loss": 1.3862, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.499889999628067, "margin_dpo/beta_margin_grad_std": 0.009657730348408222, "margin_dpo/beta_margin_mean": 0.00043985259253531694, "margin_dpo/beta_margin_std": 0.03865039348602295, "margin_dpo/loss_margin_mean": 0.004398524761199951, "margin_dpo/margin_mean": 0.0043981969356536865, "margin_dpo/margin_std": 0.37970417737960815, "step": 6 }, { "epoch": 0.010279001468428781, "grad_norm": 82.43697357177734, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5088996887207031, "logits/rejected": -0.4749848246574402, "logps/chosen": -57.743560791015625, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.90592193603516, "loss": 1.3851, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4995860159397125, "margin_dpo/beta_margin_grad_std": 0.01023741252720356, "margin_dpo/beta_margin_mean": 0.0016585501143708825, "margin_dpo/beta_margin_std": 0.04097241163253784, "margin_dpo/loss_margin_mean": 0.016585499048233032, "margin_dpo/margin_mean": 0.01658591628074646, "margin_dpo/margin_std": 0.4064858555793762, "step": 7 }, { "epoch": 0.011747430249632892, "grad_norm": 79.04316711425781, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5012874007225037, "logits/rejected": -0.4746849238872528, "logps/chosen": -58.70497512817383, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.27145385742188, "loss": 1.3896, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5007215142250061, "margin_dpo/beta_margin_grad_std": 0.009568445384502411, "margin_dpo/beta_margin_mean": -0.0028907686937600374, "margin_dpo/beta_margin_std": 0.038289591670036316, "margin_dpo/loss_margin_mean": -0.028907686471939087, "margin_dpo/margin_mean": -0.028907448053359985, "margin_dpo/margin_std": 0.37828418612480164, "step": 8 }, { "epoch": 0.013215859030837005, "grad_norm": 85.21879577636719, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.4914604127407074, "logits/rejected": -0.44458478689193726, "logps/chosen": -69.87384033203125, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.62161254882812, "loss": 1.3856, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49970191717147827, "margin_dpo/beta_margin_grad_std": 0.010726687498390675, "margin_dpo/beta_margin_mean": 0.0011951536871492863, "margin_dpo/beta_margin_std": 0.04292509704828262, "margin_dpo/loss_margin_mean": 0.011951535940170288, "margin_dpo/margin_mean": 0.011951416730880737, "margin_dpo/margin_std": 0.4246274530887604, "step": 9 }, { "epoch": 0.014684287812041116, "grad_norm": 70.79057312011719, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.5021112561225891, "logits/rejected": -0.45928800106048584, "logps/chosen": -48.30955505371094, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.38316345214844, "loss": 1.3808, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4985186755657196, "margin_dpo/beta_margin_grad_std": 0.010679498314857483, "margin_dpo/beta_margin_mean": 0.005922754295170307, "margin_dpo/beta_margin_std": 0.04276762157678604, "margin_dpo/loss_margin_mean": 0.05922754108905792, "margin_dpo/margin_mean": 0.05922728776931763, "margin_dpo/margin_std": 0.425285279750824, "step": 10 }, { "epoch": 0.016152716593245228, "grad_norm": 68.34065246582031, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.46157172322273254, "logits/rejected": -0.4366176128387451, "logps/chosen": -52.98234558105469, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.7928466796875, "loss": 1.382, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4988263249397278, "margin_dpo/beta_margin_grad_std": 0.009599917568266392, "margin_dpo/beta_margin_mean": 0.004697933793067932, "margin_dpo/beta_margin_std": 0.03841574117541313, "margin_dpo/loss_margin_mean": 0.04697933793067932, "margin_dpo/margin_mean": 0.04697957634925842, "margin_dpo/margin_std": 0.3766877055168152, "step": 11 }, { "epoch": 0.01762114537444934, "grad_norm": 90.25657653808594, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.5372684001922607, "logits/rejected": -0.5010780096054077, "logps/chosen": -61.82605743408203, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.85826873779297, "logps/rejected": -104.91586303710938, "loss": 1.383, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49907633662223816, "margin_dpo/beta_margin_grad_std": 0.009649958461523056, "margin_dpo/beta_margin_mean": 0.003697256790474057, "margin_dpo/beta_margin_std": 0.03862835466861725, "margin_dpo/loss_margin_mean": 0.036972567439079285, "margin_dpo/margin_mean": 0.03697209060192108, "margin_dpo/margin_std": 0.3801400065422058, "step": 12 }, { "epoch": 0.01908957415565345, "grad_norm": 79.32652282714844, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.4902585744857788, "logits/rejected": -0.46292757987976074, "logps/chosen": -64.28887176513672, "logps/ref_chosen": -64.26036071777344, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.23356628417969, "loss": 1.3865, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49995219707489014, "margin_dpo/beta_margin_grad_std": 0.010526234284043312, "margin_dpo/beta_margin_mean": 0.00019729437190108, "margin_dpo/beta_margin_std": 0.04214153066277504, "margin_dpo/loss_margin_mean": 0.0019729435443878174, "margin_dpo/margin_mean": 0.0019735991954803467, "margin_dpo/margin_std": 0.4049326777458191, "step": 13 }, { "epoch": 0.020558002936857563, "grad_norm": 85.4604263305664, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.489965558052063, "logits/rejected": -0.4511108696460724, "logps/chosen": -58.152305603027344, "logps/ref_chosen": -58.11021423339844, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.09505462646484, "loss": 1.3863, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4998512864112854, "margin_dpo/beta_margin_grad_std": 0.011847623623907566, "margin_dpo/beta_margin_mean": 0.000588723982218653, "margin_dpo/beta_margin_std": 0.047432418912649155, "margin_dpo/loss_margin_mean": 0.005887240171432495, "margin_dpo/margin_mean": 0.005887240171432495, "margin_dpo/margin_std": 0.47125041484832764, "step": 14 }, { "epoch": 0.022026431718061675, "grad_norm": 64.13221740722656, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.46068376302719116, "logits/rejected": -0.44027313590049744, "logps/chosen": -56.97354507446289, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.85784912109375, "loss": 1.3824, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49893510341644287, "margin_dpo/beta_margin_grad_std": 0.009985481388866901, "margin_dpo/beta_margin_mean": 0.0042571574449539185, "margin_dpo/beta_margin_std": 0.03996788337826729, "margin_dpo/loss_margin_mean": 0.042571574449539185, "margin_dpo/margin_mean": 0.042571812868118286, "margin_dpo/margin_std": 0.39672398567199707, "step": 15 }, { "epoch": 0.023494860499265784, "grad_norm": 84.14559173583984, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.52532559633255, "logits/rejected": -0.4843023419380188, "logps/chosen": -61.73296356201172, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.38020324707031, "loss": 1.3848, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49955853819847107, "margin_dpo/beta_margin_grad_std": 0.008663349784910679, "margin_dpo/beta_margin_mean": 0.001766052795574069, "margin_dpo/beta_margin_std": 0.03466500714421272, "margin_dpo/loss_margin_mean": 0.017660528421401978, "margin_dpo/margin_mean": 0.01766011118888855, "margin_dpo/margin_std": 0.3431432843208313, "step": 16 }, { "epoch": 0.024963289280469897, "grad_norm": 78.68696594238281, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.5094451308250427, "logits/rejected": -0.4733882546424866, "logps/chosen": -67.70388793945312, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.42217254638672, "loss": 1.3816, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49875107407569885, "margin_dpo/beta_margin_grad_std": 0.008506165817379951, "margin_dpo/beta_margin_mean": 0.0049957516603171825, "margin_dpo/beta_margin_std": 0.034035272896289825, "margin_dpo/loss_margin_mean": 0.0499575138092041, "margin_dpo/margin_mean": 0.04995712637901306, "margin_dpo/margin_std": 0.3325832486152649, "step": 17 }, { "epoch": 0.02643171806167401, "grad_norm": 81.91975402832031, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.4996645152568817, "logits/rejected": -0.4448869228363037, "logps/chosen": -47.723114013671875, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.5279541015625, "loss": 1.3794, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49819934368133545, "margin_dpo/beta_margin_grad_std": 0.008676947094500065, "margin_dpo/beta_margin_mean": 0.007204136345535517, "margin_dpo/beta_margin_std": 0.03471643477678299, "margin_dpo/loss_margin_mean": 0.07204136252403259, "margin_dpo/margin_mean": 0.0720413327217102, "margin_dpo/margin_std": 0.3442285656929016, "step": 18 }, { "epoch": 0.027900146842878122, "grad_norm": 73.45258331298828, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.5062457323074341, "logits/rejected": -0.45754408836364746, "logps/chosen": -70.22134399414062, "logps/ref_chosen": -70.20535278320312, "logps/ref_rejected": -89.75758361816406, "logps/rejected": -89.80667114257812, "loss": 1.3833, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49917319416999817, "margin_dpo/beta_margin_grad_std": 0.009437629953026772, "margin_dpo/beta_margin_mean": 0.003309211228042841, "margin_dpo/beta_margin_std": 0.03776707127690315, "margin_dpo/loss_margin_mean": 0.033092111349105835, "margin_dpo/margin_mean": 0.03309273719787598, "margin_dpo/margin_std": 0.3704480528831482, "step": 19 }, { "epoch": 0.02936857562408223, "grad_norm": 73.92622375488281, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5687921643257141, "logits/rejected": -0.5141441226005554, "logps/chosen": -50.828826904296875, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.8233413696289, "logps/rejected": -78.88971710205078, "loss": 1.3825, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4989803433418274, "margin_dpo/beta_margin_grad_std": 0.007609857711941004, "margin_dpo/beta_margin_mean": 0.004078629892319441, "margin_dpo/beta_margin_std": 0.03044736012816429, "margin_dpo/loss_margin_mean": 0.040786296129226685, "margin_dpo/margin_mean": 0.0407865047454834, "margin_dpo/margin_std": 0.29486507177352905, "step": 20 }, { "epoch": 0.030837004405286344, "grad_norm": 77.78363037109375, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.49086394906044006, "logits/rejected": -0.4666551351547241, "logps/chosen": -50.0500373840332, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -77.97210693359375, "loss": 1.375, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4970940947532654, "margin_dpo/beta_margin_grad_std": 0.008713486604392529, "margin_dpo/beta_margin_mean": 0.01162932813167572, "margin_dpo/beta_margin_std": 0.03486839681863785, "margin_dpo/loss_margin_mean": 0.1162932813167572, "margin_dpo/margin_mean": 0.11629366874694824, "margin_dpo/margin_std": 0.34371477365493774, "step": 21 }, { "epoch": 0.032305433186490456, "grad_norm": 84.3017349243164, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.4743150472640991, "logits/rejected": -0.4301157593727112, "logps/chosen": -58.9935417175293, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.69529724121094, "loss": 1.3615, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4936363697052002, "margin_dpo/beta_margin_grad_std": 0.01110980473458767, "margin_dpo/beta_margin_mean": 0.025472251698374748, "margin_dpo/beta_margin_std": 0.044476091861724854, "margin_dpo/loss_margin_mean": 0.2547225058078766, "margin_dpo/margin_mean": 0.2547217905521393, "margin_dpo/margin_std": 0.4430729150772095, "step": 22 }, { "epoch": 0.033773861967694566, "grad_norm": 80.28763580322266, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.4873223900794983, "logits/rejected": -0.4646031856536865, "logps/chosen": -60.04255676269531, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.1395492553711, "logps/rejected": -81.33428955078125, "loss": 1.364, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49425840377807617, "margin_dpo/beta_margin_grad_std": 0.011434967629611492, "margin_dpo/beta_margin_mean": 0.022987453266978264, "margin_dpo/beta_margin_std": 0.04579947143793106, "margin_dpo/loss_margin_mean": 0.22987452149391174, "margin_dpo/margin_mean": 0.22987452149391174, "margin_dpo/margin_std": 0.4392421543598175, "step": 23 }, { "epoch": 0.03524229074889868, "grad_norm": 80.72453308105469, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.479617714881897, "logits/rejected": -0.46357664465904236, "logps/chosen": -44.27165985107422, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.34617614746094, "loss": 1.3629, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49399420619010925, "margin_dpo/beta_margin_grad_std": 0.01102045550942421, "margin_dpo/beta_margin_mean": 0.02403390221297741, "margin_dpo/beta_margin_std": 0.04411429166793823, "margin_dpo/loss_margin_mean": 0.2403390109539032, "margin_dpo/margin_mean": 0.24034002423286438, "margin_dpo/margin_std": 0.42840874195098877, "step": 24 }, { "epoch": 0.03671071953010279, "grad_norm": 73.97421264648438, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.49460622668266296, "logits/rejected": -0.4645787179470062, "logps/chosen": -52.51414489746094, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.54405975341797, "loss": 1.3645, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4943839907646179, "margin_dpo/beta_margin_grad_std": 0.011376350186765194, "margin_dpo/beta_margin_mean": 0.02247805707156658, "margin_dpo/beta_margin_std": 0.045543402433395386, "margin_dpo/loss_margin_mean": 0.22478055953979492, "margin_dpo/margin_mean": 0.22478067874908447, "margin_dpo/margin_std": 0.4543741047382355, "step": 25 }, { "epoch": 0.0381791483113069, "grad_norm": 87.36368560791016, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5448323488235474, "logits/rejected": -0.5133931636810303, "logps/chosen": -53.813804626464844, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.66832733154297, "loss": 1.3457, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48957008123397827, "margin_dpo/beta_margin_grad_std": 0.013178465887904167, "margin_dpo/beta_margin_mean": 0.04176199808716774, "margin_dpo/beta_margin_std": 0.05279136076569557, "margin_dpo/loss_margin_mean": 0.4176199734210968, "margin_dpo/margin_mean": 0.41762077808380127, "margin_dpo/margin_std": 0.5226191282272339, "step": 26 }, { "epoch": 0.039647577092511016, "grad_norm": 94.08861541748047, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.5202087163925171, "logits/rejected": -0.4837333858013153, "logps/chosen": -42.766082763671875, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72420501708984, "logps/rejected": -99.09607696533203, "loss": 1.3374, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4874098598957062, "margin_dpo/beta_margin_grad_std": 0.014595179818570614, "margin_dpo/beta_margin_mean": 0.05043218284845352, "margin_dpo/beta_margin_std": 0.05854206159710884, "margin_dpo/loss_margin_mean": 0.504321813583374, "margin_dpo/margin_mean": 0.5043210983276367, "margin_dpo/margin_std": 0.5811291933059692, "step": 27 }, { "epoch": 0.041116005873715125, "grad_norm": 75.05455780029297, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.5194311141967773, "logits/rejected": -0.46526244282722473, "logps/chosen": -60.553565979003906, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.7254409790039, "loss": 1.3547, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4918249249458313, "margin_dpo/beta_margin_grad_std": 0.015058773569762707, "margin_dpo/beta_margin_mean": 0.03272556886076927, "margin_dpo/beta_margin_std": 0.06033402308821678, "margin_dpo/loss_margin_mean": 0.3272556662559509, "margin_dpo/margin_mean": 0.3272559344768524, "margin_dpo/margin_std": 0.5973866581916809, "step": 28 }, { "epoch": 0.042584434654919234, "grad_norm": 90.46174621582031, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5414900779724121, "logits/rejected": -0.49426716566085815, "logps/chosen": -57.68913269042969, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.86851501464844, "loss": 1.3289, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4852002263069153, "margin_dpo/beta_margin_grad_std": 0.015466224402189255, "margin_dpo/beta_margin_mean": 0.05928221344947815, "margin_dpo/beta_margin_std": 0.062019772827625275, "margin_dpo/loss_margin_mean": 0.5928221344947815, "margin_dpo/margin_mean": 0.5928229689598083, "margin_dpo/margin_std": 0.6189556121826172, "step": 29 }, { "epoch": 0.04405286343612335, "grad_norm": 87.33443450927734, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.4894167184829712, "logits/rejected": -0.45850175619125366, "logps/chosen": -52.40911102294922, "logps/ref_chosen": -52.57737350463867, "logps/ref_rejected": -98.48921203613281, "logps/rejected": -99.00884246826172, "loss": 1.3197, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48282885551452637, "margin_dpo/beta_margin_grad_std": 0.01581035926938057, "margin_dpo/beta_margin_mean": 0.06878980994224548, "margin_dpo/beta_margin_std": 0.06341779977083206, "margin_dpo/loss_margin_mean": 0.6878980398178101, "margin_dpo/margin_mean": 0.6878979206085205, "margin_dpo/margin_std": 0.62163245677948, "step": 30 }, { "epoch": 0.04552129221732746, "grad_norm": 67.94820404052734, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.5108931064605713, "logits/rejected": -0.4666990637779236, "logps/chosen": -63.70445251464844, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.24160766601562, "loss": 1.3429, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48876938223838806, "margin_dpo/beta_margin_grad_std": 0.0166544821113348, "margin_dpo/beta_margin_mean": 0.04500679671764374, "margin_dpo/beta_margin_std": 0.06682661920785904, "margin_dpo/loss_margin_mean": 0.450067937374115, "margin_dpo/margin_mean": 0.4500678479671478, "margin_dpo/margin_std": 0.6665528416633606, "step": 31 }, { "epoch": 0.04698972099853157, "grad_norm": 82.90047454833984, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.49858012795448303, "logits/rejected": -0.45628952980041504, "logps/chosen": -62.53711700439453, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.8597640991211, "loss": 1.3154, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48145589232444763, "margin_dpo/beta_margin_grad_std": 0.023477083072066307, "margin_dpo/beta_margin_mean": 0.07446718961000443, "margin_dpo/beta_margin_std": 0.09461291879415512, "margin_dpo/loss_margin_mean": 0.7446719408035278, "margin_dpo/margin_mean": 0.7446720600128174, "margin_dpo/margin_std": 0.9450139999389648, "step": 32 }, { "epoch": 0.048458149779735685, "grad_norm": 72.11341857910156, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.47633564472198486, "logits/rejected": -0.4497436285018921, "logps/chosen": -53.105873107910156, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.37184143066406, "loss": 1.3243, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48398149013519287, "margin_dpo/beta_margin_grad_std": 0.016650153324007988, "margin_dpo/beta_margin_mean": 0.06417950242757797, "margin_dpo/beta_margin_std": 0.06679090112447739, "margin_dpo/loss_margin_mean": 0.6417950391769409, "margin_dpo/margin_mean": 0.6417955160140991, "margin_dpo/margin_std": 0.6490182876586914, "step": 33 }, { "epoch": 0.049926578560939794, "grad_norm": 77.38883209228516, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.5127777457237244, "logits/rejected": -0.49532148241996765, "logps/chosen": -50.72978210449219, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.66510009765625, "loss": 1.3068, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4792894721031189, "margin_dpo/beta_margin_grad_std": 0.021500185132026672, "margin_dpo/beta_margin_mean": 0.0830799788236618, "margin_dpo/beta_margin_std": 0.08640186488628387, "margin_dpo/loss_margin_mean": 0.8307997584342957, "margin_dpo/margin_mean": 0.8307995796203613, "margin_dpo/margin_std": 0.8540636301040649, "step": 34 }, { "epoch": 0.0513950073421439, "grad_norm": 82.41116333007812, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.5374979972839355, "logits/rejected": -0.5004309415817261, "logps/chosen": -50.88545227050781, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.90895080566406, "loss": 1.2708, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4696078896522522, "margin_dpo/beta_margin_grad_std": 0.02748698741197586, "margin_dpo/beta_margin_mean": 0.12235570698976517, "margin_dpo/beta_margin_std": 0.11256185173988342, "margin_dpo/loss_margin_mean": 1.2235571146011353, "margin_dpo/margin_mean": 1.2235569953918457, "margin_dpo/margin_std": 1.111976146697998, "step": 35 }, { "epoch": 0.05286343612334802, "grad_norm": 72.79762268066406, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.5538948774337769, "logits/rejected": -0.517404317855835, "logps/chosen": -51.94648742675781, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.04061889648438, "logps/rejected": -87.11822509765625, "loss": 1.2813, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.47209563851356506, "margin_dpo/beta_margin_grad_std": 0.03178109601140022, "margin_dpo/beta_margin_mean": 0.11225982010364532, "margin_dpo/beta_margin_std": 0.12831299006938934, "margin_dpo/loss_margin_mean": 1.1225981712341309, "margin_dpo/margin_mean": 1.122597098350525, "margin_dpo/margin_std": 1.2439404726028442, "step": 36 }, { "epoch": 0.05433186490455213, "grad_norm": 61.13553237915039, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.5280976295471191, "logits/rejected": -0.4858455955982208, "logps/chosen": -62.78415298461914, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.90142059326172, "loss": 1.2911, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.47450384497642517, "margin_dpo/beta_margin_grad_std": 0.03514566645026207, "margin_dpo/beta_margin_mean": 0.10293034464120865, "margin_dpo/beta_margin_std": 0.14328184723854065, "margin_dpo/loss_margin_mean": 1.0293034315109253, "margin_dpo/margin_mean": 1.0293034315109253, "margin_dpo/margin_std": 1.3807631731033325, "step": 37 }, { "epoch": 0.055800293685756244, "grad_norm": 70.00904083251953, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5190426111221313, "logits/rejected": -0.4862367510795593, "logps/chosen": -48.24530792236328, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.11785888671875, "loss": 1.262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4666314721107483, "margin_dpo/beta_margin_grad_std": 0.03878392279148102, "margin_dpo/beta_margin_mean": 0.13506263494491577, "margin_dpo/beta_margin_std": 0.15932665765285492, "margin_dpo/loss_margin_mean": 1.3506262302398682, "margin_dpo/margin_mean": 1.3506265878677368, "margin_dpo/margin_std": 1.575331449508667, "step": 38 }, { "epoch": 0.05726872246696035, "grad_norm": 74.47781372070312, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5537021160125732, "logits/rejected": -0.5135682821273804, "logps/chosen": -50.65707015991211, "logps/ref_chosen": -50.75046920776367, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.16737365722656, "loss": 1.2298, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.45806559920310974, "margin_dpo/beta_margin_grad_std": 0.036758922040462494, "margin_dpo/beta_margin_mean": 0.16912682354450226, "margin_dpo/beta_margin_std": 0.14913904666900635, "margin_dpo/loss_margin_mean": 1.6912682056427002, "margin_dpo/margin_mean": 1.6912682056427002, "margin_dpo/margin_std": 1.4713746309280396, "step": 39 }, { "epoch": 0.05873715124816446, "grad_norm": 59.9489631652832, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.5245569348335266, "logits/rejected": -0.4949991703033447, "logps/chosen": -57.77392578125, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.30007934570312, "logps/rejected": -75.65821075439453, "loss": 1.243, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.46128079295158386, "margin_dpo/beta_margin_grad_std": 0.04237818345427513, "margin_dpo/beta_margin_mean": 0.1569286286830902, "margin_dpo/beta_margin_std": 0.1742551028728485, "margin_dpo/loss_margin_mean": 1.5692862272262573, "margin_dpo/margin_mean": 1.5692870616912842, "margin_dpo/margin_std": 1.697884202003479, "step": 40 }, { "epoch": 0.06020558002936858, "grad_norm": 67.88613891601562, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.5592871308326721, "logits/rejected": -0.5240367650985718, "logps/chosen": -62.67747497558594, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.87300109863281, "loss": 1.2195, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4541543424129486, "margin_dpo/beta_margin_grad_std": 0.05179302766919136, "margin_dpo/beta_margin_mean": 0.18678142130374908, "margin_dpo/beta_margin_std": 0.21468721330165863, "margin_dpo/loss_margin_mean": 1.8678141832351685, "margin_dpo/margin_mean": 1.867814540863037, "margin_dpo/margin_std": 2.0870983600616455, "step": 41 }, { "epoch": 0.06167400881057269, "grad_norm": 78.81612396240234, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.5433309674263, "logits/rejected": -0.49680295586586, "logps/chosen": -58.707366943359375, "logps/ref_chosen": -58.96642303466797, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.25081634521484, "loss": 1.1578, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4366336166858673, "margin_dpo/beta_margin_grad_std": 0.058427974581718445, "margin_dpo/beta_margin_mean": 0.2601499557495117, "margin_dpo/beta_margin_std": 0.24821382761001587, "margin_dpo/loss_margin_mean": 2.601499557495117, "margin_dpo/margin_mean": 2.601499319076538, "margin_dpo/margin_std": 2.445554733276367, "step": 42 }, { "epoch": 0.0631424375917768, "grad_norm": 72.23222351074219, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.5568352341651917, "logits/rejected": -0.532639741897583, "logps/chosen": -53.65935516357422, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.41513061523438, "loss": 1.1675, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.44025009870529175, "margin_dpo/beta_margin_grad_std": 0.04758695140480995, "margin_dpo/beta_margin_mean": 0.24315857887268066, "margin_dpo/beta_margin_std": 0.19878432154655457, "margin_dpo/loss_margin_mean": 2.4315857887268066, "margin_dpo/margin_mean": 2.4315857887268066, "margin_dpo/margin_std": 1.964142918586731, "step": 43 }, { "epoch": 0.06461086637298091, "grad_norm": 78.49581909179688, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.458575576543808, "logits/rejected": -0.43896228075027466, "logps/chosen": -49.86518859863281, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.42298889160156, "loss": 1.1338, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.43024110794067383, "margin_dpo/beta_margin_grad_std": 0.0542747788131237, "margin_dpo/beta_margin_mean": 0.2852535545825958, "margin_dpo/beta_margin_std": 0.2277490794658661, "margin_dpo/loss_margin_mean": 2.8525354862213135, "margin_dpo/margin_mean": 2.852534532546997, "margin_dpo/margin_std": 2.270460605621338, "step": 44 }, { "epoch": 0.06607929515418502, "grad_norm": 62.053192138671875, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.4600446820259094, "logits/rejected": -0.4469829797744751, "logps/chosen": -48.24645233154297, "logps/ref_chosen": -48.41493225097656, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -80.1404037475586, "loss": 1.1805, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4424096643924713, "margin_dpo/beta_margin_grad_std": 0.06356598436832428, "margin_dpo/beta_margin_mean": 0.23724493384361267, "margin_dpo/beta_margin_std": 0.2693977653980255, "margin_dpo/loss_margin_mean": 2.3724491596221924, "margin_dpo/margin_mean": 2.3724491596221924, "margin_dpo/margin_std": 2.6500847339630127, "step": 45 }, { "epoch": 0.06754772393538913, "grad_norm": 69.27433013916016, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.5094949007034302, "logits/rejected": -0.45755523443222046, "logps/chosen": -55.80693435668945, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.43904113769531, "loss": 1.1354, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.42856818437576294, "margin_dpo/beta_margin_grad_std": 0.07470017671585083, "margin_dpo/beta_margin_mean": 0.2978942394256592, "margin_dpo/beta_margin_std": 0.3255438506603241, "margin_dpo/loss_margin_mean": 2.9789421558380127, "margin_dpo/margin_mean": 2.9789419174194336, "margin_dpo/margin_std": 3.244965076446533, "step": 46 }, { "epoch": 0.06901615271659324, "grad_norm": 65.2599868774414, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.5813416242599487, "logits/rejected": -0.5291002988815308, "logps/chosen": -57.496604919433594, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.23886108398438, "loss": 1.1271, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.42738690972328186, "margin_dpo/beta_margin_grad_std": 0.0637550950050354, "margin_dpo/beta_margin_mean": 0.29891282320022583, "margin_dpo/beta_margin_std": 0.26972696185112, "margin_dpo/loss_margin_mean": 2.9891281127929688, "margin_dpo/margin_mean": 2.989128351211548, "margin_dpo/margin_std": 2.6342062950134277, "step": 47 }, { "epoch": 0.07048458149779736, "grad_norm": 73.67699432373047, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.5998705625534058, "logits/rejected": -0.5423353910446167, "logps/chosen": -57.117156982421875, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -91.08055877685547, "loss": 1.1227, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4244069755077362, "margin_dpo/beta_margin_grad_std": 0.07627448439598083, "margin_dpo/beta_margin_mean": 0.3134877383708954, "margin_dpo/beta_margin_std": 0.32677435874938965, "margin_dpo/loss_margin_mean": 3.1348772048950195, "margin_dpo/margin_mean": 3.1348774433135986, "margin_dpo/margin_std": 3.0109379291534424, "step": 48 }, { "epoch": 0.07195301027900147, "grad_norm": 61.355953216552734, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.5448025465011597, "logits/rejected": -0.4857603907585144, "logps/chosen": -61.36932373046875, "logps/ref_chosen": -61.685264587402344, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -87.26129913330078, "loss": 1.0774, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.41020649671554565, "margin_dpo/beta_margin_grad_std": 0.08793335407972336, "margin_dpo/beta_margin_mean": 0.3809766173362732, "margin_dpo/beta_margin_std": 0.3965732753276825, "margin_dpo/loss_margin_mean": 3.8097660541534424, "margin_dpo/margin_mean": 3.8097658157348633, "margin_dpo/margin_std": 3.869323253631592, "step": 49 }, { "epoch": 0.07342143906020558, "grad_norm": 62.80997085571289, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5425491333007812, "logits/rejected": -0.5065620541572571, "logps/chosen": -58.89775848388672, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -100.69513702392578, "loss": 1.0518, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.40193936228752136, "margin_dpo/beta_margin_grad_std": 0.09261800348758698, "margin_dpo/beta_margin_mean": 0.4163365066051483, "margin_dpo/beta_margin_std": 0.4100196361541748, "margin_dpo/loss_margin_mean": 4.163364887237549, "margin_dpo/margin_mean": 4.163365364074707, "margin_dpo/margin_std": 4.094795227050781, "step": 50 }, { "epoch": 0.07488986784140969, "grad_norm": 52.91781234741211, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5184497833251953, "logits/rejected": -0.4852331280708313, "logps/chosen": -61.69359588623047, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -80.33977508544922, "loss": 1.085, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.40868327021598816, "margin_dpo/beta_margin_grad_std": 0.1110108494758606, "margin_dpo/beta_margin_mean": 0.4017845094203949, "margin_dpo/beta_margin_std": 0.5204705595970154, "margin_dpo/loss_margin_mean": 4.017845153808594, "margin_dpo/margin_mean": 4.017845153808594, "margin_dpo/margin_std": 5.1221513748168945, "step": 51 }, { "epoch": 0.0763582966226138, "grad_norm": 58.923404693603516, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5524120330810547, "logits/rejected": -0.496574342250824, "logps/chosen": -51.979454040527344, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -85.81260681152344, "loss": 0.9189, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3595433533191681, "margin_dpo/beta_margin_grad_std": 0.10714302211999893, "margin_dpo/beta_margin_mean": 0.6196599006652832, "margin_dpo/beta_margin_std": 0.5214123129844666, "margin_dpo/loss_margin_mean": 6.196599006652832, "margin_dpo/margin_mean": 6.196599006652832, "margin_dpo/margin_std": 5.190753936767578, "step": 52 }, { "epoch": 0.07782672540381791, "grad_norm": 58.20880889892578, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.6073682904243469, "logits/rejected": -0.5856744050979614, "logps/chosen": -53.506500244140625, "logps/ref_chosen": -53.31465530395508, "logps/ref_rejected": -91.7835922241211, "logps/rejected": -98.30122375488281, "loss": 0.9446, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.36480841040611267, "margin_dpo/beta_margin_grad_std": 0.12540318071842194, "margin_dpo/beta_margin_mean": 0.6325778961181641, "margin_dpo/beta_margin_std": 0.6903671622276306, "margin_dpo/loss_margin_mean": 6.325778961181641, "margin_dpo/margin_mean": 6.325778484344482, "margin_dpo/margin_std": 6.248142242431641, "step": 53 }, { "epoch": 0.07929515418502203, "grad_norm": 59.29412841796875, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.633226752281189, "logits/rejected": -0.5815136432647705, "logps/chosen": -51.13933563232422, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -97.51422119140625, "loss": 0.9783, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3783862590789795, "margin_dpo/beta_margin_grad_std": 0.10563214868307114, "margin_dpo/beta_margin_mean": 0.5348156690597534, "margin_dpo/beta_margin_std": 0.5101956725120544, "margin_dpo/loss_margin_mean": 5.348156452178955, "margin_dpo/margin_mean": 5.348155498504639, "margin_dpo/margin_std": 5.086174488067627, "step": 54 }, { "epoch": 0.08076358296622614, "grad_norm": 53.738956451416016, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.6361401081085205, "logits/rejected": -0.5729630589485168, "logps/chosen": -63.57060241699219, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -96.49041748046875, "loss": 0.9548, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.361075222492218, "margin_dpo/beta_margin_grad_std": 0.14729972183704376, "margin_dpo/beta_margin_mean": 0.6541542410850525, "margin_dpo/beta_margin_std": 0.7597689032554626, "margin_dpo/loss_margin_mean": 6.541542053222656, "margin_dpo/margin_mean": 6.541542053222656, "margin_dpo/margin_std": 7.533283233642578, "step": 55 }, { "epoch": 0.08223201174743025, "grad_norm": 48.09397506713867, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.5945051908493042, "logits/rejected": -0.5514425039291382, "logps/chosen": -58.66962432861328, "logps/ref_chosen": -57.93273162841797, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -101.10653686523438, "loss": 0.9775, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.36780592799186707, "margin_dpo/beta_margin_grad_std": 0.14850637316703796, "margin_dpo/beta_margin_mean": 0.6195200085639954, "margin_dpo/beta_margin_std": 0.7477858066558838, "margin_dpo/loss_margin_mean": 6.195199966430664, "margin_dpo/margin_mean": 6.195199012756348, "margin_dpo/margin_std": 7.399816989898682, "step": 56 }, { "epoch": 0.08370044052863436, "grad_norm": 54.234169006347656, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.5641357898712158, "logits/rejected": -0.5353480577468872, "logps/chosen": -71.26276397705078, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -103.23522186279297, "loss": 0.9078, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.34864187240600586, "margin_dpo/beta_margin_grad_std": 0.13589681684970856, "margin_dpo/beta_margin_mean": 0.6902284026145935, "margin_dpo/beta_margin_std": 0.6726579070091248, "margin_dpo/loss_margin_mean": 6.902284145355225, "margin_dpo/margin_mean": 6.902284145355225, "margin_dpo/margin_std": 6.639451026916504, "step": 57 }, { "epoch": 0.08516886930983847, "grad_norm": 59.243927001953125, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.5894064903259277, "logits/rejected": -0.5127171874046326, "logps/chosen": -63.23316955566406, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -93.32413482666016, "loss": 0.8977, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3427189290523529, "margin_dpo/beta_margin_grad_std": 0.15285082161426544, "margin_dpo/beta_margin_mean": 0.7606607675552368, "margin_dpo/beta_margin_std": 0.8165130615234375, "margin_dpo/loss_margin_mean": 7.606607437133789, "margin_dpo/margin_mean": 7.606607437133789, "margin_dpo/margin_std": 8.09335708618164, "step": 58 }, { "epoch": 0.08663729809104258, "grad_norm": 55.42934799194336, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6423487663269043, "logits/rejected": -0.6032625436782837, "logps/chosen": -53.42650604248047, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -98.86468505859375, "loss": 0.8575, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.32869505882263184, "margin_dpo/beta_margin_grad_std": 0.15742561221122742, "margin_dpo/beta_margin_mean": 0.8485509157180786, "margin_dpo/beta_margin_std": 0.8816735148429871, "margin_dpo/loss_margin_mean": 8.485508918762207, "margin_dpo/margin_mean": 8.485508918762207, "margin_dpo/margin_std": 8.604471206665039, "step": 59 }, { "epoch": 0.0881057268722467, "grad_norm": 64.29039764404297, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.6296500563621521, "logits/rejected": -0.5711052417755127, "logps/chosen": -63.62670135498047, "logps/ref_chosen": -60.94218444824219, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -94.76435089111328, "loss": 0.9555, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3545895218849182, "margin_dpo/beta_margin_grad_std": 0.15808549523353577, "margin_dpo/beta_margin_mean": 0.6686438918113708, "margin_dpo/beta_margin_std": 0.7756204009056091, "margin_dpo/loss_margin_mean": 6.686439037322998, "margin_dpo/margin_mean": 6.686439514160156, "margin_dpo/margin_std": 7.678452968597412, "step": 60 }, { "epoch": 0.08957415565345081, "grad_norm": 54.964107513427734, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.6372621655464172, "logits/rejected": -0.6041065454483032, "logps/chosen": -62.14350128173828, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -99.61428833007812, "loss": 0.9341, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.34661665558815, "margin_dpo/beta_margin_grad_std": 0.1781352162361145, "margin_dpo/beta_margin_mean": 0.8251805901527405, "margin_dpo/beta_margin_std": 1.1422574520111084, "margin_dpo/loss_margin_mean": 8.251806259155273, "margin_dpo/margin_mean": 8.251806259155273, "margin_dpo/margin_std": 11.240764617919922, "step": 61 }, { "epoch": 0.09104258443465492, "grad_norm": 58.057708740234375, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.6090478897094727, "logits/rejected": -0.5749986171722412, "logps/chosen": -57.778465270996094, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -83.39352416992188, "loss": 0.9993, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.37108179926872253, "margin_dpo/beta_margin_grad_std": 0.15791070461273193, "margin_dpo/beta_margin_mean": 0.6199632883071899, "margin_dpo/beta_margin_std": 0.8312649130821228, "margin_dpo/loss_margin_mean": 6.19963264465332, "margin_dpo/margin_mean": 6.19963264465332, "margin_dpo/margin_std": 8.127958297729492, "step": 62 }, { "epoch": 0.09251101321585903, "grad_norm": 56.769561767578125, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.5860311388969421, "logits/rejected": -0.5402973890304565, "logps/chosen": -75.79495239257812, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -108.62382507324219, "loss": 0.8773, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3284067213535309, "margin_dpo/beta_margin_grad_std": 0.16741114854812622, "margin_dpo/beta_margin_mean": 0.8366211652755737, "margin_dpo/beta_margin_std": 0.9040850400924683, "margin_dpo/loss_margin_mean": 8.366211891174316, "margin_dpo/margin_mean": 8.366212844848633, "margin_dpo/margin_std": 8.857807159423828, "step": 63 }, { "epoch": 0.09397944199706314, "grad_norm": 52.091590881347656, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.5791685581207275, "logits/rejected": -0.5466402769088745, "logps/chosen": -55.00431823730469, "logps/ref_chosen": -53.99859619140625, "logps/ref_rejected": -93.53020477294922, "logps/rejected": -104.35765075683594, "loss": 0.8493, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3144356906414032, "margin_dpo/beta_margin_grad_std": 0.1805381327867508, "margin_dpo/beta_margin_mean": 0.9821729063987732, "margin_dpo/beta_margin_std": 1.1361504793167114, "margin_dpo/loss_margin_mean": 9.821728706359863, "margin_dpo/margin_mean": 9.82172966003418, "margin_dpo/margin_std": 11.043643951416016, "step": 64 }, { "epoch": 0.09544787077826726, "grad_norm": 54.09811782836914, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6608457565307617, "logits/rejected": -0.6478947401046753, "logps/chosen": -68.0100326538086, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -122.96417236328125, "loss": 0.8585, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.31038472056388855, "margin_dpo/beta_margin_grad_std": 0.18928615748882294, "margin_dpo/beta_margin_mean": 0.9843679666519165, "margin_dpo/beta_margin_std": 1.1074903011322021, "margin_dpo/loss_margin_mean": 9.843679428100586, "margin_dpo/margin_mean": 9.843679428100586, "margin_dpo/margin_std": 10.951974868774414, "step": 65 }, { "epoch": 0.09691629955947137, "grad_norm": 52.60911560058594, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6474887132644653, "logits/rejected": -0.6150294542312622, "logps/chosen": -54.36174011230469, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629150390625, "logps/rejected": -87.54934692382812, "loss": 0.8859, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3307109475135803, "margin_dpo/beta_margin_grad_std": 0.1775081753730774, "margin_dpo/beta_margin_mean": 0.8994826078414917, "margin_dpo/beta_margin_std": 1.1073994636535645, "margin_dpo/loss_margin_mean": 8.99482536315918, "margin_dpo/margin_mean": 8.99482536315918, "margin_dpo/margin_std": 10.87942123413086, "step": 66 }, { "epoch": 0.09838472834067548, "grad_norm": 52.46964645385742, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.5966418981552124, "logits/rejected": -0.5537301301956177, "logps/chosen": -61.81807327270508, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78729248046875, "logps/rejected": -84.54171752929688, "loss": 0.8693, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.32543134689331055, "margin_dpo/beta_margin_grad_std": 0.17773009836673737, "margin_dpo/beta_margin_mean": 0.9277163147926331, "margin_dpo/beta_margin_std": 1.1012858152389526, "margin_dpo/loss_margin_mean": 9.2771635055542, "margin_dpo/margin_mean": 9.277162551879883, "margin_dpo/margin_std": 10.92019271850586, "step": 67 }, { "epoch": 0.09985315712187959, "grad_norm": 52.40779113769531, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6349166631698608, "logits/rejected": -0.5751150250434875, "logps/chosen": -67.98988342285156, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -88.71192932128906, "loss": 0.8459, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3253341615200043, "margin_dpo/beta_margin_grad_std": 0.15630275011062622, "margin_dpo/beta_margin_mean": 0.8720625042915344, "margin_dpo/beta_margin_std": 0.9045540690422058, "margin_dpo/loss_margin_mean": 8.720624923706055, "margin_dpo/margin_mean": 8.720624923706055, "margin_dpo/margin_std": 8.963220596313477, "step": 68 }, { "epoch": 0.1013215859030837, "grad_norm": 53.23897933959961, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.6085792183876038, "logits/rejected": -0.5847188234329224, "logps/chosen": -62.99334716796875, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -116.94822692871094, "loss": 0.7777, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30277135968208313, "margin_dpo/beta_margin_grad_std": 0.16077764332294464, "margin_dpo/beta_margin_mean": 1.0385247468948364, "margin_dpo/beta_margin_std": 1.040853500366211, "margin_dpo/loss_margin_mean": 10.385248184204102, "margin_dpo/margin_mean": 10.385248184204102, "margin_dpo/margin_std": 10.297136306762695, "step": 69 }, { "epoch": 0.1027900146842878, "grad_norm": 59.40316390991211, "learning_rate": 5e-07, "logits/chosen": -0.625554621219635, "logits/rejected": -0.5908818244934082, "logps/chosen": -66.4103012084961, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.060791015625, "logps/rejected": -106.7230453491211, "loss": 0.7928, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30000537633895874, "margin_dpo/beta_margin_grad_std": 0.18498124182224274, "margin_dpo/beta_margin_mean": 1.118260145187378, "margin_dpo/beta_margin_std": 1.199088215827942, "margin_dpo/loss_margin_mean": 11.182600975036621, "margin_dpo/margin_mean": 11.182600975036621, "margin_dpo/margin_std": 11.917827606201172, "step": 70 }, { "epoch": 0.10425844346549193, "grad_norm": 49.68572998046875, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6752599477767944, "logits/rejected": -0.6361984014511108, "logps/chosen": -65.69276428222656, "logps/ref_chosen": -61.750343322753906, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -114.14346313476562, "loss": 0.702, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27304375171661377, "margin_dpo/beta_margin_grad_std": 0.16720205545425415, "margin_dpo/beta_margin_mean": 1.2864418029785156, "margin_dpo/beta_margin_std": 1.286440372467041, "margin_dpo/loss_margin_mean": 12.86441707611084, "margin_dpo/margin_mean": 12.864418029785156, "margin_dpo/margin_std": 12.424565315246582, "step": 71 }, { "epoch": 0.10572687224669604, "grad_norm": 59.574241638183594, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6442112922668457, "logits/rejected": -0.6080772280693054, "logps/chosen": -70.28240966796875, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -112.89981079101562, "loss": 0.7297, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2696942389011383, "margin_dpo/beta_margin_grad_std": 0.19517795741558075, "margin_dpo/beta_margin_mean": 1.3383830785751343, "margin_dpo/beta_margin_std": 1.3651797771453857, "margin_dpo/loss_margin_mean": 13.383831024169922, "margin_dpo/margin_mean": 13.383831024169922, "margin_dpo/margin_std": 13.636287689208984, "step": 72 }, { "epoch": 0.10719530102790015, "grad_norm": 76.11861419677734, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6918191909790039, "logits/rejected": -0.6510320901870728, "logps/chosen": -72.03385162353516, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613861083984, "logps/rejected": -109.28495788574219, "loss": 0.9513, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.31014135479927063, "margin_dpo/beta_margin_grad_std": 0.24917910993099213, "margin_dpo/beta_margin_mean": 1.3051246404647827, "margin_dpo/beta_margin_std": 1.8701282739639282, "margin_dpo/loss_margin_mean": 13.051246643066406, "margin_dpo/margin_mean": 13.051246643066406, "margin_dpo/margin_std": 18.630680084228516, "step": 73 }, { "epoch": 0.10866372980910426, "grad_norm": 71.0533676147461, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6222573518753052, "logits/rejected": -0.6104036569595337, "logps/chosen": -59.54771423339844, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -115.84016418457031, "loss": 0.8775, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28980112075805664, "margin_dpo/beta_margin_grad_std": 0.22041486203670502, "margin_dpo/beta_margin_mean": 1.3770397901535034, "margin_dpo/beta_margin_std": 2.0495500564575195, "margin_dpo/loss_margin_mean": 13.770398139953613, "margin_dpo/margin_mean": 13.770397186279297, "margin_dpo/margin_std": 20.299190521240234, "step": 74 }, { "epoch": 0.11013215859030837, "grad_norm": 50.546207427978516, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6528257131576538, "logits/rejected": -0.6429094672203064, "logps/chosen": -57.421756744384766, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25397491455078, "logps/rejected": -135.87710571289062, "loss": 0.6084, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22819584608078003, "margin_dpo/beta_margin_grad_std": 0.2010163813829422, "margin_dpo/beta_margin_mean": 1.90630304813385, "margin_dpo/beta_margin_std": 1.8703465461730957, "margin_dpo/loss_margin_mean": 19.063030242919922, "margin_dpo/margin_mean": 19.06302833557129, "margin_dpo/margin_std": 18.35777473449707, "step": 75 }, { "epoch": 0.11160058737151249, "grad_norm": 63.239871978759766, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.7003756165504456, "logits/rejected": -0.6578394770622253, "logps/chosen": -59.91857147216797, "logps/ref_chosen": -53.26604080200195, "logps/ref_rejected": -78.21662139892578, "logps/rejected": -97.16926574707031, "loss": 0.8122, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28062668442726135, "margin_dpo/beta_margin_grad_std": 0.20105010271072388, "margin_dpo/beta_margin_mean": 1.2300118207931519, "margin_dpo/beta_margin_std": 1.421257495880127, "margin_dpo/loss_margin_mean": 12.300118446350098, "margin_dpo/margin_mean": 12.300118446350098, "margin_dpo/margin_std": 14.157339096069336, "step": 76 }, { "epoch": 0.1130690161527166, "grad_norm": 78.45389556884766, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.6659625768661499, "logits/rejected": -0.6236972212791443, "logps/chosen": -63.619422912597656, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -118.58006286621094, "loss": 0.6829, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2217966765165329, "margin_dpo/beta_margin_grad_std": 0.22179701924324036, "margin_dpo/beta_margin_mean": 1.9283708333969116, "margin_dpo/beta_margin_std": 1.9269988536834717, "margin_dpo/loss_margin_mean": 19.283708572387695, "margin_dpo/margin_mean": 19.283706665039062, "margin_dpo/margin_std": 19.11894989013672, "step": 77 }, { "epoch": 0.1145374449339207, "grad_norm": 66.56047058105469, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6366250514984131, "logits/rejected": -0.6083469986915588, "logps/chosen": -60.89007568359375, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -104.90266418457031, "loss": 0.7296, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2656141221523285, "margin_dpo/beta_margin_grad_std": 0.21040529012680054, "margin_dpo/beta_margin_mean": 1.4692002534866333, "margin_dpo/beta_margin_std": 1.5563766956329346, "margin_dpo/loss_margin_mean": 14.692002296447754, "margin_dpo/margin_mean": 14.69200325012207, "margin_dpo/margin_std": 15.322187423706055, "step": 78 }, { "epoch": 0.11600587371512482, "grad_norm": 59.296844482421875, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.6760110855102539, "logits/rejected": -0.6464430093765259, "logps/chosen": -60.498695373535156, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -108.78245544433594, "loss": 0.7766, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2785935699939728, "margin_dpo/beta_margin_grad_std": 0.22844330966472626, "margin_dpo/beta_margin_mean": 1.6086679697036743, "margin_dpo/beta_margin_std": 1.9045932292938232, "margin_dpo/loss_margin_mean": 16.086679458618164, "margin_dpo/margin_mean": 16.086679458618164, "margin_dpo/margin_std": 18.848827362060547, "step": 79 }, { "epoch": 0.11747430249632893, "grad_norm": 63.66164016723633, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.7178832292556763, "logits/rejected": -0.6710443496704102, "logps/chosen": -65.51264190673828, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -108.77944946289062, "loss": 0.8355, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.293730765581131, "margin_dpo/beta_margin_grad_std": 0.22851316630840302, "margin_dpo/beta_margin_mean": 1.42385995388031, "margin_dpo/beta_margin_std": 1.8568590879440308, "margin_dpo/loss_margin_mean": 14.238598823547363, "margin_dpo/margin_mean": 14.238598823547363, "margin_dpo/margin_std": 17.483436584472656, "step": 80 }, { "epoch": 0.11894273127753303, "grad_norm": 72.20431518554688, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.7084971070289612, "logits/rejected": -0.6748213171958923, "logps/chosen": -72.02912902832031, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -153.9308624267578, "loss": 0.6904, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24315199255943298, "margin_dpo/beta_margin_grad_std": 0.22722284495830536, "margin_dpo/beta_margin_mean": 1.8687902688980103, "margin_dpo/beta_margin_std": 2.0691728591918945, "margin_dpo/loss_margin_mean": 18.687902450561523, "margin_dpo/margin_mean": 18.687902450561523, "margin_dpo/margin_std": 20.542957305908203, "step": 81 }, { "epoch": 0.12041116005873716, "grad_norm": 87.32213592529297, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6286877393722534, "logits/rejected": -0.6112765073776245, "logps/chosen": -59.366302490234375, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -112.9305419921875, "loss": 0.9632, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2833250164985657, "margin_dpo/beta_margin_grad_std": 0.2522350549697876, "margin_dpo/beta_margin_mean": 1.5053365230560303, "margin_dpo/beta_margin_std": 2.1997368335723877, "margin_dpo/loss_margin_mean": 15.053364753723145, "margin_dpo/margin_mean": 15.053365707397461, "margin_dpo/margin_std": 21.363815307617188, "step": 82 }, { "epoch": 0.12187958883994127, "grad_norm": 67.5205307006836, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6808498501777649, "logits/rejected": -0.6353092193603516, "logps/chosen": -62.771873474121094, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71590423583984, "logps/rejected": -127.50509643554688, "loss": 0.7422, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2590833604335785, "margin_dpo/beta_margin_grad_std": 0.22852419316768646, "margin_dpo/beta_margin_mean": 1.6844712495803833, "margin_dpo/beta_margin_std": 1.8570791482925415, "margin_dpo/loss_margin_mean": 16.844711303710938, "margin_dpo/margin_mean": 16.844711303710938, "margin_dpo/margin_std": 18.56102752685547, "step": 83 }, { "epoch": 0.12334801762114538, "grad_norm": 58.463897705078125, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6490943431854248, "logits/rejected": -0.6113982200622559, "logps/chosen": -72.32835388183594, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -107.63688659667969, "loss": 0.6737, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.244083434343338, "margin_dpo/beta_margin_grad_std": 0.19765815138816833, "margin_dpo/beta_margin_mean": 1.5186108350753784, "margin_dpo/beta_margin_std": 1.4150245189666748, "margin_dpo/loss_margin_mean": 15.186108589172363, "margin_dpo/margin_mean": 15.18610954284668, "margin_dpo/margin_std": 13.861265182495117, "step": 84 }, { "epoch": 0.12481644640234948, "grad_norm": 64.56900787353516, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6331249475479126, "logits/rejected": -0.6021745204925537, "logps/chosen": -64.1954345703125, "logps/ref_chosen": -58.406620025634766, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -98.99234008789062, "loss": 0.7715, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27991896867752075, "margin_dpo/beta_margin_grad_std": 0.2167506366968155, "margin_dpo/beta_margin_mean": 1.4564720392227173, "margin_dpo/beta_margin_std": 1.6396623849868774, "margin_dpo/loss_margin_mean": 14.564720153808594, "margin_dpo/margin_mean": 14.56472110748291, "margin_dpo/margin_std": 15.904397010803223, "step": 85 }, { "epoch": 0.1262848751835536, "grad_norm": 117.56181335449219, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6592001914978027, "logits/rejected": -0.6417681574821472, "logps/chosen": -63.10496520996094, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -110.20996856689453, "loss": 0.9186, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2995266318321228, "margin_dpo/beta_margin_grad_std": 0.24782723188400269, "margin_dpo/beta_margin_mean": 1.5120818614959717, "margin_dpo/beta_margin_std": 2.213315725326538, "margin_dpo/loss_margin_mean": 15.120819091796875, "margin_dpo/margin_mean": 15.120819091796875, "margin_dpo/margin_std": 21.751773834228516, "step": 86 }, { "epoch": 0.1277533039647577, "grad_norm": 66.58505249023438, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.7072494626045227, "logits/rejected": -0.670096755027771, "logps/chosen": -62.457305908203125, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -118.72473907470703, "loss": 0.7859, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2566095292568207, "margin_dpo/beta_margin_grad_std": 0.23276211321353912, "margin_dpo/beta_margin_mean": 1.6435949802398682, "margin_dpo/beta_margin_std": 1.9270051717758179, "margin_dpo/loss_margin_mean": 16.435949325561523, "margin_dpo/margin_mean": 16.435949325561523, "margin_dpo/margin_std": 18.915019989013672, "step": 87 }, { "epoch": 0.12922173274596183, "grad_norm": 75.93292999267578, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.6478073596954346, "logits/rejected": -0.6310935020446777, "logps/chosen": -82.13240051269531, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -127.77642822265625, "loss": 0.9197, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3014436364173889, "margin_dpo/beta_margin_grad_std": 0.23827531933784485, "margin_dpo/beta_margin_mean": 1.260668158531189, "margin_dpo/beta_margin_std": 1.7176685333251953, "margin_dpo/loss_margin_mean": 12.606681823730469, "margin_dpo/margin_mean": 12.606681823730469, "margin_dpo/margin_std": 15.93301773071289, "step": 88 }, { "epoch": 0.13069016152716592, "grad_norm": 56.17230224609375, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.6928755640983582, "logits/rejected": -0.6521140336990356, "logps/chosen": -65.11277770996094, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -99.52984619140625, "loss": 0.7412, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27584946155548096, "margin_dpo/beta_margin_grad_std": 0.18205879628658295, "margin_dpo/beta_margin_mean": 1.2958422899246216, "margin_dpo/beta_margin_std": 1.4058305025100708, "margin_dpo/loss_margin_mean": 12.958422660827637, "margin_dpo/margin_mean": 12.958423614501953, "margin_dpo/margin_std": 13.854536056518555, "step": 89 }, { "epoch": 0.13215859030837004, "grad_norm": 66.36186981201172, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6513394713401794, "logits/rejected": -0.6424415111541748, "logps/chosen": -59.482887268066406, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -133.55593872070312, "loss": 0.8411, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2849555015563965, "margin_dpo/beta_margin_grad_std": 0.22966991364955902, "margin_dpo/beta_margin_mean": 1.5813645124435425, "margin_dpo/beta_margin_std": 2.088043451309204, "margin_dpo/loss_margin_mean": 15.813644409179688, "margin_dpo/margin_mean": 15.813644409179688, "margin_dpo/margin_std": 20.459163665771484, "step": 90 }, { "epoch": 0.13362701908957417, "grad_norm": 65.70479583740234, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.695541262626648, "logits/rejected": -0.6568491458892822, "logps/chosen": -67.02444458007812, "logps/ref_chosen": -61.80295944213867, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -108.97652435302734, "loss": 0.762, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27236229181289673, "margin_dpo/beta_margin_grad_std": 0.23041805624961853, "margin_dpo/beta_margin_mean": 1.588107705116272, "margin_dpo/beta_margin_std": 1.7729498147964478, "margin_dpo/loss_margin_mean": 15.88107681274414, "margin_dpo/margin_mean": 15.881075859069824, "margin_dpo/margin_std": 17.554851531982422, "step": 91 }, { "epoch": 0.13509544787077826, "grad_norm": 60.354248046875, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.7051235437393188, "logits/rejected": -0.6763289570808411, "logps/chosen": -56.71138000488281, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -97.6747055053711, "loss": 0.8107, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2847803235054016, "margin_dpo/beta_margin_grad_std": 0.230974480509758, "margin_dpo/beta_margin_mean": 1.4722909927368164, "margin_dpo/beta_margin_std": 1.7639739513397217, "margin_dpo/loss_margin_mean": 14.722909927368164, "margin_dpo/margin_mean": 14.722909927368164, "margin_dpo/margin_std": 17.423236846923828, "step": 92 }, { "epoch": 0.13656387665198239, "grad_norm": 48.63566589355469, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.7185451984405518, "logits/rejected": -0.6557145714759827, "logps/chosen": -57.98681640625, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.1607437133789, "logps/rejected": -97.54901123046875, "loss": 0.6862, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25878626108169556, "margin_dpo/beta_margin_grad_std": 0.19036650657653809, "margin_dpo/beta_margin_mean": 1.4930684566497803, "margin_dpo/beta_margin_std": 1.5610140562057495, "margin_dpo/loss_margin_mean": 14.930684089660645, "margin_dpo/margin_mean": 14.930685043334961, "margin_dpo/margin_std": 15.499519348144531, "step": 93 }, { "epoch": 0.13803230543318648, "grad_norm": 51.54408264160156, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6754232048988342, "logits/rejected": -0.6463443040847778, "logps/chosen": -67.19898986816406, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -121.30268859863281, "loss": 0.6489, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2494257092475891, "margin_dpo/beta_margin_grad_std": 0.1913631409406662, "margin_dpo/beta_margin_mean": 1.5727283954620361, "margin_dpo/beta_margin_std": 1.4842208623886108, "margin_dpo/loss_margin_mean": 15.72728443145752, "margin_dpo/margin_mean": 15.727283477783203, "margin_dpo/margin_std": 14.665702819824219, "step": 94 }, { "epoch": 0.1395007342143906, "grad_norm": 49.133419036865234, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.7007203102111816, "logits/rejected": -0.6629537343978882, "logps/chosen": -57.13197326660156, "logps/ref_chosen": -52.52364730834961, "logps/ref_rejected": -75.88035583496094, "logps/rejected": -93.41667175292969, "loss": 0.7168, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26963499188423157, "margin_dpo/beta_margin_grad_std": 0.18063600361347198, "margin_dpo/beta_margin_mean": 1.2927991151809692, "margin_dpo/beta_margin_std": 1.250800371170044, "margin_dpo/loss_margin_mean": 12.927990913391113, "margin_dpo/margin_mean": 12.927990913391113, "margin_dpo/margin_std": 12.453845977783203, "step": 95 }, { "epoch": 0.14096916299559473, "grad_norm": 50.28048324584961, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.6796199083328247, "logits/rejected": -0.6378945708274841, "logps/chosen": -65.92119598388672, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -117.46200561523438, "loss": 0.6242, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23479405045509338, "margin_dpo/beta_margin_grad_std": 0.18774589896202087, "margin_dpo/beta_margin_mean": 1.7101774215698242, "margin_dpo/beta_margin_std": 1.7064077854156494, "margin_dpo/loss_margin_mean": 17.101774215698242, "margin_dpo/margin_mean": 17.10177230834961, "margin_dpo/margin_std": 16.97222328186035, "step": 96 }, { "epoch": 0.14243759177679882, "grad_norm": 53.316551208496094, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6628963947296143, "logits/rejected": -0.6078641414642334, "logps/chosen": -59.297088623046875, "logps/ref_chosen": -54.64636993408203, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -95.37380981445312, "loss": 0.6845, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26004087924957275, "margin_dpo/beta_margin_grad_std": 0.18338225781917572, "margin_dpo/beta_margin_mean": 1.375833511352539, "margin_dpo/beta_margin_std": 1.2895034551620483, "margin_dpo/loss_margin_mean": 13.758334159851074, "margin_dpo/margin_mean": 13.75833511352539, "margin_dpo/margin_std": 12.287176132202148, "step": 97 }, { "epoch": 0.14390602055800295, "grad_norm": 59.0107536315918, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6648178100585938, "logits/rejected": -0.6405047178268433, "logps/chosen": -71.05479431152344, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -117.12971496582031, "loss": 0.748, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25851550698280334, "margin_dpo/beta_margin_grad_std": 0.21575090289115906, "margin_dpo/beta_margin_mean": 1.480607032775879, "margin_dpo/beta_margin_std": 1.5301272869110107, "margin_dpo/loss_margin_mean": 14.806070327758789, "margin_dpo/margin_mean": 14.806069374084473, "margin_dpo/margin_std": 15.232458114624023, "step": 98 }, { "epoch": 0.14537444933920704, "grad_norm": 48.36380386352539, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.6831210851669312, "logits/rejected": -0.6707972884178162, "logps/chosen": -50.50402069091797, "logps/ref_chosen": -45.63848114013672, "logps/ref_rejected": -86.43792724609375, "logps/rejected": -107.33246612548828, "loss": 0.6371, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24989381432533264, "margin_dpo/beta_margin_grad_std": 0.17566484212875366, "margin_dpo/beta_margin_mean": 1.6028999090194702, "margin_dpo/beta_margin_std": 1.6380233764648438, "margin_dpo/loss_margin_mean": 16.02899932861328, "margin_dpo/margin_mean": 16.02899932861328, "margin_dpo/margin_std": 16.368377685546875, "step": 99 }, { "epoch": 0.14684287812041116, "grad_norm": 66.3724365234375, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6820343732833862, "logits/rejected": -0.6383761167526245, "logps/chosen": -62.664703369140625, "logps/ref_chosen": -57.5939826965332, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -90.71258544921875, "loss": 0.9045, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30843135714530945, "margin_dpo/beta_margin_grad_std": 0.22273820638656616, "margin_dpo/beta_margin_mean": 1.158165693283081, "margin_dpo/beta_margin_std": 1.5505050420761108, "margin_dpo/loss_margin_mean": 11.581656455993652, "margin_dpo/margin_mean": 11.581655502319336, "margin_dpo/margin_std": 15.148920059204102, "step": 100 }, { "epoch": 0.14684287812041116, "eval_logits/chosen": -0.64442378282547, "eval_logits/rejected": -0.6169079542160034, "eval_logps/chosen": -87.21427917480469, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -103.4049301147461, "eval_loss": 0.5612262487411499, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.367234468460083, "eval_margin_dpo/beta_margin_grad_std": 0.23124347627162933, "eval_margin_dpo/beta_margin_mean": 0.8443758487701416, "eval_margin_dpo/beta_margin_std": 1.5440738201141357, "eval_margin_dpo/loss_margin_mean": 8.443757057189941, "eval_margin_dpo/margin_mean": 8.443758010864258, "eval_margin_dpo/margin_std": 15.440738677978516, "eval_runtime": 40.1706, "eval_samples_per_second": 58.227, "eval_steps_per_second": 1.842, "step": 100 }, { "epoch": 0.14831130690161526, "grad_norm": 59.758052825927734, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.6818881630897522, "logits/rejected": -0.6413546800613403, "logps/chosen": -67.38121032714844, "logps/ref_chosen": -61.64884948730469, "logps/ref_rejected": -83.18968963623047, "logps/rejected": -102.2203598022461, "loss": 0.7943, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.29040390253067017, "margin_dpo/beta_margin_grad_std": 0.208379328250885, "margin_dpo/beta_margin_mean": 1.3298306465148926, "margin_dpo/beta_margin_std": 1.6011595726013184, "margin_dpo/loss_margin_mean": 13.298306465148926, "margin_dpo/margin_mean": 13.298306465148926, "margin_dpo/margin_std": 15.829672813415527, "step": 101 }, { "epoch": 0.14977973568281938, "grad_norm": 70.77701568603516, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.6993674039840698, "logits/rejected": -0.6509321928024292, "logps/chosen": -71.01653289794922, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -85.41771697998047, "loss": 0.9392, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.32659777998924255, "margin_dpo/beta_margin_grad_std": 0.22610485553741455, "margin_dpo/beta_margin_mean": 1.0292984247207642, "margin_dpo/beta_margin_std": 1.4347975254058838, "margin_dpo/loss_margin_mean": 10.292984008789062, "margin_dpo/margin_mean": 10.292984008789062, "margin_dpo/margin_std": 14.034963607788086, "step": 102 }, { "epoch": 0.1512481644640235, "grad_norm": 45.32851028442383, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6965250968933105, "logits/rejected": -0.6588256359100342, "logps/chosen": -64.84249877929688, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57271575927734, "logps/rejected": -115.04621887207031, "loss": 0.5497, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21648754179477692, "margin_dpo/beta_margin_grad_std": 0.17404885590076447, "margin_dpo/beta_margin_mean": 1.7930291891098022, "margin_dpo/beta_margin_std": 1.5200693607330322, "margin_dpo/loss_margin_mean": 17.9302921295166, "margin_dpo/margin_mean": 17.93029022216797, "margin_dpo/margin_std": 15.179329872131348, "step": 103 }, { "epoch": 0.1527165932452276, "grad_norm": 53.074501037597656, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.711995005607605, "logits/rejected": -0.6668508052825928, "logps/chosen": -59.31719207763672, "logps/ref_chosen": -54.37277603149414, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -111.41394805908203, "loss": 0.6775, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25234103202819824, "margin_dpo/beta_margin_grad_std": 0.20327746868133545, "margin_dpo/beta_margin_mean": 1.6904809474945068, "margin_dpo/beta_margin_std": 1.7527903318405151, "margin_dpo/loss_margin_mean": 16.904809951782227, "margin_dpo/margin_mean": 16.904808044433594, "margin_dpo/margin_std": 17.310930252075195, "step": 104 }, { "epoch": 0.15418502202643172, "grad_norm": 39.851192474365234, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.7239284515380859, "logits/rejected": -0.6849699020385742, "logps/chosen": -58.359535217285156, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -124.3260498046875, "loss": 0.4216, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16937798261642456, "margin_dpo/beta_margin_grad_std": 0.16204456984996796, "margin_dpo/beta_margin_mean": 2.2631936073303223, "margin_dpo/beta_margin_std": 1.6875535249710083, "margin_dpo/loss_margin_mean": 22.63193702697754, "margin_dpo/margin_mean": 22.63193702697754, "margin_dpo/margin_std": 16.76972007751465, "step": 105 }, { "epoch": 0.15565345080763582, "grad_norm": 47.982364654541016, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6622617840766907, "logits/rejected": -0.6061959266662598, "logps/chosen": -59.73749923706055, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -104.90985107421875, "loss": 0.6834, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2569577693939209, "margin_dpo/beta_margin_grad_std": 0.19905740022659302, "margin_dpo/beta_margin_mean": 1.4780631065368652, "margin_dpo/beta_margin_std": 1.442070722579956, "margin_dpo/loss_margin_mean": 14.780631065368652, "margin_dpo/margin_mean": 14.780631065368652, "margin_dpo/margin_std": 14.37628173828125, "step": 106 }, { "epoch": 0.15712187958883994, "grad_norm": 48.62154006958008, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.7029379606246948, "logits/rejected": -0.6468302011489868, "logps/chosen": -75.64985656738281, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -117.44454956054688, "loss": 0.6434, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23800994455814362, "margin_dpo/beta_margin_grad_std": 0.20428350567817688, "margin_dpo/beta_margin_mean": 1.6762993335723877, "margin_dpo/beta_margin_std": 1.5433250665664673, "margin_dpo/loss_margin_mean": 16.76299285888672, "margin_dpo/margin_mean": 16.76299285888672, "margin_dpo/margin_std": 15.39747428894043, "step": 107 }, { "epoch": 0.15859030837004406, "grad_norm": 59.20159912109375, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.7281018495559692, "logits/rejected": -0.6796263456344604, "logps/chosen": -61.804962158203125, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -118.76254272460938, "loss": 0.7472, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26712775230407715, "margin_dpo/beta_margin_grad_std": 0.22972996532917023, "margin_dpo/beta_margin_mean": 1.7171452045440674, "margin_dpo/beta_margin_std": 1.924187183380127, "margin_dpo/loss_margin_mean": 17.171451568603516, "margin_dpo/margin_mean": 17.171451568603516, "margin_dpo/margin_std": 19.11905288696289, "step": 108 }, { "epoch": 0.16005873715124816, "grad_norm": 49.94407272338867, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.6409512758255005, "logits/rejected": -0.6109081506729126, "logps/chosen": -68.07378387451172, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -122.75498962402344, "loss": 0.5514, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21403388679027557, "margin_dpo/beta_margin_grad_std": 0.18927177786827087, "margin_dpo/beta_margin_mean": 1.8880969285964966, "margin_dpo/beta_margin_std": 1.5617107152938843, "margin_dpo/loss_margin_mean": 18.880970001220703, "margin_dpo/margin_mean": 18.880970001220703, "margin_dpo/margin_std": 15.536466598510742, "step": 109 }, { "epoch": 0.16152716593245228, "grad_norm": 51.64780807495117, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.6488534212112427, "logits/rejected": -0.593596339225769, "logps/chosen": -63.12610626220703, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -101.90496826171875, "loss": 0.6812, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23650266230106354, "margin_dpo/beta_margin_grad_std": 0.2127843052148819, "margin_dpo/beta_margin_mean": 1.7782528400421143, "margin_dpo/beta_margin_std": 1.8577708005905151, "margin_dpo/loss_margin_mean": 17.782527923583984, "margin_dpo/margin_mean": 17.782527923583984, "margin_dpo/margin_std": 18.085121154785156, "step": 110 }, { "epoch": 0.16299559471365638, "grad_norm": 53.70595169067383, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.7132564187049866, "logits/rejected": -0.6853828430175781, "logps/chosen": -75.00138092041016, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -123.1148452758789, "loss": 0.6606, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23275849223136902, "margin_dpo/beta_margin_grad_std": 0.2126443088054657, "margin_dpo/beta_margin_mean": 1.8308711051940918, "margin_dpo/beta_margin_std": 1.8513504266738892, "margin_dpo/loss_margin_mean": 18.3087100982666, "margin_dpo/margin_mean": 18.3087100982666, "margin_dpo/margin_std": 18.054677963256836, "step": 111 }, { "epoch": 0.1644640234948605, "grad_norm": 61.954097747802734, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6944275498390198, "logits/rejected": -0.6399896740913391, "logps/chosen": -56.5648193359375, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -84.34908294677734, "loss": 0.8098, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28910669684410095, "margin_dpo/beta_margin_grad_std": 0.2169356793165207, "margin_dpo/beta_margin_mean": 1.3259477615356445, "margin_dpo/beta_margin_std": 1.5364657640457153, "margin_dpo/loss_margin_mean": 13.259476661682129, "margin_dpo/margin_mean": 13.259477615356445, "margin_dpo/margin_std": 15.114201545715332, "step": 112 }, { "epoch": 0.16593245227606462, "grad_norm": 51.86798858642578, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.6678988933563232, "logits/rejected": -0.6150726079940796, "logps/chosen": -72.72209167480469, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -98.91563415527344, "loss": 0.6636, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25216349959373474, "margin_dpo/beta_margin_grad_std": 0.19559957087039948, "margin_dpo/beta_margin_mean": 1.4937783479690552, "margin_dpo/beta_margin_std": 1.3543710708618164, "margin_dpo/loss_margin_mean": 14.937784194946289, "margin_dpo/margin_mean": 14.937784194946289, "margin_dpo/margin_std": 13.522451400756836, "step": 113 }, { "epoch": 0.16740088105726872, "grad_norm": 48.08191680908203, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.6919864416122437, "logits/rejected": -0.644797682762146, "logps/chosen": -63.47439193725586, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -113.00218200683594, "loss": 0.5974, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22666409611701965, "margin_dpo/beta_margin_grad_std": 0.20277349650859833, "margin_dpo/beta_margin_mean": 2.020320415496826, "margin_dpo/beta_margin_std": 2.0695791244506836, "margin_dpo/loss_margin_mean": 20.203205108642578, "margin_dpo/margin_mean": 20.203205108642578, "margin_dpo/margin_std": 20.269628524780273, "step": 114 }, { "epoch": 0.16886930983847284, "grad_norm": 74.83251190185547, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.7010315656661987, "logits/rejected": -0.6554032564163208, "logps/chosen": -73.77932739257812, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -122.03272247314453, "loss": 0.7993, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27328062057495117, "margin_dpo/beta_margin_grad_std": 0.22063322365283966, "margin_dpo/beta_margin_mean": 1.5270512104034424, "margin_dpo/beta_margin_std": 1.8626760244369507, "margin_dpo/loss_margin_mean": 15.270512580871582, "margin_dpo/margin_mean": 15.270513534545898, "margin_dpo/margin_std": 18.557777404785156, "step": 115 }, { "epoch": 0.17033773861967694, "grad_norm": 45.14370346069336, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.7024333477020264, "logits/rejected": -0.6669833660125732, "logps/chosen": -57.33141326904297, "logps/ref_chosen": -51.40031051635742, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -101.55380249023438, "loss": 0.6234, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24537521600723267, "margin_dpo/beta_margin_grad_std": 0.17158278822898865, "margin_dpo/beta_margin_mean": 1.5100841522216797, "margin_dpo/beta_margin_std": 1.3967205286026, "margin_dpo/loss_margin_mean": 15.100841522216797, "margin_dpo/margin_mean": 15.100841522216797, "margin_dpo/margin_std": 13.869951248168945, "step": 116 }, { "epoch": 0.17180616740088106, "grad_norm": 61.82289505004883, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.6753987669944763, "logits/rejected": -0.6353236436843872, "logps/chosen": -75.63998413085938, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.58399200439453, "logps/rejected": -87.92289733886719, "loss": 0.696, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2614162564277649, "margin_dpo/beta_margin_grad_std": 0.19564659893512726, "margin_dpo/beta_margin_mean": 1.4997328519821167, "margin_dpo/beta_margin_std": 1.5858564376831055, "margin_dpo/loss_margin_mean": 14.997328758239746, "margin_dpo/margin_mean": 14.997328758239746, "margin_dpo/margin_std": 15.84735107421875, "step": 117 }, { "epoch": 0.17327459618208516, "grad_norm": 59.757423400878906, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.69645094871521, "logits/rejected": -0.653948962688446, "logps/chosen": -62.284420013427734, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905212402344, "logps/rejected": -96.29232788085938, "loss": 0.667, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25842976570129395, "margin_dpo/beta_margin_grad_std": 0.1788530945777893, "margin_dpo/beta_margin_mean": 1.397983431816101, "margin_dpo/beta_margin_std": 1.2897310256958008, "margin_dpo/loss_margin_mean": 13.979833602905273, "margin_dpo/margin_mean": 13.979833602905273, "margin_dpo/margin_std": 12.897065162658691, "step": 118 }, { "epoch": 0.17474302496328928, "grad_norm": 58.3023796081543, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.6907534003257751, "logits/rejected": -0.666528582572937, "logps/chosen": -80.68411254882812, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.97728729248047, "logps/rejected": -124.96991729736328, "loss": 0.7427, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2673878073692322, "margin_dpo/beta_margin_grad_std": 0.21648362278938293, "margin_dpo/beta_margin_mean": 1.4818708896636963, "margin_dpo/beta_margin_std": 1.625535249710083, "margin_dpo/loss_margin_mean": 14.818708419799805, "margin_dpo/margin_mean": 14.818709373474121, "margin_dpo/margin_std": 16.06285858154297, "step": 119 }, { "epoch": 0.1762114537444934, "grad_norm": 72.29347229003906, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.6836975812911987, "logits/rejected": -0.6393747329711914, "logps/chosen": -84.85501098632812, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -134.7047119140625, "loss": 0.6003, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22438913583755493, "margin_dpo/beta_margin_grad_std": 0.2033691704273224, "margin_dpo/beta_margin_mean": 1.860680341720581, "margin_dpo/beta_margin_std": 1.7107524871826172, "margin_dpo/loss_margin_mean": 18.606801986694336, "margin_dpo/margin_mean": 18.60680389404297, "margin_dpo/margin_std": 17.087509155273438, "step": 120 }, { "epoch": 0.1776798825256975, "grad_norm": 50.97231674194336, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.6479914784431458, "logits/rejected": -0.6296772360801697, "logps/chosen": -69.47196960449219, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -137.31689453125, "loss": 0.6, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22889560461044312, "margin_dpo/beta_margin_grad_std": 0.20096932351589203, "margin_dpo/beta_margin_mean": 1.9640241861343384, "margin_dpo/beta_margin_std": 1.8757091760635376, "margin_dpo/loss_margin_mean": 19.640241622924805, "margin_dpo/margin_mean": 19.640243530273438, "margin_dpo/margin_std": 18.727981567382812, "step": 121 }, { "epoch": 0.17914831130690162, "grad_norm": 44.44526672363281, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.6837228536605835, "logits/rejected": -0.674382209777832, "logps/chosen": -53.734107971191406, "logps/ref_chosen": -46.90221405029297, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -138.098388671875, "loss": 0.4292, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1675311177968979, "margin_dpo/beta_margin_grad_std": 0.18187315762043, "margin_dpo/beta_margin_mean": 2.455230712890625, "margin_dpo/beta_margin_std": 1.817508339881897, "margin_dpo/loss_margin_mean": 24.55230712890625, "margin_dpo/margin_mean": 24.55230712890625, "margin_dpo/margin_std": 18.124570846557617, "step": 122 }, { "epoch": 0.18061674008810572, "grad_norm": 65.7748031616211, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.6735109090805054, "logits/rejected": -0.642119288444519, "logps/chosen": -68.216064453125, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.77539825439453, "logps/rejected": -111.65895080566406, "loss": 0.7835, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25650110840797424, "margin_dpo/beta_margin_grad_std": 0.23678642511367798, "margin_dpo/beta_margin_mean": 1.700613021850586, "margin_dpo/beta_margin_std": 1.9761358499526978, "margin_dpo/loss_margin_mean": 17.00613021850586, "margin_dpo/margin_mean": 17.00613021850586, "margin_dpo/margin_std": 19.711463928222656, "step": 123 }, { "epoch": 0.18208516886930984, "grad_norm": 62.76095199584961, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.6483690142631531, "logits/rejected": -0.6380556225776672, "logps/chosen": -78.72987365722656, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -146.22970581054688, "loss": 0.6458, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23011672496795654, "margin_dpo/beta_margin_grad_std": 0.2240174412727356, "margin_dpo/beta_margin_mean": 2.1367599964141846, "margin_dpo/beta_margin_std": 2.3805363178253174, "margin_dpo/loss_margin_mean": 21.367599487304688, "margin_dpo/margin_mean": 21.367597579956055, "margin_dpo/margin_std": 23.656444549560547, "step": 124 }, { "epoch": 0.18355359765051396, "grad_norm": 44.872169494628906, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.6881054639816284, "logits/rejected": -0.6638908386230469, "logps/chosen": -55.62939453125, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -109.26545715332031, "loss": 0.5385, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21027234196662903, "margin_dpo/beta_margin_grad_std": 0.18001829087734222, "margin_dpo/beta_margin_mean": 1.9784388542175293, "margin_dpo/beta_margin_std": 1.7413667440414429, "margin_dpo/loss_margin_mean": 19.784387588500977, "margin_dpo/margin_mean": 19.78438949584961, "margin_dpo/margin_std": 17.293458938598633, "step": 125 }, { "epoch": 0.18502202643171806, "grad_norm": 55.02238845825195, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.6792329549789429, "logits/rejected": -0.6454166769981384, "logps/chosen": -61.897850036621094, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -123.43519592285156, "loss": 0.5557, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21414323151111603, "margin_dpo/beta_margin_grad_std": 0.18485862016677856, "margin_dpo/beta_margin_mean": 1.9074655771255493, "margin_dpo/beta_margin_std": 1.7027356624603271, "margin_dpo/loss_margin_mean": 19.074655532836914, "margin_dpo/margin_mean": 19.07465362548828, "margin_dpo/margin_std": 17.013324737548828, "step": 126 }, { "epoch": 0.18649045521292218, "grad_norm": 46.633995056152344, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.6350376605987549, "logits/rejected": -0.6004325151443481, "logps/chosen": -61.812808990478516, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -98.11077880859375, "loss": 0.6291, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23935288190841675, "margin_dpo/beta_margin_grad_std": 0.19522100687026978, "margin_dpo/beta_margin_mean": 1.605687141418457, "margin_dpo/beta_margin_std": 1.4145348072052002, "margin_dpo/loss_margin_mean": 16.05687141418457, "margin_dpo/margin_mean": 16.05687141418457, "margin_dpo/margin_std": 13.91942024230957, "step": 127 }, { "epoch": 0.18795888399412627, "grad_norm": 51.87793731689453, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.6689374446868896, "logits/rejected": -0.6231319904327393, "logps/chosen": -63.837318420410156, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -99.09458923339844, "loss": 0.68, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2527642250061035, "margin_dpo/beta_margin_grad_std": 0.20639841258525848, "margin_dpo/beta_margin_mean": 1.6939269304275513, "margin_dpo/beta_margin_std": 1.7684822082519531, "margin_dpo/loss_margin_mean": 16.939268112182617, "margin_dpo/margin_mean": 16.939268112182617, "margin_dpo/margin_std": 17.603744506835938, "step": 128 }, { "epoch": 0.1894273127753304, "grad_norm": 73.03254699707031, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.6583871841430664, "logits/rejected": -0.626196026802063, "logps/chosen": -75.50413513183594, "logps/ref_chosen": -66.91636657714844, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -122.34420776367188, "loss": 0.7997, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2635345458984375, "margin_dpo/beta_margin_grad_std": 0.24272885918617249, "margin_dpo/beta_margin_mean": 1.7114232778549194, "margin_dpo/beta_margin_std": 2.0050904750823975, "margin_dpo/loss_margin_mean": 17.114233016967773, "margin_dpo/margin_mean": 17.11423110961914, "margin_dpo/margin_std": 19.97789764404297, "step": 129 }, { "epoch": 0.19089574155653452, "grad_norm": 48.8452033996582, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.648663341999054, "logits/rejected": -0.6347514390945435, "logps/chosen": -50.91802215576172, "logps/ref_chosen": -44.666847229003906, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -111.94617462158203, "loss": 0.5896, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20127610862255096, "margin_dpo/beta_margin_grad_std": 0.2229228913784027, "margin_dpo/beta_margin_mean": 2.291335105895996, "margin_dpo/beta_margin_std": 2.0445499420166016, "margin_dpo/loss_margin_mean": 22.913349151611328, "margin_dpo/margin_mean": 22.913349151611328, "margin_dpo/margin_std": 19.553049087524414, "step": 130 }, { "epoch": 0.19236417033773862, "grad_norm": 43.2736701965332, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.6537525057792664, "logits/rejected": -0.6376811265945435, "logps/chosen": -49.20851135253906, "logps/ref_chosen": -44.92458724975586, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -112.79965209960938, "loss": 0.543, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20455870032310486, "margin_dpo/beta_margin_grad_std": 0.19169239699840546, "margin_dpo/beta_margin_mean": 2.007171869277954, "margin_dpo/beta_margin_std": 1.7163995504379272, "margin_dpo/loss_margin_mean": 20.071718215942383, "margin_dpo/margin_mean": 20.071718215942383, "margin_dpo/margin_std": 17.120357513427734, "step": 131 }, { "epoch": 0.19383259911894274, "grad_norm": 51.05913543701172, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.7091637849807739, "logits/rejected": -0.6840115785598755, "logps/chosen": -65.47123718261719, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -113.50138854980469, "loss": 0.5456, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21537162363529205, "margin_dpo/beta_margin_grad_std": 0.1776151806116104, "margin_dpo/beta_margin_mean": 1.9139082431793213, "margin_dpo/beta_margin_std": 1.733182668685913, "margin_dpo/loss_margin_mean": 19.139081954956055, "margin_dpo/margin_mean": 19.139083862304688, "margin_dpo/margin_std": 17.268104553222656, "step": 132 }, { "epoch": 0.19530102790014683, "grad_norm": 56.10818862915039, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.6690283417701721, "logits/rejected": -0.6472345590591431, "logps/chosen": -74.25129699707031, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -121.84832000732422, "loss": 0.71, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2546374201774597, "margin_dpo/beta_margin_grad_std": 0.22272148728370667, "margin_dpo/beta_margin_mean": 1.7867953777313232, "margin_dpo/beta_margin_std": 1.953741431236267, "margin_dpo/loss_margin_mean": 17.867952346801758, "margin_dpo/margin_mean": 17.86795425415039, "margin_dpo/margin_std": 19.506633758544922, "step": 133 }, { "epoch": 0.19676945668135096, "grad_norm": 42.318260192871094, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.6918569803237915, "logits/rejected": -0.6574737429618835, "logps/chosen": -56.94035339355469, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -111.83763122558594, "loss": 0.4885, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20181448757648468, "margin_dpo/beta_margin_grad_std": 0.14965248107910156, "margin_dpo/beta_margin_mean": 1.931201696395874, "margin_dpo/beta_margin_std": 1.5649566650390625, "margin_dpo/loss_margin_mean": 19.3120174407959, "margin_dpo/margin_mean": 19.3120174407959, "margin_dpo/margin_std": 15.627666473388672, "step": 134 }, { "epoch": 0.19823788546255505, "grad_norm": 54.5223274230957, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.6978960037231445, "logits/rejected": -0.6953153610229492, "logps/chosen": -58.12250518798828, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -144.46224975585938, "loss": 0.5843, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21617625653743744, "margin_dpo/beta_margin_grad_std": 0.2016371190547943, "margin_dpo/beta_margin_mean": 2.0713863372802734, "margin_dpo/beta_margin_std": 1.9683783054351807, "margin_dpo/loss_margin_mean": 20.713863372802734, "margin_dpo/margin_mean": 20.713863372802734, "margin_dpo/margin_std": 19.445152282714844, "step": 135 }, { "epoch": 0.19970631424375918, "grad_norm": 57.70625686645508, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.6800702810287476, "logits/rejected": -0.6486399173736572, "logps/chosen": -72.93949890136719, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -120.78422546386719, "loss": 0.6567, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2523123025894165, "margin_dpo/beta_margin_grad_std": 0.1883687824010849, "margin_dpo/beta_margin_mean": 1.6596003770828247, "margin_dpo/beta_margin_std": 1.9312176704406738, "margin_dpo/loss_margin_mean": 16.596004486083984, "margin_dpo/margin_mean": 16.596004486083984, "margin_dpo/margin_std": 18.42391014099121, "step": 136 }, { "epoch": 0.2011747430249633, "grad_norm": 60.66901779174805, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.661456823348999, "logits/rejected": -0.6546590328216553, "logps/chosen": -58.281639099121094, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86377716064453, "logps/rejected": -145.80894470214844, "loss": 0.6241, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.219110906124115, "margin_dpo/beta_margin_grad_std": 0.21998608112335205, "margin_dpo/beta_margin_mean": 2.242264986038208, "margin_dpo/beta_margin_std": 2.2972939014434814, "margin_dpo/loss_margin_mean": 22.422649383544922, "margin_dpo/margin_mean": 22.422649383544922, "margin_dpo/margin_std": 22.739280700683594, "step": 137 }, { "epoch": 0.2026431718061674, "grad_norm": 69.17042541503906, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.6419689059257507, "logits/rejected": -0.6141855716705322, "logps/chosen": -69.76889038085938, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -120.97154998779297, "loss": 0.6941, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23004138469696045, "margin_dpo/beta_margin_grad_std": 0.22523212432861328, "margin_dpo/beta_margin_mean": 1.852536678314209, "margin_dpo/beta_margin_std": 1.8633073568344116, "margin_dpo/loss_margin_mean": 18.525365829467773, "margin_dpo/margin_mean": 18.525367736816406, "margin_dpo/margin_std": 18.522676467895508, "step": 138 }, { "epoch": 0.20411160058737152, "grad_norm": 49.38462448120117, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.664415717124939, "logits/rejected": -0.6345181465148926, "logps/chosen": -53.80146789550781, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -107.02542114257812, "loss": 0.5486, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21235616505146027, "margin_dpo/beta_margin_grad_std": 0.1912960559129715, "margin_dpo/beta_margin_mean": 2.038736343383789, "margin_dpo/beta_margin_std": 1.8344343900680542, "margin_dpo/loss_margin_mean": 20.38736343383789, "margin_dpo/margin_mean": 20.38736343383789, "margin_dpo/margin_std": 18.1939640045166, "step": 139 }, { "epoch": 0.2055800293685756, "grad_norm": 52.71585464477539, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.6803675889968872, "logits/rejected": -0.6538151502609253, "logps/chosen": -66.43865203857422, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28411102294922, "logps/rejected": -110.04861450195312, "loss": 0.6767, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24082183837890625, "margin_dpo/beta_margin_grad_std": 0.22746782004833221, "margin_dpo/beta_margin_mean": 1.930057406425476, "margin_dpo/beta_margin_std": 1.9649643898010254, "margin_dpo/loss_margin_mean": 19.300573348999023, "margin_dpo/margin_mean": 19.300575256347656, "margin_dpo/margin_std": 19.638553619384766, "step": 140 }, { "epoch": 0.20704845814977973, "grad_norm": 63.27604293823242, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.6606422662734985, "logits/rejected": -0.6359848976135254, "logps/chosen": -81.39404296875, "logps/ref_chosen": -75.0756607055664, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -123.7364730834961, "loss": 0.6223, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21940495073795319, "margin_dpo/beta_margin_grad_std": 0.2201792299747467, "margin_dpo/beta_margin_mean": 1.922583818435669, "margin_dpo/beta_margin_std": 1.737042784690857, "margin_dpo/loss_margin_mean": 19.22583770751953, "margin_dpo/margin_mean": 19.22583770751953, "margin_dpo/margin_std": 17.296768188476562, "step": 141 }, { "epoch": 0.20851688693098386, "grad_norm": 71.84093475341797, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.6464049816131592, "logits/rejected": -0.6306595802307129, "logps/chosen": -67.52783203125, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222198486328, "logps/rejected": -124.05125427246094, "loss": 0.7609, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24063211679458618, "margin_dpo/beta_margin_grad_std": 0.23835302889347076, "margin_dpo/beta_margin_mean": 1.996911883354187, "margin_dpo/beta_margin_std": 2.20531964302063, "margin_dpo/loss_margin_mean": 19.969118118286133, "margin_dpo/margin_mean": 19.969120025634766, "margin_dpo/margin_std": 21.67835235595703, "step": 142 }, { "epoch": 0.20998531571218795, "grad_norm": 71.78072357177734, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.6453176736831665, "logits/rejected": -0.6054332256317139, "logps/chosen": -66.23277282714844, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -102.97239685058594, "loss": 0.8411, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2758726179599762, "margin_dpo/beta_margin_grad_std": 0.2487667053937912, "margin_dpo/beta_margin_mean": 1.5336509943008423, "margin_dpo/beta_margin_std": 1.868659257888794, "margin_dpo/loss_margin_mean": 15.336509704589844, "margin_dpo/margin_mean": 15.336509704589844, "margin_dpo/margin_std": 18.46063232421875, "step": 143 }, { "epoch": 0.21145374449339208, "grad_norm": 43.579097747802734, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.6609046459197998, "logits/rejected": -0.614149808883667, "logps/chosen": -66.0463638305664, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -107.83518981933594, "loss": 0.5331, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20229505002498627, "margin_dpo/beta_margin_grad_std": 0.19436682760715485, "margin_dpo/beta_margin_mean": 1.9694933891296387, "margin_dpo/beta_margin_std": 1.6196097135543823, "margin_dpo/loss_margin_mean": 19.694934844970703, "margin_dpo/margin_mean": 19.694934844970703, "margin_dpo/margin_std": 16.18764305114746, "step": 144 }, { "epoch": 0.21292217327459617, "grad_norm": 47.01042175292969, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.655602216720581, "logits/rejected": -0.6127992868423462, "logps/chosen": -64.15069580078125, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -105.38958740234375, "loss": 0.5577, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21516340970993042, "margin_dpo/beta_margin_grad_std": 0.1895509660243988, "margin_dpo/beta_margin_mean": 1.9973245859146118, "margin_dpo/beta_margin_std": 1.822784185409546, "margin_dpo/loss_margin_mean": 19.97324562072754, "margin_dpo/margin_mean": 19.97324562072754, "margin_dpo/margin_std": 18.10959815979004, "step": 145 }, { "epoch": 0.2143906020558003, "grad_norm": 70.7383041381836, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.6367348432540894, "logits/rejected": -0.5759721398353577, "logps/chosen": -76.57098388671875, "logps/ref_chosen": -66.00045776367188, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -108.75936126708984, "loss": 0.7169, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2570827901363373, "margin_dpo/beta_margin_grad_std": 0.22340109944343567, "margin_dpo/beta_margin_mean": 1.6486042737960815, "margin_dpo/beta_margin_std": 1.7461858987808228, "margin_dpo/loss_margin_mean": 16.486042022705078, "margin_dpo/margin_mean": 16.486042022705078, "margin_dpo/margin_std": 17.46074676513672, "step": 146 }, { "epoch": 0.21585903083700442, "grad_norm": 58.0555305480957, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.6206094622612, "logits/rejected": -0.5745464563369751, "logps/chosen": -61.8310432434082, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39061737060547, "logps/rejected": -100.01313781738281, "loss": 0.57, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21462920308113098, "margin_dpo/beta_margin_grad_std": 0.19549627602100372, "margin_dpo/beta_margin_mean": 2.0196962356567383, "margin_dpo/beta_margin_std": 1.897796630859375, "margin_dpo/loss_margin_mean": 20.196962356567383, "margin_dpo/margin_mean": 20.19696044921875, "margin_dpo/margin_std": 18.958223342895508, "step": 147 }, { "epoch": 0.2173274596182085, "grad_norm": 50.468204498291016, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.6692857146263123, "logits/rejected": -0.6372880935668945, "logps/chosen": -71.55451202392578, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -125.87773132324219, "loss": 0.6722, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.252309113740921, "margin_dpo/beta_margin_grad_std": 0.19528046250343323, "margin_dpo/beta_margin_mean": 1.6166454553604126, "margin_dpo/beta_margin_std": 1.6389704942703247, "margin_dpo/loss_margin_mean": 16.166454315185547, "margin_dpo/margin_mean": 16.166454315185547, "margin_dpo/margin_std": 15.465744018554688, "step": 148 }, { "epoch": 0.21879588839941264, "grad_norm": 43.54497146606445, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.68845534324646, "logits/rejected": -0.6588988304138184, "logps/chosen": -65.29835510253906, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -126.82577514648438, "loss": 0.5076, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19679757952690125, "margin_dpo/beta_margin_grad_std": 0.18511685729026794, "margin_dpo/beta_margin_mean": 2.068645477294922, "margin_dpo/beta_margin_std": 1.6941298246383667, "margin_dpo/loss_margin_mean": 20.686452865600586, "margin_dpo/margin_mean": 20.68645477294922, "margin_dpo/margin_std": 16.817272186279297, "step": 149 }, { "epoch": 0.22026431718061673, "grad_norm": 57.30229187011719, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.6421929001808167, "logits/rejected": -0.6197161674499512, "logps/chosen": -52.55424118041992, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -109.71247863769531, "loss": 0.614, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22440461814403534, "margin_dpo/beta_margin_grad_std": 0.20524847507476807, "margin_dpo/beta_margin_mean": 1.9516453742980957, "margin_dpo/beta_margin_std": 1.9830421209335327, "margin_dpo/loss_margin_mean": 19.51645278930664, "margin_dpo/margin_mean": 19.51645278930664, "margin_dpo/margin_std": 19.350839614868164, "step": 150 }, { "epoch": 0.22173274596182085, "grad_norm": 51.592350006103516, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.7092918157577515, "logits/rejected": -0.6702842712402344, "logps/chosen": -77.94678497314453, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -129.22677612304688, "loss": 0.5158, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1952391117811203, "margin_dpo/beta_margin_grad_std": 0.1918613314628601, "margin_dpo/beta_margin_mean": 2.138700008392334, "margin_dpo/beta_margin_std": 1.8111618757247925, "margin_dpo/loss_margin_mean": 21.386999130249023, "margin_dpo/margin_mean": 21.38699722290039, "margin_dpo/margin_std": 18.097902297973633, "step": 151 }, { "epoch": 0.22320117474302498, "grad_norm": 51.7116813659668, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.6578500270843506, "logits/rejected": -0.6298225522041321, "logps/chosen": -65.78941345214844, "logps/ref_chosen": -60.164390563964844, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -133.95053100585938, "loss": 0.4999, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18865174055099487, "margin_dpo/beta_margin_grad_std": 0.184275820851326, "margin_dpo/beta_margin_mean": 2.2185051441192627, "margin_dpo/beta_margin_std": 1.9241358041763306, "margin_dpo/loss_margin_mean": 22.185049057006836, "margin_dpo/margin_mean": 22.18505096435547, "margin_dpo/margin_std": 18.43682098388672, "step": 152 }, { "epoch": 0.22466960352422907, "grad_norm": 43.9327507019043, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.7056645154953003, "logits/rejected": -0.6881464719772339, "logps/chosen": -62.922088623046875, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -111.08541870117188, "loss": 0.6484, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2367786020040512, "margin_dpo/beta_margin_grad_std": 0.21445007622241974, "margin_dpo/beta_margin_mean": 1.8822778463363647, "margin_dpo/beta_margin_std": 1.9090057611465454, "margin_dpo/loss_margin_mean": 18.822778701782227, "margin_dpo/margin_mean": 18.82278060913086, "margin_dpo/margin_std": 19.06036376953125, "step": 153 }, { "epoch": 0.2261380323054332, "grad_norm": 70.27942657470703, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.6403573751449585, "logits/rejected": -0.6136635541915894, "logps/chosen": -71.216796875, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -131.48008728027344, "loss": 0.7131, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24161392450332642, "margin_dpo/beta_margin_grad_std": 0.24336253106594086, "margin_dpo/beta_margin_mean": 1.8761656284332275, "margin_dpo/beta_margin_std": 1.9334018230438232, "margin_dpo/loss_margin_mean": 18.761655807495117, "margin_dpo/margin_mean": 18.761653900146484, "margin_dpo/margin_std": 18.826614379882812, "step": 154 }, { "epoch": 0.2276064610866373, "grad_norm": 54.39125061035156, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.6604463458061218, "logits/rejected": -0.6317785978317261, "logps/chosen": -66.77053833007812, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -103.34844207763672, "loss": 0.5756, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20776690542697906, "margin_dpo/beta_margin_grad_std": 0.2009819895029068, "margin_dpo/beta_margin_mean": 1.9738881587982178, "margin_dpo/beta_margin_std": 1.7747788429260254, "margin_dpo/loss_margin_mean": 19.738882064819336, "margin_dpo/margin_mean": 19.738880157470703, "margin_dpo/margin_std": 17.563674926757812, "step": 155 }, { "epoch": 0.2290748898678414, "grad_norm": 87.28961944580078, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.6653755903244019, "logits/rejected": -0.636063814163208, "logps/chosen": -79.47747802734375, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.1692886352539, "logps/rejected": -102.39488220214844, "loss": 0.9429, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27911999821662903, "margin_dpo/beta_margin_grad_std": 0.26404860615730286, "margin_dpo/beta_margin_mean": 1.523999810218811, "margin_dpo/beta_margin_std": 2.001671075820923, "margin_dpo/loss_margin_mean": 15.239997863769531, "margin_dpo/margin_mean": 15.239997863769531, "margin_dpo/margin_std": 19.949081420898438, "step": 156 }, { "epoch": 0.2305433186490455, "grad_norm": 63.12745666503906, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.7227987051010132, "logits/rejected": -0.6926777362823486, "logps/chosen": -71.74752807617188, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -139.5974578857422, "loss": 0.545, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2063656747341156, "margin_dpo/beta_margin_grad_std": 0.20212943851947784, "margin_dpo/beta_margin_mean": 2.1572000980377197, "margin_dpo/beta_margin_std": 1.928480863571167, "margin_dpo/loss_margin_mean": 21.57200050354004, "margin_dpo/margin_mean": 21.57200050354004, "margin_dpo/margin_std": 19.126846313476562, "step": 157 }, { "epoch": 0.23201174743024963, "grad_norm": 46.42555236816406, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.6610178351402283, "logits/rejected": -0.6375807523727417, "logps/chosen": -65.13560485839844, "logps/ref_chosen": -57.61292266845703, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -143.13510131835938, "loss": 0.564, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21795856952667236, "margin_dpo/beta_margin_grad_std": 0.196962371468544, "margin_dpo/beta_margin_mean": 2.191772937774658, "margin_dpo/beta_margin_std": 2.1438300609588623, "margin_dpo/loss_margin_mean": 21.917728424072266, "margin_dpo/margin_mean": 21.917728424072266, "margin_dpo/margin_std": 21.403545379638672, "step": 158 }, { "epoch": 0.23348017621145375, "grad_norm": 53.46623229980469, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.6735081076622009, "logits/rejected": -0.6329150199890137, "logps/chosen": -89.84956359863281, "logps/ref_chosen": -81.56034088134766, "logps/ref_rejected": -88.8987045288086, "logps/rejected": -116.82539367675781, "loss": 0.6587, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23721808195114136, "margin_dpo/beta_margin_grad_std": 0.22351770102977753, "margin_dpo/beta_margin_mean": 1.9637459516525269, "margin_dpo/beta_margin_std": 2.0596272945404053, "margin_dpo/loss_margin_mean": 19.63745880126953, "margin_dpo/margin_mean": 19.63745880126953, "margin_dpo/margin_std": 20.491199493408203, "step": 159 }, { "epoch": 0.23494860499265785, "grad_norm": 58.52571105957031, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.6944263577461243, "logits/rejected": -0.6529999375343323, "logps/chosen": -74.34519958496094, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -125.64112854003906, "loss": 0.6916, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24713920056819916, "margin_dpo/beta_margin_grad_std": 0.22890552878379822, "margin_dpo/beta_margin_mean": 1.980899691581726, "margin_dpo/beta_margin_std": 2.211510419845581, "margin_dpo/loss_margin_mean": 19.808996200561523, "margin_dpo/margin_mean": 19.808996200561523, "margin_dpo/margin_std": 22.036544799804688, "step": 160 }, { "epoch": 0.23641703377386197, "grad_norm": 54.677059173583984, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.6853748559951782, "logits/rejected": -0.6557826995849609, "logps/chosen": -60.53142547607422, "logps/ref_chosen": -52.43647766113281, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -111.55258178710938, "loss": 0.646, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23030346632003784, "margin_dpo/beta_margin_grad_std": 0.22901977598667145, "margin_dpo/beta_margin_mean": 2.0026674270629883, "margin_dpo/beta_margin_std": 1.9912761449813843, "margin_dpo/loss_margin_mean": 20.026674270629883, "margin_dpo/margin_mean": 20.026676177978516, "margin_dpo/margin_std": 19.802610397338867, "step": 161 }, { "epoch": 0.23788546255506607, "grad_norm": 46.158084869384766, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.6514345407485962, "logits/rejected": -0.6138795614242554, "logps/chosen": -70.659423828125, "logps/ref_chosen": -62.61058807373047, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -116.6668701171875, "loss": 0.6432, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23401562869548798, "margin_dpo/beta_margin_grad_std": 0.21692141890525818, "margin_dpo/beta_margin_mean": 1.9227447509765625, "margin_dpo/beta_margin_std": 2.0753374099731445, "margin_dpo/loss_margin_mean": 19.227447509765625, "margin_dpo/margin_mean": 19.227447509765625, "margin_dpo/margin_std": 19.711488723754883, "step": 162 }, { "epoch": 0.2393538913362702, "grad_norm": 40.664859771728516, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.6877887845039368, "logits/rejected": -0.6518304944038391, "logps/chosen": -62.216094970703125, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.41822052001953, "logps/rejected": -101.20086669921875, "loss": 0.6, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2209862768650055, "margin_dpo/beta_margin_grad_std": 0.20438379049301147, "margin_dpo/beta_margin_mean": 1.858818531036377, "margin_dpo/beta_margin_std": 1.692821979522705, "margin_dpo/loss_margin_mean": 18.588184356689453, "margin_dpo/margin_mean": 18.588184356689453, "margin_dpo/margin_std": 16.888572692871094, "step": 163 }, { "epoch": 0.24082232011747431, "grad_norm": 37.478580474853516, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.6735349297523499, "logits/rejected": -0.6281991004943848, "logps/chosen": -61.48329162597656, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -106.4957275390625, "loss": 0.4672, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18233349919319153, "margin_dpo/beta_margin_grad_std": 0.17561647295951843, "margin_dpo/beta_margin_mean": 2.0988471508026123, "margin_dpo/beta_margin_std": 1.599080204963684, "margin_dpo/loss_margin_mean": 20.98847007751465, "margin_dpo/margin_mean": 20.98847007751465, "margin_dpo/margin_std": 15.165935516357422, "step": 164 }, { "epoch": 0.2422907488986784, "grad_norm": 59.977012634277344, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.6780301332473755, "logits/rejected": -0.6259827613830566, "logps/chosen": -69.79907989501953, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -96.30776977539062, "loss": 0.7391, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2699863314628601, "margin_dpo/beta_margin_grad_std": 0.2010025978088379, "margin_dpo/beta_margin_mean": 1.4148781299591064, "margin_dpo/beta_margin_std": 1.4906260967254639, "margin_dpo/loss_margin_mean": 14.148780822753906, "margin_dpo/margin_mean": 14.148780822753906, "margin_dpo/margin_std": 14.75755500793457, "step": 165 }, { "epoch": 0.24375917767988253, "grad_norm": 45.09745407104492, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.6649864912033081, "logits/rejected": -0.6568000316619873, "logps/chosen": -59.23976516723633, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -129.61912536621094, "loss": 0.5314, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20547892153263092, "margin_dpo/beta_margin_grad_std": 0.18839870393276215, "margin_dpo/beta_margin_mean": 2.3267838954925537, "margin_dpo/beta_margin_std": 2.3729536533355713, "margin_dpo/loss_margin_mean": 23.267839431762695, "margin_dpo/margin_mean": 23.267839431762695, "margin_dpo/margin_std": 23.49030876159668, "step": 166 }, { "epoch": 0.24522760646108663, "grad_norm": 55.43495178222656, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.6117781400680542, "logits/rejected": -0.5964827537536621, "logps/chosen": -58.788291931152344, "logps/ref_chosen": -50.42409133911133, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -128.52084350585938, "loss": 0.4979, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1734393686056137, "margin_dpo/beta_margin_grad_std": 0.19911234080791473, "margin_dpo/beta_margin_mean": 2.4126217365264893, "margin_dpo/beta_margin_std": 1.9151932001113892, "margin_dpo/loss_margin_mean": 24.126218795776367, "margin_dpo/margin_mean": 24.126216888427734, "margin_dpo/margin_std": 18.741436004638672, "step": 167 }, { "epoch": 0.24669603524229075, "grad_norm": 47.17963790893555, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.6795276403427124, "logits/rejected": -0.6451242566108704, "logps/chosen": -57.11155700683594, "logps/ref_chosen": -49.46282196044922, "logps/ref_rejected": -75.30854797363281, "logps/rejected": -101.9991455078125, "loss": 0.5363, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2067001909017563, "margin_dpo/beta_margin_grad_std": 0.18657654523849487, "margin_dpo/beta_margin_mean": 1.9041860103607178, "margin_dpo/beta_margin_std": 1.539984107017517, "margin_dpo/loss_margin_mean": 19.041860580444336, "margin_dpo/margin_mean": 19.041860580444336, "margin_dpo/margin_std": 15.281282424926758, "step": 168 }, { "epoch": 0.24816446402349487, "grad_norm": 54.283538818359375, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.6633504629135132, "logits/rejected": -0.633745551109314, "logps/chosen": -67.26417541503906, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -108.05218505859375, "loss": 0.7725, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27190113067626953, "margin_dpo/beta_margin_grad_std": 0.22907379269599915, "margin_dpo/beta_margin_mean": 1.7245699167251587, "margin_dpo/beta_margin_std": 2.196559429168701, "margin_dpo/loss_margin_mean": 17.245698928833008, "margin_dpo/margin_mean": 17.245698928833008, "margin_dpo/margin_std": 21.799476623535156, "step": 169 }, { "epoch": 0.24963289280469897, "grad_norm": 47.430335998535156, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.6211998462677002, "logits/rejected": -0.596994161605835, "logps/chosen": -55.63056182861328, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -103.64749145507812, "loss": 0.5665, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2037757784128189, "margin_dpo/beta_margin_grad_std": 0.20665040612220764, "margin_dpo/beta_margin_mean": 2.1571362018585205, "margin_dpo/beta_margin_std": 1.8903136253356934, "margin_dpo/loss_margin_mean": 21.571361541748047, "margin_dpo/margin_mean": 21.571361541748047, "margin_dpo/margin_std": 18.570228576660156, "step": 170 }, { "epoch": 0.2511013215859031, "grad_norm": 62.58616256713867, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.6630641222000122, "logits/rejected": -0.6101676225662231, "logps/chosen": -92.2599868774414, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -135.90167236328125, "loss": 0.6488, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23200581967830658, "margin_dpo/beta_margin_grad_std": 0.21404391527175903, "margin_dpo/beta_margin_mean": 1.8758889436721802, "margin_dpo/beta_margin_std": 1.987282395362854, "margin_dpo/loss_margin_mean": 18.75889015197754, "margin_dpo/margin_mean": 18.75889015197754, "margin_dpo/margin_std": 19.822465896606445, "step": 171 }, { "epoch": 0.2525697503671072, "grad_norm": 65.06268310546875, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.7175358533859253, "logits/rejected": -0.6748548746109009, "logps/chosen": -78.930419921875, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -113.89852905273438, "loss": 0.6925, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23910076916217804, "margin_dpo/beta_margin_grad_std": 0.23038098216056824, "margin_dpo/beta_margin_mean": 1.8185348510742188, "margin_dpo/beta_margin_std": 1.889103651046753, "margin_dpo/loss_margin_mean": 18.185348510742188, "margin_dpo/margin_mean": 18.185348510742188, "margin_dpo/margin_std": 18.722644805908203, "step": 172 }, { "epoch": 0.2540381791483113, "grad_norm": 50.819210052490234, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.7069448828697205, "logits/rejected": -0.6491652131080627, "logps/chosen": -70.72373962402344, "logps/ref_chosen": -63.050872802734375, "logps/ref_rejected": -78.68392944335938, "logps/rejected": -104.677490234375, "loss": 0.6217, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22918951511383057, "margin_dpo/beta_margin_grad_std": 0.2061556577682495, "margin_dpo/beta_margin_mean": 1.8320705890655518, "margin_dpo/beta_margin_std": 1.7518055438995361, "margin_dpo/loss_margin_mean": 18.32070541381836, "margin_dpo/margin_mean": 18.32070541381836, "margin_dpo/margin_std": 17.509033203125, "step": 173 }, { "epoch": 0.2555066079295154, "grad_norm": 52.17148971557617, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.6733288168907166, "logits/rejected": -0.6736022233963013, "logps/chosen": -62.04674530029297, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -129.01959228515625, "loss": 0.6537, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23130814731121063, "margin_dpo/beta_margin_grad_std": 0.2197827398777008, "margin_dpo/beta_margin_mean": 1.8424601554870605, "margin_dpo/beta_margin_std": 1.8221278190612793, "margin_dpo/loss_margin_mean": 18.42460060119629, "margin_dpo/margin_mean": 18.42460060119629, "margin_dpo/margin_std": 17.7147216796875, "step": 174 }, { "epoch": 0.25697503671071953, "grad_norm": 49.36385726928711, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.6248526573181152, "logits/rejected": -0.6001460552215576, "logps/chosen": -52.519676208496094, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -120.34637451171875, "loss": 0.4427, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18073977530002594, "margin_dpo/beta_margin_grad_std": 0.16047413647174835, "margin_dpo/beta_margin_mean": 2.3738653659820557, "margin_dpo/beta_margin_std": 1.9637548923492432, "margin_dpo/loss_margin_mean": 23.7386531829834, "margin_dpo/margin_mean": 23.7386531829834, "margin_dpo/margin_std": 19.49551010131836, "step": 175 }, { "epoch": 0.25844346549192365, "grad_norm": 43.94850540161133, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.6811977028846741, "logits/rejected": -0.6696683764457703, "logps/chosen": -57.92758560180664, "logps/ref_chosen": -50.45283889770508, "logps/ref_rejected": -95.55896759033203, "logps/rejected": -124.97660827636719, "loss": 0.4848, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1871013045310974, "margin_dpo/beta_margin_grad_std": 0.18806670606136322, "margin_dpo/beta_margin_mean": 2.194288969039917, "margin_dpo/beta_margin_std": 1.7437057495117188, "margin_dpo/loss_margin_mean": 21.942888259887695, "margin_dpo/margin_mean": 21.942890167236328, "margin_dpo/margin_std": 17.408954620361328, "step": 176 }, { "epoch": 0.2599118942731278, "grad_norm": 41.124366760253906, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.6584987640380859, "logits/rejected": -0.6148337125778198, "logps/chosen": -70.50564575195312, "logps/ref_chosen": -61.21646499633789, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -127.14639282226562, "loss": 0.5412, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20711472630500793, "margin_dpo/beta_margin_grad_std": 0.19701464474201202, "margin_dpo/beta_margin_mean": 2.196343183517456, "margin_dpo/beta_margin_std": 2.060377359390259, "margin_dpo/loss_margin_mean": 21.963430404663086, "margin_dpo/margin_mean": 21.963428497314453, "margin_dpo/margin_std": 20.492202758789062, "step": 177 }, { "epoch": 0.26138032305433184, "grad_norm": 59.32670974731445, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.6575347185134888, "logits/rejected": -0.6636496186256409, "logps/chosen": -70.52571868896484, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.36532592773438, "logps/rejected": -142.11297607421875, "loss": 0.5441, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19805437326431274, "margin_dpo/beta_margin_grad_std": 0.2140408307313919, "margin_dpo/beta_margin_mean": 2.4486706256866455, "margin_dpo/beta_margin_std": 2.2665319442749023, "margin_dpo/loss_margin_mean": 24.48670768737793, "margin_dpo/margin_mean": 24.486705780029297, "margin_dpo/margin_std": 22.613819122314453, "step": 178 }, { "epoch": 0.26284875183553597, "grad_norm": 72.6666488647461, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.6500136256217957, "logits/rejected": -0.6252506971359253, "logps/chosen": -72.29452514648438, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -126.22462463378906, "loss": 0.6795, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21323637664318085, "margin_dpo/beta_margin_grad_std": 0.2565738558769226, "margin_dpo/beta_margin_mean": 2.4460597038269043, "margin_dpo/beta_margin_std": 2.377058982849121, "margin_dpo/loss_margin_mean": 24.460596084594727, "margin_dpo/margin_mean": 24.460596084594727, "margin_dpo/margin_std": 23.486286163330078, "step": 179 }, { "epoch": 0.2643171806167401, "grad_norm": 45.02251052856445, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.6473774909973145, "logits/rejected": -0.6168711185455322, "logps/chosen": -63.067901611328125, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -131.53292846679688, "loss": 0.455, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17352601885795593, "margin_dpo/beta_margin_grad_std": 0.1921532303094864, "margin_dpo/beta_margin_mean": 2.4595916271209717, "margin_dpo/beta_margin_std": 1.927443504333496, "margin_dpo/loss_margin_mean": 24.595916748046875, "margin_dpo/margin_mean": 24.595916748046875, "margin_dpo/margin_std": 19.19723129272461, "step": 180 }, { "epoch": 0.2657856093979442, "grad_norm": 52.635921478271484, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.6752246618270874, "logits/rejected": -0.6351001262664795, "logps/chosen": -63.29297637939453, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -87.51589965820312, "loss": 0.6665, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24270856380462646, "margin_dpo/beta_margin_grad_std": 0.21218255162239075, "margin_dpo/beta_margin_mean": 1.7567216157913208, "margin_dpo/beta_margin_std": 1.816219687461853, "margin_dpo/loss_margin_mean": 17.567216873168945, "margin_dpo/margin_mean": 17.567216873168945, "margin_dpo/margin_std": 18.145404815673828, "step": 181 }, { "epoch": 0.26725403817914833, "grad_norm": 62.45026397705078, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.6236890554428101, "logits/rejected": -0.6039330363273621, "logps/chosen": -50.268310546875, "logps/ref_chosen": -41.10784912109375, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -125.66545867919922, "loss": 0.5717, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18390774726867676, "margin_dpo/beta_margin_grad_std": 0.2302154004573822, "margin_dpo/beta_margin_mean": 2.6983418464660645, "margin_dpo/beta_margin_std": 2.357621908187866, "margin_dpo/loss_margin_mean": 26.983417510986328, "margin_dpo/margin_mean": 26.983417510986328, "margin_dpo/margin_std": 23.51990509033203, "step": 182 }, { "epoch": 0.2687224669603524, "grad_norm": 57.11370849609375, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.6656794548034668, "logits/rejected": -0.6146266460418701, "logps/chosen": -69.17279052734375, "logps/ref_chosen": -57.524559020996094, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -108.50943756103516, "loss": 0.5269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20869557559490204, "margin_dpo/beta_margin_grad_std": 0.18125931918621063, "margin_dpo/beta_margin_mean": 2.0885488986968994, "margin_dpo/beta_margin_std": 1.8435460329055786, "margin_dpo/loss_margin_mean": 20.88549041748047, "margin_dpo/margin_mean": 20.88549041748047, "margin_dpo/margin_std": 16.793594360351562, "step": 183 }, { "epoch": 0.2701908957415565, "grad_norm": 55.754112243652344, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.648979902267456, "logits/rejected": -0.6380875706672668, "logps/chosen": -67.61172485351562, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -101.36076354980469, "loss": 0.7067, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26532942056655884, "margin_dpo/beta_margin_grad_std": 0.20013463497161865, "margin_dpo/beta_margin_mean": 1.5659937858581543, "margin_dpo/beta_margin_std": 1.7907655239105225, "margin_dpo/loss_margin_mean": 15.659936904907227, "margin_dpo/margin_mean": 15.659937858581543, "margin_dpo/margin_std": 17.573902130126953, "step": 184 }, { "epoch": 0.27165932452276065, "grad_norm": 55.9732780456543, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.6820551753044128, "logits/rejected": -0.6277487277984619, "logps/chosen": -71.6310806274414, "logps/ref_chosen": -62.02584457397461, "logps/ref_rejected": -73.76260375976562, "logps/rejected": -98.80401611328125, "loss": 0.8275, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28143346309661865, "margin_dpo/beta_margin_grad_std": 0.23456232249736786, "margin_dpo/beta_margin_mean": 1.543617844581604, "margin_dpo/beta_margin_std": 1.9218882322311401, "margin_dpo/loss_margin_mean": 15.436178207397461, "margin_dpo/margin_mean": 15.436178207397461, "margin_dpo/margin_std": 18.934371948242188, "step": 185 }, { "epoch": 0.27312775330396477, "grad_norm": 42.71883010864258, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.679948091506958, "logits/rejected": -0.6224746704101562, "logps/chosen": -79.09538269042969, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -123.15438842773438, "loss": 0.3968, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15906205773353577, "margin_dpo/beta_margin_grad_std": 0.1656772643327713, "margin_dpo/beta_margin_mean": 2.5340030193328857, "margin_dpo/beta_margin_std": 1.8850746154785156, "margin_dpo/loss_margin_mean": 25.340028762817383, "margin_dpo/margin_mean": 25.34002685546875, "margin_dpo/margin_std": 18.844833374023438, "step": 186 }, { "epoch": 0.2745961820851689, "grad_norm": 63.56511306762695, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.6759737730026245, "logits/rejected": -0.6658194065093994, "logps/chosen": -64.28144836425781, "logps/ref_chosen": -52.75646209716797, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -112.44464111328125, "loss": 0.724, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24051952362060547, "margin_dpo/beta_margin_grad_std": 0.23406511545181274, "margin_dpo/beta_margin_mean": 1.895055890083313, "margin_dpo/beta_margin_std": 2.0592827796936035, "margin_dpo/loss_margin_mean": 18.950557708740234, "margin_dpo/margin_mean": 18.950557708740234, "margin_dpo/margin_std": 20.23330307006836, "step": 187 }, { "epoch": 0.27606461086637296, "grad_norm": 59.55484390258789, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.6790421009063721, "logits/rejected": -0.6611262559890747, "logps/chosen": -58.12608337402344, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -119.92129516601562, "loss": 0.6085, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2119147628545761, "margin_dpo/beta_margin_grad_std": 0.21607068181037903, "margin_dpo/beta_margin_mean": 2.1670262813568115, "margin_dpo/beta_margin_std": 2.0467336177825928, "margin_dpo/loss_margin_mean": 21.670263290405273, "margin_dpo/margin_mean": 21.670263290405273, "margin_dpo/margin_std": 20.444095611572266, "step": 188 }, { "epoch": 0.2775330396475771, "grad_norm": 58.168373107910156, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.6502203941345215, "logits/rejected": -0.5917783975601196, "logps/chosen": -60.68791961669922, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -102.79643249511719, "loss": 0.5283, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19385695457458496, "margin_dpo/beta_margin_grad_std": 0.18882273137569427, "margin_dpo/beta_margin_mean": 2.2340126037597656, "margin_dpo/beta_margin_std": 1.9784209728240967, "margin_dpo/loss_margin_mean": 22.340126037597656, "margin_dpo/margin_mean": 22.340126037597656, "margin_dpo/margin_std": 19.765621185302734, "step": 189 }, { "epoch": 0.2790014684287812, "grad_norm": 51.31618118286133, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.7103268504142761, "logits/rejected": -0.6599196195602417, "logps/chosen": -73.30516052246094, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -134.74267578125, "loss": 0.5025, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19283924996852875, "margin_dpo/beta_margin_grad_std": 0.19168947637081146, "margin_dpo/beta_margin_mean": 2.3570051193237305, "margin_dpo/beta_margin_std": 2.1541616916656494, "margin_dpo/loss_margin_mean": 23.570051193237305, "margin_dpo/margin_mean": 23.570049285888672, "margin_dpo/margin_std": 20.566938400268555, "step": 190 }, { "epoch": 0.28046989720998533, "grad_norm": 34.443443298339844, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.6464371681213379, "logits/rejected": -0.6282103061676025, "logps/chosen": -75.33536529541016, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -144.71490478515625, "loss": 0.4356, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16399236023426056, "margin_dpo/beta_margin_grad_std": 0.19074279069900513, "margin_dpo/beta_margin_mean": 2.789874792098999, "margin_dpo/beta_margin_std": 2.458559274673462, "margin_dpo/loss_margin_mean": 27.89874839782715, "margin_dpo/margin_mean": 27.898746490478516, "margin_dpo/margin_std": 24.31833267211914, "step": 191 }, { "epoch": 0.28193832599118945, "grad_norm": 53.86375427246094, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.6596635580062866, "logits/rejected": -0.6359300017356873, "logps/chosen": -80.91861724853516, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -138.19093322753906, "loss": 0.5499, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20277048647403717, "margin_dpo/beta_margin_grad_std": 0.2157028615474701, "margin_dpo/beta_margin_mean": 2.367788076400757, "margin_dpo/beta_margin_std": 2.2121126651763916, "margin_dpo/loss_margin_mean": 23.677879333496094, "margin_dpo/margin_mean": 23.677879333496094, "margin_dpo/margin_std": 21.862239837646484, "step": 192 }, { "epoch": 0.2834067547723935, "grad_norm": 83.77180480957031, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.691096305847168, "logits/rejected": -0.639538049697876, "logps/chosen": -71.17577362060547, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -104.34375, "loss": 0.781, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23370835185050964, "margin_dpo/beta_margin_grad_std": 0.2316931039094925, "margin_dpo/beta_margin_mean": 1.6950924396514893, "margin_dpo/beta_margin_std": 1.8348240852355957, "margin_dpo/loss_margin_mean": 16.950923919677734, "margin_dpo/margin_mean": 16.950923919677734, "margin_dpo/margin_std": 17.993907928466797, "step": 193 }, { "epoch": 0.28487518355359764, "grad_norm": 36.576637268066406, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.6739726662635803, "logits/rejected": -0.6504776477813721, "logps/chosen": -45.40176010131836, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -90.91381072998047, "loss": 0.5108, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20001572370529175, "margin_dpo/beta_margin_grad_std": 0.18756882846355438, "margin_dpo/beta_margin_mean": 2.2667644023895264, "margin_dpo/beta_margin_std": 1.985824704170227, "margin_dpo/loss_margin_mean": 22.66764259338379, "margin_dpo/margin_mean": 22.66764259338379, "margin_dpo/margin_std": 19.685068130493164, "step": 194 }, { "epoch": 0.28634361233480177, "grad_norm": 54.66409683227539, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.6257681846618652, "logits/rejected": -0.6233822107315063, "logps/chosen": -66.51872253417969, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -130.88046264648438, "loss": 0.4833, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18052619695663452, "margin_dpo/beta_margin_grad_std": 0.19298022985458374, "margin_dpo/beta_margin_mean": 2.248234510421753, "margin_dpo/beta_margin_std": 1.8670945167541504, "margin_dpo/loss_margin_mean": 22.482343673706055, "margin_dpo/margin_mean": 22.482345581054688, "margin_dpo/margin_std": 18.633167266845703, "step": 195 }, { "epoch": 0.2878120411160059, "grad_norm": 45.430152893066406, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.6508013010025024, "logits/rejected": -0.6155145168304443, "logps/chosen": -61.67543029785156, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -111.40143585205078, "loss": 0.442, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16920757293701172, "margin_dpo/beta_margin_grad_std": 0.1858920007944107, "margin_dpo/beta_margin_mean": 2.6918299198150635, "margin_dpo/beta_margin_std": 2.2219343185424805, "margin_dpo/loss_margin_mean": 26.918298721313477, "margin_dpo/margin_mean": 26.918298721313477, "margin_dpo/margin_std": 22.07672119140625, "step": 196 }, { "epoch": 0.28928046989721, "grad_norm": 52.48564910888672, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.619194507598877, "logits/rejected": -0.6072961091995239, "logps/chosen": -69.83384704589844, "logps/ref_chosen": -60.42033767700195, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -107.95001220703125, "loss": 0.5151, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20070594549179077, "margin_dpo/beta_margin_grad_std": 0.18301643431186676, "margin_dpo/beta_margin_mean": 2.132758855819702, "margin_dpo/beta_margin_std": 1.8700523376464844, "margin_dpo/loss_margin_mean": 21.32758903503418, "margin_dpo/margin_mean": 21.32758903503418, "margin_dpo/margin_std": 18.477304458618164, "step": 197 }, { "epoch": 0.2907488986784141, "grad_norm": 50.00718307495117, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.6850384473800659, "logits/rejected": -0.6736607551574707, "logps/chosen": -64.92497253417969, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -130.21774291992188, "loss": 0.5106, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1974949985742569, "margin_dpo/beta_margin_grad_std": 0.18310509622097015, "margin_dpo/beta_margin_mean": 2.30856990814209, "margin_dpo/beta_margin_std": 2.314561128616333, "margin_dpo/loss_margin_mean": 23.085697174072266, "margin_dpo/margin_mean": 23.085697174072266, "margin_dpo/margin_std": 22.705781936645508, "step": 198 }, { "epoch": 0.2922173274596182, "grad_norm": 54.243560791015625, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.681465744972229, "logits/rejected": -0.6637758016586304, "logps/chosen": -66.67937469482422, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -117.4178237915039, "loss": 0.596, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21248871088027954, "margin_dpo/beta_margin_grad_std": 0.220571368932724, "margin_dpo/beta_margin_mean": 2.2919063568115234, "margin_dpo/beta_margin_std": 2.2386255264282227, "margin_dpo/loss_margin_mean": 22.919063568115234, "margin_dpo/margin_mean": 22.919063568115234, "margin_dpo/margin_std": 22.307586669921875, "step": 199 }, { "epoch": 0.2936857562408223, "grad_norm": 69.43572998046875, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.664925217628479, "logits/rejected": -0.6286830902099609, "logps/chosen": -64.1875991821289, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -116.37126159667969, "loss": 0.6573, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2101076990365982, "margin_dpo/beta_margin_grad_std": 0.22291822731494904, "margin_dpo/beta_margin_mean": 2.464228868484497, "margin_dpo/beta_margin_std": 2.5758798122406006, "margin_dpo/loss_margin_mean": 24.642288208007812, "margin_dpo/margin_mean": 24.642288208007812, "margin_dpo/margin_std": 25.661331176757812, "step": 200 }, { "epoch": 0.2936857562408223, "eval_logits/chosen": -0.6437793374061584, "eval_logits/rejected": -0.6209864616394043, "eval_logps/chosen": -92.97444915771484, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -115.39439392089844, "eval_loss": 0.4776662588119507, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.31056177616119385, "eval_margin_dpo/beta_margin_grad_std": 0.24659062922000885, "eval_margin_dpo/beta_margin_mean": 1.4673058986663818, "eval_margin_dpo/beta_margin_std": 2.1103227138519287, "eval_margin_dpo/loss_margin_mean": 14.67305850982666, "eval_margin_dpo/margin_mean": 14.67305850982666, "eval_margin_dpo/margin_std": 21.103225708007812, "eval_runtime": 40.1206, "eval_samples_per_second": 58.299, "eval_steps_per_second": 1.844, "step": 200 }, { "epoch": 0.29515418502202645, "grad_norm": 52.3123893737793, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.6279897689819336, "logits/rejected": -0.5977976322174072, "logps/chosen": -83.8150634765625, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -148.78948974609375, "loss": 0.5336, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18962885439395905, "margin_dpo/beta_margin_grad_std": 0.21612294018268585, "margin_dpo/beta_margin_mean": 2.556842803955078, "margin_dpo/beta_margin_std": 2.3333568572998047, "margin_dpo/loss_margin_mean": 25.56842613220215, "margin_dpo/margin_mean": 25.56842803955078, "margin_dpo/margin_std": 23.23642349243164, "step": 201 }, { "epoch": 0.2966226138032305, "grad_norm": 71.65006256103516, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.6751070618629456, "logits/rejected": -0.663360059261322, "logps/chosen": -60.95005798339844, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -116.69070434570312, "loss": 0.7708, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25191596150398254, "margin_dpo/beta_margin_grad_std": 0.24605146050453186, "margin_dpo/beta_margin_mean": 1.8856925964355469, "margin_dpo/beta_margin_std": 2.1072704792022705, "margin_dpo/loss_margin_mean": 18.85692596435547, "margin_dpo/margin_mean": 18.85692596435547, "margin_dpo/margin_std": 21.052621841430664, "step": 202 }, { "epoch": 0.29809104258443464, "grad_norm": 45.57592010498047, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.6758503317832947, "logits/rejected": -0.632592499256134, "logps/chosen": -68.63609313964844, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -114.98287963867188, "loss": 0.532, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20186971127986908, "margin_dpo/beta_margin_grad_std": 0.19567202031612396, "margin_dpo/beta_margin_mean": 2.106050491333008, "margin_dpo/beta_margin_std": 1.8404932022094727, "margin_dpo/loss_margin_mean": 21.060503005981445, "margin_dpo/margin_mean": 21.060504913330078, "margin_dpo/margin_std": 18.373455047607422, "step": 203 }, { "epoch": 0.29955947136563876, "grad_norm": 46.98274230957031, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.640312910079956, "logits/rejected": -0.5806652307510376, "logps/chosen": -89.8402099609375, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -119.30364990234375, "loss": 0.5197, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1984976977109909, "margin_dpo/beta_margin_grad_std": 0.19227007031440735, "margin_dpo/beta_margin_mean": 2.193248987197876, "margin_dpo/beta_margin_std": 2.008610725402832, "margin_dpo/loss_margin_mean": 21.9324893951416, "margin_dpo/margin_mean": 21.9324893951416, "margin_dpo/margin_std": 19.7504825592041, "step": 204 }, { "epoch": 0.3010279001468429, "grad_norm": 67.07288360595703, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.690580427646637, "logits/rejected": -0.6472284197807312, "logps/chosen": -70.58891296386719, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -120.11308288574219, "loss": 0.6391, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21127313375473022, "margin_dpo/beta_margin_grad_std": 0.2417152225971222, "margin_dpo/beta_margin_mean": 2.378384828567505, "margin_dpo/beta_margin_std": 2.325178384780884, "margin_dpo/loss_margin_mean": 23.78384780883789, "margin_dpo/margin_mean": 23.78384780883789, "margin_dpo/margin_std": 23.02971649169922, "step": 205 }, { "epoch": 0.302496328928047, "grad_norm": 44.62752914428711, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.6897552013397217, "logits/rejected": -0.6878204345703125, "logps/chosen": -57.75178527832031, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -137.04470825195312, "loss": 0.4331, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16189786791801453, "margin_dpo/beta_margin_grad_std": 0.19883880019187927, "margin_dpo/beta_margin_mean": 2.730393171310425, "margin_dpo/beta_margin_std": 2.1477179527282715, "margin_dpo/loss_margin_mean": 27.303930282592773, "margin_dpo/margin_mean": 27.303932189941406, "margin_dpo/margin_std": 21.387907028198242, "step": 206 }, { "epoch": 0.3039647577092511, "grad_norm": 51.993370056152344, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.6620955467224121, "logits/rejected": -0.6329070925712585, "logps/chosen": -64.26461791992188, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -121.12300872802734, "loss": 0.4613, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1773681491613388, "margin_dpo/beta_margin_grad_std": 0.19014772772789001, "margin_dpo/beta_margin_mean": 2.8466851711273193, "margin_dpo/beta_margin_std": 2.5996739864349365, "margin_dpo/loss_margin_mean": 28.46685028076172, "margin_dpo/margin_mean": 28.46685028076172, "margin_dpo/margin_std": 25.901378631591797, "step": 207 }, { "epoch": 0.3054331864904552, "grad_norm": 52.05033493041992, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.6714012622833252, "logits/rejected": -0.6417751312255859, "logps/chosen": -79.44926452636719, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -129.90614318847656, "loss": 0.4839, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19027359783649445, "margin_dpo/beta_margin_grad_std": 0.18024224042892456, "margin_dpo/beta_margin_mean": 2.3237593173980713, "margin_dpo/beta_margin_std": 2.0613441467285156, "margin_dpo/loss_margin_mean": 23.237592697143555, "margin_dpo/margin_mean": 23.237594604492188, "margin_dpo/margin_std": 20.450965881347656, "step": 208 }, { "epoch": 0.3069016152716593, "grad_norm": 49.21156692504883, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.6503257751464844, "logits/rejected": -0.6238174438476562, "logps/chosen": -65.82164764404297, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -114.3055419921875, "loss": 0.6043, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20685909688472748, "margin_dpo/beta_margin_grad_std": 0.23329907655715942, "margin_dpo/beta_margin_mean": 2.372406482696533, "margin_dpo/beta_margin_std": 2.2294185161590576, "margin_dpo/loss_margin_mean": 23.724063873291016, "margin_dpo/margin_mean": 23.724063873291016, "margin_dpo/margin_std": 21.8853816986084, "step": 209 }, { "epoch": 0.30837004405286345, "grad_norm": 47.80137634277344, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.6832214593887329, "logits/rejected": -0.6534386873245239, "logps/chosen": -56.31086349487305, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -108.84925842285156, "loss": 0.5055, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19675123691558838, "margin_dpo/beta_margin_grad_std": 0.18586638569831848, "margin_dpo/beta_margin_mean": 2.2228593826293945, "margin_dpo/beta_margin_std": 1.9132236242294312, "margin_dpo/loss_margin_mean": 22.228591918945312, "margin_dpo/margin_mean": 22.228591918945312, "margin_dpo/margin_std": 18.896484375, "step": 210 }, { "epoch": 0.30983847283406757, "grad_norm": 59.744529724121094, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.6856693029403687, "logits/rejected": -0.6315619945526123, "logps/chosen": -91.70536041259766, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -122.416015625, "loss": 0.597, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20305074751377106, "margin_dpo/beta_margin_grad_std": 0.23186683654785156, "margin_dpo/beta_margin_mean": 2.2822024822235107, "margin_dpo/beta_margin_std": 2.1043384075164795, "margin_dpo/loss_margin_mean": 22.822023391723633, "margin_dpo/margin_mean": 22.822023391723633, "margin_dpo/margin_std": 20.654033660888672, "step": 211 }, { "epoch": 0.31130690161527164, "grad_norm": 49.052188873291016, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.6898226737976074, "logits/rejected": -0.6595550775527954, "logps/chosen": -70.00460052490234, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05784606933594, "logps/rejected": -143.5913543701172, "loss": 0.4566, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17350082099437714, "margin_dpo/beta_margin_grad_std": 0.19579628109931946, "margin_dpo/beta_margin_mean": 2.872591733932495, "margin_dpo/beta_margin_std": 2.5697410106658936, "margin_dpo/loss_margin_mean": 28.725915908813477, "margin_dpo/margin_mean": 28.725915908813477, "margin_dpo/margin_std": 24.575883865356445, "step": 212 }, { "epoch": 0.31277533039647576, "grad_norm": 51.70877456665039, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.6648838520050049, "logits/rejected": -0.6213950514793396, "logps/chosen": -77.46551513671875, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -132.9736785888672, "loss": 0.4856, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1718873530626297, "margin_dpo/beta_margin_grad_std": 0.20221562683582306, "margin_dpo/beta_margin_mean": 2.9106154441833496, "margin_dpo/beta_margin_std": 2.622797966003418, "margin_dpo/loss_margin_mean": 29.10615348815918, "margin_dpo/margin_mean": 29.106151580810547, "margin_dpo/margin_std": 25.876745223999023, "step": 213 }, { "epoch": 0.3142437591776799, "grad_norm": 59.337345123291016, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.6546872854232788, "logits/rejected": -0.6387699842453003, "logps/chosen": -52.264801025390625, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -111.19406127929688, "loss": 0.6245, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21652430295944214, "margin_dpo/beta_margin_grad_std": 0.2097923308610916, "margin_dpo/beta_margin_mean": 2.3016743659973145, "margin_dpo/beta_margin_std": 2.411076784133911, "margin_dpo/loss_margin_mean": 23.016742706298828, "margin_dpo/margin_mean": 23.016742706298828, "margin_dpo/margin_std": 24.032947540283203, "step": 214 }, { "epoch": 0.315712187958884, "grad_norm": 56.185760498046875, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.6533488035202026, "logits/rejected": -0.624896228313446, "logps/chosen": -63.87721252441406, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -125.45509338378906, "loss": 0.5351, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1931016743183136, "margin_dpo/beta_margin_grad_std": 0.20534518361091614, "margin_dpo/beta_margin_mean": 2.4892501831054688, "margin_dpo/beta_margin_std": 2.3997206687927246, "margin_dpo/loss_margin_mean": 24.892501831054688, "margin_dpo/margin_mean": 24.892501831054688, "margin_dpo/margin_std": 23.67044448852539, "step": 215 }, { "epoch": 0.31718061674008813, "grad_norm": 63.23625564575195, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.6673412919044495, "logits/rejected": -0.6347181797027588, "logps/chosen": -61.9163932800293, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -117.18614196777344, "loss": 0.5867, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2070668637752533, "margin_dpo/beta_margin_grad_std": 0.22892099618911743, "margin_dpo/beta_margin_mean": 2.695861577987671, "margin_dpo/beta_margin_std": 2.648024797439575, "margin_dpo/loss_margin_mean": 26.958616256713867, "margin_dpo/margin_mean": 26.958616256713867, "margin_dpo/margin_std": 26.038864135742188, "step": 216 }, { "epoch": 0.3186490455212922, "grad_norm": 40.630271911621094, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.6815335750579834, "logits/rejected": -0.6386614441871643, "logps/chosen": -60.836097717285156, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -138.99900817871094, "loss": 0.4057, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15030139684677124, "margin_dpo/beta_margin_grad_std": 0.1869634985923767, "margin_dpo/beta_margin_mean": 2.7929890155792236, "margin_dpo/beta_margin_std": 2.152282953262329, "margin_dpo/loss_margin_mean": 27.929887771606445, "margin_dpo/margin_mean": 27.929889678955078, "margin_dpo/margin_std": 21.507854461669922, "step": 217 }, { "epoch": 0.3201174743024963, "grad_norm": 54.89970397949219, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.6562488079071045, "logits/rejected": -0.6250983476638794, "logps/chosen": -69.97561645507812, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -124.18275451660156, "loss": 0.5357, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19920094311237335, "margin_dpo/beta_margin_grad_std": 0.20679454505443573, "margin_dpo/beta_margin_mean": 2.4937424659729004, "margin_dpo/beta_margin_std": 2.4085628986358643, "margin_dpo/loss_margin_mean": 24.937423706054688, "margin_dpo/margin_mean": 24.937423706054688, "margin_dpo/margin_std": 23.69991683959961, "step": 218 }, { "epoch": 0.32158590308370044, "grad_norm": 60.82085037231445, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.6473067998886108, "logits/rejected": -0.6150014400482178, "logps/chosen": -72.4845962524414, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -119.23858642578125, "loss": 0.5267, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19810640811920166, "margin_dpo/beta_margin_grad_std": 0.19194501638412476, "margin_dpo/beta_margin_mean": 2.200741767883301, "margin_dpo/beta_margin_std": 1.9767764806747437, "margin_dpo/loss_margin_mean": 22.007417678833008, "margin_dpo/margin_mean": 22.007417678833008, "margin_dpo/margin_std": 19.649311065673828, "step": 219 }, { "epoch": 0.32305433186490456, "grad_norm": 47.847412109375, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.6549057960510254, "logits/rejected": -0.6138431429862976, "logps/chosen": -81.12652587890625, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -119.67072296142578, "loss": 0.5029, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19834579527378082, "margin_dpo/beta_margin_grad_std": 0.180747851729393, "margin_dpo/beta_margin_mean": 2.0889971256256104, "margin_dpo/beta_margin_std": 1.714986801147461, "margin_dpo/loss_margin_mean": 20.889970779418945, "margin_dpo/margin_mean": 20.889970779418945, "margin_dpo/margin_std": 16.6192626953125, "step": 220 }, { "epoch": 0.3245227606461087, "grad_norm": 48.202903747558594, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.6510884761810303, "logits/rejected": -0.6392531394958496, "logps/chosen": -64.44230651855469, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -151.5745391845703, "loss": 0.339, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12460769712924957, "margin_dpo/beta_margin_grad_std": 0.1816825121641159, "margin_dpo/beta_margin_mean": 3.281925678253174, "margin_dpo/beta_margin_std": 2.298614740371704, "margin_dpo/loss_margin_mean": 32.81925582885742, "margin_dpo/margin_mean": 32.81925582885742, "margin_dpo/margin_std": 22.837791442871094, "step": 221 }, { "epoch": 0.32599118942731276, "grad_norm": 76.29179382324219, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.7204064130783081, "logits/rejected": -0.6859586238861084, "logps/chosen": -74.65057373046875, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -139.82711791992188, "loss": 0.5718, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19307678937911987, "margin_dpo/beta_margin_grad_std": 0.22420592606067657, "margin_dpo/beta_margin_mean": 2.6146483421325684, "margin_dpo/beta_margin_std": 2.517277479171753, "margin_dpo/loss_margin_mean": 26.146484375, "margin_dpo/margin_mean": 26.146484375, "margin_dpo/margin_std": 24.794532775878906, "step": 222 }, { "epoch": 0.3274596182085169, "grad_norm": 78.53938293457031, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.7086101770401001, "logits/rejected": -0.6605532169342041, "logps/chosen": -68.55857849121094, "logps/ref_chosen": -52.64057922363281, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -121.03541564941406, "loss": 0.738, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22235842049121857, "margin_dpo/beta_margin_grad_std": 0.26257219910621643, "margin_dpo/beta_margin_mean": 2.229238271713257, "margin_dpo/beta_margin_std": 2.2963743209838867, "margin_dpo/loss_margin_mean": 22.292381286621094, "margin_dpo/margin_mean": 22.292381286621094, "margin_dpo/margin_std": 22.842185974121094, "step": 223 }, { "epoch": 0.328928046989721, "grad_norm": 74.9097671508789, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.6751635074615479, "logits/rejected": -0.6340160369873047, "logps/chosen": -61.69850158691406, "logps/ref_chosen": -48.59540939331055, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -116.998046875, "loss": 0.5673, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19054511189460754, "margin_dpo/beta_margin_grad_std": 0.2218094766139984, "margin_dpo/beta_margin_mean": 2.6778461933135986, "margin_dpo/beta_margin_std": 2.5320727825164795, "margin_dpo/loss_margin_mean": 26.778461456298828, "margin_dpo/margin_mean": 26.778461456298828, "margin_dpo/margin_std": 24.951221466064453, "step": 224 }, { "epoch": 0.3303964757709251, "grad_norm": 43.42683792114258, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.6741304397583008, "logits/rejected": -0.6419914960861206, "logps/chosen": -72.66340637207031, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90290832519531, "logps/rejected": -147.5189208984375, "loss": 0.4077, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14978624880313873, "margin_dpo/beta_margin_grad_std": 0.203273743391037, "margin_dpo/beta_margin_mean": 3.2953078746795654, "margin_dpo/beta_margin_std": 2.6787664890289307, "margin_dpo/loss_margin_mean": 32.95307922363281, "margin_dpo/margin_mean": 32.95307922363281, "margin_dpo/margin_std": 26.78388214111328, "step": 225 }, { "epoch": 0.33186490455212925, "grad_norm": 51.3669548034668, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.6864838600158691, "logits/rejected": -0.6341279745101929, "logps/chosen": -69.51836395263672, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -114.87089538574219, "loss": 0.4851, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17993833124637604, "margin_dpo/beta_margin_grad_std": 0.19534794986248016, "margin_dpo/beta_margin_mean": 2.5563576221466064, "margin_dpo/beta_margin_std": 2.2327442169189453, "margin_dpo/loss_margin_mean": 25.563575744628906, "margin_dpo/margin_mean": 25.563575744628906, "margin_dpo/margin_std": 21.131916046142578, "step": 226 }, { "epoch": 0.3333333333333333, "grad_norm": 48.44467544555664, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.6759487390518188, "logits/rejected": -0.6457496881484985, "logps/chosen": -70.79923248291016, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -142.67623901367188, "loss": 0.4141, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15354523062705994, "margin_dpo/beta_margin_grad_std": 0.1981254369020462, "margin_dpo/beta_margin_mean": 3.153681993484497, "margin_dpo/beta_margin_std": 2.6311187744140625, "margin_dpo/loss_margin_mean": 31.53681755065918, "margin_dpo/margin_mean": 31.536819458007812, "margin_dpo/margin_std": 26.029647827148438, "step": 227 }, { "epoch": 0.33480176211453744, "grad_norm": 55.06504821777344, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.6563422679901123, "logits/rejected": -0.6227169036865234, "logps/chosen": -78.89456176757812, "logps/ref_chosen": -65.89129638671875, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -128.3238525390625, "loss": 0.5336, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18722449243068695, "margin_dpo/beta_margin_grad_std": 0.2196025550365448, "margin_dpo/beta_margin_mean": 2.427182912826538, "margin_dpo/beta_margin_std": 2.091808795928955, "margin_dpo/loss_margin_mean": 24.27182960510254, "margin_dpo/margin_mean": 24.27182960510254, "margin_dpo/margin_std": 20.79153823852539, "step": 228 }, { "epoch": 0.33627019089574156, "grad_norm": 64.2571792602539, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.6885573863983154, "logits/rejected": -0.6359836459159851, "logps/chosen": -85.07996368408203, "logps/ref_chosen": -70.70636749267578, "logps/ref_rejected": -84.52740478515625, "logps/rejected": -126.00403594970703, "loss": 0.598, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1898403912782669, "margin_dpo/beta_margin_grad_std": 0.23285524547100067, "margin_dpo/beta_margin_mean": 2.71030330657959, "margin_dpo/beta_margin_std": 2.555929660797119, "margin_dpo/loss_margin_mean": 27.1030330657959, "margin_dpo/margin_mean": 27.103031158447266, "margin_dpo/margin_std": 25.39706802368164, "step": 229 }, { "epoch": 0.3377386196769457, "grad_norm": 50.68180465698242, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.6804044842720032, "logits/rejected": -0.6706264019012451, "logps/chosen": -51.398292541503906, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -128.11248779296875, "loss": 0.5067, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16133855283260345, "margin_dpo/beta_margin_grad_std": 0.21256163716316223, "margin_dpo/beta_margin_mean": 3.037428140640259, "margin_dpo/beta_margin_std": 2.7951576709747314, "margin_dpo/loss_margin_mean": 30.37428092956543, "margin_dpo/margin_mean": 30.37428092956543, "margin_dpo/margin_std": 27.84336280822754, "step": 230 }, { "epoch": 0.3392070484581498, "grad_norm": 42.53703689575195, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.6829984188079834, "logits/rejected": -0.6394829750061035, "logps/chosen": -74.62582397460938, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -111.50166320800781, "loss": 0.4698, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18761104345321655, "margin_dpo/beta_margin_grad_std": 0.17357850074768066, "margin_dpo/beta_margin_mean": 2.6028378009796143, "margin_dpo/beta_margin_std": 2.508455276489258, "margin_dpo/loss_margin_mean": 26.028377532958984, "margin_dpo/margin_mean": 26.028377532958984, "margin_dpo/margin_std": 24.996898651123047, "step": 231 }, { "epoch": 0.3406754772393539, "grad_norm": 70.08275604248047, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.6572903394699097, "logits/rejected": -0.5994934439659119, "logps/chosen": -84.29617309570312, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -118.73604583740234, "loss": 0.6921, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2373134195804596, "margin_dpo/beta_margin_grad_std": 0.24425449967384338, "margin_dpo/beta_margin_mean": 2.1211562156677246, "margin_dpo/beta_margin_std": 2.3135242462158203, "margin_dpo/loss_margin_mean": 21.211563110351562, "margin_dpo/margin_mean": 21.211563110351562, "margin_dpo/margin_std": 22.4114933013916, "step": 232 }, { "epoch": 0.342143906020558, "grad_norm": 61.6278076171875, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.6890474557876587, "logits/rejected": -0.6643567085266113, "logps/chosen": -67.88652801513672, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.48007202148438, "logps/rejected": -148.36856079101562, "loss": 0.5602, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18775954842567444, "margin_dpo/beta_margin_grad_std": 0.23140782117843628, "margin_dpo/beta_margin_mean": 2.7884888648986816, "margin_dpo/beta_margin_std": 2.659421682357788, "margin_dpo/loss_margin_mean": 27.8848876953125, "margin_dpo/margin_mean": 27.884885787963867, "margin_dpo/margin_std": 26.062320709228516, "step": 233 }, { "epoch": 0.3436123348017621, "grad_norm": 60.47285461425781, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.6370252370834351, "logits/rejected": -0.6381373405456543, "logps/chosen": -54.90777587890625, "logps/ref_chosen": -44.09451675415039, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -138.9691162109375, "loss": 0.4561, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15120825171470642, "margin_dpo/beta_margin_grad_std": 0.1929025799036026, "margin_dpo/beta_margin_mean": 2.814922571182251, "margin_dpo/beta_margin_std": 2.241670846939087, "margin_dpo/loss_margin_mean": 28.14922523498535, "margin_dpo/margin_mean": 28.14922332763672, "margin_dpo/margin_std": 21.847400665283203, "step": 234 }, { "epoch": 0.34508076358296624, "grad_norm": 63.21758270263672, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.6568065881729126, "logits/rejected": -0.614643931388855, "logps/chosen": -77.54314422607422, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39505767822266, "logps/rejected": -128.5604248046875, "loss": 0.5974, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.215665802359581, "margin_dpo/beta_margin_grad_std": 0.2162242829799652, "margin_dpo/beta_margin_mean": 2.2860143184661865, "margin_dpo/beta_margin_std": 2.296712875366211, "margin_dpo/loss_margin_mean": 22.860143661499023, "margin_dpo/margin_mean": 22.86014175415039, "margin_dpo/margin_std": 22.919218063354492, "step": 235 }, { "epoch": 0.3465491923641703, "grad_norm": 65.25566864013672, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.6559075117111206, "logits/rejected": -0.6537389159202576, "logps/chosen": -60.41249084472656, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -140.07135009765625, "loss": 0.5646, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19891716539859772, "margin_dpo/beta_margin_grad_std": 0.22555947303771973, "margin_dpo/beta_margin_mean": 2.548858165740967, "margin_dpo/beta_margin_std": 2.3670222759246826, "margin_dpo/loss_margin_mean": 25.488582611083984, "margin_dpo/margin_mean": 25.488582611083984, "margin_dpo/margin_std": 23.585155487060547, "step": 236 }, { "epoch": 0.34801762114537443, "grad_norm": 48.03404235839844, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.6702800989151001, "logits/rejected": -0.636156439781189, "logps/chosen": -65.91875457763672, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -134.05665588378906, "loss": 0.5116, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17360197007656097, "margin_dpo/beta_margin_grad_std": 0.21111546456813812, "margin_dpo/beta_margin_mean": 2.7525649070739746, "margin_dpo/beta_margin_std": 2.4088451862335205, "margin_dpo/loss_margin_mean": 27.52564811706543, "margin_dpo/margin_mean": 27.52564811706543, "margin_dpo/margin_std": 24.07387924194336, "step": 237 }, { "epoch": 0.34948604992657856, "grad_norm": 39.46758270263672, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.6305921077728271, "logits/rejected": -0.6234115958213806, "logps/chosen": -64.94898986816406, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -125.03469848632812, "loss": 0.4407, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17112761735916138, "margin_dpo/beta_margin_grad_std": 0.17932020127773285, "margin_dpo/beta_margin_mean": 2.464088201522827, "margin_dpo/beta_margin_std": 2.025897979736328, "margin_dpo/loss_margin_mean": 24.640880584716797, "margin_dpo/margin_mean": 24.640880584716797, "margin_dpo/margin_std": 20.111305236816406, "step": 238 }, { "epoch": 0.3509544787077827, "grad_norm": 49.3748664855957, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.7114957571029663, "logits/rejected": -0.6843305826187134, "logps/chosen": -70.67376708984375, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -131.71542358398438, "loss": 0.4436, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16616235673427582, "margin_dpo/beta_margin_grad_std": 0.18609200417995453, "margin_dpo/beta_margin_mean": 2.554654359817505, "margin_dpo/beta_margin_std": 2.115661144256592, "margin_dpo/loss_margin_mean": 25.54654312133789, "margin_dpo/margin_mean": 25.54654312133789, "margin_dpo/margin_std": 19.89307975769043, "step": 239 }, { "epoch": 0.3524229074889868, "grad_norm": 60.53359603881836, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.6911687850952148, "logits/rejected": -0.6599963903427124, "logps/chosen": -73.52519226074219, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -109.58448791503906, "loss": 0.6257, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2058243602514267, "margin_dpo/beta_margin_grad_std": 0.2343757450580597, "margin_dpo/beta_margin_mean": 2.2555253505706787, "margin_dpo/beta_margin_std": 2.1107068061828613, "margin_dpo/loss_margin_mean": 22.555252075195312, "margin_dpo/margin_mean": 22.555252075195312, "margin_dpo/margin_std": 20.787620544433594, "step": 240 }, { "epoch": 0.35389133627019087, "grad_norm": 59.422630310058594, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.6552136540412903, "logits/rejected": -0.6012428998947144, "logps/chosen": -76.52700805664062, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -130.19644165039062, "loss": 0.5099, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1663733720779419, "margin_dpo/beta_margin_grad_std": 0.22269202768802643, "margin_dpo/beta_margin_mean": 3.169961929321289, "margin_dpo/beta_margin_std": 3.125377655029297, "margin_dpo/loss_margin_mean": 31.69961929321289, "margin_dpo/margin_mean": 31.69961929321289, "margin_dpo/margin_std": 29.628376007080078, "step": 241 }, { "epoch": 0.355359765051395, "grad_norm": 52.94541931152344, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.6454315185546875, "logits/rejected": -0.6136279702186584, "logps/chosen": -79.44197845458984, "logps/ref_chosen": -67.86392211914062, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -119.58251190185547, "loss": 0.4934, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1810285896062851, "margin_dpo/beta_margin_grad_std": 0.19746260344982147, "margin_dpo/beta_margin_mean": 2.4644126892089844, "margin_dpo/beta_margin_std": 2.184521198272705, "margin_dpo/loss_margin_mean": 24.644126892089844, "margin_dpo/margin_mean": 24.644126892089844, "margin_dpo/margin_std": 21.64803123474121, "step": 242 }, { "epoch": 0.3568281938325991, "grad_norm": 34.107791900634766, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.6725857257843018, "logits/rejected": -0.6084048748016357, "logps/chosen": -74.12469482421875, "logps/ref_chosen": -63.08424377441406, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -116.86687469482422, "loss": 0.3271, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13316328823566437, "margin_dpo/beta_margin_grad_std": 0.1522829383611679, "margin_dpo/beta_margin_mean": 2.9490790367126465, "margin_dpo/beta_margin_std": 2.1849277019500732, "margin_dpo/loss_margin_mean": 29.49078941345215, "margin_dpo/margin_mean": 29.49079132080078, "margin_dpo/margin_std": 21.805618286132812, "step": 243 }, { "epoch": 0.35829662261380324, "grad_norm": 42.87071228027344, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.6881895065307617, "logits/rejected": -0.6778185367584229, "logps/chosen": -71.2780990600586, "logps/ref_chosen": -61.14069366455078, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -130.34854125976562, "loss": 0.4934, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18467824161052704, "margin_dpo/beta_margin_grad_std": 0.20196670293807983, "margin_dpo/beta_margin_mean": 2.5319199562072754, "margin_dpo/beta_margin_std": 2.150766372680664, "margin_dpo/loss_margin_mean": 25.319198608398438, "margin_dpo/margin_mean": 25.319198608398438, "margin_dpo/margin_std": 21.36996078491211, "step": 244 }, { "epoch": 0.35976505139500736, "grad_norm": 74.6055679321289, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.7033660411834717, "logits/rejected": -0.6514378786087036, "logps/chosen": -78.92977905273438, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -126.20805358886719, "loss": 0.5326, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1932898312807083, "margin_dpo/beta_margin_grad_std": 0.2051628977060318, "margin_dpo/beta_margin_mean": 2.6900460720062256, "margin_dpo/beta_margin_std": 2.7254021167755127, "margin_dpo/loss_margin_mean": 26.90045928955078, "margin_dpo/margin_mean": 26.900461196899414, "margin_dpo/margin_std": 25.503192901611328, "step": 245 }, { "epoch": 0.36123348017621143, "grad_norm": 56.00790023803711, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.7089934945106506, "logits/rejected": -0.6705622673034668, "logps/chosen": -78.0211181640625, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -118.77372741699219, "loss": 0.5288, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1983981728553772, "margin_dpo/beta_margin_grad_std": 0.19785138964653015, "margin_dpo/beta_margin_mean": 2.310323476791382, "margin_dpo/beta_margin_std": 2.1114535331726074, "margin_dpo/loss_margin_mean": 23.103235244750977, "margin_dpo/margin_mean": 23.103233337402344, "margin_dpo/margin_std": 21.10454559326172, "step": 246 }, { "epoch": 0.36270190895741555, "grad_norm": 41.90789031982422, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.7110755443572998, "logits/rejected": -0.6894150972366333, "logps/chosen": -68.0724868774414, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29327392578125, "logps/rejected": -150.26498413085938, "loss": 0.4045, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14395716786384583, "margin_dpo/beta_margin_grad_std": 0.19858244061470032, "margin_dpo/beta_margin_mean": 3.2504570484161377, "margin_dpo/beta_margin_std": 2.560331344604492, "margin_dpo/loss_margin_mean": 32.50457000732422, "margin_dpo/margin_mean": 32.50457000732422, "margin_dpo/margin_std": 25.436208724975586, "step": 247 }, { "epoch": 0.3641703377386197, "grad_norm": 42.92959213256836, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.6845219135284424, "logits/rejected": -0.6683632135391235, "logps/chosen": -54.540321350097656, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -126.71005249023438, "loss": 0.4107, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16844278573989868, "margin_dpo/beta_margin_grad_std": 0.15559379756450653, "margin_dpo/beta_margin_mean": 2.4356071949005127, "margin_dpo/beta_margin_std": 1.9233942031860352, "margin_dpo/loss_margin_mean": 24.35607147216797, "margin_dpo/margin_mean": 24.35607147216797, "margin_dpo/margin_std": 19.101226806640625, "step": 248 }, { "epoch": 0.3656387665198238, "grad_norm": 59.13127517700195, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.7052150964736938, "logits/rejected": -0.6669450998306274, "logps/chosen": -74.95724487304688, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -113.90575408935547, "loss": 0.6535, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23194736242294312, "margin_dpo/beta_margin_grad_std": 0.2308819442987442, "margin_dpo/beta_margin_mean": 2.0922796726226807, "margin_dpo/beta_margin_std": 2.0924148559570312, "margin_dpo/loss_margin_mean": 20.92279624938965, "margin_dpo/margin_mean": 20.92279624938965, "margin_dpo/margin_std": 20.69894790649414, "step": 249 }, { "epoch": 0.3671071953010279, "grad_norm": 34.11585235595703, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.6944586038589478, "logits/rejected": -0.6244109272956848, "logps/chosen": -74.52294158935547, "logps/ref_chosen": -65.6366958618164, "logps/ref_rejected": -73.87183380126953, "logps/rejected": -108.57221221923828, "loss": 0.4359, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1552935689687729, "margin_dpo/beta_margin_grad_std": 0.19923508167266846, "margin_dpo/beta_margin_mean": 2.581413507461548, "margin_dpo/beta_margin_std": 1.8444185256958008, "margin_dpo/loss_margin_mean": 25.81413459777832, "margin_dpo/margin_mean": 25.81413459777832, "margin_dpo/margin_std": 18.1165828704834, "step": 250 }, { "epoch": 0.368575624082232, "grad_norm": 44.37178039550781, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.7029905319213867, "logits/rejected": -0.6486064195632935, "logps/chosen": -70.05519104003906, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -116.27742767333984, "loss": 0.4579, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1774299144744873, "margin_dpo/beta_margin_grad_std": 0.19099289178848267, "margin_dpo/beta_margin_mean": 2.5741522312164307, "margin_dpo/beta_margin_std": 2.079760789871216, "margin_dpo/loss_margin_mean": 25.74152183532715, "margin_dpo/margin_mean": 25.74152183532715, "margin_dpo/margin_std": 20.57958984375, "step": 251 }, { "epoch": 0.3700440528634361, "grad_norm": 53.544761657714844, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.6703172326087952, "logits/rejected": -0.62431800365448, "logps/chosen": -83.42109680175781, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75798797607422, "logps/rejected": -122.17694854736328, "loss": 0.4309, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15842175483703613, "margin_dpo/beta_margin_grad_std": 0.1866365671157837, "margin_dpo/beta_margin_mean": 2.5683515071868896, "margin_dpo/beta_margin_std": 1.9699735641479492, "margin_dpo/loss_margin_mean": 25.683515548706055, "margin_dpo/margin_mean": 25.683515548706055, "margin_dpo/margin_std": 19.43787384033203, "step": 252 }, { "epoch": 0.37151248164464024, "grad_norm": 50.1516227722168, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.6842066049575806, "logits/rejected": -0.654214084148407, "logps/chosen": -79.20399475097656, "logps/ref_chosen": -69.13392639160156, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -132.8687744140625, "loss": 0.6253, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2049117386341095, "margin_dpo/beta_margin_grad_std": 0.23132526874542236, "margin_dpo/beta_margin_mean": 2.4096176624298096, "margin_dpo/beta_margin_std": 2.3356130123138428, "margin_dpo/loss_margin_mean": 24.096176147460938, "margin_dpo/margin_mean": 24.096176147460938, "margin_dpo/margin_std": 22.95254135131836, "step": 253 }, { "epoch": 0.37298091042584436, "grad_norm": 64.96926879882812, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.7051047682762146, "logits/rejected": -0.6575514078140259, "logps/chosen": -64.82050323486328, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -125.6099853515625, "loss": 0.5557, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19986601173877716, "margin_dpo/beta_margin_grad_std": 0.2192426323890686, "margin_dpo/beta_margin_mean": 2.4636828899383545, "margin_dpo/beta_margin_std": 2.2709455490112305, "margin_dpo/loss_margin_mean": 24.636829376220703, "margin_dpo/margin_mean": 24.636829376220703, "margin_dpo/margin_std": 22.574371337890625, "step": 254 }, { "epoch": 0.3744493392070485, "grad_norm": 66.39599609375, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.7102745771408081, "logits/rejected": -0.6797518730163574, "logps/chosen": -70.39166259765625, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -130.80026245117188, "loss": 0.6594, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1914447396993637, "margin_dpo/beta_margin_grad_std": 0.22745780646800995, "margin_dpo/beta_margin_mean": 2.734168291091919, "margin_dpo/beta_margin_std": 2.8561336994171143, "margin_dpo/loss_margin_mean": 27.34168243408203, "margin_dpo/margin_mean": 27.34168243408203, "margin_dpo/margin_std": 28.012739181518555, "step": 255 }, { "epoch": 0.37591776798825255, "grad_norm": 58.85321807861328, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.6565215587615967, "logits/rejected": -0.6265472769737244, "logps/chosen": -68.35701751708984, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -133.102294921875, "loss": 0.5121, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17351847887039185, "margin_dpo/beta_margin_grad_std": 0.20426101982593536, "margin_dpo/beta_margin_mean": 2.734584093093872, "margin_dpo/beta_margin_std": 2.515676498413086, "margin_dpo/loss_margin_mean": 27.34583854675293, "margin_dpo/margin_mean": 27.34583854675293, "margin_dpo/margin_std": 25.045982360839844, "step": 256 }, { "epoch": 0.37738619676945667, "grad_norm": 45.65961456298828, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.6682260036468506, "logits/rejected": -0.6451402902603149, "logps/chosen": -61.90161895751953, "logps/ref_chosen": -49.4236946105957, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -122.11911010742188, "loss": 0.5015, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17995263636112213, "margin_dpo/beta_margin_grad_std": 0.22144293785095215, "margin_dpo/beta_margin_mean": 3.0103280544281006, "margin_dpo/beta_margin_std": 2.688237190246582, "margin_dpo/loss_margin_mean": 30.103281021118164, "margin_dpo/margin_mean": 30.103281021118164, "margin_dpo/margin_std": 25.718414306640625, "step": 257 }, { "epoch": 0.3788546255506608, "grad_norm": 90.31965637207031, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.6991258263587952, "logits/rejected": -0.669155478477478, "logps/chosen": -75.35392761230469, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.9901123046875, "logps/rejected": -138.38613891601562, "loss": 0.7407, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2009788304567337, "margin_dpo/beta_margin_grad_std": 0.24991276860237122, "margin_dpo/beta_margin_mean": 2.642622947692871, "margin_dpo/beta_margin_std": 2.7432861328125, "margin_dpo/loss_margin_mean": 26.426227569580078, "margin_dpo/margin_mean": 26.426227569580078, "margin_dpo/margin_std": 27.302228927612305, "step": 258 }, { "epoch": 0.3803230543318649, "grad_norm": 54.30337142944336, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.6342747211456299, "logits/rejected": -0.6089296340942383, "logps/chosen": -67.30290222167969, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.19165802001953, "logps/rejected": -127.61224365234375, "loss": 0.5311, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1905156373977661, "margin_dpo/beta_margin_grad_std": 0.20092925429344177, "margin_dpo/beta_margin_mean": 2.3946025371551514, "margin_dpo/beta_margin_std": 2.1142630577087402, "margin_dpo/loss_margin_mean": 23.946025848388672, "margin_dpo/margin_mean": 23.94602394104004, "margin_dpo/margin_std": 20.407352447509766, "step": 259 }, { "epoch": 0.38179148311306904, "grad_norm": 60.32538604736328, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.6234908103942871, "logits/rejected": -0.6234794855117798, "logps/chosen": -62.85065460205078, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08979034423828, "logps/rejected": -137.6796417236328, "loss": 0.5057, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18406428396701813, "margin_dpo/beta_margin_grad_std": 0.20967774093151093, "margin_dpo/beta_margin_mean": 2.7156875133514404, "margin_dpo/beta_margin_std": 2.5070676803588867, "margin_dpo/loss_margin_mean": 27.156875610351562, "margin_dpo/margin_mean": 27.156875610351562, "margin_dpo/margin_std": 25.030288696289062, "step": 260 }, { "epoch": 0.3832599118942731, "grad_norm": 47.035186767578125, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.6528719067573547, "logits/rejected": -0.6170308589935303, "logps/chosen": -65.26475524902344, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -129.43865966796875, "loss": 0.4748, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17383930087089539, "margin_dpo/beta_margin_grad_std": 0.20882652699947357, "margin_dpo/beta_margin_mean": 2.869032144546509, "margin_dpo/beta_margin_std": 2.6597743034362793, "margin_dpo/loss_margin_mean": 28.690322875976562, "margin_dpo/margin_mean": 28.690322875976562, "margin_dpo/margin_std": 26.378036499023438, "step": 261 }, { "epoch": 0.38472834067547723, "grad_norm": 65.2990493774414, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.6476384401321411, "logits/rejected": -0.601101279258728, "logps/chosen": -74.31780242919922, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -126.95146179199219, "loss": 0.5286, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1703178882598877, "margin_dpo/beta_margin_grad_std": 0.2238461673259735, "margin_dpo/beta_margin_mean": 2.583526849746704, "margin_dpo/beta_margin_std": 2.1952381134033203, "margin_dpo/loss_margin_mean": 25.835268020629883, "margin_dpo/margin_mean": 25.835269927978516, "margin_dpo/margin_std": 21.91771697998047, "step": 262 }, { "epoch": 0.38619676945668135, "grad_norm": 63.93273162841797, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.6848942041397095, "logits/rejected": -0.6289730072021484, "logps/chosen": -71.28265380859375, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -139.320556640625, "loss": 0.4332, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16087834537029266, "margin_dpo/beta_margin_grad_std": 0.1975705325603485, "margin_dpo/beta_margin_mean": 3.0601646900177, "margin_dpo/beta_margin_std": 2.6217379570007324, "margin_dpo/loss_margin_mean": 30.601646423339844, "margin_dpo/margin_mean": 30.601646423339844, "margin_dpo/margin_std": 26.212867736816406, "step": 263 }, { "epoch": 0.3876651982378855, "grad_norm": 65.42985534667969, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.6409514546394348, "logits/rejected": -0.6121193766593933, "logps/chosen": -76.15332794189453, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85621643066406, "logps/rejected": -131.38528442382812, "loss": 0.5449, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19213639199733734, "margin_dpo/beta_margin_grad_std": 0.20564202964305878, "margin_dpo/beta_margin_mean": 2.409783363342285, "margin_dpo/beta_margin_std": 2.2144694328308105, "margin_dpo/loss_margin_mean": 24.09783363342285, "margin_dpo/margin_mean": 24.097835540771484, "margin_dpo/margin_std": 21.762907028198242, "step": 264 }, { "epoch": 0.3891336270190896, "grad_norm": 73.85417938232422, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.6545775532722473, "logits/rejected": -0.6242020130157471, "logps/chosen": -76.27545928955078, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -127.61671447753906, "loss": 0.5823, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20656327903270721, "margin_dpo/beta_margin_grad_std": 0.2223885953426361, "margin_dpo/beta_margin_mean": 2.529212474822998, "margin_dpo/beta_margin_std": 2.583730459213257, "margin_dpo/loss_margin_mean": 25.292123794555664, "margin_dpo/margin_mean": 25.292123794555664, "margin_dpo/margin_std": 25.746461868286133, "step": 265 }, { "epoch": 0.39060205580029367, "grad_norm": 57.55160903930664, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.6736335754394531, "logits/rejected": -0.6081231832504272, "logps/chosen": -80.7601547241211, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -106.38961791992188, "loss": 0.5648, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20071399211883545, "margin_dpo/beta_margin_grad_std": 0.22172965109348297, "margin_dpo/beta_margin_mean": 2.47579026222229, "margin_dpo/beta_margin_std": 2.267148017883301, "margin_dpo/loss_margin_mean": 24.757902145385742, "margin_dpo/margin_mean": 24.757904052734375, "margin_dpo/margin_std": 22.62070083618164, "step": 266 }, { "epoch": 0.3920704845814978, "grad_norm": 53.11775588989258, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.6374738216400146, "logits/rejected": -0.5902992486953735, "logps/chosen": -82.08721923828125, "logps/ref_chosen": -68.97074890136719, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -130.69570922851562, "loss": 0.4124, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15888135135173798, "margin_dpo/beta_margin_grad_std": 0.18455727398395538, "margin_dpo/beta_margin_mean": 2.7410788536071777, "margin_dpo/beta_margin_std": 2.210850954055786, "margin_dpo/loss_margin_mean": 27.410789489746094, "margin_dpo/margin_mean": 27.410789489746094, "margin_dpo/margin_std": 22.08306884765625, "step": 267 }, { "epoch": 0.3935389133627019, "grad_norm": 62.39994812011719, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.6283696293830872, "logits/rejected": -0.6117571592330933, "logps/chosen": -68.46856689453125, "logps/ref_chosen": -55.900306701660156, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -139.620361328125, "loss": 0.4971, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1794806867837906, "margin_dpo/beta_margin_grad_std": 0.21003910899162292, "margin_dpo/beta_margin_mean": 2.5404462814331055, "margin_dpo/beta_margin_std": 2.1823596954345703, "margin_dpo/loss_margin_mean": 25.404462814331055, "margin_dpo/margin_mean": 25.404464721679688, "margin_dpo/margin_std": 21.592742919921875, "step": 268 }, { "epoch": 0.39500734214390604, "grad_norm": 53.5538330078125, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.6579411029815674, "logits/rejected": -0.6127967238426208, "logps/chosen": -82.96507263183594, "logps/ref_chosen": -70.03955078125, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -153.08908081054688, "loss": 0.4067, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14487290382385254, "margin_dpo/beta_margin_grad_std": 0.21411198377609253, "margin_dpo/beta_margin_mean": 3.2814202308654785, "margin_dpo/beta_margin_std": 2.366851329803467, "margin_dpo/loss_margin_mean": 32.81420135498047, "margin_dpo/margin_mean": 32.81420135498047, "margin_dpo/margin_std": 23.582063674926758, "step": 269 }, { "epoch": 0.3964757709251101, "grad_norm": 41.36155319213867, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.6722906827926636, "logits/rejected": -0.6524355411529541, "logps/chosen": -80.42156219482422, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -145.8834228515625, "loss": 0.5061, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18635544180870056, "margin_dpo/beta_margin_grad_std": 0.20361123979091644, "margin_dpo/beta_margin_mean": 2.506671190261841, "margin_dpo/beta_margin_std": 2.224198341369629, "margin_dpo/loss_margin_mean": 25.06671142578125, "margin_dpo/margin_mean": 25.06671142578125, "margin_dpo/margin_std": 22.077110290527344, "step": 270 }, { "epoch": 0.39794419970631423, "grad_norm": 51.65666961669922, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.6164276599884033, "logits/rejected": -0.5750702619552612, "logps/chosen": -71.0101318359375, "logps/ref_chosen": -56.76457214355469, "logps/ref_rejected": -92.51383209228516, "logps/rejected": -132.6912841796875, "loss": 0.534, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18699227273464203, "margin_dpo/beta_margin_grad_std": 0.21381914615631104, "margin_dpo/beta_margin_mean": 2.593189001083374, "margin_dpo/beta_margin_std": 2.331104040145874, "margin_dpo/loss_margin_mean": 25.9318904876709, "margin_dpo/margin_mean": 25.931888580322266, "margin_dpo/margin_std": 23.307266235351562, "step": 271 }, { "epoch": 0.39941262848751835, "grad_norm": 52.00728225708008, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.6098858714103699, "logits/rejected": -0.5980672836303711, "logps/chosen": -62.51170349121094, "logps/ref_chosen": -49.49715805053711, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -150.09420776367188, "loss": 0.5359, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17077092826366425, "margin_dpo/beta_margin_grad_std": 0.23799988627433777, "margin_dpo/beta_margin_mean": 3.153686046600342, "margin_dpo/beta_margin_std": 2.745887041091919, "margin_dpo/loss_margin_mean": 31.5368595123291, "margin_dpo/margin_mean": 31.5368595123291, "margin_dpo/margin_std": 27.00173568725586, "step": 272 }, { "epoch": 0.4008810572687225, "grad_norm": 59.0302848815918, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.5825521945953369, "logits/rejected": -0.5468716025352478, "logps/chosen": -77.9120864868164, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -137.728759765625, "loss": 0.4627, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16412891447544098, "margin_dpo/beta_margin_grad_std": 0.2039998471736908, "margin_dpo/beta_margin_mean": 3.0293478965759277, "margin_dpo/beta_margin_std": 2.5976314544677734, "margin_dpo/loss_margin_mean": 30.29347801208496, "margin_dpo/margin_mean": 30.293479919433594, "margin_dpo/margin_std": 24.974472045898438, "step": 273 }, { "epoch": 0.4023494860499266, "grad_norm": 55.06562423706055, "learning_rate": 3.75e-07, "logits/chosen": -0.6257538199424744, "logits/rejected": -0.5888440608978271, "logps/chosen": -71.956298828125, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -119.93206787109375, "loss": 0.5193, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1804315745830536, "margin_dpo/beta_margin_grad_std": 0.2112412303686142, "margin_dpo/beta_margin_mean": 2.6310408115386963, "margin_dpo/beta_margin_std": 2.3380789756774902, "margin_dpo/loss_margin_mean": 26.310407638549805, "margin_dpo/margin_mean": 26.310407638549805, "margin_dpo/margin_std": 23.162757873535156, "step": 274 }, { "epoch": 0.40381791483113066, "grad_norm": 64.80329895019531, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.657637894153595, "logits/rejected": -0.6397134065628052, "logps/chosen": -60.017059326171875, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -132.4287567138672, "loss": 0.4719, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17478503286838531, "margin_dpo/beta_margin_grad_std": 0.20220105350017548, "margin_dpo/beta_margin_mean": 2.7702696323394775, "margin_dpo/beta_margin_std": 2.443610906600952, "margin_dpo/loss_margin_mean": 27.70269775390625, "margin_dpo/margin_mean": 27.70269775390625, "margin_dpo/margin_std": 24.364110946655273, "step": 275 }, { "epoch": 0.4052863436123348, "grad_norm": 62.004940032958984, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.6588333249092102, "logits/rejected": -0.6178128719329834, "logps/chosen": -70.30280303955078, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -109.56034851074219, "loss": 0.5956, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21022818982601166, "margin_dpo/beta_margin_grad_std": 0.23304350674152374, "margin_dpo/beta_margin_mean": 2.5469281673431396, "margin_dpo/beta_margin_std": 2.4726295471191406, "margin_dpo/loss_margin_mean": 25.469282150268555, "margin_dpo/margin_mean": 25.469280242919922, "margin_dpo/margin_std": 24.609455108642578, "step": 276 }, { "epoch": 0.4067547723935389, "grad_norm": 41.01213836669922, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.6487230658531189, "logits/rejected": -0.6121164560317993, "logps/chosen": -68.6185302734375, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -129.5735626220703, "loss": 0.4262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17126400768756866, "margin_dpo/beta_margin_grad_std": 0.16744239628314972, "margin_dpo/beta_margin_mean": 2.434532880783081, "margin_dpo/beta_margin_std": 1.9181517362594604, "margin_dpo/loss_margin_mean": 24.345327377319336, "margin_dpo/margin_mean": 24.34532928466797, "margin_dpo/margin_std": 18.586042404174805, "step": 277 }, { "epoch": 0.40822320117474303, "grad_norm": 45.832942962646484, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.6533815860748291, "logits/rejected": -0.6426759958267212, "logps/chosen": -64.33135986328125, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -132.53787231445312, "loss": 0.482, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17872436344623566, "margin_dpo/beta_margin_grad_std": 0.21186299622058868, "margin_dpo/beta_margin_mean": 2.989488124847412, "margin_dpo/beta_margin_std": 2.6637423038482666, "margin_dpo/loss_margin_mean": 29.894880294799805, "margin_dpo/margin_mean": 29.894882202148438, "margin_dpo/margin_std": 26.599227905273438, "step": 278 }, { "epoch": 0.40969162995594716, "grad_norm": 55.28075408935547, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.6621390581130981, "logits/rejected": -0.6348008513450623, "logps/chosen": -60.90464782714844, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -129.54296875, "loss": 0.488, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17444436252117157, "margin_dpo/beta_margin_grad_std": 0.2196406126022339, "margin_dpo/beta_margin_mean": 2.9428205490112305, "margin_dpo/beta_margin_std": 2.446150302886963, "margin_dpo/loss_margin_mean": 29.428205490112305, "margin_dpo/margin_mean": 29.428203582763672, "margin_dpo/margin_std": 24.257051467895508, "step": 279 }, { "epoch": 0.4111600587371512, "grad_norm": 38.125099182128906, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.6191302537918091, "logits/rejected": -0.6008737683296204, "logps/chosen": -72.28899383544922, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -141.04525756835938, "loss": 0.3483, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.133779838681221, "margin_dpo/beta_margin_grad_std": 0.18086881935596466, "margin_dpo/beta_margin_mean": 2.8934521675109863, "margin_dpo/beta_margin_std": 1.8741562366485596, "margin_dpo/loss_margin_mean": 28.934520721435547, "margin_dpo/margin_mean": 28.934520721435547, "margin_dpo/margin_std": 18.611377716064453, "step": 280 }, { "epoch": 0.41262848751835535, "grad_norm": 64.53363037109375, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.665095329284668, "logits/rejected": -0.62502121925354, "logps/chosen": -67.85418701171875, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -118.54179382324219, "loss": 0.483, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15846484899520874, "margin_dpo/beta_margin_grad_std": 0.1876082420349121, "margin_dpo/beta_margin_mean": 2.5293540954589844, "margin_dpo/beta_margin_std": 2.0211899280548096, "margin_dpo/loss_margin_mean": 25.293540954589844, "margin_dpo/margin_mean": 25.293540954589844, "margin_dpo/margin_std": 20.025854110717773, "step": 281 }, { "epoch": 0.41409691629955947, "grad_norm": 50.1074333190918, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.6407305002212524, "logits/rejected": -0.593590497970581, "logps/chosen": -85.81141662597656, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -131.50296020507812, "loss": 0.475, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18391168117523193, "margin_dpo/beta_margin_grad_std": 0.18578048050403595, "margin_dpo/beta_margin_mean": 2.341071128845215, "margin_dpo/beta_margin_std": 2.0938100814819336, "margin_dpo/loss_margin_mean": 23.41071128845215, "margin_dpo/margin_mean": 23.41071128845215, "margin_dpo/margin_std": 20.858131408691406, "step": 282 }, { "epoch": 0.4155653450807636, "grad_norm": 48.29468536376953, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.6420848369598389, "logits/rejected": -0.6138025522232056, "logps/chosen": -74.30840301513672, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -137.50985717773438, "loss": 0.4753, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1640281230211258, "margin_dpo/beta_margin_grad_std": 0.21006377041339874, "margin_dpo/beta_margin_mean": 2.811335563659668, "margin_dpo/beta_margin_std": 2.354823589324951, "margin_dpo/loss_margin_mean": 28.113353729248047, "margin_dpo/margin_mean": 28.113353729248047, "margin_dpo/margin_std": 23.463539123535156, "step": 283 }, { "epoch": 0.4170337738619677, "grad_norm": 43.3529167175293, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.6275640726089478, "logits/rejected": -0.6143908500671387, "logps/chosen": -72.13871765136719, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -137.00511169433594, "loss": 0.4108, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15540792047977448, "margin_dpo/beta_margin_grad_std": 0.1845344454050064, "margin_dpo/beta_margin_mean": 2.983266592025757, "margin_dpo/beta_margin_std": 2.578672170639038, "margin_dpo/loss_margin_mean": 29.832664489746094, "margin_dpo/margin_mean": 29.832664489746094, "margin_dpo/margin_std": 25.724153518676758, "step": 284 }, { "epoch": 0.4185022026431718, "grad_norm": 57.101165771484375, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.6859316825866699, "logits/rejected": -0.6596359014511108, "logps/chosen": -81.82306671142578, "logps/ref_chosen": -66.2322006225586, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -141.17568969726562, "loss": 0.4257, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16400812566280365, "margin_dpo/beta_margin_grad_std": 0.18523728847503662, "margin_dpo/beta_margin_mean": 2.6457977294921875, "margin_dpo/beta_margin_std": 2.1147918701171875, "margin_dpo/loss_margin_mean": 26.457977294921875, "margin_dpo/margin_mean": 26.457977294921875, "margin_dpo/margin_std": 20.855016708374023, "step": 285 }, { "epoch": 0.4199706314243759, "grad_norm": 58.981807708740234, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.6508222222328186, "logits/rejected": -0.6174975633621216, "logps/chosen": -86.8876724243164, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -130.25048828125, "loss": 0.5505, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18501420319080353, "margin_dpo/beta_margin_grad_std": 0.22696195542812347, "margin_dpo/beta_margin_mean": 2.772536516189575, "margin_dpo/beta_margin_std": 2.563308000564575, "margin_dpo/loss_margin_mean": 27.725364685058594, "margin_dpo/margin_mean": 27.725364685058594, "margin_dpo/margin_std": 25.239097595214844, "step": 286 }, { "epoch": 0.42143906020558003, "grad_norm": 52.582481384277344, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.658734142780304, "logits/rejected": -0.6133627891540527, "logps/chosen": -75.86917114257812, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.6960678100586, "logps/rejected": -118.89381408691406, "loss": 0.534, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1924961507320404, "margin_dpo/beta_margin_grad_std": 0.21066516637802124, "margin_dpo/beta_margin_mean": 2.6869726181030273, "margin_dpo/beta_margin_std": 2.597897529602051, "margin_dpo/loss_margin_mean": 26.869726181030273, "margin_dpo/margin_mean": 26.86972427368164, "margin_dpo/margin_std": 25.95490264892578, "step": 287 }, { "epoch": 0.42290748898678415, "grad_norm": 57.62872314453125, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.634566605091095, "logits/rejected": -0.6029102206230164, "logps/chosen": -72.6466293334961, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.335693359375, "logps/rejected": -131.12515258789062, "loss": 0.4303, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16040383279323578, "margin_dpo/beta_margin_grad_std": 0.19727593660354614, "margin_dpo/beta_margin_mean": 2.7804043292999268, "margin_dpo/beta_margin_std": 2.1588449478149414, "margin_dpo/loss_margin_mean": 27.80404281616211, "margin_dpo/margin_mean": 27.80404281616211, "margin_dpo/margin_std": 21.125944137573242, "step": 288 }, { "epoch": 0.4243759177679883, "grad_norm": 50.83492660522461, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.6417437791824341, "logits/rejected": -0.6304539442062378, "logps/chosen": -61.59012985229492, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -134.28424072265625, "loss": 0.5004, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18503758311271667, "margin_dpo/beta_margin_grad_std": 0.2104889303445816, "margin_dpo/beta_margin_mean": 3.0281827449798584, "margin_dpo/beta_margin_std": 2.936239004135132, "margin_dpo/loss_margin_mean": 30.28182601928711, "margin_dpo/margin_mean": 30.28182601928711, "margin_dpo/margin_std": 29.03339958190918, "step": 289 }, { "epoch": 0.42584434654919234, "grad_norm": 63.28836441040039, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.5944575071334839, "logits/rejected": -0.5908774137496948, "logps/chosen": -72.06575775146484, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -164.83444213867188, "loss": 0.5027, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17223544418811798, "margin_dpo/beta_margin_grad_std": 0.22271078824996948, "margin_dpo/beta_margin_mean": 3.15316104888916, "margin_dpo/beta_margin_std": 2.957723617553711, "margin_dpo/loss_margin_mean": 31.53160858154297, "margin_dpo/margin_mean": 31.53160858154297, "margin_dpo/margin_std": 29.308597564697266, "step": 290 }, { "epoch": 0.42731277533039647, "grad_norm": 56.67517852783203, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.6302033066749573, "logits/rejected": -0.595551609992981, "logps/chosen": -76.0269775390625, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -129.76498413085938, "loss": 0.4813, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16477952897548676, "margin_dpo/beta_margin_grad_std": 0.20427057147026062, "margin_dpo/beta_margin_mean": 2.7701282501220703, "margin_dpo/beta_margin_std": 2.2966012954711914, "margin_dpo/loss_margin_mean": 27.701282501220703, "margin_dpo/margin_mean": 27.70128059387207, "margin_dpo/margin_std": 22.738750457763672, "step": 291 }, { "epoch": 0.4287812041116006, "grad_norm": 54.23537063598633, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.6187624931335449, "logits/rejected": -0.5753225684165955, "logps/chosen": -70.70861053466797, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -136.59767150878906, "loss": 0.3934, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15516723692417145, "margin_dpo/beta_margin_grad_std": 0.1758795827627182, "margin_dpo/beta_margin_mean": 2.9846208095550537, "margin_dpo/beta_margin_std": 2.461369752883911, "margin_dpo/loss_margin_mean": 29.846208572387695, "margin_dpo/margin_mean": 29.846210479736328, "margin_dpo/margin_std": 24.429065704345703, "step": 292 }, { "epoch": 0.4302496328928047, "grad_norm": 52.83697509765625, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.6384230852127075, "logits/rejected": -0.6081752777099609, "logps/chosen": -76.67402648925781, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.8524398803711, "logps/rejected": -118.3228988647461, "loss": 0.5966, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2109437733888626, "margin_dpo/beta_margin_grad_std": 0.217354878783226, "margin_dpo/beta_margin_mean": 2.417245864868164, "margin_dpo/beta_margin_std": 2.448225259780884, "margin_dpo/loss_margin_mean": 24.17245864868164, "margin_dpo/margin_mean": 24.17245864868164, "margin_dpo/margin_std": 24.42681121826172, "step": 293 }, { "epoch": 0.43171806167400884, "grad_norm": 42.41814041137695, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.6331781148910522, "logits/rejected": -0.6203632354736328, "logps/chosen": -62.738616943359375, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -123.75438690185547, "loss": 0.5153, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1880524605512619, "margin_dpo/beta_margin_grad_std": 0.2148909568786621, "margin_dpo/beta_margin_mean": 2.7206313610076904, "margin_dpo/beta_margin_std": 2.606935977935791, "margin_dpo/loss_margin_mean": 27.206314086914062, "margin_dpo/margin_mean": 27.206314086914062, "margin_dpo/margin_std": 25.83649444580078, "step": 294 }, { "epoch": 0.4331864904552129, "grad_norm": 63.754703521728516, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.6102343797683716, "logits/rejected": -0.6015244722366333, "logps/chosen": -68.84889221191406, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -133.5269775390625, "loss": 0.5905, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18437956273555756, "margin_dpo/beta_margin_grad_std": 0.22511690855026245, "margin_dpo/beta_margin_mean": 2.714644432067871, "margin_dpo/beta_margin_std": 2.515841245651245, "margin_dpo/loss_margin_mean": 27.14644432067871, "margin_dpo/margin_mean": 27.146446228027344, "margin_dpo/margin_std": 24.949363708496094, "step": 295 }, { "epoch": 0.434654919236417, "grad_norm": 79.74415588378906, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.6979824304580688, "logits/rejected": -0.6591476202011108, "logps/chosen": -84.0577392578125, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -135.13502502441406, "loss": 0.7362, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23561769723892212, "margin_dpo/beta_margin_grad_std": 0.25472357869148254, "margin_dpo/beta_margin_mean": 2.3479440212249756, "margin_dpo/beta_margin_std": 2.6585099697113037, "margin_dpo/loss_margin_mean": 23.47943878173828, "margin_dpo/margin_mean": 23.47943878173828, "margin_dpo/margin_std": 26.526391983032227, "step": 296 }, { "epoch": 0.43612334801762115, "grad_norm": 44.74517059326172, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.5774829387664795, "logits/rejected": -0.5429031848907471, "logps/chosen": -74.85836029052734, "logps/ref_chosen": -56.47694778442383, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -142.09182739257812, "loss": 0.426, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16123466193675995, "margin_dpo/beta_margin_grad_std": 0.1913672685623169, "margin_dpo/beta_margin_mean": 2.8571863174438477, "margin_dpo/beta_margin_std": 2.378805160522461, "margin_dpo/loss_margin_mean": 28.571861267089844, "margin_dpo/margin_mean": 28.571863174438477, "margin_dpo/margin_std": 23.766578674316406, "step": 297 }, { "epoch": 0.43759177679882527, "grad_norm": 44.202003479003906, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.6306143999099731, "logits/rejected": -0.6259936690330505, "logps/chosen": -83.96099090576172, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -163.1012420654297, "loss": 0.4135, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1614616960287094, "margin_dpo/beta_margin_grad_std": 0.180389866232872, "margin_dpo/beta_margin_mean": 2.9803249835968018, "margin_dpo/beta_margin_std": 2.6625173091888428, "margin_dpo/loss_margin_mean": 29.803251266479492, "margin_dpo/margin_mean": 29.80324935913086, "margin_dpo/margin_std": 26.537506103515625, "step": 298 }, { "epoch": 0.4390602055800294, "grad_norm": 59.32780075073242, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.5633834600448608, "logits/rejected": -0.5420501232147217, "logps/chosen": -64.30989837646484, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -130.85752868652344, "loss": 0.5069, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17790299654006958, "margin_dpo/beta_margin_grad_std": 0.22678081691265106, "margin_dpo/beta_margin_mean": 3.118149518966675, "margin_dpo/beta_margin_std": 2.880525588989258, "margin_dpo/loss_margin_mean": 31.181493759155273, "margin_dpo/margin_mean": 31.181495666503906, "margin_dpo/margin_std": 27.928592681884766, "step": 299 }, { "epoch": 0.44052863436123346, "grad_norm": 81.78619384765625, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.6105576157569885, "logits/rejected": -0.591549277305603, "logps/chosen": -80.5472183227539, "logps/ref_chosen": -59.073707580566406, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -147.06117248535156, "loss": 0.7096, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20562496781349182, "margin_dpo/beta_margin_grad_std": 0.27093952894210815, "margin_dpo/beta_margin_mean": 2.9621217250823975, "margin_dpo/beta_margin_std": 2.987994909286499, "margin_dpo/loss_margin_mean": 29.6212158203125, "margin_dpo/margin_mean": 29.621217727661133, "margin_dpo/margin_std": 29.81679344177246, "step": 300 }, { "epoch": 0.44052863436123346, "eval_logits/chosen": -0.6160351634025574, "eval_logits/rejected": -0.5900039672851562, "eval_logps/chosen": -100.12930297851562, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -125.98890686035156, "eval_loss": 0.4404529929161072, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.28246673941612244, "eval_margin_dpo/beta_margin_grad_std": 0.2513927221298218, "eval_margin_dpo/beta_margin_mean": 1.8112715482711792, "eval_margin_dpo/beta_margin_std": 2.3746871948242188, "eval_margin_dpo/loss_margin_mean": 18.112716674804688, "eval_margin_dpo/margin_mean": 18.112716674804688, "eval_margin_dpo/margin_std": 23.746871948242188, "eval_runtime": 40.0886, "eval_samples_per_second": 58.346, "eval_steps_per_second": 1.846, "step": 300 }, { "epoch": 0.4419970631424376, "grad_norm": 49.35285949707031, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.6374814510345459, "logits/rejected": -0.6068045496940613, "logps/chosen": -75.65875244140625, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -141.78347778320312, "loss": 0.404, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1557816118001938, "margin_dpo/beta_margin_grad_std": 0.1888331174850464, "margin_dpo/beta_margin_mean": 3.1020545959472656, "margin_dpo/beta_margin_std": 2.5941102504730225, "margin_dpo/loss_margin_mean": 31.020544052124023, "margin_dpo/margin_mean": 31.020544052124023, "margin_dpo/margin_std": 25.892236709594727, "step": 301 }, { "epoch": 0.4434654919236417, "grad_norm": 68.0921401977539, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.5743458271026611, "logits/rejected": -0.5641738176345825, "logps/chosen": -66.59398651123047, "logps/ref_chosen": -51.19799041748047, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -141.77328491210938, "loss": 0.574, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18597975373268127, "margin_dpo/beta_margin_grad_std": 0.23450718820095062, "margin_dpo/beta_margin_mean": 2.9150946140289307, "margin_dpo/beta_margin_std": 2.9601025581359863, "margin_dpo/loss_margin_mean": 29.15094566345215, "margin_dpo/margin_mean": 29.15094757080078, "margin_dpo/margin_std": 29.199195861816406, "step": 302 }, { "epoch": 0.44493392070484583, "grad_norm": 74.45587921142578, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.6296427249908447, "logits/rejected": -0.5928441286087036, "logps/chosen": -83.6898193359375, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -131.94627380371094, "loss": 0.6507, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21019597351551056, "margin_dpo/beta_margin_grad_std": 0.25447341799736023, "margin_dpo/beta_margin_mean": 2.802497386932373, "margin_dpo/beta_margin_std": 2.8981969356536865, "margin_dpo/loss_margin_mean": 28.024972915649414, "margin_dpo/margin_mean": 28.02497100830078, "margin_dpo/margin_std": 28.8425350189209, "step": 303 }, { "epoch": 0.44640234948604995, "grad_norm": 57.734458923339844, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.6465634703636169, "logits/rejected": -0.6173808574676514, "logps/chosen": -86.27723693847656, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -133.15660095214844, "loss": 0.4558, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1500687152147293, "margin_dpo/beta_margin_grad_std": 0.2116183042526245, "margin_dpo/beta_margin_mean": 2.8358030319213867, "margin_dpo/beta_margin_std": 2.0787370204925537, "margin_dpo/loss_margin_mean": 28.358028411865234, "margin_dpo/margin_mean": 28.358028411865234, "margin_dpo/margin_std": 20.432727813720703, "step": 304 }, { "epoch": 0.447870778267254, "grad_norm": 54.050018310546875, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.6285189986228943, "logits/rejected": -0.5891715884208679, "logps/chosen": -84.83607482910156, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -139.55845642089844, "loss": 0.4322, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16884967684745789, "margin_dpo/beta_margin_grad_std": 0.18422438204288483, "margin_dpo/beta_margin_mean": 2.8763022422790527, "margin_dpo/beta_margin_std": 2.6594858169555664, "margin_dpo/loss_margin_mean": 28.76302146911621, "margin_dpo/margin_mean": 28.76302146911621, "margin_dpo/margin_std": 26.524438858032227, "step": 305 }, { "epoch": 0.44933920704845814, "grad_norm": 73.82415008544922, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.6591800451278687, "logits/rejected": -0.608156681060791, "logps/chosen": -85.36546325683594, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -128.71629333496094, "loss": 0.5126, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16796466708183289, "margin_dpo/beta_margin_grad_std": 0.21312610805034637, "margin_dpo/beta_margin_mean": 2.7875523567199707, "margin_dpo/beta_margin_std": 2.3330721855163574, "margin_dpo/loss_margin_mean": 27.87552261352539, "margin_dpo/margin_mean": 27.87552261352539, "margin_dpo/margin_std": 23.294414520263672, "step": 306 }, { "epoch": 0.45080763582966227, "grad_norm": 54.04255294799805, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.632627546787262, "logits/rejected": -0.6056050658226013, "logps/chosen": -86.17469787597656, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -128.53074645996094, "loss": 0.5331, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1910136193037033, "margin_dpo/beta_margin_grad_std": 0.216377392411232, "margin_dpo/beta_margin_mean": 2.554914951324463, "margin_dpo/beta_margin_std": 2.4117367267608643, "margin_dpo/loss_margin_mean": 25.549150466918945, "margin_dpo/margin_mean": 25.549152374267578, "margin_dpo/margin_std": 24.079296112060547, "step": 307 }, { "epoch": 0.4522760646108664, "grad_norm": 44.33732604980469, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.6031591892242432, "logits/rejected": -0.5757944583892822, "logps/chosen": -70.33519744873047, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -123.3551254272461, "loss": 0.4969, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16970184445381165, "margin_dpo/beta_margin_grad_std": 0.20795808732509613, "margin_dpo/beta_margin_mean": 2.8892669677734375, "margin_dpo/beta_margin_std": 2.663458824157715, "margin_dpo/loss_margin_mean": 28.892669677734375, "margin_dpo/margin_mean": 28.892669677734375, "margin_dpo/margin_std": 26.245033264160156, "step": 308 }, { "epoch": 0.45374449339207046, "grad_norm": 57.88508987426758, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.6012994647026062, "logits/rejected": -0.564967691898346, "logps/chosen": -66.71060180664062, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -128.30679321289062, "loss": 0.5073, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17930957674980164, "margin_dpo/beta_margin_grad_std": 0.2210213840007782, "margin_dpo/beta_margin_mean": 2.821453094482422, "margin_dpo/beta_margin_std": 2.4553327560424805, "margin_dpo/loss_margin_mean": 28.21453094482422, "margin_dpo/margin_mean": 28.21453094482422, "margin_dpo/margin_std": 24.337322235107422, "step": 309 }, { "epoch": 0.4552129221732746, "grad_norm": 50.711158752441406, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.6853815913200378, "logits/rejected": -0.6389970183372498, "logps/chosen": -78.57404327392578, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -120.24269104003906, "loss": 0.4436, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1786579042673111, "margin_dpo/beta_margin_grad_std": 0.16861887276172638, "margin_dpo/beta_margin_mean": 2.448012351989746, "margin_dpo/beta_margin_std": 2.0728390216827393, "margin_dpo/loss_margin_mean": 24.48012351989746, "margin_dpo/margin_mean": 24.480121612548828, "margin_dpo/margin_std": 20.56110382080078, "step": 310 }, { "epoch": 0.4566813509544787, "grad_norm": 63.41331100463867, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.6061598062515259, "logits/rejected": -0.56733638048172, "logps/chosen": -73.1260757446289, "logps/ref_chosen": -53.027976989746094, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -131.88427734375, "loss": 0.3422, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12758134305477142, "margin_dpo/beta_margin_grad_std": 0.17252546548843384, "margin_dpo/beta_margin_mean": 3.434797525405884, "margin_dpo/beta_margin_std": 2.5421323776245117, "margin_dpo/loss_margin_mean": 34.34797286987305, "margin_dpo/margin_mean": 34.34797286987305, "margin_dpo/margin_std": 25.21231460571289, "step": 311 }, { "epoch": 0.4581497797356828, "grad_norm": 58.5345458984375, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.59651780128479, "logits/rejected": -0.5514322519302368, "logps/chosen": -65.93643951416016, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280090332031, "logps/rejected": -122.18972778320312, "loss": 0.4828, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16882839798927307, "margin_dpo/beta_margin_grad_std": 0.22298170626163483, "margin_dpo/beta_margin_mean": 2.9852707386016846, "margin_dpo/beta_margin_std": 2.6555700302124023, "margin_dpo/loss_margin_mean": 29.852705001831055, "margin_dpo/margin_mean": 29.852706909179688, "margin_dpo/margin_std": 26.147369384765625, "step": 312 }, { "epoch": 0.45961820851688695, "grad_norm": 54.11558151245117, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.6790950298309326, "logits/rejected": -0.6427109241485596, "logps/chosen": -64.72444152832031, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -126.8267822265625, "loss": 0.6273, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21419396996498108, "margin_dpo/beta_margin_grad_std": 0.23362776637077332, "margin_dpo/beta_margin_mean": 2.5848872661590576, "margin_dpo/beta_margin_std": 2.6925547122955322, "margin_dpo/loss_margin_mean": 25.8488712310791, "margin_dpo/margin_mean": 25.848873138427734, "margin_dpo/margin_std": 26.872703552246094, "step": 313 }, { "epoch": 0.461086637298091, "grad_norm": 64.02825164794922, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.6256019473075867, "logits/rejected": -0.6011070013046265, "logps/chosen": -72.6094970703125, "logps/ref_chosen": -57.23758316040039, "logps/ref_rejected": -97.59652709960938, "logps/rejected": -138.90609741210938, "loss": 0.5787, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20730111002922058, "margin_dpo/beta_margin_grad_std": 0.22327031195163727, "margin_dpo/beta_margin_mean": 2.593766450881958, "margin_dpo/beta_margin_std": 2.570161819458008, "margin_dpo/loss_margin_mean": 25.93766212463379, "margin_dpo/margin_mean": 25.937664031982422, "margin_dpo/margin_std": 25.60101890563965, "step": 314 }, { "epoch": 0.46255506607929514, "grad_norm": 47.55817413330078, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.5983539819717407, "logits/rejected": -0.5785402059555054, "logps/chosen": -64.21907043457031, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -125.40115356445312, "loss": 0.4903, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18685929477214813, "margin_dpo/beta_margin_grad_std": 0.19108892977237701, "margin_dpo/beta_margin_mean": 2.457080602645874, "margin_dpo/beta_margin_std": 2.294046401977539, "margin_dpo/loss_margin_mean": 24.570804595947266, "margin_dpo/margin_mean": 24.570804595947266, "margin_dpo/margin_std": 22.66987419128418, "step": 315 }, { "epoch": 0.46402349486049926, "grad_norm": 51.51395034790039, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.6153182983398438, "logits/rejected": -0.6095402240753174, "logps/chosen": -71.8460693359375, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -148.74497985839844, "loss": 0.4124, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1572447568178177, "margin_dpo/beta_margin_grad_std": 0.18591180443763733, "margin_dpo/beta_margin_mean": 2.9878194332122803, "margin_dpo/beta_margin_std": 2.6384308338165283, "margin_dpo/loss_margin_mean": 29.87819480895996, "margin_dpo/margin_mean": 29.87819480895996, "margin_dpo/margin_std": 26.249610900878906, "step": 316 }, { "epoch": 0.4654919236417034, "grad_norm": 37.093284606933594, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.6742887496948242, "logits/rejected": -0.6377764940261841, "logps/chosen": -69.39185333251953, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -147.84510803222656, "loss": 0.3514, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13395161926746368, "margin_dpo/beta_margin_grad_std": 0.18397875130176544, "margin_dpo/beta_margin_mean": 3.384840965270996, "margin_dpo/beta_margin_std": 2.511617422103882, "margin_dpo/loss_margin_mean": 33.848411560058594, "margin_dpo/margin_mean": 33.848411560058594, "margin_dpo/margin_std": 24.782241821289062, "step": 317 }, { "epoch": 0.4669603524229075, "grad_norm": 46.82166290283203, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.6575570106506348, "logits/rejected": -0.6119377613067627, "logps/chosen": -77.43252563476562, "logps/ref_chosen": -64.64570617675781, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -126.07575988769531, "loss": 0.4239, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15713295340538025, "margin_dpo/beta_margin_grad_std": 0.19899439811706543, "margin_dpo/beta_margin_mean": 3.0524682998657227, "margin_dpo/beta_margin_std": 2.5276248455047607, "margin_dpo/loss_margin_mean": 30.524681091308594, "margin_dpo/margin_mean": 30.524682998657227, "margin_dpo/margin_std": 25.1044921875, "step": 318 }, { "epoch": 0.4684287812041116, "grad_norm": 42.284812927246094, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.6065380573272705, "logits/rejected": -0.5934597253799438, "logps/chosen": -62.33796691894531, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -156.28738403320312, "loss": 0.3656, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14448316395282745, "margin_dpo/beta_margin_grad_std": 0.173114612698555, "margin_dpo/beta_margin_mean": 2.9426653385162354, "margin_dpo/beta_margin_std": 2.2474348545074463, "margin_dpo/loss_margin_mean": 29.426651000976562, "margin_dpo/margin_mean": 29.426651000976562, "margin_dpo/margin_std": 22.307289123535156, "step": 319 }, { "epoch": 0.4698972099853157, "grad_norm": 50.809173583984375, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.6396864056587219, "logits/rejected": -0.6267807483673096, "logps/chosen": -74.15386962890625, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66716766357422, "logps/rejected": -139.19309997558594, "loss": 0.4867, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17603495717048645, "margin_dpo/beta_margin_grad_std": 0.19671519100666046, "margin_dpo/beta_margin_mean": 2.687695026397705, "margin_dpo/beta_margin_std": 2.3658065795898438, "margin_dpo/loss_margin_mean": 26.876949310302734, "margin_dpo/margin_mean": 26.876949310302734, "margin_dpo/margin_std": 23.568016052246094, "step": 320 }, { "epoch": 0.4713656387665198, "grad_norm": 71.52301025390625, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.6303710341453552, "logits/rejected": -0.5920969247817993, "logps/chosen": -80.57645416259766, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -136.7852020263672, "loss": 0.706, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.224751278758049, "margin_dpo/beta_margin_grad_std": 0.2634871304035187, "margin_dpo/beta_margin_mean": 2.6116397380828857, "margin_dpo/beta_margin_std": 2.8075194358825684, "margin_dpo/loss_margin_mean": 26.116395950317383, "margin_dpo/margin_mean": 26.116397857666016, "margin_dpo/margin_std": 27.479251861572266, "step": 321 }, { "epoch": 0.47283406754772395, "grad_norm": 55.85123062133789, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.6203020811080933, "logits/rejected": -0.5984194874763489, "logps/chosen": -72.76739501953125, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -142.85971069335938, "loss": 0.4316, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16712939739227295, "margin_dpo/beta_margin_grad_std": 0.18851304054260254, "margin_dpo/beta_margin_mean": 3.1632421016693115, "margin_dpo/beta_margin_std": 2.7988502979278564, "margin_dpo/loss_margin_mean": 31.63241958618164, "margin_dpo/margin_mean": 31.63241958618164, "margin_dpo/margin_std": 27.733840942382812, "step": 322 }, { "epoch": 0.47430249632892807, "grad_norm": 46.408023834228516, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.6271795034408569, "logits/rejected": -0.60142982006073, "logps/chosen": -66.6007308959961, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -121.7147445678711, "loss": 0.4239, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15695661306381226, "margin_dpo/beta_margin_grad_std": 0.1998593658208847, "margin_dpo/beta_margin_mean": 3.128605842590332, "margin_dpo/beta_margin_std": 2.577242851257324, "margin_dpo/loss_margin_mean": 31.286056518554688, "margin_dpo/margin_mean": 31.286056518554688, "margin_dpo/margin_std": 25.391637802124023, "step": 323 }, { "epoch": 0.47577092511013214, "grad_norm": 59.7817497253418, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.6021302938461304, "logits/rejected": -0.5500773191452026, "logps/chosen": -82.58967590332031, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750091552734, "logps/rejected": -133.61947631835938, "loss": 0.4627, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16844435036182404, "margin_dpo/beta_margin_grad_std": 0.207102969288826, "margin_dpo/beta_margin_mean": 2.9687466621398926, "margin_dpo/beta_margin_std": 2.6024606227874756, "margin_dpo/loss_margin_mean": 29.68746566772461, "margin_dpo/margin_mean": 29.687463760375977, "margin_dpo/margin_std": 25.10573959350586, "step": 324 }, { "epoch": 0.47723935389133626, "grad_norm": 43.97902297973633, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.6481966972351074, "logits/rejected": -0.5832774639129639, "logps/chosen": -70.51419067382812, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.0859375, "logps/rejected": -113.08633422851562, "loss": 0.4277, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1607721447944641, "margin_dpo/beta_margin_grad_std": 0.20015878975391388, "margin_dpo/beta_margin_mean": 2.870180368423462, "margin_dpo/beta_margin_std": 2.1971116065979004, "margin_dpo/loss_margin_mean": 28.70180320739746, "margin_dpo/margin_mean": 28.70180320739746, "margin_dpo/margin_std": 21.692012786865234, "step": 325 }, { "epoch": 0.4787077826725404, "grad_norm": 58.040103912353516, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.6689083576202393, "logits/rejected": -0.6224409937858582, "logps/chosen": -90.72734832763672, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.84678649902344, "logps/rejected": -123.62333679199219, "loss": 0.5107, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19229000806808472, "margin_dpo/beta_margin_grad_std": 0.202370285987854, "margin_dpo/beta_margin_mean": 2.577415943145752, "margin_dpo/beta_margin_std": 2.4158146381378174, "margin_dpo/loss_margin_mean": 25.774160385131836, "margin_dpo/margin_mean": 25.774158477783203, "margin_dpo/margin_std": 24.106483459472656, "step": 326 }, { "epoch": 0.4801762114537445, "grad_norm": 48.54873275756836, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.6299796104431152, "logits/rejected": -0.5861495733261108, "logps/chosen": -84.29374694824219, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -164.3966064453125, "loss": 0.2894, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1097550168633461, "margin_dpo/beta_margin_grad_std": 0.17322029173374176, "margin_dpo/beta_margin_mean": 3.730348825454712, "margin_dpo/beta_margin_std": 2.5048036575317383, "margin_dpo/loss_margin_mean": 37.30348587036133, "margin_dpo/margin_mean": 37.303489685058594, "margin_dpo/margin_std": 24.60919952392578, "step": 327 }, { "epoch": 0.48164464023494863, "grad_norm": 52.86729431152344, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.6159682869911194, "logits/rejected": -0.5945558547973633, "logps/chosen": -78.4199447631836, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -137.5189971923828, "loss": 0.4753, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1785195767879486, "margin_dpo/beta_margin_grad_std": 0.1958281397819519, "margin_dpo/beta_margin_mean": 2.7931253910064697, "margin_dpo/beta_margin_std": 2.519235134124756, "margin_dpo/loss_margin_mean": 27.931251525878906, "margin_dpo/margin_mean": 27.931251525878906, "margin_dpo/margin_std": 25.17681312561035, "step": 328 }, { "epoch": 0.4831130690161527, "grad_norm": 60.656837463378906, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.6298278570175171, "logits/rejected": -0.5832244753837585, "logps/chosen": -82.09062957763672, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -154.71499633789062, "loss": 0.3743, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12058813869953156, "margin_dpo/beta_margin_grad_std": 0.20209833979606628, "margin_dpo/beta_margin_mean": 3.717285633087158, "margin_dpo/beta_margin_std": 2.69805908203125, "margin_dpo/loss_margin_mean": 37.172855377197266, "margin_dpo/margin_mean": 37.172855377197266, "margin_dpo/margin_std": 26.679462432861328, "step": 329 }, { "epoch": 0.4845814977973568, "grad_norm": 48.152469635009766, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.6236467361450195, "logits/rejected": -0.6115535497665405, "logps/chosen": -64.35501098632812, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -135.53587341308594, "loss": 0.4026, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14161227643489838, "margin_dpo/beta_margin_grad_std": 0.18910308182239532, "margin_dpo/beta_margin_mean": 3.1302359104156494, "margin_dpo/beta_margin_std": 2.4480643272399902, "margin_dpo/loss_margin_mean": 31.302356719970703, "margin_dpo/margin_mean": 31.302356719970703, "margin_dpo/margin_std": 24.32876968383789, "step": 330 }, { "epoch": 0.48604992657856094, "grad_norm": 36.348045349121094, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.6435179710388184, "logits/rejected": -0.607470691204071, "logps/chosen": -66.53829193115234, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -139.56106567382812, "loss": 0.2539, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1042742058634758, "margin_dpo/beta_margin_grad_std": 0.14150582253932953, "margin_dpo/beta_margin_mean": 3.453158378601074, "margin_dpo/beta_margin_std": 2.3450417518615723, "margin_dpo/loss_margin_mean": 34.53158187866211, "margin_dpo/margin_mean": 34.53158187866211, "margin_dpo/margin_std": 23.280845642089844, "step": 331 }, { "epoch": 0.48751835535976507, "grad_norm": 49.24102783203125, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.6652279496192932, "logits/rejected": -0.6368132829666138, "logps/chosen": -67.384521484375, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -120.33047485351562, "loss": 0.5416, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18533176183700562, "margin_dpo/beta_margin_grad_std": 0.21836212277412415, "margin_dpo/beta_margin_mean": 2.564108371734619, "margin_dpo/beta_margin_std": 2.247309684753418, "margin_dpo/loss_margin_mean": 25.641084671020508, "margin_dpo/margin_mean": 25.641082763671875, "margin_dpo/margin_std": 22.292034149169922, "step": 332 }, { "epoch": 0.4889867841409692, "grad_norm": 65.32575225830078, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.6248708963394165, "logits/rejected": -0.6129882335662842, "logps/chosen": -83.42967224121094, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -147.51449584960938, "loss": 0.5294, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17729292809963226, "margin_dpo/beta_margin_grad_std": 0.23055444657802582, "margin_dpo/beta_margin_mean": 3.2796459197998047, "margin_dpo/beta_margin_std": 2.900787115097046, "margin_dpo/loss_margin_mean": 32.79645919799805, "margin_dpo/margin_mean": 32.79645919799805, "margin_dpo/margin_std": 28.70106315612793, "step": 333 }, { "epoch": 0.49045521292217326, "grad_norm": 47.027408599853516, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.5920358896255493, "logits/rejected": -0.5716053247451782, "logps/chosen": -87.28846740722656, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -146.97540283203125, "loss": 0.3501, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12930122017860413, "margin_dpo/beta_margin_grad_std": 0.18267269432544708, "margin_dpo/beta_margin_mean": 3.1349751949310303, "margin_dpo/beta_margin_std": 2.204379081726074, "margin_dpo/loss_margin_mean": 31.34975242614746, "margin_dpo/margin_mean": 31.34975242614746, "margin_dpo/margin_std": 22.04346466064453, "step": 334 }, { "epoch": 0.4919236417033774, "grad_norm": 64.20230865478516, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.6635192632675171, "logits/rejected": -0.6386910676956177, "logps/chosen": -80.52153015136719, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.6176986694336, "logps/rejected": -133.28208923339844, "loss": 0.5273, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18919570744037628, "margin_dpo/beta_margin_grad_std": 0.2199753075838089, "margin_dpo/beta_margin_mean": 2.768533945083618, "margin_dpo/beta_margin_std": 2.6116201877593994, "margin_dpo/loss_margin_mean": 27.685338973999023, "margin_dpo/margin_mean": 27.685338973999023, "margin_dpo/margin_std": 25.792123794555664, "step": 335 }, { "epoch": 0.4933920704845815, "grad_norm": 77.19268035888672, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.597804069519043, "logits/rejected": -0.5845491290092468, "logps/chosen": -73.87245178222656, "logps/ref_chosen": -54.531150817871094, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -157.87643432617188, "loss": 0.4829, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13811615109443665, "margin_dpo/beta_margin_grad_std": 0.2196315973997116, "margin_dpo/beta_margin_mean": 3.4130895137786865, "margin_dpo/beta_margin_std": 2.9321839809417725, "margin_dpo/loss_margin_mean": 34.13089370727539, "margin_dpo/margin_mean": 34.13089370727539, "margin_dpo/margin_std": 28.592952728271484, "step": 336 }, { "epoch": 0.4948604992657856, "grad_norm": 51.322021484375, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.6781501173973083, "logits/rejected": -0.655287504196167, "logps/chosen": -82.64189147949219, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -150.01385498046875, "loss": 0.3766, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14225561916828156, "margin_dpo/beta_margin_grad_std": 0.1846165657043457, "margin_dpo/beta_margin_mean": 3.077363967895508, "margin_dpo/beta_margin_std": 2.2813127040863037, "margin_dpo/loss_margin_mean": 30.773639678955078, "margin_dpo/margin_mean": 30.773639678955078, "margin_dpo/margin_std": 22.479888916015625, "step": 337 }, { "epoch": 0.49632892804698975, "grad_norm": 54.3819580078125, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.6051090955734253, "logits/rejected": -0.5787324905395508, "logps/chosen": -78.59669494628906, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -139.98745727539062, "loss": 0.3929, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15525604784488678, "margin_dpo/beta_margin_grad_std": 0.17774680256843567, "margin_dpo/beta_margin_mean": 3.0744948387145996, "margin_dpo/beta_margin_std": 2.481618642807007, "margin_dpo/loss_margin_mean": 30.74494743347168, "margin_dpo/margin_mean": 30.744945526123047, "margin_dpo/margin_std": 24.781238555908203, "step": 338 }, { "epoch": 0.4977973568281938, "grad_norm": 43.893314361572266, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.6943797469139099, "logits/rejected": -0.6593571901321411, "logps/chosen": -77.50848388671875, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -155.89947509765625, "loss": 0.3567, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13147962093353271, "margin_dpo/beta_margin_grad_std": 0.19132699072360992, "margin_dpo/beta_margin_mean": 3.4475395679473877, "margin_dpo/beta_margin_std": 2.6293349266052246, "margin_dpo/loss_margin_mean": 34.47539520263672, "margin_dpo/margin_mean": 34.47539520263672, "margin_dpo/margin_std": 25.884403228759766, "step": 339 }, { "epoch": 0.49926578560939794, "grad_norm": 47.391937255859375, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.5665243864059448, "logits/rejected": -0.5303752422332764, "logps/chosen": -73.12321472167969, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723876953125, "logps/rejected": -127.07223510742188, "loss": 0.3914, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14956659078598022, "margin_dpo/beta_margin_grad_std": 0.1902948021888733, "margin_dpo/beta_margin_mean": 3.3672003746032715, "margin_dpo/beta_margin_std": 3.0476183891296387, "margin_dpo/loss_margin_mean": 33.67200469970703, "margin_dpo/margin_mean": 33.67200469970703, "margin_dpo/margin_std": 30.17064666748047, "step": 340 }, { "epoch": 0.5007342143906021, "grad_norm": 49.45682907104492, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.5927428007125854, "logits/rejected": -0.5772538185119629, "logps/chosen": -69.79241943359375, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -160.8358154296875, "loss": 0.3099, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12000828981399536, "margin_dpo/beta_margin_grad_std": 0.17166633903980255, "margin_dpo/beta_margin_mean": 3.713330030441284, "margin_dpo/beta_margin_std": 2.623293876647949, "margin_dpo/loss_margin_mean": 37.13330078125, "margin_dpo/margin_mean": 37.13330078125, "margin_dpo/margin_std": 26.138290405273438, "step": 341 }, { "epoch": 0.5022026431718062, "grad_norm": 71.23127746582031, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.5858966708183289, "logits/rejected": -0.5406848192214966, "logps/chosen": -90.55252838134766, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -143.6535186767578, "loss": 0.4539, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16609962284564972, "margin_dpo/beta_margin_grad_std": 0.21481139957904816, "margin_dpo/beta_margin_mean": 3.2508039474487305, "margin_dpo/beta_margin_std": 2.9156150817871094, "margin_dpo/loss_margin_mean": 32.50803756713867, "margin_dpo/margin_mean": 32.50803756713867, "margin_dpo/margin_std": 29.08768081665039, "step": 342 }, { "epoch": 0.5036710719530103, "grad_norm": 58.09341812133789, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.6343034505844116, "logits/rejected": -0.6098527908325195, "logps/chosen": -71.43299865722656, "logps/ref_chosen": -54.40562438964844, "logps/ref_rejected": -111.04141998291016, "logps/rejected": -162.60467529296875, "loss": 0.3828, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13273513317108154, "margin_dpo/beta_margin_grad_std": 0.20148909091949463, "margin_dpo/beta_margin_mean": 3.4535882472991943, "margin_dpo/beta_margin_std": 2.496697425842285, "margin_dpo/loss_margin_mean": 34.53588104248047, "margin_dpo/margin_mean": 34.53588104248047, "margin_dpo/margin_std": 24.785640716552734, "step": 343 }, { "epoch": 0.5051395007342144, "grad_norm": 59.848392486572266, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.605980396270752, "logits/rejected": -0.5816007256507874, "logps/chosen": -74.10179901123047, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -139.1201629638672, "loss": 0.5713, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1968245953321457, "margin_dpo/beta_margin_grad_std": 0.23204652965068817, "margin_dpo/beta_margin_mean": 2.8359665870666504, "margin_dpo/beta_margin_std": 2.9364418983459473, "margin_dpo/loss_margin_mean": 28.359664916992188, "margin_dpo/margin_mean": 28.359664916992188, "margin_dpo/margin_std": 28.389617919921875, "step": 344 }, { "epoch": 0.5066079295154186, "grad_norm": 52.64528274536133, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.6204372048377991, "logits/rejected": -0.5874470472335815, "logps/chosen": -81.14032745361328, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49040985107422, "logps/rejected": -153.2753448486328, "loss": 0.3808, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12536533176898956, "margin_dpo/beta_margin_grad_std": 0.19518449902534485, "margin_dpo/beta_margin_mean": 3.433030605316162, "margin_dpo/beta_margin_std": 2.6923766136169434, "margin_dpo/loss_margin_mean": 34.33030700683594, "margin_dpo/margin_mean": 34.33030319213867, "margin_dpo/margin_std": 26.533443450927734, "step": 345 }, { "epoch": 0.5080763582966226, "grad_norm": 66.96176147460938, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.6459622383117676, "logits/rejected": -0.6260564923286438, "logps/chosen": -77.54434204101562, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -130.17681884765625, "loss": 0.501, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1676793098449707, "margin_dpo/beta_margin_grad_std": 0.2248024344444275, "margin_dpo/beta_margin_mean": 3.0473415851593018, "margin_dpo/beta_margin_std": 2.67748761177063, "margin_dpo/loss_margin_mean": 30.47341537475586, "margin_dpo/margin_mean": 30.47341537475586, "margin_dpo/margin_std": 26.7009220123291, "step": 346 }, { "epoch": 0.5095447870778267, "grad_norm": 59.65949249267578, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.6436910033226013, "logits/rejected": -0.6030080318450928, "logps/chosen": -80.81089782714844, "logps/ref_chosen": -57.56624221801758, "logps/ref_rejected": -92.35508728027344, "logps/rejected": -146.6624755859375, "loss": 0.5062, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15754956007003784, "margin_dpo/beta_margin_grad_std": 0.22358344495296478, "margin_dpo/beta_margin_mean": 3.1062729358673096, "margin_dpo/beta_margin_std": 2.751783847808838, "margin_dpo/loss_margin_mean": 31.062728881835938, "margin_dpo/margin_mean": 31.062728881835938, "margin_dpo/margin_std": 27.136539459228516, "step": 347 }, { "epoch": 0.5110132158590308, "grad_norm": 66.42916870117188, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.6487417221069336, "logits/rejected": -0.6265465021133423, "logps/chosen": -76.80084991455078, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13837432861328, "logps/rejected": -139.78619384765625, "loss": 0.5393, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18773815035820007, "margin_dpo/beta_margin_grad_std": 0.23163601756095886, "margin_dpo/beta_margin_mean": 3.016467809677124, "margin_dpo/beta_margin_std": 2.694882392883301, "margin_dpo/loss_margin_mean": 30.1646785736084, "margin_dpo/margin_mean": 30.164676666259766, "margin_dpo/margin_std": 26.57091522216797, "step": 348 }, { "epoch": 0.5124816446402349, "grad_norm": 74.79912567138672, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.6091076135635376, "logits/rejected": -0.5961982011795044, "logps/chosen": -76.59553527832031, "logps/ref_chosen": -58.0255126953125, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -141.9857177734375, "loss": 0.7187, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22252453863620758, "margin_dpo/beta_margin_grad_std": 0.259151816368103, "margin_dpo/beta_margin_mean": 2.591054916381836, "margin_dpo/beta_margin_std": 2.9782986640930176, "margin_dpo/loss_margin_mean": 25.91054916381836, "margin_dpo/margin_mean": 25.91054916381836, "margin_dpo/margin_std": 29.515146255493164, "step": 349 }, { "epoch": 0.5139500734214391, "grad_norm": 60.944671630859375, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.6253387928009033, "logits/rejected": -0.5987178087234497, "logps/chosen": -83.54938507080078, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -136.64892578125, "loss": 0.6331, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20255906879901886, "margin_dpo/beta_margin_grad_std": 0.24518385529518127, "margin_dpo/beta_margin_mean": 2.7558395862579346, "margin_dpo/beta_margin_std": 2.81416392326355, "margin_dpo/loss_margin_mean": 27.558395385742188, "margin_dpo/margin_mean": 27.558395385742188, "margin_dpo/margin_std": 27.75186538696289, "step": 350 }, { "epoch": 0.5154185022026432, "grad_norm": 44.936790466308594, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.5853023529052734, "logits/rejected": -0.5609848499298096, "logps/chosen": -77.83966827392578, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -153.1103515625, "loss": 0.3463, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.125121608376503, "margin_dpo/beta_margin_grad_std": 0.19120459258556366, "margin_dpo/beta_margin_mean": 3.4377381801605225, "margin_dpo/beta_margin_std": 2.4538416862487793, "margin_dpo/loss_margin_mean": 34.37738037109375, "margin_dpo/margin_mean": 34.37738037109375, "margin_dpo/margin_std": 24.536075592041016, "step": 351 }, { "epoch": 0.5168869309838473, "grad_norm": 71.77349853515625, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.6167569160461426, "logits/rejected": -0.5798036456108093, "logps/chosen": -88.59427642822266, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -121.15208435058594, "loss": 0.5829, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20149900019168854, "margin_dpo/beta_margin_grad_std": 0.23062226176261902, "margin_dpo/beta_margin_mean": 2.685929775238037, "margin_dpo/beta_margin_std": 2.7565815448760986, "margin_dpo/loss_margin_mean": 26.859296798706055, "margin_dpo/margin_mean": 26.859298706054688, "margin_dpo/margin_std": 27.36536407470703, "step": 352 }, { "epoch": 0.5183553597650514, "grad_norm": 69.42572021484375, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.6497888565063477, "logits/rejected": -0.623069167137146, "logps/chosen": -78.64827728271484, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33552551269531, "logps/rejected": -127.51103210449219, "loss": 0.7022, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21879735589027405, "margin_dpo/beta_margin_grad_std": 0.2619403302669525, "margin_dpo/beta_margin_mean": 2.6922569274902344, "margin_dpo/beta_margin_std": 2.7726051807403564, "margin_dpo/loss_margin_mean": 26.92256736755371, "margin_dpo/margin_mean": 26.92256736755371, "margin_dpo/margin_std": 27.62420082092285, "step": 353 }, { "epoch": 0.5198237885462555, "grad_norm": 39.96236801147461, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.6289291381835938, "logits/rejected": -0.6045188903808594, "logps/chosen": -76.57125854492188, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -137.36941528320312, "loss": 0.2933, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1184755489230156, "margin_dpo/beta_margin_grad_std": 0.15497253835201263, "margin_dpo/beta_margin_mean": 3.184364080429077, "margin_dpo/beta_margin_std": 2.173491954803467, "margin_dpo/loss_margin_mean": 31.843639373779297, "margin_dpo/margin_mean": 31.843639373779297, "margin_dpo/margin_std": 21.62934684753418, "step": 354 }, { "epoch": 0.5212922173274597, "grad_norm": 38.83880615234375, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.6833703517913818, "logits/rejected": -0.6498676538467407, "logps/chosen": -70.72756958007812, "logps/ref_chosen": -54.128501892089844, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -132.54278564453125, "loss": 0.3525, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14085842669010162, "margin_dpo/beta_margin_grad_std": 0.16604043543338776, "margin_dpo/beta_margin_mean": 3.353764295578003, "margin_dpo/beta_margin_std": 2.9216277599334717, "margin_dpo/loss_margin_mean": 33.53764343261719, "margin_dpo/margin_mean": 33.53764343261719, "margin_dpo/margin_std": 28.546594619750977, "step": 355 }, { "epoch": 0.5227606461086637, "grad_norm": 94.49285888671875, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.6543818712234497, "logits/rejected": -0.6269608736038208, "logps/chosen": -86.63969421386719, "logps/ref_chosen": -64.67381286621094, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -120.64968872070312, "loss": 0.8, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2603550851345062, "margin_dpo/beta_margin_grad_std": 0.25936320424079895, "margin_dpo/beta_margin_mean": 2.278454065322876, "margin_dpo/beta_margin_std": 2.8312277793884277, "margin_dpo/loss_margin_mean": 22.7845401763916, "margin_dpo/margin_mean": 22.78453826904297, "margin_dpo/margin_std": 27.273006439208984, "step": 356 }, { "epoch": 0.5242290748898678, "grad_norm": 47.93386459350586, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.621991753578186, "logits/rejected": -0.5901703834533691, "logps/chosen": -70.54598999023438, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -136.3178253173828, "loss": 0.3853, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13819599151611328, "margin_dpo/beta_margin_grad_std": 0.1930486112833023, "margin_dpo/beta_margin_mean": 3.165647268295288, "margin_dpo/beta_margin_std": 2.430621862411499, "margin_dpo/loss_margin_mean": 31.65647315979004, "margin_dpo/margin_mean": 31.65647315979004, "margin_dpo/margin_std": 23.81618309020996, "step": 357 }, { "epoch": 0.5256975036710719, "grad_norm": 65.47496795654297, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.6221922636032104, "logits/rejected": -0.5780969858169556, "logps/chosen": -79.48448944091797, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -133.81741333007812, "loss": 0.4719, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17817279696464539, "margin_dpo/beta_margin_grad_std": 0.19289124011993408, "margin_dpo/beta_margin_mean": 2.916506052017212, "margin_dpo/beta_margin_std": 2.7734534740448, "margin_dpo/loss_margin_mean": 29.165058135986328, "margin_dpo/margin_mean": 29.165058135986328, "margin_dpo/margin_std": 27.52760887145996, "step": 358 }, { "epoch": 0.527165932452276, "grad_norm": 65.17465209960938, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.5604794025421143, "logits/rejected": -0.5229655504226685, "logps/chosen": -78.45906829833984, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -136.7306365966797, "loss": 0.4793, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15810422599315643, "margin_dpo/beta_margin_grad_std": 0.22020529210567474, "margin_dpo/beta_margin_mean": 3.2468044757843018, "margin_dpo/beta_margin_std": 2.889819860458374, "margin_dpo/loss_margin_mean": 32.46804428100586, "margin_dpo/margin_mean": 32.46804428100586, "margin_dpo/margin_std": 27.820709228515625, "step": 359 }, { "epoch": 0.5286343612334802, "grad_norm": 45.14263153076172, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.5990117788314819, "logits/rejected": -0.5746924877166748, "logps/chosen": -70.55091857910156, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892822265625, "logps/rejected": -118.64250183105469, "loss": 0.4051, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14690235257148743, "margin_dpo/beta_margin_grad_std": 0.18873253464698792, "margin_dpo/beta_margin_mean": 2.9713053703308105, "margin_dpo/beta_margin_std": 2.3531863689422607, "margin_dpo/loss_margin_mean": 29.71305274963379, "margin_dpo/margin_mean": 29.71305274963379, "margin_dpo/margin_std": 23.29418182373047, "step": 360 }, { "epoch": 0.5301027900146843, "grad_norm": 59.539451599121094, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.5896218419075012, "logits/rejected": -0.5681077837944031, "logps/chosen": -73.69671630859375, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.76925659179688, "logps/rejected": -138.9726104736328, "loss": 0.489, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17487533390522003, "margin_dpo/beta_margin_grad_std": 0.20955052971839905, "margin_dpo/beta_margin_mean": 2.837615966796875, "margin_dpo/beta_margin_std": 2.5256011486053467, "margin_dpo/loss_margin_mean": 28.37615966796875, "margin_dpo/margin_mean": 28.37615966796875, "margin_dpo/margin_std": 24.988800048828125, "step": 361 }, { "epoch": 0.5315712187958884, "grad_norm": 54.718441009521484, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.6207703351974487, "logits/rejected": -0.6055707931518555, "logps/chosen": -78.78889465332031, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -159.64309692382812, "loss": 0.4171, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14796419441699982, "margin_dpo/beta_margin_grad_std": 0.20472835004329681, "margin_dpo/beta_margin_mean": 3.3911311626434326, "margin_dpo/beta_margin_std": 2.7831871509552, "margin_dpo/loss_margin_mean": 33.91130828857422, "margin_dpo/margin_mean": 33.91130828857422, "margin_dpo/margin_std": 27.807514190673828, "step": 362 }, { "epoch": 0.5330396475770925, "grad_norm": 203.7032012939453, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.6275466084480286, "logits/rejected": -0.5816659331321716, "logps/chosen": -62.10087585449219, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -131.32981872558594, "loss": 0.4034, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13567416369915009, "margin_dpo/beta_margin_grad_std": 0.2034136801958084, "margin_dpo/beta_margin_mean": 3.909236192703247, "margin_dpo/beta_margin_std": 3.2993836402893066, "margin_dpo/loss_margin_mean": 39.09236145019531, "margin_dpo/margin_mean": 39.09236145019531, "margin_dpo/margin_std": 32.874427795410156, "step": 363 }, { "epoch": 0.5345080763582967, "grad_norm": 69.3237533569336, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.6238687038421631, "logits/rejected": -0.6193605065345764, "logps/chosen": -67.90678405761719, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -138.55311584472656, "loss": 0.517, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1832251399755478, "margin_dpo/beta_margin_grad_std": 0.21592886745929718, "margin_dpo/beta_margin_mean": 2.8145196437835693, "margin_dpo/beta_margin_std": 2.687561273574829, "margin_dpo/loss_margin_mean": 28.14519691467285, "margin_dpo/margin_mean": 28.14519691467285, "margin_dpo/margin_std": 26.325363159179688, "step": 364 }, { "epoch": 0.5359765051395007, "grad_norm": 53.85197448730469, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.5883674621582031, "logits/rejected": -0.5725036859512329, "logps/chosen": -69.85151672363281, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -138.56735229492188, "loss": 0.4649, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1821574568748474, "margin_dpo/beta_margin_grad_std": 0.1840406358242035, "margin_dpo/beta_margin_mean": 2.750690460205078, "margin_dpo/beta_margin_std": 2.6088244915008545, "margin_dpo/loss_margin_mean": 27.50690269470215, "margin_dpo/margin_mean": 27.50690269470215, "margin_dpo/margin_std": 25.32098960876465, "step": 365 }, { "epoch": 0.5374449339207048, "grad_norm": 37.25920867919922, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.6385898590087891, "logits/rejected": -0.6289624571800232, "logps/chosen": -61.94309997558594, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -129.80517578125, "loss": 0.3876, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15300396084785461, "margin_dpo/beta_margin_grad_std": 0.17375686764717102, "margin_dpo/beta_margin_mean": 2.9272243976593018, "margin_dpo/beta_margin_std": 2.34128475189209, "margin_dpo/loss_margin_mean": 29.27224349975586, "margin_dpo/margin_mean": 29.27224349975586, "margin_dpo/margin_std": 23.367576599121094, "step": 366 }, { "epoch": 0.5389133627019089, "grad_norm": 57.8803825378418, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.6073235273361206, "logits/rejected": -0.5901012420654297, "logps/chosen": -77.51007080078125, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -121.35176086425781, "loss": 0.512, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1787261813879013, "margin_dpo/beta_margin_grad_std": 0.21498483419418335, "margin_dpo/beta_margin_mean": 2.5495781898498535, "margin_dpo/beta_margin_std": 2.2787163257598877, "margin_dpo/loss_margin_mean": 25.49578094482422, "margin_dpo/margin_mean": 25.49578094482422, "margin_dpo/margin_std": 22.63507843017578, "step": 367 }, { "epoch": 0.540381791483113, "grad_norm": 42.138545989990234, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.6203917264938354, "logits/rejected": -0.6051937937736511, "logps/chosen": -61.3531494140625, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -146.37754821777344, "loss": 0.3505, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13690005242824554, "margin_dpo/beta_margin_grad_std": 0.1739962249994278, "margin_dpo/beta_margin_mean": 3.092164993286133, "margin_dpo/beta_margin_std": 2.4503121376037598, "margin_dpo/loss_margin_mean": 30.921649932861328, "margin_dpo/margin_mean": 30.921649932861328, "margin_dpo/margin_std": 23.325572967529297, "step": 368 }, { "epoch": 0.5418502202643172, "grad_norm": 68.98252868652344, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.6216270923614502, "logits/rejected": -0.5819696187973022, "logps/chosen": -89.982421875, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -135.1055450439453, "loss": 0.5441, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19111789762973785, "margin_dpo/beta_margin_grad_std": 0.2203211784362793, "margin_dpo/beta_margin_mean": 2.520881175994873, "margin_dpo/beta_margin_std": 2.2390410900115967, "margin_dpo/loss_margin_mean": 25.208810806274414, "margin_dpo/margin_mean": 25.208810806274414, "margin_dpo/margin_std": 22.379486083984375, "step": 369 }, { "epoch": 0.5433186490455213, "grad_norm": 57.02136993408203, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.6215974688529968, "logits/rejected": -0.5932759046554565, "logps/chosen": -90.78213500976562, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -141.36424255371094, "loss": 0.5193, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1943112313747406, "margin_dpo/beta_margin_grad_std": 0.2109173834323883, "margin_dpo/beta_margin_mean": 2.4929823875427246, "margin_dpo/beta_margin_std": 2.2076525688171387, "margin_dpo/loss_margin_mean": 24.929824829101562, "margin_dpo/margin_mean": 24.929824829101562, "margin_dpo/margin_std": 21.949810028076172, "step": 370 }, { "epoch": 0.5447870778267254, "grad_norm": 66.95852661132812, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.6244653463363647, "logits/rejected": -0.5987306833267212, "logps/chosen": -89.35826873779297, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -146.2442626953125, "loss": 0.6181, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1994006186723709, "margin_dpo/beta_margin_grad_std": 0.2342255413532257, "margin_dpo/beta_margin_mean": 2.712832450866699, "margin_dpo/beta_margin_std": 2.860605001449585, "margin_dpo/loss_margin_mean": 27.12832260131836, "margin_dpo/margin_mean": 27.12832260131836, "margin_dpo/margin_std": 28.538127899169922, "step": 371 }, { "epoch": 0.5462555066079295, "grad_norm": 54.26102066040039, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.631676971912384, "logits/rejected": -0.5819823741912842, "logps/chosen": -81.3255615234375, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53949737548828, "logps/rejected": -129.32257080078125, "loss": 0.5102, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17238368093967438, "margin_dpo/beta_margin_grad_std": 0.22069989144802094, "margin_dpo/beta_margin_mean": 3.1275503635406494, "margin_dpo/beta_margin_std": 2.999239444732666, "margin_dpo/loss_margin_mean": 31.27550506591797, "margin_dpo/margin_mean": 31.27550506591797, "margin_dpo/margin_std": 29.97246551513672, "step": 372 }, { "epoch": 0.5477239353891337, "grad_norm": 65.14423370361328, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.6474554538726807, "logits/rejected": -0.6035970449447632, "logps/chosen": -85.37360382080078, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -140.09530639648438, "loss": 0.4153, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14717315137386322, "margin_dpo/beta_margin_grad_std": 0.20184947550296783, "margin_dpo/beta_margin_mean": 3.2979226112365723, "margin_dpo/beta_margin_std": 2.632497549057007, "margin_dpo/loss_margin_mean": 32.979225158691406, "margin_dpo/margin_mean": 32.979225158691406, "margin_dpo/margin_std": 26.056396484375, "step": 373 }, { "epoch": 0.5491923641703378, "grad_norm": 46.910888671875, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.6809793710708618, "logits/rejected": -0.6489218473434448, "logps/chosen": -88.79120635986328, "logps/ref_chosen": -70.65017700195312, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -141.58169555664062, "loss": 0.4273, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1627557873725891, "margin_dpo/beta_margin_grad_std": 0.18945109844207764, "margin_dpo/beta_margin_mean": 2.9800491333007812, "margin_dpo/beta_margin_std": 2.557394027709961, "margin_dpo/loss_margin_mean": 29.800491333007812, "margin_dpo/margin_mean": 29.800491333007812, "margin_dpo/margin_std": 25.399768829345703, "step": 374 }, { "epoch": 0.5506607929515418, "grad_norm": 54.142765045166016, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.6128416061401367, "logits/rejected": -0.5992000102996826, "logps/chosen": -79.39334106445312, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -137.71192932128906, "loss": 0.516, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1836765855550766, "margin_dpo/beta_margin_grad_std": 0.22431671619415283, "margin_dpo/beta_margin_mean": 2.9460506439208984, "margin_dpo/beta_margin_std": 2.548676013946533, "margin_dpo/loss_margin_mean": 29.460506439208984, "margin_dpo/margin_mean": 29.460506439208984, "margin_dpo/margin_std": 24.365291595458984, "step": 375 }, { "epoch": 0.5521292217327459, "grad_norm": 48.23344421386719, "learning_rate": 2.5e-07, "logits/chosen": -0.6043192744255066, "logits/rejected": -0.5859169960021973, "logps/chosen": -82.08074951171875, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.526611328125, "logps/rejected": -157.26370239257812, "loss": 0.388, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14623141288757324, "margin_dpo/beta_margin_grad_std": 0.18046867847442627, "margin_dpo/beta_margin_mean": 3.23166561126709, "margin_dpo/beta_margin_std": 2.768313407897949, "margin_dpo/loss_margin_mean": 32.31665802001953, "margin_dpo/margin_mean": 32.31665802001953, "margin_dpo/margin_std": 27.595264434814453, "step": 376 }, { "epoch": 0.55359765051395, "grad_norm": 63.7028694152832, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.5758407115936279, "logits/rejected": -0.5594580769538879, "logps/chosen": -76.27653503417969, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -150.482421875, "loss": 0.5374, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18470792472362518, "margin_dpo/beta_margin_grad_std": 0.2296735942363739, "margin_dpo/beta_margin_mean": 2.9981284141540527, "margin_dpo/beta_margin_std": 2.736407995223999, "margin_dpo/loss_margin_mean": 29.98128318786621, "margin_dpo/margin_mean": 29.981285095214844, "margin_dpo/margin_std": 26.938751220703125, "step": 377 }, { "epoch": 0.5550660792951542, "grad_norm": 43.11890411376953, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.6386528015136719, "logits/rejected": -0.6268770694732666, "logps/chosen": -62.957725524902344, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -137.9139404296875, "loss": 0.4093, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14005307853221893, "margin_dpo/beta_margin_grad_std": 0.205052450299263, "margin_dpo/beta_margin_mean": 3.1929781436920166, "margin_dpo/beta_margin_std": 2.5704288482666016, "margin_dpo/loss_margin_mean": 31.929780960083008, "margin_dpo/margin_mean": 31.929779052734375, "margin_dpo/margin_std": 25.305004119873047, "step": 378 }, { "epoch": 0.5565345080763583, "grad_norm": 55.71260452270508, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.628246545791626, "logits/rejected": -0.6193478107452393, "logps/chosen": -72.79714965820312, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.30015563964844, "logps/rejected": -167.07992553710938, "loss": 0.4362, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1567826271057129, "margin_dpo/beta_margin_grad_std": 0.20273275673389435, "margin_dpo/beta_margin_mean": 3.3164710998535156, "margin_dpo/beta_margin_std": 2.8849940299987793, "margin_dpo/loss_margin_mean": 33.164710998535156, "margin_dpo/margin_mean": 33.164710998535156, "margin_dpo/margin_std": 28.102596282958984, "step": 379 }, { "epoch": 0.5580029368575624, "grad_norm": 80.04903411865234, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.5512839555740356, "logits/rejected": -0.5587849617004395, "logps/chosen": -74.60398864746094, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -162.14089965820312, "loss": 0.5716, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17476344108581543, "margin_dpo/beta_margin_grad_std": 0.26113981008529663, "margin_dpo/beta_margin_mean": 3.46982479095459, "margin_dpo/beta_margin_std": 3.075237512588501, "margin_dpo/loss_margin_mean": 34.69824981689453, "margin_dpo/margin_mean": 34.69824981689453, "margin_dpo/margin_std": 30.40831756591797, "step": 380 }, { "epoch": 0.5594713656387665, "grad_norm": 64.25127410888672, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.6331868171691895, "logits/rejected": -0.5953764915466309, "logps/chosen": -79.46463012695312, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -135.10447692871094, "loss": 0.5267, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16175399720668793, "margin_dpo/beta_margin_grad_std": 0.2353188693523407, "margin_dpo/beta_margin_mean": 3.4223499298095703, "margin_dpo/beta_margin_std": 3.058774471282959, "margin_dpo/loss_margin_mean": 34.2234992980957, "margin_dpo/margin_mean": 34.2234992980957, "margin_dpo/margin_std": 30.406585693359375, "step": 381 }, { "epoch": 0.5609397944199707, "grad_norm": 57.7338752746582, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.6542234420776367, "logits/rejected": -0.6203514337539673, "logps/chosen": -84.785400390625, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71015930175781, "logps/rejected": -164.99774169921875, "loss": 0.3934, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13284680247306824, "margin_dpo/beta_margin_grad_std": 0.2009667158126831, "margin_dpo/beta_margin_mean": 3.5522475242614746, "margin_dpo/beta_margin_std": 2.81280517578125, "margin_dpo/loss_margin_mean": 35.52247619628906, "margin_dpo/margin_mean": 35.52247619628906, "margin_dpo/margin_std": 27.49190330505371, "step": 382 }, { "epoch": 0.5624082232011748, "grad_norm": 53.96702575683594, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.6232413053512573, "logits/rejected": -0.6115109920501709, "logps/chosen": -72.27030944824219, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -147.88839721679688, "loss": 0.3758, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14286085963249207, "margin_dpo/beta_margin_grad_std": 0.1862889528274536, "margin_dpo/beta_margin_mean": 3.229367256164551, "margin_dpo/beta_margin_std": 2.584360122680664, "margin_dpo/loss_margin_mean": 32.29367446899414, "margin_dpo/margin_mean": 32.29367446899414, "margin_dpo/margin_std": 25.570125579833984, "step": 383 }, { "epoch": 0.5638766519823789, "grad_norm": 51.74114227294922, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.6301860809326172, "logits/rejected": -0.6027618050575256, "logps/chosen": -73.91239929199219, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -139.81455993652344, "loss": 0.4799, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1736118346452713, "margin_dpo/beta_margin_grad_std": 0.21324963867664337, "margin_dpo/beta_margin_mean": 3.2187373638153076, "margin_dpo/beta_margin_std": 3.164245367050171, "margin_dpo/loss_margin_mean": 32.18737030029297, "margin_dpo/margin_mean": 32.187374114990234, "margin_dpo/margin_std": 31.576709747314453, "step": 384 }, { "epoch": 0.5653450807635829, "grad_norm": 58.88325881958008, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.6432499885559082, "logits/rejected": -0.5914589166641235, "logps/chosen": -84.34367370605469, "logps/ref_chosen": -65.55216217041016, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -125.24491882324219, "loss": 0.4483, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16570799052715302, "margin_dpo/beta_margin_grad_std": 0.20454205572605133, "margin_dpo/beta_margin_mean": 2.8625473976135254, "margin_dpo/beta_margin_std": 2.207449197769165, "margin_dpo/loss_margin_mean": 28.625473022460938, "margin_dpo/margin_mean": 28.625473022460938, "margin_dpo/margin_std": 22.058141708374023, "step": 385 }, { "epoch": 0.566813509544787, "grad_norm": 67.18842315673828, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.6237103939056396, "logits/rejected": -0.5952026844024658, "logps/chosen": -79.99833679199219, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -148.12274169921875, "loss": 0.3538, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12317676097154617, "margin_dpo/beta_margin_grad_std": 0.18745048344135284, "margin_dpo/beta_margin_mean": 3.4018843173980713, "margin_dpo/beta_margin_std": 2.448117971420288, "margin_dpo/loss_margin_mean": 34.01884078979492, "margin_dpo/margin_mean": 34.01884078979492, "margin_dpo/margin_std": 24.252649307250977, "step": 386 }, { "epoch": 0.5682819383259912, "grad_norm": 67.83007049560547, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.6703263521194458, "logits/rejected": -0.6214380264282227, "logps/chosen": -85.0511474609375, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -139.54898071289062, "loss": 0.473, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16746900975704193, "margin_dpo/beta_margin_grad_std": 0.21500559151172638, "margin_dpo/beta_margin_mean": 2.874812602996826, "margin_dpo/beta_margin_std": 2.3288333415985107, "margin_dpo/loss_margin_mean": 28.748125076293945, "margin_dpo/margin_mean": 28.748126983642578, "margin_dpo/margin_std": 23.084835052490234, "step": 387 }, { "epoch": 0.5697503671071953, "grad_norm": 50.18345260620117, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.614050567150116, "logits/rejected": -0.6127490401268005, "logps/chosen": -71.32160186767578, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -161.62437438964844, "loss": 0.375, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13886678218841553, "margin_dpo/beta_margin_grad_std": 0.19045500457286835, "margin_dpo/beta_margin_mean": 3.599916458129883, "margin_dpo/beta_margin_std": 3.1401546001434326, "margin_dpo/loss_margin_mean": 35.99916458129883, "margin_dpo/margin_mean": 35.99916076660156, "margin_dpo/margin_std": 29.291606903076172, "step": 388 }, { "epoch": 0.5712187958883994, "grad_norm": 40.010982513427734, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.6919381022453308, "logits/rejected": -0.6485722064971924, "logps/chosen": -77.64576721191406, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -131.74371337890625, "loss": 0.3484, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13054527342319489, "margin_dpo/beta_margin_grad_std": 0.1837066113948822, "margin_dpo/beta_margin_mean": 3.4090003967285156, "margin_dpo/beta_margin_std": 2.6279804706573486, "margin_dpo/loss_margin_mean": 34.090003967285156, "margin_dpo/margin_mean": 34.090003967285156, "margin_dpo/margin_std": 26.119720458984375, "step": 389 }, { "epoch": 0.5726872246696035, "grad_norm": 66.40118408203125, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.6006795763969421, "logits/rejected": -0.5766574740409851, "logps/chosen": -70.85358428955078, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -138.63442993164062, "loss": 0.5273, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16621273756027222, "margin_dpo/beta_margin_grad_std": 0.23641641438007355, "margin_dpo/beta_margin_mean": 3.1478559970855713, "margin_dpo/beta_margin_std": 2.8455049991607666, "margin_dpo/loss_margin_mean": 31.478559494018555, "margin_dpo/margin_mean": 31.478557586669922, "margin_dpo/margin_std": 27.472740173339844, "step": 390 }, { "epoch": 0.5741556534508077, "grad_norm": 60.51673126220703, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.6301641464233398, "logits/rejected": -0.6100099086761475, "logps/chosen": -87.16332244873047, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -137.2554473876953, "loss": 0.5215, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18781331181526184, "margin_dpo/beta_margin_grad_std": 0.2151157259941101, "margin_dpo/beta_margin_mean": 2.7683560848236084, "margin_dpo/beta_margin_std": 2.7290608882904053, "margin_dpo/loss_margin_mean": 27.683561325073242, "margin_dpo/margin_mean": 27.683561325073242, "margin_dpo/margin_std": 26.582765579223633, "step": 391 }, { "epoch": 0.5756240822320118, "grad_norm": 42.56395721435547, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.640722393989563, "logits/rejected": -0.6189226508140564, "logps/chosen": -71.88900756835938, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34583282470703, "logps/rejected": -155.18275451660156, "loss": 0.296, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1182653084397316, "margin_dpo/beta_margin_grad_std": 0.1554841548204422, "margin_dpo/beta_margin_mean": 3.4451892375946045, "margin_dpo/beta_margin_std": 2.6532013416290283, "margin_dpo/loss_margin_mean": 34.4518928527832, "margin_dpo/margin_mean": 34.45188903808594, "margin_dpo/margin_std": 25.238216400146484, "step": 392 }, { "epoch": 0.5770925110132159, "grad_norm": 60.799842834472656, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.6209444403648376, "logits/rejected": -0.5936387777328491, "logps/chosen": -65.83811950683594, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -118.9813461303711, "loss": 0.537, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1976858228445053, "margin_dpo/beta_margin_grad_std": 0.21031783521175385, "margin_dpo/beta_margin_mean": 2.7648279666900635, "margin_dpo/beta_margin_std": 2.7872002124786377, "margin_dpo/loss_margin_mean": 27.648279190063477, "margin_dpo/margin_mean": 27.648279190063477, "margin_dpo/margin_std": 27.81708526611328, "step": 393 }, { "epoch": 0.57856093979442, "grad_norm": 54.4792594909668, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.6115927696228027, "logits/rejected": -0.5897927284240723, "logps/chosen": -71.93934631347656, "logps/ref_chosen": -52.91154479980469, "logps/ref_rejected": -90.82263946533203, "logps/rejected": -140.38363647460938, "loss": 0.5764, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1873016506433487, "margin_dpo/beta_margin_grad_std": 0.23149621486663818, "margin_dpo/beta_margin_mean": 3.0533206462860107, "margin_dpo/beta_margin_std": 3.1193511486053467, "margin_dpo/loss_margin_mean": 30.533206939697266, "margin_dpo/margin_mean": 30.533206939697266, "margin_dpo/margin_std": 31.190715789794922, "step": 394 }, { "epoch": 0.580029368575624, "grad_norm": 46.82632827758789, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.7002590894699097, "logits/rejected": -0.6835087537765503, "logps/chosen": -80.11807250976562, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -133.29296875, "loss": 0.4308, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16609802842140198, "margin_dpo/beta_margin_grad_std": 0.1954876035451889, "margin_dpo/beta_margin_mean": 3.1938390731811523, "margin_dpo/beta_margin_std": 2.6755151748657227, "margin_dpo/loss_margin_mean": 31.938390731811523, "margin_dpo/margin_mean": 31.93838882446289, "margin_dpo/margin_std": 26.722272872924805, "step": 395 }, { "epoch": 0.5814977973568282, "grad_norm": 48.82279586791992, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.6726581454277039, "logits/rejected": -0.6284117698669434, "logps/chosen": -88.34222412109375, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -139.3372802734375, "loss": 0.4054, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1422402560710907, "margin_dpo/beta_margin_grad_std": 0.21042148768901825, "margin_dpo/beta_margin_mean": 3.134434461593628, "margin_dpo/beta_margin_std": 2.398585557937622, "margin_dpo/loss_margin_mean": 31.344343185424805, "margin_dpo/margin_mean": 31.344345092773438, "margin_dpo/margin_std": 23.319499969482422, "step": 396 }, { "epoch": 0.5829662261380323, "grad_norm": 46.95646286010742, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.6410428285598755, "logits/rejected": -0.5968055725097656, "logps/chosen": -77.82908630371094, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -155.51364135742188, "loss": 0.3677, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13442568480968475, "margin_dpo/beta_margin_grad_std": 0.19571280479431152, "margin_dpo/beta_margin_mean": 3.5845582485198975, "margin_dpo/beta_margin_std": 2.7562365531921387, "margin_dpo/loss_margin_mean": 35.8455810546875, "margin_dpo/margin_mean": 35.8455810546875, "margin_dpo/margin_std": 27.45928955078125, "step": 397 }, { "epoch": 0.5844346549192364, "grad_norm": 48.90303421020508, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.7075108289718628, "logits/rejected": -0.683822751045227, "logps/chosen": -87.01773071289062, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -158.43099975585938, "loss": 0.3507, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13588033616542816, "margin_dpo/beta_margin_grad_std": 0.1709379106760025, "margin_dpo/beta_margin_mean": 3.104120969772339, "margin_dpo/beta_margin_std": 2.6031596660614014, "margin_dpo/loss_margin_mean": 31.041210174560547, "margin_dpo/margin_mean": 31.041208267211914, "margin_dpo/margin_std": 25.99789810180664, "step": 398 }, { "epoch": 0.5859030837004405, "grad_norm": 54.68693923950195, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.661706805229187, "logits/rejected": -0.5980038046836853, "logps/chosen": -77.65357971191406, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.96558380126953, "logps/rejected": -129.20294189453125, "loss": 0.4264, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14156478643417358, "margin_dpo/beta_margin_grad_std": 0.22005578875541687, "margin_dpo/beta_margin_mean": 3.4473578929901123, "margin_dpo/beta_margin_std": 2.549055814743042, "margin_dpo/loss_margin_mean": 34.47357940673828, "margin_dpo/margin_mean": 34.47357940673828, "margin_dpo/margin_std": 25.397838592529297, "step": 399 }, { "epoch": 0.5873715124816447, "grad_norm": 56.09019470214844, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.6517215967178345, "logits/rejected": -0.6144533753395081, "logps/chosen": -81.19852447509766, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -161.04153442382812, "loss": 0.4494, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13553817570209503, "margin_dpo/beta_margin_grad_std": 0.2256467044353485, "margin_dpo/beta_margin_mean": 3.8233911991119385, "margin_dpo/beta_margin_std": 3.120196580886841, "margin_dpo/loss_margin_mean": 38.233909606933594, "margin_dpo/margin_mean": 38.233909606933594, "margin_dpo/margin_std": 30.78741455078125, "step": 400 }, { "epoch": 0.5873715124816447, "eval_logits/chosen": -0.6053273677825928, "eval_logits/rejected": -0.5765178203582764, "eval_logps/chosen": -101.84111785888672, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -129.76779174804688, "eval_loss": 0.42193400859832764, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.26937761902809143, "eval_margin_dpo/beta_margin_grad_std": 0.25375083088874817, "eval_margin_dpo/beta_margin_mean": 2.017979383468628, "eval_margin_dpo/beta_margin_std": 2.53668212890625, "eval_margin_dpo/loss_margin_mean": 20.179792404174805, "eval_margin_dpo/margin_mean": 20.179792404174805, "eval_margin_dpo/margin_std": 25.366819381713867, "eval_runtime": 40.0988, "eval_samples_per_second": 58.331, "eval_steps_per_second": 1.845, "step": 400 }, { "epoch": 0.5888399412628488, "grad_norm": 61.670928955078125, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.5769657492637634, "logits/rejected": -0.5573090314865112, "logps/chosen": -76.15703582763672, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -132.30641174316406, "loss": 0.659, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2077464908361435, "margin_dpo/beta_margin_grad_std": 0.25239863991737366, "margin_dpo/beta_margin_mean": 2.951991558074951, "margin_dpo/beta_margin_std": 3.1654133796691895, "margin_dpo/loss_margin_mean": 29.519914627075195, "margin_dpo/margin_mean": 29.519912719726562, "margin_dpo/margin_std": 31.590171813964844, "step": 401 }, { "epoch": 0.5903083700440529, "grad_norm": 26.211894989013672, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.6328971982002258, "logits/rejected": -0.584295392036438, "logps/chosen": -62.62638854980469, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -131.90960693359375, "loss": 0.2132, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.08531676232814789, "margin_dpo/beta_margin_grad_std": 0.13695916533470154, "margin_dpo/beta_margin_mean": 3.8977549076080322, "margin_dpo/beta_margin_std": 2.383517265319824, "margin_dpo/loss_margin_mean": 38.97754669189453, "margin_dpo/margin_mean": 38.97754669189453, "margin_dpo/margin_std": 23.83334732055664, "step": 402 }, { "epoch": 0.591776798825257, "grad_norm": 63.287567138671875, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.6422700881958008, "logits/rejected": -0.6241501569747925, "logps/chosen": -62.53410339355469, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -122.82563781738281, "loss": 0.5741, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1763564795255661, "margin_dpo/beta_margin_grad_std": 0.23426702618598938, "margin_dpo/beta_margin_mean": 3.0172247886657715, "margin_dpo/beta_margin_std": 2.846843957901001, "margin_dpo/loss_margin_mean": 30.1722469329834, "margin_dpo/margin_mean": 30.17224884033203, "margin_dpo/margin_std": 28.130752563476562, "step": 403 }, { "epoch": 0.593245227606461, "grad_norm": 54.917449951171875, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.6123020648956299, "logits/rejected": -0.578801155090332, "logps/chosen": -63.40578079223633, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -123.22205352783203, "loss": 0.5739, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18759508430957794, "margin_dpo/beta_margin_grad_std": 0.23627623915672302, "margin_dpo/beta_margin_mean": 3.053654670715332, "margin_dpo/beta_margin_std": 2.8906707763671875, "margin_dpo/loss_margin_mean": 30.53654670715332, "margin_dpo/margin_mean": 30.53654670715332, "margin_dpo/margin_std": 28.8435115814209, "step": 404 }, { "epoch": 0.5947136563876652, "grad_norm": 64.9948501586914, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.5796902179718018, "logits/rejected": -0.549854040145874, "logps/chosen": -84.49642944335938, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -136.73745727539062, "loss": 0.5427, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18682169914245605, "margin_dpo/beta_margin_grad_std": 0.22749853134155273, "margin_dpo/beta_margin_mean": 3.021714210510254, "margin_dpo/beta_margin_std": 2.9265987873077393, "margin_dpo/loss_margin_mean": 30.217140197753906, "margin_dpo/margin_mean": 30.217140197753906, "margin_dpo/margin_std": 27.509521484375, "step": 405 }, { "epoch": 0.5961820851688693, "grad_norm": 61.627079010009766, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.6366710662841797, "logits/rejected": -0.6224513649940491, "logps/chosen": -66.44183349609375, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -140.552490234375, "loss": 0.5435, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1675260215997696, "margin_dpo/beta_margin_grad_std": 0.22542835772037506, "margin_dpo/beta_margin_mean": 3.1772830486297607, "margin_dpo/beta_margin_std": 2.8382697105407715, "margin_dpo/loss_margin_mean": 31.772830963134766, "margin_dpo/margin_mean": 31.772830963134766, "margin_dpo/margin_std": 27.909154891967773, "step": 406 }, { "epoch": 0.5976505139500734, "grad_norm": 68.63170623779297, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.6445499062538147, "logits/rejected": -0.599348783493042, "logps/chosen": -72.40534973144531, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.5120849609375, "logps/rejected": -125.86834716796875, "loss": 0.6107, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18231885135173798, "margin_dpo/beta_margin_grad_std": 0.25164178013801575, "margin_dpo/beta_margin_mean": 3.228161334991455, "margin_dpo/beta_margin_std": 2.978395462036133, "margin_dpo/loss_margin_mean": 32.281612396240234, "margin_dpo/margin_mean": 32.281612396240234, "margin_dpo/margin_std": 29.094558715820312, "step": 407 }, { "epoch": 0.5991189427312775, "grad_norm": 81.44627380371094, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.6902725696563721, "logits/rejected": -0.6373718976974487, "logps/chosen": -85.27023315429688, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -133.43089294433594, "loss": 0.6172, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20204903185367584, "margin_dpo/beta_margin_grad_std": 0.25068265199661255, "margin_dpo/beta_margin_mean": 2.785304069519043, "margin_dpo/beta_margin_std": 2.7439894676208496, "margin_dpo/loss_margin_mean": 27.853038787841797, "margin_dpo/margin_mean": 27.853038787841797, "margin_dpo/margin_std": 27.155353546142578, "step": 408 }, { "epoch": 0.6005873715124816, "grad_norm": 49.702667236328125, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.6016473770141602, "logits/rejected": -0.5694031119346619, "logps/chosen": -84.24601745605469, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -132.7557373046875, "loss": 0.4121, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1540304273366928, "margin_dpo/beta_margin_grad_std": 0.19426687061786652, "margin_dpo/beta_margin_mean": 3.092287302017212, "margin_dpo/beta_margin_std": 2.565514326095581, "margin_dpo/loss_margin_mean": 30.92287254333496, "margin_dpo/margin_mean": 30.92287254333496, "margin_dpo/margin_std": 25.64594078063965, "step": 409 }, { "epoch": 0.6020558002936858, "grad_norm": 67.67236328125, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.6346931457519531, "logits/rejected": -0.6326348781585693, "logps/chosen": -70.31283569335938, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -129.9054718017578, "loss": 0.5957, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20088441669940948, "margin_dpo/beta_margin_grad_std": 0.24235385656356812, "margin_dpo/beta_margin_mean": 2.7859020233154297, "margin_dpo/beta_margin_std": 2.646648406982422, "margin_dpo/loss_margin_mean": 27.859020233154297, "margin_dpo/margin_mean": 27.859020233154297, "margin_dpo/margin_std": 26.202781677246094, "step": 410 }, { "epoch": 0.6035242290748899, "grad_norm": 57.020423889160156, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.6736893653869629, "logits/rejected": -0.640461802482605, "logps/chosen": -71.45536041259766, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -129.53814697265625, "loss": 0.5393, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1864738166332245, "margin_dpo/beta_margin_grad_std": 0.2311916947364807, "margin_dpo/beta_margin_mean": 2.969511032104492, "margin_dpo/beta_margin_std": 2.880366563796997, "margin_dpo/loss_margin_mean": 29.695110321044922, "margin_dpo/margin_mean": 29.69510841369629, "margin_dpo/margin_std": 27.609901428222656, "step": 411 }, { "epoch": 0.604992657856094, "grad_norm": 41.312705993652344, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.702052652835846, "logits/rejected": -0.6910427808761597, "logps/chosen": -68.71327209472656, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.06080627441406, "logps/rejected": -145.08905029296875, "loss": 0.501, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18716482818126678, "margin_dpo/beta_margin_grad_std": 0.21060419082641602, "margin_dpo/beta_margin_mean": 2.9459095001220703, "margin_dpo/beta_margin_std": 2.7281651496887207, "margin_dpo/loss_margin_mean": 29.459095001220703, "margin_dpo/margin_mean": 29.459096908569336, "margin_dpo/margin_std": 27.16181182861328, "step": 412 }, { "epoch": 0.6064610866372981, "grad_norm": 59.28904724121094, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.5773541927337646, "logits/rejected": -0.5431898832321167, "logps/chosen": -80.49532318115234, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -148.28915405273438, "loss": 0.4911, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15901578962802887, "margin_dpo/beta_margin_grad_std": 0.21321162581443787, "margin_dpo/beta_margin_mean": 2.990238666534424, "margin_dpo/beta_margin_std": 2.5804879665374756, "margin_dpo/loss_margin_mean": 29.902387619018555, "margin_dpo/margin_mean": 29.902387619018555, "margin_dpo/margin_std": 25.28069496154785, "step": 413 }, { "epoch": 0.6079295154185022, "grad_norm": 55.32724380493164, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.6178678274154663, "logits/rejected": -0.580098032951355, "logps/chosen": -62.55944061279297, "logps/ref_chosen": -46.63148880004883, "logps/ref_rejected": -87.64652252197266, "logps/rejected": -139.25851440429688, "loss": 0.3637, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13259248435497284, "margin_dpo/beta_margin_grad_std": 0.19208675622940063, "margin_dpo/beta_margin_mean": 3.56840443611145, "margin_dpo/beta_margin_std": 2.785224437713623, "margin_dpo/loss_margin_mean": 35.684043884277344, "margin_dpo/margin_mean": 35.684043884277344, "margin_dpo/margin_std": 25.48971176147461, "step": 414 }, { "epoch": 0.6093979441997063, "grad_norm": 44.93287658691406, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.5938626527786255, "logits/rejected": -0.5675798654556274, "logps/chosen": -95.38108825683594, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -146.9215850830078, "loss": 0.3982, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1585043966770172, "margin_dpo/beta_margin_grad_std": 0.17520886659622192, "margin_dpo/beta_margin_mean": 2.9681267738342285, "margin_dpo/beta_margin_std": 2.4819741249084473, "margin_dpo/loss_margin_mean": 29.68126678466797, "margin_dpo/margin_mean": 29.6812686920166, "margin_dpo/margin_std": 24.717784881591797, "step": 415 }, { "epoch": 0.6108663729809104, "grad_norm": 49.30588150024414, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.6397312879562378, "logits/rejected": -0.6059544086456299, "logps/chosen": -75.66851806640625, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -144.47354125976562, "loss": 0.3876, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1345895677804947, "margin_dpo/beta_margin_grad_std": 0.2077297866344452, "margin_dpo/beta_margin_mean": 3.651543617248535, "margin_dpo/beta_margin_std": 2.8901610374450684, "margin_dpo/loss_margin_mean": 36.51543426513672, "margin_dpo/margin_mean": 36.51543426513672, "margin_dpo/margin_std": 27.713136672973633, "step": 416 }, { "epoch": 0.6123348017621145, "grad_norm": 38.555389404296875, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.6252127289772034, "logits/rejected": -0.5568169355392456, "logps/chosen": -66.67837524414062, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -119.5999755859375, "loss": 0.3154, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12362627685070038, "margin_dpo/beta_margin_grad_std": 0.17288993299007416, "margin_dpo/beta_margin_mean": 3.496847629547119, "margin_dpo/beta_margin_std": 2.3686065673828125, "margin_dpo/loss_margin_mean": 34.968475341796875, "margin_dpo/margin_mean": 34.968475341796875, "margin_dpo/margin_std": 23.240657806396484, "step": 417 }, { "epoch": 0.6138032305433186, "grad_norm": 64.83741760253906, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6693556904792786, "logits/rejected": -0.6072407960891724, "logps/chosen": -99.16204833984375, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -156.01602172851562, "loss": 0.5663, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16488902270793915, "margin_dpo/beta_margin_grad_std": 0.2364022433757782, "margin_dpo/beta_margin_mean": 3.4140961170196533, "margin_dpo/beta_margin_std": 3.2014167308807373, "margin_dpo/loss_margin_mean": 34.140960693359375, "margin_dpo/margin_mean": 34.140960693359375, "margin_dpo/margin_std": 31.345539093017578, "step": 418 }, { "epoch": 0.6152716593245228, "grad_norm": 46.96452331542969, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.5874903202056885, "logits/rejected": -0.5439319610595703, "logps/chosen": -70.24662780761719, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -142.82901000976562, "loss": 0.3088, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11957548558712006, "margin_dpo/beta_margin_grad_std": 0.16336920857429504, "margin_dpo/beta_margin_mean": 3.3947930335998535, "margin_dpo/beta_margin_std": 2.4165024757385254, "margin_dpo/loss_margin_mean": 33.94792938232422, "margin_dpo/margin_mean": 33.94792938232422, "margin_dpo/margin_std": 23.982418060302734, "step": 419 }, { "epoch": 0.6167400881057269, "grad_norm": 61.41410827636719, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.6708568930625916, "logits/rejected": -0.594412624835968, "logps/chosen": -95.875244140625, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -120.385009765625, "loss": 0.4567, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1499803215265274, "margin_dpo/beta_margin_grad_std": 0.21647407114505768, "margin_dpo/beta_margin_mean": 3.2959823608398438, "margin_dpo/beta_margin_std": 2.774785041809082, "margin_dpo/loss_margin_mean": 32.95982360839844, "margin_dpo/margin_mean": 32.95981979370117, "margin_dpo/margin_std": 27.405033111572266, "step": 420 }, { "epoch": 0.618208516886931, "grad_norm": 79.4913330078125, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.6430982351303101, "logits/rejected": -0.6089684963226318, "logps/chosen": -73.95745849609375, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -137.42054748535156, "loss": 0.6226, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1968107521533966, "margin_dpo/beta_margin_grad_std": 0.24985744059085846, "margin_dpo/beta_margin_mean": 2.794750928878784, "margin_dpo/beta_margin_std": 2.7135543823242188, "margin_dpo/loss_margin_mean": 27.947509765625, "margin_dpo/margin_mean": 27.947509765625, "margin_dpo/margin_std": 26.780242919921875, "step": 421 }, { "epoch": 0.6196769456681351, "grad_norm": 66.62350463867188, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.6613567471504211, "logits/rejected": -0.6198326349258423, "logps/chosen": -89.37240600585938, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -138.73875427246094, "loss": 0.5663, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1885911077260971, "margin_dpo/beta_margin_grad_std": 0.2437172681093216, "margin_dpo/beta_margin_mean": 3.013660192489624, "margin_dpo/beta_margin_std": 2.8644890785217285, "margin_dpo/loss_margin_mean": 30.136600494384766, "margin_dpo/margin_mean": 30.136600494384766, "margin_dpo/margin_std": 28.641185760498047, "step": 422 }, { "epoch": 0.6211453744493393, "grad_norm": 66.34683227539062, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.6761616468429565, "logits/rejected": -0.6586691737174988, "logps/chosen": -74.14385986328125, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -136.7641143798828, "loss": 0.5527, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18222779035568237, "margin_dpo/beta_margin_grad_std": 0.2336231768131256, "margin_dpo/beta_margin_mean": 3.1304006576538086, "margin_dpo/beta_margin_std": 2.9750967025756836, "margin_dpo/loss_margin_mean": 31.304006576538086, "margin_dpo/margin_mean": 31.304006576538086, "margin_dpo/margin_std": 29.631959915161133, "step": 423 }, { "epoch": 0.6226138032305433, "grad_norm": 48.2248649597168, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.6284000873565674, "logits/rejected": -0.5798854231834412, "logps/chosen": -73.47090148925781, "logps/ref_chosen": -56.01191711425781, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -118.09882354736328, "loss": 0.4263, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15188807249069214, "margin_dpo/beta_margin_grad_std": 0.21018096804618835, "margin_dpo/beta_margin_mean": 3.4160873889923096, "margin_dpo/beta_margin_std": 3.078399419784546, "margin_dpo/loss_margin_mean": 34.16087341308594, "margin_dpo/margin_mean": 34.16087341308594, "margin_dpo/margin_std": 30.704998016357422, "step": 424 }, { "epoch": 0.6240822320117474, "grad_norm": 56.79523849487305, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.6415660381317139, "logits/rejected": -0.6306988000869751, "logps/chosen": -65.49158477783203, "logps/ref_chosen": -46.868995666503906, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -145.18174743652344, "loss": 0.5067, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16396918892860413, "margin_dpo/beta_margin_grad_std": 0.22638258337974548, "margin_dpo/beta_margin_mean": 3.063370704650879, "margin_dpo/beta_margin_std": 2.508201837539673, "margin_dpo/loss_margin_mean": 30.63370704650879, "margin_dpo/margin_mean": 30.633705139160156, "margin_dpo/margin_std": 24.710655212402344, "step": 425 }, { "epoch": 0.6255506607929515, "grad_norm": 73.21717071533203, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.6280812621116638, "logits/rejected": -0.5743027925491333, "logps/chosen": -93.59241485595703, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -132.15199279785156, "loss": 0.4413, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13312982022762299, "margin_dpo/beta_margin_grad_std": 0.21179711818695068, "margin_dpo/beta_margin_mean": 3.3876538276672363, "margin_dpo/beta_margin_std": 2.695366144180298, "margin_dpo/loss_margin_mean": 33.87653732299805, "margin_dpo/margin_mean": 33.87653732299805, "margin_dpo/margin_std": 26.354013442993164, "step": 426 }, { "epoch": 0.6270190895741556, "grad_norm": 56.27901840209961, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6733847856521606, "logits/rejected": -0.6199424266815186, "logps/chosen": -81.7194595336914, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.56600952148438, "logps/rejected": -120.1583251953125, "loss": 0.5885, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.190776988863945, "margin_dpo/beta_margin_grad_std": 0.20414692163467407, "margin_dpo/beta_margin_mean": 2.4726738929748535, "margin_dpo/beta_margin_std": 2.3600223064422607, "margin_dpo/loss_margin_mean": 24.72673797607422, "margin_dpo/margin_mean": 24.72673797607422, "margin_dpo/margin_std": 23.543621063232422, "step": 427 }, { "epoch": 0.6284875183553598, "grad_norm": 44.09659957885742, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.6260280609130859, "logits/rejected": -0.5897619724273682, "logps/chosen": -83.10867309570312, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28182220458984, "logps/rejected": -159.34945678710938, "loss": 0.3299, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.124484583735466, "margin_dpo/beta_margin_grad_std": 0.1756177842617035, "margin_dpo/beta_margin_mean": 3.559562921524048, "margin_dpo/beta_margin_std": 2.730299234390259, "margin_dpo/loss_margin_mean": 35.59562683105469, "margin_dpo/margin_mean": 35.59562683105469, "margin_dpo/margin_std": 27.276784896850586, "step": 428 }, { "epoch": 0.6299559471365639, "grad_norm": 61.60802459716797, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.636346697807312, "logits/rejected": -0.5803790092468262, "logps/chosen": -101.16323852539062, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -141.40106201171875, "loss": 0.595, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1930510401725769, "margin_dpo/beta_margin_grad_std": 0.24151724576950073, "margin_dpo/beta_margin_mean": 2.967691421508789, "margin_dpo/beta_margin_std": 2.905752182006836, "margin_dpo/loss_margin_mean": 29.67691421508789, "margin_dpo/margin_mean": 29.67691421508789, "margin_dpo/margin_std": 28.60194969177246, "step": 429 }, { "epoch": 0.631424375917768, "grad_norm": 51.62448501586914, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.6227689981460571, "logits/rejected": -0.6045354008674622, "logps/chosen": -79.01585388183594, "logps/ref_chosen": -60.92032241821289, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -153.30923461914062, "loss": 0.4761, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1646682769060135, "margin_dpo/beta_margin_grad_std": 0.2231719046831131, "margin_dpo/beta_margin_mean": 3.0790908336639404, "margin_dpo/beta_margin_std": 2.675757884979248, "margin_dpo/loss_margin_mean": 30.790908813476562, "margin_dpo/margin_mean": 30.790908813476562, "margin_dpo/margin_std": 26.328550338745117, "step": 430 }, { "epoch": 0.6328928046989721, "grad_norm": 45.01468276977539, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.6120933294296265, "logits/rejected": -0.5965217351913452, "logps/chosen": -76.03721618652344, "logps/ref_chosen": -57.348751068115234, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -146.1577911376953, "loss": 0.3416, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13492785394191742, "margin_dpo/beta_margin_grad_std": 0.17364878952503204, "margin_dpo/beta_margin_mean": 3.4629099369049072, "margin_dpo/beta_margin_std": 2.6419167518615723, "margin_dpo/loss_margin_mean": 34.62909698486328, "margin_dpo/margin_mean": 34.62909698486328, "margin_dpo/margin_std": 26.410173416137695, "step": 431 }, { "epoch": 0.6343612334801763, "grad_norm": 55.2720947265625, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.6020532250404358, "logits/rejected": -0.5625859498977661, "logps/chosen": -88.92323303222656, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -148.73974609375, "loss": 0.4396, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1505272537469864, "margin_dpo/beta_margin_grad_std": 0.20940996706485748, "margin_dpo/beta_margin_mean": 3.231178045272827, "margin_dpo/beta_margin_std": 2.7881994247436523, "margin_dpo/loss_margin_mean": 32.3117790222168, "margin_dpo/margin_mean": 32.3117790222168, "margin_dpo/margin_std": 27.102590560913086, "step": 432 }, { "epoch": 0.6358296622613803, "grad_norm": 71.04937744140625, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.6384241580963135, "logits/rejected": -0.6068836450576782, "logps/chosen": -81.66122436523438, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -147.70458984375, "loss": 0.5849, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19293591380119324, "margin_dpo/beta_margin_grad_std": 0.2420828938484192, "margin_dpo/beta_margin_mean": 2.83656644821167, "margin_dpo/beta_margin_std": 2.67340350151062, "margin_dpo/loss_margin_mean": 28.365663528442383, "margin_dpo/margin_mean": 28.365665435791016, "margin_dpo/margin_std": 26.324649810791016, "step": 433 }, { "epoch": 0.6372980910425844, "grad_norm": 48.197303771972656, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.6621353626251221, "logits/rejected": -0.6182979345321655, "logps/chosen": -79.50520324707031, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -125.54408264160156, "loss": 0.4218, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15651409327983856, "margin_dpo/beta_margin_grad_std": 0.19872474670410156, "margin_dpo/beta_margin_mean": 2.9513981342315674, "margin_dpo/beta_margin_std": 2.5280110836029053, "margin_dpo/loss_margin_mean": 29.513980865478516, "margin_dpo/margin_mean": 29.513980865478516, "margin_dpo/margin_std": 25.25749969482422, "step": 434 }, { "epoch": 0.6387665198237885, "grad_norm": 63.86077117919922, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.6548939943313599, "logits/rejected": -0.6191599369049072, "logps/chosen": -69.592041015625, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -138.00416564941406, "loss": 0.5053, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1661788821220398, "margin_dpo/beta_margin_grad_std": 0.21013152599334717, "margin_dpo/beta_margin_mean": 3.234729051589966, "margin_dpo/beta_margin_std": 2.9603843688964844, "margin_dpo/loss_margin_mean": 32.3472900390625, "margin_dpo/margin_mean": 32.3472900390625, "margin_dpo/margin_std": 29.455238342285156, "step": 435 }, { "epoch": 0.6402349486049926, "grad_norm": 40.332698822021484, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.632080078125, "logits/rejected": -0.5719594955444336, "logps/chosen": -88.88678741455078, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -131.7387237548828, "loss": 0.3539, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13681164383888245, "margin_dpo/beta_margin_grad_std": 0.17888766527175903, "margin_dpo/beta_margin_mean": 3.152789354324341, "margin_dpo/beta_margin_std": 2.4287147521972656, "margin_dpo/loss_margin_mean": 31.527891159057617, "margin_dpo/margin_mean": 31.527891159057617, "margin_dpo/margin_std": 24.248245239257812, "step": 436 }, { "epoch": 0.6417033773861968, "grad_norm": 51.00373840332031, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.6353539228439331, "logits/rejected": -0.5929083824157715, "logps/chosen": -81.99353790283203, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -127.4609375, "loss": 0.5269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1786879152059555, "margin_dpo/beta_margin_grad_std": 0.23251357674598694, "margin_dpo/beta_margin_mean": 2.81680965423584, "margin_dpo/beta_margin_std": 2.37345027923584, "margin_dpo/loss_margin_mean": 28.168094635009766, "margin_dpo/margin_mean": 28.168094635009766, "margin_dpo/margin_std": 23.416202545166016, "step": 437 }, { "epoch": 0.6431718061674009, "grad_norm": 63.38606643676758, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.6576756238937378, "logits/rejected": -0.5832280516624451, "logps/chosen": -78.18193817138672, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682037353516, "logps/rejected": -136.60678100585938, "loss": 0.3695, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13736094534397125, "margin_dpo/beta_margin_grad_std": 0.18339543044567108, "margin_dpo/beta_margin_mean": 3.4366683959960938, "margin_dpo/beta_margin_std": 2.721860408782959, "margin_dpo/loss_margin_mean": 34.36668395996094, "margin_dpo/margin_mean": 34.36668395996094, "margin_dpo/margin_std": 26.956180572509766, "step": 438 }, { "epoch": 0.644640234948605, "grad_norm": 71.34723663330078, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.6156207323074341, "logits/rejected": -0.5937438607215881, "logps/chosen": -77.27565002441406, "logps/ref_chosen": -53.784080505371094, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -134.45162963867188, "loss": 0.5261, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18373528122901917, "margin_dpo/beta_margin_grad_std": 0.2151244729757309, "margin_dpo/beta_margin_mean": 2.6974589824676514, "margin_dpo/beta_margin_std": 2.4870941638946533, "margin_dpo/loss_margin_mean": 26.97458839416504, "margin_dpo/margin_mean": 26.974590301513672, "margin_dpo/margin_std": 23.787738800048828, "step": 439 }, { "epoch": 0.6461086637298091, "grad_norm": 96.4582290649414, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.6607520580291748, "logits/rejected": -0.6199520826339722, "logps/chosen": -96.89436340332031, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -140.70578002929688, "loss": 0.6669, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2098814845085144, "margin_dpo/beta_margin_grad_std": 0.25330764055252075, "margin_dpo/beta_margin_mean": 2.5880370140075684, "margin_dpo/beta_margin_std": 2.716387987136841, "margin_dpo/loss_margin_mean": 25.880369186401367, "margin_dpo/margin_mean": 25.880369186401367, "margin_dpo/margin_std": 27.07331085205078, "step": 440 }, { "epoch": 0.6475770925110133, "grad_norm": 49.82929229736328, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.5963351726531982, "logits/rejected": -0.5610902309417725, "logps/chosen": -80.72434997558594, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -147.17962646484375, "loss": 0.4379, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13756851851940155, "margin_dpo/beta_margin_grad_std": 0.19719012081623077, "margin_dpo/beta_margin_mean": 3.080892562866211, "margin_dpo/beta_margin_std": 2.426534414291382, "margin_dpo/loss_margin_mean": 30.808923721313477, "margin_dpo/margin_mean": 30.808923721313477, "margin_dpo/margin_std": 23.68011474609375, "step": 441 }, { "epoch": 0.6490455212922174, "grad_norm": 36.576942443847656, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.6140519380569458, "logits/rejected": -0.5783543586730957, "logps/chosen": -64.97787475585938, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -132.34170532226562, "loss": 0.2823, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1093648299574852, "margin_dpo/beta_margin_grad_std": 0.16080023348331451, "margin_dpo/beta_margin_mean": 3.7835774421691895, "margin_dpo/beta_margin_std": 2.697312593460083, "margin_dpo/loss_margin_mean": 37.83577346801758, "margin_dpo/margin_mean": 37.83577346801758, "margin_dpo/margin_std": 26.404239654541016, "step": 442 }, { "epoch": 0.6505139500734214, "grad_norm": 79.94059753417969, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6004323959350586, "logits/rejected": -0.5627496242523193, "logps/chosen": -93.62142944335938, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -137.5754852294922, "loss": 0.6573, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20045503973960876, "margin_dpo/beta_margin_grad_std": 0.263896644115448, "margin_dpo/beta_margin_mean": 2.8547141551971436, "margin_dpo/beta_margin_std": 2.7842366695404053, "margin_dpo/loss_margin_mean": 28.54714012145996, "margin_dpo/margin_mean": 28.54714012145996, "margin_dpo/margin_std": 27.68130111694336, "step": 443 }, { "epoch": 0.6519823788546255, "grad_norm": 47.294471740722656, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.5860676169395447, "logits/rejected": -0.5605667233467102, "logps/chosen": -76.857421875, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489044189453, "logps/rejected": -151.163330078125, "loss": 0.4389, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14946991205215454, "margin_dpo/beta_margin_grad_std": 0.21209140121936798, "margin_dpo/beta_margin_mean": 3.4264724254608154, "margin_dpo/beta_margin_std": 2.8675122261047363, "margin_dpo/loss_margin_mean": 34.26472091674805, "margin_dpo/margin_mean": 34.26472473144531, "margin_dpo/margin_std": 28.598800659179688, "step": 444 }, { "epoch": 0.6534508076358296, "grad_norm": 61.75363540649414, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.6285964250564575, "logits/rejected": -0.5963205695152283, "logps/chosen": -84.33077239990234, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267822265625, "logps/rejected": -141.63113403320312, "loss": 0.4624, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16633237898349762, "margin_dpo/beta_margin_grad_std": 0.21091465651988983, "margin_dpo/beta_margin_mean": 3.1917476654052734, "margin_dpo/beta_margin_std": 2.7972888946533203, "margin_dpo/loss_margin_mean": 31.917476654052734, "margin_dpo/margin_mean": 31.917476654052734, "margin_dpo/margin_std": 27.65774154663086, "step": 445 }, { "epoch": 0.6549192364170338, "grad_norm": 54.28517532348633, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.5897877216339111, "logits/rejected": -0.568926215171814, "logps/chosen": -76.78812408447266, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -149.1267852783203, "loss": 0.4368, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16494978964328766, "margin_dpo/beta_margin_grad_std": 0.1965719312429428, "margin_dpo/beta_margin_mean": 3.0172958374023438, "margin_dpo/beta_margin_std": 2.6881942749023438, "margin_dpo/loss_margin_mean": 30.172958374023438, "margin_dpo/margin_mean": 30.172958374023438, "margin_dpo/margin_std": 26.31899070739746, "step": 446 }, { "epoch": 0.6563876651982379, "grad_norm": 51.364315032958984, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.660834014415741, "logits/rejected": -0.6618390083312988, "logps/chosen": -76.43832397460938, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -150.78875732421875, "loss": 0.4538, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16636352241039276, "margin_dpo/beta_margin_grad_std": 0.20971129834651947, "margin_dpo/beta_margin_mean": 3.1731016635894775, "margin_dpo/beta_margin_std": 2.897716760635376, "margin_dpo/loss_margin_mean": 31.731016159057617, "margin_dpo/margin_mean": 31.73101806640625, "margin_dpo/margin_std": 28.281917572021484, "step": 447 }, { "epoch": 0.657856093979442, "grad_norm": 47.65716552734375, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.6168828010559082, "logits/rejected": -0.600253701210022, "logps/chosen": -80.88053131103516, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -155.2429962158203, "loss": 0.3892, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13886789977550507, "margin_dpo/beta_margin_grad_std": 0.18517683446407318, "margin_dpo/beta_margin_mean": 3.2101657390594482, "margin_dpo/beta_margin_std": 2.5543386936187744, "margin_dpo/loss_margin_mean": 32.101654052734375, "margin_dpo/margin_mean": 32.101654052734375, "margin_dpo/margin_std": 25.43906021118164, "step": 448 }, { "epoch": 0.6593245227606461, "grad_norm": 79.98429107666016, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.5817546248435974, "logits/rejected": -0.5362948179244995, "logps/chosen": -89.62152099609375, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -142.1068878173828, "loss": 0.6043, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17372801899909973, "margin_dpo/beta_margin_grad_std": 0.23874573409557343, "margin_dpo/beta_margin_mean": 3.141507387161255, "margin_dpo/beta_margin_std": 3.0401062965393066, "margin_dpo/loss_margin_mean": 31.41507339477539, "margin_dpo/margin_mean": 31.41507339477539, "margin_dpo/margin_std": 30.071718215942383, "step": 449 }, { "epoch": 0.6607929515418502, "grad_norm": 44.39156723022461, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6516839265823364, "logits/rejected": -0.6243829727172852, "logps/chosen": -73.27051544189453, "logps/ref_chosen": -57.108116149902344, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -153.4552459716797, "loss": 0.3207, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12840278446674347, "margin_dpo/beta_margin_grad_std": 0.1644502729177475, "margin_dpo/beta_margin_mean": 3.4537904262542725, "margin_dpo/beta_margin_std": 2.6313655376434326, "margin_dpo/loss_margin_mean": 34.53790283203125, "margin_dpo/margin_mean": 34.53790283203125, "margin_dpo/margin_std": 25.840599060058594, "step": 450 }, { "epoch": 0.6622613803230544, "grad_norm": 75.2901382446289, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.6581634283065796, "logits/rejected": -0.6103047132492065, "logps/chosen": -80.07470703125, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -124.00057983398438, "loss": 0.5537, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16929282248020172, "margin_dpo/beta_margin_grad_std": 0.2288813591003418, "margin_dpo/beta_margin_mean": 2.9465293884277344, "margin_dpo/beta_margin_std": 2.6042659282684326, "margin_dpo/loss_margin_mean": 29.465293884277344, "margin_dpo/margin_mean": 29.465293884277344, "margin_dpo/margin_std": 25.818279266357422, "step": 451 }, { "epoch": 0.6637298091042585, "grad_norm": 35.9453239440918, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.546400785446167, "logits/rejected": -0.5479906797409058, "logps/chosen": -66.35121154785156, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77143096923828, "logps/rejected": -143.86688232421875, "loss": 0.2857, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12084120512008667, "margin_dpo/beta_margin_grad_std": 0.13351190090179443, "margin_dpo/beta_margin_mean": 3.0134785175323486, "margin_dpo/beta_margin_std": 2.2046637535095215, "margin_dpo/loss_margin_mean": 30.134784698486328, "margin_dpo/margin_mean": 30.134784698486328, "margin_dpo/margin_std": 21.948862075805664, "step": 452 }, { "epoch": 0.6651982378854625, "grad_norm": 51.65986633300781, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.6441305875778198, "logits/rejected": -0.5928350687026978, "logps/chosen": -76.96781921386719, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -130.54562377929688, "loss": 0.4664, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16998730599880219, "margin_dpo/beta_margin_grad_std": 0.2013465166091919, "margin_dpo/beta_margin_mean": 2.90852427482605, "margin_dpo/beta_margin_std": 2.558598279953003, "margin_dpo/loss_margin_mean": 29.085243225097656, "margin_dpo/margin_mean": 29.085243225097656, "margin_dpo/margin_std": 25.423097610473633, "step": 453 }, { "epoch": 0.6666666666666666, "grad_norm": 59.35947036743164, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.6171283721923828, "logits/rejected": -0.5738873481750488, "logps/chosen": -81.52304077148438, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.9507827758789, "logps/rejected": -137.84750366210938, "loss": 0.4047, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14290541410446167, "margin_dpo/beta_margin_grad_std": 0.2013457864522934, "margin_dpo/beta_margin_mean": 3.2319328784942627, "margin_dpo/beta_margin_std": 2.6018524169921875, "margin_dpo/loss_margin_mean": 32.31932830810547, "margin_dpo/margin_mean": 32.31932830810547, "margin_dpo/margin_std": 25.902687072753906, "step": 454 }, { "epoch": 0.6681350954478708, "grad_norm": 93.24932861328125, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6162554621696472, "logits/rejected": -0.5891969203948975, "logps/chosen": -64.90336608886719, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -172.52194213867188, "loss": 0.3629, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1229911670088768, "margin_dpo/beta_margin_grad_std": 0.19356586039066315, "margin_dpo/beta_margin_mean": 3.7203803062438965, "margin_dpo/beta_margin_std": 2.6521599292755127, "margin_dpo/loss_margin_mean": 37.203800201416016, "margin_dpo/margin_mean": 37.203800201416016, "margin_dpo/margin_std": 26.29052734375, "step": 455 }, { "epoch": 0.6696035242290749, "grad_norm": 49.41230010986328, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.6331825256347656, "logits/rejected": -0.6310149431228638, "logps/chosen": -74.30551147460938, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21783447265625, "logps/rejected": -160.00119018554688, "loss": 0.354, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13223397731781006, "margin_dpo/beta_margin_grad_std": 0.1870342195034027, "margin_dpo/beta_margin_mean": 3.751462936401367, "margin_dpo/beta_margin_std": 2.8913846015930176, "margin_dpo/loss_margin_mean": 37.51462936401367, "margin_dpo/margin_mean": 37.51462936401367, "margin_dpo/margin_std": 28.42435073852539, "step": 456 }, { "epoch": 0.671071953010279, "grad_norm": 59.397212982177734, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.6320329308509827, "logits/rejected": -0.6049121618270874, "logps/chosen": -72.51710510253906, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -136.43548583984375, "loss": 0.386, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13050609827041626, "margin_dpo/beta_margin_grad_std": 0.19707661867141724, "margin_dpo/beta_margin_mean": 3.3051366806030273, "margin_dpo/beta_margin_std": 2.4597647190093994, "margin_dpo/loss_margin_mean": 33.051368713378906, "margin_dpo/margin_mean": 33.051368713378906, "margin_dpo/margin_std": 24.472869873046875, "step": 457 }, { "epoch": 0.6725403817914831, "grad_norm": 56.77046585083008, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.6297258138656616, "logits/rejected": -0.5929204225540161, "logps/chosen": -74.71857452392578, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -155.025146484375, "loss": 0.4411, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14024901390075684, "margin_dpo/beta_margin_grad_std": 0.23043015599250793, "margin_dpo/beta_margin_mean": 3.8032190799713135, "margin_dpo/beta_margin_std": 3.263706922531128, "margin_dpo/loss_margin_mean": 38.03219223022461, "margin_dpo/margin_mean": 38.03219223022461, "margin_dpo/margin_std": 32.41196060180664, "step": 458 }, { "epoch": 0.6740088105726872, "grad_norm": 47.203277587890625, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.592144250869751, "logits/rejected": -0.5651764869689941, "logps/chosen": -67.03455352783203, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -135.55999755859375, "loss": 0.4592, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16323688626289368, "margin_dpo/beta_margin_grad_std": 0.19608436524868011, "margin_dpo/beta_margin_mean": 2.8914406299591064, "margin_dpo/beta_margin_std": 2.370115280151367, "margin_dpo/loss_margin_mean": 28.914405822753906, "margin_dpo/margin_mean": 28.914405822753906, "margin_dpo/margin_std": 23.49092674255371, "step": 459 }, { "epoch": 0.6754772393538914, "grad_norm": 59.67298126220703, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.5806307792663574, "logits/rejected": -0.5847660303115845, "logps/chosen": -70.59028625488281, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -155.63986206054688, "loss": 0.4209, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15195363759994507, "margin_dpo/beta_margin_grad_std": 0.20993934571743011, "margin_dpo/beta_margin_mean": 3.25651216506958, "margin_dpo/beta_margin_std": 2.5778074264526367, "margin_dpo/loss_margin_mean": 32.565120697021484, "margin_dpo/margin_mean": 32.565120697021484, "margin_dpo/margin_std": 25.7642879486084, "step": 460 }, { "epoch": 0.6769456681350955, "grad_norm": 59.93415069580078, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.5920594930648804, "logits/rejected": -0.5768572688102722, "logps/chosen": -79.71414184570312, "logps/ref_chosen": -55.18195343017578, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -138.8244171142578, "loss": 0.5473, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18966011703014374, "margin_dpo/beta_margin_grad_std": 0.22385801374912262, "margin_dpo/beta_margin_mean": 2.7815327644348145, "margin_dpo/beta_margin_std": 2.8518621921539307, "margin_dpo/loss_margin_mean": 27.815326690673828, "margin_dpo/margin_mean": 27.815326690673828, "margin_dpo/margin_std": 27.13003921508789, "step": 461 }, { "epoch": 0.6784140969162996, "grad_norm": 64.90670776367188, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.6181149482727051, "logits/rejected": -0.5787901878356934, "logps/chosen": -93.2876968383789, "logps/ref_chosen": -69.92803955078125, "logps/ref_rejected": -78.84111785888672, "logps/rejected": -129.06378173828125, "loss": 0.554, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1927950084209442, "margin_dpo/beta_margin_grad_std": 0.22063319385051727, "margin_dpo/beta_margin_mean": 2.6862998008728027, "margin_dpo/beta_margin_std": 2.5792369842529297, "margin_dpo/loss_margin_mean": 26.86299705505371, "margin_dpo/margin_mean": 26.862995147705078, "margin_dpo/margin_std": 25.538467407226562, "step": 462 }, { "epoch": 0.6798825256975036, "grad_norm": 50.19252014160156, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.6169182062149048, "logits/rejected": -0.5887913703918457, "logps/chosen": -75.74092864990234, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -143.4207763671875, "loss": 0.3546, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12193028628826141, "margin_dpo/beta_margin_grad_std": 0.19042545557022095, "margin_dpo/beta_margin_mean": 3.392923593521118, "margin_dpo/beta_margin_std": 2.4047834873199463, "margin_dpo/loss_margin_mean": 33.929237365722656, "margin_dpo/margin_mean": 33.929237365722656, "margin_dpo/margin_std": 23.769535064697266, "step": 463 }, { "epoch": 0.6813509544787077, "grad_norm": 54.16157531738281, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.6251201629638672, "logits/rejected": -0.6308864951133728, "logps/chosen": -73.30712890625, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -159.47793579101562, "loss": 0.4531, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14893580973148346, "margin_dpo/beta_margin_grad_std": 0.20893022418022156, "margin_dpo/beta_margin_mean": 3.4593725204467773, "margin_dpo/beta_margin_std": 2.9905498027801514, "margin_dpo/loss_margin_mean": 34.593727111816406, "margin_dpo/margin_mean": 34.593727111816406, "margin_dpo/margin_std": 29.88116455078125, "step": 464 }, { "epoch": 0.6828193832599119, "grad_norm": 50.176815032958984, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.6043756008148193, "logits/rejected": -0.5841087102890015, "logps/chosen": -81.1992416381836, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -172.650634765625, "loss": 0.2808, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10672765225172043, "margin_dpo/beta_margin_grad_std": 0.16949497163295746, "margin_dpo/beta_margin_mean": 3.7622225284576416, "margin_dpo/beta_margin_std": 2.543063163757324, "margin_dpo/loss_margin_mean": 37.622222900390625, "margin_dpo/margin_mean": 37.622222900390625, "margin_dpo/margin_std": 25.363601684570312, "step": 465 }, { "epoch": 0.684287812041116, "grad_norm": 57.16488265991211, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.6066223382949829, "logits/rejected": -0.5759164094924927, "logps/chosen": -75.47810363769531, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -146.87469482421875, "loss": 0.3827, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12489843368530273, "margin_dpo/beta_margin_grad_std": 0.20490986108779907, "margin_dpo/beta_margin_mean": 3.4002089500427246, "margin_dpo/beta_margin_std": 2.3887243270874023, "margin_dpo/loss_margin_mean": 34.00209045410156, "margin_dpo/margin_mean": 34.00209045410156, "margin_dpo/margin_std": 23.383773803710938, "step": 466 }, { "epoch": 0.6857562408223201, "grad_norm": 46.38023376464844, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.6104651689529419, "logits/rejected": -0.5786043405532837, "logps/chosen": -101.92547607421875, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -160.5396270751953, "loss": 0.3729, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1380797028541565, "margin_dpo/beta_margin_grad_std": 0.1899155229330063, "margin_dpo/beta_margin_mean": 3.4469778537750244, "margin_dpo/beta_margin_std": 2.8059871196746826, "margin_dpo/loss_margin_mean": 34.46977615356445, "margin_dpo/margin_mean": 34.46977996826172, "margin_dpo/margin_std": 27.862186431884766, "step": 467 }, { "epoch": 0.6872246696035242, "grad_norm": 55.537410736083984, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.6192047595977783, "logits/rejected": -0.5922250747680664, "logps/chosen": -89.64823913574219, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02881622314453, "logps/rejected": -152.7930450439453, "loss": 0.3886, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1464032083749771, "margin_dpo/beta_margin_grad_std": 0.18942488729953766, "margin_dpo/beta_margin_mean": 3.2051353454589844, "margin_dpo/beta_margin_std": 2.6790554523468018, "margin_dpo/loss_margin_mean": 32.051353454589844, "margin_dpo/margin_mean": 32.051353454589844, "margin_dpo/margin_std": 26.76758575439453, "step": 468 }, { "epoch": 0.6886930983847284, "grad_norm": 57.15333938598633, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.6003662347793579, "logits/rejected": -0.5658551454544067, "logps/chosen": -91.19867706298828, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -161.0380401611328, "loss": 0.4299, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14955471456050873, "margin_dpo/beta_margin_grad_std": 0.21639080345630646, "margin_dpo/beta_margin_mean": 3.317765474319458, "margin_dpo/beta_margin_std": 2.7324230670928955, "margin_dpo/loss_margin_mean": 33.17765426635742, "margin_dpo/margin_mean": 33.17765426635742, "margin_dpo/margin_std": 27.252918243408203, "step": 469 }, { "epoch": 0.6901615271659325, "grad_norm": 41.5388298034668, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.5871062278747559, "logits/rejected": -0.5616201162338257, "logps/chosen": -64.00105285644531, "logps/ref_chosen": -43.79193115234375, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -141.2603759765625, "loss": 0.2368, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.09441064298152924, "margin_dpo/beta_margin_grad_std": 0.15031108260154724, "margin_dpo/beta_margin_mean": 3.8348402976989746, "margin_dpo/beta_margin_std": 2.4011597633361816, "margin_dpo/loss_margin_mean": 38.3484001159668, "margin_dpo/margin_mean": 38.34840393066406, "margin_dpo/margin_std": 23.939483642578125, "step": 470 }, { "epoch": 0.6916299559471366, "grad_norm": 54.3143310546875, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.5919795036315918, "logits/rejected": -0.5648236870765686, "logps/chosen": -87.58413696289062, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -139.228271484375, "loss": 0.4208, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1538044661283493, "margin_dpo/beta_margin_grad_std": 0.2021295428276062, "margin_dpo/beta_margin_mean": 3.1373167037963867, "margin_dpo/beta_margin_std": 2.457742929458618, "margin_dpo/loss_margin_mean": 31.373167037963867, "margin_dpo/margin_mean": 31.3731689453125, "margin_dpo/margin_std": 24.42245101928711, "step": 471 }, { "epoch": 0.6930983847283406, "grad_norm": 50.913185119628906, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.6416307687759399, "logits/rejected": -0.5852631330490112, "logps/chosen": -104.90176391601562, "logps/ref_chosen": -83.66609954833984, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -178.68946838378906, "loss": 0.3109, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11783776432275772, "margin_dpo/beta_margin_grad_std": 0.1787194162607193, "margin_dpo/beta_margin_mean": 4.02446174621582, "margin_dpo/beta_margin_std": 3.3216898441314697, "margin_dpo/loss_margin_mean": 40.2446174621582, "margin_dpo/margin_mean": 40.24461364746094, "margin_dpo/margin_std": 33.13086700439453, "step": 472 }, { "epoch": 0.6945668135095447, "grad_norm": 78.06228637695312, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -0.5605521202087402, "logits/rejected": -0.5147773623466492, "logps/chosen": -87.47222900390625, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -133.27700805664062, "loss": 0.4899, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17109636962413788, "margin_dpo/beta_margin_grad_std": 0.22186963260173798, "margin_dpo/beta_margin_mean": 2.8155159950256348, "margin_dpo/beta_margin_std": 2.263782501220703, "margin_dpo/loss_margin_mean": 28.15515899658203, "margin_dpo/margin_mean": 28.15515899658203, "margin_dpo/margin_std": 22.536598205566406, "step": 473 }, { "epoch": 0.6960352422907489, "grad_norm": 60.656185150146484, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.5783928632736206, "logits/rejected": -0.5659887790679932, "logps/chosen": -76.55195617675781, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -144.70611572265625, "loss": 0.4737, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16651608049869537, "margin_dpo/beta_margin_grad_std": 0.22569791972637177, "margin_dpo/beta_margin_mean": 3.0685691833496094, "margin_dpo/beta_margin_std": 2.5612540245056152, "margin_dpo/loss_margin_mean": 30.68568992614746, "margin_dpo/margin_mean": 30.685691833496094, "margin_dpo/margin_std": 25.531770706176758, "step": 474 }, { "epoch": 0.697503671071953, "grad_norm": 43.59295654296875, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.62577223777771, "logits/rejected": -0.6225380897521973, "logps/chosen": -63.40654754638672, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06295013427734, "logps/rejected": -145.40371704101562, "loss": 0.3844, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1439659297466278, "margin_dpo/beta_margin_grad_std": 0.19135436415672302, "margin_dpo/beta_margin_mean": 3.442941188812256, "margin_dpo/beta_margin_std": 3.0452959537506104, "margin_dpo/loss_margin_mean": 34.429412841796875, "margin_dpo/margin_mean": 34.429412841796875, "margin_dpo/margin_std": 30.290939331054688, "step": 475 }, { "epoch": 0.6989720998531571, "grad_norm": 69.43315124511719, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.5941322445869446, "logits/rejected": -0.5612877607345581, "logps/chosen": -64.73588562011719, "logps/ref_chosen": -42.949378967285156, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -125.85054016113281, "loss": 0.5114, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17846056818962097, "margin_dpo/beta_margin_grad_std": 0.22891968488693237, "margin_dpo/beta_margin_mean": 3.0353803634643555, "margin_dpo/beta_margin_std": 2.6748228073120117, "margin_dpo/loss_margin_mean": 30.353801727294922, "margin_dpo/margin_mean": 30.353801727294922, "margin_dpo/margin_std": 26.741519927978516, "step": 476 }, { "epoch": 0.7004405286343612, "grad_norm": 81.28004455566406, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.6341814994812012, "logits/rejected": -0.5662086009979248, "logps/chosen": -95.98014831542969, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -133.20254516601562, "loss": 0.6021, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15644104778766632, "margin_dpo/beta_margin_grad_std": 0.24381397664546967, "margin_dpo/beta_margin_mean": 3.185762882232666, "margin_dpo/beta_margin_std": 2.7865076065063477, "margin_dpo/loss_margin_mean": 31.857629776000977, "margin_dpo/margin_mean": 31.857627868652344, "margin_dpo/margin_std": 27.68490982055664, "step": 477 }, { "epoch": 0.7019089574155654, "grad_norm": 48.535404205322266, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.5645046234130859, "logits/rejected": -0.5495598316192627, "logps/chosen": -61.48149871826172, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -139.90025329589844, "loss": 0.4001, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14866864681243896, "margin_dpo/beta_margin_grad_std": 0.18979746103286743, "margin_dpo/beta_margin_mean": 3.4497292041778564, "margin_dpo/beta_margin_std": 2.9467873573303223, "margin_dpo/loss_margin_mean": 34.497291564941406, "margin_dpo/margin_mean": 34.497291564941406, "margin_dpo/margin_std": 29.08106231689453, "step": 478 }, { "epoch": 0.7033773861967695, "grad_norm": 56.268714904785156, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.558883786201477, "logits/rejected": -0.5508110523223877, "logps/chosen": -79.05259704589844, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -150.62954711914062, "loss": 0.4427, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15836496651172638, "margin_dpo/beta_margin_grad_std": 0.21574333310127258, "margin_dpo/beta_margin_mean": 3.036853313446045, "margin_dpo/beta_margin_std": 2.2986533641815186, "margin_dpo/loss_margin_mean": 30.368532180786133, "margin_dpo/margin_mean": 30.368532180786133, "margin_dpo/margin_std": 22.058135986328125, "step": 479 }, { "epoch": 0.7048458149779736, "grad_norm": 73.03453826904297, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.5816048979759216, "logits/rejected": -0.523268461227417, "logps/chosen": -84.984619140625, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -145.50759887695312, "loss": 0.5051, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15481433272361755, "margin_dpo/beta_margin_grad_std": 0.23069554567337036, "margin_dpo/beta_margin_mean": 3.627319097518921, "margin_dpo/beta_margin_std": 3.305205821990967, "margin_dpo/loss_margin_mean": 36.273189544677734, "margin_dpo/margin_mean": 36.273189544677734, "margin_dpo/margin_std": 31.943330764770508, "step": 480 }, { "epoch": 0.7063142437591777, "grad_norm": 53.513370513916016, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.6167398691177368, "logits/rejected": -0.5804057121276855, "logps/chosen": -79.90770721435547, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -152.1048583984375, "loss": 0.4399, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14174267649650574, "margin_dpo/beta_margin_grad_std": 0.2210419625043869, "margin_dpo/beta_margin_mean": 3.4740993976593018, "margin_dpo/beta_margin_std": 2.713843822479248, "margin_dpo/loss_margin_mean": 34.74099349975586, "margin_dpo/margin_mean": 34.74099349975586, "margin_dpo/margin_std": 26.750259399414062, "step": 481 }, { "epoch": 0.7077826725403817, "grad_norm": 54.49075698852539, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.5958288908004761, "logits/rejected": -0.5650321841239929, "logps/chosen": -89.89315795898438, "logps/ref_chosen": -67.55347442626953, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -140.00283813476562, "loss": 0.4309, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1658599078655243, "margin_dpo/beta_margin_grad_std": 0.1872914731502533, "margin_dpo/beta_margin_mean": 3.0073630809783936, "margin_dpo/beta_margin_std": 2.6090502738952637, "margin_dpo/loss_margin_mean": 30.073631286621094, "margin_dpo/margin_mean": 30.073631286621094, "margin_dpo/margin_std": 25.875329971313477, "step": 482 }, { "epoch": 0.7092511013215859, "grad_norm": 65.8294677734375, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.6282751560211182, "logits/rejected": -0.5697331428527832, "logps/chosen": -88.70866394042969, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -156.66552734375, "loss": 0.3968, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13324548304080963, "margin_dpo/beta_margin_grad_std": 0.21172069013118744, "margin_dpo/beta_margin_mean": 3.5499703884124756, "margin_dpo/beta_margin_std": 2.7894442081451416, "margin_dpo/loss_margin_mean": 35.49970245361328, "margin_dpo/margin_mean": 35.49970245361328, "margin_dpo/margin_std": 27.264251708984375, "step": 483 }, { "epoch": 0.71071953010279, "grad_norm": 55.688453674316406, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.6670191287994385, "logits/rejected": -0.6306544542312622, "logps/chosen": -80.87345886230469, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -141.7012939453125, "loss": 0.3678, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1434181034564972, "margin_dpo/beta_margin_grad_std": 0.1813618689775467, "margin_dpo/beta_margin_mean": 3.559450626373291, "margin_dpo/beta_margin_std": 3.0208792686462402, "margin_dpo/loss_margin_mean": 35.594505310058594, "margin_dpo/margin_mean": 35.594505310058594, "margin_dpo/margin_std": 29.648942947387695, "step": 484 }, { "epoch": 0.7121879588839941, "grad_norm": 52.00883483886719, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.5754466652870178, "logits/rejected": -0.5443192720413208, "logps/chosen": -77.70271301269531, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408996582031, "logps/rejected": -162.49534606933594, "loss": 0.4199, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14156897366046906, "margin_dpo/beta_margin_grad_std": 0.2183229923248291, "margin_dpo/beta_margin_mean": 4.0278120040893555, "margin_dpo/beta_margin_std": 3.2175581455230713, "margin_dpo/loss_margin_mean": 40.27811813354492, "margin_dpo/margin_mean": 40.278114318847656, "margin_dpo/margin_std": 32.148040771484375, "step": 485 }, { "epoch": 0.7136563876651982, "grad_norm": 66.21233367919922, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.6356394290924072, "logits/rejected": -0.5828511714935303, "logps/chosen": -77.98770904541016, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -153.8128204345703, "loss": 0.424, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14394216239452362, "margin_dpo/beta_margin_grad_std": 0.21945635974407196, "margin_dpo/beta_margin_mean": 3.8945412635803223, "margin_dpo/beta_margin_std": 3.0784053802490234, "margin_dpo/loss_margin_mean": 38.945411682128906, "margin_dpo/margin_mean": 38.945411682128906, "margin_dpo/margin_std": 30.78309440612793, "step": 486 }, { "epoch": 0.7151248164464024, "grad_norm": 49.65977096557617, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -0.6391937732696533, "logits/rejected": -0.604444682598114, "logps/chosen": -75.19181060791016, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.4188232421875, "logps/rejected": -151.7467041015625, "loss": 0.3256, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12563364207744598, "margin_dpo/beta_margin_grad_std": 0.17673608660697937, "margin_dpo/beta_margin_mean": 3.7199385166168213, "margin_dpo/beta_margin_std": 2.6885433197021484, "margin_dpo/loss_margin_mean": 37.19938278198242, "margin_dpo/margin_mean": 37.19938278198242, "margin_dpo/margin_std": 26.274166107177734, "step": 487 }, { "epoch": 0.7165932452276065, "grad_norm": 32.75376510620117, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.5953603386878967, "logits/rejected": -0.563835859298706, "logps/chosen": -72.79434204101562, "logps/ref_chosen": -52.228153228759766, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -136.65927124023438, "loss": 0.2845, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11705981194972992, "margin_dpo/beta_margin_grad_std": 0.1456899493932724, "margin_dpo/beta_margin_mean": 3.208653450012207, "margin_dpo/beta_margin_std": 2.13510799407959, "margin_dpo/loss_margin_mean": 32.08653259277344, "margin_dpo/margin_mean": 32.08653259277344, "margin_dpo/margin_std": 21.324390411376953, "step": 488 }, { "epoch": 0.7180616740088106, "grad_norm": 55.29383850097656, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.5945910215377808, "logits/rejected": -0.5459895730018616, "logps/chosen": -77.7701416015625, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39813232421875, "logps/rejected": -132.9573516845703, "loss": 0.4441, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1625826209783554, "margin_dpo/beta_margin_grad_std": 0.21051008999347687, "margin_dpo/beta_margin_mean": 3.1778712272644043, "margin_dpo/beta_margin_std": 2.766324520111084, "margin_dpo/loss_margin_mean": 31.778711318969727, "margin_dpo/margin_mean": 31.778709411621094, "margin_dpo/margin_std": 27.465885162353516, "step": 489 }, { "epoch": 0.7195301027900147, "grad_norm": 67.01080322265625, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.613182783126831, "logits/rejected": -0.6027116775512695, "logps/chosen": -72.7847900390625, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.40904998779297, "logps/rejected": -162.11245727539062, "loss": 0.573, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.186043843626976, "margin_dpo/beta_margin_grad_std": 0.23473787307739258, "margin_dpo/beta_margin_mean": 3.1285009384155273, "margin_dpo/beta_margin_std": 2.9745311737060547, "margin_dpo/loss_margin_mean": 31.28500747680664, "margin_dpo/margin_mean": 31.28500747680664, "margin_dpo/margin_std": 29.551124572753906, "step": 490 }, { "epoch": 0.7209985315712188, "grad_norm": 71.59069061279297, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.5881800651550293, "logits/rejected": -0.5435885190963745, "logps/chosen": -82.98500061035156, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -124.22119140625, "loss": 0.5801, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1894461065530777, "margin_dpo/beta_margin_grad_std": 0.22930499911308289, "margin_dpo/beta_margin_mean": 2.807966709136963, "margin_dpo/beta_margin_std": 2.785522699356079, "margin_dpo/loss_margin_mean": 28.079666137695312, "margin_dpo/margin_mean": 28.079666137695312, "margin_dpo/margin_std": 27.83734893798828, "step": 491 }, { "epoch": 0.7224669603524229, "grad_norm": 109.9788589477539, "learning_rate": 1.097764975115576e-07, "logits/chosen": -0.6198358535766602, "logits/rejected": -0.5758175849914551, "logps/chosen": -77.27084350585938, "logps/ref_chosen": -53.99418258666992, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -122.03599548339844, "loss": 0.9317, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2348533272743225, "margin_dpo/beta_margin_grad_std": 0.31157541275024414, "margin_dpo/beta_margin_mean": 2.609971046447754, "margin_dpo/beta_margin_std": 3.011613368988037, "margin_dpo/loss_margin_mean": 26.099708557128906, "margin_dpo/margin_mean": 26.099708557128906, "margin_dpo/margin_std": 29.874317169189453, "step": 492 }, { "epoch": 0.723935389133627, "grad_norm": 69.1717529296875, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -0.6741948127746582, "logits/rejected": -0.6164962649345398, "logps/chosen": -95.62208557128906, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -140.3636016845703, "loss": 0.4661, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15023410320281982, "margin_dpo/beta_margin_grad_std": 0.22642172873020172, "margin_dpo/beta_margin_mean": 3.2915735244750977, "margin_dpo/beta_margin_std": 2.6520345211029053, "margin_dpo/loss_margin_mean": 32.91573715209961, "margin_dpo/margin_mean": 32.91573715209961, "margin_dpo/margin_std": 26.319190979003906, "step": 493 }, { "epoch": 0.7254038179148311, "grad_norm": 75.6004867553711, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.5462692379951477, "logits/rejected": -0.5368998050689697, "logps/chosen": -63.30023956298828, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -143.49691772460938, "loss": 0.5108, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1606236845254898, "margin_dpo/beta_margin_grad_std": 0.23982644081115723, "margin_dpo/beta_margin_mean": 3.546459197998047, "margin_dpo/beta_margin_std": 3.0083138942718506, "margin_dpo/loss_margin_mean": 35.46459197998047, "margin_dpo/margin_mean": 35.46459197998047, "margin_dpo/margin_std": 29.650789260864258, "step": 494 }, { "epoch": 0.7268722466960352, "grad_norm": 67.52748107910156, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -0.6319386959075928, "logits/rejected": -0.5911184549331665, "logps/chosen": -84.92739868164062, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -145.19595336914062, "loss": 0.5177, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15922006964683533, "margin_dpo/beta_margin_grad_std": 0.22867868840694427, "margin_dpo/beta_margin_mean": 3.2379212379455566, "margin_dpo/beta_margin_std": 2.7765204906463623, "margin_dpo/loss_margin_mean": 32.37921142578125, "margin_dpo/margin_mean": 32.37921142578125, "margin_dpo/margin_std": 27.550233840942383, "step": 495 }, { "epoch": 0.7283406754772394, "grad_norm": 64.98089599609375, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.6621850728988647, "logits/rejected": -0.6073780655860901, "logps/chosen": -96.5443115234375, "logps/ref_chosen": -72.59192657470703, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -136.9782257080078, "loss": 0.5292, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17409659922122955, "margin_dpo/beta_margin_grad_std": 0.22257588803768158, "margin_dpo/beta_margin_mean": 2.869650363922119, "margin_dpo/beta_margin_std": 2.5869693756103516, "margin_dpo/loss_margin_mean": 28.696502685546875, "margin_dpo/margin_mean": 28.696502685546875, "margin_dpo/margin_std": 25.838706970214844, "step": 496 }, { "epoch": 0.7298091042584435, "grad_norm": 77.44481658935547, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.6251211166381836, "logits/rejected": -0.5778101682662964, "logps/chosen": -83.82826232910156, "logps/ref_chosen": -58.593971252441406, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -129.9313201904297, "loss": 0.613, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20568180084228516, "margin_dpo/beta_margin_grad_std": 0.23945844173431396, "margin_dpo/beta_margin_mean": 2.8408656120300293, "margin_dpo/beta_margin_std": 2.808185577392578, "margin_dpo/loss_margin_mean": 28.408655166625977, "margin_dpo/margin_mean": 28.408655166625977, "margin_dpo/margin_std": 27.508472442626953, "step": 497 }, { "epoch": 0.7312775330396476, "grad_norm": 67.77000427246094, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.6193152666091919, "logits/rejected": -0.5879042148590088, "logps/chosen": -95.3200912475586, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -139.05723571777344, "loss": 0.5312, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17250090837478638, "margin_dpo/beta_margin_grad_std": 0.23728637397289276, "margin_dpo/beta_margin_mean": 3.098475933074951, "margin_dpo/beta_margin_std": 2.798676013946533, "margin_dpo/loss_margin_mean": 30.984760284423828, "margin_dpo/margin_mean": 30.984760284423828, "margin_dpo/margin_std": 27.886219024658203, "step": 498 }, { "epoch": 0.7327459618208517, "grad_norm": 82.45706939697266, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -0.6282952427864075, "logits/rejected": -0.6203751564025879, "logps/chosen": -74.16600036621094, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -155.54342651367188, "loss": 0.6873, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.179952934384346, "margin_dpo/beta_margin_grad_std": 0.2602365016937256, "margin_dpo/beta_margin_mean": 3.155392646789551, "margin_dpo/beta_margin_std": 3.0621047019958496, "margin_dpo/loss_margin_mean": 31.553926467895508, "margin_dpo/margin_mean": 31.553926467895508, "margin_dpo/margin_std": 30.37271499633789, "step": 499 }, { "epoch": 0.7342143906020558, "grad_norm": 45.40144729614258, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.6029895544052124, "logits/rejected": -0.5873157382011414, "logps/chosen": -82.48162841796875, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -153.21792602539062, "loss": 0.3799, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13710999488830566, "margin_dpo/beta_margin_grad_std": 0.19010357558727264, "margin_dpo/beta_margin_mean": 3.3829522132873535, "margin_dpo/beta_margin_std": 2.9376726150512695, "margin_dpo/loss_margin_mean": 33.82952117919922, "margin_dpo/margin_mean": 33.82952117919922, "margin_dpo/margin_std": 29.12575340270996, "step": 500 }, { "epoch": 0.7342143906020558, "eval_logits/chosen": -0.6185933947563171, "eval_logits/rejected": -0.5921938419342041, "eval_logps/chosen": -106.39403533935547, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -135.7742156982422, "eval_loss": 0.41000303626060486, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.25857067108154297, "eval_margin_dpo/beta_margin_grad_std": 0.2553975284099579, "eval_margin_dpo/beta_margin_mean": 2.1633284091949463, "eval_margin_dpo/beta_margin_std": 2.637815237045288, "eval_margin_dpo/loss_margin_mean": 21.633283615112305, "eval_margin_dpo/margin_mean": 21.63328742980957, "eval_margin_dpo/margin_std": 26.378154754638672, "eval_runtime": 40.1427, "eval_samples_per_second": 58.267, "eval_steps_per_second": 1.843, "step": 500 }, { "epoch": 0.73568281938326, "grad_norm": 53.82852554321289, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.6225707530975342, "logits/rejected": -0.581325888633728, "logps/chosen": -76.03878784179688, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670776367188, "logps/rejected": -139.12896728515625, "loss": 0.3404, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12611275911331177, "margin_dpo/beta_margin_grad_std": 0.1843045949935913, "margin_dpo/beta_margin_mean": 3.729299545288086, "margin_dpo/beta_margin_std": 2.9057199954986572, "margin_dpo/loss_margin_mean": 37.29299545288086, "margin_dpo/margin_mean": 37.292991638183594, "margin_dpo/margin_std": 28.878681182861328, "step": 501 }, { "epoch": 0.737151248164464, "grad_norm": 45.499080657958984, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.6202658414840698, "logits/rejected": -0.5605558156967163, "logps/chosen": -90.09158325195312, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -139.19085693359375, "loss": 0.3223, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11901772767305374, "margin_dpo/beta_margin_grad_std": 0.18556702136993408, "margin_dpo/beta_margin_mean": 3.375103235244751, "margin_dpo/beta_margin_std": 2.1826813220977783, "margin_dpo/loss_margin_mean": 33.75102996826172, "margin_dpo/margin_mean": 33.75102996826172, "margin_dpo/margin_std": 20.78955078125, "step": 502 }, { "epoch": 0.7386196769456681, "grad_norm": 51.559326171875, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.572176456451416, "logits/rejected": -0.5319969654083252, "logps/chosen": -75.5657958984375, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -137.12274169921875, "loss": 0.3974, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13606566190719604, "margin_dpo/beta_margin_grad_std": 0.21487735211849213, "margin_dpo/beta_margin_mean": 3.4129903316497803, "margin_dpo/beta_margin_std": 2.352537155151367, "margin_dpo/loss_margin_mean": 34.12990188598633, "margin_dpo/margin_mean": 34.12990188598633, "margin_dpo/margin_std": 23.329858779907227, "step": 503 }, { "epoch": 0.7400881057268722, "grad_norm": 33.44047546386719, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.6334669589996338, "logits/rejected": -0.614444375038147, "logps/chosen": -79.06010437011719, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76213073730469, "logps/rejected": -162.8424072265625, "loss": 0.2414, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.09301727265119553, "margin_dpo/beta_margin_grad_std": 0.15588341653347015, "margin_dpo/beta_margin_mean": 3.998591899871826, "margin_dpo/beta_margin_std": 2.6279215812683105, "margin_dpo/loss_margin_mean": 39.98591995239258, "margin_dpo/margin_mean": 39.98591995239258, "margin_dpo/margin_std": 25.813270568847656, "step": 504 }, { "epoch": 0.7415565345080763, "grad_norm": 81.39633178710938, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.6545782089233398, "logits/rejected": -0.6030818819999695, "logps/chosen": -101.48240661621094, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -142.19000244140625, "loss": 0.6802, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20703783631324768, "margin_dpo/beta_margin_grad_std": 0.2496814727783203, "margin_dpo/beta_margin_mean": 2.8277130126953125, "margin_dpo/beta_margin_std": 2.957451343536377, "margin_dpo/loss_margin_mean": 28.277130126953125, "margin_dpo/margin_mean": 28.277130126953125, "margin_dpo/margin_std": 29.50797462463379, "step": 505 }, { "epoch": 0.7430249632892805, "grad_norm": 67.16016387939453, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.6147867441177368, "logits/rejected": -0.5790848731994629, "logps/chosen": -70.76807403564453, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -134.20616149902344, "loss": 0.4462, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1522112339735031, "margin_dpo/beta_margin_grad_std": 0.21956755220890045, "margin_dpo/beta_margin_mean": 3.3983535766601562, "margin_dpo/beta_margin_std": 2.9518353939056396, "margin_dpo/loss_margin_mean": 33.98353576660156, "margin_dpo/margin_mean": 33.98353576660156, "margin_dpo/margin_std": 28.88640594482422, "step": 506 }, { "epoch": 0.7444933920704846, "grad_norm": 52.902652740478516, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.6434965133666992, "logits/rejected": -0.6062471866607666, "logps/chosen": -90.41842651367188, "logps/ref_chosen": -66.80150604248047, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -150.97418212890625, "loss": 0.3443, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1344669610261917, "margin_dpo/beta_margin_grad_std": 0.1805417388677597, "margin_dpo/beta_margin_mean": 3.198436975479126, "margin_dpo/beta_margin_std": 2.2677083015441895, "margin_dpo/loss_margin_mean": 31.9843692779541, "margin_dpo/margin_mean": 31.984371185302734, "margin_dpo/margin_std": 22.465970993041992, "step": 507 }, { "epoch": 0.7459618208516887, "grad_norm": 65.08528900146484, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.6305240392684937, "logits/rejected": -0.5942909717559814, "logps/chosen": -92.9295883178711, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -149.353759765625, "loss": 0.5068, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1743934601545334, "margin_dpo/beta_margin_grad_std": 0.22552472352981567, "margin_dpo/beta_margin_mean": 3.2100088596343994, "margin_dpo/beta_margin_std": 2.8839914798736572, "margin_dpo/loss_margin_mean": 32.1000862121582, "margin_dpo/margin_mean": 32.1000862121582, "margin_dpo/margin_std": 28.509567260742188, "step": 508 }, { "epoch": 0.7474302496328928, "grad_norm": 70.77447509765625, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.6270876526832581, "logits/rejected": -0.5867846012115479, "logps/chosen": -85.74343872070312, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -137.96099853515625, "loss": 0.4849, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14794375002384186, "margin_dpo/beta_margin_grad_std": 0.23274949193000793, "margin_dpo/beta_margin_mean": 3.2780051231384277, "margin_dpo/beta_margin_std": 2.602834463119507, "margin_dpo/loss_margin_mean": 32.780052185058594, "margin_dpo/margin_mean": 32.780052185058594, "margin_dpo/margin_std": 25.765064239501953, "step": 509 }, { "epoch": 0.748898678414097, "grad_norm": 56.0135498046875, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.5831998586654663, "logits/rejected": -0.5671366453170776, "logps/chosen": -78.48440551757812, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -169.5789031982422, "loss": 0.4031, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14359986782073975, "margin_dpo/beta_margin_grad_std": 0.21300581097602844, "margin_dpo/beta_margin_mean": 4.020778656005859, "margin_dpo/beta_margin_std": 3.2402069568634033, "margin_dpo/loss_margin_mean": 40.207786560058594, "margin_dpo/margin_mean": 40.207786560058594, "margin_dpo/margin_std": 32.260772705078125, "step": 510 }, { "epoch": 0.750367107195301, "grad_norm": 55.32306671142578, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.6215203404426575, "logits/rejected": -0.586913526058197, "logps/chosen": -82.04804229736328, "logps/ref_chosen": -60.23811340332031, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -150.72622680664062, "loss": 0.4605, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1583447903394699, "margin_dpo/beta_margin_grad_std": 0.22799551486968994, "margin_dpo/beta_margin_mean": 3.605954170227051, "margin_dpo/beta_margin_std": 2.9902870655059814, "margin_dpo/loss_margin_mean": 36.05954360961914, "margin_dpo/margin_mean": 36.059539794921875, "margin_dpo/margin_std": 29.7518310546875, "step": 511 }, { "epoch": 0.7518355359765051, "grad_norm": 43.83152389526367, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.6087368726730347, "logits/rejected": -0.5697593092918396, "logps/chosen": -79.78984069824219, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -142.0902099609375, "loss": 0.3361, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13248665630817413, "margin_dpo/beta_margin_grad_std": 0.1739508956670761, "margin_dpo/beta_margin_mean": 3.5329983234405518, "margin_dpo/beta_margin_std": 2.7861456871032715, "margin_dpo/loss_margin_mean": 35.32998275756836, "margin_dpo/margin_mean": 35.32998275756836, "margin_dpo/margin_std": 27.727153778076172, "step": 512 }, { "epoch": 0.7533039647577092, "grad_norm": 73.83541107177734, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.6487611532211304, "logits/rejected": -0.5978541374206543, "logps/chosen": -89.40873718261719, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -130.85317993164062, "loss": 0.5745, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16774950921535492, "margin_dpo/beta_margin_grad_std": 0.23628519475460052, "margin_dpo/beta_margin_mean": 3.029844284057617, "margin_dpo/beta_margin_std": 2.635484218597412, "margin_dpo/loss_margin_mean": 30.29844093322754, "margin_dpo/margin_mean": 30.298439025878906, "margin_dpo/margin_std": 25.809885025024414, "step": 513 }, { "epoch": 0.7547723935389133, "grad_norm": 73.3758544921875, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.6052289605140686, "logits/rejected": -0.5561543703079224, "logps/chosen": -99.59014892578125, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -135.22900390625, "loss": 0.6952, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21260841190814972, "margin_dpo/beta_margin_grad_std": 0.2700116038322449, "margin_dpo/beta_margin_mean": 3.092288017272949, "margin_dpo/beta_margin_std": 3.3895537853240967, "margin_dpo/loss_margin_mean": 30.92287826538086, "margin_dpo/margin_mean": 30.922880172729492, "margin_dpo/margin_std": 33.82048797607422, "step": 514 }, { "epoch": 0.7562408223201175, "grad_norm": 50.55330276489258, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.6106176376342773, "logits/rejected": -0.5937142372131348, "logps/chosen": -72.84141540527344, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -143.20123291015625, "loss": 0.3735, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14427538216114044, "margin_dpo/beta_margin_grad_std": 0.179901584982872, "margin_dpo/beta_margin_mean": 3.366363525390625, "margin_dpo/beta_margin_std": 2.7529520988464355, "margin_dpo/loss_margin_mean": 33.66363525390625, "margin_dpo/margin_mean": 33.66363525390625, "margin_dpo/margin_std": 27.46428680419922, "step": 515 }, { "epoch": 0.7577092511013216, "grad_norm": 51.07502365112305, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.633804202079773, "logits/rejected": -0.6136379837989807, "logps/chosen": -87.3009033203125, "logps/ref_chosen": -64.6495590209961, "logps/ref_rejected": -111.72238159179688, "logps/rejected": -169.82229614257812, "loss": 0.3698, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14012214541435242, "margin_dpo/beta_margin_grad_std": 0.18882179260253906, "margin_dpo/beta_margin_mean": 3.544856548309326, "margin_dpo/beta_margin_std": 2.8498873710632324, "margin_dpo/loss_margin_mean": 35.44856262207031, "margin_dpo/margin_mean": 35.44856262207031, "margin_dpo/margin_std": 28.429126739501953, "step": 516 }, { "epoch": 0.7591776798825257, "grad_norm": 48.961795806884766, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.6369151473045349, "logits/rejected": -0.5914682149887085, "logps/chosen": -86.39828491210938, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -149.49826049804688, "loss": 0.3858, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12788426876068115, "margin_dpo/beta_margin_grad_std": 0.20656827092170715, "margin_dpo/beta_margin_mean": 3.4930450916290283, "margin_dpo/beta_margin_std": 2.5577361583709717, "margin_dpo/loss_margin_mean": 34.930450439453125, "margin_dpo/margin_mean": 34.930450439453125, "margin_dpo/margin_std": 25.562118530273438, "step": 517 }, { "epoch": 0.7606461086637298, "grad_norm": 52.089088439941406, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.6326709985733032, "logits/rejected": -0.6170345544815063, "logps/chosen": -82.4386215209961, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -141.47119140625, "loss": 0.4549, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15643204748630524, "margin_dpo/beta_margin_grad_std": 0.2165152132511139, "margin_dpo/beta_margin_mean": 3.1175765991210938, "margin_dpo/beta_margin_std": 2.612419366836548, "margin_dpo/loss_margin_mean": 31.175765991210938, "margin_dpo/margin_mean": 31.175765991210938, "margin_dpo/margin_std": 26.030349731445312, "step": 518 }, { "epoch": 0.762114537444934, "grad_norm": 60.18330001831055, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.5879247784614563, "logits/rejected": -0.5525568723678589, "logps/chosen": -96.47016906738281, "logps/ref_chosen": -74.06330871582031, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -159.36849975585938, "loss": 0.4562, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16369768977165222, "margin_dpo/beta_margin_grad_std": 0.2030007243156433, "margin_dpo/beta_margin_mean": 3.25174617767334, "margin_dpo/beta_margin_std": 3.083312511444092, "margin_dpo/loss_margin_mean": 32.51746368408203, "margin_dpo/margin_mean": 32.51746368408203, "margin_dpo/margin_std": 30.369220733642578, "step": 519 }, { "epoch": 0.7635829662261381, "grad_norm": 51.291900634765625, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.5656751394271851, "logits/rejected": -0.5422060489654541, "logps/chosen": -93.58493041992188, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -156.14584350585938, "loss": 0.3683, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13689424097537994, "margin_dpo/beta_margin_grad_std": 0.186412513256073, "margin_dpo/beta_margin_mean": 3.2879397869110107, "margin_dpo/beta_margin_std": 2.570587635040283, "margin_dpo/loss_margin_mean": 32.879398345947266, "margin_dpo/margin_mean": 32.879398345947266, "margin_dpo/margin_std": 24.496992111206055, "step": 520 }, { "epoch": 0.7650513950073421, "grad_norm": 50.26503372192383, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.607467770576477, "logits/rejected": -0.5773638486862183, "logps/chosen": -80.21942138671875, "logps/ref_chosen": -58.14292907714844, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -137.37908935546875, "loss": 0.4181, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15461619198322296, "margin_dpo/beta_margin_grad_std": 0.19728721678256989, "margin_dpo/beta_margin_mean": 3.202197313308716, "margin_dpo/beta_margin_std": 2.655714988708496, "margin_dpo/loss_margin_mean": 32.02197265625, "margin_dpo/margin_mean": 32.02197265625, "margin_dpo/margin_std": 26.189250946044922, "step": 521 }, { "epoch": 0.7665198237885462, "grad_norm": 51.4885368347168, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.5788943767547607, "logits/rejected": -0.5471011996269226, "logps/chosen": -69.88147735595703, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -117.34169006347656, "loss": 0.4752, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17735202610492706, "margin_dpo/beta_margin_grad_std": 0.2005145400762558, "margin_dpo/beta_margin_mean": 2.7993991374969482, "margin_dpo/beta_margin_std": 2.631875514984131, "margin_dpo/loss_margin_mean": 27.99399185180664, "margin_dpo/margin_mean": 27.99399185180664, "margin_dpo/margin_std": 26.08908462524414, "step": 522 }, { "epoch": 0.7679882525697503, "grad_norm": 62.35993957519531, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.5938686728477478, "logits/rejected": -0.550100564956665, "logps/chosen": -85.62379455566406, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -139.5577850341797, "loss": 0.5449, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17923639714717865, "margin_dpo/beta_margin_grad_std": 0.2329106628894806, "margin_dpo/beta_margin_mean": 3.194218397140503, "margin_dpo/beta_margin_std": 2.890404224395752, "margin_dpo/loss_margin_mean": 31.942182540893555, "margin_dpo/margin_mean": 31.942180633544922, "margin_dpo/margin_std": 28.83761978149414, "step": 523 }, { "epoch": 0.7694566813509545, "grad_norm": 61.87083053588867, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.5877140760421753, "logits/rejected": -0.5509617328643799, "logps/chosen": -100.80895233154297, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -133.55508422851562, "loss": 0.5047, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16188479959964752, "margin_dpo/beta_margin_grad_std": 0.2156955450773239, "margin_dpo/beta_margin_mean": 2.951618194580078, "margin_dpo/beta_margin_std": 2.4761300086975098, "margin_dpo/loss_margin_mean": 29.51618194580078, "margin_dpo/margin_mean": 29.51618194580078, "margin_dpo/margin_std": 24.702909469604492, "step": 524 }, { "epoch": 0.7709251101321586, "grad_norm": 35.78192901611328, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.5947517156600952, "logits/rejected": -0.5842136144638062, "logps/chosen": -72.66329956054688, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05293273925781, "logps/rejected": -157.0978546142578, "loss": 0.2654, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1080591008067131, "margin_dpo/beta_margin_grad_std": 0.1474897265434265, "margin_dpo/beta_margin_mean": 3.5209484100341797, "margin_dpo/beta_margin_std": 2.3788013458251953, "margin_dpo/loss_margin_mean": 35.2094841003418, "margin_dpo/margin_mean": 35.2094841003418, "margin_dpo/margin_std": 23.671607971191406, "step": 525 }, { "epoch": 0.7723935389133627, "grad_norm": 69.24839782714844, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.6309370994567871, "logits/rejected": -0.5772014260292053, "logps/chosen": -88.0202407836914, "logps/ref_chosen": -63.167232513427734, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -145.69381713867188, "loss": 0.4258, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.144230917096138, "margin_dpo/beta_margin_grad_std": 0.21393723785877228, "margin_dpo/beta_margin_mean": 3.453146457672119, "margin_dpo/beta_margin_std": 2.792224168777466, "margin_dpo/loss_margin_mean": 34.531463623046875, "margin_dpo/margin_mean": 34.531463623046875, "margin_dpo/margin_std": 27.76288604736328, "step": 526 }, { "epoch": 0.7738619676945668, "grad_norm": 76.02661895751953, "learning_rate": 7.504749238082414e-08, "logits/chosen": -0.6780938506126404, "logits/rejected": -0.6229304671287537, "logps/chosen": -94.22843933105469, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -133.57666015625, "loss": 0.4342, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15442487597465515, "margin_dpo/beta_margin_grad_std": 0.20508261024951935, "margin_dpo/beta_margin_mean": 3.213435649871826, "margin_dpo/beta_margin_std": 2.8153111934661865, "margin_dpo/loss_margin_mean": 32.13435363769531, "margin_dpo/margin_mean": 32.13435363769531, "margin_dpo/margin_std": 28.03814697265625, "step": 527 }, { "epoch": 0.775330396475771, "grad_norm": 53.0721549987793, "learning_rate": 7.413308141366254e-08, "logits/chosen": -0.5941898226737976, "logits/rejected": -0.5680090188980103, "logps/chosen": -91.19302368164062, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -147.61412048339844, "loss": 0.4206, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1577254980802536, "margin_dpo/beta_margin_grad_std": 0.20022036135196686, "margin_dpo/beta_margin_mean": 3.0600483417510986, "margin_dpo/beta_margin_std": 2.504538059234619, "margin_dpo/loss_margin_mean": 30.600481033325195, "margin_dpo/margin_mean": 30.600481033325195, "margin_dpo/margin_std": 24.954174041748047, "step": 528 }, { "epoch": 0.7767988252569751, "grad_norm": 76.8966064453125, "learning_rate": 7.322330470336313e-08, "logits/chosen": -0.6043155789375305, "logits/rejected": -0.5836986899375916, "logps/chosen": -82.0534896850586, "logps/ref_chosen": -55.5749626159668, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -143.092041015625, "loss": 0.7156, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19791270792484283, "margin_dpo/beta_margin_grad_std": 0.25681713223457336, "margin_dpo/beta_margin_mean": 2.740443229675293, "margin_dpo/beta_margin_std": 2.8750996589660645, "margin_dpo/loss_margin_mean": 27.40443229675293, "margin_dpo/margin_mean": 27.404430389404297, "margin_dpo/margin_std": 28.411148071289062, "step": 529 }, { "epoch": 0.7782672540381792, "grad_norm": 62.848148345947266, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.5931106805801392, "logits/rejected": -0.5747063159942627, "logps/chosen": -71.18637084960938, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -147.40582275390625, "loss": 0.4445, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13506180047988892, "margin_dpo/beta_margin_grad_std": 0.23474474251270294, "margin_dpo/beta_margin_mean": 3.6536343097686768, "margin_dpo/beta_margin_std": 2.7414755821228027, "margin_dpo/loss_margin_mean": 36.53634262084961, "margin_dpo/margin_mean": 36.53634262084961, "margin_dpo/margin_std": 27.119749069213867, "step": 530 }, { "epoch": 0.7797356828193832, "grad_norm": 54.345516204833984, "learning_rate": 7.141774982445147e-08, "logits/chosen": -0.6257216334342957, "logits/rejected": -0.5846695899963379, "logps/chosen": -77.81207275390625, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -125.383056640625, "loss": 0.5198, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16368556022644043, "margin_dpo/beta_margin_grad_std": 0.23361165821552277, "margin_dpo/beta_margin_mean": 3.22110652923584, "margin_dpo/beta_margin_std": 2.786233901977539, "margin_dpo/loss_margin_mean": 32.21106719970703, "margin_dpo/margin_mean": 32.21106719970703, "margin_dpo/margin_std": 27.787128448486328, "step": 531 }, { "epoch": 0.7812041116005873, "grad_norm": 57.886627197265625, "learning_rate": 7.052201923388953e-08, "logits/chosen": -0.6238207817077637, "logits/rejected": -0.5933228731155396, "logps/chosen": -93.80610656738281, "logps/ref_chosen": -70.28602600097656, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -148.1940155029297, "loss": 0.3443, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12783224880695343, "margin_dpo/beta_margin_grad_std": 0.19326746463775635, "margin_dpo/beta_margin_mean": 3.8082594871520996, "margin_dpo/beta_margin_std": 2.9897003173828125, "margin_dpo/loss_margin_mean": 38.08259582519531, "margin_dpo/margin_mean": 38.08259582519531, "margin_dpo/margin_std": 27.77364730834961, "step": 532 }, { "epoch": 0.7826725403817915, "grad_norm": 71.21391296386719, "learning_rate": 6.963101805503646e-08, "logits/chosen": -0.6265207529067993, "logits/rejected": -0.584482729434967, "logps/chosen": -87.6553726196289, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -125.80079650878906, "loss": 0.5973, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18623670935630798, "margin_dpo/beta_margin_grad_std": 0.23616911470890045, "margin_dpo/beta_margin_mean": 2.6412458419799805, "margin_dpo/beta_margin_std": 2.417598009109497, "margin_dpo/loss_margin_mean": 26.412456512451172, "margin_dpo/margin_mean": 26.412456512451172, "margin_dpo/margin_std": 23.92403793334961, "step": 533 }, { "epoch": 0.7841409691629956, "grad_norm": 46.806941986083984, "learning_rate": 6.874476976660184e-08, "logits/chosen": -0.6205226182937622, "logits/rejected": -0.5897984504699707, "logps/chosen": -82.1025619506836, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -132.9524688720703, "loss": 0.3996, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15258625149726868, "margin_dpo/beta_margin_grad_std": 0.18508727848529816, "margin_dpo/beta_margin_mean": 3.2425835132598877, "margin_dpo/beta_margin_std": 2.758850336074829, "margin_dpo/loss_margin_mean": 32.42583465576172, "margin_dpo/margin_mean": 32.42583465576172, "margin_dpo/margin_std": 27.545181274414062, "step": 534 }, { "epoch": 0.7856093979441997, "grad_norm": 43.828521728515625, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.6110357046127319, "logits/rejected": -0.5794057846069336, "logps/chosen": -74.39203643798828, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -152.29916381835938, "loss": 0.3868, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1333179920911789, "margin_dpo/beta_margin_grad_std": 0.20468372106552124, "margin_dpo/beta_margin_mean": 3.5929739475250244, "margin_dpo/beta_margin_std": 2.733642578125, "margin_dpo/loss_margin_mean": 35.92974090576172, "margin_dpo/margin_mean": 35.92974090576172, "margin_dpo/margin_std": 26.782804489135742, "step": 535 }, { "epoch": 0.7870778267254038, "grad_norm": 32.79661178588867, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.6306508779525757, "logits/rejected": -0.6139326095581055, "logps/chosen": -67.34485626220703, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -150.07534790039062, "loss": 0.2143, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.08794363588094711, "margin_dpo/beta_margin_grad_std": 0.13372628390789032, "margin_dpo/beta_margin_mean": 4.071318626403809, "margin_dpo/beta_margin_std": 2.757542848587036, "margin_dpo/loss_margin_mean": 40.71318817138672, "margin_dpo/margin_mean": 40.71318817138672, "margin_dpo/margin_std": 27.551332473754883, "step": 536 }, { "epoch": 0.788546255506608, "grad_norm": 57.09232711791992, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.6294724941253662, "logits/rejected": -0.5703548192977905, "logps/chosen": -78.06488037109375, "logps/ref_chosen": -57.747474670410156, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -124.39582824707031, "loss": 0.423, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.157338947057724, "margin_dpo/beta_margin_grad_std": 0.19860957562923431, "margin_dpo/beta_margin_mean": 3.3640034198760986, "margin_dpo/beta_margin_std": 2.8764986991882324, "margin_dpo/loss_margin_mean": 33.64003372192383, "margin_dpo/margin_mean": 33.64003372192383, "margin_dpo/margin_std": 28.32391929626465, "step": 537 }, { "epoch": 0.7900146842878121, "grad_norm": 48.7829704284668, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.6543197631835938, "logits/rejected": -0.6095279455184937, "logps/chosen": -88.88335418701172, "logps/ref_chosen": -66.41593933105469, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -139.14047241210938, "loss": 0.354, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12948301434516907, "margin_dpo/beta_margin_grad_std": 0.1803048700094223, "margin_dpo/beta_margin_mean": 3.2444982528686523, "margin_dpo/beta_margin_std": 2.5140750408172607, "margin_dpo/loss_margin_mean": 32.44498062133789, "margin_dpo/margin_mean": 32.44498062133789, "margin_dpo/margin_std": 24.870681762695312, "step": 538 }, { "epoch": 0.7914831130690162, "grad_norm": 53.8671760559082, "learning_rate": 6.438563463416221e-08, "logits/chosen": -0.6606429815292358, "logits/rejected": -0.6174535751342773, "logps/chosen": -79.59363555908203, "logps/ref_chosen": -58.49285125732422, "logps/ref_rejected": -91.85395812988281, "logps/rejected": -144.0075225830078, "loss": 0.4892, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1646711677312851, "margin_dpo/beta_margin_grad_std": 0.21774569153785706, "margin_dpo/beta_margin_mean": 3.105278968811035, "margin_dpo/beta_margin_std": 2.70540189743042, "margin_dpo/loss_margin_mean": 31.05278968811035, "margin_dpo/margin_mean": 31.05278968811035, "margin_dpo/margin_std": 26.950185775756836, "step": 539 }, { "epoch": 0.7929515418502202, "grad_norm": 65.30083465576172, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.6370847225189209, "logits/rejected": -0.6102155447006226, "logps/chosen": -84.67435455322266, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.43000030517578, "logps/rejected": -172.81341552734375, "loss": 0.4614, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1394956409931183, "margin_dpo/beta_margin_grad_std": 0.22995907068252563, "margin_dpo/beta_margin_mean": 3.5191566944122314, "margin_dpo/beta_margin_std": 2.8065860271453857, "margin_dpo/loss_margin_mean": 35.191566467285156, "margin_dpo/margin_mean": 35.191566467285156, "margin_dpo/margin_std": 27.493711471557617, "step": 540 }, { "epoch": 0.7944199706314243, "grad_norm": 60.31626510620117, "learning_rate": 6.267605843546767e-08, "logits/chosen": -0.6471028327941895, "logits/rejected": -0.6038549542427063, "logps/chosen": -100.83651733398438, "logps/ref_chosen": -78.28035736083984, "logps/ref_rejected": -103.273681640625, "logps/rejected": -156.01519775390625, "loss": 0.4299, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1511632800102234, "margin_dpo/beta_margin_grad_std": 0.20662708580493927, "margin_dpo/beta_margin_mean": 3.0185351371765137, "margin_dpo/beta_margin_std": 2.4248569011688232, "margin_dpo/loss_margin_mean": 30.18535041809082, "margin_dpo/margin_mean": 30.185348510742188, "margin_dpo/margin_std": 23.390766143798828, "step": 541 }, { "epoch": 0.7958883994126285, "grad_norm": 37.80149841308594, "learning_rate": 6.182866334636888e-08, "logits/chosen": -0.6717028617858887, "logits/rejected": -0.6646940112113953, "logps/chosen": -79.87113952636719, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -153.2113494873047, "loss": 0.3688, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13427144289016724, "margin_dpo/beta_margin_grad_std": 0.19649642705917358, "margin_dpo/beta_margin_mean": 3.4350104331970215, "margin_dpo/beta_margin_std": 2.5037317276000977, "margin_dpo/loss_margin_mean": 34.35010528564453, "margin_dpo/margin_mean": 34.35010528564453, "margin_dpo/margin_std": 24.91982650756836, "step": 542 }, { "epoch": 0.7973568281938326, "grad_norm": 72.47408294677734, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.6002076864242554, "logits/rejected": -0.5865759253501892, "logps/chosen": -83.70565795898438, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -154.27377319335938, "loss": 0.5987, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1960902214050293, "margin_dpo/beta_margin_grad_std": 0.23114755749702454, "margin_dpo/beta_margin_mean": 2.5586671829223633, "margin_dpo/beta_margin_std": 2.56718111038208, "margin_dpo/loss_margin_mean": 25.586669921875, "margin_dpo/margin_mean": 25.586671829223633, "margin_dpo/margin_std": 25.219966888427734, "step": 543 }, { "epoch": 0.7988252569750367, "grad_norm": 53.03235626220703, "learning_rate": 6.01487708363232e-08, "logits/chosen": -0.6120076179504395, "logits/rejected": -0.5983961820602417, "logps/chosen": -84.55440521240234, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -158.740478515625, "loss": 0.3113, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12066913396120071, "margin_dpo/beta_margin_grad_std": 0.16076715290546417, "margin_dpo/beta_margin_mean": 3.287367105484009, "margin_dpo/beta_margin_std": 2.3992013931274414, "margin_dpo/loss_margin_mean": 32.8736686706543, "margin_dpo/margin_mean": 32.8736686706543, "margin_dpo/margin_std": 23.937637329101562, "step": 544 }, { "epoch": 0.8002936857562408, "grad_norm": 50.49260330200195, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.5966023206710815, "logits/rejected": -0.5714937448501587, "logps/chosen": -91.05603790283203, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -153.31039428710938, "loss": 0.3894, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.147560253739357, "margin_dpo/beta_margin_grad_std": 0.19324474036693573, "margin_dpo/beta_margin_mean": 3.3994970321655273, "margin_dpo/beta_margin_std": 2.6682093143463135, "margin_dpo/loss_margin_mean": 33.99496841430664, "margin_dpo/margin_mean": 33.994972229003906, "margin_dpo/margin_std": 26.630447387695312, "step": 545 }, { "epoch": 0.801762114537445, "grad_norm": 48.28023147583008, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.598247230052948, "logits/rejected": -0.5669834017753601, "logps/chosen": -72.54621887207031, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -137.0621337890625, "loss": 0.3406, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12454946339130402, "margin_dpo/beta_margin_grad_std": 0.17545826733112335, "margin_dpo/beta_margin_mean": 3.33939266204834, "margin_dpo/beta_margin_std": 2.392932891845703, "margin_dpo/loss_margin_mean": 33.393924713134766, "margin_dpo/margin_mean": 33.39392852783203, "margin_dpo/margin_std": 23.586105346679688, "step": 546 }, { "epoch": 0.8032305433186491, "grad_norm": 94.86836242675781, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.5786527395248413, "logits/rejected": -0.5422627925872803, "logps/chosen": -98.80986022949219, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -146.6555938720703, "loss": 0.5943, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18261206150054932, "margin_dpo/beta_margin_grad_std": 0.2419717162847519, "margin_dpo/beta_margin_mean": 3.0772078037261963, "margin_dpo/beta_margin_std": 3.048326015472412, "margin_dpo/loss_margin_mean": 30.772077560424805, "margin_dpo/margin_mean": 30.772075653076172, "margin_dpo/margin_std": 29.73462677001953, "step": 547 }, { "epoch": 0.8046989720998532, "grad_norm": 50.10495376586914, "learning_rate": 5.684919345471029e-08, "logits/chosen": -0.6317073106765747, "logits/rejected": -0.5984237790107727, "logps/chosen": -87.01058959960938, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -150.06707763671875, "loss": 0.4276, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14587746560573578, "margin_dpo/beta_margin_grad_std": 0.21648141741752625, "margin_dpo/beta_margin_mean": 3.418475389480591, "margin_dpo/beta_margin_std": 2.860063076019287, "margin_dpo/loss_margin_mean": 34.18475341796875, "margin_dpo/margin_mean": 34.18475341796875, "margin_dpo/margin_std": 28.568710327148438, "step": 548 }, { "epoch": 0.8061674008810573, "grad_norm": 69.96656799316406, "learning_rate": 5.603696935852426e-08, "logits/chosen": -0.6198970079421997, "logits/rejected": -0.5846245288848877, "logps/chosen": -69.8495101928711, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -129.4089813232422, "loss": 0.3395, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12493831664323807, "margin_dpo/beta_margin_grad_std": 0.18240319192409515, "margin_dpo/beta_margin_mean": 3.4861512184143066, "margin_dpo/beta_margin_std": 2.548912286758423, "margin_dpo/loss_margin_mean": 34.86151123046875, "margin_dpo/margin_mean": 34.86151123046875, "margin_dpo/margin_std": 25.463951110839844, "step": 549 }, { "epoch": 0.8076358296622613, "grad_norm": 75.74690246582031, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.5934598445892334, "logits/rejected": -0.5707608461380005, "logps/chosen": -81.35966491699219, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -147.45431518554688, "loss": 0.5179, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18190813064575195, "margin_dpo/beta_margin_grad_std": 0.2241961658000946, "margin_dpo/beta_margin_mean": 2.777578592300415, "margin_dpo/beta_margin_std": 2.4132397174835205, "margin_dpo/loss_margin_mean": 27.775785446166992, "margin_dpo/margin_mean": 27.775785446166992, "margin_dpo/margin_std": 24.089828491210938, "step": 550 }, { "epoch": 0.8091042584434655, "grad_norm": 63.201576232910156, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.6071433424949646, "logits/rejected": -0.5851248502731323, "logps/chosen": -82.18833923339844, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -169.89010620117188, "loss": 0.3486, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13184352219104767, "margin_dpo/beta_margin_grad_std": 0.18528760969638824, "margin_dpo/beta_margin_mean": 3.5135278701782227, "margin_dpo/beta_margin_std": 2.6755788326263428, "margin_dpo/loss_margin_mean": 35.135276794433594, "margin_dpo/margin_mean": 35.135276794433594, "margin_dpo/margin_std": 26.726924896240234, "step": 551 }, { "epoch": 0.8105726872246696, "grad_norm": 36.340457916259766, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.5997291803359985, "logits/rejected": -0.5684548616409302, "logps/chosen": -82.64513397216797, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -164.4604949951172, "loss": 0.2519, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10329456627368927, "margin_dpo/beta_margin_grad_std": 0.142158642411232, "margin_dpo/beta_margin_mean": 3.960785388946533, "margin_dpo/beta_margin_std": 3.118678331375122, "margin_dpo/loss_margin_mean": 39.607852935791016, "margin_dpo/margin_mean": 39.607852935791016, "margin_dpo/margin_std": 30.571399688720703, "step": 552 }, { "epoch": 0.8120411160058737, "grad_norm": 59.747318267822266, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.6111286878585815, "logits/rejected": -0.5851707458496094, "logps/chosen": -89.31002044677734, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -149.80963134765625, "loss": 0.4075, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15195949375629425, "margin_dpo/beta_margin_grad_std": 0.1938675493001938, "margin_dpo/beta_margin_mean": 3.0661275386810303, "margin_dpo/beta_margin_std": 2.555492401123047, "margin_dpo/loss_margin_mean": 30.661273956298828, "margin_dpo/margin_mean": 30.661273956298828, "margin_dpo/margin_std": 24.51015853881836, "step": 553 }, { "epoch": 0.8135095447870778, "grad_norm": 69.38467407226562, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.5836309194564819, "logits/rejected": -0.5294244289398193, "logps/chosen": -90.94577026367188, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -154.76803588867188, "loss": 0.4321, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12726253271102905, "margin_dpo/beta_margin_grad_std": 0.22509342432022095, "margin_dpo/beta_margin_mean": 4.198723316192627, "margin_dpo/beta_margin_std": 3.1148805618286133, "margin_dpo/loss_margin_mean": 41.98722839355469, "margin_dpo/margin_mean": 41.98722839355469, "margin_dpo/margin_std": 31.020339965820312, "step": 554 }, { "epoch": 0.8149779735682819, "grad_norm": 60.07759094238281, "learning_rate": 5.127169765359515e-08, "logits/chosen": -0.631875216960907, "logits/rejected": -0.6264532804489136, "logps/chosen": -75.14740753173828, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -165.06500244140625, "loss": 0.4537, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1464599072933197, "margin_dpo/beta_margin_grad_std": 0.23116105794906616, "margin_dpo/beta_margin_mean": 3.4768707752227783, "margin_dpo/beta_margin_std": 2.832597494125366, "margin_dpo/loss_margin_mean": 34.768707275390625, "margin_dpo/margin_mean": 34.768707275390625, "margin_dpo/margin_std": 27.81438446044922, "step": 555 }, { "epoch": 0.8164464023494861, "grad_norm": 95.35712432861328, "learning_rate": 5.049569317994012e-08, "logits/chosen": -0.5981155633926392, "logits/rejected": -0.5586187839508057, "logps/chosen": -80.44320678710938, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -153.6765594482422, "loss": 0.5367, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1837112158536911, "margin_dpo/beta_margin_grad_std": 0.23774275183677673, "margin_dpo/beta_margin_mean": 3.053741455078125, "margin_dpo/beta_margin_std": 2.7874701023101807, "margin_dpo/loss_margin_mean": 30.53741455078125, "margin_dpo/margin_mean": 30.53741455078125, "margin_dpo/margin_std": 27.704578399658203, "step": 556 }, { "epoch": 0.8179148311306902, "grad_norm": 51.85511016845703, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -0.6293572187423706, "logits/rejected": -0.6109094619750977, "logps/chosen": -89.30657958984375, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93966674804688, "logps/rejected": -161.80950927734375, "loss": 0.4557, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1644848883152008, "margin_dpo/beta_margin_grad_std": 0.2031964659690857, "margin_dpo/beta_margin_mean": 3.0403928756713867, "margin_dpo/beta_margin_std": 2.689265251159668, "margin_dpo/loss_margin_mean": 30.403926849365234, "margin_dpo/margin_mean": 30.403926849365234, "margin_dpo/margin_std": 26.008148193359375, "step": 557 }, { "epoch": 0.8193832599118943, "grad_norm": 37.57906723022461, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -0.6321591138839722, "logits/rejected": -0.5818116068840027, "logps/chosen": -81.43386840820312, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -162.21075439453125, "loss": 0.2873, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10846921056509018, "margin_dpo/beta_margin_grad_std": 0.1664186269044876, "margin_dpo/beta_margin_mean": 4.098410129547119, "margin_dpo/beta_margin_std": 3.1049041748046875, "margin_dpo/loss_margin_mean": 40.984100341796875, "margin_dpo/margin_mean": 40.984100341796875, "margin_dpo/margin_std": 30.58254051208496, "step": 558 }, { "epoch": 0.8208516886930984, "grad_norm": 55.36118698120117, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -0.6702802181243896, "logits/rejected": -0.6162378787994385, "logps/chosen": -80.25833129882812, "logps/ref_chosen": -60.75232696533203, "logps/ref_rejected": -93.4422836303711, "logps/rejected": -146.09507751464844, "loss": 0.4285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1548963040113449, "margin_dpo/beta_margin_grad_std": 0.20749419927597046, "margin_dpo/beta_margin_mean": 3.3146774768829346, "margin_dpo/beta_margin_std": 2.7877228260040283, "margin_dpo/loss_margin_mean": 33.14677429199219, "margin_dpo/margin_mean": 33.14677429199219, "margin_dpo/margin_std": 27.695289611816406, "step": 559 }, { "epoch": 0.8223201174743024, "grad_norm": 55.987613677978516, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.5809895992279053, "logits/rejected": -0.5309587717056274, "logps/chosen": -78.91523742675781, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -129.71127319335938, "loss": 0.4468, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1631620079278946, "margin_dpo/beta_margin_grad_std": 0.19706346094608307, "margin_dpo/beta_margin_mean": 2.890864372253418, "margin_dpo/beta_margin_std": 2.4986603260040283, "margin_dpo/loss_margin_mean": 28.90864372253418, "margin_dpo/margin_mean": 28.90864372253418, "margin_dpo/margin_std": 24.452037811279297, "step": 560 }, { "epoch": 0.8237885462555066, "grad_norm": 64.24273681640625, "learning_rate": 4.669493178106432e-08, "logits/chosen": -0.5840525031089783, "logits/rejected": -0.5768797397613525, "logps/chosen": -75.96839904785156, "logps/ref_chosen": -50.91287612915039, "logps/ref_rejected": -99.06857299804688, "logps/rejected": -153.4114227294922, "loss": 0.499, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1737356036901474, "margin_dpo/beta_margin_grad_std": 0.21739643812179565, "margin_dpo/beta_margin_mean": 2.9287328720092773, "margin_dpo/beta_margin_std": 2.6283650398254395, "margin_dpo/loss_margin_mean": 29.287328720092773, "margin_dpo/margin_mean": 29.287330627441406, "margin_dpo/margin_std": 26.124765396118164, "step": 561 }, { "epoch": 0.8252569750367107, "grad_norm": 32.95391082763672, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -0.6423584222793579, "logits/rejected": -0.5944841504096985, "logps/chosen": -78.03691101074219, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -152.8929443359375, "loss": 0.2448, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.09955663979053497, "margin_dpo/beta_margin_grad_std": 0.13690762221813202, "margin_dpo/beta_margin_mean": 3.777778148651123, "margin_dpo/beta_margin_std": 2.689279079437256, "margin_dpo/loss_margin_mean": 37.77777862548828, "margin_dpo/margin_mean": 37.77777862548828, "margin_dpo/margin_std": 26.642715454101562, "step": 562 }, { "epoch": 0.8267254038179148, "grad_norm": 62.781524658203125, "learning_rate": 4.521198892775202e-08, "logits/chosen": -0.5596526265144348, "logits/rejected": -0.5383732914924622, "logps/chosen": -83.21853637695312, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -147.46449279785156, "loss": 0.4102, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15092504024505615, "margin_dpo/beta_margin_grad_std": 0.19437021017074585, "margin_dpo/beta_margin_mean": 3.0286452770233154, "margin_dpo/beta_margin_std": 2.374891519546509, "margin_dpo/loss_margin_mean": 30.28645133972168, "margin_dpo/margin_mean": 30.286453247070312, "margin_dpo/margin_std": 23.04823875427246, "step": 563 }, { "epoch": 0.8281938325991189, "grad_norm": 47.216941833496094, "learning_rate": 4.447860229910544e-08, "logits/chosen": -0.6696497201919556, "logits/rejected": -0.6134607195854187, "logps/chosen": -96.129638671875, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.2381820678711, "logps/rejected": -147.440673828125, "loss": 0.3702, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1358504742383957, "margin_dpo/beta_margin_grad_std": 0.18947924673557281, "margin_dpo/beta_margin_mean": 3.234123945236206, "margin_dpo/beta_margin_std": 2.284991502761841, "margin_dpo/loss_margin_mean": 32.34123992919922, "margin_dpo/margin_mean": 32.34123992919922, "margin_dpo/margin_std": 22.592432022094727, "step": 564 }, { "epoch": 0.8296622613803231, "grad_norm": 42.826656341552734, "learning_rate": 4.375063135042445e-08, "logits/chosen": -0.6142770051956177, "logits/rejected": -0.5721128582954407, "logps/chosen": -90.92054748535156, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -142.84002685546875, "loss": 0.3742, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1414240449666977, "margin_dpo/beta_margin_grad_std": 0.18444375693798065, "margin_dpo/beta_margin_mean": 3.5160465240478516, "margin_dpo/beta_margin_std": 3.0857439041137695, "margin_dpo/loss_margin_mean": 35.160465240478516, "margin_dpo/margin_mean": 35.160465240478516, "margin_dpo/margin_std": 30.577224731445312, "step": 565 }, { "epoch": 0.8311306901615272, "grad_norm": 59.75011444091797, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -0.5794559717178345, "logits/rejected": -0.564411461353302, "logps/chosen": -87.007568359375, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86931610107422, "logps/rejected": -158.2462158203125, "loss": 0.4748, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16711834073066711, "margin_dpo/beta_margin_grad_std": 0.2162701040506363, "margin_dpo/beta_margin_mean": 3.391465425491333, "margin_dpo/beta_margin_std": 3.0285098552703857, "margin_dpo/loss_margin_mean": 33.91465377807617, "margin_dpo/margin_mean": 33.91465759277344, "margin_dpo/margin_std": 29.81332778930664, "step": 566 }, { "epoch": 0.8325991189427313, "grad_norm": 80.87443542480469, "learning_rate": 4.231101308059165e-08, "logits/chosen": -0.6721388697624207, "logits/rejected": -0.6175321340560913, "logps/chosen": -75.00394439697266, "logps/ref_chosen": -52.858299255371094, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -139.839599609375, "loss": 0.5855, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17626914381980896, "margin_dpo/beta_margin_grad_std": 0.24363385140895844, "margin_dpo/beta_margin_mean": 3.2322990894317627, "margin_dpo/beta_margin_std": 2.850463390350342, "margin_dpo/loss_margin_mean": 32.32299041748047, "margin_dpo/margin_mean": 32.32299041748047, "margin_dpo/margin_std": 28.030927658081055, "step": 567 }, { "epoch": 0.8340675477239354, "grad_norm": 44.05873489379883, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -0.6051311492919922, "logits/rejected": -0.58836430311203, "logps/chosen": -67.80856323242188, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236145019531, "logps/rejected": -149.61813354492188, "loss": 0.3543, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12138967961072922, "margin_dpo/beta_margin_grad_std": 0.20320722460746765, "margin_dpo/beta_margin_mean": 3.7909576892852783, "margin_dpo/beta_margin_std": 2.74312162399292, "margin_dpo/loss_margin_mean": 37.909576416015625, "margin_dpo/margin_mean": 37.909576416015625, "margin_dpo/margin_std": 26.69784164428711, "step": 568 }, { "epoch": 0.8355359765051396, "grad_norm": 64.07440948486328, "learning_rate": 4.089328585837512e-08, "logits/chosen": -0.6324214935302734, "logits/rejected": -0.6018074750900269, "logps/chosen": -86.39301300048828, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -131.90521240234375, "loss": 0.5035, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17517776787281036, "margin_dpo/beta_margin_grad_std": 0.22150224447250366, "margin_dpo/beta_margin_mean": 3.012951135635376, "margin_dpo/beta_margin_std": 2.732919931411743, "margin_dpo/loss_margin_mean": 30.1295108795166, "margin_dpo/margin_mean": 30.129512786865234, "margin_dpo/margin_std": 27.2763729095459, "step": 569 }, { "epoch": 0.8370044052863436, "grad_norm": 51.117984771728516, "learning_rate": 4.019267817841834e-08, "logits/chosen": -0.6380658149719238, "logits/rejected": -0.586859941482544, "logps/chosen": -82.45683288574219, "logps/ref_chosen": -61.61454772949219, "logps/ref_rejected": -82.1418685913086, "logps/rejected": -137.9852294921875, "loss": 0.3273, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12137595564126968, "margin_dpo/beta_margin_grad_std": 0.18200945854187012, "margin_dpo/beta_margin_mean": 3.500107765197754, "margin_dpo/beta_margin_std": 2.537564516067505, "margin_dpo/loss_margin_mean": 35.00107955932617, "margin_dpo/margin_mean": 35.001075744628906, "margin_dpo/margin_std": 25.32352066040039, "step": 570 }, { "epoch": 0.8384728340675477, "grad_norm": 61.06570053100586, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -0.6200108528137207, "logits/rejected": -0.5971379280090332, "logps/chosen": -75.36268615722656, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -148.22666931152344, "loss": 0.3723, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13746890425682068, "margin_dpo/beta_margin_grad_std": 0.19136172533035278, "margin_dpo/beta_margin_mean": 3.458122968673706, "margin_dpo/beta_margin_std": 2.7801191806793213, "margin_dpo/loss_margin_mean": 34.58123016357422, "margin_dpo/margin_mean": 34.58122634887695, "margin_dpo/margin_std": 27.320905685424805, "step": 571 }, { "epoch": 0.8399412628487518, "grad_norm": 81.89973449707031, "learning_rate": 3.880806698864086e-08, "logits/chosen": -0.5995185375213623, "logits/rejected": -0.583366334438324, "logps/chosen": -75.69577026367188, "logps/ref_chosen": -48.459285736083984, "logps/ref_rejected": -83.5570297241211, "logps/rejected": -143.04672241210938, "loss": 0.6994, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18695051968097687, "margin_dpo/beta_margin_grad_std": 0.25848084688186646, "margin_dpo/beta_margin_mean": 3.2253220081329346, "margin_dpo/beta_margin_std": 3.1980674266815186, "margin_dpo/loss_margin_mean": 32.25321960449219, "margin_dpo/margin_mean": 32.25321960449219, "margin_dpo/margin_std": 31.76388931274414, "step": 572 }, { "epoch": 0.8414096916299559, "grad_norm": 60.32197570800781, "learning_rate": 3.812409996461275e-08, "logits/chosen": -0.6115789413452148, "logits/rejected": -0.5747958421707153, "logps/chosen": -73.32516479492188, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -141.56448364257812, "loss": 0.4409, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15062843263149261, "margin_dpo/beta_margin_grad_std": 0.21439620852470398, "margin_dpo/beta_margin_mean": 3.453695058822632, "margin_dpo/beta_margin_std": 2.861546754837036, "margin_dpo/loss_margin_mean": 34.536949157714844, "margin_dpo/margin_mean": 34.536949157714844, "margin_dpo/margin_std": 26.226520538330078, "step": 573 }, { "epoch": 0.8428781204111601, "grad_norm": 71.44043731689453, "learning_rate": 3.74457160675965e-08, "logits/chosen": -0.662164568901062, "logits/rejected": -0.6289517879486084, "logps/chosen": -74.57516479492188, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -150.48431396484375, "loss": 0.4525, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14266708493232727, "margin_dpo/beta_margin_grad_std": 0.20333649218082428, "margin_dpo/beta_margin_mean": 3.4147207736968994, "margin_dpo/beta_margin_std": 2.838653326034546, "margin_dpo/loss_margin_mean": 34.14720916748047, "margin_dpo/margin_mean": 34.14720916748047, "margin_dpo/margin_std": 27.81047821044922, "step": 574 }, { "epoch": 0.8443465491923642, "grad_norm": 83.72123718261719, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.5885167121887207, "logits/rejected": -0.5552273988723755, "logps/chosen": -97.17411804199219, "logps/ref_chosen": -71.79014587402344, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -157.09030151367188, "loss": 0.6269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16689082980155945, "margin_dpo/beta_margin_grad_std": 0.2690768837928772, "margin_dpo/beta_margin_mean": 3.632012128829956, "margin_dpo/beta_margin_std": 3.186659574508667, "margin_dpo/loss_margin_mean": 36.32011795043945, "margin_dpo/margin_mean": 36.32011795043945, "margin_dpo/margin_std": 31.200788497924805, "step": 575 }, { "epoch": 0.8458149779735683, "grad_norm": 47.92924118041992, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -0.608694314956665, "logits/rejected": -0.6066184043884277, "logps/chosen": -77.77719116210938, "logps/ref_chosen": -54.262969970703125, "logps/ref_rejected": -100.7542724609375, "logps/rejected": -159.03663635253906, "loss": 0.4414, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14325302839279175, "margin_dpo/beta_margin_grad_std": 0.1987282782793045, "margin_dpo/beta_margin_mean": 3.4768130779266357, "margin_dpo/beta_margin_std": 3.035773992538452, "margin_dpo/loss_margin_mean": 34.768131256103516, "margin_dpo/margin_mean": 34.768131256103516, "margin_dpo/margin_std": 29.473041534423828, "step": 576 }, { "epoch": 0.8472834067547724, "grad_norm": 50.64008331298828, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -0.6180467009544373, "logits/rejected": -0.5673972368240356, "logps/chosen": -81.8788070678711, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -142.12271118164062, "loss": 0.355, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12185031920671463, "margin_dpo/beta_margin_grad_std": 0.19115541875362396, "margin_dpo/beta_margin_mean": 3.8082919120788574, "margin_dpo/beta_margin_std": 2.871381998062134, "margin_dpo/loss_margin_mean": 38.082916259765625, "margin_dpo/margin_mean": 38.082916259765625, "margin_dpo/margin_std": 27.93082046508789, "step": 577 }, { "epoch": 0.8487518355359766, "grad_norm": 56.69212341308594, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.5924346446990967, "logits/rejected": -0.576144814491272, "logps/chosen": -76.00813293457031, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.43626403808594, "logps/rejected": -145.4608154296875, "loss": 0.3666, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13498100638389587, "margin_dpo/beta_margin_grad_std": 0.1896868646144867, "margin_dpo/beta_margin_mean": 3.5280113220214844, "margin_dpo/beta_margin_std": 2.7423183917999268, "margin_dpo/loss_margin_mean": 35.280113220214844, "margin_dpo/margin_mean": 35.280113220214844, "margin_dpo/margin_std": 26.86646270751953, "step": 578 }, { "epoch": 0.8502202643171806, "grad_norm": 58.20693588256836, "learning_rate": 3.41381639738331e-08, "logits/chosen": -0.5839706659317017, "logits/rejected": -0.5557907223701477, "logps/chosen": -80.32678985595703, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -146.857666015625, "loss": 0.3726, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1434396207332611, "margin_dpo/beta_margin_grad_std": 0.18104501068592072, "margin_dpo/beta_margin_mean": 3.0629067420959473, "margin_dpo/beta_margin_std": 2.3335225582122803, "margin_dpo/loss_margin_mean": 30.62906837463379, "margin_dpo/margin_mean": 30.629066467285156, "margin_dpo/margin_std": 23.209409713745117, "step": 579 }, { "epoch": 0.8516886930983847, "grad_norm": 46.23623275756836, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.6004414558410645, "logits/rejected": -0.5626081228256226, "logps/chosen": -67.79255676269531, "logps/ref_chosen": -48.70684051513672, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -141.36367797851562, "loss": 0.3452, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11654352396726608, "margin_dpo/beta_margin_grad_std": 0.19992825388908386, "margin_dpo/beta_margin_mean": 4.051955699920654, "margin_dpo/beta_margin_std": 3.0127224922180176, "margin_dpo/loss_margin_mean": 40.519554138183594, "margin_dpo/margin_mean": 40.519554138183594, "margin_dpo/margin_std": 30.061260223388672, "step": 580 }, { "epoch": 0.8531571218795888, "grad_norm": 52.084510803222656, "learning_rate": 3.285483927764726e-08, "logits/chosen": -0.6124836802482605, "logits/rejected": -0.5917458534240723, "logps/chosen": -83.28103637695312, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -143.91891479492188, "loss": 0.427, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15421409904956818, "margin_dpo/beta_margin_grad_std": 0.20411258935928345, "margin_dpo/beta_margin_mean": 3.1124558448791504, "margin_dpo/beta_margin_std": 2.4851300716400146, "margin_dpo/loss_margin_mean": 31.124557495117188, "margin_dpo/margin_mean": 31.124557495117188, "margin_dpo/margin_std": 24.729656219482422, "step": 581 }, { "epoch": 0.8546255506607929, "grad_norm": 66.89032745361328, "learning_rate": 3.222175147833556e-08, "logits/chosen": -0.6103986501693726, "logits/rejected": -0.6106557846069336, "logps/chosen": -77.18165588378906, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -164.72235107421875, "loss": 0.4122, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1494351178407669, "margin_dpo/beta_margin_grad_std": 0.20539377629756927, "margin_dpo/beta_margin_mean": 3.5699760913848877, "margin_dpo/beta_margin_std": 2.9193856716156006, "margin_dpo/loss_margin_mean": 35.69976043701172, "margin_dpo/margin_mean": 35.69976043701172, "margin_dpo/margin_std": 29.070068359375, "step": 582 }, { "epoch": 0.856093979441997, "grad_norm": 67.63478088378906, "learning_rate": 3.159440233840763e-08, "logits/chosen": -0.5816251039505005, "logits/rejected": -0.563323438167572, "logps/chosen": -81.4218978881836, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -142.5087890625, "loss": 0.5731, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18249757587909698, "margin_dpo/beta_margin_grad_std": 0.23606114089488983, "margin_dpo/beta_margin_mean": 2.954582929611206, "margin_dpo/beta_margin_std": 2.9761416912078857, "margin_dpo/loss_margin_mean": 29.54582977294922, "margin_dpo/margin_mean": 29.54582977294922, "margin_dpo/margin_std": 29.586864471435547, "step": 583 }, { "epoch": 0.8575624082232012, "grad_norm": 42.6038932800293, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -0.5968157052993774, "logits/rejected": -0.5377147197723389, "logps/chosen": -74.70047760009766, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -154.8049774169922, "loss": 0.2565, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10045187175273895, "margin_dpo/beta_margin_grad_std": 0.15640710294246674, "margin_dpo/beta_margin_mean": 3.9369120597839355, "margin_dpo/beta_margin_std": 2.6277453899383545, "margin_dpo/loss_margin_mean": 39.36912155151367, "margin_dpo/margin_mean": 39.369117736816406, "margin_dpo/margin_std": 26.270313262939453, "step": 584 }, { "epoch": 0.8590308370044053, "grad_norm": 67.87578582763672, "learning_rate": 3.035698600998121e-08, "logits/chosen": -0.59834885597229, "logits/rejected": -0.570793628692627, "logps/chosen": -85.62800598144531, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -141.87606811523438, "loss": 0.4808, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1607590615749359, "margin_dpo/beta_margin_grad_std": 0.22445270419120789, "margin_dpo/beta_margin_mean": 3.305250883102417, "margin_dpo/beta_margin_std": 2.882838010787964, "margin_dpo/loss_margin_mean": 33.05250549316406, "margin_dpo/margin_mean": 33.05250549316406, "margin_dpo/margin_std": 28.073482513427734, "step": 585 }, { "epoch": 0.8604992657856094, "grad_norm": 64.7607650756836, "learning_rate": 2.974695142855388e-08, "logits/chosen": -0.5770500898361206, "logits/rejected": -0.5656349658966064, "logps/chosen": -82.06817626953125, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.8026123046875, "logps/rejected": -149.83375549316406, "loss": 0.5571, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16552101075649261, "margin_dpo/beta_margin_grad_std": 0.23515141010284424, "margin_dpo/beta_margin_mean": 3.2818562984466553, "margin_dpo/beta_margin_std": 2.998326539993286, "margin_dpo/loss_margin_mean": 32.81856155395508, "margin_dpo/margin_mean": 32.81856155395508, "margin_dpo/margin_std": 29.866100311279297, "step": 586 }, { "epoch": 0.8619676945668135, "grad_norm": 46.60182571411133, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -0.6530208587646484, "logits/rejected": -0.6360703706741333, "logps/chosen": -62.827880859375, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -131.34133911132812, "loss": 0.5, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1838323473930359, "margin_dpo/beta_margin_grad_std": 0.21402017772197723, "margin_dpo/beta_margin_mean": 3.0581204891204834, "margin_dpo/beta_margin_std": 2.7596585750579834, "margin_dpo/loss_margin_mean": 30.581205368041992, "margin_dpo/margin_mean": 30.58120346069336, "margin_dpo/margin_std": 27.5166015625, "step": 587 }, { "epoch": 0.8634361233480177, "grad_norm": 69.0011978149414, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -0.6193605661392212, "logits/rejected": -0.6037042140960693, "logps/chosen": -72.90446472167969, "logps/ref_chosen": -50.294952392578125, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -162.0211639404297, "loss": 0.4825, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16973692178726196, "margin_dpo/beta_margin_grad_std": 0.2188083976507187, "margin_dpo/beta_margin_mean": 3.204176664352417, "margin_dpo/beta_margin_std": 2.8548641204833984, "margin_dpo/loss_margin_mean": 32.04176712036133, "margin_dpo/margin_mean": 32.04176712036133, "margin_dpo/margin_std": 28.34649658203125, "step": 588 }, { "epoch": 0.8649045521292217, "grad_norm": 41.75368118286133, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -0.6043207049369812, "logits/rejected": -0.5762794017791748, "logps/chosen": -82.0301513671875, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -178.11956787109375, "loss": 0.3183, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10713424533605576, "margin_dpo/beta_margin_grad_std": 0.19531351327896118, "margin_dpo/beta_margin_mean": 4.436398506164551, "margin_dpo/beta_margin_std": 3.308957099914551, "margin_dpo/loss_margin_mean": 44.36398696899414, "margin_dpo/margin_mean": 44.363983154296875, "margin_dpo/margin_std": 33.016082763671875, "step": 589 }, { "epoch": 0.8663729809104258, "grad_norm": 38.492958068847656, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.6257822513580322, "logits/rejected": -0.6011620163917542, "logps/chosen": -77.59140014648438, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -166.545166015625, "loss": 0.2747, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10672850161790848, "margin_dpo/beta_margin_grad_std": 0.15377846360206604, "margin_dpo/beta_margin_mean": 3.8700757026672363, "margin_dpo/beta_margin_std": 2.8155410289764404, "margin_dpo/loss_margin_mean": 38.70075607299805, "margin_dpo/margin_mean": 38.70075607299805, "margin_dpo/margin_std": 28.101266860961914, "step": 590 }, { "epoch": 0.8678414096916299, "grad_norm": 63.176300048828125, "learning_rate": 2.678415274939408e-08, "logits/chosen": -0.6197452545166016, "logits/rejected": -0.5587427616119385, "logps/chosen": -81.11954498291016, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -145.21470642089844, "loss": 0.3933, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12877370417118073, "margin_dpo/beta_margin_grad_std": 0.21353955566883087, "margin_dpo/beta_margin_mean": 3.6549482345581055, "margin_dpo/beta_margin_std": 2.5838942527770996, "margin_dpo/loss_margin_mean": 36.54948425292969, "margin_dpo/margin_mean": 36.54948425292969, "margin_dpo/margin_std": 25.822662353515625, "step": 591 }, { "epoch": 0.869309838472834, "grad_norm": 85.90067291259766, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -0.6024812459945679, "logits/rejected": -0.5797621011734009, "logps/chosen": -73.40560913085938, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -136.9627685546875, "loss": 0.5919, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15234871208667755, "margin_dpo/beta_margin_grad_std": 0.2595595717430115, "margin_dpo/beta_margin_mean": 3.576453685760498, "margin_dpo/beta_margin_std": 2.9050722122192383, "margin_dpo/loss_margin_mean": 35.76453399658203, "margin_dpo/margin_mean": 35.76453399658203, "margin_dpo/margin_std": 28.354209899902344, "step": 592 }, { "epoch": 0.8707782672540382, "grad_norm": 91.05819702148438, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.5534486770629883, "logits/rejected": -0.5277604460716248, "logps/chosen": -72.19068145751953, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800476074219, "logps/rejected": -114.34281921386719, "loss": 0.6214, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18934105336666107, "margin_dpo/beta_margin_grad_std": 0.2560799717903137, "margin_dpo/beta_margin_mean": 2.9934892654418945, "margin_dpo/beta_margin_std": 2.8498647212982178, "margin_dpo/loss_margin_mean": 29.934890747070312, "margin_dpo/margin_mean": 29.934890747070312, "margin_dpo/margin_std": 28.364194869995117, "step": 593 }, { "epoch": 0.8722466960352423, "grad_norm": 61.273799896240234, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -0.6247504353523254, "logits/rejected": -0.5774896740913391, "logps/chosen": -76.23826599121094, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967041015625, "logps/rejected": -147.30072021484375, "loss": 0.4983, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14729663729667664, "margin_dpo/beta_margin_grad_std": 0.21982048451900482, "margin_dpo/beta_margin_mean": 3.9897680282592773, "margin_dpo/beta_margin_std": 3.5569002628326416, "margin_dpo/loss_margin_mean": 39.89767837524414, "margin_dpo/margin_mean": 39.897682189941406, "margin_dpo/margin_std": 34.09754180908203, "step": 594 }, { "epoch": 0.8737151248164464, "grad_norm": 43.19758987426758, "learning_rate": 2.451969280180849e-08, "logits/chosen": -0.5981707572937012, "logits/rejected": -0.5647690296173096, "logps/chosen": -72.47997283935547, "logps/ref_chosen": -49.42041778564453, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -135.92117309570312, "loss": 0.3785, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14596061408519745, "margin_dpo/beta_margin_grad_std": 0.18515609204769135, "margin_dpo/beta_margin_mean": 3.223430871963501, "margin_dpo/beta_margin_std": 2.712632417678833, "margin_dpo/loss_margin_mean": 32.23430633544922, "margin_dpo/margin_mean": 32.23430633544922, "margin_dpo/margin_std": 26.91950225830078, "step": 595 }, { "epoch": 0.8751835535976505, "grad_norm": 62.67118453979492, "learning_rate": 2.396839494982103e-08, "logits/chosen": -0.5990803241729736, "logits/rejected": -0.5536556243896484, "logps/chosen": -81.62798309326172, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -136.9227752685547, "loss": 0.4779, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16171905398368835, "margin_dpo/beta_margin_grad_std": 0.2228172868490219, "margin_dpo/beta_margin_mean": 3.4995357990264893, "margin_dpo/beta_margin_std": 3.0359036922454834, "margin_dpo/loss_margin_mean": 34.995357513427734, "margin_dpo/margin_mean": 34.995357513427734, "margin_dpo/margin_std": 29.87460708618164, "step": 596 }, { "epoch": 0.8766519823788547, "grad_norm": 60.41211700439453, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.5785458087921143, "logits/rejected": -0.5741050243377686, "logps/chosen": -79.68421936035156, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -158.439453125, "loss": 0.5269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1700301468372345, "margin_dpo/beta_margin_grad_std": 0.240619495511055, "margin_dpo/beta_margin_mean": 3.532228708267212, "margin_dpo/beta_margin_std": 3.1880156993865967, "margin_dpo/loss_margin_mean": 35.32228469848633, "margin_dpo/margin_mean": 35.32228088378906, "margin_dpo/margin_std": 31.730697631835938, "step": 597 }, { "epoch": 0.8781204111600588, "grad_norm": 62.684593200683594, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -0.6382852792739868, "logits/rejected": -0.6104525923728943, "logps/chosen": -75.68168640136719, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -145.06332397460938, "loss": 0.4532, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14109815657138824, "margin_dpo/beta_margin_grad_std": 0.21213975548744202, "margin_dpo/beta_margin_mean": 3.2456300258636475, "margin_dpo/beta_margin_std": 2.523146629333496, "margin_dpo/loss_margin_mean": 32.456298828125, "margin_dpo/margin_mean": 32.456298828125, "margin_dpo/margin_std": 24.71021270751953, "step": 598 }, { "epoch": 0.8795888399412628, "grad_norm": 58.74440383911133, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -0.5937786102294922, "logits/rejected": -0.5736675262451172, "logps/chosen": -71.58798217773438, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -137.42575073242188, "loss": 0.4549, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15846121311187744, "margin_dpo/beta_margin_grad_std": 0.22350937128067017, "margin_dpo/beta_margin_mean": 3.266195058822632, "margin_dpo/beta_margin_std": 2.5800249576568604, "margin_dpo/loss_margin_mean": 32.661949157714844, "margin_dpo/margin_mean": 32.661949157714844, "margin_dpo/margin_std": 25.785709381103516, "step": 599 }, { "epoch": 0.8810572687224669, "grad_norm": 71.30066680908203, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -0.663406252861023, "logits/rejected": -0.6377668380737305, "logps/chosen": -87.9482421875, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -137.04623413085938, "loss": 0.4868, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15097591280937195, "margin_dpo/beta_margin_grad_std": 0.21298004686832428, "margin_dpo/beta_margin_mean": 3.007636785507202, "margin_dpo/beta_margin_std": 2.5406298637390137, "margin_dpo/loss_margin_mean": 30.07636833190918, "margin_dpo/margin_mean": 30.076370239257812, "margin_dpo/margin_std": 25.37958526611328, "step": 600 }, { "epoch": 0.8810572687224669, "eval_logits/chosen": -0.6200472712516785, "eval_logits/rejected": -0.5939710140228271, "eval_logps/chosen": -105.88009643554688, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -135.36648559570312, "eval_loss": 0.4055388867855072, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.2572847008705139, "eval_margin_dpo/beta_margin_grad_std": 0.2540939152240753, "eval_margin_dpo/beta_margin_mean": 2.173950672149658, "eval_margin_dpo/beta_margin_std": 2.6342239379882812, "eval_margin_dpo/loss_margin_mean": 21.739503860473633, "eval_margin_dpo/margin_mean": 21.739503860473633, "eval_margin_dpo/margin_std": 26.342239379882812, "eval_runtime": 40.1711, "eval_samples_per_second": 58.226, "eval_steps_per_second": 1.842, "step": 600 }, { "epoch": 0.882525697503671, "grad_norm": 82.41748046875, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -0.6234362125396729, "logits/rejected": -0.595230758190155, "logps/chosen": -85.03898620605469, "logps/ref_chosen": -59.13360595703125, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -154.72296142578125, "loss": 0.5251, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15890392661094666, "margin_dpo/beta_margin_grad_std": 0.24525830149650574, "margin_dpo/beta_margin_mean": 3.412665843963623, "margin_dpo/beta_margin_std": 2.932499885559082, "margin_dpo/loss_margin_mean": 34.12665939331055, "margin_dpo/margin_mean": 34.12665939331055, "margin_dpo/margin_std": 28.886859893798828, "step": 601 }, { "epoch": 0.8839941262848752, "grad_norm": 68.01284790039062, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -0.6261130571365356, "logits/rejected": -0.626197338104248, "logps/chosen": -66.78749084472656, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -143.57113647460938, "loss": 0.3531, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12587696313858032, "margin_dpo/beta_margin_grad_std": 0.19796113669872284, "margin_dpo/beta_margin_mean": 3.770860195159912, "margin_dpo/beta_margin_std": 2.8099164962768555, "margin_dpo/loss_margin_mean": 37.70860290527344, "margin_dpo/margin_mean": 37.70860290527344, "margin_dpo/margin_std": 27.594802856445312, "step": 602 }, { "epoch": 0.8854625550660793, "grad_norm": 65.49505615234375, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -0.637772262096405, "logits/rejected": -0.5984662175178528, "logps/chosen": -90.7020263671875, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32560729980469, "logps/rejected": -152.65615844726562, "loss": 0.4719, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1628403216600418, "margin_dpo/beta_margin_grad_std": 0.2238980382680893, "margin_dpo/beta_margin_mean": 3.2043137550354004, "margin_dpo/beta_margin_std": 2.628369092941284, "margin_dpo/loss_margin_mean": 32.04313659667969, "margin_dpo/margin_mean": 32.04313659667969, "margin_dpo/margin_std": 26.230998992919922, "step": 603 }, { "epoch": 0.8869309838472834, "grad_norm": 64.45735931396484, "learning_rate": 1.977362051376158e-08, "logits/chosen": -0.5643373727798462, "logits/rejected": -0.5535662770271301, "logps/chosen": -65.24546813964844, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -146.03567504882812, "loss": 0.4607, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14535552263259888, "margin_dpo/beta_margin_grad_std": 0.2219676375389099, "margin_dpo/beta_margin_mean": 3.5393803119659424, "margin_dpo/beta_margin_std": 2.988723039627075, "margin_dpo/loss_margin_mean": 35.393802642822266, "margin_dpo/margin_mean": 35.393802642822266, "margin_dpo/margin_std": 29.511133193969727, "step": 604 }, { "epoch": 0.8883994126284875, "grad_norm": 62.30309295654297, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -0.6186962127685547, "logits/rejected": -0.603484034538269, "logps/chosen": -90.84162139892578, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -158.584228515625, "loss": 0.4569, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1487942636013031, "margin_dpo/beta_margin_grad_std": 0.1991681158542633, "margin_dpo/beta_margin_mean": 3.168696165084839, "margin_dpo/beta_margin_std": 3.0381813049316406, "margin_dpo/loss_margin_mean": 31.686960220336914, "margin_dpo/margin_mean": 31.68695831298828, "margin_dpo/margin_std": 29.331512451171875, "step": 605 }, { "epoch": 0.8898678414096917, "grad_norm": 44.491546630859375, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -0.5961008071899414, "logits/rejected": -0.564789354801178, "logps/chosen": -76.80615234375, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37147521972656, "logps/rejected": -157.0362548828125, "loss": 0.2954, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12030400335788727, "margin_dpo/beta_margin_grad_std": 0.1559758484363556, "margin_dpo/beta_margin_mean": 3.6677749156951904, "margin_dpo/beta_margin_std": 2.649290084838867, "margin_dpo/loss_margin_mean": 36.67774963378906, "margin_dpo/margin_mean": 36.6777458190918, "margin_dpo/margin_std": 25.4649658203125, "step": 606 }, { "epoch": 0.8913362701908958, "grad_norm": 50.320865631103516, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -0.5963802337646484, "logits/rejected": -0.5623406171798706, "logps/chosen": -79.11578369140625, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -133.24951171875, "loss": 0.3387, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13021884858608246, "margin_dpo/beta_margin_grad_std": 0.17474402487277985, "margin_dpo/beta_margin_mean": 3.244065999984741, "margin_dpo/beta_margin_std": 2.3698348999023438, "margin_dpo/loss_margin_mean": 32.44065856933594, "margin_dpo/margin_mean": 32.44065856933594, "margin_dpo/margin_std": 23.566665649414062, "step": 607 }, { "epoch": 0.8928046989720999, "grad_norm": 58.68054962158203, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -0.6197670698165894, "logits/rejected": -0.5876868963241577, "logps/chosen": -78.17123413085938, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -148.73159790039062, "loss": 0.4835, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15202751755714417, "margin_dpo/beta_margin_grad_std": 0.22783887386322021, "margin_dpo/beta_margin_mean": 3.323781728744507, "margin_dpo/beta_margin_std": 2.665648937225342, "margin_dpo/loss_margin_mean": 33.237815856933594, "margin_dpo/margin_mean": 33.237815856933594, "margin_dpo/margin_std": 26.23067855834961, "step": 608 }, { "epoch": 0.8942731277533039, "grad_norm": 64.48681640625, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -0.642276406288147, "logits/rejected": -0.6062139272689819, "logps/chosen": -81.96639251708984, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -145.61566162109375, "loss": 0.3479, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12766654789447784, "margin_dpo/beta_margin_grad_std": 0.18546564877033234, "margin_dpo/beta_margin_mean": 3.3638622760772705, "margin_dpo/beta_margin_std": 2.390188694000244, "margin_dpo/loss_margin_mean": 33.63862228393555, "margin_dpo/margin_mean": 33.63862228393555, "margin_dpo/margin_std": 23.823345184326172, "step": 609 }, { "epoch": 0.895741556534508, "grad_norm": 74.75220489501953, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.6093329191207886, "logits/rejected": -0.5498037934303284, "logps/chosen": -90.24623107910156, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.4053955078125, "logps/rejected": -140.6365966796875, "loss": 0.4533, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15364539623260498, "margin_dpo/beta_margin_grad_std": 0.21927115321159363, "margin_dpo/beta_margin_mean": 3.2680017948150635, "margin_dpo/beta_margin_std": 2.5850718021392822, "margin_dpo/loss_margin_mean": 32.680015563964844, "margin_dpo/margin_mean": 32.680015563964844, "margin_dpo/margin_std": 24.990657806396484, "step": 610 }, { "epoch": 0.8972099853157122, "grad_norm": 52.49784851074219, "learning_rate": 1.6421423736208e-08, "logits/chosen": -0.6183408498764038, "logits/rejected": -0.580921471118927, "logps/chosen": -74.63272094726562, "logps/ref_chosen": -52.59947204589844, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -144.24195861816406, "loss": 0.3916, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1446484923362732, "margin_dpo/beta_margin_grad_std": 0.1988787204027176, "margin_dpo/beta_margin_mean": 3.587771415710449, "margin_dpo/beta_margin_std": 2.8050875663757324, "margin_dpo/loss_margin_mean": 35.877716064453125, "margin_dpo/margin_mean": 35.877716064453125, "margin_dpo/margin_std": 27.9959774017334, "step": 611 }, { "epoch": 0.8986784140969163, "grad_norm": 44.11368179321289, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -0.6275376081466675, "logits/rejected": -0.5670713782310486, "logps/chosen": -80.20808410644531, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -150.14288330078125, "loss": 0.2722, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10046197474002838, "margin_dpo/beta_margin_grad_std": 0.16786958277225494, "margin_dpo/beta_margin_mean": 4.094614028930664, "margin_dpo/beta_margin_std": 2.759153127670288, "margin_dpo/loss_margin_mean": 40.94614028930664, "margin_dpo/margin_mean": 40.94614028930664, "margin_dpo/margin_std": 27.57101058959961, "step": 612 }, { "epoch": 0.9001468428781204, "grad_norm": 51.012901306152344, "learning_rate": 1.551886292185553e-08, "logits/chosen": -0.6327238082885742, "logits/rejected": -0.6284672021865845, "logps/chosen": -80.63017272949219, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10753631591797, "logps/rejected": -161.78628540039062, "loss": 0.3659, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13130351901054382, "margin_dpo/beta_margin_grad_std": 0.2031707614660263, "margin_dpo/beta_margin_mean": 3.5778555870056152, "margin_dpo/beta_margin_std": 2.7456395626068115, "margin_dpo/loss_margin_mean": 35.77855682373047, "margin_dpo/margin_mean": 35.77855682373047, "margin_dpo/margin_std": 27.143339157104492, "step": 613 }, { "epoch": 0.9016152716593245, "grad_norm": 48.471588134765625, "learning_rate": 1.507684480352292e-08, "logits/chosen": -0.5755459070205688, "logits/rejected": -0.5698869824409485, "logps/chosen": -76.522705078125, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -164.02252197265625, "loss": 0.3031, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11619433760643005, "margin_dpo/beta_margin_grad_std": 0.1717434674501419, "margin_dpo/beta_margin_mean": 3.5759410858154297, "margin_dpo/beta_margin_std": 2.556734800338745, "margin_dpo/loss_margin_mean": 35.7594108581543, "margin_dpo/margin_mean": 35.7594108581543, "margin_dpo/margin_std": 25.549396514892578, "step": 614 }, { "epoch": 0.9030837004405287, "grad_norm": 41.79697799682617, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -0.5839822292327881, "logits/rejected": -0.5518302917480469, "logps/chosen": -86.97941589355469, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -146.57379150390625, "loss": 0.4035, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15672753751277924, "margin_dpo/beta_margin_grad_std": 0.1847524344921112, "margin_dpo/beta_margin_mean": 3.0234153270721436, "margin_dpo/beta_margin_std": 2.3445184230804443, "margin_dpo/loss_margin_mean": 30.234153747558594, "margin_dpo/margin_mean": 30.23415184020996, "margin_dpo/margin_std": 22.966224670410156, "step": 615 }, { "epoch": 0.9045521292217328, "grad_norm": 72.02803039550781, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -0.613810122013092, "logits/rejected": -0.5613222122192383, "logps/chosen": -88.5474853515625, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -130.7073516845703, "loss": 0.5016, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16017135977745056, "margin_dpo/beta_margin_grad_std": 0.230714350938797, "margin_dpo/beta_margin_mean": 3.2592217922210693, "margin_dpo/beta_margin_std": 3.033946990966797, "margin_dpo/loss_margin_mean": 32.59221649169922, "margin_dpo/margin_mean": 32.59221649169922, "margin_dpo/margin_std": 29.501014709472656, "step": 616 }, { "epoch": 0.9060205580029369, "grad_norm": 54.11730194091797, "learning_rate": 1.378797888467345e-08, "logits/chosen": -0.5736366510391235, "logits/rejected": -0.5296716094017029, "logps/chosen": -87.65239715576172, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -118.85501098632812, "loss": 0.3826, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14828212559223175, "margin_dpo/beta_margin_grad_std": 0.17800341546535492, "margin_dpo/beta_margin_mean": 2.9974029064178467, "margin_dpo/beta_margin_std": 2.35481333732605, "margin_dpo/loss_margin_mean": 29.974029541015625, "margin_dpo/margin_mean": 29.974029541015625, "margin_dpo/margin_std": 23.434463500976562, "step": 617 }, { "epoch": 0.9074889867841409, "grad_norm": 67.99271392822266, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -0.6390504837036133, "logits/rejected": -0.610953152179718, "logps/chosen": -90.81625366210938, "logps/ref_chosen": -67.10135650634766, "logps/ref_rejected": -92.15339660644531, "logps/rejected": -146.72813415527344, "loss": 0.4624, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16537515819072723, "margin_dpo/beta_margin_grad_std": 0.21399806439876556, "margin_dpo/beta_margin_mean": 3.085983991622925, "margin_dpo/beta_margin_std": 2.6704392433166504, "margin_dpo/loss_margin_mean": 30.859838485717773, "margin_dpo/margin_mean": 30.859840393066406, "margin_dpo/margin_std": 26.370765686035156, "step": 618 }, { "epoch": 0.908957415565345, "grad_norm": 55.240272521972656, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -0.5689994096755981, "logits/rejected": -0.5356103777885437, "logps/chosen": -79.01873779296875, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -149.14964294433594, "loss": 0.4702, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16712483763694763, "margin_dpo/beta_margin_grad_std": 0.21881355345249176, "margin_dpo/beta_margin_mean": 3.2923738956451416, "margin_dpo/beta_margin_std": 3.01572847366333, "margin_dpo/loss_margin_mean": 32.92374038696289, "margin_dpo/margin_mean": 32.92374038696289, "margin_dpo/margin_std": 29.776756286621094, "step": 619 }, { "epoch": 0.9104258443465492, "grad_norm": 34.842933654785156, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -0.6340548396110535, "logits/rejected": -0.5965070724487305, "logps/chosen": -79.86566162109375, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -134.0952911376953, "loss": 0.2579, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10915657132863998, "margin_dpo/beta_margin_grad_std": 0.13496600091457367, "margin_dpo/beta_margin_mean": 3.561638355255127, "margin_dpo/beta_margin_std": 2.65966534614563, "margin_dpo/loss_margin_mean": 35.61638259887695, "margin_dpo/margin_mean": 35.61638259887695, "margin_dpo/margin_std": 25.934829711914062, "step": 620 }, { "epoch": 0.9118942731277533, "grad_norm": 40.03609848022461, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -0.6073925495147705, "logits/rejected": -0.5800847411155701, "logps/chosen": -72.59913635253906, "logps/ref_chosen": -53.933753967285156, "logps/ref_rejected": -88.36952209472656, "logps/rejected": -143.29660034179688, "loss": 0.3092, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10899462550878525, "margin_dpo/beta_margin_grad_std": 0.1772289127111435, "margin_dpo/beta_margin_mean": 3.6261699199676514, "margin_dpo/beta_margin_std": 2.691709041595459, "margin_dpo/loss_margin_mean": 36.26169967651367, "margin_dpo/margin_mean": 36.26169967651367, "margin_dpo/margin_std": 26.822023391723633, "step": 621 }, { "epoch": 0.9133627019089574, "grad_norm": 48.39630889892578, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.5869364142417908, "logits/rejected": -0.5320132970809937, "logps/chosen": -82.65914916992188, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -144.71177673339844, "loss": 0.3854, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1378306895494461, "margin_dpo/beta_margin_grad_std": 0.1983867883682251, "margin_dpo/beta_margin_mean": 3.6819705963134766, "margin_dpo/beta_margin_std": 2.8990466594696045, "margin_dpo/loss_margin_mean": 36.819705963134766, "margin_dpo/margin_mean": 36.819705963134766, "margin_dpo/margin_std": 28.56911277770996, "step": 622 }, { "epoch": 0.9148311306901615, "grad_norm": 73.08351135253906, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -0.6507315635681152, "logits/rejected": -0.6161798238754272, "logps/chosen": -88.72175598144531, "logps/ref_chosen": -64.15696716308594, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -140.02056884765625, "loss": 0.5541, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18612176179885864, "margin_dpo/beta_margin_grad_std": 0.23363880813121796, "margin_dpo/beta_margin_mean": 3.037273645401001, "margin_dpo/beta_margin_std": 2.930415630340576, "margin_dpo/loss_margin_mean": 30.37273597717285, "margin_dpo/margin_mean": 30.37273406982422, "margin_dpo/margin_std": 28.88761329650879, "step": 623 }, { "epoch": 0.9162995594713657, "grad_norm": 71.18040466308594, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -0.6796859502792358, "logits/rejected": -0.6095322966575623, "logps/chosen": -94.83811950683594, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -157.37045288085938, "loss": 0.4965, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14844343066215515, "margin_dpo/beta_margin_grad_std": 0.25005990266799927, "margin_dpo/beta_margin_mean": 3.7318942546844482, "margin_dpo/beta_margin_std": 2.772662878036499, "margin_dpo/loss_margin_mean": 37.31894302368164, "margin_dpo/margin_mean": 37.318939208984375, "margin_dpo/margin_std": 27.622631072998047, "step": 624 }, { "epoch": 0.9177679882525698, "grad_norm": 49.115333557128906, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -0.59247887134552, "logits/rejected": -0.5529348850250244, "logps/chosen": -81.66363525390625, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -144.93325805664062, "loss": 0.3591, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13468137383460999, "margin_dpo/beta_margin_grad_std": 0.18700142204761505, "margin_dpo/beta_margin_mean": 3.552130937576294, "margin_dpo/beta_margin_std": 2.8531718254089355, "margin_dpo/loss_margin_mean": 35.52130889892578, "margin_dpo/margin_mean": 35.52130889892578, "margin_dpo/margin_std": 28.529512405395508, "step": 625 }, { "epoch": 0.9192364170337739, "grad_norm": 63.880088806152344, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.5680443644523621, "logits/rejected": -0.5336043834686279, "logps/chosen": -98.9983139038086, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.18872833251953, "logps/rejected": -153.16671752929688, "loss": 0.5253, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18730950355529785, "margin_dpo/beta_margin_grad_std": 0.21887990832328796, "margin_dpo/beta_margin_mean": 3.009228467941284, "margin_dpo/beta_margin_std": 2.89654541015625, "margin_dpo/loss_margin_mean": 30.09228515625, "margin_dpo/margin_mean": 30.09228515625, "margin_dpo/margin_std": 28.44098472595215, "step": 626 }, { "epoch": 0.920704845814978, "grad_norm": 78.66019439697266, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.579108476638794, "logits/rejected": -0.587154746055603, "logps/chosen": -69.19841003417969, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -162.16537475585938, "loss": 0.6048, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18939301371574402, "margin_dpo/beta_margin_grad_std": 0.2481917440891266, "margin_dpo/beta_margin_mean": 3.3955633640289307, "margin_dpo/beta_margin_std": 3.4090704917907715, "margin_dpo/loss_margin_mean": 33.95563507080078, "margin_dpo/margin_mean": 33.95563507080078, "margin_dpo/margin_std": 34.049468994140625, "step": 627 }, { "epoch": 0.922173274596182, "grad_norm": 41.49999237060547, "learning_rate": 9.543589206795238e-09, "logits/chosen": -0.6199311017990112, "logits/rejected": -0.6005183458328247, "logps/chosen": -82.25130462646484, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -159.23948669433594, "loss": 0.3001, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12019230425357819, "margin_dpo/beta_margin_grad_std": 0.15883654356002808, "margin_dpo/beta_margin_mean": 3.5616455078125, "margin_dpo/beta_margin_std": 2.615797758102417, "margin_dpo/loss_margin_mean": 35.616455078125, "margin_dpo/margin_mean": 35.616455078125, "margin_dpo/margin_std": 25.80486297607422, "step": 628 }, { "epoch": 0.9236417033773862, "grad_norm": 62.908477783203125, "learning_rate": 9.19555885822887e-09, "logits/chosen": -0.6567898392677307, "logits/rejected": -0.6142420768737793, "logps/chosen": -86.42594909667969, "logps/ref_chosen": -64.21353912353516, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -145.6768798828125, "loss": 0.4054, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1407935917377472, "margin_dpo/beta_margin_grad_std": 0.19307489693164825, "margin_dpo/beta_margin_mean": 3.1810803413391113, "margin_dpo/beta_margin_std": 2.5537304878234863, "margin_dpo/loss_margin_mean": 31.81080436706543, "margin_dpo/margin_mean": 31.810806274414062, "margin_dpo/margin_std": 25.02639389038086, "step": 629 }, { "epoch": 0.9251101321585903, "grad_norm": 60.66549301147461, "learning_rate": 8.85387393063622e-09, "logits/chosen": -0.6706698536872864, "logits/rejected": -0.6243743896484375, "logps/chosen": -79.63174438476562, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -134.34188842773438, "loss": 0.461, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1645456999540329, "margin_dpo/beta_margin_grad_std": 0.20726469159126282, "margin_dpo/beta_margin_mean": 3.0402843952178955, "margin_dpo/beta_margin_std": 2.556612014770508, "margin_dpo/loss_margin_mean": 30.402841567993164, "margin_dpo/margin_mean": 30.402843475341797, "margin_dpo/margin_std": 25.565155029296875, "step": 630 }, { "epoch": 0.9265785609397944, "grad_norm": 94.32537078857422, "learning_rate": 8.518543427732949e-09, "logits/chosen": -0.6151013374328613, "logits/rejected": -0.5717021822929382, "logps/chosen": -83.84978485107422, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95157623291016, "logps/rejected": -133.63461303710938, "loss": 0.7356, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1976294070482254, "margin_dpo/beta_margin_grad_std": 0.26410892605781555, "margin_dpo/beta_margin_mean": 2.8286869525909424, "margin_dpo/beta_margin_std": 2.959045886993408, "margin_dpo/loss_margin_mean": 28.286867141723633, "margin_dpo/margin_mean": 28.286869049072266, "margin_dpo/margin_std": 29.41876220703125, "step": 631 }, { "epoch": 0.9280469897209985, "grad_norm": 86.42517852783203, "learning_rate": 8.189576185789637e-09, "logits/chosen": -0.619070291519165, "logits/rejected": -0.5838553309440613, "logps/chosen": -85.71180725097656, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -143.08697509765625, "loss": 0.7093, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16710862517356873, "margin_dpo/beta_margin_grad_std": 0.26828470826148987, "margin_dpo/beta_margin_mean": 3.256655216217041, "margin_dpo/beta_margin_std": 3.002157211303711, "margin_dpo/loss_margin_mean": 32.566551208496094, "margin_dpo/margin_mean": 32.566551208496094, "margin_dpo/margin_std": 29.249189376831055, "step": 632 }, { "epoch": 0.9295154185022027, "grad_norm": 60.10581970214844, "learning_rate": 7.866980873399015e-09, "logits/chosen": -0.6361432075500488, "logits/rejected": -0.6225095987319946, "logps/chosen": -80.6368408203125, "logps/ref_chosen": -57.278167724609375, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -142.36219787597656, "loss": 0.5499, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19326050579547882, "margin_dpo/beta_margin_grad_std": 0.22396619617938995, "margin_dpo/beta_margin_mean": 2.741957902908325, "margin_dpo/beta_margin_std": 2.554403066635132, "margin_dpo/loss_margin_mean": 27.419578552246094, "margin_dpo/margin_mean": 27.419578552246094, "margin_dpo/margin_std": 24.602121353149414, "step": 633 }, { "epoch": 0.9309838472834068, "grad_norm": 73.96012878417969, "learning_rate": 7.550765991247654e-09, "logits/chosen": -0.5560423135757446, "logits/rejected": -0.538284420967102, "logps/chosen": -93.19425964355469, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12565612792969, "logps/rejected": -161.61175537109375, "loss": 0.6531, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20842374861240387, "margin_dpo/beta_margin_grad_std": 0.24925780296325684, "margin_dpo/beta_margin_mean": 2.791081190109253, "margin_dpo/beta_margin_std": 2.91212797164917, "margin_dpo/loss_margin_mean": 27.910810470581055, "margin_dpo/margin_mean": 27.910812377929688, "margin_dpo/margin_std": 29.05972671508789, "step": 634 }, { "epoch": 0.9324522760646109, "grad_norm": 50.861839294433594, "learning_rate": 7.240939871891699e-09, "logits/chosen": -0.608803391456604, "logits/rejected": -0.5592911243438721, "logps/chosen": -96.6619873046875, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -133.83990478515625, "loss": 0.409, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15432217717170715, "margin_dpo/beta_margin_grad_std": 0.18803834915161133, "margin_dpo/beta_margin_mean": 2.8632962703704834, "margin_dpo/beta_margin_std": 2.2558207511901855, "margin_dpo/loss_margin_mean": 28.63296127319336, "margin_dpo/margin_mean": 28.63296127319336, "margin_dpo/margin_std": 22.48883628845215, "step": 635 }, { "epoch": 0.933920704845815, "grad_norm": 47.65840530395508, "learning_rate": 6.937510679537628e-09, "logits/chosen": -0.5629330277442932, "logits/rejected": -0.5346908569335938, "logps/chosen": -82.30425262451172, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -137.65878295898438, "loss": 0.4012, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13953568041324615, "margin_dpo/beta_margin_grad_std": 0.21565653383731842, "margin_dpo/beta_margin_mean": 3.3004610538482666, "margin_dpo/beta_margin_std": 2.40425968170166, "margin_dpo/loss_margin_mean": 33.00461196899414, "margin_dpo/margin_mean": 33.004608154296875, "margin_dpo/margin_std": 23.834693908691406, "step": 636 }, { "epoch": 0.9353891336270191, "grad_norm": 53.40937042236328, "learning_rate": 6.640486409826785e-09, "logits/chosen": -0.5897486209869385, "logits/rejected": -0.5671026110649109, "logps/chosen": -73.21141815185547, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -154.89999389648438, "loss": 0.3634, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13438232243061066, "margin_dpo/beta_margin_grad_std": 0.18966805934906006, "margin_dpo/beta_margin_mean": 3.2936134338378906, "margin_dpo/beta_margin_std": 2.5701606273651123, "margin_dpo/loss_margin_mean": 32.936134338378906, "margin_dpo/margin_mean": 32.936134338378906, "margin_dpo/margin_std": 25.349170684814453, "step": 637 }, { "epoch": 0.9368575624082232, "grad_norm": 41.96897888183594, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.5449614524841309, "logits/rejected": -0.49521952867507935, "logps/chosen": -78.70539855957031, "logps/ref_chosen": -58.156646728515625, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -136.9318084716797, "loss": 0.3245, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12071166932582855, "margin_dpo/beta_margin_grad_std": 0.17367184162139893, "margin_dpo/beta_margin_mean": 3.7081568241119385, "margin_dpo/beta_margin_std": 2.8503551483154297, "margin_dpo/loss_margin_mean": 37.08156967163086, "margin_dpo/margin_mean": 37.08156967163086, "margin_dpo/margin_std": 27.137168884277344, "step": 638 }, { "epoch": 0.9383259911894273, "grad_norm": 57.53899383544922, "learning_rate": 6.065683776815933e-09, "logits/chosen": -0.58185875415802, "logits/rejected": -0.5182079672813416, "logps/chosen": -97.73635864257812, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -130.7800750732422, "loss": 0.4397, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14476662874221802, "margin_dpo/beta_margin_grad_std": 0.199687659740448, "margin_dpo/beta_margin_mean": 3.109198570251465, "margin_dpo/beta_margin_std": 2.4722304344177246, "margin_dpo/loss_margin_mean": 31.09198570251465, "margin_dpo/margin_mean": 31.09198760986328, "margin_dpo/margin_std": 24.611787796020508, "step": 639 }, { "epoch": 0.9397944199706314, "grad_norm": 44.61709213256836, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.5778528451919556, "logits/rejected": -0.5412660241127014, "logps/chosen": -78.43016815185547, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -167.73947143554688, "loss": 0.3066, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12420199811458588, "margin_dpo/beta_margin_grad_std": 0.15969912707805634, "margin_dpo/beta_margin_mean": 3.6843512058258057, "margin_dpo/beta_margin_std": 3.017540454864502, "margin_dpo/loss_margin_mean": 36.84351348876953, "margin_dpo/margin_mean": 36.84351348876953, "margin_dpo/margin_std": 29.767667770385742, "step": 640 }, { "epoch": 0.9412628487518355, "grad_norm": 51.515228271484375, "learning_rate": 5.516592558795746e-09, "logits/chosen": -0.6603978872299194, "logits/rejected": -0.6059365272521973, "logps/chosen": -88.82362365722656, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -142.99404907226562, "loss": 0.3746, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1466035395860672, "margin_dpo/beta_margin_grad_std": 0.17220094799995422, "margin_dpo/beta_margin_mean": 3.217498779296875, "margin_dpo/beta_margin_std": 3.0656890869140625, "margin_dpo/loss_margin_mean": 32.17498779296875, "margin_dpo/margin_mean": 32.17498779296875, "margin_dpo/margin_std": 29.780851364135742, "step": 641 }, { "epoch": 0.9427312775330396, "grad_norm": 78.2542724609375, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.5912165641784668, "logits/rejected": -0.5562861561775208, "logps/chosen": -90.29745483398438, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -170.28448486328125, "loss": 0.4822, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15366876125335693, "margin_dpo/beta_margin_grad_std": 0.22277072072029114, "margin_dpo/beta_margin_mean": 3.5434229373931885, "margin_dpo/beta_margin_std": 3.0842573642730713, "margin_dpo/loss_margin_mean": 35.43423080444336, "margin_dpo/margin_mean": 35.434226989746094, "margin_dpo/margin_std": 30.440698623657227, "step": 642 }, { "epoch": 0.9441997063142438, "grad_norm": 51.46054458618164, "learning_rate": 4.993270631642038e-09, "logits/chosen": -0.6483656764030457, "logits/rejected": -0.62122642993927, "logps/chosen": -71.25507354736328, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -137.56893920898438, "loss": 0.4257, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14452366530895233, "margin_dpo/beta_margin_grad_std": 0.19863076508045197, "margin_dpo/beta_margin_mean": 3.0795533657073975, "margin_dpo/beta_margin_std": 2.446993350982666, "margin_dpo/loss_margin_mean": 30.795534133911133, "margin_dpo/margin_mean": 30.795534133911133, "margin_dpo/margin_std": 24.044445037841797, "step": 643 }, { "epoch": 0.9456681350954479, "grad_norm": 75.44609069824219, "learning_rate": 4.741290495811873e-09, "logits/chosen": -0.6009418964385986, "logits/rejected": -0.57252037525177, "logps/chosen": -79.76002502441406, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -138.11033630371094, "loss": 0.5657, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1847480684518814, "margin_dpo/beta_margin_grad_std": 0.23598462343215942, "margin_dpo/beta_margin_mean": 3.0231282711029053, "margin_dpo/beta_margin_std": 2.8853414058685303, "margin_dpo/loss_margin_mean": 30.231281280517578, "margin_dpo/margin_mean": 30.231281280517578, "margin_dpo/margin_std": 28.730857849121094, "step": 644 }, { "epoch": 0.947136563876652, "grad_norm": 70.22451782226562, "learning_rate": 4.495773155069299e-09, "logits/chosen": -0.5856224298477173, "logits/rejected": -0.5652365684509277, "logps/chosen": -79.71002197265625, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -150.6129913330078, "loss": 0.544, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19084730744361877, "margin_dpo/beta_margin_grad_std": 0.22894078493118286, "margin_dpo/beta_margin_mean": 2.899817705154419, "margin_dpo/beta_margin_std": 2.835707187652588, "margin_dpo/loss_margin_mean": 28.99817657470703, "margin_dpo/margin_mean": 28.99817657470703, "margin_dpo/margin_std": 27.904760360717773, "step": 645 }, { "epoch": 0.9486049926578561, "grad_norm": 51.758888244628906, "learning_rate": 4.256725079024553e-09, "logits/chosen": -0.6095120906829834, "logits/rejected": -0.5594819784164429, "logps/chosen": -83.82559967041016, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -133.1284637451172, "loss": 0.316, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11909312754869461, "margin_dpo/beta_margin_grad_std": 0.17143264412879944, "margin_dpo/beta_margin_mean": 3.307283878326416, "margin_dpo/beta_margin_std": 2.259632110595703, "margin_dpo/loss_margin_mean": 33.072837829589844, "margin_dpo/margin_mean": 33.072837829589844, "margin_dpo/margin_std": 22.390499114990234, "step": 646 }, { "epoch": 0.9500734214390602, "grad_norm": 81.30612182617188, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.5496389865875244, "logits/rejected": -0.5257160067558289, "logps/chosen": -78.84927368164062, "logps/ref_chosen": -54.852413177490234, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -150.27786254882812, "loss": 0.5032, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16137224435806274, "margin_dpo/beta_margin_grad_std": 0.24134768545627594, "margin_dpo/beta_margin_mean": 3.2761595249176025, "margin_dpo/beta_margin_std": 2.728703260421753, "margin_dpo/loss_margin_mean": 32.761592864990234, "margin_dpo/margin_mean": 32.761592864990234, "margin_dpo/margin_std": 26.8262939453125, "step": 647 }, { "epoch": 0.9515418502202643, "grad_norm": 47.32956314086914, "learning_rate": 3.798061746947995e-09, "logits/chosen": -0.6139056086540222, "logits/rejected": -0.6051241159439087, "logps/chosen": -73.89356231689453, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.71279907226562, "logps/rejected": -158.77578735351562, "loss": 0.3728, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.138786181807518, "margin_dpo/beta_margin_grad_std": 0.19550208747386932, "margin_dpo/beta_margin_mean": 4.034091472625732, "margin_dpo/beta_margin_std": 3.443060874938965, "margin_dpo/loss_margin_mean": 40.340911865234375, "margin_dpo/margin_mean": 40.340911865234375, "margin_dpo/margin_std": 34.24688720703125, "step": 648 }, { "epoch": 0.9530102790014684, "grad_norm": 50.813629150390625, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -0.6515902876853943, "logits/rejected": -0.6201357841491699, "logps/chosen": -83.07283020019531, "logps/ref_chosen": -62.4803466796875, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -129.16033935546875, "loss": 0.536, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1962561458349228, "margin_dpo/beta_margin_grad_std": 0.21189990639686584, "margin_dpo/beta_margin_mean": 2.849066734313965, "margin_dpo/beta_margin_std": 2.8495798110961914, "margin_dpo/loss_margin_mean": 28.49066734313965, "margin_dpo/margin_mean": 28.49066925048828, "margin_dpo/margin_std": 28.419557571411133, "step": 649 }, { "epoch": 0.9544787077826725, "grad_norm": 59.442115783691406, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -0.5737979412078857, "logits/rejected": -0.5637534260749817, "logps/chosen": -80.34698486328125, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -159.14297485351562, "loss": 0.3545, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13270466029644012, "margin_dpo/beta_margin_grad_std": 0.1840543895959854, "margin_dpo/beta_margin_mean": 3.662398338317871, "margin_dpo/beta_margin_std": 2.9485061168670654, "margin_dpo/loss_margin_mean": 36.62398147583008, "margin_dpo/margin_mean": 36.623985290527344, "margin_dpo/margin_std": 28.712535858154297, "step": 650 }, { "epoch": 0.9559471365638766, "grad_norm": 38.10145950317383, "learning_rate": 3.158738163478475e-09, "logits/chosen": -0.653481125831604, "logits/rejected": -0.6552349328994751, "logps/chosen": -62.947837829589844, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.9579086303711, "logps/rejected": -155.12380981445312, "loss": 0.3146, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12548606097698212, "margin_dpo/beta_margin_grad_std": 0.1639980524778366, "margin_dpo/beta_margin_mean": 3.564352035522461, "margin_dpo/beta_margin_std": 2.6986141204833984, "margin_dpo/loss_margin_mean": 35.64352035522461, "margin_dpo/margin_mean": 35.643516540527344, "margin_dpo/margin_std": 26.896413803100586, "step": 651 }, { "epoch": 0.9574155653450808, "grad_norm": 39.05808639526367, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -0.6412711143493652, "logits/rejected": -0.617784857749939, "logps/chosen": -78.93205261230469, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -163.570556640625, "loss": 0.3386, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13532721996307373, "margin_dpo/beta_margin_grad_std": 0.16726936399936676, "margin_dpo/beta_margin_mean": 3.544752597808838, "margin_dpo/beta_margin_std": 2.8201441764831543, "margin_dpo/loss_margin_mean": 35.44752502441406, "margin_dpo/margin_mean": 35.44752502441406, "margin_dpo/margin_std": 28.104263305664062, "step": 652 }, { "epoch": 0.9588839941262849, "grad_norm": 51.56984329223633, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -0.6192601919174194, "logits/rejected": -0.5976792573928833, "logps/chosen": -84.49002075195312, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -162.29043579101562, "loss": 0.3249, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12032375484704971, "margin_dpo/beta_margin_grad_std": 0.18257243931293488, "margin_dpo/beta_margin_mean": 3.566375970840454, "margin_dpo/beta_margin_std": 2.590984344482422, "margin_dpo/loss_margin_mean": 35.66375732421875, "margin_dpo/margin_mean": 35.66375732421875, "margin_dpo/margin_std": 25.806888580322266, "step": 653 }, { "epoch": 0.960352422907489, "grad_norm": 72.21066284179688, "learning_rate": 2.577954022936174e-09, "logits/chosen": -0.6111325025558472, "logits/rejected": -0.6062880754470825, "logps/chosen": -86.98482513427734, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -153.51400756835938, "loss": 0.5285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17636682093143463, "margin_dpo/beta_margin_grad_std": 0.2301536500453949, "margin_dpo/beta_margin_mean": 2.948350667953491, "margin_dpo/beta_margin_std": 2.8891336917877197, "margin_dpo/loss_margin_mean": 29.48350715637207, "margin_dpo/margin_mean": 29.483509063720703, "margin_dpo/margin_std": 28.753616333007812, "step": 654 }, { "epoch": 0.9618208516886931, "grad_norm": 72.21392059326172, "learning_rate": 2.397392281198729e-09, "logits/chosen": -0.6073825359344482, "logits/rejected": -0.6081333160400391, "logps/chosen": -70.99528503417969, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -150.2451629638672, "loss": 0.5089, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18340007960796356, "margin_dpo/beta_margin_grad_std": 0.217272087931633, "margin_dpo/beta_margin_mean": 3.0534873008728027, "margin_dpo/beta_margin_std": 2.986149311065674, "margin_dpo/loss_margin_mean": 30.53487205505371, "margin_dpo/margin_mean": 30.534870147705078, "margin_dpo/margin_std": 29.086572647094727, "step": 655 }, { "epoch": 0.9632892804698973, "grad_norm": 40.71949768066406, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.5212767124176025, "logits/rejected": -0.5257933139801025, "logps/chosen": -73.37840270996094, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -176.44357299804688, "loss": 0.2412, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0933179035782814, "margin_dpo/beta_margin_grad_std": 0.15993143618106842, "margin_dpo/beta_margin_mean": 4.193995952606201, "margin_dpo/beta_margin_std": 2.617056369781494, "margin_dpo/loss_margin_mean": 41.93996047973633, "margin_dpo/margin_mean": 41.93996047973633, "margin_dpo/margin_std": 25.561412811279297, "step": 656 }, { "epoch": 0.9647577092511013, "grad_norm": 45.717838287353516, "learning_rate": 2.055847060721566e-09, "logits/chosen": -0.6373677849769592, "logits/rejected": -0.6168010234832764, "logps/chosen": -68.62776184082031, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -157.26351928710938, "loss": 0.3432, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11700256913900375, "margin_dpo/beta_margin_grad_std": 0.1847466230392456, "margin_dpo/beta_margin_mean": 3.7421414852142334, "margin_dpo/beta_margin_std": 2.8833959102630615, "margin_dpo/loss_margin_mean": 37.42141342163086, "margin_dpo/margin_mean": 37.42141342163086, "margin_dpo/margin_std": 28.687862396240234, "step": 657 }, { "epoch": 0.9662261380323054, "grad_norm": 59.321533203125, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -0.6310451030731201, "logits/rejected": -0.5929208993911743, "logps/chosen": -86.52423095703125, "logps/ref_chosen": -60.958213806152344, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -156.62518310546875, "loss": 0.4487, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14949670433998108, "margin_dpo/beta_margin_grad_std": 0.2181350290775299, "margin_dpo/beta_margin_mean": 3.5119664669036865, "margin_dpo/beta_margin_std": 3.0297350883483887, "margin_dpo/loss_margin_mean": 35.11966323852539, "margin_dpo/margin_mean": 35.119667053222656, "margin_dpo/margin_std": 29.735076904296875, "step": 658 }, { "epoch": 0.9676945668135095, "grad_norm": 57.14666748046875, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.6056843996047974, "logits/rejected": -0.540166974067688, "logps/chosen": -96.09359741210938, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -141.1275634765625, "loss": 0.5047, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16867277026176453, "margin_dpo/beta_margin_grad_std": 0.23597054183483124, "margin_dpo/beta_margin_mean": 3.4305975437164307, "margin_dpo/beta_margin_std": 2.9324827194213867, "margin_dpo/loss_margin_mean": 34.305973052978516, "margin_dpo/margin_mean": 34.30597686767578, "margin_dpo/margin_std": 29.283281326293945, "step": 659 }, { "epoch": 0.9691629955947136, "grad_norm": 49.01050567626953, "learning_rate": 1.592541096695571e-09, "logits/chosen": -0.6273288130760193, "logits/rejected": -0.5808557271957397, "logps/chosen": -80.31892395019531, "logps/ref_chosen": -59.047882080078125, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -135.07073974609375, "loss": 0.2917, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1133999153971672, "margin_dpo/beta_margin_grad_std": 0.1706753671169281, "margin_dpo/beta_margin_mean": 3.7839651107788086, "margin_dpo/beta_margin_std": 2.7794392108917236, "margin_dpo/loss_margin_mean": 37.83964920043945, "margin_dpo/margin_mean": 37.83965301513672, "margin_dpo/margin_std": 27.737031936645508, "step": 660 }, { "epoch": 0.9706314243759178, "grad_norm": 64.96249389648438, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -0.6019885540008545, "logits/rejected": -0.5567299127578735, "logps/chosen": -71.31771850585938, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -141.6163787841797, "loss": 0.4523, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14642944931983948, "margin_dpo/beta_margin_grad_std": 0.2149476855993271, "margin_dpo/beta_margin_mean": 3.4966940879821777, "margin_dpo/beta_margin_std": 3.0865559577941895, "margin_dpo/loss_margin_mean": 34.966941833496094, "margin_dpo/margin_mean": 34.966941833496094, "margin_dpo/margin_std": 29.39708709716797, "step": 661 }, { "epoch": 0.9720998531571219, "grad_norm": 50.99778747558594, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -0.6079974174499512, "logits/rejected": -0.5556979775428772, "logps/chosen": -93.49765014648438, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -144.09814453125, "loss": 0.378, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14259321987628937, "margin_dpo/beta_margin_grad_std": 0.17750491201877594, "margin_dpo/beta_margin_mean": 3.080561399459839, "margin_dpo/beta_margin_std": 2.571131944656372, "margin_dpo/loss_margin_mean": 30.805612564086914, "margin_dpo/margin_mean": 30.80561065673828, "margin_dpo/margin_std": 25.51202964782715, "step": 662 }, { "epoch": 0.973568281938326, "grad_norm": 39.297733306884766, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -0.6374907493591309, "logits/rejected": -0.6157968044281006, "logps/chosen": -83.19400024414062, "logps/ref_chosen": -64.87891387939453, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -165.287353515625, "loss": 0.3262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12396994978189468, "margin_dpo/beta_margin_grad_std": 0.15943719446659088, "margin_dpo/beta_margin_mean": 3.304689407348633, "margin_dpo/beta_margin_std": 2.461198568344116, "margin_dpo/loss_margin_mean": 33.04689407348633, "margin_dpo/margin_mean": 33.046897888183594, "margin_dpo/margin_std": 24.47772216796875, "step": 663 }, { "epoch": 0.9750367107195301, "grad_norm": 69.85308074951172, "learning_rate": 1.066455926241383e-09, "logits/chosen": -0.5776158571243286, "logits/rejected": -0.5483744144439697, "logps/chosen": -84.34225463867188, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -166.12283325195312, "loss": 0.4288, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11845803260803223, "margin_dpo/beta_margin_grad_std": 0.1965658962726593, "margin_dpo/beta_margin_mean": 3.7147321701049805, "margin_dpo/beta_margin_std": 2.736865997314453, "margin_dpo/loss_margin_mean": 37.14732360839844, "margin_dpo/margin_mean": 37.14732360839844, "margin_dpo/margin_std": 26.97930145263672, "step": 664 }, { "epoch": 0.9765051395007343, "grad_norm": 44.827796936035156, "learning_rate": 9.513254770636137e-10, "logits/chosen": -0.6395413279533386, "logits/rejected": -0.5962468385696411, "logps/chosen": -81.45133972167969, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.8088150024414, "logps/rejected": -137.22821044921875, "loss": 0.3524, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13737604022026062, "margin_dpo/beta_margin_grad_std": 0.17597481608390808, "margin_dpo/beta_margin_mean": 3.153219223022461, "margin_dpo/beta_margin_std": 2.3784492015838623, "margin_dpo/loss_margin_mean": 31.53219223022461, "margin_dpo/margin_mean": 31.53219223022461, "margin_dpo/margin_std": 23.21342658996582, "step": 665 }, { "epoch": 0.9779735682819384, "grad_norm": 61.68048858642578, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.6096721887588501, "logits/rejected": -0.5720229148864746, "logps/chosen": -88.06729125976562, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.89163208007812, "logps/rejected": -154.82492065429688, "loss": 0.4262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13251623511314392, "margin_dpo/beta_margin_grad_std": 0.21360599994659424, "margin_dpo/beta_margin_mean": 3.5285956859588623, "margin_dpo/beta_margin_std": 2.662677049636841, "margin_dpo/loss_margin_mean": 35.28595733642578, "margin_dpo/margin_mean": 35.28595733642578, "margin_dpo/margin_std": 26.031997680664062, "step": 666 }, { "epoch": 0.9794419970631424, "grad_norm": 58.16268539428711, "learning_rate": 7.407554321417764e-10, "logits/chosen": -0.5887176990509033, "logits/rejected": -0.536880612373352, "logps/chosen": -94.41732025146484, "logps/ref_chosen": -69.27703094482422, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -147.3884735107422, "loss": 0.3242, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12313113361597061, "margin_dpo/beta_margin_grad_std": 0.16970713436603546, "margin_dpo/beta_margin_mean": 3.441269636154175, "margin_dpo/beta_margin_std": 2.472762107849121, "margin_dpo/loss_margin_mean": 34.412696838378906, "margin_dpo/margin_mean": 34.412696838378906, "margin_dpo/margin_std": 24.47201919555664, "step": 667 }, { "epoch": 0.9809104258443465, "grad_norm": 69.86036682128906, "learning_rate": 6.453213851142225e-10, "logits/chosen": -0.6267153024673462, "logits/rejected": -0.5883671641349792, "logps/chosen": -96.0662841796875, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905181884766, "logps/rejected": -160.34828186035156, "loss": 0.4507, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1533161848783493, "margin_dpo/beta_margin_grad_std": 0.22066539525985718, "margin_dpo/beta_margin_mean": 3.314695358276367, "margin_dpo/beta_margin_std": 2.6596431732177734, "margin_dpo/loss_margin_mean": 33.146949768066406, "margin_dpo/margin_mean": 33.146949768066406, "margin_dpo/margin_std": 25.9494571685791, "step": 668 }, { "epoch": 0.9823788546255506, "grad_norm": 68.40164947509766, "learning_rate": 5.564580657695939e-10, "logits/chosen": -0.6119288802146912, "logits/rejected": -0.5665886998176575, "logps/chosen": -65.71624755859375, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -135.82337951660156, "loss": 0.5021, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15446214377880096, "margin_dpo/beta_margin_grad_std": 0.23781202733516693, "margin_dpo/beta_margin_mean": 3.8299198150634766, "margin_dpo/beta_margin_std": 3.285043716430664, "margin_dpo/loss_margin_mean": 38.299198150634766, "margin_dpo/margin_mean": 38.299198150634766, "margin_dpo/margin_std": 32.602203369140625, "step": 669 }, { "epoch": 0.9838472834067548, "grad_norm": 44.809444427490234, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.5694983005523682, "logits/rejected": -0.5347045660018921, "logps/chosen": -83.17808532714844, "logps/ref_chosen": -62.34575653076172, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -156.79319763183594, "loss": 0.2702, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10760509222745895, "margin_dpo/beta_margin_grad_std": 0.15676988661289215, "margin_dpo/beta_margin_mean": 3.9020321369171143, "margin_dpo/beta_margin_std": 2.6109135150909424, "margin_dpo/loss_margin_mean": 39.020320892333984, "margin_dpo/margin_mean": 39.02031707763672, "margin_dpo/margin_std": 25.866138458251953, "step": 670 }, { "epoch": 0.9853157121879589, "grad_norm": 48.393497467041016, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -0.5919187068939209, "logits/rejected": -0.5590361952781677, "logps/chosen": -72.3186264038086, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -143.67893981933594, "loss": 0.3555, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13741746544837952, "margin_dpo/beta_margin_grad_std": 0.1758362054824829, "margin_dpo/beta_margin_mean": 3.554109811782837, "margin_dpo/beta_margin_std": 2.9027013778686523, "margin_dpo/loss_margin_mean": 35.54109573364258, "margin_dpo/margin_mean": 35.541099548339844, "margin_dpo/margin_std": 28.457447052001953, "step": 671 }, { "epoch": 0.986784140969163, "grad_norm": 66.19140625, "learning_rate": 3.293150240547549e-10, "logits/chosen": -0.6310614347457886, "logits/rejected": -0.5937498211860657, "logps/chosen": -82.76466369628906, "logps/ref_chosen": -58.583290100097656, "logps/ref_rejected": -93.14014434814453, "logps/rejected": -149.71588134765625, "loss": 0.4842, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1728401631116867, "margin_dpo/beta_margin_grad_std": 0.21923863887786865, "margin_dpo/beta_margin_mean": 3.2394371032714844, "margin_dpo/beta_margin_std": 3.0078792572021484, "margin_dpo/loss_margin_mean": 32.394371032714844, "margin_dpo/margin_mean": 32.394371032714844, "margin_dpo/margin_std": 29.72500228881836, "step": 672 }, { "epoch": 0.9882525697503671, "grad_norm": 43.1835823059082, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -0.6048033237457275, "logits/rejected": -0.5747998952865601, "logps/chosen": -67.85647583007812, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -143.50682067871094, "loss": 0.3112, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1224966049194336, "margin_dpo/beta_margin_grad_std": 0.16890129446983337, "margin_dpo/beta_margin_mean": 3.707730531692505, "margin_dpo/beta_margin_std": 2.7531516551971436, "margin_dpo/loss_margin_mean": 37.07730484008789, "margin_dpo/margin_mean": 37.077301025390625, "margin_dpo/margin_std": 27.354259490966797, "step": 673 }, { "epoch": 0.9897209985315712, "grad_norm": 36.11240005493164, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.5830689668655396, "logits/rejected": -0.5558980703353882, "logps/chosen": -67.47659301757812, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -129.95156860351562, "loss": 0.2851, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11657389253377914, "margin_dpo/beta_margin_grad_std": 0.1537243127822876, "margin_dpo/beta_margin_mean": 3.78745698928833, "margin_dpo/beta_margin_std": 2.833228826522827, "margin_dpo/loss_margin_mean": 37.874568939208984, "margin_dpo/margin_mean": 37.874568939208984, "margin_dpo/margin_std": 28.31113052368164, "step": 674 }, { "epoch": 0.9911894273127754, "grad_norm": 66.7328109741211, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -0.5768786668777466, "logits/rejected": -0.5500950813293457, "logps/chosen": -64.9262924194336, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -134.33714294433594, "loss": 0.3902, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11795066297054291, "margin_dpo/beta_margin_grad_std": 0.21005932986736298, "margin_dpo/beta_margin_mean": 3.949514865875244, "margin_dpo/beta_margin_std": 2.772658109664917, "margin_dpo/loss_margin_mean": 39.495147705078125, "margin_dpo/margin_mean": 39.495147705078125, "margin_dpo/margin_std": 27.606351852416992, "step": 675 }, { "epoch": 0.9926578560939795, "grad_norm": 79.0772933959961, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -0.615682065486908, "logits/rejected": -0.5794901847839355, "logps/chosen": -96.71508026123047, "logps/ref_chosen": -71.39852142333984, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -149.972412109375, "loss": 0.4162, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1375354677438736, "margin_dpo/beta_margin_grad_std": 0.20353099703788757, "margin_dpo/beta_margin_mean": 3.629708766937256, "margin_dpo/beta_margin_std": 2.896389961242676, "margin_dpo/loss_margin_mean": 36.29708480834961, "margin_dpo/margin_mean": 36.29708480834961, "margin_dpo/margin_std": 28.651344299316406, "step": 676 }, { "epoch": 0.9941262848751835, "grad_norm": 65.4261245727539, "learning_rate": 8.23423165278725e-11, "logits/chosen": -0.5974393486976624, "logits/rejected": -0.5463284254074097, "logps/chosen": -79.63191986083984, "logps/ref_chosen": -56.52743911743164, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -138.780517578125, "loss": 0.4482, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13777747750282288, "margin_dpo/beta_margin_grad_std": 0.22497375309467316, "margin_dpo/beta_margin_mean": 3.744948625564575, "margin_dpo/beta_margin_std": 2.872178792953491, "margin_dpo/loss_margin_mean": 37.449485778808594, "margin_dpo/margin_mean": 37.449485778808594, "margin_dpo/margin_std": 28.472801208496094, "step": 677 }, { "epoch": 0.9955947136563876, "grad_norm": 50.95887756347656, "learning_rate": 5.270012410216185e-11, "logits/chosen": -0.5905472040176392, "logits/rejected": -0.5667222738265991, "logps/chosen": -67.8372802734375, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -139.0126495361328, "loss": 0.4485, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16534043848514557, "margin_dpo/beta_margin_grad_std": 0.21265582740306854, "margin_dpo/beta_margin_mean": 3.6705210208892822, "margin_dpo/beta_margin_std": 3.1234190464019775, "margin_dpo/loss_margin_mean": 36.7052116394043, "margin_dpo/margin_mean": 36.70520782470703, "margin_dpo/margin_std": 31.16322135925293, "step": 678 }, { "epoch": 0.9970631424375918, "grad_norm": 47.8213005065918, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -0.6013349294662476, "logits/rejected": -0.5681812167167664, "logps/chosen": -72.65241241455078, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -135.8075408935547, "loss": 0.3274, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11131599545478821, "margin_dpo/beta_margin_grad_std": 0.1771748960018158, "margin_dpo/beta_margin_mean": 3.6851911544799805, "margin_dpo/beta_margin_std": 2.695528745651245, "margin_dpo/loss_margin_mean": 36.85191345214844, "margin_dpo/margin_mean": 36.85191345214844, "margin_dpo/margin_std": 26.87795639038086, "step": 679 }, { "epoch": 0.9985315712187959, "grad_norm": 57.492881774902344, "learning_rate": 1.31753782067201e-11, "logits/chosen": -0.6063967347145081, "logits/rejected": -0.5727298259735107, "logps/chosen": -99.56130981445312, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -171.20968627929688, "loss": 0.381, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1346154808998108, "margin_dpo/beta_margin_grad_std": 0.1998624950647354, "margin_dpo/beta_margin_mean": 3.6179311275482178, "margin_dpo/beta_margin_std": 2.948613405227661, "margin_dpo/loss_margin_mean": 36.1793098449707, "margin_dpo/margin_mean": 36.17931365966797, "margin_dpo/margin_std": 29.298704147338867, "step": 680 }, { "epoch": 1.0, "grad_norm": 52.16978073120117, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -0.6496413946151733, "logits/rejected": -0.6223350167274475, "logps/chosen": -84.20474243164062, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.5579833984375, "logps/rejected": -143.598876953125, "loss": 0.4583, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16029776632785797, "margin_dpo/beta_margin_grad_std": 0.20890314877033234, "margin_dpo/beta_margin_mean": 3.1793434619903564, "margin_dpo/beta_margin_std": 2.862551212310791, "margin_dpo/loss_margin_mean": 31.793434143066406, "margin_dpo/margin_mean": 31.793434143066406, "margin_dpo/margin_std": 28.037933349609375, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 0.5730435011495403, "train_runtime": 3273.0613, "train_samples_per_second": 13.32, "train_steps_per_second": 0.208 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }