{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "KL/chosen_KL_mean": 0.00527191162109375, "KL/mean": 0.016706019639968872, "KL/rejected_KL_mean": 0.028141021728515625, "KL/std": 0.272699236869812, "epoch": 0.0014684287812041115, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.02287006378173828, "fcm_dpo/q_t": 0.5040594935417175, "grad_norm": 676.3800659179688, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.4324, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "KL/chosen_KL_mean": -0.03498649597167969, "KL/mean": -0.00212840735912323, "KL/rejected_KL_mean": 0.030735015869140625, "KL/std": 0.24797174334526062, "epoch": 0.002936857562408223, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.06572261452674866, "fcm_dpo/q_t": 0.5128992795944214, "grad_norm": 589.6188354492188, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.49536412954330444, "logits/rejected": -0.4594460427761078, "logps/chosen": -52.65568923950195, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.4592, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "KL/chosen_KL_mean": -0.0075588226318359375, "KL/mean": -0.0043991804122924805, "KL/rejected_KL_mean": -0.001239776611328125, "KL/std": 0.22414085268974304, "epoch": 0.004405286343612335, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.00632166862487793, "fcm_dpo/q_t": 0.5012327432632446, "grad_norm": 575.7740478515625, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.4817797839641571, "logits/rejected": -0.44226667284965515, "logps/chosen": -60.98915481567383, "logps/ref_chosen": -60.981597900390625, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.67383575439453, "loss": 1.405, "margin_dpo/margin_mean": -0.0063214898109436035, "margin_dpo/margin_std": 0.2866283059120178, "step": 3 }, { "KL/chosen_KL_mean": -0.04131507873535156, "KL/mean": -0.0033356696367263794, "KL/rejected_KL_mean": 0.034641265869140625, "KL/std": 0.25460168719291687, "epoch": 0.005873715124816446, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.07595756649971008, "fcm_dpo/q_t": 0.5149009227752686, "grad_norm": 598.5643920898438, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.4682745039463043, "logits/rejected": -0.44059938192367554, "logps/chosen": -56.80902862548828, "logps/ref_chosen": -56.7677116394043, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.61247253417969, "loss": 1.469, "margin_dpo/margin_mean": -0.0759580135345459, "margin_dpo/margin_std": 0.36108309030532837, "step": 4 }, { "KL/chosen_KL_mean": 0.0052433013916015625, "KL/mean": 0.018906593322753906, "KL/rejected_KL_mean": 0.032573699951171875, "KL/std": 0.2835850417613983, "epoch": 0.007342143906020558, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.027328133583068848, "fcm_dpo/q_t": 0.5043825507164001, "grad_norm": 748.8038940429688, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.5214688777923584, "logits/rejected": -0.4782792031764984, "logps/chosen": -53.85413360595703, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.11660766601562, "loss": 1.4327, "margin_dpo/margin_mean": -0.02732786536216736, "margin_dpo/margin_std": 0.39059120416641235, "step": 5 }, { "KL/chosen_KL_mean": 0.0131988525390625, "KL/mean": -0.0006367862224578857, "KL/rejected_KL_mean": -0.014469146728515625, "KL/std": 0.2519422471523285, "epoch": 0.00881057268722467, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0276680588722229, "fcm_dpo/q_t": 0.49454063177108765, "grad_norm": 761.1502685546875, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.4976610243320465, "logits/rejected": -0.4546470046043396, "logps/chosen": -62.994285583496094, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.65982055664062, "loss": 1.3859, "margin_dpo/margin_mean": 0.027667373418807983, "margin_dpo/margin_std": 0.35976481437683105, "step": 6 }, { "KL/chosen_KL_mean": 0.035762786865234375, "KL/mean": 0.020337015390396118, "KL/rejected_KL_mean": 0.004913330078125, "KL/std": 0.2789710462093353, "epoch": 0.010279001468428781, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.030850648880004883, "fcm_dpo/q_t": 0.49415522813796997, "grad_norm": 648.91259765625, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5009369254112244, "logits/rejected": -0.4670419692993164, "logps/chosen": -57.73905944824219, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.91567993164062, "loss": 1.3851, "margin_dpo/margin_mean": 0.030851304531097412, "margin_dpo/margin_std": 0.3817327618598938, "step": 7 }, { "KL/chosen_KL_mean": 0.006595611572265625, "KL/mean": 0.011890605092048645, "KL/rejected_KL_mean": 0.01718902587890625, "KL/std": 0.2876508831977844, "epoch": 0.011747430249632892, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.01059296727180481, "fcm_dpo/q_t": 0.5014467239379883, "grad_norm": 638.4391479492188, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5008213520050049, "logits/rejected": -0.47419145703315735, "logps/chosen": -58.709442138671875, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.29423522949219, "loss": 1.4194, "margin_dpo/margin_mean": -0.010592788457870483, "margin_dpo/margin_std": 0.3931761384010315, "step": 8 }, { "KL/chosen_KL_mean": 0.03925895690917969, "KL/mean": 0.02510516345500946, "KL/rejected_KL_mean": 0.010951995849609375, "KL/std": 0.30363646149635315, "epoch": 0.013215859030837005, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.028308242559432983, "fcm_dpo/q_t": 0.49435490369796753, "grad_norm": 672.1676025390625, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.5118868350982666, "logits/rejected": -0.4663264751434326, "logps/chosen": -69.82758331298828, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.59171295166016, "loss": 1.394, "margin_dpo/margin_mean": 0.028307169675827026, "margin_dpo/margin_std": 0.4165334105491638, "step": 9 }, { "KL/chosen_KL_mean": 0.0066814422607421875, "KL/mean": -0.015493467450141907, "KL/rejected_KL_mean": -0.037662506103515625, "KL/std": 0.2837975323200226, "epoch": 0.014684287812041116, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.04434821009635925, "fcm_dpo/q_t": 0.4914953112602234, "grad_norm": 545.5849609375, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.4926380515098572, "logits/rejected": -0.44934237003326416, "logps/chosen": -48.35100555419922, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.40972900390625, "loss": 1.3752, "margin_dpo/margin_mean": 0.04434826970100403, "margin_dpo/margin_std": 0.37585416436195374, "step": 10 }, { "KL/chosen_KL_mean": -0.000308990478515625, "KL/mean": -0.00840708613395691, "KL/rejected_KL_mean": -0.01650238037109375, "KL/std": 0.28651660680770874, "epoch": 0.016152716593245228, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.01619333028793335, "fcm_dpo/q_t": 0.49684467911720276, "grad_norm": 539.252197265625, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.4729078710079193, "logits/rejected": -0.44843602180480957, "logps/chosen": -53.01716613769531, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.796875, "loss": 1.3993, "margin_dpo/margin_mean": 0.016193389892578125, "margin_dpo/margin_std": 0.378741979598999, "step": 11 }, { "KL/chosen_KL_mean": -0.0455169677734375, "KL/mean": -0.0396341010928154, "KL/rejected_KL_mean": -0.033748626708984375, "KL/std": 0.27084100246429443, "epoch": 0.01762114537444934, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.01176394522190094, "fcm_dpo/q_t": 0.5020244717597961, "grad_norm": 731.9530639648438, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.5192070007324219, "logits/rejected": -0.48252177238464355, "logps/chosen": -61.8509521484375, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.8582763671875, "logps/rejected": -104.89202117919922, "loss": 1.4199, "margin_dpo/margin_mean": -0.011764273047447205, "margin_dpo/margin_std": 0.3872652053833008, "step": 12 }, { "KL/chosen_KL_mean": 0.0012149810791015625, "KL/mean": 0.024721741676330566, "KL/rejected_KL_mean": 0.0482330322265625, "KL/std": 0.2512255609035492, "epoch": 0.01908957415565345, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.047011733055114746, "fcm_dpo/q_t": 0.5091712474822998, "grad_norm": 657.2701416015625, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.4995085597038269, "logits/rejected": -0.47286656498908997, "logps/chosen": -64.25914001464844, "logps/ref_chosen": -64.2603530883789, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.15484619140625, "loss": 1.4451, "margin_dpo/margin_mean": -0.047012150287628174, "margin_dpo/margin_std": 0.3621995747089386, "step": 13 }, { "KL/chosen_KL_mean": -0.030294418334960938, "KL/mean": -0.0255916565656662, "KL/rejected_KL_mean": -0.020893096923828125, "KL/std": 0.27099841833114624, "epoch": 0.020558002936857563, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.009398609399795532, "fcm_dpo/q_t": 0.5020325183868408, "grad_norm": 684.386962890625, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.49813684821128845, "logits/rejected": -0.4595490097999573, "logps/chosen": -58.1405029296875, "logps/ref_chosen": -58.11021041870117, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.06797790527344, "loss": 1.4211, "margin_dpo/margin_mean": -0.009398102760314941, "margin_dpo/margin_std": 0.4144117534160614, "step": 14 }, { "KL/chosen_KL_mean": -0.031803131103515625, "KL/mean": -0.05128836631774902, "KL/rejected_KL_mean": -0.070770263671875, "KL/std": 0.2677825093269348, "epoch": 0.022026431718061675, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.038970112800598145, "fcm_dpo/q_t": 0.49189913272857666, "grad_norm": 519.0724487304688, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.4899941384792328, "logits/rejected": -0.4712330996990204, "logps/chosen": -56.99871063232422, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.87940979003906, "loss": 1.3835, "margin_dpo/margin_mean": 0.03897008299827576, "margin_dpo/margin_std": 0.4108760356903076, "step": 15 }, { "KL/chosen_KL_mean": 0.010923385620117188, "KL/mean": -0.03770947456359863, "KL/rejected_KL_mean": -0.08633804321289062, "KL/std": 0.25064218044281006, "epoch": 0.023494860499265784, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.09726369380950928, "fcm_dpo/q_t": 0.4807215929031372, "grad_norm": 658.9254150390625, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.5206788182258606, "logits/rejected": -0.4792103171348572, "logps/chosen": -61.72896957397461, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.455810546875, "loss": 1.3318, "margin_dpo/margin_mean": 0.09726375341415405, "margin_dpo/margin_std": 0.3595857620239258, "step": 16 }, { "KL/chosen_KL_mean": 0.007419586181640625, "KL/mean": -0.00011467933654785156, "KL/rejected_KL_mean": -0.007648468017578125, "KL/std": 0.2673434615135193, "epoch": 0.024963289280469897, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.015069246292114258, "fcm_dpo/q_t": 0.49689337611198425, "grad_norm": 644.1885986328125, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.4899910092353821, "logits/rejected": -0.4514046311378479, "logps/chosen": -67.70291137695312, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.38630676269531, "loss": 1.4017, "margin_dpo/margin_mean": 0.015069544315338135, "margin_dpo/margin_std": 0.4079738259315491, "step": 17 }, { "KL/chosen_KL_mean": -0.020544052124023438, "KL/mean": -0.054789185523986816, "KL/rejected_KL_mean": -0.0890350341796875, "KL/std": 0.26502934098243713, "epoch": 0.02643171806167401, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0684882402420044, "fcm_dpo/q_t": 0.4862971305847168, "grad_norm": 632.3574829101562, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.48648545145988464, "logits/rejected": -0.43023061752319336, "logps/chosen": -47.760032653808594, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.56132507324219, "loss": 1.3528, "margin_dpo/margin_mean": 0.06848806142807007, "margin_dpo/margin_std": 0.3598102629184723, "step": 18 }, { "KL/chosen_KL_mean": -0.00384521484375, "KL/mean": -0.04403865337371826, "KL/rejected_KL_mean": -0.08422470092773438, "KL/std": 0.24243327975273132, "epoch": 0.027900146842878122, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.08037757873535156, "fcm_dpo/q_t": 0.4844985008239746, "grad_norm": 576.9486083984375, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.528351902961731, "logits/rejected": -0.4815219044685364, "logps/chosen": -70.2092056274414, "logps/ref_chosen": -70.20536041259766, "logps/ref_rejected": -89.7575912475586, "logps/rejected": -89.84181213378906, "loss": 1.3433, "margin_dpo/margin_mean": 0.08037763833999634, "margin_dpo/margin_std": 0.3444629907608032, "step": 19 }, { "KL/chosen_KL_mean": -0.043910980224609375, "KL/mean": -0.06724703311920166, "KL/rejected_KL_mean": -0.09057998657226562, "KL/std": 0.2558104395866394, "epoch": 0.02936857562408223, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.04667180776596069, "fcm_dpo/q_t": 0.49041998386383057, "grad_norm": 575.07275390625, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5057722330093384, "logits/rejected": -0.44449201226234436, "logps/chosen": -50.84715270996094, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.82334899902344, "logps/rejected": -78.91392517089844, "loss": 1.3713, "margin_dpo/margin_mean": 0.046672046184539795, "margin_dpo/margin_std": 0.37012988328933716, "step": 20 }, { "KL/chosen_KL_mean": -0.007419586181640625, "KL/mean": -0.08116798102855682, "KL/rejected_KL_mean": -0.15491485595703125, "KL/std": 0.27864497900009155, "epoch": 0.030837004405286344, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.14749857783317566, "fcm_dpo/q_t": 0.47142279148101807, "grad_norm": 567.56494140625, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.471624493598938, "logits/rejected": -0.4458872079849243, "logps/chosen": -50.0704345703125, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -78.02371215820312, "loss": 1.2925, "margin_dpo/margin_mean": 0.1474984586238861, "margin_dpo/margin_std": 0.3555169403553009, "step": 21 }, { "KL/chosen_KL_mean": 0.029613494873046875, "KL/mean": -0.03875645995140076, "KL/rejected_KL_mean": -0.10712814331054688, "KL/std": 0.2763916254043579, "epoch": 0.032305433186490456, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.13674354553222656, "fcm_dpo/q_t": 0.47342291474342346, "grad_norm": 609.1174926757812, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.4810870885848999, "logits/rejected": -0.43719351291656494, "logps/chosen": -59.02802276611328, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.61180114746094, "loss": 1.3046, "margin_dpo/margin_mean": 0.1367432177066803, "margin_dpo/margin_std": 0.38546815514564514, "step": 22 }, { "KL/chosen_KL_mean": 0.06751251220703125, "KL/mean": -0.029902145266532898, "KL/rejected_KL_mean": -0.1273174285888672, "KL/std": 0.32757315039634705, "epoch": 0.033773861967694566, "fcm_dpo/beta": 0.800000011920929, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.19482764601707458, "fcm_dpo/q_t": 0.46310946345329285, "grad_norm": 545.2821044921875, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.4882713258266449, "logits/rejected": -0.4656675159931183, "logps/chosen": -60.01018524169922, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.13955688476562, "logps/rejected": -81.2668685913086, "loss": 1.2669, "margin_dpo/margin_mean": 0.19482776522636414, "margin_dpo/margin_std": 0.43239736557006836, "step": 23 }, { "KL/chosen_KL_mean": 0.057018280029296875, "KL/mean": -0.06638666987419128, "KL/rejected_KL_mean": -0.18979644775390625, "KL/std": 0.3026999235153198, "epoch": 0.03524229074889868, "fcm_dpo/beta": 0.8059060573577881, "fcm_dpo/delta": 0.0732855275273323, "fcm_dpo/margin": 0.2468125820159912, "fcm_dpo/q_t": 0.45153895020484924, "grad_norm": 589.2479248046875, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.4822083115577698, "logits/rejected": -0.46618789434432983, "logps/chosen": -44.23401641845703, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.31501007080078, "loss": 1.2205, "margin_dpo/margin_mean": 0.24681302905082703, "margin_dpo/margin_std": 0.3777139186859131, "step": 24 }, { "KL/chosen_KL_mean": 0.016529083251953125, "KL/mean": -0.07727153599262238, "KL/rejected_KL_mean": -0.17107009887695312, "KL/std": 0.34193894267082214, "epoch": 0.03671071953010279, "fcm_dpo/beta": 0.8118120431900024, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.18760046362876892, "fcm_dpo/q_t": 0.4633547067642212, "grad_norm": 538.94677734375, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.5210130214691162, "logits/rejected": -0.4919084310531616, "logps/chosen": -52.52052307128906, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.51325988769531, "loss": 1.2696, "margin_dpo/margin_mean": 0.18760085105895996, "margin_dpo/margin_std": 0.4301965832710266, "step": 25 }, { "KL/chosen_KL_mean": 0.06367874145507812, "KL/mean": -0.09489929676055908, "KL/rejected_KL_mean": -0.25347900390625, "KL/std": 0.40129321813583374, "epoch": 0.0381791483113069, "fcm_dpo/beta": 0.8153971433639526, "fcm_dpo/delta": 0.04396749660372734, "fcm_dpo/margin": 0.3171558976173401, "fcm_dpo/q_t": 0.43973731994628906, "grad_norm": 565.0730590820312, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5216317176818848, "logits/rejected": -0.4897175133228302, "logps/chosen": -53.859130859375, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.61318969726562, "loss": 1.1923, "margin_dpo/margin_mean": 0.31715625524520874, "margin_dpo/margin_std": 0.5414035320281982, "step": 26 }, { "KL/chosen_KL_mean": 0.13181304931640625, "KL/mean": -0.10721321403980255, "KL/rejected_KL_mean": -0.3462409973144531, "KL/std": 0.445562481880188, "epoch": 0.039647577092511016, "fcm_dpo/beta": 0.8174295425415039, "fcm_dpo/delta": 0.00940924696624279, "fcm_dpo/margin": 0.4780521094799042, "fcm_dpo/q_t": 0.4092448949813843, "grad_norm": 575.7881469726562, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.5156636238098145, "logits/rejected": -0.47926321625709534, "logps/chosen": -42.76671600341797, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72419738769531, "logps/rejected": -99.07044219970703, "loss": 1.0763, "margin_dpo/margin_mean": 0.47805216908454895, "margin_dpo/margin_std": 0.5317339897155762, "step": 27 }, { "KL/chosen_KL_mean": 0.029035568237304688, "KL/mean": -0.13445699214935303, "KL/rejected_KL_mean": -0.29795074462890625, "KL/std": 0.4010791778564453, "epoch": 0.041116005873715125, "fcm_dpo/beta": 0.8387187123298645, "fcm_dpo/delta": 0.12880420684814453, "fcm_dpo/margin": 0.3269842267036438, "fcm_dpo/q_t": 0.4359322488307953, "grad_norm": 499.330810546875, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.5236212015151978, "logits/rejected": -0.4699610471725464, "logps/chosen": -60.5274658203125, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.69906616210938, "loss": 1.1808, "margin_dpo/margin_mean": 0.3269844055175781, "margin_dpo/margin_std": 0.5416440367698669, "step": 28 }, { "KL/chosen_KL_mean": 0.13134193420410156, "KL/mean": -0.14041244983673096, "KL/rejected_KL_mean": -0.4121665954589844, "KL/std": 0.44271203875541687, "epoch": 0.042584434654919234, "fcm_dpo/beta": 0.8350539207458496, "fcm_dpo/delta": -0.056411802768707275, "fcm_dpo/margin": 0.5435106754302979, "fcm_dpo/q_t": 0.39213281869888306, "grad_norm": 554.7158813476562, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5539100170135498, "logits/rejected": -0.5077922344207764, "logps/chosen": -57.67644500732422, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.8065185546875, "loss": 1.0209, "margin_dpo/margin_mean": 0.5435110330581665, "margin_dpo/margin_std": 0.47872793674468994, "step": 29 }, { "KL/chosen_KL_mean": 0.13145065307617188, "KL/mean": -0.23007872700691223, "KL/rejected_KL_mean": -0.59161376953125, "KL/std": 0.6256662607192993, "epoch": 0.04405286343612335, "fcm_dpo/beta": 0.804972231388092, "fcm_dpo/delta": -0.19466958940029144, "fcm_dpo/margin": 0.7230579853057861, "fcm_dpo/q_t": 0.36796897649765015, "grad_norm": 459.3413391113281, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.5020028948783875, "logits/rejected": -0.47157809138298035, "logps/chosen": -52.4459228515625, "logps/ref_chosen": -52.577369689941406, "logps/ref_rejected": -98.48920440673828, "logps/rejected": -99.08081817626953, "loss": 0.9594, "margin_dpo/margin_mean": 0.7230584621429443, "margin_dpo/margin_std": 0.712450385093689, "step": 30 }, { "KL/chosen_KL_mean": 0.1332244873046875, "KL/mean": -0.13057458400726318, "KL/rejected_KL_mean": -0.394378662109375, "KL/std": 0.5608391165733337, "epoch": 0.04552129221732746, "fcm_dpo/beta": 0.7972604632377625, "fcm_dpo/delta": -0.0215899795293808, "fcm_dpo/margin": 0.5276015996932983, "fcm_dpo/q_t": 0.4051462411880493, "grad_norm": 401.8319091796875, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.509661853313446, "logits/rejected": -0.46542733907699585, "logps/chosen": -63.67369842529297, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.28838348388672, "loss": 1.0803, "margin_dpo/margin_mean": 0.5276015996932983, "margin_dpo/margin_std": 0.7064246535301208, "step": 31 }, { "KL/chosen_KL_mean": 0.20654296875, "KL/mean": -0.18202102184295654, "KL/rejected_KL_mean": -0.570587158203125, "KL/std": 0.7511119842529297, "epoch": 0.04698972099853157, "fcm_dpo/beta": 0.772221565246582, "fcm_dpo/delta": -0.21296542882919312, "fcm_dpo/margin": 0.7771282196044922, "fcm_dpo/q_t": 0.3716619610786438, "grad_norm": 401.576416015625, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.5210152864456177, "logits/rejected": -0.4803611934185028, "logps/chosen": -62.532981872558594, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.88809204101562, "loss": 0.979, "margin_dpo/margin_mean": 0.7771281003952026, "margin_dpo/margin_std": 0.9537783861160278, "step": 32 }, { "KL/chosen_KL_mean": 0.11682891845703125, "KL/mean": -0.20829498767852783, "KL/rejected_KL_mean": -0.5334129333496094, "KL/std": 0.5831528902053833, "epoch": 0.048458149779735685, "fcm_dpo/beta": 0.7558040022850037, "fcm_dpo/delta": -0.09609463810920715, "fcm_dpo/margin": 0.6502407789230347, "fcm_dpo/q_t": 0.38533806800842285, "grad_norm": 389.24462890625, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.48112988471984863, "logits/rejected": -0.4547329545021057, "logps/chosen": -53.144142150878906, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.41854858398438, "loss": 1.0079, "margin_dpo/margin_mean": 0.6502406597137451, "margin_dpo/margin_std": 0.6353799104690552, "step": 33 }, { "KL/chosen_KL_mean": 0.10170745849609375, "KL/mean": -0.2716452181339264, "KL/rejected_KL_mean": -0.6450004577636719, "KL/std": 0.6980259418487549, "epoch": 0.049926578560939794, "fcm_dpo/beta": 0.7317001819610596, "fcm_dpo/delta": -0.15499642491340637, "fcm_dpo/margin": 0.7467071413993835, "fcm_dpo/q_t": 0.3767717480659485, "grad_norm": 378.3127136230469, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.5065457224845886, "logits/rejected": -0.48904159665107727, "logps/chosen": -50.71562194824219, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.56684875488281, "loss": 0.9903, "margin_dpo/margin_mean": 0.746705949306488, "margin_dpo/margin_std": 0.8056973218917847, "step": 34 }, { "KL/chosen_KL_mean": 0.18681907653808594, "KL/mean": -0.41164833307266235, "KL/rejected_KL_mean": -1.0101165771484375, "KL/std": 0.9510899782180786, "epoch": 0.0513950073421439, "fcm_dpo/beta": 0.6779334545135498, "fcm_dpo/delta": -0.4493417739868164, "fcm_dpo/margin": 1.1969325542449951, "fcm_dpo/q_t": 0.3209930658340454, "grad_norm": 309.58441162109375, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.502815306186676, "logits/rejected": -0.46622055768966675, "logps/chosen": -50.837669372558594, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.83454895019531, "loss": 0.8254, "margin_dpo/margin_mean": 1.1969324350357056, "margin_dpo/margin_std": 1.0367286205291748, "step": 35 }, { "KL/chosen_KL_mean": 0.05117225646972656, "KL/mean": -0.48711907863616943, "KL/rejected_KL_mean": -1.0254096984863281, "KL/std": 1.0425536632537842, "epoch": 0.05286343612334802, "fcm_dpo/beta": 0.6399196982383728, "fcm_dpo/delta": -0.3091672658920288, "fcm_dpo/margin": 1.0765844583511353, "fcm_dpo/q_t": 0.35396426916122437, "grad_norm": 261.8072814941406, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.5571258068084717, "logits/rejected": -0.5207737684249878, "logps/chosen": -51.9403190612793, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.0406265258789, "logps/rejected": -87.0660400390625, "loss": 0.949, "margin_dpo/margin_mean": 1.0765833854675293, "margin_dpo/margin_std": 1.2341694831848145, "step": 36 }, { "KL/chosen_KL_mean": 0.019609451293945312, "KL/mean": -0.4790758192539215, "KL/rejected_KL_mean": -0.9777679443359375, "KL/std": 1.0161794424057007, "epoch": 0.05433186490455213, "fcm_dpo/beta": 0.5977625846862793, "fcm_dpo/delta": -0.2139551043510437, "fcm_dpo/margin": 0.9973729252815247, "fcm_dpo/q_t": 0.3746863603591919, "grad_norm": 244.11151123046875, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.4970467984676361, "logits/rejected": -0.45224228501319885, "logps/chosen": -62.787498474121094, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.87284088134766, "loss": 1.0035, "margin_dpo/margin_mean": 0.9973729848861694, "margin_dpo/margin_std": 1.285217046737671, "step": 37 }, { "KL/chosen_KL_mean": 0.21410560607910156, "KL/mean": -0.46886640787124634, "KL/rejected_KL_mean": -1.1518363952636719, "KL/std": 1.316064476966858, "epoch": 0.055800293685756244, "fcm_dpo/beta": 0.5653368830680847, "fcm_dpo/delta": -0.4023910164833069, "fcm_dpo/margin": 1.365942120552063, "fcm_dpo/q_t": 0.3435903489589691, "grad_norm": 240.15562438964844, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5118378400802612, "logits/rejected": -0.4790714979171753, "logps/chosen": -48.176414489746094, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.06427764892578, "loss": 0.9045, "margin_dpo/margin_mean": 1.3659417629241943, "margin_dpo/margin_std": 1.6197600364685059, "step": 38 }, { "KL/chosen_KL_mean": 0.08580398559570312, "KL/mean": -0.7106390595436096, "KL/rejected_KL_mean": -1.5070762634277344, "KL/std": 1.289241075515747, "epoch": 0.05726872246696035, "fcm_dpo/beta": 0.5120701193809509, "fcm_dpo/delta": -0.4571428894996643, "fcm_dpo/margin": 1.5928757190704346, "fcm_dpo/q_t": 0.3189411163330078, "grad_norm": 256.7136535644531, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5451552867889404, "logits/rejected": -0.5046231746673584, "logps/chosen": -50.664669036865234, "logps/ref_chosen": -50.75047302246094, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.07658386230469, "loss": 0.8415, "margin_dpo/margin_mean": 1.5928757190704346, "margin_dpo/margin_std": 1.4120266437530518, "step": 39 }, { "KL/chosen_KL_mean": 0.18593215942382812, "KL/mean": -0.5416154861450195, "KL/rejected_KL_mean": -1.2691650390625, "KL/std": 1.3004155158996582, "epoch": 0.05873715124816446, "fcm_dpo/beta": 0.4821917414665222, "fcm_dpo/delta": -0.32341742515563965, "fcm_dpo/margin": 1.4550951719284058, "fcm_dpo/q_t": 0.35090136528015137, "grad_norm": 182.71023559570312, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.5080227255821228, "logits/rejected": -0.47728431224823, "logps/chosen": -57.79913330078125, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.3000717163086, "logps/rejected": -75.5692367553711, "loss": 0.922, "margin_dpo/margin_mean": 1.455095887184143, "margin_dpo/margin_std": 1.5767593383789062, "step": 40 }, { "KL/chosen_KL_mean": 0.07164192199707031, "KL/mean": -0.858450174331665, "KL/rejected_KL_mean": -1.7885398864746094, "KL/std": 1.770848274230957, "epoch": 0.06020558002936858, "fcm_dpo/beta": 0.44186830520629883, "fcm_dpo/delta": -0.45917147397994995, "fcm_dpo/margin": 1.8601810932159424, "fcm_dpo/q_t": 0.3276433050632477, "grad_norm": 186.04129028320312, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.5379878282546997, "logits/rejected": -0.5013633370399475, "logps/chosen": -62.624176025390625, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.81207275390625, "loss": 0.867, "margin_dpo/margin_mean": 1.8601820468902588, "margin_dpo/margin_std": 1.9144206047058105, "step": 41 }, { "KL/chosen_KL_mean": 0.21912765502929688, "KL/mean": -0.9815043210983276, "KL/rejected_KL_mean": -2.1821327209472656, "KL/std": 1.9854331016540527, "epoch": 0.06167400881057269, "fcm_dpo/beta": 0.39385828375816345, "fcm_dpo/delta": -0.6033186912536621, "fcm_dpo/margin": 2.401261806488037, "fcm_dpo/q_t": 0.3034874200820923, "grad_norm": 167.1330108642578, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.5730389356613159, "logits/rejected": -0.5269917249679565, "logps/chosen": -58.74729919433594, "logps/ref_chosen": -58.966426849365234, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.09050750732422, "loss": 0.792, "margin_dpo/margin_mean": 2.4012622833251953, "margin_dpo/margin_std": 2.241847276687622, "step": 42 }, { "KL/chosen_KL_mean": 0.5337352752685547, "KL/mean": -0.6120513677597046, "KL/rejected_KL_mean": -1.757843017578125, "KL/std": 1.6964552402496338, "epoch": 0.0631424375917768, "fcm_dpo/beta": 0.3543139696121216, "fcm_dpo/delta": -0.4530714154243469, "fcm_dpo/margin": 2.291576385498047, "fcm_dpo/q_t": 0.31728753447532654, "grad_norm": 157.29473876953125, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.5307985544204712, "logits/rejected": -0.505626916885376, "logps/chosen": -53.62226104736328, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.238037109375, "loss": 0.8149, "margin_dpo/margin_mean": 2.2915759086608887, "margin_dpo/margin_std": 1.8112150430679321, "step": 43 }, { "KL/chosen_KL_mean": 0.21957778930664062, "KL/mean": -1.076310157775879, "KL/rejected_KL_mean": -2.3721961975097656, "KL/std": 2.057605266571045, "epoch": 0.06461086637298091, "fcm_dpo/beta": 0.3235365152359009, "fcm_dpo/delta": -0.48091480135917664, "fcm_dpo/margin": 2.591776132583618, "fcm_dpo/q_t": 0.31211215257644653, "grad_norm": 155.63697814941406, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.4682161509990692, "logits/rejected": -0.4482540488243103, "logps/chosen": -49.85892105102539, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.15596008300781, "loss": 0.7988, "margin_dpo/margin_mean": 2.59177565574646, "margin_dpo/margin_std": 2.0130257606506348, "step": 44 }, { "KL/chosen_KL_mean": 0.1359119415283203, "KL/mean": -0.8931126594543457, "KL/rejected_KL_mean": -1.9221420288085938, "KL/std": 1.8634648323059082, "epoch": 0.06607929515418502, "fcm_dpo/beta": 0.3062588572502136, "fcm_dpo/delta": -0.24527329206466675, "fcm_dpo/margin": 2.0580525398254395, "fcm_dpo/q_t": 0.36411553621292114, "grad_norm": 127.76445007324219, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.4810647964477539, "logits/rejected": -0.46846526861190796, "logps/chosen": -48.279014587402344, "logps/ref_chosen": -48.4149284362793, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -79.85856628417969, "loss": 0.9597, "margin_dpo/margin_mean": 2.0580527782440186, "margin_dpo/margin_std": 2.3514111042022705, "step": 45 }, { "KL/chosen_KL_mean": 0.20447921752929688, "KL/mean": -1.1372920274734497, "KL/rejected_KL_mean": -2.4790611267089844, "KL/std": 2.375460624694824, "epoch": 0.06754772393538913, "fcm_dpo/beta": 0.2851349711418152, "fcm_dpo/delta": -0.3946601152420044, "fcm_dpo/margin": 2.6835451126098633, "fcm_dpo/q_t": 0.33826661109924316, "grad_norm": 132.63743591308594, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.5322977900505066, "logits/rejected": -0.4815298914909363, "logps/chosen": -55.794944763183594, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.13165283203125, "loss": 0.8899, "margin_dpo/margin_mean": 2.683544158935547, "margin_dpo/margin_std": 2.860718250274658, "step": 46 }, { "KL/chosen_KL_mean": 0.3806018829345703, "KL/mean": -0.9414160251617432, "KL/rejected_KL_mean": -2.2634353637695312, "KL/std": 2.2800047397613525, "epoch": 0.06901615271659324, "fcm_dpo/beta": 0.2670041620731354, "fcm_dpo/delta": -0.328019380569458, "fcm_dpo/margin": 2.6440370082855225, "fcm_dpo/q_t": 0.3423752188682556, "grad_norm": 126.02400207519531, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.5748400688171387, "logits/rejected": -0.5222221612930298, "logps/chosen": -57.54547882080078, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -96.94264221191406, "loss": 0.8904, "margin_dpo/margin_mean": 2.6440372467041016, "margin_dpo/margin_std": 2.386019229888916, "step": 47 }, { "KL/chosen_KL_mean": 0.08409309387207031, "KL/mean": -1.2250878810882568, "KL/rejected_KL_mean": -2.534271240234375, "KL/std": 2.2343883514404297, "epoch": 0.07048458149779736, "fcm_dpo/beta": 0.2488497495651245, "fcm_dpo/delta": -0.27214479446411133, "fcm_dpo/margin": 2.618363857269287, "fcm_dpo/q_t": 0.35341036319732666, "grad_norm": 138.55894470214844, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.5984026193618774, "logits/rejected": -0.5413084030151367, "logps/chosen": -57.103981018066406, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -90.55087280273438, "loss": 0.9335, "margin_dpo/margin_mean": 2.618363618850708, "margin_dpo/margin_std": 2.488217353820801, "step": 48 }, { "KL/chosen_KL_mean": 0.43454933166503906, "KL/mean": -1.1184592247009277, "KL/rejected_KL_mean": -2.671466827392578, "KL/std": 2.7026281356811523, "epoch": 0.07195301027900147, "fcm_dpo/beta": 0.23550444841384888, "fcm_dpo/delta": -0.3564870357513428, "fcm_dpo/margin": 3.106010913848877, "fcm_dpo/q_t": 0.34103497862815857, "grad_norm": 104.59957122802734, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.5330841541290283, "logits/rejected": -0.47351568937301636, "logps/chosen": -61.2507209777832, "logps/ref_chosen": -61.685272216796875, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -86.4389419555664, "loss": 0.8935, "margin_dpo/margin_mean": 3.106010675430298, "margin_dpo/margin_std": 3.1022186279296875, "step": 49 }, { "KL/chosen_KL_mean": 0.03325843811035156, "KL/mean": -1.700391173362732, "KL/rejected_KL_mean": -3.434040069580078, "KL/std": 2.8286612033843994, "epoch": 0.07342143906020558, "fcm_dpo/beta": 0.21684028208255768, "fcm_dpo/delta": -0.38183990120887756, "fcm_dpo/margin": 3.4672958850860596, "fcm_dpo/q_t": 0.333289235830307, "grad_norm": 104.01911163330078, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5359183549880981, "logits/rejected": -0.499971866607666, "logps/chosen": -58.690879821777344, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -99.79219055175781, "loss": 0.8683, "margin_dpo/margin_mean": 3.4672961235046387, "margin_dpo/margin_std": 3.1691863536834717, "step": 50 }, { "KL/chosen_KL_mean": -0.08742523193359375, "KL/mean": -1.8161455392837524, "KL/rejected_KL_mean": -3.5448684692382812, "KL/std": 3.425395965576172, "epoch": 0.07488986784140969, "fcm_dpo/beta": 0.20151767134666443, "fcm_dpo/delta": -0.32323533296585083, "fcm_dpo/margin": 3.4574427604675293, "fcm_dpo/q_t": 0.3550441563129425, "grad_norm": 80.95540618896484, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.4952942132949829, "logits/rejected": -0.46139243245124817, "logps/chosen": -61.46109390258789, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -79.546875, "loss": 0.9468, "margin_dpo/margin_mean": 3.4574432373046875, "margin_dpo/margin_std": 4.162242889404297, "step": 51 }, { "KL/chosen_KL_mean": 0.5666332244873047, "KL/mean": -2.1044023036956787, "KL/rejected_KL_mean": -4.7754364013671875, "KL/std": 4.042649269104004, "epoch": 0.0763582966226138, "fcm_dpo/beta": 0.181796133518219, "fcm_dpo/delta": -0.6337956190109253, "fcm_dpo/margin": 5.342073440551758, "fcm_dpo/q_t": 0.2899671792984009, "grad_norm": 79.82083129882812, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5962961912155151, "logits/rejected": -0.543999433517456, "logps/chosen": -51.770721435546875, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -84.74935150146484, "loss": 0.7499, "margin_dpo/margin_mean": 5.342073440551758, "margin_dpo/margin_std": 4.25220251083374, "step": 52 }, { "KL/chosen_KL_mean": -0.03797149658203125, "KL/mean": -2.6791794300079346, "KL/rejected_KL_mean": -5.320384979248047, "KL/std": 4.5573835372924805, "epoch": 0.07782672540381791, "fcm_dpo/beta": 0.16499710083007812, "fcm_dpo/delta": -0.5135352611541748, "fcm_dpo/margin": 5.282422065734863, "fcm_dpo/q_t": 0.3234487771987915, "grad_norm": 81.50251007080078, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.6104946136474609, "logits/rejected": -0.588903546333313, "logps/chosen": -53.352622985839844, "logps/ref_chosen": -53.31465148925781, "logps/ref_rejected": -91.78359985351562, "logps/rejected": -97.1039810180664, "loss": 0.8463, "margin_dpo/margin_mean": 5.282422065734863, "margin_dpo/margin_std": 5.204236030578613, "step": 53 }, { "KL/chosen_KL_mean": -0.13544654846191406, "KL/mean": -2.47892427444458, "KL/rejected_KL_mean": -4.822395324707031, "KL/std": 4.284974098205566, "epoch": 0.07929515418502203, "fcm_dpo/beta": 0.15150442719459534, "fcm_dpo/delta": -0.33330458402633667, "fcm_dpo/margin": 4.68695068359375, "fcm_dpo/q_t": 0.34295719861984253, "grad_norm": 71.17793273925781, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.5825854539871216, "logits/rejected": -0.528401255607605, "logps/chosen": -50.8241081237793, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -96.53778839111328, "loss": 0.8869, "margin_dpo/margin_mean": 4.68695068359375, "margin_dpo/margin_std": 4.375544548034668, "step": 54 }, { "KL/chosen_KL_mean": -0.5525169372558594, "KL/mean": -3.349881887435913, "KL/rejected_KL_mean": -6.147243499755859, "KL/std": 5.250433921813965, "epoch": 0.08076358296622614, "fcm_dpo/beta": 0.14030683040618896, "fcm_dpo/delta": -0.4165322184562683, "fcm_dpo/margin": 5.594724178314209, "fcm_dpo/q_t": 0.33590346574783325, "grad_norm": 67.65316772460938, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.6309506893157959, "logits/rejected": -0.5673823952674866, "logps/chosen": -63.16775131225586, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -95.1407470703125, "loss": 0.9007, "margin_dpo/margin_mean": 5.594723701477051, "margin_dpo/margin_std": 6.235048294067383, "step": 55 }, { "KL/chosen_KL_mean": -0.32973480224609375, "KL/mean": -3.0303850173950195, "KL/rejected_KL_mean": -5.7310333251953125, "KL/std": 5.075455665588379, "epoch": 0.08223201174743025, "fcm_dpo/beta": 0.13035638630390167, "fcm_dpo/delta": -0.32700973749160767, "fcm_dpo/margin": 5.401305198669434, "fcm_dpo/q_t": 0.3499238193035126, "grad_norm": 56.44548034667969, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.5875349044799805, "logits/rejected": -0.5435941815376282, "logps/chosen": -58.2624626159668, "logps/ref_chosen": -57.9327278137207, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -99.90547180175781, "loss": 0.9377, "margin_dpo/margin_mean": 5.401305198669434, "margin_dpo/margin_std": 6.176411151885986, "step": 56 }, { "KL/chosen_KL_mean": -0.4023780822753906, "KL/mean": -3.3978283405303955, "KL/rejected_KL_mean": -6.393280029296875, "KL/std": 5.025920867919922, "epoch": 0.08370044052863436, "fcm_dpo/beta": 0.12205598503351212, "fcm_dpo/delta": -0.35622814297676086, "fcm_dpo/margin": 5.9908952713012695, "fcm_dpo/q_t": 0.3362714648246765, "grad_norm": 62.98070526123047, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.5721724033355713, "logits/rejected": -0.544060230255127, "logps/chosen": -70.89765930175781, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -101.958740234375, "loss": 0.8807, "margin_dpo/margin_mean": 5.9908952713012695, "margin_dpo/margin_std": 5.452801704406738, "step": 57 }, { "KL/chosen_KL_mean": -0.46083641052246094, "KL/mean": -3.8815183639526367, "KL/rejected_KL_mean": -7.3022003173828125, "KL/std": 5.823391437530518, "epoch": 0.08516886930983847, "fcm_dpo/beta": 0.11291979253292084, "fcm_dpo/delta": -0.4023542106151581, "fcm_dpo/margin": 6.84135627746582, "fcm_dpo/q_t": 0.33498990535736084, "grad_norm": 63.307865142822266, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.574745237827301, "logits/rejected": -0.4966890215873718, "logps/chosen": -62.59377670288086, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -91.91949462890625, "loss": 0.8841, "margin_dpo/margin_mean": 6.841357231140137, "margin_dpo/margin_std": 6.957030296325684, "step": 58 }, { "KL/chosen_KL_mean": -0.9522266387939453, "KL/mean": -4.575308799743652, "KL/rejected_KL_mean": -8.19839096069336, "KL/std": 6.170098304748535, "epoch": 0.08663729809104258, "fcm_dpo/beta": 0.10300938785076141, "fcm_dpo/delta": -0.3802601099014282, "fcm_dpo/margin": 7.246167182922363, "fcm_dpo/q_t": 0.33733034133911133, "grad_norm": 60.765480041503906, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6427372694015503, "logits/rejected": -0.6026915311813354, "logps/chosen": -52.88475036621094, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -97.08358764648438, "loss": 0.8868, "margin_dpo/margin_mean": 7.246167182922363, "margin_dpo/margin_std": 7.133745193481445, "step": 59 }, { "KL/chosen_KL_mean": -1.8632774353027344, "KL/mean": -4.822530746459961, "KL/rejected_KL_mean": -7.781780242919922, "KL/std": 5.658910751342773, "epoch": 0.0881057268722467, "fcm_dpo/beta": 0.09867256879806519, "fcm_dpo/delta": -0.19654613733291626, "fcm_dpo/margin": 5.918500900268555, "fcm_dpo/q_t": 0.36642712354660034, "grad_norm": 63.796966552734375, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.6072988510131836, "logits/rejected": -0.5472843050956726, "logps/chosen": -62.80546569824219, "logps/ref_chosen": -60.94218826293945, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -93.17517852783203, "loss": 0.9761, "margin_dpo/margin_mean": 5.918500900268555, "margin_dpo/margin_std": 6.41326379776001, "step": 60 }, { "KL/chosen_KL_mean": -0.8773689270019531, "KL/mean": -4.494866371154785, "KL/rejected_KL_mean": -8.11236572265625, "KL/std": 7.751307487487793, "epoch": 0.08957415565345081, "fcm_dpo/beta": 0.09338235855102539, "fcm_dpo/delta": -0.2975808084011078, "fcm_dpo/margin": 7.234995365142822, "fcm_dpo/q_t": 0.3608711063861847, "grad_norm": 51.70878982543945, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.5910314321517944, "logits/rejected": -0.5556684732437134, "logps/chosen": -61.510894775390625, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -97.96485900878906, "loss": 0.9652, "margin_dpo/margin_mean": 7.234993934631348, "margin_dpo/margin_std": 9.499799728393555, "step": 61 }, { "KL/chosen_KL_mean": -1.0850257873535156, "KL/mean": -3.8349273204803467, "KL/rejected_KL_mean": -6.584831237792969, "KL/std": 5.598065376281738, "epoch": 0.09104258443465492, "fcm_dpo/beta": 0.09132882952690125, "fcm_dpo/delta": -0.10761071741580963, "fcm_dpo/margin": 5.499805927276611, "fcm_dpo/q_t": 0.3879823684692383, "grad_norm": 50.53475570678711, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.6022673845291138, "logits/rejected": -0.5683047771453857, "logps/chosen": -57.23579788208008, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -82.15103149414062, "loss": 1.0326, "margin_dpo/margin_mean": 5.499805450439453, "margin_dpo/margin_std": 6.748025894165039, "step": 62 }, { "KL/chosen_KL_mean": -1.885568618774414, "KL/mean": -5.602993011474609, "KL/rejected_KL_mean": -9.320415496826172, "KL/std": 7.238819122314453, "epoch": 0.09251101321585903, "fcm_dpo/beta": 0.08644914627075195, "fcm_dpo/delta": -0.26188862323760986, "fcm_dpo/margin": 7.43485164642334, "fcm_dpo/q_t": 0.35505515336990356, "grad_norm": 50.32809829711914, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.5845507383346558, "logits/rejected": -0.5376572012901306, "logps/chosen": -75.032958984375, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -106.93048095703125, "loss": 0.9392, "margin_dpo/margin_mean": 7.434851169586182, "margin_dpo/margin_std": 7.581734657287598, "step": 63 }, { "KL/chosen_KL_mean": -0.5354537963867188, "KL/mean": -4.859354496002197, "KL/rejected_KL_mean": -9.183258056640625, "KL/std": 7.766883850097656, "epoch": 0.09397944199706314, "fcm_dpo/beta": 0.08089442551136017, "fcm_dpo/delta": -0.3278960585594177, "fcm_dpo/margin": 8.647797584533691, "fcm_dpo/q_t": 0.3481459617614746, "grad_norm": 45.80192947387695, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.5578076243400574, "logits/rejected": -0.5246820449829102, "logps/chosen": -54.534053802490234, "logps/ref_chosen": -53.998600006103516, "logps/ref_rejected": -93.53019714355469, "logps/rejected": -102.71345520019531, "loss": 0.9256, "margin_dpo/margin_mean": 8.647798538208008, "margin_dpo/margin_std": 9.425071716308594, "step": 64 }, { "KL/chosen_KL_mean": -2.3879737854003906, "KL/mean": -6.748409271240234, "KL/rejected_KL_mean": -11.108848571777344, "KL/std": 8.675431251525879, "epoch": 0.09544787077826726, "fcm_dpo/beta": 0.076596200466156, "fcm_dpo/delta": -0.2893035411834717, "fcm_dpo/margin": 8.720873832702637, "fcm_dpo/q_t": 0.35113364458084106, "grad_norm": 46.513771057128906, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6502401828765869, "logits/rejected": -0.636010468006134, "logps/chosen": -67.2239761352539, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -121.05531311035156, "loss": 0.9421, "margin_dpo/margin_mean": 8.720873832702637, "margin_dpo/margin_std": 9.467870712280273, "step": 65 }, { "KL/chosen_KL_mean": -2.218568801879883, "KL/mean": -6.281618118286133, "KL/rejected_KL_mean": -10.34466552734375, "KL/std": 7.9723615646362305, "epoch": 0.09691629955947137, "fcm_dpo/beta": 0.07299195230007172, "fcm_dpo/delta": -0.20810872316360474, "fcm_dpo/margin": 8.126091957092285, "fcm_dpo/q_t": 0.3702622056007385, "grad_norm": 40.38670349121094, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6466140747070312, "logits/rejected": -0.613680362701416, "logps/chosen": -53.66209411621094, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629913330078, "logps/rejected": -85.98096466064453, "loss": 0.9797, "margin_dpo/margin_mean": 8.126091003417969, "margin_dpo/margin_std": 9.491106986999512, "step": 66 }, { "KL/chosen_KL_mean": -1.9192867279052734, "KL/mean": -6.12217903137207, "KL/rejected_KL_mean": -10.3250732421875, "KL/std": 7.945716857910156, "epoch": 0.09838472834067548, "fcm_dpo/beta": 0.07074415683746338, "fcm_dpo/delta": -0.20659056305885315, "fcm_dpo/margin": 8.405787467956543, "fcm_dpo/q_t": 0.3689546287059784, "grad_norm": 41.76984786987305, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.5952399969100952, "logits/rejected": -0.5532902479171753, "logps/chosen": -61.260093688964844, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78728485107422, "logps/rejected": -83.11235809326172, "loss": 0.9742, "margin_dpo/margin_mean": 8.405787467956543, "margin_dpo/margin_std": 9.565013885498047, "step": 67 }, { "KL/chosen_KL_mean": -2.0800132751464844, "KL/mean": -6.116093635559082, "KL/rejected_KL_mean": -10.152172088623047, "KL/std": 7.245296478271484, "epoch": 0.09985315712187959, "fcm_dpo/beta": 0.06832877546548843, "fcm_dpo/delta": -0.16015967726707458, "fcm_dpo/margin": 8.072154998779297, "fcm_dpo/q_t": 0.37427279353141785, "grad_norm": 39.69685363769531, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6283758878707886, "logits/rejected": -0.5697811841964722, "logps/chosen": -67.28584289550781, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -87.35942077636719, "loss": 0.9769, "margin_dpo/margin_mean": 8.072154998779297, "margin_dpo/margin_std": 8.083388328552246, "step": 68 }, { "KL/chosen_KL_mean": -2.5830154418945312, "KL/mean": -7.418220520019531, "KL/rejected_KL_mean": -12.253425598144531, "KL/std": 8.492610931396484, "epoch": 0.1013215859030837, "fcm_dpo/beta": 0.06493359059095383, "fcm_dpo/delta": -0.24463072419166565, "fcm_dpo/margin": 9.670412063598633, "fcm_dpo/q_t": 0.35869020223617554, "grad_norm": 42.51207733154297, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.587549090385437, "logits/rejected": -0.5634763240814209, "logps/chosen": -62.40225601196289, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -115.64228820800781, "loss": 0.9315, "margin_dpo/margin_mean": 9.67041301727295, "margin_dpo/margin_std": 9.319877624511719, "step": 69 }, { "KL/chosen_KL_mean": -3.936086654663086, "KL/mean": -9.134437561035156, "KL/rejected_KL_mean": -14.33279037475586, "KL/std": 10.255716323852539, "epoch": 0.1027900146842878, "fcm_dpo/beta": 0.06210237741470337, "fcm_dpo/delta": -0.2620813846588135, "fcm_dpo/margin": 10.396703720092773, "fcm_dpo/q_t": 0.3578363060951233, "grad_norm": 42.537837982177734, "learning_rate": 5e-07, "logits/chosen": -0.6045395731925964, "logits/rejected": -0.5682834386825562, "logps/chosen": -65.86672973632812, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.06078338623047, "logps/rejected": -105.39356994628906, "loss": 0.9407, "margin_dpo/margin_mean": 10.396702766418457, "margin_dpo/margin_std": 10.94558048248291, "step": 70 }, { "KL/chosen_KL_mean": -3.512136459350586, "KL/mean": -9.568258285522461, "KL/rejected_KL_mean": -15.624378204345703, "KL/std": 10.597580909729004, "epoch": 0.10425844346549193, "fcm_dpo/beta": 0.05811074376106262, "fcm_dpo/delta": -0.32751208543777466, "fcm_dpo/margin": 12.11224365234375, "fcm_dpo/q_t": 0.34550318121910095, "grad_norm": 39.08191680908203, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6395025253295898, "logits/rejected": -0.5973723530769348, "logps/chosen": -65.2624740600586, "logps/ref_chosen": -61.750335693359375, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -112.96099853515625, "loss": 0.8977, "margin_dpo/margin_mean": 12.112241744995117, "margin_dpo/margin_std": 11.558072090148926, "step": 71 }, { "KL/chosen_KL_mean": -4.052457809448242, "KL/mean": -10.506906509399414, "KL/rejected_KL_mean": -16.96135711669922, "KL/std": 11.406668663024902, "epoch": 0.10572687224669604, "fcm_dpo/beta": 0.0543680340051651, "fcm_dpo/delta": -0.32582148909568787, "fcm_dpo/margin": 12.908900260925293, "fcm_dpo/q_t": 0.34477001428604126, "grad_norm": 39.94166564941406, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6469070911407471, "logits/rejected": -0.6107661128044128, "logps/chosen": -70.10586547851562, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -112.24834442138672, "loss": 0.9069, "margin_dpo/margin_mean": 12.908900260925293, "margin_dpo/margin_std": 13.00861930847168, "step": 72 }, { "KL/chosen_KL_mean": -5.937223434448242, "KL/mean": -12.432035446166992, "KL/rejected_KL_mean": -18.92684555053711, "KL/std": 15.173410415649414, "epoch": 0.10719530102790015, "fcm_dpo/beta": 0.05116545781493187, "fcm_dpo/delta": -0.28520524501800537, "fcm_dpo/margin": 12.989622116088867, "fcm_dpo/q_t": 0.36612752079963684, "grad_norm": 38.19060134887695, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6732739806175232, "logits/rejected": -0.6307477951049805, "logps/chosen": -72.19349670410156, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613098144531, "logps/rejected": -109.38298034667969, "loss": 1.0119, "margin_dpo/margin_mean": 12.989622116088867, "margin_dpo/margin_std": 18.78784942626953, "step": 73 }, { "KL/chosen_KL_mean": -6.64216423034668, "KL/mean": -13.723569869995117, "KL/rejected_KL_mean": -20.804977416992188, "KL/std": 16.866836547851562, "epoch": 0.10866372980910426, "fcm_dpo/beta": 0.048250216990709305, "fcm_dpo/delta": -0.3053615391254425, "fcm_dpo/margin": 14.162809371948242, "fcm_dpo/q_t": 0.3610179126262665, "grad_norm": 39.493221282958984, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6567627191543579, "logits/rejected": -0.6456471681594849, "logps/chosen": -60.067047119140625, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -116.75191497802734, "loss": 0.9834, "margin_dpo/margin_mean": 14.162809371948242, "margin_dpo/margin_std": 20.69675636291504, "step": 74 }, { "KL/chosen_KL_mean": -6.570465087890625, "KL/mean": -16.630451202392578, "KL/rejected_KL_mean": -26.6904296875, "KL/std": 17.67294692993164, "epoch": 0.11013215859030837, "fcm_dpo/beta": 0.04431544989347458, "fcm_dpo/delta": -0.5371890068054199, "fcm_dpo/margin": 20.119972229003906, "fcm_dpo/q_t": 0.31586113572120667, "grad_norm": 34.22724533081055, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6665393114089966, "logits/rejected": -0.656915009021759, "logps/chosen": -58.43212890625, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25398254394531, "logps/rejected": -137.9444122314453, "loss": 0.832, "margin_dpo/margin_mean": 20.119970321655273, "margin_dpo/margin_std": 19.77768325805664, "step": 75 }, { "KL/chosen_KL_mean": -8.186553955078125, "KL/mean": -14.74972915649414, "KL/rejected_KL_mean": -21.31290054321289, "KL/std": 14.5498685836792, "epoch": 0.11160058737151249, "fcm_dpo/beta": 0.041812874376773834, "fcm_dpo/delta": -0.1574609875679016, "fcm_dpo/margin": 13.126352310180664, "fcm_dpo/q_t": 0.3751528859138489, "grad_norm": 34.15813446044922, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.6963686943054199, "logits/rejected": -0.6531896591186523, "logps/chosen": -61.45259094238281, "logps/ref_chosen": -53.26603698730469, "logps/ref_rejected": -78.21662902832031, "logps/rejected": -99.52952575683594, "loss": 1.0058, "margin_dpo/margin_mean": 13.126352310180664, "margin_dpo/margin_std": 15.78203010559082, "step": 76 }, { "KL/chosen_KL_mean": -7.309167861938477, "KL/mean": -17.81867218017578, "KL/rejected_KL_mean": -28.328174591064453, "KL/std": 19.93675994873047, "epoch": 0.1130690161527166, "fcm_dpo/beta": 0.03852991759777069, "fcm_dpo/delta": -0.4501330256462097, "fcm_dpo/margin": 21.019004821777344, "fcm_dpo/q_t": 0.3251643776893616, "grad_norm": 34.96170425415039, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.6893630623817444, "logits/rejected": -0.6500611305236816, "logps/chosen": -65.40584564208984, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -122.10179138183594, "loss": 0.8772, "margin_dpo/margin_mean": 21.019004821777344, "margin_dpo/margin_std": 21.777103424072266, "step": 77 }, { "KL/chosen_KL_mean": -7.4875030517578125, "KL/mean": -15.621191024780273, "KL/rejected_KL_mean": -23.754878997802734, "KL/std": 16.25094223022461, "epoch": 0.1145374449339207, "fcm_dpo/beta": 0.03683791682124138, "fcm_dpo/delta": -0.21150818467140198, "fcm_dpo/margin": 16.267370223999023, "fcm_dpo/q_t": 0.36780738830566406, "grad_norm": 31.654874801635742, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6330477595329285, "logits/rejected": -0.6059073209762573, "logps/chosen": -63.101287841796875, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -108.68923950195312, "loss": 0.976, "margin_dpo/margin_mean": 16.267372131347656, "margin_dpo/margin_std": 18.412311553955078, "step": 78 }, { "KL/chosen_KL_mean": -7.873466491699219, "KL/mean": -16.800203323364258, "KL/rejected_KL_mean": -25.726943969726562, "KL/std": 18.54357147216797, "epoch": 0.11600587371512482, "fcm_dpo/beta": 0.03503450006246567, "fcm_dpo/delta": -0.24067077040672302, "fcm_dpo/margin": 17.853469848632812, "fcm_dpo/q_t": 0.36850211024284363, "grad_norm": 28.002782821655273, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.67842698097229, "logits/rejected": -0.6490979194641113, "logps/chosen": -63.323951721191406, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -113.37451171875, "loss": 0.9887, "margin_dpo/margin_mean": 17.85346794128418, "margin_dpo/margin_std": 22.688926696777344, "step": 79 }, { "KL/chosen_KL_mean": -10.66299057006836, "KL/mean": -18.957509994506836, "KL/rejected_KL_mean": -27.252025604248047, "KL/std": 19.518186569213867, "epoch": 0.11747430249632893, "fcm_dpo/beta": 0.03348912298679352, "fcm_dpo/delta": -0.16750264167785645, "fcm_dpo/margin": 16.589031219482422, "fcm_dpo/q_t": 0.3835224509239197, "grad_norm": 30.331220626831055, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.7170759439468384, "logits/rejected": -0.6725642085075378, "logps/chosen": -69.18228149414062, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -114.79953002929688, "loss": 1.0334, "margin_dpo/margin_mean": 16.589031219482422, "margin_dpo/margin_std": 22.471614837646484, "step": 80 }, { "KL/chosen_KL_mean": -9.806392669677734, "KL/mean": -21.405792236328125, "KL/rejected_KL_mean": -33.00519561767578, "KL/std": 23.604204177856445, "epoch": 0.11894273127753303, "fcm_dpo/beta": 0.03153174743056297, "fcm_dpo/delta": -0.3608952760696411, "fcm_dpo/margin": 23.198793411254883, "fcm_dpo/q_t": 0.347603440284729, "grad_norm": 30.84009552001953, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.710389256477356, "logits/rejected": -0.679502010345459, "logps/chosen": -76.25526428222656, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -162.66790771484375, "loss": 0.9328, "margin_dpo/margin_mean": 23.198793411254883, "margin_dpo/margin_std": 27.952896118164062, "step": 81 }, { "KL/chosen_KL_mean": -12.119726181030273, "KL/mean": -21.315797805786133, "KL/rejected_KL_mean": -30.51186752319336, "KL/std": 21.45583724975586, "epoch": 0.12041116005873716, "fcm_dpo/beta": 0.030675500631332397, "fcm_dpo/delta": -0.17443646490573883, "fcm_dpo/margin": 18.392139434814453, "fcm_dpo/q_t": 0.38255757093429565, "grad_norm": 33.19075012207031, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6418750286102295, "logits/rejected": -0.628775954246521, "logps/chosen": -64.35211181640625, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -121.25511932373047, "loss": 1.0751, "margin_dpo/margin_mean": 18.392139434814453, "margin_dpo/margin_std": 29.462291717529297, "step": 82 }, { "KL/chosen_KL_mean": -12.515398025512695, "KL/mean": -23.42616081237793, "KL/rejected_KL_mean": -34.3369255065918, "KL/std": 22.522113800048828, "epoch": 0.12187958883994127, "fcm_dpo/beta": 0.028894957154989243, "fcm_dpo/delta": -0.2471744269132614, "fcm_dpo/margin": 21.821533203125, "fcm_dpo/q_t": 0.3641561269760132, "grad_norm": 31.546810150146484, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6872934103012085, "logits/rejected": -0.6507028937339783, "logps/chosen": -68.34278869628906, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71589660644531, "logps/rejected": -138.05282592773438, "loss": 0.978, "margin_dpo/margin_mean": 21.821533203125, "margin_dpo/margin_std": 26.563232421875, "step": 83 }, { "KL/chosen_KL_mean": -11.519216537475586, "KL/mean": -21.042236328125, "KL/rejected_KL_mean": -30.565250396728516, "KL/std": 19.391069412231445, "epoch": 0.12334801762114538, "fcm_dpo/beta": 0.027821559458971024, "fcm_dpo/delta": -0.13951639831066132, "fcm_dpo/margin": 19.046039581298828, "fcm_dpo/q_t": 0.3775210976600647, "grad_norm": 27.340347290039062, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6515509486198425, "logits/rejected": -0.6115979552268982, "logps/chosen": -78.69538879394531, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -117.86385345458984, "loss": 0.9982, "margin_dpo/margin_mean": 19.046039581298828, "margin_dpo/margin_std": 19.851577758789062, "step": 84 }, { "KL/chosen_KL_mean": -12.147787094116211, "KL/mean": -21.90998077392578, "KL/rejected_KL_mean": -31.67218017578125, "KL/std": 20.009681701660156, "epoch": 0.12481644640234948, "fcm_dpo/beta": 0.02746494486927986, "fcm_dpo/delta": -0.14406868815422058, "fcm_dpo/margin": 19.524394989013672, "fcm_dpo/q_t": 0.38153764605522156, "grad_norm": 27.906219482421875, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6355807185173035, "logits/rejected": -0.6021965742111206, "logps/chosen": -70.55440521240234, "logps/ref_chosen": -58.4066162109375, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -110.31098175048828, "loss": 1.0232, "margin_dpo/margin_mean": 19.524394989013672, "margin_dpo/margin_std": 23.956645965576172, "step": 85 }, { "KL/chosen_KL_mean": -15.925954818725586, "KL/mean": -26.277530670166016, "KL/rejected_KL_mean": -36.62910461425781, "KL/std": 27.010652542114258, "epoch": 0.1262848751835536, "fcm_dpo/beta": 0.026533078402280807, "fcm_dpo/delta": -0.15769629180431366, "fcm_dpo/margin": 20.703153610229492, "fcm_dpo/q_t": 0.393225759267807, "grad_norm": 31.247163772583008, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6769875288009644, "logits/rejected": -0.6635218858718872, "logps/chosen": -72.06341552734375, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -124.7507553100586, "loss": 1.1039, "margin_dpo/margin_mean": 20.703155517578125, "margin_dpo/margin_std": 37.23381042480469, "step": 86 }, { "KL/chosen_KL_mean": -15.697734832763672, "KL/mean": -27.728618621826172, "KL/rejected_KL_mean": -39.7595100402832, "KL/std": 26.763015747070312, "epoch": 0.1277533039647577, "fcm_dpo/beta": 0.025222256779670715, "fcm_dpo/delta": -0.22208669781684875, "fcm_dpo/margin": 24.061767578125, "fcm_dpo/q_t": 0.3686726689338684, "grad_norm": 26.473875045776367, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.7123448252677917, "logits/rejected": -0.6856144666671753, "logps/chosen": -71.33383178710938, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -135.22708129882812, "loss": 1.0143, "margin_dpo/margin_mean": 24.061767578125, "margin_dpo/margin_std": 32.34611892700195, "step": 87 }, { "KL/chosen_KL_mean": -19.72933578491211, "KL/mean": -28.825397491455078, "KL/rejected_KL_mean": -37.921451568603516, "KL/std": 27.96208953857422, "epoch": 0.12922173274596183, "fcm_dpo/beta": 0.02473517321050167, "fcm_dpo/delta": -0.05327114462852478, "fcm_dpo/margin": 18.192121505737305, "fcm_dpo/q_t": 0.40377742052078247, "grad_norm": 27.7163028717041, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.6769958734512329, "logits/rejected": -0.6691812872886658, "logps/chosen": -93.40048217773438, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -144.62994384765625, "loss": 1.118, "margin_dpo/margin_mean": 18.192121505737305, "margin_dpo/margin_std": 29.348196029663086, "step": 88 }, { "KL/chosen_KL_mean": -12.14109992980957, "KL/mean": -22.370540618896484, "KL/rejected_KL_mean": -32.59998321533203, "KL/std": 23.696063995361328, "epoch": 0.13069016152716592, "fcm_dpo/beta": 0.024530138820409775, "fcm_dpo/delta": -0.10719307512044907, "fcm_dpo/margin": 20.458881378173828, "fcm_dpo/q_t": 0.3884883522987366, "grad_norm": 25.34421730041504, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.7158247232437134, "logits/rejected": -0.6819084882736206, "logps/chosen": -72.7660140991211, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -114.68353271484375, "loss": 1.0395, "margin_dpo/margin_mean": 20.45888328552246, "margin_dpo/margin_std": 26.786773681640625, "step": 89 }, { "KL/chosen_KL_mean": -15.632135391235352, "KL/mean": -29.54084014892578, "KL/rejected_KL_mean": -43.449546813964844, "KL/std": 32.48262023925781, "epoch": 0.13215859030837004, "fcm_dpo/beta": 0.02354896441102028, "fcm_dpo/delta": -0.2722369432449341, "fcm_dpo/margin": 27.817413330078125, "fcm_dpo/q_t": 0.3705536723136902, "grad_norm": 27.808216094970703, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6693460941314697, "logits/rejected": -0.6799524426460266, "logps/chosen": -68.91744995117188, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -154.9942626953125, "loss": 1.0206, "margin_dpo/margin_mean": 27.817413330078125, "margin_dpo/margin_std": 40.34447479248047, "step": 90 }, { "KL/chosen_KL_mean": -16.3182430267334, "KL/mean": -27.830875396728516, "KL/rejected_KL_mean": -39.34351348876953, "KL/std": 26.04579734802246, "epoch": 0.13362701908957417, "fcm_dpo/beta": 0.022640112787485123, "fcm_dpo/delta": -0.1277696192264557, "fcm_dpo/margin": 23.0252685546875, "fcm_dpo/q_t": 0.38871896266937256, "grad_norm": 25.545448303222656, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.6979465484619141, "logits/rejected": -0.6646697521209717, "logps/chosen": -78.12120056152344, "logps/ref_chosen": -61.802955627441406, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -127.21746826171875, "loss": 1.059, "margin_dpo/margin_mean": 23.025266647338867, "margin_dpo/margin_std": 33.728050231933594, "step": 91 }, { "KL/chosen_KL_mean": -14.809152603149414, "KL/mean": -26.451236724853516, "KL/rejected_KL_mean": -38.093318939208984, "KL/std": 26.83241081237793, "epoch": 0.13509544787077826, "fcm_dpo/beta": 0.022130444645881653, "fcm_dpo/delta": -0.12148790061473846, "fcm_dpo/margin": 23.284168243408203, "fcm_dpo/q_t": 0.38861751556396484, "grad_norm": 23.78729248046875, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.7112252712249756, "logits/rejected": -0.6874991655349731, "logps/chosen": -66.44992065429688, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -115.97450256347656, "loss": 1.0518, "margin_dpo/margin_mean": 23.284168243408203, "margin_dpo/margin_std": 32.566123962402344, "step": 92 }, { "KL/chosen_KL_mean": -16.61697006225586, "KL/mean": -28.332927703857422, "KL/rejected_KL_mean": -40.04888153076172, "KL/std": 25.330059051513672, "epoch": 0.13656387665198239, "fcm_dpo/beta": 0.021535798907279968, "fcm_dpo/delta": -0.11011452972888947, "fcm_dpo/margin": 23.431915283203125, "fcm_dpo/q_t": 0.3877296447753906, "grad_norm": 24.3586368560791, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.7280140519142151, "logits/rejected": -0.676377534866333, "logps/chosen": -69.14620971679688, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.16075134277344, "logps/rejected": -117.20962524414062, "loss": 1.0277, "margin_dpo/margin_mean": 23.431915283203125, "margin_dpo/margin_std": 29.139026641845703, "step": 93 }, { "KL/chosen_KL_mean": -17.951906204223633, "KL/mean": -31.419082641601562, "KL/rejected_KL_mean": -44.88625717163086, "KL/std": 29.03601837158203, "epoch": 0.13803230543318648, "fcm_dpo/beta": 0.020746299996972084, "fcm_dpo/delta": -0.1698264628648758, "fcm_dpo/margin": 26.934350967407227, "fcm_dpo/q_t": 0.3750302195549011, "grad_norm": 24.17359733581543, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6785788536071777, "logits/rejected": -0.6629636287689209, "logps/chosen": -79.17451477050781, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -144.48529052734375, "loss": 0.9928, "margin_dpo/margin_mean": 26.934350967407227, "margin_dpo/margin_std": 30.304607391357422, "step": 94 }, { "KL/chosen_KL_mean": -16.976377487182617, "KL/mean": -27.94607162475586, "KL/rejected_KL_mean": -38.91576385498047, "KL/std": 27.511489868164062, "epoch": 0.1395007342143906, "fcm_dpo/beta": 0.02045309543609619, "fcm_dpo/delta": -0.05145730823278427, "fcm_dpo/margin": 21.939393997192383, "fcm_dpo/q_t": 0.3971632122993469, "grad_norm": 22.47222900390625, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.7069982290267944, "logits/rejected": -0.6784754991531372, "logps/chosen": -69.5000228881836, "logps/ref_chosen": -52.523643493652344, "logps/ref_rejected": -75.8803482055664, "logps/rejected": -114.79611206054688, "loss": 1.0678, "margin_dpo/margin_mean": 21.939393997192383, "margin_dpo/margin_std": 28.80935287475586, "step": 95 }, { "KL/chosen_KL_mean": -16.412687301635742, "KL/mean": -31.368961334228516, "KL/rejected_KL_mean": -46.32524108886719, "KL/std": 31.75434112548828, "epoch": 0.14096916299559473, "fcm_dpo/beta": 0.019721299409866333, "fcm_dpo/delta": -0.20427075028419495, "fcm_dpo/margin": 29.912555694580078, "fcm_dpo/q_t": 0.37095409631729126, "grad_norm": 23.20262336730957, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.7112823128700256, "logits/rejected": -0.6814401149749756, "logps/chosen": -78.56965637207031, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -142.9212646484375, "loss": 0.9801, "margin_dpo/margin_mean": 29.912555694580078, "margin_dpo/margin_std": 34.870269775390625, "step": 96 }, { "KL/chosen_KL_mean": -17.793996810913086, "KL/mean": -29.294153213500977, "KL/rejected_KL_mean": -40.7943115234375, "KL/std": 26.314815521240234, "epoch": 0.14243759177679882, "fcm_dpo/beta": 0.019657842814922333, "fcm_dpo/delta": -0.05561104789376259, "fcm_dpo/margin": 23.000316619873047, "fcm_dpo/q_t": 0.395273894071579, "grad_norm": 23.92318344116211, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6561405658721924, "logits/rejected": -0.6128396987915039, "logps/chosen": -72.44036102294922, "logps/ref_chosen": -54.646366119384766, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -117.75906372070312, "loss": 1.0535, "margin_dpo/margin_mean": 23.000316619873047, "margin_dpo/margin_std": 26.310195922851562, "step": 97 }, { "KL/chosen_KL_mean": -21.98511505126953, "KL/mean": -35.2052001953125, "KL/rejected_KL_mean": -48.42529296875, "KL/std": 30.05870819091797, "epoch": 0.14390602055800295, "fcm_dpo/beta": 0.019086042419075966, "fcm_dpo/delta": -0.11044582724571228, "fcm_dpo/margin": 26.44017791748047, "fcm_dpo/q_t": 0.3849552869796753, "grad_norm": 24.65069580078125, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6879181265830994, "logits/rejected": -0.6715967059135437, "logps/chosen": -87.24374389648438, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -144.95278930664062, "loss": 1.0538, "margin_dpo/margin_mean": 26.44017791748047, "margin_dpo/margin_std": 36.05833053588867, "step": 98 }, { "KL/chosen_KL_mean": -17.362274169921875, "KL/mean": -33.471343994140625, "KL/rejected_KL_mean": -49.580406188964844, "KL/std": 32.57749938964844, "epoch": 0.14537444933920704, "fcm_dpo/beta": 0.01838843896985054, "fcm_dpo/delta": -0.20529845356941223, "fcm_dpo/margin": 32.21813201904297, "fcm_dpo/q_t": 0.373318076133728, "grad_norm": 21.651269912719727, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.6547946929931641, "logits/rejected": -0.6550130248069763, "logps/chosen": -63.00075912475586, "logps/ref_chosen": -45.638484954833984, "logps/ref_rejected": -86.43793487548828, "logps/rejected": -136.01834106445312, "loss": 0.9905, "margin_dpo/margin_mean": 32.21813201904297, "margin_dpo/margin_std": 40.488311767578125, "step": 99 }, { "KL/chosen_KL_mean": -19.682992935180664, "KL/mean": -29.148677825927734, "KL/rejected_KL_mean": -38.6143684387207, "KL/std": 26.299453735351562, "epoch": 0.14684287812041116, "fcm_dpo/beta": 0.01846114918589592, "fcm_dpo/delta": 0.05179622396826744, "fcm_dpo/margin": 18.93136978149414, "fcm_dpo/q_t": 0.42105787992477417, "grad_norm": 24.000566482543945, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6635209321975708, "logits/rejected": -0.6204158663749695, "logps/chosen": -77.2769775390625, "logps/ref_chosen": -57.59397888183594, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -112.67457580566406, "loss": 1.1631, "margin_dpo/margin_mean": 18.93136978149414, "margin_dpo/margin_std": 33.863746643066406, "step": 100 }, { "KL/chosen_KL_mean": -24.127986907958984, "KL/mean": -35.636356353759766, "KL/rejected_KL_mean": -47.14472198486328, "KL/std": 32.51232147216797, "epoch": 0.14831130690161526, "fcm_dpo/beta": 0.018270574510097504, "fcm_dpo/delta": -0.022218167781829834, "fcm_dpo/margin": 23.0167293548584, "fcm_dpo/q_t": 0.4101085960865021, "grad_norm": 23.815645217895508, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.696588933467865, "logits/rejected": -0.6663883924484253, "logps/chosen": -85.77684020996094, "logps/ref_chosen": -61.64885330200195, "logps/ref_rejected": -83.18968200683594, "logps/rejected": -130.33441162109375, "loss": 1.125, "margin_dpo/margin_mean": 23.01673126220703, "margin_dpo/margin_std": 39.10064697265625, "step": 101 }, { "KL/chosen_KL_mean": -26.22239112854004, "KL/mean": -34.66426086425781, "KL/rejected_KL_mean": -43.10613250732422, "KL/std": 30.4959659576416, "epoch": 0.14977973568281938, "fcm_dpo/beta": 0.018309336155653, "fcm_dpo/delta": -0.024285031482577324, "fcm_dpo/margin": 16.883745193481445, "fcm_dpo/q_t": 0.43124186992645264, "grad_norm": 26.869401931762695, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.7078909873962402, "logits/rejected": -0.6638644933700562, "logps/chosen": -90.30126953125, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -111.293212890625, "loss": 1.2061, "margin_dpo/margin_mean": 16.883745193481445, "margin_dpo/margin_std": 35.99235534667969, "step": 102 }, { "KL/chosen_KL_mean": -20.61368179321289, "KL/mean": -37.49169158935547, "KL/rejected_KL_mean": -54.369693756103516, "KL/std": 33.84938049316406, "epoch": 0.1512481644640235, "fcm_dpo/beta": 0.017673712223768234, "fcm_dpo/delta": -0.2095203697681427, "fcm_dpo/margin": 33.75600814819336, "fcm_dpo/q_t": 0.37030476331710815, "grad_norm": 23.006120681762695, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6496819257736206, "logits/rejected": -0.6131845116615295, "logps/chosen": -81.9129638671875, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57270812988281, "logps/rejected": -147.94241333007812, "loss": 0.9829, "margin_dpo/margin_mean": 33.756011962890625, "margin_dpo/margin_std": 40.593528747558594, "step": 103 }, { "KL/chosen_KL_mean": -22.588258743286133, "KL/mean": -38.630088806152344, "KL/rejected_KL_mean": -54.67192077636719, "KL/std": 37.43622589111328, "epoch": 0.1527165932452276, "fcm_dpo/beta": 0.017140310257673264, "fcm_dpo/delta": -0.15842567384243011, "fcm_dpo/margin": 32.083656311035156, "fcm_dpo/q_t": 0.3846966028213501, "grad_norm": 22.44390296936035, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.7081250548362732, "logits/rejected": -0.6775051355361938, "logps/chosen": -76.96102905273438, "logps/ref_chosen": -54.372772216796875, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -144.23663330078125, "loss": 1.0402, "margin_dpo/margin_mean": 32.083656311035156, "margin_dpo/margin_std": 45.53196334838867, "step": 104 }, { "KL/chosen_KL_mean": -21.200937271118164, "KL/mean": -44.030094146728516, "KL/rejected_KL_mean": -66.8592529296875, "KL/std": 36.557212829589844, "epoch": 0.15418502202643172, "fcm_dpo/beta": 0.016101296991109848, "fcm_dpo/delta": -0.36145851016044617, "fcm_dpo/margin": 45.65831756591797, "fcm_dpo/q_t": 0.33412182331085205, "grad_norm": 23.292463302612305, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.6853151321411133, "logits/rejected": -0.6624854803085327, "logps/chosen": -75.83988189697266, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -164.832763671875, "loss": 0.8698, "margin_dpo/margin_mean": 45.65831756591797, "margin_dpo/margin_std": 40.13270568847656, "step": 105 }, { "KL/chosen_KL_mean": -23.751272201538086, "KL/mean": -38.02503967285156, "KL/rejected_KL_mean": -52.298805236816406, "KL/std": 31.860212326049805, "epoch": 0.15565345080763582, "fcm_dpo/beta": 0.015641074627637863, "fcm_dpo/delta": -0.04885800927877426, "fcm_dpo/margin": 28.547529220581055, "fcm_dpo/q_t": 0.3972783088684082, "grad_norm": 22.089712142944336, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6288084983825684, "logits/rejected": -0.5959875583648682, "logps/chosen": -78.58416748046875, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -137.52342224121094, "loss": 1.0582, "margin_dpo/margin_mean": 28.547531127929688, "margin_dpo/margin_std": 34.79780197143555, "step": 106 }, { "KL/chosen_KL_mean": -28.478961944580078, "KL/mean": -44.62788009643555, "KL/rejected_KL_mean": -60.77680206298828, "KL/std": 38.920860290527344, "epoch": 0.15712187958883994, "fcm_dpo/beta": 0.015334920957684517, "fcm_dpo/delta": -0.10102778673171997, "fcm_dpo/margin": 32.29783630371094, "fcm_dpo/q_t": 0.3877396583557129, "grad_norm": 21.205881118774414, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.6800429821014404, "logits/rejected": -0.6334393620491028, "logps/chosen": -98.18677520751953, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -155.5163116455078, "loss": 1.0433, "margin_dpo/margin_mean": 32.29783630371094, "margin_dpo/margin_std": 41.090171813964844, "step": 107 }, { "KL/chosen_KL_mean": -25.73700523376465, "KL/mean": -44.92538833618164, "KL/rejected_KL_mean": -64.11376953125, "KL/std": 42.71458435058594, "epoch": 0.15859030837004406, "fcm_dpo/beta": 0.014912154525518417, "fcm_dpo/delta": -0.18271130323410034, "fcm_dpo/margin": 38.376766204833984, "fcm_dpo/q_t": 0.3810121417045593, "grad_norm": 21.520112991333008, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.6687978506088257, "logits/rejected": -0.646253228187561, "logps/chosen": -81.74688720703125, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -159.9097900390625, "loss": 1.039, "margin_dpo/margin_mean": 38.376766204833984, "margin_dpo/margin_std": 55.44823455810547, "step": 108 }, { "KL/chosen_KL_mean": -24.07698631286621, "KL/mean": -44.76786804199219, "KL/rejected_KL_mean": -65.45874786376953, "KL/std": 41.37312316894531, "epoch": 0.16005873715124816, "fcm_dpo/beta": 0.014279071241617203, "fcm_dpo/delta": -0.20386075973510742, "fcm_dpo/margin": 41.38175964355469, "fcm_dpo/q_t": 0.3684207797050476, "grad_norm": 21.89470100402832, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.6142909526824951, "logits/rejected": -0.5930590629577637, "logps/chosen": -86.96247863769531, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -164.1444854736328, "loss": 0.9868, "margin_dpo/margin_mean": 41.38175964355469, "margin_dpo/margin_std": 48.550384521484375, "step": 109 }, { "KL/chosen_KL_mean": -24.62155532836914, "KL/mean": -43.333526611328125, "KL/rejected_KL_mean": -62.04550552368164, "KL/std": 41.19173049926758, "epoch": 0.16152716593245228, "fcm_dpo/beta": 0.013750969432294369, "fcm_dpo/delta": -0.12534289062023163, "fcm_dpo/margin": 37.42394256591797, "fcm_dpo/q_t": 0.3864942193031311, "grad_norm": 19.2232608795166, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.608720064163208, "logits/rejected": -0.5711803436279297, "logps/chosen": -83.375244140625, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -141.79551696777344, "loss": 1.0488, "margin_dpo/margin_mean": 37.42394256591797, "margin_dpo/margin_std": 49.388160705566406, "step": 110 }, { "KL/chosen_KL_mean": -28.314008712768555, "KL/mean": -49.46540832519531, "KL/rejected_KL_mean": -70.61680603027344, "KL/std": 46.69132995605469, "epoch": 0.16299559471365638, "fcm_dpo/beta": 0.01339393574744463, "fcm_dpo/delta": -0.17819947004318237, "fcm_dpo/margin": 42.30280303955078, "fcm_dpo/q_t": 0.37644529342651367, "grad_norm": 21.545623779296875, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.6583347320556641, "logits/rejected": -0.6357636451721191, "logps/chosen": -96.9381103515625, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -169.04566955566406, "loss": 1.0303, "margin_dpo/margin_mean": 42.30280303955078, "margin_dpo/margin_std": 57.269569396972656, "step": 111 }, { "KL/chosen_KL_mean": -26.175884246826172, "KL/mean": -39.6971435546875, "KL/rejected_KL_mean": -53.2183952331543, "KL/std": 32.65059280395508, "epoch": 0.1644640234948605, "fcm_dpo/beta": 0.0133673045784235, "fcm_dpo/delta": 0.03976195305585861, "fcm_dpo/margin": 27.04251480102539, "fcm_dpo/q_t": 0.41706275939941406, "grad_norm": 19.833799362182617, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6356754302978516, "logits/rejected": -0.5966525077819824, "logps/chosen": -76.42552185058594, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -117.99282836914062, "loss": 1.1238, "margin_dpo/margin_mean": 27.042512893676758, "margin_dpo/margin_std": 39.28888702392578, "step": 112 }, { "KL/chosen_KL_mean": -32.902503967285156, "KL/mean": -47.536865234375, "KL/rejected_KL_mean": -62.17121887207031, "KL/std": 34.14165496826172, "epoch": 0.16593245227606462, "fcm_dpo/beta": 0.013465975411236286, "fcm_dpo/delta": 0.006106908433139324, "fcm_dpo/margin": 29.26871109008789, "fcm_dpo/q_t": 0.40915369987487793, "grad_norm": 20.532014846801758, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.5950823426246643, "logits/rejected": -0.5499939322471619, "logps/chosen": -99.6154556274414, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -140.13992309570312, "loss": 1.0911, "margin_dpo/margin_mean": 29.26871109008789, "margin_dpo/margin_std": 37.83106231689453, "step": 113 }, { "KL/chosen_KL_mean": -30.239316940307617, "KL/mean": -53.44865417480469, "KL/rejected_KL_mean": -76.65798950195312, "KL/std": 51.340576171875, "epoch": 0.16740088105726872, "fcm_dpo/beta": 0.012956779450178146, "fcm_dpo/delta": -0.21660971641540527, "fcm_dpo/margin": 46.41866683959961, "fcm_dpo/q_t": 0.3740847706794739, "grad_norm": 21.725406646728516, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.6112878918647766, "logits/rejected": -0.5779776573181152, "logps/chosen": -88.02439880371094, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -163.7676544189453, "loss": 0.9983, "margin_dpo/margin_mean": 46.41866683959961, "margin_dpo/margin_std": 62.333763122558594, "step": 114 }, { "KL/chosen_KL_mean": -39.5991325378418, "KL/mean": -55.670440673828125, "KL/rejected_KL_mean": -71.74173736572266, "KL/std": 49.06171417236328, "epoch": 0.16886930983847284, "fcm_dpo/beta": 0.01290312223136425, "fcm_dpo/delta": -0.015496611595153809, "fcm_dpo/margin": 32.142608642578125, "fcm_dpo/q_t": 0.41342228651046753, "grad_norm": 31.30410385131836, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.6504275798797607, "logits/rejected": -0.6302141547203064, "logps/chosen": -105.18177795410156, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -170.3072509765625, "loss": 1.163, "margin_dpo/margin_mean": 32.142608642578125, "margin_dpo/margin_std": 64.65020751953125, "step": 115 }, { "KL/chosen_KL_mean": -29.908945083618164, "KL/mean": -47.60071563720703, "KL/rejected_KL_mean": -65.29248809814453, "KL/std": 41.23687744140625, "epoch": 0.17033773861967694, "fcm_dpo/beta": 0.012806812301278114, "fcm_dpo/delta": -0.05582479387521744, "fcm_dpo/margin": 35.38352966308594, "fcm_dpo/q_t": 0.397646963596344, "grad_norm": 21.841028213500977, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.6186249256134033, "logits/rejected": -0.608126163482666, "logps/chosen": -81.30925750732422, "logps/ref_chosen": -51.40031433105469, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -145.8143310546875, "loss": 1.0577, "margin_dpo/margin_mean": 35.38352966308594, "margin_dpo/margin_std": 44.40971374511719, "step": 116 }, { "KL/chosen_KL_mean": -38.531761169433594, "KL/mean": -52.92847442626953, "KL/rejected_KL_mean": -67.3251953125, "KL/std": 44.12018585205078, "epoch": 0.17180616740088106, "fcm_dpo/beta": 0.012780335731804371, "fcm_dpo/delta": 0.03322294354438782, "fcm_dpo/margin": 28.793424606323242, "fcm_dpo/q_t": 0.41756001114845276, "grad_norm": 28.672874450683594, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.590896487236023, "logits/rejected": -0.5536011457443237, "logps/chosen": -107.83016967773438, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.583984375, "logps/rejected": -133.9091796875, "loss": 1.1456, "margin_dpo/margin_mean": 28.793426513671875, "margin_dpo/margin_std": 50.167694091796875, "step": 117 }, { "KL/chosen_KL_mean": -30.69570541381836, "KL/mean": -48.43621826171875, "KL/rejected_KL_mean": -66.17672729492188, "KL/std": 40.15031814575195, "epoch": 0.17327459618208516, "fcm_dpo/beta": 0.01270340196788311, "fcm_dpo/delta": -0.05310482531785965, "fcm_dpo/margin": 35.48102569580078, "fcm_dpo/q_t": 0.3987045884132385, "grad_norm": 21.261783599853516, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.6285079717636108, "logits/rejected": -0.6085944771766663, "logps/chosen": -86.33668518066406, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905975341797, "logps/rejected": -141.84579467773438, "loss": 1.0621, "margin_dpo/margin_mean": 35.48102569580078, "margin_dpo/margin_std": 45.933509826660156, "step": 118 }, { "KL/chosen_KL_mean": -42.2201042175293, "KL/mean": -59.702537536621094, "KL/rejected_KL_mean": -77.18496704101562, "KL/std": 45.075523376464844, "epoch": 0.17474302496328928, "fcm_dpo/beta": 0.012506300583481789, "fcm_dpo/delta": -0.04011544585227966, "fcm_dpo/margin": 34.964866638183594, "fcm_dpo/q_t": 0.4051211178302765, "grad_norm": 23.512651443481445, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.5862429738044739, "logits/rejected": -0.5710224509239197, "logps/chosen": -115.73030090332031, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.977294921875, "logps/rejected": -180.16226196289062, "loss": 1.1028, "margin_dpo/margin_mean": 34.964866638183594, "margin_dpo/margin_std": 54.86541748046875, "step": 119 }, { "KL/chosen_KL_mean": -43.40005111694336, "KL/mean": -67.5398941040039, "KL/rejected_KL_mean": -91.67974853515625, "KL/std": 56.29936218261719, "epoch": 0.1762114537444934, "fcm_dpo/beta": 0.012185569852590561, "fcm_dpo/delta": -0.20024745166301727, "fcm_dpo/margin": 48.279685974121094, "fcm_dpo/q_t": 0.3711671531200409, "grad_norm": 22.41628074645996, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.629610002040863, "logits/rejected": -0.5978201031684875, "logps/chosen": -120.1808853149414, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -199.7034912109375, "loss": 0.9945, "margin_dpo/margin_mean": 48.27968215942383, "margin_dpo/margin_std": 59.22111892700195, "step": 120 }, { "KL/chosen_KL_mean": -41.76482391357422, "KL/mean": -68.51100158691406, "KL/rejected_KL_mean": -95.25717163085938, "KL/std": 55.03094482421875, "epoch": 0.1776798825256975, "fcm_dpo/beta": 0.011589834466576576, "fcm_dpo/delta": -0.23640641570091248, "fcm_dpo/margin": 53.492347717285156, "fcm_dpo/q_t": 0.36686164140701294, "grad_norm": 25.108898162841797, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.6011873483657837, "logits/rejected": -0.5975435972213745, "logps/chosen": -103.55471801757812, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -205.25173950195312, "loss": 0.996, "margin_dpo/margin_mean": 53.492347717285156, "margin_dpo/margin_std": 68.95616912841797, "step": 121 }, { "KL/chosen_KL_mean": -37.74582290649414, "KL/mean": -71.00506591796875, "KL/rejected_KL_mean": -104.26431274414062, "KL/std": 62.45606994628906, "epoch": 0.17914831130690162, "fcm_dpo/beta": 0.010934034362435341, "fcm_dpo/delta": -0.352782666683197, "fcm_dpo/margin": 66.51848602294922, "fcm_dpo/q_t": 0.3402714133262634, "grad_norm": 23.686704635620117, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.5722811222076416, "logits/rejected": -0.5880405306816101, "logps/chosen": -84.64803314208984, "logps/ref_chosen": -46.9022102355957, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -210.97850036621094, "loss": 0.893, "margin_dpo/margin_mean": 66.51847839355469, "margin_dpo/margin_std": 65.36463928222656, "step": 122 }, { "KL/chosen_KL_mean": -41.76447296142578, "KL/mean": -63.59715270996094, "KL/rejected_KL_mean": -85.42982482910156, "KL/std": 53.64918518066406, "epoch": 0.18061674008810572, "fcm_dpo/beta": 0.010566072538495064, "fcm_dpo/delta": -0.06529982388019562, "fcm_dpo/margin": 43.66535186767578, "fcm_dpo/q_t": 0.40092289447784424, "grad_norm": 20.792469024658203, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.5638459920883179, "logits/rejected": -0.5461542010307312, "logps/chosen": -103.10311126708984, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.775390625, "logps/rejected": -173.20523071289062, "loss": 1.1066, "margin_dpo/margin_mean": 43.66535186767578, "margin_dpo/margin_std": 71.86073303222656, "step": 123 }, { "KL/chosen_KL_mean": -47.43387222290039, "KL/mean": -79.43309020996094, "KL/rejected_KL_mean": -111.43231201171875, "KL/std": 73.24830627441406, "epoch": 0.18208516886930984, "fcm_dpo/beta": 0.010147863999009132, "fcm_dpo/delta": -0.2676956057548523, "fcm_dpo/margin": 63.99842834472656, "fcm_dpo/q_t": 0.3697102665901184, "grad_norm": 22.633270263671875, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.5460699796676636, "logits/rejected": -0.5479286909103394, "logps/chosen": -118.8822021484375, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -229.01287841796875, "loss": 1.0168, "margin_dpo/margin_mean": 63.998435974121094, "margin_dpo/margin_std": 92.30806732177734, "step": 124 }, { "KL/chosen_KL_mean": -39.13515090942383, "KL/mean": -67.69670104980469, "KL/rejected_KL_mean": -96.25823974609375, "KL/std": 61.70295333862305, "epoch": 0.18355359765051396, "fcm_dpo/beta": 0.009806671179831028, "fcm_dpo/delta": -0.1694386899471283, "fcm_dpo/margin": 57.12309646606445, "fcm_dpo/q_t": 0.3796992301940918, "grad_norm": 19.055896759033203, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.5038829445838928, "logits/rejected": -0.5049155950546265, "logps/chosen": -89.2720947265625, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -180.24685668945312, "loss": 1.0252, "margin_dpo/margin_mean": 57.12309646606445, "margin_dpo/margin_std": 76.22639465332031, "step": 125 }, { "KL/chosen_KL_mean": -42.27728271484375, "KL/mean": -68.87530517578125, "KL/rejected_KL_mean": -95.47331237792969, "KL/std": 55.436668395996094, "epoch": 0.18502202643171806, "fcm_dpo/beta": 0.009523214772343636, "fcm_dpo/delta": -0.11256399005651474, "fcm_dpo/margin": 53.19603729248047, "fcm_dpo/q_t": 0.38864102959632874, "grad_norm": 20.268415451049805, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.5412114858627319, "logits/rejected": -0.5357469320297241, "logps/chosen": -97.94435119628906, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -193.6030731201172, "loss": 1.0422, "margin_dpo/margin_mean": 53.1960334777832, "margin_dpo/margin_std": 70.52363586425781, "step": 126 }, { "KL/chosen_KL_mean": -41.10186767578125, "KL/mean": -60.70246887207031, "KL/rejected_KL_mean": -80.30307006835938, "KL/std": 48.769588470458984, "epoch": 0.18649045521292218, "fcm_dpo/beta": 0.009529907256364822, "fcm_dpo/delta": 0.027420198544859886, "fcm_dpo/margin": 39.201202392578125, "fcm_dpo/q_t": 0.4137781858444214, "grad_norm": 20.927669525146484, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.5445564985275269, "logits/rejected": -0.5252886414527893, "logps/chosen": -97.65654754638672, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -157.09884643554688, "loss": 1.128, "margin_dpo/margin_mean": 39.201202392578125, "margin_dpo/margin_std": 61.48206329345703, "step": 127 }, { "KL/chosen_KL_mean": -43.87337875366211, "KL/mean": -64.43997192382812, "KL/rejected_KL_mean": -85.00656127929688, "KL/std": 57.35568618774414, "epoch": 0.18795888399412627, "fcm_dpo/beta": 0.009586036205291748, "fcm_dpo/delta": 0.005548093467950821, "fcm_dpo/margin": 41.13318634033203, "fcm_dpo/q_t": 0.41241586208343506, "grad_norm": 30.452770233154297, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.5140960812568665, "logits/rejected": -0.4917343258857727, "logps/chosen": -101.99433898925781, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -161.44552612304688, "loss": 1.1401, "margin_dpo/margin_mean": 41.13318634033203, "margin_dpo/margin_std": 71.68881225585938, "step": 128 }, { "KL/chosen_KL_mean": -53.54546356201172, "KL/mean": -75.27021789550781, "KL/rejected_KL_mean": -96.9949722290039, "KL/std": 63.507652282714844, "epoch": 0.1894273127753304, "fcm_dpo/beta": 0.009535422548651695, "fcm_dpo/delta": -0.014929811470210552, "fcm_dpo/margin": 43.44950866699219, "fcm_dpo/q_t": 0.41339975595474243, "grad_norm": 21.074941635131836, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.5416771769523621, "logits/rejected": -0.5308274626731873, "logps/chosen": -120.46183776855469, "logps/ref_chosen": -66.91637420654297, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -193.63717651367188, "loss": 1.1471, "margin_dpo/margin_mean": 43.44950866699219, "margin_dpo/margin_std": 82.56187438964844, "step": 129 }, { "KL/chosen_KL_mean": -39.77076721191406, "KL/mean": -72.65290069580078, "KL/rejected_KL_mean": -105.5350341796875, "KL/std": 65.77041625976562, "epoch": 0.19089574155653452, "fcm_dpo/beta": 0.009305297397077084, "fcm_dpo/delta": -0.2256622165441513, "fcm_dpo/margin": 65.76427459716797, "fcm_dpo/q_t": 0.36885032057762146, "grad_norm": 20.190004348754883, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.47879669070243835, "logits/rejected": -0.4725271463394165, "logps/chosen": -84.4376220703125, "logps/ref_chosen": -44.66685104370117, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -188.31668090820312, "loss": 0.9939, "margin_dpo/margin_mean": 65.76426696777344, "margin_dpo/margin_std": 80.05824279785156, "step": 130 }, { "KL/chosen_KL_mean": -35.21397399902344, "KL/mean": -67.79527282714844, "KL/rejected_KL_mean": -100.37657165527344, "KL/std": 64.40512084960938, "epoch": 0.19236417033773862, "fcm_dpo/beta": 0.00883854366838932, "fcm_dpo/delta": -0.18700018525123596, "fcm_dpo/margin": 65.16259002685547, "fcm_dpo/q_t": 0.36890602111816406, "grad_norm": 29.07965850830078, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.49699753522872925, "logits/rejected": -0.5037678480148315, "logps/chosen": -80.13856506347656, "logps/ref_chosen": -44.924591064453125, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -188.82058715820312, "loss": 0.9793, "margin_dpo/margin_mean": 65.16259002685547, "margin_dpo/margin_std": 70.59186553955078, "step": 131 }, { "KL/chosen_KL_mean": -47.16490173339844, "KL/mean": -74.09092712402344, "KL/rejected_KL_mean": -101.01695251464844, "KL/std": 64.85047912597656, "epoch": 0.19383259911894274, "fcm_dpo/beta": 0.008657930418848991, "fcm_dpo/delta": -0.06971244513988495, "fcm_dpo/margin": 53.85203552246094, "fcm_dpo/q_t": 0.40050774812698364, "grad_norm": 19.968883514404297, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.5425466299057007, "logits/rejected": -0.54119473695755, "logps/chosen": -106.16598510742188, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -188.9091033935547, "loss": 1.0858, "margin_dpo/margin_mean": 53.85203552246094, "margin_dpo/margin_std": 84.09346008300781, "step": 132 }, { "KL/chosen_KL_mean": -55.64082336425781, "KL/mean": -79.35708618164062, "KL/rejected_KL_mean": -103.07334899902344, "KL/std": 58.46957015991211, "epoch": 0.19530102790014683, "fcm_dpo/beta": 0.008616752922534943, "fcm_dpo/delta": -0.009173337370157242, "fcm_dpo/margin": 47.432525634765625, "fcm_dpo/q_t": 0.4102787375450134, "grad_norm": 25.95366096496582, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.5186644792556763, "logits/rejected": -0.5046030879020691, "logps/chosen": -122.24532318115234, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -199.40692138671875, "loss": 1.1096, "margin_dpo/margin_mean": 47.432525634765625, "margin_dpo/margin_std": 74.17190551757812, "step": 133 }, { "KL/chosen_KL_mean": -44.13493347167969, "KL/mean": -69.59406280517578, "KL/rejected_KL_mean": -95.05320739746094, "KL/std": 57.12010955810547, "epoch": 0.19676945668135096, "fcm_dpo/beta": 0.008605021983385086, "fcm_dpo/delta": -0.039981499314308167, "fcm_dpo/margin": 50.918270111083984, "fcm_dpo/q_t": 0.4016202986240387, "grad_norm": 19.049375534057617, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.47495368123054504, "logits/rejected": -0.45979058742523193, "logps/chosen": -96.20419311523438, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -182.70773315429688, "loss": 1.0667, "margin_dpo/margin_mean": 50.91827392578125, "margin_dpo/margin_std": 66.06755065917969, "step": 134 }, { "KL/chosen_KL_mean": -48.97834014892578, "KL/mean": -85.58039855957031, "KL/rejected_KL_mean": -122.18243408203125, "KL/std": 75.99504089355469, "epoch": 0.19823788546255505, "fcm_dpo/beta": 0.008282874710857868, "fcm_dpo/delta": -0.21973907947540283, "fcm_dpo/margin": 73.2041015625, "fcm_dpo/q_t": 0.37036457657814026, "grad_norm": 22.302026748657227, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.4845237731933594, "logits/rejected": -0.5190806984901428, "logps/chosen": -99.33219909667969, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -238.1621856689453, "loss": 0.993, "margin_dpo/margin_mean": 73.2041015625, "margin_dpo/margin_std": 91.65379333496094, "step": 135 }, { "KL/chosen_KL_mean": -58.40410614013672, "KL/mean": -82.88113403320312, "KL/rejected_KL_mean": -107.358154296875, "KL/std": 69.1063461303711, "epoch": 0.19970631424375918, "fcm_dpo/beta": 0.008144080638885498, "fcm_dpo/delta": 0.0006435923278331757, "fcm_dpo/margin": 48.954044342041016, "fcm_dpo/q_t": 0.4184736907482147, "grad_norm": 20.428884506225586, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.4917562007904053, "logits/rejected": -0.48317497968673706, "logps/chosen": -123.47661590576172, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -203.67938232421875, "loss": 1.1412, "margin_dpo/margin_mean": 48.95404052734375, "margin_dpo/margin_std": 88.62368774414062, "step": 136 }, { "KL/chosen_KL_mean": -56.22892761230469, "KL/mean": -94.69110107421875, "KL/rejected_KL_mean": -133.1532745361328, "KL/std": 91.96360778808594, "epoch": 0.2011747430249633, "fcm_dpo/beta": 0.007917901501059532, "fcm_dpo/delta": -0.22291553020477295, "fcm_dpo/margin": 76.92433166503906, "fcm_dpo/q_t": 0.37597256898880005, "grad_norm": 18.094011306762695, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.44450414180755615, "logits/rejected": -0.47170525789260864, "logps/chosen": -104.988037109375, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86376953125, "logps/rejected": -247.0170440673828, "loss": 1.0223, "margin_dpo/margin_mean": 76.9243392944336, "margin_dpo/margin_std": 109.81398010253906, "step": 137 }, { "KL/chosen_KL_mean": -58.91520690917969, "KL/mean": -87.9725341796875, "KL/rejected_KL_mean": -117.02986145019531, "KL/std": 70.14852905273438, "epoch": 0.2026431718061674, "fcm_dpo/beta": 0.007785219699144363, "fcm_dpo/delta": -0.054885830730199814, "fcm_dpo/margin": 58.114654541015625, "fcm_dpo/q_t": 0.3965170383453369, "grad_norm": 21.79376220703125, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.4243091940879822, "logits/rejected": -0.41211992502212524, "logps/chosen": -119.43486022949219, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -210.226806640625, "loss": 1.0558, "margin_dpo/margin_mean": 58.114654541015625, "margin_dpo/margin_std": 71.69284057617188, "step": 138 }, { "KL/chosen_KL_mean": -49.914390563964844, "KL/mean": -84.76184844970703, "KL/rejected_KL_mean": -119.60931396484375, "KL/std": 66.84112548828125, "epoch": 0.20411160058737152, "fcm_dpo/beta": 0.0076102884486317635, "fcm_dpo/delta": -0.13750019669532776, "fcm_dpo/margin": 69.69491577148438, "fcm_dpo/q_t": 0.382676362991333, "grad_norm": 18.322908401489258, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.44147640466690063, "logits/rejected": -0.4316656291484833, "logps/chosen": -96.80577087402344, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -199.33729553222656, "loss": 1.0187, "margin_dpo/margin_mean": 69.6949234008789, "margin_dpo/margin_std": 86.69469451904297, "step": 139 }, { "KL/chosen_KL_mean": -60.419158935546875, "KL/mean": -92.12008666992188, "KL/rejected_KL_mean": -123.82099914550781, "KL/std": 74.5600814819336, "epoch": 0.2055800293685756, "fcm_dpo/beta": 0.00744934706017375, "fcm_dpo/delta": -0.075960174202919, "fcm_dpo/margin": 63.40184783935547, "fcm_dpo/q_t": 0.3951931893825531, "grad_norm": 22.85267448425293, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.44698405265808105, "logits/rejected": -0.43398311734199524, "logps/chosen": -119.39387512207031, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28410339355469, "logps/rejected": -207.1051025390625, "loss": 1.0674, "margin_dpo/margin_mean": 63.40184783935547, "margin_dpo/margin_std": 89.21878051757812, "step": 140 }, { "KL/chosen_KL_mean": -68.77699279785156, "KL/mean": -98.60934448242188, "KL/rejected_KL_mean": -128.44168090820312, "KL/std": 82.50788879394531, "epoch": 0.20704845814977973, "fcm_dpo/beta": 0.007387247867882252, "fcm_dpo/delta": -0.04261501878499985, "fcm_dpo/margin": 59.66469955444336, "fcm_dpo/q_t": 0.4012778103351593, "grad_norm": 27.14394760131836, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.43428778648376465, "logits/rejected": -0.4219193756580353, "logps/chosen": -143.8526611328125, "logps/ref_chosen": -75.07566833496094, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -226.63394165039062, "loss": 1.1057, "margin_dpo/margin_mean": 59.664695739746094, "margin_dpo/margin_std": 95.55046844482422, "step": 141 }, { "KL/chosen_KL_mean": -70.15496826171875, "KL/mean": -104.54232025146484, "KL/rejected_KL_mean": -138.9296875, "KL/std": 90.10287475585938, "epoch": 0.20851688693098386, "fcm_dpo/beta": 0.007280835881829262, "fcm_dpo/delta": -0.10598242282867432, "fcm_dpo/margin": 68.77471923828125, "fcm_dpo/q_t": 0.39226555824279785, "grad_norm": 28.602947235107422, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.4074459671974182, "logits/rejected": -0.4063273072242737, "logps/chosen": -128.18289184570312, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222961425781, "logps/rejected": -233.51190185546875, "loss": 1.0853, "margin_dpo/margin_mean": 68.77471923828125, "margin_dpo/margin_std": 106.35022735595703, "step": 142 }, { "KL/chosen_KL_mean": -73.86743927001953, "KL/mean": -95.766845703125, "KL/rejected_KL_mean": -117.66624450683594, "KL/std": 79.29611206054688, "epoch": 0.20998531571218795, "fcm_dpo/beta": 0.007270464673638344, "fcm_dpo/delta": 0.08424904197454453, "fcm_dpo/margin": 43.79881286621094, "fcm_dpo/q_t": 0.43221598863601685, "grad_norm": 24.429250717163086, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.41874316334724426, "logits/rejected": -0.41285938024520874, "logps/chosen": -131.46389770507812, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -196.6658172607422, "loss": 1.1982, "margin_dpo/margin_mean": 43.79881286621094, "margin_dpo/margin_std": 92.73600769042969, "step": 143 }, { "KL/chosen_KL_mean": -64.49729919433594, "KL/mean": -91.391357421875, "KL/rejected_KL_mean": -118.28541564941406, "KL/std": 66.07506561279297, "epoch": 0.21145374449339208, "fcm_dpo/beta": 0.007330943364650011, "fcm_dpo/delta": 0.005903269629925489, "fcm_dpo/margin": 53.788116455078125, "fcm_dpo/q_t": 0.41074827313423157, "grad_norm": 20.96710205078125, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.42471039295196533, "logits/rejected": -0.41297537088394165, "logps/chosen": -124.40365600585938, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -200.28567504882812, "loss": 1.1061, "margin_dpo/margin_mean": 53.788116455078125, "margin_dpo/margin_std": 77.68215942382812, "step": 144 }, { "KL/chosen_KL_mean": -61.503116607666016, "KL/mean": -90.38899993896484, "KL/rejected_KL_mean": -119.27488708496094, "KL/std": 66.21023559570312, "epoch": 0.21292217327459617, "fcm_dpo/beta": 0.007309791631996632, "fcm_dpo/delta": -0.02330685406923294, "fcm_dpo/margin": 57.771766662597656, "fcm_dpo/q_t": 0.40421411395072937, "grad_norm": 24.383813858032227, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.4303405284881592, "logits/rejected": -0.41304582357406616, "logps/chosen": -118.1037826538086, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -197.14120483398438, "loss": 1.0875, "margin_dpo/margin_mean": 57.771766662597656, "margin_dpo/margin_std": 80.48458862304688, "step": 145 }, { "KL/chosen_KL_mean": -84.24576568603516, "KL/mean": -106.54432678222656, "KL/rejected_KL_mean": -128.8428955078125, "KL/std": 72.23680877685547, "epoch": 0.2143906020558003, "fcm_dpo/beta": 0.007366587873548269, "fcm_dpo/delta": 0.07394760102033615, "fcm_dpo/margin": 44.597129821777344, "fcm_dpo/q_t": 0.4255162477493286, "grad_norm": 27.966459274291992, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.4398476481437683, "logits/rejected": -0.4156040847301483, "logps/chosen": -150.2462158203125, "logps/ref_chosen": -66.00045013427734, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -210.54568481445312, "loss": 1.1839, "margin_dpo/margin_mean": 44.597129821777344, "margin_dpo/margin_std": 87.12408447265625, "step": 146 }, { "KL/chosen_KL_mean": -61.255210876464844, "KL/mean": -91.42532348632812, "KL/rejected_KL_mean": -121.59544372558594, "KL/std": 72.65022277832031, "epoch": 0.21585903083700442, "fcm_dpo/beta": 0.007349137216806412, "fcm_dpo/delta": -0.045649539679288864, "fcm_dpo/margin": 60.3402214050293, "fcm_dpo/q_t": 0.40209323167800903, "grad_norm": 19.574729919433594, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.4582536816596985, "logits/rejected": -0.43836987018585205, "logps/chosen": -114.66069793701172, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39060974121094, "logps/rejected": -192.98605346679688, "loss": 1.09, "margin_dpo/margin_mean": 60.34022521972656, "margin_dpo/margin_std": 90.08858489990234, "step": 147 }, { "KL/chosen_KL_mean": -59.720096588134766, "KL/mean": -85.48670959472656, "KL/rejected_KL_mean": -111.25332641601562, "KL/std": 71.65913391113281, "epoch": 0.2173274596182085, "fcm_dpo/beta": 0.007287460379302502, "fcm_dpo/delta": -0.0821787416934967, "fcm_dpo/margin": 51.533233642578125, "fcm_dpo/q_t": 0.4167160391807556, "grad_norm": 19.111360549926758, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.5154159665107727, "logits/rejected": -0.5085688829421997, "logps/chosen": -124.65718078613281, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -214.34716796875, "loss": 1.1251, "margin_dpo/margin_mean": 51.533233642578125, "margin_dpo/margin_std": 75.76239776611328, "step": 148 }, { "KL/chosen_KL_mean": -56.87557601928711, "KL/mean": -89.17583465576172, "KL/rejected_KL_mean": -121.4760971069336, "KL/std": 65.12533569335938, "epoch": 0.21879588839941264, "fcm_dpo/beta": 0.007102725096046925, "fcm_dpo/delta": -0.0634830892086029, "fcm_dpo/margin": 64.60050964355469, "fcm_dpo/q_t": 0.39505213499069214, "grad_norm": 17.82282257080078, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.49969860911369324, "logits/rejected": -0.4846518933773041, "logps/chosen": -115.34933471679688, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -220.79083251953125, "loss": 1.0534, "margin_dpo/margin_mean": 64.60050964355469, "margin_dpo/margin_std": 77.06610870361328, "step": 149 }, { "KL/chosen_KL_mean": -50.41044235229492, "KL/mean": -81.95865631103516, "KL/rejected_KL_mean": -113.50686645507812, "KL/std": 76.46592712402344, "epoch": 0.22026431718061673, "fcm_dpo/beta": 0.007035818882286549, "fcm_dpo/delta": -0.04736195132136345, "fcm_dpo/margin": 63.096431732177734, "fcm_dpo/q_t": 0.40432196855545044, "grad_norm": 18.326509475708008, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.45320773124694824, "logits/rejected": -0.456167995929718, "logps/chosen": -96.11625671386719, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -196.85446166992188, "loss": 1.0814, "margin_dpo/margin_mean": 63.096431732177734, "margin_dpo/margin_std": 91.71922302246094, "step": 150 }, { "KL/chosen_KL_mean": -62.595176696777344, "KL/mean": -94.06363677978516, "KL/rejected_KL_mean": -125.53208923339844, "KL/std": 71.80149841308594, "epoch": 0.22173274596182085, "fcm_dpo/beta": 0.007020828314125538, "fcm_dpo/delta": -0.04382166266441345, "fcm_dpo/margin": 62.936920166015625, "fcm_dpo/q_t": 0.3986474275588989, "grad_norm": 21.404926300048828, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.5180387496948242, "logits/rejected": -0.4879586100578308, "logps/chosen": -133.166015625, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -225.99591064453125, "loss": 1.0626, "margin_dpo/margin_mean": 62.936920166015625, "margin_dpo/margin_std": 78.73812866210938, "step": 151 }, { "KL/chosen_KL_mean": -54.41718292236328, "KL/mean": -92.68287658691406, "KL/rejected_KL_mean": -130.9485626220703, "KL/std": 73.81169128417969, "epoch": 0.22320117474302498, "fcm_dpo/beta": 0.006885044276714325, "fcm_dpo/delta": -0.13378563523292542, "fcm_dpo/margin": 76.53138732910156, "fcm_dpo/q_t": 0.3824828267097473, "grad_norm": 20.40810203552246, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.4824361205101013, "logits/rejected": -0.47918662428855896, "logps/chosen": -114.58157348632812, "logps/ref_chosen": -60.16438674926758, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -237.08901977539062, "loss": 1.017, "margin_dpo/margin_mean": 76.53138732910156, "margin_dpo/margin_std": 90.15208435058594, "step": 152 }, { "KL/chosen_KL_mean": -55.965492248535156, "KL/mean": -88.55425262451172, "KL/rejected_KL_mean": -121.14301300048828, "KL/std": 80.69013977050781, "epoch": 0.22466960352422907, "fcm_dpo/beta": 0.006790122948586941, "fcm_dpo/delta": -0.044522788375616074, "fcm_dpo/margin": 65.17752075195312, "fcm_dpo/q_t": 0.40387213230133057, "grad_norm": 15.532352447509766, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.49872875213623047, "logits/rejected": -0.4938894510269165, "logps/chosen": -112.28076934814453, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -206.79884338378906, "loss": 1.0909, "margin_dpo/margin_mean": 65.17752075195312, "margin_dpo/margin_std": 99.14810180664062, "step": 153 }, { "KL/chosen_KL_mean": -66.57502746582031, "KL/mean": -97.78536987304688, "KL/rejected_KL_mean": -128.99571228027344, "KL/std": 80.61070251464844, "epoch": 0.2261380323054332, "fcm_dpo/beta": 0.006765860132873058, "fcm_dpo/delta": -0.023557795211672783, "fcm_dpo/margin": 62.42070007324219, "fcm_dpo/q_t": 0.4070885479450226, "grad_norm": 18.9123592376709, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.4788493514060974, "logits/rejected": -0.48109960556030273, "logps/chosen": -129.31759643554688, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -233.2399139404297, "loss": 1.1206, "margin_dpo/margin_mean": 62.42070388793945, "margin_dpo/margin_std": 102.98345947265625, "step": 154 }, { "KL/chosen_KL_mean": -61.065223693847656, "KL/mean": -92.69122314453125, "KL/rejected_KL_mean": -124.31721496582031, "KL/std": 75.25825500488281, "epoch": 0.2276064610866373, "fcm_dpo/beta": 0.006722897756844759, "fcm_dpo/delta": -0.026448355987668037, "fcm_dpo/margin": 63.251991271972656, "fcm_dpo/q_t": 0.4044472575187683, "grad_norm": 19.345184326171875, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.4656848907470703, "logits/rejected": -0.44397997856140137, "logps/chosen": -121.7184066772461, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -201.80941772460938, "loss": 1.0946, "margin_dpo/margin_mean": 63.25199890136719, "margin_dpo/margin_std": 93.20378112792969, "step": 155 }, { "KL/chosen_KL_mean": -85.85633850097656, "KL/mean": -104.20632934570312, "KL/rejected_KL_mean": -122.55632019042969, "KL/std": 80.0552978515625, "epoch": 0.2290748898678414, "fcm_dpo/beta": 0.0067663900554180145, "fcm_dpo/delta": 0.05330243334174156, "fcm_dpo/margin": 36.699989318847656, "fcm_dpo/q_t": 0.44376257061958313, "grad_norm": 29.024635314941406, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.41135138273239136, "logits/rejected": -0.40221792459487915, "logps/chosen": -155.34820556640625, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.16929626464844, "logps/rejected": -199.72561645507812, "loss": 1.2725, "margin_dpo/margin_mean": 36.699989318847656, "margin_dpo/margin_std": 105.47511291503906, "step": 156 }, { "KL/chosen_KL_mean": -78.11371612548828, "KL/mean": -118.10308837890625, "KL/rejected_KL_mean": -158.09246826171875, "KL/std": 87.00740051269531, "epoch": 0.2305433186490455, "fcm_dpo/beta": 0.0065932744182646275, "fcm_dpo/delta": -0.13586004078388214, "fcm_dpo/margin": 79.97874450683594, "fcm_dpo/q_t": 0.37946271896362305, "grad_norm": 23.550922393798828, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.40428751707077026, "logits/rejected": -0.4115862250328064, "logps/chosen": -139.48214721679688, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -265.73883056640625, "loss": 1.0299, "margin_dpo/margin_mean": 79.97874450683594, "margin_dpo/margin_std": 98.83998107910156, "step": 157 }, { "KL/chosen_KL_mean": -78.0158462524414, "KL/mean": -120.59271240234375, "KL/rejected_KL_mean": -163.16958618164062, "KL/std": 107.82058715820312, "epoch": 0.23201174743024963, "fcm_dpo/beta": 0.006434428971260786, "fcm_dpo/delta": -0.15647649765014648, "fcm_dpo/margin": 85.15372467041016, "fcm_dpo/q_t": 0.38724130392074585, "grad_norm": 20.167179107666016, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.3908860683441162, "logits/rejected": -0.40809518098831177, "logps/chosen": -135.62876892089844, "logps/ref_chosen": -57.612918853759766, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -276.8642578125, "loss": 1.0558, "margin_dpo/margin_mean": 85.15372467041016, "margin_dpo/margin_std": 129.78738403320312, "step": 158 }, { "KL/chosen_KL_mean": -88.34893798828125, "KL/mean": -117.44178771972656, "KL/rejected_KL_mean": -146.5346221923828, "KL/std": 95.12521362304688, "epoch": 0.23348017621145375, "fcm_dpo/beta": 0.006435505114495754, "fcm_dpo/delta": 0.026020796969532967, "fcm_dpo/margin": 58.185665130615234, "fcm_dpo/q_t": 0.4166509807109833, "grad_norm": 22.233327865600586, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.3991559147834778, "logits/rejected": -0.36633995175361633, "logps/chosen": -169.90928649902344, "logps/ref_chosen": -81.56034851074219, "logps/ref_rejected": -88.89871215820312, "logps/rejected": -235.43331909179688, "loss": 1.1488, "margin_dpo/margin_mean": 58.1856689453125, "margin_dpo/margin_std": 102.5018310546875, "step": 159 }, { "KL/chosen_KL_mean": -90.17693328857422, "KL/mean": -129.38868713378906, "KL/rejected_KL_mean": -168.6004638671875, "KL/std": 101.24613952636719, "epoch": 0.23494860499265785, "fcm_dpo/beta": 0.00629377830773592, "fcm_dpo/delta": -0.09926701337099075, "fcm_dpo/margin": 78.42352294921875, "fcm_dpo/q_t": 0.3964502215385437, "grad_norm": 23.24005126953125, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.3764195144176483, "logits/rejected": -0.3670395612716675, "logps/chosen": -155.90780639648438, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -265.8182678222656, "loss": 1.0906, "margin_dpo/margin_mean": 78.42352294921875, "margin_dpo/margin_std": 126.56271362304688, "step": 160 }, { "KL/chosen_KL_mean": -76.84190368652344, "KL/mean": -111.3232650756836, "KL/rejected_KL_mean": -145.80462646484375, "KL/std": 80.283203125, "epoch": 0.23641703377386197, "fcm_dpo/beta": 0.006256973836570978, "fcm_dpo/delta": -0.03292801231145859, "fcm_dpo/margin": 68.96272277832031, "fcm_dpo/q_t": 0.40471774339675903, "grad_norm": 21.411426544189453, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.34909725189208984, "logits/rejected": -0.34455615282058716, "logps/chosen": -129.27838134765625, "logps/ref_chosen": -52.43647003173828, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -229.23558044433594, "loss": 1.0934, "margin_dpo/margin_mean": 68.96272277832031, "margin_dpo/margin_std": 102.0528564453125, "step": 161 }, { "KL/chosen_KL_mean": -74.41575622558594, "KL/mean": -108.84408569335938, "KL/rejected_KL_mean": -143.27243041992188, "KL/std": 89.80424499511719, "epoch": 0.23788546255506607, "fcm_dpo/beta": 0.006169519387185574, "fcm_dpo/delta": -0.02764543890953064, "fcm_dpo/margin": 68.85668182373047, "fcm_dpo/q_t": 0.4066374897956848, "grad_norm": 22.422332763671875, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.44204244017601013, "logits/rejected": -0.4256909489631653, "logps/chosen": -137.02633666992188, "logps/ref_chosen": -62.6105842590332, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -232.6630096435547, "loss": 1.1078, "margin_dpo/margin_mean": 68.85668182373047, "margin_dpo/margin_std": 106.42919158935547, "step": 162 }, { "KL/chosen_KL_mean": -83.02301025390625, "KL/mean": -116.34445190429688, "KL/rejected_KL_mean": -149.6658935546875, "KL/std": 89.37168884277344, "epoch": 0.2393538913362702, "fcm_dpo/beta": 0.006201374344527721, "fcm_dpo/delta": -0.013848692178726196, "fcm_dpo/margin": 66.64288330078125, "fcm_dpo/q_t": 0.40968990325927734, "grad_norm": 20.55537223815918, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.37833207845687866, "logits/rejected": -0.3598299026489258, "logps/chosen": -138.04464721679688, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.418212890625, "logps/rejected": -225.0841064453125, "loss": 1.1151, "margin_dpo/margin_mean": 66.64288330078125, "margin_dpo/margin_std": 107.60564422607422, "step": 163 }, { "KL/chosen_KL_mean": -75.54098510742188, "KL/mean": -116.61768341064453, "KL/rejected_KL_mean": -157.69436645507812, "KL/std": 87.67422485351562, "epoch": 0.24082232011747431, "fcm_dpo/beta": 0.006058148108422756, "fcm_dpo/delta": -0.10455699265003204, "fcm_dpo/margin": 82.15338134765625, "fcm_dpo/q_t": 0.38800323009490967, "grad_norm": 23.66642189025879, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.38808923959732056, "logits/rejected": -0.3748926520347595, "logps/chosen": -131.18165588378906, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -237.3590087890625, "loss": 1.0355, "margin_dpo/margin_mean": 82.15338134765625, "margin_dpo/margin_std": 101.20204162597656, "step": 164 }, { "KL/chosen_KL_mean": -81.94873046875, "KL/mean": -108.24546813964844, "KL/rejected_KL_mean": -134.54220581054688, "KL/std": 74.16279602050781, "epoch": 0.2422907488986784, "fcm_dpo/beta": 0.006125118583440781, "fcm_dpo/delta": 0.08051982522010803, "fcm_dpo/margin": 52.59346008300781, "fcm_dpo/q_t": 0.42784789204597473, "grad_norm": 24.392139434814453, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.3788298964500427, "logits/rejected": -0.3494594097137451, "logps/chosen": -143.25941467285156, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -208.21279907226562, "loss": 1.1772, "margin_dpo/margin_mean": 52.59346008300781, "margin_dpo/margin_std": 99.10749053955078, "step": 165 }, { "KL/chosen_KL_mean": -72.17887115478516, "KL/mean": -121.31666564941406, "KL/rejected_KL_mean": -170.4544677734375, "KL/std": 108.35174560546875, "epoch": 0.24375917767988253, "fcm_dpo/beta": 0.00596030056476593, "fcm_dpo/delta": -0.19864240288734436, "fcm_dpo/margin": 98.27557373046875, "fcm_dpo/q_t": 0.3804228901863098, "grad_norm": 17.77814292907715, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.3884061574935913, "logits/rejected": -0.39925825595855713, "logps/chosen": -123.1624755859375, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -268.5495910644531, "loss": 1.0179, "margin_dpo/margin_mean": 98.27557373046875, "margin_dpo/margin_std": 137.25851440429688, "step": 166 }, { "KL/chosen_KL_mean": -73.13114929199219, "KL/mean": -124.726806640625, "KL/rejected_KL_mean": -176.3224639892578, "KL/std": 98.56492614746094, "epoch": 0.24522760646108663, "fcm_dpo/beta": 0.0057468172162771225, "fcm_dpo/delta": -0.20526599884033203, "fcm_dpo/margin": 103.19131469726562, "fcm_dpo/q_t": 0.36781153082847595, "grad_norm": 18.200578689575195, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.36232346296310425, "logits/rejected": -0.36501890420913696, "logps/chosen": -123.55524444580078, "logps/ref_chosen": -50.424095153808594, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -272.3529052734375, "loss": 0.9736, "margin_dpo/margin_mean": 103.19131469726562, "margin_dpo/margin_std": 114.00679016113281, "step": 167 }, { "KL/chosen_KL_mean": -79.71585083007812, "KL/mean": -117.56804656982422, "KL/rejected_KL_mean": -155.42022705078125, "KL/std": 91.46189880371094, "epoch": 0.24669603524229075, "fcm_dpo/beta": 0.005651239771395922, "fcm_dpo/delta": -0.029224606230854988, "fcm_dpo/margin": 75.70437622070312, "fcm_dpo/q_t": 0.40402811765670776, "grad_norm": 19.609725952148438, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.3544921278953552, "logits/rejected": -0.3437988758087158, "logps/chosen": -129.17868041992188, "logps/ref_chosen": -49.462825775146484, "logps/ref_rejected": -75.30855560302734, "logps/rejected": -230.72879028320312, "loss": 1.0797, "margin_dpo/margin_mean": 75.70437622070312, "margin_dpo/margin_std": 102.8135986328125, "step": 168 }, { "KL/chosen_KL_mean": -82.12629699707031, "KL/mean": -115.36107635498047, "KL/rejected_KL_mean": -148.59585571289062, "KL/std": 91.35723876953125, "epoch": 0.24816446402349487, "fcm_dpo/beta": 0.005572349298745394, "fcm_dpo/delta": -0.07357925921678543, "fcm_dpo/margin": 66.46955108642578, "fcm_dpo/q_t": 0.42019665241241455, "grad_norm": 19.77760124206543, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.3844687044620514, "logits/rejected": -0.36855146288871765, "logps/chosen": -141.92974853515625, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -231.94158935546875, "loss": 1.1611, "margin_dpo/margin_mean": 66.46955108642578, "margin_dpo/margin_std": 125.47772216796875, "step": 169 }, { "KL/chosen_KL_mean": -72.95841979980469, "KL/mean": -112.25492095947266, "KL/rejected_KL_mean": -151.55142211914062, "KL/std": 86.3768310546875, "epoch": 0.24963289280469897, "fcm_dpo/beta": 0.005510912276804447, "fcm_dpo/delta": -0.03551424294710159, "fcm_dpo/margin": 78.5929946899414, "fcm_dpo/q_t": 0.4006960988044739, "grad_norm": 18.546783447265625, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.3328009247779846, "logits/rejected": -0.321723610162735, "logps/chosen": -122.4301986694336, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -227.46875, "loss": 1.077, "margin_dpo/margin_mean": 78.5929946899414, "margin_dpo/margin_std": 103.26647186279297, "step": 170 }, { "KL/chosen_KL_mean": -106.71308135986328, "KL/mean": -138.3514404296875, "KL/rejected_KL_mean": -169.98980712890625, "KL/std": 100.98884582519531, "epoch": 0.2511013215859031, "fcm_dpo/beta": 0.005571361631155014, "fcm_dpo/delta": 0.04921392351388931, "fcm_dpo/margin": 63.276710510253906, "fcm_dpo/q_t": 0.425686776638031, "grad_norm": 27.314945220947266, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.33173030614852905, "logits/rejected": -0.3102639317512512, "logps/chosen": -191.21240234375, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -279.37188720703125, "loss": 1.1782, "margin_dpo/margin_mean": 63.276710510253906, "margin_dpo/margin_std": 130.07742309570312, "step": 171 }, { "KL/chosen_KL_mean": -93.982177734375, "KL/mean": -127.27529907226562, "KL/rejected_KL_mean": -160.56842041015625, "KL/std": 97.7683334350586, "epoch": 0.2525697503671072, "fcm_dpo/beta": 0.005604305304586887, "fcm_dpo/delta": 0.027874935418367386, "fcm_dpo/margin": 66.58624267578125, "fcm_dpo/q_t": 0.4164371192455292, "grad_norm": 19.571752548217773, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.3909180760383606, "logits/rejected": -0.3718334138393402, "logps/chosen": -162.6361083984375, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -246.00509643554688, "loss": 1.1342, "margin_dpo/margin_mean": 66.58624267578125, "margin_dpo/margin_std": 109.93568420410156, "step": 172 }, { "KL/chosen_KL_mean": -85.7948989868164, "KL/mean": -119.3344497680664, "KL/rejected_KL_mean": -152.87399291992188, "KL/std": 91.15321350097656, "epoch": 0.2540381791483113, "fcm_dpo/beta": 0.00565545866265893, "fcm_dpo/delta": 0.021153416484594345, "fcm_dpo/margin": 67.0791015625, "fcm_dpo/q_t": 0.4138815701007843, "grad_norm": 19.52805519104004, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.38386523723602295, "logits/rejected": -0.353518009185791, "logps/chosen": -148.8457794189453, "logps/ref_chosen": -63.050880432128906, "logps/ref_rejected": -78.68392181396484, "logps/rejected": -231.55792236328125, "loss": 1.1093, "margin_dpo/margin_mean": 67.0791015625, "margin_dpo/margin_std": 95.22209167480469, "step": 173 }, { "KL/chosen_KL_mean": -82.10379791259766, "KL/mean": -121.74862670898438, "KL/rejected_KL_mean": -161.3934783935547, "KL/std": 96.38128662109375, "epoch": 0.2555066079295154, "fcm_dpo/beta": 0.005634433589875698, "fcm_dpo/delta": -0.04934954643249512, "fcm_dpo/margin": 79.28968048095703, "fcm_dpo/q_t": 0.4018397331237793, "grad_norm": 26.33484649658203, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.32500776648521423, "logits/rejected": -0.3427908718585968, "logps/chosen": -135.46676635742188, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -263.3046875, "loss": 1.0871, "margin_dpo/margin_mean": 79.28968048095703, "margin_dpo/margin_std": 115.50275421142578, "step": 174 }, { "KL/chosen_KL_mean": -75.51565551757812, "KL/mean": -129.7423095703125, "KL/rejected_KL_mean": -183.96896362304688, "KL/std": 102.85943603515625, "epoch": 0.25697503671071953, "fcm_dpo/beta": 0.00541552621871233, "fcm_dpo/delta": -0.19940567016601562, "fcm_dpo/margin": 108.45330810546875, "fcm_dpo/q_t": 0.3663579821586609, "grad_norm": 27.85973358154297, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.32464098930358887, "logits/rejected": -0.31119775772094727, "logps/chosen": -120.93342590332031, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -273.47479248046875, "loss": 0.9587, "margin_dpo/margin_mean": 108.45330810546875, "margin_dpo/margin_std": 108.14402770996094, "step": 175 }, { "KL/chosen_KL_mean": -81.18791198730469, "KL/mean": -126.93669891357422, "KL/rejected_KL_mean": -172.68548583984375, "KL/std": 102.40462493896484, "epoch": 0.25844346549192365, "fcm_dpo/beta": 0.00530798826366663, "fcm_dpo/delta": -0.08994344621896744, "fcm_dpo/margin": 91.49755859375, "fcm_dpo/q_t": 0.3935086727142334, "grad_norm": 21.088232040405273, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.3157244324684143, "logits/rejected": -0.3328602910041809, "logps/chosen": -131.6407470703125, "logps/ref_chosen": -50.452842712402344, "logps/ref_rejected": -95.5589599609375, "logps/rejected": -268.24444580078125, "loss": 1.0459, "margin_dpo/margin_mean": 91.49755859375, "margin_dpo/margin_std": 118.77714538574219, "step": 176 }, { "KL/chosen_KL_mean": -93.40528869628906, "KL/mean": -138.24319458007812, "KL/rejected_KL_mean": -183.0811309814453, "KL/std": 109.86962127685547, "epoch": 0.2599118942731278, "fcm_dpo/beta": 0.005232410505414009, "fcm_dpo/delta": -0.07259676605463028, "fcm_dpo/margin": 89.67583465576172, "fcm_dpo/q_t": 0.3961235284805298, "grad_norm": 26.937484741210938, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.3570261597633362, "logits/rejected": -0.3483562469482422, "logps/chosen": -154.6217498779297, "logps/ref_chosen": -61.216468811035156, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -278.97491455078125, "loss": 1.0546, "margin_dpo/margin_mean": 89.67582702636719, "margin_dpo/margin_std": 117.8981704711914, "step": 177 }, { "KL/chosen_KL_mean": -102.56449890136719, "KL/mean": -159.57308959960938, "KL/rejected_KL_mean": -216.5816650390625, "KL/std": 129.33131408691406, "epoch": 0.26138032305433184, "fcm_dpo/beta": 0.005035985726863146, "fcm_dpo/delta": -0.18584051728248596, "fcm_dpo/margin": 114.01716613769531, "fcm_dpo/q_t": 0.37551695108413696, "grad_norm": 24.992889404296875, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.2407420575618744, "logits/rejected": -0.23280589282512665, "logps/chosen": -160.82928466796875, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.3653335571289, "logps/rejected": -321.947021484375, "loss": 0.9997, "margin_dpo/margin_mean": 114.01716613769531, "margin_dpo/margin_std": 141.47775268554688, "step": 178 }, { "KL/chosen_KL_mean": -103.99703979492188, "KL/mean": -144.8743438720703, "KL/rejected_KL_mean": -185.75167846679688, "KL/std": 110.77649688720703, "epoch": 0.26284875183553597, "fcm_dpo/beta": 0.004978477954864502, "fcm_dpo/delta": -0.007790856063365936, "fcm_dpo/margin": 81.7546157836914, "fcm_dpo/q_t": 0.4102519154548645, "grad_norm": 26.21021270751953, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.2729560434818268, "logits/rejected": -0.2793646454811096, "logps/chosen": -165.05535888671875, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -276.27947998046875, "loss": 1.1302, "margin_dpo/margin_mean": 81.75462341308594, "margin_dpo/margin_std": 138.115966796875, "step": 179 }, { "KL/chosen_KL_mean": -89.33973693847656, "KL/mean": -141.10313415527344, "KL/rejected_KL_mean": -192.86651611328125, "KL/std": 98.31314086914062, "epoch": 0.2643171806167401, "fcm_dpo/beta": 0.004909820854663849, "fcm_dpo/delta": -0.1142577975988388, "fcm_dpo/margin": 103.52678680419922, "fcm_dpo/q_t": 0.38421761989593506, "grad_norm": 17.1671142578125, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.3321910500526428, "logits/rejected": -0.32747143507003784, "logps/chosen": -143.68246459960938, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -291.078369140625, "loss": 1.0234, "margin_dpo/margin_mean": 103.52677917480469, "margin_dpo/margin_std": 122.91793060302734, "step": 180 }, { "KL/chosen_KL_mean": -77.30790710449219, "KL/mean": -109.57101440429688, "KL/rejected_KL_mean": -141.83412170410156, "KL/std": 90.15087890625, "epoch": 0.2657856093979442, "fcm_dpo/beta": 0.004953712224960327, "fcm_dpo/delta": 0.08287452906370163, "fcm_dpo/margin": 64.52621459960938, "fcm_dpo/q_t": 0.4273186922073364, "grad_norm": 17.40216064453125, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.3179657459259033, "logits/rejected": -0.290554940700531, "logps/chosen": -132.30836486816406, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -203.49029541015625, "loss": 1.1666, "margin_dpo/margin_mean": 64.52621459960938, "margin_dpo/margin_std": 114.01302337646484, "step": 181 }, { "KL/chosen_KL_mean": -73.5950927734375, "KL/mean": -129.3390350341797, "KL/rejected_KL_mean": -185.08297729492188, "KL/std": 109.08509063720703, "epoch": 0.26725403817914833, "fcm_dpo/beta": 0.004840575158596039, "fcm_dpo/delta": -0.1485544741153717, "fcm_dpo/margin": 111.48786926269531, "fcm_dpo/q_t": 0.37746167182922363, "grad_norm": 17.18657112121582, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.30293479561805725, "logits/rejected": -0.3121221959590912, "logps/chosen": -114.70294952392578, "logps/ref_chosen": -41.107852935791016, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -274.60455322265625, "loss": 1.0114, "margin_dpo/margin_mean": 111.48786926269531, "margin_dpo/margin_std": 131.83238220214844, "step": 182 }, { "KL/chosen_KL_mean": -110.05368041992188, "KL/mean": -140.24612426757812, "KL/rejected_KL_mean": -170.43858337402344, "KL/std": 90.4064712524414, "epoch": 0.2687224669603524, "fcm_dpo/beta": 0.004783437587320805, "fcm_dpo/delta": -0.041149888187646866, "fcm_dpo/margin": 60.38490295410156, "fcm_dpo/q_t": 0.43268686532974243, "grad_norm": 18.724519729614258, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.31450870633125305, "logits/rejected": -0.30454862117767334, "logps/chosen": -167.5782470703125, "logps/ref_chosen": -57.52456283569336, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -246.414306640625, "loss": 1.1804, "margin_dpo/margin_mean": 60.38490295410156, "margin_dpo/margin_std": 99.7965087890625, "step": 183 }, { "KL/chosen_KL_mean": -93.04067993164062, "KL/mean": -124.70458984375, "KL/rejected_KL_mean": -156.3684844970703, "KL/std": 85.67855834960938, "epoch": 0.2701908957415565, "fcm_dpo/beta": 0.00478300591930747, "fcm_dpo/delta": -0.0009023167076520622, "fcm_dpo/margin": 63.327796936035156, "fcm_dpo/q_t": 0.43101605772972107, "grad_norm": 19.71858024597168, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.34772413969039917, "logits/rejected": -0.337999165058136, "logps/chosen": -151.58563232421875, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -233.0025634765625, "loss": 1.1707, "margin_dpo/margin_mean": 63.32780075073242, "margin_dpo/margin_std": 107.8141098022461, "step": 184 }, { "KL/chosen_KL_mean": -101.89752960205078, "KL/mean": -127.2352523803711, "KL/rejected_KL_mean": -152.57298278808594, "KL/std": 100.57600402832031, "epoch": 0.27165932452276065, "fcm_dpo/beta": 0.004893806297332048, "fcm_dpo/delta": 0.1559191346168518, "fcm_dpo/margin": 50.67546081542969, "fcm_dpo/q_t": 0.446666955947876, "grad_norm": 19.57623291015625, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.31452715396881104, "logits/rejected": -0.29128819704055786, "logps/chosen": -163.92337036132812, "logps/ref_chosen": -62.025848388671875, "logps/ref_rejected": -73.7625961303711, "logps/rejected": -226.3355712890625, "loss": 1.2331, "margin_dpo/margin_mean": 50.67546081542969, "margin_dpo/margin_std": 120.71915435791016, "step": 185 }, { "KL/chosen_KL_mean": -93.28840637207031, "KL/mean": -141.59420776367188, "KL/rejected_KL_mean": -189.89999389648438, "KL/std": 101.8707046508789, "epoch": 0.27312775330396477, "fcm_dpo/beta": 0.004873909987509251, "fcm_dpo/delta": -0.0744955912232399, "fcm_dpo/margin": 96.61160278320312, "fcm_dpo/q_t": 0.3932849168777466, "grad_norm": 28.687524795532227, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.36662542819976807, "logits/rejected": -0.3481537103652954, "logps/chosen": -162.64187622070312, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -277.97247314453125, "loss": 1.043, "margin_dpo/margin_mean": 96.6115951538086, "margin_dpo/margin_std": 116.60765075683594, "step": 186 }, { "KL/chosen_KL_mean": -87.90306091308594, "KL/mean": -127.93620300292969, "KL/rejected_KL_mean": -167.9693603515625, "KL/std": 96.8105697631836, "epoch": 0.2745961820851689, "fcm_dpo/beta": 0.004888523370027542, "fcm_dpo/delta": 0.008520994335412979, "fcm_dpo/margin": 80.06629943847656, "fcm_dpo/q_t": 0.41026172041893005, "grad_norm": 22.329120635986328, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.3108750581741333, "logits/rejected": -0.3043569326400757, "logps/chosen": -140.65951538085938, "logps/ref_chosen": -52.7564582824707, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -249.93846130371094, "loss": 1.0963, "margin_dpo/margin_mean": 80.06629943847656, "margin_dpo/margin_std": 105.62159729003906, "step": 187 }, { "KL/chosen_KL_mean": -83.09587097167969, "KL/mean": -132.18377685546875, "KL/rejected_KL_mean": -181.27166748046875, "KL/std": 108.80839538574219, "epoch": 0.27606461086637296, "fcm_dpo/beta": 0.004818486049771309, "fcm_dpo/delta": -0.07663469016551971, "fcm_dpo/margin": 98.17579650878906, "fcm_dpo/q_t": 0.3935472369194031, "grad_norm": 30.24648094177246, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.29885581135749817, "logits/rejected": -0.3061617612838745, "logps/chosen": -132.5113525390625, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -270.81207275390625, "loss": 1.0432, "margin_dpo/margin_mean": 98.17579650878906, "margin_dpo/margin_std": 119.95960998535156, "step": 188 }, { "KL/chosen_KL_mean": -98.87287902832031, "KL/mean": -138.66372680664062, "KL/rejected_KL_mean": -178.45455932617188, "KL/std": 109.66006469726562, "epoch": 0.2775330396475771, "fcm_dpo/beta": 0.004812294617295265, "fcm_dpo/delta": 0.017675260081887245, "fcm_dpo/margin": 79.58168029785156, "fcm_dpo/q_t": 0.41616469621658325, "grad_norm": 26.0447998046875, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.3316497206687927, "logits/rejected": -0.3185557723045349, "logps/chosen": -151.27183532714844, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -250.62191772460938, "loss": 1.1307, "margin_dpo/margin_mean": 79.58168029785156, "margin_dpo/margin_std": 134.16441345214844, "step": 189 }, { "KL/chosen_KL_mean": -106.78004455566406, "KL/mean": -154.5462646484375, "KL/rejected_KL_mean": -202.3125, "KL/std": 118.31991577148438, "epoch": 0.2790014684287812, "fcm_dpo/beta": 0.004810405895113945, "fcm_dpo/delta": -0.0634998232126236, "fcm_dpo/margin": 95.53245544433594, "fcm_dpo/q_t": 0.39865702390670776, "grad_norm": 18.914052963256836, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.3244064450263977, "logits/rejected": -0.31318405270576477, "logps/chosen": -171.46310424804688, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -304.863037109375, "loss": 1.0853, "margin_dpo/margin_mean": 95.53245544433594, "margin_dpo/margin_std": 137.739990234375, "step": 190 }, { "KL/chosen_KL_mean": -99.22306823730469, "KL/mean": -168.31130981445312, "KL/rejected_KL_mean": -237.39950561523438, "KL/std": 138.23770141601562, "epoch": 0.28046989720998533, "fcm_dpo/beta": 0.004560886882245541, "fcm_dpo/delta": -0.24714502692222595, "fcm_dpo/margin": 138.17645263671875, "fcm_dpo/q_t": 0.3622073531150818, "grad_norm": 21.69601058959961, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.3293009102344513, "logits/rejected": -0.30673933029174805, "logps/chosen": -167.88194274902344, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -347.5391845703125, "loss": 0.955, "margin_dpo/margin_mean": 138.17645263671875, "margin_dpo/margin_std": 153.68263244628906, "step": 191 }, { "KL/chosen_KL_mean": -127.37928009033203, "KL/mean": -173.00198364257812, "KL/rejected_KL_mean": -218.62469482421875, "KL/std": 119.46993255615234, "epoch": 0.28193832599118945, "fcm_dpo/beta": 0.0045256055891513824, "fcm_dpo/delta": -0.013612883165478706, "fcm_dpo/margin": 91.24540710449219, "fcm_dpo/q_t": 0.4095039367675781, "grad_norm": 25.06287956237793, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.29608154296875, "logits/rejected": -0.2969720959663391, "logps/chosen": -197.106201171875, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -321.946044921875, "loss": 1.1165, "margin_dpo/margin_mean": 91.24540710449219, "margin_dpo/margin_std": 148.260009765625, "step": 192 }, { "KL/chosen_KL_mean": -125.8011474609375, "KL/mean": -152.323486328125, "KL/rejected_KL_mean": -178.84580993652344, "KL/std": 109.30430603027344, "epoch": 0.2834067547723935, "fcm_dpo/beta": 0.0045287711545825005, "fcm_dpo/delta": 0.041177622973918915, "fcm_dpo/margin": 53.04465103149414, "fcm_dpo/q_t": 0.44289711117744446, "grad_norm": 29.155603408813477, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.3240417540073395, "logits/rejected": -0.319375216960907, "logps/chosen": -185.99163818359375, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -255.25335693359375, "loss": 1.2557, "margin_dpo/margin_mean": 53.04465103149414, "margin_dpo/margin_std": 140.07958984375, "step": 193 }, { "KL/chosen_KL_mean": -77.91348266601562, "KL/mean": -120.96083068847656, "KL/rejected_KL_mean": -164.00819396972656, "KL/std": 90.18110656738281, "epoch": 0.28487518355359764, "fcm_dpo/beta": 0.004540526773780584, "fcm_dpo/delta": 0.00930863805115223, "fcm_dpo/margin": 86.09469604492188, "fcm_dpo/q_t": 0.40839725732803345, "grad_norm": 16.880617141723633, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.2117328941822052, "logits/rejected": -0.19627614319324493, "logps/chosen": -115.75386047363281, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -224.69296264648438, "loss": 1.0819, "margin_dpo/margin_mean": 86.09468841552734, "margin_dpo/margin_std": 99.02046203613281, "step": 194 }, { "KL/chosen_KL_mean": -123.40607452392578, "KL/mean": -170.14013671875, "KL/rejected_KL_mean": -216.87420654296875, "KL/std": 114.27099609375, "epoch": 0.28634361233480177, "fcm_dpo/beta": 0.004536244552582502, "fcm_dpo/delta": -0.02507840283215046, "fcm_dpo/margin": 93.46810913085938, "fcm_dpo/q_t": 0.40300172567367554, "grad_norm": 23.332624435424805, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.2611733078956604, "logits/rejected": -0.2804575562477112, "logps/chosen": -178.29763793945312, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -313.6451416015625, "loss": 1.0732, "margin_dpo/margin_mean": 93.46810913085938, "margin_dpo/margin_std": 118.67242431640625, "step": 195 }, { "KL/chosen_KL_mean": -94.62376403808594, "KL/mean": -147.49710083007812, "KL/rejected_KL_mean": -200.3704376220703, "KL/std": 114.38645935058594, "epoch": 0.2878120411160059, "fcm_dpo/beta": 0.004456968978047371, "fcm_dpo/delta": -0.07591746747493744, "fcm_dpo/margin": 105.74667358398438, "fcm_dpo/q_t": 0.3952023983001709, "grad_norm": 17.2235107421875, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.2857983708381653, "logits/rejected": -0.2708747684955597, "logps/chosen": -147.8690185546875, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -276.42340087890625, "loss": 1.0618, "margin_dpo/margin_mean": 105.74667358398438, "margin_dpo/margin_std": 138.9565887451172, "step": 196 }, { "KL/chosen_KL_mean": -97.66607666015625, "KL/mean": -138.1141357421875, "KL/rejected_KL_mean": -178.56219482421875, "KL/std": 99.23837280273438, "epoch": 0.28928046989721, "fcm_dpo/beta": 0.0045026084408164024, "fcm_dpo/delta": 0.03682290017604828, "fcm_dpo/margin": 80.89613342285156, "fcm_dpo/q_t": 0.4166555404663086, "grad_norm": 18.91534996032715, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.28415030241012573, "logits/rejected": -0.2793928384780884, "logps/chosen": -158.08641052246094, "logps/ref_chosen": -60.42033386230469, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -255.7711181640625, "loss": 1.1147, "margin_dpo/margin_mean": 80.89613342285156, "margin_dpo/margin_std": 111.98675537109375, "step": 197 }, { "KL/chosen_KL_mean": -110.67520141601562, "KL/mean": -162.27679443359375, "KL/rejected_KL_mean": -213.87840270996094, "KL/std": 122.85459899902344, "epoch": 0.2907488986784141, "fcm_dpo/beta": 0.004457796923816204, "fcm_dpo/delta": -0.06293203681707382, "fcm_dpo/margin": 103.20319366455078, "fcm_dpo/q_t": 0.3982738256454468, "grad_norm": 22.420948028564453, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.31583186984062195, "logits/rejected": -0.31620723009109497, "logps/chosen": -165.71139526367188, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -311.12164306640625, "loss": 1.0625, "margin_dpo/margin_mean": 103.20319366455078, "margin_dpo/margin_std": 139.06460571289062, "step": 198 }, { "KL/chosen_KL_mean": -104.47000885009766, "KL/mean": -155.94883728027344, "KL/rejected_KL_mean": -207.42767333984375, "KL/std": 111.98977661132812, "epoch": 0.2922173274596182, "fcm_dpo/beta": 0.004380636848509312, "fcm_dpo/delta": -0.054408542811870575, "fcm_dpo/margin": 102.95765686035156, "fcm_dpo/q_t": 0.39817678928375244, "grad_norm": 23.365877151489258, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.3535653352737427, "logits/rejected": -0.3379266858100891, "logps/chosen": -161.298828125, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -292.07586669921875, "loss": 1.0706, "margin_dpo/margin_mean": 102.95765686035156, "margin_dpo/margin_std": 137.37387084960938, "step": 199 }, { "KL/chosen_KL_mean": -102.97698974609375, "KL/mean": -154.0308837890625, "KL/rejected_KL_mean": -205.0847625732422, "KL/std": 120.30370330810547, "epoch": 0.2936857562408223, "fcm_dpo/beta": 0.004349041730165482, "fcm_dpo/delta": -0.046485088765621185, "fcm_dpo/margin": 102.1077880859375, "fcm_dpo/q_t": 0.4016646146774292, "grad_norm": 20.43732452392578, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.28155362606048584, "logits/rejected": -0.2662222385406494, "logps/chosen": -156.04405212402344, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -285.6932067871094, "loss": 1.0923, "margin_dpo/margin_mean": 102.1077880859375, "margin_dpo/margin_std": 154.43467712402344, "step": 200 }, { "KL/chosen_KL_mean": -107.32299041748047, "KL/mean": -158.21717834472656, "KL/rejected_KL_mean": -209.11135864257812, "KL/std": 126.88838195800781, "epoch": 0.29515418502202645, "fcm_dpo/beta": 0.004335206001996994, "fcm_dpo/delta": -0.04319122061133385, "fcm_dpo/margin": 101.78836059570312, "fcm_dpo/q_t": 0.4008065462112427, "grad_norm": 20.02793312072754, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.3679494261741638, "logits/rejected": -0.36475256085395813, "logps/chosen": -182.7252197265625, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -323.9195861816406, "loss": 1.0816, "margin_dpo/margin_mean": 101.78836059570312, "margin_dpo/margin_std": 143.44985961914062, "step": 201 }, { "KL/chosen_KL_mean": -111.05018615722656, "KL/mean": -146.92918395996094, "KL/rejected_KL_mean": -182.8081817626953, "KL/std": 109.39289855957031, "epoch": 0.2966226138032305, "fcm_dpo/beta": 0.004375634714961052, "fcm_dpo/delta": 0.08873856067657471, "fcm_dpo/margin": 71.75799560546875, "fcm_dpo/q_t": 0.43042024970054626, "grad_norm": 20.026168823242188, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.2920665740966797, "logits/rejected": -0.30537718534469604, "logps/chosen": -161.15150451660156, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -269.793212890625, "loss": 1.1858, "margin_dpo/margin_mean": 71.75798797607422, "margin_dpo/margin_std": 141.61204528808594, "step": 202 }, { "KL/chosen_KL_mean": -109.14225769042969, "KL/mean": -152.8557586669922, "KL/rejected_KL_mean": -196.5692138671875, "KL/std": 111.0545654296875, "epoch": 0.29809104258443464, "fcm_dpo/beta": 0.004399011377245188, "fcm_dpo/delta": 0.016021015122532845, "fcm_dpo/margin": 87.4269790649414, "fcm_dpo/q_t": 0.41227254271507263, "grad_norm": 20.92191505432129, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.3324393033981323, "logits/rejected": -0.321586549282074, "logps/chosen": -169.751953125, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -282.4652099609375, "loss": 1.1052, "margin_dpo/margin_mean": 87.4269790649414, "margin_dpo/margin_std": 122.52427673339844, "step": 203 }, { "KL/chosen_KL_mean": -122.31649780273438, "KL/mean": -163.0527801513672, "KL/rejected_KL_mean": -203.78907775878906, "KL/std": 121.6878662109375, "epoch": 0.29955947136563876, "fcm_dpo/beta": 0.004431103356182575, "fcm_dpo/delta": 0.04046226292848587, "fcm_dpo/margin": 81.47259521484375, "fcm_dpo/q_t": 0.4206123650074005, "grad_norm": 26.054628372192383, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.4261574149131775, "logits/rejected": -0.38839346170425415, "logps/chosen": -202.4814453125, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -291.4849853515625, "loss": 1.1504, "margin_dpo/margin_mean": 81.47258758544922, "margin_dpo/margin_std": 142.71099853515625, "step": 204 }, { "KL/chosen_KL_mean": -117.57252502441406, "KL/mean": -171.2581024169922, "KL/rejected_KL_mean": -224.94366455078125, "KL/std": 124.95415496826172, "epoch": 0.3010279001468429, "fcm_dpo/beta": 0.004357962869107723, "fcm_dpo/delta": -0.07328492403030396, "fcm_dpo/margin": 107.37113189697266, "fcm_dpo/q_t": 0.3927791714668274, "grad_norm": 19.602901458740234, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.34010183811187744, "logits/rejected": -0.3110367953777313, "logps/chosen": -176.95726013183594, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -310.0687255859375, "loss": 1.0606, "margin_dpo/margin_mean": 107.37113952636719, "margin_dpo/margin_std": 136.42298889160156, "step": 205 }, { "KL/chosen_KL_mean": -108.13024139404297, "KL/mean": -166.66561889648438, "KL/rejected_KL_mean": -225.20098876953125, "KL/std": 116.73199462890625, "epoch": 0.302496328928047, "fcm_dpo/beta": 0.004292918369174004, "fcm_dpo/delta": -0.10862280428409576, "fcm_dpo/margin": 117.07073974609375, "fcm_dpo/q_t": 0.38319119811058044, "grad_norm": 25.08537483215332, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.25357377529144287, "logits/rejected": -0.25648266077041626, "logps/chosen": -155.09474182128906, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -324.15447998046875, "loss": 1.0155, "margin_dpo/margin_mean": 117.07073974609375, "margin_dpo/margin_std": 127.296875, "step": 206 }, { "KL/chosen_KL_mean": -99.36531829833984, "KL/mean": -167.42587280273438, "KL/rejected_KL_mean": -235.48643493652344, "KL/std": 135.0901641845703, "epoch": 0.3039647577092511, "fcm_dpo/beta": 0.004175534471869469, "fcm_dpo/delta": -0.1783892959356308, "fcm_dpo/margin": 136.12110900878906, "fcm_dpo/q_t": 0.37481075525283813, "grad_norm": 22.83678436279297, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.3456140458583832, "logits/rejected": -0.3190155029296875, "logps/chosen": -155.42156982421875, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -319.9342346191406, "loss": 0.9899, "margin_dpo/margin_mean": 136.12110900878906, "margin_dpo/margin_std": 157.92230224609375, "step": 207 }, { "KL/chosen_KL_mean": -152.72695922851562, "KL/mean": -205.685791015625, "KL/rejected_KL_mean": -258.64459228515625, "KL/std": 128.31884765625, "epoch": 0.3054331864904552, "fcm_dpo/beta": 0.004102812148630619, "fcm_dpo/delta": -0.03629569336771965, "fcm_dpo/margin": 105.91764068603516, "fcm_dpo/q_t": 0.40152066946029663, "grad_norm": 25.079753875732422, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.3910176157951355, "logits/rejected": -0.3746778964996338, "logps/chosen": -219.79457092285156, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -352.9315185546875, "loss": 1.0928, "margin_dpo/margin_mean": 105.91764831542969, "margin_dpo/margin_std": 156.15579223632812, "step": 208 }, { "KL/chosen_KL_mean": -129.83139038085938, "KL/mean": -176.5919952392578, "KL/rejected_KL_mean": -223.35260009765625, "KL/std": 117.19390869140625, "epoch": 0.3069016152716593, "fcm_dpo/beta": 0.004114994779229164, "fcm_dpo/delta": 0.015758566558361053, "fcm_dpo/margin": 93.5212173461914, "fcm_dpo/q_t": 0.4131506383419037, "grad_norm": 30.509929656982422, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.26792603731155396, "logits/rejected": -0.2554609179496765, "logps/chosen": -186.01309204101562, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -304.29412841796875, "loss": 1.1281, "margin_dpo/margin_mean": 93.52120971679688, "margin_dpo/margin_std": 149.02239990234375, "step": 209 }, { "KL/chosen_KL_mean": -116.4477767944336, "KL/mean": -170.14013671875, "KL/rejected_KL_mean": -223.83248901367188, "KL/std": 119.5206069946289, "epoch": 0.30837004405286345, "fcm_dpo/beta": 0.004107258282601833, "fcm_dpo/delta": -0.04311756044626236, "fcm_dpo/margin": 107.38471984863281, "fcm_dpo/q_t": 0.40103164315223694, "grad_norm": 24.466625213623047, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.30964159965515137, "logits/rejected": -0.3021623492240906, "logps/chosen": -162.81959533691406, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -300.51409912109375, "loss": 1.0767, "margin_dpo/margin_mean": 107.38471221923828, "margin_dpo/margin_std": 146.1619873046875, "step": 210 }, { "KL/chosen_KL_mean": -167.79052734375, "KL/mean": -213.89144897460938, "KL/rejected_KL_mean": -259.99237060546875, "KL/std": 136.38101196289062, "epoch": 0.30983847283406757, "fcm_dpo/beta": 0.0040979161858558655, "fcm_dpo/delta": 0.02301332727074623, "fcm_dpo/margin": 92.20182800292969, "fcm_dpo/q_t": 0.4184762239456177, "grad_norm": 33.73249816894531, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.3087081015110016, "logits/rejected": -0.26862210035324097, "logps/chosen": -246.72286987304688, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -346.8133544921875, "loss": 1.1425, "margin_dpo/margin_mean": 92.20182800292969, "margin_dpo/margin_std": 158.70635986328125, "step": 211 }, { "KL/chosen_KL_mean": -138.05943298339844, "KL/mean": -203.9874267578125, "KL/rejected_KL_mean": -269.9154052734375, "KL/std": 145.75244140625, "epoch": 0.31130690161527164, "fcm_dpo/beta": 0.003989426419138908, "fcm_dpo/delta": -0.13511215150356293, "fcm_dpo/margin": 131.85598754882812, "fcm_dpo/q_t": 0.382382869720459, "grad_norm": 24.812414169311523, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.26592007279396057, "logits/rejected": -0.25487691164016724, "logps/chosen": -196.25643920898438, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05785369873047, "logps/rejected": -372.9732666015625, "loss": 1.0304, "margin_dpo/margin_mean": 131.85598754882812, "margin_dpo/margin_std": 161.92364501953125, "step": 212 }, { "KL/chosen_KL_mean": -129.4147186279297, "KL/mean": -192.4105224609375, "KL/rejected_KL_mean": -255.40631103515625, "KL/std": 128.76019287109375, "epoch": 0.31277533039647576, "fcm_dpo/beta": 0.003918571397662163, "fcm_dpo/delta": -0.09918186068534851, "fcm_dpo/margin": 125.9916000366211, "fcm_dpo/q_t": 0.3877994418144226, "grad_norm": 32.999141693115234, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.3247559368610382, "logits/rejected": -0.30033838748931885, "logps/chosen": -196.92742919921875, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -349.321044921875, "loss": 1.0348, "margin_dpo/margin_mean": 125.9916000366211, "margin_dpo/margin_std": 152.36477661132812, "step": 213 }, { "KL/chosen_KL_mean": -111.72816467285156, "KL/mean": -168.243896484375, "KL/rejected_KL_mean": -224.75961303710938, "KL/std": 126.36701965332031, "epoch": 0.3142437591776799, "fcm_dpo/beta": 0.0038879900239408016, "fcm_dpo/delta": -0.04128566384315491, "fcm_dpo/margin": 113.03146362304688, "fcm_dpo/q_t": 0.40068867802619934, "grad_norm": 22.442670822143555, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.2424134612083435, "logits/rejected": -0.24728670716285706, "logps/chosen": -153.3330535888672, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -302.27703857421875, "loss": 1.0691, "margin_dpo/margin_mean": 113.03147888183594, "margin_dpo/margin_std": 146.84898376464844, "step": 214 }, { "KL/chosen_KL_mean": -130.1924591064453, "KL/mean": -189.68450927734375, "KL/rejected_KL_mean": -249.17657470703125, "KL/std": 129.92074584960938, "epoch": 0.315712187958884, "fcm_dpo/beta": 0.0038366110529750586, "fcm_dpo/delta": -0.05947209149599075, "fcm_dpo/margin": 118.9841079711914, "fcm_dpo/q_t": 0.3948417901992798, "grad_norm": 26.03775405883789, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.27569520473480225, "logits/rejected": -0.2664262354373932, "logps/chosen": -183.4717254638672, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -339.1412353515625, "loss": 1.044, "margin_dpo/margin_mean": 118.9841079711914, "margin_dpo/margin_std": 135.40081787109375, "step": 215 }, { "KL/chosen_KL_mean": -134.6547088623047, "KL/mean": -192.75794982910156, "KL/rejected_KL_mean": -250.8612060546875, "KL/std": 135.34701538085938, "epoch": 0.31718061674008813, "fcm_dpo/beta": 0.0038237408734858036, "fcm_dpo/delta": -0.046559788286685944, "fcm_dpo/margin": 116.20650482177734, "fcm_dpo/q_t": 0.4005330502986908, "grad_norm": 25.67644691467285, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.2784517705440521, "logits/rejected": -0.26232653856277466, "logps/chosen": -183.54251098632812, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -328.06011962890625, "loss": 1.085, "margin_dpo/margin_mean": 116.20650482177734, "margin_dpo/margin_std": 166.7809295654297, "step": 216 }, { "KL/chosen_KL_mean": -133.6094970703125, "KL/mean": -202.16543579101562, "KL/rejected_KL_mean": -270.72137451171875, "KL/std": 137.3214111328125, "epoch": 0.3186490455212922, "fcm_dpo/beta": 0.0037270013708621264, "fcm_dpo/delta": -0.11725132167339325, "fcm_dpo/margin": 137.11187744140625, "fcm_dpo/q_t": 0.3843996822834015, "grad_norm": 21.137720108032227, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.26740846037864685, "logits/rejected": -0.2628672122955322, "logps/chosen": -183.45480346679688, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -370.7996826171875, "loss": 1.0138, "margin_dpo/margin_mean": 137.11187744140625, "margin_dpo/margin_std": 154.92494201660156, "step": 217 }, { "KL/chosen_KL_mean": -141.71827697753906, "KL/mean": -194.88088989257812, "KL/rejected_KL_mean": -248.04351806640625, "KL/std": 136.5504150390625, "epoch": 0.3201174743024963, "fcm_dpo/beta": 0.0037083416245877743, "fcm_dpo/delta": 0.0059033287689089775, "fcm_dpo/margin": 106.32524108886719, "fcm_dpo/q_t": 0.4114874601364136, "grad_norm": 22.63395881652832, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.2704193592071533, "logits/rejected": -0.26462244987487793, "logps/chosen": -200.29495239257812, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -335.8899230957031, "loss": 1.1123, "margin_dpo/margin_mean": 106.32524108886719, "margin_dpo/margin_std": 159.38711547851562, "step": 218 }, { "KL/chosen_KL_mean": -149.95846557617188, "KL/mean": -195.880859375, "KL/rejected_KL_mean": -241.8032684326172, "KL/std": 140.34046936035156, "epoch": 0.32158590308370044, "fcm_dpo/beta": 0.0037643599789589643, "fcm_dpo/delta": 0.05545644462108612, "fcm_dpo/margin": 91.84481048583984, "fcm_dpo/q_t": 0.4227282404899597, "grad_norm": 27.498462677001953, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.34284111857414246, "logits/rejected": -0.33793115615844727, "logps/chosen": -211.04232788085938, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -327.6336975097656, "loss": 1.1678, "margin_dpo/margin_mean": 91.84481048583984, "margin_dpo/margin_std": 171.24391174316406, "step": 219 }, { "KL/chosen_KL_mean": -168.154296875, "KL/mean": -206.7792205810547, "KL/rejected_KL_mean": -245.40414428710938, "KL/std": 120.42225646972656, "epoch": 0.32305433186490456, "fcm_dpo/beta": 0.00381092494353652, "fcm_dpo/delta": 0.10894529521465302, "fcm_dpo/margin": 77.24984741210938, "fcm_dpo/q_t": 0.4311019778251648, "grad_norm": 24.416948318481445, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.2805694043636322, "logits/rejected": -0.2573145031929016, "logps/chosen": -238.18557739257812, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -333.08966064453125, "loss": 1.1685, "margin_dpo/margin_mean": 77.24984741210938, "margin_dpo/margin_std": 125.33375549316406, "step": 220 }, { "KL/chosen_KL_mean": -144.7303466796875, "KL/mean": -229.44322204589844, "KL/rejected_KL_mean": -314.1560974121094, "KL/std": 154.45883178710938, "epoch": 0.3245227606461087, "fcm_dpo/beta": 0.0037145623937249184, "fcm_dpo/delta": -0.24419276416301727, "fcm_dpo/margin": 169.42575073242188, "fcm_dpo/q_t": 0.3563760221004486, "grad_norm": 25.376220703125, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.28012099862098694, "logits/rejected": -0.28704455494880676, "logps/chosen": -196.8850555419922, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -420.623779296875, "loss": 0.9329, "margin_dpo/margin_mean": 169.42575073242188, "margin_dpo/margin_std": 157.756103515625, "step": 221 }, { "KL/chosen_KL_mean": -154.47286987304688, "KL/mean": -217.7156982421875, "KL/rejected_KL_mean": -280.95849609375, "KL/std": 141.71084594726562, "epoch": 0.32599118942731276, "fcm_dpo/beta": 0.0036375990603119135, "fcm_dpo/delta": -0.06309865415096283, "fcm_dpo/margin": 126.48562622070312, "fcm_dpo/q_t": 0.39524269104003906, "grad_norm": 21.781951904296875, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.3290114402770996, "logits/rejected": -0.3324123024940491, "logps/chosen": -215.4439697265625, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -380.95965576171875, "loss": 1.0556, "margin_dpo/margin_mean": 126.4856185913086, "margin_dpo/margin_std": 158.17788696289062, "step": 222 }, { "KL/chosen_KL_mean": -162.42262268066406, "KL/mean": -216.47817993164062, "KL/rejected_KL_mean": -270.53375244140625, "KL/std": 139.66900634765625, "epoch": 0.3274596182085169, "fcm_dpo/beta": 0.0036115439143031836, "fcm_dpo/delta": 0.009921977296471596, "fcm_dpo/margin": 108.11112213134766, "fcm_dpo/q_t": 0.4122130274772644, "grad_norm": 25.37874412536621, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.24600395560264587, "logits/rejected": -0.2434278130531311, "logps/chosen": -215.06320190429688, "logps/ref_chosen": -52.64057540893555, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -353.3587646484375, "loss": 1.1264, "margin_dpo/margin_mean": 108.11112213134766, "margin_dpo/margin_std": 176.22784423828125, "step": 223 }, { "KL/chosen_KL_mean": -141.04718017578125, "KL/mean": -202.54745483398438, "KL/rejected_KL_mean": -264.0477294921875, "KL/std": 154.926513671875, "epoch": 0.328928046989721, "fcm_dpo/beta": 0.003568105399608612, "fcm_dpo/delta": -0.04215101897716522, "fcm_dpo/margin": 123.00054168701172, "fcm_dpo/q_t": 0.4034996032714844, "grad_norm": 23.84183120727539, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.3231106102466583, "logits/rejected": -0.3093082904815674, "logps/chosen": -189.64259338378906, "logps/ref_chosen": -48.59541320800781, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -341.1642150878906, "loss": 1.0901, "margin_dpo/margin_mean": 123.00054168701172, "margin_dpo/margin_std": 179.27581787109375, "step": 224 }, { "KL/chosen_KL_mean": -163.38140869140625, "KL/mean": -235.73553466796875, "KL/rejected_KL_mean": -308.089599609375, "KL/std": 149.63555908203125, "epoch": 0.3303964757709251, "fcm_dpo/beta": 0.003522678278386593, "fcm_dpo/delta": -0.11579211056232452, "fcm_dpo/margin": 144.70822143554688, "fcm_dpo/q_t": 0.3856618404388428, "grad_norm": 22.832763671875, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.30160531401634216, "logits/rejected": -0.29380398988723755, "logps/chosen": -221.38186645507812, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90291595458984, "logps/rejected": -407.9925231933594, "loss": 1.0322, "margin_dpo/margin_mean": 144.70822143554688, "margin_dpo/margin_std": 182.02337646484375, "step": 225 }, { "KL/chosen_KL_mean": -134.9498291015625, "KL/mean": -192.45901489257812, "KL/rejected_KL_mean": -249.96820068359375, "KL/std": 144.02420043945312, "epoch": 0.33186490455212925, "fcm_dpo/beta": 0.0034828565549105406, "fcm_dpo/delta": -0.001416236162185669, "fcm_dpo/margin": 115.01838684082031, "fcm_dpo/q_t": 0.4106452763080597, "grad_norm": 30.9562931060791, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.36006784439086914, "logits/rejected": -0.32199037075042725, "logps/chosen": -193.8486328125, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -328.65594482421875, "loss": 1.1053, "margin_dpo/margin_mean": 115.01838684082031, "margin_dpo/margin_std": 166.55606079101562, "step": 226 }, { "KL/chosen_KL_mean": -160.70245361328125, "KL/mean": -233.0044403076172, "KL/rejected_KL_mean": -305.3064270019531, "KL/std": 166.17343139648438, "epoch": 0.3333333333333333, "fcm_dpo/beta": 0.0034378478303551674, "fcm_dpo/delta": -0.10290348529815674, "fcm_dpo/margin": 144.60397338867188, "fcm_dpo/q_t": 0.38717547059059143, "grad_norm": 22.616573333740234, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.36182135343551636, "logits/rejected": -0.3515620529651642, "logps/chosen": -219.77462768554688, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -404.71881103515625, "loss": 1.0293, "margin_dpo/margin_mean": 144.60397338867188, "margin_dpo/margin_std": 172.94851684570312, "step": 227 }, { "KL/chosen_KL_mean": -162.9698028564453, "KL/mean": -214.10031127929688, "KL/rejected_KL_mean": -265.2308349609375, "KL/std": 134.146728515625, "epoch": 0.33480176211453744, "fcm_dpo/beta": 0.0034628671128302813, "fcm_dpo/delta": 0.0474376454949379, "fcm_dpo/margin": 102.26102447509766, "fcm_dpo/q_t": 0.418906033039093, "grad_norm": 26.848163604736328, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.31502634286880493, "logits/rejected": -0.2975466251373291, "logps/chosen": -228.861083984375, "logps/ref_chosen": -65.89128875732422, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -356.2795715332031, "loss": 1.1398, "margin_dpo/margin_mean": 102.26102447509766, "margin_dpo/margin_std": 164.67837524414062, "step": 228 }, { "KL/chosen_KL_mean": -167.70614624023438, "KL/mean": -222.6710968017578, "KL/rejected_KL_mean": -277.63604736328125, "KL/std": 156.47640991210938, "epoch": 0.33627019089574156, "fcm_dpo/beta": 0.003477250225841999, "fcm_dpo/delta": 0.018432918936014175, "fcm_dpo/margin": 109.92990112304688, "fcm_dpo/q_t": 0.41217368841171265, "grad_norm": 31.090105056762695, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.40321260690689087, "logits/rejected": -0.37525972723960876, "logps/chosen": -238.4125213623047, "logps/ref_chosen": -70.70637512207031, "logps/ref_rejected": -84.52741241455078, "logps/rejected": -362.1634521484375, "loss": 1.1155, "margin_dpo/margin_mean": 109.92990112304688, "margin_dpo/margin_std": 164.13958740234375, "step": 229 }, { "KL/chosen_KL_mean": -126.01810455322266, "KL/mean": -208.7505340576172, "KL/rejected_KL_mean": -291.48297119140625, "KL/std": 146.91603088378906, "epoch": 0.3377386196769457, "fcm_dpo/beta": 0.0033868225291371346, "fcm_dpo/delta": -0.17020674049854279, "fcm_dpo/margin": 165.46487426757812, "fcm_dpo/q_t": 0.37030667066574097, "grad_norm": 24.862947463989258, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.31466907262802124, "logits/rejected": -0.31940126419067383, "logps/chosen": -165.30010986328125, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -377.1048889160156, "loss": 0.9708, "margin_dpo/margin_mean": 165.46487426757812, "margin_dpo/margin_std": 161.74786376953125, "step": 230 }, { "KL/chosen_KL_mean": -148.27967834472656, "KL/mean": -205.89512634277344, "KL/rejected_KL_mean": -263.51055908203125, "KL/std": 131.91383361816406, "epoch": 0.3392070484581498, "fcm_dpo/beta": 0.003367940429598093, "fcm_dpo/delta": 0.01239101029932499, "fcm_dpo/margin": 115.23086547851562, "fcm_dpo/q_t": 0.4120003581047058, "grad_norm": 23.897953033447266, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.3472822308540344, "logits/rejected": -0.31850665807724, "logps/chosen": -211.55612182617188, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -337.63446044921875, "loss": 1.0961, "margin_dpo/margin_mean": 115.23086547851562, "margin_dpo/margin_std": 152.61228942871094, "step": 231 }, { "KL/chosen_KL_mean": -182.43360900878906, "KL/mean": -230.69378662109375, "KL/rejected_KL_mean": -278.9539794921875, "KL/std": 157.08425903320312, "epoch": 0.3406754772393539, "fcm_dpo/beta": 0.0034012598916888237, "fcm_dpo/delta": 0.07421056926250458, "fcm_dpo/margin": 96.52035522460938, "fcm_dpo/q_t": 0.4272800087928772, "grad_norm": 25.275489807128906, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.36489853262901306, "logits/rejected": -0.3447696268558502, "logps/chosen": -253.182373046875, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -362.9310302734375, "loss": 1.1552, "margin_dpo/margin_mean": 96.52035522460938, "margin_dpo/margin_std": 163.61196899414062, "step": 232 }, { "KL/chosen_KL_mean": -166.96878051757812, "KL/mean": -240.413330078125, "KL/rejected_KL_mean": -313.8578796386719, "KL/std": 165.35609436035156, "epoch": 0.342143906020558, "fcm_dpo/beta": 0.003372794948518276, "fcm_dpo/delta": -0.10033433884382248, "fcm_dpo/margin": 146.88909912109375, "fcm_dpo/q_t": 0.3924492597579956, "grad_norm": 26.219318389892578, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.31235527992248535, "logits/rejected": -0.3153640925884247, "logps/chosen": -221.85171508789062, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.4800796508789, "logps/rejected": -421.33795166015625, "loss": 1.0631, "margin_dpo/margin_mean": 146.88909912109375, "margin_dpo/margin_std": 210.21575927734375, "step": 233 }, { "KL/chosen_KL_mean": -165.90899658203125, "KL/mean": -241.48690795898438, "KL/rejected_KL_mean": -317.0648193359375, "KL/std": 148.5336151123047, "epoch": 0.3436123348017621, "fcm_dpo/beta": 0.0032793269492685795, "fcm_dpo/delta": -0.1023728996515274, "fcm_dpo/margin": 151.15582275390625, "fcm_dpo/q_t": 0.38733774423599243, "grad_norm": 41.00167465209961, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.3150627017021179, "logits/rejected": -0.34308189153671265, "logps/chosen": -210.00350952148438, "logps/ref_chosen": -44.094520568847656, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -417.07147216796875, "loss": 1.0245, "margin_dpo/margin_mean": 151.15582275390625, "margin_dpo/margin_std": 169.50576782226562, "step": 234 }, { "KL/chosen_KL_mean": -197.6452178955078, "KL/mean": -246.17535400390625, "KL/rejected_KL_mean": -294.70550537109375, "KL/std": 142.49789428710938, "epoch": 0.34508076358296624, "fcm_dpo/beta": 0.0033345932606607676, "fcm_dpo/delta": 0.07840821146965027, "fcm_dpo/margin": 97.06028747558594, "fcm_dpo/q_t": 0.42566415667533875, "grad_norm": 30.126893997192383, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.36974847316741943, "logits/rejected": -0.35245949029922485, "logps/chosen": -259.88311767578125, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39506530761719, "logps/rejected": -385.1005859375, "loss": 1.166, "margin_dpo/margin_mean": 97.06028747558594, "margin_dpo/margin_std": 170.45330810546875, "step": 235 }, { "KL/chosen_KL_mean": -141.45745849609375, "KL/mean": -219.01951599121094, "KL/rejected_KL_mean": -296.58160400390625, "KL/std": 149.04501342773438, "epoch": 0.3465491923641703, "fcm_dpo/beta": 0.003280568402260542, "fcm_dpo/delta": -0.11472684144973755, "fcm_dpo/margin": 155.12411499023438, "fcm_dpo/q_t": 0.3806627690792084, "grad_norm": 48.95918273925781, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.3421390652656555, "logits/rejected": -0.341775506734848, "logps/chosen": -190.79881286621094, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -400.09320068359375, "loss": 0.9902, "margin_dpo/margin_mean": 155.12413024902344, "margin_dpo/margin_std": 140.13827514648438, "step": 236 }, { "KL/chosen_KL_mean": -187.8695068359375, "KL/mean": -248.0149383544922, "KL/rejected_KL_mean": -308.1603698730469, "KL/std": 149.92160034179688, "epoch": 0.34801762114537443, "fcm_dpo/beta": 0.0032692216336727142, "fcm_dpo/delta": 0.006964612752199173, "fcm_dpo/margin": 120.29085540771484, "fcm_dpo/q_t": 0.41055381298065186, "grad_norm": 27.945327758789062, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.3814457356929779, "logits/rejected": -0.3774615526199341, "logps/chosen": -242.03762817382812, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -402.94073486328125, "loss": 1.1038, "margin_dpo/margin_mean": 120.29085540771484, "margin_dpo/margin_std": 172.14694213867188, "step": 237 }, { "KL/chosen_KL_mean": -166.24346923828125, "KL/mean": -225.47650146484375, "KL/rejected_KL_mean": -284.70953369140625, "KL/std": 151.5367889404297, "epoch": 0.34948604992657856, "fcm_dpo/beta": 0.0032576932571828365, "fcm_dpo/delta": 0.01427885890007019, "fcm_dpo/margin": 118.46604919433594, "fcm_dpo/q_t": 0.41405510902404785, "grad_norm": 25.66066551208496, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.3540055751800537, "logits/rejected": -0.3696235418319702, "logps/chosen": -220.21658325195312, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -374.1274719238281, "loss": 1.1057, "margin_dpo/margin_mean": 118.46604919433594, "margin_dpo/margin_std": 167.74099731445312, "step": 238 }, { "KL/chosen_KL_mean": -176.48590087890625, "KL/mean": -241.95834350585938, "KL/rejected_KL_mean": -307.4307861328125, "KL/std": 140.23287963867188, "epoch": 0.3509544787077827, "fcm_dpo/beta": 0.0032407566905021667, "fcm_dpo/delta": -0.026493586599826813, "fcm_dpo/margin": 130.94488525390625, "fcm_dpo/q_t": 0.40173038840293884, "grad_norm": 41.689571380615234, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.36024659872055054, "logits/rejected": -0.3487205505371094, "logps/chosen": -234.58370971679688, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -401.02374267578125, "loss": 1.0726, "margin_dpo/margin_mean": 130.94488525390625, "margin_dpo/margin_std": 158.7174530029297, "step": 239 }, { "KL/chosen_KL_mean": -184.19223022460938, "KL/mean": -238.87380981445312, "KL/rejected_KL_mean": -293.5553894042969, "KL/std": 150.6685028076172, "epoch": 0.3524229074889868, "fcm_dpo/beta": 0.003267391351982951, "fcm_dpo/delta": 0.044143058359622955, "fcm_dpo/margin": 109.3631820678711, "fcm_dpo/q_t": 0.41868656873703003, "grad_norm": 34.611045837402344, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.38304704427719116, "logits/rejected": -0.362338662147522, "logps/chosen": -244.80670166015625, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -367.6739196777344, "loss": 1.1445, "margin_dpo/margin_mean": 109.36317443847656, "margin_dpo/margin_std": 180.47584533691406, "step": 240 }, { "KL/chosen_KL_mean": -152.53372192382812, "KL/mean": -237.73934936523438, "KL/rejected_KL_mean": -322.9449462890625, "KL/std": 166.03857421875, "epoch": 0.35389133627019087, "fcm_dpo/beta": 0.0032168994657695293, "fcm_dpo/delta": -0.15657520294189453, "fcm_dpo/margin": 170.4112548828125, "fcm_dpo/q_t": 0.37778547406196594, "grad_norm": 30.021371841430664, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.4348849952220917, "logits/rejected": -0.4122951626777649, "logps/chosen": -218.62478637695312, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -411.005859375, "loss": 1.0002, "margin_dpo/margin_mean": 170.4112548828125, "margin_dpo/margin_std": 193.95298767089844, "step": 241 }, { "KL/chosen_KL_mean": -178.14532470703125, "KL/mean": -239.86007690429688, "KL/rejected_KL_mean": -301.57489013671875, "KL/std": 146.30352783203125, "epoch": 0.355359765051395, "fcm_dpo/beta": 0.0032022669911384583, "fcm_dpo/delta": 0.004475157707929611, "fcm_dpo/margin": 123.42952728271484, "fcm_dpo/q_t": 0.4114469289779663, "grad_norm": 35.95293045043945, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.35760676860809326, "logits/rejected": -0.33343029022216797, "logps/chosen": -246.00924682617188, "logps/ref_chosen": -67.86392974853516, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -384.9352111816406, "loss": 1.1033, "margin_dpo/margin_mean": 123.42953491210938, "margin_dpo/margin_std": 175.8994140625, "step": 242 }, { "KL/chosen_KL_mean": -181.12774658203125, "KL/mean": -257.05047607421875, "KL/rejected_KL_mean": -332.9732360839844, "KL/std": 163.24075317382812, "epoch": 0.3568281938325991, "fcm_dpo/beta": 0.0031442558392882347, "fcm_dpo/delta": -0.08152244985103607, "fcm_dpo/margin": 151.84548950195312, "fcm_dpo/q_t": 0.3913443386554718, "grad_norm": 27.06866455078125, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.34606635570526123, "logits/rejected": -0.3166462182998657, "logps/chosen": -244.2119903564453, "logps/ref_chosen": -63.0842399597168, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -409.3088684082031, "loss": 1.0419, "margin_dpo/margin_mean": 151.84548950195312, "margin_dpo/margin_std": 185.63485717773438, "step": 243 }, { "KL/chosen_KL_mean": -161.24728393554688, "KL/mean": -241.1260986328125, "KL/rejected_KL_mean": -321.0049133300781, "KL/std": 159.36138916015625, "epoch": 0.35829662261380324, "fcm_dpo/beta": 0.0030848030000925064, "fcm_dpo/delta": -0.09792040288448334, "fcm_dpo/margin": 159.75762939453125, "fcm_dpo/q_t": 0.3867965638637543, "grad_norm": 32.62556076049805, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.4207112193107605, "logits/rejected": -0.4130573272705078, "logps/chosen": -222.38796997070312, "logps/ref_chosen": -61.140689849853516, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -415.8968505859375, "loss": 1.0159, "margin_dpo/margin_mean": 159.75762939453125, "margin_dpo/margin_std": 168.3123779296875, "step": 244 }, { "KL/chosen_KL_mean": -189.32240295410156, "KL/mean": -250.90139770507812, "KL/rejected_KL_mean": -312.48040771484375, "KL/std": 155.51824951171875, "epoch": 0.35976505139500736, "fcm_dpo/beta": 0.003060833550989628, "fcm_dpo/delta": 0.022731080651283264, "fcm_dpo/margin": 123.15798950195312, "fcm_dpo/q_t": 0.4138393700122833, "grad_norm": 28.120004653930664, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.3828911781311035, "logits/rejected": -0.35266777873039246, "logps/chosen": -256.5846862792969, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -400.1204833984375, "loss": 1.1166, "margin_dpo/margin_mean": 123.15798950195312, "margin_dpo/margin_std": 173.21768188476562, "step": 245 }, { "KL/chosen_KL_mean": -184.933349609375, "KL/mean": -248.30307006835938, "KL/rejected_KL_mean": -311.67279052734375, "KL/std": 172.1173858642578, "epoch": 0.36123348017621143, "fcm_dpo/beta": 0.0030869655311107635, "fcm_dpo/delta": 0.009035417810082436, "fcm_dpo/margin": 126.73939514160156, "fcm_dpo/q_t": 0.4123349189758301, "grad_norm": 25.968130111694336, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.43583017587661743, "logits/rejected": -0.39911651611328125, "logps/chosen": -251.6303253173828, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -396.01910400390625, "loss": 1.108, "margin_dpo/margin_mean": 126.73939514160156, "margin_dpo/margin_std": 186.197265625, "step": 246 }, { "KL/chosen_KL_mean": -217.3389129638672, "KL/mean": -306.2090148925781, "KL/rejected_KL_mean": -395.0791015625, "KL/std": 180.24539184570312, "epoch": 0.36270190895741555, "fcm_dpo/beta": 0.0030250344425439835, "fcm_dpo/delta": -0.14571964740753174, "fcm_dpo/margin": 177.74020385742188, "fcm_dpo/q_t": 0.377947062253952, "grad_norm": 34.5230712890625, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.35546165704727173, "logits/rejected": -0.3532963991165161, "logps/chosen": -273.94427490234375, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29326629638672, "logps/rejected": -501.37237548828125, "loss": 1.0034, "margin_dpo/margin_mean": 177.7401885986328, "margin_dpo/margin_std": 199.57327270507812, "step": 247 }, { "KL/chosen_KL_mean": -187.72491455078125, "KL/mean": -263.7269287109375, "KL/rejected_KL_mean": -339.7288818359375, "KL/std": 144.5110321044922, "epoch": 0.3641703377386197, "fcm_dpo/beta": 0.00296983914449811, "fcm_dpo/delta": -0.05443059653043747, "fcm_dpo/margin": 152.0039825439453, "fcm_dpo/q_t": 0.3929385244846344, "grad_norm": 30.407548904418945, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.3517131209373474, "logits/rejected": -0.37110453844070435, "logps/chosen": -231.76812744140625, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -431.5857849121094, "loss": 1.0265, "margin_dpo/margin_mean": 152.00399780273438, "margin_dpo/margin_std": 138.8626708984375, "step": 248 }, { "KL/chosen_KL_mean": -232.51548767089844, "KL/mean": -272.2945556640625, "KL/rejected_KL_mean": -312.07366943359375, "KL/std": 151.4901580810547, "epoch": 0.3656387665198238, "fcm_dpo/beta": 0.0030452050268650055, "fcm_dpo/delta": 0.1617167890071869, "fcm_dpo/margin": 79.55818176269531, "fcm_dpo/q_t": 0.4446827173233032, "grad_norm": 51.20861053466797, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.3823295533657074, "logits/rejected": -0.360689252614975, "logps/chosen": -294.95782470703125, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -392.5417175292969, "loss": 1.2284, "margin_dpo/margin_mean": 79.55818176269531, "margin_dpo/margin_std": 176.84884643554688, "step": 249 }, { "KL/chosen_KL_mean": -208.30474853515625, "KL/mean": -283.3992919921875, "KL/rejected_KL_mean": -358.49383544921875, "KL/std": 153.48965454101562, "epoch": 0.3671071953010279, "fcm_dpo/beta": 0.0030563112813979387, "fcm_dpo/delta": -0.061967238783836365, "fcm_dpo/margin": 150.18910217285156, "fcm_dpo/q_t": 0.39273035526275635, "grad_norm": 63.86355972290039, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.3651628792285919, "logits/rejected": -0.3122418522834778, "logps/chosen": -273.9414367675781, "logps/ref_chosen": -65.63668823242188, "logps/ref_rejected": -73.87184143066406, "logps/rejected": -432.36566162109375, "loss": 1.0317, "margin_dpo/margin_mean": 150.18910217285156, "margin_dpo/margin_std": 152.9610137939453, "step": 250 }, { "KL/chosen_KL_mean": -218.24432373046875, "KL/mean": -268.9655456542969, "KL/rejected_KL_mean": -319.686767578125, "KL/std": 161.1807861328125, "epoch": 0.368575624082232, "fcm_dpo/beta": 0.003067499492317438, "fcm_dpo/delta": 0.09177864342927933, "fcm_dpo/margin": 101.44242858886719, "fcm_dpo/q_t": 0.42934930324554443, "grad_norm": 46.124000549316406, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.32652994990348816, "logits/rejected": -0.29296159744262695, "logps/chosen": -275.42706298828125, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -397.3501892089844, "loss": 1.1706, "margin_dpo/margin_mean": 101.44244384765625, "margin_dpo/margin_std": 174.69888305664062, "step": 251 }, { "KL/chosen_KL_mean": -209.73199462890625, "KL/mean": -280.8515625, "KL/rejected_KL_mean": -351.9710693359375, "KL/std": 143.38165283203125, "epoch": 0.3700440528634361, "fcm_dpo/beta": 0.0030634840950369835, "fcm_dpo/delta": -0.037671059370040894, "fcm_dpo/margin": 142.23912048339844, "fcm_dpo/q_t": 0.3959454894065857, "grad_norm": 26.554500579833984, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.3959979712963104, "logits/rejected": -0.3560243248939514, "logps/chosen": -281.4176330566406, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75799560546875, "logps/rejected": -436.72906494140625, "loss": 1.0445, "margin_dpo/margin_mean": 142.23912048339844, "margin_dpo/margin_std": 143.8651580810547, "step": 252 }, { "KL/chosen_KL_mean": -180.37916564941406, "KL/mean": -247.28616333007812, "KL/rejected_KL_mean": -314.19317626953125, "KL/std": 149.814208984375, "epoch": 0.37151248164464024, "fcm_dpo/beta": 0.003077391069382429, "fcm_dpo/delta": -0.012822866439819336, "fcm_dpo/margin": 133.81402587890625, "fcm_dpo/q_t": 0.40381836891174316, "grad_norm": 22.02428436279297, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.4026602804660797, "logits/rejected": -0.3960729241371155, "logps/chosen": -249.51309204101562, "logps/ref_chosen": -69.1339340209961, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -412.8957214355469, "loss": 1.0801, "margin_dpo/margin_mean": 133.81402587890625, "margin_dpo/margin_std": 165.3370819091797, "step": 253 }, { "KL/chosen_KL_mean": -173.2677001953125, "KL/mean": -232.09576416015625, "KL/rejected_KL_mean": -290.923828125, "KL/std": 157.92564392089844, "epoch": 0.37298091042584436, "fcm_dpo/beta": 0.0030737267807126045, "fcm_dpo/delta": 0.03981554135680199, "fcm_dpo/margin": 117.6561279296875, "fcm_dpo/q_t": 0.42006951570510864, "grad_norm": 20.989831924438477, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.37722885608673096, "logits/rejected": -0.364244282245636, "logps/chosen": -227.42269897460938, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -381.2314758300781, "loss": 1.1402, "margin_dpo/margin_mean": 117.6561279296875, "margin_dpo/margin_std": 197.60504150390625, "step": 254 }, { "KL/chosen_KL_mean": -170.31912231445312, "KL/mean": -232.83419799804688, "KL/rejected_KL_mean": -295.3492736816406, "KL/std": 138.8697967529297, "epoch": 0.3744493392070485, "fcm_dpo/beta": 0.0030783750116825104, "fcm_dpo/delta": 0.015461381524801254, "fcm_dpo/margin": 125.03014373779297, "fcm_dpo/q_t": 0.4109645187854767, "grad_norm": 21.194082260131836, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.3547831177711487, "logits/rejected": -0.3463220000267029, "logps/chosen": -227.46080017089844, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -385.557861328125, "loss": 1.107, "margin_dpo/margin_mean": 125.03013610839844, "margin_dpo/margin_std": 173.13796997070312, "step": 255 }, { "KL/chosen_KL_mean": -147.08546447753906, "KL/mean": -213.44046020507812, "KL/rejected_KL_mean": -279.79547119140625, "KL/std": 147.87371826171875, "epoch": 0.37591776798825255, "fcm_dpo/beta": 0.0030894456431269646, "fcm_dpo/delta": -0.010427280329167843, "fcm_dpo/margin": 132.71002197265625, "fcm_dpo/q_t": 0.40535274147987366, "grad_norm": 39.32383346557617, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.3375306725502014, "logits/rejected": -0.3469845950603485, "logps/chosen": -202.24896240234375, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -372.3583984375, "loss": 1.0822, "margin_dpo/margin_mean": 132.71002197265625, "margin_dpo/margin_std": 170.27206420898438, "step": 256 }, { "KL/chosen_KL_mean": -140.64047241210938, "KL/mean": -211.18006896972656, "KL/rejected_KL_mean": -281.71966552734375, "KL/std": 152.0925750732422, "epoch": 0.37738619676945667, "fcm_dpo/beta": 0.00305275060236454, "fcm_dpo/delta": -0.03301442041993141, "fcm_dpo/margin": 141.07920837402344, "fcm_dpo/q_t": 0.40170031785964966, "grad_norm": 23.27699089050293, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.3658456802368164, "logits/rejected": -0.3539636731147766, "logps/chosen": -190.0641632080078, "logps/ref_chosen": -49.42369842529297, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -361.257568359375, "loss": 1.0705, "margin_dpo/margin_mean": 141.07920837402344, "margin_dpo/margin_std": 172.331787109375, "step": 257 }, { "KL/chosen_KL_mean": -186.5501708984375, "KL/mean": -254.5238037109375, "KL/rejected_KL_mean": -322.4974365234375, "KL/std": 160.353515625, "epoch": 0.3788546255506608, "fcm_dpo/beta": 0.003063221462070942, "fcm_dpo/delta": -0.017175834625959396, "fcm_dpo/margin": 135.947265625, "fcm_dpo/q_t": 0.403425931930542, "grad_norm": 33.59754943847656, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.33080989122390747, "logits/rejected": -0.3309909701347351, "logps/chosen": -245.93429565429688, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.99010467529297, "logps/rejected": -418.487548828125, "loss": 1.0923, "margin_dpo/margin_mean": 135.947265625, "margin_dpo/margin_std": 190.87680053710938, "step": 258 }, { "KL/chosen_KL_mean": -180.4862060546875, "KL/mean": -238.32546997070312, "KL/rejected_KL_mean": -296.16473388671875, "KL/std": 154.69522094726562, "epoch": 0.3803230543318649, "fcm_dpo/beta": 0.003068537451326847, "fcm_dpo/delta": 0.046667762100696564, "fcm_dpo/margin": 115.67851257324219, "fcm_dpo/q_t": 0.4190807044506073, "grad_norm": 27.576894760131836, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.3981941342353821, "logits/rejected": -0.39807045459747314, "logps/chosen": -233.31454467773438, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.191650390625, "logps/rejected": -385.35638427734375, "loss": 1.1328, "margin_dpo/margin_mean": 115.67851257324219, "margin_dpo/margin_std": 177.17535400390625, "step": 259 }, { "KL/chosen_KL_mean": -187.39376831054688, "KL/mean": -266.05322265625, "KL/rejected_KL_mean": -344.71270751953125, "KL/std": 160.06930541992188, "epoch": 0.38179148311306904, "fcm_dpo/beta": 0.003057563677430153, "fcm_dpo/delta": -0.08508844673633575, "fcm_dpo/margin": 157.31893920898438, "fcm_dpo/q_t": 0.39034503698349, "grad_norm": 31.068809509277344, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.3578794002532959, "logits/rejected": -0.37384307384490967, "logps/chosen": -234.81146240234375, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08978271484375, "logps/rejected": -439.802490234375, "loss": 1.0238, "margin_dpo/margin_mean": 157.31893920898438, "margin_dpo/margin_std": 170.58743286132812, "step": 260 }, { "KL/chosen_KL_mean": -198.41915893554688, "KL/mean": -272.1341552734375, "KL/rejected_KL_mean": -345.84918212890625, "KL/std": 177.0203857421875, "epoch": 0.3832599118942731, "fcm_dpo/beta": 0.0030062044970691204, "fcm_dpo/delta": -0.045364413410425186, "fcm_dpo/margin": 147.43002319335938, "fcm_dpo/q_t": 0.4001613259315491, "grad_norm": 19.665233612060547, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.3024892210960388, "logits/rejected": -0.303438663482666, "logps/chosen": -251.45053100585938, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -434.3641357421875, "loss": 1.0747, "margin_dpo/margin_mean": 147.43002319335938, "margin_dpo/margin_std": 202.10751342773438, "step": 261 }, { "KL/chosen_KL_mean": -231.7572021484375, "KL/mean": -292.0511474609375, "KL/rejected_KL_mean": -352.3450927734375, "KL/std": 160.82723999023438, "epoch": 0.38472834067547723, "fcm_dpo/beta": 0.0030295196920633316, "fcm_dpo/delta": 0.035723648965358734, "fcm_dpo/margin": 120.58787536621094, "fcm_dpo/q_t": 0.4150589108467102, "grad_norm": 23.96550750732422, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.3180779814720154, "logits/rejected": -0.29295575618743896, "logps/chosen": -291.3773193359375, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -438.76361083984375, "loss": 1.1084, "margin_dpo/margin_mean": 120.58787536621094, "margin_dpo/margin_std": 159.1489715576172, "step": 262 }, { "KL/chosen_KL_mean": -214.27926635742188, "KL/mean": -300.5625, "KL/rejected_KL_mean": -386.8457336425781, "KL/std": 190.634033203125, "epoch": 0.38619676945668135, "fcm_dpo/beta": 0.002960496349260211, "fcm_dpo/delta": -0.11779750883579254, "fcm_dpo/margin": 172.56646728515625, "fcm_dpo/q_t": 0.38409924507141113, "grad_norm": 20.86168098449707, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.3342781960964203, "logits/rejected": -0.31365495920181274, "logps/chosen": -273.7001953125, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -483.70294189453125, "loss": 1.0246, "margin_dpo/margin_mean": 172.56646728515625, "margin_dpo/margin_std": 205.58511352539062, "step": 263 }, { "KL/chosen_KL_mean": -228.12066650390625, "KL/mean": -300.76123046875, "KL/rejected_KL_mean": -373.40179443359375, "KL/std": 175.52401733398438, "epoch": 0.3876651982378855, "fcm_dpo/beta": 0.0029227761551737785, "fcm_dpo/delta": -0.026993874460458755, "fcm_dpo/margin": 145.28106689453125, "fcm_dpo/q_t": 0.4040907025337219, "grad_norm": 22.25633430480957, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.37826618552207947, "logits/rejected": -0.3711628019809723, "logps/chosen": -290.8427734375, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85620880126953, "logps/rejected": -467.25799560546875, "loss": 1.0907, "margin_dpo/margin_mean": 145.2810821533203, "margin_dpo/margin_std": 201.77871704101562, "step": 264 }, { "KL/chosen_KL_mean": -242.83438110351562, "KL/mean": -314.224365234375, "KL/rejected_KL_mean": -385.6143798828125, "KL/std": 194.88223266601562, "epoch": 0.3891336270190896, "fcm_dpo/beta": 0.0029316158033907413, "fcm_dpo/delta": -0.019387083128094673, "fcm_dpo/margin": 142.77999877929688, "fcm_dpo/q_t": 0.40785303711891174, "grad_norm": 26.028135299682617, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.3183874785900116, "logits/rejected": -0.304283082485199, "logps/chosen": -304.80584716796875, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -473.6349792480469, "loss": 1.1159, "margin_dpo/margin_mean": 142.77999877929688, "margin_dpo/margin_std": 233.77301025390625, "step": 265 }, { "KL/chosen_KL_mean": -247.06314086914062, "KL/mean": -303.61907958984375, "KL/rejected_KL_mean": -360.17498779296875, "KL/std": 165.74261474609375, "epoch": 0.39060205580029367, "fcm_dpo/beta": 0.0029581869021058083, "fcm_dpo/delta": 0.06766145676374435, "fcm_dpo/margin": 113.11186218261719, "fcm_dpo/q_t": 0.42461222410202026, "grad_norm": 42.105552673339844, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.33068183064460754, "logits/rejected": -0.2887161374092102, "logps/chosen": -314.1628112792969, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -428.146240234375, "loss": 1.1555, "margin_dpo/margin_mean": 113.11186218261719, "margin_dpo/margin_std": 193.41064453125, "step": 266 }, { "KL/chosen_KL_mean": -218.62939453125, "KL/mean": -286.082763671875, "KL/rejected_KL_mean": -353.53619384765625, "KL/std": 175.55633544921875, "epoch": 0.3920704845814978, "fcm_dpo/beta": 0.002957455348223448, "fcm_dpo/delta": 0.0007606670260429382, "fcm_dpo/margin": 134.9067840576172, "fcm_dpo/q_t": 0.4094040095806122, "grad_norm": 30.943706512451172, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.4011858105659485, "logits/rejected": -0.37256526947021484, "logps/chosen": -287.60015869140625, "logps/ref_chosen": -68.97075653076172, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -443.7046203613281, "loss": 1.1093, "margin_dpo/margin_mean": 134.90679931640625, "margin_dpo/margin_std": 202.06175231933594, "step": 267 }, { "KL/chosen_KL_mean": -222.305419921875, "KL/mean": -293.548095703125, "KL/rejected_KL_mean": -364.79071044921875, "KL/std": 170.23300170898438, "epoch": 0.3935389133627019, "fcm_dpo/beta": 0.0029503919649869204, "fcm_dpo/delta": -0.02154139243066311, "fcm_dpo/margin": 142.48526000976562, "fcm_dpo/q_t": 0.40528371930122375, "grad_norm": 23.621414184570312, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.3952227234840393, "logits/rejected": -0.40232187509536743, "logps/chosen": -278.20574951171875, "logps/ref_chosen": -55.90031051635742, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -466.4383239746094, "loss": 1.0936, "margin_dpo/margin_mean": 142.48526000976562, "margin_dpo/margin_std": 206.6456298828125, "step": 268 }, { "KL/chosen_KL_mean": -233.79080200195312, "KL/mean": -320.32830810546875, "KL/rejected_KL_mean": -406.86578369140625, "KL/std": 176.70233154296875, "epoch": 0.39500734214390604, "fcm_dpo/beta": 0.002910827985033393, "fcm_dpo/delta": -0.109227254986763, "fcm_dpo/margin": 173.074951171875, "fcm_dpo/q_t": 0.38658440113067627, "grad_norm": 23.399341583251953, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.4322871267795563, "logits/rejected": -0.4134712517261505, "logps/chosen": -303.83038330078125, "logps/ref_chosen": -70.03955841064453, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -514.2151489257812, "loss": 1.0422, "margin_dpo/margin_mean": 173.074951171875, "margin_dpo/margin_std": 224.85488891601562, "step": 269 }, { "KL/chosen_KL_mean": -206.2056427001953, "KL/mean": -268.89251708984375, "KL/rejected_KL_mean": -331.57940673828125, "KL/std": 156.54763793945312, "epoch": 0.3964757709251101, "fcm_dpo/beta": 0.002915448509156704, "fcm_dpo/delta": 0.03554879128932953, "fcm_dpo/margin": 125.37371826171875, "fcm_dpo/q_t": 0.4160010814666748, "grad_norm": 25.85466194152832, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.46122607588768005, "logits/rejected": -0.45263946056365967, "logps/chosen": -275.7391357421875, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -441.5080261230469, "loss": 1.1288, "margin_dpo/margin_mean": 125.37371826171875, "margin_dpo/margin_std": 193.83901977539062, "step": 270 }, { "KL/chosen_KL_mean": -191.8485870361328, "KL/mean": -269.27569580078125, "KL/rejected_KL_mean": -346.70281982421875, "KL/std": 154.6339111328125, "epoch": 0.39794419970631423, "fcm_dpo/beta": 0.0028847784269601107, "fcm_dpo/delta": -0.049142319709062576, "fcm_dpo/margin": 154.854248046875, "fcm_dpo/q_t": 0.3972168564796448, "grad_norm": 22.624128341674805, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.4137924313545227, "logits/rejected": -0.39371395111083984, "logps/chosen": -248.6131591796875, "logps/ref_chosen": -56.76456832885742, "logps/ref_rejected": -92.51383972167969, "logps/rejected": -439.2166748046875, "loss": 1.0476, "margin_dpo/margin_mean": 154.85426330566406, "margin_dpo/margin_std": 175.0887908935547, "step": 271 }, { "KL/chosen_KL_mean": -180.83294677734375, "KL/mean": -275.11517333984375, "KL/rejected_KL_mean": -369.3974304199219, "KL/std": 171.1865234375, "epoch": 0.39941262848751835, "fcm_dpo/beta": 0.002835802501067519, "fcm_dpo/delta": -0.1421043574810028, "fcm_dpo/margin": 188.56448364257812, "fcm_dpo/q_t": 0.37538450956344604, "grad_norm": 32.70468521118164, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.3418217897415161, "logits/rejected": -0.350533664226532, "logps/chosen": -230.33010864257812, "logps/ref_chosen": -49.497154235839844, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -474.94024658203125, "loss": 0.9828, "margin_dpo/margin_mean": 188.56448364257812, "margin_dpo/margin_std": 178.37030029296875, "step": 272 }, { "KL/chosen_KL_mean": -211.40390014648438, "KL/mean": -304.2496643066406, "KL/rejected_KL_mean": -397.0954284667969, "KL/std": 175.37908935546875, "epoch": 0.4008810572687225, "fcm_dpo/beta": 0.00274536176584661, "fcm_dpo/delta": -0.11640150099992752, "fcm_dpo/margin": 185.6915283203125, "fcm_dpo/q_t": 0.3824120759963989, "grad_norm": 21.104841232299805, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.3616599440574646, "logits/rejected": -0.34069812297821045, "logps/chosen": -274.3792724609375, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -489.593994140625, "loss": 1.0078, "margin_dpo/margin_mean": 185.6915283203125, "margin_dpo/margin_std": 196.62474060058594, "step": 273 }, { "KL/chosen_KL_mean": -254.86184692382812, "KL/mean": -323.7417907714844, "KL/rejected_KL_mean": -392.6217346191406, "KL/std": 157.4027099609375, "epoch": 0.4023494860499266, "fcm_dpo/beta": 0.0027546617202460766, "fcm_dpo/delta": 0.021147366613149643, "fcm_dpo/margin": 137.75990295410156, "fcm_dpo/q_t": 0.4122427701950073, "grad_norm": 26.784929275512695, "learning_rate": 3.75e-07, "logits/chosen": -0.2770734429359436, "logits/rejected": -0.2582925260066986, "logps/chosen": -310.5295715332031, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -469.954833984375, "loss": 1.1023, "margin_dpo/margin_mean": 137.7598876953125, "margin_dpo/margin_std": 182.3780517578125, "step": 274 }, { "KL/chosen_KL_mean": -200.73196411132812, "KL/mean": -275.09075927734375, "KL/rejected_KL_mean": -349.4495849609375, "KL/std": 161.7529296875, "epoch": 0.40381791483113066, "fcm_dpo/beta": 0.0027543343603610992, "fcm_dpo/delta": -0.010161615908145905, "fcm_dpo/margin": 148.71763610839844, "fcm_dpo/q_t": 0.4048367142677307, "grad_norm": 23.660945892333984, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.3343503177165985, "logits/rejected": -0.3386019766330719, "logps/chosen": -249.32666015625, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -442.7532958984375, "loss": 1.0776, "margin_dpo/margin_mean": 148.71762084960938, "margin_dpo/margin_std": 181.3911590576172, "step": 275 }, { "KL/chosen_KL_mean": -220.0225372314453, "KL/mean": -294.27294921875, "KL/rejected_KL_mean": -368.52337646484375, "KL/std": 166.26907348632812, "epoch": 0.4052863436123348, "fcm_dpo/beta": 0.002734929323196411, "fcm_dpo/delta": -0.0065729208290576935, "fcm_dpo/margin": 148.5008544921875, "fcm_dpo/q_t": 0.40702730417251587, "grad_norm": 28.4442195892334, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.33677220344543457, "logits/rejected": -0.31244635581970215, "logps/chosen": -276.5999450683594, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -438.8890380859375, "loss": 1.0889, "margin_dpo/margin_mean": 148.5008544921875, "margin_dpo/margin_std": 197.42532348632812, "step": 276 }, { "KL/chosen_KL_mean": -239.46275329589844, "KL/mean": -317.6234130859375, "KL/rejected_KL_mean": -395.7840576171875, "KL/std": 170.51141357421875, "epoch": 0.4067547723935389, "fcm_dpo/beta": 0.0027346829883754253, "fcm_dpo/delta": -0.028729649260640144, "fcm_dpo/margin": 156.3212890625, "fcm_dpo/q_t": 0.40158289670944214, "grad_norm": 25.2097225189209, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.2805439829826355, "logits/rejected": -0.2614123523235321, "logps/chosen": -295.73431396484375, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -488.6653137207031, "loss": 1.0752, "margin_dpo/margin_mean": 156.3212890625, "margin_dpo/margin_std": 201.70455932617188, "step": 277 }, { "KL/chosen_KL_mean": -215.6261749267578, "KL/mean": -304.9232482910156, "KL/rejected_KL_mean": -394.2203369140625, "KL/std": 179.00375366210938, "epoch": 0.40822320117474303, "fcm_dpo/beta": 0.0026860979851335287, "fcm_dpo/delta": -0.08402146399021149, "fcm_dpo/margin": 178.59414672851562, "fcm_dpo/q_t": 0.38982489705085754, "grad_norm": 23.9796199798584, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.31917044520378113, "logits/rejected": -0.32198548316955566, "logps/chosen": -268.568115234375, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -485.4739074707031, "loss": 1.0282, "margin_dpo/margin_mean": 178.59414672851562, "margin_dpo/margin_std": 198.23104858398438, "step": 278 }, { "KL/chosen_KL_mean": -253.12692260742188, "KL/mean": -339.2762145996094, "KL/rejected_KL_mean": -425.425537109375, "KL/std": 185.4033966064453, "epoch": 0.40969162995594716, "fcm_dpo/beta": 0.0026397216133773327, "fcm_dpo/delta": -0.058702681213617325, "fcm_dpo/margin": 172.298583984375, "fcm_dpo/q_t": 0.3971731960773468, "grad_norm": 28.65115737915039, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.2409660816192627, "logits/rejected": -0.23715651035308838, "logps/chosen": -301.76824951171875, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -513.2769775390625, "loss": 1.0662, "margin_dpo/margin_mean": 172.298583984375, "margin_dpo/margin_std": 225.11993408203125, "step": 279 }, { "KL/chosen_KL_mean": -251.72592163085938, "KL/mean": -344.47216796875, "KL/rejected_KL_mean": -437.21832275390625, "KL/std": 179.44480895996094, "epoch": 0.4111600587371512, "fcm_dpo/beta": 0.0026130005717277527, "fcm_dpo/delta": -0.08905528485774994, "fcm_dpo/margin": 185.492431640625, "fcm_dpo/q_t": 0.3867556154727936, "grad_norm": 23.646604537963867, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.29928305745124817, "logits/rejected": -0.29868048429489136, "logps/chosen": -310.5230407714844, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -535.837158203125, "loss": 1.0195, "margin_dpo/margin_mean": 185.492431640625, "margin_dpo/margin_std": 194.14852905273438, "step": 280 }, { "KL/chosen_KL_mean": -230.0161895751953, "KL/mean": -313.6643371582031, "KL/rejected_KL_mean": -397.3125, "KL/std": 175.47067260742188, "epoch": 0.41262848751835535, "fcm_dpo/beta": 0.0025973522569984198, "fcm_dpo/delta": -0.036213867366313934, "fcm_dpo/margin": 167.29629516601562, "fcm_dpo/q_t": 0.3972865343093872, "grad_norm": 23.116756439208984, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.32348203659057617, "logits/rejected": -0.2926068902015686, "logps/chosen": -285.50469970703125, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -478.195068359375, "loss": 1.0576, "margin_dpo/margin_mean": 167.29629516601562, "margin_dpo/margin_std": 190.1474609375, "step": 281 }, { "KL/chosen_KL_mean": -254.25106811523438, "KL/mean": -319.09814453125, "KL/rejected_KL_mean": -383.9451904296875, "KL/std": 183.95884704589844, "epoch": 0.41409691629955947, "fcm_dpo/beta": 0.0026173896621912718, "fcm_dpo/delta": 0.06215311959385872, "fcm_dpo/margin": 129.69415283203125, "fcm_dpo/q_t": 0.42490720748901367, "grad_norm": 23.07098388671875, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.407487154006958, "logits/rejected": -0.3855167031288147, "logps/chosen": -327.32122802734375, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -479.29620361328125, "loss": 1.1456, "margin_dpo/margin_mean": 129.69415283203125, "margin_dpo/margin_std": 215.59033203125, "step": 282 }, { "KL/chosen_KL_mean": -257.63934326171875, "KL/mean": -357.9620361328125, "KL/rejected_KL_mean": -458.2847595214844, "KL/std": 216.70352172851562, "epoch": 0.4155653450807636, "fcm_dpo/beta": 0.002578072715550661, "fcm_dpo/delta": -0.12350119650363922, "fcm_dpo/margin": 200.64541625976562, "fcm_dpo/q_t": 0.3798179626464844, "grad_norm": 27.85664176940918, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.3043696880340576, "logits/rejected": -0.3012719452381134, "logps/chosen": -319.53778076171875, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -555.2713012695312, "loss": 1.0023, "margin_dpo/margin_mean": 200.64540100097656, "margin_dpo/margin_std": 206.13702392578125, "step": 283 }, { "KL/chosen_KL_mean": -241.11981201171875, "KL/mean": -333.07305908203125, "KL/rejected_KL_mean": -425.02630615234375, "KL/std": 198.67276000976562, "epoch": 0.4170337738619677, "fcm_dpo/beta": 0.0025225451681762934, "fcm_dpo/delta": -0.06718793511390686, "fcm_dpo/margin": 183.906494140625, "fcm_dpo/q_t": 0.39294561743736267, "grad_norm": 21.307559967041016, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.38930195569992065, "logits/rejected": -0.37415850162506104, "logps/chosen": -299.55535888671875, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -518.49560546875, "loss": 1.0327, "margin_dpo/margin_mean": 183.906494140625, "margin_dpo/margin_std": 199.44464111328125, "step": 284 }, { "KL/chosen_KL_mean": -272.1553039550781, "KL/mean": -350.767578125, "KL/rejected_KL_mean": -429.3798522949219, "KL/std": 169.2427978515625, "epoch": 0.4185022026431718, "fcm_dpo/beta": 0.0025313901714980602, "fcm_dpo/delta": 0.001558154821395874, "fcm_dpo/margin": 157.2245330810547, "fcm_dpo/q_t": 0.4059276878833771, "grad_norm": 25.553455352783203, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.3947150707244873, "logits/rejected": -0.37686532735824585, "logps/chosen": -338.38751220703125, "logps/ref_chosen": -66.23219299316406, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -528.5067138671875, "loss": 1.0824, "margin_dpo/margin_mean": 157.2245330810547, "margin_dpo/margin_std": 185.91542053222656, "step": 285 }, { "KL/chosen_KL_mean": -268.74615478515625, "KL/mean": -357.44683837890625, "KL/rejected_KL_mean": -446.1475524902344, "KL/std": 201.09698486328125, "epoch": 0.4199706314243759, "fcm_dpo/beta": 0.002504766918718815, "fcm_dpo/delta": -0.04638573154807091, "fcm_dpo/margin": 177.4013671875, "fcm_dpo/q_t": 0.3975168466567993, "grad_norm": 24.010190963745117, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.3724295198917389, "logits/rejected": -0.3507525324821472, "logps/chosen": -341.6971740722656, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -534.7359619140625, "loss": 1.0627, "margin_dpo/margin_mean": 177.40138244628906, "margin_dpo/margin_std": 221.60133361816406, "step": 286 }, { "KL/chosen_KL_mean": -245.1240234375, "KL/mean": -319.36212158203125, "KL/rejected_KL_mean": -393.60015869140625, "KL/std": 172.98507690429688, "epoch": 0.42143906020558003, "fcm_dpo/beta": 0.002508362988010049, "fcm_dpo/delta": 0.028594862669706345, "fcm_dpo/margin": 148.4761199951172, "fcm_dpo/q_t": 0.41409826278686523, "grad_norm": 21.714580535888672, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.36127710342407227, "logits/rejected": -0.33609965443611145, "logps/chosen": -306.6651916503906, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.69607543945312, "logps/rejected": -471.2962341308594, "loss": 1.1047, "margin_dpo/margin_mean": 148.47613525390625, "margin_dpo/margin_std": 196.79652404785156, "step": 287 }, { "KL/chosen_KL_mean": -241.18698120117188, "KL/mean": -333.81597900390625, "KL/rejected_KL_mean": -426.4449462890625, "KL/std": 176.66453552246094, "epoch": 0.42290748898678415, "fcm_dpo/beta": 0.0024691871367394924, "fcm_dpo/delta": -0.06168384104967117, "fcm_dpo/margin": 185.25796508789062, "fcm_dpo/q_t": 0.39265191555023193, "grad_norm": 21.64046287536621, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.3853694200515747, "logits/rejected": -0.36701393127441406, "logps/chosen": -297.84820556640625, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.33570098876953, "logps/rejected": -513.7806396484375, "loss": 1.0344, "margin_dpo/margin_mean": 185.25796508789062, "margin_dpo/margin_std": 186.76431274414062, "step": 288 }, { "KL/chosen_KL_mean": -234.46319580078125, "KL/mean": -339.17218017578125, "KL/rejected_KL_mean": -443.88116455078125, "KL/std": 201.45144653320312, "epoch": 0.4243759177679883, "fcm_dpo/beta": 0.0024433922953903675, "fcm_dpo/delta": -0.11754532158374786, "fcm_dpo/margin": 209.41796875, "fcm_dpo/q_t": 0.38267892599105835, "grad_norm": 24.331398010253906, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.3093263506889343, "logits/rejected": -0.30716824531555176, "logps/chosen": -279.693603515625, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -531.5238037109375, "loss": 1.0067, "margin_dpo/margin_mean": 209.41796875, "margin_dpo/margin_std": 220.97320556640625, "step": 289 }, { "KL/chosen_KL_mean": -259.2691650390625, "KL/mean": -364.51544189453125, "KL/rejected_KL_mean": -469.76171875, "KL/std": 214.3240509033203, "epoch": 0.42584434654919234, "fcm_dpo/beta": 0.002378998324275017, "fcm_dpo/delta": -0.10634815692901611, "fcm_dpo/margin": 210.4925537109375, "fcm_dpo/q_t": 0.3870581388473511, "grad_norm": 21.887035369873047, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.31333810091018677, "logits/rejected": -0.33406785130500793, "logps/chosen": -314.74066162109375, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -586.4702758789062, "loss": 1.0288, "margin_dpo/margin_mean": 210.49253845214844, "margin_dpo/margin_std": 254.7548828125, "step": 290 }, { "KL/chosen_KL_mean": -229.11093139648438, "KL/mean": -324.2933654785156, "KL/rejected_KL_mean": -419.47576904296875, "KL/std": 183.89297485351562, "epoch": 0.42731277533039647, "fcm_dpo/beta": 0.002363018225878477, "fcm_dpo/delta": -0.05237455666065216, "fcm_dpo/margin": 190.36480712890625, "fcm_dpo/q_t": 0.39525771141052246, "grad_norm": 22.43724250793457, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.32546311616897583, "logits/rejected": -0.3105616271495819, "logps/chosen": -292.3713073730469, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -508.7728576660156, "loss": 1.0516, "margin_dpo/margin_mean": 190.36480712890625, "margin_dpo/margin_std": 219.41854858398438, "step": 291 }, { "KL/chosen_KL_mean": -263.8599853515625, "KL/mean": -365.8248291015625, "KL/rejected_KL_mean": -467.7896728515625, "KL/std": 221.3306884765625, "epoch": 0.4287812041116006, "fcm_dpo/beta": 0.0023269178345799446, "fcm_dpo/delta": -0.07821857929229736, "fcm_dpo/margin": 203.92965698242188, "fcm_dpo/q_t": 0.39132484793663025, "grad_norm": 26.2148380279541, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.3232491612434387, "logits/rejected": -0.3077685534954071, "logps/chosen": -317.7785339355469, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -557.7510375976562, "loss": 1.0518, "margin_dpo/margin_mean": 203.92965698242188, "margin_dpo/margin_std": 259.8817443847656, "step": 292 }, { "KL/chosen_KL_mean": -269.84210205078125, "KL/mean": -353.70697021484375, "KL/rejected_KL_mean": -437.5718078613281, "KL/std": 224.77517700195312, "epoch": 0.4302496328928047, "fcm_dpo/beta": 0.002317019272595644, "fcm_dpo/delta": 0.011760619468986988, "fcm_dpo/margin": 167.7296905517578, "fcm_dpo/q_t": 0.41295474767684937, "grad_norm": 38.31715774536133, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.31585967540740967, "logits/rejected": -0.3076015114784241, "logps/chosen": -330.2181396484375, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.85244750976562, "logps/rejected": -515.4242553710938, "loss": 1.1161, "margin_dpo/margin_mean": 167.72970581054688, "margin_dpo/margin_std": 257.7914733886719, "step": 293 }, { "KL/chosen_KL_mean": -257.2511291503906, "KL/mean": -347.72064208984375, "KL/rejected_KL_mean": -438.19012451171875, "KL/std": 204.02166748046875, "epoch": 0.43171806167400884, "fcm_dpo/beta": 0.002303325105458498, "fcm_dpo/delta": -0.01764640584588051, "fcm_dpo/margin": 180.93899536132812, "fcm_dpo/q_t": 0.4063786268234253, "grad_norm": 20.47630500793457, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.3238428235054016, "logits/rejected": -0.33083072304725647, "logps/chosen": -305.3386535644531, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -520.0870971679688, "loss": 1.0891, "margin_dpo/margin_mean": 180.93899536132812, "margin_dpo/margin_std": 251.64306640625, "step": 294 }, { "KL/chosen_KL_mean": -306.6954345703125, "KL/mean": -408.22650146484375, "KL/rejected_KL_mean": -509.75762939453125, "KL/std": 231.20632934570312, "epoch": 0.4331864904552129, "fcm_dpo/beta": 0.002286091446876526, "fcm_dpo/delta": -0.06729740649461746, "fcm_dpo/margin": 203.06219482421875, "fcm_dpo/q_t": 0.3953544497489929, "grad_norm": 25.35572052001953, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.3378216624259949, "logits/rejected": -0.34275323152542114, "logps/chosen": -356.6201171875, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -597.2139282226562, "loss": 1.0634, "margin_dpo/margin_mean": 203.06219482421875, "margin_dpo/margin_std": 268.56146240234375, "step": 295 }, { "KL/chosen_KL_mean": -361.69189453125, "KL/mean": -433.52117919921875, "KL/rejected_KL_mean": -505.3504943847656, "KL/std": 217.83050537109375, "epoch": 0.434654919236417, "fcm_dpo/beta": 0.002259893110021949, "fcm_dpo/delta": -0.03520293906331062, "fcm_dpo/margin": 143.65863037109375, "fcm_dpo/q_t": 0.4249199628829956, "grad_norm": 37.518798828125, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.39878833293914795, "logits/rejected": -0.3886667490005493, "logps/chosen": -427.18310546875, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -598.4395751953125, "loss": 1.177, "margin_dpo/margin_mean": 143.65863037109375, "margin_dpo/margin_std": 266.5452575683594, "step": 296 }, { "KL/chosen_KL_mean": -317.15142822265625, "KL/mean": -396.7143859863281, "KL/rejected_KL_mean": -476.27734375, "KL/std": 205.87338256835938, "epoch": 0.43612334801762115, "fcm_dpo/beta": 0.002268793759867549, "fcm_dpo/delta": 0.04043077677488327, "fcm_dpo/margin": 159.12591552734375, "fcm_dpo/q_t": 0.4162394404411316, "grad_norm": 30.537891387939453, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.38363754749298096, "logits/rejected": -0.38842642307281494, "logps/chosen": -373.6283874511719, "logps/ref_chosen": -56.476951599121094, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -571.4158935546875, "loss": 1.1114, "margin_dpo/margin_mean": 159.12591552734375, "margin_dpo/margin_std": 209.26858520507812, "step": 297 }, { "KL/chosen_KL_mean": -326.7189025878906, "KL/mean": -432.0556640625, "KL/rejected_KL_mean": -537.3925170898438, "KL/std": 260.4841003417969, "epoch": 0.43759177679882527, "fcm_dpo/beta": 0.002258981578052044, "fcm_dpo/delta": -0.07966307550668716, "fcm_dpo/margin": 210.67361450195312, "fcm_dpo/q_t": 0.39616093039512634, "grad_norm": 25.527212142944336, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.3951565623283386, "logits/rejected": -0.4159289598464966, "logps/chosen": -394.0440673828125, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -654.0546875, "loss": 1.0681, "margin_dpo/margin_mean": 210.67361450195312, "margin_dpo/margin_std": 302.4098815917969, "step": 298 }, { "KL/chosen_KL_mean": -267.58660888671875, "KL/mean": -359.8266296386719, "KL/rejected_KL_mean": -452.066650390625, "KL/std": 204.48165893554688, "epoch": 0.4390602055800294, "fcm_dpo/beta": 0.0022493680007755756, "fcm_dpo/delta": -0.016153991222381592, "fcm_dpo/margin": 184.4800567626953, "fcm_dpo/q_t": 0.40638279914855957, "grad_norm": 37.44233322143555, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.37919020652770996, "logits/rejected": -0.38419249653816223, "logps/chosen": -316.5487060546875, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -536.3948974609375, "loss": 1.0928, "margin_dpo/margin_mean": 184.4800567626953, "margin_dpo/margin_std": 256.15374755859375, "step": 299 }, { "KL/chosen_KL_mean": -322.3915710449219, "KL/mean": -433.9006652832031, "KL/rejected_KL_mean": -545.4097900390625, "KL/std": 238.47645568847656, "epoch": 0.44052863436123346, "fcm_dpo/beta": 0.0022087290417402983, "fcm_dpo/delta": -0.0972696915268898, "fcm_dpo/margin": 223.01821899414062, "fcm_dpo/q_t": 0.38775455951690674, "grad_norm": 33.15673828125, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.39440402388572693, "logits/rejected": -0.4011026620864868, "logps/chosen": -381.46527099609375, "logps/ref_chosen": -59.07371139526367, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -641.376220703125, "loss": 1.0439, "margin_dpo/margin_mean": 223.01821899414062, "margin_dpo/margin_std": 281.2703857421875, "step": 300 }, { "KL/chosen_KL_mean": -275.7957763671875, "KL/mean": -380.38037109375, "KL/rejected_KL_mean": -484.9649963378906, "KL/std": 218.0830078125, "epoch": 0.4419970631424376, "fcm_dpo/beta": 0.002167191356420517, "fcm_dpo/delta": -0.05619215965270996, "fcm_dpo/margin": 209.16920471191406, "fcm_dpo/q_t": 0.39748483896255493, "grad_norm": 23.70587921142578, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.40770500898361206, "logits/rejected": -0.40985846519470215, "logps/chosen": -333.045166015625, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -577.3185424804688, "loss": 1.0547, "margin_dpo/margin_mean": 209.16921997070312, "margin_dpo/margin_std": 258.6616516113281, "step": 301 }, { "KL/chosen_KL_mean": -233.06362915039062, "KL/mean": -322.35797119140625, "KL/rejected_KL_mean": -411.6523132324219, "KL/std": 194.712158203125, "epoch": 0.4434654919236417, "fcm_dpo/beta": 0.002163384575396776, "fcm_dpo/delta": 0.013927444815635681, "fcm_dpo/margin": 178.58871459960938, "fcm_dpo/q_t": 0.4111108183860779, "grad_norm": 27.502851486206055, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.3880677819252014, "logits/rejected": -0.3943794369697571, "logps/chosen": -284.2615966796875, "logps/ref_chosen": -51.197994232177734, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -508.878662109375, "loss": 1.0956, "margin_dpo/margin_mean": 178.58871459960938, "margin_dpo/margin_std": 229.92686462402344, "step": 302 }, { "KL/chosen_KL_mean": -241.3304443359375, "KL/mean": -321.529541015625, "KL/rejected_KL_mean": -401.7286376953125, "KL/std": 198.59133911132812, "epoch": 0.44493392070484583, "fcm_dpo/beta": 0.002184551674872637, "fcm_dpo/delta": 0.05139891058206558, "fcm_dpo/margin": 160.39822387695312, "fcm_dpo/q_t": 0.4194805920124054, "grad_norm": 26.236801147460938, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.41135263442993164, "logits/rejected": -0.3971450924873352, "logps/chosen": -308.04437255859375, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -488.674072265625, "loss": 1.1326, "margin_dpo/margin_mean": 160.39822387695312, "margin_dpo/margin_std": 240.5888671875, "step": 303 }, { "KL/chosen_KL_mean": -217.92318725585938, "KL/mean": -308.0065002441406, "KL/rejected_KL_mean": -398.0898132324219, "KL/std": 184.00167846679688, "epoch": 0.44640234948604995, "fcm_dpo/beta": 0.002205474767833948, "fcm_dpo/delta": 0.0024266578257083893, "fcm_dpo/margin": 180.16664123535156, "fcm_dpo/q_t": 0.40509599447250366, "grad_norm": 33.5152473449707, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.42003294825553894, "logits/rejected": -0.4105939567089081, "logps/chosen": -289.8739013671875, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -488.56182861328125, "loss": 1.0614, "margin_dpo/margin_mean": 180.16664123535156, "margin_dpo/margin_std": 164.29513549804688, "step": 304 }, { "KL/chosen_KL_mean": -242.13421630859375, "KL/mean": -321.9437561035156, "KL/rejected_KL_mean": -401.7532653808594, "KL/std": 213.17129516601562, "epoch": 0.447870778267254, "fcm_dpo/beta": 0.0022017783485352993, "fcm_dpo/delta": 0.05001223459839821, "fcm_dpo/margin": 159.6190643310547, "fcm_dpo/q_t": 0.42033082246780396, "grad_norm": 20.60869026184082, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.4197441339492798, "logits/rejected": -0.4035117030143738, "logps/chosen": -308.929443359375, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -494.50787353515625, "loss": 1.1316, "margin_dpo/margin_mean": 159.6190643310547, "margin_dpo/margin_std": 237.54547119140625, "step": 305 }, { "KL/chosen_KL_mean": -238.73196411132812, "KL/mean": -318.37652587890625, "KL/rejected_KL_mean": -398.0210876464844, "KL/std": 185.3416748046875, "epoch": 0.44933920704845814, "fcm_dpo/beta": 0.0022371697705239058, "fcm_dpo/delta": 0.045154958963394165, "fcm_dpo/margin": 159.28915405273438, "fcm_dpo/q_t": 0.41680416464805603, "grad_norm": 25.355138778686523, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.3622322082519531, "logits/rejected": -0.33640217781066895, "logps/chosen": -308.4158630371094, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -483.1802978515625, "loss": 1.1079, "margin_dpo/margin_mean": 159.2891387939453, "margin_dpo/margin_std": 199.42770385742188, "step": 306 }, { "KL/chosen_KL_mean": -218.85501098632812, "KL/mean": -301.641357421875, "KL/rejected_KL_mean": -384.42767333984375, "KL/std": 174.05111694335938, "epoch": 0.45080763582966227, "fcm_dpo/beta": 0.00224516075104475, "fcm_dpo/delta": 0.029343584552407265, "fcm_dpo/margin": 165.57266235351562, "fcm_dpo/q_t": 0.41345101594924927, "grad_norm": 22.831409454345703, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.3699556589126587, "logits/rejected": -0.35030895471572876, "logps/chosen": -289.02044677734375, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -471.39996337890625, "loss": 1.1025, "margin_dpo/margin_mean": 165.57266235351562, "margin_dpo/margin_std": 212.66111755371094, "step": 307 }, { "KL/chosen_KL_mean": -228.22244262695312, "KL/mean": -321.84100341796875, "KL/rejected_KL_mean": -415.4595947265625, "KL/std": 197.64083862304688, "epoch": 0.4522760646108664, "fcm_dpo/beta": 0.002246259246021509, "fcm_dpo/delta": -0.021481268107891083, "fcm_dpo/margin": 187.23712158203125, "fcm_dpo/q_t": 0.40048325061798096, "grad_norm": 28.529888153076172, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.3800508975982666, "logits/rejected": -0.36786890029907227, "logps/chosen": -283.4674377441406, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -494.83184814453125, "loss": 1.0533, "margin_dpo/margin_mean": 187.2371368408203, "margin_dpo/margin_std": 190.32083129882812, "step": 308 }, { "KL/chosen_KL_mean": -236.49908447265625, "KL/mean": -329.51934814453125, "KL/rejected_KL_mean": -422.53961181640625, "KL/std": 211.5294189453125, "epoch": 0.45374449339207046, "fcm_dpo/beta": 0.0022473763674497604, "fcm_dpo/delta": -0.019291866570711136, "fcm_dpo/margin": 186.04052734375, "fcm_dpo/q_t": 0.40186938643455505, "grad_norm": 31.124483108520508, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.33029186725616455, "logits/rejected": -0.32252657413482666, "logps/chosen": -285.45819091796875, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -504.88031005859375, "loss": 1.0741, "margin_dpo/margin_mean": 186.04052734375, "margin_dpo/margin_std": 224.29818725585938, "step": 309 }, { "KL/chosen_KL_mean": -281.655029296875, "KL/mean": -370.4913330078125, "KL/rejected_KL_mean": -459.32763671875, "KL/std": 190.68673706054688, "epoch": 0.4552129221732746, "fcm_dpo/beta": 0.002242402173578739, "fcm_dpo/delta": 0.001399170607328415, "fcm_dpo/margin": 177.672607421875, "fcm_dpo/q_t": 0.40604251623153687, "grad_norm": 29.155431747436523, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.3663063645362854, "logits/rejected": -0.35591787099838257, "logps/chosen": -344.3968200683594, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -539.2579345703125, "loss": 1.0711, "margin_dpo/margin_mean": 177.672607421875, "margin_dpo/margin_std": 188.24441528320312, "step": 310 }, { "KL/chosen_KL_mean": -303.1356201171875, "KL/mean": -403.75494384765625, "KL/rejected_KL_mean": -504.37420654296875, "KL/std": 248.3344268798828, "epoch": 0.4566813509544787, "fcm_dpo/beta": 0.0022182685788720846, "fcm_dpo/delta": -0.04857812821865082, "fcm_dpo/margin": 201.23855590820312, "fcm_dpo/q_t": 0.3989013433456421, "grad_norm": 23.382389068603516, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.3620428442955017, "logits/rejected": -0.3392923176288605, "logps/chosen": -356.16363525390625, "logps/ref_chosen": -53.02798080444336, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -581.8123779296875, "loss": 1.0653, "margin_dpo/margin_mean": 201.23855590820312, "margin_dpo/margin_std": 260.4375915527344, "step": 311 }, { "KL/chosen_KL_mean": -292.04132080078125, "KL/mean": -381.0889892578125, "KL/rejected_KL_mean": -470.13665771484375, "KL/std": 213.00611877441406, "epoch": 0.4581497797356828, "fcm_dpo/beta": 0.002222396433353424, "fcm_dpo/delta": 0.004101406782865524, "fcm_dpo/margin": 178.0953826904297, "fcm_dpo/q_t": 0.41095060110092163, "grad_norm": 34.67451095581055, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.3244793117046356, "logits/rejected": -0.3159826993942261, "logps/chosen": -341.43353271484375, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280853271484, "logps/rejected": -545.929443359375, "loss": 1.102, "margin_dpo/margin_mean": 178.09536743164062, "margin_dpo/margin_std": 254.87359619140625, "step": 312 }, { "KL/chosen_KL_mean": -271.20880126953125, "KL/mean": -367.263671875, "KL/rejected_KL_mean": -463.318603515625, "KL/std": 229.5218048095703, "epoch": 0.45961820851688695, "fcm_dpo/beta": 0.002221351722255349, "fcm_dpo/delta": -0.02877044305205345, "fcm_dpo/margin": 192.10980224609375, "fcm_dpo/q_t": 0.40212100744247437, "grad_norm": 26.70942497253418, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.3600277602672577, "logits/rejected": -0.3582299053668976, "logps/chosen": -321.3615417480469, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -549.7247924804688, "loss": 1.0837, "margin_dpo/margin_mean": 192.10980224609375, "margin_dpo/margin_std": 255.63043212890625, "step": 313 }, { "KL/chosen_KL_mean": -286.284423828125, "KL/mean": -373.939453125, "KL/rejected_KL_mean": -461.594482421875, "KL/std": 214.23907470703125, "epoch": 0.461086637298091, "fcm_dpo/beta": 0.002198255155235529, "fcm_dpo/delta": 0.014935776591300964, "fcm_dpo/margin": 175.31007385253906, "fcm_dpo/q_t": 0.4136376976966858, "grad_norm": 24.576322555541992, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.4146174192428589, "logits/rejected": -0.41618144512176514, "logps/chosen": -343.5220031738281, "logps/ref_chosen": -57.237579345703125, "logps/ref_rejected": -97.5965347290039, "logps/rejected": -559.1910400390625, "loss": 1.1215, "margin_dpo/margin_mean": 175.31007385253906, "margin_dpo/margin_std": 272.0423889160156, "step": 314 }, { "KL/chosen_KL_mean": -258.12774658203125, "KL/mean": -340.7080078125, "KL/rejected_KL_mean": -423.2882385253906, "KL/std": 191.50604248046875, "epoch": 0.46255506607929514, "fcm_dpo/beta": 0.0022171130403876305, "fcm_dpo/delta": 0.0351128987967968, "fcm_dpo/margin": 165.16049194335938, "fcm_dpo/q_t": 0.415382444858551, "grad_norm": 20.407081604003906, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.31862980127334595, "logits/rejected": -0.32194432616233826, "logps/chosen": -307.19732666015625, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -508.9691162109375, "loss": 1.1062, "margin_dpo/margin_mean": 165.16049194335938, "margin_dpo/margin_std": 214.27072143554688, "step": 315 }, { "KL/chosen_KL_mean": -253.05474853515625, "KL/mean": -367.26123046875, "KL/rejected_KL_mean": -481.4677429199219, "KL/std": 226.0899658203125, "epoch": 0.46402349486049926, "fcm_dpo/beta": 0.0021798848174512386, "fcm_dpo/delta": -0.10400072485208511, "fcm_dpo/margin": 228.41297912597656, "fcm_dpo/q_t": 0.3863717317581177, "grad_norm": 26.12361717224121, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.3711768090724945, "logits/rejected": -0.37630313634872437, "logps/chosen": -307.31549072265625, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -582.7492065429688, "loss": 1.0163, "margin_dpo/margin_mean": 228.41297912597656, "margin_dpo/margin_std": 247.7483367919922, "step": 316 }, { "KL/chosen_KL_mean": -257.2327880859375, "KL/mean": -380.733642578125, "KL/rejected_KL_mean": -504.2344970703125, "KL/std": 204.96792602539062, "epoch": 0.4654919236417034, "fcm_dpo/beta": 0.0021434309892356396, "fcm_dpo/delta": -0.13644810020923615, "fcm_dpo/margin": 247.001708984375, "fcm_dpo/q_t": 0.3767782747745514, "grad_norm": 26.812149047851562, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.40798693895339966, "logits/rejected": -0.4007112681865692, "logps/chosen": -313.3270263671875, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -604.93359375, "loss": 0.9842, "margin_dpo/margin_mean": 247.001708984375, "margin_dpo/margin_std": 228.92449951171875, "step": 317 }, { "KL/chosen_KL_mean": -277.570068359375, "KL/mean": -369.9479675292969, "KL/rejected_KL_mean": -462.3258361816406, "KL/std": 211.07015991210938, "epoch": 0.4669603524229075, "fcm_dpo/beta": 0.0021276050247251987, "fcm_dpo/delta": 0.0070451050996780396, "fcm_dpo/margin": 184.7557373046875, "fcm_dpo/q_t": 0.40913063287734985, "grad_norm": 27.008316040039062, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.4423850178718567, "logits/rejected": -0.414249062538147, "logps/chosen": -342.21575927734375, "logps/ref_chosen": -64.64569854736328, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -545.090087890625, "loss": 1.0887, "margin_dpo/margin_mean": 184.75575256347656, "margin_dpo/margin_std": 232.82424926757812, "step": 318 }, { "KL/chosen_KL_mean": -247.65008544921875, "KL/mean": -358.6512451171875, "KL/rejected_KL_mean": -469.65240478515625, "KL/std": 232.62017822265625, "epoch": 0.4684287812041116, "fcm_dpo/beta": 0.0021020234562456608, "fcm_dpo/delta": -0.06988762319087982, "fcm_dpo/margin": 222.0023193359375, "fcm_dpo/q_t": 0.39178723096847534, "grad_norm": 22.64339828491211, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.41219907999038696, "logits/rejected": -0.4256317913532257, "logps/chosen": -297.0338134765625, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -583.5589599609375, "loss": 1.0377, "margin_dpo/margin_mean": 222.00233459472656, "margin_dpo/margin_std": 251.9779052734375, "step": 319 }, { "KL/chosen_KL_mean": -261.9112548828125, "KL/mean": -383.52362060546875, "KL/rejected_KL_mean": -505.13592529296875, "KL/std": 241.70034790039062, "epoch": 0.4698972099853157, "fcm_dpo/beta": 0.0020547928288578987, "fcm_dpo/delta": -0.10551808774471283, "fcm_dpo/margin": 243.2246856689453, "fcm_dpo/q_t": 0.3849526047706604, "grad_norm": 28.402790069580078, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.398881196975708, "logits/rejected": -0.4026961922645569, "logps/chosen": -321.4161376953125, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66717529296875, "logps/rejected": -602.8031005859375, "loss": 1.0129, "margin_dpo/margin_mean": 243.22470092773438, "margin_dpo/margin_std": 259.66607666015625, "step": 320 }, { "KL/chosen_KL_mean": -320.12939453125, "KL/mean": -424.35235595703125, "KL/rejected_KL_mean": -528.5753173828125, "KL/std": 247.34034729003906, "epoch": 0.4713656387665198, "fcm_dpo/beta": 0.0020298874005675316, "fcm_dpo/delta": -0.025036636739969254, "fcm_dpo/margin": 208.4459228515625, "fcm_dpo/q_t": 0.4030313193798065, "grad_norm": 25.453643798828125, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.36571186780929565, "logits/rejected": -0.3529800772666931, "logps/chosen": -381.6781005859375, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -620.2163696289062, "loss": 1.0767, "margin_dpo/margin_mean": 208.4459228515625, "margin_dpo/margin_std": 262.54095458984375, "step": 321 }, { "KL/chosen_KL_mean": -278.1830139160156, "KL/mean": -390.0034484863281, "KL/rejected_KL_mean": -501.8238525390625, "KL/std": 217.16629028320312, "epoch": 0.47283406754772395, "fcm_dpo/beta": 0.0020174758974462748, "fcm_dpo/delta": -0.05392756685614586, "fcm_dpo/margin": 223.640869140625, "fcm_dpo/q_t": 0.39428332448005676, "grad_norm": 33.35853958129883, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.3929744362831116, "logits/rejected": -0.39414313435554504, "logps/chosen": -335.4730529785156, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -597.5737915039062, "loss": 1.0374, "margin_dpo/margin_mean": 223.640869140625, "margin_dpo/margin_std": 232.15565490722656, "step": 322 }, { "KL/chosen_KL_mean": -303.02447509765625, "KL/mean": -401.8762512207031, "KL/rejected_KL_mean": -500.72802734375, "KL/std": 222.78414916992188, "epoch": 0.47430249632892807, "fcm_dpo/beta": 0.0020154546946287155, "fcm_dpo/delta": 0.0015676822513341904, "fcm_dpo/margin": 197.70355224609375, "fcm_dpo/q_t": 0.4071164131164551, "grad_norm": 40.872833251953125, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.43304672837257385, "logits/rejected": -0.4297791123390198, "logps/chosen": -354.2584228515625, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -575.7899169921875, "loss": 1.086, "margin_dpo/margin_mean": 197.70355224609375, "margin_dpo/margin_std": 243.023681640625, "step": 323 }, { "KL/chosen_KL_mean": -347.34503173828125, "KL/mean": -441.3756103515625, "KL/rejected_KL_mean": -535.4061889648438, "KL/std": 241.51780700683594, "epoch": 0.47577092511013214, "fcm_dpo/beta": 0.0020376183092594147, "fcm_dpo/delta": 0.016333594918251038, "fcm_dpo/margin": 188.06114196777344, "fcm_dpo/q_t": 0.4135010838508606, "grad_norm": 41.4670295715332, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.42671218514442444, "logits/rejected": -0.41737309098243713, "logps/chosen": -412.4801940917969, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750854492188, "logps/rejected": -621.8836669921875, "loss": 1.1178, "margin_dpo/margin_mean": 188.0611572265625, "margin_dpo/margin_std": 271.6750183105469, "step": 324 }, { "KL/chosen_KL_mean": -271.2542724609375, "KL/mean": -350.12774658203125, "KL/rejected_KL_mean": -429.001220703125, "KL/std": 204.61688232421875, "epoch": 0.47723935389133626, "fcm_dpo/beta": 0.002051199320703745, "fcm_dpo/delta": 0.07886850088834763, "fcm_dpo/margin": 157.7469482421875, "fcm_dpo/q_t": 0.4243575930595398, "grad_norm": 27.271472930908203, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.4851588308811188, "logits/rejected": -0.4723260998725891, "logps/chosen": -327.4698486328125, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.08592987060547, "logps/rejected": -499.0871276855469, "loss": 1.137, "margin_dpo/margin_mean": 157.7469482421875, "margin_dpo/margin_std": 213.0748291015625, "step": 325 }, { "KL/chosen_KL_mean": -291.67022705078125, "KL/mean": -364.40618896484375, "KL/rejected_KL_mean": -437.1421813964844, "KL/std": 193.47927856445312, "epoch": 0.4787077826725404, "fcm_dpo/beta": 0.0020837995689362288, "fcm_dpo/delta": 0.10001323372125626, "fcm_dpo/margin": 145.47195434570312, "fcm_dpo/q_t": 0.4282350540161133, "grad_norm": 77.27185821533203, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.4964238405227661, "logits/rejected": -0.4864235520362854, "logps/chosen": -364.39520263671875, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.8467788696289, "logps/rejected": -516.9889526367188, "loss": 1.1457, "margin_dpo/margin_mean": 145.47195434570312, "margin_dpo/margin_std": 189.67251586914062, "step": 326 }, { "KL/chosen_KL_mean": -262.9017333984375, "KL/mean": -374.2644348144531, "KL/rejected_KL_mean": -485.62713623046875, "KL/std": 206.111572265625, "epoch": 0.4801762114537445, "fcm_dpo/beta": 0.0020731650292873383, "fcm_dpo/delta": -0.06503181904554367, "fcm_dpo/margin": 222.725341796875, "fcm_dpo/q_t": 0.39099863171577454, "grad_norm": 71.85537719726562, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.43772637844085693, "logits/rejected": -0.4322543740272522, "logps/chosen": -332.0361633300781, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -597.5609130859375, "loss": 1.0265, "margin_dpo/margin_mean": 222.725341796875, "margin_dpo/margin_std": 220.58840942382812, "step": 327 }, { "KL/chosen_KL_mean": -290.6591796875, "KL/mean": -395.22021484375, "KL/rejected_KL_mean": -499.78118896484375, "KL/std": 233.2576904296875, "epoch": 0.48164464023494863, "fcm_dpo/beta": 0.0020635989494621754, "fcm_dpo/delta": -0.03295481950044632, "fcm_dpo/margin": 209.12203979492188, "fcm_dpo/q_t": 0.40090832114219666, "grad_norm": 34.49656295776367, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.4391549825668335, "logits/rejected": -0.44703197479248047, "logps/chosen": -350.34637451171875, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -590.6361694335938, "loss": 1.0657, "margin_dpo/margin_mean": 209.12203979492188, "margin_dpo/margin_std": 258.40020751953125, "step": 328 }, { "KL/chosen_KL_mean": -349.1519470214844, "KL/mean": -458.60186767578125, "KL/rejected_KL_mean": -568.0517578125, "KL/std": 267.2418212890625, "epoch": 0.4831130690161527, "fcm_dpo/beta": 0.0020334022119641304, "fcm_dpo/delta": -0.047926321625709534, "fcm_dpo/margin": 218.89981079101562, "fcm_dpo/q_t": 0.39808762073516846, "grad_norm": 32.74344253540039, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.4355248808860779, "logits/rejected": -0.4343896210193634, "logps/chosen": -414.39813232421875, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -668.74951171875, "loss": 1.0751, "margin_dpo/margin_mean": 218.89981079101562, "margin_dpo/margin_std": 289.7802734375, "step": 329 }, { "KL/chosen_KL_mean": -293.26153564453125, "KL/mean": -410.9012756347656, "KL/rejected_KL_mean": -528.5409545898438, "KL/std": 247.63658142089844, "epoch": 0.4845814977973568, "fcm_dpo/beta": 0.002014409750699997, "fcm_dpo/delta": -0.0776476040482521, "fcm_dpo/margin": 235.2794189453125, "fcm_dpo/q_t": 0.39114266633987427, "grad_norm": 23.221515655517578, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.36399000883102417, "logits/rejected": -0.3640822768211365, "logps/chosen": -340.2598876953125, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -615.4178466796875, "loss": 1.035, "margin_dpo/margin_mean": 235.27944946289062, "margin_dpo/margin_std": 269.7514343261719, "step": 330 }, { "KL/chosen_KL_mean": -312.65850830078125, "KL/mean": -426.49957275390625, "KL/rejected_KL_mean": -540.340576171875, "KL/std": 204.22564697265625, "epoch": 0.48604992657856094, "fcm_dpo/beta": 0.0019897697493433952, "fcm_dpo/delta": -0.05556933581829071, "fcm_dpo/margin": 227.68206787109375, "fcm_dpo/q_t": 0.39270973205566406, "grad_norm": 25.662797927856445, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.4090471863746643, "logits/rejected": -0.40625983476638794, "logps/chosen": -363.1827392578125, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -629.3560180664062, "loss": 1.0209, "margin_dpo/margin_mean": 227.68206787109375, "margin_dpo/margin_std": 201.46847534179688, "step": 331 }, { "KL/chosen_KL_mean": -308.2393798828125, "KL/mean": -389.4369812011719, "KL/rejected_KL_mean": -470.63458251953125, "KL/std": 208.03164672851562, "epoch": 0.48751835535976507, "fcm_dpo/beta": 0.002010452328249812, "fcm_dpo/delta": 0.07582204043865204, "fcm_dpo/margin": 162.3951873779297, "fcm_dpo/q_t": 0.42497020959854126, "grad_norm": 21.15984535217285, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.3901352882385254, "logits/rejected": -0.3734605014324188, "logps/chosen": -357.419677734375, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -547.1197509765625, "loss": 1.137, "margin_dpo/margin_mean": 162.3951873779297, "margin_dpo/margin_std": 232.2877655029297, "step": 332 }, { "KL/chosen_KL_mean": -318.0777893066406, "KL/mean": -425.67242431640625, "KL/rejected_KL_mean": -533.26708984375, "KL/std": 241.38174438476562, "epoch": 0.4889867841409692, "fcm_dpo/beta": 0.0019922310020774603, "fcm_dpo/delta": -0.031133878976106644, "fcm_dpo/margin": 215.18930053710938, "fcm_dpo/q_t": 0.4023503363132477, "grad_norm": 23.545013427734375, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.4057619571685791, "logits/rejected": -0.3980754017829895, "logps/chosen": -381.8335266113281, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -628.3111572265625, "loss": 1.0854, "margin_dpo/margin_mean": 215.18930053710938, "margin_dpo/margin_std": 291.11785888671875, "step": 333 }, { "KL/chosen_KL_mean": -299.594970703125, "KL/mean": -393.54693603515625, "KL/rejected_KL_mean": -487.4989318847656, "KL/std": 260.1100769042969, "epoch": 0.49045521292217326, "fcm_dpo/beta": 0.0020080246031284332, "fcm_dpo/delta": 0.02357984334230423, "fcm_dpo/margin": 187.9039306640625, "fcm_dpo/q_t": 0.414248526096344, "grad_norm": 22.068880081176758, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.41038450598716736, "logits/rejected": -0.40884631872177124, "logps/chosen": -366.5747375488281, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -582.8158569335938, "loss": 1.1256, "margin_dpo/margin_mean": 187.9039306640625, "margin_dpo/margin_std": 291.5281982421875, "step": 334 }, { "KL/chosen_KL_mean": -315.5887756347656, "KL/mean": -390.5389404296875, "KL/rejected_KL_mean": -465.4891052246094, "KL/std": 233.7744140625, "epoch": 0.4919236417033774, "fcm_dpo/beta": 0.0020120900589972734, "fcm_dpo/delta": 0.0005856315256096423, "fcm_dpo/margin": 149.90032958984375, "fcm_dpo/q_t": 0.4307052493095398, "grad_norm": 24.0537052154541, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.41110914945602417, "logits/rejected": -0.38828492164611816, "logps/chosen": -378.1312561035156, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.61770629882812, "logps/rejected": -553.1068115234375, "loss": 1.1711, "margin_dpo/margin_mean": 149.90032958984375, "margin_dpo/margin_std": 253.32125854492188, "step": 335 }, { "KL/chosen_KL_mean": -303.5247802734375, "KL/mean": -422.5467529296875, "KL/rejected_KL_mean": -541.5687255859375, "KL/std": 268.7447509765625, "epoch": 0.4933920704845815, "fcm_dpo/beta": 0.0019951139111071825, "fcm_dpo/delta": -0.07861563563346863, "fcm_dpo/margin": 238.04388427734375, "fcm_dpo/q_t": 0.39314505457878113, "grad_norm": 26.437314987182617, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.384580135345459, "logits/rejected": -0.38513875007629395, "logps/chosen": -358.05596923828125, "logps/ref_chosen": -54.53115463256836, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -645.972900390625, "loss": 1.0597, "margin_dpo/margin_mean": 238.04388427734375, "margin_dpo/margin_std": 317.6496276855469, "step": 336 }, { "KL/chosen_KL_mean": -271.9599609375, "KL/mean": -389.19110107421875, "KL/rejected_KL_mean": -506.4222106933594, "KL/std": 216.30596923828125, "epoch": 0.4948604992657856, "fcm_dpo/beta": 0.001958012580871582, "fcm_dpo/delta": -0.06217820942401886, "fcm_dpo/margin": 234.46224975585938, "fcm_dpo/q_t": 0.39352160692214966, "grad_norm": 28.5858154296875, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.43051382899284363, "logits/rejected": -0.42107608914375305, "logps/chosen": -337.088623046875, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -608.1492309570312, "loss": 1.0362, "margin_dpo/margin_mean": 234.46224975585938, "margin_dpo/margin_std": 254.32135009765625, "step": 337 }, { "KL/chosen_KL_mean": -241.61541748046875, "KL/mean": -351.14056396484375, "KL/rejected_KL_mean": -460.665771484375, "KL/std": 205.6318359375, "epoch": 0.49632892804698975, "fcm_dpo/beta": 0.0019469063263386488, "fcm_dpo/delta": -0.02767963334918022, "fcm_dpo/margin": 219.05032348632812, "fcm_dpo/q_t": 0.3992460072040558, "grad_norm": 28.797653198242188, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.4226057231426239, "logits/rejected": -0.413729190826416, "logps/chosen": -300.0381164550781, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -549.7342529296875, "loss": 1.0451, "margin_dpo/margin_mean": 219.0503387451172, "margin_dpo/margin_std": 213.1126251220703, "step": 338 }, { "KL/chosen_KL_mean": -263.037353515625, "KL/mean": -371.4858703613281, "KL/rejected_KL_mean": -479.93438720703125, "KL/std": 212.88307189941406, "epoch": 0.4977973568281938, "fcm_dpo/beta": 0.0019529033452272415, "fcm_dpo/delta": -0.025594212114810944, "fcm_dpo/margin": 216.89700317382812, "fcm_dpo/q_t": 0.40049952268600464, "grad_norm": 28.772863388061523, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.4214293956756592, "logits/rejected": -0.4066346287727356, "logps/chosen": -323.0326843261719, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -583.8452758789062, "loss": 1.0555, "margin_dpo/margin_mean": 216.89700317382812, "margin_dpo/margin_std": 217.88153076171875, "step": 339 }, { "KL/chosen_KL_mean": -285.5733337402344, "KL/mean": -379.6592712402344, "KL/rejected_KL_mean": -473.7452087402344, "KL/std": 208.79730224609375, "epoch": 0.49926578560939794, "fcm_dpo/beta": 0.0019388075452297926, "fcm_dpo/delta": 0.0363488644361496, "fcm_dpo/margin": 188.17184448242188, "fcm_dpo/q_t": 0.4153197407722473, "grad_norm": 23.684680938720703, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.3585469722747803, "logits/rejected": -0.3364448547363281, "logps/chosen": -338.403564453125, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723114013672, "logps/rejected": -546.8524169921875, "loss": 1.1127, "margin_dpo/margin_mean": 188.17184448242188, "margin_dpo/margin_std": 253.51083374023438, "step": 340 }, { "KL/chosen_KL_mean": -286.9049072265625, "KL/mean": -402.452392578125, "KL/rejected_KL_mean": -517.9999389648438, "KL/std": 244.80047607421875, "epoch": 0.5007342143906021, "fcm_dpo/beta": 0.0019295980455353856, "fcm_dpo/delta": -0.04842275381088257, "fcm_dpo/margin": 231.09503173828125, "fcm_dpo/q_t": 0.39680999517440796, "grad_norm": 23.347299575805664, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.38709864020347595, "logits/rejected": -0.38579249382019043, "logps/chosen": -334.8046875, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -619.809814453125, "loss": 1.0474, "margin_dpo/margin_mean": 231.0950164794922, "margin_dpo/margin_std": 254.49545288085938, "step": 341 }, { "KL/chosen_KL_mean": -302.0462341308594, "KL/mean": -403.65533447265625, "KL/rejected_KL_mean": -505.2643737792969, "KL/std": 232.10491943359375, "epoch": 0.5022026431718062, "fcm_dpo/beta": 0.0019408478401601315, "fcm_dpo/delta": 0.005411949008703232, "fcm_dpo/margin": 203.21817016601562, "fcm_dpo/q_t": 0.4091772437095642, "grad_norm": 22.63515853881836, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.41977885365486145, "logits/rejected": -0.39589670300483704, "logps/chosen": -374.0428771972656, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -597.85400390625, "loss": 1.0955, "margin_dpo/margin_mean": 203.21817016601562, "margin_dpo/margin_std": 270.35748291015625, "step": 342 }, { "KL/chosen_KL_mean": -301.8097839355469, "KL/mean": -423.54559326171875, "KL/rejected_KL_mean": -545.2814331054688, "KL/std": 240.10980224609375, "epoch": 0.5036710719530103, "fcm_dpo/beta": 0.001911632250994444, "fcm_dpo/delta": -0.06870149821043015, "fcm_dpo/margin": 243.47164916992188, "fcm_dpo/q_t": 0.389914333820343, "grad_norm": 26.55186653137207, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.40133050084114075, "logits/rejected": -0.4047289490699768, "logps/chosen": -356.21539306640625, "logps/ref_chosen": -54.405616760253906, "logps/ref_rejected": -111.04142761230469, "logps/rejected": -656.3228759765625, "loss": 1.0164, "margin_dpo/margin_mean": 243.47164916992188, "margin_dpo/margin_std": 218.26107788085938, "step": 343 }, { "KL/chosen_KL_mean": -306.146728515625, "KL/mean": -411.81256103515625, "KL/rejected_KL_mean": -517.4783935546875, "KL/std": 263.5313720703125, "epoch": 0.5051395007342144, "fcm_dpo/beta": 0.0019231976475566626, "fcm_dpo/delta": -0.008610613644123077, "fcm_dpo/margin": 211.33169555664062, "fcm_dpo/q_t": 0.40852880477905273, "grad_norm": 30.049617767333984, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.436582088470459, "logits/rejected": -0.4415278732776642, "logps/chosen": -360.11138916015625, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -608.101806640625, "loss": 1.09, "margin_dpo/margin_mean": 211.33169555664062, "margin_dpo/margin_std": 268.55743408203125, "step": 344 }, { "KL/chosen_KL_mean": -365.295166015625, "KL/mean": -484.23297119140625, "KL/rejected_KL_mean": -603.1707153320312, "KL/std": 253.80105590820312, "epoch": 0.5066079295154186, "fcm_dpo/beta": 0.001892891013994813, "fcm_dpo/delta": -0.052630215883255005, "fcm_dpo/margin": 237.8755645751953, "fcm_dpo/q_t": 0.39596062898635864, "grad_norm": 21.18876075744629, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.46160662174224854, "logits/rejected": -0.46070683002471924, "logps/chosen": -426.9808654785156, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49041748046875, "logps/rejected": -702.6611328125, "loss": 1.0516, "margin_dpo/margin_mean": 237.8755645751953, "margin_dpo/margin_std": 280.4576721191406, "step": 345 }, { "KL/chosen_KL_mean": -366.03582763671875, "KL/mean": -478.0716552734375, "KL/rejected_KL_mean": -590.107421875, "KL/std": 247.56460571289062, "epoch": 0.5080763582966226, "fcm_dpo/beta": 0.0018771484028548002, "fcm_dpo/delta": -0.02152422070503235, "fcm_dpo/margin": 224.07159423828125, "fcm_dpo/q_t": 0.40137046575546265, "grad_norm": 23.943134307861328, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.38838616013526917, "logits/rejected": -0.3767489790916443, "logps/chosen": -421.2921142578125, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -667.5227661132812, "loss": 1.0685, "margin_dpo/margin_mean": 224.07159423828125, "margin_dpo/margin_std": 267.4580993652344, "step": 346 }, { "KL/chosen_KL_mean": -364.4321594238281, "KL/mean": -474.5341796875, "KL/rejected_KL_mean": -584.63623046875, "KL/std": 261.7486877441406, "epoch": 0.5095447870778267, "fcm_dpo/beta": 0.001872351742349565, "fcm_dpo/delta": -0.012826315127313137, "fcm_dpo/margin": 220.20404052734375, "fcm_dpo/q_t": 0.40555307269096375, "grad_norm": 25.194826126098633, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.4138352870941162, "logits/rejected": -0.40841221809387207, "logps/chosen": -421.9984130859375, "logps/ref_chosen": -57.56623840332031, "logps/ref_rejected": -92.35509490966797, "logps/rejected": -676.9913330078125, "loss": 1.0869, "margin_dpo/margin_mean": 220.20404052734375, "margin_dpo/margin_std": 294.784423828125, "step": 347 }, { "KL/chosen_KL_mean": -316.824951171875, "KL/mean": -411.66033935546875, "KL/rejected_KL_mean": -506.4957580566406, "KL/std": 220.3616180419922, "epoch": 0.5110132158590308, "fcm_dpo/beta": 0.0018715888727456331, "fcm_dpo/delta": 0.04614096134901047, "fcm_dpo/margin": 189.6708221435547, "fcm_dpo/q_t": 0.41821640729904175, "grad_norm": 21.97551727294922, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.44485563039779663, "logits/rejected": -0.446816623210907, "logps/chosen": -373.14263916015625, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13836669921875, "logps/rejected": -595.6341552734375, "loss": 1.1197, "margin_dpo/margin_mean": 189.6708221435547, "margin_dpo/margin_std": 254.99215698242188, "step": 348 }, { "KL/chosen_KL_mean": -301.78118896484375, "KL/mean": -409.97479248046875, "KL/rejected_KL_mean": -518.1683959960938, "KL/std": 247.78256225585938, "epoch": 0.5124816446402349, "fcm_dpo/beta": 0.0018918917048722506, "fcm_dpo/delta": -0.010010870173573494, "fcm_dpo/margin": 216.3872528076172, "fcm_dpo/q_t": 0.40753403306007385, "grad_norm": 20.3768367767334, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.4286951422691345, "logits/rejected": -0.42969733476638794, "logps/chosen": -359.80670166015625, "logps/ref_chosen": -58.025516510009766, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -615.673583984375, "loss": 1.0971, "margin_dpo/margin_mean": 216.3872528076172, "margin_dpo/margin_std": 307.3798828125, "step": 349 }, { "KL/chosen_KL_mean": -318.9201354980469, "KL/mean": -423.44635009765625, "KL/rejected_KL_mean": -527.9725341796875, "KL/std": 231.78794860839844, "epoch": 0.5139500734214391, "fcm_dpo/beta": 0.0018784052226692438, "fcm_dpo/delta": 0.007296178489923477, "fcm_dpo/margin": 209.05239868164062, "fcm_dpo/q_t": 0.40847963094711304, "grad_norm": 30.007631301879883, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.4662485122680664, "logits/rejected": -0.46992364525794983, "logps/chosen": -383.2506103515625, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -617.84423828125, "loss": 1.1014, "margin_dpo/margin_mean": 209.0524139404297, "margin_dpo/margin_std": 286.0767822265625, "step": 350 }, { "KL/chosen_KL_mean": -291.11431884765625, "KL/mean": -417.4867858886719, "KL/rejected_KL_mean": -543.8592529296875, "KL/std": 271.51190185546875, "epoch": 0.5154185022026432, "fcm_dpo/beta": 0.0018615357112139463, "fcm_dpo/delta": -0.07428093254566193, "fcm_dpo/margin": 252.7449951171875, "fcm_dpo/q_t": 0.39380645751953125, "grad_norm": 30.30561637878418, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.45949965715408325, "logits/rejected": -0.4594254493713379, "logps/chosen": -351.78643798828125, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -645.4246826171875, "loss": 1.049, "margin_dpo/margin_mean": 252.7449951171875, "margin_dpo/margin_std": 314.3124084472656, "step": 351 }, { "KL/chosen_KL_mean": -349.67596435546875, "KL/mean": -426.81182861328125, "KL/rejected_KL_mean": -503.9476623535156, "KL/std": 249.33474731445312, "epoch": 0.5168869309838473, "fcm_dpo/beta": 0.0018662881338968873, "fcm_dpo/delta": 0.01947195641696453, "fcm_dpo/margin": 154.27169799804688, "fcm_dpo/q_t": 0.4333202540874481, "grad_norm": 31.560047149658203, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.427381694316864, "logits/rejected": -0.39572709798812866, "logps/chosen": -420.6194152832031, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -580.589599609375, "loss": 1.1823, "margin_dpo/margin_mean": 154.27169799804688, "margin_dpo/margin_std": 272.5928039550781, "step": 352 }, { "KL/chosen_KL_mean": -324.58709716796875, "KL/mean": -425.61248779296875, "KL/rejected_KL_mean": -526.6378173828125, "KL/std": 249.03515625, "epoch": 0.5183553597650514, "fcm_dpo/beta": 0.001863989164121449, "fcm_dpo/delta": 0.023936476558446884, "fcm_dpo/margin": 202.05075073242188, "fcm_dpo/q_t": 0.4135817885398865, "grad_norm": 25.82649040222168, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.47480159997940063, "logits/rejected": -0.46382421255111694, "logps/chosen": -382.982421875, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33553314208984, "logps/rejected": -606.973388671875, "loss": 1.1089, "margin_dpo/margin_mean": 202.0507354736328, "margin_dpo/margin_std": 275.25592041015625, "step": 353 }, { "KL/chosen_KL_mean": -270.94378662109375, "KL/mean": -384.894287109375, "KL/rejected_KL_mean": -498.8448181152344, "KL/std": 226.17709350585938, "epoch": 0.5198237885462555, "fcm_dpo/beta": 0.0018658683402463794, "fcm_dpo/delta": -0.026406319811940193, "fcm_dpo/margin": 227.90101623535156, "fcm_dpo/q_t": 0.399784117937088, "grad_norm": 27.079309463500977, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.43044692277908325, "logits/rejected": -0.42145881056785583, "logps/chosen": -330.74676513671875, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -587.602294921875, "loss": 1.0478, "margin_dpo/margin_mean": 227.90103149414062, "margin_dpo/margin_std": 226.9383544921875, "step": 354 }, { "KL/chosen_KL_mean": -270.277099609375, "KL/mean": -383.14495849609375, "KL/rejected_KL_mean": -496.0128479003906, "KL/std": 231.57516479492188, "epoch": 0.5212922173274597, "fcm_dpo/beta": 0.0018558851443231106, "fcm_dpo/delta": -0.019903086125850677, "fcm_dpo/margin": 225.7357177734375, "fcm_dpo/q_t": 0.40093863010406494, "grad_norm": 35.046512603759766, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.538011908531189, "logits/rejected": -0.5325556993484497, "logps/chosen": -324.4056091308594, "logps/ref_chosen": -54.12849807739258, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -578.4189453125, "loss": 1.0533, "margin_dpo/margin_mean": 225.7357177734375, "margin_dpo/margin_std": 227.40325927734375, "step": 355 }, { "KL/chosen_KL_mean": -336.8787841796875, "KL/mean": -398.74810791015625, "KL/rejected_KL_mean": -460.6174621582031, "KL/std": 237.61134338378906, "epoch": 0.5227606461086637, "fcm_dpo/beta": 0.0018649199046194553, "fcm_dpo/delta": 0.037342458963394165, "fcm_dpo/margin": 123.73867797851562, "fcm_dpo/q_t": 0.44756919145584106, "grad_norm": 32.40851974487305, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.500129222869873, "logits/rejected": -0.48293763399124146, "logps/chosen": -401.5526123046875, "logps/ref_chosen": -64.6738052368164, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -536.5167236328125, "loss": 1.2411, "margin_dpo/margin_mean": 123.73868560791016, "margin_dpo/margin_std": 291.8314208984375, "step": 356 }, { "KL/chosen_KL_mean": -302.33843994140625, "KL/mean": -403.6683349609375, "KL/rejected_KL_mean": -504.9982604980469, "KL/std": 235.599365234375, "epoch": 0.5242290748898678, "fcm_dpo/beta": 0.001869656378403306, "fcm_dpo/delta": 0.021622397005558014, "fcm_dpo/margin": 202.65980529785156, "fcm_dpo/q_t": 0.4108501672744751, "grad_norm": 24.0694522857666, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.47579270601272583, "logits/rejected": -0.4654581546783447, "logps/chosen": -355.0642395019531, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -591.8394165039062, "loss": 1.0927, "margin_dpo/margin_mean": 202.65982055664062, "margin_dpo/margin_std": 240.59239196777344, "step": 357 }, { "KL/chosen_KL_mean": -271.8714599609375, "KL/mean": -368.218994140625, "KL/rejected_KL_mean": -464.56658935546875, "KL/std": 229.80101013183594, "epoch": 0.5256975036710719, "fcm_dpo/beta": 0.0018902610754594207, "fcm_dpo/delta": 0.03711070120334625, "fcm_dpo/margin": 192.69509887695312, "fcm_dpo/q_t": 0.4157891273498535, "grad_norm": 23.27127456665039, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.5084176063537598, "logits/rejected": -0.4954051971435547, "logps/chosen": -335.076904296875, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -552.9398803710938, "loss": 1.1069, "margin_dpo/margin_mean": 192.69509887695312, "margin_dpo/margin_std": 249.3919219970703, "step": 358 }, { "KL/chosen_KL_mean": -314.4269104003906, "KL/mean": -426.55120849609375, "KL/rejected_KL_mean": -538.675537109375, "KL/std": 234.15951538085938, "epoch": 0.527165932452276, "fcm_dpo/beta": 0.0018845023587346077, "fcm_dpo/delta": -0.023675762116909027, "fcm_dpo/margin": 224.2485809326172, "fcm_dpo/q_t": 0.4034174680709839, "grad_norm": 25.26753044128418, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.4363176226615906, "logits/rejected": -0.4269408583641052, "logps/chosen": -370.797119140625, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -620.8492431640625, "loss": 1.0759, "margin_dpo/margin_mean": 224.2485809326172, "margin_dpo/margin_std": 283.8497009277344, "step": 359 }, { "KL/chosen_KL_mean": -313.34381103515625, "KL/mean": -399.41571044921875, "KL/rejected_KL_mean": -485.4875793457031, "KL/std": 200.4339599609375, "epoch": 0.5286343612334802, "fcm_dpo/beta": 0.0019088031258434057, "fcm_dpo/delta": 0.07367773354053497, "fcm_dpo/margin": 172.14376831054688, "fcm_dpo/q_t": 0.42239513993263245, "grad_norm": 39.744529724121094, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.44749805331230164, "logits/rejected": -0.4344269633293152, "logps/chosen": -364.80419921875, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892059326172, "logps/rejected": -555.3265380859375, "loss": 1.1216, "margin_dpo/margin_mean": 172.1437530517578, "margin_dpo/margin_std": 204.19671630859375, "step": 360 }, { "KL/chosen_KL_mean": -329.3931884765625, "KL/mean": -417.86151123046875, "KL/rejected_KL_mean": -506.329833984375, "KL/std": 236.83558654785156, "epoch": 0.5301027900146843, "fcm_dpo/beta": 0.0019349538488313556, "fcm_dpo/delta": 0.05939781665802002, "fcm_dpo/margin": 176.93667602539062, "fcm_dpo/q_t": 0.4206019639968872, "grad_norm": 28.265932083129883, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.42435145378112793, "logits/rejected": -0.4203334152698517, "logps/chosen": -383.2627258300781, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.7692642211914, "logps/rejected": -597.09912109375, "loss": 1.1322, "margin_dpo/margin_mean": 176.93667602539062, "margin_dpo/margin_std": 257.4376220703125, "step": 361 }, { "KL/chosen_KL_mean": -289.91583251953125, "KL/mean": -428.1727600097656, "KL/rejected_KL_mean": -566.4297485351562, "KL/std": 248.63421630859375, "epoch": 0.5315712187958884, "fcm_dpo/beta": 0.0018921452574431896, "fcm_dpo/delta": -0.13069821894168854, "fcm_dpo/margin": 276.513916015625, "fcm_dpo/q_t": 0.3790006637573242, "grad_norm": 24.199586868286133, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.45070475339889526, "logits/rejected": -0.46132344007492065, "logps/chosen": -348.55487060546875, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -672.01171875, "loss": 0.9867, "margin_dpo/margin_mean": 276.5138854980469, "margin_dpo/margin_std": 264.4261474609375, "step": 362 }, { "KL/chosen_KL_mean": -264.89666748046875, "KL/mean": -404.021728515625, "KL/rejected_KL_mean": -543.1467895507812, "KL/std": 245.37258911132812, "epoch": 0.5330396475770925, "fcm_dpo/beta": 0.001857282593846321, "fcm_dpo/delta": -0.12296949326992035, "fcm_dpo/margin": 278.2501220703125, "fcm_dpo/q_t": 0.38030678033828735, "grad_norm": 23.536636352539062, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.4141322076320648, "logits/rejected": -0.4084208607673645, "logps/chosen": -309.455078125, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -617.841796875, "loss": 0.9918, "margin_dpo/margin_mean": 278.2501220703125, "margin_dpo/margin_std": 267.5613098144531, "step": 363 }, { "KL/chosen_KL_mean": -294.61907958984375, "KL/mean": -398.00604248046875, "KL/rejected_KL_mean": -501.39300537109375, "KL/std": 241.02774047851562, "epoch": 0.5345080763582967, "fcm_dpo/beta": 0.001852140761911869, "fcm_dpo/delta": 0.017322657629847527, "fcm_dpo/margin": 206.77392578125, "fcm_dpo/q_t": 0.41120392084121704, "grad_norm": 24.284122467041016, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.46633392572402954, "logits/rejected": -0.4776257276535034, "logps/chosen": -343.513671875, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -592.7887573242188, "loss": 1.1034, "margin_dpo/margin_mean": 206.77394104003906, "margin_dpo/margin_std": 278.9082946777344, "step": 364 }, { "KL/chosen_KL_mean": -282.74884033203125, "KL/mean": -393.6543884277344, "KL/rejected_KL_mean": -504.5599060058594, "KL/std": 251.020751953125, "epoch": 0.5359765051395007, "fcm_dpo/beta": 0.0018444794695824385, "fcm_dpo/delta": -0.009514345787465572, "fcm_dpo/margin": 221.81109619140625, "fcm_dpo/q_t": 0.40571504831314087, "grad_norm": 23.80178451538086, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.42873144149780273, "logits/rejected": -0.440301775932312, "logps/chosen": -334.2415771484375, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -597.2615966796875, "loss": 1.0708, "margin_dpo/margin_mean": 221.81109619140625, "margin_dpo/margin_std": 257.0599060058594, "step": 365 }, { "KL/chosen_KL_mean": -263.0450134277344, "KL/mean": -374.4595031738281, "KL/rejected_KL_mean": -485.8740234375, "KL/std": 239.58175659179688, "epoch": 0.5374449339207048, "fcm_dpo/beta": 0.001833090209402144, "fcm_dpo/delta": -0.009195588529109955, "fcm_dpo/margin": 222.82901000976562, "fcm_dpo/q_t": 0.40598738193511963, "grad_norm": 21.787899017333984, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.41914117336273193, "logits/rejected": -0.43506374955177307, "logps/chosen": -307.76556396484375, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -569.1844482421875, "loss": 1.0851, "margin_dpo/margin_mean": 222.82901000976562, "margin_dpo/margin_std": 287.36468505859375, "step": 366 }, { "KL/chosen_KL_mean": -271.44921875, "KL/mean": -364.90087890625, "KL/rejected_KL_mean": -458.3525085449219, "KL/std": 220.12733459472656, "epoch": 0.5389133627019089, "fcm_dpo/beta": 0.0018576278816908598, "fcm_dpo/delta": 0.05456267669796944, "fcm_dpo/margin": 186.90333557128906, "fcm_dpo/q_t": 0.41822659969329834, "grad_norm": 20.662567138671875, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.44316402077674866, "logits/rejected": -0.4273492395877838, "logps/chosen": -329.8546142578125, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -535.1038208007812, "loss": 1.1158, "margin_dpo/margin_mean": 186.90333557128906, "margin_dpo/margin_std": 237.801025390625, "step": 367 }, { "KL/chosen_KL_mean": -256.9306640625, "KL/mean": -400.14288330078125, "KL/rejected_KL_mean": -543.3551025390625, "KL/std": 246.394287109375, "epoch": 0.540381791483113, "fcm_dpo/beta": 0.0018250863067805767, "fcm_dpo/delta": -0.12947417795658112, "fcm_dpo/margin": 286.42449951171875, "fcm_dpo/q_t": 0.37577325105667114, "grad_norm": 35.373626708984375, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.46660637855529785, "logits/rejected": -0.4749259352684021, "logps/chosen": -301.3831787109375, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -641.910400390625, "loss": 0.9699, "margin_dpo/margin_mean": 286.42449951171875, "margin_dpo/margin_std": 225.23828125, "step": 368 }, { "KL/chosen_KL_mean": -343.1910095214844, "KL/mean": -415.24658203125, "KL/rejected_KL_mean": -487.3021240234375, "KL/std": 250.22268676757812, "epoch": 0.5418502202643172, "fcm_dpo/beta": 0.001818750286474824, "fcm_dpo/delta": 0.0356462262570858, "fcm_dpo/margin": 144.11106872558594, "fcm_dpo/q_t": 0.43869489431381226, "grad_norm": 29.09714126586914, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.47453856468200684, "logits/rejected": -0.4585055708885193, "logps/chosen": -414.572509765625, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -578.597900390625, "loss": 1.205, "margin_dpo/margin_mean": 144.11106872558594, "margin_dpo/margin_std": 282.6583251953125, "step": 369 }, { "KL/chosen_KL_mean": -344.9140625, "KL/mean": -421.39202880859375, "KL/rejected_KL_mean": -497.8699951171875, "KL/std": 256.6206359863281, "epoch": 0.5433186490455213, "fcm_dpo/beta": 0.0018614260479807854, "fcm_dpo/delta": 0.11835242807865143, "fcm_dpo/margin": 152.9559783935547, "fcm_dpo/q_t": 0.4348163604736328, "grad_norm": 29.563688278198242, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.46674686670303345, "logits/rejected": -0.45883116126060486, "logps/chosen": -416.52154541015625, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -595.1297607421875, "loss": 1.1911, "margin_dpo/margin_mean": 152.95596313476562, "margin_dpo/margin_std": 288.8004150390625, "step": 370 }, { "KL/chosen_KL_mean": -335.6900939941406, "KL/mean": -441.45654296875, "KL/rejected_KL_mean": -547.222900390625, "KL/std": 263.390625, "epoch": 0.5447870778267254, "fcm_dpo/beta": 0.0018742081010714173, "fcm_dpo/delta": 0.003576137125492096, "fcm_dpo/margin": 211.5328369140625, "fcm_dpo/q_t": 0.40980789065361023, "grad_norm": 27.87566566467285, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.5134047269821167, "logits/rejected": -0.49832814931869507, "logps/chosen": -405.1045837402344, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -646.3951416015625, "loss": 1.1043, "margin_dpo/margin_mean": 211.5328369140625, "margin_dpo/margin_std": 302.67376708984375, "step": 371 }, { "KL/chosen_KL_mean": -320.33050537109375, "KL/mean": -443.8200378417969, "KL/rejected_KL_mean": -567.3095703125, "KL/std": 299.37408447265625, "epoch": 0.5462555066079295, "fcm_dpo/beta": 0.0018502443563193083, "fcm_dpo/delta": -0.05988244712352753, "fcm_dpo/margin": 246.97903442382812, "fcm_dpo/q_t": 0.398201584815979, "grad_norm": 25.24636459350586, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.5078925490379333, "logits/rejected": -0.49255937337875366, "logps/chosen": -382.14849853515625, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53948974609375, "logps/rejected": -645.8489990234375, "loss": 1.0673, "margin_dpo/margin_mean": 246.97903442382812, "margin_dpo/margin_std": 337.23712158203125, "step": 372 }, { "KL/chosen_KL_mean": -346.5724792480469, "KL/mean": -466.34234619140625, "KL/rejected_KL_mean": -586.1122436523438, "KL/std": 286.9017639160156, "epoch": 0.5477239353891337, "fcm_dpo/beta": 0.0018451586365699768, "fcm_dpo/delta": -0.044221822172403336, "fcm_dpo/margin": 239.539794921875, "fcm_dpo/q_t": 0.3970082998275757, "grad_norm": 32.10969161987305, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.4709147810935974, "logits/rejected": -0.4543595016002655, "logps/chosen": -410.78961181640625, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -672.0718994140625, "loss": 1.0556, "margin_dpo/margin_mean": 239.539794921875, "margin_dpo/margin_std": 271.01666259765625, "step": 373 }, { "KL/chosen_KL_mean": -305.8607177734375, "KL/mean": -413.46075439453125, "KL/rejected_KL_mean": -521.060791015625, "KL/std": 309.17022705078125, "epoch": 0.5491923641703378, "fcm_dpo/beta": 0.0018260091310366988, "fcm_dpo/delta": 0.007001262158155441, "fcm_dpo/margin": 215.20005798339844, "fcm_dpo/q_t": 0.413374662399292, "grad_norm": 24.107498168945312, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.5528968572616577, "logits/rejected": -0.5381832122802734, "logps/chosen": -376.5108947753906, "logps/ref_chosen": -70.65018463134766, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -614.700927734375, "loss": 1.1254, "margin_dpo/margin_mean": 215.2000732421875, "margin_dpo/margin_std": 354.08026123046875, "step": 374 }, { "KL/chosen_KL_mean": -316.97076416015625, "KL/mean": -415.6497802734375, "KL/rejected_KL_mean": -514.3287353515625, "KL/std": 248.89346313476562, "epoch": 0.5506607929515418, "fcm_dpo/beta": 0.0018346281722187996, "fcm_dpo/delta": 0.03891323506832123, "fcm_dpo/margin": 197.35801696777344, "fcm_dpo/q_t": 0.416460245847702, "grad_norm": 28.248476028442383, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.48038458824157715, "logits/rejected": -0.4829840064048767, "logps/chosen": -377.0509948730469, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -603.26708984375, "loss": 1.1373, "margin_dpo/margin_mean": 197.35800170898438, "margin_dpo/margin_std": 312.4132995605469, "step": 375 }, { "KL/chosen_KL_mean": -300.00848388671875, "KL/mean": -428.203857421875, "KL/rejected_KL_mean": -556.3992919921875, "KL/std": 273.4169616699219, "epoch": 0.5521292217327459, "fcm_dpo/beta": 0.0018291289452463388, "fcm_dpo/delta": -0.07235552370548248, "fcm_dpo/margin": 256.3907470703125, "fcm_dpo/q_t": 0.39261579513549805, "grad_norm": 25.545181274414062, "learning_rate": 2.5e-07, "logits/chosen": -0.44510942697525024, "logits/rejected": -0.43642458319664, "logps/chosen": -362.66876220703125, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.52660369873047, "logps/rejected": -661.9258422851562, "loss": 1.0513, "margin_dpo/margin_mean": 256.3907470703125, "margin_dpo/margin_std": 320.7281188964844, "step": 376 }, { "KL/chosen_KL_mean": -301.1566467285156, "KL/mean": -427.1124267578125, "KL/rejected_KL_mean": -553.0682373046875, "KL/std": 280.93780517578125, "epoch": 0.55359765051395, "fcm_dpo/beta": 0.0018094563856720924, "fcm_dpo/delta": -0.05844918638467789, "fcm_dpo/margin": 251.91156005859375, "fcm_dpo/q_t": 0.39543959498405457, "grad_norm": 25.23697853088379, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.397521436214447, "logits/rejected": -0.39853352308273315, "logps/chosen": -355.6353759765625, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -651.7716064453125, "loss": 1.0517, "margin_dpo/margin_mean": 251.91156005859375, "margin_dpo/margin_std": 303.8875732421875, "step": 377 }, { "KL/chosen_KL_mean": -276.0648498535156, "KL/mean": -409.00689697265625, "KL/rejected_KL_mean": -541.948974609375, "KL/std": 263.941162109375, "epoch": 0.5550660792951542, "fcm_dpo/beta": 0.001778826816007495, "fcm_dpo/delta": -0.07667370140552521, "fcm_dpo/margin": 265.88409423828125, "fcm_dpo/q_t": 0.3893394470214844, "grad_norm": 30.339950561523438, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.4182929992675781, "logits/rejected": -0.4355580806732178, "logps/chosen": -321.08538818359375, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -629.9959716796875, "loss": 1.0286, "margin_dpo/margin_mean": 265.88409423828125, "margin_dpo/margin_std": 285.3680419921875, "step": 378 }, { "KL/chosen_KL_mean": -324.24822998046875, "KL/mean": -456.31768798828125, "KL/rejected_KL_mean": -588.3870849609375, "KL/std": 270.865478515625, "epoch": 0.5565345080763583, "fcm_dpo/beta": 0.0017488367157056928, "fcm_dpo/delta": -0.06558392941951752, "fcm_dpo/margin": 264.138916015625, "fcm_dpo/q_t": 0.39461731910705566, "grad_norm": 23.810928344726562, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.3999977111816406, "logits/rejected": -0.4254748225212097, "logps/chosen": -377.43035888671875, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.3001708984375, "logps/rejected": -702.687255859375, "loss": 1.0496, "margin_dpo/margin_mean": 264.1388854980469, "margin_dpo/margin_std": 319.9960021972656, "step": 379 }, { "KL/chosen_KL_mean": -327.43389892578125, "KL/mean": -471.5666198730469, "KL/rejected_KL_mean": -615.6993408203125, "KL/std": 298.45489501953125, "epoch": 0.5580029368575624, "fcm_dpo/beta": 0.0017266274662688375, "fcm_dpo/delta": -0.10273480415344238, "fcm_dpo/margin": 288.2655029296875, "fcm_dpo/q_t": 0.3840462565422058, "grad_norm": 27.789323806762695, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.4401572346687317, "logits/rejected": -0.46762269735336304, "logps/chosen": -378.78692626953125, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -719.8910522460938, "loss": 1.0222, "margin_dpo/margin_mean": 288.2655029296875, "margin_dpo/margin_std": 325.3714599609375, "step": 380 }, { "KL/chosen_KL_mean": -348.72802734375, "KL/mean": -450.8270263671875, "KL/rejected_KL_mean": -552.926025390625, "KL/std": 260.6754455566406, "epoch": 0.5594713656387665, "fcm_dpo/beta": 0.0017242280300706625, "fcm_dpo/delta": 0.04968461021780968, "fcm_dpo/margin": 204.19798278808594, "fcm_dpo/q_t": 0.42024338245391846, "grad_norm": 25.380268096923828, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.43789827823638916, "logits/rejected": -0.4210563898086548, "logps/chosen": -406.5311279296875, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -632.1454467773438, "loss": 1.1515, "margin_dpo/margin_mean": 204.197998046875, "margin_dpo/margin_std": 352.91259765625, "step": 381 }, { "KL/chosen_KL_mean": -352.124267578125, "KL/mean": -475.0651550292969, "KL/rejected_KL_mean": -598.0060424804688, "KL/std": 246.15951538085938, "epoch": 0.5609397944199707, "fcm_dpo/beta": 0.0017251023091375828, "fcm_dpo/delta": -0.02524741366505623, "fcm_dpo/margin": 245.88180541992188, "fcm_dpo/q_t": 0.4009873569011688, "grad_norm": 30.70073890686035, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.46005573868751526, "logits/rejected": -0.4692569375038147, "logps/chosen": -418.14453125, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71016693115234, "logps/rejected": -708.7161865234375, "loss": 1.0578, "margin_dpo/margin_mean": 245.88180541992188, "margin_dpo/margin_std": 270.6402587890625, "step": 382 }, { "KL/chosen_KL_mean": -358.22650146484375, "KL/mean": -477.87152099609375, "KL/rejected_KL_mean": -597.5165405273438, "KL/std": 270.5577392578125, "epoch": 0.5624082232011748, "fcm_dpo/beta": 0.0017189650097861886, "fcm_dpo/delta": -0.011835414916276932, "fcm_dpo/margin": 239.28997802734375, "fcm_dpo/q_t": 0.40572264790534973, "grad_norm": 29.060136795043945, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.4785361886024475, "logits/rejected": -0.4892638325691223, "logps/chosen": -408.6180114746094, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -691.232421875, "loss": 1.1034, "margin_dpo/margin_mean": 239.28997802734375, "margin_dpo/margin_std": 355.65509033203125, "step": 383 }, { "KL/chosen_KL_mean": -376.49822998046875, "KL/mean": -473.9187316894531, "KL/rejected_KL_mean": -571.3392333984375, "KL/std": 258.77777099609375, "epoch": 0.5638766519823789, "fcm_dpo/beta": 0.0017420074436813593, "fcm_dpo/delta": 0.062150660902261734, "fcm_dpo/margin": 194.84095764160156, "fcm_dpo/q_t": 0.4199580252170563, "grad_norm": 28.971044540405273, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.49737972021102905, "logits/rejected": -0.4919343590736389, "logps/chosen": -428.5443420410156, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -657.10009765625, "loss": 1.1205, "margin_dpo/margin_mean": 194.84097290039062, "margin_dpo/margin_std": 245.9820556640625, "step": 384 }, { "KL/chosen_KL_mean": -361.83416748046875, "KL/mean": -478.52764892578125, "KL/rejected_KL_mean": -595.2210693359375, "KL/std": 224.6497802734375, "epoch": 0.5653450807635829, "fcm_dpo/beta": 0.0017391443252563477, "fcm_dpo/delta": -0.006142602767795324, "fcm_dpo/margin": 233.38693237304688, "fcm_dpo/q_t": 0.4041079580783844, "grad_norm": 34.824005126953125, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.5053662061691284, "logits/rejected": -0.48552972078323364, "logps/chosen": -427.3863220214844, "logps/ref_chosen": -65.55215454101562, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -673.049072265625, "loss": 1.0656, "margin_dpo/margin_mean": 233.38693237304688, "margin_dpo/margin_std": 248.42532348632812, "step": 385 }, { "KL/chosen_KL_mean": -378.93572998046875, "KL/mean": -507.42462158203125, "KL/rejected_KL_mean": -635.9134521484375, "KL/std": 285.33013916015625, "epoch": 0.566813509544787, "fcm_dpo/beta": 0.0017280435422435403, "fcm_dpo/delta": -0.046092525124549866, "fcm_dpo/margin": 256.977783203125, "fcm_dpo/q_t": 0.39897212386131287, "grad_norm": 34.94272994995117, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.4857565760612488, "logits/rejected": -0.4821171760559082, "logps/chosen": -437.1575927734375, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -728.2409057617188, "loss": 1.066, "margin_dpo/margin_mean": 256.977783203125, "margin_dpo/margin_std": 332.15789794921875, "step": 386 }, { "KL/chosen_KL_mean": -393.4022216796875, "KL/mean": -504.98333740234375, "KL/rejected_KL_mean": -616.564453125, "KL/std": 264.5252685546875, "epoch": 0.5682819383259912, "fcm_dpo/beta": 0.001718209940008819, "fcm_dpo/delta": 0.01678801327943802, "fcm_dpo/margin": 223.16220092773438, "fcm_dpo/q_t": 0.41014280915260315, "grad_norm": 37.69805908203125, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.5313735008239746, "logits/rejected": -0.5198484063148499, "logps/chosen": -459.8216552734375, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -708.733642578125, "loss": 1.093, "margin_dpo/margin_mean": 223.16221618652344, "margin_dpo/margin_std": 270.58795166015625, "step": 387 }, { "KL/chosen_KL_mean": -380.30328369140625, "KL/mean": -524.3448486328125, "KL/rejected_KL_mean": -668.386474609375, "KL/std": 307.99615478515625, "epoch": 0.5697503671071953, "fcm_dpo/beta": 0.0016994503093883395, "fcm_dpo/delta": -0.09461631625890732, "fcm_dpo/margin": 288.0831604003906, "fcm_dpo/q_t": 0.39002934098243713, "grad_norm": 34.622745513916016, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.49403852224349976, "logits/rejected": -0.5061089396476746, "logps/chosen": -430.4327392578125, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -772.8195190429688, "loss": 1.0371, "margin_dpo/margin_mean": 288.0831604003906, "margin_dpo/margin_std": 356.5264892578125, "step": 388 }, { "KL/chosen_KL_mean": -403.150390625, "KL/mean": -520.1739501953125, "KL/rejected_KL_mean": -637.1974487304688, "KL/std": 292.00225830078125, "epoch": 0.5712187958883994, "fcm_dpo/beta": 0.001694181701168418, "fcm_dpo/delta": 0.003580855205655098, "fcm_dpo/margin": 234.0470733642578, "fcm_dpo/q_t": 0.40893417596817017, "grad_norm": 31.105833053588867, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.5170685648918152, "logits/rejected": -0.5164707899093628, "logps/chosen": -461.0569763183594, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -715.1119995117188, "loss": 1.0973, "margin_dpo/margin_mean": 234.04705810546875, "margin_dpo/margin_std": 320.1253967285156, "step": 389 }, { "KL/chosen_KL_mean": -388.3230285644531, "KL/mean": -506.873291015625, "KL/rejected_KL_mean": -625.4235229492188, "KL/std": 285.4632568359375, "epoch": 0.5726872246696035, "fcm_dpo/beta": 0.0016903409268707037, "fcm_dpo/delta": -0.00112185999751091, "fcm_dpo/margin": 237.10052490234375, "fcm_dpo/q_t": 0.41132819652557373, "grad_norm": 31.483285903930664, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.449199914932251, "logits/rejected": -0.4437105655670166, "logps/chosen": -437.5489501953125, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -710.9517211914062, "loss": 1.1107, "margin_dpo/margin_mean": 237.10049438476562, "margin_dpo/margin_std": 363.8388366699219, "step": 390 }, { "KL/chosen_KL_mean": -366.3382263183594, "KL/mean": -437.36956787109375, "KL/rejected_KL_mean": -508.40093994140625, "KL/std": 255.61053466796875, "epoch": 0.5741556534508077, "fcm_dpo/beta": 0.001737719401717186, "fcm_dpo/delta": 0.1569492220878601, "fcm_dpo/margin": 142.06271362304688, "fcm_dpo/q_t": 0.44300517439842224, "grad_norm": 39.402198791503906, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.4606980085372925, "logits/rejected": -0.4532572627067566, "logps/chosen": -430.6678771972656, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -595.13916015625, "loss": 1.2092, "margin_dpo/margin_mean": 142.0626983642578, "margin_dpo/margin_std": 269.4259033203125, "step": 391 }, { "KL/chosen_KL_mean": -303.680908203125, "KL/mean": -440.0765380859375, "KL/rejected_KL_mean": -576.47216796875, "KL/std": 265.79547119140625, "epoch": 0.5756240822320118, "fcm_dpo/beta": 0.0017368567641824484, "fcm_dpo/delta": -0.07744710892438889, "fcm_dpo/margin": 272.79132080078125, "fcm_dpo/q_t": 0.38911527395248413, "grad_norm": 29.656238555908203, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.4456912875175476, "logits/rejected": -0.4384923577308655, "logps/chosen": -357.18487548828125, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34584045410156, "logps/rejected": -678.8179931640625, "loss": 1.0146, "margin_dpo/margin_mean": 272.7912902832031, "margin_dpo/margin_std": 255.12586975097656, "step": 392 }, { "KL/chosen_KL_mean": -293.63629150390625, "KL/mean": -398.53155517578125, "KL/rejected_KL_mean": -503.4267578125, "KL/std": 257.4723205566406, "epoch": 0.5770925110132159, "fcm_dpo/beta": 0.0017312290146946907, "fcm_dpo/delta": 0.03820331022143364, "fcm_dpo/margin": 209.79046630859375, "fcm_dpo/q_t": 0.41823697090148926, "grad_norm": 25.39501190185547, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.4462127685546875, "logits/rejected": -0.445779412984848, "logps/chosen": -340.1102294921875, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -575.3956298828125, "loss": 1.124, "margin_dpo/margin_mean": 209.79046630859375, "margin_dpo/margin_std": 317.695068359375, "step": 393 }, { "KL/chosen_KL_mean": -321.10101318359375, "KL/mean": -431.11602783203125, "KL/rejected_KL_mean": -541.131103515625, "KL/std": 263.2855529785156, "epoch": 0.57856093979442, "fcm_dpo/beta": 0.001739653293043375, "fcm_dpo/delta": 0.017899950966238976, "fcm_dpo/margin": 220.0300750732422, "fcm_dpo/q_t": 0.4115217924118042, "grad_norm": 22.869054794311523, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.46011653542518616, "logits/rejected": -0.4607650935649872, "logps/chosen": -374.0125732421875, "logps/ref_chosen": -52.91154861450195, "logps/ref_rejected": -90.8226318359375, "logps/rejected": -631.9537353515625, "loss": 1.0947, "margin_dpo/margin_mean": 220.0300750732422, "margin_dpo/margin_std": 278.54339599609375, "step": 394 }, { "KL/chosen_KL_mean": -319.739990234375, "KL/mean": -439.35211181640625, "KL/rejected_KL_mean": -558.9642333984375, "KL/std": 267.854248046875, "epoch": 0.580029368575624, "fcm_dpo/beta": 0.0017342737410217524, "fcm_dpo/delta": -0.015743978321552277, "fcm_dpo/margin": 239.2242431640625, "fcm_dpo/q_t": 0.40489462018013, "grad_norm": 21.308685302734375, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.4947971701622009, "logits/rejected": -0.49258559942245483, "logps/chosen": -382.2861022949219, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -642.7468872070312, "loss": 1.0802, "margin_dpo/margin_mean": 239.2242431640625, "margin_dpo/margin_std": 307.40301513671875, "step": 395 }, { "KL/chosen_KL_mean": -327.5321044921875, "KL/mean": -437.19598388671875, "KL/rejected_KL_mean": -546.8599853515625, "KL/std": 254.036865234375, "epoch": 0.5814977973568282, "fcm_dpo/beta": 0.0017403149977326393, "fcm_dpo/delta": 0.019010702148079872, "fcm_dpo/margin": 219.32781982421875, "fcm_dpo/q_t": 0.41025522351264954, "grad_norm": 23.98872947692871, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.4915603995323181, "logits/rejected": -0.475990355014801, "logps/chosen": -396.52801513671875, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -635.506591796875, "loss": 1.0806, "margin_dpo/margin_mean": 219.32781982421875, "margin_dpo/margin_std": 237.82656860351562, "step": 396 }, { "KL/chosen_KL_mean": -305.2785339355469, "KL/mean": -444.10589599609375, "KL/rejected_KL_mean": -582.9332275390625, "KL/std": 257.37060546875, "epoch": 0.5829662261380323, "fcm_dpo/beta": 0.0017182010924443603, "fcm_dpo/delta": -0.08137989044189453, "fcm_dpo/margin": 277.6547546386719, "fcm_dpo/q_t": 0.38798123598098755, "grad_norm": 34.52851867675781, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.434369832277298, "logits/rejected": -0.43857717514038086, "logps/chosen": -366.5556945800781, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -686.0493774414062, "loss": 1.0139, "margin_dpo/margin_mean": 277.65472412109375, "margin_dpo/margin_std": 265.4613037109375, "step": 397 }, { "KL/chosen_KL_mean": -339.3876953125, "KL/mean": -462.7353515625, "KL/rejected_KL_mean": -586.0830078125, "KL/std": 268.5569152832031, "epoch": 0.5844346549192364, "fcm_dpo/beta": 0.0017101437551900744, "fcm_dpo/delta": -0.022855112329125404, "fcm_dpo/margin": 246.69528198242188, "fcm_dpo/q_t": 0.40368321537971497, "grad_norm": 21.167795181274414, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.504738986492157, "logits/rejected": -0.5006571412086487, "logps/chosen": -407.53924560546875, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -694.6066284179688, "loss": 1.0739, "margin_dpo/margin_mean": 246.69528198242188, "margin_dpo/margin_std": 314.19610595703125, "step": 398 }, { "KL/chosen_KL_mean": -292.50311279296875, "KL/mean": -409.574462890625, "KL/rejected_KL_mean": -526.645751953125, "KL/std": 240.19448852539062, "epoch": 0.5859030837004405, "fcm_dpo/beta": 0.0017040125094354153, "fcm_dpo/delta": 0.000902075320482254, "fcm_dpo/margin": 234.1426544189453, "fcm_dpo/q_t": 0.40744373202323914, "grad_norm": 26.88203239440918, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.4666723310947418, "logits/rejected": -0.42695629596710205, "logps/chosen": -353.3929138183594, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.965576171875, "logps/rejected": -604.611328125, "loss": 1.0882, "margin_dpo/margin_mean": 234.1426544189453, "margin_dpo/margin_std": 298.5992431640625, "step": 399 }, { "KL/chosen_KL_mean": -281.47064208984375, "KL/mean": -446.57061767578125, "KL/rejected_KL_mean": -611.6705932617188, "KL/std": 261.82757568359375, "epoch": 0.5873715124816447, "fcm_dpo/beta": 0.0016647314187139273, "fcm_dpo/delta": -0.158945232629776, "fcm_dpo/margin": 330.199951171875, "fcm_dpo/q_t": 0.3706052005290985, "grad_norm": 28.340599060058594, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.4434245228767395, "logits/rejected": -0.43936118483543396, "logps/chosen": -345.1142272949219, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -716.9232788085938, "loss": 0.9628, "margin_dpo/margin_mean": 330.199951171875, "margin_dpo/margin_std": 286.12255859375, "step": 400 }, { "KL/chosen_KL_mean": -341.30584716796875, "KL/mean": -434.10931396484375, "KL/rejected_KL_mean": -526.9127197265625, "KL/std": 270.2348937988281, "epoch": 0.5888399412628488, "fcm_dpo/beta": 0.0016737841069698334, "fcm_dpo/delta": 0.09229836612939835, "fcm_dpo/margin": 185.60690307617188, "fcm_dpo/q_t": 0.4304611086845398, "grad_norm": 23.37901496887207, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.4471530318260193, "logits/rejected": -0.4397915005683899, "logps/chosen": -398.4688720703125, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -610.7052001953125, "loss": 1.1851, "margin_dpo/margin_mean": 185.60690307617188, "margin_dpo/margin_std": 357.864013671875, "step": 401 }, { "KL/chosen_KL_mean": -254.9739532470703, "KL/mean": -427.3084716796875, "KL/rejected_KL_mean": -599.6429443359375, "KL/std": 300.2375793457031, "epoch": 0.5903083700440529, "fcm_dpo/beta": 0.0016427625669166446, "fcm_dpo/delta": -0.17618390917778015, "fcm_dpo/margin": 344.6689758300781, "fcm_dpo/q_t": 0.3684191107749939, "grad_norm": 21.227405548095703, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.490563303232193, "logits/rejected": -0.4836328625679016, "logps/chosen": -305.7143249511719, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -680.68896484375, "loss": 0.9545, "margin_dpo/margin_mean": 344.6689758300781, "margin_dpo/margin_std": 306.67181396484375, "step": 402 }, { "KL/chosen_KL_mean": -286.7187805175781, "KL/mean": -420.0758056640625, "KL/rejected_KL_mean": -553.4327392578125, "KL/std": 288.28253173828125, "epoch": 0.591776798825257, "fcm_dpo/beta": 0.0016190607566386461, "fcm_dpo/delta": -0.03328249230980873, "fcm_dpo/margin": 266.7139892578125, "fcm_dpo/q_t": 0.3999168574810028, "grad_norm": 23.255849838256836, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.4604523479938507, "logits/rejected": -0.4616071879863739, "logps/chosen": -333.8660888671875, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -630.6993408203125, "loss": 1.056, "margin_dpo/margin_mean": 266.7139892578125, "margin_dpo/margin_std": 298.5339050292969, "step": 403 }, { "KL/chosen_KL_mean": -327.2615966796875, "KL/mean": -458.3185119628906, "KL/rejected_KL_mean": -589.3754272460938, "KL/std": 274.367919921875, "epoch": 0.593245227606461, "fcm_dpo/beta": 0.0016095450846478343, "fcm_dpo/delta": -0.022908374667167664, "fcm_dpo/margin": 262.11383056640625, "fcm_dpo/q_t": 0.40320682525634766, "grad_norm": 30.493053436279297, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.4503590166568756, "logits/rejected": -0.4424477815628052, "logps/chosen": -375.13690185546875, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -666.5303955078125, "loss": 1.0819, "margin_dpo/margin_mean": 262.11383056640625, "margin_dpo/margin_std": 347.977783203125, "step": 404 }, { "KL/chosen_KL_mean": -380.44219970703125, "KL/mean": -484.3817443847656, "KL/rejected_KL_mean": -588.3212890625, "KL/std": 300.7879943847656, "epoch": 0.5947136563876652, "fcm_dpo/beta": 0.001602754695340991, "fcm_dpo/delta": -0.043147142976522446, "fcm_dpo/margin": 207.8790283203125, "fcm_dpo/q_t": 0.4246622323989868, "grad_norm": 32.75885772705078, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.44959819316864014, "logits/rejected": -0.4414255619049072, "logps/chosen": -445.6051025390625, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -675.508056640625, "loss": 1.1666, "margin_dpo/margin_mean": 207.8790283203125, "margin_dpo/margin_std": 365.72845458984375, "step": 405 }, { "KL/chosen_KL_mean": -342.9149475097656, "KL/mean": -486.18707275390625, "KL/rejected_KL_mean": -629.459228515625, "KL/std": 308.88067626953125, "epoch": 0.5961820851688693, "fcm_dpo/beta": 0.0015887843910604715, "fcm_dpo/delta": -0.05800767242908478, "fcm_dpo/margin": 286.5443115234375, "fcm_dpo/q_t": 0.39615678787231445, "grad_norm": 23.16806983947754, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.48730766773223877, "logits/rejected": -0.5009229183197021, "logps/chosen": -392.65576171875, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -721.5379028320312, "loss": 1.0517, "margin_dpo/margin_mean": 286.5443115234375, "margin_dpo/margin_std": 344.0789794921875, "step": 406 }, { "KL/chosen_KL_mean": -372.48846435546875, "KL/mean": -458.53515625, "KL/rejected_KL_mean": -544.5818481445312, "KL/std": 246.12185668945312, "epoch": 0.5976505139500734, "fcm_dpo/beta": 0.0015788807068020105, "fcm_dpo/delta": 0.009174516424536705, "fcm_dpo/margin": 172.09339904785156, "fcm_dpo/q_t": 0.43600770831108093, "grad_norm": 27.538284301757812, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.4871164560317993, "logits/rejected": -0.45956844091415405, "logps/chosen": -428.81915283203125, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.51209259033203, "logps/rejected": -622.093994140625, "loss": 1.1991, "margin_dpo/margin_mean": 172.09341430664062, "margin_dpo/margin_std": 325.55999755859375, "step": 407 }, { "KL/chosen_KL_mean": -372.11248779296875, "KL/mean": -475.17840576171875, "KL/rejected_KL_mean": -578.244384765625, "KL/std": 243.3355255126953, "epoch": 0.5991189427312775, "fcm_dpo/beta": 0.0015977565199136734, "fcm_dpo/delta": 0.07305292040109634, "fcm_dpo/margin": 206.1318359375, "fcm_dpo/q_t": 0.42254602909088135, "grad_norm": 33.735374450683594, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.5174983143806458, "logits/rejected": -0.4989047050476074, "logps/chosen": -441.90179443359375, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -668.34130859375, "loss": 1.1292, "margin_dpo/margin_mean": 206.1318359375, "margin_dpo/margin_std": 271.8330993652344, "step": 408 }, { "KL/chosen_KL_mean": -352.27313232421875, "KL/mean": -452.4332275390625, "KL/rejected_KL_mean": -552.5933837890625, "KL/std": 252.9176025390625, "epoch": 0.6005873715124816, "fcm_dpo/beta": 0.0016257348470389843, "fcm_dpo/delta": 0.07655191421508789, "fcm_dpo/margin": 200.32015991210938, "fcm_dpo/q_t": 0.42412787675857544, "grad_norm": 37.519432067871094, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.4930969476699829, "logits/rejected": -0.48252415657043457, "logps/chosen": -419.590576171875, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -637.4976196289062, "loss": 1.1368, "margin_dpo/margin_mean": 200.32015991210938, "margin_dpo/margin_std": 280.4720153808594, "step": 409 }, { "KL/chosen_KL_mean": -331.1689453125, "KL/mean": -449.50732421875, "KL/rejected_KL_mean": -567.8456420898438, "KL/std": 254.5586395263672, "epoch": 0.6020558002936858, "fcm_dpo/beta": 0.0016305126482620835, "fcm_dpo/delta": 0.014659320935606956, "fcm_dpo/margin": 236.67666625976562, "fcm_dpo/q_t": 0.40897810459136963, "grad_norm": 31.696603775024414, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.4725341796875, "logits/rejected": -0.47504281997680664, "logps/chosen": -382.63433837890625, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -651.0446166992188, "loss": 1.0884, "margin_dpo/margin_mean": 236.67666625976562, "margin_dpo/margin_std": 284.35211181640625, "step": 410 }, { "KL/chosen_KL_mean": -353.330810546875, "KL/mean": -468.7662658691406, "KL/rejected_KL_mean": -584.20166015625, "KL/std": 282.9371337890625, "epoch": 0.6035242290748899, "fcm_dpo/beta": 0.001626357901841402, "fcm_dpo/delta": 0.02442072331905365, "fcm_dpo/margin": 230.87094116210938, "fcm_dpo/q_t": 0.41455578804016113, "grad_norm": 44.872047424316406, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.48675569891929626, "logits/rejected": -0.5024890303611755, "logps/chosen": -405.6380615234375, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -664.8966674804688, "loss": 1.1138, "margin_dpo/margin_mean": 230.87094116210938, "margin_dpo/margin_std": 319.9390869140625, "step": 411 }, { "KL/chosen_KL_mean": -358.72283935546875, "KL/mean": -475.9858093261719, "KL/rejected_KL_mean": -593.2487182617188, "KL/std": 272.87725830078125, "epoch": 0.604992657856094, "fcm_dpo/beta": 0.0016456831945106387, "fcm_dpo/delta": 0.014591998420655727, "fcm_dpo/margin": 234.52587890625, "fcm_dpo/q_t": 0.4107934236526489, "grad_norm": 37.48828887939453, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.5510473251342773, "logits/rejected": -0.5846823453903198, "logps/chosen": -411.86700439453125, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.0608139038086, "logps/rejected": -693.3095703125, "loss": 1.1019, "margin_dpo/margin_mean": 234.52587890625, "margin_dpo/margin_std": 319.07745361328125, "step": 412 }, { "KL/chosen_KL_mean": -391.5623779296875, "KL/mean": -510.7938537597656, "KL/rejected_KL_mean": -630.025390625, "KL/std": 288.8511962890625, "epoch": 0.6064610866372981, "fcm_dpo/beta": 0.0016510069835931063, "fcm_dpo/delta": 0.0064473580569028854, "fcm_dpo/margin": 238.46292114257812, "fcm_dpo/q_t": 0.40730637311935425, "grad_norm": 33.53278732299805, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.5170902013778687, "logits/rejected": -0.5278250575065613, "logps/chosen": -453.14434814453125, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -729.498779296875, "loss": 1.0936, "margin_dpo/margin_mean": 238.4629364013672, "margin_dpo/margin_std": 311.4296875, "step": 413 }, { "KL/chosen_KL_mean": -364.56231689453125, "KL/mean": -492.68621826171875, "KL/rejected_KL_mean": -620.8101806640625, "KL/std": 266.0479736328125, "epoch": 0.6079295154185022, "fcm_dpo/beta": 0.001652669394388795, "fcm_dpo/delta": -0.025241520255804062, "fcm_dpo/margin": 256.247802734375, "fcm_dpo/q_t": 0.4015337824821472, "grad_norm": 35.41373062133789, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.5074818134307861, "logits/rejected": -0.5076801776885986, "logps/chosen": -411.1938171386719, "logps/ref_chosen": -46.63148498535156, "logps/ref_rejected": -87.64653015136719, "logps/rejected": -708.4566650390625, "loss": 1.0711, "margin_dpo/margin_mean": 256.2478332519531, "margin_dpo/margin_std": 307.007080078125, "step": 414 }, { "KL/chosen_KL_mean": -394.33807373046875, "KL/mean": -489.5907897949219, "KL/rejected_KL_mean": -584.843505859375, "KL/std": 265.6529541015625, "epoch": 0.6093979441997063, "fcm_dpo/beta": 0.0016560875810682774, "fcm_dpo/delta": 0.08733348548412323, "fcm_dpo/margin": 190.50540161132812, "fcm_dpo/q_t": 0.42633694410324097, "grad_norm": 29.98908233642578, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.5528023838996887, "logits/rejected": -0.5534902811050415, "logps/chosen": -472.95635986328125, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -685.321044921875, "loss": 1.1517, "margin_dpo/margin_mean": 190.50540161132812, "margin_dpo/margin_std": 290.72491455078125, "step": 415 }, { "KL/chosen_KL_mean": -382.89056396484375, "KL/mean": -540.4343872070312, "KL/rejected_KL_mean": -697.9782104492188, "KL/std": 309.90460205078125, "epoch": 0.6108663729809104, "fcm_dpo/beta": 0.0016415867721661925, "fcm_dpo/delta": -0.12347446382045746, "fcm_dpo/margin": 315.0876770019531, "fcm_dpo/q_t": 0.38012266159057617, "grad_norm": 37.14603042602539, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.5974301099777222, "logits/rejected": -0.6131728887557983, "logps/chosen": -441.1697082519531, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -788.5469360351562, "loss": 0.9956, "margin_dpo/margin_mean": 315.0876770019531, "margin_dpo/margin_std": 312.2025146484375, "step": 416 }, { "KL/chosen_KL_mean": -382.858154296875, "KL/mean": -517.802978515625, "KL/rejected_KL_mean": -652.7478637695312, "KL/std": 286.8901672363281, "epoch": 0.6123348017621145, "fcm_dpo/beta": 0.0016207349253818393, "fcm_dpo/delta": -0.039116691797971725, "fcm_dpo/margin": 269.8897399902344, "fcm_dpo/q_t": 0.3975698947906494, "grad_norm": 29.581459045410156, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.5895746946334839, "logits/rejected": -0.583941638469696, "logps/chosen": -433.0568542480469, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -720.8997192382812, "loss": 1.0467, "margin_dpo/margin_mean": 269.8897705078125, "margin_dpo/margin_std": 286.52203369140625, "step": 417 }, { "KL/chosen_KL_mean": -423.42840576171875, "KL/mean": -557.5980224609375, "KL/rejected_KL_mean": -691.767578125, "KL/std": 328.6751708984375, "epoch": 0.6138032305433186, "fcm_dpo/beta": 0.0016204738058149815, "fcm_dpo/delta": -0.03766999393701553, "fcm_dpo/margin": 268.33917236328125, "fcm_dpo/q_t": 0.403804749250412, "grad_norm": 33.6888427734375, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6470938920974731, "logits/rejected": -0.6322601437568665, "logps/chosen": -505.4068603515625, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -796.4591064453125, "loss": 1.0912, "margin_dpo/margin_mean": 268.33917236328125, "margin_dpo/margin_std": 380.9486999511719, "step": 418 }, { "KL/chosen_KL_mean": -383.6288146972656, "KL/mean": -537.1128540039062, "KL/rejected_KL_mean": -690.5968017578125, "KL/std": 298.24053955078125, "epoch": 0.6152716593245228, "fcm_dpo/beta": 0.0015830930788069963, "fcm_dpo/delta": -0.09028756618499756, "fcm_dpo/margin": 306.96807861328125, "fcm_dpo/q_t": 0.38735997676849365, "grad_norm": 34.53618240356445, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.6050629019737244, "logits/rejected": -0.6123736500740051, "logps/chosen": -436.57745361328125, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -782.179931640625, "loss": 1.0281, "margin_dpo/margin_mean": 306.9680480957031, "margin_dpo/margin_std": 345.80462646484375, "step": 419 }, { "KL/chosen_KL_mean": -483.5123291015625, "KL/mean": -582.5636596679688, "KL/rejected_KL_mean": -681.614990234375, "KL/std": 315.52850341796875, "epoch": 0.6167400881057269, "fcm_dpo/beta": 0.0015975853893905878, "fcm_dpo/delta": 0.08602797240018845, "fcm_dpo/margin": 198.1026153564453, "fcm_dpo/q_t": 0.43039628863334656, "grad_norm": 62.545352935791016, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.652934730052948, "logits/rejected": -0.626418948173523, "logps/chosen": -561.2822265625, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -750.934814453125, "loss": 1.195, "margin_dpo/margin_mean": 198.1026153564453, "margin_dpo/margin_std": 413.45147705078125, "step": 420 }, { "KL/chosen_KL_mean": -400.49127197265625, "KL/mean": -509.7319030761719, "KL/rejected_KL_mean": -618.9725341796875, "KL/std": 300.7071533203125, "epoch": 0.618208516886931, "fcm_dpo/beta": 0.001619070884771645, "fcm_dpo/delta": 0.04750995337963104, "fcm_dpo/margin": 218.4813232421875, "fcm_dpo/q_t": 0.4164373278617859, "grad_norm": 31.18450927734375, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.6628319025039673, "logits/rejected": -0.6538623571395874, "logps/chosen": -454.25714111328125, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -708.2540283203125, "loss": 1.1261, "margin_dpo/margin_mean": 218.4813232421875, "margin_dpo/margin_std": 307.9769287109375, "step": 421 }, { "KL/chosen_KL_mean": -447.5567626953125, "KL/mean": -573.166015625, "KL/rejected_KL_mean": -698.7752685546875, "KL/std": 324.9825134277344, "epoch": 0.6196769456681351, "fcm_dpo/beta": 0.001614267472177744, "fcm_dpo/delta": -0.005787511821836233, "fcm_dpo/margin": 251.21852111816406, "fcm_dpo/q_t": 0.4061550498008728, "grad_norm": 39.39680480957031, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.6519845724105835, "logits/rejected": -0.6586755514144897, "logps/chosen": -516.1905517578125, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -786.6387939453125, "loss": 1.1048, "margin_dpo/margin_mean": 251.21853637695312, "margin_dpo/margin_std": 371.8038330078125, "step": 422 }, { "KL/chosen_KL_mean": -412.92333984375, "KL/mean": -550.4478149414062, "KL/rejected_KL_mean": -687.9722900390625, "KL/std": 291.155029296875, "epoch": 0.6211453744493393, "fcm_dpo/beta": 0.0016041090711951256, "fcm_dpo/delta": -0.04313413053750992, "fcm_dpo/margin": 275.04901123046875, "fcm_dpo/q_t": 0.39793986082077026, "grad_norm": 34.45840835571289, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.6469055414199829, "logits/rejected": -0.6586620807647705, "logps/chosen": -467.91363525390625, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -774.2788696289062, "loss": 1.0554, "margin_dpo/margin_mean": 275.04901123046875, "margin_dpo/margin_std": 323.095947265625, "step": 423 }, { "KL/chosen_KL_mean": -366.37255859375, "KL/mean": -494.26788330078125, "KL/rejected_KL_mean": -622.1632080078125, "KL/std": 289.16485595703125, "epoch": 0.6226138032305433, "fcm_dpo/beta": 0.0015932890819385648, "fcm_dpo/delta": -0.008194293826818466, "fcm_dpo/margin": 255.79061889648438, "fcm_dpo/q_t": 0.4079640209674835, "grad_norm": 39.47249221801758, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.6213667392730713, "logits/rejected": -0.6005524396896362, "logps/chosen": -422.38446044921875, "logps/ref_chosen": -56.01192092895508, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -688.6422119140625, "loss": 1.0894, "margin_dpo/margin_mean": 255.79061889648438, "margin_dpo/margin_std": 346.87823486328125, "step": 424 }, { "KL/chosen_KL_mean": -408.7755126953125, "KL/mean": -500.9407958984375, "KL/rejected_KL_mean": -593.10595703125, "KL/std": 269.2953186035156, "epoch": 0.6240822320117474, "fcm_dpo/beta": 0.0016285117017105222, "fcm_dpo/delta": 0.10236521810293198, "fcm_dpo/margin": 184.3304901123047, "fcm_dpo/q_t": 0.42912036180496216, "grad_norm": 33.893001556396484, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.6225741505622864, "logits/rejected": -0.6225865483283997, "logps/chosen": -455.64453125, "logps/ref_chosen": -46.86899948120117, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -689.031494140625, "loss": 1.1745, "margin_dpo/margin_mean": 184.3304901123047, "margin_dpo/margin_std": 319.94305419921875, "step": 425 }, { "KL/chosen_KL_mean": -371.005615234375, "KL/mean": -493.8805236816406, "KL/rejected_KL_mean": -616.7554321289062, "KL/std": 271.8358154296875, "epoch": 0.6255506607929515, "fcm_dpo/beta": 0.0016375456470996141, "fcm_dpo/delta": -0.0028386712074279785, "fcm_dpo/margin": 245.7498321533203, "fcm_dpo/q_t": 0.40605005621910095, "grad_norm": 24.651735305786133, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.603665292263031, "logits/rejected": -0.5721160173416138, "logps/chosen": -447.58917236328125, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -698.0220336914062, "loss": 1.0796, "margin_dpo/margin_mean": 245.7498321533203, "margin_dpo/margin_std": 290.891357421875, "step": 426 }, { "KL/chosen_KL_mean": -333.1865539550781, "KL/mean": -426.1375427246094, "KL/rejected_KL_mean": -519.0885620117188, "KL/std": 232.90151977539062, "epoch": 0.6270190895741556, "fcm_dpo/beta": 0.0016515168827027082, "fcm_dpo/delta": 0.09603013098239899, "fcm_dpo/margin": 185.9020233154297, "fcm_dpo/q_t": 0.4287715554237366, "grad_norm": 25.280298233032227, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6069827079772949, "logits/rejected": -0.5897522568702698, "logps/chosen": -398.0404357910156, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.5660171508789, "logps/rejected": -597.654541015625, "loss": 1.1536, "margin_dpo/margin_mean": 185.9020233154297, "margin_dpo/margin_std": 277.96832275390625, "step": 427 }, { "KL/chosen_KL_mean": -421.74224853515625, "KL/mean": -548.2906494140625, "KL/rejected_KL_mean": -674.8390502929688, "KL/std": 312.8956298828125, "epoch": 0.6284875183553598, "fcm_dpo/beta": 0.001651531783863902, "fcm_dpo/delta": -0.019138701260089874, "fcm_dpo/margin": 253.09681701660156, "fcm_dpo/q_t": 0.4038928151130676, "grad_norm": 31.78797721862793, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.5810732245445251, "logits/rejected": -0.5857928395271301, "logps/chosen": -484.37890625, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28181457519531, "logps/rejected": -778.120849609375, "loss": 1.1004, "margin_dpo/margin_mean": 253.09683227539062, "margin_dpo/margin_std": 371.6815185546875, "step": 428 }, { "KL/chosen_KL_mean": -435.04058837890625, "KL/mean": -534.104248046875, "KL/rejected_KL_mean": -633.1678466796875, "KL/std": 291.46722412109375, "epoch": 0.6299559471365639, "fcm_dpo/beta": 0.0016507648397237062, "fcm_dpo/delta": -0.03221222758293152, "fcm_dpo/margin": 198.1272430419922, "fcm_dpo/q_t": 0.4247916340827942, "grad_norm": 31.24496078491211, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.5936084985733032, "logits/rejected": -0.570778489112854, "logps/chosen": -516.2745971679688, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -724.9627685546875, "loss": 1.1706, "margin_dpo/margin_mean": 198.1272430419922, "margin_dpo/margin_std": 350.78887939453125, "step": 429 }, { "KL/chosen_KL_mean": -348.882568359375, "KL/mean": -453.02349853515625, "KL/rejected_KL_mean": -557.1644287109375, "KL/std": 258.8692321777344, "epoch": 0.631424375917768, "fcm_dpo/beta": 0.0016623124247416854, "fcm_dpo/delta": 0.055517442524433136, "fcm_dpo/margin": 208.2819061279297, "fcm_dpo/q_t": 0.41966137290000916, "grad_norm": 36.20037078857422, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5428692102432251, "logits/rejected": -0.5417746305465698, "logps/chosen": -409.8028564453125, "logps/ref_chosen": -60.920326232910156, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -661.5872802734375, "loss": 1.122, "margin_dpo/margin_mean": 208.2819061279297, "margin_dpo/margin_std": 283.18389892578125, "step": 430 }, { "KL/chosen_KL_mean": -318.82403564453125, "KL/mean": -467.67669677734375, "KL/rejected_KL_mean": -616.529296875, "KL/std": 278.1539001464844, "epoch": 0.6328928046989721, "fcm_dpo/beta": 0.0016387823270633817, "fcm_dpo/delta": -0.09255114197731018, "fcm_dpo/margin": 297.705322265625, "fcm_dpo/q_t": 0.3862955570220947, "grad_norm": 29.066301345825195, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.5722877383232117, "logits/rejected": -0.5794203281402588, "logps/chosen": -376.17279052734375, "logps/ref_chosen": -57.34874725341797, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -709.3695678710938, "loss": 1.0176, "margin_dpo/margin_mean": 297.705322265625, "margin_dpo/margin_std": 304.55963134765625, "step": 431 }, { "KL/chosen_KL_mean": -330.6824645996094, "KL/mean": -471.57916259765625, "KL/rejected_KL_mean": -612.475830078125, "KL/std": 277.7702941894531, "epoch": 0.6343612334801763, "fcm_dpo/beta": 0.0016283730510622263, "fcm_dpo/delta": -0.06209279224276543, "fcm_dpo/margin": 281.7933654785156, "fcm_dpo/q_t": 0.3932652473449707, "grad_norm": 31.19976043701172, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.5432115793228149, "logits/rejected": -0.5331372618675232, "logps/chosen": -401.75726318359375, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -711.055419921875, "loss": 1.0323, "margin_dpo/margin_mean": 281.7933654785156, "margin_dpo/margin_std": 281.0911560058594, "step": 432 }, { "KL/chosen_KL_mean": -419.6158447265625, "KL/mean": -512.61572265625, "KL/rejected_KL_mean": -605.6156005859375, "KL/std": 285.9530029296875, "epoch": 0.6358296622613803, "fcm_dpo/beta": 0.0016389940865337849, "fcm_dpo/delta": 0.09798242151737213, "fcm_dpo/margin": 185.99978637695312, "fcm_dpo/q_t": 0.4270647466182709, "grad_norm": 37.97488784790039, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.6025904417037964, "logits/rejected": -0.591471791267395, "logps/chosen": -477.8890380859375, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -701.5665283203125, "loss": 1.1752, "margin_dpo/margin_mean": 185.99978637695312, "margin_dpo/margin_std": 330.22344970703125, "step": 433 }, { "KL/chosen_KL_mean": -354.8839111328125, "KL/mean": -471.41546630859375, "KL/rejected_KL_mean": -587.9470825195312, "KL/std": 279.48162841796875, "epoch": 0.6372980910425844, "fcm_dpo/beta": 0.0016441468615084887, "fcm_dpo/delta": 0.017331628128886223, "fcm_dpo/margin": 233.06314086914062, "fcm_dpo/q_t": 0.41576558351516724, "grad_norm": 20.974876403808594, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.5894551873207092, "logits/rejected": -0.5889327526092529, "logps/chosen": -416.8576354980469, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -666.4456787109375, "loss": 1.112, "margin_dpo/margin_mean": 233.06314086914062, "margin_dpo/margin_std": 350.6734619140625, "step": 434 }, { "KL/chosen_KL_mean": -339.70953369140625, "KL/mean": -465.9414978027344, "KL/rejected_KL_mean": -592.1734619140625, "KL/std": 277.168701171875, "epoch": 0.6387665198237885, "fcm_dpo/beta": 0.001645256532356143, "fcm_dpo/delta": -0.016068164259195328, "fcm_dpo/margin": 252.4639129638672, "fcm_dpo/q_t": 0.40314286947250366, "grad_norm": 31.517038345336914, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.6623108386993408, "logits/rejected": -0.656818151473999, "logps/chosen": -391.21160888671875, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -679.7403564453125, "loss": 1.0747, "margin_dpo/margin_mean": 252.46392822265625, "margin_dpo/margin_std": 302.1197509765625, "step": 435 }, { "KL/chosen_KL_mean": -361.44921875, "KL/mean": -471.92071533203125, "KL/rejected_KL_mean": -582.3922119140625, "KL/std": 250.78880310058594, "epoch": 0.6402349486049926, "fcm_dpo/beta": 0.0016492058057338, "fcm_dpo/delta": 0.03687084838747978, "fcm_dpo/margin": 220.94302368164062, "fcm_dpo/q_t": 0.4155094027519226, "grad_norm": 41.7802619934082, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.604122519493103, "logits/rejected": -0.5753868222236633, "logps/chosen": -432.8529052734375, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -665.1199951171875, "loss": 1.1117, "margin_dpo/margin_mean": 220.94302368164062, "margin_dpo/margin_std": 296.26336669921875, "step": 436 }, { "KL/chosen_KL_mean": -363.95184326171875, "KL/mean": -478.0007629394531, "KL/rejected_KL_mean": -592.0496826171875, "KL/std": 241.47042846679688, "epoch": 0.6417033773861968, "fcm_dpo/beta": 0.0016615703934803605, "fcm_dpo/delta": 0.021829720586538315, "fcm_dpo/margin": 228.0978240966797, "fcm_dpo/q_t": 0.4112783670425415, "grad_norm": 22.802751541137695, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.6008163690567017, "logits/rejected": -0.5768181681632996, "logps/chosen": -428.69610595703125, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -674.0932006835938, "loss": 1.0971, "margin_dpo/margin_mean": 228.09780883789062, "margin_dpo/margin_std": 287.695556640625, "step": 437 }, { "KL/chosen_KL_mean": -375.4371337890625, "KL/mean": -506.3113708496094, "KL/rejected_KL_mean": -637.185546875, "KL/std": 277.33953857421875, "epoch": 0.6431718061674009, "fcm_dpo/beta": 0.0016493103466928005, "fcm_dpo/delta": -0.03368060290813446, "fcm_dpo/margin": 261.7484436035156, "fcm_dpo/q_t": 0.39925286173820496, "grad_norm": 27.77182388305664, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.6229407787322998, "logits/rejected": -0.6052076816558838, "logps/chosen": -434.455810546875, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682800292969, "logps/rejected": -720.2623901367188, "loss": 1.0575, "margin_dpo/margin_mean": 261.7484436035156, "margin_dpo/margin_std": 294.62152099609375, "step": 438 }, { "KL/chosen_KL_mean": -397.02178955078125, "KL/mean": -500.23809814453125, "KL/rejected_KL_mean": -603.4544067382812, "KL/std": 279.9316711425781, "epoch": 0.644640234948605, "fcm_dpo/beta": 0.001632218947634101, "fcm_dpo/delta": -0.06549854576587677, "fcm_dpo/margin": 206.4326171875, "fcm_dpo/q_t": 0.42261505126953125, "grad_norm": 28.284347534179688, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.6147496700286865, "logits/rejected": -0.6177977323532104, "logps/chosen": -450.8058776855469, "logps/ref_chosen": -53.78407669067383, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -687.4398803710938, "loss": 1.1492, "margin_dpo/margin_mean": 206.43260192871094, "margin_dpo/margin_std": 320.11016845703125, "step": 439 }, { "KL/chosen_KL_mean": -427.0113525390625, "KL/mean": -549.5577392578125, "KL/rejected_KL_mean": -672.10400390625, "KL/std": 337.44097900390625, "epoch": 0.6461086637298091, "fcm_dpo/beta": 0.0016363917384296656, "fcm_dpo/delta": -0.0012616775929927826, "fcm_dpo/margin": 245.09262084960938, "fcm_dpo/q_t": 0.4104476571083069, "grad_norm": 33.926116943359375, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.6553194522857666, "logits/rejected": -0.6481237411499023, "logps/chosen": -505.5780944824219, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -768.601806640625, "loss": 1.0939, "margin_dpo/margin_mean": 245.0926513671875, "margin_dpo/margin_std": 338.4553527832031, "step": 440 }, { "KL/chosen_KL_mean": -478.03643798828125, "KL/mean": -599.6422729492188, "KL/rejected_KL_mean": -721.2481689453125, "KL/std": 347.5352783203125, "epoch": 0.6475770925110133, "fcm_dpo/beta": 0.0016381317982450128, "fcm_dpo/delta": 0.0013750754296779633, "fcm_dpo/margin": 243.21163940429688, "fcm_dpo/q_t": 0.4130883812904358, "grad_norm": 35.49950408935547, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.6869616508483887, "logits/rejected": -0.6749493479728699, "logps/chosen": -538.86083984375, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -817.7189331054688, "loss": 1.1354, "margin_dpo/margin_mean": 243.21163940429688, "margin_dpo/margin_std": 415.7077331542969, "step": 441 }, { "KL/chosen_KL_mean": -415.85516357421875, "KL/mean": -557.4616088867188, "KL/rejected_KL_mean": -699.0679931640625, "KL/std": 336.6131591796875, "epoch": 0.6490455212922174, "fcm_dpo/beta": 0.0016285094898194075, "fcm_dpo/delta": -0.06479034572839737, "fcm_dpo/margin": 283.212890625, "fcm_dpo/q_t": 0.39779287576675415, "grad_norm": 30.064672470092773, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.6426968574523926, "logits/rejected": -0.6355198621749878, "logps/chosen": -462.8663635253906, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -775.6072998046875, "loss": 1.0672, "margin_dpo/margin_mean": 283.212890625, "margin_dpo/margin_std": 374.67840576171875, "step": 442 }, { "KL/chosen_KL_mean": -472.62872314453125, "KL/mean": -562.2388305664062, "KL/rejected_KL_mean": -651.8489990234375, "KL/std": 327.6013488769531, "epoch": 0.6505139500734214, "fcm_dpo/beta": 0.0016116888727992773, "fcm_dpo/delta": 0.0013880077749490738, "fcm_dpo/margin": 179.22032165527344, "fcm_dpo/q_t": 0.4350769817829132, "grad_norm": 37.92525863647461, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6569748520851135, "logits/rejected": -0.636266827583313, "logps/chosen": -543.9017333984375, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -738.5289306640625, "loss": 1.2139, "margin_dpo/margin_mean": 179.22032165527344, "margin_dpo/margin_std": 391.09747314453125, "step": 443 }, { "KL/chosen_KL_mean": -461.75042724609375, "KL/mean": -612.273193359375, "KL/rejected_KL_mean": -762.7958984375, "KL/std": 354.7071533203125, "epoch": 0.6519823788546255, "fcm_dpo/beta": 0.001586769474670291, "fcm_dpo/delta": -0.0820961445569992, "fcm_dpo/margin": 301.04541015625, "fcm_dpo/q_t": 0.3927891254425049, "grad_norm": 28.29939842224121, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.606819748878479, "logits/rejected": -0.6156275272369385, "logps/chosen": -518.9641723632812, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489807128906, "logps/rejected": -860.05078125, "loss": 1.0506, "margin_dpo/margin_mean": 301.04541015625, "margin_dpo/margin_std": 390.89599609375, "step": 444 }, { "KL/chosen_KL_mean": -402.9183044433594, "KL/mean": -539.9268798828125, "KL/rejected_KL_mean": -676.935546875, "KL/std": 278.75872802734375, "epoch": 0.6534508076358296, "fcm_dpo/beta": 0.0015799321699887514, "fcm_dpo/delta": -0.03441721200942993, "fcm_dpo/margin": 274.0172119140625, "fcm_dpo/q_t": 0.4012291133403778, "grad_norm": 31.61493492126465, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.6219183206558228, "logits/rejected": -0.6123214960098267, "logps/chosen": -470.21807861328125, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267059326172, "logps/rejected": -769.6182250976562, "loss": 1.0706, "margin_dpo/margin_mean": 274.0172424316406, "margin_dpo/margin_std": 343.6361083984375, "step": 445 }, { "KL/chosen_KL_mean": -358.52130126953125, "KL/mean": -507.340087890625, "KL/rejected_KL_mean": -656.158935546875, "KL/std": 294.689453125, "epoch": 0.6549192364170338, "fcm_dpo/beta": 0.0015673264861106873, "fcm_dpo/delta": -0.0700267031788826, "fcm_dpo/margin": 297.6376037597656, "fcm_dpo/q_t": 0.3905888795852661, "grad_norm": 33.535675048828125, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.6319636106491089, "logits/rejected": -0.6446236371994019, "logps/chosen": -417.61981201171875, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -757.423095703125, "loss": 1.0286, "margin_dpo/margin_mean": 297.6376037597656, "margin_dpo/margin_std": 300.60076904296875, "step": 446 }, { "KL/chosen_KL_mean": -370.45916748046875, "KL/mean": -522.060791015625, "KL/rejected_KL_mean": -673.6624755859375, "KL/std": 334.0948486328125, "epoch": 0.6563876651982379, "fcm_dpo/beta": 0.001533093280158937, "fcm_dpo/delta": -0.06830260902643204, "fcm_dpo/margin": 303.20330810546875, "fcm_dpo/q_t": 0.39436084032058716, "grad_norm": 30.70143699645996, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.6682947874069214, "logits/rejected": -0.6798413395881653, "logps/chosen": -426.53448486328125, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -772.357177734375, "loss": 1.0422, "margin_dpo/margin_mean": 303.20330810546875, "margin_dpo/margin_std": 361.6494140625, "step": 447 }, { "KL/chosen_KL_mean": -412.73431396484375, "KL/mean": -553.443359375, "KL/rejected_KL_mean": -694.1524658203125, "KL/std": 292.61798095703125, "epoch": 0.657856093979442, "fcm_dpo/beta": 0.001529197907075286, "fcm_dpo/delta": -0.03181178867816925, "fcm_dpo/margin": 281.41815185546875, "fcm_dpo/q_t": 0.4007987380027771, "grad_norm": 32.050010681152344, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.5674072504043579, "logits/rejected": -0.5757938623428345, "logps/chosen": -472.7381591796875, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -796.4171142578125, "loss": 1.0647, "margin_dpo/margin_mean": 281.41815185546875, "margin_dpo/margin_std": 337.77032470703125, "step": 448 }, { "KL/chosen_KL_mean": -404.6903076171875, "KL/mean": -552.0719604492188, "KL/rejected_KL_mean": -699.4535522460938, "KL/std": 354.35406494140625, "epoch": 0.6593245227606461, "fcm_dpo/beta": 0.0015087838983163238, "fcm_dpo/delta": -0.046950291842222214, "fcm_dpo/margin": 294.76324462890625, "fcm_dpo/q_t": 0.4015204608440399, "grad_norm": 29.442913055419922, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.6377573013305664, "logits/rejected": -0.6385862231254578, "logps/chosen": -472.2169189453125, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -788.0504760742188, "loss": 1.0779, "margin_dpo/margin_mean": 294.76324462890625, "margin_dpo/margin_std": 421.296142578125, "step": 449 }, { "KL/chosen_KL_mean": -330.97320556640625, "KL/mean": -487.33258056640625, "KL/rejected_KL_mean": -643.69189453125, "KL/std": 313.515625, "epoch": 0.6607929515418502, "fcm_dpo/beta": 0.0014938064850866795, "fcm_dpo/delta": -0.07037577033042908, "fcm_dpo/margin": 312.7186584472656, "fcm_dpo/q_t": 0.3899462819099426, "grad_norm": 41.06943893432617, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6237972974777222, "logits/rejected": -0.6363176107406616, "logps/chosen": -388.0813293457031, "logps/ref_chosen": -57.10811996459961, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -746.44677734375, "loss": 1.0229, "margin_dpo/margin_mean": 312.7186584472656, "margin_dpo/margin_std": 310.42205810546875, "step": 450 }, { "KL/chosen_KL_mean": -439.8880920410156, "KL/mean": -556.1387329101562, "KL/rejected_KL_mean": -672.389404296875, "KL/std": 351.8367004394531, "epoch": 0.6622613803230544, "fcm_dpo/beta": 0.0015017553232610226, "fcm_dpo/delta": 0.052446845918893814, "fcm_dpo/margin": 232.50131225585938, "fcm_dpo/q_t": 0.41802603006362915, "grad_norm": 32.329261779785156, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.6703058481216431, "logits/rejected": -0.6499719619750977, "logps/chosen": -498.35693359375, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -745.3187866210938, "loss": 1.1564, "margin_dpo/margin_mean": 232.50131225585938, "margin_dpo/margin_std": 410.9925537109375, "step": 451 }, { "KL/chosen_KL_mean": -307.5163269042969, "KL/mean": -446.1638488769531, "KL/rejected_KL_mean": -584.8113403320312, "KL/std": 270.72735595703125, "epoch": 0.6637298091042585, "fcm_dpo/beta": 0.001496224314905703, "fcm_dpo/delta": -0.015626225620508194, "fcm_dpo/margin": 277.29498291015625, "fcm_dpo/q_t": 0.4016070067882538, "grad_norm": 22.36501693725586, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.5967873930931091, "logits/rejected": -0.6146633625030518, "logps/chosen": -357.9068908691406, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77142333984375, "logps/rejected": -682.582763671875, "loss": 1.0545, "margin_dpo/margin_mean": 277.29498291015625, "margin_dpo/margin_std": 275.4556579589844, "step": 452 }, { "KL/chosen_KL_mean": -370.3055419921875, "KL/mean": -499.5263671875, "KL/rejected_KL_mean": -628.7472534179688, "KL/std": 281.7098388671875, "epoch": 0.6651982378854625, "fcm_dpo/beta": 0.001497291261330247, "fcm_dpo/delta": 0.013492653146386147, "fcm_dpo/margin": 258.44171142578125, "fcm_dpo/q_t": 0.4115417003631592, "grad_norm": 27.039974212646484, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.6106635332107544, "logits/rejected": -0.5978103876113892, "logps/chosen": -428.0203857421875, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -710.9547119140625, "loss": 1.0961, "margin_dpo/margin_mean": 258.44171142578125, "margin_dpo/margin_std": 340.037353515625, "step": 453 }, { "KL/chosen_KL_mean": -440.86590576171875, "KL/mean": -588.3302001953125, "KL/rejected_KL_mean": -735.7944946289062, "KL/std": 319.46673583984375, "epoch": 0.6666666666666666, "fcm_dpo/beta": 0.0014888541772961617, "fcm_dpo/delta": -0.041121020913124084, "fcm_dpo/margin": 294.9285888671875, "fcm_dpo/q_t": 0.3986594080924988, "grad_norm": 27.240726470947266, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.6478193402290344, "logits/rejected": -0.6373401880264282, "logps/chosen": -501.8115234375, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.95079040527344, "logps/rejected": -820.7452392578125, "loss": 1.0565, "margin_dpo/margin_mean": 294.9285888671875, "margin_dpo/margin_std": 350.4466857910156, "step": 454 }, { "KL/chosen_KL_mean": -384.628662109375, "KL/mean": -554.451416015625, "KL/rejected_KL_mean": -724.274169921875, "KL/std": 327.93011474609375, "epoch": 0.6681350954478708, "fcm_dpo/beta": 0.0014744448708370328, "fcm_dpo/delta": -0.10616149008274078, "fcm_dpo/margin": 339.64556884765625, "fcm_dpo/q_t": 0.3856102526187897, "grad_norm": 32.356143951416016, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6396904587745667, "logits/rejected": -0.6583499908447266, "logps/chosen": -429.5153503417969, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -839.57568359375, "loss": 1.0187, "margin_dpo/margin_mean": 339.64556884765625, "margin_dpo/margin_std": 368.43084716796875, "step": 455 }, { "KL/chosen_KL_mean": -411.4013977050781, "KL/mean": -578.405029296875, "KL/rejected_KL_mean": -745.4085693359375, "KL/std": 345.68756103515625, "epoch": 0.6696035242290749, "fcm_dpo/beta": 0.0014414741890504956, "fcm_dpo/delta": -0.08550744503736496, "fcm_dpo/margin": 334.0072021484375, "fcm_dpo/q_t": 0.38874322175979614, "grad_norm": 29.948514938354492, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.6326814889907837, "logits/rejected": -0.6543838977813721, "logps/chosen": -468.43817138671875, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21784210205078, "logps/rejected": -850.6264038085938, "loss": 1.0142, "margin_dpo/margin_mean": 334.0072021484375, "margin_dpo/margin_std": 335.52606201171875, "step": 456 }, { "KL/chosen_KL_mean": -410.80609130859375, "KL/mean": -580.7474365234375, "KL/rejected_KL_mean": -750.6888427734375, "KL/std": 341.3404541015625, "epoch": 0.671071953010279, "fcm_dpo/beta": 0.0014121406711637974, "fcm_dpo/delta": -0.08430389314889908, "fcm_dpo/margin": 339.88275146484375, "fcm_dpo/q_t": 0.3872183859348297, "grad_norm": 36.677040100097656, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.6665825843811035, "logits/rejected": -0.6668688058853149, "logps/chosen": -465.0486145019531, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -835.79833984375, "loss": 1.0146, "margin_dpo/margin_mean": 339.8827209472656, "margin_dpo/margin_std": 330.81744384765625, "step": 457 }, { "KL/chosen_KL_mean": -430.34912109375, "KL/mean": -581.6856689453125, "KL/rejected_KL_mean": -733.022216796875, "KL/std": 319.516357421875, "epoch": 0.6725403817914831, "fcm_dpo/beta": 0.0013981210067868233, "fcm_dpo/delta": -0.0246875062584877, "fcm_dpo/margin": 302.67303466796875, "fcm_dpo/q_t": 0.40338659286499023, "grad_norm": 24.763858795166016, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.6812525987625122, "logits/rejected": -0.6768727898597717, "logps/chosen": -485.75799560546875, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -830.7054443359375, "loss": 1.0701, "margin_dpo/margin_mean": 302.6730651855469, "margin_dpo/margin_std": 369.3429260253906, "step": 458 }, { "KL/chosen_KL_mean": -459.40435791015625, "KL/mean": -625.1992797851562, "KL/rejected_KL_mean": -790.994140625, "KL/std": 361.43035888671875, "epoch": 0.6740088105726872, "fcm_dpo/beta": 0.001385183772072196, "fcm_dpo/delta": -0.0625496357679367, "fcm_dpo/margin": 331.5897216796875, "fcm_dpo/q_t": 0.3948526680469513, "grad_norm": 32.20987319946289, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.6852984428405762, "logits/rejected": -0.7049773931503296, "logps/chosen": -505.96185302734375, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -877.1627197265625, "loss": 1.0536, "margin_dpo/margin_mean": 331.5897521972656, "margin_dpo/margin_std": 406.781982421875, "step": 459 }, { "KL/chosen_KL_mean": -512.4947509765625, "KL/mean": -693.9195556640625, "KL/rejected_KL_mean": -875.3443603515625, "KL/std": 360.5986328125, "epoch": 0.6754772393538914, "fcm_dpo/beta": 0.0013653924688696861, "fcm_dpo/delta": -0.10034875571727753, "fcm_dpo/margin": 362.849609375, "fcm_dpo/q_t": 0.3863321542739868, "grad_norm": 40.046512603759766, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.7233697772026062, "logits/rejected": -0.7532409429550171, "logps/chosen": -564.1296997070312, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -979.4637451171875, "loss": 1.0176, "margin_dpo/margin_mean": 362.849609375, "margin_dpo/margin_std": 396.6766357421875, "step": 460 }, { "KL/chosen_KL_mean": -548.9373779296875, "KL/mean": -692.796142578125, "KL/rejected_KL_mean": -836.6549072265625, "KL/std": 373.7737731933594, "epoch": 0.6769456681350955, "fcm_dpo/beta": 0.0013628401793539524, "fcm_dpo/delta": 0.008031336590647697, "fcm_dpo/margin": 287.7176208496094, "fcm_dpo/q_t": 0.41146761178970337, "grad_norm": 25.319110870361328, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.7724506855010986, "logits/rejected": -0.7853858470916748, "logps/chosen": -604.1192626953125, "logps/ref_chosen": -55.18195724487305, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -923.1318359375, "loss": 1.1035, "margin_dpo/margin_mean": 287.71759033203125, "margin_dpo/margin_std": 409.4786071777344, "step": 461 }, { "KL/chosen_KL_mean": -559.73583984375, "KL/mean": -675.0146484375, "KL/rejected_KL_mean": -790.2933959960938, "KL/std": 367.716552734375, "epoch": 0.6784140969162996, "fcm_dpo/beta": 0.0013765160692855716, "fcm_dpo/delta": 0.08537392318248749, "fcm_dpo/margin": 230.5576171875, "fcm_dpo/q_t": 0.428949773311615, "grad_norm": 43.80839920043945, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.8070446848869324, "logits/rejected": -0.7969012260437012, "logps/chosen": -629.663818359375, "logps/ref_chosen": -69.92803192138672, "logps/ref_rejected": -78.84111022949219, "logps/rejected": -869.134521484375, "loss": 1.1705, "margin_dpo/margin_mean": 230.55758666992188, "margin_dpo/margin_std": 416.346923828125, "step": 462 }, { "KL/chosen_KL_mean": -559.2445068359375, "KL/mean": -710.7879638671875, "KL/rejected_KL_mean": -862.3314208984375, "KL/std": 379.8200378417969, "epoch": 0.6798825256975036, "fcm_dpo/beta": 0.0013845614157617092, "fcm_dpo/delta": -0.02077137678861618, "fcm_dpo/margin": 303.08685302734375, "fcm_dpo/q_t": 0.405579149723053, "grad_norm": 37.92364501953125, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.7985125780105591, "logits/rejected": -0.8062667846679688, "logps/chosen": -614.5189208984375, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -951.3563842773438, "loss": 1.0939, "margin_dpo/margin_mean": 303.08685302734375, "margin_dpo/margin_std": 430.80828857421875, "step": 463 }, { "KL/chosen_KL_mean": -555.7085571289062, "KL/mean": -794.9366455078125, "KL/rejected_KL_mean": -1034.1646728515625, "KL/std": 459.53875732421875, "epoch": 0.6813509544787077, "fcm_dpo/beta": 0.0013179676607251167, "fcm_dpo/delta": -0.24893316626548767, "fcm_dpo/margin": 478.4560546875, "fcm_dpo/q_t": 0.3569212555885315, "grad_norm": 38.354400634765625, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.7893344163894653, "logits/rejected": -0.8504258990287781, "logps/chosen": -606.620849609375, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -1136.654052734375, "loss": 0.936, "margin_dpo/margin_mean": 478.4560546875, "margin_dpo/margin_std": 467.5229187011719, "step": 464 }, { "KL/chosen_KL_mean": -564.8525390625, "KL/mean": -755.03271484375, "KL/rejected_KL_mean": -945.2127075195312, "KL/std": 466.654052734375, "epoch": 0.6828193832599119, "fcm_dpo/beta": 0.0012960683088749647, "fcm_dpo/delta": -0.09770198166370392, "fcm_dpo/margin": 380.36016845703125, "fcm_dpo/q_t": 0.3856911063194275, "grad_norm": 38.300540924072266, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.793292760848999, "logits/rejected": -0.8128570318222046, "logps/chosen": -624.9694213867188, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -1059.15869140625, "loss": 1.046, "margin_dpo/margin_mean": 380.36016845703125, "margin_dpo/margin_std": 488.3748779296875, "step": 465 }, { "KL/chosen_KL_mean": -626.0338134765625, "KL/mean": -793.6627197265625, "KL/rejected_KL_mean": -961.291748046875, "KL/std": 443.3585205078125, "epoch": 0.684287812041116, "fcm_dpo/beta": 0.0012731440365314484, "fcm_dpo/delta": -0.028695937246084213, "fcm_dpo/margin": 335.2580261230469, "fcm_dpo/q_t": 0.40266337990760803, "grad_norm": 36.897422790527344, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.8305766582489014, "logits/rejected": -0.8335669040679932, "logps/chosen": -678.9547119140625, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -1051.607177734375, "loss": 1.0959, "margin_dpo/margin_mean": 335.2580261230469, "margin_dpo/margin_std": 487.9477233886719, "step": 466 }, { "KL/chosen_KL_mean": -786.627685546875, "KL/mean": -963.7129516601562, "KL/rejected_KL_mean": -1140.7982177734375, "KL/std": 571.1072998046875, "epoch": 0.6857562408223201, "fcm_dpo/beta": 0.0012568333186209202, "fcm_dpo/delta": -0.049399569630622864, "fcm_dpo/margin": 354.17047119140625, "fcm_dpo/q_t": 0.403271347284317, "grad_norm": 49.63898468017578, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.9358654022216797, "logits/rejected": -0.9291361570358276, "logps/chosen": -865.343505859375, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -1243.658447265625, "loss": 1.1501, "margin_dpo/margin_mean": 354.17047119140625, "margin_dpo/margin_std": 657.70361328125, "step": 467 }, { "KL/chosen_KL_mean": -622.95849609375, "KL/mean": -845.2493896484375, "KL/rejected_KL_mean": -1067.540283203125, "KL/std": 515.7662963867188, "epoch": 0.6872246696035242, "fcm_dpo/beta": 0.0012354985810816288, "fcm_dpo/delta": -0.15800079703330994, "fcm_dpo/margin": 444.5817565917969, "fcm_dpo/q_t": 0.38068056106567383, "grad_norm": 39.59514617919922, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.9017723798751831, "logits/rejected": -0.9113543629646301, "logps/chosen": -692.8939208984375, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02880859375, "logps/rejected": -1168.569091796875, "loss": 1.017, "margin_dpo/margin_mean": 444.5817565917969, "margin_dpo/margin_std": 578.4088745117188, "step": 468 }, { "KL/chosen_KL_mean": -662.3765258789062, "KL/mean": -883.9148559570312, "KL/rejected_KL_mean": -1105.453125, "KL/std": 464.3486022949219, "epoch": 0.6886930983847284, "fcm_dpo/beta": 0.0011932153720408678, "fcm_dpo/delta": -0.13762570917606354, "fcm_dpo/margin": 443.07666015625, "fcm_dpo/q_t": 0.37997373938560486, "grad_norm": 33.920169830322266, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.9452608227729797, "logits/rejected": -0.9607683420181274, "logps/chosen": -730.501220703125, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -1210.239501953125, "loss": 1.0151, "margin_dpo/margin_mean": 443.07666015625, "margin_dpo/margin_std": 517.1961669921875, "step": 469 }, { "KL/chosen_KL_mean": -561.1131591796875, "KL/mean": -754.48193359375, "KL/rejected_KL_mean": -947.8507080078125, "KL/std": 474.6820068359375, "epoch": 0.6901615271659325, "fcm_dpo/beta": 0.001185485627502203, "fcm_dpo/delta": -0.06129393354058266, "fcm_dpo/margin": 386.737548828125, "fcm_dpo/q_t": 0.3954838514328003, "grad_norm": 28.70859718322754, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.885380744934082, "logits/rejected": -0.8969517350196838, "logps/chosen": -604.9050903320312, "logps/ref_chosen": -43.791927337646484, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -1030.5535888671875, "loss": 1.0719, "margin_dpo/margin_mean": 386.737548828125, "margin_dpo/margin_std": 531.3819580078125, "step": 470 }, { "KL/chosen_KL_mean": -726.427734375, "KL/mean": -887.2966918945312, "KL/rejected_KL_mean": -1048.165771484375, "KL/std": 499.47198486328125, "epoch": 0.6916299559471366, "fcm_dpo/beta": 0.0011751014972105622, "fcm_dpo/delta": 0.022231273353099823, "fcm_dpo/margin": 321.73797607421875, "fcm_dpo/q_t": 0.41689130663871765, "grad_norm": 37.73147964477539, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.9937692880630493, "logits/rejected": -1.0016134977340698, "logps/chosen": -789.7672119140625, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -1131.776123046875, "loss": 1.1346, "margin_dpo/margin_mean": 321.73797607421875, "margin_dpo/margin_std": 533.201904296875, "step": 471 }, { "KL/chosen_KL_mean": -712.793701171875, "KL/mean": -912.88037109375, "KL/rejected_KL_mean": -1112.967041015625, "KL/std": 599.86572265625, "epoch": 0.6930983847283406, "fcm_dpo/beta": 0.0011670588282868266, "fcm_dpo/delta": -0.07075389474630356, "fcm_dpo/margin": 400.17333984375, "fcm_dpo/q_t": 0.4006522297859192, "grad_norm": 35.72392272949219, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.954893946647644, "logits/rejected": -0.93065345287323, "logps/chosen": -796.4598388671875, "logps/ref_chosen": -83.66610717773438, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -1230.17626953125, "loss": 1.102, "margin_dpo/margin_mean": 400.17333984375, "margin_dpo/margin_std": 645.8973999023438, "step": 472 }, { "KL/chosen_KL_mean": -852.4325561523438, "KL/mean": -943.5150146484375, "KL/rejected_KL_mean": -1034.5975341796875, "KL/std": 626.2051391601562, "epoch": 0.6945668135095447, "fcm_dpo/beta": 0.0011666135396808386, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 182.16502380371094, "fcm_dpo/q_t": 0.45503491163253784, "grad_norm": 116.40143585205078, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -1.0035320520401, "logits/rejected": -0.9779636859893799, "logps/chosen": -915.9295654296875, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -1115.744140625, "loss": 1.3781, "margin_dpo/margin_mean": 182.16500854492188, "margin_dpo/margin_std": 820.9163208007812, "step": 473 }, { "KL/chosen_KL_mean": -686.1465454101562, "KL/mean": -849.884521484375, "KL/rejected_KL_mean": -1013.62255859375, "KL/std": 536.9088134765625, "epoch": 0.6960352422907489, "fcm_dpo/beta": 0.0011568089248612523, "fcm_dpo/delta": -0.0847577303647995, "fcm_dpo/margin": 327.47601318359375, "fcm_dpo/q_t": 0.41386541724205017, "grad_norm": 36.65470886230469, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.944141149520874, "logits/rejected": -0.9484027624130249, "logps/chosen": -738.7584838867188, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -1103.7030029296875, "loss": 1.1566, "margin_dpo/margin_mean": 327.47601318359375, "margin_dpo/margin_std": 581.762939453125, "step": 474 }, { "KL/chosen_KL_mean": -497.8739013671875, "KL/mean": -715.1051025390625, "KL/rejected_KL_mean": -932.3363037109375, "KL/std": 428.21868896484375, "epoch": 0.697503671071953, "fcm_dpo/beta": 0.0011289971880614758, "fcm_dpo/delta": -0.09538697451353073, "fcm_dpo/margin": 434.46240234375, "fcm_dpo/q_t": 0.38658300042152405, "grad_norm": 42.494869232177734, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.9651395082473755, "logits/rejected": -0.9933245182037354, "logps/chosen": -540.369140625, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06294250488281, "logps/rejected": -1022.3992919921875, "loss": 1.017, "margin_dpo/margin_mean": 434.46240234375, "margin_dpo/margin_std": 460.6612548828125, "step": 475 }, { "KL/chosen_KL_mean": -643.5517578125, "KL/mean": -811.9400634765625, "KL/rejected_KL_mean": -980.328369140625, "KL/std": 494.214111328125, "epoch": 0.6989720998531571, "fcm_dpo/beta": 0.0011274400167167187, "fcm_dpo/delta": 0.021099748089909554, "fcm_dpo/margin": 336.776611328125, "fcm_dpo/q_t": 0.4138496518135071, "grad_norm": 58.33089065551758, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.9680467844009399, "logits/rejected": -0.985359787940979, "logps/chosen": -686.5010986328125, "logps/ref_chosen": -42.94938278198242, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -1054.03857421875, "loss": 1.108, "margin_dpo/margin_mean": 336.776611328125, "margin_dpo/margin_std": 474.5893859863281, "step": 476 }, { "KL/chosen_KL_mean": -671.0191040039062, "KL/mean": -845.5562744140625, "KL/rejected_KL_mean": -1020.0934448242188, "KL/std": 523.93212890625, "epoch": 0.7004405286343612, "fcm_dpo/beta": 0.0011346408864483237, "fcm_dpo/delta": 0.0038326121866703033, "fcm_dpo/margin": 349.0743408203125, "fcm_dpo/q_t": 0.41092249751091003, "grad_norm": 32.38017272949219, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.9727839231491089, "logits/rejected": -0.9398672580718994, "logps/chosen": -741.791748046875, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -1096.2308349609375, "loss": 1.138, "margin_dpo/margin_mean": 349.0743408203125, "margin_dpo/margin_std": 609.6284790039062, "step": 477 }, { "KL/chosen_KL_mean": -543.57373046875, "KL/mean": -744.6063842773438, "KL/rejected_KL_mean": -945.6390380859375, "KL/std": 450.7725830078125, "epoch": 0.7019089574155654, "fcm_dpo/beta": 0.001124206930398941, "fcm_dpo/delta": -0.05444270372390747, "fcm_dpo/margin": 402.06536865234375, "fcm_dpo/q_t": 0.39735937118530273, "grad_norm": 33.31148147583008, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.8353407979011536, "logits/rejected": -0.8606827259063721, "logps/chosen": -585.0142822265625, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -1031.0009765625, "loss": 1.0658, "margin_dpo/margin_mean": 402.06536865234375, "margin_dpo/margin_std": 529.6115112304688, "step": 478 }, { "KL/chosen_KL_mean": -704.5205078125, "KL/mean": -897.114501953125, "KL/rejected_KL_mean": -1089.7083740234375, "KL/std": 577.3861083984375, "epoch": 0.7033773861967695, "fcm_dpo/beta": 0.0011205710470676422, "fcm_dpo/delta": -0.0337379164993763, "fcm_dpo/margin": 385.18798828125, "fcm_dpo/q_t": 0.40699535608291626, "grad_norm": 30.347566604614258, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.9103279113769531, "logits/rejected": -0.9409140348434448, "logps/chosen": -758.428466796875, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -1184.82470703125, "loss": 1.1246, "margin_dpo/margin_mean": 385.18798828125, "margin_dpo/margin_std": 668.6539306640625, "step": 479 }, { "KL/chosen_KL_mean": -799.6283569335938, "KL/mean": -950.8204345703125, "KL/rejected_KL_mean": -1102.0125732421875, "KL/std": 511.99639892578125, "epoch": 0.7048458149779736, "fcm_dpo/beta": 0.0011057795491069555, "fcm_dpo/delta": -0.04888079687952995, "fcm_dpo/margin": 302.38421630859375, "fcm_dpo/q_t": 0.4251486659049988, "grad_norm": 53.98848342895508, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.9347141981124878, "logits/rejected": -0.9363481998443604, "logps/chosen": -858.31103515625, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -1184.945068359375, "loss": 1.1745, "margin_dpo/margin_mean": 302.38421630859375, "margin_dpo/margin_std": 548.7213134765625, "step": 480 }, { "KL/chosen_KL_mean": -666.797119140625, "KL/mean": -899.0219116210938, "KL/rejected_KL_mean": -1131.24658203125, "KL/std": 542.1533813476562, "epoch": 0.7063142437591777, "fcm_dpo/beta": 0.0010841806652024388, "fcm_dpo/delta": -0.10892824828624725, "fcm_dpo/margin": 464.44964599609375, "fcm_dpo/q_t": 0.3871780037879944, "grad_norm": 28.775257110595703, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.9464655518531799, "logits/rejected": -0.9776947498321533, "logps/chosen": -721.7613525390625, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -1223.6671142578125, "loss": 1.0317, "margin_dpo/margin_mean": 464.4496154785156, "margin_dpo/margin_std": 576.0606689453125, "step": 481 }, { "KL/chosen_KL_mean": -718.3167114257812, "KL/mean": -820.912353515625, "KL/rejected_KL_mean": -923.5079345703125, "KL/std": 538.7589111328125, "epoch": 0.7077826725403817, "fcm_dpo/beta": 0.0010857656598091125, "fcm_dpo/delta": 0.08385416120290756, "fcm_dpo/margin": 205.1912384033203, "fcm_dpo/q_t": 0.4472288489341736, "grad_norm": 54.073997497558594, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.8339771032333374, "logits/rejected": -0.8247455358505249, "logps/chosen": -785.8701171875, "logps/ref_chosen": -67.553466796875, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -1011.0975341796875, "loss": 1.2821, "margin_dpo/margin_mean": 205.19125366210938, "margin_dpo/margin_std": 625.9072265625, "step": 482 }, { "KL/chosen_KL_mean": -616.08154296875, "KL/mean": -854.05029296875, "KL/rejected_KL_mean": -1092.01904296875, "KL/std": 518.1444091796875, "epoch": 0.7092511013215859, "fcm_dpo/beta": 0.001074553350917995, "fcm_dpo/delta": -0.11760546267032623, "fcm_dpo/margin": 475.93743896484375, "fcm_dpo/q_t": 0.38575249910354614, "grad_norm": 31.249698638916016, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.9016849994659424, "logits/rejected": -0.9181410074234009, "logps/chosen": -679.411376953125, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -1187.805908203125, "loss": 1.0223, "margin_dpo/margin_mean": 475.9373779296875, "margin_dpo/margin_std": 576.4466552734375, "step": 483 }, { "KL/chosen_KL_mean": -521.124755859375, "KL/mean": -732.0531005859375, "KL/rejected_KL_mean": -942.9813842773438, "KL/std": 495.1884765625, "epoch": 0.71071953010279, "fcm_dpo/beta": 0.0010655100923031569, "fcm_dpo/delta": -0.05194704234600067, "fcm_dpo/margin": 421.85662841796875, "fcm_dpo/q_t": 0.3974398374557495, "grad_norm": 62.45989990234375, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.8718733787536621, "logits/rejected": -0.8974796533584595, "logps/chosen": -580.262939453125, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -1027.352783203125, "loss": 1.0546, "margin_dpo/margin_mean": 421.85662841796875, "margin_dpo/margin_std": 506.4351806640625, "step": 484 }, { "KL/chosen_KL_mean": -539.0780639648438, "KL/mean": -738.4888916015625, "KL/rejected_KL_mean": -937.8997192382812, "KL/std": 491.9963073730469, "epoch": 0.7121879588839941, "fcm_dpo/beta": 0.0010573656763881445, "fcm_dpo/delta": -0.022721393033862114, "fcm_dpo/margin": 398.82159423828125, "fcm_dpo/q_t": 0.40495431423187256, "grad_norm": 35.550392150878906, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.794913649559021, "logits/rejected": -0.8159662485122681, "logps/chosen": -597.9276123046875, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408233642578, "logps/rejected": -1041.2637939453125, "loss": 1.089, "margin_dpo/margin_mean": 398.8216552734375, "margin_dpo/margin_std": 560.9061279296875, "step": 485 }, { "KL/chosen_KL_mean": -621.9697265625, "KL/mean": -850.4727172851562, "KL/rejected_KL_mean": -1078.9757080078125, "KL/std": 568.115478515625, "epoch": 0.7136563876651982, "fcm_dpo/beta": 0.0010410689283162355, "fcm_dpo/delta": -0.07956840097904205, "fcm_dpo/margin": 457.0060119628906, "fcm_dpo/q_t": 0.3951270878314972, "grad_norm": 30.52185821533203, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.8822107911109924, "logits/rejected": -0.9047358632087708, "logps/chosen": -677.2293701171875, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -1171.114990234375, "loss": 1.0777, "margin_dpo/margin_mean": 457.00604248046875, "margin_dpo/margin_std": 684.0638427734375, "step": 486 }, { "KL/chosen_KL_mean": -674.0582275390625, "KL/mean": -843.9366455078125, "KL/rejected_KL_mean": -1013.8150634765625, "KL/std": 521.2279663085938, "epoch": 0.7151248164464024, "fcm_dpo/beta": 0.001048167236149311, "fcm_dpo/delta": 0.044978074729442596, "fcm_dpo/margin": 339.7569580078125, "fcm_dpo/q_t": 0.41704893112182617, "grad_norm": 33.21575164794922, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -0.8942869901657104, "logits/rejected": -0.9046221971511841, "logps/chosen": -727.1214599609375, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.41883087158203, "logps/rejected": -1106.23388671875, "loss": 1.1342, "margin_dpo/margin_mean": 339.7569580078125, "margin_dpo/margin_std": 529.3812255859375, "step": 487 }, { "KL/chosen_KL_mean": -542.7528076171875, "KL/mean": -720.9403686523438, "KL/rejected_KL_mean": -899.1280517578125, "KL/std": 461.8782043457031, "epoch": 0.7165932452276065, "fcm_dpo/beta": 0.001053705345839262, "fcm_dpo/delta": 0.02513560838997364, "fcm_dpo/margin": 356.37518310546875, "fcm_dpo/q_t": 0.4128515124320984, "grad_norm": 34.97057342529297, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.8141319751739502, "logits/rejected": -0.8222429752349854, "logps/chosen": -594.98095703125, "logps/ref_chosen": -52.22815704345703, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -983.1345825195312, "loss": 1.0978, "margin_dpo/margin_mean": 356.37518310546875, "margin_dpo/margin_std": 445.29071044921875, "step": 488 }, { "KL/chosen_KL_mean": -500.2234802246094, "KL/mean": -671.4964599609375, "KL/rejected_KL_mean": -842.7694091796875, "KL/std": 455.5792236328125, "epoch": 0.7180616740088106, "fcm_dpo/beta": 0.0010611966717988253, "fcm_dpo/delta": 0.03748384118080139, "fcm_dpo/margin": 342.54595947265625, "fcm_dpo/q_t": 0.4179996848106384, "grad_norm": 27.448223114013672, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.8774303197860718, "logits/rejected": -0.8812981247901917, "logps/chosen": -556.213134765625, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39812469482422, "logps/rejected": -922.1675415039062, "loss": 1.1201, "margin_dpo/margin_mean": 342.5459289550781, "margin_dpo/margin_std": 497.568359375, "step": 489 }, { "KL/chosen_KL_mean": -602.345703125, "KL/mean": -789.0585327148438, "KL/rejected_KL_mean": -975.7713012695312, "KL/std": 554.9224853515625, "epoch": 0.7195301027900147, "fcm_dpo/beta": 0.0010620702523738146, "fcm_dpo/delta": 0.0034542735666036606, "fcm_dpo/margin": 373.4256591796875, "fcm_dpo/q_t": 0.4118698239326477, "grad_norm": 37.27909851074219, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.8731991052627563, "logits/rejected": -0.911872148513794, "logps/chosen": -654.7120361328125, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.4090576171875, "logps/rejected": -1086.180419921875, "loss": 1.1353, "margin_dpo/margin_mean": 373.4256896972656, "margin_dpo/margin_std": 642.0191650390625, "step": 490 }, { "KL/chosen_KL_mean": -558.270263671875, "KL/mean": -684.875, "KL/rejected_KL_mean": -811.479736328125, "KL/std": 470.6094970703125, "epoch": 0.7209985315712188, "fcm_dpo/beta": 0.0010826380457729101, "fcm_dpo/delta": 0.12914934754371643, "fcm_dpo/margin": 253.20947265625, "fcm_dpo/q_t": 0.4377876818180084, "grad_norm": 32.712093353271484, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.9021086096763611, "logits/rejected": -0.8983560800552368, "logps/chosen": -618.3865356445312, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -884.7525634765625, "loss": 1.1945, "margin_dpo/margin_mean": 253.20948791503906, "margin_dpo/margin_std": 480.40399169921875, "step": 491 }, { "KL/chosen_KL_mean": -578.8584594726562, "KL/mean": -706.635498046875, "KL/rejected_KL_mean": -834.41259765625, "KL/std": 479.8116149902344, "epoch": 0.7224669603524229, "fcm_dpo/beta": 0.0011053578928112984, "fcm_dpo/delta": 0.12110729515552521, "fcm_dpo/margin": 255.5540771484375, "fcm_dpo/q_t": 0.4370453357696533, "grad_norm": 31.351417541503906, "learning_rate": 1.097764975115576e-07, "logits/chosen": -0.9315870404243469, "logits/rejected": -0.9179561734199524, "logps/chosen": -632.8526611328125, "logps/ref_chosen": -53.994178771972656, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -907.0721435546875, "loss": 1.2158, "margin_dpo/margin_mean": 255.55409240722656, "margin_dpo/margin_std": 563.4765014648438, "step": 492 }, { "KL/chosen_KL_mean": -622.0254516601562, "KL/mean": -759.6363525390625, "KL/rejected_KL_mean": -897.247314453125, "KL/std": 523.6834106445312, "epoch": 0.723935389133627, "fcm_dpo/beta": 0.0011120472336187959, "fcm_dpo/delta": -0.011631077155470848, "fcm_dpo/margin": 275.22186279296875, "fcm_dpo/q_t": 0.4283533990383148, "grad_norm": 35.48196029663086, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -0.9604239463806152, "logits/rejected": -0.9354810118675232, "logps/chosen": -697.522705078125, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -984.5703125, "loss": 1.1808, "margin_dpo/margin_mean": 275.22186279296875, "margin_dpo/margin_std": 509.6318359375, "step": 493 }, { "KL/chosen_KL_mean": -498.29388427734375, "KL/mean": -716.009521484375, "KL/rejected_KL_mean": -933.72509765625, "KL/std": 477.13525390625, "epoch": 0.7254038179148311, "fcm_dpo/beta": 0.0010983939282596111, "fcm_dpo/delta": -0.0822177529335022, "fcm_dpo/margin": 435.43121337890625, "fcm_dpo/q_t": 0.3895619511604309, "grad_norm": 48.854129791259766, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.8259874582290649, "logits/rejected": -0.8612606525421143, "logps/chosen": -539.6531982421875, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -1019.8164672851562, "loss": 1.0284, "margin_dpo/margin_mean": 435.43121337890625, "margin_dpo/margin_std": 479.2147216796875, "step": 494 }, { "KL/chosen_KL_mean": -555.478759765625, "KL/mean": -748.5401611328125, "KL/rejected_KL_mean": -941.6015625, "KL/std": 487.3724060058594, "epoch": 0.7268722466960352, "fcm_dpo/beta": 0.0010908616241067648, "fcm_dpo/delta": -0.022133061662316322, "fcm_dpo/margin": 386.12274169921875, "fcm_dpo/q_t": 0.4062625765800476, "grad_norm": 31.173200607299805, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -0.9227169752120972, "logits/rejected": -0.9331672191619873, "logps/chosen": -619.0137939453125, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -1033.02587890625, "loss": 1.0958, "margin_dpo/margin_mean": 386.1227722167969, "margin_dpo/margin_std": 574.861328125, "step": 495 }, { "KL/chosen_KL_mean": -699.9385986328125, "KL/mean": -801.8622436523438, "KL/rejected_KL_mean": -903.785888671875, "KL/std": 393.8895568847656, "epoch": 0.7283406754772394, "fcm_dpo/beta": 0.0011188681237399578, "fcm_dpo/delta": 0.17580503225326538, "fcm_dpo/margin": 203.84727478027344, "fcm_dpo/q_t": 0.44692689180374146, "grad_norm": 64.04462432861328, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.8876965641975403, "logits/rejected": -0.8597399592399597, "logps/chosen": -772.530517578125, "logps/ref_chosen": -72.5919189453125, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -988.115234375, "loss": 1.2315, "margin_dpo/margin_mean": 203.84725952148438, "margin_dpo/margin_std": 444.61639404296875, "step": 496 }, { "KL/chosen_KL_mean": -672.76513671875, "KL/mean": -776.4244384765625, "KL/rejected_KL_mean": -880.0836791992188, "KL/std": 496.9170227050781, "epoch": 0.7298091042584435, "fcm_dpo/beta": 0.001132933422923088, "fcm_dpo/delta": 0.02158385142683983, "fcm_dpo/margin": 207.31849670410156, "fcm_dpo/q_t": 0.44608232378959656, "grad_norm": 41.72929000854492, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.858148455619812, "logits/rejected": -0.8439843654632568, "logps/chosen": -731.359130859375, "logps/ref_chosen": -58.59397506713867, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -956.3720703125, "loss": 1.2363, "margin_dpo/margin_mean": 207.31849670410156, "margin_dpo/margin_std": 470.48712158203125, "step": 497 }, { "KL/chosen_KL_mean": -647.0549926757812, "KL/mean": -820.817626953125, "KL/rejected_KL_mean": -994.5802001953125, "KL/std": 524.9059448242188, "epoch": 0.7312775330396476, "fcm_dpo/beta": 0.0011333951260894537, "fcm_dpo/delta": 0.006364853121340275, "fcm_dpo/margin": 347.5252685546875, "fcm_dpo/q_t": 0.4110422730445862, "grad_norm": 44.66307067871094, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.9270666837692261, "logits/rejected": -0.9164772033691406, "logps/chosen": -718.2606201171875, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -1078.538330078125, "loss": 1.133, "margin_dpo/margin_mean": 347.5252685546875, "margin_dpo/margin_std": 589.1495361328125, "step": 498 }, { "KL/chosen_KL_mean": -564.4229736328125, "KL/mean": -775.9797973632812, "KL/rejected_KL_mean": -987.5364990234375, "KL/std": 518.1436767578125, "epoch": 0.7327459618208517, "fcm_dpo/beta": 0.0011198758147656918, "fcm_dpo/delta": -0.0776449665427208, "fcm_dpo/margin": 423.1136169433594, "fcm_dpo/q_t": 0.3932623863220215, "grad_norm": 42.19448471069336, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -0.9195848107337952, "logits/rejected": -0.955754280090332, "logps/chosen": -615.6781616210938, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -1088.615234375, "loss": 1.0594, "margin_dpo/margin_mean": 423.11358642578125, "margin_dpo/margin_std": 571.1682739257812, "step": 499 }, { "KL/chosen_KL_mean": -671.8187255859375, "KL/mean": -847.9236450195312, "KL/rejected_KL_mean": -1024.028564453125, "KL/std": 458.623046875, "epoch": 0.7342143906020558, "fcm_dpo/beta": 0.001116940751671791, "fcm_dpo/delta": 0.006859854329377413, "fcm_dpo/margin": 352.2098388671875, "fcm_dpo/q_t": 0.41066765785217285, "grad_norm": 35.08540725708008, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.9325329661369324, "logits/rejected": -0.9658868312835693, "logps/chosen": -728.84619140625, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -1117.9627685546875, "loss": 1.1271, "margin_dpo/margin_mean": 352.2098388671875, "margin_dpo/margin_std": 569.4359741210938, "step": 500 }, { "KL/chosen_KL_mean": -572.5186767578125, "KL/mean": -759.097900390625, "KL/rejected_KL_mean": -945.6771240234375, "KL/std": 483.7222900390625, "epoch": 0.73568281938326, "fcm_dpo/beta": 0.0011179624125361443, "fcm_dpo/delta": -0.017988204956054688, "fcm_dpo/margin": 373.15838623046875, "fcm_dpo/q_t": 0.4073890149593353, "grad_norm": 33.07366943359375, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.8823180198669434, "logits/rejected": -0.8721954822540283, "logps/chosen": -626.8782348632812, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670013427734, "logps/rejected": -1025.833740234375, "loss": 1.1115, "margin_dpo/margin_mean": 373.15838623046875, "margin_dpo/margin_std": 589.1995849609375, "step": 501 }, { "KL/chosen_KL_mean": -518.148681640625, "KL/mean": -708.8804321289062, "KL/rejected_KL_mean": -899.612060546875, "KL/std": 450.12200927734375, "epoch": 0.737151248164464, "fcm_dpo/beta": 0.0011112934444099665, "fcm_dpo/delta": -0.024970781058073044, "fcm_dpo/margin": 381.4633483886719, "fcm_dpo/q_t": 0.40389347076416016, "grad_norm": 29.106658935546875, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.8181363940238953, "logits/rejected": -0.8096420764923096, "logps/chosen": -585.7492065429688, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -982.5608520507812, "loss": 1.0688, "margin_dpo/margin_mean": 381.4633483886719, "margin_dpo/margin_std": 472.6005859375, "step": 502 }, { "KL/chosen_KL_mean": -523.8008422851562, "KL/mean": -695.634521484375, "KL/rejected_KL_mean": -867.4680786132812, "KL/std": 425.3750305175781, "epoch": 0.7386196769456681, "fcm_dpo/beta": 0.001108947559259832, "fcm_dpo/delta": 0.01953038200736046, "fcm_dpo/margin": 343.667236328125, "fcm_dpo/q_t": 0.41361480951309204, "grad_norm": 28.44597816467285, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.8395601511001587, "logits/rejected": -0.8324748873710632, "logps/chosen": -578.8792724609375, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -949.9735107421875, "loss": 1.1006, "margin_dpo/margin_mean": 343.6672668457031, "margin_dpo/margin_std": 457.1417236328125, "step": 503 }, { "KL/chosen_KL_mean": -567.9959716796875, "KL/mean": -775.1651611328125, "KL/rejected_KL_mean": -982.3343505859375, "KL/std": 495.7042541503906, "epoch": 0.7400881057268722, "fcm_dpo/beta": 0.0011051710462197661, "fcm_dpo/delta": -0.06065092608332634, "fcm_dpo/margin": 414.3384094238281, "fcm_dpo/q_t": 0.3969269096851349, "grad_norm": 29.1925106048584, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.8878883123397827, "logits/rejected": -0.9151204228401184, "logps/chosen": -627.961669921875, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76212310791016, "logps/rejected": -1086.096435546875, "loss": 1.0678, "margin_dpo/margin_mean": 414.3384094238281, "margin_dpo/margin_std": 545.068603515625, "step": 504 }, { "KL/chosen_KL_mean": -644.965087890625, "KL/mean": -754.7982177734375, "KL/rejected_KL_mean": -864.63134765625, "KL/std": 480.5470275878906, "epoch": 0.7415565345080763, "fcm_dpo/beta": 0.0011264740023761988, "fcm_dpo/delta": 0.15632013976573944, "fcm_dpo/margin": 219.66627502441406, "fcm_dpo/q_t": 0.44324439764022827, "grad_norm": 46.33066940307617, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.926541805267334, "logits/rejected": -0.9026806354522705, "logps/chosen": -721.1199951171875, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -953.2167358398438, "loss": 1.2186, "margin_dpo/margin_mean": 219.666259765625, "margin_dpo/margin_std": 454.0839538574219, "step": 505 }, { "KL/chosen_KL_mean": -516.7422485351562, "KL/mean": -701.2262573242188, "KL/rejected_KL_mean": -885.7102661132812, "KL/std": 465.8466796875, "epoch": 0.7430249632892805, "fcm_dpo/beta": 0.0011272106785327196, "fcm_dpo/delta": -0.017008088529109955, "fcm_dpo/margin": 368.968017578125, "fcm_dpo/q_t": 0.405214786529541, "grad_norm": 40.91910171508789, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.909249484539032, "logits/rejected": -0.9271351099014282, "logps/chosen": -565.7027587890625, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -964.1253662109375, "loss": 1.0916, "margin_dpo/margin_mean": 368.968017578125, "margin_dpo/margin_std": 511.87725830078125, "step": 506 }, { "KL/chosen_KL_mean": -589.63037109375, "KL/mean": -747.434326171875, "KL/rejected_KL_mean": -905.2382202148438, "KL/std": 535.4652099609375, "epoch": 0.7444933920704846, "fcm_dpo/beta": 0.0011373506858944893, "fcm_dpo/delta": 0.04258999228477478, "fcm_dpo/margin": 315.60784912109375, "fcm_dpo/q_t": 0.4222760498523712, "grad_norm": 30.438766479492188, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.9080416560173035, "logits/rejected": -0.9134109020233154, "logps/chosen": -656.431884765625, "logps/ref_chosen": -66.80149841308594, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -1000.611083984375, "loss": 1.1574, "margin_dpo/margin_mean": 315.6078186035156, "margin_dpo/margin_std": 585.120849609375, "step": 507 }, { "KL/chosen_KL_mean": -643.0169067382812, "KL/mean": -777.5562744140625, "KL/rejected_KL_mean": -912.095703125, "KL/std": 526.7119750976562, "epoch": 0.7459618208516887, "fcm_dpo/beta": 0.0011591333895921707, "fcm_dpo/delta": 0.09055158495903015, "fcm_dpo/margin": 269.07879638671875, "fcm_dpo/q_t": 0.43193942308425903, "grad_norm": 43.13357162475586, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.8901297450065613, "logits/rejected": -0.8925095796585083, "logps/chosen": -714.3203735351562, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -1007.7232666015625, "loss": 1.2159, "margin_dpo/margin_mean": 269.07879638671875, "margin_dpo/margin_std": 613.018310546875, "step": 508 }, { "KL/chosen_KL_mean": -475.68719482421875, "KL/mean": -625.1077880859375, "KL/rejected_KL_mean": -774.5283813476562, "KL/std": 375.84735107421875, "epoch": 0.7474302496328928, "fcm_dpo/beta": 0.0011755790328606963, "fcm_dpo/delta": 0.04976864904165268, "fcm_dpo/margin": 298.8412170410156, "fcm_dpo/q_t": 0.4198834300041199, "grad_norm": 28.78981590270996, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.8108519315719604, "logits/rejected": -0.8077250123023987, "logps/chosen": -539.5061645507812, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -857.7847900390625, "loss": 1.1345, "margin_dpo/margin_mean": 298.8412170410156, "margin_dpo/margin_std": 454.3382873535156, "step": 509 }, { "KL/chosen_KL_mean": -573.8865966796875, "KL/mean": -767.037841796875, "KL/rejected_KL_mean": -960.1890869140625, "KL/std": 436.06304931640625, "epoch": 0.748898678414097, "fcm_dpo/beta": 0.001165606314316392, "fcm_dpo/delta": -0.05261443555355072, "fcm_dpo/margin": 386.30255126953125, "fcm_dpo/q_t": 0.395079642534256, "grad_norm": 31.50473403930664, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.8299954533576965, "logits/rejected": -0.8495923280715942, "logps/chosen": -625.7650756835938, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -1062.954345703125, "loss": 1.0442, "margin_dpo/margin_mean": 386.30255126953125, "margin_dpo/margin_std": 427.64312744140625, "step": 510 }, { "KL/chosen_KL_mean": -543.1624145507812, "KL/mean": -698.3929443359375, "KL/rejected_KL_mean": -853.62353515625, "KL/std": 473.2135314941406, "epoch": 0.750367107195301, "fcm_dpo/beta": 0.0011685066856443882, "fcm_dpo/delta": 0.03857073932886124, "fcm_dpo/margin": 310.46112060546875, "fcm_dpo/q_t": 0.4177062213420868, "grad_norm": 36.895851135253906, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.8786974549293518, "logits/rejected": -0.8848339319229126, "logps/chosen": -603.4005126953125, "logps/ref_chosen": -60.23811721801758, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -946.4803466796875, "loss": 1.1563, "margin_dpo/margin_mean": 310.46112060546875, "margin_dpo/margin_std": 569.1368408203125, "step": 511 }, { "KL/chosen_KL_mean": -426.22259521484375, "KL/mean": -634.8209838867188, "KL/rejected_KL_mean": -843.4193115234375, "KL/std": 422.7352294921875, "epoch": 0.7518355359765051, "fcm_dpo/beta": 0.0011527151800692081, "fcm_dpo/delta": -0.08524032682180405, "fcm_dpo/margin": 417.19671630859375, "fcm_dpo/q_t": 0.38830769062042236, "grad_norm": 52.474979400634766, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.8602747917175293, "logits/rejected": -0.8812437057495117, "logps/chosen": -481.12811279296875, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -925.295166015625, "loss": 1.0194, "margin_dpo/margin_mean": 417.19671630859375, "margin_dpo/margin_std": 433.2279052734375, "step": 512 }, { "KL/chosen_KL_mean": -534.08837890625, "KL/mean": -682.8746337890625, "KL/rejected_KL_mean": -831.660888671875, "KL/std": 402.69537353515625, "epoch": 0.7533039647577092, "fcm_dpo/beta": 0.0011531409109011292, "fcm_dpo/delta": 0.05850052088499069, "fcm_dpo/margin": 297.572509765625, "fcm_dpo/q_t": 0.42183050513267517, "grad_norm": 44.99782943725586, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.9019182920455933, "logits/rejected": -0.8876909017562866, "logps/chosen": -599.0048828125, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -907.7233276367188, "loss": 1.1427, "margin_dpo/margin_mean": 297.572509765625, "margin_dpo/margin_std": 461.5359191894531, "step": 513 }, { "KL/chosen_KL_mean": -556.1077270507812, "KL/mean": -703.8541259765625, "KL/rejected_KL_mean": -851.6004638671875, "KL/std": 437.5733642578125, "epoch": 0.7547723935389133, "fcm_dpo/beta": 0.0011747241951525211, "fcm_dpo/delta": 0.05465298146009445, "fcm_dpo/margin": 295.4927673339844, "fcm_dpo/q_t": 0.42250925302505493, "grad_norm": 27.680599212646484, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.8176130652427673, "logits/rejected": -0.8068991899490356, "logps/chosen": -630.3372802734375, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -930.5460205078125, "loss": 1.1448, "margin_dpo/margin_mean": 295.4927673339844, "margin_dpo/margin_std": 489.8013610839844, "step": 514 }, { "KL/chosen_KL_mean": -475.355712890625, "KL/mean": -587.893798828125, "KL/rejected_KL_mean": -700.431884765625, "KL/std": 380.00994873046875, "epoch": 0.7562408223201175, "fcm_dpo/beta": 0.0011996763059869409, "fcm_dpo/delta": 0.13362111151218414, "fcm_dpo/margin": 225.07615661621094, "fcm_dpo/q_t": 0.4364135265350342, "grad_norm": 43.433387756347656, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.8131271600723267, "logits/rejected": -0.8236969709396362, "logps/chosen": -525.75732421875, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -787.5296630859375, "loss": 1.2001, "margin_dpo/margin_mean": 225.076171875, "margin_dpo/margin_std": 440.53546142578125, "step": 515 }, { "KL/chosen_KL_mean": -509.38916015625, "KL/mean": -684.5283813476562, "KL/rejected_KL_mean": -859.6676025390625, "KL/std": 431.451416015625, "epoch": 0.7577092511013216, "fcm_dpo/beta": 0.0012069368967786431, "fcm_dpo/delta": -0.02384302206337452, "fcm_dpo/margin": 350.2784118652344, "fcm_dpo/q_t": 0.4014623761177063, "grad_norm": 40.77699279785156, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.9214959740638733, "logits/rejected": -0.9501833319664001, "logps/chosen": -574.0386962890625, "logps/ref_chosen": -64.64956665039062, "logps/ref_rejected": -111.72237396240234, "logps/rejected": -971.3899536132812, "loss": 1.0691, "margin_dpo/margin_mean": 350.2784118652344, "margin_dpo/margin_std": 421.2920227050781, "step": 516 }, { "KL/chosen_KL_mean": -546.0828857421875, "KL/mean": -750.1161499023438, "KL/rejected_KL_mean": -954.1494140625, "KL/std": 485.58392333984375, "epoch": 0.7591776798825257, "fcm_dpo/beta": 0.0011841601226478815, "fcm_dpo/delta": -0.08773398399353027, "fcm_dpo/margin": 408.0665283203125, "fcm_dpo/q_t": 0.39262643456459045, "grad_norm": 32.40835189819336, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.8471901416778564, "logits/rejected": -0.8455414772033691, "logps/chosen": -606.9964599609375, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -1043.232421875, "loss": 1.0478, "margin_dpo/margin_mean": 408.0665283203125, "margin_dpo/margin_std": 531.6887817382812, "step": 517 }, { "KL/chosen_KL_mean": -491.3916931152344, "KL/mean": -659.3934936523438, "KL/rejected_KL_mean": -827.3953247070312, "KL/std": 386.0772705078125, "epoch": 0.7606461086637298, "fcm_dpo/beta": 0.0011782585643231869, "fcm_dpo/delta": 0.004059506580233574, "fcm_dpo/margin": 336.003662109375, "fcm_dpo/q_t": 0.40781712532043457, "grad_norm": 53.09996795654297, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.8820132613182068, "logits/rejected": -0.8964939117431641, "logps/chosen": -548.8475341796875, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -912.7080078125, "loss": 1.0905, "margin_dpo/margin_mean": 336.003662109375, "margin_dpo/margin_std": 429.115966796875, "step": 518 }, { "KL/chosen_KL_mean": -507.8600769042969, "KL/mean": -623.6256713867188, "KL/rejected_KL_mean": -739.3912963867188, "KL/std": 352.45123291015625, "epoch": 0.762114537444934, "fcm_dpo/beta": 0.0011857892386615276, "fcm_dpo/delta": 0.028549687936902046, "fcm_dpo/margin": 231.53121948242188, "fcm_dpo/q_t": 0.4360736012458801, "grad_norm": 40.137332916259766, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.8481384515762329, "logits/rejected": -0.8306090235710144, "logps/chosen": -581.9234008789062, "logps/ref_chosen": -74.06331634521484, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -843.83544921875, "loss": 1.1966, "margin_dpo/margin_mean": 231.53121948242188, "margin_dpo/margin_std": 438.93048095703125, "step": 519 }, { "KL/chosen_KL_mean": -543.58251953125, "KL/mean": -701.6224975585938, "KL/rejected_KL_mean": -859.6624145507812, "KL/std": 423.9530029296875, "epoch": 0.7635829662261381, "fcm_dpo/beta": 0.0011898789089173079, "fcm_dpo/delta": 0.024717746302485466, "fcm_dpo/margin": 316.0799560546875, "fcm_dpo/q_t": 0.4156090021133423, "grad_norm": 39.273406982421875, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.8328167200088501, "logits/rejected": -0.839728832244873, "logps/chosen": -613.88232421875, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -959.643798828125, "loss": 1.1217, "margin_dpo/margin_mean": 316.0799560546875, "margin_dpo/margin_std": 469.7958068847656, "step": 520 }, { "KL/chosen_KL_mean": -485.7915954589844, "KL/mean": -656.0485229492188, "KL/rejected_KL_mean": -826.305419921875, "KL/std": 436.52398681640625, "epoch": 0.7650513950073421, "fcm_dpo/beta": 0.0012013925006613135, "fcm_dpo/delta": -0.010350905358791351, "fcm_dpo/margin": 340.51385498046875, "fcm_dpo/q_t": 0.4064163863658905, "grad_norm": 35.94675827026367, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.8887852430343628, "logits/rejected": -0.8937211036682129, "logps/chosen": -543.9345703125, "logps/ref_chosen": -58.14292526245117, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -909.5860595703125, "loss": 1.0873, "margin_dpo/margin_mean": 340.51385498046875, "margin_dpo/margin_std": 440.299560546875, "step": 521 }, { "KL/chosen_KL_mean": -558.0612182617188, "KL/mean": -710.2643432617188, "KL/rejected_KL_mean": -862.467529296875, "KL/std": 458.29718017578125, "epoch": 0.7665198237885462, "fcm_dpo/beta": 0.0011995111126452684, "fcm_dpo/delta": 0.03616529330611229, "fcm_dpo/margin": 304.40625, "fcm_dpo/q_t": 0.41692230105400085, "grad_norm": 31.800031661987305, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.887365460395813, "logits/rejected": -0.8906110525131226, "logps/chosen": -604.60888671875, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -928.4814453125, "loss": 1.1338, "margin_dpo/margin_mean": 304.40625, "margin_dpo/margin_std": 485.10552978515625, "step": 522 }, { "KL/chosen_KL_mean": -589.586669921875, "KL/mean": -753.298828125, "KL/rejected_KL_mean": -917.010986328125, "KL/std": 527.0751342773438, "epoch": 0.7679882525697503, "fcm_dpo/beta": 0.0012012626975774765, "fcm_dpo/delta": 0.006936301477253437, "fcm_dpo/margin": 327.4243469238281, "fcm_dpo/q_t": 0.4126874804496765, "grad_norm": 40.38140106201172, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.954033613204956, "logits/rejected": -0.9616571068763733, "logps/chosen": -651.356201171875, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -1000.7723999023438, "loss": 1.1345, "margin_dpo/margin_mean": 327.42437744140625, "margin_dpo/margin_std": 560.2000732421875, "step": 523 }, { "KL/chosen_KL_mean": -570.922119140625, "KL/mean": -741.5078735351562, "KL/rejected_KL_mean": -912.0936279296875, "KL/std": 479.9468994140625, "epoch": 0.7694566813509545, "fcm_dpo/beta": 0.001195290358737111, "fcm_dpo/delta": -0.008701588958501816, "fcm_dpo/margin": 341.17156982421875, "fcm_dpo/q_t": 0.40466296672821045, "grad_norm": 41.24837112426758, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.9870351552963257, "logits/rejected": -0.9809165000915527, "logps/chosen": -648.994140625, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -993.3956298828125, "loss": 1.1083, "margin_dpo/margin_mean": 341.17156982421875, "margin_dpo/margin_std": 499.772705078125, "step": 524 }, { "KL/chosen_KL_mean": -579.7892456054688, "KL/mean": -798.8469848632812, "KL/rejected_KL_mean": -1017.9046630859375, "KL/std": 511.00323486328125, "epoch": 0.7709251101321586, "fcm_dpo/beta": 0.001174594508484006, "fcm_dpo/delta": -0.12164277583360672, "fcm_dpo/margin": 438.11553955078125, "fcm_dpo/q_t": 0.384994238615036, "grad_norm": 30.923978805541992, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.9067383408546448, "logits/rejected": -0.943909764289856, "logps/chosen": -630.6170654296875, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05294036865234, "logps/rejected": -1117.9576416015625, "loss": 1.0245, "margin_dpo/margin_mean": 438.11553955078125, "margin_dpo/margin_std": 533.79833984375, "step": 525 }, { "KL/chosen_KL_mean": -613.677001953125, "KL/mean": -823.80029296875, "KL/rejected_KL_mean": -1033.9234619140625, "KL/std": 496.4564208984375, "epoch": 0.7723935389133627, "fcm_dpo/beta": 0.0011603353777900338, "fcm_dpo/delta": -0.0920577421784401, "fcm_dpo/margin": 420.246337890625, "fcm_dpo/q_t": 0.38871896266937256, "grad_norm": 29.568998336791992, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.9136035442352295, "logits/rejected": -0.9113898873329163, "logps/chosen": -676.84423828125, "logps/ref_chosen": -63.167236328125, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -1120.2327880859375, "loss": 1.0538, "margin_dpo/margin_mean": 420.246337890625, "margin_dpo/margin_std": 552.4700317382812, "step": 526 }, { "KL/chosen_KL_mean": -627.5474853515625, "KL/mean": -774.3292236328125, "KL/rejected_KL_mean": -921.1109619140625, "KL/std": 518.6869506835938, "epoch": 0.7738619676945668, "fcm_dpo/beta": 0.0011582564329728484, "fcm_dpo/delta": 0.062111612409353256, "fcm_dpo/margin": 293.5635070800781, "fcm_dpo/q_t": 0.4208937883377075, "grad_norm": 32.31246566772461, "learning_rate": 7.504749238082414e-08, "logits/chosen": -1.083193063735962, "logits/rejected": -1.0532429218292236, "logps/chosen": -698.6761474609375, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -999.4535522460938, "loss": 1.1347, "margin_dpo/margin_mean": 293.5634765625, "margin_dpo/margin_std": 429.7330322265625, "step": 527 }, { "KL/chosen_KL_mean": -646.1875, "KL/mean": -824.3826904296875, "KL/rejected_KL_mean": -1002.5779418945312, "KL/std": 493.030029296875, "epoch": 0.775330396475771, "fcm_dpo/beta": 0.0011612444650381804, "fcm_dpo/delta": -0.01449208240956068, "fcm_dpo/margin": 356.39044189453125, "fcm_dpo/q_t": 0.40898245573043823, "grad_norm": 51.520660400390625, "learning_rate": 7.413308141366254e-08, "logits/chosen": -1.0223352909088135, "logits/rejected": -1.0113223791122437, "logps/chosen": -714.2769165039062, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -1096.488037109375, "loss": 1.1168, "margin_dpo/margin_mean": 356.39044189453125, "margin_dpo/margin_std": 576.7999267578125, "step": 528 }, { "KL/chosen_KL_mean": -762.7345581054688, "KL/mean": -881.308349609375, "KL/rejected_KL_mean": -999.882080078125, "KL/std": 446.8496398925781, "epoch": 0.7767988252569751, "fcm_dpo/beta": 0.0011665602214634418, "fcm_dpo/delta": 0.02131509780883789, "fcm_dpo/margin": 237.14752197265625, "fcm_dpo/q_t": 0.4364190697669983, "grad_norm": 43.96234893798828, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.0255260467529297, "logits/rejected": -1.0359970331192017, "logps/chosen": -818.3095703125, "logps/ref_chosen": -55.57495880126953, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -1089.0911865234375, "loss": 1.2374, "margin_dpo/margin_mean": 237.14752197265625, "margin_dpo/margin_std": 574.8782348632812, "step": 529 }, { "KL/chosen_KL_mean": -668.689208984375, "KL/mean": -865.7867431640625, "KL/rejected_KL_mean": -1062.88427734375, "KL/std": 562.298095703125, "epoch": 0.7782672540381792, "fcm_dpo/beta": 0.0011577388504520059, "fcm_dpo/delta": -0.05902961269021034, "fcm_dpo/margin": 394.1950988769531, "fcm_dpo/q_t": 0.4014556407928467, "grad_norm": 53.588645935058594, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.9486408829689026, "logits/rejected": -0.9478579759597778, "logps/chosen": -716.2905883789062, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -1150.1688232421875, "loss": 1.1248, "margin_dpo/margin_mean": 394.1950988769531, "margin_dpo/margin_std": 704.854736328125, "step": 530 }, { "KL/chosen_KL_mean": -726.8511962890625, "KL/mean": -901.04345703125, "KL/rejected_KL_mean": -1075.2357177734375, "KL/std": 581.962158203125, "epoch": 0.7797356828193832, "fcm_dpo/beta": 0.0011553821386769414, "fcm_dpo/delta": -0.0027520228177309036, "fcm_dpo/margin": 348.384521484375, "fcm_dpo/q_t": 0.4106895327568054, "grad_norm": 36.676822662353516, "learning_rate": 7.141774982445147e-08, "logits/chosen": -1.0601121187210083, "logits/rejected": -1.0491019487380981, "logps/chosen": -782.0972900390625, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -1145.841796875, "loss": 1.1178, "margin_dpo/margin_mean": 348.3844909667969, "margin_dpo/margin_std": 555.5078735351562, "step": 531 }, { "KL/chosen_KL_mean": -733.3638305664062, "KL/mean": -910.0308227539062, "KL/rejected_KL_mean": -1086.69775390625, "KL/std": 548.9955444335938, "epoch": 0.7812041116005873, "fcm_dpo/beta": 0.0011431981110945344, "fcm_dpo/delta": -0.0051701366901397705, "fcm_dpo/margin": 353.33392333984375, "fcm_dpo/q_t": 0.41110938787460327, "grad_norm": 76.43359375, "learning_rate": 7.052201923388953e-08, "logits/chosen": -0.986026406288147, "logits/rejected": -0.962155282497406, "logps/chosen": -803.6498413085938, "logps/ref_chosen": -70.28601837158203, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -1173.2890625, "loss": 1.1519, "margin_dpo/margin_mean": 353.33392333984375, "margin_dpo/margin_std": 646.3972778320312, "step": 532 }, { "KL/chosen_KL_mean": -653.2174072265625, "KL/mean": -784.70263671875, "KL/rejected_KL_mean": -916.1878051757812, "KL/std": 470.7712707519531, "epoch": 0.7826725403817915, "fcm_dpo/beta": 0.00114994659088552, "fcm_dpo/delta": -0.009539761580526829, "fcm_dpo/margin": 262.97039794921875, "fcm_dpo/q_t": 0.4316534399986267, "grad_norm": 54.19781494140625, "learning_rate": 6.963101805503646e-08, "logits/chosen": -0.9915690422058105, "logits/rejected": -0.9716913104057312, "logps/chosen": -718.072509765625, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -992.77587890625, "loss": 1.2054, "margin_dpo/margin_mean": 262.97039794921875, "margin_dpo/margin_std": 563.1723022460938, "step": 533 }, { "KL/chosen_KL_mean": -698.3402099609375, "KL/mean": -880.8671875, "KL/rejected_KL_mean": -1063.39404296875, "KL/std": 534.1531982421875, "epoch": 0.7841409691629956, "fcm_dpo/beta": 0.001137340790592134, "fcm_dpo/delta": -0.017138652503490448, "fcm_dpo/margin": 365.05377197265625, "fcm_dpo/q_t": 0.40718841552734375, "grad_norm": 37.44085693359375, "learning_rate": 6.874476976660184e-08, "logits/chosen": -0.9751067757606506, "logits/rejected": -0.9717357158660889, "logps/chosen": -758.4595947265625, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -1141.9375, "loss": 1.1061, "margin_dpo/margin_mean": 365.0538024902344, "margin_dpo/margin_std": 543.7841796875, "step": 534 }, { "KL/chosen_KL_mean": -584.0714721679688, "KL/mean": -793.9010620117188, "KL/rejected_KL_mean": -1003.7305908203125, "KL/std": 514.93896484375, "epoch": 0.7856093979441997, "fcm_dpo/beta": 0.0011387758422642946, "fcm_dpo/delta": -0.0821937620639801, "fcm_dpo/margin": 419.6590576171875, "fcm_dpo/q_t": 0.39370042085647583, "grad_norm": 29.454992294311523, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.9155275821685791, "logits/rejected": -0.9193699359893799, "logps/chosen": -638.4017333984375, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -1100.0382080078125, "loss": 1.0564, "margin_dpo/margin_mean": 419.6590576171875, "margin_dpo/margin_std": 548.3145141601562, "step": 535 }, { "KL/chosen_KL_mean": -528.38671875, "KL/mean": -777.7352905273438, "KL/rejected_KL_mean": -1027.083740234375, "KL/std": 573.330322265625, "epoch": 0.7870778267254038, "fcm_dpo/beta": 0.0011007413268089294, "fcm_dpo/delta": -0.15751913189888, "fcm_dpo/margin": 498.69720458984375, "fcm_dpo/q_t": 0.3827175498008728, "grad_norm": 38.40432357788086, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.9019815325737, "logits/rejected": -0.933282732963562, "logps/chosen": -575.4672241210938, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -1116.181640625, "loss": 1.0258, "margin_dpo/margin_mean": 498.6971740722656, "margin_dpo/margin_std": 687.29541015625, "step": 536 }, { "KL/chosen_KL_mean": -552.53515625, "KL/mean": -724.3604736328125, "KL/rejected_KL_mean": -896.1856689453125, "KL/std": 467.387939453125, "epoch": 0.788546255506608, "fcm_dpo/beta": 0.0011007563443854451, "fcm_dpo/delta": 0.021880976855754852, "fcm_dpo/margin": 343.6505126953125, "fcm_dpo/q_t": 0.4137336313724518, "grad_norm": 42.51823425292969, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.9375029802322388, "logits/rejected": -0.9250655770301819, "logps/chosen": -610.2826538085938, "logps/ref_chosen": -57.747467041015625, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -966.6240234375, "loss": 1.1392, "margin_dpo/margin_mean": 343.6505126953125, "margin_dpo/margin_std": 572.0556640625, "step": 537 }, { "KL/chosen_KL_mean": -675.07861328125, "KL/mean": -863.2783813476562, "KL/rejected_KL_mean": -1051.478271484375, "KL/std": 501.026123046875, "epoch": 0.7900146842878121, "fcm_dpo/beta": 0.0010912481229752302, "fcm_dpo/delta": -0.011391473934054375, "fcm_dpo/margin": 376.39959716796875, "fcm_dpo/q_t": 0.40652403235435486, "grad_norm": 32.99623489379883, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.9146217107772827, "logits/rejected": -0.9003403186798096, "logps/chosen": -741.4945678710938, "logps/ref_chosen": -66.41594696044922, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -1135.706298828125, "loss": 1.0884, "margin_dpo/margin_mean": 376.3995666503906, "margin_dpo/margin_std": 509.0475158691406, "step": 538 }, { "KL/chosen_KL_mean": -576.7205810546875, "KL/mean": -757.0679931640625, "KL/rejected_KL_mean": -937.4154052734375, "KL/std": 426.46710205078125, "epoch": 0.7914831130690162, "fcm_dpo/beta": 0.0010967530542984605, "fcm_dpo/delta": 0.004480024799704552, "fcm_dpo/margin": 360.6948547363281, "fcm_dpo/q_t": 0.4094918370246887, "grad_norm": 36.090091705322266, "learning_rate": 6.438563463416221e-08, "logits/chosen": -0.9449999332427979, "logits/rejected": -0.9356608390808105, "logps/chosen": -635.21337890625, "logps/ref_chosen": -58.492855072021484, "logps/ref_rejected": -91.85395050048828, "logps/rejected": -1029.269287109375, "loss": 1.0922, "margin_dpo/margin_mean": 360.6948547363281, "margin_dpo/margin_std": 470.793212890625, "step": 539 }, { "KL/chosen_KL_mean": -585.2135620117188, "KL/mean": -820.2244262695312, "KL/rejected_KL_mean": -1055.2353515625, "KL/std": 539.0792236328125, "epoch": 0.7929515418502202, "fcm_dpo/beta": 0.001079935347661376, "fcm_dpo/delta": -0.1131967157125473, "fcm_dpo/margin": 470.0218811035156, "fcm_dpo/q_t": 0.3896099925041199, "grad_norm": 34.88587951660156, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.9021656513214111, "logits/rejected": -0.9275361895561218, "logps/chosen": -648.696044921875, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.42999267578125, "logps/rejected": -1171.665283203125, "loss": 1.052, "margin_dpo/margin_mean": 470.0218505859375, "margin_dpo/margin_std": 641.974853515625, "step": 540 }, { "KL/chosen_KL_mean": -691.6966552734375, "KL/mean": -813.2490234375, "KL/rejected_KL_mean": -934.8014526367188, "KL/std": 476.3492431640625, "epoch": 0.7944199706314243, "fcm_dpo/beta": 0.0010710853384807706, "fcm_dpo/delta": 0.004636428784579039, "fcm_dpo/margin": 243.10484313964844, "fcm_dpo/q_t": 0.4407821297645569, "grad_norm": 53.53697204589844, "learning_rate": 6.267605843546767e-08, "logits/chosen": -1.006117343902588, "logits/rejected": -1.0002844333648682, "logps/chosen": -769.9769897460938, "logps/ref_chosen": -78.28036499023438, "logps/ref_rejected": -103.273681640625, "logps/rejected": -1038.0751953125, "loss": 1.2295, "margin_dpo/margin_mean": 243.10482788085938, "margin_dpo/margin_std": 556.22802734375, "step": 541 }, { "KL/chosen_KL_mean": -586.3802490234375, "KL/mean": -818.114013671875, "KL/rejected_KL_mean": -1049.8477783203125, "KL/std": 528.82470703125, "epoch": 0.7958883994126285, "fcm_dpo/beta": 0.001048381207510829, "fcm_dpo/delta": -0.09220831096172333, "fcm_dpo/margin": 463.4676208496094, "fcm_dpo/q_t": 0.3913062810897827, "grad_norm": 52.26215362548828, "learning_rate": 6.182866334636888e-08, "logits/chosen": -0.9827414751052856, "logits/rejected": -1.0148510932922363, "logps/chosen": -643.865234375, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -1146.3228759765625, "loss": 1.0568, "margin_dpo/margin_mean": 463.46759033203125, "margin_dpo/margin_std": 619.2740478515625, "step": 542 }, { "KL/chosen_KL_mean": -671.8050537109375, "KL/mean": -835.260986328125, "KL/rejected_KL_mean": -998.716796875, "KL/std": 636.306640625, "epoch": 0.7973568281938326, "fcm_dpo/beta": 0.0010581112001091242, "fcm_dpo/delta": 0.05605652183294296, "fcm_dpo/margin": 326.9117431640625, "fcm_dpo/q_t": 0.4326293468475342, "grad_norm": 37.28949737548828, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.9363719820976257, "logits/rejected": -0.9664100408554077, "logps/chosen": -732.422607421875, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -1104.3157958984375, "loss": 1.2008, "margin_dpo/margin_mean": 326.91180419921875, "margin_dpo/margin_std": 747.7957763671875, "step": 543 }, { "KL/chosen_KL_mean": -671.8202514648438, "KL/mean": -865.7568359375, "KL/rejected_KL_mean": -1059.6934814453125, "KL/std": 506.9651794433594, "epoch": 0.7988252569750367, "fcm_dpo/beta": 0.0010604651179164648, "fcm_dpo/delta": -0.011842611245810986, "fcm_dpo/margin": 387.8731994628906, "fcm_dpo/q_t": 0.4069485068321228, "grad_norm": 34.128662109375, "learning_rate": 6.01487708363232e-08, "logits/chosen": -0.9132235050201416, "logits/rejected": -0.9313616752624512, "logps/chosen": -731.4625244140625, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -1160.648193359375, "loss": 1.1033, "margin_dpo/margin_mean": 387.8731994628906, "margin_dpo/margin_std": 581.0994262695312, "step": 544 }, { "KL/chosen_KL_mean": -610.9091796875, "KL/mean": -838.3101806640625, "KL/rejected_KL_mean": -1065.711181640625, "KL/std": 501.88525390625, "epoch": 0.8002936857562408, "fcm_dpo/beta": 0.001048812409862876, "fcm_dpo/delta": -0.0808180570602417, "fcm_dpo/margin": 454.80206298828125, "fcm_dpo/q_t": 0.39258188009262085, "grad_norm": 40.978694915771484, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.8520915508270264, "logits/rejected": -0.8862226009368896, "logps/chosen": -678.5577392578125, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -1161.619140625, "loss": 1.0465, "margin_dpo/margin_mean": 454.80206298828125, "margin_dpo/margin_std": 570.3173828125, "step": 545 }, { "KL/chosen_KL_mean": -588.1337890625, "KL/mean": -736.3134155273438, "KL/rejected_KL_mean": -884.4930419921875, "KL/std": 446.0101013183594, "epoch": 0.801762114537445, "fcm_dpo/beta": 0.0010577274952083826, "fcm_dpo/delta": 0.08934411406517029, "fcm_dpo/margin": 296.3592529296875, "fcm_dpo/q_t": 0.42593374848365784, "grad_norm": 33.9541015625, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.9163818359375, "logits/rejected": -0.9050056338310242, "logps/chosen": -638.8780517578125, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -966.3592529296875, "loss": 1.1611, "margin_dpo/margin_mean": 296.3592529296875, "margin_dpo/margin_std": 485.8372802734375, "step": 546 }, { "KL/chosen_KL_mean": -589.9801635742188, "KL/mean": -785.5772094726562, "KL/rejected_KL_mean": -981.17431640625, "KL/std": 491.4225769042969, "epoch": 0.8032305433186491, "fcm_dpo/beta": 0.0010618357919156551, "fcm_dpo/delta": -0.016077794134616852, "fcm_dpo/margin": 391.194091796875, "fcm_dpo/q_t": 0.4060874581336975, "grad_norm": 40.24742126464844, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.8674280643463135, "logits/rejected": -0.881703794002533, "logps/chosen": -663.6678466796875, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -1071.9356689453125, "loss": 1.0896, "margin_dpo/margin_mean": 391.194091796875, "margin_dpo/margin_std": 540.9904174804688, "step": 547 }, { "KL/chosen_KL_mean": -622.0706787109375, "KL/mean": -800.4874267578125, "KL/rejected_KL_mean": -978.9041748046875, "KL/std": 532.3212280273438, "epoch": 0.8046989720998532, "fcm_dpo/beta": 0.00106256443541497, "fcm_dpo/delta": 0.021664846688508987, "fcm_dpo/margin": 356.8335266113281, "fcm_dpo/q_t": 0.41794323921203613, "grad_norm": 31.893949508666992, "learning_rate": 5.684919345471029e-08, "logits/chosen": -0.9592008590698242, "logits/rejected": -0.9574205875396729, "logps/chosen": -687.3170166015625, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -1073.022216796875, "loss": 1.1266, "margin_dpo/margin_mean": 356.8335266113281, "margin_dpo/margin_std": 595.5147094726562, "step": 548 }, { "KL/chosen_KL_mean": -648.3043823242188, "KL/mean": -785.566162109375, "KL/rejected_KL_mean": -922.8280639648438, "KL/std": 443.6226501464844, "epoch": 0.8061674008810573, "fcm_dpo/beta": 0.0010651289485394955, "fcm_dpo/delta": 0.011987905949354172, "fcm_dpo/margin": 274.5236511230469, "fcm_dpo/q_t": 0.4341329336166382, "grad_norm": 56.424461364746094, "learning_rate": 5.603696935852426e-08, "logits/chosen": -0.9499194622039795, "logits/rejected": -0.94138503074646, "logps/chosen": -697.5167236328125, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -996.7384033203125, "loss": 1.1934, "margin_dpo/margin_mean": 274.52362060546875, "margin_dpo/margin_std": 537.4019775390625, "step": 549 }, { "KL/chosen_KL_mean": -648.9557495117188, "KL/mean": -816.33251953125, "KL/rejected_KL_mean": -983.709228515625, "KL/std": 498.76806640625, "epoch": 0.8076358296622613, "fcm_dpo/beta": 0.0010721642756834626, "fcm_dpo/delta": 0.04264108091592789, "fcm_dpo/margin": 334.7535095214844, "fcm_dpo/q_t": 0.41808733344078064, "grad_norm": 34.55923843383789, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.898378849029541, "logits/rejected": -0.9233511686325073, "logps/chosen": -705.7626953125, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -1078.8350830078125, "loss": 1.1307, "margin_dpo/margin_mean": 334.7535400390625, "margin_dpo/margin_std": 521.0529174804688, "step": 550 }, { "KL/chosen_KL_mean": -560.6070556640625, "KL/mean": -831.8469848632812, "KL/rejected_KL_mean": -1103.0869140625, "KL/std": 532.996337890625, "epoch": 0.8091042584434655, "fcm_dpo/beta": 0.0010456846794113517, "fcm_dpo/delta": -0.17770320177078247, "fcm_dpo/margin": 542.4798583984375, "fcm_dpo/q_t": 0.37006914615631104, "grad_norm": 50.58546447753906, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.8850421905517578, "logits/rejected": -0.9424214363098145, "logps/chosen": -619.71337890625, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -1214.759765625, "loss": 0.9675, "margin_dpo/margin_mean": 542.4798583984375, "margin_dpo/margin_std": 536.1229858398438, "step": 551 }, { "KL/chosen_KL_mean": -546.4177856445312, "KL/mean": -849.3590087890625, "KL/rejected_KL_mean": -1152.300048828125, "KL/std": 611.6688232421875, "epoch": 0.8105726872246696, "fcm_dpo/beta": 0.0009956832509487867, "fcm_dpo/delta": -0.21994295716285706, "fcm_dpo/margin": 605.8824462890625, "fcm_dpo/q_t": 0.36827754974365234, "grad_norm": 51.7211799621582, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.9471904039382935, "logits/rejected": -0.9889096021652222, "logps/chosen": -608.7723999023438, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -1256.8623046875, "loss": 0.9761, "margin_dpo/margin_mean": 605.8823852539062, "margin_dpo/margin_std": 696.2559814453125, "step": 552 }, { "KL/chosen_KL_mean": -627.032958984375, "KL/mean": -778.630859375, "KL/rejected_KL_mean": -930.2286987304688, "KL/std": 506.2027587890625, "epoch": 0.8120411160058737, "fcm_dpo/beta": 0.0010049683041870594, "fcm_dpo/delta": 0.0984039306640625, "fcm_dpo/margin": 303.19573974609375, "fcm_dpo/q_t": 0.4325829744338989, "grad_norm": 25.391754150390625, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.9180362224578857, "logits/rejected": -0.9116028547286987, "logps/chosen": -695.291748046875, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -1028.325927734375, "loss": 1.1778, "margin_dpo/margin_mean": 303.1957702636719, "margin_dpo/margin_std": 562.311279296875, "step": 553 }, { "KL/chosen_KL_mean": -641.80712890625, "KL/mean": -855.0924072265625, "KL/rejected_KL_mean": -1068.3775634765625, "KL/std": 544.1128540039062, "epoch": 0.8135095447870778, "fcm_dpo/beta": 0.001014210982248187, "fcm_dpo/delta": -0.034694697707891464, "fcm_dpo/margin": 426.5704040527344, "fcm_dpo/q_t": 0.40570682287216187, "grad_norm": 55.65289306640625, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.9369876980781555, "logits/rejected": -0.9231326580047607, "logps/chosen": -709.7548217773438, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -1158.1602783203125, "loss": 1.1142, "margin_dpo/margin_mean": 426.5704345703125, "margin_dpo/margin_std": 692.4230346679688, "step": 554 }, { "KL/chosen_KL_mean": -657.1424560546875, "KL/mean": -902.9049072265625, "KL/rejected_KL_mean": -1148.66748046875, "KL/std": 580.0269165039062, "epoch": 0.8149779735682819, "fcm_dpo/beta": 0.0009933705441653728, "fcm_dpo/delta": -0.09272074699401855, "fcm_dpo/margin": 491.5249938964844, "fcm_dpo/q_t": 0.3950349688529968, "grad_norm": 36.912017822265625, "learning_rate": 5.127169765359515e-08, "logits/chosen": -0.9536832571029663, "logits/rejected": -1.00516676902771, "logps/chosen": -710.472900390625, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -1257.146728515625, "loss": 1.0769, "margin_dpo/margin_mean": 491.5250244140625, "margin_dpo/margin_std": 748.8701171875, "step": 555 }, { "KL/chosen_KL_mean": -631.5589599609375, "KL/mean": -781.698974609375, "KL/rejected_KL_mean": -931.8389892578125, "KL/std": 445.9133605957031, "epoch": 0.8164464023494861, "fcm_dpo/beta": 0.0010029294062405825, "fcm_dpo/delta": 0.10188616812229156, "fcm_dpo/margin": 300.2801513671875, "fcm_dpo/q_t": 0.43013256788253784, "grad_norm": 37.66986083984375, "learning_rate": 5.049569317994012e-08, "logits/chosen": -0.9434751272201538, "logits/rejected": -0.9371851086616516, "logps/chosen": -690.203369140625, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -1033.179443359375, "loss": 1.1549, "margin_dpo/margin_mean": 300.2801513671875, "margin_dpo/margin_std": 445.45648193359375, "step": 556 }, { "KL/chosen_KL_mean": -681.3212890625, "KL/mean": -898.03466796875, "KL/rejected_KL_mean": -1114.748046875, "KL/std": 606.8817749023438, "epoch": 0.8179148311306902, "fcm_dpo/beta": 0.00100015162024647, "fcm_dpo/delta": -0.03522220626473427, "fcm_dpo/margin": 433.4267883300781, "fcm_dpo/q_t": 0.4047701060771942, "grad_norm": 59.12196731567383, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -0.974113404750824, "logits/rejected": -1.007190227508545, "logps/chosen": -749.1619873046875, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93965911865234, "logps/rejected": -1224.687744140625, "loss": 1.107, "margin_dpo/margin_mean": 433.4267883300781, "margin_dpo/margin_std": 684.0028076171875, "step": 557 }, { "KL/chosen_KL_mean": -605.5770263671875, "KL/mean": -900.3006591796875, "KL/rejected_KL_mean": -1195.0242919921875, "KL/std": 560.5341186523438, "epoch": 0.8193832599118943, "fcm_dpo/beta": 0.0009742493275552988, "fcm_dpo/delta": -0.18480078876018524, "fcm_dpo/margin": 589.447265625, "fcm_dpo/q_t": 0.36719027161598206, "grad_norm": 32.32392120361328, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -0.9915221929550171, "logits/rejected": -1.0101500749588013, "logps/chosen": -667.9453125, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -1297.185302734375, "loss": 0.9754, "margin_dpo/margin_mean": 589.447265625, "margin_dpo/margin_std": 615.0665283203125, "step": 558 }, { "KL/chosen_KL_mean": -699.533935546875, "KL/mean": -948.122802734375, "KL/rejected_KL_mean": -1196.711669921875, "KL/std": 582.9520263671875, "epoch": 0.8208516886930984, "fcm_dpo/beta": 0.0009549415553919971, "fcm_dpo/delta": -0.0784287303686142, "fcm_dpo/margin": 497.1776428222656, "fcm_dpo/q_t": 0.3929804563522339, "grad_norm": 28.45130157470703, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -1.1178151369094849, "logits/rejected": -1.1277766227722168, "logps/chosen": -760.2862548828125, "logps/ref_chosen": -60.752323150634766, "logps/ref_rejected": -93.44229125976562, "logps/rejected": -1290.15380859375, "loss": 1.0501, "margin_dpo/margin_mean": 497.17767333984375, "margin_dpo/margin_std": 634.4929809570312, "step": 559 }, { "KL/chosen_KL_mean": -626.045166015625, "KL/mean": -804.8916015625, "KL/rejected_KL_mean": -983.7379760742188, "KL/std": 515.1402587890625, "epoch": 0.8223201174743024, "fcm_dpo/beta": 0.0009533166885375977, "fcm_dpo/delta": 0.06086999550461769, "fcm_dpo/margin": 357.6928405761719, "fcm_dpo/q_t": 0.42201805114746094, "grad_norm": 31.18995475769043, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.8733669519424438, "logits/rejected": -0.856816828250885, "logps/chosen": -684.1489868164062, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -1063.729248046875, "loss": 1.1422, "margin_dpo/margin_mean": 357.69287109375, "margin_dpo/margin_std": 554.28125, "step": 560 }, { "KL/chosen_KL_mean": -734.6890258789062, "KL/mean": -880.0928344726562, "KL/rejected_KL_mean": -1025.49658203125, "KL/std": 505.81610107421875, "epoch": 0.8237885462555066, "fcm_dpo/beta": 0.0009805468143895268, "fcm_dpo/delta": 0.11781884729862213, "fcm_dpo/margin": 290.80755615234375, "fcm_dpo/q_t": 0.43384015560150146, "grad_norm": 41.6833610534668, "learning_rate": 4.669493178106432e-08, "logits/chosen": -1.0193910598754883, "logits/rejected": -1.037698745727539, "logps/chosen": -785.6019287109375, "logps/ref_chosen": -50.912879943847656, "logps/ref_rejected": -99.06856536865234, "logps/rejected": -1124.565185546875, "loss": 1.208, "margin_dpo/margin_mean": 290.8075256347656, "margin_dpo/margin_std": 622.505615234375, "step": 561 }, { "KL/chosen_KL_mean": -690.031982421875, "KL/mean": -897.793701171875, "KL/rejected_KL_mean": -1105.555419921875, "KL/std": 561.4027099609375, "epoch": 0.8252569750367107, "fcm_dpo/beta": 0.0009777405066415668, "fcm_dpo/delta": -0.007167506963014603, "fcm_dpo/margin": 415.52349853515625, "fcm_dpo/q_t": 0.40837323665618896, "grad_norm": 40.292320251464844, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -1.024315357208252, "logits/rejected": -1.0398998260498047, "logps/chosen": -749.496337890625, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -1202.09814453125, "loss": 1.1004, "margin_dpo/margin_mean": 415.52349853515625, "margin_dpo/margin_std": 602.928955078125, "step": 562 }, { "KL/chosen_KL_mean": -751.924072265625, "KL/mean": -913.3809204101562, "KL/rejected_KL_mean": -1074.837890625, "KL/std": 594.7562255859375, "epoch": 0.8267254038179148, "fcm_dpo/beta": 0.0009785356232896447, "fcm_dpo/delta": -0.04188579320907593, "fcm_dpo/margin": 322.9136657714844, "fcm_dpo/q_t": 0.4237578213214874, "grad_norm": 40.284427642822266, "learning_rate": 4.521198892775202e-08, "logits/chosen": -1.0153368711471558, "logits/rejected": -1.0244905948638916, "logps/chosen": -812.5322265625, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -1169.405517578125, "loss": 1.2111, "margin_dpo/margin_mean": 322.9136657714844, "margin_dpo/margin_std": 696.208251953125, "step": 563 }, { "KL/chosen_KL_mean": -686.2769775390625, "KL/mean": -883.5341796875, "KL/rejected_KL_mean": -1080.7913818359375, "KL/std": 529.4177856445312, "epoch": 0.8281938325991189, "fcm_dpo/beta": 0.0009786732262000442, "fcm_dpo/delta": 0.01432707067579031, "fcm_dpo/margin": 394.514404296875, "fcm_dpo/q_t": 0.41156822443008423, "grad_norm": 40.814979553222656, "learning_rate": 4.447860229910544e-08, "logits/chosen": -1.0610636472702026, "logits/rejected": -1.053609013557434, "logps/chosen": -760.5452880859375, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.23818969726562, "logps/rejected": -1174.029541015625, "loss": 1.0964, "margin_dpo/margin_mean": 394.514404296875, "margin_dpo/margin_std": 508.09417724609375, "step": 564 }, { "KL/chosen_KL_mean": -716.1942138671875, "KL/mean": -929.130126953125, "KL/rejected_KL_mean": -1142.0660400390625, "KL/std": 609.8633422851562, "epoch": 0.8296622613803231, "fcm_dpo/beta": 0.0009748205775395036, "fcm_dpo/delta": -0.015806902199983597, "fcm_dpo/margin": 425.871826171875, "fcm_dpo/q_t": 0.41027140617370605, "grad_norm": 42.958003997802734, "learning_rate": 4.375063135042445e-08, "logits/chosen": -0.9631332159042358, "logits/rejected": -0.9660812616348267, "logps/chosen": -785.214111328125, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -1227.844970703125, "loss": 1.127, "margin_dpo/margin_mean": 425.871826171875, "margin_dpo/margin_std": 730.54736328125, "step": 565 }, { "KL/chosen_KL_mean": -707.080322265625, "KL/mean": -945.5238647460938, "KL/rejected_KL_mean": -1183.967529296875, "KL/std": 644.3311767578125, "epoch": 0.8311306901615272, "fcm_dpo/beta": 0.000970390741713345, "fcm_dpo/delta": -0.06614132225513458, "fcm_dpo/margin": 476.88720703125, "fcm_dpo/q_t": 0.3978080153465271, "grad_norm": 32.570377349853516, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -1.0035604238510132, "logits/rejected": -1.0268689393997192, "logps/chosen": -773.6256103515625, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86932373046875, "logps/rejected": -1287.8367919921875, "loss": 1.0997, "margin_dpo/margin_mean": 476.88720703125, "margin_dpo/margin_std": 747.5025634765625, "step": 566 }, { "KL/chosen_KL_mean": -654.31689453125, "KL/mean": -836.738037109375, "KL/rejected_KL_mean": -1019.1592407226562, "KL/std": 439.9637756347656, "epoch": 0.8325991189427313, "fcm_dpo/beta": 0.000967850093729794, "fcm_dpo/delta": 0.048623181879520416, "fcm_dpo/margin": 364.84228515625, "fcm_dpo/q_t": 0.4172418713569641, "grad_norm": 42.263118743896484, "learning_rate": 4.231101308059165e-08, "logits/chosen": -1.077162504196167, "logits/rejected": -1.0844173431396484, "logps/chosen": -707.1751708984375, "logps/ref_chosen": -52.85829544067383, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -1104.5302734375, "loss": 1.1145, "margin_dpo/margin_mean": 364.8423156738281, "margin_dpo/margin_std": 478.48101806640625, "step": 567 }, { "KL/chosen_KL_mean": -648.8751220703125, "KL/mean": -889.5942993164062, "KL/rejected_KL_mean": -1130.3134765625, "KL/std": 514.7445068359375, "epoch": 0.8340675477239354, "fcm_dpo/beta": 0.0009604596998542547, "fcm_dpo/delta": -0.06550342589616776, "fcm_dpo/margin": 481.43841552734375, "fcm_dpo/q_t": 0.3923587203025818, "grad_norm": 30.015487670898438, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -1.0153778791427612, "logits/rejected": -1.0532267093658447, "logps/chosen": -694.0675048828125, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236907958984, "logps/rejected": -1219.40576171875, "loss": 1.0371, "margin_dpo/margin_mean": 481.43841552734375, "margin_dpo/margin_std": 534.50341796875, "step": 568 }, { "KL/chosen_KL_mean": -744.4910278320312, "KL/mean": -949.3118896484375, "KL/rejected_KL_mean": -1154.1328125, "KL/std": 672.5451049804688, "epoch": 0.8355359765051396, "fcm_dpo/beta": 0.0009647671831771731, "fcm_dpo/delta": 0.00415463000535965, "fcm_dpo/margin": 409.6417236328125, "fcm_dpo/q_t": 0.410520076751709, "grad_norm": 49.2674446105957, "learning_rate": 4.089328585837512e-08, "logits/chosen": -1.025818109512329, "logits/rejected": -1.0322705507278442, "logps/chosen": -808.2116088867188, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -1233.236083984375, "loss": 1.134, "margin_dpo/margin_mean": 409.6417236328125, "margin_dpo/margin_std": 681.7449951171875, "step": 569 }, { "KL/chosen_KL_mean": -687.3922119140625, "KL/mean": -874.5309448242188, "KL/rejected_KL_mean": -1061.669677734375, "KL/std": 541.8402709960938, "epoch": 0.8370044052863436, "fcm_dpo/beta": 0.0009631971479393542, "fcm_dpo/delta": 0.040986284613609314, "fcm_dpo/margin": 374.2774963378906, "fcm_dpo/q_t": 0.4191049039363861, "grad_norm": 29.400341033935547, "learning_rate": 4.019267817841834e-08, "logits/chosen": -1.1264129877090454, "logits/rejected": -1.1218767166137695, "logps/chosen": -749.0067749023438, "logps/ref_chosen": -61.61454391479492, "logps/ref_rejected": -82.14186096191406, "logps/rejected": -1143.8115234375, "loss": 1.1329, "margin_dpo/margin_mean": 374.2774963378906, "margin_dpo/margin_std": 585.3759155273438, "step": 570 }, { "KL/chosen_KL_mean": -716.3717041015625, "KL/mean": -937.341796875, "KL/rejected_KL_mean": -1158.3118896484375, "KL/std": 546.9036865234375, "epoch": 0.8384728340675477, "fcm_dpo/beta": 0.0009610787965357304, "fcm_dpo/delta": -0.02597730979323387, "fcm_dpo/margin": 441.94012451171875, "fcm_dpo/q_t": 0.40491753816604614, "grad_norm": 41.382713317871094, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -1.0316365957260132, "logits/rejected": -1.0433616638183594, "logps/chosen": -769.42578125, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -1249.648681640625, "loss": 1.1022, "margin_dpo/margin_mean": 441.94012451171875, "margin_dpo/margin_std": 674.6705322265625, "step": 571 }, { "KL/chosen_KL_mean": -752.6365966796875, "KL/mean": -983.4873046875, "KL/rejected_KL_mean": -1214.3380126953125, "KL/std": 640.98876953125, "epoch": 0.8399412628487518, "fcm_dpo/beta": 0.000953345384914428, "fcm_dpo/delta": -0.04224724695086479, "fcm_dpo/margin": 461.7014465332031, "fcm_dpo/q_t": 0.4049929678440094, "grad_norm": 28.851036071777344, "learning_rate": 3.880806698864086e-08, "logits/chosen": -1.074343204498291, "logits/rejected": -1.1045624017715454, "logps/chosen": -801.095947265625, "logps/ref_chosen": -48.45928955078125, "logps/ref_rejected": -83.55703735351562, "logps/rejected": -1297.89501953125, "loss": 1.1089, "margin_dpo/margin_mean": 461.7014465332031, "margin_dpo/margin_std": 761.991943359375, "step": 572 }, { "KL/chosen_KL_mean": -734.09912109375, "KL/mean": -932.6853637695312, "KL/rejected_KL_mean": -1131.271728515625, "KL/std": 566.801513671875, "epoch": 0.8414096916299559, "fcm_dpo/beta": 0.0009574309224262834, "fcm_dpo/delta": 0.020482124760746956, "fcm_dpo/margin": 397.17266845703125, "fcm_dpo/q_t": 0.4144536852836609, "grad_norm": 25.593852996826172, "learning_rate": 3.812409996461275e-08, "logits/chosen": -1.0817201137542725, "logits/rejected": -1.092029333114624, "logps/chosen": -785.7216796875, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -1216.5966796875, "loss": 1.108, "margin_dpo/margin_mean": 397.17266845703125, "margin_dpo/margin_std": 568.3775634765625, "step": 573 }, { "KL/chosen_KL_mean": -666.8729248046875, "KL/mean": -884.59521484375, "KL/rejected_KL_mean": -1102.317626953125, "KL/std": 510.1865234375, "epoch": 0.8428781204111601, "fcm_dpo/beta": 0.0009572736453264952, "fcm_dpo/delta": -0.017637627199292183, "fcm_dpo/margin": 435.4447021484375, "fcm_dpo/q_t": 0.40490391850471497, "grad_norm": 34.41420364379883, "learning_rate": 3.74457160675965e-08, "logits/chosen": -1.0834131240844727, "logits/rejected": -1.1097838878631592, "logps/chosen": -717.9173583984375, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -1195.1240234375, "loss": 1.0865, "margin_dpo/margin_mean": 435.4447021484375, "margin_dpo/margin_std": 589.404296875, "step": 574 }, { "KL/chosen_KL_mean": -747.3386840820312, "KL/mean": -951.7550659179688, "KL/rejected_KL_mean": -1156.17138671875, "KL/std": 525.8989868164062, "epoch": 0.8443465491923642, "fcm_dpo/beta": 0.000948374392464757, "fcm_dpo/delta": 0.011579148471355438, "fcm_dpo/margin": 408.832763671875, "fcm_dpo/q_t": 0.4119390845298767, "grad_norm": 41.16646957397461, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.9548108577728271, "logits/rejected": -0.960533618927002, "logps/chosen": -819.1287841796875, "logps/ref_chosen": -71.7901382446289, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -1251.5576171875, "loss": 1.1358, "margin_dpo/margin_mean": 408.8327941894531, "margin_dpo/margin_std": 670.8654174804688, "step": 575 }, { "KL/chosen_KL_mean": -732.2178955078125, "KL/mean": -887.31591796875, "KL/rejected_KL_mean": -1042.4139404296875, "KL/std": 495.1169128417969, "epoch": 0.8458149779735683, "fcm_dpo/beta": 0.0009677187772467732, "fcm_dpo/delta": 0.10300955176353455, "fcm_dpo/margin": 310.19610595703125, "fcm_dpo/q_t": 0.4320542812347412, "grad_norm": 33.871307373046875, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -1.1345970630645752, "logits/rejected": -1.1627776622772217, "logps/chosen": -786.4808349609375, "logps/ref_chosen": -54.262962341308594, "logps/ref_rejected": -100.75428009033203, "logps/rejected": -1143.168212890625, "loss": 1.1832, "margin_dpo/margin_mean": 310.1961364746094, "margin_dpo/margin_std": 572.8515014648438, "step": 576 }, { "KL/chosen_KL_mean": -644.5338134765625, "KL/mean": -845.4898681640625, "KL/rejected_KL_mean": -1046.4459228515625, "KL/std": 548.8078002929688, "epoch": 0.8472834067547724, "fcm_dpo/beta": 0.0009760315297171474, "fcm_dpo/delta": 0.008000888861715794, "fcm_dpo/margin": 401.9121398925781, "fcm_dpo/q_t": 0.4116186499595642, "grad_norm": 26.741127014160156, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -1.0345063209533691, "logits/rejected": -1.0221607685089111, "logps/chosen": -706.4434814453125, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -1130.5166015625, "loss": 1.1172, "margin_dpo/margin_mean": 401.912109375, "margin_dpo/margin_std": 588.481201171875, "step": 577 }, { "KL/chosen_KL_mean": -641.31494140625, "KL/mean": -864.2884521484375, "KL/rejected_KL_mean": -1087.2618408203125, "KL/std": 547.829833984375, "epoch": 0.8487518355359766, "fcm_dpo/beta": 0.0009675461915321648, "fcm_dpo/delta": -0.033737167716026306, "fcm_dpo/margin": 445.94696044921875, "fcm_dpo/q_t": 0.40107935667037964, "grad_norm": 35.5023078918457, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.9391261339187622, "logits/rejected": -0.9697315692901611, "logps/chosen": -690.57861328125, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.4362564086914, "logps/rejected": -1170.6981201171875, "loss": 1.0675, "margin_dpo/margin_mean": 445.94696044921875, "margin_dpo/margin_std": 543.1402587890625, "step": 578 }, { "KL/chosen_KL_mean": -739.6239013671875, "KL/mean": -874.08740234375, "KL/rejected_KL_mean": -1008.5507202148438, "KL/std": 546.8580932617188, "epoch": 0.8502202643171806, "fcm_dpo/beta": 0.000979724689386785, "fcm_dpo/delta": 0.04294705390930176, "fcm_dpo/margin": 268.92694091796875, "fcm_dpo/q_t": 0.4389345049858093, "grad_norm": 53.29568099975586, "learning_rate": 3.41381639738331e-08, "logits/chosen": -0.9918534755706787, "logits/rejected": -0.9905188083648682, "logps/chosen": -798.509765625, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -1103.33837890625, "loss": 1.2205, "margin_dpo/margin_mean": 268.92694091796875, "margin_dpo/margin_std": 589.2688598632812, "step": 579 }, { "KL/chosen_KL_mean": -540.1065673828125, "KL/mean": -791.6300048828125, "KL/rejected_KL_mean": -1043.1533203125, "KL/std": 603.1106567382812, "epoch": 0.8516886930983847, "fcm_dpo/beta": 0.0009632025612518191, "fcm_dpo/delta": -0.08945266157388687, "fcm_dpo/margin": 503.046875, "fcm_dpo/q_t": 0.3941301107406616, "grad_norm": 30.890201568603516, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.8625004291534424, "logits/rejected": -0.8941880464553833, "logps/chosen": -588.8133544921875, "logps/ref_chosen": -48.70683670043945, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -1124.911865234375, "loss": 1.0525, "margin_dpo/margin_mean": 503.046875, "margin_dpo/margin_std": 688.0292358398438, "step": 580 }, { "KL/chosen_KL_mean": -729.04736328125, "KL/mean": -905.5509033203125, "KL/rejected_KL_mean": -1082.054443359375, "KL/std": 574.6087036132812, "epoch": 0.8531571218795888, "fcm_dpo/beta": 0.0009719936642795801, "fcm_dpo/delta": 0.05880071595311165, "fcm_dpo/margin": 353.0070495605469, "fcm_dpo/q_t": 0.42395222187042236, "grad_norm": 40.021026611328125, "learning_rate": 3.285483927764726e-08, "logits/chosen": -1.0945156812667847, "logits/rejected": -1.103161334991455, "logps/chosen": -791.2696533203125, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -1173.7901611328125, "loss": 1.1562, "margin_dpo/margin_mean": 353.0070495605469, "margin_dpo/margin_std": 629.962158203125, "step": 581 }, { "KL/chosen_KL_mean": -639.3220825195312, "KL/mean": -835.397216796875, "KL/rejected_KL_mean": -1031.472412109375, "KL/std": 496.10870361328125, "epoch": 0.8546255506607929, "fcm_dpo/beta": 0.0009659301722422242, "fcm_dpo/delta": -0.08082351088523865, "fcm_dpo/margin": 392.1502990722656, "fcm_dpo/q_t": 0.4117897152900696, "grad_norm": 30.24727439880371, "learning_rate": 3.222175147833556e-08, "logits/chosen": -1.0156798362731934, "logits/rejected": -1.039165735244751, "logps/chosen": -697.55078125, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -1141.5419921875, "loss": 1.1133, "margin_dpo/margin_mean": 392.1502990722656, "margin_dpo/margin_std": 531.6431884765625, "step": 582 }, { "KL/chosen_KL_mean": -728.5455322265625, "KL/mean": -847.5494384765625, "KL/rejected_KL_mean": -966.553466796875, "KL/std": 537.0501098632812, "epoch": 0.856093979441997, "fcm_dpo/beta": 0.0009584878571331501, "fcm_dpo/delta": 0.0019017525482922792, "fcm_dpo/margin": 238.00784301757812, "fcm_dpo/q_t": 0.44876495003700256, "grad_norm": 62.559593200683594, "learning_rate": 3.159440233840763e-08, "logits/chosen": -0.9803950190544128, "logits/rejected": -0.9772003293037415, "logps/chosen": -785.408447265625, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -1054.957275390625, "loss": 1.2647, "margin_dpo/margin_mean": 238.00784301757812, "margin_dpo/margin_std": 635.57080078125, "step": 583 }, { "KL/chosen_KL_mean": -625.9443359375, "KL/mean": -869.571044921875, "KL/rejected_KL_mean": -1113.19775390625, "KL/std": 543.7821044921875, "epoch": 0.8575624082232012, "fcm_dpo/beta": 0.0009496349957771599, "fcm_dpo/delta": -0.0657280907034874, "fcm_dpo/margin": 487.2535400390625, "fcm_dpo/q_t": 0.39483463764190674, "grad_norm": 34.74457931518555, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -1.0310046672821045, "logits/rejected": -1.0454175472259521, "logps/chosen": -682.844970703125, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -1210.833984375, "loss": 1.0459, "margin_dpo/margin_mean": 487.2535400390625, "margin_dpo/margin_std": 577.2208862304688, "step": 584 }, { "KL/chosen_KL_mean": -723.0252075195312, "KL/mean": -949.249267578125, "KL/rejected_KL_mean": -1175.473388671875, "KL/std": 627.5142822265625, "epoch": 0.8590308370044053, "fcm_dpo/beta": 0.0009398453403264284, "fcm_dpo/delta": -0.02655157260596752, "fcm_dpo/margin": 452.44818115234375, "fcm_dpo/q_t": 0.4046263098716736, "grad_norm": 30.642621994018555, "learning_rate": 3.035698600998121e-08, "logits/chosen": -1.0402522087097168, "logits/rejected": -1.065436840057373, "logps/chosen": -783.9991455078125, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -1259.6429443359375, "loss": 1.1158, "margin_dpo/margin_mean": 452.44818115234375, "margin_dpo/margin_std": 744.0001220703125, "step": 585 }, { "KL/chosen_KL_mean": -759.1077880859375, "KL/mean": -908.213134765625, "KL/rejected_KL_mean": -1057.318359375, "KL/std": 538.911865234375, "epoch": 0.8604992657856094, "fcm_dpo/beta": 0.0009562649065628648, "fcm_dpo/delta": 0.1183374673128128, "fcm_dpo/margin": 298.2107238769531, "fcm_dpo/q_t": 0.4345587491989136, "grad_norm": 30.652240753173828, "learning_rate": 2.974695142855388e-08, "logits/chosen": -1.0284502506256104, "logits/rejected": -1.0488755702972412, "logps/chosen": -815.96337890625, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.80261993408203, "logps/rejected": -1149.12109375, "loss": 1.1994, "margin_dpo/margin_mean": 298.2107238769531, "margin_dpo/margin_std": 598.6849365234375, "step": 586 }, { "KL/chosen_KL_mean": -528.1431884765625, "KL/mean": -738.4869995117188, "KL/rejected_KL_mean": -948.830810546875, "KL/std": 569.4769897460938, "epoch": 0.8619676945668135, "fcm_dpo/beta": 0.0009647482074797153, "fcm_dpo/delta": -0.006192212924361229, "fcm_dpo/margin": 420.68768310546875, "fcm_dpo/q_t": 0.40699630975723267, "grad_norm": 45.825714111328125, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -1.0657103061676025, "logits/rejected": -1.09328293800354, "logps/chosen": -572.8347778320312, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -1031.4547119140625, "loss": 1.0913, "margin_dpo/margin_mean": 420.68768310546875, "margin_dpo/margin_std": 569.4257202148438, "step": 587 }, { "KL/chosen_KL_mean": -687.0555419921875, "KL/mean": -863.8863525390625, "KL/rejected_KL_mean": -1040.7171630859375, "KL/std": 487.860107421875, "epoch": 0.8634361233480177, "fcm_dpo/beta": 0.0009668685379438102, "fcm_dpo/delta": 0.05999944359064102, "fcm_dpo/margin": 353.66156005859375, "fcm_dpo/q_t": 0.42055660486221313, "grad_norm": 26.90322494506836, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -1.039747714996338, "logits/rejected": -1.066466212272644, "logps/chosen": -737.3505249023438, "logps/ref_chosen": -50.29494857788086, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -1148.087158203125, "loss": 1.1351, "margin_dpo/margin_mean": 353.66156005859375, "margin_dpo/margin_std": 518.2518310546875, "step": 588 }, { "KL/chosen_KL_mean": -697.592529296875, "KL/mean": -931.67236328125, "KL/rejected_KL_mean": -1165.752197265625, "KL/std": 557.1414184570312, "epoch": 0.8649045521292217, "fcm_dpo/beta": 0.0009669238934293389, "fcm_dpo/delta": -0.055137749761343, "fcm_dpo/margin": 468.15966796875, "fcm_dpo/q_t": 0.39669230580329895, "grad_norm": 27.10498046875, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -1.0181684494018555, "logits/rejected": -1.0344040393829346, "logps/chosen": -757.5223999023438, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -1277.407470703125, "loss": 1.058, "margin_dpo/margin_mean": 468.1596984863281, "margin_dpo/margin_std": 587.922607421875, "step": 589 }, { "KL/chosen_KL_mean": -597.85400390625, "KL/mean": -826.746826171875, "KL/rejected_KL_mean": -1055.6396484375, "KL/std": 536.9008178710938, "epoch": 0.8663729809104258, "fcm_dpo/beta": 0.0009563218918628991, "fcm_dpo/delta": -0.039607785642147064, "fcm_dpo/margin": 457.78570556640625, "fcm_dpo/q_t": 0.40053310990333557, "grad_norm": 27.430288314819336, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.9699843525886536, "logits/rejected": -0.9988424777984619, "logps/chosen": -653.663818359375, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -1161.7025146484375, "loss": 1.0661, "margin_dpo/margin_mean": 457.78570556640625, "margin_dpo/margin_std": 581.5162353515625, "step": 590 }, { "KL/chosen_KL_mean": -660.5640869140625, "KL/mean": -869.3895874023438, "KL/rejected_KL_mean": -1078.215087890625, "KL/std": 518.4139404296875, "epoch": 0.8678414096916299, "fcm_dpo/beta": 0.0009555625729262829, "fcm_dpo/delta": 0.0009453542297706008, "fcm_dpo/margin": 417.65093994140625, "fcm_dpo/q_t": 0.4076194763183594, "grad_norm": 31.656789779663086, "learning_rate": 2.678415274939408e-08, "logits/chosen": -1.0377655029296875, "logits/rejected": -1.0326879024505615, "logps/chosen": -716.8046875, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -1162.0013427734375, "loss": 1.0991, "margin_dpo/margin_mean": 417.65093994140625, "margin_dpo/margin_std": 582.1824951171875, "step": 591 }, { "KL/chosen_KL_mean": -691.4828491210938, "KL/mean": -881.25439453125, "KL/rejected_KL_mean": -1071.02587890625, "KL/std": 527.1260375976562, "epoch": 0.869309838472834, "fcm_dpo/beta": 0.0009605808882042766, "fcm_dpo/delta": 0.03677193447947502, "fcm_dpo/margin": 379.5429992675781, "fcm_dpo/q_t": 0.41771793365478516, "grad_norm": 32.95262908935547, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -1.0429775714874268, "logits/rejected": -1.0484647750854492, "logps/chosen": -739.423095703125, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -1146.7587890625, "loss": 1.1662, "margin_dpo/margin_mean": 379.54296875, "margin_dpo/margin_std": 707.2916870117188, "step": 592 }, { "KL/chosen_KL_mean": -653.00244140625, "KL/mean": -799.3251953125, "KL/rejected_KL_mean": -945.64794921875, "KL/std": 570.3946533203125, "epoch": 0.8707782672540382, "fcm_dpo/beta": 0.0009675570763647556, "fcm_dpo/delta": 0.024251248687505722, "fcm_dpo/margin": 292.6455383300781, "fcm_dpo/q_t": 0.4363827705383301, "grad_norm": 49.979095458984375, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.8947024345397949, "logits/rejected": -0.8848444819450378, "logps/chosen": -701.6932373046875, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800094604492, "logps/rejected": -1006.555908203125, "loss": 1.2117, "margin_dpo/margin_mean": 292.6455383300781, "margin_dpo/margin_std": 629.4410400390625, "step": 593 }, { "KL/chosen_KL_mean": -645.703369140625, "KL/mean": -829.5958251953125, "KL/rejected_KL_mean": -1013.4883422851562, "KL/std": 561.0531616210938, "epoch": 0.8722466960352423, "fcm_dpo/beta": 0.0009798401733860373, "fcm_dpo/delta": 0.03994458168745041, "fcm_dpo/margin": 367.78497314453125, "fcm_dpo/q_t": 0.41849082708358765, "grad_norm": 37.234134674072266, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -1.017820954322815, "logits/rejected": -1.0110870599746704, "logps/chosen": -700.6382446289062, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967803955078, "logps/rejected": -1099.5880126953125, "loss": 1.1434, "margin_dpo/margin_mean": 367.7850036621094, "margin_dpo/margin_std": 599.2181396484375, "step": 594 }, { "KL/chosen_KL_mean": -591.552001953125, "KL/mean": -793.7593994140625, "KL/rejected_KL_mean": -995.966796875, "KL/std": 504.74029541015625, "epoch": 0.8737151248164464, "fcm_dpo/beta": 0.0009743094560690224, "fcm_dpo/delta": 0.006154121831059456, "fcm_dpo/margin": 404.4147033691406, "fcm_dpo/q_t": 0.41215771436691284, "grad_norm": 41.969970703125, "learning_rate": 2.451969280180849e-08, "logits/chosen": -1.0084481239318848, "logits/rejected": -1.0286178588867188, "logps/chosen": -640.972412109375, "logps/ref_chosen": -49.4204216003418, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -1076.593994140625, "loss": 1.1, "margin_dpo/margin_mean": 404.41473388671875, "margin_dpo/margin_std": 568.5491333007812, "step": 595 }, { "KL/chosen_KL_mean": -683.7548828125, "KL/mean": -831.98193359375, "KL/rejected_KL_mean": -980.2089233398438, "KL/std": 516.1655883789062, "epoch": 0.8751835535976505, "fcm_dpo/beta": 0.000993602559901774, "fcm_dpo/delta": 0.10849238932132721, "fcm_dpo/margin": 296.4541015625, "fcm_dpo/q_t": 0.43519163131713867, "grad_norm": 55.20982360839844, "learning_rate": 2.396839494982103e-08, "logits/chosen": -0.9886128306388855, "logits/rejected": -0.9605743885040283, "logps/chosen": -743.546630859375, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -1060.300048828125, "loss": 1.1997, "margin_dpo/margin_mean": 296.4541015625, "margin_dpo/margin_std": 616.732421875, "step": 596 }, { "KL/chosen_KL_mean": -663.219482421875, "KL/mean": -916.369384765625, "KL/rejected_KL_mean": -1169.5191650390625, "KL/std": 611.7984619140625, "epoch": 0.8766519823788547, "fcm_dpo/beta": 0.0009732224280014634, "fcm_dpo/delta": -0.10066782683134079, "fcm_dpo/margin": 506.2997131347656, "fcm_dpo/q_t": 0.3910676836967468, "grad_norm": 28.543447494506836, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.9715480208396912, "logits/rejected": -1.0194578170776367, "logps/chosen": -720.4802856445312, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -1270.212890625, "loss": 1.0574, "margin_dpo/margin_mean": 506.2997131347656, "margin_dpo/margin_std": 674.1318359375, "step": 597 }, { "KL/chosen_KL_mean": -661.1876831054688, "KL/mean": -860.0836791992188, "KL/rejected_KL_mean": -1058.979736328125, "KL/std": 523.8168334960938, "epoch": 0.8781204111600588, "fcm_dpo/beta": 0.0009721480309963226, "fcm_dpo/delta": 0.012626536190509796, "fcm_dpo/margin": 397.7920227050781, "fcm_dpo/q_t": 0.4108089506626129, "grad_norm": 44.60902404785156, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -0.9934415817260742, "logits/rejected": -1.0047008991241455, "logps/chosen": -713.7061767578125, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -1148.423583984375, "loss": 1.1157, "margin_dpo/margin_mean": 397.79205322265625, "margin_dpo/margin_std": 574.5714721679688, "step": 598 }, { "KL/chosen_KL_mean": -680.6580810546875, "KL/mean": -848.9474487304688, "KL/rejected_KL_mean": -1017.2368774414062, "KL/std": 514.0728149414062, "epoch": 0.8795888399412628, "fcm_dpo/beta": 0.0009692448657006025, "fcm_dpo/delta": -0.05783551558852196, "fcm_dpo/margin": 336.578857421875, "fcm_dpo/q_t": 0.42119812965393066, "grad_norm": 29.521316528320312, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -0.9816111326217651, "logits/rejected": -0.9900449514389038, "logps/chosen": -730.460693359375, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -1100.2154541015625, "loss": 1.1461, "margin_dpo/margin_mean": 336.578857421875, "margin_dpo/margin_std": 485.81683349609375, "step": 599 }, { "KL/chosen_KL_mean": -738.8037109375, "KL/mean": -900.9039306640625, "KL/rejected_KL_mean": -1063.004150390625, "KL/std": 517.9166870117188, "epoch": 0.8810572687224669, "fcm_dpo/beta": 0.0009796018712222576, "fcm_dpo/delta": 0.08520510792732239, "fcm_dpo/margin": 324.20037841796875, "fcm_dpo/q_t": 0.42857182025909424, "grad_norm": 27.683170318603516, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -1.0804599523544312, "logits/rejected": -1.078963279724121, "logps/chosen": -805.2385864257812, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -1148.4605712890625, "loss": 1.1791, "margin_dpo/margin_mean": 324.20037841796875, "margin_dpo/margin_std": 615.3521118164062, "step": 600 }, { "KL/chosen_KL_mean": -733.8780517578125, "KL/mean": -939.5074462890625, "KL/rejected_KL_mean": -1145.13671875, "KL/std": 553.6764526367188, "epoch": 0.882525697503671, "fcm_dpo/beta": 0.0009836689569056034, "fcm_dpo/delta": -0.004790919832885265, "fcm_dpo/margin": 411.2587890625, "fcm_dpo/q_t": 0.4059777557849884, "grad_norm": 35.56853103637695, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -0.9942201972007751, "logits/rejected": -1.0163451433181763, "logps/chosen": -793.01171875, "logps/ref_chosen": -59.13361358642578, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -1239.8277587890625, "loss": 1.0895, "margin_dpo/margin_mean": 411.2587890625, "margin_dpo/margin_std": 544.517822265625, "step": 601 }, { "KL/chosen_KL_mean": -508.48651123046875, "KL/mean": -759.9006958007812, "KL/rejected_KL_mean": -1011.3148803710938, "KL/std": 511.07110595703125, "epoch": 0.8839941262848752, "fcm_dpo/beta": 0.0009767541196197271, "fcm_dpo/delta": -0.09598802030086517, "fcm_dpo/margin": 502.8282775878906, "fcm_dpo/q_t": 0.3871699869632721, "grad_norm": 64.79503631591797, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -1.037233591079712, "logits/rejected": -1.0741159915924072, "logps/chosen": -557.080078125, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -1098.9833984375, "loss": 1.0188, "margin_dpo/margin_mean": 502.8282775878906, "margin_dpo/margin_std": 533.3328857421875, "step": 602 }, { "KL/chosen_KL_mean": -641.906494140625, "KL/mean": -867.3243408203125, "KL/rejected_KL_mean": -1092.7423095703125, "KL/std": 565.2948608398438, "epoch": 0.8854625550660793, "fcm_dpo/beta": 0.0009601364727132022, "fcm_dpo/delta": -0.03442168980836868, "fcm_dpo/margin": 450.83575439453125, "fcm_dpo/q_t": 0.4039689302444458, "grad_norm": 37.1284065246582, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -1.0195807218551636, "logits/rejected": -1.0123958587646484, "logps/chosen": -712.3211059570312, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32559967041016, "logps/rejected": -1193.06787109375, "loss": 1.0902, "margin_dpo/margin_mean": 450.83575439453125, "margin_dpo/margin_std": 660.8984375, "step": 603 }, { "KL/chosen_KL_mean": -633.6109619140625, "KL/mean": -884.2783813476562, "KL/rejected_KL_mean": -1134.94580078125, "KL/std": 557.851318359375, "epoch": 0.8869309838472834, "fcm_dpo/beta": 0.0009481116430833936, "fcm_dpo/delta": -0.07910436391830444, "fcm_dpo/margin": 501.3348693847656, "fcm_dpo/q_t": 0.39480096101760864, "grad_norm": 41.052913665771484, "learning_rate": 1.977362051376158e-08, "logits/chosen": -1.0104937553405762, "logits/rejected": -1.0476266145706177, "logps/chosen": -680.0690307617188, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -1226.80029296875, "loss": 1.0613, "margin_dpo/margin_mean": 501.33489990234375, "margin_dpo/margin_std": 684.626708984375, "step": 604 }, { "KL/chosen_KL_mean": -654.0040283203125, "KL/mean": -836.632568359375, "KL/rejected_KL_mean": -1019.260986328125, "KL/std": 520.555908203125, "epoch": 0.8883994126284875, "fcm_dpo/beta": 0.0009504579938948154, "fcm_dpo/delta": 0.05476874113082886, "fcm_dpo/margin": 365.2569580078125, "fcm_dpo/q_t": 0.4232047498226166, "grad_norm": 32.279541015625, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -0.9521446228027344, "logits/rejected": -0.9669671654701233, "logps/chosen": -720.2533569335938, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -1121.56591796875, "loss": 1.1459, "margin_dpo/margin_mean": 365.2569885253906, "margin_dpo/margin_std": 619.3516845703125, "step": 605 }, { "KL/chosen_KL_mean": -664.3292236328125, "KL/mean": -865.0908203125, "KL/rejected_KL_mean": -1065.852294921875, "KL/std": 528.2071533203125, "epoch": 0.8898678414096917, "fcm_dpo/beta": 0.0009606323437765241, "fcm_dpo/delta": 0.014511629939079285, "fcm_dpo/margin": 401.52313232421875, "fcm_dpo/q_t": 0.41221147775650024, "grad_norm": 27.56992530822754, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -0.9551470875740051, "logits/rejected": -0.9689816236495972, "logps/chosen": -719.1483764648438, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37146759033203, "logps/rejected": -1164.223876953125, "loss": 1.105, "margin_dpo/margin_mean": 401.52313232421875, "margin_dpo/margin_std": 560.071533203125, "step": 606 }, { "KL/chosen_KL_mean": -685.24560546875, "KL/mean": -856.268310546875, "KL/rejected_KL_mean": -1027.291015625, "KL/std": 536.4832763671875, "epoch": 0.8913362701908958, "fcm_dpo/beta": 0.0009705802076496184, "fcm_dpo/delta": 0.0699785053730011, "fcm_dpo/margin": 342.0454406738281, "fcm_dpo/q_t": 0.42473822832107544, "grad_norm": 26.594741821289062, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -0.9825999140739441, "logits/rejected": -0.9769987463951111, "logps/chosen": -743.32958984375, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -1107.068115234375, "loss": 1.1528, "margin_dpo/margin_mean": 342.0454406738281, "margin_dpo/margin_std": 565.7429809570312, "step": 607 }, { "KL/chosen_KL_mean": -613.8583374023438, "KL/mean": -814.1548461914062, "KL/rejected_KL_mean": -1014.4512939453125, "KL/std": 486.152099609375, "epoch": 0.8928046989720999, "fcm_dpo/beta": 0.0009749716846272349, "fcm_dpo/delta": 0.009674161672592163, "fcm_dpo/margin": 400.5929260253906, "fcm_dpo/q_t": 0.4095849096775055, "grad_norm": 30.251602172851562, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -1.0167486667633057, "logits/rejected": -1.030979871749878, "logps/chosen": -671.3092041015625, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -1109.224609375, "loss": 1.0893, "margin_dpo/margin_mean": 400.5929260253906, "margin_dpo/margin_std": 500.50164794921875, "step": 608 }, { "KL/chosen_KL_mean": -640.0119018554688, "KL/mean": -893.2958984375, "KL/rejected_KL_mean": -1146.579833984375, "KL/std": 658.148193359375, "epoch": 0.8942731277533039, "fcm_dpo/beta": 0.0009604240767657757, "fcm_dpo/delta": -0.09091140329837799, "fcm_dpo/margin": 506.5679016113281, "fcm_dpo/q_t": 0.3968961536884308, "grad_norm": 33.4229736328125, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -1.0928289890289307, "logits/rejected": -1.119450569152832, "logps/chosen": -698.8172607421875, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -1235.3958740234375, "loss": 1.0712, "margin_dpo/margin_mean": 506.56793212890625, "margin_dpo/margin_std": 784.350830078125, "step": 609 }, { "KL/chosen_KL_mean": -626.9141845703125, "KL/mean": -787.075927734375, "KL/rejected_KL_mean": -947.2376708984375, "KL/std": 506.7711486816406, "epoch": 0.895741556534508, "fcm_dpo/beta": 0.0009702660609036684, "fcm_dpo/delta": 0.09187015891075134, "fcm_dpo/margin": 320.3234558105469, "fcm_dpo/q_t": 0.42758795619010925, "grad_norm": 41.30014419555664, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.9679499864578247, "logits/rejected": -0.9463798999786377, "logps/chosen": -692.6092529296875, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.40538787841797, "logps/rejected": -1030.64306640625, "loss": 1.1736, "margin_dpo/margin_mean": 320.3234558105469, "margin_dpo/margin_std": 581.2178955078125, "step": 610 }, { "KL/chosen_KL_mean": -670.1181640625, "KL/mean": -940.070068359375, "KL/rejected_KL_mean": -1210.02197265625, "KL/std": 670.6271362304688, "epoch": 0.8972099853157122, "fcm_dpo/beta": 0.0009553628042340279, "fcm_dpo/delta": -0.12215965986251831, "fcm_dpo/margin": 539.90380859375, "fcm_dpo/q_t": 0.3883088231086731, "grad_norm": 30.76310920715332, "learning_rate": 1.6421423736208e-08, "logits/chosen": -1.0270860195159912, "logits/rejected": -1.0751309394836426, "logps/chosen": -722.7176513671875, "logps/ref_chosen": -52.59946823120117, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -1296.35302734375, "loss": 1.0465, "margin_dpo/margin_mean": 539.90380859375, "margin_dpo/margin_std": 748.833251953125, "step": 611 }, { "KL/chosen_KL_mean": -699.7198486328125, "KL/mean": -904.0280151367188, "KL/rejected_KL_mean": -1108.336181640625, "KL/std": 525.3094482421875, "epoch": 0.8986784140969163, "fcm_dpo/beta": 0.0009539818856865168, "fcm_dpo/delta": 0.010405594483017921, "fcm_dpo/margin": 408.61627197265625, "fcm_dpo/q_t": 0.410278856754303, "grad_norm": 27.917463302612305, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -1.0200650691986084, "logits/rejected": -1.0243524312973022, "logps/chosen": -759.0435791015625, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -1196.6485595703125, "loss": 1.0998, "margin_dpo/margin_mean": 408.61627197265625, "margin_dpo/margin_std": 555.7114868164062, "step": 612 }, { "KL/chosen_KL_mean": -628.8876342773438, "KL/mean": -882.475341796875, "KL/rejected_KL_mean": -1136.06298828125, "KL/std": 600.5887451171875, "epoch": 0.9001468428781204, "fcm_dpo/beta": 0.000938057666644454, "fcm_dpo/delta": -0.07989558577537537, "fcm_dpo/margin": 507.1754150390625, "fcm_dpo/q_t": 0.39130979776382446, "grad_norm": 32.273929595947266, "learning_rate": 1.551886292185553e-08, "logits/chosen": -1.0217537879943848, "logits/rejected": -1.0764918327331543, "logps/chosen": -688.6175537109375, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10752868652344, "logps/rejected": -1241.1705322265625, "loss": 1.0397, "margin_dpo/margin_mean": 507.1754150390625, "margin_dpo/margin_std": 607.134033203125, "step": 613 }, { "KL/chosen_KL_mean": -696.5354614257812, "KL/mean": -941.188720703125, "KL/rejected_KL_mean": -1185.8419189453125, "KL/std": 581.6685180664062, "epoch": 0.9016152716593245, "fcm_dpo/beta": 0.0009298705263063312, "fcm_dpo/delta": -0.057572945952415466, "fcm_dpo/margin": 489.3064270019531, "fcm_dpo/q_t": 0.39804306626319885, "grad_norm": 43.64583206176758, "learning_rate": 1.507684480352292e-08, "logits/chosen": -1.0031187534332275, "logits/rejected": -1.0775550603866577, "logps/chosen": -749.4744262695312, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -1290.521240234375, "loss": 1.0728, "margin_dpo/margin_mean": 489.30645751953125, "margin_dpo/margin_std": 680.9266357421875, "step": 614 }, { "KL/chosen_KL_mean": -656.3577880859375, "KL/mean": -862.1884155273438, "KL/rejected_KL_mean": -1068.01904296875, "KL/std": 608.759033203125, "epoch": 0.9030837004405287, "fcm_dpo/beta": 0.0009312764159403741, "fcm_dpo/delta": 0.017031406983733177, "fcm_dpo/margin": 411.66131591796875, "fcm_dpo/q_t": 0.412952184677124, "grad_norm": 26.297256469726562, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -0.9910403490066528, "logits/rejected": -1.014068365097046, "logps/chosen": -722.175048828125, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -1163.196533203125, "loss": 1.1233, "margin_dpo/margin_mean": 411.6612854003906, "margin_dpo/margin_std": 651.5823974609375, "step": 615 }, { "KL/chosen_KL_mean": -774.6954345703125, "KL/mean": -941.8838500976562, "KL/rejected_KL_mean": -1109.0721435546875, "KL/std": 509.840576171875, "epoch": 0.9045521292217328, "fcm_dpo/beta": 0.0009417695691809058, "fcm_dpo/delta": 0.08782128244638443, "fcm_dpo/margin": 334.37677001953125, "fcm_dpo/q_t": 0.4281091094017029, "grad_norm": 34.39405822753906, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -1.0712862014770508, "logits/rejected": -1.0602033138275146, "logps/chosen": -839.8283081054688, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -1183.772705078125, "loss": 1.1659, "margin_dpo/margin_mean": 334.37677001953125, "margin_dpo/margin_std": 575.5633544921875, "step": 616 }, { "KL/chosen_KL_mean": -732.1382446289062, "KL/mean": -852.7117919921875, "KL/rejected_KL_mean": -973.285400390625, "KL/std": 490.1700439453125, "epoch": 0.9060205580029369, "fcm_dpo/beta": 0.0009709987789392471, "fcm_dpo/delta": 0.16959968209266663, "fcm_dpo/margin": 241.1470489501953, "fcm_dpo/q_t": 0.44701701402664185, "grad_norm": 53.658565521240234, "learning_rate": 1.378797888467345e-08, "logits/chosen": -0.9488894939422607, "logits/rejected": -0.9184377789497375, "logps/chosen": -795.143798828125, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -1037.51953125, "loss": 1.2385, "margin_dpo/margin_mean": 241.14703369140625, "margin_dpo/margin_std": 569.4755859375, "step": 617 }, { "KL/chosen_KL_mean": -763.865478515625, "KL/mean": -1025.0372314453125, "KL/rejected_KL_mean": -1286.208984375, "KL/std": 651.3458251953125, "epoch": 0.9074889867841409, "fcm_dpo/beta": 0.0009685006225481629, "fcm_dpo/delta": -0.11168282479047775, "fcm_dpo/margin": 522.3434448242188, "fcm_dpo/q_t": 0.3915684223175049, "grad_norm": 40.38612747192383, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -1.0099037885665894, "logits/rejected": -1.0500290393829346, "logps/chosen": -830.9668579101562, "logps/ref_chosen": -67.10134887695312, "logps/ref_rejected": -92.15340423583984, "logps/rejected": -1378.3623046875, "loss": 1.0849, "margin_dpo/margin_mean": 522.3434448242188, "margin_dpo/margin_std": 827.5892333984375, "step": 618 }, { "KL/chosen_KL_mean": -751.3358154296875, "KL/mean": -951.3638916015625, "KL/rejected_KL_mean": -1151.3919677734375, "KL/std": 605.0140380859375, "epoch": 0.908957415565345, "fcm_dpo/beta": 0.0009594704024493694, "fcm_dpo/delta": 0.016789617016911507, "fcm_dpo/margin": 400.0561828613281, "fcm_dpo/q_t": 0.42001599073410034, "grad_norm": 49.994224548339844, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -0.9973533153533936, "logits/rejected": -1.0120331048965454, "logps/chosen": -807.3140869140625, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -1244.577392578125, "loss": 1.1707, "margin_dpo/margin_mean": 400.0561828613281, "margin_dpo/margin_std": 802.4068603515625, "step": 619 }, { "KL/chosen_KL_mean": -691.8986206054688, "KL/mean": -882.22509765625, "KL/rejected_KL_mean": -1072.551513671875, "KL/std": 538.706787109375, "epoch": 0.9104258443465492, "fcm_dpo/beta": 0.000961203477345407, "fcm_dpo/delta": 0.035247065126895905, "fcm_dpo/margin": 380.6529846191406, "fcm_dpo/q_t": 0.4174911379814148, "grad_norm": 34.35021209716797, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -1.0476036071777344, "logits/rejected": -1.0572441816329956, "logps/chosen": -751.6961059570312, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -1150.96240234375, "loss": 1.1326, "margin_dpo/margin_mean": 380.6529846191406, "margin_dpo/margin_std": 605.3323974609375, "step": 620 }, { "KL/chosen_KL_mean": -665.3763427734375, "KL/mean": -940.9721069335938, "KL/rejected_KL_mean": -1216.567626953125, "KL/std": 632.556884765625, "epoch": 0.9118942731277533, "fcm_dpo/beta": 0.0009547668742015958, "fcm_dpo/delta": -0.13341151177883148, "fcm_dpo/margin": 551.19140625, "fcm_dpo/q_t": 0.3814903199672699, "grad_norm": 39.83711624145508, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -1.0708098411560059, "logits/rejected": -1.1203954219818115, "logps/chosen": -719.3101196289062, "logps/ref_chosen": -53.93375778198242, "logps/ref_rejected": -88.36951446533203, "logps/rejected": -1304.937255859375, "loss": 1.033, "margin_dpo/margin_mean": 551.19140625, "margin_dpo/margin_std": 702.85986328125, "step": 621 }, { "KL/chosen_KL_mean": -658.5285034179688, "KL/mean": -857.1796264648438, "KL/rejected_KL_mean": -1055.8306884765625, "KL/std": 493.7017822265625, "epoch": 0.9133627019089574, "fcm_dpo/beta": 0.0009404352167621255, "fcm_dpo/delta": 0.0270434208214283, "fcm_dpo/margin": 397.30218505859375, "fcm_dpo/q_t": 0.4160274565219879, "grad_norm": 30.787620544433594, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.9209400415420532, "logits/rejected": -0.9098290205001831, "logps/chosen": -718.8143310546875, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -1141.349365234375, "loss": 1.1242, "margin_dpo/margin_mean": 397.30218505859375, "margin_dpo/margin_std": 618.2388916015625, "step": 622 }, { "KL/chosen_KL_mean": -722.2666015625, "KL/mean": -879.1347045898438, "KL/rejected_KL_mean": -1036.0028076171875, "KL/std": 512.3851318359375, "epoch": 0.9148311306901615, "fcm_dpo/beta": 0.0009606959065422416, "fcm_dpo/delta": 0.10166360437870026, "fcm_dpo/margin": 313.7361755371094, "fcm_dpo/q_t": 0.4334968328475952, "grad_norm": 33.62225341796875, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -1.0529344081878662, "logits/rejected": -1.0536704063415527, "logps/chosen": -786.423583984375, "logps/ref_chosen": -64.1569595336914, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -1121.0859375, "loss": 1.1853, "margin_dpo/margin_mean": 313.7362060546875, "margin_dpo/margin_std": 610.1846923828125, "step": 623 }, { "KL/chosen_KL_mean": -710.2718505859375, "KL/mean": -944.1239013671875, "KL/rejected_KL_mean": -1177.9759521484375, "KL/std": 529.1799926757812, "epoch": 0.9162995594713657, "fcm_dpo/beta": 0.0009594388538971543, "fcm_dpo/delta": -0.05099187046289444, "fcm_dpo/margin": 467.7041015625, "fcm_dpo/q_t": 0.3950585722923279, "grad_norm": 39.309574127197266, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -1.0544100999832153, "logits/rejected": -1.0604016780853271, "logps/chosen": -782.1905517578125, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -1275.1080322265625, "loss": 1.0585, "margin_dpo/margin_mean": 467.7041015625, "margin_dpo/margin_std": 573.2549438476562, "step": 624 }, { "KL/chosen_KL_mean": -676.450927734375, "KL/mean": -938.8895263671875, "KL/rejected_KL_mean": -1201.328125, "KL/std": 594.9174194335938, "epoch": 0.9177679882525698, "fcm_dpo/beta": 0.0009445177856832743, "fcm_dpo/delta": -0.10063250362873077, "fcm_dpo/margin": 524.8773193359375, "fcm_dpo/q_t": 0.38663381338119507, "grad_norm": 58.24129867553711, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -1.006543755531311, "logits/rejected": -1.0352264642715454, "logps/chosen": -734.79296875, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -1287.4185791015625, "loss": 1.0137, "margin_dpo/margin_mean": 524.8773193359375, "margin_dpo/margin_std": 560.148193359375, "step": 625 }, { "KL/chosen_KL_mean": -826.4632568359375, "KL/mean": -968.705810546875, "KL/rejected_KL_mean": -1110.9483642578125, "KL/std": 636.0460815429688, "epoch": 0.9192364170337739, "fcm_dpo/beta": 0.0009557833545841277, "fcm_dpo/delta": 0.1316283643245697, "fcm_dpo/margin": 284.48504638671875, "fcm_dpo/q_t": 0.4362901449203491, "grad_norm": 32.43234634399414, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.9522177577018738, "logits/rejected": -0.9446706771850586, "logps/chosen": -901.575927734375, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.188720703125, "logps/rejected": -1210.136962890625, "loss": 1.2195, "margin_dpo/margin_mean": 284.48504638671875, "margin_dpo/margin_std": 614.84765625, "step": 626 }, { "KL/chosen_KL_mean": -572.4993286132812, "KL/mean": -865.5345458984375, "KL/rejected_KL_mean": -1158.569580078125, "KL/std": 690.389892578125, "epoch": 0.920704845814978, "fcm_dpo/beta": 0.000945016392506659, "fcm_dpo/delta": -0.162668839097023, "fcm_dpo/margin": 586.0703125, "fcm_dpo/q_t": 0.3842451572418213, "grad_norm": 24.69363784790039, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.8964744806289673, "logits/rejected": -0.9713860154151917, "logps/chosen": -620.2424926757812, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -1265.32421875, "loss": 1.0279, "margin_dpo/margin_mean": 586.0703125, "margin_dpo/margin_std": 790.52783203125, "step": 627 }, { "KL/chosen_KL_mean": -741.4865112304688, "KL/mean": -962.6798706054688, "KL/rejected_KL_mean": -1183.873291015625, "KL/std": 572.581787109375, "epoch": 0.922173274596182, "fcm_dpo/beta": 0.0009279233636334538, "fcm_dpo/delta": -0.011030579917132854, "fcm_dpo/margin": 442.3868408203125, "fcm_dpo/q_t": 0.40770232677459717, "grad_norm": 28.13714599609375, "learning_rate": 9.543589206795238e-09, "logits/chosen": -1.0725154876708984, "logits/rejected": -1.0900166034698486, "logps/chosen": -801.66943359375, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -1285.427978515625, "loss": 1.1039, "margin_dpo/margin_mean": 442.3868408203125, "margin_dpo/margin_std": 658.2325439453125, "step": 628 }, { "KL/chosen_KL_mean": -733.53369140625, "KL/mean": -936.6685791015625, "KL/rejected_KL_mean": -1139.8037109375, "KL/std": 552.74169921875, "epoch": 0.9236417033773862, "fcm_dpo/beta": 0.0009324135025963187, "fcm_dpo/delta": 0.02201123535633087, "fcm_dpo/margin": 406.26995849609375, "fcm_dpo/q_t": 0.4115867018699646, "grad_norm": 34.70335006713867, "learning_rate": 9.19555885822887e-09, "logits/chosen": -1.0210623741149902, "logits/rejected": -1.0353336334228516, "logps/chosen": -797.7471923828125, "logps/ref_chosen": -64.21354675292969, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -1231.457275390625, "loss": 1.1046, "margin_dpo/margin_mean": 406.26995849609375, "margin_dpo/margin_std": 546.7474365234375, "step": 629 }, { "KL/chosen_KL_mean": -653.820068359375, "KL/mean": -769.8485107421875, "KL/rejected_KL_mean": -885.8768920898438, "KL/std": 560.4996337890625, "epoch": 0.9251101321585903, "fcm_dpo/beta": 0.0009379271068610251, "fcm_dpo/delta": 0.051444362848997116, "fcm_dpo/margin": 232.05682373046875, "fcm_dpo/q_t": 0.45478296279907227, "grad_norm": 52.160728454589844, "learning_rate": 8.85387393063622e-09, "logits/chosen": -1.0045530796051025, "logits/rejected": -0.9794071912765503, "logps/chosen": -713.111083984375, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -969.4752197265625, "loss": 1.2718, "margin_dpo/margin_mean": 232.05682373046875, "margin_dpo/margin_std": 672.6798095703125, "step": 630 }, { "KL/chosen_KL_mean": -792.3543701171875, "KL/mean": -974.973388671875, "KL/rejected_KL_mean": -1157.592529296875, "KL/std": 561.6439208984375, "epoch": 0.9265785609397944, "fcm_dpo/beta": 0.0009518619626760483, "fcm_dpo/delta": 0.05409633368253708, "fcm_dpo/margin": 365.2381286621094, "fcm_dpo/q_t": 0.42036306858062744, "grad_norm": 34.219573974609375, "learning_rate": 8.518543427732949e-09, "logits/chosen": -1.0938163995742798, "logits/rejected": -1.1026105880737305, "logps/chosen": -851.8079833984375, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95156860351562, "logps/rejected": -1238.5440673828125, "loss": 1.1567, "margin_dpo/margin_mean": 365.2381286621094, "margin_dpo/margin_std": 643.0179443359375, "step": 631 }, { "KL/chosen_KL_mean": -682.728515625, "KL/mean": -877.140625, "KL/rejected_KL_mean": -1071.552734375, "KL/std": 512.9617919921875, "epoch": 0.9280469897209985, "fcm_dpo/beta": 0.0009528810624033213, "fcm_dpo/delta": 0.03032829239964485, "fcm_dpo/margin": 388.82421875, "fcm_dpo/q_t": 0.4152371883392334, "grad_norm": 43.099708557128906, "learning_rate": 8.189576185789637e-09, "logits/chosen": -1.0412629842758179, "logits/rejected": -1.0417115688323975, "logps/chosen": -744.080078125, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -1157.712890625, "loss": 1.1352, "margin_dpo/margin_mean": 388.82421875, "margin_dpo/margin_std": 626.7635498046875, "step": 632 }, { "KL/chosen_KL_mean": -747.4510498046875, "KL/mean": -891.1580810546875, "KL/rejected_KL_mean": -1034.8651123046875, "KL/std": 504.61749267578125, "epoch": 0.9295154185022027, "fcm_dpo/beta": 0.0009792209602892399, "fcm_dpo/delta": 0.12146103382110596, "fcm_dpo/margin": 287.41412353515625, "fcm_dpo/q_t": 0.43543291091918945, "grad_norm": 47.98980712890625, "learning_rate": 7.866980873399015e-09, "logits/chosen": -1.0630054473876953, "logits/rejected": -1.0733153820037842, "logps/chosen": -804.729248046875, "logps/ref_chosen": -57.27816390991211, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -1126.4490966796875, "loss": 1.2016, "margin_dpo/margin_mean": 287.41412353515625, "margin_dpo/margin_std": 580.196533203125, "step": 633 }, { "KL/chosen_KL_mean": -855.8317260742188, "KL/mean": -977.2432861328125, "KL/rejected_KL_mean": -1098.655029296875, "KL/std": 611.3326416015625, "epoch": 0.9309838472834068, "fcm_dpo/beta": 0.0009960609022527933, "fcm_dpo/delta": 0.06913906335830688, "fcm_dpo/margin": 242.82333374023438, "fcm_dpo/q_t": 0.44638699293136597, "grad_norm": 35.45087814331055, "learning_rate": 7.550765991247654e-09, "logits/chosen": -0.9742704033851624, "logits/rejected": -0.9662094116210938, "logps/chosen": -922.45068359375, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12564849853516, "logps/rejected": -1205.78076171875, "loss": 1.2382, "margin_dpo/margin_mean": 242.82333374023438, "margin_dpo/margin_std": 585.6327514648438, "step": 634 }, { "KL/chosen_KL_mean": -741.16796875, "KL/mean": -916.81396484375, "KL/rejected_KL_mean": -1092.4599609375, "KL/std": 625.1411743164062, "epoch": 0.9324522760646109, "fcm_dpo/beta": 0.0010028297547250986, "fcm_dpo/delta": 0.04947870969772339, "fcm_dpo/margin": 351.2921142578125, "fcm_dpo/q_t": 0.42215800285339355, "grad_norm": 38.723793029785156, "learning_rate": 7.240939871891699e-09, "logits/chosen": -1.0590343475341797, "logits/rejected": -1.0402554273605347, "logps/chosen": -815.1234741210938, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -1174.96044921875, "loss": 1.1545, "margin_dpo/margin_mean": 351.2921142578125, "margin_dpo/margin_std": 629.5093994140625, "step": 635 }, { "KL/chosen_KL_mean": -695.1683959960938, "KL/mean": -907.8634033203125, "KL/rejected_KL_mean": -1120.558349609375, "KL/std": 626.8406372070312, "epoch": 0.933920704845815, "fcm_dpo/beta": 0.0010084551759064198, "fcm_dpo/delta": -0.03108617290854454, "fcm_dpo/margin": 425.3899841308594, "fcm_dpo/q_t": 0.40682026743888855, "grad_norm": 27.428804397583008, "learning_rate": 6.937510679537628e-09, "logits/chosen": -0.974394679069519, "logits/rejected": -0.9779649972915649, "logps/chosen": -754.7973022460938, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -1202.537109375, "loss": 1.0923, "margin_dpo/margin_mean": 425.3899841308594, "margin_dpo/margin_std": 645.3447265625, "step": 636 }, { "KL/chosen_KL_mean": -712.531005859375, "KL/mean": -942.2177124023438, "KL/rejected_KL_mean": -1171.904541015625, "KL/std": 613.010009765625, "epoch": 0.9353891336270191, "fcm_dpo/beta": 0.0009865246247500181, "fcm_dpo/delta": -0.0564747154712677, "fcm_dpo/margin": 459.3734436035156, "fcm_dpo/q_t": 0.3993714153766632, "grad_norm": 29.097070693969727, "learning_rate": 6.640486409826785e-09, "logits/chosen": -1.068098545074463, "logits/rejected": -1.115422248840332, "logps/chosen": -762.1837158203125, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -1270.3095703125, "loss": 1.0719, "margin_dpo/margin_mean": 459.3734130859375, "margin_dpo/margin_std": 634.1755981445312, "step": 637 }, { "KL/chosen_KL_mean": -675.932373046875, "KL/mean": -855.3536987304688, "KL/rejected_KL_mean": -1034.775146484375, "KL/std": 574.1265869140625, "epoch": 0.9368575624082232, "fcm_dpo/beta": 0.0009809336625039577, "fcm_dpo/delta": -0.07716827094554901, "fcm_dpo/margin": 358.84271240234375, "fcm_dpo/q_t": 0.41367873549461365, "grad_norm": 43.48149490356445, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.966257631778717, "logits/rejected": -0.9517063498497009, "logps/chosen": -734.0889892578125, "logps/ref_chosen": -58.156639099121094, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -1114.0765380859375, "loss": 1.1675, "margin_dpo/margin_mean": 358.84271240234375, "margin_dpo/margin_std": 657.916748046875, "step": 638 }, { "KL/chosen_KL_mean": -934.8937377929688, "KL/mean": -1017.8052368164062, "KL/rejected_KL_mean": -1100.716796875, "KL/std": 551.710205078125, "epoch": 0.9383259911894273, "fcm_dpo/beta": 0.0009733641054481268, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 165.8230743408203, "fcm_dpo/q_t": 0.4636532962322235, "grad_norm": 106.04683685302734, "learning_rate": 6.065683776815933e-09, "logits/chosen": -0.9813928604125977, "logits/rejected": -0.9263367652893066, "logps/chosen": -1007.2169189453125, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -1174.99169921875, "loss": 1.3435, "margin_dpo/margin_mean": 165.82305908203125, "margin_dpo/margin_std": 706.1915893554688, "step": 639 }, { "KL/chosen_KL_mean": -715.442626953125, "KL/mean": -994.8519897460938, "KL/rejected_KL_mean": -1274.2613525390625, "KL/std": 640.503173828125, "epoch": 0.9397944199706314, "fcm_dpo/beta": 0.0009570815600454807, "fcm_dpo/delta": -0.14221924543380737, "fcm_dpo/margin": 558.8187255859375, "fcm_dpo/q_t": 0.38426363468170166, "grad_norm": 39.72331237792969, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.9474629163742065, "logits/rejected": -0.9777064919471741, "logps/chosen": -771.5770263671875, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -1382.8614501953125, "loss": 1.0338, "margin_dpo/margin_mean": 558.8187255859375, "margin_dpo/margin_std": 753.1952514648438, "step": 640 }, { "KL/chosen_KL_mean": -827.5257568359375, "KL/mean": -1006.5942993164062, "KL/rejected_KL_mean": -1185.662841796875, "KL/std": 546.6756591796875, "epoch": 0.9412628487518355, "fcm_dpo/beta": 0.0009527778020128608, "fcm_dpo/delta": 0.060891155153512955, "fcm_dpo/margin": 358.13702392578125, "fcm_dpo/q_t": 0.42465877532958984, "grad_norm": 35.67609405517578, "learning_rate": 5.516592558795746e-09, "logits/chosen": -1.007719874382019, "logits/rejected": -1.0168031454086304, "logps/chosen": -892.5226440429688, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -1272.6551513671875, "loss": 1.1809, "margin_dpo/margin_mean": 358.1370544433594, "margin_dpo/margin_std": 718.3710327148438, "step": 641 }, { "KL/chosen_KL_mean": -792.3043212890625, "KL/mean": -1011.9464721679688, "KL/rejected_KL_mean": -1231.588623046875, "KL/std": 722.4783935546875, "epoch": 0.9427312775330396, "fcm_dpo/beta": 0.0009587721433490515, "fcm_dpo/delta": -0.022414878010749817, "fcm_dpo/margin": 439.2843017578125, "fcm_dpo/q_t": 0.41409242153167725, "grad_norm": 38.891204833984375, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.9521088600158691, "logits/rejected": -0.9873976707458496, "logps/chosen": -857.9935302734375, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -1341.830810546875, "loss": 1.1499, "margin_dpo/margin_mean": 439.2843017578125, "margin_dpo/margin_std": 870.07275390625, "step": 642 }, { "KL/chosen_KL_mean": -706.8209228515625, "KL/mean": -878.863037109375, "KL/rejected_KL_mean": -1050.905029296875, "KL/std": 530.0516357421875, "epoch": 0.9441997063142438, "fcm_dpo/beta": 0.0009463735623285174, "fcm_dpo/delta": -0.036860737949609756, "fcm_dpo/margin": 344.0841979980469, "fcm_dpo/q_t": 0.42428651452064514, "grad_norm": 37.40614700317383, "learning_rate": 4.993270631642038e-09, "logits/chosen": -1.0797677040100098, "logits/rejected": -1.0802643299102783, "logps/chosen": -758.7708740234375, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -1138.3734130859375, "loss": 1.1498, "margin_dpo/margin_mean": 344.0841979980469, "margin_dpo/margin_std": 523.859130859375, "step": 643 }, { "KL/chosen_KL_mean": -694.5718994140625, "KL/mean": -873.572021484375, "KL/rejected_KL_mean": -1052.5721435546875, "KL/std": 631.7380981445312, "epoch": 0.9456681350954479, "fcm_dpo/beta": 0.000954576360527426, "fcm_dpo/delta": 0.06034265458583832, "fcm_dpo/margin": 358.000244140625, "fcm_dpo/q_t": 0.4248698949813843, "grad_norm": 45.14137649536133, "learning_rate": 4.741290495811873e-09, "logits/chosen": -0.9851275682449341, "logits/rejected": -0.9928478002548218, "logps/chosen": -753.589599609375, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -1139.708740234375, "loss": 1.1823, "margin_dpo/margin_mean": 358.000244140625, "margin_dpo/margin_std": 717.85693359375, "step": 644 }, { "KL/chosen_KL_mean": -724.6256103515625, "KL/mean": -813.5394287109375, "KL/rejected_KL_mean": -902.4532470703125, "KL/std": 482.83013916015625, "epoch": 0.947136563876652, "fcm_dpo/beta": 0.0009710404556244612, "fcm_dpo/delta": 0.06831113994121552, "fcm_dpo/margin": 177.82762145996094, "fcm_dpo/q_t": 0.4625673294067383, "grad_norm": 88.68705749511719, "learning_rate": 4.495773155069299e-09, "logits/chosen": -0.9806017875671387, "logits/rejected": -0.9682430028915405, "logps/chosen": -780.501708984375, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -1000.2340698242188, "loss": 1.3219, "margin_dpo/margin_mean": 177.8275909423828, "margin_dpo/margin_std": 645.446533203125, "step": 645 }, { "KL/chosen_KL_mean": -697.8182373046875, "KL/mean": -850.4000244140625, "KL/rejected_KL_mean": -1002.981689453125, "KL/std": 472.5347900390625, "epoch": 0.9486049926578561, "fcm_dpo/beta": 0.0009829029440879822, "fcm_dpo/delta": 0.1032671183347702, "fcm_dpo/margin": 305.16351318359375, "fcm_dpo/q_t": 0.432457834482193, "grad_norm": 52.068904876708984, "learning_rate": 4.256725079024553e-09, "logits/chosen": -1.0154389142990112, "logits/rejected": -0.9988906383514404, "logps/chosen": -759.093994140625, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -1080.487548828125, "loss": 1.1842, "margin_dpo/margin_mean": 305.1635437011719, "margin_dpo/margin_std": 565.9290771484375, "step": 646 }, { "KL/chosen_KL_mean": -608.2819213867188, "KL/mean": -799.4105224609375, "KL/rejected_KL_mean": -990.5390625, "KL/std": 541.8460693359375, "epoch": 0.9500734214390602, "fcm_dpo/beta": 0.0009943554177880287, "fcm_dpo/delta": 0.02068711817264557, "fcm_dpo/margin": 382.257080078125, "fcm_dpo/q_t": 0.41246411204338074, "grad_norm": 27.42867088317871, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.8789236545562744, "logits/rejected": -0.9076966047286987, "logps/chosen": -663.1343383789062, "logps/ref_chosen": -54.8524169921875, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -1084.0584716796875, "loss": 1.1078, "margin_dpo/margin_mean": 382.257080078125, "margin_dpo/margin_std": 527.3770751953125, "step": 647 }, { "KL/chosen_KL_mean": -657.1648559570312, "KL/mean": -943.3175659179688, "KL/rejected_KL_mean": -1229.47021484375, "KL/std": 657.721435546875, "epoch": 0.9515418502202643, "fcm_dpo/beta": 0.0009720301604829729, "fcm_dpo/delta": -0.16539156436920166, "fcm_dpo/margin": 572.3054809570312, "fcm_dpo/q_t": 0.38105693459510803, "grad_norm": 28.24399757385254, "learning_rate": 3.798061746947995e-09, "logits/chosen": -1.045109510421753, "logits/rejected": -1.1041361093521118, "logps/chosen": -711.3363037109375, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.7127914428711, "logps/rejected": -1328.18310546875, "loss": 1.0232, "margin_dpo/margin_mean": 572.3054809570312, "margin_dpo/margin_std": 778.92529296875, "step": 648 }, { "KL/chosen_KL_mean": -681.6015625, "KL/mean": -813.9896240234375, "KL/rejected_KL_mean": -946.3775634765625, "KL/std": 507.69940185546875, "epoch": 0.9530102790014684, "fcm_dpo/beta": 0.000985685153864324, "fcm_dpo/delta": 0.1423780918121338, "fcm_dpo/margin": 264.7760009765625, "fcm_dpo/q_t": 0.4443369209766388, "grad_norm": 29.93989372253418, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -1.052908182144165, "logits/rejected": -1.044716238975525, "logps/chosen": -744.0819091796875, "logps/ref_chosen": -62.480350494384766, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -1026.454833984375, "loss": 1.2366, "margin_dpo/margin_mean": 264.7760009765625, "margin_dpo/margin_std": 629.158935546875, "step": 649 }, { "KL/chosen_KL_mean": -741.3469848632812, "KL/mean": -952.0729370117188, "KL/rejected_KL_mean": -1162.798828125, "KL/std": 635.6630859375, "epoch": 0.9544787077826725, "fcm_dpo/beta": 0.0009861743310466409, "fcm_dpo/delta": -0.016445815563201904, "fcm_dpo/margin": 421.45196533203125, "fcm_dpo/q_t": 0.40908634662628174, "grad_norm": 31.88619041442871, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -1.0458638668060303, "logits/rejected": -1.0673601627349854, "logps/chosen": -797.4398193359375, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -1261.063720703125, "loss": 1.129, "margin_dpo/margin_mean": 421.45196533203125, "margin_dpo/margin_std": 725.9293212890625, "step": 650 }, { "KL/chosen_KL_mean": -497.8349304199219, "KL/mean": -769.283203125, "KL/rejected_KL_mean": -1040.7314453125, "KL/std": 584.7640380859375, "epoch": 0.9559471365638766, "fcm_dpo/beta": 0.000967178144492209, "fcm_dpo/delta": -0.13220591843128204, "fcm_dpo/margin": 542.8966064453125, "fcm_dpo/q_t": 0.3810211718082428, "grad_norm": 43.920989990234375, "learning_rate": 3.158738163478475e-09, "logits/chosen": -1.0429898500442505, "logits/rejected": -1.0994410514831543, "logps/chosen": -541.2603759765625, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.95791625976562, "logps/rejected": -1140.689453125, "loss": 1.0016, "margin_dpo/margin_mean": 542.8966064453125, "margin_dpo/margin_std": 594.7532958984375, "step": 651 }, { "KL/chosen_KL_mean": -632.5731201171875, "KL/mean": -839.5629272460938, "KL/rejected_KL_mean": -1046.552734375, "KL/std": 598.0744018554688, "epoch": 0.9574155653450808, "fcm_dpo/beta": 0.0009628928382880986, "fcm_dpo/delta": 0.0014214273542165756, "fcm_dpo/margin": 413.9795837402344, "fcm_dpo/q_t": 0.41178420186042786, "grad_norm": 32.01892852783203, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -1.0343468189239502, "logits/rejected": -1.0592379570007324, "logps/chosen": -695.14990234375, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -1158.320556640625, "loss": 1.1247, "margin_dpo/margin_mean": 413.9796142578125, "margin_dpo/margin_std": 687.262939453125, "step": 652 }, { "KL/chosen_KL_mean": -769.5801391601562, "KL/mean": -955.7160034179688, "KL/rejected_KL_mean": -1141.851806640625, "KL/std": 637.4415283203125, "epoch": 0.9588839941262849, "fcm_dpo/beta": 0.0009692892199382186, "fcm_dpo/delta": 0.04052945226430893, "fcm_dpo/margin": 372.2716064453125, "fcm_dpo/q_t": 0.4203724265098572, "grad_norm": 33.566246032714844, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -1.0515234470367432, "logits/rejected": -1.0738110542297363, "logps/chosen": -830.693115234375, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -1245.101318359375, "loss": 1.1492, "margin_dpo/margin_mean": 372.2716369628906, "margin_dpo/margin_std": 651.8836669921875, "step": 653 }, { "KL/chosen_KL_mean": -688.090087890625, "KL/mean": -868.0014038085938, "KL/rejected_KL_mean": -1047.9127197265625, "KL/std": 508.9238586425781, "epoch": 0.960352422907489, "fcm_dpo/beta": 0.0009763325797393918, "fcm_dpo/delta": 0.050495948642492294, "fcm_dpo/margin": 359.82257080078125, "fcm_dpo/q_t": 0.4218224287033081, "grad_norm": 31.80510711669922, "learning_rate": 2.577954022936174e-09, "logits/chosen": -1.045449137687683, "logits/rejected": -1.0607552528381348, "logps/chosen": -749.8182373046875, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -1146.6865234375, "loss": 1.1428, "margin_dpo/margin_mean": 359.8226013183594, "margin_dpo/margin_std": 591.5650634765625, "step": 654 }, { "KL/chosen_KL_mean": -654.7279663085938, "KL/mean": -843.5281982421875, "KL/rejected_KL_mean": -1032.328369140625, "KL/std": 518.3046264648438, "epoch": 0.9618208516886931, "fcm_dpo/beta": 0.0009844074957072735, "fcm_dpo/delta": 0.029366791248321533, "fcm_dpo/margin": 377.60040283203125, "fcm_dpo/q_t": 0.41697460412979126, "grad_norm": 26.179494857788086, "learning_rate": 2.397392281198729e-09, "logits/chosen": -1.0613682270050049, "logits/rejected": -1.1010310649871826, "logps/chosen": -704.3048095703125, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -1130.6201171875, "loss": 1.1326, "margin_dpo/margin_mean": 377.6003723144531, "margin_dpo/margin_std": 612.6904296875, "step": 655 }, { "KL/chosen_KL_mean": -706.594970703125, "KL/mean": -1030.3953857421875, "KL/rejected_KL_mean": -1354.19580078125, "KL/std": 699.4261474609375, "epoch": 0.9632892804698973, "fcm_dpo/beta": 0.0009554917924106121, "fcm_dpo/delta": -0.23265045881271362, "fcm_dpo/margin": 647.600830078125, "fcm_dpo/q_t": 0.364484578371048, "grad_norm": 79.35772705078125, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.9079943299293518, "logits/rejected": -0.978103518486023, "logps/chosen": -759.1444091796875, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -1467.870361328125, "loss": 0.957, "margin_dpo/margin_mean": 647.600830078125, "margin_dpo/margin_std": 704.768310546875, "step": 656 }, { "KL/chosen_KL_mean": -660.0347290039062, "KL/mean": -919.8479614257812, "KL/rejected_KL_mean": -1179.6611328125, "KL/std": 662.580810546875, "epoch": 0.9647577092511013, "fcm_dpo/beta": 0.0009273520554415882, "fcm_dpo/delta": -0.08619820326566696, "fcm_dpo/margin": 519.6265258789062, "fcm_dpo/q_t": 0.3917388916015625, "grad_norm": 33.481868743896484, "learning_rate": 2.055847060721566e-09, "logits/chosen": -1.0952857732772827, "logits/rejected": -1.1410545110702515, "logps/chosen": -706.7352294921875, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -1277.5760498046875, "loss": 1.051, "margin_dpo/margin_mean": 519.6265258789062, "margin_dpo/margin_std": 686.9745483398438, "step": 657 }, { "KL/chosen_KL_mean": -716.8538818359375, "KL/mean": -926.1987915039062, "KL/rejected_KL_mean": -1135.543701171875, "KL/std": 523.1241455078125, "epoch": 0.9662261380323054, "fcm_dpo/beta": 0.0009218085906468332, "fcm_dpo/delta": 0.014132943004369736, "fcm_dpo/margin": 418.6898193359375, "fcm_dpo/q_t": 0.4112616181373596, "grad_norm": 32.001991271972656, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -1.0506086349487305, "logits/rejected": -1.0856657028198242, "logps/chosen": -777.8120727539062, "logps/ref_chosen": -60.95820999145508, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -1231.483154296875, "loss": 1.1103, "margin_dpo/margin_mean": 418.6898193359375, "margin_dpo/margin_std": 594.6806640625, "step": 658 }, { "KL/chosen_KL_mean": -614.8621826171875, "KL/mean": -816.10400390625, "KL/rejected_KL_mean": -1017.3458251953125, "KL/std": 519.0294189453125, "epoch": 0.9676945668135095, "fcm_dpo/beta": 0.0009279932710342109, "fcm_dpo/delta": 0.02739275060594082, "fcm_dpo/margin": 402.483642578125, "fcm_dpo/q_t": 0.4154208302497864, "grad_norm": 38.490875244140625, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.9568224549293518, "logits/rejected": -0.9347273111343384, "logps/chosen": -691.6051635742188, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -1104.8167724609375, "loss": 1.1164, "margin_dpo/margin_mean": 402.483642578125, "margin_dpo/margin_std": 586.9424438476562, "step": 659 }, { "KL/chosen_KL_mean": -670.32568359375, "KL/mean": -932.9974365234375, "KL/rejected_KL_mean": -1195.6693115234375, "KL/std": 613.5252685546875, "epoch": 0.9691629955947136, "fcm_dpo/beta": 0.0009214339079335332, "fcm_dpo/delta": -0.08826512098312378, "fcm_dpo/margin": 525.3435668945312, "fcm_dpo/q_t": 0.3903365135192871, "grad_norm": 45.50758743286133, "learning_rate": 1.592541096695571e-09, "logits/chosen": -1.0583207607269287, "logits/rejected": -1.0816309452056885, "logps/chosen": -729.3735961914062, "logps/ref_chosen": -59.04788589477539, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -1271.62939453125, "loss": 1.0413, "margin_dpo/margin_mean": 525.3435668945312, "margin_dpo/margin_std": 652.117431640625, "step": 660 }, { "KL/chosen_KL_mean": -593.3245849609375, "KL/mean": -826.2509765625, "KL/rejected_KL_mean": -1059.177490234375, "KL/std": 661.68408203125, "epoch": 0.9706314243759178, "fcm_dpo/beta": 0.0009160168119706213, "fcm_dpo/delta": -0.028154436498880386, "fcm_dpo/margin": 465.85284423828125, "fcm_dpo/q_t": 0.40546107292175293, "grad_norm": 44.195091247558594, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -1.0162544250488281, "logits/rejected": -1.0294699668884277, "logps/chosen": -643.99853515625, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -1145.18310546875, "loss": 1.0806, "margin_dpo/margin_mean": 465.85284423828125, "margin_dpo/margin_std": 654.1995849609375, "step": 661 }, { "KL/chosen_KL_mean": -686.9578857421875, "KL/mean": -869.9920043945312, "KL/rejected_KL_mean": -1053.026123046875, "KL/std": 554.1969604492188, "epoch": 0.9720998531571219, "fcm_dpo/beta": 0.0009198813932016492, "fcm_dpo/delta": 0.06546258926391602, "fcm_dpo/margin": 366.068359375, "fcm_dpo/q_t": 0.42443883419036865, "grad_norm": 27.159738540649414, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -0.9714980125427246, "logits/rejected": -0.9738001823425293, "logps/chosen": -756.2188720703125, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -1142.08203125, "loss": 1.1657, "margin_dpo/margin_mean": 366.068359375, "margin_dpo/margin_std": 672.253662109375, "step": 662 }, { "KL/chosen_KL_mean": -672.4644775390625, "KL/mean": -889.0924072265625, "KL/rejected_KL_mean": -1105.72021484375, "KL/std": 637.3165283203125, "epoch": 0.973568281938326, "fcm_dpo/beta": 0.0009213130106218159, "fcm_dpo/delta": 0.0008019153028726578, "fcm_dpo/margin": 433.2557373046875, "fcm_dpo/q_t": 0.4122500717639923, "grad_norm": 26.556825637817383, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -1.0173933506011963, "logits/rejected": -1.0415921211242676, "logps/chosen": -737.3433837890625, "logps/ref_chosen": -64.87890625, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -1219.6456298828125, "loss": 1.1271, "margin_dpo/margin_mean": 433.25579833984375, "margin_dpo/margin_std": 735.6624145507812, "step": 663 }, { "KL/chosen_KL_mean": -696.9286499023438, "KL/mean": -946.314208984375, "KL/rejected_KL_mean": -1195.69970703125, "KL/std": 625.337158203125, "epoch": 0.9750367107195301, "fcm_dpo/beta": 0.0009112852858379483, "fcm_dpo/delta": -0.057732854038476944, "fcm_dpo/margin": 498.7710266113281, "fcm_dpo/q_t": 0.39850109815597534, "grad_norm": 29.249494552612305, "learning_rate": 1.066455926241383e-09, "logits/chosen": -1.0290577411651611, "logits/rejected": -1.0633100271224976, "logps/chosen": -757.817138671875, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -1301.221435546875, "loss": 1.0681, "margin_dpo/margin_mean": 498.77105712890625, "margin_dpo/margin_std": 673.9359130859375, "step": 664 }, { "KL/chosen_KL_mean": -624.91357421875, "KL/mean": -831.52734375, "KL/rejected_KL_mean": -1038.14111328125, "KL/std": 521.3961791992188, "epoch": 0.9765051395007343, "fcm_dpo/beta": 0.0009156306041404605, "fcm_dpo/delta": 0.02248411625623703, "fcm_dpo/margin": 413.22760009765625, "fcm_dpo/q_t": 0.4127081632614136, "grad_norm": 39.16621780395508, "learning_rate": 9.513254770636137e-10, "logits/chosen": -1.1201207637786865, "logits/rejected": -1.1449217796325684, "logps/chosen": -685.4776611328125, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.80882263183594, "logps/rejected": -1122.949951171875, "loss": 1.0981, "margin_dpo/margin_mean": 413.22760009765625, "margin_dpo/margin_std": 524.672607421875, "step": 665 }, { "KL/chosen_KL_mean": -664.80712890625, "KL/mean": -876.2521362304688, "KL/rejected_KL_mean": -1087.697021484375, "KL/std": 528.6064453125, "epoch": 0.9779735682819384, "fcm_dpo/beta": 0.0009172533173114061, "fcm_dpo/delta": 0.012579199858009815, "fcm_dpo/margin": 422.88995361328125, "fcm_dpo/q_t": 0.41190439462661743, "grad_norm": 25.72242546081543, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.9221373200416565, "logits/rejected": -0.9332491755485535, "logps/chosen": -729.2271118164062, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.8916244506836, "logps/rejected": -1183.5887451171875, "loss": 1.106, "margin_dpo/margin_mean": 422.88995361328125, "margin_dpo/margin_std": 591.49951171875, "step": 666 }, { "KL/chosen_KL_mean": -747.1198120117188, "KL/mean": -997.51904296875, "KL/rejected_KL_mean": -1247.918212890625, "KL/std": 593.6838989257812, "epoch": 0.9794419970631424, "fcm_dpo/beta": 0.0009089080849662423, "fcm_dpo/delta": -0.058049269020557404, "fcm_dpo/margin": 500.79840087890625, "fcm_dpo/q_t": 0.3967137038707733, "grad_norm": 35.70097732543945, "learning_rate": 7.407554321417764e-10, "logits/chosen": -0.9862961769104004, "logits/rejected": -0.9898433089256287, "logps/chosen": -816.3968505859375, "logps/ref_chosen": -69.27702331542969, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -1335.753662109375, "loss": 1.0614, "margin_dpo/margin_mean": 500.79840087890625, "margin_dpo/margin_std": 649.6275634765625, "step": 667 }, { "KL/chosen_KL_mean": -799.2650756835938, "KL/mean": -969.44189453125, "KL/rejected_KL_mean": -1139.61865234375, "KL/std": 626.5878295898438, "epoch": 0.9809104258443465, "fcm_dpo/beta": 0.0009242600062862039, "fcm_dpo/delta": 0.08742087334394455, "fcm_dpo/margin": 340.3535461425781, "fcm_dpo/q_t": 0.43164360523223877, "grad_norm": 46.50619888305664, "learning_rate": 6.453213851142225e-10, "logits/chosen": -1.027015209197998, "logits/rejected": -1.0318031311035156, "logps/chosen": -871.8690795898438, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905944824219, "logps/rejected": -1243.357666015625, "loss": 1.2061, "margin_dpo/margin_mean": 340.35357666015625, "margin_dpo/margin_std": 745.7200927734375, "step": 668 }, { "KL/chosen_KL_mean": -607.0186157226562, "KL/mean": -848.4479370117188, "KL/rejected_KL_mean": -1089.877197265625, "KL/std": 578.434326171875, "epoch": 0.9823788546255506, "fcm_dpo/beta": 0.0009196557221002877, "fcm_dpo/delta": -0.04609519988298416, "fcm_dpo/margin": 482.8586120605469, "fcm_dpo/q_t": 0.39873257279396057, "grad_norm": 24.051504135131836, "learning_rate": 5.564580657695939e-10, "logits/chosen": -1.0002648830413818, "logits/rejected": -1.0088210105895996, "logps/chosen": -653.135009765625, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -1167.801513671875, "loss": 1.0728, "margin_dpo/margin_mean": 482.8586120605469, "margin_dpo/margin_std": 649.423583984375, "step": 669 }, { "KL/chosen_KL_mean": -589.9717407226562, "KL/mean": -834.8968505859375, "KL/rejected_KL_mean": -1079.822021484375, "KL/std": 548.0260009765625, "epoch": 0.9838472834067548, "fcm_dpo/beta": 0.0009132723789662123, "fcm_dpo/delta": -0.04978980869054794, "fcm_dpo/margin": 489.85028076171875, "fcm_dpo/q_t": 0.3985205292701721, "grad_norm": 27.57679557800293, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.9369679689407349, "logits/rejected": -0.956099271774292, "logps/chosen": -652.3175048828125, "logps/ref_chosen": -62.34575271606445, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -1176.7625732421875, "loss": 1.0764, "margin_dpo/margin_mean": 489.85028076171875, "margin_dpo/margin_std": 666.9021606445312, "step": 670 }, { "KL/chosen_KL_mean": -739.0850219726562, "KL/mean": -940.6197509765625, "KL/rejected_KL_mean": -1142.154541015625, "KL/std": 546.3319091796875, "epoch": 0.9853157121879589, "fcm_dpo/beta": 0.0009144209325313568, "fcm_dpo/delta": 0.03224237263202667, "fcm_dpo/margin": 403.0694580078125, "fcm_dpo/q_t": 0.4150552749633789, "grad_norm": 31.614513397216797, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -1.047501564025879, "logits/rejected": -1.0776722431182861, "logps/chosen": -787.0851440429688, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -1225.973876953125, "loss": 1.1359, "margin_dpo/margin_mean": 403.0694885253906, "margin_dpo/margin_std": 654.9969482421875, "step": 671 }, { "KL/chosen_KL_mean": -817.5101318359375, "KL/mean": -1022.951904296875, "KL/rejected_KL_mean": -1228.3935546875, "KL/std": 672.7301025390625, "epoch": 0.986784140969163, "fcm_dpo/beta": 0.0009142364142462611, "fcm_dpo/delta": 0.02527567557990551, "fcm_dpo/margin": 410.8834228515625, "fcm_dpo/q_t": 0.4172729551792145, "grad_norm": 50.31674575805664, "learning_rate": 3.293150240547549e-10, "logits/chosen": -1.097043514251709, "logits/rejected": -1.1035444736480713, "logps/chosen": -876.0934448242188, "logps/ref_chosen": -58.58328628540039, "logps/ref_rejected": -93.14015197753906, "logps/rejected": -1321.53369140625, "loss": 1.1531, "margin_dpo/margin_mean": 410.8834228515625, "margin_dpo/margin_std": 742.9523315429688, "step": 672 }, { "KL/chosen_KL_mean": -726.111572265625, "KL/mean": -917.39892578125, "KL/rejected_KL_mean": -1108.686279296875, "KL/std": 555.8173828125, "epoch": 0.9882525697503671, "fcm_dpo/beta": 0.0009239012142643332, "fcm_dpo/delta": 0.048241592943668365, "fcm_dpo/margin": 382.5746765136719, "fcm_dpo/q_t": 0.41999146342277527, "grad_norm": 33.52194595336914, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -1.051267147064209, "logits/rejected": -1.0552277565002441, "logps/chosen": -772.8348388671875, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -1193.982421875, "loss": 1.1356, "margin_dpo/margin_mean": 382.5746765136719, "margin_dpo/margin_std": 613.573974609375, "step": 673 }, { "KL/chosen_KL_mean": -596.3148803710938, "KL/mean": -841.9620971679688, "KL/rejected_KL_mean": -1087.6092529296875, "KL/std": 549.2061157226562, "epoch": 0.9897209985315712, "fcm_dpo/beta": 0.0009207893162965775, "fcm_dpo/delta": -0.05484557896852493, "fcm_dpo/margin": 491.2944641113281, "fcm_dpo/q_t": 0.39986640214920044, "grad_norm": 42.7830810546875, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.9467175602912903, "logits/rejected": -0.96770179271698, "logps/chosen": -641.7603759765625, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -1157.6552734375, "loss": 1.0662, "margin_dpo/margin_mean": 491.29443359375, "margin_dpo/margin_std": 659.54931640625, "step": 674 }, { "KL/chosen_KL_mean": -681.6663818359375, "KL/mean": -935.557861328125, "KL/rejected_KL_mean": -1189.44921875, "KL/std": 612.2513427734375, "epoch": 0.9911894273127754, "fcm_dpo/beta": 0.0009013206581585109, "fcm_dpo/delta": -0.06186992675065994, "fcm_dpo/margin": 507.7829284667969, "fcm_dpo/q_t": 0.3982793688774109, "grad_norm": 28.38162612915039, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -1.023393154144287, "logits/rejected": -1.0347087383270264, "logps/chosen": -725.8426513671875, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -1263.541259765625, "loss": 1.0636, "margin_dpo/margin_mean": 507.782958984375, "margin_dpo/margin_std": 652.8712158203125, "step": 675 }, { "KL/chosen_KL_mean": -729.031005859375, "KL/mean": -965.601318359375, "KL/rejected_KL_mean": -1202.1717529296875, "KL/std": 577.6213989257812, "epoch": 0.9926578560939795, "fcm_dpo/beta": 0.0009026783518493176, "fcm_dpo/delta": -0.028323372825980186, "fcm_dpo/margin": 473.1407775878906, "fcm_dpo/q_t": 0.4024215042591095, "grad_norm": 25.832805633544922, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -0.9483187198638916, "logits/rejected": -0.9624805450439453, "logps/chosen": -800.4295043945312, "logps/ref_chosen": -71.39852905273438, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -1290.530517578125, "loss": 1.0689, "margin_dpo/margin_mean": 473.14080810546875, "margin_dpo/margin_std": 579.1337280273438, "step": 676 }, { "KL/chosen_KL_mean": -734.3248291015625, "KL/mean": -961.4959716796875, "KL/rejected_KL_mean": -1188.667236328125, "KL/std": 600.77294921875, "epoch": 0.9941262848751835, "fcm_dpo/beta": 0.0008973192889243364, "fcm_dpo/delta": -0.008067594841122627, "fcm_dpo/margin": 454.34246826171875, "fcm_dpo/q_t": 0.41222789883613586, "grad_norm": 29.9565372467041, "learning_rate": 8.23423165278725e-11, "logits/chosen": -1.0780959129333496, "logits/rejected": -1.0737848281860352, "logps/chosen": -790.8522338867188, "logps/ref_chosen": -56.527435302734375, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -1266.893798828125, "loss": 1.1093, "margin_dpo/margin_mean": 454.34246826171875, "margin_dpo/margin_std": 722.945068359375, "step": 677 }, { "KL/chosen_KL_mean": -620.3502197265625, "KL/mean": -886.8961181640625, "KL/rejected_KL_mean": -1153.44189453125, "KL/std": 635.0799560546875, "epoch": 0.9955947136563876, "fcm_dpo/beta": 0.0008914459031075239, "fcm_dpo/delta": -0.07900315523147583, "fcm_dpo/margin": 533.0916137695312, "fcm_dpo/q_t": 0.39344462752342224, "grad_norm": 33.316654205322266, "learning_rate": 5.270012410216185e-11, "logits/chosen": -0.9998750686645508, "logits/rejected": -1.036036491394043, "logps/chosen": -666.4847412109375, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -1234.0465087890625, "loss": 1.058, "margin_dpo/margin_mean": 533.0916137695312, "margin_dpo/margin_std": 715.0037841796875, "step": 678 }, { "KL/chosen_KL_mean": -701.3464965820312, "KL/mean": -892.486328125, "KL/rejected_KL_mean": -1083.626220703125, "KL/std": 525.0972900390625, "epoch": 0.9970631424375918, "fcm_dpo/beta": 0.0008914553327485919, "fcm_dpo/delta": 0.06134221330285072, "fcm_dpo/margin": 382.27972412109375, "fcm_dpo/q_t": 0.42303356528282166, "grad_norm": 31.741161346435547, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -1.020527720451355, "logits/rejected": -1.010463833808899, "logps/chosen": -751.6414184570312, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -1160.224365234375, "loss": 1.1415, "margin_dpo/margin_mean": 382.2797546386719, "margin_dpo/margin_std": 604.84912109375, "step": 679 }, { "KL/chosen_KL_mean": -712.27880859375, "KL/mean": -966.59423828125, "KL/rejected_KL_mean": -1220.90966796875, "KL/std": 689.5088500976562, "epoch": 0.9985315712187959, "fcm_dpo/beta": 0.0008838686626404524, "fcm_dpo/delta": -0.05259976163506508, "fcm_dpo/margin": 508.6307678222656, "fcm_dpo/q_t": 0.3975260853767395, "grad_norm": 40.12180709838867, "learning_rate": 1.31753782067201e-11, "logits/chosen": -1.00642728805542, "logits/rejected": -1.0368506908416748, "logps/chosen": -789.1945190429688, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -1333.29443359375, "loss": 1.0861, "margin_dpo/margin_mean": 508.6307678222656, "margin_dpo/margin_std": 746.896484375, "step": 680 }, { "KL/chosen_KL_mean": -710.1919555664062, "KL/mean": -909.7540283203125, "KL/rejected_KL_mean": -1109.316162109375, "KL/std": 567.23486328125, "epoch": 1.0, "fcm_dpo/beta": 0.0008977074176073074, "fcm_dpo/delta": 0.04187752678990364, "fcm_dpo/margin": 399.12420654296875, "fcm_dpo/q_t": 0.4188354015350342, "grad_norm": 33.870479583740234, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -1.0663830041885376, "logits/rejected": -1.0852856636047363, "logps/chosen": -771.1492309570312, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.55797576904297, "logps/rejected": -1197.8740234375, "loss": 1.1392, "margin_dpo/margin_mean": 399.1241760253906, "margin_dpo/margin_std": 622.114501953125, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 1.0911195537242244, "train_runtime": 1739.0324, "train_samples_per_second": 25.07, "train_steps_per_second": 0.392 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }