{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "KL/chosen_KL_mean": 0.00527191162109375, "KL/mean": 0.016706019639968872, "KL/rejected_KL_mean": 0.028141021728515625, "KL/std": 0.272699236869812, "epoch": 0.0014684287812041115, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.02287006378173828, "fcm_dpo/q_t": 0.501685619354248, "grad_norm": 251.27125549316406, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.3971, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "KL/chosen_KL_mean": -0.03498649597167969, "KL/mean": -0.00212840735912323, "KL/rejected_KL_mean": 0.030735015869140625, "KL/std": 0.24797174334526062, "epoch": 0.002936857562408223, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.06572261452674866, "fcm_dpo/q_t": 0.5049160718917847, "grad_norm": 217.6841278076172, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.49536412954330444, "logits/rejected": -0.4594460427761078, "logps/chosen": -52.65568923950195, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.4089, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "KL/chosen_KL_mean": 0.019153594970703125, "KL/mean": -0.0025722086429595947, "KL/rejected_KL_mean": -0.02429962158203125, "KL/std": 0.2354850471019745, "epoch": 0.004405286343612335, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.04345357418060303, "fcm_dpo/q_t": 0.49674931168556213, "grad_norm": 210.45652770996094, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.4816104471683502, "logits/rejected": -0.4421927034854889, "logps/chosen": -60.962440490722656, "logps/ref_chosen": -60.981597900390625, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.6968994140625, "loss": 1.3761, "margin_dpo/margin_mean": 0.04345354437828064, "margin_dpo/margin_std": 0.35039910674095154, "step": 3 }, { "KL/chosen_KL_mean": -0.001567840576171875, "KL/mean": 0.023254141211509705, "KL/rejected_KL_mean": 0.04807281494140625, "KL/std": 0.26486122608184814, "epoch": 0.005873715124816446, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.04963979125022888, "fcm_dpo/q_t": 0.5037118196487427, "grad_norm": 217.65200805664062, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.4678453207015991, "logits/rejected": -0.4402541518211365, "logps/chosen": -56.76927947998047, "logps/ref_chosen": -56.7677116394043, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.59903717041016, "loss": 1.4047, "margin_dpo/margin_mean": -0.04964029788970947, "margin_dpo/margin_std": 0.39235860109329224, "step": 4 }, { "KL/chosen_KL_mean": 0.030719757080078125, "KL/mean": 0.031741127371788025, "KL/rejected_KL_mean": 0.032764434814453125, "KL/std": 0.2725304961204529, "epoch": 0.007342143906020558, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.00204351544380188, "fcm_dpo/q_t": 0.5001676678657532, "grad_norm": 270.9964294433594, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.4972953498363495, "logits/rejected": -0.4523712396621704, "logps/chosen": -53.82865524291992, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.11642456054688, "loss": 1.3902, "margin_dpo/margin_mean": -0.0020435750484466553, "margin_dpo/margin_std": 0.37501761317253113, "step": 5 }, { "KL/chosen_KL_mean": -0.0196380615234375, "KL/mean": -0.020306527614593506, "KL/rejected_KL_mean": -0.02097320556640625, "KL/std": 0.27848026156425476, "epoch": 0.00881057268722467, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.001337289810180664, "fcm_dpo/q_t": 0.49989837408065796, "grad_norm": 276.3399963378906, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.5145661234855652, "logits/rejected": -0.47312256693840027, "logps/chosen": -63.027122497558594, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.66632080078125, "loss": 1.3894, "margin_dpo/margin_mean": 0.0013370811939239502, "margin_dpo/margin_std": 0.3880041539669037, "step": 6 }, { "KL/chosen_KL_mean": 0.03285026550292969, "KL/mean": 0.02877350151538849, "KL/rejected_KL_mean": 0.02469635009765625, "KL/std": 0.30477985739707947, "epoch": 0.010279001468428781, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.008149892091751099, "fcm_dpo/q_t": 0.49938228726387024, "grad_norm": 245.37692260742188, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5077540874481201, "logits/rejected": -0.47386452555656433, "logps/chosen": -57.74197006225586, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.8958969116211, "loss": 1.3873, "margin_dpo/margin_mean": 0.008150070905685425, "margin_dpo/margin_std": 0.38711145520210266, "step": 7 }, { "KL/chosen_KL_mean": -0.0022296905517578125, "KL/mean": 0.031146153807640076, "KL/rejected_KL_mean": 0.06452560424804688, "KL/std": 0.33025887608528137, "epoch": 0.011747430249632892, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.06675639748573303, "fcm_dpo/q_t": 0.5049271583557129, "grad_norm": 240.70013427734375, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5015411376953125, "logits/rejected": -0.47501832246780396, "logps/chosen": -58.718265533447266, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.24690246582031, "loss": 1.4116, "margin_dpo/margin_mean": -0.06675609946250916, "margin_dpo/margin_std": 0.47787904739379883, "step": 8 }, { "KL/chosen_KL_mean": 0.02227783203125, "KL/mean": -0.00015251338481903076, "KL/rejected_KL_mean": -0.0225830078125, "KL/std": 0.30635231733322144, "epoch": 0.013215859030837005, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.04486778378486633, "fcm_dpo/q_t": 0.49667075276374817, "grad_norm": 254.40870666503906, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.4882626235485077, "logits/rejected": -0.4411010444164276, "logps/chosen": -69.84456634521484, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.625244140625, "loss": 1.3778, "margin_dpo/margin_mean": 0.04486680030822754, "margin_dpo/margin_std": 0.4566071927547455, "step": 9 }, { "KL/chosen_KL_mean": 0.022472381591796875, "KL/mean": 0.017022237181663513, "KL/rejected_KL_mean": 0.01157379150390625, "KL/std": 0.24305114150047302, "epoch": 0.014684287812041116, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.010898560285568237, "fcm_dpo/q_t": 0.4991758465766907, "grad_norm": 212.15330505371094, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.4911458492279053, "logits/rejected": -0.4477323889732361, "logps/chosen": -48.33521270751953, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.36048889160156, "loss": 1.3858, "margin_dpo/margin_mean": 0.010898619890213013, "margin_dpo/margin_std": 0.34846025705337524, "step": 10 }, { "KL/chosen_KL_mean": -0.0001010894775390625, "KL/mean": 0.0009690821170806885, "KL/rejected_KL_mean": 0.00203704833984375, "KL/std": 0.291149377822876, "epoch": 0.016152716593245228, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.0021440982818603516, "fcm_dpo/q_t": 0.5001416802406311, "grad_norm": 207.24131774902344, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.4701375365257263, "logits/rejected": -0.4457797110080719, "logps/chosen": -53.0169563293457, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.77833557128906, "loss": 1.3909, "margin_dpo/margin_mean": -0.0021438300609588623, "margin_dpo/margin_std": 0.4191063344478607, "step": 11 }, { "KL/chosen_KL_mean": -0.010473251342773438, "KL/mean": -0.01090405136346817, "KL/rejected_KL_mean": -0.011325836181640625, "KL/std": 0.27011072635650635, "epoch": 0.01762114537444934, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0008537918329238892, "fcm_dpo/q_t": 0.4999362528324127, "grad_norm": 271.4062194824219, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.538188099861145, "logits/rejected": -0.5020288228988647, "logps/chosen": -61.81591033935547, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.8582763671875, "logps/rejected": -104.86959838867188, "loss": 1.3887, "margin_dpo/margin_mean": 0.0008526891469955444, "margin_dpo/margin_std": 0.3452816605567932, "step": 12 }, { "KL/chosen_KL_mean": 0.04018592834472656, "KL/mean": 0.009547561407089233, "KL/rejected_KL_mean": -0.021087646484375, "KL/std": 0.2959768772125244, "epoch": 0.01908957415565345, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.06127917766571045, "fcm_dpo/q_t": 0.495451420545578, "grad_norm": 236.69508361816406, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.4695357084274292, "logits/rejected": -0.44066792726516724, "logps/chosen": -64.22016906738281, "logps/ref_chosen": -64.2603530883789, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.22416687011719, "loss": 1.3727, "margin_dpo/margin_mean": 0.061279088258743286, "margin_dpo/margin_std": 0.44392725825309753, "step": 13 }, { "KL/chosen_KL_mean": -0.011384963989257812, "KL/mean": -0.02230377495288849, "KL/rejected_KL_mean": -0.03322601318359375, "KL/std": 0.2484772801399231, "epoch": 0.020558002936857563, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.021841615438461304, "fcm_dpo/q_t": 0.49836230278015137, "grad_norm": 255.34683227539062, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.46936067938804626, "logits/rejected": -0.4296714961528778, "logps/chosen": -58.12159729003906, "logps/ref_chosen": -58.11021041870117, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.08030700683594, "loss": 1.3824, "margin_dpo/margin_mean": 0.021842211484909058, "margin_dpo/margin_std": 0.34157758951187134, "step": 14 }, { "KL/chosen_KL_mean": -0.05281257629394531, "KL/mean": -0.04400016367435455, "KL/rejected_KL_mean": -0.035190582275390625, "KL/std": 0.252704918384552, "epoch": 0.022026431718061675, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.01762327551841736, "fcm_dpo/q_t": 0.501282811164856, "grad_norm": 193.75828552246094, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.5062054991722107, "logits/rejected": -0.4881584942340851, "logps/chosen": -57.01972198486328, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.84383392333984, "loss": 1.3952, "margin_dpo/margin_mean": -0.017623186111450195, "margin_dpo/margin_std": 0.3986828327178955, "step": 15 }, { "KL/chosen_KL_mean": -0.016290664672851562, "KL/mean": -0.02468542754650116, "KL/rejected_KL_mean": -0.033077239990234375, "KL/std": 0.264546275138855, "epoch": 0.023494860499265784, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.016786009073257446, "fcm_dpo/q_t": 0.49875974655151367, "grad_norm": 251.1475830078125, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.5262372493743896, "logits/rejected": -0.48537588119506836, "logps/chosen": -61.75618362426758, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.40254974365234, "loss": 1.3846, "margin_dpo/margin_mean": 0.016786575317382812, "margin_dpo/margin_std": 0.38438552618026733, "step": 16 }, { "KL/chosen_KL_mean": 0.045101165771484375, "KL/mean": -0.004545360803604126, "KL/rejected_KL_mean": -0.054195404052734375, "KL/std": 0.26345258951187134, "epoch": 0.024963289280469897, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.09929555654525757, "fcm_dpo/q_t": 0.492563396692276, "grad_norm": 233.72305297851562, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.4858800768852234, "logits/rejected": -0.44683146476745605, "logps/chosen": -67.66523742675781, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.43284606933594, "loss": 1.3596, "margin_dpo/margin_mean": 0.09929636120796204, "margin_dpo/margin_std": 0.3553627133369446, "step": 17 }, { "KL/chosen_KL_mean": 0.018472671508789062, "KL/mean": -0.009962007403373718, "KL/rejected_KL_mean": -0.03839874267578125, "KL/std": 0.21256747841835022, "epoch": 0.02643171806167401, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.05686947703361511, "fcm_dpo/q_t": 0.4957374036312103, "grad_norm": 245.1805877685547, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.5064246654510498, "logits/rejected": -0.45240044593811035, "logps/chosen": -47.72101593017578, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.51068115234375, "loss": 1.3715, "margin_dpo/margin_mean": 0.05686900019645691, "margin_dpo/margin_std": 0.30619317293167114, "step": 18 }, { "KL/chosen_KL_mean": 0.057403564453125, "KL/mean": 0.0009317547082901001, "KL/rejected_KL_mean": -0.05553436279296875, "KL/std": 0.27681607007980347, "epoch": 0.027900146842878122, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11293384432792664, "fcm_dpo/q_t": 0.49154412746429443, "grad_norm": 221.10816955566406, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.5035191774368286, "logits/rejected": -0.45468592643737793, "logps/chosen": -70.14795684814453, "logps/ref_chosen": -70.20536041259766, "logps/ref_rejected": -89.7575912475586, "logps/rejected": -89.81312561035156, "loss": 1.3562, "margin_dpo/margin_mean": 0.11293420195579529, "margin_dpo/margin_std": 0.39282259345054626, "step": 19 }, { "KL/chosen_KL_mean": 0.002490997314453125, "KL/mean": -0.047424912452697754, "KL/rejected_KL_mean": -0.0973358154296875, "KL/std": 0.25252386927604675, "epoch": 0.02936857562408223, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.09983032941818237, "fcm_dpo/q_t": 0.4925253391265869, "grad_norm": 218.9619903564453, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5584119558334351, "logits/rejected": -0.5027008652687073, "logps/chosen": -50.800750732421875, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.82334899902344, "logps/rejected": -78.92068481445312, "loss": 1.359, "margin_dpo/margin_mean": 0.0998302698135376, "margin_dpo/margin_std": 0.32752203941345215, "step": 20 }, { "KL/chosen_KL_mean": 0.0040454864501953125, "KL/mean": -0.04826641082763672, "KL/rejected_KL_mean": -0.10057830810546875, "KL/std": 0.28005871176719666, "epoch": 0.030837004405286344, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.1046256422996521, "fcm_dpo/q_t": 0.49217915534973145, "grad_norm": 227.61415100097656, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.4951311945915222, "logits/rejected": -0.4713231921195984, "logps/chosen": -50.0589714050293, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -77.96937561035156, "loss": 1.3583, "margin_dpo/margin_mean": 0.10462629795074463, "margin_dpo/margin_std": 0.3697164058685303, "step": 21 }, { "KL/chosen_KL_mean": 0.0416259765625, "KL/mean": -0.07057403028011322, "KL/rejected_KL_mean": -0.18277359008789062, "KL/std": 0.27579018473625183, "epoch": 0.032305433186490456, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.22439709305763245, "fcm_dpo/q_t": 0.48323309421539307, "grad_norm": 243.1344451904297, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.46972396969795227, "logits/rejected": -0.4252376854419708, "logps/chosen": -59.01601028442383, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.68744659423828, "loss": 1.3233, "margin_dpo/margin_mean": 0.22439703345298767, "margin_dpo/margin_std": 0.3758489489555359, "step": 22 }, { "KL/chosen_KL_mean": -0.0016460418701171875, "KL/mean": -0.07504256069660187, "KL/rejected_KL_mean": -0.14843368530273438, "KL/std": 0.2956269383430481, "epoch": 0.033773861967694566, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.14679017663002014, "fcm_dpo/q_t": 0.48901820182800293, "grad_norm": 235.14389038085938, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.4931301474571228, "logits/rejected": -0.470862478017807, "logps/chosen": -60.079341888427734, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.13955688476562, "logps/rejected": -81.2879867553711, "loss": 1.3475, "margin_dpo/margin_mean": 0.1467902660369873, "margin_dpo/margin_std": 0.4319424331188202, "step": 23 }, { "KL/chosen_KL_mean": 0.04685783386230469, "KL/mean": -0.06474106758832932, "KL/rejected_KL_mean": -0.1763458251953125, "KL/std": 0.2938792407512665, "epoch": 0.03524229074889868, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.2231999784708023, "fcm_dpo/q_t": 0.4833376705646515, "grad_norm": 242.87490844726562, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.5050971508026123, "logits/rejected": -0.48868709802627563, "logps/chosen": -44.244178771972656, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.30155944824219, "loss": 1.3241, "margin_dpo/margin_mean": 0.22320020198822021, "margin_dpo/margin_std": 0.3832412362098694, "step": 24 }, { "KL/chosen_KL_mean": 0.03863716125488281, "KL/mean": -0.07194776833057404, "KL/rejected_KL_mean": -0.18252944946289062, "KL/std": 0.36078929901123047, "epoch": 0.03671071953010279, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.22117015719413757, "fcm_dpo/q_t": 0.48354804515838623, "grad_norm": 215.2100830078125, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.5046179294586182, "logits/rejected": -0.47490301728248596, "logps/chosen": -52.4984130859375, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.52471923828125, "loss": 1.3255, "margin_dpo/margin_mean": 0.22116953134536743, "margin_dpo/margin_std": 0.4389370083808899, "step": 25 }, { "KL/chosen_KL_mean": 0.09479713439941406, "KL/mean": -0.10598999261856079, "KL/rejected_KL_mean": -0.3067779541015625, "KL/std": 0.4495195746421814, "epoch": 0.0381791483113069, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.40157991647720337, "fcm_dpo/q_t": 0.470198392868042, "grad_norm": 240.65769958496094, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5313920974731445, "logits/rejected": -0.49980974197387695, "logps/chosen": -53.82801055908203, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.66648864746094, "loss": 1.277, "margin_dpo/margin_mean": 0.40158015489578247, "margin_dpo/margin_std": 0.5663931965827942, "step": 26 }, { "KL/chosen_KL_mean": 0.09834671020507812, "KL/mean": -0.15826506912708282, "KL/rejected_KL_mean": -0.41487884521484375, "KL/std": 0.47776395082473755, "epoch": 0.039647577092511016, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.5132265090942383, "fcm_dpo/q_t": 0.4619525671005249, "grad_norm": 256.8159484863281, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.5238237977027893, "logits/rejected": -0.4873714745044708, "logps/chosen": -42.8001823425293, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72419738769531, "logps/rejected": -99.13908386230469, "loss": 1.2455, "margin_dpo/margin_mean": 0.5132263898849487, "margin_dpo/margin_std": 0.5646921992301941, "step": 27 }, { "KL/chosen_KL_mean": 0.015338897705078125, "KL/mean": -0.14923109114170074, "KL/rejected_KL_mean": -0.3137969970703125, "KL/std": 0.4466787576675415, "epoch": 0.041116005873715125, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.3291375935077667, "fcm_dpo/q_t": 0.4754894971847534, "grad_norm": 209.7417755126953, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.5104295611381531, "logits/rejected": -0.4556117355823517, "logps/chosen": -60.541160583496094, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.71491241455078, "loss": 1.299, "margin_dpo/margin_mean": 0.32913774251937866, "margin_dpo/margin_std": 0.6227332353591919, "step": 28 }, { "KL/chosen_KL_mean": 0.1466350555419922, "KL/mean": -0.15072329342365265, "KL/rejected_KL_mean": -0.44808197021484375, "KL/std": 0.5040621161460876, "epoch": 0.042584434654919234, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.5947138071060181, "fcm_dpo/q_t": 0.4558557868003845, "grad_norm": 247.98696899414062, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5724257826805115, "logits/rejected": -0.5275709629058838, "logps/chosen": -57.66114807128906, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.84243774414062, "loss": 1.2226, "margin_dpo/margin_mean": 0.5947141647338867, "margin_dpo/margin_std": 0.5515247583389282, "step": 29 }, { "KL/chosen_KL_mean": 0.13365936279296875, "KL/mean": -0.17379064857959747, "KL/rejected_KL_mean": -0.4812431335449219, "KL/std": 0.5387458801269531, "epoch": 0.04405286343612335, "fcm_dpo/beta": 0.30578601360321045, "fcm_dpo/delta": 0.09551539272069931, "fcm_dpo/margin": 0.6149008274078369, "fcm_dpo/q_t": 0.4539734125137329, "grad_norm": 242.61647033691406, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.5310481190681458, "logits/rejected": -0.5018342137336731, "logps/chosen": -52.44371032714844, "logps/ref_chosen": -52.577369689941406, "logps/ref_rejected": -98.48920440673828, "logps/rejected": -98.97044372558594, "loss": 1.2141, "margin_dpo/margin_mean": 0.6149011850357056, "margin_dpo/margin_std": 0.5445628762245178, "step": 30 }, { "KL/chosen_KL_mean": 0.1033782958984375, "KL/mean": -0.14287717640399933, "KL/rejected_KL_mean": -0.3891334533691406, "KL/std": 0.5448415279388428, "epoch": 0.04552129221732746, "fcm_dpo/beta": 0.30578601360321045, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.4925115406513214, "fcm_dpo/q_t": 0.46302998065948486, "grad_norm": 189.3855438232422, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.4839329719543457, "logits/rejected": -0.43673014640808105, "logps/chosen": -63.70354461669922, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.28314208984375, "loss": 1.2527, "margin_dpo/margin_mean": 0.4925113320350647, "margin_dpo/margin_std": 0.6987070441246033, "step": 31 }, { "KL/chosen_KL_mean": 0.14173126220703125, "KL/mean": -0.19788116216659546, "KL/rejected_KL_mean": -0.5374908447265625, "KL/std": 0.67319655418396, "epoch": 0.04698972099853157, "fcm_dpo/beta": 0.30862361192703247, "fcm_dpo/delta": 0.0919463187456131, "fcm_dpo/margin": 0.6792210936546326, "fcm_dpo/q_t": 0.4495420455932617, "grad_norm": 225.79928588867188, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.5124537944793701, "logits/rejected": -0.4711976647377014, "logps/chosen": -62.59779357910156, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.85499572753906, "loss": 1.2053, "margin_dpo/margin_mean": 0.6792212724685669, "margin_dpo/margin_std": 0.8771206140518188, "step": 32 }, { "KL/chosen_KL_mean": 0.15187644958496094, "KL/mean": -0.1501408815383911, "KL/rejected_KL_mean": -0.45215606689453125, "KL/std": 0.5369387865066528, "epoch": 0.048458149779735685, "fcm_dpo/beta": 0.31455251574516296, "fcm_dpo/delta": 0.09827958792448044, "fcm_dpo/margin": 0.6040317416191101, "fcm_dpo/q_t": 0.45361170172691345, "grad_norm": 205.43460083007812, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.5107744932174683, "logits/rejected": -0.4852328896522522, "logps/chosen": -53.10909652709961, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.33729553222656, "loss": 1.2149, "margin_dpo/margin_mean": 0.6040312051773071, "margin_dpo/margin_std": 0.6277109384536743, "step": 33 }, { "KL/chosen_KL_mean": 0.06952667236328125, "KL/mean": -0.3327553868293762, "KL/rejected_KL_mean": -0.7350387573242188, "KL/std": 0.7567273378372192, "epoch": 0.049926578560939794, "fcm_dpo/beta": 0.3245845437049866, "fcm_dpo/delta": 0.1425287425518036, "fcm_dpo/margin": 0.8045636415481567, "fcm_dpo/q_t": 0.43769991397857666, "grad_norm": 211.18833923339844, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.48843878507614136, "logits/rejected": -0.471035897731781, "logps/chosen": -50.747802734375, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.6568832397461, "loss": 1.1632, "margin_dpo/margin_mean": 0.8045632839202881, "margin_dpo/margin_std": 0.911353349685669, "step": 34 }, { "KL/chosen_KL_mean": 0.14473533630371094, "KL/mean": -0.47121092677116394, "KL/rejected_KL_mean": -1.087158203125, "KL/std": 1.0332869291305542, "epoch": 0.0513950073421439, "fcm_dpo/beta": 0.32634085416793823, "fcm_dpo/delta": -0.0021304162219166756, "fcm_dpo/margin": 1.2318875789642334, "fcm_dpo/q_t": 0.40531784296035767, "grad_norm": 206.67367553710938, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.520686149597168, "logits/rejected": -0.48392248153686523, "logps/chosen": -50.87975311279297, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.91159057617188, "loss": 1.0562, "margin_dpo/margin_mean": 1.2318875789642334, "margin_dpo/margin_std": 1.157043218612671, "step": 35 }, { "KL/chosen_KL_mean": 0.04113960266113281, "KL/mean": -0.5248871445655823, "KL/rejected_KL_mean": -1.090911865234375, "KL/std": 1.0772857666015625, "epoch": 0.05286343612334802, "fcm_dpo/beta": 0.3298990726470947, "fcm_dpo/delta": 0.026729058474302292, "fcm_dpo/margin": 1.132055640220642, "fcm_dpo/q_t": 0.41170650720596313, "grad_norm": 183.64088439941406, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.5615625381469727, "logits/rejected": -0.5254453420639038, "logps/chosen": -51.95035171508789, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.0406265258789, "logps/rejected": -87.13153839111328, "loss": 1.0873, "margin_dpo/margin_mean": 1.1320552825927734, "margin_dpo/margin_std": 1.191450834274292, "step": 36 }, { "KL/chosen_KL_mean": -0.014146804809570312, "KL/mean": -0.48922494053840637, "KL/rejected_KL_mean": -0.9643096923828125, "KL/std": 1.0300785303115845, "epoch": 0.05433186490455213, "fcm_dpo/beta": 0.3311406373977661, "fcm_dpo/delta": 0.08806828409433365, "fcm_dpo/margin": 0.9501617550849915, "fcm_dpo/q_t": 0.4271436929702759, "grad_norm": 166.49075317382812, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.5190207958221436, "logits/rejected": -0.4760361909866333, "logps/chosen": -62.82125473022461, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.85938262939453, "loss": 1.1429, "margin_dpo/margin_mean": 0.9501620531082153, "margin_dpo/margin_std": 1.3070077896118164, "step": 37 }, { "KL/chosen_KL_mean": 0.10162544250488281, "KL/mean": -0.5765421390533447, "KL/rejected_KL_mean": -1.2547111511230469, "KL/std": 1.3713576793670654, "epoch": 0.055800293685756244, "fcm_dpo/beta": 0.332706481218338, "fcm_dpo/delta": -0.053727779537439346, "fcm_dpo/margin": 1.3563368320465088, "fcm_dpo/q_t": 0.3991077244281769, "grad_norm": 170.35545349121094, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5162097215652466, "logits/rejected": -0.483456015586853, "logps/chosen": -48.28889465332031, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.16715240478516, "loss": 1.0542, "margin_dpo/margin_mean": 1.3563368320465088, "margin_dpo/margin_std": 1.683530569076538, "step": 38 }, { "KL/chosen_KL_mean": 0.09050559997558594, "KL/mean": -0.7401334047317505, "KL/rejected_KL_mean": -1.5707664489746094, "KL/std": 1.3007447719573975, "epoch": 0.05726872246696035, "fcm_dpo/beta": 0.3225635588169098, "fcm_dpo/delta": -0.1443103402853012, "fcm_dpo/margin": 1.6612706184387207, "fcm_dpo/q_t": 0.37417465448379517, "grad_norm": 174.84286499023438, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5401022434234619, "logits/rejected": -0.49907436966896057, "logps/chosen": -50.65996551513672, "logps/ref_chosen": -50.75047302246094, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.14027404785156, "loss": 0.9688, "margin_dpo/margin_mean": 1.6612703800201416, "margin_dpo/margin_std": 1.4079031944274902, "step": 39 }, { "KL/chosen_KL_mean": 0.1924877166748047, "KL/mean": -0.5853748321533203, "KL/rejected_KL_mean": -1.3632469177246094, "KL/std": 1.4208192825317383, "epoch": 0.05873715124816446, "fcm_dpo/beta": 0.31791430711746216, "fcm_dpo/delta": -0.09945414215326309, "fcm_dpo/margin": 1.5557353496551514, "fcm_dpo/q_t": 0.38887178897857666, "grad_norm": 139.58270263671875, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.5097917318344116, "logits/rejected": -0.4792172312736511, "logps/chosen": -57.792579650878906, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.3000717163086, "logps/rejected": -75.66331481933594, "loss": 1.0208, "margin_dpo/margin_mean": 1.5557358264923096, "margin_dpo/margin_std": 1.7217731475830078, "step": 40 }, { "KL/chosen_KL_mean": 0.018938064575195312, "KL/mean": -0.8975176811218262, "KL/rejected_KL_mean": -1.8139724731445312, "KL/std": 1.8013949394226074, "epoch": 0.06020558002936858, "fcm_dpo/beta": 0.3076004981994629, "fcm_dpo/delta": -0.17332524061203003, "fcm_dpo/margin": 1.8329108953475952, "fcm_dpo/q_t": 0.3731822073459625, "grad_norm": 151.85443115234375, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.5335030555725098, "logits/rejected": -0.49662622809410095, "logps/chosen": -62.6768798828125, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.8375015258789, "loss": 0.9812, "margin_dpo/margin_mean": 1.8329112529754639, "margin_dpo/margin_std": 1.9542649984359741, "step": 41 }, { "KL/chosen_KL_mean": 0.2197589874267578, "KL/mean": -1.0017893314361572, "KL/rejected_KL_mean": -2.2233352661132812, "KL/std": 2.0296993255615234, "epoch": 0.06167400881057269, "fcm_dpo/beta": 0.2897103428840637, "fcm_dpo/delta": -0.33123135566711426, "fcm_dpo/margin": 2.4430980682373047, "fcm_dpo/q_t": 0.34266549348831177, "grad_norm": 148.23951721191406, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.549653172492981, "logits/rejected": -0.5031782984733582, "logps/chosen": -58.746665954589844, "logps/ref_chosen": -58.966426849365234, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.1317138671875, "loss": 0.8866, "margin_dpo/margin_mean": 2.4430971145629883, "margin_dpo/margin_std": 2.242748737335205, "step": 42 }, { "KL/chosen_KL_mean": 0.5042438507080078, "KL/mean": -0.717160165309906, "KL/rejected_KL_mean": -1.938568115234375, "KL/std": 1.816794991493225, "epoch": 0.0631424375917768, "fcm_dpo/beta": 0.2717617154121399, "fcm_dpo/delta": -0.2843329906463623, "fcm_dpo/margin": 2.442809581756592, "fcm_dpo/q_t": 0.3463453948497772, "grad_norm": 136.68927001953125, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.558872640132904, "logits/rejected": -0.5347921848297119, "logps/chosen": -53.65175247192383, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.41876220703125, "loss": 0.888, "margin_dpo/margin_mean": 2.442809581756592, "margin_dpo/margin_std": 1.9328808784484863, "step": 43 }, { "KL/chosen_KL_mean": 0.22796630859375, "KL/mean": -1.16841721534729, "KL/rejected_KL_mean": -2.5648040771484375, "KL/std": 2.2068114280700684, "epoch": 0.06461086637298091, "fcm_dpo/beta": 0.2552001476287842, "fcm_dpo/delta": -0.3377786874771118, "fcm_dpo/margin": 2.792766571044922, "fcm_dpo/q_t": 0.33637571334838867, "grad_norm": 138.35983276367188, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.4246031641960144, "logits/rejected": -0.40571877360343933, "logps/chosen": -49.85053253173828, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.34856414794922, "loss": 0.8594, "margin_dpo/margin_mean": 2.79276704788208, "margin_dpo/margin_std": 2.16209077835083, "step": 44 }, { "KL/chosen_KL_mean": 0.16518402099609375, "KL/mean": -0.9224708676338196, "KL/rejected_KL_mean": -2.0101280212402344, "KL/std": 1.9454594850540161, "epoch": 0.06607929515418502, "fcm_dpo/beta": 0.24679788947105408, "fcm_dpo/delta": -0.14438273012638092, "fcm_dpo/margin": 2.1753125190734863, "fcm_dpo/q_t": 0.38011178374290466, "grad_norm": 111.90202331542969, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.4974350333213806, "logits/rejected": -0.4851893186569214, "logps/chosen": -48.2497444152832, "logps/ref_chosen": -48.4149284362793, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -79.9465560913086, "loss": 0.9987, "margin_dpo/margin_mean": 2.1753129959106445, "margin_dpo/margin_std": 2.421452283859253, "step": 45 }, { "KL/chosen_KL_mean": 0.19502639770507812, "KL/mean": -1.2299586534500122, "KL/rejected_KL_mean": -2.6549415588378906, "KL/std": 2.5399794578552246, "epoch": 0.06754772393538913, "fcm_dpo/beta": 0.23457413911819458, "fcm_dpo/delta": -0.2872818112373352, "fcm_dpo/margin": 2.849971294403076, "fcm_dpo/q_t": 0.3544684946537018, "grad_norm": 118.44244384765625, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.527849555015564, "logits/rejected": -0.4767192304134369, "logps/chosen": -55.80439758300781, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.30752563476562, "loss": 0.9307, "margin_dpo/margin_mean": 2.8499715328216553, "margin_dpo/margin_std": 3.0548930168151855, "step": 46 }, { "KL/chosen_KL_mean": 0.384002685546875, "KL/mean": -1.0363800525665283, "KL/rejected_KL_mean": -2.4567604064941406, "KL/std": 2.471060276031494, "epoch": 0.06901615271659324, "fcm_dpo/beta": 0.22347593307495117, "fcm_dpo/delta": -0.25001367926597595, "fcm_dpo/margin": 2.840768337249756, "fcm_dpo/q_t": 0.35581424832344055, "grad_norm": 111.23075866699219, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.5658366680145264, "logits/rejected": -0.5126087665557861, "logps/chosen": -57.542076110839844, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.13597106933594, "loss": 0.9238, "margin_dpo/margin_mean": 2.840768575668335, "margin_dpo/margin_std": 2.5524120330810547, "step": 47 }, { "KL/chosen_KL_mean": 0.06220054626464844, "KL/mean": -1.376787781715393, "KL/rejected_KL_mean": -2.815776824951172, "KL/std": 2.4476280212402344, "epoch": 0.07048458149779736, "fcm_dpo/beta": 0.21101200580596924, "fcm_dpo/delta": -0.22269634902477264, "fcm_dpo/margin": 2.8779749870300293, "fcm_dpo/q_t": 0.36156171560287476, "grad_norm": 119.95755767822266, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.611646294593811, "logits/rejected": -0.5553910732269287, "logps/chosen": -57.12587356567383, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -90.83238220214844, "loss": 0.9519, "margin_dpo/margin_mean": 2.877974510192871, "margin_dpo/margin_std": 2.711777448654175, "step": 48 }, { "KL/chosen_KL_mean": 0.34456825256347656, "KL/mean": -1.4146552085876465, "KL/rejected_KL_mean": -3.1738739013671875, "KL/std": 3.116457939147949, "epoch": 0.07195301027900147, "fcm_dpo/beta": 0.20099371671676636, "fcm_dpo/delta": -0.329367995262146, "fcm_dpo/margin": 3.5184366703033447, "fcm_dpo/q_t": 0.3465607166290283, "grad_norm": 93.63461303710938, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.537588357925415, "logits/rejected": -0.4779571294784546, "logps/chosen": -61.340702056884766, "logps/ref_chosen": -61.685272216796875, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -86.94134521484375, "loss": 0.9077, "margin_dpo/margin_mean": 3.5184359550476074, "margin_dpo/margin_std": 3.583613157272339, "step": 49 }, { "KL/chosen_KL_mean": -0.04874992370605469, "KL/mean": -1.9753637313842773, "KL/rejected_KL_mean": -3.9019737243652344, "KL/std": 3.2308237552642822, "epoch": 0.07342143906020558, "fcm_dpo/beta": 0.18617978692054749, "fcm_dpo/delta": -0.3431151509284973, "fcm_dpo/margin": 3.8532233238220215, "fcm_dpo/q_t": 0.34047919511795044, "grad_norm": 91.7352066040039, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5507527589797974, "logits/rejected": -0.5151888728141785, "logps/chosen": -58.77288818359375, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -100.26011657714844, "loss": 0.8888, "margin_dpo/margin_mean": 3.8532235622406006, "margin_dpo/margin_std": 3.644498825073242, "step": 50 }, { "KL/chosen_KL_mean": -0.19082260131835938, "KL/mean": -2.0723307132720947, "KL/rejected_KL_mean": -3.9538421630859375, "KL/std": 3.815108299255371, "epoch": 0.07488986784140969, "fcm_dpo/beta": 0.17457202076911926, "fcm_dpo/delta": -0.27861201763153076, "fcm_dpo/margin": 3.763016700744629, "fcm_dpo/q_t": 0.3622833490371704, "grad_norm": 73.21631622314453, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5122474431991577, "logits/rejected": -0.47880104184150696, "logps/chosen": -61.564491271972656, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -79.95584106445312, "loss": 0.9655, "margin_dpo/margin_mean": 3.76301646232605, "margin_dpo/margin_std": 4.63081693649292, "step": 51 }, { "KL/chosen_KL_mean": 0.4765663146972656, "KL/mean": -2.4252328872680664, "KL/rejected_KL_mean": -5.327030181884766, "KL/std": 4.417823314666748, "epoch": 0.0763582966226138, "fcm_dpo/beta": 0.15911118686199188, "fcm_dpo/delta": -0.5776325464248657, "fcm_dpo/margin": 5.8035993576049805, "fcm_dpo/q_t": 0.2984340786933899, "grad_norm": 73.25430297851562, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5584224462509155, "logits/rejected": -0.5033497214317322, "logps/chosen": -51.86079025268555, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -85.30094909667969, "loss": 0.7706, "margin_dpo/margin_mean": 5.803599834442139, "margin_dpo/margin_std": 4.6530866622924805, "step": 52 }, { "KL/chosen_KL_mean": -0.0835723876953125, "KL/mean": -3.0129737854003906, "KL/rejected_KL_mean": -5.942371368408203, "KL/std": 5.052390098571777, "epoch": 0.07782672540381791, "fcm_dpo/beta": 0.14530491828918457, "fcm_dpo/delta": -0.49047210812568665, "fcm_dpo/margin": 5.85880708694458, "fcm_dpo/q_t": 0.32626470923423767, "grad_norm": 72.15726470947266, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.606256365776062, "logits/rejected": -0.5844460725784302, "logps/chosen": -53.398223876953125, "logps/ref_chosen": -53.31465148925781, "logps/ref_rejected": -91.78359985351562, "logps/rejected": -97.72596740722656, "loss": 0.8522, "margin_dpo/margin_mean": 5.858806610107422, "margin_dpo/margin_std": 5.748600006103516, "step": 53 }, { "KL/chosen_KL_mean": -0.2327747344970703, "KL/mean": -2.8028464317321777, "KL/rejected_KL_mean": -5.372917175292969, "KL/std": 4.757123947143555, "epoch": 0.07929515418502203, "fcm_dpo/beta": 0.13407519459724426, "fcm_dpo/delta": -0.3100808262825012, "fcm_dpo/margin": 5.140138626098633, "fcm_dpo/q_t": 0.3468964397907257, "grad_norm": 64.02351379394531, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.5881419777870178, "logits/rejected": -0.534300684928894, "logps/chosen": -50.92143630981445, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -97.08831024169922, "loss": 0.8972, "margin_dpo/margin_mean": 5.140138626098633, "margin_dpo/margin_std": 4.812758445739746, "step": 54 }, { "KL/chosen_KL_mean": -0.7527198791503906, "KL/mean": -3.7856006622314453, "KL/rejected_KL_mean": -6.8184814453125, "KL/std": 5.726006507873535, "epoch": 0.08076358296622614, "fcm_dpo/beta": 0.12486197054386139, "fcm_dpo/delta": -0.38554632663726807, "fcm_dpo/margin": 6.0657572746276855, "fcm_dpo/q_t": 0.3410576581954956, "grad_norm": 62.06749725341797, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.6522265076637268, "logits/rejected": -0.590487003326416, "logps/chosen": -63.36795425415039, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -95.81198120117188, "loss": 0.915, "margin_dpo/margin_mean": 6.065756797790527, "margin_dpo/margin_std": 6.8796820640563965, "step": 55 }, { "KL/chosen_KL_mean": -0.548431396484375, "KL/mean": -3.4274816513061523, "KL/rejected_KL_mean": -6.3065338134765625, "KL/std": 5.56746768951416, "epoch": 0.08223201174743025, "fcm_dpo/beta": 0.11680299043655396, "fcm_dpo/delta": -0.2921079993247986, "fcm_dpo/margin": 5.758103847503662, "fcm_dpo/q_t": 0.3566039800643921, "grad_norm": 53.04601287841797, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.6132587194442749, "logits/rejected": -0.5706372261047363, "logps/chosen": -58.48115921020508, "logps/ref_chosen": -57.9327278137207, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -100.48097229003906, "loss": 0.9546, "margin_dpo/margin_mean": 5.758103847503662, "margin_dpo/margin_std": 6.777911186218262, "step": 56 }, { "KL/chosen_KL_mean": -0.5712432861328125, "KL/mean": -3.808412551879883, "KL/rejected_KL_mean": -7.045585632324219, "KL/std": 5.480106353759766, "epoch": 0.08370044052863436, "fcm_dpo/beta": 0.10994692891836166, "fcm_dpo/delta": -0.3346494138240814, "fcm_dpo/margin": 6.474340915679932, "fcm_dpo/q_t": 0.34009259939193726, "grad_norm": 57.49006652832031, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.5823420882225037, "logits/rejected": -0.5546176433563232, "logps/chosen": -71.0665283203125, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -102.61103820800781, "loss": 0.8905, "margin_dpo/margin_mean": 6.474340438842773, "margin_dpo/margin_std": 5.933760643005371, "step": 57 }, { "KL/chosen_KL_mean": -0.6897735595703125, "KL/mean": -4.305522918701172, "KL/rejected_KL_mean": -7.921272277832031, "KL/std": 6.297882556915283, "epoch": 0.08516886930983847, "fcm_dpo/beta": 0.10236389189958572, "fcm_dpo/delta": -0.36620625853538513, "fcm_dpo/margin": 7.231494903564453, "fcm_dpo/q_t": 0.3412542939186096, "grad_norm": 58.814815521240234, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.6043993234634399, "logits/rejected": -0.529456377029419, "logps/chosen": -62.822715759277344, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -92.53856658935547, "loss": 0.9009, "margin_dpo/margin_mean": 7.2314958572387695, "margin_dpo/margin_std": 7.5085673332214355, "step": 58 }, { "KL/chosen_KL_mean": -1.2114276885986328, "KL/mean": -5.092733383178711, "KL/rejected_KL_mean": -8.974040985107422, "KL/std": 6.752954959869385, "epoch": 0.08663729809104258, "fcm_dpo/beta": 0.09383856505155563, "fcm_dpo/delta": -0.3598101735115051, "fcm_dpo/margin": 7.762610912322998, "fcm_dpo/q_t": 0.34148186445236206, "grad_norm": 54.276611328125, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6205647587776184, "logits/rejected": -0.5793225765228271, "logps/chosen": -53.143951416015625, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -97.85923767089844, "loss": 0.8996, "margin_dpo/margin_mean": 7.76261043548584, "margin_dpo/margin_std": 7.849611282348633, "step": 59 }, { "KL/chosen_KL_mean": -2.1884403228759766, "KL/mean": -5.350527286529541, "KL/rejected_KL_mean": -8.512611389160156, "KL/std": 6.1919403076171875, "epoch": 0.0881057268722467, "fcm_dpo/beta": 0.09018626809120178, "fcm_dpo/delta": -0.1816769540309906, "fcm_dpo/margin": 6.3241682052612305, "fcm_dpo/q_t": 0.36957529187202454, "grad_norm": 60.4672966003418, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.5891699194908142, "logits/rejected": -0.527586042881012, "logps/chosen": -63.13063049316406, "logps/ref_chosen": -60.94218826293945, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -93.906005859375, "loss": 0.9876, "margin_dpo/margin_mean": 6.3241682052612305, "margin_dpo/margin_std": 7.076349258422852, "step": 60 }, { "KL/chosen_KL_mean": -1.1373729705810547, "KL/mean": -5.013064861297607, "KL/rejected_KL_mean": -8.888755798339844, "KL/std": 8.38675594329834, "epoch": 0.08957415565345081, "fcm_dpo/beta": 0.08559857308864594, "fcm_dpo/delta": -0.2840117812156677, "fcm_dpo/margin": 7.751380443572998, "fcm_dpo/q_t": 0.36387041211128235, "grad_norm": 49.0460205078125, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.5909574627876282, "logits/rejected": -0.5559062361717224, "logps/chosen": -61.770896911621094, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -98.74125671386719, "loss": 0.9755, "margin_dpo/margin_mean": 7.751380920410156, "margin_dpo/margin_std": 10.394891738891602, "step": 61 }, { "KL/chosen_KL_mean": -1.3573627471923828, "KL/mean": -4.249211311340332, "KL/rejected_KL_mean": -7.141059875488281, "KL/std": 6.042973518371582, "epoch": 0.09104258443465492, "fcm_dpo/beta": 0.08392874896526337, "fcm_dpo/delta": -0.08972346782684326, "fcm_dpo/margin": 5.783695697784424, "fcm_dpo/q_t": 0.3917636275291443, "grad_norm": 47.25103759765625, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.635386049747467, "logits/rejected": -0.6032891273498535, "logps/chosen": -57.50813293457031, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -82.70726013183594, "loss": 1.0451, "margin_dpo/margin_mean": 5.783695697784424, "margin_dpo/margin_std": 7.337882995605469, "step": 62 }, { "KL/chosen_KL_mean": -2.206483840942383, "KL/mean": -6.167753219604492, "KL/rejected_KL_mean": -10.129024505615234, "KL/std": 7.77467679977417, "epoch": 0.09251101321585903, "fcm_dpo/beta": 0.07972732186317444, "fcm_dpo/delta": -0.2495255470275879, "fcm_dpo/margin": 7.922541618347168, "fcm_dpo/q_t": 0.3571065664291382, "grad_norm": 47.997623443603516, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.6038184762001038, "logits/rejected": -0.5584800243377686, "logps/chosen": -75.3538818359375, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -107.73908996582031, "loss": 0.9445, "margin_dpo/margin_mean": 7.922541618347168, "margin_dpo/margin_std": 8.097877502441406, "step": 63 }, { "KL/chosen_KL_mean": -0.7482147216796875, "KL/mean": -5.395984649658203, "KL/rejected_KL_mean": -10.043754577636719, "KL/std": 8.429512023925781, "epoch": 0.09397944199706314, "fcm_dpo/beta": 0.07469938695430756, "fcm_dpo/delta": -0.3220548927783966, "fcm_dpo/margin": 9.295536041259766, "fcm_dpo/q_t": 0.34917110204696655, "grad_norm": 44.42738342285156, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.5919687747955322, "logits/rejected": -0.5603554248809814, "logps/chosen": -54.7468147277832, "logps/ref_chosen": -53.998600006103516, "logps/ref_rejected": -93.53019714355469, "logps/rejected": -103.57395935058594, "loss": 0.9301, "margin_dpo/margin_mean": 9.295536041259766, "margin_dpo/margin_std": 10.248291969299316, "step": 64 }, { "KL/chosen_KL_mean": -2.768260955810547, "KL/mean": -7.429973602294922, "KL/rejected_KL_mean": -12.091690063476562, "KL/std": 9.386064529418945, "epoch": 0.09544787077826726, "fcm_dpo/beta": 0.07084572315216064, "fcm_dpo/delta": -0.2809777855873108, "fcm_dpo/margin": 9.323431015014648, "fcm_dpo/q_t": 0.35268324613571167, "grad_norm": 44.16692352294922, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6519949436187744, "logits/rejected": -0.6382172107696533, "logps/chosen": -67.60426330566406, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -122.03814697265625, "loss": 0.9463, "margin_dpo/margin_mean": 9.323431015014648, "margin_dpo/margin_std": 10.194602012634277, "step": 65 }, { "KL/chosen_KL_mean": -2.5610218048095703, "KL/mean": -6.84706974029541, "KL/rejected_KL_mean": -11.133113861083984, "KL/std": 8.479511260986328, "epoch": 0.09691629955947137, "fcm_dpo/beta": 0.06770157068967819, "fcm_dpo/delta": -0.19393965601921082, "fcm_dpo/margin": 8.572092056274414, "fcm_dpo/q_t": 0.37264156341552734, "grad_norm": 39.67411422729492, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6443692445755005, "logits/rejected": -0.6113446950912476, "logps/chosen": -54.004547119140625, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629913330078, "logps/rejected": -86.7694091796875, "loss": 0.987, "margin_dpo/margin_mean": 8.572092056274414, "margin_dpo/margin_std": 10.129063606262207, "step": 66 }, { "KL/chosen_KL_mean": -2.2398548126220703, "KL/mean": -6.684027671813965, "KL/rejected_KL_mean": -11.128204345703125, "KL/std": 8.494741439819336, "epoch": 0.09838472834067548, "fcm_dpo/beta": 0.06574313342571259, "fcm_dpo/delta": -0.19544380903244019, "fcm_dpo/margin": 8.888347625732422, "fcm_dpo/q_t": 0.37113136053085327, "grad_norm": 38.83026885986328, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.6339254975318909, "logits/rejected": -0.5946371555328369, "logps/chosen": -61.580657958984375, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78728485107422, "logps/rejected": -83.91548919677734, "loss": 0.9797, "margin_dpo/margin_mean": 8.888347625732422, "margin_dpo/margin_std": 10.183094024658203, "step": 67 }, { "KL/chosen_KL_mean": -2.51678466796875, "KL/mean": -6.736393928527832, "KL/rejected_KL_mean": -10.956001281738281, "KL/std": 7.671031951904297, "epoch": 0.09985315712187959, "fcm_dpo/beta": 0.06370236724615097, "fcm_dpo/delta": -0.1451815813779831, "fcm_dpo/margin": 8.439210891723633, "fcm_dpo/q_t": 0.3772110342979431, "grad_norm": 38.64521408081055, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6282086968421936, "logits/rejected": -0.5680973529815674, "logps/chosen": -67.72261810302734, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -88.16325378417969, "loss": 0.986, "margin_dpo/margin_mean": 8.439210891723633, "margin_dpo/margin_std": 8.544252395629883, "step": 68 }, { "KL/chosen_KL_mean": -3.0470409393310547, "KL/mean": -8.152204513549805, "KL/rejected_KL_mean": -13.257366180419922, "KL/std": 9.061971664428711, "epoch": 0.1013215859030837, "fcm_dpo/beta": 0.06064834073185921, "fcm_dpo/delta": -0.23509711027145386, "fcm_dpo/margin": 10.210319519042969, "fcm_dpo/q_t": 0.3607165217399597, "grad_norm": 40.75960159301758, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.5967146158218384, "logits/rejected": -0.5730553865432739, "logps/chosen": -62.86627960205078, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -116.64622497558594, "loss": 0.9375, "margin_dpo/margin_mean": 10.210320472717285, "margin_dpo/margin_std": 9.960121154785156, "step": 69 }, { "KL/chosen_KL_mean": -4.498558044433594, "KL/mean": -10.02988052368164, "KL/rejected_KL_mean": -15.561203002929688, "KL/std": 11.078158378601074, "epoch": 0.1027900146842878, "fcm_dpo/beta": 0.058067694306373596, "fcm_dpo/delta": -0.25848639011383057, "fcm_dpo/margin": 11.062643051147461, "fcm_dpo/q_t": 0.35891324281692505, "grad_norm": 40.870914459228516, "learning_rate": 5e-07, "logits/chosen": -0.6100128889083862, "logits/rejected": -0.5743746757507324, "logps/chosen": -66.42919921875, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.06078338623047, "logps/rejected": -106.62198638916016, "loss": 0.9454, "margin_dpo/margin_mean": 11.062643051147461, "margin_dpo/margin_std": 11.850614547729492, "step": 70 }, { "KL/chosen_KL_mean": -4.099088668823242, "KL/mean": -10.583032608032227, "KL/rejected_KL_mean": -17.066974639892578, "KL/std": 11.512796401977539, "epoch": 0.10425844346549193, "fcm_dpo/beta": 0.05431191250681877, "fcm_dpo/delta": -0.3282097578048706, "fcm_dpo/margin": 12.967889785766602, "fcm_dpo/q_t": 0.34613728523254395, "grad_norm": 37.54128646850586, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6697078943252563, "logits/rejected": -0.6301345825195312, "logps/chosen": -65.84942626953125, "logps/ref_chosen": -61.750335693359375, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -114.40359497070312, "loss": 0.9003, "margin_dpo/margin_mean": 12.967889785766602, "margin_dpo/margin_std": 12.603883743286133, "step": 71 }, { "KL/chosen_KL_mean": -4.795114517211914, "KL/mean": -11.703153610229492, "KL/rejected_KL_mean": -18.611186981201172, "KL/std": 12.470186233520508, "epoch": 0.10572687224669604, "fcm_dpo/beta": 0.05085095018148422, "fcm_dpo/delta": -0.32645586133003235, "fcm_dpo/margin": 13.816070556640625, "fcm_dpo/q_t": 0.3455986976623535, "grad_norm": 37.724822998046875, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6510441303253174, "logits/rejected": -0.6148891448974609, "logps/chosen": -70.84852600097656, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -113.89817810058594, "loss": 0.9125, "margin_dpo/margin_mean": 13.816070556640625, "margin_dpo/margin_std": 14.364995956420898, "step": 72 }, { "KL/chosen_KL_mean": -6.796857833862305, "KL/mean": -13.687841415405273, "KL/rejected_KL_mean": -20.57882308959961, "KL/std": 16.359634399414062, "epoch": 0.10719530102790015, "fcm_dpo/beta": 0.047877371311187744, "fcm_dpo/delta": -0.2798731029033661, "fcm_dpo/margin": 13.781963348388672, "fcm_dpo/q_t": 0.3676333427429199, "grad_norm": 36.75204849243164, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6967453956604004, "logits/rejected": -0.6562691926956177, "logps/chosen": -73.05313110351562, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613098144531, "logps/rejected": -111.03495788574219, "loss": 1.0192, "margin_dpo/margin_mean": 13.781963348388672, "margin_dpo/margin_std": 20.299331665039062, "step": 73 }, { "KL/chosen_KL_mean": -7.430627822875977, "KL/mean": -14.996770858764648, "KL/rejected_KL_mean": -22.562911987304688, "KL/std": 18.265613555908203, "epoch": 0.10866372980910426, "fcm_dpo/beta": 0.04516391456127167, "fcm_dpo/delta": -0.3053804337978363, "fcm_dpo/margin": 15.132284164428711, "fcm_dpo/q_t": 0.3618400990962982, "grad_norm": 37.91171646118164, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6205891370773315, "logits/rejected": -0.6105706691741943, "logps/chosen": -60.85551071166992, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -118.50984954833984, "loss": 0.9871, "margin_dpo/margin_mean": 15.132284164428711, "margin_dpo/margin_std": 22.55142593383789, "step": 74 }, { "KL/chosen_KL_mean": -7.307668685913086, "KL/mean": -18.042144775390625, "KL/rejected_KL_mean": -28.776607513427734, "KL/std": 18.98027801513672, "epoch": 0.11013215859030837, "fcm_dpo/beta": 0.041482701897621155, "fcm_dpo/delta": -0.5359930396080017, "fcm_dpo/margin": 21.468948364257812, "fcm_dpo/q_t": 0.3164390027523041, "grad_norm": 33.1515998840332, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6864483952522278, "logits/rejected": -0.6774381399154663, "logps/chosen": -59.16933059692383, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25398254394531, "logps/rejected": -140.0305938720703, "loss": 0.8343, "margin_dpo/margin_mean": 21.468948364257812, "margin_dpo/margin_std": 21.272823333740234, "step": 75 }, { "KL/chosen_KL_mean": -9.035161972045898, "KL/mean": -15.948980331420898, "KL/rejected_KL_mean": -22.862796783447266, "KL/std": 15.473119735717773, "epoch": 0.11160058737151249, "fcm_dpo/beta": 0.039188824594020844, "fcm_dpo/delta": -0.14997366070747375, "fcm_dpo/margin": 13.827640533447266, "fcm_dpo/q_t": 0.3768173158168793, "grad_norm": 32.834896087646484, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.714850664138794, "logits/rejected": -0.6740258932113647, "logps/chosen": -62.30120086669922, "logps/ref_chosen": -53.26603698730469, "logps/ref_rejected": -78.21662902832031, "logps/rejected": -101.07942199707031, "loss": 1.0119, "margin_dpo/margin_mean": 13.827640533447266, "margin_dpo/margin_std": 16.90443229675293, "step": 76 }, { "KL/chosen_KL_mean": -8.176126480102539, "KL/mean": -19.34077262878418, "KL/rejected_KL_mean": -30.505416870117188, "KL/std": 21.328655242919922, "epoch": 0.1130690161527166, "fcm_dpo/beta": 0.036142949014902115, "fcm_dpo/delta": -0.4468532204627991, "fcm_dpo/margin": 22.32929039001465, "fcm_dpo/q_t": 0.3258803188800812, "grad_norm": 34.206050872802734, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.6671018004417419, "logits/rejected": -0.6263935565948486, "logps/chosen": -66.2728042602539, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -124.2790298461914, "loss": 0.882, "margin_dpo/margin_mean": 22.329288482666016, "margin_dpo/margin_std": 23.451766967773438, "step": 77 }, { "KL/chosen_KL_mean": -8.268457412719727, "KL/mean": -16.85858917236328, "KL/rejected_KL_mean": -25.448719024658203, "KL/std": 17.313419342041016, "epoch": 0.1145374449339207, "fcm_dpo/beta": 0.03458146005868912, "fcm_dpo/delta": -0.2059612274169922, "fcm_dpo/margin": 17.180259704589844, "fcm_dpo/q_t": 0.3691740036010742, "grad_norm": 30.835861206054688, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6544767618179321, "logits/rejected": -0.6291429996490479, "logps/chosen": -63.882240295410156, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -110.38308715820312, "loss": 0.9801, "margin_dpo/margin_mean": 17.180259704589844, "margin_dpo/margin_std": 19.697816848754883, "step": 78 }, { "KL/chosen_KL_mean": -8.609933853149414, "KL/mean": -18.018735885620117, "KL/rejected_KL_mean": -27.427539825439453, "KL/std": 19.773313522338867, "epoch": 0.11600587371512482, "fcm_dpo/beta": 0.032929353415966034, "fcm_dpo/delta": -0.2343355119228363, "fcm_dpo/margin": 18.817596435546875, "fcm_dpo/q_t": 0.3699612617492676, "grad_norm": 27.264328002929688, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.6396697163581848, "logits/rejected": -0.6080412268638611, "logps/chosen": -64.0604248046875, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -115.07510375976562, "loss": 0.9939, "margin_dpo/margin_mean": 18.81760025024414, "margin_dpo/margin_std": 24.226604461669922, "step": 79 }, { "KL/chosen_KL_mean": -11.600616455078125, "KL/mean": -20.326831817626953, "KL/rejected_KL_mean": -29.053043365478516, "KL/std": 20.81574058532715, "epoch": 0.11747430249632893, "fcm_dpo/beta": 0.03151794895529747, "fcm_dpo/delta": -0.16158056259155273, "fcm_dpo/margin": 17.452417373657227, "fcm_dpo/q_t": 0.38502687215805054, "grad_norm": 29.68206214904785, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.723793625831604, "logits/rejected": -0.681006669998169, "logps/chosen": -70.11990356445312, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -116.60054016113281, "loss": 1.039, "margin_dpo/margin_mean": 17.45241928100586, "margin_dpo/margin_std": 24.047962188720703, "step": 80 }, { "KL/chosen_KL_mean": -10.693958282470703, "KL/mean": -22.869586944580078, "KL/rejected_KL_mean": -35.04521942138672, "KL/std": 24.97785186767578, "epoch": 0.11894273127753303, "fcm_dpo/beta": 0.029724348336458206, "fcm_dpo/delta": -0.3522689640522003, "fcm_dpo/margin": 24.351259231567383, "fcm_dpo/q_t": 0.34924542903900146, "grad_norm": 30.155729293823242, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.6988470554351807, "logits/rejected": -0.6684309244155884, "logps/chosen": -77.142822265625, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -164.70791625976562, "loss": 0.9381, "margin_dpo/margin_mean": 24.35125732421875, "margin_dpo/margin_std": 29.631593704223633, "step": 81 }, { "KL/chosen_KL_mean": -13.060468673706055, "KL/mean": -22.69424057006836, "KL/rejected_KL_mean": -32.3280143737793, "KL/std": 22.624094009399414, "epoch": 0.12041116005873716, "fcm_dpo/beta": 0.028949948027729988, "fcm_dpo/delta": -0.16753321886062622, "fcm_dpo/margin": 19.267545700073242, "fcm_dpo/q_t": 0.3837572932243347, "grad_norm": 32.10773849487305, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6523622274398804, "logits/rejected": -0.6406021118164062, "logps/chosen": -65.29285430908203, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -123.0712661743164, "loss": 1.0786, "margin_dpo/margin_mean": 19.267547607421875, "margin_dpo/margin_std": 31.091327667236328, "step": 82 }, { "KL/chosen_KL_mean": -13.508848190307617, "KL/mean": -24.929195404052734, "KL/rejected_KL_mean": -36.349544525146484, "KL/std": 23.749820709228516, "epoch": 0.12187958883994127, "fcm_dpo/beta": 0.027322106063365936, "fcm_dpo/delta": -0.2400093972682953, "fcm_dpo/margin": 22.84069061279297, "fcm_dpo/q_t": 0.3655874729156494, "grad_norm": 31.6568660736084, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6658318042755127, "logits/rejected": -0.6298344135284424, "logps/chosen": -69.33623504638672, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71589660644531, "logps/rejected": -140.06544494628906, "loss": 0.9834, "margin_dpo/margin_mean": 22.84069061279297, "margin_dpo/margin_std": 28.148263931274414, "step": 83 }, { "KL/chosen_KL_mean": -12.589178085327148, "KL/mean": -22.418010711669922, "KL/rejected_KL_mean": -32.24684524536133, "KL/std": 20.389741897583008, "epoch": 0.12334801762114538, "fcm_dpo/beta": 0.026361385360360146, "fcm_dpo/delta": -0.12693113088607788, "fcm_dpo/margin": 19.657663345336914, "fcm_dpo/q_t": 0.38021910190582275, "grad_norm": 26.903989791870117, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6667696237564087, "logits/rejected": -0.6284007430076599, "logps/chosen": -79.76535034179688, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -119.54544830322266, "loss": 1.0067, "margin_dpo/margin_mean": 19.657665252685547, "margin_dpo/margin_std": 20.888431549072266, "step": 84 }, { "KL/chosen_KL_mean": -13.00227165222168, "KL/mean": -23.157577514648438, "KL/rejected_KL_mean": -33.31288146972656, "KL/std": 20.965476989746094, "epoch": 0.12481644640234948, "fcm_dpo/beta": 0.02606545016169548, "fcm_dpo/delta": -0.13678425550460815, "fcm_dpo/margin": 20.310611724853516, "fcm_dpo/q_t": 0.3828505277633667, "grad_norm": 27.330554962158203, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6378351449966431, "logits/rejected": -0.6047541499137878, "logps/chosen": -71.40888977050781, "logps/ref_chosen": -58.4066162109375, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -111.9516830444336, "loss": 1.0268, "margin_dpo/margin_mean": 20.310611724853516, "margin_dpo/margin_std": 25.04244613647461, "step": 85 }, { "KL/chosen_KL_mean": -17.00665855407715, "KL/mean": -27.735424041748047, "KL/rejected_KL_mean": -38.46418380737305, "KL/std": 28.39218521118164, "epoch": 0.1262848751835536, "fcm_dpo/beta": 0.025230124592781067, "fcm_dpo/delta": -0.14920490980148315, "fcm_dpo/margin": 21.457521438598633, "fcm_dpo/q_t": 0.3947563171386719, "grad_norm": 30.875211715698242, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6909030675888062, "logits/rejected": -0.6786030530929565, "logps/chosen": -73.14411926269531, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -126.58583068847656, "loss": 1.11, "margin_dpo/margin_mean": 21.457523345947266, "margin_dpo/margin_std": 39.128604888916016, "step": 86 }, { "KL/chosen_KL_mean": -16.64571189880371, "KL/mean": -29.247547149658203, "KL/rejected_KL_mean": -41.84938430786133, "KL/std": 28.30339813232422, "epoch": 0.1277533039647577, "fcm_dpo/beta": 0.023992381989955902, "fcm_dpo/delta": -0.21979403495788574, "fcm_dpo/margin": 25.20366859436035, "fcm_dpo/q_t": 0.36907005310058594, "grad_norm": 26.347061157226562, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.719502329826355, "logits/rejected": -0.6944303512573242, "logps/chosen": -72.28180694580078, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -137.31695556640625, "loss": 1.0182, "margin_dpo/margin_mean": 25.20366859436035, "margin_dpo/margin_std": 34.2125244140625, "step": 87 }, { "KL/chosen_KL_mean": -21.046295166015625, "KL/mean": -30.57300567626953, "KL/rejected_KL_mean": -40.09971618652344, "KL/std": 29.50307846069336, "epoch": 0.12922173274596183, "fcm_dpo/beta": 0.023543458431959152, "fcm_dpo/delta": -0.051779814064502716, "fcm_dpo/margin": 19.053417205810547, "fcm_dpo/q_t": 0.4041179418563843, "grad_norm": 27.308347702026367, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.6762702465057373, "logits/rejected": -0.6694661378860474, "logps/chosen": -94.71744537353516, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -146.80821228027344, "loss": 1.1201, "margin_dpo/margin_mean": 19.053417205810547, "margin_dpo/margin_std": 30.958572387695312, "step": 88 }, { "KL/chosen_KL_mean": -12.778741836547852, "KL/mean": -23.465646743774414, "KL/rejected_KL_mean": -34.15255355834961, "KL/std": 24.849987030029297, "epoch": 0.13069016152716592, "fcm_dpo/beta": 0.02335914969444275, "fcm_dpo/delta": -0.10445674508810043, "fcm_dpo/margin": 21.37381362915039, "fcm_dpo/q_t": 0.38910990953445435, "grad_norm": 24.864566802978516, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.7264094352722168, "logits/rejected": -0.6940090656280518, "logps/chosen": -73.40365600585938, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -116.23609924316406, "loss": 1.042, "margin_dpo/margin_mean": 21.37381362915039, "margin_dpo/margin_std": 28.21473503112793, "step": 89 }, { "KL/chosen_KL_mean": -16.536136627197266, "KL/mean": -31.133424758911133, "KL/rejected_KL_mean": -45.730709075927734, "KL/std": 34.05305480957031, "epoch": 0.13215859030837004, "fcm_dpo/beta": 0.02243289351463318, "fcm_dpo/delta": -0.2720962464809418, "fcm_dpo/margin": 29.194570541381836, "fcm_dpo/q_t": 0.3703291416168213, "grad_norm": 27.197731018066406, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6625027656555176, "logits/rejected": -0.6746160984039307, "logps/chosen": -69.82144927978516, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -157.27542114257812, "loss": 1.0204, "margin_dpo/margin_mean": 29.194570541381836, "margin_dpo/margin_std": 42.260841369628906, "step": 90 }, { "KL/chosen_KL_mean": -17.2109432220459, "KL/mean": -29.20108413696289, "KL/rejected_KL_mean": -41.191226959228516, "KL/std": 27.21971321105957, "epoch": 0.13362701908957417, "fcm_dpo/beta": 0.02157766930758953, "fcm_dpo/delta": -0.12366719543933868, "fcm_dpo/margin": 23.98028564453125, "fcm_dpo/q_t": 0.3894466459751129, "grad_norm": 24.908628463745117, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.6959347724914551, "logits/rejected": -0.6632735729217529, "logps/chosen": -79.01390075683594, "logps/ref_chosen": -61.802955627441406, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -129.065185546875, "loss": 1.061, "margin_dpo/margin_mean": 23.98028564453125, "margin_dpo/margin_std": 35.235958099365234, "step": 91 }, { "KL/chosen_KL_mean": -15.647247314453125, "KL/mean": -27.800127029418945, "KL/rejected_KL_mean": -39.95301055908203, "KL/std": 28.11497688293457, "epoch": 0.13509544787077826, "fcm_dpo/beta": 0.021103451028466225, "fcm_dpo/delta": -0.11899492889642715, "fcm_dpo/margin": 24.30576515197754, "fcm_dpo/q_t": 0.3890799880027771, "grad_norm": 23.367460250854492, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.7003687620162964, "logits/rejected": -0.676365852355957, "logps/chosen": -67.28801727294922, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -117.83419799804688, "loss": 1.0536, "margin_dpo/margin_mean": 24.305763244628906, "margin_dpo/margin_std": 34.140499114990234, "step": 92 }, { "KL/chosen_KL_mean": -17.547260284423828, "KL/mean": -29.79717254638672, "KL/rejected_KL_mean": -42.04708480834961, "KL/std": 26.565616607666016, "epoch": 0.13656387665198239, "fcm_dpo/beta": 0.020545653998851776, "fcm_dpo/delta": -0.1087617427110672, "fcm_dpo/margin": 24.499828338623047, "fcm_dpo/q_t": 0.3881436884403229, "grad_norm": 23.9678897857666, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.751114547252655, "logits/rejected": -0.7044565081596375, "logps/chosen": -70.07649993896484, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.16075134277344, "logps/rejected": -119.20783233642578, "loss": 1.0292, "margin_dpo/margin_mean": 24.499828338623047, "margin_dpo/margin_std": 30.640230178833008, "step": 93 }, { "KL/chosen_KL_mean": -18.9477596282959, "KL/mean": -33.051692962646484, "KL/rejected_KL_mean": -47.1556282043457, "KL/std": 30.412057876586914, "epoch": 0.13803230543318648, "fcm_dpo/beta": 0.019795160740613937, "fcm_dpo/delta": -0.16931986808776855, "fcm_dpo/margin": 28.20786476135254, "fcm_dpo/q_t": 0.3750844895839691, "grad_norm": 23.683237075805664, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6706228256225586, "logits/rejected": -0.6556359529495239, "logps/chosen": -80.17037200927734, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -146.75466918945312, "loss": 0.9929, "margin_dpo/margin_mean": 28.207866668701172, "margin_dpo/margin_std": 31.745136260986328, "step": 94 }, { "KL/chosen_KL_mean": -17.981779098510742, "KL/mean": -29.374893188476562, "KL/rejected_KL_mean": -40.76800537109375, "KL/std": 29.04880142211914, "epoch": 0.1395007342143906, "fcm_dpo/beta": 0.019524898380041122, "fcm_dpo/delta": -0.04741118103265762, "fcm_dpo/margin": 22.786224365234375, "fcm_dpo/q_t": 0.39812785387039185, "grad_norm": 22.271825790405273, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.7048947811126709, "logits/rejected": -0.6773319244384766, "logps/chosen": -70.50542449951172, "logps/ref_chosen": -52.523643493652344, "logps/ref_rejected": -75.8803482055664, "logps/rejected": -116.64836120605469, "loss": 1.072, "margin_dpo/margin_mean": 22.786224365234375, "margin_dpo/margin_std": 30.41301727294922, "step": 95 }, { "KL/chosen_KL_mean": -17.554834365844727, "KL/mean": -32.99760437011719, "KL/rejected_KL_mean": -48.44038391113281, "KL/std": 33.0831298828125, "epoch": 0.14096916299559473, "fcm_dpo/beta": 0.01885131560266018, "fcm_dpo/delta": -0.19593745470046997, "fcm_dpo/margin": 30.885547637939453, "fcm_dpo/q_t": 0.3727257251739502, "grad_norm": 22.89360237121582, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.7279924154281616, "logits/rejected": -0.7011754512786865, "logps/chosen": -79.71180725097656, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -145.03640747070312, "loss": 0.986, "margin_dpo/margin_mean": 30.88555145263672, "margin_dpo/margin_std": 36.464759826660156, "step": 96 }, { "KL/chosen_KL_mean": -18.827903747558594, "KL/mean": -30.793764114379883, "KL/rejected_KL_mean": -42.75962829589844, "KL/std": 27.565874099731445, "epoch": 0.14243759177679882, "fcm_dpo/beta": 0.01880602166056633, "fcm_dpo/delta": -0.0534333810210228, "fcm_dpo/margin": 23.93172836303711, "fcm_dpo/q_t": 0.39575350284576416, "grad_norm": 23.5416202545166, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6459416151046753, "logits/rejected": -0.6031548976898193, "logps/chosen": -73.47427368164062, "logps/ref_chosen": -54.646366119384766, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -119.72438049316406, "loss": 1.0555, "margin_dpo/margin_mean": 23.93172836303711, "margin_dpo/margin_std": 27.58646011352539, "step": 97 }, { "KL/chosen_KL_mean": -23.348102569580078, "KL/mean": -36.9785270690918, "KL/rejected_KL_mean": -50.60894775390625, "KL/std": 31.48232650756836, "epoch": 0.14390602055800295, "fcm_dpo/beta": 0.0182771235704422, "fcm_dpo/delta": -0.1036653220653534, "fcm_dpo/margin": 27.26085662841797, "fcm_dpo/q_t": 0.3863416314125061, "grad_norm": 24.430879592895508, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6633949875831604, "logits/rejected": -0.6469439268112183, "logps/chosen": -88.60673522949219, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -147.13644409179688, "loss": 1.0592, "margin_dpo/margin_mean": 27.260852813720703, "margin_dpo/margin_std": 37.74567794799805, "step": 98 }, { "KL/chosen_KL_mean": -18.30929183959961, "KL/mean": -35.074058532714844, "KL/rejected_KL_mean": -51.838829040527344, "KL/std": 34.02536392211914, "epoch": 0.14537444933920704, "fcm_dpo/beta": 0.017622604966163635, "fcm_dpo/delta": -0.20359688997268677, "fcm_dpo/margin": 33.529537200927734, "fcm_dpo/q_t": 0.37370553612709045, "grad_norm": 21.330432891845703, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.6809293627738953, "logits/rejected": -0.6816772222518921, "logps/chosen": -63.947776794433594, "logps/ref_chosen": -45.638484954833984, "logps/ref_rejected": -86.43793487548828, "logps/rejected": -138.27676391601562, "loss": 0.9921, "margin_dpo/margin_mean": 33.529541015625, "margin_dpo/margin_std": 42.3082389831543, "step": 99 }, { "KL/chosen_KL_mean": -20.69596290588379, "KL/mean": -30.469200134277344, "KL/rejected_KL_mean": -40.242435455322266, "KL/std": 27.422863006591797, "epoch": 0.14684287812041116, "fcm_dpo/beta": 0.01770273968577385, "fcm_dpo/delta": 0.05536198988556862, "fcm_dpo/margin": 19.546470642089844, "fcm_dpo/q_t": 0.4218849539756775, "grad_norm": 23.772842407226562, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.620718240737915, "logits/rejected": -0.5739752650260925, "logps/chosen": -78.28994750976562, "logps/ref_chosen": -57.59397888183594, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -114.30264282226562, "loss": 1.1659, "margin_dpo/margin_mean": 19.546470642089844, "margin_dpo/margin_std": 35.299861907958984, "step": 100 }, { "KL/chosen_KL_mean": -25.458805084228516, "KL/mean": -37.322052001953125, "KL/rejected_KL_mean": -49.18529510498047, "KL/std": 33.75147247314453, "epoch": 0.14831130690161526, "fcm_dpo/beta": 0.01753612421452999, "fcm_dpo/delta": -0.01752624288201332, "fcm_dpo/margin": 23.726482391357422, "fcm_dpo/q_t": 0.41101598739624023, "grad_norm": 23.58587646484375, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.6659466028213501, "logits/rejected": -0.6346107721328735, "logps/chosen": -87.10765838623047, "logps/ref_chosen": -61.64885330200195, "logps/ref_rejected": -83.18968200683594, "logps/rejected": -132.37498474121094, "loss": 1.1286, "margin_dpo/margin_mean": 23.726482391357422, "margin_dpo/margin_std": 40.702640533447266, "step": 101 }, { "KL/chosen_KL_mean": -27.646360397338867, "KL/mean": -36.379215240478516, "KL/rejected_KL_mean": -45.112064361572266, "KL/std": 31.945594787597656, "epoch": 0.14977973568281938, "fcm_dpo/beta": 0.01757633686065674, "fcm_dpo/delta": -0.024831483140587807, "fcm_dpo/margin": 17.465709686279297, "fcm_dpo/q_t": 0.4317839741706848, "grad_norm": 26.59412384033203, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.6938978433609009, "logits/rejected": -0.6497205495834351, "logps/chosen": -91.72523498535156, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -113.29914093017578, "loss": 1.2103, "margin_dpo/margin_mean": 17.46571159362793, "margin_dpo/margin_std": 37.897613525390625, "step": 102 }, { "KL/chosen_KL_mean": -21.812721252441406, "KL/mean": -39.324649810791016, "KL/rejected_KL_mean": -56.83657455444336, "KL/std": 35.332679748535156, "epoch": 0.1512481644640235, "fcm_dpo/beta": 0.016969915479421616, "fcm_dpo/delta": -0.20710483193397522, "fcm_dpo/margin": 35.02384948730469, "fcm_dpo/q_t": 0.3709907531738281, "grad_norm": 22.75851058959961, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6656967997550964, "logits/rejected": -0.6312940120697021, "logps/chosen": -83.11199951171875, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57270812988281, "logps/rejected": -150.40928649902344, "loss": 0.9857, "margin_dpo/margin_mean": 35.02384948730469, "margin_dpo/margin_std": 42.5711669921875, "step": 103 }, { "KL/chosen_KL_mean": -23.92180824279785, "KL/mean": -40.524253845214844, "KL/rejected_KL_mean": -57.12669372558594, "KL/std": 39.19900131225586, "epoch": 0.1527165932452276, "fcm_dpo/beta": 0.01646982505917549, "fcm_dpo/delta": -0.15515577793121338, "fcm_dpo/margin": 33.20488357543945, "fcm_dpo/q_t": 0.38579535484313965, "grad_norm": 22.30910301208496, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.668786883354187, "logits/rejected": -0.6362247467041016, "logps/chosen": -78.29458618164062, "logps/ref_chosen": -54.372772216796875, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -146.69140625, "loss": 1.0453, "margin_dpo/margin_mean": 33.20488357543945, "margin_dpo/margin_std": 47.9078369140625, "step": 104 }, { "KL/chosen_KL_mean": -22.438573837280273, "KL/mean": -46.06147766113281, "KL/rejected_KL_mean": -69.68439483642578, "KL/std": 38.165252685546875, "epoch": 0.15418502202643172, "fcm_dpo/beta": 0.015482816845178604, "fcm_dpo/delta": -0.3573678731918335, "fcm_dpo/margin": 47.245811462402344, "fcm_dpo/q_t": 0.3350944519042969, "grad_norm": 22.623994827270508, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.6759747862815857, "logits/rejected": -0.653762936592102, "logps/chosen": -77.0775146484375, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -167.6579132080078, "loss": 0.8737, "margin_dpo/margin_mean": 47.245811462402344, "margin_dpo/margin_std": 42.06477355957031, "step": 105 }, { "KL/chosen_KL_mean": -25.158750534057617, "KL/mean": -39.978302001953125, "KL/rejected_KL_mean": -54.79785919189453, "KL/std": 33.347076416015625, "epoch": 0.15565345080763582, "fcm_dpo/beta": 0.015045535750687122, "fcm_dpo/delta": -0.04824310541152954, "fcm_dpo/margin": 29.63909912109375, "fcm_dpo/q_t": 0.39749810099601746, "grad_norm": 21.77722930908203, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6268042325973511, "logits/rejected": -0.5954272747039795, "logps/chosen": -79.99164581298828, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -140.02247619628906, "loss": 1.0594, "margin_dpo/margin_mean": 29.63909912109375, "margin_dpo/margin_std": 36.35613250732422, "step": 106 }, { "KL/chosen_KL_mean": -29.925743103027344, "KL/mean": -46.62507629394531, "KL/rejected_KL_mean": -63.32440185546875, "KL/std": 40.514007568359375, "epoch": 0.15712187958883994, "fcm_dpo/beta": 0.014755118638277054, "fcm_dpo/delta": -0.09840479493141174, "fcm_dpo/margin": 33.398658752441406, "fcm_dpo/q_t": 0.3882708251476288, "grad_norm": 21.028383255004883, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.670194149017334, "logits/rejected": -0.6239144802093506, "logps/chosen": -99.63356018066406, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -158.06390380859375, "loss": 1.0453, "margin_dpo/margin_mean": 33.398658752441406, "margin_dpo/margin_std": 42.718177795410156, "step": 107 }, { "KL/chosen_KL_mean": -27.031885147094727, "KL/mean": -47.079811096191406, "KL/rejected_KL_mean": -67.12774658203125, "KL/std": 44.77525329589844, "epoch": 0.15859030837004406, "fcm_dpo/beta": 0.014345895498991013, "fcm_dpo/delta": -0.18586613237857819, "fcm_dpo/margin": 40.095855712890625, "fcm_dpo/q_t": 0.38066431879997253, "grad_norm": 21.29493522644043, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.669190526008606, "logits/rejected": -0.6483861207962036, "logps/chosen": -83.0417709350586, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -162.92376708984375, "loss": 1.0383, "margin_dpo/margin_mean": 40.095855712890625, "margin_dpo/margin_std": 57.993988037109375, "step": 108 }, { "KL/chosen_KL_mean": -25.21091651916504, "KL/mean": -46.66911315917969, "KL/rejected_KL_mean": -68.12731170654297, "KL/std": 43.04130554199219, "epoch": 0.16005873715124816, "fcm_dpo/beta": 0.013737066648900509, "fcm_dpo/delta": -0.20238548517227173, "fcm_dpo/margin": 42.91639709472656, "fcm_dpo/q_t": 0.3687817454338074, "grad_norm": 21.884559631347656, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.6145904660224915, "logits/rejected": -0.59392911195755, "logps/chosen": -88.09640502929688, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -166.81304931640625, "loss": 0.9884, "margin_dpo/margin_mean": 42.91639709472656, "margin_dpo/margin_std": 50.632591247558594, "step": 109 }, { "KL/chosen_KL_mean": -26.089508056640625, "KL/mean": -45.36097717285156, "KL/rejected_KL_mean": -64.63245391845703, "KL/std": 42.92705154418945, "epoch": 0.16152716593245228, "fcm_dpo/beta": 0.013237670063972473, "fcm_dpo/delta": -0.12065520882606506, "fcm_dpo/margin": 38.542945861816406, "fcm_dpo/q_t": 0.3875874876976013, "grad_norm": 19.124555587768555, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.5867836475372314, "logits/rejected": -0.5484417676925659, "logps/chosen": -84.84319305419922, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -144.38247680664062, "loss": 1.0537, "margin_dpo/margin_mean": 38.542945861816406, "margin_dpo/margin_std": 51.59848403930664, "step": 110 }, { "KL/chosen_KL_mean": -29.633333206176758, "KL/mean": -51.53802490234375, "KL/rejected_KL_mean": -73.4427261352539, "KL/std": 48.50222396850586, "epoch": 0.16299559471365638, "fcm_dpo/beta": 0.01289959717541933, "fcm_dpo/delta": -0.17664864659309387, "fcm_dpo/margin": 43.809391021728516, "fcm_dpo/q_t": 0.3767807185649872, "grad_norm": 21.609487533569336, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.669287919998169, "logits/rejected": -0.6483087539672852, "logps/chosen": -98.25743865966797, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -171.87158203125, "loss": 1.0324, "margin_dpo/margin_mean": 43.80938720703125, "margin_dpo/margin_std": 59.64892578125, "step": 111 }, { "KL/chosen_KL_mean": -27.373397827148438, "KL/mean": -41.31971740722656, "KL/rejected_KL_mean": -55.26603698730469, "KL/std": 33.80635070800781, "epoch": 0.1644640234948605, "fcm_dpo/beta": 0.01288105733692646, "fcm_dpo/delta": 0.04204365238547325, "fcm_dpo/margin": 27.89263916015625, "fcm_dpo/q_t": 0.4176030158996582, "grad_norm": 19.551979064941406, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6326720118522644, "logits/rejected": -0.5948728322982788, "logps/chosen": -77.62303924560547, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -120.04046630859375, "loss": 1.1257, "margin_dpo/margin_mean": 27.892641067504883, "margin_dpo/margin_std": 40.78398895263672, "step": 112 }, { "KL/chosen_KL_mean": -34.30276870727539, "KL/mean": -49.319786071777344, "KL/rejected_KL_mean": -64.33680725097656, "KL/std": 35.23823928833008, "epoch": 0.16593245227606462, "fcm_dpo/beta": 0.012986140325665474, "fcm_dpo/delta": 0.01037517748773098, "fcm_dpo/margin": 30.034034729003906, "fcm_dpo/q_t": 0.4100501537322998, "grad_norm": 20.03278160095215, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.5970338582992554, "logits/rejected": -0.5535135269165039, "logps/chosen": -101.01571655273438, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -142.30551147460938, "loss": 1.0938, "margin_dpo/margin_mean": 30.034034729003906, "margin_dpo/margin_std": 38.99578094482422, "step": 113 }, { "KL/chosen_KL_mean": -31.6552791595459, "KL/mean": -55.58210754394531, "KL/rejected_KL_mean": -79.50894165039062, "KL/std": 53.05522918701172, "epoch": 0.16740088105726872, "fcm_dpo/beta": 0.01250369194895029, "fcm_dpo/delta": -0.21323440968990326, "fcm_dpo/margin": 47.85365295410156, "fcm_dpo/q_t": 0.3746366500854492, "grad_norm": 21.371883392333984, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.594833493232727, "logits/rejected": -0.5611605048179626, "logps/chosen": -89.44036102294922, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -166.61859130859375, "loss": 1.0003, "margin_dpo/margin_mean": 47.85365295410156, "margin_dpo/margin_std": 64.31402587890625, "step": 114 }, { "KL/chosen_KL_mean": -41.24885559082031, "KL/mean": -57.65170669555664, "KL/rejected_KL_mean": -74.05455017089844, "KL/std": 50.606632232666016, "epoch": 0.16886930983847284, "fcm_dpo/beta": 0.012465628795325756, "fcm_dpo/delta": -0.009462913498282433, "fcm_dpo/margin": 32.80569076538086, "fcm_dpo/q_t": 0.414449542760849, "grad_norm": 26.58994483947754, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.6483290195465088, "logits/rejected": -0.6295895576477051, "logps/chosen": -106.83149719238281, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -172.6200714111328, "loss": 1.1668, "margin_dpo/margin_mean": 32.80569076538086, "margin_dpo/margin_std": 66.59864807128906, "step": 115 }, { "KL/chosen_KL_mean": -30.92531394958496, "KL/mean": -49.26237487792969, "KL/rejected_KL_mean": -67.59944152832031, "KL/std": 42.78678894042969, "epoch": 0.17033773861967694, "fcm_dpo/beta": 0.012374404817819595, "fcm_dpo/delta": -0.05652306228876114, "fcm_dpo/margin": 36.67411804199219, "fcm_dpo/q_t": 0.39768484234809875, "grad_norm": 21.639448165893555, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.6038175821304321, "logits/rejected": -0.5935859680175781, "logps/chosen": -82.32562255859375, "logps/ref_chosen": -51.40031433105469, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -148.1212921142578, "loss": 1.0582, "margin_dpo/margin_mean": 36.67411804199219, "margin_dpo/margin_std": 46.316162109375, "step": 116 }, { "KL/chosen_KL_mean": -39.578678131103516, "KL/mean": -54.53247833251953, "KL/rejected_KL_mean": -69.48628234863281, "KL/std": 45.36625671386719, "epoch": 0.17180616740088106, "fcm_dpo/beta": 0.012346116825938225, "fcm_dpo/delta": 0.031927622854709625, "fcm_dpo/margin": 29.907602310180664, "fcm_dpo/q_t": 0.41722893714904785, "grad_norm": 28.117990493774414, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.6056150197982788, "logits/rejected": -0.5710107088088989, "logps/chosen": -108.87709045410156, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.583984375, "logps/rejected": -136.0702667236328, "loss": 1.1431, "margin_dpo/margin_mean": 29.907604217529297, "margin_dpo/margin_std": 51.51899719238281, "step": 117 }, { "KL/chosen_KL_mean": -31.513439178466797, "KL/mean": -49.722755432128906, "KL/rejected_KL_mean": -67.93206024169922, "KL/std": 41.247047424316406, "epoch": 0.17327459618208516, "fcm_dpo/beta": 0.01227930560708046, "fcm_dpo/delta": -0.04938432201743126, "fcm_dpo/margin": 36.41863250732422, "fcm_dpo/q_t": 0.3994569778442383, "grad_norm": 20.659421920776367, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.6473113298416138, "logits/rejected": -0.6298643350601196, "logps/chosen": -87.1544189453125, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905975341797, "logps/rejected": -143.6011199951172, "loss": 1.0642, "margin_dpo/margin_mean": 36.41863250732422, "margin_dpo/margin_std": 47.30088424682617, "step": 118 }, { "KL/chosen_KL_mean": -43.098350524902344, "KL/mean": -61.04634475708008, "KL/rejected_KL_mean": -78.99433898925781, "KL/std": 46.14408874511719, "epoch": 0.17474302496328928, "fcm_dpo/beta": 0.012094875797629356, "fcm_dpo/delta": -0.036839861422777176, "fcm_dpo/margin": 35.89598846435547, "fcm_dpo/q_t": 0.40563011169433594, "grad_norm": 23.585227966308594, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.6196680068969727, "logits/rejected": -0.6074246168136597, "logps/chosen": -116.60855102539062, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.977294921875, "logps/rejected": -181.97161865234375, "loss": 1.1036, "margin_dpo/margin_mean": 35.895992279052734, "margin_dpo/margin_std": 56.17529296875, "step": 119 }, { "KL/chosen_KL_mean": -44.474021911621094, "KL/mean": -69.42485046386719, "KL/rejected_KL_mean": -94.37568664550781, "KL/std": 58.080665588378906, "epoch": 0.1762114537444934, "fcm_dpo/beta": 0.011788450181484222, "fcm_dpo/delta": -0.2001763880252838, "fcm_dpo/margin": 49.901649475097656, "fcm_dpo/q_t": 0.37107378244400024, "grad_norm": 21.931350708007812, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.6279151439666748, "logits/rejected": -0.5975610017776489, "logps/chosen": -121.25485229492188, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -202.3994140625, "loss": 0.9934, "margin_dpo/margin_mean": 49.901649475097656, "margin_dpo/margin_std": 60.89421081542969, "step": 120 }, { "KL/chosen_KL_mean": -42.93914794921875, "KL/mean": -70.19830322265625, "KL/rejected_KL_mean": -97.45744323730469, "KL/std": 56.19927978515625, "epoch": 0.1776798825256975, "fcm_dpo/beta": 0.011228121817111969, "fcm_dpo/delta": -0.22773230075836182, "fcm_dpo/margin": 54.51830291748047, "fcm_dpo/q_t": 0.3683249354362488, "grad_norm": 24.322509765625, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.5546694993972778, "logits/rejected": -0.5499193072319031, "logps/chosen": -104.72904968261719, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -207.4520263671875, "loss": 0.9991, "margin_dpo/margin_mean": 54.51830291748047, "margin_dpo/margin_std": 70.47315216064453, "step": 121 }, { "KL/chosen_KL_mean": -39.022762298583984, "KL/mean": -73.1162109375, "KL/rejected_KL_mean": -107.20967102050781, "KL/std": 64.33946990966797, "epoch": 0.17914831130690162, "fcm_dpo/beta": 0.010600419715046883, "fcm_dpo/delta": -0.34783935546875, "fcm_dpo/margin": 68.18690490722656, "fcm_dpo/q_t": 0.34132951498031616, "grad_norm": 23.660940170288086, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.5362130403518677, "logits/rejected": -0.5523936748504639, "logps/chosen": -85.92497253417969, "logps/ref_chosen": -46.9022102355957, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -213.92385864257812, "loss": 0.8959, "margin_dpo/margin_mean": 68.18690490722656, "margin_dpo/margin_std": 67.38003540039062, "step": 122 }, { "KL/chosen_KL_mean": -42.91706848144531, "KL/mean": -65.22838592529297, "KL/rejected_KL_mean": -87.53971862792969, "KL/std": 54.896278381347656, "epoch": 0.18061674008810572, "fcm_dpo/beta": 0.010254621505737305, "fcm_dpo/delta": -0.06130140274763107, "fcm_dpo/margin": 44.62263870239258, "fcm_dpo/q_t": 0.4016492962837219, "grad_norm": 20.85264778137207, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.5527976751327515, "logits/rejected": -0.535463273525238, "logps/chosen": -104.25570678710938, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.775390625, "logps/rejected": -175.3151092529297, "loss": 1.1084, "margin_dpo/margin_mean": 44.62263488769531, "margin_dpo/margin_std": 73.6215591430664, "step": 123 }, { "KL/chosen_KL_mean": -48.867706298828125, "KL/mean": -81.8544921875, "KL/rejected_KL_mean": -114.84127044677734, "KL/std": 75.15878295898438, "epoch": 0.18208516886930984, "fcm_dpo/beta": 0.00984976440668106, "fcm_dpo/delta": -0.26814186573028564, "fcm_dpo/margin": 65.97357177734375, "fcm_dpo/q_t": 0.36925771832466125, "grad_norm": 22.487119674682617, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.5741191506385803, "logits/rejected": -0.5771223306655884, "logps/chosen": -120.3160400390625, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -232.42184448242188, "loss": 1.0146, "margin_dpo/margin_mean": 65.97357177734375, "margin_dpo/margin_std": 94.4095458984375, "step": 124 }, { "KL/chosen_KL_mean": -40.33204650878906, "KL/mean": -69.6327896118164, "KL/rejected_KL_mean": -98.93354797363281, "KL/std": 63.41231918334961, "epoch": 0.18355359765051396, "fcm_dpo/beta": 0.009521868079900742, "fcm_dpo/delta": -0.16708803176879883, "fcm_dpo/margin": 58.601497650146484, "fcm_dpo/q_t": 0.3802080452442169, "grad_norm": 18.873462677001953, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.4996240735054016, "logits/rejected": -0.5013130903244019, "logps/chosen": -90.46898651123047, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -182.92214965820312, "loss": 1.0267, "margin_dpo/margin_mean": 58.60149383544922, "margin_dpo/margin_std": 78.49264526367188, "step": 125 }, { "KL/chosen_KL_mean": -43.697837829589844, "KL/mean": -70.89157104492188, "KL/rejected_KL_mean": -98.08531188964844, "KL/std": 56.95924377441406, "epoch": 0.18502202643171806, "fcm_dpo/beta": 0.009252631105482578, "fcm_dpo/delta": -0.10898162424564362, "fcm_dpo/margin": 54.387474060058594, "fcm_dpo/q_t": 0.38932526111602783, "grad_norm": 20.310638427734375, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.543870210647583, "logits/rejected": -0.5395331382751465, "logps/chosen": -99.36490631103516, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -196.21507263183594, "loss": 1.0441, "margin_dpo/margin_mean": 54.387474060058594, "margin_dpo/margin_std": 72.31570434570312, "step": 126 }, { "KL/chosen_KL_mean": -42.63111114501953, "KL/mean": -62.53132629394531, "KL/rejected_KL_mean": -82.43153381347656, "KL/std": 50.20075225830078, "epoch": 0.18649045521292218, "fcm_dpo/beta": 0.009267613291740417, "fcm_dpo/delta": 0.03231769800186157, "fcm_dpo/margin": 39.8004264831543, "fcm_dpo/q_t": 0.414898157119751, "grad_norm": 20.918685913085938, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.5204076170921326, "logits/rejected": -0.5004839897155762, "logps/chosen": -99.185791015625, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -159.22732543945312, "loss": 1.1319, "margin_dpo/margin_mean": 39.80043029785156, "margin_dpo/margin_std": 63.22393798828125, "step": 127 }, { "KL/chosen_KL_mean": -45.72355270385742, "KL/mean": -66.75888061523438, "KL/rejected_KL_mean": -87.79420471191406, "KL/std": 59.13935852050781, "epoch": 0.18795888399412627, "fcm_dpo/beta": 0.009327895939350128, "fcm_dpo/delta": 0.007488146424293518, "fcm_dpo/margin": 42.07066345214844, "fcm_dpo/q_t": 0.4129607379436493, "grad_norm": 29.327892303466797, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.5080424547195435, "logits/rejected": -0.4870242476463318, "logps/chosen": -103.84451293945312, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -164.23316955566406, "loss": 1.1428, "margin_dpo/margin_mean": 42.07066345214844, "margin_dpo/margin_std": 74.08981323242188, "step": 128 }, { "KL/chosen_KL_mean": -55.38201904296875, "KL/mean": -77.5055923461914, "KL/rejected_KL_mean": -99.62916564941406, "KL/std": 65.07495880126953, "epoch": 0.1894273127753304, "fcm_dpo/beta": 0.009284512139856815, "fcm_dpo/delta": -0.01128113642334938, "fcm_dpo/margin": 44.24713134765625, "fcm_dpo/q_t": 0.4140230417251587, "grad_norm": 20.87249183654785, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.5257991552352905, "logits/rejected": -0.5148609280586243, "logps/chosen": -122.29839324951172, "logps/ref_chosen": -66.91637420654297, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -196.27137756347656, "loss": 1.1488, "margin_dpo/margin_mean": 44.24713134765625, "margin_dpo/margin_std": 84.46028900146484, "step": 129 }, { "KL/chosen_KL_mean": -41.156402587890625, "KL/mean": -74.48085021972656, "KL/rejected_KL_mean": -107.8052978515625, "KL/std": 66.90983581542969, "epoch": 0.19089574155653452, "fcm_dpo/beta": 0.00907239317893982, "fcm_dpo/delta": -0.2178019881248474, "fcm_dpo/margin": 66.64888763427734, "fcm_dpo/q_t": 0.37017908692359924, "grad_norm": 21.265871047973633, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.5009858012199402, "logits/rejected": -0.4956481158733368, "logps/chosen": -85.82325744628906, "logps/ref_chosen": -44.66685104370117, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -190.58694458007812, "loss": 0.9976, "margin_dpo/margin_mean": 66.64889526367188, "margin_dpo/margin_std": 81.45700073242188, "step": 130 }, { "KL/chosen_KL_mean": -36.80657196044922, "KL/mean": -70.24964904785156, "KL/rejected_KL_mean": -103.69271850585938, "KL/std": 66.65603637695312, "epoch": 0.19236417033773862, "fcm_dpo/beta": 0.008619595319032669, "fcm_dpo/delta": -0.18765899538993835, "fcm_dpo/margin": 66.88614654541016, "fcm_dpo/q_t": 0.3690122663974762, "grad_norm": 28.770198822021484, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.498441219329834, "logits/rejected": -0.5051707625389099, "logps/chosen": -81.73116302490234, "logps/ref_chosen": -44.924591064453125, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -192.13671875, "loss": 0.9805, "margin_dpo/margin_mean": 66.88614654541016, "margin_dpo/margin_std": 73.10858154296875, "step": 131 }, { "KL/chosen_KL_mean": -48.89440155029297, "KL/mean": -76.34346008300781, "KL/rejected_KL_mean": -103.79251098632812, "KL/std": 66.48584747314453, "epoch": 0.19383259911894274, "fcm_dpo/beta": 0.00844726525247097, "fcm_dpo/delta": -0.06705770641565323, "fcm_dpo/margin": 54.898109436035156, "fcm_dpo/q_t": 0.40113556385040283, "grad_norm": 19.942279815673828, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.48032820224761963, "logits/rejected": -0.47630518674850464, "logps/chosen": -107.89549255371094, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -191.68466186523438, "loss": 1.0886, "margin_dpo/margin_mean": 54.89811325073242, "margin_dpo/margin_std": 86.42204284667969, "step": 132 }, { "KL/chosen_KL_mean": -57.482398986816406, "KL/mean": -81.73624420166016, "KL/rejected_KL_mean": -105.99009704589844, "KL/std": 59.815887451171875, "epoch": 0.19530102790014683, "fcm_dpo/beta": 0.008408504538238049, "fcm_dpo/delta": -0.00830613262951374, "fcm_dpo/margin": 48.5077018737793, "fcm_dpo/q_t": 0.41032248735427856, "grad_norm": 27.46077537536621, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.5145904421806335, "logits/rejected": -0.5010430812835693, "logps/chosen": -124.08689880371094, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -202.32366943359375, "loss": 1.11, "margin_dpo/margin_mean": 48.50770568847656, "margin_dpo/margin_std": 75.81759643554688, "step": 133 }, { "KL/chosen_KL_mean": -45.88512420654297, "KL/mean": -71.80236053466797, "KL/rejected_KL_mean": -97.7196044921875, "KL/std": 58.62601852416992, "epoch": 0.19676945668135096, "fcm_dpo/beta": 0.008402526378631592, "fcm_dpo/delta": -0.037258490920066833, "fcm_dpo/margin": 51.83448028564453, "fcm_dpo/q_t": 0.4022940993309021, "grad_norm": 18.916580200195312, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.4667087197303772, "logits/rejected": -0.4526156187057495, "logps/chosen": -97.95437622070312, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -185.37411499023438, "loss": 1.0693, "margin_dpo/margin_mean": 51.83448028564453, "margin_dpo/margin_std": 67.91160583496094, "step": 134 }, { "KL/chosen_KL_mean": -50.227142333984375, "KL/mean": -87.61117553710938, "KL/rejected_KL_mean": -124.99522399902344, "KL/std": 77.6145248413086, "epoch": 0.19823788546255505, "fcm_dpo/beta": 0.008090103045105934, "fcm_dpo/delta": -0.21815121173858643, "fcm_dpo/margin": 74.76806640625, "fcm_dpo/q_t": 0.3705596625804901, "grad_norm": 22.08445167541504, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.48219579458236694, "logits/rejected": -0.5165150165557861, "logps/chosen": -100.58100128173828, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -240.9749755859375, "loss": 0.9939, "margin_dpo/margin_mean": 74.76806640625, "margin_dpo/margin_std": 93.64501953125, "step": 135 }, { "KL/chosen_KL_mean": -59.928977966308594, "KL/mean": -84.67167663574219, "KL/rejected_KL_mean": -109.41439819335938, "KL/std": 70.4333724975586, "epoch": 0.19970631424375918, "fcm_dpo/beta": 0.007961141876876354, "fcm_dpo/delta": 0.005593650043010712, "fcm_dpo/margin": 49.48542404174805, "fcm_dpo/q_t": 0.41935813426971436, "grad_norm": 20.334075927734375, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.481515109539032, "logits/rejected": -0.4732978343963623, "logps/chosen": -125.0014877319336, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -205.73562622070312, "loss": 1.1443, "margin_dpo/margin_mean": 49.48542022705078, "margin_dpo/margin_std": 90.12128448486328, "step": 136 }, { "KL/chosen_KL_mean": -57.55767822265625, "KL/mean": -96.53668212890625, "KL/rejected_KL_mean": -135.51568603515625, "KL/std": 93.2874984741211, "epoch": 0.2011747430249633, "fcm_dpo/beta": 0.00774747971445322, "fcm_dpo/delta": -0.21738505363464355, "fcm_dpo/margin": 77.95802307128906, "fcm_dpo/q_t": 0.3767836093902588, "grad_norm": 19.739362716674805, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.42566192150115967, "logits/rejected": -0.4528757333755493, "logps/chosen": -106.31678771972656, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86376953125, "logps/rejected": -249.37945556640625, "loss": 1.0251, "margin_dpo/margin_mean": 77.95802307128906, "margin_dpo/margin_std": 111.74757385253906, "step": 137 }, { "KL/chosen_KL_mean": -59.95167922973633, "KL/mean": -89.62110137939453, "KL/rejected_KL_mean": -119.29052734375, "KL/std": 71.20696258544922, "epoch": 0.2026431718061674, "fcm_dpo/beta": 0.007621297147125006, "fcm_dpo/delta": -0.054680272936820984, "fcm_dpo/margin": 59.33884811401367, "fcm_dpo/q_t": 0.3964976966381073, "grad_norm": 21.295473098754883, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.4297477602958679, "logits/rejected": -0.41819727420806885, "logps/chosen": -120.47132873535156, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -212.4874725341797, "loss": 1.0552, "margin_dpo/margin_mean": 59.33884811401367, "margin_dpo/margin_std": 72.82606506347656, "step": 138 }, { "KL/chosen_KL_mean": -50.222434997558594, "KL/mean": -85.49241638183594, "KL/rejected_KL_mean": -120.76240539550781, "KL/std": 67.446044921875, "epoch": 0.20411160058737152, "fcm_dpo/beta": 0.007455192506313324, "fcm_dpo/delta": -0.13268427550792694, "fcm_dpo/margin": 70.53996276855469, "fcm_dpo/q_t": 0.38338446617126465, "grad_norm": 18.57466697692871, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.44361281394958496, "logits/rejected": -0.4341086149215698, "logps/chosen": -97.11381530761719, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -200.49038696289062, "loss": 1.0198, "margin_dpo/margin_mean": 70.53996276855469, "margin_dpo/margin_std": 87.36215209960938, "step": 139 }, { "KL/chosen_KL_mean": -60.796875, "KL/mean": -92.86795806884766, "KL/rejected_KL_mean": -124.93904113769531, "KL/std": 75.12921905517578, "epoch": 0.2055800293685756, "fcm_dpo/beta": 0.007304832339286804, "fcm_dpo/delta": -0.07196947187185287, "fcm_dpo/margin": 64.14215850830078, "fcm_dpo/q_t": 0.39591526985168457, "grad_norm": 21.425811767578125, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.4486401081085205, "logits/rejected": -0.4356744587421417, "logps/chosen": -119.77159118652344, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28410339355469, "logps/rejected": -208.22314453125, "loss": 1.0689, "margin_dpo/margin_mean": 64.14215850830078, "margin_dpo/margin_std": 90.35142517089844, "step": 140 }, { "KL/chosen_KL_mean": -69.00209045410156, "KL/mean": -99.4312744140625, "KL/rejected_KL_mean": -129.86044311523438, "KL/std": 83.01104736328125, "epoch": 0.20704845814977973, "fcm_dpo/beta": 0.007242698222398758, "fcm_dpo/delta": -0.04263737052679062, "fcm_dpo/margin": 60.85835266113281, "fcm_dpo/q_t": 0.4011520743370056, "grad_norm": 27.03215980529785, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.4251963496208191, "logits/rejected": -0.41217079758644104, "logps/chosen": -144.0777587890625, "logps/ref_chosen": -75.07566833496094, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -228.05270385742188, "loss": 1.1024, "margin_dpo/margin_mean": 60.85835266113281, "margin_dpo/margin_std": 95.99069213867188, "step": 141 }, { "KL/chosen_KL_mean": -69.8545150756836, "KL/mean": -104.672607421875, "KL/rejected_KL_mean": -139.49070739746094, "KL/std": 90.1983642578125, "epoch": 0.20851688693098386, "fcm_dpo/beta": 0.007142849266529083, "fcm_dpo/delta": -0.10244297236204147, "fcm_dpo/margin": 69.63619995117188, "fcm_dpo/q_t": 0.3925698399543762, "grad_norm": 26.979690551757812, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.40249842405319214, "logits/rejected": -0.40109604597091675, "logps/chosen": -127.8824462890625, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222961425781, "logps/rejected": -234.07293701171875, "loss": 1.0826, "margin_dpo/margin_mean": 69.63619995117188, "margin_dpo/margin_std": 106.22422790527344, "step": 142 }, { "KL/chosen_KL_mean": -74.11309051513672, "KL/mean": -96.2786865234375, "KL/rejected_KL_mean": -118.44427490234375, "KL/std": 79.39483642578125, "epoch": 0.20998531571218795, "fcm_dpo/beta": 0.007136983796954155, "fcm_dpo/delta": 0.08635500073432922, "fcm_dpo/margin": 44.3311882019043, "fcm_dpo/q_t": 0.4322276711463928, "grad_norm": 23.613080978393555, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.38653671741485596, "logits/rejected": -0.3791394829750061, "logps/chosen": -131.70953369140625, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -197.44384765625, "loss": 1.1961, "margin_dpo/margin_mean": 44.3311882019043, "margin_dpo/margin_std": 92.59246826171875, "step": 143 }, { "KL/chosen_KL_mean": -65.83393859863281, "KL/mean": -93.10847473144531, "KL/rejected_KL_mean": -120.38301086425781, "KL/std": 67.2380142211914, "epoch": 0.21145374449339208, "fcm_dpo/beta": 0.007198760285973549, "fcm_dpo/delta": 0.007602264638990164, "fcm_dpo/margin": 54.54907989501953, "fcm_dpo/q_t": 0.4110908508300781, "grad_norm": 21.00301170349121, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.38446202874183655, "logits/rejected": -0.368974506855011, "logps/chosen": -125.74029541015625, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -202.38327026367188, "loss": 1.1073, "margin_dpo/margin_mean": 54.54907989501953, "margin_dpo/margin_std": 79.00935363769531, "step": 144 }, { "KL/chosen_KL_mean": -62.50547409057617, "KL/mean": -91.89143371582031, "KL/rejected_KL_mean": -121.27738952636719, "KL/std": 67.18325805664062, "epoch": 0.21292217327459617, "fcm_dpo/beta": 0.007178094238042831, "fcm_dpo/delta": -0.02286495827138424, "fcm_dpo/margin": 58.771915435791016, "fcm_dpo/q_t": 0.40436333417892456, "grad_norm": 23.93907356262207, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.43855080008506775, "logits/rejected": -0.42247945070266724, "logps/chosen": -119.10614013671875, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -199.14370727539062, "loss": 1.0876, "margin_dpo/margin_mean": 58.771915435791016, "margin_dpo/margin_std": 81.85527038574219, "step": 145 }, { "KL/chosen_KL_mean": -85.47122192382812, "KL/mean": -108.09506225585938, "KL/rejected_KL_mean": -130.71890258789062, "KL/std": 72.80694580078125, "epoch": 0.2143906020558003, "fcm_dpo/beta": 0.007236181758344173, "fcm_dpo/delta": 0.07508739829063416, "fcm_dpo/margin": 45.2476806640625, "fcm_dpo/q_t": 0.4256265461444855, "grad_norm": 26.226016998291016, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.3779621720314026, "logits/rejected": -0.34679633378982544, "logps/chosen": -151.4716796875, "logps/ref_chosen": -66.00045013427734, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -212.42169189453125, "loss": 1.183, "margin_dpo/margin_mean": 45.2476806640625, "margin_dpo/margin_std": 87.81689453125, "step": 146 }, { "KL/chosen_KL_mean": -62.50874328613281, "KL/mean": -93.20917510986328, "KL/rejected_KL_mean": -123.90959930419922, "KL/std": 73.37457275390625, "epoch": 0.21585903083700442, "fcm_dpo/beta": 0.007220801897346973, "fcm_dpo/delta": -0.045540180057287216, "fcm_dpo/margin": 61.40085983276367, "fcm_dpo/q_t": 0.4018627405166626, "grad_norm": 19.40831184387207, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.44671040773391724, "logits/rejected": -0.42650818824768066, "logps/chosen": -115.91423034667969, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39060974121094, "logps/rejected": -195.3002166748047, "loss": 1.0886, "margin_dpo/margin_mean": 61.40085983276367, "margin_dpo/margin_std": 90.92599487304688, "step": 147 }, { "KL/chosen_KL_mean": -61.455265045166016, "KL/mean": -87.85798645019531, "KL/rejected_KL_mean": -114.26071166992188, "KL/std": 73.19024658203125, "epoch": 0.2173274596182085, "fcm_dpo/beta": 0.007157785817980766, "fcm_dpo/delta": -0.0840882733464241, "fcm_dpo/margin": 52.805450439453125, "fcm_dpo/q_t": 0.4160994589328766, "grad_norm": 19.22397804260254, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.47541412711143494, "logits/rejected": -0.46777063608169556, "logps/chosen": -126.39234924316406, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -217.35455322265625, "loss": 1.1233, "margin_dpo/margin_mean": 52.80545425415039, "margin_dpo/margin_std": 77.12681579589844, "step": 148 }, { "KL/chosen_KL_mean": -59.069393157958984, "KL/mean": -91.8096694946289, "KL/rejected_KL_mean": -124.54994201660156, "KL/std": 66.26725006103516, "epoch": 0.21879588839941264, "fcm_dpo/beta": 0.00697628129273653, "fcm_dpo/delta": -0.06137773394584656, "fcm_dpo/margin": 65.48056030273438, "fcm_dpo/q_t": 0.3954726457595825, "grad_norm": 18.177705764770508, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.4999982714653015, "logits/rejected": -0.48564597964286804, "logps/chosen": -117.54315185546875, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -223.86468505859375, "loss": 1.0551, "margin_dpo/margin_mean": 65.48056030273438, "margin_dpo/margin_std": 78.40389251708984, "step": 149 }, { "KL/chosen_KL_mean": -52.26180648803711, "KL/mean": -84.42105865478516, "KL/rejected_KL_mean": -116.58030700683594, "KL/std": 78.1861343383789, "epoch": 0.22026431718061673, "fcm_dpo/beta": 0.006911845877766609, "fcm_dpo/delta": -0.04800789803266525, "fcm_dpo/margin": 64.3185043334961, "fcm_dpo/q_t": 0.40420806407928467, "grad_norm": 18.062509536743164, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.46106863021850586, "logits/rejected": -0.4648742079734802, "logps/chosen": -97.96762084960938, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -199.9279022216797, "loss": 1.0812, "margin_dpo/margin_mean": 64.3185043334961, "margin_dpo/margin_std": 93.56321716308594, "step": 150 }, { "KL/chosen_KL_mean": -65.14360046386719, "KL/mean": -97.421142578125, "KL/rejected_KL_mean": -129.69869995117188, "KL/std": 73.59419250488281, "epoch": 0.22173274596182085, "fcm_dpo/beta": 0.006893502548336983, "fcm_dpo/delta": -0.047122225165367126, "fcm_dpo/margin": 64.55509185791016, "fcm_dpo/q_t": 0.3980643153190613, "grad_norm": 20.905559539794922, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.541815996170044, "logits/rejected": -0.5158591866493225, "logps/chosen": -135.7144317626953, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -230.16250610351562, "loss": 1.061, "margin_dpo/margin_mean": 64.55509185791016, "margin_dpo/margin_std": 80.68389892578125, "step": 151 }, { "KL/chosen_KL_mean": -57.33415603637695, "KL/mean": -96.5593032836914, "KL/rejected_KL_mean": -135.78445434570312, "KL/std": 76.12808990478516, "epoch": 0.22320117474302498, "fcm_dpo/beta": 0.006756227929145098, "fcm_dpo/delta": -0.13709712028503418, "fcm_dpo/margin": 78.45030212402344, "fcm_dpo/q_t": 0.38215482234954834, "grad_norm": 19.958600997924805, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.48123008012771606, "logits/rejected": -0.47946709394454956, "logps/chosen": -117.49854278564453, "logps/ref_chosen": -60.16438674926758, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -241.92491149902344, "loss": 1.0171, "margin_dpo/margin_mean": 78.45030212402344, "margin_dpo/margin_std": 93.28910827636719, "step": 152 }, { "KL/chosen_KL_mean": -57.76191711425781, "KL/mean": -91.28083801269531, "KL/rejected_KL_mean": -124.79976654052734, "KL/std": 82.61441040039062, "epoch": 0.22466960352422907, "fcm_dpo/beta": 0.006657836027443409, "fcm_dpo/delta": -0.04847495257854462, "fcm_dpo/margin": 67.03783416748047, "fcm_dpo/q_t": 0.403054416179657, "grad_norm": 15.512747764587402, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.48859214782714844, "logits/rejected": -0.48414355516433716, "logps/chosen": -114.07719421386719, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -210.45559692382812, "loss": 1.0887, "margin_dpo/margin_mean": 67.037841796875, "margin_dpo/margin_std": 101.445068359375, "step": 153 }, { "KL/chosen_KL_mean": -69.54912567138672, "KL/mean": -101.59078979492188, "KL/rejected_KL_mean": -133.6324462890625, "KL/std": 83.2306137084961, "epoch": 0.2261380323054332, "fcm_dpo/beta": 0.006629183888435364, "fcm_dpo/delta": -0.026146577671170235, "fcm_dpo/margin": 64.08331298828125, "fcm_dpo/q_t": 0.4066181182861328, "grad_norm": 19.13957977294922, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.4480747580528259, "logits/rejected": -0.4504152834415436, "logps/chosen": -132.29168701171875, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -237.87664794921875, "loss": 1.1201, "margin_dpo/margin_mean": 64.08331298828125, "margin_dpo/margin_std": 105.86033630371094, "step": 154 }, { "KL/chosen_KL_mean": -63.659236907958984, "KL/mean": -95.98409271240234, "KL/rejected_KL_mean": -128.3089599609375, "KL/std": 76.96090698242188, "epoch": 0.2276064610866373, "fcm_dpo/beta": 0.006584943272173405, "fcm_dpo/delta": -0.026932524517178535, "fcm_dpo/margin": 64.64971923828125, "fcm_dpo/q_t": 0.4042484164237976, "grad_norm": 19.22228240966797, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.49922215938568115, "logits/rejected": -0.4822632670402527, "logps/chosen": -124.31242370605469, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -205.80116271972656, "loss": 1.0942, "margin_dpo/margin_mean": 64.64971923828125, "margin_dpo/margin_std": 95.12773132324219, "step": 155 }, { "KL/chosen_KL_mean": -89.18508911132812, "KL/mean": -107.76153564453125, "KL/rejected_KL_mean": -126.33798217773438, "KL/std": 82.31591796875, "epoch": 0.2290748898678414, "fcm_dpo/beta": 0.006626888178288937, "fcm_dpo/delta": 0.05205275118350983, "fcm_dpo/margin": 37.15288543701172, "fcm_dpo/q_t": 0.44431304931640625, "grad_norm": 29.14635467529297, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.4069097638130188, "logits/rejected": -0.3994802236557007, "logps/chosen": -158.67697143554688, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.16929626464844, "logps/rejected": -203.50726318359375, "loss": 1.2754, "margin_dpo/margin_mean": 37.15288543701172, "margin_dpo/margin_std": 108.07014465332031, "step": 156 }, { "KL/chosen_KL_mean": -80.44374084472656, "KL/mean": -121.49053192138672, "KL/rejected_KL_mean": -162.53732299804688, "KL/std": 89.35894775390625, "epoch": 0.2305433186490455, "fcm_dpo/beta": 0.006456049624830484, "fcm_dpo/delta": -0.13866297900676727, "fcm_dpo/margin": 82.09356689453125, "fcm_dpo/q_t": 0.37898433208465576, "grad_norm": 23.95264434814453, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.3959600329399109, "logits/rejected": -0.4036720395088196, "logps/chosen": -141.8121795654297, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -270.1837158203125, "loss": 1.0292, "margin_dpo/margin_mean": 82.09357452392578, "margin_dpo/margin_std": 101.68392944335938, "step": 157 }, { "KL/chosen_KL_mean": -80.78021240234375, "KL/mean": -124.482666015625, "KL/rejected_KL_mean": -168.18511962890625, "KL/std": 110.5858154296875, "epoch": 0.23201174743024963, "fcm_dpo/beta": 0.006296713836491108, "fcm_dpo/delta": -0.15908576548099518, "fcm_dpo/margin": 87.40489959716797, "fcm_dpo/q_t": 0.3867127597332001, "grad_norm": 19.55266761779785, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.33809971809387207, "logits/rejected": -0.35502055287361145, "logps/chosen": -138.39312744140625, "logps/ref_chosen": -57.612918853759766, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -281.8797912597656, "loss": 1.0552, "margin_dpo/margin_mean": 87.4049072265625, "margin_dpo/margin_std": 133.116943359375, "step": 158 }, { "KL/chosen_KL_mean": -90.86648559570312, "KL/mean": -120.76834106445312, "KL/rejected_KL_mean": -150.67018127441406, "KL/std": 97.11602783203125, "epoch": 0.23348017621145375, "fcm_dpo/beta": 0.006294050253927708, "fcm_dpo/delta": 0.02400752529501915, "fcm_dpo/margin": 59.80369567871094, "fcm_dpo/q_t": 0.41609764099121094, "grad_norm": 22.071809768676758, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.4063527286052704, "logits/rejected": -0.37675607204437256, "logps/chosen": -172.4268341064453, "logps/ref_chosen": -81.56034851074219, "logps/ref_rejected": -88.89871215820312, "logps/rejected": -239.5688934326172, "loss": 1.1473, "margin_dpo/margin_mean": 59.80369567871094, "margin_dpo/margin_std": 104.75639343261719, "step": 159 }, { "KL/chosen_KL_mean": -93.1180419921875, "KL/mean": -133.37435913085938, "KL/rejected_KL_mean": -173.6306610107422, "KL/std": 104.18497467041016, "epoch": 0.23494860499265785, "fcm_dpo/beta": 0.006151704117655754, "fcm_dpo/delta": -0.10115846991539001, "fcm_dpo/margin": 80.51261901855469, "fcm_dpo/q_t": 0.3962337076663971, "grad_norm": 23.225406646728516, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.3705775737762451, "logits/rejected": -0.362305611371994, "logps/chosen": -158.8489227294922, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -270.8484802246094, "loss": 1.0912, "margin_dpo/margin_mean": 80.51261901855469, "margin_dpo/margin_std": 130.29788208007812, "step": 160 }, { "KL/chosen_KL_mean": -79.38202667236328, "KL/mean": -114.66146850585938, "KL/rejected_KL_mean": -149.94090270996094, "KL/std": 82.19270324707031, "epoch": 0.23641703377386197, "fcm_dpo/beta": 0.00611657090485096, "fcm_dpo/delta": -0.03301185369491577, "fcm_dpo/margin": 70.55889129638672, "fcm_dpo/q_t": 0.4046275019645691, "grad_norm": 21.588083267211914, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.3816624879837036, "logits/rejected": -0.3820039629936218, "logps/chosen": -131.81849670410156, "logps/ref_chosen": -52.43647003173828, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -233.37185668945312, "loss": 1.0935, "margin_dpo/margin_mean": 70.55888366699219, "margin_dpo/margin_std": 104.3523941040039, "step": 161 }, { "KL/chosen_KL_mean": -76.32833862304688, "KL/mean": -111.31201171875, "KL/rejected_KL_mean": -146.29568481445312, "KL/std": 91.16246032714844, "epoch": 0.23788546255506607, "fcm_dpo/beta": 0.0060338219627738, "fcm_dpo/delta": -0.02483561635017395, "fcm_dpo/margin": 69.96736145019531, "fcm_dpo/q_t": 0.407100111246109, "grad_norm": 21.74049186706543, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.4195418953895569, "logits/rejected": -0.4026295840740204, "logps/chosen": -138.9389190673828, "logps/ref_chosen": -62.6105842590332, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -235.686279296875, "loss": 1.109, "margin_dpo/margin_mean": 69.96736145019531, "margin_dpo/margin_std": 108.229248046875, "step": 162 }, { "KL/chosen_KL_mean": -85.92386627197266, "KL/mean": -120.09257507324219, "KL/rejected_KL_mean": -154.26129150390625, "KL/std": 91.9381103515625, "epoch": 0.2393538913362702, "fcm_dpo/beta": 0.006065480876713991, "fcm_dpo/delta": -0.015123652294278145, "fcm_dpo/margin": 68.33741760253906, "fcm_dpo/q_t": 0.4095137119293213, "grad_norm": 21.437828063964844, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.3672639727592468, "logits/rejected": -0.3495738208293915, "logps/chosen": -140.94549560546875, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.418212890625, "logps/rejected": -229.67950439453125, "loss": 1.1152, "margin_dpo/margin_mean": 68.33741760253906, "margin_dpo/margin_std": 110.58999633789062, "step": 163 }, { "KL/chosen_KL_mean": -77.3841323852539, "KL/mean": -119.36503601074219, "KL/rejected_KL_mean": -161.34591674804688, "KL/std": 89.17874908447266, "epoch": 0.24082232011747431, "fcm_dpo/beta": 0.0059239305555820465, "fcm_dpo/delta": -0.10423934459686279, "fcm_dpo/margin": 83.9617919921875, "fcm_dpo/q_t": 0.3878824710845947, "grad_norm": 21.113449096679688, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.3516240119934082, "logits/rejected": -0.33663517236709595, "logps/chosen": -133.02481079101562, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -241.01055908203125, "loss": 1.035, "margin_dpo/margin_mean": 83.9617919921875, "margin_dpo/margin_std": 102.90313720703125, "step": 164 }, { "KL/chosen_KL_mean": -83.97947692871094, "KL/mean": -110.91374969482422, "KL/rejected_KL_mean": -137.8480224609375, "KL/std": 75.39066314697266, "epoch": 0.2422907488986784, "fcm_dpo/beta": 0.005989417899399996, "fcm_dpo/delta": 0.08000632375478745, "fcm_dpo/margin": 53.86854553222656, "fcm_dpo/q_t": 0.42767125368118286, "grad_norm": 23.085264205932617, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.3876940608024597, "logits/rejected": -0.36072492599487305, "logps/chosen": -145.2901611328125, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -211.51861572265625, "loss": 1.1749, "margin_dpo/margin_mean": 53.86854553222656, "margin_dpo/margin_std": 100.26142883300781, "step": 165 }, { "KL/chosen_KL_mean": -73.80902099609375, "KL/mean": -123.69305419921875, "KL/rejected_KL_mean": -173.57708740234375, "KL/std": 109.76763916015625, "epoch": 0.24375917767988253, "fcm_dpo/beta": 0.0058315591886639595, "fcm_dpo/delta": -0.19435712695121765, "fcm_dpo/margin": 99.76808166503906, "fcm_dpo/q_t": 0.3807521462440491, "grad_norm": 17.283048629760742, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.36310431361198425, "logits/rejected": -0.37374886870384216, "logps/chosen": -124.79261779785156, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -271.6722106933594, "loss": 1.0185, "margin_dpo/margin_mean": 99.76808166503906, "margin_dpo/margin_std": 138.61410522460938, "step": 166 }, { "KL/chosen_KL_mean": -75.02628326416016, "KL/mean": -127.5509033203125, "KL/rejected_KL_mean": -180.07553100585938, "KL/std": 100.14985656738281, "epoch": 0.24522760646108663, "fcm_dpo/beta": 0.005625586491078138, "fcm_dpo/delta": -0.20304620265960693, "fcm_dpo/margin": 105.04924011230469, "fcm_dpo/q_t": 0.36813193559646606, "grad_norm": 21.618406295776367, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.3545036017894745, "logits/rejected": -0.35761505365371704, "logps/chosen": -125.45037841796875, "logps/ref_chosen": -50.424095153808594, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -276.10595703125, "loss": 0.9744, "margin_dpo/margin_mean": 105.04924011230469, "margin_dpo/margin_std": 115.94286346435547, "step": 167 }, { "KL/chosen_KL_mean": -81.31709289550781, "KL/mean": -120.00129699707031, "KL/rejected_KL_mean": -158.6855010986328, "KL/std": 93.61595153808594, "epoch": 0.24669603524229075, "fcm_dpo/beta": 0.0055332607589662075, "fcm_dpo/delta": -0.029504312202334404, "fcm_dpo/margin": 77.36842346191406, "fcm_dpo/q_t": 0.40392887592315674, "grad_norm": 19.52683448791504, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.349258691072464, "logits/rejected": -0.339669793844223, "logps/chosen": -130.7799072265625, "logps/ref_chosen": -49.462825775146484, "logps/ref_rejected": -75.30855560302734, "logps/rejected": -233.99404907226562, "loss": 1.079, "margin_dpo/margin_mean": 77.36842346191406, "margin_dpo/margin_std": 104.69574737548828, "step": 168 }, { "KL/chosen_KL_mean": -83.92520904541016, "KL/mean": -117.55094909667969, "KL/rejected_KL_mean": -151.1767120361328, "KL/std": 92.2286605834961, "epoch": 0.24816446402349487, "fcm_dpo/beta": 0.005457356106489897, "fcm_dpo/delta": -0.07184266299009323, "fcm_dpo/margin": 67.25149536132812, "fcm_dpo/q_t": 0.420589804649353, "grad_norm": 20.000539779663086, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.3642885386943817, "logits/rejected": -0.34793075919151306, "logps/chosen": -143.72865295410156, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -234.5224609375, "loss": 1.162, "margin_dpo/margin_mean": 67.25149536132812, "margin_dpo/margin_std": 126.57770538330078, "step": 169 }, { "KL/chosen_KL_mean": -74.95732116699219, "KL/mean": -114.7613296508789, "KL/rejected_KL_mean": -154.56533813476562, "KL/std": 88.03938293457031, "epoch": 0.24963289280469897, "fcm_dpo/beta": 0.005398896988481283, "fcm_dpo/delta": -0.03206340968608856, "fcm_dpo/margin": 79.6080093383789, "fcm_dpo/q_t": 0.4014880359172821, "grad_norm": 17.664331436157227, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.298395574092865, "logits/rejected": -0.2869154214859009, "logps/chosen": -124.42909240722656, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -230.482666015625, "loss": 1.0798, "margin_dpo/margin_mean": 79.6080093383789, "margin_dpo/margin_std": 105.32583618164062, "step": 170 }, { "KL/chosen_KL_mean": -110.30496215820312, "KL/mean": -142.41079711914062, "KL/rejected_KL_mean": -174.51663208007812, "KL/std": 103.6309585571289, "epoch": 0.2511013215859031, "fcm_dpo/beta": 0.005459581036120653, "fcm_dpo/delta": 0.051255661994218826, "fcm_dpo/margin": 64.2116470336914, "fcm_dpo/q_t": 0.4263428747653961, "grad_norm": 28.27412223815918, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.3548741340637207, "logits/rejected": -0.3387761116027832, "logps/chosen": -194.8042755126953, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -283.89874267578125, "loss": 1.1816, "margin_dpo/margin_mean": 64.2116470336914, "margin_dpo/margin_std": 133.7387237548828, "step": 171 }, { "KL/chosen_KL_mean": -95.93238830566406, "KL/mean": -130.29452514648438, "KL/rejected_KL_mean": -164.65667724609375, "KL/std": 100.20172882080078, "epoch": 0.2525697503671072, "fcm_dpo/beta": 0.005491352174431086, "fcm_dpo/delta": 0.02349797450006008, "fcm_dpo/margin": 68.72428131103516, "fcm_dpo/q_t": 0.41575637459754944, "grad_norm": 18.535226821899414, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.3598722219467163, "logits/rejected": -0.3389941453933716, "logps/chosen": -164.5863037109375, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -250.0933380126953, "loss": 1.132, "margin_dpo/margin_mean": 68.72427368164062, "margin_dpo/margin_std": 113.38480377197266, "step": 172 }, { "KL/chosen_KL_mean": -88.23031616210938, "KL/mean": -122.02497100830078, "KL/rejected_KL_mean": -155.81961059570312, "KL/std": 92.91819763183594, "epoch": 0.2540381791483113, "fcm_dpo/beta": 0.005545733496546745, "fcm_dpo/delta": 0.025776570662856102, "fcm_dpo/margin": 67.58930969238281, "fcm_dpo/q_t": 0.4147086441516876, "grad_norm": 20.111751556396484, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.3795207440853119, "logits/rejected": -0.3491283059120178, "logps/chosen": -151.28118896484375, "logps/ref_chosen": -63.050880432128906, "logps/ref_rejected": -78.68392181396484, "logps/rejected": -234.5035400390625, "loss": 1.1112, "margin_dpo/margin_mean": 67.58930969238281, "margin_dpo/margin_std": 95.4912109375, "step": 173 }, { "KL/chosen_KL_mean": -82.88648223876953, "KL/mean": -122.82292175292969, "KL/rejected_KL_mean": -162.75936889648438, "KL/std": 97.02500915527344, "epoch": 0.2555066079295154, "fcm_dpo/beta": 0.005529084708541632, "fcm_dpo/delta": -0.04399598762392998, "fcm_dpo/margin": 79.87288665771484, "fcm_dpo/q_t": 0.402817964553833, "grad_norm": 28.604568481445312, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.30759066343307495, "logits/rejected": -0.3249150216579437, "logps/chosen": -136.24945068359375, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -264.67059326171875, "loss": 1.0882, "margin_dpo/margin_mean": 79.87288665771484, "margin_dpo/margin_std": 115.7405014038086, "step": 174 }, { "KL/chosen_KL_mean": -76.46298217773438, "KL/mean": -131.42095947265625, "KL/rejected_KL_mean": -186.37893676757812, "KL/std": 104.40403747558594, "epoch": 0.25697503671071953, "fcm_dpo/beta": 0.005318961106240749, "fcm_dpo/delta": -0.1964312642812729, "fcm_dpo/margin": 109.91595458984375, "fcm_dpo/q_t": 0.36687812209129333, "grad_norm": 29.169300079345703, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.32927554845809937, "logits/rejected": -0.31611427664756775, "logps/chosen": -121.88074493408203, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -275.884765625, "loss": 0.9593, "margin_dpo/margin_mean": 109.91596221923828, "margin_dpo/margin_std": 109.20188903808594, "step": 175 }, { "KL/chosen_KL_mean": -81.6014404296875, "KL/mean": -127.35952758789062, "KL/rejected_KL_mean": -173.11761474609375, "KL/std": 102.09504699707031, "epoch": 0.25844346549192365, "fcm_dpo/beta": 0.0052184974774718285, "fcm_dpo/delta": -0.08138823509216309, "fcm_dpo/margin": 91.51618957519531, "fcm_dpo/q_t": 0.394927978515625, "grad_norm": 20.01445770263672, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.28535836935043335, "logits/rejected": -0.3025384843349457, "logps/chosen": -132.0542755126953, "logps/ref_chosen": -50.452842712402344, "logps/ref_rejected": -95.5589599609375, "logps/rejected": -268.67657470703125, "loss": 1.0499, "margin_dpo/margin_mean": 91.51618957519531, "margin_dpo/margin_std": 118.59428405761719, "step": 176 }, { "KL/chosen_KL_mean": -94.85406494140625, "KL/mean": -140.81088256835938, "KL/rejected_KL_mean": -186.76768493652344, "KL/std": 111.390869140625, "epoch": 0.2599118942731278, "fcm_dpo/beta": 0.005144456867128611, "fcm_dpo/delta": -0.07641495764255524, "fcm_dpo/margin": 91.91362762451172, "fcm_dpo/q_t": 0.3949311375617981, "grad_norm": 27.786762237548828, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.3724118173122406, "logits/rejected": -0.364002525806427, "logps/chosen": -156.07052612304688, "logps/ref_chosen": -61.216468811035156, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -282.6614685058594, "loss": 1.0495, "margin_dpo/margin_mean": 91.91362762451172, "margin_dpo/margin_std": 118.17066955566406, "step": 177 }, { "KL/chosen_KL_mean": -104.51988220214844, "KL/mean": -162.33509826660156, "KL/rejected_KL_mean": -220.1503143310547, "KL/std": 131.26687622070312, "epoch": 0.26138032305433184, "fcm_dpo/beta": 0.004952050745487213, "fcm_dpo/delta": -0.18411573767662048, "fcm_dpo/margin": 115.63043975830078, "fcm_dpo/q_t": 0.37578919529914856, "grad_norm": 27.795106887817383, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.2641046941280365, "logits/rejected": -0.2551937997341156, "logps/chosen": -162.78466796875, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.3653335571289, "logps/rejected": -325.5156555175781, "loss": 1.0002, "margin_dpo/margin_mean": 115.63044738769531, "margin_dpo/margin_std": 143.41700744628906, "step": 178 }, { "KL/chosen_KL_mean": -109.52047729492188, "KL/mean": -149.59457397460938, "KL/rejected_KL_mean": -189.66867065429688, "KL/std": 112.11015319824219, "epoch": 0.26284875183553597, "fcm_dpo/beta": 0.004905564710497856, "fcm_dpo/delta": 0.006664544343948364, "fcm_dpo/margin": 80.14815521240234, "fcm_dpo/q_t": 0.41320013999938965, "grad_norm": 34.54417419433594, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.30266761779785156, "logits/rejected": -0.3117542266845703, "logps/chosen": -170.57879638671875, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -280.19647216796875, "loss": 1.1404, "margin_dpo/margin_mean": 80.14815521240234, "margin_dpo/margin_std": 139.17221069335938, "step": 179 }, { "KL/chosen_KL_mean": -94.91496276855469, "KL/mean": -146.8360595703125, "KL/rejected_KL_mean": -198.7571563720703, "KL/std": 101.62055969238281, "epoch": 0.2643171806167401, "fcm_dpo/beta": 0.004845252260565758, "fcm_dpo/delta": -0.10879069566726685, "fcm_dpo/margin": 103.84219360351562, "fcm_dpo/q_t": 0.3857002854347229, "grad_norm": 19.25888442993164, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.29693859815597534, "logits/rejected": -0.29173195362091064, "logps/chosen": -149.2576904296875, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -296.968994140625, "loss": 1.0305, "margin_dpo/margin_mean": 103.84219360351562, "margin_dpo/margin_std": 126.80170440673828, "step": 180 }, { "KL/chosen_KL_mean": -83.34645080566406, "KL/mean": -115.71342468261719, "KL/rejected_KL_mean": -148.08038330078125, "KL/std": 93.42445373535156, "epoch": 0.2657856093979442, "fcm_dpo/beta": 0.004894108511507511, "fcm_dpo/delta": 0.08573634922504425, "fcm_dpo/margin": 64.73393249511719, "fcm_dpo/q_t": 0.4280344247817993, "grad_norm": 20.40754508972168, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.3203880190849304, "logits/rejected": -0.2962578535079956, "logps/chosen": -138.34690856933594, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -209.73655700683594, "loss": 1.1713, "margin_dpo/margin_mean": 64.73393249511719, "margin_dpo/margin_std": 117.01361083984375, "step": 181 }, { "KL/chosen_KL_mean": -79.22171020507812, "KL/mean": -136.55117797851562, "KL/rejected_KL_mean": -193.88064575195312, "KL/std": 114.58843994140625, "epoch": 0.26725403817914833, "fcm_dpo/beta": 0.004775552079081535, "fcm_dpo/delta": -0.15708649158477783, "fcm_dpo/margin": 114.658935546875, "fcm_dpo/q_t": 0.37630826234817505, "grad_norm": 18.048755645751953, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.2832631766796112, "logits/rejected": -0.29323720932006836, "logps/chosen": -120.32955932617188, "logps/ref_chosen": -41.107852935791016, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -283.4022216796875, "loss": 1.0117, "margin_dpo/margin_mean": 114.658935546875, "margin_dpo/margin_std": 138.28912353515625, "step": 182 }, { "KL/chosen_KL_mean": -116.80380249023438, "KL/mean": -147.58697509765625, "KL/rejected_KL_mean": -178.3701629638672, "KL/std": 93.99075317382812, "epoch": 0.2687224669603524, "fcm_dpo/beta": 0.004714460577815771, "fcm_dpo/delta": -0.04429354518651962, "fcm_dpo/margin": 61.56635665893555, "fcm_dpo/q_t": 0.4325304627418518, "grad_norm": 21.687788009643555, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.29588770866394043, "logits/rejected": -0.28640466928482056, "logps/chosen": -174.328369140625, "logps/ref_chosen": -57.52456283569336, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -254.34588623046875, "loss": 1.1828, "margin_dpo/margin_mean": 61.56635665893555, "margin_dpo/margin_std": 104.21000671386719, "step": 183 }, { "KL/chosen_KL_mean": -95.97406005859375, "KL/mean": -128.724609375, "KL/rejected_KL_mean": -161.4751434326172, "KL/std": 88.82809448242188, "epoch": 0.2701908957415565, "fcm_dpo/beta": 0.0047124335542321205, "fcm_dpo/delta": -0.004301935900002718, "fcm_dpo/margin": 65.50109100341797, "fcm_dpo/q_t": 0.4299464225769043, "grad_norm": 18.115541458129883, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.3457328975200653, "logits/rejected": -0.33615928888320923, "logps/chosen": -154.51901245117188, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -238.1092071533203, "loss": 1.1687, "margin_dpo/margin_mean": 65.50109100341797, "margin_dpo/margin_std": 111.95549011230469, "step": 184 }, { "KL/chosen_KL_mean": -104.63207244873047, "KL/mean": -130.31781005859375, "KL/rejected_KL_mean": -156.0035400390625, "KL/std": 102.3460693359375, "epoch": 0.27165932452276065, "fcm_dpo/beta": 0.0048194690607488155, "fcm_dpo/delta": 0.15635941922664642, "fcm_dpo/margin": 51.37147521972656, "fcm_dpo/q_t": 0.44680285453796387, "grad_norm": 20.880599975585938, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.3066332936286926, "logits/rejected": -0.2832027077674866, "logps/chosen": -166.65792846679688, "logps/ref_chosen": -62.025848388671875, "logps/ref_rejected": -73.7625961303711, "logps/rejected": -229.76614379882812, "loss": 1.2341, "margin_dpo/margin_mean": 51.37147521972656, "margin_dpo/margin_std": 122.99656677246094, "step": 185 }, { "KL/chosen_KL_mean": -95.6789779663086, "KL/mean": -144.47451782226562, "KL/rejected_KL_mean": -193.27001953125, "KL/std": 103.38729858398438, "epoch": 0.27312775330396477, "fcm_dpo/beta": 0.004802432842552662, "fcm_dpo/delta": -0.07216604053974152, "fcm_dpo/margin": 97.591064453125, "fcm_dpo/q_t": 0.393841028213501, "grad_norm": 26.266706466674805, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.3480488061904907, "logits/rejected": -0.32328087091445923, "logps/chosen": -165.03244018554688, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -281.34246826171875, "loss": 1.045, "margin_dpo/margin_mean": 97.591064453125, "margin_dpo/margin_std": 118.66375732421875, "step": 186 }, { "KL/chosen_KL_mean": -88.34507751464844, "KL/mean": -128.76791381835938, "KL/rejected_KL_mean": -169.19076538085938, "KL/std": 97.03087615966797, "epoch": 0.2745961820851689, "fcm_dpo/beta": 0.004818159155547619, "fcm_dpo/delta": 0.010491464287042618, "fcm_dpo/margin": 80.8456802368164, "fcm_dpo/q_t": 0.41053086519241333, "grad_norm": 22.043073654174805, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.3333667516708374, "logits/rejected": -0.3270256221294403, "logps/chosen": -141.10153198242188, "logps/ref_chosen": -52.7564582824707, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -251.15985107421875, "loss": 1.0959, "margin_dpo/margin_mean": 80.8456802368164, "margin_dpo/margin_std": 105.46454620361328, "step": 187 }, { "KL/chosen_KL_mean": -83.22930908203125, "KL/mean": -131.79873657226562, "KL/rejected_KL_mean": -180.36813354492188, "KL/std": 107.7387466430664, "epoch": 0.27606461086637296, "fcm_dpo/beta": 0.004757707007229328, "fcm_dpo/delta": -0.06513302028179169, "fcm_dpo/margin": 97.13882446289062, "fcm_dpo/q_t": 0.3954910933971405, "grad_norm": 28.16905975341797, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.3342798352241516, "logits/rejected": -0.3404528498649597, "logps/chosen": -132.64480590820312, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -269.9085693359375, "loss": 1.0479, "margin_dpo/margin_mean": 97.13883209228516, "margin_dpo/margin_std": 117.88801574707031, "step": 188 }, { "KL/chosen_KL_mean": -96.34205627441406, "KL/mean": -136.79183959960938, "KL/rejected_KL_mean": -177.24160766601562, "KL/std": 108.5394287109375, "epoch": 0.2775330396475771, "fcm_dpo/beta": 0.004754100926220417, "fcm_dpo/delta": 0.015977924689650536, "fcm_dpo/margin": 80.89956665039062, "fcm_dpo/q_t": 0.41550976037979126, "grad_norm": 23.41521644592285, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.3520697355270386, "logits/rejected": -0.3348464369773865, "logps/chosen": -148.7410125732422, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -249.40896606445312, "loss": 1.1258, "margin_dpo/margin_mean": 80.89956665039062, "margin_dpo/margin_std": 133.14503479003906, "step": 189 }, { "KL/chosen_KL_mean": -103.06121826171875, "KL/mean": -150.68637084960938, "KL/rejected_KL_mean": -198.3115234375, "KL/std": 115.74911499023438, "epoch": 0.2790014684287812, "fcm_dpo/beta": 0.004754353780299425, "fcm_dpo/delta": -0.05636203661561012, "fcm_dpo/margin": 95.25030517578125, "fcm_dpo/q_t": 0.39939507842063904, "grad_norm": 18.363422393798828, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.38547688722610474, "logits/rejected": -0.375651478767395, "logps/chosen": -167.74429321289062, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -300.862060546875, "loss": 1.0822, "margin_dpo/margin_mean": 95.25030517578125, "margin_dpo/margin_std": 133.5958251953125, "step": 190 }, { "KL/chosen_KL_mean": -95.35700988769531, "KL/mean": -163.53025817871094, "KL/rejected_KL_mean": -231.70352172851562, "KL/std": 133.92214965820312, "epoch": 0.28046989720998533, "fcm_dpo/beta": 0.004521770402789116, "fcm_dpo/delta": -0.23212674260139465, "fcm_dpo/margin": 136.34649658203125, "fcm_dpo/q_t": 0.3637212812900543, "grad_norm": 20.231264114379883, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.3467414379119873, "logits/rejected": -0.3258952498435974, "logps/chosen": -164.015869140625, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -341.84320068359375, "loss": 0.9565, "margin_dpo/margin_mean": 136.34649658203125, "margin_dpo/margin_std": 147.54470825195312, "step": 191 }, { "KL/chosen_KL_mean": -124.43778991699219, "KL/mean": -170.04360961914062, "KL/rejected_KL_mean": -215.6494140625, "KL/std": 117.56196594238281, "epoch": 0.28193832599118945, "fcm_dpo/beta": 0.0044925631955266, "fcm_dpo/delta": -0.010320080444216728, "fcm_dpo/margin": 91.21162414550781, "fcm_dpo/q_t": 0.4096482992172241, "grad_norm": 25.52708625793457, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.32894307374954224, "logits/rejected": -0.328900545835495, "logps/chosen": -194.16470336914062, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -318.97076416015625, "loss": 1.1135, "margin_dpo/margin_mean": 91.21162414550781, "margin_dpo/margin_std": 144.74786376953125, "step": 192 }, { "KL/chosen_KL_mean": -124.83224487304688, "KL/mean": -151.44308471679688, "KL/rejected_KL_mean": -178.053955078125, "KL/std": 107.97267150878906, "epoch": 0.2834067547723935, "fcm_dpo/beta": 0.004495399538427591, "fcm_dpo/delta": 0.0392833836376667, "fcm_dpo/margin": 53.221702575683594, "fcm_dpo/q_t": 0.44323813915252686, "grad_norm": 26.372344970703125, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.3519429564476013, "logits/rejected": -0.34495627880096436, "logps/chosen": -185.02273559570312, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -254.4615020751953, "loss": 1.2529, "margin_dpo/margin_mean": 53.221702575683594, "margin_dpo/margin_std": 137.95343017578125, "step": 193 }, { "KL/chosen_KL_mean": -78.5848388671875, "KL/mean": -121.88168334960938, "KL/rejected_KL_mean": -165.17852783203125, "KL/std": 90.56858825683594, "epoch": 0.28487518355359764, "fcm_dpo/beta": 0.004506401717662811, "fcm_dpo/delta": 0.010020148009061813, "fcm_dpo/margin": 86.59368896484375, "fcm_dpo/q_t": 0.4085754156112671, "grad_norm": 18.025230407714844, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.2157665491104126, "logits/rejected": -0.1980063021183014, "logps/chosen": -116.42521667480469, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -225.86331176757812, "loss": 1.082, "margin_dpo/margin_mean": 86.59367370605469, "margin_dpo/margin_std": 99.3104019165039, "step": 194 }, { "KL/chosen_KL_mean": -124.42152404785156, "KL/mean": -171.287353515625, "KL/rejected_KL_mean": -218.1531982421875, "KL/std": 112.64602661132812, "epoch": 0.28634361233480177, "fcm_dpo/beta": 0.004506120923906565, "fcm_dpo/delta": -0.023354141041636467, "fcm_dpo/margin": 93.73165893554688, "fcm_dpo/q_t": 0.4031534194946289, "grad_norm": 22.234222412109375, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.24858853220939636, "logits/rejected": -0.2673921287059784, "logps/chosen": -179.31309509277344, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -314.9241638183594, "loss": 1.0714, "margin_dpo/margin_mean": 93.73165893554688, "margin_dpo/margin_std": 116.69031524658203, "step": 195 }, { "KL/chosen_KL_mean": -97.50921630859375, "KL/mean": -150.7244415283203, "KL/rejected_KL_mean": -203.93966674804688, "KL/std": 114.79684448242188, "epoch": 0.2878120411160059, "fcm_dpo/beta": 0.004426237195730209, "fcm_dpo/delta": -0.07563818991184235, "fcm_dpo/margin": 106.43045043945312, "fcm_dpo/q_t": 0.3946911692619324, "grad_norm": 18.23614501953125, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.30258023738861084, "logits/rejected": -0.288103848695755, "logps/chosen": -150.75445556640625, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -279.99261474609375, "loss": 1.0592, "margin_dpo/margin_mean": 106.43045043945312, "margin_dpo/margin_std": 137.8130645751953, "step": 196 }, { "KL/chosen_KL_mean": -101.94883728027344, "KL/mean": -143.18423461914062, "KL/rejected_KL_mean": -184.41961669921875, "KL/std": 101.37451171875, "epoch": 0.28928046989721, "fcm_dpo/beta": 0.004469497129321098, "fcm_dpo/delta": 0.03227302059531212, "fcm_dpo/margin": 82.47077941894531, "fcm_dpo/q_t": 0.41608455777168274, "grad_norm": 18.144241333007812, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.2958967983722687, "logits/rejected": -0.290219783782959, "logps/chosen": -162.36917114257812, "logps/ref_chosen": -60.42033386230469, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -261.6285400390625, "loss": 1.1138, "margin_dpo/margin_mean": 82.47077941894531, "margin_dpo/margin_std": 115.41438293457031, "step": 197 }, { "KL/chosen_KL_mean": -114.83041381835938, "KL/mean": -167.80075073242188, "KL/rejected_KL_mean": -220.7711181640625, "KL/std": 126.27465057373047, "epoch": 0.2907488986784141, "fcm_dpo/beta": 0.004416568670421839, "fcm_dpo/delta": -0.07120651751756668, "fcm_dpo/margin": 105.94068145751953, "fcm_dpo/q_t": 0.3967708349227905, "grad_norm": 22.24930191040039, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.3203980028629303, "logits/rejected": -0.3210110068321228, "logps/chosen": -169.86660766601562, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -318.0143737792969, "loss": 1.0591, "margin_dpo/margin_mean": 105.94068145751953, "margin_dpo/margin_std": 143.1464080810547, "step": 198 }, { "KL/chosen_KL_mean": -110.44184875488281, "KL/mean": -162.89865112304688, "KL/rejected_KL_mean": -215.35546875, "KL/std": 114.6693115234375, "epoch": 0.2922173274596182, "fcm_dpo/beta": 0.004335303790867329, "fcm_dpo/delta": -0.058502815663814545, "fcm_dpo/margin": 104.91361999511719, "fcm_dpo/q_t": 0.3972257673740387, "grad_norm": 23.503461837768555, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.27863985300064087, "logits/rejected": -0.2598820924758911, "logps/chosen": -167.2706756591797, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -300.003662109375, "loss": 1.0675, "margin_dpo/margin_mean": 104.91361999511719, "margin_dpo/margin_std": 138.49346923828125, "step": 199 }, { "KL/chosen_KL_mean": -108.3734130859375, "KL/mean": -160.2718963623047, "KL/rejected_KL_mean": -212.17037963867188, "KL/std": 123.57206726074219, "epoch": 0.2936857562408223, "fcm_dpo/beta": 0.004300840198993683, "fcm_dpo/delta": -0.04902205243706703, "fcm_dpo/margin": 103.79698181152344, "fcm_dpo/q_t": 0.40177974104881287, "grad_norm": 22.9044246673584, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.26905137300491333, "logits/rejected": -0.25207480788230896, "logps/chosen": -161.44046020507812, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -292.77880859375, "loss": 1.0929, "margin_dpo/margin_mean": 103.7969741821289, "margin_dpo/margin_std": 158.15789794921875, "step": 200 }, { "KL/chosen_KL_mean": -113.9289321899414, "KL/mean": -165.15493774414062, "KL/rejected_KL_mean": -216.38092041015625, "KL/std": 129.3989715576172, "epoch": 0.29515418502202645, "fcm_dpo/beta": 0.004290143959224224, "fcm_dpo/delta": -0.041380785405635834, "fcm_dpo/margin": 102.45198822021484, "fcm_dpo/q_t": 0.40112942457199097, "grad_norm": 20.798912048339844, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.3590313792228699, "logits/rejected": -0.3559607267379761, "logps/chosen": -189.33114624023438, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -331.18914794921875, "loss": 1.0829, "margin_dpo/margin_mean": 102.45198059082031, "margin_dpo/margin_std": 144.92611694335938, "step": 201 }, { "KL/chosen_KL_mean": -116.26347351074219, "KL/mean": -152.79412841796875, "KL/rejected_KL_mean": -189.32476806640625, "KL/std": 111.22699737548828, "epoch": 0.2966226138032305, "fcm_dpo/beta": 0.0043277074582874775, "fcm_dpo/delta": 0.08649900555610657, "fcm_dpo/margin": 73.06129455566406, "fcm_dpo/q_t": 0.42985087633132935, "grad_norm": 20.812585830688477, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.2794426679611206, "logits/rejected": -0.293861985206604, "logps/chosen": -166.3647918701172, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -276.309814453125, "loss": 1.1841, "margin_dpo/margin_mean": 73.0613021850586, "margin_dpo/margin_std": 143.23988342285156, "step": 202 }, { "KL/chosen_KL_mean": -114.866455078125, "KL/mean": -160.2359619140625, "KL/rejected_KL_mean": -205.60543823242188, "KL/std": 114.47230529785156, "epoch": 0.29809104258443464, "fcm_dpo/beta": 0.004343975335359573, "fcm_dpo/delta": 0.00605600792914629, "fcm_dpo/margin": 90.73900604248047, "fcm_dpo/q_t": 0.4100680649280548, "grad_norm": 21.901674270629883, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.3357563614845276, "logits/rejected": -0.32634925842285156, "logps/chosen": -175.4761505126953, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -291.50140380859375, "loss": 1.0993, "margin_dpo/margin_mean": 90.73899841308594, "margin_dpo/margin_std": 125.68807983398438, "step": 203 }, { "KL/chosen_KL_mean": -128.02561950683594, "KL/mean": -169.0902557373047, "KL/rejected_KL_mean": -210.15489196777344, "KL/std": 124.49624633789062, "epoch": 0.29955947136563876, "fcm_dpo/beta": 0.00437512993812561, "fcm_dpo/delta": 0.04220545291900635, "fcm_dpo/margin": 82.12925720214844, "fcm_dpo/q_t": 0.42096078395843506, "grad_norm": 22.241016387939453, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.4248543977737427, "logits/rejected": -0.38815587759017944, "logps/chosen": -208.19058227539062, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -297.8507995605469, "loss": 1.1522, "margin_dpo/margin_mean": 82.12925720214844, "margin_dpo/margin_std": 144.77645874023438, "step": 204 }, { "KL/chosen_KL_mean": -120.78949737548828, "KL/mean": -176.00003051757812, "KL/rejected_KL_mean": -231.2105712890625, "KL/std": 126.26949310302734, "epoch": 0.3010279001468429, "fcm_dpo/beta": 0.004297832027077675, "fcm_dpo/delta": -0.08044849336147308, "fcm_dpo/margin": 110.42106628417969, "fcm_dpo/q_t": 0.39113306999206543, "grad_norm": 21.1467342376709, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.32940664887428284, "logits/rejected": -0.29995858669281006, "logps/chosen": -180.17422485351562, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -316.33563232421875, "loss": 1.0534, "margin_dpo/margin_mean": 110.42106628417969, "margin_dpo/margin_std": 136.45323181152344, "step": 205 }, { "KL/chosen_KL_mean": -110.37324523925781, "KL/mean": -169.74008178710938, "KL/rejected_KL_mean": -229.10691833496094, "KL/std": 117.97074127197266, "epoch": 0.302496328928047, "fcm_dpo/beta": 0.004232403822243214, "fcm_dpo/delta": -0.10859975218772888, "fcm_dpo/margin": 118.73365783691406, "fcm_dpo/q_t": 0.38320374488830566, "grad_norm": 25.72849464416504, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.26661020517349243, "logits/rejected": -0.2699154019355774, "logps/chosen": -157.33773803710938, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -328.0604248046875, "loss": 1.0145, "margin_dpo/margin_mean": 118.73365783691406, "margin_dpo/margin_std": 128.0810089111328, "step": 206 }, { "KL/chosen_KL_mean": -100.90605163574219, "KL/mean": -168.05532836914062, "KL/rejected_KL_mean": -235.20462036132812, "KL/std": 134.08450317382812, "epoch": 0.3039647577092511, "fcm_dpo/beta": 0.004127143882215023, "fcm_dpo/delta": -0.1631755232810974, "fcm_dpo/margin": 134.29855346679688, "fcm_dpo/q_t": 0.3774099349975586, "grad_norm": 22.681591033935547, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.35407179594039917, "logits/rejected": -0.32842785120010376, "logps/chosen": -156.96231079101562, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -319.65240478515625, "loss": 0.9974, "margin_dpo/margin_mean": 134.298583984375, "margin_dpo/margin_std": 156.59857177734375, "step": 207 }, { "KL/chosen_KL_mean": -154.8147735595703, "KL/mean": -207.43914794921875, "KL/rejected_KL_mean": -260.06353759765625, "KL/std": 128.05979919433594, "epoch": 0.3054331864904552, "fcm_dpo/beta": 0.004062125459313393, "fcm_dpo/delta": -0.02891511656343937, "fcm_dpo/margin": 105.24872589111328, "fcm_dpo/q_t": 0.40312352776527405, "grad_norm": 23.56682014465332, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.38881534337997437, "logits/rejected": -0.37188804149627686, "logps/chosen": -221.88238525390625, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -354.35040283203125, "loss": 1.096, "margin_dpo/margin_mean": 105.24872589111328, "margin_dpo/margin_std": 155.59713745117188, "step": 208 }, { "KL/chosen_KL_mean": -129.39630126953125, "KL/mean": -176.29495239257812, "KL/rejected_KL_mean": -223.19363403320312, "KL/std": 115.95198059082031, "epoch": 0.3069016152716593, "fcm_dpo/beta": 0.004076983779668808, "fcm_dpo/delta": 0.018282007426023483, "fcm_dpo/margin": 93.79732513427734, "fcm_dpo/q_t": 0.41346555948257446, "grad_norm": 26.791549682617188, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.26864010095596313, "logits/rejected": -0.2571912109851837, "logps/chosen": -185.5780029296875, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -304.1351623535156, "loss": 1.1292, "margin_dpo/margin_mean": 93.79731750488281, "margin_dpo/margin_std": 149.582763671875, "step": 209 }, { "KL/chosen_KL_mean": -116.94548034667969, "KL/mean": -171.28390502929688, "KL/rejected_KL_mean": -225.622314453125, "KL/std": 119.73749542236328, "epoch": 0.30837004405286345, "fcm_dpo/beta": 0.004069700837135315, "fcm_dpo/delta": -0.04440900310873985, "fcm_dpo/margin": 108.67684936523438, "fcm_dpo/q_t": 0.40055060386657715, "grad_norm": 23.223583221435547, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.2933782935142517, "logits/rejected": -0.2853144705295563, "logps/chosen": -163.31729125976562, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -302.303955078125, "loss": 1.0747, "margin_dpo/margin_mean": 108.67683410644531, "margin_dpo/margin_std": 146.201904296875, "step": 210 }, { "KL/chosen_KL_mean": -168.60812377929688, "KL/mean": -214.71722412109375, "KL/rejected_KL_mean": -260.8263244628906, "KL/std": 136.29385375976562, "epoch": 0.30983847283406757, "fcm_dpo/beta": 0.004061352461576462, "fcm_dpo/delta": 0.02644379436969757, "fcm_dpo/margin": 92.21820831298828, "fcm_dpo/q_t": 0.41933655738830566, "grad_norm": 30.78042221069336, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.3214316964149475, "logits/rejected": -0.2823808193206787, "logps/chosen": -247.54046630859375, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -347.6473083496094, "loss": 1.1462, "margin_dpo/margin_mean": 92.21821594238281, "margin_dpo/margin_std": 160.97451782226562, "step": 211 }, { "KL/chosen_KL_mean": -139.83131408691406, "KL/mean": -206.15283203125, "KL/rejected_KL_mean": -272.474365234375, "KL/std": 148.2513427734375, "epoch": 0.31130690161527164, "fcm_dpo/beta": 0.003954698797315359, "fcm_dpo/delta": -0.13367314636707306, "fcm_dpo/margin": 132.64306640625, "fcm_dpo/q_t": 0.3826107978820801, "grad_norm": 24.799522399902344, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.3196195363998413, "logits/rejected": -0.31085437536239624, "logps/chosen": -198.0283203125, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05785369873047, "logps/rejected": -375.5322265625, "loss": 1.0319, "margin_dpo/margin_mean": 132.64306640625, "margin_dpo/margin_std": 163.62814331054688, "step": 212 }, { "KL/chosen_KL_mean": -132.14027404785156, "KL/mean": -195.88795471191406, "KL/rejected_KL_mean": -259.6356506347656, "KL/std": 129.16009521484375, "epoch": 0.31277533039647576, "fcm_dpo/beta": 0.0038848065305501223, "fcm_dpo/delta": -0.10088707506656647, "fcm_dpo/margin": 127.49536895751953, "fcm_dpo/q_t": 0.3872129023075104, "grad_norm": 29.145305633544922, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.3360249698162079, "logits/rejected": -0.31101077795028687, "logps/chosen": -199.65298461914062, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -353.55035400390625, "loss": 1.0333, "margin_dpo/margin_mean": 127.49537658691406, "margin_dpo/margin_std": 153.2450408935547, "step": 213 }, { "KL/chosen_KL_mean": -112.85121154785156, "KL/mean": -170.10894775390625, "KL/rejected_KL_mean": -227.36666870117188, "KL/std": 127.09822082519531, "epoch": 0.3142437591776799, "fcm_dpo/beta": 0.003853208851069212, "fcm_dpo/delta": -0.043163709342479706, "fcm_dpo/margin": 114.5154800415039, "fcm_dpo/q_t": 0.4001784920692444, "grad_norm": 23.445825576782227, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.25710099935531616, "logits/rejected": -0.26210659742355347, "logps/chosen": -154.4561004638672, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -304.88409423828125, "loss": 1.0671, "margin_dpo/margin_mean": 114.51548767089844, "margin_dpo/margin_std": 147.68756103515625, "step": 214 }, { "KL/chosen_KL_mean": -134.02593994140625, "KL/mean": -194.3079833984375, "KL/rejected_KL_mean": -254.5900421142578, "KL/std": 132.08059692382812, "epoch": 0.315712187958884, "fcm_dpo/beta": 0.0038004510570317507, "fcm_dpo/delta": -0.06129283457994461, "fcm_dpo/margin": 120.56410217285156, "fcm_dpo/q_t": 0.39448457956314087, "grad_norm": 26.497583389282227, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.2794630229473114, "logits/rejected": -0.27032387256622314, "logps/chosen": -187.30520629882812, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -344.5546875, "loss": 1.0433, "margin_dpo/margin_mean": 120.56410217285156, "margin_dpo/margin_std": 137.23513793945312, "step": 215 }, { "KL/chosen_KL_mean": -138.703369140625, "KL/mean": -198.391845703125, "KL/rejected_KL_mean": -258.0802917480469, "KL/std": 138.0404510498047, "epoch": 0.31718061674008813, "fcm_dpo/beta": 0.003782880725339055, "fcm_dpo/delta": -0.05413120239973068, "fcm_dpo/margin": 119.37692260742188, "fcm_dpo/q_t": 0.39900004863739014, "grad_norm": 24.74566078186035, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.2884059250354767, "logits/rejected": -0.27266985177993774, "logps/chosen": -187.59117126464844, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -335.27923583984375, "loss": 1.0807, "margin_dpo/margin_mean": 119.37692260742188, "margin_dpo/margin_std": 169.870849609375, "step": 216 }, { "KL/chosen_KL_mean": -141.81265258789062, "KL/mean": -211.98358154296875, "KL/rejected_KL_mean": -282.154541015625, "KL/std": 141.36019897460938, "epoch": 0.3186490455212922, "fcm_dpo/beta": 0.003682144917547703, "fcm_dpo/delta": -0.12343692779541016, "fcm_dpo/margin": 140.3418731689453, "fcm_dpo/q_t": 0.3833308517932892, "grad_norm": 20.693517684936523, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.26268115639686584, "logits/rejected": -0.2574685513973236, "logps/chosen": -191.657958984375, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -382.23284912109375, "loss": 1.0116, "margin_dpo/margin_mean": 140.34185791015625, "margin_dpo/margin_std": 159.26388549804688, "step": 217 }, { "KL/chosen_KL_mean": -148.78378295898438, "KL/mean": -203.23626708984375, "KL/rejected_KL_mean": -257.688720703125, "KL/std": 139.531494140625, "epoch": 0.3201174743024963, "fcm_dpo/beta": 0.003660230664536357, "fcm_dpo/delta": 0.0013791173696517944, "fcm_dpo/margin": 108.90492248535156, "fcm_dpo/q_t": 0.410659521818161, "grad_norm": 21.03861427307129, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.28771138191223145, "logits/rejected": -0.28279104828834534, "logps/chosen": -207.3604736328125, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -345.5351257324219, "loss": 1.1106, "margin_dpo/margin_mean": 108.9049301147461, "margin_dpo/margin_std": 163.36837768554688, "step": 218 }, { "KL/chosen_KL_mean": -159.7332763671875, "KL/mean": -205.7009735107422, "KL/rejected_KL_mean": -251.6686553955078, "KL/std": 144.26089477539062, "epoch": 0.32158590308370044, "fcm_dpo/beta": 0.0037173782475292683, "fcm_dpo/delta": 0.05955355241894722, "fcm_dpo/margin": 91.93537139892578, "fcm_dpo/q_t": 0.4234854578971863, "grad_norm": 28.875822067260742, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.3138810992240906, "logits/rejected": -0.30747318267822266, "logps/chosen": -220.817138671875, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -337.49908447265625, "loss": 1.1738, "margin_dpo/margin_mean": 91.93536376953125, "margin_dpo/margin_std": 175.91717529296875, "step": 219 }, { "KL/chosen_KL_mean": -179.41836547851562, "KL/mean": -218.91024780273438, "KL/rejected_KL_mean": -258.40216064453125, "KL/std": 125.53445434570312, "epoch": 0.32305433186490456, "fcm_dpo/beta": 0.0037627811543643475, "fcm_dpo/delta": 0.1060803085565567, "fcm_dpo/margin": 78.98377990722656, "fcm_dpo/q_t": 0.43049296736717224, "grad_norm": 28.49346160888672, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.2874869704246521, "logits/rejected": -0.2643676996231079, "logps/chosen": -249.44964599609375, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -346.087646484375, "loss": 1.1674, "margin_dpo/margin_mean": 78.98377990722656, "margin_dpo/margin_std": 128.49771118164062, "step": 220 }, { "KL/chosen_KL_mean": -153.0963134765625, "KL/mean": -238.85574340820312, "KL/rejected_KL_mean": -324.6151428222656, "KL/std": 157.85043334960938, "epoch": 0.3245227606461087, "fcm_dpo/beta": 0.003666388336569071, "fcm_dpo/delta": -0.24369555711746216, "fcm_dpo/margin": 171.518798828125, "fcm_dpo/q_t": 0.35655221343040466, "grad_norm": 25.883392333984375, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.28886061906814575, "logits/rejected": -0.2952112555503845, "logps/chosen": -205.25103759765625, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -431.08282470703125, "loss": 0.9343, "margin_dpo/margin_mean": 171.518798828125, "margin_dpo/margin_std": 161.14630126953125, "step": 221 }, { "KL/chosen_KL_mean": -157.40545654296875, "KL/mean": -221.60968017578125, "KL/rejected_KL_mean": -285.81390380859375, "KL/std": 144.6243133544922, "epoch": 0.32599118942731276, "fcm_dpo/beta": 0.003590481821447611, "fcm_dpo/delta": -0.06407497823238373, "fcm_dpo/margin": 128.40843200683594, "fcm_dpo/q_t": 0.39516395330429077, "grad_norm": 20.063804626464844, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.32411831617355347, "logits/rejected": -0.32718104124069214, "logps/chosen": -218.37655639648438, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -385.8150634765625, "loss": 1.0554, "margin_dpo/margin_mean": 128.40843200683594, "margin_dpo/margin_std": 161.19532775878906, "step": 222 }, { "KL/chosen_KL_mean": -166.05140686035156, "KL/mean": -220.04031372070312, "KL/rejected_KL_mean": -274.02923583984375, "KL/std": 142.65538024902344, "epoch": 0.3274596182085169, "fcm_dpo/beta": 0.0035675265826284885, "fcm_dpo/delta": 0.015362029895186424, "fcm_dpo/margin": 107.97785186767578, "fcm_dpo/q_t": 0.41345182061195374, "grad_norm": 23.098182678222656, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.2400050163269043, "logits/rejected": -0.2347499132156372, "logps/chosen": -218.69198608398438, "logps/ref_chosen": -52.64057540893555, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -356.854248046875, "loss": 1.1337, "margin_dpo/margin_mean": 107.97784423828125, "margin_dpo/margin_std": 181.26332092285156, "step": 223 }, { "KL/chosen_KL_mean": -140.97390747070312, "KL/mean": -202.82191467285156, "KL/rejected_KL_mean": -264.669921875, "KL/std": 155.19711303710938, "epoch": 0.328928046989721, "fcm_dpo/beta": 0.0035286881029605865, "fcm_dpo/delta": -0.03949831798672676, "fcm_dpo/margin": 123.69601440429688, "fcm_dpo/q_t": 0.40351927280426025, "grad_norm": 24.251049041748047, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.3118600845336914, "logits/rejected": -0.2952437102794647, "logps/chosen": -189.56932067871094, "logps/ref_chosen": -48.59541320800781, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -341.7864074707031, "loss": 1.088, "margin_dpo/margin_mean": 123.69600677490234, "margin_dpo/margin_std": 177.5772247314453, "step": 224 }, { "KL/chosen_KL_mean": -159.09860229492188, "KL/mean": -231.4862060546875, "KL/rejected_KL_mean": -303.8738098144531, "KL/std": 148.37298583984375, "epoch": 0.3303964757709251, "fcm_dpo/beta": 0.0034855613484978676, "fcm_dpo/delta": -0.11032609641551971, "fcm_dpo/margin": 144.7752227783203, "fcm_dpo/q_t": 0.38641393184661865, "grad_norm": 21.628904342651367, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.3304445743560791, "logits/rejected": -0.32175442576408386, "logps/chosen": -217.09906005859375, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90291595458984, "logps/rejected": -403.7767333984375, "loss": 1.0326, "margin_dpo/margin_mean": 144.7752227783203, "margin_dpo/margin_std": 180.18701171875, "step": 225 }, { "KL/chosen_KL_mean": -133.54840087890625, "KL/mean": -190.18936157226562, "KL/rejected_KL_mean": -246.830322265625, "KL/std": 143.92837524414062, "epoch": 0.33186490455212925, "fcm_dpo/beta": 0.003453510347753763, "fcm_dpo/delta": 0.008469540625810623, "fcm_dpo/margin": 113.28192901611328, "fcm_dpo/q_t": 0.4127262234687805, "grad_norm": 28.347190856933594, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.396121621131897, "logits/rejected": -0.3598354160785675, "logps/chosen": -192.44720458984375, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -325.51806640625, "loss": 1.1124, "margin_dpo/margin_mean": 113.28193664550781, "margin_dpo/margin_std": 167.6037139892578, "step": 226 }, { "KL/chosen_KL_mean": -153.62620544433594, "KL/mean": -225.86712646484375, "KL/rejected_KL_mean": -298.1080627441406, "KL/std": 163.63131713867188, "epoch": 0.3333333333333333, "fcm_dpo/beta": 0.003411718178540468, "fcm_dpo/delta": -0.09843979775905609, "fcm_dpo/margin": 144.4818878173828, "fcm_dpo/q_t": 0.38779282569885254, "grad_norm": 21.415340423583984, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.3376998007297516, "logits/rejected": -0.3247716724872589, "logps/chosen": -212.69837951660156, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -397.52044677734375, "loss": 1.0302, "margin_dpo/margin_mean": 144.4818878173828, "margin_dpo/margin_std": 171.448486328125, "step": 227 }, { "KL/chosen_KL_mean": -157.6712646484375, "KL/mean": -208.39749145507812, "KL/rejected_KL_mean": -259.12371826171875, "KL/std": 133.16519165039062, "epoch": 0.33480176211453744, "fcm_dpo/beta": 0.00343983992934227, "fcm_dpo/delta": 0.05273807793855667, "fcm_dpo/margin": 101.45245361328125, "fcm_dpo/q_t": 0.4198562502861023, "grad_norm": 22.723651885986328, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.34845787286758423, "logits/rejected": -0.3311355710029602, "logps/chosen": -223.56256103515625, "logps/ref_chosen": -65.89128875732422, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -350.1724853515625, "loss": 1.1416, "margin_dpo/margin_mean": 101.45246887207031, "margin_dpo/margin_std": 162.93731689453125, "step": 228 }, { "KL/chosen_KL_mean": -161.77908325195312, "KL/mean": -216.16851806640625, "KL/rejected_KL_mean": -270.5579528808594, "KL/std": 153.45242309570312, "epoch": 0.33627019089574156, "fcm_dpo/beta": 0.0034589767456054688, "fcm_dpo/delta": 0.024636760354042053, "fcm_dpo/margin": 108.77888488769531, "fcm_dpo/q_t": 0.41331931948661804, "grad_norm": 30.321849822998047, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.4106701612472534, "logits/rejected": -0.3795148730278015, "logps/chosen": -232.48545837402344, "logps/ref_chosen": -70.70637512207031, "logps/ref_rejected": -84.52741241455078, "logps/rejected": -355.08538818359375, "loss": 1.1173, "margin_dpo/margin_mean": 108.77888488769531, "margin_dpo/margin_std": 161.33079528808594, "step": 229 }, { "KL/chosen_KL_mean": -125.57262420654297, "KL/mean": -208.11363220214844, "KL/rejected_KL_mean": -290.6546630859375, "KL/std": 146.30148315429688, "epoch": 0.3377386196769457, "fcm_dpo/beta": 0.003373272018507123, "fcm_dpo/delta": -0.16629549860954285, "fcm_dpo/margin": 165.08203125, "fcm_dpo/q_t": 0.3711177110671997, "grad_norm": 28.437881469726562, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.31211984157562256, "logits/rejected": -0.3166271448135376, "logps/chosen": -164.85462951660156, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -376.27655029296875, "loss": 0.9724, "margin_dpo/margin_mean": 165.08203125, "margin_dpo/margin_std": 161.2001495361328, "step": 230 }, { "KL/chosen_KL_mean": -148.6149444580078, "KL/mean": -206.58450317382812, "KL/rejected_KL_mean": -264.5540771484375, "KL/std": 131.30169677734375, "epoch": 0.3392070484581498, "fcm_dpo/beta": 0.0033540253061801195, "fcm_dpo/delta": 0.011585213243961334, "fcm_dpo/margin": 115.93913269042969, "fcm_dpo/q_t": 0.4116860628128052, "grad_norm": 23.887901306152344, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.3661789894104004, "logits/rejected": -0.33695119619369507, "logps/chosen": -211.89138793945312, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -338.677978515625, "loss": 1.0947, "margin_dpo/margin_mean": 115.93913269042969, "margin_dpo/margin_std": 152.4516143798828, "step": 231 }, { "KL/chosen_KL_mean": -187.2882843017578, "KL/mean": -234.39813232421875, "KL/rejected_KL_mean": -281.5079650878906, "KL/std": 158.7782745361328, "epoch": 0.3406754772393539, "fcm_dpo/beta": 0.0033917182590812445, "fcm_dpo/delta": 0.08317073434591293, "fcm_dpo/margin": 94.21968841552734, "fcm_dpo/q_t": 0.4292943477630615, "grad_norm": 25.093761444091797, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.35640761256217957, "logits/rejected": -0.33361750841140747, "logps/chosen": -258.03704833984375, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -365.48504638671875, "loss": 1.1629, "margin_dpo/margin_mean": 94.21968078613281, "margin_dpo/margin_std": 164.27349853515625, "step": 232 }, { "KL/chosen_KL_mean": -170.53353881835938, "KL/mean": -244.69998168945312, "KL/rejected_KL_mean": -318.866455078125, "KL/std": 167.12754821777344, "epoch": 0.342143906020558, "fcm_dpo/beta": 0.003362037241458893, "fcm_dpo/delta": -0.10381458699703217, "fcm_dpo/margin": 148.33291625976562, "fcm_dpo/q_t": 0.3922462463378906, "grad_norm": 27.809114456176758, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.3662954270839691, "logits/rejected": -0.37061402201652527, "logps/chosen": -225.41647338867188, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.4800796508789, "logps/rejected": -426.3465270996094, "loss": 1.0649, "margin_dpo/margin_mean": 148.33291625976562, "margin_dpo/margin_std": 215.14837646484375, "step": 233 }, { "KL/chosen_KL_mean": -167.3464813232422, "KL/mean": -245.2230987548828, "KL/rejected_KL_mean": -323.0997009277344, "KL/std": 150.866455078125, "epoch": 0.3436123348017621, "fcm_dpo/beta": 0.003259950317442417, "fcm_dpo/delta": -0.11555645614862442, "fcm_dpo/margin": 155.75323486328125, "fcm_dpo/q_t": 0.3844042122364044, "grad_norm": 36.372398376464844, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.3135479688644409, "logits/rejected": -0.34066638350486755, "logps/chosen": -211.44100952148438, "logps/ref_chosen": -44.094520568847656, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -423.1063232421875, "loss": 1.0153, "margin_dpo/margin_mean": 155.75323486328125, "margin_dpo/margin_std": 169.97021484375, "step": 234 }, { "KL/chosen_KL_mean": -198.0821990966797, "KL/mean": -247.19805908203125, "KL/rejected_KL_mean": -296.31390380859375, "KL/std": 143.0350341796875, "epoch": 0.34508076358296624, "fcm_dpo/beta": 0.0033134431578218937, "fcm_dpo/delta": 0.07656269520521164, "fcm_dpo/margin": 98.23170471191406, "fcm_dpo/q_t": 0.42514321208000183, "grad_norm": 27.246450424194336, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.38035351037979126, "logits/rejected": -0.3624608516693115, "logps/chosen": -260.32012939453125, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39506530761719, "logps/rejected": -386.708984375, "loss": 1.1651, "margin_dpo/margin_mean": 98.2317123413086, "margin_dpo/margin_std": 172.09481811523438, "step": 235 }, { "KL/chosen_KL_mean": -141.14572143554688, "KL/mean": -220.7097930908203, "KL/rejected_KL_mean": -300.2738952636719, "KL/std": 150.90029907226562, "epoch": 0.3465491923641703, "fcm_dpo/beta": 0.003255967516452074, "fcm_dpo/delta": -0.12453138083219528, "fcm_dpo/margin": 159.128173828125, "fcm_dpo/q_t": 0.3787830173969269, "grad_norm": 40.99539566040039, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.3324674963951111, "logits/rejected": -0.33202531933784485, "logps/chosen": -190.48707580566406, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -403.7855224609375, "loss": 0.9848, "margin_dpo/margin_mean": 159.128173828125, "margin_dpo/margin_std": 143.73922729492188, "step": 236 }, { "KL/chosen_KL_mean": -186.3697509765625, "KL/mean": -246.17672729492188, "KL/rejected_KL_mean": -305.98370361328125, "KL/std": 150.6981964111328, "epoch": 0.34801762114537443, "fcm_dpo/beta": 0.0032444519456475973, "fcm_dpo/delta": 0.012344859540462494, "fcm_dpo/margin": 119.61395263671875, "fcm_dpo/q_t": 0.411772221326828, "grad_norm": 26.916481018066406, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.39190009236335754, "logits/rejected": -0.38742589950561523, "logps/chosen": -240.53787231445312, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -400.7640686035156, "loss": 1.1085, "margin_dpo/margin_mean": 119.61394500732422, "margin_dpo/margin_std": 173.78614807128906, "step": 237 }, { "KL/chosen_KL_mean": -164.6402587890625, "KL/mean": -224.44186401367188, "KL/rejected_KL_mean": -284.2435302734375, "KL/std": 150.41282653808594, "epoch": 0.34948604992657856, "fcm_dpo/beta": 0.003233974566683173, "fcm_dpo/delta": 0.013382863253355026, "fcm_dpo/margin": 119.60325622558594, "fcm_dpo/q_t": 0.41353076696395874, "grad_norm": 22.910091400146484, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.3664902448654175, "logits/rejected": -0.38237977027893066, "logps/chosen": -218.61337280273438, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -373.66143798828125, "loss": 1.1035, "margin_dpo/margin_mean": 119.60325622558594, "margin_dpo/margin_std": 166.632080078125, "step": 238 }, { "KL/chosen_KL_mean": -173.09349060058594, "KL/mean": -240.07090759277344, "KL/rejected_KL_mean": -307.04833984375, "KL/std": 139.98458862304688, "epoch": 0.3509544787077827, "fcm_dpo/beta": 0.003213751595467329, "fcm_dpo/delta": -0.03296435624361038, "fcm_dpo/margin": 133.9548797607422, "fcm_dpo/q_t": 0.40008848905563354, "grad_norm": 43.435367584228516, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.357890248298645, "logits/rejected": -0.34496229887008667, "logps/chosen": -231.19131469726562, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -400.64129638671875, "loss": 1.0658, "margin_dpo/margin_mean": 133.9548797607422, "margin_dpo/margin_std": 157.7518310546875, "step": 239 }, { "KL/chosen_KL_mean": -189.6949462890625, "KL/mean": -246.00955200195312, "KL/rejected_KL_mean": -302.32415771484375, "KL/std": 153.5874786376953, "epoch": 0.3524229074889868, "fcm_dpo/beta": 0.0032359175384044647, "fcm_dpo/delta": 0.03676654398441315, "fcm_dpo/margin": 112.62922668457031, "fcm_dpo/q_t": 0.41720300912857056, "grad_norm": 38.87122344970703, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.3707585334777832, "logits/rejected": -0.3482241630554199, "logps/chosen": -250.30943298339844, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -376.44268798828125, "loss": 1.14, "margin_dpo/margin_mean": 112.62922668457031, "margin_dpo/margin_std": 183.985595703125, "step": 240 }, { "KL/chosen_KL_mean": -162.46920776367188, "KL/mean": -250.920166015625, "KL/rejected_KL_mean": -339.3711242675781, "KL/std": 172.28445434570312, "epoch": 0.35389133627019087, "fcm_dpo/beta": 0.0031772879883646965, "fcm_dpo/delta": -0.1714785099029541, "fcm_dpo/margin": 176.90191650390625, "fcm_dpo/q_t": 0.3749390244483948, "grad_norm": 22.715147018432617, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.41761964559555054, "logits/rejected": -0.39237093925476074, "logps/chosen": -228.56027221679688, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -427.4320068359375, "loss": 0.992, "margin_dpo/margin_mean": 176.90191650390625, "margin_dpo/margin_std": 199.16400146484375, "step": 241 }, { "KL/chosen_KL_mean": -190.58236694335938, "KL/mean": -253.82553100585938, "KL/rejected_KL_mean": -317.0687255859375, "KL/std": 150.39723205566406, "epoch": 0.355359765051395, "fcm_dpo/beta": 0.003157552797347307, "fcm_dpo/delta": 0.00024249032139778137, "fcm_dpo/margin": 126.4863510131836, "fcm_dpo/q_t": 0.4103991985321045, "grad_norm": 31.792686462402344, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.35047098994255066, "logits/rejected": -0.32680755853652954, "logps/chosen": -258.4462890625, "logps/ref_chosen": -67.86392974853516, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -400.4290771484375, "loss": 1.1001, "margin_dpo/margin_mean": 126.48635864257812, "margin_dpo/margin_std": 179.0635986328125, "step": 242 }, { "KL/chosen_KL_mean": -187.70462036132812, "KL/mean": -265.26544189453125, "KL/rejected_KL_mean": -342.8262634277344, "KL/std": 165.91262817382812, "epoch": 0.3568281938325991, "fcm_dpo/beta": 0.0030988508369773626, "fcm_dpo/delta": -0.08498271554708481, "fcm_dpo/margin": 155.12164306640625, "fcm_dpo/q_t": 0.3906670808792114, "grad_norm": 23.673492431640625, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.37325674295425415, "logits/rejected": -0.3453625440597534, "logps/chosen": -250.7888641357422, "logps/ref_chosen": -63.0842399597168, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -419.1618957519531, "loss": 1.0381, "margin_dpo/margin_mean": 155.12164306640625, "margin_dpo/margin_std": 187.38595581054688, "step": 243 }, { "KL/chosen_KL_mean": -165.96322631835938, "KL/mean": -245.95059204101562, "KL/rejected_KL_mean": -325.93792724609375, "KL/std": 159.3692626953125, "epoch": 0.35829662261380324, "fcm_dpo/beta": 0.0030439933761954308, "fcm_dpo/delta": -0.09161465615034103, "fcm_dpo/margin": 159.97470092773438, "fcm_dpo/q_t": 0.38761717081069946, "grad_norm": 31.374834060668945, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.42891860008239746, "logits/rejected": -0.42122605443000793, "logps/chosen": -227.1039276123047, "logps/ref_chosen": -61.140689849853516, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -420.82989501953125, "loss": 1.0162, "margin_dpo/margin_mean": 159.97470092773438, "margin_dpo/margin_std": 164.85641479492188, "step": 244 }, { "KL/chosen_KL_mean": -186.3873291015625, "KL/mean": -248.48068237304688, "KL/rejected_KL_mean": -310.57403564453125, "KL/std": 153.88400268554688, "epoch": 0.35976505139500736, "fcm_dpo/beta": 0.0030218339525163174, "fcm_dpo/delta": 0.02460547536611557, "fcm_dpo/margin": 124.18669128417969, "fcm_dpo/q_t": 0.4137793779373169, "grad_norm": 26.153411865234375, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.4331769049167633, "logits/rejected": -0.40836483240127563, "logps/chosen": -253.64962768554688, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -398.214111328125, "loss": 1.1139, "margin_dpo/margin_mean": 124.18669128417969, "margin_dpo/margin_std": 170.16293334960938, "step": 245 }, { "KL/chosen_KL_mean": -182.14895629882812, "KL/mean": -245.166015625, "KL/rejected_KL_mean": -308.18310546875, "KL/std": 170.09425354003906, "epoch": 0.36123348017621143, "fcm_dpo/beta": 0.003048623912036419, "fcm_dpo/delta": 0.016290059313178062, "fcm_dpo/margin": 126.03412628173828, "fcm_dpo/q_t": 0.4134928584098816, "grad_norm": 24.42864418029785, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.440762996673584, "logits/rejected": -0.40071290731430054, "logps/chosen": -248.84593200683594, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -392.5294189453125, "loss": 1.109, "margin_dpo/margin_mean": 126.03411865234375, "margin_dpo/margin_std": 182.46896362304688, "step": 246 }, { "KL/chosen_KL_mean": -209.86624145507812, "KL/mean": -298.015625, "KL/rejected_KL_mean": -386.1650390625, "KL/std": 176.03521728515625, "epoch": 0.36270190895741555, "fcm_dpo/beta": 0.002994304057210684, "fcm_dpo/delta": -0.13526105880737305, "fcm_dpo/margin": 176.29876708984375, "fcm_dpo/q_t": 0.3789059519767761, "grad_norm": 33.18147277832031, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.3571593761444092, "logits/rejected": -0.355099618434906, "logps/chosen": -266.4715881347656, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29326629638672, "logps/rejected": -492.45831298828125, "loss": 1.0026, "margin_dpo/margin_mean": 176.29876708984375, "margin_dpo/margin_std": 190.67379760742188, "step": 247 }, { "KL/chosen_KL_mean": -180.16818237304688, "KL/mean": -257.116455078125, "KL/rejected_KL_mean": -334.06475830078125, "KL/std": 142.95608520507812, "epoch": 0.3641703377386197, "fcm_dpo/beta": 0.0029416182078421116, "fcm_dpo/delta": -0.055744655430316925, "fcm_dpo/margin": 153.89654541015625, "fcm_dpo/q_t": 0.3923521637916565, "grad_norm": 21.914152145385742, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.3768647313117981, "logits/rejected": -0.3953893482685089, "logps/chosen": -224.21139526367188, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -425.921630859375, "loss": 1.0231, "margin_dpo/margin_mean": 153.89654541015625, "margin_dpo/margin_std": 135.817138671875, "step": 248 }, { "KL/chosen_KL_mean": -235.70599365234375, "KL/mean": -278.174560546875, "KL/rejected_KL_mean": -320.6430969238281, "KL/std": 158.992431640625, "epoch": 0.3656387665198238, "fcm_dpo/beta": 0.003009880194440484, "fcm_dpo/delta": 0.1481824517250061, "fcm_dpo/margin": 84.93710327148438, "fcm_dpo/q_t": 0.4420696496963501, "grad_norm": 31.89614486694336, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.4115716814994812, "logits/rejected": -0.38695603609085083, "logps/chosen": -298.1483459472656, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -401.11114501953125, "loss": 1.2227, "margin_dpo/margin_mean": 84.93710327148438, "margin_dpo/margin_std": 187.25634765625, "step": 249 }, { "KL/chosen_KL_mean": -207.01589965820312, "KL/mean": -286.22161865234375, "KL/rejected_KL_mean": -365.4273681640625, "KL/std": 158.22332763671875, "epoch": 0.3671071953010279, "fcm_dpo/beta": 0.0030103102326393127, "fcm_dpo/delta": -0.08075070381164551, "fcm_dpo/margin": 158.41148376464844, "fcm_dpo/q_t": 0.3886602520942688, "grad_norm": 30.4766845703125, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.39069664478302, "logits/rejected": -0.3340034484863281, "logps/chosen": -272.652587890625, "logps/ref_chosen": -65.63668823242188, "logps/ref_rejected": -73.87184143066406, "logps/rejected": -439.2991943359375, "loss": 1.0195, "margin_dpo/margin_mean": 158.41148376464844, "margin_dpo/margin_std": 157.6683349609375, "step": 250 }, { "KL/chosen_KL_mean": -221.11842346191406, "KL/mean": -274.54730224609375, "KL/rejected_KL_mean": -327.97613525390625, "KL/std": 165.54885864257812, "epoch": 0.368575624082232, "fcm_dpo/beta": 0.0030130401719361544, "fcm_dpo/delta": 0.08070008456707001, "fcm_dpo/margin": 106.85773468017578, "fcm_dpo/q_t": 0.42667731642723083, "grad_norm": 35.921974182128906, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.3663170635700226, "logits/rejected": -0.33492955565452576, "logps/chosen": -278.3011474609375, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -405.63958740234375, "loss": 1.1604, "margin_dpo/margin_mean": 106.85773468017578, "margin_dpo/margin_std": 176.058837890625, "step": 251 }, { "KL/chosen_KL_mean": -219.7275390625, "KL/mean": -291.9000549316406, "KL/rejected_KL_mean": -364.07257080078125, "KL/std": 145.91665649414062, "epoch": 0.3700440528634361, "fcm_dpo/beta": 0.0030095637775957584, "fcm_dpo/delta": -0.036197420209646225, "fcm_dpo/margin": 144.3450164794922, "fcm_dpo/q_t": 0.39602023363113403, "grad_norm": 25.36005210876465, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.4368041753768921, "logits/rejected": -0.3982187509536743, "logps/chosen": -291.4132080078125, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75799560546875, "logps/rejected": -448.83056640625, "loss": 1.046, "margin_dpo/margin_mean": 144.34500122070312, "margin_dpo/margin_std": 146.50564575195312, "step": 252 }, { "KL/chosen_KL_mean": -187.4129180908203, "KL/mean": -257.6170349121094, "KL/rejected_KL_mean": -327.8211364746094, "KL/std": 155.3035125732422, "epoch": 0.37151248164464024, "fcm_dpo/beta": 0.0030172369442880154, "fcm_dpo/delta": -0.025195002555847168, "fcm_dpo/margin": 140.40821838378906, "fcm_dpo/q_t": 0.40122461318969727, "grad_norm": 23.17310333251953, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.42870020866394043, "logits/rejected": -0.42114484310150146, "logps/chosen": -256.5468444824219, "logps/ref_chosen": -69.1339340209961, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -426.523681640625, "loss": 1.0717, "margin_dpo/margin_mean": 140.40821838378906, "margin_dpo/margin_std": 169.84083557128906, "step": 253 }, { "KL/chosen_KL_mean": -176.9703369140625, "KL/mean": -236.77066040039062, "KL/rejected_KL_mean": -296.5709533691406, "KL/std": 159.91812133789062, "epoch": 0.37298091042584436, "fcm_dpo/beta": 0.003011333290487528, "fcm_dpo/delta": 0.041351526975631714, "fcm_dpo/margin": 119.60062408447266, "fcm_dpo/q_t": 0.42020976543426514, "grad_norm": 20.98316192626953, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.4022292196750641, "logits/rejected": -0.3870220184326172, "logps/chosen": -231.12533569335938, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -386.87860107421875, "loss": 1.1406, "margin_dpo/margin_mean": 119.60063171386719, "margin_dpo/margin_std": 200.53207397460938, "step": 254 }, { "KL/chosen_KL_mean": -176.31068420410156, "KL/mean": -240.4302215576172, "KL/rejected_KL_mean": -304.54974365234375, "KL/std": 140.88638305664062, "epoch": 0.3744493392070485, "fcm_dpo/beta": 0.003016393631696701, "fcm_dpo/delta": 0.013489855453372002, "fcm_dpo/margin": 128.23907470703125, "fcm_dpo/q_t": 0.4103944003582001, "grad_norm": 21.49859046936035, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.3628276288509369, "logits/rejected": -0.3527315855026245, "logps/chosen": -233.45236206054688, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -394.7583312988281, "loss": 1.1055, "margin_dpo/margin_mean": 128.23907470703125, "margin_dpo/margin_std": 176.99844360351562, "step": 255 }, { "KL/chosen_KL_mean": -149.53353881835938, "KL/mean": -217.53964233398438, "KL/rejected_KL_mean": -285.5457458496094, "KL/std": 150.42579650878906, "epoch": 0.37591776798825255, "fcm_dpo/beta": 0.0030256398022174835, "fcm_dpo/delta": -0.012017881497740746, "fcm_dpo/margin": 136.01217651367188, "fcm_dpo/q_t": 0.4049100875854492, "grad_norm": 39.732818603515625, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.3158034384250641, "logits/rejected": -0.3230699896812439, "logps/chosen": -204.697021484375, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -378.108642578125, "loss": 1.0802, "margin_dpo/margin_mean": 136.01217651367188, "margin_dpo/margin_std": 172.76388549804688, "step": 256 }, { "KL/chosen_KL_mean": -146.291259765625, "KL/mean": -217.93704223632812, "KL/rejected_KL_mean": -289.58282470703125, "KL/std": 154.93496704101562, "epoch": 0.37738619676945667, "fcm_dpo/beta": 0.0029899184592068195, "fcm_dpo/delta": -0.030670955777168274, "fcm_dpo/margin": 143.2915802001953, "fcm_dpo/q_t": 0.4019385874271393, "grad_norm": 23.517911911010742, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.36811453104019165, "logits/rejected": -0.354714035987854, "logps/chosen": -195.7149658203125, "logps/ref_chosen": -49.42369842529297, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -369.1207580566406, "loss": 1.0716, "margin_dpo/margin_mean": 143.2915802001953, "margin_dpo/margin_std": 175.0234375, "step": 257 }, { "KL/chosen_KL_mean": -193.3330841064453, "KL/mean": -262.8382568359375, "KL/rejected_KL_mean": -332.3433837890625, "KL/std": 162.82940673828125, "epoch": 0.3788546255506608, "fcm_dpo/beta": 0.0030003516003489494, "fcm_dpo/delta": -0.01784433051943779, "fcm_dpo/margin": 139.01031494140625, "fcm_dpo/q_t": 0.4030870795249939, "grad_norm": 28.047407150268555, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.3141087591648102, "logits/rejected": -0.3110647201538086, "logps/chosen": -252.7172088623047, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.99010467529297, "logps/rejected": -428.33349609375, "loss": 1.0907, "margin_dpo/margin_mean": 139.01031494140625, "margin_dpo/margin_std": 193.4696502685547, "step": 258 }, { "KL/chosen_KL_mean": -188.4984130859375, "KL/mean": -248.42568969726562, "KL/rejected_KL_mean": -308.3529968261719, "KL/std": 159.6024169921875, "epoch": 0.3803230543318649, "fcm_dpo/beta": 0.0030027367174625397, "fcm_dpo/delta": 0.04155043140053749, "fcm_dpo/margin": 119.85458374023438, "fcm_dpo/q_t": 0.417955607175827, "grad_norm": 27.507400512695312, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.3825646936893463, "logits/rejected": -0.3810487985610962, "logps/chosen": -241.32675170898438, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.191650390625, "logps/rejected": -397.5446472167969, "loss": 1.1286, "margin_dpo/margin_mean": 119.85458374023438, "margin_dpo/margin_std": 180.78173828125, "step": 259 }, { "KL/chosen_KL_mean": -193.53997802734375, "KL/mean": -273.78662109375, "KL/rejected_KL_mean": -354.0333251953125, "KL/std": 163.99403381347656, "epoch": 0.38179148311306904, "fcm_dpo/beta": 0.00299159437417984, "fcm_dpo/delta": -0.0841422975063324, "fcm_dpo/margin": 160.49334716796875, "fcm_dpo/q_t": 0.390764981508255, "grad_norm": 26.93035125732422, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.33758312463760376, "logits/rejected": -0.35471126437187195, "logps/chosen": -240.9576416015625, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08978271484375, "logps/rejected": -449.12310791015625, "loss": 1.0251, "margin_dpo/margin_mean": 160.49334716796875, "margin_dpo/margin_std": 176.1913604736328, "step": 260 }, { "KL/chosen_KL_mean": -203.53651428222656, "KL/mean": -278.8546142578125, "KL/rejected_KL_mean": -354.1727294921875, "KL/std": 180.8629150390625, "epoch": 0.3832599118942731, "fcm_dpo/beta": 0.0029416559264063835, "fcm_dpo/delta": -0.04529657959938049, "fcm_dpo/margin": 150.63624572753906, "fcm_dpo/q_t": 0.39985257387161255, "grad_norm": 23.40955924987793, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.30870985984802246, "logits/rejected": -0.3094845414161682, "logps/chosen": -256.5679016113281, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -442.68768310546875, "loss": 1.0729, "margin_dpo/margin_mean": 150.63623046875, "margin_dpo/margin_std": 203.27151489257812, "step": 261 }, { "KL/chosen_KL_mean": -243.448974609375, "KL/mean": -305.376708984375, "KL/rejected_KL_mean": -367.30438232421875, "KL/std": 165.85348510742188, "epoch": 0.38472834067547723, "fcm_dpo/beta": 0.002966498024761677, "fcm_dpo/delta": 0.03342254459857941, "fcm_dpo/margin": 123.85542297363281, "fcm_dpo/q_t": 0.41466158628463745, "grad_norm": 25.519702911376953, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.29800450801849365, "logits/rejected": -0.2692173719406128, "logps/chosen": -303.0691223144531, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -453.7229309082031, "loss": 1.108, "margin_dpo/margin_mean": 123.85542297363281, "margin_dpo/margin_std": 163.9331512451172, "step": 262 }, { "KL/chosen_KL_mean": -228.96612548828125, "KL/mean": -317.9360046386719, "KL/rejected_KL_mean": -406.9059143066406, "KL/std": 197.06179809570312, "epoch": 0.38619676945668135, "fcm_dpo/beta": 0.0028949188999831676, "fcm_dpo/delta": -0.12226266413927078, "fcm_dpo/margin": 177.93978881835938, "fcm_dpo/q_t": 0.38297536969184875, "grad_norm": 23.3193302154541, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.3331921398639679, "logits/rejected": -0.30877092480659485, "logps/chosen": -288.3870849609375, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -503.76312255859375, "loss": 1.0198, "margin_dpo/margin_mean": 177.93978881835938, "margin_dpo/margin_std": 208.54910278320312, "step": 263 }, { "KL/chosen_KL_mean": -242.01239013671875, "KL/mean": -317.7347412109375, "KL/rejected_KL_mean": -393.45709228515625, "KL/std": 182.8231201171875, "epoch": 0.3876651982378855, "fcm_dpo/beta": 0.002853479702025652, "fcm_dpo/delta": -0.03490894287824631, "fcm_dpo/margin": 151.4447021484375, "fcm_dpo/q_t": 0.4024280309677124, "grad_norm": 27.41741180419922, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.3761428892612457, "logits/rejected": -0.36597341299057007, "logps/chosen": -304.7344665527344, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85620880126953, "logps/rejected": -487.31329345703125, "loss": 1.0845, "margin_dpo/margin_mean": 151.4447021484375, "margin_dpo/margin_std": 206.7536163330078, "step": 264 }, { "KL/chosen_KL_mean": -260.0290832519531, "KL/mean": -333.6810302734375, "KL/rejected_KL_mean": -407.3330383300781, "KL/std": 206.29835510253906, "epoch": 0.3891336270190896, "fcm_dpo/beta": 0.0028575900942087173, "fcm_dpo/delta": -0.021879900246858597, "fcm_dpo/margin": 147.30393981933594, "fcm_dpo/q_t": 0.4068824350833893, "grad_norm": 25.906906127929688, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.3324648439884186, "logits/rejected": -0.3139057755470276, "logps/chosen": -322.00054931640625, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -495.3536376953125, "loss": 1.1181, "margin_dpo/margin_mean": 147.30393981933594, "margin_dpo/margin_std": 244.15444946289062, "step": 265 }, { "KL/chosen_KL_mean": -260.84808349609375, "KL/mean": -320.35455322265625, "KL/rejected_KL_mean": -379.8610534667969, "KL/std": 171.5772247314453, "epoch": 0.39060205580029367, "fcm_dpo/beta": 0.0028820079751312733, "fcm_dpo/delta": 0.059021368622779846, "fcm_dpo/margin": 119.01298522949219, "fcm_dpo/q_t": 0.4227384924888611, "grad_norm": 65.29761505126953, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.3929414451122284, "logits/rejected": -0.3485182523727417, "logps/chosen": -327.94775390625, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -447.832275390625, "loss": 1.1495, "margin_dpo/margin_mean": 119.01298522949219, "margin_dpo/margin_std": 200.31187438964844, "step": 266 }, { "KL/chosen_KL_mean": -230.92379760742188, "KL/mean": -304.11248779296875, "KL/rejected_KL_mean": -377.3011474609375, "KL/std": 182.7541961669922, "epoch": 0.3920704845814978, "fcm_dpo/beta": 0.002870975062251091, "fcm_dpo/delta": -0.021463816985487938, "fcm_dpo/margin": 146.3773193359375, "fcm_dpo/q_t": 0.40422123670578003, "grad_norm": 41.96255874633789, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.3992302715778351, "logits/rejected": -0.36245715618133545, "logps/chosen": -299.8945617675781, "logps/ref_chosen": -68.97075653076172, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -467.4696044921875, "loss": 1.0898, "margin_dpo/margin_mean": 146.3773193359375, "margin_dpo/margin_std": 205.5968780517578, "step": 267 }, { "KL/chosen_KL_mean": -235.22242736816406, "KL/mean": -306.08172607421875, "KL/rejected_KL_mean": -376.9410400390625, "KL/std": 170.38865661621094, "epoch": 0.3935389133627019, "fcm_dpo/beta": 0.002865626011043787, "fcm_dpo/delta": -0.006617257371544838, "fcm_dpo/margin": 141.7186279296875, "fcm_dpo/q_t": 0.4083732068538666, "grad_norm": 30.43846321105957, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.37674716114997864, "logits/rejected": -0.3824624717235565, "logps/chosen": -291.12274169921875, "logps/ref_chosen": -55.90031051635742, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -478.58868408203125, "loss": 1.0987, "margin_dpo/margin_mean": 141.71861267089844, "margin_dpo/margin_std": 204.48934936523438, "step": 268 }, { "KL/chosen_KL_mean": -245.32723999023438, "KL/mean": -330.68280029296875, "KL/rejected_KL_mean": -416.038330078125, "KL/std": 177.90664672851562, "epoch": 0.39500734214390604, "fcm_dpo/beta": 0.002839939435943961, "fcm_dpo/delta": -0.08905084431171417, "fcm_dpo/margin": 170.71112060546875, "fcm_dpo/q_t": 0.3899012804031372, "grad_norm": 27.612655639648438, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.45941269397735596, "logits/rejected": -0.4373210668563843, "logps/chosen": -315.3667907714844, "logps/ref_chosen": -70.03955841064453, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -523.3876953125, "loss": 1.0488, "margin_dpo/margin_mean": 170.71112060546875, "margin_dpo/margin_std": 219.97512817382812, "step": 269 }, { "KL/chosen_KL_mean": -226.80136108398438, "KL/mean": -291.0020751953125, "KL/rejected_KL_mean": -355.2027893066406, "KL/std": 163.08633422851562, "epoch": 0.3964757709251101, "fcm_dpo/beta": 0.002845948562026024, "fcm_dpo/delta": 0.03565208241343498, "fcm_dpo/margin": 128.40142822265625, "fcm_dpo/q_t": 0.41619110107421875, "grad_norm": 45.69468688964844, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.4949715733528137, "logits/rejected": -0.4854864776134491, "logps/chosen": -296.3348388671875, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -465.1314392089844, "loss": 1.1308, "margin_dpo/margin_mean": 128.40142822265625, "margin_dpo/margin_std": 200.96267700195312, "step": 270 }, { "KL/chosen_KL_mean": -224.1912078857422, "KL/mean": -305.23382568359375, "KL/rejected_KL_mean": -386.27642822265625, "KL/std": 166.82310485839844, "epoch": 0.39794419970631423, "fcm_dpo/beta": 0.0028090826235711575, "fcm_dpo/delta": -0.058409713208675385, "fcm_dpo/margin": 162.08523559570312, "fcm_dpo/q_t": 0.3959714472293854, "grad_norm": 25.674280166625977, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.4081183075904846, "logits/rejected": -0.3879523277282715, "logps/chosen": -280.9557800292969, "logps/ref_chosen": -56.76456832885742, "logps/ref_rejected": -92.51383972167969, "logps/rejected": -478.790283203125, "loss": 1.0488, "margin_dpo/margin_mean": 162.08523559570312, "margin_dpo/margin_std": 191.40362548828125, "step": 271 }, { "KL/chosen_KL_mean": -210.31185913085938, "KL/mean": -310.5980224609375, "KL/rejected_KL_mean": -410.8841552734375, "KL/std": 184.2425537109375, "epoch": 0.39941262848751835, "fcm_dpo/beta": 0.002755315974354744, "fcm_dpo/delta": -0.16125299036502838, "fcm_dpo/margin": 200.572265625, "fcm_dpo/q_t": 0.37152132391929626, "grad_norm": 33.71802520751953, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.3228394389152527, "logits/rejected": -0.3339686989784241, "logps/chosen": -259.80902099609375, "logps/ref_chosen": -49.497154235839844, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -516.4269409179688, "loss": 0.973, "margin_dpo/margin_mean": 200.572265625, "margin_dpo/margin_std": 189.1830596923828, "step": 272 }, { "KL/chosen_KL_mean": -236.64874267578125, "KL/mean": -329.9783630371094, "KL/rejected_KL_mean": -423.3079833984375, "KL/std": 181.1544189453125, "epoch": 0.4008810572687225, "fcm_dpo/beta": 0.0026712960097938776, "fcm_dpo/delta": -0.10431107878684998, "fcm_dpo/margin": 186.65921020507812, "fcm_dpo/q_t": 0.3848682641983032, "grad_norm": 41.27581024169922, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.32734841108322144, "logits/rejected": -0.3092419505119324, "logps/chosen": -299.6241455078125, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -515.8065185546875, "loss": 1.0169, "margin_dpo/margin_mean": 186.65921020507812, "margin_dpo/margin_std": 203.4211883544922, "step": 273 }, { "KL/chosen_KL_mean": -278.07952880859375, "KL/mean": -350.180419921875, "KL/rejected_KL_mean": -422.2813415527344, "KL/std": 171.34066772460938, "epoch": 0.4023494860499266, "fcm_dpo/beta": 0.002680413890630007, "fcm_dpo/delta": 0.013700582087039948, "fcm_dpo/margin": 144.20184326171875, "fcm_dpo/q_t": 0.41100966930389404, "grad_norm": 41.83562469482422, "learning_rate": 3.75e-07, "logits/chosen": -0.30099087953567505, "logits/rejected": -0.2856178879737854, "logps/chosen": -333.74725341796875, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -499.61444091796875, "loss": 1.1027, "margin_dpo/margin_mean": 144.20184326171875, "margin_dpo/margin_std": 196.0882568359375, "step": 274 }, { "KL/chosen_KL_mean": -212.87841796875, "KL/mean": -288.3249206542969, "KL/rejected_KL_mean": -363.7713928222656, "KL/std": 169.57728576660156, "epoch": 0.40381791483113066, "fcm_dpo/beta": 0.0026775910519063473, "fcm_dpo/delta": -0.004319606348872185, "fcm_dpo/margin": 150.89297485351562, "fcm_dpo/q_t": 0.4061974585056305, "grad_norm": 29.38013458251953, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.35198503732681274, "logits/rejected": -0.35458293557167053, "logps/chosen": -261.47314453125, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -457.0751037597656, "loss": 1.0836, "margin_dpo/margin_mean": 150.89297485351562, "margin_dpo/margin_std": 189.14598083496094, "step": 275 }, { "KL/chosen_KL_mean": -223.02987670898438, "KL/mean": -300.77679443359375, "KL/rejected_KL_mean": -378.52374267578125, "KL/std": 173.62960815429688, "epoch": 0.4052863436123348, "fcm_dpo/beta": 0.002657739445567131, "fcm_dpo/delta": -0.014008134603500366, "fcm_dpo/margin": 155.49386596679688, "fcm_dpo/q_t": 0.40572842955589294, "grad_norm": 26.85495376586914, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.33715903759002686, "logits/rejected": -0.30880868434906006, "logps/chosen": -279.6072998046875, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -448.889404296875, "loss": 1.0856, "margin_dpo/margin_mean": 155.49386596679688, "margin_dpo/margin_std": 207.1399383544922, "step": 276 }, { "KL/chosen_KL_mean": -241.0596923828125, "KL/mean": -318.1791687011719, "KL/rejected_KL_mean": -395.29864501953125, "KL/std": 173.15396118164062, "epoch": 0.4067547723935389, "fcm_dpo/beta": 0.0026624128222465515, "fcm_dpo/delta": -0.011134679429233074, "fcm_dpo/margin": 154.23895263671875, "fcm_dpo/q_t": 0.405393123626709, "grad_norm": 23.91503143310547, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.3526584506034851, "logits/rejected": -0.3328750431537628, "logps/chosen": -297.33123779296875, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -488.1799011230469, "loss": 1.0874, "margin_dpo/margin_mean": 154.2389678955078, "margin_dpo/margin_std": 204.90927124023438, "step": 277 }, { "KL/chosen_KL_mean": -204.98043823242188, "KL/mean": -295.61114501953125, "KL/rejected_KL_mean": -386.24188232421875, "KL/std": 184.41384887695312, "epoch": 0.40822320117474303, "fcm_dpo/beta": 0.002619755920022726, "fcm_dpo/delta": -0.07888495177030563, "fcm_dpo/margin": 181.26145935058594, "fcm_dpo/q_t": 0.3913354277610779, "grad_norm": 29.900175094604492, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.339875727891922, "logits/rejected": -0.3422485291957855, "logps/chosen": -257.92236328125, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -477.4954833984375, "loss": 1.0328, "margin_dpo/margin_mean": 181.261474609375, "margin_dpo/margin_std": 205.30252075195312, "step": 278 }, { "KL/chosen_KL_mean": -229.3821563720703, "KL/mean": -320.7900085449219, "KL/rejected_KL_mean": -412.1978759765625, "KL/std": 190.40365600585938, "epoch": 0.40969162995594716, "fcm_dpo/beta": 0.0025727972388267517, "fcm_dpo/delta": -0.07477246224880219, "fcm_dpo/margin": 182.81570434570312, "fcm_dpo/q_t": 0.39369115233421326, "grad_norm": 24.77928924560547, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.29181522130966187, "logits/rejected": -0.2875681519508362, "logps/chosen": -278.0234680175781, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -500.04931640625, "loss": 1.0538, "margin_dpo/margin_mean": 182.81570434570312, "margin_dpo/margin_std": 233.40939331054688, "step": 279 }, { "KL/chosen_KL_mean": -224.71835327148438, "KL/mean": -317.265869140625, "KL/rejected_KL_mean": -409.8133544921875, "KL/std": 175.26214599609375, "epoch": 0.4111600587371512, "fcm_dpo/beta": 0.0025460803881287575, "fcm_dpo/delta": -0.07485491782426834, "fcm_dpo/margin": 185.09500122070312, "fcm_dpo/q_t": 0.389259934425354, "grad_norm": 30.460899353027344, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.3362073004245758, "logits/rejected": -0.33699339628219604, "logps/chosen": -283.5154724121094, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -508.43218994140625, "loss": 1.0257, "margin_dpo/margin_mean": 185.09500122070312, "margin_dpo/margin_std": 191.9048309326172, "step": 280 }, { "KL/chosen_KL_mean": -208.28512573242188, "KL/mean": -291.41558837890625, "KL/rejected_KL_mean": -374.5460510253906, "KL/std": 173.07376098632812, "epoch": 0.41262848751835535, "fcm_dpo/beta": 0.002538030967116356, "fcm_dpo/delta": -0.023100202903151512, "fcm_dpo/margin": 166.26089477539062, "fcm_dpo/q_t": 0.3998814821243286, "grad_norm": 23.17946434020996, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.3537529706954956, "logits/rejected": -0.3235951066017151, "logps/chosen": -263.7736511230469, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -455.4286193847656, "loss": 1.0649, "margin_dpo/margin_mean": 166.26089477539062, "margin_dpo/margin_std": 189.45980834960938, "step": 281 }, { "KL/chosen_KL_mean": -245.1143341064453, "KL/mean": -309.87060546875, "KL/rejected_KL_mean": -374.62689208984375, "KL/std": 184.24575805664062, "epoch": 0.41409691629955947, "fcm_dpo/beta": 0.002562709851190448, "fcm_dpo/delta": 0.06986706703901291, "fcm_dpo/margin": 129.51254272460938, "fcm_dpo/q_t": 0.4261664152145386, "grad_norm": 21.843597412109375, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.4187248945236206, "logits/rejected": -0.39314818382263184, "logps/chosen": -318.1844787597656, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -469.97784423828125, "loss": 1.1482, "margin_dpo/margin_mean": 129.5125274658203, "margin_dpo/margin_std": 212.96649169921875, "step": 282 }, { "KL/chosen_KL_mean": -256.6600646972656, "KL/mean": -358.04266357421875, "KL/rejected_KL_mean": -459.42523193359375, "KL/std": 218.51571655273438, "epoch": 0.4155653450807636, "fcm_dpo/beta": 0.00252789119258523, "fcm_dpo/delta": -0.11851293593645096, "fcm_dpo/margin": 202.76519775390625, "fcm_dpo/q_t": 0.3806382119655609, "grad_norm": 25.943897247314453, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.3803967833518982, "logits/rejected": -0.38235464692115784, "logps/chosen": -318.5585021972656, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -556.4117431640625, "loss": 1.004, "margin_dpo/margin_mean": 202.7651824951172, "margin_dpo/margin_std": 206.27056884765625, "step": 283 }, { "KL/chosen_KL_mean": -249.64425659179688, "KL/mean": -341.79144287109375, "KL/rejected_KL_mean": -433.9386291503906, "KL/std": 203.23036193847656, "epoch": 0.4170337738619677, "fcm_dpo/beta": 0.002476719208061695, "fcm_dpo/delta": -0.05925939232110977, "fcm_dpo/margin": 184.29434204101562, "fcm_dpo/q_t": 0.3947269022464752, "grad_norm": 28.472728729248047, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.4060632884502411, "logits/rejected": -0.392697274684906, "logps/chosen": -308.0798034667969, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -527.4078979492188, "loss": 1.0392, "margin_dpo/margin_mean": 184.29434204101562, "margin_dpo/margin_std": 204.0950927734375, "step": 284 }, { "KL/chosen_KL_mean": -298.7416076660156, "KL/mean": -381.11907958984375, "KL/rejected_KL_mean": -463.49652099609375, "KL/std": 184.44662475585938, "epoch": 0.4185022026431718, "fcm_dpo/beta": 0.002482138341292739, "fcm_dpo/delta": -0.00985686480998993, "fcm_dpo/margin": 164.75491333007812, "fcm_dpo/q_t": 0.4034787714481354, "grad_norm": 28.78072738647461, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.3715532422065735, "logits/rejected": -0.3519101142883301, "logps/chosen": -364.97381591796875, "logps/ref_chosen": -66.23219299316406, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -562.6233520507812, "loss": 1.0822, "margin_dpo/margin_mean": 164.7549285888672, "margin_dpo/margin_std": 204.08389282226562, "step": 285 }, { "KL/chosen_KL_mean": -304.86749267578125, "KL/mean": -396.3608093261719, "KL/rejected_KL_mean": -487.8541259765625, "KL/std": 213.21237182617188, "epoch": 0.4199706314243759, "fcm_dpo/beta": 0.002452992368489504, "fcm_dpo/delta": -0.051132772117853165, "fcm_dpo/margin": 182.9866485595703, "fcm_dpo/q_t": 0.3967989683151245, "grad_norm": 26.472732543945312, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.3906969428062439, "logits/rejected": -0.3723870813846588, "logps/chosen": -377.8184814453125, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -576.4425659179688, "loss": 1.0611, "margin_dpo/margin_mean": 182.98663330078125, "margin_dpo/margin_std": 229.21395874023438, "step": 286 }, { "KL/chosen_KL_mean": -285.84619140625, "KL/mean": -360.84112548828125, "KL/rejected_KL_mean": -435.83599853515625, "KL/std": 188.15838623046875, "epoch": 0.42143906020558003, "fcm_dpo/beta": 0.002456413581967354, "fcm_dpo/delta": 0.032731398940086365, "fcm_dpo/margin": 149.98983764648438, "fcm_dpo/q_t": 0.4151533842086792, "grad_norm": 32.449649810791016, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.3350308835506439, "logits/rejected": -0.3064236044883728, "logps/chosen": -347.3873291015625, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.69607543945312, "logps/rejected": -513.5321044921875, "loss": 1.1096, "margin_dpo/margin_mean": 149.98983764648438, "margin_dpo/margin_std": 203.369384765625, "step": 287 }, { "KL/chosen_KL_mean": -282.95465087890625, "KL/mean": -376.14697265625, "KL/rejected_KL_mean": -469.3393249511719, "KL/std": 190.60353088378906, "epoch": 0.42290748898678415, "fcm_dpo/beta": 0.0024195481091737747, "fcm_dpo/delta": -0.05505270138382912, "fcm_dpo/margin": 186.38467407226562, "fcm_dpo/q_t": 0.39451804757118225, "grad_norm": 28.688644409179688, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.3529576063156128, "logits/rejected": -0.33676382899284363, "logps/chosen": -339.6158752441406, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.33570098876953, "logps/rejected": -556.675048828125, "loss": 1.0452, "margin_dpo/margin_mean": 186.38467407226562, "margin_dpo/margin_std": 199.76907348632812, "step": 288 }, { "KL/chosen_KL_mean": -267.9907531738281, "KL/mean": -372.37811279296875, "KL/rejected_KL_mean": -476.76544189453125, "KL/std": 207.67068481445312, "epoch": 0.4243759177679883, "fcm_dpo/beta": 0.002400734229013324, "fcm_dpo/delta": -0.10641852021217346, "fcm_dpo/margin": 208.77468872070312, "fcm_dpo/q_t": 0.3854549527168274, "grad_norm": 35.85638427734375, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.30069178342819214, "logits/rejected": -0.30261388421058655, "logps/chosen": -313.2211608886719, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -564.4080810546875, "loss": 1.0183, "margin_dpo/margin_mean": 208.77468872070312, "margin_dpo/margin_std": 230.7718505859375, "step": 289 }, { "KL/chosen_KL_mean": -281.2525634765625, "KL/mean": -386.38336181640625, "KL/rejected_KL_mean": -491.51422119140625, "KL/std": 217.03399658203125, "epoch": 0.42584434654919234, "fcm_dpo/beta": 0.002342382911592722, "fcm_dpo/delta": -0.09751632809638977, "fcm_dpo/margin": 210.26165771484375, "fcm_dpo/q_t": 0.38870781660079956, "grad_norm": 25.733200073242188, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.3384855091571808, "logits/rejected": -0.35991525650024414, "logps/chosen": -336.72406005859375, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -608.2227783203125, "loss": 1.0358, "margin_dpo/margin_mean": 210.26165771484375, "margin_dpo/margin_std": 258.7429504394531, "step": 290 }, { "KL/chosen_KL_mean": -224.06832885742188, "KL/mean": -321.0989685058594, "KL/rejected_KL_mean": -418.129638671875, "KL/std": 182.753662109375, "epoch": 0.42731277533039647, "fcm_dpo/beta": 0.00232522701844573, "fcm_dpo/delta": -0.053772568702697754, "fcm_dpo/margin": 194.061279296875, "fcm_dpo/q_t": 0.3946029245853424, "grad_norm": 24.563919067382812, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.34820133447647095, "logits/rejected": -0.3331354260444641, "logps/chosen": -287.3287048339844, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -507.42669677734375, "loss": 1.0465, "margin_dpo/margin_mean": 194.061279296875, "margin_dpo/margin_std": 216.65316772460938, "step": 291 }, { "KL/chosen_KL_mean": -240.75106811523438, "KL/mean": -340.0538330078125, "KL/rejected_KL_mean": -439.35662841796875, "KL/std": 209.68600463867188, "epoch": 0.4287812041116006, "fcm_dpo/beta": 0.0022970177233219147, "fcm_dpo/delta": -0.058918386697769165, "fcm_dpo/margin": 198.60556030273438, "fcm_dpo/q_t": 0.3946416676044464, "grad_norm": 23.112098693847656, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.35769540071487427, "logits/rejected": -0.3433658480644226, "logps/chosen": -294.66961669921875, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -529.3179931640625, "loss": 1.0545, "margin_dpo/margin_mean": 198.60556030273438, "margin_dpo/margin_std": 243.43380737304688, "step": 292 }, { "KL/chosen_KL_mean": -235.00213623046875, "KL/mean": -311.6048583984375, "KL/rejected_KL_mean": -388.20758056640625, "KL/std": 205.09054565429688, "epoch": 0.4302496328928047, "fcm_dpo/beta": 0.002302415668964386, "fcm_dpo/delta": 0.04887353628873825, "fcm_dpo/margin": 153.2054443359375, "fcm_dpo/q_t": 0.41990119218826294, "grad_norm": 43.493621826171875, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.3963527977466583, "logits/rejected": -0.385434627532959, "logps/chosen": -295.378173828125, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.85244750976562, "logps/rejected": -466.05999755859375, "loss": 1.1312, "margin_dpo/margin_mean": 153.2054443359375, "margin_dpo/margin_std": 233.70460510253906, "step": 293 }, { "KL/chosen_KL_mean": -212.15213012695312, "KL/mean": -297.83599853515625, "KL/rejected_KL_mean": -383.51983642578125, "KL/std": 185.86453247070312, "epoch": 0.43171806167400884, "fcm_dpo/beta": 0.002301940694451332, "fcm_dpo/delta": 0.005669664591550827, "fcm_dpo/margin": 171.36770629882812, "fcm_dpo/q_t": 0.4098511040210724, "grad_norm": 27.905162811279297, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.3524784743785858, "logits/rejected": -0.3563184142112732, "logps/chosen": -260.23968505859375, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -465.4168395996094, "loss": 1.0942, "margin_dpo/margin_mean": 171.36770629882812, "margin_dpo/margin_std": 227.7198486328125, "step": 294 }, { "KL/chosen_KL_mean": -258.012451171875, "KL/mean": -350.6398620605469, "KL/rejected_KL_mean": -443.26727294921875, "KL/std": 206.24761962890625, "epoch": 0.4331864904552129, "fcm_dpo/beta": 0.002300859661772847, "fcm_dpo/delta": -0.02741077169775963, "fcm_dpo/margin": 185.25485229492188, "fcm_dpo/q_t": 0.4024040997028351, "grad_norm": 32.099098205566406, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.3441588878631592, "logits/rejected": -0.3444691002368927, "logps/chosen": -307.9371032714844, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -530.7236328125, "loss": 1.0767, "margin_dpo/margin_mean": 185.25485229492188, "margin_dpo/margin_std": 240.50933837890625, "step": 295 }, { "KL/chosen_KL_mean": -321.74249267578125, "KL/mean": -389.23687744140625, "KL/rejected_KL_mean": -456.7313232421875, "KL/std": 204.92347717285156, "epoch": 0.434654919236417, "fcm_dpo/beta": 0.0022906125523149967, "fcm_dpo/delta": -0.00922891590744257, "fcm_dpo/margin": 134.98883056640625, "fcm_dpo/q_t": 0.42863988876342773, "grad_norm": 31.48113250732422, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.40720900893211365, "logits/rejected": -0.3905686140060425, "logps/chosen": -387.2337341308594, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -549.8204345703125, "loss": 1.1871, "margin_dpo/margin_mean": 134.98883056640625, "margin_dpo/margin_std": 261.055908203125, "step": 296 }, { "KL/chosen_KL_mean": -287.096435546875, "KL/mean": -368.1800537109375, "KL/rejected_KL_mean": -449.26361083984375, "KL/std": 196.81576538085938, "epoch": 0.43612334801762115, "fcm_dpo/beta": 0.002297044266015291, "fcm_dpo/delta": 0.028556976467370987, "fcm_dpo/margin": 162.16717529296875, "fcm_dpo/q_t": 0.41283226013183594, "grad_norm": 23.6795711517334, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.3867399990558624, "logits/rejected": -0.387523889541626, "logps/chosen": -343.57342529296875, "logps/ref_chosen": -56.476951599121094, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -544.4021606445312, "loss": 1.0962, "margin_dpo/margin_mean": 162.16717529296875, "margin_dpo/margin_std": 195.96707153320312, "step": 297 }, { "KL/chosen_KL_mean": -323.70086669921875, "KL/mean": -429.325439453125, "KL/rejected_KL_mean": -534.9500122070312, "KL/std": 264.07275390625, "epoch": 0.43759177679882527, "fcm_dpo/beta": 0.0022826807107776403, "fcm_dpo/delta": -0.08633655309677124, "fcm_dpo/margin": 211.24917602539062, "fcm_dpo/q_t": 0.39509522914886475, "grad_norm": 26.146221160888672, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.39172685146331787, "logits/rejected": -0.41062480211257935, "logps/chosen": -391.0260314941406, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -651.6121826171875, "loss": 1.0678, "margin_dpo/margin_mean": 211.24917602539062, "margin_dpo/margin_std": 307.80206298828125, "step": 298 }, { "KL/chosen_KL_mean": -274.95782470703125, "KL/mean": -365.26654052734375, "KL/rejected_KL_mean": -455.57525634765625, "KL/std": 200.08763122558594, "epoch": 0.4390602055800294, "fcm_dpo/beta": 0.002272904384881258, "fcm_dpo/delta": -0.011529970914125443, "fcm_dpo/margin": 180.617431640625, "fcm_dpo/q_t": 0.40732306241989136, "grad_norm": 33.49094772338867, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.36451274156570435, "logits/rejected": -0.3671821653842926, "logps/chosen": -323.919921875, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -539.9035034179688, "loss": 1.095, "margin_dpo/margin_mean": 180.617431640625, "margin_dpo/margin_std": 251.55711364746094, "step": 299 }, { "KL/chosen_KL_mean": -354.8046875, "KL/mean": -461.15472412109375, "KL/rejected_KL_mean": -567.5047607421875, "KL/std": 243.07180786132812, "epoch": 0.44052863436123346, "fcm_dpo/beta": 0.002238738350570202, "fcm_dpo/delta": -0.07992631196975708, "fcm_dpo/margin": 212.70004272460938, "fcm_dpo/q_t": 0.3924391269683838, "grad_norm": 41.03089904785156, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.35996705293655396, "logits/rejected": -0.36685582995414734, "logps/chosen": -413.8783874511719, "logps/ref_chosen": -59.07371139526367, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -663.47119140625, "loss": 1.0665, "margin_dpo/margin_mean": 212.70004272460938, "margin_dpo/margin_std": 292.86248779296875, "step": 300 }, { "KL/chosen_KL_mean": -305.3809814453125, "KL/mean": -412.4756774902344, "KL/rejected_KL_mean": -519.5704345703125, "KL/std": 226.57168579101562, "epoch": 0.4419970631424376, "fcm_dpo/beta": 0.0021908977068960667, "fcm_dpo/delta": -0.07324320077896118, "fcm_dpo/margin": 214.18943786621094, "fcm_dpo/q_t": 0.3950774669647217, "grad_norm": 35.91939163208008, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.3929128050804138, "logits/rejected": -0.3964204788208008, "logps/chosen": -362.63031005859375, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -611.9239501953125, "loss": 1.051, "margin_dpo/margin_mean": 214.189453125, "margin_dpo/margin_std": 272.2989807128906, "step": 301 }, { "KL/chosen_KL_mean": -259.3084411621094, "KL/mean": -347.705810546875, "KL/rejected_KL_mean": -436.1031494140625, "KL/std": 198.4259033203125, "epoch": 0.4434654919236417, "fcm_dpo/beta": 0.0021872916258871555, "fcm_dpo/delta": 0.013604838401079178, "fcm_dpo/margin": 176.79473876953125, "fcm_dpo/q_t": 0.4114551544189453, "grad_norm": 35.29695129394531, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.37686437368392944, "logits/rejected": -0.3835713863372803, "logps/chosen": -310.5064392089844, "logps/ref_chosen": -51.197994232177734, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -533.3295288085938, "loss": 1.0984, "margin_dpo/margin_mean": 176.79470825195312, "margin_dpo/margin_std": 234.30789184570312, "step": 302 }, { "KL/chosen_KL_mean": -262.9270935058594, "KL/mean": -342.5615234375, "KL/rejected_KL_mean": -422.1959228515625, "KL/std": 198.67970275878906, "epoch": 0.44493392070484583, "fcm_dpo/beta": 0.002209719270467758, "fcm_dpo/delta": 0.04984103888273239, "fcm_dpo/margin": 159.26882934570312, "fcm_dpo/q_t": 0.4192725419998169, "grad_norm": 39.67668914794922, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.39630812406539917, "logits/rejected": -0.38712817430496216, "logps/chosen": -329.64105224609375, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -509.141357421875, "loss": 1.1322, "margin_dpo/margin_mean": 159.26882934570312, "margin_dpo/margin_std": 240.13339233398438, "step": 303 }, { "KL/chosen_KL_mean": -224.00103759765625, "KL/mean": -315.3577880859375, "KL/rejected_KL_mean": -406.7145080566406, "KL/std": 181.42640686035156, "epoch": 0.44640234948604995, "fcm_dpo/beta": 0.0022241733968257904, "fcm_dpo/delta": -0.006868166849017143, "fcm_dpo/margin": 182.71343994140625, "fcm_dpo/q_t": 0.4025576710700989, "grad_norm": 28.95069694519043, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.4077110290527344, "logits/rejected": -0.40159422159194946, "logps/chosen": -295.95172119140625, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -497.1865234375, "loss": 1.0501, "margin_dpo/margin_mean": 182.71343994140625, "margin_dpo/margin_std": 153.1214599609375, "step": 304 }, { "KL/chosen_KL_mean": -241.20379638671875, "KL/mean": -322.8453369140625, "KL/rejected_KL_mean": -404.48687744140625, "KL/std": 208.91802978515625, "epoch": 0.447870778267254, "fcm_dpo/beta": 0.002216045744717121, "fcm_dpo/delta": 0.03913535922765732, "fcm_dpo/margin": 163.2830810546875, "fcm_dpo/q_t": 0.41752344369888306, "grad_norm": 25.189720153808594, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.3988510072231293, "logits/rejected": -0.382416695356369, "logps/chosen": -307.9990234375, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -497.241455078125, "loss": 1.1197, "margin_dpo/margin_mean": 163.2830810546875, "margin_dpo/margin_std": 229.060546875, "step": 305 }, { "KL/chosen_KL_mean": -243.65699768066406, "KL/mean": -323.1556396484375, "KL/rejected_KL_mean": -402.654296875, "KL/std": 187.81524658203125, "epoch": 0.44933920704845814, "fcm_dpo/beta": 0.0022513873409479856, "fcm_dpo/delta": 0.04347452521324158, "fcm_dpo/margin": 158.99728393554688, "fcm_dpo/q_t": 0.41661351919174194, "grad_norm": 25.411108016967773, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.3887644410133362, "logits/rejected": -0.3672389090061188, "logps/chosen": -313.34088134765625, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -487.8134765625, "loss": 1.1059, "margin_dpo/margin_mean": 158.99728393554688, "margin_dpo/margin_std": 197.02398681640625, "step": 306 }, { "KL/chosen_KL_mean": -229.40023803710938, "KL/mean": -310.8966979980469, "KL/rejected_KL_mean": -392.39312744140625, "KL/std": 175.57015991210938, "epoch": 0.45080763582966227, "fcm_dpo/beta": 0.002260227221995592, "fcm_dpo/delta": 0.03280823305249214, "fcm_dpo/margin": 162.99290466308594, "fcm_dpo/q_t": 0.41427597403526306, "grad_norm": 30.055885314941406, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.3810919523239136, "logits/rejected": -0.36472952365875244, "logps/chosen": -299.565673828125, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -479.3654479980469, "loss": 1.1067, "margin_dpo/margin_mean": 162.99288940429688, "margin_dpo/margin_std": 214.08172607421875, "step": 307 }, { "KL/chosen_KL_mean": -229.43406677246094, "KL/mean": -322.5722351074219, "KL/rejected_KL_mean": -415.71038818359375, "KL/std": 199.0686492919922, "epoch": 0.4522760646108664, "fcm_dpo/beta": 0.002260176232084632, "fcm_dpo/delta": -0.021941393613815308, "fcm_dpo/margin": 186.27633666992188, "fcm_dpo/q_t": 0.40062737464904785, "grad_norm": 34.93415451049805, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.3374328017234802, "logits/rejected": -0.32412296533584595, "logps/chosen": -284.6790771484375, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -495.0826416015625, "loss": 1.0539, "margin_dpo/margin_mean": 186.27633666992188, "margin_dpo/margin_std": 191.69363403320312, "step": 308 }, { "KL/chosen_KL_mean": -233.2113037109375, "KL/mean": -323.8895568847656, "KL/rejected_KL_mean": -414.56781005859375, "KL/std": 213.318603515625, "epoch": 0.45374449339207046, "fcm_dpo/beta": 0.0022653641644865274, "fcm_dpo/delta": -0.011782001703977585, "fcm_dpo/margin": 181.3565216064453, "fcm_dpo/q_t": 0.4036235809326172, "grad_norm": 54.994361877441406, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.33466869592666626, "logits/rejected": -0.32765206694602966, "logps/chosen": -282.17041015625, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -496.9085693359375, "loss": 1.0826, "margin_dpo/margin_mean": 181.3565216064453, "margin_dpo/margin_std": 227.5936279296875, "step": 309 }, { "KL/chosen_KL_mean": -264.2091369628906, "KL/mean": -353.9774169921875, "KL/rejected_KL_mean": -443.7456970214844, "KL/std": 186.558349609375, "epoch": 0.4552129221732746, "fcm_dpo/beta": 0.002258453518152237, "fcm_dpo/delta": -0.0059468671679496765, "fcm_dpo/margin": 179.53656005859375, "fcm_dpo/q_t": 0.40412867069244385, "grad_norm": 21.421035766601562, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.35127896070480347, "logits/rejected": -0.33690258860588074, "logps/chosen": -326.950927734375, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -523.676025390625, "loss": 1.0633, "margin_dpo/margin_mean": 179.5365447998047, "margin_dpo/margin_std": 181.55760192871094, "step": 310 }, { "KL/chosen_KL_mean": -284.3205871582031, "KL/mean": -382.16046142578125, "KL/rejected_KL_mean": -480.00030517578125, "KL/std": 239.59608459472656, "epoch": 0.4566813509544787, "fcm_dpo/beta": 0.0022359404247254133, "fcm_dpo/delta": -0.039258040487766266, "fcm_dpo/margin": 195.67971801757812, "fcm_dpo/q_t": 0.40077459812164307, "grad_norm": 27.19573402404785, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.36365634202957153, "logits/rejected": -0.34147655963897705, "logps/chosen": -337.34857177734375, "logps/ref_chosen": -53.02798080444336, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -557.4384765625, "loss": 1.0689, "margin_dpo/margin_mean": 195.67970275878906, "margin_dpo/margin_std": 252.54080200195312, "step": 311 }, { "KL/chosen_KL_mean": -272.4962158203125, "KL/mean": -364.0135498046875, "KL/rejected_KL_mean": -455.53094482421875, "KL/std": 213.603759765625, "epoch": 0.4581497797356828, "fcm_dpo/beta": 0.002237812615931034, "fcm_dpo/delta": -0.01030636951327324, "fcm_dpo/margin": 183.03475952148438, "fcm_dpo/q_t": 0.4078907370567322, "grad_norm": 24.042999267578125, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.2863520681858063, "logits/rejected": -0.27598023414611816, "logps/chosen": -321.888427734375, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280853271484, "logps/rejected": -531.32373046875, "loss": 1.0899, "margin_dpo/margin_mean": 183.03475952148438, "margin_dpo/margin_std": 251.37765502929688, "step": 312 }, { "KL/chosen_KL_mean": -254.2932891845703, "KL/mean": -349.1951599121094, "KL/rejected_KL_mean": -444.0970458984375, "KL/std": 223.68814086914062, "epoch": 0.45961820851688695, "fcm_dpo/beta": 0.0022342309821397066, "fcm_dpo/delta": -0.025912020355463028, "fcm_dpo/margin": 189.80377197265625, "fcm_dpo/q_t": 0.4029422402381897, "grad_norm": 25.419347763061523, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.36437875032424927, "logits/rejected": -0.36159804463386536, "logps/chosen": -304.446044921875, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -530.5032958984375, "loss": 1.0826, "margin_dpo/margin_mean": 189.80377197265625, "margin_dpo/margin_std": 249.74856567382812, "step": 313 }, { "KL/chosen_KL_mean": -285.2285461425781, "KL/mean": -373.33819580078125, "KL/rejected_KL_mean": -461.4478759765625, "KL/std": 217.80224609375, "epoch": 0.461086637298091, "fcm_dpo/beta": 0.0022100405767560005, "fcm_dpo/delta": 0.010647352784872055, "fcm_dpo/margin": 176.2192840576172, "fcm_dpo/q_t": 0.4131169021129608, "grad_norm": 22.368419647216797, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.40202397108078003, "logits/rejected": -0.4044821262359619, "logps/chosen": -342.46612548828125, "logps/ref_chosen": -57.237579345703125, "logps/ref_rejected": -97.5965347290039, "logps/rejected": -559.04443359375, "loss": 1.1211, "margin_dpo/margin_mean": 176.21929931640625, "margin_dpo/margin_std": 275.643798828125, "step": 314 }, { "KL/chosen_KL_mean": -263.5173645019531, "KL/mean": -342.3055114746094, "KL/rejected_KL_mean": -421.0936279296875, "KL/std": 194.72991943359375, "epoch": 0.46255506607929514, "fcm_dpo/beta": 0.0022343965247273445, "fcm_dpo/delta": 0.049690838903188705, "fcm_dpo/margin": 157.5762939453125, "fcm_dpo/q_t": 0.4190711975097656, "grad_norm": 22.533966064453125, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.3270511329174042, "logits/rejected": -0.3310539126396179, "logps/chosen": -312.58697509765625, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -506.7745361328125, "loss": 1.1212, "margin_dpo/margin_mean": 157.57626342773438, "margin_dpo/margin_std": 219.2792205810547, "step": 315 }, { "KL/chosen_KL_mean": -256.9066162109375, "KL/mean": -373.9107360839844, "KL/rejected_KL_mean": -490.9148864746094, "KL/std": 229.14527893066406, "epoch": 0.46402349486049926, "fcm_dpo/beta": 0.0021924672182649374, "fcm_dpo/delta": -0.12023768573999405, "fcm_dpo/margin": 234.0082244873047, "fcm_dpo/q_t": 0.3829796314239502, "grad_norm": 27.900861740112305, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.3719561696052551, "logits/rejected": -0.3757820725440979, "logps/chosen": -311.1673583984375, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -592.1963500976562, "loss": 1.0059, "margin_dpo/margin_mean": 234.00823974609375, "margin_dpo/margin_std": 248.06161499023438, "step": 316 }, { "KL/chosen_KL_mean": -261.26275634765625, "KL/mean": -384.9669189453125, "KL/rejected_KL_mean": -508.6710205078125, "KL/std": 205.64381408691406, "epoch": 0.4654919236417034, "fcm_dpo/beta": 0.0021530133672058582, "fcm_dpo/delta": -0.13990481197834015, "fcm_dpo/margin": 247.40829467773438, "fcm_dpo/q_t": 0.3760995864868164, "grad_norm": 25.268577575683594, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.3866614103317261, "logits/rejected": -0.3781118392944336, "logps/chosen": -317.3569641113281, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -609.3701171875, "loss": 0.9834, "margin_dpo/margin_mean": 247.40829467773438, "margin_dpo/margin_std": 231.6420135498047, "step": 317 }, { "KL/chosen_KL_mean": -285.3980407714844, "KL/mean": -377.98626708984375, "KL/rejected_KL_mean": -470.574462890625, "KL/std": 215.07859802246094, "epoch": 0.4669603524229075, "fcm_dpo/beta": 0.002135781804099679, "fcm_dpo/delta": 0.00455857440829277, "fcm_dpo/margin": 185.17642211914062, "fcm_dpo/q_t": 0.4088175892829895, "grad_norm": 25.41587257385254, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.43662551045417786, "logits/rejected": -0.40799379348754883, "logps/chosen": -350.0437316894531, "logps/ref_chosen": -64.64569854736328, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -553.3387451171875, "loss": 1.0885, "margin_dpo/margin_mean": 185.17642211914062, "margin_dpo/margin_std": 236.15972900390625, "step": 318 }, { "KL/chosen_KL_mean": -250.97140502929688, "KL/mean": -361.4615478515625, "KL/rejected_KL_mean": -471.95166015625, "KL/std": 230.63836669921875, "epoch": 0.4684287812041116, "fcm_dpo/beta": 0.002109553199261427, "fcm_dpo/delta": -0.06939505785703659, "fcm_dpo/margin": 220.9802703857422, "fcm_dpo/q_t": 0.3919963836669922, "grad_norm": 22.969181060791016, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.36899369955062866, "logits/rejected": -0.38391441106796265, "logps/chosen": -300.35516357421875, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -585.858154296875, "loss": 1.0362, "margin_dpo/margin_mean": 220.98025512695312, "margin_dpo/margin_std": 247.80764770507812, "step": 319 }, { "KL/chosen_KL_mean": -261.7906494140625, "KL/mean": -381.9345703125, "KL/rejected_KL_mean": -502.07843017578125, "KL/std": 240.1503448486328, "epoch": 0.4698972099853157, "fcm_dpo/beta": 0.002064064610749483, "fcm_dpo/delta": -0.10146654397249222, "fcm_dpo/margin": 240.28778076171875, "fcm_dpo/q_t": 0.3857959806919098, "grad_norm": 25.455394744873047, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.3855065703392029, "logits/rejected": -0.39106667041778564, "logps/chosen": -321.2955322265625, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66717529296875, "logps/rejected": -599.74560546875, "loss": 1.0151, "margin_dpo/margin_mean": 240.2877960205078, "margin_dpo/margin_std": 257.2183837890625, "step": 320 }, { "KL/chosen_KL_mean": -324.5029296875, "KL/mean": -426.83648681640625, "KL/rejected_KL_mean": -529.1701049804688, "KL/std": 247.80294799804688, "epoch": 0.4713656387665198, "fcm_dpo/beta": 0.002041730796918273, "fcm_dpo/delta": -0.019452113658189774, "fcm_dpo/margin": 204.66717529296875, "fcm_dpo/q_t": 0.4043254256248474, "grad_norm": 22.51926612854004, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.36786073446273804, "logits/rejected": -0.35931724309921265, "logps/chosen": -386.05157470703125, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -620.8111572265625, "loss": 1.084, "margin_dpo/margin_mean": 204.66717529296875, "margin_dpo/margin_std": 266.84027099609375, "step": 321 }, { "KL/chosen_KL_mean": -277.02001953125, "KL/mean": -390.20526123046875, "KL/rejected_KL_mean": -503.3905029296875, "KL/std": 221.27413940429688, "epoch": 0.47283406754772395, "fcm_dpo/beta": 0.002025635913014412, "fcm_dpo/delta": -0.0618242546916008, "fcm_dpo/margin": 226.3704833984375, "fcm_dpo/q_t": 0.39296412467956543, "grad_norm": 22.475814819335938, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.3741741180419922, "logits/rejected": -0.37576234340667725, "logps/chosen": -334.31005859375, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -599.1404418945312, "loss": 1.0349, "margin_dpo/margin_mean": 226.3704833984375, "margin_dpo/margin_std": 237.702392578125, "step": 322 }, { "KL/chosen_KL_mean": -298.865234375, "KL/mean": -393.89263916015625, "KL/rejected_KL_mean": -488.9200439453125, "KL/std": 218.2051239013672, "epoch": 0.47430249632892807, "fcm_dpo/beta": 0.002028942573815584, "fcm_dpo/delta": 0.014963037334382534, "fcm_dpo/margin": 190.0548095703125, "fcm_dpo/q_t": 0.4098392724990845, "grad_norm": 34.564815521240234, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.3972129225730896, "logits/rejected": -0.39619508385658264, "logps/chosen": -350.09918212890625, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -563.98193359375, "loss": 1.0968, "margin_dpo/margin_mean": 190.0548095703125, "margin_dpo/margin_std": 241.6279296875, "step": 323 }, { "KL/chosen_KL_mean": -339.42047119140625, "KL/mean": -431.14385986328125, "KL/rejected_KL_mean": -522.8672485351562, "KL/std": 229.73959350585938, "epoch": 0.47577092511013214, "fcm_dpo/beta": 0.002055136486887932, "fcm_dpo/delta": 0.022490426898002625, "fcm_dpo/margin": 183.44676208496094, "fcm_dpo/q_t": 0.41455233097076416, "grad_norm": 38.610740661621094, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.37280696630477905, "logits/rejected": -0.35853368043899536, "logps/chosen": -404.5556640625, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750854492188, "logps/rejected": -609.3447265625, "loss": 1.1193, "margin_dpo/margin_mean": 183.44676208496094, "margin_dpo/margin_std": 259.76324462890625, "step": 324 }, { "KL/chosen_KL_mean": -265.3223876953125, "KL/mean": -344.13836669921875, "KL/rejected_KL_mean": -422.95440673828125, "KL/std": 204.55453491210938, "epoch": 0.47723935389133626, "fcm_dpo/beta": 0.0020671868696808815, "fcm_dpo/delta": 0.07653862237930298, "fcm_dpo/margin": 157.63201904296875, "fcm_dpo/q_t": 0.42351895570755005, "grad_norm": 25.032848358154297, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.4788200259208679, "logits/rejected": -0.4589323401451111, "logps/chosen": -321.5379638671875, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.08592987060547, "logps/rejected": -493.0403137207031, "loss": 1.1329, "margin_dpo/margin_mean": 157.63201904296875, "margin_dpo/margin_std": 206.6846466064453, "step": 325 }, { "KL/chosen_KL_mean": -288.2630615234375, "KL/mean": -365.63433837890625, "KL/rejected_KL_mean": -443.005615234375, "KL/std": 189.14825439453125, "epoch": 0.4787077826725404, "fcm_dpo/beta": 0.0020953970961272717, "fcm_dpo/delta": 0.07834838330745697, "fcm_dpo/margin": 154.7425537109375, "fcm_dpo/q_t": 0.42310160398483276, "grad_norm": 34.663387298583984, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.4711052179336548, "logits/rejected": -0.4501519799232483, "logps/chosen": -360.988037109375, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.8467788696289, "logps/rejected": -522.8524169921875, "loss": 1.1242, "margin_dpo/margin_mean": 154.7425537109375, "margin_dpo/margin_std": 181.62420654296875, "step": 326 }, { "KL/chosen_KL_mean": -249.2471923828125, "KL/mean": -359.001953125, "KL/rejected_KL_mean": -468.7567138671875, "KL/std": 200.14559936523438, "epoch": 0.4801762114537445, "fcm_dpo/beta": 0.00208103284239769, "fcm_dpo/delta": -0.05987313389778137, "fcm_dpo/margin": 219.50950622558594, "fcm_dpo/q_t": 0.3920641541481018, "grad_norm": 37.02153396606445, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.46292924880981445, "logits/rejected": -0.45082515478134155, "logps/chosen": -318.3816223144531, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -580.6904907226562, "loss": 1.0304, "margin_dpo/margin_mean": 219.50949096679688, "margin_dpo/margin_std": 218.749755859375, "step": 327 }, { "KL/chosen_KL_mean": -263.05267333984375, "KL/mean": -361.53070068359375, "KL/rejected_KL_mean": -460.0086669921875, "KL/std": 220.0689697265625, "epoch": 0.48164464023494863, "fcm_dpo/beta": 0.0020799068734049797, "fcm_dpo/delta": -0.010076452046632767, "fcm_dpo/margin": 196.9560089111328, "fcm_dpo/q_t": 0.4056174159049988, "grad_norm": 29.511402130126953, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.4060869514942169, "logits/rejected": -0.4065949320793152, "logps/chosen": -322.7398681640625, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -550.8636474609375, "loss": 1.0771, "margin_dpo/margin_mean": 196.95599365234375, "margin_dpo/margin_std": 243.63185119628906, "step": 328 }, { "KL/chosen_KL_mean": -295.07080078125, "KL/mean": -402.98480224609375, "KL/rejected_KL_mean": -510.89874267578125, "KL/std": 238.93309020996094, "epoch": 0.4831130690161527, "fcm_dpo/beta": 0.002052995143458247, "fcm_dpo/delta": -0.04573259502649307, "fcm_dpo/margin": 215.82797241210938, "fcm_dpo/q_t": 0.3972422778606415, "grad_norm": 23.92837142944336, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.4066659212112427, "logits/rejected": -0.3950307369232178, "logps/chosen": -360.3170166015625, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -611.596435546875, "loss": 1.0616, "margin_dpo/margin_mean": 215.82797241210938, "margin_dpo/margin_std": 258.53314208984375, "step": 329 }, { "KL/chosen_KL_mean": -256.5444641113281, "KL/mean": -369.53118896484375, "KL/rejected_KL_mean": -482.51788330078125, "KL/std": 233.46661376953125, "epoch": 0.4845814977973568, "fcm_dpo/beta": 0.002038386417552829, "fcm_dpo/delta": -0.06354449689388275, "fcm_dpo/margin": 225.9734344482422, "fcm_dpo/q_t": 0.3932916224002838, "grad_norm": 26.08322525024414, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.40733757615089417, "logits/rejected": -0.40871596336364746, "logps/chosen": -303.5428161621094, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -569.394775390625, "loss": 1.04, "margin_dpo/margin_mean": 225.9734344482422, "margin_dpo/margin_std": 254.14117431640625, "step": 330 }, { "KL/chosen_KL_mean": -283.42938232421875, "KL/mean": -396.7335205078125, "KL/rejected_KL_mean": -510.03765869140625, "KL/std": 195.69345092773438, "epoch": 0.48604992657856094, "fcm_dpo/beta": 0.0020131845958530903, "fcm_dpo/delta": -0.058914512395858765, "fcm_dpo/margin": 226.6082763671875, "fcm_dpo/q_t": 0.3916124403476715, "grad_norm": 24.841083526611328, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.4177253246307373, "logits/rejected": -0.41352635622024536, "logps/chosen": -333.95367431640625, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -599.0531005859375, "loss": 1.015, "margin_dpo/margin_mean": 226.6082763671875, "margin_dpo/margin_std": 190.06936645507812, "step": 331 }, { "KL/chosen_KL_mean": -292.7176513671875, "KL/mean": -374.81634521484375, "KL/rejected_KL_mean": -456.9150390625, "KL/std": 211.74179077148438, "epoch": 0.48751835535976507, "fcm_dpo/beta": 0.002031027339398861, "fcm_dpo/delta": 0.06868893653154373, "fcm_dpo/margin": 164.1973876953125, "fcm_dpo/q_t": 0.423465371131897, "grad_norm": 21.33700180053711, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.4500772953033447, "logits/rejected": -0.4330589771270752, "logps/chosen": -341.8979187011719, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -533.4002075195312, "loss": 1.1317, "margin_dpo/margin_mean": 164.1973876953125, "margin_dpo/margin_std": 231.62454223632812, "step": 332 }, { "KL/chosen_KL_mean": -320.2816162109375, "KL/mean": -428.072998046875, "KL/rejected_KL_mean": -535.8644409179688, "KL/std": 247.35726928710938, "epoch": 0.4889867841409692, "fcm_dpo/beta": 0.0020122663117945194, "fcm_dpo/delta": -0.03633493557572365, "fcm_dpo/margin": 215.58282470703125, "fcm_dpo/q_t": 0.40161222219467163, "grad_norm": 20.72551918029785, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.4343733787536621, "logits/rejected": -0.427177369594574, "logps/chosen": -384.037353515625, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -630.9085693359375, "loss": 1.0836, "margin_dpo/margin_mean": 215.58282470703125, "margin_dpo/margin_std": 294.230712890625, "step": 333 }, { "KL/chosen_KL_mean": -313.5834045410156, "KL/mean": -410.6392822265625, "KL/rejected_KL_mean": -507.6951904296875, "KL/std": 270.35736083984375, "epoch": 0.49045521292217326, "fcm_dpo/beta": 0.002020814223214984, "fcm_dpo/delta": 0.008031206205487251, "fcm_dpo/margin": 194.11180114746094, "fcm_dpo/q_t": 0.411624014377594, "grad_norm": 25.060213088989258, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.4468313455581665, "logits/rejected": -0.44486457109451294, "logps/chosen": -380.56317138671875, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -603.0120849609375, "loss": 1.121, "margin_dpo/margin_mean": 194.11180114746094, "margin_dpo/margin_std": 303.725341796875, "step": 334 }, { "KL/chosen_KL_mean": -346.2706604003906, "KL/mean": -420.2770080566406, "KL/rejected_KL_mean": -494.2833557128906, "KL/std": 254.03070068359375, "epoch": 0.4919236417033774, "fcm_dpo/beta": 0.0020229285582900047, "fcm_dpo/delta": -0.0058713615871965885, "fcm_dpo/margin": 148.0127410888672, "fcm_dpo/q_t": 0.4324929714202881, "grad_norm": 31.06818962097168, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.4586733281612396, "logits/rejected": -0.4365878701210022, "logps/chosen": -408.8131408691406, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.61770629882812, "logps/rejected": -581.9010620117188, "loss": 1.1873, "margin_dpo/margin_mean": 148.0127410888672, "margin_dpo/margin_std": 279.176025390625, "step": 335 }, { "KL/chosen_KL_mean": -344.3829345703125, "KL/mean": -469.1910095214844, "KL/rejected_KL_mean": -593.9991455078125, "KL/std": 296.28973388671875, "epoch": 0.4933920704845815, "fcm_dpo/beta": 0.001999348634853959, "fcm_dpo/delta": -0.10420601069927216, "fcm_dpo/margin": 249.61618041992188, "fcm_dpo/q_t": 0.38974490761756897, "grad_norm": 26.365034103393555, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.43404412269592285, "logits/rejected": -0.43407052755355835, "logps/chosen": -398.9140625, "logps/ref_chosen": -54.53115463256836, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -698.4033813476562, "loss": 1.0618, "margin_dpo/margin_mean": 249.61618041992188, "margin_dpo/margin_std": 351.21514892578125, "step": 336 }, { "KL/chosen_KL_mean": -312.9530029296875, "KL/mean": -438.01409912109375, "KL/rejected_KL_mean": -563.0751342773438, "KL/std": 238.04376220703125, "epoch": 0.4948604992657856, "fcm_dpo/beta": 0.0019480783957988024, "fcm_dpo/delta": -0.0921003520488739, "fcm_dpo/margin": 250.12213134765625, "fcm_dpo/q_t": 0.38848379254341125, "grad_norm": 26.773256301879883, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.48545369505882263, "logits/rejected": -0.47427335381507874, "logps/chosen": -378.08172607421875, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -664.8021240234375, "loss": 1.026, "margin_dpo/margin_mean": 250.12213134765625, "margin_dpo/margin_std": 282.658935546875, "step": 337 }, { "KL/chosen_KL_mean": -279.4560546875, "KL/mean": -396.4298400878906, "KL/rejected_KL_mean": -513.4036254882812, "KL/std": 230.5653533935547, "epoch": 0.49632892804698975, "fcm_dpo/beta": 0.001927088014781475, "fcm_dpo/delta": -0.05328977108001709, "fcm_dpo/margin": 233.9475555419922, "fcm_dpo/q_t": 0.39445608854293823, "grad_norm": 26.295778274536133, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.44427040219306946, "logits/rejected": -0.438721239566803, "logps/chosen": -337.8787536621094, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -602.47216796875, "loss": 1.0356, "margin_dpo/margin_mean": 233.94757080078125, "margin_dpo/margin_std": 240.76602172851562, "step": 338 }, { "KL/chosen_KL_mean": -305.31781005859375, "KL/mean": -421.2132568359375, "KL/rejected_KL_mean": -537.1087646484375, "KL/std": 242.9071044921875, "epoch": 0.4977973568281938, "fcm_dpo/beta": 0.0019262076821178198, "fcm_dpo/delta": -0.04981581121683121, "fcm_dpo/margin": 231.79090881347656, "fcm_dpo/q_t": 0.3960764408111572, "grad_norm": 24.693683624267578, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.4791075587272644, "logits/rejected": -0.46518805623054504, "logps/chosen": -365.3131408691406, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -641.0196533203125, "loss": 1.0471, "margin_dpo/margin_mean": 231.79090881347656, "margin_dpo/margin_std": 240.60723876953125, "step": 339 }, { "KL/chosen_KL_mean": -322.9487609863281, "KL/mean": -422.3816223144531, "KL/rejected_KL_mean": -521.814453125, "KL/std": 234.76840209960938, "epoch": 0.49926578560939794, "fcm_dpo/beta": 0.0019024586072191596, "fcm_dpo/delta": 0.0222429558634758, "fcm_dpo/margin": 198.86572265625, "fcm_dpo/q_t": 0.41325610876083374, "grad_norm": 33.532562255859375, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.4173485040664673, "logits/rejected": -0.3965187966823578, "logps/chosen": -375.77899169921875, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723114013672, "logps/rejected": -594.9216918945312, "loss": 1.1133, "margin_dpo/margin_mean": 198.86572265625, "margin_dpo/margin_std": 284.8116455078125, "step": 340 }, { "KL/chosen_KL_mean": -319.81683349609375, "KL/mean": -433.06732177734375, "KL/rejected_KL_mean": -546.3178100585938, "KL/std": 254.18038940429688, "epoch": 0.5007342143906021, "fcm_dpo/beta": 0.0018979123560711741, "fcm_dpo/delta": -0.03151214122772217, "fcm_dpo/margin": 226.50100708007812, "fcm_dpo/q_t": 0.40071308612823486, "grad_norm": 26.986059188842773, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.43833357095718384, "logits/rejected": -0.4382708966732025, "logps/chosen": -367.7166442871094, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -648.127685546875, "loss": 1.0612, "margin_dpo/margin_mean": 226.50100708007812, "margin_dpo/margin_std": 261.280517578125, "step": 341 }, { "KL/chosen_KL_mean": -317.66387939453125, "KL/mean": -418.5531005859375, "KL/rejected_KL_mean": -519.4423828125, "KL/std": 237.76806640625, "epoch": 0.5022026431718062, "fcm_dpo/beta": 0.0019124182872474194, "fcm_dpo/delta": 0.014296330511569977, "fcm_dpo/margin": 201.77845764160156, "fcm_dpo/q_t": 0.41131168603897095, "grad_norm": 24.853057861328125, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.4720449149608612, "logits/rejected": -0.4480515718460083, "logps/chosen": -389.6605224609375, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -612.031982421875, "loss": 1.1035, "margin_dpo/margin_mean": 201.77847290039062, "margin_dpo/margin_std": 277.33526611328125, "step": 342 }, { "KL/chosen_KL_mean": -305.1226806640625, "KL/mean": -425.2977294921875, "KL/rejected_KL_mean": -545.4727783203125, "KL/std": 241.86111450195312, "epoch": 0.5036710719530103, "fcm_dpo/beta": 0.0018888043705374002, "fcm_dpo/delta": -0.05659223720431328, "fcm_dpo/margin": 240.35008239746094, "fcm_dpo/q_t": 0.392503947019577, "grad_norm": 22.807832717895508, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.4819292724132538, "logits/rejected": -0.4859675168991089, "logps/chosen": -359.5283203125, "logps/ref_chosen": -54.405616760253906, "logps/ref_rejected": -111.04142761230469, "logps/rejected": -656.51416015625, "loss": 1.0242, "margin_dpo/margin_mean": 240.35009765625, "margin_dpo/margin_std": 217.2589874267578, "step": 343 }, { "KL/chosen_KL_mean": -299.1427001953125, "KL/mean": -402.548583984375, "KL/rejected_KL_mean": -505.9544677734375, "KL/std": 257.1475524902344, "epoch": 0.5051395007342144, "fcm_dpo/beta": 0.0019029853865504265, "fcm_dpo/delta": 0.0050534456968307495, "fcm_dpo/margin": 206.81173706054688, "fcm_dpo/q_t": 0.41077619791030884, "grad_norm": 28.06170654296875, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.45593854784965515, "logits/rejected": -0.45924174785614014, "logps/chosen": -353.10736083984375, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -596.5778198242188, "loss": 1.0943, "margin_dpo/margin_mean": 206.81173706054688, "margin_dpo/margin_std": 261.9127197265625, "step": 344 }, { "KL/chosen_KL_mean": -354.2217712402344, "KL/mean": -471.0653076171875, "KL/rejected_KL_mean": -587.9088745117188, "KL/std": 249.9000244140625, "epoch": 0.5066079295154186, "fcm_dpo/beta": 0.0018797038355842233, "fcm_dpo/delta": -0.04108835384249687, "fcm_dpo/margin": 233.6870880126953, "fcm_dpo/q_t": 0.3980991244316101, "grad_norm": 19.72064208984375, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.46400630474090576, "logits/rejected": -0.4607963263988495, "logps/chosen": -415.907470703125, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49041748046875, "logps/rejected": -687.3992919921875, "loss": 1.0559, "margin_dpo/margin_mean": 233.68710327148438, "margin_dpo/margin_std": 272.0915222167969, "step": 345 }, { "KL/chosen_KL_mean": -343.4974365234375, "KL/mean": -453.58154296875, "KL/rejected_KL_mean": -563.6656494140625, "KL/std": 240.47105407714844, "epoch": 0.5080763582966226, "fcm_dpo/beta": 0.0018681611400097609, "fcm_dpo/delta": -0.011793499812483788, "fcm_dpo/margin": 220.168212890625, "fcm_dpo/q_t": 0.4032723307609558, "grad_norm": 25.529489517211914, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.43244969844818115, "logits/rejected": -0.42079615592956543, "logps/chosen": -398.75372314453125, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -641.0809326171875, "loss": 1.0708, "margin_dpo/margin_mean": 220.168212890625, "margin_dpo/margin_std": 256.2587890625, "step": 346 }, { "KL/chosen_KL_mean": -339.9598388671875, "KL/mean": -450.0728454589844, "KL/rejected_KL_mean": -560.1858520507812, "KL/std": 254.47296142578125, "epoch": 0.5095447870778267, "fcm_dpo/beta": 0.00186370057053864, "fcm_dpo/delta": -0.010882144793868065, "fcm_dpo/margin": 220.22601318359375, "fcm_dpo/q_t": 0.4054613709449768, "grad_norm": 21.177968978881836, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.4151489734649658, "logits/rejected": -0.407335102558136, "logps/chosen": -397.52606201171875, "logps/ref_chosen": -57.56623840332031, "logps/ref_rejected": -92.35509490966797, "logps/rejected": -652.5409545898438, "loss": 1.0836, "margin_dpo/margin_mean": 220.22601318359375, "margin_dpo/margin_std": 285.551513671875, "step": 347 }, { "KL/chosen_KL_mean": -299.27691650390625, "KL/mean": -389.9990234375, "KL/rejected_KL_mean": -480.7210998535156, "KL/std": 215.0816650390625, "epoch": 0.5110132158590308, "fcm_dpo/beta": 0.0018694268073886633, "fcm_dpo/delta": 0.06262210756540298, "fcm_dpo/margin": 181.44418334960938, "fcm_dpo/q_t": 0.42170295119285583, "grad_norm": 25.129812240600586, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.49058231711387634, "logits/rejected": -0.49180328845977783, "logps/chosen": -355.5946044921875, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13836669921875, "logps/rejected": -569.8594970703125, "loss": 1.131, "margin_dpo/margin_mean": 181.44418334960938, "margin_dpo/margin_std": 251.6490020751953, "step": 348 }, { "KL/chosen_KL_mean": -280.0154724121094, "KL/mean": -386.7165832519531, "KL/rejected_KL_mean": -493.41766357421875, "KL/std": 241.07833862304688, "epoch": 0.5124816446402349, "fcm_dpo/beta": 0.0018920442089438438, "fcm_dpo/delta": -0.0041931793093681335, "fcm_dpo/margin": 213.40219116210938, "fcm_dpo/q_t": 0.40846750140190125, "grad_norm": 20.649566650390625, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.43403786420822144, "logits/rejected": -0.43399712443351746, "logps/chosen": -338.0409851074219, "logps/ref_chosen": -58.025516510009766, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -590.9228515625, "loss": 1.0975, "margin_dpo/margin_mean": 213.40220642089844, "margin_dpo/margin_std": 298.5577392578125, "step": 349 }, { "KL/chosen_KL_mean": -297.84869384765625, "KL/mean": -400.894775390625, "KL/rejected_KL_mean": -503.94085693359375, "KL/std": 231.921875, "epoch": 0.5139500734214391, "fcm_dpo/beta": 0.0018797710072249174, "fcm_dpo/delta": 0.012750823050737381, "fcm_dpo/margin": 206.09213256835938, "fcm_dpo/q_t": 0.4099721312522888, "grad_norm": 27.677011489868164, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.4817023277282715, "logits/rejected": -0.48322421312332153, "logps/chosen": -362.17919921875, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -593.8125, "loss": 1.1045, "margin_dpo/margin_mean": 206.09213256835938, "margin_dpo/margin_std": 283.2620544433594, "step": 350 }, { "KL/chosen_KL_mean": -282.1582336425781, "KL/mean": -409.1272277832031, "KL/rejected_KL_mean": -536.09619140625, "KL/std": 275.96209716796875, "epoch": 0.5154185022026432, "fcm_dpo/beta": 0.0018625000957399607, "fcm_dpo/delta": -0.07697418332099915, "fcm_dpo/margin": 253.93798828125, "fcm_dpo/q_t": 0.39376571774482727, "grad_norm": 27.421785354614258, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.4568382501602173, "logits/rejected": -0.4523654282093048, "logps/chosen": -342.8303527832031, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -637.66162109375, "loss": 1.0495, "margin_dpo/margin_mean": 253.93798828125, "margin_dpo/margin_std": 320.8644104003906, "step": 351 }, { "KL/chosen_KL_mean": -334.1682434082031, "KL/mean": -413.40325927734375, "KL/rejected_KL_mean": -492.63824462890625, "KL/std": 255.41502380371094, "epoch": 0.5168869309838473, "fcm_dpo/beta": 0.0018845018930733204, "fcm_dpo/delta": 0.1046164482831955, "fcm_dpo/margin": 158.46998596191406, "fcm_dpo/q_t": 0.43189874291419983, "grad_norm": 38.70182800292969, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.4881801903247833, "logits/rejected": -0.45606744289398193, "logps/chosen": -405.1116943359375, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -569.2802124023438, "loss": 1.1774, "margin_dpo/margin_mean": 158.46998596191406, "margin_dpo/margin_std": 281.17779541015625, "step": 352 }, { "KL/chosen_KL_mean": -315.9549865722656, "KL/mean": -417.49176025390625, "KL/rejected_KL_mean": -519.028564453125, "KL/std": 250.93426513671875, "epoch": 0.5183553597650514, "fcm_dpo/beta": 0.0018944459734484553, "fcm_dpo/delta": 0.015432950109243393, "fcm_dpo/margin": 203.0735626220703, "fcm_dpo/q_t": 0.41237473487854004, "grad_norm": 27.271644592285156, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.5225635170936584, "logits/rejected": -0.5105962157249451, "logps/chosen": -374.350341796875, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33553314208984, "logps/rejected": -599.3640747070312, "loss": 1.1081, "margin_dpo/margin_mean": 203.0735626220703, "margin_dpo/margin_std": 283.2236328125, "step": 353 }, { "KL/chosen_KL_mean": -264.493408203125, "KL/mean": -378.5892333984375, "KL/rejected_KL_mean": -492.68505859375, "KL/std": 236.89236450195312, "epoch": 0.5198237885462555, "fcm_dpo/beta": 0.0018932155799120665, "fcm_dpo/delta": -0.0335673987865448, "fcm_dpo/margin": 228.191650390625, "fcm_dpo/q_t": 0.3988182246685028, "grad_norm": 26.834888458251953, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.48592621088027954, "logits/rejected": -0.4727493226528168, "logps/chosen": -324.29638671875, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -581.4425659179688, "loss": 1.0474, "margin_dpo/margin_mean": 228.191650390625, "margin_dpo/margin_std": 236.67225646972656, "step": 354 }, { "KL/chosen_KL_mean": -266.8475341796875, "KL/mean": -379.05157470703125, "KL/rejected_KL_mean": -491.2556457519531, "KL/std": 240.4083251953125, "epoch": 0.5212922173274597, "fcm_dpo/beta": 0.0018815842922776937, "fcm_dpo/delta": -0.023417077958583832, "fcm_dpo/margin": 224.40811157226562, "fcm_dpo/q_t": 0.400523841381073, "grad_norm": 34.09590530395508, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.5412899255752563, "logits/rejected": -0.5298917293548584, "logps/chosen": -320.97601318359375, "logps/ref_chosen": -54.12849807739258, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -573.6617431640625, "loss": 1.056, "margin_dpo/margin_mean": 224.4081268310547, "margin_dpo/margin_std": 237.8057403564453, "step": 355 }, { "KL/chosen_KL_mean": -328.6931457519531, "KL/mean": -391.6343078613281, "KL/rejected_KL_mean": -454.5754699707031, "KL/std": 245.00680541992188, "epoch": 0.5227606461086637, "fcm_dpo/beta": 0.001889348030090332, "fcm_dpo/delta": 0.027527010068297386, "fcm_dpo/margin": 125.88235473632812, "fcm_dpo/q_t": 0.4463768005371094, "grad_norm": 29.567127227783203, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.5134952068328857, "logits/rejected": -0.49404820799827576, "logps/chosen": -393.366943359375, "logps/ref_chosen": -64.6738052368164, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -530.4747314453125, "loss": 1.241, "margin_dpo/margin_mean": 125.88235473632812, "margin_dpo/margin_std": 300.37933349609375, "step": 356 }, { "KL/chosen_KL_mean": -305.4680480957031, "KL/mean": -405.6684875488281, "KL/rejected_KL_mean": -505.8689270019531, "KL/std": 243.4521484375, "epoch": 0.5242290748898678, "fcm_dpo/beta": 0.0018927913624793291, "fcm_dpo/delta": 0.021241577342152596, "fcm_dpo/margin": 200.40087890625, "fcm_dpo/q_t": 0.4112818241119385, "grad_norm": 25.547868728637695, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.5083039999008179, "logits/rejected": -0.49678516387939453, "logps/chosen": -358.19384765625, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -592.7100830078125, "loss": 1.0964, "margin_dpo/margin_mean": 200.40087890625, "margin_dpo/margin_std": 248.13658142089844, "step": 357 }, { "KL/chosen_KL_mean": -281.19622802734375, "KL/mean": -375.6390380859375, "KL/rejected_KL_mean": -470.081787109375, "KL/std": 238.79005432128906, "epoch": 0.5256975036710719, "fcm_dpo/beta": 0.0019136819755658507, "fcm_dpo/delta": 0.039984140545129776, "fcm_dpo/margin": 188.8855743408203, "fcm_dpo/q_t": 0.41719043254852295, "grad_norm": 26.77370262145996, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.49271106719970703, "logits/rejected": -0.4746229648590088, "logps/chosen": -344.40167236328125, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -558.455078125, "loss": 1.1142, "margin_dpo/margin_mean": 188.8855743408203, "margin_dpo/margin_std": 258.85284423828125, "step": 358 }, { "KL/chosen_KL_mean": -321.8160400390625, "KL/mean": -434.1688537597656, "KL/rejected_KL_mean": -546.5216674804688, "KL/std": 240.7170867919922, "epoch": 0.527165932452276, "fcm_dpo/beta": 0.0019082968356087804, "fcm_dpo/delta": -0.030117180198431015, "fcm_dpo/margin": 224.70556640625, "fcm_dpo/q_t": 0.40221983194351196, "grad_norm": 35.96456527709961, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.4789687991142273, "logits/rejected": -0.4714996814727783, "logps/chosen": -378.186279296875, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -628.6954345703125, "loss": 1.0748, "margin_dpo/margin_mean": 224.70556640625, "margin_dpo/margin_std": 287.6982421875, "step": 359 }, { "KL/chosen_KL_mean": -325.312255859375, "KL/mean": -405.36517333984375, "KL/rejected_KL_mean": -485.41815185546875, "KL/std": 198.17393493652344, "epoch": 0.5286343612334802, "fcm_dpo/beta": 0.0019348189234733582, "fcm_dpo/delta": 0.09300471842288971, "fcm_dpo/margin": 160.10589599609375, "fcm_dpo/q_t": 0.42690205574035645, "grad_norm": 55.63818359375, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.4359634816646576, "logits/rejected": -0.422908216714859, "logps/chosen": -376.77264404296875, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892059326172, "logps/rejected": -555.257080078125, "loss": 1.1367, "margin_dpo/margin_mean": 160.10589599609375, "margin_dpo/margin_std": 199.83489990234375, "step": 360 }, { "KL/chosen_KL_mean": -338.5645751953125, "KL/mean": -420.45660400390625, "KL/rejected_KL_mean": -502.3486328125, "KL/std": 235.536376953125, "epoch": 0.5301027900146843, "fcm_dpo/beta": 0.0019699514377862215, "fcm_dpo/delta": 0.07963744550943375, "fcm_dpo/margin": 163.78407287597656, "fcm_dpo/q_t": 0.4253769516944885, "grad_norm": 43.57426071166992, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.44554078578948975, "logits/rejected": -0.4408929944038391, "logps/chosen": -392.43408203125, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.7692642211914, "logps/rejected": -593.117919921875, "loss": 1.1513, "margin_dpo/margin_mean": 163.7840576171875, "margin_dpo/margin_std": 258.46649169921875, "step": 361 }, { "KL/chosen_KL_mean": -293.4056396484375, "KL/mean": -428.86041259765625, "KL/rejected_KL_mean": -564.315185546875, "KL/std": 247.4333038330078, "epoch": 0.5315712187958884, "fcm_dpo/beta": 0.0019274294609203935, "fcm_dpo/delta": -0.1298113465309143, "fcm_dpo/margin": 270.9095153808594, "fcm_dpo/q_t": 0.38001787662506104, "grad_norm": 25.4000301361084, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.4862041473388672, "logits/rejected": -0.4968222975730896, "logps/chosen": -352.044677734375, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -669.8970947265625, "loss": 0.991, "margin_dpo/margin_mean": 270.9095458984375, "margin_dpo/margin_std": 266.97491455078125, "step": 362 }, { "KL/chosen_KL_mean": -263.7435607910156, "KL/mean": -401.75506591796875, "KL/rejected_KL_mean": -539.7665405273438, "KL/std": 247.50381469726562, "epoch": 0.5330396475770925, "fcm_dpo/beta": 0.0018918986897915602, "fcm_dpo/delta": -0.12874022126197815, "fcm_dpo/margin": 276.02301025390625, "fcm_dpo/q_t": 0.37962085008621216, "grad_norm": 24.668289184570312, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.4323340654373169, "logits/rejected": -0.4228121340274811, "logps/chosen": -308.30194091796875, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -614.4615478515625, "loss": 0.9916, "margin_dpo/margin_mean": 276.02301025390625, "margin_dpo/margin_std": 272.3272705078125, "step": 363 }, { "KL/chosen_KL_mean": -291.88446044921875, "KL/mean": -395.2333984375, "KL/rejected_KL_mean": -498.5823059082031, "KL/std": 239.70700073242188, "epoch": 0.5345080763582967, "fcm_dpo/beta": 0.0018847124883905053, "fcm_dpo/delta": 0.010454859584569931, "fcm_dpo/margin": 206.69784545898438, "fcm_dpo/q_t": 0.40976476669311523, "grad_norm": 24.834049224853516, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.4836190342903137, "logits/rejected": -0.4936879873275757, "logps/chosen": -340.779052734375, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -589.97802734375, "loss": 1.0996, "margin_dpo/margin_mean": 206.69784545898438, "margin_dpo/margin_std": 276.881103515625, "step": 364 }, { "KL/chosen_KL_mean": -279.62744140625, "KL/mean": -388.6301574707031, "KL/rejected_KL_mean": -497.63287353515625, "KL/std": 249.4620361328125, "epoch": 0.5359765051395007, "fcm_dpo/beta": 0.0018760417588055134, "fcm_dpo/delta": -0.009370389394462109, "fcm_dpo/margin": 218.00540161132812, "fcm_dpo/q_t": 0.40619686245918274, "grad_norm": 22.168062210083008, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.4246390461921692, "logits/rejected": -0.43436652421951294, "logps/chosen": -331.12017822265625, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -590.3345336914062, "loss": 1.0731, "margin_dpo/margin_mean": 218.00540161132812, "margin_dpo/margin_std": 258.5311279296875, "step": 365 }, { "KL/chosen_KL_mean": -257.15478515625, "KL/mean": -366.0369873046875, "KL/rejected_KL_mean": -474.91925048828125, "KL/std": 234.462890625, "epoch": 0.5374449339207048, "fcm_dpo/beta": 0.0018647974357008934, "fcm_dpo/delta": -0.006713632494211197, "fcm_dpo/margin": 217.76443481445312, "fcm_dpo/q_t": 0.4064916968345642, "grad_norm": 22.63542366027832, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.4495304822921753, "logits/rejected": -0.46502619981765747, "logps/chosen": -301.8753662109375, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -558.2296142578125, "loss": 1.086, "margin_dpo/margin_mean": 217.76443481445312, "margin_dpo/margin_std": 280.0198669433594, "step": 366 }, { "KL/chosen_KL_mean": -264.7220458984375, "KL/mean": -356.255859375, "KL/rejected_KL_mean": -447.7897033691406, "KL/std": 209.75563049316406, "epoch": 0.5389133627019089, "fcm_dpo/beta": 0.0018905512988567352, "fcm_dpo/delta": 0.05568384379148483, "fcm_dpo/margin": 183.06765747070312, "fcm_dpo/q_t": 0.4182465672492981, "grad_norm": 18.776704788208008, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.4651241898536682, "logits/rejected": -0.44852566719055176, "logps/chosen": -323.12744140625, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -524.541015625, "loss": 1.1141, "margin_dpo/margin_mean": 183.06765747070312, "margin_dpo/margin_std": 226.84693908691406, "step": 367 }, { "KL/chosen_KL_mean": -251.9151153564453, "KL/mean": -395.54022216796875, "KL/rejected_KL_mean": -539.165283203125, "KL/std": 242.84780883789062, "epoch": 0.540381791483113, "fcm_dpo/beta": 0.0018544028280302882, "fcm_dpo/delta": -0.14009898900985718, "fcm_dpo/margin": 287.25018310546875, "fcm_dpo/q_t": 0.373285174369812, "grad_norm": 33.4195671081543, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.4662426710128784, "logits/rejected": -0.47398853302001953, "logps/chosen": -296.36761474609375, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -637.7205810546875, "loss": 0.9614, "margin_dpo/margin_mean": 287.25018310546875, "margin_dpo/margin_std": 218.70684814453125, "step": 368 }, { "KL/chosen_KL_mean": -325.591796875, "KL/mean": -402.11102294921875, "KL/rejected_KL_mean": -478.63018798828125, "KL/std": 241.74417114257812, "epoch": 0.5418502202643172, "fcm_dpo/beta": 0.0018784540006890893, "fcm_dpo/delta": 0.11527148634195328, "fcm_dpo/margin": 153.03839111328125, "fcm_dpo/q_t": 0.43329665064811707, "grad_norm": 27.64653968811035, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.44844913482666016, "logits/rejected": -0.4270949065685272, "logps/chosen": -396.97332763671875, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -569.926025390625, "loss": 1.1808, "margin_dpo/margin_mean": 153.0383758544922, "margin_dpo/margin_std": 270.01470947265625, "step": 369 }, { "KL/chosen_KL_mean": -349.7052307128906, "KL/mean": -421.733642578125, "KL/rejected_KL_mean": -493.76202392578125, "KL/std": 252.10357666015625, "epoch": 0.5433186490455213, "fcm_dpo/beta": 0.0019235580693930387, "fcm_dpo/delta": 0.12600602209568024, "fcm_dpo/margin": 144.05679321289062, "fcm_dpo/q_t": 0.43653106689453125, "grad_norm": 27.10540199279785, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.4947051405906677, "logits/rejected": -0.487566202878952, "logps/chosen": -421.312744140625, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -591.0218505859375, "loss": 1.1983, "margin_dpo/margin_mean": 144.05679321289062, "margin_dpo/margin_std": 280.3094482421875, "step": 370 }, { "KL/chosen_KL_mean": -337.39990234375, "KL/mean": -443.57403564453125, "KL/rejected_KL_mean": -549.7481689453125, "KL/std": 258.0762939453125, "epoch": 0.5447870778267254, "fcm_dpo/beta": 0.0019333376549184322, "fcm_dpo/delta": -0.011104363948106766, "fcm_dpo/margin": 212.34832763671875, "fcm_dpo/q_t": 0.40682950615882874, "grad_norm": 26.90560531616211, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.5080785751342773, "logits/rejected": -0.4954741299152374, "logps/chosen": -406.81439208984375, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -648.9203491210938, "loss": 1.095, "margin_dpo/margin_mean": 212.34832763671875, "margin_dpo/margin_std": 297.38665771484375, "step": 371 }, { "KL/chosen_KL_mean": -323.286865234375, "KL/mean": -451.05670166015625, "KL/rejected_KL_mean": -578.8265991210938, "KL/std": 294.25408935546875, "epoch": 0.5462555066079295, "fcm_dpo/beta": 0.0018996518338099122, "fcm_dpo/delta": -0.08990687876939774, "fcm_dpo/margin": 255.53970336914062, "fcm_dpo/q_t": 0.3917329013347626, "grad_norm": 22.939546585083008, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.4645116329193115, "logits/rejected": -0.444297730922699, "logps/chosen": -385.1048583984375, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53948974609375, "logps/rejected": -657.3660888671875, "loss": 1.0443, "margin_dpo/margin_mean": 255.53970336914062, "margin_dpo/margin_std": 328.15814208984375, "step": 372 }, { "KL/chosen_KL_mean": -354.2593994140625, "KL/mean": -475.4353332519531, "KL/rejected_KL_mean": -596.6112060546875, "KL/std": 283.9608459472656, "epoch": 0.5477239353891337, "fcm_dpo/beta": 0.001885814475826919, "fcm_dpo/delta": -0.060002297163009644, "fcm_dpo/margin": 242.35189819335938, "fcm_dpo/q_t": 0.393940806388855, "grad_norm": 27.45345115661621, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.46065136790275574, "logits/rejected": -0.438961923122406, "logps/chosen": -418.4765319824219, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -682.5708618164062, "loss": 1.0473, "margin_dpo/margin_mean": 242.35189819335938, "margin_dpo/margin_std": 272.34967041015625, "step": 373 }, { "KL/chosen_KL_mean": -322.3292236328125, "KL/mean": -431.51788330078125, "KL/rejected_KL_mean": -540.70654296875, "KL/std": 311.81536865234375, "epoch": 0.5491923641703378, "fcm_dpo/beta": 0.0018586989026516676, "fcm_dpo/delta": -0.0067335814237594604, "fcm_dpo/margin": 218.37728881835938, "fcm_dpo/q_t": 0.4113299250602722, "grad_norm": 24.44922637939453, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.524357795715332, "logits/rejected": -0.5057187676429749, "logps/chosen": -392.97943115234375, "logps/ref_chosen": -70.65018463134766, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -634.3466796875, "loss": 1.1182, "margin_dpo/margin_mean": 218.37728881835938, "margin_dpo/margin_std": 353.18353271484375, "step": 374 }, { "KL/chosen_KL_mean": -333.91644287109375, "KL/mean": -439.2584228515625, "KL/rejected_KL_mean": -544.6004638671875, "KL/std": 251.31211853027344, "epoch": 0.5506607929515418, "fcm_dpo/beta": 0.0018582877237349749, "fcm_dpo/delta": 0.008078165352344513, "fcm_dpo/margin": 210.68402099609375, "fcm_dpo/q_t": 0.4099903106689453, "grad_norm": 27.943613052368164, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.4510612487792969, "logits/rejected": -0.44956958293914795, "logps/chosen": -393.9966735839844, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -633.5387573242188, "loss": 1.1099, "margin_dpo/margin_mean": 210.68402099609375, "margin_dpo/margin_std": 301.5238952636719, "step": 375 }, { "KL/chosen_KL_mean": -335.46563720703125, "KL/mean": -467.35919189453125, "KL/rejected_KL_mean": -599.252685546875, "KL/std": 275.66827392578125, "epoch": 0.5521292217327459, "fcm_dpo/beta": 0.001843743957579136, "fcm_dpo/delta": -0.09088477492332458, "fcm_dpo/margin": 263.78704833984375, "fcm_dpo/q_t": 0.3887389302253723, "grad_norm": 23.84757423400879, "learning_rate": 2.5e-07, "logits/chosen": -0.4562457203865051, "logits/rejected": -0.446555495262146, "logps/chosen": -398.1259460449219, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.52660369873047, "logps/rejected": -704.779296875, "loss": 1.0383, "margin_dpo/margin_mean": 263.7870788574219, "margin_dpo/margin_std": 319.8635559082031, "step": 376 }, { "KL/chosen_KL_mean": -334.3457946777344, "KL/mean": -462.3714599609375, "KL/rejected_KL_mean": -590.397216796875, "KL/std": 288.103515625, "epoch": 0.55359765051395, "fcm_dpo/beta": 0.0018218334298580885, "fcm_dpo/delta": -0.0696791335940361, "fcm_dpo/margin": 256.0513916015625, "fcm_dpo/q_t": 0.3930322229862213, "grad_norm": 21.212696075439453, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.40101295709609985, "logits/rejected": -0.3998126685619354, "logps/chosen": -388.82452392578125, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -689.1005249023438, "loss": 1.0428, "margin_dpo/margin_mean": 256.0513916015625, "margin_dpo/margin_std": 300.54119873046875, "step": 377 }, { "KL/chosen_KL_mean": -315.5291748046875, "KL/mean": -449.3112487792969, "KL/rejected_KL_mean": -583.0933227539062, "KL/std": 265.04840087890625, "epoch": 0.5550660792951542, "fcm_dpo/beta": 0.001788057736121118, "fcm_dpo/delta": -0.08244302868843079, "fcm_dpo/margin": 267.56414794921875, "fcm_dpo/q_t": 0.38815170526504517, "grad_norm": 26.153120040893555, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.42576664686203003, "logits/rejected": -0.4415106773376465, "logps/chosen": -360.5497131347656, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -671.1402587890625, "loss": 1.025, "margin_dpo/margin_mean": 267.56414794921875, "margin_dpo/margin_std": 286.3600769042969, "step": 378 }, { "KL/chosen_KL_mean": -355.9398193359375, "KL/mean": -485.5458984375, "KL/rejected_KL_mean": -615.1519775390625, "KL/std": 267.96209716796875, "epoch": 0.5565345080763583, "fcm_dpo/beta": 0.0017578438855707645, "fcm_dpo/delta": -0.05908029526472092, "fcm_dpo/margin": 259.2121887207031, "fcm_dpo/q_t": 0.3962175250053406, "grad_norm": 28.71318244934082, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.42304420471191406, "logits/rejected": -0.44598186016082764, "logps/chosen": -409.1219177246094, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.3001708984375, "logps/rejected": -729.4521484375, "loss": 1.0578, "margin_dpo/margin_mean": 259.21221923828125, "margin_dpo/margin_std": 324.53790283203125, "step": 379 }, { "KL/chosen_KL_mean": -336.93890380859375, "KL/mean": -479.41204833984375, "KL/rejected_KL_mean": -621.8851318359375, "KL/std": 297.79949951171875, "epoch": 0.5580029368575624, "fcm_dpo/beta": 0.0017373515293002129, "fcm_dpo/delta": -0.09990786015987396, "fcm_dpo/margin": 284.94622802734375, "fcm_dpo/q_t": 0.38500848412513733, "grad_norm": 25.73267364501953, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.43626442551612854, "logits/rejected": -0.4630964398384094, "logps/chosen": -388.29193115234375, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -726.0767822265625, "loss": 1.0264, "margin_dpo/margin_mean": 284.94622802734375, "margin_dpo/margin_std": 328.1457824707031, "step": 380 }, { "KL/chosen_KL_mean": -349.9728088378906, "KL/mean": -449.6802978515625, "KL/rejected_KL_mean": -549.3878173828125, "KL/std": 246.59634399414062, "epoch": 0.5594713656387665, "fcm_dpo/beta": 0.0017377103213220835, "fcm_dpo/delta": 0.055430181324481964, "fcm_dpo/margin": 199.41497802734375, "fcm_dpo/q_t": 0.42109525203704834, "grad_norm": 24.38262939453125, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.4588872790336609, "logits/rejected": -0.4429172873497009, "logps/chosen": -407.77587890625, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -628.607177734375, "loss": 1.1478, "margin_dpo/margin_mean": 199.41497802734375, "margin_dpo/margin_std": 332.27398681640625, "step": 381 }, { "KL/chosen_KL_mean": -328.7398681640625, "KL/mean": -452.5684509277344, "KL/rejected_KL_mean": -576.39697265625, "KL/std": 232.17242431640625, "epoch": 0.5609397944199707, "fcm_dpo/beta": 0.001735961064696312, "fcm_dpo/delta": -0.031305499374866486, "fcm_dpo/margin": 247.65719604492188, "fcm_dpo/q_t": 0.3991192877292633, "grad_norm": 26.342195510864258, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.48217618465423584, "logits/rejected": -0.48925304412841797, "logps/chosen": -394.7601623535156, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71016693115234, "logps/rejected": -687.107177734375, "loss": 1.0484, "margin_dpo/margin_mean": 247.65719604492188, "margin_dpo/margin_std": 255.80958557128906, "step": 382 }, { "KL/chosen_KL_mean": -337.642822265625, "KL/mean": -457.468505859375, "KL/rejected_KL_mean": -577.294189453125, "KL/std": 262.3541564941406, "epoch": 0.5624082232011748, "fcm_dpo/beta": 0.0017293533310294151, "fcm_dpo/delta": -0.015089768916368484, "fcm_dpo/margin": 239.6513671875, "fcm_dpo/q_t": 0.40462052822113037, "grad_norm": 30.611806869506836, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.4417022466659546, "logits/rejected": -0.4511658549308777, "logps/chosen": -388.0343017578125, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -671.0100708007812, "loss": 1.0938, "margin_dpo/margin_mean": 239.65135192871094, "margin_dpo/margin_std": 338.258544921875, "step": 383 }, { "KL/chosen_KL_mean": -352.222900390625, "KL/mean": -446.8468322753906, "KL/rejected_KL_mean": -541.4708251953125, "KL/std": 242.38162231445312, "epoch": 0.5638766519823789, "fcm_dpo/beta": 0.0017538972897455096, "fcm_dpo/delta": 0.06988409906625748, "fcm_dpo/margin": 189.2479248046875, "fcm_dpo/q_t": 0.4214822053909302, "grad_norm": 24.98710060119629, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.4990885853767395, "logits/rejected": -0.4952540993690491, "logps/chosen": -404.26898193359375, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -627.231689453125, "loss": 1.1222, "margin_dpo/margin_mean": 189.2479248046875, "margin_dpo/margin_std": 231.77182006835938, "step": 384 }, { "KL/chosen_KL_mean": -317.3160400390625, "KL/mean": -436.0095520019531, "KL/rejected_KL_mean": -554.7030639648438, "KL/std": 214.3333740234375, "epoch": 0.5653450807635829, "fcm_dpo/beta": 0.0017491495236754417, "fcm_dpo/delta": -0.015889476984739304, "fcm_dpo/margin": 237.3870086669922, "fcm_dpo/q_t": 0.40170639753341675, "grad_norm": 29.25759506225586, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.5247458219528198, "logits/rejected": -0.5023648738861084, "logps/chosen": -382.8681945800781, "logps/ref_chosen": -65.55215454101562, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -632.531005859375, "loss": 1.055, "margin_dpo/margin_mean": 237.38702392578125, "margin_dpo/margin_std": 238.87646484375, "step": 385 }, { "KL/chosen_KL_mean": -334.15228271484375, "KL/mean": -458.6162109375, "KL/rejected_KL_mean": -583.0802001953125, "KL/std": 262.49871826171875, "epoch": 0.566813509544787, "fcm_dpo/beta": 0.0017403860110789537, "fcm_dpo/delta": -0.03472103923559189, "fcm_dpo/margin": 248.92791748046875, "fcm_dpo/q_t": 0.3999601900577545, "grad_norm": 26.947490692138672, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.5123308300971985, "logits/rejected": -0.5101590156555176, "logps/chosen": -392.3741455078125, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -675.4075927734375, "loss": 1.0633, "margin_dpo/margin_mean": 248.9279022216797, "margin_dpo/margin_std": 302.4674987792969, "step": 386 }, { "KL/chosen_KL_mean": -361.41119384765625, "KL/mean": -466.2529602050781, "KL/rejected_KL_mean": -571.0947265625, "KL/std": 245.76097106933594, "epoch": 0.5682819383259912, "fcm_dpo/beta": 0.0017379240598529577, "fcm_dpo/delta": 0.036699328571558, "fcm_dpo/margin": 209.68359375, "fcm_dpo/q_t": 0.4142611622810364, "grad_norm": 30.391345977783203, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.5567930340766907, "logits/rejected": -0.5412279367446899, "logps/chosen": -427.83062744140625, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -663.263916015625, "loss": 1.1027, "margin_dpo/margin_mean": 209.68357849121094, "margin_dpo/margin_std": 252.79678344726562, "step": 387 }, { "KL/chosen_KL_mean": -343.9563903808594, "KL/mean": -484.5562744140625, "KL/rejected_KL_mean": -625.1561889648438, "KL/std": 287.8593444824219, "epoch": 0.5697503671071953, "fcm_dpo/beta": 0.0017218522261828184, "fcm_dpo/delta": -0.08878612518310547, "fcm_dpo/margin": 281.1998291015625, "fcm_dpo/q_t": 0.390036940574646, "grad_norm": 26.922496795654297, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.46857941150665283, "logits/rejected": -0.48115378618240356, "logps/chosen": -394.0858459472656, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -729.5892333984375, "loss": 1.03, "margin_dpo/margin_mean": 281.1997985839844, "margin_dpo/margin_std": 325.59906005859375, "step": 388 }, { "KL/chosen_KL_mean": -386.0830383300781, "KL/mean": -507.82427978515625, "KL/rejected_KL_mean": -629.5654296875, "KL/std": 286.28631591796875, "epoch": 0.5712187958883994, "fcm_dpo/beta": 0.0017109981272369623, "fcm_dpo/delta": -0.01739252358675003, "fcm_dpo/margin": 243.48245239257812, "fcm_dpo/q_t": 0.4042537808418274, "grad_norm": 24.15456771850586, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.5262615084648132, "logits/rejected": -0.5186604261398315, "logps/chosen": -443.9896240234375, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -707.47998046875, "loss": 1.0799, "margin_dpo/margin_mean": 243.48245239257812, "margin_dpo/margin_std": 314.66058349609375, "step": 389 }, { "KL/chosen_KL_mean": -384.50494384765625, "KL/mean": -505.17059326171875, "KL/rejected_KL_mean": -625.836181640625, "KL/std": 288.66546630859375, "epoch": 0.5726872246696035, "fcm_dpo/beta": 0.0017028467264026403, "fcm_dpo/delta": -0.011735277250409126, "fcm_dpo/margin": 241.33126831054688, "fcm_dpo/q_t": 0.4092911183834076, "grad_norm": 26.528804779052734, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.46930596232414246, "logits/rejected": -0.46219387650489807, "logps/chosen": -433.7308349609375, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -711.3643798828125, "loss": 1.1069, "margin_dpo/margin_mean": 241.33126831054688, "margin_dpo/margin_std": 371.08599853515625, "step": 390 }, { "KL/chosen_KL_mean": -386.6776123046875, "KL/mean": -456.50341796875, "KL/rejected_KL_mean": -526.3292236328125, "KL/std": 271.3560791015625, "epoch": 0.5741556534508077, "fcm_dpo/beta": 0.0017502898117527366, "fcm_dpo/delta": 0.15936514735221863, "fcm_dpo/margin": 139.651611328125, "fcm_dpo/q_t": 0.4439963400363922, "grad_norm": 45.76322555541992, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.5254815220832825, "logits/rejected": -0.5183066725730896, "logps/chosen": -451.00726318359375, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -613.0674438476562, "loss": 1.2173, "margin_dpo/margin_mean": 139.65162658691406, "margin_dpo/margin_std": 283.3598937988281, "step": 391 }, { "KL/chosen_KL_mean": -326.25396728515625, "KL/mean": -469.5254211425781, "KL/rejected_KL_mean": -612.796875, "KL/std": 288.09954833984375, "epoch": 0.5756240822320118, "fcm_dpo/beta": 0.00174234458245337, "fcm_dpo/delta": -0.10435783863067627, "fcm_dpo/margin": 286.5428771972656, "fcm_dpo/q_t": 0.38457435369491577, "grad_norm": 23.80723762512207, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.4814883768558502, "logits/rejected": -0.4757598340511322, "logps/chosen": -379.7579345703125, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34584045410156, "logps/rejected": -715.1427001953125, "loss": 1.0049, "margin_dpo/margin_mean": 286.5428771972656, "margin_dpo/margin_std": 283.0867004394531, "step": 392 }, { "KL/chosen_KL_mean": -324.5339660644531, "KL/mean": -436.1412353515625, "KL/rejected_KL_mean": -547.74853515625, "KL/std": 283.82720947265625, "epoch": 0.5770925110132159, "fcm_dpo/beta": 0.0017277842853218317, "fcm_dpo/delta": 0.014872867614030838, "fcm_dpo/margin": 223.2145233154297, "fcm_dpo/q_t": 0.41501516103744507, "grad_norm": 21.790613174438477, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.46388766169548035, "logits/rejected": -0.46215295791625977, "logps/chosen": -371.00787353515625, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -619.7174072265625, "loss": 1.1214, "margin_dpo/margin_mean": 223.21453857421875, "margin_dpo/margin_std": 357.96539306640625, "step": 393 }, { "KL/chosen_KL_mean": -370.5946044921875, "KL/mean": -487.31060791015625, "KL/rejected_KL_mean": -604.026611328125, "KL/std": 303.3579406738281, "epoch": 0.57856093979442, "fcm_dpo/beta": 0.001729074981994927, "fcm_dpo/delta": -0.0038064131513237953, "fcm_dpo/margin": 233.43197631835938, "fcm_dpo/q_t": 0.4082695245742798, "grad_norm": 26.26580810546875, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.4947393238544464, "logits/rejected": -0.4953378438949585, "logps/chosen": -423.50616455078125, "logps/ref_chosen": -52.91154861450195, "logps/ref_rejected": -90.8226318359375, "logps/rejected": -694.8492431640625, "loss": 1.0907, "margin_dpo/margin_mean": 233.43197631835938, "margin_dpo/margin_std": 314.2247314453125, "step": 394 }, { "KL/chosen_KL_mean": -367.9709777832031, "KL/mean": -494.29119873046875, "KL/rejected_KL_mean": -620.6114501953125, "KL/std": 292.8189697265625, "epoch": 0.580029368575624, "fcm_dpo/beta": 0.001716281520202756, "fcm_dpo/delta": -0.03543686866760254, "fcm_dpo/margin": 252.64047241210938, "fcm_dpo/q_t": 0.4020352363586426, "grad_norm": 25.020362854003906, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.5321957468986511, "logits/rejected": -0.5300949811935425, "logps/chosen": -430.51708984375, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -704.39404296875, "loss": 1.0783, "margin_dpo/margin_mean": 252.64048767089844, "margin_dpo/margin_std": 341.7522888183594, "step": 395 }, { "KL/chosen_KL_mean": -370.54937744140625, "KL/mean": -482.26983642578125, "KL/rejected_KL_mean": -593.9903564453125, "KL/std": 286.0380554199219, "epoch": 0.5814977973568282, "fcm_dpo/beta": 0.001719313906505704, "fcm_dpo/delta": 0.016418248414993286, "fcm_dpo/margin": 223.4409942626953, "fcm_dpo/q_t": 0.4104015827178955, "grad_norm": 26.507614135742188, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.5050040483474731, "logits/rejected": -0.4854010343551636, "logps/chosen": -439.5453186035156, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -682.6370239257812, "loss": 1.0883, "margin_dpo/margin_mean": 223.44097900390625, "margin_dpo/margin_std": 268.4827880859375, "step": 396 }, { "KL/chosen_KL_mean": -350.86260986328125, "KL/mean": -498.1780700683594, "KL/rejected_KL_mean": -645.4935302734375, "KL/std": 290.6457214355469, "epoch": 0.5829662261380323, "fcm_dpo/beta": 0.0016906873788684607, "fcm_dpo/delta": -0.10405933111906052, "fcm_dpo/margin": 294.6309509277344, "fcm_dpo/q_t": 0.38490188121795654, "grad_norm": 32.973846435546875, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.49156516790390015, "logits/rejected": -0.4970093369483948, "logps/chosen": -412.1397705078125, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -748.6097412109375, "loss": 1.0113, "margin_dpo/margin_mean": 294.6309509277344, "margin_dpo/margin_std": 307.0301513671875, "step": 397 }, { "KL/chosen_KL_mean": -376.7286376953125, "KL/mean": -511.4410400390625, "KL/rejected_KL_mean": -646.1535034179688, "KL/std": 304.7323303222656, "epoch": 0.5844346549192364, "fcm_dpo/beta": 0.001674711937084794, "fcm_dpo/delta": -0.053648628294467926, "fcm_dpo/margin": 269.42486572265625, "fcm_dpo/q_t": 0.3986341953277588, "grad_norm": 23.229272842407227, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.5308432579040527, "logits/rejected": -0.5298266410827637, "logps/chosen": -444.88018798828125, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -754.6771240234375, "loss": 1.0636, "margin_dpo/margin_mean": 269.42486572265625, "margin_dpo/margin_std": 354.4512634277344, "step": 398 }, { "KL/chosen_KL_mean": -326.7265625, "KL/mean": -445.32049560546875, "KL/rejected_KL_mean": -563.9144287109375, "KL/std": 262.29473876953125, "epoch": 0.5859030837004405, "fcm_dpo/beta": 0.0016672208439558744, "fcm_dpo/delta": 0.004600860178470612, "fcm_dpo/margin": 237.18785095214844, "fcm_dpo/q_t": 0.40930691361427307, "grad_norm": 31.19171142578125, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.4892912209033966, "logits/rejected": -0.45055025815963745, "logps/chosen": -387.6163635253906, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.965576171875, "logps/rejected": -641.8800048828125, "loss": 1.1002, "margin_dpo/margin_mean": 237.1878662109375, "margin_dpo/margin_std": 328.7167663574219, "step": 399 }, { "KL/chosen_KL_mean": -316.741943359375, "KL/mean": -485.7972412109375, "KL/rejected_KL_mean": -654.8525390625, "KL/std": 280.26068115234375, "epoch": 0.5873715124816447, "fcm_dpo/beta": 0.001628828700631857, "fcm_dpo/delta": -0.1600421667098999, "fcm_dpo/margin": 338.1106872558594, "fcm_dpo/q_t": 0.3711463212966919, "grad_norm": 22.955949783325195, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.5029030442237854, "logits/rejected": -0.4994921386241913, "logps/chosen": -380.385498046875, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -760.105224609375, "loss": 0.9701, "margin_dpo/margin_mean": 338.11065673828125, "margin_dpo/margin_std": 314.16839599609375, "step": 400 }, { "KL/chosen_KL_mean": -364.88543701171875, "KL/mean": -461.265869140625, "KL/rejected_KL_mean": -557.6463012695312, "KL/std": 291.4111022949219, "epoch": 0.5888399412628488, "fcm_dpo/beta": 0.001636154600419104, "fcm_dpo/delta": 0.0874527096748352, "fcm_dpo/margin": 192.76087951660156, "fcm_dpo/q_t": 0.43033739924430847, "grad_norm": 27.67872428894043, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.499002069234848, "logits/rejected": -0.49258700013160706, "logps/chosen": -422.0484619140625, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -641.4387817382812, "loss": 1.1922, "margin_dpo/margin_mean": 192.7608642578125, "margin_dpo/margin_std": 390.67706298828125, "step": 401 }, { "KL/chosen_KL_mean": -276.1816711425781, "KL/mean": -451.49737548828125, "KL/rejected_KL_mean": -626.81298828125, "KL/std": 308.62689208984375, "epoch": 0.5903083700440529, "fcm_dpo/beta": 0.0016060995403677225, "fcm_dpo/delta": -0.17291411757469177, "fcm_dpo/margin": 350.63134765625, "fcm_dpo/q_t": 0.3695389926433563, "grad_norm": 34.74311065673828, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.4849190413951874, "logits/rejected": -0.4742359220981598, "logps/chosen": -326.92205810546875, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -707.8590698242188, "loss": 0.9581, "margin_dpo/margin_mean": 350.63134765625, "margin_dpo/margin_std": 317.71551513671875, "step": 402 }, { "KL/chosen_KL_mean": -312.72406005859375, "KL/mean": -446.8978271484375, "KL/rejected_KL_mean": -581.0715942382812, "KL/std": 288.34051513671875, "epoch": 0.591776798825257, "fcm_dpo/beta": 0.001585017773322761, "fcm_dpo/delta": -0.026479586958885193, "fcm_dpo/margin": 268.3475341796875, "fcm_dpo/q_t": 0.4014202356338501, "grad_norm": 23.439414978027344, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.49934089183807373, "logits/rejected": -0.50015789270401, "logps/chosen": -359.87139892578125, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -658.3381958007812, "loss": 1.0603, "margin_dpo/margin_mean": 268.3475341796875, "margin_dpo/margin_std": 303.0990295410156, "step": 403 }, { "KL/chosen_KL_mean": -346.9129333496094, "KL/mean": -478.5081787109375, "KL/rejected_KL_mean": -610.1033935546875, "KL/std": 282.465087890625, "epoch": 0.593245227606461, "fcm_dpo/beta": 0.00157838873565197, "fcm_dpo/delta": -0.016118429601192474, "fcm_dpo/margin": 263.1905517578125, "fcm_dpo/q_t": 0.40540170669555664, "grad_norm": 29.329235076904297, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.4767064154148102, "logits/rejected": -0.46850764751434326, "logps/chosen": -394.7882080078125, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -687.2584228515625, "loss": 1.0909, "margin_dpo/margin_mean": 263.1905517578125, "margin_dpo/margin_std": 365.6813659667969, "step": 404 }, { "KL/chosen_KL_mean": -388.49285888671875, "KL/mean": -496.2237243652344, "KL/rejected_KL_mean": -603.95458984375, "KL/std": 306.67413330078125, "epoch": 0.5947136563876652, "fcm_dpo/beta": 0.0015723207034170628, "fcm_dpo/delta": -0.039775051176548004, "fcm_dpo/margin": 215.46173095703125, "fcm_dpo/q_t": 0.423758327960968, "grad_norm": 30.571292877197266, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.45147573947906494, "logits/rejected": -0.441570520401001, "logps/chosen": -453.65576171875, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -691.141357421875, "loss": 1.1641, "margin_dpo/margin_mean": 215.4617462158203, "margin_dpo/margin_std": 380.0777587890625, "step": 405 }, { "KL/chosen_KL_mean": -344.60443115234375, "KL/mean": -486.24908447265625, "KL/rejected_KL_mean": -627.893798828125, "KL/std": 301.5284423828125, "epoch": 0.5961820851688693, "fcm_dpo/beta": 0.0015619369223713875, "fcm_dpo/delta": -0.044593267142772675, "fcm_dpo/margin": 283.2893981933594, "fcm_dpo/q_t": 0.39848363399505615, "grad_norm": 23.295684814453125, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.5039137005805969, "logits/rejected": -0.5129928588867188, "logps/chosen": -394.34521484375, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -719.972412109375, "loss": 1.0554, "margin_dpo/margin_mean": 283.28936767578125, "margin_dpo/margin_std": 333.28466796875, "step": 406 }, { "KL/chosen_KL_mean": -370.0421142578125, "KL/mean": -455.7298889160156, "KL/rejected_KL_mean": -541.4176025390625, "KL/std": 237.25067138671875, "epoch": 0.5976505139500734, "fcm_dpo/beta": 0.0015546645736321807, "fcm_dpo/delta": 0.016098780557513237, "fcm_dpo/margin": 171.37547302246094, "fcm_dpo/q_t": 0.436930388212204, "grad_norm": 48.98335647583008, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.4963209331035614, "logits/rejected": -0.47049441933631897, "logps/chosen": -426.3728332519531, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.51209259033203, "logps/rejected": -618.9296875, "loss": 1.1985, "margin_dpo/margin_mean": 171.37548828125, "margin_dpo/margin_std": 317.56884765625, "step": 407 }, { "KL/chosen_KL_mean": -374.8062744140625, "KL/mean": -474.4832458496094, "KL/rejected_KL_mean": -574.1602783203125, "KL/std": 233.34344482421875, "epoch": 0.5991189427312775, "fcm_dpo/beta": 0.0015771770849823952, "fcm_dpo/delta": 0.08839617669582367, "fcm_dpo/margin": 199.35397338867188, "fcm_dpo/q_t": 0.4258885979652405, "grad_norm": 24.81488037109375, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.484347403049469, "logits/rejected": -0.4580131769180298, "logps/chosen": -444.5955810546875, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -664.2572021484375, "loss": 1.1376, "margin_dpo/margin_mean": 199.35397338867188, "margin_dpo/margin_std": 261.7845764160156, "step": 408 }, { "KL/chosen_KL_mean": -363.4441833496094, "KL/mean": -462.02862548828125, "KL/rejected_KL_mean": -560.6130981445312, "KL/std": 254.19630432128906, "epoch": 0.6005873715124816, "fcm_dpo/beta": 0.0016081533394753933, "fcm_dpo/delta": 0.0854191780090332, "fcm_dpo/margin": 197.1689453125, "fcm_dpo/q_t": 0.42619985342025757, "grad_norm": 33.778438568115234, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.4777407944202423, "logits/rejected": -0.46434783935546875, "logps/chosen": -430.7615966796875, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -645.5173950195312, "loss": 1.1435, "margin_dpo/margin_mean": 197.1689453125, "margin_dpo/margin_std": 282.058349609375, "step": 409 }, { "KL/chosen_KL_mean": -339.41485595703125, "KL/mean": -453.9027404785156, "KL/rejected_KL_mean": -568.390625, "KL/std": 249.15707397460938, "epoch": 0.6020558002936858, "fcm_dpo/beta": 0.0016180926468223333, "fcm_dpo/delta": 0.030641639605164528, "fcm_dpo/margin": 228.97573852539062, "fcm_dpo/q_t": 0.41249266266822815, "grad_norm": 26.49384880065918, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.5207273960113525, "logits/rejected": -0.5231969952583313, "logps/chosen": -390.8802185058594, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -651.589599609375, "loss": 1.0996, "margin_dpo/margin_mean": 228.97573852539062, "margin_dpo/margin_std": 282.3933410644531, "step": 410 }, { "KL/chosen_KL_mean": -361.8911437988281, "KL/mean": -477.2146911621094, "KL/rejected_KL_mean": -592.5382080078125, "KL/std": 280.512939453125, "epoch": 0.6035242290748899, "fcm_dpo/beta": 0.0016009939135983586, "fcm_dpo/delta": -0.06567565351724625, "fcm_dpo/margin": 230.64710998535156, "fcm_dpo/q_t": 0.41523507237434387, "grad_norm": 34.77162170410156, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.4923670291900635, "logits/rejected": -0.5041638612747192, "logps/chosen": -414.19842529296875, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -673.2332153320312, "loss": 1.117, "margin_dpo/margin_mean": 230.6470947265625, "margin_dpo/margin_std": 321.73370361328125, "step": 411 }, { "KL/chosen_KL_mean": -363.21783447265625, "KL/mean": -484.7549133300781, "KL/rejected_KL_mean": -606.2919311523438, "KL/std": 272.1863098144531, "epoch": 0.604992657856094, "fcm_dpo/beta": 0.0016049096593633294, "fcm_dpo/delta": 0.010263003408908844, "fcm_dpo/margin": 243.07411193847656, "fcm_dpo/q_t": 0.40966540575027466, "grad_norm": 34.43694305419922, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.5590307712554932, "logits/rejected": -0.5907352566719055, "logps/chosen": -416.3619689941406, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.0608139038086, "logps/rejected": -706.352783203125, "loss": 1.0948, "margin_dpo/margin_mean": 243.0740966796875, "margin_dpo/margin_std": 316.7934875488281, "step": 412 }, { "KL/chosen_KL_mean": -387.7173767089844, "KL/mean": -507.04754638671875, "KL/rejected_KL_mean": -626.3777465820312, "KL/std": 278.0577392578125, "epoch": 0.6064610866372981, "fcm_dpo/beta": 0.0016117544146254659, "fcm_dpo/delta": 0.01580866426229477, "fcm_dpo/margin": 238.6603240966797, "fcm_dpo/q_t": 0.40934064984321594, "grad_norm": 25.042572021484375, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.5094854235649109, "logits/rejected": -0.5156064033508301, "logps/chosen": -449.2993469238281, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -725.8511352539062, "loss": 1.0935, "margin_dpo/margin_mean": 238.66033935546875, "margin_dpo/margin_std": 299.20660400390625, "step": 413 }, { "KL/chosen_KL_mean": -353.5774841308594, "KL/mean": -484.88934326171875, "KL/rejected_KL_mean": -616.2012329101562, "KL/std": 261.0632019042969, "epoch": 0.6079295154185022, "fcm_dpo/beta": 0.0016139191575348377, "fcm_dpo/delta": -0.025633584707975388, "fcm_dpo/margin": 262.623779296875, "fcm_dpo/q_t": 0.4009360074996948, "grad_norm": 28.529882431030273, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.43805867433547974, "logits/rejected": -0.43269163370132446, "logps/chosen": -400.208984375, "logps/ref_chosen": -46.63148498535156, "logps/ref_rejected": -87.64653015136719, "logps/rejected": -703.8477783203125, "loss": 1.0665, "margin_dpo/margin_mean": 262.6237487792969, "margin_dpo/margin_std": 301.7000732421875, "step": 414 }, { "KL/chosen_KL_mean": -398.528564453125, "KL/mean": -497.85247802734375, "KL/rejected_KL_mean": -597.1763916015625, "KL/std": 269.6009826660156, "epoch": 0.6093979441997063, "fcm_dpo/beta": 0.0016152863390743732, "fcm_dpo/delta": 0.08179127424955368, "fcm_dpo/margin": 198.6478271484375, "fcm_dpo/q_t": 0.42490124702453613, "grad_norm": 25.62877655029297, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.5635542869567871, "logits/rejected": -0.5637483596801758, "logps/chosen": -477.1468505859375, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -697.6539306640625, "loss": 1.1494, "margin_dpo/margin_mean": 198.6478271484375, "margin_dpo/margin_std": 303.8634033203125, "step": 415 }, { "KL/chosen_KL_mean": -366.1192321777344, "KL/mean": -525.1725463867188, "KL/rejected_KL_mean": -684.225830078125, "KL/std": 304.578369140625, "epoch": 0.6108663729809104, "fcm_dpo/beta": 0.0016041703056544065, "fcm_dpo/delta": -0.11606433987617493, "fcm_dpo/margin": 318.10662841796875, "fcm_dpo/q_t": 0.38077855110168457, "grad_norm": 45.315086364746094, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.523003101348877, "logits/rejected": -0.5284410715103149, "logps/chosen": -424.39837646484375, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -774.7945556640625, "loss": 0.9918, "margin_dpo/margin_mean": 318.10662841796875, "margin_dpo/margin_std": 293.16387939453125, "step": 416 }, { "KL/chosen_KL_mean": -363.7352600097656, "KL/mean": -493.62274169921875, "KL/rejected_KL_mean": -623.51025390625, "KL/std": 269.7994384765625, "epoch": 0.6123348017621145, "fcm_dpo/beta": 0.0015893441159278154, "fcm_dpo/delta": -0.013450254686176777, "fcm_dpo/margin": 259.77496337890625, "fcm_dpo/q_t": 0.4028276801109314, "grad_norm": 28.700593948364258, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.5118107795715332, "logits/rejected": -0.49247753620147705, "logps/chosen": -413.9339599609375, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -691.6620483398438, "loss": 1.0588, "margin_dpo/margin_mean": 259.77496337890625, "margin_dpo/margin_std": 269.56451416015625, "step": 417 }, { "KL/chosen_KL_mean": -390.80078125, "KL/mean": -528.1920166015625, "KL/rejected_KL_mean": -665.583251953125, "KL/std": 311.567626953125, "epoch": 0.6138032305433186, "fcm_dpo/beta": 0.0015889217611402273, "fcm_dpo/delta": -0.039183445274829865, "fcm_dpo/margin": 274.78253173828125, "fcm_dpo/q_t": 0.4020264744758606, "grad_norm": 25.165157318115234, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6080072522163391, "logits/rejected": -0.5904369950294495, "logps/chosen": -472.77923583984375, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -770.2747802734375, "loss": 1.0794, "margin_dpo/margin_mean": 274.78253173828125, "margin_dpo/margin_std": 366.9202575683594, "step": 418 }, { "KL/chosen_KL_mean": -365.83148193359375, "KL/mean": -516.37353515625, "KL/rejected_KL_mean": -666.91552734375, "KL/std": 287.1883544921875, "epoch": 0.6152716593245228, "fcm_dpo/beta": 0.0015577776357531548, "fcm_dpo/delta": -0.07237845659255981, "fcm_dpo/margin": 301.0840759277344, "fcm_dpo/q_t": 0.39097434282302856, "grad_norm": 31.140954971313477, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.5190507173538208, "logits/rejected": -0.5203031897544861, "logps/chosen": -418.7801208496094, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -758.4986572265625, "loss": 1.0359, "margin_dpo/margin_mean": 301.0841064453125, "margin_dpo/margin_std": 336.95245361328125, "step": 419 }, { "KL/chosen_KL_mean": -464.80908203125, "KL/mean": -552.330810546875, "KL/rejected_KL_mean": -639.8525390625, "KL/std": 300.29180908203125, "epoch": 0.6167400881057269, "fcm_dpo/beta": 0.0015820781700313091, "fcm_dpo/delta": 0.12642702460289001, "fcm_dpo/margin": 175.04342651367188, "fcm_dpo/q_t": 0.4385032057762146, "grad_norm": 58.83283996582031, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.5799360275268555, "logits/rejected": -0.5508887767791748, "logps/chosen": -542.5789794921875, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -709.17236328125, "loss": 1.2225, "margin_dpo/margin_mean": 175.04344177246094, "margin_dpo/margin_std": 401.68768310546875, "step": 420 }, { "KL/chosen_KL_mean": -378.92333984375, "KL/mean": -482.3503112792969, "KL/rejected_KL_mean": -585.7772216796875, "KL/std": 293.73455810546875, "epoch": 0.618208516886931, "fcm_dpo/beta": 0.0016132977325469255, "fcm_dpo/delta": 0.06797365099191666, "fcm_dpo/margin": 206.85389709472656, "fcm_dpo/q_t": 0.4212290644645691, "grad_norm": 25.49981117248535, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.5703746676445007, "logits/rejected": -0.5595937371253967, "logps/chosen": -432.689208984375, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -675.0587158203125, "loss": 1.1441, "margin_dpo/margin_mean": 206.85391235351562, "margin_dpo/margin_std": 313.36297607421875, "step": 421 }, { "KL/chosen_KL_mean": -427.87286376953125, "KL/mean": -548.6137084960938, "KL/rejected_KL_mean": -669.3544921875, "KL/std": 294.9280090332031, "epoch": 0.6196769456681351, "fcm_dpo/beta": 0.001614258624613285, "fcm_dpo/delta": 0.01060008816421032, "fcm_dpo/margin": 241.48162841796875, "fcm_dpo/q_t": 0.4094482660293579, "grad_norm": 32.47233963012695, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.5928431749343872, "logits/rejected": -0.5967549681663513, "logps/chosen": -496.50665283203125, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -757.218017578125, "loss": 1.1069, "margin_dpo/margin_mean": 241.4816436767578, "margin_dpo/margin_std": 345.95001220703125, "step": 422 }, { "KL/chosen_KL_mean": -398.66766357421875, "KL/mean": -534.7645263671875, "KL/rejected_KL_mean": -670.8614501953125, "KL/std": 283.50732421875, "epoch": 0.6211453744493393, "fcm_dpo/beta": 0.001606134930625558, "fcm_dpo/delta": -0.038889989256858826, "fcm_dpo/margin": 272.19378662109375, "fcm_dpo/q_t": 0.39851221442222595, "grad_norm": 29.974559783935547, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.5618699789047241, "logits/rejected": -0.5707763433456421, "logps/chosen": -453.657958984375, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -757.16796875, "loss": 1.0539, "margin_dpo/margin_mean": 272.19378662109375, "margin_dpo/margin_std": 310.67779541015625, "step": 423 }, { "KL/chosen_KL_mean": -362.41912841796875, "KL/mean": -485.88983154296875, "KL/rejected_KL_mean": -609.360595703125, "KL/std": 279.98773193359375, "epoch": 0.6226138032305433, "fcm_dpo/beta": 0.0015977869043126702, "fcm_dpo/delta": 0.005315911024808884, "fcm_dpo/margin": 246.94143676757812, "fcm_dpo/q_t": 0.41002586483955383, "grad_norm": 31.341785430908203, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.5942381620407104, "logits/rejected": -0.574604332447052, "logps/chosen": -418.4310607910156, "logps/ref_chosen": -56.01192092895508, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -675.839599609375, "loss": 1.0959, "margin_dpo/margin_mean": 246.94146728515625, "margin_dpo/margin_std": 331.605712890625, "step": 424 }, { "KL/chosen_KL_mean": -399.5330810546875, "KL/mean": -494.6478271484375, "KL/rejected_KL_mean": -589.7625732421875, "KL/std": 262.13092041015625, "epoch": 0.6240822320117474, "fcm_dpo/beta": 0.001631318125873804, "fcm_dpo/delta": 0.09219174087047577, "fcm_dpo/margin": 190.22943115234375, "fcm_dpo/q_t": 0.4265892803668976, "grad_norm": 32.349361419677734, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.5598398447036743, "logits/rejected": -0.5601568818092346, "logps/chosen": -446.402099609375, "logps/ref_chosen": -46.86899948120117, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -685.68798828125, "loss": 1.1678, "margin_dpo/margin_mean": 190.22943115234375, "margin_dpo/margin_std": 325.59716796875, "step": 425 }, { "KL/chosen_KL_mean": -367.98974609375, "KL/mean": -488.088134765625, "KL/rejected_KL_mean": -608.1864624023438, "KL/std": 268.52386474609375, "epoch": 0.6255506607929515, "fcm_dpo/beta": 0.0016432944685220718, "fcm_dpo/delta": 0.005093574523925781, "fcm_dpo/margin": 240.1967315673828, "fcm_dpo/q_t": 0.4079374670982361, "grad_norm": 29.509531021118164, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.6483026742935181, "logits/rejected": -0.6253814697265625, "logps/chosen": -444.57330322265625, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -689.4530639648438, "loss": 1.0895, "margin_dpo/margin_mean": 240.19671630859375, "margin_dpo/margin_std": 298.68304443359375, "step": 426 }, { "KL/chosen_KL_mean": -346.615478515625, "KL/mean": -436.65435791015625, "KL/rejected_KL_mean": -526.6932373046875, "KL/std": 234.79974365234375, "epoch": 0.6270190895741556, "fcm_dpo/beta": 0.00165902404114604, "fcm_dpo/delta": 0.10449196398258209, "fcm_dpo/margin": 180.07781982421875, "fcm_dpo/q_t": 0.4308916926383972, "grad_norm": 24.951610565185547, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.5918477177619934, "logits/rejected": -0.5726908445358276, "logps/chosen": -411.4693603515625, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.5660171508789, "logps/rejected": -605.25927734375, "loss": 1.1632, "margin_dpo/margin_mean": 180.07781982421875, "margin_dpo/margin_std": 284.20269775390625, "step": 427 }, { "KL/chosen_KL_mean": -417.6781921386719, "KL/mean": -544.4334106445312, "KL/rejected_KL_mean": -671.1885986328125, "KL/std": 306.112060546875, "epoch": 0.6284875183553598, "fcm_dpo/beta": 0.0016601982060819864, "fcm_dpo/delta": -0.02208590693771839, "fcm_dpo/margin": 253.5104217529297, "fcm_dpo/q_t": 0.4037541151046753, "grad_norm": 30.45539665222168, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.5910390615463257, "logits/rejected": -0.5962928533554077, "logps/chosen": -480.3148498535156, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28181457519531, "logps/rejected": -774.470458984375, "loss": 1.0964, "margin_dpo/margin_mean": 253.5104217529297, "margin_dpo/margin_std": 367.505859375, "step": 428 }, { "KL/chosen_KL_mean": -423.49957275390625, "KL/mean": -519.6409301757812, "KL/rejected_KL_mean": -615.7822875976562, "KL/std": 274.84283447265625, "epoch": 0.6299559471365639, "fcm_dpo/beta": 0.0016591004095971584, "fcm_dpo/delta": -0.028070662170648575, "fcm_dpo/margin": 192.28273010253906, "fcm_dpo/q_t": 0.426498144865036, "grad_norm": 33.337589263916016, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.5892548561096191, "logits/rejected": -0.5679141283035278, "logps/chosen": -504.73358154296875, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -707.5772094726562, "loss": 1.172, "margin_dpo/margin_mean": 192.28273010253906, "margin_dpo/margin_std": 336.96649169921875, "step": 429 }, { "KL/chosen_KL_mean": -343.0795593261719, "KL/mean": -443.554931640625, "KL/rejected_KL_mean": -544.0302734375, "KL/std": 249.0330047607422, "epoch": 0.631424375917768, "fcm_dpo/beta": 0.0016736264806240797, "fcm_dpo/delta": 0.06572603434324265, "fcm_dpo/margin": 200.95074462890625, "fcm_dpo/q_t": 0.42213696241378784, "grad_norm": 27.670103073120117, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5829579830169678, "logits/rejected": -0.5811977386474609, "logps/chosen": -403.9998779296875, "logps/ref_chosen": -60.920326232910156, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -648.453125, "loss": 1.129, "margin_dpo/margin_mean": 200.95074462890625, "margin_dpo/margin_std": 278.2074890136719, "step": 430 }, { "KL/chosen_KL_mean": -320.6529541015625, "KL/mean": -468.82305908203125, "KL/rejected_KL_mean": -616.9931640625, "KL/std": 276.59454345703125, "epoch": 0.6328928046989721, "fcm_dpo/beta": 0.001651083119213581, "fcm_dpo/delta": -0.09401773661375046, "fcm_dpo/margin": 296.34014892578125, "fcm_dpo/q_t": 0.3859713673591614, "grad_norm": 23.38682746887207, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.5685824751853943, "logits/rejected": -0.5741355419158936, "logps/chosen": -378.001708984375, "logps/ref_chosen": -57.34874725341797, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -709.8333740234375, "loss": 1.0172, "margin_dpo/margin_mean": 296.3401794433594, "margin_dpo/margin_std": 303.3402404785156, "step": 431 }, { "KL/chosen_KL_mean": -317.8546447753906, "KL/mean": -456.84588623046875, "KL/rejected_KL_mean": -595.837158203125, "KL/std": 269.2180480957031, "epoch": 0.6343612334801763, "fcm_dpo/beta": 0.0016406788490712643, "fcm_dpo/delta": -0.0591546930372715, "fcm_dpo/margin": 277.9825439453125, "fcm_dpo/q_t": 0.3939441442489624, "grad_norm": 41.92903518676758, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.531327486038208, "logits/rejected": -0.520300030708313, "logps/chosen": -388.929443359375, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -694.4166870117188, "loss": 1.0337, "margin_dpo/margin_mean": 277.9825134277344, "margin_dpo/margin_std": 276.3160400390625, "step": 432 }, { "KL/chosen_KL_mean": -403.21221923828125, "KL/mean": -496.0498046875, "KL/rejected_KL_mean": -588.8873291015625, "KL/std": 261.9140625, "epoch": 0.6358296622613803, "fcm_dpo/beta": 0.0016517346957698464, "fcm_dpo/delta": 0.09607505798339844, "fcm_dpo/margin": 185.67514038085938, "fcm_dpo/q_t": 0.4271540939807892, "grad_norm": 35.29652404785156, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.5547606945037842, "logits/rejected": -0.541266679763794, "logps/chosen": -461.48541259765625, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -684.8382568359375, "loss": 1.1672, "margin_dpo/margin_mean": 185.67514038085938, "margin_dpo/margin_std": 311.87078857421875, "step": 433 }, { "KL/chosen_KL_mean": -343.99139404296875, "KL/mean": -456.23834228515625, "KL/rejected_KL_mean": -568.4853515625, "KL/std": 267.9556579589844, "epoch": 0.6372980910425844, "fcm_dpo/beta": 0.0016591593157500029, "fcm_dpo/delta": 0.02848285809159279, "fcm_dpo/margin": 224.49386596679688, "fcm_dpo/q_t": 0.4180990159511566, "grad_norm": 25.378862380981445, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.5715805292129517, "logits/rejected": -0.5707394480705261, "logps/chosen": -405.965087890625, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -646.98388671875, "loss": 1.1177, "margin_dpo/margin_mean": 224.49386596679688, "margin_dpo/margin_std": 339.36627197265625, "step": 434 }, { "KL/chosen_KL_mean": -311.0776062011719, "KL/mean": -433.509521484375, "KL/rejected_KL_mean": -555.94140625, "KL/std": 261.578857421875, "epoch": 0.6387665198237885, "fcm_dpo/beta": 0.00166351068764925, "fcm_dpo/delta": -0.007664802018553019, "fcm_dpo/margin": 244.86380004882812, "fcm_dpo/q_t": 0.40512967109680176, "grad_norm": 25.661197662353516, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.5694348812103271, "logits/rejected": -0.559348464012146, "logps/chosen": -362.57965087890625, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -643.50830078125, "loss": 1.0788, "margin_dpo/margin_mean": 244.86380004882812, "margin_dpo/margin_std": 291.82879638671875, "step": 435 }, { "KL/chosen_KL_mean": -326.98468017578125, "KL/mean": -433.77142333984375, "KL/rejected_KL_mean": -540.5582275390625, "KL/std": 234.86660766601562, "epoch": 0.6402349486049926, "fcm_dpo/beta": 0.001671030418947339, "fcm_dpo/delta": 0.04468690603971481, "fcm_dpo/margin": 213.5735321044922, "fcm_dpo/q_t": 0.41665488481521606, "grad_norm": 39.46367263793945, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.5771512985229492, "logits/rejected": -0.5507988929748535, "logps/chosen": -398.3883972167969, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -623.2860107421875, "loss": 1.1115, "margin_dpo/margin_mean": 213.57354736328125, "margin_dpo/margin_std": 275.3209533691406, "step": 436 }, { "KL/chosen_KL_mean": -331.77978515625, "KL/mean": -442.20428466796875, "KL/rejected_KL_mean": -552.6287841796875, "KL/std": 225.34506225585938, "epoch": 0.6417033773861968, "fcm_dpo/beta": 0.0016848563682287931, "fcm_dpo/delta": 0.028988715261220932, "fcm_dpo/margin": 220.84902954101562, "fcm_dpo/q_t": 0.41250330209732056, "grad_norm": 25.351360321044922, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.5720341205596924, "logits/rejected": -0.5498570203781128, "logps/chosen": -396.5240478515625, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -634.67236328125, "loss": 1.0967, "margin_dpo/margin_mean": 220.84902954101562, "margin_dpo/margin_std": 267.39385986328125, "step": 437 }, { "KL/chosen_KL_mean": -341.92059326171875, "KL/mean": -469.1558532714844, "KL/rejected_KL_mean": -596.3910522460938, "KL/std": 260.28424072265625, "epoch": 0.6431718061674009, "fcm_dpo/beta": 0.0016751789953559637, "fcm_dpo/delta": -0.027896108105778694, "fcm_dpo/margin": 254.47047424316406, "fcm_dpo/q_t": 0.3999551236629486, "grad_norm": 33.649723052978516, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.5674476623535156, "logits/rejected": -0.5483890771865845, "logps/chosen": -400.93927001953125, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682800292969, "logps/rejected": -679.4678955078125, "loss": 1.0567, "margin_dpo/margin_mean": 254.470458984375, "margin_dpo/margin_std": 276.5482177734375, "step": 438 }, { "KL/chosen_KL_mean": -375.33843994140625, "KL/mean": -480.2781982421875, "KL/rejected_KL_mean": -585.218017578125, "KL/std": 268.84649658203125, "epoch": 0.644640234948605, "fcm_dpo/beta": 0.0016563256504014134, "fcm_dpo/delta": -0.06954063475131989, "fcm_dpo/margin": 209.87954711914062, "fcm_dpo/q_t": 0.4195774793624878, "grad_norm": 28.397993087768555, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.6017969846725464, "logits/rejected": -0.6068276166915894, "logps/chosen": -429.1225280761719, "logps/ref_chosen": -53.78407669067383, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -669.2034912109375, "loss": 1.1355, "margin_dpo/margin_mean": 209.87953186035156, "margin_dpo/margin_std": 304.0859375, "step": 439 }, { "KL/chosen_KL_mean": -404.01019287109375, "KL/mean": -522.5572509765625, "KL/rejected_KL_mean": -641.1043701171875, "KL/std": 323.501708984375, "epoch": 0.6461086637298091, "fcm_dpo/beta": 0.0016622185939922929, "fcm_dpo/delta": 0.005979446694254875, "fcm_dpo/margin": 237.09423828125, "fcm_dpo/q_t": 0.41194236278533936, "grad_norm": 36.218482971191406, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.6418617367744446, "logits/rejected": -0.638819694519043, "logps/chosen": -482.576904296875, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -737.6021728515625, "loss": 1.1007, "margin_dpo/margin_mean": 237.09423828125, "margin_dpo/margin_std": 334.8287658691406, "step": 440 }, { "KL/chosen_KL_mean": -462.35516357421875, "KL/mean": -583.5091552734375, "KL/rejected_KL_mean": -704.6630859375, "KL/std": 340.33734130859375, "epoch": 0.6475770925110133, "fcm_dpo/beta": 0.001664304407313466, "fcm_dpo/delta": -0.0037491731345653534, "fcm_dpo/margin": 242.307861328125, "fcm_dpo/q_t": 0.41193264722824097, "grad_norm": 46.04979705810547, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.6510436534881592, "logits/rejected": -0.6406994462013245, "logps/chosen": -523.1796264648438, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -801.1338500976562, "loss": 1.1303, "margin_dpo/margin_mean": 242.307861328125, "margin_dpo/margin_std": 407.91796875, "step": 441 }, { "KL/chosen_KL_mean": -394.96697998046875, "KL/mean": -539.0298461914062, "KL/rejected_KL_mean": -683.0927124023438, "KL/std": 329.10321044921875, "epoch": 0.6490455212922174, "fcm_dpo/beta": 0.0016490614507347345, "fcm_dpo/delta": -0.07931334525346756, "fcm_dpo/margin": 288.12567138671875, "fcm_dpo/q_t": 0.3943568170070648, "grad_norm": 35.095680236816406, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.6585125923156738, "logits/rejected": -0.6561766862869263, "logps/chosen": -441.97821044921875, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -759.6319580078125, "loss": 1.0536, "margin_dpo/margin_mean": 288.12567138671875, "margin_dpo/margin_std": 366.7057800292969, "step": 442 }, { "KL/chosen_KL_mean": -456.835693359375, "KL/mean": -547.46435546875, "KL/rejected_KL_mean": -638.0928955078125, "KL/std": 329.8770751953125, "epoch": 0.6505139500734214, "fcm_dpo/beta": 0.0016302757430821657, "fcm_dpo/delta": -0.0021791704930365086, "fcm_dpo/margin": 181.2572021484375, "fcm_dpo/q_t": 0.4339308440685272, "grad_norm": 37.560585021972656, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6085466146469116, "logits/rejected": -0.5840749740600586, "logps/chosen": -528.1087646484375, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -724.7728271484375, "loss": 1.2126, "margin_dpo/margin_mean": 181.2572021484375, "margin_dpo/margin_std": 397.4133605957031, "step": 443 }, { "KL/chosen_KL_mean": -444.24237060546875, "KL/mean": -589.8250732421875, "KL/rejected_KL_mean": -735.4078369140625, "KL/std": 349.052978515625, "epoch": 0.6519823788546255, "fcm_dpo/beta": 0.0016081281937658787, "fcm_dpo/delta": -0.07197729498147964, "fcm_dpo/margin": 291.16546630859375, "fcm_dpo/q_t": 0.39475017786026, "grad_norm": 26.796432495117188, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.619565486907959, "logits/rejected": -0.6288525462150574, "logps/chosen": -501.4560546875, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489807128906, "logps/rejected": -832.6627197265625, "loss": 1.0554, "margin_dpo/margin_mean": 291.16546630859375, "margin_dpo/margin_std": 379.3155517578125, "step": 444 }, { "KL/chosen_KL_mean": -398.5299072265625, "KL/mean": -536.5014038085938, "KL/rejected_KL_mean": -674.472900390625, "KL/std": 279.18701171875, "epoch": 0.6534508076358296, "fcm_dpo/beta": 0.0015993316192179918, "fcm_dpo/delta": -0.04321688041090965, "fcm_dpo/margin": 275.9429931640625, "fcm_dpo/q_t": 0.3995450437068939, "grad_norm": 27.847251892089844, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.6301474571228027, "logits/rejected": -0.6212267279624939, "logps/chosen": -465.8297119140625, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267059326172, "logps/rejected": -767.1555786132812, "loss": 1.0649, "margin_dpo/margin_mean": 275.9429931640625, "margin_dpo/margin_std": 342.8664245605469, "step": 445 }, { "KL/chosen_KL_mean": -363.5679016113281, "KL/mean": -512.4356689453125, "KL/rejected_KL_mean": -661.303466796875, "KL/std": 304.2501220703125, "epoch": 0.6549192364170338, "fcm_dpo/beta": 0.0015849031042307615, "fcm_dpo/delta": -0.07573074102401733, "fcm_dpo/margin": 297.735595703125, "fcm_dpo/q_t": 0.3897179961204529, "grad_norm": 30.288881301879883, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.655229926109314, "logits/rejected": -0.6672055721282959, "logps/chosen": -422.6663818359375, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -762.567626953125, "loss": 1.0286, "margin_dpo/margin_mean": 297.735595703125, "margin_dpo/margin_std": 308.36199951171875, "step": 446 }, { "KL/chosen_KL_mean": -363.2137756347656, "KL/mean": -519.5991821289062, "KL/rejected_KL_mean": -675.984619140625, "KL/std": 346.8345947265625, "epoch": 0.6563876651982379, "fcm_dpo/beta": 0.001544747268781066, "fcm_dpo/delta": -0.08775018155574799, "fcm_dpo/margin": 312.77081298828125, "fcm_dpo/q_t": 0.3913511037826538, "grad_norm": 26.699710845947266, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.7068610191345215, "logits/rejected": -0.7222627401351929, "logps/chosen": -419.28912353515625, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -774.6793212890625, "loss": 1.0369, "margin_dpo/margin_mean": 312.7708435058594, "margin_dpo/margin_std": 385.6178283691406, "step": 447 }, { "KL/chosen_KL_mean": -406.2823791503906, "KL/mean": -554.0318603515625, "KL/rejected_KL_mean": -701.7813720703125, "KL/std": 298.70819091796875, "epoch": 0.657856093979442, "fcm_dpo/beta": 0.0015353120397776365, "fcm_dpo/delta": -0.05634545907378197, "fcm_dpo/margin": 295.49896240234375, "fcm_dpo/q_t": 0.395630419254303, "grad_norm": 35.73704528808594, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.6082560420036316, "logits/rejected": -0.6136020421981812, "logps/chosen": -466.2862243652344, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -804.0460205078125, "loss": 1.0467, "margin_dpo/margin_mean": 295.4989929199219, "margin_dpo/margin_std": 336.44561767578125, "step": 448 }, { "KL/chosen_KL_mean": -413.37811279296875, "KL/mean": -562.44775390625, "KL/rejected_KL_mean": -711.517333984375, "KL/std": 366.28466796875, "epoch": 0.6593245227606461, "fcm_dpo/beta": 0.0015101665630936623, "fcm_dpo/delta": -0.052754104137420654, "fcm_dpo/margin": 298.13922119140625, "fcm_dpo/q_t": 0.40061530470848083, "grad_norm": 28.19297981262207, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.6445102095603943, "logits/rejected": -0.6385193467140198, "logps/chosen": -480.90472412109375, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -800.1142578125, "loss": 1.0796, "margin_dpo/margin_mean": 298.13922119140625, "margin_dpo/margin_std": 435.361572265625, "step": 449 }, { "KL/chosen_KL_mean": -338.682861328125, "KL/mean": -501.4732666015625, "KL/rejected_KL_mean": -664.263671875, "KL/std": 325.22833251953125, "epoch": 0.6607929515418502, "fcm_dpo/beta": 0.0014909481396898627, "fcm_dpo/delta": -0.08969271928071976, "fcm_dpo/margin": 325.58074951171875, "fcm_dpo/q_t": 0.386138379573822, "grad_norm": 47.2825813293457, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6737087965011597, "logits/rejected": -0.685724675655365, "logps/chosen": -395.791015625, "logps/ref_chosen": -57.10811996459961, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -767.0185546875, "loss": 1.0118, "margin_dpo/margin_mean": 325.58074951171875, "margin_dpo/margin_std": 320.9717102050781, "step": 450 }, { "KL/chosen_KL_mean": -454.3709716796875, "KL/mean": -571.3306274414062, "KL/rejected_KL_mean": -688.2903442382812, "KL/std": 374.0126647949219, "epoch": 0.6622613803230544, "fcm_dpo/beta": 0.0014971659984439611, "fcm_dpo/delta": 0.051308851689100266, "fcm_dpo/margin": 233.9193878173828, "fcm_dpo/q_t": 0.41749513149261475, "grad_norm": 33.79815673828125, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.7189067602157593, "logits/rejected": -0.6965080499649048, "logps/chosen": -512.8397827148438, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -761.2197265625, "loss": 1.1623, "margin_dpo/margin_mean": 233.91940307617188, "margin_dpo/margin_std": 428.302490234375, "step": 451 }, { "KL/chosen_KL_mean": -321.31689453125, "KL/mean": -462.2808532714844, "KL/rejected_KL_mean": -603.2447509765625, "KL/std": 290.2576904296875, "epoch": 0.6637298091042585, "fcm_dpo/beta": 0.0014898786321282387, "fcm_dpo/delta": -0.021011171862483025, "fcm_dpo/margin": 281.9278869628906, "fcm_dpo/q_t": 0.40081116557121277, "grad_norm": 25.158477783203125, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.6077337265014648, "logits/rejected": -0.6246554851531982, "logps/chosen": -371.70745849609375, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77142333984375, "logps/rejected": -701.0162353515625, "loss": 1.0559, "margin_dpo/margin_mean": 281.9278564453125, "margin_dpo/margin_std": 295.3541564941406, "step": 452 }, { "KL/chosen_KL_mean": -374.7382507324219, "KL/mean": -508.03814697265625, "KL/rejected_KL_mean": -641.3380126953125, "KL/std": 293.4873046875, "epoch": 0.6651982378854625, "fcm_dpo/beta": 0.0014873708132654428, "fcm_dpo/delta": 0.0034573376178741455, "fcm_dpo/margin": 266.5997314453125, "fcm_dpo/q_t": 0.40999874472618103, "grad_norm": 26.13146209716797, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.6614656448364258, "logits/rejected": -0.6468169689178467, "logps/chosen": -432.453125, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -723.54541015625, "loss": 1.0931, "margin_dpo/margin_mean": 266.5997619628906, "margin_dpo/margin_std": 356.6203918457031, "step": 453 }, { "KL/chosen_KL_mean": -450.31854248046875, "KL/mean": -600.8720703125, "KL/rejected_KL_mean": -751.4256591796875, "KL/std": 340.4393005371094, "epoch": 0.6666666666666666, "fcm_dpo/beta": 0.0014775395393371582, "fcm_dpo/delta": -0.047248564660549164, "fcm_dpo/margin": 301.10711669921875, "fcm_dpo/q_t": 0.39838463068008423, "grad_norm": 28.11908721923828, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.697075605392456, "logits/rejected": -0.6863827109336853, "logps/chosen": -511.26422119140625, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.95079040527344, "logps/rejected": -836.37646484375, "loss": 1.0611, "margin_dpo/margin_mean": 301.10711669921875, "margin_dpo/margin_std": 380.62457275390625, "step": 454 }, { "KL/chosen_KL_mean": -384.13677978515625, "KL/mean": -556.47265625, "KL/rejected_KL_mean": -728.8084716796875, "KL/std": 345.8653869628906, "epoch": 0.6681350954478708, "fcm_dpo/beta": 0.0014633602695539594, "fcm_dpo/delta": -0.11007063835859299, "fcm_dpo/margin": 344.6717224121094, "fcm_dpo/q_t": 0.38579294085502625, "grad_norm": 37.92613983154297, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6768559217453003, "logits/rejected": -0.6982386708259583, "logps/chosen": -429.02349853515625, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -844.1099853515625, "loss": 1.0253, "margin_dpo/margin_mean": 344.6717224121094, "margin_dpo/margin_std": 396.2593078613281, "step": 455 }, { "KL/chosen_KL_mean": -407.1464538574219, "KL/mean": -576.0047607421875, "KL/rejected_KL_mean": -744.863037109375, "KL/std": 354.4407043457031, "epoch": 0.6696035242290749, "fcm_dpo/beta": 0.0014285333454608917, "fcm_dpo/delta": -0.08655368536710739, "fcm_dpo/margin": 337.71661376953125, "fcm_dpo/q_t": 0.3888140320777893, "grad_norm": 26.16177749633789, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.6761212348937988, "logits/rejected": -0.6974040865898132, "logps/chosen": -464.1832275390625, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21784210205078, "logps/rejected": -850.0808715820312, "loss": 1.015, "margin_dpo/margin_mean": 337.71661376953125, "margin_dpo/margin_std": 344.4359130859375, "step": 456 }, { "KL/chosen_KL_mean": -392.9554443359375, "KL/mean": -561.5748291015625, "KL/rejected_KL_mean": -730.1942138671875, "KL/std": 337.5885009765625, "epoch": 0.671071953010279, "fcm_dpo/beta": 0.0014012358151376247, "fcm_dpo/delta": -0.07646898925304413, "fcm_dpo/margin": 337.23883056640625, "fcm_dpo/q_t": 0.3888044059276581, "grad_norm": 28.89864158630371, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.6724662780761719, "logits/rejected": -0.674906849861145, "logps/chosen": -447.197998046875, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -815.3038330078125, "loss": 1.0173, "margin_dpo/margin_mean": 337.23883056640625, "margin_dpo/margin_std": 322.967529296875, "step": 457 }, { "KL/chosen_KL_mean": -399.1230163574219, "KL/mean": -552.3309326171875, "KL/rejected_KL_mean": -705.5389404296875, "KL/std": 313.0858154296875, "epoch": 0.6725403817914831, "fcm_dpo/beta": 0.0013883748324587941, "fcm_dpo/delta": -0.02694622240960598, "fcm_dpo/margin": 306.4158935546875, "fcm_dpo/q_t": 0.40227359533309937, "grad_norm": 22.595535278320312, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.6672220826148987, "logits/rejected": -0.6631453633308411, "logps/chosen": -454.5318908691406, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -803.22216796875, "loss": 1.0629, "margin_dpo/margin_mean": 306.4158935546875, "margin_dpo/margin_std": 356.400634765625, "step": 458 }, { "KL/chosen_KL_mean": -442.6391296386719, "KL/mean": -608.4810180664062, "KL/rejected_KL_mean": -774.3228759765625, "KL/std": 361.2178955078125, "epoch": 0.6740088105726872, "fcm_dpo/beta": 0.0013751968508586287, "fcm_dpo/delta": -0.05920097231864929, "fcm_dpo/margin": 331.68365478515625, "fcm_dpo/q_t": 0.39526090025901794, "grad_norm": 31.071313858032227, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.6531388759613037, "logits/rejected": -0.6709892749786377, "logps/chosen": -489.1966247558594, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -860.4913940429688, "loss": 1.0531, "margin_dpo/margin_mean": 331.6836853027344, "margin_dpo/margin_std": 400.57806396484375, "step": 459 }, { "KL/chosen_KL_mean": -484.61431884765625, "KL/mean": -663.1461181640625, "KL/rejected_KL_mean": -841.6778564453125, "KL/std": 356.38165283203125, "epoch": 0.6754772393538914, "fcm_dpo/beta": 0.0013584838015958667, "fcm_dpo/delta": -0.08933592587709427, "fcm_dpo/margin": 357.0634765625, "fcm_dpo/q_t": 0.38839712738990784, "grad_norm": 46.59115982055664, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.7092480063438416, "logits/rejected": -0.7396787405014038, "logps/chosen": -536.249267578125, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -945.7972412109375, "loss": 1.022, "margin_dpo/margin_mean": 357.0635070800781, "margin_dpo/margin_std": 387.8919677734375, "step": 460 }, { "KL/chosen_KL_mean": -520.2779541015625, "KL/mean": -661.3561401367188, "KL/rejected_KL_mean": -802.4342651367188, "KL/std": 363.57623291015625, "epoch": 0.6769456681350955, "fcm_dpo/beta": 0.0013587003340944648, "fcm_dpo/delta": 0.017041990533471107, "fcm_dpo/margin": 282.1562805175781, "fcm_dpo/q_t": 0.4131912589073181, "grad_norm": 28.806053161621094, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.7396203279495239, "logits/rejected": -0.7533408999443054, "logps/chosen": -575.4599609375, "logps/ref_chosen": -55.18195724487305, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -888.9111328125, "loss": 1.1072, "margin_dpo/margin_mean": 282.15625, "margin_dpo/margin_std": 399.5581359863281, "step": 461 }, { "KL/chosen_KL_mean": -545.4285888671875, "KL/mean": -660.5108642578125, "KL/rejected_KL_mean": -775.5931396484375, "KL/std": 371.68798828125, "epoch": 0.6784140969162996, "fcm_dpo/beta": 0.001372592058032751, "fcm_dpo/delta": 0.08686043322086334, "fcm_dpo/margin": 230.16461181640625, "fcm_dpo/q_t": 0.4292982220649719, "grad_norm": 41.23543930053711, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.8299468755722046, "logits/rejected": -0.8264528512954712, "logps/chosen": -615.3565673828125, "logps/ref_chosen": -69.92803192138672, "logps/ref_rejected": -78.84111022949219, "logps/rejected": -854.4342041015625, "loss": 1.1717, "margin_dpo/margin_mean": 230.1645965576172, "margin_dpo/margin_std": 417.91949462890625, "step": 462 }, { "KL/chosen_KL_mean": -548.3289184570312, "KL/mean": -702.0982055664062, "KL/rejected_KL_mean": -855.8675537109375, "KL/std": 388.29705810546875, "epoch": 0.6798825256975036, "fcm_dpo/beta": 0.0013804540503770113, "fcm_dpo/delta": -0.025950342416763306, "fcm_dpo/margin": 307.53863525390625, "fcm_dpo/q_t": 0.4052172899246216, "grad_norm": 37.95475387573242, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.8180972337722778, "logits/rejected": -0.8314469456672668, "logps/chosen": -603.603271484375, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -944.8925170898438, "loss": 1.0942, "margin_dpo/margin_mean": 307.53863525390625, "margin_dpo/margin_std": 443.52276611328125, "step": 463 }, { "KL/chosen_KL_mean": -544.8458251953125, "KL/mean": -780.4537353515625, "KL/rejected_KL_mean": -1016.0616455078125, "KL/std": 460.05206298828125, "epoch": 0.6813509544787077, "fcm_dpo/beta": 0.0013158408692106605, "fcm_dpo/delta": -0.23714584112167358, "fcm_dpo/margin": 471.21575927734375, "fcm_dpo/q_t": 0.359811931848526, "grad_norm": 43.35410690307617, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.7928054332733154, "logits/rejected": -0.8558509945869446, "logps/chosen": -595.7581176757812, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -1118.551025390625, "loss": 0.9447, "margin_dpo/margin_mean": 471.21575927734375, "margin_dpo/margin_std": 474.4074401855469, "step": 464 }, { "KL/chosen_KL_mean": -574.6008911132812, "KL/mean": -763.1315307617188, "KL/rejected_KL_mean": -951.6621704101562, "KL/std": 477.3629150390625, "epoch": 0.6828193832599119, "fcm_dpo/beta": 0.0012953910045325756, "fcm_dpo/delta": -0.09291453659534454, "fcm_dpo/margin": 377.061279296875, "fcm_dpo/q_t": 0.38684460520744324, "grad_norm": 57.58442306518555, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.7876610159873962, "logits/rejected": -0.807873547077179, "logps/chosen": -634.7177734375, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -1065.608154296875, "loss": 1.0584, "margin_dpo/margin_mean": 377.061279296875, "margin_dpo/margin_std": 511.11419677734375, "step": 465 }, { "KL/chosen_KL_mean": -613.3887939453125, "KL/mean": -784.2792358398438, "KL/rejected_KL_mean": -955.1697998046875, "KL/std": 442.28924560546875, "epoch": 0.684287812041116, "fcm_dpo/beta": 0.0012718967627733946, "fcm_dpo/delta": -0.03685159608721733, "fcm_dpo/margin": 341.7809753417969, "fcm_dpo/q_t": 0.4008026123046875, "grad_norm": 33.484703063964844, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.8349906206130981, "logits/rejected": -0.843805193901062, "logps/chosen": -666.3096923828125, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -1045.4852294921875, "loss": 1.0906, "margin_dpo/margin_mean": 341.7809753417969, "margin_dpo/margin_std": 492.47955322265625, "step": 466 }, { "KL/chosen_KL_mean": -774.8294677734375, "KL/mean": -954.3599243164062, "KL/rejected_KL_mean": -1133.890380859375, "KL/std": 575.3023681640625, "epoch": 0.6857562408223201, "fcm_dpo/beta": 0.00125328847207129, "fcm_dpo/delta": -0.05458660423755646, "fcm_dpo/margin": 359.0608825683594, "fcm_dpo/q_t": 0.4034256041049957, "grad_norm": 52.08469009399414, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.9275529384613037, "logits/rejected": -0.9244056940078735, "logps/chosen": -853.5452880859375, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -1236.7506103515625, "loss": 1.1488, "margin_dpo/margin_mean": 359.0609130859375, "margin_dpo/margin_std": 667.78173828125, "step": 467 }, { "KL/chosen_KL_mean": -607.3834228515625, "KL/mean": -832.3876953125, "KL/rejected_KL_mean": -1057.39208984375, "KL/std": 518.711669921875, "epoch": 0.6872246696035242, "fcm_dpo/beta": 0.0012304207775741816, "fcm_dpo/delta": -0.16281697154045105, "fcm_dpo/margin": 450.0086364746094, "fcm_dpo/q_t": 0.3800439238548279, "grad_norm": 44.7249641418457, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.9139019846916199, "logits/rejected": -0.9288034439086914, "logps/chosen": -677.3187255859375, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02880859375, "logps/rejected": -1158.4208984375, "loss": 1.0164, "margin_dpo/margin_mean": 450.0086364746094, "margin_dpo/margin_std": 588.6818237304688, "step": 468 }, { "KL/chosen_KL_mean": -641.0040283203125, "KL/mean": -866.3514404296875, "KL/rejected_KL_mean": -1091.698974609375, "KL/std": 456.729736328125, "epoch": 0.6886930983847284, "fcm_dpo/beta": 0.0011876230128109455, "fcm_dpo/delta": -0.14456316828727722, "fcm_dpo/margin": 450.6948547363281, "fcm_dpo/q_t": 0.3780639171600342, "grad_norm": 36.414100646972656, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.9149258136749268, "logits/rejected": -0.9327446222305298, "logps/chosen": -709.1287231445312, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -1196.4853515625, "loss": 1.0044, "margin_dpo/margin_mean": 450.69488525390625, "margin_dpo/margin_std": 503.34381103515625, "step": 469 }, { "KL/chosen_KL_mean": -555.3209838867188, "KL/mean": -754.7327880859375, "KL/rejected_KL_mean": -954.1446533203125, "KL/std": 484.958984375, "epoch": 0.6901615271659325, "fcm_dpo/beta": 0.0011768193216994405, "fcm_dpo/delta": -0.07272230088710785, "fcm_dpo/margin": 398.8236083984375, "fcm_dpo/q_t": 0.3930772542953491, "grad_norm": 25.32660484313965, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.8891603946685791, "logits/rejected": -0.9033347368240356, "logps/chosen": -599.1129150390625, "logps/ref_chosen": -43.791927337646484, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -1036.847412109375, "loss": 1.0628, "margin_dpo/margin_mean": 398.8235778808594, "margin_dpo/margin_std": 535.6801147460938, "step": 470 }, { "KL/chosen_KL_mean": -693.4312744140625, "KL/mean": -855.115234375, "KL/rejected_KL_mean": -1016.7991943359375, "KL/std": 482.7373352050781, "epoch": 0.6916299559471366, "fcm_dpo/beta": 0.0011662011966109276, "fcm_dpo/delta": 0.02322380244731903, "fcm_dpo/margin": 323.3678283691406, "fcm_dpo/q_t": 0.416721373796463, "grad_norm": 45.39756393432617, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.9871773719787598, "logits/rejected": -0.9965918064117432, "logps/chosen": -756.7708129882812, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -1100.40966796875, "loss": 1.131, "margin_dpo/margin_mean": 323.36785888671875, "margin_dpo/margin_std": 523.2651977539062, "step": 471 }, { "KL/chosen_KL_mean": -685.0006103515625, "KL/mean": -886.5283203125, "KL/rejected_KL_mean": -1088.0560302734375, "KL/std": 587.74755859375, "epoch": 0.6930983847283406, "fcm_dpo/beta": 0.001157897524535656, "fcm_dpo/delta": -0.07049451023340225, "fcm_dpo/margin": 403.0553894042969, "fcm_dpo/q_t": 0.40090325474739075, "grad_norm": 32.389835357666016, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.9715889096260071, "logits/rejected": -0.9529412388801575, "logps/chosen": -768.666748046875, "logps/ref_chosen": -83.66610717773438, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -1205.2652587890625, "loss": 1.0948, "margin_dpo/margin_mean": 403.055419921875, "margin_dpo/margin_std": 634.1861572265625, "step": 472 }, { "KL/chosen_KL_mean": -811.2958984375, "KL/mean": -903.3029174804688, "KL/rejected_KL_mean": -995.3099365234375, "KL/std": 595.69921875, "epoch": 0.6945668135095447, "fcm_dpo/beta": 0.001172641757875681, "fcm_dpo/delta": 0.06300715357065201, "fcm_dpo/margin": 184.01409912109375, "fcm_dpo/q_t": 0.45356637239456177, "grad_norm": 114.76943969726562, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -0.9685148596763611, "logits/rejected": -0.9405593872070312, "logps/chosen": -874.7928466796875, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -1076.45654296875, "loss": 1.3594, "margin_dpo/margin_mean": 184.0141143798828, "margin_dpo/margin_std": 777.1190185546875, "step": 473 }, { "KL/chosen_KL_mean": -650.262451171875, "KL/mean": -809.9664916992188, "KL/rejected_KL_mean": -969.6705322265625, "KL/std": 507.17437744140625, "epoch": 0.6960352422907489, "fcm_dpo/beta": 0.0011639699805527925, "fcm_dpo/delta": -0.07450275868177414, "fcm_dpo/margin": 319.40814208984375, "fcm_dpo/q_t": 0.4137033224105835, "grad_norm": 41.28981399536133, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.8996328115463257, "logits/rejected": -0.9031381607055664, "logps/chosen": -702.8743896484375, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -1059.7509765625, "loss": 1.1578, "margin_dpo/margin_mean": 319.4081726074219, "margin_dpo/margin_std": 566.008544921875, "step": 474 }, { "KL/chosen_KL_mean": -470.3363342285156, "KL/mean": -680.240966796875, "KL/rejected_KL_mean": -890.1456298828125, "KL/std": 411.4319763183594, "epoch": 0.697503671071953, "fcm_dpo/beta": 0.0011392869055271149, "fcm_dpo/delta": -0.08241432905197144, "fcm_dpo/margin": 419.8091735839844, "fcm_dpo/q_t": 0.3890076279640198, "grad_norm": 38.87031936645508, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.930426836013794, "logits/rejected": -0.9583991765975952, "logps/chosen": -512.83154296875, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06294250488281, "logps/rejected": -980.2085571289062, "loss": 1.0215, "margin_dpo/margin_mean": 419.8092041015625, "margin_dpo/margin_std": 437.57635498046875, "step": 475 }, { "KL/chosen_KL_mean": -583.7200317382812, "KL/mean": -751.1150512695312, "KL/rejected_KL_mean": -918.5101318359375, "KL/std": 458.8353576660156, "epoch": 0.6989720998531571, "fcm_dpo/beta": 0.001139120664447546, "fcm_dpo/delta": 0.019377058371901512, "fcm_dpo/margin": 334.79010009765625, "fcm_dpo/q_t": 0.4132460355758667, "grad_norm": 55.413394927978516, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.9340738654136658, "logits/rejected": -0.9469287991523743, "logps/chosen": -626.66943359375, "logps/ref_chosen": -42.94938278198242, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -992.2203979492188, "loss": 1.1033, "margin_dpo/margin_mean": 334.79010009765625, "margin_dpo/margin_std": 459.30084228515625, "step": 476 }, { "KL/chosen_KL_mean": -626.4135131835938, "KL/mean": -790.4285888671875, "KL/rejected_KL_mean": -954.4437255859375, "KL/std": 489.424560546875, "epoch": 0.7004405286343612, "fcm_dpo/beta": 0.0011485903523862362, "fcm_dpo/delta": 0.023847589269280434, "fcm_dpo/margin": 328.0302734375, "fcm_dpo/q_t": 0.41430675983428955, "grad_norm": 35.95357131958008, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.9906863570213318, "logits/rejected": -0.9593477845191956, "logps/chosen": -697.1861572265625, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -1030.5811767578125, "loss": 1.1429, "margin_dpo/margin_mean": 328.0302429199219, "margin_dpo/margin_std": 566.1884155273438, "step": 477 }, { "KL/chosen_KL_mean": -511.62579345703125, "KL/mean": -708.7156982421875, "KL/rejected_KL_mean": -905.8055419921875, "KL/std": 440.7845458984375, "epoch": 0.7019089574155654, "fcm_dpo/beta": 0.0011402592062950134, "fcm_dpo/delta": -0.05179014056921005, "fcm_dpo/margin": 394.17974853515625, "fcm_dpo/q_t": 0.39800071716308594, "grad_norm": 40.23908615112305, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.8462599515914917, "logits/rejected": -0.871573805809021, "logps/chosen": -553.0662841796875, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -991.1675415039062, "loss": 1.0683, "margin_dpo/margin_mean": 394.1797790527344, "margin_dpo/margin_std": 522.7603759765625, "step": 478 }, { "KL/chosen_KL_mean": -657.9866943359375, "KL/mean": -846.1517333984375, "KL/rejected_KL_mean": -1034.316650390625, "KL/std": 541.3948974609375, "epoch": 0.7033773861967695, "fcm_dpo/beta": 0.0011373090092092752, "fcm_dpo/delta": -0.030031614005565643, "fcm_dpo/margin": 376.3300476074219, "fcm_dpo/q_t": 0.40709632635116577, "grad_norm": 29.177635192871094, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.9058327674865723, "logits/rejected": -0.9377299547195435, "logps/chosen": -711.8945922851562, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -1129.43310546875, "loss": 1.1171, "margin_dpo/margin_mean": 376.3300476074219, "margin_dpo/margin_std": 624.73388671875, "step": 479 }, { "KL/chosen_KL_mean": -804.2739868164062, "KL/mean": -948.6687622070312, "KL/rejected_KL_mean": -1093.0634765625, "KL/std": 507.27008056640625, "epoch": 0.7048458149779736, "fcm_dpo/beta": 0.0011228574439883232, "fcm_dpo/delta": -0.042888376861810684, "fcm_dpo/margin": 288.78948974609375, "fcm_dpo/q_t": 0.4276391863822937, "grad_norm": 56.077796936035156, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.9502737522125244, "logits/rejected": -0.9440046548843384, "logps/chosen": -862.9566650390625, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -1175.9959716796875, "loss": 1.188, "margin_dpo/margin_mean": 288.78948974609375, "margin_dpo/margin_std": 554.4617309570312, "step": 480 }, { "KL/chosen_KL_mean": -672.9927368164062, "KL/mean": -899.4321899414062, "KL/rejected_KL_mean": -1125.87158203125, "KL/std": 524.7504272460938, "epoch": 0.7063142437591777, "fcm_dpo/beta": 0.0011024028062820435, "fcm_dpo/delta": -0.10436421632766724, "fcm_dpo/margin": 452.87890625, "fcm_dpo/q_t": 0.38820528984069824, "grad_norm": 37.83536148071289, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.9408276081085205, "logits/rejected": -0.9681203365325928, "logps/chosen": -727.95703125, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -1218.2919921875, "loss": 1.0354, "margin_dpo/margin_mean": 452.87890625, "margin_dpo/margin_std": 566.91455078125, "step": 481 }, { "KL/chosen_KL_mean": -738.0073852539062, "KL/mean": -845.139892578125, "KL/rejected_KL_mean": -952.2725219726562, "KL/std": 542.2655029296875, "epoch": 0.7077826725403817, "fcm_dpo/beta": 0.0011032463517040014, "fcm_dpo/delta": 0.07522930204868317, "fcm_dpo/margin": 214.26512145996094, "fcm_dpo/q_t": 0.44445592164993286, "grad_norm": 56.61158752441406, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.8482377529144287, "logits/rejected": -0.8390638828277588, "logps/chosen": -805.560791015625, "logps/ref_chosen": -67.553466796875, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -1039.862060546875, "loss": 1.2751, "margin_dpo/margin_mean": 214.26513671875, "margin_dpo/margin_std": 631.7608642578125, "step": 482 }, { "KL/chosen_KL_mean": -640.1693115234375, "KL/mean": -879.429931640625, "KL/rejected_KL_mean": -1118.690673828125, "KL/std": 530.899658203125, "epoch": 0.7092511013215859, "fcm_dpo/beta": 0.0010884404182434082, "fcm_dpo/delta": -0.1278223842382431, "fcm_dpo/margin": 478.5213928222656, "fcm_dpo/q_t": 0.38443121314048767, "grad_norm": 34.9405517578125, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.9293410778045654, "logits/rejected": -0.9389553070068359, "logps/chosen": -703.4990844726562, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -1214.4775390625, "loss": 1.0221, "margin_dpo/margin_mean": 478.5213623046875, "margin_dpo/margin_std": 590.3174438476562, "step": 483 }, { "KL/chosen_KL_mean": -548.8181762695312, "KL/mean": -766.58642578125, "KL/rejected_KL_mean": -984.354736328125, "KL/std": 516.6387939453125, "epoch": 0.71071953010279, "fcm_dpo/beta": 0.001076672924682498, "fcm_dpo/delta": -0.0724029541015625, "fcm_dpo/margin": 435.53656005859375, "fcm_dpo/q_t": 0.39384713768959045, "grad_norm": 53.70915985107422, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.9350720643997192, "logits/rejected": -0.9612249135971069, "logps/chosen": -607.956298828125, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -1068.7261962890625, "loss": 1.0454, "margin_dpo/margin_mean": 435.53656005859375, "margin_dpo/margin_std": 524.6602783203125, "step": 484 }, { "KL/chosen_KL_mean": -576.3857421875, "KL/mean": -781.9747314453125, "KL/rejected_KL_mean": -987.5636596679688, "KL/std": 519.3931884765625, "epoch": 0.7121879588839941, "fcm_dpo/beta": 0.001064480864442885, "fcm_dpo/delta": -0.039454929530620575, "fcm_dpo/margin": 411.17791748046875, "fcm_dpo/q_t": 0.4025202989578247, "grad_norm": 35.44499588012695, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.8758097887039185, "logits/rejected": -0.8995819091796875, "logps/chosen": -635.2353515625, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408233642578, "logps/rejected": -1090.927734375, "loss": 1.0858, "margin_dpo/margin_mean": 411.17791748046875, "margin_dpo/margin_std": 591.315185546875, "step": 485 }, { "KL/chosen_KL_mean": -663.7813720703125, "KL/mean": -897.005126953125, "KL/rejected_KL_mean": -1130.22900390625, "KL/std": 587.8470458984375, "epoch": 0.7136563876651982, "fcm_dpo/beta": 0.0010440791957080364, "fcm_dpo/delta": -0.09158313274383545, "fcm_dpo/margin": 466.44757080078125, "fcm_dpo/q_t": 0.39278823137283325, "grad_norm": 38.644596099853516, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.9534709453582764, "logits/rejected": -0.9716538786888123, "logps/chosen": -719.041015625, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -1222.368408203125, "loss": 1.0796, "margin_dpo/margin_mean": 466.4476013183594, "margin_dpo/margin_std": 711.1044311523438, "step": 486 }, { "KL/chosen_KL_mean": -689.0608520507812, "KL/mean": -862.2830200195312, "KL/rejected_KL_mean": -1035.505126953125, "KL/std": 522.194580078125, "epoch": 0.7151248164464024, "fcm_dpo/beta": 0.001049531390890479, "fcm_dpo/delta": 0.03742973506450653, "fcm_dpo/margin": 346.44439697265625, "fcm_dpo/q_t": 0.4152664542198181, "grad_norm": 34.07633590698242, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -0.9346251487731934, "logits/rejected": -0.9410355091094971, "logps/chosen": -742.1241455078125, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.41883087158203, "logps/rejected": -1127.924072265625, "loss": 1.1294, "margin_dpo/margin_mean": 346.44439697265625, "margin_dpo/margin_std": 535.1236572265625, "step": 487 }, { "KL/chosen_KL_mean": -562.1060791015625, "KL/mean": -746.0523681640625, "KL/rejected_KL_mean": -929.9986572265625, "KL/std": 474.5586853027344, "epoch": 0.7165932452276065, "fcm_dpo/beta": 0.0010536068584769964, "fcm_dpo/delta": 0.012613944709300995, "fcm_dpo/margin": 367.89251708984375, "fcm_dpo/q_t": 0.4102671444416046, "grad_norm": 30.069625854492188, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.8993455767631531, "logits/rejected": -0.9078420400619507, "logps/chosen": -614.334228515625, "logps/ref_chosen": -52.22815704345703, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -1014.0052490234375, "loss": 1.0911, "margin_dpo/margin_mean": 367.89251708984375, "margin_dpo/margin_std": 458.40264892578125, "step": 488 }, { "KL/chosen_KL_mean": -515.2543334960938, "KL/mean": -693.9641723632812, "KL/rejected_KL_mean": -872.674072265625, "KL/std": 473.46533203125, "epoch": 0.7180616740088106, "fcm_dpo/beta": 0.0010581349488347769, "fcm_dpo/delta": 0.02230164408683777, "fcm_dpo/margin": 357.4197082519531, "fcm_dpo/q_t": 0.41536301374435425, "grad_norm": 27.20409393310547, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.8895210027694702, "logits/rejected": -0.8903396725654602, "logps/chosen": -571.2440185546875, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39812469482422, "logps/rejected": -952.0721435546875, "loss": 1.1131, "margin_dpo/margin_mean": 357.419677734375, "margin_dpo/margin_std": 521.9287719726562, "step": 489 }, { "KL/chosen_KL_mean": -629.0771484375, "KL/mean": -812.7498779296875, "KL/rejected_KL_mean": -996.4226684570312, "KL/std": 568.397705078125, "epoch": 0.7195301027900147, "fcm_dpo/beta": 0.001059696776792407, "fcm_dpo/delta": 0.011018646880984306, "fcm_dpo/margin": 367.3455505371094, "fcm_dpo/q_t": 0.4139998257160187, "grad_norm": 36.384334564208984, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.8977552652359009, "logits/rejected": -0.9355182647705078, "logps/chosen": -681.4434814453125, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.4090576171875, "logps/rejected": -1106.831787109375, "loss": 1.1447, "margin_dpo/margin_mean": 367.3455505371094, "margin_dpo/margin_std": 653.3365478515625, "step": 490 }, { "KL/chosen_KL_mean": -569.136474609375, "KL/mean": -696.23974609375, "KL/rejected_KL_mean": -823.343017578125, "KL/std": 484.46240234375, "epoch": 0.7209985315712188, "fcm_dpo/beta": 0.0010794580448418856, "fcm_dpo/delta": 0.1289866715669632, "fcm_dpo/margin": 254.20654296875, "fcm_dpo/q_t": 0.4374847710132599, "grad_norm": 30.11342430114746, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.8976389169692993, "logits/rejected": -0.8920071125030518, "logps/chosen": -629.2527465820312, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -896.6157836914062, "loss": 1.1925, "margin_dpo/margin_mean": 254.20654296875, "margin_dpo/margin_std": 475.316162109375, "step": 491 }, { "KL/chosen_KL_mean": -585.7317504882812, "KL/mean": -709.7108154296875, "KL/rejected_KL_mean": -833.68994140625, "KL/std": 479.278076171875, "epoch": 0.7224669603524229, "fcm_dpo/beta": 0.0011044761631637812, "fcm_dpo/delta": 0.1298675835132599, "fcm_dpo/margin": 247.95816040039062, "fcm_dpo/q_t": 0.4389011859893799, "grad_norm": 42.94180679321289, "learning_rate": 1.097764975115576e-07, "logits/chosen": -0.9602404832839966, "logits/rejected": -0.9433440566062927, "logps/chosen": -639.7259521484375, "logps/ref_chosen": -53.994178771972656, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -906.349609375, "loss": 1.2187, "margin_dpo/margin_mean": 247.95819091796875, "margin_dpo/margin_std": 550.007080078125, "step": 492 }, { "KL/chosen_KL_mean": -619.7557373046875, "KL/mean": -758.3751220703125, "KL/rejected_KL_mean": -896.9945068359375, "KL/std": 526.8731689453125, "epoch": 0.723935389133627, "fcm_dpo/beta": 0.0011116546811535954, "fcm_dpo/delta": -0.010466049425303936, "fcm_dpo/margin": 277.23876953125, "fcm_dpo/q_t": 0.42805489897727966, "grad_norm": 33.16301727294922, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -0.9904724359512329, "logits/rejected": -0.9639154076576233, "logps/chosen": -695.2529296875, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -984.3175048828125, "loss": 1.1813, "margin_dpo/margin_mean": 277.2387390136719, "margin_dpo/margin_std": 518.248046875, "step": 493 }, { "KL/chosen_KL_mean": -485.149658203125, "KL/mean": -703.2138671875, "KL/rejected_KL_mean": -921.278076171875, "KL/std": 474.62786865234375, "epoch": 0.7254038179148311, "fcm_dpo/beta": 0.0010987753048539162, "fcm_dpo/delta": -0.08314534276723862, "fcm_dpo/margin": 436.12835693359375, "fcm_dpo/q_t": 0.3895169794559479, "grad_norm": 45.52617263793945, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.8565849661827087, "logits/rejected": -0.8936357498168945, "logps/chosen": -526.5089111328125, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -1007.369384765625, "loss": 1.0279, "margin_dpo/margin_mean": 436.12835693359375, "margin_dpo/margin_std": 479.73455810546875, "step": 494 }, { "KL/chosen_KL_mean": -545.6207275390625, "KL/mean": -735.65185546875, "KL/rejected_KL_mean": -925.6829833984375, "KL/std": 484.06903076171875, "epoch": 0.7268722466960352, "fcm_dpo/beta": 0.0010912488214671612, "fcm_dpo/delta": -0.015377325937151909, "fcm_dpo/margin": 380.0621337890625, "fcm_dpo/q_t": 0.40759721398353577, "grad_norm": 32.46592330932617, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -0.9482539892196655, "logits/rejected": -0.9554197192192078, "logps/chosen": -609.1558227539062, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -1017.1073608398438, "loss": 1.0985, "margin_dpo/margin_mean": 380.0621337890625, "margin_dpo/margin_std": 567.2548217773438, "step": 495 }, { "KL/chosen_KL_mean": -678.7178955078125, "KL/mean": -781.5296020507812, "KL/rejected_KL_mean": -884.3413696289062, "KL/std": 392.3121337890625, "epoch": 0.7283406754772394, "fcm_dpo/beta": 0.0011189571814611554, "fcm_dpo/delta": 0.1738756000995636, "fcm_dpo/margin": 205.62347412109375, "fcm_dpo/q_t": 0.44650715589523315, "grad_norm": 76.83142852783203, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.9424889087677002, "logits/rejected": -0.9166613817214966, "logps/chosen": -751.309814453125, "logps/ref_chosen": -72.5919189453125, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -968.6707153320312, "loss": 1.2282, "margin_dpo/margin_mean": 205.62347412109375, "margin_dpo/margin_std": 438.72454833984375, "step": 496 }, { "KL/chosen_KL_mean": -631.7765502929688, "KL/mean": -734.545166015625, "KL/rejected_KL_mean": -837.3136596679688, "KL/std": 477.0352478027344, "epoch": 0.7298091042584435, "fcm_dpo/beta": 0.0011342904763296247, "fcm_dpo/delta": 0.02543473243713379, "fcm_dpo/margin": 205.537109375, "fcm_dpo/q_t": 0.4462537467479706, "grad_norm": 40.14469528198242, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.9075003266334534, "logits/rejected": -0.8960117101669312, "logps/chosen": -690.3705444335938, "logps/ref_chosen": -58.59397506713867, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -913.60205078125, "loss": 1.2322, "margin_dpo/margin_mean": 205.537109375, "margin_dpo/margin_std": 450.22613525390625, "step": 497 }, { "KL/chosen_KL_mean": -571.6414794921875, "KL/mean": -741.5089111328125, "KL/rejected_KL_mean": -911.3762817382812, "KL/std": 496.2296142578125, "epoch": 0.7312775330396476, "fcm_dpo/beta": 0.0011364180827513337, "fcm_dpo/delta": 0.014480667188763618, "fcm_dpo/margin": 339.73480224609375, "fcm_dpo/q_t": 0.4120427668094635, "grad_norm": 27.699867248535156, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.8944777250289917, "logits/rejected": -0.8845921754837036, "logps/chosen": -642.84716796875, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -995.3343505859375, "loss": 1.1291, "margin_dpo/margin_mean": 339.73480224609375, "margin_dpo/margin_std": 556.2047119140625, "step": 498 }, { "KL/chosen_KL_mean": -502.248779296875, "KL/mean": -700.484375, "KL/rejected_KL_mean": -898.719970703125, "KL/std": 475.0845947265625, "epoch": 0.7327459618208517, "fcm_dpo/beta": 0.0011275302385911345, "fcm_dpo/delta": -0.04935740679502487, "fcm_dpo/margin": 396.4712829589844, "fcm_dpo/q_t": 0.39803507924079895, "grad_norm": 35.038902282714844, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -0.9036816954612732, "logits/rejected": -0.9378571510314941, "logps/chosen": -553.5039672851562, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -999.7987060546875, "loss": 1.0675, "margin_dpo/margin_mean": 396.47125244140625, "margin_dpo/margin_std": 522.6738891601562, "step": 499 }, { "KL/chosen_KL_mean": -610.1287841796875, "KL/mean": -780.4580078125, "KL/rejected_KL_mean": -950.7872314453125, "KL/std": 427.2696228027344, "epoch": 0.7342143906020558, "fcm_dpo/beta": 0.0011279778555035591, "fcm_dpo/delta": 0.01636883243918419, "fcm_dpo/margin": 340.65838623046875, "fcm_dpo/q_t": 0.4120955467224121, "grad_norm": 33.4288215637207, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.9120993614196777, "logits/rejected": -0.9445118308067322, "logps/chosen": -667.15625, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -1044.721435546875, "loss": 1.1223, "margin_dpo/margin_mean": 340.65838623046875, "margin_dpo/margin_std": 526.3189697265625, "step": 500 }, { "KL/chosen_KL_mean": -524.825439453125, "KL/mean": -709.2818603515625, "KL/rejected_KL_mean": -893.73828125, "KL/std": 459.21185302734375, "epoch": 0.73568281938326, "fcm_dpo/beta": 0.001129691954702139, "fcm_dpo/delta": -0.017550457268953323, "fcm_dpo/margin": 368.912841796875, "fcm_dpo/q_t": 0.40666812658309937, "grad_norm": 29.30938720703125, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.8794831037521362, "logits/rejected": -0.8698313236236572, "logps/chosen": -579.1849365234375, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670013427734, "logps/rejected": -973.8949584960938, "loss": 1.1026, "margin_dpo/margin_mean": 368.912841796875, "margin_dpo/margin_std": 553.5332641601562, "step": 501 }, { "KL/chosen_KL_mean": -475.44189453125, "KL/mean": -656.1715087890625, "KL/rejected_KL_mean": -836.9010009765625, "KL/std": 405.69573974609375, "epoch": 0.737151248164464, "fcm_dpo/beta": 0.0011267581721767783, "fcm_dpo/delta": -0.007630977779626846, "fcm_dpo/margin": 361.4590759277344, "fcm_dpo/q_t": 0.4055173695087433, "grad_norm": 29.00743865966797, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.7667361497879028, "logits/rejected": -0.7510417699813843, "logps/chosen": -543.0424194335938, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -919.8497924804688, "loss": 1.071, "margin_dpo/margin_mean": 361.4591064453125, "margin_dpo/margin_std": 415.29315185546875, "step": 502 }, { "KL/chosen_KL_mean": -481.5869140625, "KL/mean": -647.6417236328125, "KL/rejected_KL_mean": -813.696533203125, "KL/std": 403.74560546875, "epoch": 0.7386196769456681, "fcm_dpo/beta": 0.001126825693063438, "fcm_dpo/delta": 0.026751546189188957, "fcm_dpo/margin": 332.1096496582031, "fcm_dpo/q_t": 0.41448622941970825, "grad_norm": 27.742767333984375, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.794667661190033, "logits/rejected": -0.7808655500411987, "logps/chosen": -536.665283203125, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -896.2020263671875, "loss": 1.1014, "margin_dpo/margin_mean": 332.10968017578125, "margin_dpo/margin_std": 432.3536376953125, "step": 503 }, { "KL/chosen_KL_mean": -538.2932739257812, "KL/mean": -737.7694702148438, "KL/rejected_KL_mean": -937.2457275390625, "KL/std": 478.7033996582031, "epoch": 0.7400881057268722, "fcm_dpo/beta": 0.0011236823629587889, "fcm_dpo/delta": -0.050532855093479156, "fcm_dpo/margin": 398.95245361328125, "fcm_dpo/q_t": 0.39904850721359253, "grad_norm": 32.015926361083984, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.8786238431930542, "logits/rejected": -0.9033294320106506, "logps/chosen": -598.259033203125, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76212310791016, "logps/rejected": -1041.0078125, "loss": 1.0754, "margin_dpo/margin_mean": 398.95245361328125, "margin_dpo/margin_std": 538.3695678710938, "step": 504 }, { "KL/chosen_KL_mean": -600.709716796875, "KL/mean": -710.2824096679688, "KL/rejected_KL_mean": -819.8551025390625, "KL/std": 460.1452331542969, "epoch": 0.7415565345080763, "fcm_dpo/beta": 0.0011460301466286182, "fcm_dpo/delta": 0.1525171399116516, "fcm_dpo/margin": 219.1453094482422, "fcm_dpo/q_t": 0.4422228932380676, "grad_norm": 34.641334533691406, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.8766049742698669, "logits/rejected": -0.8530220985412598, "logps/chosen": -676.8646240234375, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -908.4404296875, "loss": 1.2118, "margin_dpo/margin_mean": 219.14532470703125, "margin_dpo/margin_std": 433.828125, "step": 505 }, { "KL/chosen_KL_mean": -490.38372802734375, "KL/mean": -671.2208251953125, "KL/rejected_KL_mean": -852.057861328125, "KL/std": 451.6136169433594, "epoch": 0.7430249632892805, "fcm_dpo/beta": 0.0011465998832136393, "fcm_dpo/delta": -0.015700122341513634, "fcm_dpo/margin": 361.67413330078125, "fcm_dpo/q_t": 0.4050843119621277, "grad_norm": 36.54334259033203, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.8800439834594727, "logits/rejected": -0.8960914611816406, "logps/chosen": -539.34423828125, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -930.472900390625, "loss": 1.0883, "margin_dpo/margin_mean": 361.67413330078125, "margin_dpo/margin_std": 490.28369140625, "step": 506 }, { "KL/chosen_KL_mean": -584.2061767578125, "KL/mean": -741.224853515625, "KL/rejected_KL_mean": -898.2435913085938, "KL/std": 549.9444580078125, "epoch": 0.7444933920704846, "fcm_dpo/beta": 0.0011556025128811598, "fcm_dpo/delta": 0.038512568920850754, "fcm_dpo/margin": 314.0374755859375, "fcm_dpo/q_t": 0.42233383655548096, "grad_norm": 33.112762451171875, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.9117947816848755, "logits/rejected": -0.918329119682312, "logps/chosen": -651.0076293945312, "logps/ref_chosen": -66.80149841308594, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -993.616455078125, "loss": 1.162, "margin_dpo/margin_mean": 314.0374755859375, "margin_dpo/margin_std": 601.7278442382812, "step": 507 }, { "KL/chosen_KL_mean": -628.045166015625, "KL/mean": -761.368896484375, "KL/rejected_KL_mean": -894.692626953125, "KL/std": 525.7574462890625, "epoch": 0.7459618208516887, "fcm_dpo/beta": 0.0011775526218116283, "fcm_dpo/delta": 0.08841653168201447, "fcm_dpo/margin": 266.6474609375, "fcm_dpo/q_t": 0.43183645606040955, "grad_norm": 40.246891021728516, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.8933985233306885, "logits/rejected": -0.8938655853271484, "logps/chosen": -699.3486328125, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -990.3201904296875, "loss": 1.2157, "margin_dpo/margin_mean": 266.6474609375, "margin_dpo/margin_std": 611.8427734375, "step": 508 }, { "KL/chosen_KL_mean": -462.01898193359375, "KL/mean": -608.3475952148438, "KL/rejected_KL_mean": -754.6761474609375, "KL/std": 369.74627685546875, "epoch": 0.7474302496328928, "fcm_dpo/beta": 0.0011943180579692125, "fcm_dpo/delta": 0.051631003618240356, "fcm_dpo/margin": 292.65716552734375, "fcm_dpo/q_t": 0.4205209016799927, "grad_norm": 31.764202117919922, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.8035761117935181, "logits/rejected": -0.7988163232803345, "logps/chosen": -525.8379516601562, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -837.9326171875, "loss": 1.1367, "margin_dpo/margin_mean": 292.65716552734375, "margin_dpo/margin_std": 450.46405029296875, "step": 509 }, { "KL/chosen_KL_mean": -559.66650390625, "KL/mean": -753.0732421875, "KL/rejected_KL_mean": -946.47998046875, "KL/std": 431.13751220703125, "epoch": 0.748898678414097, "fcm_dpo/beta": 0.0011832050513476133, "fcm_dpo/delta": -0.060402024537324905, "fcm_dpo/margin": 386.8134460449219, "fcm_dpo/q_t": 0.39339399337768555, "grad_norm": 38.86001205444336, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.7946321964263916, "logits/rejected": -0.8168176412582397, "logps/chosen": -611.544921875, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -1049.2451171875, "loss": 1.039, "margin_dpo/margin_mean": 386.8134765625, "margin_dpo/margin_std": 423.6184997558594, "step": 510 }, { "KL/chosen_KL_mean": -525.6728515625, "KL/mean": -678.17822265625, "KL/rejected_KL_mean": -830.68359375, "KL/std": 464.34112548828125, "epoch": 0.750367107195301, "fcm_dpo/beta": 0.0011853575706481934, "fcm_dpo/delta": 0.03986484557390213, "fcm_dpo/margin": 305.0107727050781, "fcm_dpo/q_t": 0.4183180034160614, "grad_norm": 41.20817565917969, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.9006566405296326, "logits/rejected": -0.9088428616523743, "logps/chosen": -585.9109497070312, "logps/ref_chosen": -60.23811721801758, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -923.5404052734375, "loss": 1.1564, "margin_dpo/margin_mean": 305.0107421875, "margin_dpo/margin_std": 559.034423828125, "step": 511 }, { "KL/chosen_KL_mean": -420.6644592285156, "KL/mean": -627.825439453125, "KL/rejected_KL_mean": -834.9864501953125, "KL/std": 417.6683349609375, "epoch": 0.7518355359765051, "fcm_dpo/beta": 0.0011696910951286554, "fcm_dpo/delta": -0.08911710977554321, "fcm_dpo/margin": 414.32196044921875, "fcm_dpo/q_t": 0.387323796749115, "grad_norm": 55.33091735839844, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.8243488073348999, "logits/rejected": -0.8429218530654907, "logps/chosen": -475.5699462890625, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -916.8623046875, "loss": 1.0149, "margin_dpo/margin_mean": 414.32196044921875, "margin_dpo/margin_std": 421.341796875, "step": 512 }, { "KL/chosen_KL_mean": -542.2540283203125, "KL/mean": -693.2520751953125, "KL/rejected_KL_mean": -844.2501220703125, "KL/std": 411.43499755859375, "epoch": 0.7533039647577092, "fcm_dpo/beta": 0.001167251612059772, "fcm_dpo/delta": 0.04872651398181915, "fcm_dpo/margin": 301.99615478515625, "fcm_dpo/q_t": 0.42014437913894653, "grad_norm": 42.702476501464844, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.8896423578262329, "logits/rejected": -0.8757469654083252, "logps/chosen": -607.17041015625, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -920.3125610351562, "loss": 1.1408, "margin_dpo/margin_mean": 301.9961242675781, "margin_dpo/margin_std": 475.26885986328125, "step": 513 }, { "KL/chosen_KL_mean": -558.1743774414062, "KL/mean": -709.8671875, "KL/rejected_KL_mean": -861.5599365234375, "KL/std": 448.15179443359375, "epoch": 0.7547723935389133, "fcm_dpo/beta": 0.0011872373288497329, "fcm_dpo/delta": 0.04112107306718826, "fcm_dpo/margin": 303.3856201171875, "fcm_dpo/q_t": 0.4199420213699341, "grad_norm": 35.660560607910156, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.8495243191719055, "logits/rejected": -0.8417561054229736, "logps/chosen": -632.4039306640625, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -940.5054931640625, "loss": 1.1399, "margin_dpo/margin_mean": 303.3856201171875, "margin_dpo/margin_std": 503.74169921875, "step": 514 }, { "KL/chosen_KL_mean": -477.5955505371094, "KL/mean": -589.99072265625, "KL/rejected_KL_mean": -702.3858642578125, "KL/std": 382.209716796875, "epoch": 0.7562408223201175, "fcm_dpo/beta": 0.0012109719682484865, "fcm_dpo/delta": 0.1313389241695404, "fcm_dpo/margin": 224.79022216796875, "fcm_dpo/q_t": 0.4361518621444702, "grad_norm": 48.169334411621094, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.7953609228134155, "logits/rejected": -0.8050397634506226, "logps/chosen": -527.9971313476562, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -789.4835815429688, "loss": 1.201, "margin_dpo/margin_mean": 224.79022216796875, "margin_dpo/margin_std": 446.3857421875, "step": 515 }, { "KL/chosen_KL_mean": -504.6857604980469, "KL/mean": -675.954833984375, "KL/rejected_KL_mean": -847.2237548828125, "KL/std": 428.9405212402344, "epoch": 0.7577092511013216, "fcm_dpo/beta": 0.001219091354869306, "fcm_dpo/delta": -0.01847529225051403, "fcm_dpo/margin": 342.53802490234375, "fcm_dpo/q_t": 0.40270549058914185, "grad_norm": 45.79306411743164, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.883423924446106, "logits/rejected": -0.9122099876403809, "logps/chosen": -569.3353271484375, "logps/ref_chosen": -64.64956665039062, "logps/ref_rejected": -111.72237396240234, "logps/rejected": -958.9461669921875, "loss": 1.0738, "margin_dpo/margin_mean": 342.53802490234375, "margin_dpo/margin_std": 418.29913330078125, "step": 516 }, { "KL/chosen_KL_mean": -534.3342895507812, "KL/mean": -731.6787109375, "KL/rejected_KL_mean": -929.0232543945312, "KL/std": 476.9851989746094, "epoch": 0.7591776798825257, "fcm_dpo/beta": 0.0011984179727733135, "fcm_dpo/delta": -0.07682677358388901, "fcm_dpo/margin": 394.6889343261719, "fcm_dpo/q_t": 0.395224928855896, "grad_norm": 29.204376220703125, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.830208420753479, "logits/rejected": -0.8261853456497192, "logps/chosen": -595.247802734375, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -1018.1063232421875, "loss": 1.0572, "margin_dpo/margin_mean": 394.6889343261719, "margin_dpo/margin_std": 530.7623291015625, "step": 517 }, { "KL/chosen_KL_mean": -472.4967041015625, "KL/mean": -634.7481689453125, "KL/rejected_KL_mean": -796.99951171875, "KL/std": 382.7030944824219, "epoch": 0.7606461086637298, "fcm_dpo/beta": 0.0011941856937482953, "fcm_dpo/delta": 0.012808417901396751, "fcm_dpo/margin": 324.5027770996094, "fcm_dpo/q_t": 0.4095669090747833, "grad_norm": 62.06501007080078, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.8725818395614624, "logits/rejected": -0.8878906965255737, "logps/chosen": -529.95263671875, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -882.312255859375, "loss": 1.0945, "margin_dpo/margin_mean": 324.5028076171875, "margin_dpo/margin_std": 413.08428955078125, "step": 518 }, { "KL/chosen_KL_mean": -474.32818603515625, "KL/mean": -593.5274047851562, "KL/rejected_KL_mean": -712.7265625, "KL/std": 341.5205078125, "epoch": 0.762114537444934, "fcm_dpo/beta": 0.0012234165333211422, "fcm_dpo/delta": 0.11090720444917679, "fcm_dpo/margin": 238.39837646484375, "fcm_dpo/q_t": 0.4324970841407776, "grad_norm": 33.864437103271484, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.8436448574066162, "logits/rejected": -0.8289774060249329, "logps/chosen": -548.3914794921875, "logps/ref_chosen": -74.06331634521484, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -817.1707763671875, "loss": 1.1817, "margin_dpo/margin_mean": 238.39837646484375, "margin_dpo/margin_std": 425.4404296875, "step": 519 }, { "KL/chosen_KL_mean": -510.67193603515625, "KL/mean": -660.7890625, "KL/rejected_KL_mean": -810.9061889648438, "KL/std": 406.78131103515625, "epoch": 0.7635829662261381, "fcm_dpo/beta": 0.001228465000167489, "fcm_dpo/delta": 0.0323098823428154, "fcm_dpo/margin": 300.2342529296875, "fcm_dpo/q_t": 0.4170858561992645, "grad_norm": 38.63158416748047, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.8047879934310913, "logits/rejected": -0.8105298280715942, "logps/chosen": -580.9718017578125, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -910.8875732421875, "loss": 1.126, "margin_dpo/margin_mean": 300.2342529296875, "margin_dpo/margin_std": 449.99859619140625, "step": 520 }, { "KL/chosen_KL_mean": -450.53228759765625, "KL/mean": -614.1043090820312, "KL/rejected_KL_mean": -777.6763305664062, "KL/std": 415.71075439453125, "epoch": 0.7650513950073421, "fcm_dpo/beta": 0.0012406650930643082, "fcm_dpo/delta": -0.006974354386329651, "fcm_dpo/margin": 327.14410400390625, "fcm_dpo/q_t": 0.4070153832435608, "grad_norm": 31.36321258544922, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.8560887575149536, "logits/rejected": -0.8592597246170044, "logps/chosen": -508.67523193359375, "logps/ref_chosen": -58.14292526245117, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -860.9569091796875, "loss": 1.0889, "margin_dpo/margin_mean": 327.1440734863281, "margin_dpo/margin_std": 424.02587890625, "step": 521 }, { "KL/chosen_KL_mean": -513.3594970703125, "KL/mean": -659.28515625, "KL/rejected_KL_mean": -805.2109375, "KL/std": 429.93572998046875, "epoch": 0.7665198237885462, "fcm_dpo/beta": 0.001239138189703226, "fcm_dpo/delta": 0.03980087861418724, "fcm_dpo/margin": 291.8514404296875, "fcm_dpo/q_t": 0.41743797063827515, "grad_norm": 32.587318420410156, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.8581516146659851, "logits/rejected": -0.8615491390228271, "logps/chosen": -559.9071655273438, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -871.224853515625, "loss": 1.1325, "margin_dpo/margin_mean": 291.8514404296875, "margin_dpo/margin_std": 456.8363037109375, "step": 522 }, { "KL/chosen_KL_mean": -540.905517578125, "KL/mean": -699.052490234375, "KL/rejected_KL_mean": -857.1994018554688, "KL/std": 499.2489929199219, "epoch": 0.7679882525697503, "fcm_dpo/beta": 0.0012422900181263685, "fcm_dpo/delta": 0.007354713976383209, "fcm_dpo/margin": 316.2938232421875, "fcm_dpo/q_t": 0.4125151038169861, "grad_norm": 36.84627151489258, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.8944802284240723, "logits/rejected": -0.8932949304580688, "logps/chosen": -602.6751708984375, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -940.9608154296875, "loss": 1.1311, "margin_dpo/margin_mean": 316.2938232421875, "margin_dpo/margin_std": 531.6959228515625, "step": 523 }, { "KL/chosen_KL_mean": -534.9653930664062, "KL/mean": -700.6502075195312, "KL/rejected_KL_mean": -866.3349609375, "KL/std": 458.77227783203125, "epoch": 0.7694566813509545, "fcm_dpo/beta": 0.0012349834432825446, "fcm_dpo/delta": -0.010303705930709839, "fcm_dpo/margin": 331.36962890625, "fcm_dpo/q_t": 0.4045429229736328, "grad_norm": 39.90791702270508, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.9295982122421265, "logits/rejected": -0.914442777633667, "logps/chosen": -613.0374755859375, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -947.636962890625, "loss": 1.1012, "margin_dpo/margin_mean": 331.36962890625, "margin_dpo/margin_std": 464.6836853027344, "step": 524 }, { "KL/chosen_KL_mean": -519.85302734375, "KL/mean": -731.243408203125, "KL/rejected_KL_mean": -942.6337890625, "KL/std": 482.66436767578125, "epoch": 0.7709251101321586, "fcm_dpo/beta": 0.0012153794523328543, "fcm_dpo/delta": -0.12056128680706024, "fcm_dpo/margin": 422.78076171875, "fcm_dpo/q_t": 0.3845774531364441, "grad_norm": 41.53816604614258, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.8437707424163818, "logits/rejected": -0.8764776587486267, "logps/chosen": -570.680908203125, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05294036865234, "logps/rejected": -1042.686767578125, "loss": 1.0215, "margin_dpo/margin_mean": 422.78076171875, "margin_dpo/margin_std": 505.25018310546875, "step": 525 }, { "KL/chosen_KL_mean": -562.0816650390625, "KL/mean": -768.6236572265625, "KL/rejected_KL_mean": -975.165771484375, "KL/std": 466.148193359375, "epoch": 0.7723935389133627, "fcm_dpo/beta": 0.001197699224576354, "fcm_dpo/delta": -0.09957602620124817, "fcm_dpo/margin": 413.0840148925781, "fcm_dpo/q_t": 0.38653671741485596, "grad_norm": 29.220232009887695, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.8780766725540161, "logits/rejected": -0.8744189739227295, "logps/chosen": -625.2489013671875, "logps/ref_chosen": -63.167236328125, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -1061.47509765625, "loss": 1.0386, "margin_dpo/margin_mean": 413.083984375, "margin_dpo/margin_std": 507.3397521972656, "step": 526 }, { "KL/chosen_KL_mean": -590.2259521484375, "KL/mean": -731.3975830078125, "KL/rejected_KL_mean": -872.5692138671875, "KL/std": 487.9652099609375, "epoch": 0.7738619676945668, "fcm_dpo/beta": 0.0011968073667958379, "fcm_dpo/delta": 0.06430923938751221, "fcm_dpo/margin": 282.34320068359375, "fcm_dpo/q_t": 0.4214463233947754, "grad_norm": 34.30127716064453, "learning_rate": 7.504749238082414e-08, "logits/chosen": -1.0278353691101074, "logits/rejected": -0.9955443143844604, "logps/chosen": -661.3546142578125, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -950.9118041992188, "loss": 1.1354, "margin_dpo/margin_mean": 282.34320068359375, "margin_dpo/margin_std": 413.549560546875, "step": 527 }, { "KL/chosen_KL_mean": -612.8468017578125, "KL/mean": -792.1258544921875, "KL/rejected_KL_mean": -971.405029296875, "KL/std": 480.26239013671875, "epoch": 0.775330396475771, "fcm_dpo/beta": 0.001196006080135703, "fcm_dpo/delta": -0.030209090560674667, "fcm_dpo/margin": 358.5581970214844, "fcm_dpo/q_t": 0.40584173798561096, "grad_norm": 41.24021530151367, "learning_rate": 7.413308141366254e-08, "logits/chosen": -0.9734677672386169, "logits/rejected": -0.9565155506134033, "logps/chosen": -680.9362182617188, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -1065.3150634765625, "loss": 1.1046, "margin_dpo/margin_mean": 358.5582275390625, "margin_dpo/margin_std": 559.9374389648438, "step": 528 }, { "KL/chosen_KL_mean": -715.942626953125, "KL/mean": -837.968994140625, "KL/rejected_KL_mean": -959.995361328125, "KL/std": 430.2584228515625, "epoch": 0.7767988252569751, "fcm_dpo/beta": 0.001211107592098415, "fcm_dpo/delta": 0.10773831605911255, "fcm_dpo/margin": 244.0526580810547, "fcm_dpo/q_t": 0.4329761266708374, "grad_norm": 45.17340087890625, "learning_rate": 7.322330470336313e-08, "logits/chosen": -0.9782444834709167, "logits/rejected": -0.9867458939552307, "logps/chosen": -771.517578125, "logps/ref_chosen": -55.57495880126953, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -1049.2044677734375, "loss": 1.2136, "margin_dpo/margin_mean": 244.05267333984375, "margin_dpo/margin_std": 532.4437255859375, "step": 529 }, { "KL/chosen_KL_mean": -611.8297119140625, "KL/mean": -803.12841796875, "KL/rejected_KL_mean": -994.427001953125, "KL/std": 524.01220703125, "epoch": 0.7782672540381792, "fcm_dpo/beta": 0.0012108308728784323, "fcm_dpo/delta": -0.06629342585802078, "fcm_dpo/margin": 382.5973205566406, "fcm_dpo/q_t": 0.39872339367866516, "grad_norm": 42.13739776611328, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.9113196730613708, "logits/rejected": -0.9070870876312256, "logps/chosen": -659.43115234375, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -1081.7115478515625, "loss": 1.1077, "margin_dpo/margin_mean": 382.59735107421875, "margin_dpo/margin_std": 640.5091552734375, "step": 530 }, { "KL/chosen_KL_mean": -693.5, "KL/mean": -858.4159545898438, "KL/rejected_KL_mean": -1023.3319091796875, "KL/std": 566.25390625, "epoch": 0.7797356828193832, "fcm_dpo/beta": 0.0012086308561265469, "fcm_dpo/delta": 0.001310013234615326, "fcm_dpo/margin": 329.83197021484375, "fcm_dpo/q_t": 0.4118250906467438, "grad_norm": 42.0710563659668, "learning_rate": 7.141774982445147e-08, "logits/chosen": -1.0113496780395508, "logits/rejected": -0.9929705858230591, "logps/chosen": -748.7460327148438, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -1093.93798828125, "loss": 1.1228, "margin_dpo/margin_mean": 329.83197021484375, "margin_dpo/margin_std": 535.4554443359375, "step": 531 }, { "KL/chosen_KL_mean": -689.9940185546875, "KL/mean": -870.5237426757812, "KL/rejected_KL_mean": -1051.053466796875, "KL/std": 547.52587890625, "epoch": 0.7812041116005873, "fcm_dpo/beta": 0.0011906104627996683, "fcm_dpo/delta": -0.03280455619096756, "fcm_dpo/margin": 361.05938720703125, "fcm_dpo/q_t": 0.4059098958969116, "grad_norm": 65.42340850830078, "learning_rate": 7.052201923388953e-08, "logits/chosen": -0.954756498336792, "logits/rejected": -0.9292545914649963, "logps/chosen": -760.280029296875, "logps/ref_chosen": -70.28601837158203, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -1137.644775390625, "loss": 1.1397, "margin_dpo/margin_mean": 361.05938720703125, "margin_dpo/margin_std": 642.376953125, "step": 532 }, { "KL/chosen_KL_mean": -603.1324462890625, "KL/mean": -732.3844604492188, "KL/rejected_KL_mean": -861.636474609375, "KL/std": 453.2569580078125, "epoch": 0.7826725403817915, "fcm_dpo/beta": 0.0011966102756559849, "fcm_dpo/delta": -0.0130624333396554, "fcm_dpo/margin": 258.5040283203125, "fcm_dpo/q_t": 0.4301643371582031, "grad_norm": 43.53575897216797, "learning_rate": 6.963101805503646e-08, "logits/chosen": -0.9747976064682007, "logits/rejected": -0.9492688775062561, "logps/chosen": -667.987548828125, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -938.2244873046875, "loss": 1.1999, "margin_dpo/margin_mean": 258.5040283203125, "margin_dpo/margin_std": 543.6436767578125, "step": 533 }, { "KL/chosen_KL_mean": -645.685791015625, "KL/mean": -820.00927734375, "KL/rejected_KL_mean": -994.332763671875, "KL/std": 505.59075927734375, "epoch": 0.7841409691629956, "fcm_dpo/beta": 0.0011840970255434513, "fcm_dpo/delta": -0.014506392180919647, "fcm_dpo/margin": 348.64691162109375, "fcm_dpo/q_t": 0.4072118401527405, "grad_norm": 40.776493072509766, "learning_rate": 6.874476976660184e-08, "logits/chosen": -0.9978982210159302, "logits/rejected": -0.9965052008628845, "logps/chosen": -705.80517578125, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -1072.876220703125, "loss": 1.1036, "margin_dpo/margin_mean": 348.64691162109375, "margin_dpo/margin_std": 510.9076232910156, "step": 534 }, { "KL/chosen_KL_mean": -552.3275146484375, "KL/mean": -750.717529296875, "KL/rejected_KL_mean": -949.107666015625, "KL/std": 493.9169921875, "epoch": 0.7856093979441997, "fcm_dpo/beta": 0.0011865987908095121, "fcm_dpo/delta": -0.07477213442325592, "fcm_dpo/margin": 396.780029296875, "fcm_dpo/q_t": 0.39498764276504517, "grad_norm": 30.75901222229004, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.9087494611740112, "logits/rejected": -0.90961092710495, "logps/chosen": -606.6577758789062, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -1045.415283203125, "loss": 1.061, "margin_dpo/margin_mean": 396.780029296875, "margin_dpo/margin_std": 520.8341064453125, "step": 535 }, { "KL/chosen_KL_mean": -481.53271484375, "KL/mean": -724.086181640625, "KL/rejected_KL_mean": -966.6396484375, "KL/std": 552.1593627929688, "epoch": 0.7870778267254038, "fcm_dpo/beta": 0.0011459384113550186, "fcm_dpo/delta": -0.16502085328102112, "fcm_dpo/margin": 485.10699462890625, "fcm_dpo/q_t": 0.38111627101898193, "grad_norm": 33.95513916015625, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.8919925689697266, "logits/rejected": -0.9215620756149292, "logps/chosen": -528.61328125, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -1055.737548828125, "loss": 1.0176, "margin_dpo/margin_mean": 485.10693359375, "margin_dpo/margin_std": 655.8091430664062, "step": 536 }, { "KL/chosen_KL_mean": -525.6631469726562, "KL/mean": -683.4346923828125, "KL/rejected_KL_mean": -841.2061767578125, "KL/std": 446.1978454589844, "epoch": 0.788546255506608, "fcm_dpo/beta": 0.0011480746325105429, "fcm_dpo/delta": 0.038443662226200104, "fcm_dpo/margin": 315.5430603027344, "fcm_dpo/q_t": 0.4170358180999756, "grad_norm": 48.758907318115234, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.9522344470024109, "logits/rejected": -0.9362703561782837, "logps/chosen": -583.41064453125, "logps/ref_chosen": -57.747467041015625, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -911.6445922851562, "loss": 1.1505, "margin_dpo/margin_mean": 315.5430603027344, "margin_dpo/margin_std": 543.536376953125, "step": 537 }, { "KL/chosen_KL_mean": -643.0128784179688, "KL/mean": -829.154541015625, "KL/rejected_KL_mean": -1015.29638671875, "KL/std": 483.1163330078125, "epoch": 0.7900146842878121, "fcm_dpo/beta": 0.0011375262401998043, "fcm_dpo/delta": -0.02470758929848671, "fcm_dpo/margin": 372.283447265625, "fcm_dpo/q_t": 0.40403687953948975, "grad_norm": 32.13328170776367, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.9377896785736084, "logits/rejected": -0.9210348129272461, "logps/chosen": -709.4288330078125, "logps/ref_chosen": -66.41594696044922, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -1099.5244140625, "loss": 1.0819, "margin_dpo/margin_mean": 372.283447265625, "margin_dpo/margin_std": 503.72845458984375, "step": 538 }, { "KL/chosen_KL_mean": -555.8558349609375, "KL/mean": -729.8284912109375, "KL/rejected_KL_mean": -903.8011474609375, "KL/std": 412.5657653808594, "epoch": 0.7914831130690162, "fcm_dpo/beta": 0.00114156911149621, "fcm_dpo/delta": 0.002832382917404175, "fcm_dpo/margin": 347.9453430175781, "fcm_dpo/q_t": 0.40886110067367554, "grad_norm": 33.58725357055664, "learning_rate": 6.438563463416221e-08, "logits/chosen": -0.9776486158370972, "logits/rejected": -0.9680448174476624, "logps/chosen": -614.3486328125, "logps/ref_chosen": -58.492855072021484, "logps/ref_rejected": -91.85395050048828, "logps/rejected": -995.6550903320312, "loss": 1.0923, "margin_dpo/margin_mean": 347.9453430175781, "margin_dpo/margin_std": 457.59783935546875, "step": 539 }, { "KL/chosen_KL_mean": -555.8110961914062, "KL/mean": -786.366455078125, "KL/rejected_KL_mean": -1016.921875, "KL/std": 521.7627563476562, "epoch": 0.7929515418502202, "fcm_dpo/beta": 0.0011229969095438719, "fcm_dpo/delta": -0.12408408522605896, "fcm_dpo/margin": 461.1107177734375, "fcm_dpo/q_t": 0.38754981756210327, "grad_norm": 32.977603912353516, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.9218310713768005, "logits/rejected": -0.944137692451477, "logps/chosen": -619.2936401367188, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.42999267578125, "logps/rejected": -1133.351806640625, "loss": 1.0431, "margin_dpo/margin_mean": 461.1107177734375, "margin_dpo/margin_std": 613.2233276367188, "step": 540 }, { "KL/chosen_KL_mean": -667.7610473632812, "KL/mean": -789.4322509765625, "KL/rejected_KL_mean": -911.1033935546875, "KL/std": 467.2005920410156, "epoch": 0.7944199706314243, "fcm_dpo/beta": 0.0011111920466646552, "fcm_dpo/delta": -0.0019461165647953749, "fcm_dpo/margin": 243.34225463867188, "fcm_dpo/q_t": 0.43856528401374817, "grad_norm": 51.01187515258789, "learning_rate": 6.267605843546767e-08, "logits/chosen": -1.004181981086731, "logits/rejected": -0.992661714553833, "logps/chosen": -746.0413818359375, "logps/ref_chosen": -78.28036499023438, "logps/ref_rejected": -103.273681640625, "logps/rejected": -1014.3770751953125, "loss": 1.2244, "margin_dpo/margin_mean": 243.34228515625, "margin_dpo/margin_std": 547.9241943359375, "step": 541 }, { "KL/chosen_KL_mean": -576.544677734375, "KL/mean": -803.12255859375, "KL/rejected_KL_mean": -1029.7003173828125, "KL/std": 522.02685546875, "epoch": 0.7958883994126285, "fcm_dpo/beta": 0.001085467985831201, "fcm_dpo/delta": -0.09905168414115906, "fcm_dpo/margin": 453.1556091308594, "fcm_dpo/q_t": 0.39055657386779785, "grad_norm": 47.77986145019531, "learning_rate": 6.182866334636888e-08, "logits/chosen": -0.9915866255760193, "logits/rejected": -1.0235321521759033, "logps/chosen": -634.0296630859375, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -1126.1754150390625, "loss": 1.0578, "margin_dpo/margin_mean": 453.1556091308594, "margin_dpo/margin_std": 612.7730712890625, "step": 542 }, { "KL/chosen_KL_mean": -665.509521484375, "KL/mean": -829.4156494140625, "KL/rejected_KL_mean": -993.3218994140625, "KL/std": 642.0631103515625, "epoch": 0.7973568281938326, "fcm_dpo/beta": 0.001093997503630817, "fcm_dpo/delta": 0.042909275740385056, "fcm_dpo/margin": 327.8123474121094, "fcm_dpo/q_t": 0.4311785399913788, "grad_norm": 30.012170791625977, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.9615781307220459, "logits/rejected": -0.9914584159851074, "logps/chosen": -726.1270141601562, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -1098.9208984375, "loss": 1.1968, "margin_dpo/margin_mean": 327.8123474121094, "margin_dpo/margin_std": 748.0994262695312, "step": 543 }, { "KL/chosen_KL_mean": -667.9552001953125, "KL/mean": -856.1734008789062, "KL/rejected_KL_mean": -1044.3916015625, "KL/std": 508.0960693359375, "epoch": 0.7988252569750367, "fcm_dpo/beta": 0.0010954445460811257, "fcm_dpo/delta": -0.012936984188854694, "fcm_dpo/margin": 376.4364013671875, "fcm_dpo/q_t": 0.4067176282405853, "grad_norm": 32.4089469909668, "learning_rate": 6.01487708363232e-08, "logits/chosen": -0.9522734880447388, "logits/rejected": -0.9718469381332397, "logps/chosen": -727.5975341796875, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -1145.34619140625, "loss": 1.1045, "margin_dpo/margin_mean": 376.4364013671875, "margin_dpo/margin_std": 568.992919921875, "step": 544 }, { "KL/chosen_KL_mean": -619.7081298828125, "KL/mean": -844.5244140625, "KL/rejected_KL_mean": -1069.3408203125, "KL/std": 505.60400390625, "epoch": 0.8002936857562408, "fcm_dpo/beta": 0.0010812245309352875, "fcm_dpo/delta": -0.09056208282709122, "fcm_dpo/margin": 449.6327209472656, "fcm_dpo/q_t": 0.3911089301109314, "grad_norm": 32.737972259521484, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.9047988653182983, "logits/rejected": -0.938023567199707, "logps/chosen": -687.356689453125, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -1165.248779296875, "loss": 1.0441, "margin_dpo/margin_mean": 449.6326904296875, "margin_dpo/margin_std": 570.2744140625, "step": 545 }, { "KL/chosen_KL_mean": -595.77880859375, "KL/mean": -746.6336669921875, "KL/rejected_KL_mean": -897.488525390625, "KL/std": 452.455322265625, "epoch": 0.801762114537445, "fcm_dpo/beta": 0.0010882640490308404, "fcm_dpo/delta": 0.07409149408340454, "fcm_dpo/margin": 301.70965576171875, "fcm_dpo/q_t": 0.4228670001029968, "grad_norm": 35.74420928955078, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.9621337652206421, "logits/rejected": -0.9499717354774475, "logps/chosen": -646.5230712890625, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -979.354736328125, "loss": 1.1537, "margin_dpo/margin_mean": 301.70965576171875, "margin_dpo/margin_std": 491.32568359375, "step": 546 }, { "KL/chosen_KL_mean": -605.8523559570312, "KL/mean": -803.1954345703125, "KL/rejected_KL_mean": -1000.5384521484375, "KL/std": 501.6481628417969, "epoch": 0.8032305433186491, "fcm_dpo/beta": 0.0010885847732424736, "fcm_dpo/delta": -0.030972033739089966, "fcm_dpo/margin": 394.6861267089844, "fcm_dpo/q_t": 0.40285325050354004, "grad_norm": 43.03551483154297, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.9044293165206909, "logits/rejected": -0.9168886542320251, "logps/chosen": -679.5400390625, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -1091.2998046875, "loss": 1.0824, "margin_dpo/margin_mean": 394.68609619140625, "margin_dpo/margin_std": 543.5963745117188, "step": 547 }, { "KL/chosen_KL_mean": -626.2013549804688, "KL/mean": -800.2800903320312, "KL/rejected_KL_mean": -974.35888671875, "KL/std": 522.8885498046875, "epoch": 0.8046989720998532, "fcm_dpo/beta": 0.001088649732992053, "fcm_dpo/delta": 0.021810464560985565, "fcm_dpo/margin": 348.1575622558594, "fcm_dpo/q_t": 0.41777533292770386, "grad_norm": 30.352699279785156, "learning_rate": 5.684919345471029e-08, "logits/chosen": -0.9622275829315186, "logits/rejected": -0.9603374004364014, "logps/chosen": -691.4476928710938, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -1068.47705078125, "loss": 1.1264, "margin_dpo/margin_mean": 348.1575622558594, "margin_dpo/margin_std": 577.4847412109375, "step": 548 }, { "KL/chosen_KL_mean": -650.4908447265625, "KL/mean": -785.2673950195312, "KL/rejected_KL_mean": -920.0439453125, "KL/std": 439.36041259765625, "epoch": 0.8061674008810573, "fcm_dpo/beta": 0.0010918962070718408, "fcm_dpo/delta": 0.014238527044653893, "fcm_dpo/margin": 269.5531005859375, "fcm_dpo/q_t": 0.43372684717178345, "grad_norm": 48.91337203979492, "learning_rate": 5.603696935852426e-08, "logits/chosen": -0.9819549918174744, "logits/rejected": -0.9728246927261353, "logps/chosen": -699.7032470703125, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -993.9542846679688, "loss": 1.1922, "margin_dpo/margin_mean": 269.5531005859375, "margin_dpo/margin_std": 526.7017822265625, "step": 549 }, { "KL/chosen_KL_mean": -643.39013671875, "KL/mean": -804.6975708007812, "KL/rejected_KL_mean": -966.0050048828125, "KL/std": 482.32452392578125, "epoch": 0.8076358296622613, "fcm_dpo/beta": 0.0010998416692018509, "fcm_dpo/delta": 0.04686359316110611, "fcm_dpo/margin": 322.6148376464844, "fcm_dpo/q_t": 0.41901546716690063, "grad_norm": 45.43336868286133, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.9264281988143921, "logits/rejected": -0.9501577615737915, "logps/chosen": -700.1971435546875, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -1061.130859375, "loss": 1.1343, "margin_dpo/margin_mean": 322.6148376464844, "margin_dpo/margin_std": 508.62127685546875, "step": 550 }, { "KL/chosen_KL_mean": -564.6527099609375, "KL/mean": -833.1539306640625, "KL/rejected_KL_mean": -1101.6552734375, "KL/std": 532.0479736328125, "epoch": 0.8091042584434655, "fcm_dpo/beta": 0.0010715980315580964, "fcm_dpo/delta": -0.18658655881881714, "fcm_dpo/margin": 537.0025634765625, "fcm_dpo/q_t": 0.3684789538383484, "grad_norm": 60.08759689331055, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.9065227508544922, "logits/rejected": -0.9625868797302246, "logps/chosen": -623.759033203125, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -1213.3280029296875, "loss": 0.9628, "margin_dpo/margin_mean": 537.0025634765625, "margin_dpo/margin_std": 528.0545654296875, "step": 551 }, { "KL/chosen_KL_mean": -535.4678344726562, "KL/mean": -828.958984375, "KL/rejected_KL_mean": -1122.449951171875, "KL/std": 597.85693359375, "epoch": 0.8105726872246696, "fcm_dpo/beta": 0.0010204364079982042, "fcm_dpo/delta": -0.21537676453590393, "fcm_dpo/margin": 586.982177734375, "fcm_dpo/q_t": 0.3694334626197815, "grad_norm": 48.343841552734375, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.937026858329773, "logits/rejected": -0.9775291681289673, "logps/chosen": -597.8224487304688, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -1227.0120849609375, "loss": 0.9784, "margin_dpo/margin_mean": 586.982177734375, "margin_dpo/margin_std": 677.5487060546875, "step": 552 }, { "KL/chosen_KL_mean": -622.9216918945312, "KL/mean": -777.06005859375, "KL/rejected_KL_mean": -931.198486328125, "KL/std": 503.83489990234375, "epoch": 0.8120411160058737, "fcm_dpo/beta": 0.0010282043367624283, "fcm_dpo/delta": 0.08580633997917175, "fcm_dpo/margin": 308.27679443359375, "fcm_dpo/q_t": 0.43020299077033997, "grad_norm": 28.72612762451172, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.9406133890151978, "logits/rejected": -0.9367384910583496, "logps/chosen": -691.1805419921875, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -1029.295654296875, "loss": 1.168, "margin_dpo/margin_mean": 308.27679443359375, "margin_dpo/margin_std": 553.154541015625, "step": 553 }, { "KL/chosen_KL_mean": -654.8176879882812, "KL/mean": -860.4717407226562, "KL/rejected_KL_mean": -1066.125732421875, "KL/std": 554.7606811523438, "epoch": 0.8135095447870778, "fcm_dpo/beta": 0.00103902374394238, "fcm_dpo/delta": -0.029413558542728424, "fcm_dpo/margin": 411.3079833984375, "fcm_dpo/q_t": 0.4067898094654083, "grad_norm": 94.50716400146484, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.9185788035392761, "logits/rejected": -0.8953433036804199, "logps/chosen": -722.765380859375, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -1155.908447265625, "loss": 1.1295, "margin_dpo/margin_mean": 411.3079833984375, "margin_dpo/margin_std": 707.3028564453125, "step": 554 }, { "KL/chosen_KL_mean": -656.3858642578125, "KL/mean": -897.638916015625, "KL/rejected_KL_mean": -1138.89208984375, "KL/std": 570.92724609375, "epoch": 0.8149779735682819, "fcm_dpo/beta": 0.0010173844639211893, "fcm_dpo/delta": -0.09548080712556839, "fcm_dpo/margin": 482.5062255859375, "fcm_dpo/q_t": 0.39459365606307983, "grad_norm": 39.124813079833984, "learning_rate": 5.127169765359515e-08, "logits/chosen": -0.9555931687355042, "logits/rejected": -1.0082941055297852, "logps/chosen": -709.71630859375, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -1247.3714599609375, "loss": 1.0765, "margin_dpo/margin_mean": 482.5062255859375, "margin_dpo/margin_std": 734.37890625, "step": 555 }, { "KL/chosen_KL_mean": -621.6959228515625, "KL/mean": -768.7464599609375, "KL/rejected_KL_mean": -915.7969970703125, "KL/std": 436.3918762207031, "epoch": 0.8164464023494861, "fcm_dpo/beta": 0.0010262987343594432, "fcm_dpo/delta": 0.10118204355239868, "fcm_dpo/margin": 294.1011047363281, "fcm_dpo/q_t": 0.4302240312099457, "grad_norm": 35.36925506591797, "learning_rate": 5.049569317994012e-08, "logits/chosen": -0.9388109445571899, "logits/rejected": -0.9352363348007202, "logps/chosen": -680.3403930664062, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -1017.137451171875, "loss": 1.1551, "margin_dpo/margin_mean": 294.1011047363281, "margin_dpo/margin_std": 439.78289794921875, "step": 556 }, { "KL/chosen_KL_mean": -696.53369140625, "KL/mean": -917.8419799804688, "KL/rejected_KL_mean": -1139.150390625, "KL/std": 616.8480224609375, "epoch": 0.8179148311306902, "fcm_dpo/beta": 0.0010207702871412039, "fcm_dpo/delta": -0.05446251481771469, "fcm_dpo/margin": 442.61669921875, "fcm_dpo/q_t": 0.401253342628479, "grad_norm": 69.26335906982422, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -0.9779127836227417, "logits/rejected": -1.0124623775482178, "logps/chosen": -764.374267578125, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93965911865234, "logps/rejected": -1249.090087890625, "loss": 1.1, "margin_dpo/margin_mean": 442.61669921875, "margin_dpo/margin_std": 697.1099853515625, "step": 557 }, { "KL/chosen_KL_mean": -624.0965576171875, "KL/mean": -916.3228759765625, "KL/rejected_KL_mean": -1208.549072265625, "KL/std": 568.7822265625, "epoch": 0.8193832599118943, "fcm_dpo/beta": 0.0009917229181155562, "fcm_dpo/delta": -0.19064576923847198, "fcm_dpo/margin": 584.4525146484375, "fcm_dpo/q_t": 0.3662768006324768, "grad_norm": 35.57923126220703, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -1.0224618911743164, "logits/rejected": -1.0410199165344238, "logps/chosen": -686.46484375, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -1310.710205078125, "loss": 0.9776, "margin_dpo/margin_mean": 584.4525146484375, "margin_dpo/margin_std": 628.419677734375, "step": 558 }, { "KL/chosen_KL_mean": -702.362060546875, "KL/mean": -949.4503173828125, "KL/rejected_KL_mean": -1196.53857421875, "KL/std": 592.9994506835938, "epoch": 0.8208516886930984, "fcm_dpo/beta": 0.000971162342466414, "fcm_dpo/delta": -0.08387550711631775, "fcm_dpo/margin": 494.1765441894531, "fcm_dpo/q_t": 0.39221078157424927, "grad_norm": 28.913423538208008, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -1.1046611070632935, "logits/rejected": -1.1130573749542236, "logps/chosen": -763.1143798828125, "logps/ref_chosen": -60.752323150634766, "logps/ref_rejected": -93.44229125976562, "logps/rejected": -1289.9808349609375, "loss": 1.0485, "margin_dpo/margin_mean": 494.1765441894531, "margin_dpo/margin_std": 634.8758544921875, "step": 559 }, { "KL/chosen_KL_mean": -646.1812744140625, "KL/mean": -825.5607299804688, "KL/rejected_KL_mean": -1004.9402465820312, "KL/std": 523.7559204101562, "epoch": 0.8223201174743024, "fcm_dpo/beta": 0.0009687429992482066, "fcm_dpo/delta": 0.05413452535867691, "fcm_dpo/margin": 358.7589111328125, "fcm_dpo/q_t": 0.42102909088134766, "grad_norm": 29.267030715942383, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.9093506336212158, "logits/rejected": -0.8941901326179504, "logps/chosen": -704.2850952148438, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -1084.931396484375, "loss": 1.1422, "margin_dpo/margin_mean": 358.7589111328125, "margin_dpo/margin_std": 570.3694458007812, "step": 560 }, { "KL/chosen_KL_mean": -744.1492919921875, "KL/mean": -893.382568359375, "KL/rejected_KL_mean": -1042.61572265625, "KL/std": 518.4768676757812, "epoch": 0.8237885462555066, "fcm_dpo/beta": 0.0009945239871740341, "fcm_dpo/delta": 0.10583681613206863, "fcm_dpo/margin": 298.46636962890625, "fcm_dpo/q_t": 0.4313068687915802, "grad_norm": 46.64256286621094, "learning_rate": 4.669493178106432e-08, "logits/chosen": -1.0665897130966187, "logits/rejected": -1.0877900123596191, "logps/chosen": -795.062255859375, "logps/ref_chosen": -50.912879943847656, "logps/ref_rejected": -99.06856536865234, "logps/rejected": -1141.684326171875, "loss": 1.2066, "margin_dpo/margin_mean": 298.46636962890625, "margin_dpo/margin_std": 644.6820678710938, "step": 561 }, { "KL/chosen_KL_mean": -701.2473754882812, "KL/mean": -912.952392578125, "KL/rejected_KL_mean": -1124.657470703125, "KL/std": 575.4769287109375, "epoch": 0.8252569750367107, "fcm_dpo/beta": 0.0009885327890515327, "fcm_dpo/delta": -0.02009068801999092, "fcm_dpo/margin": 423.4101257324219, "fcm_dpo/q_t": 0.4067729115486145, "grad_norm": 40.539154052734375, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -1.0031108856201172, "logits/rejected": -1.0146968364715576, "logps/chosen": -760.7117919921875, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -1221.2001953125, "loss": 1.0985, "margin_dpo/margin_mean": 423.41015625, "margin_dpo/margin_std": 631.9222412109375, "step": 562 }, { "KL/chosen_KL_mean": -767.9593505859375, "KL/mean": -925.231689453125, "KL/rejected_KL_mean": -1082.504150390625, "KL/std": 606.1444091796875, "epoch": 0.8267254038179148, "fcm_dpo/beta": 0.0009880930883809924, "fcm_dpo/delta": -0.04907416179776192, "fcm_dpo/margin": 314.54473876953125, "fcm_dpo/q_t": 0.42376065254211426, "grad_norm": 46.25846481323242, "learning_rate": 4.521198892775202e-08, "logits/chosen": -0.9742862582206726, "logits/rejected": -0.9843175411224365, "logps/chosen": -828.5675659179688, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -1177.07177734375, "loss": 1.2224, "margin_dpo/margin_mean": 314.54473876953125, "margin_dpo/margin_std": 707.6341552734375, "step": 563 }, { "KL/chosen_KL_mean": -685.3291015625, "KL/mean": -886.4181518554688, "KL/rejected_KL_mean": -1087.50732421875, "KL/std": 537.7996826171875, "epoch": 0.8281938325991189, "fcm_dpo/beta": 0.0009863328887149692, "fcm_dpo/delta": 0.0033044088631868362, "fcm_dpo/margin": 402.17816162109375, "fcm_dpo/q_t": 0.40960630774497986, "grad_norm": 47.38982009887695, "learning_rate": 4.447860229910544e-08, "logits/chosen": -1.0377906560897827, "logits/rejected": -1.0290945768356323, "logps/chosen": -759.5974731445312, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.23818969726562, "logps/rejected": -1180.7454833984375, "loss": 1.091, "margin_dpo/margin_mean": 402.17816162109375, "margin_dpo/margin_std": 517.3626098632812, "step": 564 }, { "KL/chosen_KL_mean": -727.7404174804688, "KL/mean": -939.229736328125, "KL/rejected_KL_mean": -1150.718994140625, "KL/std": 615.11181640625, "epoch": 0.8296622613803231, "fcm_dpo/beta": 0.0009813096839934587, "fcm_dpo/delta": -0.015731915831565857, "fcm_dpo/margin": 422.97857666015625, "fcm_dpo/q_t": 0.41049522161483765, "grad_norm": 44.25437927246094, "learning_rate": 4.375063135042445e-08, "logits/chosen": -0.9896056652069092, "logits/rejected": -0.9939931035041809, "logps/chosen": -796.7603759765625, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -1236.4979248046875, "loss": 1.1303, "margin_dpo/margin_mean": 422.9785461425781, "margin_dpo/margin_std": 737.2132568359375, "step": 565 }, { "KL/chosen_KL_mean": -672.3824462890625, "KL/mean": -907.526611328125, "KL/rejected_KL_mean": -1142.6708984375, "KL/std": 631.5331420898438, "epoch": 0.8311306901615272, "fcm_dpo/beta": 0.0009774458594620228, "fcm_dpo/delta": -0.06289710104465485, "fcm_dpo/margin": 470.2884216308594, "fcm_dpo/q_t": 0.3984670639038086, "grad_norm": 35.89476013183594, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -0.9764306545257568, "logits/rejected": -1.0030491352081299, "logps/chosen": -738.927734375, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86932373046875, "logps/rejected": -1246.5401611328125, "loss": 1.0977, "margin_dpo/margin_mean": 470.28839111328125, "margin_dpo/margin_std": 727.6954345703125, "step": 566 }, { "KL/chosen_KL_mean": -654.3175048828125, "KL/mean": -835.689453125, "KL/rejected_KL_mean": -1017.0613403320312, "KL/std": 442.9603576660156, "epoch": 0.8325991189427313, "fcm_dpo/beta": 0.0009747430449351668, "fcm_dpo/delta": 0.04814485087990761, "fcm_dpo/margin": 362.7438659667969, "fcm_dpo/q_t": 0.41720670461654663, "grad_norm": 37.64247512817383, "learning_rate": 4.231101308059165e-08, "logits/chosen": -1.0903135538101196, "logits/rejected": -1.1013944149017334, "logps/chosen": -707.17578125, "logps/ref_chosen": -52.85829544067383, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -1102.432373046875, "loss": 1.1168, "margin_dpo/margin_mean": 362.7438659667969, "margin_dpo/margin_std": 488.75982666015625, "step": 567 }, { "KL/chosen_KL_mean": -640.649169921875, "KL/mean": -880.9572143554688, "KL/rejected_KL_mean": -1121.2652587890625, "KL/std": 509.034912109375, "epoch": 0.8340675477239354, "fcm_dpo/beta": 0.0009669936262071133, "fcm_dpo/delta": -0.06800977885723114, "fcm_dpo/margin": 480.6161804199219, "fcm_dpo/q_t": 0.39191970229148865, "grad_norm": 32.80691146850586, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -0.951264500617981, "logits/rejected": -0.9899559020996094, "logps/chosen": -685.841552734375, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236907958984, "logps/rejected": -1210.357666015625, "loss": 1.0349, "margin_dpo/margin_mean": 480.61614990234375, "margin_dpo/margin_std": 529.2095947265625, "step": 568 }, { "KL/chosen_KL_mean": -726.1346435546875, "KL/mean": -927.0540161132812, "KL/rejected_KL_mean": -1127.973388671875, "KL/std": 663.6150512695312, "epoch": 0.8355359765051396, "fcm_dpo/beta": 0.0009723026305437088, "fcm_dpo/delta": 0.008742645382881165, "fcm_dpo/margin": 401.8387451171875, "fcm_dpo/q_t": 0.4109431803226471, "grad_norm": 49.85871505737305, "learning_rate": 4.089328585837512e-08, "logits/chosen": -1.000624179840088, "logits/rejected": -1.0081329345703125, "logps/chosen": -789.855224609375, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -1207.07666015625, "loss": 1.142, "margin_dpo/margin_mean": 401.8387451171875, "margin_dpo/margin_std": 688.9107055664062, "step": 569 }, { "KL/chosen_KL_mean": -679.8856811523438, "KL/mean": -865.4696044921875, "KL/rejected_KL_mean": -1051.053466796875, "KL/std": 531.23974609375, "epoch": 0.8370044052863436, "fcm_dpo/beta": 0.000971082947216928, "fcm_dpo/delta": 0.04106369987130165, "fcm_dpo/margin": 371.16790771484375, "fcm_dpo/q_t": 0.41922780871391296, "grad_norm": 30.256921768188477, "learning_rate": 4.019267817841834e-08, "logits/chosen": -1.0755581855773926, "logits/rejected": -1.0673818588256836, "logps/chosen": -741.500244140625, "logps/ref_chosen": -61.61454391479492, "logps/ref_rejected": -82.14186096191406, "logps/rejected": -1133.1954345703125, "loss": 1.134, "margin_dpo/margin_mean": 371.16790771484375, "margin_dpo/margin_std": 586.2235717773438, "step": 570 }, { "KL/chosen_KL_mean": -704.7464599609375, "KL/mean": -927.1207275390625, "KL/rejected_KL_mean": -1149.4949951171875, "KL/std": 556.0410766601562, "epoch": 0.8384728340675477, "fcm_dpo/beta": 0.0009676171466708183, "fcm_dpo/delta": -0.031879834830760956, "fcm_dpo/margin": 444.74859619140625, "fcm_dpo/q_t": 0.4041333794593811, "grad_norm": 37.00971221923828, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -0.9698342680931091, "logits/rejected": -0.984066903591156, "logps/chosen": -757.800537109375, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -1240.831787109375, "loss": 1.103, "margin_dpo/margin_mean": 444.74853515625, "margin_dpo/margin_std": 689.9317016601562, "step": 571 }, { "KL/chosen_KL_mean": -760.4070434570312, "KL/mean": -986.117431640625, "KL/rejected_KL_mean": -1211.82763671875, "KL/std": 642.9850463867188, "epoch": 0.8399412628487518, "fcm_dpo/beta": 0.0009603890357539058, "fcm_dpo/delta": -0.03531990945339203, "fcm_dpo/margin": 451.42071533203125, "fcm_dpo/q_t": 0.40735888481140137, "grad_norm": 33.24798583984375, "learning_rate": 3.880806698864086e-08, "logits/chosen": -1.01137113571167, "logits/rejected": -1.0419948101043701, "logps/chosen": -808.8663330078125, "logps/ref_chosen": -48.45928955078125, "logps/ref_rejected": -83.55703735351562, "logps/rejected": -1295.384765625, "loss": 1.1207, "margin_dpo/margin_mean": 451.42071533203125, "margin_dpo/margin_std": 778.9022216796875, "step": 572 }, { "KL/chosen_KL_mean": -738.4405517578125, "KL/mean": -941.948486328125, "KL/rejected_KL_mean": -1145.4564208984375, "KL/std": 572.2998657226562, "epoch": 0.8414096916299559, "fcm_dpo/beta": 0.0009635947062633932, "fcm_dpo/delta": 0.008085294626653194, "fcm_dpo/margin": 407.01580810546875, "fcm_dpo/q_t": 0.4119398593902588, "grad_norm": 30.2346248626709, "learning_rate": 3.812409996461275e-08, "logits/chosen": -1.0823559761047363, "logits/rejected": -1.0929925441741943, "logps/chosen": -790.063232421875, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -1230.7813720703125, "loss": 1.0995, "margin_dpo/margin_mean": 407.01580810546875, "margin_dpo/margin_std": 571.386474609375, "step": 573 }, { "KL/chosen_KL_mean": -669.0106201171875, "KL/mean": -891.2628784179688, "KL/rejected_KL_mean": -1113.51513671875, "KL/std": 523.285888671875, "epoch": 0.8428781204111601, "fcm_dpo/beta": 0.0009609279222786427, "fcm_dpo/delta": -0.02838175743818283, "fcm_dpo/margin": 444.5045166015625, "fcm_dpo/q_t": 0.4030435085296631, "grad_norm": 39.73578643798828, "learning_rate": 3.74457160675965e-08, "logits/chosen": -1.0616734027862549, "logits/rejected": -1.087287187576294, "logps/chosen": -720.0551147460938, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -1206.321533203125, "loss": 1.0826, "margin_dpo/margin_mean": 444.5045166015625, "margin_dpo/margin_std": 604.4505004882812, "step": 574 }, { "KL/chosen_KL_mean": -715.7283325195312, "KL/mean": -918.717529296875, "KL/rejected_KL_mean": -1121.70654296875, "KL/std": 521.6762084960938, "epoch": 0.8443465491923642, "fcm_dpo/beta": 0.0009517880389466882, "fcm_dpo/delta": 0.01296766847372055, "fcm_dpo/margin": 405.9783020019531, "fcm_dpo/q_t": 0.41225284337997437, "grad_norm": 34.28059005737305, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.95893394947052, "logits/rejected": -0.9681143760681152, "logps/chosen": -787.5184326171875, "logps/ref_chosen": -71.7901382446289, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -1217.0927734375, "loss": 1.1349, "margin_dpo/margin_mean": 405.97833251953125, "margin_dpo/margin_std": 661.7513427734375, "step": 575 }, { "KL/chosen_KL_mean": -700.2211303710938, "KL/mean": -855.5504150390625, "KL/rejected_KL_mean": -1010.8797607421875, "KL/std": 483.954345703125, "epoch": 0.8458149779735683, "fcm_dpo/beta": 0.0009712062310427427, "fcm_dpo/delta": 0.10144974291324615, "fcm_dpo/margin": 310.6585998535156, "fcm_dpo/q_t": 0.4315390884876251, "grad_norm": 37.09640884399414, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -1.0683985948562622, "logits/rejected": -1.0978965759277344, "logps/chosen": -754.484130859375, "logps/ref_chosen": -54.262962341308594, "logps/ref_rejected": -100.75428009033203, "logps/rejected": -1111.634033203125, "loss": 1.1802, "margin_dpo/margin_mean": 310.6585998535156, "margin_dpo/margin_std": 565.0277709960938, "step": 576 }, { "KL/chosen_KL_mean": -617.461669921875, "KL/mean": -820.9848022460938, "KL/rejected_KL_mean": -1024.5079345703125, "KL/std": 550.1454467773438, "epoch": 0.8472834067547724, "fcm_dpo/beta": 0.0009785243310034275, "fcm_dpo/delta": 0.0017292937263846397, "fcm_dpo/margin": 407.0462341308594, "fcm_dpo/q_t": 0.4103270471096039, "grad_norm": 28.927133560180664, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -1.0405795574188232, "logits/rejected": -1.0324490070343018, "logps/chosen": -679.371337890625, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -1108.57861328125, "loss": 1.1141, "margin_dpo/margin_mean": 407.0462646484375, "margin_dpo/margin_std": 594.7308959960938, "step": 577 }, { "KL/chosen_KL_mean": -613.3778076171875, "KL/mean": -831.669189453125, "KL/rejected_KL_mean": -1049.9605712890625, "KL/std": 529.0792236328125, "epoch": 0.8487518355359766, "fcm_dpo/beta": 0.0009709839941933751, "fcm_dpo/delta": -0.02570383995771408, "fcm_dpo/margin": 436.5827331542969, "fcm_dpo/q_t": 0.4028467535972595, "grad_norm": 37.58174133300781, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.949306845664978, "logits/rejected": -0.9812426567077637, "logps/chosen": -662.6415405273438, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.4362564086914, "logps/rejected": -1133.396728515625, "loss": 1.073, "margin_dpo/margin_mean": 436.5827331542969, "margin_dpo/margin_std": 540.6303100585938, "step": 578 }, { "KL/chosen_KL_mean": -723.9249267578125, "KL/mean": -857.285888671875, "KL/rejected_KL_mean": -990.6468505859375, "KL/std": 545.2241821289062, "epoch": 0.8502202643171806, "fcm_dpo/beta": 0.0009836649987846613, "fcm_dpo/delta": 0.0445760153234005, "fcm_dpo/margin": 266.721923828125, "fcm_dpo/q_t": 0.43926477432250977, "grad_norm": 57.85899353027344, "learning_rate": 3.41381639738331e-08, "logits/chosen": -1.0406593084335327, "logits/rejected": -1.0456761121749878, "logps/chosen": -782.810791015625, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -1085.4345703125, "loss": 1.2246, "margin_dpo/margin_mean": 266.721923828125, "margin_dpo/margin_std": 599.7505493164062, "step": 579 }, { "KL/chosen_KL_mean": -523.7979125976562, "KL/mean": -777.8994140625, "KL/rejected_KL_mean": -1032.0008544921875, "KL/std": 608.378662109375, "epoch": 0.8516886930983847, "fcm_dpo/beta": 0.0009660617797635496, "fcm_dpo/delta": -0.09629727900028229, "fcm_dpo/margin": 508.2030029296875, "fcm_dpo/q_t": 0.39319556951522827, "grad_norm": 28.836139678955078, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.8956875205039978, "logits/rejected": -0.930424153804779, "logps/chosen": -572.5047607421875, "logps/ref_chosen": -48.70683670043945, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -1113.75927734375, "loss": 1.0505, "margin_dpo/margin_mean": 508.2029724121094, "margin_dpo/margin_std": 699.3994140625, "step": 580 }, { "KL/chosen_KL_mean": -719.4566650390625, "KL/mean": -892.114501953125, "KL/rejected_KL_mean": -1064.7723388671875, "KL/std": 567.6749877929688, "epoch": 0.8531571218795888, "fcm_dpo/beta": 0.0009752740152180195, "fcm_dpo/delta": 0.06534610688686371, "fcm_dpo/margin": 345.31573486328125, "fcm_dpo/q_t": 0.4253264367580414, "grad_norm": 42.403324127197266, "learning_rate": 3.285483927764726e-08, "logits/chosen": -1.105149269104004, "logits/rejected": -1.113175630569458, "logps/chosen": -781.6790161132812, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -1156.508056640625, "loss": 1.1609, "margin_dpo/margin_mean": 345.31573486328125, "margin_dpo/margin_std": 624.8870849609375, "step": 581 }, { "KL/chosen_KL_mean": -628.026611328125, "KL/mean": -828.7762451171875, "KL/rejected_KL_mean": -1029.52587890625, "KL/std": 499.4111022949219, "epoch": 0.8546255506607929, "fcm_dpo/beta": 0.000987016363069415, "fcm_dpo/delta": 0.00210629403591156, "fcm_dpo/margin": 401.499267578125, "fcm_dpo/q_t": 0.4085530638694763, "grad_norm": 30.787132263183594, "learning_rate": 3.222175147833556e-08, "logits/chosen": -1.0466606616973877, "logits/rejected": -1.0699677467346191, "logps/chosen": -686.2553100585938, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -1139.595458984375, "loss": 1.1048, "margin_dpo/margin_mean": 401.499267578125, "margin_dpo/margin_std": 544.4295654296875, "step": 582 }, { "KL/chosen_KL_mean": -720.7072143554688, "KL/mean": -845.5337524414062, "KL/rejected_KL_mean": -970.3602905273438, "KL/std": 540.9931640625, "epoch": 0.856093979441997, "fcm_dpo/beta": 0.0009754466009326279, "fcm_dpo/delta": -0.013105042278766632, "fcm_dpo/margin": 249.6529998779297, "fcm_dpo/q_t": 0.44551074504852295, "grad_norm": 57.56175231933594, "learning_rate": 3.159440233840763e-08, "logits/chosen": -0.9852885007858276, "logits/rejected": -0.9814597368240356, "logps/chosen": -777.570068359375, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -1058.76416015625, "loss": 1.2576, "margin_dpo/margin_mean": 249.65298461914062, "margin_dpo/margin_std": 645.1383056640625, "step": 583 }, { "KL/chosen_KL_mean": -624.9401245117188, "KL/mean": -870.3372802734375, "KL/rejected_KL_mean": -1115.734619140625, "KL/std": 560.4860229492188, "epoch": 0.8575624082232012, "fcm_dpo/beta": 0.0009648328414186835, "fcm_dpo/delta": -0.07715471088886261, "fcm_dpo/margin": 490.79443359375, "fcm_dpo/q_t": 0.3933956027030945, "grad_norm": 40.76878356933594, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -1.0293495655059814, "logits/rejected": -1.0419096946716309, "logps/chosen": -681.8408203125, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -1213.37060546875, "loss": 1.0435, "margin_dpo/margin_mean": 490.79443359375, "margin_dpo/margin_std": 595.7258911132812, "step": 584 }, { "KL/chosen_KL_mean": -713.83837890625, "KL/mean": -934.8936157226562, "KL/rejected_KL_mean": -1155.948974609375, "KL/std": 615.5269775390625, "epoch": 0.8590308370044053, "fcm_dpo/beta": 0.0009544256026856601, "fcm_dpo/delta": -0.023169085383415222, "fcm_dpo/margin": 442.110595703125, "fcm_dpo/q_t": 0.40594780445098877, "grad_norm": 35.20669937133789, "learning_rate": 3.035698600998121e-08, "logits/chosen": -1.0367913246154785, "logits/rejected": -1.058849811553955, "logps/chosen": -774.8123168945312, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -1240.118408203125, "loss": 1.1181, "margin_dpo/margin_mean": 442.110595703125, "margin_dpo/margin_std": 730.5751953125, "step": 585 }, { "KL/chosen_KL_mean": -746.9095458984375, "KL/mean": -891.6566162109375, "KL/rejected_KL_mean": -1036.403564453125, "KL/std": 537.6516723632812, "epoch": 0.8604992657856094, "fcm_dpo/beta": 0.0009719742811284959, "fcm_dpo/delta": 0.12220651656389236, "fcm_dpo/margin": 289.494140625, "fcm_dpo/q_t": 0.4358428120613098, "grad_norm": 31.211702346801758, "learning_rate": 2.974695142855388e-08, "logits/chosen": -1.0327489376068115, "logits/rejected": -1.0522578954696655, "logps/chosen": -803.76513671875, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.80261993408203, "logps/rejected": -1128.206298828125, "loss": 1.2039, "margin_dpo/margin_mean": 289.494140625, "margin_dpo/margin_std": 594.351806640625, "step": 586 }, { "KL/chosen_KL_mean": -506.798583984375, "KL/mean": -714.7243041992188, "KL/rejected_KL_mean": -922.6500244140625, "KL/std": 565.3491821289062, "epoch": 0.8619676945668135, "fcm_dpo/beta": 0.0009805042063817382, "fcm_dpo/delta": -0.008139118552207947, "fcm_dpo/margin": 415.85150146484375, "fcm_dpo/q_t": 0.40643125772476196, "grad_norm": 38.273529052734375, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -1.03069269657135, "logits/rejected": -1.058362603187561, "logps/chosen": -551.4901733398438, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -1005.27392578125, "loss": 1.0918, "margin_dpo/margin_mean": 415.85150146484375, "margin_dpo/margin_std": 569.570068359375, "step": 587 }, { "KL/chosen_KL_mean": -664.525390625, "KL/mean": -837.8515014648438, "KL/rejected_KL_mean": -1011.1776123046875, "KL/std": 480.2796630859375, "epoch": 0.8634361233480177, "fcm_dpo/beta": 0.000982759054750204, "fcm_dpo/delta": 0.061286523938179016, "fcm_dpo/margin": 346.6522216796875, "fcm_dpo/q_t": 0.4209359884262085, "grad_norm": 31.023210525512695, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -1.0278799533843994, "logits/rejected": -1.05256187915802, "logps/chosen": -714.8203735351562, "logps/ref_chosen": -50.29494857788086, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -1118.5474853515625, "loss": 1.1364, "margin_dpo/margin_mean": 346.6522521972656, "margin_dpo/margin_std": 510.10968017578125, "step": 588 }, { "KL/chosen_KL_mean": -673.4818115234375, "KL/mean": -904.097900390625, "KL/rejected_KL_mean": -1134.7139892578125, "KL/std": 546.526611328125, "epoch": 0.8649045521292217, "fcm_dpo/beta": 0.0009825675515457988, "fcm_dpo/delta": -0.05569233000278473, "fcm_dpo/margin": 461.232177734375, "fcm_dpo/q_t": 0.3966492712497711, "grad_norm": 30.869823455810547, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -0.9944198131561279, "logits/rejected": -1.005649447441101, "logps/chosen": -733.4117431640625, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -1246.369384765625, "loss": 1.0576, "margin_dpo/margin_mean": 461.2321472167969, "margin_dpo/margin_std": 579.5751953125, "step": 589 }, { "KL/chosen_KL_mean": -577.0634765625, "KL/mean": -803.4448852539062, "KL/rejected_KL_mean": -1029.8262939453125, "KL/std": 528.4348754882812, "epoch": 0.8663729809104258, "fcm_dpo/beta": 0.0009721757378429174, "fcm_dpo/delta": -0.042073942720890045, "fcm_dpo/margin": 452.7629089355469, "fcm_dpo/q_t": 0.40020644664764404, "grad_norm": 33.35025405883789, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.9723612070083618, "logits/rejected": -1.0005714893341064, "logps/chosen": -632.8732299804688, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -1135.88916015625, "loss": 1.0653, "margin_dpo/margin_mean": 452.762939453125, "margin_dpo/margin_std": 576.941162109375, "step": 590 }, { "KL/chosen_KL_mean": -649.7230224609375, "KL/mean": -853.7227783203125, "KL/rejected_KL_mean": -1057.722412109375, "KL/std": 513.365478515625, "epoch": 0.8678414096916299, "fcm_dpo/beta": 0.0009712845785543323, "fcm_dpo/delta": 0.003869034815579653, "fcm_dpo/margin": 407.99945068359375, "fcm_dpo/q_t": 0.40842798352241516, "grad_norm": 34.23089599609375, "learning_rate": 2.678415274939408e-08, "logits/chosen": -1.0266298055648804, "logits/rejected": -1.0202120542526245, "logps/chosen": -705.963623046875, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -1141.5087890625, "loss": 1.1021, "margin_dpo/margin_mean": 407.9994812011719, "margin_dpo/margin_std": 576.1143798828125, "step": 591 }, { "KL/chosen_KL_mean": -680.3071899414062, "KL/mean": -866.3721923828125, "KL/rejected_KL_mean": -1052.4371337890625, "KL/std": 528.517822265625, "epoch": 0.869309838472834, "fcm_dpo/beta": 0.0009767988231033087, "fcm_dpo/delta": 0.0378945954144001, "fcm_dpo/margin": 372.12994384765625, "fcm_dpo/q_t": 0.4183656573295593, "grad_norm": 36.562538146972656, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -1.0103018283843994, "logits/rejected": -1.0133998394012451, "logps/chosen": -728.2474365234375, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -1128.169921875, "loss": 1.1739, "margin_dpo/margin_mean": 372.1299133300781, "margin_dpo/margin_std": 716.3402099609375, "step": 592 }, { "KL/chosen_KL_mean": -643.9327392578125, "KL/mean": -789.9352416992188, "KL/rejected_KL_mean": -935.937744140625, "KL/std": 571.6775512695312, "epoch": 0.8707782672540382, "fcm_dpo/beta": 0.000984064768999815, "fcm_dpo/delta": 0.02495434135198593, "fcm_dpo/margin": 292.0050048828125, "fcm_dpo/q_t": 0.4358568787574768, "grad_norm": 52.18275451660156, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.895799994468689, "logits/rejected": -0.8876699209213257, "logps/chosen": -692.62353515625, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800094604492, "logps/rejected": -996.845703125, "loss": 1.2122, "margin_dpo/margin_mean": 292.0050048828125, "margin_dpo/margin_std": 635.2003173828125, "step": 593 }, { "KL/chosen_KL_mean": -632.062255859375, "KL/mean": -817.6357421875, "KL/rejected_KL_mean": -1003.2091064453125, "KL/std": 560.24951171875, "epoch": 0.8722466960352423, "fcm_dpo/beta": 0.000995452981442213, "fcm_dpo/delta": 0.030517850071191788, "fcm_dpo/margin": 371.14691162109375, "fcm_dpo/q_t": 0.4163949191570282, "grad_norm": 37.190494537353516, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -0.9844435453414917, "logits/rejected": -0.9757124185562134, "logps/chosen": -686.9970703125, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967803955078, "logps/rejected": -1089.308837890625, "loss": 1.1384, "margin_dpo/margin_mean": 371.1469421386719, "margin_dpo/margin_std": 597.141845703125, "step": 594 }, { "KL/chosen_KL_mean": -576.7757568359375, "KL/mean": -783.5555419921875, "KL/rejected_KL_mean": -990.33544921875, "KL/std": 514.779541015625, "epoch": 0.8737151248164464, "fcm_dpo/beta": 0.0009872771333903074, "fcm_dpo/delta": -0.008713661693036556, "fcm_dpo/margin": 413.5597229003906, "fcm_dpo/q_t": 0.4093300700187683, "grad_norm": 42.089027404785156, "learning_rate": 2.451969280180849e-08, "logits/chosen": -0.9605817794799805, "logits/rejected": -0.979555606842041, "logps/chosen": -626.1961669921875, "logps/ref_chosen": -49.4204216003418, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -1070.962646484375, "loss": 1.0916, "margin_dpo/margin_mean": 413.5596923828125, "margin_dpo/margin_std": 573.5089721679688, "step": 595 }, { "KL/chosen_KL_mean": -680.4862060546875, "KL/mean": -826.2808837890625, "KL/rejected_KL_mean": -972.0756225585938, "KL/std": 520.2750244140625, "epoch": 0.8751835535976505, "fcm_dpo/beta": 0.001006106031127274, "fcm_dpo/delta": 0.10965707898139954, "fcm_dpo/margin": 291.58941650390625, "fcm_dpo/q_t": 0.4358452558517456, "grad_norm": 58.43737030029297, "learning_rate": 2.396839494982103e-08, "logits/chosen": -0.995140552520752, "logits/rejected": -0.9648805856704712, "logps/chosen": -740.27783203125, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -1052.166748046875, "loss": 1.2046, "margin_dpo/margin_mean": 291.58941650390625, "margin_dpo/margin_std": 623.7481689453125, "step": 596 }, { "KL/chosen_KL_mean": -646.8114624023438, "KL/mean": -897.362060546875, "KL/rejected_KL_mean": -1147.91259765625, "KL/std": 604.0191650390625, "epoch": 0.8766519823788547, "fcm_dpo/beta": 0.0009849161142483354, "fcm_dpo/delta": -0.10163434594869614, "fcm_dpo/margin": 501.10113525390625, "fcm_dpo/q_t": 0.3910897970199585, "grad_norm": 27.942764282226562, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.9165897369384766, "logits/rejected": -0.9646108746528625, "logps/chosen": -704.072265625, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -1248.6063232421875, "loss": 1.057, "margin_dpo/margin_mean": 501.1011047363281, "margin_dpo/margin_std": 665.6370849609375, "step": 597 }, { "KL/chosen_KL_mean": -640.52880859375, "KL/mean": -837.5374755859375, "KL/rejected_KL_mean": -1034.5460205078125, "KL/std": 517.078857421875, "epoch": 0.8781204111600588, "fcm_dpo/beta": 0.0009835727978497744, "fcm_dpo/delta": 0.011667370796203613, "fcm_dpo/margin": 394.01727294921875, "fcm_dpo/q_t": 0.4106915593147278, "grad_norm": 44.37862777709961, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -1.0113518238067627, "logits/rejected": -1.023685336112976, "logps/chosen": -693.0473022460938, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -1123.989990234375, "loss": 1.1145, "margin_dpo/margin_mean": 394.01727294921875, "margin_dpo/margin_std": 565.1383666992188, "step": 598 }, { "KL/chosen_KL_mean": -643.5155029296875, "KL/mean": -810.739990234375, "KL/rejected_KL_mean": -977.9644775390625, "KL/std": 501.908935546875, "epoch": 0.8795888399412628, "fcm_dpo/beta": 0.0009813719661906362, "fcm_dpo/delta": -0.055194415152072906, "fcm_dpo/margin": 334.44903564453125, "fcm_dpo/q_t": 0.4202921986579895, "grad_norm": 33.33484649658203, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -0.9392881989479065, "logits/rejected": -0.9477603435516357, "logps/chosen": -693.3181762695312, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -1060.943115234375, "loss": 1.1434, "margin_dpo/margin_mean": 334.448974609375, "margin_dpo/margin_std": 476.6112060546875, "step": 599 }, { "KL/chosen_KL_mean": -708.796142578125, "KL/mean": -865.4288330078125, "KL/rejected_KL_mean": -1022.0615844726562, "KL/std": 508.1895446777344, "epoch": 0.8810572687224669, "fcm_dpo/beta": 0.0009930970845744014, "fcm_dpo/delta": 0.09185181558132172, "fcm_dpo/margin": 313.265380859375, "fcm_dpo/q_t": 0.4303100109100342, "grad_norm": 31.344772338867188, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -1.0468547344207764, "logits/rejected": -1.0439157485961914, "logps/chosen": -775.2310791015625, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -1107.51806640625, "loss": 1.1858, "margin_dpo/margin_mean": 313.265380859375, "margin_dpo/margin_std": 613.1177978515625, "step": 600 }, { "KL/chosen_KL_mean": -709.7387084960938, "KL/mean": -912.3361206054688, "KL/rejected_KL_mean": -1114.933349609375, "KL/std": 543.34033203125, "epoch": 0.882525697503671, "fcm_dpo/beta": 0.0009970087558031082, "fcm_dpo/delta": -0.004234878346323967, "fcm_dpo/margin": 405.19476318359375, "fcm_dpo/q_t": 0.4060080647468567, "grad_norm": 33.107521057128906, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -0.9763351678848267, "logits/rejected": -0.9957572221755981, "logps/chosen": -768.872314453125, "logps/ref_chosen": -59.13361358642578, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -1209.6243896484375, "loss": 1.0885, "margin_dpo/margin_mean": 405.19476318359375, "margin_dpo/margin_std": 531.271728515625, "step": 601 }, { "KL/chosen_KL_mean": -501.87945556640625, "KL/mean": -752.0775146484375, "KL/rejected_KL_mean": -1002.2755126953125, "KL/std": 506.71160888671875, "epoch": 0.8839941262848752, "fcm_dpo/beta": 0.0009898185962811112, "fcm_dpo/delta": -0.10039174556732178, "fcm_dpo/margin": 500.39605712890625, "fcm_dpo/q_t": 0.38656604290008545, "grad_norm": 53.6025276184082, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -1.0038371086120605, "logits/rejected": -1.0388686656951904, "logps/chosen": -550.4730224609375, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -1089.944091796875, "loss": 1.0169, "margin_dpo/margin_mean": 500.39605712890625, "margin_dpo/margin_std": 533.11669921875, "step": 602 }, { "KL/chosen_KL_mean": -630.3939819335938, "KL/mean": -857.3939208984375, "KL/rejected_KL_mean": -1084.393798828125, "KL/std": 564.5196533203125, "epoch": 0.8854625550660793, "fcm_dpo/beta": 0.0009718415094539523, "fcm_dpo/delta": -0.04316433519124985, "fcm_dpo/margin": 453.99981689453125, "fcm_dpo/q_t": 0.40223926305770874, "grad_norm": 34.395442962646484, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -1.011382818222046, "logits/rejected": -1.0053396224975586, "logps/chosen": -700.80859375, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32559967041016, "logps/rejected": -1184.719482421875, "loss": 1.0851, "margin_dpo/margin_mean": 453.99981689453125, "margin_dpo/margin_std": 659.65673828125, "step": 603 }, { "KL/chosen_KL_mean": -605.8519897460938, "KL/mean": -852.4639892578125, "KL/rejected_KL_mean": -1099.075927734375, "KL/std": 546.70166015625, "epoch": 0.8869309838472834, "fcm_dpo/beta": 0.0009594437433406711, "fcm_dpo/delta": -0.07685627043247223, "fcm_dpo/margin": 493.223876953125, "fcm_dpo/q_t": 0.3948679566383362, "grad_norm": 32.600433349609375, "learning_rate": 1.977362051376158e-08, "logits/chosen": -1.0046117305755615, "logits/rejected": -1.0419707298278809, "logps/chosen": -652.31005859375, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -1190.930419921875, "loss": 1.0625, "margin_dpo/margin_mean": 493.223876953125, "margin_dpo/margin_std": 673.2548828125, "step": 604 }, { "KL/chosen_KL_mean": -636.7569580078125, "KL/mean": -819.03857421875, "KL/rejected_KL_mean": -1001.3201904296875, "KL/std": 516.325927734375, "epoch": 0.8883994126284875, "fcm_dpo/beta": 0.0009607453248463571, "fcm_dpo/delta": 0.051560450345277786, "fcm_dpo/margin": 364.5632019042969, "fcm_dpo/q_t": 0.42265427112579346, "grad_norm": 36.99518966674805, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -0.9250746965408325, "logits/rejected": -0.9353400468826294, "logps/chosen": -703.0062866210938, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -1103.625244140625, "loss": 1.1441, "margin_dpo/margin_mean": 364.563232421875, "margin_dpo/margin_std": 616.972412109375, "step": 605 }, { "KL/chosen_KL_mean": -655.916015625, "KL/mean": -859.0430908203125, "KL/rejected_KL_mean": -1062.170166015625, "KL/std": 534.576171875, "epoch": 0.8898678414096917, "fcm_dpo/beta": 0.0009698671055957675, "fcm_dpo/delta": 0.005952846258878708, "fcm_dpo/margin": 406.254150390625, "fcm_dpo/q_t": 0.41086679697036743, "grad_norm": 30.389057159423828, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -0.9856992363929749, "logits/rejected": -1.002555012702942, "logps/chosen": -710.735107421875, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37146759033203, "logps/rejected": -1160.5416259765625, "loss": 1.1022, "margin_dpo/margin_mean": 406.254150390625, "margin_dpo/margin_std": 574.14404296875, "step": 606 }, { "KL/chosen_KL_mean": -664.7274780273438, "KL/mean": -838.5308837890625, "KL/rejected_KL_mean": -1012.334228515625, "KL/std": 534.4456176757812, "epoch": 0.8913362701908958, "fcm_dpo/beta": 0.0009783967398107052, "fcm_dpo/delta": 0.06167557090520859, "fcm_dpo/margin": 347.606689453125, "fcm_dpo/q_t": 0.42285820841789246, "grad_norm": 26.738142013549805, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -0.9779009222984314, "logits/rejected": -0.9702655076980591, "logps/chosen": -722.8115234375, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -1092.111328125, "loss": 1.1479, "margin_dpo/margin_mean": 347.606689453125, "margin_dpo/margin_std": 570.4724731445312, "step": 607 }, { "KL/chosen_KL_mean": -588.347900390625, "KL/mean": -785.5659790039062, "KL/rejected_KL_mean": -982.7840576171875, "KL/std": 475.997314453125, "epoch": 0.8928046989720999, "fcm_dpo/beta": 0.0009828273905441165, "fcm_dpo/delta": 0.012696724385023117, "fcm_dpo/margin": 394.4361267089844, "fcm_dpo/q_t": 0.40994399785995483, "grad_norm": 33.70753479003906, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -1.0321998596191406, "logits/rejected": -1.0463464260101318, "logps/chosen": -645.7987060546875, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -1077.557373046875, "loss": 1.0892, "margin_dpo/margin_mean": 394.4361572265625, "margin_dpo/margin_std": 485.2535705566406, "step": 608 }, { "KL/chosen_KL_mean": -616.16552734375, "KL/mean": -871.0028076171875, "KL/rejected_KL_mean": -1125.840087890625, "KL/std": 653.2919311523438, "epoch": 0.8942731277533039, "fcm_dpo/beta": 0.0009676434565335512, "fcm_dpo/delta": -0.09796243906021118, "fcm_dpo/margin": 509.67462158203125, "fcm_dpo/q_t": 0.3957828879356384, "grad_norm": 30.651371002197266, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -1.0793794393539429, "logits/rejected": -1.107104778289795, "logps/chosen": -674.9708862304688, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -1214.6561279296875, "loss": 1.065, "margin_dpo/margin_mean": 509.67462158203125, "margin_dpo/margin_std": 778.646728515625, "step": 609 }, { "KL/chosen_KL_mean": -621.6527099609375, "KL/mean": -778.48388671875, "KL/rejected_KL_mean": -935.3150634765625, "KL/std": 504.77032470703125, "epoch": 0.895741556534508, "fcm_dpo/beta": 0.0009778111707419157, "fcm_dpo/delta": 0.09600942581892014, "fcm_dpo/margin": 313.6623229980469, "fcm_dpo/q_t": 0.4288497567176819, "grad_norm": 39.5369987487793, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.9661835432052612, "logits/rejected": -0.9449666738510132, "logps/chosen": -687.3477783203125, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.40538787841797, "logps/rejected": -1018.720458984375, "loss": 1.18, "margin_dpo/margin_mean": 313.662353515625, "margin_dpo/margin_std": 588.4511108398438, "step": 610 }, { "KL/chosen_KL_mean": -647.6074829101562, "KL/mean": -917.2225952148438, "KL/rejected_KL_mean": -1186.837646484375, "KL/std": 658.553466796875, "epoch": 0.8972099853157122, "fcm_dpo/beta": 0.0009621235076338053, "fcm_dpo/delta": -0.12539134919643402, "fcm_dpo/margin": 539.230224609375, "fcm_dpo/q_t": 0.3878698945045471, "grad_norm": 32.128814697265625, "learning_rate": 1.6421423736208e-08, "logits/chosen": -1.0239993333816528, "logits/rejected": -1.0698425769805908, "logps/chosen": -700.2069091796875, "logps/ref_chosen": -52.59946823120117, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -1273.168701171875, "loss": 1.0452, "margin_dpo/margin_mean": 539.230224609375, "margin_dpo/margin_std": 746.280029296875, "step": 611 }, { "KL/chosen_KL_mean": -688.6466674804688, "KL/mean": -894.856201171875, "KL/rejected_KL_mean": -1101.065673828125, "KL/std": 530.3755493164062, "epoch": 0.8986784140969163, "fcm_dpo/beta": 0.0009597926400601864, "fcm_dpo/delta": 0.004164084792137146, "fcm_dpo/margin": 412.4189453125, "fcm_dpo/q_t": 0.40916839241981506, "grad_norm": 29.971281051635742, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -1.008693814277649, "logits/rejected": -1.013035535812378, "logps/chosen": -747.9703979492188, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -1189.3780517578125, "loss": 1.0975, "margin_dpo/margin_mean": 412.4189758300781, "margin_dpo/margin_std": 563.840087890625, "step": 612 }, { "KL/chosen_KL_mean": -607.6298217773438, "KL/mean": -856.0455932617188, "KL/rejected_KL_mean": -1104.4613037109375, "KL/std": 585.9758911132812, "epoch": 0.9001468428781204, "fcm_dpo/beta": 0.0009444322204217315, "fcm_dpo/delta": -0.07297656685113907, "fcm_dpo/margin": 496.83154296875, "fcm_dpo/q_t": 0.3927931785583496, "grad_norm": 32.904354095458984, "learning_rate": 1.551886292185553e-08, "logits/chosen": -0.9863793849945068, "logits/rejected": -1.0393249988555908, "logps/chosen": -667.3597412109375, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10752868652344, "logps/rejected": -1209.56884765625, "loss": 1.0444, "margin_dpo/margin_mean": 496.83154296875, "margin_dpo/margin_std": 600.5185546875, "step": 613 }, { "KL/chosen_KL_mean": -677.3827514648438, "KL/mean": -920.0013427734375, "KL/rejected_KL_mean": -1162.619873046875, "KL/std": 576.0763549804688, "epoch": 0.9016152716593245, "fcm_dpo/beta": 0.000936733849812299, "fcm_dpo/delta": -0.05709536373615265, "fcm_dpo/margin": 485.2372131347656, "fcm_dpo/q_t": 0.3984524607658386, "grad_norm": 37.47915267944336, "learning_rate": 1.507684480352292e-08, "logits/chosen": -0.9485939741134644, "logits/rejected": -1.0217413902282715, "logps/chosen": -730.3217163085938, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -1267.29931640625, "loss": 1.0746, "margin_dpo/margin_mean": 485.2372131347656, "margin_dpo/margin_std": 681.718505859375, "step": 614 }, { "KL/chosen_KL_mean": -648.6558837890625, "KL/mean": -851.766845703125, "KL/rejected_KL_mean": -1054.877685546875, "KL/std": 605.865234375, "epoch": 0.9030837004405287, "fcm_dpo/beta": 0.0009383243741467595, "fcm_dpo/delta": 0.01932334341108799, "fcm_dpo/margin": 406.2218017578125, "fcm_dpo/q_t": 0.4133886396884918, "grad_norm": 23.292583465576172, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -0.9777463674545288, "logits/rejected": -0.999763548374176, "logps/chosen": -714.4732055664062, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -1150.05517578125, "loss": 1.1261, "margin_dpo/margin_mean": 406.22174072265625, "margin_dpo/margin_std": 651.3701171875, "step": 615 }, { "KL/chosen_KL_mean": -753.4498901367188, "KL/mean": -922.0699462890625, "KL/rejected_KL_mean": -1090.6898193359375, "KL/std": 498.5460205078125, "epoch": 0.9045521292217328, "fcm_dpo/beta": 0.0009485027985647321, "fcm_dpo/delta": 0.0827227309346199, "fcm_dpo/margin": 337.23992919921875, "fcm_dpo/q_t": 0.4269304871559143, "grad_norm": 29.635278701782227, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -1.066072702407837, "logits/rejected": -1.0544729232788086, "logps/chosen": -818.582763671875, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -1165.390380859375, "loss": 1.1591, "margin_dpo/margin_mean": 337.2399597167969, "margin_dpo/margin_std": 560.1079711914062, "step": 616 }, { "KL/chosen_KL_mean": -710.6087646484375, "KL/mean": -830.27587890625, "KL/rejected_KL_mean": -949.943115234375, "KL/std": 471.6356506347656, "epoch": 0.9060205580029369, "fcm_dpo/beta": 0.0009591138223186135, "fcm_dpo/delta": 0.0745362788438797, "fcm_dpo/margin": 239.3343048095703, "fcm_dpo/q_t": 0.4473581910133362, "grad_norm": 49.84982681274414, "learning_rate": 1.378797888467345e-08, "logits/chosen": -0.9499881267547607, "logits/rejected": -0.9198344945907593, "logps/chosen": -773.6142578125, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -1014.1771850585938, "loss": 1.2347, "margin_dpo/margin_mean": 239.3343048095703, "margin_dpo/margin_std": 541.6489868164062, "step": 617 }, { "KL/chosen_KL_mean": -755.9691162109375, "KL/mean": -1014.7872314453125, "KL/rejected_KL_mean": -1273.605224609375, "KL/std": 649.2423095703125, "epoch": 0.9074889867841409, "fcm_dpo/beta": 0.0009580876212567091, "fcm_dpo/delta": -0.1011531874537468, "fcm_dpo/margin": 517.6361694335938, "fcm_dpo/q_t": 0.39342206716537476, "grad_norm": 40.80027389526367, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -1.010411024093628, "logits/rejected": -1.0457146167755127, "logps/chosen": -823.0704345703125, "logps/ref_chosen": -67.10134887695312, "logps/ref_rejected": -92.15340423583984, "logps/rejected": -1365.7586669921875, "loss": 1.0908, "margin_dpo/margin_mean": 517.6361694335938, "margin_dpo/margin_std": 831.257080078125, "step": 618 }, { "KL/chosen_KL_mean": -744.197021484375, "KL/mean": -942.0189819335938, "KL/rejected_KL_mean": -1139.841064453125, "KL/std": 602.969970703125, "epoch": 0.908957415565345, "fcm_dpo/beta": 0.0009506435599178076, "fcm_dpo/delta": 0.024809934198856354, "fcm_dpo/margin": 395.64404296875, "fcm_dpo/q_t": 0.4217052459716797, "grad_norm": 47.4921875, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -0.9666841626167297, "logits/rejected": -0.9786205291748047, "logps/chosen": -800.1752319335938, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -1233.0263671875, "loss": 1.1713, "margin_dpo/margin_mean": 395.64404296875, "margin_dpo/margin_std": 793.0050048828125, "step": 619 }, { "KL/chosen_KL_mean": -677.560546875, "KL/mean": -861.231689453125, "KL/rejected_KL_mean": -1044.9027099609375, "KL/std": 534.52587890625, "epoch": 0.9104258443465492, "fcm_dpo/beta": 0.0009550647810101509, "fcm_dpo/delta": 0.05083069950342178, "fcm_dpo/margin": 367.34222412109375, "fcm_dpo/q_t": 0.42078667879104614, "grad_norm": 38.08302307128906, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -1.0355204343795776, "logits/rejected": -1.041193962097168, "logps/chosen": -737.3580322265625, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -1123.3134765625, "loss": 1.1439, "margin_dpo/margin_mean": 367.34222412109375, "margin_dpo/margin_std": 601.9281005859375, "step": 620 }, { "KL/chosen_KL_mean": -640.847900390625, "KL/mean": -920.02197265625, "KL/rejected_KL_mean": -1199.1959228515625, "KL/std": 635.8433227539062, "epoch": 0.9118942731277533, "fcm_dpo/beta": 0.0009489471558481455, "fcm_dpo/delta": -0.13720259070396423, "fcm_dpo/margin": 558.3480834960938, "fcm_dpo/q_t": 0.3808121085166931, "grad_norm": 41.10859298706055, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -1.0572166442871094, "logits/rejected": -1.106241226196289, "logps/chosen": -694.7816772460938, "logps/ref_chosen": -53.93375778198242, "logps/ref_rejected": -88.36951446533203, "logps/rejected": -1287.5654296875, "loss": 1.0292, "margin_dpo/margin_mean": 558.3480834960938, "margin_dpo/margin_std": 704.4603271484375, "step": 621 }, { "KL/chosen_KL_mean": -638.4073486328125, "KL/mean": -838.403076171875, "KL/rejected_KL_mean": -1038.39892578125, "KL/std": 490.0394592285156, "epoch": 0.9133627019089574, "fcm_dpo/beta": 0.0009346996666863561, "fcm_dpo/delta": 0.026821225881576538, "fcm_dpo/margin": 399.9915771484375, "fcm_dpo/q_t": 0.41576099395751953, "grad_norm": 29.741287231445312, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.913569986820221, "logits/rejected": -0.9021658897399902, "logps/chosen": -698.693115234375, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -1123.9176025390625, "loss": 1.1208, "margin_dpo/margin_mean": 399.9915771484375, "margin_dpo/margin_std": 609.2018432617188, "step": 622 }, { "KL/chosen_KL_mean": -704.8582763671875, "KL/mean": -860.2454833984375, "KL/rejected_KL_mean": -1015.6328125, "KL/std": 504.6408386230469, "epoch": 0.9148311306901615, "fcm_dpo/beta": 0.0009554900461807847, "fcm_dpo/delta": 0.10620071738958359, "fcm_dpo/margin": 310.77447509765625, "fcm_dpo/q_t": 0.43424922227859497, "grad_norm": 35.98710250854492, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -1.0679330825805664, "logits/rejected": -1.069124460220337, "logps/chosen": -769.0152587890625, "logps/ref_chosen": -64.1569595336914, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -1100.7158203125, "loss": 1.188, "margin_dpo/margin_mean": 310.77447509765625, "margin_dpo/margin_std": 607.74267578125, "step": 623 }, { "KL/chosen_KL_mean": -696.2843627929688, "KL/mean": -931.2808837890625, "KL/rejected_KL_mean": -1166.27734375, "KL/std": 522.650146484375, "epoch": 0.9162995594713657, "fcm_dpo/beta": 0.0009546733344905078, "fcm_dpo/delta": -0.050946250557899475, "fcm_dpo/margin": 469.9930419921875, "fcm_dpo/q_t": 0.39522331953048706, "grad_norm": 39.16311264038086, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -1.0855488777160645, "logits/rejected": -1.0924354791641235, "logps/chosen": -768.2030029296875, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -1263.409423828125, "loss": 1.0557, "margin_dpo/margin_mean": 469.9930419921875, "margin_dpo/margin_std": 563.95068359375, "step": 624 }, { "KL/chosen_KL_mean": -648.822509765625, "KL/mean": -904.4124755859375, "KL/rejected_KL_mean": -1160.00244140625, "KL/std": 575.8782958984375, "epoch": 0.9177679882525698, "fcm_dpo/beta": 0.0009412041981704533, "fcm_dpo/delta": -0.08514019101858139, "fcm_dpo/margin": 511.17987060546875, "fcm_dpo/q_t": 0.389559268951416, "grad_norm": 60.529544830322266, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -0.9935369491577148, "logits/rejected": -1.0175690650939941, "logps/chosen": -707.16455078125, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -1246.0927734375, "loss": 1.0202, "margin_dpo/margin_mean": 511.17987060546875, "margin_dpo/margin_std": 540.048095703125, "step": 625 }, { "KL/chosen_KL_mean": -831.2083740234375, "KL/mean": -975.5403442382812, "KL/rejected_KL_mean": -1119.872314453125, "KL/std": 650.2010498046875, "epoch": 0.9192364170337739, "fcm_dpo/beta": 0.0009531835094094276, "fcm_dpo/delta": 0.1283356249332428, "fcm_dpo/margin": 288.66387939453125, "fcm_dpo/q_t": 0.4358038902282715, "grad_norm": 34.610740661621094, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.9936656951904297, "logits/rejected": -0.986907422542572, "logps/chosen": -906.321044921875, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.188720703125, "logps/rejected": -1219.06103515625, "loss": 1.2216, "margin_dpo/margin_mean": 288.66387939453125, "margin_dpo/margin_std": 637.5816650390625, "step": 626 }, { "KL/chosen_KL_mean": -562.23046875, "KL/mean": -855.1490478515625, "KL/rejected_KL_mean": -1148.0675048828125, "KL/std": 695.8411254882812, "epoch": 0.920704845814978, "fcm_dpo/beta": 0.0009427897166460752, "fcm_dpo/delta": -0.1610720157623291, "fcm_dpo/margin": 585.8370361328125, "fcm_dpo/q_t": 0.3851046562194824, "grad_norm": 25.602148056030273, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.9343521595001221, "logits/rejected": -1.0077568292617798, "logps/chosen": -609.9736328125, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -1254.822021484375, "loss": 1.0308, "margin_dpo/margin_mean": 585.8370361328125, "margin_dpo/margin_std": 801.694580078125, "step": 627 }, { "KL/chosen_KL_mean": -718.6756591796875, "KL/mean": -939.210205078125, "KL/rejected_KL_mean": -1159.744873046875, "KL/std": 562.9912109375, "epoch": 0.922173274596182, "fcm_dpo/beta": 0.0009263536194339395, "fcm_dpo/delta": -0.008988456800580025, "fcm_dpo/margin": 441.0692138671875, "fcm_dpo/q_t": 0.40781164169311523, "grad_norm": 31.12415313720703, "learning_rate": 9.543589206795238e-09, "logits/chosen": -1.0448391437530518, "logits/rejected": -1.059401273727417, "logps/chosen": -778.8585815429688, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -1261.299560546875, "loss": 1.1016, "margin_dpo/margin_mean": 441.0692138671875, "margin_dpo/margin_std": 644.30810546875, "step": 628 }, { "KL/chosen_KL_mean": -718.2134399414062, "KL/mean": -919.1171875, "KL/rejected_KL_mean": -1120.0211181640625, "KL/std": 549.9234619140625, "epoch": 0.9236417033773862, "fcm_dpo/beta": 0.0009307701839134097, "fcm_dpo/delta": 0.027018554508686066, "fcm_dpo/margin": 401.8076171875, "fcm_dpo/q_t": 0.4127495288848877, "grad_norm": 37.944095611572266, "learning_rate": 9.19555885822887e-09, "logits/chosen": -1.0313966274261475, "logits/rejected": -1.0439316034317017, "logps/chosen": -782.427001953125, "logps/ref_chosen": -64.21354675292969, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -1211.6748046875, "loss": 1.1066, "margin_dpo/margin_mean": 401.8076171875, "margin_dpo/margin_std": 538.9395751953125, "step": 629 }, { "KL/chosen_KL_mean": -656.501953125, "KL/mean": -770.98876953125, "KL/rejected_KL_mean": -885.4755249023438, "KL/std": 560.9046630859375, "epoch": 0.9251101321585903, "fcm_dpo/beta": 0.0009370100451633334, "fcm_dpo/delta": 0.05341341719031334, "fcm_dpo/margin": 228.9735565185547, "fcm_dpo/q_t": 0.4552198052406311, "grad_norm": 48.24060821533203, "learning_rate": 8.85387393063622e-09, "logits/chosen": -1.0444166660308838, "logits/rejected": -1.0228123664855957, "logps/chosen": -715.79296875, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -969.0738525390625, "loss": 1.2752, "margin_dpo/margin_mean": 228.9735565185547, "margin_dpo/margin_std": 674.7388916015625, "step": 630 }, { "KL/chosen_KL_mean": -798.7627563476562, "KL/mean": -982.8570556640625, "KL/rejected_KL_mean": -1166.951416015625, "KL/std": 575.3311767578125, "epoch": 0.9265785609397944, "fcm_dpo/beta": 0.00095040921587497, "fcm_dpo/delta": 0.051799606531858444, "fcm_dpo/margin": 368.1885986328125, "fcm_dpo/q_t": 0.41983652114868164, "grad_norm": 30.060415267944336, "learning_rate": 8.518543427732949e-09, "logits/chosen": -1.1124560832977295, "logits/rejected": -1.1201171875, "logps/chosen": -858.2163696289062, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95156860351562, "logps/rejected": -1247.9029541015625, "loss": 1.1601, "margin_dpo/margin_mean": 368.1886291503906, "margin_dpo/margin_std": 662.2765502929688, "step": 631 }, { "KL/chosen_KL_mean": -693.1824951171875, "KL/mean": -889.6626586914062, "KL/rejected_KL_mean": -1086.142822265625, "KL/std": 519.53759765625, "epoch": 0.9280469897209985, "fcm_dpo/beta": 0.0009511418174952269, "fcm_dpo/delta": 0.026925835758447647, "fcm_dpo/margin": 392.960205078125, "fcm_dpo/q_t": 0.4149981141090393, "grad_norm": 43.195838928222656, "learning_rate": 8.189576185789637e-09, "logits/chosen": -1.0623399019241333, "logits/rejected": -1.0656976699829102, "logps/chosen": -754.5340576171875, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -1172.302978515625, "loss": 1.1353, "margin_dpo/margin_mean": 392.960205078125, "margin_dpo/margin_std": 638.5850830078125, "step": 632 }, { "KL/chosen_KL_mean": -757.2740478515625, "KL/mean": -897.8375244140625, "KL/rejected_KL_mean": -1038.4010009765625, "KL/std": 511.4078369140625, "epoch": 0.9295154185022027, "fcm_dpo/beta": 0.0009779944084584713, "fcm_dpo/delta": 0.12812459468841553, "fcm_dpo/margin": 281.12689208984375, "fcm_dpo/q_t": 0.43694406747817993, "grad_norm": 51.9892692565918, "learning_rate": 7.866980873399015e-09, "logits/chosen": -1.1087684631347656, "logits/rejected": -1.1197929382324219, "logps/chosen": -814.55224609375, "logps/ref_chosen": -57.27816390991211, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -1129.98486328125, "loss": 1.2081, "margin_dpo/margin_mean": 281.1269226074219, "margin_dpo/margin_std": 584.89990234375, "step": 633 }, { "KL/chosen_KL_mean": -870.9852905273438, "KL/mean": -992.6185302734375, "KL/rejected_KL_mean": -1114.251708984375, "KL/std": 614.6510009765625, "epoch": 0.9309838472834068, "fcm_dpo/beta": 0.0009954730048775673, "fcm_dpo/delta": 0.06913463771343231, "fcm_dpo/margin": 243.26641845703125, "fcm_dpo/q_t": 0.44643303751945496, "grad_norm": 50.689117431640625, "learning_rate": 7.550765991247654e-09, "logits/chosen": -0.9997051358222961, "logits/rejected": -0.9940841197967529, "logps/chosen": -937.604248046875, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12564849853516, "logps/rejected": -1221.37744140625, "loss": 1.2403, "margin_dpo/margin_mean": 243.2664337158203, "margin_dpo/margin_std": 593.8778076171875, "step": 634 }, { "KL/chosen_KL_mean": -731.5853271484375, "KL/mean": -910.15234375, "KL/rejected_KL_mean": -1088.71923828125, "KL/std": 636.81201171875, "epoch": 0.9324522760646109, "fcm_dpo/beta": 0.0010012383572757244, "fcm_dpo/delta": 0.04401912912726402, "fcm_dpo/margin": 357.1339416503906, "fcm_dpo/q_t": 0.42191681265830994, "grad_norm": 40.568695068359375, "learning_rate": 7.240939871891699e-09, "logits/chosen": -1.059622049331665, "logits/rejected": -1.0404071807861328, "logps/chosen": -805.5408935546875, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -1171.2197265625, "loss": 1.1529, "margin_dpo/margin_mean": 357.1339416503906, "margin_dpo/margin_std": 645.6236572265625, "step": 635 }, { "KL/chosen_KL_mean": -694.026123046875, "KL/mean": -906.4205322265625, "KL/rejected_KL_mean": -1118.81494140625, "KL/std": 625.3888549804688, "epoch": 0.933920704845815, "fcm_dpo/beta": 0.0010070966091006994, "fcm_dpo/delta": -0.029895581305027008, "fcm_dpo/margin": 424.7888488769531, "fcm_dpo/q_t": 0.40687400102615356, "grad_norm": 29.367713928222656, "learning_rate": 6.937510679537628e-09, "logits/chosen": -0.9780547618865967, "logits/rejected": -0.9799286127090454, "logps/chosen": -753.655029296875, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -1200.793701171875, "loss": 1.0945, "margin_dpo/margin_mean": 424.78887939453125, "margin_dpo/margin_std": 647.4033203125, "step": 636 }, { "KL/chosen_KL_mean": -701.5499267578125, "KL/mean": -936.32275390625, "KL/rejected_KL_mean": -1171.095703125, "KL/std": 619.800537109375, "epoch": 0.9353891336270191, "fcm_dpo/beta": 0.0009838433470577002, "fcm_dpo/delta": -0.06574591249227524, "fcm_dpo/margin": 469.54571533203125, "fcm_dpo/q_t": 0.3974034786224365, "grad_norm": 28.718305587768555, "learning_rate": 6.640486409826785e-09, "logits/chosen": -1.07195246219635, "logits/rejected": -1.1217677593231201, "logps/chosen": -751.2025756835938, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -1269.500732421875, "loss": 1.0659, "margin_dpo/margin_mean": 469.54571533203125, "margin_dpo/margin_std": 641.02294921875, "step": 637 }, { "KL/chosen_KL_mean": -683.7498779296875, "KL/mean": -866.938720703125, "KL/rejected_KL_mean": -1050.1275634765625, "KL/std": 584.358154296875, "epoch": 0.9368575624082232, "fcm_dpo/beta": 0.000977477291598916, "fcm_dpo/delta": -0.07950125634670258, "fcm_dpo/margin": 366.377685546875, "fcm_dpo/q_t": 0.41288208961486816, "grad_norm": 35.89247131347656, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.9825940728187561, "logits/rejected": -0.9679138660430908, "logps/chosen": -741.9065551757812, "logps/ref_chosen": -58.156639099121094, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -1129.4290771484375, "loss": 1.1675, "margin_dpo/margin_mean": 366.377685546875, "margin_dpo/margin_std": 677.9688720703125, "step": 638 }, { "KL/chosen_KL_mean": -931.100830078125, "KL/mean": -1014.3240966796875, "KL/rejected_KL_mean": -1097.54736328125, "KL/std": 560.0640869140625, "epoch": 0.9383259911894273, "fcm_dpo/beta": 0.0009697063360363245, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 166.44647216796875, "fcm_dpo/q_t": 0.4638892412185669, "grad_norm": 106.00291442871094, "learning_rate": 6.065683776815933e-09, "logits/chosen": -0.96961510181427, "logits/rejected": -0.9110531806945801, "logps/chosen": -1003.424072265625, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -1171.822265625, "loss": 1.3398, "margin_dpo/margin_mean": 166.44647216796875, "margin_dpo/margin_std": 696.6536865234375, "step": 639 }, { "KL/chosen_KL_mean": -727.3014526367188, "KL/mean": -1004.7110595703125, "KL/rejected_KL_mean": -1282.1207275390625, "KL/std": 652.2802734375, "epoch": 0.9397944199706314, "fcm_dpo/beta": 0.0009544800268486142, "fcm_dpo/delta": -0.13660603761672974, "fcm_dpo/margin": 554.8192138671875, "fcm_dpo/q_t": 0.3860216438770294, "grad_norm": 35.883907318115234, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.9869524240493774, "logits/rejected": -1.01767098903656, "logps/chosen": -783.435791015625, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -1390.7208251953125, "loss": 1.042, "margin_dpo/margin_mean": 554.8192138671875, "margin_dpo/margin_std": 769.599853515625, "step": 640 }, { "KL/chosen_KL_mean": -833.5911865234375, "KL/mean": -1017.5059814453125, "KL/rejected_KL_mean": -1201.420654296875, "KL/std": 561.353759765625, "epoch": 0.9412628487518355, "fcm_dpo/beta": 0.0009492564713582397, "fcm_dpo/delta": 0.052702054381370544, "fcm_dpo/margin": 367.82965087890625, "fcm_dpo/q_t": 0.4235016107559204, "grad_norm": 43.7893180847168, "learning_rate": 5.516592558795746e-09, "logits/chosen": -1.0607787370681763, "logits/rejected": -1.0730290412902832, "logps/chosen": -898.5880126953125, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -1288.4130859375, "loss": 1.1796, "margin_dpo/margin_mean": 367.82965087890625, "margin_dpo/margin_std": 744.2984619140625, "step": 641 }, { "KL/chosen_KL_mean": -781.8062744140625, "KL/mean": -1004.431396484375, "KL/rejected_KL_mean": -1227.056640625, "KL/std": 735.1798706054688, "epoch": 0.9427312775330396, "fcm_dpo/beta": 0.0009536816505715251, "fcm_dpo/delta": -0.025937873870134354, "fcm_dpo/margin": 445.2503662109375, "fcm_dpo/q_t": 0.41450613737106323, "grad_norm": 38.126136779785156, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.9798089861869812, "logits/rejected": -1.0176451206207275, "logps/chosen": -847.4955444335938, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -1337.2987060546875, "loss": 1.1489, "margin_dpo/margin_mean": 445.2503662109375, "margin_dpo/margin_std": 889.4637451171875, "step": 642 }, { "KL/chosen_KL_mean": -700.666748046875, "KL/mean": -871.712890625, "KL/rejected_KL_mean": -1042.759033203125, "KL/std": 530.568603515625, "epoch": 0.9441997063142438, "fcm_dpo/beta": 0.0009416728862561285, "fcm_dpo/delta": -0.0368349552154541, "fcm_dpo/margin": 342.09228515625, "fcm_dpo/q_t": 0.42535167932510376, "grad_norm": 40.722110748291016, "learning_rate": 4.993270631642038e-09, "logits/chosen": -1.0998975038528442, "logits/rejected": -1.1016184091567993, "logps/chosen": -752.61669921875, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -1130.227294921875, "loss": 1.1546, "margin_dpo/margin_mean": 342.09228515625, "margin_dpo/margin_std": 534.0775146484375, "step": 643 }, { "KL/chosen_KL_mean": -692.7113037109375, "KL/mean": -867.8548583984375, "KL/rejected_KL_mean": -1042.9984130859375, "KL/std": 628.4418334960938, "epoch": 0.9456681350954479, "fcm_dpo/beta": 0.0009508873336017132, "fcm_dpo/delta": 0.06925636529922485, "fcm_dpo/margin": 350.287109375, "fcm_dpo/q_t": 0.42657724022865295, "grad_norm": 48.49483871459961, "learning_rate": 4.741290495811873e-09, "logits/chosen": -1.0195714235305786, "logits/rejected": -1.0287786722183228, "logps/chosen": -751.72900390625, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -1130.135009765625, "loss": 1.1883, "margin_dpo/margin_mean": 350.287109375, "margin_dpo/margin_std": 713.6632080078125, "step": 644 }, { "KL/chosen_KL_mean": -731.31298828125, "KL/mean": -818.722412109375, "KL/rejected_KL_mean": -906.1319580078125, "KL/std": 500.92510986328125, "epoch": 0.947136563876652, "fcm_dpo/beta": 0.0009679758222773671, "fcm_dpo/delta": 0.06848917156457901, "fcm_dpo/margin": 174.8188934326172, "fcm_dpo/q_t": 0.4633423388004303, "grad_norm": 94.91219329833984, "learning_rate": 4.495773155069299e-09, "logits/chosen": -1.0337581634521484, "logits/rejected": -1.0212106704711914, "logps/chosen": -787.1890258789062, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -1003.9127197265625, "loss": 1.3337, "margin_dpo/margin_mean": 174.81890869140625, "margin_dpo/margin_std": 677.0361328125, "step": 645 }, { "KL/chosen_KL_mean": -678.8671875, "KL/mean": -832.9037475585938, "KL/rejected_KL_mean": -986.9403686523438, "KL/std": 467.6722412109375, "epoch": 0.9486049926578561, "fcm_dpo/beta": 0.000979509437456727, "fcm_dpo/delta": 0.10140877962112427, "fcm_dpo/margin": 308.0731201171875, "fcm_dpo/q_t": 0.4318525791168213, "grad_norm": 50.75778579711914, "learning_rate": 4.256725079024553e-09, "logits/chosen": -1.0464283227920532, "logits/rejected": -1.0351706743240356, "logps/chosen": -740.1429443359375, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -1064.4461669921875, "loss": 1.1804, "margin_dpo/margin_mean": 308.0731201171875, "margin_dpo/margin_std": 558.4867553710938, "step": 646 }, { "KL/chosen_KL_mean": -601.7261962890625, "KL/mean": -791.6389770507812, "KL/rejected_KL_mean": -981.5518188476562, "KL/std": 535.917724609375, "epoch": 0.9500734214390602, "fcm_dpo/beta": 0.0009915875270962715, "fcm_dpo/delta": 0.024278640747070312, "fcm_dpo/margin": 379.82562255859375, "fcm_dpo/q_t": 0.41298890113830566, "grad_norm": 31.015090942382812, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.9510085582733154, "logits/rejected": -0.9799119830131531, "logps/chosen": -656.57861328125, "logps/ref_chosen": -54.8524169921875, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -1075.0712890625, "loss": 1.1086, "margin_dpo/margin_mean": 379.82562255859375, "margin_dpo/margin_std": 518.4892578125, "step": 647 }, { "KL/chosen_KL_mean": -650.6891479492188, "KL/mean": -929.8692016601562, "KL/rejected_KL_mean": -1209.0491943359375, "KL/std": 650.9400024414062, "epoch": 0.9515418502202643, "fcm_dpo/beta": 0.0009713097242638469, "fcm_dpo/delta": -0.15041759610176086, "fcm_dpo/margin": 558.360107421875, "fcm_dpo/q_t": 0.3838508427143097, "grad_norm": 26.95428466796875, "learning_rate": 3.798061746947995e-09, "logits/chosen": -1.0590667724609375, "logits/rejected": -1.1171326637268066, "logps/chosen": -704.860595703125, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.7127914428711, "logps/rejected": -1307.761962890625, "loss": 1.0313, "margin_dpo/margin_mean": 558.360107421875, "margin_dpo/margin_std": 770.7064208984375, "step": 648 }, { "KL/chosen_KL_mean": -675.1800537109375, "KL/mean": -811.64111328125, "KL/rejected_KL_mean": -948.1021118164062, "KL/std": 509.8891296386719, "epoch": 0.9530102790014684, "fcm_dpo/beta": 0.0009851048234850168, "fcm_dpo/delta": 0.13432249426841736, "fcm_dpo/margin": 272.9219970703125, "fcm_dpo/q_t": 0.44306886196136475, "grad_norm": 30.479537963867188, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -1.0864759683609009, "logits/rejected": -1.081239938735962, "logps/chosen": -737.660400390625, "logps/ref_chosen": -62.480350494384766, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -1028.1793212890625, "loss": 1.2299, "margin_dpo/margin_mean": 272.9219665527344, "margin_dpo/margin_std": 634.4632568359375, "step": 649 }, { "KL/chosen_KL_mean": -719.4322509765625, "KL/mean": -930.966552734375, "KL/rejected_KL_mean": -1142.5008544921875, "KL/std": 623.9461059570312, "epoch": 0.9544787077826725, "fcm_dpo/beta": 0.000984450918622315, "fcm_dpo/delta": -0.017372816801071167, "fcm_dpo/margin": 423.06866455078125, "fcm_dpo/q_t": 0.4085081219673157, "grad_norm": 33.085140228271484, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -1.0022144317626953, "logits/rejected": -1.0228140354156494, "logps/chosen": -775.5250244140625, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -1240.765625, "loss": 1.125, "margin_dpo/margin_mean": 423.06866455078125, "margin_dpo/margin_std": 715.4996337890625, "step": 650 }, { "KL/chosen_KL_mean": -485.9007263183594, "KL/mean": -757.1597900390625, "KL/rejected_KL_mean": -1028.4189453125, "KL/std": 586.595703125, "epoch": 0.9559471365638766, "fcm_dpo/beta": 0.0009662117809057236, "fcm_dpo/delta": -0.13117295503616333, "fcm_dpo/margin": 542.51806640625, "fcm_dpo/q_t": 0.3815712332725525, "grad_norm": 39.065223693847656, "learning_rate": 3.158738163478475e-09, "logits/chosen": -1.039862871170044, "logits/rejected": -1.0981051921844482, "logps/chosen": -529.326171875, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.95791625976562, "logps/rejected": -1128.376708984375, "loss": 1.0046, "margin_dpo/margin_mean": 542.5181274414062, "margin_dpo/margin_std": 605.7119140625, "step": 651 }, { "KL/chosen_KL_mean": -620.2460327148438, "KL/mean": -826.7354736328125, "KL/rejected_KL_mean": -1033.224853515625, "KL/std": 587.8150024414062, "epoch": 0.9574155653450808, "fcm_dpo/beta": 0.0009623857913538814, "fcm_dpo/delta": 0.0026037218049168587, "fcm_dpo/margin": 412.978759765625, "fcm_dpo/q_t": 0.41180309653282166, "grad_norm": 32.109920501708984, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -1.0359432697296143, "logits/rejected": -1.061659812927246, "logps/chosen": -682.8228759765625, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -1144.99267578125, "loss": 1.1236, "margin_dpo/margin_mean": 412.978759765625, "margin_dpo/margin_std": 679.6004638671875, "step": 652 }, { "KL/chosen_KL_mean": -751.1220703125, "KL/mean": -942.161376953125, "KL/rejected_KL_mean": -1133.20068359375, "KL/std": 632.44580078125, "epoch": 0.9588839941262849, "fcm_dpo/beta": 0.0009670084109529853, "fcm_dpo/delta": 0.031598955392837524, "fcm_dpo/margin": 382.0786437988281, "fcm_dpo/q_t": 0.418673038482666, "grad_norm": 33.81359100341797, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -1.081420660018921, "logits/rejected": -1.106847882270813, "logps/chosen": -812.2350463867188, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -1236.4503173828125, "loss": 1.1426, "margin_dpo/margin_mean": 382.07867431640625, "margin_dpo/margin_std": 657.617431640625, "step": 653 }, { "KL/chosen_KL_mean": -700.0304565429688, "KL/mean": -884.8853759765625, "KL/rejected_KL_mean": -1069.740234375, "KL/std": 530.6134643554688, "epoch": 0.960352422907489, "fcm_dpo/beta": 0.0009726278949528933, "fcm_dpo/delta": 0.041933320462703705, "fcm_dpo/margin": 369.70977783203125, "fcm_dpo/q_t": 0.42105910181999207, "grad_norm": 36.13345718383789, "learning_rate": 2.577954022936174e-09, "logits/chosen": -1.0889091491699219, "logits/rejected": -1.1061911582946777, "logps/chosen": -761.7586059570312, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -1168.5140380859375, "loss": 1.1425, "margin_dpo/margin_mean": 369.70977783203125, "margin_dpo/margin_std": 623.5421142578125, "step": 654 }, { "KL/chosen_KL_mean": -637.3580322265625, "KL/mean": -832.098876953125, "KL/rejected_KL_mean": -1026.83984375, "KL/std": 523.131103515625, "epoch": 0.9618208516886931, "fcm_dpo/beta": 0.0009784356225281954, "fcm_dpo/delta": 0.01965608447790146, "fcm_dpo/margin": 389.4818115234375, "fcm_dpo/q_t": 0.4150038957595825, "grad_norm": 30.601289749145508, "learning_rate": 2.397392281198729e-09, "logits/chosen": -1.062340259552002, "logits/rejected": -1.1041678190231323, "logps/chosen": -686.934814453125, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -1125.131591796875, "loss": 1.1251, "margin_dpo/margin_mean": 389.4818115234375, "margin_dpo/margin_std": 619.3175659179688, "step": 655 }, { "KL/chosen_KL_mean": -698.026123046875, "KL/mean": -1021.94140625, "KL/rejected_KL_mean": -1345.856689453125, "KL/std": 696.60986328125, "epoch": 0.9632892804698973, "fcm_dpo/beta": 0.0009496092097833753, "fcm_dpo/delta": -0.2287594974040985, "fcm_dpo/margin": 647.8305053710938, "fcm_dpo/q_t": 0.36486658453941345, "grad_norm": 84.34307861328125, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.9495760202407837, "logits/rejected": -1.0183899402618408, "logps/chosen": -750.5755615234375, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -1459.5313720703125, "loss": 0.9582, "margin_dpo/margin_mean": 647.8305053710938, "margin_dpo/margin_std": 702.5700073242188, "step": 656 }, { "KL/chosen_KL_mean": -649.0306396484375, "KL/mean": -907.1767578125, "KL/rejected_KL_mean": -1165.3228759765625, "KL/std": 657.1854248046875, "epoch": 0.9647577092511013, "fcm_dpo/beta": 0.0009227419504895806, "fcm_dpo/delta": -0.08039526641368866, "fcm_dpo/margin": 516.2921142578125, "fcm_dpo/q_t": 0.39258694648742676, "grad_norm": 39.40578079223633, "learning_rate": 2.055847060721566e-09, "logits/chosen": -1.1030490398406982, "logits/rejected": -1.1474685668945312, "logps/chosen": -695.731201171875, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -1263.23779296875, "loss": 1.0539, "margin_dpo/margin_mean": 516.2921142578125, "margin_dpo/margin_std": 684.8028564453125, "step": 657 }, { "KL/chosen_KL_mean": -707.4656982421875, "KL/mean": -911.14208984375, "KL/rejected_KL_mean": -1114.818603515625, "KL/std": 517.7315063476562, "epoch": 0.9662261380323054, "fcm_dpo/beta": 0.0009188736439682543, "fcm_dpo/delta": 0.026215653866529465, "fcm_dpo/margin": 407.352783203125, "fcm_dpo/q_t": 0.41408517956733704, "grad_norm": 35.71732711791992, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -1.0394493341445923, "logits/rejected": -1.0688188076019287, "logps/chosen": -768.4239501953125, "logps/ref_chosen": -60.95820999145508, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -1210.758056640625, "loss": 1.1197, "margin_dpo/margin_mean": 407.352783203125, "margin_dpo/margin_std": 595.02197265625, "step": 658 }, { "KL/chosen_KL_mean": -623.0128784179688, "KL/mean": -827.2291259765625, "KL/rejected_KL_mean": -1031.4453125, "KL/std": 528.8475341796875, "epoch": 0.9676945668135095, "fcm_dpo/beta": 0.000925220490898937, "fcm_dpo/delta": 0.022823944687843323, "fcm_dpo/margin": 408.4324645996094, "fcm_dpo/q_t": 0.414547324180603, "grad_norm": 32.48310852050781, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.990066409111023, "logits/rejected": -0.9657001495361328, "logps/chosen": -699.755859375, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -1118.916259765625, "loss": 1.1117, "margin_dpo/margin_mean": 408.4324645996094, "margin_dpo/margin_std": 585.33984375, "step": 659 }, { "KL/chosen_KL_mean": -673.4608154296875, "KL/mean": -933.520263671875, "KL/rejected_KL_mean": -1193.5797119140625, "KL/std": 613.996826171875, "epoch": 0.9691629955947136, "fcm_dpo/beta": 0.0009196768514811993, "fcm_dpo/delta": -0.08219671249389648, "fcm_dpo/margin": 520.118896484375, "fcm_dpo/q_t": 0.39171260595321655, "grad_norm": 41.30915451049805, "learning_rate": 1.592541096695571e-09, "logits/chosen": -1.0613317489624023, "logits/rejected": -1.0815818309783936, "logps/chosen": -732.5086669921875, "logps/ref_chosen": -59.04788589477539, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -1269.539794921875, "loss": 1.0459, "margin_dpo/margin_mean": 520.118896484375, "margin_dpo/margin_std": 653.3119506835938, "step": 660 }, { "KL/chosen_KL_mean": -606.4295654296875, "KL/mean": -838.5125732421875, "KL/rejected_KL_mean": -1070.595458984375, "KL/std": 678.2152709960938, "epoch": 0.9706314243759178, "fcm_dpo/beta": 0.0009147179080173373, "fcm_dpo/delta": -0.02594481222331524, "fcm_dpo/margin": 464.16583251953125, "fcm_dpo/q_t": 0.4064168334007263, "grad_norm": 51.29008865356445, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -1.0680885314941406, "logits/rejected": -1.084218978881836, "logps/chosen": -657.1035766601562, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -1156.6011962890625, "loss": 1.0866, "margin_dpo/margin_mean": 464.16583251953125, "margin_dpo/margin_std": 674.4207153320312, "step": 661 }, { "KL/chosen_KL_mean": -692.4723510742188, "KL/mean": -874.324462890625, "KL/rejected_KL_mean": -1056.1767578125, "KL/std": 554.696044921875, "epoch": 0.9720998531571219, "fcm_dpo/beta": 0.0009189635748043656, "fcm_dpo/delta": 0.06803098320960999, "fcm_dpo/margin": 363.70428466796875, "fcm_dpo/q_t": 0.42484885454177856, "grad_norm": 30.058595657348633, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -0.986479640007019, "logits/rejected": -0.9849323034286499, "logps/chosen": -761.7333984375, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -1145.232666015625, "loss": 1.1704, "margin_dpo/margin_mean": 363.70428466796875, "margin_dpo/margin_std": 682.1209106445312, "step": 662 }, { "KL/chosen_KL_mean": -647.2999267578125, "KL/mean": -866.9969482421875, "KL/rejected_KL_mean": -1086.6939697265625, "KL/std": 632.6029052734375, "epoch": 0.973568281938326, "fcm_dpo/beta": 0.0009198928019031882, "fcm_dpo/delta": -0.004432424902915955, "fcm_dpo/margin": 439.39404296875, "fcm_dpo/q_t": 0.4112858176231384, "grad_norm": 27.409046173095703, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -1.0472636222839355, "logits/rejected": -1.0724174976348877, "logps/chosen": -712.1788330078125, "logps/ref_chosen": -64.87890625, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -1200.619384765625, "loss": 1.1207, "margin_dpo/margin_mean": 439.39404296875, "margin_dpo/margin_std": 730.9638671875, "step": 663 }, { "KL/chosen_KL_mean": -672.3688354492188, "KL/mean": -912.7847900390625, "KL/rejected_KL_mean": -1153.2008056640625, "KL/std": 606.9482421875, "epoch": 0.9750367107195301, "fcm_dpo/beta": 0.0009121259208768606, "fcm_dpo/delta": -0.04079785570502281, "fcm_dpo/margin": 480.83197021484375, "fcm_dpo/q_t": 0.401122510433197, "grad_norm": 32.19367218017578, "learning_rate": 1.066455926241383e-09, "logits/chosen": -1.0128577947616577, "logits/rejected": -1.0457968711853027, "logps/chosen": -733.25732421875, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -1258.7225341796875, "loss": 1.0745, "margin_dpo/margin_mean": 480.83197021484375, "margin_dpo/margin_std": 644.64111328125, "step": 664 }, { "KL/chosen_KL_mean": -613.8486328125, "KL/mean": -820.011474609375, "KL/rejected_KL_mean": -1026.17431640625, "KL/std": 517.1300048828125, "epoch": 0.9765051395007343, "fcm_dpo/beta": 0.0009164921357296407, "fcm_dpo/delta": 0.022980544716119766, "fcm_dpo/margin": 412.32568359375, "fcm_dpo/q_t": 0.41263529658317566, "grad_norm": 42.685340881347656, "learning_rate": 9.513254770636137e-10, "logits/chosen": -1.1305358409881592, "logits/rejected": -1.1555566787719727, "logps/chosen": -674.4127807617188, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.80882263183594, "logps/rejected": -1110.983154296875, "loss": 1.0985, "margin_dpo/margin_mean": 412.32568359375, "margin_dpo/margin_std": 522.885009765625, "step": 665 }, { "KL/chosen_KL_mean": -656.64306640625, "KL/mean": -868.8929443359375, "KL/rejected_KL_mean": -1081.1427001953125, "KL/std": 534.3782348632812, "epoch": 0.9779735682819384, "fcm_dpo/beta": 0.0009179958724416792, "fcm_dpo/delta": 0.010700155980885029, "fcm_dpo/margin": 424.4996643066406, "fcm_dpo/q_t": 0.4115890562534332, "grad_norm": 31.538972854614258, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.9657202959060669, "logits/rejected": -0.9787443280220032, "logps/chosen": -721.06298828125, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.8916244506836, "logps/rejected": -1177.034423828125, "loss": 1.1055, "margin_dpo/margin_mean": 424.4996643066406, "margin_dpo/margin_std": 592.3191528320312, "step": 666 }, { "KL/chosen_KL_mean": -739.8983764648438, "KL/mean": -986.7392578125, "KL/rejected_KL_mean": -1233.580078125, "KL/std": 586.67529296875, "epoch": 0.9794419970631424, "fcm_dpo/beta": 0.0009104580385610461, "fcm_dpo/delta": -0.05206644535064697, "fcm_dpo/margin": 493.6817321777344, "fcm_dpo/q_t": 0.3979244828224182, "grad_norm": 36.603797912597656, "learning_rate": 7.407554321417764e-10, "logits/chosen": -0.9688647389411926, "logits/rejected": -0.9707045555114746, "logps/chosen": -809.1754150390625, "logps/ref_chosen": -69.27702331542969, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -1321.41552734375, "loss": 1.0643, "margin_dpo/margin_mean": 493.68170166015625, "margin_dpo/margin_std": 641.2989501953125, "step": 667 }, { "KL/chosen_KL_mean": -802.19189453125, "KL/mean": -976.08984375, "KL/rejected_KL_mean": -1149.98779296875, "KL/std": 634.327392578125, "epoch": 0.9809104258443465, "fcm_dpo/beta": 0.0009259539656341076, "fcm_dpo/delta": 0.07960406690835953, "fcm_dpo/margin": 347.7958984375, "fcm_dpo/q_t": 0.4307333827018738, "grad_norm": 52.49308395385742, "learning_rate": 6.453213851142225e-10, "logits/chosen": -1.0459859371185303, "logits/rejected": -1.0500774383544922, "logps/chosen": -874.7958984375, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905944824219, "logps/rejected": -1253.726806640625, "loss": 1.2043, "margin_dpo/margin_mean": 347.7958984375, "margin_dpo/margin_std": 759.7967529296875, "step": 668 }, { "KL/chosen_KL_mean": -591.334228515625, "KL/mean": -840.5963134765625, "KL/rejected_KL_mean": -1089.8583984375, "KL/std": 579.8489990234375, "epoch": 0.9823788546255506, "fcm_dpo/beta": 0.0009176377207040787, "fcm_dpo/delta": -0.06017923727631569, "fcm_dpo/margin": 498.524169921875, "fcm_dpo/q_t": 0.3957340717315674, "grad_norm": 30.24985122680664, "learning_rate": 5.564580657695939e-10, "logits/chosen": -1.0195106267929077, "logits/rejected": -1.0302537679672241, "logps/chosen": -637.45068359375, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -1167.78271484375, "loss": 1.0627, "margin_dpo/margin_mean": 498.524169921875, "margin_dpo/margin_std": 653.9466552734375, "step": 669 }, { "KL/chosen_KL_mean": -564.9832763671875, "KL/mean": -813.625732421875, "KL/rejected_KL_mean": -1062.26806640625, "KL/std": 539.938232421875, "epoch": 0.9838472834067548, "fcm_dpo/beta": 0.0009101468604058027, "fcm_dpo/delta": -0.055214740335941315, "fcm_dpo/margin": 497.284912109375, "fcm_dpo/q_t": 0.39692699909210205, "grad_norm": 27.409191131591797, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.9492954015731812, "logits/rejected": -0.9691870212554932, "logps/chosen": -627.3289794921875, "logps/ref_chosen": -62.34575271606445, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -1159.208740234375, "loss": 1.0676, "margin_dpo/margin_mean": 497.284912109375, "margin_dpo/margin_std": 651.83740234375, "step": 670 }, { "KL/chosen_KL_mean": -729.5746459960938, "KL/mean": -929.4378662109375, "KL/rejected_KL_mean": -1129.301025390625, "KL/std": 544.60791015625, "epoch": 0.9853157121879589, "fcm_dpo/beta": 0.000911594950594008, "fcm_dpo/delta": 0.03665146976709366, "fcm_dpo/margin": 399.72637939453125, "fcm_dpo/q_t": 0.4158746898174286, "grad_norm": 29.13888168334961, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -1.0544450283050537, "logits/rejected": -1.084800362586975, "logps/chosen": -777.5747680664062, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -1213.120361328125, "loss": 1.1388, "margin_dpo/margin_mean": 399.7263488769531, "margin_dpo/margin_std": 654.765869140625, "step": 671 }, { "KL/chosen_KL_mean": -805.3826904296875, "KL/mean": -1007.677001953125, "KL/rejected_KL_mean": -1209.97119140625, "KL/std": 662.1884155273438, "epoch": 0.986784140969163, "fcm_dpo/beta": 0.0009131274418905377, "fcm_dpo/delta": 0.03172078728675842, "fcm_dpo/margin": 404.588623046875, "fcm_dpo/q_t": 0.4185020923614502, "grad_norm": 53.180294036865234, "learning_rate": 3.293150240547549e-10, "logits/chosen": -1.1241331100463867, "logits/rejected": -1.1317377090454102, "logps/chosen": -863.9659423828125, "logps/ref_chosen": -58.58328628540039, "logps/ref_rejected": -93.14015197753906, "logps/rejected": -1303.1114501953125, "loss": 1.156, "margin_dpo/margin_mean": 404.588623046875, "margin_dpo/margin_std": 734.8128662109375, "step": 672 }, { "KL/chosen_KL_mean": -725.3074951171875, "KL/mean": -918.955078125, "KL/rejected_KL_mean": -1112.602783203125, "KL/std": 561.255859375, "epoch": 0.9882525697503671, "fcm_dpo/beta": 0.000922200852073729, "fcm_dpo/delta": 0.04443016275763512, "fcm_dpo/margin": 387.2952880859375, "fcm_dpo/q_t": 0.41957566142082214, "grad_norm": 33.6239128112793, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -1.0476765632629395, "logits/rejected": -1.0524837970733643, "logps/chosen": -772.0306396484375, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -1197.89892578125, "loss": 1.1365, "margin_dpo/margin_mean": 387.2952880859375, "margin_dpo/margin_std": 632.4697875976562, "step": 673 }, { "KL/chosen_KL_mean": -585.1640625, "KL/mean": -831.7803955078125, "KL/rejected_KL_mean": -1078.396728515625, "KL/std": 547.6962890625, "epoch": 0.9897209985315712, "fcm_dpo/beta": 0.0009187724208459258, "fcm_dpo/delta": -0.055665817111730576, "fcm_dpo/margin": 493.232666015625, "fcm_dpo/q_t": 0.39939314126968384, "grad_norm": 33.85618209838867, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.9810643196105957, "logits/rejected": -1.0035473108291626, "logps/chosen": -630.609619140625, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -1148.442626953125, "loss": 1.0637, "margin_dpo/margin_mean": 493.232666015625, "margin_dpo/margin_std": 653.6361083984375, "step": 674 }, { "KL/chosen_KL_mean": -663.6976318359375, "KL/mean": -916.9816284179688, "KL/rejected_KL_mean": -1170.265625, "KL/std": 600.1426391601562, "epoch": 0.9911894273127754, "fcm_dpo/beta": 0.0008998748380690813, "fcm_dpo/delta": -0.0599069781601429, "fcm_dpo/margin": 506.56787109375, "fcm_dpo/q_t": 0.39825230836868286, "grad_norm": 27.58077049255371, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -1.038741111755371, "logits/rejected": -1.0553760528564453, "logps/chosen": -707.8739624023438, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -1244.357666015625, "loss": 1.0612, "margin_dpo/margin_mean": 506.56787109375, "margin_dpo/margin_std": 637.0610961914062, "step": 675 }, { "KL/chosen_KL_mean": -725.62841796875, "KL/mean": -966.3801879882812, "KL/rejected_KL_mean": -1207.1319580078125, "KL/std": 584.3846435546875, "epoch": 0.9926578560939795, "fcm_dpo/beta": 0.0009004472522065043, "fcm_dpo/delta": -0.03510238975286484, "fcm_dpo/margin": 481.5036315917969, "fcm_dpo/q_t": 0.4012463092803955, "grad_norm": 27.7912654876709, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -0.995841920375824, "logits/rejected": -1.0157501697540283, "logps/chosen": -797.0269165039062, "logps/ref_chosen": -71.39852905273438, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -1295.49072265625, "loss": 1.0661, "margin_dpo/margin_mean": 481.50360107421875, "margin_dpo/margin_std": 591.6707763671875, "step": 676 }, { "KL/chosen_KL_mean": -729.6778564453125, "KL/mean": -962.5052490234375, "KL/rejected_KL_mean": -1195.332763671875, "KL/std": 609.0902099609375, "epoch": 0.9941262848751835, "fcm_dpo/beta": 0.0008935732766985893, "fcm_dpo/delta": -0.016854500398039818, "fcm_dpo/margin": 465.65484619140625, "fcm_dpo/q_t": 0.41068124771118164, "grad_norm": 29.935705184936523, "learning_rate": 8.23423165278725e-11, "logits/chosen": -1.073415756225586, "logits/rejected": -1.0710859298706055, "logps/chosen": -786.2052612304688, "logps/ref_chosen": -56.527435302734375, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -1273.559326171875, "loss": 1.1046, "margin_dpo/margin_mean": 465.65484619140625, "margin_dpo/margin_std": 735.0831298828125, "step": 677 }, { "KL/chosen_KL_mean": -608.3975830078125, "KL/mean": -880.777587890625, "KL/rejected_KL_mean": -1153.1575927734375, "KL/std": 650.5914306640625, "epoch": 0.9955947136563876, "fcm_dpo/beta": 0.00088664231589064, "fcm_dpo/delta": -0.08726058155298233, "fcm_dpo/margin": 544.7600708007812, "fcm_dpo/q_t": 0.39224404096603394, "grad_norm": 33.50616455078125, "learning_rate": 5.270012410216185e-11, "logits/chosen": -1.0082026720046997, "logits/rejected": -1.0482615232467651, "logps/chosen": -654.531982421875, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -1233.76220703125, "loss": 1.0566, "margin_dpo/margin_mean": 544.7600708007812, "margin_dpo/margin_std": 738.8284912109375, "step": 678 }, { "KL/chosen_KL_mean": -707.9012451171875, "KL/mean": -900.2404174804688, "KL/rejected_KL_mean": -1092.57958984375, "KL/std": 537.857177734375, "epoch": 0.9970631424375918, "fcm_dpo/beta": 0.0008858998189680278, "fcm_dpo/delta": 0.06133866682648659, "fcm_dpo/margin": 384.67840576171875, "fcm_dpo/q_t": 0.42340487241744995, "grad_norm": 31.147796630859375, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -1.03668212890625, "logits/rejected": -1.0291433334350586, "logps/chosen": -758.1961669921875, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -1169.177734375, "loss": 1.1451, "margin_dpo/margin_mean": 384.678466796875, "margin_dpo/margin_std": 626.0743408203125, "step": 679 }, { "KL/chosen_KL_mean": -720.6077880859375, "KL/mean": -977.27392578125, "KL/rejected_KL_mean": -1233.93994140625, "KL/std": 704.15478515625, "epoch": 0.9985315712187959, "fcm_dpo/beta": 0.0008777154725976288, "fcm_dpo/delta": -0.05377676337957382, "fcm_dpo/margin": 513.332275390625, "fcm_dpo/q_t": 0.39790278673171997, "grad_norm": 36.256160736083984, "learning_rate": 1.31753782067201e-11, "logits/chosen": -1.0335376262664795, "logits/rejected": -1.0608773231506348, "logps/chosen": -797.5235595703125, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -1346.32470703125, "loss": 1.0875, "margin_dpo/margin_mean": 513.332275390625, "margin_dpo/margin_std": 758.932373046875, "step": 680 }, { "KL/chosen_KL_mean": -707.947998046875, "KL/mean": -907.2955322265625, "KL/rejected_KL_mean": -1106.64306640625, "KL/std": 573.4292602539062, "epoch": 1.0, "fcm_dpo/beta": 0.0008919438696466386, "fcm_dpo/delta": 0.04472469165921211, "fcm_dpo/margin": 398.6950988769531, "fcm_dpo/q_t": 0.41937246918678284, "grad_norm": 42.854522705078125, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -1.0845022201538086, "logits/rejected": -1.1038618087768555, "logps/chosen": -768.9052734375, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.55797576904297, "logps/rejected": -1195.200927734375, "loss": 1.1404, "margin_dpo/margin_mean": 398.6950988769531, "margin_dpo/margin_std": 622.65087890625, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 1.095842420442164, "train_runtime": 1736.9553, "train_samples_per_second": 25.1, "train_steps_per_second": 0.392 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }