{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "KL/chosen_KL_mean": 0.00527191162109375, "KL/mean": 0.016706019639968872, "KL/rejected_KL_mean": 0.028141021728515625, "KL/std": 0.272699236869812, "epoch": 0.0014684287812041115, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.02287006378173828, "fcm_dpo/q_t": 0.5027250051498413, "grad_norm": 420.2432861328125, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.4087, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "KL/chosen_KL_mean": -0.03498649597167969, "KL/mean": -0.00212840735912323, "KL/rejected_KL_mean": 0.030735015869140625, "KL/std": 0.24797174334526062, "epoch": 0.002936857562408223, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.06572261452674866, "fcm_dpo/q_t": 0.5081548094749451, "grad_norm": 364.62652587890625, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.49536412954330444, "logits/rejected": -0.4594460427761078, "logps/chosen": -52.65568923950195, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.4271, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "KL/chosen_KL_mean": 0.052303314208984375, "KL/mean": 0.017774119973182678, "KL/rejected_KL_mean": -0.016756057739257812, "KL/std": 0.28824305534362793, "epoch": 0.004405286343612335, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.06905469298362732, "fcm_dpo/q_t": 0.4915676712989807, "grad_norm": 347.2252197265625, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.4816562235355377, "logits/rejected": -0.44209641218185425, "logps/chosen": -60.929290771484375, "logps/ref_chosen": -60.981597900390625, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.6893539428711, "loss": 1.362, "margin_dpo/margin_mean": 0.06905469298362732, "margin_dpo/margin_std": 0.3988131284713745, "step": 3 }, { "KL/chosen_KL_mean": -0.021808624267578125, "KL/mean": -0.011183008551597595, "KL/rejected_KL_mean": -0.000560760498046875, "KL/std": 0.250108003616333, "epoch": 0.005873715124816446, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.021249920129776, "fcm_dpo/q_t": 0.5027137994766235, "grad_norm": 359.3165588378906, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.4682066738605499, "logits/rejected": -0.44051969051361084, "logps/chosen": -56.789520263671875, "logps/ref_chosen": -56.7677116394043, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.64767456054688, "loss": 1.4044, "margin_dpo/margin_mean": -0.02125033736228943, "margin_dpo/margin_std": 0.33959275484085083, "step": 4 }, { "KL/chosen_KL_mean": 0.040134429931640625, "KL/mean": 0.021857306361198425, "KL/rejected_KL_mean": 0.003582000732421875, "KL/std": 0.26523804664611816, "epoch": 0.007342143906020558, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.036554425954818726, "fcm_dpo/q_t": 0.4954211413860321, "grad_norm": 448.5081481933594, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.49668556451797485, "logits/rejected": -0.45167264342308044, "logps/chosen": -53.81924057006836, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.14559936523438, "loss": 1.3765, "margin_dpo/margin_mean": 0.03655460476875305, "margin_dpo/margin_std": 0.3572620153427124, "step": 5 }, { "KL/chosen_KL_mean": -0.011350631713867188, "KL/mean": 0.008662402629852295, "KL/rejected_KL_mean": 0.028675079345703125, "KL/std": 0.28275883197784424, "epoch": 0.00881057268722467, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.04002267122268677, "fcm_dpo/q_t": 0.5049124956130981, "grad_norm": 474.76165771484375, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.5011003613471985, "logits/rejected": -0.4586023688316345, "logps/chosen": -63.018836975097656, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.61666870117188, "loss": 1.4172, "margin_dpo/margin_mean": -0.04002311825752258, "margin_dpo/margin_std": 0.41552552580833435, "step": 6 }, { "KL/chosen_KL_mean": 0.017522811889648438, "KL/mean": -0.0009044557809829712, "KL/rejected_KL_mean": -0.0193328857421875, "KL/std": 0.27743956446647644, "epoch": 0.010279001468428781, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0368523895740509, "fcm_dpo/q_t": 0.49542200565338135, "grad_norm": 406.9675598144531, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5030827522277832, "logits/rejected": -0.4692496657371521, "logps/chosen": -57.75729751586914, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.93992614746094, "loss": 1.3777, "margin_dpo/margin_mean": 0.03685298562049866, "margin_dpo/margin_std": 0.3953211307525635, "step": 7 }, { "KL/chosen_KL_mean": -0.03408622741699219, "KL/mean": -0.005419567227363586, "KL/rejected_KL_mean": 0.023250579833984375, "KL/std": 0.2861067056655884, "epoch": 0.011747430249632892, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.05733850598335266, "fcm_dpo/q_t": 0.5070033073425293, "grad_norm": 401.7236328125, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5170855522155762, "logits/rejected": -0.4922248125076294, "logps/chosen": -58.7501220703125, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.28817749023438, "loss": 1.4225, "margin_dpo/margin_mean": -0.05733811855316162, "margin_dpo/margin_std": 0.3359847962856293, "step": 8 }, { "KL/chosen_KL_mean": -0.017595291137695312, "KL/mean": -0.022840231657028198, "KL/rejected_KL_mean": -0.02808380126953125, "KL/std": 0.28952154517173767, "epoch": 0.013215859030837005, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.010491371154785156, "fcm_dpo/q_t": 0.49866122007369995, "grad_norm": 423.5147705078125, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.5013039708137512, "logits/rejected": -0.45518267154693604, "logps/chosen": -69.88443756103516, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.63075256347656, "loss": 1.3925, "margin_dpo/margin_mean": 0.010491013526916504, "margin_dpo/margin_std": 0.42117273807525635, "step": 9 }, { "KL/chosen_KL_mean": 0.0170440673828125, "KL/mean": 0.02462557703256607, "KL/rejected_KL_mean": 0.0322113037109375, "KL/std": 0.2662718594074249, "epoch": 0.014684287812041116, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.015163615345954895, "fcm_dpo/q_t": 0.5018855333328247, "grad_norm": 353.2280578613281, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.4687877297401428, "logits/rejected": -0.42438995838165283, "logps/chosen": -48.340641021728516, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.33985137939453, "loss": 1.4024, "margin_dpo/margin_mean": -0.015163183212280273, "margin_dpo/margin_std": 0.35796934366226196, "step": 10 }, { "KL/chosen_KL_mean": 0.021253585815429688, "KL/mean": 0.008961886167526245, "KL/rejected_KL_mean": -0.003330230712890625, "KL/std": 0.24493196606636047, "epoch": 0.016152716593245228, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.024578213691711426, "fcm_dpo/q_t": 0.49689868092536926, "grad_norm": 344.31915283203125, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.4556809365749359, "logits/rejected": -0.43051183223724365, "logps/chosen": -52.995601654052734, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.78370666503906, "loss": 1.3821, "margin_dpo/margin_mean": 0.024578243494033813, "margin_dpo/margin_std": 0.3401423990726471, "step": 11 }, { "KL/chosen_KL_mean": -0.0713043212890625, "KL/mean": -0.07425594329833984, "KL/rejected_KL_mean": -0.07719802856445312, "KL/std": 0.2839137315750122, "epoch": 0.01762114537444934, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0059018731117248535, "fcm_dpo/q_t": 0.4991750717163086, "grad_norm": 445.4076843261719, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.5402117967605591, "logits/rejected": -0.5041322708129883, "logps/chosen": -61.876739501953125, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.8582763671875, "logps/rejected": -104.93547058105469, "loss": 1.395, "margin_dpo/margin_mean": 0.005901157855987549, "margin_dpo/margin_std": 0.43184518814086914, "step": 12 }, { "KL/chosen_KL_mean": 0.018360137939453125, "KL/mean": 0.013540104031562805, "KL/rejected_KL_mean": 0.00872039794921875, "KL/std": 0.2952546775341034, "epoch": 0.01908957415565345, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.009646743535995483, "fcm_dpo/q_t": 0.4988465905189514, "grad_norm": 399.35504150390625, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.49472951889038086, "logits/rejected": -0.46776068210601807, "logps/chosen": -64.24199676513672, "logps/ref_chosen": -64.2603530883789, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.19436645507812, "loss": 1.3929, "margin_dpo/margin_mean": 0.009646564722061157, "margin_dpo/margin_std": 0.4087739586830139, "step": 13 }, { "KL/chosen_KL_mean": -0.015897750854492188, "KL/mean": -0.016403615474700928, "KL/rejected_KL_mean": -0.016910552978515625, "KL/std": 0.29813089966773987, "epoch": 0.020558002936857563, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0010128617286682129, "fcm_dpo/q_t": 0.49986132979393005, "grad_norm": 423.43255615234375, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.4713672995567322, "logits/rejected": -0.4317617416381836, "logps/chosen": -58.12610626220703, "logps/ref_chosen": -58.11021041870117, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.06399536132812, "loss": 1.3972, "margin_dpo/margin_mean": 0.0010128915309906006, "margin_dpo/margin_std": 0.4278063476085663, "step": 14 }, { "KL/chosen_KL_mean": -0.029178619384765625, "KL/mean": -0.043839290738105774, "KL/rejected_KL_mean": -0.058498382568359375, "KL/std": 0.21881349384784698, "epoch": 0.022026431718061675, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.029320567846298218, "fcm_dpo/q_t": 0.4963420033454895, "grad_norm": 320.57830810546875, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.4776439368724823, "logits/rejected": -0.45821863412857056, "logps/chosen": -56.99608612060547, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.86714172363281, "loss": 1.3803, "margin_dpo/margin_mean": 0.029320329427719116, "margin_dpo/margin_std": 0.3670857548713684, "step": 15 }, { "KL/chosen_KL_mean": 0.03498077392578125, "KL/mean": 0.0047643184661865234, "KL/rejected_KL_mean": -0.025447845458984375, "KL/std": 0.22115886211395264, "epoch": 0.023494860499265784, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.06042605638504028, "fcm_dpo/q_t": 0.49249696731567383, "grad_norm": 413.9620056152344, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.5402108430862427, "logits/rejected": -0.5008047223091125, "logps/chosen": -61.70491027832031, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.3949203491211, "loss": 1.3621, "margin_dpo/margin_mean": 0.06042572855949402, "margin_dpo/margin_std": 0.28903117775917053, "step": 16 }, { "KL/chosen_KL_mean": 0.032032012939453125, "KL/mean": -0.021739423274993896, "KL/rejected_KL_mean": -0.07551193237304688, "KL/std": 0.28327155113220215, "epoch": 0.024963289280469897, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.10754328966140747, "fcm_dpo/q_t": 0.4866830110549927, "grad_norm": 378.045166015625, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.5082837343215942, "logits/rejected": -0.4719049036502838, "logps/chosen": -67.67829895019531, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.45416259765625, "loss": 1.3408, "margin_dpo/margin_mean": 0.10754308104515076, "margin_dpo/margin_std": 0.33711522817611694, "step": 17 }, { "KL/chosen_KL_mean": -0.004711151123046875, "KL/mean": -0.020610541105270386, "KL/rejected_KL_mean": -0.0365142822265625, "KL/std": 0.24805116653442383, "epoch": 0.02643171806167401, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.03179961442947388, "fcm_dpo/q_t": 0.49600082635879517, "grad_norm": 400.0282897949219, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.4709518253803253, "logits/rejected": -0.41293156147003174, "logps/chosen": -47.74420166015625, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.50880432128906, "loss": 1.3788, "margin_dpo/margin_mean": 0.0317995548248291, "margin_dpo/margin_std": 0.3525484800338745, "step": 18 }, { "KL/chosen_KL_mean": 0.040874481201171875, "KL/mean": -0.014696747064590454, "KL/rejected_KL_mean": -0.07026290893554688, "KL/std": 0.2666897773742676, "epoch": 0.027900146842878122, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11113607883453369, "fcm_dpo/q_t": 0.48626208305358887, "grad_norm": 357.18487548828125, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.485554575920105, "logits/rejected": -0.4352126121520996, "logps/chosen": -70.16448974609375, "logps/ref_chosen": -70.20536041259766, "logps/ref_rejected": -89.7575912475586, "logps/rejected": -89.82785034179688, "loss": 1.3391, "margin_dpo/margin_mean": 0.11113619804382324, "margin_dpo/margin_std": 0.3337096571922302, "step": 19 }, { "KL/chosen_KL_mean": -0.010358810424804688, "KL/mean": -0.048069894313812256, "KL/rejected_KL_mean": -0.08577346801757812, "KL/std": 0.24265292286872864, "epoch": 0.02936857562408223, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0754203200340271, "fcm_dpo/q_t": 0.4906235337257385, "grad_norm": 360.1549377441406, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5490742325782776, "logits/rejected": -0.4924872815608978, "logps/chosen": -50.8135986328125, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.82334899902344, "logps/rejected": -78.90911865234375, "loss": 1.3552, "margin_dpo/margin_mean": 0.07542020082473755, "margin_dpo/margin_std": 0.3174728751182556, "step": 20 }, { "KL/chosen_KL_mean": -0.0028514862060546875, "KL/mean": -0.034926123917102814, "KL/rejected_KL_mean": -0.0670013427734375, "KL/std": 0.2600030303001404, "epoch": 0.030837004405286344, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.06414835155010223, "fcm_dpo/q_t": 0.4920843839645386, "grad_norm": 378.02789306640625, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.5145970582962036, "logits/rejected": -0.4921773076057434, "logps/chosen": -50.06586837768555, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -77.935791015625, "loss": 1.3604, "margin_dpo/margin_mean": 0.06414888799190521, "margin_dpo/margin_std": 0.30178433656692505, "step": 21 }, { "KL/chosen_KL_mean": 0.03511619567871094, "KL/mean": -0.06825144588947296, "KL/rejected_KL_mean": -0.17161941528320312, "KL/std": 0.30668485164642334, "epoch": 0.032305433186490456, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.20674064755439758, "fcm_dpo/q_t": 0.4745423197746277, "grad_norm": 388.7802429199219, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.4853633642196655, "logits/rejected": -0.4417203366756439, "logps/chosen": -59.02252197265625, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.6762924194336, "loss": 1.2978, "margin_dpo/margin_mean": 0.206741064786911, "margin_dpo/margin_std": 0.43350815773010254, "step": 22 }, { "KL/chosen_KL_mean": 0.09302711486816406, "KL/mean": -0.042252302169799805, "KL/rejected_KL_mean": -0.17752456665039062, "KL/std": 0.31162387132644653, "epoch": 0.033773861967694566, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.27054840326309204, "fcm_dpo/q_t": 0.4666573405265808, "grad_norm": 364.82421875, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.5055208206176758, "logits/rejected": -0.4839291274547577, "logps/chosen": -59.98466873168945, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.13955688476562, "logps/rejected": -81.31707763671875, "loss": 1.2662, "margin_dpo/margin_mean": 0.2705477774143219, "margin_dpo/margin_std": 0.40643441677093506, "step": 23 }, { "KL/chosen_KL_mean": 0.04521942138671875, "KL/mean": -0.0763748288154602, "KL/rejected_KL_mean": -0.19797515869140625, "KL/std": 0.28322041034698486, "epoch": 0.03524229074889868, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.24319219589233398, "fcm_dpo/q_t": 0.4699401259422302, "grad_norm": 391.4231872558594, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.5012839436531067, "logits/rejected": -0.4848848581314087, "logps/chosen": -44.24581527709961, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.32318878173828, "loss": 1.2773, "margin_dpo/margin_mean": 0.24319320917129517, "margin_dpo/margin_std": 0.3720870018005371, "step": 24 }, { "KL/chosen_KL_mean": 0.05419921875, "KL/mean": -0.06559216976165771, "KL/rejected_KL_mean": -0.18538284301757812, "KL/std": 0.3871203064918518, "epoch": 0.03671071953010279, "fcm_dpo/beta": 0.5, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.2395843267440796, "fcm_dpo/q_t": 0.4704943895339966, "grad_norm": 349.39923095703125, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.5076802968978882, "logits/rejected": -0.47814005613327026, "logps/chosen": -52.482852935791016, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.52757263183594, "loss": 1.2852, "margin_dpo/margin_mean": 0.2395840287208557, "margin_dpo/margin_std": 0.49307340383529663, "step": 25 }, { "KL/chosen_KL_mean": 0.06473541259765625, "KL/mean": -0.1246185302734375, "KL/rejected_KL_mean": -0.31397247314453125, "KL/std": 0.4182741940021515, "epoch": 0.0381791483113069, "fcm_dpo/beta": 0.5046226978302002, "fcm_dpo/delta": 0.09160952270030975, "fcm_dpo/margin": 0.37870925664901733, "fcm_dpo/q_t": 0.4540257155895233, "grad_norm": 383.2384948730469, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5243556499481201, "logits/rejected": -0.4925326108932495, "logps/chosen": -53.858070373535156, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.67369079589844, "loss": 1.2223, "margin_dpo/margin_mean": 0.37870919704437256, "margin_dpo/margin_std": 0.5341597199440002, "step": 26 }, { "KL/chosen_KL_mean": 0.1428699493408203, "KL/mean": -0.11810372769832611, "KL/rejected_KL_mean": -0.3790779113769531, "KL/std": 0.44679516553878784, "epoch": 0.039647577092511016, "fcm_dpo/beta": 0.5186325311660767, "fcm_dpo/delta": 0.13306188583374023, "fcm_dpo/margin": 0.5219477415084839, "fcm_dpo/q_t": 0.435089111328125, "grad_norm": 404.0160217285156, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.5184708833694458, "logits/rejected": -0.4820369780063629, "logps/chosen": -42.75566101074219, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72419738769531, "logps/rejected": -99.10327911376953, "loss": 1.1512, "margin_dpo/margin_mean": 0.5219472646713257, "margin_dpo/margin_std": 0.5090110301971436, "step": 27 }, { "KL/chosen_KL_mean": 0.03517341613769531, "KL/mean": -0.1448913812637329, "KL/rejected_KL_mean": -0.32495880126953125, "KL/std": 0.41489964723587036, "epoch": 0.041116005873715125, "fcm_dpo/beta": 0.5273520350456238, "fcm_dpo/delta": 0.08291557431221008, "fcm_dpo/margin": 0.36012983322143555, "fcm_dpo/q_t": 0.45420730113983154, "grad_norm": 340.0865173339844, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.5174187421798706, "logits/rejected": -0.4631088972091675, "logps/chosen": -60.52132797241211, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.72607421875, "loss": 1.2296, "margin_dpo/margin_mean": 0.36013028025627136, "margin_dpo/margin_std": 0.5909340381622314, "step": 28 }, { "KL/chosen_KL_mean": 0.09375, "KL/mean": -0.18463768064975739, "KL/rejected_KL_mean": -0.4630241394042969, "KL/std": 0.4841146767139435, "epoch": 0.042584434654919234, "fcm_dpo/beta": 0.5401861667633057, "fcm_dpo/delta": 0.10225643217563629, "fcm_dpo/margin": 0.5567755699157715, "fcm_dpo/q_t": 0.42827117443084717, "grad_norm": 398.7171936035156, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5421187877655029, "logits/rejected": -0.49508053064346313, "logps/chosen": -57.71403503417969, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.85737609863281, "loss": 1.1301, "margin_dpo/margin_mean": 0.5567758679389954, "margin_dpo/margin_std": 0.5642556548118591, "step": 29 }, { "KL/chosen_KL_mean": 0.15998458862304688, "KL/mean": -0.19396524131298065, "KL/rejected_KL_mean": -0.5479164123535156, "KL/std": 0.6062048673629761, "epoch": 0.04405286343612335, "fcm_dpo/beta": 0.542646050453186, "fcm_dpo/delta": 0.01639886572957039, "fcm_dpo/margin": 0.7079055309295654, "fcm_dpo/q_t": 0.40854281187057495, "grad_norm": 362.9571533203125, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.482383131980896, "logits/rejected": -0.451177716255188, "logps/chosen": -52.417388916015625, "logps/ref_chosen": -52.577369689941406, "logps/ref_rejected": -98.48920440673828, "logps/rejected": -99.03712463378906, "loss": 1.0681, "margin_dpo/margin_mean": 0.7079058289527893, "margin_dpo/margin_std": 0.6428213119506836, "step": 30 }, { "KL/chosen_KL_mean": 0.10284805297851562, "KL/mean": -0.1433902531862259, "KL/rejected_KL_mean": -0.3896293640136719, "KL/std": 0.5450563430786133, "epoch": 0.04552129221732746, "fcm_dpo/beta": 0.5562627911567688, "fcm_dpo/delta": 0.12924844026565552, "fcm_dpo/margin": 0.4924760162830353, "fcm_dpo/q_t": 0.43538039922714233, "grad_norm": 309.9827575683594, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.48639383912086487, "logits/rejected": -0.4393838047981262, "logps/chosen": -63.704071044921875, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.28363037109375, "loss": 1.1624, "margin_dpo/margin_mean": 0.492476224899292, "margin_dpo/margin_std": 0.6477472186088562, "step": 31 }, { "KL/chosen_KL_mean": 0.1891956329345703, "KL/mean": -0.19269661605358124, "KL/rejected_KL_mean": -0.5745887756347656, "KL/std": 0.7516759634017944, "epoch": 0.04698972099853157, "fcm_dpo/beta": 0.5558298230171204, "fcm_dpo/delta": -0.025696825236082077, "fcm_dpo/margin": 0.7637799978256226, "fcm_dpo/q_t": 0.4026581645011902, "grad_norm": 343.0960998535156, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.5173541307449341, "logits/rejected": -0.4762566089630127, "logps/chosen": -62.550331115722656, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.89208984375, "loss": 1.0627, "margin_dpo/margin_mean": 0.7637800574302673, "margin_dpo/margin_std": 0.9043101668357849, "step": 32 }, { "KL/chosen_KL_mean": 0.09556961059570312, "KL/mean": -0.20322665572166443, "KL/rejected_KL_mean": -0.5020217895507812, "KL/std": 0.5590921640396118, "epoch": 0.048458149779735685, "fcm_dpo/beta": 0.562440037727356, "fcm_dpo/delta": 0.0659160241484642, "fcm_dpo/margin": 0.5975915789604187, "fcm_dpo/q_t": 0.4198671281337738, "grad_norm": 328.4997253417969, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.5058610439300537, "logits/rejected": -0.48015594482421875, "logps/chosen": -53.1654052734375, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.38716125488281, "loss": 1.1085, "margin_dpo/margin_mean": 0.5975916385650635, "margin_dpo/margin_std": 0.6252603530883789, "step": 33 }, { "KL/chosen_KL_mean": 0.09558486938476562, "KL/mean": -0.3123227655887604, "KL/rejected_KL_mean": -0.7202377319335938, "KL/std": 0.7326186895370483, "epoch": 0.049926578560939794, "fcm_dpo/beta": 0.5575153827667236, "fcm_dpo/delta": -0.057598959654569626, "fcm_dpo/margin": 0.8158173561096191, "fcm_dpo/q_t": 0.39417457580566406, "grad_norm": 312.8501892089844, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.4918326139450073, "logits/rejected": -0.47446727752685547, "logps/chosen": -50.72174072265625, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.64208221435547, "loss": 1.0336, "margin_dpo/margin_mean": 0.8158169984817505, "margin_dpo/margin_std": 0.8447773456573486, "step": 34 }, { "KL/chosen_KL_mean": 0.12476348876953125, "KL/mean": -0.46371960639953613, "KL/rejected_KL_mean": -1.0522003173828125, "KL/std": 0.9688708782196045, "epoch": 0.0513950073421439, "fcm_dpo/beta": 0.5353924036026001, "fcm_dpo/delta": -0.24643680453300476, "fcm_dpo/margin": 1.1769663095474243, "fcm_dpo/q_t": 0.35741329193115234, "grad_norm": 279.2007751464844, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.5165481567382812, "logits/rejected": -0.47960567474365234, "logps/chosen": -50.89972686767578, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.87663269042969, "loss": 0.9218, "margin_dpo/margin_mean": 1.1769659519195557, "margin_dpo/margin_std": 1.0974863767623901, "step": 35 }, { "KL/chosen_KL_mean": 0.055484771728515625, "KL/mean": -0.4918856918811798, "KL/rejected_KL_mean": -1.0392494201660156, "KL/std": 1.049076795578003, "epoch": 0.05286343612334802, "fcm_dpo/beta": 0.5205714702606201, "fcm_dpo/delta": -0.1801259070634842, "fcm_dpo/margin": 1.09473717212677, "fcm_dpo/q_t": 0.37265270948410034, "grad_norm": 232.2285919189453, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.5617629885673523, "logits/rejected": -0.5254461765289307, "logps/chosen": -51.936004638671875, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.0406265258789, "logps/rejected": -87.07987976074219, "loss": 0.9947, "margin_dpo/margin_mean": 1.094736099243164, "margin_dpo/margin_std": 1.23842453956604, "step": 36 }, { "KL/chosen_KL_mean": 0.0018596649169921875, "KL/mean": -0.5054476261138916, "KL/rejected_KL_mean": -1.01275634765625, "KL/std": 1.0474822521209717, "epoch": 0.05433186490455213, "fcm_dpo/beta": 0.49882519245147705, "fcm_dpo/delta": -0.11387760937213898, "fcm_dpo/margin": 1.0146119594573975, "fcm_dpo/q_t": 0.3891563415527344, "grad_norm": 219.2965850830078, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.4967535734176636, "logits/rejected": -0.4519627094268799, "logps/chosen": -62.80524826049805, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.90782928466797, "loss": 1.0432, "margin_dpo/margin_mean": 1.0146114826202393, "margin_dpo/margin_std": 1.3467109203338623, "step": 37 }, { "KL/chosen_KL_mean": 0.1434650421142578, "KL/mean": -0.5286348462104797, "KL/rejected_KL_mean": -1.2007369995117188, "KL/std": 1.3252570629119873, "epoch": 0.055800293685756244, "fcm_dpo/beta": 0.4817589521408081, "fcm_dpo/delta": -0.2642138600349426, "fcm_dpo/margin": 1.3442010879516602, "fcm_dpo/q_t": 0.3633432388305664, "grad_norm": 220.9096221923828, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5368313789367676, "logits/rejected": -0.5042594075202942, "logps/chosen": -48.24705505371094, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.11317443847656, "loss": 0.9599, "margin_dpo/margin_mean": 1.344200849533081, "margin_dpo/margin_std": 1.6261742115020752, "step": 38 }, { "KL/chosen_KL_mean": 0.07047653198242188, "KL/mean": -0.7256151437759399, "KL/rejected_KL_mean": -1.5217018127441406, "KL/std": 1.2704432010650635, "epoch": 0.05726872246696035, "fcm_dpo/beta": 0.44901108741760254, "fcm_dpo/delta": -0.3407745361328125, "fcm_dpo/margin": 1.592177152633667, "fcm_dpo/q_t": 0.3372858464717865, "grad_norm": 206.50027465820312, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5508787631988525, "logits/rejected": -0.5106680989265442, "logps/chosen": -50.679996490478516, "logps/ref_chosen": -50.75047302246094, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.09121704101562, "loss": 0.8796, "margin_dpo/margin_mean": 1.5921775102615356, "margin_dpo/margin_std": 1.383728265762329, "step": 39 }, { "KL/chosen_KL_mean": 0.18862533569335938, "KL/mean": -0.5789550542831421, "KL/rejected_KL_mean": -1.3465385437011719, "KL/std": 1.350581407546997, "epoch": 0.05873715124816446, "fcm_dpo/beta": 0.4270463287830353, "fcm_dpo/delta": -0.2727539539337158, "fcm_dpo/margin": 1.5351669788360596, "fcm_dpo/q_t": 0.3570956885814667, "grad_norm": 164.9803009033203, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.4705943763256073, "logits/rejected": -0.4372428059577942, "logps/chosen": -57.79644012451172, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.3000717163086, "logps/rejected": -75.6466064453125, "loss": 0.9362, "margin_dpo/margin_mean": 1.5351676940917969, "margin_dpo/margin_std": 1.6188992261886597, "step": 40 }, { "KL/chosen_KL_mean": 0.003345489501953125, "KL/mean": -0.9243937730789185, "KL/rejected_KL_mean": -1.8521308898925781, "KL/std": 1.8027801513671875, "epoch": 0.06020558002936858, "fcm_dpo/beta": 0.3977815508842468, "fcm_dpo/delta": -0.3646352291107178, "fcm_dpo/margin": 1.8554785251617432, "fcm_dpo/q_t": 0.3413015604019165, "grad_norm": 177.38894653320312, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.5352627038955688, "logits/rejected": -0.4983564019203186, "logps/chosen": -62.69247055053711, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.87565612792969, "loss": 0.9018, "margin_dpo/margin_mean": 1.8554792404174805, "margin_dpo/margin_std": 1.9582023620605469, "step": 41 }, { "KL/chosen_KL_mean": 0.2212810516357422, "KL/mean": -1.0126826763153076, "KL/rejected_KL_mean": -2.24664306640625, "KL/std": 2.0452983379364014, "epoch": 0.06167400881057269, "fcm_dpo/beta": 0.3601230978965759, "fcm_dpo/delta": -0.5366164445877075, "fcm_dpo/margin": 2.467926502227783, "fcm_dpo/q_t": 0.3121350407600403, "grad_norm": 159.704833984375, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.5344812870025635, "logits/rejected": -0.4876905083656311, "logps/chosen": -58.745147705078125, "logps/ref_chosen": -58.966426849365234, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.15501403808594, "loss": 0.8128, "margin_dpo/margin_mean": 2.467926502227783, "margin_dpo/margin_std": 2.2979540824890137, "step": 42 }, { "KL/chosen_KL_mean": 0.4799919128417969, "KL/mean": -0.6912780404090881, "KL/rejected_KL_mean": -1.862548828125, "KL/std": 1.731245756149292, "epoch": 0.0631424375917768, "fcm_dpo/beta": 0.32807010412216187, "fcm_dpo/delta": -0.4021656811237335, "fcm_dpo/margin": 2.342538356781006, "fcm_dpo/q_t": 0.325826495885849, "grad_norm": 152.13230895996094, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.5518888235092163, "logits/rejected": -0.5275447368621826, "logps/chosen": -53.676002502441406, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.34274291992188, "loss": 0.8349, "margin_dpo/margin_mean": 2.342538356781006, "margin_dpo/margin_std": 1.8504887819290161, "step": 43 }, { "KL/chosen_KL_mean": 0.25227928161621094, "KL/mean": -1.1076438426971436, "KL/rejected_KL_mean": -2.4675674438476562, "KL/std": 2.1824231147766113, "epoch": 0.06461086637298091, "fcm_dpo/beta": 0.3006964921951294, "fcm_dpo/delta": -0.4574472904205322, "fcm_dpo/margin": 2.719846248626709, "fcm_dpo/q_t": 0.31683629751205444, "grad_norm": 148.4615020751953, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.49776938557624817, "logits/rejected": -0.4771164655685425, "logps/chosen": -49.82621765136719, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.25132751464844, "loss": 0.8114, "margin_dpo/margin_mean": 2.719846248626709, "margin_dpo/margin_std": 2.149664878845215, "step": 44 }, { "KL/chosen_KL_mean": 0.12762832641601562, "KL/mean": -0.9766333103179932, "KL/rejected_KL_mean": -2.0808982849121094, "KL/std": 1.9980565309524536, "epoch": 0.06607929515418502, "fcm_dpo/beta": 0.2850699722766876, "fcm_dpo/delta": -0.2445305585861206, "fcm_dpo/margin": 2.2085297107696533, "fcm_dpo/q_t": 0.3639563322067261, "grad_norm": 119.1253890991211, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.47408968210220337, "logits/rejected": -0.46131014823913574, "logps/chosen": -48.28730010986328, "logps/ref_chosen": -48.4149284362793, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -80.01732635498047, "loss": 0.9552, "margin_dpo/margin_mean": 2.208528518676758, "margin_dpo/margin_std": 2.4762864112854004, "step": 45 }, { "KL/chosen_KL_mean": 0.2157917022705078, "KL/mean": -1.1610474586486816, "KL/rejected_KL_mean": -2.5378875732421875, "KL/std": 2.4480698108673096, "epoch": 0.06754772393538913, "fcm_dpo/beta": 0.26706546545028687, "fcm_dpo/delta": -0.361088365316391, "fcm_dpo/margin": 2.753678798675537, "fcm_dpo/q_t": 0.34314534068107605, "grad_norm": 128.59988403320312, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.5371255278587341, "logits/rejected": -0.486606240272522, "logps/chosen": -55.783634185791016, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.19047546386719, "loss": 0.9049, "margin_dpo/margin_mean": 2.753678798675537, "margin_dpo/margin_std": 2.9481449127197266, "step": 46 }, { "KL/chosen_KL_mean": 0.42566680908203125, "KL/mean": -0.9785783290863037, "KL/rejected_KL_mean": -2.3828277587890625, "KL/std": 2.440776824951172, "epoch": 0.06901615271659324, "fcm_dpo/beta": 0.2504443824291229, "fcm_dpo/delta": -0.3251284062862396, "fcm_dpo/margin": 2.808493137359619, "fcm_dpo/q_t": 0.34311166405677795, "grad_norm": 118.37008666992188, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.5844066143035889, "logits/rejected": -0.5324996709823608, "logps/chosen": -57.50041198730469, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.0620346069336, "loss": 0.8917, "margin_dpo/margin_mean": 2.8084943294525146, "margin_dpo/margin_std": 2.5350003242492676, "step": 47 }, { "KL/chosen_KL_mean": 0.11053085327148438, "KL/mean": -1.249524712562561, "KL/rejected_KL_mean": -2.609577178955078, "KL/std": 2.3069586753845215, "epoch": 0.07048458149779736, "fcm_dpo/beta": 0.23410022258758545, "fcm_dpo/delta": -0.2554876506328583, "fcm_dpo/margin": 2.720108985900879, "fcm_dpo/q_t": 0.35484230518341064, "grad_norm": 125.24164581298828, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.5820130109786987, "logits/rejected": -0.5238351225852966, "logps/chosen": -57.077545166015625, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -90.62617492675781, "loss": 0.9335, "margin_dpo/margin_mean": 2.720109224319458, "margin_dpo/margin_std": 2.4631295204162598, "step": 48 }, { "KL/chosen_KL_mean": 0.39922142028808594, "KL/mean": -1.2344255447387695, "KL/rejected_KL_mean": -2.8680648803710938, "KL/std": 2.883481025695801, "epoch": 0.07195301027900147, "fcm_dpo/beta": 0.2220032513141632, "fcm_dpo/delta": -0.34955620765686035, "fcm_dpo/margin": 3.2672815322875977, "fcm_dpo/q_t": 0.3432408273220062, "grad_norm": 352.78228759765625, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.5358976125717163, "logits/rejected": -0.47647011280059814, "logps/chosen": -61.286048889160156, "logps/ref_chosen": -61.685272216796875, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -86.63554382324219, "loss": 0.8999, "margin_dpo/margin_mean": 3.2672815322875977, "margin_dpo/margin_std": 3.3359837532043457, "step": 49 }, { "KL/chosen_KL_mean": -0.03353118896484375, "KL/mean": -1.8093570470809937, "KL/rejected_KL_mean": -3.585174560546875, "KL/std": 2.991940975189209, "epoch": 0.07342143906020558, "fcm_dpo/beta": 0.20502600073814392, "fcm_dpo/delta": -0.3554917573928833, "fcm_dpo/margin": 3.5516459941864014, "fcm_dpo/q_t": 0.33918917179107666, "grad_norm": 98.57312774658203, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5251749157905579, "logits/rejected": -0.4888863265514374, "logps/chosen": -58.757667541503906, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -99.94332122802734, "loss": 0.8875, "margin_dpo/margin_mean": 3.5516457557678223, "margin_dpo/margin_std": 3.4122915267944336, "step": 50 }, { "KL/chosen_KL_mean": -0.10815811157226562, "KL/mean": -1.8943548202514648, "KL/rejected_KL_mean": -3.6805572509765625, "KL/std": 3.606013774871826, "epoch": 0.07488986784140969, "fcm_dpo/beta": 0.19137313961982727, "fcm_dpo/delta": -0.30843037366867065, "fcm_dpo/margin": 3.572404384613037, "fcm_dpo/q_t": 0.35805124044418335, "grad_norm": 78.19511413574219, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5243328809738159, "logits/rejected": -0.491935670375824, "logps/chosen": -61.48182678222656, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -79.68255615234375, "loss": 0.9556, "margin_dpo/margin_mean": 3.5724036693573, "margin_dpo/margin_std": 4.397710800170898, "step": 51 }, { "KL/chosen_KL_mean": 0.5040225982666016, "KL/mean": -2.2015185356140137, "KL/rejected_KL_mean": -4.9070587158203125, "KL/std": 4.136686325073242, "epoch": 0.0763582966226138, "fcm_dpo/beta": 0.17365267872810364, "fcm_dpo/delta": -0.5967501401901245, "fcm_dpo/margin": 5.411087989807129, "fcm_dpo/q_t": 0.29616397619247437, "grad_norm": 78.32229614257812, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5358279943466187, "logits/rejected": -0.4792342185974121, "logps/chosen": -51.833335876464844, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -84.88097381591797, "loss": 0.7675, "margin_dpo/margin_mean": 5.411087989807129, "margin_dpo/margin_std": 4.405357360839844, "step": 52 }, { "KL/chosen_KL_mean": -0.03709983825683594, "KL/mean": -2.7792229652404785, "KL/rejected_KL_mean": -5.521343231201172, "KL/std": 4.680900573730469, "epoch": 0.07782672540381791, "fcm_dpo/beta": 0.1579442024230957, "fcm_dpo/delta": -0.5074787139892578, "fcm_dpo/margin": 5.484250068664551, "fcm_dpo/q_t": 0.32274216413497925, "grad_norm": 76.56503295898438, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.620309591293335, "logits/rejected": -0.5990212559700012, "logps/chosen": -53.35175323486328, "logps/ref_chosen": -53.31465148925781, "logps/ref_rejected": -91.78359985351562, "logps/rejected": -97.30493927001953, "loss": 0.8413, "margin_dpo/margin_mean": 5.484249114990234, "margin_dpo/margin_std": 5.2846550941467285, "step": 53 }, { "KL/chosen_KL_mean": -0.15765953063964844, "KL/mean": -2.5494110584259033, "KL/rejected_KL_mean": -4.941162109375, "KL/std": 4.4056077003479, "epoch": 0.07929515418502203, "fcm_dpo/beta": 0.14552612602710724, "fcm_dpo/delta": -0.3178091049194336, "fcm_dpo/margin": 4.783502578735352, "fcm_dpo/q_t": 0.34551307559013367, "grad_norm": 68.20849609375, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.5885263085365295, "logits/rejected": -0.5346698760986328, "logps/chosen": -50.84632110595703, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -96.65655517578125, "loss": 0.8929, "margin_dpo/margin_mean": 4.783502578735352, "margin_dpo/margin_std": 4.461567401885986, "step": 54 }, { "KL/chosen_KL_mean": -0.5955238342285156, "KL/mean": -3.465237617492676, "KL/rejected_KL_mean": -6.334949493408203, "KL/std": 5.3761420249938965, "epoch": 0.08076358296622614, "fcm_dpo/beta": 0.13501086831092834, "fcm_dpo/delta": -0.40534526109695435, "fcm_dpo/margin": 5.739419460296631, "fcm_dpo/q_t": 0.3375312089920044, "grad_norm": 66.71011352539062, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.653782844543457, "logits/rejected": -0.5924779176712036, "logps/chosen": -63.210758209228516, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -95.32845306396484, "loss": 0.9046, "margin_dpo/margin_mean": 5.739418983459473, "margin_dpo/margin_std": 6.410279273986816, "step": 55 }, { "KL/chosen_KL_mean": -0.39885711669921875, "KL/mean": -3.1298060417175293, "KL/rejected_KL_mean": -5.860759735107422, "KL/std": 5.217003345489502, "epoch": 0.08223201174743025, "fcm_dpo/beta": 0.12596547603607178, "fcm_dpo/delta": -0.30908891558647156, "fcm_dpo/margin": 5.461906433105469, "fcm_dpo/q_t": 0.3533214330673218, "grad_norm": 55.40151596069336, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.5743478536605835, "logits/rejected": -0.5299459099769592, "logps/chosen": -58.33158493041992, "logps/ref_chosen": -57.9327278137207, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -100.03520202636719, "loss": 0.9462, "margin_dpo/margin_mean": 5.461906433105469, "margin_dpo/margin_std": 6.3516526222229, "step": 56 }, { "KL/chosen_KL_mean": -0.4383068084716797, "KL/mean": -3.5001718997955322, "KL/rejected_KL_mean": -6.562034606933594, "KL/std": 5.148193359375, "epoch": 0.08370044052863436, "fcm_dpo/beta": 0.11817534267902374, "fcm_dpo/delta": -0.3478173613548279, "fcm_dpo/margin": 6.123730659484863, "fcm_dpo/q_t": 0.3378201723098755, "grad_norm": 61.338130950927734, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.5510739088058472, "logits/rejected": -0.5219501256942749, "logps/chosen": -70.93359375, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -102.12748718261719, "loss": 0.8847, "margin_dpo/margin_mean": 6.123730182647705, "margin_dpo/margin_std": 5.595479488372803, "step": 57 }, { "KL/chosen_KL_mean": -0.48359107971191406, "KL/mean": -3.9639272689819336, "KL/rejected_KL_mean": -7.444267272949219, "KL/std": 5.993289470672607, "epoch": 0.08516886930983847, "fcm_dpo/beta": 0.10953576862812042, "fcm_dpo/delta": -0.3910744786262512, "fcm_dpo/margin": 6.96067476272583, "fcm_dpo/q_t": 0.3375468850135803, "grad_norm": 61.08738708496094, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.5824143886566162, "logits/rejected": -0.5054690837860107, "logps/chosen": -62.61653137207031, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -92.06156158447266, "loss": 0.8919, "margin_dpo/margin_mean": 6.960675239562988, "margin_dpo/margin_std": 7.216721534729004, "step": 58 }, { "KL/chosen_KL_mean": -1.005767822265625, "KL/mean": -4.686802864074707, "KL/rejected_KL_mean": -8.367839813232422, "KL/std": 6.362232208251953, "epoch": 0.08663729809104258, "fcm_dpo/beta": 0.1001485139131546, "fcm_dpo/delta": -0.369930237531662, "fcm_dpo/margin": 7.362071514129639, "fcm_dpo/q_t": 0.339927077293396, "grad_norm": 56.65191650390625, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6022584438323975, "logits/rejected": -0.5596363544464111, "logps/chosen": -52.93829345703125, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -97.25303649902344, "loss": 0.8958, "margin_dpo/margin_mean": 7.362071514129639, "margin_dpo/margin_std": 7.449038505554199, "step": 59 }, { "KL/chosen_KL_mean": -1.9381217956542969, "KL/mean": -4.90670919418335, "KL/rejected_KL_mean": -7.875293731689453, "KL/std": 5.7876200675964355, "epoch": 0.0881057268722467, "fcm_dpo/beta": 0.09618770331144333, "fcm_dpo/delta": -0.18251214921474457, "fcm_dpo/margin": 5.937169075012207, "fcm_dpo/q_t": 0.36946025490760803, "grad_norm": 63.26620864868164, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.622028112411499, "logits/rejected": -0.5631489753723145, "logps/chosen": -62.88031005859375, "logps/ref_chosen": -60.94218826293945, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -93.26869201660156, "loss": 0.9865, "margin_dpo/margin_mean": 5.937168598175049, "margin_dpo/margin_std": 6.618038177490234, "step": 60 }, { "KL/chosen_KL_mean": -0.9614639282226562, "KL/mean": -4.590740203857422, "KL/rejected_KL_mean": -8.220016479492188, "KL/std": 7.815638542175293, "epoch": 0.08957415565345081, "fcm_dpo/beta": 0.09131693840026855, "fcm_dpo/delta": -0.28324007987976074, "fcm_dpo/margin": 7.258551597595215, "fcm_dpo/q_t": 0.3635759949684143, "grad_norm": 50.66496658325195, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.615394115447998, "logits/rejected": -0.5809042453765869, "logps/chosen": -61.59498596191406, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -98.072509765625, "loss": 0.9727, "margin_dpo/margin_mean": 7.258552074432373, "margin_dpo/margin_std": 9.66031265258789, "step": 61 }, { "KL/chosen_KL_mean": -1.1735248565673828, "KL/mean": -3.935976266860962, "KL/rejected_KL_mean": -6.69842529296875, "KL/std": 5.688698768615723, "epoch": 0.09104258443465492, "fcm_dpo/beta": 0.08945208787918091, "fcm_dpo/delta": -0.09905220568180084, "fcm_dpo/margin": 5.52489709854126, "fcm_dpo/q_t": 0.389636754989624, "grad_norm": 49.19923782348633, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.5917966365814209, "logits/rejected": -0.5570877194404602, "logps/chosen": -57.32429504394531, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -82.26461791992188, "loss": 1.0378, "margin_dpo/margin_mean": 5.524896621704102, "margin_dpo/margin_std": 6.8345046043396, "step": 62 }, { "KL/chosen_KL_mean": -1.9826297760009766, "KL/mean": -5.725447654724121, "KL/rejected_KL_mean": -9.468265533447266, "KL/std": 7.366238594055176, "epoch": 0.09251101321585903, "fcm_dpo/beta": 0.08485674113035202, "fcm_dpo/delta": -0.2533873915672302, "fcm_dpo/margin": 7.485637664794922, "fcm_dpo/q_t": 0.3566383123397827, "grad_norm": 52.68979263305664, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.5868048667907715, "logits/rejected": -0.5401818752288818, "logps/chosen": -75.13002014160156, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -107.07832336425781, "loss": 0.9454, "margin_dpo/margin_mean": 7.485637664794922, "margin_dpo/margin_std": 7.758219242095947, "step": 63 }, { "KL/chosen_KL_mean": -0.6014499664306641, "KL/mean": -4.980618953704834, "KL/rejected_KL_mean": -9.359790802001953, "KL/std": 7.880523204803467, "epoch": 0.09397944199706314, "fcm_dpo/beta": 0.0794319286942482, "fcm_dpo/delta": -0.3237980306148529, "fcm_dpo/margin": 8.758337020874023, "fcm_dpo/q_t": 0.34900087118148804, "grad_norm": 47.52156066894531, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.5694026947021484, "logits/rejected": -0.5367158651351929, "logps/chosen": -54.60005187988281, "logps/ref_chosen": -53.998600006103516, "logps/ref_rejected": -93.53019714355469, "logps/rejected": -102.8899917602539, "loss": 0.9296, "margin_dpo/margin_mean": 8.758337020874023, "margin_dpo/margin_std": 9.623977661132812, "step": 64 }, { "KL/chosen_KL_mean": -2.417318344116211, "KL/mean": -6.842019081115723, "KL/rejected_KL_mean": -11.2667236328125, "KL/std": 8.855112075805664, "epoch": 0.09544787077826726, "fcm_dpo/beta": 0.07528192549943924, "fcm_dpo/delta": -0.2872685492038727, "fcm_dpo/margin": 8.849407196044922, "fcm_dpo/q_t": 0.3515852093696594, "grad_norm": 46.06278991699219, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6717353463172913, "logits/rejected": -0.6576972007751465, "logps/chosen": -67.2533187866211, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -121.21318054199219, "loss": 0.9428, "margin_dpo/margin_mean": 8.849407196044922, "margin_dpo/margin_std": 9.624493598937988, "step": 65 }, { "KL/chosen_KL_mean": -2.239471435546875, "KL/mean": -6.359401702880859, "KL/rejected_KL_mean": -10.479331970214844, "KL/std": 8.035075187683105, "epoch": 0.09691629955947137, "fcm_dpo/beta": 0.07178394496440887, "fcm_dpo/delta": -0.2061339020729065, "fcm_dpo/margin": 8.239856719970703, "fcm_dpo/q_t": 0.3704487979412079, "grad_norm": 40.174861907958984, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6172059774398804, "logits/rejected": -0.5826204419136047, "logps/chosen": -53.68299865722656, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629913330078, "logps/rejected": -86.11563110351562, "loss": 0.9802, "margin_dpo/margin_mean": 8.239856719970703, "margin_dpo/margin_std": 9.627714157104492, "step": 66 }, { "KL/chosen_KL_mean": -1.9857635498046875, "KL/mean": -6.205845832824707, "KL/rejected_KL_mean": -10.425933837890625, "KL/std": 8.025278091430664, "epoch": 0.09838472834067548, "fcm_dpo/beta": 0.06964662671089172, "fcm_dpo/delta": -0.19918228685855865, "fcm_dpo/margin": 8.440168380737305, "fcm_dpo/q_t": 0.3705397844314575, "grad_norm": 41.403099060058594, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.5965728759765625, "logits/rejected": -0.5546629428863525, "logps/chosen": -61.326568603515625, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78728485107422, "logps/rejected": -83.21321868896484, "loss": 0.9789, "margin_dpo/margin_mean": 8.440168380737305, "margin_dpo/margin_std": 9.680936813354492, "step": 67 }, { "KL/chosen_KL_mean": -2.2271480560302734, "KL/mean": -6.318003177642822, "KL/rejected_KL_mean": -10.408863067626953, "KL/std": 7.332122325897217, "epoch": 0.09985315712187959, "fcm_dpo/beta": 0.06731708347797394, "fcm_dpo/delta": -0.15928582847118378, "fcm_dpo/margin": 8.181710243225098, "fcm_dpo/q_t": 0.3742911219596863, "grad_norm": 39.689701080322266, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6384579539299011, "logits/rejected": -0.5809626579284668, "logps/chosen": -67.4329833984375, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -87.6161117553711, "loss": 0.9771, "margin_dpo/margin_mean": 8.181710243225098, "margin_dpo/margin_std": 8.135688781738281, "step": 68 }, { "KL/chosen_KL_mean": -2.7193145751953125, "KL/mean": -7.553807258605957, "KL/rejected_KL_mean": -12.388301849365234, "KL/std": 8.551678657531738, "epoch": 0.1013215859030837, "fcm_dpo/beta": 0.06404094398021698, "fcm_dpo/delta": -0.23506096005439758, "fcm_dpo/margin": 9.668986320495605, "fcm_dpo/q_t": 0.360501229763031, "grad_norm": 42.04729080200195, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.6100502014160156, "logits/rejected": -0.5856792330741882, "logps/chosen": -62.53855514526367, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -115.77716064453125, "loss": 0.937, "margin_dpo/margin_mean": 9.668987274169922, "margin_dpo/margin_std": 9.378090858459473, "step": 69 }, { "KL/chosen_KL_mean": -3.914093017578125, "KL/mean": -9.15247917175293, "KL/rejected_KL_mean": -14.390865325927734, "KL/std": 10.303367614746094, "epoch": 0.1027900146842878, "fcm_dpo/beta": 0.061325304210186005, "fcm_dpo/delta": -0.25857973098754883, "fcm_dpo/margin": 10.476768493652344, "fcm_dpo/q_t": 0.3579747676849365, "grad_norm": 42.36003875732422, "learning_rate": 5e-07, "logits/chosen": -0.6354060173034668, "logits/rejected": -0.6008400917053223, "logps/chosen": -65.84473419189453, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.06078338623047, "logps/rejected": -105.45164489746094, "loss": 0.9405, "margin_dpo/margin_mean": 10.476768493652344, "margin_dpo/margin_std": 10.90849494934082, "step": 70 }, { "KL/chosen_KL_mean": -3.4942684173583984, "KL/mean": -9.552824020385742, "KL/rejected_KL_mean": -15.611381530761719, "KL/std": 10.604869842529297, "epoch": 0.10425844346549193, "fcm_dpo/beta": 0.057439714670181274, "fcm_dpo/delta": -0.3188778758049011, "fcm_dpo/margin": 12.117112159729004, "fcm_dpo/q_t": 0.34699490666389465, "grad_norm": 38.4710807800293, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6384104490280151, "logits/rejected": -0.5962811708450317, "logps/chosen": -65.2446060180664, "logps/ref_chosen": -61.750335693359375, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -112.947998046875, "loss": 0.9015, "margin_dpo/margin_mean": 12.117112159729004, "margin_dpo/margin_std": 11.577495574951172, "step": 71 }, { "KL/chosen_KL_mean": -4.108892440795898, "KL/mean": -10.48940658569336, "KL/rejected_KL_mean": -16.869918823242188, "KL/std": 11.388107299804688, "epoch": 0.10572687224669604, "fcm_dpo/beta": 0.05391976609826088, "fcm_dpo/delta": -0.31035923957824707, "fcm_dpo/margin": 12.761024475097656, "fcm_dpo/q_t": 0.3477667570114136, "grad_norm": 38.119319915771484, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6678510904312134, "logits/rejected": -0.6335985660552979, "logps/chosen": -70.16230773925781, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -112.15690612792969, "loss": 0.9156, "margin_dpo/margin_mean": 12.761024475097656, "margin_dpo/margin_std": 13.06039810180664, "step": 72 }, { "KL/chosen_KL_mean": -5.835565567016602, "KL/mean": -12.30703067779541, "KL/rejected_KL_mean": -18.77849578857422, "KL/std": 14.999626159667969, "epoch": 0.10719530102790015, "fcm_dpo/beta": 0.05082736164331436, "fcm_dpo/delta": -0.27768224477767944, "fcm_dpo/margin": 12.9429292678833, "fcm_dpo/q_t": 0.36679285764694214, "grad_norm": 37.15599822998047, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6618713736534119, "logits/rejected": -0.6186869144439697, "logps/chosen": -72.09184265136719, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613098144531, "logps/rejected": -109.23463439941406, "loss": 1.0098, "margin_dpo/margin_mean": 12.942928314208984, "margin_dpo/margin_std": 18.527137756347656, "step": 73 }, { "KL/chosen_KL_mean": -6.543451309204102, "KL/mean": -13.558289527893066, "KL/rejected_KL_mean": -20.573123931884766, "KL/std": 16.691537857055664, "epoch": 0.10866372980910426, "fcm_dpo/beta": 0.048035770654678345, "fcm_dpo/delta": -0.2947743535041809, "fcm_dpo/margin": 14.029674530029297, "fcm_dpo/q_t": 0.3622450828552246, "grad_norm": 38.12542724609375, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6320587396621704, "logits/rejected": -0.6205792427062988, "logps/chosen": -59.96833419799805, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -116.52006530761719, "loss": 0.9874, "margin_dpo/margin_mean": 14.029674530029297, "margin_dpo/margin_std": 20.53775405883789, "step": 74 }, { "KL/chosen_KL_mean": -6.452543258666992, "KL/mean": -16.478172302246094, "KL/rejected_KL_mean": -26.503795623779297, "KL/std": 17.552452087402344, "epoch": 0.11013215859030837, "fcm_dpo/beta": 0.04416520893573761, "fcm_dpo/delta": -0.5302780866622925, "fcm_dpo/margin": 20.05126190185547, "fcm_dpo/q_t": 0.31639227271080017, "grad_norm": 34.176265716552734, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6823216676712036, "logits/rejected": -0.6725906133651733, "logps/chosen": -58.314205169677734, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25398254394531, "logps/rejected": -137.75778198242188, "loss": 0.833, "margin_dpo/margin_mean": 20.05126190185547, "margin_dpo/margin_std": 19.639450073242188, "step": 75 }, { "KL/chosen_KL_mean": -8.126792907714844, "KL/mean": -14.718840599060059, "KL/rejected_KL_mean": -21.31088638305664, "KL/std": 14.556184768676758, "epoch": 0.11160058737151249, "fcm_dpo/beta": 0.04169227182865143, "fcm_dpo/delta": -0.15833759307861328, "fcm_dpo/margin": 13.184097290039062, "fcm_dpo/q_t": 0.3749655485153198, "grad_norm": 33.384490966796875, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.692663848400116, "logits/rejected": -0.6493555307388306, "logps/chosen": -61.39282989501953, "logps/ref_chosen": -53.26603698730469, "logps/ref_rejected": -78.21662902832031, "logps/rejected": -99.52751159667969, "loss": 1.0048, "margin_dpo/margin_mean": 13.184097290039062, "margin_dpo/margin_std": 15.775751113891602, "step": 76 }, { "KL/chosen_KL_mean": -7.239437103271484, "KL/mean": -17.858009338378906, "KL/rejected_KL_mean": -28.476581573486328, "KL/std": 19.998626708984375, "epoch": 0.1130690161527166, "fcm_dpo/beta": 0.03837820887565613, "fcm_dpo/delta": -0.4561229944229126, "fcm_dpo/margin": 21.237140655517578, "fcm_dpo/q_t": 0.3239055275917053, "grad_norm": 34.94086456298828, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.6894793510437012, "logits/rejected": -0.6502680778503418, "logps/chosen": -65.33612060546875, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -122.25019836425781, "loss": 0.8744, "margin_dpo/margin_mean": 21.237140655517578, "margin_dpo/margin_std": 21.865249633789062, "step": 77 }, { "KL/chosen_KL_mean": -7.513151168823242, "KL/mean": -15.731389999389648, "KL/rejected_KL_mean": -23.949626922607422, "KL/std": 16.417797088623047, "epoch": 0.1145374449339207, "fcm_dpo/beta": 0.03666268289089203, "fcm_dpo/delta": -0.21514025330543518, "fcm_dpo/margin": 16.436477661132812, "fcm_dpo/q_t": 0.3673900067806244, "grad_norm": 31.626035690307617, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6481041312217712, "logits/rejected": -0.6217755079269409, "logps/chosen": -63.12693786621094, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -108.88398742675781, "loss": 0.9745, "margin_dpo/margin_mean": 16.436477661132812, "margin_dpo/margin_std": 18.627796173095703, "step": 78 }, { "KL/chosen_KL_mean": -7.894931793212891, "KL/mean": -16.924148559570312, "KL/rejected_KL_mean": -25.95336151123047, "KL/std": 18.74026870727539, "epoch": 0.11600587371512482, "fcm_dpo/beta": 0.03483927622437477, "fcm_dpo/delta": -0.24468708038330078, "fcm_dpo/margin": 18.058425903320312, "fcm_dpo/q_t": 0.36786949634552, "grad_norm": 27.80043601989746, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.6485146284103394, "logits/rejected": -0.6170614957809448, "logps/chosen": -63.345420837402344, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -113.60092163085938, "loss": 0.9866, "margin_dpo/margin_mean": 18.05842399597168, "margin_dpo/margin_std": 22.871458053588867, "step": 79 }, { "KL/chosen_KL_mean": -10.773929595947266, "KL/mean": -19.173215866088867, "KL/rejected_KL_mean": -27.572498321533203, "KL/std": 19.74143409729004, "epoch": 0.11747430249632893, "fcm_dpo/beta": 0.03327310085296631, "fcm_dpo/delta": -0.17127852141857147, "fcm_dpo/margin": 16.79857063293457, "fcm_dpo/q_t": 0.38316744565963745, "grad_norm": 29.989072799682617, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.7313661575317383, "logits/rejected": -0.688393235206604, "logps/chosen": -69.29322052001953, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -115.1199951171875, "loss": 1.0327, "margin_dpo/margin_mean": 16.79857063293457, "margin_dpo/margin_std": 22.803003311157227, "step": 80 }, { "KL/chosen_KL_mean": -9.863761901855469, "KL/mean": -21.608978271484375, "KL/rejected_KL_mean": -33.35419464111328, "KL/std": 23.825132369995117, "epoch": 0.11894273127753303, "fcm_dpo/beta": 0.03129996731877327, "fcm_dpo/delta": -0.36527884006500244, "fcm_dpo/margin": 23.49043083190918, "fcm_dpo/q_t": 0.34688568115234375, "grad_norm": 30.72429847717285, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.7000492811203003, "logits/rejected": -0.6688964366912842, "logps/chosen": -76.31262969970703, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -163.01690673828125, "loss": 0.9305, "margin_dpo/margin_mean": 23.490432739257812, "margin_dpo/margin_std": 28.210830688476562, "step": 81 }, { "KL/chosen_KL_mean": -12.267969131469727, "KL/mean": -21.549884796142578, "KL/rejected_KL_mean": -30.831790924072266, "KL/std": 21.572769165039062, "epoch": 0.12041116005873716, "fcm_dpo/beta": 0.030443139374256134, "fcm_dpo/delta": -0.17542892694473267, "fcm_dpo/margin": 18.563825607299805, "fcm_dpo/q_t": 0.3823755085468292, "grad_norm": 33.41337203979492, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6645326614379883, "logits/rejected": -0.6522207260131836, "logps/chosen": -64.50035095214844, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -121.57504272460938, "loss": 1.074, "margin_dpo/margin_mean": 18.563827514648438, "margin_dpo/margin_std": 29.59918975830078, "step": 82 }, { "KL/chosen_KL_mean": -12.634248733520508, "KL/mean": -23.630840301513672, "KL/rejected_KL_mean": -34.627437591552734, "KL/std": 22.750579833984375, "epoch": 0.12187958883994127, "fcm_dpo/beta": 0.028682120144367218, "fcm_dpo/delta": -0.24740472435951233, "fcm_dpo/margin": 21.993192672729492, "fcm_dpo/q_t": 0.36408424377441406, "grad_norm": 28.262123107910156, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6717352867126465, "logits/rejected": -0.6345555782318115, "logps/chosen": -68.46163940429688, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71589660644531, "logps/rejected": -138.3433380126953, "loss": 0.9787, "margin_dpo/margin_mean": 21.993192672729492, "margin_dpo/margin_std": 26.84136390686035, "step": 83 }, { "KL/chosen_KL_mean": -11.635591506958008, "KL/mean": -21.24039649963379, "KL/rejected_KL_mean": -30.845199584960938, "KL/std": 19.57217788696289, "epoch": 0.12334801762114538, "fcm_dpo/beta": 0.027607331052422523, "fcm_dpo/delta": -0.14001153409481049, "fcm_dpo/margin": 19.20960235595703, "fcm_dpo/q_t": 0.37740713357925415, "grad_norm": 27.846446990966797, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6538140177726746, "logits/rejected": -0.6141324639320374, "logps/chosen": -78.81175994873047, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -118.143798828125, "loss": 0.9978, "margin_dpo/margin_mean": 19.20960235595703, "margin_dpo/margin_std": 19.999858856201172, "step": 84 }, { "KL/chosen_KL_mean": -12.274053573608398, "KL/mean": -22.126728057861328, "KL/rejected_KL_mean": -31.97940444946289, "KL/std": 20.159584045410156, "epoch": 0.12481644640234948, "fcm_dpo/beta": 0.027254024520516396, "fcm_dpo/delta": -0.1449553519487381, "fcm_dpo/margin": 19.705352783203125, "fcm_dpo/q_t": 0.38130825757980347, "grad_norm": 27.484092712402344, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6424893140792847, "logits/rejected": -0.6098573207855225, "logps/chosen": -70.68067169189453, "logps/ref_chosen": -58.4066162109375, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -110.61820983886719, "loss": 1.0228, "margin_dpo/margin_mean": 19.705352783203125, "margin_dpo/margin_std": 24.122146606445312, "step": 85 }, { "KL/chosen_KL_mean": -16.05596351623535, "KL/mean": -26.548908233642578, "KL/rejected_KL_mean": -37.041839599609375, "KL/std": 27.3221492767334, "epoch": 0.1262848751835536, "fcm_dpo/beta": 0.026313815265893936, "fcm_dpo/delta": -0.16080215573310852, "fcm_dpo/margin": 20.985877990722656, "fcm_dpo/q_t": 0.3926513195037842, "grad_norm": 31.304685592651367, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6901407241821289, "logits/rejected": -0.6772359609603882, "logps/chosen": -72.19342803955078, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -125.16349029541016, "loss": 1.1027, "margin_dpo/margin_mean": 20.985881805419922, "margin_dpo/margin_std": 37.65880584716797, "step": 86 }, { "KL/chosen_KL_mean": -15.858743667602539, "KL/mean": -28.08181381225586, "KL/rejected_KL_mean": -40.30488204956055, "KL/std": 27.242843627929688, "epoch": 0.1277533039647577, "fcm_dpo/beta": 0.024988306686282158, "fcm_dpo/delta": -0.22651353478431702, "fcm_dpo/margin": 24.446142196655273, "fcm_dpo/q_t": 0.3679496645927429, "grad_norm": 26.263225555419922, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.7118654847145081, "logits/rejected": -0.6854358911514282, "logps/chosen": -71.49484252929688, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -135.7724609375, "loss": 1.0137, "margin_dpo/margin_mean": 24.446144104003906, "margin_dpo/margin_std": 32.91810607910156, "step": 87 }, { "KL/chosen_KL_mean": -20.068950653076172, "KL/mean": -29.328380584716797, "KL/rejected_KL_mean": -38.58780288696289, "KL/std": 28.398073196411133, "epoch": 0.12922173274596183, "fcm_dpo/beta": 0.0244886577129364, "fcm_dpo/delta": -0.057016439735889435, "fcm_dpo/margin": 18.518863677978516, "fcm_dpo/q_t": 0.40317296981811523, "grad_norm": 27.875059127807617, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.699777364730835, "logits/rejected": -0.6927889585494995, "logps/chosen": -93.74009704589844, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -145.29629516601562, "loss": 1.1165, "margin_dpo/margin_mean": 18.518863677978516, "margin_dpo/margin_std": 29.796024322509766, "step": 88 }, { "KL/chosen_KL_mean": -12.274129867553711, "KL/mean": -22.63742446899414, "KL/rejected_KL_mean": -33.0007209777832, "KL/std": 24.061918258666992, "epoch": 0.13069016152716592, "fcm_dpo/beta": 0.024275628849864006, "fcm_dpo/delta": -0.10855366289615631, "fcm_dpo/margin": 20.72658920288086, "fcm_dpo/q_t": 0.3883446455001831, "grad_norm": 25.647180557250977, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.7325412631034851, "logits/rejected": -0.7000647783279419, "logps/chosen": -72.8990478515625, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -115.08427429199219, "loss": 1.0399, "margin_dpo/margin_mean": 20.72658920288086, "margin_dpo/margin_std": 27.33102798461914, "step": 89 }, { "KL/chosen_KL_mean": -15.752443313598633, "KL/mean": -29.865989685058594, "KL/rejected_KL_mean": -43.97953414916992, "KL/std": 32.87799835205078, "epoch": 0.13215859030837004, "fcm_dpo/beta": 0.02329513430595398, "fcm_dpo/delta": -0.2749367952346802, "fcm_dpo/margin": 28.22708511352539, "fcm_dpo/q_t": 0.3700242340564728, "grad_norm": 27.188581466674805, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6703182458877563, "logits/rejected": -0.681124210357666, "logps/chosen": -69.03775024414062, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -155.5242462158203, "loss": 1.0186, "margin_dpo/margin_mean": 28.22708511352539, "margin_dpo/margin_std": 40.7965087890625, "step": 90 }, { "KL/chosen_KL_mean": -16.609392166137695, "KL/mean": -28.263938903808594, "KL/rejected_KL_mean": -39.91849899291992, "KL/std": 26.464740753173828, "epoch": 0.13362701908957417, "fcm_dpo/beta": 0.022392991930246353, "fcm_dpo/delta": -0.12847986817359924, "fcm_dpo/margin": 23.309101104736328, "fcm_dpo/q_t": 0.3886939287185669, "grad_norm": 25.46695899963379, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.6979262828826904, "logits/rejected": -0.6650443077087402, "logps/chosen": -78.412353515625, "logps/ref_chosen": -61.802955627441406, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -127.79244995117188, "loss": 1.0595, "margin_dpo/margin_mean": 23.309099197387695, "margin_dpo/margin_std": 34.23745346069336, "step": 91 }, { "KL/chosen_KL_mean": -15.063024520874023, "KL/mean": -26.865272521972656, "KL/rejected_KL_mean": -38.66752624511719, "KL/std": 27.186147689819336, "epoch": 0.13509544787077826, "fcm_dpo/beta": 0.021884029731154442, "fcm_dpo/delta": -0.12284786254167557, "fcm_dpo/margin": 23.604501724243164, "fcm_dpo/q_t": 0.38834255933761597, "grad_norm": 23.927978515625, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.7328395247459412, "logits/rejected": -0.7109937071800232, "logps/chosen": -66.70379638671875, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -116.5487060546875, "loss": 1.0512, "margin_dpo/margin_mean": 23.60449981689453, "margin_dpo/margin_std": 32.96575164794922, "step": 92 }, { "KL/chosen_KL_mean": -16.824951171875, "KL/mean": -28.743083953857422, "KL/rejected_KL_mean": -40.661216735839844, "KL/std": 25.748863220214844, "epoch": 0.13656387665198239, "fcm_dpo/beta": 0.021284889429807663, "fcm_dpo/delta": -0.11301136016845703, "fcm_dpo/margin": 23.836261749267578, "fcm_dpo/q_t": 0.3872656226158142, "grad_norm": 24.41376495361328, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.7215616703033447, "logits/rejected": -0.6693962812423706, "logps/chosen": -69.35418701171875, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.16075134277344, "logps/rejected": -117.82196044921875, "loss": 1.0264, "margin_dpo/margin_mean": 23.836261749267578, "margin_dpo/margin_std": 29.672298431396484, "step": 93 }, { "KL/chosen_KL_mean": -18.283151626586914, "KL/mean": -31.948049545288086, "KL/rejected_KL_mean": -45.61294937133789, "KL/std": 29.418785095214844, "epoch": 0.13803230543318648, "fcm_dpo/beta": 0.02049822360277176, "fcm_dpo/delta": -0.17131651937961578, "fcm_dpo/margin": 27.329792022705078, "fcm_dpo/q_t": 0.37480291724205017, "grad_norm": 24.020462036132812, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6669220924377441, "logits/rejected": -0.6510493755340576, "logps/chosen": -79.50576782226562, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -145.21197509765625, "loss": 0.9924, "margin_dpo/margin_mean": 27.329792022705078, "margin_dpo/margin_std": 30.807952880859375, "step": 94 }, { "KL/chosen_KL_mean": -17.35989761352539, "KL/mean": -28.47588539123535, "KL/rejected_KL_mean": -39.591880798339844, "KL/std": 28.130035400390625, "epoch": 0.1395007342143906, "fcm_dpo/beta": 0.02020413801074028, "fcm_dpo/delta": -0.051920242607593536, "fcm_dpo/margin": 22.231979370117188, "fcm_dpo/q_t": 0.3971063494682312, "grad_norm": 22.573118209838867, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.6930748224258423, "logits/rejected": -0.6638180017471313, "logps/chosen": -69.883544921875, "logps/ref_chosen": -52.523643493652344, "logps/ref_rejected": -75.8803482055664, "logps/rejected": -115.47222900390625, "loss": 1.0693, "margin_dpo/margin_mean": 22.231979370117188, "margin_dpo/margin_std": 29.545318603515625, "step": 95 }, { "KL/chosen_KL_mean": -16.8060245513916, "KL/mean": -31.907100677490234, "KL/rejected_KL_mean": -47.0081787109375, "KL/std": 32.275535583496094, "epoch": 0.14096916299559473, "fcm_dpo/beta": 0.01948397234082222, "fcm_dpo/delta": -0.20268620550632477, "fcm_dpo/margin": 30.20215606689453, "fcm_dpo/q_t": 0.3715973496437073, "grad_norm": 23.293750762939453, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.7053156495094299, "logits/rejected": -0.6753140091896057, "logps/chosen": -78.96299743652344, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -143.60418701171875, "loss": 0.9826, "margin_dpo/margin_mean": 30.20215606689453, "margin_dpo/margin_std": 35.575836181640625, "step": 96 }, { "KL/chosen_KL_mean": -18.12676429748535, "KL/mean": -29.81930923461914, "KL/rejected_KL_mean": -41.51185989379883, "KL/std": 26.801036834716797, "epoch": 0.14243759177679882, "fcm_dpo/beta": 0.019416380673646927, "fcm_dpo/delta": -0.057599060237407684, "fcm_dpo/margin": 23.38509750366211, "fcm_dpo/q_t": 0.3948373794555664, "grad_norm": 23.826488494873047, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6331626176834106, "logits/rejected": -0.5884179472923279, "logps/chosen": -72.77313232421875, "logps/ref_chosen": -54.646366119384766, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -118.47660827636719, "loss": 1.0527, "margin_dpo/margin_mean": 23.38509750366211, "margin_dpo/margin_std": 26.815166473388672, "step": 97 }, { "KL/chosen_KL_mean": -22.460552215576172, "KL/mean": -35.82654571533203, "KL/rejected_KL_mean": -49.192535400390625, "KL/std": 30.553295135498047, "epoch": 0.14390602055800295, "fcm_dpo/beta": 0.018851084634661674, "fcm_dpo/delta": -0.10970290005207062, "fcm_dpo/margin": 26.731998443603516, "fcm_dpo/q_t": 0.38528013229370117, "grad_norm": 24.53318214416504, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6960965394973755, "logits/rejected": -0.6801573038101196, "logps/chosen": -87.71917724609375, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -145.72003173828125, "loss": 1.0547, "margin_dpo/margin_mean": 26.731998443603516, "margin_dpo/margin_std": 36.594322204589844, "step": 98 }, { "KL/chosen_KL_mean": -17.805437088012695, "KL/mean": -34.056640625, "KL/rejected_KL_mean": -50.307838439941406, "KL/std": 33.01419448852539, "epoch": 0.14537444933920704, "fcm_dpo/beta": 0.018170353025197983, "fcm_dpo/delta": -0.20326459407806396, "fcm_dpo/margin": 32.502403259277344, "fcm_dpo/q_t": 0.3738780617713928, "grad_norm": 21.70009994506836, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.6636701822280884, "logits/rejected": -0.6640149354934692, "logps/chosen": -63.44392395019531, "logps/ref_chosen": -45.638484954833984, "logps/ref_rejected": -86.43793487548828, "logps/rejected": -136.7457733154297, "loss": 0.993, "margin_dpo/margin_mean": 32.502403259277344, "margin_dpo/margin_std": 41.165550231933594, "step": 99 }, { "KL/chosen_KL_mean": -19.931947708129883, "KL/mean": -29.551782608032227, "KL/rejected_KL_mean": -39.17161178588867, "KL/std": 26.748775482177734, "epoch": 0.14684287812041116, "fcm_dpo/beta": 0.018238741904497147, "fcm_dpo/delta": 0.05035646632313728, "fcm_dpo/margin": 19.239667892456055, "fcm_dpo/q_t": 0.42096006870269775, "grad_norm": 24.107803344726562, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6563955545425415, "logits/rejected": -0.6127746105194092, "logps/chosen": -77.52592468261719, "logps/ref_chosen": -57.59397888183594, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -113.23182678222656, "loss": 1.1626, "margin_dpo/margin_mean": 19.239667892456055, "margin_dpo/margin_std": 34.46852111816406, "step": 100 }, { "KL/chosen_KL_mean": -24.661067962646484, "KL/mean": -36.265132904052734, "KL/rejected_KL_mean": -47.869197845458984, "KL/std": 32.89160919189453, "epoch": 0.14831130690161526, "fcm_dpo/beta": 0.0180535688996315, "fcm_dpo/delta": -0.020610351115465164, "fcm_dpo/margin": 23.2081298828125, "fcm_dpo/q_t": 0.4103754460811615, "grad_norm": 23.844804763793945, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.6886243224143982, "logits/rejected": -0.6581400632858276, "logps/chosen": -86.30992126464844, "logps/ref_chosen": -61.64885330200195, "logps/ref_rejected": -83.18968200683594, "logps/rejected": -131.0588836669922, "loss": 1.1263, "margin_dpo/margin_mean": 23.2081298828125, "margin_dpo/margin_std": 39.541419982910156, "step": 101 }, { "KL/chosen_KL_mean": -26.75433921813965, "KL/mean": -35.32197570800781, "KL/rejected_KL_mean": -43.889610290527344, "KL/std": 31.045133590698242, "epoch": 0.14977973568281938, "fcm_dpo/beta": 0.018092244863510132, "fcm_dpo/delta": -0.025346608832478523, "fcm_dpo/margin": 17.135272979736328, "fcm_dpo/q_t": 0.43100807070732117, "grad_norm": 26.781410217285156, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.6860433220863342, "logits/rejected": -0.6402877569198608, "logps/chosen": -90.83322143554688, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -112.07669067382812, "loss": 1.2068, "margin_dpo/margin_mean": 17.13527488708496, "margin_dpo/margin_std": 36.66783905029297, "step": 102 }, { "KL/chosen_KL_mean": -20.93372344970703, "KL/mean": -38.12938690185547, "KL/rejected_KL_mean": -55.3250617980957, "KL/std": 34.48286437988281, "epoch": 0.1512481644640235, "fcm_dpo/beta": 0.01744980737566948, "fcm_dpo/delta": -0.21340087056159973, "fcm_dpo/margin": 34.39133834838867, "fcm_dpo/q_t": 0.3697454333305359, "grad_norm": 22.851566314697266, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6631127595901489, "logits/rejected": -0.6278681755065918, "logps/chosen": -82.23300170898438, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57270812988281, "logps/rejected": -148.89776611328125, "loss": 0.9817, "margin_dpo/margin_mean": 34.39133834838867, "margin_dpo/margin_std": 41.43703079223633, "step": 103 }, { "KL/chosen_KL_mean": -23.079187393188477, "KL/mean": -39.357391357421875, "KL/rejected_KL_mean": -55.635589599609375, "KL/std": 38.08613967895508, "epoch": 0.1527165932452276, "fcm_dpo/beta": 0.01691918447613716, "fcm_dpo/delta": -0.15939825773239136, "fcm_dpo/margin": 32.55640411376953, "fcm_dpo/q_t": 0.3846844732761383, "grad_norm": 22.517627716064453, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.7116140127182007, "logits/rejected": -0.6817853450775146, "logps/chosen": -77.45196533203125, "logps/ref_chosen": -54.372772216796875, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -145.2003173828125, "loss": 1.041, "margin_dpo/margin_mean": 32.5564079284668, "margin_dpo/margin_std": 46.39738082885742, "step": 104 }, { "KL/chosen_KL_mean": -21.710777282714844, "KL/mean": -44.809566497802734, "KL/rejected_KL_mean": -67.90835571289062, "KL/std": 37.22575378417969, "epoch": 0.15418502202643172, "fcm_dpo/beta": 0.015895074233412743, "fcm_dpo/delta": -0.36052238941192627, "fcm_dpo/margin": 46.197574615478516, "fcm_dpo/q_t": 0.3344946503639221, "grad_norm": 22.841474533081055, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.6764773726463318, "logits/rejected": -0.6534477472305298, "logps/chosen": -76.34972381591797, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -165.88186645507812, "loss": 0.8712, "margin_dpo/margin_mean": 46.19757843017578, "margin_dpo/margin_std": 40.87889099121094, "step": 105 }, { "KL/chosen_KL_mean": -24.2728328704834, "KL/mean": -38.736602783203125, "KL/rejected_KL_mean": -53.20037078857422, "KL/std": 32.29327392578125, "epoch": 0.15565345080763582, "fcm_dpo/beta": 0.015443746000528336, "fcm_dpo/delta": -0.04908674955368042, "fcm_dpo/margin": 28.927536010742188, "fcm_dpo/q_t": 0.3971561789512634, "grad_norm": 21.783618927001953, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6298633217811584, "logits/rejected": -0.5977374315261841, "logps/chosen": -79.10572814941406, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -138.42498779296875, "loss": 1.0575, "margin_dpo/margin_mean": 28.927536010742188, "margin_dpo/margin_std": 35.089813232421875, "step": 106 }, { "KL/chosen_KL_mean": -28.940296173095703, "KL/mean": -45.31855773925781, "KL/rejected_KL_mean": -61.69682312011719, "KL/std": 39.5135498046875, "epoch": 0.15712187958883994, "fcm_dpo/beta": 0.015136872418224812, "fcm_dpo/delta": -0.10162399709224701, "fcm_dpo/margin": 32.756526947021484, "fcm_dpo/q_t": 0.3875572979450226, "grad_norm": 21.18216323852539, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.697156548500061, "logits/rejected": -0.6531878113746643, "logps/chosen": -98.64810943603516, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -156.43634033203125, "loss": 1.0427, "margin_dpo/margin_mean": 32.75652313232422, "margin_dpo/margin_std": 41.57597351074219, "step": 107 }, { "KL/chosen_KL_mean": -26.22125816345215, "KL/mean": -45.80976486206055, "KL/rejected_KL_mean": -65.39826965332031, "KL/std": 43.5434684753418, "epoch": 0.15859030837004406, "fcm_dpo/beta": 0.014710919000208378, "fcm_dpo/delta": -0.1870792806148529, "fcm_dpo/margin": 39.17702102661133, "fcm_dpo/q_t": 0.38031500577926636, "grad_norm": 21.38556671142578, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.6746413707733154, "logits/rejected": -0.6529253721237183, "logps/chosen": -82.23114013671875, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -161.19427490234375, "loss": 1.0372, "margin_dpo/margin_mean": 39.17702102661133, "margin_dpo/margin_std": 56.45673370361328, "step": 108 }, { "KL/chosen_KL_mean": -24.449106216430664, "KL/mean": -45.4000358581543, "KL/rejected_KL_mean": -66.35096740722656, "KL/std": 42.009151458740234, "epoch": 0.16005873715124816, "fcm_dpo/beta": 0.014084616675972939, "fcm_dpo/delta": -0.2030661702156067, "fcm_dpo/margin": 41.90185546875, "fcm_dpo/q_t": 0.36869800090789795, "grad_norm": 21.996633529663086, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.6177815198898315, "logits/rejected": -0.5968196392059326, "logps/chosen": -87.3345947265625, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -165.03671264648438, "loss": 0.9883, "margin_dpo/margin_mean": 41.901859283447266, "margin_dpo/margin_std": 49.483802795410156, "step": 109 }, { "KL/chosen_KL_mean": -25.081512451171875, "KL/mean": -43.997459411621094, "KL/rejected_KL_mean": -62.91341018676758, "KL/std": 41.81461715698242, "epoch": 0.16152716593245228, "fcm_dpo/beta": 0.013565946370363235, "fcm_dpo/delta": -0.12388351559638977, "fcm_dpo/margin": 37.83190155029297, "fcm_dpo/q_t": 0.386931836605072, "grad_norm": 19.18947982788086, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.6101734638214111, "logits/rejected": -0.5731357932090759, "logps/chosen": -83.83519744873047, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -142.66342163085938, "loss": 1.0508, "margin_dpo/margin_mean": 37.83190155029297, "margin_dpo/margin_std": 50.25776672363281, "step": 110 }, { "KL/chosen_KL_mean": -28.801855087280273, "KL/mean": -50.240867614746094, "KL/rejected_KL_mean": -71.67987823486328, "KL/std": 47.443092346191406, "epoch": 0.16299559471365638, "fcm_dpo/beta": 0.01321389153599739, "fcm_dpo/delta": -0.17823287844657898, "fcm_dpo/margin": 42.87803649902344, "fcm_dpo/q_t": 0.3765709400177002, "grad_norm": 22.14818000793457, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.6801958084106445, "logits/rejected": -0.6596289873123169, "logps/chosen": -97.42596435546875, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -170.10873413085938, "loss": 1.0313, "margin_dpo/margin_mean": 42.87803268432617, "margin_dpo/margin_std": 58.28101348876953, "step": 111 }, { "KL/chosen_KL_mean": -26.53110122680664, "KL/mean": -40.25721740722656, "KL/rejected_KL_mean": -53.983333587646484, "KL/std": 33.112037658691406, "epoch": 0.1644640234948605, "fcm_dpo/beta": 0.013187635689973831, "fcm_dpo/delta": 0.039192065596580505, "fcm_dpo/margin": 27.45223617553711, "fcm_dpo/q_t": 0.41697975993156433, "grad_norm": 20.016754150390625, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6283408999443054, "logits/rejected": -0.5890357494354248, "logps/chosen": -76.78074645996094, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -118.75776672363281, "loss": 1.1236, "margin_dpo/margin_mean": 27.452232360839844, "margin_dpo/margin_std": 39.88359069824219, "step": 112 }, { "KL/chosen_KL_mean": -33.448238372802734, "KL/mean": -48.20136642456055, "KL/rejected_KL_mean": -62.954490661621094, "KL/std": 34.57713317871094, "epoch": 0.16593245227606462, "fcm_dpo/beta": 0.013288527727127075, "fcm_dpo/delta": 0.008226404897868633, "fcm_dpo/margin": 29.506244659423828, "fcm_dpo/q_t": 0.4095924198627472, "grad_norm": 20.27177619934082, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.6053781509399414, "logits/rejected": -0.5618308782577515, "logps/chosen": -100.16119384765625, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -140.92318725585938, "loss": 1.0924, "margin_dpo/margin_mean": 29.506242752075195, "margin_dpo/margin_std": 38.19648742675781, "step": 113 }, { "KL/chosen_KL_mean": -30.597002029418945, "KL/mean": -54.11078643798828, "KL/rejected_KL_mean": -77.62455749511719, "KL/std": 51.99464797973633, "epoch": 0.16740088105726872, "fcm_dpo/beta": 0.012787006795406342, "fcm_dpo/delta": -0.21655288338661194, "fcm_dpo/margin": 47.027565002441406, "fcm_dpo/q_t": 0.37409037351608276, "grad_norm": 21.58888053894043, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.6126964092254639, "logits/rejected": -0.5797730684280396, "logps/chosen": -88.382080078125, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -164.73422241210938, "loss": 0.9983, "margin_dpo/margin_mean": 47.027557373046875, "margin_dpo/margin_std": 63.04608154296875, "step": 114 }, { "KL/chosen_KL_mean": -40.25880813598633, "KL/mean": -56.40677261352539, "KL/rejected_KL_mean": -72.55473327636719, "KL/std": 49.674827575683594, "epoch": 0.16886930983847284, "fcm_dpo/beta": 0.012744484469294548, "fcm_dpo/delta": -0.012245994061231613, "fcm_dpo/margin": 32.29592514038086, "fcm_dpo/q_t": 0.4138960838317871, "grad_norm": 28.310956954956055, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.6323011517524719, "logits/rejected": -0.6113392114639282, "logps/chosen": -105.84144592285156, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -171.12025451660156, "loss": 1.1653, "margin_dpo/margin_mean": 32.29592514038086, "margin_dpo/margin_std": 65.29641723632812, "step": 115 }, { "KL/chosen_KL_mean": -30.069929122924805, "KL/mean": -47.952064514160156, "KL/rejected_KL_mean": -65.83419799804688, "KL/std": 41.692527770996094, "epoch": 0.17033773861967694, "fcm_dpo/beta": 0.012649480253458023, "fcm_dpo/delta": -0.055032700300216675, "fcm_dpo/margin": 35.764259338378906, "fcm_dpo/q_t": 0.39788171648979187, "grad_norm": 22.138639450073242, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.6037485599517822, "logits/rejected": -0.5925810933113098, "logps/chosen": -81.47024536132812, "logps/ref_chosen": -51.40031433105469, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -146.35604858398438, "loss": 1.0586, "margin_dpo/margin_mean": 35.76426315307617, "margin_dpo/margin_std": 45.07810974121094, "step": 116 }, { "KL/chosen_KL_mean": -38.60306930541992, "KL/mean": -53.26951217651367, "KL/rejected_KL_mean": -67.93594360351562, "KL/std": 44.515228271484375, "epoch": 0.17180616740088106, "fcm_dpo/beta": 0.012619540095329285, "fcm_dpo/delta": 0.030971404165029526, "fcm_dpo/margin": 29.332874298095703, "fcm_dpo/q_t": 0.41701966524124146, "grad_norm": 27.81166648864746, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.5994728803634644, "logits/rejected": -0.5634751319885254, "logps/chosen": -107.90147399902344, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.583984375, "logps/rejected": -134.51992797851562, "loss": 1.1423, "margin_dpo/margin_mean": 29.332870483398438, "margin_dpo/margin_std": 50.427330017089844, "step": 117 }, { "KL/chosen_KL_mean": -30.75716781616211, "KL/mean": -48.66988754272461, "KL/rejected_KL_mean": -66.58258819580078, "KL/std": 40.41304397583008, "epoch": 0.17327459618208516, "fcm_dpo/beta": 0.01254614070057869, "fcm_dpo/delta": -0.05177786946296692, "fcm_dpo/margin": 35.82543182373047, "fcm_dpo/q_t": 0.3989246189594269, "grad_norm": 21.09397315979004, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.6358990669250488, "logits/rejected": -0.6165393590927124, "logps/chosen": -86.39814758300781, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905975341797, "logps/rejected": -142.25164794921875, "loss": 1.0623, "margin_dpo/margin_mean": 35.82543182373047, "margin_dpo/margin_std": 46.26477813720703, "step": 118 }, { "KL/chosen_KL_mean": -42.31685256958008, "KL/mean": -60.063663482666016, "KL/rejected_KL_mean": -77.81047058105469, "KL/std": 45.433780670166016, "epoch": 0.17474302496328928, "fcm_dpo/beta": 0.012348956428468227, "fcm_dpo/delta": -0.04121140390634537, "fcm_dpo/margin": 35.49361801147461, "fcm_dpo/q_t": 0.40484434366226196, "grad_norm": 23.629602432250977, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.6368391513824463, "logits/rejected": -0.6250006556510925, "logps/chosen": -115.8270492553711, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.977294921875, "logps/rejected": -180.7877655029297, "loss": 1.1009, "margin_dpo/margin_mean": 35.49361801147461, "margin_dpo/margin_std": 55.28799057006836, "step": 119 }, { "KL/chosen_KL_mean": -43.77620315551758, "KL/mean": -68.10749816894531, "KL/rejected_KL_mean": -92.43879699707031, "KL/std": 56.72157287597656, "epoch": 0.1762114537444934, "fcm_dpo/beta": 0.012037184089422226, "fcm_dpo/delta": -0.197471484541893, "fcm_dpo/margin": 48.66258239746094, "fcm_dpo/q_t": 0.3716173470020294, "grad_norm": 22.18862533569336, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.614201009273529, "logits/rejected": -0.5810754299163818, "logps/chosen": -120.55703735351562, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -200.4625244140625, "loss": 0.9948, "margin_dpo/margin_mean": 48.66257858276367, "margin_dpo/margin_std": 59.4996223449707, "step": 120 }, { "KL/chosen_KL_mean": -42.099632263183594, "KL/mean": -69.11078643798828, "KL/rejected_KL_mean": -96.12193298339844, "KL/std": 55.44459915161133, "epoch": 0.1776798825256975, "fcm_dpo/beta": 0.011453816667199135, "fcm_dpo/delta": -0.2350277304649353, "fcm_dpo/margin": 54.022308349609375, "fcm_dpo/q_t": 0.3670397102832794, "grad_norm": 24.160505294799805, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.5971692800521851, "logits/rejected": -0.5936212539672852, "logps/chosen": -103.8895263671875, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -206.11651611328125, "loss": 0.9958, "margin_dpo/margin_mean": 54.022308349609375, "margin_dpo/margin_std": 69.530029296875, "step": 121 }, { "KL/chosen_KL_mean": -38.35765838623047, "KL/mean": -72.02163696289062, "KL/rejected_KL_mean": -105.68560791015625, "KL/std": 63.397361755371094, "epoch": 0.17914831130690162, "fcm_dpo/beta": 0.010804468765854836, "fcm_dpo/delta": -0.35293835401535034, "fcm_dpo/margin": 67.32794952392578, "fcm_dpo/q_t": 0.3403121829032898, "grad_norm": 23.03303337097168, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.5504162311553955, "logits/rejected": -0.5663501024246216, "logps/chosen": -85.25987243652344, "logps/ref_chosen": -46.9022102355957, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -212.39981079101562, "loss": 0.893, "margin_dpo/margin_mean": 67.32794952392578, "margin_dpo/margin_std": 66.18780517578125, "step": 122 }, { "KL/chosen_KL_mean": -42.53881072998047, "KL/mean": -64.56620025634766, "KL/rejected_KL_mean": -86.59358215332031, "KL/std": 54.408897399902344, "epoch": 0.18061674008810572, "fcm_dpo/beta": 0.010443691164255142, "fcm_dpo/delta": -0.06393568962812424, "fcm_dpo/margin": 44.054771423339844, "fcm_dpo/q_t": 0.401348352432251, "grad_norm": 21.118499755859375, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.5548320412635803, "logits/rejected": -0.537066638469696, "logps/chosen": -103.87744903564453, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.775390625, "logps/rejected": -174.36898803710938, "loss": 1.1089, "margin_dpo/margin_mean": 44.054771423339844, "margin_dpo/margin_std": 73.05723571777344, "step": 123 }, { "KL/chosen_KL_mean": -48.416603088378906, "KL/mean": -80.89775085449219, "KL/rejected_KL_mean": -113.37890625, "KL/std": 74.14457702636719, "epoch": 0.18208516886930984, "fcm_dpo/beta": 0.010025800205767155, "fcm_dpo/delta": -0.26980358362197876, "fcm_dpo/margin": 64.96229553222656, "fcm_dpo/q_t": 0.3692609667778015, "grad_norm": 26.101245880126953, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.5674383640289307, "logits/rejected": -0.5700336694717407, "logps/chosen": -119.86493682861328, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -230.95947265625, "loss": 1.0158, "margin_dpo/margin_mean": 64.96229553222656, "margin_dpo/margin_std": 93.43049621582031, "step": 124 }, { "KL/chosen_KL_mean": -39.75640106201172, "KL/mean": -68.591796875, "KL/rejected_KL_mean": -97.42720031738281, "KL/std": 62.443931579589844, "epoch": 0.18355359765051396, "fcm_dpo/beta": 0.009690500795841217, "fcm_dpo/delta": -0.16801846027374268, "fcm_dpo/margin": 57.67080307006836, "fcm_dpo/q_t": 0.3799913227558136, "grad_norm": 19.212617874145508, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.5034211874008179, "logits/rejected": -0.5046522617340088, "logps/chosen": -89.89334106445312, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -181.4158172607422, "loss": 1.0263, "margin_dpo/margin_mean": 57.670806884765625, "margin_dpo/margin_std": 77.20497131347656, "step": 125 }, { "KL/chosen_KL_mean": -43.16718292236328, "KL/mean": -70.08540344238281, "KL/rejected_KL_mean": -97.00362396240234, "KL/std": 56.352882385253906, "epoch": 0.18502202643171806, "fcm_dpo/beta": 0.009411858394742012, "fcm_dpo/delta": -0.11267369985580444, "fcm_dpo/margin": 53.836448669433594, "fcm_dpo/q_t": 0.38875728845596313, "grad_norm": 20.403451919555664, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.5015150308609009, "logits/rejected": -0.4946970045566559, "logps/chosen": -98.8342514038086, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -195.13339233398438, "loss": 1.043, "margin_dpo/margin_mean": 53.836448669433594, "margin_dpo/margin_std": 71.72261810302734, "step": 126 }, { "KL/chosen_KL_mean": -41.83540344238281, "KL/mean": -61.58380889892578, "KL/rejected_KL_mean": -81.33221435546875, "KL/std": 49.37230682373047, "epoch": 0.18649045521292218, "fcm_dpo/beta": 0.009420674294233322, "fcm_dpo/delta": 0.028969500213861465, "fcm_dpo/margin": 39.496803283691406, "fcm_dpo/q_t": 0.4141026735305786, "grad_norm": 20.757905960083008, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.5394914150238037, "logits/rejected": -0.5201703310012817, "logps/chosen": -98.39008331298828, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -158.12799072265625, "loss": 1.1294, "margin_dpo/margin_mean": 39.496803283691406, "margin_dpo/margin_std": 62.23881149291992, "step": 127 }, { "KL/chosen_KL_mean": -44.993072509765625, "KL/mean": -65.68038177490234, "KL/rejected_KL_mean": -86.36769104003906, "KL/std": 58.315940856933594, "epoch": 0.18795888399412627, "fcm_dpo/beta": 0.009480522945523262, "fcm_dpo/delta": 0.007658433169126511, "fcm_dpo/margin": 41.37461853027344, "fcm_dpo/q_t": 0.41293513774871826, "grad_norm": 29.94881248474121, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.5278192162513733, "logits/rejected": -0.5076951384544373, "logps/chosen": -103.11402893066406, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -162.806640625, "loss": 1.143, "margin_dpo/margin_mean": 41.37461853027344, "margin_dpo/margin_std": 72.8564453125, "step": 128 }, { "KL/chosen_KL_mean": -54.26958465576172, "KL/mean": -76.15081024169922, "KL/rejected_KL_mean": -98.03204345703125, "KL/std": 64.25934600830078, "epoch": 0.1894273127753304, "fcm_dpo/beta": 0.009432371705770493, "fcm_dpo/delta": -0.013336148113012314, "fcm_dpo/margin": 43.762451171875, "fcm_dpo/q_t": 0.4137777090072632, "grad_norm": 20.718914031982422, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.5499258637428284, "logits/rejected": -0.539750337600708, "logps/chosen": -121.18595886230469, "logps/ref_chosen": -66.91637420654297, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -194.67425537109375, "loss": 1.1482, "margin_dpo/margin_mean": 43.762454986572266, "margin_dpo/margin_std": 83.50243377685547, "step": 129 }, { "KL/chosen_KL_mean": -40.19133758544922, "KL/mean": -73.26377868652344, "KL/rejected_KL_mean": -106.33623504638672, "KL/std": 66.14134216308594, "epoch": 0.19089574155653452, "fcm_dpo/beta": 0.009208977222442627, "fcm_dpo/delta": -0.22258631885051727, "fcm_dpo/margin": 66.1448974609375, "fcm_dpo/q_t": 0.36926034092903137, "grad_norm": 21.168210983276367, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.48520392179489136, "logits/rejected": -0.47909384965896606, "logps/chosen": -84.85818481445312, "logps/ref_chosen": -44.66685104370117, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -189.11788940429688, "loss": 0.9951, "margin_dpo/margin_mean": 66.1448974609375, "margin_dpo/margin_std": 80.59586334228516, "step": 130 }, { "KL/chosen_KL_mean": -36.06732177734375, "KL/mean": -69.21546173095703, "KL/rejected_KL_mean": -102.36358642578125, "KL/std": 65.6218490600586, "epoch": 0.19236417033773862, "fcm_dpo/beta": 0.008744207210838795, "fcm_dpo/delta": -0.19108328223228455, "fcm_dpo/margin": 66.29625701904297, "fcm_dpo/q_t": 0.3681938648223877, "grad_norm": 28.092988967895508, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.48882123827934265, "logits/rejected": -0.4953649342060089, "logps/chosen": -80.99191284179688, "logps/ref_chosen": -44.924591064453125, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -190.80758666992188, "loss": 0.9777, "margin_dpo/margin_mean": 66.29625701904297, "margin_dpo/margin_std": 71.81926727294922, "step": 131 }, { "KL/chosen_KL_mean": -48.046173095703125, "KL/mean": -75.12873840332031, "KL/rejected_KL_mean": -102.2113037109375, "KL/std": 65.57552337646484, "epoch": 0.19383259911894274, "fcm_dpo/beta": 0.00856691226363182, "fcm_dpo/delta": -0.06736327707767487, "fcm_dpo/margin": 54.165130615234375, "fcm_dpo/q_t": 0.4011274576187134, "grad_norm": 19.968385696411133, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.5298300385475159, "logits/rejected": -0.5281400680541992, "logps/chosen": -107.04725646972656, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -190.10345458984375, "loss": 1.0883, "margin_dpo/margin_mean": 54.165130615234375, "margin_dpo/margin_std": 85.23381042480469, "step": 132 }, { "KL/chosen_KL_mean": -56.799015045166016, "KL/mean": -80.92729187011719, "KL/rejected_KL_mean": -105.05557250976562, "KL/std": 59.583778381347656, "epoch": 0.19530102790014683, "fcm_dpo/beta": 0.008522960357367992, "fcm_dpo/delta": -0.011873488314449787, "fcm_dpo/margin": 48.256553649902344, "fcm_dpo/q_t": 0.40982773900032043, "grad_norm": 25.56277847290039, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.506017804145813, "logits/rejected": -0.4915475845336914, "logps/chosen": -123.40351867675781, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -201.38912963867188, "loss": 1.109, "margin_dpo/margin_mean": 48.256561279296875, "margin_dpo/margin_std": 75.63455200195312, "step": 133 }, { "KL/chosen_KL_mean": -44.83940505981445, "KL/mean": -70.63905334472656, "KL/rejected_KL_mean": -96.43869018554688, "KL/std": 57.99497604370117, "epoch": 0.19676945668135096, "fcm_dpo/beta": 0.00851006992161274, "fcm_dpo/delta": -0.040991440415382385, "fcm_dpo/margin": 51.59928894042969, "fcm_dpo/q_t": 0.4014682173728943, "grad_norm": 18.941587448120117, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.49966758489608765, "logits/rejected": -0.4865725636482239, "logps/chosen": -96.90866088867188, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -184.09320068359375, "loss": 1.0664, "margin_dpo/margin_mean": 51.59928894042969, "margin_dpo/margin_std": 67.03329467773438, "step": 134 }, { "KL/chosen_KL_mean": -49.720855712890625, "KL/mean": -86.716064453125, "KL/rejected_KL_mean": -123.71127319335938, "KL/std": 76.8663330078125, "epoch": 0.19823788546255505, "fcm_dpo/beta": 0.008192040026187897, "fcm_dpo/delta": -0.21948286890983582, "fcm_dpo/margin": 73.99042510986328, "fcm_dpo/q_t": 0.37052643299102783, "grad_norm": 22.25084686279297, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.4897315800189972, "logits/rejected": -0.5235172510147095, "logps/chosen": -100.07471466064453, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -239.69102478027344, "loss": 0.9939, "margin_dpo/margin_mean": 73.99043273925781, "margin_dpo/margin_std": 92.94728088378906, "step": 135 }, { "KL/chosen_KL_mean": -59.06098175048828, "KL/mean": -83.65766906738281, "KL/rejected_KL_mean": -108.25436401367188, "KL/std": 69.63789367675781, "epoch": 0.19970631424375918, "fcm_dpo/beta": 0.008057507686316967, "fcm_dpo/delta": 0.003085322678089142, "fcm_dpo/margin": 49.19337844848633, "fcm_dpo/q_t": 0.4188251495361328, "grad_norm": 20.361692428588867, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.48390525579452515, "logits/rejected": -0.4752395749092102, "logps/chosen": -124.13349151611328, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -204.57559204101562, "loss": 1.1425, "margin_dpo/margin_mean": 49.193382263183594, "margin_dpo/margin_std": 89.2242431640625, "step": 136 }, { "KL/chosen_KL_mean": -56.935489654541016, "KL/mean": -95.57328796386719, "KL/rejected_KL_mean": -134.21109008789062, "KL/std": 92.69071960449219, "epoch": 0.2011747430249633, "fcm_dpo/beta": 0.007838413119316101, "fcm_dpo/delta": -0.21924690902233124, "fcm_dpo/margin": 77.27558898925781, "fcm_dpo/q_t": 0.37673407793045044, "grad_norm": 18.20696258544922, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.45655950903892517, "logits/rejected": -0.48352983593940735, "logps/chosen": -105.69461059570312, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86376953125, "logps/rejected": -248.07485961914062, "loss": 1.025, "margin_dpo/margin_mean": 77.27558898925781, "margin_dpo/margin_std": 110.96575927734375, "step": 137 }, { "KL/chosen_KL_mean": -59.1551399230957, "KL/mean": -88.54357147216797, "KL/rejected_KL_mean": -117.9320068359375, "KL/std": 70.5914306640625, "epoch": 0.2026431718061674, "fcm_dpo/beta": 0.00770821887999773, "fcm_dpo/delta": -0.055549122393131256, "fcm_dpo/margin": 58.77688217163086, "fcm_dpo/q_t": 0.3963842988014221, "grad_norm": 21.46184730529785, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.4387979507446289, "logits/rejected": -0.4272313714027405, "logps/chosen": -119.67478942871094, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -211.12896728515625, "loss": 1.0551, "margin_dpo/margin_mean": 58.77688217163086, "margin_dpo/margin_std": 72.2972640991211, "step": 138 }, { "KL/chosen_KL_mean": -49.92842483520508, "KL/mean": -85.01356506347656, "KL/rejected_KL_mean": -120.09870910644531, "KL/std": 67.0115737915039, "epoch": 0.20411160058737152, "fcm_dpo/beta": 0.007536326535046101, "fcm_dpo/delta": -0.13581906259059906, "fcm_dpo/margin": 70.17027282714844, "fcm_dpo/q_t": 0.3828091025352478, "grad_norm": 18.216188430786133, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.4372691512107849, "logits/rejected": -0.42744114995002747, "logps/chosen": -96.81980895996094, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -199.82669067382812, "loss": 1.0186, "margin_dpo/margin_mean": 70.17028045654297, "margin_dpo/margin_std": 86.85481262207031, "step": 139 }, { "KL/chosen_KL_mean": -60.878379821777344, "KL/mean": -92.82592010498047, "KL/rejected_KL_mean": -124.77346801757812, "KL/std": 74.88318634033203, "epoch": 0.2055800293685756, "fcm_dpo/beta": 0.0073799854144454, "fcm_dpo/delta": -0.07513369619846344, "fcm_dpo/margin": 63.89509582519531, "fcm_dpo/q_t": 0.3952398896217346, "grad_norm": 21.68344497680664, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.4220992922782898, "logits/rejected": -0.40758657455444336, "logps/chosen": -119.85308837890625, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28410339355469, "logps/rejected": -208.05758666992188, "loss": 1.0673, "margin_dpo/margin_mean": 63.89509201049805, "margin_dpo/margin_std": 89.70909118652344, "step": 140 }, { "KL/chosen_KL_mean": -68.69772338867188, "KL/mean": -98.87570190429688, "KL/rejected_KL_mean": -129.05369567871094, "KL/std": 82.59878540039062, "epoch": 0.20704845814977973, "fcm_dpo/beta": 0.007316044997423887, "fcm_dpo/delta": -0.04346423223614693, "fcm_dpo/margin": 60.355979919433594, "fcm_dpo/q_t": 0.40099745988845825, "grad_norm": 26.641067504882812, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.43964171409606934, "logits/rejected": -0.42764222621917725, "logps/chosen": -143.77337646484375, "logps/ref_chosen": -75.07566833496094, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -227.24595642089844, "loss": 1.1025, "margin_dpo/margin_mean": 60.355979919433594, "margin_dpo/margin_std": 95.37814331054688, "step": 141 }, { "KL/chosen_KL_mean": -70.7205581665039, "KL/mean": -105.15746307373047, "KL/rejected_KL_mean": -139.5943603515625, "KL/std": 90.38099670410156, "epoch": 0.20851688693098386, "fcm_dpo/beta": 0.0072142817080020905, "fcm_dpo/delta": -0.10188804566860199, "fcm_dpo/margin": 68.87380981445312, "fcm_dpo/q_t": 0.39290913939476013, "grad_norm": 28.223947525024414, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.4045884907245636, "logits/rejected": -0.40342068672180176, "logps/chosen": -128.7484893798828, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222961425781, "logps/rejected": -234.1765899658203, "loss": 1.0857, "margin_dpo/margin_mean": 68.87380981445312, "margin_dpo/margin_std": 106.27733612060547, "step": 142 }, { "KL/chosen_KL_mean": -73.88005065917969, "KL/mean": -95.9843521118164, "KL/rejected_KL_mean": -118.08866119384766, "KL/std": 79.61582946777344, "epoch": 0.20998531571218795, "fcm_dpo/beta": 0.007206078618764877, "fcm_dpo/delta": 0.08411475270986557, "fcm_dpo/margin": 44.20860290527344, "fcm_dpo/q_t": 0.4320971667766571, "grad_norm": 23.867572784423828, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.40776681900024414, "logits/rejected": -0.4014623761177063, "logps/chosen": -131.47650146484375, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -197.08822631835938, "loss": 1.1973, "margin_dpo/margin_mean": 44.20860290527344, "margin_dpo/margin_std": 93.17940521240234, "step": 143 }, { "KL/chosen_KL_mean": -65.05524444580078, "KL/mean": -92.26278686523438, "KL/rejected_KL_mean": -119.47032165527344, "KL/std": 66.92149353027344, "epoch": 0.21145374449339208, "fcm_dpo/beta": 0.007265343330800533, "fcm_dpo/delta": 0.004837200976908207, "fcm_dpo/margin": 54.41508483886719, "fcm_dpo/q_t": 0.41054314374923706, "grad_norm": 20.957622528076172, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.3970365524291992, "logits/rejected": -0.3828931450843811, "logps/chosen": -124.96160888671875, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -201.4705810546875, "loss": 1.1054, "margin_dpo/margin_mean": 54.41508483886719, "margin_dpo/margin_std": 78.49324035644531, "step": 144 }, { "KL/chosen_KL_mean": -62.5151252746582, "KL/mean": -91.44053649902344, "KL/rejected_KL_mean": -120.36595153808594, "KL/std": 66.93643188476562, "epoch": 0.21292217327459617, "fcm_dpo/beta": 0.007246987894177437, "fcm_dpo/delta": -0.020106535404920578, "fcm_dpo/margin": 57.850833892822266, "fcm_dpo/q_t": 0.40498581528663635, "grad_norm": 24.766576766967773, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.4346858263015747, "logits/rejected": -0.4181862473487854, "logps/chosen": -119.11579132080078, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -198.23226928710938, "loss": 1.0899, "margin_dpo/margin_mean": 57.850833892822266, "margin_dpo/margin_std": 81.18896484375, "step": 145 }, { "KL/chosen_KL_mean": -84.92266082763672, "KL/mean": -107.23759460449219, "KL/rejected_KL_mean": -129.55252075195312, "KL/std": 72.51426696777344, "epoch": 0.2143906020558003, "fcm_dpo/beta": 0.007306361570954323, "fcm_dpo/delta": 0.07646190375089645, "fcm_dpo/margin": 44.6298828125, "fcm_dpo/q_t": 0.4260821044445038, "grad_norm": 26.79239273071289, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.40483713150024414, "logits/rejected": -0.3770599961280823, "logps/chosen": -150.92311096191406, "logps/ref_chosen": -66.00045013427734, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -211.25531005859375, "loss": 1.1855, "margin_dpo/margin_mean": 44.6298828125, "margin_dpo/margin_std": 87.585693359375, "step": 146 }, { "KL/chosen_KL_mean": -62.145790100097656, "KL/mean": -92.88223266601562, "KL/rejected_KL_mean": -123.61865997314453, "KL/std": 73.63186645507812, "epoch": 0.21585903083700442, "fcm_dpo/beta": 0.007286765147000551, "fcm_dpo/delta": -0.050357475876808167, "fcm_dpo/margin": 61.472869873046875, "fcm_dpo/q_t": 0.40113240480422974, "grad_norm": 20.187551498413086, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.4601389765739441, "logits/rejected": -0.4417986273765564, "logps/chosen": -115.55127716064453, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39060974121094, "logps/rejected": -195.00927734375, "loss": 1.0871, "margin_dpo/margin_mean": 61.472869873046875, "margin_dpo/margin_std": 91.1969985961914, "step": 147 }, { "KL/chosen_KL_mean": -60.82073211669922, "KL/mean": -86.87813568115234, "KL/rejected_KL_mean": -112.935546875, "KL/std": 72.8238296508789, "epoch": 0.2173274596182085, "fcm_dpo/beta": 0.007221372798085213, "fcm_dpo/delta": -0.08385775983333588, "fcm_dpo/margin": 52.114810943603516, "fcm_dpo/q_t": 0.41665685176849365, "grad_norm": 18.90130043029785, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.4921185076236725, "logits/rejected": -0.4849007725715637, "logps/chosen": -125.7578125, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -216.02938842773438, "loss": 1.1251, "margin_dpo/margin_mean": 52.114810943603516, "margin_dpo/margin_std": 76.679443359375, "step": 148 }, { "KL/chosen_KL_mean": -57.94389724731445, "KL/mean": -90.5237045288086, "KL/rejected_KL_mean": -123.103515625, "KL/std": 65.8777847290039, "epoch": 0.21879588839941264, "fcm_dpo/beta": 0.0070372275076806545, "fcm_dpo/delta": -0.0631796270608902, "fcm_dpo/margin": 65.15960693359375, "fcm_dpo/q_t": 0.3951166570186615, "grad_norm": 18.19388198852539, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.49327534437179565, "logits/rejected": -0.47827810049057007, "logps/chosen": -116.41766357421875, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -222.4182586669922, "loss": 1.0541, "margin_dpo/margin_mean": 65.15960693359375, "margin_dpo/margin_std": 78.02362060546875, "step": 149 }, { "KL/chosen_KL_mean": -51.43506622314453, "KL/mean": -83.17096710205078, "KL/rejected_KL_mean": -114.90686798095703, "KL/std": 77.47767639160156, "epoch": 0.22026431718061673, "fcm_dpo/beta": 0.006972650997340679, "fcm_dpo/delta": -0.04591844975948334, "fcm_dpo/margin": 63.47180938720703, "fcm_dpo/q_t": 0.4047049582004547, "grad_norm": 17.938838958740234, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.46106183528900146, "logits/rejected": -0.4645771384239197, "logps/chosen": -97.14087677001953, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -198.25445556640625, "loss": 1.0829, "margin_dpo/margin_mean": 63.47180938720703, "margin_dpo/margin_std": 92.78956604003906, "step": 150 }, { "KL/chosen_KL_mean": -63.859195709228516, "KL/mean": -95.80221557617188, "KL/rejected_KL_mean": -127.74524688720703, "KL/std": 72.78681945800781, "epoch": 0.22173274596182085, "fcm_dpo/beta": 0.006956371478736401, "fcm_dpo/delta": -0.046492453664541245, "fcm_dpo/margin": 63.88605499267578, "fcm_dpo/q_t": 0.398138165473938, "grad_norm": 19.721174240112305, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.5282187461853027, "logits/rejected": -0.5002726912498474, "logps/chosen": -134.43002319335938, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -228.2090606689453, "loss": 1.061, "margin_dpo/margin_mean": 63.88605499267578, "margin_dpo/margin_std": 79.7131118774414, "step": 151 }, { "KL/chosen_KL_mean": -55.69847106933594, "KL/mean": -94.40074157714844, "KL/rejected_KL_mean": -133.10302734375, "KL/std": 75.01636505126953, "epoch": 0.22320117474302498, "fcm_dpo/beta": 0.0068196142092347145, "fcm_dpo/delta": -0.13479407131671906, "fcm_dpo/margin": 77.40454864501953, "fcm_dpo/q_t": 0.38243043422698975, "grad_norm": 20.134836196899414, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.5108896493911743, "logits/rejected": -0.5100568532943726, "logps/chosen": -115.86285400390625, "logps/ref_chosen": -60.16438674926758, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -239.24346923828125, "loss": 1.0175, "margin_dpo/margin_mean": 77.40455627441406, "margin_dpo/margin_std": 91.77665710449219, "step": 152 }, { "KL/chosen_KL_mean": -56.524017333984375, "KL/mean": -89.48245239257812, "KL/rejected_KL_mean": -122.4408950805664, "KL/std": 81.44053649902344, "epoch": 0.22466960352422907, "fcm_dpo/beta": 0.0067241257056593895, "fcm_dpo/delta": -0.04522576555609703, "fcm_dpo/margin": 65.91687774658203, "fcm_dpo/q_t": 0.40355831384658813, "grad_norm": 15.487606048583984, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.4853493571281433, "logits/rejected": -0.48000335693359375, "logps/chosen": -112.83929443359375, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -208.0967254638672, "loss": 1.0897, "margin_dpo/margin_mean": 65.91687774658203, "margin_dpo/margin_std": 99.65748596191406, "step": 153 }, { "KL/chosen_KL_mean": -67.8558349609375, "KL/mean": -99.38180541992188, "KL/rejected_KL_mean": -130.9077606201172, "KL/std": 81.84564208984375, "epoch": 0.2261380323054332, "fcm_dpo/beta": 0.006699780933558941, "fcm_dpo/delta": -0.023667776957154274, "fcm_dpo/margin": 63.05192947387695, "fcm_dpo/q_t": 0.4071503281593323, "grad_norm": 18.857498168945312, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.4654182493686676, "logits/rejected": -0.4673847556114197, "logps/chosen": -130.59840393066406, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -235.15194702148438, "loss": 1.1211, "margin_dpo/margin_mean": 63.05193328857422, "margin_dpo/margin_std": 104.29759216308594, "step": 154 }, { "KL/chosen_KL_mean": -61.78107452392578, "KL/mean": -93.73155975341797, "KL/rejected_KL_mean": -125.68203735351562, "KL/std": 76.08834838867188, "epoch": 0.2276064610866373, "fcm_dpo/beta": 0.006656583398580551, "fcm_dpo/delta": -0.026573501527309418, "fcm_dpo/margin": 63.900962829589844, "fcm_dpo/q_t": 0.4044332206249237, "grad_norm": 19.28769302368164, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.48790478706359863, "logits/rejected": -0.468170702457428, "logps/chosen": -122.43425750732422, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -203.1742401123047, "loss": 1.0941, "margin_dpo/margin_mean": 63.900962829589844, "margin_dpo/margin_std": 94.01338195800781, "step": 155 }, { "KL/chosen_KL_mean": -86.89419555664062, "KL/mean": -105.12815856933594, "KL/rejected_KL_mean": -123.36213684082031, "KL/std": 80.70298767089844, "epoch": 0.2290748898678414, "fcm_dpo/beta": 0.006699825637042522, "fcm_dpo/delta": 0.053138453513383865, "fcm_dpo/margin": 36.46794128417969, "fcm_dpo/q_t": 0.44462156295776367, "grad_norm": 27.65213966369629, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.419753760099411, "logits/rejected": -0.41109997034072876, "logps/chosen": -156.38607788085938, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.16929626464844, "logps/rejected": -200.53143310546875, "loss": 1.276, "margin_dpo/margin_mean": 36.46794128417969, "margin_dpo/margin_std": 106.41633605957031, "step": 156 }, { "KL/chosen_KL_mean": -78.34427642822266, "KL/mean": -119.24581909179688, "KL/rejected_KL_mean": -160.1473388671875, "KL/std": 88.42298889160156, "epoch": 0.2305433186490455, "fcm_dpo/beta": 0.00652310810983181, "fcm_dpo/delta": -0.14256341755390167, "fcm_dpo/margin": 81.80308532714844, "fcm_dpo/q_t": 0.3782350420951843, "grad_norm": 23.294269561767578, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.40555089712142944, "logits/rejected": -0.4124807119369507, "logps/chosen": -139.71270751953125, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -267.793701171875, "loss": 1.0267, "margin_dpo/margin_mean": 81.80308532714844, "margin_dpo/margin_std": 100.67677307128906, "step": 157 }, { "KL/chosen_KL_mean": -78.94845581054688, "KL/mean": -122.50050354003906, "KL/rejected_KL_mean": -166.05255126953125, "KL/std": 109.67285919189453, "epoch": 0.23201174743024963, "fcm_dpo/beta": 0.006357924081385136, "fcm_dpo/delta": -0.16277411580085754, "fcm_dpo/margin": 87.10411834716797, "fcm_dpo/q_t": 0.386168897151947, "grad_norm": 20.690141677856445, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.3853977918624878, "logits/rejected": -0.4026561975479126, "logps/chosen": -136.56137084960938, "logps/ref_chosen": -57.612918853759766, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -279.74725341796875, "loss": 1.0524, "margin_dpo/margin_mean": 87.10411071777344, "margin_dpo/margin_std": 131.93109130859375, "step": 158 }, { "KL/chosen_KL_mean": -89.82890319824219, "KL/mean": -119.66598510742188, "KL/rejected_KL_mean": -149.5030517578125, "KL/std": 97.01811218261719, "epoch": 0.23348017621145375, "fcm_dpo/beta": 0.0063509754836559296, "fcm_dpo/delta": 0.02134835720062256, "fcm_dpo/margin": 59.67415237426758, "fcm_dpo/q_t": 0.4156301021575928, "grad_norm": 21.42896270751953, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.4047996401786804, "logits/rejected": -0.37329649925231934, "logps/chosen": -171.38925170898438, "logps/ref_chosen": -81.56034851074219, "logps/ref_rejected": -88.89871215820312, "logps/rejected": -238.40176391601562, "loss": 1.1467, "margin_dpo/margin_mean": 59.674156188964844, "margin_dpo/margin_std": 104.58900451660156, "step": 159 }, { "KL/chosen_KL_mean": -91.19225311279297, "KL/mean": -130.95059204101562, "KL/rejected_KL_mean": -170.70892333984375, "KL/std": 102.61758422851562, "epoch": 0.23494860499265785, "fcm_dpo/beta": 0.006208137609064579, "fcm_dpo/delta": -0.09941543638706207, "fcm_dpo/margin": 79.51667785644531, "fcm_dpo/q_t": 0.39641568064689636, "grad_norm": 22.98140525817871, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.3908793032169342, "logits/rejected": -0.38329264521598816, "logps/chosen": -156.92312622070312, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -267.9267578125, "loss": 1.091, "margin_dpo/margin_mean": 79.51667785644531, "margin_dpo/margin_std": 128.43882751464844, "step": 160 }, { "KL/chosen_KL_mean": -78.4635238647461, "KL/mean": -113.22474670410156, "KL/rejected_KL_mean": -147.9859619140625, "KL/std": 81.19598388671875, "epoch": 0.23641703377386197, "fcm_dpo/beta": 0.00617564469575882, "fcm_dpo/delta": -0.03067013993859291, "fcm_dpo/margin": 69.5224380493164, "fcm_dpo/q_t": 0.4049830436706543, "grad_norm": 21.47429084777832, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.3746282160282135, "logits/rejected": -0.3739486634731293, "logps/chosen": -130.89999389648438, "logps/ref_chosen": -52.43647003173828, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -231.41690063476562, "loss": 1.0944, "margin_dpo/margin_mean": 69.5224380493164, "margin_dpo/margin_std": 102.78611755371094, "step": 161 }, { "KL/chosen_KL_mean": -75.56187438964844, "KL/mean": -110.21990966796875, "KL/rejected_KL_mean": -144.87796020507812, "KL/std": 90.19256591796875, "epoch": 0.23788546255506607, "fcm_dpo/beta": 0.006094048731029034, "fcm_dpo/delta": -0.02500341832637787, "fcm_dpo/margin": 69.31608581542969, "fcm_dpo/q_t": 0.4067990183830261, "grad_norm": 23.615018844604492, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.4356382191181183, "logits/rejected": -0.41989463567733765, "logps/chosen": -138.17245483398438, "logps/ref_chosen": -62.6105842590332, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -234.26853942871094, "loss": 1.108, "margin_dpo/margin_mean": 69.31608581542969, "margin_dpo/margin_std": 106.76126861572266, "step": 162 }, { "KL/chosen_KL_mean": -85.64208221435547, "KL/mean": -119.12933349609375, "KL/rejected_KL_mean": -152.61660766601562, "KL/std": 90.75747680664062, "epoch": 0.2393538913362702, "fcm_dpo/beta": 0.006128158885985613, "fcm_dpo/delta": -0.010875340551137924, "fcm_dpo/margin": 66.97450256347656, "fcm_dpo/q_t": 0.41041916608810425, "grad_norm": 22.441957473754883, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.32167524099349976, "logits/rejected": -0.3008124828338623, "logps/chosen": -140.66371154785156, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.418212890625, "logps/rejected": -228.03482055664062, "loss": 1.1182, "margin_dpo/margin_mean": 66.97450256347656, "margin_dpo/margin_std": 109.2835693359375, "step": 163 }, { "KL/chosen_KL_mean": -78.14186096191406, "KL/mean": -119.51951599121094, "KL/rejected_KL_mean": -160.89715576171875, "KL/std": 89.18498992919922, "epoch": 0.24082232011747431, "fcm_dpo/beta": 0.005989897530525923, "fcm_dpo/delta": -0.10233054310083389, "fcm_dpo/margin": 82.75530242919922, "fcm_dpo/q_t": 0.38811802864074707, "grad_norm": 22.113636016845703, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.3749139904975891, "logits/rejected": -0.36253267526626587, "logps/chosen": -133.78253173828125, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -240.56179809570312, "loss": 1.0369, "margin_dpo/margin_mean": 82.75530242919922, "margin_dpo/margin_std": 102.402587890625, "step": 164 }, { "KL/chosen_KL_mean": -83.58428192138672, "KL/mean": -110.63142395019531, "KL/rejected_KL_mean": -137.67855834960938, "KL/std": 75.4303970336914, "epoch": 0.2422907488986784, "fcm_dpo/beta": 0.0060513936914503574, "fcm_dpo/delta": 0.07517173886299133, "fcm_dpo/margin": 54.094295501708984, "fcm_dpo/q_t": 0.4266318678855896, "grad_norm": 23.23404312133789, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.37104758620262146, "logits/rejected": -0.3426979184150696, "logps/chosen": -144.8949737548828, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -211.34918212890625, "loss": 1.1723, "margin_dpo/margin_mean": 54.09429931640625, "margin_dpo/margin_std": 100.12647247314453, "step": 165 }, { "KL/chosen_KL_mean": -74.41826629638672, "KL/mean": -124.09632873535156, "KL/rejected_KL_mean": -173.77439880371094, "KL/std": 109.63383483886719, "epoch": 0.24375917767988253, "fcm_dpo/beta": 0.005887492559850216, "fcm_dpo/delta": -0.1977493166923523, "fcm_dpo/margin": 99.35612487792969, "fcm_dpo/q_t": 0.38029175996780396, "grad_norm": 18.252317428588867, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.3085266351699829, "logits/rejected": -0.3185623288154602, "logps/chosen": -125.40187072753906, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -271.8695068359375, "loss": 1.0172, "margin_dpo/margin_mean": 99.35612487792969, "margin_dpo/margin_std": 138.07797241210938, "step": 166 }, { "KL/chosen_KL_mean": -74.95281982421875, "KL/mean": -127.87260437011719, "KL/rejected_KL_mean": -180.79238891601562, "KL/std": 100.11784362792969, "epoch": 0.24522760646108663, "fcm_dpo/beta": 0.005670108832418919, "fcm_dpo/delta": -0.21297289431095123, "fcm_dpo/margin": 105.83956909179688, "fcm_dpo/q_t": 0.36611077189445496, "grad_norm": 20.504474639892578, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.32652735710144043, "logits/rejected": -0.32873308658599854, "logps/chosen": -125.37691497802734, "logps/ref_chosen": -50.424095153808594, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -276.82281494140625, "loss": 0.9685, "margin_dpo/margin_mean": 105.83956909179688, "margin_dpo/margin_std": 115.47872924804688, "step": 167 }, { "KL/chosen_KL_mean": -81.31246948242188, "KL/mean": -119.5781478881836, "KL/rejected_KL_mean": -157.84381103515625, "KL/std": 93.51333618164062, "epoch": 0.24669603524229075, "fcm_dpo/beta": 0.00557487178593874, "fcm_dpo/delta": -0.027986720204353333, "fcm_dpo/margin": 76.5313491821289, "fcm_dpo/q_t": 0.40447282791137695, "grad_norm": 20.050628662109375, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.34561973810195923, "logits/rejected": -0.33558547496795654, "logps/chosen": -130.77529907226562, "logps/ref_chosen": -49.462825775146484, "logps/ref_rejected": -75.30855560302734, "logps/rejected": -233.15237426757812, "loss": 1.0813, "margin_dpo/margin_mean": 76.53134155273438, "margin_dpo/margin_std": 104.77227783203125, "step": 168 }, { "KL/chosen_KL_mean": -83.71910095214844, "KL/mean": -117.21250915527344, "KL/rejected_KL_mean": -150.70590209960938, "KL/std": 92.11152648925781, "epoch": 0.24816446402349487, "fcm_dpo/beta": 0.005499421618878841, "fcm_dpo/delta": -0.07137174159288406, "fcm_dpo/margin": 66.98680114746094, "fcm_dpo/q_t": 0.42043811082839966, "grad_norm": 20.767568588256836, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.38283443450927734, "logits/rejected": -0.3673766255378723, "logps/chosen": -143.52255249023438, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -234.05165100097656, "loss": 1.1616, "margin_dpo/margin_mean": 66.98680877685547, "margin_dpo/margin_std": 126.373779296875, "step": 169 }, { "KL/chosen_KL_mean": -73.89985656738281, "KL/mean": -113.43084716796875, "KL/rejected_KL_mean": -152.96185302734375, "KL/std": 87.49028778076172, "epoch": 0.24963289280469897, "fcm_dpo/beta": 0.005440497770905495, "fcm_dpo/delta": -0.032409437000751495, "fcm_dpo/margin": 79.06198120117188, "fcm_dpo/q_t": 0.4014926552772522, "grad_norm": 18.433393478393555, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.32298341393470764, "logits/rejected": -0.3117997348308563, "logps/chosen": -123.37162780761719, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -228.87918090820312, "loss": 1.0797, "margin_dpo/margin_mean": 79.06198120117188, "margin_dpo/margin_std": 104.669189453125, "step": 170 }, { "KL/chosen_KL_mean": -109.0721435546875, "KL/mean": -140.8416290283203, "KL/rejected_KL_mean": -172.61111450195312, "KL/std": 102.56979370117188, "epoch": 0.2511013215859031, "fcm_dpo/beta": 0.005503002088516951, "fcm_dpo/delta": 0.05219453573226929, "fcm_dpo/margin": 63.538963317871094, "fcm_dpo/q_t": 0.42633184790611267, "grad_norm": 27.85107421875, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.3782072067260742, "logits/rejected": -0.36336031556129456, "logps/chosen": -193.5714569091797, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -281.99322509765625, "loss": 1.1822, "margin_dpo/margin_mean": 63.538963317871094, "margin_dpo/margin_std": 132.4144287109375, "step": 171 }, { "KL/chosen_KL_mean": -95.08564758300781, "KL/mean": -128.90762329101562, "KL/rejected_KL_mean": -162.72958374023438, "KL/std": 98.64192199707031, "epoch": 0.2525697503671072, "fcm_dpo/beta": 0.0055364081636071205, "fcm_dpo/delta": 0.02648979052901268, "fcm_dpo/margin": 67.64393615722656, "fcm_dpo/q_t": 0.41614508628845215, "grad_norm": 19.190082550048828, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.3858957886695862, "logits/rejected": -0.36709922552108765, "logps/chosen": -163.73956298828125, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -248.166259765625, "loss": 1.1328, "margin_dpo/margin_mean": 67.64393615722656, "margin_dpo/margin_std": 111.18234252929688, "step": 172 }, { "KL/chosen_KL_mean": -86.4188461303711, "KL/mean": -119.80665588378906, "KL/rejected_KL_mean": -153.1944580078125, "KL/std": 91.71368408203125, "epoch": 0.2540381791483113, "fcm_dpo/beta": 0.005593603476881981, "fcm_dpo/delta": 0.027135606855154037, "fcm_dpo/margin": 66.7756118774414, "fcm_dpo/q_t": 0.4151589870452881, "grad_norm": 20.20654296875, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.4084116816520691, "logits/rejected": -0.38007980585098267, "logps/chosen": -149.4697265625, "logps/ref_chosen": -63.050880432128906, "logps/ref_rejected": -78.68392181396484, "logps/rejected": -231.87838745117188, "loss": 1.1132, "margin_dpo/margin_mean": 66.7756118774414, "margin_dpo/margin_std": 95.39866638183594, "step": 173 }, { "KL/chosen_KL_mean": -82.23112487792969, "KL/mean": -121.97492980957031, "KL/rejected_KL_mean": -161.71873474121094, "KL/std": 96.5755615234375, "epoch": 0.2555066079295154, "fcm_dpo/beta": 0.005577293690294027, "fcm_dpo/delta": -0.045830775052309036, "fcm_dpo/margin": 79.48760986328125, "fcm_dpo/q_t": 0.40260159969329834, "grad_norm": 21.535198211669922, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.320295512676239, "logits/rejected": -0.338248610496521, "logps/chosen": -135.59408569335938, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -263.62994384765625, "loss": 1.089, "margin_dpo/margin_mean": 79.48760986328125, "margin_dpo/margin_std": 115.94807434082031, "step": 174 }, { "KL/chosen_KL_mean": -75.58938598632812, "KL/mean": -130.3022003173828, "KL/rejected_KL_mean": -185.0150146484375, "KL/std": 103.86154174804688, "epoch": 0.25697503671071953, "fcm_dpo/beta": 0.00536087341606617, "fcm_dpo/delta": -0.19861072301864624, "fcm_dpo/margin": 109.42562866210938, "fcm_dpo/q_t": 0.3664923906326294, "grad_norm": 27.933094024658203, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.31206628680229187, "logits/rejected": -0.29794448614120483, "logps/chosen": -121.00715637207031, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -274.52081298828125, "loss": 0.959, "margin_dpo/margin_mean": 109.42562866210938, "margin_dpo/margin_std": 109.13046264648438, "step": 175 }, { "KL/chosen_KL_mean": -80.81350708007812, "KL/mean": -126.73405456542969, "KL/rejected_KL_mean": -172.65460205078125, "KL/std": 101.8584976196289, "epoch": 0.25844346549192365, "fcm_dpo/beta": 0.0052553461864590645, "fcm_dpo/delta": -0.08676035702228546, "fcm_dpo/margin": 91.84110260009766, "fcm_dpo/q_t": 0.3937104344367981, "grad_norm": 19.47554588317871, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.3381134867668152, "logits/rejected": -0.35593676567077637, "logps/chosen": -131.26634216308594, "logps/ref_chosen": -50.452842712402344, "logps/ref_rejected": -95.5589599609375, "logps/rejected": -268.21356201171875, "loss": 1.0458, "margin_dpo/margin_mean": 91.84110260009766, "margin_dpo/margin_std": 117.94707489013672, "step": 176 }, { "KL/chosen_KL_mean": -95.21261596679688, "KL/mean": -140.65003967285156, "KL/rejected_KL_mean": -186.08746337890625, "KL/std": 111.35872650146484, "epoch": 0.2599118942731278, "fcm_dpo/beta": 0.005180859938263893, "fcm_dpo/delta": -0.07426586002111435, "fcm_dpo/margin": 90.87483215332031, "fcm_dpo/q_t": 0.39563536643981934, "grad_norm": 30.431970596313477, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.35000866651535034, "logits/rejected": -0.3415108621120453, "logps/chosen": -156.4290771484375, "logps/ref_chosen": -61.216468811035156, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -281.98126220703125, "loss": 1.053, "margin_dpo/margin_mean": 90.87483215332031, "margin_dpo/margin_std": 118.72608947753906, "step": 177 }, { "KL/chosen_KL_mean": -104.90020751953125, "KL/mean": -162.3438720703125, "KL/rejected_KL_mean": -219.78750610351562, "KL/std": 130.68798828125, "epoch": 0.26138032305433184, "fcm_dpo/beta": 0.004986546002328396, "fcm_dpo/delta": -0.1844998002052307, "fcm_dpo/margin": 114.88732147216797, "fcm_dpo/q_t": 0.37574303150177, "grad_norm": 27.06403350830078, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.25030016899108887, "logits/rejected": -0.2403268814086914, "logps/chosen": -163.16500854492188, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.3653335571289, "logps/rejected": -325.15283203125, "loss": 1.0007, "margin_dpo/margin_mean": 114.88731384277344, "margin_dpo/margin_std": 142.7379150390625, "step": 178 }, { "KL/chosen_KL_mean": -108.67526245117188, "KL/mean": -149.40911865234375, "KL/rejected_KL_mean": -190.14297485351562, "KL/std": 112.27867126464844, "epoch": 0.26284875183553597, "fcm_dpo/beta": 0.004935364704579115, "fcm_dpo/delta": -0.0025902092456817627, "fcm_dpo/margin": 81.46771240234375, "fcm_dpo/q_t": 0.4115809500217438, "grad_norm": 36.69063949584961, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.3055553138256073, "logits/rejected": -0.315255343914032, "logps/chosen": -169.73358154296875, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -280.6708068847656, "loss": 1.1354, "margin_dpo/margin_mean": 81.46771240234375, "margin_dpo/margin_std": 140.07054138183594, "step": 179 }, { "KL/chosen_KL_mean": -93.43641662597656, "KL/mean": -145.72601318359375, "KL/rejected_KL_mean": -198.01556396484375, "KL/std": 100.83357238769531, "epoch": 0.2643171806167401, "fcm_dpo/beta": 0.0048674289137125015, "fcm_dpo/delta": -0.11503924429416656, "fcm_dpo/margin": 104.57914733886719, "fcm_dpo/q_t": 0.38437995314598083, "grad_norm": 20.31671142578125, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.28088757395744324, "logits/rejected": -0.27398407459259033, "logps/chosen": -147.77914428710938, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -296.2274169921875, "loss": 1.0254, "margin_dpo/margin_mean": 104.57914733886719, "margin_dpo/margin_std": 125.9128646850586, "step": 180 }, { "KL/chosen_KL_mean": -81.22648620605469, "KL/mean": -113.69107818603516, "KL/rejected_KL_mean": -146.1556854248047, "KL/std": 92.112060546875, "epoch": 0.2657856093979442, "fcm_dpo/beta": 0.00491193775087595, "fcm_dpo/delta": 0.0835873931646347, "fcm_dpo/margin": 64.92919158935547, "fcm_dpo/q_t": 0.42752861976623535, "grad_norm": 17.186668395996094, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.3439704179763794, "logits/rejected": -0.32105350494384766, "logps/chosen": -136.22694396972656, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -207.81185913085938, "loss": 1.1687, "margin_dpo/margin_mean": 64.92919921875, "margin_dpo/margin_std": 116.01119995117188, "step": 181 }, { "KL/chosen_KL_mean": -77.05168151855469, "KL/mean": -133.05014038085938, "KL/rejected_KL_mean": -189.04859924316406, "KL/std": 110.59321594238281, "epoch": 0.26725403817914833, "fcm_dpo/beta": 0.0048008207231760025, "fcm_dpo/delta": -0.14640963077545166, "fcm_dpo/margin": 111.99693298339844, "fcm_dpo/q_t": 0.3778771162033081, "grad_norm": 19.202186584472656, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.30555886030197144, "logits/rejected": -0.3154027462005615, "logps/chosen": -118.15953063964844, "logps/ref_chosen": -41.107852935791016, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -278.5701904296875, "loss": 1.0138, "margin_dpo/margin_mean": 111.99693298339844, "margin_dpo/margin_std": 133.52532958984375, "step": 182 }, { "KL/chosen_KL_mean": -114.24200439453125, "KL/mean": -144.60440063476562, "KL/rejected_KL_mean": -174.966796875, "KL/std": 92.102294921875, "epoch": 0.2687224669603524, "fcm_dpo/beta": 0.00474231131374836, "fcm_dpo/delta": -0.043582916259765625, "fcm_dpo/margin": 60.724796295166016, "fcm_dpo/q_t": 0.4329318106174469, "grad_norm": 21.197261810302734, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.32367801666259766, "logits/rejected": -0.31501567363739014, "logps/chosen": -171.76657104492188, "logps/ref_chosen": -57.52456283569336, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -250.94252014160156, "loss": 1.1824, "margin_dpo/margin_mean": 60.72479248046875, "margin_dpo/margin_std": 101.34217834472656, "step": 183 }, { "KL/chosen_KL_mean": -94.91139221191406, "KL/mean": -126.96908569335938, "KL/rejected_KL_mean": -159.0267791748047, "KL/std": 86.96229553222656, "epoch": 0.2701908957415565, "fcm_dpo/beta": 0.004741538781672716, "fcm_dpo/delta": -0.00162951136007905, "fcm_dpo/margin": 64.11538696289062, "fcm_dpo/q_t": 0.430799275636673, "grad_norm": 20.073440551757812, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.35681042075157166, "logits/rejected": -0.34759992361068726, "logps/chosen": -153.45635986328125, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -235.6608428955078, "loss": 1.1707, "margin_dpo/margin_mean": 64.11538696289062, "margin_dpo/margin_std": 109.54376220703125, "step": 184 }, { "KL/chosen_KL_mean": -102.1708755493164, "KL/mean": -127.47833251953125, "KL/rejected_KL_mean": -152.78579711914062, "KL/std": 99.92794799804688, "epoch": 0.27165932452276065, "fcm_dpo/beta": 0.004851914010941982, "fcm_dpo/delta": 0.15838554501533508, "fcm_dpo/margin": 50.61490249633789, "fcm_dpo/q_t": 0.44705960154533386, "grad_norm": 19.03368377685547, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.33275556564331055, "logits/rejected": -0.3096786439418793, "logps/chosen": -164.19671630859375, "logps/ref_chosen": -62.025848388671875, "logps/ref_rejected": -73.7625961303711, "logps/rejected": -226.5483856201172, "loss": 1.2336, "margin_dpo/margin_mean": 50.61490249633789, "margin_dpo/margin_std": 120.33627319335938, "step": 185 }, { "KL/chosen_KL_mean": -93.70341491699219, "KL/mean": -141.43142700195312, "KL/rejected_KL_mean": -189.159423828125, "KL/std": 100.74044799804688, "epoch": 0.27312775330396477, "fcm_dpo/beta": 0.00484071671962738, "fcm_dpo/delta": -0.06518108397722244, "fcm_dpo/margin": 95.45602416992188, "fcm_dpo/q_t": 0.39502984285354614, "grad_norm": 30.832712173461914, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.35813000798225403, "logits/rejected": -0.3332071304321289, "logps/chosen": -163.056884765625, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -277.23187255859375, "loss": 1.0476, "margin_dpo/margin_mean": 95.45602416992188, "margin_dpo/margin_std": 115.35481262207031, "step": 186 }, { "KL/chosen_KL_mean": -88.89076232910156, "KL/mean": -128.86146545410156, "KL/rejected_KL_mean": -168.83216857910156, "KL/std": 96.70646667480469, "epoch": 0.2745961820851689, "fcm_dpo/beta": 0.004859459586441517, "fcm_dpo/delta": 0.011555861681699753, "fcm_dpo/margin": 79.94140625, "fcm_dpo/q_t": 0.4108119606971741, "grad_norm": 24.247724533081055, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.32936474680900574, "logits/rejected": -0.3230019807815552, "logps/chosen": -141.647216796875, "logps/ref_chosen": -52.7564582824707, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -250.80126953125, "loss": 1.0976, "margin_dpo/margin_mean": 79.94140625, "margin_dpo/margin_std": 105.25481414794922, "step": 187 }, { "KL/chosen_KL_mean": -83.75914001464844, "KL/mean": -132.92103576660156, "KL/rejected_KL_mean": -182.08291625976562, "KL/std": 108.79667663574219, "epoch": 0.27606461086637296, "fcm_dpo/beta": 0.004792365245521069, "fcm_dpo/delta": -0.07467129826545715, "fcm_dpo/margin": 98.32379150390625, "fcm_dpo/q_t": 0.393817663192749, "grad_norm": 34.55025863647461, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.33150649070739746, "logits/rejected": -0.338370680809021, "logps/chosen": -133.17462158203125, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -271.62335205078125, "loss": 1.0429, "margin_dpo/margin_mean": 98.32378387451172, "margin_dpo/margin_std": 119.06608581542969, "step": 188 }, { "KL/chosen_KL_mean": -99.01392364501953, "KL/mean": -138.84449768066406, "KL/rejected_KL_mean": -178.67507934570312, "KL/std": 109.14806365966797, "epoch": 0.2775330396475771, "fcm_dpo/beta": 0.00478787487372756, "fcm_dpo/delta": 0.019290301948785782, "fcm_dpo/margin": 79.6611328125, "fcm_dpo/q_t": 0.41624516248703003, "grad_norm": 29.447795867919922, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.334136962890625, "logits/rejected": -0.31781691312789917, "logps/chosen": -151.4128875732422, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -250.84242248535156, "loss": 1.1305, "margin_dpo/margin_mean": 79.6611328125, "margin_dpo/margin_std": 133.5395050048828, "step": 189 }, { "KL/chosen_KL_mean": -105.0693359375, "KL/mean": -152.8391876220703, "KL/rejected_KL_mean": -200.60903930664062, "KL/std": 116.60220336914062, "epoch": 0.2790014684287812, "fcm_dpo/beta": 0.004786365665495396, "fcm_dpo/delta": -0.061055850237607956, "fcm_dpo/margin": 95.53968811035156, "fcm_dpo/q_t": 0.39865192770957947, "grad_norm": 18.84038543701172, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.34232112765312195, "logits/rejected": -0.3296660780906677, "logps/chosen": -169.75241088867188, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -303.1595458984375, "loss": 1.0824, "margin_dpo/margin_mean": 95.53968811035156, "margin_dpo/margin_std": 135.1875, "step": 190 }, { "KL/chosen_KL_mean": -98.90615844726562, "KL/mean": -167.89767456054688, "KL/rejected_KL_mean": -236.88919067382812, "KL/std": 137.1860809326172, "epoch": 0.28046989720998533, "fcm_dpo/beta": 0.0045428648591041565, "fcm_dpo/delta": -0.2434038668870926, "fcm_dpo/margin": 137.9830322265625, "fcm_dpo/q_t": 0.36233189702033997, "grad_norm": 21.52570152282715, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.34583958983421326, "logits/rejected": -0.3241385817527771, "logps/chosen": -167.56503295898438, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -347.02886962890625, "loss": 0.9544, "margin_dpo/margin_mean": 137.9830322265625, "margin_dpo/margin_std": 151.53329467773438, "step": 191 }, { "KL/chosen_KL_mean": -127.5339126586914, "KL/mean": -173.3841552734375, "KL/rejected_KL_mean": -219.23440551757812, "KL/std": 119.96187591552734, "epoch": 0.28193832599118945, "fcm_dpo/beta": 0.004507323727011681, "fcm_dpo/delta": -0.014000019058585167, "fcm_dpo/margin": 91.70048522949219, "fcm_dpo/q_t": 0.4093227982521057, "grad_norm": 25.540145874023438, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.2959958016872406, "logits/rejected": -0.296117901802063, "logps/chosen": -197.26083374023438, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -322.55572509765625, "loss": 1.115, "margin_dpo/margin_mean": 91.70048522949219, "margin_dpo/margin_std": 148.0252685546875, "step": 192 }, { "KL/chosen_KL_mean": -127.05119323730469, "KL/mean": -153.37745666503906, "KL/rejected_KL_mean": -179.70370483398438, "KL/std": 109.23312377929688, "epoch": 0.2834067547723935, "fcm_dpo/beta": 0.004510689992457628, "fcm_dpo/delta": 0.040593214333057404, "fcm_dpo/margin": 52.65251159667969, "fcm_dpo/q_t": 0.44376885890960693, "grad_norm": 29.541507720947266, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.2886780798435211, "logits/rejected": -0.27803605794906616, "logps/chosen": -187.24169921875, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -256.11126708984375, "loss": 1.2572, "margin_dpo/margin_mean": 52.65251159667969, "margin_dpo/margin_std": 139.58816528320312, "step": 193 }, { "KL/chosen_KL_mean": -78.30635833740234, "KL/mean": -121.43690490722656, "KL/rejected_KL_mean": -164.56744384765625, "KL/std": 90.03581237792969, "epoch": 0.28487518355359764, "fcm_dpo/beta": 0.004522847011685371, "fcm_dpo/delta": 0.01011504977941513, "fcm_dpo/margin": 86.26107788085938, "fcm_dpo/q_t": 0.40855488181114197, "grad_norm": 18.051904678344727, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.25177642703056335, "logits/rejected": -0.2375318706035614, "logps/chosen": -116.146728515625, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -225.25222778320312, "loss": 1.0818, "margin_dpo/margin_mean": 86.26107788085938, "margin_dpo/margin_std": 98.7254638671875, "step": 194 }, { "KL/chosen_KL_mean": -125.2491226196289, "KL/mean": -171.83770751953125, "KL/rejected_KL_mean": -218.42633056640625, "KL/std": 114.07196807861328, "epoch": 0.28634361233480177, "fcm_dpo/beta": 0.004522291943430901, "fcm_dpo/delta": -0.022319436073303223, "fcm_dpo/margin": 93.17718505859375, "fcm_dpo/q_t": 0.4035479426383972, "grad_norm": 21.63848876953125, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.2544304132461548, "logits/rejected": -0.27329152822494507, "logps/chosen": -180.14068603515625, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -315.197265625, "loss": 1.0745, "margin_dpo/margin_mean": 93.17718505859375, "margin_dpo/margin_std": 118.36261749267578, "step": 195 }, { "KL/chosen_KL_mean": -96.55429077148438, "KL/mean": -149.50570678710938, "KL/rejected_KL_mean": -202.45712280273438, "KL/std": 115.13066101074219, "epoch": 0.2878120411160059, "fcm_dpo/beta": 0.0044434089213609695, "fcm_dpo/delta": -0.07507769018411636, "fcm_dpo/margin": 105.90283966064453, "fcm_dpo/q_t": 0.3950856924057007, "grad_norm": 17.6580753326416, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.29589658975601196, "logits/rejected": -0.2812860608100891, "logps/chosen": -149.79953002929688, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -278.51007080078125, "loss": 1.0612, "margin_dpo/margin_mean": 105.90283966064453, "margin_dpo/margin_std": 138.68316650390625, "step": 196 }, { "KL/chosen_KL_mean": -98.7095947265625, "KL/mean": -139.4966278076172, "KL/rejected_KL_mean": -180.28367614746094, "KL/std": 99.37328338623047, "epoch": 0.28928046989721, "fcm_dpo/beta": 0.004487765487283468, "fcm_dpo/delta": 0.03489822521805763, "fcm_dpo/margin": 81.57408142089844, "fcm_dpo/q_t": 0.41627001762390137, "grad_norm": 18.291038513183594, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.28296738862991333, "logits/rejected": -0.27726900577545166, "logps/chosen": -159.1299285888672, "logps/ref_chosen": -60.42033386230469, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -257.4925842285156, "loss": 1.113, "margin_dpo/margin_mean": 81.5740737915039, "margin_dpo/margin_std": 112.14630889892578, "step": 197 }, { "KL/chosen_KL_mean": -111.65481567382812, "KL/mean": -163.402099609375, "KL/rejected_KL_mean": -215.14935302734375, "KL/std": 123.84089660644531, "epoch": 0.2907488986784141, "fcm_dpo/beta": 0.00444161519408226, "fcm_dpo/delta": -0.06254196166992188, "fcm_dpo/margin": 103.49454498291016, "fcm_dpo/q_t": 0.39844024181365967, "grad_norm": 21.28237533569336, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.29576927423477173, "logits/rejected": -0.2956548035144806, "logps/chosen": -166.69100952148438, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -312.3926086425781, "loss": 1.0634, "margin_dpo/margin_mean": 103.49453735351562, "margin_dpo/margin_std": 140.37669372558594, "step": 198 }, { "KL/chosen_KL_mean": -105.4992904663086, "KL/mean": -157.50863647460938, "KL/rejected_KL_mean": -209.5179901123047, "KL/std": 112.66731262207031, "epoch": 0.2922173274596182, "fcm_dpo/beta": 0.004363642539829016, "fcm_dpo/delta": -0.057444989681243896, "fcm_dpo/margin": 104.01869201660156, "fcm_dpo/q_t": 0.39733150601387024, "grad_norm": 25.73158836364746, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.3182041049003601, "logits/rejected": -0.3008995056152344, "logps/chosen": -162.328125, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -294.16619873046875, "loss": 1.0671, "margin_dpo/margin_mean": 104.01869201660156, "margin_dpo/margin_std": 136.8724365234375, "step": 199 }, { "KL/chosen_KL_mean": -104.5189208984375, "KL/mean": -155.94215393066406, "KL/rejected_KL_mean": -207.36537170410156, "KL/std": 121.28309631347656, "epoch": 0.2936857562408223, "fcm_dpo/beta": 0.004329666495323181, "fcm_dpo/delta": -0.047804687172174454, "fcm_dpo/margin": 102.84647369384766, "fcm_dpo/q_t": 0.4015011191368103, "grad_norm": 21.32215690612793, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.2771759629249573, "logits/rejected": -0.25995227694511414, "logps/chosen": -157.58596801757812, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -287.97381591796875, "loss": 1.091, "margin_dpo/margin_mean": 102.84646606445312, "margin_dpo/margin_std": 154.68792724609375, "step": 200 }, { "KL/chosen_KL_mean": -107.73637390136719, "KL/mean": -158.36456298828125, "KL/rejected_KL_mean": -208.99273681640625, "KL/std": 125.75869750976562, "epoch": 0.29515418502202645, "fcm_dpo/beta": 0.004319292958825827, "fcm_dpo/delta": -0.03909054771065712, "fcm_dpo/margin": 101.25636291503906, "fcm_dpo/q_t": 0.4014075696468353, "grad_norm": 19.30495262145996, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.3616677224636078, "logits/rejected": -0.3575963079929352, "logps/chosen": -183.1385955810547, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -323.80096435546875, "loss": 1.0822, "margin_dpo/margin_mean": 101.25636291503906, "margin_dpo/margin_std": 141.83740234375, "step": 201 }, { "KL/chosen_KL_mean": -111.09174346923828, "KL/mean": -147.0572509765625, "KL/rejected_KL_mean": -183.02273559570312, "KL/std": 109.2310791015625, "epoch": 0.2966226138032305, "fcm_dpo/beta": 0.004360673949122429, "fcm_dpo/delta": 0.08905763924121857, "fcm_dpo/margin": 71.93099975585938, "fcm_dpo/q_t": 0.4304364323616028, "grad_norm": 21.131303787231445, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.3155418336391449, "logits/rejected": -0.32926225662231445, "logps/chosen": -161.19305419921875, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -270.0077819824219, "loss": 1.1852, "margin_dpo/margin_mean": 71.93099975585938, "margin_dpo/margin_std": 141.54080200195312, "step": 202 }, { "KL/chosen_KL_mean": -109.16009521484375, "KL/mean": -153.531005859375, "KL/rejected_KL_mean": -197.90194702148438, "KL/std": 110.72138977050781, "epoch": 0.29809104258443464, "fcm_dpo/beta": 0.004380302503705025, "fcm_dpo/delta": 0.011728717014193535, "fcm_dpo/margin": 88.74185180664062, "fcm_dpo/q_t": 0.41108816862106323, "grad_norm": 20.558147430419922, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.3222927153110504, "logits/rejected": -0.3095286190509796, "logps/chosen": -169.769775390625, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -283.79791259765625, "loss": 1.1006, "margin_dpo/margin_mean": 88.74185180664062, "margin_dpo/margin_std": 121.73361206054688, "step": 203 }, { "KL/chosen_KL_mean": -121.08845520019531, "KL/mean": -162.0384521484375, "KL/rejected_KL_mean": -202.98841857910156, "KL/std": 120.96675109863281, "epoch": 0.29955947136563876, "fcm_dpo/beta": 0.0044115157797932625, "fcm_dpo/delta": 0.04016388952732086, "fcm_dpo/margin": 81.89998626708984, "fcm_dpo/q_t": 0.4205179810523987, "grad_norm": 21.446792602539062, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.42557811737060547, "logits/rejected": -0.3860868215560913, "logps/chosen": -201.25341796875, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -290.684326171875, "loss": 1.1481, "margin_dpo/margin_mean": 81.89998626708984, "margin_dpo/margin_std": 141.7510528564453, "step": 204 }, { "KL/chosen_KL_mean": -117.55274200439453, "KL/mean": -171.8702392578125, "KL/rejected_KL_mean": -226.18775939941406, "KL/std": 123.94536590576172, "epoch": 0.3010279001468429, "fcm_dpo/beta": 0.0043370481580495834, "fcm_dpo/delta": -0.07669728994369507, "fcm_dpo/margin": 108.635009765625, "fcm_dpo/q_t": 0.39190131425857544, "grad_norm": 22.160913467407227, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.3477054834365845, "logits/rejected": -0.3181983232498169, "logps/chosen": -176.93746948242188, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -311.31280517578125, "loss": 1.0557, "margin_dpo/margin_mean": 108.635009765625, "margin_dpo/margin_std": 135.24069213867188, "step": 205 }, { "KL/chosen_KL_mean": -107.3229751586914, "KL/mean": -165.88604736328125, "KL/rejected_KL_mean": -224.44912719726562, "KL/std": 115.81060791015625, "epoch": 0.302496328928047, "fcm_dpo/beta": 0.004273426253348589, "fcm_dpo/delta": -0.10639244318008423, "fcm_dpo/margin": 117.12614440917969, "fcm_dpo/q_t": 0.3835982382297516, "grad_norm": 24.27658462524414, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.25635069608688354, "logits/rejected": -0.2581319212913513, "logps/chosen": -154.2874755859375, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -323.4026184082031, "loss": 1.0153, "margin_dpo/margin_mean": 117.12614440917969, "margin_dpo/margin_std": 126.34098052978516, "step": 206 }, { "KL/chosen_KL_mean": -99.67330932617188, "KL/mean": -166.9073028564453, "KL/rejected_KL_mean": -234.1413116455078, "KL/std": 132.98760986328125, "epoch": 0.3039647577092511, "fcm_dpo/beta": 0.0041627888567745686, "fcm_dpo/delta": -0.1690816581249237, "fcm_dpo/margin": 134.46800231933594, "fcm_dpo/q_t": 0.37616121768951416, "grad_norm": 23.397769927978516, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.3611038029193878, "logits/rejected": -0.3351825773715973, "logps/chosen": -155.7295684814453, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -318.589111328125, "loss": 0.993, "margin_dpo/margin_mean": 134.46800231933594, "margin_dpo/margin_std": 155.12615966796875, "step": 207 }, { "KL/chosen_KL_mean": -153.80804443359375, "KL/mean": -207.006591796875, "KL/rejected_KL_mean": -260.2052001953125, "KL/std": 128.0139617919922, "epoch": 0.3054331864904552, "fcm_dpo/beta": 0.004091139882802963, "fcm_dpo/delta": -0.037054985761642456, "fcm_dpo/margin": 106.39715576171875, "fcm_dpo/q_t": 0.40143436193466187, "grad_norm": 26.311290740966797, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.37917637825012207, "logits/rejected": -0.3616452217102051, "logps/chosen": -220.87564086914062, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -354.4920654296875, "loss": 1.0912, "margin_dpo/margin_mean": 106.39714813232422, "margin_dpo/margin_std": 155.67181396484375, "step": 208 }, { "KL/chosen_KL_mean": -130.76773071289062, "KL/mean": -177.64361572265625, "KL/rejected_KL_mean": -224.51953125, "KL/std": 116.13549041748047, "epoch": 0.3069016152716593, "fcm_dpo/beta": 0.004103041719645262, "fcm_dpo/delta": 0.01593739353120327, "fcm_dpo/margin": 93.7518081665039, "fcm_dpo/q_t": 0.4129374623298645, "grad_norm": 29.005294799804688, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.28290677070617676, "logits/rejected": -0.27193692326545715, "logps/chosen": -186.9494171142578, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -305.4610595703125, "loss": 1.1266, "margin_dpo/margin_mean": 93.75180053710938, "margin_dpo/margin_std": 147.88401794433594, "step": 209 }, { "KL/chosen_KL_mean": -119.01356506347656, "KL/mean": -173.27197265625, "KL/rejected_KL_mean": -227.5303955078125, "KL/std": 119.99595642089844, "epoch": 0.30837004405286345, "fcm_dpo/beta": 0.0040941243059933186, "fcm_dpo/delta": -0.04651525244116783, "fcm_dpo/margin": 108.5168228149414, "fcm_dpo/q_t": 0.400329053401947, "grad_norm": 26.729955673217773, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.2923848628997803, "logits/rejected": -0.2843049168586731, "logps/chosen": -165.3853759765625, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -304.2120056152344, "loss": 1.0743, "margin_dpo/margin_mean": 108.51681518554688, "margin_dpo/margin_std": 146.48370361328125, "step": 210 }, { "KL/chosen_KL_mean": -171.04937744140625, "KL/mean": -217.49700927734375, "KL/rejected_KL_mean": -263.94464111328125, "KL/std": 137.9586944580078, "epoch": 0.30983847283406757, "fcm_dpo/beta": 0.004082635045051575, "fcm_dpo/delta": 0.021537447348237038, "fcm_dpo/margin": 92.89524841308594, "fcm_dpo/q_t": 0.41833657026290894, "grad_norm": 36.12271499633789, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.3229391574859619, "logits/rejected": -0.2848234474658966, "logps/chosen": -249.98175048828125, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -350.765625, "loss": 1.1436, "margin_dpo/margin_mean": 92.89524841308594, "margin_dpo/margin_std": 161.31678771972656, "step": 211 }, { "KL/chosen_KL_mean": -140.74990844726562, "KL/mean": -206.79635620117188, "KL/rejected_KL_mean": -272.84283447265625, "KL/std": 147.50723266601562, "epoch": 0.31130690161527164, "fcm_dpo/beta": 0.003975285217165947, "fcm_dpo/delta": -0.13405390083789825, "fcm_dpo/margin": 132.0928955078125, "fcm_dpo/q_t": 0.3824244737625122, "grad_norm": 24.483768463134766, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.2860155701637268, "logits/rejected": -0.27637436985969543, "logps/chosen": -198.94692993164062, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05785369873047, "logps/rejected": -375.9006652832031, "loss": 1.0326, "margin_dpo/margin_mean": 132.0928955078125, "margin_dpo/margin_std": 163.81600952148438, "step": 212 }, { "KL/chosen_KL_mean": -130.84754943847656, "KL/mean": -193.90252685546875, "KL/rejected_KL_mean": -256.9574890136719, "KL/std": 128.05874633789062, "epoch": 0.31277533039647576, "fcm_dpo/beta": 0.0039049675688147545, "fcm_dpo/delta": -0.09785507619380951, "fcm_dpo/margin": 126.10995483398438, "fcm_dpo/q_t": 0.38795384764671326, "grad_norm": 35.1922607421875, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.3229708671569824, "logits/rejected": -0.29631006717681885, "logps/chosen": -198.36026000976562, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -350.8721923828125, "loss": 1.0354, "margin_dpo/margin_mean": 126.10995483398438, "margin_dpo/margin_std": 152.4547119140625, "step": 213 }, { "KL/chosen_KL_mean": -112.09881591796875, "KL/mean": -168.9818115234375, "KL/rejected_KL_mean": -225.86480712890625, "KL/std": 126.0855712890625, "epoch": 0.3142437591776799, "fcm_dpo/beta": 0.003874241840094328, "fcm_dpo/delta": -0.04264100641012192, "fcm_dpo/margin": 113.76599884033203, "fcm_dpo/q_t": 0.4002940356731415, "grad_norm": 24.354455947875977, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.23961499333381653, "logits/rejected": -0.24341589212417603, "logps/chosen": -153.70370483398438, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -303.3822326660156, "loss": 1.0675, "margin_dpo/margin_mean": 113.76599884033203, "margin_dpo/margin_std": 146.8253173828125, "step": 214 }, { "KL/chosen_KL_mean": -131.69699096679688, "KL/mean": -190.83827209472656, "KL/rejected_KL_mean": -249.97955322265625, "KL/std": 130.4374237060547, "epoch": 0.315712187958884, "fcm_dpo/beta": 0.0038247781340032816, "fcm_dpo/delta": -0.05516364052891731, "fcm_dpo/margin": 118.28255462646484, "fcm_dpo/q_t": 0.3958727717399597, "grad_norm": 24.70524787902832, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.2865249514579773, "logits/rejected": -0.2768559455871582, "logps/chosen": -184.97625732421875, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -339.9442138671875, "loss": 1.0476, "margin_dpo/margin_mean": 118.28255462646484, "margin_dpo/margin_std": 136.10702514648438, "step": 215 }, { "KL/chosen_KL_mean": -133.49032592773438, "KL/mean": -192.19558715820312, "KL/rejected_KL_mean": -250.90084838867188, "KL/std": 134.2762451171875, "epoch": 0.31718061674008813, "fcm_dpo/beta": 0.0038109051529318094, "fcm_dpo/delta": -0.04979248717427254, "fcm_dpo/margin": 117.41053009033203, "fcm_dpo/q_t": 0.3997488021850586, "grad_norm": 23.230796813964844, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.3021494150161743, "logits/rejected": -0.28650131821632385, "logps/chosen": -182.37811279296875, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -328.09979248046875, "loss": 1.0813, "margin_dpo/margin_mean": 117.4105224609375, "margin_dpo/margin_std": 166.27999877929688, "step": 216 }, { "KL/chosen_KL_mean": -135.45867919921875, "KL/mean": -203.8918914794922, "KL/rejected_KL_mean": -272.3250732421875, "KL/std": 136.49188232421875, "epoch": 0.3186490455212922, "fcm_dpo/beta": 0.003715306520462036, "fcm_dpo/delta": -0.11458480358123779, "fcm_dpo/margin": 136.8664093017578, "fcm_dpo/q_t": 0.3847663104534149, "grad_norm": 21.052268981933594, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.270561158657074, "logits/rejected": -0.26448899507522583, "logps/chosen": -185.30398559570312, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -372.4034118652344, "loss": 1.0148, "margin_dpo/margin_mean": 136.8664093017578, "margin_dpo/margin_std": 154.27218627929688, "step": 217 }, { "KL/chosen_KL_mean": -143.15106201171875, "KL/mean": -196.36883544921875, "KL/rejected_KL_mean": -249.58657836914062, "KL/std": 135.85345458984375, "epoch": 0.3201174743024963, "fcm_dpo/beta": 0.0036980193108320236, "fcm_dpo/delta": 0.0066223908215761185, "fcm_dpo/margin": 106.43551635742188, "fcm_dpo/q_t": 0.41142043471336365, "grad_norm": 20.975133895874023, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.2958596646785736, "logits/rejected": -0.28984978795051575, "logps/chosen": -201.72775268554688, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -337.4329833984375, "loss": 1.1123, "margin_dpo/margin_mean": 106.43551635742188, "margin_dpo/margin_std": 159.1250457763672, "step": 218 }, { "KL/chosen_KL_mean": -152.82916259765625, "KL/mean": -199.12118530273438, "KL/rejected_KL_mean": -245.41322326660156, "KL/std": 141.51058959960938, "epoch": 0.32158590308370044, "fcm_dpo/beta": 0.0037533333525061607, "fcm_dpo/delta": 0.05362574756145477, "fcm_dpo/margin": 92.58407592773438, "fcm_dpo/q_t": 0.4223693311214447, "grad_norm": 29.867891311645508, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.31183797121047974, "logits/rejected": -0.30414023995399475, "logps/chosen": -213.9130096435547, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -331.24365234375, "loss": 1.1677, "margin_dpo/margin_mean": 92.58406066894531, "margin_dpo/margin_std": 172.917236328125, "step": 219 }, { "KL/chosen_KL_mean": -172.810791015625, "KL/mean": -211.48831176757812, "KL/rejected_KL_mean": -250.16583251953125, "KL/std": 122.16136169433594, "epoch": 0.32305433186490456, "fcm_dpo/beta": 0.0037990869022905827, "fcm_dpo/delta": 0.10947298258543015, "fcm_dpo/margin": 77.35502624511719, "fcm_dpo/q_t": 0.43133461475372314, "grad_norm": 25.090055465698242, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.2918081283569336, "logits/rejected": -0.26791825890541077, "logps/chosen": -242.8420867919922, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -337.8513488769531, "loss": 1.1694, "margin_dpo/margin_mean": 77.35502624511719, "margin_dpo/margin_std": 126.34854888916016, "step": 220 }, { "KL/chosen_KL_mean": -147.82846069335938, "KL/mean": -233.2677764892578, "KL/rejected_KL_mean": -318.70709228515625, "KL/std": 156.06951904296875, "epoch": 0.3245227606461087, "fcm_dpo/beta": 0.003700793255120516, "fcm_dpo/delta": -0.24753707647323608, "fcm_dpo/margin": 170.87864685058594, "fcm_dpo/q_t": 0.35590487718582153, "grad_norm": 26.41898536682129, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.2831432819366455, "logits/rejected": -0.2893243730068207, "logps/chosen": -199.983154296875, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -425.1748046875, "loss": 0.9327, "margin_dpo/margin_mean": 170.878662109375, "margin_dpo/margin_std": 160.57461547851562, "step": 221 }, { "KL/chosen_KL_mean": -155.30770874023438, "KL/mean": -219.45260620117188, "KL/rejected_KL_mean": -283.5975341796875, "KL/std": 144.82785034179688, "epoch": 0.32599118942731276, "fcm_dpo/beta": 0.003622027114033699, "fcm_dpo/delta": -0.06790776550769806, "fcm_dpo/margin": 128.28977966308594, "fcm_dpo/q_t": 0.39438772201538086, "grad_norm": 20.189659118652344, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.31003278493881226, "logits/rejected": -0.31088048219680786, "logps/chosen": -216.27880859375, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -383.5986633300781, "loss": 1.0542, "margin_dpo/margin_mean": 128.28977966308594, "margin_dpo/margin_std": 161.27833557128906, "step": 222 }, { "KL/chosen_KL_mean": -164.61181640625, "KL/mean": -218.14041137695312, "KL/rejected_KL_mean": -271.66900634765625, "KL/std": 140.00912475585938, "epoch": 0.3274596182085169, "fcm_dpo/beta": 0.003598616225644946, "fcm_dpo/delta": 0.01532889436930418, "fcm_dpo/margin": 107.05719757080078, "fcm_dpo/q_t": 0.4134790301322937, "grad_norm": 24.665771484375, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.252638041973114, "logits/rejected": -0.24833783507347107, "logps/chosen": -217.2523956298828, "logps/ref_chosen": -52.64057540893555, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -354.4940185546875, "loss": 1.1327, "margin_dpo/margin_mean": 107.05718994140625, "margin_dpo/margin_std": 178.87998962402344, "step": 223 }, { "KL/chosen_KL_mean": -146.0878143310547, "KL/mean": -208.273681640625, "KL/rejected_KL_mean": -270.4595642089844, "KL/std": 157.24331665039062, "epoch": 0.328928046989721, "fcm_dpo/beta": 0.003553580492734909, "fcm_dpo/delta": -0.04542648792266846, "fcm_dpo/margin": 124.37174987792969, "fcm_dpo/q_t": 0.4029054641723633, "grad_norm": 26.398706436157227, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.27615243196487427, "logits/rejected": -0.25664016604423523, "logps/chosen": -194.6832275390625, "logps/ref_chosen": -48.59541320800781, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -347.5760498046875, "loss": 1.089, "margin_dpo/margin_mean": 124.37174987792969, "margin_dpo/margin_std": 181.30401611328125, "step": 224 }, { "KL/chosen_KL_mean": -169.44223022460938, "KL/mean": -242.9072265625, "KL/rejected_KL_mean": -316.37225341796875, "KL/std": 153.35971069335938, "epoch": 0.3303964757709251, "fcm_dpo/beta": 0.003504401072859764, "fcm_dpo/delta": -0.12130744010210037, "fcm_dpo/margin": 146.93002319335938, "fcm_dpo/q_t": 0.38479962944984436, "grad_norm": 20.279191970825195, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.2664515972137451, "logits/rejected": -0.2539185881614685, "logps/chosen": -227.44268798828125, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90291595458984, "logps/rejected": -416.275146484375, "loss": 1.0314, "margin_dpo/margin_mean": 146.93002319335938, "margin_dpo/margin_std": 186.25558471679688, "step": 225 }, { "KL/chosen_KL_mean": -140.94992065429688, "KL/mean": -200.4373321533203, "KL/rejected_KL_mean": -259.92474365234375, "KL/std": 148.1170654296875, "epoch": 0.33186490455212925, "fcm_dpo/beta": 0.0034582829102873802, "fcm_dpo/delta": -0.01282452791929245, "fcm_dpo/margin": 118.97482299804688, "fcm_dpo/q_t": 0.4084652364253998, "grad_norm": 25.379802703857422, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.35269731283187866, "logits/rejected": -0.31248384714126587, "logps/chosen": -199.84872436523438, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -338.61248779296875, "loss": 1.0981, "margin_dpo/margin_mean": 118.9748306274414, "margin_dpo/margin_std": 169.6277313232422, "step": 226 }, { "KL/chosen_KL_mean": -163.4235382080078, "KL/mean": -235.5128173828125, "KL/rejected_KL_mean": -307.60211181640625, "KL/std": 166.22787475585938, "epoch": 0.3333333333333333, "fcm_dpo/beta": 0.0034146863035857677, "fcm_dpo/delta": -0.09776041656732559, "fcm_dpo/margin": 144.17855834960938, "fcm_dpo/q_t": 0.38793981075286865, "grad_norm": 25.049428939819336, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.37827032804489136, "logits/rejected": -0.3669503331184387, "logps/chosen": -222.4957275390625, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -407.0144958496094, "loss": 1.0323, "margin_dpo/margin_mean": 144.17855834960938, "margin_dpo/margin_std": 172.8520050048828, "step": 227 }, { "KL/chosen_KL_mean": -163.7899169921875, "KL/mean": -215.34075927734375, "KL/rejected_KL_mean": -266.8916015625, "KL/std": 135.03140258789062, "epoch": 0.33480176211453744, "fcm_dpo/beta": 0.0034392657689750195, "fcm_dpo/delta": 0.046966154128313065, "fcm_dpo/margin": 103.10169982910156, "fcm_dpo/q_t": 0.4187896251678467, "grad_norm": 24.645570755004883, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.35644814372062683, "logits/rejected": -0.3388446569442749, "logps/chosen": -229.6811981201172, "logps/ref_chosen": -65.89128875732422, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -357.94036865234375, "loss": 1.139, "margin_dpo/margin_mean": 103.1017074584961, "margin_dpo/margin_std": 165.435546875, "step": 228 }, { "KL/chosen_KL_mean": -158.61346435546875, "KL/mean": -213.9613037109375, "KL/rejected_KL_mean": -269.30914306640625, "KL/std": 150.13076782226562, "epoch": 0.33627019089574156, "fcm_dpo/beta": 0.003454534336924553, "fcm_dpo/delta": 0.018272558227181435, "fcm_dpo/margin": 110.69569396972656, "fcm_dpo/q_t": 0.41169029474258423, "grad_norm": 30.438304901123047, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.44706737995147705, "logits/rejected": -0.41668009757995605, "logps/chosen": -229.31982421875, "logps/ref_chosen": -70.70637512207031, "logps/ref_rejected": -84.52741241455078, "logps/rejected": -353.8365478515625, "loss": 1.1093, "margin_dpo/margin_mean": 110.69569396972656, "margin_dpo/margin_std": 158.08216857910156, "step": 229 }, { "KL/chosen_KL_mean": -118.41344451904297, "KL/mean": -198.5885772705078, "KL/rejected_KL_mean": -278.76373291015625, "KL/std": 142.54483032226562, "epoch": 0.3377386196769457, "fcm_dpo/beta": 0.0033752424642443657, "fcm_dpo/delta": -0.1495116651058197, "fcm_dpo/margin": 160.35025024414062, "fcm_dpo/q_t": 0.37446969747543335, "grad_norm": 34.35503005981445, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.33029061555862427, "logits/rejected": -0.3350130319595337, "logps/chosen": -157.69544982910156, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -364.3856201171875, "loss": 0.9808, "margin_dpo/margin_mean": 160.35025024414062, "margin_dpo/margin_std": 157.2830810546875, "step": 230 }, { "KL/chosen_KL_mean": -144.67678833007812, "KL/mean": -201.7078094482422, "KL/rejected_KL_mean": -258.7388610839844, "KL/std": 129.312255859375, "epoch": 0.3392070484581498, "fcm_dpo/beta": 0.0033622784540057182, "fcm_dpo/delta": 0.017147505655884743, "fcm_dpo/margin": 114.06205749511719, "fcm_dpo/q_t": 0.41282835602760315, "grad_norm": 26.74052619934082, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.4000471532344818, "logits/rejected": -0.3726590871810913, "logps/chosen": -207.95321655273438, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -332.86273193359375, "loss": 1.0974, "margin_dpo/margin_mean": 114.06205749511719, "margin_dpo/margin_std": 149.9913330078125, "step": 231 }, { "KL/chosen_KL_mean": -183.89407348632812, "KL/mean": -230.66317749023438, "KL/rejected_KL_mean": -277.4322814941406, "KL/std": 155.87942504882812, "epoch": 0.3406754772393539, "fcm_dpo/beta": 0.0034015290439128876, "fcm_dpo/delta": 0.0846027284860611, "fcm_dpo/margin": 93.53819274902344, "fcm_dpo/q_t": 0.4293164014816284, "grad_norm": 25.34986686706543, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.3858756422996521, "logits/rejected": -0.3642328381538391, "logps/chosen": -254.642822265625, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -361.40936279296875, "loss": 1.1607, "margin_dpo/margin_mean": 93.53819274902344, "margin_dpo/margin_std": 159.80599975585938, "step": 232 }, { "KL/chosen_KL_mean": -168.03411865234375, "KL/mean": -241.22540283203125, "KL/rejected_KL_mean": -314.41668701171875, "KL/std": 164.896728515625, "epoch": 0.342143906020558, "fcm_dpo/beta": 0.0033752245362848043, "fcm_dpo/delta": -0.09889530390501022, "fcm_dpo/margin": 146.382568359375, "fcm_dpo/q_t": 0.3929908871650696, "grad_norm": 26.03354835510254, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.38083887100219727, "logits/rejected": -0.384868860244751, "logps/chosen": -222.91705322265625, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.4800796508789, "logps/rejected": -421.89678955078125, "loss": 1.0648, "margin_dpo/margin_mean": 146.382568359375, "margin_dpo/margin_std": 210.76010131835938, "step": 233 }, { "KL/chosen_KL_mean": -162.310791015625, "KL/mean": -236.50253295898438, "KL/rejected_KL_mean": -310.69427490234375, "KL/std": 145.2280731201172, "epoch": 0.3436123348017621, "fcm_dpo/beta": 0.0032870229333639145, "fcm_dpo/delta": -0.09375564754009247, "fcm_dpo/margin": 148.3834686279297, "fcm_dpo/q_t": 0.38880789279937744, "grad_norm": 51.05461502075195, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.33859947323799133, "logits/rejected": -0.36448922753334045, "logps/chosen": -206.4053192138672, "logps/ref_chosen": -44.094520568847656, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -410.7008972167969, "loss": 1.0276, "margin_dpo/margin_mean": 148.3834686279297, "margin_dpo/margin_std": 165.46307373046875, "step": 234 }, { "KL/chosen_KL_mean": -193.5567626953125, "KL/mean": -242.59243774414062, "KL/rejected_KL_mean": -291.62811279296875, "KL/std": 139.42709350585938, "epoch": 0.34508076358296624, "fcm_dpo/beta": 0.0033407763112336397, "fcm_dpo/delta": 0.07434496283531189, "fcm_dpo/margin": 98.07133483886719, "fcm_dpo/q_t": 0.42465054988861084, "grad_norm": 30.565860748291016, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.4063182473182678, "logits/rejected": -0.388034462928772, "logps/chosen": -255.794677734375, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39506530761719, "logps/rejected": -382.0231628417969, "loss": 1.1602, "margin_dpo/margin_mean": 98.07133483886719, "margin_dpo/margin_std": 167.26129150390625, "step": 235 }, { "KL/chosen_KL_mean": -141.887451171875, "KL/mean": -219.4722900390625, "KL/rejected_KL_mean": -297.0571594238281, "KL/std": 149.41790771484375, "epoch": 0.3465491923641703, "fcm_dpo/beta": 0.003285345621407032, "fcm_dpo/delta": -0.11568379402160645, "fcm_dpo/margin": 155.1697235107422, "fcm_dpo/q_t": 0.38063254952430725, "grad_norm": 56.562007904052734, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.3532152771949768, "logits/rejected": -0.35226863622665405, "logps/chosen": -191.2288055419922, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -400.56878662109375, "loss": 0.9904, "margin_dpo/margin_mean": 155.1697235107422, "margin_dpo/margin_std": 141.39013671875, "step": 236 }, { "KL/chosen_KL_mean": -188.53512573242188, "KL/mean": -248.512451171875, "KL/rejected_KL_mean": -308.48980712890625, "KL/std": 152.22921752929688, "epoch": 0.34801762114537443, "fcm_dpo/beta": 0.0032739704474806786, "fcm_dpo/delta": 0.00752119068056345, "fcm_dpo/margin": 119.95466613769531, "fcm_dpo/q_t": 0.4108693599700928, "grad_norm": 27.002674102783203, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.3559209108352661, "logits/rejected": -0.3510690927505493, "logps/chosen": -242.7032470703125, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -403.2701416015625, "loss": 1.1075, "margin_dpo/margin_mean": 119.95466613769531, "margin_dpo/margin_std": 175.58291625976562, "step": 237 }, { "KL/chosen_KL_mean": -165.1414794921875, "KL/mean": -224.89373779296875, "KL/rejected_KL_mean": -284.64599609375, "KL/std": 151.98275756835938, "epoch": 0.34948604992657856, "fcm_dpo/beta": 0.0032602387946099043, "fcm_dpo/delta": 0.01038980484008789, "fcm_dpo/margin": 119.50453186035156, "fcm_dpo/q_t": 0.4138892889022827, "grad_norm": 23.35243034362793, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.329600989818573, "logits/rejected": -0.3452579975128174, "logps/chosen": -219.11459350585938, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -374.06396484375, "loss": 1.1065, "margin_dpo/margin_mean": 119.50453186035156, "margin_dpo/margin_std": 172.2950439453125, "step": 238 }, { "KL/chosen_KL_mean": -174.6372833251953, "KL/mean": -240.18380737304688, "KL/rejected_KL_mean": -305.7303466796875, "KL/std": 140.24794006347656, "epoch": 0.3509544787077827, "fcm_dpo/beta": 0.003244359279051423, "fcm_dpo/delta": -0.02737080305814743, "fcm_dpo/margin": 131.09304809570312, "fcm_dpo/q_t": 0.4014323949813843, "grad_norm": 35.462642669677734, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.35082727670669556, "logits/rejected": -0.33832210302352905, "logps/chosen": -232.735107421875, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -399.3232727050781, "loss": 1.0714, "margin_dpo/margin_mean": 131.09304809570312, "margin_dpo/margin_std": 158.469970703125, "step": 239 }, { "KL/chosen_KL_mean": -180.39125061035156, "KL/mean": -235.38194274902344, "KL/rejected_KL_mean": -290.3726501464844, "KL/std": 150.54681396484375, "epoch": 0.3524229074889868, "fcm_dpo/beta": 0.003269023261964321, "fcm_dpo/delta": 0.041884519159793854, "fcm_dpo/margin": 109.98141479492188, "fcm_dpo/q_t": 0.4180099368095398, "grad_norm": 37.75619125366211, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.4051019549369812, "logits/rejected": -0.38454490900039673, "logps/chosen": -241.0057373046875, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -364.4911804199219, "loss": 1.1409, "margin_dpo/margin_mean": 109.98140716552734, "margin_dpo/margin_std": 178.82452392578125, "step": 240 }, { "KL/chosen_KL_mean": -149.62850952148438, "KL/mean": -235.679443359375, "KL/rejected_KL_mean": -321.7303771972656, "KL/std": 166.00146484375, "epoch": 0.35389133627019087, "fcm_dpo/beta": 0.0032152351923286915, "fcm_dpo/delta": -0.1620943695306778, "fcm_dpo/margin": 172.10183715820312, "fcm_dpo/q_t": 0.3768247365951538, "grad_norm": 28.70929527282715, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.4123689532279968, "logits/rejected": -0.38682758808135986, "logps/chosen": -215.71958923339844, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -409.791259765625, "loss": 0.9959, "margin_dpo/margin_mean": 172.10183715820312, "margin_dpo/margin_std": 193.75653076171875, "step": 241 }, { "KL/chosen_KL_mean": -172.19378662109375, "KL/mean": -234.080078125, "KL/rejected_KL_mean": -295.96636962890625, "KL/std": 142.62332153320312, "epoch": 0.355359765051395, "fcm_dpo/beta": 0.003198289545252919, "fcm_dpo/delta": 0.0038902349770069122, "fcm_dpo/margin": 123.77262115478516, "fcm_dpo/q_t": 0.41040879487991333, "grad_norm": 33.44797134399414, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.3743210732936859, "logits/rejected": -0.3515356183052063, "logps/chosen": -240.05770874023438, "logps/ref_chosen": -67.86392974853516, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -379.32672119140625, "loss": 1.097, "margin_dpo/margin_mean": 123.77262115478516, "margin_dpo/margin_std": 168.39976501464844, "step": 242 }, { "KL/chosen_KL_mean": -173.37445068359375, "KL/mean": -249.24774169921875, "KL/rejected_KL_mean": -325.12103271484375, "KL/std": 158.65621948242188, "epoch": 0.3568281938325991, "fcm_dpo/beta": 0.0031418318394571543, "fcm_dpo/delta": -0.08077876269817352, "fcm_dpo/margin": 151.74655151367188, "fcm_dpo/q_t": 0.3906528353691101, "grad_norm": 23.109296798706055, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.3913189172744751, "logits/rejected": -0.36040928959846497, "logps/chosen": -236.45870971679688, "logps/ref_chosen": -63.0842399597168, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -401.4566650390625, "loss": 1.0334, "margin_dpo/margin_mean": 151.74655151367188, "margin_dpo/margin_std": 174.57302856445312, "step": 243 }, { "KL/chosen_KL_mean": -153.2425079345703, "KL/mean": -229.29331970214844, "KL/rejected_KL_mean": -305.3441467285156, "KL/std": 151.1732635498047, "epoch": 0.35829662261380324, "fcm_dpo/beta": 0.0030940580181777477, "fcm_dpo/delta": -0.07428047060966492, "fcm_dpo/margin": 152.1016387939453, "fcm_dpo/q_t": 0.3908138573169708, "grad_norm": 40.61009216308594, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.4778062701225281, "logits/rejected": -0.4712453782558441, "logps/chosen": -214.38320922851562, "logps/ref_chosen": -61.140689849853516, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -400.236083984375, "loss": 1.0244, "margin_dpo/margin_mean": 152.1016387939453, "margin_dpo/margin_std": 155.42276000976562, "step": 244 }, { "KL/chosen_KL_mean": -179.92636108398438, "KL/mean": -241.31871032714844, "KL/rejected_KL_mean": -302.7110290527344, "KL/std": 152.4658660888672, "epoch": 0.35976505139500736, "fcm_dpo/beta": 0.0030736280605196953, "fcm_dpo/delta": 0.022404037415981293, "fcm_dpo/margin": 122.78466796875, "fcm_dpo/q_t": 0.41335082054138184, "grad_norm": 25.50909423828125, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.4174082279205322, "logits/rejected": -0.3904969394207001, "logps/chosen": -247.1886444091797, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -390.35113525390625, "loss": 1.1128, "margin_dpo/margin_mean": 122.78466796875, "margin_dpo/margin_std": 168.45639038085938, "step": 245 }, { "KL/chosen_KL_mean": -178.84425354003906, "KL/mean": -241.6790313720703, "KL/rejected_KL_mean": -304.5137939453125, "KL/std": 169.49676513671875, "epoch": 0.36123348017621143, "fcm_dpo/beta": 0.003099266439676285, "fcm_dpo/delta": 0.010860616341233253, "fcm_dpo/margin": 125.66952514648438, "fcm_dpo/q_t": 0.4126628637313843, "grad_norm": 26.56377601623535, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.4388137459754944, "logits/rejected": -0.40211886167526245, "logps/chosen": -245.54122924804688, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -388.8601379394531, "loss": 1.1069, "margin_dpo/margin_mean": 125.66952514648438, "margin_dpo/margin_std": 182.71670532226562, "step": 246 }, { "KL/chosen_KL_mean": -211.3436279296875, "KL/mean": -300.1796875, "KL/rejected_KL_mean": -389.0157470703125, "KL/std": 181.46612548828125, "epoch": 0.36270190895741555, "fcm_dpo/beta": 0.003036319278180599, "fcm_dpo/delta": -0.14767590165138245, "fcm_dpo/margin": 177.672119140625, "fcm_dpo/q_t": 0.3771999478340149, "grad_norm": 27.967557907104492, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.3680022656917572, "logits/rejected": -0.36837178468704224, "logps/chosen": -267.948974609375, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29326629638672, "logps/rejected": -495.30902099609375, "loss": 0.9995, "margin_dpo/margin_mean": 177.672119140625, "margin_dpo/margin_std": 196.26101684570312, "step": 247 }, { "KL/chosen_KL_mean": -183.7578125, "KL/mean": -260.38360595703125, "KL/rejected_KL_mean": -337.0093994140625, "KL/std": 146.3836669921875, "epoch": 0.3641703377386197, "fcm_dpo/beta": 0.002979197073727846, "fcm_dpo/delta": -0.05982068181037903, "fcm_dpo/margin": 153.2515411376953, "fcm_dpo/q_t": 0.39190369844436646, "grad_norm": 25.113601684570312, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.33221954107284546, "logits/rejected": -0.3516564965248108, "logps/chosen": -227.80104064941406, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -428.86627197265625, "loss": 1.0241, "margin_dpo/margin_mean": 153.25155639648438, "margin_dpo/margin_std": 141.49179077148438, "step": 248 }, { "KL/chosen_KL_mean": -234.96240234375, "KL/mean": -277.1217041015625, "KL/rejected_KL_mean": -319.281005859375, "KL/std": 158.48863220214844, "epoch": 0.3656387665198238, "fcm_dpo/beta": 0.003046369180083275, "fcm_dpo/delta": 0.1469813883304596, "fcm_dpo/margin": 84.31859588623047, "fcm_dpo/q_t": 0.44190624356269836, "grad_norm": 34.834327697753906, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.3620932698249817, "logits/rejected": -0.33795762062072754, "logps/chosen": -297.4047546386719, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -399.7490539550781, "loss": 1.2226, "margin_dpo/margin_mean": 84.31858825683594, "margin_dpo/margin_std": 186.0101318359375, "step": 249 }, { "KL/chosen_KL_mean": -206.82498168945312, "KL/mean": -285.0578918457031, "KL/rejected_KL_mean": -363.29083251953125, "KL/std": 159.93698120117188, "epoch": 0.3671071953010279, "fcm_dpo/beta": 0.003046911209821701, "fcm_dpo/delta": -0.08060043305158615, "fcm_dpo/margin": 156.46585083007812, "fcm_dpo/q_t": 0.38924139738082886, "grad_norm": 35.6130485534668, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.33317434787750244, "logits/rejected": -0.2766192555427551, "logps/chosen": -272.461669921875, "logps/ref_chosen": -65.63668823242188, "logps/ref_rejected": -73.87184143066406, "logps/rejected": -437.16265869140625, "loss": 1.0237, "margin_dpo/margin_mean": 156.4658660888672, "margin_dpo/margin_std": 162.3227996826172, "step": 250 }, { "KL/chosen_KL_mean": -218.11549377441406, "KL/mean": -271.47332763671875, "KL/rejected_KL_mean": -324.8311767578125, "KL/std": 169.133056640625, "epoch": 0.368575624082232, "fcm_dpo/beta": 0.0030482178553938866, "fcm_dpo/delta": 0.07727696746587753, "fcm_dpo/margin": 106.71568298339844, "fcm_dpo/q_t": 0.42633721232414246, "grad_norm": 28.103612899780273, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.32768577337265015, "logits/rejected": -0.30002111196517944, "logps/chosen": -275.2982177734375, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -402.4945983886719, "loss": 1.1649, "margin_dpo/margin_mean": 106.71568298339844, "margin_dpo/margin_std": 182.81895446777344, "step": 251 }, { "KL/chosen_KL_mean": -215.59848022460938, "KL/mean": -287.9356689453125, "KL/rejected_KL_mean": -360.2728271484375, "KL/std": 146.37741088867188, "epoch": 0.3700440528634361, "fcm_dpo/beta": 0.0030416897498071194, "fcm_dpo/delta": -0.04213904216885567, "fcm_dpo/margin": 144.67434692382812, "fcm_dpo/q_t": 0.394910991191864, "grad_norm": 24.940649032592773, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.4142671227455139, "logits/rejected": -0.37881606817245483, "logps/chosen": -287.28411865234375, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75799560546875, "logps/rejected": -445.03082275390625, "loss": 1.0441, "margin_dpo/margin_mean": 144.67434692382812, "margin_dpo/margin_std": 149.41883850097656, "step": 252 }, { "KL/chosen_KL_mean": -186.49574279785156, "KL/mean": -257.6673583984375, "KL/rejected_KL_mean": -328.8389892578125, "KL/std": 157.70240783691406, "epoch": 0.37151248164464024, "fcm_dpo/beta": 0.003045113291591406, "fcm_dpo/delta": -0.03548625111579895, "fcm_dpo/margin": 142.34326171875, "fcm_dpo/q_t": 0.39946746826171875, "grad_norm": 19.748661041259766, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.39257919788360596, "logits/rejected": -0.3857148289680481, "logps/chosen": -255.62966918945312, "logps/ref_chosen": -69.1339340209961, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -427.54150390625, "loss": 1.0705, "margin_dpo/margin_mean": 142.34324645996094, "margin_dpo/margin_std": 176.71458435058594, "step": 253 }, { "KL/chosen_KL_mean": -179.48486328125, "KL/mean": -239.58071899414062, "KL/rejected_KL_mean": -299.67657470703125, "KL/std": 164.30686950683594, "epoch": 0.37298091042584436, "fcm_dpo/beta": 0.0030348035506904125, "fcm_dpo/delta": 0.03659197315573692, "fcm_dpo/margin": 120.19171142578125, "fcm_dpo/q_t": 0.4199643135070801, "grad_norm": 25.14466094970703, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.3829053044319153, "logits/rejected": -0.36821985244750977, "logps/chosen": -233.63986206054688, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -389.9842224121094, "loss": 1.1421, "margin_dpo/margin_mean": 120.19171142578125, "margin_dpo/margin_std": 206.28839111328125, "step": 254 }, { "KL/chosen_KL_mean": -183.00448608398438, "KL/mean": -247.2868194580078, "KL/rejected_KL_mean": -311.56915283203125, "KL/std": 143.20118713378906, "epoch": 0.3744493392070485, "fcm_dpo/beta": 0.0030361046083271503, "fcm_dpo/delta": 0.009780865162611008, "fcm_dpo/margin": 128.564697265625, "fcm_dpo/q_t": 0.4100106954574585, "grad_norm": 20.29219627380371, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.34874504804611206, "logits/rejected": -0.338106632232666, "logps/chosen": -240.14614868164062, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -401.77777099609375, "loss": 1.1067, "margin_dpo/margin_mean": 128.564697265625, "margin_dpo/margin_std": 180.92298889160156, "step": 255 }, { "KL/chosen_KL_mean": -158.49624633789062, "KL/mean": -227.50888061523438, "KL/rejected_KL_mean": -296.521484375, "KL/std": 154.35052490234375, "epoch": 0.37591776798825255, "fcm_dpo/beta": 0.003040488576516509, "fcm_dpo/delta": -0.02052391692996025, "fcm_dpo/margin": 138.02523803710938, "fcm_dpo/q_t": 0.40354132652282715, "grad_norm": 26.907196044921875, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.31947365403175354, "logits/rejected": -0.3274417519569397, "logps/chosen": -213.6597442626953, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -389.08441162109375, "loss": 1.0781, "margin_dpo/margin_mean": 138.02523803710938, "margin_dpo/margin_std": 178.26329040527344, "step": 256 }, { "KL/chosen_KL_mean": -155.09429931640625, "KL/mean": -228.4635009765625, "KL/rejected_KL_mean": -301.83270263671875, "KL/std": 161.17501831054688, "epoch": 0.37738619676945667, "fcm_dpo/beta": 0.0029973145574331284, "fcm_dpo/delta": -0.042888298630714417, "fcm_dpo/margin": 146.73841857910156, "fcm_dpo/q_t": 0.4001276195049286, "grad_norm": 20.602216720581055, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.35195407271385193, "logits/rejected": -0.338517963886261, "logps/chosen": -204.51800537109375, "logps/ref_chosen": -49.42369842529297, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -381.37060546875, "loss": 1.0685, "margin_dpo/margin_mean": 146.73841857910156, "margin_dpo/margin_std": 182.62957763671875, "step": 257 }, { "KL/chosen_KL_mean": -204.89328002929688, "KL/mean": -276.2060546875, "KL/rejected_KL_mean": -347.518798828125, "KL/std": 168.318115234375, "epoch": 0.3788546255506608, "fcm_dpo/beta": 0.0030030158814042807, "fcm_dpo/delta": -0.029582539573311806, "fcm_dpo/margin": 142.62550354003906, "fcm_dpo/q_t": 0.40110859274864197, "grad_norm": 29.499923706054688, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.3328630030155182, "logits/rejected": -0.33087849617004395, "logps/chosen": -264.27740478515625, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.99010467529297, "logps/rejected": -443.5089111328125, "loss": 1.0867, "margin_dpo/margin_mean": 142.62550354003906, "margin_dpo/margin_std": 199.8113250732422, "step": 258 }, { "KL/chosen_KL_mean": -198.52450561523438, "KL/mean": -258.52996826171875, "KL/rejected_KL_mean": -318.53546142578125, "KL/std": 160.81214904785156, "epoch": 0.3803230543318649, "fcm_dpo/beta": 0.0030021152924746275, "fcm_dpo/delta": 0.04111909121274948, "fcm_dpo/margin": 120.01093292236328, "fcm_dpo/q_t": 0.41781848669052124, "grad_norm": 23.914457321166992, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.4086461663246155, "logits/rejected": -0.4056541323661804, "logps/chosen": -251.35284423828125, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.191650390625, "logps/rejected": -407.72711181640625, "loss": 1.1286, "margin_dpo/margin_mean": 120.01094055175781, "margin_dpo/margin_std": 181.0947723388672, "step": 259 }, { "KL/chosen_KL_mean": -203.28500366210938, "KL/mean": -284.781982421875, "KL/rejected_KL_mean": -366.2789306640625, "KL/std": 167.090087890625, "epoch": 0.38179148311306904, "fcm_dpo/beta": 0.0029884944669902325, "fcm_dpo/delta": -0.09151086211204529, "fcm_dpo/margin": 162.99392700195312, "fcm_dpo/q_t": 0.3894280791282654, "grad_norm": 31.707292556762695, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.3748651146888733, "logits/rejected": -0.391143798828125, "logps/chosen": -250.7026824951172, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08978271484375, "logps/rejected": -461.36871337890625, "loss": 1.0208, "margin_dpo/margin_mean": 162.99392700195312, "margin_dpo/margin_std": 178.20040893554688, "step": 260 }, { "KL/chosen_KL_mean": -212.65603637695312, "KL/mean": -288.040771484375, "KL/rejected_KL_mean": -363.4255065917969, "KL/std": 181.0100555419922, "epoch": 0.3832599118942731, "fcm_dpo/beta": 0.002936106640845537, "fcm_dpo/delta": -0.04489829018712044, "fcm_dpo/margin": 150.76947021484375, "fcm_dpo/q_t": 0.39958545565605164, "grad_norm": 21.78121566772461, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.3127421438694, "logits/rejected": -0.31155508756637573, "logps/chosen": -265.68743896484375, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -451.9404602050781, "loss": 1.0713, "margin_dpo/margin_mean": 150.76947021484375, "margin_dpo/margin_std": 200.406005859375, "step": 261 }, { "KL/chosen_KL_mean": -252.27764892578125, "KL/mean": -314.0080871582031, "KL/rejected_KL_mean": -375.738525390625, "KL/std": 165.99118041992188, "epoch": 0.38472834067547723, "fcm_dpo/beta": 0.0029631485231220722, "fcm_dpo/delta": 0.0350569412112236, "fcm_dpo/margin": 123.46089172363281, "fcm_dpo/q_t": 0.4153136610984802, "grad_norm": 28.57038688659668, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.3218010663986206, "logits/rejected": -0.2904987037181854, "logps/chosen": -311.89776611328125, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -462.15704345703125, "loss": 1.1117, "margin_dpo/margin_mean": 123.46089172363281, "margin_dpo/margin_std": 167.72061157226562, "step": 262 }, { "KL/chosen_KL_mean": -227.56837463378906, "KL/mean": -315.9892883300781, "KL/rejected_KL_mean": -404.4101867675781, "KL/std": 192.87933349609375, "epoch": 0.38619676945668135, "fcm_dpo/beta": 0.0028930227272212505, "fcm_dpo/delta": -0.11854880303144455, "fcm_dpo/margin": 176.84181213378906, "fcm_dpo/q_t": 0.3831733465194702, "grad_norm": 26.555845260620117, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.3501706123352051, "logits/rejected": -0.3262799084186554, "logps/chosen": -286.98931884765625, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -501.26739501953125, "loss": 1.0194, "margin_dpo/margin_mean": 176.84181213378906, "margin_dpo/margin_std": 203.54107666015625, "step": 263 }, { "KL/chosen_KL_mean": -234.73135375976562, "KL/mean": -310.48260498046875, "KL/rejected_KL_mean": -386.2339172363281, "KL/std": 176.64036560058594, "epoch": 0.3876651982378855, "fcm_dpo/beta": 0.0028530117124319077, "fcm_dpo/delta": -0.034962985664606094, "fcm_dpo/margin": 151.5025634765625, "fcm_dpo/q_t": 0.40188103914260864, "grad_norm": 29.80936622619629, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.38784724473953247, "logits/rejected": -0.377646803855896, "logps/chosen": -297.45343017578125, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85620880126953, "logps/rejected": -480.09014892578125, "loss": 1.0785, "margin_dpo/margin_mean": 151.50254821777344, "margin_dpo/margin_std": 198.95477294921875, "step": 264 }, { "KL/chosen_KL_mean": -255.1561279296875, "KL/mean": -327.5084533691406, "KL/rejected_KL_mean": -399.8607482910156, "KL/std": 199.5748291015625, "epoch": 0.3891336270190896, "fcm_dpo/beta": 0.0028611307498067617, "fcm_dpo/delta": -0.014629107899963856, "fcm_dpo/margin": 144.70462036132812, "fcm_dpo/q_t": 0.4080343246459961, "grad_norm": 29.3031005859375, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.3426782488822937, "logits/rejected": -0.32741084694862366, "logps/chosen": -317.12762451171875, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -487.88134765625, "loss": 1.1186, "margin_dpo/margin_mean": 144.70462036132812, "margin_dpo/margin_std": 237.73236083984375, "step": 265 }, { "KL/chosen_KL_mean": -252.57809448242188, "KL/mean": -311.6585693359375, "KL/rejected_KL_mean": -370.73907470703125, "KL/std": 166.17433166503906, "epoch": 0.39060205580029367, "fcm_dpo/beta": 0.002886436879634857, "fcm_dpo/delta": 0.061003364622592926, "fcm_dpo/margin": 118.16098022460938, "fcm_dpo/q_t": 0.42259740829467773, "grad_norm": 43.39963912963867, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.37320268154144287, "logits/rejected": -0.3327832818031311, "logps/chosen": -319.6777648925781, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -438.7103271484375, "loss": 1.146, "margin_dpo/margin_mean": 118.16098022460938, "margin_dpo/margin_std": 192.56309509277344, "step": 266 }, { "KL/chosen_KL_mean": -226.99365234375, "KL/mean": -299.7353515625, "KL/rejected_KL_mean": -372.47698974609375, "KL/std": 180.788818359375, "epoch": 0.3920704845814978, "fcm_dpo/beta": 0.002876041457056999, "fcm_dpo/delta": -0.01955413445830345, "fcm_dpo/margin": 145.48333740234375, "fcm_dpo/q_t": 0.40417009592056274, "grad_norm": 31.87999153137207, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.39031726121902466, "logits/rejected": -0.3596029281616211, "logps/chosen": -295.96441650390625, "logps/ref_chosen": -68.97075653076172, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -462.64544677734375, "loss": 1.0906, "margin_dpo/margin_mean": 145.48333740234375, "margin_dpo/margin_std": 203.5807342529297, "step": 267 }, { "KL/chosen_KL_mean": -232.36375427246094, "KL/mean": -301.0833740234375, "KL/rejected_KL_mean": -369.802978515625, "KL/std": 167.608154296875, "epoch": 0.3935389133627019, "fcm_dpo/beta": 0.0028773611411452293, "fcm_dpo/delta": 0.004575518891215324, "fcm_dpo/margin": 137.43919372558594, "fcm_dpo/q_t": 0.4103718400001526, "grad_norm": 29.13970184326172, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.3367459774017334, "logits/rejected": -0.34343862533569336, "logps/chosen": -288.2640686035156, "logps/ref_chosen": -55.90031051635742, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -471.4505920410156, "loss": 1.1069, "margin_dpo/margin_mean": 137.439208984375, "margin_dpo/margin_std": 202.26385498046875, "step": 268 }, { "KL/chosen_KL_mean": -247.22647094726562, "KL/mean": -333.1874084472656, "KL/rejected_KL_mean": -419.1483154296875, "KL/std": 174.40249633789062, "epoch": 0.39500734214390604, "fcm_dpo/beta": 0.002847407478839159, "fcm_dpo/delta": -0.09410010278224945, "fcm_dpo/margin": 171.92185974121094, "fcm_dpo/q_t": 0.38866060972213745, "grad_norm": 24.988513946533203, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.3828558027744293, "logits/rejected": -0.363941490650177, "logps/chosen": -317.26605224609375, "logps/ref_chosen": -70.03955841064453, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -526.4976806640625, "loss": 1.0466, "margin_dpo/margin_mean": 171.92185974121094, "margin_dpo/margin_std": 220.83987426757812, "step": 269 }, { "KL/chosen_KL_mean": -214.0692138671875, "KL/mean": -278.02642822265625, "KL/rejected_KL_mean": -341.98370361328125, "KL/std": 153.86318969726562, "epoch": 0.3964757709251101, "fcm_dpo/beta": 0.002855871804058552, "fcm_dpo/delta": 0.035725079476833344, "fcm_dpo/margin": 127.91445922851562, "fcm_dpo/q_t": 0.41616952419281006, "grad_norm": 27.773122787475586, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.44005024433135986, "logits/rejected": -0.43610844016075134, "logps/chosen": -283.6026916503906, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -451.9123229980469, "loss": 1.1275, "margin_dpo/margin_mean": 127.91445922851562, "margin_dpo/margin_std": 195.2829132080078, "step": 270 }, { "KL/chosen_KL_mean": -198.44947814941406, "KL/mean": -275.9836120605469, "KL/rejected_KL_mean": -353.5177001953125, "KL/std": 153.60189819335938, "epoch": 0.39794419970631423, "fcm_dpo/beta": 0.002827045973390341, "fcm_dpo/delta": -0.04044891148805618, "fcm_dpo/margin": 155.06826782226562, "fcm_dpo/q_t": 0.3988415598869324, "grad_norm": 24.570371627807617, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.36762213706970215, "logits/rejected": -0.35115593671798706, "logps/chosen": -255.21405029296875, "logps/ref_chosen": -56.76456832885742, "logps/ref_rejected": -92.51383972167969, "logps/rejected": -446.03155517578125, "loss": 1.0517, "margin_dpo/margin_mean": 155.06826782226562, "margin_dpo/margin_std": 173.97885131835938, "step": 271 }, { "KL/chosen_KL_mean": -188.15049743652344, "KL/mean": -285.01849365234375, "KL/rejected_KL_mean": -381.8865051269531, "KL/std": 175.69546508789062, "epoch": 0.39941262848751835, "fcm_dpo/beta": 0.002780818846076727, "fcm_dpo/delta": -0.1463950276374817, "fcm_dpo/margin": 193.7360076904297, "fcm_dpo/q_t": 0.3746742010116577, "grad_norm": 35.40150451660156, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.3044808804988861, "logits/rejected": -0.3166738450527191, "logps/chosen": -237.64764404296875, "logps/ref_chosen": -49.497154235839844, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -487.4293212890625, "loss": 0.9804, "margin_dpo/margin_mean": 193.7360076904297, "margin_dpo/margin_std": 182.89303588867188, "step": 272 }, { "KL/chosen_KL_mean": -219.2149658203125, "KL/mean": -313.57196044921875, "KL/rejected_KL_mean": -407.92889404296875, "KL/std": 178.0384063720703, "epoch": 0.4008810572687225, "fcm_dpo/beta": 0.002692791633307934, "fcm_dpo/delta": -0.11455152183771133, "fcm_dpo/margin": 188.71389770507812, "fcm_dpo/q_t": 0.3825136423110962, "grad_norm": 26.675121307373047, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.32241398096084595, "logits/rejected": -0.30522340536117554, "logps/chosen": -282.19036865234375, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -500.427490234375, "loss": 1.0069, "margin_dpo/margin_mean": 188.71389770507812, "margin_dpo/margin_std": 197.89047241210938, "step": 273 }, { "KL/chosen_KL_mean": -264.813232421875, "KL/mean": -337.2680358886719, "KL/rejected_KL_mean": -409.72283935546875, "KL/std": 165.03564453125, "epoch": 0.4023494860499266, "fcm_dpo/beta": 0.0026972047053277493, "fcm_dpo/delta": 0.00932791456580162, "fcm_dpo/margin": 144.9096221923828, "fcm_dpo/q_t": 0.4096784293651581, "grad_norm": 25.53626823425293, "learning_rate": 3.75e-07, "logits/chosen": -0.30063068866729736, "logits/rejected": -0.2833949625492096, "logps/chosen": -320.48095703125, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -487.055908203125, "loss": 1.0939, "margin_dpo/margin_mean": 144.9096221923828, "margin_dpo/margin_std": 187.6047821044922, "step": 274 }, { "KL/chosen_KL_mean": -206.4573974609375, "KL/mean": -283.26287841796875, "KL/rejected_KL_mean": -360.0683898925781, "KL/std": 167.62567138671875, "epoch": 0.40381791483113066, "fcm_dpo/beta": 0.0026927865110337734, "fcm_dpo/delta": -0.0143581572920084, "fcm_dpo/margin": 153.6110076904297, "fcm_dpo/q_t": 0.4042346179485321, "grad_norm": 24.385211944580078, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.32019296288490295, "logits/rejected": -0.32504212856292725, "logps/chosen": -255.0521240234375, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -453.3720703125, "loss": 1.0768, "margin_dpo/margin_mean": 153.61099243164062, "margin_dpo/margin_std": 189.52232360839844, "step": 275 }, { "KL/chosen_KL_mean": -225.26080322265625, "KL/mean": -302.2386779785156, "KL/rejected_KL_mean": -379.216552734375, "KL/std": 172.80694580078125, "epoch": 0.4052863436123348, "fcm_dpo/beta": 0.002671858761459589, "fcm_dpo/delta": -0.011981412768363953, "fcm_dpo/margin": 153.95571899414062, "fcm_dpo/q_t": 0.40615737438201904, "grad_norm": 26.473548889160156, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.3291136622428894, "logits/rejected": -0.30698275566101074, "logps/chosen": -281.83819580078125, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -449.58221435546875, "loss": 1.0868, "margin_dpo/margin_mean": 153.9557342529297, "margin_dpo/margin_std": 205.688232421875, "step": 276 }, { "KL/chosen_KL_mean": -248.16168212890625, "KL/mean": -327.30120849609375, "KL/rejected_KL_mean": -406.44073486328125, "KL/std": 174.60836791992188, "epoch": 0.4067547723935389, "fcm_dpo/beta": 0.002672237576916814, "fcm_dpo/delta": -0.02400265261530876, "fcm_dpo/margin": 158.27906799316406, "fcm_dpo/q_t": 0.4026916027069092, "grad_norm": 30.444353103637695, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.32184985280036926, "logits/rejected": -0.30650681257247925, "logps/chosen": -304.4332275390625, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -499.322021484375, "loss": 1.0794, "margin_dpo/margin_mean": 158.27908325195312, "margin_dpo/margin_std": 206.87417602539062, "step": 277 }, { "KL/chosen_KL_mean": -222.39517211914062, "KL/mean": -314.3505859375, "KL/rejected_KL_mean": -406.30596923828125, "KL/std": 186.62579345703125, "epoch": 0.40822320117474303, "fcm_dpo/beta": 0.0026234271936118603, "fcm_dpo/delta": -0.08697425574064255, "fcm_dpo/margin": 183.9108123779297, "fcm_dpo/q_t": 0.389728844165802, "grad_norm": 26.94320297241211, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.34066635370254517, "logits/rejected": -0.3471217155456543, "logps/chosen": -275.33709716796875, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -497.5595397949219, "loss": 1.0289, "margin_dpo/margin_mean": 183.9108123779297, "margin_dpo/margin_std": 208.0062255859375, "step": 278 }, { "KL/chosen_KL_mean": -254.62709045410156, "KL/mean": -344.94903564453125, "KL/rejected_KL_mean": -435.2709655761719, "KL/std": 193.30276489257812, "epoch": 0.40969162995594716, "fcm_dpo/beta": 0.0025754275266081095, "fcm_dpo/delta": -0.06957367807626724, "fcm_dpo/margin": 180.64385986328125, "fcm_dpo/q_t": 0.39524978399276733, "grad_norm": 29.17497444152832, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.26812469959259033, "logits/rejected": -0.26797914505004883, "logps/chosen": -303.2684326171875, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -523.1224365234375, "loss": 1.061, "margin_dpo/margin_mean": 180.6438751220703, "margin_dpo/margin_std": 237.0701141357422, "step": 279 }, { "KL/chosen_KL_mean": -249.67874145507812, "KL/mean": -342.8053894042969, "KL/rejected_KL_mean": -435.93206787109375, "KL/std": 178.5480499267578, "epoch": 0.4111600587371512, "fcm_dpo/beta": 0.002550060860812664, "fcm_dpo/delta": -0.07875210046768188, "fcm_dpo/margin": 186.25331115722656, "fcm_dpo/q_t": 0.3887876272201538, "grad_norm": 33.25874710083008, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.29288673400878906, "logits/rejected": -0.29508256912231445, "logps/chosen": -308.4758605957031, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -534.5509033203125, "loss": 1.0245, "margin_dpo/margin_mean": 186.25331115722656, "margin_dpo/margin_std": 194.61471557617188, "step": 280 }, { "KL/chosen_KL_mean": -224.7278289794922, "KL/mean": -309.36773681640625, "KL/rejected_KL_mean": -394.0076904296875, "KL/std": 175.15994262695312, "epoch": 0.41262848751835535, "fcm_dpo/beta": 0.0025381785817444324, "fcm_dpo/delta": -0.031121131032705307, "fcm_dpo/margin": 169.27987670898438, "fcm_dpo/q_t": 0.3983193635940552, "grad_norm": 21.142444610595703, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.33014634251594543, "logits/rejected": -0.30275779962539673, "logps/chosen": -280.2163391113281, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -474.8902587890625, "loss": 1.0583, "margin_dpo/margin_mean": 169.27987670898438, "margin_dpo/margin_std": 189.16592407226562, "step": 281 }, { "KL/chosen_KL_mean": -250.84788513183594, "KL/mean": -315.7371826171875, "KL/rejected_KL_mean": -380.62646484375, "KL/std": 179.92788696289062, "epoch": 0.41409691629955947, "fcm_dpo/beta": 0.002561165951192379, "fcm_dpo/delta": 0.06942006200551987, "fcm_dpo/margin": 129.77859497070312, "fcm_dpo/q_t": 0.42569971084594727, "grad_norm": 26.15049934387207, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.41560858488082886, "logits/rejected": -0.3941164016723633, "logps/chosen": -323.91802978515625, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -475.9774475097656, "loss": 1.1459, "margin_dpo/margin_mean": 129.77859497070312, "margin_dpo/margin_std": 209.7483673095703, "step": 282 }, { "KL/chosen_KL_mean": -260.31744384765625, "KL/mean": -355.35028076171875, "KL/rejected_KL_mean": -450.3831787109375, "KL/std": 210.8460693359375, "epoch": 0.4155653450807636, "fcm_dpo/beta": 0.002537979045882821, "fcm_dpo/delta": -0.08652851730585098, "fcm_dpo/margin": 190.0657196044922, "fcm_dpo/q_t": 0.3873726427555084, "grad_norm": 35.29081726074219, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.3637465834617615, "logits/rejected": -0.3678331673145294, "logps/chosen": -322.21588134765625, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -547.3697509765625, "loss": 1.0228, "margin_dpo/margin_mean": 190.0657196044922, "margin_dpo/margin_std": 199.068115234375, "step": 283 }, { "KL/chosen_KL_mean": -247.5406494140625, "KL/mean": -339.8221435546875, "KL/rejected_KL_mean": -432.1036071777344, "KL/std": 199.65481567382812, "epoch": 0.4170337738619677, "fcm_dpo/beta": 0.002489683451130986, "fcm_dpo/delta": -0.06252136826515198, "fcm_dpo/margin": 184.56295776367188, "fcm_dpo/q_t": 0.39381855726242065, "grad_norm": 27.652767181396484, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.3889576494693756, "logits/rejected": -0.37424755096435547, "logps/chosen": -305.9761962890625, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -525.5728759765625, "loss": 1.0346, "margin_dpo/margin_mean": 184.56295776367188, "margin_dpo/margin_std": 199.2593994140625, "step": 284 }, { "KL/chosen_KL_mean": -287.4296875, "KL/mean": -370.3851318359375, "KL/rejected_KL_mean": -453.340576171875, "KL/std": 177.46621704101562, "epoch": 0.4185022026431718, "fcm_dpo/beta": 0.0024931158404797316, "fcm_dpo/delta": -0.014689784497022629, "fcm_dpo/margin": 165.91085815429688, "fcm_dpo/q_t": 0.4023195803165436, "grad_norm": 25.144207000732422, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.35408467054367065, "logits/rejected": -0.3383770287036896, "logps/chosen": -353.661865234375, "logps/ref_chosen": -66.23219299316406, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -552.4674072265625, "loss": 1.0729, "margin_dpo/margin_mean": 165.91085815429688, "margin_dpo/margin_std": 193.71629333496094, "step": 285 }, { "KL/chosen_KL_mean": -297.5478210449219, "KL/mean": -387.598388671875, "KL/rejected_KL_mean": -477.64892578125, "KL/std": 204.80935668945312, "epoch": 0.4199706314243759, "fcm_dpo/beta": 0.00246500875800848, "fcm_dpo/delta": -0.04596859961748123, "fcm_dpo/margin": 180.1011199951172, "fcm_dpo/q_t": 0.39727091789245605, "grad_norm": 25.498445510864258, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.37891730666160583, "logits/rejected": -0.36352336406707764, "logps/chosen": -370.49884033203125, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -566.2373657226562, "loss": 1.0588, "margin_dpo/margin_mean": 180.10110473632812, "margin_dpo/margin_std": 218.45489501953125, "step": 286 }, { "KL/chosen_KL_mean": -279.0062255859375, "KL/mean": -351.84539794921875, "KL/rejected_KL_mean": -424.6845703125, "KL/std": 178.66867065429688, "epoch": 0.42143906020558003, "fcm_dpo/beta": 0.0024740160442888737, "fcm_dpo/delta": 0.04098087176680565, "fcm_dpo/margin": 145.67831420898438, "fcm_dpo/q_t": 0.41632279753685, "grad_norm": 28.178863525390625, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.37850940227508545, "logits/rejected": -0.35931217670440674, "logps/chosen": -340.5473937988281, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.69607543945312, "logps/rejected": -502.380615234375, "loss": 1.1114, "margin_dpo/margin_mean": 145.67831420898438, "margin_dpo/margin_std": 192.67111206054688, "step": 287 }, { "KL/chosen_KL_mean": -276.2578430175781, "KL/mean": -370.36175537109375, "KL/rejected_KL_mean": -464.46563720703125, "KL/std": 186.77584838867188, "epoch": 0.42290748898678415, "fcm_dpo/beta": 0.0024337535724043846, "fcm_dpo/delta": -0.0626014918088913, "fcm_dpo/margin": 188.20782470703125, "fcm_dpo/q_t": 0.3927251994609833, "grad_norm": 25.88406753540039, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.34253329038619995, "logits/rejected": -0.3305118680000305, "logps/chosen": -332.9190673828125, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.33570098876953, "logps/rejected": -551.8013916015625, "loss": 1.0367, "margin_dpo/margin_mean": 188.20785522460938, "margin_dpo/margin_std": 193.08023071289062, "step": 288 }, { "KL/chosen_KL_mean": -269.5328369140625, "KL/mean": -373.37274169921875, "KL/rejected_KL_mean": -477.2126770019531, "KL/std": 201.24783325195312, "epoch": 0.4243759177679883, "fcm_dpo/beta": 0.0024142626207321882, "fcm_dpo/delta": -0.10661280155181885, "fcm_dpo/margin": 207.67982482910156, "fcm_dpo/q_t": 0.38501453399658203, "grad_norm": 38.1318473815918, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.2764891982078552, "logits/rejected": -0.27845776081085205, "logps/chosen": -314.76324462890625, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -564.8553466796875, "loss": 1.0157, "margin_dpo/margin_mean": 207.6798095703125, "margin_dpo/margin_std": 225.15972900390625, "step": 289 }, { "KL/chosen_KL_mean": -279.3312072753906, "KL/mean": -385.49371337890625, "KL/rejected_KL_mean": -491.65625, "KL/std": 216.3263397216797, "epoch": 0.42584434654919234, "fcm_dpo/beta": 0.0023515745997428894, "fcm_dpo/delta": -0.10482804477214813, "fcm_dpo/margin": 212.32504272460938, "fcm_dpo/q_t": 0.38738417625427246, "grad_norm": 25.646024703979492, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.2992396950721741, "logits/rejected": -0.318649560213089, "logps/chosen": -334.8027038574219, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -608.3648071289062, "loss": 1.0308, "margin_dpo/margin_mean": 212.3250274658203, "margin_dpo/margin_std": 258.3596496582031, "step": 290 }, { "KL/chosen_KL_mean": -233.43585205078125, "KL/mean": -327.27154541015625, "KL/rejected_KL_mean": -421.1072082519531, "KL/std": 182.66287231445312, "epoch": 0.42731277533039647, "fcm_dpo/beta": 0.002339608035981655, "fcm_dpo/delta": -0.04106954485177994, "fcm_dpo/margin": 187.67138671875, "fcm_dpo/q_t": 0.3976435661315918, "grad_norm": 23.715065002441406, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.3613763451576233, "logits/rejected": -0.3522465229034424, "logps/chosen": -296.69622802734375, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -510.404296875, "loss": 1.0598, "margin_dpo/margin_mean": 187.67138671875, "margin_dpo/margin_std": 221.575927734375, "step": 291 }, { "KL/chosen_KL_mean": -245.43325805664062, "KL/mean": -345.85260009765625, "KL/rejected_KL_mean": -446.27191162109375, "KL/std": 213.15567016601562, "epoch": 0.4287812041116006, "fcm_dpo/beta": 0.0023100622929632664, "fcm_dpo/delta": -0.06708824634552002, "fcm_dpo/margin": 200.83868408203125, "fcm_dpo/q_t": 0.39310041069984436, "grad_norm": 20.776294708251953, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.3369968831539154, "logits/rejected": -0.32491156458854675, "logps/chosen": -299.351806640625, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -536.2332763671875, "loss": 1.0516, "margin_dpo/margin_mean": 200.83868408203125, "margin_dpo/margin_std": 247.34228515625, "step": 292 }, { "KL/chosen_KL_mean": -245.34991455078125, "KL/mean": -322.8323669433594, "KL/rejected_KL_mean": -400.3148193359375, "KL/std": 209.0297088623047, "epoch": 0.4302496328928047, "fcm_dpo/beta": 0.0023124441504478455, "fcm_dpo/delta": 0.043067529797554016, "fcm_dpo/margin": 154.9648895263672, "fcm_dpo/q_t": 0.4190768599510193, "grad_norm": 49.972896575927734, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.3593342900276184, "logits/rejected": -0.35283225774765015, "logps/chosen": -305.7259521484375, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.85244750976562, "logps/rejected": -478.167236328125, "loss": 1.1325, "margin_dpo/margin_mean": 154.9648895263672, "margin_dpo/margin_std": 242.87945556640625, "step": 293 }, { "KL/chosen_KL_mean": -227.1817626953125, "KL/mean": -313.35595703125, "KL/rejected_KL_mean": -399.5301513671875, "KL/std": 188.7653045654297, "epoch": 0.43171806167400884, "fcm_dpo/beta": 0.0023088366724550724, "fcm_dpo/delta": 0.002053305506706238, "fcm_dpo/margin": 172.348388671875, "fcm_dpo/q_t": 0.4094581604003906, "grad_norm": 26.523639678955078, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.34593725204467773, "logits/rejected": -0.3513278663158417, "logps/chosen": -275.269287109375, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -481.4271240234375, "loss": 1.0954, "margin_dpo/margin_mean": 172.348388671875, "margin_dpo/margin_std": 234.5027618408203, "step": 294 }, { "KL/chosen_KL_mean": -278.93609619140625, "KL/mean": -374.4381103515625, "KL/rejected_KL_mean": -469.940185546875, "KL/std": 214.35740661621094, "epoch": 0.4331864904552129, "fcm_dpo/beta": 0.0023033185862004757, "fcm_dpo/delta": -0.041762471199035645, "fcm_dpo/margin": 191.0041046142578, "fcm_dpo/q_t": 0.39998045563697815, "grad_norm": 33.26468276977539, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.3557334244251251, "logits/rejected": -0.3590019941329956, "logps/chosen": -328.86077880859375, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -557.396484375, "loss": 1.0729, "margin_dpo/margin_mean": 191.0041046142578, "margin_dpo/margin_std": 251.3482208251953, "step": 295 }, { "KL/chosen_KL_mean": -355.9974060058594, "KL/mean": -421.620849609375, "KL/rejected_KL_mean": -487.2442932128906, "KL/std": 209.83924865722656, "epoch": 0.434654919236417, "fcm_dpo/beta": 0.0022890730760991573, "fcm_dpo/delta": -0.013507579453289509, "fcm_dpo/margin": 131.2469024658203, "fcm_dpo/q_t": 0.4310154318809509, "grad_norm": 39.7788200378418, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.380574107170105, "logits/rejected": -0.36485421657562256, "logps/chosen": -421.4886474609375, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -580.3333740234375, "loss": 1.2009, "margin_dpo/margin_mean": 131.2469024658203, "margin_dpo/margin_std": 270.313232421875, "step": 296 }, { "KL/chosen_KL_mean": -318.48907470703125, "KL/mean": -398.3763122558594, "KL/rejected_KL_mean": -478.2635192871094, "KL/std": 206.62014770507812, "epoch": 0.43612334801762115, "fcm_dpo/beta": 0.0022979602217674255, "fcm_dpo/delta": 0.03410791605710983, "fcm_dpo/margin": 159.7744598388672, "fcm_dpo/q_t": 0.41503405570983887, "grad_norm": 35.391021728515625, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.37360668182373047, "logits/rejected": -0.3786901831626892, "logps/chosen": -374.9660339355469, "logps/ref_chosen": -56.476951599121094, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -573.402099609375, "loss": 1.1089, "margin_dpo/margin_mean": 159.77444458007812, "margin_dpo/margin_std": 212.7035369873047, "step": 297 }, { "KL/chosen_KL_mean": -348.817626953125, "KL/mean": -454.68817138671875, "KL/rejected_KL_mean": -560.5587158203125, "KL/std": 270.39697265625, "epoch": 0.43759177679882527, "fcm_dpo/beta": 0.0022850334644317627, "fcm_dpo/delta": -0.08811478316783905, "fcm_dpo/margin": 211.7411346435547, "fcm_dpo/q_t": 0.3950398564338684, "grad_norm": 28.887287139892578, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.3834819495677948, "logits/rejected": -0.3983224630355835, "logps/chosen": -416.14276123046875, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -677.2208862304688, "loss": 1.0737, "margin_dpo/margin_mean": 211.7411346435547, "margin_dpo/margin_std": 316.9564514160156, "step": 298 }, { "KL/chosen_KL_mean": -286.4410095214844, "KL/mean": -374.9492492675781, "KL/rejected_KL_mean": -463.45751953125, "KL/std": 203.78659057617188, "epoch": 0.4390602055800294, "fcm_dpo/beta": 0.0022771679796278477, "fcm_dpo/delta": -0.0039763785898685455, "fcm_dpo/margin": 177.01644897460938, "fcm_dpo/q_t": 0.4095514416694641, "grad_norm": 41.97990417480469, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.36679205298423767, "logits/rejected": -0.37225013971328735, "logps/chosen": -335.4031066894531, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -547.7857666015625, "loss": 1.1071, "margin_dpo/margin_mean": 177.01646423339844, "margin_dpo/margin_std": 260.2517395019531, "step": 299 }, { "KL/chosen_KL_mean": -354.22882080078125, "KL/mean": -464.64483642578125, "KL/rejected_KL_mean": -575.0608520507812, "KL/std": 244.3948974609375, "epoch": 0.44052863436123346, "fcm_dpo/beta": 0.002236669883131981, "fcm_dpo/delta": -0.09869952499866486, "fcm_dpo/margin": 220.83203125, "fcm_dpo/q_t": 0.38846272230148315, "grad_norm": 29.504758834838867, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.35466793179512024, "logits/rejected": -0.35978570580482483, "logps/chosen": -413.30255126953125, "logps/ref_chosen": -59.07371139526367, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -671.02734375, "loss": 1.0565, "margin_dpo/margin_mean": 220.83203125, "margin_dpo/margin_std": 298.01202392578125, "step": 300 }, { "KL/chosen_KL_mean": -297.02484130859375, "KL/mean": -402.21649169921875, "KL/rejected_KL_mean": -507.4081115722656, "KL/std": 221.8513641357422, "epoch": 0.4419970631424376, "fcm_dpo/beta": 0.0021925170440226793, "fcm_dpo/delta": -0.06446747481822968, "fcm_dpo/margin": 210.38323974609375, "fcm_dpo/q_t": 0.39620107412338257, "grad_norm": 24.30299949645996, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.319614976644516, "logits/rejected": -0.31558164954185486, "logps/chosen": -354.27423095703125, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -599.7616577148438, "loss": 1.0522, "margin_dpo/margin_mean": 210.38323974609375, "margin_dpo/margin_std": 263.6629333496094, "step": 301 }, { "KL/chosen_KL_mean": -258.3230895996094, "KL/mean": -345.97906494140625, "KL/rejected_KL_mean": -433.63507080078125, "KL/std": 195.16726684570312, "epoch": 0.4434654919236417, "fcm_dpo/beta": 0.0021865563467144966, "fcm_dpo/delta": 0.017055466771125793, "fcm_dpo/margin": 175.31198120117188, "fcm_dpo/q_t": 0.41217830777168274, "grad_norm": 22.057884216308594, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.3739083409309387, "logits/rejected": -0.3783670663833618, "logps/chosen": -309.5210876464844, "logps/ref_chosen": -51.197994232177734, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -530.8614501953125, "loss": 1.1013, "margin_dpo/margin_mean": 175.31198120117188, "margin_dpo/margin_std": 233.5333251953125, "step": 302 }, { "KL/chosen_KL_mean": -268.55145263671875, "KL/mean": -352.54443359375, "KL/rejected_KL_mean": -436.537353515625, "KL/std": 213.00778198242188, "epoch": 0.44493392070484583, "fcm_dpo/beta": 0.002201956696808338, "fcm_dpo/delta": 0.031198769807815552, "fcm_dpo/margin": 167.98590087890625, "fcm_dpo/q_t": 0.4157490134239197, "grad_norm": 21.407352447509766, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.3980519771575928, "logits/rejected": -0.38533222675323486, "logps/chosen": -335.2654113769531, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -523.4827880859375, "loss": 1.1241, "margin_dpo/margin_mean": 167.98590087890625, "margin_dpo/margin_std": 252.49813842773438, "step": 303 }, { "KL/chosen_KL_mean": -235.60621643066406, "KL/mean": -327.85736083984375, "KL/rejected_KL_mean": -420.10845947265625, "KL/std": 188.72259521484375, "epoch": 0.44640234948604995, "fcm_dpo/beta": 0.002216983586549759, "fcm_dpo/delta": -0.009714346379041672, "fcm_dpo/margin": 184.502197265625, "fcm_dpo/q_t": 0.40225833654403687, "grad_norm": 33.168792724609375, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.42254841327667236, "logits/rejected": -0.41290074586868286, "logps/chosen": -307.55694580078125, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -510.58050537109375, "loss": 1.0529, "margin_dpo/margin_mean": 184.50221252441406, "margin_dpo/margin_std": 165.41900634765625, "step": 304 }, { "KL/chosen_KL_mean": -246.27264404296875, "KL/mean": -327.6571350097656, "KL/rejected_KL_mean": -409.0416259765625, "KL/std": 209.47964477539062, "epoch": 0.447870778267254, "fcm_dpo/beta": 0.002209282945841551, "fcm_dpo/delta": 0.04157250002026558, "fcm_dpo/margin": 162.76901245117188, "fcm_dpo/q_t": 0.4180421531200409, "grad_norm": 26.381446838378906, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.4123404622077942, "logits/rejected": -0.39488470554351807, "logps/chosen": -313.06787109375, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -501.7962341308594, "loss": 1.1211, "margin_dpo/margin_mean": 162.76901245117188, "margin_dpo/margin_std": 229.144775390625, "step": 305 }, { "KL/chosen_KL_mean": -237.97418212890625, "KL/mean": -316.761474609375, "KL/rejected_KL_mean": -395.54876708984375, "KL/std": 183.5909881591797, "epoch": 0.44933920704845814, "fcm_dpo/beta": 0.0022436161525547504, "fcm_dpo/delta": 0.0480995737016201, "fcm_dpo/margin": 157.5745849609375, "fcm_dpo/q_t": 0.417450875043869, "grad_norm": 32.627220153808594, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.41048943996429443, "logits/rejected": -0.3866746425628662, "logps/chosen": -307.6580810546875, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -480.70794677734375, "loss": 1.1087, "margin_dpo/margin_mean": 157.5745849609375, "margin_dpo/margin_std": 195.82992553710938, "step": 306 }, { "KL/chosen_KL_mean": -217.2437286376953, "KL/mean": -297.4852294921875, "KL/rejected_KL_mean": -377.72674560546875, "KL/std": 166.21441650390625, "epoch": 0.45080763582966227, "fcm_dpo/beta": 0.002255768049508333, "fcm_dpo/delta": 0.03941379487514496, "fcm_dpo/margin": 160.48297119140625, "fcm_dpo/q_t": 0.415368914604187, "grad_norm": 28.697053909301758, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.4190334677696228, "logits/rejected": -0.400789737701416, "logps/chosen": -287.4091491699219, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -464.69903564453125, "loss": 1.1073, "margin_dpo/margin_mean": 160.48297119140625, "margin_dpo/margin_std": 204.71377563476562, "step": 307 }, { "KL/chosen_KL_mean": -227.00399780273438, "KL/mean": -317.9705810546875, "KL/rejected_KL_mean": -408.9371337890625, "KL/std": 191.77737426757812, "epoch": 0.4522760646108664, "fcm_dpo/beta": 0.002262132242321968, "fcm_dpo/delta": -0.012053810060024261, "fcm_dpo/margin": 181.93313598632812, "fcm_dpo/q_t": 0.40270644426345825, "grad_norm": 32.00320053100586, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.40174388885498047, "logits/rejected": -0.3911542594432831, "logps/chosen": -282.2489929199219, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -488.3094177246094, "loss": 1.0594, "margin_dpo/margin_mean": 181.93313598632812, "margin_dpo/margin_std": 186.53610229492188, "step": 308 }, { "KL/chosen_KL_mean": -235.02227783203125, "KL/mean": -328.88671875, "KL/rejected_KL_mean": -422.7511291503906, "KL/std": 208.54153442382812, "epoch": 0.45374449339207046, "fcm_dpo/beta": 0.0022613410837948322, "fcm_dpo/delta": -0.025947626680135727, "fcm_dpo/margin": 187.72882080078125, "fcm_dpo/q_t": 0.40044116973876953, "grad_norm": 36.268280029296875, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.3440445065498352, "logits/rejected": -0.3352274000644684, "logps/chosen": -283.98138427734375, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -505.09185791015625, "loss": 1.067, "margin_dpo/margin_mean": 187.72882080078125, "margin_dpo/margin_std": 219.98861694335938, "step": 309 }, { "KL/chosen_KL_mean": -275.0888671875, "KL/mean": -361.7545471191406, "KL/rejected_KL_mean": -448.42022705078125, "KL/std": 183.4505157470703, "epoch": 0.4552129221732746, "fcm_dpo/beta": 0.002258842345327139, "fcm_dpo/delta": 0.008499890565872192, "fcm_dpo/margin": 173.33135986328125, "fcm_dpo/q_t": 0.40748023986816406, "grad_norm": 27.633638381958008, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.3811958432197571, "logits/rejected": -0.36551085114479065, "logps/chosen": -337.8306579589844, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -528.3505249023438, "loss": 1.0746, "margin_dpo/margin_mean": 173.33135986328125, "margin_dpo/margin_std": 180.37374877929688, "step": 310 }, { "KL/chosen_KL_mean": -287.706787109375, "KL/mean": -388.335205078125, "KL/rejected_KL_mean": -488.963623046875, "KL/std": 243.3779296875, "epoch": 0.4566813509544787, "fcm_dpo/beta": 0.0022331131622195244, "fcm_dpo/delta": -0.05178193002939224, "fcm_dpo/margin": 201.25680541992188, "fcm_dpo/q_t": 0.3981863260269165, "grad_norm": 27.40664291381836, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.3802347779273987, "logits/rejected": -0.35250845551490784, "logps/chosen": -340.7347717285156, "logps/ref_chosen": -53.02798080444336, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -566.40185546875, "loss": 1.0617, "margin_dpo/margin_mean": 201.25680541992188, "margin_dpo/margin_std": 256.835693359375, "step": 311 }, { "KL/chosen_KL_mean": -276.9559326171875, "KL/mean": -367.42620849609375, "KL/rejected_KL_mean": -457.896484375, "KL/std": 211.4532470703125, "epoch": 0.4581497797356828, "fcm_dpo/beta": 0.002234598621726036, "fcm_dpo/delta": -0.004756327718496323, "fcm_dpo/margin": 180.94058227539062, "fcm_dpo/q_t": 0.4089137613773346, "grad_norm": 25.603744506835938, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.3543195128440857, "logits/rejected": -0.34367918968200684, "logps/chosen": -326.34814453125, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280853271484, "logps/rejected": -533.6893310546875, "loss": 1.0944, "margin_dpo/margin_mean": 180.94058227539062, "margin_dpo/margin_std": 252.2486114501953, "step": 312 }, { "KL/chosen_KL_mean": -258.70135498046875, "KL/mean": -349.859619140625, "KL/rejected_KL_mean": -441.01788330078125, "KL/std": 218.23260498046875, "epoch": 0.45961820851688695, "fcm_dpo/beta": 0.0022380563896149397, "fcm_dpo/delta": -0.009237736463546753, "fcm_dpo/margin": 182.31651306152344, "fcm_dpo/q_t": 0.4060678482055664, "grad_norm": 25.151771545410156, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.40875959396362305, "logits/rejected": -0.40443694591522217, "logps/chosen": -308.8540954589844, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -527.424072265625, "loss": 1.0913, "margin_dpo/margin_mean": 182.31651306152344, "margin_dpo/margin_std": 241.20245361328125, "step": 313 }, { "KL/chosen_KL_mean": -273.3019714355469, "KL/mean": -359.92864990234375, "KL/rejected_KL_mean": -446.5552978515625, "KL/std": 209.75840759277344, "epoch": 0.461086637298091, "fcm_dpo/beta": 0.002218043664470315, "fcm_dpo/delta": 0.01605740562081337, "fcm_dpo/margin": 173.25335693359375, "fcm_dpo/q_t": 0.4140198826789856, "grad_norm": 21.262128829956055, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.4432973861694336, "logits/rejected": -0.44494277238845825, "logps/chosen": -330.53955078125, "logps/ref_chosen": -57.237579345703125, "logps/ref_rejected": -97.5965347290039, "logps/rejected": -544.15185546875, "loss": 1.1216, "margin_dpo/margin_mean": 173.25335693359375, "margin_dpo/margin_std": 268.62567138671875, "step": 314 }, { "KL/chosen_KL_mean": -246.94622802734375, "KL/mean": -326.3898620605469, "KL/rejected_KL_mean": -405.83349609375, "KL/std": 184.21778869628906, "epoch": 0.46255506607929514, "fcm_dpo/beta": 0.0022414117120206356, "fcm_dpo/delta": 0.04551296681165695, "fcm_dpo/margin": 158.88729858398438, "fcm_dpo/q_t": 0.41757166385650635, "grad_norm": 21.051586151123047, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.3383270502090454, "logits/rejected": -0.33996307849884033, "logps/chosen": -296.01580810546875, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -491.5143737792969, "loss": 1.1129, "margin_dpo/margin_mean": 158.88729858398438, "margin_dpo/margin_std": 208.52919006347656, "step": 315 }, { "KL/chosen_KL_mean": -247.3397216796875, "KL/mean": -359.8223876953125, "KL/rejected_KL_mean": -472.3050537109375, "KL/std": 221.7631378173828, "epoch": 0.46402349486049926, "fcm_dpo/beta": 0.002204576972872019, "fcm_dpo/delta": -0.10198242217302322, "fcm_dpo/margin": 224.96533203125, "fcm_dpo/q_t": 0.386934369802475, "grad_norm": 26.301368713378906, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.37936335802078247, "logits/rejected": -0.38319075107574463, "logps/chosen": -301.6004638671875, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -573.5865478515625, "loss": 1.0184, "margin_dpo/margin_mean": 224.96531677246094, "margin_dpo/margin_std": 245.899169921875, "step": 316 }, { "KL/chosen_KL_mean": -254.56671142578125, "KL/mean": -377.3856201171875, "KL/rejected_KL_mean": -500.2044372558594, "KL/std": 204.3238525390625, "epoch": 0.4654919236417034, "fcm_dpo/beta": 0.0021673087030649185, "fcm_dpo/delta": -0.1395837366580963, "fcm_dpo/margin": 245.63772583007812, "fcm_dpo/q_t": 0.3760732412338257, "grad_norm": 27.939563751220703, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.36965489387512207, "logits/rejected": -0.3580781817436218, "logps/chosen": -310.66094970703125, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -600.9035034179688, "loss": 0.9823, "margin_dpo/margin_mean": 245.6377410888672, "margin_dpo/margin_std": 226.56988525390625, "step": 317 }, { "KL/chosen_KL_mean": -279.697265625, "KL/mean": -372.810546875, "KL/rejected_KL_mean": -465.92388916015625, "KL/std": 213.03033447265625, "epoch": 0.4669603524229075, "fcm_dpo/beta": 0.002148838248103857, "fcm_dpo/delta": -0.0003134552389383316, "fcm_dpo/margin": 186.2266387939453, "fcm_dpo/q_t": 0.4076169729232788, "grad_norm": 25.99791717529297, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.4427594542503357, "logits/rejected": -0.4159233570098877, "logps/chosen": -344.34295654296875, "logps/ref_chosen": -64.64569854736328, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -548.6881103515625, "loss": 1.0842, "margin_dpo/margin_mean": 186.2266387939453, "margin_dpo/margin_std": 232.8768310546875, "step": 318 }, { "KL/chosen_KL_mean": -254.43267822265625, "KL/mean": -364.23388671875, "KL/rejected_KL_mean": -474.03509521484375, "KL/std": 231.52847290039062, "epoch": 0.4684287812041116, "fcm_dpo/beta": 0.002122014295309782, "fcm_dpo/delta": -0.06920456886291504, "fcm_dpo/margin": 219.60238647460938, "fcm_dpo/q_t": 0.3920608162879944, "grad_norm": 22.634004592895508, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.40253257751464844, "logits/rejected": -0.41706568002700806, "logps/chosen": -303.81646728515625, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -587.9415893554688, "loss": 1.04, "margin_dpo/margin_mean": 219.60238647460938, "margin_dpo/margin_std": 253.1864013671875, "step": 319 }, { "KL/chosen_KL_mean": -270.4678955078125, "KL/mean": -391.669677734375, "KL/rejected_KL_mean": -512.8714599609375, "KL/std": 244.3293914794922, "epoch": 0.4698972099853157, "fcm_dpo/beta": 0.002073149662464857, "fcm_dpo/delta": -0.10854032635688782, "fcm_dpo/margin": 242.403564453125, "fcm_dpo/q_t": 0.38467395305633545, "grad_norm": 26.852354049682617, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.37782442569732666, "logits/rejected": -0.3842761516571045, "logps/chosen": -329.9727783203125, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66717529296875, "logps/rejected": -610.5386352539062, "loss": 1.0125, "margin_dpo/margin_mean": 242.403564453125, "margin_dpo/margin_std": 261.4935607910156, "step": 320 }, { "KL/chosen_KL_mean": -335.5428771972656, "KL/mean": -437.40179443359375, "KL/rejected_KL_mean": -539.2606201171875, "KL/std": 246.63409423828125, "epoch": 0.4713656387665198, "fcm_dpo/beta": 0.002050905954092741, "fcm_dpo/delta": -0.019337691366672516, "fcm_dpo/margin": 203.71778869628906, "fcm_dpo/q_t": 0.4041670560836792, "grad_norm": 25.74936294555664, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.3634049594402313, "logits/rejected": -0.35541000962257385, "logps/chosen": -397.091552734375, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -630.9017333984375, "loss": 1.0823, "margin_dpo/margin_mean": 203.71778869628906, "margin_dpo/margin_std": 262.6986083984375, "step": 321 }, { "KL/chosen_KL_mean": -293.60589599609375, "KL/mean": -406.685302734375, "KL/rejected_KL_mean": -519.7646484375, "KL/std": 222.54376220703125, "epoch": 0.47283406754772395, "fcm_dpo/beta": 0.0020338515751063824, "fcm_dpo/delta": -0.0633477047085762, "fcm_dpo/margin": 226.15872192382812, "fcm_dpo/q_t": 0.39247822761535645, "grad_norm": 21.486406326293945, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.3742543160915375, "logits/rejected": -0.37500399351119995, "logps/chosen": -350.89593505859375, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -615.5145263671875, "loss": 1.0333, "margin_dpo/margin_mean": 226.15872192382812, "margin_dpo/margin_std": 235.21531677246094, "step": 322 }, { "KL/chosen_KL_mean": -314.616943359375, "KL/mean": -410.58935546875, "KL/rejected_KL_mean": -506.56170654296875, "KL/std": 222.89964294433594, "epoch": 0.47430249632892807, "fcm_dpo/beta": 0.002034769393503666, "fcm_dpo/delta": 0.009797626174986362, "fcm_dpo/margin": 191.94479370117188, "fcm_dpo/q_t": 0.4092080295085907, "grad_norm": 40.52562713623047, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.39556217193603516, "logits/rejected": -0.3936944603919983, "logps/chosen": -365.85089111328125, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -581.6236572265625, "loss": 1.094, "margin_dpo/margin_mean": 191.94479370117188, "margin_dpo/margin_std": 242.95655822753906, "step": 323 }, { "KL/chosen_KL_mean": -364.46820068359375, "KL/mean": -454.6254577636719, "KL/rejected_KL_mean": -544.78271484375, "KL/std": 239.51873779296875, "epoch": 0.47577092511013214, "fcm_dpo/beta": 0.0020614464301615953, "fcm_dpo/delta": 0.0281895250082016, "fcm_dpo/margin": 180.3145294189453, "fcm_dpo/q_t": 0.41647300124168396, "grad_norm": 45.35006332397461, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.3872758746147156, "logits/rejected": -0.3733510971069336, "logps/chosen": -429.6033630371094, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750854492188, "logps/rejected": -631.26025390625, "loss": 1.1314, "margin_dpo/margin_mean": 180.3145294189453, "margin_dpo/margin_std": 275.48095703125, "step": 324 }, { "KL/chosen_KL_mean": -282.35870361328125, "KL/mean": -360.821533203125, "KL/rejected_KL_mean": -439.28436279296875, "KL/std": 208.74551391601562, "epoch": 0.47723935389133626, "fcm_dpo/beta": 0.0020753461867570877, "fcm_dpo/delta": 0.076748326420784, "fcm_dpo/margin": 156.92562866210938, "fcm_dpo/q_t": 0.42378658056259155, "grad_norm": 30.042072296142578, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.44832050800323486, "logits/rejected": -0.43317437171936035, "logps/chosen": -338.57427978515625, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.08592987060547, "logps/rejected": -509.37030029296875, "loss": 1.1354, "margin_dpo/margin_mean": 156.92564392089844, "margin_dpo/margin_std": 211.30374145507812, "step": 325 }, { "KL/chosen_KL_mean": -312.6626281738281, "KL/mean": -387.2479553222656, "KL/rejected_KL_mean": -461.83331298828125, "KL/std": 196.3777618408203, "epoch": 0.4787077826725404, "fcm_dpo/beta": 0.002105048391968012, "fcm_dpo/delta": 0.08887322247028351, "fcm_dpo/margin": 149.17068481445312, "fcm_dpo/q_t": 0.4256167709827423, "grad_norm": 45.18388748168945, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.4697904884815216, "logits/rejected": -0.45466452836990356, "logps/chosen": -385.3875732421875, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.8467788696289, "logps/rejected": -541.6800537109375, "loss": 1.1383, "margin_dpo/margin_mean": 149.17068481445312, "margin_dpo/margin_std": 190.10052490234375, "step": 326 }, { "KL/chosen_KL_mean": -267.8811950683594, "KL/mean": -382.8114013671875, "KL/rejected_KL_mean": -497.74163818359375, "KL/std": 212.30575561523438, "epoch": 0.4801762114537445, "fcm_dpo/beta": 0.0020871213637292385, "fcm_dpo/delta": -0.08413384854793549, "fcm_dpo/margin": 229.8604278564453, "fcm_dpo/q_t": 0.38719016313552856, "grad_norm": 40.068824768066406, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.419431209564209, "logits/rejected": -0.40632164478302, "logps/chosen": -337.015625, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -609.6754150390625, "loss": 1.0185, "margin_dpo/margin_mean": 229.86044311523438, "margin_dpo/margin_std": 231.85646057128906, "step": 327 }, { "KL/chosen_KL_mean": -283.38665771484375, "KL/mean": -388.9765930175781, "KL/rejected_KL_mean": -494.5665283203125, "KL/std": 235.5018310546875, "epoch": 0.48164464023494863, "fcm_dpo/beta": 0.002073537092655897, "fcm_dpo/delta": -0.03960520401597023, "fcm_dpo/margin": 211.17982482910156, "fcm_dpo/q_t": 0.4000805914402008, "grad_norm": 26.695911407470703, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.40584391355514526, "logits/rejected": -0.4096938371658325, "logps/chosen": -343.0738525390625, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -585.4215087890625, "loss": 1.0631, "margin_dpo/margin_mean": 211.17984008789062, "margin_dpo/margin_std": 262.86199951171875, "step": 328 }, { "KL/chosen_KL_mean": -315.77923583984375, "KL/mean": -425.44476318359375, "KL/rejected_KL_mean": -535.1102294921875, "KL/std": 248.7440185546875, "epoch": 0.4831130690161527, "fcm_dpo/beta": 0.0020416276529431343, "fcm_dpo/delta": -0.05077539384365082, "fcm_dpo/margin": 219.33106994628906, "fcm_dpo/q_t": 0.3970402479171753, "grad_norm": 27.266719818115234, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.39109185338020325, "logits/rejected": -0.37896549701690674, "logps/chosen": -381.0254211425781, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -635.8079833984375, "loss": 1.0669, "margin_dpo/margin_mean": 219.33106994628906, "margin_dpo/margin_std": 276.76806640625, "step": 329 }, { "KL/chosen_KL_mean": -271.9311828613281, "KL/mean": -387.9265441894531, "KL/rejected_KL_mean": -503.921875, "KL/std": 245.21937561035156, "epoch": 0.4845814977973568, "fcm_dpo/beta": 0.002023911103606224, "fcm_dpo/delta": -0.07296737283468246, "fcm_dpo/margin": 231.99075317382812, "fcm_dpo/q_t": 0.39241012930870056, "grad_norm": 22.988527297973633, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.38984963297843933, "logits/rejected": -0.3909507393836975, "logps/chosen": -318.92950439453125, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -590.7987060546875, "loss": 1.0401, "margin_dpo/margin_mean": 231.99075317382812, "margin_dpo/margin_std": 271.9549560546875, "step": 330 }, { "KL/chosen_KL_mean": -292.27081298828125, "KL/mean": -406.2383117675781, "KL/rejected_KL_mean": -520.205810546875, "KL/std": 200.16488647460938, "epoch": 0.48604992657856094, "fcm_dpo/beta": 0.001998601946979761, "fcm_dpo/delta": -0.058225952088832855, "fcm_dpo/margin": 227.9350128173828, "fcm_dpo/q_t": 0.391870379447937, "grad_norm": 29.254379272460938, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.4170438051223755, "logits/rejected": -0.41196513175964355, "logps/chosen": -342.7950744628906, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -609.2212524414062, "loss": 1.0166, "margin_dpo/margin_mean": 227.9350128173828, "margin_dpo/margin_std": 194.46043395996094, "step": 331 }, { "KL/chosen_KL_mean": -294.25115966796875, "KL/mean": -377.06805419921875, "KL/rejected_KL_mean": -459.8848876953125, "KL/std": 212.61550903320312, "epoch": 0.48751835535976507, "fcm_dpo/beta": 0.002016157377511263, "fcm_dpo/delta": 0.06822776794433594, "fcm_dpo/margin": 165.63372802734375, "fcm_dpo/q_t": 0.4236387014389038, "grad_norm": 23.76082420349121, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.4166898727416992, "logits/rejected": -0.39659583568573, "logps/chosen": -343.43145751953125, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -536.3699951171875, "loss": 1.1333, "margin_dpo/margin_mean": 165.63372802734375, "margin_dpo/margin_std": 237.92779541015625, "step": 332 }, { "KL/chosen_KL_mean": -314.040771484375, "KL/mean": -421.1554870605469, "KL/rejected_KL_mean": -528.270263671875, "KL/std": 245.66607666015625, "epoch": 0.4889867841409692, "fcm_dpo/beta": 0.0019987255800515413, "fcm_dpo/delta": -0.030487176030874252, "fcm_dpo/margin": 214.22947692871094, "fcm_dpo/q_t": 0.40265339612960815, "grad_norm": 21.290910720825195, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.414547860622406, "logits/rejected": -0.40775951743125916, "logps/chosen": -377.7965087890625, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -623.3143310546875, "loss": 1.0872, "margin_dpo/margin_mean": 214.22946166992188, "margin_dpo/margin_std": 293.54949951171875, "step": 333 }, { "KL/chosen_KL_mean": -301.93621826171875, "KL/mean": -397.41796875, "KL/rejected_KL_mean": -492.8997802734375, "KL/std": 265.2902526855469, "epoch": 0.49045521292217326, "fcm_dpo/beta": 0.0020114602521061897, "fcm_dpo/delta": 0.016515735536813736, "fcm_dpo/margin": 190.96356201171875, "fcm_dpo/q_t": 0.4135018587112427, "grad_norm": 25.199310302734375, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.4625867009162903, "logits/rejected": -0.4604346752166748, "logps/chosen": -368.91595458984375, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -588.2166748046875, "loss": 1.1255, "margin_dpo/margin_mean": 190.9635467529297, "margin_dpo/margin_std": 301.6986389160156, "step": 334 }, { "KL/chosen_KL_mean": -332.12335205078125, "KL/mean": -407.17266845703125, "KL/rejected_KL_mean": -482.2219543457031, "KL/std": 248.314453125, "epoch": 0.4919236417033774, "fcm_dpo/beta": 0.0020141825079917908, "fcm_dpo/delta": -0.004568679258227348, "fcm_dpo/margin": 150.0985870361328, "fcm_dpo/q_t": 0.4313260614871979, "grad_norm": 32.44525146484375, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.43714985251426697, "logits/rejected": -0.4139357805252075, "logps/chosen": -394.66583251953125, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.61770629882812, "logps/rejected": -569.8396606445312, "loss": 1.1788, "margin_dpo/margin_mean": 150.0985870361328, "margin_dpo/margin_std": 268.7898254394531, "step": 335 }, { "KL/chosen_KL_mean": -334.3876647949219, "KL/mean": -455.1687927246094, "KL/rejected_KL_mean": -575.949951171875, "KL/std": 287.1412353515625, "epoch": 0.4933920704845815, "fcm_dpo/beta": 0.0019955080933868885, "fcm_dpo/delta": -0.08617211878299713, "fcm_dpo/margin": 241.56228637695312, "fcm_dpo/q_t": 0.39297786355018616, "grad_norm": 29.619176864624023, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.399771511554718, "logits/rejected": -0.3994015157222748, "logps/chosen": -388.9188232421875, "logps/ref_chosen": -54.53115463256836, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -680.3541870117188, "loss": 1.0672, "margin_dpo/margin_mean": 241.56228637695312, "margin_dpo/margin_std": 339.20538330078125, "step": 336 }, { "KL/chosen_KL_mean": -299.7122802734375, "KL/mean": -419.55731201171875, "KL/rejected_KL_mean": -539.40234375, "KL/std": 230.1120147705078, "epoch": 0.4948604992657856, "fcm_dpo/beta": 0.0019532032310962677, "fcm_dpo/delta": -0.07179627567529678, "fcm_dpo/margin": 239.69003295898438, "fcm_dpo/q_t": 0.3924025893211365, "grad_norm": 31.691265106201172, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.44581082463264465, "logits/rejected": -0.4338313341140747, "logps/chosen": -364.8409729003906, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -641.1293334960938, "loss": 1.0365, "margin_dpo/margin_mean": 239.69003295898438, "margin_dpo/margin_std": 272.0997314453125, "step": 337 }, { "KL/chosen_KL_mean": -265.2712707519531, "KL/mean": -381.5762634277344, "KL/rejected_KL_mean": -497.88128662109375, "KL/std": 221.42172241210938, "epoch": 0.49632892804698975, "fcm_dpo/beta": 0.0019332608208060265, "fcm_dpo/delta": -0.052127446979284286, "fcm_dpo/margin": 232.6099853515625, "fcm_dpo/q_t": 0.3942795991897583, "grad_norm": 33.03255081176758, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.44652998447418213, "logits/rejected": -0.4394975006580353, "logps/chosen": -323.6939697265625, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -586.9498291015625, "loss": 1.0325, "margin_dpo/margin_mean": 232.6099853515625, "margin_dpo/margin_std": 229.81480407714844, "step": 338 }, { "KL/chosen_KL_mean": -288.043701171875, "KL/mean": -402.6147155761719, "KL/rejected_KL_mean": -517.1857299804688, "KL/std": 232.51951599121094, "epoch": 0.4977973568281938, "fcm_dpo/beta": 0.0019332109950482845, "fcm_dpo/delta": -0.045980703085660934, "fcm_dpo/margin": 229.14202880859375, "fcm_dpo/q_t": 0.3964860439300537, "grad_norm": 26.379383087158203, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.4995361864566803, "logits/rejected": -0.486974835395813, "logps/chosen": -348.03900146484375, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -621.0966186523438, "loss": 1.0459, "margin_dpo/margin_mean": 229.14202880859375, "margin_dpo/margin_std": 232.25975036621094, "step": 339 }, { "KL/chosen_KL_mean": -305.80303955078125, "KL/mean": -403.6094665527344, "KL/rejected_KL_mean": -501.4158935546875, "KL/std": 221.99676513671875, "epoch": 0.49926578560939794, "fcm_dpo/beta": 0.001913035404868424, "fcm_dpo/delta": 0.026591314002871513, "fcm_dpo/margin": 195.61285400390625, "fcm_dpo/q_t": 0.4138457477092743, "grad_norm": 24.018463134765625, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.41224145889282227, "logits/rejected": -0.3901046812534332, "logps/chosen": -358.63323974609375, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723114013672, "logps/rejected": -574.5231323242188, "loss": 1.1118, "margin_dpo/margin_mean": 195.61285400390625, "margin_dpo/margin_std": 273.03375244140625, "step": 340 }, { "KL/chosen_KL_mean": -299.90802001953125, "KL/mean": -412.6207580566406, "KL/rejected_KL_mean": -525.3335571289062, "KL/std": 242.2279510498047, "epoch": 0.5007342143906021, "fcm_dpo/beta": 0.0019069017143920064, "fcm_dpo/delta": -0.03159831091761589, "fcm_dpo/margin": 225.42550659179688, "fcm_dpo/q_t": 0.4006652235984802, "grad_norm": 25.791650772094727, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.40798383951187134, "logits/rejected": -0.4071720838546753, "logps/chosen": -347.8078308105469, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -627.1434326171875, "loss": 1.0601, "margin_dpo/margin_mean": 225.42550659179688, "margin_dpo/margin_std": 256.30413818359375, "step": 341 }, { "KL/chosen_KL_mean": -308.8389892578125, "KL/mean": -408.7991638183594, "KL/rejected_KL_mean": -508.7593078613281, "KL/std": 230.74411010742188, "epoch": 0.5022026431718062, "fcm_dpo/beta": 0.001922906143590808, "fcm_dpo/delta": 0.01581621915102005, "fcm_dpo/margin": 199.9203338623047, "fcm_dpo/q_t": 0.4113759696483612, "grad_norm": 25.002784729003906, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.4692263603210449, "logits/rejected": -0.44338518381118774, "logps/chosen": -380.83563232421875, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -601.348876953125, "loss": 1.1027, "margin_dpo/margin_mean": 199.92034912109375, "margin_dpo/margin_std": 271.4499206542969, "step": 342 }, { "KL/chosen_KL_mean": -296.18927001953125, "KL/mean": -417.68475341796875, "KL/rejected_KL_mean": -539.18017578125, "KL/std": 236.11985778808594, "epoch": 0.5036710719530103, "fcm_dpo/beta": 0.001898743212223053, "fcm_dpo/delta": -0.06434239447116852, "fcm_dpo/margin": 242.99090576171875, "fcm_dpo/q_t": 0.39045825600624084, "grad_norm": 23.37291145324707, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.4716584086418152, "logits/rejected": -0.4760972261428833, "logps/chosen": -350.59490966796875, "logps/ref_chosen": -54.405616760253906, "logps/ref_rejected": -111.04142761230469, "logps/rejected": -650.2216186523438, "loss": 1.0162, "margin_dpo/margin_mean": 242.99090576171875, "margin_dpo/margin_std": 209.3662109375, "step": 343 }, { "KL/chosen_KL_mean": -303.44384765625, "KL/mean": -406.74298095703125, "KL/rejected_KL_mean": -510.04217529296875, "KL/std": 256.48602294921875, "epoch": 0.5051395007342144, "fcm_dpo/beta": 0.0019109161803498864, "fcm_dpo/delta": 0.0036756470799446106, "fcm_dpo/margin": 206.59829711914062, "fcm_dpo/q_t": 0.41091352701187134, "grad_norm": 31.523855209350586, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.4483766555786133, "logits/rejected": -0.45180875062942505, "logps/chosen": -357.40850830078125, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -600.66552734375, "loss": 1.0951, "margin_dpo/margin_mean": 206.59829711914062, "margin_dpo/margin_std": 264.9103698730469, "step": 344 }, { "KL/chosen_KL_mean": -361.2244567871094, "KL/mean": -478.6163024902344, "KL/rejected_KL_mean": -596.0081787109375, "KL/std": 251.1361083984375, "epoch": 0.5066079295154186, "fcm_dpo/beta": 0.0018856715178117156, "fcm_dpo/delta": -0.04471251741051674, "fcm_dpo/margin": 234.78369140625, "fcm_dpo/q_t": 0.3976198434829712, "grad_norm": 22.779315948486328, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.4647873342037201, "logits/rejected": -0.4615704417228699, "logps/chosen": -422.91015625, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49041748046875, "logps/rejected": -695.4985961914062, "loss": 1.0557, "margin_dpo/margin_mean": 234.78372192382812, "margin_dpo/margin_std": 277.4552307128906, "step": 345 }, { "KL/chosen_KL_mean": -358.7704162597656, "KL/mean": -470.405517578125, "KL/rejected_KL_mean": -582.0405883789062, "KL/std": 246.00839233398438, "epoch": 0.5080763582966226, "fcm_dpo/beta": 0.0018714326433837414, "fcm_dpo/delta": -0.018616080284118652, "fcm_dpo/margin": 223.2701873779297, "fcm_dpo/q_t": 0.4019482135772705, "grad_norm": 22.303312301635742, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.42917925119400024, "logits/rejected": -0.41765835881233215, "logps/chosen": -414.02667236328125, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -659.4559326171875, "loss": 1.0682, "margin_dpo/margin_mean": 223.27017211914062, "margin_dpo/margin_std": 262.06787109375, "step": 346 }, { "KL/chosen_KL_mean": -364.54046630859375, "KL/mean": -475.5876770019531, "KL/rejected_KL_mean": -586.6348876953125, "KL/std": 260.4476318359375, "epoch": 0.5095447870778267, "fcm_dpo/beta": 0.001866616541519761, "fcm_dpo/delta": -0.015192577615380287, "fcm_dpo/margin": 222.09446716308594, "fcm_dpo/q_t": 0.4048117995262146, "grad_norm": 26.2041015625, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.4826762080192566, "logits/rejected": -0.47840413451194763, "logps/chosen": -422.106689453125, "logps/ref_chosen": -57.56623840332031, "logps/ref_rejected": -92.35509490966797, "logps/rejected": -678.989990234375, "loss": 1.0814, "margin_dpo/margin_mean": 222.094482421875, "margin_dpo/margin_std": 288.15325927734375, "step": 347 }, { "KL/chosen_KL_mean": -328.40960693359375, "KL/mean": -422.31512451171875, "KL/rejected_KL_mean": -516.2205810546875, "KL/std": 226.09664916992188, "epoch": 0.5110132158590308, "fcm_dpo/beta": 0.0018670517019927502, "fcm_dpo/delta": 0.05069158226251602, "fcm_dpo/margin": 187.81101989746094, "fcm_dpo/q_t": 0.41940367221832275, "grad_norm": 27.389596939086914, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.47834187746047974, "logits/rejected": -0.47976285219192505, "logps/chosen": -384.727294921875, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13836669921875, "logps/rejected": -605.3590087890625, "loss": 1.1255, "margin_dpo/margin_mean": 187.81101989746094, "margin_dpo/margin_std": 261.6010437011719, "step": 348 }, { "KL/chosen_KL_mean": -309.04058837890625, "KL/mean": -417.7922668457031, "KL/rejected_KL_mean": -526.5439453125, "KL/std": 246.7399139404297, "epoch": 0.5124816446402349, "fcm_dpo/beta": 0.0018880900461226702, "fcm_dpo/delta": -0.011443812400102615, "fcm_dpo/margin": 217.50335693359375, "fcm_dpo/q_t": 0.4071810245513916, "grad_norm": 21.567838668823242, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.48450082540512085, "logits/rejected": -0.4876035153865814, "logps/chosen": -367.06610107421875, "logps/ref_chosen": -58.025516510009766, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -624.0491333007812, "loss": 1.0928, "margin_dpo/margin_mean": 217.50335693359375, "margin_dpo/margin_std": 300.6847839355469, "step": 349 }, { "KL/chosen_KL_mean": -325.901611328125, "KL/mean": -431.23809814453125, "KL/rejected_KL_mean": -536.5746459960938, "KL/std": 234.06008911132812, "epoch": 0.5139500734214391, "fcm_dpo/beta": 0.0018717560451477766, "fcm_dpo/delta": 0.00548534095287323, "fcm_dpo/margin": 210.67298889160156, "fcm_dpo/q_t": 0.40805143117904663, "grad_norm": 34.51575469970703, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.5076397657394409, "logits/rejected": -0.510471761226654, "logps/chosen": -390.2320861816406, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -626.4462890625, "loss": 1.0998, "margin_dpo/margin_mean": 210.67300415039062, "margin_dpo/margin_std": 285.7344055175781, "step": 350 }, { "KL/chosen_KL_mean": -306.2958984375, "KL/mean": -430.8505859375, "KL/rejected_KL_mean": -555.4052734375, "KL/std": 270.62652587890625, "epoch": 0.5154185022026432, "fcm_dpo/beta": 0.0018583099590614438, "fcm_dpo/delta": -0.06623544543981552, "fcm_dpo/margin": 249.109375, "fcm_dpo/q_t": 0.39529091119766235, "grad_norm": 33.712581634521484, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.5161840915679932, "logits/rejected": -0.5157532095909119, "logps/chosen": -366.968017578125, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -656.970703125, "loss": 1.0536, "margin_dpo/margin_mean": 249.109375, "margin_dpo/margin_std": 312.37847900390625, "step": 351 }, { "KL/chosen_KL_mean": -353.2392578125, "KL/mean": -434.8507385253906, "KL/rejected_KL_mean": -516.4622802734375, "KL/std": 244.72021484375, "epoch": 0.5168869309838473, "fcm_dpo/beta": 0.0018763558473438025, "fcm_dpo/delta": 0.09677629917860031, "fcm_dpo/margin": 163.22299194335938, "fcm_dpo/q_t": 0.429553359746933, "grad_norm": 34.96100997924805, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.470248281955719, "logits/rejected": -0.4367384910583496, "logps/chosen": -424.18267822265625, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -593.104248046875, "loss": 1.1643, "margin_dpo/margin_mean": 163.22300720214844, "margin_dpo/margin_std": 265.39306640625, "step": 352 }, { "KL/chosen_KL_mean": -322.45953369140625, "KL/mean": -425.41156005859375, "KL/rejected_KL_mean": -528.363525390625, "KL/std": 244.48275756835938, "epoch": 0.5183553597650514, "fcm_dpo/beta": 0.0018868569750338793, "fcm_dpo/delta": 0.01154874637722969, "fcm_dpo/margin": 205.90402221679688, "fcm_dpo/q_t": 0.4107271432876587, "grad_norm": 26.08613395690918, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.5393311977386475, "logits/rejected": -0.5296196937561035, "logps/chosen": -380.8548583984375, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33553314208984, "logps/rejected": -608.6990966796875, "loss": 1.0991, "margin_dpo/margin_mean": 205.90402221679688, "margin_dpo/margin_std": 272.3714294433594, "step": 353 }, { "KL/chosen_KL_mean": -268.17083740234375, "KL/mean": -381.08599853515625, "KL/rejected_KL_mean": -494.00115966796875, "KL/std": 217.59808349609375, "epoch": 0.5198237885462555, "fcm_dpo/beta": 0.0018875201931223273, "fcm_dpo/delta": -0.02746821939945221, "fcm_dpo/margin": 225.830322265625, "fcm_dpo/q_t": 0.39921072125434875, "grad_norm": 40.203250885009766, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.49349963665008545, "logits/rejected": -0.4861293137073517, "logps/chosen": -327.97381591796875, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -582.7586669921875, "loss": 1.0442, "margin_dpo/margin_mean": 225.83033752441406, "margin_dpo/margin_std": 216.87001037597656, "step": 354 }, { "KL/chosen_KL_mean": -275.6778564453125, "KL/mean": -384.2642822265625, "KL/rejected_KL_mean": -492.8507080078125, "KL/std": 227.14535522460938, "epoch": 0.5212922173274597, "fcm_dpo/beta": 0.0018801202531903982, "fcm_dpo/delta": -0.00877285934984684, "fcm_dpo/margin": 217.17283630371094, "fcm_dpo/q_t": 0.40371406078338623, "grad_norm": 44.118072509765625, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.5707640647888184, "logits/rejected": -0.5644550323486328, "logps/chosen": -329.806396484375, "logps/ref_chosen": -54.12849807739258, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -575.2567749023438, "loss": 1.0634, "margin_dpo/margin_mean": 217.17282104492188, "margin_dpo/margin_std": 228.13082885742188, "step": 355 }, { "KL/chosen_KL_mean": -340.9484558105469, "KL/mean": -402.58343505859375, "KL/rejected_KL_mean": -464.2183837890625, "KL/std": 240.2225341796875, "epoch": 0.5227606461086637, "fcm_dpo/beta": 0.0018893997184932232, "fcm_dpo/delta": 0.03272160887718201, "fcm_dpo/margin": 123.2698974609375, "fcm_dpo/q_t": 0.44711410999298096, "grad_norm": 34.34591293334961, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.47087016701698303, "logits/rejected": -0.4466116726398468, "logps/chosen": -405.62225341796875, "logps/ref_chosen": -64.6738052368164, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -540.11767578125, "loss": 1.2434, "margin_dpo/margin_mean": 123.2698974609375, "margin_dpo/margin_std": 296.36102294921875, "step": 356 }, { "KL/chosen_KL_mean": -309.8677978515625, "KL/mean": -412.3357849121094, "KL/rejected_KL_mean": -514.8037109375, "KL/std": 240.03021240234375, "epoch": 0.5242290748898678, "fcm_dpo/beta": 0.0018900984432548285, "fcm_dpo/delta": 0.012749601155519485, "fcm_dpo/margin": 204.93594360351562, "fcm_dpo/q_t": 0.40922728180885315, "grad_norm": 30.455440521240234, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.5195204019546509, "logits/rejected": -0.5092127323150635, "logps/chosen": -362.5935974121094, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -601.6448974609375, "loss": 1.0905, "margin_dpo/margin_mean": 204.93594360351562, "margin_dpo/margin_std": 249.49462890625, "step": 357 }, { "KL/chosen_KL_mean": -282.8341369628906, "KL/mean": -379.4464111328125, "KL/rejected_KL_mean": -476.05865478515625, "KL/std": 233.26222229003906, "epoch": 0.5256975036710719, "fcm_dpo/beta": 0.0019095418974757195, "fcm_dpo/delta": 0.032218970358371735, "fcm_dpo/margin": 193.22451782226562, "fcm_dpo/q_t": 0.4149158000946045, "grad_norm": 24.4883975982666, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.5249518156051636, "logits/rejected": -0.5085197687149048, "logps/chosen": -346.0395812988281, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -564.4319458007812, "loss": 1.1049, "margin_dpo/margin_mean": 193.22451782226562, "margin_dpo/margin_std": 251.6090087890625, "step": 358 }, { "KL/chosen_KL_mean": -319.8917541503906, "KL/mean": -433.31768798828125, "KL/rejected_KL_mean": -546.74365234375, "KL/std": 235.4507293701172, "epoch": 0.527165932452276, "fcm_dpo/beta": 0.0019002794288098812, "fcm_dpo/delta": -0.03259321302175522, "fcm_dpo/margin": 226.85186767578125, "fcm_dpo/q_t": 0.4016547203063965, "grad_norm": 27.346670150756836, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.4491726756095886, "logits/rejected": -0.43380558490753174, "logps/chosen": -376.261962890625, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -628.9173583984375, "loss": 1.0712, "margin_dpo/margin_mean": 226.85186767578125, "margin_dpo/margin_std": 285.25946044921875, "step": 359 }, { "KL/chosen_KL_mean": -322.7139587402344, "KL/mean": -408.2466735839844, "KL/rejected_KL_mean": -493.77935791015625, "KL/std": 200.92642211914062, "epoch": 0.5286343612334802, "fcm_dpo/beta": 0.0019250914920121431, "fcm_dpo/delta": 0.07284688949584961, "fcm_dpo/margin": 171.06541442871094, "fcm_dpo/q_t": 0.4223863184452057, "grad_norm": 37.464805603027344, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.43620049953460693, "logits/rejected": -0.4205591678619385, "logps/chosen": -374.1743469238281, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892059326172, "logps/rejected": -563.6182861328125, "loss": 1.1224, "margin_dpo/margin_mean": 171.06541442871094, "margin_dpo/margin_std": 206.2929229736328, "step": 360 }, { "KL/chosen_KL_mean": -337.85443115234375, "KL/mean": -423.9326477050781, "KL/rejected_KL_mean": -510.0108337402344, "KL/std": 233.32354736328125, "epoch": 0.5301027900146843, "fcm_dpo/beta": 0.0019522447837516665, "fcm_dpo/delta": 0.06582384556531906, "fcm_dpo/margin": 172.15640258789062, "fcm_dpo/q_t": 0.4221458435058594, "grad_norm": 30.787109375, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.474801242351532, "logits/rejected": -0.47210389375686646, "logps/chosen": -391.72393798828125, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.7692642211914, "logps/rejected": -600.7801513671875, "loss": 1.1377, "margin_dpo/margin_mean": 172.15640258789062, "margin_dpo/margin_std": 255.71853637695312, "step": 361 }, { "KL/chosen_KL_mean": -293.02099609375, "KL/mean": -428.1585693359375, "KL/rejected_KL_mean": -563.296142578125, "KL/std": 246.54940795898438, "epoch": 0.5315712187958884, "fcm_dpo/beta": 0.0019115547183901072, "fcm_dpo/delta": -0.12366791069507599, "fcm_dpo/margin": 270.275146484375, "fcm_dpo/q_t": 0.3806150555610657, "grad_norm": 25.072677612304688, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.461614191532135, "logits/rejected": -0.471387654542923, "logps/chosen": -351.6600341796875, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -668.8780517578125, "loss": 0.9921, "margin_dpo/margin_mean": 270.275146484375, "margin_dpo/margin_std": 262.97430419921875, "step": 362 }, { "KL/chosen_KL_mean": -270.0954284667969, "KL/mean": -409.5333251953125, "KL/rejected_KL_mean": -548.97119140625, "KL/std": 249.90145874023438, "epoch": 0.5330396475770925, "fcm_dpo/beta": 0.0018741564126685262, "fcm_dpo/delta": -0.12923146784305573, "fcm_dpo/margin": 278.8757629394531, "fcm_dpo/q_t": 0.37928664684295654, "grad_norm": 25.539962768554688, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.45062029361724854, "logits/rejected": -0.445356547832489, "logps/chosen": -314.65380859375, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -623.6661376953125, "loss": 0.9901, "margin_dpo/margin_mean": 278.87579345703125, "margin_dpo/margin_std": 272.97113037109375, "step": 363 }, { "KL/chosen_KL_mean": -301.60052490234375, "KL/mean": -405.09051513671875, "KL/rejected_KL_mean": -508.58050537109375, "KL/std": 244.21636962890625, "epoch": 0.5345080763582967, "fcm_dpo/beta": 0.0018688710406422615, "fcm_dpo/delta": 0.013297256082296371, "fcm_dpo/margin": 206.97998046875, "fcm_dpo/q_t": 0.4103432595729828, "grad_norm": 24.92216682434082, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.49542540311813354, "logits/rejected": -0.5060637593269348, "logps/chosen": -350.4951171875, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -599.9762573242188, "loss": 1.1024, "margin_dpo/margin_mean": 206.97999572753906, "margin_dpo/margin_std": 280.64617919921875, "step": 364 }, { "KL/chosen_KL_mean": -285.0455322265625, "KL/mean": -395.47760009765625, "KL/rejected_KL_mean": -505.90972900390625, "KL/std": 253.05226135253906, "epoch": 0.5359765051395007, "fcm_dpo/beta": 0.0018601326737552881, "fcm_dpo/delta": -0.011300592683255672, "fcm_dpo/margin": 220.8641815185547, "fcm_dpo/q_t": 0.4056779444217682, "grad_norm": 25.717348098754883, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.44014662504196167, "logits/rejected": -0.45165252685546875, "logps/chosen": -336.53826904296875, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -598.6113891601562, "loss": 1.0716, "margin_dpo/margin_mean": 220.8641815185547, "margin_dpo/margin_std": 261.02288818359375, "step": 365 }, { "KL/chosen_KL_mean": -266.95123291015625, "KL/mean": -379.23828125, "KL/rejected_KL_mean": -491.52532958984375, "KL/std": 242.17172241210938, "epoch": 0.5374449339207048, "fcm_dpo/beta": 0.0018460990395396948, "fcm_dpo/delta": -0.015634853392839432, "fcm_dpo/margin": 224.57406616210938, "fcm_dpo/q_t": 0.40484321117401123, "grad_norm": 22.693359375, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.4734732210636139, "logits/rejected": -0.4902943968772888, "logps/chosen": -311.67181396484375, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -574.835693359375, "loss": 1.0809, "margin_dpo/margin_mean": 224.57406616210938, "margin_dpo/margin_std": 287.28973388671875, "step": 366 }, { "KL/chosen_KL_mean": -279.1624450683594, "KL/mean": -372.0902404785156, "KL/rejected_KL_mean": -465.01806640625, "KL/std": 217.56224060058594, "epoch": 0.5389133627019089, "fcm_dpo/beta": 0.0018712931778281927, "fcm_dpo/delta": 0.05387837439775467, "fcm_dpo/margin": 185.85562133789062, "fcm_dpo/q_t": 0.4180119037628174, "grad_norm": 22.13631820678711, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.47421082854270935, "logits/rejected": -0.45852982997894287, "logps/chosen": -337.56787109375, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -541.7694091796875, "loss": 1.1149, "margin_dpo/margin_mean": 185.85562133789062, "margin_dpo/margin_std": 234.53326416015625, "step": 367 }, { "KL/chosen_KL_mean": -260.6910400390625, "KL/mean": -406.54931640625, "KL/rejected_KL_mean": -552.4075317382812, "KL/std": 245.8842010498047, "epoch": 0.540381791483113, "fcm_dpo/beta": 0.0018334980122745037, "fcm_dpo/delta": -0.14245912432670593, "fcm_dpo/margin": 291.716552734375, "fcm_dpo/q_t": 0.37282127141952515, "grad_norm": 36.742767333984375, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.48603498935699463, "logits/rejected": -0.4929526448249817, "logps/chosen": -305.1435546875, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -650.9627685546875, "loss": 0.9603, "margin_dpo/margin_mean": 291.7165222167969, "margin_dpo/margin_std": 222.715087890625, "step": 368 }, { "KL/chosen_KL_mean": -353.1409606933594, "KL/mean": -428.58447265625, "KL/rejected_KL_mean": -504.0279235839844, "KL/std": 253.20993041992188, "epoch": 0.5418502202643172, "fcm_dpo/beta": 0.0018251673318445683, "fcm_dpo/delta": 0.02955966256558895, "fcm_dpo/margin": 150.8870086669922, "fcm_dpo/q_t": 0.43544578552246094, "grad_norm": 26.54815673828125, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.49702537059783936, "logits/rejected": -0.47865352034568787, "logps/chosen": -424.5224609375, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -595.32373046875, "loss": 1.1922, "margin_dpo/margin_mean": 150.88702392578125, "margin_dpo/margin_std": 279.23529052734375, "step": 369 }, { "KL/chosen_KL_mean": -370.0912170410156, "KL/mean": -444.4624938964844, "KL/rejected_KL_mean": -518.833740234375, "KL/std": 263.0101318359375, "epoch": 0.5433186490455213, "fcm_dpo/beta": 0.0018681371584534645, "fcm_dpo/delta": 0.1253683865070343, "fcm_dpo/margin": 148.7425537109375, "fcm_dpo/q_t": 0.43673175573349, "grad_norm": 31.06038475036621, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.5078903436660767, "logits/rejected": -0.5006515979766846, "logps/chosen": -441.69873046875, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -616.093505859375, "loss": 1.1989, "margin_dpo/margin_mean": 148.74256896972656, "margin_dpo/margin_std": 292.08331298828125, "step": 370 }, { "KL/chosen_KL_mean": -357.910888671875, "KL/mean": -464.66607666015625, "KL/rejected_KL_mean": -571.4212646484375, "KL/std": 263.6308288574219, "epoch": 0.5447870778267254, "fcm_dpo/beta": 0.0018815153744071722, "fcm_dpo/delta": -0.0019350722432136536, "fcm_dpo/margin": 213.51040649414062, "fcm_dpo/q_t": 0.40850624442100525, "grad_norm": 33.49449920654297, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.557892918586731, "logits/rejected": -0.5437754392623901, "logps/chosen": -427.32537841796875, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -670.593505859375, "loss": 1.0989, "margin_dpo/margin_mean": 213.51040649414062, "margin_dpo/margin_std": 299.0671081542969, "step": 371 }, { "KL/chosen_KL_mean": -353.14971923828125, "KL/mean": -480.6755676269531, "KL/rejected_KL_mean": -608.201416015625, "KL/std": 304.527099609375, "epoch": 0.5462555066079295, "fcm_dpo/beta": 0.0018529519438743591, "fcm_dpo/delta": -0.07631818950176239, "fcm_dpo/margin": 255.05174255371094, "fcm_dpo/q_t": 0.395152747631073, "grad_norm": 28.089242935180664, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.5017907619476318, "logits/rejected": -0.4822356402873993, "logps/chosen": -414.96771240234375, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53948974609375, "logps/rejected": -686.740966796875, "loss": 1.0575, "margin_dpo/margin_mean": 255.0517578125, "margin_dpo/margin_std": 344.2037353515625, "step": 372 }, { "KL/chosen_KL_mean": -380.80279541015625, "KL/mean": -503.22076416015625, "KL/rejected_KL_mean": -625.6387939453125, "KL/std": 292.56231689453125, "epoch": 0.5477239353891337, "fcm_dpo/beta": 0.001842833822593093, "fcm_dpo/delta": -0.0538918599486351, "fcm_dpo/margin": 244.8360137939453, "fcm_dpo/q_t": 0.39565837383270264, "grad_norm": 25.535263061523438, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.4881356954574585, "logits/rejected": -0.46633967757225037, "logps/chosen": -445.0198974609375, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -711.598388671875, "loss": 1.0544, "margin_dpo/margin_mean": 244.83599853515625, "margin_dpo/margin_std": 285.15032958984375, "step": 373 }, { "KL/chosen_KL_mean": -344.94671630859375, "KL/mean": -455.2513427734375, "KL/rejected_KL_mean": -565.5560302734375, "KL/std": 316.259765625, "epoch": 0.5491923641703378, "fcm_dpo/beta": 0.0018177898600697517, "fcm_dpo/delta": -0.0016709566116333008, "fcm_dpo/margin": 220.6092529296875, "fcm_dpo/q_t": 0.4120628535747528, "grad_norm": 27.090904235839844, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.5761805772781372, "logits/rejected": -0.5577331781387329, "logps/chosen": -415.596923828125, "logps/ref_chosen": -70.65018463134766, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -659.1961669921875, "loss": 1.1193, "margin_dpo/margin_mean": 220.60926818847656, "margin_dpo/margin_std": 354.3055419921875, "step": 374 }, { "KL/chosen_KL_mean": -349.4085693359375, "KL/mean": -457.72344970703125, "KL/rejected_KL_mean": -566.038330078125, "KL/std": 255.31455993652344, "epoch": 0.5506607929515418, "fcm_dpo/beta": 0.00181809242349118, "fcm_dpo/delta": 0.005695123225450516, "fcm_dpo/margin": 216.62973022460938, "fcm_dpo/q_t": 0.4092579185962677, "grad_norm": 36.25385665893555, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.5407253503799438, "logits/rejected": -0.5411194562911987, "logps/chosen": -409.48876953125, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -654.9766235351562, "loss": 1.1045, "margin_dpo/margin_mean": 216.62973022460938, "margin_dpo/margin_std": 301.60943603515625, "step": 375 }, { "KL/chosen_KL_mean": -342.84234619140625, "KL/mean": -473.917724609375, "KL/rejected_KL_mean": -604.9931030273438, "KL/std": 271.80767822265625, "epoch": 0.5521292217327459, "fcm_dpo/beta": 0.001806009327992797, "fcm_dpo/delta": -0.0772535428404808, "fcm_dpo/margin": 262.1507568359375, "fcm_dpo/q_t": 0.3911965489387512, "grad_norm": 23.30925750732422, "learning_rate": 2.5e-07, "logits/chosen": -0.5112703442573547, "logits/rejected": -0.5029022693634033, "logps/chosen": -405.50262451171875, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.52660369873047, "logps/rejected": -710.5196533203125, "loss": 1.0426, "margin_dpo/margin_mean": 262.1507568359375, "margin_dpo/margin_std": 314.1611328125, "step": 376 }, { "KL/chosen_KL_mean": -344.38580322265625, "KL/mean": -467.46221923828125, "KL/rejected_KL_mean": -590.5386352539062, "KL/std": 280.3377685546875, "epoch": 0.55359765051395, "fcm_dpo/beta": 0.0017937154043465853, "fcm_dpo/delta": -0.043446458876132965, "fcm_dpo/margin": 246.15281677246094, "fcm_dpo/q_t": 0.3983774781227112, "grad_norm": 29.571670532226562, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.4719467759132385, "logits/rejected": -0.47322678565979004, "logps/chosen": -398.8645324707031, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -689.241943359375, "loss": 1.0577, "margin_dpo/margin_mean": 246.15280151367188, "margin_dpo/margin_std": 293.776123046875, "step": 377 }, { "KL/chosen_KL_mean": -327.5260925292969, "KL/mean": -458.94586181640625, "KL/rejected_KL_mean": -590.3656005859375, "KL/std": 256.42523193359375, "epoch": 0.5550660792951542, "fcm_dpo/beta": 0.0017670897068455815, "fcm_dpo/delta": -0.0676431879401207, "fcm_dpo/margin": 262.839599609375, "fcm_dpo/q_t": 0.3910645842552185, "grad_norm": 30.750337600708008, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.48483866453170776, "logits/rejected": -0.505121648311615, "logps/chosen": -372.546630859375, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -678.41259765625, "loss": 1.0324, "margin_dpo/margin_mean": 262.8395690917969, "margin_dpo/margin_std": 280.932861328125, "step": 378 }, { "KL/chosen_KL_mean": -355.95745849609375, "KL/mean": -482.84857177734375, "KL/rejected_KL_mean": -609.73974609375, "KL/std": 260.3935546875, "epoch": 0.5565345080763583, "fcm_dpo/beta": 0.0017432118766009808, "fcm_dpo/delta": -0.04486480727791786, "fcm_dpo/margin": 253.78221130371094, "fcm_dpo/q_t": 0.39824381470680237, "grad_norm": 28.86778450012207, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.4996095299720764, "logits/rejected": -0.5224326848983765, "logps/chosen": -409.1395568847656, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.3001708984375, "logps/rejected": -724.0399169921875, "loss": 1.0587, "margin_dpo/margin_mean": 253.78219604492188, "margin_dpo/margin_std": 305.1265563964844, "step": 379 }, { "KL/chosen_KL_mean": -341.1017761230469, "KL/mean": -481.23809814453125, "KL/rejected_KL_mean": -621.3743896484375, "KL/std": 289.7486267089844, "epoch": 0.5580029368575624, "fcm_dpo/beta": 0.001726464950479567, "fcm_dpo/delta": -0.08805520087480545, "fcm_dpo/margin": 280.27264404296875, "fcm_dpo/q_t": 0.38729774951934814, "grad_norm": 27.250988006591797, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.47446852922439575, "logits/rejected": -0.5016424655914307, "logps/chosen": -392.4548034667969, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -725.5660400390625, "loss": 1.027, "margin_dpo/margin_mean": 280.27264404296875, "margin_dpo/margin_std": 311.29791259765625, "step": 380 }, { "KL/chosen_KL_mean": -358.0355529785156, "KL/mean": -458.411376953125, "KL/rejected_KL_mean": -558.7872314453125, "KL/std": 245.9998016357422, "epoch": 0.5594713656387665, "fcm_dpo/beta": 0.001726742135360837, "fcm_dpo/delta": 0.05530242994427681, "fcm_dpo/margin": 200.75161743164062, "fcm_dpo/q_t": 0.4205264449119568, "grad_norm": 25.71771812438965, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.45564746856689453, "logits/rejected": -0.43560299277305603, "logps/chosen": -415.838623046875, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -638.006591796875, "loss": 1.1465, "margin_dpo/margin_mean": 200.75160217285156, "margin_dpo/margin_std": 330.10223388671875, "step": 381 }, { "KL/chosen_KL_mean": -339.0993347167969, "KL/mean": -461.7208557128906, "KL/rejected_KL_mean": -584.34228515625, "KL/std": 235.19894409179688, "epoch": 0.5609397944199707, "fcm_dpo/beta": 0.0017278792802244425, "fcm_dpo/delta": -0.024810172617435455, "fcm_dpo/margin": 245.24298095703125, "fcm_dpo/q_t": 0.4007778763771057, "grad_norm": 29.356277465820312, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.4832066297531128, "logits/rejected": -0.4879419803619385, "logps/chosen": -405.11962890625, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71016693115234, "logps/rejected": -695.052490234375, "loss": 1.0551, "margin_dpo/margin_mean": 245.24298095703125, "margin_dpo/margin_std": 262.1793518066406, "step": 382 }, { "KL/chosen_KL_mean": -346.09075927734375, "KL/mean": -468.5663757324219, "KL/rejected_KL_mean": -591.0419921875, "KL/std": 261.1944274902344, "epoch": 0.5624082232011748, "fcm_dpo/beta": 0.0017200370784848928, "fcm_dpo/delta": -0.022270262241363525, "fcm_dpo/margin": 244.95123291015625, "fcm_dpo/q_t": 0.4029914140701294, "grad_norm": 25.48250961303711, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.46931326389312744, "logits/rejected": -0.47857019305229187, "logps/chosen": -396.48223876953125, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -684.7578735351562, "loss": 1.0874, "margin_dpo/margin_mean": 244.95123291015625, "margin_dpo/margin_std": 338.13385009765625, "step": 383 }, { "KL/chosen_KL_mean": -348.53369140625, "KL/mean": -444.30133056640625, "KL/rejected_KL_mean": -540.0689697265625, "KL/std": 233.04791259765625, "epoch": 0.5638766519823789, "fcm_dpo/beta": 0.001742619788274169, "fcm_dpo/delta": 0.06796430051326752, "fcm_dpo/margin": 191.53521728515625, "fcm_dpo/q_t": 0.4206535220146179, "grad_norm": 25.28335952758789, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.5027947425842285, "logits/rejected": -0.4957225024700165, "logps/chosen": -400.5798034667969, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -625.829833984375, "loss": 1.1164, "margin_dpo/margin_mean": 191.53521728515625, "margin_dpo/margin_std": 219.35858154296875, "step": 384 }, { "KL/chosen_KL_mean": -316.4886474609375, "KL/mean": -433.07574462890625, "KL/rejected_KL_mean": -549.6627807617188, "KL/std": 208.34341430664062, "epoch": 0.5653450807635829, "fcm_dpo/beta": 0.0017409389838576317, "fcm_dpo/delta": -0.006193262059241533, "fcm_dpo/margin": 233.17413330078125, "fcm_dpo/q_t": 0.4035521149635315, "grad_norm": 30.516326904296875, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.5098952054977417, "logits/rejected": -0.4792426824569702, "logps/chosen": -382.04083251953125, "logps/ref_chosen": -65.55215454101562, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -627.49072265625, "loss": 1.0601, "margin_dpo/margin_mean": 233.17413330078125, "margin_dpo/margin_std": 232.32888793945312, "step": 385 }, { "KL/chosen_KL_mean": -332.295654296875, "KL/mean": -461.0802307128906, "KL/rejected_KL_mean": -589.8648071289062, "KL/std": 266.24664306640625, "epoch": 0.566813509544787, "fcm_dpo/beta": 0.0017288768431171775, "fcm_dpo/delta": -0.047393690794706345, "fcm_dpo/margin": 257.5691833496094, "fcm_dpo/q_t": 0.3971378803253174, "grad_norm": 22.08116912841797, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.5011097192764282, "logits/rejected": -0.49594956636428833, "logps/chosen": -390.5174865722656, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -682.1922607421875, "loss": 1.0541, "margin_dpo/margin_mean": 257.5691833496094, "margin_dpo/margin_std": 305.404052734375, "step": 386 }, { "KL/chosen_KL_mean": -353.61395263671875, "KL/mean": -457.56298828125, "KL/rejected_KL_mean": -561.5120849609375, "KL/std": 239.767333984375, "epoch": 0.5682819383259912, "fcm_dpo/beta": 0.0017271433025598526, "fcm_dpo/delta": 0.042273543775081635, "fcm_dpo/margin": 207.89810180664062, "fcm_dpo/q_t": 0.41534334421157837, "grad_norm": 29.83814239501953, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.5749870538711548, "logits/rejected": -0.5577224493026733, "logps/chosen": -420.0334167480469, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -653.6812744140625, "loss": 1.1016, "margin_dpo/margin_mean": 207.89810180664062, "margin_dpo/margin_std": 238.62474060058594, "step": 387 }, { "KL/chosen_KL_mean": -342.80242919921875, "KL/mean": -484.3439636230469, "KL/rejected_KL_mean": -625.8855590820312, "KL/std": 283.77276611328125, "epoch": 0.5697503671071953, "fcm_dpo/beta": 0.0017118379473686218, "fcm_dpo/delta": -0.08915528655052185, "fcm_dpo/margin": 283.0830993652344, "fcm_dpo/q_t": 0.3895290791988373, "grad_norm": 24.234586715698242, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.4718579649925232, "logits/rejected": -0.48248615860939026, "logps/chosen": -392.931884765625, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -730.318603515625, "loss": 1.0286, "margin_dpo/margin_mean": 283.08306884765625, "margin_dpo/margin_std": 323.8463134765625, "step": 388 }, { "KL/chosen_KL_mean": -386.37750244140625, "KL/mean": -505.4602966308594, "KL/rejected_KL_mean": -624.5430908203125, "KL/std": 282.40252685546875, "epoch": 0.5712187958883994, "fcm_dpo/beta": 0.0017030881717801094, "fcm_dpo/delta": -0.005919036455452442, "fcm_dpo/margin": 238.1655731201172, "fcm_dpo/q_t": 0.40635746717453003, "grad_norm": 30.227901458740234, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.5446810722351074, "logits/rejected": -0.5356224775314331, "logps/chosen": -444.28411865234375, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -702.4576416015625, "loss": 1.0864, "margin_dpo/margin_mean": 238.1655731201172, "margin_dpo/margin_std": 309.348876953125, "step": 389 }, { "KL/chosen_KL_mean": -377.35198974609375, "KL/mean": -499.4270935058594, "KL/rejected_KL_mean": -621.502197265625, "KL/std": 282.41424560546875, "epoch": 0.5726872246696035, "fcm_dpo/beta": 0.0016950219869613647, "fcm_dpo/delta": -0.014803212136030197, "fcm_dpo/margin": 244.15020751953125, "fcm_dpo/q_t": 0.40823304653167725, "grad_norm": 25.220243453979492, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.4850386083126068, "logits/rejected": -0.4768223166465759, "logps/chosen": -426.5779113769531, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -707.0303955078125, "loss": 1.0979, "margin_dpo/margin_mean": 244.15020751953125, "margin_dpo/margin_std": 357.4918212890625, "step": 390 }, { "KL/chosen_KL_mean": -382.5862731933594, "KL/mean": -454.3216552734375, "KL/rejected_KL_mean": -526.0570068359375, "KL/std": 272.00946044921875, "epoch": 0.5741556534508077, "fcm_dpo/beta": 0.0017408509738743305, "fcm_dpo/delta": 0.15400069952011108, "fcm_dpo/margin": 143.47076416015625, "fcm_dpo/q_t": 0.4426559805870056, "grad_norm": 46.24735641479492, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.5240955352783203, "logits/rejected": -0.5136964321136475, "logps/chosen": -446.9159240722656, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -612.7952270507812, "loss": 1.2128, "margin_dpo/margin_mean": 143.4707794189453, "margin_dpo/margin_std": 285.8855285644531, "step": 391 }, { "KL/chosen_KL_mean": -327.81842041015625, "KL/mean": -468.28118896484375, "KL/rejected_KL_mean": -608.7439575195312, "KL/std": 280.24822998046875, "epoch": 0.5756240822320118, "fcm_dpo/beta": 0.0017360819038003683, "fcm_dpo/delta": -0.09214547276496887, "fcm_dpo/margin": 280.925537109375, "fcm_dpo/q_t": 0.3868769109249115, "grad_norm": 26.776010513305664, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.4836753010749817, "logits/rejected": -0.47832804918289185, "logps/chosen": -381.3224182128906, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34584045410156, "logps/rejected": -711.0897827148438, "loss": 1.0105, "margin_dpo/margin_mean": 280.9255065917969, "margin_dpo/margin_std": 274.7219543457031, "step": 392 }, { "KL/chosen_KL_mean": -320.5147399902344, "KL/mean": -430.2720031738281, "KL/rejected_KL_mean": -540.029296875, "KL/std": 273.57257080078125, "epoch": 0.5770925110132159, "fcm_dpo/beta": 0.001724720699712634, "fcm_dpo/delta": 0.022230474278330803, "fcm_dpo/margin": 219.5145263671875, "fcm_dpo/q_t": 0.4159342050552368, "grad_norm": 22.34610939025879, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.4929124414920807, "logits/rejected": -0.4912495017051697, "logps/chosen": -366.9886474609375, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -611.9981689453125, "loss": 1.1217, "margin_dpo/margin_mean": 219.51451110839844, "margin_dpo/margin_std": 345.4471740722656, "step": 393 }, { "KL/chosen_KL_mean": -359.45440673828125, "KL/mean": -476.61602783203125, "KL/rejected_KL_mean": -593.777587890625, "KL/std": 293.0263671875, "epoch": 0.57856093979442, "fcm_dpo/beta": 0.0017247963696718216, "fcm_dpo/delta": -0.004414796829223633, "fcm_dpo/margin": 234.32321166992188, "fcm_dpo/q_t": 0.40792059898376465, "grad_norm": 22.500810623168945, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.477075457572937, "logits/rejected": -0.47646427154541016, "logps/chosen": -412.365966796875, "logps/ref_chosen": -52.91154861450195, "logps/ref_rejected": -90.8226318359375, "logps/rejected": -684.6002197265625, "loss": 1.0862, "margin_dpo/margin_mean": 234.3231964111328, "margin_dpo/margin_std": 305.0433349609375, "step": 394 }, { "KL/chosen_KL_mean": -354.402587890625, "KL/mean": -479.6373596191406, "KL/rejected_KL_mean": -604.8721923828125, "KL/std": 283.220458984375, "epoch": 0.580029368575624, "fcm_dpo/beta": 0.0017139037372544408, "fcm_dpo/delta": -0.03093145042657852, "fcm_dpo/margin": 250.46954345703125, "fcm_dpo/q_t": 0.4019749164581299, "grad_norm": 24.562576293945312, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.545151948928833, "logits/rejected": -0.540196418762207, "logps/chosen": -416.94866943359375, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -688.65478515625, "loss": 1.0727, "margin_dpo/margin_mean": 250.46954345703125, "margin_dpo/margin_std": 320.5044250488281, "step": 395 }, { "KL/chosen_KL_mean": -372.64703369140625, "KL/mean": -483.7370910644531, "KL/rejected_KL_mean": -594.8271484375, "KL/std": 283.7913818359375, "epoch": 0.5814977973568282, "fcm_dpo/beta": 0.0017187923658639193, "fcm_dpo/delta": 0.01880822703242302, "fcm_dpo/margin": 222.18011474609375, "fcm_dpo/q_t": 0.4109712243080139, "grad_norm": 26.06972312927246, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.5321019887924194, "logits/rejected": -0.514782726764679, "logps/chosen": -441.6429443359375, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -683.4737548828125, "loss": 1.0874, "margin_dpo/margin_mean": 222.1800994873047, "margin_dpo/margin_std": 261.53839111328125, "step": 396 }, { "KL/chosen_KL_mean": -350.56103515625, "KL/mean": -496.0792236328125, "KL/rejected_KL_mean": -641.597412109375, "KL/std": 283.2353820800781, "epoch": 0.5829662261380323, "fcm_dpo/beta": 0.0016919523477554321, "fcm_dpo/delta": -0.09789647161960602, "fcm_dpo/margin": 291.03643798828125, "fcm_dpo/q_t": 0.3857801556587219, "grad_norm": 34.39192581176758, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.4891834259033203, "logits/rejected": -0.4951217472553253, "logps/chosen": -411.83819580078125, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -744.713623046875, "loss": 1.0122, "margin_dpo/margin_mean": 291.03643798828125, "margin_dpo/margin_std": 297.6669616699219, "step": 397 }, { "KL/chosen_KL_mean": -382.7860412597656, "KL/mean": -513.4688110351562, "KL/rejected_KL_mean": -644.151611328125, "KL/std": 299.1744384765625, "epoch": 0.5844346549192364, "fcm_dpo/beta": 0.001679509412497282, "fcm_dpo/delta": -0.04076296463608742, "fcm_dpo/margin": 261.36553955078125, "fcm_dpo/q_t": 0.4011620283126831, "grad_norm": 22.932445526123047, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.5526921153068542, "logits/rejected": -0.5512826442718506, "logps/chosen": -450.9375915527344, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -752.6752319335938, "loss": 1.0711, "margin_dpo/margin_mean": 261.36553955078125, "margin_dpo/margin_std": 347.5543518066406, "step": 398 }, { "KL/chosen_KL_mean": -337.339111328125, "KL/mean": -454.7564392089844, "KL/rejected_KL_mean": -572.1737060546875, "KL/std": 264.2313232421875, "epoch": 0.5859030837004405, "fcm_dpo/beta": 0.0016741682775318623, "fcm_dpo/delta": 0.0070329029113054276, "fcm_dpo/margin": 234.8346405029297, "fcm_dpo/q_t": 0.4100215435028076, "grad_norm": 32.84918975830078, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.5098556280136108, "logits/rejected": -0.47407281398773193, "logps/chosen": -398.22894287109375, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.965576171875, "logps/rejected": -650.1392822265625, "loss": 1.1039, "margin_dpo/margin_mean": 234.8346405029297, "margin_dpo/margin_std": 332.3253173828125, "step": 399 }, { "KL/chosen_KL_mean": -326.5251159667969, "KL/mean": -493.1486511230469, "KL/rejected_KL_mean": -659.772216796875, "KL/std": 275.30633544921875, "epoch": 0.5873715124816447, "fcm_dpo/beta": 0.001637051347643137, "fcm_dpo/delta": -0.15437571704387665, "fcm_dpo/margin": 333.2471008300781, "fcm_dpo/q_t": 0.3719968795776367, "grad_norm": 22.400123596191406, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.523221492767334, "logits/rejected": -0.5234454870223999, "logps/chosen": -390.168701171875, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -765.02490234375, "loss": 0.9719, "margin_dpo/margin_mean": 333.24713134765625, "margin_dpo/margin_std": 307.0823974609375, "step": 400 }, { "KL/chosen_KL_mean": -377.3856201171875, "KL/mean": -472.60052490234375, "KL/rejected_KL_mean": -567.8154296875, "KL/std": 292.94207763671875, "epoch": 0.5888399412628488, "fcm_dpo/beta": 0.001645284821279347, "fcm_dpo/delta": 0.0895879864692688, "fcm_dpo/margin": 190.42982482910156, "fcm_dpo/q_t": 0.4307780861854553, "grad_norm": 29.05838394165039, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.47050148248672485, "logits/rejected": -0.4605827033519745, "logps/chosen": -434.54864501953125, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -651.60791015625, "loss": 1.1966, "margin_dpo/margin_mean": 190.42984008789062, "margin_dpo/margin_std": 395.08624267578125, "step": 401 }, { "KL/chosen_KL_mean": -287.9420166015625, "KL/mean": -462.9802551269531, "KL/rejected_KL_mean": -638.0184936523438, "KL/std": 305.24725341796875, "epoch": 0.5903083700440529, "fcm_dpo/beta": 0.0016145255649462342, "fcm_dpo/delta": -0.17511004209518433, "fcm_dpo/margin": 350.07647705078125, "fcm_dpo/q_t": 0.3686904013156891, "grad_norm": 31.056983947753906, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.5516624450683594, "logits/rejected": -0.5455505847930908, "logps/chosen": -338.682373046875, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -719.0645751953125, "loss": 0.955, "margin_dpo/margin_mean": 350.07647705078125, "margin_dpo/margin_std": 310.8368225097656, "step": 402 }, { "KL/chosen_KL_mean": -327.7877197265625, "KL/mean": -463.3001403808594, "KL/rejected_KL_mean": -598.8125, "KL/std": 292.3427429199219, "epoch": 0.591776798825257, "fcm_dpo/beta": 0.0015922733582556248, "fcm_dpo/delta": -0.032966844737529755, "fcm_dpo/margin": 271.0248107910156, "fcm_dpo/q_t": 0.40002089738845825, "grad_norm": 26.145050048828125, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.5232188701629639, "logits/rejected": -0.525371789932251, "logps/chosen": -374.93505859375, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -676.0791015625, "loss": 1.0561, "margin_dpo/margin_mean": 271.0248107910156, "margin_dpo/margin_std": 303.0567932128906, "step": 403 }, { "KL/chosen_KL_mean": -357.9871520996094, "KL/mean": -491.9744873046875, "KL/rejected_KL_mean": -625.9617919921875, "KL/std": 283.893310546875, "epoch": 0.593245227606461, "fcm_dpo/beta": 0.0015823390567675233, "fcm_dpo/delta": -0.025122996419668198, "fcm_dpo/margin": 267.9746398925781, "fcm_dpo/q_t": 0.4035566449165344, "grad_norm": 32.78266906738281, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.5385224223136902, "logits/rejected": -0.533818244934082, "logps/chosen": -405.8624267578125, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -703.1168212890625, "loss": 1.0866, "margin_dpo/margin_mean": 267.97467041015625, "margin_dpo/margin_std": 370.1915283203125, "step": 404 }, { "KL/chosen_KL_mean": -402.0423278808594, "KL/mean": -512.295166015625, "KL/rejected_KL_mean": -622.5479125976562, "KL/std": 303.5537414550781, "epoch": 0.5947136563876652, "fcm_dpo/beta": 0.0015738653019070625, "fcm_dpo/delta": -0.04883524030447006, "fcm_dpo/margin": 220.505615234375, "fcm_dpo/q_t": 0.42195454239845276, "grad_norm": 32.378849029541016, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.5482072830200195, "logits/rejected": -0.5404790639877319, "logps/chosen": -467.20526123046875, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -709.7347412109375, "loss": 1.1557, "margin_dpo/margin_mean": 220.505615234375, "margin_dpo/margin_std": 375.1750183105469, "step": 405 }, { "KL/chosen_KL_mean": -362.15118408203125, "KL/mean": -505.815185546875, "KL/rejected_KL_mean": -649.479248046875, "KL/std": 306.6311950683594, "epoch": 0.5961820851688693, "fcm_dpo/beta": 0.001560859032906592, "fcm_dpo/delta": -0.050896137952804565, "fcm_dpo/margin": 287.3280334472656, "fcm_dpo/q_t": 0.3976570963859558, "grad_norm": 26.965667724609375, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.5607165098190308, "logits/rejected": -0.5739609599113464, "logps/chosen": -411.8919982910156, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -741.557861328125, "loss": 1.0558, "margin_dpo/margin_mean": 287.3280334472656, "margin_dpo/margin_std": 347.114990234375, "step": 406 }, { "KL/chosen_KL_mean": -390.5394287109375, "KL/mean": -479.0431823730469, "KL/rejected_KL_mean": -567.546875, "KL/std": 242.9478759765625, "epoch": 0.5976505139500734, "fcm_dpo/beta": 0.0015518320724368095, "fcm_dpo/delta": 0.009759590961039066, "fcm_dpo/margin": 177.0074462890625, "fcm_dpo/q_t": 0.43519794940948486, "grad_norm": 27.866214752197266, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.5981777310371399, "logits/rejected": -0.5707347393035889, "logps/chosen": -446.8701477050781, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.51209259033203, "logps/rejected": -645.0589599609375, "loss": 1.1933, "margin_dpo/margin_mean": 177.0074462890625, "margin_dpo/margin_std": 323.0165710449219, "step": 407 }, { "KL/chosen_KL_mean": -390.214599609375, "KL/mean": -492.9902038574219, "KL/rejected_KL_mean": -595.7658081054688, "KL/std": 239.10140991210938, "epoch": 0.5991189427312775, "fcm_dpo/beta": 0.0015722161624580622, "fcm_dpo/delta": 0.07937593758106232, "fcm_dpo/margin": 205.55116271972656, "fcm_dpo/q_t": 0.4239957928657532, "grad_norm": 26.215208053588867, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.5892931222915649, "logits/rejected": -0.5651265382766724, "logps/chosen": -460.0039367675781, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -685.8627319335938, "loss": 1.1321, "margin_dpo/margin_mean": 205.55116271972656, "margin_dpo/margin_std": 269.0951232910156, "step": 408 }, { "KL/chosen_KL_mean": -384.1105041503906, "KL/mean": -484.8536682128906, "KL/rejected_KL_mean": -585.5968017578125, "KL/std": 270.78424072265625, "epoch": 0.6005873715124816, "fcm_dpo/beta": 0.00160063779912889, "fcm_dpo/delta": 0.07980034500360489, "fcm_dpo/margin": 201.486328125, "fcm_dpo/q_t": 0.4251343607902527, "grad_norm": 37.040836334228516, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.5418749451637268, "logits/rejected": -0.5251990556716919, "logps/chosen": -451.42791748046875, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -670.5010986328125, "loss": 1.1421, "margin_dpo/margin_mean": 201.48635864257812, "margin_dpo/margin_std": 292.822998046875, "step": 409 }, { "KL/chosen_KL_mean": -369.51556396484375, "KL/mean": -493.2908630371094, "KL/rejected_KL_mean": -617.066162109375, "KL/std": 272.4659423828125, "epoch": 0.6020558002936858, "fcm_dpo/beta": 0.0016031713457778096, "fcm_dpo/delta": 0.003254479728639126, "fcm_dpo/margin": 247.55059814453125, "fcm_dpo/q_t": 0.4068432152271271, "grad_norm": 28.998903274536133, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.5895723104476929, "logits/rejected": -0.5948858857154846, "logps/chosen": -420.98089599609375, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -700.26513671875, "loss": 1.0854, "margin_dpo/margin_mean": 247.5506134033203, "margin_dpo/margin_std": 306.04541015625, "step": 410 }, { "KL/chosen_KL_mean": -385.24688720703125, "KL/mean": -502.3451843261719, "KL/rejected_KL_mean": -619.4434204101562, "KL/std": 290.84130859375, "epoch": 0.6035242290748899, "fcm_dpo/beta": 0.001580849289894104, "fcm_dpo/delta": -0.0746782049536705, "fcm_dpo/margin": 234.19656372070312, "fcm_dpo/q_t": 0.41524988412857056, "grad_norm": 33.94599533081055, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.5782661437988281, "logits/rejected": -0.589745819568634, "logps/chosen": -437.5541687011719, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -700.138427734375, "loss": 1.1204, "margin_dpo/margin_mean": 234.19654846191406, "margin_dpo/margin_std": 333.4945068359375, "step": 411 }, { "KL/chosen_KL_mean": -381.74725341796875, "KL/mean": -506.9585266113281, "KL/rejected_KL_mean": -632.1698608398438, "KL/std": 275.69854736328125, "epoch": 0.604992657856094, "fcm_dpo/beta": 0.001582764321938157, "fcm_dpo/delta": 0.0037793107330799103, "fcm_dpo/margin": 250.42257690429688, "fcm_dpo/q_t": 0.4081728160381317, "grad_norm": 30.582490921020508, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.6569205522537231, "logits/rejected": -0.6889303922653198, "logps/chosen": -434.891357421875, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.0608139038086, "logps/rejected": -732.2306518554688, "loss": 1.0898, "margin_dpo/margin_mean": 250.42259216308594, "margin_dpo/margin_std": 321.5178527832031, "step": 412 }, { "KL/chosen_KL_mean": -417.2314453125, "KL/mean": -536.9945068359375, "KL/rejected_KL_mean": -656.7576293945312, "KL/std": 290.6329345703125, "epoch": 0.6064610866372981, "fcm_dpo/beta": 0.001589751336723566, "fcm_dpo/delta": 0.01986226998269558, "fcm_dpo/margin": 239.52615356445312, "fcm_dpo/q_t": 0.410112202167511, "grad_norm": 35.33984375, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.6048665046691895, "logits/rejected": -0.6170832514762878, "logps/chosen": -478.81341552734375, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -756.2310180664062, "loss": 1.1012, "margin_dpo/margin_mean": 239.52615356445312, "margin_dpo/margin_std": 314.6912841796875, "step": 413 }, { "KL/chosen_KL_mean": -384.1409912109375, "KL/mean": -512.2708129882812, "KL/rejected_KL_mean": -640.400634765625, "KL/std": 261.5974426269531, "epoch": 0.6079295154185022, "fcm_dpo/beta": 0.0015965222846716642, "fcm_dpo/delta": -0.010348714888095856, "fcm_dpo/margin": 256.2596435546875, "fcm_dpo/q_t": 0.40425509214401245, "grad_norm": 36.57903289794922, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.5525113344192505, "logits/rejected": -0.5497645139694214, "logps/chosen": -430.7724609375, "logps/ref_chosen": -46.63148498535156, "logps/ref_rejected": -87.64653015136719, "logps/rejected": -728.047119140625, "loss": 1.0772, "margin_dpo/margin_mean": 256.2596435546875, "margin_dpo/margin_std": 301.134033203125, "step": 414 }, { "KL/chosen_KL_mean": -396.94146728515625, "KL/mean": -496.112548828125, "KL/rejected_KL_mean": -595.28369140625, "KL/std": 266.48736572265625, "epoch": 0.6093979441997063, "fcm_dpo/beta": 0.0015998441958799958, "fcm_dpo/delta": 0.08544344455003738, "fcm_dpo/margin": 198.34219360351562, "fcm_dpo/q_t": 0.4255606532096863, "grad_norm": 29.083877563476562, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.5818853974342346, "logits/rejected": -0.5800877809524536, "logps/chosen": -475.55975341796875, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -695.7611694335938, "loss": 1.1493, "margin_dpo/margin_mean": 198.34219360351562, "margin_dpo/margin_std": 297.89312744140625, "step": 415 }, { "KL/chosen_KL_mean": -360.2153015136719, "KL/mean": -515.66943359375, "KL/rejected_KL_mean": -671.12353515625, "KL/std": 292.68878173828125, "epoch": 0.6108663729809104, "fcm_dpo/beta": 0.0015928513603284955, "fcm_dpo/delta": -0.10007157921791077, "fcm_dpo/margin": 310.9083251953125, "fcm_dpo/q_t": 0.38390302658081055, "grad_norm": 30.425918579101562, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.5869680643081665, "logits/rejected": -0.5953909754753113, "logps/chosen": -418.49444580078125, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -761.6922607421875, "loss": 1.0009, "margin_dpo/margin_mean": 310.9082946777344, "margin_dpo/margin_std": 287.4730224609375, "step": 416 }, { "KL/chosen_KL_mean": -359.5291748046875, "KL/mean": -491.1758117675781, "KL/rejected_KL_mean": -622.8224487304688, "KL/std": 265.6627197265625, "epoch": 0.6123348017621145, "fcm_dpo/beta": 0.0015787691809237003, "fcm_dpo/delta": -0.016375936567783356, "fcm_dpo/margin": 263.29327392578125, "fcm_dpo/q_t": 0.4014412462711334, "grad_norm": 36.952362060546875, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.5713890790939331, "logits/rejected": -0.5573433637619019, "logps/chosen": -409.7278747558594, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -690.9742431640625, "loss": 1.0514, "margin_dpo/margin_mean": 263.2933044433594, "margin_dpo/margin_std": 253.69268798828125, "step": 417 }, { "KL/chosen_KL_mean": -388.34796142578125, "KL/mean": -523.8869018554688, "KL/rejected_KL_mean": -659.4258422851562, "KL/std": 309.9178466796875, "epoch": 0.6138032305433186, "fcm_dpo/beta": 0.0015799949178472161, "fcm_dpo/delta": -0.030489690601825714, "fcm_dpo/margin": 271.077880859375, "fcm_dpo/q_t": 0.40348464250564575, "grad_norm": 22.66263771057129, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6504318714141846, "logits/rejected": -0.634763240814209, "logps/chosen": -470.326416015625, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -764.1173095703125, "loss": 1.082, "margin_dpo/margin_mean": 271.077880859375, "margin_dpo/margin_std": 359.14801025390625, "step": 418 }, { "KL/chosen_KL_mean": -358.2254638671875, "KL/mean": -507.04156494140625, "KL/rejected_KL_mean": -655.857666015625, "KL/std": 277.0826110839844, "epoch": 0.6152716593245228, "fcm_dpo/beta": 0.001551083056256175, "fcm_dpo/delta": -0.06462173163890839, "fcm_dpo/margin": 297.6322021484375, "fcm_dpo/q_t": 0.3922021687030792, "grad_norm": 32.60568618774414, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.5689761638641357, "logits/rejected": -0.5716375708580017, "logps/chosen": -411.17413330078125, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -747.4407958984375, "loss": 1.0358, "margin_dpo/margin_mean": 297.6322021484375, "margin_dpo/margin_std": 322.8487854003906, "step": 419 }, { "KL/chosen_KL_mean": -455.716064453125, "KL/mean": -545.5599365234375, "KL/rejected_KL_mean": -635.4039306640625, "KL/std": 290.76470947265625, "epoch": 0.6167400881057269, "fcm_dpo/beta": 0.0015754573978483677, "fcm_dpo/delta": 0.1201419085264206, "fcm_dpo/margin": 179.68783569335938, "fcm_dpo/q_t": 0.43682652711868286, "grad_norm": 52.570884704589844, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.5963464975357056, "logits/rejected": -0.5628973245620728, "logps/chosen": -533.4859619140625, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -704.7237548828125, "loss": 1.2113, "margin_dpo/margin_mean": 179.68783569335938, "margin_dpo/margin_std": 389.99383544921875, "step": 420 }, { "KL/chosen_KL_mean": -387.1592102050781, "KL/mean": -489.2259216308594, "KL/rejected_KL_mean": -591.2926025390625, "KL/std": 287.3149719238281, "epoch": 0.618208516886931, "fcm_dpo/beta": 0.0016072317957878113, "fcm_dpo/delta": 0.07370726019144058, "fcm_dpo/margin": 204.13345336914062, "fcm_dpo/q_t": 0.42217308282852173, "grad_norm": 27.965801239013672, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.6343262195587158, "logits/rejected": -0.6225738525390625, "logps/chosen": -440.9250793457031, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -680.5740966796875, "loss": 1.1435, "margin_dpo/margin_mean": 204.13345336914062, "margin_dpo/margin_std": 298.565185546875, "step": 421 }, { "KL/chosen_KL_mean": -434.88446044921875, "KL/mean": -556.2061157226562, "KL/rejected_KL_mean": -677.5277099609375, "KL/std": 292.18963623046875, "epoch": 0.6196769456681351, "fcm_dpo/beta": 0.0016083747614175081, "fcm_dpo/delta": 0.010134613141417503, "fcm_dpo/margin": 242.6432647705078, "fcm_dpo/q_t": 0.40930503606796265, "grad_norm": 36.293704986572266, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.6203078031539917, "logits/rejected": -0.6245821714401245, "logps/chosen": -503.51824951171875, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -765.3912353515625, "loss": 1.1034, "margin_dpo/margin_mean": 242.6432647705078, "margin_dpo/margin_std": 339.7408142089844, "step": 422 }, { "KL/chosen_KL_mean": -425.70440673828125, "KL/mean": -560.471923828125, "KL/rejected_KL_mean": -695.239501953125, "KL/std": 282.6575012207031, "epoch": 0.6211453744493393, "fcm_dpo/beta": 0.0016007790109142661, "fcm_dpo/delta": -0.032924652099609375, "fcm_dpo/margin": 269.53515625, "fcm_dpo/q_t": 0.40004196763038635, "grad_norm": 34.40102005004883, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.613274097442627, "logits/rejected": -0.6200335025787354, "logps/chosen": -480.6947021484375, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -781.5460815429688, "loss": 1.0598, "margin_dpo/margin_mean": 269.53515625, "margin_dpo/margin_std": 315.0151672363281, "step": 423 }, { "KL/chosen_KL_mean": -387.79833984375, "KL/mean": -513.0828247070312, "KL/rejected_KL_mean": -638.3672485351562, "KL/std": 285.3108215332031, "epoch": 0.6226138032305433, "fcm_dpo/beta": 0.0015928398352116346, "fcm_dpo/delta": 0.0005655810236930847, "fcm_dpo/margin": 250.56887817382812, "fcm_dpo/q_t": 0.40977025032043457, "grad_norm": 35.320560455322266, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.6114071011543274, "logits/rejected": -0.5866736769676208, "logps/chosen": -443.8102722167969, "logps/ref_chosen": -56.01192092895508, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -704.84619140625, "loss": 1.0981, "margin_dpo/margin_mean": 250.56887817382812, "margin_dpo/margin_std": 348.552978515625, "step": 424 }, { "KL/chosen_KL_mean": -441.84747314453125, "KL/mean": -535.390380859375, "KL/rejected_KL_mean": -628.933349609375, "KL/std": 266.4034423828125, "epoch": 0.6240822320117474, "fcm_dpo/beta": 0.00162741937674582, "fcm_dpo/delta": 0.09813511371612549, "fcm_dpo/margin": 187.08584594726562, "fcm_dpo/q_t": 0.42789530754089355, "grad_norm": 35.080318450927734, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.6289581060409546, "logits/rejected": -0.630668044090271, "logps/chosen": -488.7164306640625, "logps/ref_chosen": -46.86899948120117, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -724.8587646484375, "loss": 1.1745, "margin_dpo/margin_mean": 187.0858612060547, "margin_dpo/margin_std": 329.364990234375, "step": 425 }, { "KL/chosen_KL_mean": -400.1252136230469, "KL/mean": -519.4071044921875, "KL/rejected_KL_mean": -638.68896484375, "KL/std": 272.60137939453125, "epoch": 0.6255506607929515, "fcm_dpo/beta": 0.0016408449737355113, "fcm_dpo/delta": 0.008432462811470032, "fcm_dpo/margin": 238.56373596191406, "fcm_dpo/q_t": 0.4089978337287903, "grad_norm": 30.283872604370117, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.6305758953094482, "logits/rejected": -0.5998473167419434, "logps/chosen": -476.708740234375, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -719.95556640625, "loss": 1.0926, "margin_dpo/margin_mean": 238.56373596191406, "margin_dpo/margin_std": 298.9935607910156, "step": 426 }, { "KL/chosen_KL_mean": -372.81451416015625, "KL/mean": -467.484130859375, "KL/rejected_KL_mean": -562.1536865234375, "KL/std": 240.36863708496094, "epoch": 0.6270190895741556, "fcm_dpo/beta": 0.0016517244512215257, "fcm_dpo/delta": 0.09017623960971832, "fcm_dpo/margin": 189.3392333984375, "fcm_dpo/q_t": 0.42767786979675293, "grad_norm": 27.282377243041992, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6336793899536133, "logits/rejected": -0.6121144890785217, "logps/chosen": -437.66839599609375, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.5660171508789, "logps/rejected": -640.7197265625, "loss": 1.1522, "margin_dpo/margin_mean": 189.33921813964844, "margin_dpo/margin_std": 286.495849609375, "step": 427 }, { "KL/chosen_KL_mean": -453.0958251953125, "KL/mean": -580.8818359375, "KL/rejected_KL_mean": -708.6678466796875, "KL/std": 316.7676696777344, "epoch": 0.6284875183553598, "fcm_dpo/beta": 0.0016518604243174195, "fcm_dpo/delta": -0.02350825071334839, "fcm_dpo/margin": 255.57199096679688, "fcm_dpo/q_t": 0.40379005670547485, "grad_norm": 31.117158889770508, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.6245919466018677, "logits/rejected": -0.6301891803741455, "logps/chosen": -515.7324829101562, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28181457519531, "logps/rejected": -811.9496459960938, "loss": 1.1038, "margin_dpo/margin_mean": 255.57199096679688, "margin_dpo/margin_std": 387.40362548828125, "step": 428 }, { "KL/chosen_KL_mean": -449.23046875, "KL/mean": -552.645263671875, "KL/rejected_KL_mean": -656.06005859375, "KL/std": 283.962158203125, "epoch": 0.6299559471365639, "fcm_dpo/beta": 0.001649289857596159, "fcm_dpo/delta": -0.040331680327653885, "fcm_dpo/margin": 206.8295440673828, "fcm_dpo/q_t": 0.42213714122772217, "grad_norm": 32.68413543701172, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.652100682258606, "logits/rejected": -0.6345040798187256, "logps/chosen": -530.4644775390625, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -747.85498046875, "loss": 1.1566, "margin_dpo/margin_mean": 206.82952880859375, "margin_dpo/margin_std": 348.011474609375, "step": 429 }, { "KL/chosen_KL_mean": -376.54345703125, "KL/mean": -479.85723876953125, "KL/rejected_KL_mean": -583.1709594726562, "KL/std": 259.899169921875, "epoch": 0.631424375917768, "fcm_dpo/beta": 0.0016597865615040064, "fcm_dpo/delta": 0.058922089636325836, "fcm_dpo/margin": 206.62753295898438, "fcm_dpo/q_t": 0.4206964075565338, "grad_norm": 29.323801040649414, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5997954607009888, "logits/rejected": -0.598332941532135, "logps/chosen": -437.4637451171875, "logps/ref_chosen": -60.920326232910156, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -687.59375, "loss": 1.1256, "margin_dpo/margin_mean": 206.62753295898438, "margin_dpo/margin_std": 286.5379638671875, "step": 430 }, { "KL/chosen_KL_mean": -349.5550537109375, "KL/mean": -498.06231689453125, "KL/rejected_KL_mean": -646.569580078125, "KL/std": 281.20587158203125, "epoch": 0.6328928046989721, "fcm_dpo/beta": 0.0016381596215069294, "fcm_dpo/delta": -0.09109188616275787, "fcm_dpo/margin": 297.0145263671875, "fcm_dpo/q_t": 0.38691771030426025, "grad_norm": 30.228837966918945, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.59189772605896, "logits/rejected": -0.5967794060707092, "logps/chosen": -406.90380859375, "logps/ref_chosen": -57.34874725341797, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -739.4097900390625, "loss": 1.0216, "margin_dpo/margin_mean": 297.0145568847656, "margin_dpo/margin_std": 313.81036376953125, "step": 431 }, { "KL/chosen_KL_mean": -339.9703674316406, "KL/mean": -485.2652893066406, "KL/rejected_KL_mean": -630.5601806640625, "KL/std": 279.104736328125, "epoch": 0.6343612334801763, "fcm_dpo/beta": 0.0016237597446888685, "fcm_dpo/delta": -0.07570492476224899, "fcm_dpo/margin": 290.58984375, "fcm_dpo/q_t": 0.39008086919784546, "grad_norm": 34.41071701049805, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.5879085063934326, "logits/rejected": -0.5795068740844727, "logps/chosen": -411.045166015625, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -729.1397705078125, "loss": 1.0208, "margin_dpo/margin_mean": 290.58984375, "margin_dpo/margin_std": 279.1397705078125, "step": 432 }, { "KL/chosen_KL_mean": -436.6034851074219, "KL/mean": -530.861083984375, "KL/rejected_KL_mean": -625.1187133789062, "KL/std": 284.5777587890625, "epoch": 0.6358296622613803, "fcm_dpo/beta": 0.0016333262901753187, "fcm_dpo/delta": 0.09479224681854248, "fcm_dpo/margin": 188.51522827148438, "fcm_dpo/q_t": 0.4265033006668091, "grad_norm": 33.45824432373047, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.6549836993217468, "logits/rejected": -0.6436095237731934, "logps/chosen": -494.8766784667969, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -721.069580078125, "loss": 1.1745, "margin_dpo/margin_mean": 188.51522827148438, "margin_dpo/margin_std": 336.2310485839844, "step": 433 }, { "KL/chosen_KL_mean": -377.69427490234375, "KL/mean": -490.43060302734375, "KL/rejected_KL_mean": -603.1669921875, "KL/std": 274.6502685546875, "epoch": 0.6372980910425844, "fcm_dpo/beta": 0.0016410250682383776, "fcm_dpo/delta": 0.031055327504873276, "fcm_dpo/margin": 225.47265625, "fcm_dpo/q_t": 0.41886186599731445, "grad_norm": 22.17997932434082, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.6539719104766846, "logits/rejected": -0.6557008624076843, "logps/chosen": -439.6679992675781, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -681.6655883789062, "loss": 1.1216, "margin_dpo/margin_mean": 225.47265625, "margin_dpo/margin_std": 347.4930725097656, "step": 434 }, { "KL/chosen_KL_mean": -346.69720458984375, "KL/mean": -469.296875, "KL/rejected_KL_mean": -591.896484375, "KL/std": 269.6709289550781, "epoch": 0.6387665198237885, "fcm_dpo/beta": 0.0016461058985441923, "fcm_dpo/delta": -0.003797288052737713, "fcm_dpo/margin": 245.1993408203125, "fcm_dpo/q_t": 0.4061928987503052, "grad_norm": 26.81846046447754, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.6933680772781372, "logits/rejected": -0.6857917308807373, "logps/chosen": -398.1992492675781, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -679.46337890625, "loss": 1.084, "margin_dpo/margin_mean": 245.1993408203125, "margin_dpo/margin_std": 302.0467529296875, "step": 435 }, { "KL/chosen_KL_mean": -362.07177734375, "KL/mean": -472.35260009765625, "KL/rejected_KL_mean": -582.6334838867188, "KL/std": 247.13198852539062, "epoch": 0.6402349486049926, "fcm_dpo/beta": 0.001651562051847577, "fcm_dpo/delta": 0.03701151907444, "fcm_dpo/margin": 220.5616912841797, "fcm_dpo/q_t": 0.41518351435661316, "grad_norm": 44.578636169433594, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.6292225122451782, "logits/rejected": -0.5981835126876831, "logps/chosen": -433.4754943847656, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -665.3612060546875, "loss": 1.1089, "margin_dpo/margin_mean": 220.5616912841797, "margin_dpo/margin_std": 287.855224609375, "step": 436 }, { "KL/chosen_KL_mean": -370.03875732421875, "KL/mean": -483.41693115234375, "KL/rejected_KL_mean": -596.7950439453125, "KL/std": 239.50619506835938, "epoch": 0.6417033773861968, "fcm_dpo/beta": 0.0016632757615298033, "fcm_dpo/delta": 0.02373369038105011, "fcm_dpo/margin": 226.75625610351562, "fcm_dpo/q_t": 0.41175174713134766, "grad_norm": 30.342931747436523, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.6698806881904602, "logits/rejected": -0.64947509765625, "logps/chosen": -434.78302001953125, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -678.838623046875, "loss": 1.098, "margin_dpo/margin_mean": 226.75625610351562, "margin_dpo/margin_std": 285.7284851074219, "step": 437 }, { "KL/chosen_KL_mean": -386.3381652832031, "KL/mean": -517.0180053710938, "KL/rejected_KL_mean": -647.6978759765625, "KL/std": 274.5076904296875, "epoch": 0.6431718061674009, "fcm_dpo/beta": 0.0016521508805453777, "fcm_dpo/delta": -0.033741071820259094, "fcm_dpo/margin": 261.35968017578125, "fcm_dpo/q_t": 0.39868584275245667, "grad_norm": 32.4385871887207, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.6905832290649414, "logits/rejected": -0.6742756366729736, "logps/chosen": -445.3568115234375, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682800292969, "logps/rejected": -730.774658203125, "loss": 1.0551, "margin_dpo/margin_mean": 261.35968017578125, "margin_dpo/margin_std": 287.7158508300781, "step": 438 }, { "KL/chosen_KL_mean": -398.215087890625, "KL/mean": -503.7099914550781, "KL/rejected_KL_mean": -609.204833984375, "KL/std": 279.349609375, "epoch": 0.644640234948605, "fcm_dpo/beta": 0.001632839790545404, "fcm_dpo/delta": -0.0710936188697815, "fcm_dpo/margin": 210.98980712890625, "fcm_dpo/q_t": 0.4208637773990631, "grad_norm": 35.499446868896484, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.6591403484344482, "logits/rejected": -0.662467360496521, "logps/chosen": -451.9991760253906, "logps/ref_chosen": -53.78407669067383, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -693.1903076171875, "loss": 1.1405, "margin_dpo/margin_mean": 210.98980712890625, "margin_dpo/margin_std": 313.9350891113281, "step": 439 }, { "KL/chosen_KL_mean": -421.02459716796875, "KL/mean": -543.773681640625, "KL/rejected_KL_mean": -666.5227661132812, "KL/std": 341.6332092285156, "epoch": 0.6461086637298091, "fcm_dpo/beta": 0.001635729568079114, "fcm_dpo/delta": -0.0017141718417406082, "fcm_dpo/margin": 245.49813842773438, "fcm_dpo/q_t": 0.410071462392807, "grad_norm": 42.761192321777344, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.705498218536377, "logits/rejected": -0.7023329138755798, "logps/chosen": -499.59130859375, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -763.0205078125, "loss": 1.0949, "margin_dpo/margin_mean": 245.49813842773438, "margin_dpo/margin_std": 340.76123046875, "step": 440 }, { "KL/chosen_KL_mean": -490.50604248046875, "KL/mean": -611.987548828125, "KL/rejected_KL_mean": -733.4691162109375, "KL/std": 356.1271667480469, "epoch": 0.6475770925110133, "fcm_dpo/beta": 0.00163905113004148, "fcm_dpo/delta": 0.001527242362499237, "fcm_dpo/margin": 242.963134765625, "fcm_dpo/q_t": 0.41395312547683716, "grad_norm": 46.89374923706055, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.7099902629852295, "logits/rejected": -0.6972057223320007, "logps/chosen": -551.3304443359375, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -829.93994140625, "loss": 1.1424, "margin_dpo/margin_mean": 242.963134765625, "margin_dpo/margin_std": 429.1325988769531, "step": 441 }, { "KL/chosen_KL_mean": -414.8262939453125, "KL/mean": -562.6405029296875, "KL/rejected_KL_mean": -710.4547119140625, "KL/std": 340.4912109375, "epoch": 0.6490455212922174, "fcm_dpo/beta": 0.0016239210963249207, "fcm_dpo/delta": -0.08454307168722153, "fcm_dpo/margin": 295.62847900390625, "fcm_dpo/q_t": 0.3937837481498718, "grad_norm": 32.3649787902832, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.6760904788970947, "logits/rejected": -0.6695461273193359, "logps/chosen": -461.8374938964844, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -786.9940185546875, "loss": 1.0551, "margin_dpo/margin_mean": 295.6284484863281, "margin_dpo/margin_std": 383.97320556640625, "step": 442 }, { "KL/chosen_KL_mean": -478.28509521484375, "KL/mean": -572.2210693359375, "KL/rejected_KL_mean": -666.1571044921875, "KL/std": 346.36370849609375, "epoch": 0.6505139500734214, "fcm_dpo/beta": 0.0016035648295655847, "fcm_dpo/delta": -0.005727704148739576, "fcm_dpo/margin": 187.87205505371094, "fcm_dpo/q_t": 0.43321898579597473, "grad_norm": 35.34192657470703, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6955731511116028, "logits/rejected": -0.6718583106994629, "logps/chosen": -549.55810546875, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -752.8370361328125, "loss": 1.2151, "margin_dpo/margin_mean": 187.87205505371094, "margin_dpo/margin_std": 419.6290283203125, "step": 443 }, { "KL/chosen_KL_mean": -469.6700439453125, "KL/mean": -620.1004638671875, "KL/rejected_KL_mean": -770.5308837890625, "KL/std": 358.370849609375, "epoch": 0.6519823788546255, "fcm_dpo/beta": 0.0015793245984241366, "fcm_dpo/delta": -0.07941662520170212, "fcm_dpo/margin": 300.8608093261719, "fcm_dpo/q_t": 0.3940558433532715, "grad_norm": 29.70403480529785, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.6584120988845825, "logits/rejected": -0.6683436632156372, "logps/chosen": -526.8837280273438, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489807128906, "logps/rejected": -867.7857666015625, "loss": 1.0553, "margin_dpo/margin_mean": 300.8608093261719, "margin_dpo/margin_std": 400.1059265136719, "step": 444 }, { "KL/chosen_KL_mean": -416.6427917480469, "KL/mean": -557.0631103515625, "KL/rejected_KL_mean": -697.4834594726562, "KL/std": 289.13763427734375, "epoch": 0.6534508076358296, "fcm_dpo/beta": 0.0015705095138400793, "fcm_dpo/delta": -0.04293816536664963, "fcm_dpo/margin": 280.84063720703125, "fcm_dpo/q_t": 0.3998865485191345, "grad_norm": 34.561370849609375, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.666840672492981, "logits/rejected": -0.6581023931503296, "logps/chosen": -483.9425964355469, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267059326172, "logps/rejected": -790.1661376953125, "loss": 1.067, "margin_dpo/margin_mean": 280.84063720703125, "margin_dpo/margin_std": 354.77947998046875, "step": 445 }, { "KL/chosen_KL_mean": -381.79669189453125, "KL/mean": -531.6203002929688, "KL/rejected_KL_mean": -681.4439086914062, "KL/std": 308.4264221191406, "epoch": 0.6549192364170338, "fcm_dpo/beta": 0.0015590311959385872, "fcm_dpo/delta": -0.07091644406318665, "fcm_dpo/margin": 299.64715576171875, "fcm_dpo/q_t": 0.39108988642692566, "grad_norm": 32.68497085571289, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.6791973114013672, "logits/rejected": -0.6917370557785034, "logps/chosen": -440.89520263671875, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -782.7080688476562, "loss": 1.0346, "margin_dpo/margin_mean": 299.64715576171875, "margin_dpo/margin_std": 317.11669921875, "step": 446 }, { "KL/chosen_KL_mean": -396.80615234375, "KL/mean": -554.9542236328125, "KL/rejected_KL_mean": -713.1023559570312, "KL/std": 362.07012939453125, "epoch": 0.6563876651982379, "fcm_dpo/beta": 0.0015180823393166065, "fcm_dpo/delta": -0.08475878089666367, "fcm_dpo/margin": 316.29620361328125, "fcm_dpo/q_t": 0.3928487300872803, "grad_norm": 32.39216613769531, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.7233532667160034, "logits/rejected": -0.7404334545135498, "logps/chosen": -452.8814697265625, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -811.797119140625, "loss": 1.0442, "margin_dpo/margin_mean": 316.2962341308594, "margin_dpo/margin_std": 406.5645751953125, "step": 447 }, { "KL/chosen_KL_mean": -435.0650939941406, "KL/mean": -581.727294921875, "KL/rejected_KL_mean": -728.3895263671875, "KL/std": 313.0533447265625, "epoch": 0.657856093979442, "fcm_dpo/beta": 0.001512328744865954, "fcm_dpo/delta": -0.04575078934431076, "fcm_dpo/margin": 293.324462890625, "fcm_dpo/q_t": 0.398230642080307, "grad_norm": 37.61946105957031, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.6439417600631714, "logits/rejected": -0.6513484716415405, "logps/chosen": -495.0689392089844, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -830.6541748046875, "loss": 1.058, "margin_dpo/margin_mean": 293.324462890625, "margin_dpo/margin_std": 352.77362060546875, "step": 448 }, { "KL/chosen_KL_mean": -438.35260009765625, "KL/mean": -588.6976318359375, "KL/rejected_KL_mean": -739.042724609375, "KL/std": 370.13311767578125, "epoch": 0.6593245227606461, "fcm_dpo/beta": 0.0014895712956786156, "fcm_dpo/delta": -0.05026708170771599, "fcm_dpo/margin": 300.6900939941406, "fcm_dpo/q_t": 0.4015531539916992, "grad_norm": 27.254419326782227, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.6604284048080444, "logits/rejected": -0.6543349623680115, "logps/chosen": -505.8792419433594, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -827.6396484375, "loss": 1.0842, "margin_dpo/margin_mean": 300.6900939941406, "margin_dpo/margin_std": 448.55859375, "step": 449 }, { "KL/chosen_KL_mean": -353.40374755859375, "KL/mean": -518.1417236328125, "KL/rejected_KL_mean": -682.8796997070312, "KL/std": 330.11187744140625, "epoch": 0.6607929515418502, "fcm_dpo/beta": 0.001471104216761887, "fcm_dpo/delta": -0.08891390264034271, "fcm_dpo/margin": 329.4759216308594, "fcm_dpo/q_t": 0.38630813360214233, "grad_norm": 44.962730407714844, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6974214315414429, "logits/rejected": -0.7095401883125305, "logps/chosen": -410.5118713378906, "logps/ref_chosen": -57.10811996459961, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -785.6346435546875, "loss": 1.0136, "margin_dpo/margin_mean": 329.4759521484375, "margin_dpo/margin_std": 328.4858703613281, "step": 450 }, { "KL/chosen_KL_mean": -480.493408203125, "KL/mean": -596.1444702148438, "KL/rejected_KL_mean": -711.7955322265625, "KL/std": 380.8516540527344, "epoch": 0.6622613803230544, "fcm_dpo/beta": 0.0014787260442972183, "fcm_dpo/delta": 0.05972132831811905, "fcm_dpo/margin": 231.30206298828125, "fcm_dpo/q_t": 0.41907864809036255, "grad_norm": 39.84464645385742, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.7402889728546143, "logits/rejected": -0.7134509086608887, "logps/chosen": -538.9622192382812, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -784.7249145507812, "loss": 1.1722, "margin_dpo/margin_mean": 231.3020782470703, "margin_dpo/margin_std": 440.70330810546875, "step": 451 }, { "KL/chosen_KL_mean": -340.0597229003906, "KL/mean": -483.1123046875, "KL/rejected_KL_mean": -626.1648559570312, "KL/std": 296.316162109375, "epoch": 0.6637298091042585, "fcm_dpo/beta": 0.0014726007357239723, "fcm_dpo/delta": -0.022314528003335, "fcm_dpo/margin": 286.1051330566406, "fcm_dpo/q_t": 0.400713711977005, "grad_norm": 21.744396209716797, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.6544848680496216, "logits/rejected": -0.6705133318901062, "logps/chosen": -390.4502868652344, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77142333984375, "logps/rejected": -723.936279296875, "loss": 1.0562, "margin_dpo/margin_mean": 286.10516357421875, "margin_dpo/margin_std": 303.95037841796875, "step": 452 }, { "KL/chosen_KL_mean": -392.181396484375, "KL/mean": -527.95263671875, "KL/rejected_KL_mean": -663.723876953125, "KL/std": 302.70947265625, "epoch": 0.6651982378854625, "fcm_dpo/beta": 0.0014694023411720991, "fcm_dpo/delta": 0.0009205006062984467, "fcm_dpo/margin": 271.54254150390625, "fcm_dpo/q_t": 0.40955957770347595, "grad_norm": 30.068889617919922, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.7128562331199646, "logits/rejected": -0.6981433629989624, "logps/chosen": -449.896240234375, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -745.9313354492188, "loss": 1.0919, "margin_dpo/margin_mean": 271.54254150390625, "margin_dpo/margin_std": 364.5362548828125, "step": 453 }, { "KL/chosen_KL_mean": -482.7806701660156, "KL/mean": -633.9759521484375, "KL/rejected_KL_mean": -785.1713256835938, "KL/std": 346.2369384765625, "epoch": 0.6666666666666666, "fcm_dpo/beta": 0.0014605964533984661, "fcm_dpo/delta": -0.043758489191532135, "fcm_dpo/margin": 302.390625, "fcm_dpo/q_t": 0.39894014596939087, "grad_norm": 31.32180404663086, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.7373279333114624, "logits/rejected": -0.725917637348175, "logps/chosen": -543.726318359375, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.95079040527344, "logps/rejected": -870.1220703125, "loss": 1.067, "margin_dpo/margin_mean": 302.390625, "margin_dpo/margin_std": 393.43988037109375, "step": 454 }, { "KL/chosen_KL_mean": -409.9599609375, "KL/mean": -590.746826171875, "KL/rejected_KL_mean": -771.53369140625, "KL/std": 357.16259765625, "epoch": 0.6681350954478708, "fcm_dpo/beta": 0.0014415888581424952, "fcm_dpo/delta": -0.12789547443389893, "fcm_dpo/margin": 361.5738220214844, "fcm_dpo/q_t": 0.3820483982563019, "grad_norm": 35.17242431640625, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6761902570724487, "logits/rejected": -0.6945501565933228, "logps/chosen": -454.8466491699219, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -886.835205078125, "loss": 1.014, "margin_dpo/margin_mean": 361.5738220214844, "margin_dpo/margin_std": 407.70379638671875, "step": 455 }, { "KL/chosen_KL_mean": -441.02471923828125, "KL/mean": -611.8526000976562, "KL/rejected_KL_mean": -782.6804809570312, "KL/std": 368.62628173828125, "epoch": 0.6696035242290749, "fcm_dpo/beta": 0.0014074372593313456, "fcm_dpo/delta": -0.08486473560333252, "fcm_dpo/margin": 341.6557312011719, "fcm_dpo/q_t": 0.38988521695137024, "grad_norm": 24.956411361694336, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.7183812260627747, "logits/rejected": -0.7420483827590942, "logps/chosen": -498.0615234375, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21784210205078, "logps/rejected": -887.8983154296875, "loss": 1.0206, "margin_dpo/margin_mean": 341.65576171875, "margin_dpo/margin_std": 363.93701171875, "step": 456 }, { "KL/chosen_KL_mean": -427.92364501953125, "KL/mean": -595.2412719726562, "KL/rejected_KL_mean": -762.558837890625, "KL/std": 349.4745178222656, "epoch": 0.671071953010279, "fcm_dpo/beta": 0.0013823909685015678, "fcm_dpo/delta": -0.06590519100427628, "fcm_dpo/margin": 334.6352233886719, "fcm_dpo/q_t": 0.39179015159606934, "grad_norm": 29.466060638427734, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.6991932392120361, "logits/rejected": -0.7005044221878052, "logps/chosen": -482.16619873046875, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -847.66845703125, "loss": 1.0307, "margin_dpo/margin_mean": 334.6352233886719, "margin_dpo/margin_std": 347.06805419921875, "step": 457 }, { "KL/chosen_KL_mean": -452.7229309082031, "KL/mean": -606.5921020507812, "KL/rejected_KL_mean": -760.4613037109375, "KL/std": 326.22540283203125, "epoch": 0.6725403817914831, "fcm_dpo/beta": 0.001369113102555275, "fcm_dpo/delta": -0.022949304431676865, "fcm_dpo/margin": 307.73828125, "fcm_dpo/q_t": 0.4038897454738617, "grad_norm": 30.682571411132812, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.751007616519928, "logits/rejected": -0.7468206286430359, "logps/chosen": -508.1318359375, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -858.14453125, "loss": 1.0701, "margin_dpo/margin_mean": 307.7383117675781, "margin_dpo/margin_std": 368.6239929199219, "step": 458 }, { "KL/chosen_KL_mean": -485.3470458984375, "KL/mean": -654.956298828125, "KL/rejected_KL_mean": -824.5655517578125, "KL/std": 385.02587890625, "epoch": 0.6740088105726872, "fcm_dpo/beta": 0.001356898806989193, "fcm_dpo/delta": -0.0636778175830841, "fcm_dpo/margin": 339.2184753417969, "fcm_dpo/q_t": 0.3953377604484558, "grad_norm": 32.14663314819336, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.7115896940231323, "logits/rejected": -0.7303779125213623, "logps/chosen": -531.904541015625, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -910.7340698242188, "loss": 1.0571, "margin_dpo/margin_mean": 339.218505859375, "margin_dpo/margin_std": 428.857177734375, "step": 459 }, { "KL/chosen_KL_mean": -530.1577758789062, "KL/mean": -716.4615478515625, "KL/rejected_KL_mean": -902.7651977539062, "KL/std": 376.7371826171875, "epoch": 0.6754772393538914, "fcm_dpo/beta": 0.001337511232122779, "fcm_dpo/delta": -0.10346446931362152, "fcm_dpo/margin": 372.607421875, "fcm_dpo/q_t": 0.3863397240638733, "grad_norm": 35.66582107543945, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.7767213582992554, "logits/rejected": -0.8095457553863525, "logps/chosen": -581.792724609375, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -1006.884521484375, "loss": 1.0181, "margin_dpo/margin_mean": 372.607421875, "margin_dpo/margin_std": 416.0313415527344, "step": 460 }, { "KL/chosen_KL_mean": -561.940673828125, "KL/mean": -709.7664794921875, "KL/rejected_KL_mean": -857.5922241210938, "KL/std": 384.3741149902344, "epoch": 0.6769456681350955, "fcm_dpo/beta": 0.001335039036348462, "fcm_dpo/delta": 0.00526130385696888, "fcm_dpo/margin": 295.65155029296875, "fcm_dpo/q_t": 0.4116092324256897, "grad_norm": 29.899106979370117, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.7882189750671387, "logits/rejected": -0.8021144270896912, "logps/chosen": -617.1226196289062, "logps/ref_chosen": -55.18195724487305, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -944.069091796875, "loss": 1.1054, "margin_dpo/margin_mean": 295.65155029296875, "margin_dpo/margin_std": 430.65924072265625, "step": 461 }, { "KL/chosen_KL_mean": -578.3634033203125, "KL/mean": -700.3892822265625, "KL/rejected_KL_mean": -822.4152221679688, "KL/std": 376.43658447265625, "epoch": 0.6784140969162996, "fcm_dpo/beta": 0.0013445301447063684, "fcm_dpo/delta": 0.07435386627912521, "fcm_dpo/margin": 244.05184936523438, "fcm_dpo/q_t": 0.42686232924461365, "grad_norm": 39.60346984863281, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.8968935012817383, "logits/rejected": -0.8908392190933228, "logps/chosen": -648.2914428710938, "logps/ref_chosen": -69.92803192138672, "logps/ref_rejected": -78.84111022949219, "logps/rejected": -901.25634765625, "loss": 1.1632, "margin_dpo/margin_mean": 244.05184936523438, "margin_dpo/margin_std": 432.3650817871094, "step": 462 }, { "KL/chosen_KL_mean": -589.15576171875, "KL/mean": -753.0881958007812, "KL/rejected_KL_mean": -917.0206298828125, "KL/std": 411.322021484375, "epoch": 0.6798825256975036, "fcm_dpo/beta": 0.0013473678845912218, "fcm_dpo/delta": -0.04382871836423874, "fcm_dpo/margin": 327.86480712890625, "fcm_dpo/q_t": 0.401495099067688, "grad_norm": 44.926334381103516, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.8459576964378357, "logits/rejected": -0.8583585023880005, "logps/chosen": -644.43017578125, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -1006.0455932617188, "loss": 1.0814, "margin_dpo/margin_mean": 327.86480712890625, "margin_dpo/margin_std": 460.8346862792969, "step": 463 }, { "KL/chosen_KL_mean": -577.5390625, "KL/mean": -824.5910034179688, "KL/rejected_KL_mean": -1071.6429443359375, "KL/std": 482.9642333984375, "epoch": 0.6813509544787077, "fcm_dpo/beta": 0.0012808447936549783, "fcm_dpo/delta": -0.25145474076271057, "fcm_dpo/margin": 494.1038818359375, "fcm_dpo/q_t": 0.35675048828125, "grad_norm": 50.258697509765625, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.8322213888168335, "logits/rejected": -0.8996630311012268, "logps/chosen": -628.4513549804688, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -1174.13232421875, "loss": 0.9347, "margin_dpo/margin_mean": 494.1038818359375, "margin_dpo/margin_std": 484.71392822265625, "step": 464 }, { "KL/chosen_KL_mean": -601.5679321289062, "KL/mean": -797.656982421875, "KL/rejected_KL_mean": -993.74609375, "KL/std": 499.0876159667969, "epoch": 0.6828193832599119, "fcm_dpo/beta": 0.0012595669832080603, "fcm_dpo/delta": -0.09877628087997437, "fcm_dpo/margin": 392.17816162109375, "fcm_dpo/q_t": 0.3856554627418518, "grad_norm": 39.39644241333008, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.8465057015419006, "logits/rejected": -0.8691214323043823, "logps/chosen": -661.684814453125, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -1107.692138671875, "loss": 1.0558, "margin_dpo/margin_mean": 392.17816162109375, "margin_dpo/margin_std": 528.9464111328125, "step": 465 }, { "KL/chosen_KL_mean": -660.0999755859375, "KL/mean": -833.5242919921875, "KL/rejected_KL_mean": -1006.94873046875, "KL/std": 460.6139221191406, "epoch": 0.684287812041116, "fcm_dpo/beta": 0.0012364451540634036, "fcm_dpo/delta": -0.03083794191479683, "fcm_dpo/margin": 346.8487854003906, "fcm_dpo/q_t": 0.4021310806274414, "grad_norm": 38.076751708984375, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.8880220651626587, "logits/rejected": -0.8946011662483215, "logps/chosen": -713.0208740234375, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -1097.26416015625, "loss": 1.0962, "margin_dpo/margin_mean": 346.84881591796875, "margin_dpo/margin_std": 507.0184326171875, "step": 466 }, { "KL/chosen_KL_mean": -823.9940185546875, "KL/mean": -1010.909912109375, "KL/rejected_KL_mean": -1197.8258056640625, "KL/std": 614.4384155273438, "epoch": 0.6857562408223201, "fcm_dpo/beta": 0.0012187270913273096, "fcm_dpo/delta": -0.060437288135290146, "fcm_dpo/margin": 373.8318786621094, "fcm_dpo/q_t": 0.40229350328445435, "grad_norm": 62.107765197753906, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.9601616859436035, "logits/rejected": -0.9515029788017273, "logps/chosen": -902.7098388671875, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -1300.68603515625, "loss": 1.1519, "margin_dpo/margin_mean": 373.8318786621094, "margin_dpo/margin_std": 707.5423583984375, "step": 467 }, { "KL/chosen_KL_mean": -662.00439453125, "KL/mean": -897.078857421875, "KL/rejected_KL_mean": -1132.1533203125, "KL/std": 548.8661499023438, "epoch": 0.6872246696035242, "fcm_dpo/beta": 0.001194630516692996, "fcm_dpo/delta": -0.17134541273117065, "fcm_dpo/margin": 470.14892578125, "fcm_dpo/q_t": 0.3789057433605194, "grad_norm": 45.27859878540039, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.9165079593658447, "logits/rejected": -0.9324535131454468, "logps/chosen": -731.9397583007812, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02880859375, "logps/rejected": -1233.18212890625, "loss": 1.0136, "margin_dpo/margin_mean": 470.14892578125, "margin_dpo/margin_std": 618.6029663085938, "step": 468 }, { "KL/chosen_KL_mean": -682.61962890625, "KL/mean": -917.685546875, "KL/rejected_KL_mean": -1152.75146484375, "KL/std": 480.97088623046875, "epoch": 0.6886930983847284, "fcm_dpo/beta": 0.0011510958429425955, "fcm_dpo/delta": -0.150983989238739, "fcm_dpo/margin": 470.13177490234375, "fcm_dpo/q_t": 0.37772035598754883, "grad_norm": 44.61856460571289, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.9298604726791382, "logits/rejected": -0.9497323036193848, "logps/chosen": -750.744384765625, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -1257.537841796875, "loss": 1.0078, "margin_dpo/margin_mean": 470.13177490234375, "margin_dpo/margin_std": 545.7459716796875, "step": 469 }, { "KL/chosen_KL_mean": -585.1773071289062, "KL/mean": -789.4063720703125, "KL/rejected_KL_mean": -993.6353759765625, "KL/std": 501.39984130859375, "epoch": 0.6901615271659325, "fcm_dpo/beta": 0.0011422440875321627, "fcm_dpo/delta": -0.06988134980201721, "fcm_dpo/margin": 408.4580993652344, "fcm_dpo/q_t": 0.39377570152282715, "grad_norm": 27.795562744140625, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.9022126197814941, "logits/rejected": -0.9164737462997437, "logps/chosen": -628.96923828125, "logps/ref_chosen": -43.791927337646484, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -1076.3382568359375, "loss": 1.0646, "margin_dpo/margin_mean": 408.45806884765625, "margin_dpo/margin_std": 547.7627563476562, "step": 470 }, { "KL/chosen_KL_mean": -710.951416015625, "KL/mean": -879.2623291015625, "KL/rejected_KL_mean": -1047.5733642578125, "KL/std": 499.705810546875, "epoch": 0.6916299559471366, "fcm_dpo/beta": 0.0011305524967610836, "fcm_dpo/delta": 0.019698694348335266, "fcm_dpo/margin": 336.62200927734375, "fcm_dpo/q_t": 0.41612815856933594, "grad_norm": 53.5421142578125, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.9983842968940735, "logits/rejected": -1.0077528953552246, "logps/chosen": -774.2908935546875, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -1131.183837890625, "loss": 1.1312, "margin_dpo/margin_mean": 336.62200927734375, "margin_dpo/margin_std": 550.3225708007812, "step": 471 }, { "KL/chosen_KL_mean": -691.177001953125, "KL/mean": -886.953857421875, "KL/rejected_KL_mean": -1082.730712890625, "KL/std": 583.6131591796875, "epoch": 0.6930983847283406, "fcm_dpo/beta": 0.0011264560744166374, "fcm_dpo/delta": -0.04338788241147995, "fcm_dpo/margin": 391.5537109375, "fcm_dpo/q_t": 0.40537628531455994, "grad_norm": 36.24667739868164, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.9884932041168213, "logits/rejected": -0.9719465970993042, "logps/chosen": -774.8431396484375, "logps/ref_chosen": -83.66610717773438, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -1199.93994140625, "loss": 1.1093, "margin_dpo/margin_mean": 391.5537109375, "margin_dpo/margin_std": 631.884033203125, "step": 472 }, { "KL/chosen_KL_mean": -832.517578125, "KL/mean": -925.3512573242188, "KL/rejected_KL_mean": -1018.184814453125, "KL/std": 606.7120361328125, "epoch": 0.6945668135095447, "fcm_dpo/beta": 0.0011280329199507833, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 185.667236328125, "fcm_dpo/q_t": 0.4554038643836975, "grad_norm": 122.36241912841797, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -0.988477349281311, "logits/rejected": -0.9607559442520142, "logps/chosen": -896.0145874023438, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -1099.3314208984375, "loss": 1.3563, "margin_dpo/margin_mean": 185.667236328125, "margin_dpo/margin_std": 783.8460693359375, "step": 473 }, { "KL/chosen_KL_mean": -654.71240234375, "KL/mean": -823.9849853515625, "KL/rejected_KL_mean": -993.2574462890625, "KL/std": 526.5281982421875, "epoch": 0.6960352422907489, "fcm_dpo/beta": 0.0011190182995051146, "fcm_dpo/delta": -0.08055973798036575, "fcm_dpo/margin": 338.5450134277344, "fcm_dpo/q_t": 0.4132459759712219, "grad_norm": 37.246849060058594, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.92474365234375, "logits/rejected": -0.9312121868133545, "logps/chosen": -707.3243408203125, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -1083.337890625, "loss": 1.1495, "margin_dpo/margin_mean": 338.5450439453125, "margin_dpo/margin_std": 580.7293701171875, "step": 474 }, { "KL/chosen_KL_mean": -465.27777099609375, "KL/mean": -679.0506591796875, "KL/rejected_KL_mean": -892.8236083984375, "KL/std": 421.060546875, "epoch": 0.697503671071953, "fcm_dpo/beta": 0.0010959157953038812, "fcm_dpo/delta": -0.0721563771367073, "fcm_dpo/margin": 427.5458679199219, "fcm_dpo/q_t": 0.39104607701301575, "grad_norm": 51.85737991333008, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.9467837810516357, "logits/rejected": -0.977871835231781, "logps/chosen": -507.7729797363281, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06294250488281, "logps/rejected": -982.8865966796875, "loss": 1.0281, "margin_dpo/margin_mean": 427.5458984375, "margin_dpo/margin_std": 449.01556396484375, "step": 475 }, { "KL/chosen_KL_mean": -574.3722534179688, "KL/mean": -739.1430053710938, "KL/rejected_KL_mean": -903.913818359375, "KL/std": 451.5216064453125, "epoch": 0.6989720998531571, "fcm_dpo/beta": 0.0010992654133588076, "fcm_dpo/delta": 0.03918338194489479, "fcm_dpo/margin": 329.5415344238281, "fcm_dpo/q_t": 0.41674578189849854, "grad_norm": 79.2264175415039, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.9211336970329285, "logits/rejected": -0.9354342222213745, "logps/chosen": -617.3216552734375, "logps/ref_chosen": -42.94938278198242, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -977.6240234375, "loss": 1.1116, "margin_dpo/margin_mean": 329.54150390625, "margin_dpo/margin_std": 444.04339599609375, "step": 476 }, { "KL/chosen_KL_mean": -601.9998779296875, "KL/mean": -778.6286010742188, "KL/rejected_KL_mean": -955.2572021484375, "KL/std": 492.07171630859375, "epoch": 0.7004405286343612, "fcm_dpo/beta": 0.0011070938780903816, "fcm_dpo/delta": 0.009110800921916962, "fcm_dpo/margin": 353.25738525390625, "fcm_dpo/q_t": 0.4111800789833069, "grad_norm": 36.04801940917969, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.9658732414245605, "logits/rejected": -0.9317635297775269, "logps/chosen": -672.7725219726562, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -1031.3946533203125, "loss": 1.1286, "margin_dpo/margin_mean": 353.2573547363281, "margin_dpo/margin_std": 582.523193359375, "step": 477 }, { "KL/chosen_KL_mean": -485.2200927734375, "KL/mean": -681.3143920898438, "KL/rejected_KL_mean": -877.40869140625, "KL/std": 427.36639404296875, "epoch": 0.7019089574155654, "fcm_dpo/beta": 0.0011018933728337288, "fcm_dpo/delta": -0.0336417555809021, "fcm_dpo/margin": 392.1885986328125, "fcm_dpo/q_t": 0.40104353427886963, "grad_norm": 55.68071365356445, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.8429279327392578, "logits/rejected": -0.8667222857475281, "logps/chosen": -526.66064453125, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -962.7706298828125, "loss": 1.0694, "margin_dpo/margin_mean": 392.1885986328125, "margin_dpo/margin_std": 496.35369873046875, "step": 478 }, { "KL/chosen_KL_mean": -646.2515869140625, "KL/mean": -834.347900390625, "KL/rejected_KL_mean": -1022.4442138671875, "KL/std": 526.5473022460938, "epoch": 0.7033773861967695, "fcm_dpo/beta": 0.0011019103694707155, "fcm_dpo/delta": -0.015925616025924683, "fcm_dpo/margin": 376.192626953125, "fcm_dpo/q_t": 0.4086768627166748, "grad_norm": 31.531246185302734, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.8888028860092163, "logits/rejected": -0.9207658767700195, "logps/chosen": -700.1594848632812, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -1117.560546875, "loss": 1.1182, "margin_dpo/margin_mean": 376.192626953125, "margin_dpo/margin_std": 613.9232177734375, "step": 479 }, { "KL/chosen_KL_mean": -766.6184692382812, "KL/mean": -914.756591796875, "KL/rejected_KL_mean": -1062.894775390625, "KL/std": 496.72235107421875, "epoch": 0.7048458149779736, "fcm_dpo/beta": 0.0010890522971749306, "fcm_dpo/delta": -0.04379244148731232, "fcm_dpo/margin": 296.2763366699219, "fcm_dpo/q_t": 0.42761245369911194, "grad_norm": 53.24702453613281, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.9728115797042847, "logits/rejected": -0.9693245887756348, "logps/chosen": -825.3011474609375, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -1145.827392578125, "loss": 1.1792, "margin_dpo/margin_mean": 296.27630615234375, "margin_dpo/margin_std": 538.973876953125, "step": 480 }, { "KL/chosen_KL_mean": -635.2621459960938, "KL/mean": -866.452880859375, "KL/rejected_KL_mean": -1097.6435546875, "KL/std": 512.14208984375, "epoch": 0.7063142437591777, "fcm_dpo/beta": 0.0010695490054786205, "fcm_dpo/delta": -0.09937489032745361, "fcm_dpo/margin": 462.38153076171875, "fcm_dpo/q_t": 0.38825610280036926, "grad_norm": 51.05171203613281, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.9634227752685547, "logits/rejected": -0.9947628974914551, "logps/chosen": -690.2264404296875, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -1190.0640869140625, "loss": 1.0295, "margin_dpo/margin_mean": 462.38153076171875, "margin_dpo/margin_std": 550.9169311523438, "step": 481 }, { "KL/chosen_KL_mean": -712.7791137695312, "KL/mean": -819.776123046875, "KL/rejected_KL_mean": -926.7730102539062, "KL/std": 536.703857421875, "epoch": 0.7077826725403817, "fcm_dpo/beta": 0.001071346690878272, "fcm_dpo/delta": 0.07841724902391434, "fcm_dpo/margin": 213.99391174316406, "fcm_dpo/q_t": 0.4453909397125244, "grad_norm": 65.0921859741211, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.8759874105453491, "logits/rejected": -0.8658995032310486, "logps/chosen": -780.3325805664062, "logps/ref_chosen": -67.553466796875, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -1014.362548828125, "loss": 1.2719, "margin_dpo/margin_mean": 213.993896484375, "margin_dpo/margin_std": 620.146484375, "step": 482 }, { "KL/chosen_KL_mean": -646.241455078125, "KL/mean": -885.39501953125, "KL/rejected_KL_mean": -1124.548583984375, "KL/std": 534.4652099609375, "epoch": 0.7092511013215859, "fcm_dpo/beta": 0.0010598013177514076, "fcm_dpo/delta": -0.11290125548839569, "fcm_dpo/margin": 478.30712890625, "fcm_dpo/q_t": 0.38709717988967896, "grad_norm": 32.96552276611328, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.9770244359970093, "logits/rejected": -0.9915624856948853, "logps/chosen": -709.5712890625, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -1220.3355712890625, "loss": 1.0269, "margin_dpo/margin_mean": 478.30712890625, "margin_dpo/margin_std": 586.1543579101562, "step": 483 }, { "KL/chosen_KL_mean": -553.4284057617188, "KL/mean": -776.1009521484375, "KL/rejected_KL_mean": -998.7734375, "KL/std": 526.0048828125, "epoch": 0.71071953010279, "fcm_dpo/beta": 0.001048530451953411, "fcm_dpo/delta": -0.07025562971830368, "fcm_dpo/margin": 445.3449401855469, "fcm_dpo/q_t": 0.39412179589271545, "grad_norm": 53.46791458129883, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.9660162329673767, "logits/rejected": -0.993166983127594, "logps/chosen": -612.5665283203125, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -1083.144775390625, "loss": 1.0446, "margin_dpo/margin_mean": 445.3449401855469, "margin_dpo/margin_std": 531.8111572265625, "step": 484 }, { "KL/chosen_KL_mean": -588.0751342773438, "KL/mean": -801.0594482421875, "KL/rejected_KL_mean": -1014.043701171875, "KL/std": 530.8162841796875, "epoch": 0.7121879588839941, "fcm_dpo/beta": 0.001037056790664792, "fcm_dpo/delta": -0.043720267713069916, "fcm_dpo/margin": 425.9684753417969, "fcm_dpo/q_t": 0.40099409222602844, "grad_norm": 34.87718200683594, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.8909621238708496, "logits/rejected": -0.9123867750167847, "logps/chosen": -646.9246826171875, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408233642578, "logps/rejected": -1117.40771484375, "loss": 1.0789, "margin_dpo/margin_mean": 425.968505859375, "margin_dpo/margin_std": 593.173583984375, "step": 485 }, { "KL/chosen_KL_mean": -691.0508422851562, "KL/mean": -929.311279296875, "KL/rejected_KL_mean": -1167.5716552734375, "KL/std": 593.0984497070312, "epoch": 0.7136563876651982, "fcm_dpo/beta": 0.0010174668859690428, "fcm_dpo/delta": -0.08924780786037445, "fcm_dpo/margin": 476.5207824707031, "fcm_dpo/q_t": 0.39317959547042847, "grad_norm": 46.1490364074707, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.9853001832962036, "logits/rejected": -0.999464750289917, "logps/chosen": -746.310546875, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -1259.7109375, "loss": 1.0787, "margin_dpo/margin_mean": 476.5207824707031, "margin_dpo/margin_std": 722.4910278320312, "step": 486 }, { "KL/chosen_KL_mean": -745.9581298828125, "KL/mean": -923.1339111328125, "KL/rejected_KL_mean": -1100.3096923828125, "KL/std": 566.0206298828125, "epoch": 0.7151248164464024, "fcm_dpo/beta": 0.0010237495880573988, "fcm_dpo/delta": 0.03809621185064316, "fcm_dpo/margin": 354.3516845703125, "fcm_dpo/q_t": 0.4159180819988251, "grad_norm": 39.361175537109375, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -1.0012881755828857, "logits/rejected": -1.0079126358032227, "logps/chosen": -799.0213623046875, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.41883087158203, "logps/rejected": -1192.728515625, "loss": 1.136, "margin_dpo/margin_mean": 354.3516845703125, "margin_dpo/margin_std": 571.5254516601562, "step": 487 }, { "KL/chosen_KL_mean": -629.125732421875, "KL/mean": -814.207275390625, "KL/rejected_KL_mean": -999.2886962890625, "KL/std": 508.64453125, "epoch": 0.7165932452276065, "fcm_dpo/beta": 0.0010283133015036583, "fcm_dpo/delta": 0.019743794575333595, "fcm_dpo/margin": 370.1629943847656, "fcm_dpo/q_t": 0.41300255060195923, "grad_norm": 33.7827033996582, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.9368076324462891, "logits/rejected": -0.9440141916275024, "logps/chosen": -681.3538818359375, "logps/ref_chosen": -52.22815704345703, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -1083.2952880859375, "loss": 1.1032, "margin_dpo/margin_mean": 370.16302490234375, "margin_dpo/margin_std": 498.775390625, "step": 488 }, { "KL/chosen_KL_mean": -575.0646362304688, "KL/mean": -758.0647583007812, "KL/rejected_KL_mean": -941.0648803710938, "KL/std": 495.6026611328125, "epoch": 0.7180616740088106, "fcm_dpo/beta": 0.0010333817917853594, "fcm_dpo/delta": 0.022133469581604004, "fcm_dpo/margin": 366.0002136230469, "fcm_dpo/q_t": 0.4154972434043884, "grad_norm": 27.7478084564209, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.9634197354316711, "logits/rejected": -0.9643290042877197, "logps/chosen": -631.0543212890625, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39812469482422, "logps/rejected": -1020.4630126953125, "loss": 1.1184, "margin_dpo/margin_mean": 366.00018310546875, "margin_dpo/margin_std": 550.1982421875, "step": 489 }, { "KL/chosen_KL_mean": -674.772216796875, "KL/mean": -867.3712158203125, "KL/rejected_KL_mean": -1059.97021484375, "KL/std": 602.034912109375, "epoch": 0.7195301027900147, "fcm_dpo/beta": 0.001032583648338914, "fcm_dpo/delta": 0.00223751924932003, "fcm_dpo/margin": 385.1979064941406, "fcm_dpo/q_t": 0.41282522678375244, "grad_norm": 40.58791732788086, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.9857407808303833, "logits/rejected": -1.026228427886963, "logps/chosen": -727.138671875, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.4090576171875, "logps/rejected": -1170.379150390625, "loss": 1.1483, "margin_dpo/margin_mean": 385.19793701171875, "margin_dpo/margin_std": 706.33056640625, "step": 490 }, { "KL/chosen_KL_mean": -634.9647216796875, "KL/mean": -764.6456298828125, "KL/rejected_KL_mean": -894.3265380859375, "KL/std": 501.4613952636719, "epoch": 0.7209985315712188, "fcm_dpo/beta": 0.0010518557392060757, "fcm_dpo/delta": 0.13058799505233765, "fcm_dpo/margin": 259.36181640625, "fcm_dpo/q_t": 0.4379756450653076, "grad_norm": 33.82534408569336, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.9741504192352295, "logits/rejected": -0.9680135250091553, "logps/chosen": -695.0809936523438, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -967.599365234375, "loss": 1.1963, "margin_dpo/margin_mean": 259.3618469238281, "margin_dpo/margin_std": 497.3216552734375, "step": 491 }, { "KL/chosen_KL_mean": -663.180419921875, "KL/mean": -787.96923828125, "KL/rejected_KL_mean": -912.758056640625, "KL/std": 512.7420043945312, "epoch": 0.7224669603524229, "fcm_dpo/beta": 0.0010776289273053408, "fcm_dpo/delta": 0.13482382893562317, "fcm_dpo/margin": 249.57763671875, "fcm_dpo/q_t": 0.44000089168548584, "grad_norm": 44.77201843261719, "learning_rate": 1.097764975115576e-07, "logits/chosen": -1.0639835596084595, "logits/rejected": -1.0435137748718262, "logps/chosen": -717.174560546875, "logps/ref_chosen": -53.994178771972656, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -985.4176635742188, "loss": 1.23, "margin_dpo/margin_mean": 249.57763671875, "margin_dpo/margin_std": 586.6539306640625, "step": 492 }, { "KL/chosen_KL_mean": -686.341552734375, "KL/mean": -834.3265380859375, "KL/rejected_KL_mean": -982.3115234375, "KL/std": 570.59423828125, "epoch": 0.723935389133627, "fcm_dpo/beta": 0.0010814403649419546, "fcm_dpo/delta": -0.023665668442845345, "fcm_dpo/margin": 295.9699401855469, "fcm_dpo/q_t": 0.4249815344810486, "grad_norm": 34.14286804199219, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -1.0923945903778076, "logits/rejected": -1.0642774105072021, "logps/chosen": -761.8388061523438, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -1069.634521484375, "loss": 1.1701, "margin_dpo/margin_mean": 295.9699401855469, "margin_dpo/margin_std": 527.4650268554688, "step": 493 }, { "KL/chosen_KL_mean": -532.9840087890625, "KL/mean": -762.801513671875, "KL/rejected_KL_mean": -992.619140625, "KL/std": 505.04736328125, "epoch": 0.7254038179148311, "fcm_dpo/beta": 0.0010658178944140673, "fcm_dpo/delta": -0.09456932544708252, "fcm_dpo/margin": 459.6351318359375, "fcm_dpo/q_t": 0.3873167037963867, "grad_norm": 52.453407287597656, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.9276965856552124, "logits/rejected": -0.9675771594047546, "logps/chosen": -574.34326171875, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -1078.71044921875, "loss": 1.022, "margin_dpo/margin_mean": 459.6351318359375, "margin_dpo/margin_std": 506.0677490234375, "step": 494 }, { "KL/chosen_KL_mean": -601.6910400390625, "KL/mean": -798.136962890625, "KL/rejected_KL_mean": -994.582763671875, "KL/std": 516.9666137695312, "epoch": 0.7268722466960352, "fcm_dpo/beta": 0.0010593379847705364, "fcm_dpo/delta": -0.016908157616853714, "fcm_dpo/margin": 392.8917236328125, "fcm_dpo/q_t": 0.40819650888442993, "grad_norm": 33.18449020385742, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -1.020897388458252, "logits/rejected": -1.0267536640167236, "logps/chosen": -665.22607421875, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -1086.0072021484375, "loss": 1.1072, "margin_dpo/margin_mean": 392.8917236328125, "margin_dpo/margin_std": 618.4354248046875, "step": 495 }, { "KL/chosen_KL_mean": -744.3991088867188, "KL/mean": -843.0897216796875, "KL/rejected_KL_mean": -941.7803344726562, "KL/std": 415.9389343261719, "epoch": 0.7283406754772394, "fcm_dpo/beta": 0.0010575959458947182, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 197.38121032714844, "fcm_dpo/q_t": 0.4504912197589874, "grad_norm": 70.50399017333984, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.9933898448944092, "logits/rejected": -0.957461953163147, "logps/chosen": -816.9910888671875, "logps/ref_chosen": -72.5919189453125, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -1026.109619140625, "loss": 1.2514, "margin_dpo/margin_mean": 197.3812255859375, "margin_dpo/margin_std": 476.3318176269531, "step": 496 }, { "KL/chosen_KL_mean": -704.3419189453125, "KL/mean": -808.200439453125, "KL/rejected_KL_mean": -912.058837890625, "KL/std": 518.0106201171875, "epoch": 0.7298091042584435, "fcm_dpo/beta": 0.0010635224170982838, "fcm_dpo/delta": 0.02794058434665203, "fcm_dpo/margin": 207.71685791015625, "fcm_dpo/q_t": 0.44891273975372314, "grad_norm": 35.43773651123047, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.9681912660598755, "logits/rejected": -0.9516055583953857, "logps/chosen": -762.9359130859375, "logps/ref_chosen": -58.59397506713867, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -988.34716796875, "loss": 1.2436, "margin_dpo/margin_mean": 207.7168731689453, "margin_dpo/margin_std": 479.673095703125, "step": 497 }, { "KL/chosen_KL_mean": -667.029541015625, "KL/mean": -843.3291015625, "KL/rejected_KL_mean": -1019.628662109375, "KL/std": 551.8720703125, "epoch": 0.7312775330396476, "fcm_dpo/beta": 0.0010668218601495028, "fcm_dpo/delta": 0.024778790771961212, "fcm_dpo/margin": 352.5992126464844, "fcm_dpo/q_t": 0.4150667190551758, "grad_norm": 37.042057037353516, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.9754823446273804, "logits/rejected": -0.9640058279037476, "logps/chosen": -738.2352294921875, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -1103.586669921875, "loss": 1.145, "margin_dpo/margin_mean": 352.5992126464844, "margin_dpo/margin_std": 618.8363647460938, "step": 498 }, { "KL/chosen_KL_mean": -573.7318115234375, "KL/mean": -789.2236938476562, "KL/rejected_KL_mean": -1004.7156982421875, "KL/std": 525.0882568359375, "epoch": 0.7327459618208517, "fcm_dpo/beta": 0.0010582150425761938, "fcm_dpo/delta": -0.058853406459093094, "fcm_dpo/margin": 430.98388671875, "fcm_dpo/q_t": 0.39665037393569946, "grad_norm": 41.18540573120117, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -1.0105154514312744, "logits/rejected": -1.0502002239227295, "logps/chosen": -624.9869384765625, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -1105.79443359375, "loss": 1.0651, "margin_dpo/margin_mean": 430.98388671875, "margin_dpo/margin_std": 575.690673828125, "step": 499 }, { "KL/chosen_KL_mean": -679.6956787109375, "KL/mean": -855.8604125976562, "KL/rejected_KL_mean": -1032.025146484375, "KL/std": 456.21795654296875, "epoch": 0.7342143906020558, "fcm_dpo/beta": 0.001059262314811349, "fcm_dpo/delta": 0.02782963030040264, "fcm_dpo/margin": 352.3294677734375, "fcm_dpo/q_t": 0.4146909713745117, "grad_norm": 31.199586868286133, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.979456901550293, "logits/rejected": -1.0102999210357666, "logps/chosen": -736.72314453125, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -1125.9593505859375, "loss": 1.1312, "margin_dpo/margin_mean": 352.3294677734375, "margin_dpo/margin_std": 558.6328125, "step": 500 }, { "KL/chosen_KL_mean": -593.8455810546875, "KL/mean": -783.6693115234375, "KL/rejected_KL_mean": -973.4930419921875, "KL/std": 487.61322021484375, "epoch": 0.73568281938326, "fcm_dpo/beta": 0.001063595525920391, "fcm_dpo/delta": -0.004017947241663933, "fcm_dpo/margin": 379.6474304199219, "fcm_dpo/q_t": 0.4093387722969055, "grad_norm": 32.033573150634766, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.9334768056869507, "logits/rejected": -0.9198960065841675, "logps/chosen": -648.205078125, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670013427734, "logps/rejected": -1053.649658203125, "loss": 1.1161, "margin_dpo/margin_mean": 379.64739990234375, "margin_dpo/margin_std": 597.58349609375, "step": 501 }, { "KL/chosen_KL_mean": -542.5103759765625, "KL/mean": -729.060791015625, "KL/rejected_KL_mean": -915.6112060546875, "KL/std": 439.54412841796875, "epoch": 0.737151248164464, "fcm_dpo/beta": 0.0010629099560901523, "fcm_dpo/delta": 0.003547299187630415, "fcm_dpo/margin": 373.10076904296875, "fcm_dpo/q_t": 0.4092276096343994, "grad_norm": 33.64237594604492, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.8644669055938721, "logits/rejected": -0.8459637761116028, "logps/chosen": -610.1109008789062, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -998.5599365234375, "loss": 1.0855, "margin_dpo/margin_mean": 373.1007995605469, "margin_dpo/margin_std": 468.3107604980469, "step": 502 }, { "KL/chosen_KL_mean": -537.25927734375, "KL/mean": -710.22998046875, "KL/rejected_KL_mean": -883.2006225585938, "KL/std": 428.82354736328125, "epoch": 0.7386196769456681, "fcm_dpo/beta": 0.0010641318513080478, "fcm_dpo/delta": 0.03300508111715317, "fcm_dpo/margin": 345.9414367675781, "fcm_dpo/q_t": 0.41630834341049194, "grad_norm": 27.046812057495117, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.8558259010314941, "logits/rejected": -0.8414930105209351, "logps/chosen": -592.337646484375, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -965.7060546875, "loss": 1.1087, "margin_dpo/margin_mean": 345.94140625, "margin_dpo/margin_std": 465.1625671386719, "step": 503 }, { "KL/chosen_KL_mean": -591.099853515625, "KL/mean": -802.704345703125, "KL/rejected_KL_mean": -1014.3087768554688, "KL/std": 516.8352661132812, "epoch": 0.7400881057268722, "fcm_dpo/beta": 0.0010626555886119604, "fcm_dpo/delta": -0.052036985754966736, "fcm_dpo/margin": 423.208984375, "fcm_dpo/q_t": 0.3984360992908478, "grad_norm": 29.352018356323242, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.917930006980896, "logits/rejected": -0.9410198926925659, "logps/chosen": -651.0655517578125, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76212310791016, "logps/rejected": -1118.0709228515625, "loss": 1.0722, "margin_dpo/margin_mean": 423.208984375, "margin_dpo/margin_std": 561.8984375, "step": 504 }, { "KL/chosen_KL_mean": -648.5, "KL/mean": -761.6331176757812, "KL/rejected_KL_mean": -874.7662353515625, "KL/std": 479.6357421875, "epoch": 0.7415565345080763, "fcm_dpo/beta": 0.001084325835108757, "fcm_dpo/delta": 0.1583971083164215, "fcm_dpo/margin": 226.2662811279297, "fcm_dpo/q_t": 0.4433596134185791, "grad_norm": 47.79065704345703, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.9595932960510254, "logits/rejected": -0.9328126907348633, "logps/chosen": -724.6549072265625, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -963.3516235351562, "loss": 1.2142, "margin_dpo/margin_mean": 226.26626586914062, "margin_dpo/margin_std": 447.0828857421875, "step": 505 }, { "KL/chosen_KL_mean": -521.2240600585938, "KL/mean": -705.6286010742188, "KL/rejected_KL_mean": -890.033203125, "KL/std": 477.63641357421875, "epoch": 0.7430249632892805, "fcm_dpo/beta": 0.0010876674205064774, "fcm_dpo/delta": -0.0015001185238361359, "fcm_dpo/margin": 368.80908203125, "fcm_dpo/q_t": 0.40826907753944397, "grad_norm": 31.51370620727539, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.9328019618988037, "logits/rejected": -0.9463713765144348, "logps/chosen": -570.1845703125, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -968.4482421875, "loss": 1.0999, "margin_dpo/margin_mean": 368.8091125488281, "margin_dpo/margin_std": 518.3751220703125, "step": 506 }, { "KL/chosen_KL_mean": -618.9168701171875, "KL/mean": -781.8189697265625, "KL/rejected_KL_mean": -944.7210693359375, "KL/std": 559.7078857421875, "epoch": 0.7444933920704846, "fcm_dpo/beta": 0.0010977927595376968, "fcm_dpo/delta": 0.043928615748882294, "fcm_dpo/margin": 325.8041076660156, "fcm_dpo/q_t": 0.4235384464263916, "grad_norm": 29.80331039428711, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.9519898891448975, "logits/rejected": -0.956289529800415, "logps/chosen": -685.7183837890625, "logps/ref_chosen": -66.80149841308594, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -1040.093994140625, "loss": 1.1629, "margin_dpo/margin_mean": 325.80413818359375, "margin_dpo/margin_std": 623.13720703125, "step": 507 }, { "KL/chosen_KL_mean": -653.6170654296875, "KL/mean": -794.548583984375, "KL/rejected_KL_mean": -935.4800415039062, "KL/std": 542.185546875, "epoch": 0.7459618208516887, "fcm_dpo/beta": 0.0011185563635081053, "fcm_dpo/delta": 0.0871460884809494, "fcm_dpo/margin": 281.86297607421875, "fcm_dpo/q_t": 0.43127191066741943, "grad_norm": 37.2701416015625, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.9200087189674377, "logits/rejected": -0.9178076386451721, "logps/chosen": -724.9205322265625, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -1031.107666015625, "loss": 1.2098, "margin_dpo/margin_mean": 281.86297607421875, "margin_dpo/margin_std": 628.1875610351562, "step": 508 }, { "KL/chosen_KL_mean": -504.98626708984375, "KL/mean": -655.8489990234375, "KL/rejected_KL_mean": -806.7117919921875, "KL/std": 394.4847412109375, "epoch": 0.7474302496328928, "fcm_dpo/beta": 0.001136223552748561, "fcm_dpo/delta": 0.058409832417964935, "fcm_dpo/margin": 301.7255554199219, "fcm_dpo/q_t": 0.4222422242164612, "grad_norm": 36.30656433105469, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.8557006120681763, "logits/rejected": -0.8502145409584045, "logps/chosen": -568.8052368164062, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -889.96826171875, "loss": 1.1457, "margin_dpo/margin_mean": 301.72552490234375, "margin_dpo/margin_std": 484.4477844238281, "step": 509 }, { "KL/chosen_KL_mean": -597.9219970703125, "KL/mean": -796.3427734375, "KL/rejected_KL_mean": -994.7635498046875, "KL/std": 451.759521484375, "epoch": 0.748898678414097, "fcm_dpo/beta": 0.0011275724973529577, "fcm_dpo/delta": -0.04966657981276512, "fcm_dpo/margin": 396.84149169921875, "fcm_dpo/q_t": 0.3957008123397827, "grad_norm": 40.821044921875, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.8606332540512085, "logits/rejected": -0.8811938166618347, "logps/chosen": -649.8004760742188, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -1097.5286865234375, "loss": 1.0478, "margin_dpo/margin_mean": 396.84149169921875, "margin_dpo/margin_std": 446.3687438964844, "step": 510 }, { "KL/chosen_KL_mean": -562.03466796875, "KL/mean": -717.6114501953125, "KL/rejected_KL_mean": -873.188232421875, "KL/std": 484.3371887207031, "epoch": 0.750367107195301, "fcm_dpo/beta": 0.001131793251261115, "fcm_dpo/delta": 0.04954507201910019, "fcm_dpo/margin": 311.153564453125, "fcm_dpo/q_t": 0.4202183187007904, "grad_norm": 44.68586730957031, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.9428844451904297, "logits/rejected": -0.9506068825721741, "logps/chosen": -622.2727661132812, "logps/ref_chosen": -60.23811721801758, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -966.0450439453125, "loss": 1.165, "margin_dpo/margin_mean": 311.153564453125, "margin_dpo/margin_std": 587.4927978515625, "step": 511 }, { "KL/chosen_KL_mean": -448.8963623046875, "KL/mean": -666.0167846679688, "KL/rejected_KL_mean": -883.13720703125, "KL/std": 444.51080322265625, "epoch": 0.7518355359765051, "fcm_dpo/beta": 0.0011163100134581327, "fcm_dpo/delta": -0.08939085900783539, "fcm_dpo/margin": 434.24078369140625, "fcm_dpo/q_t": 0.38778460025787354, "grad_norm": 60.86901092529297, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.8777337074279785, "logits/rejected": -0.8991394639015198, "logps/chosen": -503.8018798828125, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -965.0130615234375, "loss": 1.0178, "margin_dpo/margin_mean": 434.24078369140625, "margin_dpo/margin_std": 454.16876220703125, "step": 512 }, { "KL/chosen_KL_mean": -567.0150146484375, "KL/mean": -720.166015625, "KL/rejected_KL_mean": -873.3170166015625, "KL/std": 418.872802734375, "epoch": 0.7533039647577092, "fcm_dpo/beta": 0.00110536755528301, "fcm_dpo/delta": -0.03816516324877739, "fcm_dpo/margin": 306.30194091796875, "fcm_dpo/q_t": 0.422201544046402, "grad_norm": 50.96452331542969, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.9502312541007996, "logits/rejected": -0.9348673820495605, "logps/chosen": -631.9314575195312, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -949.3794555664062, "loss": 1.1469, "margin_dpo/margin_mean": 306.30194091796875, "margin_dpo/margin_std": 481.57958984375, "step": 513 }, { "KL/chosen_KL_mean": -592.06298828125, "KL/mean": -742.8245239257812, "KL/rejected_KL_mean": -893.5859985351562, "KL/std": 463.70751953125, "epoch": 0.7547723935389133, "fcm_dpo/beta": 0.0011181586887687445, "fcm_dpo/delta": 0.06485524028539658, "fcm_dpo/margin": 301.52301025390625, "fcm_dpo/q_t": 0.4251614212989807, "grad_norm": 38.09934616088867, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.8954925537109375, "logits/rejected": -0.8867564797401428, "logps/chosen": -666.2926025390625, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -972.5315551757812, "loss": 1.1551, "margin_dpo/margin_mean": 301.52301025390625, "margin_dpo/margin_std": 522.4912109375, "step": 514 }, { "KL/chosen_KL_mean": -481.15533447265625, "KL/mean": -597.14990234375, "KL/rejected_KL_mean": -713.14453125, "KL/std": 382.8921813964844, "epoch": 0.7562408223201175, "fcm_dpo/beta": 0.0011437044013291597, "fcm_dpo/delta": 0.13827666640281677, "fcm_dpo/margin": 231.9891357421875, "fcm_dpo/q_t": 0.437407523393631, "grad_norm": 50.51891326904297, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.8350532054901123, "logits/rejected": -0.846880316734314, "logps/chosen": -531.556884765625, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -800.2422485351562, "loss": 1.1981, "margin_dpo/margin_mean": 231.9891357421875, "margin_dpo/margin_std": 439.46319580078125, "step": 515 }, { "KL/chosen_KL_mean": -508.79656982421875, "KL/mean": -676.51318359375, "KL/rejected_KL_mean": -844.229736328125, "KL/std": 420.5047607421875, "epoch": 0.7577092511013216, "fcm_dpo/beta": 0.0011569425696507096, "fcm_dpo/delta": 0.012162066996097565, "fcm_dpo/margin": 335.4332275390625, "fcm_dpo/q_t": 0.4088793396949768, "grad_norm": 50.28904724121094, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.910446286201477, "logits/rejected": -0.9385887384414673, "logps/chosen": -573.4461669921875, "logps/ref_chosen": -64.64956665039062, "logps/ref_rejected": -111.72237396240234, "logps/rejected": -955.9521484375, "loss": 1.0912, "margin_dpo/margin_mean": 335.4331970214844, "margin_dpo/margin_std": 415.19305419921875, "step": 516 }, { "KL/chosen_KL_mean": -550.5552978515625, "KL/mean": -756.6061401367188, "KL/rejected_KL_mean": -962.656982421875, "KL/std": 494.27984619140625, "epoch": 0.7591776798825257, "fcm_dpo/beta": 0.0011394355678930879, "fcm_dpo/delta": -0.07319086790084839, "fcm_dpo/margin": 412.10162353515625, "fcm_dpo/q_t": 0.39602527022361755, "grad_norm": 31.643814086914062, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.8593244552612305, "logits/rejected": -0.8574497699737549, "logps/chosen": -611.4688720703125, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -1051.739990234375, "loss": 1.0563, "margin_dpo/margin_mean": 412.10162353515625, "margin_dpo/margin_std": 548.390380859375, "step": 517 }, { "KL/chosen_KL_mean": -496.5261535644531, "KL/mean": -666.101318359375, "KL/rejected_KL_mean": -835.676513671875, "KL/std": 392.9124755859375, "epoch": 0.7606461086637298, "fcm_dpo/beta": 0.001135983387939632, "fcm_dpo/delta": 0.015141095966100693, "fcm_dpo/margin": 339.1503601074219, "fcm_dpo/q_t": 0.40997710824012756, "grad_norm": 58.1636848449707, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.8889042139053345, "logits/rejected": -0.904350221157074, "logps/chosen": -553.9820556640625, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -920.9891357421875, "loss": 1.0946, "margin_dpo/margin_mean": 339.1503601074219, "margin_dpo/margin_std": 427.3458557128906, "step": 518 }, { "KL/chosen_KL_mean": -510.1135559082031, "KL/mean": -627.830322265625, "KL/rejected_KL_mean": -745.547119140625, "KL/std": 358.55865478515625, "epoch": 0.762114537444934, "fcm_dpo/beta": 0.0011437054490670562, "fcm_dpo/delta": 0.026432547718286514, "fcm_dpo/margin": 235.43359375, "fcm_dpo/q_t": 0.43695303797721863, "grad_norm": 37.28139877319336, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.8758772611618042, "logits/rejected": -0.859151303768158, "logps/chosen": -584.1768798828125, "logps/ref_chosen": -74.06331634521484, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -849.9913330078125, "loss": 1.2004, "margin_dpo/margin_mean": 235.43359375, "margin_dpo/margin_std": 449.8384094238281, "step": 519 }, { "KL/chosen_KL_mean": -533.606689453125, "KL/mean": -691.7373046875, "KL/rejected_KL_mean": -849.8678588867188, "KL/std": 418.42547607421875, "epoch": 0.7635829662261381, "fcm_dpo/beta": 0.0011494287755340338, "fcm_dpo/delta": 0.03775997459888458, "fcm_dpo/margin": 316.26123046875, "fcm_dpo/q_t": 0.4179002046585083, "grad_norm": 45.021728515625, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.8589770793914795, "logits/rejected": -0.8674265146255493, "logps/chosen": -603.906494140625, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -949.8492431640625, "loss": 1.1257, "margin_dpo/margin_mean": 316.26123046875, "margin_dpo/margin_std": 463.71783447265625, "step": 520 }, { "KL/chosen_KL_mean": -490.48187255859375, "KL/mean": -661.524169921875, "KL/rejected_KL_mean": -832.5665283203125, "KL/std": 435.0758972167969, "epoch": 0.7650513950073421, "fcm_dpo/beta": 0.0011624578619375825, "fcm_dpo/delta": 0.0017420090734958649, "fcm_dpo/margin": 342.0847473144531, "fcm_dpo/q_t": 0.4088232219219208, "grad_norm": 29.704362869262695, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.9206105470657349, "logits/rejected": -0.9278547763824463, "logps/chosen": -548.624755859375, "logps/ref_chosen": -58.14292526245117, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -915.84716796875, "loss": 1.0938, "margin_dpo/margin_mean": 342.084716796875, "margin_dpo/margin_std": 449.4230041503906, "step": 521 }, { "KL/chosen_KL_mean": -558.9736328125, "KL/mean": -713.635498046875, "KL/rejected_KL_mean": -868.29736328125, "KL/std": 458.9757080078125, "epoch": 0.7665198237885462, "fcm_dpo/beta": 0.0011631404049694538, "fcm_dpo/delta": 0.04171000048518181, "fcm_dpo/margin": 309.32373046875, "fcm_dpo/q_t": 0.41798996925354004, "grad_norm": 31.109495162963867, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.9124878644943237, "logits/rejected": -0.9153552055358887, "logps/chosen": -605.5213623046875, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -934.311279296875, "loss": 1.1346, "margin_dpo/margin_mean": 309.32373046875, "margin_dpo/margin_std": 488.1703796386719, "step": 522 }, { "KL/chosen_KL_mean": -593.5350341796875, "KL/mean": -758.7939453125, "KL/rejected_KL_mean": -924.0529174804688, "KL/std": 532.2261962890625, "epoch": 0.7679882525697503, "fcm_dpo/beta": 0.001167641719803214, "fcm_dpo/delta": 0.014641055837273598, "fcm_dpo/margin": 330.5179443359375, "fcm_dpo/q_t": 0.4140872061252594, "grad_norm": 43.91381072998047, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.9662898182868958, "logits/rejected": -0.9687439203262329, "logps/chosen": -655.3046264648438, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -1007.8143310546875, "loss": 1.1351, "margin_dpo/margin_mean": 330.5179443359375, "margin_dpo/margin_std": 560.5762329101562, "step": 523 }, { "KL/chosen_KL_mean": -574.9899291992188, "KL/mean": -747.72412109375, "KL/rejected_KL_mean": -920.458251953125, "KL/std": 479.4917297363281, "epoch": 0.7694566813509545, "fcm_dpo/beta": 0.0011622272431850433, "fcm_dpo/delta": -0.002190619707107544, "fcm_dpo/margin": 345.46832275390625, "fcm_dpo/q_t": 0.4057931900024414, "grad_norm": 47.520774841308594, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.9906701445579529, "logits/rejected": -0.9787558317184448, "logps/chosen": -653.06201171875, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -1001.76025390625, "loss": 1.1068, "margin_dpo/margin_mean": 345.46832275390625, "margin_dpo/margin_std": 491.1083984375, "step": 524 }, { "KL/chosen_KL_mean": -581.4891357421875, "KL/mean": -802.2073974609375, "KL/rejected_KL_mean": -1022.925537109375, "KL/std": 510.1922607421875, "epoch": 0.7709251101321586, "fcm_dpo/beta": 0.001145128975622356, "fcm_dpo/delta": -0.11176417022943497, "fcm_dpo/margin": 441.4364929199219, "fcm_dpo/q_t": 0.38664761185646057, "grad_norm": 42.60094451904297, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.925658106803894, "logits/rejected": -0.9625715017318726, "logps/chosen": -632.3170166015625, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05294036865234, "logps/rejected": -1122.978515625, "loss": 1.0305, "margin_dpo/margin_mean": 441.43646240234375, "margin_dpo/margin_std": 542.1448364257812, "step": 525 }, { "KL/chosen_KL_mean": -626.0618896484375, "KL/mean": -843.650390625, "KL/rejected_KL_mean": -1061.239013671875, "KL/std": 508.11346435546875, "epoch": 0.7723935389133627, "fcm_dpo/beta": 0.0011303203646093607, "fcm_dpo/delta": -0.09655816853046417, "fcm_dpo/margin": 435.17706298828125, "fcm_dpo/q_t": 0.3878590166568756, "grad_norm": 30.649280548095703, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.9626432657241821, "logits/rejected": -0.9604432582855225, "logps/chosen": -689.2291259765625, "logps/ref_chosen": -63.167236328125, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -1147.54833984375, "loss": 1.0485, "margin_dpo/margin_mean": 435.17706298828125, "margin_dpo/margin_std": 561.9842529296875, "step": 526 }, { "KL/chosen_KL_mean": -637.3934326171875, "KL/mean": -784.9993896484375, "KL/rejected_KL_mean": -932.60546875, "KL/std": 524.7011108398438, "epoch": 0.7738619676945668, "fcm_dpo/beta": 0.0011297144228592515, "fcm_dpo/delta": 0.06883951276540756, "fcm_dpo/margin": 295.2120361328125, "fcm_dpo/q_t": 0.4224596917629242, "grad_norm": 32.22301483154297, "learning_rate": 7.504749238082414e-08, "logits/chosen": -1.1435140371322632, "logits/rejected": -1.1115856170654297, "logps/chosen": -708.5220947265625, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -1010.947998046875, "loss": 1.1391, "margin_dpo/margin_mean": 295.2120361328125, "margin_dpo/margin_std": 437.1567687988281, "step": 527 }, { "KL/chosen_KL_mean": -652.498046875, "KL/mean": -834.37255859375, "KL/rejected_KL_mean": -1016.2471313476562, "KL/std": 502.570068359375, "epoch": 0.775330396475771, "fcm_dpo/beta": 0.001133130630478263, "fcm_dpo/delta": -0.012725085951387882, "fcm_dpo/margin": 363.7490539550781, "fcm_dpo/q_t": 0.40897679328918457, "grad_norm": 43.005401611328125, "learning_rate": 7.413308141366254e-08, "logits/chosen": -1.045546293258667, "logits/rejected": -1.0283198356628418, "logps/chosen": -720.5875244140625, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -1110.1572265625, "loss": 1.1151, "margin_dpo/margin_mean": 363.7490539550781, "margin_dpo/margin_std": 581.20263671875, "step": 528 }, { "KL/chosen_KL_mean": -768.2669677734375, "KL/mean": -888.3395385742188, "KL/rejected_KL_mean": -1008.412109375, "KL/std": 458.5384521484375, "epoch": 0.7767988252569751, "fcm_dpo/beta": 0.0011390424333512783, "fcm_dpo/delta": 0.02491956390440464, "fcm_dpo/margin": 240.14520263671875, "fcm_dpo/q_t": 0.43738028407096863, "grad_norm": 47.4577522277832, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.0672532320022583, "logits/rejected": -1.0765759944915771, "logps/chosen": -823.8419189453125, "logps/ref_chosen": -55.57495880126953, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -1097.6212158203125, "loss": 1.234, "margin_dpo/margin_mean": 240.14520263671875, "margin_dpo/margin_std": 571.541748046875, "step": 529 }, { "KL/chosen_KL_mean": -659.3563232421875, "KL/mean": -857.35400390625, "KL/rejected_KL_mean": -1055.3516845703125, "KL/std": 551.8106079101562, "epoch": 0.7782672540381792, "fcm_dpo/beta": 0.0011311739217489958, "fcm_dpo/delta": -0.050166938453912735, "fcm_dpo/margin": 395.99542236328125, "fcm_dpo/q_t": 0.40238332748413086, "grad_norm": 49.10841369628906, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.9842202663421631, "logits/rejected": -0.980260968208313, "logps/chosen": -706.957763671875, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -1142.63623046875, "loss": 1.1208, "margin_dpo/margin_mean": 395.99542236328125, "margin_dpo/margin_std": 689.074462890625, "step": 530 }, { "KL/chosen_KL_mean": -727.6012573242188, "KL/mean": -897.2587280273438, "KL/rejected_KL_mean": -1066.916259765625, "KL/std": 580.2291870117188, "epoch": 0.7797356828193832, "fcm_dpo/beta": 0.0011335888411849737, "fcm_dpo/delta": 0.015755577012896538, "fcm_dpo/margin": 339.31494140625, "fcm_dpo/q_t": 0.4141680896282196, "grad_norm": 44.63459777832031, "learning_rate": 7.141774982445147e-08, "logits/chosen": -1.0530567169189453, "logits/rejected": -1.0324784517288208, "logps/chosen": -782.8472900390625, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -1137.522216796875, "loss": 1.1281, "margin_dpo/margin_mean": 339.31494140625, "margin_dpo/margin_std": 551.3843994140625, "step": 531 }, { "KL/chosen_KL_mean": -740.7996826171875, "KL/mean": -921.9634399414062, "KL/rejected_KL_mean": -1103.127197265625, "KL/std": 562.6202392578125, "epoch": 0.7812041116005873, "fcm_dpo/beta": 0.0011215780396014452, "fcm_dpo/delta": -0.00783345103263855, "fcm_dpo/margin": 362.3275451660156, "fcm_dpo/q_t": 0.41034865379333496, "grad_norm": 86.36136627197266, "learning_rate": 7.052201923388953e-08, "logits/chosen": -1.013758897781372, "logits/rejected": -0.9856699705123901, "logps/chosen": -811.085693359375, "logps/ref_chosen": -70.28601837158203, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -1189.718505859375, "loss": 1.1499, "margin_dpo/margin_mean": 362.3275451660156, "margin_dpo/margin_std": 658.3814697265625, "step": 532 }, { "KL/chosen_KL_mean": -646.8947143554688, "KL/mean": -779.6904907226562, "KL/rejected_KL_mean": -912.4862670898438, "KL/std": 475.87249755859375, "epoch": 0.7826725403817915, "fcm_dpo/beta": 0.001128980191424489, "fcm_dpo/delta": -0.004243167117238045, "fcm_dpo/margin": 265.5915222167969, "fcm_dpo/q_t": 0.43269672989845276, "grad_norm": 44.21409606933594, "learning_rate": 6.963101805503646e-08, "logits/chosen": -1.0471582412719727, "logits/rejected": -1.0218884944915771, "logps/chosen": -711.7498168945312, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -989.0743408203125, "loss": 1.2058, "margin_dpo/margin_mean": 265.591552734375, "margin_dpo/margin_std": 571.443359375, "step": 533 }, { "KL/chosen_KL_mean": -682.74267578125, "KL/mean": -859.9710693359375, "KL/rejected_KL_mean": -1037.199462890625, "KL/std": 517.587890625, "epoch": 0.7841409691629956, "fcm_dpo/beta": 0.0011213625548407435, "fcm_dpo/delta": 0.001744687557220459, "fcm_dpo/margin": 354.456787109375, "fcm_dpo/q_t": 0.4103449285030365, "grad_norm": 40.56071853637695, "learning_rate": 6.874476976660184e-08, "logits/chosen": -1.0277010202407837, "logits/rejected": -1.022787094116211, "logps/chosen": -742.862060546875, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -1115.742919921875, "loss": 1.1133, "margin_dpo/margin_mean": 354.456787109375, "margin_dpo/margin_std": 530.1246337890625, "step": 534 }, { "KL/chosen_KL_mean": -598.3570556640625, "KL/mean": -804.5665283203125, "KL/rejected_KL_mean": -1010.776123046875, "KL/std": 515.3653564453125, "epoch": 0.7856093979441997, "fcm_dpo/beta": 0.0011251430260017514, "fcm_dpo/delta": -0.06760876625776291, "fcm_dpo/margin": 412.41912841796875, "fcm_dpo/q_t": 0.39604315161705017, "grad_norm": 32.73942565917969, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.9553531408309937, "logits/rejected": -0.9562033414840698, "logps/chosen": -652.687255859375, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -1107.083740234375, "loss": 1.0635, "margin_dpo/margin_mean": 412.41912841796875, "margin_dpo/margin_std": 538.2338256835938, "step": 535 }, { "KL/chosen_KL_mean": -510.51318359375, "KL/mean": -757.499755859375, "KL/rejected_KL_mean": -1004.4862060546875, "KL/std": 566.7451782226562, "epoch": 0.7870778267254038, "fcm_dpo/beta": 0.0010907297255471349, "fcm_dpo/delta": -0.14658187329769135, "fcm_dpo/margin": 493.97308349609375, "fcm_dpo/q_t": 0.38460248708724976, "grad_norm": 26.8385066986084, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.9329211711883545, "logits/rejected": -0.9625818729400635, "logps/chosen": -557.59375, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -1093.5841064453125, "loss": 1.0253, "margin_dpo/margin_mean": 493.97308349609375, "margin_dpo/margin_std": 671.49755859375, "step": 536 }, { "KL/chosen_KL_mean": -550.3721313476562, "KL/mean": -718.6456298828125, "KL/rejected_KL_mean": -886.919189453125, "KL/std": 459.65936279296875, "epoch": 0.788546255506608, "fcm_dpo/beta": 0.0010922504588961601, "fcm_dpo/delta": 0.0329880453646183, "fcm_dpo/margin": 336.547119140625, "fcm_dpo/q_t": 0.4157649874687195, "grad_norm": 46.854251861572266, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.9757102727890015, "logits/rejected": -0.9494297504425049, "logps/chosen": -608.1195678710938, "logps/ref_chosen": -57.747467041015625, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -957.3576049804688, "loss": 1.1432, "margin_dpo/margin_mean": 336.547119140625, "margin_dpo/margin_std": 562.57177734375, "step": 537 }, { "KL/chosen_KL_mean": -694.86279296875, "KL/mean": -885.943115234375, "KL/rejected_KL_mean": -1077.0233154296875, "KL/std": 506.5946960449219, "epoch": 0.7900146842878121, "fcm_dpo/beta": 0.0010833143023774028, "fcm_dpo/delta": -0.014802441000938416, "fcm_dpo/margin": 382.1605224609375, "fcm_dpo/q_t": 0.40582871437072754, "grad_norm": 31.442834854125977, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.9763197898864746, "logits/rejected": -0.9549228549003601, "logps/chosen": -761.2787475585938, "logps/ref_chosen": -66.41594696044922, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -1161.25146484375, "loss": 1.0873, "margin_dpo/margin_mean": 382.1605224609375, "margin_dpo/margin_std": 518.7582397460938, "step": 538 }, { "KL/chosen_KL_mean": -594.95458984375, "KL/mean": -775.4962158203125, "KL/rejected_KL_mean": -956.0377807617188, "KL/std": 433.60675048828125, "epoch": 0.7914831130690162, "fcm_dpo/beta": 0.0010887064272537827, "fcm_dpo/delta": 0.007081391289830208, "fcm_dpo/margin": 361.0832214355469, "fcm_dpo/q_t": 0.41017356514930725, "grad_norm": 30.531400680541992, "learning_rate": 6.438563463416221e-08, "logits/chosen": -1.028662919998169, "logits/rejected": -1.015451192855835, "logps/chosen": -653.4474487304688, "logps/ref_chosen": -58.492855072021484, "logps/ref_rejected": -91.85395050048828, "logps/rejected": -1047.8917236328125, "loss": 1.0956, "margin_dpo/margin_mean": 361.0832214355469, "margin_dpo/margin_std": 479.93939208984375, "step": 539 }, { "KL/chosen_KL_mean": -608.3707885742188, "KL/mean": -851.121826171875, "KL/rejected_KL_mean": -1093.8729248046875, "KL/std": 551.8743896484375, "epoch": 0.7929515418502202, "fcm_dpo/beta": 0.0010707840556278825, "fcm_dpo/delta": -0.12625397741794586, "fcm_dpo/margin": 485.5020751953125, "fcm_dpo/q_t": 0.38717466592788696, "grad_norm": 33.186012268066406, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.9437620639801025, "logits/rejected": -0.9658418893814087, "logps/chosen": -671.853271484375, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.42999267578125, "logps/rejected": -1210.3028564453125, "loss": 1.0431, "margin_dpo/margin_mean": 485.5020446777344, "margin_dpo/margin_std": 648.8251953125, "step": 540 }, { "KL/chosen_KL_mean": -717.1644287109375, "KL/mean": -842.0802612304688, "KL/rejected_KL_mean": -966.99609375, "KL/std": 490.08099365234375, "epoch": 0.7944199706314243, "fcm_dpo/beta": 0.0010596727952361107, "fcm_dpo/delta": -0.0012176802847534418, "fcm_dpo/margin": 249.83154296875, "fcm_dpo/q_t": 0.4399704039096832, "grad_norm": 45.27843475341797, "learning_rate": 6.267605843546767e-08, "logits/chosen": -1.0905866622924805, "logits/rejected": -1.0777101516723633, "logps/chosen": -795.44482421875, "logps/ref_chosen": -78.28036499023438, "logps/ref_rejected": -103.273681640625, "logps/rejected": -1070.269775390625, "loss": 1.2291, "margin_dpo/margin_mean": 249.83154296875, "margin_dpo/margin_std": 572.767822265625, "step": 541 }, { "KL/chosen_KL_mean": -626.7008056640625, "KL/mean": -863.0870361328125, "KL/rejected_KL_mean": -1099.4732666015625, "KL/std": 545.5762939453125, "epoch": 0.7958883994126285, "fcm_dpo/beta": 0.0010356687707826495, "fcm_dpo/delta": -0.09657715260982513, "fcm_dpo/margin": 472.7724609375, "fcm_dpo/q_t": 0.3910575807094574, "grad_norm": 49.864784240722656, "learning_rate": 6.182866334636888e-08, "logits/chosen": -1.0657211542129517, "logits/rejected": -1.098391056060791, "logps/chosen": -684.185791015625, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -1195.9483642578125, "loss": 1.0595, "margin_dpo/margin_mean": 472.7724609375, "margin_dpo/margin_std": 642.970947265625, "step": 542 }, { "KL/chosen_KL_mean": -702.564697265625, "KL/mean": -870.582763671875, "KL/rejected_KL_mean": -1038.6007080078125, "KL/std": 663.33447265625, "epoch": 0.7973568281938326, "fcm_dpo/beta": 0.001045349519699812, "fcm_dpo/delta": 0.050522927194833755, "fcm_dpo/margin": 336.0360107421875, "fcm_dpo/q_t": 0.43216556310653687, "grad_norm": 39.975833892822266, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.9809169769287109, "logits/rejected": -1.0095728635787964, "logps/chosen": -763.1822509765625, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -1144.19970703125, "loss": 1.202, "margin_dpo/margin_mean": 336.0360107421875, "margin_dpo/margin_std": 774.7861328125, "step": 543 }, { "KL/chosen_KL_mean": -709.45947265625, "KL/mean": -901.91064453125, "KL/rejected_KL_mean": -1094.36181640625, "KL/std": 522.1925048828125, "epoch": 0.7988252569750367, "fcm_dpo/beta": 0.0010487872641533613, "fcm_dpo/delta": -0.003843428334221244, "fcm_dpo/margin": 384.9021911621094, "fcm_dpo/q_t": 0.40891605615615845, "grad_norm": 32.359127044677734, "learning_rate": 6.01487708363232e-08, "logits/chosen": -1.0074541568756104, "logits/rejected": -1.025747299194336, "logps/chosen": -769.101806640625, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -1195.31640625, "loss": 1.1097, "margin_dpo/margin_mean": 384.9022216796875, "margin_dpo/margin_std": 589.19677734375, "step": 544 }, { "KL/chosen_KL_mean": -657.5155029296875, "KL/mean": -898.0047607421875, "KL/rejected_KL_mean": -1138.494140625, "KL/std": 535.8654174804688, "epoch": 0.8002936857562408, "fcm_dpo/beta": 0.0010332402307540178, "fcm_dpo/delta": -0.10204954445362091, "fcm_dpo/margin": 480.9786376953125, "fcm_dpo/q_t": 0.38913267850875854, "grad_norm": 33.80911636352539, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.9045934677124023, "logits/rejected": -0.9373363256454468, "logps/chosen": -725.1640625, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -1234.402099609375, "loss": 1.0393, "margin_dpo/margin_mean": 480.9786376953125, "margin_dpo/margin_std": 609.970458984375, "step": 545 }, { "KL/chosen_KL_mean": -630.7238159179688, "KL/mean": -781.1712646484375, "KL/rejected_KL_mean": -931.6187133789062, "KL/std": 462.85028076171875, "epoch": 0.801762114537445, "fcm_dpo/beta": 0.0010417320299893618, "fcm_dpo/delta": 0.0893513560295105, "fcm_dpo/margin": 300.89483642578125, "fcm_dpo/q_t": 0.4263242185115814, "grad_norm": 38.7053108215332, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.9846795797348022, "logits/rejected": -0.9681577682495117, "logps/chosen": -681.468017578125, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -1013.4849853515625, "loss": 1.1632, "margin_dpo/margin_mean": 300.89483642578125, "margin_dpo/margin_std": 500.70306396484375, "step": 546 }, { "KL/chosen_KL_mean": -622.949462890625, "KL/mean": -825.2542724609375, "KL/rejected_KL_mean": -1027.55908203125, "KL/std": 521.1716918945312, "epoch": 0.8032305433186491, "fcm_dpo/beta": 0.0010446913074702024, "fcm_dpo/delta": -0.02371753193438053, "fcm_dpo/margin": 404.6095275878906, "fcm_dpo/q_t": 0.4047175645828247, "grad_norm": 48.27106857299805, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.9310117363929749, "logits/rejected": -0.9426393508911133, "logps/chosen": -696.63720703125, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -1118.3204345703125, "loss": 1.0885, "margin_dpo/margin_mean": 404.60955810546875, "margin_dpo/margin_std": 567.8289184570312, "step": 547 }, { "KL/chosen_KL_mean": -656.554443359375, "KL/mean": -834.1263427734375, "KL/rejected_KL_mean": -1011.6983642578125, "KL/std": 554.41552734375, "epoch": 0.8046989720998532, "fcm_dpo/beta": 0.0010460072662681341, "fcm_dpo/delta": 0.02962075173854828, "fcm_dpo/margin": 355.14385986328125, "fcm_dpo/q_t": 0.42021819949150085, "grad_norm": 30.078670501708984, "learning_rate": 5.684919345471029e-08, "logits/chosen": -1.0434558391571045, "logits/rejected": -1.0432292222976685, "logps/chosen": -721.80078125, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -1105.81640625, "loss": 1.1356, "margin_dpo/margin_mean": 355.14385986328125, "margin_dpo/margin_std": 616.107177734375, "step": 548 }, { "KL/chosen_KL_mean": -705.240478515625, "KL/mean": -842.5841064453125, "KL/rejected_KL_mean": -979.927734375, "KL/std": 473.9487609863281, "epoch": 0.8061674008810573, "fcm_dpo/beta": 0.0010487986728549004, "fcm_dpo/delta": 0.010232968255877495, "fcm_dpo/margin": 274.687255859375, "fcm_dpo/q_t": 0.4355009198188782, "grad_norm": 55.40736770629883, "learning_rate": 5.603696935852426e-08, "logits/chosen": -1.0249577760696411, "logits/rejected": -1.0135101079940796, "logps/chosen": -754.4528198242188, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -1053.8380126953125, "loss": 1.2042, "margin_dpo/margin_mean": 274.687255859375, "margin_dpo/margin_std": 570.506103515625, "step": 549 }, { "KL/chosen_KL_mean": -676.3907470703125, "KL/mean": -843.399169921875, "KL/rejected_KL_mean": -1010.4075927734375, "KL/std": 506.3270263671875, "epoch": 0.8076358296622613, "fcm_dpo/beta": 0.0010564997792243958, "fcm_dpo/delta": 0.04886303097009659, "fcm_dpo/margin": 334.0168762207031, "fcm_dpo/q_t": 0.4197149872779846, "grad_norm": 45.87727737426758, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.9613184928894043, "logits/rejected": -0.9839845299720764, "logps/chosen": -733.1976318359375, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -1105.533447265625, "loss": 1.1355, "margin_dpo/margin_mean": 334.01690673828125, "margin_dpo/margin_std": 528.5301513671875, "step": 550 }, { "KL/chosen_KL_mean": -605.405029296875, "KL/mean": -882.225341796875, "KL/rejected_KL_mean": -1159.045654296875, "KL/std": 553.9127197265625, "epoch": 0.8091042584434655, "fcm_dpo/beta": 0.001030082581564784, "fcm_dpo/delta": -0.18102356791496277, "fcm_dpo/margin": 553.6406860351562, "fcm_dpo/q_t": 0.3695809245109558, "grad_norm": 48.99431610107422, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.9334255456924438, "logits/rejected": -0.9928078055381775, "logps/chosen": -664.5113525390625, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -1270.718505859375, "loss": 0.9681, "margin_dpo/margin_mean": 553.6406860351562, "margin_dpo/margin_std": 553.6318359375, "step": 551 }, { "KL/chosen_KL_mean": -561.7005004882812, "KL/mean": -867.2351684570312, "KL/rejected_KL_mean": -1172.769775390625, "KL/std": 621.8682861328125, "epoch": 0.8105726872246696, "fcm_dpo/beta": 0.000981001416221261, "fcm_dpo/delta": -0.21600230038166046, "fcm_dpo/margin": 611.0693359375, "fcm_dpo/q_t": 0.3693755269050598, "grad_norm": 55.24009704589844, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.9999994039535522, "logits/rejected": -1.0425043106079102, "logps/chosen": -624.0551147460938, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -1277.33203125, "loss": 0.9781, "margin_dpo/margin_mean": 611.0692749023438, "margin_dpo/margin_std": 701.7575073242188, "step": 552 }, { "KL/chosen_KL_mean": -648.89208984375, "KL/mean": -806.120849609375, "KL/rejected_KL_mean": -963.349609375, "KL/std": 528.423828125, "epoch": 0.8120411160058737, "fcm_dpo/beta": 0.0009901414159685373, "fcm_dpo/delta": 0.09159143269062042, "fcm_dpo/margin": 314.45751953125, "fcm_dpo/q_t": 0.43176624178886414, "grad_norm": 28.201841354370117, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.9241914749145508, "logits/rejected": -0.9152238368988037, "logps/chosen": -717.1509399414062, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -1061.44677734375, "loss": 1.1743, "margin_dpo/margin_mean": 314.45751953125, "margin_dpo/margin_std": 584.146240234375, "step": 553 }, { "KL/chosen_KL_mean": -661.1310424804688, "KL/mean": -876.3670654296875, "KL/rejected_KL_mean": -1091.60302734375, "KL/std": 559.095947265625, "epoch": 0.8135095447870778, "fcm_dpo/beta": 0.000999167561531067, "fcm_dpo/delta": -0.03213735669851303, "fcm_dpo/margin": 430.4720153808594, "fcm_dpo/q_t": 0.4063616394996643, "grad_norm": 72.81156921386719, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.9473394155502319, "logits/rejected": -0.9234938621520996, "logps/chosen": -729.0787353515625, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -1181.3857421875, "loss": 1.1178, "margin_dpo/margin_mean": 430.4720153808594, "margin_dpo/margin_std": 707.259521484375, "step": 554 }, { "KL/chosen_KL_mean": -683.7077026367188, "KL/mean": -935.7298583984375, "KL/rejected_KL_mean": -1187.7520751953125, "KL/std": 597.616943359375, "epoch": 0.8149779735682819, "fcm_dpo/beta": 0.0009781017433851957, "fcm_dpo/delta": -0.09773121029138565, "fcm_dpo/margin": 504.04437255859375, "fcm_dpo/q_t": 0.3947869837284088, "grad_norm": 50.65883255004883, "learning_rate": 5.127169765359515e-08, "logits/chosen": -1.0049800872802734, "logits/rejected": -1.0619277954101562, "logps/chosen": -737.0382080078125, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -1296.2314453125, "loss": 1.078, "margin_dpo/margin_mean": 504.04437255859375, "margin_dpo/margin_std": 776.3997802734375, "step": 555 }, { "KL/chosen_KL_mean": -654.3289794921875, "KL/mean": -803.46728515625, "KL/rejected_KL_mean": -952.6055908203125, "KL/std": 454.75347900390625, "epoch": 0.8164464023494861, "fcm_dpo/beta": 0.0009878533892333508, "fcm_dpo/delta": 0.10852371156215668, "fcm_dpo/margin": 298.27655029296875, "fcm_dpo/q_t": 0.4318218529224396, "grad_norm": 28.539920806884766, "learning_rate": 5.049569317994012e-08, "logits/chosen": -1.0015957355499268, "logits/rejected": -0.997646689414978, "logps/chosen": -712.9735107421875, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -1053.946044921875, "loss": 1.1608, "margin_dpo/margin_mean": 298.2765808105469, "margin_dpo/margin_std": 454.01800537109375, "step": 556 }, { "KL/chosen_KL_mean": -716.552978515625, "KL/mean": -943.0986328125, "KL/rejected_KL_mean": -1169.6444091796875, "KL/std": 638.390625, "epoch": 0.8179148311306902, "fcm_dpo/beta": 0.0009835660457611084, "fcm_dpo/delta": -0.04802338778972626, "fcm_dpo/margin": 453.0914306640625, "fcm_dpo/q_t": 0.402864545583725, "grad_norm": 74.13558959960938, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -1.025956392288208, "logits/rejected": -1.0636675357818604, "logps/chosen": -784.3936157226562, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93965911865234, "logps/rejected": -1279.583984375, "loss": 1.105, "margin_dpo/margin_mean": 453.0914306640625, "margin_dpo/margin_std": 724.352783203125, "step": 557 }, { "KL/chosen_KL_mean": -638.923095703125, "KL/mean": -944.195068359375, "KL/rejected_KL_mean": -1249.467041015625, "KL/std": 590.7932739257812, "epoch": 0.8193832599118943, "fcm_dpo/beta": 0.000956161180511117, "fcm_dpo/delta": -0.19506213068962097, "fcm_dpo/margin": 610.5439453125, "fcm_dpo/q_t": 0.36552464962005615, "grad_norm": 37.18507385253906, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -1.0881562232971191, "logits/rejected": -1.1110167503356934, "logps/chosen": -701.2913208007812, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -1351.628173828125, "loss": 0.9725, "margin_dpo/margin_mean": 610.5439453125, "margin_dpo/margin_std": 645.1649169921875, "step": 558 }, { "KL/chosen_KL_mean": -737.7335205078125, "KL/mean": -990.26171875, "KL/rejected_KL_mean": -1242.789794921875, "KL/std": 609.5598754882812, "epoch": 0.8208516886930984, "fcm_dpo/beta": 0.000936803175136447, "fcm_dpo/delta": -0.07669977843761444, "fcm_dpo/margin": 505.05615234375, "fcm_dpo/q_t": 0.39399462938308716, "grad_norm": 30.2044734954834, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -1.1786550283432007, "logits/rejected": -1.18735933303833, "logps/chosen": -798.4859008789062, "logps/ref_chosen": -60.752323150634766, "logps/ref_rejected": -93.44229125976562, "logps/rejected": -1336.2320556640625, "loss": 1.0549, "margin_dpo/margin_mean": 505.05621337890625, "margin_dpo/margin_std": 663.0927734375, "step": 559 }, { "KL/chosen_KL_mean": -665.8540649414062, "KL/mean": -852.7691650390625, "KL/rejected_KL_mean": -1039.684326171875, "KL/std": 539.5546875, "epoch": 0.8223201174743024, "fcm_dpo/beta": 0.0009346292354166508, "fcm_dpo/delta": 0.0522555373609066, "fcm_dpo/margin": 373.83026123046875, "fcm_dpo/q_t": 0.4206470251083374, "grad_norm": 34.21842575073242, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.9070395231246948, "logits/rejected": -0.8892009258270264, "logps/chosen": -723.9578857421875, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -1119.675537109375, "loss": 1.1402, "margin_dpo/margin_mean": 373.83026123046875, "margin_dpo/margin_std": 590.425537109375, "step": 560 }, { "KL/chosen_KL_mean": -797.4371948242188, "KL/mean": -945.9874877929688, "KL/rejected_KL_mean": -1094.537841796875, "KL/std": 540.9091186523438, "epoch": 0.8237885462555066, "fcm_dpo/beta": 0.0009608013788238168, "fcm_dpo/delta": 0.11739911884069443, "fcm_dpo/margin": 297.1005554199219, "fcm_dpo/q_t": 0.4335824251174927, "grad_norm": 41.68199157714844, "learning_rate": 4.669493178106432e-08, "logits/chosen": -1.0954780578613281, "logits/rejected": -1.1158504486083984, "logps/chosen": -848.35009765625, "logps/ref_chosen": -50.912879943847656, "logps/ref_rejected": -99.06856536865234, "logps/rejected": -1193.6063232421875, "loss": 1.2181, "margin_dpo/margin_mean": 297.10052490234375, "margin_dpo/margin_std": 675.0682373046875, "step": 561 }, { "KL/chosen_KL_mean": -731.2474975585938, "KL/mean": -947.933837890625, "KL/rejected_KL_mean": -1164.6201171875, "KL/std": 597.2823486328125, "epoch": 0.8252569750367107, "fcm_dpo/beta": 0.0009560026228427887, "fcm_dpo/delta": -0.01568439230322838, "fcm_dpo/margin": 433.3727722167969, "fcm_dpo/q_t": 0.40745779871940613, "grad_norm": 44.4998664855957, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -1.0381966829299927, "logits/rejected": -1.0505871772766113, "logps/chosen": -790.7119140625, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -1261.162841796875, "loss": 1.1008, "margin_dpo/margin_mean": 433.3727722167969, "margin_dpo/margin_std": 646.6458740234375, "step": 562 }, { "KL/chosen_KL_mean": -818.318603515625, "KL/mean": -976.48193359375, "KL/rejected_KL_mean": -1134.645263671875, "KL/std": 644.12939453125, "epoch": 0.8267254038179148, "fcm_dpo/beta": 0.0009566263761371374, "fcm_dpo/delta": -0.042735543102025986, "fcm_dpo/margin": 316.32666015625, "fcm_dpo/q_t": 0.42516082525253296, "grad_norm": 57.121238708496094, "learning_rate": 4.521198892775202e-08, "logits/chosen": -1.0335191488265991, "logits/rejected": -1.0376369953155518, "logps/chosen": -878.9268188476562, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -1229.2130126953125, "loss": 1.2364, "margin_dpo/margin_mean": 316.32666015625, "margin_dpo/margin_std": 755.5484619140625, "step": 563 }, { "KL/chosen_KL_mean": -720.1054077148438, "KL/mean": -927.0765380859375, "KL/rejected_KL_mean": -1134.0478515625, "KL/std": 552.9932250976562, "epoch": 0.8281938325991189, "fcm_dpo/beta": 0.0009554863208904862, "fcm_dpo/delta": 0.004538660869002342, "fcm_dpo/margin": 413.9423828125, "fcm_dpo/q_t": 0.40960174798965454, "grad_norm": 40.964012145996094, "learning_rate": 4.447860229910544e-08, "logits/chosen": -1.0873092412948608, "logits/rejected": -1.0802876949310303, "logps/chosen": -794.373779296875, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.23818969726562, "logps/rejected": -1227.285888671875, "loss": 1.0915, "margin_dpo/margin_mean": 413.9423828125, "margin_dpo/margin_std": 534.1187744140625, "step": 564 }, { "KL/chosen_KL_mean": -773.20751953125, "KL/mean": -995.1334228515625, "KL/rejected_KL_mean": -1217.059326171875, "KL/std": 649.26171875, "epoch": 0.8296622613803231, "fcm_dpo/beta": 0.0009499896550551057, "fcm_dpo/delta": -0.022613905370235443, "fcm_dpo/margin": 443.85186767578125, "fcm_dpo/q_t": 0.4094652831554413, "grad_norm": 60.46991729736328, "learning_rate": 4.375063135042445e-08, "logits/chosen": -1.0429582595825195, "logits/rejected": -1.0467889308929443, "logps/chosen": -842.2274169921875, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -1302.83837890625, "loss": 1.1303, "margin_dpo/margin_mean": 443.85186767578125, "margin_dpo/margin_std": 780.3602294921875, "step": 565 }, { "KL/chosen_KL_mean": -731.8004760742188, "KL/mean": -975.913818359375, "KL/rejected_KL_mean": -1220.0272216796875, "KL/std": 664.28466796875, "epoch": 0.8311306901615272, "fcm_dpo/beta": 0.000945397128816694, "fcm_dpo/delta": -0.06484313309192657, "fcm_dpo/margin": 488.22674560546875, "fcm_dpo/q_t": 0.39865684509277344, "grad_norm": 32.33478927612305, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -1.0355273485183716, "logits/rejected": -1.0623806715011597, "logps/chosen": -798.3458251953125, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86932373046875, "logps/rejected": -1323.896484375, "loss": 1.1021, "margin_dpo/margin_mean": 488.22674560546875, "margin_dpo/margin_std": 773.612060546875, "step": 566 }, { "KL/chosen_KL_mean": -693.953369140625, "KL/mean": -878.732666015625, "KL/rejected_KL_mean": -1063.511962890625, "KL/std": 454.4079284667969, "epoch": 0.8325991189427313, "fcm_dpo/beta": 0.0009434693492949009, "fcm_dpo/delta": 0.053218990564346313, "fcm_dpo/margin": 369.55865478515625, "fcm_dpo/q_t": 0.41841405630111694, "grad_norm": 40.58536911010742, "learning_rate": 4.231101308059165e-08, "logits/chosen": -1.1621546745300293, "logits/rejected": -1.1755425930023193, "logps/chosen": -746.8116455078125, "logps/ref_chosen": -52.85829544067383, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -1148.8829345703125, "loss": 1.1196, "margin_dpo/margin_mean": 369.55865478515625, "margin_dpo/margin_std": 497.5330505371094, "step": 567 }, { "KL/chosen_KL_mean": -696.86279296875, "KL/mean": -949.0692749023438, "KL/rejected_KL_mean": -1201.275634765625, "KL/std": 546.216064453125, "epoch": 0.8340675477239354, "fcm_dpo/beta": 0.0009356088703498244, "fcm_dpo/delta": -0.07556474953889847, "fcm_dpo/margin": 504.4129333496094, "fcm_dpo/q_t": 0.39082616567611694, "grad_norm": 34.73697280883789, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -1.0284827947616577, "logits/rejected": -1.069136142730713, "logps/chosen": -742.05517578125, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236907958984, "logps/rejected": -1290.3680419921875, "loss": 1.0361, "margin_dpo/margin_mean": 504.41290283203125, "margin_dpo/margin_std": 577.991943359375, "step": 568 }, { "KL/chosen_KL_mean": -800.5419921875, "KL/mean": -999.2967529296875, "KL/rejected_KL_mean": -1198.051513671875, "KL/std": 707.2173461914062, "epoch": 0.8355359765051396, "fcm_dpo/beta": 0.0009422843577340245, "fcm_dpo/delta": 0.025416847318410873, "fcm_dpo/margin": 397.50946044921875, "fcm_dpo/q_t": 0.4149819612503052, "grad_norm": 68.37850952148438, "learning_rate": 4.089328585837512e-08, "logits/chosen": -1.0683939456939697, "logits/rejected": -1.0735970735549927, "logps/chosen": -864.2625732421875, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -1277.15478515625, "loss": 1.1639, "margin_dpo/margin_mean": 397.50946044921875, "margin_dpo/margin_std": 739.0003051757812, "step": 569 }, { "KL/chosen_KL_mean": -741.1410522460938, "KL/mean": -932.583251953125, "KL/rejected_KL_mean": -1124.025390625, "KL/std": 565.6273803710938, "epoch": 0.8370044052863436, "fcm_dpo/beta": 0.0009413023362867534, "fcm_dpo/delta": 0.04106989875435829, "fcm_dpo/margin": 382.88421630859375, "fcm_dpo/q_t": 0.4200833737850189, "grad_norm": 34.74723815917969, "learning_rate": 4.019267817841834e-08, "logits/chosen": -1.177173137664795, "logits/rejected": -1.1700718402862549, "logps/chosen": -802.755615234375, "logps/ref_chosen": -61.61454391479492, "logps/ref_rejected": -82.14186096191406, "logps/rejected": -1206.167236328125, "loss": 1.1407, "margin_dpo/margin_mean": 382.8842468261719, "margin_dpo/margin_std": 631.297607421875, "step": 570 }, { "KL/chosen_KL_mean": -775.4749755859375, "KL/mean": -999.5449829101562, "KL/rejected_KL_mean": -1223.614990234375, "KL/std": 576.8671875, "epoch": 0.8384728340675477, "fcm_dpo/beta": 0.00093979382654652, "fcm_dpo/delta": -0.02226072922348976, "fcm_dpo/margin": 448.14007568359375, "fcm_dpo/q_t": 0.4062590003013611, "grad_norm": 51.04140090942383, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -1.042744755744934, "logits/rejected": -1.0550953149795532, "logps/chosen": -828.529052734375, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -1314.9517822265625, "loss": 1.1144, "margin_dpo/margin_mean": 448.14007568359375, "margin_dpo/margin_std": 723.029052734375, "step": 571 }, { "KL/chosen_KL_mean": -804.140625, "KL/mean": -1039.684814453125, "KL/rejected_KL_mean": -1275.22900390625, "KL/std": 669.639404296875, "epoch": 0.8399412628487518, "fcm_dpo/beta": 0.0009325648425146937, "fcm_dpo/delta": -0.04140661656856537, "fcm_dpo/margin": 471.0883483886719, "fcm_dpo/q_t": 0.4063248038291931, "grad_norm": 32.49717712402344, "learning_rate": 3.880806698864086e-08, "logits/chosen": -1.0944292545318604, "logits/rejected": -1.1295243501663208, "logps/chosen": -852.5999755859375, "logps/ref_chosen": -48.45928955078125, "logps/ref_rejected": -83.55703735351562, "logps/rejected": -1358.7861328125, "loss": 1.12, "margin_dpo/margin_mean": 471.08837890625, "margin_dpo/margin_std": 815.548828125, "step": 572 }, { "KL/chosen_KL_mean": -792.255615234375, "KL/mean": -1000.1666259765625, "KL/rejected_KL_mean": -1208.07763671875, "KL/std": 596.44189453125, "epoch": 0.8414096916299559, "fcm_dpo/beta": 0.0009360272670164704, "fcm_dpo/delta": 0.011164238676428795, "fcm_dpo/margin": 415.822021484375, "fcm_dpo/q_t": 0.41275107860565186, "grad_norm": 29.790285110473633, "learning_rate": 3.812409996461275e-08, "logits/chosen": -1.1403576135635376, "logits/rejected": -1.1524157524108887, "logps/chosen": -843.8782348632812, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -1293.402587890625, "loss": 1.1041, "margin_dpo/margin_mean": 415.822021484375, "margin_dpo/margin_std": 595.5224609375, "step": 573 }, { "KL/chosen_KL_mean": -698.723876953125, "KL/mean": -926.019287109375, "KL/rejected_KL_mean": -1153.314697265625, "KL/std": 534.9053344726562, "epoch": 0.8428781204111601, "fcm_dpo/beta": 0.0009335580398328602, "fcm_dpo/delta": -0.025494040921330452, "fcm_dpo/margin": 454.5906982421875, "fcm_dpo/q_t": 0.4033309519290924, "grad_norm": 44.21940612792969, "learning_rate": 3.74457160675965e-08, "logits/chosen": -1.1207423210144043, "logits/rejected": -1.147803783416748, "logps/chosen": -749.768310546875, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -1246.12109375, "loss": 1.0836, "margin_dpo/margin_mean": 454.5906982421875, "margin_dpo/margin_std": 616.3843994140625, "step": 574 }, { "KL/chosen_KL_mean": -788.0826416015625, "KL/mean": -997.0467529296875, "KL/rejected_KL_mean": -1206.0108642578125, "KL/std": 558.839599609375, "epoch": 0.8443465491923642, "fcm_dpo/beta": 0.0009250047150999308, "fcm_dpo/delta": 0.012776091694831848, "fcm_dpo/margin": 417.9283447265625, "fcm_dpo/q_t": 0.41284099221229553, "grad_norm": 47.46909713745117, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.9957201480865479, "logits/rejected": -1.003206491470337, "logps/chosen": -859.8727416992188, "logps/ref_chosen": -71.7901382446289, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -1301.3970947265625, "loss": 1.1451, "margin_dpo/margin_mean": 417.9283447265625, "margin_dpo/margin_std": 721.6036376953125, "step": 575 }, { "KL/chosen_KL_mean": -724.9219970703125, "KL/mean": -881.224853515625, "KL/rejected_KL_mean": -1037.5277099609375, "KL/std": 497.8431396484375, "epoch": 0.8458149779735683, "fcm_dpo/beta": 0.0009450684301555157, "fcm_dpo/delta": 0.10788638889789581, "fcm_dpo/margin": 312.60577392578125, "fcm_dpo/q_t": 0.4328186810016632, "grad_norm": 32.42967224121094, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -1.0785043239593506, "logits/rejected": -1.1083261966705322, "logps/chosen": -779.1849365234375, "logps/ref_chosen": -54.262962341308594, "logps/ref_rejected": -100.75428009033203, "logps/rejected": -1138.281982421875, "loss": 1.1847, "margin_dpo/margin_mean": 312.60577392578125, "margin_dpo/margin_std": 577.78369140625, "step": 576 }, { "KL/chosen_KL_mean": -675.4276123046875, "KL/mean": -879.83935546875, "KL/rejected_KL_mean": -1084.251220703125, "KL/std": 572.3829345703125, "epoch": 0.8472834067547724, "fcm_dpo/beta": 0.0009535005083307624, "fcm_dpo/delta": 0.01057706493884325, "fcm_dpo/margin": 408.823486328125, "fcm_dpo/q_t": 0.41230309009552, "grad_norm": 30.890453338623047, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -1.104528546333313, "logits/rejected": -1.0938575267791748, "logps/chosen": -737.3373413085938, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -1168.32177734375, "loss": 1.1236, "margin_dpo/margin_mean": 408.8235168457031, "margin_dpo/margin_std": 617.4876098632812, "step": 577 }, { "KL/chosen_KL_mean": -653.1123046875, "KL/mean": -878.5861206054688, "KL/rejected_KL_mean": -1104.0599365234375, "KL/std": 559.1902465820312, "epoch": 0.8487518355359766, "fcm_dpo/beta": 0.0009461954468861222, "fcm_dpo/delta": -0.028605271130800247, "fcm_dpo/margin": 450.94769287109375, "fcm_dpo/q_t": 0.40265679359436035, "grad_norm": 39.44662094116211, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.960713803768158, "logits/rejected": -0.9943492412567139, "logps/chosen": -702.3759765625, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.4362564086914, "logps/rejected": -1187.4962158203125, "loss": 1.0739, "margin_dpo/margin_mean": 450.94769287109375, "margin_dpo/margin_std": 569.8036499023438, "step": 578 }, { "KL/chosen_KL_mean": -740.7020263671875, "KL/mean": -872.2453002929688, "KL/rejected_KL_mean": -1003.78857421875, "KL/std": 550.201416015625, "epoch": 0.8502202643171806, "fcm_dpo/beta": 0.0009595105657353997, "fcm_dpo/delta": 0.050605472177267075, "fcm_dpo/margin": 263.0865478515625, "fcm_dpo/q_t": 0.44163453578948975, "grad_norm": 62.6992301940918, "learning_rate": 3.41381639738331e-08, "logits/chosen": -0.9965687394142151, "logits/rejected": -0.9950494766235352, "logps/chosen": -799.587890625, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -1098.576171875, "loss": 1.2315, "margin_dpo/margin_mean": 263.0865478515625, "margin_dpo/margin_std": 606.66259765625, "step": 579 }, { "KL/chosen_KL_mean": -536.5447998046875, "KL/mean": -791.0418701171875, "KL/rejected_KL_mean": -1045.5389404296875, "KL/std": 616.0582275390625, "epoch": 0.8516886930983847, "fcm_dpo/beta": 0.0009441368165425956, "fcm_dpo/delta": -0.08517200499773026, "fcm_dpo/margin": 508.99420166015625, "fcm_dpo/q_t": 0.3952370882034302, "grad_norm": 31.18793487548828, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.8859065771102905, "logits/rejected": -0.9189168214797974, "logps/chosen": -585.2516479492188, "logps/ref_chosen": -48.70683670043945, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -1127.29736328125, "loss": 1.0568, "margin_dpo/margin_mean": 508.9941711425781, "margin_dpo/margin_std": 707.2589111328125, "step": 580 }, { "KL/chosen_KL_mean": -736.6950073242188, "KL/mean": -915.5865478515625, "KL/rejected_KL_mean": -1094.477783203125, "KL/std": 581.46533203125, "epoch": 0.8531571218795888, "fcm_dpo/beta": 0.0009529366507194936, "fcm_dpo/delta": 0.06105329841375351, "fcm_dpo/margin": 357.78289794921875, "fcm_dpo/q_t": 0.4243428111076355, "grad_norm": 41.60260009765625, "learning_rate": 3.285483927764726e-08, "logits/chosen": -1.1242549419403076, "logits/rejected": -1.1350033283233643, "logps/chosen": -798.9173583984375, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -1186.213623046875, "loss": 1.1564, "margin_dpo/margin_mean": 357.78289794921875, "margin_dpo/margin_std": 635.5386962890625, "step": 581 }, { "KL/chosen_KL_mean": -648.818603515625, "KL/mean": -849.9896240234375, "KL/rejected_KL_mean": -1051.16064453125, "KL/std": 508.7437744140625, "epoch": 0.8546255506607929, "fcm_dpo/beta": 0.0009470300283282995, "fcm_dpo/delta": -0.08303224295377731, "fcm_dpo/margin": 402.34197998046875, "fcm_dpo/q_t": 0.41153034567832947, "grad_norm": 39.270023345947266, "learning_rate": 3.222175147833556e-08, "logits/chosen": -1.0631489753723145, "logits/rejected": -1.087165355682373, "logps/chosen": -707.0472412109375, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -1161.230224609375, "loss": 1.1143, "margin_dpo/margin_mean": 402.34197998046875, "margin_dpo/margin_std": 554.0859375, "step": 582 }, { "KL/chosen_KL_mean": -734.060546875, "KL/mean": -855.7951049804688, "KL/rejected_KL_mean": -977.5296630859375, "KL/std": 547.7103271484375, "epoch": 0.856093979441997, "fcm_dpo/beta": 0.0009388748439960182, "fcm_dpo/delta": -0.0015546621289104223, "fcm_dpo/margin": 243.4691162109375, "fcm_dpo/q_t": 0.44874462485313416, "grad_norm": 69.85670471191406, "learning_rate": 3.159440233840763e-08, "logits/chosen": -1.029801845550537, "logits/rejected": -1.0272910594940186, "logps/chosen": -790.9234619140625, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -1065.93359375, "loss": 1.2665, "margin_dpo/margin_mean": 243.4691162109375, "margin_dpo/margin_std": 655.7301025390625, "step": 583 }, { "KL/chosen_KL_mean": -614.206787109375, "KL/mean": -863.93896484375, "KL/rejected_KL_mean": -1113.671142578125, "KL/std": 565.6908569335938, "epoch": 0.8575624082232012, "fcm_dpo/beta": 0.0009302167454734445, "fcm_dpo/delta": -0.06771711260080338, "fcm_dpo/margin": 499.46441650390625, "fcm_dpo/q_t": 0.3950466513633728, "grad_norm": 31.617774963378906, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -1.0071210861206055, "logits/rejected": -1.0183664560317993, "logps/chosen": -671.107421875, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -1211.3072509765625, "loss": 1.0467, "margin_dpo/margin_mean": 499.46441650390625, "margin_dpo/margin_std": 599.928466796875, "step": 584 }, { "KL/chosen_KL_mean": -719.5697631835938, "KL/mean": -944.973388671875, "KL/rejected_KL_mean": -1170.376953125, "KL/std": 628.8306884765625, "epoch": 0.8590308370044053, "fcm_dpo/beta": 0.0009213717421516776, "fcm_dpo/delta": -0.016240080818533897, "fcm_dpo/margin": 450.80718994140625, "fcm_dpo/q_t": 0.4065605700016022, "grad_norm": 37.25502395629883, "learning_rate": 3.035698600998121e-08, "logits/chosen": -1.0849614143371582, "logits/rejected": -1.1072694063186646, "logps/chosen": -780.543701171875, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -1254.5465087890625, "loss": 1.1214, "margin_dpo/margin_mean": 450.8072509765625, "margin_dpo/margin_std": 750.4278564453125, "step": 585 }, { "KL/chosen_KL_mean": -764.2457275390625, "KL/mean": -916.5640869140625, "KL/rejected_KL_mean": -1068.88232421875, "KL/std": 550.64111328125, "epoch": 0.8604992657856094, "fcm_dpo/beta": 0.0009377297828905284, "fcm_dpo/delta": 0.11785154044628143, "fcm_dpo/margin": 304.63677978515625, "fcm_dpo/q_t": 0.4342753291130066, "grad_norm": 32.1750602722168, "learning_rate": 2.974695142855388e-08, "logits/chosen": -1.0714232921600342, "logits/rejected": -1.0927150249481201, "logps/chosen": -821.101318359375, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.80261993408203, "logps/rejected": -1160.68505859375, "loss": 1.1966, "margin_dpo/margin_mean": 304.63677978515625, "margin_dpo/margin_std": 600.1655883789062, "step": 586 }, { "KL/chosen_KL_mean": -531.717041015625, "KL/mean": -734.9912109375, "KL/rejected_KL_mean": -938.265380859375, "KL/std": 562.0120849609375, "epoch": 0.8619676945668135, "fcm_dpo/beta": 0.0009496827842667699, "fcm_dpo/delta": 0.01429927907884121, "fcm_dpo/margin": 406.54833984375, "fcm_dpo/q_t": 0.4112315773963928, "grad_norm": 39.31088638305664, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -1.075244426727295, "logits/rejected": -1.0990477800369263, "logps/chosen": -576.4086303710938, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -1020.8892211914062, "loss": 1.1065, "margin_dpo/margin_mean": 406.54833984375, "margin_dpo/margin_std": 574.6107177734375, "step": 587 }, { "KL/chosen_KL_mean": -691.1477661132812, "KL/mean": -867.112060546875, "KL/rejected_KL_mean": -1043.0762939453125, "KL/std": 485.6700134277344, "epoch": 0.8634361233480177, "fcm_dpo/beta": 0.0009533903794363141, "fcm_dpo/delta": 0.06662734597921371, "fcm_dpo/margin": 351.92852783203125, "fcm_dpo/q_t": 0.4219910204410553, "grad_norm": 29.434553146362305, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -1.0492743253707886, "logits/rejected": -1.0743939876556396, "logps/chosen": -741.4427490234375, "logps/ref_chosen": -50.29494857788086, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -1150.4461669921875, "loss": 1.1371, "margin_dpo/margin_mean": 351.92852783203125, "margin_dpo/margin_std": 510.2389831542969, "step": 588 }, { "KL/chosen_KL_mean": -680.4524536132812, "KL/mean": -915.0347290039062, "KL/rejected_KL_mean": -1149.616943359375, "KL/std": 550.6396484375, "epoch": 0.8649045521292217, "fcm_dpo/beta": 0.0009546533692628145, "fcm_dpo/delta": -0.05010441318154335, "fcm_dpo/margin": 469.16448974609375, "fcm_dpo/q_t": 0.3972678780555725, "grad_norm": 34.32419204711914, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -1.0105873346328735, "logits/rejected": -1.0250484943389893, "logps/chosen": -740.38232421875, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -1261.2723388671875, "loss": 1.0566, "margin_dpo/margin_mean": 469.16448974609375, "margin_dpo/margin_std": 573.5706787109375, "step": 589 }, { "KL/chosen_KL_mean": -611.4031982421875, "KL/mean": -840.3101196289062, "KL/rejected_KL_mean": -1069.217041015625, "KL/std": 545.7060546875, "epoch": 0.8663729809104258, "fcm_dpo/beta": 0.0009457060368731618, "fcm_dpo/delta": -0.034474555402994156, "fcm_dpo/margin": 457.81378173828125, "fcm_dpo/q_t": 0.4021071493625641, "grad_norm": 26.10135269165039, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.9935860633850098, "logits/rejected": -1.0191072225570679, "logps/chosen": -667.2130126953125, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -1175.2799072265625, "loss": 1.0732, "margin_dpo/margin_mean": 457.81378173828125, "margin_dpo/margin_std": 602.8309936523438, "step": 590 }, { "KL/chosen_KL_mean": -662.7632446289062, "KL/mean": -870.50341796875, "KL/rejected_KL_mean": -1078.24365234375, "KL/std": 519.09814453125, "epoch": 0.8678414096916299, "fcm_dpo/beta": 0.0009449812932871282, "fcm_dpo/delta": 0.007675642147660255, "fcm_dpo/margin": 415.48028564453125, "fcm_dpo/q_t": 0.4091309905052185, "grad_norm": 33.07461166381836, "learning_rate": 2.678415274939408e-08, "logits/chosen": -1.0695421695709229, "logits/rejected": -1.0605621337890625, "logps/chosen": -719.00390625, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -1162.02978515625, "loss": 1.1031, "margin_dpo/margin_mean": 415.48028564453125, "margin_dpo/margin_std": 583.4301147460938, "step": 591 }, { "KL/chosen_KL_mean": -719.1552734375, "KL/mean": -909.2796020507812, "KL/rejected_KL_mean": -1099.40380859375, "KL/std": 545.5037841796875, "epoch": 0.869309838472834, "fcm_dpo/beta": 0.0009508398361504078, "fcm_dpo/delta": 0.039905961602926254, "fcm_dpo/margin": 380.2485656738281, "fcm_dpo/q_t": 0.4189043939113617, "grad_norm": 33.25096893310547, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -1.052534818649292, "logits/rejected": -1.05497407913208, "logps/chosen": -767.0955810546875, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -1175.13671875, "loss": 1.1759, "margin_dpo/margin_mean": 380.24853515625, "margin_dpo/margin_std": 739.9800415039062, "step": 592 }, { "KL/chosen_KL_mean": -685.7623901367188, "KL/mean": -834.489990234375, "KL/rejected_KL_mean": -983.2177124023438, "KL/std": 593.387939453125, "epoch": 0.8707782672540382, "fcm_dpo/beta": 0.0009585937950760126, "fcm_dpo/delta": 0.026102518662810326, "fcm_dpo/margin": 297.45526123046875, "fcm_dpo/q_t": 0.43642657995224, "grad_norm": 60.29446029663086, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.925950825214386, "logits/rejected": -0.916424036026001, "logps/chosen": -734.453125, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800094604492, "logps/rejected": -1044.125732421875, "loss": 1.2163, "margin_dpo/margin_mean": 297.45526123046875, "margin_dpo/margin_std": 659.6752319335938, "step": 593 }, { "KL/chosen_KL_mean": -658.8277587890625, "KL/mean": -844.7755737304688, "KL/rejected_KL_mean": -1030.723388671875, "KL/std": 573.47119140625, "epoch": 0.8722466960352423, "fcm_dpo/beta": 0.0009541836334392428, "fcm_dpo/delta": -0.046220000833272934, "fcm_dpo/margin": 371.89556884765625, "fcm_dpo/q_t": 0.41900911927223206, "grad_norm": 36.9598388671875, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -1.0531035661697388, "logits/rejected": -1.044195294380188, "logps/chosen": -713.7626953125, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967803955078, "logps/rejected": -1116.822998046875, "loss": 1.1458, "margin_dpo/margin_mean": 371.8955993652344, "margin_dpo/margin_std": 604.8126831054688, "step": 594 }, { "KL/chosen_KL_mean": -617.7359619140625, "KL/mean": -833.1766357421875, "KL/rejected_KL_mean": -1048.6171875, "KL/std": 538.6633911132812, "epoch": 0.8737151248164464, "fcm_dpo/beta": 0.0009470278164371848, "fcm_dpo/delta": -0.00846764538437128, "fcm_dpo/margin": 430.8812255859375, "fcm_dpo/q_t": 0.4093731641769409, "grad_norm": 39.55894470214844, "learning_rate": 2.451969280180849e-08, "logits/chosen": -1.0494012832641602, "logits/rejected": -1.0700163841247559, "logps/chosen": -667.1563720703125, "logps/ref_chosen": -49.4204216003418, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -1129.24462890625, "loss": 1.0908, "margin_dpo/margin_mean": 430.8812255859375, "margin_dpo/margin_std": 596.7766723632812, "step": 595 }, { "KL/chosen_KL_mean": -716.04296875, "KL/mean": -864.3089599609375, "KL/rejected_KL_mean": -1012.574951171875, "KL/std": 536.0817260742188, "epoch": 0.8751835535976505, "fcm_dpo/beta": 0.0009654526365920901, "fcm_dpo/delta": 0.11698772758245468, "fcm_dpo/margin": 296.531982421875, "fcm_dpo/q_t": 0.43718546628952026, "grad_norm": 68.12952423095703, "learning_rate": 2.396839494982103e-08, "logits/chosen": -1.018229603767395, "logits/rejected": -0.9841310381889343, "logps/chosen": -775.834716796875, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -1092.666015625, "loss": 1.2079, "margin_dpo/margin_mean": 296.531982421875, "margin_dpo/margin_std": 639.1839599609375, "step": 596 }, { "KL/chosen_KL_mean": -683.271728515625, "KL/mean": -945.9227905273438, "KL/rejected_KL_mean": -1208.5738525390625, "KL/std": 629.5780639648438, "epoch": 0.8766519823788547, "fcm_dpo/beta": 0.0009458234999328852, "fcm_dpo/delta": -0.10513734817504883, "fcm_dpo/margin": 525.3020629882812, "fcm_dpo/q_t": 0.3902336359024048, "grad_norm": 26.535120010375977, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.9855027198791504, "logits/rejected": -1.036144733428955, "logps/chosen": -740.5325317382812, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -1309.267578125, "loss": 1.0533, "margin_dpo/margin_mean": 525.3020629882812, "margin_dpo/margin_std": 688.6737060546875, "step": 597 }, { "KL/chosen_KL_mean": -684.9342041015625, "KL/mean": -889.61328125, "KL/rejected_KL_mean": -1094.292236328125, "KL/std": 542.2847900390625, "epoch": 0.8781204111600588, "fcm_dpo/beta": 0.0009448026539757848, "fcm_dpo/delta": 0.012614801526069641, "fcm_dpo/margin": 409.35821533203125, "fcm_dpo/q_t": 0.41108059883117676, "grad_norm": 40.530555725097656, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -1.0675361156463623, "logits/rejected": -1.0781702995300293, "logps/chosen": -737.45263671875, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -1183.7362060546875, "loss": 1.1156, "margin_dpo/margin_mean": 409.3581848144531, "margin_dpo/margin_std": 592.7867431640625, "step": 598 }, { "KL/chosen_KL_mean": -699.3439331054688, "KL/mean": -867.4818725585938, "KL/rejected_KL_mean": -1035.619873046875, "KL/std": 521.607421875, "epoch": 0.8795888399412628, "fcm_dpo/beta": 0.0009433372761122882, "fcm_dpo/delta": -0.04999526962637901, "fcm_dpo/margin": 336.27593994140625, "fcm_dpo/q_t": 0.4230959117412567, "grad_norm": 32.54823303222656, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -1.0585663318634033, "logits/rejected": -1.0668901205062866, "logps/chosen": -749.1466064453125, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -1118.598388671875, "loss": 1.1528, "margin_dpo/margin_mean": 336.27593994140625, "margin_dpo/margin_std": 495.86431884765625, "step": 599 }, { "KL/chosen_KL_mean": -743.595947265625, "KL/mean": -904.8917846679688, "KL/rejected_KL_mean": -1066.1876220703125, "KL/std": 529.2329711914062, "epoch": 0.8810572687224669, "fcm_dpo/beta": 0.0009553448762744665, "fcm_dpo/delta": 0.0948304608464241, "fcm_dpo/margin": 322.5916748046875, "fcm_dpo/q_t": 0.4312170147895813, "grad_norm": 32.78029251098633, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -1.1024036407470703, "logits/rejected": -1.0984766483306885, "logps/chosen": -810.03076171875, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -1151.64404296875, "loss": 1.1881, "margin_dpo/margin_mean": 322.5916748046875, "margin_dpo/margin_std": 638.26123046875, "step": 600 }, { "KL/chosen_KL_mean": -767.73779296875, "KL/mean": -976.8656005859375, "KL/rejected_KL_mean": -1185.993408203125, "KL/std": 572.8872680664062, "epoch": 0.882525697503671, "fcm_dpo/beta": 0.0009599350159987807, "fcm_dpo/delta": -0.0015968242660164833, "fcm_dpo/margin": 418.25567626953125, "fcm_dpo/q_t": 0.40683579444885254, "grad_norm": 36.36787796020508, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -1.050248622894287, "logits/rejected": -1.0717060565948486, "logps/chosen": -826.8714599609375, "logps/ref_chosen": -59.13361358642578, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -1280.684326171875, "loss": 1.0942, "margin_dpo/margin_mean": 418.2556457519531, "margin_dpo/margin_std": 568.2978515625, "step": 601 }, { "KL/chosen_KL_mean": -537.829345703125, "KL/mean": -798.304931640625, "KL/rejected_KL_mean": -1058.780517578125, "KL/std": 537.489990234375, "epoch": 0.8839941262848752, "fcm_dpo/beta": 0.0009524415945634246, "fcm_dpo/delta": -0.10132233053445816, "fcm_dpo/margin": 520.951171875, "fcm_dpo/q_t": 0.38715463876724243, "grad_norm": 62.27396774291992, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -1.0703651905059814, "logits/rejected": -1.1087815761566162, "logps/chosen": -586.4228515625, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -1146.4490966796875, "loss": 1.0206, "margin_dpo/margin_mean": 520.9512329101562, "margin_dpo/margin_std": 575.708740234375, "step": 602 }, { "KL/chosen_KL_mean": -672.793701171875, "KL/mean": -911.109619140625, "KL/rejected_KL_mean": -1149.425537109375, "KL/std": 605.7391967773438, "epoch": 0.8854625550660793, "fcm_dpo/beta": 0.0009342863922938704, "fcm_dpo/delta": -0.04748653993010521, "fcm_dpo/margin": 476.6319580078125, "fcm_dpo/q_t": 0.4025436341762543, "grad_norm": 34.53113555908203, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -1.0783579349517822, "logits/rejected": -1.0710588693618774, "logps/chosen": -743.2083129882812, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32559967041016, "logps/rejected": -1249.751220703125, "loss": 1.0872, "margin_dpo/margin_mean": 476.6319580078125, "margin_dpo/margin_std": 712.30078125, "step": 603 }, { "KL/chosen_KL_mean": -660.3494873046875, "KL/mean": -920.5330810546875, "KL/rejected_KL_mean": -1180.716552734375, "KL/std": 584.669677734375, "epoch": 0.8869309838472834, "fcm_dpo/beta": 0.0009213722078129649, "fcm_dpo/delta": -0.08345615863800049, "fcm_dpo/margin": 520.3671264648438, "fcm_dpo/q_t": 0.39393433928489685, "grad_norm": 40.38688278198242, "learning_rate": 1.977362051376158e-08, "logits/chosen": -1.0343176126480103, "logits/rejected": -1.0740426778793335, "logps/chosen": -706.8075561523438, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -1272.571044921875, "loss": 1.0614, "margin_dpo/margin_mean": 520.3671875, "margin_dpo/margin_std": 716.9524536132812, "step": 604 }, { "KL/chosen_KL_mean": -681.8074951171875, "KL/mean": -865.7674560546875, "KL/rejected_KL_mean": -1049.7274169921875, "KL/std": 528.2335205078125, "epoch": 0.8883994126284875, "fcm_dpo/beta": 0.0009236353216692805, "fcm_dpo/delta": 0.06231696531176567, "fcm_dpo/margin": 367.9199523925781, "fcm_dpo/q_t": 0.4245484471321106, "grad_norm": 31.172998428344727, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -1.030253291130066, "logits/rejected": -1.046684980392456, "logps/chosen": -748.0568237304688, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -1152.032470703125, "loss": 1.1514, "margin_dpo/margin_mean": 367.919921875, "margin_dpo/margin_std": 631.5323486328125, "step": 605 }, { "KL/chosen_KL_mean": -690.3057861328125, "KL/mean": -892.7607421875, "KL/rejected_KL_mean": -1095.2156982421875, "KL/std": 546.4476928710938, "epoch": 0.8898678414096917, "fcm_dpo/beta": 0.0009358040988445282, "fcm_dpo/delta": 0.021479565650224686, "fcm_dpo/margin": 404.9098205566406, "fcm_dpo/q_t": 0.41396206617355347, "grad_norm": 34.67936706542969, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -1.0415606498718262, "logits/rejected": -1.0564000606536865, "logps/chosen": -745.1249389648438, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37146759033203, "logps/rejected": -1193.587158203125, "loss": 1.1119, "margin_dpo/margin_mean": 404.90985107421875, "margin_dpo/margin_std": 580.0758666992188, "step": 606 }, { "KL/chosen_KL_mean": -705.6632080078125, "KL/mean": -875.1304931640625, "KL/rejected_KL_mean": -1044.597900390625, "KL/std": 545.179931640625, "epoch": 0.8913362701908958, "fcm_dpo/beta": 0.0009470410877838731, "fcm_dpo/delta": 0.08123958110809326, "fcm_dpo/margin": 338.9346923828125, "fcm_dpo/q_t": 0.4270426332950592, "grad_norm": 25.185142517089844, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -1.0110514163970947, "logits/rejected": -0.9996987581253052, "logps/chosen": -763.7471923828125, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -1124.375, "loss": 1.1634, "margin_dpo/margin_mean": 338.9346923828125, "margin_dpo/margin_std": 585.0699462890625, "step": 607 }, { "KL/chosen_KL_mean": -620.40673828125, "KL/mean": -821.8770751953125, "KL/rejected_KL_mean": -1023.3474731445312, "KL/std": 486.0280456542969, "epoch": 0.8928046989720999, "fcm_dpo/beta": 0.0009526251233182847, "fcm_dpo/delta": 0.016666967421770096, "fcm_dpo/margin": 402.94073486328125, "fcm_dpo/q_t": 0.410768061876297, "grad_norm": 29.697845458984375, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -1.0471224784851074, "logits/rejected": -1.0614254474639893, "logps/chosen": -677.8575439453125, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -1118.120849609375, "loss": 1.0917, "margin_dpo/margin_mean": 402.94073486328125, "margin_dpo/margin_std": 497.81024169921875, "step": 608 }, { "KL/chosen_KL_mean": -658.7738037109375, "KL/mean": -919.4385986328125, "KL/rejected_KL_mean": -1180.103271484375, "KL/std": 681.7255249023438, "epoch": 0.8942731277533039, "fcm_dpo/beta": 0.000938827870413661, "fcm_dpo/delta": -0.09398971498012543, "fcm_dpo/margin": 521.3295288085938, "fcm_dpo/q_t": 0.39671069383621216, "grad_norm": 34.812618255615234, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -1.0940017700195312, "logits/rejected": -1.117903709411621, "logps/chosen": -717.5792236328125, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -1268.91943359375, "loss": 1.0694, "margin_dpo/margin_mean": 521.32958984375, "margin_dpo/margin_std": 807.185302734375, "step": 609 }, { "KL/chosen_KL_mean": -647.0574340820312, "KL/mean": -813.5626220703125, "KL/rejected_KL_mean": -980.0677490234375, "KL/std": 518.0669555664062, "epoch": 0.895741556534508, "fcm_dpo/beta": 0.0009477235144004226, "fcm_dpo/delta": 0.08690465986728668, "fcm_dpo/margin": 333.0103759765625, "fcm_dpo/q_t": 0.4269651770591736, "grad_norm": 41.71244812011719, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.9856526851654053, "logits/rejected": -0.9613279104232788, "logps/chosen": -712.75244140625, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.40538787841797, "logps/rejected": -1063.47314453125, "loss": 1.1704, "margin_dpo/margin_mean": 333.0104064941406, "margin_dpo/margin_std": 598.3924560546875, "step": 610 }, { "KL/chosen_KL_mean": -674.8112182617188, "KL/mean": -951.8814697265625, "KL/rejected_KL_mean": -1228.951904296875, "KL/std": 677.4830932617188, "epoch": 0.8972099853157122, "fcm_dpo/beta": 0.000932648777961731, "fcm_dpo/delta": -0.12321210652589798, "fcm_dpo/margin": 554.140625, "fcm_dpo/q_t": 0.3880399465560913, "grad_norm": 28.344263076782227, "learning_rate": 1.6421423736208e-08, "logits/chosen": -1.058631181716919, "logits/rejected": -1.1068617105484009, "logps/chosen": -727.41064453125, "logps/ref_chosen": -52.59946823120117, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -1315.2828369140625, "loss": 1.0449, "margin_dpo/margin_mean": 554.140625, "margin_dpo/margin_std": 762.7548828125, "step": 611 }, { "KL/chosen_KL_mean": -731.4866943359375, "KL/mean": -947.14697265625, "KL/rejected_KL_mean": -1162.8072509765625, "KL/std": 552.690185546875, "epoch": 0.8986784140969163, "fcm_dpo/beta": 0.0009295167401432991, "fcm_dpo/delta": -0.001130029559135437, "fcm_dpo/margin": 431.320556640625, "fcm_dpo/q_t": 0.40825164318084717, "grad_norm": 29.17259979248047, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -1.0582959651947021, "logits/rejected": -1.0618293285369873, "logps/chosen": -790.8104248046875, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -1251.11962890625, "loss": 1.0948, "margin_dpo/margin_mean": 431.320556640625, "margin_dpo/margin_std": 588.9520263671875, "step": 612 }, { "KL/chosen_KL_mean": -643.850830078125, "KL/mean": -893.5146484375, "KL/rejected_KL_mean": -1143.178466796875, "KL/std": 606.8695068359375, "epoch": 0.9001468428781204, "fcm_dpo/beta": 0.0009158622706308961, "fcm_dpo/delta": -0.0603950060904026, "fcm_dpo/margin": 499.3277282714844, "fcm_dpo/q_t": 0.39547261595726013, "grad_norm": 36.79396057128906, "learning_rate": 1.551886292185553e-08, "logits/chosen": -1.0371217727661133, "logits/rejected": -1.0906472206115723, "logps/chosen": -703.5807495117188, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10752868652344, "logps/rejected": -1248.2861328125, "loss": 1.0542, "margin_dpo/margin_mean": 499.3277282714844, "margin_dpo/margin_std": 621.1013793945312, "step": 613 }, { "KL/chosen_KL_mean": -698.552978515625, "KL/mean": -949.2009887695312, "KL/rejected_KL_mean": -1199.84912109375, "KL/std": 597.0087890625, "epoch": 0.9016152716593245, "fcm_dpo/beta": 0.0009089302038773894, "fcm_dpo/delta": -0.058258313685655594, "fcm_dpo/margin": 501.296142578125, "fcm_dpo/q_t": 0.398156076669693, "grad_norm": 42.726219177246094, "learning_rate": 1.507684480352292e-08, "logits/chosen": -1.0097222328186035, "logits/rejected": -1.0844841003417969, "logps/chosen": -751.491943359375, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -1304.5284423828125, "loss": 1.0711, "margin_dpo/margin_mean": 501.296142578125, "margin_dpo/margin_std": 695.1583251953125, "step": 614 }, { "KL/chosen_KL_mean": -673.9354248046875, "KL/mean": -883.3118896484375, "KL/rejected_KL_mean": -1092.688232421875, "KL/std": 623.587158203125, "epoch": 0.9030837004405287, "fcm_dpo/beta": 0.0009105931967496872, "fcm_dpo/delta": 0.019142257049679756, "fcm_dpo/margin": 418.7528076171875, "fcm_dpo/q_t": 0.41320300102233887, "grad_norm": 30.516206741333008, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -1.0232172012329102, "logits/rejected": -1.0445995330810547, "logps/chosen": -739.7527465820312, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -1187.86572265625, "loss": 1.1256, "margin_dpo/margin_mean": 418.7528076171875, "margin_dpo/margin_std": 670.531982421875, "step": 615 }, { "KL/chosen_KL_mean": -789.86279296875, "KL/mean": -958.6298828125, "KL/rejected_KL_mean": -1127.3968505859375, "KL/std": 513.2537841796875, "epoch": 0.9045521292217328, "fcm_dpo/beta": 0.0009216421167366207, "fcm_dpo/delta": 0.0916953831911087, "fcm_dpo/margin": 337.53411865234375, "fcm_dpo/q_t": 0.4289320111274719, "grad_norm": 29.76529884338379, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -1.0982820987701416, "logits/rejected": -1.080725073814392, "logps/chosen": -854.9956665039062, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -1202.097412109375, "loss": 1.1665, "margin_dpo/margin_mean": 337.53411865234375, "margin_dpo/margin_std": 574.6236572265625, "step": 616 }, { "KL/chosen_KL_mean": -749.3828125, "KL/mean": -869.354248046875, "KL/rejected_KL_mean": -989.32568359375, "KL/std": 496.443359375, "epoch": 0.9060205580029369, "fcm_dpo/beta": 0.0009326934814453125, "fcm_dpo/delta": 0.079354427754879, "fcm_dpo/margin": 239.94287109375, "fcm_dpo/q_t": 0.4489472508430481, "grad_norm": 61.406524658203125, "learning_rate": 1.378797888467345e-08, "logits/chosen": -1.001933217048645, "logits/rejected": -0.9698858261108398, "logps/chosen": -812.3883056640625, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -1053.559814453125, "loss": 1.2466, "margin_dpo/margin_mean": 239.94284057617188, "margin_dpo/margin_std": 587.9033813476562, "step": 617 }, { "KL/chosen_KL_mean": -782.6476440429688, "KL/mean": -1043.6942138671875, "KL/rejected_KL_mean": -1304.74072265625, "KL/std": 657.9475708007812, "epoch": 0.9074889867841409, "fcm_dpo/beta": 0.0009334392379969358, "fcm_dpo/delta": -0.0920899510383606, "fcm_dpo/margin": 522.0931396484375, "fcm_dpo/q_t": 0.39517101645469666, "grad_norm": 39.66215896606445, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -1.0424597263336182, "logits/rejected": -1.0812008380889893, "logps/chosen": -849.7490234375, "logps/ref_chosen": -67.10134887695312, "logps/ref_rejected": -92.15340423583984, "logps/rejected": -1396.8941650390625, "loss": 1.0962, "margin_dpo/margin_mean": 522.0931396484375, "margin_dpo/margin_std": 848.4417114257812, "step": 618 }, { "KL/chosen_KL_mean": -782.80810546875, "KL/mean": -982.2987060546875, "KL/rejected_KL_mean": -1181.789306640625, "KL/std": 620.9131469726562, "epoch": 0.908957415565345, "fcm_dpo/beta": 0.000927778659388423, "fcm_dpo/delta": 0.030953753739595413, "fcm_dpo/margin": 398.981201171875, "fcm_dpo/q_t": 0.42306482791900635, "grad_norm": 59.24640655517578, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -1.0202059745788574, "logits/rejected": -1.0303071737289429, "logps/chosen": -838.7863159179688, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -1274.974609375, "loss": 1.1805, "margin_dpo/margin_mean": 398.981201171875, "margin_dpo/margin_std": 824.7608642578125, "step": 619 }, { "KL/chosen_KL_mean": -691.927490234375, "KL/mean": -884.0120239257812, "KL/rejected_KL_mean": -1076.0965576171875, "KL/std": 550.659912109375, "epoch": 0.9104258443465492, "fcm_dpo/beta": 0.0009312491165474057, "fcm_dpo/delta": 0.04368671402335167, "fcm_dpo/margin": 384.1690673828125, "fcm_dpo/q_t": 0.4192659258842468, "grad_norm": 35.7849235534668, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -1.0948054790496826, "logits/rejected": -1.1047601699829102, "logps/chosen": -751.7249755859375, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -1154.50732421875, "loss": 1.1388, "margin_dpo/margin_mean": 384.1690979003906, "margin_dpo/margin_std": 621.861572265625, "step": 620 }, { "KL/chosen_KL_mean": -702.0250244140625, "KL/mean": -988.3785400390625, "KL/rejected_KL_mean": -1274.73193359375, "KL/std": 669.7676391601562, "epoch": 0.9118942731277533, "fcm_dpo/beta": 0.0009251298615708947, "fcm_dpo/delta": -0.1372598260641098, "fcm_dpo/margin": 572.7069091796875, "fcm_dpo/q_t": 0.3810199499130249, "grad_norm": 43.405094146728516, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -1.1240839958190918, "logits/rejected": -1.1762495040893555, "logps/chosen": -755.9588012695312, "logps/ref_chosen": -53.93375778198242, "logps/ref_rejected": -88.36951446533203, "logps/rejected": -1363.1015625, "loss": 1.0337, "margin_dpo/margin_mean": 572.7069091796875, "margin_dpo/margin_std": 737.669189453125, "step": 621 }, { "KL/chosen_KL_mean": -679.5125732421875, "KL/mean": -879.1805419921875, "KL/rejected_KL_mean": -1078.8485107421875, "KL/std": 515.3754272460938, "epoch": 0.9133627019089574, "fcm_dpo/beta": 0.0009116814471781254, "fcm_dpo/delta": 0.03691772744059563, "fcm_dpo/margin": 399.3359680175781, "fcm_dpo/q_t": 0.4187527894973755, "grad_norm": 34.01809310913086, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.9621305465698242, "logits/rejected": -0.9480363726615906, "logps/chosen": -739.79833984375, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -1164.3673095703125, "loss": 1.1357, "margin_dpo/margin_mean": 399.3359680175781, "margin_dpo/margin_std": 653.330322265625, "step": 622 }, { "KL/chosen_KL_mean": -744.8868408203125, "KL/mean": -900.721435546875, "KL/rejected_KL_mean": -1056.55615234375, "KL/std": 522.1895141601562, "epoch": 0.9148311306901615, "fcm_dpo/beta": 0.0009337057126685977, "fcm_dpo/delta": 0.11226323246955872, "fcm_dpo/margin": 311.66925048828125, "fcm_dpo/q_t": 0.435872882604599, "grad_norm": 34.93113327026367, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -1.1241803169250488, "logits/rejected": -1.125817060470581, "logps/chosen": -809.0438232421875, "logps/ref_chosen": -64.1569595336914, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -1141.63916015625, "loss": 1.1971, "margin_dpo/margin_mean": 311.66925048828125, "margin_dpo/margin_std": 637.570556640625, "step": 623 }, { "KL/chosen_KL_mean": -713.7887573242188, "KL/mean": -955.57958984375, "KL/rejected_KL_mean": -1197.370361328125, "KL/std": 544.0978393554688, "epoch": 0.9162995594713657, "fcm_dpo/beta": 0.0009321460966020823, "fcm_dpo/delta": -0.05314317345619202, "fcm_dpo/margin": 483.5816650390625, "fcm_dpo/q_t": 0.3945468068122864, "grad_norm": 43.926517486572266, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -1.0758100748062134, "logits/rejected": -1.0797600746154785, "logps/chosen": -785.7073974609375, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -1294.50244140625, "loss": 1.0544, "margin_dpo/margin_mean": 483.5816345214844, "margin_dpo/margin_std": 579.5042114257812, "step": 624 }, { "KL/chosen_KL_mean": -692.0985107421875, "KL/mean": -961.903076171875, "KL/rejected_KL_mean": -1231.707763671875, "KL/std": 610.5233764648438, "epoch": 0.9177679882525698, "fcm_dpo/beta": 0.0009185270173475146, "fcm_dpo/delta": -0.10053034871816635, "fcm_dpo/margin": 539.6091918945312, "fcm_dpo/q_t": 0.386716365814209, "grad_norm": 63.644493103027344, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -1.0029915571212769, "logits/rejected": -1.0321152210235596, "logps/chosen": -750.4405517578125, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -1317.798095703125, "loss": 1.0132, "margin_dpo/margin_mean": 539.6091918945312, "margin_dpo/margin_std": 572.4862670898438, "step": 625 }, { "KL/chosen_KL_mean": -853.5245971679688, "KL/mean": -1002.6884765625, "KL/rejected_KL_mean": -1151.852294921875, "KL/std": 662.7627563476562, "epoch": 0.9192364170337739, "fcm_dpo/beta": 0.0009280656231567264, "fcm_dpo/delta": 0.1266339272260666, "fcm_dpo/margin": 298.32769775390625, "fcm_dpo/q_t": 0.4352928698062897, "grad_norm": 35.488224029541016, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.9830967783927917, "logits/rejected": -0.974500298500061, "logps/chosen": -928.63720703125, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.188720703125, "logps/rejected": -1251.041015625, "loss": 1.2186, "margin_dpo/margin_mean": 298.3277282714844, "margin_dpo/margin_std": 647.6847534179688, "step": 626 }, { "KL/chosen_KL_mean": -586.8433227539062, "KL/mean": -882.2752685546875, "KL/rejected_KL_mean": -1177.707275390625, "KL/std": 712.061767578125, "epoch": 0.920704845814978, "fcm_dpo/beta": 0.0009197980398312211, "fcm_dpo/delta": -0.15168313682079315, "fcm_dpo/margin": 590.864013671875, "fcm_dpo/q_t": 0.3880508542060852, "grad_norm": 31.904329299926758, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.9286566972732544, "logits/rejected": -1.0041477680206299, "logps/chosen": -634.5864868164062, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -1284.4617919921875, "loss": 1.0393, "margin_dpo/margin_mean": 590.864013671875, "margin_dpo/margin_std": 831.529296875, "step": 627 }, { "KL/chosen_KL_mean": -786.1205444335938, "KL/mean": -1010.9585571289062, "KL/rejected_KL_mean": -1235.796630859375, "KL/std": 596.643798828125, "epoch": 0.922173274596182, "fcm_dpo/beta": 0.0009038818534463644, "fcm_dpo/delta": -0.006778441369533539, "fcm_dpo/margin": 449.6760559082031, "fcm_dpo/q_t": 0.4091545343399048, "grad_norm": 33.698768615722656, "learning_rate": 9.543589206795238e-09, "logits/chosen": -1.0994905233383179, "logits/rejected": -1.1181318759918213, "logps/chosen": -846.303466796875, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -1337.351318359375, "loss": 1.1095, "margin_dpo/margin_mean": 449.6760559082031, "margin_dpo/margin_std": 686.2415161132812, "step": 628 }, { "KL/chosen_KL_mean": -741.7413330078125, "KL/mean": -951.3485107421875, "KL/rejected_KL_mean": -1160.955810546875, "KL/std": 565.2870483398438, "epoch": 0.9236417033773862, "fcm_dpo/beta": 0.0009075739653781056, "fcm_dpo/delta": 0.020304802805185318, "fcm_dpo/margin": 419.21453857421875, "fcm_dpo/q_t": 0.4112434983253479, "grad_norm": 35.47370910644531, "learning_rate": 9.19555885822887e-09, "logits/chosen": -1.0898232460021973, "logits/rejected": -1.107914686203003, "logps/chosen": -805.954833984375, "logps/ref_chosen": -64.21354675292969, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -1252.609375, "loss": 1.1025, "margin_dpo/margin_mean": 419.21453857421875, "margin_dpo/margin_std": 558.5067138671875, "step": 629 }, { "KL/chosen_KL_mean": -688.7796020507812, "KL/mean": -806.9313354492188, "KL/rejected_KL_mean": -925.0830078125, "KL/std": 583.607421875, "epoch": 0.9251101321585903, "fcm_dpo/beta": 0.0009130248799920082, "fcm_dpo/delta": 0.04915432631969452, "fcm_dpo/margin": 236.30340576171875, "fcm_dpo/q_t": 0.45570600032806396, "grad_norm": 55.93423080444336, "learning_rate": 8.85387393063622e-09, "logits/chosen": -1.0273975133895874, "logits/rejected": -1.0003504753112793, "logps/chosen": -748.0706176757812, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -1008.6813354492188, "loss": 1.2755, "margin_dpo/margin_mean": 236.30337524414062, "margin_dpo/margin_std": 700.1854248046875, "step": 630 }, { "KL/chosen_KL_mean": -818.428955078125, "KL/mean": -1009.658935546875, "KL/rejected_KL_mean": -1200.888916015625, "KL/std": 587.8419189453125, "epoch": 0.9265785609397944, "fcm_dpo/beta": 0.0009250535513274372, "fcm_dpo/delta": 0.04782557487487793, "fcm_dpo/margin": 382.4600830078125, "fcm_dpo/q_t": 0.41905054450035095, "grad_norm": 36.24885559082031, "learning_rate": 8.518543427732949e-09, "logits/chosen": -1.1538431644439697, "logits/rejected": -1.1630300283432007, "logps/chosen": -877.882568359375, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95156860351562, "logps/rejected": -1281.840576171875, "loss": 1.1544, "margin_dpo/margin_mean": 382.4600830078125, "margin_dpo/margin_std": 670.58447265625, "step": 631 }, { "KL/chosen_KL_mean": -706.385986328125, "KL/mean": -901.2720947265625, "KL/rejected_KL_mean": -1096.158203125, "KL/std": 529.7587280273438, "epoch": 0.9280469897209985, "fcm_dpo/beta": 0.0009276444325223565, "fcm_dpo/delta": 0.039635516703128815, "fcm_dpo/margin": 389.7721252441406, "fcm_dpo/q_t": 0.4175952672958374, "grad_norm": 47.56220245361328, "learning_rate": 8.189576185789637e-09, "logits/chosen": -1.1132001876831055, "logits/rejected": -1.1175191402435303, "logps/chosen": -767.737548828125, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -1182.318359375, "loss": 1.1434, "margin_dpo/margin_mean": 389.7721252441406, "margin_dpo/margin_std": 647.5654296875, "step": 632 }, { "KL/chosen_KL_mean": -791.9481201171875, "KL/mean": -929.552978515625, "KL/rejected_KL_mean": -1067.157958984375, "KL/std": 522.26171875, "epoch": 0.9295154185022027, "fcm_dpo/beta": 0.0009377728565596044, "fcm_dpo/delta": 0.043396495282649994, "fcm_dpo/margin": 275.209716796875, "fcm_dpo/q_t": 0.4401985704898834, "grad_norm": 48.036155700683594, "learning_rate": 7.866980873399015e-09, "logits/chosen": -1.1201856136322021, "logits/rejected": -1.129399061203003, "logps/chosen": -849.226318359375, "logps/ref_chosen": -57.27816390991211, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -1158.741943359375, "loss": 1.2186, "margin_dpo/margin_mean": 275.209716796875, "margin_dpo/margin_std": 591.9662475585938, "step": 633 }, { "KL/chosen_KL_mean": -896.2001342773438, "KL/mean": -1023.7356567382812, "KL/rejected_KL_mean": -1151.271240234375, "KL/std": 637.21044921875, "epoch": 0.9309838472834068, "fcm_dpo/beta": 0.0009550247923471034, "fcm_dpo/delta": 0.06949655711650848, "fcm_dpo/margin": 255.07102966308594, "fcm_dpo/q_t": 0.44587743282318115, "grad_norm": 40.304161071777344, "learning_rate": 7.550765991247654e-09, "logits/chosen": -1.0025546550750732, "logits/rejected": -0.9946834444999695, "logps/chosen": -962.819091796875, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12564849853516, "logps/rejected": -1258.3968505859375, "loss": 1.2358, "margin_dpo/margin_mean": 255.071044921875, "margin_dpo/margin_std": 605.7219848632812, "step": 634 }, { "KL/chosen_KL_mean": -769.7693481445312, "KL/mean": -949.5761108398438, "KL/rejected_KL_mean": -1129.3828125, "KL/std": 656.5932006835938, "epoch": 0.9324522760646109, "fcm_dpo/beta": 0.0009627408580854535, "fcm_dpo/delta": 0.0557277575135231, "fcm_dpo/margin": 359.6134948730469, "fcm_dpo/q_t": 0.42400288581848145, "grad_norm": 48.429481506347656, "learning_rate": 7.240939871891699e-09, "logits/chosen": -1.0978808403015137, "logits/rejected": -1.0793735980987549, "logps/chosen": -843.724853515625, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -1211.88330078125, "loss": 1.1613, "margin_dpo/margin_mean": 359.61346435546875, "margin_dpo/margin_std": 665.019287109375, "step": 635 }, { "KL/chosen_KL_mean": -728.061279296875, "KL/mean": -942.2210693359375, "KL/rejected_KL_mean": -1156.380859375, "KL/std": 644.4195556640625, "epoch": 0.933920704845815, "fcm_dpo/beta": 0.0009705645497888327, "fcm_dpo/delta": -0.017428025603294373, "fcm_dpo/margin": 428.31964111328125, "fcm_dpo/q_t": 0.4088486135005951, "grad_norm": 26.07723617553711, "learning_rate": 6.937510679537628e-09, "logits/chosen": -1.0272531509399414, "logits/rejected": -1.0296359062194824, "logps/chosen": -787.690185546875, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -1238.3597412109375, "loss": 1.1025, "margin_dpo/margin_mean": 428.3197021484375, "margin_dpo/margin_std": 656.77587890625, "step": 636 }, { "KL/chosen_KL_mean": -722.7738647460938, "KL/mean": -958.4360961914062, "KL/rejected_KL_mean": -1194.098388671875, "KL/std": 627.090576171875, "epoch": 0.9353891336270191, "fcm_dpo/beta": 0.0009508421644568443, "fcm_dpo/delta": -0.051047492772340775, "fcm_dpo/margin": 471.324462890625, "fcm_dpo/q_t": 0.400098979473114, "grad_norm": 28.57129669189453, "learning_rate": 6.640486409826785e-09, "logits/chosen": -1.144999384880066, "logits/rejected": -1.1962953805923462, "logps/chosen": -772.426513671875, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -1292.50341796875, "loss": 1.0731, "margin_dpo/margin_mean": 471.32452392578125, "margin_dpo/margin_std": 648.1543579101562, "step": 637 }, { "KL/chosen_KL_mean": -715.32373046875, "KL/mean": -905.3469848632812, "KL/rejected_KL_mean": -1095.3702392578125, "KL/std": 610.3677368164062, "epoch": 0.9368575624082232, "fcm_dpo/beta": 0.0009441774454899132, "fcm_dpo/delta": -0.08913271129131317, "fcm_dpo/margin": 380.04638671875, "fcm_dpo/q_t": 0.41255509853363037, "grad_norm": 32.758209228515625, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.9814478158950806, "logits/rejected": -0.9641016721725464, "logps/chosen": -773.4804077148438, "logps/ref_chosen": -58.156639099121094, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -1174.671630859375, "loss": 1.1707, "margin_dpo/margin_mean": 380.0464172363281, "margin_dpo/margin_std": 710.3338623046875, "step": 638 }, { "KL/chosen_KL_mean": -967.8865966796875, "KL/mean": -1046.7158203125, "KL/rejected_KL_mean": -1125.5450439453125, "KL/std": 574.20703125, "epoch": 0.9383259911894273, "fcm_dpo/beta": 0.0009357619564980268, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 157.65838623046875, "fcm_dpo/q_t": 0.4668254852294922, "grad_norm": 124.77977752685547, "learning_rate": 6.065683776815933e-09, "logits/chosen": -1.0108537673950195, "logits/rejected": -0.9488674402236938, "logps/chosen": -1040.209716796875, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -1199.8199462890625, "loss": 1.3504, "margin_dpo/margin_mean": 157.65838623046875, "margin_dpo/margin_std": 713.7766723632812, "step": 639 }, { "KL/chosen_KL_mean": -752.870361328125, "KL/mean": -1040.066650390625, "KL/rejected_KL_mean": -1327.262939453125, "KL/std": 677.1709594726562, "epoch": 0.9397944199706314, "fcm_dpo/beta": 0.0009211286087520421, "fcm_dpo/delta": -0.13610000908374786, "fcm_dpo/margin": 574.392578125, "fcm_dpo/q_t": 0.3863770365715027, "grad_norm": 41.8037223815918, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.9777020215988159, "logits/rejected": -1.0074682235717773, "logps/chosen": -809.0047607421875, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -1435.863037109375, "loss": 1.0416, "margin_dpo/margin_mean": 574.392578125, "margin_dpo/margin_std": 796.70458984375, "step": 640 }, { "KL/chosen_KL_mean": -851.2357177734375, "KL/mean": -1040.4761962890625, "KL/rejected_KL_mean": -1229.7166748046875, "KL/std": 575.572509765625, "epoch": 0.9412628487518355, "fcm_dpo/beta": 0.0009160140762105584, "fcm_dpo/delta": 0.0552375465631485, "fcm_dpo/margin": 378.48095703125, "fcm_dpo/q_t": 0.4241793751716614, "grad_norm": 35.59013366699219, "learning_rate": 5.516592558795746e-09, "logits/chosen": -1.040936827659607, "logits/rejected": -1.0504437685012817, "logps/chosen": -916.2326049804688, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -1316.708984375, "loss": 1.1765, "margin_dpo/margin_mean": 378.48095703125, "margin_dpo/margin_std": 753.5254516601562, "step": 641 }, { "KL/chosen_KL_mean": -817.596435546875, "KL/mean": -1044.927001953125, "KL/rejected_KL_mean": -1272.257568359375, "KL/std": 752.8553466796875, "epoch": 0.9427312775330396, "fcm_dpo/beta": 0.0009222212247550488, "fcm_dpo/delta": -0.02048617973923683, "fcm_dpo/margin": 454.66107177734375, "fcm_dpo/q_t": 0.4156268537044525, "grad_norm": 43.08086013793945, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.9874919652938843, "logits/rejected": -1.022093653678894, "logps/chosen": -883.28564453125, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -1382.4996337890625, "loss": 1.1533, "margin_dpo/margin_mean": 454.6611328125, "margin_dpo/margin_std": 916.9835205078125, "step": 642 }, { "KL/chosen_KL_mean": -737.8060302734375, "KL/mean": -912.438720703125, "KL/rejected_KL_mean": -1087.071533203125, "KL/std": 547.3657836914062, "epoch": 0.9441997063142438, "fcm_dpo/beta": 0.0009113398264162242, "fcm_dpo/delta": -0.030877836048603058, "fcm_dpo/margin": 349.2655334472656, "fcm_dpo/q_t": 0.42619654536247253, "grad_norm": 44.11703872680664, "learning_rate": 4.993270631642038e-09, "logits/chosen": -1.1141959428787231, "logits/rejected": -1.1129988431930542, "logps/chosen": -789.7559814453125, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -1174.539794921875, "loss": 1.1568, "margin_dpo/margin_mean": 349.2655029296875, "margin_dpo/margin_std": 548.5968017578125, "step": 643 }, { "KL/chosen_KL_mean": -727.4107666015625, "KL/mean": -910.2191162109375, "KL/rejected_KL_mean": -1093.02734375, "KL/std": 654.0157470703125, "epoch": 0.9456681350954479, "fcm_dpo/beta": 0.0009200773201882839, "fcm_dpo/delta": 0.06583556532859802, "fcm_dpo/margin": 365.61663818359375, "fcm_dpo/q_t": 0.42611658573150635, "grad_norm": 56.25822067260742, "learning_rate": 4.741290495811873e-09, "logits/chosen": -1.02555513381958, "logits/rejected": -1.0345053672790527, "logps/chosen": -786.428466796875, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -1180.1640625, "loss": 1.1889, "margin_dpo/margin_mean": 365.61663818359375, "margin_dpo/margin_std": 751.4439697265625, "step": 644 }, { "KL/chosen_KL_mean": -752.749267578125, "KL/mean": -840.9324951171875, "KL/rejected_KL_mean": -929.1156005859375, "KL/std": 507.17401123046875, "epoch": 0.947136563876652, "fcm_dpo/beta": 0.0009371960768476129, "fcm_dpo/delta": 0.0740480124950409, "fcm_dpo/margin": 176.36630249023438, "fcm_dpo/q_t": 0.4641192555427551, "grad_norm": 108.85242462158203, "learning_rate": 4.495773155069299e-09, "logits/chosen": -0.961013913154602, "logits/rejected": -0.9447523355484009, "logps/chosen": -808.6253051757812, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -1026.896484375, "loss": 1.3301, "margin_dpo/margin_mean": 176.3662872314453, "margin_dpo/margin_std": 675.3572998046875, "step": 645 }, { "KL/chosen_KL_mean": -723.980712890625, "KL/mean": -878.9730224609375, "KL/rejected_KL_mean": -1033.9652099609375, "KL/std": 492.32672119140625, "epoch": 0.9486049926578561, "fcm_dpo/beta": 0.0009495633421465755, "fcm_dpo/delta": 0.10899513214826584, "fcm_dpo/margin": 309.98443603515625, "fcm_dpo/q_t": 0.4337136745452881, "grad_norm": 58.64928436279297, "learning_rate": 4.256725079024553e-09, "logits/chosen": -1.0737169981002808, "logits/rejected": -1.0561877489089966, "logps/chosen": -785.2565307617188, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -1111.470947265625, "loss": 1.1901, "margin_dpo/margin_mean": 309.9844665527344, "margin_dpo/margin_std": 590.6322021484375, "step": 646 }, { "KL/chosen_KL_mean": -644.8477172851562, "KL/mean": -838.4265747070312, "KL/rejected_KL_mean": -1032.00537109375, "KL/std": 562.4344482421875, "epoch": 0.9500734214390602, "fcm_dpo/beta": 0.0009617937030270696, "fcm_dpo/delta": 0.028707262128591537, "fcm_dpo/margin": 387.1577453613281, "fcm_dpo/q_t": 0.41403427720069885, "grad_norm": 30.214832305908203, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.966199517250061, "logits/rejected": -0.9970808029174805, "logps/chosen": -699.7001342773438, "logps/ref_chosen": -54.8524169921875, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -1125.52490234375, "loss": 1.1156, "margin_dpo/margin_mean": 387.1577453613281, "margin_dpo/margin_std": 551.2623291015625, "step": 647 }, { "KL/chosen_KL_mean": -684.664306640625, "KL/mean": -976.056884765625, "KL/rejected_KL_mean": -1267.4493408203125, "KL/std": 685.91015625, "epoch": 0.9515418502202643, "fcm_dpo/beta": 0.0009418315021321177, "fcm_dpo/delta": -0.15742585062980652, "fcm_dpo/margin": 582.7850952148438, "fcm_dpo/q_t": 0.38359707593917847, "grad_norm": 31.0360164642334, "learning_rate": 3.798061746947995e-09, "logits/chosen": -1.1210038661956787, "logits/rejected": -1.1833868026733398, "logps/chosen": -738.8358154296875, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.7127914428711, "logps/rejected": -1366.162109375, "loss": 1.0315, "margin_dpo/margin_mean": 582.7850952148438, "margin_dpo/margin_std": 816.8055419921875, "step": 648 }, { "KL/chosen_KL_mean": -711.638916015625, "KL/mean": -853.6250610351562, "KL/rejected_KL_mean": -995.6112060546875, "KL/std": 527.212890625, "epoch": 0.9530102790014684, "fcm_dpo/beta": 0.0009543564519844949, "fcm_dpo/delta": 0.13216045498847961, "fcm_dpo/margin": 283.9723205566406, "fcm_dpo/q_t": 0.4425292909145355, "grad_norm": 28.521751403808594, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -1.1217185258865356, "logits/rejected": -1.1153336763381958, "logps/chosen": -774.1192626953125, "logps/ref_chosen": -62.480350494384766, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -1075.6884765625, "loss": 1.2295, "margin_dpo/margin_mean": 283.9723205566406, "margin_dpo/margin_std": 654.6543579101562, "step": 649 }, { "KL/chosen_KL_mean": -754.6192016601562, "KL/mean": -973.61767578125, "KL/rejected_KL_mean": -1192.6162109375, "KL/std": 647.8673095703125, "epoch": 0.9544787077826725, "fcm_dpo/beta": 0.000953975017182529, "fcm_dpo/delta": -0.018730733543634415, "fcm_dpo/margin": 437.99713134765625, "fcm_dpo/q_t": 0.4086850881576538, "grad_norm": 33.09385681152344, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -1.028601884841919, "logits/rejected": -1.0516587495803833, "logps/chosen": -810.7120361328125, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -1290.881103515625, "loss": 1.1243, "margin_dpo/margin_mean": 437.99713134765625, "margin_dpo/margin_std": 743.27490234375, "step": 650 }, { "KL/chosen_KL_mean": -512.770263671875, "KL/mean": -790.38525390625, "KL/rejected_KL_mean": -1068.000244140625, "KL/std": 609.9718017578125, "epoch": 0.9559471365638766, "fcm_dpo/beta": 0.0009362648124806583, "fcm_dpo/delta": -0.12655048072338104, "fcm_dpo/margin": 555.2301025390625, "fcm_dpo/q_t": 0.38282567262649536, "grad_norm": 40.27021408081055, "learning_rate": 3.158738163478475e-09, "logits/chosen": -1.0640699863433838, "logits/rejected": -1.1213992834091187, "logps/chosen": -556.1956787109375, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.95791625976562, "logps/rejected": -1167.958251953125, "loss": 1.0095, "margin_dpo/margin_mean": 555.2301025390625, "margin_dpo/margin_std": 632.1912841796875, "step": 651 }, { "KL/chosen_KL_mean": -651.0936279296875, "KL/mean": -865.922119140625, "KL/rejected_KL_mean": -1080.7506103515625, "KL/std": 616.6910400390625, "epoch": 0.9574155653450808, "fcm_dpo/beta": 0.000932047376409173, "fcm_dpo/delta": -0.0005056131631135941, "fcm_dpo/margin": 429.6570129394531, "fcm_dpo/q_t": 0.411517471075058, "grad_norm": 32.868282318115234, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -1.0609800815582275, "logits/rejected": -1.087823748588562, "logps/chosen": -713.67041015625, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -1192.5184326171875, "loss": 1.1248, "margin_dpo/margin_mean": 429.6570129394531, "margin_dpo/margin_std": 717.76123046875, "step": 652 }, { "KL/chosen_KL_mean": -803.664794921875, "KL/mean": -996.8284912109375, "KL/rejected_KL_mean": -1189.9921875, "KL/std": 658.72021484375, "epoch": 0.9588839941262849, "fcm_dpo/beta": 0.0009377988171763718, "fcm_dpo/delta": 0.03901583328843117, "fcm_dpo/margin": 386.3274841308594, "fcm_dpo/q_t": 0.4204859137535095, "grad_norm": 35.157344818115234, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -1.138892412185669, "logits/rejected": -1.1634893417358398, "logps/chosen": -864.7777709960938, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -1293.241943359375, "loss": 1.153, "margin_dpo/margin_mean": 386.3274841308594, "margin_dpo/margin_std": 692.9034423828125, "step": 653 }, { "KL/chosen_KL_mean": -724.307373046875, "KL/mean": -912.4171142578125, "KL/rejected_KL_mean": -1100.52685546875, "KL/std": 534.213134765625, "epoch": 0.960352422907489, "fcm_dpo/beta": 0.0009442999726161361, "fcm_dpo/delta": 0.04640195518732071, "fcm_dpo/margin": 376.21954345703125, "fcm_dpo/q_t": 0.4215119779109955, "grad_norm": 39.73265075683594, "learning_rate": 2.577954022936174e-09, "logits/chosen": -1.0920642614364624, "logits/rejected": -1.1086204051971436, "logps/chosen": -786.0355224609375, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -1199.30078125, "loss": 1.1429, "margin_dpo/margin_mean": 376.21954345703125, "margin_dpo/margin_std": 627.8160400390625, "step": 654 }, { "KL/chosen_KL_mean": -665.0706787109375, "KL/mean": -861.1571044921875, "KL/rejected_KL_mean": -1057.24365234375, "KL/std": 536.1744384765625, "epoch": 0.9618208516886931, "fcm_dpo/beta": 0.0009508723160251975, "fcm_dpo/delta": 0.028146151453256607, "fcm_dpo/margin": 392.17303466796875, "fcm_dpo/q_t": 0.416969895362854, "grad_norm": 28.9021053314209, "learning_rate": 2.397392281198729e-09, "logits/chosen": -1.0546410083770752, "logits/rejected": -1.0955651998519897, "logps/chosen": -714.6474609375, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -1155.535400390625, "loss": 1.1326, "margin_dpo/margin_mean": 392.17303466796875, "margin_dpo/margin_std": 639.540771484375, "step": 655 }, { "KL/chosen_KL_mean": -728.656005859375, "KL/mean": -1060.576904296875, "KL/rejected_KL_mean": -1392.497802734375, "KL/std": 723.1806030273438, "epoch": 0.9632892804698973, "fcm_dpo/beta": 0.0009240615181624889, "fcm_dpo/delta": -0.2268456667661667, "fcm_dpo/margin": 663.841796875, "fcm_dpo/q_t": 0.3659891188144684, "grad_norm": 87.20188903808594, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.9737097024917603, "logits/rejected": -1.0463311672210693, "logps/chosen": -781.2054443359375, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -1506.1724853515625, "loss": 0.9623, "margin_dpo/margin_mean": 663.841796875, "margin_dpo/margin_std": 737.1829833984375, "step": 656 }, { "KL/chosen_KL_mean": -679.6723022460938, "KL/mean": -946.5911865234375, "KL/rejected_KL_mean": -1213.510009765625, "KL/std": 687.6986083984375, "epoch": 0.9647577092511013, "fcm_dpo/beta": 0.0008973278454504907, "fcm_dpo/delta": -0.08321470022201538, "fcm_dpo/margin": 533.8377075195312, "fcm_dpo/q_t": 0.39242735505104065, "grad_norm": 39.09135437011719, "learning_rate": 2.055847060721566e-09, "logits/chosen": -1.1246776580810547, "logits/rejected": -1.1695971488952637, "logps/chosen": -726.372802734375, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -1311.4248046875, "loss": 1.0554, "margin_dpo/margin_mean": 533.8377075195312, "margin_dpo/margin_std": 719.7000122070312, "step": 657 }, { "KL/chosen_KL_mean": -739.4154663085938, "KL/mean": -951.917724609375, "KL/rejected_KL_mean": -1164.420166015625, "KL/std": 541.0897216796875, "epoch": 0.9662261380323054, "fcm_dpo/beta": 0.0008927997550927103, "fcm_dpo/delta": 0.020825423300266266, "fcm_dpo/margin": 425.004638671875, "fcm_dpo/q_t": 0.4129961133003235, "grad_norm": 36.563560485839844, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -1.074481725692749, "logits/rejected": -1.1052826642990112, "logps/chosen": -800.3736572265625, "logps/ref_chosen": -60.95820999145508, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -1260.359619140625, "loss": 1.1161, "margin_dpo/margin_mean": 425.004638671875, "margin_dpo/margin_std": 614.8718872070312, "step": 658 }, { "KL/chosen_KL_mean": -648.5974731445312, "KL/mean": -853.8447265625, "KL/rejected_KL_mean": -1059.092041015625, "KL/std": 544.5018310546875, "epoch": 0.9676945668135095, "fcm_dpo/beta": 0.0009003398008644581, "fcm_dpo/delta": 0.03149181231856346, "fcm_dpo/margin": 410.4945068359375, "fcm_dpo/q_t": 0.41639554500579834, "grad_norm": 42.586883544921875, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.9929611086845398, "logits/rejected": -0.9660124778747559, "logps/chosen": -725.3404541015625, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -1146.56298828125, "loss": 1.1178, "margin_dpo/margin_mean": 410.4945068359375, "margin_dpo/margin_std": 598.6842041015625, "step": 659 }, { "KL/chosen_KL_mean": -703.4244384765625, "KL/mean": -971.2889404296875, "KL/rejected_KL_mean": -1239.1533203125, "KL/std": 633.0969848632812, "epoch": 0.9691629955947136, "fcm_dpo/beta": 0.0008946568705141544, "fcm_dpo/delta": -0.0832086056470871, "fcm_dpo/margin": 535.7288818359375, "fcm_dpo/q_t": 0.3920001983642578, "grad_norm": 36.593955993652344, "learning_rate": 1.592541096695571e-09, "logits/chosen": -1.0897400379180908, "logits/rejected": -1.1135540008544922, "logps/chosen": -762.4723510742188, "logps/ref_chosen": -59.04788589477539, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -1315.11328125, "loss": 1.0481, "margin_dpo/margin_mean": 535.7288818359375, "margin_dpo/margin_std": 685.7335205078125, "step": 660 }, { "KL/chosen_KL_mean": -620.7945556640625, "KL/mean": -853.5462646484375, "KL/rejected_KL_mean": -1086.2978515625, "KL/std": 685.778564453125, "epoch": 0.9706314243759178, "fcm_dpo/beta": 0.0008907719748094678, "fcm_dpo/delta": -0.015506003051996231, "fcm_dpo/margin": 465.50323486328125, "fcm_dpo/q_t": 0.4084845185279846, "grad_norm": 50.08948516845703, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -1.1007812023162842, "logits/rejected": -1.116791009902954, "logps/chosen": -671.4685668945312, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -1172.303466796875, "loss": 1.089, "margin_dpo/margin_mean": 465.5032958984375, "margin_dpo/margin_std": 672.30322265625, "step": 661 }, { "KL/chosen_KL_mean": -724.219970703125, "KL/mean": -912.091552734375, "KL/rejected_KL_mean": -1099.963134765625, "KL/std": 585.681884765625, "epoch": 0.9720998531571219, "fcm_dpo/beta": 0.0008955647936090827, "fcm_dpo/delta": 0.06570842862129211, "fcm_dpo/margin": 375.7431640625, "fcm_dpo/q_t": 0.4247671663761139, "grad_norm": 29.727699279785156, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -1.0112884044647217, "logits/rejected": -1.012375831604004, "logps/chosen": -793.4810791015625, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -1189.01904296875, "loss": 1.1725, "margin_dpo/margin_mean": 375.74310302734375, "margin_dpo/margin_std": 715.9241333007812, "step": 662 }, { "KL/chosen_KL_mean": -677.1693115234375, "KL/mean": -901.0325927734375, "KL/rejected_KL_mean": -1124.8958740234375, "KL/std": 655.4639892578125, "epoch": 0.973568281938326, "fcm_dpo/beta": 0.0008970214985311031, "fcm_dpo/delta": -0.0017335359007120132, "fcm_dpo/margin": 447.7265930175781, "fcm_dpo/q_t": 0.4119381904602051, "grad_norm": 25.254993438720703, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -1.0759081840515137, "logits/rejected": -1.1006672382354736, "logps/chosen": -742.0482177734375, "logps/ref_chosen": -64.87890625, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -1238.8212890625, "loss": 1.125, "margin_dpo/margin_mean": 447.7265930175781, "margin_dpo/margin_std": 758.81591796875, "step": 663 }, { "KL/chosen_KL_mean": -732.6714477539062, "KL/mean": -988.3836669921875, "KL/rejected_KL_mean": -1244.095947265625, "KL/std": 662.098876953125, "epoch": 0.9750367107195301, "fcm_dpo/beta": 0.0008872643811628222, "fcm_dpo/delta": -0.05684386566281319, "fcm_dpo/margin": 511.424560546875, "fcm_dpo/q_t": 0.39906027913093567, "grad_norm": 27.473468780517578, "learning_rate": 1.066455926241383e-09, "logits/chosen": -1.0546207427978516, "logits/rejected": -1.0900723934173584, "logps/chosen": -793.5599365234375, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -1349.61767578125, "loss": 1.0717, "margin_dpo/margin_mean": 511.424560546875, "margin_dpo/margin_std": 705.2711181640625, "step": 664 }, { "KL/chosen_KL_mean": -635.2431640625, "KL/mean": -848.298583984375, "KL/rejected_KL_mean": -1061.35400390625, "KL/std": 531.826416015625, "epoch": 0.9765051395007343, "fcm_dpo/beta": 0.0008908901363611221, "fcm_dpo/delta": 0.021186400204896927, "fcm_dpo/margin": 426.11090087890625, "fcm_dpo/q_t": 0.4121313691139221, "grad_norm": 48.51000213623047, "learning_rate": 9.513254770636137e-10, "logits/chosen": -1.1572396755218506, "logits/rejected": -1.1837971210479736, "logps/chosen": -695.8072509765625, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.80882263183594, "logps/rejected": -1146.162841796875, "loss": 1.0953, "margin_dpo/margin_mean": 426.11090087890625, "margin_dpo/margin_std": 529.1512451171875, "step": 665 }, { "KL/chosen_KL_mean": -686.9071044921875, "KL/mean": -899.4036865234375, "KL/rejected_KL_mean": -1111.900146484375, "KL/std": 544.5043334960938, "epoch": 0.9779735682819384, "fcm_dpo/beta": 0.0008939065737649798, "fcm_dpo/delta": 0.020892852917313576, "fcm_dpo/margin": 424.9931335449219, "fcm_dpo/q_t": 0.4139803946018219, "grad_norm": 34.465694427490234, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.9737902283668518, "logits/rejected": -0.9849244356155396, "logps/chosen": -751.3270263671875, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.8916244506836, "logps/rejected": -1207.791748046875, "loss": 1.1128, "margin_dpo/margin_mean": 424.9931640625, "margin_dpo/margin_std": 604.1651611328125, "step": 666 }, { "KL/chosen_KL_mean": -779.2694091796875, "KL/mean": -1032.952392578125, "KL/rejected_KL_mean": -1286.635498046875, "KL/std": 609.7968139648438, "epoch": 0.9794419970631424, "fcm_dpo/beta": 0.0008860268862918019, "fcm_dpo/delta": -0.05220697447657585, "fcm_dpo/margin": 507.36602783203125, "fcm_dpo/q_t": 0.3982255458831787, "grad_norm": 44.31229782104492, "learning_rate": 7.407554321417764e-10, "logits/chosen": -1.008927822113037, "logits/rejected": -1.0107920169830322, "logps/chosen": -848.5464477539062, "logps/ref_chosen": -69.27702331542969, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -1374.470947265625, "loss": 1.067, "margin_dpo/margin_mean": 507.36602783203125, "margin_dpo/margin_std": 670.2469482421875, "step": 667 }, { "KL/chosen_KL_mean": -829.6917724609375, "KL/mean": -1003.8605346679688, "KL/rejected_KL_mean": -1178.029296875, "KL/std": 655.914794921875, "epoch": 0.9809104258443465, "fcm_dpo/beta": 0.0009021821897476912, "fcm_dpo/delta": 0.08765879273414612, "fcm_dpo/margin": 348.3375244140625, "fcm_dpo/q_t": 0.43241050839424133, "grad_norm": 51.814239501953125, "learning_rate": 6.453213851142225e-10, "logits/chosen": -1.082472324371338, "logits/rejected": -1.0882298946380615, "logps/chosen": -902.2957763671875, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905944824219, "logps/rejected": -1281.768310546875, "loss": 1.2118, "margin_dpo/margin_mean": 348.3375244140625, "margin_dpo/margin_std": 782.4864501953125, "step": 668 }, { "KL/chosen_KL_mean": -632.581787109375, "KL/mean": -886.140625, "KL/rejected_KL_mean": -1139.6993408203125, "KL/std": 600.3204345703125, "epoch": 0.9823788546255506, "fcm_dpo/beta": 0.0008957190439105034, "fcm_dpo/delta": -0.05677647516131401, "fcm_dpo/margin": 507.1175537109375, "fcm_dpo/q_t": 0.3966999053955078, "grad_norm": 27.12503433227539, "learning_rate": 5.564580657695939e-10, "logits/chosen": -1.0696676969528198, "logits/rejected": -1.0817254781723022, "logps/chosen": -678.6981811523438, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -1217.623779296875, "loss": 1.0658, "margin_dpo/margin_mean": 507.1175537109375, "margin_dpo/margin_std": 671.839599609375, "step": 669 }, { "KL/chosen_KL_mean": -601.8228759765625, "KL/mean": -861.4261474609375, "KL/rejected_KL_mean": -1121.0294189453125, "KL/std": 569.886962890625, "epoch": 0.9838472834067548, "fcm_dpo/beta": 0.0008871153695508838, "fcm_dpo/delta": -0.06361524760723114, "fcm_dpo/margin": 519.2064819335938, "fcm_dpo/q_t": 0.3957204818725586, "grad_norm": 23.987220764160156, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.9869112968444824, "logits/rejected": -1.0059700012207031, "logps/chosen": -664.168701171875, "logps/ref_chosen": -62.34575271606445, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -1217.969970703125, "loss": 1.066, "margin_dpo/margin_mean": 519.2064819335938, "margin_dpo/margin_std": 689.0188598632812, "step": 670 }, { "KL/chosen_KL_mean": -770.5162353515625, "KL/mean": -969.230224609375, "KL/rejected_KL_mean": -1167.944091796875, "KL/std": 568.30517578125, "epoch": 0.9853157121879589, "fcm_dpo/beta": 0.0008903343696147203, "fcm_dpo/delta": 0.047325365245342255, "fcm_dpo/margin": 397.4279479980469, "fcm_dpo/q_t": 0.41793012619018555, "grad_norm": 37.049495697021484, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -1.0835880041122437, "logits/rejected": -1.1127347946166992, "logps/chosen": -818.516357421875, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -1251.763427734375, "loss": 1.1536, "margin_dpo/margin_mean": 397.427978515625, "margin_dpo/margin_std": 692.941650390625, "step": 671 }, { "KL/chosen_KL_mean": -854.4781494140625, "KL/mean": -1061.944091796875, "KL/rejected_KL_mean": -1269.41015625, "KL/std": 686.4140625, "epoch": 0.986784140969163, "fcm_dpo/beta": 0.0008913551573641598, "fcm_dpo/delta": 0.03129229322075844, "fcm_dpo/margin": 414.9320983886719, "fcm_dpo/q_t": 0.41862136125564575, "grad_norm": 60.62648010253906, "learning_rate": 3.293150240547549e-10, "logits/chosen": -1.1662323474884033, "logits/rejected": -1.1740036010742188, "logps/chosen": -913.0614013671875, "logps/ref_chosen": -58.58328628540039, "logps/ref_rejected": -93.14015197753906, "logps/rejected": -1362.55029296875, "loss": 1.1557, "margin_dpo/margin_mean": 414.9320983886719, "margin_dpo/margin_std": 755.1729125976562, "step": 672 }, { "KL/chosen_KL_mean": -753.8118896484375, "KL/mean": -946.7315063476562, "KL/rejected_KL_mean": -1139.651123046875, "KL/std": 579.491455078125, "epoch": 0.9882525697503671, "fcm_dpo/beta": 0.0009015346877276897, "fcm_dpo/delta": 0.054041508585214615, "fcm_dpo/margin": 385.8390808105469, "fcm_dpo/q_t": 0.42215287685394287, "grad_norm": 42.15021514892578, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -1.1026825904846191, "logits/rejected": -1.1077499389648438, "logps/chosen": -800.53515625, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -1224.947265625, "loss": 1.1464, "margin_dpo/margin_mean": 385.8390808105469, "margin_dpo/margin_std": 660.7847900390625, "step": 673 }, { "KL/chosen_KL_mean": -630.5238037109375, "KL/mean": -885.9769287109375, "KL/rejected_KL_mean": -1141.429931640625, "KL/std": 571.075439453125, "epoch": 0.9897209985315712, "fcm_dpo/beta": 0.0008984719170257449, "fcm_dpo/delta": -0.061865366995334625, "fcm_dpo/margin": 510.90618896484375, "fcm_dpo/q_t": 0.3981897830963135, "grad_norm": 36.43729782104492, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.9947335720062256, "logits/rejected": -1.015453577041626, "logps/chosen": -675.9693603515625, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -1211.475830078125, "loss": 1.0617, "margin_dpo/margin_mean": 510.90618896484375, "margin_dpo/margin_std": 678.5731811523438, "step": 674 }, { "KL/chosen_KL_mean": -708.9229736328125, "KL/mean": -968.3958740234375, "KL/rejected_KL_mean": -1227.868896484375, "KL/std": 633.1185302734375, "epoch": 0.9911894273127754, "fcm_dpo/beta": 0.0008785349782556295, "fcm_dpo/delta": -0.060169536620378494, "fcm_dpo/margin": 518.9458618164062, "fcm_dpo/q_t": 0.39929330348968506, "grad_norm": 25.439342498779297, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -1.0798540115356445, "logits/rejected": -1.094804286956787, "logps/chosen": -753.0992431640625, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -1301.9608154296875, "loss": 1.0676, "margin_dpo/margin_mean": 518.9458618164062, "margin_dpo/margin_std": 681.2320556640625, "step": 675 }, { "KL/chosen_KL_mean": -749.7974853515625, "KL/mean": -996.75341796875, "KL/rejected_KL_mean": -1243.709228515625, "KL/std": 598.0169067382812, "epoch": 0.9926578560939795, "fcm_dpo/beta": 0.0008790518622845411, "fcm_dpo/delta": -0.03571845218539238, "fcm_dpo/margin": 493.9117736816406, "fcm_dpo/q_t": 0.4009990990161896, "grad_norm": 26.6938419342041, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -1.0037989616394043, "logits/rejected": -1.0201971530914307, "logps/chosen": -821.196044921875, "logps/ref_chosen": -71.39852905273438, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -1332.0679931640625, "loss": 1.065, "margin_dpo/margin_mean": 493.9117736816406, "margin_dpo/margin_std": 605.2867431640625, "step": 676 }, { "KL/chosen_KL_mean": -764.7623291015625, "KL/mean": -998.9996337890625, "KL/rejected_KL_mean": -1233.237060546875, "KL/std": 625.9779052734375, "epoch": 0.9941262848751835, "fcm_dpo/beta": 0.0008738588076084852, "fcm_dpo/delta": -0.009816518053412437, "fcm_dpo/margin": 468.474609375, "fcm_dpo/q_t": 0.4123176634311676, "grad_norm": 31.511207580566406, "learning_rate": 8.23423165278725e-11, "logits/chosen": -1.093052864074707, "logits/rejected": -1.0892189741134644, "logps/chosen": -821.289794921875, "logps/ref_chosen": -56.527435302734375, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -1311.4635009765625, "loss": 1.1107, "margin_dpo/margin_mean": 468.4746398925781, "margin_dpo/margin_std": 753.8814697265625, "step": 677 }, { "KL/chosen_KL_mean": -644.7113037109375, "KL/mean": -927.87255859375, "KL/rejected_KL_mean": -1211.03369140625, "KL/std": 692.720458984375, "epoch": 0.9955947136563876, "fcm_dpo/beta": 0.0008661206811666489, "fcm_dpo/delta": -0.09516976773738861, "fcm_dpo/margin": 566.3224487304688, "fcm_dpo/q_t": 0.39175188541412354, "grad_norm": 28.7852840423584, "learning_rate": 5.270012410216185e-11, "logits/chosen": -1.0311825275421143, "logits/rejected": -1.070950984954834, "logps/chosen": -690.8457641601562, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -1291.638427734375, "loss": 1.0583, "margin_dpo/margin_mean": 566.3224487304688, "margin_dpo/margin_std": 791.4454345703125, "step": 678 }, { "KL/chosen_KL_mean": -734.8692626953125, "KL/mean": -927.3165283203125, "KL/rejected_KL_mean": -1119.763916015625, "KL/std": 549.86669921875, "epoch": 0.9970631424375918, "fcm_dpo/beta": 0.0008657841826789081, "fcm_dpo/delta": 0.06911883503198624, "fcm_dpo/margin": 384.8945617675781, "fcm_dpo/q_t": 0.4249575436115265, "grad_norm": 39.39456558227539, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -1.0478802919387817, "logits/rejected": -1.0363208055496216, "logps/chosen": -785.1641845703125, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -1196.3619384765625, "loss": 1.1507, "margin_dpo/margin_mean": 384.8945617675781, "margin_dpo/margin_std": 635.107666015625, "step": 679 }, { "KL/chosen_KL_mean": -748.73779296875, "KL/mean": -1005.2867431640625, "KL/rejected_KL_mean": -1261.835693359375, "KL/std": 711.034912109375, "epoch": 0.9985315712187959, "fcm_dpo/beta": 0.0008600302971899509, "fcm_dpo/delta": -0.04386995732784271, "fcm_dpo/margin": 513.0979614257812, "fcm_dpo/q_t": 0.39925122261047363, "grad_norm": 38.68387985229492, "learning_rate": 1.31753782067201e-11, "logits/chosen": -1.037444829940796, "logits/rejected": -1.0661684274673462, "logps/chosen": -825.6535034179688, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -1374.220458984375, "loss": 1.0929, "margin_dpo/margin_mean": 513.0979614257812, "margin_dpo/margin_std": 767.623046875, "step": 680 }, { "KL/chosen_KL_mean": -737.5836791992188, "KL/mean": -939.5941772460938, "KL/rejected_KL_mean": -1141.6046142578125, "KL/std": 583.9456176757812, "epoch": 1.0, "fcm_dpo/beta": 0.0008584200404584408, "fcm_dpo/delta": -0.04448072612285614, "fcm_dpo/margin": 404.02093505859375, "fcm_dpo/q_t": 0.4208639860153198, "grad_norm": 35.66378402709961, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -1.1038322448730469, "logits/rejected": -1.1210821866989136, "logps/chosen": -798.5409545898438, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.55797576904297, "logps/rejected": -1230.16259765625, "loss": 1.1462, "margin_dpo/margin_mean": 404.0209655761719, "margin_dpo/margin_std": 639.079833984375, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 1.093637848565582, "train_runtime": 1736.9515, "train_samples_per_second": 25.1, "train_steps_per_second": 0.392 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }