{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "KL/chosen_KL_mean": 0.00527191162109375, "KL/mean": 0.016706019639968872, "KL/rejected_KL_mean": 0.028141021728515625, "KL/std": 0.272699236869812, "epoch": 0.0014684287812041115, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.02287006378173828, "fcm_dpo/q_t": 0.5002857446670532, "grad_norm": 41.73493576049805, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.3875, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "KL/chosen_KL_mean": -0.03498649597167969, "KL/mean": -0.00212840735912323, "KL/rejected_KL_mean": 0.030735015869140625, "KL/std": 0.24797174334526062, "epoch": 0.002936857562408223, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.06572261452674866, "fcm_dpo/q_t": 0.5008214712142944, "grad_norm": 36.00978469848633, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.49536412954330444, "logits/rejected": -0.4594460427761078, "logps/chosen": -52.65568923950195, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.3897, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "KL/chosen_KL_mean": -0.0045108795166015625, "KL/mean": 0.003316923975944519, "KL/rejected_KL_mean": 0.01114654541015625, "KL/std": 0.2563997805118561, "epoch": 0.004405286343612335, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.015658468008041382, "fcm_dpo/q_t": 0.5001957416534424, "grad_norm": 35.48371505737305, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.48171138763427734, "logits/rejected": -0.4422028362751007, "logps/chosen": -60.986106872558594, "logps/ref_chosen": -60.981597900390625, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.66145324707031, "loss": 1.3872, "margin_dpo/margin_mean": -0.015658140182495117, "margin_dpo/margin_std": 0.39206600189208984, "step": 3 }, { "KL/chosen_KL_mean": -0.0025787353515625, "KL/mean": 0.015432953834533691, "KL/rejected_KL_mean": 0.03343963623046875, "KL/std": 0.23463661968708038, "epoch": 0.005873715124816446, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.03601771593093872, "fcm_dpo/q_t": 0.5004501342773438, "grad_norm": 35.9489860534668, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.4681958258152008, "logits/rejected": -0.44056397676467896, "logps/chosen": -56.77029037475586, "logps/ref_chosen": -56.7677116394043, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.6136703491211, "loss": 1.3882, "margin_dpo/margin_mean": -0.036018311977386475, "margin_dpo/margin_std": 0.3561931252479553, "step": 4 }, { "KL/chosen_KL_mean": 0.04430961608886719, "KL/mean": 0.030420929193496704, "KL/rejected_KL_mean": 0.01653289794921875, "KL/std": 0.26933568716049194, "epoch": 0.007342143906020558, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.027776658535003662, "fcm_dpo/q_t": 0.49965283274650574, "grad_norm": 44.74127960205078, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.5143798589706421, "logits/rejected": -0.47071516513824463, "logps/chosen": -53.8150634765625, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.13265228271484, "loss": 1.385, "margin_dpo/margin_mean": 0.027777403593063354, "margin_dpo/margin_std": 0.3397705554962158, "step": 5 }, { "KL/chosen_KL_mean": -0.016576766967773438, "KL/mean": -0.036144837737083435, "KL/rejected_KL_mean": -0.0557098388671875, "KL/std": 0.2481634020805359, "epoch": 0.00881057268722467, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.03913220763206482, "fcm_dpo/q_t": 0.49951091408729553, "grad_norm": 45.87062072753906, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.5242589712142944, "logits/rejected": -0.4836902618408203, "logps/chosen": -63.02406311035156, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.70105743408203, "loss": 1.3844, "margin_dpo/margin_mean": 0.03913196921348572, "margin_dpo/margin_std": 0.38666093349456787, "step": 6 }, { "KL/chosen_KL_mean": 0.025547027587890625, "KL/mean": 0.029840022325515747, "KL/rejected_KL_mean": 0.0341339111328125, "KL/std": 0.2671242356300354, "epoch": 0.010279001468428781, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.008586883544921875, "fcm_dpo/q_t": 0.5001072883605957, "grad_norm": 41.22108840942383, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5003604292869568, "logits/rejected": -0.4664100110530853, "logps/chosen": -57.74927520751953, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.88645935058594, "loss": 1.3868, "margin_dpo/margin_mean": -0.0085868239402771, "margin_dpo/margin_std": 0.36728373169898987, "step": 7 }, { "KL/chosen_KL_mean": -0.002834320068359375, "KL/mean": 0.04533374309539795, "KL/rejected_KL_mean": 0.093505859375, "KL/std": 0.28405576944351196, "epoch": 0.011747430249632892, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.09634339809417725, "fcm_dpo/q_t": 0.5012041926383972, "grad_norm": 39.53245544433594, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5020660161972046, "logits/rejected": -0.4754522442817688, "logps/chosen": -58.7188720703125, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.2179183959961, "loss": 1.3912, "margin_dpo/margin_mean": -0.09634318947792053, "margin_dpo/margin_std": 0.40796253085136414, "step": 8 }, { "KL/chosen_KL_mean": 0.0199737548828125, "KL/mean": 0.02644728124141693, "KL/rejected_KL_mean": 0.03292083740234375, "KL/std": 0.3076080083847046, "epoch": 0.013215859030837005, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.012944132089614868, "fcm_dpo/q_t": 0.5001616477966309, "grad_norm": 42.5697021484375, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.49002158641815186, "logits/rejected": -0.4431573152542114, "logps/chosen": -69.84687042236328, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.56974792480469, "loss": 1.3871, "margin_dpo/margin_mean": -0.012945234775543213, "margin_dpo/margin_std": 0.432614266872406, "step": 9 }, { "KL/chosen_KL_mean": 0.018072128295898438, "KL/mean": 0.0005231276154518127, "KL/rejected_KL_mean": -0.01702117919921875, "KL/std": 0.22773060202598572, "epoch": 0.014684287812041116, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.03509734570980072, "fcm_dpo/q_t": 0.4995613098144531, "grad_norm": 35.374786376953125, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.4773544371128082, "logits/rejected": -0.43332165479660034, "logps/chosen": -48.33961486816406, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.38908386230469, "loss": 1.3846, "margin_dpo/margin_mean": 0.035097718238830566, "margin_dpo/margin_std": 0.32590410113334656, "step": 10 }, { "KL/chosen_KL_mean": 0.055454254150390625, "KL/mean": -0.006332814693450928, "KL/rejected_KL_mean": -0.06811904907226562, "KL/std": 0.2968614101409912, "epoch": 0.016152716593245228, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.1235695481300354, "fcm_dpo/q_t": 0.4984557032585144, "grad_norm": 34.34830093383789, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.4542468190193176, "logits/rejected": -0.42898106575012207, "logps/chosen": -52.961402893066406, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.84849548339844, "loss": 1.3802, "margin_dpo/margin_mean": 0.12356960773468018, "margin_dpo/margin_std": 0.4112103283405304, "step": 11 }, { "KL/chosen_KL_mean": -0.037494659423828125, "KL/mean": -0.04345113784074783, "KL/rejected_KL_mean": -0.049404144287109375, "KL/std": 0.3219500184059143, "epoch": 0.01762114537444934, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.011913254857063293, "fcm_dpo/q_t": 0.4998508393764496, "grad_norm": 45.08433532714844, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.5271201133728027, "logits/rejected": -0.4905800521373749, "logps/chosen": -61.842933654785156, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.8582763671875, "logps/rejected": -104.90767669677734, "loss": 1.3858, "margin_dpo/margin_mean": 0.011912867426872253, "margin_dpo/margin_std": 0.4435839056968689, "step": 12 }, { "KL/chosen_KL_mean": -0.0016231536865234375, "KL/mean": 0.008255481719970703, "KL/rejected_KL_mean": 0.018131256103515625, "KL/std": 0.2522842288017273, "epoch": 0.01908957415565345, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.019751250743865967, "fcm_dpo/q_t": 0.5002469420433044, "grad_norm": 39.63324737548828, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.44067633152008057, "logits/rejected": -0.409400999546051, "logps/chosen": -64.26197814941406, "logps/ref_chosen": -64.2603530883789, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.18495178222656, "loss": 1.3874, "margin_dpo/margin_mean": -0.01975110173225403, "margin_dpo/margin_std": 0.3618961572647095, "step": 13 }, { "KL/chosen_KL_mean": 0.0144195556640625, "KL/mean": -0.015029460191726685, "KL/rejected_KL_mean": -0.0444793701171875, "KL/std": 0.254509299993515, "epoch": 0.020558002936857563, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0589028000831604, "fcm_dpo/q_t": 0.49926379323005676, "grad_norm": 42.595924377441406, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.4786085784435272, "logits/rejected": -0.43931445479393005, "logps/chosen": -58.09579086303711, "logps/ref_chosen": -58.11021041870117, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.09156036376953, "loss": 1.3834, "margin_dpo/margin_mean": 0.05890271067619324, "margin_dpo/margin_std": 0.36086153984069824, "step": 14 }, { "KL/chosen_KL_mean": -0.0267486572265625, "KL/mean": -0.03432337939739227, "KL/rejected_KL_mean": -0.041904449462890625, "KL/std": 0.24552780389785767, "epoch": 0.022026431718061675, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.01515999436378479, "fcm_dpo/q_t": 0.49981045722961426, "grad_norm": 32.06040573120117, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.501712441444397, "logits/rejected": -0.4834703207015991, "logps/chosen": -56.99365997314453, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.85054779052734, "loss": 1.3856, "margin_dpo/margin_mean": 0.015159964561462402, "margin_dpo/margin_std": 0.37245649099349976, "step": 15 }, { "KL/chosen_KL_mean": -0.00298309326171875, "KL/mean": -0.015021562576293945, "KL/rejected_KL_mean": -0.0270538330078125, "KL/std": 0.269855260848999, "epoch": 0.023494860499265784, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.02406895160675049, "fcm_dpo/q_t": 0.49969929456710815, "grad_norm": 42.0484619140625, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.4899655878543854, "logits/rejected": -0.4453532099723816, "logps/chosen": -61.74287414550781, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.39653015136719, "loss": 1.3852, "margin_dpo/margin_mean": 0.024068236351013184, "margin_dpo/margin_std": 0.3988404870033264, "step": 16 }, { "KL/chosen_KL_mean": 0.05109405517578125, "KL/mean": -0.004536911845207214, "KL/rejected_KL_mean": -0.06017303466796875, "KL/std": 0.2485760748386383, "epoch": 0.024963289280469897, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11126986145973206, "fcm_dpo/q_t": 0.4986092150211334, "grad_norm": 39.231082916259766, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.5107743740081787, "logits/rejected": -0.47472792863845825, "logps/chosen": -67.65924072265625, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.43882751464844, "loss": 1.3808, "margin_dpo/margin_mean": 0.1112699806690216, "margin_dpo/margin_std": 0.33091142773628235, "step": 17 }, { "KL/chosen_KL_mean": -0.00733184814453125, "KL/mean": -0.011897072196006775, "KL/rejected_KL_mean": -0.016460418701171875, "KL/std": 0.2411435842514038, "epoch": 0.02643171806167401, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.009129971265792847, "fcm_dpo/q_t": 0.4998858571052551, "grad_norm": 41.25909423828125, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.4928959012031555, "logits/rejected": -0.43723440170288086, "logps/chosen": -47.746822357177734, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.4887466430664, "loss": 1.3859, "margin_dpo/margin_mean": 0.009129911661148071, "margin_dpo/margin_std": 0.3549841642379761, "step": 18 }, { "KL/chosen_KL_mean": 0.03185272216796875, "KL/mean": -0.017944127321243286, "KL/rejected_KL_mean": -0.06773757934570312, "KL/std": 0.23240481317043304, "epoch": 0.027900146842878122, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.09958422183990479, "fcm_dpo/q_t": 0.4987553358078003, "grad_norm": 36.71526336669922, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.47301602363586426, "logits/rejected": -0.42177867889404297, "logps/chosen": -70.17350769042969, "logps/ref_chosen": -70.20536041259766, "logps/ref_rejected": -89.7575912475586, "logps/rejected": -89.82533264160156, "loss": 1.3814, "margin_dpo/margin_mean": 0.09958454966545105, "margin_dpo/margin_std": 0.33764326572418213, "step": 19 }, { "KL/chosen_KL_mean": 0.016534805297851562, "KL/mean": -0.01947064697742462, "KL/rejected_KL_mean": -0.05547332763671875, "KL/std": 0.24900861084461212, "epoch": 0.02936857562408223, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.07201334834098816, "fcm_dpo/q_t": 0.49909985065460205, "grad_norm": 37.05985641479492, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5658631324768066, "logits/rejected": -0.510959267616272, "logps/chosen": -50.786705017089844, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.82334899902344, "logps/rejected": -78.87882232666016, "loss": 1.3828, "margin_dpo/margin_mean": 0.07201322913169861, "margin_dpo/margin_std": 0.3497501015663147, "step": 20 }, { "KL/chosen_KL_mean": 0.010187149047851562, "KL/mean": -0.04661019146442413, "KL/rejected_KL_mean": -0.1034088134765625, "KL/std": 0.2971247434616089, "epoch": 0.030837004405286344, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11359718441963196, "fcm_dpo/q_t": 0.49858027696609497, "grad_norm": 38.864524841308594, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.4945378303527832, "logits/rejected": -0.47060784697532654, "logps/chosen": -50.05282974243164, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -77.97219848632812, "loss": 1.3807, "margin_dpo/margin_mean": 0.1135970950126648, "margin_dpo/margin_std": 0.3924105763435364, "step": 21 }, { "KL/chosen_KL_mean": 0.025745391845703125, "KL/mean": -0.054353222250938416, "KL/rejected_KL_mean": -0.13445281982421875, "KL/std": 0.2926764190196991, "epoch": 0.032305433186490456, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.16020318865776062, "fcm_dpo/q_t": 0.4979976713657379, "grad_norm": 42.72419357299805, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.4607342481613159, "logits/rejected": -0.4157930612564087, "logps/chosen": -59.031890869140625, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.63912963867188, "loss": 1.3784, "margin_dpo/margin_mean": 0.1602029800415039, "margin_dpo/margin_std": 0.4115052819252014, "step": 22 }, { "KL/chosen_KL_mean": 0.0500335693359375, "KL/mean": -0.061298683285713196, "KL/rejected_KL_mean": -0.17262649536132812, "KL/std": 0.3359306752681732, "epoch": 0.033773861967694566, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.22266075015068054, "fcm_dpo/q_t": 0.49721741676330566, "grad_norm": 40.804290771484375, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.49854540824890137, "logits/rejected": -0.47655850648880005, "logps/chosen": -60.02766418457031, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.13955688476562, "logps/rejected": -81.31217956542969, "loss": 1.3753, "margin_dpo/margin_mean": 0.2226608693599701, "margin_dpo/margin_std": 0.4632441997528076, "step": 23 }, { "KL/chosen_KL_mean": 0.0107421875, "KL/mean": -0.08500338345766068, "KL/rejected_KL_mean": -0.18075180053710938, "KL/std": 0.29481637477874756, "epoch": 0.03524229074889868, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.19149301946163177, "fcm_dpo/q_t": 0.4976065456867218, "grad_norm": 46.37499237060547, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.5174161195755005, "logits/rejected": -0.5007544159889221, "logps/chosen": -44.28029251098633, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.30596923828125, "loss": 1.3768, "margin_dpo/margin_mean": 0.19149288535118103, "margin_dpo/margin_std": 0.36473649740219116, "step": 24 }, { "KL/chosen_KL_mean": -0.03951263427734375, "KL/mean": -0.12438388168811798, "KL/rejected_KL_mean": -0.20925140380859375, "KL/std": 0.3535291850566864, "epoch": 0.03671071953010279, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.16974028944969177, "fcm_dpo/q_t": 0.4978786110877991, "grad_norm": 37.40980529785156, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.45742011070251465, "logits/rejected": -0.42607590556144714, "logps/chosen": -52.57656478881836, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.55143737792969, "loss": 1.378, "margin_dpo/margin_mean": 0.16974005103111267, "margin_dpo/margin_std": 0.45375657081604004, "step": 25 }, { "KL/chosen_KL_mean": 0.05691337585449219, "KL/mean": -0.13059790432453156, "KL/rejected_KL_mean": -0.31810760498046875, "KL/std": 0.4097515344619751, "epoch": 0.0381791483113069, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.375026136636734, "fcm_dpo/q_t": 0.4953131675720215, "grad_norm": 44.840396881103516, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5315337181091309, "logits/rejected": -0.5001455545425415, "logps/chosen": -53.86589431762695, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.67782592773438, "loss": 1.3678, "margin_dpo/margin_mean": 0.3750268816947937, "margin_dpo/margin_std": 0.46361756324768066, "step": 26 }, { "KL/chosen_KL_mean": 0.11811256408691406, "KL/mean": -0.12570391595363617, "KL/rejected_KL_mean": -0.3695220947265625, "KL/std": 0.4389367699623108, "epoch": 0.039647577092511016, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.4876362979412079, "fcm_dpo/q_t": 0.4939061999320984, "grad_norm": 47.54275131225586, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.531154990196228, "logits/rejected": -0.4949020743370056, "logps/chosen": -42.780418395996094, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72419738769531, "logps/rejected": -99.09371948242188, "loss": 1.3622, "margin_dpo/margin_mean": 0.4876362681388855, "margin_dpo/margin_std": 0.5126945972442627, "step": 27 }, { "KL/chosen_KL_mean": 0.028047561645507812, "KL/mean": -0.13079789280891418, "KL/rejected_KL_mean": -0.2896461486816406, "KL/std": 0.37808555364608765, "epoch": 0.041116005873715125, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.3176928162574768, "fcm_dpo/q_t": 0.49602949619293213, "grad_norm": 38.3453254699707, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.516849160194397, "logits/rejected": -0.46265852451324463, "logps/chosen": -60.5284538269043, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.69076538085938, "loss": 1.3706, "margin_dpo/margin_mean": 0.31769293546676636, "margin_dpo/margin_std": 0.5036317110061646, "step": 28 }, { "KL/chosen_KL_mean": 0.06805419921875, "KL/mean": -0.18801212310791016, "KL/rejected_KL_mean": -0.4440765380859375, "KL/std": 0.4913862943649292, "epoch": 0.042584434654919234, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.512132465839386, "fcm_dpo/q_t": 0.4936005473136902, "grad_norm": 46.29709243774414, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5535327196121216, "logits/rejected": -0.5072311758995056, "logps/chosen": -57.73973083496094, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.83842468261719, "loss": 1.3611, "margin_dpo/margin_mean": 0.5121327638626099, "margin_dpo/margin_std": 0.6046355366706848, "step": 29 }, { "KL/chosen_KL_mean": 0.16227149963378906, "KL/mean": -0.18443751335144043, "KL/rejected_KL_mean": -0.5311508178710938, "KL/std": 0.5754395723342896, "epoch": 0.04405286343612335, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.6934210658073425, "fcm_dpo/q_t": 0.4913354218006134, "grad_norm": 45.00627899169922, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.5093830823898315, "logits/rejected": -0.47936421632766724, "logps/chosen": -52.41510009765625, "logps/ref_chosen": -52.577369689941406, "logps/ref_rejected": -98.48920440673828, "logps/rejected": -99.02035522460938, "loss": 1.3522, "margin_dpo/margin_mean": 0.6934208869934082, "margin_dpo/margin_std": 0.5992348194122314, "step": 30 }, { "KL/chosen_KL_mean": 0.11116981506347656, "KL/mean": -0.14780662953853607, "KL/rejected_KL_mean": -0.4067840576171875, "KL/std": 0.56305992603302, "epoch": 0.04552129221732746, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.5179520845413208, "fcm_dpo/q_t": 0.4935287833213806, "grad_norm": 34.56964874267578, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.5045328140258789, "logits/rejected": -0.4596520662307739, "logps/chosen": -63.69575119018555, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.30078887939453, "loss": 1.3609, "margin_dpo/margin_mean": 0.5179519653320312, "margin_dpo/margin_std": 0.6779955625534058, "step": 31 }, { "KL/chosen_KL_mean": 0.2145843505859375, "KL/mean": -0.16551779210567474, "KL/rejected_KL_mean": -0.545623779296875, "KL/std": 0.735187828540802, "epoch": 0.04698972099853157, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.760206937789917, "fcm_dpo/q_t": 0.4905046820640564, "grad_norm": 43.04856872558594, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.5065813064575195, "logits/rejected": -0.46486425399780273, "logps/chosen": -62.524940490722656, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.86312866210938, "loss": 1.3491, "margin_dpo/margin_mean": 0.760206937789917, "margin_dpo/margin_std": 0.8800061941146851, "step": 32 }, { "KL/chosen_KL_mean": 0.09627151489257812, "KL/mean": -0.18434438109397888, "KL/rejected_KL_mean": -0.46495819091796875, "KL/std": 0.5440672636032104, "epoch": 0.048458149779735685, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.5612262487411499, "fcm_dpo/q_t": 0.492986798286438, "grad_norm": 37.19485092163086, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.4748949110507965, "logits/rejected": -0.4482702612876892, "logps/chosen": -53.164703369140625, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.35009765625, "loss": 1.3587, "margin_dpo/margin_mean": 0.561225950717926, "margin_dpo/margin_std": 0.6160410642623901, "step": 33 }, { "KL/chosen_KL_mean": 0.09169197082519531, "KL/mean": -0.32464924454689026, "KL/rejected_KL_mean": -0.7409934997558594, "KL/std": 0.7199804782867432, "epoch": 0.049926578560939794, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.8326817750930786, "fcm_dpo/q_t": 0.48959898948669434, "grad_norm": 40.41777801513672, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.48803627490997314, "logits/rejected": -0.47066670656204224, "logps/chosen": -50.72563552856445, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.662841796875, "loss": 1.3456, "margin_dpo/margin_mean": 0.8326810002326965, "margin_dpo/margin_std": 0.8663803339004517, "step": 34 }, { "KL/chosen_KL_mean": 0.12615013122558594, "KL/mean": -0.4437118172645569, "KL/rejected_KL_mean": -1.0135726928710938, "KL/std": 0.9808096885681152, "epoch": 0.0513950073421439, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.1397216320037842, "fcm_dpo/q_t": 0.4857790470123291, "grad_norm": 43.86104965209961, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.538067638874054, "logits/rejected": -0.5010450482368469, "logps/chosen": -50.898338317871094, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.83799743652344, "loss": 1.3309, "margin_dpo/margin_mean": 1.1397206783294678, "margin_dpo/margin_std": 1.1088385581970215, "step": 35 }, { "KL/chosen_KL_mean": 0.036411285400390625, "KL/mean": -0.529353678226471, "KL/rejected_KL_mean": -1.0951156616210938, "KL/std": 1.0766912698745728, "epoch": 0.05286343612334802, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.1315288543701172, "fcm_dpo/q_t": 0.48587337136268616, "grad_norm": 38.84652328491211, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.570672869682312, "logits/rejected": -0.5350126028060913, "logps/chosen": -51.955078125, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.0406265258789, "logps/rejected": -87.1357421875, "loss": 1.3314, "margin_dpo/margin_mean": 1.1315281391143799, "margin_dpo/margin_std": 1.1600990295410156, "step": 36 }, { "KL/chosen_KL_mean": 0.039447784423828125, "KL/mean": -0.4909515976905823, "KL/rejected_KL_mean": -1.0213546752929688, "KL/std": 1.065048336982727, "epoch": 0.05433186490455213, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.0608052015304565, "fcm_dpo/q_t": 0.4867693781852722, "grad_norm": 32.67951202392578, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.4992326498031616, "logits/rejected": -0.45456790924072266, "logps/chosen": -62.767662048339844, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.91642761230469, "loss": 1.3351, "margin_dpo/margin_mean": 1.0608049631118774, "margin_dpo/margin_std": 1.3338418006896973, "step": 37 }, { "KL/chosen_KL_mean": 0.1778545379638672, "KL/mean": -0.534498929977417, "KL/rejected_KL_mean": -1.2468528747558594, "KL/std": 1.3990492820739746, "epoch": 0.055800293685756244, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.4247064590454102, "fcm_dpo/q_t": 0.4822547733783722, "grad_norm": 37.384422302246094, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5284711122512817, "logits/rejected": -0.4957655370235443, "logps/chosen": -48.21266555786133, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.15929412841797, "loss": 1.3182, "margin_dpo/margin_mean": 1.4247064590454102, "margin_dpo/margin_std": 1.7057501077651978, "step": 38 }, { "KL/chosen_KL_mean": 0.08975982666015625, "KL/mean": -0.7341597080230713, "KL/rejected_KL_mean": -1.5580787658691406, "KL/std": 1.322374939918518, "epoch": 0.05726872246696035, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.6478345394134521, "fcm_dpo/q_t": 0.47944512963294983, "grad_norm": 39.39072799682617, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5558722019195557, "logits/rejected": -0.5158903002738953, "logps/chosen": -50.66071319580078, "logps/ref_chosen": -50.75047302246094, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.12759399414062, "loss": 1.307, "margin_dpo/margin_mean": 1.6478347778320312, "margin_dpo/margin_std": 1.4638022184371948, "step": 39 }, { "KL/chosen_KL_mean": 0.1870288848876953, "KL/mean": -0.5974045395851135, "KL/rejected_KL_mean": -1.3818397521972656, "KL/std": 1.3903248310089111, "epoch": 0.05873715124816446, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.568869948387146, "fcm_dpo/q_t": 0.48045170307159424, "grad_norm": 32.67512512207031, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.531123161315918, "logits/rejected": -0.5019083619117737, "logps/chosen": -57.798038482666016, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.3000717163086, "logps/rejected": -75.68191528320312, "loss": 1.3112, "margin_dpo/margin_mean": 1.5688700675964355, "margin_dpo/margin_std": 1.6431810855865479, "step": 40 }, { "KL/chosen_KL_mean": 0.026700973510742188, "KL/mean": -0.9341164827346802, "KL/rejected_KL_mean": -1.894927978515625, "KL/std": 1.8969902992248535, "epoch": 0.06020558002936858, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 1.921630620956421, "fcm_dpo/q_t": 0.47608882188796997, "grad_norm": 37.314823150634766, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.549726128578186, "logits/rejected": -0.5137777328491211, "logps/chosen": -62.66911315917969, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.91845703125, "loss": 1.2953, "margin_dpo/margin_mean": 1.9216312170028687, "margin_dpo/margin_std": 2.0707690715789795, "step": 41 }, { "KL/chosen_KL_mean": 0.20686912536621094, "KL/mean": -1.109514832496643, "KL/rejected_KL_mean": -2.4258995056152344, "KL/std": 2.2611937522888184, "epoch": 0.06167400881057269, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 2.6327667236328125, "fcm_dpo/q_t": 0.46733659505844116, "grad_norm": 44.921146392822266, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.5433107614517212, "logits/rejected": -0.49691134691238403, "logps/chosen": -58.759559631347656, "logps/ref_chosen": -58.966426849365234, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.33427429199219, "loss": 1.2632, "margin_dpo/margin_mean": 2.6327667236328125, "margin_dpo/margin_std": 2.5917067527770996, "step": 42 }, { "KL/chosen_KL_mean": 0.5680294036865234, "KL/mean": -0.7123653888702393, "KL/rejected_KL_mean": -1.9927635192871094, "KL/std": 1.8695602416992188, "epoch": 0.0631424375917768, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 2.5607895851135254, "fcm_dpo/q_t": 0.46813303232192993, "grad_norm": 39.96173858642578, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.529514729976654, "logits/rejected": -0.50406813621521, "logps/chosen": -53.58796691894531, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.47295379638672, "loss": 1.2648, "margin_dpo/margin_mean": 2.5607893466949463, "margin_dpo/margin_std": 1.9669482707977295, "step": 43 }, { "KL/chosen_KL_mean": 0.22922897338867188, "KL/mean": -1.1871364116668701, "KL/rejected_KL_mean": -2.603504180908203, "KL/std": 2.2904388904571533, "epoch": 0.06461086637298091, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 2.832730293273926, "fcm_dpo/q_t": 0.46478694677352905, "grad_norm": 44.83015823364258, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.4886033236980438, "logits/rejected": -0.4682733416557312, "logps/chosen": -49.84926986694336, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.38726806640625, "loss": 1.2528, "margin_dpo/margin_mean": 2.832730293273926, "margin_dpo/margin_std": 2.242119789123535, "step": 44 }, { "KL/chosen_KL_mean": 0.15188217163085938, "KL/mean": -1.0016334056854248, "KL/rejected_KL_mean": -2.1551513671875, "KL/std": 2.107585906982422, "epoch": 0.06607929515418502, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 2.307036876678467, "fcm_dpo/q_t": 0.4713747501373291, "grad_norm": 35.4964485168457, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.49817246198654175, "logits/rejected": -0.48587897419929504, "logps/chosen": -48.26304626464844, "logps/ref_chosen": -48.4149284362793, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -80.09158325195312, "loss": 1.2787, "margin_dpo/margin_mean": 2.307036876678467, "margin_dpo/margin_std": 2.619992733001709, "step": 45 }, { "KL/chosen_KL_mean": 0.21907806396484375, "KL/mean": -1.2944903373718262, "KL/rejected_KL_mean": -2.808063507080078, "KL/std": 2.69203782081604, "epoch": 0.06754772393538913, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 3.0271382331848145, "fcm_dpo/q_t": 0.4626089632511139, "grad_norm": 40.982444763183594, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.5164551138877869, "logits/rejected": -0.464849591255188, "logps/chosen": -55.78034973144531, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.46064758300781, "loss": 1.2474, "margin_dpo/margin_mean": 3.0271389484405518, "margin_dpo/margin_std": 3.295480966567993, "step": 46 }, { "KL/chosen_KL_mean": 0.3923931121826172, "KL/mean": -1.1159597635269165, "KL/rejected_KL_mean": -2.6243133544921875, "KL/std": 2.6395797729492188, "epoch": 0.06901615271659324, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 3.0167040824890137, "fcm_dpo/q_t": 0.46258771419525146, "grad_norm": 37.45136642456055, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.5594693422317505, "logits/rejected": -0.5059822797775269, "logps/chosen": -57.53368377685547, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.30352020263672, "loss": 1.246, "margin_dpo/margin_mean": 3.016704559326172, "margin_dpo/margin_std": 2.752382516860962, "step": 47 }, { "KL/chosen_KL_mean": 0.07604217529296875, "KL/mean": -1.5602600574493408, "KL/rejected_KL_mean": -3.1965599060058594, "KL/std": 2.765866756439209, "epoch": 0.07048458149779736, "fcm_dpo/beta": 0.05000000074505806, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 3.2726051807403564, "fcm_dpo/q_t": 0.4594641327857971, "grad_norm": 42.72023391723633, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.5792273879051208, "logits/rejected": -0.5201135277748108, "logps/chosen": -57.112030029296875, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -91.21316528320312, "loss": 1.2359, "margin_dpo/margin_mean": 3.2726054191589355, "margin_dpo/margin_std": 2.975618362426758, "step": 48 }, { "KL/chosen_KL_mean": 0.2817058563232422, "KL/mean": -1.6360485553741455, "KL/rejected_KL_mean": -3.5537986755371094, "KL/std": 3.4877753257751465, "epoch": 0.07195301027900147, "fcm_dpo/beta": 0.05044425278902054, "fcm_dpo/delta": 0.08806969970464706, "fcm_dpo/margin": 3.8355047702789307, "fcm_dpo/q_t": 0.45288553833961487, "grad_norm": 37.98999786376953, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.5384161472320557, "logits/rejected": -0.4786253571510315, "logps/chosen": -61.403564453125, "logps/ref_chosen": -61.685272216796875, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -87.32127380371094, "loss": 1.2127, "margin_dpo/margin_mean": 3.8355050086975098, "margin_dpo/margin_std": 4.0506486892700195, "step": 49 }, { "KL/chosen_KL_mean": -0.2352313995361328, "KL/mean": -2.3096749782562256, "KL/rejected_KL_mean": -4.384113311767578, "KL/std": 3.5547854900360107, "epoch": 0.07342143906020558, "fcm_dpo/beta": 0.052318423986434937, "fcm_dpo/delta": 0.1870485544204712, "fcm_dpo/margin": 4.148881435394287, "fcm_dpo/q_t": 0.4476335346698761, "grad_norm": 38.957035064697266, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5327342748641968, "logits/rejected": -0.49643486738204956, "logps/chosen": -58.95936965942383, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -100.74226379394531, "loss": 1.1922, "margin_dpo/margin_mean": 4.148880958557129, "margin_dpo/margin_std": 4.06521463394165, "step": 50 }, { "KL/chosen_KL_mean": -0.2880744934082031, "KL/mean": -2.3413543701171875, "KL/rejected_KL_mean": -4.394641876220703, "KL/std": 4.241177558898926, "epoch": 0.07488986784140969, "fcm_dpo/beta": 0.05356086045503616, "fcm_dpo/delta": 0.06885935366153717, "fcm_dpo/margin": 4.106563568115234, "fcm_dpo/q_t": 0.447162002325058, "grad_norm": 34.38425827026367, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5253022313117981, "logits/rejected": -0.4924160838127136, "logps/chosen": -61.6617431640625, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -80.39664459228516, "loss": 1.1977, "margin_dpo/margin_mean": 4.106563091278076, "margin_dpo/margin_std": 5.20696496963501, "step": 51 }, { "KL/chosen_KL_mean": 0.3370513916015625, "KL/mean": -2.786116600036621, "KL/rejected_KL_mean": -5.9092864990234375, "KL/std": 4.868247985839844, "epoch": 0.0763582966226138, "fcm_dpo/beta": 0.05409781634807587, "fcm_dpo/delta": 0.06422993540763855, "fcm_dpo/margin": 6.246335983276367, "fcm_dpo/q_t": 0.4186936020851135, "grad_norm": 41.69569396972656, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5800528526306152, "logits/rejected": -0.5262706875801086, "logps/chosen": -52.00030517578125, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -85.8832015991211, "loss": 1.096, "margin_dpo/margin_mean": 6.246336936950684, "margin_dpo/margin_std": 5.239194393157959, "step": 52 }, { "KL/chosen_KL_mean": -0.2391986846923828, "KL/mean": -3.4154043197631836, "KL/rejected_KL_mean": -6.591606140136719, "KL/std": 5.5702056884765625, "epoch": 0.07782672540381791, "fcm_dpo/beta": 0.05494330823421478, "fcm_dpo/delta": 0.05208485573530197, "fcm_dpo/margin": 6.352412223815918, "fcm_dpo/q_t": 0.41828417778015137, "grad_norm": 41.55133056640625, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.6179283857345581, "logits/rejected": -0.5963842272758484, "logps/chosen": -53.55384826660156, "logps/ref_chosen": -53.31465148925781, "logps/ref_rejected": -91.78359985351562, "logps/rejected": -98.37519836425781, "loss": 1.1022, "margin_dpo/margin_mean": 6.352412223815918, "margin_dpo/margin_std": 6.420080184936523, "step": 53 }, { "KL/chosen_KL_mean": -0.4640941619873047, "KL/mean": -3.2297964096069336, "KL/rejected_KL_mean": -5.995494842529297, "KL/std": 5.26720666885376, "epoch": 0.07929515418502203, "fcm_dpo/beta": 0.055665239691734314, "fcm_dpo/delta": 0.09488870948553085, "fcm_dpo/margin": 5.531402111053467, "fcm_dpo/q_t": 0.42631345987319946, "grad_norm": 38.000980377197266, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.5974017381668091, "logits/rejected": -0.5442031025886536, "logps/chosen": -51.15275573730469, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -97.71089172363281, "loss": 1.1223, "margin_dpo/margin_mean": 5.53140115737915, "margin_dpo/margin_std": 5.259613990783691, "step": 54 }, { "KL/chosen_KL_mean": -1.0277824401855469, "KL/mean": -4.298252105712891, "KL/rejected_KL_mean": -7.568717956542969, "KL/std": 6.322789192199707, "epoch": 0.08076358296622614, "fcm_dpo/beta": 0.05629376322031021, "fcm_dpo/delta": 0.03277287259697914, "fcm_dpo/margin": 6.540935039520264, "fcm_dpo/q_t": 0.4138604402542114, "grad_norm": 37.752960205078125, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.6345809698104858, "logits/rejected": -0.5711982250213623, "logps/chosen": -63.64301681518555, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -96.56222534179688, "loss": 1.0959, "margin_dpo/margin_mean": 6.5409345626831055, "margin_dpo/margin_std": 7.682841777801514, "step": 55 }, { "KL/chosen_KL_mean": -0.8249740600585938, "KL/mean": -3.960031032562256, "KL/rejected_KL_mean": -7.095088958740234, "KL/std": 6.235048294067383, "epoch": 0.08223201174743025, "fcm_dpo/beta": 0.05667191743850708, "fcm_dpo/delta": 0.0462585911154747, "fcm_dpo/margin": 6.270114421844482, "fcm_dpo/q_t": 0.41691917181015015, "grad_norm": 35.4822883605957, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.6079816818237305, "logits/rejected": -0.5654845237731934, "logps/chosen": -58.7577018737793, "logps/ref_chosen": -57.9327278137207, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -101.26953125, "loss": 1.1059, "margin_dpo/margin_mean": 6.270113945007324, "margin_dpo/margin_std": 7.562032699584961, "step": 56 }, { "KL/chosen_KL_mean": -0.8334732055664062, "KL/mean": -4.382845401763916, "KL/rejected_KL_mean": -7.932216644287109, "KL/std": 6.173903942108154, "epoch": 0.08370044052863436, "fcm_dpo/beta": 0.05690793693065643, "fcm_dpo/delta": -0.004338288679718971, "fcm_dpo/margin": 7.098737716674805, "fcm_dpo/q_t": 0.4039880931377411, "grad_norm": 39.752960205078125, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.6026902198791504, "logits/rejected": -0.5755797624588013, "logps/chosen": -71.3287582397461, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -103.49767303466797, "loss": 1.0592, "margin_dpo/margin_mean": 7.0987372398376465, "margin_dpo/margin_std": 6.829120635986328, "step": 57 }, { "KL/chosen_KL_mean": -1.1680717468261719, "KL/mean": -5.124210834503174, "KL/rejected_KL_mean": -9.08034896850586, "KL/std": 7.0405378341674805, "epoch": 0.08516886930983847, "fcm_dpo/beta": 0.05647977069020271, "fcm_dpo/delta": -0.049127254635095596, "fcm_dpo/margin": 7.912271499633789, "fcm_dpo/q_t": 0.3962929844856262, "grad_norm": 41.28611755371094, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.5904037952423096, "logits/rejected": -0.5139462947845459, "logps/chosen": -63.30101013183594, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -93.69764709472656, "loss": 1.0414, "margin_dpo/margin_mean": 7.912272930145264, "margin_dpo/margin_std": 8.415109634399414, "step": 58 }, { "KL/chosen_KL_mean": -1.6745834350585938, "KL/mean": -6.046883583068848, "KL/rejected_KL_mean": -10.419181823730469, "KL/std": 7.833342552185059, "epoch": 0.08663729809104258, "fcm_dpo/beta": 0.05512422323226929, "fcm_dpo/delta": -0.08732414245605469, "fcm_dpo/margin": 8.744604110717773, "fcm_dpo/q_t": 0.388535737991333, "grad_norm": 41.06474304199219, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6156203746795654, "logits/rejected": -0.5749033689498901, "logps/chosen": -53.60710906982422, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -99.30438232421875, "loss": 1.0196, "margin_dpo/margin_mean": 8.744604110717773, "margin_dpo/margin_std": 9.00253677368164, "step": 59 }, { "KL/chosen_KL_mean": -3.0149459838867188, "KL/mean": -6.47147274017334, "KL/rejected_KL_mean": -9.927997589111328, "KL/std": 7.12081241607666, "epoch": 0.0881057268722467, "fcm_dpo/beta": 0.055202968418598175, "fcm_dpo/delta": 0.01902601681649685, "fcm_dpo/margin": 6.913043975830078, "fcm_dpo/q_t": 0.4095836579799652, "grad_norm": 41.82515335083008, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.6264551877975464, "logits/rejected": -0.5676676034927368, "logps/chosen": -63.95713424682617, "logps/ref_chosen": -60.94218826293945, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -95.32139587402344, "loss": 1.0887, "margin_dpo/margin_mean": 6.913043975830078, "margin_dpo/margin_std": 8.055152893066406, "step": 60 }, { "KL/chosen_KL_mean": -1.785238265991211, "KL/mean": -6.138444900512695, "KL/rejected_KL_mean": -10.491649627685547, "KL/std": 9.666519165039062, "epoch": 0.08957415565345081, "fcm_dpo/beta": 0.054592475295066833, "fcm_dpo/delta": -0.07944516837596893, "fcm_dpo/margin": 8.706413269042969, "fcm_dpo/q_t": 0.3965580463409424, "grad_norm": 36.854209899902344, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.6180921196937561, "logits/rejected": -0.5846239328384399, "logps/chosen": -62.41876220703125, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -100.34414672851562, "loss": 1.0562, "margin_dpo/margin_mean": 8.706413269042969, "margin_dpo/margin_std": 12.064128875732422, "step": 61 }, { "KL/chosen_KL_mean": -1.9152297973632812, "KL/mean": -5.1878767013549805, "KL/rejected_KL_mean": -8.460521697998047, "KL/std": 7.09406852722168, "epoch": 0.09104258443465492, "fcm_dpo/beta": 0.05502016097307205, "fcm_dpo/delta": 0.041067853569984436, "fcm_dpo/margin": 6.545290470123291, "fcm_dpo/q_t": 0.416960746049881, "grad_norm": 35.16862487792969, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.6116189956665039, "logits/rejected": -0.5774843692779541, "logps/chosen": -58.066001892089844, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -84.02671813964844, "loss": 1.1131, "margin_dpo/margin_mean": 6.545290946960449, "margin_dpo/margin_std": 8.706681251525879, "step": 62 }, { "KL/chosen_KL_mean": -3.2132091522216797, "KL/mean": -7.6087846755981445, "KL/rejected_KL_mean": -12.004364013671875, "KL/std": 9.102246284484863, "epoch": 0.09251101321585903, "fcm_dpo/beta": 0.05407857149839401, "fcm_dpo/delta": -0.07995294779539108, "fcm_dpo/margin": 8.791152000427246, "fcm_dpo/q_t": 0.38956284523010254, "grad_norm": 39.14129638671875, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.6333186626434326, "logits/rejected": -0.5910948514938354, "logps/chosen": -76.36060333251953, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -109.61442565917969, "loss": 1.0306, "margin_dpo/margin_mean": 8.791152000427246, "margin_dpo/margin_std": 9.617431640625, "step": 63 }, { "KL/chosen_KL_mean": -1.370086669921875, "KL/mean": -6.633951187133789, "KL/rejected_KL_mean": -11.897815704345703, "KL/std": 9.861505508422852, "epoch": 0.09397944199706314, "fcm_dpo/beta": 0.05238521099090576, "fcm_dpo/delta": -0.16296426951885223, "fcm_dpo/margin": 10.527724266052246, "fcm_dpo/q_t": 0.37639370560646057, "grad_norm": 35.86033248901367, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.5971044898033142, "logits/rejected": -0.5653672218322754, "logps/chosen": -55.36868667602539, "logps/ref_chosen": -53.998600006103516, "logps/ref_rejected": -93.53019714355469, "logps/rejected": -105.42801666259766, "loss": 0.9991, "margin_dpo/margin_mean": 10.527724266052246, "margin_dpo/margin_std": 12.154861450195312, "step": 64 }, { "KL/chosen_KL_mean": -3.860137939453125, "KL/mean": -9.250676155090332, "KL/rejected_KL_mean": -14.641212463378906, "KL/std": 11.2495698928833, "epoch": 0.09544787077826726, "fcm_dpo/beta": 0.05090530961751938, "fcm_dpo/delta": -0.15866145491600037, "fcm_dpo/margin": 10.78107738494873, "fcm_dpo/q_t": 0.3747457265853882, "grad_norm": 36.674705505371094, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6921563148498535, "logits/rejected": -0.6807618737220764, "logps/chosen": -68.69613647460938, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -124.58767700195312, "loss": 1.0021, "margin_dpo/margin_mean": 10.781078338623047, "margin_dpo/margin_std": 12.379752159118652, "step": 65 }, { "KL/chosen_KL_mean": -3.5839481353759766, "KL/mean": -8.451730728149414, "KL/rejected_KL_mean": -13.319507598876953, "KL/std": 10.042497634887695, "epoch": 0.09691629955947137, "fcm_dpo/beta": 0.04975783824920654, "fcm_dpo/delta": -0.08998537063598633, "fcm_dpo/margin": 9.735556602478027, "fcm_dpo/q_t": 0.39193886518478394, "grad_norm": 32.98362350463867, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6278406381607056, "logits/rejected": -0.5947624444961548, "logps/chosen": -55.02747344970703, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629913330078, "logps/rejected": -88.955810546875, "loss": 1.0403, "margin_dpo/margin_mean": 9.735556602478027, "margin_dpo/margin_std": 12.185860633850098, "step": 66 }, { "KL/chosen_KL_mean": -3.3043766021728516, "KL/mean": -8.373483657836914, "KL/rejected_KL_mean": -13.44259262084961, "KL/std": 10.349268913269043, "epoch": 0.09838472834067548, "fcm_dpo/beta": 0.049183670431375504, "fcm_dpo/delta": -0.10368002951145172, "fcm_dpo/margin": 10.138212203979492, "fcm_dpo/q_t": 0.38883906602859497, "grad_norm": 33.585758209228516, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.6145930886268616, "logits/rejected": -0.5722061395645142, "logps/chosen": -62.645179748535156, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78728485107422, "logps/rejected": -86.22987365722656, "loss": 1.031, "margin_dpo/margin_mean": 10.138212203979492, "margin_dpo/margin_std": 12.514627456665039, "step": 67 }, { "KL/chosen_KL_mean": -4.238227844238281, "KL/mean": -9.08108901977539, "KL/rejected_KL_mean": -13.923954010009766, "KL/std": 9.300742149353027, "epoch": 0.09985315712187959, "fcm_dpo/beta": 0.04838772863149643, "fcm_dpo/delta": -0.0719866156578064, "fcm_dpo/margin": 9.685721397399902, "fcm_dpo/q_t": 0.39209192991256714, "grad_norm": 33.56498336791992, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6377149820327759, "logits/rejected": -0.5744598507881165, "logps/chosen": -69.44406127929688, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -91.1312026977539, "loss": 1.0309, "margin_dpo/margin_mean": 9.685721397399902, "margin_dpo/margin_std": 10.507135391235352, "step": 68 }, { "KL/chosen_KL_mean": -5.249656677246094, "KL/mean": -11.655780792236328, "KL/rejected_KL_mean": -18.061904907226562, "KL/std": 11.926582336425781, "epoch": 0.1013215859030837, "fcm_dpo/beta": 0.04644005745649338, "fcm_dpo/delta": -0.20860767364501953, "fcm_dpo/margin": 12.812248229980469, "fcm_dpo/q_t": 0.36728864908218384, "grad_norm": 35.174522399902344, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.6080530285835266, "logits/rejected": -0.5861386060714722, "logps/chosen": -65.06889343261719, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -121.45076751708984, "loss": 0.9606, "margin_dpo/margin_mean": 12.812247276306152, "margin_dpo/margin_std": 13.367576599121094, "step": 69 }, { "KL/chosen_KL_mean": -7.555110931396484, "KL/mean": -14.482812881469727, "KL/rejected_KL_mean": -21.41051483154297, "KL/std": 15.016265869140625, "epoch": 0.1027900146842878, "fcm_dpo/beta": 0.04471848905086517, "fcm_dpo/delta": -0.2335546314716339, "fcm_dpo/margin": 13.855405807495117, "fcm_dpo/q_t": 0.3647538721561432, "grad_norm": 36.163818359375, "learning_rate": 5e-07, "logits/chosen": -0.6315578818321228, "logits/rejected": -0.5984231233596802, "logps/chosen": -69.48574829101562, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.06078338623047, "logps/rejected": -112.47129821777344, "loss": 0.9698, "margin_dpo/margin_mean": 13.855405807495117, "margin_dpo/margin_std": 15.959779739379883, "step": 70 }, { "KL/chosen_KL_mean": -7.743133544921875, "KL/mean": -16.083194732666016, "KL/rejected_KL_mean": -24.423255920410156, "KL/std": 16.247331619262695, "epoch": 0.10425844346549193, "fcm_dpo/beta": 0.04178931191563606, "fcm_dpo/delta": -0.32131147384643555, "fcm_dpo/margin": 16.68012237548828, "fcm_dpo/q_t": 0.35157865285873413, "grad_norm": 33.68754196166992, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6753981113433838, "logits/rejected": -0.63995361328125, "logps/chosen": -69.49346923828125, "logps/ref_chosen": -61.750335693359375, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -121.75987243652344, "loss": 0.9201, "margin_dpo/margin_mean": 16.68012237548828, "margin_dpo/margin_std": 17.86024284362793, "step": 71 }, { "KL/chosen_KL_mean": -8.834901809692383, "KL/mean": -17.32482147216797, "KL/rejected_KL_mean": -25.814735412597656, "KL/std": 17.12436866760254, "epoch": 0.10572687224669604, "fcm_dpo/beta": 0.03948363661766052, "fcm_dpo/delta": -0.290458619594574, "fcm_dpo/margin": 16.97983169555664, "fcm_dpo/q_t": 0.35484981536865234, "grad_norm": 35.54236602783203, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6629385352134705, "logits/rejected": -0.6264936327934265, "logps/chosen": -74.88831329345703, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -121.10172271728516, "loss": 0.9598, "margin_dpo/margin_mean": 16.97983169555664, "margin_dpo/margin_std": 20.594505310058594, "step": 72 }, { "KL/chosen_KL_mean": -10.812238693237305, "KL/mean": -19.365093231201172, "KL/rejected_KL_mean": -27.917957305908203, "KL/std": 21.935474395751953, "epoch": 0.10719530102790015, "fcm_dpo/beta": 0.03733060508966446, "fcm_dpo/delta": -0.25653302669525146, "fcm_dpo/margin": 17.105716705322266, "fcm_dpo/q_t": 0.37481170892715454, "grad_norm": 35.56188201904297, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6949942708015442, "logits/rejected": -0.6558982729911804, "logps/chosen": -77.06851196289062, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613098144531, "logps/rejected": -118.37409210205078, "loss": 1.0587, "margin_dpo/margin_mean": 17.105716705322266, "margin_dpo/margin_std": 27.76058006286621, "step": 73 }, { "KL/chosen_KL_mean": -11.020193099975586, "KL/mean": -20.51835823059082, "KL/rejected_KL_mean": -30.016517639160156, "KL/std": 24.099422454833984, "epoch": 0.10866372980910426, "fcm_dpo/beta": 0.03536809980869293, "fcm_dpo/delta": -0.29220515489578247, "fcm_dpo/margin": 18.996326446533203, "fcm_dpo/q_t": 0.36699697375297546, "grad_norm": 35.718910217285156, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6316280364990234, "logits/rejected": -0.6258025169372559, "logps/chosen": -64.44507598876953, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -125.96345520019531, "loss": 1.0082, "margin_dpo/margin_mean": 18.996326446533203, "margin_dpo/margin_std": 30.294841766357422, "step": 74 }, { "KL/chosen_KL_mean": -10.813570022583008, "KL/mean": -24.00226402282715, "KL/rejected_KL_mean": -37.190948486328125, "KL/std": 24.315690994262695, "epoch": 0.11013215859030837, "fcm_dpo/beta": 0.03265610337257385, "fcm_dpo/delta": -0.5023984909057617, "fcm_dpo/margin": 26.377384185791016, "fcm_dpo/q_t": 0.3248700201511383, "grad_norm": 30.176677703857422, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6893298625946045, "logits/rejected": -0.6836451292037964, "logps/chosen": -62.67523193359375, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25398254394531, "logps/rejected": -148.44493103027344, "loss": 0.862, "margin_dpo/margin_mean": 26.377382278442383, "margin_dpo/margin_std": 27.90032196044922, "step": 75 }, { "KL/chosen_KL_mean": -12.86973762512207, "KL/mean": -20.976146697998047, "KL/rejected_KL_mean": -29.082550048828125, "KL/std": 19.501949310302734, "epoch": 0.11160058737151249, "fcm_dpo/beta": 0.031097372993826866, "fcm_dpo/delta": -0.10961665213108063, "fcm_dpo/margin": 16.212825775146484, "fcm_dpo/q_t": 0.3852229416370392, "grad_norm": 31.088647842407227, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.7124214172363281, "logits/rejected": -0.6733113527297974, "logps/chosen": -66.13577270507812, "logps/ref_chosen": -53.26603698730469, "logps/ref_rejected": -78.21662902832031, "logps/rejected": -107.29917907714844, "loss": 1.0435, "margin_dpo/margin_mean": 16.21282386779785, "margin_dpo/margin_std": 21.476770401000977, "step": 76 }, { "KL/chosen_KL_mean": -11.584232330322266, "KL/mean": -24.901695251464844, "KL/rejected_KL_mean": -38.21916580200195, "KL/std": 26.45264434814453, "epoch": 0.1130690161527166, "fcm_dpo/beta": 0.028904041275382042, "fcm_dpo/delta": -0.40436428785324097, "fcm_dpo/margin": 26.634933471679688, "fcm_dpo/q_t": 0.3336452543735504, "grad_norm": 32.505531311035156, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.678729236125946, "logits/rejected": -0.6420924663543701, "logps/chosen": -69.680908203125, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -131.99278259277344, "loss": 0.9116, "margin_dpo/margin_mean": 26.634933471679688, "margin_dpo/margin_std": 29.75762176513672, "step": 77 }, { "KL/chosen_KL_mean": -11.404712677001953, "KL/mean": -21.245365142822266, "KL/rejected_KL_mean": -31.086013793945312, "KL/std": 21.289440155029297, "epoch": 0.1145374449339207, "fcm_dpo/beta": 0.027905140072107315, "fcm_dpo/delta": -0.15760990977287292, "fcm_dpo/margin": 19.68130111694336, "fcm_dpo/q_t": 0.3796628713607788, "grad_norm": 28.661237716674805, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6651836037635803, "logits/rejected": -0.6431748270988464, "logps/chosen": -67.01849365234375, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -116.02037811279297, "loss": 1.0166, "margin_dpo/margin_mean": 19.68130111694336, "margin_dpo/margin_std": 24.757617950439453, "step": 78 }, { "KL/chosen_KL_mean": -11.86384391784668, "KL/mean": -22.519290924072266, "KL/rejected_KL_mean": -33.17473602294922, "KL/std": 23.79071044921875, "epoch": 0.11600587371512482, "fcm_dpo/beta": 0.02685295045375824, "fcm_dpo/delta": -0.18282675743103027, "fcm_dpo/margin": 21.310894012451172, "fcm_dpo/q_t": 0.3792785704135895, "grad_norm": 25.604280471801758, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.6872971057891846, "logits/rejected": -0.6611636281013489, "logps/chosen": -67.3143310546875, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -120.82229614257812, "loss": 1.0255, "margin_dpo/margin_mean": 21.310894012451172, "margin_dpo/margin_std": 29.21087074279785, "step": 79 }, { "KL/chosen_KL_mean": -14.992725372314453, "KL/mean": -25.173377990722656, "KL/rejected_KL_mean": -35.354034423828125, "KL/std": 25.4627742767334, "epoch": 0.11747430249632893, "fcm_dpo/beta": 0.02588074468076229, "fcm_dpo/delta": -0.13631115853786469, "fcm_dpo/margin": 20.36130142211914, "fcm_dpo/q_t": 0.39049336314201355, "grad_norm": 27.309263229370117, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.7352666854858398, "logits/rejected": -0.69718337059021, "logps/chosen": -73.51201629638672, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -122.90153503417969, "loss": 1.0581, "margin_dpo/margin_mean": 20.36130142211914, "margin_dpo/margin_std": 29.580612182617188, "step": 80 }, { "KL/chosen_KL_mean": -14.040748596191406, "KL/mean": -27.886680603027344, "KL/rejected_KL_mean": -41.73262023925781, "KL/std": 29.870553970336914, "epoch": 0.11894273127753303, "fcm_dpo/beta": 0.024616166949272156, "fcm_dpo/delta": -0.30468764901161194, "fcm_dpo/margin": 27.69186782836914, "fcm_dpo/q_t": 0.35815176367759705, "grad_norm": 27.911373138427734, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.6956121921539307, "logits/rejected": -0.6685779094696045, "logps/chosen": -80.48961639404297, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -171.39532470703125, "loss": 0.9671, "margin_dpo/margin_mean": 27.69186782836914, "margin_dpo/margin_std": 35.64756393432617, "step": 81 }, { "KL/chosen_KL_mean": -16.342798233032227, "KL/mean": -27.334274291992188, "KL/rejected_KL_mean": -38.32575607299805, "KL/std": 27.074535369873047, "epoch": 0.12041116005873716, "fcm_dpo/beta": 0.024119626730680466, "fcm_dpo/delta": -0.1382198929786682, "fcm_dpo/margin": 21.982955932617188, "fcm_dpo/q_t": 0.389964759349823, "grad_norm": 31.045066833496094, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6336376070976257, "logits/rejected": -0.6242895722389221, "logps/chosen": -68.57518005371094, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -129.06900024414062, "loss": 1.1014, "margin_dpo/margin_mean": 21.982955932617188, "margin_dpo/margin_std": 37.50407028198242, "step": 82 }, { "KL/chosen_KL_mean": -16.703378677368164, "KL/mean": -29.769197463989258, "KL/rejected_KL_mean": -42.835018157958984, "KL/std": 28.040857315063477, "epoch": 0.12187958883994127, "fcm_dpo/beta": 0.022889206185936928, "fcm_dpo/delta": -0.21175748109817505, "fcm_dpo/margin": 26.131641387939453, "fcm_dpo/q_t": 0.3712141811847687, "grad_norm": 27.287622451782227, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6630829572677612, "logits/rejected": -0.6318089962005615, "logps/chosen": -72.53076171875, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71589660644531, "logps/rejected": -146.55091857910156, "loss": 1.0016, "margin_dpo/margin_mean": 26.131643295288086, "margin_dpo/margin_std": 33.42725372314453, "step": 83 }, { "KL/chosen_KL_mean": -15.800683975219727, "KL/mean": -26.602916717529297, "KL/rejected_KL_mean": -37.4051513671875, "KL/std": 23.81344223022461, "epoch": 0.12334801762114538, "fcm_dpo/beta": 0.022253597155213356, "fcm_dpo/delta": -0.08681607246398926, "fcm_dpo/margin": 21.604461669921875, "fcm_dpo/q_t": 0.3886079490184784, "grad_norm": 24.89923858642578, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6726013422012329, "logits/rejected": -0.635522723197937, "logps/chosen": -82.97685241699219, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -124.70375061035156, "loss": 1.0329, "margin_dpo/margin_mean": 21.604461669921875, "margin_dpo/margin_std": 24.17331314086914, "step": 84 }, { "KL/chosen_KL_mean": -15.828641891479492, "KL/mean": -27.07083511352539, "KL/rejected_KL_mean": -38.31303024291992, "KL/std": 24.207460403442383, "epoch": 0.12481644640234948, "fcm_dpo/beta": 0.022155042737722397, "fcm_dpo/delta": -0.10365713387727737, "fcm_dpo/margin": 22.484390258789062, "fcm_dpo/q_t": 0.3894878029823303, "grad_norm": 24.93961524963379, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6551598310470581, "logits/rejected": -0.623712420463562, "logps/chosen": -74.23526000976562, "logps/ref_chosen": -58.4066162109375, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -116.95182800292969, "loss": 1.0486, "margin_dpo/margin_mean": 22.484390258789062, "margin_dpo/margin_std": 29.117321014404297, "step": 85 }, { "KL/chosen_KL_mean": -20.43706703186035, "KL/mean": -32.29643249511719, "KL/rejected_KL_mean": -44.155792236328125, "KL/std": 33.11329650878906, "epoch": 0.1262848751835536, "fcm_dpo/beta": 0.0215867031365633, "fcm_dpo/delta": -0.1179293692111969, "fcm_dpo/margin": 23.718732833862305, "fcm_dpo/q_t": 0.40107935667037964, "grad_norm": 31.930374145507812, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6938769817352295, "logits/rejected": -0.6838431358337402, "logps/chosen": -76.57453155517578, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -132.27743530273438, "loss": 1.1337, "margin_dpo/margin_mean": 23.718732833862305, "margin_dpo/margin_std": 45.869529724121094, "step": 86 }, { "KL/chosen_KL_mean": -20.00033950805664, "KL/mean": -34.11670684814453, "KL/rejected_KL_mean": -48.23307418823242, "KL/std": 32.86204528808594, "epoch": 0.1277533039647577, "fcm_dpo/beta": 0.020640596747398376, "fcm_dpo/delta": -0.19566936790943146, "fcm_dpo/margin": 28.232730865478516, "fcm_dpo/q_t": 0.3739252984523773, "grad_norm": 25.189109802246094, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.7301120758056641, "logits/rejected": -0.7094823122024536, "logps/chosen": -75.63643646240234, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -143.70065307617188, "loss": 1.0366, "margin_dpo/margin_mean": 28.232730865478516, "margin_dpo/margin_std": 40.02729797363281, "step": 87 }, { "KL/chosen_KL_mean": -25.23326873779297, "KL/mean": -35.901371002197266, "KL/rejected_KL_mean": -46.5694694519043, "KL/std": 34.48196792602539, "epoch": 0.12922173274596183, "fcm_dpo/beta": 0.020320210605859756, "fcm_dpo/delta": -0.03590092435479164, "fcm_dpo/margin": 21.33620262145996, "fcm_dpo/q_t": 0.40818527340888977, "grad_norm": 26.31541633605957, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.6587375402450562, "logits/rejected": -0.6542388796806335, "logps/chosen": -98.9044189453125, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -153.27796936035156, "loss": 1.1371, "margin_dpo/margin_mean": 21.33620262145996, "margin_dpo/margin_std": 36.64844512939453, "step": 88 }, { "KL/chosen_KL_mean": -15.452627182006836, "KL/mean": -27.265708923339844, "KL/rejected_KL_mean": -39.078792572021484, "KL/std": 28.668766021728516, "epoch": 0.13069016152716592, "fcm_dpo/beta": 0.02023715153336525, "fcm_dpo/delta": -0.08208386600017548, "fcm_dpo/margin": 23.62615966796875, "fcm_dpo/q_t": 0.3936477303504944, "grad_norm": 23.98124122619629, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.7175350189208984, "logits/rejected": -0.6864731311798096, "logps/chosen": -76.07754516601562, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -121.16233825683594, "loss": 1.0599, "margin_dpo/margin_mean": 23.626155853271484, "margin_dpo/margin_std": 32.83501434326172, "step": 89 }, { "KL/chosen_KL_mean": -19.54310417175293, "KL/mean": -35.97503662109375, "KL/rejected_KL_mean": -52.406978607177734, "KL/std": 39.376708984375, "epoch": 0.13215859030837004, "fcm_dpo/beta": 0.01951216161251068, "fcm_dpo/delta": -0.2572743892669678, "fcm_dpo/margin": 32.86386489868164, "fcm_dpo/q_t": 0.37370768189430237, "grad_norm": 24.72134780883789, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.636969268321991, "logits/rejected": -0.6531749963760376, "logps/chosen": -72.82841491699219, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -163.95169067382812, "loss": 1.0349, "margin_dpo/margin_mean": 32.863868713378906, "margin_dpo/margin_std": 49.08679962158203, "step": 90 }, { "KL/chosen_KL_mean": -20.63615608215332, "KL/mean": -33.67596435546875, "KL/rejected_KL_mean": -46.71577453613281, "KL/std": 31.137435913085938, "epoch": 0.13362701908957417, "fcm_dpo/beta": 0.018853671848773956, "fcm_dpo/delta": -0.0963558554649353, "fcm_dpo/margin": 26.079620361328125, "fcm_dpo/q_t": 0.3956088721752167, "grad_norm": 24.04185676574707, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.6773035526275635, "logits/rejected": -0.645221471786499, "logps/chosen": -82.43911743164062, "logps/ref_chosen": -61.802955627441406, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -134.5897216796875, "loss": 1.0839, "margin_dpo/margin_mean": 26.079620361328125, "margin_dpo/margin_std": 40.8889274597168, "step": 91 }, { "KL/chosen_KL_mean": -18.164283752441406, "KL/mean": -31.569286346435547, "KL/rejected_KL_mean": -44.97429275512695, "KL/std": 31.935270309448242, "epoch": 0.13509544787077826, "fcm_dpo/beta": 0.01851240172982216, "fcm_dpo/delta": -0.10138699412345886, "fcm_dpo/margin": 26.81001091003418, "fcm_dpo/q_t": 0.3927502930164337, "grad_norm": 22.194721221923828, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.6895343065261841, "logits/rejected": -0.6665633916854858, "logps/chosen": -69.8050537109375, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -122.85547637939453, "loss": 1.0646, "margin_dpo/margin_mean": 26.81001091003418, "margin_dpo/margin_std": 38.659828186035156, "step": 92 }, { "KL/chosen_KL_mean": -20.49032211303711, "KL/mean": -33.852867126464844, "KL/rejected_KL_mean": -47.21541213989258, "KL/std": 29.824676513671875, "epoch": 0.13656387665198239, "fcm_dpo/beta": 0.018106218427419662, "fcm_dpo/delta": -0.08807133883237839, "fcm_dpo/margin": 26.72509002685547, "fcm_dpo/q_t": 0.39256197214126587, "grad_norm": 23.166257858276367, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.7384850978851318, "logits/rejected": -0.6940040588378906, "logps/chosen": -73.01956176757812, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.16075134277344, "logps/rejected": -124.37615966796875, "loss": 1.0447, "margin_dpo/margin_mean": 26.72509002685547, "margin_dpo/margin_std": 34.69382095336914, "step": 93 }, { "KL/chosen_KL_mean": -21.992738723754883, "KL/mean": -37.579185485839844, "KL/rejected_KL_mean": -53.16563034057617, "KL/std": 34.558380126953125, "epoch": 0.13803230543318648, "fcm_dpo/beta": 0.017487093806266785, "fcm_dpo/delta": -0.15506845712661743, "fcm_dpo/margin": 31.172901153564453, "fcm_dpo/q_t": 0.378325879573822, "grad_norm": 22.59712028503418, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6630722880363464, "logits/rejected": -0.6506177186965942, "logps/chosen": -83.21534729003906, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -152.76466369628906, "loss": 1.0039, "margin_dpo/margin_mean": 31.172901153564453, "margin_dpo/margin_std": 36.12760925292969, "step": 94 }, { "KL/chosen_KL_mean": -21.189931869506836, "KL/mean": -33.615150451660156, "KL/rejected_KL_mean": -46.040374755859375, "KL/std": 33.43156433105469, "epoch": 0.1395007342143906, "fcm_dpo/beta": 0.01729883998632431, "fcm_dpo/delta": -0.031661614775657654, "fcm_dpo/margin": 24.85043716430664, "fcm_dpo/q_t": 0.4017173647880554, "grad_norm": 21.476041793823242, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.6821004152297974, "logits/rejected": -0.655462920665741, "logps/chosen": -73.71357727050781, "logps/ref_chosen": -52.523643493652344, "logps/ref_rejected": -75.8803482055664, "logps/rejected": -121.92072296142578, "loss": 1.0881, "margin_dpo/margin_mean": 24.850439071655273, "margin_dpo/margin_std": 35.13576889038086, "step": 95 }, { "KL/chosen_KL_mean": -20.64446449279785, "KL/mean": -37.62309646606445, "KL/rejected_KL_mean": -54.601722717285156, "KL/std": 37.295902252197266, "epoch": 0.14096916299559473, "fcm_dpo/beta": 0.016751719638705254, "fcm_dpo/delta": -0.18152689933776855, "fcm_dpo/margin": 33.95726013183594, "fcm_dpo/q_t": 0.3761478066444397, "grad_norm": 21.628402709960938, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.7095851898193359, "logits/rejected": -0.6844866275787354, "logps/chosen": -82.80143737792969, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -151.19773864746094, "loss": 0.9975, "margin_dpo/margin_mean": 33.95726013183594, "margin_dpo/margin_std": 41.307655334472656, "step": 96 }, { "KL/chosen_KL_mean": -21.8316650390625, "KL/mean": -34.830955505371094, "KL/rejected_KL_mean": -47.83024215698242, "KL/std": 31.186649322509766, "epoch": 0.14243759177679882, "fcm_dpo/beta": 0.0167661365121603, "fcm_dpo/delta": -0.0386638417840004, "fcm_dpo/margin": 25.998580932617188, "fcm_dpo/q_t": 0.3992045521736145, "grad_norm": 22.661230087280273, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6369616389274597, "logits/rejected": -0.5970015525817871, "logps/chosen": -76.47802734375, "logps/ref_chosen": -54.646366119384766, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -124.79499816894531, "loss": 1.0676, "margin_dpo/margin_mean": 25.998584747314453, "margin_dpo/margin_std": 31.176647186279297, "step": 97 }, { "KL/chosen_KL_mean": -27.28506088256836, "KL/mean": -42.106605529785156, "KL/rejected_KL_mean": -56.92816162109375, "KL/std": 35.70084762573242, "epoch": 0.14390602055800295, "fcm_dpo/beta": 0.01634235680103302, "fcm_dpo/delta": -0.08897878974676132, "fcm_dpo/margin": 29.64310073852539, "fcm_dpo/q_t": 0.38946154713630676, "grad_norm": 24.021251678466797, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6834473013877869, "logits/rejected": -0.669763445854187, "logps/chosen": -92.54368591308594, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -153.45565795898438, "loss": 1.074, "margin_dpo/margin_mean": 29.64310073852539, "margin_dpo/margin_std": 42.92453384399414, "step": 98 }, { "KL/chosen_KL_mean": -21.082822799682617, "KL/mean": -39.566993713378906, "KL/rejected_KL_mean": -58.05116271972656, "KL/std": 38.398990631103516, "epoch": 0.14537444933920704, "fcm_dpo/beta": 0.015781186521053314, "fcm_dpo/delta": -0.19561892747879028, "fcm_dpo/margin": 36.968345642089844, "fcm_dpo/q_t": 0.3759078085422516, "grad_norm": 20.537532806396484, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.637118935585022, "logits/rejected": -0.639615535736084, "logps/chosen": -66.72130584716797, "logps/ref_chosen": -45.638484954833984, "logps/ref_rejected": -86.43793487548828, "logps/rejected": -144.48910522460938, "loss": 1.0012, "margin_dpo/margin_mean": 36.968345642089844, "margin_dpo/margin_std": 47.97369384765625, "step": 99 }, { "KL/chosen_KL_mean": -23.904550552368164, "KL/mean": -34.20349884033203, "KL/rejected_KL_mean": -44.5024528503418, "KL/std": 30.948299407958984, "epoch": 0.14684287812041116, "fcm_dpo/beta": 0.015916183590888977, "fcm_dpo/delta": 0.07390052080154419, "fcm_dpo/margin": 20.597900390625, "fcm_dpo/q_t": 0.4260128140449524, "grad_norm": 23.365909576416016, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6662120819091797, "logits/rejected": -0.6264818906784058, "logps/chosen": -81.49853515625, "logps/ref_chosen": -57.59397888183594, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -118.56266784667969, "loss": 1.1847, "margin_dpo/margin_mean": 20.597902297973633, "margin_dpo/margin_std": 39.958717346191406, "step": 100 }, { "KL/chosen_KL_mean": -29.311023712158203, "KL/mean": -42.13480758666992, "KL/rejected_KL_mean": -54.958587646484375, "KL/std": 38.1388053894043, "epoch": 0.14831130690161526, "fcm_dpo/beta": 0.015804601833224297, "fcm_dpo/delta": -0.006234418600797653, "fcm_dpo/margin": 25.647552490234375, "fcm_dpo/q_t": 0.41378289461135864, "grad_norm": 23.029918670654297, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.6485938429832458, "logits/rejected": -0.6187626123428345, "logps/chosen": -90.95987701416016, "logps/ref_chosen": -61.64885330200195, "logps/ref_rejected": -83.18968200683594, "logps/rejected": -138.1482696533203, "loss": 1.1411, "margin_dpo/margin_mean": 25.647552490234375, "margin_dpo/margin_std": 45.90587615966797, "step": 101 }, { "KL/chosen_KL_mean": -31.269521713256836, "KL/mean": -40.59046936035156, "KL/rejected_KL_mean": -49.91142654418945, "KL/std": 35.46381378173828, "epoch": 0.14977973568281938, "fcm_dpo/beta": 0.015855927020311356, "fcm_dpo/delta": -0.019583335146307945, "fcm_dpo/margin": 18.64190673828125, "fcm_dpo/q_t": 0.43402665853500366, "grad_norm": 26.079177856445312, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.6906998157501221, "logits/rejected": -0.6490976810455322, "logps/chosen": -95.34840393066406, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -118.09850311279297, "loss": 1.222, "margin_dpo/margin_mean": 18.641904830932617, "margin_dpo/margin_std": 42.348793029785156, "step": 102 }, { "KL/chosen_KL_mean": -25.099937438964844, "KL/mean": -44.034488677978516, "KL/rejected_KL_mean": -62.969051361083984, "KL/std": 39.63392639160156, "epoch": 0.1512481644640235, "fcm_dpo/beta": 0.015347588807344437, "fcm_dpo/delta": -0.19285300374031067, "fcm_dpo/margin": 37.86910629272461, "fcm_dpo/q_t": 0.37476682662963867, "grad_norm": 22.30409812927246, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6355269551277161, "logits/rejected": -0.6004114151000977, "logps/chosen": -86.39921569824219, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57270812988281, "logps/rejected": -156.54176330566406, "loss": 1.0005, "margin_dpo/margin_mean": 37.869102478027344, "margin_dpo/margin_std": 48.369873046875, "step": 103 }, { "KL/chosen_KL_mean": -26.97751808166504, "KL/mean": -45.011497497558594, "KL/rejected_KL_mean": -63.04547119140625, "KL/std": 43.440818786621094, "epoch": 0.1527165932452276, "fcm_dpo/beta": 0.014920437708497047, "fcm_dpo/delta": -0.14584705233573914, "fcm_dpo/margin": 36.067962646484375, "fcm_dpo/q_t": 0.38805246353149414, "grad_norm": 21.63632583618164, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.6813393831253052, "logits/rejected": -0.6534780859947205, "logps/chosen": -81.35029602050781, "logps/ref_chosen": -54.372772216796875, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -152.61019897460938, "loss": 1.0535, "margin_dpo/margin_mean": 36.06795883178711, "margin_dpo/margin_std": 53.349693298339844, "step": 104 }, { "KL/chosen_KL_mean": -25.276098251342773, "KL/mean": -50.77607727050781, "KL/rejected_KL_mean": -76.27605438232422, "KL/std": 41.882408142089844, "epoch": 0.15418502202643172, "fcm_dpo/beta": 0.014073311351239681, "fcm_dpo/delta": -0.34200507402420044, "fcm_dpo/margin": 50.99995422363281, "fcm_dpo/q_t": 0.3382055163383484, "grad_norm": 21.752716064453125, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.6410149931907654, "logits/rejected": -0.6186502575874329, "logps/chosen": -79.9150390625, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -174.24957275390625, "loss": 0.8843, "margin_dpo/margin_mean": 50.99995803833008, "margin_dpo/margin_std": 46.56443405151367, "step": 105 }, { "KL/chosen_KL_mean": -28.383203506469727, "KL/mean": -44.35602569580078, "KL/rejected_KL_mean": -60.32884216308594, "KL/std": 36.69441223144531, "epoch": 0.15565345080763582, "fcm_dpo/beta": 0.013703379780054092, "fcm_dpo/delta": -0.03964092954993248, "fcm_dpo/margin": 31.945636749267578, "fcm_dpo/q_t": 0.3993247449398041, "grad_norm": 20.311628341674805, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6390097141265869, "logits/rejected": -0.6126164197921753, "logps/chosen": -83.21609497070312, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -145.553466796875, "loss": 1.0661, "margin_dpo/margin_mean": 31.94563865661621, "margin_dpo/margin_std": 39.89073944091797, "step": 106 }, { "KL/chosen_KL_mean": -33.93869400024414, "KL/mean": -51.83905029296875, "KL/rejected_KL_mean": -69.7394027709961, "KL/std": 44.899288177490234, "epoch": 0.15712187958883994, "fcm_dpo/beta": 0.013466178439557552, "fcm_dpo/delta": -0.087033212184906, "fcm_dpo/margin": 35.80072021484375, "fcm_dpo/q_t": 0.3910368084907532, "grad_norm": 20.68709373474121, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.6427664756774902, "logits/rejected": -0.5967296361923218, "logps/chosen": -103.6465072631836, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -164.47891235351562, "loss": 1.0566, "margin_dpo/margin_mean": 35.800716400146484, "margin_dpo/margin_std": 47.552371978759766, "step": 107 }, { "KL/chosen_KL_mean": -29.895898818969727, "KL/mean": -51.84703063964844, "KL/rejected_KL_mean": -73.79816436767578, "KL/std": 49.70708465576172, "epoch": 0.15859030837004406, "fcm_dpo/beta": 0.013104308396577835, "fcm_dpo/delta": -0.18593883514404297, "fcm_dpo/margin": 43.90226745605469, "fcm_dpo/q_t": 0.3812934160232544, "grad_norm": 20.52142906188965, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.6195969581604004, "logits/rejected": -0.5994083881378174, "logps/chosen": -85.9057846069336, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -169.59417724609375, "loss": 1.0418, "margin_dpo/margin_mean": 43.90226745605469, "margin_dpo/margin_std": 64.3550033569336, "step": 108 }, { "KL/chosen_KL_mean": -27.957853317260742, "KL/mean": -51.0029296875, "KL/rejected_KL_mean": -74.04800415039062, "KL/std": 47.15357971191406, "epoch": 0.16005873715124816, "fcm_dpo/beta": 0.012563558295369148, "fcm_dpo/delta": -0.19107185304164886, "fcm_dpo/margin": 46.09014892578125, "fcm_dpo/q_t": 0.37124601006507874, "grad_norm": 21.661203384399414, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.5873284339904785, "logits/rejected": -0.5662369132041931, "logps/chosen": -90.84333801269531, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -172.73373413085938, "loss": 1.0005, "margin_dpo/margin_mean": 46.090152740478516, "margin_dpo/margin_std": 56.48835754394531, "step": 109 }, { "KL/chosen_KL_mean": -28.93518829345703, "KL/mean": -49.65400695800781, "KL/rejected_KL_mean": -70.37283325195312, "KL/std": 47.03770065307617, "epoch": 0.16152716593245228, "fcm_dpo/beta": 0.012130336835980415, "fcm_dpo/delta": -0.11256064474582672, "fcm_dpo/margin": 41.43762969970703, "fcm_dpo/q_t": 0.3895985782146454, "grad_norm": 18.499637603759766, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.5971484780311584, "logits/rejected": -0.5627081394195557, "logps/chosen": -87.68887329101562, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -150.1228485107422, "loss": 1.0618, "margin_dpo/margin_mean": 41.43762969970703, "margin_dpo/margin_std": 56.833656311035156, "step": 110 }, { "KL/chosen_KL_mean": -32.69416046142578, "KL/mean": -56.263702392578125, "KL/rejected_KL_mean": -79.83323669433594, "KL/std": 52.9500732421875, "epoch": 0.16299559471365638, "fcm_dpo/beta": 0.011841144412755966, "fcm_dpo/delta": -0.1690835952758789, "fcm_dpo/margin": 47.13909149169922, "fcm_dpo/q_t": 0.37844541668891907, "grad_norm": 21.36251449584961, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.6595109701156616, "logits/rejected": -0.6392641067504883, "logps/chosen": -101.31826782226562, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -178.26210021972656, "loss": 1.0405, "margin_dpo/margin_mean": 47.13909149169922, "margin_dpo/margin_std": 65.69253540039062, "step": 111 }, { "KL/chosen_KL_mean": -29.994304656982422, "KL/mean": -44.81261444091797, "KL/rejected_KL_mean": -59.63092041015625, "KL/std": 36.23802947998047, "epoch": 0.1644640234948605, "fcm_dpo/beta": 0.011842923238873482, "fcm_dpo/delta": 0.050649721175432205, "fcm_dpo/margin": 29.636615753173828, "fcm_dpo/q_t": 0.41945815086364746, "grad_norm": 19.11754035949707, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6208142042160034, "logits/rejected": -0.5849310159683228, "logps/chosen": -80.24394226074219, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -124.40534973144531, "loss": 1.1323, "margin_dpo/margin_mean": 29.636615753173828, "margin_dpo/margin_std": 44.190711975097656, "step": 112 }, { "KL/chosen_KL_mean": -37.94208526611328, "KL/mean": -53.6860237121582, "KL/rejected_KL_mean": -69.4299545288086, "KL/std": 38.465965270996094, "epoch": 0.16593245227606462, "fcm_dpo/beta": 0.011969354934990406, "fcm_dpo/delta": 0.024007823318243027, "fcm_dpo/margin": 31.487873077392578, "fcm_dpo/q_t": 0.41338014602661133, "grad_norm": 19.75728988647461, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.5755143165588379, "logits/rejected": -0.5319409370422363, "logps/chosen": -104.65503692626953, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -147.39865112304688, "loss": 1.1054, "margin_dpo/margin_mean": 31.487873077392578, "margin_dpo/margin_std": 42.64485549926758, "step": 113 }, { "KL/chosen_KL_mean": -34.591270446777344, "KL/mean": -59.78630065917969, "KL/rejected_KL_mean": -84.9813232421875, "KL/std": 56.91395568847656, "epoch": 0.16740088105726872, "fcm_dpo/beta": 0.011564414948225021, "fcm_dpo/delta": -0.19607561826705933, "fcm_dpo/margin": 50.390052795410156, "fcm_dpo/q_t": 0.3780399262905121, "grad_norm": 21.318517684936523, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.5932904481887817, "logits/rejected": -0.562382698059082, "logps/chosen": -92.37635803222656, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -172.09097290039062, "loss": 1.0109, "margin_dpo/margin_mean": 50.390052795410156, "margin_dpo/margin_std": 69.09920501708984, "step": 114 }, { "KL/chosen_KL_mean": -44.7384033203125, "KL/mean": -61.84308624267578, "KL/rejected_KL_mean": -78.94776153564453, "KL/std": 54.04515838623047, "epoch": 0.16886930983847284, "fcm_dpo/beta": 0.011562837287783623, "fcm_dpo/delta": 0.004423616454005241, "fcm_dpo/margin": 34.20935821533203, "fcm_dpo/q_t": 0.41715824604034424, "grad_norm": 26.597360610961914, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.619019627571106, "logits/rejected": -0.6011543273925781, "logps/chosen": -110.321044921875, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -177.51327514648438, "loss": 1.1766, "margin_dpo/margin_mean": 34.20935821533203, "margin_dpo/margin_std": 71.29178619384766, "step": 115 }, { "KL/chosen_KL_mean": -33.04991912841797, "KL/mean": -52.28227996826172, "KL/rejected_KL_mean": -71.5146484375, "KL/std": 45.42361831665039, "epoch": 0.17033773861967694, "fcm_dpo/beta": 0.011507030576467514, "fcm_dpo/delta": -0.04481929540634155, "fcm_dpo/margin": 38.464725494384766, "fcm_dpo/q_t": 0.40007728338241577, "grad_norm": 21.224323272705078, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.5934985876083374, "logits/rejected": -0.5850518345832825, "logps/chosen": -84.45022583007812, "logps/ref_chosen": -51.40031433105469, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -152.0364990234375, "loss": 1.0662, "margin_dpo/margin_mean": 38.464725494384766, "margin_dpo/margin_std": 49.343894958496094, "step": 116 }, { "KL/chosen_KL_mean": -42.33665466308594, "KL/mean": -57.977378845214844, "KL/rejected_KL_mean": -73.61810302734375, "KL/std": 48.001014709472656, "epoch": 0.17180616740088106, "fcm_dpo/beta": 0.011499254032969475, "fcm_dpo/delta": 0.041788313537836075, "fcm_dpo/margin": 31.281452178955078, "fcm_dpo/q_t": 0.41948583722114563, "grad_norm": 25.782733917236328, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.5640000104904175, "logits/rejected": -0.526547372341156, "logps/chosen": -111.63506317138672, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.583984375, "logps/rejected": -140.20208740234375, "loss": 1.1488, "margin_dpo/margin_mean": 31.281452178955078, "margin_dpo/margin_std": 54.715789794921875, "step": 117 }, { "KL/chosen_KL_mean": -33.31963348388672, "KL/mean": -52.253089904785156, "KL/rejected_KL_mean": -71.1865463256836, "KL/std": 43.34165954589844, "epoch": 0.17327459618208516, "fcm_dpo/beta": 0.011471563950181007, "fcm_dpo/delta": -0.03593885153532028, "fcm_dpo/margin": 37.86691665649414, "fcm_dpo/q_t": 0.40212157368659973, "grad_norm": 20.143999099731445, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.6188483238220215, "logits/rejected": -0.6008873581886292, "logps/chosen": -88.96061706542969, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905975341797, "logps/rejected": -146.85560607910156, "loss": 1.072, "margin_dpo/margin_mean": 37.86691665649414, "margin_dpo/margin_std": 49.769737243652344, "step": 118 }, { "KL/chosen_KL_mean": -45.8137092590332, "KL/mean": -64.60531616210938, "KL/rejected_KL_mean": -83.39691162109375, "KL/std": 48.94361877441406, "epoch": 0.17474302496328928, "fcm_dpo/beta": 0.011316780932247639, "fcm_dpo/delta": -0.02757979929447174, "fcm_dpo/margin": 37.58320236206055, "fcm_dpo/q_t": 0.407728374004364, "grad_norm": 22.411529541015625, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.6017279624938965, "logits/rejected": -0.5897752046585083, "logps/chosen": -119.32390594482422, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.977294921875, "logps/rejected": -186.37420654296875, "loss": 1.1113, "margin_dpo/margin_mean": 37.58320236206055, "margin_dpo/margin_std": 60.125755310058594, "step": 119 }, { "KL/chosen_KL_mean": -46.51409912109375, "KL/mean": -72.65493774414062, "KL/rejected_KL_mean": -98.7957763671875, "KL/std": 60.961395263671875, "epoch": 0.1762114537444934, "fcm_dpo/beta": 0.011053888127207756, "fcm_dpo/delta": -0.1890048086643219, "fcm_dpo/margin": 52.28166961669922, "fcm_dpo/q_t": 0.37308794260025024, "grad_norm": 21.354141235351562, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.6113423109054565, "logits/rejected": -0.5809808969497681, "logps/chosen": -123.29493713378906, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -206.81951904296875, "loss": 0.9987, "margin_dpo/margin_mean": 52.281673431396484, "margin_dpo/margin_std": 64.23561096191406, "step": 120 }, { "KL/chosen_KL_mean": -45.217079162597656, "KL/mean": -74.15641784667969, "KL/rejected_KL_mean": -103.09576416015625, "KL/std": 59.655128479003906, "epoch": 0.1776798825256975, "fcm_dpo/beta": 0.010542536154389381, "fcm_dpo/delta": -0.22549036145210266, "fcm_dpo/margin": 57.878684997558594, "fcm_dpo/q_t": 0.36856669187545776, "grad_norm": 23.214689254760742, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.5403860807418823, "logits/rejected": -0.5361485481262207, "logps/chosen": -107.00697326660156, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -213.09033203125, "loss": 0.9989, "margin_dpo/margin_mean": 57.878684997558594, "margin_dpo/margin_std": 74.57313537597656, "step": 121 }, { "KL/chosen_KL_mean": -41.20557403564453, "KL/mean": -76.79521179199219, "KL/rejected_KL_mean": -112.38485717773438, "KL/std": 67.50093078613281, "epoch": 0.17914831130690162, "fcm_dpo/beta": 0.009974541142582893, "fcm_dpo/delta": -0.33350038528442383, "fcm_dpo/margin": 71.17928314208984, "fcm_dpo/q_t": 0.343948096036911, "grad_norm": 22.87774658203125, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.5265074968338013, "logits/rejected": -0.5429497957229614, "logps/chosen": -88.1077880859375, "logps/ref_chosen": -46.9022102355957, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -219.09906005859375, "loss": 0.9029, "margin_dpo/margin_mean": 71.17927551269531, "margin_dpo/margin_std": 70.96698760986328, "step": 122 }, { "KL/chosen_KL_mean": -45.384735107421875, "KL/mean": -68.55912780761719, "KL/rejected_KL_mean": -91.7335205078125, "KL/std": 57.45310592651367, "epoch": 0.18061674008810572, "fcm_dpo/beta": 0.00966709479689598, "fcm_dpo/delta": -0.051273368299007416, "fcm_dpo/margin": 46.348785400390625, "fcm_dpo/q_t": 0.4034884572029114, "grad_norm": 20.789918899536133, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.5363984107971191, "logits/rejected": -0.5196830034255981, "logps/chosen": -106.72337341308594, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.775390625, "logps/rejected": -179.5089111328125, "loss": 1.1135, "margin_dpo/margin_mean": 46.34878158569336, "margin_dpo/margin_std": 77.05763244628906, "step": 123 }, { "KL/chosen_KL_mean": -51.98781967163086, "KL/mean": -86.58702087402344, "KL/rejected_KL_mean": -121.18624114990234, "KL/std": 78.8485107421875, "epoch": 0.18208516886930984, "fcm_dpo/beta": 0.009301427751779556, "fcm_dpo/delta": -0.26134854555130005, "fcm_dpo/margin": 69.19841766357422, "fcm_dpo/q_t": 0.36982783675193787, "grad_norm": 24.75668716430664, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.5215315222740173, "logits/rejected": -0.5242322683334351, "logps/chosen": -123.4361572265625, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -238.76681518554688, "loss": 1.0166, "margin_dpo/margin_mean": 69.19841766357422, "margin_dpo/margin_std": 98.82803344726562, "step": 124 }, { "KL/chosen_KL_mean": -42.808067321777344, "KL/mean": -73.27595520019531, "KL/rejected_KL_mean": -103.74385070800781, "KL/std": 66.60159301757812, "epoch": 0.18355359765051396, "fcm_dpo/beta": 0.00900559313595295, "fcm_dpo/delta": -0.1572001874446869, "fcm_dpo/margin": 60.93578338623047, "fcm_dpo/q_t": 0.38217341899871826, "grad_norm": 19.192096710205078, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.4833022356033325, "logits/rejected": -0.4855707287788391, "logps/chosen": -92.94500732421875, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -187.7324676513672, "loss": 1.034, "margin_dpo/margin_mean": 60.93578338623047, "margin_dpo/margin_std": 83.12398529052734, "step": 125 }, { "KL/chosen_KL_mean": -46.238609313964844, "KL/mean": -74.72161865234375, "KL/rejected_KL_mean": -103.20464324951172, "KL/std": 60.32928466796875, "epoch": 0.18502202643171806, "fcm_dpo/beta": 0.008764306083321571, "fcm_dpo/delta": -0.10474735498428345, "fcm_dpo/margin": 56.966033935546875, "fcm_dpo/q_t": 0.39040666818618774, "grad_norm": 20.248746871948242, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.5261760354042053, "logits/rejected": -0.5226148366928101, "logps/chosen": -101.90567779541016, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -201.33441162109375, "loss": 1.048, "margin_dpo/margin_mean": 56.966033935546875, "margin_dpo/margin_std": 76.7184066772461, "step": 126 }, { "KL/chosen_KL_mean": -44.78712844848633, "KL/mean": -65.41035461425781, "KL/rejected_KL_mean": -86.03358459472656, "KL/std": 52.55406951904297, "epoch": 0.18649045521292218, "fcm_dpo/beta": 0.008787820115685463, "fcm_dpo/delta": 0.03892592340707779, "fcm_dpo/margin": 41.24645233154297, "fcm_dpo/q_t": 0.4162459075450897, "grad_norm": 20.812768936157227, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.5199460983276367, "logits/rejected": -0.5012995004653931, "logps/chosen": -101.34181213378906, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -162.82937622070312, "loss": 1.136, "margin_dpo/margin_mean": 41.24645233154297, "margin_dpo/margin_std": 66.14584350585938, "step": 127 }, { "KL/chosen_KL_mean": -47.59389877319336, "KL/mean": -69.38520812988281, "KL/rejected_KL_mean": -91.176513671875, "KL/std": 61.367488861083984, "epoch": 0.18795888399412627, "fcm_dpo/beta": 0.008857084438204765, "fcm_dpo/delta": 0.014133721590042114, "fcm_dpo/margin": 43.582611083984375, "fcm_dpo/q_t": 0.4142889976501465, "grad_norm": 27.369966506958008, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.49865514039993286, "logits/rejected": -0.47880756855010986, "logps/chosen": -105.71485900878906, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -167.615478515625, "loss": 1.1451, "margin_dpo/margin_mean": 43.58261489868164, "margin_dpo/margin_std": 76.90882873535156, "step": 128 }, { "KL/chosen_KL_mean": -57.667259216308594, "KL/mean": -80.6406478881836, "KL/rejected_KL_mean": -103.61404418945312, "KL/std": 68.38490295410156, "epoch": 0.1894273127753304, "fcm_dpo/beta": 0.008825141936540604, "fcm_dpo/delta": -0.0057245357893407345, "fcm_dpo/margin": 45.94677734375, "fcm_dpo/q_t": 0.4152906835079193, "grad_norm": 20.145679473876953, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.5370001196861267, "logits/rejected": -0.5274189710617065, "logps/chosen": -124.58363342285156, "logps/ref_chosen": -66.91637420654297, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -200.25625610351562, "loss": 1.1535, "margin_dpo/margin_mean": 45.94677734375, "margin_dpo/margin_std": 89.10274505615234, "step": 129 }, { "KL/chosen_KL_mean": -42.64732360839844, "KL/mean": -77.17532348632812, "KL/rejected_KL_mean": -111.70331573486328, "KL/std": 69.53358459472656, "epoch": 0.19089574155653452, "fcm_dpo/beta": 0.008637124672532082, "fcm_dpo/delta": -0.2089286893606186, "fcm_dpo/margin": 69.05598449707031, "fcm_dpo/q_t": 0.37180206179618835, "grad_norm": 19.886871337890625, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.5179574489593506, "logits/rejected": -0.513495147228241, "logps/chosen": -87.31417846679688, "logps/ref_chosen": -44.66685104370117, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -194.48497009277344, "loss": 1.0019, "margin_dpo/margin_mean": 69.05598449707031, "margin_dpo/margin_std": 84.9914779663086, "step": 130 }, { "KL/chosen_KL_mean": -38.19416046142578, "KL/mean": -72.767822265625, "KL/rejected_KL_mean": -107.34147644042969, "KL/std": 69.4886474609375, "epoch": 0.19236417033773862, "fcm_dpo/beta": 0.008221091702580452, "fcm_dpo/delta": -0.1789543628692627, "fcm_dpo/margin": 69.14730834960938, "fcm_dpo/q_t": 0.3706004023551941, "grad_norm": 27.72515869140625, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.4905538558959961, "logits/rejected": -0.49695295095443726, "logps/chosen": -83.11874389648438, "logps/ref_chosen": -44.924591064453125, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -195.78549194335938, "loss": 0.9848, "margin_dpo/margin_mean": 69.14730834960938, "margin_dpo/margin_std": 75.87403869628906, "step": 131 }, { "KL/chosen_KL_mean": -50.51142120361328, "KL/mean": -78.78823852539062, "KL/rejected_KL_mean": -107.0650634765625, "KL/std": 68.71534729003906, "epoch": 0.19383259911894274, "fcm_dpo/beta": 0.008070580661296844, "fcm_dpo/delta": -0.05931827053427696, "fcm_dpo/margin": 56.55363082885742, "fcm_dpo/q_t": 0.40248197317123413, "grad_norm": 19.825626373291016, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.5076676607131958, "logits/rejected": -0.5060294270515442, "logps/chosen": -109.51251220703125, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -194.95721435546875, "loss": 1.0917, "margin_dpo/margin_mean": 56.55363082885742, "margin_dpo/margin_std": 89.22288513183594, "step": 132 }, { "KL/chosen_KL_mean": -60.22153854370117, "KL/mean": -85.20774841308594, "KL/rejected_KL_mean": -110.1939697265625, "KL/std": 62.262081146240234, "epoch": 0.19530102790014683, "fcm_dpo/beta": 0.008044019341468811, "fcm_dpo/delta": -0.002142667770385742, "fcm_dpo/margin": 49.97242736816406, "fcm_dpo/q_t": 0.41164666414260864, "grad_norm": 27.044200897216797, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.5105775594711304, "logits/rejected": -0.49744895100593567, "logps/chosen": -126.82603454589844, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -206.52752685546875, "loss": 1.115, "margin_dpo/margin_mean": 49.97242736816406, "margin_dpo/margin_std": 79.255859375, "step": 133 }, { "KL/chosen_KL_mean": -47.19751739501953, "KL/mean": -73.87967681884766, "KL/rejected_KL_mean": -100.56185150146484, "KL/std": 60.19834518432617, "epoch": 0.19676945668135096, "fcm_dpo/beta": 0.00804828479886055, "fcm_dpo/delta": -0.03093547746539116, "fcm_dpo/margin": 53.36433029174805, "fcm_dpo/q_t": 0.4034237563610077, "grad_norm": 18.76936149597168, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.4705553650856018, "logits/rejected": -0.45708662271499634, "logps/chosen": -99.26676940917969, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -188.21636962890625, "loss": 1.0719, "margin_dpo/margin_mean": 53.36433410644531, "margin_dpo/margin_std": 69.71368408203125, "step": 134 }, { "KL/chosen_KL_mean": -52.56504821777344, "KL/mean": -91.32998657226562, "KL/rejected_KL_mean": -130.09490966796875, "KL/std": 81.02059936523438, "epoch": 0.19823788546255505, "fcm_dpo/beta": 0.007756436243653297, "fcm_dpo/delta": -0.21430166065692902, "fcm_dpo/margin": 77.52987670898438, "fcm_dpo/q_t": 0.37159401178359985, "grad_norm": 23.092226028442383, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.4758632481098175, "logits/rejected": -0.5096943974494934, "logps/chosen": -102.91890716552734, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -246.0746612548828, "loss": 0.998, "margin_dpo/margin_mean": 77.52987670898438, "margin_dpo/margin_std": 98.24815368652344, "step": 135 }, { "KL/chosen_KL_mean": -62.72093963623047, "KL/mean": -88.66222381591797, "KL/rejected_KL_mean": -114.603515625, "KL/std": 74.2306137084961, "epoch": 0.19970631424375918, "fcm_dpo/beta": 0.00763201666995883, "fcm_dpo/delta": 0.0035090260207653046, "fcm_dpo/margin": 51.88257598876953, "fcm_dpo/q_t": 0.41930729150772095, "grad_norm": 20.256174087524414, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.4780592918395996, "logits/rejected": -0.4707057476043701, "logps/chosen": -127.79344940185547, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -210.92474365234375, "loss": 1.1452, "margin_dpo/margin_mean": 51.88257598876953, "margin_dpo/margin_std": 95.30119323730469, "step": 136 }, { "KL/chosen_KL_mean": -59.67708206176758, "KL/mean": -100.20204162597656, "KL/rejected_KL_mean": -140.7270050048828, "KL/std": 97.27854919433594, "epoch": 0.2011747430249633, "fcm_dpo/beta": 0.007428483106195927, "fcm_dpo/delta": -0.21530447900295258, "fcm_dpo/margin": 81.04991912841797, "fcm_dpo/q_t": 0.3775022029876709, "grad_norm": 17.79768180847168, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.4339534640312195, "logits/rejected": -0.46067190170288086, "logps/chosen": -108.43620300292969, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86376953125, "logps/rejected": -254.5907745361328, "loss": 1.0282, "margin_dpo/margin_mean": 81.0499267578125, "margin_dpo/margin_std": 117.30170440673828, "step": 137 }, { "KL/chosen_KL_mean": -62.279048919677734, "KL/mean": -92.98139953613281, "KL/rejected_KL_mean": -123.68376159667969, "KL/std": 73.94004821777344, "epoch": 0.2026431718061674, "fcm_dpo/beta": 0.007311869412660599, "fcm_dpo/delta": -0.051259323954582214, "fcm_dpo/margin": 61.40470886230469, "fcm_dpo/q_t": 0.3972048759460449, "grad_norm": 20.16287612915039, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.4445374608039856, "logits/rejected": -0.43380314111709595, "logps/chosen": -122.79869842529297, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -216.88070678710938, "loss": 1.0574, "margin_dpo/margin_mean": 61.40470886230469, "margin_dpo/margin_std": 75.6790771484375, "step": 138 }, { "KL/chosen_KL_mean": -51.84934616088867, "KL/mean": -88.24090576171875, "KL/rejected_KL_mean": -124.6324691772461, "KL/std": 69.78694152832031, "epoch": 0.20411160058737152, "fcm_dpo/beta": 0.007158602587878704, "fcm_dpo/delta": -0.12749908864498138, "fcm_dpo/margin": 72.78312683105469, "fcm_dpo/q_t": 0.3843567967414856, "grad_norm": 19.195222854614258, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.43394410610198975, "logits/rejected": -0.4245094060897827, "logps/chosen": -98.74073028564453, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -204.36044311523438, "loss": 1.0226, "margin_dpo/margin_mean": 72.78312683105469, "margin_dpo/margin_std": 90.51847839355469, "step": 139 }, { "KL/chosen_KL_mean": -62.725608825683594, "KL/mean": -95.730224609375, "KL/rejected_KL_mean": -128.73483276367188, "KL/std": 77.60523986816406, "epoch": 0.2055800293685756, "fcm_dpo/beta": 0.007023262791335583, "fcm_dpo/delta": -0.06672540307044983, "fcm_dpo/margin": 66.00922393798828, "fcm_dpo/q_t": 0.39678555727005005, "grad_norm": 19.862146377563477, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.38486558198928833, "logits/rejected": -0.36864161491394043, "logps/chosen": -121.70032501220703, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28410339355469, "logps/rejected": -212.01895141601562, "loss": 1.0714, "margin_dpo/margin_mean": 66.00922393798828, "margin_dpo/margin_std": 93.16436767578125, "step": 140 }, { "KL/chosen_KL_mean": -70.66569519042969, "KL/mean": -101.82574462890625, "KL/rejected_KL_mean": -132.9857635498047, "KL/std": 85.2192611694336, "epoch": 0.20704845814977973, "fcm_dpo/beta": 0.00697126192972064, "fcm_dpo/delta": -0.0359983891248703, "fcm_dpo/margin": 62.32007598876953, "fcm_dpo/q_t": 0.40241730213165283, "grad_norm": 24.785545349121094, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.43056273460388184, "logits/rejected": -0.4188095033168793, "logps/chosen": -145.74136352539062, "logps/ref_chosen": -75.07566833496094, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -231.17803955078125, "loss": 1.1045, "margin_dpo/margin_mean": 62.320072174072266, "margin_dpo/margin_std": 98.26313781738281, "step": 141 }, { "KL/chosen_KL_mean": -71.45011901855469, "KL/mean": -107.04524230957031, "KL/rejected_KL_mean": -142.64035034179688, "KL/std": 92.68605041503906, "epoch": 0.20851688693098386, "fcm_dpo/beta": 0.006883557885885239, "fcm_dpo/delta": -0.0946403294801712, "fcm_dpo/margin": 71.19023895263672, "fcm_dpo/q_t": 0.3939516842365265, "grad_norm": 25.63075065612793, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.40616393089294434, "logits/rejected": -0.40541693568229675, "logps/chosen": -129.47805786132812, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222961425781, "logps/rejected": -237.22259521484375, "loss": 1.0855, "margin_dpo/margin_mean": 71.19023895263672, "margin_dpo/margin_std": 109.05268859863281, "step": 142 }, { "KL/chosen_KL_mean": -75.96367645263672, "KL/mean": -98.91988372802734, "KL/rejected_KL_mean": -121.87608337402344, "KL/std": 81.95751953125, "epoch": 0.20998531571218795, "fcm_dpo/beta": 0.006882138084620237, "fcm_dpo/delta": 0.08677390962839127, "fcm_dpo/margin": 45.912410736083984, "fcm_dpo/q_t": 0.432314932346344, "grad_norm": 22.879791259765625, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.36676502227783203, "logits/rejected": -0.35898709297180176, "logps/chosen": -133.56011962890625, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -200.8756561279297, "loss": 1.1959, "margin_dpo/margin_mean": 45.91241455078125, "margin_dpo/margin_std": 95.7051010131836, "step": 143 }, { "KL/chosen_KL_mean": -67.95751953125, "KL/mean": -95.87046813964844, "KL/rejected_KL_mean": -123.78343200683594, "KL/std": 69.62263488769531, "epoch": 0.21145374449339208, "fcm_dpo/beta": 0.006947984918951988, "fcm_dpo/delta": 0.012606319040060043, "fcm_dpo/margin": 55.825904846191406, "fcm_dpo/q_t": 0.4123944938182831, "grad_norm": 20.645727157592773, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.4141218066215515, "logits/rejected": -0.40556472539901733, "logps/chosen": -127.86387634277344, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -205.78369140625, "loss": 1.1116, "margin_dpo/margin_mean": 55.82590866088867, "margin_dpo/margin_std": 82.17929077148438, "step": 144 }, { "KL/chosen_KL_mean": -65.5662841796875, "KL/mean": -95.84506225585938, "KL/rejected_KL_mean": -126.12384033203125, "KL/std": 70.3823013305664, "epoch": 0.21292217327459617, "fcm_dpo/beta": 0.00693280715495348, "fcm_dpo/delta": -0.02072506584227085, "fcm_dpo/margin": 60.55757141113281, "fcm_dpo/q_t": 0.4049571752548218, "grad_norm": 23.787511825561523, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.41279762983322144, "logits/rejected": -0.39724746346473694, "logps/chosen": -122.16694641113281, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -203.9901580810547, "loss": 1.0903, "margin_dpo/margin_mean": 60.55756378173828, "margin_dpo/margin_std": 85.34068298339844, "step": 145 }, { "KL/chosen_KL_mean": -88.61365509033203, "KL/mean": -111.90090942382812, "KL/rejected_KL_mean": -135.18817138671875, "KL/std": 75.67164611816406, "epoch": 0.2143906020558003, "fcm_dpo/beta": 0.00698929512873292, "fcm_dpo/delta": 0.0770314633846283, "fcm_dpo/margin": 46.57452392578125, "fcm_dpo/q_t": 0.42611053586006165, "grad_norm": 27.229778289794922, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.386644184589386, "logits/rejected": -0.35976487398147583, "logps/chosen": -154.61410522460938, "logps/ref_chosen": -66.00045013427734, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -216.89096069335938, "loss": 1.1846, "margin_dpo/margin_mean": 46.57452392578125, "margin_dpo/margin_std": 90.82237243652344, "step": 146 }, { "KL/chosen_KL_mean": -65.0084457397461, "KL/mean": -96.55009460449219, "KL/rejected_KL_mean": -128.09173583984375, "KL/std": 76.3470687866211, "epoch": 0.21585903083700442, "fcm_dpo/beta": 0.006981690879911184, "fcm_dpo/delta": -0.04241678863763809, "fcm_dpo/margin": 63.08330535888672, "fcm_dpo/q_t": 0.4026370644569397, "grad_norm": 20.30803108215332, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.42511412501335144, "logits/rejected": -0.4061092436313629, "logps/chosen": -118.41392517089844, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39060974121094, "logps/rejected": -199.48236083984375, "loss": 1.0927, "margin_dpo/margin_mean": 63.083309173583984, "margin_dpo/margin_std": 94.804443359375, "step": 147 }, { "KL/chosen_KL_mean": -63.701988220214844, "KL/mean": -90.98545837402344, "KL/rejected_KL_mean": -118.2689208984375, "KL/std": 76.31551361083984, "epoch": 0.2173274596182085, "fcm_dpo/beta": 0.0069200447760522366, "fcm_dpo/delta": -0.08388624340295792, "fcm_dpo/margin": 54.566932678222656, "fcm_dpo/q_t": 0.41639888286590576, "grad_norm": 18.78589630126953, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.4691488444805145, "logits/rejected": -0.46223020553588867, "logps/chosen": -128.63906860351562, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -221.36276245117188, "loss": 1.1243, "margin_dpo/margin_mean": 54.566932678222656, "margin_dpo/margin_std": 80.36856842041016, "step": 148 }, { "KL/chosen_KL_mean": -60.93719482421875, "KL/mean": -94.6129150390625, "KL/rejected_KL_mean": -128.2886199951172, "KL/std": 68.51680755615234, "epoch": 0.21879588839941264, "fcm_dpo/beta": 0.006747937761247158, "fcm_dpo/delta": -0.05886346101760864, "fcm_dpo/margin": 67.35143280029297, "fcm_dpo/q_t": 0.3960561752319336, "grad_norm": 18.21072769165039, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.4674052298069, "logits/rejected": -0.45243215560913086, "logps/chosen": -119.41095733642578, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -227.60336303710938, "loss": 1.057, "margin_dpo/margin_mean": 67.3514404296875, "margin_dpo/margin_std": 81.17979431152344, "step": 149 }, { "KL/chosen_KL_mean": -53.51652526855469, "KL/mean": -86.48956298828125, "KL/rejected_KL_mean": -119.46260833740234, "KL/std": 79.87619018554688, "epoch": 0.22026431718061673, "fcm_dpo/beta": 0.006690857000648975, "fcm_dpo/delta": -0.04441402480006218, "fcm_dpo/margin": 65.94608306884766, "fcm_dpo/q_t": 0.404508501291275, "grad_norm": 17.715530395507812, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.46521174907684326, "logits/rejected": -0.46958300471305847, "logps/chosen": -99.22233581542969, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -202.81021118164062, "loss": 1.081, "margin_dpo/margin_mean": 65.94608306884766, "margin_dpo/margin_std": 95.0557861328125, "step": 150 }, { "KL/chosen_KL_mean": -67.021728515625, "KL/mean": -100.35306549072266, "KL/rejected_KL_mean": -133.6844024658203, "KL/std": 76.07426452636719, "epoch": 0.22173274596182085, "fcm_dpo/beta": 0.006673037074506283, "fcm_dpo/delta": -0.046944983303546906, "fcm_dpo/margin": 66.66267395019531, "fcm_dpo/q_t": 0.39813345670700073, "grad_norm": 19.35520362854004, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.5319645404815674, "logits/rejected": -0.5065436363220215, "logps/chosen": -137.59255981445312, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -234.14822387695312, "loss": 1.0611, "margin_dpo/margin_mean": 66.66267395019531, "margin_dpo/margin_std": 83.35139465332031, "step": 151 }, { "KL/chosen_KL_mean": -58.77227020263672, "KL/mean": -99.20213317871094, "KL/rejected_KL_mean": -139.6320037841797, "KL/std": 78.30158996582031, "epoch": 0.22320117474302498, "fcm_dpo/beta": 0.0065421732142567635, "fcm_dpo/delta": -0.13599231839179993, "fcm_dpo/margin": 80.85972595214844, "fcm_dpo/q_t": 0.3823656737804413, "grad_norm": 20.457353591918945, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.46430838108062744, "logits/rejected": -0.46231526136398315, "logps/chosen": -118.93666076660156, "logps/ref_chosen": -60.16438674926758, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -245.7724609375, "loss": 1.0172, "margin_dpo/margin_mean": 80.85972595214844, "margin_dpo/margin_std": 96.03659057617188, "step": 152 }, { "KL/chosen_KL_mean": -60.03376007080078, "KL/mean": -94.54715728759766, "KL/rejected_KL_mean": -129.060546875, "KL/std": 85.97286224365234, "epoch": 0.22466960352422907, "fcm_dpo/beta": 0.006446994375437498, "fcm_dpo/delta": -0.04709509760141373, "fcm_dpo/margin": 69.02679443359375, "fcm_dpo/q_t": 0.4036220908164978, "grad_norm": 15.283037185668945, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.4555599093437195, "logits/rejected": -0.4504218101501465, "logps/chosen": -116.34903717041016, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -214.7163848876953, "loss": 1.091, "margin_dpo/margin_mean": 69.02678680419922, "margin_dpo/margin_std": 105.63395690917969, "step": 153 }, { "KL/chosen_KL_mean": -71.58514404296875, "KL/mean": -104.48786926269531, "KL/rejected_KL_mean": -137.3905792236328, "KL/std": 85.70448303222656, "epoch": 0.2261380323054332, "fcm_dpo/beta": 0.006422577425837517, "fcm_dpo/delta": -0.023874616250395775, "fcm_dpo/margin": 65.80543518066406, "fcm_dpo/q_t": 0.40717241168022156, "grad_norm": 18.80577278137207, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.46241965889930725, "logits/rejected": -0.4656856656074524, "logps/chosen": -134.3277130126953, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -241.63478088378906, "loss": 1.1225, "margin_dpo/margin_mean": 65.8054428100586, "margin_dpo/margin_std": 109.62528991699219, "step": 154 }, { "KL/chosen_KL_mean": -65.22631072998047, "KL/mean": -98.56913757324219, "KL/rejected_KL_mean": -131.91197204589844, "KL/std": 79.4256591796875, "epoch": 0.2276064610866373, "fcm_dpo/beta": 0.006380689330399036, "fcm_dpo/delta": -0.02670937031507492, "fcm_dpo/margin": 66.6856689453125, "fcm_dpo/q_t": 0.4044456481933594, "grad_norm": 19.153099060058594, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.4766504764556885, "logits/rejected": -0.4583345055580139, "logps/chosen": -125.8794937133789, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -209.4041748046875, "loss": 1.0951, "margin_dpo/margin_mean": 66.6856689453125, "margin_dpo/margin_std": 98.61933135986328, "step": 155 }, { "KL/chosen_KL_mean": -91.69331359863281, "KL/mean": -110.75820922851562, "KL/rejected_KL_mean": -129.82310485839844, "KL/std": 84.64684295654297, "epoch": 0.2290748898678414, "fcm_dpo/beta": 0.006421338301151991, "fcm_dpo/delta": 0.05203431844711304, "fcm_dpo/margin": 38.129791259765625, "fcm_dpo/q_t": 0.4446510076522827, "grad_norm": 28.152284622192383, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.40159350633621216, "logits/rejected": -0.3953508138656616, "logps/chosen": -161.18519592285156, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.16929626464844, "logps/rejected": -206.99240112304688, "loss": 1.2762, "margin_dpo/margin_mean": 38.129791259765625, "margin_dpo/margin_std": 111.29301452636719, "step": 156 }, { "KL/chosen_KL_mean": -82.54095458984375, "KL/mean": -125.1497573852539, "KL/rejected_KL_mean": -167.75857543945312, "KL/std": 92.44023895263672, "epoch": 0.2305433186490455, "fcm_dpo/beta": 0.006253876723349094, "fcm_dpo/delta": -0.14177267253398895, "fcm_dpo/margin": 85.21761322021484, "fcm_dpo/q_t": 0.3786003589630127, "grad_norm": 23.096948623657227, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.3750728964805603, "logits/rejected": -0.3818325996398926, "logps/chosen": -143.90939331054688, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -275.4049377441406, "loss": 1.0271, "margin_dpo/margin_mean": 85.21761322021484, "margin_dpo/margin_std": 105.14042663574219, "step": 157 }, { "KL/chosen_KL_mean": -83.61564636230469, "KL/mean": -128.85971069335938, "KL/rejected_KL_mean": -174.10379028320312, "KL/std": 114.70477294921875, "epoch": 0.23201174743024963, "fcm_dpo/beta": 0.0060958778485655785, "fcm_dpo/delta": -0.1604328155517578, "fcm_dpo/margin": 90.4881591796875, "fcm_dpo/q_t": 0.38691407442092896, "grad_norm": 19.036361694335938, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.37780940532684326, "logits/rejected": -0.39571529626846313, "logps/chosen": -141.2285614013672, "logps/ref_chosen": -57.612918853759766, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -287.7984619140625, "loss": 1.0566, "margin_dpo/margin_mean": 90.4881591796875, "margin_dpo/margin_std": 138.78392028808594, "step": 158 }, { "KL/chosen_KL_mean": -93.97590637207031, "KL/mean": -124.90428161621094, "KL/rejected_KL_mean": -155.83265686035156, "KL/std": 100.59730529785156, "epoch": 0.23348017621145375, "fcm_dpo/beta": 0.006093316245824099, "fcm_dpo/delta": 0.02347235381603241, "fcm_dpo/margin": 61.85674285888672, "fcm_dpo/q_t": 0.4160599112510681, "grad_norm": 25.451343536376953, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.39171531796455383, "logits/rejected": -0.36167389154434204, "logps/chosen": -175.5362548828125, "logps/ref_chosen": -81.56034851074219, "logps/ref_rejected": -88.89871215820312, "logps/rejected": -244.73135375976562, "loss": 1.1492, "margin_dpo/margin_mean": 61.85674285888672, "margin_dpo/margin_std": 109.38902282714844, "step": 159 }, { "KL/chosen_KL_mean": -96.04105377197266, "KL/mean": -137.7037811279297, "KL/rejected_KL_mean": -179.3665313720703, "KL/std": 107.62297058105469, "epoch": 0.23494860499265785, "fcm_dpo/beta": 0.0059524280950427055, "fcm_dpo/delta": -0.10199404507875443, "fcm_dpo/margin": 83.32546997070312, "fcm_dpo/q_t": 0.39600396156311035, "grad_norm": 22.877182006835938, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.35929036140441895, "logits/rejected": -0.35081833600997925, "logps/chosen": -161.77194213867188, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -276.5843505859375, "loss": 1.0904, "margin_dpo/margin_mean": 83.32546997070312, "margin_dpo/margin_std": 134.5235595703125, "step": 160 }, { "KL/chosen_KL_mean": -81.80270385742188, "KL/mean": -118.25526428222656, "KL/rejected_KL_mean": -154.70782470703125, "KL/std": 84.72265625, "epoch": 0.23641703377386197, "fcm_dpo/beta": 0.005920952185988426, "fcm_dpo/delta": -0.033098410815000534, "fcm_dpo/margin": 72.9051284790039, "fcm_dpo/q_t": 0.40439367294311523, "grad_norm": 21.35667610168457, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.38000649213790894, "logits/rejected": -0.38120192289352417, "logps/chosen": -134.23916625976562, "logps/ref_chosen": -52.43647003173828, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -238.13877868652344, "loss": 1.0923, "margin_dpo/margin_mean": 72.9051284790039, "margin_dpo/margin_std": 107.10279846191406, "step": 161 }, { "KL/chosen_KL_mean": -79.02703094482422, "KL/mean": -114.97813415527344, "KL/rejected_KL_mean": -150.92922973632812, "KL/std": 93.30059814453125, "epoch": 0.23788546255506607, "fcm_dpo/beta": 0.0058417608961462975, "fcm_dpo/delta": -0.022580057382583618, "fcm_dpo/margin": 71.90220642089844, "fcm_dpo/q_t": 0.40738850831985474, "grad_norm": 21.55710792541504, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.4415048360824585, "logits/rejected": -0.42762479186058044, "logps/chosen": -141.6376190185547, "logps/ref_chosen": -62.6105842590332, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -240.31982421875, "loss": 1.1092, "margin_dpo/margin_mean": 71.90221405029297, "margin_dpo/margin_std": 110.78158569335938, "step": 162 }, { "KL/chosen_KL_mean": -88.2254638671875, "KL/mean": -123.40544128417969, "KL/rejected_KL_mean": -158.58541870117188, "KL/std": 93.95071411132812, "epoch": 0.2393538913362702, "fcm_dpo/beta": 0.00587341096252203, "fcm_dpo/delta": -0.013822587206959724, "fcm_dpo/margin": 70.35995483398438, "fcm_dpo/q_t": 0.4094018042087555, "grad_norm": 19.775178909301758, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.3558083772659302, "logits/rejected": -0.33801817893981934, "logps/chosen": -143.24710083007812, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.418212890625, "logps/rejected": -234.00363159179688, "loss": 1.1137, "margin_dpo/margin_mean": 70.35994720458984, "margin_dpo/margin_std": 112.65255737304688, "step": 163 }, { "KL/chosen_KL_mean": -80.43484497070312, "KL/mean": -123.2110366821289, "KL/rejected_KL_mean": -165.98721313476562, "KL/std": 91.2215576171875, "epoch": 0.24082232011747431, "fcm_dpo/beta": 0.005744011141359806, "fcm_dpo/delta": -0.09778200834989548, "fcm_dpo/margin": 85.5523681640625, "fcm_dpo/q_t": 0.3890402913093567, "grad_norm": 21.015703201293945, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.34407860040664673, "logits/rejected": -0.3298642039299011, "logps/chosen": -136.0755157470703, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -245.65184020996094, "loss": 1.0381, "margin_dpo/margin_mean": 85.5523681640625, "margin_dpo/margin_std": 105.12376403808594, "step": 164 }, { "KL/chosen_KL_mean": -87.14698028564453, "KL/mean": -114.9844970703125, "KL/rejected_KL_mean": -142.822021484375, "KL/std": 77.58013916015625, "epoch": 0.2422907488986784, "fcm_dpo/beta": 0.0058072819374501705, "fcm_dpo/delta": 0.07930518686771393, "fcm_dpo/margin": 55.67503356933594, "fcm_dpo/q_t": 0.42748406529426575, "grad_norm": 21.678955078125, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.3774159252643585, "logits/rejected": -0.351327121257782, "logps/chosen": -148.45767211914062, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -216.49261474609375, "loss": 1.1741, "margin_dpo/margin_mean": 55.67503356933594, "margin_dpo/margin_std": 103.26383972167969, "step": 165 }, { "KL/chosen_KL_mean": -77.0203857421875, "KL/mean": -128.22457885742188, "KL/rejected_KL_mean": -179.4287872314453, "KL/std": 112.74888610839844, "epoch": 0.24375917767988253, "fcm_dpo/beta": 0.00565761886537075, "fcm_dpo/delta": -0.19167430698871613, "fcm_dpo/margin": 102.40840148925781, "fcm_dpo/q_t": 0.3809058368206024, "grad_norm": 17.48912239074707, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.30309057235717773, "logits/rejected": -0.31303203105926514, "logps/chosen": -128.0039825439453, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -277.52392578125, "loss": 1.019, "margin_dpo/margin_mean": 102.40840148925781, "margin_dpo/margin_std": 141.89622497558594, "step": 166 }, { "KL/chosen_KL_mean": -78.29979705810547, "KL/mean": -132.09503173828125, "KL/rejected_KL_mean": -185.89027404785156, "KL/std": 102.53227233886719, "epoch": 0.24522760646108663, "fcm_dpo/beta": 0.005461276508867741, "fcm_dpo/delta": -0.19935590028762817, "fcm_dpo/margin": 107.59046173095703, "fcm_dpo/q_t": 0.368857204914093, "grad_norm": 18.757923126220703, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.34970927238464355, "logits/rejected": -0.35314348340034485, "logps/chosen": -128.72389221191406, "logps/ref_chosen": -50.424095153808594, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -281.92071533203125, "loss": 0.9757, "margin_dpo/margin_mean": 107.59046936035156, "margin_dpo/margin_std": 118.7750015258789, "step": 167 }, { "KL/chosen_KL_mean": -84.44252014160156, "KL/mean": -124.02970886230469, "KL/rejected_KL_mean": -163.61688232421875, "KL/std": 95.80686950683594, "epoch": 0.24669603524229075, "fcm_dpo/beta": 0.005374173633754253, "fcm_dpo/delta": -0.02677498757839203, "fcm_dpo/margin": 79.17437744140625, "fcm_dpo/q_t": 0.4044000506401062, "grad_norm": 20.719741821289062, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.32083988189697266, "logits/rejected": -0.3103061020374298, "logps/chosen": -133.9053497314453, "logps/ref_chosen": -49.462825775146484, "logps/ref_rejected": -75.30855560302734, "logps/rejected": -238.92544555664062, "loss": 1.0804, "margin_dpo/margin_mean": 79.17437744140625, "margin_dpo/margin_std": 107.18096923828125, "step": 168 }, { "KL/chosen_KL_mean": -86.89261627197266, "KL/mean": -121.52689361572266, "KL/rejected_KL_mean": -156.1611785888672, "KL/std": 94.73361206054688, "epoch": 0.24816446402349487, "fcm_dpo/beta": 0.005301401484757662, "fcm_dpo/delta": -0.07176721096038818, "fcm_dpo/margin": 69.26856231689453, "fcm_dpo/q_t": 0.4205383062362671, "grad_norm": 21.35027503967285, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.3565632700920105, "logits/rejected": -0.34026655554771423, "logps/chosen": -146.69606018066406, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -239.50692749023438, "loss": 1.1631, "margin_dpo/margin_mean": 69.26856994628906, "margin_dpo/margin_std": 130.9052734375, "step": 169 }, { "KL/chosen_KL_mean": -77.08535766601562, "KL/mean": -117.763671875, "KL/rejected_KL_mean": -158.44198608398438, "KL/std": 90.00228881835938, "epoch": 0.24963289280469897, "fcm_dpo/beta": 0.0052484553307294846, "fcm_dpo/delta": -0.029036525636911392, "fcm_dpo/margin": 81.35664367675781, "fcm_dpo/q_t": 0.401960551738739, "grad_norm": 17.703929901123047, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.33288633823394775, "logits/rejected": -0.3227166533470154, "logps/chosen": -126.55712890625, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -234.3593292236328, "loss": 1.0812, "margin_dpo/margin_mean": 81.35664367675781, "margin_dpo/margin_std": 107.8404541015625, "step": 170 }, { "KL/chosen_KL_mean": -111.29542541503906, "KL/mean": -144.80523681640625, "KL/rejected_KL_mean": -178.31504821777344, "KL/std": 104.81484985351562, "epoch": 0.2511013215859031, "fcm_dpo/beta": 0.005305338650941849, "fcm_dpo/delta": 0.046089254319667816, "fcm_dpo/margin": 67.01963806152344, "fcm_dpo/q_t": 0.42465952038764954, "grad_norm": 29.754013061523438, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.35111328959465027, "logits/rejected": -0.3345106542110443, "logps/chosen": -195.79473876953125, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -287.6971435546875, "loss": 1.1718, "margin_dpo/margin_mean": 67.01963806152344, "margin_dpo/margin_std": 134.08868408203125, "step": 171 }, { "KL/chosen_KL_mean": -98.73640441894531, "KL/mean": -133.205810546875, "KL/rejected_KL_mean": -167.67523193359375, "KL/std": 101.24346160888672, "epoch": 0.2525697503671072, "fcm_dpo/beta": 0.005340388976037502, "fcm_dpo/delta": 0.033069491386413574, "fcm_dpo/margin": 68.93881225585938, "fcm_dpo/q_t": 0.4173203706741333, "grad_norm": 22.75365447998047, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.38472980260849, "logits/rejected": -0.3684314489364624, "logps/chosen": -167.39031982421875, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -253.1118927001953, "loss": 1.1366, "margin_dpo/margin_mean": 68.9388198852539, "margin_dpo/margin_std": 114.0185775756836, "step": 172 }, { "KL/chosen_KL_mean": -89.91429901123047, "KL/mean": -124.62344360351562, "KL/rejected_KL_mean": -159.3325958251953, "KL/std": 94.92538452148438, "epoch": 0.2540381791483113, "fcm_dpo/beta": 0.0053945546969771385, "fcm_dpo/delta": 0.026172153651714325, "fcm_dpo/margin": 69.41828918457031, "fcm_dpo/q_t": 0.41492652893066406, "grad_norm": 21.469446182250977, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.40895795822143555, "logits/rejected": -0.3820996880531311, "logps/chosen": -152.96517944335938, "logps/ref_chosen": -63.050880432128906, "logps/ref_rejected": -78.68392181396484, "logps/rejected": -238.01651000976562, "loss": 1.1124, "margin_dpo/margin_mean": 69.41828918457031, "margin_dpo/margin_std": 98.99739074707031, "step": 173 }, { "KL/chosen_KL_mean": -85.3913345336914, "KL/mean": -126.82902526855469, "KL/rejected_KL_mean": -168.26670837402344, "KL/std": 99.21536254882812, "epoch": 0.2555066079295154, "fcm_dpo/beta": 0.005375551991164684, "fcm_dpo/delta": -0.04802219197154045, "fcm_dpo/margin": 82.8753662109375, "fcm_dpo/q_t": 0.40161585807800293, "grad_norm": 21.169090270996094, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.326399028301239, "logits/rejected": -0.3440948724746704, "logps/chosen": -138.75430297851562, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -270.17791748046875, "loss": 1.0841, "margin_dpo/margin_mean": 82.8753662109375, "margin_dpo/margin_std": 118.40190887451172, "step": 174 }, { "KL/chosen_KL_mean": -78.79037475585938, "KL/mean": -135.27874755859375, "KL/rejected_KL_mean": -191.76712036132812, "KL/std": 107.09109497070312, "epoch": 0.25697503671071953, "fcm_dpo/beta": 0.005171348340809345, "fcm_dpo/delta": -0.195995032787323, "fcm_dpo/margin": 112.97673797607422, "fcm_dpo/q_t": 0.3668813109397888, "grad_norm": 27.521982192993164, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.3237273693084717, "logits/rejected": -0.31100332736968994, "logps/chosen": -124.20813751220703, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -281.2729187011719, "loss": 0.9595, "margin_dpo/margin_mean": 112.97673797607422, "margin_dpo/margin_std": 112.159912109375, "step": 175 }, { "KL/chosen_KL_mean": -85.66651916503906, "KL/mean": -133.4976348876953, "KL/rejected_KL_mean": -181.3287353515625, "KL/std": 107.46882629394531, "epoch": 0.25844346549192365, "fcm_dpo/beta": 0.005067367106676102, "fcm_dpo/delta": -0.08898322284221649, "fcm_dpo/margin": 95.66221618652344, "fcm_dpo/q_t": 0.3938441872596741, "grad_norm": 18.892227172851562, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.3252803385257721, "logits/rejected": -0.34228193759918213, "logps/chosen": -136.11935424804688, "logps/ref_chosen": -50.452842712402344, "logps/ref_rejected": -95.5589599609375, "logps/rejected": -276.8876953125, "loss": 1.0478, "margin_dpo/margin_mean": 95.66221618652344, "margin_dpo/margin_std": 125.021484375, "step": 176 }, { "KL/chosen_KL_mean": -99.08952331542969, "KL/mean": -146.7650146484375, "KL/rejected_KL_mean": -194.4405059814453, "KL/std": 116.030517578125, "epoch": 0.2599118942731278, "fcm_dpo/beta": 0.004990983754396439, "fcm_dpo/delta": -0.07962613552808762, "fcm_dpo/margin": 95.35098266601562, "fcm_dpo/q_t": 0.3944365382194519, "grad_norm": 26.08759117126465, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.36417263746261597, "logits/rejected": -0.3568111062049866, "logps/chosen": -160.30599975585938, "logps/ref_chosen": -61.216468811035156, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -290.33428955078125, "loss": 1.049, "margin_dpo/margin_mean": 95.35098266601562, "margin_dpo/margin_std": 123.23958587646484, "step": 177 }, { "KL/chosen_KL_mean": -109.24971008300781, "KL/mean": -168.47171020507812, "KL/rejected_KL_mean": -227.69369506835938, "KL/std": 135.19757080078125, "epoch": 0.26138032305433184, "fcm_dpo/beta": 0.00480748200789094, "fcm_dpo/delta": -0.18060356378555298, "fcm_dpo/margin": 118.44398498535156, "fcm_dpo/q_t": 0.37657660245895386, "grad_norm": 27.715505599975586, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.1987697333097458, "logits/rejected": -0.18940778076648712, "logps/chosen": -167.51449584960938, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.3653335571289, "logps/rejected": -333.05902099609375, "loss": 1.0039, "margin_dpo/margin_mean": 118.4439926147461, "margin_dpo/margin_std": 148.56935119628906, "step": 178 }, { "KL/chosen_KL_mean": -115.18132019042969, "KL/mean": -156.70834350585938, "KL/rejected_KL_mean": -198.23538208007812, "KL/std": 117.613525390625, "epoch": 0.26284875183553597, "fcm_dpo/beta": 0.00476008839905262, "fcm_dpo/delta": 0.004362210631370544, "fcm_dpo/margin": 83.05406188964844, "fcm_dpo/q_t": 0.41322624683380127, "grad_norm": 28.284343719482422, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.30069026350975037, "logits/rejected": -0.3113616406917572, "logps/chosen": -176.23965454101562, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -288.7632141113281, "loss": 1.1442, "margin_dpo/margin_mean": 83.05406188964844, "margin_dpo/margin_std": 147.2235107421875, "step": 179 }, { "KL/chosen_KL_mean": -99.40576171875, "KL/mean": -153.18653869628906, "KL/rejected_KL_mean": -206.9673309326172, "KL/std": 105.46197509765625, "epoch": 0.2643171806167401, "fcm_dpo/beta": 0.004700476303696632, "fcm_dpo/delta": -0.11138296127319336, "fcm_dpo/margin": 107.56156158447266, "fcm_dpo/q_t": 0.3854052722454071, "grad_norm": 20.0328369140625, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.26214757561683655, "logits/rejected": -0.25771957635879517, "logps/chosen": -153.74847412109375, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -305.1791687011719, "loss": 1.0301, "margin_dpo/margin_mean": 107.56156158447266, "margin_dpo/margin_std": 131.99119567871094, "step": 180 }, { "KL/chosen_KL_mean": -86.6176986694336, "KL/mean": -119.9801254272461, "KL/rejected_KL_mean": -153.34255981445312, "KL/std": 96.91038513183594, "epoch": 0.2657856093979442, "fcm_dpo/beta": 0.004746724851429462, "fcm_dpo/delta": 0.08582982420921326, "fcm_dpo/margin": 66.72486877441406, "fcm_dpo/q_t": 0.428183913230896, "grad_norm": 23.883167266845703, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.3121437132358551, "logits/rejected": -0.2885586619377136, "logps/chosen": -141.6181640625, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -214.99871826171875, "loss": 1.174, "margin_dpo/margin_mean": 66.72486877441406, "margin_dpo/margin_std": 122.89712524414062, "step": 181 }, { "KL/chosen_KL_mean": -82.56523895263672, "KL/mean": -142.1549530029297, "KL/rejected_KL_mean": -201.74465942382812, "KL/std": 118.71504211425781, "epoch": 0.26725403817914833, "fcm_dpo/beta": 0.00462943222373724, "fcm_dpo/delta": -0.16154250502586365, "fcm_dpo/margin": 119.17942810058594, "fcm_dpo/q_t": 0.37539470195770264, "grad_norm": 18.672292709350586, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.2760277986526489, "logits/rejected": -0.28654640913009644, "logps/chosen": -123.673095703125, "logps/ref_chosen": -41.107852935791016, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -291.2662353515625, "loss": 1.0089, "margin_dpo/margin_mean": 119.17942810058594, "margin_dpo/margin_std": 142.9564971923828, "step": 182 }, { "KL/chosen_KL_mean": -120.36442565917969, "KL/mean": -152.0966339111328, "KL/rejected_KL_mean": -183.828857421875, "KL/std": 97.2325210571289, "epoch": 0.2687224669603524, "fcm_dpo/beta": 0.004568018019199371, "fcm_dpo/delta": -0.04479080066084862, "fcm_dpo/margin": 63.46442413330078, "fcm_dpo/q_t": 0.43251746892929077, "grad_norm": 21.404373168945312, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.29337215423583984, "logits/rejected": -0.28445976972579956, "logps/chosen": -177.8889923095703, "logps/ref_chosen": -57.52456283569336, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -259.8045654296875, "loss": 1.1828, "margin_dpo/margin_mean": 63.46442413330078, "margin_dpo/margin_std": 107.15849304199219, "step": 183 }, { "KL/chosen_KL_mean": -99.98910522460938, "KL/mean": -133.18710327148438, "KL/rejected_KL_mean": -166.38510131835938, "KL/std": 91.612060546875, "epoch": 0.2701908957415565, "fcm_dpo/beta": 0.0045661963522434235, "fcm_dpo/delta": -0.003989125601947308, "fcm_dpo/margin": 66.39598083496094, "fcm_dpo/q_t": 0.4311205744743347, "grad_norm": 19.19173240661621, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.33634817600250244, "logits/rejected": -0.32674121856689453, "logps/chosen": -158.5340576171875, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -243.0191650390625, "loss": 1.1735, "margin_dpo/margin_mean": 66.39598083496094, "margin_dpo/margin_std": 115.35989379882812, "step": 184 }, { "KL/chosen_KL_mean": -108.26544189453125, "KL/mean": -134.82513427734375, "KL/rejected_KL_mean": -161.38482666015625, "KL/std": 105.25448608398438, "epoch": 0.27165932452276065, "fcm_dpo/beta": 0.004670283757150173, "fcm_dpo/delta": 0.15583746135234833, "fcm_dpo/margin": 53.11936950683594, "fcm_dpo/q_t": 0.4466909170150757, "grad_norm": 20.08793067932129, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.28927063941955566, "logits/rejected": -0.26542210578918457, "logps/chosen": -170.29129028320312, "logps/ref_chosen": -62.025848388671875, "logps/ref_rejected": -73.7625961303711, "logps/rejected": -235.1474151611328, "loss": 1.2331, "margin_dpo/margin_mean": 53.1193733215332, "margin_dpo/margin_std": 126.44635772705078, "step": 185 }, { "KL/chosen_KL_mean": -99.16331481933594, "KL/mean": -149.17770385742188, "KL/rejected_KL_mean": -199.19210815429688, "KL/std": 105.57518768310547, "epoch": 0.27312775330396477, "fcm_dpo/beta": 0.004654415883123875, "fcm_dpo/delta": -0.06891189515590668, "fcm_dpo/margin": 100.02880096435547, "fcm_dpo/q_t": 0.39436179399490356, "grad_norm": 22.96204948425293, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.36928582191467285, "logits/rejected": -0.3490529954433441, "logps/chosen": -168.51678466796875, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -287.2645568847656, "loss": 1.0462, "margin_dpo/margin_mean": 100.02880096435547, "margin_dpo/margin_std": 121.30867767333984, "step": 186 }, { "KL/chosen_KL_mean": -92.08950805664062, "KL/mean": -133.75186157226562, "KL/rejected_KL_mean": -175.4142303466797, "KL/std": 100.45945739746094, "epoch": 0.2745961820851689, "fcm_dpo/beta": 0.004670889116823673, "fcm_dpo/delta": 0.01085655763745308, "fcm_dpo/margin": 83.32472229003906, "fcm_dpo/q_t": 0.4105120003223419, "grad_norm": 21.904727935791016, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.3443525433540344, "logits/rejected": -0.3386707901954651, "logps/chosen": -144.84596252441406, "logps/ref_chosen": -52.7564582824707, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -257.3833312988281, "loss": 1.096, "margin_dpo/margin_mean": 83.32472229003906, "margin_dpo/margin_std": 108.60208129882812, "step": 187 }, { "KL/chosen_KL_mean": -86.056640625, "KL/mean": -135.95150756835938, "KL/rejected_KL_mean": -185.84640502929688, "KL/std": 111.05039978027344, "epoch": 0.27606461086637296, "fcm_dpo/beta": 0.004614308476448059, "fcm_dpo/delta": -0.06334332376718521, "fcm_dpo/margin": 99.78976440429688, "fcm_dpo/q_t": 0.3958631753921509, "grad_norm": 25.85726547241211, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.3261992633342743, "logits/rejected": -0.33161741495132446, "logps/chosen": -135.4721221923828, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -275.3868408203125, "loss": 1.0498, "margin_dpo/margin_mean": 99.7897720336914, "margin_dpo/margin_std": 121.95850372314453, "step": 188 }, { "KL/chosen_KL_mean": -99.45054626464844, "KL/mean": -141.6304931640625, "KL/rejected_KL_mean": -183.81045532226562, "KL/std": 111.99412536621094, "epoch": 0.2775330396475771, "fcm_dpo/beta": 0.004608414135873318, "fcm_dpo/delta": 0.011658096686005592, "fcm_dpo/margin": 84.35992431640625, "fcm_dpo/q_t": 0.4146321415901184, "grad_norm": 20.663795471191406, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.3318672776222229, "logits/rejected": -0.31224292516708374, "logps/chosen": -151.84950256347656, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -255.97781372070312, "loss": 1.1224, "margin_dpo/margin_mean": 84.35991668701172, "margin_dpo/margin_std": 137.48220825195312, "step": 189 }, { "KL/chosen_KL_mean": -106.70040893554688, "KL/mean": -155.38369750976562, "KL/rejected_KL_mean": -204.06698608398438, "KL/std": 119.2169418334961, "epoch": 0.2790014684287812, "fcm_dpo/beta": 0.004610296338796616, "fcm_dpo/delta": -0.05222197249531746, "fcm_dpo/margin": 97.3665771484375, "fcm_dpo/q_t": 0.4002448320388794, "grad_norm": 17.597808837890625, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.367323637008667, "logits/rejected": -0.35587793588638306, "logps/chosen": -171.3834686279297, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -306.61749267578125, "loss": 1.085, "margin_dpo/margin_mean": 97.3665771484375, "margin_dpo/margin_std": 137.46681213378906, "step": 190 }, { "KL/chosen_KL_mean": -99.138916015625, "KL/mean": -169.95510864257812, "KL/rejected_KL_mean": -240.77130126953125, "KL/std": 138.72772216796875, "epoch": 0.28046989720998533, "fcm_dpo/beta": 0.0043829334899783134, "fcm_dpo/delta": -0.23674961924552917, "fcm_dpo/margin": 141.63238525390625, "fcm_dpo/q_t": 0.3627857565879822, "grad_norm": 18.488035202026367, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.3747413754463196, "logits/rejected": -0.35563361644744873, "logps/chosen": -167.79779052734375, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -350.9109802246094, "loss": 0.9539, "margin_dpo/margin_mean": 141.63238525390625, "margin_dpo/margin_std": 152.71173095703125, "step": 191 }, { "KL/chosen_KL_mean": -128.03395080566406, "KL/mean": -174.38095092773438, "KL/rejected_KL_mean": -220.72796630859375, "KL/std": 119.869873046875, "epoch": 0.28193832599118945, "fcm_dpo/beta": 0.0043565696105360985, "fcm_dpo/delta": -0.004114950075745583, "fcm_dpo/margin": 92.69398498535156, "fcm_dpo/q_t": 0.4108489751815796, "grad_norm": 22.34331703186035, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.3097224831581116, "logits/rejected": -0.3084886074066162, "logps/chosen": -197.7608642578125, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -324.04931640625, "loss": 1.1169, "margin_dpo/margin_mean": 92.69398498535156, "margin_dpo/margin_std": 148.20654296875, "step": 192 }, { "KL/chosen_KL_mean": -128.42974853515625, "KL/mean": -156.01699829101562, "KL/rejected_KL_mean": -183.60421752929688, "KL/std": 111.40166473388672, "epoch": 0.2834067547723935, "fcm_dpo/beta": 0.004361086059361696, "fcm_dpo/delta": 0.038300659507513046, "fcm_dpo/margin": 55.174468994140625, "fcm_dpo/q_t": 0.44291800260543823, "grad_norm": 26.714811325073242, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.31071868538856506, "logits/rejected": -0.2987961769104004, "logps/chosen": -188.6202392578125, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -260.01177978515625, "loss": 1.2515, "margin_dpo/margin_mean": 55.17446517944336, "margin_dpo/margin_std": 142.05120849609375, "step": 193 }, { "KL/chosen_KL_mean": -80.51405334472656, "KL/mean": -124.77084350585938, "KL/rejected_KL_mean": -169.02764892578125, "KL/std": 93.25508117675781, "epoch": 0.28487518355359764, "fcm_dpo/beta": 0.004372420255094767, "fcm_dpo/delta": 0.013335110619664192, "fcm_dpo/margin": 88.51360321044922, "fcm_dpo/q_t": 0.40928915143013, "grad_norm": 18.569744110107422, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.22196577489376068, "logits/rejected": -0.20398879051208496, "logps/chosen": -118.35442352294922, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -229.71243286132812, "loss": 1.0846, "margin_dpo/margin_mean": 88.51359558105469, "margin_dpo/margin_std": 102.0927963256836, "step": 194 }, { "KL/chosen_KL_mean": -125.90398406982422, "KL/mean": -173.78524780273438, "KL/rejected_KL_mean": -221.66653442382812, "KL/std": 114.66876220703125, "epoch": 0.28634361233480177, "fcm_dpo/beta": 0.004376476630568504, "fcm_dpo/delta": -0.019938159734010696, "fcm_dpo/margin": 95.7625503540039, "fcm_dpo/q_t": 0.4035083055496216, "grad_norm": 21.247806549072266, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.28381267189979553, "logits/rejected": -0.3019316792488098, "logps/chosen": -180.79556274414062, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -318.4375, "loss": 1.0719, "margin_dpo/margin_mean": 95.76254272460938, "margin_dpo/margin_std": 117.87249755859375, "step": 195 }, { "KL/chosen_KL_mean": -98.61161041259766, "KL/mean": -152.5880584716797, "KL/rejected_KL_mean": -206.56451416015625, "KL/std": 116.51144409179688, "epoch": 0.2878120411160059, "fcm_dpo/beta": 0.004304712638258934, "fcm_dpo/delta": -0.06879311800003052, "fcm_dpo/margin": 107.95289611816406, "fcm_dpo/q_t": 0.3957948684692383, "grad_norm": 16.711626052856445, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.31423407793045044, "logits/rejected": -0.29917240142822266, "logps/chosen": -151.8568572998047, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -282.6174621582031, "loss": 1.0614, "margin_dpo/margin_mean": 107.95289611816406, "margin_dpo/margin_std": 139.44515991210938, "step": 196 }, { "KL/chosen_KL_mean": -100.81398010253906, "KL/mean": -142.37448120117188, "KL/rejected_KL_mean": -183.9349822998047, "KL/std": 100.89237976074219, "epoch": 0.28928046989721, "fcm_dpo/beta": 0.004351750016212463, "fcm_dpo/delta": 0.03937269002199173, "fcm_dpo/margin": 83.12100982666016, "fcm_dpo/q_t": 0.4173157811164856, "grad_norm": 18.018680572509766, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.303945392370224, "logits/rejected": -0.2977169454097748, "logps/chosen": -161.23431396484375, "logps/ref_chosen": -60.42033386230469, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -261.1438903808594, "loss": 1.116, "margin_dpo/margin_mean": 83.12100982666016, "margin_dpo/margin_std": 115.0757827758789, "step": 197 }, { "KL/chosen_KL_mean": -112.49896240234375, "KL/mean": -166.41671752929688, "KL/rejected_KL_mean": -220.33447265625, "KL/std": 126.63810729980469, "epoch": 0.2907488986784141, "fcm_dpo/beta": 0.0043054306879639626, "fcm_dpo/delta": -0.06737668812274933, "fcm_dpo/margin": 107.83549499511719, "fcm_dpo/q_t": 0.3973381817340851, "grad_norm": 19.930042266845703, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.3347511887550354, "logits/rejected": -0.3346249759197235, "logps/chosen": -167.53515625, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -317.5777282714844, "loss": 1.0595, "margin_dpo/margin_mean": 107.83550262451172, "margin_dpo/margin_std": 144.60723876953125, "step": 198 }, { "KL/chosen_KL_mean": -108.42107391357422, "KL/mean": -160.91925048828125, "KL/rejected_KL_mean": -213.41744995117188, "KL/std": 113.93467712402344, "epoch": 0.2922173274596182, "fcm_dpo/beta": 0.004234119318425655, "fcm_dpo/delta": -0.047552645206451416, "fcm_dpo/margin": 104.99636840820312, "fcm_dpo/q_t": 0.39938676357269287, "grad_norm": 21.286582946777344, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.3568047285079956, "logits/rejected": -0.34048551321029663, "logps/chosen": -165.24990844726562, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -298.065673828125, "loss": 1.0729, "margin_dpo/margin_mean": 104.99636840820312, "margin_dpo/margin_std": 139.47503662109375, "step": 199 }, { "KL/chosen_KL_mean": -106.86558532714844, "KL/mean": -160.4095916748047, "KL/rejected_KL_mean": -213.95359802246094, "KL/std": 124.127197265625, "epoch": 0.2936857562408223, "fcm_dpo/beta": 0.0041997479274868965, "fcm_dpo/delta": -0.05252185836434364, "fcm_dpo/margin": 107.0880126953125, "fcm_dpo/q_t": 0.4007049798965454, "grad_norm": 19.893062591552734, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.26880979537963867, "logits/rejected": -0.24893805384635925, "logps/chosen": -159.93264770507812, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -294.5620422363281, "loss": 1.0865, "margin_dpo/margin_mean": 107.0880126953125, "margin_dpo/margin_std": 158.81661987304688, "step": 200 }, { "KL/chosen_KL_mean": -111.46297454833984, "KL/mean": -163.86080932617188, "KL/rejected_KL_mean": -216.25863647460938, "KL/std": 128.08802795410156, "epoch": 0.29515418502202645, "fcm_dpo/beta": 0.004188035614788532, "fcm_dpo/delta": -0.04069505259394646, "fcm_dpo/margin": 104.79566955566406, "fcm_dpo/q_t": 0.4008665084838867, "grad_norm": 19.711732864379883, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.3536655306816101, "logits/rejected": -0.34921911358833313, "logps/chosen": -186.86520385742188, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -331.06683349609375, "loss": 1.0782, "margin_dpo/margin_mean": 104.79566955566406, "margin_dpo/margin_std": 143.77313232421875, "step": 201 }, { "KL/chosen_KL_mean": -115.6180191040039, "KL/mean": -152.35928344726562, "KL/rejected_KL_mean": -189.10052490234375, "KL/std": 110.5206527709961, "epoch": 0.2966226138032305, "fcm_dpo/beta": 0.004229954443871975, "fcm_dpo/delta": 0.09195201843976974, "fcm_dpo/margin": 73.4825210571289, "fcm_dpo/q_t": 0.43059661984443665, "grad_norm": 19.91847038269043, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.3016967177391052, "logits/rejected": -0.3155549168586731, "logps/chosen": -165.71932983398438, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -276.0855712890625, "loss": 1.1837, "margin_dpo/margin_mean": 73.48252868652344, "margin_dpo/margin_std": 142.18524169921875, "step": 202 }, { "KL/chosen_KL_mean": -113.98049926757812, "KL/mean": -159.70396423339844, "KL/rejected_KL_mean": -205.42742919921875, "KL/std": 113.47210693359375, "epoch": 0.29809104258443464, "fcm_dpo/beta": 0.0042491694912314415, "fcm_dpo/delta": 0.011875176802277565, "fcm_dpo/margin": 91.44692993164062, "fcm_dpo/q_t": 0.41100189089775085, "grad_norm": 20.661174774169922, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.34466350078582764, "logits/rejected": -0.33388030529022217, "logps/chosen": -174.59019470214844, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -291.3233947753906, "loss": 1.1, "margin_dpo/margin_mean": 91.44693756103516, "margin_dpo/margin_std": 124.70313262939453, "step": 203 }, { "KL/chosen_KL_mean": -128.3519287109375, "KL/mean": -170.64601135253906, "KL/rejected_KL_mean": -212.94009399414062, "KL/std": 125.15516662597656, "epoch": 0.29955947136563876, "fcm_dpo/beta": 0.004280552733689547, "fcm_dpo/delta": 0.03934932500123978, "fcm_dpo/margin": 84.58815002441406, "fcm_dpo/q_t": 0.42007431387901306, "grad_norm": 22.547698974609375, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.42533358931541443, "logits/rejected": -0.38667869567871094, "logps/chosen": -208.5168914794922, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -300.635986328125, "loss": 1.1461, "margin_dpo/margin_mean": 84.58815002441406, "margin_dpo/margin_std": 144.9606170654297, "step": 204 }, { "KL/chosen_KL_mean": -124.03070068359375, "KL/mean": -180.54966735839844, "KL/rejected_KL_mean": -237.0686492919922, "KL/std": 127.14155578613281, "epoch": 0.3010279001468429, "fcm_dpo/beta": 0.004203906282782555, "fcm_dpo/delta": -0.08103010058403015, "fcm_dpo/margin": 113.03794860839844, "fcm_dpo/q_t": 0.39093706011772156, "grad_norm": 20.967945098876953, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.31887465715408325, "logits/rejected": -0.2883029878139496, "logps/chosen": -183.41543579101562, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -322.1936950683594, "loss": 1.0529, "margin_dpo/margin_mean": 113.03794860839844, "margin_dpo/margin_std": 139.7296142578125, "step": 205 }, { "KL/chosen_KL_mean": -115.17393493652344, "KL/mean": -175.67129516601562, "KL/rejected_KL_mean": -236.16864013671875, "KL/std": 120.58004760742188, "epoch": 0.302496328928047, "fcm_dpo/beta": 0.004139425233006477, "fcm_dpo/delta": -0.10684061050415039, "fcm_dpo/margin": 120.99469757080078, "fcm_dpo/q_t": 0.38356366753578186, "grad_norm": 26.109760284423828, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.27412861585617065, "logits/rejected": -0.2763686776161194, "logps/chosen": -162.138427734375, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -335.12213134765625, "loss": 1.0156, "margin_dpo/margin_mean": 120.99468994140625, "margin_dpo/margin_std": 130.6953125, "step": 206 }, { "KL/chosen_KL_mean": -105.1968002319336, "KL/mean": -174.7778778076172, "KL/rejected_KL_mean": -244.35894775390625, "KL/std": 136.5259246826172, "epoch": 0.3039647577092511, "fcm_dpo/beta": 0.004033949691802263, "fcm_dpo/delta": -0.17078402638435364, "fcm_dpo/margin": 139.16213989257812, "fcm_dpo/q_t": 0.37533849477767944, "grad_norm": 25.737268447875977, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.3557325005531311, "logits/rejected": -0.3302071690559387, "logps/chosen": -161.2530517578125, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -328.8067626953125, "loss": 0.9897, "margin_dpo/margin_mean": 139.16213989257812, "margin_dpo/margin_std": 157.79168701171875, "step": 207 }, { "KL/chosen_KL_mean": -164.72898864746094, "KL/mean": -220.01724243164062, "KL/rejected_KL_mean": -275.3055114746094, "KL/std": 134.671875, "epoch": 0.3054331864904552, "fcm_dpo/beta": 0.003962271846830845, "fcm_dpo/delta": -0.04002426564693451, "fcm_dpo/margin": 110.57653045654297, "fcm_dpo/q_t": 0.40099895000457764, "grad_norm": 24.69184112548828, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.3562470078468323, "logits/rejected": -0.337843656539917, "logps/chosen": -231.79660034179688, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -369.5924072265625, "loss": 1.092, "margin_dpo/margin_mean": 110.57653045654297, "margin_dpo/margin_std": 163.5906982421875, "step": 208 }, { "KL/chosen_KL_mean": -138.68316650390625, "KL/mean": -187.58811950683594, "KL/rejected_KL_mean": -236.49307250976562, "KL/std": 122.4765853881836, "epoch": 0.3069016152716593, "fcm_dpo/beta": 0.003970026038587093, "fcm_dpo/delta": 0.012157567776739597, "fcm_dpo/margin": 97.8099365234375, "fcm_dpo/q_t": 0.4124048352241516, "grad_norm": 29.169567108154297, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.25604674220085144, "logits/rejected": -0.24463605880737305, "logps/chosen": -194.86485290527344, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -317.4346008300781, "loss": 1.1271, "margin_dpo/margin_mean": 97.8099365234375, "margin_dpo/margin_std": 156.21786499023438, "step": 209 }, { "KL/chosen_KL_mean": -126.92047119140625, "KL/mean": -183.3939971923828, "KL/rejected_KL_mean": -239.86752319335938, "KL/std": 125.841552734375, "epoch": 0.30837004405286345, "fcm_dpo/beta": 0.003958011977374554, "fcm_dpo/delta": -0.0493808314204216, "fcm_dpo/margin": 112.94706726074219, "fcm_dpo/q_t": 0.39954549074172974, "grad_norm": 24.660263061523438, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.2841013967990875, "logits/rejected": -0.275867760181427, "logps/chosen": -173.29229736328125, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -316.54913330078125, "loss": 1.0728, "margin_dpo/margin_mean": 112.94705200195312, "margin_dpo/margin_std": 152.02679443359375, "step": 210 }, { "KL/chosen_KL_mean": -178.33975219726562, "KL/mean": -226.99276733398438, "KL/rejected_KL_mean": -275.6457824707031, "KL/std": 142.97116088867188, "epoch": 0.30983847283406757, "fcm_dpo/beta": 0.003944946452975273, "fcm_dpo/delta": 0.01675173081457615, "fcm_dpo/margin": 97.3060302734375, "fcm_dpo/q_t": 0.4174480438232422, "grad_norm": 32.09341812133789, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.2926616668701172, "logits/rejected": -0.25190287828445435, "logps/chosen": -257.2720947265625, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -362.4667663574219, "loss": 1.141, "margin_dpo/margin_mean": 97.3060302734375, "margin_dpo/margin_std": 168.17588806152344, "step": 211 }, { "KL/chosen_KL_mean": -147.63983154296875, "KL/mean": -216.47125244140625, "KL/rejected_KL_mean": -285.30267333984375, "KL/std": 154.91241455078125, "epoch": 0.31130690161527164, "fcm_dpo/beta": 0.0038361717015504837, "fcm_dpo/delta": -0.13747426867485046, "fcm_dpo/margin": 137.662841796875, "fcm_dpo/q_t": 0.3819977641105652, "grad_norm": 25.37755584716797, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.3288855254650116, "logits/rejected": -0.32180070877075195, "logps/chosen": -205.83685302734375, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05785369873047, "logps/rejected": -388.36053466796875, "loss": 1.0332, "margin_dpo/margin_mean": 137.66285705566406, "margin_dpo/margin_std": 171.55409240722656, "step": 212 }, { "KL/chosen_KL_mean": -137.6666259765625, "KL/mean": -203.332275390625, "KL/rejected_KL_mean": -268.9979248046875, "KL/std": 133.84786987304688, "epoch": 0.31277533039647576, "fcm_dpo/beta": 0.0037672575563192368, "fcm_dpo/delta": -0.10035522282123566, "fcm_dpo/margin": 131.33128356933594, "fcm_dpo/q_t": 0.38752636313438416, "grad_norm": 25.538270950317383, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.3342677354812622, "logits/rejected": -0.3116719126701355, "logps/chosen": -205.17935180664062, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -362.9126281738281, "loss": 1.036, "margin_dpo/margin_mean": 131.33128356933594, "margin_dpo/margin_std": 160.22622680664062, "step": 213 }, { "KL/chosen_KL_mean": -119.18568420410156, "KL/mean": -178.6602020263672, "KL/rejected_KL_mean": -238.13473510742188, "KL/std": 132.51971435546875, "epoch": 0.3142437591776799, "fcm_dpo/beta": 0.0037363125011324883, "fcm_dpo/delta": -0.04649418964982033, "fcm_dpo/margin": 118.94905090332031, "fcm_dpo/q_t": 0.39983218908309937, "grad_norm": 20.97165870666504, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.23450475931167603, "logits/rejected": -0.23990775644779205, "logps/chosen": -160.7905731201172, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -315.65216064453125, "loss": 1.0677, "margin_dpo/margin_mean": 118.94905090332031, "margin_dpo/margin_std": 155.6826629638672, "step": 214 }, { "KL/chosen_KL_mean": -140.20767211914062, "KL/mean": -203.81375122070312, "KL/rejected_KL_mean": -267.41986083984375, "KL/std": 139.04193115234375, "epoch": 0.315712187958884, "fcm_dpo/beta": 0.0036775285843759775, "fcm_dpo/delta": -0.0715101957321167, "fcm_dpo/margin": 127.212158203125, "fcm_dpo/q_t": 0.39243778586387634, "grad_norm": 25.126728057861328, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.28624850511550903, "logits/rejected": -0.2777059078216553, "logps/chosen": -193.4869384765625, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -357.3844909667969, "loss": 1.0375, "margin_dpo/margin_mean": 127.212158203125, "margin_dpo/margin_std": 143.94126892089844, "step": 215 }, { "KL/chosen_KL_mean": -144.26304626464844, "KL/mean": -206.27752685546875, "KL/rejected_KL_mean": -268.2920227050781, "KL/std": 143.73822021484375, "epoch": 0.31718061674008813, "fcm_dpo/beta": 0.003656826913356781, "fcm_dpo/delta": -0.05615860968828201, "fcm_dpo/margin": 124.02898406982422, "fcm_dpo/q_t": 0.39861971139907837, "grad_norm": 26.616178512573242, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.26884669065475464, "logits/rejected": -0.2517741322517395, "logps/chosen": -193.15084838867188, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -345.490966796875, "loss": 1.0802, "margin_dpo/margin_mean": 124.02898406982422, "margin_dpo/margin_std": 176.90469360351562, "step": 216 }, { "KL/chosen_KL_mean": -149.80250549316406, "KL/mean": -223.11126708984375, "KL/rejected_KL_mean": -296.4200439453125, "KL/std": 148.53799438476562, "epoch": 0.3186490455212922, "fcm_dpo/beta": 0.0035566347651183605, "fcm_dpo/delta": -0.12853044271469116, "fcm_dpo/margin": 146.6175537109375, "fcm_dpo/q_t": 0.3826901316642761, "grad_norm": 20.384174346923828, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.2615566849708557, "logits/rejected": -0.25557541847229004, "logps/chosen": -199.64779663085938, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -396.4983825683594, "loss": 1.0109, "margin_dpo/margin_mean": 146.6175537109375, "margin_dpo/margin_std": 168.1234130859375, "step": 217 }, { "KL/chosen_KL_mean": -155.2289581298828, "KL/mean": -211.18832397460938, "KL/rejected_KL_mean": -267.147705078125, "KL/std": 144.78065490722656, "epoch": 0.3201174743024963, "fcm_dpo/beta": 0.003537412267178297, "fcm_dpo/delta": 0.0042178574949502945, "fcm_dpo/margin": 111.91874694824219, "fcm_dpo/q_t": 0.4114026129245758, "grad_norm": 23.56877326965332, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.2710033059120178, "logits/rejected": -0.26507091522216797, "logps/chosen": -213.80563354492188, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -354.9941101074219, "loss": 1.1138, "margin_dpo/margin_mean": 111.91874694824219, "margin_dpo/margin_std": 170.3919677734375, "step": 218 }, { "KL/chosen_KL_mean": -167.38955688476562, "KL/mean": -215.71990966796875, "KL/rejected_KL_mean": -264.05023193359375, "KL/std": 150.576904296875, "epoch": 0.32158590308370044, "fcm_dpo/beta": 0.003590244799852371, "fcm_dpo/delta": 0.05411606281995773, "fcm_dpo/margin": 96.66064453125, "fcm_dpo/q_t": 0.4225817918777466, "grad_norm": 27.783300399780273, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.3175503611564636, "logits/rejected": -0.31175172328948975, "logps/chosen": -228.47341918945312, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -349.88067626953125, "loss": 1.1723, "margin_dpo/margin_mean": 96.66064453125, "margin_dpo/margin_std": 184.8462371826172, "step": 219 }, { "KL/chosen_KL_mean": -190.74407958984375, "KL/mean": -230.7036590576172, "KL/rejected_KL_mean": -270.6632080078125, "KL/std": 130.32717895507812, "epoch": 0.32305433186490456, "fcm_dpo/beta": 0.0036365140695124865, "fcm_dpo/delta": 0.11278827488422394, "fcm_dpo/margin": 79.91914367675781, "fcm_dpo/q_t": 0.4320225119590759, "grad_norm": 28.48792266845703, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.3003755211830139, "logits/rejected": -0.2773016095161438, "logps/chosen": -260.775390625, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -358.3487548828125, "loss": 1.1734, "margin_dpo/margin_mean": 79.91913604736328, "margin_dpo/margin_std": 133.44189453125, "step": 220 }, { "KL/chosen_KL_mean": -163.2496337890625, "KL/mean": -252.80859375, "KL/rejected_KL_mean": -342.3675231933594, "KL/std": 164.04974365234375, "epoch": 0.3245227606461087, "fcm_dpo/beta": 0.003541819052770734, "fcm_dpo/delta": -0.24972575902938843, "fcm_dpo/margin": 179.11788940429688, "fcm_dpo/q_t": 0.3551170825958252, "grad_norm": 28.492124557495117, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.28169721364974976, "logits/rejected": -0.28886687755584717, "logps/chosen": -215.40435791015625, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -448.835205078125, "loss": 0.9307, "margin_dpo/margin_mean": 179.11788940429688, "margin_dpo/margin_std": 166.56512451171875, "step": 221 }, { "KL/chosen_KL_mean": -164.61285400390625, "KL/mean": -231.60415649414062, "KL/rejected_KL_mean": -298.595458984375, "KL/std": 149.91842651367188, "epoch": 0.32599118942731276, "fcm_dpo/beta": 0.0034665679559111595, "fcm_dpo/delta": -0.0677119642496109, "fcm_dpo/margin": 133.98260498046875, "fcm_dpo/q_t": 0.39466869831085205, "grad_norm": 19.775772094726562, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.31213629245758057, "logits/rejected": -0.3134229779243469, "logps/chosen": -225.58395385742188, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -398.59661865234375, "loss": 1.0546, "margin_dpo/margin_mean": 133.98260498046875, "margin_dpo/margin_std": 168.62554931640625, "step": 222 }, { "KL/chosen_KL_mean": -172.0750274658203, "KL/mean": -228.94554138183594, "KL/rejected_KL_mean": -285.8160705566406, "KL/std": 145.8609161376953, "epoch": 0.3274596182085169, "fcm_dpo/beta": 0.0034393020905554295, "fcm_dpo/delta": 0.009148719720542431, "fcm_dpo/margin": 113.74103546142578, "fcm_dpo/q_t": 0.4120517075061798, "grad_norm": 23.93748664855957, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.25658541917800903, "logits/rejected": -0.2517361640930176, "logps/chosen": -224.71560668945312, "logps/ref_chosen": -52.64057540893555, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -368.64111328125, "loss": 1.1262, "margin_dpo/margin_mean": 113.74102783203125, "margin_dpo/margin_std": 185.43588256835938, "step": 223 }, { "KL/chosen_KL_mean": -147.19102478027344, "KL/mean": -212.12939453125, "KL/rejected_KL_mean": -277.0677490234375, "KL/std": 160.6813507080078, "epoch": 0.328928046989721, "fcm_dpo/beta": 0.003397725522518158, "fcm_dpo/delta": -0.04458841681480408, "fcm_dpo/margin": 129.87673950195312, "fcm_dpo/q_t": 0.4026610255241394, "grad_norm": 25.208892822265625, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.29166120290756226, "logits/rejected": -0.2709968686103821, "logps/chosen": -195.78643798828125, "logps/ref_chosen": -48.59541320800781, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -354.18426513671875, "loss": 1.0858, "margin_dpo/margin_mean": 129.87673950195312, "margin_dpo/margin_std": 186.29876708984375, "step": 224 }, { "KL/chosen_KL_mean": -164.91543579101562, "KL/mean": -239.89012145996094, "KL/rejected_KL_mean": -314.86480712890625, "KL/std": 153.07418823242188, "epoch": 0.3303964757709251, "fcm_dpo/beta": 0.0033569016959518194, "fcm_dpo/delta": -0.10895158350467682, "fcm_dpo/margin": 149.94940185546875, "fcm_dpo/q_t": 0.3864118158817291, "grad_norm": 20.780384063720703, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.32863831520080566, "logits/rejected": -0.317804753780365, "logps/chosen": -222.9158935546875, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90291595458984, "logps/rejected": -414.7677307128906, "loss": 1.0318, "margin_dpo/margin_mean": 149.94940185546875, "margin_dpo/margin_std": 185.24148559570312, "step": 225 }, { "KL/chosen_KL_mean": -139.17755126953125, "KL/mean": -198.74496459960938, "KL/rejected_KL_mean": -258.3123779296875, "KL/std": 148.14669799804688, "epoch": 0.33186490455212925, "fcm_dpo/beta": 0.0033233477734029293, "fcm_dpo/delta": 0.00356471911072731, "fcm_dpo/margin": 119.13485717773438, "fcm_dpo/q_t": 0.4114220142364502, "grad_norm": 24.846914291381836, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.3634711503982544, "logits/rejected": -0.3225502371788025, "logps/chosen": -198.0763397216797, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -337.0001525878906, "loss": 1.1056, "margin_dpo/margin_mean": 119.13485717773438, "margin_dpo/margin_std": 170.89193725585938, "step": 226 }, { "KL/chosen_KL_mean": -161.0607452392578, "KL/mean": -235.2391357421875, "KL/rejected_KL_mean": -309.41754150390625, "KL/std": 167.58319091796875, "epoch": 0.3333333333333333, "fcm_dpo/beta": 0.0032854501623660326, "fcm_dpo/delta": -0.09252498298883438, "fcm_dpo/margin": 148.35679626464844, "fcm_dpo/q_t": 0.38895145058631897, "grad_norm": 21.650360107421875, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.3720400631427765, "logits/rejected": -0.35931509733200073, "logps/chosen": -220.1329345703125, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -408.82989501953125, "loss": 1.0331, "margin_dpo/margin_mean": 148.35679626464844, "margin_dpo/margin_std": 176.366943359375, "step": 227 }, { "KL/chosen_KL_mean": -169.7421875, "KL/mean": -223.08303833007812, "KL/rejected_KL_mean": -276.4239196777344, "KL/std": 140.11859130859375, "epoch": 0.33480176211453744, "fcm_dpo/beta": 0.003310044761747122, "fcm_dpo/delta": 0.04850192740559578, "fcm_dpo/margin": 106.68174743652344, "fcm_dpo/q_t": 0.41904619336128235, "grad_norm": 23.304601669311523, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.33180832862854004, "logits/rejected": -0.31371521949768066, "logps/chosen": -235.6334686279297, "logps/ref_chosen": -65.89128875732422, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -367.47265625, "loss": 1.1403, "margin_dpo/margin_mean": 106.6817398071289, "margin_dpo/margin_std": 171.9312286376953, "step": 228 }, { "KL/chosen_KL_mean": -174.5965576171875, "KL/mean": -231.82008361816406, "KL/rejected_KL_mean": -289.04364013671875, "KL/std": 162.17124938964844, "epoch": 0.33627019089574156, "fcm_dpo/beta": 0.0033262791112065315, "fcm_dpo/delta": 0.02005820721387863, "fcm_dpo/margin": 114.44705200195312, "fcm_dpo/q_t": 0.41243118047714233, "grad_norm": 36.011322021484375, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.4325563311576843, "logits/rejected": -0.4033244848251343, "logps/chosen": -245.3029327392578, "logps/ref_chosen": -70.70637512207031, "logps/ref_rejected": -84.52741241455078, "logps/rejected": -373.571044921875, "loss": 1.1159, "margin_dpo/margin_mean": 114.4470443725586, "margin_dpo/margin_std": 170.42984008789062, "step": 229 }, { "KL/chosen_KL_mean": -136.42132568359375, "KL/mean": -223.09548950195312, "KL/rejected_KL_mean": -309.7696533203125, "KL/std": 155.21017456054688, "epoch": 0.3377386196769457, "fcm_dpo/beta": 0.003240791615098715, "fcm_dpo/delta": -0.17158903181552887, "fcm_dpo/margin": 173.3483123779297, "fcm_dpo/q_t": 0.3700660765171051, "grad_norm": 35.2843132019043, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.3165392279624939, "logits/rejected": -0.3209174871444702, "logps/chosen": -175.70333862304688, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -395.3915710449219, "loss": 0.9711, "margin_dpo/margin_mean": 173.34832763671875, "margin_dpo/margin_std": 170.71090698242188, "step": 230 }, { "KL/chosen_KL_mean": -159.6990509033203, "KL/mean": -219.6187744140625, "KL/rejected_KL_mean": -279.5384826660156, "KL/std": 139.4580078125, "epoch": 0.3392070484581498, "fcm_dpo/beta": 0.00322412746027112, "fcm_dpo/delta": 0.014150663278996944, "fcm_dpo/margin": 119.83944702148438, "fcm_dpo/q_t": 0.4127604365348816, "grad_norm": 24.092422485351562, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.3582533597946167, "logits/rejected": -0.3271549940109253, "logps/chosen": -222.97549438476562, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -353.66241455078125, "loss": 1.1, "margin_dpo/margin_mean": 119.83944702148438, "margin_dpo/margin_std": 162.927978515625, "step": 231 }, { "KL/chosen_KL_mean": -195.55023193359375, "KL/mean": -243.71401977539062, "KL/rejected_KL_mean": -291.87774658203125, "KL/std": 161.44937133789062, "epoch": 0.3406754772393539, "fcm_dpo/beta": 0.003260795958340168, "fcm_dpo/delta": 0.0887773260474205, "fcm_dpo/margin": 96.32752227783203, "fcm_dpo/q_t": 0.43005359172821045, "grad_norm": 22.96601676940918, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.38562819361686707, "logits/rejected": -0.3633359670639038, "logps/chosen": -266.29901123046875, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -375.8548278808594, "loss": 1.164, "margin_dpo/margin_mean": 96.3275146484375, "margin_dpo/margin_std": 165.77828979492188, "step": 232 }, { "KL/chosen_KL_mean": -174.73155212402344, "KL/mean": -252.10891723632812, "KL/rejected_KL_mean": -329.48626708984375, "KL/std": 172.6971435546875, "epoch": 0.342143906020558, "fcm_dpo/beta": 0.003234952688217163, "fcm_dpo/delta": -0.10582563281059265, "fcm_dpo/margin": 154.7547149658203, "fcm_dpo/q_t": 0.3919374644756317, "grad_norm": 22.466392517089844, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.3720843195915222, "logits/rejected": -0.373913049697876, "logps/chosen": -229.61448669433594, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.4800796508789, "logps/rejected": -436.96630859375, "loss": 1.064, "margin_dpo/margin_mean": 154.7547149658203, "margin_dpo/margin_std": 224.2666015625, "step": 233 }, { "KL/chosen_KL_mean": -169.48391723632812, "KL/mean": -249.71534729003906, "KL/rejected_KL_mean": -329.9467468261719, "KL/std": 153.66720581054688, "epoch": 0.3436123348017621, "fcm_dpo/beta": 0.0031358040869235992, "fcm_dpo/delta": -0.11081574857234955, "fcm_dpo/margin": 160.4628448486328, "fcm_dpo/q_t": 0.3850485682487488, "grad_norm": 30.547178268432617, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.3440871834754944, "logits/rejected": -0.37053510546684265, "logps/chosen": -213.57843017578125, "logps/ref_chosen": -44.094520568847656, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -429.953369140625, "loss": 1.0167, "margin_dpo/margin_mean": 160.4628448486328, "margin_dpo/margin_std": 173.27178955078125, "step": 234 }, { "KL/chosen_KL_mean": -202.32957458496094, "KL/mean": -254.01788330078125, "KL/rejected_KL_mean": -305.7062072753906, "KL/std": 147.2139892578125, "epoch": 0.34508076358296624, "fcm_dpo/beta": 0.0031859464943408966, "fcm_dpo/delta": 0.072694793343544, "fcm_dpo/margin": 103.37664031982422, "fcm_dpo/q_t": 0.4246324300765991, "grad_norm": 26.673070907592773, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.40312904119491577, "logits/rejected": -0.38146403431892395, "logps/chosen": -264.5675048828125, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39506530761719, "logps/rejected": -396.10125732421875, "loss": 1.1606, "margin_dpo/margin_mean": 103.37664031982422, "margin_dpo/margin_std": 178.27023315429688, "step": 235 }, { "KL/chosen_KL_mean": -145.07064819335938, "KL/mean": -228.562255859375, "KL/rejected_KL_mean": -312.0538635253906, "KL/std": 156.35775756835938, "epoch": 0.3465491923641703, "fcm_dpo/beta": 0.003130989382043481, "fcm_dpo/delta": -0.12949799001216888, "fcm_dpo/margin": 166.98321533203125, "fcm_dpo/q_t": 0.37771958112716675, "grad_norm": 37.26255416870117, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.3943854570388794, "logits/rejected": -0.3957618474960327, "logps/chosen": -194.41201782226562, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -415.56549072265625, "loss": 0.9815, "margin_dpo/margin_mean": 166.98321533203125, "margin_dpo/margin_std": 149.95436096191406, "step": 236 }, { "KL/chosen_KL_mean": -200.8836669921875, "KL/mean": -262.3459777832031, "KL/rejected_KL_mean": -323.80828857421875, "KL/std": 158.80929565429688, "epoch": 0.34801762114537443, "fcm_dpo/beta": 0.0031184733379632235, "fcm_dpo/delta": 0.01728936657309532, "fcm_dpo/margin": 122.92462921142578, "fcm_dpo/q_t": 0.41311001777648926, "grad_norm": 30.76072120666504, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.36829280853271484, "logits/rejected": -0.36081379652023315, "logps/chosen": -255.05178833007812, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -418.5886535644531, "loss": 1.1151, "margin_dpo/margin_mean": 122.92462921142578, "margin_dpo/margin_std": 184.13796997070312, "step": 237 }, { "KL/chosen_KL_mean": -175.17578125, "KL/mean": -237.3186492919922, "KL/rejected_KL_mean": -299.4615478515625, "KL/std": 155.89273071289062, "epoch": 0.34948604992657856, "fcm_dpo/beta": 0.0031098374165594578, "fcm_dpo/delta": 0.01364682987332344, "fcm_dpo/margin": 124.28575134277344, "fcm_dpo/q_t": 0.4137893319129944, "grad_norm": 22.210086822509766, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.3522963523864746, "logits/rejected": -0.3655349612236023, "logps/chosen": -229.14889526367188, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -388.8794860839844, "loss": 1.1056, "margin_dpo/margin_mean": 124.28575134277344, "margin_dpo/margin_std": 175.60623168945312, "step": 238 }, { "KL/chosen_KL_mean": -194.32131958007812, "KL/mean": -264.3119201660156, "KL/rejected_KL_mean": -334.3025207519531, "KL/std": 149.57962036132812, "epoch": 0.3509544787077827, "fcm_dpo/beta": 0.0030899234116077423, "fcm_dpo/delta": -0.0351216085255146, "fcm_dpo/margin": 139.981201171875, "fcm_dpo/q_t": 0.3998359739780426, "grad_norm": 27.21063804626465, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.34761273860931396, "logits/rejected": -0.3327868580818176, "logps/chosen": -252.4191436767578, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -427.89544677734375, "loss": 1.0671, "margin_dpo/margin_mean": 139.981201171875, "margin_dpo/margin_std": 168.2056121826172, "step": 239 }, { "KL/chosen_KL_mean": -205.28285217285156, "KL/mean": -263.65673828125, "KL/rejected_KL_mean": -322.0306396484375, "KL/std": 164.07862854003906, "epoch": 0.3524229074889868, "fcm_dpo/beta": 0.0031123950611799955, "fcm_dpo/delta": 0.03792831301689148, "fcm_dpo/margin": 116.74779510498047, "fcm_dpo/q_t": 0.4176512360572815, "grad_norm": 35.285789489746094, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.4174574017524719, "logits/rejected": -0.39536041021347046, "logps/chosen": -265.8973388671875, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -396.149169921875, "loss": 1.1445, "margin_dpo/margin_mean": 116.74779510498047, "margin_dpo/margin_std": 196.01150512695312, "step": 240 }, { "KL/chosen_KL_mean": -170.59561157226562, "KL/mean": -261.97332763671875, "KL/rejected_KL_mean": -353.35101318359375, "KL/std": 178.49505615234375, "epoch": 0.35389133627019087, "fcm_dpo/beta": 0.003058013506233692, "fcm_dpo/delta": -0.1680062711238861, "fcm_dpo/margin": 182.75537109375, "fcm_dpo/q_t": 0.37589675188064575, "grad_norm": 23.488187789916992, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.4499303102493286, "logits/rejected": -0.42419755458831787, "logps/chosen": -236.68667602539062, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -441.41192626953125, "loss": 0.9946, "margin_dpo/margin_mean": 182.75540161132812, "margin_dpo/margin_std": 207.28306579589844, "step": 241 }, { "KL/chosen_KL_mean": -195.0386199951172, "KL/mean": -261.244140625, "KL/rejected_KL_mean": -327.4496154785156, "KL/std": 152.1888427734375, "epoch": 0.355359765051395, "fcm_dpo/beta": 0.003036870388314128, "fcm_dpo/delta": -0.0025676079094409943, "fcm_dpo/margin": 132.41098022460938, "fcm_dpo/q_t": 0.40894001722335815, "grad_norm": 31.73065185546875, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.3427576422691345, "logits/rejected": -0.31177082657814026, "logps/chosen": -262.9025573730469, "logps/ref_chosen": -67.86392974853516, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -410.8099365234375, "loss": 1.0916, "margin_dpo/margin_mean": 132.41098022460938, "margin_dpo/margin_std": 177.2808380126953, "step": 242 }, { "KL/chosen_KL_mean": -196.42782592773438, "KL/mean": -277.6934814453125, "KL/rejected_KL_mean": -358.9591064453125, "KL/std": 171.84295654296875, "epoch": 0.3568281938325991, "fcm_dpo/beta": 0.002979143988341093, "fcm_dpo/delta": -0.0886797159910202, "fcm_dpo/margin": 162.53128051757812, "fcm_dpo/q_t": 0.38965845108032227, "grad_norm": 21.928974151611328, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.3825536072254181, "logits/rejected": -0.349841833114624, "logps/chosen": -259.5120849609375, "logps/ref_chosen": -63.0842399597168, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -435.29473876953125, "loss": 1.0322, "margin_dpo/margin_mean": 162.53128051757812, "margin_dpo/margin_std": 191.0034637451172, "step": 243 }, { "KL/chosen_KL_mean": -171.73031616210938, "KL/mean": -254.4647979736328, "KL/rejected_KL_mean": -337.19927978515625, "KL/std": 164.24757385253906, "epoch": 0.35829662261380324, "fcm_dpo/beta": 0.002924954518675804, "fcm_dpo/delta": -0.08855760842561722, "fcm_dpo/margin": 165.46893310546875, "fcm_dpo/q_t": 0.38810837268829346, "grad_norm": 32.71875762939453, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.43552297353744507, "logits/rejected": -0.4287059009075165, "logps/chosen": -232.87100219726562, "logps/ref_chosen": -61.140689849853516, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -432.0912170410156, "loss": 1.0172, "margin_dpo/margin_mean": 165.4689483642578, "margin_dpo/margin_std": 169.16017150878906, "step": 244 }, { "KL/chosen_KL_mean": -196.77499389648438, "KL/mean": -261.5411376953125, "KL/rejected_KL_mean": -326.3072509765625, "KL/std": 160.04470825195312, "epoch": 0.35976505139500736, "fcm_dpo/beta": 0.0029053720645606518, "fcm_dpo/delta": 0.023488402366638184, "fcm_dpo/margin": 129.53224182128906, "fcm_dpo/q_t": 0.4132363200187683, "grad_norm": 32.37158966064453, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.43005210161209106, "logits/rejected": -0.40020519495010376, "logps/chosen": -264.03729248046875, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -413.94732666015625, "loss": 1.1129, "margin_dpo/margin_mean": 129.53224182128906, "margin_dpo/margin_std": 176.98114013671875, "step": 245 }, { "KL/chosen_KL_mean": -197.54446411132812, "KL/mean": -263.78521728515625, "KL/rejected_KL_mean": -330.02593994140625, "KL/std": 181.79562377929688, "epoch": 0.36123348017621143, "fcm_dpo/beta": 0.002928508911281824, "fcm_dpo/delta": 0.012364866212010384, "fcm_dpo/margin": 132.4814910888672, "fcm_dpo/q_t": 0.41279125213623047, "grad_norm": 29.707550048828125, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.4249339699745178, "logits/rejected": -0.3837849497795105, "logps/chosen": -264.241455078125, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -414.372314453125, "loss": 1.1073, "margin_dpo/margin_mean": 132.4814910888672, "margin_dpo/margin_std": 192.24024963378906, "step": 246 }, { "KL/chosen_KL_mean": -230.63812255859375, "KL/mean": -323.56597900390625, "KL/rejected_KL_mean": -416.493896484375, "KL/std": 187.34368896484375, "epoch": 0.36270190895741555, "fcm_dpo/beta": 0.002874248195439577, "fcm_dpo/delta": -0.1419781595468521, "fcm_dpo/margin": 185.85574340820312, "fcm_dpo/q_t": 0.3779839277267456, "grad_norm": 32.81269836425781, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.36910104751586914, "logits/rejected": -0.3661007285118103, "logps/chosen": -287.24346923828125, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29326629638672, "logps/rejected": -522.787109375, "loss": 1.0, "margin_dpo/margin_mean": 185.8557586669922, "margin_dpo/margin_std": 202.15628051757812, "step": 247 }, { "KL/chosen_KL_mean": -196.98757934570312, "KL/mean": -277.89544677734375, "KL/rejected_KL_mean": -358.8033447265625, "KL/std": 152.73196411132812, "epoch": 0.3641703377386197, "fcm_dpo/beta": 0.0028204985428601503, "fcm_dpo/delta": -0.05965063348412514, "fcm_dpo/margin": 161.81573486328125, "fcm_dpo/q_t": 0.3918088674545288, "grad_norm": 24.8028564453125, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.3791336119174957, "logits/rejected": -0.3977039158344269, "logps/chosen": -241.03079223632812, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -450.66021728515625, "loss": 1.0232, "margin_dpo/margin_mean": 161.81573486328125, "margin_dpo/margin_std": 147.67108154296875, "step": 248 }, { "KL/chosen_KL_mean": -255.34439086914062, "KL/mean": -299.1787109375, "KL/rejected_KL_mean": -343.01300048828125, "KL/std": 171.1071014404297, "epoch": 0.3656387665198238, "fcm_dpo/beta": 0.0028861965984106064, "fcm_dpo/delta": 0.15084651112556458, "fcm_dpo/margin": 87.6685791015625, "fcm_dpo/q_t": 0.44312575459480286, "grad_norm": 36.00886154174805, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.41152477264404297, "logits/rejected": -0.38563063740730286, "logps/chosen": -317.7867431640625, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -423.4810485839844, "loss": 1.2297, "margin_dpo/margin_mean": 87.6685791015625, "margin_dpo/margin_std": 201.31149291992188, "step": 249 }, { "KL/chosen_KL_mean": -218.36785888671875, "KL/mean": -301.58306884765625, "KL/rejected_KL_mean": -384.79827880859375, "KL/std": 167.09228515625, "epoch": 0.3671071953010279, "fcm_dpo/beta": 0.0028843069449067116, "fcm_dpo/delta": -0.08404796570539474, "fcm_dpo/margin": 166.430419921875, "fcm_dpo/q_t": 0.3882160782814026, "grad_norm": 38.20355987548828, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.3953922390937805, "logits/rejected": -0.3377394676208496, "logps/chosen": -284.0045471191406, "logps/ref_chosen": -65.63668823242188, "logps/ref_rejected": -73.87184143066406, "logps/rejected": -458.67010498046875, "loss": 1.0207, "margin_dpo/margin_mean": 166.430419921875, "margin_dpo/margin_std": 170.97409057617188, "step": 250 }, { "KL/chosen_KL_mean": -231.93658447265625, "KL/mean": -286.5303649902344, "KL/rejected_KL_mean": -341.1241455078125, "KL/std": 173.23834228515625, "epoch": 0.368575624082232, "fcm_dpo/beta": 0.002891149837523699, "fcm_dpo/delta": 0.08716142922639847, "fcm_dpo/margin": 109.18754577636719, "fcm_dpo/q_t": 0.42818212509155273, "grad_norm": 36.455989837646484, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.3735540509223938, "logits/rejected": -0.34010833501815796, "logps/chosen": -289.11932373046875, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -418.7875671386719, "loss": 1.1682, "margin_dpo/margin_mean": 109.18755340576172, "margin_dpo/margin_std": 187.57749938964844, "step": 251 }, { "KL/chosen_KL_mean": -218.92428588867188, "KL/mean": -294.3023986816406, "KL/rejected_KL_mean": -369.68048095703125, "KL/std": 150.38790893554688, "epoch": 0.3700440528634361, "fcm_dpo/beta": 0.002887298120185733, "fcm_dpo/delta": -0.03713885694742203, "fcm_dpo/margin": 150.75619506835938, "fcm_dpo/q_t": 0.39595597982406616, "grad_norm": 26.218870162963867, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.45149749517440796, "logits/rejected": -0.4118601083755493, "logps/chosen": -290.60992431640625, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75799560546875, "logps/rejected": -454.4384765625, "loss": 1.0427, "margin_dpo/margin_mean": 150.75619506835938, "margin_dpo/margin_std": 148.70468139648438, "step": 252 }, { "KL/chosen_KL_mean": -190.15496826171875, "KL/mean": -262.7193298339844, "KL/rejected_KL_mean": -335.2836608886719, "KL/std": 159.39236450195312, "epoch": 0.37151248164464024, "fcm_dpo/beta": 0.0028968360275030136, "fcm_dpo/delta": -0.021846629679203033, "fcm_dpo/margin": 145.12869262695312, "fcm_dpo/q_t": 0.40177327394485474, "grad_norm": 25.34404182434082, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.4528924822807312, "logits/rejected": -0.444851279258728, "logps/chosen": -259.2889099121094, "logps/ref_chosen": -69.1339340209961, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -433.9862060546875, "loss": 1.0731, "margin_dpo/margin_mean": 145.12869262695312, "margin_dpo/margin_std": 174.61839294433594, "step": 253 }, { "KL/chosen_KL_mean": -183.0496368408203, "KL/mean": -245.39013671875, "KL/rejected_KL_mean": -307.73065185546875, "KL/std": 165.1351318359375, "epoch": 0.37298091042584436, "fcm_dpo/beta": 0.0028911656700074673, "fcm_dpo/delta": 0.04102412983775139, "fcm_dpo/margin": 124.68098449707031, "fcm_dpo/q_t": 0.41999658942222595, "grad_norm": 24.079992294311523, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.40010130405426025, "logits/rejected": -0.3824934661388397, "logps/chosen": -237.2046356201172, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -398.03826904296875, "loss": 1.139, "margin_dpo/margin_mean": 124.68099975585938, "margin_dpo/margin_std": 206.97445678710938, "step": 254 }, { "KL/chosen_KL_mean": -189.504150390625, "KL/mean": -255.857666015625, "KL/rejected_KL_mean": -322.211181640625, "KL/std": 146.82505798339844, "epoch": 0.3744493392070485, "fcm_dpo/beta": 0.0028981873765587807, "fcm_dpo/delta": 0.015838047489523888, "fcm_dpo/margin": 132.70704650878906, "fcm_dpo/q_t": 0.41084420680999756, "grad_norm": 21.63443946838379, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.3540547490119934, "logits/rejected": -0.34169769287109375, "logps/chosen": -246.64581298828125, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -412.4197998046875, "loss": 1.1089, "margin_dpo/margin_mean": 132.70704650878906, "margin_dpo/margin_std": 186.76455688476562, "step": 255 }, { "KL/chosen_KL_mean": -168.35980224609375, "KL/mean": -241.50079345703125, "KL/rejected_KL_mean": -314.6417541503906, "KL/std": 160.8679656982422, "epoch": 0.37591776798825255, "fcm_dpo/beta": 0.002899360843002796, "fcm_dpo/delta": -0.025190845131874084, "fcm_dpo/margin": 146.28195190429688, "fcm_dpo/q_t": 0.4021248519420624, "grad_norm": 26.03676414489746, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.33700472116470337, "logits/rejected": -0.34537577629089355, "logps/chosen": -223.52330017089844, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -407.2046813964844, "loss": 1.0717, "margin_dpo/margin_mean": 146.28195190429688, "margin_dpo/margin_std": 182.9776611328125, "step": 256 }, { "KL/chosen_KL_mean": -165.2210693359375, "KL/mean": -241.1984100341797, "KL/rejected_KL_mean": -317.1757507324219, "KL/std": 167.52880859375, "epoch": 0.37738619676945667, "fcm_dpo/beta": 0.0028601905796676874, "fcm_dpo/delta": -0.03735721856355667, "fcm_dpo/margin": 151.95468139648438, "fcm_dpo/q_t": 0.4005059599876404, "grad_norm": 21.53156089782715, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.3417869806289673, "logits/rejected": -0.3275744915008545, "logps/chosen": -214.644775390625, "logps/ref_chosen": -49.42369842529297, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -396.71368408203125, "loss": 1.0698, "margin_dpo/margin_mean": 151.95468139648438, "margin_dpo/margin_std": 187.14981079101562, "step": 257 }, { "KL/chosen_KL_mean": -218.06576538085938, "KL/mean": -292.39105224609375, "KL/rejected_KL_mean": -366.71636962890625, "KL/std": 176.79669189453125, "epoch": 0.3788546255506608, "fcm_dpo/beta": 0.0028670839965343475, "fcm_dpo/delta": -0.02737291157245636, "fcm_dpo/margin": 148.650634765625, "fcm_dpo/q_t": 0.40125665068626404, "grad_norm": 25.97199249267578, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.32123446464538574, "logits/rejected": -0.3179280161857605, "logps/chosen": -277.44989013671875, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.99010467529297, "logps/rejected": -462.70648193359375, "loss": 1.0901, "margin_dpo/margin_mean": 148.650634765625, "margin_dpo/margin_std": 211.3903045654297, "step": 258 }, { "KL/chosen_KL_mean": -207.76504516601562, "KL/mean": -270.2027893066406, "KL/rejected_KL_mean": -332.64056396484375, "KL/std": 169.08270263671875, "epoch": 0.3803230543318649, "fcm_dpo/beta": 0.002867575269192457, "fcm_dpo/delta": 0.04339686781167984, "fcm_dpo/margin": 124.87550354003906, "fcm_dpo/q_t": 0.4185197353363037, "grad_norm": 22.1605281829834, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.3787084221839905, "logits/rejected": -0.37561601400375366, "logps/chosen": -260.5933837890625, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.191650390625, "logps/rejected": -421.83221435546875, "loss": 1.1309, "margin_dpo/margin_mean": 124.8755111694336, "margin_dpo/margin_std": 190.36257934570312, "step": 259 }, { "KL/chosen_KL_mean": -206.97616577148438, "KL/mean": -292.4883728027344, "KL/rejected_KL_mean": -378.00054931640625, "KL/std": 176.58251953125, "epoch": 0.38179148311306904, "fcm_dpo/beta": 0.002854567486792803, "fcm_dpo/delta": -0.09267206490039825, "fcm_dpo/margin": 171.02438354492188, "fcm_dpo/q_t": 0.38960734009742737, "grad_norm": 27.828453063964844, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.3447574973106384, "logits/rejected": -0.36221379041671753, "logps/chosen": -254.3938446044922, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08978271484375, "logps/rejected": -473.09033203125, "loss": 1.0221, "margin_dpo/margin_mean": 171.02438354492188, "margin_dpo/margin_std": 191.23297119140625, "step": 260 }, { "KL/chosen_KL_mean": -216.47369384765625, "KL/mean": -295.7045593261719, "KL/rejected_KL_mean": -374.9354553222656, "KL/std": 191.41683959960938, "epoch": 0.3832599118942731, "fcm_dpo/beta": 0.002804287476465106, "fcm_dpo/delta": -0.04664212465286255, "fcm_dpo/margin": 158.46176147460938, "fcm_dpo/q_t": 0.3995262384414673, "grad_norm": 20.731279373168945, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.3055582344532013, "logits/rejected": -0.3050229549407959, "logps/chosen": -269.50506591796875, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -463.4504089355469, "loss": 1.0723, "margin_dpo/margin_mean": 158.46176147460938, "margin_dpo/margin_std": 213.8656005859375, "step": 261 }, { "KL/chosen_KL_mean": -248.84571838378906, "KL/mean": -314.5561218261719, "KL/rejected_KL_mean": -380.2665100097656, "KL/std": 172.76287841796875, "epoch": 0.38472834067547723, "fcm_dpo/beta": 0.00282662408426404, "fcm_dpo/delta": 0.029231306165456772, "fcm_dpo/margin": 131.42083740234375, "fcm_dpo/q_t": 0.4136051535606384, "grad_norm": 23.95509147644043, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.31161201000213623, "logits/rejected": -0.2819845676422119, "logps/chosen": -308.4658508300781, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -466.68505859375, "loss": 1.1038, "margin_dpo/margin_mean": 131.42083740234375, "margin_dpo/margin_std": 170.3394317626953, "step": 262 }, { "KL/chosen_KL_mean": -222.57464599609375, "KL/mean": -316.29998779296875, "KL/rejected_KL_mean": -410.025390625, "KL/std": 204.56842041015625, "epoch": 0.38619676945668135, "fcm_dpo/beta": 0.0027571117971092463, "fcm_dpo/delta": -0.1240774393081665, "fcm_dpo/margin": 187.4507598876953, "fcm_dpo/q_t": 0.38271111249923706, "grad_norm": 21.146869659423828, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.3639793395996094, "logits/rejected": -0.3423152267932892, "logps/chosen": -281.99560546875, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -506.8825988769531, "loss": 1.0183, "margin_dpo/margin_mean": 187.45074462890625, "margin_dpo/margin_std": 219.32289123535156, "step": 263 }, { "KL/chosen_KL_mean": -233.87161254882812, "KL/mean": -311.49884033203125, "KL/rejected_KL_mean": -389.1260681152344, "KL/std": 185.69674682617188, "epoch": 0.3876651982378855, "fcm_dpo/beta": 0.002720474498346448, "fcm_dpo/delta": -0.024707935750484467, "fcm_dpo/margin": 155.25442504882812, "fcm_dpo/q_t": 0.4044179320335388, "grad_norm": 23.543067932128906, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.40215420722961426, "logits/rejected": -0.3942739963531494, "logps/chosen": -296.59368896484375, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85620880126953, "logps/rejected": -482.9822998046875, "loss": 1.0891, "margin_dpo/margin_mean": 155.25442504882812, "margin_dpo/margin_std": 211.25302124023438, "step": 264 }, { "KL/chosen_KL_mean": -253.85220336914062, "KL/mean": -328.5089111328125, "KL/rejected_KL_mean": -403.16558837890625, "KL/std": 207.49630737304688, "epoch": 0.3891336270190896, "fcm_dpo/beta": 0.0027331099845469, "fcm_dpo/delta": -0.008440444245934486, "fcm_dpo/margin": 149.31336975097656, "fcm_dpo/q_t": 0.40931302309036255, "grad_norm": 23.92888832092285, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.29150500893592834, "logits/rejected": -0.2723013758659363, "logps/chosen": -315.82366943359375, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -491.1861877441406, "loss": 1.1199, "margin_dpo/margin_mean": 149.31336975097656, "margin_dpo/margin_std": 244.71060180664062, "step": 265 }, { "KL/chosen_KL_mean": -257.31298828125, "KL/mean": -317.381591796875, "KL/rejected_KL_mean": -377.4501647949219, "KL/std": 173.2138671875, "epoch": 0.39060205580029367, "fcm_dpo/beta": 0.002761277835816145, "fcm_dpo/delta": 0.07062655687332153, "fcm_dpo/margin": 120.13714599609375, "fcm_dpo/q_t": 0.42465198040008545, "grad_norm": 50.139766693115234, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.34931957721710205, "logits/rejected": -0.30927109718322754, "logps/chosen": -324.41265869140625, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -445.42138671875, "loss": 1.1531, "margin_dpo/margin_mean": 120.13715362548828, "margin_dpo/margin_std": 199.931640625, "step": 266 }, { "KL/chosen_KL_mean": -230.34881591796875, "KL/mean": -303.38250732421875, "KL/rejected_KL_mean": -376.41619873046875, "KL/std": 187.39883422851562, "epoch": 0.3920704845814978, "fcm_dpo/beta": 0.0027605746872723103, "fcm_dpo/delta": -0.0036356858909130096, "fcm_dpo/margin": 146.06735229492188, "fcm_dpo/q_t": 0.40831679105758667, "grad_norm": 34.82039260864258, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.3749678134918213, "logits/rejected": -0.34407055377960205, "logps/chosen": -299.319580078125, "logps/ref_chosen": -68.97075653076172, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -466.58465576171875, "loss": 1.1023, "margin_dpo/margin_mean": 146.06735229492188, "margin_dpo/margin_std": 212.41896057128906, "step": 267 }, { "KL/chosen_KL_mean": -238.78897094726562, "KL/mean": -314.13409423828125, "KL/rejected_KL_mean": -389.47918701171875, "KL/std": 180.19076538085938, "epoch": 0.3935389133627019, "fcm_dpo/beta": 0.002754632383584976, "fcm_dpo/delta": -0.016000591218471527, "fcm_dpo/margin": 150.69021606445312, "fcm_dpo/q_t": 0.4059686064720154, "grad_norm": 28.66946029663086, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.35488247871398926, "logits/rejected": -0.3602542281150818, "logps/chosen": -294.68927001953125, "logps/ref_chosen": -55.90031051635742, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -491.1268310546875, "loss": 1.0941, "margin_dpo/margin_mean": 150.69021606445312, "margin_dpo/margin_std": 216.28262329101562, "step": 268 }, { "KL/chosen_KL_mean": -251.7904052734375, "KL/mean": -337.79144287109375, "KL/rejected_KL_mean": -423.79254150390625, "KL/std": 181.66836547851562, "epoch": 0.39500734214390604, "fcm_dpo/beta": 0.0027336953207850456, "fcm_dpo/delta": -0.07361201196908951, "fcm_dpo/margin": 172.00210571289062, "fcm_dpo/q_t": 0.3932827413082123, "grad_norm": 26.64388084411621, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.4130655527114868, "logits/rejected": -0.39535683393478394, "logps/chosen": -321.8299560546875, "logps/ref_chosen": -70.03955841064453, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -531.1419067382812, "loss": 1.0618, "margin_dpo/margin_mean": 172.00210571289062, "margin_dpo/margin_std": 230.4850616455078, "step": 269 }, { "KL/chosen_KL_mean": -218.31060791015625, "KL/mean": -282.62591552734375, "KL/rejected_KL_mean": -346.9412841796875, "KL/std": 164.40748596191406, "epoch": 0.3964757709251101, "fcm_dpo/beta": 0.0027484484016895294, "fcm_dpo/delta": 0.04783637449145317, "fcm_dpo/margin": 128.63064575195312, "fcm_dpo/q_t": 0.41904473304748535, "grad_norm": 33.281341552734375, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.43409767746925354, "logits/rejected": -0.4289151430130005, "logps/chosen": -287.8440856933594, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -456.8699035644531, "loss": 1.1378, "margin_dpo/margin_mean": 128.6306610107422, "margin_dpo/margin_std": 204.2493896484375, "step": 270 }, { "KL/chosen_KL_mean": -209.23326110839844, "KL/mean": -294.4723205566406, "KL/rejected_KL_mean": -379.71136474609375, "KL/std": 168.14682006835938, "epoch": 0.39794419970631423, "fcm_dpo/beta": 0.002710944041609764, "fcm_dpo/delta": -0.06562402844429016, "fcm_dpo/margin": 170.47811889648438, "fcm_dpo/q_t": 0.39391976594924927, "grad_norm": 23.12546730041504, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.3764873743057251, "logits/rejected": -0.36014989018440247, "logps/chosen": -265.9978332519531, "logps/ref_chosen": -56.76456832885742, "logps/ref_rejected": -92.51383972167969, "logps/rejected": -472.2252197265625, "loss": 1.0393, "margin_dpo/margin_mean": 170.47811889648438, "margin_dpo/margin_std": 191.98031616210938, "step": 271 }, { "KL/chosen_KL_mean": -200.45635986328125, "KL/mean": -302.6602783203125, "KL/rejected_KL_mean": -404.86419677734375, "KL/std": 187.67584228515625, "epoch": 0.39941262848751835, "fcm_dpo/beta": 0.0026611106004565954, "fcm_dpo/delta": -0.15195293724536896, "fcm_dpo/margin": 204.4078369140625, "fcm_dpo/q_t": 0.3735220432281494, "grad_norm": 33.842586517333984, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.2986787259578705, "logits/rejected": -0.31125104427337646, "logps/chosen": -249.95352172851562, "logps/ref_chosen": -49.497154235839844, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -510.406982421875, "loss": 0.9774, "margin_dpo/margin_mean": 204.4078369140625, "margin_dpo/margin_std": 192.80450439453125, "step": 272 }, { "KL/chosen_KL_mean": -231.20156860351562, "KL/mean": -327.5953369140625, "KL/rejected_KL_mean": -423.98907470703125, "KL/std": 184.35150146484375, "epoch": 0.4008810572687225, "fcm_dpo/beta": 0.0025815139524638653, "fcm_dpo/delta": -0.1033368706703186, "fcm_dpo/margin": 192.7875213623047, "fcm_dpo/q_t": 0.38513267040252686, "grad_norm": 30.54670524597168, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.2978020906448364, "logits/rejected": -0.2806628346443176, "logps/chosen": -294.1769714355469, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -516.4876708984375, "loss": 1.0174, "margin_dpo/margin_mean": 192.7875213623047, "margin_dpo/margin_std": 209.65719604492188, "step": 273 }, { "KL/chosen_KL_mean": -276.0918884277344, "KL/mean": -348.8900146484375, "KL/rejected_KL_mean": -421.68817138671875, "KL/std": 173.11854553222656, "epoch": 0.4023494860499266, "fcm_dpo/beta": 0.002592704724520445, "fcm_dpo/delta": 0.023143114522099495, "fcm_dpo/margin": 145.5963134765625, "fcm_dpo/q_t": 0.41287532448768616, "grad_norm": 26.89436149597168, "learning_rate": 3.75e-07, "logits/chosen": -0.2569617033004761, "logits/rejected": -0.24136140942573547, "logps/chosen": -331.75958251953125, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -499.0212707519531, "loss": 1.1059, "margin_dpo/margin_mean": 145.5963134765625, "margin_dpo/margin_std": 196.85382080078125, "step": 274 }, { "KL/chosen_KL_mean": -210.4385986328125, "KL/mean": -290.48822021484375, "KL/rejected_KL_mean": -370.5378112792969, "KL/std": 176.77706909179688, "epoch": 0.40381791483113066, "fcm_dpo/beta": 0.0025886246003210545, "fcm_dpo/delta": -0.015149945393204689, "fcm_dpo/margin": 160.09922790527344, "fcm_dpo/q_t": 0.403909832239151, "grad_norm": 26.170629501342773, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.31770947575569153, "logits/rejected": -0.32402610778808594, "logps/chosen": -259.0332946777344, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -463.84149169921875, "loss": 1.0756, "margin_dpo/margin_mean": 160.09922790527344, "margin_dpo/margin_std": 196.3067169189453, "step": 275 }, { "KL/chosen_KL_mean": -227.3229217529297, "KL/mean": -307.79864501953125, "KL/rejected_KL_mean": -388.2743835449219, "KL/std": 183.3173065185547, "epoch": 0.4052863436123348, "fcm_dpo/beta": 0.0025693178176879883, "fcm_dpo/delta": -0.014239070937037468, "fcm_dpo/margin": 160.95144653320312, "fcm_dpo/q_t": 0.4061383008956909, "grad_norm": 33.1538200378418, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.2713956832885742, "logits/rejected": -0.2456541657447815, "logps/chosen": -283.90032958984375, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -458.64007568359375, "loss": 1.0897, "margin_dpo/margin_mean": 160.95144653320312, "margin_dpo/margin_std": 220.95907592773438, "step": 276 }, { "KL/chosen_KL_mean": -251.29473876953125, "KL/mean": -333.46185302734375, "KL/rejected_KL_mean": -415.6289978027344, "KL/std": 182.88909912109375, "epoch": 0.4067547723935389, "fcm_dpo/beta": 0.002568996511399746, "fcm_dpo/delta": -0.023184221237897873, "fcm_dpo/margin": 164.33425903320312, "fcm_dpo/q_t": 0.4032415747642517, "grad_norm": 23.64580726623535, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.2841571569442749, "logits/rejected": -0.26456978917121887, "logps/chosen": -307.5662841796875, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -508.51025390625, "loss": 1.082, "margin_dpo/margin_mean": 164.33425903320312, "margin_dpo/margin_std": 218.15975952148438, "step": 277 }, { "KL/chosen_KL_mean": -219.04319763183594, "KL/mean": -315.867919921875, "KL/rejected_KL_mean": -412.692626953125, "KL/std": 195.05999755859375, "epoch": 0.40822320117474303, "fcm_dpo/beta": 0.0025190459564328194, "fcm_dpo/delta": -0.09271565079689026, "fcm_dpo/margin": 193.64944458007812, "fcm_dpo/q_t": 0.38852113485336304, "grad_norm": 24.158504486083984, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.27769631147384644, "logits/rejected": -0.2823639214038849, "logps/chosen": -271.9851379394531, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -503.94622802734375, "loss": 1.026, "margin_dpo/margin_mean": 193.6494598388672, "margin_dpo/margin_std": 218.25051879882812, "step": 278 }, { "KL/chosen_KL_mean": -250.65829467773438, "KL/mean": -346.52783203125, "KL/rejected_KL_mean": -442.3973388671875, "KL/std": 203.59503173828125, "epoch": 0.40969162995594716, "fcm_dpo/beta": 0.0024712784215807915, "fcm_dpo/delta": -0.07853814959526062, "fcm_dpo/margin": 191.73904418945312, "fcm_dpo/q_t": 0.39302849769592285, "grad_norm": 27.312673568725586, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.24171388149261475, "logits/rejected": -0.2399359941482544, "logps/chosen": -299.29962158203125, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -530.248779296875, "loss": 1.0546, "margin_dpo/margin_mean": 191.73904418945312, "margin_dpo/margin_std": 247.916748046875, "step": 279 }, { "KL/chosen_KL_mean": -246.4661865234375, "KL/mean": -343.66387939453125, "KL/rejected_KL_mean": -440.861572265625, "KL/std": 186.5137939453125, "epoch": 0.4111600587371512, "fcm_dpo/beta": 0.0024436868261545897, "fcm_dpo/delta": -0.0788697600364685, "fcm_dpo/margin": 194.39535522460938, "fcm_dpo/q_t": 0.3888060450553894, "grad_norm": 25.471012115478516, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.2643176317214966, "logits/rejected": -0.2642500698566437, "logps/chosen": -305.2633056640625, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -539.4804077148438, "loss": 1.0254, "margin_dpo/margin_mean": 194.39535522460938, "margin_dpo/margin_std": 205.177001953125, "step": 280 }, { "KL/chosen_KL_mean": -228.09291076660156, "KL/mean": -314.8627624511719, "KL/rejected_KL_mean": -401.6325988769531, "KL/std": 182.05502319335938, "epoch": 0.41262848751835535, "fcm_dpo/beta": 0.0024358248338103294, "fcm_dpo/delta": -0.02387945167720318, "fcm_dpo/margin": 173.53970336914062, "fcm_dpo/q_t": 0.3996858596801758, "grad_norm": 20.681076049804688, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.32415997982025146, "logits/rejected": -0.2965441346168518, "logps/chosen": -283.5814208984375, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -482.51519775390625, "loss": 1.0639, "margin_dpo/margin_mean": 173.53970336914062, "margin_dpo/margin_std": 196.98211669921875, "step": 281 }, { "KL/chosen_KL_mean": -254.01113891601562, "KL/mean": -322.19805908203125, "KL/rejected_KL_mean": -390.38494873046875, "KL/std": 191.5809326171875, "epoch": 0.41409691629955947, "fcm_dpo/beta": 0.0024580340832471848, "fcm_dpo/delta": 0.06647245585918427, "fcm_dpo/margin": 136.3738555908203, "fcm_dpo/q_t": 0.42514273524284363, "grad_norm": 22.19266128540039, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.3909962475299835, "logits/rejected": -0.3682512640953064, "logps/chosen": -327.081298828125, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -485.7359619140625, "loss": 1.1443, "margin_dpo/margin_mean": 136.37387084960938, "margin_dpo/margin_std": 219.83755493164062, "step": 282 }, { "KL/chosen_KL_mean": -266.194091796875, "KL/mean": -369.5931091308594, "KL/rejected_KL_mean": -472.99212646484375, "KL/std": 225.01544189453125, "epoch": 0.4155653450807636, "fcm_dpo/beta": 0.0024265965912491083, "fcm_dpo/delta": -0.10707136243581772, "fcm_dpo/margin": 206.79803466796875, "fcm_dpo/q_t": 0.38304460048675537, "grad_norm": 30.945764541625977, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.31001657247543335, "logits/rejected": -0.3113076388835907, "logps/chosen": -328.092529296875, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -569.9786376953125, "loss": 1.0088, "margin_dpo/margin_mean": 206.79803466796875, "margin_dpo/margin_std": 210.00332641601562, "step": 283 }, { "KL/chosen_KL_mean": -263.2822265625, "KL/mean": -357.4686279296875, "KL/rejected_KL_mean": -451.6550598144531, "KL/std": 208.6199951171875, "epoch": 0.4170337738619677, "fcm_dpo/beta": 0.0023820092901587486, "fcm_dpo/delta": -0.051164183765649796, "fcm_dpo/margin": 188.37283325195312, "fcm_dpo/q_t": 0.3964000344276428, "grad_norm": 29.025760650634766, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.3494116961956024, "logits/rejected": -0.33238211274147034, "logps/chosen": -321.7177734375, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -545.1243286132812, "loss": 1.0441, "margin_dpo/margin_mean": 188.3728485107422, "margin_dpo/margin_std": 209.56866455078125, "step": 284 }, { "KL/chosen_KL_mean": -302.94207763671875, "KL/mean": -389.82421875, "KL/rejected_KL_mean": -476.7063903808594, "KL/std": 186.600830078125, "epoch": 0.4185022026431718, "fcm_dpo/beta": 0.0023880950175225735, "fcm_dpo/delta": -0.016177460551261902, "fcm_dpo/margin": 173.7642822265625, "fcm_dpo/q_t": 0.401868611574173, "grad_norm": 24.644567489624023, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.3161693811416626, "logits/rejected": -0.29976439476013184, "logps/chosen": -369.1742858886719, "logps/ref_chosen": -66.23219299316406, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -575.833251953125, "loss": 1.0727, "margin_dpo/margin_mean": 173.7642822265625, "margin_dpo/margin_std": 203.36099243164062, "step": 285 }, { "KL/chosen_KL_mean": -321.2906494140625, "KL/mean": -415.0553283691406, "KL/rejected_KL_mean": -508.82000732421875, "KL/std": 211.68765258789062, "epoch": 0.4199706314243759, "fcm_dpo/beta": 0.0023618116974830627, "fcm_dpo/delta": -0.044894296675920486, "fcm_dpo/margin": 187.52935791015625, "fcm_dpo/q_t": 0.3975561857223511, "grad_norm": 32.87556457519531, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.33208775520324707, "logits/rejected": -0.31639528274536133, "logps/chosen": -394.24163818359375, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -597.408447265625, "loss": 1.0606, "margin_dpo/margin_mean": 187.52935791015625, "margin_dpo/margin_std": 228.80307006835938, "step": 286 }, { "KL/chosen_KL_mean": -296.5609130859375, "KL/mean": -373.24261474609375, "KL/rejected_KL_mean": -449.92431640625, "KL/std": 189.50225830078125, "epoch": 0.42143906020558003, "fcm_dpo/beta": 0.0023672073148190975, "fcm_dpo/delta": 0.038290925323963165, "fcm_dpo/margin": 153.36337280273438, "fcm_dpo/q_t": 0.41586506366729736, "grad_norm": 30.704463958740234, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.2822296619415283, "logits/rejected": -0.2562822699546814, "logps/chosen": -358.1020812988281, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.69607543945312, "logps/rejected": -527.620361328125, "loss": 1.1102, "margin_dpo/margin_mean": 153.36337280273438, "margin_dpo/margin_std": 203.52426147460938, "step": 287 }, { "KL/chosen_KL_mean": -283.6439514160156, "KL/mean": -377.3069763183594, "KL/rejected_KL_mean": -470.969970703125, "KL/std": 187.588134765625, "epoch": 0.42290748898678415, "fcm_dpo/beta": 0.0023379437625408173, "fcm_dpo/delta": -0.04108269885182381, "fcm_dpo/margin": 187.32608032226562, "fcm_dpo/q_t": 0.3972797393798828, "grad_norm": 23.47823715209961, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.32454603910446167, "logits/rejected": -0.30999091267585754, "logps/chosen": -340.30517578125, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.33570098876953, "logps/rejected": -558.3056640625, "loss": 1.0495, "margin_dpo/margin_mean": 187.32608032226562, "margin_dpo/margin_std": 196.14199829101562, "step": 288 }, { "KL/chosen_KL_mean": -265.7540588378906, "KL/mean": -373.11285400390625, "KL/rejected_KL_mean": -480.47161865234375, "KL/std": 206.18136596679688, "epoch": 0.4243759177679883, "fcm_dpo/beta": 0.0023207864724099636, "fcm_dpo/delta": -0.10334105789661407, "fcm_dpo/margin": 214.71755981445312, "fcm_dpo/q_t": 0.3854060769081116, "grad_norm": 26.55727195739746, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.26507318019866943, "logits/rejected": -0.2650468349456787, "logps/chosen": -310.98443603515625, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -568.1142578125, "loss": 1.0143, "margin_dpo/margin_mean": 214.71755981445312, "margin_dpo/margin_std": 228.02871704101562, "step": 289 }, { "KL/chosen_KL_mean": -276.853759765625, "KL/mean": -383.96502685546875, "KL/rejected_KL_mean": -491.07623291015625, "KL/std": 217.60955810546875, "epoch": 0.42584434654919234, "fcm_dpo/beta": 0.0022673578932881355, "fcm_dpo/delta": -0.09031336009502411, "fcm_dpo/margin": 214.22250366210938, "fcm_dpo/q_t": 0.3898102939128876, "grad_norm": 20.6153621673584, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.3317207098007202, "logits/rejected": -0.3523035943508148, "logps/chosen": -332.32525634765625, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -607.7847900390625, "loss": 1.0356, "margin_dpo/margin_mean": 214.22250366210938, "margin_dpo/margin_std": 258.4100341796875, "step": 290 }, { "KL/chosen_KL_mean": -230.03976440429688, "KL/mean": -328.71478271484375, "KL/rejected_KL_mean": -427.3897705078125, "KL/std": 188.07647705078125, "epoch": 0.42731277533039647, "fcm_dpo/beta": 0.0022549815475940704, "fcm_dpo/delta": -0.047311414033174515, "fcm_dpo/margin": 197.35003662109375, "fcm_dpo/q_t": 0.39613714814186096, "grad_norm": 26.71583366394043, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.34385907649993896, "logits/rejected": -0.3322584629058838, "logps/chosen": -293.30010986328125, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -516.6868896484375, "loss": 1.052, "margin_dpo/margin_mean": 197.35003662109375, "margin_dpo/margin_std": 223.89544677734375, "step": 291 }, { "KL/chosen_KL_mean": -248.18878173828125, "KL/mean": -349.1991882324219, "KL/rejected_KL_mean": -450.2095947265625, "KL/std": 218.77377319335938, "epoch": 0.4287812041116006, "fcm_dpo/beta": 0.0022292518988251686, "fcm_dpo/delta": -0.05277468264102936, "fcm_dpo/margin": 202.0208282470703, "fcm_dpo/q_t": 0.3962145447731018, "grad_norm": 23.873905181884766, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.31092000007629395, "logits/rejected": -0.2947191596031189, "logps/chosen": -302.1072998046875, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -540.1710205078125, "loss": 1.0614, "margin_dpo/margin_mean": 202.02084350585938, "margin_dpo/margin_std": 255.2861328125, "step": 292 }, { "KL/chosen_KL_mean": -252.22222900390625, "KL/mean": -329.9274597167969, "KL/rejected_KL_mean": -407.6326904296875, "KL/std": 214.19297790527344, "epoch": 0.4302496328928047, "fcm_dpo/beta": 0.0022371455561369658, "fcm_dpo/delta": 0.05408930033445358, "fcm_dpo/margin": 155.41049194335938, "fcm_dpo/q_t": 0.42154398560523987, "grad_norm": 44.14265060424805, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.32412204146385193, "logits/rejected": -0.3133804202079773, "logps/chosen": -312.5982666015625, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.85244750976562, "logps/rejected": -485.4851379394531, "loss": 1.1388, "margin_dpo/margin_mean": 155.41049194335938, "margin_dpo/margin_std": 247.1007080078125, "step": 293 }, { "KL/chosen_KL_mean": -224.80003356933594, "KL/mean": -312.6026916503906, "KL/rejected_KL_mean": -400.40533447265625, "KL/std": 189.26589965820312, "epoch": 0.43171806167400884, "fcm_dpo/beta": 0.0022378209978342056, "fcm_dpo/delta": 0.0072397105395793915, "fcm_dpo/margin": 175.60528564453125, "fcm_dpo/q_t": 0.4101282060146332, "grad_norm": 27.088520050048828, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.30610185861587524, "logits/rejected": -0.31053173542022705, "logps/chosen": -272.8875732421875, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -482.30230712890625, "loss": 1.0958, "margin_dpo/margin_mean": 175.60528564453125, "margin_dpo/margin_std": 234.76486206054688, "step": 294 }, { "KL/chosen_KL_mean": -276.4921875, "KL/mean": -373.5928955078125, "KL/rejected_KL_mean": -470.69366455078125, "KL/std": 215.0086669921875, "epoch": 0.4331864904552129, "fcm_dpo/beta": 0.002234848216176033, "fcm_dpo/delta": -0.03554647043347359, "fcm_dpo/margin": 194.2014617919922, "fcm_dpo/q_t": 0.4011088013648987, "grad_norm": 29.52766990661621, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.34492525458335876, "logits/rejected": -0.34738168120384216, "logps/chosen": -326.4168701171875, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -558.1499633789062, "loss": 1.0748, "margin_dpo/margin_mean": 194.2014617919922, "margin_dpo/margin_std": 255.34695434570312, "step": 295 }, { "KL/chosen_KL_mean": -349.96075439453125, "KL/mean": -420.53668212890625, "KL/rejected_KL_mean": -491.1127014160156, "KL/std": 214.16937255859375, "epoch": 0.434654919236417, "fcm_dpo/beta": 0.0022216294892132282, "fcm_dpo/delta": -0.01307538989931345, "fcm_dpo/margin": 141.15194702148438, "fcm_dpo/q_t": 0.4277653992176056, "grad_norm": 30.11356544494629, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.35794180631637573, "logits/rejected": -0.3423447906970978, "logps/chosen": -415.45196533203125, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -584.2017822265625, "loss": 1.1858, "margin_dpo/margin_mean": 141.15194702148438, "margin_dpo/margin_std": 273.21856689453125, "step": 296 }, { "KL/chosen_KL_mean": -306.0766906738281, "KL/mean": -389.6324768066406, "KL/rejected_KL_mean": -473.188232421875, "KL/std": 202.3120574951172, "epoch": 0.43612334801762115, "fcm_dpo/beta": 0.0022283056750893593, "fcm_dpo/delta": 0.028696084395051003, "fcm_dpo/margin": 167.111572265625, "fcm_dpo/q_t": 0.41309747099876404, "grad_norm": 26.392620086669922, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.3816351294517517, "logits/rejected": -0.38721585273742676, "logps/chosen": -362.55364990234375, "logps/ref_chosen": -56.476951599121094, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -568.3267822265625, "loss": 1.0975, "margin_dpo/margin_mean": 167.111572265625, "margin_dpo/margin_std": 204.6174774169922, "step": 297 }, { "KL/chosen_KL_mean": -336.15673828125, "KL/mean": -440.7702941894531, "KL/rejected_KL_mean": -545.3837890625, "KL/std": 268.1351318359375, "epoch": 0.43759177679882527, "fcm_dpo/beta": 0.002221038332208991, "fcm_dpo/delta": -0.06791778653860092, "fcm_dpo/margin": 209.22708129882812, "fcm_dpo/q_t": 0.39830517768859863, "grad_norm": 23.501754760742188, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.3646508455276489, "logits/rejected": -0.382364422082901, "logps/chosen": -403.48193359375, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -662.0460205078125, "loss": 1.0784, "margin_dpo/margin_mean": 209.2270965576172, "margin_dpo/margin_std": 309.3797607421875, "step": 298 }, { "KL/chosen_KL_mean": -277.727783203125, "KL/mean": -368.62017822265625, "KL/rejected_KL_mean": -459.5125732421875, "KL/std": 201.8565673828125, "epoch": 0.4390602055800294, "fcm_dpo/beta": 0.0022161747328937054, "fcm_dpo/delta": -0.0036800652742385864, "fcm_dpo/margin": 181.78482055664062, "fcm_dpo/q_t": 0.4088858366012573, "grad_norm": 30.56585121154785, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.38511306047439575, "logits/rejected": -0.39016664028167725, "logps/chosen": -326.68988037109375, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -543.8408203125, "loss": 1.0997, "margin_dpo/margin_mean": 181.78482055664062, "margin_dpo/margin_std": 254.47584533691406, "step": 299 }, { "KL/chosen_KL_mean": -359.6534423828125, "KL/mean": -468.8331604003906, "KL/rejected_KL_mean": -578.0128784179688, "KL/std": 245.97247314453125, "epoch": 0.44052863436123346, "fcm_dpo/beta": 0.0021824706345796585, "fcm_dpo/delta": -0.08032761514186859, "fcm_dpo/margin": 218.35943603515625, "fcm_dpo/q_t": 0.3919123411178589, "grad_norm": 32.0359992980957, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.3638511300086975, "logits/rejected": -0.36910757422447205, "logps/chosen": -418.7271728515625, "logps/ref_chosen": -59.07371139526367, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -673.9793701171875, "loss": 1.0673, "margin_dpo/margin_mean": 218.3594512939453, "margin_dpo/margin_std": 301.63983154296875, "step": 300 }, { "KL/chosen_KL_mean": -299.93310546875, "KL/mean": -405.40899658203125, "KL/rejected_KL_mean": -510.8848571777344, "KL/std": 220.83499145507812, "epoch": 0.4419970631424376, "fcm_dpo/beta": 0.0021431921049952507, "fcm_dpo/delta": -0.05494837090373039, "fcm_dpo/margin": 210.9517822265625, "fcm_dpo/q_t": 0.397805392742157, "grad_norm": 23.908777236938477, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.3962569832801819, "logits/rejected": -0.39823591709136963, "logps/chosen": -357.1824951171875, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -603.2384033203125, "loss": 1.0574, "margin_dpo/margin_mean": 210.9517822265625, "margin_dpo/margin_std": 264.4083557128906, "step": 301 }, { "KL/chosen_KL_mean": -250.50064086914062, "KL/mean": -340.6607971191406, "KL/rejected_KL_mean": -430.82098388671875, "KL/std": 200.06076049804688, "epoch": 0.4434654919236417, "fcm_dpo/beta": 0.0021400072146207094, "fcm_dpo/delta": 0.014417506754398346, "fcm_dpo/margin": 180.3203125, "fcm_dpo/q_t": 0.4114646017551422, "grad_norm": 25.441043853759766, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.37367284297943115, "logits/rejected": -0.3793327212333679, "logps/chosen": -301.6986083984375, "logps/ref_chosen": -51.197994232177734, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -528.04736328125, "loss": 1.0961, "margin_dpo/margin_mean": 180.3203125, "margin_dpo/margin_std": 233.26841735839844, "step": 302 }, { "KL/chosen_KL_mean": -252.14584350585938, "KL/mean": -333.20892333984375, "KL/rejected_KL_mean": -414.27197265625, "KL/std": 197.92535400390625, "epoch": 0.44493392070484583, "fcm_dpo/beta": 0.00216277944855392, "fcm_dpo/delta": 0.05117795616388321, "fcm_dpo/margin": 162.12612915039062, "fcm_dpo/q_t": 0.41918832063674927, "grad_norm": 25.745454788208008, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.3918335437774658, "logits/rejected": -0.3803231716156006, "logps/chosen": -318.85980224609375, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -501.2174072265625, "loss": 1.1285, "margin_dpo/margin_mean": 162.12612915039062, "margin_dpo/margin_std": 235.95956420898438, "step": 303 }, { "KL/chosen_KL_mean": -221.95501708984375, "KL/mean": -313.56036376953125, "KL/rejected_KL_mean": -405.16571044921875, "KL/std": 181.8985137939453, "epoch": 0.44640234948604995, "fcm_dpo/beta": 0.002179923001676798, "fcm_dpo/delta": 0.00038868188858032227, "fcm_dpo/margin": 183.21072387695312, "fcm_dpo/q_t": 0.40427181124687195, "grad_norm": 44.084224700927734, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.4207175672054291, "logits/rejected": -0.41341572999954224, "logps/chosen": -293.90570068359375, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -495.63775634765625, "loss": 1.0561, "margin_dpo/margin_mean": 183.21072387695312, "margin_dpo/margin_std": 157.74085998535156, "step": 304 }, { "KL/chosen_KL_mean": -244.41680908203125, "KL/mean": -326.9585876464844, "KL/rejected_KL_mean": -409.5003662109375, "KL/std": 211.88983154296875, "epoch": 0.447870778267254, "fcm_dpo/beta": 0.0021737192291766405, "fcm_dpo/delta": 0.042266424745321274, "fcm_dpo/margin": 165.08358764648438, "fcm_dpo/q_t": 0.41834086179733276, "grad_norm": 31.972244262695312, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.41458988189697266, "logits/rejected": -0.39753109216690063, "logps/chosen": -311.2120361328125, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -502.25494384765625, "loss": 1.1229, "margin_dpo/margin_mean": 165.08358764648438, "margin_dpo/margin_std": 234.5511932373047, "step": 305 }, { "KL/chosen_KL_mean": -250.080322265625, "KL/mean": -330.5018005371094, "KL/rejected_KL_mean": -410.92327880859375, "KL/std": 189.47010803222656, "epoch": 0.44933920704845814, "fcm_dpo/beta": 0.0022093781735748053, "fcm_dpo/delta": 0.04615872725844383, "fcm_dpo/margin": 160.84295654296875, "fcm_dpo/q_t": 0.4171670079231262, "grad_norm": 25.6021785736084, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.386802077293396, "logits/rejected": -0.36289817094802856, "logps/chosen": -319.76422119140625, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -496.08245849609375, "loss": 1.1083, "margin_dpo/margin_mean": 160.84295654296875, "margin_dpo/margin_std": 201.04806518554688, "step": 306 }, { "KL/chosen_KL_mean": -236.98658752441406, "KL/mean": -322.44061279296875, "KL/rejected_KL_mean": -407.89459228515625, "KL/std": 182.07717895507812, "epoch": 0.45080763582966227, "fcm_dpo/beta": 0.0022153654135763645, "fcm_dpo/delta": 0.022209253162145615, "fcm_dpo/margin": 170.90798950195312, "fcm_dpo/q_t": 0.4120749235153198, "grad_norm": 25.33635139465332, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.35360145568847656, "logits/rejected": -0.3347788155078888, "logps/chosen": -307.1520080566406, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -494.8669128417969, "loss": 1.1004, "margin_dpo/margin_mean": 170.90798950195312, "margin_dpo/margin_std": 223.1322021484375, "step": 307 }, { "KL/chosen_KL_mean": -244.5159149169922, "KL/mean": -342.49346923828125, "KL/rejected_KL_mean": -440.47100830078125, "KL/std": 209.0433349609375, "epoch": 0.4522760646108664, "fcm_dpo/beta": 0.0022103004157543182, "fcm_dpo/delta": -0.03460945934057236, "fcm_dpo/margin": 195.95510864257812, "fcm_dpo/q_t": 0.39782798290252686, "grad_norm": 35.38976287841797, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.33471328020095825, "logits/rejected": -0.32273852825164795, "logps/chosen": -299.76092529296875, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -519.84326171875, "loss": 1.0462, "margin_dpo/margin_mean": 195.95510864257812, "margin_dpo/margin_std": 199.99911499023438, "step": 308 }, { "KL/chosen_KL_mean": -252.39642333984375, "KL/mean": -347.4449157714844, "KL/rejected_KL_mean": -442.4934387207031, "KL/std": 221.7578125, "epoch": 0.45374449339207046, "fcm_dpo/beta": 0.0022104280069470406, "fcm_dpo/delta": -0.02155473083257675, "fcm_dpo/margin": 190.0969696044922, "fcm_dpo/q_t": 0.40153375267982483, "grad_norm": 46.212398529052734, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.3147510886192322, "logits/rejected": -0.30644166469573975, "logps/chosen": -301.35552978515625, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -524.8341674804688, "loss": 1.0737, "margin_dpo/margin_mean": 190.09698486328125, "margin_dpo/margin_std": 229.8643798828125, "step": 309 }, { "KL/chosen_KL_mean": -293.9149169921875, "KL/mean": -383.8049621582031, "KL/rejected_KL_mean": -473.69500732421875, "KL/std": 197.09872436523438, "epoch": 0.4552129221732746, "fcm_dpo/beta": 0.0022050600964576006, "fcm_dpo/delta": 0.0034537650644779205, "fcm_dpo/margin": 179.78012084960938, "fcm_dpo/q_t": 0.40635180473327637, "grad_norm": 33.55656051635742, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.3364931046962738, "logits/rejected": -0.3217809200286865, "logps/chosen": -356.65667724609375, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -553.6253051757812, "loss": 1.072, "margin_dpo/margin_mean": 179.7801055908203, "margin_dpo/margin_std": 189.23294067382812, "step": 310 }, { "KL/chosen_KL_mean": -307.0128173828125, "KL/mean": -407.85986328125, "KL/rejected_KL_mean": -508.7069091796875, "KL/std": 248.04620361328125, "epoch": 0.4566813509544787, "fcm_dpo/beta": 0.0021827276796102524, "fcm_dpo/delta": -0.04212527349591255, "fcm_dpo/margin": 201.69406127929688, "fcm_dpo/q_t": 0.4002191424369812, "grad_norm": 23.36595344543457, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.3425959348678589, "logits/rejected": -0.31884661316871643, "logps/chosen": -360.04083251953125, "logps/ref_chosen": -53.02798080444336, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -586.1451416015625, "loss": 1.0687, "margin_dpo/margin_mean": 201.69406127929688, "margin_dpo/margin_std": 261.9150085449219, "step": 311 }, { "KL/chosen_KL_mean": -291.6320495605469, "KL/mean": -385.4327697753906, "KL/rejected_KL_mean": -479.23345947265625, "KL/std": 222.2432861328125, "epoch": 0.4581497797356828, "fcm_dpo/beta": 0.0021842336282134056, "fcm_dpo/delta": -0.010445069521665573, "fcm_dpo/margin": 187.60142517089844, "fcm_dpo/q_t": 0.4080832004547119, "grad_norm": 25.498783111572266, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.2573780417442322, "logits/rejected": -0.2468489408493042, "logps/chosen": -341.0242614746094, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280853271484, "logps/rejected": -555.0262451171875, "loss": 1.0923, "margin_dpo/margin_mean": 187.6014404296875, "margin_dpo/margin_std": 262.27001953125, "step": 312 }, { "KL/chosen_KL_mean": -274.81085205078125, "KL/mean": -373.3189392089844, "KL/rejected_KL_mean": -471.82708740234375, "KL/std": 231.67160034179688, "epoch": 0.45961820851688695, "fcm_dpo/beta": 0.0021794200874865055, "fcm_dpo/delta": -0.03143874555826187, "fcm_dpo/margin": 197.0161895751953, "fcm_dpo/q_t": 0.40177449584007263, "grad_norm": 24.87454605102539, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.33512693643569946, "logits/rejected": -0.3323609530925751, "logps/chosen": -324.9635925292969, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -558.2332763671875, "loss": 1.0768, "margin_dpo/margin_mean": 197.0161895751953, "margin_dpo/margin_std": 253.28038024902344, "step": 313 }, { "KL/chosen_KL_mean": -298.7161560058594, "KL/mean": -388.66937255859375, "KL/rejected_KL_mean": -478.62261962890625, "KL/std": 222.7989501953125, "epoch": 0.461086637298091, "fcm_dpo/beta": 0.0021564702037721872, "fcm_dpo/delta": 0.012253139168024063, "fcm_dpo/margin": 179.90646362304688, "fcm_dpo/q_t": 0.4130541682243347, "grad_norm": 22.955171585083008, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.3842218816280365, "logits/rejected": -0.3846893906593323, "logps/chosen": -355.9537353515625, "logps/ref_chosen": -57.237579345703125, "logps/ref_rejected": -97.5965347290039, "logps/rejected": -576.2191772460938, "loss": 1.1194, "margin_dpo/margin_mean": 179.90646362304688, "margin_dpo/margin_std": 277.59881591796875, "step": 314 }, { "KL/chosen_KL_mean": -269.84991455078125, "KL/mean": -350.29864501953125, "KL/rejected_KL_mean": -430.747314453125, "KL/std": 197.81268310546875, "epoch": 0.46255506607929514, "fcm_dpo/beta": 0.002180763054639101, "fcm_dpo/delta": 0.050937261432409286, "fcm_dpo/margin": 160.8974151611328, "fcm_dpo/q_t": 0.41917771100997925, "grad_norm": 24.254276275634766, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.30912429094314575, "logits/rejected": -0.31343331933021545, "logps/chosen": -318.91949462890625, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -516.42822265625, "loss": 1.1207, "margin_dpo/margin_mean": 160.8974151611328, "margin_dpo/margin_std": 221.48760986328125, "step": 315 }, { "KL/chosen_KL_mean": -255.51815795898438, "KL/mean": -372.13592529296875, "KL/rejected_KL_mean": -488.75372314453125, "KL/std": 227.74038696289062, "epoch": 0.46402349486049926, "fcm_dpo/beta": 0.00214382354170084, "fcm_dpo/delta": -0.10627135634422302, "fcm_dpo/margin": 233.23556518554688, "fcm_dpo/q_t": 0.38564345240592957, "grad_norm": 31.184669494628906, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.3521896004676819, "logits/rejected": -0.35530799627304077, "logps/chosen": -309.7789306640625, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -590.03515625, "loss": 1.0134, "margin_dpo/margin_mean": 233.23556518554688, "margin_dpo/margin_std": 248.93862915039062, "step": 316 }, { "KL/chosen_KL_mean": -252.7716064453125, "KL/mean": -380.02117919921875, "KL/rejected_KL_mean": -507.27069091796875, "KL/std": 206.77581787109375, "epoch": 0.4654919236417034, "fcm_dpo/beta": 0.002105377148836851, "fcm_dpo/delta": -0.14325766265392303, "fcm_dpo/margin": 254.4990997314453, "fcm_dpo/q_t": 0.3748946189880371, "grad_norm": 26.17375946044922, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.390718936920166, "logits/rejected": -0.3826904892921448, "logps/chosen": -308.8658447265625, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -607.9697265625, "loss": 0.978, "margin_dpo/margin_mean": 254.4990997314453, "margin_dpo/margin_std": 230.30081176757812, "step": 317 }, { "KL/chosen_KL_mean": -269.4692687988281, "KL/mean": -365.70343017578125, "KL/rejected_KL_mean": -461.93756103515625, "KL/std": 210.68841552734375, "epoch": 0.4669603524229075, "fcm_dpo/beta": 0.0020867723505944014, "fcm_dpo/delta": -0.001836409792304039, "fcm_dpo/margin": 192.4683380126953, "fcm_dpo/q_t": 0.4067472815513611, "grad_norm": 23.02154541015625, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.4208700656890869, "logits/rejected": -0.3912370800971985, "logps/chosen": -334.1149597167969, "logps/ref_chosen": -64.64569854736328, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -544.7018432617188, "loss": 1.0773, "margin_dpo/margin_mean": 192.4683380126953, "margin_dpo/margin_std": 227.6715850830078, "step": 318 }, { "KL/chosen_KL_mean": -239.6466064453125, "KL/mean": -350.7529296875, "KL/rejected_KL_mean": -461.8592529296875, "KL/std": 224.13626098632812, "epoch": 0.4684287812041116, "fcm_dpo/beta": 0.00206323666498065, "fcm_dpo/delta": -0.0612642765045166, "fcm_dpo/margin": 222.212646484375, "fcm_dpo/q_t": 0.3929300308227539, "grad_norm": 28.023488998413086, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.38181760907173157, "logits/rejected": -0.3934275507926941, "logps/chosen": -289.0303649902344, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -575.7657470703125, "loss": 1.0361, "margin_dpo/margin_mean": 222.212646484375, "margin_dpo/margin_std": 239.76544189453125, "step": 319 }, { "KL/chosen_KL_mean": -251.48712158203125, "KL/mean": -370.67779541015625, "KL/rejected_KL_mean": -489.8685302734375, "KL/std": 234.5609130859375, "epoch": 0.4698972099853157, "fcm_dpo/beta": 0.0020233364775776863, "fcm_dpo/delta": -0.08694636821746826, "fcm_dpo/margin": 238.38136291503906, "fcm_dpo/q_t": 0.3885904550552368, "grad_norm": 31.34676742553711, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.3809563219547272, "logits/rejected": -0.3856205940246582, "logps/chosen": -310.99200439453125, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66717529296875, "logps/rejected": -587.53564453125, "loss": 1.0213, "margin_dpo/margin_mean": 238.38136291503906, "margin_dpo/margin_std": 253.5137481689453, "step": 320 }, { "KL/chosen_KL_mean": -314.7099304199219, "KL/mean": -417.4952392578125, "KL/rejected_KL_mean": -520.2805786132812, "KL/std": 242.76919555664062, "epoch": 0.4713656387665198, "fcm_dpo/beta": 0.0020058308728039265, "fcm_dpo/delta": -0.013504378497600555, "fcm_dpo/margin": 205.5706787109375, "fcm_dpo/q_t": 0.4050200581550598, "grad_norm": 22.613510131835938, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.3279907703399658, "logits/rejected": -0.31290388107299805, "logps/chosen": -376.25860595703125, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -611.921630859375, "loss": 1.0825, "margin_dpo/margin_mean": 205.5706787109375, "margin_dpo/margin_std": 260.4750061035156, "step": 321 }, { "KL/chosen_KL_mean": -277.9368896484375, "KL/mean": -393.11395263671875, "KL/rejected_KL_mean": -508.2909851074219, "KL/std": 222.57351684570312, "epoch": 0.47283406754772395, "fcm_dpo/beta": 0.0019898181781172752, "fcm_dpo/delta": -0.06161149963736534, "fcm_dpo/margin": 230.3541259765625, "fcm_dpo/q_t": 0.3928527235984802, "grad_norm": 27.584213256835938, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.3369908928871155, "logits/rejected": -0.33506596088409424, "logps/chosen": -335.2269287109375, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -604.0408935546875, "loss": 1.0334, "margin_dpo/margin_mean": 230.3541259765625, "margin_dpo/margin_std": 238.74765014648438, "step": 322 }, { "KL/chosen_KL_mean": -308.34637451171875, "KL/mean": -404.15570068359375, "KL/rejected_KL_mean": -499.96502685546875, "KL/std": 221.2327423095703, "epoch": 0.47430249632892807, "fcm_dpo/beta": 0.0019924892112612724, "fcm_dpo/delta": 0.018896615132689476, "fcm_dpo/margin": 191.61865234375, "fcm_dpo/q_t": 0.4107508361339569, "grad_norm": 37.98896789550781, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.3975059986114502, "logits/rejected": -0.39840167760849, "logps/chosen": -359.580322265625, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -575.0269775390625, "loss": 1.0976, "margin_dpo/margin_mean": 191.61865234375, "margin_dpo/margin_std": 239.77487182617188, "step": 323 }, { "KL/chosen_KL_mean": -363.1470031738281, "KL/mean": -459.2158203125, "KL/rejected_KL_mean": -555.28466796875, "KL/std": 245.32525634765625, "epoch": 0.47577092511013214, "fcm_dpo/beta": 0.0020171115174889565, "fcm_dpo/delta": 0.011590391397476196, "fcm_dpo/margin": 192.13768005371094, "fcm_dpo/q_t": 0.4130977690219879, "grad_norm": 45.15947723388672, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.382703959941864, "logits/rejected": -0.36886465549468994, "logps/chosen": -428.28216552734375, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750854492188, "logps/rejected": -641.76220703125, "loss": 1.1188, "margin_dpo/margin_mean": 192.13768005371094, "margin_dpo/margin_std": 280.11346435546875, "step": 324 }, { "KL/chosen_KL_mean": -291.805419921875, "KL/mean": -372.7662353515625, "KL/rejected_KL_mean": -453.72705078125, "KL/std": 219.71487426757812, "epoch": 0.47723935389133626, "fcm_dpo/beta": 0.002026339527219534, "fcm_dpo/delta": 0.07426172494888306, "fcm_dpo/margin": 161.92161560058594, "fcm_dpo/q_t": 0.4235909581184387, "grad_norm": 27.52566909790039, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.38832759857177734, "logits/rejected": -0.36508649587631226, "logps/chosen": -348.02099609375, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.08592987060547, "logps/rejected": -523.81298828125, "loss": 1.1353, "margin_dpo/margin_mean": 161.921630859375, "margin_dpo/margin_std": 220.10113525390625, "step": 325 }, { "KL/chosen_KL_mean": -334.75213623046875, "KL/mean": -415.12261962890625, "KL/rejected_KL_mean": -495.49310302734375, "KL/std": 209.2508544921875, "epoch": 0.4787077826725404, "fcm_dpo/beta": 0.00205246196128428, "fcm_dpo/delta": 0.07252933084964752, "fcm_dpo/margin": 160.740966796875, "fcm_dpo/q_t": 0.42195820808410645, "grad_norm": 49.80532455444336, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.4096953272819519, "logits/rejected": -0.3885076642036438, "logps/chosen": -407.47711181640625, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.8467788696289, "logps/rejected": -575.33984375, "loss": 1.1258, "margin_dpo/margin_mean": 160.74095153808594, "margin_dpo/margin_std": 199.02053833007812, "step": 326 }, { "KL/chosen_KL_mean": -289.6980895996094, "KL/mean": -412.4832763671875, "KL/rejected_KL_mean": -535.2684936523438, "KL/std": 226.18698120117188, "epoch": 0.4801762114537445, "fcm_dpo/beta": 0.0020246244966983795, "fcm_dpo/delta": -0.10278213024139404, "fcm_dpo/margin": 245.57040405273438, "fcm_dpo/q_t": 0.38342660665512085, "grad_norm": 34.839141845703125, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.40583473443984985, "logits/rejected": -0.3938768804073334, "logps/chosen": -358.83251953125, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -647.2022705078125, "loss": 1.0081, "margin_dpo/margin_mean": 245.57040405273438, "margin_dpo/margin_std": 245.6561279296875, "step": 327 }, { "KL/chosen_KL_mean": -313.56195068359375, "KL/mean": -424.1883850097656, "KL/rejected_KL_mean": -534.8148803710938, "KL/std": 248.53616333007812, "epoch": 0.48164464023494863, "fcm_dpo/beta": 0.002008104231208563, "fcm_dpo/delta": -0.046333495527505875, "fcm_dpo/margin": 221.25289916992188, "fcm_dpo/q_t": 0.3990749418735504, "grad_norm": 31.601329803466797, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.37076377868652344, "logits/rejected": -0.3748210668563843, "logps/chosen": -373.2491455078125, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -625.6698608398438, "loss": 1.0624, "margin_dpo/margin_mean": 221.25289916992188, "margin_dpo/margin_std": 280.74591064453125, "step": 328 }, { "KL/chosen_KL_mean": -348.364013671875, "KL/mean": -463.31585693359375, "KL/rejected_KL_mean": -578.2677001953125, "KL/std": 264.98114013671875, "epoch": 0.4831130690161527, "fcm_dpo/beta": 0.001973442966118455, "fcm_dpo/delta": -0.057223327457904816, "fcm_dpo/margin": 229.90365600585938, "fcm_dpo/q_t": 0.3950253129005432, "grad_norm": 29.35655975341797, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.38779300451278687, "logits/rejected": -0.3826950788497925, "logps/chosen": -413.6102294921875, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -678.9653930664062, "loss": 1.0626, "margin_dpo/margin_mean": 229.90365600585938, "margin_dpo/margin_std": 285.9508972167969, "step": 329 }, { "KL/chosen_KL_mean": -297.8939208984375, "KL/mean": -418.2544250488281, "KL/rejected_KL_mean": -538.614990234375, "KL/std": 251.08877563476562, "epoch": 0.4845814977973568, "fcm_dpo/beta": 0.00195663096383214, "fcm_dpo/delta": -0.07452473044395447, "fcm_dpo/margin": 240.72103881835938, "fcm_dpo/q_t": 0.3914690613746643, "grad_norm": 24.276758193969727, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.37196603417396545, "logits/rejected": -0.37232887744903564, "logps/chosen": -344.89227294921875, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -625.4918212890625, "loss": 1.0349, "margin_dpo/margin_mean": 240.72105407714844, "margin_dpo/margin_std": 271.8794860839844, "step": 330 }, { "KL/chosen_KL_mean": -324.826904296875, "KL/mean": -441.84423828125, "KL/rejected_KL_mean": -558.861572265625, "KL/std": 205.87355041503906, "epoch": 0.48604992657856094, "fcm_dpo/beta": 0.0019327991176396608, "fcm_dpo/delta": -0.054849639534950256, "fcm_dpo/margin": 234.03463745117188, "fcm_dpo/q_t": 0.3927758038043976, "grad_norm": 30.4269962310791, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.4110090136528015, "logits/rejected": -0.4085603356361389, "logps/chosen": -375.3511657714844, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -647.876953125, "loss": 1.02, "margin_dpo/margin_mean": 234.03463745117188, "margin_dpo/margin_std": 203.57736206054688, "step": 331 }, { "KL/chosen_KL_mean": -331.472412109375, "KL/mean": -412.5047302246094, "KL/rejected_KL_mean": -493.5369873046875, "KL/std": 217.41600036621094, "epoch": 0.48751835535976507, "fcm_dpo/beta": 0.0019560197833925486, "fcm_dpo/delta": 0.08553433418273926, "fcm_dpo/margin": 162.0646209716797, "fcm_dpo/q_t": 0.42724329233169556, "grad_norm": 26.13042449951172, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.43391871452331543, "logits/rejected": -0.4177435040473938, "logps/chosen": -380.6527099609375, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -570.0221557617188, "loss": 1.1466, "margin_dpo/margin_mean": 162.0646209716797, "margin_dpo/margin_std": 242.46429443359375, "step": 332 }, { "KL/chosen_KL_mean": -346.9090270996094, "KL/mean": -456.2929382324219, "KL/rejected_KL_mean": -565.6768798828125, "KL/std": 254.33526611328125, "epoch": 0.4889867841409692, "fcm_dpo/beta": 0.0019417135044932365, "fcm_dpo/delta": -0.02682226523756981, "fcm_dpo/margin": 218.76780700683594, "fcm_dpo/q_t": 0.40386736392974854, "grad_norm": 26.93458366394043, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.43507999181747437, "logits/rejected": -0.43221938610076904, "logps/chosen": -410.664794921875, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -660.720947265625, "loss": 1.0917, "margin_dpo/margin_mean": 218.76780700683594, "margin_dpo/margin_std": 306.374267578125, "step": 333 }, { "KL/chosen_KL_mean": -324.4681396484375, "KL/mean": -426.024169921875, "KL/rejected_KL_mean": -527.5802001953125, "KL/std": 273.755859375, "epoch": 0.49045521292217326, "fcm_dpo/beta": 0.0019499869085848331, "fcm_dpo/delta": 0.004075163975358009, "fcm_dpo/margin": 203.11204528808594, "fcm_dpo/q_t": 0.41010695695877075, "grad_norm": 25.05001449584961, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.42128774523735046, "logits/rejected": -0.4194262623786926, "logps/chosen": -391.4478759765625, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -622.8970947265625, "loss": 1.1117, "margin_dpo/margin_mean": 203.112060546875, "margin_dpo/margin_std": 303.74609375, "step": 334 }, { "KL/chosen_KL_mean": -345.0743408203125, "KL/mean": -423.9620361328125, "KL/rejected_KL_mean": -502.8497314453125, "KL/std": 249.84854125976562, "epoch": 0.4919236417033774, "fcm_dpo/beta": 0.0019520404748618603, "fcm_dpo/delta": -0.0027985575143247843, "fcm_dpo/margin": 157.77536010742188, "fcm_dpo/q_t": 0.42951834201812744, "grad_norm": 27.41286277770996, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.45320796966552734, "logits/rejected": -0.43336862325668335, "logps/chosen": -407.6168212890625, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.61770629882812, "logps/rejected": -590.4674072265625, "loss": 1.1714, "margin_dpo/margin_mean": 157.77536010742188, "margin_dpo/margin_std": 273.3466491699219, "step": 335 }, { "KL/chosen_KL_mean": -327.80242919921875, "KL/mean": -452.0769348144531, "KL/rejected_KL_mean": -576.3514404296875, "KL/std": 280.5369873046875, "epoch": 0.4933920704845815, "fcm_dpo/beta": 0.0019327957415953279, "fcm_dpo/delta": -0.08437924087047577, "fcm_dpo/margin": 248.549072265625, "fcm_dpo/q_t": 0.3922100067138672, "grad_norm": 27.173513412475586, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.41087979078292847, "logits/rejected": -0.41120561957359314, "logps/chosen": -382.33355712890625, "logps/ref_chosen": -54.53115463256836, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -680.7556762695312, "loss": 1.0543, "margin_dpo/margin_mean": 248.549072265625, "margin_dpo/margin_std": 324.4247131347656, "step": 336 }, { "KL/chosen_KL_mean": -295.12164306640625, "KL/mean": -416.0631103515625, "KL/rejected_KL_mean": -537.0045166015625, "KL/std": 226.8846893310547, "epoch": 0.4948604992657856, "fcm_dpo/beta": 0.0018971418030560017, "fcm_dpo/delta": -0.061946481466293335, "fcm_dpo/margin": 241.8828887939453, "fcm_dpo/q_t": 0.39354777336120605, "grad_norm": 32.10745620727539, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.4443337321281433, "logits/rejected": -0.4344029426574707, "logps/chosen": -360.2503356933594, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -638.7315673828125, "loss": 1.0376, "margin_dpo/margin_mean": 241.88287353515625, "margin_dpo/margin_std": 264.9962463378906, "step": 337 }, { "KL/chosen_KL_mean": -272.0726318359375, "KL/mean": -388.24578857421875, "KL/rejected_KL_mean": -504.41900634765625, "KL/std": 222.21337890625, "epoch": 0.49632892804698975, "fcm_dpo/beta": 0.0018818873213604093, "fcm_dpo/delta": -0.03903310373425484, "fcm_dpo/margin": 232.34637451171875, "fcm_dpo/q_t": 0.3970540463924408, "grad_norm": 31.5482234954834, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.4323264956474304, "logits/rejected": -0.4272562265396118, "logps/chosen": -330.4953308105469, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -593.487548828125, "loss": 1.0405, "margin_dpo/margin_mean": 232.34637451171875, "margin_dpo/margin_std": 230.95556640625, "step": 338 }, { "KL/chosen_KL_mean": -287.5809326171875, "KL/mean": -402.3648986816406, "KL/rejected_KL_mean": -517.1488647460938, "KL/std": 230.27059936523438, "epoch": 0.4977973568281938, "fcm_dpo/beta": 0.0018855368252843618, "fcm_dpo/delta": -0.03534376621246338, "fcm_dpo/margin": 229.56790161132812, "fcm_dpo/q_t": 0.3984706401824951, "grad_norm": 28.834077835083008, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.473450243473053, "logits/rejected": -0.46168074011802673, "logps/chosen": -347.5762634277344, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -621.0597534179688, "loss": 1.0496, "margin_dpo/margin_mean": 229.56790161132812, "margin_dpo/margin_std": 228.10092163085938, "step": 339 }, { "KL/chosen_KL_mean": -303.6623840332031, "KL/mean": -404.23040771484375, "KL/rejected_KL_mean": -504.79840087890625, "KL/std": 221.79913330078125, "epoch": 0.49926578560939794, "fcm_dpo/beta": 0.001867425860837102, "fcm_dpo/delta": 0.025164764374494553, "fcm_dpo/margin": 201.13601684570312, "fcm_dpo/q_t": 0.412939190864563, "grad_norm": 31.226028442382812, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.3601798415184021, "logits/rejected": -0.338324636220932, "logps/chosen": -356.49261474609375, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723114013672, "logps/rejected": -577.9056396484375, "loss": 1.1047, "margin_dpo/margin_mean": 201.13601684570312, "margin_dpo/margin_std": 265.9939270019531, "step": 340 }, { "KL/chosen_KL_mean": -305.0142517089844, "KL/mean": -421.94378662109375, "KL/rejected_KL_mean": -538.8733520507812, "KL/std": 246.80142211914062, "epoch": 0.5007342143906021, "fcm_dpo/beta": 0.001860608346760273, "fcm_dpo/delta": -0.03702467307448387, "fcm_dpo/margin": 233.8590850830078, "fcm_dpo/q_t": 0.3988453149795532, "grad_norm": 25.454540252685547, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.38325321674346924, "logits/rejected": -0.38400086760520935, "logps/chosen": -352.9140625, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -640.6832275390625, "loss": 1.0527, "margin_dpo/margin_mean": 233.8590850830078, "margin_dpo/margin_std": 255.57666015625, "step": 341 }, { "KL/chosen_KL_mean": -308.7816162109375, "KL/mean": -410.565673828125, "KL/rejected_KL_mean": -512.3497314453125, "KL/std": 233.906982421875, "epoch": 0.5022026431718062, "fcm_dpo/beta": 0.001875395653769374, "fcm_dpo/delta": 0.018558282405138016, "fcm_dpo/margin": 203.568115234375, "fcm_dpo/q_t": 0.41176915168762207, "grad_norm": 25.053524017333984, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.42473480105400085, "logits/rejected": -0.40138232707977295, "logps/chosen": -380.77825927734375, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -604.9393310546875, "loss": 1.1045, "margin_dpo/margin_mean": 203.568115234375, "margin_dpo/margin_std": 277.70416259765625, "step": 342 }, { "KL/chosen_KL_mean": -299.65240478515625, "KL/mean": -421.80462646484375, "KL/rejected_KL_mean": -543.956787109375, "KL/std": 237.9898223876953, "epoch": 0.5036710719530103, "fcm_dpo/beta": 0.0018535295967012644, "fcm_dpo/delta": -0.05536113679409027, "fcm_dpo/margin": 244.30438232421875, "fcm_dpo/q_t": 0.39244067668914795, "grad_norm": 22.644685745239258, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.44045504927635193, "logits/rejected": -0.44200748205184937, "logps/chosen": -354.05804443359375, "logps/ref_chosen": -54.405616760253906, "logps/ref_rejected": -111.04142761230469, "logps/rejected": -654.9982299804688, "loss": 1.0218, "margin_dpo/margin_mean": 244.30438232421875, "margin_dpo/margin_std": 211.76568603515625, "step": 343 }, { "KL/chosen_KL_mean": -298.38482666015625, "KL/mean": -410.74530029296875, "KL/rejected_KL_mean": -523.105712890625, "KL/std": 269.9559326171875, "epoch": 0.5051395007342144, "fcm_dpo/beta": 0.0018617368768900633, "fcm_dpo/delta": -0.020948857069015503, "fcm_dpo/margin": 224.7209014892578, "fcm_dpo/q_t": 0.4059738516807556, "grad_norm": 23.597854614257812, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.3812987804412842, "logits/rejected": -0.3833147883415222, "logps/chosen": -352.3494873046875, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -613.7291259765625, "loss": 1.0795, "margin_dpo/margin_mean": 224.7209014892578, "margin_dpo/margin_std": 275.2275390625, "step": 344 }, { "KL/chosen_KL_mean": -355.3021240234375, "KL/mean": -474.9087219238281, "KL/rejected_KL_mean": -594.5153198242188, "KL/std": 251.06130981445312, "epoch": 0.5066079295154186, "fcm_dpo/beta": 0.0018341855611652136, "fcm_dpo/delta": -0.04054499790072441, "fcm_dpo/margin": 239.21319580078125, "fcm_dpo/q_t": 0.39786964654922485, "grad_norm": 20.200607299804688, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.4079459309577942, "logits/rejected": -0.4065578877925873, "logps/chosen": -416.98779296875, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49041748046875, "logps/rejected": -694.0057373046875, "loss": 1.0538, "margin_dpo/margin_mean": 239.21319580078125, "margin_dpo/margin_std": 273.10797119140625, "step": 345 }, { "KL/chosen_KL_mean": -354.8230895996094, "KL/mean": -467.76507568359375, "KL/rejected_KL_mean": -580.70703125, "KL/std": 250.11328125, "epoch": 0.5080763582966226, "fcm_dpo/beta": 0.001822044956497848, "fcm_dpo/delta": -0.012084376066923141, "fcm_dpo/margin": 225.88394165039062, "fcm_dpo/q_t": 0.4034339189529419, "grad_norm": 21.91631317138672, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.36091554164886475, "logits/rejected": -0.35045647621154785, "logps/chosen": -410.079345703125, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -658.122314453125, "loss": 1.0737, "margin_dpo/margin_mean": 225.88394165039062, "margin_dpo/margin_std": 270.33984375, "step": 346 }, { "KL/chosen_KL_mean": -356.4461364746094, "KL/mean": -470.38409423828125, "KL/rejected_KL_mean": -584.322021484375, "KL/std": 266.44903564453125, "epoch": 0.5095447870778267, "fcm_dpo/beta": 0.0018181647174060345, "fcm_dpo/delta": -0.014934061095118523, "fcm_dpo/margin": 227.87588500976562, "fcm_dpo/q_t": 0.40489083528518677, "grad_norm": 24.682762145996094, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.38032758235931396, "logits/rejected": -0.37665826082229614, "logps/chosen": -414.01239013671875, "logps/ref_chosen": -57.56623840332031, "logps/ref_rejected": -92.35509490966797, "logps/rejected": -676.6771240234375, "loss": 1.083, "margin_dpo/margin_mean": 227.87588500976562, "margin_dpo/margin_std": 299.0618896484375, "step": 347 }, { "KL/chosen_KL_mean": -315.14227294921875, "KL/mean": -411.5518493652344, "KL/rejected_KL_mean": -507.9614562988281, "KL/std": 229.25570678710938, "epoch": 0.5110132158590308, "fcm_dpo/beta": 0.001820417819544673, "fcm_dpo/delta": 0.05043090134859085, "fcm_dpo/margin": 192.81918334960938, "fcm_dpo/q_t": 0.4191325306892395, "grad_norm": 23.302974700927734, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.4306999444961548, "logits/rejected": -0.4324670433998108, "logps/chosen": -371.4599609375, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13836669921875, "logps/rejected": -597.099853515625, "loss": 1.1225, "margin_dpo/margin_mean": 192.81918334960938, "margin_dpo/margin_std": 263.36297607421875, "step": 348 }, { "KL/chosen_KL_mean": -301.320556640625, "KL/mean": -410.7008056640625, "KL/rejected_KL_mean": -520.0811157226562, "KL/std": 251.19491577148438, "epoch": 0.5124816446402349, "fcm_dpo/beta": 0.0018413200741633773, "fcm_dpo/delta": -0.003216017037630081, "fcm_dpo/margin": 218.76055908203125, "fcm_dpo/q_t": 0.4088849425315857, "grad_norm": 18.977298736572266, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.38458961248397827, "logits/rejected": -0.38436293601989746, "logps/chosen": -359.3460693359375, "logps/ref_chosen": -58.025516510009766, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -617.5862426757812, "loss": 1.1007, "margin_dpo/margin_mean": 218.7605438232422, "margin_dpo/margin_std": 311.55657958984375, "step": 349 }, { "KL/chosen_KL_mean": -319.11181640625, "KL/mean": -425.6887512207031, "KL/rejected_KL_mean": -532.2657470703125, "KL/std": 239.318603515625, "epoch": 0.5139500734214391, "fcm_dpo/beta": 0.0018291505984961987, "fcm_dpo/delta": 0.010211531072854996, "fcm_dpo/margin": 213.15390014648438, "fcm_dpo/q_t": 0.4092941880226135, "grad_norm": 30.917598724365234, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.4026058614253998, "logits/rejected": -0.40751904249191284, "logps/chosen": -383.44232177734375, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -622.1373291015625, "loss": 1.1063, "margin_dpo/margin_mean": 213.15390014648438, "margin_dpo/margin_std": 299.6205749511719, "step": 350 }, { "KL/chosen_KL_mean": -282.3033447265625, "KL/mean": -412.9404602050781, "KL/rejected_KL_mean": -543.5775146484375, "KL/std": 274.8599853515625, "epoch": 0.5154185022026432, "fcm_dpo/beta": 0.001811300404369831, "fcm_dpo/delta": -0.07729293406009674, "fcm_dpo/margin": 261.2742004394531, "fcm_dpo/q_t": 0.3929722309112549, "grad_norm": 29.79630470275879, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.41858357191085815, "logits/rejected": -0.4184862971305847, "logps/chosen": -342.9754943847656, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -645.1429443359375, "loss": 1.0441, "margin_dpo/margin_mean": 261.2742004394531, "margin_dpo/margin_std": 318.12158203125, "step": 351 }, { "KL/chosen_KL_mean": -331.2454528808594, "KL/mean": -410.2402648925781, "KL/rejected_KL_mean": -489.23504638671875, "KL/std": 247.09182739257812, "epoch": 0.5168869309838473, "fcm_dpo/beta": 0.0018348516896367073, "fcm_dpo/delta": 0.11355704069137573, "fcm_dpo/margin": 157.98960876464844, "fcm_dpo/q_t": 0.4335269033908844, "grad_norm": 34.465721130371094, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.37872931361198425, "logits/rejected": -0.3453086316585541, "logps/chosen": -402.18890380859375, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -565.8770751953125, "loss": 1.1791, "margin_dpo/margin_mean": 157.9896240234375, "margin_dpo/margin_std": 274.4894104003906, "step": 352 }, { "KL/chosen_KL_mean": -305.1307373046875, "KL/mean": -407.434814453125, "KL/rejected_KL_mean": -509.7388916015625, "KL/std": 240.89715576171875, "epoch": 0.5183553597650514, "fcm_dpo/beta": 0.0018470755312591791, "fcm_dpo/delta": 0.022442463785409927, "fcm_dpo/margin": 204.60816955566406, "fcm_dpo/q_t": 0.4131419360637665, "grad_norm": 25.36398696899414, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.420898973941803, "logits/rejected": -0.40882444381713867, "logps/chosen": -363.52606201171875, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33553314208984, "logps/rejected": -590.074462890625, "loss": 1.1061, "margin_dpo/margin_mean": 204.60816955566406, "margin_dpo/margin_std": 271.8081970214844, "step": 353 }, { "KL/chosen_KL_mean": -247.3609619140625, "KL/mean": -360.4647216796875, "KL/rejected_KL_mean": -473.56854248046875, "KL/std": 221.80062866210938, "epoch": 0.5198237885462555, "fcm_dpo/beta": 0.0018517575226724148, "fcm_dpo/delta": -0.019758004695177078, "fcm_dpo/margin": 226.20755004882812, "fcm_dpo/q_t": 0.4013108015060425, "grad_norm": 31.546432495117188, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.3748503029346466, "logits/rejected": -0.3650524616241455, "logps/chosen": -307.1639404296875, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -562.3260498046875, "loss": 1.0535, "margin_dpo/margin_mean": 226.20755004882812, "margin_dpo/margin_std": 229.88156127929688, "step": 354 }, { "KL/chosen_KL_mean": -252.09201049804688, "KL/mean": -361.91143798828125, "KL/rejected_KL_mean": -471.73089599609375, "KL/std": 225.0244140625, "epoch": 0.5212922173274597, "fcm_dpo/beta": 0.0018464226741343737, "fcm_dpo/delta": -0.005889484658837318, "fcm_dpo/margin": 219.6388702392578, "fcm_dpo/q_t": 0.4039592742919922, "grad_norm": 40.25979232788086, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.4974886476993561, "logits/rejected": -0.4918820858001709, "logps/chosen": -306.22052001953125, "logps/ref_chosen": -54.12849807739258, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -554.136962890625, "loss": 1.0617, "margin_dpo/margin_mean": 219.6388702392578, "margin_dpo/margin_std": 220.4207763671875, "step": 355 }, { "KL/chosen_KL_mean": -312.7377624511719, "KL/mean": -375.03485107421875, "KL/rejected_KL_mean": -437.3319091796875, "KL/std": 239.71302795410156, "epoch": 0.5227606461086637, "fcm_dpo/beta": 0.001856822520494461, "fcm_dpo/delta": 0.0374276302754879, "fcm_dpo/margin": 124.59414672851562, "fcm_dpo/q_t": 0.44722917675971985, "grad_norm": 30.493345260620117, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.4328617453575134, "logits/rejected": -0.4156278967857361, "logps/chosen": -377.41156005859375, "logps/ref_chosen": -64.6738052368164, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -513.231201171875, "loss": 1.2406, "margin_dpo/margin_mean": 124.59414672851562, "margin_dpo/margin_std": 293.3489990234375, "step": 356 }, { "KL/chosen_KL_mean": -279.3336181640625, "KL/mean": -384.30426025390625, "KL/rejected_KL_mean": -489.27496337890625, "KL/std": 236.43228149414062, "epoch": 0.5242290748898678, "fcm_dpo/beta": 0.0018584367353469133, "fcm_dpo/delta": 0.009893104434013367, "fcm_dpo/margin": 209.9413604736328, "fcm_dpo/q_t": 0.40836572647094727, "grad_norm": 28.883237838745117, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.40963172912597656, "logits/rejected": -0.3999664783477783, "logps/chosen": -332.05938720703125, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -576.1160888671875, "loss": 1.0839, "margin_dpo/margin_mean": 209.9413604736328, "margin_dpo/margin_std": 243.27175903320312, "step": 357 }, { "KL/chosen_KL_mean": -262.045166015625, "KL/mean": -358.10491943359375, "KL/rejected_KL_mean": -454.16461181640625, "KL/std": 234.3748016357422, "epoch": 0.5256975036710719, "fcm_dpo/beta": 0.001879463205114007, "fcm_dpo/delta": 0.04035775363445282, "fcm_dpo/margin": 192.11947631835938, "fcm_dpo/q_t": 0.4165921211242676, "grad_norm": 24.633974075317383, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.46384721994400024, "logits/rejected": -0.4513862133026123, "logps/chosen": -325.2506103515625, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -542.5379028320312, "loss": 1.1107, "margin_dpo/margin_mean": 192.11947631835938, "margin_dpo/margin_std": 253.2946014404297, "step": 358 }, { "KL/chosen_KL_mean": -302.02459716796875, "KL/mean": -414.83648681640625, "KL/rejected_KL_mean": -527.6484375, "KL/std": 234.064453125, "epoch": 0.527165932452276, "fcm_dpo/beta": 0.001871941378340125, "fcm_dpo/delta": -0.02348851040005684, "fcm_dpo/margin": 225.623779296875, "fcm_dpo/q_t": 0.40334317088127136, "grad_norm": 30.42852783203125, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.3995208442211151, "logits/rejected": -0.39267587661743164, "logps/chosen": -358.39483642578125, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -609.8221435546875, "loss": 1.0718, "margin_dpo/margin_mean": 225.62379455566406, "margin_dpo/margin_std": 277.3382873535156, "step": 359 }, { "KL/chosen_KL_mean": -297.1678771972656, "KL/mean": -389.17437744140625, "KL/rejected_KL_mean": -481.1808776855469, "KL/std": 202.0519561767578, "epoch": 0.5286343612334802, "fcm_dpo/beta": 0.0018919282592833042, "fcm_dpo/delta": 0.05358727648854256, "fcm_dpo/margin": 184.01300048828125, "fcm_dpo/q_t": 0.4175671935081482, "grad_norm": 30.946216583251953, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.3726957440376282, "logits/rejected": -0.3605055510997772, "logps/chosen": -348.6282653808594, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892059326172, "logps/rejected": -551.019775390625, "loss": 1.1038, "margin_dpo/margin_mean": 184.01300048828125, "margin_dpo/margin_std": 202.7188262939453, "step": 360 }, { "KL/chosen_KL_mean": -327.22723388671875, "KL/mean": -416.80938720703125, "KL/rejected_KL_mean": -506.39154052734375, "KL/std": 240.4334716796875, "epoch": 0.5301027900146843, "fcm_dpo/beta": 0.001915230881422758, "fcm_dpo/delta": 0.05861767381429672, "fcm_dpo/margin": 179.164306640625, "fcm_dpo/q_t": 0.42051640152931213, "grad_norm": 29.356969833374023, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.39026200771331787, "logits/rejected": -0.3884269595146179, "logps/chosen": -381.0967712402344, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.7692642211914, "logps/rejected": -597.1607666015625, "loss": 1.1314, "margin_dpo/margin_mean": 179.164306640625, "margin_dpo/margin_std": 260.03955078125, "step": 361 }, { "KL/chosen_KL_mean": -290.72216796875, "KL/mean": -430.7841796875, "KL/rejected_KL_mean": -570.84619140625, "KL/std": 250.39520263671875, "epoch": 0.5315712187958884, "fcm_dpo/beta": 0.001872203079983592, "fcm_dpo/delta": -0.13209237158298492, "fcm_dpo/margin": 280.12396240234375, "fcm_dpo/q_t": 0.3787139654159546, "grad_norm": 28.38374900817871, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.3565158247947693, "logits/rejected": -0.3644503951072693, "logps/chosen": -349.3612060546875, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -676.4281005859375, "loss": 0.9864, "margin_dpo/margin_mean": 280.12396240234375, "margin_dpo/margin_std": 268.27490234375, "step": 362 }, { "KL/chosen_KL_mean": -277.21441650390625, "KL/mean": -418.75067138671875, "KL/rejected_KL_mean": -560.2869262695312, "KL/std": 253.55523681640625, "epoch": 0.5330396475770925, "fcm_dpo/beta": 0.0018373643979430199, "fcm_dpo/delta": -0.12650209665298462, "fcm_dpo/margin": 283.072509765625, "fcm_dpo/q_t": 0.37961655855178833, "grad_norm": 22.140954971313477, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.3845895528793335, "logits/rejected": -0.3806605339050293, "logps/chosen": -321.7727966308594, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -634.98193359375, "loss": 0.9916, "margin_dpo/margin_mean": 283.072509765625, "margin_dpo/margin_std": 275.70379638671875, "step": 363 }, { "KL/chosen_KL_mean": -303.1225280761719, "KL/mean": -410.65924072265625, "KL/rejected_KL_mean": -518.1959228515625, "KL/std": 250.283447265625, "epoch": 0.5345080763582967, "fcm_dpo/beta": 0.0018285869155079126, "fcm_dpo/delta": 0.006663650274276733, "fcm_dpo/margin": 215.0734100341797, "fcm_dpo/q_t": 0.4090085029602051, "grad_norm": 24.997861862182617, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.4224643111228943, "logits/rejected": -0.4338565468788147, "logps/chosen": -352.01715087890625, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -609.5916748046875, "loss": 1.0981, "margin_dpo/margin_mean": 215.0734100341797, "margin_dpo/margin_std": 289.23095703125, "step": 364 }, { "KL/chosen_KL_mean": -294.5177001953125, "KL/mean": -405.79449462890625, "KL/rejected_KL_mean": -517.0712890625, "KL/std": 263.12286376953125, "epoch": 0.5359765051395007, "fcm_dpo/beta": 0.0018205586820840836, "fcm_dpo/delta": -0.005390607286244631, "fcm_dpo/margin": 222.5535430908203, "fcm_dpo/q_t": 0.4073421359062195, "grad_norm": 24.999404907226562, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.4111742377281189, "logits/rejected": -0.4230782985687256, "logps/chosen": -346.01043701171875, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -609.77294921875, "loss": 1.0785, "margin_dpo/margin_mean": 222.5535430908203, "margin_dpo/margin_std": 272.5834655761719, "step": 365 }, { "KL/chosen_KL_mean": -270.3475646972656, "KL/mean": -384.409423828125, "KL/rejected_KL_mean": -498.47125244140625, "KL/std": 244.6814422607422, "epoch": 0.5374449339207048, "fcm_dpo/beta": 0.001808905741199851, "fcm_dpo/delta": -0.013614185154438019, "fcm_dpo/margin": 228.12367248535156, "fcm_dpo/q_t": 0.4051057696342468, "grad_norm": 21.499807357788086, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.41397756338119507, "logits/rejected": -0.4283139109611511, "logps/chosen": -315.0681457519531, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -581.7816162109375, "loss": 1.0815, "margin_dpo/margin_mean": 228.12368774414062, "margin_dpo/margin_std": 290.66448974609375, "step": 366 }, { "KL/chosen_KL_mean": -284.0663146972656, "KL/mean": -375.30828857421875, "KL/rejected_KL_mean": -466.55029296875, "KL/std": 219.5314178466797, "epoch": 0.5389133627019089, "fcm_dpo/beta": 0.0018367799930274487, "fcm_dpo/delta": 0.06687445938587189, "fcm_dpo/margin": 182.48397827148438, "fcm_dpo/q_t": 0.4210050106048584, "grad_norm": 19.561819076538086, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.418517529964447, "logits/rejected": -0.40183088183403015, "logps/chosen": -342.47174072265625, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -543.3016357421875, "loss": 1.1247, "margin_dpo/margin_mean": 182.48397827148438, "margin_dpo/margin_std": 237.13613891601562, "step": 367 }, { "KL/chosen_KL_mean": -266.5557861328125, "KL/mean": -412.49371337890625, "KL/rejected_KL_mean": -558.431640625, "KL/std": 249.94715881347656, "epoch": 0.540381791483113, "fcm_dpo/beta": 0.0018038455164059997, "fcm_dpo/delta": -0.1335085779428482, "fcm_dpo/margin": 291.8758850097656, "fcm_dpo/q_t": 0.37479937076568604, "grad_norm": 45.23311233520508, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.41935569047927856, "logits/rejected": -0.4271644651889801, "logps/chosen": -311.00830078125, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -656.9869384765625, "loss": 0.9664, "margin_dpo/margin_mean": 291.8758544921875, "margin_dpo/margin_std": 225.90078735351562, "step": 368 }, { "KL/chosen_KL_mean": -350.90386962890625, "KL/mean": -427.52069091796875, "KL/rejected_KL_mean": -504.13751220703125, "KL/std": 248.12368774414062, "epoch": 0.5418502202643172, "fcm_dpo/beta": 0.001830049091950059, "fcm_dpo/delta": 0.12243049591779709, "fcm_dpo/margin": 153.2336883544922, "fcm_dpo/q_t": 0.43473055958747864, "grad_norm": 29.0583553314209, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.43031615018844604, "logits/rejected": -0.413091778755188, "logps/chosen": -422.2853698730469, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -595.433349609375, "loss": 1.1876, "margin_dpo/margin_mean": 153.2336883544922, "margin_dpo/margin_std": 278.4016418457031, "step": 369 }, { "KL/chosen_KL_mean": -361.59027099609375, "KL/mean": -435.93548583984375, "KL/rejected_KL_mean": -510.28070068359375, "KL/std": 257.38360595703125, "epoch": 0.5433186490455213, "fcm_dpo/beta": 0.001874544657766819, "fcm_dpo/delta": 0.12432844936847687, "fcm_dpo/margin": 148.69049072265625, "fcm_dpo/q_t": 0.43603307008743286, "grad_norm": 32.01681137084961, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.4668412208557129, "logits/rejected": -0.4614550471305847, "logps/chosen": -433.19775390625, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -607.54052734375, "loss": 1.1953, "margin_dpo/margin_mean": 148.69049072265625, "margin_dpo/margin_std": 284.5626220703125, "step": 370 }, { "KL/chosen_KL_mean": -344.98797607421875, "KL/mean": -453.09259033203125, "KL/rejected_KL_mean": -561.197265625, "KL/std": 260.2284851074219, "epoch": 0.5447870778267254, "fcm_dpo/beta": 0.0018845757003873587, "fcm_dpo/delta": -0.007901359349489212, "fcm_dpo/margin": 216.20925903320312, "fcm_dpo/q_t": 0.4071364104747772, "grad_norm": 31.476709365844727, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.4888390302658081, "logits/rejected": -0.4776480793952942, "logps/chosen": -414.4024658203125, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -660.369384765625, "loss": 1.0943, "margin_dpo/margin_mean": 216.20925903320312, "margin_dpo/margin_std": 298.52569580078125, "step": 371 }, { "KL/chosen_KL_mean": -332.8981628417969, "KL/mean": -460.857177734375, "KL/rejected_KL_mean": -588.816162109375, "KL/std": 296.18841552734375, "epoch": 0.5462555066079295, "fcm_dpo/beta": 0.0018558462616056204, "fcm_dpo/delta": -0.07873637974262238, "fcm_dpo/margin": 255.91799926757812, "fcm_dpo/q_t": 0.39386242628097534, "grad_norm": 24.130836486816406, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.44447630643844604, "logits/rejected": -0.4267101287841797, "logps/chosen": -394.7161560058594, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53948974609375, "logps/rejected": -667.3556518554688, "loss": 1.0516, "margin_dpo/margin_mean": 255.91799926757812, "margin_dpo/margin_std": 333.7495422363281, "step": 372 }, { "KL/chosen_KL_mean": -361.79736328125, "KL/mean": -481.796875, "KL/rejected_KL_mean": -601.7963256835938, "KL/std": 286.9070129394531, "epoch": 0.5477239353891337, "fcm_dpo/beta": 0.0018471537623554468, "fcm_dpo/delta": -0.04569406807422638, "fcm_dpo/margin": 239.99903869628906, "fcm_dpo/q_t": 0.3971483111381531, "grad_norm": 27.912851333618164, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.4145781695842743, "logits/rejected": -0.3933746814727783, "logps/chosen": -426.01446533203125, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -687.7559814453125, "loss": 1.0572, "margin_dpo/margin_mean": 239.99905395507812, "margin_dpo/margin_std": 276.5765075683594, "step": 373 }, { "KL/chosen_KL_mean": -320.88250732421875, "KL/mean": -430.1343078613281, "KL/rejected_KL_mean": -539.3861083984375, "KL/std": 306.24932861328125, "epoch": 0.5491923641703378, "fcm_dpo/beta": 0.0018239655764773488, "fcm_dpo/delta": 0.0010283365845680237, "fcm_dpo/margin": 218.50360107421875, "fcm_dpo/q_t": 0.41195404529571533, "grad_norm": 25.18126106262207, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.5014743208885193, "logits/rejected": -0.4834766983985901, "logps/chosen": -391.53271484375, "logps/ref_chosen": -70.65018463134766, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -633.0263061523438, "loss": 1.1167, "margin_dpo/margin_mean": 218.50360107421875, "margin_dpo/margin_std": 344.5791015625, "step": 374 }, { "KL/chosen_KL_mean": -325.4786071777344, "KL/mean": -431.72906494140625, "KL/rejected_KL_mean": -537.9794921875, "KL/std": 243.8807373046875, "epoch": 0.5506607929515418, "fcm_dpo/beta": 0.001825918909162283, "fcm_dpo/delta": 0.011883806437253952, "fcm_dpo/margin": 212.50088500976562, "fcm_dpo/q_t": 0.4101504981517792, "grad_norm": 34.303585052490234, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.45803767442703247, "logits/rejected": -0.4583319425582886, "logps/chosen": -385.558837890625, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -626.9177856445312, "loss": 1.1046, "margin_dpo/margin_mean": 212.50088500976562, "margin_dpo/margin_std": 290.35430908203125, "step": 375 }, { "KL/chosen_KL_mean": -315.9422302246094, "KL/mean": -448.74517822265625, "KL/rejected_KL_mean": -581.548095703125, "KL/std": 266.8795166015625, "epoch": 0.5521292217327459, "fcm_dpo/beta": 0.0018120380118489265, "fcm_dpo/delta": -0.08552736043930054, "fcm_dpo/margin": 265.6058654785156, "fcm_dpo/q_t": 0.3894526958465576, "grad_norm": 25.428913116455078, "learning_rate": 2.5e-07, "logits/chosen": -0.44534194469451904, "logits/rejected": -0.4348585307598114, "logps/chosen": -378.6025390625, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.52660369873047, "logps/rejected": -687.07470703125, "loss": 1.0361, "margin_dpo/margin_mean": 265.6058654785156, "margin_dpo/margin_std": 312.51824951171875, "step": 376 }, { "KL/chosen_KL_mean": -318.29412841796875, "KL/mean": -442.55340576171875, "KL/rejected_KL_mean": -566.812744140625, "KL/std": 271.7928771972656, "epoch": 0.55359765051395, "fcm_dpo/beta": 0.0017974915681406856, "fcm_dpo/delta": -0.048895493149757385, "fcm_dpo/margin": 248.51861572265625, "fcm_dpo/q_t": 0.3962140679359436, "grad_norm": 34.237945556640625, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.39339831471443176, "logits/rejected": -0.39519575238227844, "logps/chosen": -372.7728576660156, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -665.5160522460938, "loss": 1.0485, "margin_dpo/margin_mean": 248.5186004638672, "margin_dpo/margin_std": 279.26947021484375, "step": 377 }, { "KL/chosen_KL_mean": -306.4930114746094, "KL/mean": -437.6435546875, "KL/rejected_KL_mean": -568.7940673828125, "KL/std": 253.0465087890625, "epoch": 0.5550660792951542, "fcm_dpo/beta": 0.001769623951986432, "fcm_dpo/delta": -0.06735318899154663, "fcm_dpo/margin": 262.3010559082031, "fcm_dpo/q_t": 0.39068034291267395, "grad_norm": 28.7508544921875, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.4216500520706177, "logits/rejected": -0.43784886598587036, "logps/chosen": -351.5135498046875, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -656.841064453125, "loss": 1.0276, "margin_dpo/margin_mean": 262.3010559082031, "margin_dpo/margin_std": 266.9130554199219, "step": 378 }, { "KL/chosen_KL_mean": -351.31341552734375, "KL/mean": -483.653564453125, "KL/rejected_KL_mean": -615.9937744140625, "KL/std": 259.1260986328125, "epoch": 0.5565345080763583, "fcm_dpo/beta": 0.0017398163909092546, "fcm_dpo/delta": -0.06416111439466476, "fcm_dpo/margin": 264.68035888671875, "fcm_dpo/q_t": 0.39443039894104004, "grad_norm": 26.82124137878418, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.41827017068862915, "logits/rejected": -0.4405589699745178, "logps/chosen": -404.4954833984375, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.3001708984375, "logps/rejected": -730.2939453125, "loss": 1.0472, "margin_dpo/margin_mean": 264.68035888671875, "margin_dpo/margin_std": 312.6787109375, "step": 379 }, { "KL/chosen_KL_mean": -350.41455078125, "KL/mean": -491.9856262207031, "KL/rejected_KL_mean": -633.5567626953125, "KL/std": 297.56414794921875, "epoch": 0.5580029368575624, "fcm_dpo/beta": 0.0017209737561643124, "fcm_dpo/delta": -0.0916648805141449, "fcm_dpo/margin": 283.14215087890625, "fcm_dpo/q_t": 0.3864797353744507, "grad_norm": 27.675559997558594, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.42042213678359985, "logits/rejected": -0.447647362947464, "logps/chosen": -401.767578125, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -737.7484130859375, "loss": 1.0301, "margin_dpo/margin_mean": 283.14215087890625, "margin_dpo/margin_std": 325.3090515136719, "step": 380 }, { "KL/chosen_KL_mean": -372.5402526855469, "KL/mean": -474.24298095703125, "KL/rejected_KL_mean": -575.9457397460938, "KL/std": 250.25369262695312, "epoch": 0.5594713656387665, "fcm_dpo/beta": 0.001720770844258368, "fcm_dpo/delta": 0.05182623863220215, "fcm_dpo/margin": 203.4054718017578, "fcm_dpo/q_t": 0.42006832361221313, "grad_norm": 28.662925720214844, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.42481085658073425, "logits/rejected": -0.4088062047958374, "logps/chosen": -430.34332275390625, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -655.1651611328125, "loss": 1.148, "margin_dpo/margin_mean": 203.40545654296875, "margin_dpo/margin_std": 341.2312927246094, "step": 381 }, { "KL/chosen_KL_mean": -368.66522216796875, "KL/mean": -494.4615173339844, "KL/rejected_KL_mean": -620.2578125, "KL/std": 243.38522338867188, "epoch": 0.5609397944199707, "fcm_dpo/beta": 0.0017197042470797896, "fcm_dpo/delta": -0.03414086997509003, "fcm_dpo/margin": 251.59266662597656, "fcm_dpo/q_t": 0.39874282479286194, "grad_norm": 24.968591690063477, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.4726359248161316, "logits/rejected": -0.47993141412734985, "logps/chosen": -434.68548583984375, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71016693115234, "logps/rejected": -730.968017578125, "loss": 1.0487, "margin_dpo/margin_mean": 251.59266662597656, "margin_dpo/margin_std": 264.68463134765625, "step": 382 }, { "KL/chosen_KL_mean": -396.24041748046875, "KL/mean": -516.6358032226562, "KL/rejected_KL_mean": -637.0311889648438, "KL/std": 278.3402099609375, "epoch": 0.5624082232011748, "fcm_dpo/beta": 0.0017115201335400343, "fcm_dpo/delta": -0.012677527032792568, "fcm_dpo/margin": 240.79074096679688, "fcm_dpo/q_t": 0.4055634140968323, "grad_norm": 30.020652770996094, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.4480942487716675, "logits/rejected": -0.4583819806575775, "logps/chosen": -446.63189697265625, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -730.7470703125, "loss": 1.1069, "margin_dpo/margin_mean": 240.79074096679688, "margin_dpo/margin_std": 365.572021484375, "step": 383 }, { "KL/chosen_KL_mean": -396.1028137207031, "KL/mean": -495.835693359375, "KL/rejected_KL_mean": -595.568603515625, "KL/std": 259.7392578125, "epoch": 0.5638766519823789, "fcm_dpo/beta": 0.0017325121443718672, "fcm_dpo/delta": 0.055945903062820435, "fcm_dpo/margin": 199.46575927734375, "fcm_dpo/q_t": 0.418599933385849, "grad_norm": 25.613414764404297, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.5003800392150879, "logits/rejected": -0.4956563413143158, "logps/chosen": -448.14892578125, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -681.3294677734375, "loss": 1.1132, "margin_dpo/margin_mean": 199.46575927734375, "margin_dpo/margin_std": 242.4035186767578, "step": 384 }, { "KL/chosen_KL_mean": -381.33917236328125, "KL/mean": -502.6947021484375, "KL/rejected_KL_mean": -624.0501708984375, "KL/std": 228.0861053466797, "epoch": 0.5653450807635829, "fcm_dpo/beta": 0.0017262771725654602, "fcm_dpo/delta": -0.019818957895040512, "fcm_dpo/margin": 242.7109832763672, "fcm_dpo/q_t": 0.40087053179740906, "grad_norm": 34.80295181274414, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.5185421705245972, "logits/rejected": -0.4977598786354065, "logps/chosen": -446.891357421875, "logps/ref_chosen": -65.55215454101562, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -701.8780517578125, "loss": 1.054, "margin_dpo/margin_mean": 242.71096801757812, "margin_dpo/margin_std": 247.6511993408203, "step": 385 }, { "KL/chosen_KL_mean": -391.0846862792969, "KL/mean": -520.10107421875, "KL/rejected_KL_mean": -649.117431640625, "KL/std": 283.81964111328125, "epoch": 0.566813509544787, "fcm_dpo/beta": 0.0017148086335510015, "fcm_dpo/delta": -0.044420357793569565, "fcm_dpo/margin": 258.03277587890625, "fcm_dpo/q_t": 0.39896559715270996, "grad_norm": 34.90454864501953, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.5139098167419434, "logits/rejected": -0.5114161968231201, "logps/chosen": -449.3065490722656, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -741.4448852539062, "loss": 1.0673, "margin_dpo/margin_mean": 258.03277587890625, "margin_dpo/margin_std": 333.954833984375, "step": 386 }, { "KL/chosen_KL_mean": -408.6097412109375, "KL/mean": -517.918701171875, "KL/rejected_KL_mean": -627.2276611328125, "KL/std": 263.1587219238281, "epoch": 0.5682819383259912, "fcm_dpo/beta": 0.0017092199996113777, "fcm_dpo/delta": 0.027111487463116646, "fcm_dpo/margin": 218.617919921875, "fcm_dpo/q_t": 0.412253201007843, "grad_norm": 33.19843292236328, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.5541732311248779, "logits/rejected": -0.5390141010284424, "logps/chosen": -475.0291748046875, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -719.3968505859375, "loss": 1.0981, "margin_dpo/margin_mean": 218.617919921875, "margin_dpo/margin_std": 265.804443359375, "step": 387 }, { "KL/chosen_KL_mean": -377.7342224121094, "KL/mean": -525.062744140625, "KL/rejected_KL_mean": -672.391357421875, "KL/std": 302.99163818359375, "epoch": 0.5697503671071953, "fcm_dpo/beta": 0.0016880175098776817, "fcm_dpo/delta": -0.10291901975870132, "fcm_dpo/margin": 294.6571350097656, "fcm_dpo/q_t": 0.387167364358902, "grad_norm": 31.615806579589844, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.49094468355178833, "logits/rejected": -0.5020414590835571, "logps/chosen": -427.8636779785156, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -776.8244018554688, "loss": 1.0239, "margin_dpo/margin_mean": 294.65716552734375, "margin_dpo/margin_std": 341.442138671875, "step": 388 }, { "KL/chosen_KL_mean": -403.025146484375, "KL/mean": -526.2021484375, "KL/rejected_KL_mean": -649.379150390625, "KL/std": 294.0400390625, "epoch": 0.5712187958883994, "fcm_dpo/beta": 0.001677666325122118, "fcm_dpo/delta": -0.013942467980086803, "fcm_dpo/margin": 246.35403442382812, "fcm_dpo/q_t": 0.4049556255340576, "grad_norm": 31.12624740600586, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.5525184273719788, "logits/rejected": -0.5475004315376282, "logps/chosen": -460.9317626953125, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -727.293701171875, "loss": 1.083, "margin_dpo/margin_mean": 246.35403442382812, "margin_dpo/margin_std": 321.620361328125, "step": 389 }, { "KL/chosen_KL_mean": -382.27349853515625, "KL/mean": -502.8726806640625, "KL/rejected_KL_mean": -623.4718017578125, "KL/std": 278.3486022949219, "epoch": 0.5726872246696035, "fcm_dpo/beta": 0.0016733764205127954, "fcm_dpo/delta": -0.00393829308450222, "fcm_dpo/margin": 241.19834899902344, "fcm_dpo/q_t": 0.4098580479621887, "grad_norm": 22.825424194335938, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.4611801207065582, "logits/rejected": -0.45201367139816284, "logps/chosen": -431.4993896484375, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -709.0, "loss": 1.1036, "margin_dpo/margin_mean": 241.19834899902344, "margin_dpo/margin_std": 353.98956298828125, "step": 390 }, { "KL/chosen_KL_mean": -373.9214782714844, "KL/mean": -445.02374267578125, "KL/rejected_KL_mean": -516.1259765625, "KL/std": 262.6861572265625, "epoch": 0.5741556534508077, "fcm_dpo/beta": 0.0017185378819704056, "fcm_dpo/delta": 0.1594843566417694, "fcm_dpo/margin": 142.20448303222656, "fcm_dpo/q_t": 0.44371968507766724, "grad_norm": 40.434242248535156, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.47930610179901123, "logits/rejected": -0.4707353711128235, "logps/chosen": -438.2511291503906, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -602.8641357421875, "loss": 1.2116, "margin_dpo/margin_mean": 142.20448303222656, "margin_dpo/margin_std": 274.74224853515625, "step": 391 }, { "KL/chosen_KL_mean": -309.6976318359375, "KL/mean": -447.2847900390625, "KL/rejected_KL_mean": -584.8719482421875, "KL/std": 268.35784912109375, "epoch": 0.5756240822320118, "fcm_dpo/beta": 0.0017193170497193933, "fcm_dpo/delta": -0.076762355864048, "fcm_dpo/margin": 275.17437744140625, "fcm_dpo/q_t": 0.389517605304718, "grad_norm": 23.396804809570312, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.4492862820625305, "logits/rejected": -0.44523316621780396, "logps/chosen": -363.20159912109375, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34584045410156, "logps/rejected": -687.2177734375, "loss": 1.0162, "margin_dpo/margin_mean": 275.17437744140625, "margin_dpo/margin_std": 261.0281066894531, "step": 392 }, { "KL/chosen_KL_mean": -305.1656494140625, "KL/mean": -412.20819091796875, "KL/rejected_KL_mean": -519.2507934570312, "KL/std": 271.5501708984375, "epoch": 0.5770925110132159, "fcm_dpo/beta": 0.001711581600829959, "fcm_dpo/delta": 0.03485105559229851, "fcm_dpo/margin": 214.08511352539062, "fcm_dpo/q_t": 0.4182642698287964, "grad_norm": 21.774555206298828, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.44074547290802, "logits/rejected": -0.4389492869377136, "logps/chosen": -351.63958740234375, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -591.2196044921875, "loss": 1.1278, "margin_dpo/margin_mean": 214.0851287841797, "margin_dpo/margin_std": 337.0278015136719, "step": 393 }, { "KL/chosen_KL_mean": -334.2687683105469, "KL/mean": -451.5635681152344, "KL/rejected_KL_mean": -568.8583984375, "KL/std": 285.47515869140625, "epoch": 0.57856093979442, "fcm_dpo/beta": 0.0017150124767795205, "fcm_dpo/delta": -0.0024520214647054672, "fcm_dpo/margin": 234.589599609375, "fcm_dpo/q_t": 0.4073120653629303, "grad_norm": 26.771778106689453, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.44740962982177734, "logits/rejected": -0.4476820230484009, "logps/chosen": -387.1803283691406, "logps/ref_chosen": -52.91154861450195, "logps/ref_rejected": -90.8226318359375, "logps/rejected": -659.6810302734375, "loss": 1.0818, "margin_dpo/margin_mean": 234.589599609375, "margin_dpo/margin_std": 290.7215576171875, "step": 394 }, { "KL/chosen_KL_mean": -330.6572265625, "KL/mean": -454.97314453125, "KL/rejected_KL_mean": -579.2891845703125, "KL/std": 274.084228515625, "epoch": 0.580029368575624, "fcm_dpo/beta": 0.001704660477116704, "fcm_dpo/delta": -0.025200337171554565, "fcm_dpo/margin": 248.63189697265625, "fcm_dpo/q_t": 0.4029311537742615, "grad_norm": 27.568639755249023, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.5028017163276672, "logits/rejected": -0.49913692474365234, "logps/chosen": -393.2033386230469, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -663.07177734375, "loss": 1.0744, "margin_dpo/margin_mean": 248.63189697265625, "margin_dpo/margin_std": 315.91448974609375, "step": 395 }, { "KL/chosen_KL_mean": -339.6167297363281, "KL/mean": -453.63134765625, "KL/rejected_KL_mean": -567.64599609375, "KL/std": 269.79595947265625, "epoch": 0.5814977973568282, "fcm_dpo/beta": 0.0017089219763875008, "fcm_dpo/delta": 0.010719288140535355, "fcm_dpo/margin": 228.02923583984375, "fcm_dpo/q_t": 0.4084508419036865, "grad_norm": 20.283159255981445, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.4966447353363037, "logits/rejected": -0.4823087155818939, "logps/chosen": -408.6126708984375, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -656.2926025390625, "loss": 1.0763, "margin_dpo/margin_mean": 228.02923583984375, "margin_dpo/margin_std": 249.16293334960938, "step": 396 }, { "KL/chosen_KL_mean": -331.72991943359375, "KL/mean": -472.43841552734375, "KL/rejected_KL_mean": -613.1468505859375, "KL/std": 266.5982360839844, "epoch": 0.5829662261380323, "fcm_dpo/beta": 0.0016858780290931463, "fcm_dpo/delta": -0.07868388295173645, "fcm_dpo/margin": 281.4169921875, "fcm_dpo/q_t": 0.38912802934646606, "grad_norm": 31.900684356689453, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.45149320363998413, "logits/rejected": -0.45756763219833374, "logps/chosen": -393.007080078125, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -716.2630004882812, "loss": 1.0206, "margin_dpo/margin_mean": 281.4169921875, "margin_dpo/margin_std": 282.707275390625, "step": 397 }, { "KL/chosen_KL_mean": -356.6610412597656, "KL/mean": -489.5479736328125, "KL/rejected_KL_mean": -622.4349365234375, "KL/std": 285.5445556640625, "epoch": 0.5844346549192364, "fcm_dpo/beta": 0.0016720399726182222, "fcm_dpo/delta": -0.04650367423892021, "fcm_dpo/margin": 265.7738342285156, "fcm_dpo/q_t": 0.39866209030151367, "grad_norm": 24.273387908935547, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.5021190643310547, "logits/rejected": -0.4984011650085449, "logps/chosen": -424.8125915527344, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -730.95849609375, "loss": 1.0615, "margin_dpo/margin_mean": 265.7738342285156, "margin_dpo/margin_std": 334.2044677734375, "step": 398 }, { "KL/chosen_KL_mean": -323.0514221191406, "KL/mean": -437.18963623046875, "KL/rejected_KL_mean": -551.327880859375, "KL/std": 256.7247314453125, "epoch": 0.5859030837004405, "fcm_dpo/beta": 0.0016683805733919144, "fcm_dpo/delta": 0.019710222259163857, "fcm_dpo/margin": 228.27645874023438, "fcm_dpo/q_t": 0.4121752977371216, "grad_norm": 26.480926513671875, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.45461803674697876, "logits/rejected": -0.4152987003326416, "logps/chosen": -383.94122314453125, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.965576171875, "logps/rejected": -629.29345703125, "loss": 1.1112, "margin_dpo/margin_mean": 228.27645874023438, "margin_dpo/margin_std": 323.93865966796875, "step": 399 }, { "KL/chosen_KL_mean": -316.2700500488281, "KL/mean": -488.2925720214844, "KL/rejected_KL_mean": -660.3151245117188, "KL/std": 281.55780029296875, "epoch": 0.5873715124816447, "fcm_dpo/beta": 0.0016302517615258694, "fcm_dpo/delta": -0.17095670104026794, "fcm_dpo/margin": 344.0451354980469, "fcm_dpo/q_t": 0.3687145709991455, "grad_norm": 19.69918441772461, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.43350642919540405, "logits/rejected": -0.4296361804008484, "logps/chosen": -379.91363525390625, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -765.5678100585938, "loss": 0.9622, "margin_dpo/margin_mean": 344.04510498046875, "margin_dpo/margin_std": 313.25408935546875, "step": 400 }, { "KL/chosen_KL_mean": -363.84393310546875, "KL/mean": -460.88262939453125, "KL/rejected_KL_mean": -557.92138671875, "KL/std": 290.2945861816406, "epoch": 0.5888399412628488, "fcm_dpo/beta": 0.00163645064458251, "fcm_dpo/delta": 0.08519099652767181, "fcm_dpo/margin": 194.07742309570312, "fcm_dpo/q_t": 0.42967379093170166, "grad_norm": 24.62172508239746, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.4492917060852051, "logits/rejected": -0.44258540868759155, "logps/chosen": -421.0069580078125, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -641.7138671875, "loss": 1.1887, "margin_dpo/margin_mean": 194.07742309570312, "margin_dpo/margin_std": 388.0357666015625, "step": 401 }, { "KL/chosen_KL_mean": -277.44989013671875, "KL/mean": -456.07666015625, "KL/rejected_KL_mean": -634.7034301757812, "KL/std": 316.1929016113281, "epoch": 0.5903083700440529, "fcm_dpo/beta": 0.0016029919497668743, "fcm_dpo/delta": -0.18321484327316284, "fcm_dpo/margin": 357.2535400390625, "fcm_dpo/q_t": 0.36773985624313354, "grad_norm": 21.816125869750977, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.4676339626312256, "logits/rejected": -0.4571627974510193, "logps/chosen": -328.19024658203125, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -715.74951171875, "loss": 0.9533, "margin_dpo/margin_mean": 357.2535400390625, "margin_dpo/margin_std": 325.81842041015625, "step": 402 }, { "KL/chosen_KL_mean": -314.04962158203125, "KL/mean": -449.58245849609375, "KL/rejected_KL_mean": -585.115234375, "KL/std": 293.69842529296875, "epoch": 0.591776798825257, "fcm_dpo/beta": 0.0015804520808160305, "fcm_dpo/delta": -0.029699519276618958, "fcm_dpo/margin": 271.06561279296875, "fcm_dpo/q_t": 0.4006895124912262, "grad_norm": 27.178544998168945, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.4410826563835144, "logits/rejected": -0.4390965700149536, "logps/chosen": -361.19696044921875, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -662.3818359375, "loss": 1.0603, "margin_dpo/margin_mean": 271.06561279296875, "margin_dpo/margin_std": 310.4146423339844, "step": 403 }, { "KL/chosen_KL_mean": -349.8211975097656, "KL/mean": -481.5411376953125, "KL/rejected_KL_mean": -613.2611083984375, "KL/std": 281.17803955078125, "epoch": 0.593245227606461, "fcm_dpo/beta": 0.0015735691413283348, "fcm_dpo/delta": -0.015212337486445904, "fcm_dpo/margin": 263.43988037109375, "fcm_dpo/q_t": 0.40503576397895813, "grad_norm": 29.41777992248535, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.46026161313056946, "logits/rejected": -0.45394134521484375, "logps/chosen": -397.69647216796875, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -690.4160766601562, "loss": 1.0903, "margin_dpo/margin_mean": 263.43988037109375, "margin_dpo/margin_std": 362.440673828125, "step": 404 }, { "KL/chosen_KL_mean": -395.85577392578125, "KL/mean": -504.7757873535156, "KL/rejected_KL_mean": -613.69580078125, "KL/std": 314.71160888671875, "epoch": 0.5947136563876652, "fcm_dpo/beta": 0.0015670396387577057, "fcm_dpo/delta": -0.04510403424501419, "fcm_dpo/margin": 217.84002685546875, "fcm_dpo/q_t": 0.4239059090614319, "grad_norm": 35.01424789428711, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.4418267011642456, "logits/rejected": -0.43295902013778687, "logps/chosen": -461.0186767578125, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -700.882568359375, "loss": 1.1669, "margin_dpo/margin_mean": 217.84002685546875, "margin_dpo/margin_std": 392.323486328125, "step": 405 }, { "KL/chosen_KL_mean": -354.01934814453125, "KL/mean": -493.54443359375, "KL/rejected_KL_mean": -633.069580078125, "KL/std": 308.9290771484375, "epoch": 0.5961820851688693, "fcm_dpo/beta": 0.0015584398061037064, "fcm_dpo/delta": -0.03671257197856903, "fcm_dpo/margin": 279.05023193359375, "fcm_dpo/q_t": 0.40036964416503906, "grad_norm": 30.385452270507812, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.46909600496292114, "logits/rejected": -0.47889643907546997, "logps/chosen": -403.7601623535156, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -725.148193359375, "loss": 1.0625, "margin_dpo/margin_mean": 279.05023193359375, "margin_dpo/margin_std": 336.1007995605469, "step": 406 }, { "KL/chosen_KL_mean": -377.31060791015625, "KL/mean": -466.502685546875, "KL/rejected_KL_mean": -555.6947631835938, "KL/std": 247.73455810546875, "epoch": 0.5976505139500734, "fcm_dpo/beta": 0.001549946959130466, "fcm_dpo/delta": 0.008951360359787941, "fcm_dpo/margin": 178.3841552734375, "fcm_dpo/q_t": 0.4347270131111145, "grad_norm": 31.04163360595703, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.4896223545074463, "logits/rejected": -0.46283426880836487, "logps/chosen": -433.6413269042969, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.51209259033203, "logps/rejected": -633.2068481445312, "loss": 1.1926, "margin_dpo/margin_mean": 178.38414001464844, "margin_dpo/margin_std": 325.7279968261719, "step": 407 }, { "KL/chosen_KL_mean": -381.83782958984375, "KL/mean": -482.71337890625, "KL/rejected_KL_mean": -583.5889892578125, "KL/std": 239.08734130859375, "epoch": 0.5991189427312775, "fcm_dpo/beta": 0.0015726467827335, "fcm_dpo/delta": 0.0853329598903656, "fcm_dpo/margin": 201.75115966796875, "fcm_dpo/q_t": 0.42552345991134644, "grad_norm": 24.999143600463867, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.4955484867095947, "logits/rejected": -0.47467708587646484, "logps/chosen": -451.62713623046875, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -673.6859130859375, "loss": 1.1374, "margin_dpo/margin_mean": 201.75115966796875, "margin_dpo/margin_std": 270.3141784667969, "step": 408 }, { "KL/chosen_KL_mean": -364.642578125, "KL/mean": -463.87872314453125, "KL/rejected_KL_mean": -563.1149291992188, "KL/std": 257.52569580078125, "epoch": 0.6005873715124816, "fcm_dpo/beta": 0.0015998759772628546, "fcm_dpo/delta": 0.0850619375705719, "fcm_dpo/margin": 198.47232055664062, "fcm_dpo/q_t": 0.4262683391571045, "grad_norm": 29.315763473510742, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.4769352376461029, "logits/rejected": -0.4611578583717346, "logps/chosen": -431.96002197265625, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -648.0191650390625, "loss": 1.145, "margin_dpo/margin_mean": 198.47232055664062, "margin_dpo/margin_std": 289.51141357421875, "step": 409 }, { "KL/chosen_KL_mean": -339.183349609375, "KL/mean": -455.3343505859375, "KL/rejected_KL_mean": -571.4853515625, "KL/std": 257.6640319824219, "epoch": 0.6020558002936858, "fcm_dpo/beta": 0.0016098904889076948, "fcm_dpo/delta": 0.027035847306251526, "fcm_dpo/margin": 232.30197143554688, "fcm_dpo/q_t": 0.41175514459609985, "grad_norm": 26.693159103393555, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.5109409093856812, "logits/rejected": -0.5129973888397217, "logps/chosen": -390.648681640625, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -654.684326171875, "loss": 1.0976, "margin_dpo/margin_mean": 232.30198669433594, "margin_dpo/margin_std": 286.159912109375, "step": 410 }, { "KL/chosen_KL_mean": -354.3096923828125, "KL/mean": -474.0059814453125, "KL/rejected_KL_mean": -593.7022705078125, "KL/std": 283.52850341796875, "epoch": 0.6035242290748899, "fcm_dpo/beta": 0.0016047862591221929, "fcm_dpo/delta": 0.015297271311283112, "fcm_dpo/margin": 239.392578125, "fcm_dpo/q_t": 0.41245776414871216, "grad_norm": 24.12554931640625, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.5120722055435181, "logits/rejected": -0.520818293094635, "logps/chosen": -406.6169738769531, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -674.397216796875, "loss": 1.1053, "margin_dpo/margin_mean": 239.392578125, "margin_dpo/margin_std": 321.0933532714844, "step": 411 }, { "KL/chosen_KL_mean": -361.31805419921875, "KL/mean": -484.0078125, "KL/rejected_KL_mean": -606.6976318359375, "KL/std": 272.78863525390625, "epoch": 0.604992657856094, "fcm_dpo/beta": 0.0016203692648559809, "fcm_dpo/delta": 0.0024842238053679466, "fcm_dpo/margin": 245.37954711914062, "fcm_dpo/q_t": 0.40797942876815796, "grad_norm": 33.52204132080078, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.5373940467834473, "logits/rejected": -0.5674378275871277, "logps/chosen": -414.4621887207031, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.0608139038086, "logps/rejected": -706.7584228515625, "loss": 1.0918, "margin_dpo/margin_mean": 245.3795623779297, "margin_dpo/margin_std": 322.3990173339844, "step": 412 }, { "KL/chosen_KL_mean": -378.59344482421875, "KL/mean": -501.33282470703125, "KL/rejected_KL_mean": -624.072265625, "KL/std": 269.05731201171875, "epoch": 0.6064610866372981, "fcm_dpo/beta": 0.001623795717023313, "fcm_dpo/delta": 0.0013288334012031555, "fcm_dpo/margin": 245.4788055419922, "fcm_dpo/q_t": 0.4057984948158264, "grad_norm": 29.867643356323242, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.5124090909957886, "logits/rejected": -0.5228337049484253, "logps/chosen": -440.1754150390625, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -723.545654296875, "loss": 1.081, "margin_dpo/margin_mean": 245.4788055419922, "margin_dpo/margin_std": 293.564697265625, "step": 413 }, { "KL/chosen_KL_mean": -343.92242431640625, "KL/mean": -475.8585510253906, "KL/rejected_KL_mean": -607.794677734375, "KL/std": 259.8440856933594, "epoch": 0.6079295154185022, "fcm_dpo/beta": 0.0016230610199272633, "fcm_dpo/delta": -0.030179578810930252, "fcm_dpo/margin": 263.87225341796875, "fcm_dpo/q_t": 0.3998476266860962, "grad_norm": 43.11936950683594, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.45165306329727173, "logits/rejected": -0.4454384446144104, "logps/chosen": -390.5539245605469, "logps/ref_chosen": -46.63148498535156, "logps/ref_rejected": -87.64653015136719, "logps/rejected": -695.441162109375, "loss": 1.0618, "margin_dpo/margin_mean": 263.87225341796875, "margin_dpo/margin_std": 297.9873046875, "step": 414 }, { "KL/chosen_KL_mean": -382.28839111328125, "KL/mean": -484.4581604003906, "KL/rejected_KL_mean": -586.6279296875, "KL/std": 261.7178649902344, "epoch": 0.6093979441997063, "fcm_dpo/beta": 0.001621844945475459, "fcm_dpo/delta": 0.0709480568766594, "fcm_dpo/margin": 204.3395233154297, "fcm_dpo/q_t": 0.42238879203796387, "grad_norm": 27.217483520507812, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.5291392207145691, "logits/rejected": -0.5265468955039978, "logps/chosen": -460.90667724609375, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -687.10546875, "loss": 1.1367, "margin_dpo/margin_mean": 204.3395233154297, "margin_dpo/margin_std": 292.1011962890625, "step": 415 }, { "KL/chosen_KL_mean": -353.57177734375, "KL/mean": -510.877197265625, "KL/rejected_KL_mean": -668.1825561523438, "KL/std": 296.62371826171875, "epoch": 0.6108663729809104, "fcm_dpo/beta": 0.0016112902667373419, "fcm_dpo/delta": -0.11248860508203506, "fcm_dpo/margin": 314.61083984375, "fcm_dpo/q_t": 0.3814007043838501, "grad_norm": 35.17634582519531, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.5343978404998779, "logits/rejected": -0.5423879623413086, "logps/chosen": -411.85089111328125, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -758.7512817382812, "loss": 0.9928, "margin_dpo/margin_mean": 314.61077880859375, "margin_dpo/margin_std": 286.4470520019531, "step": 416 }, { "KL/chosen_KL_mean": -357.342041015625, "KL/mean": -489.947265625, "KL/rejected_KL_mean": -622.552490234375, "KL/std": 266.30548095703125, "epoch": 0.6123348017621145, "fcm_dpo/beta": 0.0015942594036459923, "fcm_dpo/delta": -0.023848645389080048, "fcm_dpo/margin": 265.21044921875, "fcm_dpo/q_t": 0.4002404808998108, "grad_norm": 30.77235984802246, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.5373271703720093, "logits/rejected": -0.5231212377548218, "logps/chosen": -407.540771484375, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -690.704345703125, "loss": 1.0501, "margin_dpo/margin_mean": 265.21044921875, "margin_dpo/margin_std": 265.76727294921875, "step": 417 }, { "KL/chosen_KL_mean": -382.75750732421875, "KL/mean": -518.67626953125, "KL/rejected_KL_mean": -654.5950317382812, "KL/std": 304.6961669921875, "epoch": 0.6138032305433186, "fcm_dpo/beta": 0.001592871267348528, "fcm_dpo/delta": -0.03536780923604965, "fcm_dpo/margin": 271.8375244140625, "fcm_dpo/q_t": 0.4023270905017853, "grad_norm": 22.210895538330078, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6004199385643005, "logits/rejected": -0.5827990174293518, "logps/chosen": -464.7359619140625, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -759.2864990234375, "loss": 1.0779, "margin_dpo/margin_mean": 271.8375244140625, "margin_dpo/margin_std": 355.95672607421875, "step": 418 }, { "KL/chosen_KL_mean": -359.854248046875, "KL/mean": -510.2494201660156, "KL/rejected_KL_mean": -660.6445922851562, "KL/std": 277.7416076660156, "epoch": 0.6152716593245228, "fcm_dpo/beta": 0.001561171025969088, "fcm_dpo/delta": -0.07300984114408493, "fcm_dpo/margin": 300.7904052734375, "fcm_dpo/q_t": 0.39019423723220825, "grad_norm": 28.61322784423828, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.5227010846138, "logits/rejected": -0.5222500562667847, "logps/chosen": -412.8028869628906, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -752.2276611328125, "loss": 1.0314, "margin_dpo/margin_mean": 300.7904052734375, "margin_dpo/margin_std": 325.02825927734375, "step": 419 }, { "KL/chosen_KL_mean": -452.51690673828125, "KL/mean": -546.392333984375, "KL/rejected_KL_mean": -640.2677001953125, "KL/std": 296.0887145996094, "epoch": 0.6167400881057269, "fcm_dpo/beta": 0.0015826968010514975, "fcm_dpo/delta": 0.1057576984167099, "fcm_dpo/margin": 187.75083923339844, "fcm_dpo/q_t": 0.4339344799518585, "grad_norm": 54.64779281616211, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.5721093416213989, "logits/rejected": -0.5451463460922241, "logps/chosen": -530.2868041992188, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -709.5875244140625, "loss": 1.2047, "margin_dpo/margin_mean": 187.75082397460938, "margin_dpo/margin_std": 400.8773193359375, "step": 420 }, { "KL/chosen_KL_mean": -379.40533447265625, "KL/mean": -485.106201171875, "KL/rejected_KL_mean": -590.8070068359375, "KL/std": 287.34088134765625, "epoch": 0.618208516886931, "fcm_dpo/beta": 0.0016100335633382201, "fcm_dpo/delta": 0.0610845573246479, "fcm_dpo/margin": 211.4017333984375, "fcm_dpo/q_t": 0.4196345806121826, "grad_norm": 27.371572494506836, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.5819834470748901, "logits/rejected": -0.5714644193649292, "logps/chosen": -433.17120361328125, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -680.0885009765625, "loss": 1.1373, "margin_dpo/margin_mean": 211.4017333984375, "margin_dpo/margin_std": 309.2540283203125, "step": 421 }, { "KL/chosen_KL_mean": -423.903076171875, "KL/mean": -544.1499633789062, "KL/rejected_KL_mean": -664.3968505859375, "KL/std": 294.40728759765625, "epoch": 0.6196769456681351, "fcm_dpo/beta": 0.0016107236733660102, "fcm_dpo/delta": 0.013142132200300694, "fcm_dpo/margin": 240.49374389648438, "fcm_dpo/q_t": 0.4098985493183136, "grad_norm": 33.85724639892578, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.6141137480735779, "logits/rejected": -0.6188079118728638, "logps/chosen": -492.536865234375, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -752.2603759765625, "loss": 1.1085, "margin_dpo/margin_mean": 240.49374389648438, "margin_dpo/margin_std": 345.5645446777344, "step": 422 }, { "KL/chosen_KL_mean": -404.6935119628906, "KL/mean": -534.154052734375, "KL/rejected_KL_mean": -663.6144409179688, "KL/std": 282.32171630859375, "epoch": 0.6211453744493393, "fcm_dpo/beta": 0.0016070720739662647, "fcm_dpo/delta": -0.016826242208480835, "fcm_dpo/margin": 258.9209899902344, "fcm_dpo/q_t": 0.4035068154335022, "grad_norm": 31.080442428588867, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.5942707061767578, "logits/rejected": -0.6040855646133423, "logps/chosen": -459.6838073730469, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -749.9210205078125, "loss": 1.072, "margin_dpo/margin_mean": 258.9209899902344, "margin_dpo/margin_std": 313.36004638671875, "step": 423 }, { "KL/chosen_KL_mean": -365.32806396484375, "KL/mean": -491.30462646484375, "KL/rejected_KL_mean": -617.28125, "KL/std": 282.08697509765625, "epoch": 0.6226138032305433, "fcm_dpo/beta": 0.0015995125286281109, "fcm_dpo/delta": -0.003478415310382843, "fcm_dpo/margin": 251.95314025878906, "fcm_dpo/q_t": 0.40843045711517334, "grad_norm": 33.83623123168945, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.577785313129425, "logits/rejected": -0.555591881275177, "logps/chosen": -421.3399658203125, "logps/ref_chosen": -56.01192092895508, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -683.7601928710938, "loss": 1.0919, "margin_dpo/margin_mean": 251.95315551757812, "margin_dpo/margin_std": 339.11346435546875, "step": 424 }, { "KL/chosen_KL_mean": -404.3356018066406, "KL/mean": -499.7007141113281, "KL/rejected_KL_mean": -595.0657958984375, "KL/std": 263.603759765625, "epoch": 0.6240822320117474, "fcm_dpo/beta": 0.001631980761885643, "fcm_dpo/delta": 0.09120546281337738, "fcm_dpo/margin": 190.730224609375, "fcm_dpo/q_t": 0.42644861340522766, "grad_norm": 31.754074096679688, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.5840317606925964, "logits/rejected": -0.5841487646102905, "logps/chosen": -451.20458984375, "logps/ref_chosen": -46.86899948120117, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -690.9912719726562, "loss": 1.1673, "margin_dpo/margin_mean": 190.730224609375, "margin_dpo/margin_std": 325.1813049316406, "step": 425 }, { "KL/chosen_KL_mean": -375.7733154296875, "KL/mean": -498.88397216796875, "KL/rejected_KL_mean": -621.99462890625, "KL/std": 270.572509765625, "epoch": 0.6255506607929515, "fcm_dpo/beta": 0.001641254872083664, "fcm_dpo/delta": -0.004648171365261078, "fcm_dpo/margin": 246.22134399414062, "fcm_dpo/q_t": 0.40564680099487305, "grad_norm": 29.338314056396484, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.5852512121200562, "logits/rejected": -0.5579032897949219, "logps/chosen": -452.35687255859375, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -703.26123046875, "loss": 1.0805, "margin_dpo/margin_mean": 246.22132873535156, "margin_dpo/margin_std": 294.9763488769531, "step": 426 }, { "KL/chosen_KL_mean": -339.8883361816406, "KL/mean": -433.8622741699219, "KL/rejected_KL_mean": -527.8362426757812, "KL/std": 233.69332885742188, "epoch": 0.6270190895741556, "fcm_dpo/beta": 0.0016534591559320688, "fcm_dpo/delta": 0.09219777584075928, "fcm_dpo/margin": 187.9479217529297, "fcm_dpo/q_t": 0.4280179440975189, "grad_norm": 25.174968719482422, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6304788589477539, "logits/rejected": -0.6161661744117737, "logps/chosen": -404.7421875, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.5660171508789, "logps/rejected": -606.4022216796875, "loss": 1.1492, "margin_dpo/margin_mean": 187.94793701171875, "margin_dpo/margin_std": 274.6386413574219, "step": 427 }, { "KL/chosen_KL_mean": -421.1455383300781, "KL/mean": -548.4664306640625, "KL/rejected_KL_mean": -675.787353515625, "KL/std": 309.02484130859375, "epoch": 0.6284875183553598, "fcm_dpo/beta": 0.0016524514649063349, "fcm_dpo/delta": -0.022071223706007004, "fcm_dpo/margin": 254.64175415039062, "fcm_dpo/q_t": 0.4040681719779968, "grad_norm": 36.647064208984375, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.5900696516036987, "logits/rejected": -0.5930036306381226, "logps/chosen": -483.7822265625, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28181457519531, "logps/rejected": -779.0691528320312, "loss": 1.1009, "margin_dpo/margin_mean": 254.64175415039062, "margin_dpo/margin_std": 378.00115966796875, "step": 428 }, { "KL/chosen_KL_mean": -412.0851135253906, "KL/mean": -507.26202392578125, "KL/rejected_KL_mean": -602.43896484375, "KL/std": 265.12261962890625, "epoch": 0.6299559471365639, "fcm_dpo/beta": 0.0016535113099962473, "fcm_dpo/delta": -0.020081549882888794, "fcm_dpo/margin": 190.3538818359375, "fcm_dpo/q_t": 0.42727112770080566, "grad_norm": 34.380889892578125, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.5973831415176392, "logits/rejected": -0.5831949710845947, "logps/chosen": -493.3191223144531, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -694.23388671875, "loss": 1.171, "margin_dpo/margin_mean": 190.3538818359375, "margin_dpo/margin_std": 328.217041015625, "step": 429 }, { "KL/chosen_KL_mean": -324.106689453125, "KL/mean": -425.34185791015625, "KL/rejected_KL_mean": -526.5770263671875, "KL/std": 241.1708221435547, "epoch": 0.631424375917768, "fcm_dpo/beta": 0.0016685712616890669, "fcm_dpo/delta": 0.06419498473405838, "fcm_dpo/margin": 202.4704132080078, "fcm_dpo/q_t": 0.42112964391708374, "grad_norm": 28.673410415649414, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5664153099060059, "logits/rejected": -0.562206506729126, "logps/chosen": -385.0269775390625, "logps/ref_chosen": -60.920326232910156, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -630.9998779296875, "loss": 1.1216, "margin_dpo/margin_mean": 202.4704132080078, "margin_dpo/margin_std": 261.1304016113281, "step": 430 }, { "KL/chosen_KL_mean": -300.27374267578125, "KL/mean": -448.88720703125, "KL/rejected_KL_mean": -597.5006103515625, "KL/std": 266.6134338378906, "epoch": 0.6328928046989721, "fcm_dpo/beta": 0.001646057702600956, "fcm_dpo/delta": -0.09401103109121323, "fcm_dpo/margin": 297.22686767578125, "fcm_dpo/q_t": 0.38535940647125244, "grad_norm": 27.440969467163086, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.53639817237854, "logits/rejected": -0.5400429964065552, "logps/chosen": -357.62249755859375, "logps/ref_chosen": -57.34874725341797, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -690.3408203125, "loss": 1.01, "margin_dpo/margin_mean": 297.2269287109375, "margin_dpo/margin_std": 285.8399658203125, "step": 431 }, { "KL/chosen_KL_mean": -309.4350891113281, "KL/mean": -450.547119140625, "KL/rejected_KL_mean": -591.6591186523438, "KL/std": 265.4091796875, "epoch": 0.6343612334801763, "fcm_dpo/beta": 0.0016348997596651316, "fcm_dpo/delta": -0.06477323174476624, "fcm_dpo/margin": 282.22406005859375, "fcm_dpo/q_t": 0.3924236297607422, "grad_norm": 43.64820098876953, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.502853274345398, "logits/rejected": -0.4937119781970978, "logps/chosen": -380.5098876953125, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -690.2386474609375, "loss": 1.0284, "margin_dpo/margin_mean": 282.2240295410156, "margin_dpo/margin_std": 273.5948181152344, "step": 432 }, { "KL/chosen_KL_mean": -404.2077331542969, "KL/mean": -498.65142822265625, "KL/rejected_KL_mean": -593.0950927734375, "KL/std": 266.9083251953125, "epoch": 0.6358296622613803, "fcm_dpo/beta": 0.0016442297492176294, "fcm_dpo/delta": 0.09210029989480972, "fcm_dpo/margin": 188.88734436035156, "fcm_dpo/q_t": 0.42625609040260315, "grad_norm": 37.26985549926758, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.5648001432418823, "logits/rejected": -0.5520174503326416, "logps/chosen": -462.4809265136719, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -689.0460205078125, "loss": 1.1645, "margin_dpo/margin_mean": 188.88734436035156, "margin_dpo/margin_std": 314.8270263671875, "step": 433 }, { "KL/chosen_KL_mean": -358.794921875, "KL/mean": -477.2853088378906, "KL/rejected_KL_mean": -595.7756958007812, "KL/std": 286.1195068359375, "epoch": 0.6372980910425844, "fcm_dpo/beta": 0.001646613236516714, "fcm_dpo/delta": 0.010008249431848526, "fcm_dpo/margin": 236.9807586669922, "fcm_dpo/q_t": 0.4145994186401367, "grad_norm": 22.998327255249023, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.5716849565505981, "logits/rejected": -0.5764377117156982, "logps/chosen": -420.76861572265625, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -674.2742919921875, "loss": 1.1083, "margin_dpo/margin_mean": 236.98074340820312, "margin_dpo/margin_std": 357.0912780761719, "step": 434 }, { "KL/chosen_KL_mean": -329.38238525390625, "KL/mean": -454.7186279296875, "KL/rejected_KL_mean": -580.0548706054688, "KL/std": 273.7630615234375, "epoch": 0.6387665198237885, "fcm_dpo/beta": 0.0016480737831443548, "fcm_dpo/delta": -0.01371398288756609, "fcm_dpo/margin": 250.6724853515625, "fcm_dpo/q_t": 0.40409788489341736, "grad_norm": 30.54098892211914, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.5849795937538147, "logits/rejected": -0.5741031169891357, "logps/chosen": -380.8844299316406, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -667.6217651367188, "loss": 1.0804, "margin_dpo/margin_mean": 250.6724853515625, "margin_dpo/margin_std": 310.9295654296875, "step": 435 }, { "KL/chosen_KL_mean": -350.72491455078125, "KL/mean": -460.7542724609375, "KL/rejected_KL_mean": -570.7836303710938, "KL/std": 239.6090087890625, "epoch": 0.6402349486049926, "fcm_dpo/beta": 0.0016525493701919913, "fcm_dpo/delta": 0.03764678165316582, "fcm_dpo/margin": 220.05868530273438, "fcm_dpo/q_t": 0.41519662737846375, "grad_norm": 44.92102813720703, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.5648950934410095, "logits/rejected": -0.5372939109802246, "logps/chosen": -422.1286315917969, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -653.5113525390625, "loss": 1.1082, "margin_dpo/margin_mean": 220.05868530273438, "margin_dpo/margin_std": 284.269287109375, "step": 436 }, { "KL/chosen_KL_mean": -357.39385986328125, "KL/mean": -470.41290283203125, "KL/rejected_KL_mean": -583.431884765625, "KL/std": 238.08837890625, "epoch": 0.6417033773861968, "fcm_dpo/beta": 0.0016638417728245258, "fcm_dpo/delta": 0.02482348121702671, "fcm_dpo/margin": 226.03802490234375, "fcm_dpo/q_t": 0.41194066405296326, "grad_norm": 30.965953826904297, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.5961349010467529, "logits/rejected": -0.575666606426239, "logps/chosen": -422.13812255859375, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -665.4754638671875, "loss": 1.1003, "margin_dpo/margin_mean": 226.03802490234375, "margin_dpo/margin_std": 288.8181457519531, "step": 437 }, { "KL/chosen_KL_mean": -368.9373779296875, "KL/mean": -496.147216796875, "KL/rejected_KL_mean": -623.3570556640625, "KL/std": 269.57830810546875, "epoch": 0.6431718061674009, "fcm_dpo/beta": 0.0016566277481615543, "fcm_dpo/delta": -0.02283564768731594, "fcm_dpo/margin": 254.41964721679688, "fcm_dpo/q_t": 0.40168195962905884, "grad_norm": 31.922332763671875, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.6157523393630981, "logits/rejected": -0.6025946736335754, "logps/chosen": -427.9560546875, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682800292969, "logps/rejected": -706.433837890625, "loss": 1.0646, "margin_dpo/margin_mean": 254.41964721679688, "margin_dpo/margin_std": 291.51605224609375, "step": 438 }, { "KL/chosen_KL_mean": -385.7437744140625, "KL/mean": -491.2313537597656, "KL/rejected_KL_mean": -596.718994140625, "KL/std": 272.39862060546875, "epoch": 0.644640234948605, "fcm_dpo/beta": 0.0016382005997002125, "fcm_dpo/delta": -0.06990180164575577, "fcm_dpo/margin": 210.9752655029297, "fcm_dpo/q_t": 0.420367956161499, "grad_norm": 29.01395606994629, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.5831667184829712, "logits/rejected": -0.5851659774780273, "logps/chosen": -439.52783203125, "logps/ref_chosen": -53.78407669067383, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -680.7044677734375, "loss": 1.1382, "margin_dpo/margin_mean": 210.97525024414062, "margin_dpo/margin_std": 309.2723388671875, "step": 439 }, { "KL/chosen_KL_mean": -426.24176025390625, "KL/mean": -549.2017822265625, "KL/rejected_KL_mean": -672.161865234375, "KL/std": 338.7626953125, "epoch": 0.6461086637298091, "fcm_dpo/beta": 0.0016412187833338976, "fcm_dpo/delta": -0.0038700848817825317, "fcm_dpo/margin": 245.9200897216797, "fcm_dpo/q_t": 0.41035932302474976, "grad_norm": 30.73674201965332, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.6162642240524292, "logits/rejected": -0.6109949946403503, "logps/chosen": -504.8084716796875, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -768.65966796875, "loss": 1.0978, "margin_dpo/margin_mean": 245.92010498046875, "margin_dpo/margin_std": 351.30108642578125, "step": 440 }, { "KL/chosen_KL_mean": -475.21966552734375, "KL/mean": -594.8802490234375, "KL/rejected_KL_mean": -714.5408935546875, "KL/std": 347.33465576171875, "epoch": 0.6475770925110133, "fcm_dpo/beta": 0.0016440332401543856, "fcm_dpo/delta": 0.006565794348716736, "fcm_dpo/margin": 239.3212127685547, "fcm_dpo/q_t": 0.4138296842575073, "grad_norm": 38.487876892089844, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.6530791521072388, "logits/rejected": -0.6417888402938843, "logps/chosen": -536.0440673828125, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -811.0116577148438, "loss": 1.1403, "margin_dpo/margin_mean": 239.3212127685547, "margin_dpo/margin_std": 414.8189392089844, "step": 441 }, { "KL/chosen_KL_mean": -410.76324462890625, "KL/mean": -554.4638671875, "KL/rejected_KL_mean": -698.1644287109375, "KL/std": 337.224853515625, "epoch": 0.6490455212922174, "fcm_dpo/beta": 0.0016326969489455223, "fcm_dpo/delta": -0.07312282174825668, "fcm_dpo/margin": 287.401123046875, "fcm_dpo/q_t": 0.39594757556915283, "grad_norm": 32.10857391357422, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.6253660321235657, "logits/rejected": -0.6190581321716309, "logps/chosen": -457.77447509765625, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -774.7036743164062, "loss": 1.0621, "margin_dpo/margin_mean": 287.401123046875, "margin_dpo/margin_std": 378.95294189453125, "step": 442 }, { "KL/chosen_KL_mean": -458.8716735839844, "KL/mean": -553.6380615234375, "KL/rejected_KL_mean": -648.4044799804688, "KL/std": 334.3310546875, "epoch": 0.6505139500734214, "fcm_dpo/beta": 0.0016142401145771146, "fcm_dpo/delta": -0.0032433748710900545, "fcm_dpo/margin": 189.53274536132812, "fcm_dpo/q_t": 0.43172866106033325, "grad_norm": 36.12958526611328, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6477606296539307, "logits/rejected": -0.6300950050354004, "logps/chosen": -530.1447143554688, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -735.0844116210938, "loss": 1.2045, "margin_dpo/margin_mean": 189.53274536132812, "margin_dpo/margin_std": 403.9544677734375, "step": 443 }, { "KL/chosen_KL_mean": -452.4591064453125, "KL/mean": -600.4232177734375, "KL/rejected_KL_mean": -748.3873291015625, "KL/std": 348.21685791015625, "epoch": 0.6519823788546255, "fcm_dpo/beta": 0.0015917312121018767, "fcm_dpo/delta": -0.0749378427863121, "fcm_dpo/margin": 295.9281921386719, "fcm_dpo/q_t": 0.39380595088005066, "grad_norm": 33.08613586425781, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.6086077690124512, "logits/rejected": -0.6182563900947571, "logps/chosen": -509.6728515625, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489807128906, "logps/rejected": -845.6422119140625, "loss": 1.0518, "margin_dpo/margin_mean": 295.92822265625, "margin_dpo/margin_std": 379.2266845703125, "step": 444 }, { "KL/chosen_KL_mean": -396.7808532714844, "KL/mean": -538.607177734375, "KL/rejected_KL_mean": -680.43359375, "KL/std": 282.3050537109375, "epoch": 0.6534508076358296, "fcm_dpo/beta": 0.0015803833957761526, "fcm_dpo/delta": -0.05051477625966072, "fcm_dpo/margin": 283.6526794433594, "fcm_dpo/q_t": 0.39755940437316895, "grad_norm": 30.974002838134766, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.6230882406234741, "logits/rejected": -0.6196198463439941, "logps/chosen": -464.0806579589844, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267059326172, "logps/rejected": -773.1162109375, "loss": 1.0577, "margin_dpo/margin_mean": 283.6527099609375, "margin_dpo/margin_std": 343.00897216796875, "step": 445 }, { "KL/chosen_KL_mean": -362.6717529296875, "KL/mean": -511.12188720703125, "KL/rejected_KL_mean": -659.572021484375, "KL/std": 300.0148620605469, "epoch": 0.6549192364170338, "fcm_dpo/beta": 0.0015668668784201145, "fcm_dpo/delta": -0.06862294673919678, "fcm_dpo/margin": 296.90032958984375, "fcm_dpo/q_t": 0.39133375883102417, "grad_norm": 34.6982421875, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.6164995431900024, "logits/rejected": -0.6332226991653442, "logps/chosen": -421.77020263671875, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -760.836181640625, "loss": 1.0322, "margin_dpo/margin_mean": 296.90032958984375, "margin_dpo/margin_std": 308.796875, "step": 446 }, { "KL/chosen_KL_mean": -374.0321044921875, "KL/mean": -528.794189453125, "KL/rejected_KL_mean": -683.5562744140625, "KL/std": 340.5907897949219, "epoch": 0.6563876651982379, "fcm_dpo/beta": 0.0015306383138522506, "fcm_dpo/delta": -0.07781445980072021, "fcm_dpo/margin": 309.5242004394531, "fcm_dpo/q_t": 0.3929889500141144, "grad_norm": 34.17955780029297, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.6846290826797485, "logits/rejected": -0.6984615325927734, "logps/chosen": -430.107421875, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -782.2510986328125, "loss": 1.0422, "margin_dpo/margin_mean": 309.5242004394531, "margin_dpo/margin_std": 381.69482421875, "step": 447 }, { "KL/chosen_KL_mean": -422.5325622558594, "KL/mean": -566.2469482421875, "KL/rejected_KL_mean": -709.9613037109375, "KL/std": 306.77191162109375, "epoch": 0.657856093979442, "fcm_dpo/beta": 0.0015249757561832666, "fcm_dpo/delta": -0.040184423327445984, "fcm_dpo/margin": 287.4287414550781, "fcm_dpo/q_t": 0.3992360234260559, "grad_norm": 42.507076263427734, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.579893171787262, "logits/rejected": -0.5896936655044556, "logps/chosen": -482.5364074707031, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -812.2259521484375, "loss": 1.0631, "margin_dpo/margin_mean": 287.42877197265625, "margin_dpo/margin_std": 350.75927734375, "step": 448 }, { "KL/chosen_KL_mean": -422.4124755859375, "KL/mean": -570.7263793945312, "KL/rejected_KL_mean": -719.040283203125, "KL/std": 365.52325439453125, "epoch": 0.6593245227606461, "fcm_dpo/beta": 0.0015035069081932306, "fcm_dpo/delta": -0.048243433237075806, "fcm_dpo/margin": 296.62786865234375, "fcm_dpo/q_t": 0.40181848406791687, "grad_norm": 32.49612808227539, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.6347248554229736, "logits/rejected": -0.6339551210403442, "logps/chosen": -489.9390869140625, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -807.63720703125, "loss": 1.0819, "margin_dpo/margin_mean": 296.62786865234375, "margin_dpo/margin_std": 435.5914306640625, "step": 449 }, { "KL/chosen_KL_mean": -343.0860595703125, "KL/mean": -506.05120849609375, "KL/rejected_KL_mean": -669.016357421875, "KL/std": 322.73223876953125, "epoch": 0.6607929515418502, "fcm_dpo/beta": 0.0014846834819763899, "fcm_dpo/delta": -0.08808425813913345, "fcm_dpo/margin": 325.9302978515625, "fcm_dpo/q_t": 0.3863303065299988, "grad_norm": 61.81324005126953, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6573776006698608, "logits/rejected": -0.6733522415161133, "logps/chosen": -400.1941833496094, "logps/ref_chosen": -57.10811996459961, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -771.7713623046875, "loss": 1.013, "margin_dpo/margin_mean": 325.9302978515625, "margin_dpo/margin_std": 321.34356689453125, "step": 450 }, { "KL/chosen_KL_mean": -470.1484375, "KL/mean": -591.3046875, "KL/rejected_KL_mean": -712.4609985351562, "KL/std": 378.45947265625, "epoch": 0.6622613803230544, "fcm_dpo/beta": 0.001488700625486672, "fcm_dpo/delta": 0.04048318788409233, "fcm_dpo/margin": 242.31259155273438, "fcm_dpo/q_t": 0.4151974320411682, "grad_norm": 32.96580505371094, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.7209557294845581, "logits/rejected": -0.7045374512672424, "logps/chosen": -528.6172485351562, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -785.390380859375, "loss": 1.153, "margin_dpo/margin_mean": 242.31259155273438, "margin_dpo/margin_std": 429.1678466796875, "step": 451 }, { "KL/chosen_KL_mean": -337.15887451171875, "KL/mean": -480.21685791015625, "KL/rejected_KL_mean": -623.2747802734375, "KL/std": 290.8729248046875, "epoch": 0.6637298091042585, "fcm_dpo/beta": 0.001479277154430747, "fcm_dpo/delta": -0.024412650614976883, "fcm_dpo/margin": 286.115966796875, "fcm_dpo/q_t": 0.39996248483657837, "grad_norm": 22.152088165283203, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.6340690851211548, "logits/rejected": -0.6552602052688599, "logps/chosen": -387.5494384765625, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77142333984375, "logps/rejected": -721.0462646484375, "loss": 1.0531, "margin_dpo/margin_mean": 286.115966796875, "margin_dpo/margin_std": 296.19451904296875, "step": 452 }, { "KL/chosen_KL_mean": -392.34857177734375, "KL/mean": -526.1040649414062, "KL/rejected_KL_mean": -659.8594970703125, "KL/std": 298.7330322265625, "epoch": 0.6651982378854625, "fcm_dpo/beta": 0.0014776124153286219, "fcm_dpo/delta": 0.004782242700457573, "fcm_dpo/margin": 267.510986328125, "fcm_dpo/q_t": 0.4101276695728302, "grad_norm": 30.73015785217285, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.6592116355895996, "logits/rejected": -0.6468891501426697, "logps/chosen": -450.06341552734375, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -742.0669555664062, "loss": 1.0938, "margin_dpo/margin_mean": 267.510986328125, "margin_dpo/margin_std": 357.81243896484375, "step": 453 }, { "KL/chosen_KL_mean": -472.8430480957031, "KL/mean": -623.6031494140625, "KL/rejected_KL_mean": -774.3634033203125, "KL/std": 340.2906494140625, "epoch": 0.6666666666666666, "fcm_dpo/beta": 0.0014682337641716003, "fcm_dpo/delta": -0.044936180114746094, "fcm_dpo/margin": 301.52032470703125, "fcm_dpo/q_t": 0.3982999324798584, "grad_norm": 27.640525817871094, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.7141397595405579, "logits/rejected": -0.707220196723938, "logps/chosen": -533.7886962890625, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.95079040527344, "logps/rejected": -859.3141479492188, "loss": 1.0603, "margin_dpo/margin_mean": 301.52032470703125, "margin_dpo/margin_std": 374.39154052734375, "step": 454 }, { "KL/chosen_KL_mean": -406.83978271484375, "KL/mean": -583.8450927734375, "KL/rejected_KL_mean": -760.850341796875, "KL/std": 351.13433837890625, "epoch": 0.6681350954478708, "fcm_dpo/beta": 0.0014530689222738147, "fcm_dpo/delta": -0.12079726159572601, "fcm_dpo/margin": 354.0106201171875, "fcm_dpo/q_t": 0.38350850343704224, "grad_norm": 39.29196548461914, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6605424284934998, "logits/rejected": -0.683269202709198, "logps/chosen": -451.7264709472656, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -876.15185546875, "loss": 1.0177, "margin_dpo/margin_mean": 354.0106201171875, "margin_dpo/margin_std": 396.34912109375, "step": 455 }, { "KL/chosen_KL_mean": -423.3841552734375, "KL/mean": -593.1605224609375, "KL/rejected_KL_mean": -762.9368896484375, "KL/std": 356.31634521484375, "epoch": 0.6696035242290749, "fcm_dpo/beta": 0.0014169735368341208, "fcm_dpo/delta": -0.08517496287822723, "fcm_dpo/margin": 339.55279541015625, "fcm_dpo/q_t": 0.3890087902545929, "grad_norm": 29.1214599609375, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.6683529615402222, "logits/rejected": -0.6928262710571289, "logps/chosen": -480.4209289550781, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21784210205078, "logps/rejected": -868.1547241210938, "loss": 1.0156, "margin_dpo/margin_mean": 339.552734375, "margin_dpo/margin_std": 345.1535949707031, "step": 456 }, { "KL/chosen_KL_mean": -411.3397216796875, "KL/mean": -585.0991821289062, "KL/rejected_KL_mean": -758.858642578125, "KL/std": 348.389892578125, "epoch": 0.671071953010279, "fcm_dpo/beta": 0.0013881283812224865, "fcm_dpo/delta": -0.08688442409038544, "fcm_dpo/margin": 347.5188903808594, "fcm_dpo/q_t": 0.3869348466396332, "grad_norm": 29.03094482421875, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.6927535533905029, "logits/rejected": -0.6937886476516724, "logps/chosen": -465.582275390625, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -843.9682006835938, "loss": 1.0138, "margin_dpo/margin_mean": 347.5188903808594, "margin_dpo/margin_std": 340.45025634765625, "step": 457 }, { "KL/chosen_KL_mean": -413.8070068359375, "KL/mean": -569.9956665039062, "KL/rejected_KL_mean": -726.184326171875, "KL/std": 318.3636474609375, "epoch": 0.6725403817914831, "fcm_dpo/beta": 0.0013727301266044378, "fcm_dpo/delta": -0.030638840049505234, "fcm_dpo/margin": 312.3773193359375, "fcm_dpo/q_t": 0.401597797870636, "grad_norm": 23.270376205444336, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.7083392143249512, "logits/rejected": -0.7040765285491943, "logps/chosen": -469.21588134765625, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -823.8675537109375, "loss": 1.0625, "margin_dpo/margin_mean": 312.3773193359375, "margin_dpo/margin_std": 366.5815734863281, "step": 458 }, { "KL/chosen_KL_mean": -460.8892822265625, "KL/mean": -626.6010131835938, "KL/rejected_KL_mean": -792.312744140625, "KL/std": 359.20355224609375, "epoch": 0.6740088105726872, "fcm_dpo/beta": 0.0013611916219815612, "fcm_dpo/delta": -0.05394328758120537, "fcm_dpo/margin": 331.42340087890625, "fcm_dpo/q_t": 0.3961183726787567, "grad_norm": 29.07042121887207, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.690535306930542, "logits/rejected": -0.708480954170227, "logps/chosen": -507.44677734375, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -878.4812622070312, "loss": 1.0548, "margin_dpo/margin_mean": 331.4234313964844, "margin_dpo/margin_std": 397.4347839355469, "step": 459 }, { "KL/chosen_KL_mean": -498.5191650390625, "KL/mean": -676.511474609375, "KL/rejected_KL_mean": -854.5037841796875, "KL/std": 354.446533203125, "epoch": 0.6754772393538914, "fcm_dpo/beta": 0.001346941338852048, "fcm_dpo/delta": -0.08341852575540543, "fcm_dpo/margin": 355.984619140625, "fcm_dpo/q_t": 0.3894132971763611, "grad_norm": 34.67963790893555, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.7603031396865845, "logits/rejected": -0.7895260453224182, "logps/chosen": -550.154052734375, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -958.6231689453125, "loss": 1.0251, "margin_dpo/margin_mean": 355.984619140625, "margin_dpo/margin_std": 386.4249572753906, "step": 460 }, { "KL/chosen_KL_mean": -527.8145751953125, "KL/mean": -668.647705078125, "KL/rejected_KL_mean": -809.4808349609375, "KL/std": 355.2934265136719, "epoch": 0.6769456681350955, "fcm_dpo/beta": 0.001346740871667862, "fcm_dpo/delta": 0.02132502943277359, "fcm_dpo/margin": 281.6662902832031, "fcm_dpo/q_t": 0.4133082628250122, "grad_norm": 28.388534545898438, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.7709100842475891, "logits/rejected": -0.7842754125595093, "logps/chosen": -582.9965209960938, "logps/ref_chosen": -55.18195724487305, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -895.957763671875, "loss": 1.1057, "margin_dpo/margin_mean": 281.666259765625, "margin_dpo/margin_std": 388.0255432128906, "step": 461 }, { "KL/chosen_KL_mean": -528.8763427734375, "KL/mean": -648.3497314453125, "KL/rejected_KL_mean": -767.8230590820312, "KL/std": 363.253662109375, "epoch": 0.6784140969162996, "fcm_dpo/beta": 0.0013595143100246787, "fcm_dpo/delta": 0.0777268186211586, "fcm_dpo/margin": 238.94668579101562, "fcm_dpo/q_t": 0.4269101023674011, "grad_norm": 32.2818717956543, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.83504319190979, "logits/rejected": -0.8283437490463257, "logps/chosen": -598.804443359375, "logps/ref_chosen": -69.92803192138672, "logps/ref_rejected": -78.84111022949219, "logps/rejected": -846.6641845703125, "loss": 1.1576, "margin_dpo/margin_mean": 238.94668579101562, "margin_dpo/margin_std": 403.227294921875, "step": 462 }, { "KL/chosen_KL_mean": -522.392578125, "KL/mean": -674.9669799804688, "KL/rejected_KL_mean": -827.5413818359375, "KL/std": 369.949951171875, "epoch": 0.6798825256975036, "fcm_dpo/beta": 0.0013674467336386442, "fcm_dpo/delta": -0.018231874331831932, "fcm_dpo/margin": 305.1488037109375, "fcm_dpo/q_t": 0.4052046537399292, "grad_norm": 36.683773040771484, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.7768852710723877, "logits/rejected": -0.7839001417160034, "logps/chosen": -577.6669921875, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -916.5663452148438, "loss": 1.0875, "margin_dpo/margin_mean": 305.1487731933594, "margin_dpo/margin_std": 413.82952880859375, "step": 463 }, { "KL/chosen_KL_mean": -523.5928955078125, "KL/mean": -754.2376098632812, "KL/rejected_KL_mean": -984.88232421875, "KL/std": 429.15692138671875, "epoch": 0.6813509544787077, "fcm_dpo/beta": 0.0013100993819534779, "fcm_dpo/delta": -0.2194000482559204, "fcm_dpo/margin": 461.2894592285156, "fcm_dpo/q_t": 0.3615615665912628, "grad_norm": 38.82695770263672, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.7948259115219116, "logits/rejected": -0.8514028787612915, "logps/chosen": -574.5052490234375, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -1087.3717041015625, "loss": 0.9451, "margin_dpo/margin_mean": 461.2894287109375, "margin_dpo/margin_std": 441.2557678222656, "step": 464 }, { "KL/chosen_KL_mean": -508.05780029296875, "KL/mean": -700.8915405273438, "KL/rejected_KL_mean": -893.725341796875, "KL/std": 442.56414794921875, "epoch": 0.6828193832599119, "fcm_dpo/beta": 0.0012883164454251528, "fcm_dpo/delta": -0.10184454917907715, "fcm_dpo/margin": 385.66748046875, "fcm_dpo/q_t": 0.38422703742980957, "grad_norm": 43.68606948852539, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.7833234071731567, "logits/rejected": -0.8054988980293274, "logps/chosen": -568.1746826171875, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -1007.67138671875, "loss": 1.0303, "margin_dpo/margin_mean": 385.66748046875, "margin_dpo/margin_std": 453.04437255859375, "step": 465 }, { "KL/chosen_KL_mean": -554.3604125976562, "KL/mean": -719.1517944335938, "KL/rejected_KL_mean": -883.9432373046875, "KL/std": 401.8727111816406, "epoch": 0.684287812041116, "fcm_dpo/beta": 0.0012672768207266927, "fcm_dpo/delta": -0.01892733946442604, "fcm_dpo/margin": 329.58282470703125, "fcm_dpo/q_t": 0.4031521677970886, "grad_norm": 32.980648040771484, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.7898865938186646, "logits/rejected": -0.7923921942710876, "logps/chosen": -607.2813110351562, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -974.2586669921875, "loss": 1.0868, "margin_dpo/margin_mean": 329.58282470703125, "margin_dpo/margin_std": 441.3002624511719, "step": 466 }, { "KL/chosen_KL_mean": -718.9763793945312, "KL/mean": -897.8390502929688, "KL/rejected_KL_mean": -1076.70166015625, "KL/std": 534.5426635742188, "epoch": 0.6857562408223201, "fcm_dpo/beta": 0.0012504856567829847, "fcm_dpo/delta": -0.051718711853027344, "fcm_dpo/margin": 357.7253112792969, "fcm_dpo/q_t": 0.4021691381931305, "grad_norm": 48.61074447631836, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.899175763130188, "logits/rejected": -0.8960803747177124, "logps/chosen": -797.6921997070312, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -1179.5618896484375, "loss": 1.1327, "margin_dpo/margin_mean": 357.72528076171875, "margin_dpo/margin_std": 623.5672607421875, "step": 467 }, { "KL/chosen_KL_mean": -574.6669921875, "KL/mean": -787.5557861328125, "KL/rejected_KL_mean": -1000.444580078125, "KL/std": 476.1236267089844, "epoch": 0.6872246696035242, "fcm_dpo/beta": 0.001233407761901617, "fcm_dpo/delta": -0.13217654824256897, "fcm_dpo/margin": 425.77752685546875, "fcm_dpo/q_t": 0.38278520107269287, "grad_norm": 49.870811462402344, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.8734508752822876, "logits/rejected": -0.8809393644332886, "logps/chosen": -644.6024169921875, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02880859375, "logps/rejected": -1101.473388671875, "loss": 1.0183, "margin_dpo/margin_mean": 425.77752685546875, "margin_dpo/margin_std": 522.344970703125, "step": 468 }, { "KL/chosen_KL_mean": -616.0804443359375, "KL/mean": -829.309814453125, "KL/rejected_KL_mean": -1042.5391845703125, "KL/std": 435.38775634765625, "epoch": 0.6886930983847284, "fcm_dpo/beta": 0.0011981693096458912, "fcm_dpo/delta": -0.11811123043298721, "fcm_dpo/margin": 426.45867919921875, "fcm_dpo/q_t": 0.38290101289749146, "grad_norm": 41.835994720458984, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.871213436126709, "logits/rejected": -0.8826764822006226, "logps/chosen": -684.2052001953125, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -1147.32568359375, "loss": 1.0167, "margin_dpo/margin_mean": 426.4587097167969, "margin_dpo/margin_std": 479.927978515625, "step": 469 }, { "KL/chosen_KL_mean": -526.5010986328125, "KL/mean": -717.6756591796875, "KL/rejected_KL_mean": -908.85009765625, "KL/std": 461.4809265136719, "epoch": 0.6901615271659325, "fcm_dpo/beta": 0.0011902997503057122, "fcm_dpo/delta": -0.05773991718888283, "fcm_dpo/margin": 382.3490905761719, "fcm_dpo/q_t": 0.39579594135284424, "grad_norm": 28.576284408569336, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.8578736782073975, "logits/rejected": -0.8692770004272461, "logps/chosen": -570.29296875, "logps/ref_chosen": -43.791927337646484, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -991.552978515625, "loss": 1.07, "margin_dpo/margin_mean": 382.34912109375, "margin_dpo/margin_std": 517.0437622070312, "step": 470 }, { "KL/chosen_KL_mean": -668.16845703125, "KL/mean": -836.795654296875, "KL/rejected_KL_mean": -1005.4229125976562, "KL/std": 489.2864990234375, "epoch": 0.6916299559471366, "fcm_dpo/beta": 0.0011771449353545904, "fcm_dpo/delta": 0.0024843141436576843, "fcm_dpo/margin": 337.25445556640625, "fcm_dpo/q_t": 0.41261669993400574, "grad_norm": 33.51979446411133, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.9775102734565735, "logits/rejected": -0.9886398315429688, "logps/chosen": -731.5079345703125, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -1089.033447265625, "loss": 1.1157, "margin_dpo/margin_mean": 337.25445556640625, "margin_dpo/margin_std": 525.2001342773438, "step": 471 }, { "KL/chosen_KL_mean": -668.5523681640625, "KL/mean": -864.1845703125, "KL/rejected_KL_mean": -1059.816650390625, "KL/std": 573.5145263671875, "epoch": 0.6930983847283406, "fcm_dpo/beta": 0.001169139752164483, "fcm_dpo/delta": -0.060674797743558884, "fcm_dpo/margin": 391.2642822265625, "fcm_dpo/q_t": 0.4020659327507019, "grad_norm": 32.862762451171875, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.9176386594772339, "logits/rejected": -0.8923181295394897, "logps/chosen": -752.218505859375, "logps/ref_chosen": -83.66610717773438, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -1177.02587890625, "loss": 1.098, "margin_dpo/margin_mean": 391.2642822265625, "margin_dpo/margin_std": 617.0120849609375, "step": 472 }, { "KL/chosen_KL_mean": -801.0420532226562, "KL/mean": -896.4287109375, "KL/rejected_KL_mean": -991.8154296875, "KL/std": 595.682373046875, "epoch": 0.6945668135095447, "fcm_dpo/beta": 0.0011696891160681844, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 190.77340698242188, "fcm_dpo/q_t": 0.45167526602745056, "grad_norm": 116.70816802978516, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -0.896651029586792, "logits/rejected": -0.8670951128005981, "logps/chosen": -864.5390625, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -1072.9620361328125, "loss": 1.3537, "margin_dpo/margin_mean": 190.77340698242188, "margin_dpo/margin_std": 781.681640625, "step": 473 }, { "KL/chosen_KL_mean": -648.32958984375, "KL/mean": -815.1319580078125, "KL/rejected_KL_mean": -981.9342041015625, "KL/std": 509.63934326171875, "epoch": 0.6960352422907489, "fcm_dpo/beta": 0.001158315921202302, "fcm_dpo/delta": -0.09819056838750839, "fcm_dpo/margin": 333.6046447753906, "fcm_dpo/q_t": 0.4119930565357208, "grad_norm": 38.174800872802734, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.8967859745025635, "logits/rejected": -0.9001563191413879, "logps/chosen": -700.9415283203125, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -1072.0146484375, "loss": 1.149, "margin_dpo/margin_mean": 333.6046142578125, "margin_dpo/margin_std": 572.8232421875, "step": 474 }, { "KL/chosen_KL_mean": -477.9742126464844, "KL/mean": -690.75537109375, "KL/rejected_KL_mean": -903.536376953125, "KL/std": 416.74029541015625, "epoch": 0.697503671071953, "fcm_dpo/beta": 0.0011308316607028246, "fcm_dpo/delta": -0.08551047742366791, "fcm_dpo/margin": 425.5621337890625, "fcm_dpo/q_t": 0.3888343572616577, "grad_norm": 39.29723358154297, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.9726539850234985, "logits/rejected": -0.9993470907211304, "logps/chosen": -520.4694213867188, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06294250488281, "logps/rejected": -993.599365234375, "loss": 1.0222, "margin_dpo/margin_mean": 425.56219482421875, "margin_dpo/margin_std": 453.77056884765625, "step": 475 }, { "KL/chosen_KL_mean": -573.8340454101562, "KL/mean": -741.2000732421875, "KL/rejected_KL_mean": -908.5662841796875, "KL/std": 460.19805908203125, "epoch": 0.6989720998531571, "fcm_dpo/beta": 0.0011303846258670092, "fcm_dpo/delta": 0.02247927524149418, "fcm_dpo/margin": 334.7321472167969, "fcm_dpo/q_t": 0.41371750831604004, "grad_norm": 51.48725509643555, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.9355987310409546, "logits/rejected": -0.9493337869644165, "logps/chosen": -616.783447265625, "logps/ref_chosen": -42.94938278198242, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -982.2764892578125, "loss": 1.1039, "margin_dpo/margin_mean": 334.732177734375, "margin_dpo/margin_std": 454.2925720214844, "step": 476 }, { "KL/chosen_KL_mean": -620.339599609375, "KL/mean": -789.1563110351562, "KL/rejected_KL_mean": -957.9730224609375, "KL/std": 484.395751953125, "epoch": 0.7004405286343612, "fcm_dpo/beta": 0.001139188650995493, "fcm_dpo/delta": 0.0156848281621933, "fcm_dpo/margin": 337.6333923339844, "fcm_dpo/q_t": 0.4125925898551941, "grad_norm": 32.003143310546875, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.9794288873672485, "logits/rejected": -0.9472505450248718, "logps/chosen": -691.1122436523438, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -1034.1103515625, "loss": 1.1367, "margin_dpo/margin_mean": 337.6333923339844, "margin_dpo/margin_std": 572.432373046875, "step": 477 }, { "KL/chosen_KL_mean": -509.4632568359375, "KL/mean": -703.776123046875, "KL/rejected_KL_mean": -898.0889282226562, "KL/std": 428.4776611328125, "epoch": 0.7019089574155654, "fcm_dpo/beta": 0.0011328569380566478, "fcm_dpo/delta": -0.04220225661993027, "fcm_dpo/margin": 388.6256103515625, "fcm_dpo/q_t": 0.3992430567741394, "grad_norm": 36.747989654541016, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.8643758296966553, "logits/rejected": -0.8856371641159058, "logps/chosen": -550.90380859375, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -983.450927734375, "loss": 1.071, "margin_dpo/margin_mean": 388.6256103515625, "margin_dpo/margin_std": 506.4171142578125, "step": 478 }, { "KL/chosen_KL_mean": -674.1643676757812, "KL/mean": -861.4671020507812, "KL/rejected_KL_mean": -1048.769775390625, "KL/std": 529.594482421875, "epoch": 0.7033773861967695, "fcm_dpo/beta": 0.0011302338680252433, "fcm_dpo/delta": -0.025240201503038406, "fcm_dpo/margin": 374.60540771484375, "fcm_dpo/q_t": 0.40734556317329407, "grad_norm": 28.37042236328125, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.9004903435707092, "logits/rejected": -0.9291303753852844, "logps/chosen": -728.072265625, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -1143.8861083984375, "loss": 1.1188, "margin_dpo/margin_mean": 374.60540771484375, "margin_dpo/margin_std": 616.9238891601562, "step": 479 }, { "KL/chosen_KL_mean": -794.90771484375, "KL/mean": -941.1514892578125, "KL/rejected_KL_mean": -1087.395263671875, "KL/std": 502.28509521484375, "epoch": 0.7048458149779736, "fcm_dpo/beta": 0.0011154343374073505, "fcm_dpo/delta": -0.04910217225551605, "fcm_dpo/margin": 292.4875183105469, "fcm_dpo/q_t": 0.42702075839042664, "grad_norm": 56.938568115234375, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.9496725797653198, "logits/rejected": -0.9423930644989014, "logps/chosen": -853.5903930664062, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -1170.32763671875, "loss": 1.1837, "margin_dpo/margin_mean": 292.48748779296875, "margin_dpo/margin_std": 548.897216796875, "step": 480 }, { "KL/chosen_KL_mean": -666.3424072265625, "KL/mean": -894.8223876953125, "KL/rejected_KL_mean": -1123.3023681640625, "KL/std": 532.44287109375, "epoch": 0.7063142437591777, "fcm_dpo/beta": 0.0010935836471617222, "fcm_dpo/delta": -0.10490460693836212, "fcm_dpo/margin": 456.9600524902344, "fcm_dpo/q_t": 0.3881867527961731, "grad_norm": 31.939149856567383, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.9752233028411865, "logits/rejected": -1.0002660751342773, "logps/chosen": -721.306640625, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -1215.722900390625, "loss": 1.0369, "margin_dpo/margin_mean": 456.96002197265625, "margin_dpo/margin_std": 578.0529174804688, "step": 481 }, { "KL/chosen_KL_mean": -705.298828125, "KL/mean": -807.7088623046875, "KL/rejected_KL_mean": -910.1187744140625, "KL/std": 523.2813720703125, "epoch": 0.7077826725403817, "fcm_dpo/beta": 0.0010957256890833378, "fcm_dpo/delta": 0.08043741434812546, "fcm_dpo/margin": 204.8200225830078, "fcm_dpo/q_t": 0.4466710090637207, "grad_norm": 58.26255416870117, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.8530906438827515, "logits/rejected": -0.8416086435317993, "logps/chosen": -772.852294921875, "logps/ref_chosen": -67.553466796875, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -997.7083740234375, "loss": 1.2776, "margin_dpo/margin_mean": 204.82000732421875, "margin_dpo/margin_std": 610.1978759765625, "step": 482 }, { "KL/chosen_KL_mean": -627.250244140625, "KL/mean": -863.0478515625, "KL/rejected_KL_mean": -1098.845458984375, "KL/std": 514.4844970703125, "epoch": 0.7092511013215859, "fcm_dpo/beta": 0.0010834920685738325, "fcm_dpo/delta": -0.11722610890865326, "fcm_dpo/margin": 471.59527587890625, "fcm_dpo/q_t": 0.3861696720123291, "grad_norm": 35.892913818359375, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.9211816787719727, "logits/rejected": -0.9332787394523621, "logps/chosen": -690.580078125, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -1194.632568359375, "loss": 1.026, "margin_dpo/margin_mean": 471.59527587890625, "margin_dpo/margin_std": 582.1723022460938, "step": 483 }, { "KL/chosen_KL_mean": -537.6214599609375, "KL/mean": -750.1485595703125, "KL/rejected_KL_mean": -962.6756591796875, "KL/std": 499.0599670410156, "epoch": 0.71071953010279, "fcm_dpo/beta": 0.0010744791943579912, "fcm_dpo/delta": -0.059597231447696686, "fcm_dpo/margin": 425.05419921875, "fcm_dpo/q_t": 0.39632922410964966, "grad_norm": 45.20163345336914, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.9501423835754395, "logits/rejected": -0.974023699760437, "logps/chosen": -596.7596435546875, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -1047.047119140625, "loss": 1.0526, "margin_dpo/margin_mean": 425.0542297363281, "margin_dpo/margin_std": 516.1748657226562, "step": 484 }, { "KL/chosen_KL_mean": -551.4486083984375, "KL/mean": -755.1604614257812, "KL/rejected_KL_mean": -958.8723754882812, "KL/std": 496.97821044921875, "epoch": 0.7121879588839941, "fcm_dpo/beta": 0.0010628815507516265, "fcm_dpo/delta": -0.034554317593574524, "fcm_dpo/margin": 407.4237060546875, "fcm_dpo/q_t": 0.40300631523132324, "grad_norm": 41.25438690185547, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.8435344696044922, "logits/rejected": -0.8657543659210205, "logps/chosen": -610.2982177734375, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408233642578, "logps/rejected": -1062.2364501953125, "loss": 1.0827, "margin_dpo/margin_mean": 407.4237060546875, "margin_dpo/margin_std": 568.4012451171875, "step": 485 }, { "KL/chosen_KL_mean": -635.773681640625, "KL/mean": -867.3421630859375, "KL/rejected_KL_mean": -1098.91064453125, "KL/std": 573.5325927734375, "epoch": 0.7136563876651982, "fcm_dpo/beta": 0.0010442393831908703, "fcm_dpo/delta": -0.08795761317014694, "fcm_dpo/margin": 463.13702392578125, "fcm_dpo/q_t": 0.3934386372566223, "grad_norm": 33.520912170410156, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.9062224626541138, "logits/rejected": -0.9161352515220642, "logps/chosen": -691.0333251953125, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -1191.050048828125, "loss": 1.0755, "margin_dpo/margin_mean": 463.13702392578125, "margin_dpo/margin_std": 690.687255859375, "step": 486 }, { "KL/chosen_KL_mean": -682.1150512695312, "KL/mean": -852.4932861328125, "KL/rejected_KL_mean": -1022.8714599609375, "KL/std": 518.8946533203125, "epoch": 0.7151248164464024, "fcm_dpo/beta": 0.0010508847190067172, "fcm_dpo/delta": 0.04303121566772461, "fcm_dpo/margin": 340.75640869140625, "fcm_dpo/q_t": 0.41665130853652954, "grad_norm": 35.01145935058594, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -0.9379677772521973, "logits/rejected": -0.9421348571777344, "logps/chosen": -735.1783447265625, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.41883087158203, "logps/rejected": -1115.290283203125, "loss": 1.1348, "margin_dpo/margin_mean": 340.75640869140625, "margin_dpo/margin_std": 538.1895141601562, "step": 487 }, { "KL/chosen_KL_mean": -549.458740234375, "KL/mean": -727.3135375976562, "KL/rejected_KL_mean": -905.1682739257812, "KL/std": 453.9654235839844, "epoch": 0.7165932452276065, "fcm_dpo/beta": 0.0010568746365606785, "fcm_dpo/delta": 0.024668315425515175, "fcm_dpo/margin": 355.7095947265625, "fcm_dpo/q_t": 0.4128245711326599, "grad_norm": 31.856176376342773, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.8694427609443665, "logits/rejected": -0.8729550838470459, "logps/chosen": -601.6868896484375, "logps/ref_chosen": -52.22815704345703, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -989.1748046875, "loss": 1.098, "margin_dpo/margin_mean": 355.7095642089844, "margin_dpo/margin_std": 445.57000732421875, "step": 488 }, { "KL/chosen_KL_mean": -507.87225341796875, "KL/mean": -690.164794921875, "KL/rejected_KL_mean": -872.4573974609375, "KL/std": 465.132568359375, "epoch": 0.7180616740088106, "fcm_dpo/beta": 0.001060036476701498, "fcm_dpo/delta": 0.013825876638293266, "fcm_dpo/margin": 364.5850830078125, "fcm_dpo/q_t": 0.41301560401916504, "grad_norm": 29.248310089111328, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.9246722459793091, "logits/rejected": -0.9232733249664307, "logps/chosen": -563.8619384765625, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39812469482422, "logps/rejected": -951.85546875, "loss": 1.1035, "margin_dpo/margin_mean": 364.5850830078125, "margin_dpo/margin_std": 510.5589599609375, "step": 489 }, { "KL/chosen_KL_mean": -607.0692138671875, "KL/mean": -791.8487548828125, "KL/rejected_KL_mean": -976.6283569335938, "KL/std": 546.3399047851562, "epoch": 0.7195301027900147, "fcm_dpo/beta": 0.0010612778132781386, "fcm_dpo/delta": 0.007974715903401375, "fcm_dpo/margin": 369.5592041015625, "fcm_dpo/q_t": 0.4127916693687439, "grad_norm": 38.29257583618164, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.9299312829971313, "logits/rejected": -0.9667763710021973, "logps/chosen": -659.435546875, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.4090576171875, "logps/rejected": -1087.037353515625, "loss": 1.1355, "margin_dpo/margin_mean": 369.5591735839844, "margin_dpo/margin_std": 631.1531982421875, "step": 490 }, { "KL/chosen_KL_mean": -584.8209228515625, "KL/mean": -708.0709228515625, "KL/rejected_KL_mean": -831.3209228515625, "KL/std": 479.92486572265625, "epoch": 0.7209985315712188, "fcm_dpo/beta": 0.0010824804194271564, "fcm_dpo/delta": 0.13659176230430603, "fcm_dpo/margin": 246.49998474121094, "fcm_dpo/q_t": 0.43931227922439575, "grad_norm": 29.697641372680664, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.9027219414710999, "logits/rejected": -0.8923330307006836, "logps/chosen": -644.9371948242188, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -904.59375, "loss": 1.1979, "margin_dpo/margin_mean": 246.5, "margin_dpo/margin_std": 469.1783752441406, "step": 491 }, { "KL/chosen_KL_mean": -603.3214721679688, "KL/mean": -728.932861328125, "KL/rejected_KL_mean": -854.5443115234375, "KL/std": 482.99114990234375, "epoch": 0.7224669603524229, "fcm_dpo/beta": 0.001107184449210763, "fcm_dpo/delta": 0.12549251317977905, "fcm_dpo/margin": 251.2227325439453, "fcm_dpo/q_t": 0.4381140470504761, "grad_norm": 31.3509521484375, "learning_rate": 1.097764975115576e-07, "logits/chosen": -0.9563778638839722, "logits/rejected": -0.9354947805404663, "logps/chosen": -657.315673828125, "logps/ref_chosen": -53.994178771972656, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -927.203857421875, "loss": 1.2164, "margin_dpo/margin_mean": 251.22274780273438, "margin_dpo/margin_std": 553.6586303710938, "step": 492 }, { "KL/chosen_KL_mean": -617.3892211914062, "KL/mean": -754.9351806640625, "KL/rejected_KL_mean": -892.4810791015625, "KL/std": 521.77880859375, "epoch": 0.723935389133627, "fcm_dpo/beta": 0.001113426173105836, "fcm_dpo/delta": -0.012180797755718231, "fcm_dpo/margin": 275.09185791015625, "fcm_dpo/q_t": 0.42827779054641724, "grad_norm": 33.60331344604492, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -0.975821852684021, "logits/rejected": -0.9446998834609985, "logps/chosen": -692.886474609375, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -979.8040771484375, "loss": 1.1783, "margin_dpo/margin_mean": 275.09185791015625, "margin_dpo/margin_std": 499.40582275390625, "step": 493 }, { "KL/chosen_KL_mean": -487.6140441894531, "KL/mean": -709.284912109375, "KL/rejected_KL_mean": -930.9556884765625, "KL/std": 471.108154296875, "epoch": 0.7254038179148311, "fcm_dpo/beta": 0.0010977558558806777, "fcm_dpo/delta": -0.09117947518825531, "fcm_dpo/margin": 443.3415832519531, "fcm_dpo/q_t": 0.38731634616851807, "grad_norm": 42.78213882446289, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.8541857600212097, "logits/rejected": -0.8858389854431152, "logps/chosen": -528.9733276367188, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -1017.0469970703125, "loss": 1.0186, "margin_dpo/margin_mean": 443.34161376953125, "margin_dpo/margin_std": 465.9763488769531, "step": 494 }, { "KL/chosen_KL_mean": -549.9886474609375, "KL/mean": -741.8480834960938, "KL/rejected_KL_mean": -933.7076416015625, "KL/std": 484.11676025390625, "epoch": 0.7268722466960352, "fcm_dpo/beta": 0.0010909372940659523, "fcm_dpo/delta": -0.019423317164182663, "fcm_dpo/margin": 383.71905517578125, "fcm_dpo/q_t": 0.4072011411190033, "grad_norm": 31.058788299560547, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -0.9760909080505371, "logits/rejected": -0.9824463725090027, "logps/chosen": -613.523681640625, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -1025.132080078125, "loss": 1.0979, "margin_dpo/margin_mean": 383.71905517578125, "margin_dpo/margin_std": 579.0, "step": 495 }, { "KL/chosen_KL_mean": -678.1318359375, "KL/mean": -781.4765625, "KL/rejected_KL_mean": -884.8211669921875, "KL/std": 376.869384765625, "epoch": 0.7283406754772394, "fcm_dpo/beta": 0.0011184395989403129, "fcm_dpo/delta": 0.17267850041389465, "fcm_dpo/margin": 206.68927001953125, "fcm_dpo/q_t": 0.4458683431148529, "grad_norm": 56.444026947021484, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.9465994238853455, "logits/rejected": -0.9162840843200684, "logps/chosen": -750.7238159179688, "logps/ref_chosen": -72.5919189453125, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -969.1505126953125, "loss": 1.2216, "margin_dpo/margin_mean": 206.6892852783203, "margin_dpo/margin_std": 417.4690856933594, "step": 496 }, { "KL/chosen_KL_mean": -637.2471313476562, "KL/mean": -742.4765625, "KL/rejected_KL_mean": -847.7059326171875, "KL/std": 471.04632568359375, "epoch": 0.7298091042584435, "fcm_dpo/beta": 0.0011322898790240288, "fcm_dpo/delta": 0.021924598142504692, "fcm_dpo/margin": 210.45884704589844, "fcm_dpo/q_t": 0.4449055790901184, "grad_norm": 34.23469924926758, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.89613938331604, "logits/rejected": -0.8781349658966064, "logps/chosen": -695.841064453125, "logps/ref_chosen": -58.59397506713867, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -923.9942626953125, "loss": 1.2261, "margin_dpo/margin_mean": 210.45883178710938, "margin_dpo/margin_std": 444.05548095703125, "step": 497 }, { "KL/chosen_KL_mean": -613.268798828125, "KL/mean": -784.17431640625, "KL/rejected_KL_mean": -955.0799560546875, "KL/std": 500.90350341796875, "epoch": 0.7312775330396476, "fcm_dpo/beta": 0.001133624231442809, "fcm_dpo/delta": 0.01301711704581976, "fcm_dpo/margin": 341.8111267089844, "fcm_dpo/q_t": 0.4114514887332916, "grad_norm": 39.67582321166992, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.9115738868713379, "logits/rejected": -0.8932760953903198, "logps/chosen": -684.4744873046875, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -1039.0379638671875, "loss": 1.1264, "margin_dpo/margin_mean": 341.8111267089844, "margin_dpo/margin_std": 551.8848266601562, "step": 498 }, { "KL/chosen_KL_mean": -532.6873168945312, "KL/mean": -742.4135131835938, "KL/rejected_KL_mean": -952.1397705078125, "KL/std": 498.259033203125, "epoch": 0.7327459618208517, "fcm_dpo/beta": 0.0011214257683604956, "fcm_dpo/delta": -0.07398218661546707, "fcm_dpo/margin": 419.45245361328125, "fcm_dpo/q_t": 0.39359456300735474, "grad_norm": 47.4464111328125, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -0.9583698511123657, "logits/rejected": -0.991510272026062, "logps/chosen": -583.9425048828125, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -1053.218505859375, "loss": 1.0567, "margin_dpo/margin_mean": 419.45245361328125, "margin_dpo/margin_std": 553.3563232421875, "step": 499 }, { "KL/chosen_KL_mean": -643.7020874023438, "KL/mean": -815.4398193359375, "KL/rejected_KL_mean": -987.177490234375, "KL/std": 434.6232604980469, "epoch": 0.7342143906020558, "fcm_dpo/beta": 0.00111986487172544, "fcm_dpo/delta": 0.015957213938236237, "fcm_dpo/margin": 343.4754333496094, "fcm_dpo/q_t": 0.41234683990478516, "grad_norm": 42.036109924316406, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.9217053055763245, "logits/rejected": -0.9468744993209839, "logps/chosen": -700.7294921875, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -1081.1116943359375, "loss": 1.1254, "margin_dpo/margin_mean": 343.4754333496094, "margin_dpo/margin_std": 541.9052734375, "step": 500 }, { "KL/chosen_KL_mean": -556.8917846679688, "KL/mean": -738.462158203125, "KL/rejected_KL_mean": -920.032470703125, "KL/std": 467.4353942871094, "epoch": 0.73568281938326, "fcm_dpo/beta": 0.0011241002939641476, "fcm_dpo/delta": -0.008717566728591919, "fcm_dpo/margin": 363.14068603515625, "fcm_dpo/q_t": 0.4085647165775299, "grad_norm": 31.888412475585938, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.8804645538330078, "logits/rejected": -0.8640455007553101, "logps/chosen": -611.2513427734375, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670013427734, "logps/rejected": -1000.189208984375, "loss": 1.113, "margin_dpo/margin_mean": 363.14068603515625, "margin_dpo/margin_std": 566.4456787109375, "step": 501 }, { "KL/chosen_KL_mean": -507.1001892089844, "KL/mean": -698.0469360351562, "KL/rejected_KL_mean": -888.99365234375, "KL/std": 443.9556579589844, "epoch": 0.737151248164464, "fcm_dpo/beta": 0.001116321887820959, "fcm_dpo/delta": -0.027478674426674843, "fcm_dpo/margin": 381.8934326171875, "fcm_dpo/q_t": 0.40353554487228394, "grad_norm": 25.909517288208008, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.8332573175430298, "logits/rejected": -0.8167060017585754, "logps/chosen": -574.70068359375, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -971.9423828125, "loss": 1.067, "margin_dpo/margin_mean": 381.8934326171875, "margin_dpo/margin_std": 470.9022216796875, "step": 502 }, { "KL/chosen_KL_mean": -491.82965087890625, "KL/mean": -663.3250732421875, "KL/rejected_KL_mean": -834.8204345703125, "KL/std": 403.2861022949219, "epoch": 0.7386196769456681, "fcm_dpo/beta": 0.0011143197771161795, "fcm_dpo/delta": 0.018423786386847496, "fcm_dpo/margin": 342.99078369140625, "fcm_dpo/q_t": 0.41295260190963745, "grad_norm": 23.40604591369629, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.868687629699707, "logits/rejected": -0.8567318320274353, "logps/chosen": -546.9080810546875, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -917.3258666992188, "loss": 1.0968, "margin_dpo/margin_mean": 342.99078369140625, "margin_dpo/margin_std": 444.3095397949219, "step": 503 }, { "KL/chosen_KL_mean": -552.1359252929688, "KL/mean": -755.714599609375, "KL/rejected_KL_mean": -959.2933349609375, "KL/std": 474.62152099609375, "epoch": 0.7400881057268722, "fcm_dpo/beta": 0.001110826968215406, "fcm_dpo/delta": -0.0547223836183548, "fcm_dpo/margin": 407.1573486328125, "fcm_dpo/q_t": 0.39741408824920654, "grad_norm": 27.982126235961914, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.8701947927474976, "logits/rejected": -0.8900790214538574, "logps/chosen": -612.1016845703125, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76212310791016, "logps/rejected": -1063.055419921875, "loss": 1.0645, "margin_dpo/margin_mean": 407.1573486328125, "margin_dpo/margin_std": 519.5411987304688, "step": 504 }, { "KL/chosen_KL_mean": -625.5159301757812, "KL/mean": -729.96484375, "KL/rejected_KL_mean": -834.413818359375, "KL/std": 455.81085205078125, "epoch": 0.7415565345080763, "fcm_dpo/beta": 0.0011349001433700323, "fcm_dpo/delta": 0.1667182594537735, "fcm_dpo/margin": 208.89784240722656, "fcm_dpo/q_t": 0.4452478885650635, "grad_norm": 34.76630783081055, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.8760533332824707, "logits/rejected": -0.8471982479095459, "logps/chosen": -701.6708374023438, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -922.9991455078125, "loss": 1.2236, "margin_dpo/margin_mean": 208.8978271484375, "margin_dpo/margin_std": 435.1164245605469, "step": 505 }, { "KL/chosen_KL_mean": -502.8348083496094, "KL/mean": -683.107666015625, "KL/rejected_KL_mean": -863.3804931640625, "KL/std": 453.17449951171875, "epoch": 0.7430249632892805, "fcm_dpo/beta": 0.0011371751315891743, "fcm_dpo/delta": -0.01079019159078598, "fcm_dpo/margin": 360.545654296875, "fcm_dpo/q_t": 0.40623512864112854, "grad_norm": 38.504554748535156, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.9111833572387695, "logits/rejected": -0.9226495027542114, "logps/chosen": -551.7952880859375, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -941.7955322265625, "loss": 1.0915, "margin_dpo/margin_mean": 360.545654296875, "margin_dpo/margin_std": 491.61199951171875, "step": 506 }, { "KL/chosen_KL_mean": -591.5228271484375, "KL/mean": -751.4690551757812, "KL/rejected_KL_mean": -911.4154052734375, "KL/std": 523.9932250976562, "epoch": 0.7444933920704846, "fcm_dpo/beta": 0.0011462382972240448, "fcm_dpo/delta": 0.03460888937115669, "fcm_dpo/margin": 319.8925476074219, "fcm_dpo/q_t": 0.4203076958656311, "grad_norm": 36.931053161621094, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.9362499713897705, "logits/rejected": -0.9389501214027405, "logps/chosen": -658.3242797851562, "logps/ref_chosen": -66.80149841308594, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -1006.7882690429688, "loss": 1.1484, "margin_dpo/margin_mean": 319.8925476074219, "margin_dpo/margin_std": 573.2492065429688, "step": 507 }, { "KL/chosen_KL_mean": -627.1077880859375, "KL/mean": -764.3685302734375, "KL/rejected_KL_mean": -901.6292724609375, "KL/std": 510.8812255859375, "epoch": 0.7459618208516887, "fcm_dpo/beta": 0.001165491994470358, "fcm_dpo/delta": 0.08244814723730087, "fcm_dpo/margin": 274.52154541015625, "fcm_dpo/q_t": 0.4301344156265259, "grad_norm": 38.228172302246094, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.8927318453788757, "logits/rejected": -0.890540599822998, "logps/chosen": -698.4112548828125, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -997.2568359375, "loss": 1.2033, "margin_dpo/margin_mean": 274.5215759277344, "margin_dpo/margin_std": 596.1304931640625, "step": 508 }, { "KL/chosen_KL_mean": -482.7675476074219, "KL/mean": -632.9190673828125, "KL/rejected_KL_mean": -783.0706176757812, "KL/std": 380.39312744140625, "epoch": 0.7474302496328928, "fcm_dpo/beta": 0.0011804470559582114, "fcm_dpo/delta": 0.04675152152776718, "fcm_dpo/margin": 300.30303955078125, "fcm_dpo/q_t": 0.41989073157310486, "grad_norm": 29.72893524169922, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.8038022518157959, "logits/rejected": -0.79693204164505, "logps/chosen": -546.5865478515625, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -866.3270874023438, "loss": 1.1333, "margin_dpo/margin_mean": 300.30303955078125, "margin_dpo/margin_std": 463.26605224609375, "step": 509 }, { "KL/chosen_KL_mean": -588.75, "KL/mean": -778.4863891601562, "KL/rejected_KL_mean": -968.22265625, "KL/std": 429.193115234375, "epoch": 0.748898678414097, "fcm_dpo/beta": 0.0011725020594894886, "fcm_dpo/delta": -0.046999622136354446, "fcm_dpo/margin": 379.4727478027344, "fcm_dpo/q_t": 0.3961718678474426, "grad_norm": 29.25191307067871, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.8102399110794067, "logits/rejected": -0.8293131589889526, "logps/chosen": -640.6284790039062, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -1070.98779296875, "loss": 1.0473, "margin_dpo/margin_mean": 379.47271728515625, "margin_dpo/margin_std": 420.7159423828125, "step": 510 }, { "KL/chosen_KL_mean": -559.4649658203125, "KL/mean": -713.5518798828125, "KL/rejected_KL_mean": -867.6387939453125, "KL/std": 470.72216796875, "epoch": 0.750367107195301, "fcm_dpo/beta": 0.001175806624814868, "fcm_dpo/delta": 0.039014674723148346, "fcm_dpo/margin": 308.1739196777344, "fcm_dpo/q_t": 0.4179048538208008, "grad_norm": 39.78738021850586, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.8937386274337769, "logits/rejected": -0.8976330161094666, "logps/chosen": -619.7030029296875, "logps/ref_chosen": -60.23811721801758, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -960.49560546875, "loss": 1.1575, "margin_dpo/margin_mean": 308.17388916015625, "margin_dpo/margin_std": 568.642333984375, "step": 511 }, { "KL/chosen_KL_mean": -453.3255615234375, "KL/mean": -663.5873413085938, "KL/rejected_KL_mean": -873.84912109375, "KL/std": 430.00079345703125, "epoch": 0.7518355359765051, "fcm_dpo/beta": 0.0011590380454435945, "fcm_dpo/delta": -0.09211389720439911, "fcm_dpo/margin": 420.5235595703125, "fcm_dpo/q_t": 0.38680607080459595, "grad_norm": 51.16664505004883, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.8603556752204895, "logits/rejected": -0.8814679384231567, "logps/chosen": -508.2310791015625, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -955.7249755859375, "loss": 1.0153, "margin_dpo/margin_mean": 420.5235290527344, "margin_dpo/margin_std": 434.5292053222656, "step": 512 }, { "KL/chosen_KL_mean": -557.367431640625, "KL/mean": -709.244873046875, "KL/rejected_KL_mean": -861.1224365234375, "KL/std": 405.1826171875, "epoch": 0.7533039647577092, "fcm_dpo/beta": 0.0011576918186619878, "fcm_dpo/delta": 0.04973098263144493, "fcm_dpo/margin": 303.75506591796875, "fcm_dpo/q_t": 0.42015981674194336, "grad_norm": 40.44023513793945, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.914627194404602, "logits/rejected": -0.8992458581924438, "logps/chosen": -622.2838134765625, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -937.1848754882812, "loss": 1.1394, "margin_dpo/margin_mean": 303.75506591796875, "margin_dpo/margin_std": 474.69561767578125, "step": 513 }, { "KL/chosen_KL_mean": -580.2469482421875, "KL/mean": -727.4422607421875, "KL/rejected_KL_mean": -874.6375732421875, "KL/std": 435.10223388671875, "epoch": 0.7547723935389133, "fcm_dpo/beta": 0.0011787754483520985, "fcm_dpo/delta": 0.05473232641816139, "fcm_dpo/margin": 294.3906555175781, "fcm_dpo/q_t": 0.42276865243911743, "grad_norm": 35.93750762939453, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.8526400327682495, "logits/rejected": -0.8441455364227295, "logps/chosen": -654.4765625, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -953.5831298828125, "loss": 1.1468, "margin_dpo/margin_mean": 294.39068603515625, "margin_dpo/margin_std": 494.26495361328125, "step": 514 }, { "KL/chosen_KL_mean": -495.4428405761719, "KL/mean": -614.765625, "KL/rejected_KL_mean": -734.0885009765625, "KL/std": 379.7894287109375, "epoch": 0.7562408223201175, "fcm_dpo/beta": 0.0012007859768345952, "fcm_dpo/delta": 0.11678852140903473, "fcm_dpo/margin": 238.64556884765625, "fcm_dpo/q_t": 0.43298569321632385, "grad_norm": 42.832855224609375, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.8383795022964478, "logits/rejected": -0.8531197905540466, "logps/chosen": -545.8444213867188, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -821.1862182617188, "loss": 1.1849, "margin_dpo/margin_mean": 238.6455841064453, "margin_dpo/margin_std": 437.5038146972656, "step": 515 }, { "KL/chosen_KL_mean": -530.1577758789062, "KL/mean": -699.3049926757812, "KL/rejected_KL_mean": -868.4521484375, "KL/std": 425.07623291015625, "epoch": 0.7577092511013216, "fcm_dpo/beta": 0.0012106327340006828, "fcm_dpo/delta": -0.01020483672618866, "fcm_dpo/margin": 338.2943420410156, "fcm_dpo/q_t": 0.4044458270072937, "grad_norm": 54.36648941040039, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.8974713087081909, "logits/rejected": -0.9246504902839661, "logps/chosen": -594.807373046875, "logps/ref_chosen": -64.64956665039062, "logps/ref_rejected": -111.72237396240234, "logps/rejected": -980.1744995117188, "loss": 1.0818, "margin_dpo/margin_mean": 338.2943420410156, "margin_dpo/margin_std": 424.7276611328125, "step": 516 }, { "KL/chosen_KL_mean": -559.2984008789062, "KL/mean": -758.3714599609375, "KL/rejected_KL_mean": -957.4444580078125, "KL/std": 472.23651123046875, "epoch": 0.7591776798825257, "fcm_dpo/beta": 0.0011898339726030827, "fcm_dpo/delta": -0.07756029814481735, "fcm_dpo/margin": 398.1460266113281, "fcm_dpo/q_t": 0.3944876194000244, "grad_norm": 29.054262161254883, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.8497953414916992, "logits/rejected": -0.8503054976463318, "logps/chosen": -620.2119750976562, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -1046.527587890625, "loss": 1.0518, "margin_dpo/margin_mean": 398.14605712890625, "margin_dpo/margin_std": 519.8525390625, "step": 517 }, { "KL/chosen_KL_mean": -528.1893310546875, "KL/mean": -698.426025390625, "KL/rejected_KL_mean": -868.6627197265625, "KL/std": 393.81671142578125, "epoch": 0.7606461086637298, "fcm_dpo/beta": 0.0011824161047115922, "fcm_dpo/delta": -0.002931937575340271, "fcm_dpo/margin": 340.473388671875, "fcm_dpo/q_t": 0.40656790137290955, "grad_norm": 54.556785583496094, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.8558133840560913, "logits/rejected": -0.874567985534668, "logps/chosen": -585.6452026367188, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -953.9754638671875, "loss": 1.0875, "margin_dpo/margin_mean": 340.473388671875, "margin_dpo/margin_std": 436.8955078125, "step": 518 }, { "KL/chosen_KL_mean": -525.33154296875, "KL/mean": -649.0626220703125, "KL/rejected_KL_mean": -772.793701171875, "KL/std": 368.4232177734375, "epoch": 0.762114537444934, "fcm_dpo/beta": 0.001208610599860549, "fcm_dpo/delta": 0.10347578674554825, "fcm_dpo/margin": 247.46214294433594, "fcm_dpo/q_t": 0.4309791624546051, "grad_norm": 33.39023971557617, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.837517261505127, "logits/rejected": -0.8200976848602295, "logps/chosen": -599.3948974609375, "logps/ref_chosen": -74.06331634521484, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -877.2378540039062, "loss": 1.183, "margin_dpo/margin_mean": 247.46214294433594, "margin_dpo/margin_std": 456.01971435546875, "step": 519 }, { "KL/chosen_KL_mean": -569.647705078125, "KL/mean": -727.2234497070312, "KL/rejected_KL_mean": -884.7991943359375, "KL/std": 423.0360412597656, "epoch": 0.7635829662261381, "fcm_dpo/beta": 0.0012114193523302674, "fcm_dpo/delta": 0.018862294033169746, "fcm_dpo/margin": 315.1514892578125, "fcm_dpo/q_t": 0.4147945046424866, "grad_norm": 34.308570861816406, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.81200110912323, "logits/rejected": -0.8175575733184814, "logps/chosen": -639.947509765625, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -984.780517578125, "loss": 1.1189, "margin_dpo/margin_mean": 315.1515197753906, "margin_dpo/margin_std": 470.97100830078125, "step": 520 }, { "KL/chosen_KL_mean": -510.13311767578125, "KL/mean": -689.7613525390625, "KL/rejected_KL_mean": -869.3895263671875, "KL/std": 446.6356201171875, "epoch": 0.7650513950073421, "fcm_dpo/beta": 0.0012165037915110588, "fcm_dpo/delta": -0.03935041278600693, "fcm_dpo/margin": 359.2563781738281, "fcm_dpo/q_t": 0.40068429708480835, "grad_norm": 40.11237335205078, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.8692072629928589, "logits/rejected": -0.8731534481048584, "logps/chosen": -568.2760620117188, "logps/ref_chosen": -58.14292526245117, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -952.670166015625, "loss": 1.0694, "margin_dpo/margin_mean": 359.2563781738281, "margin_dpo/margin_std": 451.70306396484375, "step": 521 }, { "KL/chosen_KL_mean": -590.7149658203125, "KL/mean": -751.5833740234375, "KL/rejected_KL_mean": -912.4517822265625, "KL/std": 471.05303955078125, "epoch": 0.7665198237885462, "fcm_dpo/beta": 0.001208572182804346, "fcm_dpo/delta": 0.011602986603975296, "fcm_dpo/margin": 321.7369079589844, "fcm_dpo/q_t": 0.4123893082141876, "grad_norm": 32.06018829345703, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.8791370987892151, "logits/rejected": -0.8823133707046509, "logps/chosen": -637.2626342773438, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -978.4656982421875, "loss": 1.1248, "margin_dpo/margin_mean": 321.7369079589844, "margin_dpo/margin_std": 514.0554809570312, "step": 522 }, { "KL/chosen_KL_mean": -634.4599609375, "KL/mean": -804.0902099609375, "KL/rejected_KL_mean": -973.7205810546875, "KL/std": 544.9263916015625, "epoch": 0.7679882525697503, "fcm_dpo/beta": 0.0012069594813510776, "fcm_dpo/delta": -0.0098798843100667, "fcm_dpo/margin": 339.26068115234375, "fcm_dpo/q_t": 0.4096330404281616, "grad_norm": 39.04078674316406, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.9305659532546997, "logits/rejected": -0.9318529367446899, "logps/chosen": -696.2294921875, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -1057.48193359375, "loss": 1.1257, "margin_dpo/margin_mean": 339.26068115234375, "margin_dpo/margin_std": 569.8573608398438, "step": 523 }, { "KL/chosen_KL_mean": -631.42236328125, "KL/mean": -814.2626953125, "KL/rejected_KL_mean": -997.10302734375, "KL/std": 514.68408203125, "epoch": 0.7694566813509545, "fcm_dpo/beta": 0.001192695926874876, "fcm_dpo/delta": -0.03869359940290451, "fcm_dpo/margin": 365.6807556152344, "fcm_dpo/q_t": 0.3989192843437195, "grad_norm": 36.887882232666016, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.9622774124145508, "logits/rejected": -0.9509581327438354, "logps/chosen": -709.494384765625, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -1078.405029296875, "loss": 1.0926, "margin_dpo/margin_mean": 365.68072509765625, "margin_dpo/margin_std": 523.1288452148438, "step": 524 }, { "KL/chosen_KL_mean": -622.1337890625, "KL/mean": -846.5458984375, "KL/rejected_KL_mean": -1070.9580078125, "KL/std": 526.7284545898438, "epoch": 0.7709251101321586, "fcm_dpo/beta": 0.0011688778176903725, "fcm_dpo/delta": -0.13245530426502228, "fcm_dpo/margin": 448.8243408203125, "fcm_dpo/q_t": 0.3837316036224365, "grad_norm": 25.891273498535156, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.918233335018158, "logits/rejected": -0.9528594017028809, "logps/chosen": -672.9616088867188, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05294036865234, "logps/rejected": -1171.010986328125, "loss": 1.0231, "margin_dpo/margin_mean": 448.82427978515625, "margin_dpo/margin_std": 557.4779663085938, "step": 525 }, { "KL/chosen_KL_mean": -663.134765625, "KL/mean": -881.9912109375, "KL/rejected_KL_mean": -1100.84765625, "KL/std": 521.672119140625, "epoch": 0.7723935389133627, "fcm_dpo/beta": 0.0011508764000609517, "fcm_dpo/delta": -0.10911859571933746, "fcm_dpo/margin": 437.71282958984375, "fcm_dpo/q_t": 0.38582324981689453, "grad_norm": 23.709228515625, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.8999603986740112, "logits/rejected": -0.8906654119491577, "logps/chosen": -726.302001953125, "logps/ref_chosen": -63.167236328125, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -1187.156982421875, "loss": 1.0506, "margin_dpo/margin_mean": 437.7127990722656, "margin_dpo/margin_std": 586.166259765625, "step": 526 }, { "KL/chosen_KL_mean": -687.536865234375, "KL/mean": -834.2434692382812, "KL/rejected_KL_mean": -980.9501342773438, "KL/std": 535.2979736328125, "epoch": 0.7738619676945668, "fcm_dpo/beta": 0.001149723306298256, "fcm_dpo/delta": 0.06489390134811401, "fcm_dpo/margin": 293.4132385253906, "fcm_dpo/q_t": 0.42192360758781433, "grad_norm": 43.69949722290039, "learning_rate": 7.504749238082414e-08, "logits/chosen": -1.1003575325012207, "logits/rejected": -1.0651922225952148, "logps/chosen": -758.66552734375, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -1059.292724609375, "loss": 1.1423, "margin_dpo/margin_mean": 293.4132385253906, "margin_dpo/margin_std": 454.53448486328125, "step": 527 }, { "KL/chosen_KL_mean": -689.6435546875, "KL/mean": -871.0379638671875, "KL/rejected_KL_mean": -1052.4324951171875, "KL/std": 510.6888427734375, "epoch": 0.775330396475771, "fcm_dpo/beta": 0.0011513070203363895, "fcm_dpo/delta": -0.018490692600607872, "fcm_dpo/margin": 362.7888488769531, "fcm_dpo/q_t": 0.4090343713760376, "grad_norm": 49.07489776611328, "learning_rate": 7.413308141366254e-08, "logits/chosen": -1.0010507106781006, "logits/rejected": -0.9815536141395569, "logps/chosen": -757.7330322265625, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -1146.342529296875, "loss": 1.1277, "margin_dpo/margin_mean": 362.7888488769531, "margin_dpo/margin_std": 622.3973388671875, "step": 528 }, { "KL/chosen_KL_mean": -799.9935302734375, "KL/mean": -926.8414306640625, "KL/rejected_KL_mean": -1053.689208984375, "KL/std": 460.82391357421875, "epoch": 0.7767988252569751, "fcm_dpo/beta": 0.0011541005223989487, "fcm_dpo/delta": 0.01186126284301281, "fcm_dpo/margin": 253.6956787109375, "fcm_dpo/q_t": 0.43285101652145386, "grad_norm": 45.102630615234375, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.0135385990142822, "logits/rejected": -1.021782398223877, "logps/chosen": -855.5684814453125, "logps/ref_chosen": -55.57495880126953, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -1142.8983154296875, "loss": 1.2213, "margin_dpo/margin_mean": 253.6956787109375, "margin_dpo/margin_std": 574.0363159179688, "step": 529 }, { "KL/chosen_KL_mean": -676.3377685546875, "KL/mean": -873.301513671875, "KL/rejected_KL_mean": -1070.2652587890625, "KL/std": 554.6763916015625, "epoch": 0.7782672540381792, "fcm_dpo/beta": 0.0011464983690530062, "fcm_dpo/delta": -0.0540442019701004, "fcm_dpo/margin": 393.927490234375, "fcm_dpo/q_t": 0.4019849896430969, "grad_norm": 44.16566467285156, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.9260751008987427, "logits/rejected": -0.919657289981842, "logps/chosen": -723.939208984375, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -1157.5498046875, "loss": 1.1222, "margin_dpo/margin_mean": 393.927490234375, "margin_dpo/margin_std": 693.3948974609375, "step": 530 }, { "KL/chosen_KL_mean": -755.704833984375, "KL/mean": -917.4498901367188, "KL/rejected_KL_mean": -1079.19482421875, "KL/std": 589.1134033203125, "epoch": 0.7797356828193832, "fcm_dpo/beta": 0.0011490847682580352, "fcm_dpo/delta": 0.0292234905064106, "fcm_dpo/margin": 323.489990234375, "fcm_dpo/q_t": 0.417187362909317, "grad_norm": 41.70063781738281, "learning_rate": 7.141774982445147e-08, "logits/chosen": -1.028259038925171, "logits/rejected": -1.0060193538665771, "logps/chosen": -810.950927734375, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -1149.80078125, "loss": 1.1423, "margin_dpo/margin_mean": 323.4900207519531, "margin_dpo/margin_std": 554.1434326171875, "step": 531 }, { "KL/chosen_KL_mean": -746.92626953125, "KL/mean": -928.888671875, "KL/rejected_KL_mean": -1110.85107421875, "KL/std": 552.4840087890625, "epoch": 0.7812041116005873, "fcm_dpo/beta": 0.0011367748957127333, "fcm_dpo/delta": -0.01567455381155014, "fcm_dpo/margin": 363.9248046875, "fcm_dpo/q_t": 0.40855199098587036, "grad_norm": 68.5473861694336, "learning_rate": 7.052201923388953e-08, "logits/chosen": -0.9877306818962097, "logits/rejected": -0.9620273113250732, "logps/chosen": -817.2122802734375, "logps/ref_chosen": -70.28601837158203, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -1197.4423828125, "loss": 1.1483, "margin_dpo/margin_mean": 363.9248046875, "margin_dpo/margin_std": 657.9794311523438, "step": 532 }, { "KL/chosen_KL_mean": -678.8699951171875, "KL/mean": -809.2179565429688, "KL/rejected_KL_mean": -939.5658569335938, "KL/std": 484.06103515625, "epoch": 0.7826725403817915, "fcm_dpo/beta": 0.0011436111526563764, "fcm_dpo/delta": -0.010318025015294552, "fcm_dpo/margin": 260.69586181640625, "fcm_dpo/q_t": 0.43308863043785095, "grad_norm": 41.75889205932617, "learning_rate": 6.963101805503646e-08, "logits/chosen": -1.0021346807479858, "logits/rejected": -0.9763340950012207, "logps/chosen": -743.72509765625, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -1016.1539306640625, "loss": 1.2151, "margin_dpo/margin_mean": 260.69586181640625, "margin_dpo/margin_std": 586.0986328125, "step": 533 }, { "KL/chosen_KL_mean": -687.0634765625, "KL/mean": -865.5532836914062, "KL/rejected_KL_mean": -1044.04296875, "KL/std": 514.9797973632812, "epoch": 0.7841409691629956, "fcm_dpo/beta": 0.0011327785905450583, "fcm_dpo/delta": -0.005801960825920105, "fcm_dpo/margin": 356.9794616699219, "fcm_dpo/q_t": 0.40905874967575073, "grad_norm": 35.797950744628906, "learning_rate": 6.874476976660184e-08, "logits/chosen": -0.9988099336624146, "logits/rejected": -0.9967177510261536, "logps/chosen": -747.182861328125, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -1122.58642578125, "loss": 1.1125, "margin_dpo/margin_mean": 356.9794921875, "margin_dpo/margin_std": 537.181396484375, "step": 534 }, { "KL/chosen_KL_mean": -576.67333984375, "KL/mean": -783.545654296875, "KL/rejected_KL_mean": -990.41796875, "KL/std": 499.79852294921875, "epoch": 0.7856093979441997, "fcm_dpo/beta": 0.001136034494265914, "fcm_dpo/delta": -0.07386443018913269, "fcm_dpo/margin": 413.7446594238281, "fcm_dpo/q_t": 0.39456337690353394, "grad_norm": 30.948278427124023, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.9196850061416626, "logits/rejected": -0.921947717666626, "logps/chosen": -631.0035400390625, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -1086.7255859375, "loss": 1.0557, "margin_dpo/margin_mean": 413.7446594238281, "margin_dpo/margin_std": 526.1167602539062, "step": 535 }, { "KL/chosen_KL_mean": -520.1702880859375, "KL/mean": -766.1087646484375, "KL/rejected_KL_mean": -1012.0472412109375, "KL/std": 554.3941650390625, "epoch": 0.7870778267254038, "fcm_dpo/beta": 0.0011004150146618485, "fcm_dpo/delta": -0.14926910400390625, "fcm_dpo/margin": 491.8769226074219, "fcm_dpo/q_t": 0.383211225271225, "grad_norm": 29.700851440429688, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.8961449265480042, "logits/rejected": -0.9250037670135498, "logps/chosen": -567.2508544921875, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -1101.14501953125, "loss": 1.0215, "margin_dpo/margin_mean": 491.8769226074219, "margin_dpo/margin_std": 654.18408203125, "step": 536 }, { "KL/chosen_KL_mean": -537.0285034179688, "KL/mean": -701.8114013671875, "KL/rejected_KL_mean": -866.59423828125, "KL/std": 445.88031005859375, "epoch": 0.788546255506608, "fcm_dpo/beta": 0.0011028747539967299, "fcm_dpo/delta": 0.037213459610939026, "fcm_dpo/margin": 329.5657958984375, "fcm_dpo/q_t": 0.41649651527404785, "grad_norm": 44.981773376464844, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.9382889270782471, "logits/rejected": -0.9202646017074585, "logps/chosen": -594.7760009765625, "logps/ref_chosen": -57.747467041015625, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -937.0326538085938, "loss": 1.1431, "margin_dpo/margin_mean": 329.5657958984375, "margin_dpo/margin_std": 545.5430297851562, "step": 537 }, { "KL/chosen_KL_mean": -655.9216918945312, "KL/mean": -841.97509765625, "KL/rejected_KL_mean": -1028.028564453125, "KL/std": 484.8907165527344, "epoch": 0.7900146842878121, "fcm_dpo/beta": 0.00109610625077039, "fcm_dpo/delta": -0.008285703137516975, "fcm_dpo/margin": 372.10687255859375, "fcm_dpo/q_t": 0.40666401386260986, "grad_norm": 28.537601470947266, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.9427316188812256, "logits/rejected": -0.9291995763778687, "logps/chosen": -722.337646484375, "logps/ref_chosen": -66.41594696044922, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -1112.256591796875, "loss": 1.0874, "margin_dpo/margin_mean": 372.1068420410156, "margin_dpo/margin_std": 494.2760314941406, "step": 538 }, { "KL/chosen_KL_mean": -552.2450561523438, "KL/mean": -729.5706176757812, "KL/rejected_KL_mean": -906.8961791992188, "KL/std": 395.36083984375, "epoch": 0.7914831130690162, "fcm_dpo/beta": 0.0011011988390237093, "fcm_dpo/delta": 0.009760351851582527, "fcm_dpo/margin": 354.651123046875, "fcm_dpo/q_t": 0.40968143939971924, "grad_norm": 34.4008903503418, "learning_rate": 6.438563463416221e-08, "logits/chosen": -0.9456039667129517, "logits/rejected": -0.9376469254493713, "logps/chosen": -610.7379150390625, "logps/ref_chosen": -58.492855072021484, "logps/ref_rejected": -91.85395050048828, "logps/rejected": -998.7501220703125, "loss": 1.0878, "margin_dpo/margin_mean": 354.651123046875, "margin_dpo/margin_std": 434.3199462890625, "step": 539 }, { "KL/chosen_KL_mean": -543.5631103515625, "KL/mean": -769.376953125, "KL/rejected_KL_mean": -995.1907958984375, "KL/std": 495.0863342285156, "epoch": 0.7929515418502202, "fcm_dpo/beta": 0.0010876839514821768, "fcm_dpo/delta": -0.09584314376115799, "fcm_dpo/margin": 451.6276550292969, "fcm_dpo/q_t": 0.3912101984024048, "grad_norm": 39.40283966064453, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.8707677721977234, "logits/rejected": -0.8923947811126709, "logps/chosen": -607.045654296875, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.42999267578125, "logps/rejected": -1111.620849609375, "loss": 1.0481, "margin_dpo/margin_mean": 451.6276550292969, "margin_dpo/margin_std": 583.0654907226562, "step": 540 }, { "KL/chosen_KL_mean": -658.597412109375, "KL/mean": -783.5596923828125, "KL/rejected_KL_mean": -908.5219116210938, "KL/std": 451.7571716308594, "epoch": 0.7944199706314243, "fcm_dpo/beta": 0.001078011584468186, "fcm_dpo/delta": -0.004111842717975378, "fcm_dpo/margin": 249.92453002929688, "fcm_dpo/q_t": 0.43801432847976685, "grad_norm": 53.83041763305664, "learning_rate": 6.267605843546767e-08, "logits/chosen": -0.9981366395950317, "logits/rejected": -0.9937785863876343, "logps/chosen": -736.8777465820312, "logps/ref_chosen": -78.28036499023438, "logps/ref_rejected": -103.273681640625, "logps/rejected": -1011.7955932617188, "loss": 1.2177, "margin_dpo/margin_mean": 249.92453002929688, "margin_dpo/margin_std": 536.0916748046875, "step": 541 }, { "KL/chosen_KL_mean": -562.605712890625, "KL/mean": -788.047607421875, "KL/rejected_KL_mean": -1013.4893798828125, "KL/std": 504.0601806640625, "epoch": 0.7958883994126285, "fcm_dpo/beta": 0.0010561456438153982, "fcm_dpo/delta": -0.08209630846977234, "fcm_dpo/margin": 450.8837585449219, "fcm_dpo/q_t": 0.3928494155406952, "grad_norm": 52.30241012573242, "learning_rate": 6.182866334636888e-08, "logits/chosen": -0.9771475791931152, "logits/rejected": -1.0094921588897705, "logps/chosen": -620.0906982421875, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -1109.9644775390625, "loss": 1.0592, "margin_dpo/margin_mean": 450.8837585449219, "margin_dpo/margin_std": 593.1970825195312, "step": 542 }, { "KL/chosen_KL_mean": -628.9979858398438, "KL/mean": -786.9501953125, "KL/rejected_KL_mean": -944.90234375, "KL/std": 587.5977783203125, "epoch": 0.7973568281938326, "fcm_dpo/beta": 0.0010682092979550362, "fcm_dpo/delta": 0.06477095186710358, "fcm_dpo/margin": 315.904296875, "fcm_dpo/q_t": 0.4326469302177429, "grad_norm": 30.09369659423828, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.9427838325500488, "logits/rejected": -0.9720630645751953, "logps/chosen": -689.615478515625, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -1050.501220703125, "loss": 1.1962, "margin_dpo/margin_mean": 315.9043273925781, "margin_dpo/margin_std": 699.5291748046875, "step": 543 }, { "KL/chosen_KL_mean": -639.6248779296875, "KL/mean": -832.869873046875, "KL/rejected_KL_mean": -1026.1148681640625, "KL/std": 483.07550048828125, "epoch": 0.7988252569750367, "fcm_dpo/beta": 0.0010710186325013638, "fcm_dpo/delta": -0.014568203128874302, "fcm_dpo/margin": 386.49005126953125, "fcm_dpo/q_t": 0.40565305948257446, "grad_norm": 32.88768005371094, "learning_rate": 6.01487708363232e-08, "logits/chosen": -0.9224880933761597, "logits/rejected": -0.9422965049743652, "logps/chosen": -699.2671508789062, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -1127.069580078125, "loss": 1.094, "margin_dpo/margin_mean": 386.49005126953125, "margin_dpo/margin_std": 549.4237060546875, "step": 544 }, { "KL/chosen_KL_mean": -592.945556640625, "KL/mean": -817.5401611328125, "KL/rejected_KL_mean": -1042.134765625, "KL/std": 495.79449462890625, "epoch": 0.8002936857562408, "fcm_dpo/beta": 0.0010588113218545914, "fcm_dpo/delta": -0.07936666160821915, "fcm_dpo/margin": 449.18927001953125, "fcm_dpo/q_t": 0.39326316118240356, "grad_norm": 32.99470520019531, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.8537076711654663, "logits/rejected": -0.8849306106567383, "logps/chosen": -660.5941162109375, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -1138.042724609375, "loss": 1.0488, "margin_dpo/margin_mean": 449.18927001953125, "margin_dpo/margin_std": 571.0772705078125, "step": 545 }, { "KL/chosen_KL_mean": -566.8978271484375, "KL/mean": -716.198486328125, "KL/rejected_KL_mean": -865.499267578125, "KL/std": 434.204833984375, "epoch": 0.801762114537445, "fcm_dpo/beta": 0.0010674262885004282, "fcm_dpo/delta": 0.08395257592201233, "fcm_dpo/margin": 298.6014709472656, "fcm_dpo/q_t": 0.42482131719589233, "grad_norm": 32.57497024536133, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.9205929040908813, "logits/rejected": -0.910815954208374, "logps/chosen": -617.6420288085938, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -947.365478515625, "loss": 1.1519, "margin_dpo/margin_mean": 298.6014709472656, "margin_dpo/margin_std": 461.811767578125, "step": 546 }, { "KL/chosen_KL_mean": -573.2467041015625, "KL/mean": -762.8134765625, "KL/rejected_KL_mean": -952.38037109375, "KL/std": 485.2721862792969, "epoch": 0.8032305433186491, "fcm_dpo/beta": 0.0010726114269345999, "fcm_dpo/delta": -0.006979792378842831, "fcm_dpo/margin": 379.1336669921875, "fcm_dpo/q_t": 0.40798118710517883, "grad_norm": 50.77888870239258, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.8564267158508301, "logits/rejected": -0.8684166669845581, "logps/chosen": -646.9344482421875, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -1043.1417236328125, "loss": 1.0961, "margin_dpo/margin_mean": 379.1336669921875, "margin_dpo/margin_std": 533.245361328125, "step": 547 }, { "KL/chosen_KL_mean": -600.245361328125, "KL/mean": -777.9320068359375, "KL/rejected_KL_mean": -955.61865234375, "KL/std": 507.2066650390625, "epoch": 0.8046989720998532, "fcm_dpo/beta": 0.0010729740606620908, "fcm_dpo/delta": 0.019438141956925392, "fcm_dpo/margin": 355.3732604980469, "fcm_dpo/q_t": 0.4163675606250763, "grad_norm": 31.580347061157227, "learning_rate": 5.684919345471029e-08, "logits/chosen": -0.9392424821853638, "logits/rejected": -0.9410542845726013, "logps/chosen": -665.49169921875, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -1049.7366943359375, "loss": 1.1184, "margin_dpo/margin_mean": 355.373291015625, "margin_dpo/margin_std": 558.9320068359375, "step": 548 }, { "KL/chosen_KL_mean": -618.5299072265625, "KL/mean": -756.0477294921875, "KL/rejected_KL_mean": -893.565673828125, "KL/std": 416.1364440917969, "epoch": 0.8061674008810573, "fcm_dpo/beta": 0.0010954017052426934, "fcm_dpo/delta": 0.10109251737594604, "fcm_dpo/margin": 275.03582763671875, "fcm_dpo/q_t": 0.43178755044937134, "grad_norm": 48.66642379760742, "learning_rate": 5.603696935852426e-08, "logits/chosen": -0.9122521877288818, "logits/rejected": -0.9025084376335144, "logps/chosen": -667.7421875, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -967.4760131835938, "loss": 1.1787, "margin_dpo/margin_mean": 275.03582763671875, "margin_dpo/margin_std": 496.511474609375, "step": 549 }, { "KL/chosen_KL_mean": -637.8577880859375, "KL/mean": -800.6353759765625, "KL/rejected_KL_mean": -963.4129638671875, "KL/std": 484.2886962890625, "epoch": 0.8076358296622613, "fcm_dpo/beta": 0.0011030520545318723, "fcm_dpo/delta": 0.042437393218278885, "fcm_dpo/margin": 325.55517578125, "fcm_dpo/q_t": 0.4183220863342285, "grad_norm": 37.804141998291016, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.8820310831069946, "logits/rejected": -0.9049103260040283, "logps/chosen": -694.664794921875, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -1058.538818359375, "loss": 1.132, "margin_dpo/margin_mean": 325.55517578125, "margin_dpo/margin_std": 513.7775268554688, "step": 550 }, { "KL/chosen_KL_mean": -547.8771362304688, "KL/mean": -811.1815185546875, "KL/rejected_KL_mean": -1074.48583984375, "KL/std": 508.75482177734375, "epoch": 0.8091042584434655, "fcm_dpo/beta": 0.0010755530092865229, "fcm_dpo/delta": -0.17676769196987152, "fcm_dpo/margin": 526.6087036132812, "fcm_dpo/q_t": 0.36954620480537415, "grad_norm": 56.2789306640625, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.8909007906913757, "logits/rejected": -0.9451035857200623, "logps/chosen": -606.9834594726562, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -1186.15869140625, "loss": 0.9641, "margin_dpo/margin_mean": 526.6087036132812, "margin_dpo/margin_std": 504.5882568359375, "step": 551 }, { "KL/chosen_KL_mean": -535.0650634765625, "KL/mean": -834.066162109375, "KL/rejected_KL_mean": -1133.067138671875, "KL/std": 607.322265625, "epoch": 0.8105726872246696, "fcm_dpo/beta": 0.0010228096507489681, "fcm_dpo/delta": -0.229129359126091, "fcm_dpo/margin": 598.0020751953125, "fcm_dpo/q_t": 0.3673725724220276, "grad_norm": 48.699928283691406, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.9140257835388184, "logits/rejected": -0.9522314071655273, "logps/chosen": -597.419677734375, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -1237.62939453125, "loss": 0.9746, "margin_dpo/margin_mean": 598.0020751953125, "margin_dpo/margin_std": 696.8597412109375, "step": 552 }, { "KL/chosen_KL_mean": -627.572509765625, "KL/mean": -790.2903442382812, "KL/rejected_KL_mean": -953.0081176757812, "KL/std": 503.21502685546875, "epoch": 0.8120411160058737, "fcm_dpo/beta": 0.0010280333226546645, "fcm_dpo/delta": 0.06775818020105362, "fcm_dpo/margin": 325.43560791015625, "fcm_dpo/q_t": 0.42561495304107666, "grad_norm": 26.667526245117188, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.8898186683654785, "logits/rejected": -0.8853092789649963, "logps/chosen": -695.8313598632812, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -1051.105224609375, "loss": 1.1505, "margin_dpo/margin_mean": 325.43560791015625, "margin_dpo/margin_std": 544.8580322265625, "step": 553 }, { "KL/chosen_KL_mean": -648.32177734375, "KL/mean": -857.53759765625, "KL/rejected_KL_mean": -1066.75341796875, "KL/std": 539.8403930664062, "epoch": 0.8135095447870778, "fcm_dpo/beta": 0.0010344828478991985, "fcm_dpo/delta": -0.034839678555727005, "fcm_dpo/margin": 418.43157958984375, "fcm_dpo/q_t": 0.40528228878974915, "grad_norm": 76.91921997070312, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.8666242957115173, "logits/rejected": -0.8389246463775635, "logps/chosen": -716.26953125, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -1156.5361328125, "loss": 1.1182, "margin_dpo/margin_mean": 418.43157958984375, "margin_dpo/margin_std": 690.6478881835938, "step": 554 }, { "KL/chosen_KL_mean": -650.5548095703125, "KL/mean": -899.6168212890625, "KL/rejected_KL_mean": -1148.6787109375, "KL/std": 573.9271850585938, "epoch": 0.8149779735682819, "fcm_dpo/beta": 0.0010111583396792412, "fcm_dpo/delta": -0.10909023135900497, "fcm_dpo/margin": 498.12396240234375, "fcm_dpo/q_t": 0.3918069899082184, "grad_norm": 40.76702880859375, "learning_rate": 5.127169765359515e-08, "logits/chosen": -0.9580224752426147, "logits/rejected": -1.0123507976531982, "logps/chosen": -703.8853149414062, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -1257.158203125, "loss": 1.0655, "margin_dpo/margin_mean": 498.12396240234375, "margin_dpo/margin_std": 737.4571533203125, "step": 555 }, { "KL/chosen_KL_mean": -646.2598266601562, "KL/mean": -798.2440185546875, "KL/rejected_KL_mean": -950.2281494140625, "KL/std": 454.10919189453125, "epoch": 0.8164464023494861, "fcm_dpo/beta": 0.0010182505939155817, "fcm_dpo/delta": 0.09340062737464905, "fcm_dpo/margin": 303.96832275390625, "fcm_dpo/q_t": 0.4286388158798218, "grad_norm": 35.782039642333984, "learning_rate": 5.049569317994012e-08, "logits/chosen": -0.9508916735649109, "logits/rejected": -0.9452144503593445, "logps/chosen": -704.904296875, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -1051.568603515625, "loss": 1.1524, "margin_dpo/margin_mean": 303.96832275390625, "margin_dpo/margin_std": 460.0691223144531, "step": 556 }, { "KL/chosen_KL_mean": -717.3458251953125, "KL/mean": -942.2198486328125, "KL/rejected_KL_mean": -1167.0938720703125, "KL/std": 636.468505859375, "epoch": 0.8179148311306902, "fcm_dpo/beta": 0.0010126400738954544, "fcm_dpo/delta": -0.05826106667518616, "fcm_dpo/margin": 449.748046875, "fcm_dpo/q_t": 0.4009940028190613, "grad_norm": 52.255699157714844, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -1.0088746547698975, "logits/rejected": -1.0433576107025146, "logps/chosen": -785.1865234375, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93965911865234, "logps/rejected": -1277.033447265625, "loss": 1.1038, "margin_dpo/margin_mean": 449.748046875, "margin_dpo/margin_std": 723.2161865234375, "step": 557 }, { "KL/chosen_KL_mean": -643.977783203125, "KL/mean": -949.288330078125, "KL/rejected_KL_mean": -1254.5987548828125, "KL/std": 584.4927978515625, "epoch": 0.8193832599118943, "fcm_dpo/beta": 0.0009801845299080014, "fcm_dpo/delta": -0.21119916439056396, "fcm_dpo/margin": 610.62109375, "fcm_dpo/q_t": 0.36214083433151245, "grad_norm": 30.629545211791992, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -1.027779221534729, "logits/rejected": -1.046311855316162, "logps/chosen": -706.3460083007812, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -1356.759765625, "loss": 0.9661, "margin_dpo/margin_mean": 610.62109375, "margin_dpo/margin_std": 642.599365234375, "step": 558 }, { "KL/chosen_KL_mean": -743.7402954101562, "KL/mean": -1001.60595703125, "KL/rejected_KL_mean": -1259.4716796875, "KL/std": 617.6702270507812, "epoch": 0.8208516886930984, "fcm_dpo/beta": 0.0009573526913300157, "fcm_dpo/delta": -0.09849410504102707, "fcm_dpo/margin": 515.7313842773438, "fcm_dpo/q_t": 0.3901920020580292, "grad_norm": 32.09720993041992, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -1.132476568222046, "logits/rejected": -1.138415813446045, "logps/chosen": -804.49267578125, "logps/ref_chosen": -60.752323150634766, "logps/ref_rejected": -93.44229125976562, "logps/rejected": -1352.9139404296875, "loss": 1.0467, "margin_dpo/margin_mean": 515.7313232421875, "margin_dpo/margin_std": 678.709228515625, "step": 559 }, { "KL/chosen_KL_mean": -679.7794189453125, "KL/mean": -859.0238647460938, "KL/rejected_KL_mean": -1038.268310546875, "KL/std": 535.8975830078125, "epoch": 0.8223201174743024, "fcm_dpo/beta": 0.0009546001674607396, "fcm_dpo/delta": 0.05963495746254921, "fcm_dpo/margin": 358.48895263671875, "fcm_dpo/q_t": 0.4228675365447998, "grad_norm": 37.20246505737305, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.8968836069107056, "logits/rejected": -0.8791143894195557, "logps/chosen": -737.8831787109375, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -1118.259521484375, "loss": 1.1513, "margin_dpo/margin_mean": 358.48895263671875, "margin_dpo/margin_std": 598.878173828125, "step": 560 }, { "KL/chosen_KL_mean": -780.15966796875, "KL/mean": -938.270263671875, "KL/rejected_KL_mean": -1096.380615234375, "KL/std": 530.6477661132812, "epoch": 0.8237885462555066, "fcm_dpo/beta": 0.0009781813714653254, "fcm_dpo/delta": 0.09319829940795898, "fcm_dpo/margin": 316.22100830078125, "fcm_dpo/q_t": 0.4291490614414215, "grad_norm": 47.342132568359375, "learning_rate": 4.669493178106432e-08, "logits/chosen": -1.0569636821746826, "logits/rejected": -1.0779967308044434, "logps/chosen": -831.0726318359375, "logps/ref_chosen": -50.912879943847656, "logps/ref_rejected": -99.06856536865234, "logps/rejected": -1195.44921875, "loss": 1.1995, "margin_dpo/margin_mean": 316.22100830078125, "margin_dpo/margin_std": 669.9414672851562, "step": 561 }, { "KL/chosen_KL_mean": -745.3583984375, "KL/mean": -957.2404174804688, "KL/rejected_KL_mean": -1169.122314453125, "KL/std": 593.1848754882812, "epoch": 0.8252569750367107, "fcm_dpo/beta": 0.0009731657337397337, "fcm_dpo/delta": -0.013568423688411713, "fcm_dpo/margin": 423.7640380859375, "fcm_dpo/q_t": 0.4083036184310913, "grad_norm": 35.87330627441406, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -0.9769254326820374, "logits/rejected": -0.9813790321350098, "logps/chosen": -804.82275390625, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -1265.6650390625, "loss": 1.1048, "margin_dpo/margin_mean": 423.7640380859375, "margin_dpo/margin_std": 645.1516723632812, "step": 562 }, { "KL/chosen_KL_mean": -829.703125, "KL/mean": -990.771240234375, "KL/rejected_KL_mean": -1151.83935546875, "KL/std": 633.045166015625, "epoch": 0.8267254038179148, "fcm_dpo/beta": 0.000972322653979063, "fcm_dpo/delta": -0.05465248227119446, "fcm_dpo/margin": 322.13623046875, "fcm_dpo/q_t": 0.42368167638778687, "grad_norm": 42.06772232055664, "learning_rate": 4.521198892775202e-08, "logits/chosen": -1.0264474153518677, "logits/rejected": -1.0321646928787231, "logps/chosen": -890.311279296875, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -1246.406982421875, "loss": 1.2296, "margin_dpo/margin_mean": 322.13623046875, "margin_dpo/margin_std": 744.3589477539062, "step": 563 }, { "KL/chosen_KL_mean": -747.0021362304688, "KL/mean": -952.5867919921875, "KL/rejected_KL_mean": -1158.1715087890625, "KL/std": 572.9649658203125, "epoch": 0.8281938325991189, "fcm_dpo/beta": 0.0009697899222373962, "fcm_dpo/delta": 0.0011525209993124008, "fcm_dpo/margin": 411.1693115234375, "fcm_dpo/q_t": 0.4103137254714966, "grad_norm": 35.352901458740234, "learning_rate": 4.447860229910544e-08, "logits/chosen": -1.1010963916778564, "logits/rejected": -1.0915511846542358, "logps/chosen": -821.2705078125, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.23818969726562, "logps/rejected": -1251.40966796875, "loss": 1.099, "margin_dpo/margin_mean": 411.1693115234375, "margin_dpo/margin_std": 568.3698120117188, "step": 564 }, { "KL/chosen_KL_mean": -776.9501953125, "KL/mean": -992.6700439453125, "KL/rejected_KL_mean": -1208.389892578125, "KL/std": 637.2417602539062, "epoch": 0.8296622613803231, "fcm_dpo/beta": 0.0009645746322348714, "fcm_dpo/delta": -0.01686248928308487, "fcm_dpo/margin": 431.43963623046875, "fcm_dpo/q_t": 0.4098883271217346, "grad_norm": 44.35929870605469, "learning_rate": 4.375063135042445e-08, "logits/chosen": -1.0143120288848877, "logits/rejected": -1.0142502784729004, "logps/chosen": -845.9700927734375, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -1294.1688232421875, "loss": 1.1322, "margin_dpo/margin_mean": 431.43963623046875, "margin_dpo/margin_std": 756.8804931640625, "step": 565 }, { "KL/chosen_KL_mean": -730.8787841796875, "KL/mean": -973.5142822265625, "KL/rejected_KL_mean": -1216.14990234375, "KL/std": 658.5827026367188, "epoch": 0.8311306901615272, "fcm_dpo/beta": 0.0009599901968613267, "fcm_dpo/delta": -0.06941938400268555, "fcm_dpo/margin": 485.2709655761719, "fcm_dpo/q_t": 0.39742326736450195, "grad_norm": 35.317893981933594, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -1.0451146364212036, "logits/rejected": -1.0700435638427734, "logps/chosen": -797.424072265625, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86932373046875, "logps/rejected": -1320.0191650390625, "loss": 1.1008, "margin_dpo/margin_mean": 485.27099609375, "margin_dpo/margin_std": 765.5435180664062, "step": 566 }, { "KL/chosen_KL_mean": -689.0420532226562, "KL/mean": -877.3385009765625, "KL/rejected_KL_mean": -1065.6348876953125, "KL/std": 457.2042236328125, "epoch": 0.8325991189427313, "fcm_dpo/beta": 0.0009558956371620297, "fcm_dpo/delta": 0.0415302999317646, "fcm_dpo/margin": 376.59283447265625, "fcm_dpo/q_t": 0.41558361053466797, "grad_norm": 29.64704132080078, "learning_rate": 4.231101308059165e-08, "logits/chosen": -1.1439913511276245, "logits/rejected": -1.1560258865356445, "logps/chosen": -741.9003295898438, "logps/ref_chosen": -52.85829544067383, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -1151.005859375, "loss": 1.1121, "margin_dpo/margin_mean": 376.59283447265625, "margin_dpo/margin_std": 499.68634033203125, "step": 567 }, { "KL/chosen_KL_mean": -682.358642578125, "KL/mean": -934.0941162109375, "KL/rejected_KL_mean": -1185.82958984375, "KL/std": 537.24072265625, "epoch": 0.8340675477239354, "fcm_dpo/beta": 0.0009455858962610364, "fcm_dpo/delta": -0.0800839364528656, "fcm_dpo/margin": 503.4710693359375, "fcm_dpo/q_t": 0.3895892798900604, "grad_norm": 32.13995361328125, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -0.9833190441131592, "logits/rejected": -1.0224902629852295, "logps/chosen": -727.551025390625, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236907958984, "logps/rejected": -1274.9219970703125, "loss": 1.0304, "margin_dpo/margin_mean": 503.47100830078125, "margin_dpo/margin_std": 561.0274658203125, "step": 568 }, { "KL/chosen_KL_mean": -783.7449951171875, "KL/mean": -988.6644287109375, "KL/rejected_KL_mean": -1193.583740234375, "KL/std": 691.3892211914062, "epoch": 0.8355359765051396, "fcm_dpo/beta": 0.0009511442622169852, "fcm_dpo/delta": 0.009604483842849731, "fcm_dpo/margin": 409.83868408203125, "fcm_dpo/q_t": 0.4123598337173462, "grad_norm": 56.671836853027344, "learning_rate": 4.089328585837512e-08, "logits/chosen": -1.0582460165023804, "logits/rejected": -1.064152479171753, "logps/chosen": -847.465576171875, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -1272.68701171875, "loss": 1.1468, "margin_dpo/margin_mean": 409.83868408203125, "margin_dpo/margin_std": 721.668701171875, "step": 569 }, { "KL/chosen_KL_mean": -723.0333862304688, "KL/mean": -910.5472412109375, "KL/rejected_KL_mean": -1098.06103515625, "KL/std": 545.8590087890625, "epoch": 0.8370044052863436, "fcm_dpo/beta": 0.0009502613684162498, "fcm_dpo/delta": 0.0452612042427063, "fcm_dpo/margin": 375.0276184082031, "fcm_dpo/q_t": 0.41972124576568604, "grad_norm": 27.773193359375, "learning_rate": 4.019267817841834e-08, "logits/chosen": -1.1307826042175293, "logits/rejected": -1.123297095298767, "logps/chosen": -784.64794921875, "logps/ref_chosen": -61.61454391479492, "logps/ref_rejected": -82.14186096191406, "logps/rejected": -1180.202880859375, "loss": 1.1368, "margin_dpo/margin_mean": 375.02764892578125, "margin_dpo/margin_std": 592.921875, "step": 570 }, { "KL/chosen_KL_mean": -717.0699462890625, "KL/mean": -943.7703857421875, "KL/rejected_KL_mean": -1170.4708251953125, "KL/std": 552.2366333007812, "epoch": 0.8384728340675477, "fcm_dpo/beta": 0.0009471910889260471, "fcm_dpo/delta": -0.030962642282247543, "fcm_dpo/margin": 453.40087890625, "fcm_dpo/q_t": 0.4038824439048767, "grad_norm": 41.67679977416992, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -1.0074293613433838, "logits/rejected": -1.0243608951568604, "logps/chosen": -770.1240234375, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -1261.8076171875, "loss": 1.0993, "margin_dpo/margin_mean": 453.40087890625, "margin_dpo/margin_std": 687.6110229492188, "step": 571 }, { "KL/chosen_KL_mean": -762.98486328125, "KL/mean": -999.034912109375, "KL/rejected_KL_mean": -1235.0849609375, "KL/std": 659.6268310546875, "epoch": 0.8399412628487518, "fcm_dpo/beta": 0.000938563549425453, "fcm_dpo/delta": -0.04543805494904518, "fcm_dpo/margin": 472.1002197265625, "fcm_dpo/q_t": 0.4047321081161499, "grad_norm": 35.46324157714844, "learning_rate": 3.880806698864086e-08, "logits/chosen": -1.0459448099136353, "logits/rejected": -1.078913688659668, "logps/chosen": -811.444091796875, "logps/ref_chosen": -48.45928955078125, "logps/ref_rejected": -83.55703735351562, "logps/rejected": -1318.64208984375, "loss": 1.1112, "margin_dpo/margin_mean": 472.1002197265625, "margin_dpo/margin_std": 790.4967651367188, "step": 572 }, { "KL/chosen_KL_mean": -757.2117919921875, "KL/mean": -961.093505859375, "KL/rejected_KL_mean": -1164.975341796875, "KL/std": 571.9259643554688, "epoch": 0.8414096916299559, "fcm_dpo/beta": 0.0009429033380001783, "fcm_dpo/delta": 0.016076089814305305, "fcm_dpo/margin": 407.7635498046875, "fcm_dpo/q_t": 0.4132142663002014, "grad_norm": 28.161340713500977, "learning_rate": 3.812409996461275e-08, "logits/chosen": -1.086681604385376, "logits/rejected": -1.0990102291107178, "logps/chosen": -808.8343505859375, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -1250.30029296875, "loss": 1.1033, "margin_dpo/margin_mean": 407.7635498046875, "margin_dpo/margin_std": 570.0986328125, "step": 573 }, { "KL/chosen_KL_mean": -668.5501708984375, "KL/mean": -886.3728637695312, "KL/rejected_KL_mean": -1104.195556640625, "KL/std": 513.22802734375, "epoch": 0.8428781204111601, "fcm_dpo/beta": 0.0009430091013200581, "fcm_dpo/delta": -0.011370273306965828, "fcm_dpo/margin": 435.6454772949219, "fcm_dpo/q_t": 0.40598154067993164, "grad_norm": 33.936519622802734, "learning_rate": 3.74457160675965e-08, "logits/chosen": -1.0922186374664307, "logits/rejected": -1.1221637725830078, "logps/chosen": -719.5946044921875, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -1197.001953125, "loss": 1.0903, "margin_dpo/margin_mean": 435.6454772949219, "margin_dpo/margin_std": 592.3309936523438, "step": 574 }, { "KL/chosen_KL_mean": -727.7802734375, "KL/mean": -933.4301147460938, "KL/rejected_KL_mean": -1139.079833984375, "KL/std": 528.300537109375, "epoch": 0.8443465491923642, "fcm_dpo/beta": 0.0009349288884550333, "fcm_dpo/delta": 0.014928296208381653, "fcm_dpo/margin": 411.2996520996094, "fcm_dpo/q_t": 0.4125257134437561, "grad_norm": 35.51095199584961, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.9489999413490295, "logits/rejected": -0.9602969288825989, "logps/chosen": -799.5704345703125, "logps/ref_chosen": -71.7901382446289, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -1234.466064453125, "loss": 1.1378, "margin_dpo/margin_mean": 411.2996520996094, "margin_dpo/margin_std": 677.067138671875, "step": 575 }, { "KL/chosen_KL_mean": -690.3848876953125, "KL/mean": -842.4647216796875, "KL/rejected_KL_mean": -994.5445556640625, "KL/std": 470.174560546875, "epoch": 0.8458149779735683, "fcm_dpo/beta": 0.0009562689810991287, "fcm_dpo/delta": 0.11256685107946396, "fcm_dpo/margin": 304.1596374511719, "fcm_dpo/q_t": 0.43350422382354736, "grad_norm": 36.57832717895508, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -1.0214297771453857, "logits/rejected": -1.048740029335022, "logps/chosen": -744.6478881835938, "logps/ref_chosen": -54.262962341308594, "logps/ref_rejected": -100.75428009033203, "logps/rejected": -1095.298828125, "loss": 1.1852, "margin_dpo/margin_mean": 304.15960693359375, "margin_dpo/margin_std": 556.28271484375, "step": 576 }, { "KL/chosen_KL_mean": -627.6497802734375, "KL/mean": -830.4925537109375, "KL/rejected_KL_mean": -1033.33544921875, "KL/std": 542.0623779296875, "epoch": 0.8472834067547724, "fcm_dpo/beta": 0.000964190810918808, "fcm_dpo/delta": 0.00915931724011898, "fcm_dpo/margin": 405.6855773925781, "fcm_dpo/q_t": 0.4120888113975525, "grad_norm": 29.529443740844727, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -1.0218915939331055, "logits/rejected": -1.0101161003112793, "logps/chosen": -689.5594482421875, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -1117.406005859375, "loss": 1.1187, "margin_dpo/margin_mean": 405.68560791015625, "margin_dpo/margin_std": 598.992919921875, "step": 577 }, { "KL/chosen_KL_mean": -595.876220703125, "KL/mean": -812.5816650390625, "KL/rejected_KL_mean": -1029.2872314453125, "KL/std": 519.2777099609375, "epoch": 0.8487518355359766, "fcm_dpo/beta": 0.0009588984539732337, "fcm_dpo/delta": -0.01690073311328888, "fcm_dpo/margin": 433.4109802246094, "fcm_dpo/q_t": 0.40431180596351624, "grad_norm": 50.43282699584961, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.9214882850646973, "logits/rejected": -0.9523541331291199, "logps/chosen": -645.139892578125, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.4362564086914, "logps/rejected": -1112.723388671875, "loss": 1.0771, "margin_dpo/margin_mean": 433.41094970703125, "margin_dpo/margin_std": 535.86767578125, "step": 578 }, { "KL/chosen_KL_mean": -723.193603515625, "KL/mean": -859.6403198242188, "KL/rejected_KL_mean": -996.0870361328125, "KL/std": 547.017578125, "epoch": 0.8502202643171806, "fcm_dpo/beta": 0.0009806466987356544, "fcm_dpo/delta": 0.13623100519180298, "fcm_dpo/margin": 272.8934326171875, "fcm_dpo/q_t": 0.43853724002838135, "grad_norm": 68.8424301147461, "learning_rate": 3.41381639738331e-08, "logits/chosen": -0.9866000413894653, "logits/rejected": -0.9893920421600342, "logps/chosen": -782.0794677734375, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -1090.8746337890625, "loss": 1.2174, "margin_dpo/margin_mean": 272.8934326171875, "margin_dpo/margin_std": 593.593017578125, "step": 579 }, { "KL/chosen_KL_mean": -507.4736633300781, "KL/mean": -756.1246948242188, "KL/rejected_KL_mean": -1004.7757568359375, "KL/std": 589.247802734375, "epoch": 0.8516886930983847, "fcm_dpo/beta": 0.0009732701582834125, "fcm_dpo/delta": -0.0888274759054184, "fcm_dpo/margin": 497.30218505859375, "fcm_dpo/q_t": 0.3939579725265503, "grad_norm": 39.8839111328125, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.8748548030853271, "logits/rejected": -0.9090137481689453, "logps/chosen": -556.1804809570312, "logps/ref_chosen": -48.70683670043945, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -1086.5341796875, "loss": 1.0507, "margin_dpo/margin_mean": 497.3021545410156, "margin_dpo/margin_std": 672.8572998046875, "step": 580 }, { "KL/chosen_KL_mean": -710.769287109375, "KL/mean": -885.140380859375, "KL/rejected_KL_mean": -1059.5115966796875, "KL/std": 566.3184814453125, "epoch": 0.8531571218795888, "fcm_dpo/beta": 0.0009817921090871096, "fcm_dpo/delta": 0.05957435816526413, "fcm_dpo/margin": 348.74224853515625, "fcm_dpo/q_t": 0.42394953966140747, "grad_norm": 37.68699264526367, "learning_rate": 3.285483927764726e-08, "logits/chosen": -1.0647389888763428, "logits/rejected": -1.0730311870574951, "logps/chosen": -772.9916381835938, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -1151.247314453125, "loss": 1.1566, "margin_dpo/margin_mean": 348.74224853515625, "margin_dpo/margin_std": 622.7120361328125, "step": 581 }, { "KL/chosen_KL_mean": -618.490478515625, "KL/mean": -812.1707153320312, "KL/rejected_KL_mean": -1005.8509521484375, "KL/std": 488.8114929199219, "epoch": 0.8546255506607929, "fcm_dpo/beta": 0.0009761706460267305, "fcm_dpo/delta": -0.07844623178243637, "fcm_dpo/margin": 387.3603515625, "fcm_dpo/q_t": 0.41167423129081726, "grad_norm": 29.930849075317383, "learning_rate": 3.222175147833556e-08, "logits/chosen": -1.0147862434387207, "logits/rejected": -1.0369963645935059, "logps/chosen": -676.7191772460938, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -1115.9205322265625, "loss": 1.1147, "margin_dpo/margin_mean": 387.3603820800781, "margin_dpo/margin_std": 530.1618041992188, "step": 582 }, { "KL/chosen_KL_mean": -691.4786376953125, "KL/mean": -823.6566162109375, "KL/rejected_KL_mean": -955.8345947265625, "KL/std": 531.1171264648438, "epoch": 0.856093979441997, "fcm_dpo/beta": 0.0009659786010161042, "fcm_dpo/delta": -0.013101693242788315, "fcm_dpo/margin": 264.35589599609375, "fcm_dpo/q_t": 0.4426102340221405, "grad_norm": 65.24808502197266, "learning_rate": 3.159440233840763e-08, "logits/chosen": -0.9646916389465332, "logits/rejected": -0.9632136821746826, "logps/chosen": -748.341552734375, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -1044.238525390625, "loss": 1.2428, "margin_dpo/margin_mean": 264.3559265136719, "margin_dpo/margin_std": 639.0501098632812, "step": 583 }, { "KL/chosen_KL_mean": -608.3406982421875, "KL/mean": -859.684326171875, "KL/rejected_KL_mean": -1111.0279541015625, "KL/std": 554.8764038085938, "epoch": 0.8575624082232012, "fcm_dpo/beta": 0.0009539815364405513, "fcm_dpo/delta": -0.08356797695159912, "fcm_dpo/margin": 502.6873779296875, "fcm_dpo/q_t": 0.39147210121154785, "grad_norm": 36.447509765625, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -0.9898185133934021, "logits/rejected": -1.0079997777938843, "logps/chosen": -665.2413330078125, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -1208.6640625, "loss": 1.0358, "margin_dpo/margin_mean": 502.6874084472656, "margin_dpo/margin_std": 587.2181396484375, "step": 584 }, { "KL/chosen_KL_mean": -696.600341796875, "KL/mean": -919.7645263671875, "KL/rejected_KL_mean": -1142.9287109375, "KL/std": 609.264892578125, "epoch": 0.8590308370044053, "fcm_dpo/beta": 0.0009440815774723887, "fcm_dpo/delta": -0.022542130202054977, "fcm_dpo/margin": 446.328369140625, "fcm_dpo/q_t": 0.4053837060928345, "grad_norm": 32.03108215332031, "learning_rate": 3.035698600998121e-08, "logits/chosen": -0.9939338564872742, "logits/rejected": -1.0176794528961182, "logps/chosen": -757.5743408203125, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -1227.0982666015625, "loss": 1.1155, "margin_dpo/margin_mean": 446.328369140625, "margin_dpo/margin_std": 727.369384765625, "step": 585 }, { "KL/chosen_KL_mean": -743.328125, "KL/mean": -890.031005859375, "KL/rejected_KL_mean": -1036.73388671875, "KL/std": 537.0108642578125, "epoch": 0.8604992657856094, "fcm_dpo/beta": 0.0009613102884031832, "fcm_dpo/delta": 0.12152184545993805, "fcm_dpo/margin": 293.4059143066406, "fcm_dpo/q_t": 0.43567806482315063, "grad_norm": 36.572792053222656, "learning_rate": 2.974695142855388e-08, "logits/chosen": -1.0138568878173828, "logits/rejected": -1.0334415435791016, "logps/chosen": -800.1837158203125, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.80261993408203, "logps/rejected": -1128.53662109375, "loss": 1.2015, "margin_dpo/margin_mean": 293.4058837890625, "margin_dpo/margin_std": 593.2252197265625, "step": 586 }, { "KL/chosen_KL_mean": -507.5320129394531, "KL/mean": -715.7158813476562, "KL/rejected_KL_mean": -923.899658203125, "KL/std": 561.7125244140625, "epoch": 0.8619676945668135, "fcm_dpo/beta": 0.0009705749107524753, "fcm_dpo/delta": -0.0043886564671993256, "fcm_dpo/margin": 416.36767578125, "fcm_dpo/q_t": 0.40726912021636963, "grad_norm": 42.70491409301758, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -1.0300676822662354, "logits/rejected": -1.0568914413452148, "logps/chosen": -552.2236328125, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -1006.5235595703125, "loss": 1.0931, "margin_dpo/margin_mean": 416.36767578125, "margin_dpo/margin_std": 568.321044921875, "step": 587 }, { "KL/chosen_KL_mean": -685.742919921875, "KL/mean": -856.51806640625, "KL/rejected_KL_mean": -1027.293212890625, "KL/std": 494.27239990234375, "epoch": 0.8634361233480177, "fcm_dpo/beta": 0.0009743442060425878, "fcm_dpo/delta": 0.06948099285364151, "fcm_dpo/margin": 341.55035400390625, "fcm_dpo/q_t": 0.4227873980998993, "grad_norm": 31.037988662719727, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -0.9946512579917908, "logits/rejected": -1.018219232559204, "logps/chosen": -736.037841796875, "logps/ref_chosen": -50.29494857788086, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -1134.6630859375, "loss": 1.1426, "margin_dpo/margin_mean": 341.55035400390625, "margin_dpo/margin_std": 514.8743896484375, "step": 588 }, { "KL/chosen_KL_mean": -675.970458984375, "KL/mean": -908.8043212890625, "KL/rejected_KL_mean": -1141.63818359375, "KL/std": 545.934326171875, "epoch": 0.8649045521292217, "fcm_dpo/beta": 0.0009745459537953138, "fcm_dpo/delta": -0.0563356988132, "fcm_dpo/margin": 465.66778564453125, "fcm_dpo/q_t": 0.3962155878543854, "grad_norm": 30.49479103088379, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -0.9664604663848877, "logits/rejected": -0.9765450954437256, "logps/chosen": -735.9003295898438, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -1253.2935791015625, "loss": 1.0564, "margin_dpo/margin_mean": 465.66778564453125, "margin_dpo/margin_std": 579.8321533203125, "step": 589 }, { "KL/chosen_KL_mean": -586.0206298828125, "KL/mean": -815.29736328125, "KL/rejected_KL_mean": -1044.573974609375, "KL/std": 534.37109375, "epoch": 0.8663729809104258, "fcm_dpo/beta": 0.0009633679874241352, "fcm_dpo/delta": -0.04375208914279938, "fcm_dpo/margin": 458.553466796875, "fcm_dpo/q_t": 0.3995480537414551, "grad_norm": 30.36831283569336, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.9607778191566467, "logits/rejected": -0.9915695190429688, "logps/chosen": -641.8304443359375, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -1150.636962890625, "loss": 1.0625, "margin_dpo/margin_mean": 458.553466796875, "margin_dpo/margin_std": 575.5927734375, "step": 590 }, { "KL/chosen_KL_mean": -667.932373046875, "KL/mean": -878.5347900390625, "KL/rejected_KL_mean": -1089.13720703125, "KL/std": 525.6906127929688, "epoch": 0.8678414096916299, "fcm_dpo/beta": 0.0009612845606170595, "fcm_dpo/delta": -0.005106211174279451, "fcm_dpo/margin": 421.20489501953125, "fcm_dpo/q_t": 0.40647366642951965, "grad_norm": 31.83711051940918, "learning_rate": 2.678415274939408e-08, "logits/chosen": -1.0476133823394775, "logits/rejected": -1.0394680500030518, "logps/chosen": -724.1729736328125, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -1172.9234619140625, "loss": 1.0985, "margin_dpo/margin_mean": 421.20489501953125, "margin_dpo/margin_std": 595.3729248046875, "step": 591 }, { "KL/chosen_KL_mean": -706.2188720703125, "KL/mean": -897.91455078125, "KL/rejected_KL_mean": -1089.610107421875, "KL/std": 542.8892211914062, "epoch": 0.869309838472834, "fcm_dpo/beta": 0.0009650047868490219, "fcm_dpo/delta": 0.031190991401672363, "fcm_dpo/margin": 383.3913269042969, "fcm_dpo/q_t": 0.41676104068756104, "grad_norm": 38.8540153503418, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -1.016085147857666, "logits/rejected": -1.019473910331726, "logps/chosen": -754.1591186523438, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -1165.343017578125, "loss": 1.1713, "margin_dpo/margin_mean": 383.391357421875, "margin_dpo/margin_std": 732.060546875, "step": 592 }, { "KL/chosen_KL_mean": -659.2498779296875, "KL/mean": -815.0553588867188, "KL/rejected_KL_mean": -970.86083984375, "KL/std": 574.805908203125, "epoch": 0.8707782672540382, "fcm_dpo/beta": 0.0009783029090613127, "fcm_dpo/delta": 0.09824425727128983, "fcm_dpo/margin": 311.6109313964844, "fcm_dpo/q_t": 0.4322025775909424, "grad_norm": 47.76630783081055, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.8949644565582275, "logits/rejected": -0.8865162134170532, "logps/chosen": -707.9406127929688, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800094604492, "logps/rejected": -1031.768798828125, "loss": 1.1964, "margin_dpo/margin_mean": 311.61090087890625, "margin_dpo/margin_std": 633.5354614257812, "step": 593 }, { "KL/chosen_KL_mean": -633.9850463867188, "KL/mean": -820.4825439453125, "KL/rejected_KL_mean": -1006.97998046875, "KL/std": 562.014404296875, "epoch": 0.8722466960352423, "fcm_dpo/beta": 0.0009973826818168163, "fcm_dpo/delta": 0.02790883556008339, "fcm_dpo/margin": 372.99493408203125, "fcm_dpo/q_t": 0.4159358739852905, "grad_norm": 40.98539733886719, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -0.9719296097755432, "logits/rejected": -0.9616006016731262, "logps/chosen": -688.919921875, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967803955078, "logps/rejected": -1093.07958984375, "loss": 1.1352, "margin_dpo/margin_mean": 372.99493408203125, "margin_dpo/margin_std": 592.751708984375, "step": 594 }, { "KL/chosen_KL_mean": -608.255615234375, "KL/mean": -821.6201171875, "KL/rejected_KL_mean": -1034.984619140625, "KL/std": 540.2607421875, "epoch": 0.8737151248164464, "fcm_dpo/beta": 0.0009868217166513205, "fcm_dpo/delta": -0.022147677838802338, "fcm_dpo/margin": 426.7290344238281, "fcm_dpo/q_t": 0.4070265293121338, "grad_norm": 43.79144287109375, "learning_rate": 2.451969280180849e-08, "logits/chosen": -0.944753885269165, "logits/rejected": -0.9602541923522949, "logps/chosen": -657.676025390625, "logps/ref_chosen": -49.4204216003418, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -1115.6119384765625, "loss": 1.0866, "margin_dpo/margin_mean": 426.7290344238281, "margin_dpo/margin_std": 598.78759765625, "step": 595 }, { "KL/chosen_KL_mean": -692.2659301757812, "KL/mean": -836.673095703125, "KL/rejected_KL_mean": -981.0802001953125, "KL/std": 531.4737548828125, "epoch": 0.8751835535976505, "fcm_dpo/beta": 0.0010060444474220276, "fcm_dpo/delta": 0.11245694756507874, "fcm_dpo/margin": 288.8142395019531, "fcm_dpo/q_t": 0.4370243549346924, "grad_norm": 68.51116180419922, "learning_rate": 2.396839494982103e-08, "logits/chosen": -0.9666332006454468, "logits/rejected": -0.9320765733718872, "logps/chosen": -752.0576171875, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -1061.17138671875, "loss": 1.2115, "margin_dpo/margin_mean": 288.8142395019531, "margin_dpo/margin_std": 640.1383666992188, "step": 596 }, { "KL/chosen_KL_mean": -652.91552734375, "KL/mean": -912.2109375, "KL/rejected_KL_mean": -1171.50634765625, "KL/std": 616.0120849609375, "epoch": 0.8766519823788547, "fcm_dpo/beta": 0.0009824027074500918, "fcm_dpo/delta": -0.11863398551940918, "fcm_dpo/margin": 518.5908203125, "fcm_dpo/q_t": 0.3876197040081024, "grad_norm": 28.101728439331055, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.9247469305992126, "logits/rejected": -0.9725657105445862, "logps/chosen": -710.17626953125, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -1272.2000732421875, "loss": 1.0463, "margin_dpo/margin_mean": 518.5908203125, "margin_dpo/margin_std": 676.21826171875, "step": 597 }, { "KL/chosen_KL_mean": -655.470947265625, "KL/mean": -857.9932250976562, "KL/rejected_KL_mean": -1060.515625, "KL/std": 524.6902465820312, "epoch": 0.8781204111600588, "fcm_dpo/beta": 0.0009790980257093906, "fcm_dpo/delta": 0.002352789044380188, "fcm_dpo/margin": 405.0446472167969, "fcm_dpo/q_t": 0.4087187945842743, "grad_norm": 34.318355560302734, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -1.0320333242416382, "logits/rejected": -1.0434290170669556, "logps/chosen": -707.9893798828125, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -1149.95947265625, "loss": 1.1089, "margin_dpo/margin_mean": 405.0446472167969, "margin_dpo/margin_std": 579.4035034179688, "step": 598 }, { "KL/chosen_KL_mean": -673.4180908203125, "KL/mean": -848.3385009765625, "KL/rejected_KL_mean": -1023.2587890625, "KL/std": 517.2516479492188, "epoch": 0.8795888399412628, "fcm_dpo/beta": 0.0009746775031089783, "fcm_dpo/delta": -0.061856959015131, "fcm_dpo/margin": 349.8406982421875, "fcm_dpo/q_t": 0.41758590936660767, "grad_norm": 32.6776123046875, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -0.9535913467407227, "logits/rejected": -0.9626870155334473, "logps/chosen": -723.2208251953125, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -1106.2373046875, "loss": 1.1342, "margin_dpo/margin_mean": 349.8407287597656, "margin_dpo/margin_std": 487.59490966796875, "step": 599 }, { "KL/chosen_KL_mean": -726.7335205078125, "KL/mean": -885.2261962890625, "KL/rejected_KL_mean": -1043.7188720703125, "KL/std": 520.7708740234375, "epoch": 0.8810572687224669, "fcm_dpo/beta": 0.0009859842248260975, "fcm_dpo/delta": 0.0903782919049263, "fcm_dpo/margin": 316.98529052734375, "fcm_dpo/q_t": 0.430108904838562, "grad_norm": 33.96622848510742, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -1.0657624006271362, "logits/rejected": -1.0632259845733643, "logps/chosen": -793.16845703125, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -1129.17529296875, "loss": 1.1868, "margin_dpo/margin_mean": 316.98529052734375, "margin_dpo/margin_std": 626.9907836914062, "step": 600 }, { "KL/chosen_KL_mean": -737.2628173828125, "KL/mean": -945.91796875, "KL/rejected_KL_mean": -1154.5731201171875, "KL/std": 556.4426879882812, "epoch": 0.882525697503671, "fcm_dpo/beta": 0.0009882240556180477, "fcm_dpo/delta": -0.01307043619453907, "fcm_dpo/margin": 417.3103942871094, "fcm_dpo/q_t": 0.4040505588054657, "grad_norm": 31.343103408813477, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -0.97291100025177, "logits/rejected": -0.9917502403259277, "logps/chosen": -796.3963623046875, "logps/ref_chosen": -59.13361358642578, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -1249.26416015625, "loss": 1.0847, "margin_dpo/margin_mean": 417.3103942871094, "margin_dpo/margin_std": 547.4290771484375, "step": 601 }, { "KL/chosen_KL_mean": -505.7738037109375, "KL/mean": -752.028076171875, "KL/rejected_KL_mean": -998.2823486328125, "KL/std": 502.33154296875, "epoch": 0.8839941262848752, "fcm_dpo/beta": 0.0009830892086029053, "fcm_dpo/delta": -0.08871287107467651, "fcm_dpo/margin": 492.5085754394531, "fcm_dpo/q_t": 0.3889528512954712, "grad_norm": 67.23153686523438, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -1.0278465747833252, "logits/rejected": -1.060103416442871, "logps/chosen": -554.3673095703125, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -1085.950927734375, "loss": 1.025, "margin_dpo/margin_mean": 492.5085754394531, "margin_dpo/margin_std": 532.3154296875, "step": 602 }, { "KL/chosen_KL_mean": -637.751953125, "KL/mean": -871.9014892578125, "KL/rejected_KL_mean": -1106.051025390625, "KL/std": 581.3104248046875, "epoch": 0.8854625550660793, "fcm_dpo/beta": 0.0009643337689340115, "fcm_dpo/delta": -0.05405785143375397, "fcm_dpo/margin": 468.2989501953125, "fcm_dpo/q_t": 0.40109044313430786, "grad_norm": 38.15021896362305, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -0.9976146817207336, "logits/rejected": -0.9900000095367432, "logps/chosen": -708.1666259765625, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32559967041016, "logps/rejected": -1206.3765869140625, "loss": 1.0822, "margin_dpo/margin_mean": 468.2989501953125, "margin_dpo/margin_std": 690.9556274414062, "step": 603 }, { "KL/chosen_KL_mean": -629.3424072265625, "KL/mean": -877.6458740234375, "KL/rejected_KL_mean": -1125.9493408203125, "KL/std": 553.5061645507812, "epoch": 0.8869309838472834, "fcm_dpo/beta": 0.0009511223761364818, "fcm_dpo/delta": -0.07592622190713882, "fcm_dpo/margin": 496.60693359375, "fcm_dpo/q_t": 0.3955162465572357, "grad_norm": 38.61325454711914, "learning_rate": 1.977362051376158e-08, "logits/chosen": -0.9782446622848511, "logits/rejected": -1.012909173965454, "logps/chosen": -675.800537109375, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -1217.8038330078125, "loss": 1.0656, "margin_dpo/margin_mean": 496.60693359375, "margin_dpo/margin_std": 690.3172607421875, "step": 604 }, { "KL/chosen_KL_mean": -670.883544921875, "KL/mean": -858.872314453125, "KL/rejected_KL_mean": -1046.861083984375, "KL/std": 531.7296752929688, "epoch": 0.8883994126284875, "fcm_dpo/beta": 0.0009511103853583336, "fcm_dpo/delta": 0.04394224286079407, "fcm_dpo/margin": 375.9776611328125, "fcm_dpo/q_t": 0.4207463264465332, "grad_norm": 34.01826095581055, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -0.9739004969596863, "logits/rejected": -0.9862950444221497, "logps/chosen": -737.1328125, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -1149.166015625, "loss": 1.1403, "margin_dpo/margin_mean": 375.9776306152344, "margin_dpo/margin_std": 629.7609252929688, "step": 605 }, { "KL/chosen_KL_mean": -666.6454467773438, "KL/mean": -881.18115234375, "KL/rejected_KL_mean": -1095.7169189453125, "KL/std": 542.5753173828125, "epoch": 0.8898678414096917, "fcm_dpo/beta": 0.0009582208003848791, "fcm_dpo/delta": -0.011937655508518219, "fcm_dpo/margin": 429.0714416503906, "fcm_dpo/q_t": 0.40722784399986267, "grad_norm": 25.309036254882812, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -0.9902355670928955, "logits/rejected": -1.004211664199829, "logps/chosen": -721.4645385742188, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37146759033203, "logps/rejected": -1194.08837890625, "loss": 1.0911, "margin_dpo/margin_mean": 429.0714111328125, "margin_dpo/margin_std": 591.7274169921875, "step": 606 }, { "KL/chosen_KL_mean": -685.667236328125, "KL/mean": -858.60791015625, "KL/rejected_KL_mean": -1031.548583984375, "KL/std": 543.4450073242188, "epoch": 0.8913362701908958, "fcm_dpo/beta": 0.0009662234224379063, "fcm_dpo/delta": 0.06767666339874268, "fcm_dpo/margin": 345.88134765625, "fcm_dpo/q_t": 0.42443907260894775, "grad_norm": 29.7037353515625, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -0.9784862399101257, "logits/rejected": -0.9676879048347473, "logps/chosen": -743.7513427734375, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -1111.32568359375, "loss": 1.1544, "margin_dpo/margin_mean": 345.88134765625, "margin_dpo/margin_std": 584.5785522460938, "step": 607 }, { "KL/chosen_KL_mean": -619.2991943359375, "KL/mean": -820.6446533203125, "KL/rejected_KL_mean": -1021.9901733398438, "KL/std": 493.3348388671875, "epoch": 0.8928046989720999, "fcm_dpo/beta": 0.0009703817777335644, "fcm_dpo/delta": 0.009458957239985466, "fcm_dpo/margin": 402.6909484863281, "fcm_dpo/q_t": 0.409574419260025, "grad_norm": 36.573951721191406, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -1.0082218647003174, "logits/rejected": -1.019978642463684, "logps/chosen": -676.75, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -1116.7635498046875, "loss": 1.091, "margin_dpo/margin_mean": 402.69097900390625, "margin_dpo/margin_std": 511.0771179199219, "step": 608 }, { "KL/chosen_KL_mean": -638.9405517578125, "KL/mean": -896.8486328125, "KL/rejected_KL_mean": -1154.756591796875, "KL/std": 665.662841796875, "epoch": 0.8942731277533039, "fcm_dpo/beta": 0.0009552284609526396, "fcm_dpo/delta": -0.09746446460485458, "fcm_dpo/margin": 515.8161010742188, "fcm_dpo/q_t": 0.395630419254303, "grad_norm": 28.152240753173828, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -1.0558668375015259, "logits/rejected": -1.0789850950241089, "logps/chosen": -697.7459716796875, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -1243.57275390625, "loss": 1.0668, "margin_dpo/margin_mean": 515.8161010742188, "margin_dpo/margin_std": 792.5299682617188, "step": 609 }, { "KL/chosen_KL_mean": -631.236083984375, "KL/mean": -791.146484375, "KL/rejected_KL_mean": -951.0569458007812, "KL/std": 500.4407653808594, "epoch": 0.895741556534508, "fcm_dpo/beta": 0.0009649534476920962, "fcm_dpo/delta": 0.09404957294464111, "fcm_dpo/margin": 319.82086181640625, "fcm_dpo/q_t": 0.42811504006385803, "grad_norm": 38.88047409057617, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.9684814214706421, "logits/rejected": -0.9458719491958618, "logps/chosen": -696.9310913085938, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.40538787841797, "logps/rejected": -1034.46240234375, "loss": 1.175, "margin_dpo/margin_mean": 319.8208923339844, "margin_dpo/margin_std": 581.705078125, "step": 610 }, { "KL/chosen_KL_mean": -665.9498291015625, "KL/mean": -936.8076171875, "KL/rejected_KL_mean": -1207.665283203125, "KL/std": 653.8685302734375, "epoch": 0.8972099853157122, "fcm_dpo/beta": 0.0009503072360530496, "fcm_dpo/delta": -0.12105247378349304, "fcm_dpo/margin": 541.71533203125, "fcm_dpo/q_t": 0.38792964816093445, "grad_norm": 27.193374633789062, "learning_rate": 1.6421423736208e-08, "logits/chosen": -1.0235629081726074, "logits/rejected": -1.0670585632324219, "logps/chosen": -718.54931640625, "logps/ref_chosen": -52.59946823120117, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -1293.9962158203125, "loss": 1.0461, "margin_dpo/margin_mean": 541.71533203125, "margin_dpo/margin_std": 745.772705078125, "step": 611 }, { "KL/chosen_KL_mean": -726.8673095703125, "KL/mean": -931.6278076171875, "KL/rejected_KL_mean": -1136.388427734375, "KL/std": 533.6715087890625, "epoch": 0.8986784140969163, "fcm_dpo/beta": 0.000949513225350529, "fcm_dpo/delta": 0.01131674274802208, "fcm_dpo/margin": 409.5210266113281, "fcm_dpo/q_t": 0.4112043082714081, "grad_norm": 31.31951141357422, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -1.0205453634262085, "logits/rejected": -1.0193266868591309, "logps/chosen": -786.1910400390625, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -1224.70068359375, "loss": 1.1061, "margin_dpo/margin_mean": 409.5210266113281, "margin_dpo/margin_std": 582.0534057617188, "step": 612 }, { "KL/chosen_KL_mean": -618.98388671875, "KL/mean": -881.4771728515625, "KL/rejected_KL_mean": -1143.9703369140625, "KL/std": 597.9173583984375, "epoch": 0.9001468428781204, "fcm_dpo/beta": 0.0009315350907854736, "fcm_dpo/delta": -0.09393209218978882, "fcm_dpo/margin": 524.9864501953125, "fcm_dpo/q_t": 0.3888673782348633, "grad_norm": 34.82392883300781, "learning_rate": 1.551886292185553e-08, "logits/chosen": -1.0204041004180908, "logits/rejected": -1.0734975337982178, "logps/chosen": -678.7138671875, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10752868652344, "logps/rejected": -1249.077880859375, "loss": 1.0324, "margin_dpo/margin_mean": 524.9863891601562, "margin_dpo/margin_std": 627.1439208984375, "step": 613 }, { "KL/chosen_KL_mean": -688.460693359375, "KL/mean": -937.7681884765625, "KL/rejected_KL_mean": -1187.07568359375, "KL/std": 594.8397827148438, "epoch": 0.9016152716593245, "fcm_dpo/beta": 0.000922086532227695, "fcm_dpo/delta": -0.06259925663471222, "fcm_dpo/margin": 498.6150207519531, "fcm_dpo/q_t": 0.39733168482780457, "grad_norm": 43.358585357666016, "learning_rate": 1.507684480352292e-08, "logits/chosen": -1.0039961338043213, "logits/rejected": -1.0779341459274292, "logps/chosen": -741.399658203125, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -1291.755126953125, "loss": 1.0741, "margin_dpo/margin_mean": 498.614990234375, "margin_dpo/margin_std": 705.9459228515625, "step": 614 }, { "KL/chosen_KL_mean": -664.404541015625, "KL/mean": -871.7774047851562, "KL/rejected_KL_mean": -1079.150146484375, "KL/std": 616.7755126953125, "epoch": 0.9030837004405287, "fcm_dpo/beta": 0.0009228853159584105, "fcm_dpo/delta": 0.017665421590209007, "fcm_dpo/margin": 414.74560546875, "fcm_dpo/q_t": 0.4132547974586487, "grad_norm": 27.085163116455078, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -0.9974070191383362, "logits/rejected": -1.0210152864456177, "logps/chosen": -730.2218627929688, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -1174.32763671875, "loss": 1.1269, "margin_dpo/margin_mean": 414.7456359863281, "margin_dpo/margin_std": 670.7782592773438, "step": 615 }, { "KL/chosen_KL_mean": -786.2246704101562, "KL/mean": -949.08154296875, "KL/rejected_KL_mean": -1111.9384765625, "KL/std": 518.137451171875, "epoch": 0.9045521292217328, "fcm_dpo/beta": 0.00093449791893363, "fcm_dpo/delta": 0.09862032532691956, "fcm_dpo/margin": 325.71380615234375, "fcm_dpo/q_t": 0.430539608001709, "grad_norm": 33.06167221069336, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -1.0744967460632324, "logits/rejected": -1.0585415363311768, "logps/chosen": -851.3575439453125, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -1186.638916015625, "loss": 1.1748, "margin_dpo/margin_mean": 325.71380615234375, "margin_dpo/margin_std": 578.4287719726562, "step": 616 }, { "KL/chosen_KL_mean": -737.30859375, "KL/mean": -861.140625, "KL/rejected_KL_mean": -984.97265625, "KL/std": 481.3660583496094, "epoch": 0.9060205580029369, "fcm_dpo/beta": 0.0009459134307689965, "fcm_dpo/delta": 0.07147952169179916, "fcm_dpo/margin": 247.66403198242188, "fcm_dpo/q_t": 0.44637531042099, "grad_norm": 58.81951141357422, "learning_rate": 1.378797888467345e-08, "logits/chosen": -0.9604687690734863, "logits/rejected": -0.9302307367324829, "logps/chosen": -800.3141479492188, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -1049.206787109375, "loss": 1.2335, "margin_dpo/margin_mean": 247.66403198242188, "margin_dpo/margin_std": 561.760009765625, "step": 617 }, { "KL/chosen_KL_mean": -777.463623046875, "KL/mean": -1048.628662109375, "KL/rejected_KL_mean": -1319.7938232421875, "KL/std": 667.0958251953125, "epoch": 0.9074889867841409, "fcm_dpo/beta": 0.0009419023990631104, "fcm_dpo/delta": -0.11681665480136871, "fcm_dpo/margin": 542.3301391601562, "fcm_dpo/q_t": 0.39000076055526733, "grad_norm": 38.52542495727539, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -0.9963364601135254, "logits/rejected": -1.0328341722488403, "logps/chosen": -844.5650024414062, "logps/ref_chosen": -67.10134887695312, "logps/ref_rejected": -92.15340423583984, "logps/rejected": -1411.947265625, "loss": 1.0842, "margin_dpo/margin_mean": 542.3301391601562, "margin_dpo/margin_std": 860.1188354492188, "step": 618 }, { "KL/chosen_KL_mean": -769.1409912109375, "KL/mean": -969.6728515625, "KL/rejected_KL_mean": -1170.204833984375, "KL/std": 620.4271240234375, "epoch": 0.908957415565345, "fcm_dpo/beta": 0.0009351515327580273, "fcm_dpo/delta": 0.025874076411128044, "fcm_dpo/margin": 401.0638427734375, "fcm_dpo/q_t": 0.42254310846328735, "grad_norm": 50.947975158691406, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -0.9994246959686279, "logits/rejected": -1.0098530054092407, "logps/chosen": -825.1192016601562, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -1263.39013671875, "loss": 1.1798, "margin_dpo/margin_mean": 401.0638427734375, "margin_dpo/margin_std": 831.8917236328125, "step": 619 }, { "KL/chosen_KL_mean": -677.9056396484375, "KL/mean": -864.861328125, "KL/rejected_KL_mean": -1051.817138671875, "KL/std": 525.7140502929688, "epoch": 0.9104258443465492, "fcm_dpo/beta": 0.0009386817691847682, "fcm_dpo/delta": 0.05065443366765976, "fcm_dpo/margin": 373.9115295410156, "fcm_dpo/q_t": 0.4203672409057617, "grad_norm": 34.1131706237793, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -1.036217451095581, "logits/rejected": -1.0394688844680786, "logps/chosen": -737.703125, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -1130.2279052734375, "loss": 1.1384, "margin_dpo/margin_mean": 373.9115295410156, "margin_dpo/margin_std": 589.354248046875, "step": 620 }, { "KL/chosen_KL_mean": -688.1146240234375, "KL/mean": -972.752197265625, "KL/rejected_KL_mean": -1257.3896484375, "KL/std": 649.8193359375, "epoch": 0.9118942731277533, "fcm_dpo/beta": 0.0009327299194410443, "fcm_dpo/delta": -0.1384207010269165, "fcm_dpo/margin": 569.275146484375, "fcm_dpo/q_t": 0.3806772232055664, "grad_norm": 41.840362548828125, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -1.082035779953003, "logits/rejected": -1.1331275701522827, "logps/chosen": -742.04833984375, "logps/ref_chosen": -53.93375778198242, "logps/ref_rejected": -88.36951446533203, "logps/rejected": -1345.75927734375, "loss": 1.0289, "margin_dpo/margin_mean": 569.275146484375, "margin_dpo/margin_std": 718.2183227539062, "step": 621 }, { "KL/chosen_KL_mean": -670.6583251953125, "KL/mean": -874.29296875, "KL/rejected_KL_mean": -1077.927490234375, "KL/std": 506.2366943359375, "epoch": 0.9133627019089574, "fcm_dpo/beta": 0.0009187752148136497, "fcm_dpo/delta": 0.026527073234319687, "fcm_dpo/margin": 407.2692565917969, "fcm_dpo/q_t": 0.41589581966400146, "grad_norm": 29.52936553955078, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.9557490348815918, "logits/rejected": -0.944530189037323, "logps/chosen": -730.9442138671875, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -1163.4462890625, "loss": 1.1216, "margin_dpo/margin_mean": 407.26922607421875, "margin_dpo/margin_std": 625.4295654296875, "step": 622 }, { "KL/chosen_KL_mean": -723.661376953125, "KL/mean": -879.453369140625, "KL/rejected_KL_mean": -1035.2454833984375, "KL/std": 520.5437622070312, "epoch": 0.9148311306901615, "fcm_dpo/beta": 0.0009395014494657516, "fcm_dpo/delta": 0.11050058901309967, "fcm_dpo/margin": 311.5841064453125, "fcm_dpo/q_t": 0.4357511103153229, "grad_norm": 37.05131912231445, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -1.0592715740203857, "logits/rejected": -1.0595531463623047, "logps/chosen": -787.8182983398438, "logps/ref_chosen": -64.1569595336914, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -1120.3284912109375, "loss": 1.1952, "margin_dpo/margin_mean": 311.5841369628906, "margin_dpo/margin_std": 634.1216430664062, "step": 623 }, { "KL/chosen_KL_mean": -728.6787719726562, "KL/mean": -973.8768920898438, "KL/rejected_KL_mean": -1219.074951171875, "KL/std": 544.4788818359375, "epoch": 0.9162995594713657, "fcm_dpo/beta": 0.0009369177860207856, "fcm_dpo/delta": -0.062284573912620544, "fcm_dpo/margin": 490.396240234375, "fcm_dpo/q_t": 0.3928346335887909, "grad_norm": 32.84885787963867, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -1.0861725807189941, "logits/rejected": -1.088505506515503, "logps/chosen": -800.597412109375, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -1316.20703125, "loss": 1.0496, "margin_dpo/margin_mean": 490.39617919921875, "margin_dpo/margin_std": 585.4035034179688, "step": 624 }, { "KL/chosen_KL_mean": -670.5234375, "KL/mean": -930.5636596679688, "KL/rejected_KL_mean": -1190.603759765625, "KL/std": 588.0369873046875, "epoch": 0.9177679882525698, "fcm_dpo/beta": 0.0009238402126356959, "fcm_dpo/delta": -0.08445164561271667, "fcm_dpo/margin": 520.080322265625, "fcm_dpo/q_t": 0.38963061571121216, "grad_norm": 70.59078979492188, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -0.9677177667617798, "logits/rejected": -0.9921514391899109, "logps/chosen": -728.8655395507812, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -1276.6942138671875, "loss": 1.0202, "margin_dpo/margin_mean": 520.080322265625, "margin_dpo/margin_std": 548.3095703125, "step": 625 }, { "KL/chosen_KL_mean": -836.838623046875, "KL/mean": -983.3013305664062, "KL/rejected_KL_mean": -1129.763916015625, "KL/std": 631.4542236328125, "epoch": 0.9192364170337739, "fcm_dpo/beta": 0.0009358528186567128, "fcm_dpo/delta": 0.12933696806430817, "fcm_dpo/margin": 292.92529296875, "fcm_dpo/q_t": 0.43587183952331543, "grad_norm": 34.235252380371094, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.9614785313606262, "logits/rejected": -0.9531521797180176, "logps/chosen": -911.9512939453125, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.188720703125, "logps/rejected": -1228.95263671875, "loss": 1.2146, "margin_dpo/margin_mean": 292.92529296875, "margin_dpo/margin_std": 614.7378540039062, "step": 626 }, { "KL/chosen_KL_mean": -566.9566650390625, "KL/mean": -861.046875, "KL/rejected_KL_mean": -1155.136962890625, "KL/std": 695.9779052734375, "epoch": 0.920704845814978, "fcm_dpo/beta": 0.0009262310341000557, "fcm_dpo/delta": -0.15298572182655334, "fcm_dpo/margin": 588.1803588867188, "fcm_dpo/q_t": 0.38598155975341797, "grad_norm": 32.391971588134766, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.9232186079025269, "logits/rejected": -0.9977039098739624, "logps/chosen": -614.6998291015625, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -1261.8914794921875, "loss": 1.0311, "margin_dpo/margin_mean": 588.1802978515625, "margin_dpo/margin_std": 796.7737426757812, "step": 627 }, { "KL/chosen_KL_mean": -744.3592529296875, "KL/mean": -966.3853759765625, "KL/rejected_KL_mean": -1188.41162109375, "KL/std": 574.35107421875, "epoch": 0.922173274596182, "fcm_dpo/beta": 0.000911533716134727, "fcm_dpo/delta": -0.004994707182049751, "fcm_dpo/margin": 444.0523681640625, "fcm_dpo/q_t": 0.4087638854980469, "grad_norm": 32.3719367980957, "learning_rate": 9.543589206795238e-09, "logits/chosen": -1.0520081520080566, "logits/rejected": -1.065995454788208, "logps/chosen": -804.5421752929688, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -1289.96630859375, "loss": 1.1065, "margin_dpo/margin_mean": 444.0523681640625, "margin_dpo/margin_std": 661.4337158203125, "step": 628 }, { "KL/chosen_KL_mean": -735.479248046875, "KL/mean": -940.03759765625, "KL/rejected_KL_mean": -1144.595947265625, "KL/std": 554.8238525390625, "epoch": 0.9236417033773862, "fcm_dpo/beta": 0.0009158846805803478, "fcm_dpo/delta": 0.026277855038642883, "fcm_dpo/margin": 409.11669921875, "fcm_dpo/q_t": 0.4125128388404846, "grad_norm": 35.62141036987305, "learning_rate": 9.19555885822887e-09, "logits/chosen": -1.0496397018432617, "logits/rejected": -1.0647929906845093, "logps/chosen": -799.6928100585938, "logps/ref_chosen": -64.21354675292969, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -1236.2496337890625, "loss": 1.1059, "margin_dpo/margin_mean": 409.11669921875, "margin_dpo/margin_std": 548.577880859375, "step": 629 }, { "KL/chosen_KL_mean": -664.2507934570312, "KL/mean": -786.7623291015625, "KL/rejected_KL_mean": -909.27392578125, "KL/std": 577.2899169921875, "epoch": 0.9251101321585903, "fcm_dpo/beta": 0.0009212232544086874, "fcm_dpo/delta": 0.045750658959150314, "fcm_dpo/margin": 245.02310180664062, "fcm_dpo/q_t": 0.4528850317001343, "grad_norm": 56.84680938720703, "learning_rate": 8.85387393063622e-09, "logits/chosen": -1.043975830078125, "logits/rejected": -1.0191277265548706, "logps/chosen": -723.5418090820312, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -992.8721923828125, "loss": 1.2698, "margin_dpo/margin_mean": 245.02308654785156, "margin_dpo/margin_std": 707.3883056640625, "step": 630 }, { "KL/chosen_KL_mean": -800.809814453125, "KL/mean": -987.3739013671875, "KL/rejected_KL_mean": -1173.93798828125, "KL/std": 571.5894775390625, "epoch": 0.9265785609397944, "fcm_dpo/beta": 0.0009341588011011481, "fcm_dpo/delta": 0.053176864981651306, "fcm_dpo/margin": 373.1282043457031, "fcm_dpo/q_t": 0.4202990233898163, "grad_norm": 33.039405822753906, "learning_rate": 8.518543427732949e-09, "logits/chosen": -1.1113148927688599, "logits/rejected": -1.120398759841919, "logps/chosen": -860.263427734375, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95156860351562, "logps/rejected": -1254.8896484375, "loss": 1.1597, "margin_dpo/margin_mean": 373.1282043457031, "margin_dpo/margin_std": 670.2313232421875, "step": 631 }, { "KL/chosen_KL_mean": -700.687744140625, "KL/mean": -902.2703857421875, "KL/rejected_KL_mean": -1103.85302734375, "KL/std": 529.5032348632812, "epoch": 0.9280469897209985, "fcm_dpo/beta": 0.0009340323740616441, "fcm_dpo/delta": 0.023960798978805542, "fcm_dpo/margin": 403.1651916503906, "fcm_dpo/q_t": 0.4138449430465698, "grad_norm": 43.67294692993164, "learning_rate": 8.189576185789637e-09, "logits/chosen": -1.0655746459960938, "logits/rejected": -1.0689226388931274, "logps/chosen": -762.039306640625, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -1190.01318359375, "loss": 1.1318, "margin_dpo/margin_mean": 403.16522216796875, "margin_dpo/margin_std": 643.6358642578125, "step": 632 }, { "KL/chosen_KL_mean": -774.7955322265625, "KL/mean": -919.6793212890625, "KL/rejected_KL_mean": -1064.563232421875, "KL/std": 531.289306640625, "epoch": 0.9295154185022027, "fcm_dpo/beta": 0.0009426448959857225, "fcm_dpo/delta": 0.02990627847611904, "fcm_dpo/margin": 289.7676696777344, "fcm_dpo/q_t": 0.437002032995224, "grad_norm": 47.90824508666992, "learning_rate": 7.866980873399015e-09, "logits/chosen": -1.116697072982788, "logits/rejected": -1.1289957761764526, "logps/chosen": -832.0736694335938, "logps/ref_chosen": -57.27816390991211, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -1156.147216796875, "loss": 1.2107, "margin_dpo/margin_mean": 289.76763916015625, "margin_dpo/margin_std": 609.2022094726562, "step": 633 }, { "KL/chosen_KL_mean": -864.3818969726562, "KL/mean": -987.146728515625, "KL/rejected_KL_mean": -1109.9114990234375, "KL/std": 618.4544677734375, "epoch": 0.9309838472834068, "fcm_dpo/beta": 0.0009592788992449641, "fcm_dpo/delta": 0.0725301131606102, "fcm_dpo/margin": 245.5295867919922, "fcm_dpo/q_t": 0.44761770963668823, "grad_norm": 36.095970153808594, "learning_rate": 7.550765991247654e-09, "logits/chosen": -0.9695584774017334, "logits/rejected": -0.9623770117759705, "logps/chosen": -931.0008544921875, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12564849853516, "logps/rejected": -1217.037109375, "loss": 1.2439, "margin_dpo/margin_mean": 245.52960205078125, "margin_dpo/margin_std": 606.821533203125, "step": 634 }, { "KL/chosen_KL_mean": -748.3845825195312, "KL/mean": -925.947509765625, "KL/rejected_KL_mean": -1103.51025390625, "KL/std": 645.3294067382812, "epoch": 0.9324522760646109, "fcm_dpo/beta": 0.0009680173825472593, "fcm_dpo/delta": 0.05820862203836441, "fcm_dpo/margin": 355.125732421875, "fcm_dpo/q_t": 0.42416542768478394, "grad_norm": 35.836708068847656, "learning_rate": 7.240939871891699e-09, "logits/chosen": -1.0464322566986084, "logits/rejected": -1.0263543128967285, "logps/chosen": -822.340087890625, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -1186.0107421875, "loss": 1.1569, "margin_dpo/margin_mean": 355.125732421875, "margin_dpo/margin_std": 636.2976684570312, "step": 635 }, { "KL/chosen_KL_mean": -698.6964111328125, "KL/mean": -911.1131591796875, "KL/rejected_KL_mean": -1123.5299072265625, "KL/std": 614.4910888671875, "epoch": 0.933920704845815, "fcm_dpo/beta": 0.0009746984578669071, "fcm_dpo/delta": -0.015476349741220474, "fcm_dpo/margin": 424.83349609375, "fcm_dpo/q_t": 0.4083341956138611, "grad_norm": 29.87440299987793, "learning_rate": 6.937510679537628e-09, "logits/chosen": -1.0113909244537354, "logits/rejected": -1.0161449909210205, "logps/chosen": -758.3253173828125, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -1205.5087890625, "loss": 1.0972, "margin_dpo/margin_mean": 424.83349609375, "margin_dpo/margin_std": 628.7258911132812, "step": 636 }, { "KL/chosen_KL_mean": -713.9525146484375, "KL/mean": -948.3504638671875, "KL/rejected_KL_mean": -1182.74853515625, "KL/std": 626.5479736328125, "epoch": 0.9353891336270191, "fcm_dpo/beta": 0.0009556564618833363, "fcm_dpo/delta": -0.050962455570697784, "fcm_dpo/margin": 468.7959899902344, "fcm_dpo/q_t": 0.4004845917224884, "grad_norm": 30.655946731567383, "learning_rate": 6.640486409826785e-09, "logits/chosen": -1.1256394386291504, "logits/rejected": -1.1713881492614746, "logps/chosen": -763.6051635742188, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -1281.153564453125, "loss": 1.0745, "margin_dpo/margin_mean": 468.79595947265625, "margin_dpo/margin_std": 649.401123046875, "step": 637 }, { "KL/chosen_KL_mean": -686.6837768554688, "KL/mean": -873.9114990234375, "KL/rejected_KL_mean": -1061.1392822265625, "KL/std": 583.2347412109375, "epoch": 0.9368575624082232, "fcm_dpo/beta": 0.0009507788345217705, "fcm_dpo/delta": -0.07280878722667694, "fcm_dpo/margin": 374.45550537109375, "fcm_dpo/q_t": 0.41397538781166077, "grad_norm": 36.20570755004883, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.9657926559448242, "logits/rejected": -0.9476113319396973, "logps/chosen": -744.8404541015625, "logps/ref_chosen": -58.156639099121094, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -1140.4407958984375, "loss": 1.1633, "margin_dpo/margin_mean": 374.45550537109375, "margin_dpo/margin_std": 676.257568359375, "step": 638 }, { "KL/chosen_KL_mean": -945.5469360351562, "KL/mean": -1030.381591796875, "KL/rejected_KL_mean": -1115.21630859375, "KL/std": 560.040283203125, "epoch": 0.9383259911894273, "fcm_dpo/beta": 0.0009438564302399755, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 169.6693115234375, "fcm_dpo/q_t": 0.46391725540161133, "grad_norm": 108.60933685302734, "learning_rate": 6.065683776815933e-09, "logits/chosen": -0.9735069274902344, "logits/rejected": -0.9152404069900513, "logps/chosen": -1017.8701171875, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -1189.4912109375, "loss": 1.3368, "margin_dpo/margin_mean": 169.6693115234375, "margin_dpo/margin_std": 701.151611328125, "step": 639 }, { "KL/chosen_KL_mean": -711.00732421875, "KL/mean": -993.779541015625, "KL/rejected_KL_mean": -1276.5517578125, "KL/std": 650.807373046875, "epoch": 0.9397944199706314, "fcm_dpo/beta": 0.0009295439813286066, "fcm_dpo/delta": -0.13248543441295624, "fcm_dpo/margin": 565.5443725585938, "fcm_dpo/q_t": 0.3859960734844208, "grad_norm": 46.08125305175781, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.9270308613777161, "logits/rejected": -0.9523489475250244, "logps/chosen": -767.1417236328125, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -1385.15185546875, "loss": 1.0366, "margin_dpo/margin_mean": 565.5443725585938, "margin_dpo/margin_std": 759.7664794921875, "step": 640 }, { "KL/chosen_KL_mean": -829.0867919921875, "KL/mean": -1006.7977294921875, "KL/rejected_KL_mean": -1184.5086669921875, "KL/std": 552.3861083984375, "epoch": 0.9412628487518355, "fcm_dpo/beta": 0.000928039662539959, "fcm_dpo/delta": 0.0726061537861824, "fcm_dpo/margin": 355.4219055175781, "fcm_dpo/q_t": 0.4274270534515381, "grad_norm": 28.046123504638672, "learning_rate": 5.516592558795746e-09, "logits/chosen": -1.041335940361023, "logits/rejected": -1.0468769073486328, "logps/chosen": -894.083740234375, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -1271.5009765625, "loss": 1.1886, "margin_dpo/margin_mean": 355.42193603515625, "margin_dpo/margin_std": 733.6763916015625, "step": 641 }, { "KL/chosen_KL_mean": -794.72314453125, "KL/mean": -1015.3919677734375, "KL/rejected_KL_mean": -1236.060791015625, "KL/std": 731.3734130859375, "epoch": 0.9427312775330396, "fcm_dpo/beta": 0.0009348751045763493, "fcm_dpo/delta": -0.013475339859724045, "fcm_dpo/margin": 441.3376770019531, "fcm_dpo/q_t": 0.4158053398132324, "grad_norm": 40.215126037597656, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.9885178804397583, "logits/rejected": -1.0216963291168213, "logps/chosen": -860.412353515625, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -1346.3028564453125, "loss": 1.1535, "margin_dpo/margin_mean": 441.33770751953125, "margin_dpo/margin_std": 880.50634765625, "step": 642 }, { "KL/chosen_KL_mean": -712.148193359375, "KL/mean": -884.007080078125, "KL/rejected_KL_mean": -1055.865966796875, "KL/std": 530.5042724609375, "epoch": 0.9441997063142438, "fcm_dpo/beta": 0.0009251800365746021, "fcm_dpo/delta": -0.02657410502433777, "fcm_dpo/margin": 343.71783447265625, "fcm_dpo/q_t": 0.42580801248550415, "grad_norm": 39.23821258544922, "learning_rate": 4.993270631642038e-09, "logits/chosen": -1.1170873641967773, "logits/rejected": -1.1149516105651855, "logps/chosen": -764.09814453125, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -1143.3343505859375, "loss": 1.1537, "margin_dpo/margin_mean": 343.71783447265625, "margin_dpo/margin_std": 526.9955444335938, "step": 643 }, { "KL/chosen_KL_mean": -688.5614624023438, "KL/mean": -862.1695556640625, "KL/rejected_KL_mean": -1035.777587890625, "KL/std": 615.8262939453125, "epoch": 0.9456681350954479, "fcm_dpo/beta": 0.0009356926893815398, "fcm_dpo/delta": 0.07765576243400574, "fcm_dpo/margin": 347.2160949707031, "fcm_dpo/q_t": 0.4279705882072449, "grad_norm": 38.85586166381836, "learning_rate": 4.741290495811873e-09, "logits/chosen": -0.9950805306434631, "logits/rejected": -1.0022577047348022, "logps/chosen": -747.5791015625, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -1122.914306640625, "loss": 1.1888, "margin_dpo/margin_mean": 347.2160949707031, "margin_dpo/margin_std": 702.9293212890625, "step": 644 }, { "KL/chosen_KL_mean": -722.38525390625, "KL/mean": -811.447509765625, "KL/rejected_KL_mean": -900.5097045898438, "KL/std": 496.90313720703125, "epoch": 0.947136563876652, "fcm_dpo/beta": 0.0009528464288450778, "fcm_dpo/delta": 0.06967134773731232, "fcm_dpo/margin": 178.12442016601562, "fcm_dpo/q_t": 0.4628395438194275, "grad_norm": 89.61138916015625, "learning_rate": 4.495773155069299e-09, "logits/chosen": -0.9825261831283569, "logits/rejected": -0.968986988067627, "logps/chosen": -778.2612915039062, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -998.29052734375, "loss": 1.3289, "margin_dpo/margin_mean": 178.12442016601562, "margin_dpo/margin_std": 673.5927734375, "step": 645 }, { "KL/chosen_KL_mean": -689.6497802734375, "KL/mean": -844.453857421875, "KL/rejected_KL_mean": -999.2579956054688, "KL/std": 476.8377990722656, "epoch": 0.9486049926578561, "fcm_dpo/beta": 0.0009646883700042963, "fcm_dpo/delta": 0.10456812381744385, "fcm_dpo/margin": 309.60821533203125, "fcm_dpo/q_t": 0.4325307607650757, "grad_norm": 46.54256057739258, "learning_rate": 4.256725079024553e-09, "logits/chosen": -1.0059431791305542, "logits/rejected": -0.9890854954719543, "logps/chosen": -750.925537109375, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -1076.7637939453125, "loss": 1.1834, "margin_dpo/margin_mean": 309.6082458496094, "margin_dpo/margin_std": 569.5079345703125, "step": 646 }, { "KL/chosen_KL_mean": -608.05615234375, "KL/mean": -797.486083984375, "KL/rejected_KL_mean": -986.9160766601562, "KL/std": 535.0508422851562, "epoch": 0.9500734214390602, "fcm_dpo/beta": 0.0009775401558727026, "fcm_dpo/delta": 0.030786845833063126, "fcm_dpo/margin": 378.85992431640625, "fcm_dpo/q_t": 0.41445714235305786, "grad_norm": 35.470096588134766, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.937364935874939, "logits/rejected": -0.965479850769043, "logps/chosen": -662.9085693359375, "logps/ref_chosen": -54.8524169921875, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -1080.435546875, "loss": 1.1137, "margin_dpo/margin_mean": 378.85992431640625, "margin_dpo/margin_std": 526.0119018554688, "step": 647 }, { "KL/chosen_KL_mean": -638.0758056640625, "KL/mean": -920.2573852539062, "KL/rejected_KL_mean": -1202.43896484375, "KL/std": 642.937255859375, "epoch": 0.9515418502202643, "fcm_dpo/beta": 0.000957622891291976, "fcm_dpo/delta": -0.14847612380981445, "fcm_dpo/margin": 564.3631591796875, "fcm_dpo/q_t": 0.3837600648403168, "grad_norm": 27.33829116821289, "learning_rate": 3.798061746947995e-09, "logits/chosen": -1.0410782098770142, "logits/rejected": -1.0981464385986328, "logps/chosen": -692.247314453125, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.7127914428711, "logps/rejected": -1301.15185546875, "loss": 1.0285, "margin_dpo/margin_mean": 564.3631591796875, "margin_dpo/margin_std": 768.2374267578125, "step": 648 }, { "KL/chosen_KL_mean": -680.2294921875, "KL/mean": -809.4317626953125, "KL/rejected_KL_mean": -938.6341552734375, "KL/std": 502.25543212890625, "epoch": 0.9530102790014684, "fcm_dpo/beta": 0.0009556890581734478, "fcm_dpo/delta": 0.055874936282634735, "fcm_dpo/margin": 258.4045715332031, "fcm_dpo/q_t": 0.4464304447174072, "grad_norm": 29.119403839111328, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -1.0787172317504883, "logits/rejected": -1.0679619312286377, "logps/chosen": -742.7098388671875, "logps/ref_chosen": -62.480350494384766, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -1018.7113037109375, "loss": 1.2415, "margin_dpo/margin_mean": 258.4045715332031, "margin_dpo/margin_std": 620.7310791015625, "step": 649 }, { "KL/chosen_KL_mean": -735.5718994140625, "KL/mean": -945.16455078125, "KL/rejected_KL_mean": -1154.757080078125, "KL/std": 632.589599609375, "epoch": 0.9544787077826725, "fcm_dpo/beta": 0.0009581187041476369, "fcm_dpo/delta": -0.001857999712228775, "fcm_dpo/margin": 419.18524169921875, "fcm_dpo/q_t": 0.4114704728126526, "grad_norm": 34.31390380859375, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -1.0013569593429565, "logits/rejected": -1.0217807292938232, "logps/chosen": -791.6647338867188, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -1253.02197265625, "loss": 1.1353, "margin_dpo/margin_mean": 419.18524169921875, "margin_dpo/margin_std": 727.79541015625, "step": 650 }, { "KL/chosen_KL_mean": -482.6407165527344, "KL/mean": -760.6094970703125, "KL/rejected_KL_mean": -1038.578369140625, "KL/std": 583.9876708984375, "epoch": 0.9559471365638766, "fcm_dpo/beta": 0.000941460719332099, "fcm_dpo/delta": -0.13033278286457062, "fcm_dpo/margin": 555.9376220703125, "fcm_dpo/q_t": 0.38108521699905396, "grad_norm": 45.66421127319336, "learning_rate": 3.158738163478475e-09, "logits/chosen": -1.0007972717285156, "logits/rejected": -1.059419870376587, "logps/chosen": -526.066162109375, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.95791625976562, "logps/rejected": -1138.5361328125, "loss": 1.0005, "margin_dpo/margin_mean": 555.9376220703125, "margin_dpo/margin_std": 601.0250244140625, "step": 651 }, { "KL/chosen_KL_mean": -632.904052734375, "KL/mean": -837.604736328125, "KL/rejected_KL_mean": -1042.305419921875, "KL/std": 583.210693359375, "epoch": 0.9574155653450808, "fcm_dpo/beta": 0.0009400760754942894, "fcm_dpo/delta": 0.015617836266756058, "fcm_dpo/margin": 409.4013671875, "fcm_dpo/q_t": 0.4142574071884155, "grad_norm": 32.21805953979492, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -1.0335817337036133, "logits/rejected": -1.0595180988311768, "logps/chosen": -695.4808959960938, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -1154.0732421875, "loss": 1.1271, "margin_dpo/margin_mean": 409.4013671875, "margin_dpo/margin_std": 668.522216796875, "step": 652 }, { "KL/chosen_KL_mean": -756.5819091796875, "KL/mean": -952.8492431640625, "KL/rejected_KL_mean": -1149.116455078125, "KL/std": 617.5569458007812, "epoch": 0.9588839941262849, "fcm_dpo/beta": 0.0009441639995202422, "fcm_dpo/delta": 0.03045791946351528, "fcm_dpo/margin": 392.53460693359375, "fcm_dpo/q_t": 0.41772544384002686, "grad_norm": 32.95127487182617, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -1.0887930393218994, "logits/rejected": -1.1141128540039062, "logps/chosen": -817.6948852539062, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -1252.3660888671875, "loss": 1.1353, "margin_dpo/margin_mean": 392.53460693359375, "margin_dpo/margin_std": 648.3425903320312, "step": 653 }, { "KL/chosen_KL_mean": -682.6639404296875, "KL/mean": -862.1605834960938, "KL/rejected_KL_mean": -1041.6572265625, "KL/std": 506.5113220214844, "epoch": 0.960352422907489, "fcm_dpo/beta": 0.0009525552159175277, "fcm_dpo/delta": 0.06012295186519623, "fcm_dpo/margin": 358.99334716796875, "fcm_dpo/q_t": 0.42324844002723694, "grad_norm": 34.27675247192383, "learning_rate": 2.577954022936174e-09, "logits/chosen": -1.0422253608703613, "logits/rejected": -1.0543601512908936, "logps/chosen": -744.39208984375, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -1140.43115234375, "loss": 1.1455, "margin_dpo/margin_mean": 358.9933776855469, "margin_dpo/margin_std": 584.0943603515625, "step": 654 }, { "KL/chosen_KL_mean": -647.9940185546875, "KL/mean": -838.0845947265625, "KL/rejected_KL_mean": -1028.1751708984375, "KL/std": 522.7816162109375, "epoch": 0.9618208516886931, "fcm_dpo/beta": 0.0009614527225494385, "fcm_dpo/delta": 0.03576880693435669, "fcm_dpo/margin": 380.1811828613281, "fcm_dpo/q_t": 0.41819822788238525, "grad_norm": 34.595184326171875, "learning_rate": 2.397392281198729e-09, "logits/chosen": -1.0417159795761108, "logits/rejected": -1.0822257995605469, "logps/chosen": -697.57080078125, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -1126.467041015625, "loss": 1.136, "margin_dpo/margin_mean": 380.1811828613281, "margin_dpo/margin_std": 621.2908935546875, "step": 655 }, { "KL/chosen_KL_mean": -692.2393798828125, "KL/mean": -1018.1945190429688, "KL/rejected_KL_mean": -1344.149658203125, "KL/std": 693.30029296875, "epoch": 0.9632892804698973, "fcm_dpo/beta": 0.0009351709159091115, "fcm_dpo/delta": -0.2227155566215515, "fcm_dpo/margin": 651.910400390625, "fcm_dpo/q_t": 0.36535900831222534, "grad_norm": 73.65252685546875, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.9182928800582886, "logits/rejected": -0.9882034063339233, "logps/chosen": -744.788818359375, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -1457.824462890625, "loss": 0.9582, "margin_dpo/margin_mean": 651.910400390625, "margin_dpo/margin_std": 694.8057250976562, "step": 656 }, { "KL/chosen_KL_mean": -656.0784301757812, "KL/mean": -918.288818359375, "KL/rejected_KL_mean": -1180.499267578125, "KL/std": 663.51416015625, "epoch": 0.9647577092511013, "fcm_dpo/beta": 0.0009086633799597621, "fcm_dpo/delta": -0.08051308244466782, "fcm_dpo/margin": 524.4208374023438, "fcm_dpo/q_t": 0.39257729053497314, "grad_norm": 42.068206787109375, "learning_rate": 2.055847060721566e-09, "logits/chosen": -1.087989330291748, "logits/rejected": -1.1325247287750244, "logps/chosen": -702.7789306640625, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -1278.4140625, "loss": 1.0508, "margin_dpo/margin_mean": 524.4208374023438, "margin_dpo/margin_std": 682.353271484375, "step": 657 }, { "KL/chosen_KL_mean": -682.652099609375, "KL/mean": -883.319580078125, "KL/rejected_KL_mean": -1083.987060546875, "KL/std": 494.90802001953125, "epoch": 0.9662261380323054, "fcm_dpo/beta": 0.000907151261344552, "fcm_dpo/delta": 0.037033095955848694, "fcm_dpo/margin": 401.3349609375, "fcm_dpo/q_t": 0.4157608151435852, "grad_norm": 31.92685890197754, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -1.0313150882720947, "logits/rejected": -1.0605497360229492, "logps/chosen": -743.6102905273438, "logps/ref_chosen": -60.95820999145508, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -1179.926513671875, "loss": 1.1177, "margin_dpo/margin_mean": 401.3349609375, "margin_dpo/margin_std": 558.94873046875, "step": 658 }, { "KL/chosen_KL_mean": -622.085205078125, "KL/mean": -824.7811279296875, "KL/rejected_KL_mean": -1027.47705078125, "KL/std": 517.73681640625, "epoch": 0.9676945668135095, "fcm_dpo/beta": 0.0009139457251876593, "fcm_dpo/delta": 0.030485082417726517, "fcm_dpo/margin": 405.39166259765625, "fcm_dpo/q_t": 0.41561028361320496, "grad_norm": 32.02241516113281, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.9981797933578491, "logits/rejected": -0.9754196405410767, "logps/chosen": -698.8282470703125, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -1114.947998046875, "loss": 1.114, "margin_dpo/margin_mean": 405.39166259765625, "margin_dpo/margin_std": 572.6546020507812, "step": 659 }, { "KL/chosen_KL_mean": -681.49755859375, "KL/mean": -938.8291625976562, "KL/rejected_KL_mean": -1196.1607666015625, "KL/std": 619.5654907226562, "epoch": 0.9691629955947136, "fcm_dpo/beta": 0.0009104161872528493, "fcm_dpo/delta": -0.0718650072813034, "fcm_dpo/margin": 514.663330078125, "fcm_dpo/q_t": 0.3939950466156006, "grad_norm": 37.91978454589844, "learning_rate": 1.592541096695571e-09, "logits/chosen": -1.0615503787994385, "logits/rejected": -1.0776853561401367, "logps/chosen": -740.54541015625, "logps/ref_chosen": -59.04788589477539, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -1272.120849609375, "loss": 1.0543, "margin_dpo/margin_mean": 514.663330078125, "margin_dpo/margin_std": 662.3004150390625, "step": 660 }, { "KL/chosen_KL_mean": -598.23583984375, "KL/mean": -829.0057983398438, "KL/rejected_KL_mean": -1059.775634765625, "KL/std": 670.0771484375, "epoch": 0.9706314243759178, "fcm_dpo/beta": 0.0009062248282134533, "fcm_dpo/delta": -0.019264454022049904, "fcm_dpo/margin": 461.5398254394531, "fcm_dpo/q_t": 0.40717989206314087, "grad_norm": 50.24213409423828, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -1.0732464790344238, "logits/rejected": -1.0874643325805664, "logps/chosen": -648.9098510742188, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -1145.7813720703125, "loss": 1.0881, "margin_dpo/margin_mean": 461.53985595703125, "margin_dpo/margin_std": 661.6412963867188, "step": 661 }, { "KL/chosen_KL_mean": -688.9097900390625, "KL/mean": -871.3480224609375, "KL/rejected_KL_mean": -1053.786376953125, "KL/std": 555.1720581054688, "epoch": 0.9720998531571219, "fcm_dpo/beta": 0.0009117955341935158, "fcm_dpo/delta": 0.06958886981010437, "fcm_dpo/margin": 364.87664794921875, "fcm_dpo/q_t": 0.4251037836074829, "grad_norm": 28.40976905822754, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -0.9545935392379761, "logits/rejected": -0.9512023329734802, "logps/chosen": -758.1707763671875, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -1142.84228515625, "loss": 1.1688, "margin_dpo/margin_mean": 364.87664794921875, "margin_dpo/margin_std": 676.1181030273438, "step": 662 }, { "KL/chosen_KL_mean": -660.3707885742188, "KL/mean": -882.6888427734375, "KL/rejected_KL_mean": -1105.0068359375, "KL/std": 639.6807250976562, "epoch": 0.973568281938326, "fcm_dpo/beta": 0.0009125665528699756, "fcm_dpo/delta": -0.006048870272934437, "fcm_dpo/margin": 444.63604736328125, "fcm_dpo/q_t": 0.4114909768104553, "grad_norm": 29.266870498657227, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -1.043156385421753, "logits/rejected": -1.0693552494049072, "logps/chosen": -725.249755859375, "logps/ref_chosen": -64.87890625, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -1218.9322509765625, "loss": 1.1201, "margin_dpo/margin_mean": 444.63604736328125, "margin_dpo/margin_std": 745.1632690429688, "step": 663 }, { "KL/chosen_KL_mean": -671.679931640625, "KL/mean": -922.8211669921875, "KL/rejected_KL_mean": -1173.96240234375, "KL/std": 610.1594848632812, "epoch": 0.9750367107195301, "fcm_dpo/beta": 0.0009023561142385006, "fcm_dpo/delta": -0.05629858374595642, "fcm_dpo/margin": 502.282470703125, "fcm_dpo/q_t": 0.3978724479675293, "grad_norm": 29.01834487915039, "learning_rate": 1.066455926241383e-09, "logits/chosen": -1.0154392719268799, "logits/rejected": -1.0499646663665771, "logps/chosen": -732.568359375, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -1279.484130859375, "loss": 1.063, "margin_dpo/margin_mean": 502.2824401855469, "margin_dpo/margin_std": 654.3357543945312, "step": 664 }, { "KL/chosen_KL_mean": -607.6499633789062, "KL/mean": -818.168701171875, "KL/rejected_KL_mean": -1028.6873779296875, "KL/std": 506.8304138183594, "epoch": 0.9765051395007343, "fcm_dpo/beta": 0.0009059334406629205, "fcm_dpo/delta": 0.01930341310799122, "fcm_dpo/margin": 421.0374755859375, "fcm_dpo/q_t": 0.4117254316806793, "grad_norm": 45.89773941040039, "learning_rate": 9.513254770636137e-10, "logits/chosen": -1.088966965675354, "logits/rejected": -1.1092216968536377, "logps/chosen": -668.214111328125, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.80882263183594, "logps/rejected": -1113.4962158203125, "loss": 1.0939, "margin_dpo/margin_mean": 421.0375061035156, "margin_dpo/margin_std": 522.3038940429688, "step": 665 }, { "KL/chosen_KL_mean": -657.01708984375, "KL/mean": -866.1459350585938, "KL/rejected_KL_mean": -1075.274658203125, "KL/std": 536.670166015625, "epoch": 0.9779735682819384, "fcm_dpo/beta": 0.0009081506868824363, "fcm_dpo/delta": 0.020935581997036934, "fcm_dpo/margin": 418.2576904296875, "fcm_dpo/q_t": 0.4136677384376526, "grad_norm": 33.37041091918945, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.9409841299057007, "logits/rejected": -0.9495470523834229, "logps/chosen": -721.43701171875, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.8916244506836, "logps/rejected": -1171.1663818359375, "loss": 1.1133, "margin_dpo/margin_mean": 418.2577209472656, "margin_dpo/margin_std": 598.1719970703125, "step": 666 }, { "KL/chosen_KL_mean": -755.029052734375, "KL/mean": -1004.461669921875, "KL/rejected_KL_mean": -1253.8944091796875, "KL/std": 589.2806396484375, "epoch": 0.9794419970631424, "fcm_dpo/beta": 0.0009012054651975632, "fcm_dpo/delta": -0.052186060696840286, "fcm_dpo/margin": 498.8653259277344, "fcm_dpo/q_t": 0.3979080319404602, "grad_norm": 38.522857666015625, "learning_rate": 7.407554321417764e-10, "logits/chosen": -0.9830505847930908, "logits/rejected": -0.9843896627426147, "logps/chosen": -824.3060302734375, "logps/ref_chosen": -69.27702331542969, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -1341.7298583984375, "loss": 1.0648, "margin_dpo/margin_mean": 498.86529541015625, "margin_dpo/margin_std": 650.4932861328125, "step": 667 }, { "KL/chosen_KL_mean": -784.714111328125, "KL/mean": -958.1055908203125, "KL/rejected_KL_mean": -1131.4970703125, "KL/std": 627.3168334960938, "epoch": 0.9809104258443465, "fcm_dpo/beta": 0.0009167675743810833, "fcm_dpo/delta": 0.08393767476081848, "fcm_dpo/margin": 346.7828674316406, "fcm_dpo/q_t": 0.43127357959747314, "grad_norm": 44.24299621582031, "learning_rate": 6.453213851142225e-10, "logits/chosen": -1.0314850807189941, "logits/rejected": -1.0380046367645264, "logps/chosen": -857.318115234375, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905944824219, "logps/rejected": -1235.236083984375, "loss": 1.2016, "margin_dpo/margin_mean": 346.7828674316406, "margin_dpo/margin_std": 747.9251098632812, "step": 668 }, { "KL/chosen_KL_mean": -601.5374755859375, "KL/mean": -861.4692993164062, "KL/rejected_KL_mean": -1121.401123046875, "KL/std": 594.085693359375, "epoch": 0.9823788546255506, "fcm_dpo/beta": 0.0009072460234165192, "fcm_dpo/delta": -0.07513141632080078, "fcm_dpo/margin": 519.8635864257812, "fcm_dpo/q_t": 0.3927006125450134, "grad_norm": 29.531373977661133, "learning_rate": 5.564580657695939e-10, "logits/chosen": -1.0162256956100464, "logits/rejected": -1.0264288187026978, "logps/chosen": -647.6539306640625, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -1199.325439453125, "loss": 1.0536, "margin_dpo/margin_mean": 519.8635864257812, "margin_dpo/margin_std": 669.6885375976562, "step": 669 }, { "KL/chosen_KL_mean": -569.926025390625, "KL/mean": -822.7012939453125, "KL/rejected_KL_mean": -1075.4765625, "KL/std": 546.4443359375, "epoch": 0.9838472834067548, "fcm_dpo/beta": 0.0008986732573248446, "fcm_dpo/delta": -0.057006560266017914, "fcm_dpo/margin": 505.550537109375, "fcm_dpo/q_t": 0.396476686000824, "grad_norm": 30.942873001098633, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.9540762901306152, "logits/rejected": -0.972830593585968, "logps/chosen": -632.271728515625, "logps/ref_chosen": -62.34575271606445, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -1172.4169921875, "loss": 1.0656, "margin_dpo/margin_mean": 505.550537109375, "margin_dpo/margin_std": 658.2862548828125, "step": 670 }, { "KL/chosen_KL_mean": -731.1207275390625, "KL/mean": -929.6273803710938, "KL/rejected_KL_mean": -1128.134033203125, "KL/std": 551.165283203125, "epoch": 0.9853157121879589, "fcm_dpo/beta": 0.0009013921953737736, "fcm_dpo/delta": 0.0433029942214489, "fcm_dpo/margin": 397.0133361816406, "fcm_dpo/q_t": 0.4173119068145752, "grad_norm": 34.9372673034668, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -1.0201672315597534, "logits/rejected": -1.0463124513626099, "logps/chosen": -779.120849609375, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -1211.953369140625, "loss": 1.1447, "margin_dpo/margin_mean": 397.0133361816406, "margin_dpo/margin_std": 663.8474731445312, "step": 671 }, { "KL/chosen_KL_mean": -809.3194580078125, "KL/mean": -1013.943603515625, "KL/rejected_KL_mean": -1218.567626953125, "KL/std": 674.4744873046875, "epoch": 0.986784140969163, "fcm_dpo/beta": 0.0009026298066601157, "fcm_dpo/delta": 0.0317508839070797, "fcm_dpo/margin": 409.248291015625, "fcm_dpo/q_t": 0.418613076210022, "grad_norm": 49.699440002441406, "learning_rate": 3.293150240547549e-10, "logits/chosen": -1.111328363418579, "logits/rejected": -1.1195930242538452, "logps/chosen": -867.9027099609375, "logps/ref_chosen": -58.58328628540039, "logps/ref_rejected": -93.14015197753906, "logps/rejected": -1311.7078857421875, "loss": 1.157, "margin_dpo/margin_mean": 409.248291015625, "margin_dpo/margin_std": 746.950927734375, "step": 672 }, { "KL/chosen_KL_mean": -722.28857421875, "KL/mean": -917.1810302734375, "KL/rejected_KL_mean": -1112.073486328125, "KL/std": 562.255615234375, "epoch": 0.9882525697503671, "fcm_dpo/beta": 0.0009116331348195672, "fcm_dpo/delta": 0.04632698372006416, "fcm_dpo/margin": 389.7848205566406, "fcm_dpo/q_t": 0.4198199510574341, "grad_norm": 29.702667236328125, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -1.029843807220459, "logits/rejected": -1.0315158367156982, "logps/chosen": -769.0118408203125, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -1197.36962890625, "loss": 1.1329, "margin_dpo/margin_mean": 389.7848205566406, "margin_dpo/margin_std": 619.7523803710938, "step": 673 }, { "KL/chosen_KL_mean": -580.1348266601562, "KL/mean": -826.81103515625, "KL/rejected_KL_mean": -1073.4873046875, "KL/std": 549.4868774414062, "epoch": 0.9897209985315712, "fcm_dpo/beta": 0.0009095786954276264, "fcm_dpo/delta": -0.05102291703224182, "fcm_dpo/margin": 493.3524475097656, "fcm_dpo/q_t": 0.4003087282180786, "grad_norm": 38.08716583251953, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.9516767263412476, "logits/rejected": -0.9709774255752563, "logps/chosen": -625.580322265625, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -1143.533203125, "loss": 1.0664, "margin_dpo/margin_mean": 493.3524169921875, "margin_dpo/margin_std": 656.5906982421875, "step": 674 }, { "KL/chosen_KL_mean": -678.9006958007812, "KL/mean": -938.8206787109375, "KL/rejected_KL_mean": -1198.7406005859375, "KL/std": 619.1822509765625, "epoch": 0.9911894273127754, "fcm_dpo/beta": 0.000889546936377883, "fcm_dpo/delta": -0.06706520915031433, "fcm_dpo/margin": 519.83984375, "fcm_dpo/q_t": 0.3973570168018341, "grad_norm": 24.625337600708008, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -1.0201187133789062, "logits/rejected": -1.0345721244812012, "logps/chosen": -723.0770263671875, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -1272.83251953125, "loss": 1.0605, "margin_dpo/margin_mean": 519.83984375, "margin_dpo/margin_std": 662.8849487304688, "step": 675 }, { "KL/chosen_KL_mean": -727.82421875, "KL/mean": -965.162109375, "KL/rejected_KL_mean": -1202.5, "KL/std": 583.515869140625, "epoch": 0.9926578560939795, "fcm_dpo/beta": 0.000891472096554935, "fcm_dpo/delta": -0.024206459522247314, "fcm_dpo/margin": 474.67572021484375, "fcm_dpo/q_t": 0.4033673405647278, "grad_norm": 24.022327423095703, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -0.9483212232589722, "logits/rejected": -0.9590877294540405, "logps/chosen": -799.2227783203125, "logps/ref_chosen": -71.39852905273438, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -1290.858642578125, "loss": 1.0732, "margin_dpo/margin_mean": 474.6757507324219, "margin_dpo/margin_std": 593.3427124023438, "step": 676 }, { "KL/chosen_KL_mean": -726.002685546875, "KL/mean": -963.833251953125, "KL/rejected_KL_mean": -1201.663818359375, "KL/std": 609.81103515625, "epoch": 0.9941262848751835, "fcm_dpo/beta": 0.0008846810087561607, "fcm_dpo/delta": -0.021789535880088806, "fcm_dpo/margin": 475.6611022949219, "fcm_dpo/q_t": 0.40949833393096924, "grad_norm": 27.512174606323242, "learning_rate": 8.23423165278725e-11, "logits/chosen": -1.0342793464660645, "logits/rejected": -1.0246810913085938, "logps/chosen": -782.5301513671875, "logps/ref_chosen": -56.527435302734375, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -1279.890380859375, "loss": 1.0982, "margin_dpo/margin_mean": 475.66107177734375, "margin_dpo/margin_std": 737.0035400390625, "step": 677 }, { "KL/chosen_KL_mean": -603.8602294921875, "KL/mean": -871.7525634765625, "KL/rejected_KL_mean": -1139.6448974609375, "KL/std": 642.23291015625, "epoch": 0.9955947136563876, "fcm_dpo/beta": 0.0008793273009359837, "fcm_dpo/delta": -0.07475695013999939, "fcm_dpo/margin": 535.78466796875, "fcm_dpo/q_t": 0.3942733407020569, "grad_norm": 32.770172119140625, "learning_rate": 5.270012410216185e-11, "logits/chosen": -0.9874995946884155, "logits/rejected": -1.0228235721588135, "logps/chosen": -649.9947509765625, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -1220.24951171875, "loss": 1.0583, "margin_dpo/margin_mean": 535.78466796875, "margin_dpo/margin_std": 712.2026977539062, "step": 678 }, { "KL/chosen_KL_mean": -696.7166748046875, "KL/mean": -885.9703369140625, "KL/rejected_KL_mean": -1075.22412109375, "KL/std": 518.1109619140625, "epoch": 0.9970631424375918, "fcm_dpo/beta": 0.0008801834774203598, "fcm_dpo/delta": 0.06918685883283615, "fcm_dpo/margin": 378.50738525390625, "fcm_dpo/q_t": 0.42413240671157837, "grad_norm": 30.4984130859375, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -1.0104858875274658, "logits/rejected": -1.000281810760498, "logps/chosen": -747.0115966796875, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -1151.8221435546875, "loss": 1.1443, "margin_dpo/margin_mean": 378.50738525390625, "margin_dpo/margin_std": 593.6575927734375, "step": 679 }, { "KL/chosen_KL_mean": -685.6356201171875, "KL/mean": -937.115966796875, "KL/rejected_KL_mean": -1188.59619140625, "KL/std": 657.9539184570312, "epoch": 0.9985315712187959, "fcm_dpo/beta": 0.0008748341351747513, "fcm_dpo/delta": -0.04234904423356056, "fcm_dpo/margin": 502.96063232421875, "fcm_dpo/q_t": 0.3994414210319519, "grad_norm": 45.33549118041992, "learning_rate": 1.31753782067201e-11, "logits/chosen": -0.9940932989120483, "logits/rejected": -1.0226861238479614, "logps/chosen": -762.55126953125, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -1300.98095703125, "loss": 1.0878, "margin_dpo/margin_mean": 502.96063232421875, "margin_dpo/margin_std": 730.917724609375, "step": 680 }, { "KL/chosen_KL_mean": -695.575439453125, "KL/mean": -892.23193359375, "KL/rejected_KL_mean": -1088.888427734375, "KL/std": 566.686767578125, "epoch": 1.0, "fcm_dpo/beta": 0.0008728657849133015, "fcm_dpo/delta": -0.041127026081085205, "fcm_dpo/margin": 393.31298828125, "fcm_dpo/q_t": 0.4215954542160034, "grad_norm": 27.740650177001953, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -1.0516822338104248, "logits/rejected": -1.0712807178497314, "logps/chosen": -756.53271484375, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.55797576904297, "logps/rejected": -1177.446533203125, "loss": 1.1472, "margin_dpo/margin_mean": 393.31298828125, "margin_dpo/margin_std": 621.3822631835938, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 1.1094634984558374, "train_runtime": 1738.7131, "train_samples_per_second": 25.075, "train_steps_per_second": 0.392 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }