{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 200, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "KL/chosen_KL_mean": 0.02867889404296875, "KL/mean": 0.029354453086853027, "KL/rejected_KL_mean": 0.030029296875, "KL/std": 0.2071000635623932, "epoch": 0.0015117157974300832, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.0013532638549804688, "fcm_dpo/q_t": 0.5000972747802734, "grad_norm": 84.81343841552734, "learning_rate": 0.0, "logits/chosen": 0.13337239623069763, "logits/rejected": 0.12492949515581131, "logps/chosen": -64.5841293334961, "logps/ref_chosen": -64.61280822753906, "logps/ref_rejected": -64.17195129394531, "logps/rejected": -64.14192199707031, "loss": 1.3882, "margin_dpo/margin_mean": -0.0013527870178222656, "margin_dpo/margin_std": 0.2561596930027008, "step": 1 }, { "KL/chosen_KL_mean": -0.00289154052734375, "KL/mean": -0.021616414189338684, "KL/rejected_KL_mean": -0.04033660888671875, "KL/std": 0.19624735414981842, "epoch": 0.0030234315948601664, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.037450045347213745, "fcm_dpo/q_t": 0.4971959590911865, "grad_norm": 83.32388305664062, "learning_rate": 7.462686567164179e-09, "logits/chosen": 0.09414851665496826, "logits/rejected": 0.07363267242908478, "logps/chosen": -56.101890563964844, "logps/ref_chosen": -56.0989990234375, "logps/ref_rejected": -66.59971618652344, "logps/rejected": -66.64006042480469, "loss": 1.3769, "margin_dpo/margin_mean": 0.03744968771934509, "margin_dpo/margin_std": 0.27811938524246216, "step": 2 }, { "KL/chosen_KL_mean": 0.033405303955078125, "KL/mean": 0.020700395107269287, "KL/rejected_KL_mean": 0.007991790771484375, "KL/std": 0.2221832275390625, "epoch": 0.0045351473922902496, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.025411665439605713, "fcm_dpo/q_t": 0.49810415506362915, "grad_norm": 93.74189758300781, "learning_rate": 1.4925373134328357e-08, "logits/chosen": 0.09963999688625336, "logits/rejected": 0.061615269631147385, "logps/chosen": -65.42385864257812, "logps/ref_chosen": -65.45726013183594, "logps/ref_rejected": -90.82853698730469, "logps/rejected": -90.82054138183594, "loss": 1.3804, "margin_dpo/margin_mean": 0.025411784648895264, "margin_dpo/margin_std": 0.2622186541557312, "step": 3 }, { "KL/chosen_KL_mean": 0.017333984375, "KL/mean": 0.0062576234340667725, "KL/rejected_KL_mean": -0.00482177734375, "KL/std": 0.231220081448555, "epoch": 0.006046863189720333, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.022152245044708252, "fcm_dpo/q_t": 0.4983437657356262, "grad_norm": 103.19758605957031, "learning_rate": 2.2388059701492534e-08, "logits/chosen": 0.10081079602241516, "logits/rejected": 0.0847533643245697, "logps/chosen": -76.84284973144531, "logps/ref_chosen": -76.86018371582031, "logps/ref_rejected": -79.91523742675781, "logps/rejected": -79.92005920410156, "loss": 1.3816, "margin_dpo/margin_mean": 0.022151529788970947, "margin_dpo/margin_std": 0.2929977774620056, "step": 4 }, { "KL/chosen_KL_mean": 0.017900466918945312, "KL/mean": -0.008704200387001038, "KL/rejected_KL_mean": -0.035305023193359375, "KL/std": 0.23076409101486206, "epoch": 0.007558578987150416, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.05320963263511658, "fcm_dpo/q_t": 0.4960266351699829, "grad_norm": 87.17352294921875, "learning_rate": 2.9850746268656714e-08, "logits/chosen": 0.08232609927654266, "logits/rejected": 0.04349789023399353, "logps/chosen": -62.95343780517578, "logps/ref_chosen": -62.97134017944336, "logps/ref_rejected": -79.9192123413086, "logps/rejected": -79.95451354980469, "loss": 1.3728, "margin_dpo/margin_mean": 0.053210049867630005, "margin_dpo/margin_std": 0.32491230964660645, "step": 5 }, { "KL/chosen_KL_mean": -0.041103363037109375, "KL/mean": -0.004406198859214783, "KL/rejected_KL_mean": 0.03229522705078125, "KL/std": 0.23096251487731934, "epoch": 0.009070294784580499, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.07339546084403992, "fcm_dpo/q_t": 0.5054706931114197, "grad_norm": 90.77230834960938, "learning_rate": 3.731343283582089e-08, "logits/chosen": 0.12801773846149445, "logits/rejected": 0.089537113904953, "logps/chosen": -51.34846496582031, "logps/ref_chosen": -51.30736541748047, "logps/ref_rejected": -82.77239227294922, "logps/rejected": -82.74009704589844, "loss": 1.4107, "margin_dpo/margin_mean": -0.07339566946029663, "margin_dpo/margin_std": 0.3095110356807709, "step": 6 }, { "KL/chosen_KL_mean": -0.022672653198242188, "KL/mean": -0.006443768739700317, "KL/rejected_KL_mean": 0.009784698486328125, "KL/std": 0.19946160912513733, "epoch": 0.010582010582010581, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.03245997428894043, "fcm_dpo/q_t": 0.5024325251579285, "grad_norm": 82.3846435546875, "learning_rate": 4.477611940298507e-08, "logits/chosen": 0.02592495083808899, "logits/rejected": -0.018398292362689972, "logps/chosen": -51.482086181640625, "logps/ref_chosen": -51.45941162109375, "logps/ref_rejected": -66.3828125, "logps/rejected": -66.3730239868164, "loss": 1.3973, "margin_dpo/margin_mean": -0.03246006369590759, "margin_dpo/margin_std": 0.23344238102436066, "step": 7 }, { "KL/chosen_KL_mean": 0.013820648193359375, "KL/mean": 0.006065875291824341, "KL/rejected_KL_mean": -0.001689910888671875, "KL/std": 0.24223633110523224, "epoch": 0.012093726379440665, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.015507936477661133, "fcm_dpo/q_t": 0.49885261058807373, "grad_norm": 84.78572082519531, "learning_rate": 5.223880597014925e-08, "logits/chosen": 0.06976951658725739, "logits/rejected": 0.04768180847167969, "logps/chosen": -62.18372344970703, "logps/ref_chosen": -62.197547912597656, "logps/ref_rejected": -74.66180419921875, "logps/rejected": -74.66349792480469, "loss": 1.3849, "margin_dpo/margin_mean": 0.015507668256759644, "margin_dpo/margin_std": 0.37152230739593506, "step": 8 }, { "KL/chosen_KL_mean": -0.012002944946289062, "KL/mean": -0.03779882192611694, "KL/rejected_KL_mean": -0.0635986328125, "KL/std": 0.2661692202091217, "epoch": 0.013605442176870748, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0515933632850647, "fcm_dpo/q_t": 0.4961661696434021, "grad_norm": 93.60535430908203, "learning_rate": 5.970149253731343e-08, "logits/chosen": 0.15112018585205078, "logits/rejected": 0.09316952526569366, "logps/chosen": -55.6417236328125, "logps/ref_chosen": -55.629722595214844, "logps/ref_rejected": -86.21221923828125, "logps/rejected": -86.27581787109375, "loss": 1.3746, "margin_dpo/margin_mean": 0.051593929529190063, "margin_dpo/margin_std": 0.3885486423969269, "step": 9 }, { "KL/chosen_KL_mean": 0.027301788330078125, "KL/mean": 0.024095460772514343, "KL/rejected_KL_mean": 0.020885467529296875, "KL/std": 0.22871175408363342, "epoch": 0.015117157974300832, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.006406038999557495, "fcm_dpo/q_t": 0.4995288550853729, "grad_norm": 89.24808502197266, "learning_rate": 6.71641791044776e-08, "logits/chosen": 0.1497282236814499, "logits/rejected": 0.11771087348461151, "logps/chosen": -62.663299560546875, "logps/ref_chosen": -62.69060134887695, "logps/ref_rejected": -90.610107421875, "logps/rejected": -90.58922576904297, "loss": 1.387, "margin_dpo/margin_mean": 0.00640559196472168, "margin_dpo/margin_std": 0.3381892442703247, "step": 10 }, { "KL/chosen_KL_mean": 0.03916168212890625, "KL/mean": -0.0018206536769866943, "KL/rejected_KL_mean": -0.042804718017578125, "KL/std": 0.21469756960868835, "epoch": 0.016628873771730914, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0819656252861023, "fcm_dpo/q_t": 0.49387264251708984, "grad_norm": 86.24508666992188, "learning_rate": 7.462686567164178e-08, "logits/chosen": 0.09704963862895966, "logits/rejected": 0.09043502807617188, "logps/chosen": -65.72795867919922, "logps/ref_chosen": -65.76712036132812, "logps/ref_rejected": -72.4764633178711, "logps/rejected": -72.51927185058594, "loss": 1.3639, "margin_dpo/margin_mean": 0.08196571469306946, "margin_dpo/margin_std": 0.29107633233070374, "step": 11 }, { "KL/chosen_KL_mean": -0.01404571533203125, "KL/mean": -0.011210396885871887, "KL/rejected_KL_mean": -0.008375167846679688, "KL/std": 0.20475125312805176, "epoch": 0.018140589569160998, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.00567290186882019, "fcm_dpo/q_t": 0.5004265308380127, "grad_norm": 85.0528564453125, "learning_rate": 8.208955223880596e-08, "logits/chosen": 0.01019902154803276, "logits/rejected": -0.005590981803834438, "logps/chosen": -60.718936920166016, "logps/ref_chosen": -60.704891204833984, "logps/ref_rejected": -69.41564178466797, "logps/rejected": -69.42401885986328, "loss": 1.3901, "margin_dpo/margin_mean": -0.005672812461853027, "margin_dpo/margin_std": 0.3051266372203827, "step": 12 }, { "KL/chosen_KL_mean": -0.02446746826171875, "KL/mean": -0.00045931339263916016, "KL/rejected_KL_mean": 0.0235595703125, "KL/std": 0.20995768904685974, "epoch": 0.019652305366591082, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.04801982641220093, "fcm_dpo/q_t": 0.503569483757019, "grad_norm": 88.48363494873047, "learning_rate": 8.955223880597014e-08, "logits/chosen": 0.10459847003221512, "logits/rejected": 0.042581334710121155, "logps/chosen": -49.93372344970703, "logps/ref_chosen": -49.90925598144531, "logps/ref_rejected": -92.37818145751953, "logps/rejected": -92.35462188720703, "loss": 1.4033, "margin_dpo/margin_mean": -0.04802015423774719, "margin_dpo/margin_std": 0.3207463324069977, "step": 13 }, { "KL/chosen_KL_mean": -0.035037994384765625, "KL/mean": -0.013952985405921936, "KL/rejected_KL_mean": 0.00713348388671875, "KL/std": 0.22607147693634033, "epoch": 0.021164021164021163, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.04217222332954407, "fcm_dpo/q_t": 0.5031682252883911, "grad_norm": 89.4442367553711, "learning_rate": 9.701492537313432e-08, "logits/chosen": 0.08840907365083694, "logits/rejected": 0.07058637589216232, "logps/chosen": -60.653831481933594, "logps/ref_chosen": -60.61879348754883, "logps/ref_rejected": -71.79306030273438, "logps/rejected": -71.78592681884766, "loss": 1.4014, "margin_dpo/margin_mean": -0.042172253131866455, "margin_dpo/margin_std": 0.31967276334762573, "step": 14 }, { "KL/chosen_KL_mean": -0.00018310546875, "KL/mean": -0.00222662091255188, "KL/rejected_KL_mean": -0.004276275634765625, "KL/std": 0.21713778376579285, "epoch": 0.022675736961451247, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.00408780574798584, "fcm_dpo/q_t": 0.49969691038131714, "grad_norm": 99.908447265625, "learning_rate": 1.044776119402985e-07, "logits/chosen": 0.07341223210096359, "logits/rejected": 0.02995806373655796, "logps/chosen": -63.46971893310547, "logps/ref_chosen": -63.46953582763672, "logps/ref_rejected": -88.88951110839844, "logps/rejected": -88.89379119873047, "loss": 1.3872, "margin_dpo/margin_mean": 0.004087239503860474, "margin_dpo/margin_std": 0.3059452772140503, "step": 15 }, { "KL/chosen_KL_mean": -0.012371063232421875, "KL/mean": -0.013426609337329865, "KL/rejected_KL_mean": -0.014484405517578125, "KL/std": 0.1921122968196869, "epoch": 0.02418745275888133, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.0021147578954696655, "fcm_dpo/q_t": 0.49984389543533325, "grad_norm": 80.92469787597656, "learning_rate": 1.1194029850746268e-07, "logits/chosen": 0.12906810641288757, "logits/rejected": 0.09102576971054077, "logps/chosen": -46.54467010498047, "logps/ref_chosen": -46.53229904174805, "logps/ref_rejected": -74.27533721923828, "logps/rejected": -74.28982543945312, "loss": 1.3873, "margin_dpo/margin_mean": 0.002114519476890564, "margin_dpo/margin_std": 0.2640277147293091, "step": 16 }, { "KL/chosen_KL_mean": -0.012212753295898438, "KL/mean": 0.0008438229560852051, "KL/rejected_KL_mean": 0.013904571533203125, "KL/std": 0.2587102949619293, "epoch": 0.025699168556311415, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.026117920875549316, "fcm_dpo/q_t": 0.5019273161888123, "grad_norm": 101.69012451171875, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 0.052943162620067596, "logits/rejected": 0.03429335355758667, "logps/chosen": -64.09004211425781, "logps/ref_chosen": -64.07783508300781, "logps/ref_rejected": -86.40876770019531, "logps/rejected": -86.39486694335938, "loss": 1.3977, "margin_dpo/margin_mean": -0.026118546724319458, "margin_dpo/margin_std": 0.38860464096069336, "step": 17 }, { "KL/chosen_KL_mean": 0.048213958740234375, "KL/mean": 0.015008881688117981, "KL/rejected_KL_mean": -0.018199920654296875, "KL/std": 0.2065563201904297, "epoch": 0.027210884353741496, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.06641146540641785, "fcm_dpo/q_t": 0.49501660466194153, "grad_norm": 82.85562133789062, "learning_rate": 1.2686567164179106e-07, "logits/chosen": 0.08620916306972504, "logits/rejected": 0.041173599660396576, "logps/chosen": -44.82612228393555, "logps/ref_chosen": -44.87433624267578, "logps/ref_rejected": -70.97604370117188, "logps/rejected": -70.99424743652344, "loss": 1.368, "margin_dpo/margin_mean": 0.06641131639480591, "margin_dpo/margin_std": 0.25283902883529663, "step": 18 }, { "KL/chosen_KL_mean": 0.051776885986328125, "KL/mean": 0.008657693862915039, "KL/rejected_KL_mean": -0.034465789794921875, "KL/std": 0.26961037516593933, "epoch": 0.02872260015117158, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.08624601364135742, "fcm_dpo/q_t": 0.49355414509773254, "grad_norm": 91.98597717285156, "learning_rate": 1.343283582089552e-07, "logits/chosen": 0.06989361345767975, "logits/rejected": 0.05635995790362358, "logps/chosen": -68.1080322265625, "logps/ref_chosen": -68.1598129272461, "logps/ref_rejected": -81.17138671875, "logps/rejected": -81.20585632324219, "loss": 1.3634, "margin_dpo/margin_mean": 0.08624613285064697, "margin_dpo/margin_std": 0.35482057929039, "step": 19 }, { "KL/chosen_KL_mean": -0.011426925659179688, "KL/mean": -0.0012049600481987, "KL/rejected_KL_mean": 0.00902557373046875, "KL/std": 0.20342442393302917, "epoch": 0.030234315948601664, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.020451828837394714, "fcm_dpo/q_t": 0.5015274286270142, "grad_norm": 88.2505111694336, "learning_rate": 1.4179104477611938e-07, "logits/chosen": 0.10274842381477356, "logits/rejected": 0.08114857971668243, "logps/chosen": -53.68998718261719, "logps/ref_chosen": -53.67856216430664, "logps/ref_rejected": -74.16911315917969, "logps/rejected": -74.16009521484375, "loss": 1.3943, "margin_dpo/margin_mean": -0.020452216267585754, "margin_dpo/margin_std": 0.28839802742004395, "step": 20 }, { "KL/chosen_KL_mean": 0.0011920928955078125, "KL/mean": 0.021433278918266296, "KL/rejected_KL_mean": 0.041675567626953125, "KL/std": 0.2135191559791565, "epoch": 0.031746031746031744, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.040478020906448364, "fcm_dpo/q_t": 0.5030294060707092, "grad_norm": 89.75379180908203, "learning_rate": 1.4925373134328355e-07, "logits/chosen": 0.12021639943122864, "logits/rejected": 0.09408889710903168, "logps/chosen": -64.70036315917969, "logps/ref_chosen": -64.70155334472656, "logps/ref_rejected": -81.02095031738281, "logps/rejected": -80.9792709350586, "loss": 1.4002, "margin_dpo/margin_mean": -0.040478646755218506, "margin_dpo/margin_std": 0.275388240814209, "step": 21 }, { "KL/chosen_KL_mean": -0.03302574157714844, "KL/mean": -0.022005170583724976, "KL/rejected_KL_mean": -0.010982513427734375, "KL/std": 0.21539588272571564, "epoch": 0.03325774754346183, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.022036850452423096, "fcm_dpo/q_t": 0.5016478300094604, "grad_norm": 87.7178955078125, "learning_rate": 1.5671641791044775e-07, "logits/chosen": -0.014869213104248047, "logits/rejected": -0.0348513200879097, "logps/chosen": -58.06901550292969, "logps/ref_chosen": -58.03599166870117, "logps/ref_rejected": -80.72721862792969, "logps/rejected": -80.73820495605469, "loss": 1.3949, "margin_dpo/margin_mean": -0.02203691005706787, "margin_dpo/margin_std": 0.29411375522613525, "step": 22 }, { "KL/chosen_KL_mean": 0.0063323974609375, "KL/mean": -0.019633755087852478, "KL/rejected_KL_mean": -0.04560089111328125, "KL/std": 0.24943846464157104, "epoch": 0.03476946334089191, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.05193856358528137, "fcm_dpo/q_t": 0.4961240291595459, "grad_norm": 99.2575912475586, "learning_rate": 1.6417910447761193e-07, "logits/chosen": 0.15819749236106873, "logits/rejected": 0.13187375664710999, "logps/chosen": -66.3497543334961, "logps/ref_chosen": -66.35608673095703, "logps/ref_rejected": -93.02769470214844, "logps/rejected": -93.07328796386719, "loss": 1.3736, "margin_dpo/margin_mean": 0.0519389808177948, "margin_dpo/margin_std": 0.35010451078414917, "step": 23 }, { "KL/chosen_KL_mean": -0.020721435546875, "KL/mean": -0.015026941895484924, "KL/rejected_KL_mean": -0.009332656860351562, "KL/std": 0.20408214628696442, "epoch": 0.036281179138321996, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.011387258768081665, "fcm_dpo/q_t": 0.5008534789085388, "grad_norm": 79.21713256835938, "learning_rate": 1.716417910447761e-07, "logits/chosen": 0.13129377365112305, "logits/rejected": 0.09845592081546783, "logps/chosen": -54.48196029663086, "logps/ref_chosen": -54.461238861083984, "logps/ref_rejected": -68.33817291259766, "logps/rejected": -68.34750366210938, "loss": 1.3914, "margin_dpo/margin_mean": -0.011387407779693604, "margin_dpo/margin_std": 0.2734227776527405, "step": 24 }, { "KL/chosen_KL_mean": -0.06030082702636719, "KL/mean": -0.049270108342170715, "KL/rejected_KL_mean": -0.038238525390625, "KL/std": 0.2109871208667755, "epoch": 0.03779289493575208, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.022065013647079468, "fcm_dpo/q_t": 0.5016451478004456, "grad_norm": 89.02693176269531, "learning_rate": 1.7910447761194027e-07, "logits/chosen": 0.12795281410217285, "logits/rejected": 0.07486993819475174, "logps/chosen": -60.06450653076172, "logps/ref_chosen": -60.00420379638672, "logps/ref_rejected": -90.47376251220703, "logps/rejected": -90.51200103759766, "loss": 1.3947, "margin_dpo/margin_mean": -0.022065043449401855, "margin_dpo/margin_std": 0.28272759914398193, "step": 25 }, { "KL/chosen_KL_mean": -0.015651702880859375, "KL/mean": -0.0425771027803421, "KL/rejected_KL_mean": -0.06949996948242188, "KL/std": 0.22068238258361816, "epoch": 0.039304610733182165, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.05385205149650574, "fcm_dpo/q_t": 0.4959774613380432, "grad_norm": 88.33043670654297, "learning_rate": 1.8656716417910447e-07, "logits/chosen": 0.09218540787696838, "logits/rejected": 0.07426909357309341, "logps/chosen": -56.834808349609375, "logps/ref_chosen": -56.81915283203125, "logps/ref_rejected": -77.84333038330078, "logps/rejected": -77.91282653808594, "loss": 1.3722, "margin_dpo/margin_mean": 0.05385178327560425, "margin_dpo/margin_std": 0.2991413176059723, "step": 26 }, { "KL/chosen_KL_mean": -0.004428863525390625, "KL/mean": -0.02671918272972107, "KL/rejected_KL_mean": -0.04901123046875, "KL/std": 0.2074870467185974, "epoch": 0.04081632653061224, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.04458415508270264, "fcm_dpo/q_t": 0.4966648817062378, "grad_norm": 85.84480285644531, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 0.1258346140384674, "logits/rejected": 0.09999653697013855, "logps/chosen": -62.88145446777344, "logps/ref_chosen": -62.87702560424805, "logps/ref_rejected": -71.34437561035156, "logps/rejected": -71.39338684082031, "loss": 1.3751, "margin_dpo/margin_mean": 0.04458439350128174, "margin_dpo/margin_std": 0.30327099561691284, "step": 27 }, { "KL/chosen_KL_mean": -0.015651702880859375, "KL/mean": -0.006240963935852051, "KL/rejected_KL_mean": 0.00316619873046875, "KL/std": 0.2177903950214386, "epoch": 0.042328042328042326, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.01882070302963257, "fcm_dpo/q_t": 0.5014153718948364, "grad_norm": 83.35669708251953, "learning_rate": 2.0149253731343282e-07, "logits/chosen": 0.06463215500116348, "logits/rejected": 0.05566532164812088, "logps/chosen": -59.849029541015625, "logps/ref_chosen": -59.8333740234375, "logps/ref_rejected": -70.39804077148438, "logps/rejected": -70.39486694335938, "loss": 1.3942, "margin_dpo/margin_mean": -0.018820255994796753, "margin_dpo/margin_std": 0.31055182218551636, "step": 28 }, { "KL/chosen_KL_mean": 0.0048370361328125, "KL/mean": -0.006291203200817108, "KL/rejected_KL_mean": -0.017414093017578125, "KL/std": 0.24426929652690887, "epoch": 0.04383975812547241, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.022249296307563782, "fcm_dpo/q_t": 0.49833962321281433, "grad_norm": 98.00789642333984, "learning_rate": 2.08955223880597e-07, "logits/chosen": 0.11844631284475327, "logits/rejected": 0.1013779491186142, "logps/chosen": -74.11536407470703, "logps/ref_chosen": -74.12020111083984, "logps/ref_rejected": -83.33099365234375, "logps/rejected": -83.34840393066406, "loss": 1.3828, "margin_dpo/margin_mean": 0.022249162197113037, "margin_dpo/margin_std": 0.36899277567863464, "step": 29 }, { "KL/chosen_KL_mean": -0.04900550842285156, "KL/mean": -0.04353049397468567, "KL/rejected_KL_mean": -0.038059234619140625, "KL/std": 0.22240221500396729, "epoch": 0.045351473922902494, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": -0.010949134826660156, "fcm_dpo/q_t": 0.5008254647254944, "grad_norm": 91.40675354003906, "learning_rate": 2.1641791044776117e-07, "logits/chosen": 0.11927343904972076, "logits/rejected": 0.06514443457126617, "logps/chosen": -50.80029296875, "logps/ref_chosen": -50.75128936767578, "logps/ref_rejected": -89.29063415527344, "logps/rejected": -89.32868957519531, "loss": 1.3919, "margin_dpo/margin_mean": -0.010949641466140747, "margin_dpo/margin_std": 0.31788113713264465, "step": 30 }, { "KL/chosen_KL_mean": 0.0055103302001953125, "KL/mean": -0.05117788910865784, "KL/rejected_KL_mean": -0.10786819458007812, "KL/std": 0.24446845054626465, "epoch": 0.04686318972033258, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11337786912918091, "fcm_dpo/q_t": 0.49152177572250366, "grad_norm": 100.92430877685547, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 0.1095237135887146, "logits/rejected": 0.06315511465072632, "logps/chosen": -65.33124542236328, "logps/ref_chosen": -65.33675384521484, "logps/ref_rejected": -100.76666259765625, "logps/rejected": -100.87452697753906, "loss": 1.3546, "margin_dpo/margin_mean": 0.11337828636169434, "margin_dpo/margin_std": 0.2907155156135559, "step": 31 }, { "KL/chosen_KL_mean": -0.033222198486328125, "KL/mean": -0.04459533095359802, "KL/rejected_KL_mean": -0.055965423583984375, "KL/std": 0.2614104151725769, "epoch": 0.04837490551776266, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.022748589515686035, "fcm_dpo/q_t": 0.4983008801937103, "grad_norm": 90.58628845214844, "learning_rate": 2.3134328358208954e-07, "logits/chosen": 0.08372671902179718, "logits/rejected": 0.07587000727653503, "logps/chosen": -67.216552734375, "logps/ref_chosen": -67.18333435058594, "logps/ref_rejected": -82.80763244628906, "logps/rejected": -82.86360168457031, "loss": 1.3821, "margin_dpo/margin_mean": 0.02274876832962036, "margin_dpo/margin_std": 0.33092159032821655, "step": 32 }, { "KL/chosen_KL_mean": -0.021730422973632812, "KL/mean": -0.03515494614839554, "KL/rejected_KL_mean": -0.048580169677734375, "KL/std": 0.25206100940704346, "epoch": 0.049886621315192746, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.026843026280403137, "fcm_dpo/q_t": 0.497951865196228, "grad_norm": 94.47178649902344, "learning_rate": 2.388059701492537e-07, "logits/chosen": 0.024030443280935287, "logits/rejected": -0.0015371758490800858, "logps/chosen": -64.06121063232422, "logps/ref_chosen": -64.03948211669922, "logps/ref_rejected": -75.68357849121094, "logps/rejected": -75.73216247558594, "loss": 1.3817, "margin_dpo/margin_mean": 0.02684326469898224, "margin_dpo/margin_std": 0.3841768503189087, "step": 33 }, { "KL/chosen_KL_mean": -0.04722023010253906, "KL/mean": -0.07731017470359802, "KL/rejected_KL_mean": -0.10739898681640625, "KL/std": 0.18384665250778198, "epoch": 0.05139833711262283, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.060186147689819336, "fcm_dpo/q_t": 0.4954932630062103, "grad_norm": 85.21406555175781, "learning_rate": 2.4626865671641786e-07, "logits/chosen": 0.07339510321617126, "logits/rejected": 0.04473770409822464, "logps/chosen": -53.711517333984375, "logps/ref_chosen": -53.6642951965332, "logps/ref_rejected": -65.77989959716797, "logps/rejected": -65.88729858398438, "loss": 1.3695, "margin_dpo/margin_mean": 0.0601862370967865, "margin_dpo/margin_std": 0.21969038248062134, "step": 34 }, { "KL/chosen_KL_mean": -0.07001876831054688, "KL/mean": -0.07800742983818054, "KL/rejected_KL_mean": -0.08599090576171875, "KL/std": 0.23264986276626587, "epoch": 0.05291005291005291, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.01597297191619873, "fcm_dpo/q_t": 0.4988135099411011, "grad_norm": 83.28887939453125, "learning_rate": 2.537313432835821e-07, "logits/chosen": 0.05433880537748337, "logits/rejected": 0.031922899186611176, "logps/chosen": -61.086883544921875, "logps/ref_chosen": -61.01686096191406, "logps/ref_rejected": -72.78598022460938, "logps/rejected": -72.87197875976562, "loss": 1.384, "margin_dpo/margin_mean": 0.015972524881362915, "margin_dpo/margin_std": 0.3337644934654236, "step": 35 }, { "KL/chosen_KL_mean": -0.07984733581542969, "KL/mean": -0.08086289465427399, "KL/rejected_KL_mean": -0.081878662109375, "KL/std": 0.2356133759021759, "epoch": 0.05442176870748299, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.002035290002822876, "fcm_dpo/q_t": 0.4998547434806824, "grad_norm": 86.62390899658203, "learning_rate": 2.611940298507462e-07, "logits/chosen": 0.1039305329322815, "logits/rejected": 0.050773825496435165, "logps/chosen": -50.617210388183594, "logps/ref_chosen": -50.53736114501953, "logps/ref_rejected": -78.11678314208984, "logps/rejected": -78.19866180419922, "loss": 1.3888, "margin_dpo/margin_mean": 0.002035379409790039, "margin_dpo/margin_std": 0.3694022595882416, "step": 36 }, { "KL/chosen_KL_mean": -0.05243110656738281, "KL/mean": -0.11357352137565613, "KL/rejected_KL_mean": -0.17471694946289062, "KL/std": 0.2989353537559509, "epoch": 0.055933484504913075, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.12227410078048706, "fcm_dpo/q_t": 0.4909391403198242, "grad_norm": 111.64187622070312, "learning_rate": 2.686567164179104e-07, "logits/chosen": 0.1005006730556488, "logits/rejected": 0.021147655323147774, "logps/chosen": -59.60637664794922, "logps/ref_chosen": -59.55394744873047, "logps/ref_rejected": -108.27702331542969, "logps/rejected": -108.45174407958984, "loss": 1.3544, "margin_dpo/margin_mean": 0.12227392196655273, "margin_dpo/margin_std": 0.43613699078559875, "step": 37 }, { "KL/chosen_KL_mean": -0.08492851257324219, "KL/mean": -0.09462648630142212, "KL/rejected_KL_mean": -0.1043243408203125, "KL/std": 0.26704442501068115, "epoch": 0.05744520030234316, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.01939481496810913, "fcm_dpo/q_t": 0.49856406450271606, "grad_norm": 88.78561401367188, "learning_rate": 2.761194029850746e-07, "logits/chosen": 0.05304524302482605, "logits/rejected": 0.0392342135310173, "logps/chosen": -65.87328338623047, "logps/ref_chosen": -65.78836059570312, "logps/ref_rejected": -76.1619873046875, "logps/rejected": -76.26631164550781, "loss": 1.3841, "margin_dpo/margin_mean": 0.019394874572753906, "margin_dpo/margin_std": 0.3750036358833313, "step": 38 }, { "KL/chosen_KL_mean": -0.09122467041015625, "KL/mean": -0.12790058553218842, "KL/rejected_KL_mean": -0.16457366943359375, "KL/std": 0.2797583341598511, "epoch": 0.05895691609977324, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.07334819436073303, "fcm_dpo/q_t": 0.4944817125797272, "grad_norm": 87.12678527832031, "learning_rate": 2.8358208955223876e-07, "logits/chosen": 0.15867194533348083, "logits/rejected": 0.13192220032215118, "logps/chosen": -57.268035888671875, "logps/ref_chosen": -57.17681121826172, "logps/ref_rejected": -79.486328125, "logps/rejected": -79.65090942382812, "loss": 1.3679, "margin_dpo/margin_mean": 0.07334855198860168, "margin_dpo/margin_std": 0.3873194754123688, "step": 39 }, { "KL/chosen_KL_mean": -0.06866455078125, "KL/mean": -0.0880993902683258, "KL/rejected_KL_mean": -0.10753631591796875, "KL/std": 0.22621940076351166, "epoch": 0.06046863189720333, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.038871049880981445, "fcm_dpo/q_t": 0.4970887005329132, "grad_norm": 93.81474304199219, "learning_rate": 2.9104477611940296e-07, "logits/chosen": 0.12493468821048737, "logits/rejected": 0.0745604857802391, "logps/chosen": -61.40283203125, "logps/ref_chosen": -61.33416748046875, "logps/ref_rejected": -79.10697174072266, "logps/rejected": -79.21450805664062, "loss": 1.377, "margin_dpo/margin_mean": 0.03887134790420532, "margin_dpo/margin_std": 0.3202136158943176, "step": 40 }, { "KL/chosen_KL_mean": -0.1145477294921875, "KL/mean": -0.16746143996715546, "KL/rejected_KL_mean": -0.2203693389892578, "KL/std": 0.29607999324798584, "epoch": 0.06198034769463341, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.10581693053245544, "fcm_dpo/q_t": 0.4921224117279053, "grad_norm": 90.43517303466797, "learning_rate": 2.985074626865671e-07, "logits/chosen": 0.0521029531955719, "logits/rejected": 0.03200225159525871, "logps/chosen": -67.6612777709961, "logps/ref_chosen": -67.5467300415039, "logps/ref_rejected": -83.87788391113281, "logps/rejected": -84.09825897216797, "loss": 1.3586, "margin_dpo/margin_mean": 0.1058172881603241, "margin_dpo/margin_std": 0.41018247604370117, "step": 41 }, { "KL/chosen_KL_mean": -0.13617897033691406, "KL/mean": -0.15122415125370026, "KL/rejected_KL_mean": -0.1662750244140625, "KL/std": 0.2629133462905884, "epoch": 0.06349206349206349, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.030096739530563354, "fcm_dpo/q_t": 0.49774858355522156, "grad_norm": 87.64628601074219, "learning_rate": 3.059701492537313e-07, "logits/chosen": 0.06373605132102966, "logits/rejected": 0.04182063788175583, "logps/chosen": -61.401039123535156, "logps/ref_chosen": -61.26485824584961, "logps/ref_rejected": -76.3629150390625, "logps/rejected": -76.52919006347656, "loss": 1.3806, "margin_dpo/margin_mean": 0.030096828937530518, "margin_dpo/margin_std": 0.37984082102775574, "step": 42 }, { "KL/chosen_KL_mean": -0.09618759155273438, "KL/mean": -0.11588230729103088, "KL/rejected_KL_mean": -0.13557052612304688, "KL/std": 0.322396457195282, "epoch": 0.06500377928949358, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.039386093616485596, "fcm_dpo/q_t": 0.4970521330833435, "grad_norm": 102.6285400390625, "learning_rate": 3.134328358208955e-07, "logits/chosen": 0.07803015410900116, "logits/rejected": 0.06729613244533539, "logps/chosen": -71.90521240234375, "logps/ref_chosen": -71.80902862548828, "logps/ref_rejected": -81.12464141845703, "logps/rejected": -81.26020812988281, "loss": 1.3782, "margin_dpo/margin_mean": 0.03938618302345276, "margin_dpo/margin_std": 0.40487977862358093, "step": 43 }, { "KL/chosen_KL_mean": -0.13219261169433594, "KL/mean": -0.16115543246269226, "KL/rejected_KL_mean": -0.19011688232421875, "KL/std": 0.28901779651641846, "epoch": 0.06651549508692366, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.05792546272277832, "fcm_dpo/q_t": 0.49570155143737793, "grad_norm": 96.78971099853516, "learning_rate": 3.2089552238805965e-07, "logits/chosen": 0.03522084280848503, "logits/rejected": 0.00526299886405468, "logps/chosen": -66.68262481689453, "logps/ref_chosen": -66.55043029785156, "logps/ref_rejected": -85.06198120117188, "logps/rejected": -85.2520980834961, "loss": 1.3728, "margin_dpo/margin_mean": 0.05792495608329773, "margin_dpo/margin_std": 0.40486952662467957, "step": 44 }, { "KL/chosen_KL_mean": -0.12037467956542969, "KL/mean": -0.18742917478084564, "KL/rejected_KL_mean": -0.2544822692871094, "KL/std": 0.31321200728416443, "epoch": 0.06802721088435375, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.13411030173301697, "fcm_dpo/q_t": 0.4900025725364685, "grad_norm": 94.40776824951172, "learning_rate": 3.2835820895522385e-07, "logits/chosen": 0.11696986854076385, "logits/rejected": 0.06440313160419464, "logps/chosen": -62.364227294921875, "logps/ref_chosen": -62.24385452270508, "logps/ref_rejected": -92.96665954589844, "logps/rejected": -93.22114562988281, "loss": 1.3504, "margin_dpo/margin_mean": 0.1341111958026886, "margin_dpo/margin_std": 0.4199225604534149, "step": 45 }, { "KL/chosen_KL_mean": -0.11914825439453125, "KL/mean": -0.17785024642944336, "KL/rejected_KL_mean": -0.2365570068359375, "KL/std": 0.3071748614311218, "epoch": 0.06953892668178382, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11740612983703613, "fcm_dpo/q_t": 0.49122726917266846, "grad_norm": 91.4646987915039, "learning_rate": 3.3582089552238805e-07, "logits/chosen": 0.125150665640831, "logits/rejected": 0.07922039180994034, "logps/chosen": -61.6180534362793, "logps/ref_chosen": -61.498905181884766, "logps/ref_rejected": -78.91172790527344, "logps/rejected": -79.14828491210938, "loss": 1.3552, "margin_dpo/margin_mean": 0.11740574240684509, "margin_dpo/margin_std": 0.4100007116794586, "step": 46 }, { "KL/chosen_KL_mean": -0.12606048583984375, "KL/mean": -0.21223239600658417, "KL/rejected_KL_mean": -0.2984046936035156, "KL/std": 0.2702568769454956, "epoch": 0.0710506424792139, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.17234352231025696, "fcm_dpo/q_t": 0.4871085584163666, "grad_norm": 84.35171508789062, "learning_rate": 3.432835820895522e-07, "logits/chosen": 0.03952018916606903, "logits/rejected": -0.003650798462331295, "logps/chosen": -51.70440673828125, "logps/ref_chosen": -51.578346252441406, "logps/ref_rejected": -68.2215576171875, "logps/rejected": -68.51995849609375, "loss": 1.3383, "margin_dpo/margin_mean": 0.17234358191490173, "margin_dpo/margin_std": 0.36827313899993896, "step": 47 }, { "KL/chosen_KL_mean": -0.2223663330078125, "KL/mean": -0.2300795167684555, "KL/rejected_KL_mean": -0.23779869079589844, "KL/std": 0.31414487957954407, "epoch": 0.07256235827664399, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.015427500009536743, "fcm_dpo/q_t": 0.49886929988861084, "grad_norm": 80.18048858642578, "learning_rate": 3.507462686567164e-07, "logits/chosen": 0.16679422557353973, "logits/rejected": 0.13644982874393463, "logps/chosen": -52.01601791381836, "logps/ref_chosen": -51.79365158081055, "logps/ref_rejected": -64.22503662109375, "logps/rejected": -64.46283721923828, "loss": 1.3859, "margin_dpo/margin_mean": 0.015427738428115845, "margin_dpo/margin_std": 0.4319096505641937, "step": 48 }, { "KL/chosen_KL_mean": -0.214080810546875, "KL/mean": -0.2529619038105011, "KL/rejected_KL_mean": -0.2918434143066406, "KL/std": 0.33376023173332214, "epoch": 0.07407407407407407, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.07775886356830597, "fcm_dpo/q_t": 0.49420154094696045, "grad_norm": 81.2991714477539, "learning_rate": 3.5820895522388055e-07, "logits/chosen": 0.004545837640762329, "logits/rejected": -0.01614885963499546, "logps/chosen": -58.34868240356445, "logps/ref_chosen": -58.13460159301758, "logps/ref_rejected": -64.63206481933594, "logps/rejected": -64.92391204833984, "loss": 1.368, "margin_dpo/margin_mean": 0.07775917649269104, "margin_dpo/margin_std": 0.4653674364089966, "step": 49 }, { "KL/chosen_KL_mean": -0.2766857147216797, "KL/mean": -0.3321298360824585, "KL/rejected_KL_mean": -0.3875732421875, "KL/std": 0.3180408477783203, "epoch": 0.07558578987150416, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11088606715202332, "fcm_dpo/q_t": 0.49169936776161194, "grad_norm": 83.06045532226562, "learning_rate": 3.6567164179104475e-07, "logits/chosen": 0.08905792236328125, "logits/rejected": 0.060204170644283295, "logps/chosen": -53.13312530517578, "logps/ref_chosen": -52.85643768310547, "logps/ref_rejected": -72.17460632324219, "logps/rejected": -72.56217956542969, "loss": 1.3569, "margin_dpo/margin_mean": 0.11088606715202332, "margin_dpo/margin_std": 0.39939045906066895, "step": 50 }, { "KL/chosen_KL_mean": -0.2582511901855469, "KL/mean": -0.3603350520133972, "KL/rejected_KL_mean": -0.4624176025390625, "KL/std": 0.3999677896499634, "epoch": 0.07709750566893424, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.20416602492332458, "fcm_dpo/q_t": 0.48485416173934937, "grad_norm": 87.8531723022461, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 0.08908696472644806, "logits/rejected": 0.06121304631233215, "logps/chosen": -63.914695739746094, "logps/ref_chosen": -63.65644073486328, "logps/ref_rejected": -86.13229370117188, "logps/rejected": -86.59471130371094, "loss": 1.3327, "margin_dpo/margin_mean": 0.20416662096977234, "margin_dpo/margin_std": 0.5418384075164795, "step": 51 }, { "KL/chosen_KL_mean": -0.30074310302734375, "KL/mean": -0.399644672870636, "KL/rejected_KL_mean": -0.4985504150390625, "KL/std": 0.41704005002975464, "epoch": 0.07860922146636433, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.1978067010641098, "fcm_dpo/q_t": 0.4854433238506317, "grad_norm": 94.91314697265625, "learning_rate": 3.805970149253731e-07, "logits/chosen": 0.1003977358341217, "logits/rejected": 0.04964097589254379, "logps/chosen": -68.14096069335938, "logps/ref_chosen": -67.8402099609375, "logps/ref_rejected": -96.97090911865234, "logps/rejected": -97.4694595336914, "loss": 1.3351, "margin_dpo/margin_mean": 0.1978059560060501, "margin_dpo/margin_std": 0.5647962093353271, "step": 52 }, { "KL/chosen_KL_mean": -0.32163238525390625, "KL/mean": -0.39948517084121704, "KL/rejected_KL_mean": -0.4773368835449219, "KL/std": 0.3442782163619995, "epoch": 0.0801209372637944, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.155705988407135, "fcm_dpo/q_t": 0.48833927512168884, "grad_norm": 80.36358642578125, "learning_rate": 3.880597014925373e-07, "logits/chosen": 0.10558272898197174, "logits/rejected": 0.09447262436151505, "logps/chosen": -57.199771881103516, "logps/ref_chosen": -56.87813949584961, "logps/ref_rejected": -60.75569152832031, "logps/rejected": -61.2330322265625, "loss": 1.3443, "margin_dpo/margin_mean": 0.15570643544197083, "margin_dpo/margin_std": 0.42194539308547974, "step": 53 }, { "KL/chosen_KL_mean": -0.3733062744140625, "KL/mean": -0.4348277151584625, "KL/rejected_KL_mean": -0.49634742736816406, "KL/std": 0.37108659744262695, "epoch": 0.08163265306122448, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.12304168939590454, "fcm_dpo/q_t": 0.490837961435318, "grad_norm": 79.11129760742188, "learning_rate": 3.9552238805970144e-07, "logits/chosen": 0.06367582082748413, "logits/rejected": 0.048242468386888504, "logps/chosen": -47.640228271484375, "logps/ref_chosen": -47.26692199707031, "logps/ref_rejected": -62.19426727294922, "logps/rejected": -62.69061279296875, "loss": 1.3543, "margin_dpo/margin_mean": 0.12304195761680603, "margin_dpo/margin_std": 0.4499589800834656, "step": 54 }, { "KL/chosen_KL_mean": -0.3419971466064453, "KL/mean": -0.46538835763931274, "KL/rejected_KL_mean": -0.5887794494628906, "KL/std": 0.4728488326072693, "epoch": 0.08314436885865457, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.24678145349025726, "fcm_dpo/q_t": 0.4817239046096802, "grad_norm": 90.97213745117188, "learning_rate": 4.0298507462686564e-07, "logits/chosen": 0.04848404973745346, "logits/rejected": -0.029526766389608383, "logps/chosen": -50.668190002441406, "logps/ref_chosen": -50.32619094848633, "logps/ref_rejected": -92.44389343261719, "logps/rejected": -93.03266906738281, "loss": 1.323, "margin_dpo/margin_mean": 0.2467818409204483, "margin_dpo/margin_std": 0.6396048665046692, "step": 55 }, { "KL/chosen_KL_mean": -0.3279705047607422, "KL/mean": -0.4335082471370697, "KL/rejected_KL_mean": -0.5390377044677734, "KL/std": 0.4586765170097351, "epoch": 0.08465608465608465, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.2110726237297058, "fcm_dpo/q_t": 0.4842711389064789, "grad_norm": 80.57612609863281, "learning_rate": 4.1044776119402984e-07, "logits/chosen": 0.10764053463935852, "logits/rejected": 0.085502989590168, "logps/chosen": -57.094940185546875, "logps/ref_chosen": -56.766971588134766, "logps/ref_rejected": -66.30504608154297, "logps/rejected": -66.84408569335938, "loss": 1.3314, "margin_dpo/margin_mean": 0.21107253432273865, "margin_dpo/margin_std": 0.5673775672912598, "step": 56 }, { "KL/chosen_KL_mean": -0.48122596740722656, "KL/mean": -0.6418454647064209, "KL/rejected_KL_mean": -0.8024635314941406, "KL/std": 0.5893479585647583, "epoch": 0.08616780045351474, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.3212365210056305, "fcm_dpo/q_t": 0.4762771725654602, "grad_norm": 85.89154815673828, "learning_rate": 4.17910447761194e-07, "logits/chosen": 0.09039437025785446, "logits/rejected": 0.026721905916929245, "logps/chosen": -58.24897003173828, "logps/ref_chosen": -57.76774597167969, "logps/ref_rejected": -82.75698852539062, "logps/rejected": -83.5594482421875, "loss": 1.3034, "margin_dpo/margin_mean": 0.32123592495918274, "margin_dpo/margin_std": 0.675315260887146, "step": 57 }, { "KL/chosen_KL_mean": -0.5188522338867188, "KL/mean": -0.6452780961990356, "KL/rejected_KL_mean": -0.7717018127441406, "KL/std": 0.6151344776153564, "epoch": 0.08767951625094482, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.25284668803215027, "fcm_dpo/q_t": 0.4821554720401764, "grad_norm": 88.01100158691406, "learning_rate": 4.253731343283582e-07, "logits/chosen": 0.04262678697705269, "logits/rejected": 0.02768554352223873, "logps/chosen": -73.28294372558594, "logps/ref_chosen": -72.76408386230469, "logps/ref_rejected": -84.49275207519531, "logps/rejected": -85.26445007324219, "loss": 1.3309, "margin_dpo/margin_mean": 0.25284650921821594, "margin_dpo/margin_std": 0.8918960094451904, "step": 58 }, { "KL/chosen_KL_mean": -0.48831939697265625, "KL/mean": -0.6330121755599976, "KL/rejected_KL_mean": -0.7777099609375, "KL/std": 0.559989869594574, "epoch": 0.08919123204837491, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.2893878221511841, "fcm_dpo/q_t": 0.4787580072879791, "grad_norm": 75.81594848632812, "learning_rate": 4.3283582089552234e-07, "logits/chosen": 0.13536059856414795, "logits/rejected": 0.06879055500030518, "logps/chosen": -50.30909729003906, "logps/ref_chosen": -49.820777893066406, "logps/ref_rejected": -77.14368438720703, "logps/rejected": -77.92139434814453, "loss": 1.3139, "margin_dpo/margin_mean": 0.2893882989883423, "margin_dpo/margin_std": 0.6970615386962891, "step": 59 }, { "KL/chosen_KL_mean": -0.5602378845214844, "KL/mean": -0.6013921499252319, "KL/rejected_KL_mean": -0.6425514221191406, "KL/std": 0.5349780917167664, "epoch": 0.09070294784580499, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.08231174945831299, "fcm_dpo/q_t": 0.49379658699035645, "grad_norm": 90.27252960205078, "learning_rate": 4.4029850746268654e-07, "logits/chosen": 0.1320168673992157, "logits/rejected": 0.1305527687072754, "logps/chosen": -63.785011291503906, "logps/ref_chosen": -63.22477340698242, "logps/ref_rejected": -61.360477447509766, "logps/rejected": -62.003028869628906, "loss": 1.3724, "margin_dpo/margin_mean": 0.08231207728385925, "margin_dpo/margin_std": 0.6790165901184082, "step": 60 }, { "KL/chosen_KL_mean": -0.642669677734375, "KL/mean": -0.7031924724578857, "KL/rejected_KL_mean": -0.7637138366699219, "KL/std": 0.5996388792991638, "epoch": 0.09221466364323508, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.12104533612728119, "fcm_dpo/q_t": 0.4910633862018585, "grad_norm": 85.4162826538086, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 0.14337512850761414, "logits/rejected": 0.11035867780447006, "logps/chosen": -49.65946960449219, "logps/ref_chosen": -49.01679992675781, "logps/ref_rejected": -74.90817260742188, "logps/rejected": -75.67188262939453, "loss": 1.3664, "margin_dpo/margin_mean": 0.12104541063308716, "margin_dpo/margin_std": 0.8405271768569946, "step": 61 }, { "KL/chosen_KL_mean": -0.7178230285644531, "KL/mean": -0.8114420175552368, "KL/rejected_KL_mean": -0.905059814453125, "KL/std": 0.5907775163650513, "epoch": 0.09372637944066516, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.18723812699317932, "fcm_dpo/q_t": 0.48603251576423645, "grad_norm": 86.27386474609375, "learning_rate": 4.552238805970149e-07, "logits/chosen": 0.11343187838792801, "logits/rejected": 0.07415207475423813, "logps/chosen": -63.46969223022461, "logps/ref_chosen": -62.751869201660156, "logps/ref_rejected": -78.93360900878906, "logps/rejected": -79.83866882324219, "loss": 1.3462, "margin_dpo/margin_mean": 0.18723803758621216, "margin_dpo/margin_std": 0.8265249729156494, "step": 62 }, { "KL/chosen_KL_mean": -0.5149002075195312, "KL/mean": -0.7922695875167847, "KL/rejected_KL_mean": -1.0696449279785156, "KL/std": 0.6339981555938721, "epoch": 0.09523809523809523, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.5547427535057068, "fcm_dpo/q_t": 0.4589221477508545, "grad_norm": 85.83872985839844, "learning_rate": 4.626865671641791e-07, "logits/chosen": 0.15265555679798126, "logits/rejected": 0.1290743201971054, "logps/chosen": -61.0301513671875, "logps/ref_chosen": -60.51525115966797, "logps/ref_rejected": -85.11021423339844, "logps/rejected": -86.17985534667969, "loss": 1.2391, "margin_dpo/margin_mean": 0.5547425746917725, "margin_dpo/margin_std": 0.7428404092788696, "step": 63 }, { "KL/chosen_KL_mean": -0.7253684997558594, "KL/mean": -0.7832847833633423, "KL/rejected_KL_mean": -0.8412017822265625, "KL/std": 0.6213120222091675, "epoch": 0.09674981103552532, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.11583417654037476, "fcm_dpo/q_t": 0.49128633737564087, "grad_norm": 79.28340148925781, "learning_rate": 4.701492537313433e-07, "logits/chosen": 0.09198494255542755, "logits/rejected": 0.06694923341274261, "logps/chosen": -51.932212829589844, "logps/ref_chosen": -51.20684814453125, "logps/ref_rejected": -66.93081665039062, "logps/rejected": -67.77201843261719, "loss": 1.364, "margin_dpo/margin_mean": 0.11583459377288818, "margin_dpo/margin_std": 0.7325628995895386, "step": 64 }, { "KL/chosen_KL_mean": -0.7872371673583984, "KL/mean": -1.0325498580932617, "KL/rejected_KL_mean": -1.2778663635253906, "KL/std": 0.7820297479629517, "epoch": 0.0982615268329554, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.4906308054924011, "fcm_dpo/q_t": 0.4642921984195709, "grad_norm": 88.44102478027344, "learning_rate": 4.776119402985074e-07, "logits/chosen": 0.21399691700935364, "logits/rejected": 0.1837337613105774, "logps/chosen": -68.075927734375, "logps/ref_chosen": -67.2886962890625, "logps/ref_rejected": -74.44281005859375, "logps/rejected": -75.72067260742188, "loss": 1.2673, "margin_dpo/margin_mean": 0.49063026905059814, "margin_dpo/margin_std": 1.015063762664795, "step": 65 }, { "KL/chosen_KL_mean": -0.785797119140625, "KL/mean": -0.9667012691497803, "KL/rejected_KL_mean": -1.1476058959960938, "KL/std": 0.7261425256729126, "epoch": 0.09977324263038549, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.3618091642856598, "fcm_dpo/q_t": 0.47361284494400024, "grad_norm": 84.55229187011719, "learning_rate": 4.850746268656717e-07, "logits/chosen": 0.10004591941833496, "logits/rejected": 0.07599621266126633, "logps/chosen": -71.52920532226562, "logps/ref_chosen": -70.743408203125, "logps/ref_rejected": -77.26499938964844, "logps/rejected": -78.41261291503906, "loss": 1.3008, "margin_dpo/margin_mean": 0.36180832982063293, "margin_dpo/margin_std": 0.9393926858901978, "step": 66 }, { "KL/chosen_KL_mean": -0.6787815093994141, "KL/mean": -0.8098489046096802, "KL/rejected_KL_mean": -0.94091796875, "KL/std": 0.6241350173950195, "epoch": 0.10128495842781557, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.26213303208351135, "fcm_dpo/q_t": 0.4806175231933594, "grad_norm": 85.96515655517578, "learning_rate": 4.925373134328357e-07, "logits/chosen": 0.08429770171642303, "logits/rejected": 0.028592076152563095, "logps/chosen": -61.2813835144043, "logps/ref_chosen": -60.60260009765625, "logps/ref_rejected": -75.22235870361328, "logps/rejected": -76.16327667236328, "loss": 1.3223, "margin_dpo/margin_mean": 0.26213228702545166, "margin_dpo/margin_std": 0.761053204536438, "step": 67 }, { "KL/chosen_KL_mean": -1.0498847961425781, "KL/mean": -1.2132654190063477, "KL/rejected_KL_mean": -1.37664794921875, "KL/std": 0.8114407062530518, "epoch": 0.10279667422524566, "fcm_dpo/beta": 0.30000001192092896, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.3267657160758972, "fcm_dpo/q_t": 0.47629600763320923, "grad_norm": 91.1146011352539, "learning_rate": 5e-07, "logits/chosen": 0.058436907827854156, "logits/rejected": 0.028530534356832504, "logps/chosen": -78.5782470703125, "logps/ref_chosen": -77.52836608886719, "logps/ref_rejected": -93.17778015136719, "logps/rejected": -94.55442810058594, "loss": 1.3158, "margin_dpo/margin_mean": 0.32676631212234497, "margin_dpo/margin_std": 1.0499687194824219, "step": 68 }, { "KL/chosen_KL_mean": -1.0008392333984375, "KL/mean": -1.304330587387085, "KL/rejected_KL_mean": -1.607818603515625, "KL/std": 0.8572876453399658, "epoch": 0.10430839002267574, "fcm_dpo/beta": 0.30193930864334106, "fcm_dpo/delta": 0.06422863900661469, "fcm_dpo/margin": 0.6069808006286621, "fcm_dpo/q_t": 0.45600593090057373, "grad_norm": 83.39071655273438, "learning_rate": 4.999965034812934e-07, "logits/chosen": 0.08316853642463684, "logits/rejected": 0.04070412740111351, "logps/chosen": -66.94389343261719, "logps/ref_chosen": -65.94305419921875, "logps/ref_rejected": -89.7735595703125, "logps/rejected": -91.38137817382812, "loss": 1.2345, "margin_dpo/margin_mean": 0.6069809198379517, "margin_dpo/margin_std": 0.9907248020172119, "step": 69 }, { "KL/chosen_KL_mean": -1.0711631774902344, "KL/mean": -1.231170892715454, "KL/rejected_KL_mean": -1.3911724090576172, "KL/std": 0.7602438926696777, "epoch": 0.10582010582010581, "fcm_dpo/beta": 0.3038785755634308, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.32000821828842163, "fcm_dpo/q_t": 0.47606074810028076, "grad_norm": 85.077392578125, "learning_rate": 4.999860140229787e-07, "logits/chosen": 0.15712867677211761, "logits/rejected": 0.13347765803337097, "logps/chosen": -63.029075622558594, "logps/ref_chosen": -61.95791244506836, "logps/ref_rejected": -75.80945587158203, "logps/rejected": -77.20063018798828, "loss": 1.3142, "margin_dpo/margin_mean": 0.32000866532325745, "margin_dpo/margin_std": 0.9947592616081238, "step": 70 }, { "KL/chosen_KL_mean": -1.2546577453613281, "KL/mean": -1.3389543294906616, "KL/rejected_KL_mean": -1.4232540130615234, "KL/std": 0.7805109024047852, "epoch": 0.1073318216175359, "fcm_dpo/beta": 0.3038785755634308, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.16859367489814758, "fcm_dpo/q_t": 0.4871462285518646, "grad_norm": 88.05529022216797, "learning_rate": 4.999685319184688e-07, "logits/chosen": 0.10442924499511719, "logits/rejected": 0.08813776820898056, "logps/chosen": -64.60223388671875, "logps/ref_chosen": -63.34757995605469, "logps/ref_rejected": -67.49658203125, "logps/rejected": -68.91983032226562, "loss": 1.3636, "margin_dpo/margin_mean": 0.16859376430511475, "margin_dpo/margin_std": 1.1053366661071777, "step": 71 }, { "KL/chosen_KL_mean": -1.0062751770019531, "KL/mean": -1.3182861804962158, "KL/rejected_KL_mean": -1.6302986145019531, "KL/std": 0.8503187894821167, "epoch": 0.10884353741496598, "fcm_dpo/beta": 0.30923372507095337, "fcm_dpo/delta": 0.08734607696533203, "fcm_dpo/margin": 0.6240215301513672, "fcm_dpo/q_t": 0.4535670876502991, "grad_norm": 86.13911437988281, "learning_rate": 4.999440576567755e-07, "logits/chosen": 0.11493426561355591, "logits/rejected": 0.05126555263996124, "logps/chosen": -56.86557388305664, "logps/ref_chosen": -55.85929870605469, "logps/ref_rejected": -68.45423889160156, "logps/rejected": -70.08454132080078, "loss": 1.2278, "margin_dpo/margin_mean": 0.6240211129188538, "margin_dpo/margin_std": 1.0259413719177246, "step": 72 }, { "KL/chosen_KL_mean": -1.4853363037109375, "KL/mean": -1.5479178428649902, "KL/rejected_KL_mean": -1.6105022430419922, "KL/std": 0.927099347114563, "epoch": 0.11035525321239607, "fcm_dpo/beta": 0.30923372507095337, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.12517189979553223, "fcm_dpo/q_t": 0.4914061725139618, "grad_norm": 98.4456558227539, "learning_rate": 4.999125919224965e-07, "logits/chosen": 0.09999468922615051, "logits/rejected": 0.08582901209592819, "logps/chosen": -70.6241455078125, "logps/ref_chosen": -69.13880920410156, "logps/ref_rejected": -79.04586791992188, "logps/rejected": -80.6563720703125, "loss": 1.3773, "margin_dpo/margin_mean": 0.1251724660396576, "margin_dpo/margin_std": 1.081213116645813, "step": 73 }, { "KL/chosen_KL_mean": -1.0307197570800781, "KL/mean": -1.3247830867767334, "KL/rejected_KL_mean": -1.6188468933105469, "KL/std": 0.7862107753753662, "epoch": 0.11186696900982615, "fcm_dpo/beta": 0.3115134537220001, "fcm_dpo/delta": 0.07318352907896042, "fcm_dpo/margin": 0.5881245136260986, "fcm_dpo/q_t": 0.45581668615341187, "grad_norm": 80.35842895507812, "learning_rate": 4.998741355957963e-07, "logits/chosen": 0.10959874093532562, "logits/rejected": 0.05822136998176575, "logps/chosen": -50.9544563293457, "logps/ref_chosen": -49.923736572265625, "logps/ref_rejected": -81.73213958740234, "logps/rejected": -83.35098266601562, "loss": 1.2356, "margin_dpo/margin_mean": 0.5881245136260986, "margin_dpo/margin_std": 0.9894160628318787, "step": 74 }, { "KL/chosen_KL_mean": -1.2068157196044922, "KL/mean": -1.5130023956298828, "KL/rejected_KL_mean": -1.8191852569580078, "KL/std": 0.8901680707931519, "epoch": 0.11337868480725624, "fcm_dpo/beta": 0.3137931823730469, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.6123712062835693, "fcm_dpo/q_t": 0.45388418436050415, "grad_norm": 73.69344329833984, "learning_rate": 4.998286897523808e-07, "logits/chosen": 0.11047861725091934, "logits/rejected": 0.07770434021949768, "logps/chosen": -47.27556610107422, "logps/ref_chosen": -46.06875228881836, "logps/ref_rejected": -66.1181411743164, "logps/rejected": -67.93733215332031, "loss": 1.2334, "margin_dpo/margin_mean": 0.6123708486557007, "margin_dpo/margin_std": 1.1229393482208252, "step": 75 }, { "KL/chosen_KL_mean": -1.3518695831298828, "KL/mean": -1.4724314212799072, "KL/rejected_KL_mean": -1.5929927825927734, "KL/std": 0.9321086406707764, "epoch": 0.11489040060468632, "fcm_dpo/beta": 0.3137931823730469, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.24112612009048462, "fcm_dpo/q_t": 0.48176026344299316, "grad_norm": 91.54261016845703, "learning_rate": 4.997762556634679e-07, "logits/chosen": 0.12209813296794891, "logits/rejected": 0.07763922214508057, "logps/chosen": -55.41461944580078, "logps/ref_chosen": -54.06275177001953, "logps/ref_rejected": -74.87464141845703, "logps/rejected": -76.46763610839844, "loss": 1.3417, "margin_dpo/margin_mean": 0.24112600088119507, "margin_dpo/margin_std": 1.1081591844558716, "step": 76 }, { "KL/chosen_KL_mean": -1.383779525756836, "KL/mean": -1.6501636505126953, "KL/rejected_KL_mean": -1.9165458679199219, "KL/std": 0.9299043416976929, "epoch": 0.1164021164021164, "fcm_dpo/beta": 0.3181629478931427, "fcm_dpo/delta": 0.06914756447076797, "fcm_dpo/margin": 0.532768964767456, "fcm_dpo/q_t": 0.45917877554893494, "grad_norm": 85.72856140136719, "learning_rate": 4.99716834795752e-07, "logits/chosen": 0.1453985869884491, "logits/rejected": 0.10386494547128677, "logps/chosen": -54.459869384765625, "logps/ref_chosen": -53.07609176635742, "logps/ref_rejected": -74.45601654052734, "logps/rejected": -76.37255859375, "loss": 1.25, "margin_dpo/margin_mean": 0.5327691435813904, "margin_dpo/margin_std": 0.9784738421440125, "step": 77 }, { "KL/chosen_KL_mean": -1.4582748413085938, "KL/mean": -1.6500697135925293, "KL/rejected_KL_mean": -1.8418655395507812, "KL/std": 0.9414688348770142, "epoch": 0.11791383219954649, "fcm_dpo/beta": 0.3181629478931427, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.38358843326568604, "fcm_dpo/q_t": 0.4702114462852478, "grad_norm": 92.69743347167969, "learning_rate": 4.996504288113623e-07, "logits/chosen": 0.07882325351238251, "logits/rejected": 0.05884188786149025, "logps/chosen": -69.18370056152344, "logps/ref_chosen": -67.72541809082031, "logps/ref_rejected": -79.03926849365234, "logps/rejected": -80.88113403320312, "loss": 1.296, "margin_dpo/margin_mean": 0.3835884630680084, "margin_dpo/margin_std": 1.0585718154907227, "step": 78 }, { "KL/chosen_KL_mean": -1.411651611328125, "KL/mean": -1.8503923416137695, "KL/rejected_KL_mean": -2.289134979248047, "KL/std": 1.109024167060852, "epoch": 0.11942554799697656, "fcm_dpo/beta": 0.3238750100135803, "fcm_dpo/delta": 0.11919578909873962, "fcm_dpo/margin": 0.8774796724319458, "fcm_dpo/q_t": 0.4344840943813324, "grad_norm": 80.9024429321289, "learning_rate": 4.995770395678171e-07, "logits/chosen": 0.17850381135940552, "logits/rejected": 0.11637181043624878, "logps/chosen": -53.572296142578125, "logps/ref_chosen": -52.16064453125, "logps/ref_rejected": -83.31062316894531, "logps/rejected": -85.5997543334961, "loss": 1.1664, "margin_dpo/margin_mean": 0.8774796724319458, "margin_dpo/margin_std": 1.3488304615020752, "step": 79 }, { "KL/chosen_KL_mean": -1.6040763854980469, "KL/mean": -1.8379108905792236, "KL/rejected_KL_mean": -2.0717430114746094, "KL/std": 0.9534475803375244, "epoch": 0.12093726379440665, "fcm_dpo/beta": 0.3258388340473175, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 0.4676697254180908, "fcm_dpo/q_t": 0.46378254890441895, "grad_norm": 91.92669677734375, "learning_rate": 4.994966691179711e-07, "logits/chosen": 0.14511343836784363, "logits/rejected": 0.08402586728334427, "logps/chosen": -63.0146369934082, "logps/ref_chosen": -61.410560607910156, "logps/ref_rejected": -78.66004943847656, "logps/rejected": -80.73179626464844, "loss": 1.2836, "margin_dpo/margin_mean": 0.46766987442970276, "margin_dpo/margin_std": 1.2826039791107178, "step": 80 }, { "KL/chosen_KL_mean": -1.6509742736816406, "KL/mean": -2.025747060775757, "KL/rejected_KL_mean": -2.4005126953125, "KL/std": 1.0507984161376953, "epoch": 0.12244897959183673, "fcm_dpo/beta": 0.32829275727272034, "fcm_dpo/delta": 0.037514351308345795, "fcm_dpo/margin": 0.7495485544204712, "fcm_dpo/q_t": 0.44190722703933716, "grad_norm": 86.37997436523438, "learning_rate": 4.994093197099587e-07, "logits/chosen": 0.0986589565873146, "logits/rejected": 0.0652787908911705, "logps/chosen": -65.4553451538086, "logps/ref_chosen": -63.80437088012695, "logps/ref_rejected": -79.3484115600586, "logps/rejected": -81.7489242553711, "loss": 1.188, "margin_dpo/margin_mean": 0.7495482563972473, "margin_dpo/margin_std": 1.0859642028808594, "step": 81 }, { "KL/chosen_KL_mean": -1.4204177856445312, "KL/mean": -1.8572697639465332, "KL/rejected_KL_mean": -2.2941246032714844, "KL/std": 1.002649188041687, "epoch": 0.12396069538926682, "fcm_dpo/beta": 0.33317315578460693, "fcm_dpo/delta": 0.11231572926044464, "fcm_dpo/margin": 0.8737020492553711, "fcm_dpo/q_t": 0.43078649044036865, "grad_norm": 77.9083023071289, "learning_rate": 4.993149937871306e-07, "logits/chosen": 0.08195002377033234, "logits/rejected": 0.019727405160665512, "logps/chosen": -50.238311767578125, "logps/ref_chosen": -48.817893981933594, "logps/ref_rejected": -70.31497955322266, "logps/rejected": -72.60910034179688, "loss": 1.1441, "margin_dpo/margin_mean": 0.8737020492553711, "margin_dpo/margin_std": 1.0093238353729248, "step": 82 }, { "KL/chosen_KL_mean": -1.5698013305664062, "KL/mean": -2.0242576599121094, "KL/rejected_KL_mean": -2.4787139892578125, "KL/std": 1.0570318698883057, "epoch": 0.1254724111866969, "fcm_dpo/beta": 0.340545117855072, "fcm_dpo/delta": 0.0933399647474289, "fcm_dpo/margin": 0.9089138507843018, "fcm_dpo/q_t": 0.4278455376625061, "grad_norm": 83.79415893554688, "learning_rate": 4.992136939879856e-07, "logits/chosen": 0.18783977627754211, "logits/rejected": 0.13673317432403564, "logps/chosen": -58.72057342529297, "logps/ref_chosen": -57.15077209472656, "logps/ref_rejected": -75.1710205078125, "logps/rejected": -77.64973449707031, "loss": 1.1465, "margin_dpo/margin_mean": 0.9089135527610779, "margin_dpo/margin_std": 1.3083041906356812, "step": 83 }, { "KL/chosen_KL_mean": -1.8939476013183594, "KL/mean": -2.211676597595215, "KL/rejected_KL_mean": -2.5294113159179688, "KL/std": 1.1902132034301758, "epoch": 0.12698412698412698, "fcm_dpo/beta": 0.3474050462245941, "fcm_dpo/delta": 0.077272430062294, "fcm_dpo/margin": 0.6354621052742004, "fcm_dpo/q_t": 0.4475017786026001, "grad_norm": 101.39741516113281, "learning_rate": 4.991054231460969e-07, "logits/chosen": 0.16154229640960693, "logits/rejected": 0.11815261840820312, "logps/chosen": -66.67124938964844, "logps/ref_chosen": -64.77729797363281, "logps/ref_rejected": -84.71949768066406, "logps/rejected": -87.24890899658203, "loss": 1.2179, "margin_dpo/margin_mean": 0.6354624032974243, "margin_dpo/margin_std": 1.1679608821868896, "step": 84 }, { "KL/chosen_KL_mean": -1.697763442993164, "KL/mean": -2.1500391960144043, "KL/rejected_KL_mean": -2.602313995361328, "KL/std": 1.1836422681808472, "epoch": 0.12849584278155707, "fcm_dpo/beta": 0.35248273611068726, "fcm_dpo/delta": 0.08357930928468704, "fcm_dpo/margin": 0.9045514464378357, "fcm_dpo/q_t": 0.4248403310775757, "grad_norm": 86.7568359375, "learning_rate": 4.989901842900325e-07, "logits/chosen": 0.14641568064689636, "logits/rejected": 0.10217833518981934, "logps/chosen": -51.949459075927734, "logps/ref_chosen": -50.25169372558594, "logps/ref_rejected": -66.55439758300781, "logps/rejected": -69.15670776367188, "loss": 1.1425, "margin_dpo/margin_mean": 0.9045511484146118, "margin_dpo/margin_std": 1.268752932548523, "step": 85 }, { "KL/chosen_KL_mean": -2.056539535522461, "KL/mean": -2.414137840270996, "KL/rejected_KL_mean": -2.771738052368164, "KL/std": 1.2594623565673828, "epoch": 0.13000755857898716, "fcm_dpo/beta": 0.36062800884246826, "fcm_dpo/delta": 0.14598755538463593, "fcm_dpo/margin": 0.7151964902877808, "fcm_dpo/q_t": 0.440341591835022, "grad_norm": 85.87276458740234, "learning_rate": 4.988679806432711e-07, "logits/chosen": 0.15964752435684204, "logits/rejected": 0.14105254411697388, "logps/chosen": -62.78572082519531, "logps/ref_chosen": -60.72917938232422, "logps/ref_rejected": -72.30961608886719, "logps/rejected": -75.08135223388672, "loss": 1.2022, "margin_dpo/margin_mean": 0.7151964902877808, "margin_dpo/margin_std": 1.350599765777588, "step": 86 }, { "KL/chosen_KL_mean": -2.0709495544433594, "KL/mean": -2.4845941066741943, "KL/rejected_KL_mean": -2.8982315063476562, "KL/std": 1.3656513690948486, "epoch": 0.13151927437641722, "fcm_dpo/beta": 0.3639698028564453, "fcm_dpo/delta": 0.0033310160506516695, "fcm_dpo/margin": 0.8272854089736938, "fcm_dpo/q_t": 0.43046247959136963, "grad_norm": 107.46381378173828, "learning_rate": 4.987388156241114e-07, "logits/chosen": 0.12424930930137634, "logits/rejected": 0.06647076457738876, "logps/chosen": -67.82891845703125, "logps/ref_chosen": -65.75796508789062, "logps/ref_rejected": -84.81159973144531, "logps/rejected": -87.70982360839844, "loss": 1.1947, "margin_dpo/margin_mean": 0.8272853493690491, "margin_dpo/margin_std": 1.648939847946167, "step": 87 }, { "KL/chosen_KL_mean": -2.0886478424072266, "KL/mean": -2.4435644149780273, "KL/rejected_KL_mean": -2.7984771728515625, "KL/std": 1.3572802543640137, "epoch": 0.1330309901738473, "fcm_dpo/beta": 0.36330440640449524, "fcm_dpo/delta": -0.00914906058460474, "fcm_dpo/margin": 0.7098320722579956, "fcm_dpo/q_t": 0.4415057897567749, "grad_norm": 106.92044067382812, "learning_rate": 4.986026928455767e-07, "logits/chosen": 0.18091943860054016, "logits/rejected": 0.15450939536094666, "logps/chosen": -64.91267395019531, "logps/ref_chosen": -62.82402801513672, "logps/ref_rejected": -74.9607162475586, "logps/rejected": -77.75919342041016, "loss": 1.2392, "margin_dpo/margin_mean": 0.709831953048706, "margin_dpo/margin_std": 1.69313383102417, "step": 88 }, { "KL/chosen_KL_mean": -1.8108139038085938, "KL/mean": -2.2503674030303955, "KL/rejected_KL_mean": -2.689922332763672, "KL/std": 1.357825756072998, "epoch": 0.1345427059712774, "fcm_dpo/beta": 0.36267584562301636, "fcm_dpo/delta": -0.017331281676888466, "fcm_dpo/margin": 0.8791050314903259, "fcm_dpo/q_t": 0.42747825384140015, "grad_norm": 90.00340270996094, "learning_rate": 4.984596161153135e-07, "logits/chosen": 0.21299389004707336, "logits/rejected": 0.13079392910003662, "logps/chosen": -43.00225067138672, "logps/ref_chosen": -41.191436767578125, "logps/ref_rejected": -85.44769287109375, "logps/rejected": -88.13761901855469, "loss": 1.1665, "margin_dpo/margin_mean": 0.8791056871414185, "margin_dpo/margin_std": 1.5281598567962646, "step": 89 }, { "KL/chosen_KL_mean": -2.111286163330078, "KL/mean": -2.547137975692749, "KL/rejected_KL_mean": -2.9829940795898438, "KL/std": 1.4041776657104492, "epoch": 0.1360544217687075, "fcm_dpo/beta": 0.366120308637619, "fcm_dpo/delta": 0.0835900530219078, "fcm_dpo/margin": 0.8717069029808044, "fcm_dpo/q_t": 0.4256633520126343, "grad_norm": 95.42098999023438, "learning_rate": 4.983095894354857e-07, "logits/chosen": 0.1341867744922638, "logits/rejected": 0.07782384008169174, "logps/chosen": -58.695194244384766, "logps/ref_chosen": -56.58390808105469, "logps/ref_rejected": -86.86978149414062, "logps/rejected": -89.85277557373047, "loss": 1.1616, "margin_dpo/margin_mean": 0.8717068433761597, "margin_dpo/margin_std": 1.47410249710083, "step": 90 }, { "KL/chosen_KL_mean": -2.0727901458740234, "KL/mean": -2.543304443359375, "KL/rejected_KL_mean": -3.0138168334960938, "KL/std": 1.3842124938964844, "epoch": 0.13756613756613756, "fcm_dpo/beta": 0.3716731667518616, "fcm_dpo/delta": 0.05190932750701904, "fcm_dpo/margin": 0.9410274028778076, "fcm_dpo/q_t": 0.42051270604133606, "grad_norm": 86.90682220458984, "learning_rate": 4.98152617002662e-07, "logits/chosen": 0.14020134508609772, "logits/rejected": 0.0952892005443573, "logps/chosen": -54.45513153076172, "logps/ref_chosen": -52.38234329223633, "logps/ref_rejected": -72.17642211914062, "logps/rejected": -75.19023895263672, "loss": 1.144, "margin_dpo/margin_mean": 0.9410272240638733, "margin_dpo/margin_std": 1.5481452941894531, "step": 91 }, { "KL/chosen_KL_mean": -2.225687026977539, "KL/mean": -2.6920647621154785, "KL/rejected_KL_mean": -3.158447265625, "KL/std": 1.5069758892059326, "epoch": 0.13907785336356765, "fcm_dpo/beta": 0.37716007232666016, "fcm_dpo/delta": 0.0487772673368454, "fcm_dpo/margin": 0.9327561855316162, "fcm_dpo/q_t": 0.42104315757751465, "grad_norm": 88.81369018554688, "learning_rate": 4.979887032076988e-07, "logits/chosen": 0.19652123749256134, "logits/rejected": 0.1548977941274643, "logps/chosen": -55.23439025878906, "logps/ref_chosen": -53.00870132446289, "logps/ref_rejected": -79.77812957763672, "logps/rejected": -82.93657684326172, "loss": 1.1653, "margin_dpo/margin_mean": 0.9327565431594849, "margin_dpo/margin_std": 1.705445408821106, "step": 92 }, { "KL/chosen_KL_mean": -2.371583938598633, "KL/mean": -2.724341630935669, "KL/rejected_KL_mean": -3.077098846435547, "KL/std": 1.5108641386032104, "epoch": 0.14058956916099774, "fcm_dpo/beta": 0.383634090423584, "fcm_dpo/delta": 0.1327441930770874, "fcm_dpo/margin": 0.7055189609527588, "fcm_dpo/q_t": 0.4405834972858429, "grad_norm": 87.71575164794922, "learning_rate": 4.978178526356172e-07, "logits/chosen": 0.12634697556495667, "logits/rejected": 0.0989193394780159, "logps/chosen": -47.27863693237305, "logps/ref_chosen": -44.90705108642578, "logps/ref_rejected": -58.7879524230957, "logps/rejected": -61.86505126953125, "loss": 1.2266, "margin_dpo/margin_mean": 0.7055187225341797, "margin_dpo/margin_std": 1.6528334617614746, "step": 93 }, { "KL/chosen_KL_mean": -1.999124526977539, "KL/mean": -2.560619831085205, "KL/rejected_KL_mean": -3.122119903564453, "KL/std": 1.6296097040176392, "epoch": 0.1421012849584278, "fcm_dpo/beta": 0.38215768337249756, "fcm_dpo/delta": -0.031070705503225327, "fcm_dpo/margin": 1.1229921579360962, "fcm_dpo/q_t": 0.4010883867740631, "grad_norm": 87.27040100097656, "learning_rate": 4.976400700654751e-07, "logits/chosen": 0.24216991662979126, "logits/rejected": 0.20032037794589996, "logps/chosen": -61.93689727783203, "logps/ref_chosen": -59.93777084350586, "logps/ref_rejected": -79.3138427734375, "logps/rejected": -82.43595886230469, "loss": 1.1372, "margin_dpo/margin_mean": 1.1229921579360962, "margin_dpo/margin_std": 1.9949820041656494, "step": 94 }, { "KL/chosen_KL_mean": -2.6242218017578125, "KL/mean": -3.158700466156006, "KL/rejected_KL_mean": -3.6931800842285156, "KL/std": 1.4618337154388428, "epoch": 0.1436130007558579, "fcm_dpo/beta": 0.38164472579956055, "fcm_dpo/delta": -0.008536683395504951, "fcm_dpo/margin": 1.0689539909362793, "fcm_dpo/q_t": 0.40741458535194397, "grad_norm": 97.12776947021484, "learning_rate": 4.974553604702332e-07, "logits/chosen": 0.11770854890346527, "logits/rejected": 0.053069088608026505, "logps/chosen": -62.79270935058594, "logps/ref_chosen": -60.168487548828125, "logps/ref_rejected": -90.73665618896484, "logps/rejected": -94.42984008789062, "loss": 1.1114, "margin_dpo/margin_mean": 1.0689538717269897, "margin_dpo/margin_std": 1.6474608182907104, "step": 95 }, { "KL/chosen_KL_mean": -2.4545211791992188, "KL/mean": -3.0417590141296387, "KL/rejected_KL_mean": -3.628997802734375, "KL/std": 1.6787238121032715, "epoch": 0.14512471655328799, "fcm_dpo/beta": 0.3811071515083313, "fcm_dpo/delta": -0.04988434165716171, "fcm_dpo/margin": 1.1744762659072876, "fcm_dpo/q_t": 0.40318265557289124, "grad_norm": 91.59760284423828, "learning_rate": 4.972637290166157e-07, "logits/chosen": 0.16960498690605164, "logits/rejected": 0.12512166798114777, "logps/chosen": -63.12329864501953, "logps/ref_chosen": -60.66877746582031, "logps/ref_rejected": -88.30673217773438, "logps/rejected": -91.93572998046875, "loss": 1.1182, "margin_dpo/margin_mean": 1.1744755506515503, "margin_dpo/margin_std": 1.96297025680542, "step": 96 }, { "KL/chosen_KL_mean": -2.9270896911621094, "KL/mean": -3.338170051574707, "KL/rejected_KL_mean": -3.7492523193359375, "KL/std": 1.692612648010254, "epoch": 0.14663643235071808, "fcm_dpo/beta": 0.37780511379241943, "fcm_dpo/delta": -0.014510155655443668, "fcm_dpo/margin": 0.8221678733825684, "fcm_dpo/q_t": 0.4347788691520691, "grad_norm": 125.3469009399414, "learning_rate": 4.970651810649666e-07, "logits/chosen": 0.11404736340045929, "logits/rejected": 0.06933660805225372, "logps/chosen": -67.97120666503906, "logps/ref_chosen": -65.04412078857422, "logps/ref_rejected": -78.42092895507812, "logps/rejected": -82.17018127441406, "loss": 1.2494, "margin_dpo/margin_mean": 0.8221681118011475, "margin_dpo/margin_std": 2.178349256515503, "step": 97 }, { "KL/chosen_KL_mean": -2.5152339935302734, "KL/mean": -2.84354567527771, "KL/rejected_KL_mean": -3.1718597412109375, "KL/std": 1.5594619512557983, "epoch": 0.14814814814814814, "fcm_dpo/beta": 0.38081395626068115, "fcm_dpo/delta": 0.03966221213340759, "fcm_dpo/margin": 0.6566241979598999, "fcm_dpo/q_t": 0.44297564029693604, "grad_norm": 106.45557403564453, "learning_rate": 4.968597221690985e-07, "logits/chosen": 0.1855883002281189, "logits/rejected": 0.1580853909254074, "logps/chosen": -58.018463134765625, "logps/ref_chosen": -55.503231048583984, "logps/ref_rejected": -72.81553649902344, "logps/rejected": -75.98739624023438, "loss": 1.2274, "margin_dpo/margin_mean": 0.6566237211227417, "margin_dpo/margin_std": 1.4679479598999023, "step": 98 }, { "KL/chosen_KL_mean": -2.582561492919922, "KL/mean": -3.024656295776367, "KL/rejected_KL_mean": -3.466754913330078, "KL/std": 1.746955394744873, "epoch": 0.14965986394557823, "fcm_dpo/beta": 0.3836577236652374, "fcm_dpo/delta": 0.06295155733823776, "fcm_dpo/margin": 0.8841931819915771, "fcm_dpo/q_t": 0.4282259941101074, "grad_norm": 119.62471771240234, "learning_rate": 4.966473580761389e-07, "logits/chosen": 0.1989203691482544, "logits/rejected": 0.15977294743061066, "logps/chosen": -61.158199310302734, "logps/ref_chosen": -58.57563781738281, "logps/ref_rejected": -78.693603515625, "logps/rejected": -82.16036224365234, "loss": 1.2194, "margin_dpo/margin_mean": 0.8841932415962219, "margin_dpo/margin_std": 2.078291416168213, "step": 99 }, { "KL/chosen_KL_mean": -2.8010482788085938, "KL/mean": -3.2622342109680176, "KL/rejected_KL_mean": -3.723419189453125, "KL/std": 1.7660582065582275, "epoch": 0.15117157974300832, "fcm_dpo/beta": 0.3828757405281067, "fcm_dpo/delta": -0.07216782122850418, "fcm_dpo/margin": 0.9223718047142029, "fcm_dpo/q_t": 0.42660045623779297, "grad_norm": 122.73509979248047, "learning_rate": 4.964280947263676e-07, "logits/chosen": 0.1641688048839569, "logits/rejected": 0.15674951672554016, "logps/chosen": -82.38448333740234, "logps/ref_chosen": -79.58343505859375, "logps/ref_rejected": -92.152587890625, "logps/rejected": -95.87600708007812, "loss": 1.2701, "margin_dpo/margin_mean": 0.9223724007606506, "margin_dpo/margin_std": 2.5030999183654785, "step": 100 }, { "KL/chosen_KL_mean": -2.440837860107422, "KL/mean": -3.11445951461792, "KL/rejected_KL_mean": -3.7880859375, "KL/std": 1.7899543046951294, "epoch": 0.15268329554043839, "fcm_dpo/beta": 0.37569791078567505, "fcm_dpo/delta": -0.11176390200853348, "fcm_dpo/margin": 1.3472542762756348, "fcm_dpo/q_t": 0.38718152046203613, "grad_norm": 83.7920913696289, "learning_rate": 4.96201938253052e-07, "logits/chosen": 0.17261841893196106, "logits/rejected": 0.13419027626514435, "logps/chosen": -54.773624420166016, "logps/ref_chosen": -52.332786560058594, "logps/ref_rejected": -69.55589294433594, "logps/rejected": -73.34397888183594, "loss": 1.0477, "margin_dpo/margin_mean": 1.3472540378570557, "margin_dpo/margin_std": 1.786298155784607, "step": 101 }, { "KL/chosen_KL_mean": -2.836507797241211, "KL/mean": -3.2560760974884033, "KL/rejected_KL_mean": -3.675647735595703, "KL/std": 1.7688740491867065, "epoch": 0.15419501133786848, "fcm_dpo/beta": 0.3721296191215515, "fcm_dpo/delta": 0.01126640010625124, "fcm_dpo/margin": 0.8391386270523071, "fcm_dpo/q_t": 0.435501366853714, "grad_norm": 109.37149810791016, "learning_rate": 4.959688949822748e-07, "logits/chosen": 0.11337646096944809, "logits/rejected": 0.07395602017641068, "logps/chosen": -67.57998657226562, "logps/ref_chosen": -64.74348449707031, "logps/ref_rejected": -69.06132507324219, "logps/rejected": -72.73697662353516, "loss": 1.2682, "margin_dpo/margin_mean": 0.8391384482383728, "margin_dpo/margin_std": 2.346881866455078, "step": 102 }, { "KL/chosen_KL_mean": -2.750640869140625, "KL/mean": -3.240105628967285, "KL/rejected_KL_mean": -3.729572296142578, "KL/std": 1.8294150829315186, "epoch": 0.15570672713529857, "fcm_dpo/beta": 0.3753799796104431, "fcm_dpo/delta": 0.033547695726156235, "fcm_dpo/margin": 0.9789345860481262, "fcm_dpo/q_t": 0.4204777479171753, "grad_norm": 106.16229248046875, "learning_rate": 4.957289714327572e-07, "logits/chosen": 0.21147795021533966, "logits/rejected": 0.17772606015205383, "logps/chosen": -66.5872802734375, "logps/ref_chosen": -63.83664321899414, "logps/ref_rejected": -79.32362365722656, "logps/rejected": -83.0531997680664, "loss": 1.1696, "margin_dpo/margin_mean": 0.9789342284202576, "margin_dpo/margin_std": 1.9150274991989136, "step": 103 }, { "KL/chosen_KL_mean": -2.7089309692382812, "KL/mean": -3.289179563522339, "KL/rejected_KL_mean": -3.8694305419921875, "KL/std": 2.190309524536133, "epoch": 0.15721844293272866, "fcm_dpo/beta": 0.3745134472846985, "fcm_dpo/delta": -0.03637208789587021, "fcm_dpo/margin": 1.1605020761489868, "fcm_dpo/q_t": 0.4128270149230957, "grad_norm": 108.48844146728516, "learning_rate": 4.954821743156767e-07, "logits/chosen": 0.18590426445007324, "logits/rejected": 0.10031882673501968, "logps/chosen": -63.70813751220703, "logps/ref_chosen": -60.99920654296875, "logps/ref_rejected": -98.84645080566406, "logps/rejected": -102.71588134765625, "loss": 1.1683, "margin_dpo/margin_mean": 1.1605031490325928, "margin_dpo/margin_std": 2.385338544845581, "step": 104 }, { "KL/chosen_KL_mean": -2.86285400390625, "KL/mean": -3.2976603507995605, "KL/rejected_KL_mean": -3.7324676513671875, "KL/std": 1.7748684883117676, "epoch": 0.15873015873015872, "fcm_dpo/beta": 0.37785446643829346, "fcm_dpo/delta": 0.07327243685722351, "fcm_dpo/margin": 0.8696129322052002, "fcm_dpo/q_t": 0.4284285306930542, "grad_norm": 114.82933044433594, "learning_rate": 4.952285105344791e-07, "logits/chosen": 0.1492481380701065, "logits/rejected": 0.09406997263431549, "logps/chosen": -73.81312561035156, "logps/ref_chosen": -70.95027160644531, "logps/ref_rejected": -87.88340759277344, "logps/rejected": -91.61587524414062, "loss": 1.2138, "margin_dpo/margin_mean": 0.8696126937866211, "margin_dpo/margin_std": 1.9789862632751465, "step": 105 }, { "KL/chosen_KL_mean": -2.6894569396972656, "KL/mean": -3.172947883605957, "KL/rejected_KL_mean": -3.6564407348632812, "KL/std": 1.8021878004074097, "epoch": 0.1602418745275888, "fcm_dpo/beta": 0.3791837692260742, "fcm_dpo/delta": 0.034607887268066406, "fcm_dpo/margin": 0.9669825434684753, "fcm_dpo/q_t": 0.4173208773136139, "grad_norm": 111.66971588134766, "learning_rate": 4.949679871846857e-07, "logits/chosen": 0.1852017343044281, "logits/rejected": 0.17121094465255737, "logps/chosen": -65.14878845214844, "logps/ref_chosen": -62.45933151245117, "logps/ref_rejected": -67.00595092773438, "logps/rejected": -70.66238403320312, "loss": 1.2061, "margin_dpo/margin_mean": 0.9669830203056335, "margin_dpo/margin_std": 2.168097972869873, "step": 106 }, { "KL/chosen_KL_mean": -3.152782440185547, "KL/mean": -3.4210877418518066, "KL/rejected_KL_mean": -3.68939208984375, "KL/std": 2.1011857986450195, "epoch": 0.1617535903250189, "fcm_dpo/beta": 0.3795863389968872, "fcm_dpo/delta": -0.011487273499369621, "fcm_dpo/margin": 0.5366103649139404, "fcm_dpo/q_t": 0.4552128314971924, "grad_norm": 144.89495849609375, "learning_rate": 4.947006115536947e-07, "logits/chosen": 0.1324639618396759, "logits/rejected": 0.1091371551156044, "logps/chosen": -78.99075317382812, "logps/ref_chosen": -75.83796691894531, "logps/ref_rejected": -87.74038696289062, "logps/rejected": -91.42977905273438, "loss": 1.3867, "margin_dpo/margin_mean": 0.5366103649139404, "margin_dpo/margin_std": 2.3823909759521484, "step": 107 }, { "KL/chosen_KL_mean": -2.808917999267578, "KL/mean": -3.367018938064575, "KL/rejected_KL_mean": -3.925121307373047, "KL/std": 1.635740041732788, "epoch": 0.16326530612244897, "fcm_dpo/beta": 0.37955912947654724, "fcm_dpo/delta": -0.024890627712011337, "fcm_dpo/margin": 1.1162042617797852, "fcm_dpo/q_t": 0.41028887033462524, "grad_norm": 100.42587280273438, "learning_rate": 4.944263911205772e-07, "logits/chosen": 0.12748947739601135, "logits/rejected": 0.09685810655355453, "logps/chosen": -71.20215606689453, "logps/ref_chosen": -68.39323425292969, "logps/ref_rejected": -83.24267578125, "logps/rejected": -87.16779327392578, "loss": 1.1615, "margin_dpo/margin_mean": 1.1162045001983643, "margin_dpo/margin_std": 2.1749002933502197, "step": 108 }, { "KL/chosen_KL_mean": -2.5797672271728516, "KL/mean": -3.326822519302368, "KL/rejected_KL_mean": -4.073879241943359, "KL/std": 1.9906002283096313, "epoch": 0.16477702191987906, "fcm_dpo/beta": 0.3688945174217224, "fcm_dpo/delta": -0.1599644124507904, "fcm_dpo/margin": 1.4941089153289795, "fcm_dpo/q_t": 0.38224801421165466, "grad_norm": 84.49877166748047, "learning_rate": 4.941453335558681e-07, "logits/chosen": 0.13400131464004517, "logits/rejected": 0.08052568882703781, "logps/chosen": -58.10724639892578, "logps/ref_chosen": -55.52748107910156, "logps/ref_rejected": -83.55218505859375, "logps/rejected": -87.62606811523438, "loss": 1.0232, "margin_dpo/margin_mean": 1.4941084384918213, "margin_dpo/margin_std": 2.015568256378174, "step": 109 }, { "KL/chosen_KL_mean": -2.9763832092285156, "KL/mean": -3.3232810497283936, "KL/rejected_KL_mean": -3.670177459716797, "KL/std": 1.8275551795959473, "epoch": 0.16628873771730915, "fcm_dpo/beta": 0.3677034378051758, "fcm_dpo/delta": 0.05152323469519615, "fcm_dpo/margin": 0.6937993764877319, "fcm_dpo/q_t": 0.4441373944282532, "grad_norm": 126.11421966552734, "learning_rate": 4.938574467213517e-07, "logits/chosen": 0.12569531798362732, "logits/rejected": 0.13502708077430725, "logps/chosen": -84.1351318359375, "logps/ref_chosen": -81.15874481201172, "logps/ref_rejected": -72.56021118164062, "logps/rejected": -76.23039245605469, "loss": 1.2935, "margin_dpo/margin_mean": 0.6937993764877319, "margin_dpo/margin_std": 2.1806392669677734, "step": 110 }, { "KL/chosen_KL_mean": -3.1503047943115234, "KL/mean": -3.660186290740967, "KL/rejected_KL_mean": -4.170066833496094, "KL/std": 1.851228952407837, "epoch": 0.16780045351473924, "fcm_dpo/beta": 0.37204888463020325, "fcm_dpo/delta": 0.021093428134918213, "fcm_dpo/margin": 1.0197619199752808, "fcm_dpo/q_t": 0.4174486994743347, "grad_norm": 95.4344253540039, "learning_rate": 4.935627386698418e-07, "logits/chosen": 0.23428216576576233, "logits/rejected": 0.19636960327625275, "logps/chosen": -55.5092887878418, "logps/ref_chosen": -52.358985900878906, "logps/ref_rejected": -77.06150817871094, "logps/rejected": -81.2315673828125, "loss": 1.1928, "margin_dpo/margin_mean": 1.0197620391845703, "margin_dpo/margin_std": 2.1588549613952637, "step": 111 }, { "KL/chosen_KL_mean": -2.7293567657470703, "KL/mean": -3.3851919174194336, "KL/rejected_KL_mean": -4.041027069091797, "KL/std": 1.8818962574005127, "epoch": 0.1693121693121693, "fcm_dpo/beta": 0.3664921522140503, "fcm_dpo/delta": -0.08479724079370499, "fcm_dpo/margin": 1.311669945716858, "fcm_dpo/q_t": 0.39188939332962036, "grad_norm": 103.5470199584961, "learning_rate": 4.932612176449559e-07, "logits/chosen": 0.13160556554794312, "logits/rejected": 0.06806673109531403, "logps/chosen": -65.74942016601562, "logps/ref_chosen": -63.02006530761719, "logps/ref_rejected": -111.36941528320312, "logps/rejected": -115.41044616699219, "loss": 1.0968, "margin_dpo/margin_mean": 1.3116694688796997, "margin_dpo/margin_std": 2.088703155517578, "step": 112 }, { "KL/chosen_KL_mean": -3.0512046813964844, "KL/mean": -3.5629920959472656, "KL/rejected_KL_mean": -4.074779510498047, "KL/std": 1.8660565614700317, "epoch": 0.1708238851095994, "fcm_dpo/beta": 0.3687834143638611, "fcm_dpo/delta": 0.02218996360898018, "fcm_dpo/margin": 1.023565411567688, "fcm_dpo/q_t": 0.4146605432033539, "grad_norm": 110.78216552734375, "learning_rate": 4.929528920808854e-07, "logits/chosen": 0.15470312535762787, "logits/rejected": 0.11601442843675613, "logps/chosen": -58.85886764526367, "logps/ref_chosen": -55.80766296386719, "logps/ref_rejected": -69.84014129638672, "logps/rejected": -73.9149169921875, "loss": 1.1921, "margin_dpo/margin_mean": 1.023565649986267, "margin_dpo/margin_std": 2.1081085205078125, "step": 113 }, { "KL/chosen_KL_mean": -2.608489990234375, "KL/mean": -3.454742431640625, "KL/rejected_KL_mean": -4.300998687744141, "KL/std": 2.0123279094696045, "epoch": 0.17233560090702948, "fcm_dpo/beta": 0.3527457118034363, "fcm_dpo/delta": -0.21197696030139923, "fcm_dpo/margin": 1.6924998760223389, "fcm_dpo/q_t": 0.3716784119606018, "grad_norm": 82.17540740966797, "learning_rate": 4.92637770602159e-07, "logits/chosen": 0.19436286389827728, "logits/rejected": 0.13452968001365662, "logps/chosen": -68.9412612915039, "logps/ref_chosen": -66.33277130126953, "logps/ref_rejected": -71.61489868164062, "logps/rejected": -75.9158935546875, "loss": 1.0058, "margin_dpo/margin_mean": 1.692500352859497, "margin_dpo/margin_std": 2.1777796745300293, "step": 114 }, { "KL/chosen_KL_mean": -2.9941158294677734, "KL/mean": -3.6165049076080322, "KL/rejected_KL_mean": -4.238897323608398, "KL/std": 1.8756771087646484, "epoch": 0.17384731670445955, "fcm_dpo/beta": 0.3499326705932617, "fcm_dpo/delta": -0.03719992935657501, "fcm_dpo/margin": 1.244775414466858, "fcm_dpo/q_t": 0.40905773639678955, "grad_norm": 95.71516418457031, "learning_rate": 4.923158620234019e-07, "logits/chosen": 0.15890584886074066, "logits/rejected": 0.0992494523525238, "logps/chosen": -58.74315643310547, "logps/ref_chosen": -55.74903869628906, "logps/ref_rejected": -79.59849548339844, "logps/rejected": -83.83739471435547, "loss": 1.1274, "margin_dpo/margin_mean": 1.2447755336761475, "margin_dpo/margin_std": 2.1931252479553223, "step": 115 }, { "KL/chosen_KL_mean": -2.815164566040039, "KL/mean": -3.583864450454712, "KL/rejected_KL_mean": -4.352565765380859, "KL/std": 1.9524595737457275, "epoch": 0.17535903250188964, "fcm_dpo/beta": 0.34256207942962646, "fcm_dpo/delta": -0.13351455330848694, "fcm_dpo/margin": 1.5374069213867188, "fcm_dpo/q_t": 0.38108181953430176, "grad_norm": 73.72758483886719, "learning_rate": 4.91987175349089e-07, "logits/chosen": 0.17657245695590973, "logits/rejected": 0.10971198976039886, "logps/chosen": -52.180328369140625, "logps/ref_chosen": -49.36516571044922, "logps/ref_rejected": -72.84671020507812, "logps/rejected": -77.19927215576172, "loss": 1.0193, "margin_dpo/margin_mean": 1.5374069213867188, "margin_dpo/margin_std": 1.8536643981933594, "step": 116 }, { "KL/chosen_KL_mean": -2.679628372192383, "KL/mean": -3.292019844055176, "KL/rejected_KL_mean": -3.9044113159179688, "KL/std": 1.817359209060669, "epoch": 0.17687074829931973, "fcm_dpo/beta": 0.33216163516044617, "fcm_dpo/delta": -0.10976561903953552, "fcm_dpo/margin": 1.224778413772583, "fcm_dpo/q_t": 0.4070253074169159, "grad_norm": 81.04077911376953, "learning_rate": 4.916517197732933e-07, "logits/chosen": 0.17474356293678284, "logits/rejected": 0.13906052708625793, "logps/chosen": -60.390525817871094, "logps/ref_chosen": -57.710899353027344, "logps/ref_rejected": -69.77253723144531, "logps/rejected": -73.67694854736328, "loss": 1.1357, "margin_dpo/margin_mean": 1.2247787714004517, "margin_dpo/margin_std": 2.056997776031494, "step": 117 }, { "KL/chosen_KL_mean": -2.587177276611328, "KL/mean": -3.387826442718506, "KL/rejected_KL_mean": -4.188468933105469, "KL/std": 1.9094345569610596, "epoch": 0.17838246409674982, "fcm_dpo/beta": 0.3244737982749939, "fcm_dpo/delta": -0.12712615728378296, "fcm_dpo/margin": 1.6012959480285645, "fcm_dpo/q_t": 0.38269591331481934, "grad_norm": 78.91576385498047, "learning_rate": 4.913095046794281e-07, "logits/chosen": 0.24776798486709595, "logits/rejected": 0.20680418610572815, "logps/chosen": -55.06707763671875, "logps/ref_chosen": -52.479896545410156, "logps/ref_rejected": -81.359130859375, "logps/rejected": -85.54759216308594, "loss": 1.032, "margin_dpo/margin_mean": 1.601295828819275, "margin_dpo/margin_std": 2.008662223815918, "step": 118 }, { "KL/chosen_KL_mean": -3.4319400787353516, "KL/mean": -3.953449249267578, "KL/rejected_KL_mean": -4.474956512451172, "KL/std": 2.010892629623413, "epoch": 0.17989417989417988, "fcm_dpo/beta": 0.32741856575012207, "fcm_dpo/delta": 0.06035677716135979, "fcm_dpo/margin": 1.043018102645874, "fcm_dpo/q_t": 0.42501676082611084, "grad_norm": 88.64505004882812, "learning_rate": 4.909605396399855e-07, "logits/chosen": 0.15717501938343048, "logits/rejected": 0.11950940638780594, "logps/chosen": -64.78961181640625, "logps/ref_chosen": -61.35767364501953, "logps/ref_rejected": -75.71510314941406, "logps/rejected": -80.19005584716797, "loss": 1.2151, "margin_dpo/margin_mean": 1.0430182218551636, "margin_dpo/margin_std": 2.3741917610168457, "step": 119 }, { "KL/chosen_KL_mean": -2.8419265747070312, "KL/mean": -3.633551597595215, "KL/rejected_KL_mean": -4.425178527832031, "KL/std": 1.9663417339324951, "epoch": 0.18140589569160998, "fcm_dpo/beta": 0.3221905827522278, "fcm_dpo/delta": -0.1160150021314621, "fcm_dpo/margin": 1.5832512378692627, "fcm_dpo/q_t": 0.3869457244873047, "grad_norm": 78.55064392089844, "learning_rate": 4.906048344162676e-07, "logits/chosen": 0.1555761843919754, "logits/rejected": 0.09702657908201218, "logps/chosen": -62.74949645996094, "logps/ref_chosen": -59.907569885253906, "logps/ref_rejected": -79.6910629272461, "logps/rejected": -84.11624145507812, "loss": 1.0397, "margin_dpo/margin_mean": 1.5832513570785522, "margin_dpo/margin_std": 2.069026470184326, "step": 120 }, { "KL/chosen_KL_mean": -3.206563949584961, "KL/mean": -3.898369312286377, "KL/rejected_KL_mean": -4.590179443359375, "KL/std": 2.03234601020813, "epoch": 0.18291761148904007, "fcm_dpo/beta": 0.31733065843582153, "fcm_dpo/delta": -0.041163019835948944, "fcm_dpo/margin": 1.3836127519607544, "fcm_dpo/q_t": 0.40332311391830444, "grad_norm": 73.39022064208984, "learning_rate": 4.902423989581143e-07, "logits/chosen": 0.260642945766449, "logits/rejected": 0.1762283444404602, "logps/chosen": -58.87260437011719, "logps/ref_chosen": -55.66604232788086, "logps/ref_rejected": -101.56233978271484, "logps/rejected": -106.15251922607422, "loss": 1.0848, "margin_dpo/margin_mean": 1.3836126327514648, "margin_dpo/margin_std": 2.011979818344116, "step": 121 }, { "KL/chosen_KL_mean": -3.2638092041015625, "KL/mean": -4.106398582458496, "KL/rejected_KL_mean": -4.94898796081543, "KL/std": 2.240880012512207, "epoch": 0.18442932728647016, "fcm_dpo/beta": 0.31221216917037964, "fcm_dpo/delta": -0.13292989134788513, "fcm_dpo/margin": 1.6851718425750732, "fcm_dpo/q_t": 0.3885393738746643, "grad_norm": 84.64109802246094, "learning_rate": 4.898732434036243e-07, "logits/chosen": 0.17164123058319092, "logits/rejected": 0.1370231956243515, "logps/chosen": -66.59818267822266, "logps/ref_chosen": -63.334373474121094, "logps/ref_rejected": -73.67523193359375, "logps/rejected": -78.62422180175781, "loss": 1.0543, "margin_dpo/margin_mean": 1.6851723194122314, "margin_dpo/margin_std": 2.4770126342773438, "step": 122 }, { "KL/chosen_KL_mean": -3.0445899963378906, "KL/mean": -3.7831180095672607, "KL/rejected_KL_mean": -4.521644592285156, "KL/std": 2.076758861541748, "epoch": 0.18594104308390022, "fcm_dpo/beta": 0.3086293935775757, "fcm_dpo/delta": -0.059040434658527374, "fcm_dpo/margin": 1.4770545959472656, "fcm_dpo/q_t": 0.39844340085983276, "grad_norm": 77.71772766113281, "learning_rate": 4.894973780788722e-07, "logits/chosen": 0.19182562828063965, "logits/rejected": 0.150858074426651, "logps/chosen": -59.94333267211914, "logps/ref_chosen": -56.89874267578125, "logps/ref_rejected": -78.97028350830078, "logps/rejected": -83.49192810058594, "loss": 1.1071, "margin_dpo/margin_mean": 1.4770545959472656, "margin_dpo/margin_std": 2.3821732997894287, "step": 123 }, { "KL/chosen_KL_mean": -3.3188209533691406, "KL/mean": -4.330348014831543, "KL/rejected_KL_mean": -5.341876983642578, "KL/std": 2.195726156234741, "epoch": 0.1874527588813303, "fcm_dpo/beta": 0.29499292373657227, "fcm_dpo/delta": -0.21054358780384064, "fcm_dpo/margin": 2.0230560302734375, "fcm_dpo/q_t": 0.36717766523361206, "grad_norm": 64.97583770751953, "learning_rate": 4.89114813497619e-07, "logits/chosen": 0.20473948121070862, "logits/rejected": 0.14627373218536377, "logps/chosen": -60.434906005859375, "logps/ref_chosen": -57.116085052490234, "logps/ref_rejected": -87.93074035644531, "logps/rejected": -93.27262115478516, "loss": 0.9775, "margin_dpo/margin_mean": 2.0230560302734375, "margin_dpo/margin_std": 2.260397434234619, "step": 124 }, { "KL/chosen_KL_mean": -3.5366687774658203, "KL/mean": -4.391350746154785, "KL/rejected_KL_mean": -5.246028900146484, "KL/std": 2.463884115219116, "epoch": 0.1889644746787604, "fcm_dpo/beta": 0.28741711378097534, "fcm_dpo/delta": -0.09700541943311691, "fcm_dpo/margin": 1.7093640565872192, "fcm_dpo/q_t": 0.3919650614261627, "grad_norm": 72.87663269042969, "learning_rate": 4.887255603610184e-07, "logits/chosen": 0.2485215663909912, "logits/rejected": 0.189075767993927, "logps/chosen": -69.24284362792969, "logps/ref_chosen": -65.7061767578125, "logps/ref_rejected": -91.72711944580078, "logps/rejected": -96.97314453125, "loss": 1.0619, "margin_dpo/margin_mean": 1.709364414215088, "margin_dpo/margin_std": 2.397721767425537, "step": 125 }, { "KL/chosen_KL_mean": -3.3044567108154297, "KL/mean": -3.940857410430908, "KL/rejected_KL_mean": -4.577259063720703, "KL/std": 2.6586263179779053, "epoch": 0.19047619047619047, "fcm_dpo/beta": 0.2884438931941986, "fcm_dpo/delta": 0.03412620350718498, "fcm_dpo/margin": 1.2727997303009033, "fcm_dpo/q_t": 0.42388850450515747, "grad_norm": 71.63743591308594, "learning_rate": 4.883296295573176e-07, "logits/chosen": 0.03753478080034256, "logits/rejected": 0.03161387890577316, "logps/chosen": -71.48054504394531, "logps/ref_chosen": -68.17608642578125, "logps/ref_rejected": -65.1175537109375, "logps/rejected": -69.69480895996094, "loss": 1.1999, "margin_dpo/margin_mean": 1.2727999687194824, "margin_dpo/margin_std": 2.8984875679016113, "step": 126 }, { "KL/chosen_KL_mean": -3.549501419067383, "KL/mean": -4.510357856750488, "KL/rejected_KL_mean": -5.471212387084961, "KL/std": 2.167954444885254, "epoch": 0.19198790627362056, "fcm_dpo/beta": 0.2807749807834625, "fcm_dpo/delta": -0.1500011682510376, "fcm_dpo/margin": 1.921714186668396, "fcm_dpo/q_t": 0.37806618213653564, "grad_norm": 65.7569351196289, "learning_rate": 4.87927032161552e-07, "logits/chosen": 0.15910570323467255, "logits/rejected": 0.12889736890792847, "logps/chosen": -65.42973327636719, "logps/ref_chosen": -61.88023376464844, "logps/ref_rejected": -68.46012878417969, "logps/rejected": -73.93133544921875, "loss": 1.0062, "margin_dpo/margin_mean": 1.921714186668396, "margin_dpo/margin_std": 2.158545970916748, "step": 127 }, { "KL/chosen_KL_mean": -3.7951793670654297, "KL/mean": -4.584693908691406, "KL/rejected_KL_mean": -5.374214172363281, "KL/std": 2.567638874053955, "epoch": 0.19349962207105065, "fcm_dpo/beta": 0.27851366996765137, "fcm_dpo/delta": -0.04178363084793091, "fcm_dpo/margin": 1.579025149345398, "fcm_dpo/q_t": 0.4044601321220398, "grad_norm": 73.24718475341797, "learning_rate": 4.875177794352363e-07, "logits/chosen": 0.19215698540210724, "logits/rejected": 0.13608311116695404, "logps/chosen": -70.50416564941406, "logps/ref_chosen": -66.708984375, "logps/ref_rejected": -94.97969055175781, "logps/rejected": -100.3539047241211, "loss": 1.1428, "margin_dpo/margin_mean": 1.579025387763977, "margin_dpo/margin_std": 2.9122214317321777, "step": 128 }, { "KL/chosen_KL_mean": -4.064935684204102, "KL/mean": -4.7622389793396, "KL/rejected_KL_mean": -5.459545135498047, "KL/std": 2.355567455291748, "epoch": 0.19501133786848074, "fcm_dpo/beta": 0.2787662148475647, "fcm_dpo/delta": 0.011680129915475845, "fcm_dpo/margin": 1.394613265991211, "fcm_dpo/q_t": 0.4161040186882019, "grad_norm": 79.67732238769531, "learning_rate": 4.871018828260491e-07, "logits/chosen": 0.16714146733283997, "logits/rejected": 0.1604004204273224, "logps/chosen": -69.40376281738281, "logps/ref_chosen": -65.33882904052734, "logps/ref_rejected": -68.06109619140625, "logps/rejected": -73.52063751220703, "loss": 1.1585, "margin_dpo/margin_mean": 1.394613265991211, "margin_dpo/margin_std": 2.656960964202881, "step": 129 }, { "KL/chosen_KL_mean": -3.8464221954345703, "KL/mean": -4.694123268127441, "KL/rejected_KL_mean": -5.5418243408203125, "KL/std": 2.2987711429595947, "epoch": 0.1965230536659108, "fcm_dpo/beta": 0.2733253240585327, "fcm_dpo/delta": -0.06958719342947006, "fcm_dpo/margin": 1.6954036951065063, "fcm_dpo/q_t": 0.3956819772720337, "grad_norm": 77.89653778076172, "learning_rate": 4.866793539675126e-07, "logits/chosen": 0.14793270826339722, "logits/rejected": 0.09856880456209183, "logps/chosen": -62.50716781616211, "logps/ref_chosen": -58.660743713378906, "logps/ref_rejected": -79.24510192871094, "logps/rejected": -84.78692626953125, "loss": 1.0656, "margin_dpo/margin_mean": 1.6954035758972168, "margin_dpo/margin_std": 2.1648051738739014, "step": 130 }, { "KL/chosen_KL_mean": -3.7749671936035156, "KL/mean": -4.794083595275879, "KL/rejected_KL_mean": -5.813201904296875, "KL/std": 2.7752933502197266, "epoch": 0.1980347694633409, "fcm_dpo/beta": 0.2693408131599426, "fcm_dpo/delta": -0.15744194388389587, "fcm_dpo/margin": 2.0382328033447266, "fcm_dpo/q_t": 0.3810405433177948, "grad_norm": 65.14391326904297, "learning_rate": 4.86250204678667e-07, "logits/chosen": 0.173618346452713, "logits/rejected": 0.10944752395153046, "logps/chosen": -56.28950500488281, "logps/ref_chosen": -52.51453399658203, "logps/ref_rejected": -85.18299865722656, "logps/rejected": -90.99620056152344, "loss": 1.0637, "margin_dpo/margin_mean": 2.0382332801818848, "margin_dpo/margin_std": 3.0586225986480713, "step": 131 }, { "KL/chosen_KL_mean": -4.116878509521484, "KL/mean": -5.01156759262085, "KL/rejected_KL_mean": -5.906259536743164, "KL/std": 2.63340425491333, "epoch": 0.19954648526077098, "fcm_dpo/beta": 0.2637348175048828, "fcm_dpo/delta": -0.07549773156642914, "fcm_dpo/margin": 1.7893848419189453, "fcm_dpo/q_t": 0.3935900330543518, "grad_norm": 68.34086608886719, "learning_rate": 4.858144469637408e-07, "logits/chosen": 0.23281046748161316, "logits/rejected": 0.20056718587875366, "logps/chosen": -69.80200958251953, "logps/ref_chosen": -65.68513488769531, "logps/ref_rejected": -69.54120635986328, "logps/rejected": -75.44746398925781, "loss": 1.0935, "margin_dpo/margin_mean": 1.7893848419189453, "margin_dpo/margin_std": 2.7972469329833984, "step": 132 }, { "KL/chosen_KL_mean": -4.085521697998047, "KL/mean": -4.918117523193359, "KL/rejected_KL_mean": -5.750713348388672, "KL/std": 2.594369411468506, "epoch": 0.20105820105820105, "fcm_dpo/beta": 0.2613708972930908, "fcm_dpo/delta": -0.036830320954322815, "fcm_dpo/margin": 1.6651947498321533, "fcm_dpo/q_t": 0.4029567837715149, "grad_norm": 69.82433319091797, "learning_rate": 4.853720930118138e-07, "logits/chosen": 0.13591939210891724, "logits/rejected": 0.12657515704631805, "logps/chosen": -67.68363952636719, "logps/ref_chosen": -63.598114013671875, "logps/ref_rejected": -73.72798156738281, "logps/rejected": -79.47869873046875, "loss": 1.1048, "margin_dpo/margin_mean": 1.6651947498321533, "margin_dpo/margin_std": 2.6259703636169434, "step": 133 }, { "KL/chosen_KL_mean": -3.926084518432617, "KL/mean": -5.143423080444336, "KL/rejected_KL_mean": -6.3607635498046875, "KL/std": 2.994990110397339, "epoch": 0.20256991685563114, "fcm_dpo/beta": 0.25156599283218384, "fcm_dpo/delta": -0.22704170644283295, "fcm_dpo/margin": 2.434678077697754, "fcm_dpo/q_t": 0.36555933952331543, "grad_norm": 57.35163497924805, "learning_rate": 4.849231551964771e-07, "logits/chosen": 0.2282027304172516, "logits/rejected": 0.17454509437084198, "logps/chosen": -57.72065734863281, "logps/ref_chosen": -53.79457092285156, "logps/ref_rejected": -74.16741943359375, "logps/rejected": -80.52818298339844, "loss": 0.9806, "margin_dpo/margin_mean": 2.4346795082092285, "margin_dpo/margin_std": 2.905587911605835, "step": 134 }, { "KL/chosen_KL_mean": -3.9533214569091797, "KL/mean": -4.761846542358398, "KL/rejected_KL_mean": -5.570377349853516, "KL/std": 2.8324992656707764, "epoch": 0.20408163265306123, "fcm_dpo/beta": 0.24910268187522888, "fcm_dpo/delta": -0.0029318425804376602, "fcm_dpo/margin": 1.6170556545257568, "fcm_dpo/q_t": 0.4120703339576721, "grad_norm": 57.255069732666016, "learning_rate": 4.844676460754862e-07, "logits/chosen": 0.1793169230222702, "logits/rejected": 0.14716970920562744, "logps/chosen": -53.3943977355957, "logps/ref_chosen": -49.441078186035156, "logps/ref_rejected": -65.96878051757812, "logps/rejected": -71.53915405273438, "loss": 1.1396, "margin_dpo/margin_mean": 1.6170554161071777, "margin_dpo/margin_std": 2.8914742469787598, "step": 135 }, { "KL/chosen_KL_mean": -4.708171844482422, "KL/mean": -5.69216775894165, "KL/rejected_KL_mean": -6.676166534423828, "KL/std": 3.125887870788574, "epoch": 0.20559334845049132, "fcm_dpo/beta": 0.2455032616853714, "fcm_dpo/delta": -0.08755337446928024, "fcm_dpo/margin": 1.9679951667785645, "fcm_dpo/q_t": 0.40462052822113037, "grad_norm": 73.57177734375, "learning_rate": 4.840055783904106e-07, "logits/chosen": 0.17779377102851868, "logits/rejected": 0.10765485465526581, "logps/chosen": -71.46743774414062, "logps/ref_chosen": -66.75926208496094, "logps/ref_rejected": -94.61787414550781, "logps/rejected": -101.29403686523438, "loss": 1.1648, "margin_dpo/margin_mean": 1.96799635887146, "margin_dpo/margin_std": 4.027059555053711, "step": 136 }, { "KL/chosen_KL_mean": -4.485187530517578, "KL/mean": -5.504098892211914, "KL/rejected_KL_mean": -6.523014068603516, "KL/std": 2.911930561065674, "epoch": 0.20710506424792138, "fcm_dpo/beta": 0.24210922420024872, "fcm_dpo/delta": -0.09813450276851654, "fcm_dpo/margin": 2.037825345993042, "fcm_dpo/q_t": 0.3939235806465149, "grad_norm": 57.26094436645508, "learning_rate": 4.835369650662767e-07, "logits/chosen": 0.19808395206928253, "logits/rejected": 0.170088991522789, "logps/chosen": -61.268985748291016, "logps/ref_chosen": -56.78379821777344, "logps/ref_rejected": -69.89952087402344, "logps/rejected": -76.42253112792969, "loss": 1.0878, "margin_dpo/margin_mean": 2.037825345993042, "margin_dpo/margin_std": 3.2442502975463867, "step": 137 }, { "KL/chosen_KL_mean": -5.205926895141602, "KL/mean": -6.052507400512695, "KL/rejected_KL_mean": -6.8990936279296875, "KL/std": 3.0128889083862305, "epoch": 0.20861678004535147, "fcm_dpo/beta": 0.23976297676563263, "fcm_dpo/delta": -0.006211782805621624, "fcm_dpo/margin": 1.6931648254394531, "fcm_dpo/q_t": 0.4101250171661377, "grad_norm": 60.49470520019531, "learning_rate": 4.830618192112065e-07, "logits/chosen": 0.1866264045238495, "logits/rejected": 0.1513412892818451, "logps/chosen": -63.97193908691406, "logps/ref_chosen": -58.766014099121094, "logps/ref_rejected": -68.12371826171875, "logps/rejected": -75.02281188964844, "loss": 1.1522, "margin_dpo/margin_mean": 1.6931647062301636, "margin_dpo/margin_std": 3.155078887939453, "step": 138 }, { "KL/chosen_KL_mean": -5.2599029541015625, "KL/mean": -6.343092918395996, "KL/rejected_KL_mean": -7.426288604736328, "KL/std": 2.9821972846984863, "epoch": 0.21012849584278157, "fcm_dpo/beta": 0.2368057668209076, "fcm_dpo/delta": -0.11907797306776047, "fcm_dpo/margin": 2.1663827896118164, "fcm_dpo/q_t": 0.3863199055194855, "grad_norm": 62.60260009765625, "learning_rate": 4.825801541160509e-07, "logits/chosen": 0.15238192677497864, "logits/rejected": 0.12422216683626175, "logps/chosen": -76.48548889160156, "logps/ref_chosen": -71.2255859375, "logps/ref_rejected": -82.1834716796875, "logps/rejected": -89.60975646972656, "loss": 1.0454, "margin_dpo/margin_mean": 2.1663827896118164, "margin_dpo/margin_std": 2.890249013900757, "step": 139 }, { "KL/chosen_KL_mean": -4.85059928894043, "KL/mean": -6.238257884979248, "KL/rejected_KL_mean": -7.62591552734375, "KL/std": 3.931154251098633, "epoch": 0.21164021164021163, "fcm_dpo/beta": 0.22562667727470398, "fcm_dpo/delta": -0.2415589690208435, "fcm_dpo/margin": 2.77531361579895, "fcm_dpo/q_t": 0.36584708094596863, "grad_norm": 64.5361099243164, "learning_rate": 4.820919832540181e-07, "logits/chosen": 0.17804387211799622, "logits/rejected": 0.13242071866989136, "logps/chosen": -68.12826538085938, "logps/ref_chosen": -63.27766418457031, "logps/ref_rejected": -83.30647277832031, "logps/rejected": -90.93238830566406, "loss": 1.0538, "margin_dpo/margin_mean": 2.775313138961792, "margin_dpo/margin_std": 4.248003005981445, "step": 140 }, { "KL/chosen_KL_mean": -5.133699417114258, "KL/mean": -6.48502254486084, "KL/rejected_KL_mean": -7.8363494873046875, "KL/std": 3.323653221130371, "epoch": 0.21315192743764172, "fcm_dpo/beta": 0.21512871980667114, "fcm_dpo/delta": -0.19506794214248657, "fcm_dpo/margin": 2.7026472091674805, "fcm_dpo/q_t": 0.3796807825565338, "grad_norm": 57.2955207824707, "learning_rate": 4.815973202802966e-07, "logits/chosen": 0.21243543922901154, "logits/rejected": 0.16858990490436554, "logps/chosen": -66.90046691894531, "logps/ref_chosen": -61.76676940917969, "logps/ref_rejected": -88.60601806640625, "logps/rejected": -96.44236755371094, "loss": 1.055, "margin_dpo/margin_mean": 2.7026472091674805, "margin_dpo/margin_std": 4.074060440063477, "step": 141 }, { "KL/chosen_KL_mean": -5.233737945556641, "KL/mean": -6.267353057861328, "KL/rejected_KL_mean": -7.300971984863281, "KL/std": 3.2057394981384277, "epoch": 0.2146636432350718, "fcm_dpo/beta": 0.21345248818397522, "fcm_dpo/delta": -0.04314158111810684, "fcm_dpo/margin": 2.0672333240509033, "fcm_dpo/q_t": 0.40279993414878845, "grad_norm": 54.912445068359375, "learning_rate": 4.810961790316729e-07, "logits/chosen": 0.2118801474571228, "logits/rejected": 0.1868029087781906, "logps/chosen": -70.50851440429688, "logps/ref_chosen": -65.2747802734375, "logps/ref_rejected": -81.1378173828125, "logps/rejected": -88.43878936767578, "loss": 1.1032, "margin_dpo/margin_mean": 2.0672333240509033, "margin_dpo/margin_std": 3.2750396728515625, "step": 142 }, { "KL/chosen_KL_mean": -5.515165328979492, "KL/mean": -6.453474998474121, "KL/rejected_KL_mean": -7.391780853271484, "KL/std": 3.489405632019043, "epoch": 0.2161753590325019, "fcm_dpo/beta": 0.21241332590579987, "fcm_dpo/delta": 0.001401122659444809, "fcm_dpo/margin": 1.876617431640625, "fcm_dpo/q_t": 0.41124582290649414, "grad_norm": 67.3259506225586, "learning_rate": 4.805885735261454e-07, "logits/chosen": 0.20787885785102844, "logits/rejected": 0.1913605034351349, "logps/chosen": -68.13299560546875, "logps/ref_chosen": -62.617828369140625, "logps/ref_rejected": -70.39239501953125, "logps/rejected": -77.78417205810547, "loss": 1.1985, "margin_dpo/margin_mean": 1.876617670059204, "margin_dpo/margin_std": 4.119785308837891, "step": 143 }, { "KL/chosen_KL_mean": -5.80128288269043, "KL/mean": -6.973703384399414, "KL/rejected_KL_mean": -8.146125793457031, "KL/std": 3.674846649169922, "epoch": 0.21768707482993196, "fcm_dpo/beta": 0.20906080305576324, "fcm_dpo/delta": -0.09537584334611893, "fcm_dpo/margin": 2.3448352813720703, "fcm_dpo/q_t": 0.39523231983184814, "grad_norm": 57.71143341064453, "learning_rate": 4.800745179625307e-07, "logits/chosen": 0.18431693315505981, "logits/rejected": 0.1551814079284668, "logps/chosen": -66.60397338867188, "logps/ref_chosen": -60.80268859863281, "logps/ref_rejected": -79.07284545898438, "logps/rejected": -87.21896362304688, "loss": 1.1129, "margin_dpo/margin_mean": 2.3448359966278076, "margin_dpo/margin_std": 4.056647777557373, "step": 144 }, { "KL/chosen_KL_mean": -5.424762725830078, "KL/mean": -6.619411468505859, "KL/rejected_KL_mean": -7.814060211181641, "KL/std": 3.728973865509033, "epoch": 0.21919879062736206, "fcm_dpo/beta": 0.20575754344463348, "fcm_dpo/delta": -0.09634465724229813, "fcm_dpo/margin": 2.389300584793091, "fcm_dpo/q_t": 0.39475393295288086, "grad_norm": 61.22201156616211, "learning_rate": 4.795540267200686e-07, "logits/chosen": 0.14887337386608124, "logits/rejected": 0.1657814085483551, "logps/chosen": -80.03622436523438, "logps/ref_chosen": -74.61146545410156, "logps/ref_rejected": -83.24461364746094, "logps/rejected": -91.05867004394531, "loss": 1.1167, "margin_dpo/margin_mean": 2.3893003463745117, "margin_dpo/margin_std": 4.214175224304199, "step": 145 }, { "KL/chosen_KL_mean": -5.140588760375977, "KL/mean": -6.458826541900635, "KL/rejected_KL_mean": -7.777061462402344, "KL/std": 3.671992063522339, "epoch": 0.22071050642479215, "fcm_dpo/beta": 0.20110957324504852, "fcm_dpo/delta": -0.1372949779033661, "fcm_dpo/margin": 2.636476516723633, "fcm_dpo/q_t": 0.38520655035972595, "grad_norm": 52.14006423950195, "learning_rate": 4.790271143580173e-07, "logits/chosen": 0.1352338343858719, "logits/rejected": 0.12038532644510269, "logps/chosen": -62.98157501220703, "logps/ref_chosen": -57.84098434448242, "logps/ref_rejected": -67.47422790527344, "logps/rejected": -75.25128936767578, "loss": 1.0543, "margin_dpo/margin_mean": 2.6364758014678955, "margin_dpo/margin_std": 3.813873052597046, "step": 146 }, { "KL/chosen_KL_mean": -6.244224548339844, "KL/mean": -7.27488374710083, "KL/rejected_KL_mean": -8.305549621582031, "KL/std": 3.919419765472412, "epoch": 0.2222222222222222, "fcm_dpo/beta": 0.1990683525800705, "fcm_dpo/delta": -0.010799014940857887, "fcm_dpo/margin": 2.061326265335083, "fcm_dpo/q_t": 0.4108003079891205, "grad_norm": 66.64073181152344, "learning_rate": 4.784937956152489e-07, "logits/chosen": 0.18059472739696503, "logits/rejected": 0.13869327306747437, "logps/chosen": -73.05769348144531, "logps/ref_chosen": -66.81346893310547, "logps/ref_rejected": -81.1796875, "logps/rejected": -89.48524475097656, "loss": 1.1795, "margin_dpo/margin_mean": 2.061326026916504, "margin_dpo/margin_std": 4.308503150939941, "step": 147 }, { "KL/chosen_KL_mean": -5.884458541870117, "KL/mean": -7.247371673583984, "KL/rejected_KL_mean": -8.610279083251953, "KL/std": 3.7114810943603516, "epoch": 0.2237339380196523, "fcm_dpo/beta": 0.19570748507976532, "fcm_dpo/delta": -0.14082609117031097, "fcm_dpo/margin": 2.7258195877075195, "fcm_dpo/q_t": 0.3858853578567505, "grad_norm": 43.99631118774414, "learning_rate": 4.779540854098347e-07, "logits/chosen": 0.30080389976501465, "logits/rejected": 0.22915518283843994, "logps/chosen": -54.57221221923828, "logps/ref_chosen": -48.6877555847168, "logps/ref_rejected": -67.50503540039062, "logps/rejected": -76.11531066894531, "loss": 1.0789, "margin_dpo/margin_mean": 2.7258191108703613, "margin_dpo/margin_std": 4.251043319702148, "step": 148 }, { "KL/chosen_KL_mean": -7.186712265014648, "KL/mean": -8.830345153808594, "KL/rejected_KL_mean": -10.473979949951172, "KL/std": 4.486575603485107, "epoch": 0.2252456538170824, "fcm_dpo/beta": 0.1870003193616867, "fcm_dpo/delta": -0.228560209274292, "fcm_dpo/margin": 3.2872657775878906, "fcm_dpo/q_t": 0.3655872642993927, "grad_norm": 44.14234924316406, "learning_rate": 4.774079988386296e-07, "logits/chosen": 0.18786531686782837, "logits/rejected": 0.1401975154876709, "logps/chosen": -62.33049011230469, "logps/ref_chosen": -55.143775939941406, "logps/ref_rejected": -64.79888916015625, "logps/rejected": -75.27287292480469, "loss": 0.9905, "margin_dpo/margin_mean": 3.2872653007507324, "margin_dpo/margin_std": 4.089900016784668, "step": 149 }, { "KL/chosen_KL_mean": -5.73365592956543, "KL/mean": -7.739154815673828, "KL/rejected_KL_mean": -9.744651794433594, "KL/std": 4.402659893035889, "epoch": 0.22675736961451248, "fcm_dpo/beta": 0.17568576335906982, "fcm_dpo/delta": -0.328109472990036, "fcm_dpo/margin": 4.0110015869140625, "fcm_dpo/q_t": 0.3461223244667053, "grad_norm": 41.878448486328125, "learning_rate": 4.768555511768486e-07, "logits/chosen": 0.17943960428237915, "logits/rejected": 0.13765186071395874, "logps/chosen": -73.20440673828125, "logps/ref_chosen": -67.47074890136719, "logps/ref_rejected": -89.21170806884766, "logps/rejected": -98.95635986328125, "loss": 0.9237, "margin_dpo/margin_mean": 4.011001110076904, "margin_dpo/margin_std": 4.230114936828613, "step": 150 }, { "KL/chosen_KL_mean": -5.552589416503906, "KL/mean": -7.6664886474609375, "KL/rejected_KL_mean": -9.780387878417969, "KL/std": 4.4994049072265625, "epoch": 0.22826908541194255, "fcm_dpo/beta": 0.16503149271011353, "fcm_dpo/delta": -0.31979477405548096, "fcm_dpo/margin": 4.227801322937012, "fcm_dpo/q_t": 0.34837108850479126, "grad_norm": 37.085697174072266, "learning_rate": 4.762967578776406e-07, "logits/chosen": 0.17865802347660065, "logits/rejected": 0.12742634117603302, "logps/chosen": -58.01213073730469, "logps/ref_chosen": -52.45954132080078, "logps/ref_rejected": -79.0630111694336, "logps/rejected": -88.84339904785156, "loss": 0.9289, "margin_dpo/margin_mean": 4.2278008460998535, "margin_dpo/margin_std": 4.604149341583252, "step": 151 }, { "KL/chosen_KL_mean": -7.170042037963867, "KL/mean": -8.724004745483398, "KL/rejected_KL_mean": -10.277965545654297, "KL/std": 4.563355445861816, "epoch": 0.22978080120937264, "fcm_dpo/beta": 0.16083967685699463, "fcm_dpo/delta": -0.10548294335603714, "fcm_dpo/margin": 3.1079230308532715, "fcm_dpo/q_t": 0.394059956073761, "grad_norm": 43.28002166748047, "learning_rate": 4.757316345716553e-07, "logits/chosen": 0.2588500380516052, "logits/rejected": 0.2083607017993927, "logps/chosen": -63.723876953125, "logps/ref_chosen": -56.5538330078125, "logps/ref_rejected": -76.55074310302734, "logps/rejected": -86.82870483398438, "loss": 1.1054, "margin_dpo/margin_mean": 3.1079227924346924, "margin_dpo/margin_std": 5.150308609008789, "step": 152 }, { "KL/chosen_KL_mean": -6.822696685791016, "KL/mean": -8.533772468566895, "KL/rejected_KL_mean": -10.24484634399414, "KL/std": 4.866215705871582, "epoch": 0.23129251700680273, "fcm_dpo/beta": 0.1553048938512802, "fcm_dpo/delta": -0.13909754157066345, "fcm_dpo/margin": 3.422149181365967, "fcm_dpo/q_t": 0.3812229335308075, "grad_norm": 39.0296630859375, "learning_rate": 4.751601970666064e-07, "logits/chosen": 0.1727154552936554, "logits/rejected": 0.13628573715686798, "logps/chosen": -74.82958984375, "logps/ref_chosen": -68.00689697265625, "logps/ref_rejected": -74.83482360839844, "logps/rejected": -85.07966613769531, "loss": 1.0252, "margin_dpo/margin_mean": 3.422149896621704, "margin_dpo/margin_std": 4.313758850097656, "step": 153 }, { "KL/chosen_KL_mean": -7.823060989379883, "KL/mean": -9.09119987487793, "KL/rejected_KL_mean": -10.35934066772461, "KL/std": 4.949188232421875, "epoch": 0.2328042328042328, "fcm_dpo/beta": 0.15466338396072388, "fcm_dpo/delta": 0.008039739914238453, "fcm_dpo/margin": 2.5362794399261475, "fcm_dpo/q_t": 0.4132029414176941, "grad_norm": 40.396934509277344, "learning_rate": 4.745824613468292e-07, "logits/chosen": 0.2506559491157532, "logits/rejected": 0.24754533171653748, "logps/chosen": -67.04560089111328, "logps/ref_chosen": -59.222537994384766, "logps/ref_rejected": -64.19131469726562, "logps/rejected": -74.5506591796875, "loss": 1.181, "margin_dpo/margin_mean": 2.5362792015075684, "margin_dpo/margin_std": 5.215124130249023, "step": 154 }, { "KL/chosen_KL_mean": -7.939079284667969, "KL/mean": -9.668106079101562, "KL/rejected_KL_mean": -11.39712905883789, "KL/std": 4.829972267150879, "epoch": 0.23431594860166288, "fcm_dpo/beta": 0.1508539468050003, "fcm_dpo/delta": -0.12975244224071503, "fcm_dpo/margin": 3.4580554962158203, "fcm_dpo/q_t": 0.3903197944164276, "grad_norm": 41.401206970214844, "learning_rate": 4.7399844357283393e-07, "logits/chosen": 0.2590706944465637, "logits/rejected": 0.23935838043689728, "logps/chosen": -76.3937759399414, "logps/ref_chosen": -68.45469665527344, "logps/ref_rejected": -77.91763305664062, "logps/rejected": -89.31476593017578, "loss": 1.1174, "margin_dpo/margin_mean": 3.458055019378662, "margin_dpo/margin_std": 6.045299530029297, "step": 155 }, { "KL/chosen_KL_mean": -8.034944534301758, "KL/mean": -10.08774185180664, "KL/rejected_KL_mean": -12.140533447265625, "KL/std": 5.307281494140625, "epoch": 0.23582766439909297, "fcm_dpo/beta": 0.14601437747478485, "fcm_dpo/delta": -0.21223029494285583, "fcm_dpo/margin": 4.105587005615234, "fcm_dpo/q_t": 0.36960870027542114, "grad_norm": 39.38912582397461, "learning_rate": 4.7340816008085305e-07, "logits/chosen": 0.21155962347984314, "logits/rejected": 0.16681228578090668, "logps/chosen": -75.3045425415039, "logps/ref_chosen": -67.26959991455078, "logps/ref_rejected": -86.95914459228516, "logps/rejected": -99.09967803955078, "loss": 0.9936, "margin_dpo/margin_mean": 4.105587005615234, "margin_dpo/margin_std": 5.054868698120117, "step": 156 }, { "KL/chosen_KL_mean": -7.5355682373046875, "KL/mean": -9.299856185913086, "KL/rejected_KL_mean": -11.064144134521484, "KL/std": 5.4581708908081055, "epoch": 0.23733938019652306, "fcm_dpo/beta": 0.14071068167686462, "fcm_dpo/delta": -0.10495176911354065, "fcm_dpo/margin": 3.5285744667053223, "fcm_dpo/q_t": 0.39000144600868225, "grad_norm": 35.28091049194336, "learning_rate": 4.728116273823847e-07, "logits/chosen": 0.21964195370674133, "logits/rejected": 0.19987328350543976, "logps/chosen": -62.308441162109375, "logps/ref_chosen": -54.77287292480469, "logps/ref_rejected": -63.87866973876953, "logps/rejected": -74.94281005859375, "loss": 1.0718, "margin_dpo/margin_mean": 3.528574228286743, "margin_dpo/margin_std": 5.030998706817627, "step": 157 }, { "KL/chosen_KL_mean": -8.38583755493164, "KL/mean": -10.133772850036621, "KL/rejected_KL_mean": -11.881710052490234, "KL/std": 5.295492172241211, "epoch": 0.23885109599395313, "fcm_dpo/beta": 0.139404296875, "fcm_dpo/delta": -0.0919056087732315, "fcm_dpo/margin": 3.4958715438842773, "fcm_dpo/q_t": 0.3929098844528198, "grad_norm": 37.92874526977539, "learning_rate": 4.7220886216373085e-07, "logits/chosen": 0.23326879739761353, "logits/rejected": 0.19794759154319763, "logps/chosen": -73.30855560302734, "logps/ref_chosen": -64.92271423339844, "logps/ref_rejected": -82.23789978027344, "logps/rejected": -94.1196060180664, "loss": 1.081, "margin_dpo/margin_mean": 3.4958715438842773, "margin_dpo/margin_std": 5.306426048278809, "step": 158 }, { "KL/chosen_KL_mean": -8.631183624267578, "KL/mean": -10.692588806152344, "KL/rejected_KL_mean": -12.753990173339844, "KL/std": 5.806535720825195, "epoch": 0.24036281179138322, "fcm_dpo/beta": 0.1363297551870346, "fcm_dpo/delta": -0.17151199281215668, "fcm_dpo/margin": 4.122804641723633, "fcm_dpo/q_t": 0.3767700791358948, "grad_norm": 40.3121223449707, "learning_rate": 4.715998812855304e-07, "logits/chosen": 0.2474721223115921, "logits/rejected": 0.2103067934513092, "logps/chosen": -65.67817687988281, "logps/ref_chosen": -57.046993255615234, "logps/ref_rejected": -73.32441711425781, "logps/rejected": -86.07840728759766, "loss": 1.0537, "margin_dpo/margin_mean": 4.122803688049316, "margin_dpo/margin_std": 6.021585464477539, "step": 159 }, { "KL/chosen_KL_mean": -9.751371383666992, "KL/mean": -11.582653045654297, "KL/rejected_KL_mean": -13.4139404296875, "KL/std": 5.5372772216796875, "epoch": 0.2418745275888133, "fcm_dpo/beta": 0.13256987929344177, "fcm_dpo/delta": -0.08983123302459717, "fcm_dpo/margin": 3.662567615509033, "fcm_dpo/q_t": 0.39909666776657104, "grad_norm": 32.94814682006836, "learning_rate": 4.7098470178228755e-07, "logits/chosen": 0.11559563130140305, "logits/rejected": 0.07372243702411652, "logps/chosen": -59.558284759521484, "logps/ref_chosen": -49.806915283203125, "logps/ref_rejected": -68.3370132446289, "logps/rejected": -81.7509536743164, "loss": 1.116, "margin_dpo/margin_mean": 3.662567615509033, "margin_dpo/margin_std": 6.4256486892700195, "step": 160 }, { "KL/chosen_KL_mean": -9.797685623168945, "KL/mean": -11.716354370117188, "KL/rejected_KL_mean": -13.635025024414062, "KL/std": 5.4578633308410645, "epoch": 0.24338624338624337, "fcm_dpo/beta": 0.12983298301696777, "fcm_dpo/delta": -0.1033276617527008, "fcm_dpo/margin": 3.8373398780822754, "fcm_dpo/q_t": 0.3906995356082916, "grad_norm": 32.17088317871094, "learning_rate": 4.703633408618955e-07, "logits/chosen": 0.27047622203826904, "logits/rejected": 0.23168951272964478, "logps/chosen": -62.29817199707031, "logps/ref_chosen": -52.50048828125, "logps/ref_rejected": -66.04540252685547, "logps/rejected": -79.68042755126953, "loss": 1.0829, "margin_dpo/margin_mean": 3.8373401165008545, "margin_dpo/margin_std": 5.90944766998291, "step": 161 }, { "KL/chosen_KL_mean": -10.624420166015625, "KL/mean": -13.289154052734375, "KL/rejected_KL_mean": -15.953887939453125, "KL/std": 6.185821533203125, "epoch": 0.24489795918367346, "fcm_dpo/beta": 0.1238074004650116, "fcm_dpo/delta": -0.27837228775024414, "fcm_dpo/margin": 5.329469203948975, "fcm_dpo/q_t": 0.3530166745185852, "grad_norm": 32.45170211791992, "learning_rate": 4.697358159051549e-07, "logits/chosen": 0.29613155126571655, "logits/rejected": 0.24742354452610016, "logps/chosen": -80.09361267089844, "logps/ref_chosen": -69.46919250488281, "logps/ref_rejected": -92.00952911376953, "logps/rejected": -107.96341705322266, "loss": 0.9466, "margin_dpo/margin_mean": 5.329469680786133, "margin_dpo/margin_std": 5.826746940612793, "step": 162 }, { "KL/chosen_KL_mean": -9.745094299316406, "KL/mean": -12.373571395874023, "KL/rejected_KL_mean": -15.00204849243164, "KL/std": 5.979671478271484, "epoch": 0.24640967498110355, "fcm_dpo/beta": 0.11905130743980408, "fcm_dpo/delta": -0.24060265719890594, "fcm_dpo/margin": 5.256951332092285, "fcm_dpo/q_t": 0.3639863133430481, "grad_norm": 30.063385009765625, "learning_rate": 4.691021444652876e-07, "logits/chosen": 0.237145334482193, "logits/rejected": 0.1899363398551941, "logps/chosen": -60.35892868041992, "logps/ref_chosen": -50.613834381103516, "logps/ref_rejected": -74.62033081054688, "logps/rejected": -89.62237548828125, "loss": 1.0008, "margin_dpo/margin_mean": 5.256951808929443, "margin_dpo/margin_std": 6.490505218505859, "step": 163 }, { "KL/chosen_KL_mean": -10.524375915527344, "KL/mean": -13.134865760803223, "KL/rejected_KL_mean": -15.745357513427734, "KL/std": 6.348791599273682, "epoch": 0.24792139077853365, "fcm_dpo/beta": 0.11331381648778915, "fcm_dpo/delta": -0.20320162177085876, "fcm_dpo/margin": 5.220986366271973, "fcm_dpo/q_t": 0.3712468147277832, "grad_norm": 28.374286651611328, "learning_rate": 4.6846234426744624e-07, "logits/chosen": 0.23074942827224731, "logits/rejected": 0.16782602667808533, "logps/chosen": -65.37248992919922, "logps/ref_chosen": -54.848114013671875, "logps/ref_rejected": -79.0630111694336, "logps/rejected": -94.80836486816406, "loss": 1.0193, "margin_dpo/margin_mean": 5.220986366271973, "margin_dpo/margin_std": 6.8066887855529785, "step": 164 }, { "KL/chosen_KL_mean": -11.484821319580078, "KL/mean": -13.85247802734375, "KL/rejected_KL_mean": -16.220138549804688, "KL/std": 6.508500099182129, "epoch": 0.2494331065759637, "fcm_dpo/beta": 0.11034692823886871, "fcm_dpo/delta": -0.1293293684720993, "fcm_dpo/margin": 4.735317230224609, "fcm_dpo/q_t": 0.38344305753707886, "grad_norm": 29.64312744140625, "learning_rate": 4.678164332082175e-07, "logits/chosen": 0.30432748794555664, "logits/rejected": 0.24835875630378723, "logps/chosen": -62.574031829833984, "logps/ref_chosen": -51.089210510253906, "logps/ref_rejected": -71.23370361328125, "logps/rejected": -87.45384216308594, "loss": 1.0599, "margin_dpo/margin_mean": 4.735316753387451, "margin_dpo/margin_std": 6.592318058013916, "step": 165 }, { "KL/chosen_KL_mean": -11.269798278808594, "KL/mean": -13.099468231201172, "KL/rejected_KL_mean": -14.929134368896484, "KL/std": 6.357509613037109, "epoch": 0.2509448223733938, "fcm_dpo/beta": 0.10865189135074615, "fcm_dpo/delta": 0.0023868978023529053, "fcm_dpo/margin": 3.659334182739258, "fcm_dpo/q_t": 0.4133417010307312, "grad_norm": 33.37204360961914, "learning_rate": 4.6716442935512214e-07, "logits/chosen": 0.27174800634384155, "logits/rejected": 0.18109279870986938, "logps/chosen": -74.46060943603516, "logps/ref_chosen": -63.19081115722656, "logps/ref_rejected": -93.8402099609375, "logps/rejected": -108.76934814453125, "loss": 1.1318, "margin_dpo/margin_mean": 3.659334182739258, "margin_dpo/margin_std": 6.208669185638428, "step": 166 }, { "KL/chosen_KL_mean": -10.212823867797852, "KL/mean": -13.08865737915039, "KL/rejected_KL_mean": -15.964488983154297, "KL/std": 6.5995635986328125, "epoch": 0.25245653817082386, "fcm_dpo/beta": 0.10473838448524475, "fcm_dpo/delta": -0.21790046989917755, "fcm_dpo/margin": 5.751662254333496, "fcm_dpo/q_t": 0.36511939764022827, "grad_norm": 25.732704162597656, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 0.187101811170578, "logits/rejected": 0.15242861211299896, "logps/chosen": -69.13710021972656, "logps/ref_chosen": -58.92427062988281, "logps/ref_rejected": -72.97377014160156, "logps/rejected": -88.93826293945312, "loss": 0.9736, "margin_dpo/margin_mean": 5.751662254333496, "margin_dpo/margin_std": 6.356039047241211, "step": 167 }, { "KL/chosen_KL_mean": -11.96699333190918, "KL/mean": -14.050497055053711, "KL/rejected_KL_mean": -16.133995056152344, "KL/std": 6.929432392120361, "epoch": 0.25396825396825395, "fcm_dpo/beta": 0.10364580899477005, "fcm_dpo/delta": -0.033402007073163986, "fcm_dpo/margin": 4.166999816894531, "fcm_dpo/q_t": 0.4044637084007263, "grad_norm": 30.63161277770996, "learning_rate": 4.6584221638904767e-07, "logits/chosen": 0.2525111138820648, "logits/rejected": 0.2172178328037262, "logps/chosen": -77.61837768554688, "logps/ref_chosen": -65.65138244628906, "logps/ref_rejected": -79.71418762207031, "logps/rejected": -95.84818267822266, "loss": 1.0965, "margin_dpo/margin_mean": 4.1670002937316895, "margin_dpo/margin_std": 6.206585884094238, "step": 168 }, { "KL/chosen_KL_mean": -10.402040481567383, "KL/mean": -13.141075134277344, "KL/rejected_KL_mean": -15.880107879638672, "KL/std": 7.222902774810791, "epoch": 0.25547996976568405, "fcm_dpo/beta": 0.1010751873254776, "fcm_dpo/delta": -0.16278433799743652, "fcm_dpo/margin": 5.4780592918396, "fcm_dpo/q_t": 0.3843042850494385, "grad_norm": 28.764638900756836, "learning_rate": 4.651720442612075e-07, "logits/chosen": 0.3002695143222809, "logits/rejected": 0.2664685547351837, "logps/chosen": -71.82791137695312, "logps/ref_chosen": -61.425865173339844, "logps/ref_rejected": -76.09590148925781, "logps/rejected": -91.97600555419922, "loss": 1.0566, "margin_dpo/margin_mean": 5.4780592918396, "margin_dpo/margin_std": 8.38476276397705, "step": 169 }, { "KL/chosen_KL_mean": -10.680133819580078, "KL/mean": -13.217702865600586, "KL/rejected_KL_mean": -15.755268096923828, "KL/std": 7.103891849517822, "epoch": 0.25699168556311414, "fcm_dpo/beta": 0.09894725680351257, "fcm_dpo/delta": -0.10743223875761032, "fcm_dpo/margin": 5.075137138366699, "fcm_dpo/q_t": 0.38986092805862427, "grad_norm": 24.42136573791504, "learning_rate": 4.6449585330874425e-07, "logits/chosen": 0.22937563061714172, "logits/rejected": 0.2268170267343521, "logps/chosen": -67.33332824707031, "logps/ref_chosen": -56.65319061279297, "logps/ref_rejected": -63.45965576171875, "logps/rejected": -79.21492004394531, "loss": 1.099, "margin_dpo/margin_mean": 5.075137138366699, "margin_dpo/margin_std": 8.269186973571777, "step": 170 }, { "KL/chosen_KL_mean": -11.31601333618164, "KL/mean": -14.252115249633789, "KL/rejected_KL_mean": -17.188217163085938, "KL/std": 8.03049087524414, "epoch": 0.2585034013605442, "fcm_dpo/beta": 0.094817154109478, "fcm_dpo/delta": -0.1693970412015915, "fcm_dpo/margin": 5.87220573425293, "fcm_dpo/q_t": 0.37948426604270935, "grad_norm": 27.524608612060547, "learning_rate": 4.6381366244617224e-07, "logits/chosen": 0.3146061897277832, "logits/rejected": 0.26350659132003784, "logps/chosen": -75.05077362060547, "logps/ref_chosen": -63.73476028442383, "logps/ref_rejected": -78.50328063964844, "logps/rejected": -95.69149780273438, "loss": 1.073, "margin_dpo/margin_mean": 5.8722052574157715, "margin_dpo/margin_std": 9.038765907287598, "step": 171 }, { "KL/chosen_KL_mean": -12.447174072265625, "KL/mean": -15.299983024597168, "KL/rejected_KL_mean": -18.152790069580078, "KL/std": 7.975404739379883, "epoch": 0.2600151171579743, "fcm_dpo/beta": 0.09365987777709961, "fcm_dpo/delta": -0.14217537641525269, "fcm_dpo/margin": 5.705615520477295, "fcm_dpo/q_t": 0.38168519735336304, "grad_norm": 25.322359085083008, "learning_rate": 4.631254907558365e-07, "logits/chosen": 0.2984645962715149, "logits/rejected": 0.24226480722427368, "logps/chosen": -64.64893341064453, "logps/ref_chosen": -52.201759338378906, "logps/ref_rejected": -82.85285949707031, "logps/rejected": -101.00565338134766, "loss": 1.0538, "margin_dpo/margin_mean": 5.705615520477295, "margin_dpo/margin_std": 7.993048191070557, "step": 172 }, { "KL/chosen_KL_mean": -12.049524307250977, "KL/mean": -15.134384155273438, "KL/rejected_KL_mean": -18.21924591064453, "KL/std": 8.450578689575195, "epoch": 0.2615268329554044, "fcm_dpo/beta": 0.08889605104923248, "fcm_dpo/delta": -0.1623501181602478, "fcm_dpo/margin": 6.169720649719238, "fcm_dpo/q_t": 0.387276291847229, "grad_norm": 24.629371643066406, "learning_rate": 4.624313574873786e-07, "logits/chosen": 0.28725624084472656, "logits/rejected": 0.2010902315378189, "logps/chosen": -67.4842529296875, "logps/ref_chosen": -55.434722900390625, "logps/ref_rejected": -77.81967163085938, "logps/rejected": -96.03892517089844, "loss": 1.1168, "margin_dpo/margin_mean": 6.169720649719238, "margin_dpo/margin_std": 10.586685180664062, "step": 173 }, { "KL/chosen_KL_mean": -13.29041862487793, "KL/mean": -16.51142120361328, "KL/rejected_KL_mean": -19.732425689697266, "KL/std": 8.349371910095215, "epoch": 0.26303854875283444, "fcm_dpo/beta": 0.08677835762500763, "fcm_dpo/delta": -0.16918835043907166, "fcm_dpo/margin": 6.442007064819336, "fcm_dpo/q_t": 0.3800202012062073, "grad_norm": 25.858070373535156, "learning_rate": 4.61731282057198e-07, "logits/chosen": 0.2942475974559784, "logits/rejected": 0.2271643579006195, "logps/chosen": -70.46237182617188, "logps/ref_chosen": -57.17195129394531, "logps/ref_rejected": -85.47578430175781, "logps/rejected": -105.20820617675781, "loss": 1.0471, "margin_dpo/margin_mean": 6.442006587982178, "margin_dpo/margin_std": 9.350664138793945, "step": 174 }, { "KL/chosen_KL_mean": -12.60334587097168, "KL/mean": -16.154342651367188, "KL/rejected_KL_mean": -19.70534896850586, "KL/std": 8.862190246582031, "epoch": 0.26455026455026454, "fcm_dpo/beta": 0.08383272588253021, "fcm_dpo/delta": -0.20745116472244263, "fcm_dpo/margin": 7.101996421813965, "fcm_dpo/q_t": 0.374935120344162, "grad_norm": 24.903615951538086, "learning_rate": 4.6102528404790965e-07, "logits/chosen": 0.32692593336105347, "logits/rejected": 0.29601743817329407, "logps/chosen": -80.26897430419922, "logps/ref_chosen": -67.6656265258789, "logps/ref_rejected": -84.36766815185547, "logps/rejected": -104.07301330566406, "loss": 1.0374, "margin_dpo/margin_mean": 7.101997375488281, "margin_dpo/margin_std": 10.175451278686523, "step": 175 }, { "KL/chosen_KL_mean": -14.360851287841797, "KL/mean": -16.80423927307129, "KL/rejected_KL_mean": -19.24762725830078, "KL/std": 9.378913879394531, "epoch": 0.2660619803476946, "fcm_dpo/beta": 0.08223021030426025, "fcm_dpo/delta": -0.0027749016880989075, "fcm_dpo/margin": 4.886781692504883, "fcm_dpo/q_t": 0.4155081510543823, "grad_norm": 29.54825782775879, "learning_rate": 4.603133832077953e-07, "logits/chosen": 0.25847089290618896, "logits/rejected": 0.2319958209991455, "logps/chosen": -92.21961212158203, "logps/ref_chosen": -77.8587646484375, "logps/ref_rejected": -81.08732604980469, "logps/rejected": -100.33495330810547, "loss": 1.1803, "margin_dpo/margin_mean": 4.886781692504883, "margin_dpo/margin_std": 10.080177307128906, "step": 176 }, { "KL/chosen_KL_mean": -11.897407531738281, "KL/mean": -16.65441131591797, "KL/rejected_KL_mean": -21.411415100097656, "KL/std": 9.643539428710938, "epoch": 0.2675736961451247, "fcm_dpo/beta": 0.07796752452850342, "fcm_dpo/delta": -0.37131455540657043, "fcm_dpo/margin": 9.514007568359375, "fcm_dpo/q_t": 0.3376469910144806, "grad_norm": 26.62158203125, "learning_rate": 4.5959559945025183e-07, "logits/chosen": 0.3913840651512146, "logits/rejected": 0.2953065037727356, "logps/chosen": -67.11780548095703, "logps/ref_chosen": -55.22039794921875, "logps/ref_rejected": -92.54973602294922, "logps/rejected": -113.96115112304688, "loss": 0.9034, "margin_dpo/margin_mean": 9.514006614685059, "margin_dpo/margin_std": 9.890893936157227, "step": 177 }, { "KL/chosen_KL_mean": -13.228601455688477, "KL/mean": -16.130857467651367, "KL/rejected_KL_mean": -19.033111572265625, "KL/std": 9.27237606048584, "epoch": 0.2690854119425548, "fcm_dpo/beta": 0.07568572461605072, "fcm_dpo/delta": -0.04298366606235504, "fcm_dpo/margin": 5.804502487182617, "fcm_dpo/q_t": 0.4023052155971527, "grad_norm": 24.669376373291016, "learning_rate": 4.588719528532341e-07, "logits/chosen": 0.27082359790802, "logits/rejected": 0.2212221920490265, "logps/chosen": -74.03909301757812, "logps/ref_chosen": -60.81049346923828, "logps/ref_rejected": -81.12973022460938, "logps/rejected": -100.162841796875, "loss": 1.1118, "margin_dpo/margin_mean": 5.804503440856934, "margin_dpo/margin_std": 9.205760955810547, "step": 178 }, { "KL/chosen_KL_mean": -14.178958892822266, "KL/mean": -17.51313018798828, "KL/rejected_KL_mean": -20.84729766845703, "KL/std": 9.534402847290039, "epoch": 0.2705971277399849, "fcm_dpo/beta": 0.07499967515468597, "fcm_dpo/delta": -0.10537131875753403, "fcm_dpo/margin": 6.668337821960449, "fcm_dpo/q_t": 0.3920973837375641, "grad_norm": 24.396095275878906, "learning_rate": 4.581424636586928e-07, "logits/chosen": 0.33049434423446655, "logits/rejected": 0.31331005692481995, "logps/chosen": -79.85067749023438, "logps/ref_chosen": -65.67171478271484, "logps/ref_rejected": -75.32586669921875, "logps/rejected": -96.17316436767578, "loss": 1.1029, "margin_dpo/margin_mean": 6.668338775634766, "margin_dpo/margin_std": 11.172536849975586, "step": 179 }, { "KL/chosen_KL_mean": -11.334993362426758, "KL/mean": -14.348894119262695, "KL/rejected_KL_mean": -17.362794876098633, "KL/std": 9.822802543640137, "epoch": 0.272108843537415, "fcm_dpo/beta": 0.07425501942634583, "fcm_dpo/delta": -0.04986467584967613, "fcm_dpo/margin": 6.027800559997559, "fcm_dpo/q_t": 0.40598782896995544, "grad_norm": 22.733217239379883, "learning_rate": 4.5740715227200897e-07, "logits/chosen": 0.13708952069282532, "logits/rejected": 0.11808039993047714, "logps/chosen": -68.0177993774414, "logps/ref_chosen": -56.68280792236328, "logps/ref_rejected": -64.94414520263672, "logps/rejected": -82.30694580078125, "loss": 1.1542, "margin_dpo/margin_mean": 6.027801036834717, "margin_dpo/margin_std": 11.640966415405273, "step": 180 }, { "KL/chosen_KL_mean": -10.154378890991211, "KL/mean": -14.627374649047852, "KL/rejected_KL_mean": -19.100364685058594, "KL/std": 10.613178253173828, "epoch": 0.273620559334845, "fcm_dpo/beta": 0.07093991339206696, "fcm_dpo/delta": -0.2513868510723114, "fcm_dpo/margin": 8.945984840393066, "fcm_dpo/q_t": 0.3586830794811249, "grad_norm": 21.344892501831055, "learning_rate": 4.566660392614228e-07, "logits/chosen": 0.32632309198379517, "logits/rejected": 0.28618094325065613, "logps/chosen": -70.93042755126953, "logps/ref_chosen": -60.77604675292969, "logps/ref_rejected": -83.98361206054688, "logps/rejected": -103.083984375, "loss": 0.9519, "margin_dpo/margin_mean": 8.945984840393066, "margin_dpo/margin_std": 9.652619361877441, "step": 181 }, { "KL/chosen_KL_mean": -12.560722351074219, "KL/mean": -17.493358612060547, "KL/rejected_KL_mean": -22.425994873046875, "KL/std": 11.259750366210938, "epoch": 0.2751322751322751, "fcm_dpo/beta": 0.0671565979719162, "fcm_dpo/delta": -0.28209519386291504, "fcm_dpo/margin": 9.865280151367188, "fcm_dpo/q_t": 0.35815727710723877, "grad_norm": 20.7074031829834, "learning_rate": 4.5591914535745817e-07, "logits/chosen": 0.3049561381340027, "logits/rejected": 0.22382745146751404, "logps/chosen": -72.81450653076172, "logps/ref_chosen": -60.2537841796875, "logps/ref_rejected": -89.7706298828125, "logps/rejected": -112.19662475585938, "loss": 0.9903, "margin_dpo/margin_mean": 9.865280151367188, "margin_dpo/margin_std": 12.753199577331543, "step": 182 }, { "KL/chosen_KL_mean": -15.433515548706055, "KL/mean": -17.407238006591797, "KL/rejected_KL_mean": -19.38095474243164, "KL/std": 10.853394508361816, "epoch": 0.2766439909297052, "fcm_dpo/beta": 0.06693626940250397, "fcm_dpo/delta": 0.042862582951784134, "fcm_dpo/margin": 3.947441577911377, "fcm_dpo/q_t": 0.4407636523246765, "grad_norm": 22.94739532470703, "learning_rate": 4.551664914523433e-07, "logits/chosen": 0.2950702905654907, "logits/rejected": 0.2728223204612732, "logps/chosen": -77.19493103027344, "logps/ref_chosen": -61.76142120361328, "logps/ref_rejected": -72.54627990722656, "logps/rejected": -91.92723846435547, "loss": 1.2581, "margin_dpo/margin_mean": 3.947441816329956, "margin_dpo/margin_std": 10.635702133178711, "step": 183 }, { "KL/chosen_KL_mean": -10.490285873413086, "KL/mean": -14.313344955444336, "KL/rejected_KL_mean": -18.136398315429688, "KL/std": 9.76352310180664, "epoch": 0.2781557067271353, "fcm_dpo/beta": 0.06539727747440338, "fcm_dpo/delta": -0.1071850061416626, "fcm_dpo/margin": 7.646112442016602, "fcm_dpo/q_t": 0.3893827199935913, "grad_norm": 19.206035614013672, "learning_rate": 4.544080985994258e-07, "logits/chosen": 0.416187047958374, "logits/rejected": 0.35007432103157043, "logps/chosen": -57.33100891113281, "logps/ref_chosen": -46.840721130371094, "logps/ref_rejected": -69.3609390258789, "logps/rejected": -87.4973373413086, "loss": 1.0439, "margin_dpo/margin_mean": 7.646113395690918, "margin_dpo/margin_std": 9.755975723266602, "step": 184 }, { "KL/chosen_KL_mean": -12.928775787353516, "KL/mean": -16.942461013793945, "KL/rejected_KL_mean": -20.95614242553711, "KL/std": 11.343210220336914, "epoch": 0.2796674225245654, "fcm_dpo/beta": 0.06398440897464752, "fcm_dpo/delta": -0.1211206391453743, "fcm_dpo/margin": 8.02737045288086, "fcm_dpo/q_t": 0.3917185068130493, "grad_norm": 18.914852142333984, "learning_rate": 4.5364398801258394e-07, "logits/chosen": 0.3066721558570862, "logits/rejected": 0.25998419523239136, "logps/chosen": -65.24991607666016, "logps/ref_chosen": -52.32114028930664, "logps/ref_rejected": -68.3885726928711, "logps/rejected": -89.34471130371094, "loss": 1.1203, "margin_dpo/margin_mean": 8.02737045288086, "margin_dpo/margin_std": 14.069936752319336, "step": 185 }, { "KL/chosen_KL_mean": -11.656740188598633, "KL/mean": -16.188270568847656, "KL/rejected_KL_mean": -20.719799041748047, "KL/std": 11.402138710021973, "epoch": 0.2811791383219955, "fcm_dpo/beta": 0.06227569282054901, "fcm_dpo/delta": -0.1743626892566681, "fcm_dpo/margin": 9.063055992126465, "fcm_dpo/q_t": 0.3803493082523346, "grad_norm": 22.49077033996582, "learning_rate": 4.5287418106563354e-07, "logits/chosen": 0.2469421923160553, "logits/rejected": 0.2048531025648117, "logps/chosen": -79.07687377929688, "logps/ref_chosen": -67.42012786865234, "logps/ref_rejected": -82.50968933105469, "logps/rejected": -103.2294921875, "loss": 1.0638, "margin_dpo/margin_mean": 9.063056945800781, "margin_dpo/margin_std": 13.931187629699707, "step": 186 }, { "KL/chosen_KL_mean": -13.783466339111328, "KL/mean": -17.95578384399414, "KL/rejected_KL_mean": -22.12810516357422, "KL/std": 12.003030776977539, "epoch": 0.28269085411942557, "fcm_dpo/beta": 0.06057630479335785, "fcm_dpo/delta": -0.11155885457992554, "fcm_dpo/margin": 8.344644546508789, "fcm_dpo/q_t": 0.3909391760826111, "grad_norm": 21.94649314880371, "learning_rate": 4.520986992917297e-07, "logits/chosen": 0.29726389050483704, "logits/rejected": 0.2419080287218094, "logps/chosen": -89.3089599609375, "logps/ref_chosen": -75.52549743652344, "logps/ref_rejected": -94.76289367675781, "logps/rejected": -116.89099884033203, "loss": 1.0997, "margin_dpo/margin_mean": 8.344643592834473, "margin_dpo/margin_std": 13.742973327636719, "step": 187 }, { "KL/chosen_KL_mean": -12.482467651367188, "KL/mean": -16.90377426147461, "KL/rejected_KL_mean": -21.32508087158203, "KL/std": 12.468690872192383, "epoch": 0.2842025699168556, "fcm_dpo/beta": 0.05926317349076271, "fcm_dpo/delta": -0.13083845376968384, "fcm_dpo/margin": 8.842605590820312, "fcm_dpo/q_t": 0.38721227645874023, "grad_norm": 20.538850784301758, "learning_rate": 4.5131756438276466e-07, "logits/chosen": 0.32081130146980286, "logits/rejected": 0.27670738101005554, "logps/chosen": -84.00579833984375, "logps/ref_chosen": -71.52333068847656, "logps/ref_rejected": -78.29949951171875, "logps/rejected": -99.62458038330078, "loss": 1.0656, "margin_dpo/margin_mean": 8.842605590820312, "margin_dpo/margin_std": 13.314140319824219, "step": 188 }, { "KL/chosen_KL_mean": -11.685968399047852, "KL/mean": -15.84921646118164, "KL/rejected_KL_mean": -20.012466430664062, "KL/std": 12.300812721252441, "epoch": 0.2857142857142857, "fcm_dpo/beta": 0.057532232254743576, "fcm_dpo/delta": -0.08574981987476349, "fcm_dpo/margin": 8.326496124267578, "fcm_dpo/q_t": 0.3938596844673157, "grad_norm": 21.206645965576172, "learning_rate": 4.5053079818876096e-07, "logits/chosen": 0.34900641441345215, "logits/rejected": 0.3621196746826172, "logps/chosen": -83.86223602294922, "logps/ref_chosen": -72.17626953125, "logps/ref_rejected": -75.26313781738281, "logps/rejected": -95.27560424804688, "loss": 1.0997, "margin_dpo/margin_mean": 8.326497077941895, "margin_dpo/margin_std": 13.010383605957031, "step": 189 }, { "KL/chosen_KL_mean": -10.961227416992188, "KL/mean": -16.931194305419922, "KL/rejected_KL_mean": -22.90115737915039, "KL/std": 12.874874114990234, "epoch": 0.2872260015117158, "fcm_dpo/beta": 0.05544852837920189, "fcm_dpo/delta": -0.28079941868782043, "fcm_dpo/margin": 11.939929008483887, "fcm_dpo/q_t": 0.35604608058929443, "grad_norm": 22.097734451293945, "learning_rate": 4.4973842271726024e-07, "logits/chosen": 0.3808823823928833, "logits/rejected": 0.23725879192352295, "logps/chosen": -65.58549499511719, "logps/ref_chosen": -54.624271392822266, "logps/ref_rejected": -101.47068786621094, "logps/rejected": -124.37184143066406, "loss": 0.9549, "margin_dpo/margin_mean": 11.939929962158203, "margin_dpo/margin_std": 13.697011947631836, "step": 190 }, { "KL/chosen_KL_mean": -15.0380859375, "KL/mean": -19.413564682006836, "KL/rejected_KL_mean": -23.789047241210938, "KL/std": 13.08713436126709, "epoch": 0.2887377173091459, "fcm_dpo/beta": 0.0540933758020401, "fcm_dpo/delta": -0.07696905732154846, "fcm_dpo/margin": 8.750959396362305, "fcm_dpo/q_t": 0.39417406916618347, "grad_norm": 20.902061462402344, "learning_rate": 4.48940460132708e-07, "logits/chosen": 0.38643670082092285, "logits/rejected": 0.3580207824707031, "logps/chosen": -87.97059631347656, "logps/ref_chosen": -72.93251037597656, "logps/ref_rejected": -89.95103454589844, "logps/rejected": -113.74008178710938, "loss": 1.1029, "margin_dpo/margin_mean": 8.750959396362305, "margin_dpo/margin_std": 14.159086227416992, "step": 191 }, { "KL/chosen_KL_mean": -15.891511917114258, "KL/mean": -18.878381729125977, "KL/rejected_KL_mean": -21.86524200439453, "KL/std": 13.809000015258789, "epoch": 0.29024943310657597, "fcm_dpo/beta": 0.05458749830722809, "fcm_dpo/delta": 0.07609562575817108, "fcm_dpo/margin": 5.973730087280273, "fcm_dpo/q_t": 0.4266872704029083, "grad_norm": 17.381935119628906, "learning_rate": 4.481369327558329e-07, "logits/chosen": 0.371783971786499, "logits/rejected": 0.34520506858825684, "logps/chosen": -69.89263153076172, "logps/ref_chosen": -54.001121520996094, "logps/ref_rejected": -63.531551361083984, "logps/rejected": -85.39678955078125, "loss": 1.1883, "margin_dpo/margin_mean": 5.97373104095459, "margin_dpo/margin_std": 12.114863395690918, "step": 192 }, { "KL/chosen_KL_mean": -12.972223281860352, "KL/mean": -18.518402099609375, "KL/rejected_KL_mean": -24.064579010009766, "KL/std": 13.090436935424805, "epoch": 0.29176114890400606, "fcm_dpo/beta": 0.05279640108346939, "fcm_dpo/delta": -0.1988125890493393, "fcm_dpo/margin": 11.09235668182373, "fcm_dpo/q_t": 0.3709743320941925, "grad_norm": 16.628772735595703, "learning_rate": 4.47327863063023e-07, "logits/chosen": 0.3248763084411621, "logits/rejected": 0.30150750279426575, "logps/chosen": -69.72149658203125, "logps/ref_chosen": -56.74927520751953, "logps/ref_rejected": -58.80629348754883, "logps/rejected": -82.8708724975586, "loss": 0.9962, "margin_dpo/margin_mean": 11.092357635498047, "margin_dpo/margin_std": 13.47452449798584, "step": 193 }, { "KL/chosen_KL_mean": -13.635934829711914, "KL/mean": -17.933311462402344, "KL/rejected_KL_mean": -22.230697631835938, "KL/std": 13.053489685058594, "epoch": 0.29327286470143615, "fcm_dpo/beta": 0.05177993327379227, "fcm_dpo/delta": -0.04856480658054352, "fcm_dpo/margin": 8.594758033752441, "fcm_dpo/q_t": 0.4021752178668976, "grad_norm": 19.066179275512695, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 0.3664540946483612, "logits/rejected": 0.33667171001434326, "logps/chosen": -70.2853775024414, "logps/ref_chosen": -56.64944076538086, "logps/ref_rejected": -69.98954772949219, "logps/rejected": -92.22024536132812, "loss": 1.1541, "margin_dpo/margin_mean": 8.594758033752441, "margin_dpo/margin_std": 16.146366119384766, "step": 194 }, { "KL/chosen_KL_mean": -16.097759246826172, "KL/mean": -21.750097274780273, "KL/rejected_KL_mean": -27.40243911743164, "KL/std": 14.986997604370117, "epoch": 0.2947845804988662, "fcm_dpo/beta": 0.05039390176534653, "fcm_dpo/delta": -0.18134789168834686, "fcm_dpo/margin": 11.304683685302734, "fcm_dpo/q_t": 0.376251220703125, "grad_norm": 19.290634155273438, "learning_rate": 4.4569318740967043e-07, "logits/chosen": 0.265621155500412, "logits/rejected": 0.2691984474658966, "logps/chosen": -86.50753784179688, "logps/ref_chosen": -70.40977478027344, "logps/ref_rejected": -74.39448547363281, "logps/rejected": -101.79692077636719, "loss": 1.0314, "margin_dpo/margin_mean": 11.304681777954102, "margin_dpo/margin_std": 15.417613983154297, "step": 195 }, { "KL/chosen_KL_mean": -15.376646041870117, "KL/mean": -19.852920532226562, "KL/rejected_KL_mean": -24.329193115234375, "KL/std": 14.921621322631836, "epoch": 0.2962962962962963, "fcm_dpo/beta": 0.050030939280986786, "fcm_dpo/delta": -0.050353750586509705, "fcm_dpo/margin": 8.952546119689941, "fcm_dpo/q_t": 0.3985461890697479, "grad_norm": 18.4036808013916, "learning_rate": 4.448676271745197e-07, "logits/chosen": 0.3879624605178833, "logits/rejected": 0.3449851870536804, "logps/chosen": -74.60421752929688, "logps/ref_chosen": -59.227577209472656, "logps/ref_rejected": -83.54757690429688, "logps/rejected": -107.87677001953125, "loss": 1.0967, "margin_dpo/margin_mean": 8.952545166015625, "margin_dpo/margin_std": 13.567102432250977, "step": 196 }, { "KL/chosen_KL_mean": -13.274065017700195, "KL/mean": -18.908409118652344, "KL/rejected_KL_mean": -24.54275131225586, "KL/std": 14.880483627319336, "epoch": 0.29780801209372637, "fcm_dpo/beta": 0.04886094853281975, "fcm_dpo/delta": -0.15927977859973907, "fcm_dpo/margin": 11.268684387207031, "fcm_dpo/q_t": 0.38195258378982544, "grad_norm": 18.35759162902832, "learning_rate": 4.440366160729392e-07, "logits/chosen": 0.4663141965866089, "logits/rejected": 0.4145383834838867, "logps/chosen": -64.80319213867188, "logps/ref_chosen": -51.52912902832031, "logps/ref_rejected": -73.70631408691406, "logps/rejected": -98.24906158447266, "loss": 1.0997, "margin_dpo/margin_mean": 11.268684387207031, "margin_dpo/margin_std": 18.52632713317871, "step": 197 }, { "KL/chosen_KL_mean": -14.127727508544922, "KL/mean": -20.223243713378906, "KL/rejected_KL_mean": -26.318767547607422, "KL/std": 14.258949279785156, "epoch": 0.29931972789115646, "fcm_dpo/beta": 0.0470733568072319, "fcm_dpo/delta": -0.1840635985136032, "fcm_dpo/margin": 12.191038131713867, "fcm_dpo/q_t": 0.3711358308792114, "grad_norm": 17.605937957763672, "learning_rate": 4.432001773500957e-07, "logits/chosen": 0.44902199506759644, "logits/rejected": 0.40681999921798706, "logps/chosen": -73.91040802001953, "logps/ref_chosen": -59.78268051147461, "logps/ref_rejected": -72.24533081054688, "logps/rejected": -98.56409454345703, "loss": 1.0004, "margin_dpo/margin_mean": 12.191038131713867, "margin_dpo/margin_std": 14.459760665893555, "step": 198 }, { "KL/chosen_KL_mean": -17.205591201782227, "KL/mean": -21.93572235107422, "KL/rejected_KL_mean": -26.665851593017578, "KL/std": 15.276073455810547, "epoch": 0.30083144368858655, "fcm_dpo/beta": 0.04653170332312584, "fcm_dpo/delta": -0.04358825087547302, "fcm_dpo/margin": 9.460265159606934, "fcm_dpo/q_t": 0.40333688259124756, "grad_norm": 17.478683471679688, "learning_rate": 4.4235833440297856e-07, "logits/chosen": 0.3941565155982971, "logits/rejected": 0.30348044633865356, "logps/chosen": -73.59236145019531, "logps/ref_chosen": -56.38677215576172, "logps/ref_rejected": -74.56779479980469, "logps/rejected": -101.23365020751953, "loss": 1.1656, "margin_dpo/margin_mean": 9.460264205932617, "margin_dpo/margin_std": 18.094928741455078, "step": 199 }, { "KL/chosen_KL_mean": -13.340187072753906, "KL/mean": -20.093761444091797, "KL/rejected_KL_mean": -26.847335815429688, "KL/std": 15.945539474487305, "epoch": 0.30234315948601664, "fcm_dpo/beta": 0.04431159049272537, "fcm_dpo/delta": -0.21283170580863953, "fcm_dpo/margin": 13.5071439743042, "fcm_dpo/q_t": 0.37099677324295044, "grad_norm": 16.520164489746094, "learning_rate": 4.415111107797445e-07, "logits/chosen": 0.460144579410553, "logits/rejected": 0.3838568925857544, "logps/chosen": -71.16451263427734, "logps/ref_chosen": -57.82432556152344, "logps/ref_rejected": -89.28246307373047, "logps/rejected": -116.12979888916016, "loss": 1.0343, "margin_dpo/margin_mean": 13.507144927978516, "margin_dpo/margin_std": 19.13675308227539, "step": 200 }, { "KL/chosen_KL_mean": -18.767276763916016, "KL/mean": -24.315343856811523, "KL/rejected_KL_mean": -29.863414764404297, "KL/std": 16.66211700439453, "epoch": 0.30385487528344673, "fcm_dpo/beta": 0.04330876097083092, "fcm_dpo/delta": -0.085403211414814, "fcm_dpo/margin": 11.096136093139648, "fcm_dpo/q_t": 0.39601463079452515, "grad_norm": 18.463565826416016, "learning_rate": 4.4065853017905953e-07, "logits/chosen": 0.462460458278656, "logits/rejected": 0.41501501202583313, "logps/chosen": -77.76703643798828, "logps/ref_chosen": -58.999759674072266, "logps/ref_rejected": -84.67575073242188, "logps/rejected": -114.5391616821289, "loss": 1.1022, "margin_dpo/margin_mean": 11.096136093139648, "margin_dpo/margin_std": 18.115814208984375, "step": 201 }, { "KL/chosen_KL_mean": -15.379348754882812, "KL/mean": -22.084678649902344, "KL/rejected_KL_mean": -28.790000915527344, "KL/std": 16.251558303833008, "epoch": 0.30536659108087677, "fcm_dpo/beta": 0.04236089065670967, "fcm_dpo/delta": -0.1778273731470108, "fcm_dpo/margin": 13.410651206970215, "fcm_dpo/q_t": 0.37218576669692993, "grad_norm": 17.639385223388672, "learning_rate": 4.3980061644943575e-07, "logits/chosen": 0.34011542797088623, "logits/rejected": 0.26836439967155457, "logps/chosen": -63.03999710083008, "logps/ref_chosen": -47.660648345947266, "logps/ref_rejected": -73.63249969482422, "logps/rejected": -102.42250061035156, "loss": 1.0283, "margin_dpo/margin_mean": 13.410651206970215, "margin_dpo/margin_std": 17.646211624145508, "step": 202 }, { "KL/chosen_KL_mean": -19.203052520751953, "KL/mean": -25.161983489990234, "KL/rejected_KL_mean": -31.12091064453125, "KL/std": 16.60664176940918, "epoch": 0.30687830687830686, "fcm_dpo/beta": 0.041295044124126434, "fcm_dpo/delta": -0.09680425375699997, "fcm_dpo/margin": 11.91786003112793, "fcm_dpo/q_t": 0.3928524851799011, "grad_norm": 19.795772552490234, "learning_rate": 4.3893739358856455e-07, "logits/chosen": 0.4575344920158386, "logits/rejected": 0.3789180517196655, "logps/chosen": -81.5285873413086, "logps/ref_chosen": -62.32553482055664, "logps/ref_rejected": -99.37226104736328, "logps/rejected": -130.4931640625, "loss": 1.0821, "margin_dpo/margin_mean": 11.91786003112793, "margin_dpo/margin_std": 18.32806968688965, "step": 203 }, { "KL/chosen_KL_mean": -17.718111038208008, "KL/mean": -24.134754180908203, "KL/rejected_KL_mean": -30.551395416259766, "KL/std": 17.6030216217041, "epoch": 0.30839002267573695, "fcm_dpo/beta": 0.039820194244384766, "fcm_dpo/delta": -0.12089164555072784, "fcm_dpo/margin": 12.833280563354492, "fcm_dpo/q_t": 0.3872482478618622, "grad_norm": 17.259748458862305, "learning_rate": 4.380688857426449e-07, "logits/chosen": 0.40379756689071655, "logits/rejected": 0.33288702368736267, "logps/chosen": -68.34742736816406, "logps/ref_chosen": -50.62931823730469, "logps/ref_rejected": -66.60475158691406, "logps/rejected": -97.15614318847656, "loss": 1.0627, "margin_dpo/margin_mean": 12.83327865600586, "margin_dpo/margin_std": 17.871044158935547, "step": 204 }, { "KL/chosen_KL_mean": -20.115196228027344, "KL/mean": -26.49364471435547, "KL/rejected_KL_mean": -32.87209701538086, "KL/std": 17.846654891967773, "epoch": 0.30990173847316704, "fcm_dpo/beta": 0.0393013134598732, "fcm_dpo/delta": -0.10703231394290924, "fcm_dpo/margin": 12.756902694702148, "fcm_dpo/q_t": 0.390725314617157, "grad_norm": 21.37082290649414, "learning_rate": 4.3719511720570814e-07, "logits/chosen": 0.436495840549469, "logits/rejected": 0.3707747757434845, "logps/chosen": -90.47137451171875, "logps/ref_chosen": -70.3561782836914, "logps/ref_rejected": -93.39848327636719, "logps/rejected": -126.27057647705078, "loss": 1.0886, "margin_dpo/margin_mean": 12.756902694702148, "margin_dpo/margin_std": 20.21804428100586, "step": 205 }, { "KL/chosen_KL_mean": -21.660934448242188, "KL/mean": -26.15941047668457, "KL/rejected_KL_mean": -30.657882690429688, "KL/std": 17.831066131591797, "epoch": 0.31141345427059713, "fcm_dpo/beta": 0.0396423414349556, "fcm_dpo/delta": 0.044101741164922714, "fcm_dpo/margin": 8.996944427490234, "fcm_dpo/q_t": 0.4247448444366455, "grad_norm": 18.77968406677246, "learning_rate": 4.363161124189387e-07, "logits/chosen": 0.5169934034347534, "logits/rejected": 0.49965721368789673, "logps/chosen": -89.30641174316406, "logps/ref_chosen": -67.64547729492188, "logps/ref_rejected": -79.89584350585938, "logps/rejected": -110.55372619628906, "loss": 1.2337, "margin_dpo/margin_mean": 8.996943473815918, "margin_dpo/margin_std": 21.76844024658203, "step": 206 }, { "KL/chosen_KL_mean": -25.184377670288086, "KL/mean": -31.723377227783203, "KL/rejected_KL_mean": -38.262367248535156, "KL/std": 18.837566375732422, "epoch": 0.3129251700680272, "fcm_dpo/beta": 0.03880295902490616, "fcm_dpo/delta": -0.11340102553367615, "fcm_dpo/margin": 13.077993392944336, "fcm_dpo/q_t": 0.3916068375110626, "grad_norm": 16.598234176635742, "learning_rate": 4.3543189596998986e-07, "logits/chosen": 0.3862527012825012, "logits/rejected": 0.31814223527908325, "logps/chosen": -92.84857177734375, "logps/ref_chosen": -67.66419219970703, "logps/ref_rejected": -85.10249328613281, "logps/rejected": -123.36485290527344, "loss": 1.0715, "margin_dpo/margin_mean": 13.077995300292969, "margin_dpo/margin_std": 19.769428253173828, "step": 207 }, { "KL/chosen_KL_mean": -19.551837921142578, "KL/mean": -23.417842864990234, "KL/rejected_KL_mean": -27.28384780883789, "KL/std": 17.606971740722656, "epoch": 0.3144368858654573, "fcm_dpo/beta": 0.03920549526810646, "fcm_dpo/delta": 0.0999031811952591, "fcm_dpo/margin": 7.732011795043945, "fcm_dpo/q_t": 0.43431955575942993, "grad_norm": 21.032760620117188, "learning_rate": 4.3454249259229664e-07, "logits/chosen": 0.4478047490119934, "logits/rejected": 0.42077624797821045, "logps/chosen": -77.28355407714844, "logps/ref_chosen": -57.731712341308594, "logps/ref_rejected": -74.19276428222656, "logps/rejected": -101.47660827636719, "loss": 1.2478, "margin_dpo/margin_mean": 7.732011795043945, "margin_dpo/margin_std": 19.74880599975586, "step": 208 }, { "KL/chosen_KL_mean": -20.090803146362305, "KL/mean": -28.256765365600586, "KL/rejected_KL_mean": -36.4227294921875, "KL/std": 20.101736068725586, "epoch": 0.31594860166288735, "fcm_dpo/beta": 0.03807171434164047, "fcm_dpo/delta": -0.23638266324996948, "fcm_dpo/margin": 16.331928253173828, "fcm_dpo/q_t": 0.3660344183444977, "grad_norm": 19.51934051513672, "learning_rate": 4.336479271643833e-07, "logits/chosen": 0.381446897983551, "logits/rejected": 0.32572823762893677, "logps/chosen": -88.64088439941406, "logps/ref_chosen": -68.55007934570312, "logps/ref_rejected": -87.90541076660156, "logps/rejected": -124.32814025878906, "loss": 1.037, "margin_dpo/margin_mean": 16.331928253173828, "margin_dpo/margin_std": 23.80112075805664, "step": 209 }, { "KL/chosen_KL_mean": -19.75511360168457, "KL/mean": -27.526519775390625, "KL/rejected_KL_mean": -35.29792404174805, "KL/std": 21.410018920898438, "epoch": 0.31746031746031744, "fcm_dpo/beta": 0.03674852102994919, "fcm_dpo/delta": -0.1811467409133911, "fcm_dpo/margin": 15.542804718017578, "fcm_dpo/q_t": 0.37848860025405884, "grad_norm": 16.914072036743164, "learning_rate": 4.327482247091679e-07, "logits/chosen": 0.4980680048465729, "logits/rejected": 0.3969573974609375, "logps/chosen": -77.02339172363281, "logps/ref_chosen": -57.268272399902344, "logps/ref_rejected": -85.72807312011719, "logps/rejected": -121.0260009765625, "loss": 1.0464, "margin_dpo/margin_mean": 15.542803764343262, "margin_dpo/margin_std": 22.403995513916016, "step": 210 }, { "KL/chosen_KL_mean": -17.39479637145996, "KL/mean": -24.74917984008789, "KL/rejected_KL_mean": -32.10356521606445, "KL/std": 19.60296630859375, "epoch": 0.31897203325774753, "fcm_dpo/beta": 0.03573797643184662, "fcm_dpo/delta": -0.13251164555549622, "fcm_dpo/margin": 14.70876407623291, "fcm_dpo/q_t": 0.38738757371902466, "grad_norm": 20.49784278869629, "learning_rate": 4.3184341039326217e-07, "logits/chosen": 0.493042916059494, "logits/rejected": 0.39660608768463135, "logps/chosen": -71.03550720214844, "logps/ref_chosen": -53.640708923339844, "logps/ref_rejected": -93.0387954711914, "logps/rejected": -125.14236450195312, "loss": 1.0595, "margin_dpo/margin_mean": 14.70876407623291, "margin_dpo/margin_std": 21.361112594604492, "step": 211 }, { "KL/chosen_KL_mean": -20.63507652282715, "KL/mean": -28.782855987548828, "KL/rejected_KL_mean": -36.93062973022461, "KL/std": 21.19561004638672, "epoch": 0.3204837490551776, "fcm_dpo/beta": 0.03439202904701233, "fcm_dpo/delta": -0.1701374351978302, "fcm_dpo/margin": 16.295551300048828, "fcm_dpo/q_t": 0.3761540651321411, "grad_norm": 14.773843765258789, "learning_rate": 4.309335095262675e-07, "logits/chosen": 0.49070367217063904, "logits/rejected": 0.4169609546661377, "logps/chosen": -78.00181579589844, "logps/ref_chosen": -57.36674499511719, "logps/ref_rejected": -79.89643096923828, "logps/rejected": -116.82705688476562, "loss": 1.0404, "margin_dpo/margin_mean": 16.295551300048828, "margin_dpo/margin_std": 22.684921264648438, "step": 212 }, { "KL/chosen_KL_mean": -14.704254150390625, "KL/mean": -24.007169723510742, "KL/rejected_KL_mean": -33.31008529663086, "KL/std": 21.575820922851562, "epoch": 0.3219954648526077, "fcm_dpo/beta": 0.032846976071596146, "fcm_dpo/delta": -0.2263413965702057, "fcm_dpo/margin": 18.605833053588867, "fcm_dpo/q_t": 0.3665982484817505, "grad_norm": 13.922098159790039, "learning_rate": 4.3001854756006724e-07, "logits/chosen": 0.4768121838569641, "logits/rejected": 0.4529619812965393, "logps/chosen": -79.92536926269531, "logps/ref_chosen": -65.22111511230469, "logps/ref_rejected": -80.1810302734375, "logps/rejected": -113.4911117553711, "loss": 1.0064, "margin_dpo/margin_mean": 18.6058349609375, "margin_dpo/margin_std": 24.34903335571289, "step": 213 }, { "KL/chosen_KL_mean": -17.853666305541992, "KL/mean": -27.147632598876953, "KL/rejected_KL_mean": -36.44160461425781, "KL/std": 22.199668884277344, "epoch": 0.3235071806500378, "fcm_dpo/beta": 0.03187070041894913, "fcm_dpo/delta": -0.20411977171897888, "fcm_dpo/margin": 18.587932586669922, "fcm_dpo/q_t": 0.3724803924560547, "grad_norm": 18.471385955810547, "learning_rate": 4.290985500881143e-07, "logits/chosen": 0.35173967480659485, "logits/rejected": 0.33097124099731445, "logps/chosen": -79.14599609375, "logps/ref_chosen": -61.292327880859375, "logps/ref_rejected": -67.69841003417969, "logps/rejected": -104.1400146484375, "loss": 1.0337, "margin_dpo/margin_mean": 18.587932586669922, "margin_dpo/margin_std": 25.626651763916016, "step": 214 }, { "KL/chosen_KL_mean": -22.558399200439453, "KL/mean": -31.958229064941406, "KL/rejected_KL_mean": -41.358055114746094, "KL/std": 23.33050537109375, "epoch": 0.3250188964474679, "fcm_dpo/beta": 0.03069309890270233, "fcm_dpo/delta": -0.18765932321548462, "fcm_dpo/margin": 18.799654006958008, "fcm_dpo/q_t": 0.3768354654312134, "grad_norm": 16.50494956970215, "learning_rate": 4.281735428447157e-07, "logits/chosen": 0.3813681900501251, "logits/rejected": 0.2712689936161041, "logps/chosen": -86.42753601074219, "logps/ref_chosen": -63.869136810302734, "logps/ref_rejected": -98.7657241821289, "logps/rejected": -140.123779296875, "loss": 1.0485, "margin_dpo/margin_mean": 18.799654006958008, "margin_dpo/margin_std": 26.779800415039062, "step": 215 }, { "KL/chosen_KL_mean": -19.95848846435547, "KL/mean": -29.962337493896484, "KL/rejected_KL_mean": -39.966182708740234, "KL/std": 23.723655700683594, "epoch": 0.32653061224489793, "fcm_dpo/beta": 0.02930794656276703, "fcm_dpo/delta": -0.19784387946128845, "fcm_dpo/margin": 20.0076961517334, "fcm_dpo/q_t": 0.36996203660964966, "grad_norm": 21.667011260986328, "learning_rate": 4.2724355170431247e-07, "logits/chosen": 0.5238769054412842, "logits/rejected": 0.43370354175567627, "logps/chosen": -87.783447265625, "logps/ref_chosen": -67.824951171875, "logps/ref_rejected": -96.40231323242188, "logps/rejected": -136.36849975585938, "loss": 1.0001, "margin_dpo/margin_mean": 20.0076961517334, "margin_dpo/margin_std": 24.749391555786133, "step": 216 }, { "KL/chosen_KL_mean": -23.450279235839844, "KL/mean": -34.004703521728516, "KL/rejected_KL_mean": -44.55912780761719, "KL/std": 24.92733383178711, "epoch": 0.328042328042328, "fcm_dpo/beta": 0.028042098507285118, "fcm_dpo/delta": -0.20471635460853577, "fcm_dpo/margin": 21.108844757080078, "fcm_dpo/q_t": 0.370144248008728, "grad_norm": 14.39207935333252, "learning_rate": 4.26308602680756e-07, "logits/chosen": 0.4617077112197876, "logits/rejected": 0.3549914062023163, "logps/chosen": -83.95527648925781, "logps/ref_chosen": -60.5049934387207, "logps/ref_rejected": -84.26618194580078, "logps/rejected": -128.8253173828125, "loss": 1.0011, "margin_dpo/margin_mean": 21.108844757080078, "margin_dpo/margin_std": 26.42914390563965, "step": 217 }, { "KL/chosen_KL_mean": -25.236366271972656, "KL/mean": -32.00666809082031, "KL/rejected_KL_mean": -38.776973724365234, "KL/std": 23.776065826416016, "epoch": 0.3295540438397581, "fcm_dpo/beta": 0.028121955692768097, "fcm_dpo/delta": 0.018542245030403137, "fcm_dpo/margin": 13.540607452392578, "fcm_dpo/q_t": 0.4171530604362488, "grad_norm": 16.439096450805664, "learning_rate": 4.253687219265803e-07, "logits/chosen": 0.35637742280960083, "logits/rejected": 0.3534776270389557, "logps/chosen": -95.83068084716797, "logps/ref_chosen": -70.59431457519531, "logps/ref_rejected": -73.89038848876953, "logps/rejected": -112.6673583984375, "loss": 1.2055, "margin_dpo/margin_mean": 13.540607452392578, "margin_dpo/margin_std": 29.244144439697266, "step": 218 }, { "KL/chosen_KL_mean": -24.29410171508789, "KL/mean": -31.36556625366211, "KL/rejected_KL_mean": -38.437042236328125, "KL/std": 25.121906280517578, "epoch": 0.3310657596371882, "fcm_dpo/beta": 0.027831317856907845, "fcm_dpo/delta": 0.006290003657341003, "fcm_dpo/margin": 14.14294147491455, "fcm_dpo/q_t": 0.41288232803344727, "grad_norm": 16.93846893310547, "learning_rate": 4.2442393573227043e-07, "logits/chosen": 0.4106125235557556, "logits/rejected": 0.369218647480011, "logps/chosen": -84.78504943847656, "logps/ref_chosen": -60.490943908691406, "logps/ref_rejected": -75.85001373291016, "logps/rejected": -114.28705596923828, "loss": 1.1407, "margin_dpo/margin_mean": 14.142939567565918, "margin_dpo/margin_std": 24.56814956665039, "step": 219 }, { "KL/chosen_KL_mean": -20.769855499267578, "KL/mean": -29.15686798095703, "KL/rejected_KL_mean": -37.543880462646484, "KL/std": 25.591283798217773, "epoch": 0.3325774754346183, "fcm_dpo/beta": 0.027524903416633606, "fcm_dpo/delta": -0.06579715758562088, "fcm_dpo/margin": 16.774028778076172, "fcm_dpo/q_t": 0.40150097012519836, "grad_norm": 13.661136627197266, "learning_rate": 4.234742705255272e-07, "logits/chosen": 0.5282034873962402, "logits/rejected": 0.46261727809906006, "logps/chosen": -65.78325653076172, "logps/ref_chosen": -45.013397216796875, "logps/ref_rejected": -70.49369812011719, "logps/rejected": -108.03758239746094, "loss": 1.1284, "margin_dpo/margin_mean": 16.774028778076172, "margin_dpo/margin_std": 29.391447067260742, "step": 220 }, { "KL/chosen_KL_mean": -20.707170486450195, "KL/mean": -29.900436401367188, "KL/rejected_KL_mean": -39.09370422363281, "KL/std": 25.316604614257812, "epoch": 0.3340891912320484, "fcm_dpo/beta": 0.027201924473047256, "fcm_dpo/delta": -0.10532025247812271, "fcm_dpo/margin": 18.386539459228516, "fcm_dpo/q_t": 0.3930332660675049, "grad_norm": 16.611440658569336, "learning_rate": 4.22519752870528e-07, "logits/chosen": 0.4675843119621277, "logits/rejected": 0.3941592574119568, "logps/chosen": -79.80300903320312, "logps/ref_chosen": -59.09584045410156, "logps/ref_rejected": -88.64388275146484, "logps/rejected": -127.73758697509766, "loss": 1.0887, "margin_dpo/margin_mean": 18.386539459228516, "margin_dpo/margin_std": 29.355281829833984, "step": 221 }, { "KL/chosen_KL_mean": -21.834548950195312, "KL/mean": -34.32554244995117, "KL/rejected_KL_mean": -46.81653594970703, "KL/std": 28.691497802734375, "epoch": 0.3356009070294785, "fcm_dpo/beta": 0.025927722454071045, "fcm_dpo/delta": -0.26547971367836, "fcm_dpo/margin": 24.98198699951172, "fcm_dpo/q_t": 0.35853731632232666, "grad_norm": 16.38346290588379, "learning_rate": 4.2156040946718343e-07, "logits/chosen": 0.5366965532302856, "logits/rejected": 0.447675883769989, "logps/chosen": -77.83224487304688, "logps/ref_chosen": -55.9976921081543, "logps/ref_rejected": -111.94727325439453, "logps/rejected": -158.76380920410156, "loss": 0.9827, "margin_dpo/margin_mean": 24.98198699951172, "margin_dpo/margin_std": 30.753662109375, "step": 222 }, { "KL/chosen_KL_mean": -26.12401580810547, "KL/mean": -37.08482360839844, "KL/rejected_KL_mean": -48.04563903808594, "KL/std": 28.470643997192383, "epoch": 0.3371126228269085, "fcm_dpo/beta": 0.024924414232373238, "fcm_dpo/delta": -0.1559496819972992, "fcm_dpo/margin": 21.921615600585938, "fcm_dpo/q_t": 0.3798648416996002, "grad_norm": 14.7803955078125, "learning_rate": 4.2059626715039065e-07, "logits/chosen": 0.5469553470611572, "logits/rejected": 0.4884379506111145, "logps/chosen": -86.01544189453125, "logps/ref_chosen": -59.891422271728516, "logps/ref_rejected": -86.28954315185547, "logps/rejected": -134.33517456054688, "loss": 1.0237, "margin_dpo/margin_mean": 21.921615600585938, "margin_dpo/margin_std": 27.957950592041016, "step": 223 }, { "KL/chosen_KL_mean": -30.5401611328125, "KL/mean": -36.695762634277344, "KL/rejected_KL_mean": -42.85136795043945, "KL/std": 28.002614974975586, "epoch": 0.3386243386243386, "fcm_dpo/beta": 0.025149494409561157, "fcm_dpo/delta": 0.09335803985595703, "fcm_dpo/margin": 12.31120491027832, "fcm_dpo/q_t": 0.43343234062194824, "grad_norm": 20.98682403564453, "learning_rate": 4.1962735288928304e-07, "logits/chosen": 0.5356206297874451, "logits/rejected": 0.5141184329986572, "logps/chosen": -94.58479309082031, "logps/ref_chosen": -64.04463195800781, "logps/ref_rejected": -75.05450439453125, "logps/rejected": -117.90586853027344, "loss": 1.2264, "margin_dpo/margin_mean": 12.31120491027832, "margin_dpo/margin_std": 29.377330780029297, "step": 224 }, { "KL/chosen_KL_mean": -29.27713966369629, "KL/mean": -40.99596405029297, "KL/rejected_KL_mean": -52.714778900146484, "KL/std": 31.698970794677734, "epoch": 0.3401360544217687, "fcm_dpo/beta": 0.024466045200824738, "fcm_dpo/delta": -0.18586499989032745, "fcm_dpo/margin": 23.437641143798828, "fcm_dpo/q_t": 0.375728040933609, "grad_norm": 15.36949348449707, "learning_rate": 4.186536937864752e-07, "logits/chosen": 0.5330972671508789, "logits/rejected": 0.4151610732078552, "logps/chosen": -95.37296295166016, "logps/ref_chosen": -66.0958251953125, "logps/ref_rejected": -97.68675231933594, "logps/rejected": -150.40151977539062, "loss": 1.0186, "margin_dpo/margin_mean": 23.437641143798828, "margin_dpo/margin_std": 30.503211975097656, "step": 225 }, { "KL/chosen_KL_mean": -27.569765090942383, "KL/mean": -36.93824005126953, "KL/rejected_KL_mean": -46.30670928955078, "KL/std": 29.689159393310547, "epoch": 0.3416477702191988, "fcm_dpo/beta": 0.024274379014968872, "fcm_dpo/delta": -0.057556722313165665, "fcm_dpo/margin": 18.736942291259766, "fcm_dpo/q_t": 0.40134507417678833, "grad_norm": 14.987967491149902, "learning_rate": 4.176753170773052e-07, "logits/chosen": 0.5614030361175537, "logits/rejected": 0.5139415860176086, "logps/chosen": -78.98663330078125, "logps/ref_chosen": -51.4168701171875, "logps/ref_rejected": -66.30068969726562, "logps/rejected": -112.60739135742188, "loss": 1.148, "margin_dpo/margin_mean": 18.736942291259766, "margin_dpo/margin_std": 34.95653533935547, "step": 226 }, { "KL/chosen_KL_mean": -30.180944442749023, "KL/mean": -40.023277282714844, "KL/rejected_KL_mean": -49.86561584472656, "KL/std": 32.3335075378418, "epoch": 0.3431594860166289, "fcm_dpo/beta": 0.023784009739756584, "fcm_dpo/delta": -0.07184967398643494, "fcm_dpo/margin": 19.684673309326172, "fcm_dpo/q_t": 0.4008902311325073, "grad_norm": 16.232606887817383, "learning_rate": 4.166922501290729e-07, "logits/chosen": 0.6180914044380188, "logits/rejected": 0.5779677629470825, "logps/chosen": -88.17072296142578, "logps/ref_chosen": -57.989776611328125, "logps/ref_rejected": -75.05464172363281, "logps/rejected": -124.92025756835938, "loss": 1.1363, "margin_dpo/margin_mean": 19.684673309326172, "margin_dpo/margin_std": 35.894100189208984, "step": 227 }, { "KL/chosen_KL_mean": -32.853782653808594, "KL/mean": -43.579261779785156, "KL/rejected_KL_mean": -54.30473709106445, "KL/std": 31.011653900146484, "epoch": 0.34467120181405897, "fcm_dpo/beta": 0.023445097729563713, "fcm_dpo/delta": -0.10822771489620209, "fcm_dpo/margin": 21.450952529907227, "fcm_dpo/q_t": 0.3900336027145386, "grad_norm": 16.652875900268555, "learning_rate": 4.1570452044027405e-07, "logits/chosen": 0.597733736038208, "logits/rejected": 0.5147572755813599, "logps/chosen": -88.41314697265625, "logps/ref_chosen": -55.55936813354492, "logps/ref_rejected": -77.02364349365234, "logps/rejected": -131.32838439941406, "loss": 1.0736, "margin_dpo/margin_mean": 21.45095443725586, "margin_dpo/margin_std": 32.027835845947266, "step": 228 }, { "KL/chosen_KL_mean": -26.866287231445312, "KL/mean": -36.41496276855469, "KL/rejected_KL_mean": -45.96363830566406, "KL/std": 30.118515014648438, "epoch": 0.34618291761148906, "fcm_dpo/beta": 0.023083781823515892, "fcm_dpo/delta": -0.042767249047756195, "fcm_dpo/margin": 19.097354888916016, "fcm_dpo/q_t": 0.40380337834358215, "grad_norm": 24.853879928588867, "learning_rate": 4.147121556398312e-07, "logits/chosen": 0.6568770408630371, "logits/rejected": 0.5840392112731934, "logps/chosen": -77.66094970703125, "logps/ref_chosen": -50.79466247558594, "logps/ref_rejected": -78.4474105834961, "logps/rejected": -124.41104888916016, "loss": 1.1599, "margin_dpo/margin_mean": 19.097354888916016, "margin_dpo/margin_std": 37.24081802368164, "step": 229 }, { "KL/chosen_KL_mean": -30.54840087890625, "KL/mean": -41.796043395996094, "KL/rejected_KL_mean": -53.043678283691406, "KL/std": 33.654014587402344, "epoch": 0.3476946334089191, "fcm_dpo/beta": 0.022863391786813736, "fcm_dpo/delta": -0.12128210067749023, "fcm_dpo/margin": 22.495281219482422, "fcm_dpo/q_t": 0.3881077170372009, "grad_norm": 15.285496711730957, "learning_rate": 4.137151834863213e-07, "logits/chosen": 0.5394281148910522, "logits/rejected": 0.5401008129119873, "logps/chosen": -87.27762603759766, "logps/ref_chosen": -56.729225158691406, "logps/ref_rejected": -62.99180603027344, "logps/rejected": -116.03548431396484, "loss": 1.066, "margin_dpo/margin_mean": 22.495281219482422, "margin_dpo/margin_std": 32.23970031738281, "step": 230 }, { "KL/chosen_KL_mean": -35.239524841308594, "KL/mean": -52.531097412109375, "KL/rejected_KL_mean": -69.82267761230469, "KL/std": 33.33924865722656, "epoch": 0.3492063492063492, "fcm_dpo/beta": 0.02119772881269455, "fcm_dpo/delta": -0.36233824491500854, "fcm_dpo/margin": 34.58314895629883, "fcm_dpo/q_t": 0.33587509393692017, "grad_norm": 15.62218952178955, "learning_rate": 4.1271363186719835e-07, "logits/chosen": 0.5032404661178589, "logits/rejected": 0.49624475836753845, "logps/chosen": -107.83662414550781, "logps/ref_chosen": -72.59709930419922, "logps/ref_rejected": -86.2322998046875, "logps/rejected": -156.0549774169922, "loss": 0.9115, "margin_dpo/margin_mean": 34.58314514160156, "margin_dpo/margin_std": 35.64418029785156, "step": 231 }, { "KL/chosen_KL_mean": -35.60374069213867, "KL/mean": -47.167205810546875, "KL/rejected_KL_mean": -58.73066329956055, "KL/std": 36.09865951538086, "epoch": 0.3507180650037793, "fcm_dpo/beta": 0.020644482225179672, "fcm_dpo/delta": -0.08148273080587387, "fcm_dpo/margin": 23.126922607421875, "fcm_dpo/q_t": 0.39880990982055664, "grad_norm": 15.3305025100708, "learning_rate": 4.1170752879801436e-07, "logits/chosen": 0.5095189809799194, "logits/rejected": 0.48098161816596985, "logps/chosen": -103.72227478027344, "logps/ref_chosen": -68.1185302734375, "logps/ref_rejected": -83.79415893554688, "logps/rejected": -142.5248260498047, "loss": 1.125, "margin_dpo/margin_mean": 23.126924514770508, "margin_dpo/margin_std": 41.08509063720703, "step": 232 }, { "KL/chosen_KL_mean": -42.186336517333984, "KL/mean": -51.113990783691406, "KL/rejected_KL_mean": -60.04164505004883, "KL/std": 34.599830627441406, "epoch": 0.35222978080120937, "fcm_dpo/beta": 0.02022051438689232, "fcm_dpo/delta": -0.08994609117507935, "fcm_dpo/margin": 17.855304718017578, "fcm_dpo/q_t": 0.42118969559669495, "grad_norm": 15.223427772521973, "learning_rate": 4.106969024216348e-07, "logits/chosen": 0.6028063297271729, "logits/rejected": 0.5426856875419617, "logps/chosen": -97.25648498535156, "logps/ref_chosen": -55.070152282714844, "logps/ref_rejected": -66.61845397949219, "logps/rejected": -126.66009521484375, "loss": 1.1772, "margin_dpo/margin_mean": 17.85530662536621, "margin_dpo/margin_std": 34.058006286621094, "step": 233 }, { "KL/chosen_KL_mean": -37.9176025390625, "KL/mean": -46.871253967285156, "KL/rejected_KL_mean": -55.82490539550781, "KL/std": 33.52410888671875, "epoch": 0.35374149659863946, "fcm_dpo/beta": 0.019993459805846214, "fcm_dpo/delta": -0.11356954276561737, "fcm_dpo/margin": 17.907297134399414, "fcm_dpo/q_t": 0.42242223024368286, "grad_norm": 19.345218658447266, "learning_rate": 4.09681781007452e-07, "logits/chosen": 0.4989665746688843, "logits/rejected": 0.49075978994369507, "logps/chosen": -93.84349822998047, "logps/ref_chosen": -55.92589569091797, "logps/ref_rejected": -51.11608123779297, "logps/rejected": -106.94098663330078, "loss": 1.2008, "margin_dpo/margin_mean": 17.907297134399414, "margin_dpo/margin_std": 36.353973388671875, "step": 234 }, { "KL/chosen_KL_mean": -32.505218505859375, "KL/mean": -47.86866760253906, "KL/rejected_KL_mean": -63.23212814331055, "KL/std": 36.218650817871094, "epoch": 0.35525321239606955, "fcm_dpo/beta": 0.01923798769712448, "fcm_dpo/delta": -0.2027154266834259, "fcm_dpo/margin": 30.726903915405273, "fcm_dpo/q_t": 0.3672284185886383, "grad_norm": 14.244831085205078, "learning_rate": 4.08662192950594e-07, "logits/chosen": 0.5909181237220764, "logits/rejected": 0.5762823224067688, "logps/chosen": -97.0449447631836, "logps/ref_chosen": -64.53972625732422, "logps/ref_rejected": -77.69151306152344, "logps/rejected": -140.92364501953125, "loss": 0.9866, "margin_dpo/margin_mean": 30.726903915405273, "margin_dpo/margin_std": 35.55228805541992, "step": 235 }, { "KL/chosen_KL_mean": -51.062217712402344, "KL/mean": -62.52552032470703, "KL/rejected_KL_mean": -73.98883056640625, "KL/std": 36.14385223388672, "epoch": 0.35676492819349964, "fcm_dpo/beta": 0.018829286098480225, "fcm_dpo/delta": -0.0334140881896019, "fcm_dpo/margin": 22.92660903930664, "fcm_dpo/q_t": 0.40489548444747925, "grad_norm": 14.33649730682373, "learning_rate": 4.076381667711306e-07, "logits/chosen": 0.5399256944656372, "logits/rejected": 0.5271640419960022, "logps/chosen": -122.21694946289062, "logps/ref_chosen": -71.15473937988281, "logps/ref_rejected": -84.88541412353516, "logps/rejected": -158.87423706054688, "loss": 1.1367, "margin_dpo/margin_mean": 22.92660903930664, "margin_dpo/margin_std": 40.703575134277344, "step": 236 }, { "KL/chosen_KL_mean": -43.26026916503906, "KL/mean": -57.158023834228516, "KL/rejected_KL_mean": -71.05577850341797, "KL/std": 36.259342193603516, "epoch": 0.35827664399092973, "fcm_dpo/beta": 0.018561359494924545, "fcm_dpo/delta": -0.1220531016588211, "fcm_dpo/margin": 27.795513153076172, "fcm_dpo/q_t": 0.3856336772441864, "grad_norm": 17.481191635131836, "learning_rate": 4.066097311132753e-07, "logits/chosen": 0.6076939105987549, "logits/rejected": 0.597443163394928, "logps/chosen": -119.40228271484375, "logps/ref_chosen": -76.14201354980469, "logps/ref_rejected": -80.88479614257812, "logps/rejected": -151.94058227539062, "loss": 1.0758, "margin_dpo/margin_mean": 27.795513153076172, "margin_dpo/margin_std": 41.35049819946289, "step": 237 }, { "KL/chosen_KL_mean": -39.10129928588867, "KL/mean": -53.06718826293945, "KL/rejected_KL_mean": -67.0330810546875, "KL/std": 37.695091247558594, "epoch": 0.35978835978835977, "fcm_dpo/beta": 0.018000055104494095, "fcm_dpo/delta": -0.10950126498937607, "fcm_dpo/margin": 27.93178367614746, "fcm_dpo/q_t": 0.3884713351726532, "grad_norm": 21.357378005981445, "learning_rate": 4.0557691474458414e-07, "logits/chosen": 0.5728151798248291, "logits/rejected": 0.5639808177947998, "logps/chosen": -107.98614501953125, "logps/ref_chosen": -68.88484954833984, "logps/ref_rejected": -75.8946304321289, "logps/rejected": -142.92770385742188, "loss": 1.0653, "margin_dpo/margin_mean": 27.93178367614746, "margin_dpo/margin_std": 39.66338348388672, "step": 238 }, { "KL/chosen_KL_mean": -48.25627136230469, "KL/mean": -61.66255569458008, "KL/rejected_KL_mean": -75.06884765625, "KL/std": 40.41633224487305, "epoch": 0.36130007558578986, "fcm_dpo/beta": 0.017894674092531204, "fcm_dpo/delta": -0.08414621651172638, "fcm_dpo/margin": 26.812572479248047, "fcm_dpo/q_t": 0.3949527442455292, "grad_norm": 16.937339782714844, "learning_rate": 4.045397465551513e-07, "logits/chosen": 0.6998695135116577, "logits/rejected": 0.5612987279891968, "logps/chosen": -105.0280990600586, "logps/ref_chosen": -56.771827697753906, "logps/ref_rejected": -116.23050689697266, "logps/rejected": -191.29934692382812, "loss": 1.096, "margin_dpo/margin_mean": 26.812570571899414, "margin_dpo/margin_std": 41.91584777832031, "step": 239 }, { "KL/chosen_KL_mean": -46.63644027709961, "KL/mean": -64.09809875488281, "KL/rejected_KL_mean": -81.55975341796875, "KL/std": 40.71031188964844, "epoch": 0.36281179138321995, "fcm_dpo/beta": 0.017164895310997963, "fcm_dpo/delta": -0.21204231679439545, "fcm_dpo/margin": 34.923309326171875, "fcm_dpo/q_t": 0.3671082854270935, "grad_norm": 13.129941940307617, "learning_rate": 4.0349825555680045e-07, "logits/chosen": 0.590008556842804, "logits/rejected": 0.4937673807144165, "logps/chosen": -99.99055480957031, "logps/ref_chosen": -53.35411071777344, "logps/ref_rejected": -80.12019348144531, "logps/rejected": -161.67996215820312, "loss": 0.992, "margin_dpo/margin_mean": 34.92331314086914, "margin_dpo/margin_std": 42.06145477294922, "step": 240 }, { "KL/chosen_KL_mean": -44.580406188964844, "KL/mean": -56.076847076416016, "KL/rejected_KL_mean": -67.57328796386719, "KL/std": 38.25746536254883, "epoch": 0.36432350718065004, "fcm_dpo/beta": 0.0170365609228611, "fcm_dpo/delta": 0.008422527462244034, "fcm_dpo/margin": 22.99288558959961, "fcm_dpo/q_t": 0.413882851600647, "grad_norm": 17.834257125854492, "learning_rate": 4.0245247088227377e-07, "logits/chosen": 0.547561526298523, "logits/rejected": 0.5109948515892029, "logps/chosen": -116.47581481933594, "logps/ref_chosen": -71.89541625976562, "logps/ref_rejected": -83.03492736816406, "logps/rejected": -150.60821533203125, "loss": 1.1496, "margin_dpo/margin_mean": 22.99288558959961, "margin_dpo/margin_std": 41.658973693847656, "step": 241 }, { "KL/chosen_KL_mean": -46.325008392333984, "KL/mean": -61.806419372558594, "KL/rejected_KL_mean": -77.28782653808594, "KL/std": 41.088233947753906, "epoch": 0.36583522297808013, "fcm_dpo/beta": 0.016576804220676422, "fcm_dpo/delta": -0.12119430303573608, "fcm_dpo/margin": 30.96282196044922, "fcm_dpo/q_t": 0.3866094946861267, "grad_norm": 12.655095100402832, "learning_rate": 4.0140242178441665e-07, "logits/chosen": 0.5564873814582825, "logits/rejected": 0.5371856689453125, "logps/chosen": -104.25244140625, "logps/ref_chosen": -57.927433013916016, "logps/ref_rejected": -67.838623046875, "logps/rejected": -145.12644958496094, "loss": 1.0591, "margin_dpo/margin_mean": 30.96282196044922, "margin_dpo/margin_std": 43.73744201660156, "step": 242 }, { "KL/chosen_KL_mean": -45.42218017578125, "KL/mean": -59.03864669799805, "KL/rejected_KL_mean": -72.65509796142578, "KL/std": 40.808799743652344, "epoch": 0.3673469387755102, "fcm_dpo/beta": 0.0164511539041996, "fcm_dpo/delta": -0.05031604319810867, "fcm_dpo/margin": 27.232929229736328, "fcm_dpo/q_t": 0.3998001217842102, "grad_norm": 16.670740127563477, "learning_rate": 4.003481376353596e-07, "logits/chosen": 0.5832501649856567, "logits/rejected": 0.5915569067001343, "logps/chosen": -119.6988525390625, "logps/ref_chosen": -74.27667236328125, "logps/ref_rejected": -73.24340057373047, "logps/rejected": -145.89849853515625, "loss": 1.0961, "margin_dpo/margin_mean": 27.232929229736328, "margin_dpo/margin_std": 41.72847366333008, "step": 243 }, { "KL/chosen_KL_mean": -44.195556640625, "KL/mean": -62.29296875, "KL/rejected_KL_mean": -80.390380859375, "KL/std": 40.255855560302734, "epoch": 0.3688586545729403, "fcm_dpo/beta": 0.01603120006620884, "fcm_dpo/delta": -0.1909106969833374, "fcm_dpo/margin": 36.19482421875, "fcm_dpo/q_t": 0.36936506628990173, "grad_norm": 14.04793930053711, "learning_rate": 3.9928964792569654e-07, "logits/chosen": 0.6235268115997314, "logits/rejected": 0.5367005467414856, "logps/chosen": -97.55946350097656, "logps/ref_chosen": -53.36390686035156, "logps/ref_rejected": -71.10276794433594, "logps/rejected": -151.4931640625, "loss": 0.9898, "margin_dpo/margin_mean": 36.194820404052734, "margin_dpo/margin_std": 41.15448760986328, "step": 244 }, { "KL/chosen_KL_mean": -48.746925354003906, "KL/mean": -69.70963287353516, "KL/rejected_KL_mean": -90.6723403930664, "KL/std": 41.50957489013672, "epoch": 0.37037037037037035, "fcm_dpo/beta": 0.015208459459245205, "fcm_dpo/delta": -0.25406065583229065, "fcm_dpo/margin": 41.92542266845703, "fcm_dpo/q_t": 0.3544720709323883, "grad_norm": 16.570419311523438, "learning_rate": 3.982269822636601e-07, "logits/chosen": 0.6451157331466675, "logits/rejected": 0.618733286857605, "logps/chosen": -119.94203186035156, "logps/ref_chosen": -71.19510650634766, "logps/ref_rejected": -80.76235961914062, "logps/rejected": -171.4346923828125, "loss": 0.9386, "margin_dpo/margin_mean": 41.92542266845703, "margin_dpo/margin_std": 41.73632049560547, "step": 245 }, { "KL/chosen_KL_mean": -57.780372619628906, "KL/mean": -73.9398193359375, "KL/rejected_KL_mean": -90.09927368164062, "KL/std": 42.128692626953125, "epoch": 0.37188208616780044, "fcm_dpo/beta": 0.014911343343555927, "fcm_dpo/delta": -0.08618468046188354, "fcm_dpo/margin": 32.31889343261719, "fcm_dpo/q_t": 0.3935587406158447, "grad_norm": 14.610726356506348, "learning_rate": 3.971601703742932e-07, "logits/chosen": 0.6617841124534607, "logits/rejected": 0.5988600254058838, "logps/chosen": -129.40142822265625, "logps/ref_chosen": -71.62104797363281, "logps/ref_rejected": -94.03392028808594, "logps/rejected": -184.13319396972656, "loss": 1.1032, "margin_dpo/margin_mean": 32.31889343261719, "margin_dpo/margin_std": 52.08976364135742, "step": 246 }, { "KL/chosen_KL_mean": -63.13233947753906, "KL/mean": -72.38699340820312, "KL/rejected_KL_mean": -81.64165496826172, "KL/std": 41.207435607910156, "epoch": 0.37339380196523053, "fcm_dpo/beta": 0.014794323593378067, "fcm_dpo/delta": 0.018812095746397972, "fcm_dpo/margin": 18.509305953979492, "fcm_dpo/q_t": 0.43781301379203796, "grad_norm": 16.236923217773438, "learning_rate": 3.960892420986177e-07, "logits/chosen": 0.627230167388916, "logits/rejected": 0.6176382303237915, "logps/chosen": -143.15487670898438, "logps/ref_chosen": -80.02254486083984, "logps/ref_rejected": -89.22705841064453, "logps/rejected": -170.86871337890625, "loss": 1.2363, "margin_dpo/margin_mean": 18.50930404663086, "margin_dpo/margin_std": 44.51948547363281, "step": 247 }, { "KL/chosen_KL_mean": -52.94854736328125, "KL/mean": -70.20024108886719, "KL/rejected_KL_mean": -87.45193481445312, "KL/std": 45.64698791503906, "epoch": 0.3749055177626606, "fcm_dpo/beta": 0.01463395357131958, "fcm_dpo/delta": -0.11036857962608337, "fcm_dpo/margin": 34.503379821777344, "fcm_dpo/q_t": 0.3901105523109436, "grad_norm": 14.174400329589844, "learning_rate": 3.9501422739279953e-07, "logits/chosen": 0.6166936159133911, "logits/rejected": 0.67302405834198, "logps/chosen": -118.32650756835938, "logps/ref_chosen": -65.37796020507812, "logps/ref_rejected": -61.365787506103516, "logps/rejected": -148.81771850585938, "loss": 1.0734, "margin_dpo/margin_mean": 34.503379821777344, "margin_dpo/margin_std": 51.22041702270508, "step": 248 }, { "KL/chosen_KL_mean": -67.64529418945312, "KL/mean": -74.41305541992188, "KL/rejected_KL_mean": -81.18081665039062, "KL/std": 42.51622009277344, "epoch": 0.3764172335600907, "fcm_dpo/beta": 0.01455026213079691, "fcm_dpo/delta": 0.035534489899873734, "fcm_dpo/margin": 13.535521507263184, "fcm_dpo/q_t": 0.4550870954990387, "grad_norm": 17.20585823059082, "learning_rate": 3.9393515632731094e-07, "logits/chosen": 0.6215823888778687, "logits/rejected": 0.6616165637969971, "logps/chosen": -142.24676513671875, "logps/ref_chosen": -74.60145568847656, "logps/ref_rejected": -63.79338455200195, "logps/rejected": -144.97421264648438, "loss": 1.3186, "margin_dpo/margin_mean": 13.5355224609375, "margin_dpo/margin_std": 47.453304290771484, "step": 249 }, { "KL/chosen_KL_mean": -57.918060302734375, "KL/mean": -75.33280944824219, "KL/rejected_KL_mean": -92.74755859375, "KL/std": 45.837074279785156, "epoch": 0.3779289493575208, "fcm_dpo/beta": 0.014430014416575432, "fcm_dpo/delta": -0.10792499035596848, "fcm_dpo/margin": 34.82949447631836, "fcm_dpo/q_t": 0.3871173858642578, "grad_norm": 14.864943504333496, "learning_rate": 3.9285205908608934e-07, "logits/chosen": 0.6762036681175232, "logits/rejected": 0.6336355209350586, "logps/chosen": -119.85627746582031, "logps/ref_chosen": -61.938209533691406, "logps/ref_rejected": -72.21602630615234, "logps/rejected": -164.96359252929688, "loss": 1.0565, "margin_dpo/margin_mean": 34.82949447631836, "margin_dpo/margin_std": 47.963722229003906, "step": 250 }, { "KL/chosen_KL_mean": -63.92451477050781, "KL/mean": -74.58179473876953, "KL/rejected_KL_mean": -85.23907470703125, "KL/std": 42.38800811767578, "epoch": 0.3794406651549509, "fcm_dpo/beta": 0.014470743015408516, "fcm_dpo/delta": 0.09457513689994812, "fcm_dpo/margin": 21.314552307128906, "fcm_dpo/q_t": 0.43116044998168945, "grad_norm": 23.076126098632812, "learning_rate": 3.9176496596569265e-07, "logits/chosen": 0.7020321488380432, "logits/rejected": 0.6589365601539612, "logps/chosen": -130.78146362304688, "logps/ref_chosen": -66.85694885253906, "logps/ref_rejected": -84.83396911621094, "logps/rejected": -170.0730438232422, "loss": 1.2174, "margin_dpo/margin_mean": 21.314552307128906, "margin_dpo/margin_std": 48.53556823730469, "step": 251 }, { "KL/chosen_KL_mean": -58.9593505859375, "KL/mean": -68.72596740722656, "KL/rejected_KL_mean": -78.49258422851562, "KL/std": 44.51349639892578, "epoch": 0.38095238095238093, "fcm_dpo/beta": 0.014497705735266209, "fcm_dpo/delta": -0.045059625059366226, "fcm_dpo/margin": 19.533231735229492, "fcm_dpo/q_t": 0.43602365255355835, "grad_norm": 22.25311851501465, "learning_rate": 3.9067390737445254e-07, "logits/chosen": 0.605857253074646, "logits/rejected": 0.550957202911377, "logps/chosen": -115.18328094482422, "logps/ref_chosen": -56.22393035888672, "logps/ref_rejected": -77.1136245727539, "logps/rejected": -155.606201171875, "loss": 1.2489, "margin_dpo/margin_mean": 19.533233642578125, "margin_dpo/margin_std": 48.49584197998047, "step": 252 }, { "KL/chosen_KL_mean": -59.36012268066406, "KL/mean": -71.94851684570312, "KL/rejected_KL_mean": -84.53691101074219, "KL/std": 43.9737548828125, "epoch": 0.382464096749811, "fcm_dpo/beta": 0.014276335947215557, "fcm_dpo/delta": -0.05435481294989586, "fcm_dpo/margin": 25.176788330078125, "fcm_dpo/q_t": 0.4188511371612549, "grad_norm": 14.540387153625488, "learning_rate": 3.8957891383162304e-07, "logits/chosen": 0.7189779281616211, "logits/rejected": 0.67665034532547, "logps/chosen": -111.57013702392578, "logps/ref_chosen": -52.21001434326172, "logps/ref_rejected": -58.75764846801758, "logps/rejected": -143.2945556640625, "loss": 1.1572, "margin_dpo/margin_mean": 25.176786422729492, "margin_dpo/margin_std": 44.18144226074219, "step": 253 }, { "KL/chosen_KL_mean": -62.73114013671875, "KL/mean": -76.93605041503906, "KL/rejected_KL_mean": -91.14094543457031, "KL/std": 45.79835510253906, "epoch": 0.3839758125472411, "fcm_dpo/beta": 0.014211953617632389, "fcm_dpo/delta": -0.004243422299623489, "fcm_dpo/margin": 28.40981674194336, "fcm_dpo/q_t": 0.41077563166618347, "grad_norm": 14.012600898742676, "learning_rate": 3.884800159665276e-07, "logits/chosen": 0.6211004257202148, "logits/rejected": 0.5690572261810303, "logps/chosen": -128.36746215820312, "logps/ref_chosen": -65.63632202148438, "logps/ref_rejected": -82.34425354003906, "logps/rejected": -173.48519897460938, "loss": 1.1239, "margin_dpo/margin_mean": 28.409818649291992, "margin_dpo/margin_std": 46.22943878173828, "step": 254 }, { "KL/chosen_KL_mean": -56.52094268798828, "KL/mean": -73.97906494140625, "KL/rejected_KL_mean": -91.43719482421875, "KL/std": 46.1926155090332, "epoch": 0.3854875283446712, "fcm_dpo/beta": 0.014045731164515018, "fcm_dpo/delta": -0.09524255245923996, "fcm_dpo/margin": 34.916255950927734, "fcm_dpo/q_t": 0.39121708273887634, "grad_norm": 19.894235610961914, "learning_rate": 3.873772445177015e-07, "logits/chosen": 0.59712153673172, "logits/rejected": 0.5688225030899048, "logps/chosen": -124.43203735351562, "logps/ref_chosen": -67.91108703613281, "logps/ref_rejected": -83.89114379882812, "logps/rejected": -175.32833862304688, "loss": 1.068, "margin_dpo/margin_mean": 34.916255950927734, "margin_dpo/margin_std": 50.28325271606445, "step": 255 }, { "KL/chosen_KL_mean": -67.51197052001953, "KL/mean": -83.4915542602539, "KL/rejected_KL_mean": -99.47113800048828, "KL/std": 43.156517028808594, "epoch": 0.3869992441421013, "fcm_dpo/beta": 0.013857575133442879, "fcm_dpo/delta": -0.04520774260163307, "fcm_dpo/margin": 31.95915412902832, "fcm_dpo/q_t": 0.4027029275894165, "grad_norm": 17.021940231323242, "learning_rate": 3.862706303320329e-07, "logits/chosen": 0.6515041589736938, "logits/rejected": 0.585243284702301, "logps/chosen": -131.011962890625, "logps/ref_chosen": -63.49998474121094, "logps/ref_rejected": -90.77104187011719, "logps/rejected": -190.2421875, "loss": 1.1247, "margin_dpo/margin_mean": 31.959152221679688, "margin_dpo/margin_std": 54.93577575683594, "step": 256 }, { "KL/chosen_KL_mean": -65.01585388183594, "KL/mean": -84.360107421875, "KL/rejected_KL_mean": -103.7043685913086, "KL/std": 48.42784881591797, "epoch": 0.3885109599395314, "fcm_dpo/beta": 0.013574027456343174, "fcm_dpo/delta": -0.13240863382816315, "fcm_dpo/margin": 38.688533782958984, "fcm_dpo/q_t": 0.3846198320388794, "grad_norm": 14.428050994873047, "learning_rate": 3.851602043638994e-07, "logits/chosen": 0.6495592594146729, "logits/rejected": 0.5813044309616089, "logps/chosen": -135.6165008544922, "logps/ref_chosen": -70.60064697265625, "logps/ref_rejected": -108.58313751220703, "logps/rejected": -212.28750610351562, "loss": 1.0516, "margin_dpo/margin_mean": 38.68852996826172, "margin_dpo/margin_std": 54.61080551147461, "step": 257 }, { "KL/chosen_KL_mean": -64.80550384521484, "KL/mean": -79.96160888671875, "KL/rejected_KL_mean": -95.11770629882812, "KL/std": 42.581756591796875, "epoch": 0.3900226757369615, "fcm_dpo/beta": 0.013502737507224083, "fcm_dpo/delta": -0.00971025601029396, "fcm_dpo/margin": 30.312185287475586, "fcm_dpo/q_t": 0.40481850504875183, "grad_norm": 16.551925659179688, "learning_rate": 3.840459976743023e-07, "logits/chosen": 0.6557002663612366, "logits/rejected": 0.6039286255836487, "logps/chosen": -124.0596694946289, "logps/ref_chosen": -59.25416564941406, "logps/ref_rejected": -85.58709716796875, "logps/rejected": -180.70480346679688, "loss": 1.0877, "margin_dpo/margin_mean": 30.312183380126953, "margin_dpo/margin_std": 40.388633728027344, "step": 258 }, { "KL/chosen_KL_mean": -55.14162826538086, "KL/mean": -78.388671875, "KL/rejected_KL_mean": -101.63571166992188, "KL/std": 48.088260650634766, "epoch": 0.3915343915343915, "fcm_dpo/beta": 0.012981683015823364, "fcm_dpo/delta": -0.21838447451591492, "fcm_dpo/margin": 46.49407958984375, "fcm_dpo/q_t": 0.36385172605514526, "grad_norm": 14.587737083435059, "learning_rate": 3.8292804142999796e-07, "logits/chosen": 0.5898592472076416, "logits/rejected": 0.48572519421577454, "logps/chosen": -120.57650756835938, "logps/ref_chosen": -65.43487548828125, "logps/ref_rejected": -95.41731262207031, "logps/rejected": -197.05300903320312, "loss": 0.9912, "margin_dpo/margin_mean": 46.49407958984375, "margin_dpo/margin_std": 55.28227233886719, "step": 259 }, { "KL/chosen_KL_mean": -55.989410400390625, "KL/mean": -74.40701293945312, "KL/rejected_KL_mean": -92.82461547851562, "KL/std": 46.183128356933594, "epoch": 0.3930461073318216, "fcm_dpo/beta": 0.012708716094493866, "fcm_dpo/delta": -0.07222156971693039, "fcm_dpo/margin": 36.835208892822266, "fcm_dpo/q_t": 0.39630264043807983, "grad_norm": 14.415229797363281, "learning_rate": 3.818063669026256e-07, "logits/chosen": 0.6366469860076904, "logits/rejected": 0.5467737913131714, "logps/chosen": -105.07899475097656, "logps/ref_chosen": -49.08958435058594, "logps/ref_rejected": -79.01708221435547, "logps/rejected": -171.84170532226562, "loss": 1.0961, "margin_dpo/margin_mean": 36.835208892822266, "margin_dpo/margin_std": 57.32475662231445, "step": 260 }, { "KL/chosen_KL_mean": -54.49605178833008, "KL/mean": -70.47838592529297, "KL/rejected_KL_mean": -86.4607162475586, "KL/std": 44.49393081665039, "epoch": 0.3945578231292517, "fcm_dpo/beta": 0.012731574475765228, "fcm_dpo/delta": -0.007326893508434296, "fcm_dpo/margin": 31.964664459228516, "fcm_dpo/q_t": 0.4090234041213989, "grad_norm": 16.047224044799805, "learning_rate": 3.806810054678331e-07, "logits/chosen": 0.5308667421340942, "logits/rejected": 0.5668069124221802, "logps/chosen": -125.36844635009766, "logps/ref_chosen": -70.87239074707031, "logps/ref_rejected": -65.01522064208984, "logps/rejected": -151.47593688964844, "loss": 1.1149, "margin_dpo/margin_mean": 31.964664459228516, "margin_dpo/margin_std": 50.045799255371094, "step": 261 }, { "KL/chosen_KL_mean": -59.13224792480469, "KL/mean": -74.1536865234375, "KL/rejected_KL_mean": -89.17512512207031, "KL/std": 43.567718505859375, "epoch": 0.3960695389266818, "fcm_dpo/beta": 0.01275954395532608, "fcm_dpo/delta": 0.017208915203809738, "fcm_dpo/margin": 30.042877197265625, "fcm_dpo/q_t": 0.4133336842060089, "grad_norm": 16.049264907836914, "learning_rate": 3.7955198860439887e-07, "logits/chosen": 0.6843748092651367, "logits/rejected": 0.6185039281845093, "logps/chosen": -127.00287628173828, "logps/ref_chosen": -67.8706283569336, "logps/ref_rejected": -88.7205810546875, "logps/rejected": -177.8957061767578, "loss": 1.1271, "margin_dpo/margin_mean": 30.042877197265625, "margin_dpo/margin_std": 47.83500671386719, "step": 262 }, { "KL/chosen_KL_mean": -55.72979736328125, "KL/mean": -70.17361450195312, "KL/rejected_KL_mean": -84.617431640625, "KL/std": 43.927146911621094, "epoch": 0.3975812547241119, "fcm_dpo/beta": 0.012823976576328278, "fcm_dpo/delta": 0.030575139448046684, "fcm_dpo/margin": 28.88762664794922, "fcm_dpo/q_t": 0.41590556502342224, "grad_norm": 14.43605899810791, "learning_rate": 3.784193478933516e-07, "logits/chosen": 0.6303784251213074, "logits/rejected": 0.517768144607544, "logps/chosen": -110.92437744140625, "logps/ref_chosen": -55.194583892822266, "logps/ref_rejected": -80.54048156738281, "logps/rejected": -165.1579132080078, "loss": 1.1425, "margin_dpo/margin_mean": 28.88762664794922, "margin_dpo/margin_std": 48.990997314453125, "step": 263 }, { "KL/chosen_KL_mean": -58.10887908935547, "KL/mean": -74.0867919921875, "KL/rejected_KL_mean": -90.06471252441406, "KL/std": 48.00414276123047, "epoch": 0.39909297052154197, "fcm_dpo/beta": 0.012778695672750473, "fcm_dpo/delta": -0.008820058777928352, "fcm_dpo/margin": 31.955841064453125, "fcm_dpo/q_t": 0.40929892659187317, "grad_norm": 14.940961837768555, "learning_rate": 3.7728311501708674e-07, "logits/chosen": 0.5113496780395508, "logits/rejected": 0.467226505279541, "logps/chosen": -141.27957153320312, "logps/ref_chosen": -83.17068481445312, "logps/ref_rejected": -88.33625793457031, "logps/rejected": -178.40097045898438, "loss": 1.1295, "margin_dpo/margin_mean": 31.955841064453125, "margin_dpo/margin_std": 53.869483947753906, "step": 264 }, { "KL/chosen_KL_mean": -56.73738098144531, "KL/mean": -74.24681091308594, "KL/rejected_KL_mean": -91.7562484741211, "KL/std": 48.956146240234375, "epoch": 0.40060468631897206, "fcm_dpo/beta": 0.012759245000779629, "fcm_dpo/delta": -0.04914683848619461, "fcm_dpo/margin": 35.01887512207031, "fcm_dpo/q_t": 0.40133213996887207, "grad_norm": 13.167969703674316, "learning_rate": 3.7614332175848027e-07, "logits/chosen": 0.7118724584579468, "logits/rejected": 0.6441998481750488, "logps/chosen": -108.40023040771484, "logps/ref_chosen": -51.66284942626953, "logps/ref_rejected": -67.1720962524414, "logps/rejected": -158.9283447265625, "loss": 1.1306, "margin_dpo/margin_mean": 35.01887512207031, "margin_dpo/margin_std": 60.94348907470703, "step": 265 }, { "KL/chosen_KL_mean": -53.77226257324219, "KL/mean": -70.8931884765625, "KL/rejected_KL_mean": -88.01409912109375, "KL/std": 48.8175048828125, "epoch": 0.4021164021164021, "fcm_dpo/beta": 0.012599462643265724, "fcm_dpo/delta": -0.03290412202477455, "fcm_dpo/margin": 34.241844177246094, "fcm_dpo/q_t": 0.40384742617607117, "grad_norm": 17.787485122680664, "learning_rate": 3.75e-07, "logits/chosen": 0.6365302205085754, "logits/rejected": 0.5593627691268921, "logps/chosen": -111.22276306152344, "logps/ref_chosen": -57.45049285888672, "logps/ref_rejected": -77.60826110839844, "logps/rejected": -165.6223602294922, "loss": 1.1044, "margin_dpo/margin_mean": 34.24184799194336, "margin_dpo/margin_std": 53.32252502441406, "step": 266 }, { "KL/chosen_KL_mean": -48.61222457885742, "KL/mean": -62.22151184082031, "KL/rejected_KL_mean": -75.830810546875, "KL/std": 45.412784576416016, "epoch": 0.4036281179138322, "fcm_dpo/beta": 0.012520255520939827, "fcm_dpo/delta": -0.05705377832055092, "fcm_dpo/margin": 27.218570709228516, "fcm_dpo/q_t": 0.4225795567035675, "grad_norm": 15.039100646972656, "learning_rate": 3.738531817228131e-07, "logits/chosen": 0.6629495620727539, "logits/rejected": 0.6446437835693359, "logps/chosen": -103.64757537841797, "logps/ref_chosen": -55.03535079956055, "logps/ref_rejected": -66.0953369140625, "logps/rejected": -141.9261474609375, "loss": 1.1933, "margin_dpo/margin_mean": 27.218570709228516, "margin_dpo/margin_std": 54.503211975097656, "step": 267 }, { "KL/chosen_KL_mean": -48.70326614379883, "KL/mean": -64.34419250488281, "KL/rejected_KL_mean": -79.98513793945312, "KL/std": 47.980010986328125, "epoch": 0.4051398337112623, "fcm_dpo/beta": 0.012494435533881187, "fcm_dpo/delta": 0.009397927671670914, "fcm_dpo/margin": 31.2818660736084, "fcm_dpo/q_t": 0.4124756157398224, "grad_norm": 13.093320846557617, "learning_rate": 3.7270289900589204e-07, "logits/chosen": 0.5435836315155029, "logits/rejected": 0.5286034345626831, "logps/chosen": -113.77500915527344, "logps/ref_chosen": -65.07174682617188, "logps/ref_rejected": -71.42485809326172, "logps/rejected": -151.41000366210938, "loss": 1.113, "margin_dpo/margin_mean": 31.2818660736084, "margin_dpo/margin_std": 46.741065979003906, "step": 268 }, { "KL/chosen_KL_mean": -54.56511688232422, "KL/mean": -74.28379821777344, "KL/rejected_KL_mean": -94.00248718261719, "KL/std": 50.83015060424805, "epoch": 0.40665154950869237, "fcm_dpo/beta": 0.012384241446852684, "fcm_dpo/delta": -0.09324120730161667, "fcm_dpo/margin": 39.4373779296875, "fcm_dpo/q_t": 0.39027777314186096, "grad_norm": 13.482841491699219, "learning_rate": 3.7154918402511714e-07, "logits/chosen": 0.7641857862472534, "logits/rejected": 0.7141634225845337, "logps/chosen": -121.70132446289062, "logps/ref_chosen": -67.1362075805664, "logps/ref_rejected": -82.55778503417969, "logps/rejected": -176.56027221679688, "loss": 1.0564, "margin_dpo/margin_mean": 39.4373779296875, "margin_dpo/margin_std": 51.888084411621094, "step": 269 }, { "KL/chosen_KL_mean": -56.4378662109375, "KL/mean": -71.77268981933594, "KL/rejected_KL_mean": -87.10750579833984, "KL/std": 49.6619873046875, "epoch": 0.40816326530612246, "fcm_dpo/beta": 0.012249600142240524, "fcm_dpo/delta": 0.025138020515441895, "fcm_dpo/margin": 30.669635772705078, "fcm_dpo/q_t": 0.41466546058654785, "grad_norm": 14.274627685546875, "learning_rate": 3.7039206905237656e-07, "logits/chosen": 0.6896152496337891, "logits/rejected": 0.6048238277435303, "logps/chosen": -123.1265640258789, "logps/ref_chosen": -66.6886978149414, "logps/ref_rejected": -85.16129302978516, "logps/rejected": -172.268798828125, "loss": 1.146, "margin_dpo/margin_mean": 30.669635772705078, "margin_dpo/margin_std": 53.44659423828125, "step": 270 }, { "KL/chosen_KL_mean": -56.94865417480469, "KL/mean": -69.40467071533203, "KL/rejected_KL_mean": -81.86067962646484, "KL/std": 51.098960876464844, "epoch": 0.40967498110355255, "fcm_dpo/beta": 0.012330984696745872, "fcm_dpo/delta": 0.011150313541293144, "fcm_dpo/margin": 24.91202735900879, "fcm_dpo/q_t": 0.4350472688674927, "grad_norm": 16.124603271484375, "learning_rate": 3.692315864546635e-07, "logits/chosen": 0.6839946508407593, "logits/rejected": 0.6181609630584717, "logps/chosen": -129.356201171875, "logps/ref_chosen": -72.40754699707031, "logps/ref_rejected": -92.06311798095703, "logps/rejected": -173.92379760742188, "loss": 1.2306, "margin_dpo/margin_mean": 24.912025451660156, "margin_dpo/margin_std": 60.20992660522461, "step": 271 }, { "KL/chosen_KL_mean": -47.66930389404297, "KL/mean": -72.03286743164062, "KL/rejected_KL_mean": -96.39643096923828, "KL/std": 48.550479888916016, "epoch": 0.41118669690098264, "fcm_dpo/beta": 0.012013398110866547, "fcm_dpo/delta": -0.19650031626224518, "fcm_dpo/margin": 48.72712707519531, "fcm_dpo/q_t": 0.3648834228515625, "grad_norm": 15.218764305114746, "learning_rate": 3.6806776869317067e-07, "logits/chosen": 0.6682814359664917, "logits/rejected": 0.6894842386245728, "logps/chosen": -114.27070617675781, "logps/ref_chosen": -66.60140228271484, "logps/ref_rejected": -67.74340057373047, "logps/rejected": -164.13983154296875, "loss": 0.9579, "margin_dpo/margin_mean": 48.72712707519531, "margin_dpo/margin_std": 46.79485321044922, "step": 272 }, { "KL/chosen_KL_mean": -67.06260681152344, "KL/mean": -84.49526977539062, "KL/rejected_KL_mean": -101.92792510986328, "KL/std": 51.95482635498047, "epoch": 0.4126984126984127, "fcm_dpo/beta": 0.011803549714386463, "fcm_dpo/delta": -0.012199468910694122, "fcm_dpo/margin": 34.865325927734375, "fcm_dpo/q_t": 0.408597469329834, "grad_norm": 16.49392318725586, "learning_rate": 3.669006483223828e-07, "logits/chosen": 0.6596027612686157, "logits/rejected": 0.5886712670326233, "logps/chosen": -124.41748046875, "logps/ref_chosen": -57.35487747192383, "logps/ref_rejected": -84.17168426513672, "logps/rejected": -186.099609375, "loss": 1.1588, "margin_dpo/margin_mean": 34.86532974243164, "margin_dpo/margin_std": 66.22444152832031, "step": 273 }, { "KL/chosen_KL_mean": -65.01911163330078, "KL/mean": -84.14996337890625, "KL/rejected_KL_mean": -103.28082275390625, "KL/std": 52.411319732666016, "epoch": 0.41421012849584277, "fcm_dpo/beta": 0.011741770431399345, "fcm_dpo/delta": -0.05155833438038826, "fcm_dpo/margin": 38.26170349121094, "fcm_dpo/q_t": 0.3997541069984436, "grad_norm": 12.381172180175781, "learning_rate": 3.657302579891656e-07, "logits/chosen": 0.557734489440918, "logits/rejected": 0.5413792729377747, "logps/chosen": -124.66060638427734, "logps/ref_chosen": -59.64149475097656, "logps/ref_rejected": -68.29348754882812, "logps/rejected": -171.57431030273438, "loss": 1.1098, "margin_dpo/margin_mean": 38.26170349121094, "margin_dpo/margin_std": 62.18931198120117, "step": 274 }, { "KL/chosen_KL_mean": -64.44328308105469, "KL/mean": -83.67219543457031, "KL/rejected_KL_mean": -102.9011001586914, "KL/std": 50.57617950439453, "epoch": 0.41572184429327286, "fcm_dpo/beta": 0.011653339490294456, "fcm_dpo/delta": -0.050456371158361435, "fcm_dpo/margin": 38.457820892333984, "fcm_dpo/q_t": 0.3979693651199341, "grad_norm": 14.888864517211914, "learning_rate": 3.645566304318526e-07, "logits/chosen": 0.6520686745643616, "logits/rejected": 0.5673788189888, "logps/chosen": -117.70992279052734, "logps/ref_chosen": -53.26664352416992, "logps/ref_rejected": -73.84062194824219, "logps/rejected": -176.74172973632812, "loss": 1.0767, "margin_dpo/margin_mean": 38.457820892333984, "margin_dpo/margin_std": 52.77085876464844, "step": 275 }, { "KL/chosen_KL_mean": -60.3723258972168, "KL/mean": -79.3885269165039, "KL/rejected_KL_mean": -98.40472412109375, "KL/std": 48.34068298339844, "epoch": 0.41723356009070295, "fcm_dpo/beta": 0.011523595079779625, "fcm_dpo/delta": -0.040011994540691376, "fcm_dpo/margin": 38.03240966796875, "fcm_dpo/q_t": 0.40035754442214966, "grad_norm": 14.894147872924805, "learning_rate": 3.633797984793294e-07, "logits/chosen": 0.6216329336166382, "logits/rejected": 0.5886694192886353, "logps/chosen": -113.39311981201172, "logps/ref_chosen": -53.02079772949219, "logps/ref_rejected": -61.56678771972656, "logps/rejected": -159.9715118408203, "loss": 1.0853, "margin_dpo/margin_mean": 38.03240966796875, "margin_dpo/margin_std": 53.81486511230469, "step": 276 }, { "KL/chosen_KL_mean": -68.69551086425781, "KL/mean": -79.44567108154297, "KL/rejected_KL_mean": -90.19581604003906, "KL/std": 50.85203552246094, "epoch": 0.41874527588813304, "fcm_dpo/beta": 0.011638839729130268, "fcm_dpo/delta": 0.061889614909887314, "fcm_dpo/margin": 21.500307083129883, "fcm_dpo/q_t": 0.4438678026199341, "grad_norm": 19.602415084838867, "learning_rate": 3.6219979505011555e-07, "logits/chosen": 0.6830171346664429, "logits/rejected": 0.7113779783248901, "logps/chosen": -140.12850952148438, "logps/ref_chosen": -71.43299102783203, "logps/ref_rejected": -67.65852355957031, "logps/rejected": -157.85433959960938, "loss": 1.26, "margin_dpo/margin_mean": 21.500307083129883, "margin_dpo/margin_std": 58.075443267822266, "step": 277 }, { "KL/chosen_KL_mean": -72.06103515625, "KL/mean": -90.506591796875, "KL/rejected_KL_mean": -108.9521484375, "KL/std": 55.39421081542969, "epoch": 0.42025699168556313, "fcm_dpo/beta": 0.011667946353554726, "fcm_dpo/delta": -0.032669879496097565, "fcm_dpo/margin": 36.89111328125, "fcm_dpo/q_t": 0.40320485830307007, "grad_norm": 17.36385726928711, "learning_rate": 3.6101665315144353e-07, "logits/chosen": 0.6103044748306274, "logits/rejected": 0.5539520978927612, "logps/chosen": -139.1717987060547, "logps/ref_chosen": -67.11076354980469, "logps/ref_rejected": -88.74851989746094, "logps/rejected": -197.70066833496094, "loss": 1.1134, "margin_dpo/margin_mean": 36.89111328125, "margin_dpo/margin_std": 58.222145080566406, "step": 278 }, { "KL/chosen_KL_mean": -56.835533142089844, "KL/mean": -82.46629333496094, "KL/rejected_KL_mean": -108.09706115722656, "KL/std": 54.52870559692383, "epoch": 0.4217687074829932, "fcm_dpo/beta": 0.011270842514932156, "fcm_dpo/delta": -0.18833574652671814, "fcm_dpo/margin": 51.26152420043945, "fcm_dpo/q_t": 0.3676702678203583, "grad_norm": 14.672026634216309, "learning_rate": 3.5983040587833563e-07, "logits/chosen": 0.647136926651001, "logits/rejected": 0.6091359257698059, "logps/chosen": -111.33302307128906, "logps/ref_chosen": -54.49748611450195, "logps/ref_rejected": -70.42373657226562, "logps/rejected": -178.52078247070312, "loss": 0.9705, "margin_dpo/margin_mean": 51.26152038574219, "margin_dpo/margin_std": 51.992767333984375, "step": 279 }, { "KL/chosen_KL_mean": -61.58481216430664, "KL/mean": -87.17666625976562, "KL/rejected_KL_mean": -112.76852416992188, "KL/std": 56.85373306274414, "epoch": 0.42328042328042326, "fcm_dpo/beta": 0.010863588191568851, "fcm_dpo/delta": -0.16526158154010773, "fcm_dpo/margin": 51.18370819091797, "fcm_dpo/q_t": 0.37368202209472656, "grad_norm": 11.658512115478516, "learning_rate": 3.586410864126781e-07, "logits/chosen": 0.6816037893295288, "logits/rejected": 0.6382617354393005, "logps/chosen": -122.01762390136719, "logps/ref_chosen": -60.43281173706055, "logps/ref_rejected": -78.39051818847656, "logps/rejected": -191.15904235839844, "loss": 0.9808, "margin_dpo/margin_mean": 51.18370819091797, "margin_dpo/margin_std": 53.1890869140625, "step": 280 }, { "KL/chosen_KL_mean": -65.01192474365234, "KL/mean": -88.04139709472656, "KL/rejected_KL_mean": -111.07086181640625, "KL/std": 55.35615539550781, "epoch": 0.42479213907785335, "fcm_dpo/beta": 0.010645313188433647, "fcm_dpo/delta": -0.09485936164855957, "fcm_dpo/margin": 46.05894470214844, "fcm_dpo/q_t": 0.3892815113067627, "grad_norm": 13.532862663269043, "learning_rate": 3.574487280222929e-07, "logits/chosen": 0.661474883556366, "logits/rejected": 0.6880728006362915, "logps/chosen": -125.29401397705078, "logps/ref_chosen": -60.2820930480957, "logps/ref_rejected": -62.04009246826172, "logps/rejected": -173.1109619140625, "loss": 1.0537, "margin_dpo/margin_mean": 46.05894470214844, "margin_dpo/margin_std": 61.077674865722656, "step": 281 }, { "KL/chosen_KL_mean": -71.59873962402344, "KL/mean": -93.08067321777344, "KL/rejected_KL_mean": -114.56260681152344, "KL/std": 55.64825439453125, "epoch": 0.42630385487528344, "fcm_dpo/beta": 0.01057264395058155, "fcm_dpo/delta": -0.05786660686135292, "fcm_dpo/margin": 42.963863372802734, "fcm_dpo/q_t": 0.39888957142829895, "grad_norm": 15.894862174987793, "learning_rate": 3.562533640600075e-07, "logits/chosen": 0.584148645401001, "logits/rejected": 0.5378983020782471, "logps/chosen": -132.22265625, "logps/ref_chosen": -60.623924255371094, "logps/ref_rejected": -68.67400360107422, "logps/rejected": -183.23660278320312, "loss": 1.099, "margin_dpo/margin_mean": 42.963863372802734, "margin_dpo/margin_std": 64.56341552734375, "step": 282 }, { "KL/chosen_KL_mean": -71.82366943359375, "KL/mean": -91.46395874023438, "KL/rejected_KL_mean": -111.104248046875, "KL/std": 55.65943908691406, "epoch": 0.42781557067271353, "fcm_dpo/beta": 0.01045767217874527, "fcm_dpo/delta": -0.011342188343405724, "fcm_dpo/margin": 39.28058624267578, "fcm_dpo/q_t": 0.406854510307312, "grad_norm": 15.663429260253906, "learning_rate": 3.550550279627215e-07, "logits/chosen": 0.652621865272522, "logits/rejected": 0.5456954836845398, "logps/chosen": -139.47140502929688, "logps/ref_chosen": -67.64775085449219, "logps/ref_rejected": -99.96835327148438, "logps/rejected": -211.07260131835938, "loss": 1.1152, "margin_dpo/margin_mean": 39.28058624267578, "margin_dpo/margin_std": 61.75912094116211, "step": 283 }, { "KL/chosen_KL_mean": -67.76885986328125, "KL/mean": -89.06997680664062, "KL/rejected_KL_mean": -110.37109375, "KL/std": 58.809104919433594, "epoch": 0.4293272864701436, "fcm_dpo/beta": 0.010321895591914654, "fcm_dpo/delta": -0.04201076924800873, "fcm_dpo/margin": 42.602237701416016, "fcm_dpo/q_t": 0.3993057608604431, "grad_norm": 12.512202262878418, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 0.7079395651817322, "logits/rejected": 0.6430385708808899, "logps/chosen": -124.73628234863281, "logps/ref_chosen": -56.96742630004883, "logps/ref_rejected": -86.36236572265625, "logps/rejected": -196.73345947265625, "loss": 1.0768, "margin_dpo/margin_mean": 42.60223388671875, "margin_dpo/margin_std": 57.22046661376953, "step": 284 }, { "KL/chosen_KL_mean": -80.4854736328125, "KL/mean": -97.457763671875, "KL/rejected_KL_mean": -114.4300537109375, "KL/std": 53.97811508178711, "epoch": 0.4308390022675737, "fcm_dpo/beta": 0.010450178757309914, "fcm_dpo/delta": 0.0464649423956871, "fcm_dpo/margin": 33.944583892822266, "fcm_dpo/q_t": 0.42112481594085693, "grad_norm": 19.473621368408203, "learning_rate": 3.5264957352549375e-07, "logits/chosen": 0.6763700246810913, "logits/rejected": 0.6494711637496948, "logps/chosen": -152.14158630371094, "logps/ref_chosen": -71.65611267089844, "logps/ref_rejected": -81.63829803466797, "logps/rejected": -196.068359375, "loss": 1.1583, "margin_dpo/margin_mean": 33.944583892822266, "margin_dpo/margin_std": 60.767845153808594, "step": 285 }, { "KL/chosen_KL_mean": -74.67512512207031, "KL/mean": -102.37751770019531, "KL/rejected_KL_mean": -130.0799102783203, "KL/std": 57.60707092285156, "epoch": 0.4323507180650038, "fcm_dpo/beta": 0.010169255547225475, "fcm_dpo/delta": -0.17320239543914795, "fcm_dpo/margin": 55.404788970947266, "fcm_dpo/q_t": 0.3723100423812866, "grad_norm": 13.81632137298584, "learning_rate": 3.514425224712835e-07, "logits/chosen": 0.6021387577056885, "logits/rejected": 0.5077069401741028, "logps/chosen": -135.754638671875, "logps/ref_chosen": -61.07952117919922, "logps/ref_rejected": -91.28128051757812, "logps/rejected": -221.36119079589844, "loss": 0.9865, "margin_dpo/margin_mean": 55.40479278564453, "margin_dpo/margin_std": 60.73088073730469, "step": 286 }, { "KL/chosen_KL_mean": -62.27655029296875, "KL/mean": -88.59188842773438, "KL/rejected_KL_mean": -114.9072265625, "KL/std": 59.33280944824219, "epoch": 0.43386243386243384, "fcm_dpo/beta": 0.00989883579313755, "fcm_dpo/delta": -0.12754103541374207, "fcm_dpo/margin": 52.63068389892578, "fcm_dpo/q_t": 0.3816481828689575, "grad_norm": 13.578323364257812, "learning_rate": 3.502326338516534e-07, "logits/chosen": 0.691969633102417, "logits/rejected": 0.6520496606826782, "logps/chosen": -108.31234741210938, "logps/ref_chosen": -46.035789489746094, "logps/ref_rejected": -59.95293426513672, "logps/rejected": -174.86016845703125, "loss": 1.0164, "margin_dpo/margin_mean": 52.63068389892578, "margin_dpo/margin_std": 61.405296325683594, "step": 287 }, { "KL/chosen_KL_mean": -78.91499328613281, "KL/mean": -98.74652099609375, "KL/rejected_KL_mean": -118.57804107666016, "KL/std": 56.958038330078125, "epoch": 0.43537414965986393, "fcm_dpo/beta": 0.009849481284618378, "fcm_dpo/delta": 0.009691323153674603, "fcm_dpo/margin": 39.66306686401367, "fcm_dpo/q_t": 0.41141414642333984, "grad_norm": 16.115100860595703, "learning_rate": 3.490199415097892e-07, "logits/chosen": 0.542695164680481, "logits/rejected": 0.4857466220855713, "logps/chosen": -144.30584716796875, "logps/ref_chosen": -65.3908462524414, "logps/ref_rejected": -88.53607940673828, "logps/rejected": -207.11412048339844, "loss": 1.1235, "margin_dpo/margin_mean": 39.66307067871094, "margin_dpo/margin_std": 62.92963409423828, "step": 288 }, { "KL/chosen_KL_mean": -81.19337463378906, "KL/mean": -98.5098876953125, "KL/rejected_KL_mean": -115.826416015625, "KL/std": 57.35633087158203, "epoch": 0.436885865457294, "fcm_dpo/beta": 0.009964533150196075, "fcm_dpo/delta": 0.0563802607357502, "fcm_dpo/margin": 34.633052825927734, "fcm_dpo/q_t": 0.42408448457717896, "grad_norm": 14.43678092956543, "learning_rate": 3.4780447936730247e-07, "logits/chosen": 0.7671295404434204, "logits/rejected": 0.7262423038482666, "logps/chosen": -135.78700256347656, "logps/ref_chosen": -54.5936279296875, "logps/ref_rejected": -67.20855712890625, "logps/rejected": -183.03497314453125, "loss": 1.1781, "margin_dpo/margin_mean": 34.633052825927734, "margin_dpo/margin_std": 67.29803466796875, "step": 289 }, { "KL/chosen_KL_mean": -89.79861450195312, "KL/mean": -112.21324157714844, "KL/rejected_KL_mean": -134.62786865234375, "KL/std": 56.158790588378906, "epoch": 0.4383975812547241, "fcm_dpo/beta": 0.00985310971736908, "fcm_dpo/delta": -0.044114850461483, "fcm_dpo/margin": 44.82925033569336, "fcm_dpo/q_t": 0.4000610113143921, "grad_norm": 17.263036727905273, "learning_rate": 3.465862814232821e-07, "logits/chosen": 0.7442500591278076, "logits/rejected": 0.6700857877731323, "logps/chosen": -151.18319702148438, "logps/ref_chosen": -61.38457489013672, "logps/ref_rejected": -91.92778015136719, "logps/rejected": -226.55564880371094, "loss": 1.0959, "margin_dpo/margin_mean": 44.82925033569336, "margin_dpo/margin_std": 67.37724304199219, "step": 290 }, { "KL/chosen_KL_mean": -84.3948974609375, "KL/mean": -109.53266143798828, "KL/rejected_KL_mean": -134.67041015625, "KL/std": 55.95367431640625, "epoch": 0.4399092970521542, "fcm_dpo/beta": 0.00979924201965332, "fcm_dpo/delta": -0.0978565365076065, "fcm_dpo/margin": 50.27552795410156, "fcm_dpo/q_t": 0.38950875401496887, "grad_norm": 15.610273361206055, "learning_rate": 3.4536538175334343e-07, "logits/chosen": 0.8200286626815796, "logits/rejected": 0.7451068162918091, "logps/chosen": -135.2579345703125, "logps/ref_chosen": -50.863037109375, "logps/ref_rejected": -82.20868682861328, "logps/rejected": -216.8791046142578, "loss": 1.0476, "margin_dpo/margin_mean": 50.2755241394043, "margin_dpo/margin_std": 63.88002014160156, "step": 291 }, { "KL/chosen_KL_mean": -83.32215881347656, "KL/mean": -102.67535400390625, "KL/rejected_KL_mean": -122.02855682373047, "KL/std": 59.864112854003906, "epoch": 0.4414210128495843, "fcm_dpo/beta": 0.009721076115965843, "fcm_dpo/delta": 0.024628205224871635, "fcm_dpo/margin": 38.706398010253906, "fcm_dpo/q_t": 0.4157206118106842, "grad_norm": 14.623159408569336, "learning_rate": 3.4414181450867465e-07, "logits/chosen": 0.6698214411735535, "logits/rejected": 0.6206130981445312, "logps/chosen": -147.67105102539062, "logps/ref_chosen": -64.34888458251953, "logps/ref_rejected": -72.86434173583984, "logps/rejected": -194.8928985595703, "loss": 1.1507, "margin_dpo/margin_mean": 38.706398010253906, "margin_dpo/margin_std": 69.27012634277344, "step": 292 }, { "KL/chosen_KL_mean": -84.00607299804688, "KL/mean": -111.59648895263672, "KL/rejected_KL_mean": -139.18690490722656, "KL/std": 61.00130081176758, "epoch": 0.4429327286470144, "fcm_dpo/beta": 0.009570857509970665, "fcm_dpo/delta": -0.13505950570106506, "fcm_dpo/margin": 55.180824279785156, "fcm_dpo/q_t": 0.3820232152938843, "grad_norm": 12.031527519226074, "learning_rate": 3.4291561391508185e-07, "logits/chosen": 0.7909771203994751, "logits/rejected": 0.6990966200828552, "logps/chosen": -138.87554931640625, "logps/ref_chosen": -54.869468688964844, "logps/ref_rejected": -81.858642578125, "logps/rejected": -221.04556274414062, "loss": 1.047, "margin_dpo/margin_mean": 55.180824279785156, "margin_dpo/margin_std": 74.77305603027344, "step": 293 }, { "KL/chosen_KL_mean": -88.59320068359375, "KL/mean": -107.06346130371094, "KL/rejected_KL_mean": -125.53370666503906, "KL/std": 61.93351745605469, "epoch": 0.4444444444444444, "fcm_dpo/beta": 0.00950655434280634, "fcm_dpo/delta": 0.050542715936899185, "fcm_dpo/margin": 36.94049835205078, "fcm_dpo/q_t": 0.4209939241409302, "grad_norm": 12.825860977172852, "learning_rate": 3.4168681427203153e-07, "logits/chosen": 0.7113804221153259, "logits/rejected": 0.6638482213020325, "logps/chosen": -145.26409912109375, "logps/ref_chosen": -56.670902252197266, "logps/ref_rejected": -70.32819366455078, "logps/rejected": -195.8618927001953, "loss": 1.1372, "margin_dpo/margin_mean": 36.940494537353516, "margin_dpo/margin_std": 58.059391021728516, "step": 294 }, { "KL/chosen_KL_mean": -94.66636657714844, "KL/mean": -112.08953094482422, "KL/rejected_KL_mean": -129.5126953125, "KL/std": 59.72174072265625, "epoch": 0.4459561602418745, "fcm_dpo/beta": 0.009636422619223595, "fcm_dpo/delta": 0.06648371368646622, "fcm_dpo/margin": 34.846317291259766, "fcm_dpo/q_t": 0.4251624345779419, "grad_norm": 17.546260833740234, "learning_rate": 3.4045544995169125e-07, "logits/chosen": 0.7090365290641785, "logits/rejected": 0.6008093357086182, "logps/chosen": -145.0672607421875, "logps/ref_chosen": -50.40088653564453, "logps/ref_rejected": -83.43521881103516, "logps/rejected": -212.94790649414062, "loss": 1.1728, "margin_dpo/margin_mean": 34.8463134765625, "margin_dpo/margin_std": 65.76693725585938, "step": 295 }, { "KL/chosen_KL_mean": -96.35165405273438, "KL/mean": -117.66732788085938, "KL/rejected_KL_mean": -138.98297119140625, "KL/std": 62.33965301513672, "epoch": 0.4474678760393046, "fcm_dpo/beta": 0.009635808877646923, "fcm_dpo/delta": -0.011614136397838593, "fcm_dpo/margin": 42.631317138671875, "fcm_dpo/q_t": 0.4079824388027191, "grad_norm": 13.738656044006348, "learning_rate": 3.392215553979679e-07, "logits/chosen": 0.6606223583221436, "logits/rejected": 0.6123418807983398, "logps/chosen": -165.50201416015625, "logps/ref_chosen": -69.15034484863281, "logps/ref_rejected": -89.60166931152344, "logps/rejected": -228.58465576171875, "loss": 1.1259, "margin_dpo/margin_mean": 42.631317138671875, "margin_dpo/margin_std": 70.50044250488281, "step": 296 }, { "KL/chosen_KL_mean": -100.27669525146484, "KL/mean": -123.90884399414062, "KL/rejected_KL_mean": -147.54100036621094, "KL/std": 55.378578186035156, "epoch": 0.4489795918367347, "fcm_dpo/beta": 0.00963042862713337, "fcm_dpo/delta": -0.0579850971698761, "fcm_dpo/margin": 47.264305114746094, "fcm_dpo/q_t": 0.3957344889640808, "grad_norm": 13.572564125061035, "learning_rate": 3.3798516512554485e-07, "logits/chosen": 0.682603657245636, "logits/rejected": 0.6262906193733215, "logps/chosen": -158.29299926757812, "logps/ref_chosen": -58.01630401611328, "logps/ref_rejected": -69.95780944824219, "logps/rejected": -217.49880981445312, "loss": 1.0599, "margin_dpo/margin_mean": 47.264305114746094, "margin_dpo/margin_std": 59.17694854736328, "step": 297 }, { "KL/chosen_KL_mean": -97.90091705322266, "KL/mean": -115.77299499511719, "KL/rejected_KL_mean": -133.64505004882812, "KL/std": 58.46490478515625, "epoch": 0.4504913076341648, "fcm_dpo/beta": 0.009631148539483547, "fcm_dpo/delta": 0.05776507034897804, "fcm_dpo/margin": 35.74412536621094, "fcm_dpo/q_t": 0.4234750270843506, "grad_norm": 13.524314880371094, "learning_rate": 3.367463137189156e-07, "logits/chosen": 0.7889485359191895, "logits/rejected": 0.7306005358695984, "logps/chosen": -154.07022094726562, "logps/ref_chosen": -56.1693115234375, "logps/ref_rejected": -68.55052185058594, "logps/rejected": -202.195556640625, "loss": 1.1802, "margin_dpo/margin_mean": 35.74412536621094, "margin_dpo/margin_std": 70.49020385742188, "step": 298 }, { "KL/chosen_KL_mean": -97.00253295898438, "KL/mean": -112.92283630371094, "KL/rejected_KL_mean": -128.84315490722656, "KL/std": 57.701011657714844, "epoch": 0.4520030234315949, "fcm_dpo/beta": 0.009767703711986542, "fcm_dpo/delta": 0.09192134439945221, "fcm_dpo/margin": 31.840625762939453, "fcm_dpo/q_t": 0.43105369806289673, "grad_norm": 19.134227752685547, "learning_rate": 3.355050358314172e-07, "logits/chosen": 0.6059396266937256, "logits/rejected": 0.5756997466087341, "logps/chosen": -159.32034301757812, "logps/ref_chosen": -62.31780242919922, "logps/ref_rejected": -72.60028839111328, "logps/rejected": -201.44345092773438, "loss": 1.2229, "margin_dpo/margin_mean": 31.840625762939453, "margin_dpo/margin_std": 73.06194305419922, "step": 299 }, { "KL/chosen_KL_mean": -99.65127563476562, "KL/mean": -119.23041534423828, "KL/rejected_KL_mean": -138.80955505371094, "KL/std": 58.07111358642578, "epoch": 0.45351473922902497, "fcm_dpo/beta": 0.00987608078867197, "fcm_dpo/delta": 0.013775285333395004, "fcm_dpo/margin": 39.158287048339844, "fcm_dpo/q_t": 0.41372305154800415, "grad_norm": 15.81484317779541, "learning_rate": 3.3426136618426043e-07, "logits/chosen": 0.7372875213623047, "logits/rejected": 0.6648412942886353, "logps/chosen": -160.03285217285156, "logps/ref_chosen": -60.38157653808594, "logps/ref_rejected": -75.45442199707031, "logps/rejected": -214.26397705078125, "loss": 1.1487, "margin_dpo/margin_mean": 39.15829086303711, "margin_dpo/margin_std": 69.74807739257812, "step": 300 }, { "KL/chosen_KL_mean": -96.7698745727539, "KL/mean": -114.06098175048828, "KL/rejected_KL_mean": -131.35208129882812, "KL/std": 57.57362365722656, "epoch": 0.455026455026455, "fcm_dpo/beta": 0.009932178072631359, "fcm_dpo/delta": 0.05853221192955971, "fcm_dpo/margin": 34.58222198486328, "fcm_dpo/q_t": 0.42294973134994507, "grad_norm": 14.549666404724121, "learning_rate": 3.3301533956555885e-07, "logits/chosen": 0.7522227764129639, "logits/rejected": 0.7229958772659302, "logps/chosen": -149.62075805664062, "logps/ref_chosen": -52.85089111328125, "logps/ref_rejected": -69.97584533691406, "logps/rejected": -201.32794189453125, "loss": 1.1778, "margin_dpo/margin_mean": 34.58222198486328, "margin_dpo/margin_std": 67.5133056640625, "step": 301 }, { "KL/chosen_KL_mean": -100.72758483886719, "KL/mean": -114.74748992919922, "KL/rejected_KL_mean": -128.76739501953125, "KL/std": 58.46039962768555, "epoch": 0.4565381708238851, "fcm_dpo/beta": 0.010159955359995365, "fcm_dpo/delta": 0.11858371645212173, "fcm_dpo/margin": 28.039810180664062, "fcm_dpo/q_t": 0.4356432557106018, "grad_norm": 19.248348236083984, "learning_rate": 3.317669908293554e-07, "logits/chosen": 0.5658413171768188, "logits/rejected": 0.5086290240287781, "logps/chosen": -167.694091796875, "logps/ref_chosen": -66.96650695800781, "logps/ref_rejected": -88.09510803222656, "logps/rejected": -216.86251831054688, "loss": 1.2246, "margin_dpo/margin_mean": 28.03980827331543, "margin_dpo/margin_std": 65.29124450683594, "step": 302 }, { "KL/chosen_KL_mean": -89.91455078125, "KL/mean": -114.56990051269531, "KL/rejected_KL_mean": -139.22525024414062, "KL/std": 59.440101623535156, "epoch": 0.4580498866213152, "fcm_dpo/beta": 0.010085317306220531, "fcm_dpo/delta": -0.10232987999916077, "fcm_dpo/margin": 49.310699462890625, "fcm_dpo/q_t": 0.3895440101623535, "grad_norm": 12.996821403503418, "learning_rate": 3.3051635489464793e-07, "logits/chosen": 0.6715907454490662, "logits/rejected": 0.600356936454773, "logps/chosen": -152.03607177734375, "logps/ref_chosen": -62.12152862548828, "logps/ref_rejected": -90.31204223632812, "logps/rejected": -229.53729248046875, "loss": 1.0802, "margin_dpo/margin_mean": 49.310699462890625, "margin_dpo/margin_std": 74.4521713256836, "step": 303 }, { "KL/chosen_KL_mean": -78.18890380859375, "KL/mean": -103.50267028808594, "KL/rejected_KL_mean": -128.81643676757812, "KL/std": 62.76209259033203, "epoch": 0.4595616024187453, "fcm_dpo/beta": 0.009861658327281475, "fcm_dpo/delta": -0.10457085072994232, "fcm_dpo/margin": 50.62752151489258, "fcm_dpo/q_t": 0.38456183671951294, "grad_norm": 12.847184181213379, "learning_rate": 3.292634667444117e-07, "logits/chosen": 0.677819013595581, "logits/rejected": 0.6205200552940369, "logps/chosen": -138.88400268554688, "logps/ref_chosen": -60.695091247558594, "logps/ref_rejected": -78.2525405883789, "logps/rejected": -207.0689697265625, "loss": 1.0224, "margin_dpo/margin_mean": 50.627525329589844, "margin_dpo/margin_std": 56.92477798461914, "step": 304 }, { "KL/chosen_KL_mean": -90.60421752929688, "KL/mean": -108.50660705566406, "KL/rejected_KL_mean": -126.40899658203125, "KL/std": 59.10084915161133, "epoch": 0.46107331821617537, "fcm_dpo/beta": 0.009848186746239662, "fcm_dpo/delta": 0.048988066613674164, "fcm_dpo/margin": 35.80479049682617, "fcm_dpo/q_t": 0.4205666184425354, "grad_norm": 14.523816108703613, "learning_rate": 3.280083614246217e-07, "logits/chosen": 0.6243883371353149, "logits/rejected": 0.6565730571746826, "logps/chosen": -163.3033447265625, "logps/ref_chosen": -72.69914245605469, "logps/ref_rejected": -65.65670776367188, "logps/rejected": -192.06570434570312, "loss": 1.1836, "margin_dpo/margin_mean": 35.80479049682617, "margin_dpo/margin_std": 71.57237243652344, "step": 305 }, { "KL/chosen_KL_mean": -84.85050964355469, "KL/mean": -104.3355712890625, "KL/rejected_KL_mean": -123.82064056396484, "KL/std": 56.57829284667969, "epoch": 0.46258503401360546, "fcm_dpo/beta": 0.009873464703559875, "fcm_dpo/delta": 0.015147637575864792, "fcm_dpo/margin": 38.970123291015625, "fcm_dpo/q_t": 0.4119231104850769, "grad_norm": 13.490785598754883, "learning_rate": 3.267510740432719e-07, "logits/chosen": 0.7450041770935059, "logits/rejected": 0.6291377544403076, "logps/chosen": -138.821044921875, "logps/ref_chosen": -53.97052764892578, "logps/ref_rejected": -71.02423095703125, "logps/rejected": -194.84487915039062, "loss": 1.1112, "margin_dpo/margin_mean": 38.970123291015625, "margin_dpo/margin_std": 55.11603546142578, "step": 306 }, { "KL/chosen_KL_mean": -80.17931365966797, "KL/mean": -89.0488052368164, "KL/rejected_KL_mean": -97.91830444335938, "KL/std": 55.07842254638672, "epoch": 0.46409674981103555, "fcm_dpo/beta": 0.01002093218266964, "fcm_dpo/delta": 0.07840843498706818, "fcm_dpo/margin": 17.73899269104004, "fcm_dpo/q_t": 0.46142441034317017, "grad_norm": 18.435049057006836, "learning_rate": 3.2549163976939285e-07, "logits/chosen": 0.7022722959518433, "logits/rejected": 0.6523076295852661, "logps/chosen": -137.59242248535156, "logps/ref_chosen": -57.413108825683594, "logps/ref_rejected": -68.68010711669922, "logps/rejected": -166.59841918945312, "loss": 1.3403, "margin_dpo/margin_mean": 17.738990783691406, "margin_dpo/margin_std": 71.95256042480469, "step": 307 }, { "KL/chosen_KL_mean": -79.00904083251953, "KL/mean": -97.37933349609375, "KL/rejected_KL_mean": -115.74961853027344, "KL/std": 56.884033203125, "epoch": 0.4656084656084656, "fcm_dpo/beta": 0.010100344195961952, "fcm_dpo/delta": 0.029762284830212593, "fcm_dpo/margin": 36.740577697753906, "fcm_dpo/q_t": 0.4161604642868042, "grad_norm": 11.407052040100098, "learning_rate": 3.2423009383206874e-07, "logits/chosen": 0.6483219265937805, "logits/rejected": 0.6376514434814453, "logps/chosen": -145.6078338623047, "logps/ref_chosen": -66.59879302978516, "logps/ref_rejected": -74.337158203125, "logps/rejected": -190.08676147460938, "loss": 1.152, "margin_dpo/margin_mean": 36.74058151245117, "margin_dpo/margin_std": 64.99626922607422, "step": 308 }, { "KL/chosen_KL_mean": -89.90048217773438, "KL/mean": -108.94097900390625, "KL/rejected_KL_mean": -127.98147583007812, "KL/std": 54.53465270996094, "epoch": 0.4671201814058957, "fcm_dpo/beta": 0.010132771916687489, "fcm_dpo/delta": 0.014298100024461746, "fcm_dpo/margin": 38.080989837646484, "fcm_dpo/q_t": 0.4123424291610718, "grad_norm": 12.06197452545166, "learning_rate": 3.229664715194511e-07, "logits/chosen": 0.7297148704528809, "logits/rejected": 0.6692063808441162, "logps/chosen": -155.29522705078125, "logps/ref_chosen": -65.39474487304688, "logps/ref_rejected": -75.70930480957031, "logps/rejected": -203.69078063964844, "loss": 1.1252, "margin_dpo/margin_mean": 38.080989837646484, "margin_dpo/margin_std": 59.38434600830078, "step": 309 }, { "KL/chosen_KL_mean": -92.47454071044922, "KL/mean": -103.85682678222656, "KL/rejected_KL_mean": -115.23912811279297, "KL/std": 50.35633850097656, "epoch": 0.46863189720332576, "fcm_dpo/beta": 0.010324651375412941, "fcm_dpo/delta": 0.0661807730793953, "fcm_dpo/margin": 22.764583587646484, "fcm_dpo/q_t": 0.44870710372924805, "grad_norm": 13.881424903869629, "learning_rate": 3.2170080817777257e-07, "logits/chosen": 0.6816816926002502, "logits/rejected": 0.6661825776100159, "logps/chosen": -167.142822265625, "logps/ref_chosen": -74.66827392578125, "logps/ref_rejected": -80.5689697265625, "logps/rejected": -195.80810546875, "loss": 1.2581, "margin_dpo/margin_mean": 22.764583587646484, "margin_dpo/margin_std": 60.6732292175293, "step": 310 }, { "KL/chosen_KL_mean": -74.03821563720703, "KL/mean": -94.06935119628906, "KL/rejected_KL_mean": -114.10049438476562, "KL/std": 61.16158676147461, "epoch": 0.47014361300075586, "fcm_dpo/beta": 0.010256282985210419, "fcm_dpo/delta": -0.011901382356882095, "fcm_dpo/margin": 40.06228256225586, "fcm_dpo/q_t": 0.4090367257595062, "grad_norm": 12.809626579284668, "learning_rate": 3.204331392103574e-07, "logits/chosen": 0.6014379262924194, "logits/rejected": 0.45061808824539185, "logps/chosen": -133.7762451171875, "logps/ref_chosen": -59.738033294677734, "logps/ref_rejected": -93.60757446289062, "logps/rejected": -207.70806884765625, "loss": 1.1129, "margin_dpo/margin_mean": 40.062286376953125, "margin_dpo/margin_std": 62.55815887451172, "step": 311 }, { "KL/chosen_KL_mean": -75.43489837646484, "KL/mean": -99.37994384765625, "KL/rejected_KL_mean": -123.32498168945312, "KL/std": 55.53879165649414, "epoch": 0.47165532879818595, "fcm_dpo/beta": 0.01018855907022953, "fcm_dpo/delta": -0.09234406054019928, "fcm_dpo/margin": 47.89008331298828, "fcm_dpo/q_t": 0.387556791305542, "grad_norm": 12.009736061096191, "learning_rate": 3.1916350007663176e-07, "logits/chosen": 0.7049341201782227, "logits/rejected": 0.6049414873123169, "logps/chosen": -129.2513427734375, "logps/ref_chosen": -53.816436767578125, "logps/ref_rejected": -68.6575698852539, "logps/rejected": -191.9825439453125, "loss": 1.0305, "margin_dpo/margin_mean": 47.89008712768555, "margin_dpo/margin_std": 55.0544319152832, "step": 312 }, { "KL/chosen_KL_mean": -77.99341583251953, "KL/mean": -90.22091674804688, "KL/rejected_KL_mean": -102.44841766357422, "KL/std": 53.576873779296875, "epoch": 0.47316704459561604, "fcm_dpo/beta": 0.010341975837945938, "fcm_dpo/delta": 0.15090999007225037, "fcm_dpo/margin": 24.45500373840332, "fcm_dpo/q_t": 0.44433069229125977, "grad_norm": 12.877307891845703, "learning_rate": 3.178919262911314e-07, "logits/chosen": 0.7542744874954224, "logits/rejected": 0.7319517731666565, "logps/chosen": -137.95077514648438, "logps/ref_chosen": -59.957359313964844, "logps/ref_rejected": -69.31729888916016, "logps/rejected": -171.76571655273438, "loss": 1.2403, "margin_dpo/margin_mean": 24.455005645751953, "margin_dpo/margin_std": 59.98102569580078, "step": 313 }, { "KL/chosen_KL_mean": -73.31350708007812, "KL/mean": -97.73330688476562, "KL/rejected_KL_mean": -122.15311431884766, "KL/std": 55.16771697998047, "epoch": 0.47467876039304613, "fcm_dpo/beta": 0.010225845500826836, "fcm_dpo/delta": -0.10522407293319702, "fcm_dpo/margin": 48.83960723876953, "fcm_dpo/q_t": 0.38791847229003906, "grad_norm": 12.379100799560547, "learning_rate": 3.166184534225087e-07, "logits/chosen": 0.6708400845527649, "logits/rejected": 0.7052596211433411, "logps/chosen": -143.5816650390625, "logps/ref_chosen": -70.26815795898438, "logps/ref_rejected": -69.23971557617188, "logps/rejected": -191.392822265625, "loss": 1.0394, "margin_dpo/margin_mean": 48.83960723876953, "margin_dpo/margin_std": 61.088401794433594, "step": 314 }, { "KL/chosen_KL_mean": -78.89202880859375, "KL/mean": -97.85880279541016, "KL/rejected_KL_mean": -116.82557678222656, "KL/std": 55.19784927368164, "epoch": 0.47619047619047616, "fcm_dpo/beta": 0.010268254205584526, "fcm_dpo/delta": 0.010419394820928574, "fcm_dpo/margin": 37.93355178833008, "fcm_dpo/q_t": 0.41082364320755005, "grad_norm": 12.748796463012695, "learning_rate": 3.1534311709253723e-07, "logits/chosen": 0.5733602643013, "logits/rejected": 0.5370824933052063, "logps/chosen": -146.6867218017578, "logps/ref_chosen": -67.79469299316406, "logps/ref_rejected": -74.55148315429688, "logps/rejected": -191.37704467773438, "loss": 1.1182, "margin_dpo/margin_mean": 37.93355178833008, "margin_dpo/margin_std": 57.445091247558594, "step": 315 }, { "KL/chosen_KL_mean": -73.29759979248047, "KL/mean": -98.80491638183594, "KL/rejected_KL_mean": -124.31222534179688, "KL/std": 57.12293243408203, "epoch": 0.47770219198790626, "fcm_dpo/beta": 0.010146599262952805, "fcm_dpo/delta": -0.1250520497560501, "fcm_dpo/margin": 51.01461410522461, "fcm_dpo/q_t": 0.3825622797012329, "grad_norm": 13.721057891845703, "learning_rate": 3.1406595297511564e-07, "logits/chosen": 0.5659812092781067, "logits/rejected": 0.4247177839279175, "logps/chosen": -128.58609008789062, "logps/ref_chosen": -55.288482666015625, "logps/ref_rejected": -96.15723419189453, "logps/rejected": -220.46945190429688, "loss": 1.0217, "margin_dpo/margin_mean": 51.014610290527344, "margin_dpo/margin_std": 56.46538543701172, "step": 316 }, { "KL/chosen_KL_mean": -69.94444274902344, "KL/mean": -95.49288177490234, "KL/rejected_KL_mean": -121.04132843017578, "KL/std": 54.22065734863281, "epoch": 0.47921390778533635, "fcm_dpo/beta": 0.00978115200996399, "fcm_dpo/delta": -0.10571274906396866, "fcm_dpo/margin": 51.09687805175781, "fcm_dpo/q_t": 0.38502439856529236, "grad_norm": 16.166257858276367, "learning_rate": 3.1278699679526975e-07, "logits/chosen": 0.7258541584014893, "logits/rejected": 0.6784178614616394, "logps/chosen": -124.52581787109375, "logps/ref_chosen": -54.58137512207031, "logps/ref_rejected": -72.77232360839844, "logps/rejected": -193.81365966796875, "loss": 1.0285, "margin_dpo/margin_mean": 51.09687805175781, "margin_dpo/margin_std": 60.18268585205078, "step": 317 }, { "KL/chosen_KL_mean": -77.82942199707031, "KL/mean": -96.2706298828125, "KL/rejected_KL_mean": -114.71183776855469, "KL/std": 57.44794464111328, "epoch": 0.48072562358276644, "fcm_dpo/beta": 0.009802887216210365, "fcm_dpo/delta": 0.03988488018512726, "fcm_dpo/margin": 36.882423400878906, "fcm_dpo/q_t": 0.42149603366851807, "grad_norm": 12.203614234924316, "learning_rate": 3.1150628432815336e-07, "logits/chosen": 0.711035966873169, "logits/rejected": 0.6370011568069458, "logps/chosen": -130.7176513671875, "logps/ref_chosen": -52.88822937011719, "logps/ref_rejected": -80.63988494873047, "logps/rejected": -195.35171508789062, "loss": 1.1825, "margin_dpo/margin_mean": 36.882423400878906, "margin_dpo/margin_std": 75.04031372070312, "step": 318 }, { "KL/chosen_KL_mean": -75.06709289550781, "KL/mean": -100.05406188964844, "KL/rejected_KL_mean": -125.04103088378906, "KL/std": 59.51347732543945, "epoch": 0.48223733938019653, "fcm_dpo/beta": 0.009708519093692303, "fcm_dpo/delta": -0.08955653756856918, "fcm_dpo/margin": 49.97394561767578, "fcm_dpo/q_t": 0.3921103775501251, "grad_norm": 12.865338325500488, "learning_rate": 3.1022385139804707e-07, "logits/chosen": 0.6233776807785034, "logits/rejected": 0.6064482927322388, "logps/chosen": -139.430419921875, "logps/ref_chosen": -64.36333465576172, "logps/ref_rejected": -79.47296142578125, "logps/rejected": -204.51397705078125, "loss": 1.061, "margin_dpo/margin_mean": 49.97394561767578, "margin_dpo/margin_std": 69.11061096191406, "step": 319 }, { "KL/chosen_KL_mean": -70.72856140136719, "KL/mean": -91.74876403808594, "KL/rejected_KL_mean": -112.76896667480469, "KL/std": 60.215702056884766, "epoch": 0.4837490551776266, "fcm_dpo/beta": 0.009549040347337723, "fcm_dpo/delta": -0.1245603933930397, "fcm_dpo/margin": 42.040409088134766, "fcm_dpo/q_t": 0.4108247756958008, "grad_norm": 14.30135440826416, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 0.5607609748840332, "logits/rejected": 0.5193840265274048, "logps/chosen": -120.28730773925781, "logps/ref_chosen": -49.558746337890625, "logps/ref_rejected": -71.23444366455078, "logps/rejected": -184.00341796875, "loss": 1.1267, "margin_dpo/margin_mean": 42.040409088134766, "margin_dpo/margin_std": 64.90843200683594, "step": 320 }, { "KL/chosen_KL_mean": -76.87751007080078, "KL/mean": -99.6007080078125, "KL/rejected_KL_mean": -122.32390594482422, "KL/std": 56.155517578125, "epoch": 0.4852607709750567, "fcm_dpo/beta": 0.009326249361038208, "fcm_dpo/delta": -0.026146888732910156, "fcm_dpo/margin": 45.44639587402344, "fcm_dpo/q_t": 0.40283846855163574, "grad_norm": 19.537832260131836, "learning_rate": 3.0765396768561004e-07, "logits/chosen": 0.6632350087165833, "logits/rejected": 0.6437802910804749, "logps/chosen": -128.96278381347656, "logps/ref_chosen": -52.08526611328125, "logps/ref_rejected": -55.58674621582031, "logps/rejected": -177.91064453125, "loss": 1.1029, "margin_dpo/margin_mean": 45.44639205932617, "margin_dpo/margin_std": 66.36302947998047, "step": 321 }, { "KL/chosen_KL_mean": -88.12907409667969, "KL/mean": -116.02623748779297, "KL/rejected_KL_mean": -143.9234161376953, "KL/std": 62.116004943847656, "epoch": 0.48677248677248675, "fcm_dpo/beta": 0.009237117134034634, "fcm_dpo/delta": -0.12147174775600433, "fcm_dpo/margin": 55.794349670410156, "fcm_dpo/q_t": 0.38129109144210815, "grad_norm": 12.237570762634277, "learning_rate": 3.063665887884511e-07, "logits/chosen": 0.7275417447090149, "logits/rejected": 0.6380044221878052, "logps/chosen": -135.53317260742188, "logps/ref_chosen": -47.404109954833984, "logps/ref_rejected": -73.4260025024414, "logps/rejected": -217.34942626953125, "loss": 1.0136, "margin_dpo/margin_mean": 55.794349670410156, "margin_dpo/margin_std": 62.14208221435547, "step": 322 }, { "KL/chosen_KL_mean": -89.85215759277344, "KL/mean": -108.70999145507812, "KL/rejected_KL_mean": -127.56783294677734, "KL/std": 60.365882873535156, "epoch": 0.48828420256991684, "fcm_dpo/beta": 0.009247412905097008, "fcm_dpo/delta": 0.052881501615047455, "fcm_dpo/margin": 37.715667724609375, "fcm_dpo/q_t": 0.4239242970943451, "grad_norm": 13.688651084899902, "learning_rate": 3.0507763319663517e-07, "logits/chosen": 0.6098207235336304, "logits/rejected": 0.528053343296051, "logps/chosen": -159.85845947265625, "logps/ref_chosen": -70.00630187988281, "logps/ref_rejected": -86.96690368652344, "logps/rejected": -214.53472900390625, "loss": 1.1845, "margin_dpo/margin_mean": 37.715667724609375, "margin_dpo/margin_std": 76.72145080566406, "step": 323 }, { "KL/chosen_KL_mean": -80.1814956665039, "KL/mean": -105.88117218017578, "KL/rejected_KL_mean": -131.5808563232422, "KL/std": 66.73388671875, "epoch": 0.4897959183673469, "fcm_dpo/beta": 0.009120061993598938, "fcm_dpo/delta": -0.07268932461738586, "fcm_dpo/margin": 51.39935302734375, "fcm_dpo/q_t": 0.3926229476928711, "grad_norm": 17.593368530273438, "learning_rate": 3.0378713696502097e-07, "logits/chosen": 0.725456714630127, "logits/rejected": 0.6644530892372131, "logps/chosen": -136.0703125, "logps/ref_chosen": -55.88882064819336, "logps/ref_rejected": -75.23088073730469, "logps/rejected": -206.81173706054688, "loss": 1.0481, "margin_dpo/margin_mean": 51.39935302734375, "margin_dpo/margin_std": 62.69535827636719, "step": 324 }, { "KL/chosen_KL_mean": -99.65099334716797, "KL/mean": -123.00679016113281, "KL/rejected_KL_mean": -146.3625946044922, "KL/std": 63.96126937866211, "epoch": 0.491307634164777, "fcm_dpo/beta": 0.009046638384461403, "fcm_dpo/delta": -0.02409055456519127, "fcm_dpo/margin": 46.71160125732422, "fcm_dpo/q_t": 0.4040486216545105, "grad_norm": 14.271781921386719, "learning_rate": 3.0249513619156206e-07, "logits/chosen": 0.6499701738357544, "logits/rejected": 0.5811442136764526, "logps/chosen": -163.79800415039062, "logps/ref_chosen": -64.14701843261719, "logps/ref_rejected": -79.91143798828125, "logps/rejected": -226.27403259277344, "loss": 1.1168, "margin_dpo/margin_mean": 46.71160125732422, "margin_dpo/margin_std": 75.5943603515625, "step": 325 }, { "KL/chosen_KL_mean": -116.2341537475586, "KL/mean": -126.52784729003906, "KL/rejected_KL_mean": -136.821533203125, "KL/std": 63.562782287597656, "epoch": 0.4928193499622071, "fcm_dpo/beta": 0.009226701222360134, "fcm_dpo/delta": 0.08249004930257797, "fcm_dpo/margin": 20.587383270263672, "fcm_dpo/q_t": 0.4575117528438568, "grad_norm": 14.76871109008789, "learning_rate": 3.012016670162977e-07, "logits/chosen": 0.5993965864181519, "logits/rejected": 0.6069393754005432, "logps/chosen": -191.76547241210938, "logps/ref_chosen": -75.53131103515625, "logps/ref_rejected": -76.5898666381836, "logps/rejected": -213.41140747070312, "loss": 1.3094, "margin_dpo/margin_mean": 20.587383270263672, "margin_dpo/margin_std": 71.02540588378906, "step": 326 }, { "KL/chosen_KL_mean": -106.27345275878906, "KL/mean": -125.02197265625, "KL/rejected_KL_mean": -143.77049255371094, "KL/std": 66.11697387695312, "epoch": 0.4943310657596372, "fcm_dpo/beta": 0.00933161936700344, "fcm_dpo/delta": 0.051452260464429855, "fcm_dpo/margin": 37.49703598022461, "fcm_dpo/q_t": 0.4224441647529602, "grad_norm": 16.04136848449707, "learning_rate": 2.99906765620341e-07, "logits/chosen": 0.5685694813728333, "logits/rejected": 0.5389949083328247, "logps/chosen": -175.61062622070312, "logps/ref_chosen": -69.33717346191406, "logps/ref_rejected": -73.37751770019531, "logps/rejected": -217.14801025390625, "loss": 1.1825, "margin_dpo/margin_mean": 37.497032165527344, "margin_dpo/margin_std": 74.90901184082031, "step": 327 }, { "KL/chosen_KL_mean": -93.10350036621094, "KL/mean": -115.86221313476562, "KL/rejected_KL_mean": -138.62091064453125, "KL/std": 66.50804138183594, "epoch": 0.4958427815570673, "fcm_dpo/beta": 0.009304068051278591, "fcm_dpo/delta": -0.02455383911728859, "fcm_dpo/margin": 45.51738739013672, "fcm_dpo/q_t": 0.40471357107162476, "grad_norm": 13.909423828125, "learning_rate": 2.9861046822486766e-07, "logits/chosen": 0.5772397518157959, "logits/rejected": 0.540619969367981, "logps/chosen": -154.8097381591797, "logps/ref_chosen": -61.70623016357422, "logps/ref_rejected": -83.73808288574219, "logps/rejected": -222.35897827148438, "loss": 1.0946, "margin_dpo/margin_mean": 45.51738739013672, "margin_dpo/margin_std": 65.49751281738281, "step": 328 }, { "KL/chosen_KL_mean": -103.38446807861328, "KL/mean": -126.82218933105469, "KL/rejected_KL_mean": -150.2598876953125, "KL/std": 67.91434478759766, "epoch": 0.4973544973544973, "fcm_dpo/beta": 0.00923411175608635, "fcm_dpo/delta": -0.03432675451040268, "fcm_dpo/margin": 46.87541580200195, "fcm_dpo/q_t": 0.40388986468315125, "grad_norm": 14.858785629272461, "learning_rate": 2.9731281109010253e-07, "logits/chosen": 0.7128061056137085, "logits/rejected": 0.6509952545166016, "logps/chosen": -167.88287353515625, "logps/ref_chosen": -64.4984130859375, "logps/ref_rejected": -83.6591796875, "logps/rejected": -233.9190673828125, "loss": 1.0941, "margin_dpo/margin_mean": 46.87541580200195, "margin_dpo/margin_std": 69.43316650390625, "step": 329 }, { "KL/chosen_KL_mean": -88.59403991699219, "KL/mean": -114.03706359863281, "KL/rejected_KL_mean": -139.48008728027344, "KL/std": 65.30772399902344, "epoch": 0.4988662131519274, "fcm_dpo/beta": 0.009153059683740139, "fcm_dpo/delta": -0.06900187581777573, "fcm_dpo/margin": 50.88603973388672, "fcm_dpo/q_t": 0.39630311727523804, "grad_norm": 14.516247749328613, "learning_rate": 2.9601383051430505e-07, "logits/chosen": 0.672234058380127, "logits/rejected": 0.6011776924133301, "logps/chosen": -143.398681640625, "logps/ref_chosen": -54.80464172363281, "logps/ref_rejected": -75.3194351196289, "logps/rejected": -214.79953002929688, "loss": 1.1088, "margin_dpo/margin_mean": 50.88603973388672, "margin_dpo/margin_std": 82.7798080444336, "step": 330 }, { "KL/chosen_KL_mean": -96.29144287109375, "KL/mean": -127.70050811767578, "KL/rejected_KL_mean": -159.1095733642578, "KL/std": 69.24229431152344, "epoch": 0.5003779289493575, "fcm_dpo/beta": 0.008882714435458183, "fcm_dpo/delta": -0.16707566380500793, "fcm_dpo/margin": 62.818115234375, "fcm_dpo/q_t": 0.37319380044937134, "grad_norm": 12.577916145324707, "learning_rate": 2.947135628327544e-07, "logits/chosen": 0.7713624238967896, "logits/rejected": 0.7439139485359192, "logps/chosen": -155.53402709960938, "logps/ref_chosen": -59.242584228515625, "logps/ref_rejected": -69.87483215332031, "logps/rejected": -228.98440551757812, "loss": 1.0101, "margin_dpo/margin_mean": 62.818115234375, "margin_dpo/margin_std": 76.17150115966797, "step": 331 }, { "KL/chosen_KL_mean": -97.85059356689453, "KL/mean": -124.42953491210938, "KL/rejected_KL_mean": -151.00848388671875, "KL/std": 66.03111267089844, "epoch": 0.5018896447467877, "fcm_dpo/beta": 0.008775634691119194, "fcm_dpo/delta": -0.07077940553426743, "fcm_dpo/margin": 53.15788269042969, "fcm_dpo/q_t": 0.3945736885070801, "grad_norm": 12.883719444274902, "learning_rate": 2.934120444167326e-07, "logits/chosen": 0.6039571166038513, "logits/rejected": 0.5597080588340759, "logps/chosen": -164.96035766601562, "logps/ref_chosen": -67.10975646972656, "logps/ref_rejected": -77.11839294433594, "logps/rejected": -228.12686157226562, "loss": 1.0613, "margin_dpo/margin_mean": 53.15788269042969, "margin_dpo/margin_std": 67.07878112792969, "step": 332 }, { "KL/chosen_KL_mean": -105.75825500488281, "KL/mean": -133.48265075683594, "KL/rejected_KL_mean": -161.20703125, "KL/std": 65.89384460449219, "epoch": 0.5034013605442177, "fcm_dpo/beta": 0.00857758242636919, "fcm_dpo/delta": -0.07931698858737946, "fcm_dpo/margin": 55.44878387451172, "fcm_dpo/q_t": 0.3922927975654602, "grad_norm": 13.02332878112793, "learning_rate": 2.921093116725076e-07, "logits/chosen": 0.6592659950256348, "logits/rejected": 0.5803956389427185, "logps/chosen": -164.13938903808594, "logps/ref_chosen": -58.381134033203125, "logps/ref_rejected": -85.02839660644531, "logps/rejected": -246.23544311523438, "loss": 1.0501, "margin_dpo/margin_mean": 55.44878387451172, "margin_dpo/margin_std": 70.19156646728516, "step": 333 }, { "KL/chosen_KL_mean": -100.52021789550781, "KL/mean": -120.38670349121094, "KL/rejected_KL_mean": -140.25320434570312, "KL/std": 67.8695297241211, "epoch": 0.5049130763416477, "fcm_dpo/beta": 0.00862037017941475, "fcm_dpo/delta": 0.059339895844459534, "fcm_dpo/margin": 39.732994079589844, "fcm_dpo/q_t": 0.42372041940689087, "grad_norm": 13.022583961486816, "learning_rate": 2.9080540104031484e-07, "logits/chosen": 0.7147078514099121, "logits/rejected": 0.6651749610900879, "logps/chosen": -167.41221618652344, "logps/ref_chosen": -66.89199829101562, "logps/ref_rejected": -91.83695220947266, "logps/rejected": -232.09014892578125, "loss": 1.1831, "margin_dpo/margin_mean": 39.732994079589844, "margin_dpo/margin_std": 79.65113830566406, "step": 334 }, { "KL/chosen_KL_mean": -101.7665023803711, "KL/mean": -124.23223876953125, "KL/rejected_KL_mean": -146.697998046875, "KL/std": 67.52774047851562, "epoch": 0.5064247921390779, "fcm_dpo/beta": 0.008682340383529663, "fcm_dpo/delta": 0.009819921106100082, "fcm_dpo/margin": 44.931480407714844, "fcm_dpo/q_t": 0.41297808289527893, "grad_norm": 17.79497718811035, "learning_rate": 2.895003489933375e-07, "logits/chosen": 0.7010518312454224, "logits/rejected": 0.6607016324996948, "logps/chosen": -163.2809600830078, "logps/ref_chosen": -61.51445770263672, "logps/ref_rejected": -75.68916320800781, "logps/rejected": -222.38714599609375, "loss": 1.1368, "margin_dpo/margin_mean": 44.931480407714844, "margin_dpo/margin_std": 76.04925537109375, "step": 335 }, { "KL/chosen_KL_mean": -112.51408386230469, "KL/mean": -135.68399047851562, "KL/rejected_KL_mean": -158.8538818359375, "KL/std": 69.78599548339844, "epoch": 0.5079365079365079, "fcm_dpo/beta": 0.008585982024669647, "fcm_dpo/delta": 0.001142159104347229, "fcm_dpo/margin": 46.33979034423828, "fcm_dpo/q_t": 0.4132213592529297, "grad_norm": 12.932156562805176, "learning_rate": 2.8819419203668675e-07, "logits/chosen": 0.5943493843078613, "logits/rejected": 0.5698835849761963, "logps/chosen": -181.36415100097656, "logps/ref_chosen": -68.85006713867188, "logps/ref_rejected": -92.99603271484375, "logps/rejected": -251.84991455078125, "loss": 1.1325, "margin_dpo/margin_mean": 46.33979415893555, "margin_dpo/margin_std": 77.21353149414062, "step": 336 }, { "KL/chosen_KL_mean": -116.28144836425781, "KL/mean": -135.18017578125, "KL/rejected_KL_mean": -154.07891845703125, "KL/std": 68.48802185058594, "epoch": 0.509448223733938, "fcm_dpo/beta": 0.008749064058065414, "fcm_dpo/delta": 0.07160548120737076, "fcm_dpo/margin": 37.79745864868164, "fcm_dpo/q_t": 0.42533576488494873, "grad_norm": 12.524788856506348, "learning_rate": 2.8688696670638053e-07, "logits/chosen": 0.5457628965377808, "logits/rejected": 0.513214111328125, "logps/chosen": -189.4692840576172, "logps/ref_chosen": -73.18783569335938, "logps/ref_rejected": -86.89118957519531, "logps/rejected": -240.9700927734375, "loss": 1.1772, "margin_dpo/margin_mean": 37.797454833984375, "margin_dpo/margin_std": 72.6024398803711, "step": 337 }, { "KL/chosen_KL_mean": -112.45155334472656, "KL/mean": -132.64056396484375, "KL/rejected_KL_mean": -152.82958984375, "KL/std": 65.70372009277344, "epoch": 0.5109599395313681, "fcm_dpo/beta": 0.008817563764750957, "fcm_dpo/delta": 0.045602478086948395, "fcm_dpo/margin": 40.37804412841797, "fcm_dpo/q_t": 0.41995948553085327, "grad_norm": 12.00542163848877, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 0.6216901540756226, "logits/rejected": 0.5951056480407715, "logps/chosen": -176.39117431640625, "logps/ref_chosen": -63.939613342285156, "logps/ref_rejected": -75.34243774414062, "logps/rejected": -228.17202758789062, "loss": 1.1622, "margin_dpo/margin_mean": 40.37804412841797, "margin_dpo/margin_std": 74.25537872314453, "step": 338 }, { "KL/chosen_KL_mean": -92.30021667480469, "KL/mean": -114.72796630859375, "KL/rejected_KL_mean": -137.15573120117188, "KL/std": 64.6395492553711, "epoch": 0.5124716553287982, "fcm_dpo/beta": 0.008841393515467644, "fcm_dpo/delta": 0.003484068438410759, "fcm_dpo/margin": 44.855525970458984, "fcm_dpo/q_t": 0.4097171425819397, "grad_norm": 13.899869918823242, "learning_rate": 2.842694572172736e-07, "logits/chosen": 0.768153965473175, "logits/rejected": 0.6789811849594116, "logps/chosen": -137.84933471679688, "logps/ref_chosen": -45.54913330078125, "logps/ref_rejected": -67.0482177734375, "logps/rejected": -204.20394897460938, "loss": 1.1167, "margin_dpo/margin_mean": 44.855525970458984, "margin_dpo/margin_std": 68.58335876464844, "step": 339 }, { "KL/chosen_KL_mean": -104.43131256103516, "KL/mean": -127.0196304321289, "KL/rejected_KL_mean": -149.6079559326172, "KL/std": 71.02337646484375, "epoch": 0.5139833711262283, "fcm_dpo/beta": 0.008893972262740135, "fcm_dpo/delta": -0.0022116824984550476, "fcm_dpo/margin": 45.17664337158203, "fcm_dpo/q_t": 0.4118138253688812, "grad_norm": 12.8064546585083, "learning_rate": 2.8295924627584004e-07, "logits/chosen": 0.6542789936065674, "logits/rejected": 0.6323903799057007, "logps/chosen": -158.43695068359375, "logps/ref_chosen": -54.00564956665039, "logps/ref_rejected": -61.314430236816406, "logps/rejected": -210.92239379882812, "loss": 1.151, "margin_dpo/margin_mean": 45.1766357421875, "margin_dpo/margin_std": 82.57893371582031, "step": 340 }, { "KL/chosen_KL_mean": -103.22740936279297, "KL/mean": -129.3534698486328, "KL/rejected_KL_mean": -155.4795379638672, "KL/std": 69.85865783691406, "epoch": 0.5154950869236583, "fcm_dpo/beta": 0.008588971570134163, "fcm_dpo/delta": -0.15329314768314362, "fcm_dpo/margin": 52.25213623046875, "fcm_dpo/q_t": 0.39821314811706543, "grad_norm": 13.21523666381836, "learning_rate": 2.816481133934373e-07, "logits/chosen": 0.6862123012542725, "logits/rejected": 0.6346328258514404, "logps/chosen": -166.62249755859375, "logps/ref_chosen": -63.39509582519531, "logps/ref_rejected": -76.20973205566406, "logps/rejected": -231.68927001953125, "loss": 1.0928, "margin_dpo/margin_mean": 52.252132415771484, "margin_dpo/margin_std": 73.69546508789062, "step": 341 }, { "KL/chosen_KL_mean": -102.83758544921875, "KL/mean": -128.9514617919922, "KL/rejected_KL_mean": -155.06533813476562, "KL/std": 73.05850219726562, "epoch": 0.5170068027210885, "fcm_dpo/beta": 0.008501582778990269, "fcm_dpo/delta": -0.046442486345767975, "fcm_dpo/margin": 52.227760314941406, "fcm_dpo/q_t": 0.4010956883430481, "grad_norm": 11.47313404083252, "learning_rate": 2.8033609524527046e-07, "logits/chosen": 0.7249006032943726, "logits/rejected": 0.6812983155250549, "logps/chosen": -155.88540649414062, "logps/ref_chosen": -53.047813415527344, "logps/ref_rejected": -68.2854232788086, "logps/rejected": -223.35076904296875, "loss": 1.087, "margin_dpo/margin_mean": 52.227760314941406, "margin_dpo/margin_std": 75.20831298828125, "step": 342 }, { "KL/chosen_KL_mean": -95.32682037353516, "KL/mean": -113.88499450683594, "KL/rejected_KL_mean": -132.44317626953125, "KL/std": 67.90447998046875, "epoch": 0.5185185185185185, "fcm_dpo/beta": 0.008454539813101292, "fcm_dpo/delta": -0.03243470564484596, "fcm_dpo/margin": 37.11636734008789, "fcm_dpo/q_t": 0.42777007818222046, "grad_norm": 11.508443832397461, "learning_rate": 2.7902322853130753e-07, "logits/chosen": 0.5104277729988098, "logits/rejected": 0.5028120279312134, "logps/chosen": -165.9053497314453, "logps/ref_chosen": -70.57852935791016, "logps/ref_rejected": -84.73873901367188, "logps/rejected": -217.18191528320312, "loss": 1.1827, "margin_dpo/margin_mean": 37.11636734008789, "margin_dpo/margin_std": 69.8095932006836, "step": 343 }, { "KL/chosen_KL_mean": -105.62783813476562, "KL/mean": -132.8270721435547, "KL/rejected_KL_mean": -160.0263214111328, "KL/std": 68.96226501464844, "epoch": 0.5200302343159486, "fcm_dpo/beta": 0.008366056717932224, "fcm_dpo/delta": -0.0579121895134449, "fcm_dpo/margin": 54.39847946166992, "fcm_dpo/q_t": 0.3975214660167694, "grad_norm": 14.799226760864258, "learning_rate": 2.7770954997525274e-07, "logits/chosen": 0.6866724491119385, "logits/rejected": 0.6144804358482361, "logps/chosen": -161.4388427734375, "logps/ref_chosen": -55.811004638671875, "logps/ref_rejected": -84.77637481689453, "logps/rejected": -244.80270385742188, "loss": 1.0739, "margin_dpo/margin_mean": 54.39847946166992, "margin_dpo/margin_std": 75.30357360839844, "step": 344 }, { "KL/chosen_KL_mean": -85.24315643310547, "KL/mean": -108.30152893066406, "KL/rejected_KL_mean": -131.3599090576172, "KL/std": 62.293460845947266, "epoch": 0.5215419501133787, "fcm_dpo/beta": 0.008398323319852352, "fcm_dpo/delta": 0.012989198789000511, "fcm_dpo/margin": 46.116756439208984, "fcm_dpo/q_t": 0.4122108519077301, "grad_norm": 13.309449195861816, "learning_rate": 2.7639509632351927e-07, "logits/chosen": 0.7263613939285278, "logits/rejected": 0.6789531707763672, "logps/chosen": -143.0292510986328, "logps/ref_chosen": -57.78609848022461, "logps/ref_rejected": -78.91847229003906, "logps/rejected": -210.27838134765625, "loss": 1.1272, "margin_dpo/margin_mean": 46.11676025390625, "margin_dpo/margin_std": 74.05513000488281, "step": 345 }, { "KL/chosen_KL_mean": -92.9122543334961, "KL/mean": -120.13545989990234, "KL/rejected_KL_mean": -147.35865783691406, "KL/std": 71.83949279785156, "epoch": 0.5230536659108088, "fcm_dpo/beta": 0.008344133384525776, "fcm_dpo/delta": -0.05702626705169678, "fcm_dpo/margin": 54.446407318115234, "fcm_dpo/q_t": 0.39658302068710327, "grad_norm": 14.099263191223145, "learning_rate": 2.7507990434420123e-07, "logits/chosen": 0.704248309135437, "logits/rejected": 0.613318920135498, "logps/chosen": -149.1973876953125, "logps/ref_chosen": -56.285125732421875, "logps/ref_rejected": -91.15303039550781, "logps/rejected": -238.51168823242188, "loss": 1.0799, "margin_dpo/margin_mean": 54.446407318115234, "margin_dpo/margin_std": 75.3853988647461, "step": 346 }, { "KL/chosen_KL_mean": -102.66455841064453, "KL/mean": -123.77452087402344, "KL/rejected_KL_mean": -144.88449096679688, "KL/std": 69.3719482421875, "epoch": 0.5245653817082389, "fcm_dpo/beta": 0.008339539170265198, "fcm_dpo/delta": 0.04967883229255676, "fcm_dpo/margin": 42.219913482666016, "fcm_dpo/q_t": 0.4208434820175171, "grad_norm": 15.475859642028809, "learning_rate": 2.737640108260456e-07, "logits/chosen": 0.7972027063369751, "logits/rejected": 0.7434085607528687, "logps/chosen": -156.16409301757812, "logps/ref_chosen": -53.499542236328125, "logps/ref_rejected": -72.52565002441406, "logps/rejected": -217.41014099121094, "loss": 1.1499, "margin_dpo/margin_mean": 42.21991729736328, "margin_dpo/margin_std": 72.33638000488281, "step": 347 }, { "KL/chosen_KL_mean": -91.33484649658203, "KL/mean": -116.74815368652344, "KL/rejected_KL_mean": -142.16146850585938, "KL/std": 66.52864074707031, "epoch": 0.5260770975056689, "fcm_dpo/beta": 0.008313821628689766, "fcm_dpo/delta": -0.02378438226878643, "fcm_dpo/margin": 50.826629638671875, "fcm_dpo/q_t": 0.407415509223938, "grad_norm": 13.359156608581543, "learning_rate": 2.724474525774229e-07, "logits/chosen": 0.7767215967178345, "logits/rejected": 0.7447653412818909, "logps/chosen": -142.1217041015625, "logps/ref_chosen": -50.78684997558594, "logps/ref_rejected": -68.63732147216797, "logps/rejected": -210.7987823486328, "loss": 1.1143, "margin_dpo/margin_mean": 50.826629638671875, "margin_dpo/margin_std": 81.95619201660156, "step": 348 }, { "KL/chosen_KL_mean": -90.36459350585938, "KL/mean": -116.50259399414062, "KL/rejected_KL_mean": -142.64059448242188, "KL/std": 70.53898620605469, "epoch": 0.527588813303099, "fcm_dpo/beta": 0.00831620767712593, "fcm_dpo/delta": -0.03651543706655502, "fcm_dpo/margin": 52.27601623535156, "fcm_dpo/q_t": 0.4029201865196228, "grad_norm": 12.86361312866211, "learning_rate": 2.711302664252973e-07, "logits/chosen": 0.7323557138442993, "logits/rejected": 0.6329072713851929, "logps/chosen": -143.68960571289062, "logps/ref_chosen": -53.325008392333984, "logps/ref_rejected": -83.21236419677734, "logps/rejected": -225.85296630859375, "loss": 1.0928, "margin_dpo/margin_mean": 52.27601623535156, "margin_dpo/margin_std": 76.0950927734375, "step": 349 }, { "KL/chosen_KL_mean": -97.27735137939453, "KL/mean": -129.69583129882812, "KL/rejected_KL_mean": -162.11431884765625, "KL/std": 75.63677978515625, "epoch": 0.5291005291005291, "fcm_dpo/beta": 0.008093519136309624, "fcm_dpo/delta": -0.13191843032836914, "fcm_dpo/margin": 64.83696746826172, "fcm_dpo/q_t": 0.3799842596054077, "grad_norm": 16.259782791137695, "learning_rate": 2.698124892141971e-07, "logits/chosen": 0.6206036806106567, "logits/rejected": 0.5398292541503906, "logps/chosen": -158.9031219482422, "logps/ref_chosen": -61.625770568847656, "logps/ref_rejected": -87.63627624511719, "logps/rejected": -249.75057983398438, "loss": 1.0126, "margin_dpo/margin_mean": 64.83696746826172, "margin_dpo/margin_std": 74.60860443115234, "step": 350 }, { "KL/chosen_KL_mean": -92.27975463867188, "KL/mean": -116.63713073730469, "KL/rejected_KL_mean": -140.9945068359375, "KL/std": 65.88157653808594, "epoch": 0.5306122448979592, "fcm_dpo/beta": 0.008037666790187359, "fcm_dpo/delta": 0.008638240396976471, "fcm_dpo/margin": 48.71472930908203, "fcm_dpo/q_t": 0.4095621705055237, "grad_norm": 13.260066032409668, "learning_rate": 2.6849415780518357e-07, "logits/chosen": 0.6087486743927002, "logits/rejected": 0.5308948755264282, "logps/chosen": -148.53610229492188, "logps/ref_chosen": -56.2563362121582, "logps/ref_rejected": -79.11589813232422, "logps/rejected": -220.1103973388672, "loss": 1.1364, "margin_dpo/margin_mean": 48.71472930908203, "margin_dpo/margin_std": 81.64208984375, "step": 351 }, { "KL/chosen_KL_mean": -92.50155639648438, "KL/mean": -120.04151916503906, "KL/rejected_KL_mean": -147.5814666748047, "KL/std": 71.08851623535156, "epoch": 0.5321239606953893, "fcm_dpo/beta": 0.007998155429959297, "fcm_dpo/delta": -0.04257092997431755, "fcm_dpo/margin": 55.07991027832031, "fcm_dpo/q_t": 0.400249183177948, "grad_norm": 12.502577781677246, "learning_rate": 2.6717530907482024e-07, "logits/chosen": 0.7027904987335205, "logits/rejected": 0.6465107202529907, "logps/chosen": -155.5535125732422, "logps/ref_chosen": -63.05195236206055, "logps/ref_rejected": -85.52035522460938, "logps/rejected": -233.10182189941406, "loss": 1.0804, "margin_dpo/margin_mean": 55.07991027832031, "margin_dpo/margin_std": 76.85850524902344, "step": 352 }, { "KL/chosen_KL_mean": -90.88661193847656, "KL/mean": -117.17766571044922, "KL/rejected_KL_mean": -143.4687042236328, "KL/std": 67.98793029785156, "epoch": 0.5336356764928194, "fcm_dpo/beta": 0.007991371676325798, "fcm_dpo/delta": -0.021163104102015495, "fcm_dpo/margin": 52.58208465576172, "fcm_dpo/q_t": 0.40378886461257935, "grad_norm": 12.144339561462402, "learning_rate": 2.658559799141411e-07, "logits/chosen": 0.6585350036621094, "logits/rejected": 0.6634508967399597, "logps/chosen": -159.89581298828125, "logps/ref_chosen": -69.00918579101562, "logps/ref_rejected": -72.65840148925781, "logps/rejected": -216.12710571289062, "loss": 1.0923, "margin_dpo/margin_mean": 52.58208465576172, "margin_dpo/margin_std": 73.67620849609375, "step": 353 }, { "KL/chosen_KL_mean": -92.0418472290039, "KL/mean": -121.78923034667969, "KL/rejected_KL_mean": -151.53662109375, "KL/std": 66.71517181396484, "epoch": 0.5351473922902494, "fcm_dpo/beta": 0.007846582680940628, "fcm_dpo/delta": -0.07075677067041397, "fcm_dpo/margin": 59.49475860595703, "fcm_dpo/q_t": 0.3936957120895386, "grad_norm": 13.496247291564941, "learning_rate": 2.6453620722761895e-07, "logits/chosen": 0.7533440589904785, "logits/rejected": 0.6166994571685791, "logps/chosen": -131.83018493652344, "logps/ref_chosen": -39.78833770751953, "logps/ref_rejected": -69.56885528564453, "logps/rejected": -221.10546875, "loss": 1.0668, "margin_dpo/margin_mean": 59.4947624206543, "margin_dpo/margin_std": 80.03372192382812, "step": 354 }, { "KL/chosen_KL_mean": -98.19453430175781, "KL/mean": -128.38323974609375, "KL/rejected_KL_mean": -158.57196044921875, "KL/std": 74.0573959350586, "epoch": 0.5366591080876795, "fcm_dpo/beta": 0.007783657871186733, "fcm_dpo/delta": -0.07334629446268082, "fcm_dpo/margin": 60.377410888671875, "fcm_dpo/q_t": 0.3937861919403076, "grad_norm": 15.718170166015625, "learning_rate": 2.632160279321328e-07, "logits/chosen": 0.7349262833595276, "logits/rejected": 0.5939828157424927, "logps/chosen": -144.44992065429688, "logps/ref_chosen": -46.25537872314453, "logps/ref_rejected": -78.20236206054688, "logps/rejected": -236.77432250976562, "loss": 1.0763, "margin_dpo/margin_mean": 60.377410888671875, "margin_dpo/margin_std": 85.76248168945312, "step": 355 }, { "KL/chosen_KL_mean": -92.71003723144531, "KL/mean": -117.63554382324219, "KL/rejected_KL_mean": -142.56106567382812, "KL/std": 70.776123046875, "epoch": 0.5381708238851096, "fcm_dpo/beta": 0.007716212421655655, "fcm_dpo/delta": 0.01552538201212883, "fcm_dpo/margin": 49.851016998291016, "fcm_dpo/q_t": 0.41399505734443665, "grad_norm": 12.432771682739258, "learning_rate": 2.618954789559356e-07, "logits/chosen": 0.701271653175354, "logits/rejected": 0.6160717606544495, "logps/chosen": -140.61619567871094, "logps/ref_chosen": -47.906158447265625, "logps/ref_rejected": -74.29397583007812, "logps/rejected": -216.8550262451172, "loss": 1.1583, "margin_dpo/margin_mean": 49.85102081298828, "margin_dpo/margin_std": 91.85768127441406, "step": 356 }, { "KL/chosen_KL_mean": -110.83968353271484, "KL/mean": -132.12408447265625, "KL/rejected_KL_mean": -153.40847778320312, "KL/std": 70.29598999023438, "epoch": 0.5396825396825397, "fcm_dpo/beta": 0.007659477647393942, "fcm_dpo/delta": -0.06563596427440643, "fcm_dpo/margin": 42.56879425048828, "fcm_dpo/q_t": 0.42393821477890015, "grad_norm": 12.754502296447754, "learning_rate": 2.6057459723762076e-07, "logits/chosen": 0.693777322769165, "logits/rejected": 0.6680060625076294, "logps/chosen": -173.4746856689453, "logps/ref_chosen": -62.63500213623047, "logps/ref_rejected": -65.11399841308594, "logps/rejected": -218.52249145507812, "loss": 1.1746, "margin_dpo/margin_mean": 42.56879425048828, "margin_dpo/margin_std": 75.46603393554688, "step": 357 }, { "KL/chosen_KL_mean": -106.20034790039062, "KL/mean": -136.15338134765625, "KL/rejected_KL_mean": -166.10638427734375, "KL/std": 72.7083740234375, "epoch": 0.5411942554799698, "fcm_dpo/beta": 0.007647065445780754, "fcm_dpo/delta": -0.06158173456788063, "fcm_dpo/margin": 59.90604019165039, "fcm_dpo/q_t": 0.3970829248428345, "grad_norm": 16.220916748046875, "learning_rate": 2.5925341972508954e-07, "logits/chosen": 0.6153690814971924, "logits/rejected": 0.6308099031448364, "logps/chosen": -173.40997314453125, "logps/ref_chosen": -67.20960998535156, "logps/ref_rejected": -69.34715270996094, "logps/rejected": -235.45355224609375, "loss": 1.0741, "margin_dpo/margin_mean": 59.906036376953125, "margin_dpo/margin_std": 81.21144104003906, "step": 358 }, { "KL/chosen_KL_mean": -120.16806030273438, "KL/mean": -135.023193359375, "KL/rejected_KL_mean": -149.8783416748047, "KL/std": 70.89126586914062, "epoch": 0.5427059712773998, "fcm_dpo/beta": 0.007586339488625526, "fcm_dpo/delta": 0.027181455865502357, "fcm_dpo/margin": 29.710283279418945, "fcm_dpo/q_t": 0.4486175775527954, "grad_norm": 14.202005386352539, "learning_rate": 2.579319833745169e-07, "logits/chosen": 0.6100592613220215, "logits/rejected": 0.5783581137657166, "logps/chosen": -182.69384765625, "logps/ref_chosen": -62.52578353881836, "logps/ref_rejected": -76.63114929199219, "logps/rejected": -226.50949096679688, "loss": 1.2516, "margin_dpo/margin_mean": 29.710281372070312, "margin_dpo/margin_std": 73.21360778808594, "step": 359 }, { "KL/chosen_KL_mean": -115.72007751464844, "KL/mean": -140.93589782714844, "KL/rejected_KL_mean": -166.15171813964844, "KL/std": 75.85881042480469, "epoch": 0.54421768707483, "fcm_dpo/beta": 0.007609867490828037, "fcm_dpo/delta": 0.01681261509656906, "fcm_dpo/margin": 50.4316520690918, "fcm_dpo/q_t": 0.4133782684803009, "grad_norm": 11.879422187805176, "learning_rate": 2.5661032514931834e-07, "logits/chosen": 0.5995759963989258, "logits/rejected": 0.5037303566932678, "logps/chosen": -179.20779418945312, "logps/ref_chosen": -63.48772048950195, "logps/ref_rejected": -90.6891098022461, "logps/rejected": -256.8408203125, "loss": 1.12, "margin_dpo/margin_mean": 50.43164825439453, "margin_dpo/margin_std": 76.99786376953125, "step": 360 }, { "KL/chosen_KL_mean": -111.41509246826172, "KL/mean": -141.5027618408203, "KL/rejected_KL_mean": -171.59042358398438, "KL/std": 74.95464324951172, "epoch": 0.54572940287226, "fcm_dpo/beta": 0.007585292682051659, "fcm_dpo/delta": -0.05913050100207329, "fcm_dpo/margin": 60.175331115722656, "fcm_dpo/q_t": 0.3952474594116211, "grad_norm": 11.69443130493164, "learning_rate": 2.552884820191154e-07, "logits/chosen": 0.7824530601501465, "logits/rejected": 0.7306559085845947, "logps/chosen": -169.33224487304688, "logps/ref_chosen": -57.917144775390625, "logps/ref_rejected": -72.39089965820312, "logps/rejected": -243.9813232421875, "loss": 1.0599, "margin_dpo/margin_mean": 60.175331115722656, "margin_dpo/margin_std": 76.08274841308594, "step": 361 }, { "KL/chosen_KL_mean": -113.10176086425781, "KL/mean": -144.88027954101562, "KL/rejected_KL_mean": -176.65878295898438, "KL/std": 79.89237976074219, "epoch": 0.54724111866969, "fcm_dpo/beta": 0.0074944887310266495, "fcm_dpo/delta": -0.08031899482011795, "fcm_dpo/margin": 63.55701446533203, "fcm_dpo/q_t": 0.39540350437164307, "grad_norm": 13.342317581176758, "learning_rate": 2.53966490958702e-07, "logits/chosen": 0.7600926756858826, "logits/rejected": 0.6429303884506226, "logps/chosen": -176.54522705078125, "logps/ref_chosen": -63.4434700012207, "logps/ref_rejected": -103.45516967773438, "logps/rejected": -280.11395263671875, "loss": 1.0807, "margin_dpo/margin_mean": 63.55701446533203, "margin_dpo/margin_std": 94.53982543945312, "step": 362 }, { "KL/chosen_KL_mean": -118.61378479003906, "KL/mean": -148.8767547607422, "KL/rejected_KL_mean": -179.1397247314453, "KL/std": 71.99586486816406, "epoch": 0.5487528344671202, "fcm_dpo/beta": 0.007360072806477547, "fcm_dpo/delta": -0.04769909381866455, "fcm_dpo/margin": 60.52595520019531, "fcm_dpo/q_t": 0.3989811837673187, "grad_norm": 14.419317245483398, "learning_rate": 2.526443889470099e-07, "logits/chosen": 0.7493158578872681, "logits/rejected": 0.6072036027908325, "logps/chosen": -167.26560974121094, "logps/ref_chosen": -48.65182876586914, "logps/ref_rejected": -88.65904235839844, "logps/rejected": -267.79876708984375, "loss": 1.0764, "margin_dpo/margin_mean": 60.52595520019531, "margin_dpo/margin_std": 83.46663665771484, "step": 363 }, { "KL/chosen_KL_mean": -105.04194641113281, "KL/mean": -138.92398071289062, "KL/rejected_KL_mean": -172.8060302734375, "KL/std": 79.77528381347656, "epoch": 0.5502645502645502, "fcm_dpo/beta": 0.007244332693517208, "fcm_dpo/delta": -0.09565840661525726, "fcm_dpo/margin": 67.76409912109375, "fcm_dpo/q_t": 0.3908158838748932, "grad_norm": 11.784923553466797, "learning_rate": 2.513222129660744e-07, "logits/chosen": 0.5907178521156311, "logits/rejected": 0.5006883144378662, "logps/chosen": -162.91302490234375, "logps/ref_chosen": -57.87107467651367, "logps/ref_rejected": -80.95503234863281, "logps/rejected": -253.7610626220703, "loss": 1.0727, "margin_dpo/margin_mean": 67.76409912109375, "margin_dpo/margin_std": 99.25141143798828, "step": 364 }, { "KL/chosen_KL_mean": -97.74810791015625, "KL/mean": -129.11260986328125, "KL/rejected_KL_mean": -160.47711181640625, "KL/std": 83.03392028808594, "epoch": 0.5517762660619804, "fcm_dpo/beta": 0.007144194096326828, "fcm_dpo/delta": -0.05059466511011124, "fcm_dpo/margin": 62.729000091552734, "fcm_dpo/q_t": 0.39662739634513855, "grad_norm": 10.587419509887695, "learning_rate": 2.5e-07, "logits/chosen": 0.6932646036148071, "logits/rejected": 0.6871670484542847, "logps/chosen": -162.69027709960938, "logps/ref_chosen": -64.94217681884766, "logps/ref_rejected": -74.8599853515625, "logps/rejected": -235.33709716796875, "loss": 1.0513, "margin_dpo/margin_mean": 62.72900390625, "margin_dpo/margin_std": 73.52715301513672, "step": 365 }, { "KL/chosen_KL_mean": -97.37129211425781, "KL/mean": -123.66143798828125, "KL/rejected_KL_mean": -149.95156860351562, "KL/std": 73.82386779785156, "epoch": 0.5532879818594104, "fcm_dpo/beta": 0.007191378623247147, "fcm_dpo/delta": 0.022355100139975548, "fcm_dpo/margin": 52.580291748046875, "fcm_dpo/q_t": 0.4155963361263275, "grad_norm": 13.658476829528809, "learning_rate": 2.486777870339255e-07, "logits/chosen": 0.6577446460723877, "logits/rejected": 0.6454114317893982, "logps/chosen": -152.53726196289062, "logps/ref_chosen": -55.16598129272461, "logps/ref_rejected": -65.26121520996094, "logps/rejected": -215.21279907226562, "loss": 1.1562, "margin_dpo/margin_mean": 52.580291748046875, "margin_dpo/margin_std": 96.0002670288086, "step": 366 }, { "KL/chosen_KL_mean": -110.28848266601562, "KL/mean": -136.48446655273438, "KL/rejected_KL_mean": -162.68048095703125, "KL/std": 75.3712158203125, "epoch": 0.5547996976568406, "fcm_dpo/beta": 0.0071844179183244705, "fcm_dpo/delta": 0.024486679583787918, "fcm_dpo/margin": 52.39199447631836, "fcm_dpo/q_t": 0.4123581349849701, "grad_norm": 12.222148895263672, "learning_rate": 2.4735561105299014e-07, "logits/chosen": 0.6872113943099976, "logits/rejected": 0.5772612690925598, "logps/chosen": -166.2989501953125, "logps/ref_chosen": -56.01046371459961, "logps/ref_rejected": -77.31010437011719, "logps/rejected": -239.99057006835938, "loss": 1.1242, "margin_dpo/margin_mean": 52.391990661621094, "margin_dpo/margin_std": 80.36027526855469, "step": 367 }, { "KL/chosen_KL_mean": -121.23825073242188, "KL/mean": -146.34426879882812, "KL/rejected_KL_mean": -171.45030212402344, "KL/std": 74.06891632080078, "epoch": 0.5563114134542706, "fcm_dpo/beta": 0.007237586658447981, "fcm_dpo/delta": 0.03797682747244835, "fcm_dpo/margin": 50.21206283569336, "fcm_dpo/q_t": 0.415981650352478, "grad_norm": 13.983097076416016, "learning_rate": 2.46033509041298e-07, "logits/chosen": 0.5310577750205994, "logits/rejected": 0.5317566394805908, "logps/chosen": -196.06752014160156, "logps/ref_chosen": -74.82927703857422, "logps/ref_rejected": -76.11680603027344, "logps/rejected": -247.56710815429688, "loss": 1.1322, "margin_dpo/margin_mean": 50.21206283569336, "margin_dpo/margin_std": 78.31637573242188, "step": 368 }, { "KL/chosen_KL_mean": -113.98828125, "KL/mean": -135.5532989501953, "KL/rejected_KL_mean": -157.11831665039062, "KL/std": 74.72607421875, "epoch": 0.5578231292517006, "fcm_dpo/beta": 0.007384549826383591, "fcm_dpo/delta": 0.08346908539533615, "fcm_dpo/margin": 43.130043029785156, "fcm_dpo/q_t": 0.42775410413742065, "grad_norm": 12.494375228881836, "learning_rate": 2.447115179808846e-07, "logits/chosen": 0.7088351249694824, "logits/rejected": 0.6526861190795898, "logps/chosen": -172.3144989013672, "logps/ref_chosen": -58.32621765136719, "logps/ref_rejected": -80.92183685302734, "logps/rejected": -238.0401611328125, "loss": 1.1757, "margin_dpo/margin_mean": 43.130043029785156, "margin_dpo/margin_std": 79.66732025146484, "step": 369 }, { "KL/chosen_KL_mean": -110.04635620117188, "KL/mean": -140.6636962890625, "KL/rejected_KL_mean": -171.2810516357422, "KL/std": 77.74684143066406, "epoch": 0.5593348450491308, "fcm_dpo/beta": 0.007315288297832012, "fcm_dpo/delta": -0.050372689962387085, "fcm_dpo/margin": 61.23469543457031, "fcm_dpo/q_t": 0.39772140979766846, "grad_norm": 12.534859657287598, "learning_rate": 2.4338967485068164e-07, "logits/chosen": 0.773854672908783, "logits/rejected": 0.7020103931427002, "logps/chosen": -162.93008422851562, "logps/ref_chosen": -52.88372039794922, "logps/ref_rejected": -79.43692016601562, "logps/rejected": -250.7179718017578, "loss": 1.0899, "margin_dpo/margin_mean": 61.23469543457031, "margin_dpo/margin_std": 90.65008544921875, "step": 370 }, { "KL/chosen_KL_mean": -110.13628387451172, "KL/mean": -136.8223114013672, "KL/rejected_KL_mean": -163.5083465576172, "KL/std": 75.33867645263672, "epoch": 0.5608465608465608, "fcm_dpo/beta": 0.00735745532438159, "fcm_dpo/delta": 0.006999436765909195, "fcm_dpo/margin": 53.3720588684082, "fcm_dpo/q_t": 0.4111405611038208, "grad_norm": 16.180383682250977, "learning_rate": 2.420680166254831e-07, "logits/chosen": 0.8312065601348877, "logits/rejected": 0.7964282035827637, "logps/chosen": -159.36050415039062, "logps/ref_chosen": -49.224212646484375, "logps/ref_rejected": -63.348472595214844, "logps/rejected": -226.8568115234375, "loss": 1.1207, "margin_dpo/margin_mean": 53.3720588684082, "margin_dpo/margin_std": 82.38186645507812, "step": 371 }, { "KL/chosen_KL_mean": -118.095703125, "KL/mean": -134.08782958984375, "KL/rejected_KL_mean": -150.0799560546875, "KL/std": 74.64507293701172, "epoch": 0.562358276643991, "fcm_dpo/beta": 0.007336446549743414, "fcm_dpo/delta": 0.021236741915345192, "fcm_dpo/margin": 31.984272003173828, "fcm_dpo/q_t": 0.44746047258377075, "grad_norm": 16.273944854736328, "learning_rate": 2.4074658027491044e-07, "logits/chosen": 0.6723369359970093, "logits/rejected": 0.5741250514984131, "logps/chosen": -170.36526489257812, "logps/ref_chosen": -52.269554138183594, "logps/ref_rejected": -72.99522399902344, "logps/rejected": -223.0751953125, "loss": 1.2901, "margin_dpo/margin_mean": 31.984268188476562, "margin_dpo/margin_std": 97.44116973876953, "step": 372 }, { "KL/chosen_KL_mean": -130.05322265625, "KL/mean": -152.4982147216797, "KL/rejected_KL_mean": -174.94320678710938, "KL/std": 75.69514465332031, "epoch": 0.563869992441421, "fcm_dpo/beta": 0.007434169761836529, "fcm_dpo/delta": 0.06849108636379242, "fcm_dpo/margin": 44.88999938964844, "fcm_dpo/q_t": 0.425289511680603, "grad_norm": 13.385211944580078, "learning_rate": 2.394254027623792e-07, "logits/chosen": 0.694267749786377, "logits/rejected": 0.6211506128311157, "logps/chosen": -191.1662139892578, "logps/ref_chosen": -61.112998962402344, "logps/ref_rejected": -76.24851989746094, "logps/rejected": -251.1917266845703, "loss": 1.2026, "margin_dpo/margin_mean": 44.88999557495117, "margin_dpo/margin_std": 96.96002197265625, "step": 373 }, { "KL/chosen_KL_mean": -109.11332702636719, "KL/mean": -146.11172485351562, "KL/rejected_KL_mean": -183.11012268066406, "KL/std": 78.90641784667969, "epoch": 0.5653817082388511, "fcm_dpo/beta": 0.007282897364348173, "fcm_dpo/delta": -0.14705148339271545, "fcm_dpo/margin": 73.99679565429688, "fcm_dpo/q_t": 0.3776131272315979, "grad_norm": 12.882840156555176, "learning_rate": 2.381045210440644e-07, "logits/chosen": 0.5803056955337524, "logits/rejected": 0.5816897749900818, "logps/chosen": -181.78253173828125, "logps/ref_chosen": -72.66920471191406, "logps/ref_rejected": -76.83158874511719, "logps/rejected": -259.94171142578125, "loss": 1.0207, "margin_dpo/margin_mean": 73.99679565429688, "margin_dpo/margin_std": 91.89556884765625, "step": 374 }, { "KL/chosen_KL_mean": -107.30612182617188, "KL/mean": -134.16314697265625, "KL/rejected_KL_mean": -161.0201873779297, "KL/std": 83.42892456054688, "epoch": 0.5668934240362812, "fcm_dpo/beta": 0.007231268100440502, "fcm_dpo/delta": 0.011937053874135017, "fcm_dpo/margin": 53.71406555175781, "fcm_dpo/q_t": 0.4125698506832123, "grad_norm": 16.0015926361084, "learning_rate": 2.3678397206786715e-07, "logits/chosen": 0.6959929466247559, "logits/rejected": 0.6345775127410889, "logps/chosen": -164.9894256591797, "logps/ref_chosen": -57.68330383300781, "logps/ref_rejected": -79.34097290039062, "logps/rejected": -240.3611602783203, "loss": 1.1375, "margin_dpo/margin_mean": 53.71406555175781, "margin_dpo/margin_std": 91.20944213867188, "step": 375 }, { "KL/chosen_KL_mean": -116.24351501464844, "KL/mean": -148.65135192871094, "KL/rejected_KL_mean": -181.05918884277344, "KL/std": 82.94913482666016, "epoch": 0.5684051398337112, "fcm_dpo/beta": 0.007167559117078781, "fcm_dpo/delta": -0.06793060153722763, "fcm_dpo/margin": 64.815673828125, "fcm_dpo/q_t": 0.3970538377761841, "grad_norm": 13.341163635253906, "learning_rate": 2.3546379277238103e-07, "logits/chosen": 0.7436013221740723, "logits/rejected": 0.6675286293029785, "logps/chosen": -167.91758728027344, "logps/ref_chosen": -51.674072265625, "logps/ref_rejected": -75.69713592529297, "logps/rejected": -256.7563171386719, "loss": 1.0877, "margin_dpo/margin_mean": 64.815673828125, "margin_dpo/margin_std": 97.90460205078125, "step": 376 }, { "KL/chosen_KL_mean": -119.63145446777344, "KL/mean": -143.13299560546875, "KL/rejected_KL_mean": -166.63455200195312, "KL/std": 73.71533203125, "epoch": 0.5699168556311414, "fcm_dpo/beta": 0.007252939976751804, "fcm_dpo/delta": 0.06053512543439865, "fcm_dpo/margin": 47.00309753417969, "fcm_dpo/q_t": 0.4217602014541626, "grad_norm": 13.08659553527832, "learning_rate": 2.3414402008585886e-07, "logits/chosen": 0.6866433620452881, "logits/rejected": 0.6627596616744995, "logps/chosen": -165.8099822998047, "logps/ref_chosen": -46.17853546142578, "logps/ref_rejected": -57.756500244140625, "logps/rejected": -224.39105224609375, "loss": 1.1636, "margin_dpo/margin_mean": 47.00309753417969, "margin_dpo/margin_std": 84.02613830566406, "step": 377 }, { "KL/chosen_KL_mean": -119.20115661621094, "KL/mean": -140.73007202148438, "KL/rejected_KL_mean": -162.25900268554688, "KL/std": 78.96051025390625, "epoch": 0.5714285714285714, "fcm_dpo/beta": 0.007366209290921688, "fcm_dpo/delta": 0.0848575159907341, "fcm_dpo/margin": 43.057865142822266, "fcm_dpo/q_t": 0.4277133047580719, "grad_norm": 13.07816219329834, "learning_rate": 2.3282469092517977e-07, "logits/chosen": 0.7556013464927673, "logits/rejected": 0.7047852277755737, "logps/chosen": -178.42002868652344, "logps/ref_chosen": -59.21887969970703, "logps/ref_rejected": -71.24818420410156, "logps/rejected": -233.5072021484375, "loss": 1.183, "margin_dpo/margin_mean": 43.057861328125, "margin_dpo/margin_std": 81.8153076171875, "step": 378 }, { "KL/chosen_KL_mean": -114.84449768066406, "KL/mean": -145.83969116210938, "KL/rejected_KL_mean": -176.83486938476562, "KL/std": 81.19743347167969, "epoch": 0.5729402872260015, "fcm_dpo/beta": 0.007311449386179447, "fcm_dpo/delta": -0.05574117228388786, "fcm_dpo/margin": 61.99034881591797, "fcm_dpo/q_t": 0.39902064204216003, "grad_norm": 14.925641059875488, "learning_rate": 2.3150584219481643e-07, "logits/chosen": 0.6609820127487183, "logits/rejected": 0.5843163728713989, "logps/chosen": -191.1610870361328, "logps/ref_chosen": -76.31658935546875, "logps/ref_rejected": -104.26200103759766, "logps/rejected": -281.09686279296875, "loss": 1.0836, "margin_dpo/margin_mean": 61.99034881591797, "margin_dpo/margin_std": 90.74832153320312, "step": 379 }, { "KL/chosen_KL_mean": -99.79998016357422, "KL/mean": -136.4771728515625, "KL/rejected_KL_mean": -173.15435791015625, "KL/std": 75.69488525390625, "epoch": 0.5744520030234316, "fcm_dpo/beta": 0.007152165286242962, "fcm_dpo/delta": -0.13143953680992126, "fcm_dpo/margin": 73.35438537597656, "fcm_dpo/q_t": 0.3798936605453491, "grad_norm": 11.697549819946289, "learning_rate": 2.3018751078580283e-07, "logits/chosen": 0.7148063778877258, "logits/rejected": 0.6736452579498291, "logps/chosen": -161.08314514160156, "logps/ref_chosen": -61.283164978027344, "logps/ref_rejected": -72.38892364501953, "logps/rejected": -245.54327392578125, "loss": 1.0287, "margin_dpo/margin_mean": 73.35438537597656, "margin_dpo/margin_std": 91.20191192626953, "step": 380 }, { "KL/chosen_KL_mean": -122.75875091552734, "KL/mean": -136.77732849121094, "KL/rejected_KL_mean": -150.79592895507812, "KL/std": 77.52383422851562, "epoch": 0.5759637188208617, "fcm_dpo/beta": 0.00713972095400095, "fcm_dpo/delta": 0.05948397517204285, "fcm_dpo/margin": 28.03717803955078, "fcm_dpo/q_t": 0.4550970792770386, "grad_norm": 12.341383934020996, "learning_rate": 2.288697335747027e-07, "logits/chosen": 0.6320232152938843, "logits/rejected": 0.6118979454040527, "logps/chosen": -180.97274780273438, "logps/ref_chosen": -58.2139892578125, "logps/ref_rejected": -60.78669357299805, "logps/rejected": -211.58261108398438, "loss": 1.299, "margin_dpo/margin_mean": 28.03717803955078, "margin_dpo/margin_std": 92.05723571777344, "step": 381 }, { "KL/chosen_KL_mean": -123.69637298583984, "KL/mean": -149.37307739257812, "KL/rejected_KL_mean": -175.0498046875, "KL/std": 77.38346099853516, "epoch": 0.5774754346182918, "fcm_dpo/beta": 0.007241186685860157, "fcm_dpo/delta": 0.028807764872908592, "fcm_dpo/margin": 51.353416442871094, "fcm_dpo/q_t": 0.41480135917663574, "grad_norm": 12.818017959594727, "learning_rate": 2.2755254742257706e-07, "logits/chosen": 0.7081664800643921, "logits/rejected": 0.6503809690475464, "logps/chosen": -185.52169799804688, "logps/ref_chosen": -61.82532501220703, "logps/ref_rejected": -83.0452880859375, "logps/rejected": -258.0950927734375, "loss": 1.122, "margin_dpo/margin_mean": 51.35341262817383, "margin_dpo/margin_std": 76.22883605957031, "step": 382 }, { "KL/chosen_KL_mean": -123.33224487304688, "KL/mean": -147.388916015625, "KL/rejected_KL_mean": -171.44558715820312, "KL/std": 78.26579284667969, "epoch": 0.5789871504157218, "fcm_dpo/beta": 0.00725115742534399, "fcm_dpo/delta": 0.052844174206256866, "fcm_dpo/margin": 48.11332702636719, "fcm_dpo/q_t": 0.4227685034275055, "grad_norm": 14.382017135620117, "learning_rate": 2.2623598917395436e-07, "logits/chosen": 0.5631277561187744, "logits/rejected": 0.5959005355834961, "logps/chosen": -203.8955078125, "logps/ref_chosen": -80.56326293945312, "logps/ref_rejected": -74.62922668457031, "logps/rejected": -246.07479858398438, "loss": 1.1887, "margin_dpo/margin_mean": 48.11333465576172, "margin_dpo/margin_std": 98.64120483398438, "step": 383 }, { "KL/chosen_KL_mean": -119.82319641113281, "KL/mean": -145.9679412841797, "KL/rejected_KL_mean": -172.1126708984375, "KL/std": 77.66960144042969, "epoch": 0.5804988662131519, "fcm_dpo/beta": 0.0073190066032111645, "fcm_dpo/delta": 0.0179832074791193, "fcm_dpo/margin": 52.28948211669922, "fcm_dpo/q_t": 0.4109513759613037, "grad_norm": 15.066558837890625, "learning_rate": 2.2492009565579875e-07, "logits/chosen": 0.7564608454704285, "logits/rejected": 0.7100478410720825, "logps/chosen": -185.29833984375, "logps/ref_chosen": -65.47514343261719, "logps/ref_rejected": -79.67378234863281, "logps/rejected": -251.78646850585938, "loss": 1.1257, "margin_dpo/margin_mean": 52.28948211669922, "margin_dpo/margin_std": 82.88512420654297, "step": 384 }, { "KL/chosen_KL_mean": -117.14468383789062, "KL/mean": -151.48260498046875, "KL/rejected_KL_mean": -185.82054138183594, "KL/std": 78.35690307617188, "epoch": 0.582010582010582, "fcm_dpo/beta": 0.007244712673127651, "fcm_dpo/delta": -0.10255894809961319, "fcm_dpo/margin": 68.67587280273438, "fcm_dpo/q_t": 0.387284517288208, "grad_norm": 12.45457935333252, "learning_rate": 2.2360490367648084e-07, "logits/chosen": 0.5941613912582397, "logits/rejected": 0.5519691705703735, "logps/chosen": -183.20120239257812, "logps/ref_chosen": -66.0565185546875, "logps/ref_rejected": -86.68023681640625, "logps/rejected": -272.50079345703125, "loss": 1.0359, "margin_dpo/margin_mean": 68.67587280273438, "margin_dpo/margin_std": 83.38473510742188, "step": 385 }, { "KL/chosen_KL_mean": -136.16485595703125, "KL/mean": -158.68191528320312, "KL/rejected_KL_mean": -181.19894409179688, "KL/std": 81.03706359863281, "epoch": 0.5835222978080121, "fcm_dpo/beta": 0.007247047498822212, "fcm_dpo/delta": 0.07619121670722961, "fcm_dpo/margin": 45.03410720825195, "fcm_dpo/q_t": 0.42447221279144287, "grad_norm": 13.433361053466797, "learning_rate": 2.2229045002474724e-07, "logits/chosen": 0.5844802856445312, "logits/rejected": 0.5262288451194763, "logps/chosen": -211.78851318359375, "logps/ref_chosen": -75.6236572265625, "logps/ref_rejected": -92.62330627441406, "logps/rejected": -273.822265625, "loss": 1.1744, "margin_dpo/margin_mean": 45.03410720825195, "margin_dpo/margin_std": 83.9128189086914, "step": 386 }, { "KL/chosen_KL_mean": -119.8702392578125, "KL/mean": -152.5230712890625, "KL/rejected_KL_mean": -185.17593383789062, "KL/std": 76.57776641845703, "epoch": 0.5850340136054422, "fcm_dpo/beta": 0.007200167048722506, "fcm_dpo/delta": -0.07379137724637985, "fcm_dpo/margin": 65.30567932128906, "fcm_dpo/q_t": 0.3926030397415161, "grad_norm": 13.578509330749512, "learning_rate": 2.209767714686924e-07, "logits/chosen": 0.6923336982727051, "logits/rejected": 0.5794006586074829, "logps/chosen": -167.09194946289062, "logps/ref_chosen": -47.22170639038086, "logps/ref_rejected": -87.338134765625, "logps/rejected": -272.5140686035156, "loss": 1.0411, "margin_dpo/margin_mean": 65.30567932128906, "margin_dpo/margin_std": 77.0108642578125, "step": 387 }, { "KL/chosen_KL_mean": -122.36817932128906, "KL/mean": -142.67022705078125, "KL/rejected_KL_mean": -162.9722900390625, "KL/std": 80.08634185791016, "epoch": 0.5865457294028723, "fcm_dpo/beta": 0.007184567395597696, "fcm_dpo/delta": 0.0021784361451864243, "fcm_dpo/margin": 40.60411071777344, "fcm_dpo/q_t": 0.435527503490448, "grad_norm": 13.143341064453125, "learning_rate": 2.1966390475472954e-07, "logits/chosen": 0.6523053646087646, "logits/rejected": 0.6470510959625244, "logps/chosen": -196.94766235351562, "logps/ref_chosen": -74.5794677734375, "logps/ref_rejected": -79.92558288574219, "logps/rejected": -242.89788818359375, "loss": 1.2264, "margin_dpo/margin_mean": 40.6041145324707, "margin_dpo/margin_std": 95.03282928466797, "step": 388 }, { "KL/chosen_KL_mean": -116.84317779541016, "KL/mean": -149.46731567382812, "KL/rejected_KL_mean": -182.09146118164062, "KL/std": 78.38906860351562, "epoch": 0.5880574452003023, "fcm_dpo/beta": 0.00712235551327467, "fcm_dpo/delta": -0.06782112270593643, "fcm_dpo/margin": 65.24827575683594, "fcm_dpo/q_t": 0.3948487639427185, "grad_norm": 24.90400505065918, "learning_rate": 2.1835188660656265e-07, "logits/chosen": 0.7172328233718872, "logits/rejected": 0.6779006719589233, "logps/chosen": -178.46754455566406, "logps/ref_chosen": -61.624366760253906, "logps/ref_rejected": -76.50978088378906, "logps/rejected": -258.6012268066406, "loss": 1.0635, "margin_dpo/margin_mean": 65.24828338623047, "margin_dpo/margin_std": 87.07835388183594, "step": 389 }, { "KL/chosen_KL_mean": -105.14115142822266, "KL/mean": -130.4149932861328, "KL/rejected_KL_mean": -155.6888427734375, "KL/std": 75.40281677246094, "epoch": 0.5895691609977324, "fcm_dpo/beta": 0.007114923559129238, "fcm_dpo/delta": 0.04184433072805405, "fcm_dpo/margin": 50.54767990112305, "fcm_dpo/q_t": 0.4177587330341339, "grad_norm": 11.612406730651855, "learning_rate": 2.170407537241599e-07, "logits/chosen": 0.7645602226257324, "logits/rejected": 0.6881904602050781, "logps/chosen": -151.0130157470703, "logps/ref_chosen": -45.871864318847656, "logps/ref_rejected": -61.305999755859375, "logps/rejected": -216.99484252929688, "loss": 1.1343, "margin_dpo/margin_mean": 50.54767990112305, "margin_dpo/margin_std": 79.19270324707031, "step": 390 }, { "KL/chosen_KL_mean": -117.92922973632812, "KL/mean": -148.20327758789062, "KL/rejected_KL_mean": -178.477294921875, "KL/std": 77.03036499023438, "epoch": 0.5910808767951625, "fcm_dpo/beta": 0.007091089151799679, "fcm_dpo/delta": -0.030991503968834877, "fcm_dpo/margin": 60.54808044433594, "fcm_dpo/q_t": 0.40246233344078064, "grad_norm": 12.812200546264648, "learning_rate": 2.1573054278272636e-07, "logits/chosen": 0.673202633857727, "logits/rejected": 0.6029750108718872, "logps/chosen": -176.11624145507812, "logps/ref_chosen": -58.18701171875, "logps/ref_rejected": -83.63442993164062, "logps/rejected": -262.1117248535156, "loss": 1.1098, "margin_dpo/margin_mean": 60.54808044433594, "margin_dpo/margin_std": 94.94903564453125, "step": 391 }, { "KL/chosen_KL_mean": -104.69487762451172, "KL/mean": -137.88522338867188, "KL/rejected_KL_mean": -171.0755615234375, "KL/std": 81.64057922363281, "epoch": 0.5925925925925926, "fcm_dpo/beta": 0.007069198414683342, "fcm_dpo/delta": -0.07297083735466003, "fcm_dpo/margin": 66.38069915771484, "fcm_dpo/q_t": 0.3947794437408447, "grad_norm": 11.337733268737793, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 0.725287914276123, "logits/rejected": 0.6642845869064331, "logps/chosen": -174.4394073486328, "logps/ref_chosen": -69.7445297241211, "logps/ref_rejected": -94.05877685546875, "logps/rejected": -265.13433837890625, "loss": 1.0791, "margin_dpo/margin_mean": 66.38069915771484, "margin_dpo/margin_std": 95.79165649414062, "step": 392 }, { "KL/chosen_KL_mean": -119.12683868408203, "KL/mean": -153.39144897460938, "KL/rejected_KL_mean": -187.65603637695312, "KL/std": 80.12135314941406, "epoch": 0.5941043083900227, "fcm_dpo/beta": 0.006890019401907921, "fcm_dpo/delta": -0.07650090008974075, "fcm_dpo/margin": 68.52922058105469, "fcm_dpo/q_t": 0.3914652466773987, "grad_norm": 11.588418006896973, "learning_rate": 2.131130332936195e-07, "logits/chosen": 0.7181768417358398, "logits/rejected": 0.6763323545455933, "logps/chosen": -171.46173095703125, "logps/ref_chosen": -52.33489990234375, "logps/ref_rejected": -74.33809661865234, "logps/rejected": -261.994140625, "loss": 1.0425, "margin_dpo/margin_mean": 68.52922058105469, "margin_dpo/margin_std": 80.68852233886719, "step": 393 }, { "KL/chosen_KL_mean": -113.88552856445312, "KL/mean": -143.79721069335938, "KL/rejected_KL_mean": -173.70889282226562, "KL/std": 73.71353149414062, "epoch": 0.5956160241874527, "fcm_dpo/beta": 0.006903508678078651, "fcm_dpo/delta": -0.013742895796895027, "fcm_dpo/margin": 59.8233757019043, "fcm_dpo/q_t": 0.4036034345626831, "grad_norm": 11.373407363891602, "learning_rate": 2.1180580796331323e-07, "logits/chosen": 0.7522497177124023, "logits/rejected": 0.7197262048721313, "logps/chosen": -174.56166076660156, "logps/ref_chosen": -60.6761360168457, "logps/ref_rejected": -71.36074829101562, "logps/rejected": -245.06964111328125, "loss": 1.0801, "margin_dpo/margin_mean": 59.8233757019043, "margin_dpo/margin_std": 73.93080139160156, "step": 394 }, { "KL/chosen_KL_mean": -117.74978637695312, "KL/mean": -144.3744354248047, "KL/rejected_KL_mean": -170.99908447265625, "KL/std": 76.4329833984375, "epoch": 0.5971277399848829, "fcm_dpo/beta": 0.00693103764206171, "fcm_dpo/delta": 0.0317731611430645, "fcm_dpo/margin": 53.24930191040039, "fcm_dpo/q_t": 0.41695183515548706, "grad_norm": 13.579878807067871, "learning_rate": 2.104996510066625e-07, "logits/chosen": 0.6971070766448975, "logits/rejected": 0.5955677032470703, "logps/chosen": -168.35411071777344, "logps/ref_chosen": -50.60432434082031, "logps/ref_rejected": -77.08731079101562, "logps/rejected": -248.08639526367188, "loss": 1.1268, "margin_dpo/margin_mean": 53.24930191040039, "margin_dpo/margin_std": 82.009033203125, "step": 395 }, { "KL/chosen_KL_mean": -111.3974838256836, "KL/mean": -140.07427978515625, "KL/rejected_KL_mean": -168.75108337402344, "KL/std": 85.39405059814453, "epoch": 0.5986394557823129, "fcm_dpo/beta": 0.006866908632218838, "fcm_dpo/delta": 0.005037456750869751, "fcm_dpo/margin": 57.35359191894531, "fcm_dpo/q_t": 0.40966546535491943, "grad_norm": 11.246968269348145, "learning_rate": 2.0919459895968517e-07, "logits/chosen": 0.7125102877616882, "logits/rejected": 0.6121931076049805, "logps/chosen": -162.75709533691406, "logps/ref_chosen": -51.35961151123047, "logps/ref_rejected": -79.89360046386719, "logps/rejected": -248.64468383789062, "loss": 1.0992, "margin_dpo/margin_mean": 57.35358810424805, "margin_dpo/margin_std": 74.61653137207031, "step": 396 }, { "KL/chosen_KL_mean": -127.80692291259766, "KL/mean": -143.97879028320312, "KL/rejected_KL_mean": -160.15065002441406, "KL/std": 78.08262634277344, "epoch": 0.600151171579743, "fcm_dpo/beta": 0.007112812250852585, "fcm_dpo/delta": 0.17382901906967163, "fcm_dpo/margin": 32.343746185302734, "fcm_dpo/q_t": 0.4479358494281769, "grad_norm": 12.444058418273926, "learning_rate": 2.078906883274924e-07, "logits/chosen": 0.6378138661384583, "logits/rejected": 0.585270881652832, "logps/chosen": -194.26315307617188, "logps/ref_chosen": -66.45622253417969, "logps/ref_rejected": -85.74736785888672, "logps/rejected": -245.89801025390625, "loss": 1.276, "margin_dpo/margin_mean": 32.343746185302734, "margin_dpo/margin_std": 93.49945831298828, "step": 397 }, { "KL/chosen_KL_mean": -109.18138122558594, "KL/mean": -145.88284301757812, "KL/rejected_KL_mean": -182.58428955078125, "KL/std": 81.37733459472656, "epoch": 0.6016628873771731, "fcm_dpo/beta": 0.007010785397142172, "fcm_dpo/delta": -0.12178494781255722, "fcm_dpo/margin": 73.40287780761719, "fcm_dpo/q_t": 0.38393402099609375, "grad_norm": 10.435127258300781, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.6973479986190796, "logits/rejected": 0.627879798412323, "logps/chosen": -158.42562866210938, "logps/ref_chosen": -49.244239807128906, "logps/ref_rejected": -75.18949127197266, "logps/rejected": -257.7737731933594, "loss": 1.0154, "margin_dpo/margin_mean": 73.40287780761719, "margin_dpo/margin_std": 84.17864990234375, "step": 398 }, { "KL/chosen_KL_mean": -127.14164733886719, "KL/mean": -168.18804931640625, "KL/rejected_KL_mean": -209.23443603515625, "KL/std": 87.93510437011719, "epoch": 0.6031746031746031, "fcm_dpo/beta": 0.006795777007937431, "fcm_dpo/delta": -0.16834063827991486, "fcm_dpo/margin": 82.09280395507812, "fcm_dpo/q_t": 0.3756554424762726, "grad_norm": 12.41195297241211, "learning_rate": 2.052864371672457e-07, "logits/chosen": 0.6412574052810669, "logits/rejected": 0.4887663424015045, "logps/chosen": -195.44842529296875, "logps/ref_chosen": -68.30679321289062, "logps/ref_rejected": -113.2708511352539, "logps/rejected": -322.50531005859375, "loss": 1.0013, "margin_dpo/margin_mean": 82.09280395507812, "margin_dpo/margin_std": 97.44155883789062, "step": 399 }, { "KL/chosen_KL_mean": -141.19509887695312, "KL/mean": -164.96173095703125, "KL/rejected_KL_mean": -188.72833251953125, "KL/std": 81.94612121582031, "epoch": 0.6046863189720333, "fcm_dpo/beta": 0.006722395773977041, "fcm_dpo/delta": -0.03278467804193497, "fcm_dpo/margin": 47.53325653076172, "fcm_dpo/q_t": 0.42564094066619873, "grad_norm": 17.240482330322266, "learning_rate": 2.0398616948569493e-07, "logits/chosen": 0.7048459649085999, "logits/rejected": 0.6405730247497559, "logps/chosen": -212.82159423828125, "logps/ref_chosen": -71.62649536132812, "logps/ref_rejected": -90.98765563964844, "logps/rejected": -279.71600341796875, "loss": 1.1671, "margin_dpo/margin_mean": 47.53325653076172, "margin_dpo/margin_std": 80.81613159179688, "step": 400 }, { "KL/chosen_KL_mean": -108.57626342773438, "KL/mean": -142.01815795898438, "KL/rejected_KL_mean": -175.46006774902344, "KL/std": 87.77301025390625, "epoch": 0.6061980347694633, "fcm_dpo/beta": 0.006652448792010546, "fcm_dpo/delta": -0.04743156582117081, "fcm_dpo/margin": 66.8838119506836, "fcm_dpo/q_t": 0.39868029952049255, "grad_norm": 9.973950386047363, "learning_rate": 2.0268718890989752e-07, "logits/chosen": 0.7249618768692017, "logits/rejected": 0.6270743608474731, "logps/chosen": -162.30120849609375, "logps/ref_chosen": -53.72495651245117, "logps/ref_rejected": -75.06304931640625, "logps/rejected": -250.5231170654297, "loss": 1.0624, "margin_dpo/margin_mean": 66.88381958007812, "margin_dpo/margin_std": 83.62776184082031, "step": 401 }, { "KL/chosen_KL_mean": -117.71533203125, "KL/mean": -143.90321350097656, "KL/rejected_KL_mean": -170.09109497070312, "KL/std": 76.22942352294922, "epoch": 0.6077097505668935, "fcm_dpo/beta": 0.006672222167253494, "fcm_dpo/delta": 0.05198511481285095, "fcm_dpo/margin": 52.375770568847656, "fcm_dpo/q_t": 0.4207387864589691, "grad_norm": 12.8624267578125, "learning_rate": 2.013895317751323e-07, "logits/chosen": 0.6989619135856628, "logits/rejected": 0.6750861406326294, "logps/chosen": -179.58926391601562, "logps/ref_chosen": -61.873931884765625, "logps/ref_rejected": -66.15198516845703, "logps/rejected": -236.2430877685547, "loss": 1.1603, "margin_dpo/margin_mean": 52.37577819824219, "margin_dpo/margin_std": 91.92092895507812, "step": 402 }, { "KL/chosen_KL_mean": -124.72791290283203, "KL/mean": -159.05577087402344, "KL/rejected_KL_mean": -193.38363647460938, "KL/std": 86.9122314453125, "epoch": 0.6092214663643235, "fcm_dpo/beta": 0.006661761552095413, "fcm_dpo/delta": -0.06019973009824753, "fcm_dpo/margin": 68.65573120117188, "fcm_dpo/q_t": 0.39702337980270386, "grad_norm": 10.455925941467285, "learning_rate": 2.0009323437965898e-07, "logits/chosen": 0.8395401239395142, "logits/rejected": 0.7461362481117249, "logps/chosen": -176.04940795898438, "logps/ref_chosen": -51.321502685546875, "logps/ref_rejected": -86.54010772705078, "logps/rejected": -279.92376708984375, "loss": 1.07, "margin_dpo/margin_mean": 68.65573120117188, "margin_dpo/margin_std": 92.64551544189453, "step": 403 }, { "KL/chosen_KL_mean": -116.79646301269531, "KL/mean": -151.17056274414062, "KL/rejected_KL_mean": -185.54466247558594, "KL/std": 88.23384094238281, "epoch": 0.6107331821617535, "fcm_dpo/beta": 0.00654949527233839, "fcm_dpo/delta": -0.05398311838507652, "fcm_dpo/margin": 68.74818420410156, "fcm_dpo/q_t": 0.3989015221595764, "grad_norm": 14.889890670776367, "learning_rate": 1.9879833298370237e-07, "logits/chosen": 0.6515312194824219, "logits/rejected": 0.5520956516265869, "logps/chosen": -179.05935668945312, "logps/ref_chosen": -62.26288604736328, "logps/ref_rejected": -95.19029998779297, "logps/rejected": -280.7349548339844, "loss": 1.0814, "margin_dpo/margin_mean": 68.74818420410156, "margin_dpo/margin_std": 95.72023010253906, "step": 404 }, { "KL/chosen_KL_mean": -120.44222259521484, "KL/mean": -148.1865692138672, "KL/rejected_KL_mean": -175.930908203125, "KL/std": 79.2171859741211, "epoch": 0.6122448979591837, "fcm_dpo/beta": 0.006502949167042971, "fcm_dpo/delta": -0.05631444603204727, "fcm_dpo/margin": 55.48868942260742, "fcm_dpo/q_t": 0.4172636866569519, "grad_norm": 11.45528507232666, "learning_rate": 1.975048638084379e-07, "logits/chosen": 0.7875458002090454, "logits/rejected": 0.7361311912536621, "logps/chosen": -171.0265655517578, "logps/ref_chosen": -50.5843391418457, "logps/ref_rejected": -65.43156433105469, "logps/rejected": -241.36248779296875, "loss": 1.1298, "margin_dpo/margin_mean": 55.48868942260742, "margin_dpo/margin_std": 81.64513397216797, "step": 405 }, { "KL/chosen_KL_mean": -115.50141143798828, "KL/mean": -152.22653198242188, "KL/rejected_KL_mean": -188.95166015625, "KL/std": 85.67283630371094, "epoch": 0.6137566137566137, "fcm_dpo/beta": 0.006442304700613022, "fcm_dpo/delta": -0.07675281167030334, "fcm_dpo/margin": 73.45025634765625, "fcm_dpo/q_t": 0.39158371090888977, "grad_norm": 12.25676441192627, "learning_rate": 1.9621286303497914e-07, "logits/chosen": 0.7497522830963135, "logits/rejected": 0.5800542235374451, "logps/chosen": -164.49700927734375, "logps/ref_chosen": -48.99560546875, "logps/ref_rejected": -92.47774505615234, "logps/rejected": -281.42938232421875, "loss": 1.058, "margin_dpo/margin_mean": 73.45025634765625, "margin_dpo/margin_std": 95.53860473632812, "step": 406 }, { "KL/chosen_KL_mean": -143.9900665283203, "KL/mean": -172.96371459960938, "KL/rejected_KL_mean": -201.9373779296875, "KL/std": 95.75923156738281, "epoch": 0.6152683295540439, "fcm_dpo/beta": 0.006449670530855656, "fcm_dpo/delta": 0.026999279856681824, "fcm_dpo/margin": 57.94728088378906, "fcm_dpo/q_t": 0.41506102681159973, "grad_norm": 13.276222229003906, "learning_rate": 1.9492236680336483e-07, "logits/chosen": 0.5893508195877075, "logits/rejected": 0.5180951356887817, "logps/chosen": -233.390625, "logps/ref_chosen": -89.40056610107422, "logps/ref_rejected": -99.28775024414062, "logps/rejected": -301.2251281738281, "loss": 1.1425, "margin_dpo/margin_mean": 57.94728088378906, "margin_dpo/margin_std": 98.04299926757812, "step": 407 }, { "KL/chosen_KL_mean": -109.03900146484375, "KL/mean": -150.15414428710938, "KL/rejected_KL_mean": -191.269287109375, "KL/std": 81.42919921875, "epoch": 0.6167800453514739, "fcm_dpo/beta": 0.006355122663080692, "fcm_dpo/delta": -0.12931808829307556, "fcm_dpo/margin": 82.23027038574219, "fcm_dpo/q_t": 0.3791518211364746, "grad_norm": 10.347234725952148, "learning_rate": 1.9363341121154895e-07, "logits/chosen": 0.6950646638870239, "logits/rejected": 0.6137909889221191, "logps/chosen": -163.742919921875, "logps/ref_chosen": -54.70391845703125, "logps/ref_rejected": -73.98648834228516, "logps/rejected": -265.2557678222656, "loss": 1.0043, "margin_dpo/margin_mean": 82.23027038574219, "margin_dpo/margin_std": 86.0679931640625, "step": 408 }, { "KL/chosen_KL_mean": -134.66537475585938, "KL/mean": -153.56837463378906, "KL/rejected_KL_mean": -172.4713897705078, "KL/std": 71.11965942382812, "epoch": 0.618291761148904, "fcm_dpo/beta": 0.006422149017453194, "fcm_dpo/delta": 0.1612337976694107, "fcm_dpo/margin": 37.8060188293457, "fcm_dpo/q_t": 0.44450077414512634, "grad_norm": 12.294546127319336, "learning_rate": 1.9234603231438994e-07, "logits/chosen": 0.7155551910400391, "logits/rejected": 0.7242038249969482, "logps/chosen": -196.78359985351562, "logps/ref_chosen": -62.11822509765625, "logps/ref_rejected": -61.933509826660156, "logps/rejected": -234.4049072265625, "loss": 1.2245, "margin_dpo/margin_mean": 37.8060188293457, "margin_dpo/margin_std": 81.56556701660156, "step": 409 }, { "KL/chosen_KL_mean": -125.79985046386719, "KL/mean": -159.2677001953125, "KL/rejected_KL_mean": -192.7355499267578, "KL/std": 78.95205688476562, "epoch": 0.6198034769463341, "fcm_dpo/beta": 0.006404371000826359, "fcm_dpo/delta": -0.031209833920001984, "fcm_dpo/margin": 66.93568420410156, "fcm_dpo/q_t": 0.40025562047958374, "grad_norm": 11.22482967376709, "learning_rate": 1.9106026612264315e-07, "logits/chosen": 0.7111754417419434, "logits/rejected": 0.6851361989974976, "logps/chosen": -187.60250854492188, "logps/ref_chosen": -61.80266189575195, "logps/ref_rejected": -76.60002136230469, "logps/rejected": -269.3355712890625, "loss": 1.0633, "margin_dpo/margin_mean": 66.93568420410156, "margin_dpo/margin_std": 76.35328674316406, "step": 410 }, { "KL/chosen_KL_mean": -132.326171875, "KL/mean": -164.41603088378906, "KL/rejected_KL_mean": -196.50588989257812, "KL/std": 86.12704467773438, "epoch": 0.6213151927437641, "fcm_dpo/beta": 0.006428801920264959, "fcm_dpo/delta": -0.013138813897967339, "fcm_dpo/margin": 64.17971801757812, "fcm_dpo/q_t": 0.40587669610977173, "grad_norm": 11.010346412658691, "learning_rate": 1.8977614860195296e-07, "logits/chosen": 0.7318480014801025, "logits/rejected": 0.6702442169189453, "logps/chosen": -186.7715606689453, "logps/ref_chosen": -54.44539260864258, "logps/ref_rejected": -74.5650863647461, "logps/rejected": -271.07098388671875, "loss": 1.0975, "margin_dpo/margin_mean": 64.17972564697266, "margin_dpo/margin_std": 92.28285217285156, "step": 411 }, { "KL/chosen_KL_mean": -136.4827880859375, "KL/mean": -165.96719360351562, "KL/rejected_KL_mean": -195.45162963867188, "KL/std": 75.48411560058594, "epoch": 0.6228269085411943, "fcm_dpo/beta": 0.006420055404305458, "fcm_dpo/delta": 0.022063056007027626, "fcm_dpo/margin": 58.968833923339844, "fcm_dpo/q_t": 0.4125543534755707, "grad_norm": 12.068582534790039, "learning_rate": 1.8849371567184662e-07, "logits/chosen": 0.7080731391906738, "logits/rejected": 0.6416032314300537, "logps/chosen": -191.73086547851562, "logps/ref_chosen": -55.248085021972656, "logps/ref_rejected": -68.96623229980469, "logps/rejected": -264.4178466796875, "loss": 1.1081, "margin_dpo/margin_mean": 58.968833923339844, "margin_dpo/margin_std": 80.7718505859375, "step": 412 }, { "KL/chosen_KL_mean": -149.6476593017578, "KL/mean": -176.8856201171875, "KL/rejected_KL_mean": -204.12356567382812, "KL/std": 85.43605041503906, "epoch": 0.6243386243386243, "fcm_dpo/beta": 0.006504066288471222, "fcm_dpo/delta": 0.04728236049413681, "fcm_dpo/margin": 54.47590255737305, "fcm_dpo/q_t": 0.42095059156417847, "grad_norm": 13.920833587646484, "learning_rate": 1.872130032047302e-07, "logits/chosen": 0.5637932419776917, "logits/rejected": 0.5278281569480896, "logps/chosen": -218.368408203125, "logps/ref_chosen": -68.72074890136719, "logps/ref_rejected": -78.76539611816406, "logps/rejected": -282.88897705078125, "loss": 1.1758, "margin_dpo/margin_mean": 54.47590255737305, "margin_dpo/margin_std": 105.6761245727539, "step": 413 }, { "KL/chosen_KL_mean": -132.0833740234375, "KL/mean": -165.0142059326172, "KL/rejected_KL_mean": -197.94503784179688, "KL/std": 91.266357421875, "epoch": 0.6258503401360545, "fcm_dpo/beta": 0.006491639651358128, "fcm_dpo/delta": -0.02877388335764408, "fcm_dpo/margin": 65.86165618896484, "fcm_dpo/q_t": 0.4008955657482147, "grad_norm": 10.989670753479004, "learning_rate": 1.8593404702488436e-07, "logits/chosen": 0.7148370742797852, "logits/rejected": 0.6511275172233582, "logps/chosen": -186.22158813476562, "logps/ref_chosen": -54.138214111328125, "logps/ref_rejected": -74.65741729736328, "logps/rejected": -272.6024475097656, "loss": 1.0773, "margin_dpo/margin_mean": 65.86166381835938, "margin_dpo/margin_std": 85.32364654541016, "step": 414 }, { "KL/chosen_KL_mean": -131.07241821289062, "KL/mean": -158.78738403320312, "KL/rejected_KL_mean": -186.5023193359375, "KL/std": 85.7287826538086, "epoch": 0.6273620559334845, "fcm_dpo/beta": 0.006505992729216814, "fcm_dpo/delta": 0.040865208953619, "fcm_dpo/margin": 55.42988586425781, "fcm_dpo/q_t": 0.41789719462394714, "grad_norm": 11.600848197937012, "learning_rate": 1.846568829074628e-07, "logits/chosen": 0.7361269593238831, "logits/rejected": 0.7179920673370361, "logps/chosen": -186.99098205566406, "logps/ref_chosen": -55.91856002807617, "logps/ref_rejected": -61.747703552246094, "logps/rejected": -248.25003051757812, "loss": 1.1444, "margin_dpo/margin_mean": 55.42988586425781, "margin_dpo/margin_std": 93.18145751953125, "step": 415 }, { "KL/chosen_KL_mean": -140.72235107421875, "KL/mean": -163.9053192138672, "KL/rejected_KL_mean": -187.08828735351562, "KL/std": 89.27188110351562, "epoch": 0.6288737717309146, "fcm_dpo/beta": 0.006478393450379372, "fcm_dpo/delta": -0.0766264796257019, "fcm_dpo/margin": 46.36594009399414, "fcm_dpo/q_t": 0.43208593130111694, "grad_norm": 13.682758331298828, "learning_rate": 1.8338154657749128e-07, "logits/chosen": 0.6860790252685547, "logits/rejected": 0.6344266533851624, "logps/chosen": -195.4454345703125, "logps/ref_chosen": -54.72308349609375, "logps/ref_rejected": -69.17388916015625, "logps/rejected": -256.2621765136719, "loss": 1.1979, "margin_dpo/margin_mean": 46.36594009399414, "margin_dpo/margin_std": 87.04725646972656, "step": 416 }, { "KL/chosen_KL_mean": -144.76004028320312, "KL/mean": -176.5319366455078, "KL/rejected_KL_mean": -208.30384826660156, "KL/std": 84.364990234375, "epoch": 0.6303854875283447, "fcm_dpo/beta": 0.0064516691491007805, "fcm_dpo/delta": -0.010877702385187149, "fcm_dpo/margin": 63.543800354003906, "fcm_dpo/q_t": 0.4047169089317322, "grad_norm": 12.379412651062012, "learning_rate": 1.8210807370886849e-07, "logits/chosen": 0.8253967761993408, "logits/rejected": 0.7563052177429199, "logps/chosen": -201.55130004882812, "logps/ref_chosen": -56.791259765625, "logps/ref_rejected": -68.7791748046875, "logps/rejected": -277.0830078125, "loss": 1.1207, "margin_dpo/margin_mean": 63.543800354003906, "margin_dpo/margin_std": 101.88276672363281, "step": 417 }, { "KL/chosen_KL_mean": -153.71319580078125, "KL/mean": -179.72494506835938, "KL/rejected_KL_mean": -205.73672485351562, "KL/std": 92.75643920898438, "epoch": 0.6318972033257747, "fcm_dpo/beta": 0.006342011503875256, "fcm_dpo/delta": -0.0570448562502861, "fcm_dpo/margin": 52.02351760864258, "fcm_dpo/q_t": 0.4253191351890564, "grad_norm": 13.267729759216309, "learning_rate": 1.8083649992336825e-07, "logits/chosen": 0.7561519145965576, "logits/rejected": 0.7601677775382996, "logps/chosen": -222.82118225097656, "logps/ref_chosen": -69.10798645019531, "logps/ref_rejected": -75.09132385253906, "logps/rejected": -280.82806396484375, "loss": 1.1738, "margin_dpo/margin_mean": 52.02351760864258, "margin_dpo/margin_std": 94.43706512451172, "step": 418 }, { "KL/chosen_KL_mean": -125.10302734375, "KL/mean": -161.56472778320312, "KL/rejected_KL_mean": -198.0264129638672, "KL/std": 86.49341583251953, "epoch": 0.6334089191232048, "fcm_dpo/beta": 0.006274879910051823, "fcm_dpo/delta": -0.06050105020403862, "fcm_dpo/margin": 72.92340087890625, "fcm_dpo/q_t": 0.39721137285232544, "grad_norm": 11.693915367126465, "learning_rate": 1.7956686078964255e-07, "logits/chosen": 0.6105080842971802, "logits/rejected": 0.5592924952507019, "logps/chosen": -183.27479553222656, "logps/ref_chosen": -58.1717643737793, "logps/ref_rejected": -71.67066955566406, "logps/rejected": -269.69708251953125, "loss": 1.0687, "margin_dpo/margin_mean": 72.92339324951172, "margin_dpo/margin_std": 99.07386016845703, "step": 419 }, { "KL/chosen_KL_mean": -154.8756561279297, "KL/mean": -173.73912048339844, "KL/rejected_KL_mean": -192.60260009765625, "KL/std": 89.06033325195312, "epoch": 0.6349206349206349, "fcm_dpo/beta": 0.006329146213829517, "fcm_dpo/delta": 0.05034765601158142, "fcm_dpo/margin": 37.7269401550293, "fcm_dpo/q_t": 0.4468899369239807, "grad_norm": 12.428634643554688, "learning_rate": 1.782991918222275e-07, "logits/chosen": 0.7127636671066284, "logits/rejected": 0.6661313772201538, "logps/chosen": -211.92916870117188, "logps/ref_chosen": -57.05351257324219, "logps/ref_rejected": -62.670982360839844, "logps/rejected": -255.27357482910156, "loss": 1.2612, "margin_dpo/margin_mean": 37.7269401550293, "margin_dpo/margin_std": 101.82627868652344, "step": 420 }, { "KL/chosen_KL_mean": -141.91476440429688, "KL/mean": -170.19818115234375, "KL/rejected_KL_mean": -198.48159790039062, "KL/std": 88.26173400878906, "epoch": 0.636432350718065, "fcm_dpo/beta": 0.006338524632155895, "fcm_dpo/delta": 0.04270947724580765, "fcm_dpo/margin": 56.56682586669922, "fcm_dpo/q_t": 0.42009735107421875, "grad_norm": 13.580318450927734, "learning_rate": 1.7703352848054887e-07, "logits/chosen": 0.6985728740692139, "logits/rejected": 0.6350290179252625, "logps/chosen": -199.23800659179688, "logps/ref_chosen": -57.32324981689453, "logps/ref_rejected": -75.33782958984375, "logps/rejected": -273.8194274902344, "loss": 1.1848, "margin_dpo/margin_mean": 56.56682586669922, "margin_dpo/margin_std": 114.31954956054688, "step": 421 }, { "KL/chosen_KL_mean": -125.34371948242188, "KL/mean": -161.59173583984375, "KL/rejected_KL_mean": -197.8397216796875, "KL/std": 87.74114990234375, "epoch": 0.6379440665154951, "fcm_dpo/beta": 0.006344118155539036, "fcm_dpo/delta": -0.06281741708517075, "fcm_dpo/margin": 72.49601745605469, "fcm_dpo/q_t": 0.39477962255477905, "grad_norm": 13.04092025756836, "learning_rate": 1.7576990616793137e-07, "logits/chosen": 0.6696562767028809, "logits/rejected": 0.6616383194923401, "logps/chosen": -192.40130615234375, "logps/ref_chosen": -67.05757141113281, "logps/ref_rejected": -72.12803649902344, "logps/rejected": -269.9677734375, "loss": 1.0539, "margin_dpo/margin_mean": 72.49601745605469, "margin_dpo/margin_std": 88.96668243408203, "step": 422 }, { "KL/chosen_KL_mean": -127.96492767333984, "KL/mean": -166.805908203125, "KL/rejected_KL_mean": -205.64686584472656, "KL/std": 90.0579833984375, "epoch": 0.6394557823129252, "fcm_dpo/beta": 0.0062129320576786995, "fcm_dpo/delta": -0.08701039850711823, "fcm_dpo/margin": 77.68196105957031, "fcm_dpo/q_t": 0.39081767201423645, "grad_norm": 11.37246322631836, "learning_rate": 1.745083602306071e-07, "logits/chosen": 0.7676882743835449, "logits/rejected": 0.695237398147583, "logps/chosen": -182.02659606933594, "logps/ref_chosen": -54.06167221069336, "logps/ref_rejected": -76.64092254638672, "logps/rejected": -282.28778076171875, "loss": 1.0433, "margin_dpo/margin_mean": 77.68196105957031, "margin_dpo/margin_std": 96.68798828125, "step": 423 }, { "KL/chosen_KL_mean": -137.3669891357422, "KL/mean": -174.32687377929688, "KL/rejected_KL_mean": -211.28677368164062, "KL/std": 85.37979125976562, "epoch": 0.6409674981103552, "fcm_dpo/beta": 0.00610921997576952, "fcm_dpo/delta": -0.05502733588218689, "fcm_dpo/margin": 73.91978454589844, "fcm_dpo/q_t": 0.39746588468551636, "grad_norm": 15.935237884521484, "learning_rate": 1.7324892595672804e-07, "logits/chosen": 0.6066380143165588, "logits/rejected": 0.5631780624389648, "logps/chosen": -190.97586059570312, "logps/ref_chosen": -53.60887145996094, "logps/ref_rejected": -79.2139892578125, "logps/rejected": -290.5007629394531, "loss": 1.069, "margin_dpo/margin_mean": 73.91978454589844, "margin_dpo/margin_std": 97.44770812988281, "step": 424 }, { "KL/chosen_KL_mean": -136.45765686035156, "KL/mean": -165.27542114257812, "KL/rejected_KL_mean": -194.0931854248047, "KL/std": 82.22036743164062, "epoch": 0.6424792139077853, "fcm_dpo/beta": 0.006168010178953409, "fcm_dpo/delta": 0.04615384340286255, "fcm_dpo/margin": 57.63554382324219, "fcm_dpo/q_t": 0.41899484395980835, "grad_norm": 12.926393508911133, "learning_rate": 1.7199163857537824e-07, "logits/chosen": 0.7487435936927795, "logits/rejected": 0.7196539640426636, "logps/chosen": -194.87232971191406, "logps/ref_chosen": -58.41468048095703, "logps/ref_rejected": -66.59054565429688, "logps/rejected": -260.6837158203125, "loss": 1.141, "margin_dpo/margin_mean": 57.63554382324219, "margin_dpo/margin_std": 93.35127258300781, "step": 425 }, { "KL/chosen_KL_mean": -165.10255432128906, "KL/mean": -182.7888641357422, "KL/rejected_KL_mean": -200.4751739501953, "KL/std": 86.8814468383789, "epoch": 0.6439909297052154, "fcm_dpo/beta": 0.006237998604774475, "fcm_dpo/delta": 0.0865492895245552, "fcm_dpo/margin": 35.37261199951172, "fcm_dpo/q_t": 0.449169397354126, "grad_norm": 15.806463241577148, "learning_rate": 1.7073653325558828e-07, "logits/chosen": 0.6705982089042664, "logits/rejected": 0.6754001379013062, "logps/chosen": -236.81077575683594, "logps/ref_chosen": -71.70822143554688, "logps/ref_rejected": -73.57725524902344, "logps/rejected": -274.05242919921875, "loss": 1.2862, "margin_dpo/margin_mean": 35.37261199951172, "margin_dpo/margin_std": 109.31277465820312, "step": 426 }, { "KL/chosen_KL_mean": -155.25128173828125, "KL/mean": -184.58871459960938, "KL/rejected_KL_mean": -213.92611694335938, "KL/std": 94.68376159667969, "epoch": 0.6455026455026455, "fcm_dpo/beta": 0.006314431317150593, "fcm_dpo/delta": 0.030646849423646927, "fcm_dpo/margin": 58.674861907958984, "fcm_dpo/q_t": 0.41754698753356934, "grad_norm": 13.462782859802246, "learning_rate": 1.6948364510535218e-07, "logits/chosen": 0.7108075618743896, "logits/rejected": 0.6439088582992554, "logps/chosen": -213.89404296875, "logps/ref_chosen": -58.64276885986328, "logps/ref_rejected": -86.25437927246094, "logps/rejected": -300.18048095703125, "loss": 1.1543, "margin_dpo/margin_mean": 58.67486572265625, "margin_dpo/margin_std": 106.71563720703125, "step": 427 }, { "KL/chosen_KL_mean": -148.2368621826172, "KL/mean": -181.1078338623047, "KL/rejected_KL_mean": -213.97882080078125, "KL/std": 96.19194030761719, "epoch": 0.6470143613000756, "fcm_dpo/beta": 0.006340525578707457, "fcm_dpo/delta": -0.017856691032648087, "fcm_dpo/margin": 65.74195098876953, "fcm_dpo/q_t": 0.40633878111839294, "grad_norm": 11.959637641906738, "learning_rate": 1.6823300917064458e-07, "logits/chosen": 0.6576073169708252, "logits/rejected": 0.6139048933982849, "logps/chosen": -214.83291625976562, "logps/ref_chosen": -66.5960464477539, "logps/ref_rejected": -82.3941650390625, "logps/rejected": -296.37298583984375, "loss": 1.1129, "margin_dpo/margin_mean": 65.741943359375, "margin_dpo/margin_std": 102.61549377441406, "step": 428 }, { "KL/chosen_KL_mean": -151.12130737304688, "KL/mean": -176.2879638671875, "KL/rejected_KL_mean": -201.45462036132812, "KL/std": 83.37110900878906, "epoch": 0.6485260770975056, "fcm_dpo/beta": 0.00640984158962965, "fcm_dpo/delta": 0.07926566898822784, "fcm_dpo/margin": 50.33329772949219, "fcm_dpo/q_t": 0.42531687021255493, "grad_norm": 13.677525520324707, "learning_rate": 1.669846604344412e-07, "logits/chosen": 0.6421541571617126, "logits/rejected": 0.6608945727348328, "logps/chosen": -208.13101196289062, "logps/ref_chosen": -57.00970458984375, "logps/ref_rejected": -59.86549377441406, "logps/rejected": -261.3200988769531, "loss": 1.1774, "margin_dpo/margin_mean": 50.33329772949219, "margin_dpo/margin_std": 93.18931579589844, "step": 429 }, { "KL/chosen_KL_mean": -135.48016357421875, "KL/mean": -176.63815307617188, "KL/rejected_KL_mean": -217.79617309570312, "KL/std": 85.43701934814453, "epoch": 0.6500377928949358, "fcm_dpo/beta": 0.006288270931690931, "fcm_dpo/delta": -0.1241229772567749, "fcm_dpo/margin": 82.31600952148438, "fcm_dpo/q_t": 0.38002169132232666, "grad_norm": 12.92713737487793, "learning_rate": 1.6573863381573954e-07, "logits/chosen": 0.6190842986106873, "logits/rejected": 0.6174815893173218, "logps/chosen": -195.04336547851562, "logps/ref_chosen": -59.563194274902344, "logps/ref_rejected": -70.52289581298828, "logps/rejected": -288.319091796875, "loss": 1.0235, "margin_dpo/margin_mean": 82.3160171508789, "margin_dpo/margin_std": 97.65048217773438, "step": 430 }, { "KL/chosen_KL_mean": -134.37684631347656, "KL/mean": -163.82293701171875, "KL/rejected_KL_mean": -193.26902770996094, "KL/std": 86.12620544433594, "epoch": 0.6515495086923658, "fcm_dpo/beta": 0.006274573504924774, "fcm_dpo/delta": 0.03165648132562637, "fcm_dpo/margin": 58.892181396484375, "fcm_dpo/q_t": 0.4157334268093109, "grad_norm": 12.676860809326172, "learning_rate": 1.6449496416858282e-07, "logits/chosen": 0.6868596076965332, "logits/rejected": 0.6294798254966736, "logps/chosen": -184.57717895507812, "logps/ref_chosen": -50.20032501220703, "logps/ref_rejected": -77.81680297851562, "logps/rejected": -271.0858154296875, "loss": 1.1325, "margin_dpo/margin_mean": 58.892181396484375, "margin_dpo/margin_std": 93.61036682128906, "step": 431 }, { "KL/chosen_KL_mean": -140.93447875976562, "KL/mean": -172.25730895996094, "KL/rejected_KL_mean": -203.58010864257812, "KL/std": 84.81881713867188, "epoch": 0.6530612244897959, "fcm_dpo/beta": 0.006294472608715296, "fcm_dpo/delta": 0.005910965614020824, "fcm_dpo/margin": 62.6456298828125, "fcm_dpo/q_t": 0.4095492959022522, "grad_norm": 12.92427921295166, "learning_rate": 1.632536862810844e-07, "logits/chosen": 0.7295043468475342, "logits/rejected": 0.674199640750885, "logps/chosen": -202.5972442626953, "logps/ref_chosen": -61.662757873535156, "logps/ref_rejected": -83.94496154785156, "logps/rejected": -287.52508544921875, "loss": 1.1192, "margin_dpo/margin_mean": 62.6456298828125, "margin_dpo/margin_std": 97.64552307128906, "step": 432 }, { "KL/chosen_KL_mean": -139.57357788085938, "KL/mean": -176.31478881835938, "KL/rejected_KL_mean": -213.0560302734375, "KL/std": 81.1341552734375, "epoch": 0.654572940287226, "fcm_dpo/beta": 0.0062561118975281715, "fcm_dpo/delta": -0.06256237626075745, "fcm_dpo/margin": 73.48242950439453, "fcm_dpo/q_t": 0.3936130404472351, "grad_norm": 12.761076927185059, "learning_rate": 1.6201483487445515e-07, "logits/chosen": 0.793116569519043, "logits/rejected": 0.7925975322723389, "logps/chosen": -203.30276489257812, "logps/ref_chosen": -63.72917938232422, "logps/ref_rejected": -65.8391342163086, "logps/rejected": -278.8951416015625, "loss": 1.0605, "margin_dpo/margin_mean": 73.48243713378906, "margin_dpo/margin_std": 93.43466186523438, "step": 433 }, { "KL/chosen_KL_mean": -118.41752624511719, "KL/mean": -159.06114196777344, "KL/rejected_KL_mean": -199.70474243164062, "KL/std": 89.88044738769531, "epoch": 0.656084656084656, "fcm_dpo/beta": 0.006085899658501148, "fcm_dpo/delta": -0.10128242522478104, "fcm_dpo/margin": 81.28720092773438, "fcm_dpo/q_t": 0.38656648993492126, "grad_norm": 11.475704193115234, "learning_rate": 1.6077844460203204e-07, "logits/chosen": 0.8146035671234131, "logits/rejected": 0.7479780912399292, "logps/chosen": -166.39083862304688, "logps/ref_chosen": -47.97331619262695, "logps/ref_rejected": -72.51132202148438, "logps/rejected": -272.216064453125, "loss": 1.0568, "margin_dpo/margin_mean": 81.2872085571289, "margin_dpo/margin_std": 108.73129272460938, "step": 434 }, { "KL/chosen_KL_mean": -144.13433837890625, "KL/mean": -174.63088989257812, "KL/rejected_KL_mean": -205.12741088867188, "KL/std": 85.32199096679688, "epoch": 0.6575963718820862, "fcm_dpo/beta": 0.00614683423191309, "fcm_dpo/delta": 0.025531083345413208, "fcm_dpo/margin": 60.993080139160156, "fcm_dpo/q_t": 0.41487905383110046, "grad_norm": 13.900237083435059, "learning_rate": 1.5954455004830878e-07, "logits/chosen": 0.7956072688102722, "logits/rejected": 0.7552189826965332, "logps/chosen": -201.194580078125, "logps/ref_chosen": -57.06024932861328, "logps/ref_rejected": -71.69146728515625, "logps/rejected": -276.8188781738281, "loss": 1.1362, "margin_dpo/margin_mean": 60.99307632446289, "margin_dpo/margin_std": 99.48747253417969, "step": 435 }, { "KL/chosen_KL_mean": -143.6798095703125, "KL/mean": -168.6260986328125, "KL/rejected_KL_mean": -193.57240295410156, "KL/std": 84.32862854003906, "epoch": 0.6591080876795162, "fcm_dpo/beta": 0.006208301987498999, "fcm_dpo/delta": 0.0931948646903038, "fcm_dpo/margin": 49.892601013183594, "fcm_dpo/q_t": 0.4299197793006897, "grad_norm": 14.518027305603027, "learning_rate": 1.5831318572796847e-07, "logits/chosen": 0.724181056022644, "logits/rejected": 0.664134681224823, "logps/chosen": -199.83786010742188, "logps/ref_chosen": -56.158050537109375, "logps/ref_rejected": -67.63787841796875, "logps/rejected": -261.21026611328125, "loss": 1.1982, "margin_dpo/margin_mean": 49.892601013183594, "margin_dpo/margin_std": 103.509765625, "step": 436 }, { "KL/chosen_KL_mean": -149.77796936035156, "KL/mean": -179.45657348632812, "KL/rejected_KL_mean": -209.1351776123047, "KL/std": 91.01817321777344, "epoch": 0.6606198034769464, "fcm_dpo/beta": 0.006153785157948732, "fcm_dpo/delta": -0.07142957299947739, "fcm_dpo/margin": 59.357200622558594, "fcm_dpo/q_t": 0.4173312783241272, "grad_norm": 14.787918090820312, "learning_rate": 1.5708438608491815e-07, "logits/chosen": 0.7029831409454346, "logits/rejected": 0.5732403993606567, "logps/chosen": -206.76376342773438, "logps/ref_chosen": -56.98578643798828, "logps/ref_rejected": -85.61524963378906, "logps/rejected": -294.75042724609375, "loss": 1.1768, "margin_dpo/margin_mean": 59.357200622558594, "margin_dpo/margin_std": 114.67839050292969, "step": 437 }, { "KL/chosen_KL_mean": -128.87005615234375, "KL/mean": -169.50265502929688, "KL/rejected_KL_mean": -210.13522338867188, "KL/std": 94.86117553710938, "epoch": 0.6621315192743764, "fcm_dpo/beta": 0.006083897314965725, "fcm_dpo/delta": -0.09921536594629288, "fcm_dpo/margin": 81.26517486572266, "fcm_dpo/q_t": 0.3899085223674774, "grad_norm": 12.116286277770996, "learning_rate": 1.558581854913253e-07, "logits/chosen": 0.7898980975151062, "logits/rejected": 0.7203817367553711, "logps/chosen": -170.1478271484375, "logps/ref_chosen": -41.27777862548828, "logps/ref_rejected": -65.33840942382812, "logps/rejected": -275.4736328125, "loss": 1.0404, "margin_dpo/margin_mean": 81.26518249511719, "margin_dpo/margin_std": 103.11805725097656, "step": 438 }, { "KL/chosen_KL_mean": -146.58200073242188, "KL/mean": -182.0059356689453, "KL/rejected_KL_mean": -217.42987060546875, "KL/std": 96.97052001953125, "epoch": 0.6636432350718064, "fcm_dpo/beta": 0.006045582704246044, "fcm_dpo/delta": -0.030269447714090347, "fcm_dpo/margin": 70.84788513183594, "fcm_dpo/q_t": 0.4024859666824341, "grad_norm": 12.636711120605469, "learning_rate": 1.5463461824665658e-07, "logits/chosen": 0.6318942308425903, "logits/rejected": 0.5952730178833008, "logps/chosen": -227.99964904785156, "logps/ref_chosen": -81.41764831542969, "logps/ref_rejected": -94.72309875488281, "logps/rejected": -312.1529846191406, "loss": 1.0893, "margin_dpo/margin_mean": 70.84788513183594, "margin_dpo/margin_std": 98.64799499511719, "step": 439 }, { "KL/chosen_KL_mean": -127.22040557861328, "KL/mean": -163.5577392578125, "KL/rejected_KL_mean": -199.89508056640625, "KL/std": 86.24961853027344, "epoch": 0.6651549508692366, "fcm_dpo/beta": 0.005986891686916351, "fcm_dpo/delta": -0.03685595840215683, "fcm_dpo/margin": 72.67469024658203, "fcm_dpo/q_t": 0.40108078718185425, "grad_norm": 20.813627243041992, "learning_rate": 1.534137185767178e-07, "logits/chosen": 0.7139894962310791, "logits/rejected": 0.6110374927520752, "logps/chosen": -169.7585906982422, "logps/ref_chosen": -42.538185119628906, "logps/ref_rejected": -69.78813934326172, "logps/rejected": -269.6832275390625, "loss": 1.0846, "margin_dpo/margin_mean": 72.6746826171875, "margin_dpo/margin_std": 100.51685333251953, "step": 440 }, { "KL/chosen_KL_mean": -127.33625030517578, "KL/mean": -167.41632080078125, "KL/rejected_KL_mean": -207.49636840820312, "KL/std": 93.09278869628906, "epoch": 0.6666666666666666, "fcm_dpo/beta": 0.005862266756594181, "fcm_dpo/delta": -0.07405117899179459, "fcm_dpo/margin": 80.1601333618164, "fcm_dpo/q_t": 0.39000365138053894, "grad_norm": 12.914335250854492, "learning_rate": 1.521955206326976e-07, "logits/chosen": 0.7205959558486938, "logits/rejected": 0.6202556490898132, "logps/chosen": -184.92947387695312, "logps/ref_chosen": -57.593223571777344, "logps/ref_rejected": -84.82878875732422, "logps/rejected": -292.3251647949219, "loss": 1.0276, "margin_dpo/margin_mean": 80.1601333618164, "margin_dpo/margin_std": 82.66327667236328, "step": 441 }, { "KL/chosen_KL_mean": -154.25204467773438, "KL/mean": -192.8415069580078, "KL/rejected_KL_mean": -231.43096923828125, "KL/std": 92.8477783203125, "epoch": 0.6681783824640968, "fcm_dpo/beta": 0.005807263310998678, "fcm_dpo/delta": -0.05068985000252724, "fcm_dpo/margin": 77.17893981933594, "fcm_dpo/q_t": 0.3969552516937256, "grad_norm": 13.84139347076416, "learning_rate": 1.5098005849021078e-07, "logits/chosen": 0.6708108186721802, "logits/rejected": 0.6216378808021545, "logps/chosen": -221.7132568359375, "logps/ref_chosen": -67.46121978759766, "logps/ref_rejected": -89.0693588256836, "logps/rejected": -320.50030517578125, "loss": 1.0624, "margin_dpo/margin_mean": 77.17893981933594, "margin_dpo/margin_std": 97.06315612792969, "step": 442 }, { "KL/chosen_KL_mean": -127.90887451171875, "KL/mean": -175.74288940429688, "KL/rejected_KL_mean": -223.57688903808594, "KL/std": 98.24057006835938, "epoch": 0.6696900982615268, "fcm_dpo/beta": 0.005674402695149183, "fcm_dpo/delta": -0.1511317789554596, "fcm_dpo/margin": 95.66800689697266, "fcm_dpo/q_t": 0.3759717047214508, "grad_norm": 13.66612434387207, "learning_rate": 1.4976736614834662e-07, "logits/chosen": 0.7465083599090576, "logits/rejected": 0.6766018867492676, "logps/chosen": -182.70498657226562, "logps/ref_chosen": -54.79610061645508, "logps/ref_rejected": -77.80781555175781, "logps/rejected": -301.38470458984375, "loss": 1.0016, "margin_dpo/margin_mean": 95.66800689697266, "margin_dpo/margin_std": 109.18048095703125, "step": 443 }, { "KL/chosen_KL_mean": -157.7965850830078, "KL/mean": -178.49429321289062, "KL/rejected_KL_mean": -199.19200134277344, "KL/std": 96.99405670166016, "epoch": 0.671201814058957, "fcm_dpo/beta": 0.005667074583470821, "fcm_dpo/delta": 0.03024156205356121, "fcm_dpo/margin": 41.395389556884766, "fcm_dpo/q_t": 0.4473768472671509, "grad_norm": 16.36332893371582, "learning_rate": 1.4855747752871654e-07, "logits/chosen": 0.7650264501571655, "logits/rejected": 0.6669203042984009, "logps/chosen": -216.545654296875, "logps/ref_chosen": -58.749061584472656, "logps/ref_rejected": -86.87396240234375, "logps/rejected": -286.06597900390625, "loss": 1.2555, "margin_dpo/margin_mean": 41.3953857421875, "margin_dpo/margin_std": 106.90706634521484, "step": 444 }, { "KL/chosen_KL_mean": -148.9952850341797, "KL/mean": -188.8018341064453, "KL/rejected_KL_mean": -228.60836791992188, "KL/std": 90.53689575195312, "epoch": 0.672713529856387, "fcm_dpo/beta": 0.005627226084470749, "fcm_dpo/delta": -0.05023486167192459, "fcm_dpo/margin": 79.61310577392578, "fcm_dpo/q_t": 0.3959387540817261, "grad_norm": 12.833502769470215, "learning_rate": 1.473504264745062e-07, "logits/chosen": 0.6719219088554382, "logits/rejected": 0.6625571250915527, "logps/chosen": -209.9127197265625, "logps/ref_chosen": -60.91743850708008, "logps/ref_rejected": -71.5637435913086, "logps/rejected": -300.172119140625, "loss": 1.0577, "margin_dpo/margin_mean": 79.61310577392578, "margin_dpo/margin_std": 96.70479583740234, "step": 445 }, { "KL/chosen_KL_mean": -136.00213623046875, "KL/mean": -176.71603393554688, "KL/rejected_KL_mean": -217.429931640625, "KL/std": 92.85810852050781, "epoch": 0.674225245653817, "fcm_dpo/beta": 0.005610132589936256, "fcm_dpo/delta": -0.06062261387705803, "fcm_dpo/margin": 81.42779541015625, "fcm_dpo/q_t": 0.3943191170692444, "grad_norm": 11.739607810974121, "learning_rate": 1.461462467495284e-07, "logits/chosen": 0.759577751159668, "logits/rejected": 0.6762539744377136, "logps/chosen": -184.8013916015625, "logps/ref_chosen": -48.79924774169922, "logps/ref_rejected": -71.8719482421875, "logps/rejected": -289.3018798828125, "loss": 1.0461, "margin_dpo/margin_mean": 81.42778778076172, "margin_dpo/margin_std": 87.92593383789062, "step": 446 }, { "KL/chosen_KL_mean": -132.41104125976562, "KL/mean": -179.89007568359375, "KL/rejected_KL_mean": -227.36911010742188, "KL/std": 92.40835571289062, "epoch": 0.6757369614512472, "fcm_dpo/beta": 0.005422515794634819, "fcm_dpo/delta": -0.1219148337841034, "fcm_dpo/margin": 94.95809936523438, "fcm_dpo/q_t": 0.37980031967163086, "grad_norm": 12.705660820007324, "learning_rate": 1.4494497203727843e-07, "logits/chosen": 0.6679835319519043, "logits/rejected": 0.5609793066978455, "logps/chosen": -186.09375, "logps/ref_chosen": -53.682716369628906, "logps/ref_rejected": -88.17315673828125, "logps/rejected": -315.54229736328125, "loss": 1.0157, "margin_dpo/margin_mean": 94.95809936523438, "margin_dpo/margin_std": 105.59651184082031, "step": 447 }, { "KL/chosen_KL_mean": -143.4046630859375, "KL/mean": -180.82546997070312, "KL/rejected_KL_mean": -218.2462921142578, "KL/std": 94.02133178710938, "epoch": 0.6772486772486772, "fcm_dpo/beta": 0.005405202507972717, "fcm_dpo/delta": -0.00472560990601778, "fcm_dpo/margin": 74.84162902832031, "fcm_dpo/q_t": 0.4060589671134949, "grad_norm": 9.806294441223145, "learning_rate": 1.4374663593999256e-07, "logits/chosen": 0.7528284788131714, "logits/rejected": 0.6972676515579224, "logps/chosen": -197.15591430664062, "logps/ref_chosen": -53.75125503540039, "logps/ref_rejected": -77.17623901367188, "logps/rejected": -295.42254638671875, "loss": 1.0911, "margin_dpo/margin_mean": 74.84162902832031, "margin_dpo/margin_std": 100.25143432617188, "step": 448 }, { "KL/chosen_KL_mean": -170.77175903320312, "KL/mean": -189.1611328125, "KL/rejected_KL_mean": -207.55050659179688, "KL/std": 96.70616149902344, "epoch": 0.6787603930461074, "fcm_dpo/beta": 0.005480349063873291, "fcm_dpo/delta": 0.06997599452733994, "fcm_dpo/margin": 36.778724670410156, "fcm_dpo/q_t": 0.45346102118492126, "grad_norm": 18.269245147705078, "learning_rate": 1.4255127197770707e-07, "logits/chosen": 0.5740299820899963, "logits/rejected": 0.5731356143951416, "logps/chosen": -246.59915161132812, "logps/ref_chosen": -75.82737731933594, "logps/ref_rejected": -82.20687866210938, "logps/rejected": -289.75738525390625, "loss": 1.2613, "margin_dpo/margin_mean": 36.778724670410156, "margin_dpo/margin_std": 95.16302490234375, "step": 449 }, { "KL/chosen_KL_mean": -141.65185546875, "KL/mean": -172.264404296875, "KL/rejected_KL_mean": -202.87693786621094, "KL/std": 98.33901977539062, "epoch": 0.6802721088435374, "fcm_dpo/beta": 0.005516710691154003, "fcm_dpo/delta": 0.06442365795373917, "fcm_dpo/margin": 61.22511291503906, "fcm_dpo/q_t": 0.4257189631462097, "grad_norm": 12.029897689819336, "learning_rate": 1.4135891358732205e-07, "logits/chosen": 0.8027943968772888, "logits/rejected": 0.6883209347724915, "logps/chosen": -188.767578125, "logps/ref_chosen": -47.11572265625, "logps/ref_rejected": -78.7546615600586, "logps/rejected": -281.631591796875, "loss": 1.1699, "margin_dpo/margin_mean": 61.2251091003418, "margin_dpo/margin_std": 113.70191192626953, "step": 450 }, { "KL/chosen_KL_mean": -143.9608154296875, "KL/mean": -171.2278594970703, "KL/rejected_KL_mean": -198.49488830566406, "KL/std": 92.972900390625, "epoch": 0.6817838246409675, "fcm_dpo/beta": 0.005640652030706406, "fcm_dpo/delta": 0.095095694065094, "fcm_dpo/margin": 54.53407287597656, "fcm_dpo/q_t": 0.4293200373649597, "grad_norm": 12.105690002441406, "learning_rate": 1.4016959412166437e-07, "logits/chosen": 0.6591833829879761, "logits/rejected": 0.6084069013595581, "logps/chosen": -207.31126403808594, "logps/ref_chosen": -63.350440979003906, "logps/ref_rejected": -76.28530883789062, "logps/rejected": -274.78021240234375, "loss": 1.175, "margin_dpo/margin_mean": 54.53407287597656, "margin_dpo/margin_std": 98.20162200927734, "step": 451 }, { "KL/chosen_KL_mean": -141.15440368652344, "KL/mean": -173.48129272460938, "KL/rejected_KL_mean": -205.80819702148438, "KL/std": 88.95596313476562, "epoch": 0.6832955404383976, "fcm_dpo/beta": 0.005686076357960701, "fcm_dpo/delta": 0.03361125290393829, "fcm_dpo/margin": 64.65379333496094, "fcm_dpo/q_t": 0.4162237048149109, "grad_norm": 13.642257690429688, "learning_rate": 1.3898334684855645e-07, "logits/chosen": 0.6565678119659424, "logits/rejected": 0.5726250410079956, "logps/chosen": -196.740234375, "logps/ref_chosen": -55.58583450317383, "logps/ref_rejected": -77.68738555908203, "logps/rejected": -283.4955749511719, "loss": 1.1442, "margin_dpo/margin_mean": 64.65379333496094, "margin_dpo/margin_std": 109.71839141845703, "step": 452 }, { "KL/chosen_KL_mean": -140.89617919921875, "KL/mean": -172.29718017578125, "KL/rejected_KL_mean": -203.69818115234375, "KL/std": 93.05279541015625, "epoch": 0.6848072562358276, "fcm_dpo/beta": 0.005723862908780575, "fcm_dpo/delta": 0.04205973818898201, "fcm_dpo/margin": 62.802001953125, "fcm_dpo/q_t": 0.41862282156944275, "grad_norm": 13.660717964172363, "learning_rate": 1.3780020494988445e-07, "logits/chosen": 0.6440606117248535, "logits/rejected": 0.6197322607040405, "logps/chosen": -202.67437744140625, "logps/ref_chosen": -61.778202056884766, "logps/ref_rejected": -71.51403045654297, "logps/rejected": -275.21221923828125, "loss": 1.1468, "margin_dpo/margin_mean": 62.802001953125, "margin_dpo/margin_std": 106.21121215820312, "step": 453 }, { "KL/chosen_KL_mean": -130.77272033691406, "KL/mean": -166.72314453125, "KL/rejected_KL_mean": -202.673583984375, "KL/std": 93.65628814697266, "epoch": 0.6863189720332578, "fcm_dpo/beta": 0.005729289725422859, "fcm_dpo/delta": -0.012474976480007172, "fcm_dpo/margin": 71.90084838867188, "fcm_dpo/q_t": 0.40495869517326355, "grad_norm": 12.87133502960205, "learning_rate": 1.366202015206706e-07, "logits/chosen": 0.7237143516540527, "logits/rejected": 0.6826895475387573, "logps/chosen": -182.3678741455078, "logps/ref_chosen": -51.59515380859375, "logps/ref_rejected": -63.96732711791992, "logps/rejected": -266.64093017578125, "loss": 1.0994, "margin_dpo/margin_mean": 71.90084838867188, "margin_dpo/margin_std": 104.72161865234375, "step": 454 }, { "KL/chosen_KL_mean": -151.150390625, "KL/mean": -185.60336303710938, "KL/rejected_KL_mean": -220.05633544921875, "KL/std": 93.06497192382812, "epoch": 0.6878306878306878, "fcm_dpo/beta": 0.005727289244532585, "fcm_dpo/delta": 0.0055487025529146194, "fcm_dpo/margin": 68.90592956542969, "fcm_dpo/q_t": 0.41084253787994385, "grad_norm": 13.051907539367676, "learning_rate": 1.354433695681474e-07, "logits/chosen": 0.577187180519104, "logits/rejected": 0.5470499992370605, "logps/chosen": -221.80210876464844, "logps/ref_chosen": -70.65170288085938, "logps/ref_rejected": -77.44276428222656, "logps/rejected": -297.49908447265625, "loss": 1.1057, "margin_dpo/margin_mean": 68.90592956542969, "margin_dpo/margin_std": 99.8013916015625, "step": 455 }, { "KL/chosen_KL_mean": -148.49453735351562, "KL/mean": -180.41351318359375, "KL/rejected_KL_mean": -212.33248901367188, "KL/std": 92.60049438476562, "epoch": 0.6893424036281179, "fcm_dpo/beta": 0.005755506921559572, "fcm_dpo/delta": 0.033820997923612595, "fcm_dpo/margin": 63.83796691894531, "fcm_dpo/q_t": 0.41782069206237793, "grad_norm": 15.247598648071289, "learning_rate": 1.3426974201083439e-07, "logits/chosen": 0.6207253336906433, "logits/rejected": 0.5548273921012878, "logps/chosen": -204.892822265625, "logps/ref_chosen": -56.398284912109375, "logps/ref_rejected": -82.61642456054688, "logps/rejected": -294.94891357421875, "loss": 1.1381, "margin_dpo/margin_mean": 63.837974548339844, "margin_dpo/margin_std": 105.94026947021484, "step": 456 }, { "KL/chosen_KL_mean": -146.1392059326172, "KL/mean": -180.75950622558594, "KL/rejected_KL_mean": -215.3798065185547, "KL/std": 95.18257141113281, "epoch": 0.690854119425548, "fcm_dpo/beta": 0.005760158412158489, "fcm_dpo/delta": 0.0010564979165792465, "fcm_dpo/margin": 69.24058532714844, "fcm_dpo/q_t": 0.4076615869998932, "grad_norm": 12.572127342224121, "learning_rate": 1.3309935167761717e-07, "logits/chosen": 0.834632158279419, "logits/rejected": 0.7506304979324341, "logps/chosen": -190.85977172851562, "logps/ref_chosen": -44.72057342529297, "logps/ref_rejected": -68.1158676147461, "logps/rejected": -283.49566650390625, "loss": 1.0929, "margin_dpo/margin_mean": 69.24058532714844, "margin_dpo/margin_std": 91.57593536376953, "step": 457 }, { "KL/chosen_KL_mean": -139.739990234375, "KL/mean": -176.490478515625, "KL/rejected_KL_mean": -213.24095153808594, "KL/std": 98.15718841552734, "epoch": 0.6923658352229781, "fcm_dpo/beta": 0.005762549117207527, "fcm_dpo/delta": -0.024595728144049644, "fcm_dpo/margin": 73.50098419189453, "fcm_dpo/q_t": 0.4034860134124756, "grad_norm": 12.442438125610352, "learning_rate": 1.3193223130682936e-07, "logits/chosen": 0.7277786731719971, "logits/rejected": 0.6062558889389038, "logps/chosen": -189.74566650390625, "logps/ref_chosen": -50.00569152832031, "logps/ref_rejected": -87.50015258789062, "logps/rejected": -300.7410888671875, "loss": 1.0998, "margin_dpo/margin_mean": 73.5009765625, "margin_dpo/margin_std": 109.82258605957031, "step": 458 }, { "KL/chosen_KL_mean": -135.27378845214844, "KL/mean": -179.198486328125, "KL/rejected_KL_mean": -223.1231689453125, "KL/std": 110.36168670654297, "epoch": 0.6938775510204082, "fcm_dpo/beta": 0.005706362426280975, "fcm_dpo/delta": -0.10723035037517548, "fcm_dpo/margin": 87.84938049316406, "fcm_dpo/q_t": 0.38416963815689087, "grad_norm": 12.058633804321289, "learning_rate": 1.3076841354533658e-07, "logits/chosen": 0.7243565917015076, "logits/rejected": 0.6900701522827148, "logps/chosen": -200.6517333984375, "logps/ref_chosen": -65.37794494628906, "logps/ref_rejected": -88.19244384765625, "logps/rejected": -311.31561279296875, "loss": 1.0353, "margin_dpo/margin_mean": 87.84938049316406, "margin_dpo/margin_std": 103.46025085449219, "step": 459 }, { "KL/chosen_KL_mean": -147.19268798828125, "KL/mean": -190.1608428955078, "KL/rejected_KL_mean": -233.12899780273438, "KL/std": 98.28622436523438, "epoch": 0.6953892668178382, "fcm_dpo/beta": 0.005540589801967144, "fcm_dpo/delta": -0.08044654130935669, "fcm_dpo/margin": 85.93632507324219, "fcm_dpo/q_t": 0.3923659026622772, "grad_norm": 11.548605918884277, "learning_rate": 1.2960793094762345e-07, "logits/chosen": 0.7367826104164124, "logits/rejected": 0.6124519109725952, "logps/chosen": -211.7543487548828, "logps/ref_chosen": -64.5616683959961, "logps/ref_rejected": -88.67890167236328, "logps/rejected": -321.80792236328125, "loss": 1.0464, "margin_dpo/margin_mean": 85.93632507324219, "margin_dpo/margin_std": 106.88599395751953, "step": 460 }, { "KL/chosen_KL_mean": -123.23249816894531, "KL/mean": -166.4996337890625, "KL/rejected_KL_mean": -209.76675415039062, "KL/std": 90.42996978759766, "epoch": 0.6969009826152683, "fcm_dpo/beta": 0.005424691364169121, "fcm_dpo/delta": -0.07526582479476929, "fcm_dpo/margin": 86.53425598144531, "fcm_dpo/q_t": 0.39163506031036377, "grad_norm": 11.95759105682373, "learning_rate": 1.2845081597488286e-07, "logits/chosen": 0.8077278733253479, "logits/rejected": 0.7222809791564941, "logps/chosen": -172.71041870117188, "logps/ref_chosen": -49.4779167175293, "logps/ref_rejected": -72.65262603759766, "logps/rejected": -282.41937255859375, "loss": 1.0457, "margin_dpo/margin_mean": 86.53426361083984, "margin_dpo/margin_std": 98.84943389892578, "step": 461 }, { "KL/chosen_KL_mean": -135.95791625976562, "KL/mean": -179.03619384765625, "KL/rejected_KL_mean": -222.114501953125, "KL/std": 89.76913452148438, "epoch": 0.6984126984126984, "fcm_dpo/beta": 0.005385175347328186, "fcm_dpo/delta": -0.06739608943462372, "fcm_dpo/margin": 86.15657043457031, "fcm_dpo/q_t": 0.39155688881874084, "grad_norm": 11.953049659729004, "learning_rate": 1.27297100994108e-07, "logits/chosen": 0.6929783225059509, "logits/rejected": 0.6404250860214233, "logps/chosen": -196.45301818847656, "logps/ref_chosen": -60.4951171875, "logps/ref_rejected": -74.82136535644531, "logps/rejected": -296.93585205078125, "loss": 1.0454, "margin_dpo/margin_mean": 86.15657043457031, "margin_dpo/margin_std": 100.91889190673828, "step": 462 }, { "KL/chosen_KL_mean": -162.77044677734375, "KL/mean": -188.78884887695312, "KL/rejected_KL_mean": -214.8072509765625, "KL/std": 89.98290252685547, "epoch": 0.6999244142101285, "fcm_dpo/beta": 0.005398467183113098, "fcm_dpo/delta": 0.01652398146688938, "fcm_dpo/margin": 52.03683090209961, "fcm_dpo/q_t": 0.4342408776283264, "grad_norm": 16.551116943359375, "learning_rate": 1.2614681827718695e-07, "logits/chosen": 0.6927798390388489, "logits/rejected": 0.6907912492752075, "logps/chosen": -230.45555114746094, "logps/ref_chosen": -67.68511962890625, "logps/ref_rejected": -71.32196044921875, "logps/rejected": -286.12921142578125, "loss": 1.195, "margin_dpo/margin_mean": 52.03683090209961, "margin_dpo/margin_std": 98.47354125976562, "step": 463 }, { "KL/chosen_KL_mean": -146.96664428710938, "KL/mean": -187.74020385742188, "KL/rejected_KL_mean": -228.51376342773438, "KL/std": 100.28412628173828, "epoch": 0.7014361300075586, "fcm_dpo/beta": 0.0053864228539168835, "fcm_dpo/delta": -0.04119878262281418, "fcm_dpo/margin": 81.547119140625, "fcm_dpo/q_t": 0.39934396743774414, "grad_norm": 11.005327224731445, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 0.696610689163208, "logits/rejected": 0.6663835048675537, "logps/chosen": -206.13229370117188, "logps/ref_chosen": -59.16564178466797, "logps/ref_rejected": -69.56146240234375, "logps/rejected": -298.0752258300781, "loss": 1.0882, "margin_dpo/margin_mean": 81.547119140625, "margin_dpo/margin_std": 114.78641510009766, "step": 464 }, { "KL/chosen_KL_mean": -155.0091552734375, "KL/mean": -189.71551513671875, "KL/rejected_KL_mean": -224.42185974121094, "KL/std": 96.31067657470703, "epoch": 0.7029478458049887, "fcm_dpo/beta": 0.0053637344390153885, "fcm_dpo/delta": 0.028703685849905014, "fcm_dpo/margin": 69.41270446777344, "fcm_dpo/q_t": 0.4148036241531372, "grad_norm": 11.943255424499512, "learning_rate": 1.238566782415197e-07, "logits/chosen": 0.7990638613700867, "logits/rejected": 0.7346522808074951, "logps/chosen": -213.5228271484375, "logps/ref_chosen": -58.513671875, "logps/ref_rejected": -84.31745910644531, "logps/rejected": -308.73931884765625, "loss": 1.1292, "margin_dpo/margin_mean": 69.41270446777344, "margin_dpo/margin_std": 108.23686218261719, "step": 465 }, { "KL/chosen_KL_mean": -168.52197265625, "KL/mean": -187.99380493164062, "KL/rejected_KL_mean": -207.4656524658203, "KL/std": 103.00288391113281, "epoch": 0.7044595616024187, "fcm_dpo/beta": 0.005454571917653084, "fcm_dpo/delta": 0.06419079005718231, "fcm_dpo/margin": 38.94368362426758, "fcm_dpo/q_t": 0.45021939277648926, "grad_norm": 19.09351921081543, "learning_rate": 1.2271688498291334e-07, "logits/chosen": 0.7152209877967834, "logits/rejected": 0.7208351492881775, "logps/chosen": -241.78778076171875, "logps/ref_chosen": -73.26580810546875, "logps/ref_rejected": -74.83621215820312, "logps/rejected": -282.3018798828125, "loss": 1.2646, "margin_dpo/margin_mean": 38.943687438964844, "margin_dpo/margin_std": 104.86178588867188, "step": 466 }, { "KL/chosen_KL_mean": -149.31288146972656, "KL/mean": -181.1001434326172, "KL/rejected_KL_mean": -212.88742065429688, "KL/std": 99.1036376953125, "epoch": 0.7059712773998488, "fcm_dpo/beta": 0.005494489334523678, "fcm_dpo/delta": 0.052547842264175415, "fcm_dpo/margin": 63.57453918457031, "fcm_dpo/q_t": 0.42108604311943054, "grad_norm": 11.177997589111328, "learning_rate": 1.2158065210664848e-07, "logits/chosen": 0.7435323596000671, "logits/rejected": 0.5959830284118652, "logps/chosen": -196.89236450195312, "logps/ref_chosen": -47.57947540283203, "logps/ref_rejected": -78.68522644042969, "logps/rejected": -291.5726318359375, "loss": 1.1397, "margin_dpo/margin_mean": 63.57454299926758, "margin_dpo/margin_std": 102.56330871582031, "step": 467 }, { "KL/chosen_KL_mean": -141.69357299804688, "KL/mean": -184.28823852539062, "KL/rejected_KL_mean": -226.8829345703125, "KL/std": 97.80046081542969, "epoch": 0.7074829931972789, "fcm_dpo/beta": 0.005458240397274494, "fcm_dpo/delta": -0.06813469529151917, "fcm_dpo/margin": 85.18936157226562, "fcm_dpo/q_t": 0.3926512598991394, "grad_norm": 14.956868171691895, "learning_rate": 1.204480113956011e-07, "logits/chosen": 0.6580522060394287, "logits/rejected": 0.644254207611084, "logps/chosen": -205.62135314941406, "logps/ref_chosen": -63.92778778076172, "logps/ref_rejected": -76.51626586914062, "logps/rejected": -303.3992004394531, "loss": 1.0619, "margin_dpo/margin_mean": 85.18936157226562, "margin_dpo/margin_std": 111.8876953125, "step": 468 }, { "KL/chosen_KL_mean": -144.67218017578125, "KL/mean": -183.44866943359375, "KL/rejected_KL_mean": -222.22515869140625, "KL/std": 96.74673461914062, "epoch": 0.708994708994709, "fcm_dpo/beta": 0.005391741171479225, "fcm_dpo/delta": -0.01967572420835495, "fcm_dpo/margin": 77.552978515625, "fcm_dpo/q_t": 0.40283912420272827, "grad_norm": 12.874052047729492, "learning_rate": 1.1931899453216697e-07, "logits/chosen": 0.784105658531189, "logits/rejected": 0.7682899236679077, "logps/chosen": -203.73036193847656, "logps/ref_chosen": -59.05818176269531, "logps/ref_rejected": -75.67672729492188, "logps/rejected": -297.9018859863281, "loss": 1.0699, "margin_dpo/margin_mean": 77.552978515625, "margin_dpo/margin_std": 90.48800659179688, "step": 469 }, { "KL/chosen_KL_mean": -137.43231201171875, "KL/mean": -175.70677185058594, "KL/rejected_KL_mean": -213.98123168945312, "KL/std": 93.86619567871094, "epoch": 0.7105064247921391, "fcm_dpo/beta": 0.005419607274234295, "fcm_dpo/delta": -0.015639644116163254, "fcm_dpo/margin": 76.54893493652344, "fcm_dpo/q_t": 0.4043824374675751, "grad_norm": 12.502593040466309, "learning_rate": 1.1819363309737438e-07, "logits/chosen": 0.7346839904785156, "logits/rejected": 0.6654283404350281, "logps/chosen": -185.29974365234375, "logps/ref_chosen": -47.86743927001953, "logps/ref_rejected": -65.96859741210938, "logps/rejected": -279.9498291015625, "loss": 1.0913, "margin_dpo/margin_mean": 76.54893493652344, "margin_dpo/margin_std": 105.1298828125, "step": 470 }, { "KL/chosen_KL_mean": -133.7714385986328, "KL/mean": -175.70535278320312, "KL/rejected_KL_mean": -217.63925170898438, "KL/std": 93.56643676757812, "epoch": 0.7120181405895691, "fcm_dpo/beta": 0.005351074505597353, "fcm_dpo/delta": -0.0511823333799839, "fcm_dpo/margin": 83.86780548095703, "fcm_dpo/q_t": 0.39533868432044983, "grad_norm": 12.772603988647461, "learning_rate": 1.1707195857000215e-07, "logits/chosen": 0.71797776222229, "logits/rejected": 0.6627525687217712, "logps/chosen": -191.54928588867188, "logps/ref_chosen": -57.777854919433594, "logps/ref_rejected": -73.81172180175781, "logps/rejected": -291.45098876953125, "loss": 1.0597, "margin_dpo/margin_mean": 83.8677978515625, "margin_dpo/margin_std": 102.9112548828125, "step": 471 }, { "KL/chosen_KL_mean": -135.59100341796875, "KL/mean": -168.46200561523438, "KL/rejected_KL_mean": -201.33299255371094, "KL/std": 96.90724182128906, "epoch": 0.7135298563869993, "fcm_dpo/beta": 0.0053999642841517925, "fcm_dpo/delta": 0.04624803736805916, "fcm_dpo/margin": 65.74198913574219, "fcm_dpo/q_t": 0.4202154874801636, "grad_norm": 13.063240051269531, "learning_rate": 1.1595400232569768e-07, "logits/chosen": 0.7257484197616577, "logits/rejected": 0.6784834861755371, "logps/chosen": -191.4996795654297, "logps/ref_chosen": -55.908668518066406, "logps/ref_rejected": -74.70294189453125, "logps/rejected": -276.03594970703125, "loss": 1.1649, "margin_dpo/margin_mean": 65.74198913574219, "margin_dpo/margin_std": 122.63934326171875, "step": 472 }, { "KL/chosen_KL_mean": -138.72012329101562, "KL/mean": -177.83302307128906, "KL/rejected_KL_mean": -216.94590759277344, "KL/std": 109.40580749511719, "epoch": 0.7150415721844293, "fcm_dpo/beta": 0.005374724976718426, "fcm_dpo/delta": -0.021351143717765808, "fcm_dpo/margin": 78.22579956054688, "fcm_dpo/q_t": 0.40716153383255005, "grad_norm": 14.212732315063477, "learning_rate": 1.1483979563610069e-07, "logits/chosen": 0.8261843919754028, "logits/rejected": 0.707236111164093, "logps/chosen": -192.88101196289062, "logps/ref_chosen": -54.16088104248047, "logps/ref_rejected": -92.76789855957031, "logps/rejected": -309.71380615234375, "loss": 1.1248, "margin_dpo/margin_mean": 78.22579193115234, "margin_dpo/margin_std": 131.7283477783203, "step": 473 }, { "KL/chosen_KL_mean": -140.85987854003906, "KL/mean": -175.586669921875, "KL/rejected_KL_mean": -210.31341552734375, "KL/std": 99.45855712890625, "epoch": 0.7165532879818595, "fcm_dpo/beta": 0.0053957062773406506, "fcm_dpo/delta": 0.026204703375697136, "fcm_dpo/margin": 69.45356750488281, "fcm_dpo/q_t": 0.41605931520462036, "grad_norm": 17.51959228515625, "learning_rate": 1.1372936966796709e-07, "logits/chosen": 0.7989671230316162, "logits/rejected": 0.7212754487991333, "logps/chosen": -187.54559326171875, "logps/ref_chosen": -46.685707092285156, "logps/ref_rejected": -71.44731903076172, "logps/rejected": -281.7607421875, "loss": 1.1427, "margin_dpo/margin_mean": 69.45356750488281, "margin_dpo/margin_std": 119.27337646484375, "step": 474 }, { "KL/chosen_KL_mean": -134.79974365234375, "KL/mean": -184.23065185546875, "KL/rejected_KL_mean": -233.66156005859375, "KL/std": 100.78251647949219, "epoch": 0.7180650037792895, "fcm_dpo/beta": 0.005278711207211018, "fcm_dpo/delta": -0.12924090027809143, "fcm_dpo/margin": 98.86182403564453, "fcm_dpo/q_t": 0.38123923540115356, "grad_norm": 9.865950584411621, "learning_rate": 1.126227554822985e-07, "logits/chosen": 0.6981426477432251, "logits/rejected": 0.6461870074272156, "logps/chosen": -193.28704833984375, "logps/ref_chosen": -58.4873046875, "logps/ref_rejected": -87.00187683105469, "logps/rejected": -320.6634521484375, "loss": 1.0109, "margin_dpo/margin_mean": 98.86181640625, "margin_dpo/margin_std": 111.42034912109375, "step": 475 }, { "KL/chosen_KL_mean": -164.25599670410156, "KL/mean": -195.68017578125, "KL/rejected_KL_mean": -227.10433959960938, "KL/std": 104.61061096191406, "epoch": 0.7195767195767195, "fcm_dpo/beta": 0.005338331684470177, "fcm_dpo/delta": 0.0659838393330574, "fcm_dpo/margin": 62.84834289550781, "fcm_dpo/q_t": 0.42415595054626465, "grad_norm": 13.760232925415039, "learning_rate": 1.1151998403347243e-07, "logits/chosen": 0.6337641477584839, "logits/rejected": 0.6350452899932861, "logps/chosen": -239.6376190185547, "logps/ref_chosen": -75.38162231445312, "logps/ref_rejected": -76.99822235107422, "logps/rejected": -304.1025695800781, "loss": 1.165, "margin_dpo/margin_mean": 62.84834289550781, "margin_dpo/margin_std": 112.1347427368164, "step": 476 }, { "KL/chosen_KL_mean": -164.06210327148438, "KL/mean": -198.26193237304688, "KL/rejected_KL_mean": -232.46176147460938, "KL/std": 105.01254272460938, "epoch": 0.7210884353741497, "fcm_dpo/beta": 0.005356259644031525, "fcm_dpo/delta": 0.03492579236626625, "fcm_dpo/margin": 68.399658203125, "fcm_dpo/q_t": 0.41792139410972595, "grad_norm": 13.895101547241211, "learning_rate": 1.1042108616837692e-07, "logits/chosen": 0.7479244470596313, "logits/rejected": 0.6951473355293274, "logps/chosen": -225.135498046875, "logps/ref_chosen": -61.073387145996094, "logps/ref_rejected": -81.34375, "logps/rejected": -313.8055114746094, "loss": 1.1767, "margin_dpo/margin_mean": 68.399658203125, "margin_dpo/margin_std": 135.6920166015625, "step": 477 }, { "KL/chosen_KL_mean": -145.3203125, "KL/mean": -172.40277099609375, "KL/rejected_KL_mean": -199.4852294921875, "KL/std": 91.06497192382812, "epoch": 0.7226001511715797, "fcm_dpo/beta": 0.005441304761916399, "fcm_dpo/delta": 0.10861382633447647, "fcm_dpo/margin": 54.164920806884766, "fcm_dpo/q_t": 0.4324156939983368, "grad_norm": 15.880545616149902, "learning_rate": 1.0932609262554746e-07, "logits/chosen": 0.6472866535186768, "logits/rejected": 0.6585315465927124, "logps/chosen": -202.48760986328125, "logps/ref_chosen": -57.16731643676758, "logps/ref_rejected": -53.30917739868164, "logps/rejected": -252.79440307617188, "loss": 1.201, "margin_dpo/margin_mean": 54.164920806884766, "margin_dpo/margin_std": 110.67176818847656, "step": 478 }, { "KL/chosen_KL_mean": -150.194091796875, "KL/mean": -174.72964477539062, "KL/rejected_KL_mean": -199.26522827148438, "KL/std": 91.32644653320312, "epoch": 0.7241118669690099, "fcm_dpo/beta": 0.005511588882654905, "fcm_dpo/delta": 0.022694114595651627, "fcm_dpo/margin": 49.071144104003906, "fcm_dpo/q_t": 0.4381629526615143, "grad_norm": 14.859735488891602, "learning_rate": 1.0823503403430734e-07, "logits/chosen": 0.686680793762207, "logits/rejected": 0.6446273326873779, "logps/chosen": -209.10739135742188, "logps/ref_chosen": -58.91331481933594, "logps/ref_rejected": -63.7403450012207, "logps/rejected": -263.00555419921875, "loss": 1.2189, "margin_dpo/margin_mean": 49.07115173339844, "margin_dpo/margin_std": 107.41427612304688, "step": 479 }, { "KL/chosen_KL_mean": -154.4395751953125, "KL/mean": -192.31060791015625, "KL/rejected_KL_mean": -230.181640625, "KL/std": 96.46765899658203, "epoch": 0.7256235827664399, "fcm_dpo/beta": 0.005537570454180241, "fcm_dpo/delta": -0.021367572247982025, "fcm_dpo/margin": 75.74203491210938, "fcm_dpo/q_t": 0.4044311046600342, "grad_norm": 15.87986946105957, "learning_rate": 1.0714794091391072e-07, "logits/chosen": 0.6649582386016846, "logits/rejected": 0.6550390720367432, "logps/chosen": -217.24020385742188, "logps/ref_chosen": -62.80061340332031, "logps/ref_rejected": -67.58859252929688, "logps/rejected": -297.7702331542969, "loss": 1.1043, "margin_dpo/margin_mean": 75.7420425415039, "margin_dpo/margin_std": 111.02420043945312, "step": 480 }, { "KL/chosen_KL_mean": -150.41412353515625, "KL/mean": -184.60824584960938, "KL/rejected_KL_mean": -218.80238342285156, "KL/std": 98.55257415771484, "epoch": 0.72713529856387, "fcm_dpo/beta": 0.005498744547367096, "fcm_dpo/delta": 0.02486630156636238, "fcm_dpo/margin": 68.38827514648438, "fcm_dpo/q_t": 0.4149879813194275, "grad_norm": 14.223176956176758, "learning_rate": 1.0606484367268906e-07, "logits/chosen": 0.6595680713653564, "logits/rejected": 0.66021329164505, "logps/chosen": -215.7006072998047, "logps/ref_chosen": -65.28649139404297, "logps/ref_rejected": -70.78668212890625, "logps/rejected": -289.58905029296875, "loss": 1.1309, "margin_dpo/margin_mean": 68.38827514648438, "margin_dpo/margin_std": 110.73530578613281, "step": 481 }, { "KL/chosen_KL_mean": -170.07754516601562, "KL/mean": -205.50686645507812, "KL/rejected_KL_mean": -240.93618774414062, "KL/std": 104.53562927246094, "epoch": 0.7286470143613001, "fcm_dpo/beta": 0.005535529926419258, "fcm_dpo/delta": 0.007934560999274254, "fcm_dpo/margin": 70.85865783691406, "fcm_dpo/q_t": 0.41427192091941833, "grad_norm": 16.027814865112305, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 0.6437935829162598, "logits/rejected": 0.4879078269004822, "logps/chosen": -230.98373413085938, "logps/ref_chosen": -60.906185150146484, "logps/ref_rejected": -103.44656372070312, "logps/rejected": -344.38275146484375, "loss": 1.1537, "margin_dpo/margin_mean": 70.85865783691406, "margin_dpo/margin_std": 131.55447387695312, "step": 482 }, { "KL/chosen_KL_mean": -142.55783081054688, "KL/mean": -184.89833068847656, "KL/rejected_KL_mean": -227.23883056640625, "KL/std": 96.09109497070312, "epoch": 0.7301587301587301, "fcm_dpo/beta": 0.005480245687067509, "fcm_dpo/delta": -0.06713807582855225, "fcm_dpo/margin": 84.68096160888672, "fcm_dpo/q_t": 0.393879234790802, "grad_norm": 12.341558456420898, "learning_rate": 1.0391075790138232e-07, "logits/chosen": 0.787866473197937, "logits/rejected": 0.6739017963409424, "logps/chosen": -195.74984741210938, "logps/ref_chosen": -53.192012786865234, "logps/ref_rejected": -81.83927154541016, "logps/rejected": -309.0780944824219, "loss": 1.0587, "margin_dpo/margin_mean": 84.68096160888672, "margin_dpo/margin_std": 109.34596252441406, "step": 483 }, { "KL/chosen_KL_mean": -148.7979736328125, "KL/mean": -176.161865234375, "KL/rejected_KL_mean": -203.52577209472656, "KL/std": 93.59253692626953, "epoch": 0.7316704459561603, "fcm_dpo/beta": 0.005553838796913624, "fcm_dpo/delta": 0.0983155369758606, "fcm_dpo/margin": 54.72779846191406, "fcm_dpo/q_t": 0.42891383171081543, "grad_norm": 16.872257232666016, "learning_rate": 1.0283982962570681e-07, "logits/chosen": 0.8028154373168945, "logits/rejected": 0.7692093849182129, "logps/chosen": -206.56744384765625, "logps/ref_chosen": -57.76945877075195, "logps/ref_rejected": -71.6829833984375, "logps/rejected": -275.208740234375, "loss": 1.1569, "margin_dpo/margin_mean": 54.727806091308594, "margin_dpo/margin_std": 82.58448028564453, "step": 484 }, { "KL/chosen_KL_mean": -151.40916442871094, "KL/mean": -182.75428771972656, "KL/rejected_KL_mean": -214.0994110107422, "KL/std": 96.26882934570312, "epoch": 0.7331821617535903, "fcm_dpo/beta": 0.005504989065229893, "fcm_dpo/delta": -0.04860132187604904, "fcm_dpo/margin": 62.69023132324219, "fcm_dpo/q_t": 0.42029935121536255, "grad_norm": 12.182242393493652, "learning_rate": 1.0177301773633992e-07, "logits/chosen": 0.7824962139129639, "logits/rejected": 0.7588146924972534, "logps/chosen": -208.04501342773438, "logps/ref_chosen": -56.63584899902344, "logps/ref_rejected": -70.85614013671875, "logps/rejected": -284.95556640625, "loss": 1.139, "margin_dpo/margin_mean": 62.69023132324219, "margin_dpo/margin_std": 94.56282043457031, "step": 485 }, { "KL/chosen_KL_mean": -175.21188354492188, "KL/mean": -204.45416259765625, "KL/rejected_KL_mean": -233.6964111328125, "KL/std": 109.50180053710938, "epoch": 0.7346938775510204, "fcm_dpo/beta": 0.005566400475800037, "fcm_dpo/delta": 0.07698450982570648, "fcm_dpo/margin": 58.484527587890625, "fcm_dpo/q_t": 0.4291951656341553, "grad_norm": 11.861063957214355, "learning_rate": 1.007103520743035e-07, "logits/chosen": 0.7440255880355835, "logits/rejected": 0.6231940984725952, "logps/chosen": -231.55889892578125, "logps/ref_chosen": -56.347023010253906, "logps/ref_rejected": -85.97221374511719, "logps/rejected": -319.66864013671875, "loss": 1.2028, "margin_dpo/margin_mean": 58.484535217285156, "margin_dpo/margin_std": 127.72596740722656, "step": 486 }, { "KL/chosen_KL_mean": -156.40859985351562, "KL/mean": -192.46267700195312, "KL/rejected_KL_mean": -228.51675415039062, "KL/std": 100.277587890625, "epoch": 0.7362055933484505, "fcm_dpo/beta": 0.0055840518325567245, "fcm_dpo/delta": -0.002779799047857523, "fcm_dpo/margin": 72.108154296875, "fcm_dpo/q_t": 0.41184118390083313, "grad_norm": 13.544445037841797, "learning_rate": 9.965186236464046e-08, "logits/chosen": 0.8478513360023499, "logits/rejected": 0.7844290733337402, "logps/chosen": -217.02581787109375, "logps/ref_chosen": -60.617218017578125, "logps/ref_rejected": -82.50975036621094, "logps/rejected": -311.0264892578125, "loss": 1.1213, "margin_dpo/margin_mean": 72.108154296875, "margin_dpo/margin_std": 119.23967742919922, "step": 487 }, { "KL/chosen_KL_mean": -147.1437530517578, "KL/mean": -185.4268798828125, "KL/rejected_KL_mean": -223.70999145507812, "KL/std": 96.11285400390625, "epoch": 0.7377173091458806, "fcm_dpo/beta": 0.005586473271250725, "fcm_dpo/delta": -0.029210463166236877, "fcm_dpo/margin": 76.56622314453125, "fcm_dpo/q_t": 0.4036315083503723, "grad_norm": 14.993550300598145, "learning_rate": 9.859757821558337e-08, "logits/chosen": 0.7558993697166443, "logits/rejected": 0.6950063705444336, "logps/chosen": -210.2528076171875, "logps/ref_chosen": -63.10905075073242, "logps/ref_rejected": -82.49348449707031, "logps/rejected": -306.2034606933594, "loss": 1.0866, "margin_dpo/margin_mean": 76.56622314453125, "margin_dpo/margin_std": 106.5186767578125, "step": 488 }, { "KL/chosen_KL_mean": -172.05471801757812, "KL/mean": -196.04173278808594, "KL/rejected_KL_mean": -220.0287322998047, "KL/std": 106.26925659179688, "epoch": 0.7392290249433107, "fcm_dpo/beta": 0.005663672462105751, "fcm_dpo/delta": 0.13191191852092743, "fcm_dpo/margin": 47.9740104675293, "fcm_dpo/q_t": 0.43856281042099, "grad_norm": 13.174731254577637, "learning_rate": 9.754752911772615e-08, "logits/chosen": 0.6860691905021667, "logits/rejected": 0.6383463144302368, "logps/chosen": -237.04367065429688, "logps/ref_chosen": -64.98896026611328, "logps/ref_rejected": -84.39607238769531, "logps/rejected": -304.4248046875, "loss": 1.2359, "margin_dpo/margin_mean": 47.97401428222656, "margin_dpo/margin_std": 116.38835144042969, "step": 489 }, { "KL/chosen_KL_mean": -154.94491577148438, "KL/mean": -183.457763671875, "KL/rejected_KL_mean": -211.97061157226562, "KL/std": 103.63092041015625, "epoch": 0.7407407407407407, "fcm_dpo/beta": 0.005740485154092312, "fcm_dpo/delta": 0.07500448077917099, "fcm_dpo/margin": 57.02571105957031, "fcm_dpo/q_t": 0.4276433289051056, "grad_norm": 12.981752395629883, "learning_rate": 9.650174444319956e-08, "logits/chosen": 0.7899433374404907, "logits/rejected": 0.7687404155731201, "logps/chosen": -216.85366821289062, "logps/ref_chosen": -61.90874481201172, "logps/ref_rejected": -70.58566284179688, "logps/rejected": -282.5562744140625, "loss": 1.219, "margin_dpo/margin_mean": 57.02571105957031, "margin_dpo/margin_std": 131.20643615722656, "step": 490 }, { "KL/chosen_KL_mean": -148.44052124023438, "KL/mean": -179.82432556152344, "KL/rejected_KL_mean": -211.20814514160156, "KL/std": 95.20518493652344, "epoch": 0.7422524565381708, "fcm_dpo/beta": 0.005784884095191956, "fcm_dpo/delta": 0.03768404945731163, "fcm_dpo/margin": 62.767616271972656, "fcm_dpo/q_t": 0.4178526997566223, "grad_norm": 13.990225791931152, "learning_rate": 9.546025344484868e-08, "logits/chosen": 0.6701542139053345, "logits/rejected": 0.6087601184844971, "logps/chosen": -203.9162139892578, "logps/ref_chosen": -55.47570037841797, "logps/ref_rejected": -78.70318603515625, "logps/rejected": -289.91131591796875, "loss": 1.1376, "margin_dpo/margin_mean": 62.76762008666992, "margin_dpo/margin_std": 100.01399993896484, "step": 491 }, { "KL/chosen_KL_mean": -176.35731506347656, "KL/mean": -202.4155731201172, "KL/rejected_KL_mean": -228.47381591796875, "KL/std": 105.18617248535156, "epoch": 0.7437641723356009, "fcm_dpo/beta": 0.0058359187096357346, "fcm_dpo/delta": 0.004578735213726759, "fcm_dpo/margin": 52.11651611328125, "fcm_dpo/q_t": 0.43150758743286133, "grad_norm": 15.62034797668457, "learning_rate": 9.442308525541589e-08, "logits/chosen": 0.6855677366256714, "logits/rejected": 0.6149797439575195, "logps/chosen": -243.64370727539062, "logps/ref_chosen": -67.28638458251953, "logps/ref_rejected": -82.78628540039062, "logps/rejected": -311.2601318359375, "loss": 1.2233, "margin_dpo/margin_mean": 52.11651611328125, "margin_dpo/margin_std": 118.65442657470703, "step": 492 }, { "KL/chosen_KL_mean": -144.435302734375, "KL/mean": -184.0953369140625, "KL/rejected_KL_mean": -223.75538635253906, "KL/std": 102.452880859375, "epoch": 0.745275888133031, "fcm_dpo/beta": 0.005826625041663647, "fcm_dpo/delta": -0.06588587909936905, "fcm_dpo/margin": 79.32008361816406, "fcm_dpo/q_t": 0.3962059020996094, "grad_norm": 14.322379112243652, "learning_rate": 9.339026888672468e-08, "logits/chosen": 0.6777129173278809, "logits/rejected": 0.6000720858573914, "logps/chosen": -200.36279296875, "logps/ref_chosen": -55.92750549316406, "logps/ref_rejected": -79.12149810791016, "logps/rejected": -302.87689208984375, "loss": 1.0791, "margin_dpo/margin_mean": 79.32008361816406, "margin_dpo/margin_std": 110.83384704589844, "step": 493 }, { "KL/chosen_KL_mean": -150.37063598632812, "KL/mean": -183.9254150390625, "KL/rejected_KL_mean": -217.48019409179688, "KL/std": 102.46748352050781, "epoch": 0.7467876039304611, "fcm_dpo/beta": 0.005762025713920593, "fcm_dpo/delta": 0.01378884632140398, "fcm_dpo/margin": 67.10954284667969, "fcm_dpo/q_t": 0.4137095808982849, "grad_norm": 14.783381462097168, "learning_rate": 9.236183322886945e-08, "logits/chosen": 0.6219607591629028, "logits/rejected": 0.5691405534744263, "logps/chosen": -218.32473754882812, "logps/ref_chosen": -67.95410919189453, "logps/ref_rejected": -90.50865173339844, "logps/rejected": -307.98883056640625, "loss": 1.1587, "margin_dpo/margin_mean": 67.10954284667969, "margin_dpo/margin_std": 126.5955581665039, "step": 494 }, { "KL/chosen_KL_mean": -148.03256225585938, "KL/mean": -175.156494140625, "KL/rejected_KL_mean": -202.28042602539062, "KL/std": 101.37467956542969, "epoch": 0.7482993197278912, "fcm_dpo/beta": 0.005779461935162544, "fcm_dpo/delta": 0.002404775470495224, "fcm_dpo/margin": 54.24787521362305, "fcm_dpo/q_t": 0.43343716859817505, "grad_norm": 17.738088607788086, "learning_rate": 9.133780704940594e-08, "logits/chosen": 0.7821865081787109, "logits/rejected": 0.7169306874275208, "logps/chosen": -200.65802001953125, "logps/ref_chosen": -52.62546157836914, "logps/ref_rejected": -72.06781005859375, "logps/rejected": -274.3482360839844, "loss": 1.2123, "margin_dpo/margin_mean": 54.24787139892578, "margin_dpo/margin_std": 123.74436950683594, "step": 495 }, { "KL/chosen_KL_mean": -162.66360473632812, "KL/mean": -198.964111328125, "KL/rejected_KL_mean": -235.26458740234375, "KL/std": 108.78822326660156, "epoch": 0.7498110355253212, "fcm_dpo/beta": 0.00572592206299305, "fcm_dpo/delta": -0.017500266432762146, "fcm_dpo/margin": 72.60098266601562, "fcm_dpo/q_t": 0.41438016295433044, "grad_norm": 13.382797241210938, "learning_rate": 9.031821899254797e-08, "logits/chosen": 0.7177716493606567, "logits/rejected": 0.599869966506958, "logps/chosen": -220.26092529296875, "logps/ref_chosen": -57.597320556640625, "logps/ref_rejected": -94.36127471923828, "logps/rejected": -329.6258544921875, "loss": 1.1475, "margin_dpo/margin_mean": 72.6009750366211, "margin_dpo/margin_std": 137.6909942626953, "step": 496 }, { "KL/chosen_KL_mean": -162.68719482421875, "KL/mean": -200.897705078125, "KL/rejected_KL_mean": -239.1082305908203, "KL/std": 102.14337921142578, "epoch": 0.7513227513227513, "fcm_dpo/beta": 0.00572133157402277, "fcm_dpo/delta": -0.03899545967578888, "fcm_dpo/margin": 76.42105102539062, "fcm_dpo/q_t": 0.399300217628479, "grad_norm": 12.679224967956543, "learning_rate": 8.930309757836516e-08, "logits/chosen": 0.7402607798576355, "logits/rejected": 0.7042181491851807, "logps/chosen": -235.47714233398438, "logps/ref_chosen": -72.78994750976562, "logps/ref_rejected": -89.48483276367188, "logps/rejected": -328.59307861328125, "loss": 1.0908, "margin_dpo/margin_mean": 76.42105102539062, "margin_dpo/margin_std": 111.14985656738281, "step": 497 }, { "KL/chosen_KL_mean": -147.01861572265625, "KL/mean": -184.81727600097656, "KL/rejected_KL_mean": -222.61593627929688, "KL/std": 96.9556884765625, "epoch": 0.7528344671201814, "fcm_dpo/beta": 0.005685499403625727, "fcm_dpo/delta": -0.0311785489320755, "fcm_dpo/margin": 75.59732055664062, "fcm_dpo/q_t": 0.40061208605766296, "grad_norm": 14.99083423614502, "learning_rate": 8.829247120198563e-08, "logits/chosen": 0.7274049520492554, "logits/rejected": 0.7008908987045288, "logps/chosen": -215.38433837890625, "logps/ref_chosen": -68.36572265625, "logps/ref_rejected": -71.28846740722656, "logps/rejected": -293.9044189453125, "loss": 1.0826, "margin_dpo/margin_mean": 75.59732055664062, "margin_dpo/margin_std": 103.3454360961914, "step": 498 }, { "KL/chosen_KL_mean": -145.50221252441406, "KL/mean": -184.955322265625, "KL/rejected_KL_mean": -224.40847778320312, "KL/std": 104.96760559082031, "epoch": 0.7543461829176115, "fcm_dpo/beta": 0.0056427340023219585, "fcm_dpo/delta": -0.047339845448732376, "fcm_dpo/margin": 78.90623474121094, "fcm_dpo/q_t": 0.40204769372940063, "grad_norm": 16.080547332763672, "learning_rate": 8.728636813280163e-08, "logits/chosen": 0.6872826814651489, "logits/rejected": 0.6203812956809998, "logps/chosen": -207.41104125976562, "logps/ref_chosen": -61.90882873535156, "logps/ref_rejected": -91.9411392211914, "logps/rejected": -316.349609375, "loss": 1.1277, "margin_dpo/margin_mean": 78.90623474121094, "margin_dpo/margin_std": 136.77581787109375, "step": 499 }, { "KL/chosen_KL_mean": -152.29983520507812, "KL/mean": -185.60238647460938, "KL/rejected_KL_mean": -218.90492248535156, "KL/std": 94.96754455566406, "epoch": 0.7558578987150416, "fcm_dpo/beta": 0.005632858257740736, "fcm_dpo/delta": 0.025705356150865555, "fcm_dpo/margin": 66.60507202148438, "fcm_dpo/q_t": 0.41267120838165283, "grad_norm": 15.23499584197998, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.6591007709503174, "logits/rejected": 0.6626341342926025, "logps/chosen": -222.52566528320312, "logps/ref_chosen": -70.225830078125, "logps/ref_rejected": -71.72203063964844, "logps/rejected": -290.626953125, "loss": 1.1633, "margin_dpo/margin_mean": 66.6050796508789, "margin_dpo/margin_std": 125.03921508789062, "step": 500 }, { "KL/chosen_KL_mean": -153.7359619140625, "KL/mean": -183.51657104492188, "KL/rejected_KL_mean": -213.29718017578125, "KL/std": 100.77657318115234, "epoch": 0.7573696145124716, "fcm_dpo/beta": 0.0057217953726649284, "fcm_dpo/delta": 0.06101294606924057, "fcm_dpo/margin": 59.56121063232422, "fcm_dpo/q_t": 0.42077404260635376, "grad_norm": 11.977298736572266, "learning_rate": 8.528784436016878e-08, "logits/chosen": 0.7040742635726929, "logits/rejected": 0.7069046497344971, "logps/chosen": -218.33477783203125, "logps/ref_chosen": -64.59880828857422, "logps/ref_rejected": -70.59329223632812, "logps/rejected": -283.8904724121094, "loss": 1.1291, "margin_dpo/margin_mean": 59.56120681762695, "margin_dpo/margin_std": 83.0285873413086, "step": 501 }, { "KL/chosen_KL_mean": -152.78558349609375, "KL/mean": -185.839599609375, "KL/rejected_KL_mean": -218.8936309814453, "KL/std": 106.23690795898438, "epoch": 0.7588813303099018, "fcm_dpo/beta": 0.005758670158684254, "fcm_dpo/delta": 0.019846642389893532, "fcm_dpo/margin": 66.10803985595703, "fcm_dpo/q_t": 0.41289129853248596, "grad_norm": 16.029478073120117, "learning_rate": 8.4295479559726e-08, "logits/chosen": 0.7329978942871094, "logits/rejected": 0.6826174855232239, "logps/chosen": -218.25221252441406, "logps/ref_chosen": -65.46662902832031, "logps/ref_rejected": -90.22233581542969, "logps/rejected": -309.115966796875, "loss": 1.1207, "margin_dpo/margin_mean": 66.10804748535156, "margin_dpo/margin_std": 100.45468139648438, "step": 502 }, { "KL/chosen_KL_mean": -138.15911865234375, "KL/mean": -171.57608032226562, "KL/rejected_KL_mean": -204.9930419921875, "KL/std": 94.16381072998047, "epoch": 0.7603930461073318, "fcm_dpo/beta": 0.005755226127803326, "fcm_dpo/delta": 0.01596178486943245, "fcm_dpo/margin": 66.83392333984375, "fcm_dpo/q_t": 0.41114452481269836, "grad_norm": 12.258625030517578, "learning_rate": 8.330774987092712e-08, "logits/chosen": 0.7005965709686279, "logits/rejected": 0.7042487263679504, "logps/chosen": -189.993896484375, "logps/ref_chosen": -51.83476257324219, "logps/ref_rejected": -57.62522506713867, "logps/rejected": -262.6182861328125, "loss": 1.1319, "margin_dpo/margin_mean": 66.83393096923828, "margin_dpo/margin_std": 108.51680755615234, "step": 503 }, { "KL/chosen_KL_mean": -142.242431640625, "KL/mean": -186.28753662109375, "KL/rejected_KL_mean": -230.33262634277344, "KL/std": 93.79852294921875, "epoch": 0.7619047619047619, "fcm_dpo/beta": 0.005709344986826181, "fcm_dpo/delta": -0.10849276185035706, "fcm_dpo/margin": 88.09019470214844, "fcm_dpo/q_t": 0.38354435563087463, "grad_norm": 13.861669540405273, "learning_rate": 8.232468292269479e-08, "logits/chosen": 0.705795407295227, "logits/rejected": 0.6852295398712158, "logps/chosen": -210.8936309814453, "logps/ref_chosen": -68.65119934082031, "logps/ref_rejected": -77.91394805908203, "logps/rejected": -308.24658203125, "loss": 1.0176, "margin_dpo/margin_mean": 88.0902099609375, "margin_dpo/margin_std": 94.9272689819336, "step": 504 }, { "KL/chosen_KL_mean": -150.25692749023438, "KL/mean": -179.53431701660156, "KL/rejected_KL_mean": -208.81167602539062, "KL/std": 107.07967376708984, "epoch": 0.763416477702192, "fcm_dpo/beta": 0.005603378638625145, "fcm_dpo/delta": -0.034925676882267, "fcm_dpo/margin": 58.55474853515625, "fcm_dpo/q_t": 0.42683646082878113, "grad_norm": 13.413351058959961, "learning_rate": 8.134630621352483e-08, "logits/chosen": 0.7296899557113647, "logits/rejected": 0.6891266107559204, "logps/chosen": -210.2557830810547, "logps/ref_chosen": -59.99884796142578, "logps/ref_rejected": -76.88048553466797, "logps/rejected": -285.6921691894531, "loss": 1.1942, "margin_dpo/margin_mean": 58.55474853515625, "margin_dpo/margin_std": 119.70396423339844, "step": 505 }, { "KL/chosen_KL_mean": -148.57571411132812, "KL/mean": -180.71624755859375, "KL/rejected_KL_mean": -212.85678100585938, "KL/std": 95.81402587890625, "epoch": 0.764928193499622, "fcm_dpo/beta": 0.005625975783914328, "fcm_dpo/delta": 0.03979700803756714, "fcm_dpo/margin": 64.28105163574219, "fcm_dpo/q_t": 0.4171451926231384, "grad_norm": 14.383041381835938, "learning_rate": 8.037264711071698e-08, "logits/chosen": 0.6947674751281738, "logits/rejected": 0.6782245635986328, "logps/chosen": -218.6470184326172, "logps/ref_chosen": -70.07130432128906, "logps/ref_rejected": -82.03775024414062, "logps/rejected": -294.89453125, "loss": 1.1652, "margin_dpo/margin_mean": 64.28105163574219, "margin_dpo/margin_std": 120.19103240966797, "step": 506 }, { "KL/chosen_KL_mean": -161.39913940429688, "KL/mean": -196.85897827148438, "KL/rejected_KL_mean": -232.31884765625, "KL/std": 107.52017211914062, "epoch": 0.7664399092970522, "fcm_dpo/beta": 0.005623640492558479, "fcm_dpo/delta": 0.0008153766393661499, "fcm_dpo/margin": 70.91970825195312, "fcm_dpo/q_t": 0.4142609238624573, "grad_norm": 14.372379302978516, "learning_rate": 7.940373284960933e-08, "logits/chosen": 0.7300077676773071, "logits/rejected": 0.6774189472198486, "logps/chosen": -233.40615844726562, "logps/ref_chosen": -72.00703430175781, "logps/ref_rejected": -93.94987487792969, "logps/rejected": -326.26873779296875, "loss": 1.1467, "margin_dpo/margin_mean": 70.91970825195312, "margin_dpo/margin_std": 127.12944793701172, "step": 507 }, { "KL/chosen_KL_mean": -147.455322265625, "KL/mean": -188.34677124023438, "KL/rejected_KL_mean": -229.2382049560547, "KL/std": 110.47491455078125, "epoch": 0.7679516250944822, "fcm_dpo/beta": 0.0056335581466555595, "fcm_dpo/delta": -0.06416130065917969, "fcm_dpo/margin": 81.78291320800781, "fcm_dpo/q_t": 0.3971662223339081, "grad_norm": 16.375459671020508, "learning_rate": 7.843959053281663e-08, "logits/chosen": 0.6673075556755066, "logits/rejected": 0.5410950183868408, "logps/chosen": -207.67523193359375, "logps/ref_chosen": -60.21992492675781, "logps/ref_rejected": -95.9200668334961, "logps/rejected": -325.15826416015625, "loss": 1.0862, "margin_dpo/margin_mean": 81.78291320800781, "margin_dpo/margin_std": 118.78164672851562, "step": 508 }, { "KL/chosen_KL_mean": -156.5254364013672, "KL/mean": -188.06759643554688, "KL/rejected_KL_mean": -219.60977172851562, "KL/std": 99.26644897460938, "epoch": 0.7694633408919124, "fcm_dpo/beta": 0.0056020780466496944, "fcm_dpo/delta": 0.04828350618481636, "fcm_dpo/margin": 63.08431625366211, "fcm_dpo/q_t": 0.41927778720855713, "grad_norm": 16.41911506652832, "learning_rate": 7.748024712947204e-08, "logits/chosen": 0.647754967212677, "logits/rejected": 0.6242961883544922, "logps/chosen": -222.7956085205078, "logps/ref_chosen": -66.27017211914062, "logps/ref_rejected": -71.73065185546875, "logps/rejected": -291.34039306640625, "loss": 1.161, "margin_dpo/margin_mean": 63.084320068359375, "margin_dpo/margin_std": 113.75473022460938, "step": 509 }, { "KL/chosen_KL_mean": -155.71856689453125, "KL/mean": -196.0922088623047, "KL/rejected_KL_mean": -236.46585083007812, "KL/std": 108.44694519042969, "epoch": 0.7709750566893424, "fcm_dpo/beta": 0.005561854690313339, "fcm_dpo/delta": -0.05209430307149887, "fcm_dpo/margin": 80.74728393554688, "fcm_dpo/q_t": 0.40340501070022583, "grad_norm": 13.982592582702637, "learning_rate": 7.652572947447272e-08, "logits/chosen": 0.8070030808448792, "logits/rejected": 0.7064452767372131, "logps/chosen": -209.26344299316406, "logps/ref_chosen": -53.54487609863281, "logps/ref_rejected": -91.36648559570312, "logps/rejected": -327.83233642578125, "loss": 1.1192, "margin_dpo/margin_mean": 80.7472915649414, "margin_dpo/margin_std": 137.96044921875, "step": 510 }, { "KL/chosen_KL_mean": -147.0612335205078, "KL/mean": -192.7187042236328, "KL/rejected_KL_mean": -238.37619018554688, "KL/std": 98.8441162109375, "epoch": 0.7724867724867724, "fcm_dpo/beta": 0.005492490716278553, "fcm_dpo/delta": -0.10680200159549713, "fcm_dpo/margin": 91.31495666503906, "fcm_dpo/q_t": 0.3846975862979889, "grad_norm": 17.924640655517578, "learning_rate": 7.557606426772961e-08, "logits/chosen": 0.7626087665557861, "logits/rejected": 0.7006202340126038, "logps/chosen": -202.90560913085938, "logps/ref_chosen": -55.844383239746094, "logps/ref_rejected": -86.49819946289062, "logps/rejected": -324.8743896484375, "loss": 1.0305, "margin_dpo/margin_mean": 91.31495666503906, "margin_dpo/margin_std": 107.77976989746094, "step": 511 }, { "KL/chosen_KL_mean": -152.10626220703125, "KL/mean": -180.44918823242188, "KL/rejected_KL_mean": -208.79208374023438, "KL/std": 92.8382339477539, "epoch": 0.7739984882842026, "fcm_dpo/beta": 0.005513361655175686, "fcm_dpo/delta": 0.09038425981998444, "fcm_dpo/margin": 56.68581771850586, "fcm_dpo/q_t": 0.427639365196228, "grad_norm": 19.25876235961914, "learning_rate": 7.463127807341966e-08, "logits/chosen": 0.6121835708618164, "logits/rejected": 0.608819305896759, "logps/chosen": -213.75930786132812, "logps/ref_chosen": -61.653038024902344, "logps/ref_rejected": -72.83148193359375, "logps/rejected": -281.62359619140625, "loss": 1.1948, "margin_dpo/margin_mean": 56.685821533203125, "margin_dpo/margin_std": 114.38213348388672, "step": 512 }, { "KL/chosen_KL_mean": -135.70703125, "KL/mean": -172.83096313476562, "KL/rejected_KL_mean": -209.95486450195312, "KL/std": 95.25938415527344, "epoch": 0.7755102040816326, "fcm_dpo/beta": 0.005531280301511288, "fcm_dpo/delta": -0.011300716549158096, "fcm_dpo/margin": 74.24784088134766, "fcm_dpo/q_t": 0.40637362003326416, "grad_norm": 11.144159317016602, "learning_rate": 7.369139731924401e-08, "logits/chosen": 0.8424023389816284, "logits/rejected": 0.7836489677429199, "logps/chosen": -186.55960083007812, "logps/ref_chosen": -50.85256576538086, "logps/ref_rejected": -69.21754455566406, "logps/rejected": -279.17242431640625, "loss": 1.0914, "margin_dpo/margin_mean": 74.24784851074219, "margin_dpo/margin_std": 101.40265655517578, "step": 513 }, { "KL/chosen_KL_mean": -151.7583770751953, "KL/mean": -193.6513214111328, "KL/rejected_KL_mean": -235.54425048828125, "KL/std": 103.4957046508789, "epoch": 0.7770219198790628, "fcm_dpo/beta": 0.0055114515125751495, "fcm_dpo/delta": -0.0648510754108429, "fcm_dpo/margin": 83.785888671875, "fcm_dpo/q_t": 0.39495134353637695, "grad_norm": 16.04780387878418, "learning_rate": 7.275644829568747e-08, "logits/chosen": 0.7463619709014893, "logits/rejected": 0.7119932174682617, "logps/chosen": -221.143310546875, "logps/ref_chosen": -69.38493347167969, "logps/ref_rejected": -83.32447814941406, "logps/rejected": -318.86871337890625, "loss": 1.0738, "margin_dpo/margin_mean": 83.78589630126953, "margin_dpo/margin_std": 116.66566467285156, "step": 514 }, { "KL/chosen_KL_mean": -160.38107299804688, "KL/mean": -193.44113159179688, "KL/rejected_KL_mean": -226.501220703125, "KL/std": 97.73841094970703, "epoch": 0.7785336356764928, "fcm_dpo/beta": 0.00549755385145545, "fcm_dpo/delta": 0.03789468854665756, "fcm_dpo/margin": 66.12013244628906, "fcm_dpo/q_t": 0.41624802350997925, "grad_norm": 15.595403671264648, "learning_rate": 7.182645715528435e-08, "logits/chosen": 0.7761479616165161, "logits/rejected": 0.6886708736419678, "logps/chosen": -214.068115234375, "logps/ref_chosen": -53.687034606933594, "logps/ref_rejected": -83.59614562988281, "logps/rejected": -310.09735107421875, "loss": 1.1456, "margin_dpo/margin_mean": 66.12013244628906, "margin_dpo/margin_std": 112.26373291015625, "step": 515 }, { "KL/chosen_KL_mean": -137.80105590820312, "KL/mean": -169.00758361816406, "KL/rejected_KL_mean": -200.21409606933594, "KL/std": 96.8656234741211, "epoch": 0.780045351473923, "fcm_dpo/beta": 0.0055713956244289875, "fcm_dpo/delta": 0.05386962741613388, "fcm_dpo/margin": 62.41304016113281, "fcm_dpo/q_t": 0.4193543493747711, "grad_norm": 15.21324348449707, "learning_rate": 7.090144991188568e-08, "logits/chosen": 0.7267540693283081, "logits/rejected": 0.6865274906158447, "logps/chosen": -194.70278930664062, "logps/ref_chosen": -56.9017219543457, "logps/ref_rejected": -67.83477783203125, "logps/rejected": -268.04888916015625, "loss": 1.1665, "margin_dpo/margin_mean": 62.41304016113281, "margin_dpo/margin_std": 115.04786682128906, "step": 516 }, { "KL/chosen_KL_mean": -165.46522521972656, "KL/mean": -189.28358459472656, "KL/rejected_KL_mean": -213.1019287109375, "KL/std": 102.19977569580078, "epoch": 0.781557067271353, "fcm_dpo/beta": 0.005589386448264122, "fcm_dpo/delta": 0.031279418617486954, "fcm_dpo/margin": 47.63669967651367, "fcm_dpo/q_t": 0.4400600790977478, "grad_norm": 15.077012062072754, "learning_rate": 6.998145243993284e-08, "logits/chosen": 0.7679086923599243, "logits/rejected": 0.7680100798606873, "logps/chosen": -227.24037170410156, "logps/ref_chosen": -61.775142669677734, "logps/ref_rejected": -62.88270950317383, "logps/rejected": -275.9846496582031, "loss": 1.224, "margin_dpo/margin_mean": 47.63670349121094, "margin_dpo/margin_std": 107.50513458251953, "step": 517 }, { "KL/chosen_KL_mean": -139.8690185546875, "KL/mean": -174.25653076171875, "KL/rejected_KL_mean": -208.64401245117188, "KL/std": 99.41122436523438, "epoch": 0.783068783068783, "fcm_dpo/beta": 0.00561901181936264, "fcm_dpo/delta": 0.014096262864768505, "fcm_dpo/margin": 68.77500915527344, "fcm_dpo/q_t": 0.4143136441707611, "grad_norm": 13.11536979675293, "learning_rate": 6.906649047373245e-08, "logits/chosen": 0.6999236345291138, "logits/rejected": 0.6547974348068237, "logps/chosen": -201.89425659179688, "logps/ref_chosen": -62.02523422241211, "logps/ref_rejected": -79.06085205078125, "logps/rejected": -287.7048645019531, "loss": 1.1285, "margin_dpo/margin_mean": 68.77500915527344, "margin_dpo/margin_std": 112.69450378417969, "step": 518 }, { "KL/chosen_KL_mean": -170.07679748535156, "KL/mean": -190.23263549804688, "KL/rejected_KL_mean": -210.38848876953125, "KL/std": 101.8365478515625, "epoch": 0.7845804988662132, "fcm_dpo/beta": 0.005660324357450008, "fcm_dpo/delta": 0.06647325307130814, "fcm_dpo/margin": 40.31169891357422, "fcm_dpo/q_t": 0.44826728105545044, "grad_norm": 22.8746337890625, "learning_rate": 6.815658960673781e-08, "logits/chosen": 0.7596914172172546, "logits/rejected": 0.7081311941146851, "logps/chosen": -231.68316650390625, "logps/ref_chosen": -61.60636901855469, "logps/ref_rejected": -74.50727844238281, "logps/rejected": -284.895751953125, "loss": 1.3091, "margin_dpo/margin_mean": 40.31169891357422, "margin_dpo/margin_std": 134.94049072265625, "step": 519 }, { "KL/chosen_KL_mean": -155.76344299316406, "KL/mean": -185.70492553710938, "KL/rejected_KL_mean": -215.64642333984375, "KL/std": 101.99075317382812, "epoch": 0.7860922146636432, "fcm_dpo/beta": 0.005716031417250633, "fcm_dpo/delta": 0.059424035251140594, "fcm_dpo/margin": 59.88298797607422, "fcm_dpo/q_t": 0.4232429265975952, "grad_norm": 15.03718090057373, "learning_rate": 6.725177529083209e-08, "logits/chosen": 0.7997463345527649, "logits/rejected": 0.7399656772613525, "logps/chosen": -218.63687133789062, "logps/ref_chosen": -62.87343215942383, "logps/ref_rejected": -76.505615234375, "logps/rejected": -292.15203857421875, "loss": 1.1605, "margin_dpo/margin_mean": 59.88298797607422, "margin_dpo/margin_std": 105.26655578613281, "step": 520 }, { "KL/chosen_KL_mean": -153.63787841796875, "KL/mean": -197.2700653076172, "KL/rejected_KL_mean": -240.90225219726562, "KL/std": 100.7902603149414, "epoch": 0.7876039304610734, "fcm_dpo/beta": 0.005690417252480984, "fcm_dpo/delta": -0.10149868577718735, "fcm_dpo/margin": 87.26436614990234, "fcm_dpo/q_t": 0.3881688117980957, "grad_norm": 12.79781436920166, "learning_rate": 6.63520728356167e-08, "logits/chosen": 0.5910981297492981, "logits/rejected": 0.5132564306259155, "logps/chosen": -217.8445587158203, "logps/ref_chosen": -64.20668029785156, "logps/ref_rejected": -92.28083038330078, "logps/rejected": -333.18310546875, "loss": 1.0456, "margin_dpo/margin_mean": 87.26436614990234, "margin_dpo/margin_std": 114.09363555908203, "step": 521 }, { "KL/chosen_KL_mean": -154.46090698242188, "KL/mean": -180.5797576904297, "KL/rejected_KL_mean": -206.6986083984375, "KL/std": 101.05316162109375, "epoch": 0.7891156462585034, "fcm_dpo/beta": 0.005724855698645115, "fcm_dpo/delta": 0.10419190675020218, "fcm_dpo/margin": 52.237709045410156, "fcm_dpo/q_t": 0.43299511075019836, "grad_norm": 15.916192054748535, "learning_rate": 6.545750740770336e-08, "logits/chosen": 0.7318228483200073, "logits/rejected": 0.7229518294334412, "logps/chosen": -212.83062744140625, "logps/ref_chosen": -58.369720458984375, "logps/ref_rejected": -68.79248046875, "logps/rejected": -275.4910888671875, "loss": 1.2419, "margin_dpo/margin_mean": 52.237709045410156, "margin_dpo/margin_std": 131.13580322265625, "step": 522 }, { "KL/chosen_KL_mean": -159.59005737304688, "KL/mean": -190.48709106445312, "KL/rejected_KL_mean": -221.38412475585938, "KL/std": 101.59083557128906, "epoch": 0.7906273620559335, "fcm_dpo/beta": 0.005801432300359011, "fcm_dpo/delta": 0.04307107254862785, "fcm_dpo/margin": 61.794071197509766, "fcm_dpo/q_t": 0.41679686307907104, "grad_norm": 18.064197540283203, "learning_rate": 6.456810403001012e-08, "logits/chosen": 0.7310836315155029, "logits/rejected": 0.6016096472740173, "logps/chosen": -225.3032989501953, "logps/ref_chosen": -65.71324157714844, "logps/ref_rejected": -91.98896789550781, "logps/rejected": -313.37310791015625, "loss": 1.1681, "margin_dpo/margin_mean": 61.7940673828125, "margin_dpo/margin_std": 117.1754150390625, "step": 523 }, { "KL/chosen_KL_mean": -137.47650146484375, "KL/mean": -170.70718383789062, "KL/rejected_KL_mean": -203.9378662109375, "KL/std": 97.53777313232422, "epoch": 0.7921390778533636, "fcm_dpo/beta": 0.005849760957062244, "fcm_dpo/delta": 0.011371836066246033, "fcm_dpo/margin": 66.46137237548828, "fcm_dpo/q_t": 0.4108603000640869, "grad_norm": 15.16220474243164, "learning_rate": 6.368388758106134e-08, "logits/chosen": 0.611961841583252, "logits/rejected": 0.5869717597961426, "logps/chosen": -213.82774353027344, "logps/ref_chosen": -76.35124969482422, "logps/ref_rejected": -89.96072387695312, "logps/rejected": -293.89862060546875, "loss": 1.1182, "margin_dpo/margin_mean": 66.46138000488281, "margin_dpo/margin_std": 101.52986145019531, "step": 524 }, { "KL/chosen_KL_mean": -156.3553009033203, "KL/mean": -182.8892822265625, "KL/rejected_KL_mean": -209.4232940673828, "KL/std": 98.40839385986328, "epoch": 0.7936507936507936, "fcm_dpo/beta": 0.005908666644245386, "fcm_dpo/delta": 0.08926425874233246, "fcm_dpo/margin": 53.06797790527344, "fcm_dpo/q_t": 0.43097251653671265, "grad_norm": 19.45557975769043, "learning_rate": 6.280488279429185e-08, "logits/chosen": 0.5528122186660767, "logits/rejected": 0.5484282970428467, "logps/chosen": -231.85110473632812, "logps/ref_chosen": -75.49578857421875, "logps/ref_rejected": -84.04852294921875, "logps/rejected": -293.4718017578125, "loss": 1.2106, "margin_dpo/margin_mean": 53.06797790527344, "margin_dpo/margin_std": 118.72987365722656, "step": 525 }, { "KL/chosen_KL_mean": -164.54603576660156, "KL/mean": -188.56399536132812, "KL/rejected_KL_mean": -212.58197021484375, "KL/std": 100.05648040771484, "epoch": 0.7951625094482238, "fcm_dpo/beta": 0.005912186577916145, "fcm_dpo/delta": -0.021277720108628273, "fcm_dpo/margin": 48.035945892333984, "fcm_dpo/q_t": 0.43512216210365295, "grad_norm": 14.809891700744629, "learning_rate": 6.193111425735515e-08, "logits/chosen": 0.7245358228683472, "logits/rejected": 0.6518833041191101, "logps/chosen": -225.83843994140625, "logps/ref_chosen": -61.29241943359375, "logps/ref_rejected": -82.47763061523438, "logps/rejected": -295.0596008300781, "loss": 1.2179, "margin_dpo/margin_mean": 48.035945892333984, "margin_dpo/margin_std": 104.76473999023438, "step": 526 }, { "KL/chosen_KL_mean": -172.33621215820312, "KL/mean": -193.9043731689453, "KL/rejected_KL_mean": -215.47250366210938, "KL/std": 99.20576477050781, "epoch": 0.7966742252456538, "fcm_dpo/beta": 0.005930366460233927, "fcm_dpo/delta": 0.03065553866326809, "fcm_dpo/margin": 43.13629150390625, "fcm_dpo/q_t": 0.442903995513916, "grad_norm": 16.849000930786133, "learning_rate": 6.106260641143546e-08, "logits/chosen": 0.7592703104019165, "logits/rejected": 0.6743391752243042, "logps/chosen": -233.808837890625, "logps/ref_chosen": -61.472625732421875, "logps/ref_rejected": -90.52831268310547, "logps/rejected": -306.00079345703125, "loss": 1.2542, "margin_dpo/margin_mean": 43.13629150390625, "margin_dpo/margin_std": 113.23367309570312, "step": 527 }, { "KL/chosen_KL_mean": -151.78260803222656, "KL/mean": -176.40643310546875, "KL/rejected_KL_mean": -201.03025817871094, "KL/std": 96.38426208496094, "epoch": 0.7981859410430839, "fcm_dpo/beta": 0.006025585811585188, "fcm_dpo/delta": 0.10654733330011368, "fcm_dpo/margin": 49.24766540527344, "fcm_dpo/q_t": 0.4344805181026459, "grad_norm": 16.57709503173828, "learning_rate": 6.019938355056422e-08, "logits/chosen": 0.6439261436462402, "logits/rejected": 0.5666943192481995, "logps/chosen": -210.57461547851562, "logps/ref_chosen": -58.792015075683594, "logps/ref_rejected": -71.82516479492188, "logps/rejected": -272.85540771484375, "loss": 1.244, "margin_dpo/margin_mean": 49.24766540527344, "margin_dpo/margin_std": 123.16279602050781, "step": 528 }, { "KL/chosen_KL_mean": -143.96681213378906, "KL/mean": -192.940673828125, "KL/rejected_KL_mean": -241.91455078125, "KL/std": 95.53553771972656, "epoch": 0.799697656840514, "fcm_dpo/beta": 0.0059036496095359325, "fcm_dpo/delta": -0.18935421109199524, "fcm_dpo/margin": 97.94773864746094, "fcm_dpo/q_t": 0.3672763407230377, "grad_norm": 16.625507354736328, "learning_rate": 5.934146982094049e-08, "logits/chosen": 0.6493830680847168, "logits/rejected": 0.5940313339233398, "logps/chosen": -199.03778076171875, "logps/ref_chosen": -55.070960998535156, "logps/ref_rejected": -75.44007873535156, "logps/rejected": -317.3546142578125, "loss": 0.9772, "margin_dpo/margin_mean": 97.94773864746094, "margin_dpo/margin_std": 104.44805908203125, "step": 529 }, { "KL/chosen_KL_mean": -146.566650390625, "KL/mean": -177.39117431640625, "KL/rejected_KL_mean": -208.21568298339844, "KL/std": 100.66690063476562, "epoch": 0.8012093726379441, "fcm_dpo/beta": 0.005883745849132538, "fcm_dpo/delta": 0.038681499660015106, "fcm_dpo/margin": 61.64900207519531, "fcm_dpo/q_t": 0.4194566011428833, "grad_norm": 18.862146377563477, "learning_rate": 5.848888922025552e-08, "logits/chosen": 0.7509942054748535, "logits/rejected": 0.702675461769104, "logps/chosen": -203.3104705810547, "logps/ref_chosen": -56.743812561035156, "logps/ref_rejected": -76.6692123413086, "logps/rejected": -284.8848876953125, "loss": 1.1477, "margin_dpo/margin_mean": 61.64900207519531, "margin_dpo/margin_std": 105.2386474609375, "step": 530 }, { "KL/chosen_KL_mean": -145.84561157226562, "KL/mean": -178.4844512939453, "KL/rejected_KL_mean": -211.123291015625, "KL/std": 99.67879486083984, "epoch": 0.8027210884353742, "fcm_dpo/beta": 0.005922066047787666, "fcm_dpo/delta": 0.013820935040712357, "fcm_dpo/margin": 65.27767944335938, "fcm_dpo/q_t": 0.4134918749332428, "grad_norm": 14.803869247436523, "learning_rate": 5.7641665597021435e-08, "logits/chosen": 0.7278386354446411, "logits/rejected": 0.6435098648071289, "logps/chosen": -196.96206665039062, "logps/ref_chosen": -51.116455078125, "logps/ref_rejected": -79.52884674072266, "logps/rejected": -290.65216064453125, "loss": 1.1337, "margin_dpo/margin_mean": 65.27767944335938, "margin_dpo/margin_std": 108.86320495605469, "step": 531 }, { "KL/chosen_KL_mean": -168.71844482421875, "KL/mean": -203.8729248046875, "KL/rejected_KL_mean": -239.02740478515625, "KL/std": 98.42269897460938, "epoch": 0.8042328042328042, "fcm_dpo/beta": 0.005900254473090172, "fcm_dpo/delta": -0.01548069715499878, "fcm_dpo/margin": 70.3089599609375, "fcm_dpo/q_t": 0.40614721179008484, "grad_norm": 15.068279266357422, "learning_rate": 5.679982264990424e-08, "logits/chosen": 0.6419936418533325, "logits/rejected": 0.5893096923828125, "logps/chosen": -226.99838256835938, "logps/ref_chosen": -58.279945373535156, "logps/ref_rejected": -78.05426788330078, "logps/rejected": -317.0816650390625, "loss": 1.1149, "margin_dpo/margin_mean": 70.3089599609375, "margin_dpo/margin_std": 111.21963500976562, "step": 532 }, { "KL/chosen_KL_mean": -121.37933349609375, "KL/mean": -158.35519409179688, "KL/rejected_KL_mean": -195.3310546875, "KL/std": 101.6238784790039, "epoch": 0.8057445200302343, "fcm_dpo/beta": 0.0058759888634085655, "fcm_dpo/delta": -0.036131080240011215, "fcm_dpo/margin": 73.95172119140625, "fcm_dpo/q_t": 0.4019339382648468, "grad_norm": 14.678104400634766, "learning_rate": 5.596338392706076e-08, "logits/chosen": 0.7913415431976318, "logits/rejected": 0.723665177822113, "logps/chosen": -177.79733276367188, "logps/ref_chosen": -56.41801071166992, "logps/ref_rejected": -73.89324951171875, "logps/rejected": -269.22430419921875, "loss": 1.0932, "margin_dpo/margin_mean": 73.95172119140625, "margin_dpo/margin_std": 107.9453125, "step": 533 }, { "KL/chosen_KL_mean": -151.45907592773438, "KL/mean": -183.31573486328125, "KL/rejected_KL_mean": -215.17239379882812, "KL/std": 97.56094360351562, "epoch": 0.8072562358276644, "fcm_dpo/beta": 0.005855937488377094, "fcm_dpo/delta": 0.027793139219284058, "fcm_dpo/margin": 63.71331787109375, "fcm_dpo/q_t": 0.41731053590774536, "grad_norm": 14.168919563293457, "learning_rate": 5.513237282548033e-08, "logits/chosen": 0.7265303134918213, "logits/rejected": 0.6878026127815247, "logps/chosen": -212.207763671875, "logps/ref_chosen": -60.748687744140625, "logps/ref_rejected": -73.8623046875, "logps/rejected": -289.0346984863281, "loss": 1.1629, "margin_dpo/margin_mean": 63.71331787109375, "margin_dpo/margin_std": 119.80328369140625, "step": 534 }, { "KL/chosen_KL_mean": -161.570556640625, "KL/mean": -190.62298583984375, "KL/rejected_KL_mean": -219.67538452148438, "KL/std": 102.95240020751953, "epoch": 0.8087679516250945, "fcm_dpo/beta": 0.005945261567831039, "fcm_dpo/delta": 0.05632663890719414, "fcm_dpo/margin": 58.104827880859375, "fcm_dpo/q_t": 0.42285987734794617, "grad_norm": 15.750730514526367, "learning_rate": 5.430681259032957e-08, "logits/chosen": 0.6182043552398682, "logits/rejected": 0.5551555752754211, "logps/chosen": -223.20797729492188, "logps/ref_chosen": -61.637413024902344, "logps/ref_rejected": -80.93138885498047, "logps/rejected": -300.6067810058594, "loss": 1.1758, "margin_dpo/margin_mean": 58.10482406616211, "margin_dpo/margin_std": 111.81983947753906, "step": 535 }, { "KL/chosen_KL_mean": -140.1672821044922, "KL/mean": -185.82449340820312, "KL/rejected_KL_mean": -231.48171997070312, "KL/std": 103.58207702636719, "epoch": 0.8102796674225246, "fcm_dpo/beta": 0.005817831493914127, "fcm_dpo/delta": -0.13897940516471863, "fcm_dpo/margin": 91.31442260742188, "fcm_dpo/q_t": 0.3792431652545929, "grad_norm": 12.471626281738281, "learning_rate": 5.3486726314303175e-08, "logits/chosen": 0.7547929883003235, "logits/rejected": 0.6657021045684814, "logps/chosen": -192.05625915527344, "logps/ref_chosen": -51.88897705078125, "logps/ref_rejected": -73.34864044189453, "logps/rejected": -304.8303527832031, "loss": 1.0019, "margin_dpo/margin_mean": 91.31441497802734, "margin_dpo/margin_std": 100.58480834960938, "step": 536 }, { "KL/chosen_KL_mean": -159.59193420410156, "KL/mean": -197.22723388671875, "KL/rejected_KL_mean": -234.862548828125, "KL/std": 109.48220825195312, "epoch": 0.8117913832199547, "fcm_dpo/beta": 0.00573838222771883, "fcm_dpo/delta": -0.03375764191150665, "fcm_dpo/margin": 75.27059936523438, "fcm_dpo/q_t": 0.4040025472640991, "grad_norm": 14.284053802490234, "learning_rate": 5.267213693697695e-08, "logits/chosen": 0.8378667831420898, "logits/rejected": 0.7384310960769653, "logps/chosen": -213.84054565429688, "logps/ref_chosen": -54.248619079589844, "logps/ref_rejected": -94.94343566894531, "logps/rejected": -329.80596923828125, "loss": 1.108, "margin_dpo/margin_mean": 75.2706069946289, "margin_dpo/margin_std": 118.4455795288086, "step": 537 }, { "KL/chosen_KL_mean": -157.94598388671875, "KL/mean": -196.46746826171875, "KL/rejected_KL_mean": -234.9889373779297, "KL/std": 105.69438171386719, "epoch": 0.8133030990173847, "fcm_dpo/beta": 0.005723871290683746, "fcm_dpo/delta": -0.04287005215883255, "fcm_dpo/margin": 77.04296875, "fcm_dpo/q_t": 0.399586945772171, "grad_norm": 13.896480560302734, "learning_rate": 5.1863067244167144e-08, "logits/chosen": 0.7036222219467163, "logits/rejected": 0.6767950057983398, "logps/chosen": -228.03952026367188, "logps/ref_chosen": -70.09353637695312, "logps/ref_rejected": -79.49833679199219, "logps/rejected": -314.4872741699219, "loss": 1.0768, "margin_dpo/margin_mean": 77.04296875, "margin_dpo/margin_std": 105.25886535644531, "step": 538 }, { "KL/chosen_KL_mean": -166.3479461669922, "KL/mean": -195.4829864501953, "KL/rejected_KL_mean": -224.6180419921875, "KL/std": 100.32119750976562, "epoch": 0.8148148148148148, "fcm_dpo/beta": 0.005745704751461744, "fcm_dpo/delta": 0.067507803440094, "fcm_dpo/margin": 58.270084381103516, "fcm_dpo/q_t": 0.42635536193847656, "grad_norm": 14.9727201461792, "learning_rate": 5.105953986729195e-08, "logits/chosen": 0.6758599281311035, "logits/rejected": 0.5885668396949768, "logps/chosen": -228.27963256835938, "logps/ref_chosen": -61.93169403076172, "logps/ref_rejected": -84.08946228027344, "logps/rejected": -308.70751953125, "loss": 1.1656, "margin_dpo/margin_mean": 58.270084381103516, "margin_dpo/margin_std": 106.63487243652344, "step": 539 }, { "KL/chosen_KL_mean": -151.53689575195312, "KL/mean": -196.16537475585938, "KL/rejected_KL_mean": -240.79385375976562, "KL/std": 112.3409423828125, "epoch": 0.8163265306122449, "fcm_dpo/beta": 0.0057037402875721455, "fcm_dpo/delta": -0.11482920497655869, "fcm_dpo/margin": 89.2569580078125, "fcm_dpo/q_t": 0.3840544819831848, "grad_norm": 14.26491928100586, "learning_rate": 5.026157728273966e-08, "logits/chosen": 0.7716501951217651, "logits/rejected": 0.6706273555755615, "logps/chosen": -214.24114990234375, "logps/ref_chosen": -62.704254150390625, "logps/ref_rejected": -95.63597106933594, "logps/rejected": -336.4298095703125, "loss": 1.0273, "margin_dpo/margin_mean": 89.2569580078125, "margin_dpo/margin_std": 106.0938720703125, "step": 540 }, { "KL/chosen_KL_mean": -148.39193725585938, "KL/mean": -185.59091186523438, "KL/rejected_KL_mean": -222.78988647460938, "KL/std": 100.17250061035156, "epoch": 0.817838246409675, "fcm_dpo/beta": 0.005604578647762537, "fcm_dpo/delta": -0.01824624091386795, "fcm_dpo/margin": 74.39794921875, "fcm_dpo/q_t": 0.4034620523452759, "grad_norm": 12.724387168884277, "learning_rate": 4.9469201811239035e-08, "logits/chosen": 0.7491267323493958, "logits/rejected": 0.7754353284835815, "logps/chosen": -210.87277221679688, "logps/ref_chosen": -62.48084259033203, "logps/ref_rejected": -57.55541229248047, "logps/rejected": -280.3453063964844, "loss": 1.0865, "margin_dpo/margin_mean": 74.39794921875, "margin_dpo/margin_std": 98.57090759277344, "step": 541 }, { "KL/chosen_KL_mean": -128.881103515625, "KL/mean": -170.5752716064453, "KL/rejected_KL_mean": -212.26943969726562, "KL/std": 96.8449935913086, "epoch": 0.8193499622071051, "fcm_dpo/beta": 0.005561105906963348, "fcm_dpo/delta": -0.06698856502771378, "fcm_dpo/margin": 83.38833618164062, "fcm_dpo/q_t": 0.39425086975097656, "grad_norm": 13.705977439880371, "learning_rate": 4.868243561723534e-08, "logits/chosen": 0.7660166621208191, "logits/rejected": 0.7166833877563477, "logps/chosen": -178.33599853515625, "logps/ref_chosen": -49.454891204833984, "logps/ref_rejected": -65.33275604248047, "logps/rejected": -277.6022033691406, "loss": 1.0761, "margin_dpo/margin_mean": 83.38833618164062, "margin_dpo/margin_std": 118.37450408935547, "step": 542 }, { "KL/chosen_KL_mean": -138.73416137695312, "KL/mean": -180.11659240722656, "KL/rejected_KL_mean": -221.4990234375, "KL/std": 99.09233093261719, "epoch": 0.8208616780045351, "fcm_dpo/beta": 0.005519067868590355, "fcm_dpo/delta": -0.059514693915843964, "fcm_dpo/margin": 82.76486206054688, "fcm_dpo/q_t": 0.3950622081756592, "grad_norm": 11.757946014404297, "learning_rate": 4.790130070827028e-08, "logits/chosen": 0.728924036026001, "logits/rejected": 0.6395883560180664, "logps/chosen": -189.83502197265625, "logps/ref_chosen": -51.100860595703125, "logps/ref_rejected": -76.06130981445312, "logps/rejected": -297.5603332519531, "loss": 1.0672, "margin_dpo/margin_mean": 82.76486206054688, "margin_dpo/margin_std": 108.3336181640625, "step": 543 }, { "KL/chosen_KL_mean": -149.43783569335938, "KL/mean": -195.19053649902344, "KL/rejected_KL_mean": -240.9432373046875, "KL/std": 108.03707885742188, "epoch": 0.8223733938019653, "fcm_dpo/beta": 0.005403600633144379, "fcm_dpo/delta": -0.09939450025558472, "fcm_dpo/margin": 91.50540924072266, "fcm_dpo/q_t": 0.388713538646698, "grad_norm": 14.772136688232422, "learning_rate": 4.7125818934366454e-08, "logits/chosen": 0.7341815829277039, "logits/rejected": 0.6511387825012207, "logps/chosen": -209.71505737304688, "logps/ref_chosen": -60.2772331237793, "logps/ref_rejected": -88.40553283691406, "logps/rejected": -329.3487548828125, "loss": 1.0552, "margin_dpo/margin_mean": 91.50540924072266, "margin_dpo/margin_std": 124.33331298828125, "step": 544 }, { "KL/chosen_KL_mean": -163.93820190429688, "KL/mean": -190.05526733398438, "KL/rejected_KL_mean": -216.17230224609375, "KL/std": 100.32677459716797, "epoch": 0.8238851095993953, "fcm_dpo/beta": 0.005473896861076355, "fcm_dpo/delta": 0.11739957332611084, "fcm_dpo/margin": 52.234092712402344, "fcm_dpo/q_t": 0.43499264121055603, "grad_norm": 14.396151542663574, "learning_rate": 4.635601198741607e-08, "logits/chosen": 0.6655288934707642, "logits/rejected": 0.6052983999252319, "logps/chosen": -225.55345153808594, "logps/ref_chosen": -61.61524963378906, "logps/ref_rejected": -78.71266174316406, "logps/rejected": -294.88494873046875, "loss": 1.2079, "margin_dpo/margin_mean": 52.23409652709961, "margin_dpo/margin_std": 110.28699493408203, "step": 545 }, { "KL/chosen_KL_mean": -154.5303192138672, "KL/mean": -184.57818603515625, "KL/rejected_KL_mean": -214.6260986328125, "KL/std": 97.45379638671875, "epoch": 0.8253968253968254, "fcm_dpo/beta": 0.00555295217782259, "fcm_dpo/delta": 0.06863728165626526, "fcm_dpo/margin": 60.095760345458984, "fcm_dpo/q_t": 0.4237426817417145, "grad_norm": 17.143327713012695, "learning_rate": 4.559190140057428e-08, "logits/chosen": 0.7776880264282227, "logits/rejected": 0.7711125612258911, "logps/chosen": -213.8435821533203, "logps/ref_chosen": -59.313262939453125, "logps/ref_rejected": -64.73631286621094, "logps/rejected": -279.3623962402344, "loss": 1.1881, "margin_dpo/margin_mean": 60.09575653076172, "margin_dpo/margin_std": 121.6714859008789, "step": 546 }, { "KL/chosen_KL_mean": -134.7870635986328, "KL/mean": -177.70162963867188, "KL/rejected_KL_mean": -220.61618041992188, "KL/std": 101.6994857788086, "epoch": 0.8269085411942555, "fcm_dpo/beta": 0.005503546446561813, "fcm_dpo/delta": -0.076295867562294, "fcm_dpo/margin": 85.82913208007812, "fcm_dpo/q_t": 0.3918975591659546, "grad_norm": 13.849120140075684, "learning_rate": 4.483350854765672e-08, "logits/chosen": 0.6785303354263306, "logits/rejected": 0.6111510396003723, "logps/chosen": -189.76380920410156, "logps/ref_chosen": -54.97674560546875, "logps/ref_rejected": -75.35922241210938, "logps/rejected": -295.97540283203125, "loss": 1.0626, "margin_dpo/margin_mean": 85.82913208007812, "margin_dpo/margin_std": 115.88031005859375, "step": 547 }, { "KL/chosen_KL_mean": -159.3828125, "KL/mean": -186.0314483642578, "KL/rejected_KL_mean": -212.6800994873047, "KL/std": 101.64471435546875, "epoch": 0.8284202569916855, "fcm_dpo/beta": 0.005597149953246117, "fcm_dpo/delta": 0.10441551357507706, "fcm_dpo/margin": 53.29730224609375, "fcm_dpo/q_t": 0.43269434571266174, "grad_norm": 16.53727912902832, "learning_rate": 4.4080854642541826e-08, "logits/chosen": 0.618332028388977, "logits/rejected": 0.5591973066329956, "logps/chosen": -222.59347534179688, "logps/ref_chosen": -63.21067428588867, "logps/ref_rejected": -81.23347473144531, "logps/rejected": -293.91357421875, "loss": 1.1975, "margin_dpo/margin_mean": 53.29730224609375, "margin_dpo/margin_std": 108.28794860839844, "step": 548 }, { "KL/chosen_KL_mean": -156.263916015625, "KL/mean": -189.34469604492188, "KL/rejected_KL_mean": -222.42547607421875, "KL/std": 108.56674194335938, "epoch": 0.8299319727891157, "fcm_dpo/beta": 0.00561708677560091, "fcm_dpo/delta": 0.029304249212145805, "fcm_dpo/margin": 66.16154479980469, "fcm_dpo/q_t": 0.4174022674560547, "grad_norm": 16.30482292175293, "learning_rate": 4.333396073857723e-08, "logits/chosen": 0.8058149218559265, "logits/rejected": 0.7340287566184998, "logps/chosen": -220.53744506835938, "logps/ref_chosen": -64.27351379394531, "logps/ref_rejected": -92.31663513183594, "logps/rejected": -314.74212646484375, "loss": 1.169, "margin_dpo/margin_mean": 66.16155242919922, "margin_dpo/margin_std": 127.995361328125, "step": 549 }, { "KL/chosen_KL_mean": -163.3920440673828, "KL/mean": -185.50946044921875, "KL/rejected_KL_mean": -207.62689208984375, "KL/std": 98.04955291748047, "epoch": 0.8314436885865457, "fcm_dpo/beta": 0.005671496503055096, "fcm_dpo/delta": 0.022939864546060562, "fcm_dpo/margin": 44.23485565185547, "fcm_dpo/q_t": 0.4429607391357422, "grad_norm": 15.985555648803711, "learning_rate": 4.259284772799099e-08, "logits/chosen": 0.7515543699264526, "logits/rejected": 0.7224549651145935, "logps/chosen": -219.6224822998047, "logps/ref_chosen": -56.230438232421875, "logps/ref_rejected": -62.59788513183594, "logps/rejected": -270.22479248046875, "loss": 1.2416, "margin_dpo/margin_mean": 44.2348518371582, "margin_dpo/margin_std": 106.80397033691406, "step": 550 }, { "KL/chosen_KL_mean": -166.0439910888672, "KL/mean": -194.88690185546875, "KL/rejected_KL_mean": -223.72982788085938, "KL/std": 105.55809020996094, "epoch": 0.8329554043839759, "fcm_dpo/beta": 0.005742200184613466, "fcm_dpo/delta": 0.0709170550107956, "fcm_dpo/margin": 57.68581008911133, "fcm_dpo/q_t": 0.4269568920135498, "grad_norm": 14.377694129943848, "learning_rate": 4.1857536341307176e-08, "logits/chosen": 0.770805835723877, "logits/rejected": 0.734519362449646, "logps/chosen": -233.79119873046875, "logps/ref_chosen": -67.74720764160156, "logps/ref_rejected": -87.04285430908203, "logps/rejected": -310.7726745605469, "loss": 1.1609, "margin_dpo/margin_mean": 57.68581008911133, "margin_dpo/margin_std": 103.31864929199219, "step": 551 }, { "KL/chosen_KL_mean": -153.67115783691406, "KL/mean": -186.76593017578125, "KL/rejected_KL_mean": -219.8607177734375, "KL/std": 104.97171020507812, "epoch": 0.8344671201814059, "fcm_dpo/beta": 0.0057947514578700066, "fcm_dpo/delta": 0.016545481979846954, "fcm_dpo/margin": 66.18955993652344, "fcm_dpo/q_t": 0.41048935055732727, "grad_norm": 15.772445678710938, "learning_rate": 4.112804714676593e-08, "logits/chosen": 0.71265709400177, "logits/rejected": 0.6561511754989624, "logps/chosen": -216.597412109375, "logps/ref_chosen": -62.92625427246094, "logps/ref_rejected": -82.98365783691406, "logps/rejected": -302.8443603515625, "loss": 1.119, "margin_dpo/margin_mean": 66.1895523071289, "margin_dpo/margin_std": 99.23854064941406, "step": 552 }, { "KL/chosen_KL_mean": -165.35675048828125, "KL/mean": -198.490234375, "KL/rejected_KL_mean": -231.62371826171875, "KL/std": 101.29109191894531, "epoch": 0.8359788359788359, "fcm_dpo/beta": 0.005785372108221054, "fcm_dpo/delta": 0.01728604920208454, "fcm_dpo/margin": 66.2669677734375, "fcm_dpo/q_t": 0.41565388441085815, "grad_norm": 15.792521476745605, "learning_rate": 4.0404400549748144e-08, "logits/chosen": 0.7108063697814941, "logits/rejected": 0.6027768850326538, "logps/chosen": -221.39523315429688, "logps/ref_chosen": -56.038490295410156, "logps/ref_rejected": -84.48454284667969, "logps/rejected": -316.1082458496094, "loss": 1.153, "margin_dpo/margin_mean": 66.26697540283203, "margin_dpo/margin_std": 121.81985473632812, "step": 553 }, { "KL/chosen_KL_mean": -148.8656768798828, "KL/mean": -187.40798950195312, "KL/rejected_KL_mean": -225.95028686523438, "KL/std": 99.06117248535156, "epoch": 0.8374905517762661, "fcm_dpo/beta": 0.005752754397690296, "fcm_dpo/delta": -0.045463062822818756, "fcm_dpo/margin": 77.08460235595703, "fcm_dpo/q_t": 0.3978348970413208, "grad_norm": 14.663110733032227, "learning_rate": 3.968661679220467e-08, "logits/chosen": 0.6909962892532349, "logits/rejected": 0.6762232184410095, "logps/chosen": -213.39627075195312, "logps/ref_chosen": -64.53059387207031, "logps/ref_rejected": -71.2155990600586, "logps/rejected": -297.1658935546875, "loss": 1.0826, "margin_dpo/margin_mean": 77.0845947265625, "margin_dpo/margin_std": 108.04198455810547, "step": 554 }, { "KL/chosen_KL_mean": -164.56484985351562, "KL/mean": -195.12782287597656, "KL/rejected_KL_mean": -225.6907958984375, "KL/std": 99.44901275634766, "epoch": 0.8390022675736961, "fcm_dpo/beta": 0.005819863174110651, "fcm_dpo/delta": 0.04446956515312195, "fcm_dpo/margin": 61.12594223022461, "fcm_dpo/q_t": 0.4173806309700012, "grad_norm": 14.34953784942627, "learning_rate": 3.89747159520904e-08, "logits/chosen": 0.7013709545135498, "logits/rejected": 0.677711546421051, "logps/chosen": -231.21676635742188, "logps/ref_chosen": -66.65191650390625, "logps/ref_rejected": -68.6667251586914, "logps/rejected": -294.3575439453125, "loss": 1.183, "margin_dpo/margin_mean": 61.125946044921875, "margin_dpo/margin_std": 119.1573715209961, "step": 555 }, { "KL/chosen_KL_mean": -163.2904052734375, "KL/mean": -191.6531982421875, "KL/rejected_KL_mean": -220.01596069335938, "KL/std": 103.49365234375, "epoch": 0.8405139833711263, "fcm_dpo/beta": 0.0058401417918503284, "fcm_dpo/delta": 0.07112909853458405, "fcm_dpo/margin": 56.725555419921875, "fcm_dpo/q_t": 0.4268096387386322, "grad_norm": 15.318163871765137, "learning_rate": 3.826871794280192e-08, "logits/chosen": 0.7458856105804443, "logits/rejected": 0.6961278319358826, "logps/chosen": -216.12278747558594, "logps/ref_chosen": -52.832366943359375, "logps/ref_rejected": -64.49044036865234, "logps/rejected": -284.50640869140625, "loss": 1.1967, "margin_dpo/margin_mean": 56.72555923461914, "margin_dpo/margin_std": 118.74533081054688, "step": 556 }, { "KL/chosen_KL_mean": -161.11195373535156, "KL/mean": -203.443115234375, "KL/rejected_KL_mean": -245.7742919921875, "KL/std": 101.37711334228516, "epoch": 0.8420256991685563, "fcm_dpo/beta": 0.005770375952124596, "fcm_dpo/delta": -0.09372119605541229, "fcm_dpo/margin": 84.66233825683594, "fcm_dpo/q_t": 0.38854146003723145, "grad_norm": 12.033824920654297, "learning_rate": 3.756864251262143e-08, "logits/chosen": 0.8076841831207275, "logits/rejected": 0.7288253307342529, "logps/chosen": -216.14793395996094, "logps/ref_chosen": -55.03598403930664, "logps/ref_rejected": -75.80644989013672, "logps/rejected": -321.58074951171875, "loss": 1.0341, "margin_dpo/margin_mean": 84.66233825683594, "margin_dpo/margin_std": 98.88333129882812, "step": 557 }, { "KL/chosen_KL_mean": -148.3917694091797, "KL/mean": -194.3935089111328, "KL/rejected_KL_mean": -240.395263671875, "KL/std": 108.55181884765625, "epoch": 0.8435374149659864, "fcm_dpo/beta": 0.005635085515677929, "fcm_dpo/delta": -0.12575006484985352, "fcm_dpo/margin": 92.00345611572266, "fcm_dpo/q_t": 0.3834267854690552, "grad_norm": 10.859488487243652, "learning_rate": 3.687450924416341e-08, "logits/chosen": 0.7484444379806519, "logits/rejected": 0.692782998085022, "logps/chosen": -211.61813354492188, "logps/ref_chosen": -63.226348876953125, "logps/ref_rejected": -91.46881866455078, "logps/rejected": -331.86407470703125, "loss": 1.0258, "margin_dpo/margin_mean": 92.00344848632812, "margin_dpo/margin_std": 112.358154296875, "step": 558 }, { "KL/chosen_KL_mean": -150.44482421875, "KL/mean": -191.24813842773438, "KL/rejected_KL_mean": -232.05145263671875, "KL/std": 106.0418472290039, "epoch": 0.8450491307634165, "fcm_dpo/beta": 0.00554150715470314, "fcm_dpo/delta": -0.05576099827885628, "fcm_dpo/margin": 81.60662841796875, "fcm_dpo/q_t": 0.40131908655166626, "grad_norm": 12.017210006713867, "learning_rate": 3.6186337553827743e-08, "logits/chosen": 0.7015753984451294, "logits/rejected": 0.6351474523544312, "logps/chosen": -211.96646118164062, "logps/ref_chosen": -61.521644592285156, "logps/ref_rejected": -82.83859252929688, "logps/rejected": -314.8900451660156, "loss": 1.0989, "margin_dpo/margin_mean": 81.60662841796875, "margin_dpo/margin_std": 127.14248657226562, "step": 559 }, { "KL/chosen_KL_mean": -165.9353790283203, "KL/mean": -202.11032104492188, "KL/rejected_KL_mean": -238.28524780273438, "KL/std": 104.14012145996094, "epoch": 0.8465608465608465, "fcm_dpo/beta": 0.005577336065471172, "fcm_dpo/delta": -0.003964267671108246, "fcm_dpo/margin": 72.34988403320312, "fcm_dpo/q_t": 0.40748393535614014, "grad_norm": 16.099252700805664, "learning_rate": 3.550414669125573e-08, "logits/chosen": 0.7320427298545837, "logits/rejected": 0.6903941631317139, "logps/chosen": -226.57659912109375, "logps/ref_chosen": -60.64122009277344, "logps/ref_rejected": -78.75474548339844, "logps/rejected": -317.03997802734375, "loss": 1.0959, "margin_dpo/margin_mean": 72.34988403320312, "margin_dpo/margin_std": 98.12393188476562, "step": 560 }, { "KL/chosen_KL_mean": -150.53158569335938, "KL/mean": -185.16610717773438, "KL/rejected_KL_mean": -219.80064392089844, "KL/std": 101.51571655273438, "epoch": 0.8480725623582767, "fcm_dpo/beta": 0.005558688193559647, "fcm_dpo/delta": 0.015529034659266472, "fcm_dpo/margin": 69.26907348632812, "fcm_dpo/q_t": 0.41366326808929443, "grad_norm": 13.311105728149414, "learning_rate": 3.482795573879241e-08, "logits/chosen": 0.6639317274093628, "logits/rejected": 0.6297309398651123, "logps/chosen": -213.03018188476562, "logps/ref_chosen": -62.49859619140625, "logps/ref_rejected": -78.72064208984375, "logps/rejected": -298.52130126953125, "loss": 1.1256, "margin_dpo/margin_mean": 69.26907348632812, "margin_dpo/margin_std": 109.72549438476562, "step": 561 }, { "KL/chosen_KL_mean": -161.22512817382812, "KL/mean": -205.66433715820312, "KL/rejected_KL_mean": -250.10354614257812, "KL/std": 116.16006469726562, "epoch": 0.8495842781557067, "fcm_dpo/beta": 0.00546213798224926, "fcm_dpo/delta": -0.09121154248714447, "fcm_dpo/margin": 88.87841033935547, "fcm_dpo/q_t": 0.3912147879600525, "grad_norm": 17.746353149414062, "learning_rate": 3.415778361095226e-08, "logits/chosen": 0.6920984983444214, "logits/rejected": 0.6553425788879395, "logps/chosen": -236.00686645507812, "logps/ref_chosen": -74.78173828125, "logps/ref_rejected": -92.63499450683594, "logps/rejected": -342.738525390625, "loss": 1.049, "margin_dpo/margin_mean": 88.87841796875, "margin_dpo/margin_std": 113.43470001220703, "step": 562 }, { "KL/chosen_KL_mean": -134.72525024414062, "KL/mean": -172.85711669921875, "KL/rejected_KL_mean": -210.98898315429688, "KL/std": 91.38922119140625, "epoch": 0.8510959939531368, "fcm_dpo/beta": 0.0054597314447164536, "fcm_dpo/delta": -0.017087846994400024, "fcm_dpo/margin": 76.26376342773438, "fcm_dpo/q_t": 0.4041873812675476, "grad_norm": 18.914257049560547, "learning_rate": 3.349364905389032e-08, "logits/chosen": 0.8005296587944031, "logits/rejected": 0.7465337514877319, "logps/chosen": -184.92373657226562, "logps/ref_chosen": -50.19850158691406, "logps/ref_rejected": -66.76687622070312, "logps/rejected": -277.755859375, "loss": 1.11, "margin_dpo/margin_mean": 76.26374816894531, "margin_dpo/margin_std": 117.66149139404297, "step": 563 }, { "KL/chosen_KL_mean": -137.57955932617188, "KL/mean": -185.97503662109375, "KL/rejected_KL_mean": -234.37054443359375, "KL/std": 99.5344467163086, "epoch": 0.8526077097505669, "fcm_dpo/beta": 0.005348237697035074, "fcm_dpo/delta": -0.12420009821653366, "fcm_dpo/margin": 96.79098510742188, "fcm_dpo/q_t": 0.3806850016117096, "grad_norm": 13.006587982177734, "learning_rate": 3.283557064487785e-08, "logits/chosen": 0.7082855105400085, "logits/rejected": 0.6764841079711914, "logps/chosen": -193.32040405273438, "logps/ref_chosen": -55.7408447265625, "logps/ref_rejected": -74.82323455810547, "logps/rejected": -309.19378662109375, "loss": 1.0282, "margin_dpo/margin_mean": 96.79098510742188, "margin_dpo/margin_std": 118.69717407226562, "step": 564 }, { "KL/chosen_KL_mean": -167.14999389648438, "KL/mean": -201.27960205078125, "KL/rejected_KL_mean": -235.4091796875, "KL/std": 102.02427673339844, "epoch": 0.854119425547997, "fcm_dpo/beta": 0.005367398262023926, "fcm_dpo/delta": 0.03443855792284012, "fcm_dpo/margin": 68.25919342041016, "fcm_dpo/q_t": 0.41531914472579956, "grad_norm": 14.870681762695312, "learning_rate": 3.218356679178252e-08, "logits/chosen": 0.7321624755859375, "logits/rejected": 0.6772886514663696, "logps/chosen": -225.48736572265625, "logps/ref_chosen": -58.33738327026367, "logps/ref_rejected": -78.31776428222656, "logps/rejected": -313.7269287109375, "loss": 1.1258, "margin_dpo/margin_mean": 68.25918579101562, "margin_dpo/margin_std": 102.50274658203125, "step": 565 }, { "KL/chosen_KL_mean": -156.40234375, "KL/mean": -188.11712646484375, "KL/rejected_KL_mean": -219.83187866210938, "KL/std": 105.54273986816406, "epoch": 0.8556311413454271, "fcm_dpo/beta": 0.0054251449182629585, "fcm_dpo/delta": 0.05722519010305405, "fcm_dpo/margin": 63.429542541503906, "fcm_dpo/q_t": 0.423449844121933, "grad_norm": 15.344841957092285, "learning_rate": 3.1537655732553764e-08, "logits/chosen": 0.710625171661377, "logits/rejected": 0.6977615356445312, "logps/chosen": -227.62608337402344, "logps/ref_chosen": -71.22373962402344, "logps/ref_rejected": -71.11601257324219, "logps/rejected": -290.9478759765625, "loss": 1.1949, "margin_dpo/margin_mean": 63.42953872680664, "margin_dpo/margin_std": 133.69830322265625, "step": 566 }, { "KL/chosen_KL_mean": -150.36984252929688, "KL/mean": -190.3795623779297, "KL/rejected_KL_mean": -230.38929748535156, "KL/std": 100.10775756835938, "epoch": 0.8571428571428571, "fcm_dpo/beta": 0.005360864102840424, "fcm_dpo/delta": -0.031134188175201416, "fcm_dpo/margin": 80.01945495605469, "fcm_dpo/q_t": 0.4008104205131531, "grad_norm": 11.746376991271973, "learning_rate": 3.089785553471233e-08, "logits/chosen": 0.7463988065719604, "logits/rejected": 0.6522544622421265, "logps/chosen": -203.0391082763672, "logps/ref_chosen": -52.669273376464844, "logps/ref_rejected": -74.34785461425781, "logps/rejected": -304.7371520996094, "loss": 1.0849, "margin_dpo/margin_mean": 80.01945495605469, "margin_dpo/margin_std": 107.98060607910156, "step": 567 }, { "KL/chosen_KL_mean": -137.55935668945312, "KL/mean": -187.34161376953125, "KL/rejected_KL_mean": -237.12388610839844, "KL/std": 106.27365112304688, "epoch": 0.8586545729402872, "fcm_dpo/beta": 0.005308773368597031, "fcm_dpo/delta": -0.13568180799484253, "fcm_dpo/margin": 99.56451416015625, "fcm_dpo/q_t": 0.38007158041000366, "grad_norm": 16.17815589904785, "learning_rate": 3.026418409484513e-08, "logits/chosen": 0.7562973499298096, "logits/rejected": 0.671377420425415, "logps/chosen": -189.73736572265625, "logps/ref_chosen": -52.178001403808594, "logps/ref_rejected": -85.8277587890625, "logps/rejected": -322.95166015625, "loss": 1.0049, "margin_dpo/margin_mean": 99.56451416015625, "margin_dpo/margin_std": 107.11201477050781, "step": 568 }, { "KL/chosen_KL_mean": -162.22164916992188, "KL/mean": -186.17001342773438, "KL/rejected_KL_mean": -210.11837768554688, "KL/std": 106.58187103271484, "epoch": 0.8601662887377173, "fcm_dpo/beta": 0.005240259226411581, "fcm_dpo/delta": 0.0010257888352498412, "fcm_dpo/margin": 47.89672088623047, "fcm_dpo/q_t": 0.4422146677970886, "grad_norm": 14.931404113769531, "learning_rate": 2.963665913810451e-08, "logits/chosen": 0.640442430973053, "logits/rejected": 0.606290340423584, "logps/chosen": -224.87091064453125, "logps/ref_chosen": -62.649261474609375, "logps/ref_rejected": -75.4298324584961, "logps/rejected": -285.5482177734375, "loss": 1.2299, "margin_dpo/margin_mean": 47.89672088623047, "margin_dpo/margin_std": 106.46678924560547, "step": 569 }, { "KL/chosen_KL_mean": -142.37106323242188, "KL/mean": -189.57301330566406, "KL/rejected_KL_mean": -236.77496337890625, "KL/std": 98.6229019165039, "epoch": 0.8616780045351474, "fcm_dpo/beta": 0.005193280056118965, "fcm_dpo/delta": -0.09496532380580902, "fcm_dpo/margin": 94.40391540527344, "fcm_dpo/q_t": 0.38758718967437744, "grad_norm": 13.439196586608887, "learning_rate": 2.9015298217712453e-08, "logits/chosen": 0.6957643032073975, "logits/rejected": 0.6096549034118652, "logps/chosen": -192.41285705566406, "logps/ref_chosen": -50.04179382324219, "logps/ref_rejected": -78.27146911621094, "logps/rejected": -315.04644775390625, "loss": 1.0343, "margin_dpo/margin_mean": 94.40391540527344, "margin_dpo/margin_std": 111.02665710449219, "step": 570 }, { "KL/chosen_KL_mean": -155.65533447265625, "KL/mean": -182.16305541992188, "KL/rejected_KL_mean": -208.67076110839844, "KL/std": 97.05805206298828, "epoch": 0.8631897203325775, "fcm_dpo/beta": 0.005157129839062691, "fcm_dpo/delta": 0.02997731603682041, "fcm_dpo/margin": 53.015419006347656, "fcm_dpo/q_t": 0.4364148676395416, "grad_norm": 12.774622917175293, "learning_rate": 2.840011871446962e-08, "logits/chosen": 0.7486839294433594, "logits/rejected": 0.7202929258346558, "logps/chosen": -209.31214904785156, "logps/ref_chosen": -53.65681457519531, "logps/ref_rejected": -66.13298034667969, "logps/rejected": -274.8037414550781, "loss": 1.2169, "margin_dpo/margin_mean": 53.015419006347656, "margin_dpo/margin_std": 113.913818359375, "step": 571 }, { "KL/chosen_KL_mean": -156.36764526367188, "KL/mean": -187.063720703125, "KL/rejected_KL_mean": -217.75982666015625, "KL/std": 102.88874816894531, "epoch": 0.8647014361300076, "fcm_dpo/beta": 0.005256508942693472, "fcm_dpo/delta": 0.07915620505809784, "fcm_dpo/margin": 61.39219665527344, "fcm_dpo/q_t": 0.42540502548217773, "grad_norm": 12.966663360595703, "learning_rate": 2.7791137836269158e-08, "logits/chosen": 0.6802760362625122, "logits/rejected": 0.7232311964035034, "logps/chosen": -231.18556213378906, "logps/ref_chosen": -74.81792449951172, "logps/ref_rejected": -65.88681030273438, "logps/rejected": -283.6466369628906, "loss": 1.1626, "margin_dpo/margin_mean": 61.39219665527344, "margin_dpo/margin_std": 104.12713623046875, "step": 572 }, { "KL/chosen_KL_mean": -170.07090759277344, "KL/mean": -204.24813842773438, "KL/rejected_KL_mean": -238.42535400390625, "KL/std": 110.96490478515625, "epoch": 0.8662131519274376, "fcm_dpo/beta": 0.005288993939757347, "fcm_dpo/delta": 0.039893269538879395, "fcm_dpo/margin": 68.35444641113281, "fcm_dpo/q_t": 0.4202578365802765, "grad_norm": 16.049808502197266, "learning_rate": 2.718837261761528e-08, "logits/chosen": 0.7141411304473877, "logits/rejected": 0.6672098636627197, "logps/chosen": -238.7965545654297, "logps/ref_chosen": -68.72564697265625, "logps/ref_rejected": -88.16201782226562, "logps/rejected": -326.5873718261719, "loss": 1.1702, "margin_dpo/margin_mean": 68.35444641113281, "margin_dpo/margin_std": 132.52700805664062, "step": 573 }, { "KL/chosen_KL_mean": -151.50253295898438, "KL/mean": -195.33425903320312, "KL/rejected_KL_mean": -239.16595458984375, "KL/std": 101.14380645751953, "epoch": 0.8677248677248677, "fcm_dpo/beta": 0.005275283940136433, "fcm_dpo/delta": -0.0657506138086319, "fcm_dpo/margin": 87.66343688964844, "fcm_dpo/q_t": 0.39322227239608765, "grad_norm": 11.67337417602539, "learning_rate": 2.659183991914696e-08, "logits/chosen": 0.7605029344558716, "logits/rejected": 0.6940839290618896, "logps/chosen": -207.81593322753906, "logps/ref_chosen": -56.31340026855469, "logps/ref_rejected": -83.91553497314453, "logps/rejected": -323.08148193359375, "loss": 1.0367, "margin_dpo/margin_mean": 87.66343688964844, "margin_dpo/margin_std": 94.43531799316406, "step": 574 }, { "KL/chosen_KL_mean": -156.16954040527344, "KL/mean": -187.68914794921875, "KL/rejected_KL_mean": -219.208740234375, "KL/std": 104.60845947265625, "epoch": 0.8692365835222978, "fcm_dpo/beta": 0.005184421315789223, "fcm_dpo/delta": -0.041874419897794724, "fcm_dpo/margin": 63.039207458496094, "fcm_dpo/q_t": 0.42742812633514404, "grad_norm": 13.501452445983887, "learning_rate": 2.600155642716606e-08, "logits/chosen": 0.7840526103973389, "logits/rejected": 0.6993541717529297, "logps/chosen": -220.753662109375, "logps/ref_chosen": -64.5841293334961, "logps/ref_rejected": -93.47034454345703, "logps/rejected": -312.6790771484375, "loss": 1.1935, "margin_dpo/margin_mean": 63.039207458496094, "margin_dpo/margin_std": 126.58856201171875, "step": 575 }, { "KL/chosen_KL_mean": -138.4634246826172, "KL/mean": -183.80848693847656, "KL/rejected_KL_mean": -229.15353393554688, "KL/std": 104.09663391113281, "epoch": 0.8707482993197279, "fcm_dpo/beta": 0.005099663510918617, "fcm_dpo/delta": -0.06683328002691269, "fcm_dpo/margin": 90.69012451171875, "fcm_dpo/q_t": 0.3938714861869812, "grad_norm": 12.622983932495117, "learning_rate": 2.5417538653170754e-08, "logits/chosen": 0.7396451234817505, "logits/rejected": 0.6320977807044983, "logps/chosen": -191.74395751953125, "logps/ref_chosen": -53.28052520751953, "logps/ref_rejected": -84.2000503540039, "logps/rejected": -313.35357666015625, "loss": 1.061, "margin_dpo/margin_mean": 90.69012451171875, "margin_dpo/margin_std": 115.56700897216797, "step": 576 }, { "KL/chosen_KL_mean": -155.62779235839844, "KL/mean": -185.17532348632812, "KL/rejected_KL_mean": -214.72286987304688, "KL/std": 102.67268371582031, "epoch": 0.872260015117158, "fcm_dpo/beta": 0.005185229238122702, "fcm_dpo/delta": 0.09660777449607849, "fcm_dpo/margin": 59.095054626464844, "fcm_dpo/q_t": 0.4290727376937866, "grad_norm": 12.990002632141113, "learning_rate": 2.4839802933393607e-08, "logits/chosen": 0.7272605895996094, "logits/rejected": 0.7168679237365723, "logps/chosen": -217.95248413085938, "logps/ref_chosen": -62.32468795776367, "logps/ref_rejected": -67.300537109375, "logps/rejected": -282.02337646484375, "loss": 1.1866, "margin_dpo/margin_mean": 59.095054626464844, "margin_dpo/margin_std": 111.05635833740234, "step": 577 }, { "KL/chosen_KL_mean": -145.708251953125, "KL/mean": -175.96636962890625, "KL/rejected_KL_mean": -206.22450256347656, "KL/std": 105.77242279052734, "epoch": 0.873771730914588, "fcm_dpo/beta": 0.005279114469885826, "fcm_dpo/delta": 0.08320680260658264, "fcm_dpo/margin": 60.516265869140625, "fcm_dpo/q_t": 0.4290190637111664, "grad_norm": 13.1907958984375, "learning_rate": 2.4268365428344733e-08, "logits/chosen": 0.7556581497192383, "logits/rejected": 0.7388467192649841, "logps/chosen": -202.36383056640625, "logps/ref_chosen": -56.65557861328125, "logps/ref_rejected": -68.21835327148438, "logps/rejected": -274.44287109375, "loss": 1.1888, "margin_dpo/margin_mean": 60.516265869140625, "margin_dpo/margin_std": 121.384521484375, "step": 578 }, { "KL/chosen_KL_mean": -152.46368408203125, "KL/mean": -195.7517547607422, "KL/rejected_KL_mean": -239.03982543945312, "KL/std": 98.7073745727539, "epoch": 0.8752834467120182, "fcm_dpo/beta": 0.005250965710729361, "fcm_dpo/delta": -0.05731963366270065, "fcm_dpo/margin": 86.57614135742188, "fcm_dpo/q_t": 0.39372166991233826, "grad_norm": 13.139662742614746, "learning_rate": 2.3703242122359357e-08, "logits/chosen": 0.6754535436630249, "logits/rejected": 0.6529947519302368, "logps/chosen": -209.27334594726562, "logps/ref_chosen": -56.809661865234375, "logps/ref_rejected": -68.09613037109375, "logps/rejected": -307.1359558105469, "loss": 1.0498, "margin_dpo/margin_mean": 86.57614135742188, "margin_dpo/margin_std": 101.74164581298828, "step": 579 }, { "KL/chosen_KL_mean": -153.70989990234375, "KL/mean": -190.75645446777344, "KL/rejected_KL_mean": -227.80299377441406, "KL/std": 113.76496124267578, "epoch": 0.8767951625094482, "fcm_dpo/beta": 0.005272259004414082, "fcm_dpo/delta": 0.009330503642559052, "fcm_dpo/margin": 74.09309387207031, "fcm_dpo/q_t": 0.4134565591812134, "grad_norm": 14.717636108398438, "learning_rate": 2.3144448823151392e-08, "logits/chosen": 0.7032333016395569, "logits/rejected": 0.6492955088615417, "logps/chosen": -211.4100341796875, "logps/ref_chosen": -57.70011520385742, "logps/ref_rejected": -77.90664672851562, "logps/rejected": -305.70965576171875, "loss": 1.138, "margin_dpo/margin_mean": 74.09309387207031, "margin_dpo/margin_std": 127.00874328613281, "step": 580 }, { "KL/chosen_KL_mean": -168.05001831054688, "KL/mean": -202.69422912597656, "KL/rejected_KL_mean": -237.3384246826172, "KL/std": 103.01370239257812, "epoch": 0.8783068783068783, "fcm_dpo/beta": 0.005260917823761702, "fcm_dpo/delta": 0.036662764847278595, "fcm_dpo/margin": 69.28840637207031, "fcm_dpo/q_t": 0.41850724816322327, "grad_norm": 13.817371368408203, "learning_rate": 2.259200116137039e-08, "logits/chosen": 0.7487903833389282, "logits/rejected": 0.6824551820755005, "logps/chosen": -227.38238525390625, "logps/ref_chosen": -59.332359313964844, "logps/ref_rejected": -83.64482116699219, "logps/rejected": -320.9832458496094, "loss": 1.1503, "margin_dpo/margin_mean": 69.28839874267578, "margin_dpo/margin_std": 120.6309814453125, "step": 581 }, { "KL/chosen_KL_mean": -153.35769653320312, "KL/mean": -186.18087768554688, "KL/rejected_KL_mean": -219.0040283203125, "KL/std": 97.19963073730469, "epoch": 0.8798185941043084, "fcm_dpo/beta": 0.005341984797269106, "fcm_dpo/delta": 0.05092533677816391, "fcm_dpo/margin": 65.6463394165039, "fcm_dpo/q_t": 0.42038506269454956, "grad_norm": 11.518352508544922, "learning_rate": 2.204591459016525e-08, "logits/chosen": 0.6978170871734619, "logits/rejected": 0.7259989976882935, "logps/chosen": -217.52056884765625, "logps/ref_chosen": -64.16285705566406, "logps/ref_rejected": -58.632896423339844, "logps/rejected": -277.6369323730469, "loss": 1.1511, "margin_dpo/margin_mean": 65.6463394165039, "margin_dpo/margin_std": 111.71089935302734, "step": 582 }, { "KL/chosen_KL_mean": -151.95028686523438, "KL/mean": -193.89425659179688, "KL/rejected_KL_mean": -235.83824157714844, "KL/std": 109.51445007324219, "epoch": 0.8813303099017384, "fcm_dpo/beta": 0.00530315563082695, "fcm_dpo/delta": -0.04701667279005051, "fcm_dpo/margin": 83.887939453125, "fcm_dpo/q_t": 0.39889317750930786, "grad_norm": 14.888934135437012, "learning_rate": 2.1506204384751064e-08, "logits/chosen": 0.7880719304084778, "logits/rejected": 0.6842619180679321, "logps/chosen": -203.82269287109375, "logps/ref_chosen": -51.87239456176758, "logps/ref_rejected": -83.86331176757812, "logps/rejected": -319.7015380859375, "loss": 1.0924, "margin_dpo/margin_mean": 83.887939453125, "margin_dpo/margin_std": 124.24688720703125, "step": 583 }, { "KL/chosen_KL_mean": -140.185791015625, "KL/mean": -175.4810791015625, "KL/rejected_KL_mean": -210.7763671875, "KL/std": 100.93035888671875, "epoch": 0.8828420256991686, "fcm_dpo/beta": 0.005299385171383619, "fcm_dpo/delta": 0.02679905854165554, "fcm_dpo/margin": 70.59056091308594, "fcm_dpo/q_t": 0.41769716143608093, "grad_norm": 14.666333198547363, "learning_rate": 2.09728856419826e-08, "logits/chosen": 0.8587048053741455, "logits/rejected": 0.7487021684646606, "logps/chosen": -186.75718688964844, "logps/ref_chosen": -46.571388244628906, "logps/ref_rejected": -80.67969512939453, "logps/rejected": -291.4560546875, "loss": 1.1588, "margin_dpo/margin_mean": 70.59056091308594, "margin_dpo/margin_std": 130.73898315429688, "step": 584 }, { "KL/chosen_KL_mean": -164.0458984375, "KL/mean": -189.77200317382812, "KL/rejected_KL_mean": -215.49810791015625, "KL/std": 111.33201599121094, "epoch": 0.8843537414965986, "fcm_dpo/beta": 0.0053357696160674095, "fcm_dpo/delta": 0.022998645901679993, "fcm_dpo/margin": 51.45219802856445, "fcm_dpo/q_t": 0.43656566739082336, "grad_norm": 12.086616516113281, "learning_rate": 2.044597327993153e-08, "logits/chosen": 0.6961206197738647, "logits/rejected": 0.6410224437713623, "logps/chosen": -222.17044067382812, "logps/ref_chosen": -58.124534606933594, "logps/ref_rejected": -79.00538635253906, "logps/rejected": -294.50347900390625, "loss": 1.2151, "margin_dpo/margin_mean": 51.45220184326172, "margin_dpo/margin_std": 110.92315673828125, "step": 585 }, { "KL/chosen_KL_mean": -151.13497924804688, "KL/mean": -185.87631225585938, "KL/rejected_KL_mean": -220.61761474609375, "KL/std": 93.34712219238281, "epoch": 0.8858654572940288, "fcm_dpo/beta": 0.005359075032174587, "fcm_dpo/delta": 0.02866952307522297, "fcm_dpo/margin": 69.48262786865234, "fcm_dpo/q_t": 0.41366758942604065, "grad_norm": 14.31550121307373, "learning_rate": 1.9925482037469187e-08, "logits/chosen": 0.7719130516052246, "logits/rejected": 0.7257754802703857, "logps/chosen": -205.23663330078125, "logps/ref_chosen": -54.10163879394531, "logps/ref_rejected": -63.72113037109375, "logps/rejected": -284.3387451171875, "loss": 1.0991, "margin_dpo/margin_mean": 69.48262786865234, "margin_dpo/margin_std": 86.57853698730469, "step": 586 }, { "KL/chosen_KL_mean": -158.19403076171875, "KL/mean": -192.151123046875, "KL/rejected_KL_mean": -226.1082305908203, "KL/std": 109.19635009765625, "epoch": 0.8873771730914588, "fcm_dpo/beta": 0.005401215516030788, "fcm_dpo/delta": 0.0344584584236145, "fcm_dpo/margin": 67.91419982910156, "fcm_dpo/q_t": 0.4123992919921875, "grad_norm": 15.158578872680664, "learning_rate": 1.9411426473854687e-08, "logits/chosen": 0.7452373504638672, "logits/rejected": 0.7412750124931335, "logps/chosen": -221.61123657226562, "logps/ref_chosen": -63.41719436645508, "logps/ref_rejected": -63.47003936767578, "logps/rejected": -289.5782775878906, "loss": 1.1915, "margin_dpo/margin_mean": 67.91419982910156, "margin_dpo/margin_std": 146.912109375, "step": 587 }, { "KL/chosen_KL_mean": -160.67141723632812, "KL/mean": -199.68881225585938, "KL/rejected_KL_mean": -238.70619201660156, "KL/std": 107.306640625, "epoch": 0.8888888888888888, "fcm_dpo/beta": 0.0053809527307748795, "fcm_dpo/delta": -0.021074390038847923, "fcm_dpo/margin": 78.03477478027344, "fcm_dpo/q_t": 0.40630820393562317, "grad_norm": 17.240671157836914, "learning_rate": 1.890382096832699e-08, "logits/chosen": 0.7501190900802612, "logits/rejected": 0.707175612449646, "logps/chosen": -222.87245178222656, "logps/ref_chosen": -62.20103454589844, "logps/ref_rejected": -82.10249328613281, "logps/rejected": -320.8086853027344, "loss": 1.1235, "margin_dpo/margin_mean": 78.0347671508789, "margin_dpo/margin_std": 130.63900756835938, "step": 588 }, { "KL/chosen_KL_mean": -150.62606811523438, "KL/mean": -189.75662231445312, "KL/rejected_KL_mean": -228.88720703125, "KL/std": 103.38765716552734, "epoch": 0.890400604686319, "fcm_dpo/beta": 0.005385834723711014, "fcm_dpo/delta": -0.022483011707663536, "fcm_dpo/margin": 78.26113891601562, "fcm_dpo/q_t": 0.4018729627132416, "grad_norm": 12.26170539855957, "learning_rate": 1.840267971970344e-08, "logits/chosen": 0.7219746708869934, "logits/rejected": 0.690538227558136, "logps/chosen": -207.3396759033203, "logps/ref_chosen": -56.71361541748047, "logps/ref_rejected": -76.7366943359375, "logps/rejected": -305.6239013671875, "loss": 1.0726, "margin_dpo/margin_mean": 78.26113891601562, "margin_dpo/margin_std": 96.6390609741211, "step": 589 }, { "KL/chosen_KL_mean": -164.7786865234375, "KL/mean": -201.01124572753906, "KL/rejected_KL_mean": -237.2438201904297, "KL/std": 97.29721069335938, "epoch": 0.891912320483749, "fcm_dpo/beta": 0.005396964028477669, "fcm_dpo/delta": 0.00893310084939003, "fcm_dpo/margin": 72.46513366699219, "fcm_dpo/q_t": 0.4101407527923584, "grad_norm": 16.080488204956055, "learning_rate": 1.7908016745981856e-08, "logits/chosen": 0.7100391387939453, "logits/rejected": 0.6726541519165039, "logps/chosen": -231.29249572753906, "logps/ref_chosen": -66.5138168334961, "logps/ref_rejected": -85.70820617675781, "logps/rejected": -322.9520263671875, "loss": 1.1119, "margin_dpo/margin_mean": 72.46513366699219, "margin_dpo/margin_std": 105.790283203125, "step": 590 }, { "KL/chosen_KL_mean": -140.5738525390625, "KL/mean": -186.54745483398438, "KL/rejected_KL_mean": -232.52105712890625, "KL/std": 113.1652603149414, "epoch": 0.8934240362811792, "fcm_dpo/beta": 0.005266258493065834, "fcm_dpo/delta": -0.09040172398090363, "fcm_dpo/margin": 91.94720458984375, "fcm_dpo/q_t": 0.39322781562805176, "grad_norm": 14.349787712097168, "learning_rate": 1.7419845883949098e-08, "logits/chosen": 0.8036490082740784, "logits/rejected": 0.7412289381027222, "logps/chosen": -201.27102661132812, "logps/ref_chosen": -60.697181701660156, "logps/ref_rejected": -86.12278747558594, "logps/rejected": -318.64385986328125, "loss": 1.0849, "margin_dpo/margin_mean": 91.94720458984375, "margin_dpo/margin_std": 139.12965393066406, "step": 591 }, { "KL/chosen_KL_mean": -153.30078125, "KL/mean": -188.5167236328125, "KL/rejected_KL_mean": -223.732666015625, "KL/std": 96.46400451660156, "epoch": 0.8949357520786092, "fcm_dpo/beta": 0.005277402698993683, "fcm_dpo/delta": 0.028983741998672485, "fcm_dpo/margin": 70.43186950683594, "fcm_dpo/q_t": 0.4168894290924072, "grad_norm": 14.597060203552246, "learning_rate": 1.6938180788793556e-08, "logits/chosen": 0.8267362117767334, "logits/rejected": 0.7063460946083069, "logps/chosen": -204.53811645507812, "logps/ref_chosen": -51.237327575683594, "logps/ref_rejected": -81.60242462158203, "logps/rejected": -305.3350830078125, "loss": 1.1292, "margin_dpo/margin_mean": 70.43186950683594, "margin_dpo/margin_std": 109.211181640625, "step": 592 }, { "KL/chosen_KL_mean": -127.90972900390625, "KL/mean": -165.362060546875, "KL/rejected_KL_mean": -202.8143768310547, "KL/std": 98.97264099121094, "epoch": 0.8964474678760394, "fcm_dpo/beta": 0.005305076017975807, "fcm_dpo/delta": 0.0026665516197681427, "fcm_dpo/margin": 74.90463256835938, "fcm_dpo/q_t": 0.40857166051864624, "grad_norm": 15.66858196258545, "learning_rate": 1.6463034933723336e-08, "logits/chosen": 0.7333135604858398, "logits/rejected": 0.6371482610702515, "logps/chosen": -169.98973083496094, "logps/ref_chosen": -42.08000183105469, "logps/ref_rejected": -68.47499084472656, "logps/rejected": -271.28936767578125, "loss": 1.1152, "margin_dpo/margin_mean": 74.90463256835938, "margin_dpo/margin_std": 114.46412658691406, "step": 593 }, { "KL/chosen_KL_mean": -161.46206665039062, "KL/mean": -194.55519104003906, "KL/rejected_KL_mean": -227.6483154296875, "KL/std": 98.2662353515625, "epoch": 0.8979591836734694, "fcm_dpo/beta": 0.005352815147489309, "fcm_dpo/delta": 0.04740230739116669, "fcm_dpo/margin": 66.18623352050781, "fcm_dpo/q_t": 0.4173516035079956, "grad_norm": 13.510444641113281, "learning_rate": 1.5994421609589385e-08, "logits/chosen": 0.6477546691894531, "logits/rejected": 0.6340515613555908, "logps/chosen": -225.12074279785156, "logps/ref_chosen": -63.658668518066406, "logps/ref_rejected": -70.35597229003906, "logps/rejected": -298.0043029785156, "loss": 1.1323, "margin_dpo/margin_mean": 66.18622589111328, "margin_dpo/margin_std": 100.29750061035156, "step": 594 }, { "KL/chosen_KL_mean": -148.9857177734375, "KL/mean": -192.3857421875, "KL/rejected_KL_mean": -235.7857666015625, "KL/std": 104.509765625, "epoch": 0.8994708994708994, "fcm_dpo/beta": 0.005331944674253464, "fcm_dpo/delta": -0.0658692866563797, "fcm_dpo/margin": 86.800048828125, "fcm_dpo/q_t": 0.39613407850265503, "grad_norm": 11.309722900390625, "learning_rate": 1.553235392451377e-08, "logits/chosen": 0.7755295634269714, "logits/rejected": 0.6839370131492615, "logps/chosen": -205.2044677734375, "logps/ref_chosen": -56.21875762939453, "logps/ref_rejected": -83.95773315429688, "logps/rejected": -319.7434997558594, "loss": 1.0863, "margin_dpo/margin_mean": 86.800048828125, "margin_dpo/margin_std": 129.6040802001953, "step": 595 }, { "KL/chosen_KL_mean": -172.08807373046875, "KL/mean": -189.11563110351562, "KL/rejected_KL_mean": -206.14321899414062, "KL/std": 97.88066101074219, "epoch": 0.9009826152683296, "fcm_dpo/beta": 0.005374398548156023, "fcm_dpo/delta": 0.07329612970352173, "fcm_dpo/margin": 34.055152893066406, "fcm_dpo/q_t": 0.4589841365814209, "grad_norm": 15.025259017944336, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.6385211944580078, "logits/rejected": 0.6607295870780945, "logps/chosen": -240.56893920898438, "logps/ref_chosen": -68.48088073730469, "logps/ref_rejected": -61.732967376708984, "logps/rejected": -267.8761901855469, "loss": 1.287, "margin_dpo/margin_mean": 34.055152893066406, "margin_dpo/margin_std": 103.45262145996094, "step": 596 }, { "KL/chosen_KL_mean": -130.9452362060547, "KL/mean": -164.6648712158203, "KL/rejected_KL_mean": -198.38450622558594, "KL/std": 86.19706726074219, "epoch": 0.9024943310657596, "fcm_dpo/beta": 0.005414964631199837, "fcm_dpo/delta": 0.035965919494628906, "fcm_dpo/margin": 67.43927001953125, "fcm_dpo/q_t": 0.4164079427719116, "grad_norm": 11.359607696533203, "learning_rate": 1.4627906988186111e-08, "logits/chosen": 0.7188578844070435, "logits/rejected": 0.7021818161010742, "logps/chosen": -179.802734375, "logps/ref_chosen": -48.85750961303711, "logps/ref_rejected": -55.068084716796875, "logps/rejected": -253.4525909423828, "loss": 1.1284, "margin_dpo/margin_mean": 67.43927001953125, "margin_dpo/margin_std": 103.29020690917969, "step": 597 }, { "KL/chosen_KL_mean": -173.59539794921875, "KL/mean": -194.8944091796875, "KL/rejected_KL_mean": -216.19338989257812, "KL/std": 101.44609069824219, "epoch": 0.9040060468631897, "fcm_dpo/beta": 0.005446711555123329, "fcm_dpo/delta": 0.06153283640742302, "fcm_dpo/margin": 42.597984313964844, "fcm_dpo/q_t": 0.4477514624595642, "grad_norm": 13.305130004882812, "learning_rate": 1.4185553036259095e-08, "logits/chosen": 0.7357190847396851, "logits/rejected": 0.6533316373825073, "logps/chosen": -232.48255920410156, "logps/ref_chosen": -58.88715362548828, "logps/ref_rejected": -81.43145751953125, "logps/rejected": -297.6248474121094, "loss": 1.2527, "margin_dpo/margin_mean": 42.597984313964844, "margin_dpo/margin_std": 109.55743408203125, "step": 598 }, { "KL/chosen_KL_mean": -174.72056579589844, "KL/mean": -200.32901000976562, "KL/rejected_KL_mean": -225.93746948242188, "KL/std": 98.85242462158203, "epoch": 0.9055177626606198, "fcm_dpo/beta": 0.005590873304754496, "fcm_dpo/delta": 0.11656653881072998, "fcm_dpo/margin": 51.2169075012207, "fcm_dpo/q_t": 0.43582531809806824, "grad_norm": 16.227569580078125, "learning_rate": 1.3749795321332885e-08, "logits/chosen": 0.7683210372924805, "logits/rejected": 0.7261084914207458, "logps/chosen": -232.3277587890625, "logps/ref_chosen": -57.60719299316406, "logps/ref_rejected": -71.80469512939453, "logps/rejected": -297.7421569824219, "loss": 1.2128, "margin_dpo/margin_mean": 51.21691131591797, "margin_dpo/margin_std": 111.82340240478516, "step": 599 }, { "KL/chosen_KL_mean": -158.89527893066406, "KL/mean": -190.06353759765625, "KL/rejected_KL_mean": -221.2317657470703, "KL/std": 107.9918212890625, "epoch": 0.9070294784580499, "fcm_dpo/beta": 0.005586580373346806, "fcm_dpo/delta": -0.0410081222653389, "fcm_dpo/margin": 62.336490631103516, "fcm_dpo/q_t": 0.42305952310562134, "grad_norm": 16.775184631347656, "learning_rate": 1.3320646032487393e-08, "logits/chosen": 0.7906534671783447, "logits/rejected": 0.7318333387374878, "logps/chosen": -217.33758544921875, "logps/ref_chosen": -58.44231414794922, "logps/ref_rejected": -83.64639282226562, "logps/rejected": -304.878173828125, "loss": 1.1682, "margin_dpo/margin_mean": 62.336490631103516, "margin_dpo/margin_std": 115.70437622070312, "step": 600 }, { "KL/chosen_KL_mean": -142.1400146484375, "KL/mean": -184.20022583007812, "KL/rejected_KL_mean": -226.26043701171875, "KL/std": 112.20680236816406, "epoch": 0.90854119425548, "fcm_dpo/beta": 0.0054881456308066845, "fcm_dpo/delta": -0.06522935628890991, "fcm_dpo/margin": 84.12042236328125, "fcm_dpo/q_t": 0.39672377705574036, "grad_norm": 12.236357688903809, "learning_rate": 1.2898117173950868e-08, "logits/chosen": 0.7509416341781616, "logits/rejected": 0.6693045496940613, "logps/chosen": -197.73434448242188, "logps/ref_chosen": -55.59432601928711, "logps/ref_rejected": -83.68630981445312, "logps/rejected": -309.94677734375, "loss": 1.0897, "margin_dpo/margin_mean": 84.12042236328125, "margin_dpo/margin_std": 127.16033172607422, "step": 601 }, { "KL/chosen_KL_mean": -131.32569885253906, "KL/mean": -169.94210815429688, "KL/rejected_KL_mean": -208.55850219726562, "KL/std": 97.39225769042969, "epoch": 0.91005291005291, "fcm_dpo/beta": 0.005489659495651722, "fcm_dpo/delta": -0.025193627923727036, "fcm_dpo/margin": 77.23280334472656, "fcm_dpo/q_t": 0.402509868144989, "grad_norm": 13.297761917114258, "learning_rate": 1.2482220564763667e-08, "logits/chosen": 0.7436432838439941, "logits/rejected": 0.7134118676185608, "logps/chosen": -187.67489624023438, "logps/ref_chosen": -56.349185943603516, "logps/ref_rejected": -71.9959716796875, "logps/rejected": -280.5544738769531, "loss": 1.084, "margin_dpo/margin_mean": 77.23280334472656, "margin_dpo/margin_std": 104.47931671142578, "step": 602 }, { "KL/chosen_KL_mean": -144.1072998046875, "KL/mean": -182.05587768554688, "KL/rejected_KL_mean": -220.00448608398438, "KL/std": 97.42413330078125, "epoch": 0.9115646258503401, "fcm_dpo/beta": 0.005443079397082329, "fcm_dpo/delta": -0.013768583536148071, "fcm_dpo/margin": 75.89717864990234, "fcm_dpo/q_t": 0.4060080647468567, "grad_norm": 13.597131729125977, "learning_rate": 1.2072967838448051e-08, "logits/chosen": 0.6887112855911255, "logits/rejected": 0.6357647180557251, "logps/chosen": -197.2756805419922, "logps/ref_chosen": -53.16838836669922, "logps/ref_rejected": -73.8604736328125, "logps/rejected": -293.8649597167969, "loss": 1.1078, "margin_dpo/margin_mean": 75.89717864990234, "margin_dpo/margin_std": 115.06390380859375, "step": 603 }, { "KL/chosen_KL_mean": -141.97164916992188, "KL/mean": -174.11126708984375, "KL/rejected_KL_mean": -206.25086975097656, "KL/std": 92.68603515625, "epoch": 0.9130763416477702, "fcm_dpo/beta": 0.0054985228925943375, "fcm_dpo/delta": 0.048101652413606644, "fcm_dpo/margin": 64.27925109863281, "fcm_dpo/q_t": 0.4208827018737793, "grad_norm": 15.929814338684082, "learning_rate": 1.1670370442682459e-08, "logits/chosen": 0.6625460982322693, "logits/rejected": 0.6690924763679504, "logps/chosen": -214.62106323242188, "logps/ref_chosen": -72.64942169189453, "logps/ref_rejected": -69.8792724609375, "logps/rejected": -276.130126953125, "loss": 1.1673, "margin_dpo/margin_mean": 64.27925109863281, "margin_dpo/margin_std": 121.24957275390625, "step": 604 }, { "KL/chosen_KL_mean": -159.9560546875, "KL/mean": -195.05694580078125, "KL/rejected_KL_mean": -230.1578369140625, "KL/std": 97.04273986816406, "epoch": 0.9145880574452003, "fcm_dpo/beta": 0.005499058403074741, "fcm_dpo/delta": 0.01441466249525547, "fcm_dpo/margin": 70.2017822265625, "fcm_dpo/q_t": 0.4118395149707794, "grad_norm": 15.055326461791992, "learning_rate": 1.1274439638981532e-08, "logits/chosen": 0.7732815742492676, "logits/rejected": 0.7187706232070923, "logps/chosen": -221.56890869140625, "logps/ref_chosen": -61.61284637451172, "logps/ref_rejected": -79.34398651123047, "logps/rejected": -309.5018310546875, "loss": 1.1348, "margin_dpo/margin_mean": 70.2017822265625, "margin_dpo/margin_std": 116.32188415527344, "step": 605 }, { "KL/chosen_KL_mean": -140.86871337890625, "KL/mean": -183.3809356689453, "KL/rejected_KL_mean": -225.8931884765625, "KL/std": 101.25337219238281, "epoch": 0.9160997732426304, "fcm_dpo/beta": 0.00548307690769434, "fcm_dpo/delta": -0.06948762387037277, "fcm_dpo/margin": 85.02445983886719, "fcm_dpo/q_t": 0.39542317390441895, "grad_norm": 15.74163818359375, "learning_rate": 1.0885186502381016e-08, "logits/chosen": 0.6895310282707214, "logits/rejected": 0.6235339045524597, "logps/chosen": -195.33294677734375, "logps/ref_chosen": -54.46424102783203, "logps/ref_rejected": -79.62708282470703, "logps/rejected": -305.520263671875, "loss": 1.0711, "margin_dpo/margin_mean": 85.02445983886719, "margin_dpo/margin_std": 117.96329498291016, "step": 606 }, { "KL/chosen_KL_mean": -156.14984130859375, "KL/mean": -192.8448944091797, "KL/rejected_KL_mean": -229.5399627685547, "KL/std": 98.42378234863281, "epoch": 0.9176114890400605, "fcm_dpo/beta": 0.005396674387156963, "fcm_dpo/delta": 0.002588912844657898, "fcm_dpo/margin": 73.39010620117188, "fcm_dpo/q_t": 0.40936577320098877, "grad_norm": 12.765244483947754, "learning_rate": 1.0502621921127774e-08, "logits/chosen": 0.7129979133605957, "logits/rejected": 0.687645673751831, "logps/chosen": -219.01071166992188, "logps/ref_chosen": -62.86086654663086, "logps/ref_rejected": -72.5501937866211, "logps/rejected": -302.09014892578125, "loss": 1.1243, "margin_dpo/margin_mean": 73.3901138305664, "margin_dpo/margin_std": 113.34681701660156, "step": 607 }, { "KL/chosen_KL_mean": -159.1494140625, "KL/mean": -198.17312622070312, "KL/rejected_KL_mean": -237.19683837890625, "KL/std": 106.50177001953125, "epoch": 0.9191232048374905, "fcm_dpo/beta": 0.005432832054793835, "fcm_dpo/delta": -0.025096310302615166, "fcm_dpo/margin": 78.04742431640625, "fcm_dpo/q_t": 0.40351438522338867, "grad_norm": 12.990138053894043, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 0.7043001651763916, "logits/rejected": 0.6233786344528198, "logps/chosen": -222.33013916015625, "logps/ref_chosen": -63.18071746826172, "logps/ref_rejected": -99.15888214111328, "logps/rejected": -336.355712890625, "loss": 1.089, "margin_dpo/margin_mean": 78.04742431640625, "margin_dpo/margin_std": 109.10804748535156, "step": 608 }, { "KL/chosen_KL_mean": -141.6947021484375, "KL/mean": -183.05978393554688, "KL/rejected_KL_mean": -224.4248809814453, "KL/std": 97.03974914550781, "epoch": 0.9206349206349206, "fcm_dpo/beta": 0.005357364658266306, "fcm_dpo/delta": -0.0457816943526268, "fcm_dpo/margin": 82.7302017211914, "fcm_dpo/q_t": 0.3962804973125458, "grad_norm": 12.888740539550781, "learning_rate": 9.757601041885694e-09, "logits/chosen": 0.7823031544685364, "logits/rejected": 0.7437509894371033, "logps/chosen": -190.3179168701172, "logps/ref_chosen": -48.62322235107422, "logps/ref_rejected": -68.28271484375, "logps/rejected": -292.70758056640625, "loss": 1.0595, "margin_dpo/margin_mean": 82.73019409179688, "margin_dpo/margin_std": 96.67643737792969, "step": 609 }, { "KL/chosen_KL_mean": -154.00808715820312, "KL/mean": -194.78179931640625, "KL/rejected_KL_mean": -235.55548095703125, "KL/std": 106.52471923828125, "epoch": 0.9221466364323507, "fcm_dpo/beta": 0.005320107098668814, "fcm_dpo/delta": -0.035760559141635895, "fcm_dpo/margin": 81.54739379882812, "fcm_dpo/q_t": 0.4024896025657654, "grad_norm": 12.770600318908691, "learning_rate": 9.395165583732379e-09, "logits/chosen": 0.684196949005127, "logits/rejected": 0.6781659126281738, "logps/chosen": -226.6732177734375, "logps/ref_chosen": -72.66513061523438, "logps/ref_rejected": -87.15310668945312, "logps/rejected": -322.7085876464844, "loss": 1.094, "margin_dpo/margin_mean": 81.54739379882812, "margin_dpo/margin_std": 119.98110961914062, "step": 610 }, { "KL/chosen_KL_mean": -143.1591033935547, "KL/mean": -175.167236328125, "KL/rejected_KL_mean": -207.17535400390625, "KL/std": 94.00639343261719, "epoch": 0.9236583522297808, "fcm_dpo/beta": 0.00536438450217247, "fcm_dpo/delta": 0.05863542854785919, "fcm_dpo/margin": 64.0162582397461, "fcm_dpo/q_t": 0.42030617594718933, "grad_norm": 14.609975814819336, "learning_rate": 9.03946036001449e-09, "logits/chosen": 0.7666869163513184, "logits/rejected": 0.7165364623069763, "logps/chosen": -191.46768188476562, "logps/ref_chosen": -48.30857849121094, "logps/ref_rejected": -70.6141128540039, "logps/rejected": -277.7894592285156, "loss": 1.1373, "margin_dpo/margin_mean": 64.0162582397461, "margin_dpo/margin_std": 97.14239501953125, "step": 611 }, { "KL/chosen_KL_mean": -156.49105834960938, "KL/mean": -203.49362182617188, "KL/rejected_KL_mean": -250.49618530273438, "KL/std": 101.47637176513672, "epoch": 0.9251700680272109, "fcm_dpo/beta": 0.005310682579874992, "fcm_dpo/delta": -0.10441690683364868, "fcm_dpo/margin": 94.00511169433594, "fcm_dpo/q_t": 0.38476818799972534, "grad_norm": 12.069144248962402, "learning_rate": 8.690495320571839e-09, "logits/chosen": 0.6626961827278137, "logits/rejected": 0.590441107749939, "logps/chosen": -217.72262573242188, "logps/ref_chosen": -61.23155975341797, "logps/ref_rejected": -94.37979888916016, "logps/rejected": -344.8759765625, "loss": 1.0295, "margin_dpo/margin_mean": 94.00511169433594, "margin_dpo/margin_std": 111.62675476074219, "step": 612 }, { "KL/chosen_KL_mean": -135.76759338378906, "KL/mean": -177.13162231445312, "KL/rejected_KL_mean": -218.49566650390625, "KL/std": 106.75114440917969, "epoch": 0.926681783824641, "fcm_dpo/beta": 0.005268789827823639, "fcm_dpo/delta": -0.037613268941640854, "fcm_dpo/margin": 82.72808837890625, "fcm_dpo/q_t": 0.3990470767021179, "grad_norm": 11.488716125488281, "learning_rate": 8.348280226706722e-09, "logits/chosen": 0.6710951328277588, "logits/rejected": 0.6657571196556091, "logps/chosen": -189.75070190429688, "logps/ref_chosen": -53.98310852050781, "logps/ref_rejected": -58.32208251953125, "logps/rejected": -276.8177490234375, "loss": 1.0762, "margin_dpo/margin_mean": 82.72808074951172, "margin_dpo/margin_std": 109.206787109375, "step": 613 }, { "KL/chosen_KL_mean": -155.44308471679688, "KL/mean": -194.57798767089844, "KL/rejected_KL_mean": -233.712890625, "KL/std": 93.10488891601562, "epoch": 0.9281934996220711, "fcm_dpo/beta": 0.005259955767542124, "fcm_dpo/delta": -0.012738246470689774, "fcm_dpo/margin": 78.2697982788086, "fcm_dpo/q_t": 0.40427324175834656, "grad_norm": 14.937153816223145, "learning_rate": 8.012824650910937e-09, "logits/chosen": 0.7486348152160645, "logits/rejected": 0.7401151657104492, "logps/chosen": -215.68612670898438, "logps/ref_chosen": -60.24303436279297, "logps/ref_rejected": -72.26258850097656, "logps/rejected": -305.9754638671875, "loss": 1.0934, "margin_dpo/margin_mean": 78.2697982788086, "margin_dpo/margin_std": 106.5931396484375, "step": 614 }, { "KL/chosen_KL_mean": -164.6011962890625, "KL/mean": -205.89425659179688, "KL/rejected_KL_mean": -247.18734741210938, "KL/std": 118.27519226074219, "epoch": 0.9297052154195011, "fcm_dpo/beta": 0.005172071512788534, "fcm_dpo/delta": -0.02936476096510887, "fcm_dpo/margin": 82.58613586425781, "fcm_dpo/q_t": 0.4036427140235901, "grad_norm": 13.221552848815918, "learning_rate": 7.684137976598088e-09, "logits/chosen": 0.7252025604248047, "logits/rejected": 0.6627861857414246, "logps/chosen": -236.69586181640625, "logps/ref_chosen": -72.09467315673828, "logps/ref_rejected": -104.02980041503906, "logps/rejected": -351.2171630859375, "loss": 1.1122, "margin_dpo/margin_mean": 82.58614349365234, "margin_dpo/margin_std": 130.2043914794922, "step": 615 }, { "KL/chosen_KL_mean": -155.0189208984375, "KL/mean": -188.56732177734375, "KL/rejected_KL_mean": -222.11569213867188, "KL/std": 110.29572296142578, "epoch": 0.9312169312169312, "fcm_dpo/beta": 0.005230366718024015, "fcm_dpo/delta": 0.050875380635261536, "fcm_dpo/margin": 67.09677124023438, "fcm_dpo/q_t": 0.41983866691589355, "grad_norm": 12.850919723510742, "learning_rate": 7.36222939784098e-09, "logits/chosen": 0.735611081123352, "logits/rejected": 0.6595158576965332, "logps/chosen": -213.54965209960938, "logps/ref_chosen": -58.530723571777344, "logps/ref_rejected": -75.48025512695312, "logps/rejected": -297.595947265625, "loss": 1.1427, "margin_dpo/margin_mean": 67.09676361083984, "margin_dpo/margin_std": 108.70661926269531, "step": 616 }, { "KL/chosen_KL_mean": -164.60107421875, "KL/mean": -195.19232177734375, "KL/rejected_KL_mean": -225.7835693359375, "KL/std": 101.45540618896484, "epoch": 0.9327286470143613, "fcm_dpo/beta": 0.005212991964071989, "fcm_dpo/delta": -0.035686977207660675, "fcm_dpo/margin": 61.1824951171875, "fcm_dpo/q_t": 0.4273007810115814, "grad_norm": 16.660669326782227, "learning_rate": 7.047107919114586e-09, "logits/chosen": 0.7245724201202393, "logits/rejected": 0.6729332208633423, "logps/chosen": -222.20974731445312, "logps/ref_chosen": -57.608673095703125, "logps/ref_rejected": -81.22109985351562, "logps/rejected": -307.0046691894531, "loss": 1.1743, "margin_dpo/margin_mean": 61.1824951171875, "margin_dpo/margin_std": 108.80859375, "step": 617 }, { "KL/chosen_KL_mean": -150.4832763671875, "KL/mean": -187.10198974609375, "KL/rejected_KL_mean": -223.72068786621094, "KL/std": 110.5191421508789, "epoch": 0.9342403628117913, "fcm_dpo/beta": 0.005213525611907244, "fcm_dpo/delta": 0.01879797875881195, "fcm_dpo/margin": 73.23739624023438, "fcm_dpo/q_t": 0.4143332540988922, "grad_norm": 16.433107376098633, "learning_rate": 6.738782355044048e-09, "logits/chosen": 0.7141435146331787, "logits/rejected": 0.6125196814537048, "logps/chosen": -207.17921447753906, "logps/ref_chosen": -56.69594192504883, "logps/ref_rejected": -85.92362976074219, "logps/rejected": -309.6443176269531, "loss": 1.1188, "margin_dpo/margin_mean": 73.2374038696289, "margin_dpo/margin_std": 111.80946350097656, "step": 618 }, { "KL/chosen_KL_mean": -144.3744659423828, "KL/mean": -184.07183837890625, "KL/rejected_KL_mean": -223.76919555664062, "KL/std": 105.90087890625, "epoch": 0.9357520786092215, "fcm_dpo/beta": 0.0052141789346933365, "fcm_dpo/delta": -0.014639066532254219, "fcm_dpo/margin": 79.39471435546875, "fcm_dpo/q_t": 0.40503403544425964, "grad_norm": 13.595865249633789, "learning_rate": 6.437261330158206e-09, "logits/chosen": 0.7946481704711914, "logits/rejected": 0.7174685001373291, "logps/chosen": -198.43289184570312, "logps/ref_chosen": -54.05841827392578, "logps/ref_rejected": -83.55493927001953, "logps/rejected": -307.3241271972656, "loss": 1.1006, "margin_dpo/margin_mean": 79.39472961425781, "margin_dpo/margin_std": 116.74202728271484, "step": 619 }, { "KL/chosen_KL_mean": -148.53500366210938, "KL/mean": -179.40289306640625, "KL/rejected_KL_mean": -210.27078247070312, "KL/std": 95.71381378173828, "epoch": 0.9372637944066515, "fcm_dpo/beta": 0.005186089780181646, "fcm_dpo/delta": -0.0602385550737381, "fcm_dpo/margin": 61.73577117919922, "fcm_dpo/q_t": 0.42560505867004395, "grad_norm": 13.164807319641113, "learning_rate": 6.142553278648238e-09, "logits/chosen": 0.7316728830337524, "logits/rejected": 0.7319644093513489, "logps/chosen": -211.90472412109375, "logps/ref_chosen": -63.36971664428711, "logps/ref_rejected": -65.68269348144531, "logps/rejected": -275.9534912109375, "loss": 1.1679, "margin_dpo/margin_mean": 61.73577117919922, "margin_dpo/margin_std": 101.75544738769531, "step": 620 }, { "KL/chosen_KL_mean": -146.6448516845703, "KL/mean": -181.1959991455078, "KL/rejected_KL_mean": -215.74716186523438, "KL/std": 102.8896484375, "epoch": 0.9387755102040817, "fcm_dpo/beta": 0.005201369524002075, "fcm_dpo/delta": 0.04181717336177826, "fcm_dpo/margin": 69.10228729248047, "fcm_dpo/q_t": 0.41966116428375244, "grad_norm": 13.882325172424316, "learning_rate": 5.854666444131934e-09, "logits/chosen": 0.7830545902252197, "logits/rejected": 0.6678333282470703, "logps/chosen": -198.96607971191406, "logps/ref_chosen": -52.321224212646484, "logps/ref_rejected": -88.09001159667969, "logps/rejected": -303.837158203125, "loss": 1.1548, "margin_dpo/margin_mean": 69.102294921875, "margin_dpo/margin_std": 122.76210021972656, "step": 621 }, { "KL/chosen_KL_mean": -161.6434326171875, "KL/mean": -197.0876007080078, "KL/rejected_KL_mean": -232.53176879882812, "KL/std": 107.89146423339844, "epoch": 0.9402872260015117, "fcm_dpo/beta": 0.005237799137830734, "fcm_dpo/delta": 0.029504524543881416, "fcm_dpo/margin": 70.88835144042969, "fcm_dpo/q_t": 0.4152719974517822, "grad_norm": 16.072410583496094, "learning_rate": 5.573608879422875e-09, "logits/chosen": 0.6776634454727173, "logits/rejected": 0.6312199831008911, "logps/chosen": -221.50888061523438, "logps/ref_chosen": -59.86545944213867, "logps/ref_rejected": -81.86668395996094, "logps/rejected": -314.3984375, "loss": 1.1328, "margin_dpo/margin_mean": 70.88835144042969, "margin_dpo/margin_std": 112.87138366699219, "step": 622 }, { "KL/chosen_KL_mean": -170.9632110595703, "KL/mean": -207.90390014648438, "KL/rejected_KL_mean": -244.8446044921875, "KL/std": 113.174560546875, "epoch": 0.9417989417989417, "fcm_dpo/beta": 0.005216827616095543, "fcm_dpo/delta": 0.014805559068918228, "fcm_dpo/margin": 73.88140106201172, "fcm_dpo/q_t": 0.41209039092063904, "grad_norm": 14.516220092773438, "learning_rate": 5.299388446305342e-09, "logits/chosen": 0.7114442586898804, "logits/rejected": 0.6579437255859375, "logps/chosen": -238.3316650390625, "logps/ref_chosen": -67.36846160888672, "logps/ref_rejected": -82.02733612060547, "logps/rejected": -326.8719482421875, "loss": 1.1281, "margin_dpo/margin_mean": 73.88140106201172, "margin_dpo/margin_std": 118.04656982421875, "step": 623 }, { "KL/chosen_KL_mean": -143.55258178710938, "KL/mean": -184.1567840576172, "KL/rejected_KL_mean": -224.76101684570312, "KL/std": 107.9732437133789, "epoch": 0.9433106575963719, "fcm_dpo/beta": 0.005214087665081024, "fcm_dpo/delta": -0.02460940182209015, "fcm_dpo/margin": 81.20841979980469, "fcm_dpo/q_t": 0.4050566554069519, "grad_norm": 15.29123592376709, "learning_rate": 5.03201281531429e-09, "logits/chosen": 0.7389147877693176, "logits/rejected": 0.6430974006652832, "logps/chosen": -194.57913208007812, "logps/ref_chosen": -51.02655029296875, "logps/ref_rejected": -76.49203491210938, "logps/rejected": -301.2530517578125, "loss": 1.0985, "margin_dpo/margin_mean": 81.20842742919922, "margin_dpo/margin_std": 120.22374725341797, "step": 624 }, { "KL/chosen_KL_mean": -145.1616973876953, "KL/mean": -177.22189331054688, "KL/rejected_KL_mean": -209.28207397460938, "KL/std": 102.99531555175781, "epoch": 0.9448223733938019, "fcm_dpo/beta": 0.005266295745968819, "fcm_dpo/delta": 0.06453034281730652, "fcm_dpo/margin": 64.12037658691406, "fcm_dpo/q_t": 0.42570820450782776, "grad_norm": 12.769986152648926, "learning_rate": 4.7714894655209174e-09, "logits/chosen": 0.7937861084938049, "logits/rejected": 0.7034215331077576, "logps/chosen": -199.36932373046875, "logps/ref_chosen": -54.20761489868164, "logps/ref_rejected": -84.93669128417969, "logps/rejected": -294.21875, "loss": 1.1776, "margin_dpo/margin_mean": 64.12036895751953, "margin_dpo/margin_std": 125.11459350585938, "step": 625 }, { "KL/chosen_KL_mean": -138.48724365234375, "KL/mean": -181.26748657226562, "KL/rejected_KL_mean": -224.0477294921875, "KL/std": 108.09788513183594, "epoch": 0.9463340891912321, "fcm_dpo/beta": 0.005240323953330517, "fcm_dpo/delta": -0.05072672292590141, "fcm_dpo/margin": 85.56048583984375, "fcm_dpo/q_t": 0.4015912413597107, "grad_norm": 13.545746803283691, "learning_rate": 4.517825684323323e-09, "logits/chosen": 0.8476173877716064, "logits/rejected": 0.7096439599990845, "logps/chosen": -183.5492706298828, "logps/ref_chosen": -45.06201934814453, "logps/ref_rejected": -89.66368103027344, "logps/rejected": -313.71142578125, "loss": 1.094, "margin_dpo/margin_mean": 85.56048583984375, "margin_dpo/margin_std": 132.3420867919922, "step": 626 }, { "KL/chosen_KL_mean": -158.35194396972656, "KL/mean": -204.31967163085938, "KL/rejected_KL_mean": -250.28741455078125, "KL/std": 112.77745056152344, "epoch": 0.9478458049886621, "fcm_dpo/beta": 0.005159787833690643, "fcm_dpo/delta": -0.07838514447212219, "fcm_dpo/margin": 91.93545532226562, "fcm_dpo/q_t": 0.39006662368774414, "grad_norm": 14.823331832885742, "learning_rate": 4.271028567242818e-09, "logits/chosen": 0.6601865291595459, "logits/rejected": 0.5414648056030273, "logps/chosen": -217.14300537109375, "logps/ref_chosen": -58.791053771972656, "logps/ref_rejected": -94.90802001953125, "logps/rejected": -345.1954345703125, "loss": 1.0496, "margin_dpo/margin_mean": 91.93545532226562, "margin_dpo/margin_std": 114.65377807617188, "step": 627 }, { "KL/chosen_KL_mean": -143.81678771972656, "KL/mean": -187.36337280273438, "KL/rejected_KL_mean": -230.9099578857422, "KL/std": 105.25289916992188, "epoch": 0.9493575207860923, "fcm_dpo/beta": 0.005149932112544775, "fcm_dpo/delta": -0.05151147022843361, "fcm_dpo/margin": 87.09317016601562, "fcm_dpo/q_t": 0.3962075710296631, "grad_norm": 14.854714393615723, "learning_rate": 4.0311050177251895e-09, "logits/chosen": 0.7292068004608154, "logits/rejected": 0.6884187459945679, "logps/chosen": -196.620361328125, "logps/ref_chosen": -52.80357360839844, "logps/ref_rejected": -76.49468994140625, "logps/rejected": -307.4046630859375, "loss": 1.0907, "margin_dpo/margin_mean": 87.09317016601562, "margin_dpo/margin_std": 120.31452941894531, "step": 628 }, { "KL/chosen_KL_mean": -156.3859100341797, "KL/mean": -188.27789306640625, "KL/rejected_KL_mean": -220.16989135742188, "KL/std": 98.98799133300781, "epoch": 0.9508692365835223, "fcm_dpo/beta": 0.005165508016943932, "fcm_dpo/delta": 0.07267770171165466, "fcm_dpo/margin": 63.783966064453125, "fcm_dpo/q_t": 0.42540132999420166, "grad_norm": 14.011495590209961, "learning_rate": 3.798061746947995e-09, "logits/chosen": 0.7398512363433838, "logits/rejected": 0.7367151379585266, "logps/chosen": -227.10340881347656, "logps/ref_chosen": -70.71749877929688, "logps/ref_rejected": -78.96273803710938, "logps/rejected": -299.13262939453125, "loss": 1.1541, "margin_dpo/margin_mean": 63.78396987915039, "margin_dpo/margin_std": 105.56842041015625, "step": 629 }, { "KL/chosen_KL_mean": -144.41912841796875, "KL/mean": -188.89906311035156, "KL/rejected_KL_mean": -233.3789825439453, "KL/std": 106.57427215576172, "epoch": 0.9523809523809523, "fcm_dpo/beta": 0.005139235407114029, "fcm_dpo/delta": -0.05989410728216171, "fcm_dpo/margin": 88.9598617553711, "fcm_dpo/q_t": 0.39508897066116333, "grad_norm": 11.034381866455078, "learning_rate": 3.5719052736323806e-09, "logits/chosen": 0.7051947116851807, "logits/rejected": 0.6615438461303711, "logps/chosen": -200.62054443359375, "logps/ref_chosen": -56.201412200927734, "logps/ref_rejected": -74.69807434082031, "logps/rejected": -308.0770568847656, "loss": 1.0604, "margin_dpo/margin_mean": 88.95986938476562, "margin_dpo/margin_std": 113.7017593383789, "step": 630 }, { "KL/chosen_KL_mean": -139.29513549804688, "KL/mean": -188.87086486816406, "KL/rejected_KL_mean": -238.44659423828125, "KL/std": 113.19669342041016, "epoch": 0.9538926681783825, "fcm_dpo/beta": 0.004996112547814846, "fcm_dpo/delta": -0.10234710574150085, "fcm_dpo/margin": 99.15147399902344, "fcm_dpo/q_t": 0.3882313072681427, "grad_norm": 13.787875175476074, "learning_rate": 3.352641923861144e-09, "logits/chosen": 0.8040734529495239, "logits/rejected": 0.6909521818161011, "logps/chosen": -198.11572265625, "logps/ref_chosen": -58.82059860229492, "logps/ref_rejected": -96.51437377929688, "logps/rejected": -334.96099853515625, "loss": 1.0453, "margin_dpo/margin_mean": 99.15147399902344, "margin_dpo/margin_std": 124.88319396972656, "step": 631 }, { "KL/chosen_KL_mean": -141.20208740234375, "KL/mean": -187.02703857421875, "KL/rejected_KL_mean": -232.85202026367188, "KL/std": 99.86505126953125, "epoch": 0.9554043839758125, "fcm_dpo/beta": 0.004978152923285961, "fcm_dpo/delta": -0.058926571160554886, "fcm_dpo/margin": 91.64991760253906, "fcm_dpo/q_t": 0.39418140053749084, "grad_norm": 13.233796119689941, "learning_rate": 3.140277830901428e-09, "logits/chosen": 0.75133216381073, "logits/rejected": 0.7323135137557983, "logps/chosen": -199.98814392089844, "logps/ref_chosen": -58.786048889160156, "logps/ref_rejected": -67.21923828125, "logps/rejected": -300.0712585449219, "loss": 1.0529, "margin_dpo/margin_mean": 91.64991760253906, "margin_dpo/margin_std": 110.22685241699219, "step": 632 }, { "KL/chosen_KL_mean": -135.2391357421875, "KL/mean": -170.4383544921875, "KL/rejected_KL_mean": -205.63760375976562, "KL/std": 95.1596450805664, "epoch": 0.9569160997732427, "fcm_dpo/beta": 0.005002975929528475, "fcm_dpo/delta": 0.049144282937049866, "fcm_dpo/margin": 70.39846801757812, "fcm_dpo/q_t": 0.4198831021785736, "grad_norm": 14.44809341430664, "learning_rate": 2.9348189350335007e-09, "logits/chosen": 0.7198129892349243, "logits/rejected": 0.6612646579742432, "logps/chosen": -187.36932373046875, "logps/ref_chosen": -52.13019561767578, "logps/ref_rejected": -67.23016357421875, "logps/rejected": -272.86773681640625, "loss": 1.1362, "margin_dpo/margin_mean": 70.39846801757812, "margin_dpo/margin_std": 108.93657684326172, "step": 633 }, { "KL/chosen_KL_mean": -163.69000244140625, "KL/mean": -178.94308471679688, "KL/rejected_KL_mean": -194.19619750976562, "KL/std": 99.78311157226562, "epoch": 0.9584278155706727, "fcm_dpo/beta": 0.0049985796213150024, "fcm_dpo/delta": 0.0, "fcm_dpo/margin": 30.50618553161621, "fcm_dpo/q_t": 0.46565011143684387, "grad_norm": 15.874098777770996, "learning_rate": 2.736270983384276e-09, "logits/chosen": 0.7957136631011963, "logits/rejected": 0.8110651969909668, "logps/chosen": -224.6697998046875, "logps/ref_chosen": -60.97979736328125, "logps/ref_rejected": -58.50825119018555, "logps/rejected": -252.70445251464844, "loss": 1.3189, "margin_dpo/margin_mean": 30.506187438964844, "margin_dpo/margin_std": 114.90726470947266, "step": 634 }, { "KL/chosen_KL_mean": -180.7266845703125, "KL/mean": -210.17645263671875, "KL/rejected_KL_mean": -239.62625122070312, "KL/std": 99.87394714355469, "epoch": 0.9599395313681028, "fcm_dpo/beta": 0.005090565420687199, "fcm_dpo/delta": 0.10279709845781326, "fcm_dpo/margin": 58.899574279785156, "fcm_dpo/q_t": 0.4319508671760559, "grad_norm": 12.962136268615723, "learning_rate": 2.5446395297668287e-09, "logits/chosen": 0.6403921842575073, "logits/rejected": 0.5827170610427856, "logps/chosen": -246.69969177246094, "logps/ref_chosen": -65.9730224609375, "logps/ref_rejected": -85.61317443847656, "logps/rejected": -325.2394104003906, "loss": 1.1999, "margin_dpo/margin_mean": 58.89957809448242, "margin_dpo/margin_std": 120.45620727539062, "step": 635 }, { "KL/chosen_KL_mean": -140.232666015625, "KL/mean": -180.35409545898438, "KL/rejected_KL_mean": -220.4755401611328, "KL/std": 100.5526123046875, "epoch": 0.9614512471655329, "fcm_dpo/beta": 0.005084035452455282, "fcm_dpo/delta": -0.008429108187556267, "fcm_dpo/margin": 80.24287414550781, "fcm_dpo/q_t": 0.4051709771156311, "grad_norm": 11.293302536010742, "learning_rate": 2.359929934524829e-09, "logits/chosen": 0.7284525632858276, "logits/rejected": 0.6281647682189941, "logps/chosen": -189.37283325195312, "logps/ref_chosen": -49.140167236328125, "logps/ref_rejected": -81.26971435546875, "logps/rejected": -301.7452392578125, "loss": 1.0911, "margin_dpo/margin_mean": 80.24287414550781, "margin_dpo/margin_std": 107.7694091796875, "step": 636 }, { "KL/chosen_KL_mean": -187.85205078125, "KL/mean": -216.92355346679688, "KL/rejected_KL_mean": -245.99508666992188, "KL/std": 101.05802917480469, "epoch": 0.9629629629629629, "fcm_dpo/beta": 0.005187293514609337, "fcm_dpo/delta": 0.10094261169433594, "fcm_dpo/margin": 58.14302062988281, "fcm_dpo/q_t": 0.432273268699646, "grad_norm": 13.91185474395752, "learning_rate": 2.1821473643827137e-09, "logits/chosen": 0.6850996613502502, "logits/rejected": 0.6232686042785645, "logps/chosen": -261.54864501953125, "logps/ref_chosen": -73.69658660888672, "logps/ref_rejected": -83.01487731933594, "logps/rejected": -329.00994873046875, "loss": 1.2093, "margin_dpo/margin_mean": 58.14302062988281, "margin_dpo/margin_std": 127.46809387207031, "step": 637 }, { "KL/chosen_KL_mean": -163.23129272460938, "KL/mean": -198.5665283203125, "KL/rejected_KL_mean": -233.90174865722656, "KL/std": 100.63964080810547, "epoch": 0.9644746787603931, "fcm_dpo/beta": 0.005223121494054794, "fcm_dpo/delta": 0.03205978870391846, "fcm_dpo/margin": 70.67045593261719, "fcm_dpo/q_t": 0.41628775000572205, "grad_norm": 12.878883361816406, "learning_rate": 2.0112967923011646e-09, "logits/chosen": 0.7184966802597046, "logits/rejected": 0.6726780533790588, "logps/chosen": -226.01287841796875, "logps/ref_chosen": -62.78158187866211, "logps/ref_rejected": -85.40478515625, "logps/rejected": -319.3065185546875, "loss": 1.1291, "margin_dpo/margin_mean": 70.67045593261719, "margin_dpo/margin_std": 110.60838317871094, "step": 638 }, { "KL/chosen_KL_mean": -144.66845703125, "KL/mean": -187.10421752929688, "KL/rejected_KL_mean": -229.5399627685547, "KL/std": 107.97879028320312, "epoch": 0.9659863945578231, "fcm_dpo/beta": 0.005202891305088997, "fcm_dpo/delta": -0.04347489774227142, "fcm_dpo/margin": 84.87150573730469, "fcm_dpo/q_t": 0.40035775303840637, "grad_norm": 15.735711097717285, "learning_rate": 1.847382997337943e-09, "logits/chosen": 0.707938015460968, "logits/rejected": 0.6151958107948303, "logps/chosen": -198.43505859375, "logps/ref_chosen": -53.76658630371094, "logps/ref_rejected": -72.30009460449219, "logps/rejected": -301.8400573730469, "loss": 1.0791, "margin_dpo/margin_mean": 84.87150573730469, "margin_dpo/margin_std": 117.43939208984375, "step": 639 }, { "KL/chosen_KL_mean": -151.08187866210938, "KL/mean": -189.92904663085938, "KL/rejected_KL_mean": -228.77621459960938, "KL/std": 103.72026062011719, "epoch": 0.9674981103552532, "fcm_dpo/beta": 0.005173161625862122, "fcm_dpo/delta": -0.0021038129925727844, "fcm_dpo/margin": 77.69434356689453, "fcm_dpo/q_t": 0.4079463481903076, "grad_norm": 13.133944511413574, "learning_rate": 1.690410564514244e-09, "logits/chosen": 0.7980025410652161, "logits/rejected": 0.7335871458053589, "logps/chosen": -202.49964904785156, "logps/ref_chosen": -51.41777801513672, "logps/ref_rejected": -77.27879333496094, "logps/rejected": -306.05499267578125, "loss": 1.1059, "margin_dpo/margin_mean": 77.69434356689453, "margin_dpo/margin_std": 113.72712707519531, "step": 640 }, { "KL/chosen_KL_mean": -159.562255859375, "KL/mean": -193.78067016601562, "KL/rejected_KL_mean": -227.99908447265625, "KL/std": 95.99458312988281, "epoch": 0.9690098261526833, "fcm_dpo/beta": 0.005207887850701809, "fcm_dpo/delta": 0.04520513489842415, "fcm_dpo/margin": 68.43684387207031, "fcm_dpo/q_t": 0.4166935086250305, "grad_norm": 14.292564392089844, "learning_rate": 1.5403838846864692e-09, "logits/chosen": 0.7009308934211731, "logits/rejected": 0.6851996779441833, "logps/chosen": -230.61691284179688, "logps/ref_chosen": -71.0546646118164, "logps/ref_rejected": -82.2440185546875, "logps/rejected": -310.24310302734375, "loss": 1.1164, "margin_dpo/margin_mean": 68.43684387207031, "margin_dpo/margin_std": 92.28787994384766, "step": 641 }, { "KL/chosen_KL_mean": -167.956787109375, "KL/mean": -193.73696899414062, "KL/rejected_KL_mean": -219.5171356201172, "KL/std": 93.58724975585938, "epoch": 0.9705215419501134, "fcm_dpo/beta": 0.005239700898528099, "fcm_dpo/delta": 0.008556126616895199, "fcm_dpo/margin": 51.56036376953125, "fcm_dpo/q_t": 0.4381951093673706, "grad_norm": 16.687023162841797, "learning_rate": 1.3973071544233218e-09, "logits/chosen": 0.6564966440200806, "logits/rejected": 0.6771268844604492, "logps/chosen": -236.88604736328125, "logps/ref_chosen": -68.92927551269531, "logps/ref_rejected": -70.85682678222656, "logps/rejected": -290.37396240234375, "loss": 1.2203, "margin_dpo/margin_mean": 51.56036376953125, "margin_dpo/margin_std": 112.46025085449219, "step": 642 }, { "KL/chosen_KL_mean": -152.04673767089844, "KL/mean": -191.3953399658203, "KL/rejected_KL_mean": -230.74392700195312, "KL/std": 101.4207992553711, "epoch": 0.9720332577475435, "fcm_dpo/beta": 0.005223228596150875, "fcm_dpo/delta": -0.011585216037929058, "fcm_dpo/margin": 78.69720458984375, "fcm_dpo/q_t": 0.40665584802627563, "grad_norm": 18.86591911315918, "learning_rate": 1.261184375888541e-09, "logits/chosen": 0.6632203459739685, "logits/rejected": 0.5793460607528687, "logps/chosen": -217.35577392578125, "logps/ref_chosen": -65.30903625488281, "logps/ref_rejected": -83.61613464355469, "logps/rejected": -314.36004638671875, "loss": 1.1091, "margin_dpo/margin_mean": 78.69720458984375, "margin_dpo/margin_std": 119.00251770019531, "step": 643 }, { "KL/chosen_KL_mean": -142.48770141601562, "KL/mean": -169.7671356201172, "KL/rejected_KL_mean": -197.0465850830078, "KL/std": 91.83268737792969, "epoch": 0.9735449735449735, "fcm_dpo/beta": 0.005224712658673525, "fcm_dpo/delta": -0.005476703401654959, "fcm_dpo/margin": 54.558868408203125, "fcm_dpo/q_t": 0.43380093574523926, "grad_norm": 14.273197174072266, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 0.8106564283370972, "logits/rejected": 0.7780516743659973, "logps/chosen": -193.4903106689453, "logps/ref_chosen": -51.002601623535156, "logps/ref_rejected": -64.46372985839844, "logps/rejected": -261.51031494140625, "loss": 1.2084, "margin_dpo/margin_mean": 54.55887222290039, "margin_dpo/margin_std": 112.9139175415039, "step": 644 }, { "KL/chosen_KL_mean": -155.4702606201172, "KL/mean": -188.92922973632812, "KL/rejected_KL_mean": -222.38821411132812, "KL/std": 98.68727111816406, "epoch": 0.9750566893424036, "fcm_dpo/beta": 0.005260554142296314, "fcm_dpo/delta": 0.049726568162441254, "fcm_dpo/margin": 66.9179458618164, "fcm_dpo/q_t": 0.41817063093185425, "grad_norm": 14.965581893920898, "learning_rate": 1.0098157099674987e-09, "logits/chosen": 0.6664207577705383, "logits/rejected": 0.6510252952575684, "logps/chosen": -216.4336700439453, "logps/ref_chosen": -60.963409423828125, "logps/ref_rejected": -69.73353576660156, "logps/rejected": -292.12176513671875, "loss": 1.1305, "margin_dpo/margin_mean": 66.9179458618164, "margin_dpo/margin_std": 99.93928527832031, "step": 645 }, { "KL/chosen_KL_mean": -172.07501220703125, "KL/mean": -203.429931640625, "KL/rejected_KL_mean": -234.78482055664062, "KL/std": 113.03104400634766, "epoch": 0.9765684051398337, "fcm_dpo/beta": 0.005329879932105541, "fcm_dpo/delta": 0.06800927966833115, "fcm_dpo/margin": 62.709808349609375, "fcm_dpo/q_t": 0.4245767891407013, "grad_norm": 12.221449851989746, "learning_rate": 8.945768539031783e-10, "logits/chosen": 0.7537873983383179, "logits/rejected": 0.6941382884979248, "logps/chosen": -234.36508178710938, "logps/ref_chosen": -62.290069580078125, "logps/ref_rejected": -85.54812622070312, "logps/rejected": -320.33294677734375, "loss": 1.1712, "margin_dpo/margin_mean": 62.70981216430664, "margin_dpo/margin_std": 117.16531372070312, "step": 646 }, { "KL/chosen_KL_mean": -169.59872436523438, "KL/mean": -218.59487915039062, "KL/rejected_KL_mean": -267.591064453125, "KL/std": 113.086181640625, "epoch": 0.9780801209372638, "fcm_dpo/beta": 0.00525968661531806, "fcm_dpo/delta": -0.12153247743844986, "fcm_dpo/margin": 97.99230194091797, "fcm_dpo/q_t": 0.38150864839553833, "grad_norm": 14.665154457092285, "learning_rate": 7.863060120144316e-10, "logits/chosen": 0.7330044507980347, "logits/rejected": 0.6367726922035217, "logps/chosen": -237.11459350585938, "logps/ref_chosen": -67.515869140625, "logps/ref_rejected": -101.50871276855469, "logps/rejected": -369.0997314453125, "loss": 1.0151, "margin_dpo/margin_mean": 97.99229431152344, "margin_dpo/margin_std": 110.91431427001953, "step": 647 }, { "KL/chosen_KL_mean": -168.221923828125, "KL/mean": -202.50161743164062, "KL/rejected_KL_mean": -236.78131103515625, "KL/std": 98.9996566772461, "epoch": 0.9795918367346939, "fcm_dpo/beta": 0.005227650515735149, "fcm_dpo/delta": 0.04291488975286484, "fcm_dpo/margin": 68.55940246582031, "fcm_dpo/q_t": 0.4196903705596924, "grad_norm": 13.943243026733398, "learning_rate": 6.850062128694045e-10, "logits/chosen": 0.666149377822876, "logits/rejected": 0.6047707796096802, "logps/chosen": -232.81785583496094, "logps/ref_chosen": -64.59593963623047, "logps/ref_rejected": -83.384033203125, "logps/rejected": -320.16534423828125, "loss": 1.1715, "margin_dpo/margin_mean": 68.55940246582031, "margin_dpo/margin_std": 128.8475341796875, "step": 648 }, { "KL/chosen_KL_mean": -151.02784729003906, "KL/mean": -187.35830688476562, "KL/rejected_KL_mean": -223.68875122070312, "KL/std": 97.62661743164062, "epoch": 0.981103552532124, "fcm_dpo/beta": 0.0052569955587387085, "fcm_dpo/delta": 0.018501881510019302, "fcm_dpo/margin": 72.66088104248047, "fcm_dpo/q_t": 0.4125017821788788, "grad_norm": 17.302661895751953, "learning_rate": 5.906802900412788e-10, "logits/chosen": 0.7981792688369751, "logits/rejected": 0.7338933944702148, "logps/chosen": -200.33749389648438, "logps/ref_chosen": -49.30964660644531, "logps/ref_rejected": -73.73710632324219, "logps/rejected": -297.42584228515625, "loss": 1.1293, "margin_dpo/margin_mean": 72.66088104248047, "margin_dpo/margin_std": 115.62672424316406, "step": 649 }, { "KL/chosen_KL_mean": -148.79168701171875, "KL/mean": -185.49697875976562, "KL/rejected_KL_mean": -222.20228576660156, "KL/std": 98.9957504272461, "epoch": 0.982615268329554, "fcm_dpo/beta": 0.005280426703393459, "fcm_dpo/delta": 0.012782499194145203, "fcm_dpo/margin": 73.41058349609375, "fcm_dpo/q_t": 0.41188716888427734, "grad_norm": 13.39968490600586, "learning_rate": 5.033308820289184e-10, "logits/chosen": 0.8054826259613037, "logits/rejected": 0.7400684952735901, "logps/chosen": -203.85494995117188, "logps/ref_chosen": -55.06325912475586, "logps/ref_rejected": -77.39610290527344, "logps/rejected": -299.598388671875, "loss": 1.1313, "margin_dpo/margin_mean": 73.41058349609375, "margin_dpo/margin_std": 119.93722534179688, "step": 650 }, { "KL/chosen_KL_mean": -147.55654907226562, "KL/mean": -179.268310546875, "KL/rejected_KL_mean": -210.98008728027344, "KL/std": 99.23576354980469, "epoch": 0.9841269841269841, "fcm_dpo/beta": 0.005356069654226303, "fcm_dpo/delta": 0.062210820615291595, "fcm_dpo/margin": 63.42353820800781, "fcm_dpo/q_t": 0.4238058924674988, "grad_norm": 12.844218254089355, "learning_rate": 4.2296043218295606e-10, "logits/chosen": 0.7807134389877319, "logits/rejected": 0.7063891887664795, "logps/chosen": -201.6217041015625, "logps/ref_chosen": -54.065162658691406, "logps/ref_rejected": -77.79080200195312, "logps/rejected": -288.7708740234375, "loss": 1.1504, "margin_dpo/margin_mean": 63.42353820800781, "margin_dpo/margin_std": 105.43598937988281, "step": 651 }, { "KL/chosen_KL_mean": -174.65106201171875, "KL/mean": -204.90283203125, "KL/rejected_KL_mean": -235.15463256835938, "KL/std": 108.58076477050781, "epoch": 0.9856386999244142, "fcm_dpo/beta": 0.005346385296434164, "fcm_dpo/delta": -0.014769421890377998, "fcm_dpo/margin": 60.503578186035156, "fcm_dpo/q_t": 0.42873865365982056, "grad_norm": 14.795418739318848, "learning_rate": 3.4957118863768176e-10, "logits/chosen": 0.7643985748291016, "logits/rejected": 0.7152303457260132, "logps/chosen": -238.29135131835938, "logps/ref_chosen": -63.64030456542969, "logps/ref_rejected": -78.86882019042969, "logps/rejected": -314.0234375, "loss": 1.2016, "margin_dpo/margin_mean": 60.503578186035156, "margin_dpo/margin_std": 130.01535034179688, "step": 652 }, { "KL/chosen_KL_mean": -153.60595703125, "KL/mean": -189.78741455078125, "KL/rejected_KL_mean": -225.9688720703125, "KL/std": 97.430908203125, "epoch": 0.9871504157218443, "fcm_dpo/beta": 0.005368629936128855, "fcm_dpo/delta": 0.011830903589725494, "fcm_dpo/margin": 72.36293029785156, "fcm_dpo/q_t": 0.4113900065422058, "grad_norm": 14.955126762390137, "learning_rate": 2.831652042480093e-10, "logits/chosen": 0.7443436980247498, "logits/rejected": 0.6979455351829529, "logps/chosen": -215.27432250976562, "logps/ref_chosen": -61.668373107910156, "logps/ref_rejected": -73.83012390136719, "logps/rejected": -299.79901123046875, "loss": 1.1247, "margin_dpo/margin_mean": 72.36292266845703, "margin_dpo/margin_std": 114.2525634765625, "step": 653 }, { "KL/chosen_KL_mean": -161.9186248779297, "KL/mean": -194.57980346679688, "KL/rejected_KL_mean": -227.24098205566406, "KL/std": 113.89588928222656, "epoch": 0.9886621315192744, "fcm_dpo/beta": 0.005293640773743391, "fcm_dpo/delta": -0.061403125524520874, "fcm_dpo/margin": 65.32235717773438, "fcm_dpo/q_t": 0.4208984076976776, "grad_norm": 13.296249389648438, "learning_rate": 2.2374433653205016e-10, "logits/chosen": 0.7252536416053772, "logits/rejected": 0.624152660369873, "logps/chosen": -219.4868927001953, "logps/ref_chosen": -57.568267822265625, "logps/ref_rejected": -87.74789428710938, "logps/rejected": -314.9888916015625, "loss": 1.1519, "margin_dpo/margin_mean": 65.3223648071289, "margin_dpo/margin_std": 105.86907196044922, "step": 654 }, { "KL/chosen_KL_mean": -126.71316528320312, "KL/mean": -176.79617309570312, "KL/rejected_KL_mean": -226.87921142578125, "KL/std": 101.05697631835938, "epoch": 0.9901738473167044, "fcm_dpo/beta": 0.005152501165866852, "fcm_dpo/delta": -0.124945729970932, "fcm_dpo/margin": 100.16602325439453, "fcm_dpo/q_t": 0.3801180124282837, "grad_norm": 12.216167449951172, "learning_rate": 1.7131024761923852e-10, "logits/chosen": 0.6825774908065796, "logits/rejected": 0.5976626873016357, "logps/chosen": -178.86032104492188, "logps/ref_chosen": -52.14714813232422, "logps/ref_rejected": -80.85014343261719, "logps/rejected": -307.7293395996094, "loss": 1.001, "margin_dpo/margin_mean": 100.16602325439453, "margin_dpo/margin_std": 98.94831848144531, "step": 655 }, { "KL/chosen_KL_mean": -166.34686279296875, "KL/mean": -202.80499267578125, "KL/rejected_KL_mean": -239.26312255859375, "KL/std": 101.93389892578125, "epoch": 0.9916855631141346, "fcm_dpo/beta": 0.005183545872569084, "fcm_dpo/delta": 0.02288138121366501, "fcm_dpo/margin": 72.91627502441406, "fcm_dpo/q_t": 0.4133659601211548, "grad_norm": 10.842991828918457, "learning_rate": 1.2586440420372934e-10, "logits/chosen": 0.6511036157608032, "logits/rejected": 0.6055686473846436, "logps/chosen": -239.60357666015625, "logps/ref_chosen": -73.25672912597656, "logps/ref_rejected": -85.35127258300781, "logps/rejected": -324.6143798828125, "loss": 1.1257, "margin_dpo/margin_mean": 72.91627502441406, "margin_dpo/margin_std": 113.99592590332031, "step": 656 }, { "KL/chosen_KL_mean": -148.92398071289062, "KL/mean": -193.31072998046875, "KL/rejected_KL_mean": -237.69747924804688, "KL/std": 113.50834655761719, "epoch": 0.9931972789115646, "fcm_dpo/beta": 0.0051452526822686195, "fcm_dpo/delta": -0.05944373458623886, "fcm_dpo/margin": 88.77348327636719, "fcm_dpo/q_t": 0.3963528871536255, "grad_norm": 10.741239547729492, "learning_rate": 8.740807750345913e-11, "logits/chosen": 0.8246313333511353, "logits/rejected": 0.7395132780075073, "logps/chosen": -198.6473846435547, "logps/ref_chosen": -49.72339630126953, "logps/ref_rejected": -75.1568603515625, "logps/rejected": -312.8543395996094, "loss": 1.0758, "margin_dpo/margin_mean": 88.77348327636719, "margin_dpo/margin_std": 124.34440612792969, "step": 657 }, { "KL/chosen_KL_mean": -158.15025329589844, "KL/mean": -194.25250244140625, "KL/rejected_KL_mean": -230.35475158691406, "KL/std": 115.25859832763672, "epoch": 0.9947089947089947, "fcm_dpo/beta": 0.00517710205167532, "fcm_dpo/delta": 0.026284661144018173, "fcm_dpo/margin": 72.20449829101562, "fcm_dpo/q_t": 0.4157407581806183, "grad_norm": 11.579418182373047, "learning_rate": 5.594234322453539e-11, "logits/chosen": 0.7650084495544434, "logits/rejected": 0.7185569405555725, "logps/chosen": -221.19659423828125, "logps/ref_chosen": -63.04634094238281, "logps/ref_rejected": -83.44963073730469, "logps/rejected": -313.80438232421875, "loss": 1.162, "margin_dpo/margin_mean": 72.20449829101562, "margin_dpo/margin_std": 134.1488037109375, "step": 658 }, { "KL/chosen_KL_mean": -159.69915771484375, "KL/mean": -188.3558349609375, "KL/rejected_KL_mean": -217.01248168945312, "KL/std": 101.98857116699219, "epoch": 0.9962207105064248, "fcm_dpo/beta": 0.005153452977538109, "fcm_dpo/delta": 0.001027753110975027, "fcm_dpo/margin": 57.313316345214844, "fcm_dpo/q_t": 0.431816428899765, "grad_norm": 16.712438583374023, "learning_rate": 3.146808153123293e-11, "logits/chosen": 0.8149210214614868, "logits/rejected": 0.7490121126174927, "logps/chosen": -214.77935791015625, "logps/ref_chosen": -55.0802001953125, "logps/ref_rejected": -71.91049194335938, "logps/rejected": -288.9229736328125, "loss": 1.2014, "margin_dpo/margin_mean": 57.313316345214844, "margin_dpo/margin_std": 116.82188415527344, "step": 659 }, { "KL/chosen_KL_mean": -151.657958984375, "KL/mean": -196.70738220214844, "KL/rejected_KL_mean": -241.75680541992188, "KL/std": 109.42488098144531, "epoch": 0.9977324263038548, "fcm_dpo/beta": 0.005107267759740353, "fcm_dpo/delta": -0.06304004788398743, "fcm_dpo/margin": 90.09884643554688, "fcm_dpo/q_t": 0.39589014649391174, "grad_norm": 12.976646423339844, "learning_rate": 1.3985977021235829e-11, "logits/chosen": 0.8472395539283752, "logits/rejected": 0.7774761915206909, "logps/chosen": -206.18386840820312, "logps/ref_chosen": -54.525917053222656, "logps/ref_rejected": -81.23604583740234, "logps/rejected": -322.99285888671875, "loss": 1.0573, "margin_dpo/margin_mean": 90.09884643554688, "margin_dpo/margin_std": 116.89100646972656, "step": 660 }, { "KL/chosen_KL_mean": -175.95584106445312, "KL/mean": -204.52122497558594, "KL/rejected_KL_mean": -233.08660888671875, "KL/std": 109.93765258789062, "epoch": 0.999244142101285, "fcm_dpo/beta": 0.005181189626455307, "fcm_dpo/delta": 0.10686805099248886, "fcm_dpo/margin": 57.130775451660156, "fcm_dpo/q_t": 0.4327910542488098, "grad_norm": 12.338637351989746, "learning_rate": 3.4965187065971735e-12, "logits/chosen": 0.7141545414924622, "logits/rejected": 0.6361143589019775, "logps/chosen": -236.32847595214844, "logps/ref_chosen": -60.37263870239258, "logps/ref_rejected": -77.42874145507812, "logps/rejected": -310.515380859375, "loss": 1.2135, "margin_dpo/margin_mean": 57.13077926635742, "margin_dpo/margin_std": 126.87828063964844, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.1452037545087297, "train_runtime": 1649.729, "train_samples_per_second": 25.662, "train_steps_per_second": 0.401 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }