{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 300, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "grad_norm": 2.8220584392547607, "learning_rate": 0.0, "logits/chosen": 0.15524622797966003, "logits/rejected": 0.17812994122505188, "logps/chosen": -72.87983703613281, "logps/ref_chosen": -72.91278839111328, "logps/ref_rejected": -47.538291931152344, "logps/rejected": -47.55305480957031, "loss": 1.3863, "margin_dpo/margin_mean": 0.04771292209625244, "margin_dpo/margin_std": 0.24055811762809753, "step": 1 }, { "epoch": 0.0030234315948601664, "grad_norm": 2.786863088607788, "learning_rate": 7.462686567164179e-09, "logits/chosen": 0.13265366852283478, "logits/rejected": 0.06028885394334793, "logps/chosen": -47.69560241699219, "logps/ref_chosen": -47.742698669433594, "logps/ref_rejected": -54.36896896362305, "logps/rejected": -54.41385269165039, "loss": 1.3859, "margin_dpo/margin_mean": 0.09197819232940674, "margin_dpo/margin_std": 0.19671888649463654, "step": 2 }, { "epoch": 0.0045351473922902496, "grad_norm": 3.1143651008605957, "learning_rate": 1.4925373134328357e-08, "logits/chosen": 0.12916997075080872, "logits/rejected": 0.0313495397567749, "logps/chosen": -57.6026611328125, "logps/ref_chosen": -57.59052276611328, "logps/ref_rejected": -123.58332824707031, "logps/rejected": -123.56676483154297, "loss": 1.3866, "margin_dpo/margin_mean": -0.028696417808532715, "margin_dpo/margin_std": 0.30107247829437256, "step": 3 }, { "epoch": 0.006046863189720333, "grad_norm": 3.436974287033081, "learning_rate": 2.2388059701492534e-08, "logits/chosen": 0.11005942523479462, "logits/rejected": 0.08743590116500854, "logps/chosen": -72.13690185546875, "logps/ref_chosen": -72.2152328491211, "logps/ref_rejected": -72.17367553710938, "logps/rejected": -72.15191650390625, "loss": 1.3864, "margin_dpo/margin_mean": 0.05657494068145752, "margin_dpo/margin_std": 0.35384583473205566, "step": 4 }, { "epoch": 0.007558578987150416, "grad_norm": 2.9546422958374023, "learning_rate": 2.9850746268656714e-08, "logits/chosen": 0.1004139631986618, "logits/rejected": 0.015552863478660583, "logps/chosen": -56.2198486328125, "logps/ref_chosen": -56.174278259277344, "logps/ref_rejected": -111.51732635498047, "logps/rejected": -111.60870361328125, "loss": 1.3863, "margin_dpo/margin_mean": 0.04580581188201904, "margin_dpo/margin_std": 0.2617151737213135, "step": 5 }, { "epoch": 0.009070294784580499, "grad_norm": 2.960731267929077, "learning_rate": 3.731343283582089e-08, "logits/chosen": 0.09757982194423676, "logits/rejected": 0.021553121507167816, "logps/chosen": -33.81324005126953, "logps/ref_chosen": -33.68452453613281, "logps/ref_rejected": -102.40830993652344, "logps/rejected": -102.35445404052734, "loss": 1.387, "margin_dpo/margin_mean": -0.1825714111328125, "margin_dpo/margin_std": 0.29179731011390686, "step": 6 }, { "epoch": 0.010582010582010581, "grad_norm": 2.716484546661377, "learning_rate": 4.477611940298507e-08, "logits/chosen": -0.05511504039168358, "logits/rejected": -0.10139390826225281, "logps/chosen": -36.465389251708984, "logps/ref_chosen": -36.51213073730469, "logps/ref_rejected": -55.43703842163086, "logps/rejected": -55.4085578918457, "loss": 1.3864, "margin_dpo/margin_mean": 0.01826620101928711, "margin_dpo/margin_std": 0.24058791995048523, "step": 7 }, { "epoch": 0.012093726379440665, "grad_norm": 2.8784360885620117, "learning_rate": 5.223880597014925e-08, "logits/chosen": 0.09966567903757095, "logits/rejected": 0.06772118806838989, "logps/chosen": -91.59811401367188, "logps/ref_chosen": -91.63668823242188, "logps/ref_rejected": -115.22460174560547, "logps/rejected": -115.18936920166016, "loss": 1.3864, "margin_dpo/margin_mean": 0.0033463239669799805, "margin_dpo/margin_std": 0.3163236379623413, "step": 8 }, { "epoch": 0.013605442176870748, "grad_norm": 3.1453640460968018, "learning_rate": 5.970149253731343e-08, "logits/chosen": 0.08192189782857895, "logits/rejected": 0.07255452871322632, "logps/chosen": -91.2304916381836, "logps/ref_chosen": -91.08434295654297, "logps/ref_rejected": -78.1643295288086, "logps/rejected": -78.20406341552734, "loss": 1.3863, "margin_dpo/margin_mean": -0.10640597343444824, "margin_dpo/margin_std": 0.37841445207595825, "step": 9 }, { "epoch": 0.015117157974300832, "grad_norm": 2.9692389965057373, "learning_rate": 6.71641791044776e-08, "logits/chosen": 0.1450178623199463, "logits/rejected": 0.08948463946580887, "logps/chosen": -67.3464126586914, "logps/ref_chosen": -67.3323974609375, "logps/ref_rejected": -115.37638854980469, "logps/rejected": -115.22101593017578, "loss": 1.3866, "margin_dpo/margin_mean": -0.16937530040740967, "margin_dpo/margin_std": 0.2835836708545685, "step": 10 }, { "epoch": 0.016628873771730914, "grad_norm": 2.9334216117858887, "learning_rate": 7.462686567164178e-08, "logits/chosen": 0.06959305703639984, "logits/rejected": 0.031854622066020966, "logps/chosen": -48.15754699707031, "logps/ref_chosen": -48.158531188964844, "logps/ref_rejected": -66.28083038330078, "logps/rejected": -66.29977416992188, "loss": 1.3861, "margin_dpo/margin_mean": 0.01993393898010254, "margin_dpo/margin_std": 0.2706565260887146, "step": 11 }, { "epoch": 0.018140589569160998, "grad_norm": 2.821261167526245, "learning_rate": 8.208955223880596e-08, "logits/chosen": -0.059022121131420135, "logits/rejected": -0.055310823023319244, "logps/chosen": -68.31194305419922, "logps/ref_chosen": -68.27070617675781, "logps/ref_rejected": -66.15010070800781, "logps/rejected": -66.14984130859375, "loss": 1.3862, "margin_dpo/margin_mean": -0.04150247573852539, "margin_dpo/margin_std": 0.30395328998565674, "step": 12 }, { "epoch": 0.019652305366591082, "grad_norm": 2.898451089859009, "learning_rate": 8.955223880597014e-08, "logits/chosen": 0.07420723140239716, "logits/rejected": -0.013622896745800972, "logps/chosen": -55.22724151611328, "logps/ref_chosen": -55.27293395996094, "logps/ref_rejected": -116.98251342773438, "logps/rejected": -116.99977111816406, "loss": 1.386, "margin_dpo/margin_mean": 0.06296110153198242, "margin_dpo/margin_std": 0.41959983110427856, "step": 13 }, { "epoch": 0.021164021164021163, "grad_norm": 2.963911533355713, "learning_rate": 9.701492537313432e-08, "logits/chosen": 0.1168874204158783, "logits/rejected": 0.0010670926421880722, "logps/chosen": -54.75418472290039, "logps/ref_chosen": -54.73517608642578, "logps/ref_rejected": -79.82144165039062, "logps/rejected": -79.80023193359375, "loss": 1.3859, "margin_dpo/margin_mean": -0.0402069091796875, "margin_dpo/margin_std": 0.21686886250972748, "step": 14 }, { "epoch": 0.022675736961451247, "grad_norm": 3.3150715827941895, "learning_rate": 1.044776119402985e-07, "logits/chosen": 0.09902211278676987, "logits/rejected": 0.0021891482174396515, "logps/chosen": -47.44508361816406, "logps/ref_chosen": -47.35077667236328, "logps/ref_rejected": -98.93363189697266, "logps/rejected": -98.7593994140625, "loss": 1.3867, "margin_dpo/margin_mean": -0.26853013038635254, "margin_dpo/margin_std": 0.3253557085990906, "step": 15 }, { "epoch": 0.02418745275888133, "grad_norm": 2.685232639312744, "learning_rate": 1.1194029850746268e-07, "logits/chosen": 0.13864967226982117, "logits/rejected": 0.09606489539146423, "logps/chosen": -37.010860443115234, "logps/ref_chosen": -36.986331939697266, "logps/ref_rejected": -55.56471633911133, "logps/rejected": -55.6058464050293, "loss": 1.3861, "margin_dpo/margin_mean": 0.016601920127868652, "margin_dpo/margin_std": 0.2442459762096405, "step": 16 }, { "epoch": 0.025699168556311415, "grad_norm": 3.2944958209991455, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 0.06346956640481949, "logits/rejected": 0.028996460139751434, "logps/chosen": -53.554481506347656, "logps/ref_chosen": -53.60247802734375, "logps/ref_rejected": -77.51017761230469, "logps/rejected": -77.53253936767578, "loss": 1.386, "margin_dpo/margin_mean": 0.07034468650817871, "margin_dpo/margin_std": 0.30959659814834595, "step": 17 }, { "epoch": 0.027210884353741496, "grad_norm": 2.7822513580322266, "learning_rate": 1.2686567164179106e-07, "logits/chosen": 0.08662936091423035, "logits/rejected": 0.05197212100028992, "logps/chosen": -43.53667449951172, "logps/ref_chosen": -43.651275634765625, "logps/ref_rejected": -70.06555938720703, "logps/rejected": -70.07490539550781, "loss": 1.3857, "margin_dpo/margin_mean": 0.12395095825195312, "margin_dpo/margin_std": 0.20322147011756897, "step": 18 }, { "epoch": 0.02872260015117158, "grad_norm": 3.082880735397339, "learning_rate": 1.343283582089552e-07, "logits/chosen": 0.09144282341003418, "logits/rejected": 0.04711594060063362, "logps/chosen": -59.105987548828125, "logps/ref_chosen": -59.121559143066406, "logps/ref_rejected": -95.91956329345703, "logps/rejected": -95.91903686523438, "loss": 1.386, "margin_dpo/margin_mean": 0.015046358108520508, "margin_dpo/margin_std": 0.3638499677181244, "step": 19 }, { "epoch": 0.030234315948601664, "grad_norm": 2.926210880279541, "learning_rate": 1.4179104477611938e-07, "logits/chosen": 0.12322017550468445, "logits/rejected": 0.061798982322216034, "logps/chosen": -51.08586120605469, "logps/ref_chosen": -51.031883239746094, "logps/ref_rejected": -96.53536224365234, "logps/rejected": -96.57363891601562, "loss": 1.3865, "margin_dpo/margin_mean": -0.015711426734924316, "margin_dpo/margin_std": 0.19928568601608276, "step": 20 }, { "epoch": 0.031746031746031744, "grad_norm": 2.9375104904174805, "learning_rate": 1.4925373134328355e-07, "logits/chosen": -0.0577910915017128, "logits/rejected": -0.00928708165884018, "logps/chosen": -76.64593505859375, "logps/ref_chosen": -76.7261962890625, "logps/ref_rejected": -53.63877868652344, "logps/rejected": -53.602439880371094, "loss": 1.3867, "margin_dpo/margin_mean": 0.0439220666885376, "margin_dpo/margin_std": 0.23736098408699036, "step": 21 }, { "epoch": 0.03325774754346183, "grad_norm": 2.896641969680786, "learning_rate": 1.5671641791044775e-07, "logits/chosen": -0.03439096361398697, "logits/rejected": -0.11815059185028076, "logps/chosen": -36.69358444213867, "logps/ref_chosen": -36.67463302612305, "logps/ref_rejected": -58.37761688232422, "logps/rejected": -58.41578674316406, "loss": 1.3866, "margin_dpo/margin_mean": 0.019217491149902344, "margin_dpo/margin_std": 0.21051761507987976, "step": 22 }, { "epoch": 0.03476946334089191, "grad_norm": 3.29414963722229, "learning_rate": 1.6417910447761193e-07, "logits/chosen": 0.2472844123840332, "logits/rejected": 0.1739306002855301, "logps/chosen": -88.78631591796875, "logps/ref_chosen": -88.72676849365234, "logps/ref_rejected": -151.3695068359375, "logps/rejected": -151.45941162109375, "loss": 1.3865, "margin_dpo/margin_mean": 0.030363917350769043, "margin_dpo/margin_std": 0.18649427592754364, "step": 23 }, { "epoch": 0.036281179138321996, "grad_norm": 2.626323938369751, "learning_rate": 1.716417910447761e-07, "logits/chosen": 0.10805254429578781, "logits/rejected": 0.052521951496601105, "logps/chosen": -46.81217575073242, "logps/ref_chosen": -46.81597137451172, "logps/ref_rejected": -77.96964263916016, "logps/rejected": -78.01282501220703, "loss": 1.3866, "margin_dpo/margin_mean": 0.04697418212890625, "margin_dpo/margin_std": 0.1974717229604721, "step": 24 }, { "epoch": 0.03779289493575208, "grad_norm": 2.9760308265686035, "learning_rate": 1.7910447761194027e-07, "logits/chosen": 0.027542775496840477, "logits/rejected": -0.008236430585384369, "logps/chosen": -66.06083679199219, "logps/ref_chosen": -66.00652313232422, "logps/ref_rejected": -94.2716293334961, "logps/rejected": -94.29405212402344, "loss": 1.3861, "margin_dpo/margin_mean": -0.031876444816589355, "margin_dpo/margin_std": 0.2860802412033081, "step": 25 }, { "epoch": 0.039304610733182165, "grad_norm": 2.988525152206421, "learning_rate": 1.8656716417910447e-07, "logits/chosen": 0.16524234414100647, "logits/rejected": 0.14268483221530914, "logps/chosen": -67.2355728149414, "logps/ref_chosen": -67.12877655029297, "logps/ref_rejected": -115.98387145996094, "logps/rejected": -116.02112579345703, "loss": 1.3867, "margin_dpo/margin_mean": -0.06954813003540039, "margin_dpo/margin_std": 0.4333837032318115, "step": 26 }, { "epoch": 0.04081632653061224, "grad_norm": 2.903107166290283, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 0.1587177813053131, "logits/rejected": 0.14436712861061096, "logps/chosen": -45.9591178894043, "logps/ref_chosen": -45.86199188232422, "logps/ref_rejected": -56.50844955444336, "logps/rejected": -56.52033615112305, "loss": 1.3864, "margin_dpo/margin_mean": -0.08524084091186523, "margin_dpo/margin_std": 0.2513440251350403, "step": 27 }, { "epoch": 0.042328042328042326, "grad_norm": 2.764413833618164, "learning_rate": 2.0149253731343282e-07, "logits/chosen": -0.02600286155939102, "logits/rejected": -0.024466849863529205, "logps/chosen": -100.1274642944336, "logps/ref_chosen": -100.07420349121094, "logps/ref_rejected": -98.97936248779297, "logps/rejected": -98.93499755859375, "loss": 1.3864, "margin_dpo/margin_mean": -0.09762585163116455, "margin_dpo/margin_std": 0.4535467326641083, "step": 28 }, { "epoch": 0.04383975812547241, "grad_norm": 3.2600700855255127, "learning_rate": 2.08955223880597e-07, "logits/chosen": 0.09348219633102417, "logits/rejected": 0.09450555592775345, "logps/chosen": -77.15126037597656, "logps/ref_chosen": -77.19854736328125, "logps/ref_rejected": -73.5864028930664, "logps/rejected": -73.67513275146484, "loss": 1.3857, "margin_dpo/margin_mean": 0.13602125644683838, "margin_dpo/margin_std": 0.40513163805007935, "step": 29 }, { "epoch": 0.045351473922902494, "grad_norm": 3.0433361530303955, "learning_rate": 2.1641791044776117e-07, "logits/chosen": 0.15750174224376678, "logits/rejected": 0.1133815348148346, "logps/chosen": -59.61478805541992, "logps/ref_chosen": -59.58109664916992, "logps/ref_rejected": -81.88029479980469, "logps/rejected": -81.91694641113281, "loss": 1.3854, "margin_dpo/margin_mean": 0.002964496612548828, "margin_dpo/margin_std": 0.1755043864250183, "step": 30 }, { "epoch": 0.04686318972033258, "grad_norm": 3.4319612979888916, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 0.1796724498271942, "logits/rejected": 0.1043396145105362, "logps/chosen": -69.11215209960938, "logps/ref_chosen": -69.12844848632812, "logps/ref_rejected": -135.9136505126953, "logps/rejected": -136.04110717773438, "loss": 1.3858, "margin_dpo/margin_mean": 0.14375102519989014, "margin_dpo/margin_std": 0.2275710105895996, "step": 31 }, { "epoch": 0.04837490551776266, "grad_norm": 3.0030133724212646, "learning_rate": 2.3134328358208954e-07, "logits/chosen": 0.08424107730388641, "logits/rejected": 0.03786729276180267, "logps/chosen": -79.7166748046875, "logps/ref_chosen": -79.69361877441406, "logps/ref_rejected": -119.91200256347656, "logps/rejected": -119.93031311035156, "loss": 1.3867, "margin_dpo/margin_mean": -0.004748940467834473, "margin_dpo/margin_std": 0.19554254412651062, "step": 32 }, { "epoch": 0.049886621315192746, "grad_norm": 3.1022226810455322, "learning_rate": 2.388059701492537e-07, "logits/chosen": 0.05021132156252861, "logits/rejected": 0.02479320764541626, "logps/chosen": -55.22189712524414, "logps/ref_chosen": -55.10992431640625, "logps/ref_rejected": -82.52067565917969, "logps/rejected": -82.57884216308594, "loss": 1.3863, "margin_dpo/margin_mean": -0.05379873514175415, "margin_dpo/margin_std": 0.37445366382598877, "step": 33 }, { "epoch": 0.05139833711262283, "grad_norm": 2.8541648387908936, "learning_rate": 2.4626865671641786e-07, "logits/chosen": 0.057025909423828125, "logits/rejected": -0.0010375156998634338, "logps/chosen": -52.59141540527344, "logps/ref_chosen": -52.49810028076172, "logps/ref_rejected": -74.43905639648438, "logps/rejected": -74.50889587402344, "loss": 1.3854, "margin_dpo/margin_mean": -0.02347743511199951, "margin_dpo/margin_std": 0.2468748390674591, "step": 34 }, { "epoch": 0.05291005291005291, "grad_norm": 2.7856571674346924, "learning_rate": 2.537313432835821e-07, "logits/chosen": -0.023874841630458832, "logits/rejected": -0.03130623698234558, "logps/chosen": -67.71501159667969, "logps/ref_chosen": -67.55126953125, "logps/ref_rejected": -55.26748275756836, "logps/rejected": -55.42961502075195, "loss": 1.3864, "margin_dpo/margin_mean": -0.0016133785247802734, "margin_dpo/margin_std": 0.33188915252685547, "step": 35 }, { "epoch": 0.05442176870748299, "grad_norm": 2.8720510005950928, "learning_rate": 2.611940298507462e-07, "logits/chosen": 0.16390232741832733, "logits/rejected": 0.08551906049251556, "logps/chosen": -52.384647369384766, "logps/ref_chosen": -52.303955078125, "logps/ref_rejected": -88.28939819335938, "logps/rejected": -88.45569610595703, "loss": 1.3856, "margin_dpo/margin_mean": 0.085601806640625, "margin_dpo/margin_std": 0.35991254448890686, "step": 36 }, { "epoch": 0.055933484504913075, "grad_norm": 3.7995986938476562, "learning_rate": 2.686567164179104e-07, "logits/chosen": 0.06787224858999252, "logits/rejected": 0.008850198239088058, "logps/chosen": -60.5541877746582, "logps/ref_chosen": -60.51344680786133, "logps/ref_rejected": -103.91683959960938, "logps/rejected": -104.0807113647461, "loss": 1.3853, "margin_dpo/margin_mean": 0.12312948703765869, "margin_dpo/margin_std": 0.36983656883239746, "step": 37 }, { "epoch": 0.05744520030234316, "grad_norm": 2.977374792098999, "learning_rate": 2.761194029850746e-07, "logits/chosen": 0.05500081554055214, "logits/rejected": 0.019983187317848206, "logps/chosen": -50.67215347290039, "logps/ref_chosen": -50.62315368652344, "logps/ref_rejected": -60.629703521728516, "logps/rejected": -60.82992935180664, "loss": 1.3861, "margin_dpo/margin_mean": 0.15122318267822266, "margin_dpo/margin_std": 0.3122575581073761, "step": 38 }, { "epoch": 0.05895691609977324, "grad_norm": 2.931081771850586, "learning_rate": 2.8358208955223876e-07, "logits/chosen": 0.2395174652338028, "logits/rejected": 0.20490288734436035, "logps/chosen": -45.89991760253906, "logps/ref_chosen": -45.8764533996582, "logps/ref_rejected": -69.75498962402344, "logps/rejected": -69.87886047363281, "loss": 1.3858, "margin_dpo/margin_mean": 0.10041189193725586, "margin_dpo/margin_std": 0.28835514187812805, "step": 39 }, { "epoch": 0.06046863189720333, "grad_norm": 3.1318254470825195, "learning_rate": 2.9104477611940296e-07, "logits/chosen": 0.18963474035263062, "logits/rejected": 0.16616499423980713, "logps/chosen": -39.96923828125, "logps/ref_chosen": -39.950111389160156, "logps/ref_rejected": -62.60026931762695, "logps/rejected": -62.79283142089844, "loss": 1.3859, "margin_dpo/margin_mean": 0.1734316349029541, "margin_dpo/margin_std": 0.2163560539484024, "step": 40 }, { "epoch": 0.06198034769463341, "grad_norm": 3.0570766925811768, "learning_rate": 2.985074626865671e-07, "logits/chosen": -0.12535785138607025, "logits/rejected": -0.12432264536619186, "logps/chosen": -65.45966339111328, "logps/ref_chosen": -65.39937591552734, "logps/ref_rejected": -83.19218444824219, "logps/rejected": -83.37960815429688, "loss": 1.3852, "margin_dpo/margin_mean": 0.1271369457244873, "margin_dpo/margin_std": 0.2651064991950989, "step": 41 }, { "epoch": 0.06349206349206349, "grad_norm": 2.934258460998535, "learning_rate": 3.059701492537313e-07, "logits/chosen": 0.06298904120922089, "logits/rejected": 0.05617032200098038, "logps/chosen": -61.81056594848633, "logps/ref_chosen": -61.71239471435547, "logps/ref_rejected": -81.27043151855469, "logps/rejected": -81.32711029052734, "loss": 1.3864, "margin_dpo/margin_mean": -0.04149121046066284, "margin_dpo/margin_std": 0.3146214485168457, "step": 42 }, { "epoch": 0.06500377928949358, "grad_norm": 3.4698517322540283, "learning_rate": 3.134328358208955e-07, "logits/chosen": 0.029873725026845932, "logits/rejected": -0.042266130447387695, "logps/chosen": -68.39386749267578, "logps/ref_chosen": -68.25798797607422, "logps/ref_rejected": -94.93944549560547, "logps/rejected": -95.09349060058594, "loss": 1.3861, "margin_dpo/margin_mean": 0.018149971961975098, "margin_dpo/margin_std": 0.2860589325428009, "step": 43 }, { "epoch": 0.06651549508692366, "grad_norm": 3.261988639831543, "learning_rate": 3.2089552238805965e-07, "logits/chosen": 0.059243109077215195, "logits/rejected": 0.07599075883626938, "logps/chosen": -76.56920623779297, "logps/ref_chosen": -76.42689514160156, "logps/ref_rejected": -84.94781494140625, "logps/rejected": -84.98534393310547, "loss": 1.3858, "margin_dpo/margin_mean": -0.10478079319000244, "margin_dpo/margin_std": 0.2691563367843628, "step": 44 }, { "epoch": 0.06802721088435375, "grad_norm": 3.200819492340088, "learning_rate": 3.2835820895522385e-07, "logits/chosen": 0.09944377094507217, "logits/rejected": 0.05152427405118942, "logps/chosen": -74.33448791503906, "logps/ref_chosen": -74.13058471679688, "logps/ref_rejected": -82.69816589355469, "logps/rejected": -82.89010620117188, "loss": 1.385, "margin_dpo/margin_mean": -0.011955618858337402, "margin_dpo/margin_std": 0.3313651978969574, "step": 45 }, { "epoch": 0.06953892668178382, "grad_norm": 3.1459238529205322, "learning_rate": 3.3582089552238805e-07, "logits/chosen": 0.17175917327404022, "logits/rejected": 0.15970298647880554, "logps/chosen": -96.9361343383789, "logps/ref_chosen": -96.75468444824219, "logps/ref_rejected": -82.88079833984375, "logps/rejected": -83.0982666015625, "loss": 1.3848, "margin_dpo/margin_mean": 0.03600466251373291, "margin_dpo/margin_std": 0.3888518512248993, "step": 46 }, { "epoch": 0.0710506424792139, "grad_norm": 2.8698384761810303, "learning_rate": 3.432835820895522e-07, "logits/chosen": 0.1564871072769165, "logits/rejected": 0.1589561551809311, "logps/chosen": -53.687896728515625, "logps/ref_chosen": -53.5174560546875, "logps/ref_rejected": -55.22039794921875, "logps/rejected": -55.589107513427734, "loss": 1.385, "margin_dpo/margin_mean": 0.19826769828796387, "margin_dpo/margin_std": 0.3298158049583435, "step": 47 }, { "epoch": 0.07256235827664399, "grad_norm": 2.6883156299591064, "learning_rate": 3.507462686567164e-07, "logits/chosen": 0.17292676866054535, "logits/rejected": 0.13669899106025696, "logps/chosen": -62.340145111083984, "logps/ref_chosen": -62.185054779052734, "logps/ref_rejected": -74.84376525878906, "logps/rejected": -75.03015899658203, "loss": 1.386, "margin_dpo/margin_mean": 0.03130638599395752, "margin_dpo/margin_std": 0.34968793392181396, "step": 48 }, { "epoch": 0.07407407407407407, "grad_norm": 2.750591993331909, "learning_rate": 3.5820895522388055e-07, "logits/chosen": 0.057202327996492386, "logits/rejected": 0.037843670696020126, "logps/chosen": -37.986602783203125, "logps/ref_chosen": -37.77415084838867, "logps/ref_rejected": -51.56128692626953, "logps/rejected": -51.90427780151367, "loss": 1.3855, "margin_dpo/margin_mean": 0.13054275512695312, "margin_dpo/margin_std": 0.2379816472530365, "step": 49 }, { "epoch": 0.07558578987150416, "grad_norm": 2.820672035217285, "learning_rate": 3.6567164179104475e-07, "logits/chosen": 0.10407137125730515, "logits/rejected": 0.08040489256381989, "logps/chosen": -60.96312713623047, "logps/ref_chosen": -60.664947509765625, "logps/ref_rejected": -89.31661987304688, "logps/rejected": -89.6517333984375, "loss": 1.3852, "margin_dpo/margin_mean": 0.03693675994873047, "margin_dpo/margin_std": 0.25188928842544556, "step": 50 }, { "epoch": 0.07709750566893424, "grad_norm": 3.1633975505828857, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 0.11444868892431259, "logits/rejected": 0.04659546911716461, "logps/chosen": -48.322227478027344, "logps/ref_chosen": -48.012168884277344, "logps/ref_rejected": -68.38160705566406, "logps/rejected": -68.88851928710938, "loss": 1.3847, "margin_dpo/margin_mean": 0.19685041904449463, "margin_dpo/margin_std": 0.40103837847709656, "step": 51 }, { "epoch": 0.07860922146636433, "grad_norm": 3.325279951095581, "learning_rate": 3.805970149253731e-07, "logits/chosen": 0.08999551832675934, "logits/rejected": 0.05174541473388672, "logps/chosen": -69.64974975585938, "logps/ref_chosen": -69.34451293945312, "logps/ref_rejected": -115.13761901855469, "logps/rejected": -115.90428924560547, "loss": 1.3839, "margin_dpo/margin_mean": 0.46144330501556396, "margin_dpo/margin_std": 0.610642671585083, "step": 52 }, { "epoch": 0.0801209372637944, "grad_norm": 2.7831149101257324, "learning_rate": 3.880597014925373e-07, "logits/chosen": 0.05799449607729912, "logits/rejected": 0.057163748890161514, "logps/chosen": -61.22389602661133, "logps/ref_chosen": -60.885520935058594, "logps/ref_rejected": -51.6776237487793, "logps/rejected": -52.1979866027832, "loss": 1.3842, "margin_dpo/margin_mean": 0.18198823928833008, "margin_dpo/margin_std": 0.3276767432689667, "step": 53 }, { "epoch": 0.08163265306122448, "grad_norm": 2.6667864322662354, "learning_rate": 3.9552238805970144e-07, "logits/chosen": -0.010790073312819004, "logits/rejected": -0.014523019082844257, "logps/chosen": -43.61071014404297, "logps/ref_chosen": -43.147682189941406, "logps/ref_rejected": -48.63517761230469, "logps/rejected": -49.14527130126953, "loss": 1.3849, "margin_dpo/margin_mean": 0.047069668769836426, "margin_dpo/margin_std": 0.5451761484146118, "step": 54 }, { "epoch": 0.08314436885865457, "grad_norm": 3.2038674354553223, "learning_rate": 4.0298507462686564e-07, "logits/chosen": 0.09067589789628983, "logits/rejected": -0.015168176032602787, "logps/chosen": -45.248573303222656, "logps/ref_chosen": -44.79475402832031, "logps/ref_rejected": -87.80093383789062, "logps/rejected": -88.53862762451172, "loss": 1.3832, "margin_dpo/margin_mean": 0.28387510776519775, "margin_dpo/margin_std": 0.6332917809486389, "step": 55 }, { "epoch": 0.08465608465608465, "grad_norm": 2.775723934173584, "learning_rate": 4.1044776119402984e-07, "logits/chosen": 0.09276323020458221, "logits/rejected": 0.11390136182308197, "logps/chosen": -71.15162658691406, "logps/ref_chosen": -70.83395385742188, "logps/ref_rejected": -66.29704284667969, "logps/rejected": -66.77851867675781, "loss": 1.3843, "margin_dpo/margin_mean": 0.16381430625915527, "margin_dpo/margin_std": 0.6528092622756958, "step": 56 }, { "epoch": 0.08616780045351474, "grad_norm": 3.1573398113250732, "learning_rate": 4.17910447761194e-07, "logits/chosen": 0.13995778560638428, "logits/rejected": 0.10124865919351578, "logps/chosen": -55.33793640136719, "logps/ref_chosen": -54.77841567993164, "logps/ref_rejected": -97.11053466796875, "logps/rejected": -97.8675537109375, "loss": 1.3835, "margin_dpo/margin_mean": 0.19750165939331055, "margin_dpo/margin_std": 0.8898135423660278, "step": 57 }, { "epoch": 0.08767951625094482, "grad_norm": 3.0929791927337646, "learning_rate": 4.253731343283582e-07, "logits/chosen": -0.024497557431459427, "logits/rejected": -0.11392132192850113, "logps/chosen": -59.87010955810547, "logps/ref_chosen": -59.280887603759766, "logps/ref_rejected": -82.80585479736328, "logps/rejected": -83.43619537353516, "loss": 1.3839, "margin_dpo/margin_mean": 0.041118621826171875, "margin_dpo/margin_std": 0.6544058322906494, "step": 58 }, { "epoch": 0.08919123204837491, "grad_norm": 2.7556982040405273, "learning_rate": 4.3283582089552234e-07, "logits/chosen": 0.17279072105884552, "logits/rejected": 0.04841926693916321, "logps/chosen": -38.3438720703125, "logps/ref_chosen": -37.76800537109375, "logps/ref_rejected": -84.53325653076172, "logps/rejected": -85.44366455078125, "loss": 1.3833, "margin_dpo/margin_mean": 0.33453965187072754, "margin_dpo/margin_std": 0.6480045318603516, "step": 59 }, { "epoch": 0.09070294784580499, "grad_norm": 2.9187021255493164, "learning_rate": 4.4029850746268654e-07, "logits/chosen": 0.2775009274482727, "logits/rejected": 0.2738117575645447, "logps/chosen": -67.29905700683594, "logps/ref_chosen": -66.45524597167969, "logps/ref_rejected": -67.77998352050781, "logps/rejected": -68.42279052734375, "loss": 1.3855, "margin_dpo/margin_mean": -0.20100653171539307, "margin_dpo/margin_std": 1.0322511196136475, "step": 60 }, { "epoch": 0.09221466364323508, "grad_norm": 2.7786881923675537, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 0.02796616032719612, "logits/rejected": -0.045561283826828, "logps/chosen": -41.18583679199219, "logps/ref_chosen": -40.7296257019043, "logps/ref_rejected": -82.87712097167969, "logps/rejected": -83.8257827758789, "loss": 1.3858, "margin_dpo/margin_mean": 0.4924490451812744, "margin_dpo/margin_std": 0.6613912582397461, "step": 61 }, { "epoch": 0.09372637944066516, "grad_norm": 2.8831284046173096, "learning_rate": 4.552238805970149e-07, "logits/chosen": 0.16551688313484192, "logits/rejected": 0.05868356674909592, "logps/chosen": -47.318138122558594, "logps/ref_chosen": -46.39446258544922, "logps/ref_rejected": -79.28895568847656, "logps/rejected": -80.39089965820312, "loss": 1.3838, "margin_dpo/margin_mean": 0.17827367782592773, "margin_dpo/margin_std": 1.0472949743270874, "step": 62 }, { "epoch": 0.09523809523809523, "grad_norm": 3.3160440921783447, "learning_rate": 4.626865671641791e-07, "logits/chosen": 0.2583298087120056, "logits/rejected": 0.2626573443412781, "logps/chosen": -75.17607116699219, "logps/ref_chosen": -74.44918823242188, "logps/ref_rejected": -115.08148193359375, "logps/rejected": -116.2177734375, "loss": 1.3806, "margin_dpo/margin_mean": 0.4094170331954956, "margin_dpo/margin_std": 1.2669267654418945, "step": 63 }, { "epoch": 0.09674981103552532, "grad_norm": 2.5979647636413574, "learning_rate": 4.701492537313433e-07, "logits/chosen": 0.0404229536652565, "logits/rejected": -0.03692961111664772, "logps/chosen": -42.32487869262695, "logps/ref_chosen": -41.524444580078125, "logps/ref_rejected": -76.30519104003906, "logps/rejected": -77.62030029296875, "loss": 1.3855, "margin_dpo/margin_mean": 0.5146619081497192, "margin_dpo/margin_std": 0.8373426198959351, "step": 64 }, { "epoch": 0.0982615268329554, "grad_norm": 3.180335760116577, "learning_rate": 4.776119402985074e-07, "logits/chosen": 0.22089830040931702, "logits/rejected": 0.2287295162677765, "logps/chosen": -79.51335144042969, "logps/ref_chosen": -78.54818725585938, "logps/ref_rejected": -58.596473693847656, "logps/rejected": -59.96930694580078, "loss": 1.3814, "margin_dpo/margin_mean": 0.4076697826385498, "margin_dpo/margin_std": 0.8789803981781006, "step": 65 }, { "epoch": 0.09977324263038549, "grad_norm": 3.0080127716064453, "learning_rate": 4.850746268656717e-07, "logits/chosen": 0.15983106195926666, "logits/rejected": 0.14227060973644257, "logps/chosen": -69.20197296142578, "logps/ref_chosen": -68.18994903564453, "logps/ref_rejected": -83.489013671875, "logps/rejected": -85.03985595703125, "loss": 1.3826, "margin_dpo/margin_mean": 0.5388225317001343, "margin_dpo/margin_std": 1.0032914876937866, "step": 66 }, { "epoch": 0.10128495842781557, "grad_norm": 2.978635311126709, "learning_rate": 4.925373134328357e-07, "logits/chosen": 0.13758046925067902, "logits/rejected": 0.084464430809021, "logps/chosen": -63.52684020996094, "logps/ref_chosen": -62.41529083251953, "logps/ref_rejected": -53.900169372558594, "logps/rejected": -54.98922348022461, "loss": 1.3834, "margin_dpo/margin_mean": -0.02249324321746826, "margin_dpo/margin_std": 1.1306034326553345, "step": 67 }, { "epoch": 0.10279667422524566, "grad_norm": 3.2404768466949463, "learning_rate": 5e-07, "logits/chosen": 0.06684955954551697, "logits/rejected": -0.005514336749911308, "logps/chosen": -81.26565551757812, "logps/ref_chosen": -80.14068603515625, "logps/ref_rejected": -116.50318145751953, "logps/rejected": -118.35859680175781, "loss": 1.3832, "margin_dpo/margin_mean": 0.7304507493972778, "margin_dpo/margin_std": 1.5464614629745483, "step": 68 }, { "epoch": 0.10430839002267574, "grad_norm": 3.3028857707977295, "learning_rate": 4.999965034812934e-07, "logits/chosen": 0.16428935527801514, "logits/rejected": 0.11686080694198608, "logps/chosen": -54.37907409667969, "logps/ref_chosen": -53.415428161621094, "logps/ref_rejected": -71.89765930175781, "logps/rejected": -73.68708801269531, "loss": 1.3792, "margin_dpo/margin_mean": 0.8257811069488525, "margin_dpo/margin_std": 0.9166597127914429, "step": 69 }, { "epoch": 0.10582010582010581, "grad_norm": 2.9263834953308105, "learning_rate": 4.999860140229787e-07, "logits/chosen": 0.006922289729118347, "logits/rejected": 0.01605862006545067, "logps/chosen": -72.73168182373047, "logps/ref_chosen": -71.148193359375, "logps/ref_rejected": -67.78597259521484, "logps/rejected": -69.44383239746094, "loss": 1.383, "margin_dpo/margin_mean": 0.07435917854309082, "margin_dpo/margin_std": 1.4182093143463135, "step": 70 }, { "epoch": 0.1073318216175359, "grad_norm": 2.8662917613983154, "learning_rate": 4.999685319184688e-07, "logits/chosen": 0.1492251604795456, "logits/rejected": 0.13701248168945312, "logps/chosen": -46.276390075683594, "logps/ref_chosen": -44.75934982299805, "logps/ref_rejected": -65.2507553100586, "logps/rejected": -66.79752349853516, "loss": 1.3832, "margin_dpo/margin_mean": 0.029730796813964844, "margin_dpo/margin_std": 1.3828539848327637, "step": 71 }, { "epoch": 0.10884353741496598, "grad_norm": 3.228018045425415, "learning_rate": 4.999440576567755e-07, "logits/chosen": 0.09348595142364502, "logits/rejected": 0.025297988206148148, "logps/chosen": -48.47857666015625, "logps/ref_chosen": -47.3697395324707, "logps/ref_rejected": -52.99060821533203, "logps/rejected": -54.79874801635742, "loss": 1.3786, "margin_dpo/margin_mean": 0.699303388595581, "margin_dpo/margin_std": 1.9094823598861694, "step": 72 }, { "epoch": 0.11035525321239607, "grad_norm": 3.229674816131592, "learning_rate": 4.999125919224965e-07, "logits/chosen": 0.10412274301052094, "logits/rejected": 0.11492769420146942, "logps/chosen": -58.23365783691406, "logps/ref_chosen": -56.49576187133789, "logps/ref_rejected": -49.6622200012207, "logps/rejected": -50.660560607910156, "loss": 1.3857, "margin_dpo/margin_mean": -0.7395575046539307, "margin_dpo/margin_std": 1.3145250082015991, "step": 73 }, { "epoch": 0.11186696900982615, "grad_norm": 2.963764190673828, "learning_rate": 4.998741355957963e-07, "logits/chosen": 0.2109803855419159, "logits/rejected": 0.14343787729740143, "logps/chosen": -58.918479919433594, "logps/ref_chosen": -57.7161750793457, "logps/ref_rejected": -114.44114685058594, "logps/rejected": -117.02381896972656, "loss": 1.3789, "margin_dpo/margin_mean": 1.3803461790084839, "margin_dpo/margin_std": 1.7276947498321533, "step": 74 }, { "epoch": 0.11337868480725624, "grad_norm": 2.831622362136841, "learning_rate": 4.998286897523808e-07, "logits/chosen": 0.19602981209754944, "logits/rejected": 0.17468248307704926, "logps/chosen": -41.735137939453125, "logps/ref_chosen": -39.99764633178711, "logps/ref_rejected": -56.469932556152344, "logps/rejected": -58.855743408203125, "loss": 1.3781, "margin_dpo/margin_mean": 0.6483190059661865, "margin_dpo/margin_std": 1.347118616104126, "step": 75 }, { "epoch": 0.11489040060468632, "grad_norm": 2.9740262031555176, "learning_rate": 4.997762556634679e-07, "logits/chosen": -0.004973419010639191, "logits/rejected": -0.06459110975265503, "logps/chosen": -60.27781677246094, "logps/ref_chosen": -57.66736602783203, "logps/ref_rejected": -80.56336212158203, "logps/rejected": -83.19686889648438, "loss": 1.3828, "margin_dpo/margin_mean": 0.023052692413330078, "margin_dpo/margin_std": 2.44602632522583, "step": 76 }, { "epoch": 0.1164021164021164, "grad_norm": 3.190723419189453, "learning_rate": 4.99716834795752e-07, "logits/chosen": -0.021925870329141617, "logits/rejected": -0.0063882917165756226, "logps/chosen": -55.694488525390625, "logps/ref_chosen": -53.56401824951172, "logps/ref_rejected": -48.90995407104492, "logps/rejected": -51.173919677734375, "loss": 1.3771, "margin_dpo/margin_mean": 0.13348984718322754, "margin_dpo/margin_std": 1.8197870254516602, "step": 77 }, { "epoch": 0.11791383219954649, "grad_norm": 2.9408459663391113, "learning_rate": 4.996504288113623e-07, "logits/chosen": 0.07129096984863281, "logits/rejected": 0.04697857052087784, "logps/chosen": -74.45344543457031, "logps/ref_chosen": -72.49877166748047, "logps/ref_rejected": -85.70948791503906, "logps/rejected": -88.24615478515625, "loss": 1.3815, "margin_dpo/margin_mean": 0.5819820165634155, "margin_dpo/margin_std": 1.5051074028015137, "step": 78 }, { "epoch": 0.11942554799697656, "grad_norm": 3.75710391998291, "learning_rate": 4.995770395678171e-07, "logits/chosen": 0.2161446511745453, "logits/rejected": 0.2010088711977005, "logps/chosen": -57.58828353881836, "logps/ref_chosen": -55.347450256347656, "logps/ref_rejected": -58.748321533203125, "logps/rejected": -61.51164245605469, "loss": 1.3723, "margin_dpo/margin_mean": 0.522484540939331, "margin_dpo/margin_std": 2.684553384780884, "step": 79 }, { "epoch": 0.12093726379440665, "grad_norm": 2.9566855430603027, "learning_rate": 4.994966691179711e-07, "logits/chosen": 0.16256621479988098, "logits/rejected": 0.08070458471775055, "logps/chosen": -61.3144416809082, "logps/ref_chosen": -58.95909118652344, "logps/ref_rejected": -62.06755065917969, "logps/rejected": -65.10287475585938, "loss": 1.3796, "margin_dpo/margin_mean": 0.6799747943878174, "margin_dpo/margin_std": 2.2472503185272217, "step": 80 }, { "epoch": 0.12244897959183673, "grad_norm": 3.1515696048736572, "learning_rate": 4.994093197099587e-07, "logits/chosen": 0.10985112190246582, "logits/rejected": 0.07357762008905411, "logps/chosen": -60.51456069946289, "logps/ref_chosen": -57.94086456298828, "logps/ref_rejected": -64.90560913085938, "logps/rejected": -68.10665893554688, "loss": 1.3767, "margin_dpo/margin_mean": 0.6273548603057861, "margin_dpo/margin_std": 2.312195301055908, "step": 81 }, { "epoch": 0.12396069538926682, "grad_norm": 2.9304919242858887, "learning_rate": 4.993149937871306e-07, "logits/chosen": 0.1599178910255432, "logits/rejected": 0.06249549984931946, "logps/chosen": -27.858848571777344, "logps/ref_chosen": -26.338790893554688, "logps/ref_rejected": -56.80085754394531, "logps/rejected": -59.76110076904297, "loss": 1.374, "margin_dpo/margin_mean": 1.4401792287826538, "margin_dpo/margin_std": 1.2950589656829834, "step": 82 }, { "epoch": 0.1254724111866969, "grad_norm": 3.3920578956604004, "learning_rate": 4.992136939879856e-07, "logits/chosen": 0.07286302000284195, "logits/rejected": -0.008843222633004189, "logps/chosen": -58.036651611328125, "logps/ref_chosen": -55.43024444580078, "logps/ref_rejected": -90.0384521484375, "logps/rejected": -94.02841186523438, "loss": 1.3722, "margin_dpo/margin_mean": 1.3835418224334717, "margin_dpo/margin_std": 2.0829572677612305, "step": 83 }, { "epoch": 0.12698412698412698, "grad_norm": 3.293107271194458, "learning_rate": 4.991054231460969e-07, "logits/chosen": 0.14062954485416412, "logits/rejected": 0.09686338156461716, "logps/chosen": -64.60865020751953, "logps/ref_chosen": -60.98677444458008, "logps/ref_rejected": -61.2043342590332, "logps/rejected": -66.28961944580078, "loss": 1.3749, "margin_dpo/margin_mean": 1.463415503501892, "margin_dpo/margin_std": 2.3290019035339355, "step": 84 }, { "epoch": 0.12849584278155707, "grad_norm": 3.0449564456939697, "learning_rate": 4.989901842900325e-07, "logits/chosen": 0.2767505645751953, "logits/rejected": 0.2392296940088272, "logps/chosen": -58.43796157836914, "logps/ref_chosen": -55.2398681640625, "logps/ref_rejected": -69.06980895996094, "logps/rejected": -73.96517181396484, "loss": 1.3711, "margin_dpo/margin_mean": 1.6972711086273193, "margin_dpo/margin_std": 1.7987269163131714, "step": 85 }, { "epoch": 0.13000755857898716, "grad_norm": 3.007859468460083, "learning_rate": 4.988679806432711e-07, "logits/chosen": 0.16140137612819672, "logits/rejected": 0.1533210277557373, "logps/chosen": -64.31861114501953, "logps/ref_chosen": -60.13792419433594, "logps/ref_rejected": -66.80750274658203, "logps/rejected": -72.00105285644531, "loss": 1.3752, "margin_dpo/margin_mean": 1.0128706693649292, "margin_dpo/margin_std": 3.6858959197998047, "step": 86 }, { "epoch": 0.13151927437641722, "grad_norm": 3.371891975402832, "learning_rate": 4.987388156241114e-07, "logits/chosen": 0.1272026002407074, "logits/rejected": 0.08599858731031418, "logps/chosen": -95.75196075439453, "logps/ref_chosen": -90.86170959472656, "logps/ref_rejected": -81.98894500732422, "logps/rejected": -88.12923431396484, "loss": 1.3721, "margin_dpo/margin_mean": 1.2500503063201904, "margin_dpo/margin_std": 4.517343997955322, "step": 87 }, { "epoch": 0.1330309901738473, "grad_norm": 3.1865837574005127, "learning_rate": 4.986026928455767e-07, "logits/chosen": 0.10692833364009857, "logits/rejected": 0.08406249433755875, "logps/chosen": -47.30534362792969, "logps/ref_chosen": -44.642494201660156, "logps/ref_rejected": -57.57598114013672, "logps/rejected": -62.85344314575195, "loss": 1.3767, "margin_dpo/margin_mean": 2.6146082878112793, "margin_dpo/margin_std": 3.4296021461486816, "step": 88 }, { "epoch": 0.1345427059712774, "grad_norm": 3.5698328018188477, "learning_rate": 4.984596161153135e-07, "logits/chosen": 0.23458774387836456, "logits/rejected": 0.11213146895170212, "logps/chosen": -47.105751037597656, "logps/ref_chosen": -43.498695373535156, "logps/ref_rejected": -72.5770034790039, "logps/rejected": -78.28662109375, "loss": 1.3671, "margin_dpo/margin_mean": 2.10256290435791, "margin_dpo/margin_std": 2.2028491497039795, "step": 89 }, { "epoch": 0.1360544217687075, "grad_norm": 3.3276965618133545, "learning_rate": 4.983095894354857e-07, "logits/chosen": 0.24859435856342316, "logits/rejected": 0.14004702866077423, "logps/chosen": -35.86720275878906, "logps/ref_chosen": -32.05683517456055, "logps/ref_rejected": -76.27911376953125, "logps/rejected": -82.3067626953125, "loss": 1.3688, "margin_dpo/margin_mean": 2.2172858715057373, "margin_dpo/margin_std": 4.022238731384277, "step": 90 }, { "epoch": 0.13756613756613756, "grad_norm": 2.8259410858154297, "learning_rate": 4.98152617002662e-07, "logits/chosen": 0.2829027473926544, "logits/rejected": 0.1933433711528778, "logps/chosen": -42.66006088256836, "logps/ref_chosen": -38.95655822753906, "logps/ref_rejected": -85.30648803710938, "logps/rejected": -92.27479553222656, "loss": 1.3726, "margin_dpo/margin_mean": 3.264805793762207, "margin_dpo/margin_std": 5.691169738769531, "step": 91 }, { "epoch": 0.13907785336356765, "grad_norm": 3.3671655654907227, "learning_rate": 4.979887032076988e-07, "logits/chosen": 0.1655869483947754, "logits/rejected": 0.14279474318027496, "logps/chosen": -50.55265808105469, "logps/ref_chosen": -46.43939208984375, "logps/ref_rejected": -56.59052276611328, "logps/rejected": -61.2132568359375, "loss": 1.3639, "margin_dpo/margin_mean": 0.5094633102416992, "margin_dpo/margin_std": 2.5982000827789307, "step": 92 }, { "epoch": 0.14058956916099774, "grad_norm": 2.7980592250823975, "learning_rate": 4.978178526356172e-07, "logits/chosen": 0.23437252640724182, "logits/rejected": 0.18408940732479095, "logps/chosen": -47.498390197753906, "logps/ref_chosen": -42.59188461303711, "logps/ref_rejected": -66.06549072265625, "logps/rejected": -73.67166900634766, "loss": 1.3733, "margin_dpo/margin_mean": 2.699659824371338, "margin_dpo/margin_std": 6.147436141967773, "step": 93 }, { "epoch": 0.1421012849584278, "grad_norm": 3.8317432403564453, "learning_rate": 4.976400700654751e-07, "logits/chosen": 0.23454198241233826, "logits/rejected": 0.24940252304077148, "logps/chosen": -82.16134643554688, "logps/ref_chosen": -77.08745574951172, "logps/ref_rejected": -72.75309753417969, "logps/rejected": -78.77753448486328, "loss": 1.3598, "margin_dpo/margin_mean": 0.9505494832992554, "margin_dpo/margin_std": 5.345444679260254, "step": 94 }, { "epoch": 0.1436130007558579, "grad_norm": 3.494490146636963, "learning_rate": 4.974553604702332e-07, "logits/chosen": 0.13477879762649536, "logits/rejected": 0.0803680494427681, "logps/chosen": -63.49205017089844, "logps/ref_chosen": -57.205665588378906, "logps/ref_rejected": -101.0076904296875, "logps/rejected": -108.45451354980469, "loss": 1.3646, "margin_dpo/margin_mean": 1.1604300737380981, "margin_dpo/margin_std": 4.039283752441406, "step": 95 }, { "epoch": 0.14512471655328799, "grad_norm": 3.4159934520721436, "learning_rate": 4.972637290166157e-07, "logits/chosen": 0.27271753549575806, "logits/rejected": 0.200592041015625, "logps/chosen": -78.8208999633789, "logps/ref_chosen": -71.87321472167969, "logps/ref_rejected": -108.4333267211914, "logps/rejected": -119.43417358398438, "loss": 1.3618, "margin_dpo/margin_mean": 4.053152084350586, "margin_dpo/margin_std": 8.04190731048584, "step": 96 }, { "epoch": 0.14663643235071808, "grad_norm": 3.486067056655884, "learning_rate": 4.970651810649666e-07, "logits/chosen": 0.15996894240379333, "logits/rejected": 0.2043858766555786, "logps/chosen": -104.0246810913086, "logps/ref_chosen": -93.0615463256836, "logps/ref_rejected": -54.65837097167969, "logps/rejected": -61.65705871582031, "loss": 1.3753, "margin_dpo/margin_mean": -3.9644370079040527, "margin_dpo/margin_std": 11.941303253173828, "step": 97 }, { "epoch": 0.14814814814814814, "grad_norm": 2.9447083473205566, "learning_rate": 4.968597221690985e-07, "logits/chosen": 0.19196490943431854, "logits/rejected": 0.1588142365217209, "logps/chosen": -72.50187683105469, "logps/ref_chosen": -64.01171875, "logps/ref_rejected": -64.54795837402344, "logps/rejected": -73.04808044433594, "loss": 1.3791, "margin_dpo/margin_mean": 0.009970307350158691, "margin_dpo/margin_std": 4.951154708862305, "step": 98 }, { "epoch": 0.14965986394557823, "grad_norm": 3.1145100593566895, "learning_rate": 4.966473580761389e-07, "logits/chosen": 0.3419758081436157, "logits/rejected": 0.2835521697998047, "logps/chosen": -59.71137619018555, "logps/ref_chosen": -53.621009826660156, "logps/ref_rejected": -93.46940612792969, "logps/rejected": -103.66671752929688, "loss": 1.3786, "margin_dpo/margin_mean": 4.106945991516113, "margin_dpo/margin_std": 5.849977016448975, "step": 99 }, { "epoch": 0.15117157974300832, "grad_norm": 3.7920093536376953, "learning_rate": 4.964280947263676e-07, "logits/chosen": 0.24154677987098694, "logits/rejected": 0.23493698239326477, "logps/chosen": -102.99555969238281, "logps/ref_chosen": -93.66744995117188, "logps/ref_rejected": -110.8492431640625, "logps/rejected": -124.81897735595703, "loss": 1.3656, "margin_dpo/margin_mean": 4.6416401863098145, "margin_dpo/margin_std": 7.766916275024414, "step": 100 }, { "epoch": 0.15268329554043839, "grad_norm": 3.0823545455932617, "learning_rate": 4.96201938253052e-07, "logits/chosen": 0.09690912812948227, "logits/rejected": 0.05974133312702179, "logps/chosen": -71.51644897460938, "logps/ref_chosen": -63.71492004394531, "logps/ref_rejected": -83.56236267089844, "logps/rejected": -93.69319152832031, "loss": 1.3574, "margin_dpo/margin_mean": 2.32930326461792, "margin_dpo/margin_std": 8.121504783630371, "step": 101 }, { "epoch": 0.15419501133786848, "grad_norm": 3.003969430923462, "learning_rate": 4.959688949822748e-07, "logits/chosen": 0.11170051246881485, "logits/rejected": 0.12498050928115845, "logps/chosen": -73.76181030273438, "logps/ref_chosen": -65.03767395019531, "logps/ref_rejected": -65.62431335449219, "logps/rejected": -73.43655395507812, "loss": 1.381, "margin_dpo/margin_mean": -0.9118953943252563, "margin_dpo/margin_std": 7.858981132507324, "step": 102 }, { "epoch": 0.15570672713529857, "grad_norm": 3.363142251968384, "learning_rate": 4.957289714327572e-07, "logits/chosen": 0.28459155559539795, "logits/rejected": 0.2856888771057129, "logps/chosen": -62.49891662597656, "logps/ref_chosen": -54.93040466308594, "logps/ref_rejected": -56.902076721191406, "logps/rejected": -65.07192993164062, "loss": 1.3643, "margin_dpo/margin_mean": 0.6013284921646118, "margin_dpo/margin_std": 7.1304521560668945, "step": 103 }, { "epoch": 0.15721844293272866, "grad_norm": 4.440778732299805, "learning_rate": 4.954821743156767e-07, "logits/chosen": 0.28602921962738037, "logits/rejected": 0.20815476775169373, "logps/chosen": -76.05664825439453, "logps/ref_chosen": -66.83149719238281, "logps/ref_rejected": -114.09371185302734, "logps/rejected": -128.50567626953125, "loss": 1.3449, "margin_dpo/margin_mean": 5.186819553375244, "margin_dpo/margin_std": 18.05282974243164, "step": 104 }, { "epoch": 0.15873015873015872, "grad_norm": 3.9823782444000244, "learning_rate": 4.952285105344791e-07, "logits/chosen": 0.2622066140174866, "logits/rejected": 0.18628236651420593, "logps/chosen": -52.09508514404297, "logps/ref_chosen": -46.05514144897461, "logps/ref_rejected": -78.72990417480469, "logps/rejected": -89.09690856933594, "loss": 1.3504, "margin_dpo/margin_mean": 4.327066898345947, "margin_dpo/margin_std": 8.973100662231445, "step": 105 }, { "epoch": 0.1602418745275888, "grad_norm": 3.0525312423706055, "learning_rate": 4.949679871846857e-07, "logits/chosen": 0.2560195326805115, "logits/rejected": 0.21072283387184143, "logps/chosen": -62.558570861816406, "logps/ref_chosen": -53.9271240234375, "logps/ref_rejected": -66.69943237304688, "logps/rejected": -78.46056365966797, "loss": 1.3676, "margin_dpo/margin_mean": 3.129687786102295, "margin_dpo/margin_std": 10.326120376586914, "step": 106 }, { "epoch": 0.1617535903250189, "grad_norm": 3.6129863262176514, "learning_rate": 4.947006115536947e-07, "logits/chosen": 0.20093579590320587, "logits/rejected": 0.26784881949424744, "logps/chosen": -103.26329040527344, "logps/ref_chosen": -87.38455200195312, "logps/ref_rejected": -76.76750183105469, "logps/rejected": -89.45541381835938, "loss": 1.3811, "margin_dpo/margin_mean": -3.1908369064331055, "margin_dpo/margin_std": 17.2308292388916, "step": 107 }, { "epoch": 0.16326530612244897, "grad_norm": 3.4911181926727295, "learning_rate": 4.944263911205772e-07, "logits/chosen": 0.2522239685058594, "logits/rejected": 0.1837356686592102, "logps/chosen": -72.13520812988281, "logps/ref_chosen": -62.109657287597656, "logps/ref_rejected": -91.77952575683594, "logps/rejected": -107.07212829589844, "loss": 1.3568, "margin_dpo/margin_mean": 5.267055034637451, "margin_dpo/margin_std": 8.78154182434082, "step": 108 }, { "epoch": 0.16477702191987906, "grad_norm": 4.055303573608398, "learning_rate": 4.941453335558681e-07, "logits/chosen": 0.3957793116569519, "logits/rejected": 0.35673829913139343, "logps/chosen": -84.13023376464844, "logps/ref_chosen": -70.22321319580078, "logps/ref_rejected": -103.89926147460938, "logps/rejected": -124.2558364868164, "loss": 1.3295, "margin_dpo/margin_mean": 6.449550151824951, "margin_dpo/margin_std": 12.290249824523926, "step": 109 }, { "epoch": 0.16628873771730915, "grad_norm": 4.451986312866211, "learning_rate": 4.938574467213517e-07, "logits/chosen": 0.2378203570842743, "logits/rejected": 0.25508666038513184, "logps/chosen": -85.72879028320312, "logps/ref_chosen": -72.66490173339844, "logps/ref_rejected": -62.63153076171875, "logps/rejected": -74.57170867919922, "loss": 1.4061, "margin_dpo/margin_mean": -1.1237014532089233, "margin_dpo/margin_std": 12.696800231933594, "step": 110 }, { "epoch": 0.16780045351473924, "grad_norm": 3.417356014251709, "learning_rate": 4.935627386698418e-07, "logits/chosen": 0.26538121700286865, "logits/rejected": 0.27719810605049133, "logps/chosen": -81.70519256591797, "logps/ref_chosen": -67.52632141113281, "logps/ref_rejected": -73.84911346435547, "logps/rejected": -89.34820556640625, "loss": 1.3481, "margin_dpo/margin_mean": 1.3202223777770996, "margin_dpo/margin_std": 9.906134605407715, "step": 111 }, { "epoch": 0.1693121693121693, "grad_norm": 4.943251132965088, "learning_rate": 4.932612176449559e-07, "logits/chosen": 0.19840675592422485, "logits/rejected": 0.1570388376712799, "logps/chosen": -72.82176971435547, "logps/ref_chosen": -61.945491790771484, "logps/ref_rejected": -113.05886840820312, "logps/rejected": -132.51878356933594, "loss": 1.3318, "margin_dpo/margin_mean": 8.58362865447998, "margin_dpo/margin_std": 13.595785140991211, "step": 112 }, { "epoch": 0.1708238851095994, "grad_norm": 3.653705358505249, "learning_rate": 4.929528920808854e-07, "logits/chosen": 0.18068620562553406, "logits/rejected": 0.22447502613067627, "logps/chosen": -83.29192352294922, "logps/ref_chosen": -68.68717193603516, "logps/ref_rejected": -49.824913024902344, "logps/rejected": -59.991519927978516, "loss": 1.3697, "margin_dpo/margin_mean": -4.438149452209473, "margin_dpo/margin_std": 8.201181411743164, "step": 113 }, { "epoch": 0.17233560090702948, "grad_norm": 3.5117268562316895, "learning_rate": 4.92637770602159e-07, "logits/chosen": 0.37020614743232727, "logits/rejected": 0.3371107578277588, "logps/chosen": -83.77205657958984, "logps/ref_chosen": -68.70469665527344, "logps/ref_rejected": -66.02621459960938, "logps/rejected": -80.38362121582031, "loss": 1.3549, "margin_dpo/margin_mean": -0.7099564075469971, "margin_dpo/margin_std": 13.317527770996094, "step": 114 }, { "epoch": 0.17384731670445955, "grad_norm": 3.4344635009765625, "learning_rate": 4.923158620234019e-07, "logits/chosen": 0.5059663653373718, "logits/rejected": 0.3934090733528137, "logps/chosen": -78.51239776611328, "logps/ref_chosen": -64.35462951660156, "logps/ref_rejected": -91.54823303222656, "logps/rejected": -109.61679077148438, "loss": 1.3504, "margin_dpo/margin_mean": 3.910794496536255, "margin_dpo/margin_std": 20.801626205444336, "step": 115 }, { "epoch": 0.17535903250188964, "grad_norm": 4.414751052856445, "learning_rate": 4.91987175349089e-07, "logits/chosen": 0.35310861468315125, "logits/rejected": 0.28566908836364746, "logps/chosen": -62.23815155029297, "logps/ref_chosen": -49.481990814208984, "logps/ref_rejected": -81.90135192871094, "logps/rejected": -104.6424560546875, "loss": 1.3141, "margin_dpo/margin_mean": 9.984952926635742, "margin_dpo/margin_std": 16.990009307861328, "step": 116 }, { "epoch": 0.17687074829931973, "grad_norm": 3.1478612422943115, "learning_rate": 4.916517197732933e-07, "logits/chosen": 0.4462072551250458, "logits/rejected": 0.3813575208187103, "logps/chosen": -64.82511901855469, "logps/ref_chosen": -52.36582946777344, "logps/ref_rejected": -76.98123931884766, "logps/rejected": -93.8857192993164, "loss": 1.3637, "margin_dpo/margin_mean": 4.445184707641602, "margin_dpo/margin_std": 15.84908676147461, "step": 117 }, { "epoch": 0.17838246409674982, "grad_norm": 4.316784381866455, "learning_rate": 4.913095046794281e-07, "logits/chosen": 0.33763909339904785, "logits/rejected": 0.33866265416145325, "logps/chosen": -76.38975524902344, "logps/ref_chosen": -59.73781204223633, "logps/ref_rejected": -71.17877197265625, "logps/rejected": -88.17243194580078, "loss": 1.3327, "margin_dpo/margin_mean": 0.3417208194732666, "margin_dpo/margin_std": 10.12466049194336, "step": 118 }, { "epoch": 0.17989417989417988, "grad_norm": 3.382061719894409, "learning_rate": 4.909605396399855e-07, "logits/chosen": 0.42505258321762085, "logits/rejected": 0.4622851014137268, "logps/chosen": -75.50498962402344, "logps/ref_chosen": -57.297969818115234, "logps/ref_rejected": -43.269874572753906, "logps/rejected": -58.09909439086914, "loss": 1.3609, "margin_dpo/margin_mean": -3.377796173095703, "margin_dpo/margin_std": 17.354841232299805, "step": 119 }, { "epoch": 0.18140589569160998, "grad_norm": 4.64827823638916, "learning_rate": 4.906048344162676e-07, "logits/chosen": 0.3838917016983032, "logits/rejected": 0.2518424391746521, "logps/chosen": -73.36843872070312, "logps/ref_chosen": -59.898441314697266, "logps/ref_rejected": -103.18621826171875, "logps/rejected": -131.10989379882812, "loss": 1.3166, "margin_dpo/margin_mean": 14.453689575195312, "margin_dpo/margin_std": 17.748037338256836, "step": 120 }, { "epoch": 0.18291761148904007, "grad_norm": 5.509910583496094, "learning_rate": 4.902423989581143e-07, "logits/chosen": 0.4782486855983734, "logits/rejected": 0.3805123567581177, "logps/chosen": -88.04644775390625, "logps/ref_chosen": -67.99308776855469, "logps/ref_rejected": -128.11260986328125, "logps/rejected": -163.5806427001953, "loss": 1.2951, "margin_dpo/margin_mean": 15.414678573608398, "margin_dpo/margin_std": 29.943111419677734, "step": 121 }, { "epoch": 0.18442932728647016, "grad_norm": 3.7154958248138428, "learning_rate": 4.898732434036243e-07, "logits/chosen": 0.4762170612812042, "logits/rejected": 0.41412898898124695, "logps/chosen": -74.78926086425781, "logps/ref_chosen": -57.45248031616211, "logps/ref_rejected": -94.9098892211914, "logps/rejected": -132.783447265625, "loss": 1.3373, "margin_dpo/margin_mean": 20.53677749633789, "margin_dpo/margin_std": 39.82550811767578, "step": 122 }, { "epoch": 0.18594104308390022, "grad_norm": 3.9479894638061523, "learning_rate": 4.894973780788722e-07, "logits/chosen": 0.4904557466506958, "logits/rejected": 0.3759152293205261, "logps/chosen": -68.86778259277344, "logps/ref_chosen": -51.397483825683594, "logps/ref_rejected": -106.05813598632812, "logps/rejected": -142.84266662597656, "loss": 1.3303, "margin_dpo/margin_mean": 19.314233779907227, "margin_dpo/margin_std": 27.63502311706543, "step": 123 }, { "epoch": 0.1874527588813303, "grad_norm": 4.315618515014648, "learning_rate": 4.89114813497619e-07, "logits/chosen": 0.5175679922103882, "logits/rejected": 0.3383052349090576, "logps/chosen": -46.36642837524414, "logps/ref_chosen": -35.62065887451172, "logps/ref_rejected": -100.73350524902344, "logps/rejected": -135.20120239257812, "loss": 1.3198, "margin_dpo/margin_mean": 23.721927642822266, "margin_dpo/margin_std": 20.40395736694336, "step": 124 }, { "epoch": 0.1889644746787604, "grad_norm": 4.568764686584473, "learning_rate": 4.887255603610184e-07, "logits/chosen": 0.5989629030227661, "logits/rejected": 0.4955187439918518, "logps/chosen": -93.56169891357422, "logps/ref_chosen": -66.91831970214844, "logps/ref_rejected": -116.68222045898438, "logps/rejected": -157.46566772460938, "loss": 1.313, "margin_dpo/margin_mean": 14.140068054199219, "margin_dpo/margin_std": 47.2589225769043, "step": 125 }, { "epoch": 0.19047619047619047, "grad_norm": 4.645914077758789, "learning_rate": 4.883296295573176e-07, "logits/chosen": 0.3819458484649658, "logits/rejected": 0.3799610137939453, "logps/chosen": -78.845703125, "logps/ref_chosen": -60.64362335205078, "logps/ref_rejected": -54.69144821166992, "logps/rejected": -76.06550598144531, "loss": 1.3929, "margin_dpo/margin_mean": 3.171980381011963, "margin_dpo/margin_std": 13.540816307067871, "step": 126 }, { "epoch": 0.19198790627362056, "grad_norm": 3.9225544929504395, "learning_rate": 4.87927032161552e-07, "logits/chosen": 0.5223321914672852, "logits/rejected": 0.4658096730709076, "logps/chosen": -100.17329406738281, "logps/ref_chosen": -73.2366943359375, "logps/ref_rejected": -83.54694366455078, "logps/rejected": -119.79974365234375, "loss": 1.3401, "margin_dpo/margin_mean": 9.31619930267334, "margin_dpo/margin_std": 25.015779495239258, "step": 127 }, { "epoch": 0.19349962207105065, "grad_norm": 4.997095584869385, "learning_rate": 4.875177794352363e-07, "logits/chosen": 0.4383442997932434, "logits/rejected": 0.4075517952442169, "logps/chosen": -108.09526062011719, "logps/ref_chosen": -81.03706359863281, "logps/ref_rejected": -95.60237121582031, "logps/rejected": -140.10910034179688, "loss": 1.2741, "margin_dpo/margin_mean": 17.448543548583984, "margin_dpo/margin_std": 43.10020446777344, "step": 128 }, { "epoch": 0.19501133786848074, "grad_norm": 5.618246078491211, "learning_rate": 4.871018828260491e-07, "logits/chosen": 0.44885939359664917, "logits/rejected": 0.41066890954971313, "logps/chosen": -70.06056213378906, "logps/ref_chosen": -46.31350326538086, "logps/ref_rejected": -68.42625427246094, "logps/rejected": -98.83761596679688, "loss": 1.3926, "margin_dpo/margin_mean": 6.664305686950684, "margin_dpo/margin_std": 23.976375579833984, "step": 129 }, { "epoch": 0.1965230536659108, "grad_norm": 5.429229259490967, "learning_rate": 4.866793539675126e-07, "logits/chosen": 0.4905567169189453, "logits/rejected": 0.4603291153907776, "logps/chosen": -101.65276336669922, "logps/ref_chosen": -65.72907257080078, "logps/ref_rejected": -97.77519226074219, "logps/rejected": -144.22227478027344, "loss": 1.2914, "margin_dpo/margin_mean": 10.523391723632812, "margin_dpo/margin_std": 27.65914535522461, "step": 130 }, { "epoch": 0.1980347694633409, "grad_norm": 7.263116359710693, "learning_rate": 4.86250204678667e-07, "logits/chosen": 0.46883371472358704, "logits/rejected": 0.37517213821411133, "logps/chosen": -45.65996551513672, "logps/ref_chosen": -30.02720069885254, "logps/ref_rejected": -52.36793518066406, "logps/rejected": -78.58646392822266, "loss": 1.2509, "margin_dpo/margin_mean": 10.585772514343262, "margin_dpo/margin_std": 22.57959747314453, "step": 131 }, { "epoch": 0.19954648526077098, "grad_norm": 4.200234413146973, "learning_rate": 4.858144469637408e-07, "logits/chosen": 0.5417824387550354, "logits/rejected": 0.5621850490570068, "logps/chosen": -76.25199127197266, "logps/ref_chosen": -51.91057586669922, "logps/ref_rejected": -42.87281799316406, "logps/rejected": -67.77296447753906, "loss": 1.3117, "margin_dpo/margin_mean": 0.5587306022644043, "margin_dpo/margin_std": 15.402095794677734, "step": 132 }, { "epoch": 0.20105820105820105, "grad_norm": 6.736537933349609, "learning_rate": 4.853720930118138e-07, "logits/chosen": 0.5202792882919312, "logits/rejected": 0.49684813618659973, "logps/chosen": -79.01284790039062, "logps/ref_chosen": -51.23572540283203, "logps/ref_rejected": -57.740684509277344, "logps/rejected": -93.11215209960938, "loss": 1.3884, "margin_dpo/margin_mean": 7.594354152679443, "margin_dpo/margin_std": 35.9656867980957, "step": 133 }, { "epoch": 0.20256991685563114, "grad_norm": 4.357104301452637, "learning_rate": 4.849231551964771e-07, "logits/chosen": 0.6357393264770508, "logits/rejected": 0.6053987741470337, "logps/chosen": -93.45689392089844, "logps/ref_chosen": -60.42839813232422, "logps/ref_rejected": -74.99949645996094, "logps/rejected": -121.59634399414062, "loss": 1.281, "margin_dpo/margin_mean": 13.568346977233887, "margin_dpo/margin_std": 36.36805725097656, "step": 134 }, { "epoch": 0.20408163265306123, "grad_norm": 4.28125524520874, "learning_rate": 4.844676460754862e-07, "logits/chosen": 0.5447328090667725, "logits/rejected": 0.5454561710357666, "logps/chosen": -93.66383361816406, "logps/ref_chosen": -57.496219635009766, "logps/ref_rejected": -65.18019104003906, "logps/rejected": -101.57806396484375, "loss": 1.324, "margin_dpo/margin_mean": 0.23026132583618164, "margin_dpo/margin_std": 23.302963256835938, "step": 135 }, { "epoch": 0.20559334845049132, "grad_norm": 5.157009124755859, "learning_rate": 4.840055783904106e-07, "logits/chosen": 0.4646771550178528, "logits/rejected": 0.49990448355674744, "logps/chosen": -127.57366180419922, "logps/ref_chosen": -86.13214111328125, "logps/ref_rejected": -57.09363555908203, "logps/rejected": -86.46631622314453, "loss": 1.2754, "margin_dpo/margin_mean": -12.068828582763672, "margin_dpo/margin_std": 32.699989318847656, "step": 136 }, { "epoch": 0.20710506424792138, "grad_norm": 5.023794174194336, "learning_rate": 4.835369650662767e-07, "logits/chosen": 0.4657016396522522, "logits/rejected": 0.41886240243911743, "logps/chosen": -117.60987091064453, "logps/ref_chosen": -85.38725280761719, "logps/ref_rejected": -74.20018005371094, "logps/rejected": -122.04615020751953, "loss": 1.321, "margin_dpo/margin_mean": 15.623347282409668, "margin_dpo/margin_std": 53.613494873046875, "step": 137 }, { "epoch": 0.20861678004535147, "grad_norm": 6.945674896240234, "learning_rate": 4.830618192112065e-07, "logits/chosen": 0.47126084566116333, "logits/rejected": 0.48690515756607056, "logps/chosen": -163.68258666992188, "logps/ref_chosen": -91.57633972167969, "logps/ref_rejected": -72.2829360961914, "logps/rejected": -130.02001953125, "loss": 1.3501, "margin_dpo/margin_mean": -14.36915397644043, "margin_dpo/margin_std": 60.06218338012695, "step": 138 }, { "epoch": 0.21012849584278157, "grad_norm": 6.725894927978516, "learning_rate": 4.825801541160509e-07, "logits/chosen": 0.4866417348384857, "logits/rejected": 0.49573272466659546, "logps/chosen": -97.94691467285156, "logps/ref_chosen": -61.32575225830078, "logps/ref_rejected": -58.40146255493164, "logps/rejected": -101.65697479248047, "loss": 1.3529, "margin_dpo/margin_mean": 6.634352684020996, "margin_dpo/margin_std": 25.386695861816406, "step": 139 }, { "epoch": 0.21164021164021163, "grad_norm": 6.632240295410156, "learning_rate": 4.820919832540181e-07, "logits/chosen": 0.4285508096218109, "logits/rejected": 0.4244239926338196, "logps/chosen": -144.7003631591797, "logps/ref_chosen": -90.57447814941406, "logps/ref_rejected": -99.98605346679688, "logps/rejected": -158.67343139648438, "loss": 1.3232, "margin_dpo/margin_mean": 4.56149959564209, "margin_dpo/margin_std": 71.53721618652344, "step": 140 }, { "epoch": 0.21315192743764172, "grad_norm": 5.187021255493164, "learning_rate": 4.815973202802966e-07, "logits/chosen": 0.5863088965415955, "logits/rejected": 0.5247952938079834, "logps/chosen": -98.68556213378906, "logps/ref_chosen": -48.589813232421875, "logps/ref_rejected": -86.63417053222656, "logps/rejected": -149.1506805419922, "loss": 1.299, "margin_dpo/margin_mean": 12.420760154724121, "margin_dpo/margin_std": 37.57579040527344, "step": 141 }, { "epoch": 0.2146636432350718, "grad_norm": 4.731565475463867, "learning_rate": 4.810961790316729e-07, "logits/chosen": 0.5422487258911133, "logits/rejected": 0.5291632413864136, "logps/chosen": -82.97702026367188, "logps/ref_chosen": -52.91022491455078, "logps/ref_rejected": -69.94438934326172, "logps/rejected": -105.55659484863281, "loss": 1.3097, "margin_dpo/margin_mean": 5.545400142669678, "margin_dpo/margin_std": 24.14703941345215, "step": 142 }, { "epoch": 0.2161753590325019, "grad_norm": 7.777275085449219, "learning_rate": 4.805885735261454e-07, "logits/chosen": 0.6047331690788269, "logits/rejected": 0.5495315194129944, "logps/chosen": -65.96603393554688, "logps/ref_chosen": -41.020355224609375, "logps/ref_rejected": -76.39324951171875, "logps/rejected": -121.12960815429688, "loss": 1.399, "margin_dpo/margin_mean": 19.79067611694336, "margin_dpo/margin_std": 37.09278106689453, "step": 143 }, { "epoch": 0.21768707482993196, "grad_norm": 7.344333171844482, "learning_rate": 4.800745179625307e-07, "logits/chosen": 0.5492057204246521, "logits/rejected": 0.5295698046684265, "logps/chosen": -72.70046997070312, "logps/ref_chosen": -42.882530212402344, "logps/ref_rejected": -55.65336608886719, "logps/rejected": -92.50035095214844, "loss": 1.3536, "margin_dpo/margin_mean": 7.029045104980469, "margin_dpo/margin_std": 13.494336128234863, "step": 144 }, { "epoch": 0.21919879062736206, "grad_norm": 13.654488563537598, "learning_rate": 4.795540267200686e-07, "logits/chosen": 0.3647320866584778, "logits/rejected": 0.3844793438911438, "logps/chosen": -107.22215270996094, "logps/ref_chosen": -64.46841430664062, "logps/ref_rejected": -74.22611236572266, "logps/rejected": -127.66377258300781, "loss": 1.4044, "margin_dpo/margin_mean": 10.683923721313477, "margin_dpo/margin_std": 56.14466857910156, "step": 145 }, { "epoch": 0.22071050642479215, "grad_norm": 5.76055908203125, "learning_rate": 4.790271143580173e-07, "logits/chosen": 0.5389979481697083, "logits/rejected": 0.5553423166275024, "logps/chosen": -108.01651000976562, "logps/ref_chosen": -66.7105712890625, "logps/ref_rejected": -65.54419708251953, "logps/rejected": -108.33381652832031, "loss": 1.3497, "margin_dpo/margin_mean": 1.4836831092834473, "margin_dpo/margin_std": 34.92258071899414, "step": 146 }, { "epoch": 0.2222222222222222, "grad_norm": 5.631329536437988, "learning_rate": 4.784937956152489e-07, "logits/chosen": 0.42997854948043823, "logits/rejected": 0.3938245177268982, "logps/chosen": -113.09949493408203, "logps/ref_chosen": -69.16227722167969, "logps/ref_rejected": -93.64544677734375, "logps/rejected": -146.63816833496094, "loss": 1.3586, "margin_dpo/margin_mean": 9.055498123168945, "margin_dpo/margin_std": 28.972103118896484, "step": 147 }, { "epoch": 0.2237339380196523, "grad_norm": 5.31839656829834, "learning_rate": 4.779540854098347e-07, "logits/chosen": 0.7581343650817871, "logits/rejected": 0.6984580755233765, "logps/chosen": -70.00361633300781, "logps/ref_chosen": -43.368194580078125, "logps/ref_rejected": -76.12153625488281, "logps/rejected": -122.95445251464844, "loss": 1.2599, "margin_dpo/margin_mean": 20.197490692138672, "margin_dpo/margin_std": 35.63082504272461, "step": 148 }, { "epoch": 0.2252456538170824, "grad_norm": 4.855349063873291, "learning_rate": 4.774079988386296e-07, "logits/chosen": 0.4739559292793274, "logits/rejected": 0.5105680227279663, "logps/chosen": -98.31448364257812, "logps/ref_chosen": -59.394657135009766, "logps/ref_rejected": -50.62220001220703, "logps/rejected": -90.08949279785156, "loss": 1.3099, "margin_dpo/margin_mean": 0.5474551916122437, "margin_dpo/margin_std": 29.59663963317871, "step": 149 }, { "epoch": 0.22675736961451248, "grad_norm": 4.553208827972412, "learning_rate": 4.768555511768486e-07, "logits/chosen": 0.41769298911094666, "logits/rejected": 0.42729195952415466, "logps/chosen": -122.25434875488281, "logps/ref_chosen": -75.73005676269531, "logps/ref_rejected": -88.11376190185547, "logps/rejected": -137.2162322998047, "loss": 1.277, "margin_dpo/margin_mean": 2.578186511993408, "margin_dpo/margin_std": 40.36921691894531, "step": 150 }, { "epoch": 0.22826908541194255, "grad_norm": 4.969564437866211, "learning_rate": 4.762967578776406e-07, "logits/chosen": 0.528440535068512, "logits/rejected": 0.4810951352119446, "logps/chosen": -94.2465591430664, "logps/ref_chosen": -64.3418960571289, "logps/ref_rejected": -81.62193298339844, "logps/rejected": -135.70932006835938, "loss": 1.2305, "margin_dpo/margin_mean": 24.182723999023438, "margin_dpo/margin_std": 51.8789176940918, "step": 151 }, { "epoch": 0.22978080120937264, "grad_norm": 4.437036037445068, "learning_rate": 4.757316345716553e-07, "logits/chosen": 0.4535418152809143, "logits/rejected": 0.4450215995311737, "logps/chosen": -111.42601013183594, "logps/ref_chosen": -71.10409545898438, "logps/ref_rejected": -85.16281127929688, "logps/rejected": -132.8240203857422, "loss": 1.2759, "margin_dpo/margin_mean": 7.339301586151123, "margin_dpo/margin_std": 32.49983596801758, "step": 152 }, { "epoch": 0.23129251700680273, "grad_norm": 4.642518043518066, "learning_rate": 4.751601970666064e-07, "logits/chosen": 0.30414754152297974, "logits/rejected": 0.25842684507369995, "logps/chosen": -119.28223419189453, "logps/ref_chosen": -72.91175842285156, "logps/ref_rejected": -77.55793762207031, "logps/rejected": -136.66567993164062, "loss": 1.2979, "margin_dpo/margin_mean": 12.737245559692383, "margin_dpo/margin_std": 24.552635192871094, "step": 153 }, { "epoch": 0.2328042328042328, "grad_norm": 4.9511237144470215, "learning_rate": 4.745824613468292e-07, "logits/chosen": 0.35246241092681885, "logits/rejected": 0.5255336761474609, "logps/chosen": -121.34557342529297, "logps/ref_chosen": -75.31983947753906, "logps/ref_rejected": -54.22064208984375, "logps/rejected": -90.056884765625, "loss": 1.3369, "margin_dpo/margin_mean": -10.189491271972656, "margin_dpo/margin_std": 41.34107971191406, "step": 154 }, { "epoch": 0.23431594860166288, "grad_norm": 5.262056350708008, "learning_rate": 4.7399844357283393e-07, "logits/chosen": 0.5434817671775818, "logits/rejected": 0.46925586462020874, "logps/chosen": -76.32972717285156, "logps/ref_chosen": -44.81412124633789, "logps/ref_rejected": -76.29104614257812, "logps/rejected": -128.21963500976562, "loss": 1.3033, "margin_dpo/margin_mean": 20.412994384765625, "margin_dpo/margin_std": 27.084177017211914, "step": 155 }, { "epoch": 0.23582766439909297, "grad_norm": 5.0457353591918945, "learning_rate": 4.7340816008085305e-07, "logits/chosen": 0.44192785024642944, "logits/rejected": 0.4855366349220276, "logps/chosen": -137.42698669433594, "logps/ref_chosen": -94.79901123046875, "logps/ref_rejected": -81.31282806396484, "logps/rejected": -137.68701171875, "loss": 1.2281, "margin_dpo/margin_mean": 13.746213912963867, "margin_dpo/margin_std": 34.27767562866211, "step": 156 }, { "epoch": 0.23733938019652306, "grad_norm": 4.6686692237854, "learning_rate": 4.728116273823847e-07, "logits/chosen": 0.4652218222618103, "logits/rejected": 0.45060834288597107, "logps/chosen": -98.19709777832031, "logps/ref_chosen": -58.356666564941406, "logps/ref_rejected": -77.88996887207031, "logps/rejected": -127.36456298828125, "loss": 1.2929, "margin_dpo/margin_mean": 9.634150505065918, "margin_dpo/margin_std": 30.75609588623047, "step": 157 }, { "epoch": 0.23885109599395313, "grad_norm": 4.9093017578125, "learning_rate": 4.7220886216373085e-07, "logits/chosen": 0.4573588967323303, "logits/rejected": 0.458060622215271, "logps/chosen": -108.16549682617188, "logps/ref_chosen": -66.18313598632812, "logps/ref_rejected": -68.98072814941406, "logps/rejected": -120.69749450683594, "loss": 1.2881, "margin_dpo/margin_mean": 9.734394073486328, "margin_dpo/margin_std": 32.37115478515625, "step": 158 }, { "epoch": 0.24036281179138322, "grad_norm": 4.744980812072754, "learning_rate": 4.715998812855304e-07, "logits/chosen": 0.6077166199684143, "logits/rejected": 0.550294816493988, "logps/chosen": -97.12281799316406, "logps/ref_chosen": -63.03964614868164, "logps/ref_rejected": -94.90765380859375, "logps/rejected": -150.78564453125, "loss": 1.2671, "margin_dpo/margin_mean": 21.7947998046875, "margin_dpo/margin_std": 43.32719039916992, "step": 159 }, { "epoch": 0.2418745275888133, "grad_norm": 5.374088287353516, "learning_rate": 4.7098470178228755e-07, "logits/chosen": 0.3768664002418518, "logits/rejected": 0.35334479808807373, "logps/chosen": -82.2614517211914, "logps/ref_chosen": -53.58171081542969, "logps/ref_rejected": -70.01301574707031, "logps/rejected": -118.45684814453125, "loss": 1.2652, "margin_dpo/margin_mean": 19.764087677001953, "margin_dpo/margin_std": 29.50685691833496, "step": 160 }, { "epoch": 0.24338624338624337, "grad_norm": 5.152878761291504, "learning_rate": 4.703633408618955e-07, "logits/chosen": 0.5939978957176208, "logits/rejected": 0.5104175806045532, "logps/chosen": -85.92366790771484, "logps/ref_chosen": -51.761775970458984, "logps/ref_rejected": -89.95056915283203, "logps/rejected": -145.84454345703125, "loss": 1.2587, "margin_dpo/margin_mean": 21.732070922851562, "margin_dpo/margin_std": 34.75413513183594, "step": 161 }, { "epoch": 0.24489795918367346, "grad_norm": 6.838240146636963, "learning_rate": 4.697358159051549e-07, "logits/chosen": 0.5239239931106567, "logits/rejected": 0.5126291513442993, "logps/chosen": -148.87701416015625, "logps/ref_chosen": -93.13358306884766, "logps/ref_rejected": -96.1287841796875, "logps/rejected": -170.743408203125, "loss": 1.2403, "margin_dpo/margin_mean": 18.871191024780273, "margin_dpo/margin_std": 61.78876495361328, "step": 162 }, { "epoch": 0.24640967498110355, "grad_norm": 5.787431240081787, "learning_rate": 4.691021444652876e-07, "logits/chosen": 0.4375625252723694, "logits/rejected": 0.2801172137260437, "logps/chosen": -79.89445495605469, "logps/ref_chosen": -46.83258056640625, "logps/ref_rejected": -108.36699676513672, "logps/rejected": -170.7933807373047, "loss": 1.2107, "margin_dpo/margin_mean": 29.364521026611328, "margin_dpo/margin_std": 36.74135208129883, "step": 163 }, { "epoch": 0.24792139077853365, "grad_norm": 5.994203090667725, "learning_rate": 4.6846234426744624e-07, "logits/chosen": 0.490617573261261, "logits/rejected": 0.4238327443599701, "logps/chosen": -90.10145568847656, "logps/ref_chosen": -54.36710739135742, "logps/ref_rejected": -96.78005981445312, "logps/rejected": -158.1606903076172, "loss": 1.1904, "margin_dpo/margin_mean": 25.646286010742188, "margin_dpo/margin_std": 32.140342712402344, "step": 164 }, { "epoch": 0.2494331065759637, "grad_norm": 5.857151508331299, "learning_rate": 4.678164332082175e-07, "logits/chosen": 0.6752983331680298, "logits/rejected": 0.6979560852050781, "logps/chosen": -97.69610595703125, "logps/ref_chosen": -57.447242736816406, "logps/ref_rejected": -57.388519287109375, "logps/rejected": -110.6009750366211, "loss": 1.2174, "margin_dpo/margin_mean": 12.963600158691406, "margin_dpo/margin_std": 32.28080749511719, "step": 165 }, { "epoch": 0.2509448223733938, "grad_norm": 6.883800029754639, "learning_rate": 4.6716442935512214e-07, "logits/chosen": 0.6320142149925232, "logits/rejected": 0.524321436882019, "logps/chosen": -117.2408447265625, "logps/ref_chosen": -75.21217346191406, "logps/ref_rejected": -97.00743103027344, "logps/rejected": -162.87738037109375, "loss": 1.2198, "margin_dpo/margin_mean": 23.84128189086914, "margin_dpo/margin_std": 30.363832473754883, "step": 166 }, { "epoch": 0.25245653817082386, "grad_norm": 7.713088512420654, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 0.6184214949607849, "logits/rejected": 0.5350244641304016, "logps/chosen": -72.7265853881836, "logps/ref_chosen": -39.998783111572266, "logps/ref_rejected": -68.48226928710938, "logps/rejected": -122.67631530761719, "loss": 1.2041, "margin_dpo/margin_mean": 21.466243743896484, "margin_dpo/margin_std": 22.175495147705078, "step": 167 }, { "epoch": 0.25396825396825395, "grad_norm": 6.5206685066223145, "learning_rate": 4.6584221638904767e-07, "logits/chosen": 0.46801936626434326, "logits/rejected": 0.5133869647979736, "logps/chosen": -126.92230224609375, "logps/ref_chosen": -78.39299774169922, "logps/ref_rejected": -56.804256439208984, "logps/rejected": -111.1657485961914, "loss": 1.2315, "margin_dpo/margin_mean": 5.832178592681885, "margin_dpo/margin_std": 34.10231399536133, "step": 168 }, { "epoch": 0.25547996976568405, "grad_norm": 6.418181419372559, "learning_rate": 4.651720442612075e-07, "logits/chosen": 0.5367448329925537, "logits/rejected": 0.49534183740615845, "logps/chosen": -96.13229370117188, "logps/ref_chosen": -61.65520477294922, "logps/ref_rejected": -74.08130645751953, "logps/rejected": -126.73211669921875, "loss": 1.2526, "margin_dpo/margin_mean": 18.173721313476562, "margin_dpo/margin_std": 33.40047073364258, "step": 169 }, { "epoch": 0.25699168556311414, "grad_norm": 6.973052501678467, "learning_rate": 4.6449585330874425e-07, "logits/chosen": 0.6434294581413269, "logits/rejected": 0.6778239011764526, "logps/chosen": -76.18331146240234, "logps/ref_chosen": -48.08928680419922, "logps/ref_rejected": -49.94983673095703, "logps/rejected": -97.5601577758789, "loss": 1.2333, "margin_dpo/margin_mean": 19.516292572021484, "margin_dpo/margin_std": 36.30491638183594, "step": 170 }, { "epoch": 0.2585034013605442, "grad_norm": 7.348653316497803, "learning_rate": 4.6381366244617224e-07, "logits/chosen": 0.7886656522750854, "logits/rejected": 0.7550907135009766, "logps/chosen": -70.39274597167969, "logps/ref_chosen": -49.932838439941406, "logps/ref_rejected": -69.74531555175781, "logps/rejected": -124.51280212402344, "loss": 1.2094, "margin_dpo/margin_mean": 34.307586669921875, "margin_dpo/margin_std": 35.918819427490234, "step": 171 }, { "epoch": 0.2600151171579743, "grad_norm": 8.536656379699707, "learning_rate": 4.631254907558365e-07, "logits/chosen": 0.8007753491401672, "logits/rejected": 0.7138994932174683, "logps/chosen": -96.61810302734375, "logps/ref_chosen": -49.946144104003906, "logps/ref_rejected": -93.07025146484375, "logps/rejected": -180.4694366455078, "loss": 1.1897, "margin_dpo/margin_mean": 40.72722244262695, "margin_dpo/margin_std": 49.622161865234375, "step": 172 }, { "epoch": 0.2615268329554044, "grad_norm": 7.331446647644043, "learning_rate": 4.624313574873786e-07, "logits/chosen": 0.6109236478805542, "logits/rejected": 0.6213595867156982, "logps/chosen": -75.77210235595703, "logps/ref_chosen": -49.288230895996094, "logps/ref_rejected": -48.86689758300781, "logps/rejected": -101.83216094970703, "loss": 1.1658, "margin_dpo/margin_mean": 26.481386184692383, "margin_dpo/margin_std": 57.468788146972656, "step": 173 }, { "epoch": 0.26303854875283444, "grad_norm": 8.760350227355957, "learning_rate": 4.61731282057198e-07, "logits/chosen": 0.6046967506408691, "logits/rejected": 0.5037060379981995, "logps/chosen": -112.89518737792969, "logps/ref_chosen": -64.05992126464844, "logps/ref_rejected": -93.19732666015625, "logps/rejected": -186.00070190429688, "loss": 1.1071, "margin_dpo/margin_mean": 43.968109130859375, "margin_dpo/margin_std": 35.23388671875, "step": 174 }, { "epoch": 0.26455026455026454, "grad_norm": 9.087226867675781, "learning_rate": 4.6102528404790965e-07, "logits/chosen": 0.8991174697875977, "logits/rejected": 0.8246195316314697, "logps/chosen": -83.77995300292969, "logps/ref_chosen": -40.91709899902344, "logps/ref_rejected": -84.71610260009766, "logps/rejected": -157.31451416015625, "loss": 1.1565, "margin_dpo/margin_mean": 29.735551834106445, "margin_dpo/margin_std": 53.80352783203125, "step": 175 }, { "epoch": 0.2660619803476946, "grad_norm": 13.09632396697998, "learning_rate": 4.603133832077953e-07, "logits/chosen": 0.6280794143676758, "logits/rejected": 0.6072982549667358, "logps/chosen": -153.69549560546875, "logps/ref_chosen": -83.32914733886719, "logps/ref_rejected": -69.77914428710938, "logps/rejected": -140.9823455810547, "loss": 1.2784, "margin_dpo/margin_mean": 0.8368606567382812, "margin_dpo/margin_std": 59.72896957397461, "step": 176 }, { "epoch": 0.2675736961451247, "grad_norm": 11.76092529296875, "learning_rate": 4.5959559945025183e-07, "logits/chosen": 0.843967080116272, "logits/rejected": 0.8128781318664551, "logps/chosen": -91.85818481445312, "logps/ref_chosen": -56.50011444091797, "logps/ref_rejected": -76.53157806396484, "logps/rejected": -166.71331787109375, "loss": 1.008, "margin_dpo/margin_mean": 54.82364273071289, "margin_dpo/margin_std": 54.522438049316406, "step": 177 }, { "epoch": 0.2690854119425548, "grad_norm": 12.708898544311523, "learning_rate": 4.588719528532341e-07, "logits/chosen": 0.7774707078933716, "logits/rejected": 0.6817446351051331, "logps/chosen": -82.52491760253906, "logps/ref_chosen": -35.268131256103516, "logps/ref_rejected": -70.45591735839844, "logps/rejected": -159.62277221679688, "loss": 1.1395, "margin_dpo/margin_mean": 41.910072326660156, "margin_dpo/margin_std": 58.84284973144531, "step": 178 }, { "epoch": 0.2705971277399849, "grad_norm": 11.324702262878418, "learning_rate": 4.581424636586928e-07, "logits/chosen": 0.724172830581665, "logits/rejected": 0.7666547298431396, "logps/chosen": -149.95443725585938, "logps/ref_chosen": -82.57086181640625, "logps/ref_rejected": -75.7454605102539, "logps/rejected": -170.7916717529297, "loss": 1.1648, "margin_dpo/margin_mean": 27.66265106201172, "margin_dpo/margin_std": 52.755733489990234, "step": 179 }, { "epoch": 0.272108843537415, "grad_norm": 10.527196884155273, "learning_rate": 4.5740715227200897e-07, "logits/chosen": 0.6827090978622437, "logits/rejected": 0.591106116771698, "logps/chosen": -93.48646545410156, "logps/ref_chosen": -49.27946472167969, "logps/ref_rejected": -70.7477035522461, "logps/rejected": -130.7356719970703, "loss": 1.2205, "margin_dpo/margin_mean": 15.780977249145508, "margin_dpo/margin_std": 23.79660415649414, "step": 180 }, { "epoch": 0.273620559334845, "grad_norm": 12.358736991882324, "learning_rate": 4.566660392614228e-07, "logits/chosen": 0.7833654880523682, "logits/rejected": 0.7754448652267456, "logps/chosen": -116.57708740234375, "logps/ref_chosen": -68.44458770751953, "logps/ref_rejected": -75.00827026367188, "logps/rejected": -140.25169372558594, "loss": 1.0862, "margin_dpo/margin_mean": 17.11092185974121, "margin_dpo/margin_std": 26.67954444885254, "step": 181 }, { "epoch": 0.2751322751322751, "grad_norm": 11.587776184082031, "learning_rate": 4.5591914535745817e-07, "logits/chosen": 0.7390822768211365, "logits/rejected": 0.6326345205307007, "logps/chosen": -106.85360717773438, "logps/ref_chosen": -53.30392074584961, "logps/ref_rejected": -106.17438507080078, "logps/rejected": -206.2800750732422, "loss": 1.0938, "margin_dpo/margin_mean": 46.55598449707031, "margin_dpo/margin_std": 62.35004425048828, "step": 182 }, { "epoch": 0.2766439909297052, "grad_norm": 14.335674285888672, "learning_rate": 4.551664914523433e-07, "logits/chosen": 0.6751140356063843, "logits/rejected": 0.6843445301055908, "logps/chosen": -129.42478942871094, "logps/ref_chosen": -61.0320930480957, "logps/ref_rejected": -64.14570617675781, "logps/rejected": -138.19187927246094, "loss": 1.3218, "margin_dpo/margin_mean": 5.653476238250732, "margin_dpo/margin_std": 64.97230529785156, "step": 183 }, { "epoch": 0.2781557067271353, "grad_norm": 14.724100112915039, "learning_rate": 4.544080985994258e-07, "logits/chosen": 0.8014061450958252, "logits/rejected": 0.7991318702697754, "logps/chosen": -102.4467544555664, "logps/ref_chosen": -58.40687561035156, "logps/ref_rejected": -65.13507080078125, "logps/rejected": -135.5673828125, "loss": 1.1217, "margin_dpo/margin_mean": 26.392431259155273, "margin_dpo/margin_std": 42.3356819152832, "step": 184 }, { "epoch": 0.2796674225245654, "grad_norm": 13.151495933532715, "learning_rate": 4.5364398801258394e-07, "logits/chosen": 0.8872311115264893, "logits/rejected": 0.8844795227050781, "logps/chosen": -117.208984375, "logps/ref_chosen": -68.01717376708984, "logps/ref_rejected": -68.56169128417969, "logps/rejected": -155.34658813476562, "loss": 1.1561, "margin_dpo/margin_mean": 37.59309387207031, "margin_dpo/margin_std": 63.59172058105469, "step": 185 }, { "epoch": 0.2811791383219955, "grad_norm": 18.820205688476562, "learning_rate": 4.5287418106563354e-07, "logits/chosen": 0.7388228178024292, "logits/rejected": 0.6680508255958557, "logps/chosen": -118.04601287841797, "logps/ref_chosen": -66.14515686035156, "logps/ref_rejected": -101.60063934326172, "logps/rejected": -203.31298828125, "loss": 1.1578, "margin_dpo/margin_mean": 49.811492919921875, "margin_dpo/margin_std": 85.83422088623047, "step": 186 }, { "epoch": 0.28269085411942557, "grad_norm": 13.48037052154541, "learning_rate": 4.520986992917297e-07, "logits/chosen": 0.6655247211456299, "logits/rejected": 0.5620222687721252, "logps/chosen": -162.19210815429688, "logps/ref_chosen": -80.47019958496094, "logps/ref_rejected": -118.81498718261719, "logps/rejected": -255.6341094970703, "loss": 1.1797, "margin_dpo/margin_mean": 55.09719467163086, "margin_dpo/margin_std": 82.15184020996094, "step": 187 }, { "epoch": 0.2842025699168556, "grad_norm": 12.772809982299805, "learning_rate": 4.5131756438276466e-07, "logits/chosen": 0.8540096282958984, "logits/rejected": 0.7868589758872986, "logps/chosen": -118.0152587890625, "logps/ref_chosen": -66.75248718261719, "logps/ref_rejected": -97.99404907226562, "logps/rejected": -214.93295288085938, "loss": 1.1456, "margin_dpo/margin_mean": 65.6761474609375, "margin_dpo/margin_std": 84.41012573242188, "step": 188 }, { "epoch": 0.2857142857142857, "grad_norm": 19.746803283691406, "learning_rate": 4.5053079818876096e-07, "logits/chosen": 0.742944598197937, "logits/rejected": 0.8294093012809753, "logps/chosen": -144.683349609375, "logps/ref_chosen": -75.2247314453125, "logps/ref_rejected": -72.38041687011719, "logps/rejected": -162.834228515625, "loss": 1.3007, "margin_dpo/margin_mean": 20.995182037353516, "margin_dpo/margin_std": 69.88140869140625, "step": 189 }, { "epoch": 0.2872260015117158, "grad_norm": 15.231517791748047, "learning_rate": 4.4973842271726024e-07, "logits/chosen": 0.8356347680091858, "logits/rejected": 0.7419763803482056, "logps/chosen": -101.99494934082031, "logps/ref_chosen": -47.77008056640625, "logps/ref_rejected": -84.10618591308594, "logps/rejected": -201.740966796875, "loss": 0.9599, "margin_dpo/margin_mean": 63.4099235534668, "margin_dpo/margin_std": 64.51853942871094, "step": 190 }, { "epoch": 0.2887377173091459, "grad_norm": 20.671388626098633, "learning_rate": 4.48940460132708e-07, "logits/chosen": 0.8784452676773071, "logits/rejected": 0.8466886281967163, "logps/chosen": -134.3704833984375, "logps/ref_chosen": -66.10474395751953, "logps/ref_rejected": -79.35490417480469, "logps/rejected": -206.15818786621094, "loss": 1.2401, "margin_dpo/margin_mean": 58.53753662109375, "margin_dpo/margin_std": 76.90192413330078, "step": 191 }, { "epoch": 0.29024943310657597, "grad_norm": 21.7989559173584, "learning_rate": 4.481369327558329e-07, "logits/chosen": 0.8773584961891174, "logits/rejected": 0.8835272789001465, "logps/chosen": -154.4402618408203, "logps/ref_chosen": -61.76245880126953, "logps/ref_rejected": -60.038848876953125, "logps/rejected": -150.0125732421875, "loss": 1.279, "margin_dpo/margin_mean": -2.704071521759033, "margin_dpo/margin_std": 65.4942626953125, "step": 192 }, { "epoch": 0.29176114890400606, "grad_norm": 13.274942398071289, "learning_rate": 4.47327863063023e-07, "logits/chosen": 0.7359751462936401, "logits/rejected": 0.6782011985778809, "logps/chosen": -131.29852294921875, "logps/ref_chosen": -50.086849212646484, "logps/ref_rejected": -69.65550231933594, "logps/rejected": -187.8815460205078, "loss": 1.1063, "margin_dpo/margin_mean": 37.01435089111328, "margin_dpo/margin_std": 56.66736602783203, "step": 193 }, { "epoch": 0.29327286470143615, "grad_norm": 21.689014434814453, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 0.7673474550247192, "logits/rejected": 0.8195520639419556, "logps/chosen": -152.31005859375, "logps/ref_chosen": -72.05107116699219, "logps/ref_rejected": -71.8668212890625, "logps/rejected": -168.17160034179688, "loss": 1.2219, "margin_dpo/margin_mean": 16.045808792114258, "margin_dpo/margin_std": 65.90227508544922, "step": 194 }, { "epoch": 0.2947845804988662, "grad_norm": 19.105045318603516, "learning_rate": 4.4569318740967043e-07, "logits/chosen": 0.6917105317115784, "logits/rejected": 0.761000394821167, "logps/chosen": -192.57481384277344, "logps/ref_chosen": -83.29867553710938, "logps/ref_rejected": -66.07734680175781, "logps/rejected": -193.638671875, "loss": 1.179, "margin_dpo/margin_mean": 18.285186767578125, "margin_dpo/margin_std": 70.0030288696289, "step": 195 }, { "epoch": 0.2962962962962963, "grad_norm": 14.678657531738281, "learning_rate": 4.448676271745197e-07, "logits/chosen": 0.8080000877380371, "logits/rejected": 0.8198258876800537, "logps/chosen": -177.1346435546875, "logps/ref_chosen": -81.20787048339844, "logps/ref_rejected": -79.35533142089844, "logps/rejected": -194.04977416992188, "loss": 1.1227, "margin_dpo/margin_mean": 18.76766014099121, "margin_dpo/margin_std": 53.9957160949707, "step": 196 }, { "epoch": 0.29780801209372637, "grad_norm": 20.904054641723633, "learning_rate": 4.440366160729392e-07, "logits/chosen": 1.08381986618042, "logits/rejected": 0.9524755477905273, "logps/chosen": -121.47453308105469, "logps/ref_chosen": -38.60869598388672, "logps/ref_rejected": -74.34709167480469, "logps/rejected": -196.74826049804688, "loss": 1.1447, "margin_dpo/margin_mean": 39.53533935546875, "margin_dpo/margin_std": 104.43431091308594, "step": 197 }, { "epoch": 0.29931972789115646, "grad_norm": 18.375619888305664, "learning_rate": 4.432001773500957e-07, "logits/chosen": 1.0280174016952515, "logits/rejected": 0.9963433742523193, "logps/chosen": -113.64097595214844, "logps/ref_chosen": -48.90552520751953, "logps/ref_rejected": -63.93467712402344, "logps/rejected": -166.58355712890625, "loss": 1.1247, "margin_dpo/margin_mean": 37.91343307495117, "margin_dpo/margin_std": 68.63130187988281, "step": 198 }, { "epoch": 0.30083144368858655, "grad_norm": 13.685064315795898, "learning_rate": 4.4235833440297856e-07, "logits/chosen": 0.8831381797790527, "logits/rejected": 0.8222548961639404, "logps/chosen": -129.5958709716797, "logps/ref_chosen": -57.552146911621094, "logps/ref_rejected": -76.73622131347656, "logps/rejected": -196.77474975585938, "loss": 1.1776, "margin_dpo/margin_mean": 47.994808197021484, "margin_dpo/margin_std": 66.5357437133789, "step": 199 }, { "epoch": 0.30234315948601664, "grad_norm": 13.301373481750488, "learning_rate": 4.415111107797445e-07, "logits/chosen": 0.9643712043762207, "logits/rejected": 0.7956128120422363, "logps/chosen": -104.0108413696289, "logps/ref_chosen": -49.7380485534668, "logps/ref_rejected": -119.85460662841797, "logps/rejected": -247.071533203125, "loss": 1.0529, "margin_dpo/margin_mean": 72.94413757324219, "margin_dpo/margin_std": 100.05728149414062, "step": 200 }, { "epoch": 0.30385487528344673, "grad_norm": 14.938904762268066, "learning_rate": 4.4065853017905953e-07, "logits/chosen": 0.9494801759719849, "logits/rejected": 0.963019847869873, "logps/chosen": -168.0663604736328, "logps/ref_chosen": -76.25155639648438, "logps/ref_rejected": -75.89337921142578, "logps/rejected": -206.00071716308594, "loss": 1.1877, "margin_dpo/margin_mean": 38.292537689208984, "margin_dpo/margin_std": 74.59246063232422, "step": 201 }, { "epoch": 0.30536659108087677, "grad_norm": 19.734487533569336, "learning_rate": 4.3980061644943575e-07, "logits/chosen": 0.8432959318161011, "logits/rejected": 0.6981616020202637, "logps/chosen": -75.70082092285156, "logps/ref_chosen": -31.3531551361084, "logps/ref_rejected": -70.96551513671875, "logps/rejected": -184.80126953125, "loss": 1.0936, "margin_dpo/margin_mean": 69.48809051513672, "margin_dpo/margin_std": 59.054290771484375, "step": 202 }, { "epoch": 0.30687830687830686, "grad_norm": 17.070323944091797, "learning_rate": 4.3893739358856455e-07, "logits/chosen": 0.9048564434051514, "logits/rejected": 0.7673661708831787, "logps/chosen": -144.82659912109375, "logps/ref_chosen": -63.048912048339844, "logps/ref_rejected": -137.18235778808594, "logps/rejected": -288.850341796875, "loss": 1.0973, "margin_dpo/margin_mean": 69.89026641845703, "margin_dpo/margin_std": 88.3589096069336, "step": 203 }, { "epoch": 0.30839002267573695, "grad_norm": 14.029024124145508, "learning_rate": 4.380688857426449e-07, "logits/chosen": 0.8778685331344604, "logits/rejected": 0.7175527811050415, "logps/chosen": -111.14114379882812, "logps/ref_chosen": -45.349220275878906, "logps/ref_rejected": -89.06627655029297, "logps/rejected": -215.51800537109375, "loss": 1.0848, "margin_dpo/margin_mean": 60.6597900390625, "margin_dpo/margin_std": 76.72471618652344, "step": 204 }, { "epoch": 0.30990173847316704, "grad_norm": 15.573190689086914, "learning_rate": 4.3719511720570814e-07, "logits/chosen": 1.0194439888000488, "logits/rejected": 0.9591749310493469, "logps/chosen": -138.189697265625, "logps/ref_chosen": -68.1277847290039, "logps/ref_rejected": -94.7017593383789, "logps/rejected": -237.84596252441406, "loss": 1.1396, "margin_dpo/margin_mean": 73.08229064941406, "margin_dpo/margin_std": 103.01658630371094, "step": 205 }, { "epoch": 0.31141345427059713, "grad_norm": 21.639747619628906, "learning_rate": 4.363161124189387e-07, "logits/chosen": 1.0190932750701904, "logits/rejected": 0.9483546614646912, "logps/chosen": -123.68317413330078, "logps/ref_chosen": -49.561851501464844, "logps/ref_rejected": -91.47630310058594, "logps/rejected": -198.70071411132812, "loss": 1.2179, "margin_dpo/margin_mean": 33.103092193603516, "margin_dpo/margin_std": 57.71031951904297, "step": 206 }, { "epoch": 0.3129251700680272, "grad_norm": 17.420026779174805, "learning_rate": 4.3543189596998986e-07, "logits/chosen": 0.9188834428787231, "logits/rejected": 0.8394033908843994, "logps/chosen": -133.870361328125, "logps/ref_chosen": -39.89246368408203, "logps/ref_rejected": -70.57868194580078, "logps/rejected": -204.41519165039062, "loss": 1.14, "margin_dpo/margin_mean": 39.858604431152344, "margin_dpo/margin_std": 71.88990783691406, "step": 207 }, { "epoch": 0.3144368858654573, "grad_norm": 16.912046432495117, "learning_rate": 4.3454249259229664e-07, "logits/chosen": 0.8252842426300049, "logits/rejected": 0.7790793776512146, "logps/chosen": -161.83831787109375, "logps/ref_chosen": -72.9476547241211, "logps/ref_rejected": -94.71766662597656, "logps/rejected": -216.20074462890625, "loss": 1.31, "margin_dpo/margin_mean": 32.592403411865234, "margin_dpo/margin_std": 57.00579071044922, "step": 208 }, { "epoch": 0.31594860166288735, "grad_norm": 25.769882202148438, "learning_rate": 4.336479271643833e-07, "logits/chosen": 0.9054467678070068, "logits/rejected": 0.831355631351471, "logps/chosen": -154.79385375976562, "logps/ref_chosen": -65.3952865600586, "logps/ref_rejected": -94.79216003417969, "logps/rejected": -261.7325439453125, "loss": 1.166, "margin_dpo/margin_mean": 77.54179382324219, "margin_dpo/margin_std": 97.3060073852539, "step": 209 }, { "epoch": 0.31746031746031744, "grad_norm": 12.30976676940918, "learning_rate": 4.327482247091679e-07, "logits/chosen": 0.8562976121902466, "logits/rejected": 0.6786376237869263, "logps/chosen": -197.23080444335938, "logps/ref_chosen": -88.16167449951172, "logps/ref_rejected": -143.62713623046875, "logps/rejected": -364.16387939453125, "loss": 1.0647, "margin_dpo/margin_mean": 111.46759033203125, "margin_dpo/margin_std": 77.64178466796875, "step": 210 }, { "epoch": 0.31897203325774753, "grad_norm": 16.888795852661133, "learning_rate": 4.3184341039326217e-07, "logits/chosen": 1.019036054611206, "logits/rejected": 0.9224546551704407, "logps/chosen": -116.31884765625, "logps/ref_chosen": -43.544952392578125, "logps/ref_rejected": -96.58848571777344, "logps/rejected": -221.36495971679688, "loss": 1.0403, "margin_dpo/margin_mean": 52.002593994140625, "margin_dpo/margin_std": 71.8609390258789, "step": 211 }, { "epoch": 0.3204837490551776, "grad_norm": 14.35758113861084, "learning_rate": 4.309335095262675e-07, "logits/chosen": 0.9997249841690063, "logits/rejected": 1.0045267343521118, "logps/chosen": -140.09591674804688, "logps/ref_chosen": -57.794212341308594, "logps/ref_rejected": -72.26956176757812, "logps/rejected": -212.42254638671875, "loss": 1.0802, "margin_dpo/margin_mean": 57.851280212402344, "margin_dpo/margin_std": 110.18971252441406, "step": 212 }, { "epoch": 0.3219954648526077, "grad_norm": 15.879948616027832, "learning_rate": 4.3001854756006724e-07, "logits/chosen": 0.8429861664772034, "logits/rejected": 0.8865307569503784, "logps/chosen": -176.1222381591797, "logps/ref_chosen": -81.40860748291016, "logps/ref_rejected": -64.03448486328125, "logps/rejected": -180.094970703125, "loss": 1.1033, "margin_dpo/margin_mean": 21.346853256225586, "margin_dpo/margin_std": 100.63861083984375, "step": 213 }, { "epoch": 0.3235071806500378, "grad_norm": 18.610511779785156, "learning_rate": 4.290985500881143e-07, "logits/chosen": 0.7390300035476685, "logits/rejected": 0.8046228885650635, "logps/chosen": -145.5377655029297, "logps/ref_chosen": -63.65519714355469, "logps/ref_rejected": -61.175392150878906, "logps/rejected": -205.8516845703125, "loss": 1.1949, "margin_dpo/margin_mean": 62.793731689453125, "margin_dpo/margin_std": 71.61257934570312, "step": 214 }, { "epoch": 0.3250188964474679, "grad_norm": 15.646477699279785, "learning_rate": 4.281735428447157e-07, "logits/chosen": 0.8204779624938965, "logits/rejected": 0.7469308376312256, "logps/chosen": -167.19033813476562, "logps/ref_chosen": -59.09471893310547, "logps/ref_rejected": -93.95791625976562, "logps/rejected": -225.2965087890625, "loss": 1.0742, "margin_dpo/margin_mean": 23.242979049682617, "margin_dpo/margin_std": 72.47611999511719, "step": 215 }, { "epoch": 0.32653061224489793, "grad_norm": 25.667827606201172, "learning_rate": 4.2724355170431247e-07, "logits/chosen": 0.8592058420181274, "logits/rejected": 0.8133991360664368, "logps/chosen": -183.30780029296875, "logps/ref_chosen": -88.48361206054688, "logps/ref_rejected": -95.00409698486328, "logps/rejected": -257.6964111328125, "loss": 0.9971, "margin_dpo/margin_mean": 67.86811065673828, "margin_dpo/margin_std": 59.22956466674805, "step": 216 }, { "epoch": 0.328042328042328, "grad_norm": 13.31557559967041, "learning_rate": 4.26308602680756e-07, "logits/chosen": 0.9996564388275146, "logits/rejected": 0.8627007007598877, "logps/chosen": -174.81881713867188, "logps/ref_chosen": -62.363914489746094, "logps/ref_rejected": -104.9856948852539, "logps/rejected": -284.2158203125, "loss": 1.0242, "margin_dpo/margin_mean": 66.77523803710938, "margin_dpo/margin_std": 126.6890869140625, "step": 217 }, { "epoch": 0.3295540438397581, "grad_norm": 18.151487350463867, "learning_rate": 4.253687219265803e-07, "logits/chosen": 0.7707165479660034, "logits/rejected": 0.7527076005935669, "logps/chosen": -201.19351196289062, "logps/ref_chosen": -87.87580871582031, "logps/ref_rejected": -102.97655487060547, "logps/rejected": -231.63278198242188, "loss": 1.2726, "margin_dpo/margin_mean": 15.338525772094727, "margin_dpo/margin_std": 67.2188720703125, "step": 218 }, { "epoch": 0.3310657596371882, "grad_norm": 14.18816089630127, "learning_rate": 4.2442393573227043e-07, "logits/chosen": 0.9285779595375061, "logits/rejected": 0.9085969924926758, "logps/chosen": -155.8358154296875, "logps/ref_chosen": -58.567657470703125, "logps/ref_rejected": -94.29285430908203, "logps/rejected": -231.346923828125, "loss": 1.1618, "margin_dpo/margin_mean": 39.78589630126953, "margin_dpo/margin_std": 81.78353881835938, "step": 219 }, { "epoch": 0.3325774754346183, "grad_norm": 13.471064567565918, "learning_rate": 4.234742705255272e-07, "logits/chosen": 0.8746180534362793, "logits/rejected": 0.7802847623825073, "logps/chosen": -130.3991241455078, "logps/ref_chosen": -45.8538703918457, "logps/ref_rejected": -76.9227066040039, "logps/rejected": -201.6156005859375, "loss": 1.0699, "margin_dpo/margin_mean": 40.14763641357422, "margin_dpo/margin_std": 70.33467102050781, "step": 220 }, { "epoch": 0.3340891912320484, "grad_norm": 13.25949478149414, "learning_rate": 4.22519752870528e-07, "logits/chosen": 0.8579016923904419, "logits/rejected": 0.7474366426467896, "logps/chosen": -150.62062072753906, "logps/ref_chosen": -55.79085922241211, "logps/ref_rejected": -81.47898864746094, "logps/rejected": -204.39466857910156, "loss": 1.0478, "margin_dpo/margin_mean": 28.085922241210938, "margin_dpo/margin_std": 60.90304946899414, "step": 221 }, { "epoch": 0.3356009070294785, "grad_norm": 11.687209129333496, "learning_rate": 4.2156040946718343e-07, "logits/chosen": 0.9172873497009277, "logits/rejected": 0.8860268592834473, "logps/chosen": -135.50936889648438, "logps/ref_chosen": -51.75988006591797, "logps/ref_rejected": -97.38671875, "logps/rejected": -218.99237060546875, "loss": 0.9846, "margin_dpo/margin_mean": 37.856170654296875, "margin_dpo/margin_std": 90.37283325195312, "step": 222 }, { "epoch": 0.3371126228269085, "grad_norm": 12.334152221679688, "learning_rate": 4.2059626715039065e-07, "logits/chosen": 0.8898248672485352, "logits/rejected": 0.8581580519676208, "logps/chosen": -131.7398681640625, "logps/ref_chosen": -60.334190368652344, "logps/ref_rejected": -67.8521728515625, "logps/rejected": -187.02847290039062, "loss": 1.0231, "margin_dpo/margin_mean": 47.77062225341797, "margin_dpo/margin_std": 67.18444061279297, "step": 223 }, { "epoch": 0.3386243386243386, "grad_norm": 13.227733612060547, "learning_rate": 4.1962735288928304e-07, "logits/chosen": 1.0880929231643677, "logits/rejected": 1.0430279970169067, "logps/chosen": -98.79350280761719, "logps/ref_chosen": -36.07902908325195, "logps/ref_rejected": -63.69470977783203, "logps/rejected": -191.50698852539062, "loss": 1.3037, "margin_dpo/margin_mean": 65.09780883789062, "margin_dpo/margin_std": 61.339786529541016, "step": 224 }, { "epoch": 0.3401360544217687, "grad_norm": 13.988600730895996, "learning_rate": 4.186536937864752e-07, "logits/chosen": 0.8999257683753967, "logits/rejected": 0.7167999744415283, "logps/chosen": -125.11752319335938, "logps/ref_chosen": -46.84956359863281, "logps/ref_rejected": -112.8951416015625, "logps/rejected": -255.221435546875, "loss": 1.0097, "margin_dpo/margin_mean": 64.05833435058594, "margin_dpo/margin_std": 75.08706665039062, "step": 225 }, { "epoch": 0.3416477702191988, "grad_norm": 11.088345527648926, "learning_rate": 4.176753170773052e-07, "logits/chosen": 0.9160268306732178, "logits/rejected": 0.8631049990653992, "logps/chosen": -130.91664123535156, "logps/ref_chosen": -44.67559814453125, "logps/ref_rejected": -63.792022705078125, "logps/rejected": -193.21331787109375, "loss": 1.1595, "margin_dpo/margin_mean": 43.18025207519531, "margin_dpo/margin_std": 66.46514892578125, "step": 226 }, { "epoch": 0.3431594860166289, "grad_norm": 13.128989219665527, "learning_rate": 4.166922501290729e-07, "logits/chosen": 0.8024705648422241, "logits/rejected": 0.7597426772117615, "logps/chosen": -116.00537109375, "logps/ref_chosen": -47.16712188720703, "logps/ref_rejected": -63.74501419067383, "logps/rejected": -188.3765106201172, "loss": 1.1549, "margin_dpo/margin_mean": 55.79324722290039, "margin_dpo/margin_std": 75.62820434570312, "step": 227 }, { "epoch": 0.34467120181405897, "grad_norm": 18.30823516845703, "learning_rate": 4.1570452044027405e-07, "logits/chosen": 0.9423525333404541, "logits/rejected": 0.9120630621910095, "logps/chosen": -164.05587768554688, "logps/ref_chosen": -63.36100387573242, "logps/ref_rejected": -80.64863586425781, "logps/rejected": -217.6392364501953, "loss": 1.0638, "margin_dpo/margin_mean": 36.29573059082031, "margin_dpo/margin_std": 62.555362701416016, "step": 228 }, { "epoch": 0.34618291761148906, "grad_norm": 15.304150581359863, "learning_rate": 4.147121556398312e-07, "logits/chosen": 1.0203680992126465, "logits/rejected": 0.9438357353210449, "logps/chosen": -110.92491149902344, "logps/ref_chosen": -46.663169860839844, "logps/ref_rejected": -88.81582641601562, "logps/rejected": -221.1804962158203, "loss": 1.1233, "margin_dpo/margin_mean": 68.10292053222656, "margin_dpo/margin_std": 46.62117004394531, "step": 229 }, { "epoch": 0.3476946334089191, "grad_norm": 17.100791931152344, "learning_rate": 4.137151834863213e-07, "logits/chosen": 0.8101639151573181, "logits/rejected": 0.9272513389587402, "logps/chosen": -166.52886962890625, "logps/ref_chosen": -74.076171875, "logps/ref_rejected": -60.03583526611328, "logps/rejected": -188.06703186035156, "loss": 1.1756, "margin_dpo/margin_mean": 35.5784912109375, "margin_dpo/margin_std": 85.3873519897461, "step": 230 }, { "epoch": 0.3492063492063492, "grad_norm": 15.612849235534668, "learning_rate": 4.1271363186719835e-07, "logits/chosen": 0.7588058710098267, "logits/rejected": 0.6893700957298279, "logps/chosen": -156.947998046875, "logps/ref_chosen": -60.65083312988281, "logps/ref_rejected": -88.65960693359375, "logps/rejected": -248.28106689453125, "loss": 0.9731, "margin_dpo/margin_mean": 63.32429504394531, "margin_dpo/margin_std": 63.744873046875, "step": 231 }, { "epoch": 0.3507180650037793, "grad_norm": 16.46584701538086, "learning_rate": 4.1170752879801436e-07, "logits/chosen": 0.8945074081420898, "logits/rejected": 0.9332794547080994, "logps/chosen": -147.59791564941406, "logps/ref_chosen": -69.09077453613281, "logps/ref_rejected": -57.635154724121094, "logps/rejected": -174.9073486328125, "loss": 1.1969, "margin_dpo/margin_mean": 38.76505661010742, "margin_dpo/margin_std": 74.07946014404297, "step": 232 }, { "epoch": 0.35222978080120937, "grad_norm": 17.694772720336914, "learning_rate": 4.106969024216348e-07, "logits/chosen": 1.0592719316482544, "logits/rejected": 0.9124851226806641, "logps/chosen": -154.83291625976562, "logps/ref_chosen": -47.50149917602539, "logps/ref_rejected": -65.11629486083984, "logps/rejected": -212.47425842285156, "loss": 1.1982, "margin_dpo/margin_mean": 40.02655792236328, "margin_dpo/margin_std": 79.98373413085938, "step": 233 }, { "epoch": 0.35374149659863946, "grad_norm": 15.55416488647461, "learning_rate": 4.09681781007452e-07, "logits/chosen": 0.6289401650428772, "logits/rejected": 0.6006139516830444, "logps/chosen": -184.160888671875, "logps/ref_chosen": -71.29449462890625, "logps/ref_rejected": -75.5903091430664, "logps/rejected": -222.56118774414062, "loss": 1.1917, "margin_dpo/margin_mean": 34.104488372802734, "margin_dpo/margin_std": 52.7165412902832, "step": 234 }, { "epoch": 0.35525321239606955, "grad_norm": 12.542901039123535, "learning_rate": 4.08662192950594e-07, "logits/chosen": 0.8517894744873047, "logits/rejected": 0.8705936670303345, "logps/chosen": -171.2367401123047, "logps/ref_chosen": -85.16849517822266, "logps/ref_rejected": -84.92215728759766, "logps/rejected": -236.8095703125, "loss": 1.0258, "margin_dpo/margin_mean": 65.81916809082031, "margin_dpo/margin_std": 52.77665710449219, "step": 235 }, { "epoch": 0.35676492819349964, "grad_norm": 14.79189682006836, "learning_rate": 4.076381667711306e-07, "logits/chosen": 0.9688406586647034, "logits/rejected": 0.9256927967071533, "logps/chosen": -192.56378173828125, "logps/ref_chosen": -79.2076416015625, "logps/ref_rejected": -100.40895080566406, "logps/rejected": -259.85693359375, "loss": 1.1711, "margin_dpo/margin_mean": 46.09184265136719, "margin_dpo/margin_std": 55.658546447753906, "step": 236 }, { "epoch": 0.35827664399092973, "grad_norm": 18.867671966552734, "learning_rate": 4.066097311132753e-07, "logits/chosen": 0.8245556354522705, "logits/rejected": 0.794967770576477, "logps/chosen": -178.4556121826172, "logps/ref_chosen": -75.55789184570312, "logps/ref_rejected": -86.2087631225586, "logps/rejected": -248.25030517578125, "loss": 1.0857, "margin_dpo/margin_mean": 59.14380645751953, "margin_dpo/margin_std": 55.71824645996094, "step": 237 }, { "epoch": 0.35978835978835977, "grad_norm": 16.57659339904785, "learning_rate": 4.0557691474458414e-07, "logits/chosen": 0.8863713145256042, "logits/rejected": 0.8904386758804321, "logps/chosen": -154.5315399169922, "logps/ref_chosen": -59.983848571777344, "logps/ref_rejected": -59.57722854614258, "logps/rejected": -193.907958984375, "loss": 1.1075, "margin_dpo/margin_mean": 39.78303527832031, "margin_dpo/margin_std": 73.26469421386719, "step": 238 }, { "epoch": 0.36130007558578986, "grad_norm": 21.753231048583984, "learning_rate": 4.045397465551513e-07, "logits/chosen": 0.9004707336425781, "logits/rejected": 0.8397661447525024, "logps/chosen": -204.30355834960938, "logps/ref_chosen": -71.07061767578125, "logps/ref_rejected": -99.01707458496094, "logps/rejected": -268.3192443847656, "loss": 1.0612, "margin_dpo/margin_mean": 36.0692138671875, "margin_dpo/margin_std": 65.23834228515625, "step": 239 }, { "epoch": 0.36281179138321995, "grad_norm": 14.38486099243164, "learning_rate": 4.0349825555680045e-07, "logits/chosen": 0.9007062911987305, "logits/rejected": 0.8683948516845703, "logps/chosen": -195.65853881835938, "logps/ref_chosen": -62.13431930541992, "logps/ref_rejected": -94.44198608398438, "logps/rejected": -284.7762451171875, "loss": 1.0184, "margin_dpo/margin_mean": 56.8100471496582, "margin_dpo/margin_std": 98.01765441894531, "step": 240 }, { "epoch": 0.36432350718065004, "grad_norm": 13.766716003417969, "learning_rate": 4.0245247088227377e-07, "logits/chosen": 0.8529886603355408, "logits/rejected": 0.8130964040756226, "logps/chosen": -183.49896240234375, "logps/ref_chosen": -64.64911651611328, "logps/ref_rejected": -76.52389526367188, "logps/rejected": -236.65924072265625, "loss": 1.1746, "margin_dpo/margin_mean": 41.285484313964844, "margin_dpo/margin_std": 58.063209533691406, "step": 241 }, { "epoch": 0.36583522297808013, "grad_norm": 12.187637329101562, "learning_rate": 4.0140242178441665e-07, "logits/chosen": 0.8713346719741821, "logits/rejected": 0.8219842910766602, "logps/chosen": -147.36550903320312, "logps/ref_chosen": -50.54002380371094, "logps/ref_rejected": -72.02078247070312, "logps/rejected": -221.0113067626953, "loss": 1.0658, "margin_dpo/margin_mean": 52.1650505065918, "margin_dpo/margin_std": 68.18669891357422, "step": 242 }, { "epoch": 0.3673469387755102, "grad_norm": 15.1566801071167, "learning_rate": 4.003481376353596e-07, "logits/chosen": 0.9196364879608154, "logits/rejected": 0.9491422176361084, "logps/chosen": -203.58029174804688, "logps/ref_chosen": -89.10255432128906, "logps/ref_rejected": -77.90412902832031, "logps/rejected": -224.44149780273438, "loss": 1.1345, "margin_dpo/margin_mean": 32.059627532958984, "margin_dpo/margin_std": 67.19066619873047, "step": 243 }, { "epoch": 0.3688586545729403, "grad_norm": 16.03683090209961, "learning_rate": 3.9928964792569654e-07, "logits/chosen": 0.8897314071655273, "logits/rejected": 0.8393882513046265, "logps/chosen": -187.6811981201172, "logps/ref_chosen": -66.64555358886719, "logps/ref_rejected": -92.27547454833984, "logps/rejected": -261.2939453125, "loss": 0.9875, "margin_dpo/margin_mean": 47.982826232910156, "margin_dpo/margin_std": 67.17561340332031, "step": 244 }, { "epoch": 0.37037037037037035, "grad_norm": 12.031768798828125, "learning_rate": 3.982269822636601e-07, "logits/chosen": 0.9159705638885498, "logits/rejected": 0.9142988324165344, "logps/chosen": -185.25064086914062, "logps/ref_chosen": -79.13615417480469, "logps/ref_rejected": -79.21699523925781, "logps/rejected": -262.3095397949219, "loss": 0.9243, "margin_dpo/margin_mean": 76.97804260253906, "margin_dpo/margin_std": 56.90997314453125, "step": 245 }, { "epoch": 0.37188208616780044, "grad_norm": 19.480737686157227, "learning_rate": 3.971601703742932e-07, "logits/chosen": 0.9996259212493896, "logits/rejected": 1.0093920230865479, "logps/chosen": -149.17922973632812, "logps/ref_chosen": -57.22200012207031, "logps/ref_rejected": -77.80888366699219, "logps/rejected": -234.3947296142578, "loss": 1.1167, "margin_dpo/margin_mean": 64.62861633300781, "margin_dpo/margin_std": 74.76765441894531, "step": 246 }, { "epoch": 0.37339380196523053, "grad_norm": 19.866626739501953, "learning_rate": 3.960892420986177e-07, "logits/chosen": 0.7821018695831299, "logits/rejected": 0.7258821725845337, "logps/chosen": -219.67767333984375, "logps/ref_chosen": -94.68956756591797, "logps/ref_rejected": -122.22605895996094, "logps/rejected": -318.41412353515625, "loss": 1.257, "margin_dpo/margin_mean": 71.19993591308594, "margin_dpo/margin_std": 81.91409301757812, "step": 247 }, { "epoch": 0.3749055177626606, "grad_norm": 22.328622817993164, "learning_rate": 3.9501422739279953e-07, "logits/chosen": 0.7986111640930176, "logits/rejected": 0.8373353481292725, "logps/chosen": -175.7537384033203, "logps/ref_chosen": -60.833953857421875, "logps/ref_rejected": -60.90985107421875, "logps/rejected": -218.660888671875, "loss": 1.1383, "margin_dpo/margin_mean": 42.83125305175781, "margin_dpo/margin_std": 89.69184112548828, "step": 248 }, { "epoch": 0.3764172335600907, "grad_norm": 19.794172286987305, "learning_rate": 3.9393515632731094e-07, "logits/chosen": 1.0464762449264526, "logits/rejected": 1.056666374206543, "logps/chosen": -196.64761352539062, "logps/ref_chosen": -74.77812194824219, "logps/ref_rejected": -75.08592987060547, "logps/rejected": -216.56761169433594, "loss": 1.4198, "margin_dpo/margin_mean": 19.612201690673828, "margin_dpo/margin_std": 99.21858215332031, "step": 249 }, { "epoch": 0.3779289493575208, "grad_norm": 19.19304847717285, "learning_rate": 3.9285205908608934e-07, "logits/chosen": 1.015788197517395, "logits/rejected": 0.939686119556427, "logps/chosen": -132.5025634765625, "logps/ref_chosen": -47.93787384033203, "logps/ref_rejected": -61.55204772949219, "logps/rejected": -208.47003173828125, "loss": 1.0567, "margin_dpo/margin_mean": 62.353302001953125, "margin_dpo/margin_std": 72.25045776367188, "step": 250 }, { "epoch": 0.3794406651549509, "grad_norm": 18.333349227905273, "learning_rate": 3.9176496596569265e-07, "logits/chosen": 0.7942206263542175, "logits/rejected": 0.8282942771911621, "logps/chosen": -186.79269409179688, "logps/ref_chosen": -77.54549407958984, "logps/ref_rejected": -68.58549499511719, "logps/rejected": -215.02297973632812, "loss": 1.1574, "margin_dpo/margin_mean": 37.190284729003906, "margin_dpo/margin_std": 79.62153625488281, "step": 251 }, { "epoch": 0.38095238095238093, "grad_norm": 26.73542594909668, "learning_rate": 3.9067390737445254e-07, "logits/chosen": 0.9108133316040039, "logits/rejected": 0.8635146021842957, "logps/chosen": -196.31478881835938, "logps/ref_chosen": -73.16785430908203, "logps/ref_rejected": -70.33341979980469, "logps/rejected": -239.6460723876953, "loss": 1.2713, "margin_dpo/margin_mean": 46.16571044921875, "margin_dpo/margin_std": 107.81159973144531, "step": 252 }, { "epoch": 0.382464096749811, "grad_norm": 21.395902633666992, "learning_rate": 3.8957891383162304e-07, "logits/chosen": 0.9631872177124023, "logits/rejected": 0.9409669637680054, "logps/chosen": -125.77433776855469, "logps/ref_chosen": -33.16981506347656, "logps/ref_rejected": -42.722469329833984, "logps/rejected": -179.7202911376953, "loss": 1.1379, "margin_dpo/margin_mean": 44.39329528808594, "margin_dpo/margin_std": 63.10321044921875, "step": 253 }, { "epoch": 0.3839758125472411, "grad_norm": 13.39516544342041, "learning_rate": 3.884800159665276e-07, "logits/chosen": 0.9127005934715271, "logits/rejected": 0.8767493367195129, "logps/chosen": -138.35330200195312, "logps/ref_chosen": -50.45591735839844, "logps/ref_rejected": -71.39493560791016, "logps/rejected": -223.3489532470703, "loss": 1.1077, "margin_dpo/margin_mean": 64.056640625, "margin_dpo/margin_std": 61.15374755859375, "step": 254 }, { "epoch": 0.3854875283446712, "grad_norm": 21.80673599243164, "learning_rate": 3.873772445177015e-07, "logits/chosen": 0.929595410823822, "logits/rejected": 0.8490023612976074, "logps/chosen": -134.93621826171875, "logps/ref_chosen": -57.36243438720703, "logps/ref_rejected": -83.85030364990234, "logps/rejected": -219.91397094726562, "loss": 1.0894, "margin_dpo/margin_mean": 58.489891052246094, "margin_dpo/margin_std": 79.15504455566406, "step": 255 }, { "epoch": 0.3869992441421013, "grad_norm": 17.780683517456055, "learning_rate": 3.862706303320329e-07, "logits/chosen": 0.8780696988105774, "logits/rejected": 0.8686283230781555, "logps/chosen": -191.927734375, "logps/ref_chosen": -72.70278930664062, "logps/ref_rejected": -87.58718872070312, "logps/rejected": -250.40631103515625, "loss": 1.1458, "margin_dpo/margin_mean": 43.59416580200195, "margin_dpo/margin_std": 86.399169921875, "step": 256 }, { "epoch": 0.3885109599395314, "grad_norm": 13.700689315795898, "learning_rate": 3.851602043638994e-07, "logits/chosen": 0.9361928701400757, "logits/rejected": 0.8859877586364746, "logps/chosen": -137.43472290039062, "logps/ref_chosen": -56.16447448730469, "logps/ref_rejected": -79.54229736328125, "logps/rejected": -244.7681427001953, "loss": 0.9962, "margin_dpo/margin_mean": 83.9555892944336, "margin_dpo/margin_std": 90.04832458496094, "step": 257 }, { "epoch": 0.3900226757369615, "grad_norm": 15.653840065002441, "learning_rate": 3.840459976743023e-07, "logits/chosen": 0.857273280620575, "logits/rejected": 0.8150414228439331, "logps/chosen": -171.31089782714844, "logps/ref_chosen": -68.2958755493164, "logps/ref_rejected": -88.37379455566406, "logps/rejected": -259.4873046875, "loss": 1.0813, "margin_dpo/margin_mean": 68.0985107421875, "margin_dpo/margin_std": 73.86476135253906, "step": 258 }, { "epoch": 0.3915343915343915, "grad_norm": 16.806671142578125, "learning_rate": 3.8292804142999796e-07, "logits/chosen": 0.8986009955406189, "logits/rejected": 0.8750613927841187, "logps/chosen": -197.89768981933594, "logps/ref_chosen": -86.00422668457031, "logps/ref_rejected": -106.7618408203125, "logps/rejected": -272.7347106933594, "loss": 0.9853, "margin_dpo/margin_mean": 54.07939910888672, "margin_dpo/margin_std": 119.56117248535156, "step": 259 }, { "epoch": 0.3930461073318216, "grad_norm": 22.537059783935547, "learning_rate": 3.818063669026256e-07, "logits/chosen": 0.9305140972137451, "logits/rejected": 0.8760488629341125, "logps/chosen": -117.2443618774414, "logps/ref_chosen": -53.69929504394531, "logps/ref_rejected": -80.42738342285156, "logps/rejected": -225.81988525390625, "loss": 1.0383, "margin_dpo/margin_mean": 81.84744262695312, "margin_dpo/margin_std": 77.02603149414062, "step": 260 }, { "epoch": 0.3945578231292517, "grad_norm": 16.964977264404297, "learning_rate": 3.806810054678331e-07, "logits/chosen": 0.8644047975540161, "logits/rejected": 1.017871379852295, "logps/chosen": -177.18267822265625, "logps/ref_chosen": -87.48385620117188, "logps/ref_rejected": -45.14533233642578, "logps/rejected": -152.22369384765625, "loss": 1.1648, "margin_dpo/margin_mean": 17.37952995300293, "margin_dpo/margin_std": 70.14508056640625, "step": 261 }, { "epoch": 0.3960695389266818, "grad_norm": 15.852773666381836, "learning_rate": 3.7955198860439887e-07, "logits/chosen": 1.01748526096344, "logits/rejected": 0.9842813014984131, "logps/chosen": -153.70370483398438, "logps/ref_chosen": -52.95308303833008, "logps/ref_rejected": -64.26335906982422, "logps/rejected": -199.62222290039062, "loss": 1.0934, "margin_dpo/margin_mean": 34.60824203491211, "margin_dpo/margin_std": 63.7818489074707, "step": 262 }, { "epoch": 0.3975812547241119, "grad_norm": 18.748830795288086, "learning_rate": 3.784193478933516e-07, "logits/chosen": 0.9187393188476562, "logits/rejected": 0.8768531084060669, "logps/chosen": -145.96719360351562, "logps/ref_chosen": -59.93905258178711, "logps/ref_rejected": -70.00927734375, "logps/rejected": -191.09591674804688, "loss": 1.0738, "margin_dpo/margin_mean": 35.05849838256836, "margin_dpo/margin_std": 68.49369812011719, "step": 263 }, { "epoch": 0.39909297052154197, "grad_norm": 16.476381301879883, "learning_rate": 3.7728311501708674e-07, "logits/chosen": 0.8161172866821289, "logits/rejected": 0.7861305475234985, "logps/chosen": -201.89556884765625, "logps/ref_chosen": -77.3072509765625, "logps/ref_rejected": -83.79466247558594, "logps/rejected": -217.04876708984375, "loss": 1.0982, "margin_dpo/margin_mean": 8.665780067443848, "margin_dpo/margin_std": 75.14938354492188, "step": 264 }, { "epoch": 0.40060468631897206, "grad_norm": 14.398598670959473, "learning_rate": 3.7614332175848027e-07, "logits/chosen": 0.8012948036193848, "logits/rejected": 0.7619481086730957, "logps/chosen": -183.10635375976562, "logps/ref_chosen": -62.340206146240234, "logps/ref_rejected": -57.182029724121094, "logps/rejected": -195.21893310546875, "loss": 1.0655, "margin_dpo/margin_mean": 17.27075958251953, "margin_dpo/margin_std": 70.34654235839844, "step": 265 }, { "epoch": 0.4021164021164021, "grad_norm": 19.04807472229004, "learning_rate": 3.75e-07, "logits/chosen": 0.9468996524810791, "logits/rejected": 0.8705604076385498, "logps/chosen": -128.9197235107422, "logps/ref_chosen": -47.8638916015625, "logps/ref_rejected": -65.85595703125, "logps/rejected": -201.5211181640625, "loss": 1.1209, "margin_dpo/margin_mean": 54.60932922363281, "margin_dpo/margin_std": 57.88256072998047, "step": 266 }, { "epoch": 0.4036281179138322, "grad_norm": 22.434011459350586, "learning_rate": 3.738531817228131e-07, "logits/chosen": 0.8454642295837402, "logits/rejected": 0.786316990852356, "logps/chosen": -106.54922485351562, "logps/ref_chosen": -43.666568756103516, "logps/ref_rejected": -68.1474380493164, "logps/rejected": -161.97854614257812, "loss": 1.2043, "margin_dpo/margin_mean": 30.94845199584961, "margin_dpo/margin_std": 81.37274169921875, "step": 267 }, { "epoch": 0.4051398337112623, "grad_norm": 18.147586822509766, "learning_rate": 3.7270289900589204e-07, "logits/chosen": 0.7394974231719971, "logits/rejected": 0.7207775115966797, "logps/chosen": -133.07296752929688, "logps/ref_chosen": -67.96279907226562, "logps/ref_rejected": -72.69281005859375, "logps/rejected": -186.4730224609375, "loss": 1.1032, "margin_dpo/margin_mean": 48.67003631591797, "margin_dpo/margin_std": 64.53730010986328, "step": 268 }, { "epoch": 0.40665154950869237, "grad_norm": 12.17220687866211, "learning_rate": 3.7154918402511714e-07, "logits/chosen": 0.8191482424736023, "logits/rejected": 0.8883626461029053, "logps/chosen": -182.84963989257812, "logps/ref_chosen": -80.52581787109375, "logps/ref_rejected": -65.83181762695312, "logps/rejected": -205.61041259765625, "loss": 1.0584, "margin_dpo/margin_mean": 37.45478820800781, "margin_dpo/margin_std": 63.13644027709961, "step": 269 }, { "epoch": 0.40816326530612246, "grad_norm": 18.165111541748047, "learning_rate": 3.7039206905237656e-07, "logits/chosen": 0.8216699957847595, "logits/rejected": 0.8250892758369446, "logps/chosen": -129.14108276367188, "logps/ref_chosen": -64.29264831542969, "logps/ref_rejected": -65.4633560180664, "logps/rejected": -201.03909301757812, "loss": 1.0999, "margin_dpo/margin_mean": 70.72732543945312, "margin_dpo/margin_std": 76.4561767578125, "step": 270 }, { "epoch": 0.40967498110355255, "grad_norm": 21.99321174621582, "learning_rate": 3.692315864546635e-07, "logits/chosen": 0.9689754247665405, "logits/rejected": 0.8169035911560059, "logps/chosen": -90.29707336425781, "logps/ref_chosen": -39.26963424682617, "logps/ref_rejected": -87.15721130371094, "logps/rejected": -214.21963500976562, "loss": 1.2105, "margin_dpo/margin_mean": 76.03497314453125, "margin_dpo/margin_std": 88.13627624511719, "step": 271 }, { "epoch": 0.41118669690098264, "grad_norm": 12.809111595153809, "learning_rate": 3.6806776869317067e-07, "logits/chosen": 0.9838389158248901, "logits/rejected": 1.008858323097229, "logps/chosen": -140.18511962890625, "logps/ref_chosen": -55.88648223876953, "logps/ref_rejected": -57.95124816894531, "logps/rejected": -215.2462158203125, "loss": 0.9634, "margin_dpo/margin_mean": 72.996337890625, "margin_dpo/margin_std": 73.1607666015625, "step": 272 }, { "epoch": 0.4126984126984127, "grad_norm": 17.283329010009766, "learning_rate": 3.669006483223828e-07, "logits/chosen": 0.9234431982040405, "logits/rejected": 0.8033227324485779, "logps/chosen": -175.26605224609375, "logps/ref_chosen": -59.144004821777344, "logps/ref_rejected": -123.4438247680664, "logps/rejected": -278.2921447753906, "loss": 1.14, "margin_dpo/margin_mean": 38.72627258300781, "margin_dpo/margin_std": 74.85566711425781, "step": 273 }, { "epoch": 0.41421012849584277, "grad_norm": 18.224842071533203, "learning_rate": 3.657302579891656e-07, "logits/chosen": 0.7041195631027222, "logits/rejected": 0.8130519986152649, "logps/chosen": -175.5207977294922, "logps/ref_chosen": -77.74801635742188, "logps/ref_rejected": -63.99616241455078, "logps/rejected": -199.80519104003906, "loss": 1.1015, "margin_dpo/margin_mean": 38.036251068115234, "margin_dpo/margin_std": 89.42131042480469, "step": 274 }, { "epoch": 0.41572184429327286, "grad_norm": 21.59168815612793, "learning_rate": 3.645566304318526e-07, "logits/chosen": 0.9048452377319336, "logits/rejected": 0.8862916231155396, "logps/chosen": -169.61819458007812, "logps/ref_chosen": -56.68327331542969, "logps/ref_rejected": -65.53984832763672, "logps/rejected": -225.29954528808594, "loss": 1.0305, "margin_dpo/margin_mean": 46.82477569580078, "margin_dpo/margin_std": 65.18940734863281, "step": 275 }, { "epoch": 0.41723356009070295, "grad_norm": 20.635337829589844, "learning_rate": 3.633797984793294e-07, "logits/chosen": 0.8774361610412598, "logits/rejected": 0.8986802101135254, "logps/chosen": -137.88626098632812, "logps/ref_chosen": -43.50504684448242, "logps/ref_rejected": -43.088871002197266, "logps/rejected": -173.4464111328125, "loss": 1.123, "margin_dpo/margin_mean": 35.97632598876953, "margin_dpo/margin_std": 63.101436614990234, "step": 276 }, { "epoch": 0.41874527588813304, "grad_norm": 18.922773361206055, "learning_rate": 3.6219979505011555e-07, "logits/chosen": 0.7700687646865845, "logits/rejected": 0.7433135509490967, "logps/chosen": -165.19195556640625, "logps/ref_chosen": -52.701934814453125, "logps/ref_rejected": -62.01823425292969, "logps/rejected": -202.63037109375, "loss": 1.3038, "margin_dpo/margin_mean": 28.122108459472656, "margin_dpo/margin_std": 74.96601867675781, "step": 277 }, { "epoch": 0.42025699168556313, "grad_norm": 24.745040893554688, "learning_rate": 3.6101665315144353e-07, "logits/chosen": 0.9205065965652466, "logits/rejected": 0.8737306594848633, "logps/chosen": -186.35955810546875, "logps/ref_chosen": -66.36759948730469, "logps/ref_rejected": -71.12834930419922, "logps/rejected": -234.69064331054688, "loss": 1.0695, "margin_dpo/margin_mean": 43.57034683227539, "margin_dpo/margin_std": 91.35447692871094, "step": 278 }, { "epoch": 0.4217687074829932, "grad_norm": 19.72016143798828, "learning_rate": 3.5983040587833563e-07, "logits/chosen": 0.8044095039367676, "logits/rejected": 0.7835577726364136, "logps/chosen": -125.09947967529297, "logps/ref_chosen": -57.34808349609375, "logps/ref_rejected": -60.212989807128906, "logps/rejected": -194.96324157714844, "loss": 0.942, "margin_dpo/margin_mean": 66.99885559082031, "margin_dpo/margin_std": 86.55064392089844, "step": 279 }, { "epoch": 0.42328042328042326, "grad_norm": 13.24905014038086, "learning_rate": 3.586410864126781e-07, "logits/chosen": 0.8917016983032227, "logits/rejected": 0.8699558973312378, "logps/chosen": -111.32423400878906, "logps/ref_chosen": -58.24922561645508, "logps/ref_rejected": -59.01625061035156, "logps/rejected": -200.49822998046875, "loss": 0.8917, "margin_dpo/margin_mean": 88.40696716308594, "margin_dpo/margin_std": 55.23664474487305, "step": 280 }, { "epoch": 0.42479213907785335, "grad_norm": 14.725621223449707, "learning_rate": 3.574487280222929e-07, "logits/chosen": 0.8508050441741943, "logits/rejected": 0.8704952597618103, "logps/chosen": -189.38882446289062, "logps/ref_chosen": -78.43274688720703, "logps/ref_rejected": -91.09056091308594, "logps/rejected": -267.9164733886719, "loss": 1.081, "margin_dpo/margin_mean": 65.86985778808594, "margin_dpo/margin_std": 70.21514892578125, "step": 281 }, { "epoch": 0.42630385487528344, "grad_norm": 16.900634765625, "learning_rate": 3.562533640600075e-07, "logits/chosen": 0.8684970140457153, "logits/rejected": 0.7901803255081177, "logps/chosen": -150.54010009765625, "logps/ref_chosen": -53.83773422241211, "logps/ref_rejected": -75.1729507446289, "logps/rejected": -234.26873779296875, "loss": 1.1197, "margin_dpo/margin_mean": 62.39341354370117, "margin_dpo/margin_std": 64.95208740234375, "step": 282 }, { "epoch": 0.42781557067271353, "grad_norm": 15.944957733154297, "learning_rate": 3.550550279627215e-07, "logits/chosen": 0.8715301156044006, "logits/rejected": 0.8303920030593872, "logps/chosen": -142.8111572265625, "logps/ref_chosen": -53.5611457824707, "logps/ref_rejected": -73.18958282470703, "logps/rejected": -201.0227813720703, "loss": 1.0702, "margin_dpo/margin_mean": 38.58319091796875, "margin_dpo/margin_std": 71.02609252929688, "step": 283 }, { "epoch": 0.4293272864701436, "grad_norm": 19.076061248779297, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 0.9148997068405151, "logits/rejected": 0.8821941614151001, "logps/chosen": -152.02041625976562, "logps/ref_chosen": -55.81263732910156, "logps/ref_rejected": -90.23190307617188, "logps/rejected": -226.96029663085938, "loss": 1.0726, "margin_dpo/margin_mean": 40.52062225341797, "margin_dpo/margin_std": 77.91551971435547, "step": 284 }, { "epoch": 0.4308390022675737, "grad_norm": 24.047950744628906, "learning_rate": 3.5264957352549375e-07, "logits/chosen": 0.9095529913902283, "logits/rejected": 0.9729138016700745, "logps/chosen": -185.89060974121094, "logps/ref_chosen": -71.53235626220703, "logps/ref_rejected": -46.31084060668945, "logps/rejected": -196.288818359375, "loss": 1.1315, "margin_dpo/margin_mean": 35.61972427368164, "margin_dpo/margin_std": 69.28761291503906, "step": 285 }, { "epoch": 0.4323507180650038, "grad_norm": 16.382957458496094, "learning_rate": 3.514425224712835e-07, "logits/chosen": 0.9263152480125427, "logits/rejected": 0.9703081846237183, "logps/chosen": -178.46542358398438, "logps/ref_chosen": -56.29132080078125, "logps/ref_rejected": -54.583534240722656, "logps/rejected": -236.7250518798828, "loss": 0.9443, "margin_dpo/margin_mean": 59.96741485595703, "margin_dpo/margin_std": 58.34821319580078, "step": 286 }, { "epoch": 0.43386243386243384, "grad_norm": 18.149490356445312, "learning_rate": 3.502326338516534e-07, "logits/chosen": 0.9496700763702393, "logits/rejected": 0.798512876033783, "logps/chosen": -130.92161560058594, "logps/ref_chosen": -39.158355712890625, "logps/ref_rejected": -74.88023376464844, "logps/rejected": -248.06649780273438, "loss": 1.0283, "margin_dpo/margin_mean": 81.4229965209961, "margin_dpo/margin_std": 76.41511535644531, "step": 287 }, { "epoch": 0.43537414965986393, "grad_norm": 17.73630714416504, "learning_rate": 3.490199415097892e-07, "logits/chosen": 0.7672666311264038, "logits/rejected": 0.7976804971694946, "logps/chosen": -178.63949584960938, "logps/ref_chosen": -69.44332885742188, "logps/ref_rejected": -54.98228454589844, "logps/rejected": -200.60418701171875, "loss": 1.1142, "margin_dpo/margin_mean": 36.425743103027344, "margin_dpo/margin_std": 56.845184326171875, "step": 288 }, { "epoch": 0.436885865457294, "grad_norm": 16.403818130493164, "learning_rate": 3.4780447936730247e-07, "logits/chosen": 0.8484159111976624, "logits/rejected": 0.8917375802993774, "logps/chosen": -221.443359375, "logps/ref_chosen": -64.24480438232422, "logps/ref_rejected": -67.39839172363281, "logps/rejected": -246.82095336914062, "loss": 1.1333, "margin_dpo/margin_mean": 22.224018096923828, "margin_dpo/margin_std": 46.693260192871094, "step": 289 }, { "epoch": 0.4383975812547241, "grad_norm": 17.30687141418457, "learning_rate": 3.465862814232821e-07, "logits/chosen": 0.9498525261878967, "logits/rejected": 0.9050639867782593, "logps/chosen": -192.14419555664062, "logps/ref_chosen": -65.73394012451172, "logps/ref_rejected": -74.00738525390625, "logps/rejected": -251.295654296875, "loss": 1.0606, "margin_dpo/margin_mean": 50.87800598144531, "margin_dpo/margin_std": 90.56031799316406, "step": 290 }, { "epoch": 0.4399092970521542, "grad_norm": 19.28776741027832, "learning_rate": 3.4536538175334343e-07, "logits/chosen": 0.9647561311721802, "logits/rejected": 0.9699366092681885, "logps/chosen": -183.28567504882812, "logps/ref_chosen": -67.45711517333984, "logps/ref_rejected": -75.27851867675781, "logps/rejected": -262.3265075683594, "loss": 1.0179, "margin_dpo/margin_mean": 71.21941375732422, "margin_dpo/margin_std": 88.33670806884766, "step": 291 }, { "epoch": 0.4414210128495843, "grad_norm": 15.975472450256348, "learning_rate": 3.4414181450867465e-07, "logits/chosen": 0.9394838809967041, "logits/rejected": 0.9308391809463501, "logps/chosen": -167.02105712890625, "logps/ref_chosen": -58.86817932128906, "logps/ref_rejected": -64.91166687011719, "logps/rejected": -216.97520446777344, "loss": 1.1114, "margin_dpo/margin_mean": 43.910648345947266, "margin_dpo/margin_std": 96.6522216796875, "step": 292 }, { "epoch": 0.4429327286470144, "grad_norm": 14.782191276550293, "learning_rate": 3.4291561391508185e-07, "logits/chosen": 0.9456629753112793, "logits/rejected": 0.9435967803001404, "logps/chosen": -138.83485412597656, "logps/ref_chosen": -48.30006408691406, "logps/ref_rejected": -58.28700256347656, "logps/rejected": -214.57583618164062, "loss": 0.9865, "margin_dpo/margin_mean": 65.7540512084961, "margin_dpo/margin_std": 53.61219024658203, "step": 293 }, { "epoch": 0.4444444444444444, "grad_norm": 13.40251350402832, "learning_rate": 3.4168681427203153e-07, "logits/chosen": 0.8696566224098206, "logits/rejected": 0.7992103099822998, "logps/chosen": -201.7752685546875, "logps/ref_chosen": -73.32835388183594, "logps/ref_rejected": -100.44032287597656, "logps/rejected": -282.5135192871094, "loss": 1.0812, "margin_dpo/margin_mean": 53.62628936767578, "margin_dpo/margin_std": 67.26288604736328, "step": 294 }, { "epoch": 0.4459561602418745, "grad_norm": 20.00554656982422, "learning_rate": 3.4045544995169125e-07, "logits/chosen": 1.0336451530456543, "logits/rejected": 0.9124792814254761, "logps/chosen": -154.9568328857422, "logps/ref_chosen": -36.71764373779297, "logps/ref_rejected": -77.01786804199219, "logps/rejected": -266.3442687988281, "loss": 1.0952, "margin_dpo/margin_mean": 71.08719635009766, "margin_dpo/margin_std": 75.32640838623047, "step": 295 }, { "epoch": 0.4474678760393046, "grad_norm": 17.829896926879883, "learning_rate": 3.392215553979679e-07, "logits/chosen": 0.8829468488693237, "logits/rejected": 0.7915094494819641, "logps/chosen": -156.69998168945312, "logps/ref_chosen": -47.72833251953125, "logps/ref_rejected": -85.0424575805664, "logps/rejected": -267.91741943359375, "loss": 1.0734, "margin_dpo/margin_mean": 73.9033203125, "margin_dpo/margin_std": 85.67912292480469, "step": 296 }, { "epoch": 0.4489795918367347, "grad_norm": 13.065276145935059, "learning_rate": 3.3798516512554485e-07, "logits/chosen": 1.0190558433532715, "logits/rejected": 0.9399404525756836, "logps/chosen": -171.317626953125, "logps/ref_chosen": -45.453880310058594, "logps/ref_rejected": -60.58012390136719, "logps/rejected": -240.07229614257812, "loss": 0.9618, "margin_dpo/margin_mean": 53.62842559814453, "margin_dpo/margin_std": 68.6543197631836, "step": 297 }, { "epoch": 0.4504913076341648, "grad_norm": 14.449824333190918, "learning_rate": 3.367463137189156e-07, "logits/chosen": 1.0604900121688843, "logits/rejected": 1.045938491821289, "logps/chosen": -196.72889709472656, "logps/ref_chosen": -65.93342590332031, "logps/ref_rejected": -81.31886291503906, "logps/rejected": -258.4837341308594, "loss": 1.1329, "margin_dpo/margin_mean": 46.3693962097168, "margin_dpo/margin_std": 84.15235900878906, "step": 298 }, { "epoch": 0.4520030234315949, "grad_norm": 18.29178237915039, "learning_rate": 3.355050358314172e-07, "logits/chosen": 0.9493238925933838, "logits/rejected": 0.9166898727416992, "logps/chosen": -167.92984008789062, "logps/ref_chosen": -41.45861053466797, "logps/ref_rejected": -55.44845199584961, "logps/rejected": -203.72457885742188, "loss": 1.2108, "margin_dpo/margin_mean": 21.804901123046875, "margin_dpo/margin_std": 75.58480834960938, "step": 299 }, { "epoch": 0.45351473922902497, "grad_norm": 15.63215446472168, "learning_rate": 3.3426136618426043e-07, "logits/chosen": 0.95320063829422, "logits/rejected": 1.0048692226409912, "logps/chosen": -177.46788024902344, "logps/ref_chosen": -61.02867889404297, "logps/ref_rejected": -52.653968811035156, "logps/rejected": -235.43170166015625, "loss": 1.1015, "margin_dpo/margin_mean": 66.33853149414062, "margin_dpo/margin_std": 77.45952606201172, "step": 300 }, { "epoch": 0.45351473922902497, "eval_logits/chosen": 0.910781741142273, "eval_logits/rejected": 0.8756802678108215, "eval_logps/chosen": -200.87997436523438, "eval_logps/ref_chosen": -75.30646514892578, "eval_logps/ref_rejected": -77.75511932373047, "eval_logps/rejected": -251.94520568847656, "eval_loss": 0.557418942451477, "eval_margin_dpo/margin_mean": 48.61654281616211, "eval_margin_dpo/margin_std": 83.31050872802734, "eval_runtime": 37.4536, "eval_samples_per_second": 61.489, "eval_steps_per_second": 1.922, "step": 300 }, { "epoch": 0.455026455026455, "grad_norm": 14.323222160339355, "learning_rate": 3.3301533956555885e-07, "logits/chosen": 1.0346510410308838, "logits/rejected": 0.985994815826416, "logps/chosen": -206.44583129882812, "logps/ref_chosen": -49.377403259277344, "logps/ref_rejected": -72.45796966552734, "logps/rejected": -270.4298095703125, "loss": 1.1779, "margin_dpo/margin_mean": 40.90338897705078, "margin_dpo/margin_std": 61.039005279541016, "step": 301 }, { "epoch": 0.4565381708238851, "grad_norm": 18.329959869384766, "learning_rate": 3.317669908293554e-07, "logits/chosen": 0.8981478810310364, "logits/rejected": 0.8204025030136108, "logps/chosen": -166.45431518554688, "logps/ref_chosen": -42.884490966796875, "logps/ref_rejected": -72.04084777832031, "logps/rejected": -267.53070068359375, "loss": 1.2516, "margin_dpo/margin_mean": 71.92005920410156, "margin_dpo/margin_std": 84.18696594238281, "step": 302 }, { "epoch": 0.4580498866213152, "grad_norm": 13.802994728088379, "learning_rate": 3.3051635489464793e-07, "logits/chosen": 0.9232026934623718, "logits/rejected": 0.8720027804374695, "logps/chosen": -161.49925231933594, "logps/ref_chosen": -50.44966506958008, "logps/ref_rejected": -77.09220886230469, "logps/rejected": -234.8109893798828, "loss": 1.0738, "margin_dpo/margin_mean": 46.669189453125, "margin_dpo/margin_std": 89.64495849609375, "step": 303 }, { "epoch": 0.4595616024187453, "grad_norm": 13.76878833770752, "learning_rate": 3.292634667444117e-07, "logits/chosen": 1.0514378547668457, "logits/rejected": 0.9998120069503784, "logps/chosen": -169.84353637695312, "logps/ref_chosen": -54.7811279296875, "logps/ref_rejected": -74.56997680664062, "logps/rejected": -264.3577880859375, "loss": 0.9872, "margin_dpo/margin_mean": 74.72541809082031, "margin_dpo/margin_std": 65.64923858642578, "step": 304 }, { "epoch": 0.46107331821617537, "grad_norm": 18.287494659423828, "learning_rate": 3.280083614246217e-07, "logits/chosen": 0.8871467113494873, "logits/rejected": 0.917915940284729, "logps/chosen": -238.80645751953125, "logps/ref_chosen": -81.34001159667969, "logps/ref_rejected": -77.41258239746094, "logps/rejected": -272.7764892578125, "loss": 1.1496, "margin_dpo/margin_mean": 37.89744567871094, "margin_dpo/margin_std": 99.84722900390625, "step": 305 }, { "epoch": 0.46258503401360546, "grad_norm": 13.584040641784668, "learning_rate": 3.267510740432719e-07, "logits/chosen": 1.079813003540039, "logits/rejected": 1.0361995697021484, "logps/chosen": -143.89230346679688, "logps/ref_chosen": -49.74858474731445, "logps/ref_rejected": -54.940982818603516, "logps/rejected": -209.38763427734375, "loss": 1.0814, "margin_dpo/margin_mean": 60.302940368652344, "margin_dpo/margin_std": 79.73137664794922, "step": 306 }, { "epoch": 0.46409674981103555, "grad_norm": 19.44350814819336, "learning_rate": 3.2549163976939285e-07, "logits/chosen": 0.8657441139221191, "logits/rejected": 0.8813103437423706, "logps/chosen": -183.44241333007812, "logps/ref_chosen": -65.90791320800781, "logps/ref_rejected": -65.25321960449219, "logps/rejected": -221.45703125, "loss": 1.2949, "margin_dpo/margin_mean": 38.6693229675293, "margin_dpo/margin_std": 113.20942687988281, "step": 307 }, { "epoch": 0.4656084656084656, "grad_norm": 17.951297760009766, "learning_rate": 3.2423009383206874e-07, "logits/chosen": 0.9202988147735596, "logits/rejected": 0.9259182214736938, "logps/chosen": -203.8446807861328, "logps/ref_chosen": -71.3767318725586, "logps/ref_rejected": -82.10542297363281, "logps/rejected": -271.75042724609375, "loss": 1.1168, "margin_dpo/margin_mean": 57.17706298828125, "margin_dpo/margin_std": 100.58518981933594, "step": 308 }, { "epoch": 0.4671201814058957, "grad_norm": 14.301473617553711, "learning_rate": 3.229664715194511e-07, "logits/chosen": 0.9366397857666016, "logits/rejected": 0.913209080696106, "logps/chosen": -179.5953369140625, "logps/ref_chosen": -41.846153259277344, "logps/ref_rejected": -61.37134552001953, "logps/rejected": -237.18511962890625, "loss": 1.0857, "margin_dpo/margin_mean": 38.064598083496094, "margin_dpo/margin_std": 66.49103546142578, "step": 309 }, { "epoch": 0.46863189720332576, "grad_norm": 16.19812774658203, "learning_rate": 3.2170080817777257e-07, "logits/chosen": 0.969029426574707, "logits/rejected": 0.9913873672485352, "logps/chosen": -214.78329467773438, "logps/ref_chosen": -70.55810546875, "logps/ref_rejected": -64.62115478515625, "logps/rejected": -227.16543579101562, "loss": 1.2835, "margin_dpo/margin_mean": 18.319095611572266, "margin_dpo/margin_std": 73.50588989257812, "step": 310 }, { "epoch": 0.47014361300075586, "grad_norm": 16.054964065551758, "learning_rate": 3.204331392103574e-07, "logits/chosen": 0.8452152013778687, "logits/rejected": 0.8800424337387085, "logps/chosen": -181.9866485595703, "logps/ref_chosen": -60.32414245605469, "logps/ref_rejected": -68.1629638671875, "logps/rejected": -208.29612731933594, "loss": 1.0987, "margin_dpo/margin_mean": 18.47066307067871, "margin_dpo/margin_std": 71.49610137939453, "step": 311 }, { "epoch": 0.47165532879818595, "grad_norm": 13.902632713317871, "learning_rate": 3.1916350007663176e-07, "logits/chosen": 1.0027267932891846, "logits/rejected": 0.952292799949646, "logps/chosen": -169.0614013671875, "logps/ref_chosen": -55.58141326904297, "logps/ref_rejected": -71.82810974121094, "logps/rejected": -234.40756225585938, "loss": 0.9795, "margin_dpo/margin_mean": 49.09947967529297, "margin_dpo/margin_std": 47.16817855834961, "step": 312 }, { "epoch": 0.47316704459561604, "grad_norm": 15.096352577209473, "learning_rate": 3.178919262911314e-07, "logits/chosen": 0.9660900831222534, "logits/rejected": 1.0016883611679077, "logps/chosen": -156.38772583007812, "logps/ref_chosen": -52.92902374267578, "logps/ref_rejected": -47.91901779174805, "logps/rejected": -186.65386962890625, "loss": 1.2484, "margin_dpo/margin_mean": 35.27616500854492, "margin_dpo/margin_std": 60.62500762939453, "step": 313 }, { "epoch": 0.47467876039304613, "grad_norm": 15.742197036743164, "learning_rate": 3.166184534225087e-07, "logits/chosen": 0.9835371971130371, "logits/rejected": 0.9550716876983643, "logps/chosen": -169.02850341796875, "logps/ref_chosen": -64.4450454711914, "logps/ref_rejected": -76.83822631835938, "logps/rejected": -248.9261474609375, "loss": 1.0354, "margin_dpo/margin_mean": 67.50444793701172, "margin_dpo/margin_std": 74.06520080566406, "step": 314 }, { "epoch": 0.47619047619047616, "grad_norm": 13.899375915527344, "learning_rate": 3.1534311709253723e-07, "logits/chosen": 0.9853957295417786, "logits/rejected": 0.9791759252548218, "logps/chosen": -192.0851593017578, "logps/ref_chosen": -56.93284225463867, "logps/ref_rejected": -50.28406524658203, "logps/rejected": -199.18292236328125, "loss": 1.0917, "margin_dpo/margin_mean": 13.746543884277344, "margin_dpo/margin_std": 78.6737060546875, "step": 315 }, { "epoch": 0.47770219198790626, "grad_norm": 16.974472045898438, "learning_rate": 3.1406595297511564e-07, "logits/chosen": 0.7342613935470581, "logits/rejected": 0.5312126874923706, "logps/chosen": -182.5743408203125, "logps/ref_chosen": -66.3306884765625, "logps/ref_rejected": -137.49655151367188, "logps/rejected": -309.92919921875, "loss": 1.0034, "margin_dpo/margin_mean": 56.18898391723633, "margin_dpo/margin_std": 85.84481048583984, "step": 316 }, { "epoch": 0.47921390778533635, "grad_norm": 14.526897430419922, "learning_rate": 3.1278699679526975e-07, "logits/chosen": 0.9011315107345581, "logits/rejected": 0.8500291705131531, "logps/chosen": -154.86817932128906, "logps/ref_chosen": -42.494422912597656, "logps/ref_rejected": -66.68242645263672, "logps/rejected": -229.81800842285156, "loss": 0.9879, "margin_dpo/margin_mean": 50.761817932128906, "margin_dpo/margin_std": 63.989234924316406, "step": 317 }, { "epoch": 0.48072562358276644, "grad_norm": 17.438730239868164, "learning_rate": 3.1150628432815336e-07, "logits/chosen": 0.8117409944534302, "logits/rejected": 0.8378602266311646, "logps/chosen": -200.1689453125, "logps/ref_chosen": -80.13600158691406, "logps/ref_rejected": -83.40070343017578, "logps/rejected": -256.28057861328125, "loss": 1.1962, "margin_dpo/margin_mean": 52.8469352722168, "margin_dpo/margin_std": 89.18522644042969, "step": 318 }, { "epoch": 0.48223733938019653, "grad_norm": 13.24758529663086, "learning_rate": 3.1022385139804707e-07, "logits/chosen": 0.8036596179008484, "logits/rejected": 0.758333683013916, "logps/chosen": -186.25949096679688, "logps/ref_chosen": -83.42949676513672, "logps/ref_rejected": -113.88960266113281, "logps/rejected": -307.47576904296875, "loss": 1.0176, "margin_dpo/margin_mean": 90.75616455078125, "margin_dpo/margin_std": 84.7353515625, "step": 319 }, { "epoch": 0.4837490551776266, "grad_norm": 16.077190399169922, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 0.8179005980491638, "logits/rejected": 0.7942554950714111, "logps/chosen": -119.91881561279297, "logps/ref_chosen": -34.690284729003906, "logps/ref_rejected": -57.00449752807617, "logps/rejected": -183.24380493164062, "loss": 1.1051, "margin_dpo/margin_mean": 41.01078796386719, "margin_dpo/margin_std": 86.33978271484375, "step": 320 }, { "epoch": 0.4852607709750567, "grad_norm": 18.350908279418945, "learning_rate": 3.0765396768561004e-07, "logits/chosen": 0.8898118734359741, "logits/rejected": 0.9107556343078613, "logps/chosen": -122.81785583496094, "logps/ref_chosen": -43.647361755371094, "logps/ref_rejected": -49.380775451660156, "logps/rejected": -199.43606567382812, "loss": 1.0686, "margin_dpo/margin_mean": 70.88478088378906, "margin_dpo/margin_std": 72.49562072753906, "step": 321 }, { "epoch": 0.48677248677248675, "grad_norm": 15.368215560913086, "learning_rate": 3.063665887884511e-07, "logits/chosen": 1.0382732152938843, "logits/rejected": 0.9146447777748108, "logps/chosen": -97.49690246582031, "logps/ref_chosen": -30.90003204345703, "logps/ref_rejected": -70.10389709472656, "logps/rejected": -234.061767578125, "loss": 0.9306, "margin_dpo/margin_mean": 97.36099243164062, "margin_dpo/margin_std": 62.040199279785156, "step": 322 }, { "epoch": 0.48828420256991684, "grad_norm": 14.468438148498535, "learning_rate": 3.0507763319663517e-07, "logits/chosen": 0.8593270778656006, "logits/rejected": 0.8456133008003235, "logps/chosen": -175.40945434570312, "logps/ref_chosen": -65.93765258789062, "logps/ref_rejected": -73.23563385009766, "logps/rejected": -233.8284912109375, "loss": 1.1484, "margin_dpo/margin_mean": 51.121055603027344, "margin_dpo/margin_std": 80.7939453125, "step": 323 }, { "epoch": 0.4897959183673469, "grad_norm": 15.986069679260254, "learning_rate": 3.0378713696502097e-07, "logits/chosen": 0.8656540513038635, "logits/rejected": 0.8433347940444946, "logps/chosen": -124.39891052246094, "logps/ref_chosen": -40.30308532714844, "logps/ref_rejected": -56.531700134277344, "logps/rejected": -194.01043701171875, "loss": 1.0246, "margin_dpo/margin_mean": 53.382911682128906, "margin_dpo/margin_std": 58.651451110839844, "step": 324 }, { "epoch": 0.491307634164777, "grad_norm": 14.863397598266602, "learning_rate": 3.0249513619156206e-07, "logits/chosen": 1.018980860710144, "logits/rejected": 1.0166943073272705, "logps/chosen": -157.23825073242188, "logps/ref_chosen": -52.309547424316406, "logps/ref_rejected": -56.243019104003906, "logps/rejected": -230.12759399414062, "loss": 1.0454, "margin_dpo/margin_mean": 68.95586395263672, "margin_dpo/margin_std": 71.14509582519531, "step": 325 }, { "epoch": 0.4928193499622071, "grad_norm": 18.127382278442383, "learning_rate": 3.012016670162977e-07, "logits/chosen": 0.9008328914642334, "logits/rejected": 0.8836438655853271, "logps/chosen": -221.15011596679688, "logps/ref_chosen": -74.94476318359375, "logps/ref_rejected": -84.40521240234375, "logps/rejected": -264.2186584472656, "loss": 1.2972, "margin_dpo/margin_mean": 33.60809326171875, "margin_dpo/margin_std": 81.20984649658203, "step": 326 }, { "epoch": 0.4943310657596372, "grad_norm": 15.47046947479248, "learning_rate": 2.99906765620341e-07, "logits/chosen": 0.7541144490242004, "logits/rejected": 0.7670720815658569, "logps/chosen": -199.9362030029297, "logps/ref_chosen": -75.78781127929688, "logps/ref_rejected": -58.949928283691406, "logps/rejected": -254.4146728515625, "loss": 1.1629, "margin_dpo/margin_mean": 71.31634521484375, "margin_dpo/margin_std": 152.7041473388672, "step": 327 }, { "epoch": 0.4958427815570673, "grad_norm": 14.062821388244629, "learning_rate": 2.9861046822486766e-07, "logits/chosen": 0.7631938457489014, "logits/rejected": 0.7273193597793579, "logps/chosen": -151.08351135253906, "logps/ref_chosen": -64.362060546875, "logps/ref_rejected": -83.52467346191406, "logps/rejected": -229.90032958984375, "loss": 1.0252, "margin_dpo/margin_mean": 59.65420150756836, "margin_dpo/margin_std": 63.522666931152344, "step": 328 }, { "epoch": 0.4973544973544973, "grad_norm": 16.59518814086914, "learning_rate": 2.9731281109010253e-07, "logits/chosen": 0.9434309005737305, "logits/rejected": 0.8342186212539673, "logps/chosen": -146.32696533203125, "logps/ref_chosen": -49.1827278137207, "logps/ref_rejected": -84.71371459960938, "logps/rejected": -222.47955322265625, "loss": 1.0562, "margin_dpo/margin_mean": 40.62158966064453, "margin_dpo/margin_std": 68.19479370117188, "step": 329 }, { "epoch": 0.4988662131519274, "grad_norm": 14.8185396194458, "learning_rate": 2.9601383051430505e-07, "logits/chosen": 0.9047882556915283, "logits/rejected": 0.8687087297439575, "logps/chosen": -178.20071411132812, "logps/ref_chosen": -55.316497802734375, "logps/ref_rejected": -65.46820831298828, "logps/rejected": -196.5502166748047, "loss": 1.1321, "margin_dpo/margin_mean": 8.197792053222656, "margin_dpo/margin_std": 85.20492553710938, "step": 330 }, { "epoch": 0.5003779289493575, "grad_norm": 15.615235328674316, "learning_rate": 2.947135628327544e-07, "logits/chosen": 0.8331134915351868, "logits/rejected": 0.7597118616104126, "logps/chosen": -159.10202026367188, "logps/ref_chosen": -59.31645965576172, "logps/ref_rejected": -85.34983825683594, "logps/rejected": -289.3092346191406, "loss": 0.957, "margin_dpo/margin_mean": 104.17383575439453, "margin_dpo/margin_std": 75.5068588256836, "step": 331 }, { "epoch": 0.5018896447467877, "grad_norm": 16.04841423034668, "learning_rate": 2.934120444167326e-07, "logits/chosen": 0.8443226218223572, "logits/rejected": 0.8055863976478577, "logps/chosen": -169.5580291748047, "logps/ref_chosen": -58.70336151123047, "logps/ref_rejected": -75.91543579101562, "logps/rejected": -240.45140075683594, "loss": 1.0368, "margin_dpo/margin_mean": 53.68130874633789, "margin_dpo/margin_std": 67.17088317871094, "step": 332 }, { "epoch": 0.5034013605442177, "grad_norm": 17.165664672851562, "learning_rate": 2.921093116725076e-07, "logits/chosen": 0.9496264457702637, "logits/rejected": 0.8828880190849304, "logps/chosen": -196.00656127929688, "logps/ref_chosen": -68.11222839355469, "logps/ref_rejected": -106.30081939697266, "logps/rejected": -314.76043701171875, "loss": 0.9578, "margin_dpo/margin_mean": 80.5653076171875, "margin_dpo/margin_std": 64.04613494873047, "step": 333 }, { "epoch": 0.5049130763416477, "grad_norm": 13.934477806091309, "learning_rate": 2.9080540104031484e-07, "logits/chosen": 0.9245538711547852, "logits/rejected": 0.8373509645462036, "logps/chosen": -169.71932983398438, "logps/ref_chosen": -59.10272216796875, "logps/ref_rejected": -81.27894592285156, "logps/rejected": -235.17379760742188, "loss": 1.1124, "margin_dpo/margin_mean": 43.278263092041016, "margin_dpo/margin_std": 132.07708740234375, "step": 334 }, { "epoch": 0.5064247921390779, "grad_norm": 18.533241271972656, "learning_rate": 2.895003489933375e-07, "logits/chosen": 1.05397367477417, "logits/rejected": 0.9336830973625183, "logps/chosen": -147.48800659179688, "logps/ref_chosen": -59.12438201904297, "logps/ref_rejected": -121.7302017211914, "logps/rejected": -331.6442565917969, "loss": 1.0951, "margin_dpo/margin_mean": 121.5504150390625, "margin_dpo/margin_std": 88.70282745361328, "step": 335 }, { "epoch": 0.5079365079365079, "grad_norm": 16.828493118286133, "learning_rate": 2.8819419203668675e-07, "logits/chosen": 0.8980951309204102, "logits/rejected": 0.8811999559402466, "logps/chosen": -182.00933837890625, "logps/ref_chosen": -58.688018798828125, "logps/ref_rejected": -89.30653381347656, "logps/rejected": -294.16778564453125, "loss": 1.0804, "margin_dpo/margin_mean": 81.53993225097656, "margin_dpo/margin_std": 102.6664047241211, "step": 336 }, { "epoch": 0.509448223733938, "grad_norm": 13.274103164672852, "learning_rate": 2.8688696670638053e-07, "logits/chosen": 0.6885286569595337, "logits/rejected": 0.6868454813957214, "logps/chosen": -265.0749206542969, "logps/ref_chosen": -97.68962860107422, "logps/ref_rejected": -91.40831756591797, "logps/rejected": -269.9544677734375, "loss": 1.1635, "margin_dpo/margin_mean": 11.160860061645508, "margin_dpo/margin_std": 96.92953491210938, "step": 337 }, { "epoch": 0.5109599395313681, "grad_norm": 14.477042198181152, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 0.8425217866897583, "logits/rejected": 0.7798970937728882, "logps/chosen": -184.1267852783203, "logps/ref_chosen": -73.13327026367188, "logps/ref_rejected": -104.0283432006836, "logps/rejected": -278.3711853027344, "loss": 1.1303, "margin_dpo/margin_mean": 63.349342346191406, "margin_dpo/margin_std": 92.2508544921875, "step": 338 }, { "epoch": 0.5124716553287982, "grad_norm": 17.419946670532227, "learning_rate": 2.842694572172736e-07, "logits/chosen": 0.9565955400466919, "logits/rejected": 0.8105146884918213, "logps/chosen": -125.23301696777344, "logps/ref_chosen": -27.726638793945312, "logps/ref_rejected": -54.045658111572266, "logps/rejected": -183.23825073242188, "loss": 1.0861, "margin_dpo/margin_mean": 31.686233520507812, "margin_dpo/margin_std": 49.92768859863281, "step": 339 }, { "epoch": 0.5139833711262283, "grad_norm": 18.45969009399414, "learning_rate": 2.8295924627584004e-07, "logits/chosen": 0.9575048685073853, "logits/rejected": 0.8758723735809326, "logps/chosen": -149.28018188476562, "logps/ref_chosen": -37.378753662109375, "logps/ref_rejected": -67.66883850097656, "logps/rejected": -262.060546875, "loss": 1.1941, "margin_dpo/margin_mean": 82.4902572631836, "margin_dpo/margin_std": 83.99423217773438, "step": 340 }, { "epoch": 0.5154950869236583, "grad_norm": 17.296634674072266, "learning_rate": 2.816481133934373e-07, "logits/chosen": 1.0063215494155884, "logits/rejected": 0.9997066259384155, "logps/chosen": -138.48793029785156, "logps/ref_chosen": -42.783775329589844, "logps/ref_rejected": -59.344329833984375, "logps/rejected": -233.50625610351562, "loss": 1.0223, "margin_dpo/margin_mean": 78.457763671875, "margin_dpo/margin_std": 88.61229705810547, "step": 341 }, { "epoch": 0.5170068027210885, "grad_norm": 14.958573341369629, "learning_rate": 2.8033609524527046e-07, "logits/chosen": 0.9206830859184265, "logits/rejected": 0.9482388496398926, "logps/chosen": -197.25350952148438, "logps/ref_chosen": -72.35289764404297, "logps/ref_rejected": -63.26990509033203, "logps/rejected": -223.66114807128906, "loss": 1.0141, "margin_dpo/margin_mean": 35.490623474121094, "margin_dpo/margin_std": 68.44136810302734, "step": 342 }, { "epoch": 0.5185185185185185, "grad_norm": 14.569660186767578, "learning_rate": 2.7902322853130753e-07, "logits/chosen": 0.8026296496391296, "logits/rejected": 0.8457683324813843, "logps/chosen": -219.88909912109375, "logps/ref_chosen": -83.87641906738281, "logps/ref_rejected": -75.55497741699219, "logps/rejected": -238.33535766601562, "loss": 1.1457, "margin_dpo/margin_mean": 26.76772689819336, "margin_dpo/margin_std": 85.9823226928711, "step": 343 }, { "epoch": 0.5200302343159486, "grad_norm": 16.628511428833008, "learning_rate": 2.7770954997525274e-07, "logits/chosen": 1.025362491607666, "logits/rejected": 0.8970023393630981, "logps/chosen": -157.8984832763672, "logps/ref_chosen": -35.154476165771484, "logps/ref_rejected": -77.97383880615234, "logps/rejected": -260.02105712890625, "loss": 1.0538, "margin_dpo/margin_mean": 59.303184509277344, "margin_dpo/margin_std": 57.50261306762695, "step": 344 }, { "epoch": 0.5215419501133787, "grad_norm": 18.29877471923828, "learning_rate": 2.7639509632351927e-07, "logits/chosen": 0.9883379936218262, "logits/rejected": 0.9794098734855652, "logps/chosen": -154.01934814453125, "logps/ref_chosen": -39.99463653564453, "logps/ref_rejected": -52.60383224487305, "logps/rejected": -210.22317504882812, "loss": 1.0774, "margin_dpo/margin_mean": 43.594635009765625, "margin_dpo/margin_std": 90.1194076538086, "step": 345 }, { "epoch": 0.5230536659108088, "grad_norm": 14.863842964172363, "learning_rate": 2.7507990434420123e-07, "logits/chosen": 0.9803563356399536, "logits/rejected": 0.9136096239089966, "logps/chosen": -155.381103515625, "logps/ref_chosen": -61.2567024230957, "logps/ref_rejected": -102.48171997070312, "logps/rejected": -321.823974609375, "loss": 0.9885, "margin_dpo/margin_mean": 125.21788787841797, "margin_dpo/margin_std": 68.31744384765625, "step": 346 }, { "epoch": 0.5245653817082389, "grad_norm": 16.280048370361328, "learning_rate": 2.737640108260456e-07, "logits/chosen": 1.080568790435791, "logits/rejected": 1.0347011089324951, "logps/chosen": -183.25091552734375, "logps/ref_chosen": -58.63034439086914, "logps/ref_rejected": -79.94859313964844, "logps/rejected": -263.5897216796875, "loss": 1.058, "margin_dpo/margin_mean": 59.020530700683594, "margin_dpo/margin_std": 88.26313781738281, "step": 347 }, { "epoch": 0.5260770975056689, "grad_norm": 18.49781036376953, "learning_rate": 2.724474525774229e-07, "logits/chosen": 0.9393756985664368, "logits/rejected": 0.9025633335113525, "logps/chosen": -187.76321411132812, "logps/ref_chosen": -72.25175476074219, "logps/ref_rejected": -95.0661392211914, "logps/rejected": -275.28118896484375, "loss": 1.0541, "margin_dpo/margin_mean": 64.70359802246094, "margin_dpo/margin_std": 109.8628158569336, "step": 348 }, { "epoch": 0.527588813303099, "grad_norm": 15.981618881225586, "learning_rate": 2.711302664252973e-07, "logits/chosen": 0.9622572660446167, "logits/rejected": 0.8644802570343018, "logps/chosen": -134.60650634765625, "logps/ref_chosen": -34.93451690673828, "logps/ref_rejected": -71.41903686523438, "logps/rejected": -250.6934814453125, "loss": 1.0037, "margin_dpo/margin_mean": 79.60247039794922, "margin_dpo/margin_std": 61.48447799682617, "step": 349 }, { "epoch": 0.5291005291005291, "grad_norm": 17.303754806518555, "learning_rate": 2.698124892141971e-07, "logits/chosen": 0.9259968996047974, "logits/rejected": 0.8843666911125183, "logps/chosen": -206.68112182617188, "logps/ref_chosen": -71.93693542480469, "logps/ref_rejected": -97.71165466308594, "logps/rejected": -320.2801513671875, "loss": 0.8634, "margin_dpo/margin_mean": 87.82434844970703, "margin_dpo/margin_std": 68.40359497070312, "step": 350 }, { "epoch": 0.5306122448979592, "grad_norm": 15.037851333618164, "learning_rate": 2.6849415780518357e-07, "logits/chosen": 0.8401812314987183, "logits/rejected": 0.6766891479492188, "logps/chosen": -163.47976684570312, "logps/ref_chosen": -55.08075714111328, "logps/ref_rejected": -107.10870361328125, "logps/rejected": -288.9677734375, "loss": 1.092, "margin_dpo/margin_mean": 73.46005249023438, "margin_dpo/margin_std": 82.07776641845703, "step": 351 }, { "epoch": 0.5321239606953893, "grad_norm": 19.045122146606445, "learning_rate": 2.6717530907482024e-07, "logits/chosen": 0.8240054845809937, "logits/rejected": 0.8280425071716309, "logps/chosen": -186.6539306640625, "logps/ref_chosen": -75.67378234863281, "logps/ref_rejected": -88.66374206542969, "logps/rejected": -267.3773193359375, "loss": 1.0297, "margin_dpo/margin_mean": 67.73342895507812, "margin_dpo/margin_std": 94.85681915283203, "step": 352 }, { "epoch": 0.5336356764928194, "grad_norm": 16.103891372680664, "learning_rate": 2.658559799141411e-07, "logits/chosen": 0.9016702175140381, "logits/rejected": 0.7808640003204346, "logps/chosen": -195.90447998046875, "logps/ref_chosen": -62.94065856933594, "logps/ref_rejected": -104.08489990234375, "logps/rejected": -294.2989807128906, "loss": 1.082, "margin_dpo/margin_mean": 57.25025177001953, "margin_dpo/margin_std": 92.04182434082031, "step": 353 }, { "epoch": 0.5351473922902494, "grad_norm": 18.149988174438477, "learning_rate": 2.6453620722761895e-07, "logits/chosen": 1.0865049362182617, "logits/rejected": 1.0457913875579834, "logps/chosen": -135.3533172607422, "logps/ref_chosen": -28.847824096679688, "logps/ref_rejected": -53.78091812133789, "logps/rejected": -229.83432006835938, "loss": 1.0416, "margin_dpo/margin_mean": 69.54791259765625, "margin_dpo/margin_std": 84.38679504394531, "step": 354 }, { "epoch": 0.5366591080876795, "grad_norm": 17.005126953125, "learning_rate": 2.632160279321328e-07, "logits/chosen": 1.035976767539978, "logits/rejected": 0.9110543727874756, "logps/chosen": -159.94003295898438, "logps/ref_chosen": -53.094722747802734, "logps/ref_rejected": -91.13424682617188, "logps/rejected": -290.590087890625, "loss": 0.9458, "margin_dpo/margin_mean": 92.61053466796875, "margin_dpo/margin_std": 103.92430114746094, "step": 355 }, { "epoch": 0.5381708238851096, "grad_norm": 15.574033737182617, "learning_rate": 2.618954789559356e-07, "logits/chosen": 0.9952447414398193, "logits/rejected": 0.85740727186203, "logps/chosen": -137.01632690429688, "logps/ref_chosen": -34.362483978271484, "logps/ref_rejected": -77.31940460205078, "logps/rejected": -252.238037109375, "loss": 1.0505, "margin_dpo/margin_mean": 72.2647933959961, "margin_dpo/margin_std": 116.42106628417969, "step": 356 }, { "epoch": 0.5396825396825397, "grad_norm": 18.58148765563965, "learning_rate": 2.6057459723762076e-07, "logits/chosen": 0.9232524633407593, "logits/rejected": 0.8515598773956299, "logps/chosen": -193.28883361816406, "logps/ref_chosen": -64.16845703125, "logps/ref_rejected": -78.76988983154297, "logps/rejected": -273.12249755859375, "loss": 1.1607, "margin_dpo/margin_mean": 65.23223114013672, "margin_dpo/margin_std": 106.97843933105469, "step": 357 }, { "epoch": 0.5411942554799698, "grad_norm": 25.09504508972168, "learning_rate": 2.5925341972508954e-07, "logits/chosen": 0.8110201954841614, "logits/rejected": 0.862175464630127, "logps/chosen": -185.81414794921875, "logps/ref_chosen": -64.39706420898438, "logps/ref_rejected": -56.678443908691406, "logps/rejected": -252.69879150390625, "loss": 1.107, "margin_dpo/margin_mean": 74.60326385498047, "margin_dpo/margin_std": 79.77681732177734, "step": 358 }, { "epoch": 0.5427059712773998, "grad_norm": 27.411643981933594, "learning_rate": 2.579319833745169e-07, "logits/chosen": 0.9938050508499146, "logits/rejected": 1.0131170749664307, "logps/chosen": -222.49644470214844, "logps/ref_chosen": -71.20832824707031, "logps/ref_rejected": -75.58880615234375, "logps/rejected": -236.30592346191406, "loss": 1.3531, "margin_dpo/margin_mean": 9.428986549377441, "margin_dpo/margin_std": 102.51341247558594, "step": 359 }, { "epoch": 0.54421768707483, "grad_norm": 16.536500930786133, "learning_rate": 2.5661032514931834e-07, "logits/chosen": 0.8350504636764526, "logits/rejected": 0.687712550163269, "logps/chosen": -199.9302520751953, "logps/ref_chosen": -65.89573669433594, "logps/ref_rejected": -97.73664855957031, "logps/rejected": -276.3427734375, "loss": 1.0164, "margin_dpo/margin_mean": 44.57158660888672, "margin_dpo/margin_std": 97.77774047851562, "step": 360 }, { "epoch": 0.54572940287226, "grad_norm": 15.011344909667969, "learning_rate": 2.552884820191154e-07, "logits/chosen": 0.9806392192840576, "logits/rejected": 0.9534279108047485, "logps/chosen": -175.77911376953125, "logps/ref_chosen": -50.53264617919922, "logps/ref_rejected": -59.25585174560547, "logps/rejected": -233.24551391601562, "loss": 1.0024, "margin_dpo/margin_mean": 48.743194580078125, "margin_dpo/margin_std": 77.80059051513672, "step": 361 }, { "epoch": 0.54724111866969, "grad_norm": 16.17755699157715, "learning_rate": 2.53966490958702e-07, "logits/chosen": 1.0447218418121338, "logits/rejected": 0.9513689279556274, "logps/chosen": -221.77896118164062, "logps/ref_chosen": -58.92408752441406, "logps/ref_rejected": -104.06151580810547, "logps/rejected": -342.5570373535156, "loss": 0.971, "margin_dpo/margin_mean": 75.64065551757812, "margin_dpo/margin_std": 177.78872680664062, "step": 362 }, { "epoch": 0.5487528344671202, "grad_norm": 21.469377517700195, "learning_rate": 2.526443889470099e-07, "logits/chosen": 0.9666943550109863, "logits/rejected": 0.7734654545783997, "logps/chosen": -171.7001495361328, "logps/ref_chosen": -46.72846984863281, "logps/ref_rejected": -140.4446258544922, "logps/rejected": -366.104736328125, "loss": 0.9585, "margin_dpo/margin_mean": 100.68843078613281, "margin_dpo/margin_std": 130.09930419921875, "step": 363 }, { "epoch": 0.5502645502645502, "grad_norm": 16.017593383789062, "learning_rate": 2.513222129660744e-07, "logits/chosen": 0.9620314240455627, "logits/rejected": 0.8384478688240051, "logps/chosen": -144.81683349609375, "logps/ref_chosen": -47.71454620361328, "logps/ref_rejected": -85.33769226074219, "logps/rejected": -307.9134521484375, "loss": 0.9883, "margin_dpo/margin_mean": 125.4734878540039, "margin_dpo/margin_std": 125.29339599609375, "step": 364 }, { "epoch": 0.5517762660619804, "grad_norm": 16.882993698120117, "learning_rate": 2.5e-07, "logits/chosen": 0.9969468116760254, "logits/rejected": 1.0294103622436523, "logps/chosen": -152.65603637695312, "logps/ref_chosen": -53.76380157470703, "logps/ref_rejected": -46.24406433105469, "logps/rejected": -212.58950805664062, "loss": 0.9774, "margin_dpo/margin_mean": 67.45320892333984, "margin_dpo/margin_std": 76.16273498535156, "step": 365 }, { "epoch": 0.5532879818594104, "grad_norm": 19.378551483154297, "learning_rate": 2.486777870339255e-07, "logits/chosen": 0.9522498846054077, "logits/rejected": 0.9834457635879517, "logps/chosen": -179.36004638671875, "logps/ref_chosen": -67.52264404296875, "logps/ref_rejected": -70.28094482421875, "logps/rejected": -228.3491973876953, "loss": 1.0973, "margin_dpo/margin_mean": 46.230857849121094, "margin_dpo/margin_std": 88.74593353271484, "step": 366 }, { "epoch": 0.5547996976568406, "grad_norm": 18.30280876159668, "learning_rate": 2.4735561105299014e-07, "logits/chosen": 0.9707492589950562, "logits/rejected": 0.8614064455032349, "logps/chosen": -161.2140655517578, "logps/ref_chosen": -55.156681060791016, "logps/ref_rejected": -82.34903717041016, "logps/rejected": -260.894775390625, "loss": 1.0604, "margin_dpo/margin_mean": 72.48836517333984, "margin_dpo/margin_std": 68.52483367919922, "step": 367 }, { "epoch": 0.5563114134542706, "grad_norm": 19.369901657104492, "learning_rate": 2.46033509041298e-07, "logits/chosen": 0.7760097980499268, "logits/rejected": 0.8570929765701294, "logps/chosen": -175.87571716308594, "logps/ref_chosen": -74.56654357910156, "logps/ref_rejected": -55.081199645996094, "logps/rejected": -217.58401489257812, "loss": 1.137, "margin_dpo/margin_mean": 61.19365692138672, "margin_dpo/margin_std": 94.93489074707031, "step": 368 }, { "epoch": 0.5578231292517006, "grad_norm": 24.349010467529297, "learning_rate": 2.447115179808846e-07, "logits/chosen": 0.9876595735549927, "logits/rejected": 0.9113900661468506, "logps/chosen": -190.04141235351562, "logps/ref_chosen": -61.67764663696289, "logps/ref_rejected": -88.59959411621094, "logps/rejected": -285.03338623046875, "loss": 1.1874, "margin_dpo/margin_mean": 68.07002258300781, "margin_dpo/margin_std": 85.15034484863281, "step": 369 }, { "epoch": 0.5593348450491308, "grad_norm": 16.110780715942383, "learning_rate": 2.4338967485068164e-07, "logits/chosen": 1.0228030681610107, "logits/rejected": 0.9811384677886963, "logps/chosen": -119.4067153930664, "logps/ref_chosen": -44.50119400024414, "logps/ref_rejected": -81.18331909179688, "logps/rejected": -273.24188232421875, "loss": 0.9948, "margin_dpo/margin_mean": 117.15304565429688, "margin_dpo/margin_std": 106.01179504394531, "step": 370 }, { "epoch": 0.5608465608465608, "grad_norm": 21.480369567871094, "learning_rate": 2.420680166254831e-07, "logits/chosen": 1.0966333150863647, "logits/rejected": 1.0227328538894653, "logps/chosen": -148.44422912597656, "logps/ref_chosen": -39.17439651489258, "logps/ref_rejected": -76.12638092041016, "logps/rejected": -258.49078369140625, "loss": 1.0422, "margin_dpo/margin_mean": 73.09457397460938, "margin_dpo/margin_std": 105.94315338134766, "step": 371 }, { "epoch": 0.562358276643991, "grad_norm": 20.347423553466797, "learning_rate": 2.4074658027491044e-07, "logits/chosen": 0.9855044484138489, "logits/rejected": 0.8605490922927856, "logps/chosen": -152.38363647460938, "logps/ref_chosen": -52.27345275878906, "logps/ref_rejected": -87.656494140625, "logps/rejected": -296.9883117675781, "loss": 1.2809, "margin_dpo/margin_mean": 109.22164916992188, "margin_dpo/margin_std": 77.62582397460938, "step": 372 }, { "epoch": 0.563869992441421, "grad_norm": 17.717870712280273, "learning_rate": 2.394254027623792e-07, "logits/chosen": 1.0428986549377441, "logits/rejected": 1.0207685232162476, "logps/chosen": -223.8621368408203, "logps/ref_chosen": -68.01244354248047, "logps/ref_rejected": -64.4259033203125, "logps/rejected": -273.6564636230469, "loss": 1.1361, "margin_dpo/margin_mean": 53.3808708190918, "margin_dpo/margin_std": 147.9482421875, "step": 373 }, { "epoch": 0.5653817082388511, "grad_norm": 27.212177276611328, "learning_rate": 2.381045210440644e-07, "logits/chosen": 0.8525890111923218, "logits/rejected": 0.7748229503631592, "logps/chosen": -152.43893432617188, "logps/ref_chosen": -56.639495849609375, "logps/ref_rejected": -83.10781860351562, "logps/rejected": -317.96673583984375, "loss": 1.0262, "margin_dpo/margin_mean": 139.05946350097656, "margin_dpo/margin_std": 114.1051025390625, "step": 374 }, { "epoch": 0.5668934240362812, "grad_norm": 21.642242431640625, "learning_rate": 2.3678397206786715e-07, "logits/chosen": 1.0544872283935547, "logits/rejected": 0.9103308320045471, "logps/chosen": -140.70907592773438, "logps/ref_chosen": -31.620290756225586, "logps/ref_rejected": -70.079345703125, "logps/rejected": -234.93954467773438, "loss": 1.1789, "margin_dpo/margin_mean": 55.77141189575195, "margin_dpo/margin_std": 87.87168884277344, "step": 375 }, { "epoch": 0.5684051398337112, "grad_norm": 18.192161560058594, "learning_rate": 2.3546379277238103e-07, "logits/chosen": 0.788690447807312, "logits/rejected": 0.8902658224105835, "logps/chosen": -237.19358825683594, "logps/ref_chosen": -78.47712707519531, "logps/ref_rejected": -57.03622055053711, "logps/rejected": -222.91226196289062, "loss": 1.0841, "margin_dpo/margin_mean": 7.1595916748046875, "margin_dpo/margin_std": 78.5771255493164, "step": 376 }, { "epoch": 0.5699168556311414, "grad_norm": 16.722244262695312, "learning_rate": 2.3414402008585886e-07, "logits/chosen": 0.9365607500076294, "logits/rejected": 0.937558650970459, "logps/chosen": -185.59255981445312, "logps/ref_chosen": -60.769771575927734, "logps/ref_rejected": -65.29888916015625, "logps/rejected": -243.30319213867188, "loss": 1.1232, "margin_dpo/margin_mean": 53.181495666503906, "margin_dpo/margin_std": 76.733642578125, "step": 377 }, { "epoch": 0.5714285714285714, "grad_norm": 18.274368286132812, "learning_rate": 2.3282469092517977e-07, "logits/chosen": 1.0017985105514526, "logits/rejected": 0.944943904876709, "logps/chosen": -159.78944396972656, "logps/ref_chosen": -50.79759979248047, "logps/ref_rejected": -75.34347534179688, "logps/rejected": -254.2080841064453, "loss": 1.1378, "margin_dpo/margin_mean": 69.87276458740234, "margin_dpo/margin_std": 101.12820434570312, "step": 378 }, { "epoch": 0.5729402872260015, "grad_norm": 17.872270584106445, "learning_rate": 2.3150584219481643e-07, "logits/chosen": 0.9760261178016663, "logits/rejected": 0.9050667881965637, "logps/chosen": -185.8768310546875, "logps/ref_chosen": -61.738677978515625, "logps/ref_rejected": -97.11418151855469, "logps/rejected": -275.95050048828125, "loss": 1.0379, "margin_dpo/margin_mean": 54.698150634765625, "margin_dpo/margin_std": 97.39860534667969, "step": 379 }, { "epoch": 0.5744520030234316, "grad_norm": 15.110595703125, "learning_rate": 2.3018751078580283e-07, "logits/chosen": 0.8375756740570068, "logits/rejected": 0.9170357584953308, "logps/chosen": -169.6956329345703, "logps/ref_chosen": -44.782066345214844, "logps/ref_rejected": -41.68242263793945, "logps/rejected": -221.0014190673828, "loss": 0.9764, "margin_dpo/margin_mean": 54.40543746948242, "margin_dpo/margin_std": 82.51588439941406, "step": 380 }, { "epoch": 0.5759637188208617, "grad_norm": 19.764453887939453, "learning_rate": 2.288697335747027e-07, "logits/chosen": 0.975003719329834, "logits/rejected": 0.9306457042694092, "logps/chosen": -211.8162384033203, "logps/ref_chosen": -59.876434326171875, "logps/ref_rejected": -56.535682678222656, "logps/rejected": -277.5439147949219, "loss": 1.3201, "margin_dpo/margin_mean": 69.06842041015625, "margin_dpo/margin_std": 162.8582763671875, "step": 381 }, { "epoch": 0.5774754346182918, "grad_norm": 14.49204158782959, "learning_rate": 2.2755254742257706e-07, "logits/chosen": 1.0008131265640259, "logits/rejected": 0.9001951217651367, "logps/chosen": -203.72560119628906, "logps/ref_chosen": -63.76511764526367, "logps/ref_rejected": -100.84956359863281, "logps/rejected": -320.87860107421875, "loss": 1.0773, "margin_dpo/margin_mean": 80.06857299804688, "margin_dpo/margin_std": 91.30355072021484, "step": 382 }, { "epoch": 0.5789871504157218, "grad_norm": 20.305614471435547, "learning_rate": 2.2623598917395436e-07, "logits/chosen": 0.9180213809013367, "logits/rejected": 0.8855539560317993, "logps/chosen": -171.7388458251953, "logps/ref_chosen": -56.75093078613281, "logps/ref_rejected": -63.029056549072266, "logps/rejected": -220.02169799804688, "loss": 1.1631, "margin_dpo/margin_mean": 42.00471115112305, "margin_dpo/margin_std": 75.70281982421875, "step": 383 }, { "epoch": 0.5804988662131519, "grad_norm": 19.855918884277344, "learning_rate": 2.2492009565579875e-07, "logits/chosen": 0.8351778984069824, "logits/rejected": 0.8428291082382202, "logps/chosen": -205.7308807373047, "logps/ref_chosen": -79.68529510498047, "logps/ref_rejected": -81.70601654052734, "logps/rejected": -268.79345703125, "loss": 1.0003, "margin_dpo/margin_mean": 61.041873931884766, "margin_dpo/margin_std": 60.530391693115234, "step": 384 }, { "epoch": 0.582010582010582, "grad_norm": 15.318973541259766, "learning_rate": 2.2360490367648084e-07, "logits/chosen": 0.8146077394485474, "logits/rejected": 0.808496356010437, "logps/chosen": -191.92584228515625, "logps/ref_chosen": -55.45124053955078, "logps/ref_rejected": -74.28287506103516, "logps/rejected": -266.0337219238281, "loss": 0.9743, "margin_dpo/margin_mean": 55.27625274658203, "margin_dpo/margin_std": 67.60725402832031, "step": 385 }, { "epoch": 0.5835222978080121, "grad_norm": 16.764909744262695, "learning_rate": 2.2229045002474724e-07, "logits/chosen": 0.8237930536270142, "logits/rejected": 0.7379618287086487, "logps/chosen": -165.93934631347656, "logps/ref_chosen": -49.65403747558594, "logps/ref_rejected": -81.07264709472656, "logps/rejected": -254.802978515625, "loss": 1.1167, "margin_dpo/margin_mean": 57.445030212402344, "margin_dpo/margin_std": 128.3809814453125, "step": 386 }, { "epoch": 0.5850340136054422, "grad_norm": 20.964162826538086, "learning_rate": 2.209767714686924e-07, "logits/chosen": 1.0351340770721436, "logits/rejected": 0.93593430519104, "logps/chosen": -142.808349609375, "logps/ref_chosen": -30.506126403808594, "logps/ref_rejected": -75.85283660888672, "logps/rejected": -261.2908935546875, "loss": 0.9562, "margin_dpo/margin_mean": 73.13583374023438, "margin_dpo/margin_std": 91.49317932128906, "step": 387 }, { "epoch": 0.5865457294028723, "grad_norm": 16.960439682006836, "learning_rate": 2.1966390475472954e-07, "logits/chosen": 0.8540966510772705, "logits/rejected": 0.7713406085968018, "logps/chosen": -190.65914916992188, "logps/ref_chosen": -74.6607437133789, "logps/ref_rejected": -112.83131408691406, "logps/rejected": -320.3790588378906, "loss": 1.1545, "margin_dpo/margin_mean": 91.54933166503906, "margin_dpo/margin_std": 84.64852142333984, "step": 388 }, { "epoch": 0.5880574452003023, "grad_norm": 28.818973541259766, "learning_rate": 2.1835188660656265e-07, "logits/chosen": 0.9836728572845459, "logits/rejected": 0.8946930170059204, "logps/chosen": -191.19837951660156, "logps/ref_chosen": -57.50859069824219, "logps/ref_rejected": -99.1073989868164, "logps/rejected": -297.79901123046875, "loss": 1.0733, "margin_dpo/margin_mean": 65.00183868408203, "margin_dpo/margin_std": 77.60054016113281, "step": 389 }, { "epoch": 0.5895691609977324, "grad_norm": 14.67129135131836, "learning_rate": 2.170407537241599e-07, "logits/chosen": 0.8287357687950134, "logits/rejected": 0.8107761144638062, "logps/chosen": -167.33096313476562, "logps/ref_chosen": -48.60906982421875, "logps/ref_rejected": -54.462955474853516, "logps/rejected": -222.6549530029297, "loss": 1.0588, "margin_dpo/margin_mean": 49.4700927734375, "margin_dpo/margin_std": 65.86190795898438, "step": 390 }, { "epoch": 0.5910808767951625, "grad_norm": 15.129668235778809, "learning_rate": 2.1573054278272636e-07, "logits/chosen": 0.8232520818710327, "logits/rejected": 0.7771520018577576, "logps/chosen": -169.54945373535156, "logps/ref_chosen": -64.54489135742188, "logps/ref_rejected": -90.6060791015625, "logps/rejected": -272.054931640625, "loss": 1.0709, "margin_dpo/margin_mean": 76.44432067871094, "margin_dpo/margin_std": 110.20875549316406, "step": 391 }, { "epoch": 0.5925925925925926, "grad_norm": 15.302267074584961, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 0.946514904499054, "logits/rejected": 0.9243011474609375, "logps/chosen": -161.78128051757812, "logps/ref_chosen": -62.38185119628906, "logps/ref_rejected": -71.40414428710938, "logps/rejected": -220.90402221679688, "loss": 1.0166, "margin_dpo/margin_mean": 50.100440979003906, "margin_dpo/margin_std": 103.29415130615234, "step": 392 }, { "epoch": 0.5941043083900227, "grad_norm": 15.310603141784668, "learning_rate": 2.131130332936195e-07, "logits/chosen": 0.8303643465042114, "logits/rejected": 0.830100953578949, "logps/chosen": -150.3733367919922, "logps/ref_chosen": -40.24174499511719, "logps/ref_rejected": -50.49744415283203, "logps/rejected": -233.74623107910156, "loss": 0.9411, "margin_dpo/margin_mean": 73.11720275878906, "margin_dpo/margin_std": 63.29140853881836, "step": 393 }, { "epoch": 0.5956160241874527, "grad_norm": 15.765230178833008, "learning_rate": 2.1180580796331323e-07, "logits/chosen": 0.971742570400238, "logits/rejected": 0.847053050994873, "logps/chosen": -156.80836486816406, "logps/ref_chosen": -46.0341911315918, "logps/ref_rejected": -90.94654846191406, "logps/rejected": -264.73736572265625, "loss": 1.076, "margin_dpo/margin_mean": 63.01665496826172, "margin_dpo/margin_std": 79.52315521240234, "step": 394 }, { "epoch": 0.5971277399848829, "grad_norm": 14.962550163269043, "learning_rate": 2.104996510066625e-07, "logits/chosen": 1.010396122932434, "logits/rejected": 0.888831615447998, "logps/chosen": -136.7878875732422, "logps/ref_chosen": -45.55821990966797, "logps/ref_rejected": -93.29295349121094, "logps/rejected": -282.3095397949219, "loss": 1.0567, "margin_dpo/margin_mean": 97.78689575195312, "margin_dpo/margin_std": 64.2834701538086, "step": 395 }, { "epoch": 0.5986394557823129, "grad_norm": 15.773469924926758, "learning_rate": 2.0919459895968517e-07, "logits/chosen": 0.9865584373474121, "logits/rejected": 0.9610118865966797, "logps/chosen": -135.16543579101562, "logps/ref_chosen": -47.607505798339844, "logps/ref_rejected": -52.5338020324707, "logps/rejected": -236.95713806152344, "loss": 1.0154, "margin_dpo/margin_mean": 96.86540222167969, "margin_dpo/margin_std": 54.51676940917969, "step": 396 }, { "epoch": 0.600151171579743, "grad_norm": 17.012723922729492, "learning_rate": 2.078906883274924e-07, "logits/chosen": 0.8290398120880127, "logits/rejected": 0.7684098482131958, "logps/chosen": -193.76319885253906, "logps/ref_chosen": -61.47978973388672, "logps/ref_rejected": -80.81649780273438, "logps/rejected": -270.89862060546875, "loss": 1.2961, "margin_dpo/margin_mean": 57.79869842529297, "margin_dpo/margin_std": 66.68041229248047, "step": 397 }, { "epoch": 0.6016628873771731, "grad_norm": 14.493592262268066, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.8568699359893799, "logits/rejected": 0.7129446268081665, "logps/chosen": -158.29476928710938, "logps/ref_chosen": -47.49082946777344, "logps/ref_rejected": -101.38699340820312, "logps/rejected": -306.26104736328125, "loss": 0.9401, "margin_dpo/margin_mean": 94.07012939453125, "margin_dpo/margin_std": 80.53518676757812, "step": 398 }, { "epoch": 0.6031746031746031, "grad_norm": 16.002872467041016, "learning_rate": 2.052864371672457e-07, "logits/chosen": 0.9500819444656372, "logits/rejected": 0.8516980409622192, "logps/chosen": -241.56881713867188, "logps/ref_chosen": -76.00422668457031, "logps/ref_rejected": -139.26205444335938, "logps/rejected": -371.9350280761719, "loss": 0.9017, "margin_dpo/margin_mean": 67.10838317871094, "margin_dpo/margin_std": 61.81909942626953, "step": 399 }, { "epoch": 0.6046863189720333, "grad_norm": 22.764698028564453, "learning_rate": 2.0398616948569493e-07, "logits/chosen": 0.8686426281929016, "logits/rejected": 0.9016132354736328, "logps/chosen": -222.953857421875, "logps/ref_chosen": -95.81818389892578, "logps/ref_rejected": -84.06385803222656, "logps/rejected": -276.75958251953125, "loss": 1.1296, "margin_dpo/margin_mean": 65.56008911132812, "margin_dpo/margin_std": 57.78480529785156, "step": 400 }, { "epoch": 0.6061980347694633, "grad_norm": 11.519392013549805, "learning_rate": 2.0268718890989752e-07, "logits/chosen": 0.8911803960800171, "logits/rejected": 0.8166502714157104, "logps/chosen": -165.61497497558594, "logps/ref_chosen": -53.86456298828125, "logps/ref_rejected": -53.25059509277344, "logps/rejected": -205.1787567138672, "loss": 0.9321, "margin_dpo/margin_mean": 40.17775344848633, "margin_dpo/margin_std": 83.69491577148438, "step": 401 }, { "epoch": 0.6077097505668935, "grad_norm": 16.777515411376953, "learning_rate": 2.013895317751323e-07, "logits/chosen": 0.9933120012283325, "logits/rejected": 0.814152717590332, "logps/chosen": -142.3320770263672, "logps/ref_chosen": -37.891700744628906, "logps/ref_rejected": -85.77980041503906, "logps/rejected": -285.2149963378906, "loss": 1.0406, "margin_dpo/margin_mean": 94.99481201171875, "margin_dpo/margin_std": 68.64460754394531, "step": 402 }, { "epoch": 0.6092214663643235, "grad_norm": 13.203496932983398, "learning_rate": 2.0009323437965898e-07, "logits/chosen": 1.0703718662261963, "logits/rejected": 0.9925715923309326, "logps/chosen": -180.50936889648438, "logps/ref_chosen": -63.857696533203125, "logps/ref_rejected": -93.38938903808594, "logps/rejected": -301.53302001953125, "loss": 0.9559, "margin_dpo/margin_mean": 91.49195861816406, "margin_dpo/margin_std": 91.93600463867188, "step": 403 }, { "epoch": 0.6107331821617535, "grad_norm": 15.767083168029785, "learning_rate": 1.9879833298370237e-07, "logits/chosen": 0.9310320019721985, "logits/rejected": 0.9439609050750732, "logps/chosen": -240.16302490234375, "logps/ref_chosen": -84.43171691894531, "logps/ref_rejected": -89.65742492675781, "logps/rejected": -274.97003173828125, "loss": 0.9804, "margin_dpo/margin_mean": 29.5813045501709, "margin_dpo/margin_std": 90.8002700805664, "step": 404 }, { "epoch": 0.6122448979591837, "grad_norm": 14.391467094421387, "learning_rate": 1.975048638084379e-07, "logits/chosen": 1.0961458683013916, "logits/rejected": 1.0419206619262695, "logps/chosen": -144.2384033203125, "logps/ref_chosen": -43.280792236328125, "logps/ref_rejected": -58.38227844238281, "logps/rejected": -217.02481079101562, "loss": 1.0274, "margin_dpo/margin_mean": 57.6849365234375, "margin_dpo/margin_std": 87.62693786621094, "step": 405 }, { "epoch": 0.6137566137566137, "grad_norm": 17.21456527709961, "learning_rate": 1.9621286303497914e-07, "logits/chosen": 1.0429320335388184, "logits/rejected": 0.8181569576263428, "logps/chosen": -143.40634155273438, "logps/ref_chosen": -38.76139831542969, "logps/ref_rejected": -95.4449462890625, "logps/rejected": -312.931396484375, "loss": 0.9634, "margin_dpo/margin_mean": 112.84149169921875, "margin_dpo/margin_std": 117.30062866210938, "step": 406 }, { "epoch": 0.6152683295540439, "grad_norm": 17.83815574645996, "learning_rate": 1.9492236680336483e-07, "logits/chosen": 0.921512246131897, "logits/rejected": 0.9170160293579102, "logps/chosen": -201.85311889648438, "logps/ref_chosen": -70.94854736328125, "logps/ref_rejected": -83.073486328125, "logps/rejected": -236.95440673828125, "loss": 1.065, "margin_dpo/margin_mean": 22.976346969604492, "margin_dpo/margin_std": 84.53071594238281, "step": 407 }, { "epoch": 0.6167800453514739, "grad_norm": 14.140363693237305, "learning_rate": 1.9363341121154895e-07, "logits/chosen": 0.9865923523902893, "logits/rejected": 0.8969758749008179, "logps/chosen": -151.7926788330078, "logps/ref_chosen": -57.809539794921875, "logps/ref_rejected": -81.67845153808594, "logps/rejected": -253.80831909179688, "loss": 0.8738, "margin_dpo/margin_mean": 78.146728515625, "margin_dpo/margin_std": 85.02195739746094, "step": 408 }, { "epoch": 0.618291761148904, "grad_norm": 18.92486000061035, "learning_rate": 1.9234603231438994e-07, "logits/chosen": 0.9477880001068115, "logits/rejected": 1.024693250656128, "logps/chosen": -208.4686279296875, "logps/ref_chosen": -77.07215118408203, "logps/ref_rejected": -52.54692459106445, "logps/rejected": -207.8866729736328, "loss": 1.1856, "margin_dpo/margin_mean": 23.943286895751953, "margin_dpo/margin_std": 57.939476013183594, "step": 409 }, { "epoch": 0.6198034769463341, "grad_norm": 16.562345504760742, "learning_rate": 1.9106026612264315e-07, "logits/chosen": 1.0837414264678955, "logits/rejected": 1.0848373174667358, "logps/chosen": -157.5943603515625, "logps/ref_chosen": -44.102970123291016, "logps/ref_rejected": -48.312713623046875, "logps/rejected": -213.13218688964844, "loss": 1.0328, "margin_dpo/margin_mean": 51.32807159423828, "margin_dpo/margin_std": 73.68001556396484, "step": 410 }, { "epoch": 0.6213151927437641, "grad_norm": 15.483366012573242, "learning_rate": 1.8977614860195296e-07, "logits/chosen": 0.9602583646774292, "logits/rejected": 0.9861100316047668, "logps/chosen": -207.64120483398438, "logps/ref_chosen": -68.72139739990234, "logps/ref_rejected": -60.70808792114258, "logps/rejected": -261.77276611328125, "loss": 1.0486, "margin_dpo/margin_mean": 62.14488983154297, "margin_dpo/margin_std": 83.70924377441406, "step": 411 }, { "epoch": 0.6228269085411943, "grad_norm": 18.00762176513672, "learning_rate": 1.8849371567184662e-07, "logits/chosen": 0.9532754421234131, "logits/rejected": 0.9993252754211426, "logps/chosen": -164.32028198242188, "logps/ref_chosen": -48.907501220703125, "logps/ref_rejected": -40.223628997802734, "logps/rejected": -204.09124755859375, "loss": 1.0641, "margin_dpo/margin_mean": 48.454830169677734, "margin_dpo/margin_std": 77.01388549804688, "step": 412 }, { "epoch": 0.6243386243386243, "grad_norm": 18.54705810546875, "learning_rate": 1.872130032047302e-07, "logits/chosen": 0.7836633324623108, "logits/rejected": 0.6924010515213013, "logps/chosen": -224.32937622070312, "logps/ref_chosen": -66.48075103759766, "logps/ref_rejected": -88.64950561523438, "logps/rejected": -281.3914794921875, "loss": 1.1014, "margin_dpo/margin_mean": 34.89335250854492, "margin_dpo/margin_std": 108.35519409179688, "step": 413 }, { "epoch": 0.6258503401360545, "grad_norm": 17.233596801757812, "learning_rate": 1.8593404702488436e-07, "logits/chosen": 0.8586117029190063, "logits/rejected": 0.8090198040008545, "logps/chosen": -217.32992553710938, "logps/ref_chosen": -64.2005386352539, "logps/ref_rejected": -92.14444732666016, "logps/rejected": -309.4424743652344, "loss": 0.9987, "margin_dpo/margin_mean": 64.16864013671875, "margin_dpo/margin_std": 81.55545806884766, "step": 414 }, { "epoch": 0.6273620559334845, "grad_norm": 15.781834602355957, "learning_rate": 1.846568829074628e-07, "logits/chosen": 0.8974360227584839, "logits/rejected": 1.0069502592086792, "logps/chosen": -171.6870880126953, "logps/ref_chosen": -58.819007873535156, "logps/ref_rejected": -41.336639404296875, "logps/rejected": -227.85916137695312, "loss": 1.1027, "margin_dpo/margin_mean": 73.65443420410156, "margin_dpo/margin_std": 78.06534576416016, "step": 415 }, { "epoch": 0.6288737717309146, "grad_norm": 18.85218620300293, "learning_rate": 1.8338154657749128e-07, "logits/chosen": 0.892058253288269, "logits/rejected": 0.7859885692596436, "logps/chosen": -194.0150146484375, "logps/ref_chosen": -53.452110290527344, "logps/ref_rejected": -97.50613403320312, "logps/rejected": -326.833984375, "loss": 1.1854, "margin_dpo/margin_mean": 88.76496124267578, "margin_dpo/margin_std": 134.0952911376953, "step": 416 }, { "epoch": 0.6303854875283447, "grad_norm": 16.071107864379883, "learning_rate": 1.8210807370886849e-07, "logits/chosen": 0.9685580730438232, "logits/rejected": 0.9870299100875854, "logps/chosen": -228.75973510742188, "logps/ref_chosen": -75.47906494140625, "logps/ref_rejected": -67.37366485595703, "logps/rejected": -281.687255859375, "loss": 1.0758, "margin_dpo/margin_mean": 61.032928466796875, "margin_dpo/margin_std": 164.45205688476562, "step": 417 }, { "epoch": 0.6318972033257747, "grad_norm": 31.492109298706055, "learning_rate": 1.8083649992336825e-07, "logits/chosen": 1.0411242246627808, "logits/rejected": 0.9932562112808228, "logps/chosen": -163.39306640625, "logps/ref_chosen": -51.03925323486328, "logps/ref_rejected": -80.96292877197266, "logps/rejected": -269.3048095703125, "loss": 1.2937, "margin_dpo/margin_mean": 75.98806762695312, "margin_dpo/margin_std": 79.07701110839844, "step": 418 }, { "epoch": 0.6334089191232048, "grad_norm": 15.3997802734375, "learning_rate": 1.7956686078964255e-07, "logits/chosen": 0.9788599014282227, "logits/rejected": 0.8751944303512573, "logps/chosen": -196.85659790039062, "logps/ref_chosen": -60.755767822265625, "logps/ref_rejected": -77.95507049560547, "logps/rejected": -270.27056884765625, "loss": 0.9464, "margin_dpo/margin_mean": 56.214637756347656, "margin_dpo/margin_std": 98.2620849609375, "step": 419 }, { "epoch": 0.6349206349206349, "grad_norm": 16.48794937133789, "learning_rate": 1.782991918222275e-07, "logits/chosen": 0.9061227440834045, "logits/rejected": 0.9082077741622925, "logps/chosen": -246.45130920410156, "logps/ref_chosen": -64.6197738647461, "logps/ref_rejected": -65.47144317626953, "logps/rejected": -271.0919494628906, "loss": 1.2391, "margin_dpo/margin_mean": 23.788970947265625, "margin_dpo/margin_std": 90.23184204101562, "step": 420 }, { "epoch": 0.636432350718065, "grad_norm": 20.588409423828125, "learning_rate": 1.7703352848054887e-07, "logits/chosen": 1.1465280055999756, "logits/rejected": 0.9681707620620728, "logps/chosen": -153.81906127929688, "logps/ref_chosen": -37.7196159362793, "logps/ref_rejected": -102.12132263183594, "logps/rejected": -318.0316162109375, "loss": 1.1831, "margin_dpo/margin_mean": 99.81085205078125, "margin_dpo/margin_std": 135.99864196777344, "step": 421 }, { "epoch": 0.6379440665154951, "grad_norm": 15.120969772338867, "learning_rate": 1.7576990616793137e-07, "logits/chosen": 0.988216757774353, "logits/rejected": 0.943016529083252, "logps/chosen": -174.78521728515625, "logps/ref_chosen": -70.57130432128906, "logps/ref_rejected": -81.15480041503906, "logps/rejected": -301.62823486328125, "loss": 0.9731, "margin_dpo/margin_mean": 116.259521484375, "margin_dpo/margin_std": 165.31149291992188, "step": 422 }, { "epoch": 0.6394557823129252, "grad_norm": 18.961877822875977, "learning_rate": 1.745083602306071e-07, "logits/chosen": 1.0410032272338867, "logits/rejected": 1.0402342081069946, "logps/chosen": -181.06886291503906, "logps/ref_chosen": -65.94102478027344, "logps/ref_rejected": -58.68115234375, "logps/rejected": -218.81155395507812, "loss": 0.968, "margin_dpo/margin_mean": 45.00257873535156, "margin_dpo/margin_std": 87.43341827392578, "step": 423 }, { "epoch": 0.6409674981103552, "grad_norm": 16.557573318481445, "learning_rate": 1.7324892595672804e-07, "logits/chosen": 0.966809868812561, "logits/rejected": 0.9489647150039673, "logps/chosen": -159.09210205078125, "logps/ref_chosen": -47.5775032043457, "logps/ref_rejected": -81.55694580078125, "logps/rejected": -263.04534912109375, "loss": 0.9385, "margin_dpo/margin_mean": 69.97380828857422, "margin_dpo/margin_std": 96.92681884765625, "step": 424 }, { "epoch": 0.6424792139077853, "grad_norm": 18.266592025756836, "learning_rate": 1.7199163857537824e-07, "logits/chosen": 0.9221489429473877, "logits/rejected": 0.9283965826034546, "logps/chosen": -210.3824462890625, "logps/ref_chosen": -73.816650390625, "logps/ref_rejected": -68.2657470703125, "logps/rejected": -270.66265869140625, "loss": 1.0809, "margin_dpo/margin_mean": 65.83113861083984, "margin_dpo/margin_std": 87.40162658691406, "step": 425 }, { "epoch": 0.6439909297052154, "grad_norm": 23.678329467773438, "learning_rate": 1.7073653325558828e-07, "logits/chosen": 0.7612646818161011, "logits/rejected": 0.8192156553268433, "logps/chosen": -212.634033203125, "logps/ref_chosen": -73.34886169433594, "logps/ref_rejected": -49.84626007080078, "logps/rejected": -236.68417358398438, "loss": 1.28, "margin_dpo/margin_mean": 47.552730560302734, "margin_dpo/margin_std": 78.64155578613281, "step": 426 }, { "epoch": 0.6455026455026455, "grad_norm": 18.935203552246094, "learning_rate": 1.6948364510535218e-07, "logits/chosen": 0.8741757869720459, "logits/rejected": 0.8858860731124878, "logps/chosen": -192.26693725585938, "logps/ref_chosen": -59.81298828125, "logps/ref_rejected": -72.67082214355469, "logps/rejected": -229.54574584960938, "loss": 1.1213, "margin_dpo/margin_mean": 24.420982360839844, "margin_dpo/margin_std": 100.69892883300781, "step": 427 }, { "epoch": 0.6470143613000756, "grad_norm": 14.1466646194458, "learning_rate": 1.6823300917064458e-07, "logits/chosen": 0.810982346534729, "logits/rejected": 0.8677164316177368, "logps/chosen": -231.75439453125, "logps/ref_chosen": -75.25834655761719, "logps/ref_rejected": -71.87213134765625, "logps/rejected": -261.0834655761719, "loss": 1.0756, "margin_dpo/margin_mean": 32.71527099609375, "margin_dpo/margin_std": 101.5775146484375, "step": 428 }, { "epoch": 0.6485260770975056, "grad_norm": 18.27950668334961, "learning_rate": 1.669846604344412e-07, "logits/chosen": 0.8632928133010864, "logits/rejected": 0.9631503820419312, "logps/chosen": -232.76438903808594, "logps/ref_chosen": -85.73371887207031, "logps/ref_rejected": -54.903968811035156, "logps/rejected": -241.41387939453125, "loss": 1.0955, "margin_dpo/margin_mean": 39.47923278808594, "margin_dpo/margin_std": 66.2182846069336, "step": 429 }, { "epoch": 0.6500377928949358, "grad_norm": 17.305450439453125, "learning_rate": 1.6573863381573954e-07, "logits/chosen": 0.8608442544937134, "logits/rejected": 0.8532319068908691, "logps/chosen": -167.2602081298828, "logps/ref_chosen": -54.592891693115234, "logps/ref_rejected": -57.52851867675781, "logps/rejected": -240.46214294433594, "loss": 0.9472, "margin_dpo/margin_mean": 70.26631164550781, "margin_dpo/margin_std": 83.55381774902344, "step": 430 }, { "epoch": 0.6515495086923658, "grad_norm": 15.106849670410156, "learning_rate": 1.6449496416858282e-07, "logits/chosen": 0.9315043091773987, "logits/rejected": 0.8595200777053833, "logps/chosen": -139.00140380859375, "logps/ref_chosen": -28.757225036621094, "logps/ref_rejected": -60.048500061035156, "logps/rejected": -241.73719787597656, "loss": 1.0949, "margin_dpo/margin_mean": 71.44451141357422, "margin_dpo/margin_std": 95.85964965820312, "step": 431 }, { "epoch": 0.6530612244897959, "grad_norm": 15.847650527954102, "learning_rate": 1.632536862810844e-07, "logits/chosen": 0.9007859230041504, "logits/rejected": 0.9773823022842407, "logps/chosen": -222.88815307617188, "logps/ref_chosen": -72.49076080322266, "logps/ref_rejected": -61.208106994628906, "logps/rejected": -219.5262451171875, "loss": 1.0237, "margin_dpo/margin_mean": 7.920762062072754, "margin_dpo/margin_std": 85.05551147460938, "step": 432 }, { "epoch": 0.654572940287226, "grad_norm": 18.91417121887207, "learning_rate": 1.6201483487445515e-07, "logits/chosen": 1.0263733863830566, "logits/rejected": 1.010549545288086, "logps/chosen": -214.4171905517578, "logps/ref_chosen": -74.03857421875, "logps/ref_rejected": -75.64851379394531, "logps/rejected": -267.9232177734375, "loss": 0.9721, "margin_dpo/margin_mean": 51.896095275878906, "margin_dpo/margin_std": 69.40410614013672, "step": 433 }, { "epoch": 0.656084656084656, "grad_norm": 14.838190078735352, "learning_rate": 1.6077844460203204e-07, "logits/chosen": 0.9536264538764954, "logits/rejected": 0.9389366507530212, "logps/chosen": -183.00430297851562, "logps/ref_chosen": -56.56264114379883, "logps/ref_rejected": -75.03836822509766, "logps/rejected": -236.72100830078125, "loss": 1.0005, "margin_dpo/margin_mean": 35.24098587036133, "margin_dpo/margin_std": 116.1950912475586, "step": 434 }, { "epoch": 0.6575963718820862, "grad_norm": 14.722707748413086, "learning_rate": 1.5954455004830878e-07, "logits/chosen": 1.0074257850646973, "logits/rejected": 0.9908655881881714, "logps/chosen": -178.95724487304688, "logps/ref_chosen": -52.70317840576172, "logps/ref_rejected": -59.57474899291992, "logps/rejected": -245.07876586914062, "loss": 1.1085, "margin_dpo/margin_mean": 59.24994659423828, "margin_dpo/margin_std": 70.93247985839844, "step": 435 }, { "epoch": 0.6591080876795162, "grad_norm": 21.451398849487305, "learning_rate": 1.5831318572796847e-07, "logits/chosen": 0.8413320183753967, "logits/rejected": 0.8829286098480225, "logps/chosen": -164.8529815673828, "logps/ref_chosen": -54.026947021484375, "logps/ref_rejected": -50.91650390625, "logps/rejected": -223.85400390625, "loss": 1.1831, "margin_dpo/margin_mean": 62.111473083496094, "margin_dpo/margin_std": 77.27197265625, "step": 436 }, { "epoch": 0.6606198034769464, "grad_norm": 15.607953071594238, "learning_rate": 1.5708438608491815e-07, "logits/chosen": 0.8712018728256226, "logits/rejected": 0.796289324760437, "logps/chosen": -210.6629180908203, "logps/ref_chosen": -65.94082641601562, "logps/ref_rejected": -101.48641967773438, "logps/rejected": -329.0594482421875, "loss": 1.1607, "margin_dpo/margin_mean": 82.8509292602539, "margin_dpo/margin_std": 156.46224975585938, "step": 437 }, { "epoch": 0.6621315192743764, "grad_norm": 18.58790397644043, "learning_rate": 1.558581854913253e-07, "logits/chosen": 0.9901245832443237, "logits/rejected": 0.8835450410842896, "logps/chosen": -163.1690673828125, "logps/ref_chosen": -37.30860137939453, "logps/ref_rejected": -83.07009887695312, "logps/rejected": -284.5603942871094, "loss": 1.0057, "margin_dpo/margin_mean": 75.62983703613281, "margin_dpo/margin_std": 144.90997314453125, "step": 438 }, { "epoch": 0.6636432350718064, "grad_norm": 14.376500129699707, "learning_rate": 1.5463461824665658e-07, "logits/chosen": 0.9146884679794312, "logits/rejected": 0.8561520576477051, "logps/chosen": -174.28402709960938, "logps/ref_chosen": -58.679443359375, "logps/ref_rejected": -86.585693359375, "logps/rejected": -287.9344177246094, "loss": 0.9811, "margin_dpo/margin_mean": 85.74412536621094, "margin_dpo/margin_std": 75.79908752441406, "step": 439 }, { "epoch": 0.6651549508692366, "grad_norm": 23.994163513183594, "learning_rate": 1.534137185767178e-07, "logits/chosen": 0.90851229429245, "logits/rejected": 0.7904536724090576, "logps/chosen": -171.59495544433594, "logps/ref_chosen": -52.94178009033203, "logps/ref_rejected": -108.83475494384766, "logps/rejected": -340.7264404296875, "loss": 1.0222, "margin_dpo/margin_mean": 113.23851013183594, "margin_dpo/margin_std": 85.38230895996094, "step": 440 }, { "epoch": 0.6666666666666666, "grad_norm": 16.344532012939453, "learning_rate": 1.521955206326976e-07, "logits/chosen": 0.9741239547729492, "logits/rejected": 0.7976517081260681, "logps/chosen": -146.74644470214844, "logps/ref_chosen": -43.878997802734375, "logps/ref_rejected": -85.12787628173828, "logps/rejected": -268.0721130371094, "loss": 0.9318, "margin_dpo/margin_mean": 80.07679748535156, "margin_dpo/margin_std": 80.79681396484375, "step": 441 }, { "epoch": 0.6681783824640968, "grad_norm": 16.937883377075195, "learning_rate": 1.5098005849021078e-07, "logits/chosen": 0.9499423503875732, "logits/rejected": 0.8875406980514526, "logps/chosen": -202.962158203125, "logps/ref_chosen": -82.47845458984375, "logps/ref_rejected": -119.25947570800781, "logps/rejected": -351.79766845703125, "loss": 1.0069, "margin_dpo/margin_mean": 112.05450439453125, "margin_dpo/margin_std": 77.75383758544922, "step": 442 }, { "epoch": 0.6696900982615268, "grad_norm": 15.244129180908203, "learning_rate": 1.4976736614834662e-07, "logits/chosen": 0.7210106253623962, "logits/rejected": 0.6923055648803711, "logps/chosen": -197.3117218017578, "logps/ref_chosen": -68.7590560913086, "logps/ref_rejected": -75.88587951660156, "logps/rejected": -294.58660888671875, "loss": 0.8711, "margin_dpo/margin_mean": 90.1480712890625, "margin_dpo/margin_std": 78.41068267822266, "step": 443 }, { "epoch": 0.671201814058957, "grad_norm": 21.625139236450195, "learning_rate": 1.4855747752871654e-07, "logits/chosen": 0.8997494578361511, "logits/rejected": 0.8529607057571411, "logps/chosen": -243.5794677734375, "logps/ref_chosen": -69.02406311035156, "logps/ref_rejected": -82.6009750366211, "logps/rejected": -275.14569091796875, "loss": 1.2998, "margin_dpo/margin_mean": 17.989328384399414, "margin_dpo/margin_std": 90.46211242675781, "step": 444 }, { "epoch": 0.672713529856387, "grad_norm": 18.841861724853516, "learning_rate": 1.473504264745062e-07, "logits/chosen": 0.9031407833099365, "logits/rejected": 0.9619897603988647, "logps/chosen": -269.9713439941406, "logps/ref_chosen": -90.35289001464844, "logps/ref_rejected": -69.4399185180664, "logps/rejected": -283.7539367675781, "loss": 0.9734, "margin_dpo/margin_mean": 34.695556640625, "margin_dpo/margin_std": 91.03128051757812, "step": 445 }, { "epoch": 0.674225245653817, "grad_norm": 14.573994636535645, "learning_rate": 1.461462467495284e-07, "logits/chosen": 0.8910402655601501, "logits/rejected": 0.8468337655067444, "logps/chosen": -196.3031768798828, "logps/ref_chosen": -68.73054504394531, "logps/ref_rejected": -94.55728149414062, "logps/rejected": -304.6093444824219, "loss": 0.8754, "margin_dpo/margin_mean": 82.47945404052734, "margin_dpo/margin_std": 77.50213623046875, "step": 446 }, { "epoch": 0.6757369614512472, "grad_norm": 19.80596351623535, "learning_rate": 1.4494497203727843e-07, "logits/chosen": 0.9224183559417725, "logits/rejected": 0.8308309316635132, "logps/chosen": -180.90097045898438, "logps/ref_chosen": -63.873809814453125, "logps/ref_rejected": -93.55643463134766, "logps/rejected": -295.10546875, "loss": 0.9013, "margin_dpo/margin_mean": 84.52186584472656, "margin_dpo/margin_std": 88.04441833496094, "step": 447 }, { "epoch": 0.6772486772486772, "grad_norm": 15.010350227355957, "learning_rate": 1.4374663593999256e-07, "logits/chosen": 1.0064988136291504, "logits/rejected": 0.9316452145576477, "logps/chosen": -173.94631958007812, "logps/ref_chosen": -48.06145477294922, "logps/ref_rejected": -74.88455963134766, "logps/rejected": -297.13531494140625, "loss": 1.015, "margin_dpo/margin_mean": 96.36589050292969, "margin_dpo/margin_std": 80.901611328125, "step": 448 }, { "epoch": 0.6787603930461074, "grad_norm": 21.837177276611328, "learning_rate": 1.4255127197770707e-07, "logits/chosen": 0.7422596216201782, "logits/rejected": 0.7190842628479004, "logps/chosen": -238.9716796875, "logps/ref_chosen": -68.59271240234375, "logps/ref_rejected": -88.01607513427734, "logps/rejected": -317.0701599121094, "loss": 1.233, "margin_dpo/margin_mean": 58.67515563964844, "margin_dpo/margin_std": 102.90806579589844, "step": 449 }, { "epoch": 0.6802721088435374, "grad_norm": 16.762248992919922, "learning_rate": 1.4135891358732205e-07, "logits/chosen": 1.0215989351272583, "logits/rejected": 0.838610053062439, "logps/chosen": -167.00927734375, "logps/ref_chosen": -35.716896057128906, "logps/ref_rejected": -95.92900085449219, "logps/rejected": -310.751953125, "loss": 1.049, "margin_dpo/margin_mean": 83.53057861328125, "margin_dpo/margin_std": 82.19971466064453, "step": 450 }, { "epoch": 0.6817838246409675, "grad_norm": 15.963050842285156, "learning_rate": 1.4016959412166437e-07, "logits/chosen": 0.8930540084838867, "logits/rejected": 0.7441238760948181, "logps/chosen": -221.75592041015625, "logps/ref_chosen": -73.0178451538086, "logps/ref_rejected": -109.17976379394531, "logps/rejected": -333.75726318359375, "loss": 1.0905, "margin_dpo/margin_mean": 75.83943939208984, "margin_dpo/margin_std": 87.21731567382812, "step": 451 }, { "epoch": 0.6832955404383976, "grad_norm": 17.484153747558594, "learning_rate": 1.3898334684855645e-07, "logits/chosen": 0.7930479645729065, "logits/rejected": 0.7267175316810608, "logps/chosen": -191.09988403320312, "logps/ref_chosen": -68.31144714355469, "logps/ref_rejected": -97.96285247802734, "logps/rejected": -300.38970947265625, "loss": 1.096, "margin_dpo/margin_mean": 79.63841247558594, "margin_dpo/margin_std": 85.86831665039062, "step": 452 }, { "epoch": 0.6848072562358276, "grad_norm": 24.5980167388916, "learning_rate": 1.3780020494988445e-07, "logits/chosen": 0.8875958919525146, "logits/rejected": 0.8231840133666992, "logps/chosen": -145.66799926757812, "logps/ref_chosen": -51.74473190307617, "logps/ref_rejected": -60.13861083984375, "logps/rejected": -229.53916931152344, "loss": 1.0671, "margin_dpo/margin_mean": 75.477294921875, "margin_dpo/margin_std": 93.73162841796875, "step": 453 }, { "epoch": 0.6863189720332578, "grad_norm": 12.73465633392334, "learning_rate": 1.366202015206706e-07, "logits/chosen": 0.9996371269226074, "logits/rejected": 1.0085588693618774, "logps/chosen": -156.74002075195312, "logps/ref_chosen": -44.98827362060547, "logps/ref_rejected": -50.99993133544922, "logps/rejected": -196.87745666503906, "loss": 1.0144, "margin_dpo/margin_mean": 34.12577819824219, "margin_dpo/margin_std": 120.2644271850586, "step": 454 }, { "epoch": 0.6878306878306878, "grad_norm": 16.96663475036621, "learning_rate": 1.354433695681474e-07, "logits/chosen": 0.8430243134498596, "logits/rejected": 0.8127152919769287, "logps/chosen": -215.993896484375, "logps/ref_chosen": -71.83073425292969, "logps/ref_rejected": -84.22274780273438, "logps/rejected": -321.873046875, "loss": 1.0001, "margin_dpo/margin_mean": 93.48712921142578, "margin_dpo/margin_std": 97.56796264648438, "step": 455 }, { "epoch": 0.6893424036281179, "grad_norm": 21.244468688964844, "learning_rate": 1.3426974201083439e-07, "logits/chosen": 0.8168896436691284, "logits/rejected": 0.7225136756896973, "logps/chosen": -166.49514770507812, "logps/ref_chosen": -37.67304229736328, "logps/ref_rejected": -87.09375, "logps/rejected": -294.4206848144531, "loss": 1.0769, "margin_dpo/margin_mean": 78.50482177734375, "margin_dpo/margin_std": 95.56376647949219, "step": 456 }, { "epoch": 0.690854119425548, "grad_norm": 20.784381866455078, "learning_rate": 1.3309935167761717e-07, "logits/chosen": 0.9731137752532959, "logits/rejected": 0.890478789806366, "logps/chosen": -193.4537353515625, "logps/ref_chosen": -36.889923095703125, "logps/ref_rejected": -71.26411437988281, "logps/rejected": -267.1976623535156, "loss": 1.0515, "margin_dpo/margin_mean": 39.369747161865234, "margin_dpo/margin_std": 71.71172332763672, "step": 457 }, { "epoch": 0.6923658352229781, "grad_norm": 17.790769577026367, "learning_rate": 1.3193223130682936e-07, "logits/chosen": 0.9141269326210022, "logits/rejected": 0.8772815465927124, "logps/chosen": -211.3562774658203, "logps/ref_chosen": -61.25056076049805, "logps/ref_rejected": -70.772216796875, "logps/rejected": -278.8810119628906, "loss": 0.9925, "margin_dpo/margin_mean": 58.00306701660156, "margin_dpo/margin_std": 117.56779479980469, "step": 458 }, { "epoch": 0.6938775510204082, "grad_norm": 15.180387496948242, "learning_rate": 1.3076841354533658e-07, "logits/chosen": 1.0155048370361328, "logits/rejected": 0.9921817779541016, "logps/chosen": -206.2635498046875, "logps/ref_chosen": -73.28994750976562, "logps/ref_rejected": -108.29696655273438, "logps/rejected": -306.605712890625, "loss": 0.9325, "margin_dpo/margin_mean": 65.33513641357422, "margin_dpo/margin_std": 104.4261474609375, "step": 459 }, { "epoch": 0.6953892668178382, "grad_norm": 15.735591888427734, "learning_rate": 1.2960793094762345e-07, "logits/chosen": 0.8665391802787781, "logits/rejected": 0.744087815284729, "logps/chosen": -220.26666259765625, "logps/ref_chosen": -76.68836975097656, "logps/ref_rejected": -99.02154541015625, "logps/rejected": -351.54107666015625, "loss": 0.8928, "margin_dpo/margin_mean": 108.94125366210938, "margin_dpo/margin_std": 133.6810302734375, "step": 460 }, { "epoch": 0.6969009826152683, "grad_norm": 14.483108520507812, "learning_rate": 1.2845081597488286e-07, "logits/chosen": 0.9958152770996094, "logits/rejected": 0.9232292175292969, "logps/chosen": -167.3024139404297, "logps/ref_chosen": -63.83565902709961, "logps/ref_rejected": -87.00081634521484, "logps/rejected": -267.07049560546875, "loss": 0.883, "margin_dpo/margin_mean": 76.60294342041016, "margin_dpo/margin_std": 62.0404167175293, "step": 461 }, { "epoch": 0.6984126984126984, "grad_norm": 18.1173095703125, "learning_rate": 1.27297100994108e-07, "logits/chosen": 0.9541089534759521, "logits/rejected": 0.9522888660430908, "logps/chosen": -175.2286376953125, "logps/ref_chosen": -64.87322998046875, "logps/ref_rejected": -68.76766967773438, "logps/rejected": -291.7859191894531, "loss": 0.9019, "margin_dpo/margin_mean": 112.66285705566406, "margin_dpo/margin_std": 76.85243225097656, "step": 462 }, { "epoch": 0.6999244142101285, "grad_norm": 21.672767639160156, "learning_rate": 1.2614681827718695e-07, "logits/chosen": 0.9698790907859802, "logits/rejected": 1.0222316980361938, "logps/chosen": -219.83706665039062, "logps/ref_chosen": -66.8828125, "logps/ref_rejected": -47.84074783325195, "logps/rejected": -255.33847045898438, "loss": 1.0292, "margin_dpo/margin_mean": 54.54346466064453, "margin_dpo/margin_std": 97.27693176269531, "step": 463 }, { "epoch": 0.7014361300075586, "grad_norm": 16.856752395629883, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 1.0088590383529663, "logits/rejected": 1.0314637422561646, "logps/chosen": -243.63446044921875, "logps/ref_chosen": -90.68864440917969, "logps/ref_rejected": -78.24504852294922, "logps/rejected": -304.3553161621094, "loss": 0.9942, "margin_dpo/margin_mean": 73.16445922851562, "margin_dpo/margin_std": 115.3930435180664, "step": 464 }, { "epoch": 0.7029478458049887, "grad_norm": 18.76186752319336, "learning_rate": 1.238566782415197e-07, "logits/chosen": 0.9639427661895752, "logits/rejected": 0.8763701915740967, "logps/chosen": -196.27841186523438, "logps/ref_chosen": -68.18783569335938, "logps/ref_rejected": -107.69645690917969, "logps/rejected": -325.6939697265625, "loss": 1.0667, "margin_dpo/margin_mean": 89.90691375732422, "margin_dpo/margin_std": 111.15441131591797, "step": 465 }, { "epoch": 0.7044595616024187, "grad_norm": 25.630605697631836, "learning_rate": 1.2271688498291334e-07, "logits/chosen": 0.7273076772689819, "logits/rejected": 0.7946709394454956, "logps/chosen": -287.0624084472656, "logps/ref_chosen": -109.91573333740234, "logps/ref_rejected": -71.43077087402344, "logps/rejected": -251.70777893066406, "loss": 1.2282, "margin_dpo/margin_mean": 3.1303582191467285, "margin_dpo/margin_std": 107.84750366210938, "step": 466 }, { "epoch": 0.7059712773998488, "grad_norm": 19.084739685058594, "learning_rate": 1.2158065210664848e-07, "logits/chosen": 0.861819326877594, "logits/rejected": 0.7350021600723267, "logps/chosen": -200.34225463867188, "logps/ref_chosen": -44.50347900390625, "logps/ref_rejected": -68.12409973144531, "logps/rejected": -296.81658935546875, "loss": 0.9962, "margin_dpo/margin_mean": 72.85372924804688, "margin_dpo/margin_std": 124.93696594238281, "step": 467 }, { "epoch": 0.7074829931972789, "grad_norm": 17.534069061279297, "learning_rate": 1.204480113956011e-07, "logits/chosen": 0.9389829635620117, "logits/rejected": 0.8962525129318237, "logps/chosen": -154.1627197265625, "logps/ref_chosen": -46.26074981689453, "logps/ref_rejected": -77.6624755859375, "logps/rejected": -269.74609375, "loss": 0.9274, "margin_dpo/margin_mean": 84.1816635131836, "margin_dpo/margin_std": 95.72941589355469, "step": 468 }, { "epoch": 0.708994708994709, "grad_norm": 18.879037857055664, "learning_rate": 1.1931899453216697e-07, "logits/chosen": 0.98770672082901, "logits/rejected": 0.9201364517211914, "logps/chosen": -188.12139892578125, "logps/ref_chosen": -62.30226516723633, "logps/ref_rejected": -88.74978637695312, "logps/rejected": -278.533203125, "loss": 0.9651, "margin_dpo/margin_mean": 63.96430206298828, "margin_dpo/margin_std": 68.25845336914062, "step": 469 }, { "epoch": 0.7105064247921391, "grad_norm": 17.858858108520508, "learning_rate": 1.1819363309737438e-07, "logits/chosen": 1.0070170164108276, "logits/rejected": 0.9005523920059204, "logps/chosen": -177.07325744628906, "logps/ref_chosen": -53.556175231933594, "logps/ref_rejected": -83.76905059814453, "logps/rejected": -305.9222717285156, "loss": 1.0157, "margin_dpo/margin_mean": 98.63612365722656, "margin_dpo/margin_std": 116.62651062011719, "step": 470 }, { "epoch": 0.7120181405895691, "grad_norm": 22.893625259399414, "learning_rate": 1.1707195857000215e-07, "logits/chosen": 0.9534423351287842, "logits/rejected": 0.9319720268249512, "logps/chosen": -163.5334014892578, "logps/ref_chosen": -51.11562728881836, "logps/ref_rejected": -63.00703048706055, "logps/rejected": -267.5377197265625, "loss": 0.9278, "margin_dpo/margin_mean": 92.11293029785156, "margin_dpo/margin_std": 100.85304260253906, "step": 471 }, { "epoch": 0.7135298563869993, "grad_norm": 16.294567108154297, "learning_rate": 1.1595400232569768e-07, "logits/chosen": 0.7927904725074768, "logits/rejected": 0.8178097009658813, "logps/chosen": -197.45468139648438, "logps/ref_chosen": -69.28681182861328, "logps/ref_rejected": -85.08757019042969, "logps/rejected": -298.28179931640625, "loss": 1.103, "margin_dpo/margin_mean": 85.0263900756836, "margin_dpo/margin_std": 92.62736511230469, "step": 472 }, { "epoch": 0.7150415721844293, "grad_norm": 19.770648956298828, "learning_rate": 1.1483979563610069e-07, "logits/chosen": 0.9858765602111816, "logits/rejected": 0.7901525497436523, "logps/chosen": -140.59815979003906, "logps/ref_chosen": -34.896080017089844, "logps/ref_rejected": -81.10395812988281, "logps/rejected": -261.3387145996094, "loss": 1.0053, "margin_dpo/margin_mean": 74.53266906738281, "margin_dpo/margin_std": 110.72260284423828, "step": 473 }, { "epoch": 0.7165532879818595, "grad_norm": 23.826879501342773, "learning_rate": 1.1372936966796709e-07, "logits/chosen": 1.108849287033081, "logits/rejected": 1.0479471683502197, "logps/chosen": -195.81671142578125, "logps/ref_chosen": -45.78113555908203, "logps/ref_rejected": -68.88629150390625, "logps/rejected": -293.0316162109375, "loss": 1.1275, "margin_dpo/margin_mean": 74.10977172851562, "margin_dpo/margin_std": 110.27363586425781, "step": 474 }, { "epoch": 0.7180650037792895, "grad_norm": 13.597807884216309, "learning_rate": 1.126227554822985e-07, "logits/chosen": 0.8909091949462891, "logits/rejected": 0.9018880724906921, "logps/chosen": -191.0635223388672, "logps/ref_chosen": -58.526344299316406, "logps/ref_rejected": -76.01811981201172, "logps/rejected": -290.8351745605469, "loss": 0.8632, "margin_dpo/margin_mean": 82.27987670898438, "margin_dpo/margin_std": 109.62013244628906, "step": 475 }, { "epoch": 0.7195767195767195, "grad_norm": 22.143871307373047, "learning_rate": 1.1151998403347243e-07, "logits/chosen": 0.8504942059516907, "logits/rejected": 0.801427960395813, "logps/chosen": -185.2305145263672, "logps/ref_chosen": -54.09751510620117, "logps/ref_rejected": -84.35127258300781, "logps/rejected": -310.95294189453125, "loss": 1.0874, "margin_dpo/margin_mean": 95.46868133544922, "margin_dpo/margin_std": 103.3380126953125, "step": 476 }, { "epoch": 0.7210884353741497, "grad_norm": 20.90325355529785, "learning_rate": 1.1042108616837692e-07, "logits/chosen": 0.9148606657981873, "logits/rejected": 0.8745808601379395, "logps/chosen": -211.44253540039062, "logps/ref_chosen": -67.8685531616211, "logps/ref_rejected": -72.77481842041016, "logps/rejected": -246.86721801757812, "loss": 1.1216, "margin_dpo/margin_mean": 30.518428802490234, "margin_dpo/margin_std": 85.03158569335938, "step": 477 }, { "epoch": 0.7226001511715797, "grad_norm": 19.139371871948242, "learning_rate": 1.0932609262554746e-07, "logits/chosen": 0.8678562641143799, "logits/rejected": 0.8669169545173645, "logps/chosen": -157.50787353515625, "logps/ref_chosen": -56.62529754638672, "logps/ref_rejected": -58.720787048339844, "logps/rejected": -254.533203125, "loss": 1.1308, "margin_dpo/margin_mean": 94.92986297607422, "margin_dpo/margin_std": 100.27252197265625, "step": 478 }, { "epoch": 0.7241118669690099, "grad_norm": 20.085996627807617, "learning_rate": 1.0823503403430734e-07, "logits/chosen": 0.8876814842224121, "logits/rejected": 0.7483628988265991, "logps/chosen": -176.6420135498047, "logps/ref_chosen": -45.32330322265625, "logps/ref_rejected": -55.02246856689453, "logps/rejected": -235.74169921875, "loss": 1.1586, "margin_dpo/margin_mean": 49.40052795410156, "margin_dpo/margin_std": 96.12824249267578, "step": 479 }, { "epoch": 0.7256235827664399, "grad_norm": 21.663545608520508, "learning_rate": 1.0714794091391072e-07, "logits/chosen": 0.8385661244392395, "logits/rejected": 0.8454256057739258, "logps/chosen": -211.0986328125, "logps/ref_chosen": -63.22759246826172, "logps/ref_rejected": -74.44642639160156, "logps/rejected": -270.2560729980469, "loss": 1.0536, "margin_dpo/margin_mean": 47.93859100341797, "margin_dpo/margin_std": 117.68538665771484, "step": 480 }, { "epoch": 0.72713529856387, "grad_norm": 17.779111862182617, "learning_rate": 1.0606484367268906e-07, "logits/chosen": 0.8755944967269897, "logits/rejected": 0.9314774870872498, "logps/chosen": -223.0485076904297, "logps/ref_chosen": -79.15220642089844, "logps/ref_rejected": -68.31973266601562, "logps/rejected": -270.32794189453125, "loss": 1.049, "margin_dpo/margin_mean": 58.11188507080078, "margin_dpo/margin_std": 97.23744201660156, "step": 481 }, { "epoch": 0.7286470143613001, "grad_norm": 24.795852661132812, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 0.8218737840652466, "logits/rejected": 0.7339863777160645, "logps/chosen": -193.96629333496094, "logps/ref_chosen": -54.06950378417969, "logps/ref_rejected": -87.615234375, "logps/rejected": -284.41217041015625, "loss": 1.1686, "margin_dpo/margin_mean": 56.900150299072266, "margin_dpo/margin_std": 103.31169128417969, "step": 482 }, { "epoch": 0.7301587301587301, "grad_norm": 15.781610488891602, "learning_rate": 1.0391075790138232e-07, "logits/chosen": 0.894173800945282, "logits/rejected": 0.9251315593719482, "logps/chosen": -174.34373474121094, "logps/ref_chosen": -54.128578186035156, "logps/ref_rejected": -67.60116577148438, "logps/rejected": -278.50946044921875, "loss": 0.986, "margin_dpo/margin_mean": 90.693115234375, "margin_dpo/margin_std": 83.16683197021484, "step": 483 }, { "epoch": 0.7316704459561603, "grad_norm": 22.09979820251465, "learning_rate": 1.0283982962570681e-07, "logits/chosen": 0.8971969485282898, "logits/rejected": 0.8970204591751099, "logps/chosen": -179.95797729492188, "logps/ref_chosen": -50.43122100830078, "logps/ref_rejected": -66.30445861816406, "logps/rejected": -237.34353637695312, "loss": 1.0404, "margin_dpo/margin_mean": 41.512306213378906, "margin_dpo/margin_std": 72.52426147460938, "step": 484 }, { "epoch": 0.7331821617535903, "grad_norm": 17.71631622314453, "learning_rate": 1.0177301773633992e-07, "logits/chosen": 1.0137141942977905, "logits/rejected": 0.9617197513580322, "logps/chosen": -193.55841064453125, "logps/ref_chosen": -51.1461067199707, "logps/ref_rejected": -61.148231506347656, "logps/rejected": -281.61785888671875, "loss": 1.0243, "margin_dpo/margin_mean": 78.0573501586914, "margin_dpo/margin_std": 87.72028350830078, "step": 485 }, { "epoch": 0.7346938775510204, "grad_norm": 18.199512481689453, "learning_rate": 1.007103520743035e-07, "logits/chosen": 0.9664397239685059, "logits/rejected": 0.8338623046875, "logps/chosen": -239.65943908691406, "logps/ref_chosen": -70.02229309082031, "logps/ref_rejected": -114.05645751953125, "logps/rejected": -355.57293701171875, "loss": 1.0931, "margin_dpo/margin_mean": 71.87936401367188, "margin_dpo/margin_std": 91.45281982421875, "step": 486 }, { "epoch": 0.7362055933484505, "grad_norm": 21.441146850585938, "learning_rate": 9.965186236464046e-08, "logits/chosen": 0.8630800843238831, "logits/rejected": 0.9823142290115356, "logps/chosen": -227.5937042236328, "logps/ref_chosen": -81.48747253417969, "logps/ref_rejected": -57.63191604614258, "logps/rejected": -252.54348754882812, "loss": 1.0282, "margin_dpo/margin_mean": 48.80535125732422, "margin_dpo/margin_std": 73.48953247070312, "step": 487 }, { "epoch": 0.7377173091458806, "grad_norm": 16.673267364501953, "learning_rate": 9.859757821558337e-08, "logits/chosen": 1.1053991317749023, "logits/rejected": 1.0277700424194336, "logps/chosen": -159.07391357421875, "logps/ref_chosen": -40.96950912475586, "logps/ref_rejected": -58.97332000732422, "logps/rejected": -254.75424194335938, "loss": 0.9396, "margin_dpo/margin_mean": 77.67652130126953, "margin_dpo/margin_std": 103.35063934326172, "step": 488 }, { "epoch": 0.7392290249433107, "grad_norm": 17.88896942138672, "learning_rate": 9.754752911772615e-08, "logits/chosen": 0.9406849145889282, "logits/rejected": 0.9168277978897095, "logps/chosen": -206.83840942382812, "logps/ref_chosen": -61.101173400878906, "logps/ref_rejected": -98.35012817382812, "logps/rejected": -302.9677734375, "loss": 1.2049, "margin_dpo/margin_mean": 58.880401611328125, "margin_dpo/margin_std": 115.15332794189453, "step": 489 }, { "epoch": 0.7407407407407407, "grad_norm": 17.050439834594727, "learning_rate": 9.650174444319956e-08, "logits/chosen": 1.056211233139038, "logits/rejected": 1.0357109308242798, "logps/chosen": -221.88174438476562, "logps/ref_chosen": -62.3327751159668, "logps/ref_rejected": -79.33484649658203, "logps/rejected": -289.1636962890625, "loss": 1.1471, "margin_dpo/margin_mean": 50.279884338378906, "margin_dpo/margin_std": 150.81320190429688, "step": 490 }, { "epoch": 0.7422524565381708, "grad_norm": 15.308135032653809, "learning_rate": 9.546025344484868e-08, "logits/chosen": 0.9274444580078125, "logits/rejected": 0.9464068412780762, "logps/chosen": -178.4617919921875, "logps/ref_chosen": -56.26042175292969, "logps/ref_rejected": -60.113643646240234, "logps/rejected": -242.59280395507812, "loss": 1.0716, "margin_dpo/margin_mean": 60.27778625488281, "margin_dpo/margin_std": 79.55590057373047, "step": 491 }, { "epoch": 0.7437641723356009, "grad_norm": 18.683191299438477, "learning_rate": 9.442308525541589e-08, "logits/chosen": 1.0775320529937744, "logits/rejected": 0.8653636574745178, "logps/chosen": -203.3031463623047, "logps/ref_chosen": -45.579383850097656, "logps/ref_rejected": -113.18993377685547, "logps/rejected": -362.9856872558594, "loss": 1.1972, "margin_dpo/margin_mean": 92.07199096679688, "margin_dpo/margin_std": 111.55503845214844, "step": 492 }, { "epoch": 0.745275888133031, "grad_norm": 16.538679122924805, "learning_rate": 9.339026888672468e-08, "logits/chosen": 0.9095848798751831, "logits/rejected": 0.843256950378418, "logps/chosen": -190.7752685546875, "logps/ref_chosen": -59.649986267089844, "logps/ref_rejected": -87.15731811523438, "logps/rejected": -255.50759887695312, "loss": 0.9946, "margin_dpo/margin_mean": 37.224998474121094, "margin_dpo/margin_std": 83.05178833007812, "step": 493 }, { "epoch": 0.7467876039304611, "grad_norm": 22.777530670166016, "learning_rate": 9.236183322886945e-08, "logits/chosen": 0.7308402061462402, "logits/rejected": 0.6717466711997986, "logps/chosen": -180.57354736328125, "logps/ref_chosen": -52.495460510253906, "logps/ref_rejected": -76.69441223144531, "logps/rejected": -251.54196166992188, "loss": 1.1615, "margin_dpo/margin_mean": 46.76948547363281, "margin_dpo/margin_std": 64.22396850585938, "step": 494 }, { "epoch": 0.7482993197278912, "grad_norm": 20.922956466674805, "learning_rate": 9.133780704940594e-08, "logits/chosen": 0.8490627408027649, "logits/rejected": 0.8381332755088806, "logps/chosen": -224.40399169921875, "logps/ref_chosen": -61.861454010009766, "logps/ref_rejected": -66.13208770751953, "logps/rejected": -220.16583251953125, "loss": 1.1718, "margin_dpo/margin_mean": -8.508773803710938, "margin_dpo/margin_std": 103.41836547851562, "step": 495 }, { "epoch": 0.7498110355253212, "grad_norm": 17.408597946166992, "learning_rate": 9.031821899254797e-08, "logits/chosen": 0.9441404938697815, "logits/rejected": 0.7932374477386475, "logps/chosen": -185.04629516601562, "logps/ref_chosen": -45.550537109375, "logps/ref_rejected": -100.87998962402344, "logps/rejected": -331.1201477050781, "loss": 1.0355, "margin_dpo/margin_mean": 90.74439239501953, "margin_dpo/margin_std": 99.05509948730469, "step": 496 }, { "epoch": 0.7513227513227513, "grad_norm": 17.972551345825195, "learning_rate": 8.930309757836516e-08, "logits/chosen": 0.9454480409622192, "logits/rejected": 0.8560576438903809, "logps/chosen": -160.0005645751953, "logps/ref_chosen": -59.57224655151367, "logps/ref_rejected": -98.0445556640625, "logps/rejected": -283.73138427734375, "loss": 1.0466, "margin_dpo/margin_mean": 85.25851440429688, "margin_dpo/margin_std": 80.23634338378906, "step": 497 }, { "epoch": 0.7528344671201814, "grad_norm": 15.286055564880371, "learning_rate": 8.829247120198563e-08, "logits/chosen": 0.9226865768432617, "logits/rejected": 0.8130264282226562, "logps/chosen": -209.95169067382812, "logps/ref_chosen": -62.02671432495117, "logps/ref_rejected": -102.47027587890625, "logps/rejected": -339.2867431640625, "loss": 0.9486, "margin_dpo/margin_mean": 88.89149475097656, "margin_dpo/margin_std": 98.82290649414062, "step": 498 }, { "epoch": 0.7543461829176115, "grad_norm": 19.15370750427246, "learning_rate": 8.728636813280163e-08, "logits/chosen": 0.9353519082069397, "logits/rejected": 0.817765474319458, "logps/chosen": -201.40264892578125, "logps/ref_chosen": -59.132904052734375, "logps/ref_rejected": -109.99728393554688, "logps/rejected": -319.89678955078125, "loss": 1.0559, "margin_dpo/margin_mean": 67.6297607421875, "margin_dpo/margin_std": 111.14754486083984, "step": 499 }, { "epoch": 0.7558578987150416, "grad_norm": 19.469440460205078, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.7339733839035034, "logits/rejected": 0.7467474341392517, "logps/chosen": -233.1659393310547, "logps/ref_chosen": -83.37364959716797, "logps/ref_rejected": -85.28791809082031, "logps/rejected": -321.4836730957031, "loss": 1.0818, "margin_dpo/margin_mean": 86.4034423828125, "margin_dpo/margin_std": 75.89598083496094, "step": 500 }, { "epoch": 0.7573696145124716, "grad_norm": 13.901487350463867, "learning_rate": 8.528784436016878e-08, "logits/chosen": 0.9171326756477356, "logits/rejected": 0.9103375673294067, "logps/chosen": -201.10220336914062, "logps/ref_chosen": -71.76341247558594, "logps/ref_rejected": -77.99368286132812, "logps/rejected": -288.089599609375, "loss": 0.9773, "margin_dpo/margin_mean": 80.75711059570312, "margin_dpo/margin_std": 61.29353713989258, "step": 501 }, { "epoch": 0.7588813303099018, "grad_norm": 16.59943389892578, "learning_rate": 8.4295479559726e-08, "logits/chosen": 0.9081599712371826, "logits/rejected": 0.8390029072761536, "logps/chosen": -177.0962677001953, "logps/ref_chosen": -60.16824722290039, "logps/ref_rejected": -88.61361694335938, "logps/rejected": -263.456787109375, "loss": 1.0216, "margin_dpo/margin_mean": 57.91515350341797, "margin_dpo/margin_std": 97.9842300415039, "step": 502 }, { "epoch": 0.7603930461073318, "grad_norm": 15.813238143920898, "learning_rate": 8.330774987092712e-08, "logits/chosen": 0.9412652254104614, "logits/rejected": 0.9647856950759888, "logps/chosen": -200.7902374267578, "logps/ref_chosen": -64.67495727539062, "logps/ref_rejected": -68.66828918457031, "logps/rejected": -268.06390380859375, "loss": 1.0442, "margin_dpo/margin_mean": 63.280311584472656, "margin_dpo/margin_std": 97.19824981689453, "step": 503 }, { "epoch": 0.7619047619047619, "grad_norm": 15.342427253723145, "learning_rate": 8.232468292269479e-08, "logits/chosen": 0.9019661545753479, "logits/rejected": 0.8831943273544312, "logps/chosen": -195.31814575195312, "logps/ref_chosen": -59.16814041137695, "logps/ref_rejected": -80.79418182373047, "logps/rejected": -313.3367919921875, "loss": 0.88, "margin_dpo/margin_mean": 96.392578125, "margin_dpo/margin_std": 102.2872314453125, "step": 504 }, { "epoch": 0.763416477702192, "grad_norm": 18.31743621826172, "learning_rate": 8.134630621352483e-08, "logits/chosen": 0.8820549249649048, "logits/rejected": 0.8513251543045044, "logps/chosen": -191.5747528076172, "logps/ref_chosen": -70.2003173828125, "logps/ref_rejected": -96.7729721069336, "logps/rejected": -323.912841796875, "loss": 1.1311, "margin_dpo/margin_mean": 105.76544189453125, "margin_dpo/margin_std": 129.75132751464844, "step": 505 }, { "epoch": 0.764928193499622, "grad_norm": 19.81454849243164, "learning_rate": 8.037264711071698e-08, "logits/chosen": 0.9372712969779968, "logits/rejected": 0.9639301300048828, "logps/chosen": -213.41952514648438, "logps/ref_chosen": -77.78474426269531, "logps/ref_rejected": -76.35060119628906, "logps/rejected": -262.79083251953125, "loss": 1.1566, "margin_dpo/margin_mean": 50.80542755126953, "margin_dpo/margin_std": 70.04209899902344, "step": 506 }, { "epoch": 0.7664399092970522, "grad_norm": 16.390592575073242, "learning_rate": 7.940373284960933e-08, "logits/chosen": 0.8365110158920288, "logits/rejected": 0.869307279586792, "logps/chosen": -263.01031494140625, "logps/ref_chosen": -102.45763397216797, "logps/ref_rejected": -107.32395935058594, "logps/rejected": -325.25531005859375, "loss": 1.0336, "margin_dpo/margin_mean": 57.37864685058594, "margin_dpo/margin_std": 107.75471496582031, "step": 507 }, { "epoch": 0.7679516250944822, "grad_norm": 18.062192916870117, "learning_rate": 7.843959053281663e-08, "logits/chosen": 0.9670290946960449, "logits/rejected": 0.8693418502807617, "logps/chosen": -160.03451538085938, "logps/ref_chosen": -54.93028259277344, "logps/ref_rejected": -94.2146987915039, "logps/rejected": -347.9501953125, "loss": 0.9714, "margin_dpo/margin_mean": 148.63128662109375, "margin_dpo/margin_std": 169.13424682617188, "step": 508 }, { "epoch": 0.7694633408919124, "grad_norm": 21.243953704833984, "learning_rate": 7.748024712947204e-08, "logits/chosen": 0.7485306262969971, "logits/rejected": 0.8197051286697388, "logps/chosen": -231.2813262939453, "logps/ref_chosen": -68.31385803222656, "logps/ref_rejected": -49.03086853027344, "logps/rejected": -233.74880981445312, "loss": 1.0812, "margin_dpo/margin_mean": 21.75048065185547, "margin_dpo/margin_std": 92.87283325195312, "step": 509 }, { "epoch": 0.7709750566893424, "grad_norm": 18.86398696899414, "learning_rate": 7.652572947447272e-08, "logits/chosen": 0.8953909873962402, "logits/rejected": 0.8276166319847107, "logps/chosen": -233.6016845703125, "logps/ref_chosen": -73.07958984375, "logps/ref_rejected": -96.33815002441406, "logps/rejected": -307.2120666503906, "loss": 1.0692, "margin_dpo/margin_mean": 50.35181427001953, "margin_dpo/margin_std": 74.205322265625, "step": 510 }, { "epoch": 0.7724867724867724, "grad_norm": 21.314716339111328, "learning_rate": 7.557606426772961e-08, "logits/chosen": 1.0289305448532104, "logits/rejected": 0.9097828269004822, "logps/chosen": -169.629638671875, "logps/ref_chosen": -42.50914764404297, "logps/ref_rejected": -75.06625366210938, "logps/rejected": -301.7984619140625, "loss": 0.9783, "margin_dpo/margin_mean": 99.61170959472656, "margin_dpo/margin_std": 98.32453155517578, "step": 511 }, { "epoch": 0.7739984882842026, "grad_norm": 31.400192260742188, "learning_rate": 7.463127807341966e-08, "logits/chosen": 0.9768285155296326, "logits/rejected": 1.0176584720611572, "logps/chosen": -181.43753051757812, "logps/ref_chosen": -56.2095947265625, "logps/ref_rejected": -53.511173248291016, "logps/rejected": -220.98828125, "loss": 1.1303, "margin_dpo/margin_mean": 42.249168395996094, "margin_dpo/margin_std": 113.38427734375, "step": 512 }, { "epoch": 0.7755102040816326, "grad_norm": 18.380573272705078, "learning_rate": 7.369139731924401e-08, "logits/chosen": 1.0797477960586548, "logits/rejected": 1.0573055744171143, "logps/chosen": -140.5891571044922, "logps/ref_chosen": -38.419105529785156, "logps/ref_rejected": -52.706207275390625, "logps/rejected": -238.41525268554688, "loss": 0.9919, "margin_dpo/margin_mean": 83.53899383544922, "margin_dpo/margin_std": 82.92984008789062, "step": 513 }, { "epoch": 0.7770219198790628, "grad_norm": 18.939552307128906, "learning_rate": 7.275644829568747e-08, "logits/chosen": 0.9963523149490356, "logits/rejected": 0.9776211977005005, "logps/chosen": -203.09329223632812, "logps/ref_chosen": -62.999786376953125, "logps/ref_rejected": -66.38030242919922, "logps/rejected": -280.66339111328125, "loss": 1.0044, "margin_dpo/margin_mean": 74.18960571289062, "margin_dpo/margin_std": 45.82038116455078, "step": 514 }, { "epoch": 0.7785336356764928, "grad_norm": 18.924774169921875, "learning_rate": 7.182645715528435e-08, "logits/chosen": 1.1489193439483643, "logits/rejected": 1.0843408107757568, "logps/chosen": -159.77142333984375, "logps/ref_chosen": -38.9024658203125, "logps/ref_rejected": -66.47667694091797, "logps/rejected": -273.12286376953125, "loss": 1.0422, "margin_dpo/margin_mean": 85.77719116210938, "margin_dpo/margin_std": 76.35650634765625, "step": 515 }, { "epoch": 0.780045351473923, "grad_norm": 17.709348678588867, "learning_rate": 7.090144991188568e-08, "logits/chosen": 0.980697751045227, "logits/rejected": 0.9438163042068481, "logps/chosen": -185.6435546875, "logps/ref_chosen": -56.46565246582031, "logps/ref_rejected": -69.74240112304688, "logps/rejected": -303.88299560546875, "loss": 1.0754, "margin_dpo/margin_mean": 104.96269989013672, "margin_dpo/margin_std": 115.3629150390625, "step": 516 }, { "epoch": 0.781557067271353, "grad_norm": 19.174524307250977, "learning_rate": 6.998145243993284e-08, "logits/chosen": 1.0660595893859863, "logits/rejected": 1.0764927864074707, "logps/chosen": -227.71185302734375, "logps/ref_chosen": -54.366859436035156, "logps/ref_rejected": -43.79294204711914, "logps/rejected": -237.5216064453125, "loss": 1.1571, "margin_dpo/margin_mean": 20.383655548095703, "margin_dpo/margin_std": 77.95108032226562, "step": 517 }, { "epoch": 0.783068783068783, "grad_norm": 17.056671142578125, "learning_rate": 6.906649047373245e-08, "logits/chosen": 0.927460789680481, "logits/rejected": 0.9620521068572998, "logps/chosen": -204.52810668945312, "logps/ref_chosen": -62.31662368774414, "logps/ref_rejected": -56.51953887939453, "logps/rejected": -246.00726318359375, "loss": 1.0561, "margin_dpo/margin_mean": 47.2762451171875, "margin_dpo/margin_std": 91.59005737304688, "step": 518 }, { "epoch": 0.7845804988662132, "grad_norm": 21.632381439208984, "learning_rate": 6.815658960673781e-08, "logits/chosen": 0.9049865007400513, "logits/rejected": 0.8612052202224731, "logps/chosen": -216.9791717529297, "logps/ref_chosen": -56.87085723876953, "logps/ref_rejected": -81.93241882324219, "logps/rejected": -281.4555969238281, "loss": 1.3521, "margin_dpo/margin_mean": 39.41484069824219, "margin_dpo/margin_std": 104.36680603027344, "step": 519 }, { "epoch": 0.7860922146636432, "grad_norm": 17.4406795501709, "learning_rate": 6.725177529083209e-08, "logits/chosen": 1.0133739709854126, "logits/rejected": 0.9440522789955139, "logps/chosen": -191.7762908935547, "logps/ref_chosen": -57.21955108642578, "logps/ref_rejected": -66.96128845214844, "logps/rejected": -295.70098876953125, "loss": 1.0819, "margin_dpo/margin_mean": 94.18293762207031, "margin_dpo/margin_std": 142.52069091796875, "step": 520 }, { "epoch": 0.7876039304610734, "grad_norm": 15.527779579162598, "learning_rate": 6.63520728356167e-08, "logits/chosen": 0.6215152740478516, "logits/rejected": 0.6250849962234497, "logps/chosen": -193.13436889648438, "logps/ref_chosen": -82.06413269042969, "logps/ref_rejected": -93.37128448486328, "logps/rejected": -282.43743896484375, "loss": 0.9091, "margin_dpo/margin_mean": 77.99594116210938, "margin_dpo/margin_std": 60.643978118896484, "step": 521 }, { "epoch": 0.7891156462585034, "grad_norm": 17.787490844726562, "learning_rate": 6.545750740770336e-08, "logits/chosen": 0.9250505566596985, "logits/rejected": 0.8676795959472656, "logps/chosen": -171.79269409179688, "logps/ref_chosen": -36.52648162841797, "logps/ref_rejected": -69.4611587524414, "logps/rejected": -259.8004455566406, "loss": 1.2173, "margin_dpo/margin_mean": 55.07307434082031, "margin_dpo/margin_std": 102.25482940673828, "step": 522 }, { "epoch": 0.7906273620559335, "grad_norm": 19.3404483795166, "learning_rate": 6.456810403001012e-08, "logits/chosen": 1.0570783615112305, "logits/rejected": 0.8110660910606384, "logps/chosen": -182.0962371826172, "logps/ref_chosen": -43.33647918701172, "logps/ref_rejected": -82.51911926269531, "logps/rejected": -322.3846130371094, "loss": 1.0791, "margin_dpo/margin_mean": 101.10575103759766, "margin_dpo/margin_std": 114.18901062011719, "step": 523 }, { "epoch": 0.7921390778533636, "grad_norm": 18.590288162231445, "learning_rate": 6.368388758106134e-08, "logits/chosen": 0.7667090892791748, "logits/rejected": 0.7514083385467529, "logps/chosen": -188.31198120117188, "logps/ref_chosen": -71.81331634521484, "logps/ref_rejected": -80.51419067382812, "logps/rejected": -263.0672302246094, "loss": 1.0755, "margin_dpo/margin_mean": 66.05437469482422, "margin_dpo/margin_std": 116.67527770996094, "step": 524 }, { "epoch": 0.7936507936507936, "grad_norm": 19.926000595092773, "learning_rate": 6.280488279429185e-08, "logits/chosen": 0.6996693015098572, "logits/rejected": 0.661113977432251, "logps/chosen": -237.54904174804688, "logps/ref_chosen": -86.49774169921875, "logps/ref_rejected": -98.95057678222656, "logps/rejected": -275.597412109375, "loss": 1.1514, "margin_dpo/margin_mean": 25.595535278320312, "margin_dpo/margin_std": 67.40962219238281, "step": 525 }, { "epoch": 0.7951625094482238, "grad_norm": 18.31951332092285, "learning_rate": 6.193111425735515e-08, "logits/chosen": 0.8428486585617065, "logits/rejected": 0.7279390096664429, "logps/chosen": -215.59396362304688, "logps/ref_chosen": -71.5922622680664, "logps/ref_rejected": -89.43500518798828, "logps/rejected": -250.84075927734375, "loss": 1.1416, "margin_dpo/margin_mean": 17.404056549072266, "margin_dpo/margin_std": 81.23014831542969, "step": 526 }, { "epoch": 0.7966742252456538, "grad_norm": 21.556499481201172, "learning_rate": 6.106260641143546e-08, "logits/chosen": 1.0022237300872803, "logits/rejected": 0.8509221076965332, "logps/chosen": -152.88330078125, "logps/ref_chosen": -39.413856506347656, "logps/ref_rejected": -96.52778625488281, "logps/rejected": -277.23040771484375, "loss": 1.2379, "margin_dpo/margin_mean": 67.23316192626953, "margin_dpo/margin_std": 81.01995086669922, "step": 527 }, { "epoch": 0.7981859410430839, "grad_norm": 22.758813858032227, "learning_rate": 6.019938355056422e-08, "logits/chosen": 0.6834633350372314, "logits/rejected": 0.8445563316345215, "logps/chosen": -205.08164978027344, "logps/ref_chosen": -77.37605285644531, "logps/ref_rejected": -46.57087707519531, "logps/rejected": -181.67904663085938, "loss": 1.249, "margin_dpo/margin_mean": 7.4025726318359375, "margin_dpo/margin_std": 108.43580627441406, "step": 528 }, { "epoch": 0.799697656840514, "grad_norm": 14.703264236450195, "learning_rate": 5.934146982094049e-08, "logits/chosen": 0.8407646417617798, "logits/rejected": 0.8265160322189331, "logps/chosen": -189.27963256835938, "logps/ref_chosen": -58.75701141357422, "logps/ref_rejected": -71.7961196899414, "logps/rejected": -265.8705749511719, "loss": 0.8373, "margin_dpo/margin_mean": 63.551815032958984, "margin_dpo/margin_std": 80.8453598022461, "step": 529 }, { "epoch": 0.8012093726379441, "grad_norm": 16.264793395996094, "learning_rate": 5.848888922025552e-08, "logits/chosen": 1.0589945316314697, "logits/rejected": 0.912827730178833, "logps/chosen": -172.25611877441406, "logps/ref_chosen": -45.41581726074219, "logps/ref_rejected": -80.25865173339844, "logps/rejected": -290.4316711425781, "loss": 1.1008, "margin_dpo/margin_mean": 83.33270263671875, "margin_dpo/margin_std": 107.66976165771484, "step": 530 }, { "epoch": 0.8027210884353742, "grad_norm": 18.56505584716797, "learning_rate": 5.7641665597021435e-08, "logits/chosen": 1.0082396268844604, "logits/rejected": 0.9188197255134583, "logps/chosen": -170.84205627441406, "logps/ref_chosen": -49.827571868896484, "logps/ref_rejected": -87.68862915039062, "logps/rejected": -295.68585205078125, "loss": 1.1211, "margin_dpo/margin_mean": 86.98274993896484, "margin_dpo/margin_std": 96.60000610351562, "step": 531 }, { "epoch": 0.8042328042328042, "grad_norm": 18.886518478393555, "learning_rate": 5.679982264990424e-08, "logits/chosen": 0.8886632919311523, "logits/rejected": 0.7267423868179321, "logps/chosen": -214.15121459960938, "logps/ref_chosen": -59.289215087890625, "logps/ref_rejected": -88.06941223144531, "logps/rejected": -338.8141174316406, "loss": 1.069, "margin_dpo/margin_mean": 95.8827133178711, "margin_dpo/margin_std": 99.42604064941406, "step": 532 }, { "epoch": 0.8057445200302343, "grad_norm": 16.084077835083008, "learning_rate": 5.596338392706076e-08, "logits/chosen": 1.04569411277771, "logits/rejected": 1.0000990629196167, "logps/chosen": -164.958740234375, "logps/ref_chosen": -57.924049377441406, "logps/ref_rejected": -70.69439697265625, "logps/rejected": -268.8208312988281, "loss": 0.9852, "margin_dpo/margin_mean": 91.09176635742188, "margin_dpo/margin_std": 97.24915313720703, "step": 533 }, { "epoch": 0.8072562358276644, "grad_norm": 18.018505096435547, "learning_rate": 5.513237282548033e-08, "logits/chosen": 0.8638824820518494, "logits/rejected": 0.6540743708610535, "logps/chosen": -193.56890869140625, "logps/ref_chosen": -59.63468551635742, "logps/ref_rejected": -109.55838012695312, "logps/rejected": -294.1387634277344, "loss": 1.0613, "margin_dpo/margin_mean": 50.64615249633789, "margin_dpo/margin_std": 116.63580322265625, "step": 534 }, { "epoch": 0.8087679516250945, "grad_norm": 19.90399932861328, "learning_rate": 5.430681259032957e-08, "logits/chosen": 0.7901930212974548, "logits/rejected": 0.7147120237350464, "logps/chosen": -205.218994140625, "logps/ref_chosen": -45.231201171875, "logps/ref_rejected": -70.03333282470703, "logps/rejected": -260.12701416015625, "loss": 1.1115, "margin_dpo/margin_mean": 30.105876922607422, "margin_dpo/margin_std": 91.64392852783203, "step": 535 }, { "epoch": 0.8102796674225246, "grad_norm": 12.067679405212402, "learning_rate": 5.3486726314303175e-08, "logits/chosen": 0.9899567365646362, "logits/rejected": 0.9399088621139526, "logps/chosen": -187.06640625, "logps/ref_chosen": -61.66278839111328, "logps/ref_rejected": -78.1522445678711, "logps/rejected": -277.64288330078125, "loss": 0.8678, "margin_dpo/margin_mean": 74.0870361328125, "margin_dpo/margin_std": 70.92481994628906, "step": 536 }, { "epoch": 0.8117913832199547, "grad_norm": 18.532299041748047, "learning_rate": 5.267213693697695e-08, "logits/chosen": 0.9581376314163208, "logits/rejected": 0.8182666301727295, "logps/chosen": -162.33416748046875, "logps/ref_chosen": -39.777610778808594, "logps/ref_rejected": -122.8561782836914, "logps/rejected": -347.7552490234375, "loss": 1.0719, "margin_dpo/margin_mean": 102.342529296875, "margin_dpo/margin_std": 55.462013244628906, "step": 537 }, { "epoch": 0.8133030990173847, "grad_norm": 15.579018592834473, "learning_rate": 5.1863067244167144e-08, "logits/chosen": 1.0169093608856201, "logits/rejected": 1.08309006690979, "logps/chosen": -190.21852111816406, "logps/ref_chosen": -53.97972106933594, "logps/ref_rejected": -40.60531997680664, "logps/rejected": -223.557373046875, "loss": 0.9732, "margin_dpo/margin_mean": 46.71326446533203, "margin_dpo/margin_std": 72.07386779785156, "step": 538 }, { "epoch": 0.8148148148148148, "grad_norm": 18.707054138183594, "learning_rate": 5.105953986729195e-08, "logits/chosen": 0.7336928844451904, "logits/rejected": 0.6928164958953857, "logps/chosen": -191.83001708984375, "logps/ref_chosen": -75.5906753540039, "logps/ref_rejected": -90.88420104980469, "logps/rejected": -264.90667724609375, "loss": 1.0761, "margin_dpo/margin_mean": 57.78313064575195, "margin_dpo/margin_std": 57.98158264160156, "step": 539 }, { "epoch": 0.8163265306122449, "grad_norm": 17.138023376464844, "learning_rate": 5.026157728273966e-08, "logits/chosen": 1.0368293523788452, "logits/rejected": 0.9761728048324585, "logps/chosen": -202.86624145507812, "logps/ref_chosen": -53.29787063598633, "logps/ref_rejected": -80.79837799072266, "logps/rejected": -307.9012451171875, "loss": 0.9363, "margin_dpo/margin_mean": 77.53450012207031, "margin_dpo/margin_std": 124.25914764404297, "step": 540 }, { "epoch": 0.817838246409675, "grad_norm": 18.846561431884766, "learning_rate": 4.9469201811239035e-08, "logits/chosen": 0.8673323392868042, "logits/rejected": 1.0076346397399902, "logps/chosen": -233.70118713378906, "logps/ref_chosen": -89.37522888183594, "logps/ref_rejected": -50.772430419921875, "logps/rejected": -251.73175048828125, "loss": 0.9834, "margin_dpo/margin_mean": 56.633365631103516, "margin_dpo/margin_std": 85.56578063964844, "step": 541 }, { "epoch": 0.8193499622071051, "grad_norm": 13.740031242370605, "learning_rate": 4.868243561723534e-08, "logits/chosen": 0.8200675845146179, "logits/rejected": 0.7452304363250732, "logps/chosen": -162.4728240966797, "logps/ref_chosen": -53.12909698486328, "logps/ref_rejected": -85.17762756347656, "logps/rejected": -272.97418212890625, "loss": 0.9604, "margin_dpo/margin_mean": 78.45283508300781, "margin_dpo/margin_std": 69.77384948730469, "step": 542 }, { "epoch": 0.8208616780045351, "grad_norm": 15.152771949768066, "learning_rate": 4.790130070827028e-08, "logits/chosen": 0.8890964984893799, "logits/rejected": 0.8061411380767822, "logps/chosen": -171.46209716796875, "logps/ref_chosen": -49.52074432373047, "logps/ref_rejected": -89.69110870361328, "logps/rejected": -296.0673828125, "loss": 0.9858, "margin_dpo/margin_mean": 84.43494415283203, "margin_dpo/margin_std": 86.14839172363281, "step": 543 }, { "epoch": 0.8223733938019653, "grad_norm": 18.41205596923828, "learning_rate": 4.7125818934366454e-08, "logits/chosen": 0.8170167207717896, "logits/rejected": 0.8432954549789429, "logps/chosen": -270.7913818359375, "logps/ref_chosen": -79.90715026855469, "logps/ref_rejected": -65.11402130126953, "logps/rejected": -296.67626953125, "loss": 0.9642, "margin_dpo/margin_mean": 40.67804718017578, "margin_dpo/margin_std": 80.78584289550781, "step": 544 }, { "epoch": 0.8238851095993953, "grad_norm": 20.38388442993164, "learning_rate": 4.635601198741607e-08, "logits/chosen": 0.9498938322067261, "logits/rejected": 0.8978836536407471, "logps/chosen": -191.94818115234375, "logps/ref_chosen": -49.8741455078125, "logps/ref_rejected": -74.81036376953125, "logps/rejected": -259.6213073730469, "loss": 1.1264, "margin_dpo/margin_mean": 42.736907958984375, "margin_dpo/margin_std": 79.07319641113281, "step": 545 }, { "epoch": 0.8253968253968254, "grad_norm": 19.26015853881836, "learning_rate": 4.559190140057428e-08, "logits/chosen": 0.9439926743507385, "logits/rejected": 0.9385854601860046, "logps/chosen": -254.52891540527344, "logps/ref_chosen": -83.20497131347656, "logps/ref_rejected": -79.42691040039062, "logps/rejected": -289.6729736328125, "loss": 1.0739, "margin_dpo/margin_mean": 38.9221305847168, "margin_dpo/margin_std": 98.08988952636719, "step": 546 }, { "epoch": 0.8269085411942555, "grad_norm": 16.925718307495117, "learning_rate": 4.483350854765672e-08, "logits/chosen": 0.9709137678146362, "logits/rejected": 0.7829012870788574, "logps/chosen": -161.71536254882812, "logps/ref_chosen": -30.964988708496094, "logps/ref_rejected": -70.52467346191406, "logps/rejected": -272.62640380859375, "loss": 0.9703, "margin_dpo/margin_mean": 71.35137939453125, "margin_dpo/margin_std": 87.27641296386719, "step": 547 }, { "epoch": 0.8284202569916855, "grad_norm": 20.741260528564453, "learning_rate": 4.4080854642541826e-08, "logits/chosen": 0.7766395211219788, "logits/rejected": 0.7720953822135925, "logps/chosen": -190.9144287109375, "logps/ref_chosen": -63.495338439941406, "logps/ref_rejected": -73.84056091308594, "logps/rejected": -251.48435974121094, "loss": 1.1606, "margin_dpo/margin_mean": 50.224700927734375, "margin_dpo/margin_std": 94.93733215332031, "step": 548 }, { "epoch": 0.8299319727891157, "grad_norm": 27.655611038208008, "learning_rate": 4.333396073857723e-08, "logits/chosen": 0.9606146812438965, "logits/rejected": 0.9492313861846924, "logps/chosen": -232.9254608154297, "logps/ref_chosen": -73.39987182617188, "logps/ref_rejected": -76.37744140625, "logps/rejected": -271.0856628417969, "loss": 1.18, "margin_dpo/margin_mean": 35.182621002197266, "margin_dpo/margin_std": 107.7386703491211, "step": 549 }, { "epoch": 0.8314436885865457, "grad_norm": 17.47970199584961, "learning_rate": 4.259284772799099e-08, "logits/chosen": 1.0232315063476562, "logits/rejected": 1.0318495035171509, "logps/chosen": -138.86331176757812, "logps/ref_chosen": -41.076881408691406, "logps/ref_rejected": -47.863609313964844, "logps/rejected": -196.962158203125, "loss": 1.1886, "margin_dpo/margin_mean": 51.31211471557617, "margin_dpo/margin_std": 87.56314086914062, "step": 550 }, { "epoch": 0.8329554043839759, "grad_norm": 16.67303466796875, "learning_rate": 4.1857536341307176e-08, "logits/chosen": 1.0085614919662476, "logits/rejected": 0.9332855939865112, "logps/chosen": -187.7180633544922, "logps/ref_chosen": -43.32201385498047, "logps/ref_rejected": -80.82965850830078, "logps/rejected": -318.88507080078125, "loss": 1.0361, "margin_dpo/margin_mean": 93.65936279296875, "margin_dpo/margin_std": 136.92462158203125, "step": 551 }, { "epoch": 0.8344671201814059, "grad_norm": 20.140682220458984, "learning_rate": 4.112804714676593e-08, "logits/chosen": 1.030794382095337, "logits/rejected": 0.9187796115875244, "logps/chosen": -200.61541748046875, "logps/ref_chosen": -51.589683532714844, "logps/ref_rejected": -92.36204528808594, "logps/rejected": -306.091064453125, "loss": 1.0185, "margin_dpo/margin_mean": 64.70327758789062, "margin_dpo/margin_std": 70.84175109863281, "step": 552 }, { "epoch": 0.8359788359788359, "grad_norm": 18.41575050354004, "learning_rate": 4.0404400549748144e-08, "logits/chosen": 1.0242167711257935, "logits/rejected": 0.8356890082359314, "logps/chosen": -183.38101196289062, "logps/ref_chosen": -46.85206604003906, "logps/ref_rejected": -111.05335235595703, "logps/rejected": -361.5396728515625, "loss": 1.1174, "margin_dpo/margin_mean": 113.9573745727539, "margin_dpo/margin_std": 144.8773193359375, "step": 553 }, { "epoch": 0.8374905517762661, "grad_norm": 18.322969436645508, "learning_rate": 3.968661679220467e-08, "logits/chosen": 0.8415111303329468, "logits/rejected": 0.7810766696929932, "logps/chosen": -176.80612182617188, "logps/ref_chosen": -56.84330368041992, "logps/ref_rejected": -71.64338684082031, "logps/rejected": -253.65359497070312, "loss": 1.0157, "margin_dpo/margin_mean": 62.0473747253418, "margin_dpo/margin_std": 105.61851501464844, "step": 554 }, { "epoch": 0.8390022675736961, "grad_norm": 18.993671417236328, "learning_rate": 3.89747159520904e-08, "logits/chosen": 0.8262725472450256, "logits/rejected": 0.7698884010314941, "logps/chosen": -232.79461669921875, "logps/ref_chosen": -88.44906616210938, "logps/ref_rejected": -92.66055297851562, "logps/rejected": -271.56744384765625, "loss": 1.1031, "margin_dpo/margin_mean": 34.56132125854492, "margin_dpo/margin_std": 118.9439926147461, "step": 555 }, { "epoch": 0.8405139833711263, "grad_norm": 17.27111053466797, "learning_rate": 3.826871794280192e-08, "logits/chosen": 0.9412505030632019, "logits/rejected": 0.9116497039794922, "logps/chosen": -217.48727416992188, "logps/ref_chosen": -56.714725494384766, "logps/ref_rejected": -64.49860382080078, "logps/rejected": -274.18817138671875, "loss": 1.1464, "margin_dpo/margin_mean": 48.91699981689453, "margin_dpo/margin_std": 71.00090026855469, "step": 556 }, { "epoch": 0.8420256991685563, "grad_norm": 16.771560668945312, "learning_rate": 3.756864251262143e-08, "logits/chosen": 0.9594881534576416, "logits/rejected": 0.9018399715423584, "logps/chosen": -157.4939422607422, "logps/ref_chosen": -35.662818908691406, "logps/ref_rejected": -49.78130340576172, "logps/rejected": -229.19021606445312, "loss": 0.9731, "margin_dpo/margin_mean": 57.57780456542969, "margin_dpo/margin_std": 78.69767761230469, "step": 557 }, { "epoch": 0.8435374149659864, "grad_norm": 15.087602615356445, "learning_rate": 3.687450924416341e-08, "logits/chosen": 1.0110355615615845, "logits/rejected": 0.9612963199615479, "logps/chosen": -196.357666015625, "logps/ref_chosen": -67.20350646972656, "logps/ref_rejected": -90.46063232421875, "logps/rejected": -308.90771484375, "loss": 0.9023, "margin_dpo/margin_mean": 89.29290771484375, "margin_dpo/margin_std": 79.98139953613281, "step": 558 }, { "epoch": 0.8450491307634165, "grad_norm": 15.841297149658203, "learning_rate": 3.6186337553827743e-08, "logits/chosen": 0.8561170101165771, "logits/rejected": 0.7312831878662109, "logps/chosen": -225.37515258789062, "logps/ref_chosen": -82.38276672363281, "logps/ref_rejected": -148.32070922851562, "logps/rejected": -394.68408203125, "loss": 1.0114, "margin_dpo/margin_mean": 103.37101745605469, "margin_dpo/margin_std": 142.59449768066406, "step": 559 }, { "epoch": 0.8465608465608465, "grad_norm": 18.28428077697754, "learning_rate": 3.550414669125573e-08, "logits/chosen": 0.8422687649726868, "logits/rejected": 0.8341982364654541, "logps/chosen": -237.72918701171875, "logps/ref_chosen": -87.37367248535156, "logps/ref_rejected": -98.05863952636719, "logps/rejected": -306.2855529785156, "loss": 1.0279, "margin_dpo/margin_mean": 57.87139892578125, "margin_dpo/margin_std": 98.06793212890625, "step": 560 }, { "epoch": 0.8480725623582767, "grad_norm": 15.278297424316406, "learning_rate": 3.482795573879241e-08, "logits/chosen": 0.9140326976776123, "logits/rejected": 0.8790804743766785, "logps/chosen": -175.6079559326172, "logps/ref_chosen": -46.72880935668945, "logps/ref_rejected": -72.11878204345703, "logps/rejected": -249.57186889648438, "loss": 1.0484, "margin_dpo/margin_mean": 48.573936462402344, "margin_dpo/margin_std": 56.30522155761719, "step": 561 }, { "epoch": 0.8495842781557067, "grad_norm": 17.868244171142578, "learning_rate": 3.415778361095226e-08, "logits/chosen": 0.9695894718170166, "logits/rejected": 0.8319031000137329, "logps/chosen": -219.78314208984375, "logps/ref_chosen": -73.16322326660156, "logps/ref_rejected": -130.28079223632812, "logps/rejected": -364.35076904296875, "loss": 0.9357, "margin_dpo/margin_mean": 87.45001220703125, "margin_dpo/margin_std": 103.97313690185547, "step": 562 }, { "epoch": 0.8510959939531368, "grad_norm": 20.500097274780273, "learning_rate": 3.349364905389032e-08, "logits/chosen": 1.082218885421753, "logits/rejected": 1.0179955959320068, "logps/chosen": -173.17086791992188, "logps/ref_chosen": -52.58354187011719, "logps/ref_rejected": -81.23396301269531, "logps/rejected": -261.9834899902344, "loss": 1.0687, "margin_dpo/margin_mean": 60.16220474243164, "margin_dpo/margin_std": 83.78437805175781, "step": 563 }, { "epoch": 0.8526077097505669, "grad_norm": 20.62779426574707, "learning_rate": 3.283557064487785e-08, "logits/chosen": 0.9116697311401367, "logits/rejected": 0.9356800317764282, "logps/chosen": -168.27835083007812, "logps/ref_chosen": -54.56080627441406, "logps/ref_rejected": -52.1234245300293, "logps/rejected": -250.03018188476562, "loss": 0.9691, "margin_dpo/margin_mean": 84.18922424316406, "margin_dpo/margin_std": 93.12564849853516, "step": 564 }, { "epoch": 0.854119425547997, "grad_norm": 17.084941864013672, "learning_rate": 3.218356679178252e-08, "logits/chosen": 0.9586303234100342, "logits/rejected": 0.7509380578994751, "logps/chosen": -274.4070129394531, "logps/ref_chosen": -56.54259490966797, "logps/ref_rejected": -125.5247573852539, "logps/rejected": -408.77130126953125, "loss": 0.9844, "margin_dpo/margin_mean": 65.38215637207031, "margin_dpo/margin_std": 83.48878479003906, "step": 565 }, { "epoch": 0.8556311413454271, "grad_norm": 17.82448959350586, "learning_rate": 3.1537655732553764e-08, "logits/chosen": 0.9717875719070435, "logits/rejected": 0.9099733233451843, "logps/chosen": -195.46827697753906, "logps/ref_chosen": -66.43058776855469, "logps/ref_rejected": -88.7685775756836, "logps/rejected": -317.3566589355469, "loss": 1.0874, "margin_dpo/margin_mean": 99.5504150390625, "margin_dpo/margin_std": 77.15394592285156, "step": 566 }, { "epoch": 0.8571428571428571, "grad_norm": 15.454371452331543, "learning_rate": 3.089785553471233e-08, "logits/chosen": 0.8322715759277344, "logits/rejected": 0.8646911978721619, "logps/chosen": -187.09246826171875, "logps/ref_chosen": -61.010929107666016, "logps/ref_rejected": -59.36076736450195, "logps/rejected": -264.74932861328125, "loss": 0.9967, "margin_dpo/margin_mean": 79.30699920654297, "margin_dpo/margin_std": 74.8924789428711, "step": 567 }, { "epoch": 0.8586545729402872, "grad_norm": 16.217979431152344, "learning_rate": 3.026418409484513e-08, "logits/chosen": 1.0041697025299072, "logits/rejected": 0.9253759384155273, "logps/chosen": -160.50274658203125, "logps/ref_chosen": -39.93986892700195, "logps/ref_rejected": -80.44910430908203, "logps/rejected": -245.42349243164062, "loss": 0.9188, "margin_dpo/margin_mean": 44.4115104675293, "margin_dpo/margin_std": 74.29348754882812, "step": 568 }, { "epoch": 0.8601662887377173, "grad_norm": 20.45829963684082, "learning_rate": 2.963665913810451e-08, "logits/chosen": 0.9038550853729248, "logits/rejected": 0.9029619693756104, "logps/chosen": -228.15231323242188, "logps/ref_chosen": -81.28688049316406, "logps/ref_rejected": -83.6900405883789, "logps/rejected": -316.5882873535156, "loss": 1.1757, "margin_dpo/margin_mean": 86.03282165527344, "margin_dpo/margin_std": 147.73619079589844, "step": 569 }, { "epoch": 0.8616780045351474, "grad_norm": 17.267189025878906, "learning_rate": 2.9015298217712453e-08, "logits/chosen": 0.9757269620895386, "logits/rejected": 0.9647901058197021, "logps/chosen": -202.49285888671875, "logps/ref_chosen": -53.46790313720703, "logps/ref_rejected": -51.320770263671875, "logps/rejected": -260.1011962890625, "loss": 0.9055, "margin_dpo/margin_mean": 59.75544738769531, "margin_dpo/margin_std": 78.49115753173828, "step": 570 }, { "epoch": 0.8631897203325775, "grad_norm": 19.252838134765625, "learning_rate": 2.840011871446962e-08, "logits/chosen": 0.9464564323425293, "logits/rejected": 0.8572288751602173, "logps/chosen": -164.54974365234375, "logps/ref_chosen": -45.11099624633789, "logps/ref_rejected": -71.55215454101562, "logps/rejected": -285.15167236328125, "loss": 1.1555, "margin_dpo/margin_mean": 94.1607666015625, "margin_dpo/margin_std": 116.37611389160156, "step": 571 }, { "epoch": 0.8647014361300076, "grad_norm": 22.696313858032227, "learning_rate": 2.7791137836269158e-08, "logits/chosen": 0.9143853187561035, "logits/rejected": 0.863966703414917, "logps/chosen": -178.92416381835938, "logps/ref_chosen": -50.03799819946289, "logps/ref_rejected": -69.7855224609375, "logps/rejected": -282.4919738769531, "loss": 1.0403, "margin_dpo/margin_mean": 83.82028198242188, "margin_dpo/margin_std": 97.30913543701172, "step": 572 }, { "epoch": 0.8662131519274376, "grad_norm": 17.95223617553711, "learning_rate": 2.718837261761528e-08, "logits/chosen": 0.9436647891998291, "logits/rejected": 0.8891823291778564, "logps/chosen": -223.79197692871094, "logps/ref_chosen": -65.06491088867188, "logps/ref_rejected": -74.16371154785156, "logps/rejected": -237.40768432617188, "loss": 1.1207, "margin_dpo/margin_mean": 4.516890525817871, "margin_dpo/margin_std": 93.70068359375, "step": 573 }, { "epoch": 0.8677248677248677, "grad_norm": 13.765616416931152, "learning_rate": 2.659183991914696e-08, "logits/chosen": 0.9557688236236572, "logits/rejected": 0.9775044918060303, "logps/chosen": -214.97642517089844, "logps/ref_chosen": -50.88386917114258, "logps/ref_rejected": -51.98750686645508, "logps/rejected": -228.66259765625, "loss": 0.8728, "margin_dpo/margin_mean": 12.582534790039062, "margin_dpo/margin_std": 53.39247131347656, "step": 574 }, { "epoch": 0.8692365835222978, "grad_norm": 17.010929107666016, "learning_rate": 2.600155642716606e-08, "logits/chosen": 0.9337095022201538, "logits/rejected": 0.786598801612854, "logps/chosen": -208.71426391601562, "logps/ref_chosen": -61.41114044189453, "logps/ref_rejected": -112.07469177246094, "logps/rejected": -286.92864990234375, "loss": 1.1869, "margin_dpo/margin_mean": 27.55083465576172, "margin_dpo/margin_std": 119.59794616699219, "step": 575 }, { "epoch": 0.8707482993197279, "grad_norm": 15.698728561401367, "learning_rate": 2.5417538653170754e-08, "logits/chosen": 1.0223554372787476, "logits/rejected": 0.9235273599624634, "logps/chosen": -183.23175048828125, "logps/ref_chosen": -54.792724609375, "logps/ref_rejected": -84.78044128417969, "logps/rejected": -297.049072265625, "loss": 0.8786, "margin_dpo/margin_mean": 83.82960510253906, "margin_dpo/margin_std": 106.40802001953125, "step": 576 }, { "epoch": 0.872260015117158, "grad_norm": 17.254444122314453, "learning_rate": 2.4839802933393607e-08, "logits/chosen": 1.0162341594696045, "logits/rejected": 0.9787265062332153, "logps/chosen": -163.14552307128906, "logps/ref_chosen": -44.14771270751953, "logps/ref_rejected": -65.19963836669922, "logps/rejected": -230.81610107421875, "loss": 1.113, "margin_dpo/margin_mean": 46.61864471435547, "margin_dpo/margin_std": 70.7182846069336, "step": 577 }, { "epoch": 0.873771730914588, "grad_norm": 17.691556930541992, "learning_rate": 2.4268365428344733e-08, "logits/chosen": 1.0092999935150146, "logits/rejected": 0.9245076179504395, "logps/chosen": -200.7638702392578, "logps/ref_chosen": -47.827537536621094, "logps/ref_rejected": -84.31831359863281, "logps/rejected": -295.13720703125, "loss": 1.1273, "margin_dpo/margin_mean": 57.8825798034668, "margin_dpo/margin_std": 108.26670837402344, "step": 578 }, { "epoch": 0.8752834467120182, "grad_norm": 17.17439842224121, "learning_rate": 2.3703242122359357e-08, "logits/chosen": 0.9395872950553894, "logits/rejected": 0.883934497833252, "logps/chosen": -172.1771240234375, "logps/ref_chosen": -39.918418884277344, "logps/ref_rejected": -67.36483001708984, "logps/rejected": -293.5935974121094, "loss": 0.9021, "margin_dpo/margin_mean": 93.97006225585938, "margin_dpo/margin_std": 82.55453491210938, "step": 579 }, { "epoch": 0.8767951625094482, "grad_norm": 16.5377197265625, "learning_rate": 2.3144448823151392e-08, "logits/chosen": 0.8914264440536499, "logits/rejected": 0.8536027669906616, "logps/chosen": -203.43321228027344, "logps/ref_chosen": -57.42926025390625, "logps/ref_rejected": -71.1881332397461, "logps/rejected": -251.05111694335938, "loss": 1.0703, "margin_dpo/margin_mean": 33.859039306640625, "margin_dpo/margin_std": 79.80448150634766, "step": 580 }, { "epoch": 0.8783068783068783, "grad_norm": 16.735671997070312, "learning_rate": 2.259200116137039e-08, "logits/chosen": 0.9245070219039917, "logits/rejected": 0.8710756301879883, "logps/chosen": -207.47396850585938, "logps/ref_chosen": -64.48869323730469, "logps/ref_rejected": -80.07521057128906, "logps/rejected": -284.458251953125, "loss": 1.0719, "margin_dpo/margin_mean": 61.39775085449219, "margin_dpo/margin_std": 92.19239807128906, "step": 581 }, { "epoch": 0.8798185941043084, "grad_norm": 21.038320541381836, "learning_rate": 2.204591459016525e-08, "logits/chosen": 0.9050915241241455, "logits/rejected": 0.7998449802398682, "logps/chosen": -180.35040283203125, "logps/ref_chosen": -52.15564727783203, "logps/ref_rejected": -81.67626190185547, "logps/rejected": -324.54791259765625, "loss": 1.106, "margin_dpo/margin_mean": 114.67691802978516, "margin_dpo/margin_std": 95.79690551757812, "step": 582 }, { "epoch": 0.8813303099017384, "grad_norm": 21.99834442138672, "learning_rate": 2.1506204384751064e-08, "logits/chosen": 1.0146265029907227, "logits/rejected": 0.9865950345993042, "logps/chosen": -198.27951049804688, "logps/ref_chosen": -53.600677490234375, "logps/ref_rejected": -66.55783081054688, "logps/rejected": -231.4248046875, "loss": 1.05, "margin_dpo/margin_mean": 20.188133239746094, "margin_dpo/margin_std": 88.58358001708984, "step": 583 }, { "epoch": 0.8828420256991686, "grad_norm": 18.144161224365234, "learning_rate": 2.09728856419826e-08, "logits/chosen": 0.9950936436653137, "logits/rejected": 0.8881626129150391, "logps/chosen": -145.82467651367188, "logps/ref_chosen": -39.92349624633789, "logps/ref_rejected": -85.95851135253906, "logps/rejected": -260.9383544921875, "loss": 1.0972, "margin_dpo/margin_mean": 69.07866668701172, "margin_dpo/margin_std": 94.79948425292969, "step": 584 }, { "epoch": 0.8843537414965986, "grad_norm": 15.75501537322998, "learning_rate": 2.044597327993153e-08, "logits/chosen": 0.9527912139892578, "logits/rejected": 0.7973681092262268, "logps/chosen": -196.22564697265625, "logps/ref_chosen": -54.76570129394531, "logps/ref_rejected": -115.46517944335938, "logps/rejected": -307.131591796875, "loss": 1.1429, "margin_dpo/margin_mean": 50.206451416015625, "margin_dpo/margin_std": 92.417724609375, "step": 585 }, { "epoch": 0.8858654572940288, "grad_norm": 18.22370147705078, "learning_rate": 1.9925482037469187e-08, "logits/chosen": 0.9899888038635254, "logits/rejected": 1.0135765075683594, "logps/chosen": -180.9400634765625, "logps/ref_chosen": -62.72941207885742, "logps/ref_rejected": -46.449256896972656, "logps/rejected": -233.3268585205078, "loss": 0.9577, "margin_dpo/margin_mean": 68.66695404052734, "margin_dpo/margin_std": 74.32232666015625, "step": 586 }, { "epoch": 0.8873771730914588, "grad_norm": 21.38698387145996, "learning_rate": 1.9411426473854687e-08, "logits/chosen": 1.0005683898925781, "logits/rejected": 0.954791784286499, "logps/chosen": -218.3308868408203, "logps/ref_chosen": -58.70615768432617, "logps/ref_rejected": -70.26844024658203, "logps/rejected": -261.1861267089844, "loss": 1.0981, "margin_dpo/margin_mean": 31.292953491210938, "margin_dpo/margin_std": 83.6708984375, "step": 587 }, { "epoch": 0.8888888888888888, "grad_norm": 19.388957977294922, "learning_rate": 1.890382096832699e-08, "logits/chosen": 0.8030673265457153, "logits/rejected": 0.7511934041976929, "logps/chosen": -196.69418334960938, "logps/ref_chosen": -66.02383422851562, "logps/ref_rejected": -92.533203125, "logps/rejected": -316.143798828125, "loss": 1.0657, "margin_dpo/margin_mean": 92.94023895263672, "margin_dpo/margin_std": 82.5245361328125, "step": 588 }, { "epoch": 0.890400604686319, "grad_norm": 16.032787322998047, "learning_rate": 1.840267971970344e-08, "logits/chosen": 0.8890354633331299, "logits/rejected": 0.8605284690856934, "logps/chosen": -199.17193603515625, "logps/ref_chosen": -52.7588005065918, "logps/ref_rejected": -68.78264617919922, "logps/rejected": -266.2153015136719, "loss": 0.9575, "margin_dpo/margin_mean": 51.01952362060547, "margin_dpo/margin_std": 97.24200439453125, "step": 589 }, { "epoch": 0.891912320483749, "grad_norm": 16.486291885375977, "learning_rate": 1.7908016745981856e-08, "logits/chosen": 0.9132862091064453, "logits/rejected": 0.8946816921234131, "logps/chosen": -198.87548828125, "logps/ref_chosen": -53.84131622314453, "logps/ref_rejected": -64.9970703125, "logps/rejected": -240.9401092529297, "loss": 0.9877, "margin_dpo/margin_mean": 30.908885955810547, "margin_dpo/margin_std": 70.15216827392578, "step": 590 }, { "epoch": 0.8934240362811792, "grad_norm": 18.833675384521484, "learning_rate": 1.7419845883949098e-08, "logits/chosen": 0.9794512987136841, "logits/rejected": 0.9539260268211365, "logps/chosen": -170.65090942382812, "logps/ref_chosen": -50.85618591308594, "logps/ref_rejected": -61.08381271362305, "logps/rejected": -261.77154541015625, "loss": 0.9912, "margin_dpo/margin_mean": 80.89300537109375, "margin_dpo/margin_std": 80.72918701171875, "step": 591 }, { "epoch": 0.8949357520786092, "grad_norm": 18.000661849975586, "learning_rate": 1.6938180788793556e-08, "logits/chosen": 1.0009410381317139, "logits/rejected": 0.9112914800643921, "logps/chosen": -176.25234985351562, "logps/ref_chosen": -51.354652404785156, "logps/ref_rejected": -67.10594177246094, "logps/rejected": -271.6142578125, "loss": 0.985, "margin_dpo/margin_mean": 79.610595703125, "margin_dpo/margin_std": 81.41339111328125, "step": 592 }, { "epoch": 0.8964474678760394, "grad_norm": 18.60577392578125, "learning_rate": 1.6463034933723336e-08, "logits/chosen": 0.8909306526184082, "logits/rejected": 0.8740547299385071, "logps/chosen": -187.12136840820312, "logps/ref_chosen": -61.935089111328125, "logps/ref_rejected": -82.4512939453125, "logps/rejected": -286.46282958984375, "loss": 1.0216, "margin_dpo/margin_mean": 78.82524871826172, "margin_dpo/margin_std": 103.09696960449219, "step": 593 }, { "epoch": 0.8979591836734694, "grad_norm": 18.425268173217773, "learning_rate": 1.5994421609589385e-08, "logits/chosen": 0.8772367835044861, "logits/rejected": 0.8558773994445801, "logps/chosen": -211.46441650390625, "logps/ref_chosen": -63.053680419921875, "logps/ref_rejected": -68.9009017944336, "logps/rejected": -248.54051208496094, "loss": 1.0588, "margin_dpo/margin_mean": 31.228851318359375, "margin_dpo/margin_std": 52.22967529296875, "step": 594 }, { "epoch": 0.8994708994708994, "grad_norm": 14.107276916503906, "learning_rate": 1.553235392451377e-08, "logits/chosen": 1.0201103687286377, "logits/rejected": 0.9065714478492737, "logps/chosen": -164.76438903808594, "logps/ref_chosen": -36.28746032714844, "logps/ref_rejected": -76.62598419189453, "logps/rejected": -292.2261962890625, "loss": 0.9302, "margin_dpo/margin_mean": 87.12327575683594, "margin_dpo/margin_std": 96.01534271240234, "step": 595 }, { "epoch": 0.9009826152683296, "grad_norm": 16.283180236816406, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.8756155371665955, "logits/rejected": 0.8407232761383057, "logps/chosen": -196.04852294921875, "logps/ref_chosen": -58.18864440917969, "logps/ref_rejected": -69.7575454711914, "logps/rejected": -244.40557861328125, "loss": 1.2206, "margin_dpo/margin_mean": 36.78814697265625, "margin_dpo/margin_std": 108.71817016601562, "step": 596 }, { "epoch": 0.9024943310657596, "grad_norm": 14.265636444091797, "learning_rate": 1.4627906988186111e-08, "logits/chosen": 0.9384099245071411, "logits/rejected": 0.9882324934005737, "logps/chosen": -195.47030639648438, "logps/ref_chosen": -62.06855773925781, "logps/ref_rejected": -62.94855880737305, "logps/rejected": -221.61302185058594, "loss": 1.0398, "margin_dpo/margin_mean": 25.262706756591797, "margin_dpo/margin_std": 77.90994262695312, "step": 597 }, { "epoch": 0.9040060468631897, "grad_norm": 22.982269287109375, "learning_rate": 1.4185553036259095e-08, "logits/chosen": 0.8697057962417603, "logits/rejected": 0.8449388742446899, "logps/chosen": -206.47000122070312, "logps/ref_chosen": -48.21617889404297, "logps/ref_rejected": -68.27192687988281, "logps/rejected": -257.6476745605469, "loss": 1.2139, "margin_dpo/margin_mean": 31.121919631958008, "margin_dpo/margin_std": 93.97370147705078, "step": 598 }, { "epoch": 0.9055177626606198, "grad_norm": 17.03192138671875, "learning_rate": 1.3749795321332885e-08, "logits/chosen": 0.8180956244468689, "logits/rejected": 0.8315231800079346, "logps/chosen": -224.57041931152344, "logps/ref_chosen": -61.865814208984375, "logps/ref_rejected": -62.11842346191406, "logps/rejected": -258.51904296875, "loss": 1.146, "margin_dpo/margin_mean": 33.696022033691406, "margin_dpo/margin_std": 111.64860534667969, "step": 599 }, { "epoch": 0.9070294784580499, "grad_norm": 17.681379318237305, "learning_rate": 1.3320646032487393e-08, "logits/chosen": 0.9060215353965759, "logits/rejected": 0.9525216817855835, "logps/chosen": -173.75914001464844, "logps/ref_chosen": -55.92002868652344, "logps/ref_rejected": -53.79228210449219, "logps/rejected": -239.77134704589844, "loss": 1.1391, "margin_dpo/margin_mean": 68.13994598388672, "margin_dpo/margin_std": 82.62960815429688, "step": 600 }, { "epoch": 0.9070294784580499, "eval_logits/chosen": 0.8960238695144653, "eval_logits/rejected": 0.863480806350708, "eval_logps/chosen": -211.5150146484375, "eval_logps/ref_chosen": -75.30646514892578, "eval_logps/ref_rejected": -77.75511932373047, "eval_logps/rejected": -274.14215087890625, "eval_loss": 0.5347773432731628, "eval_margin_dpo/margin_mean": 60.178462982177734, "eval_margin_dpo/margin_std": 94.62100982666016, "eval_runtime": 37.4693, "eval_samples_per_second": 61.464, "eval_steps_per_second": 1.922, "step": 600 }, { "epoch": 0.90854119425548, "grad_norm": 14.842012405395508, "learning_rate": 1.2898117173950868e-08, "logits/chosen": 0.8835675716400146, "logits/rejected": 0.8951883912086487, "logps/chosen": -201.0157470703125, "logps/ref_chosen": -70.18791198730469, "logps/ref_rejected": -71.40547180175781, "logps/rejected": -291.8153381347656, "loss": 0.9772, "margin_dpo/margin_mean": 89.58202362060547, "margin_dpo/margin_std": 125.6627197265625, "step": 601 }, { "epoch": 0.91005291005291, "grad_norm": 18.32841682434082, "learning_rate": 1.2482220564763667e-08, "logits/chosen": 0.9220967292785645, "logits/rejected": 0.8645837306976318, "logps/chosen": -187.869873046875, "logps/ref_chosen": -70.41340637207031, "logps/ref_rejected": -97.12376403808594, "logps/rejected": -304.54473876953125, "loss": 0.989, "margin_dpo/margin_mean": 89.96450805664062, "margin_dpo/margin_std": 100.75538635253906, "step": 602 }, { "epoch": 0.9115646258503401, "grad_norm": 19.300460815429688, "learning_rate": 1.2072967838448051e-08, "logits/chosen": 0.9347689747810364, "logits/rejected": 0.8060243129730225, "logps/chosen": -211.57089233398438, "logps/ref_chosen": -62.89923858642578, "logps/ref_rejected": -108.35671997070312, "logps/rejected": -351.4485778808594, "loss": 0.9984, "margin_dpo/margin_mean": 94.42021179199219, "margin_dpo/margin_std": 152.901123046875, "step": 603 }, { "epoch": 0.9130763416477702, "grad_norm": 15.397801399230957, "learning_rate": 1.1670370442682459e-08, "logits/chosen": 0.9415032267570496, "logits/rejected": 0.9144819974899292, "logps/chosen": -188.33102416992188, "logps/ref_chosen": -62.4847412109375, "logps/ref_rejected": -62.371429443359375, "logps/rejected": -235.138427734375, "loss": 1.1185, "margin_dpo/margin_mean": 46.92070770263672, "margin_dpo/margin_std": 84.41587829589844, "step": 604 }, { "epoch": 0.9145880574452003, "grad_norm": 18.700698852539062, "learning_rate": 1.1274439638981532e-08, "logits/chosen": 1.0574252605438232, "logits/rejected": 0.938467264175415, "logps/chosen": -192.36363220214844, "logps/ref_chosen": -48.73389434814453, "logps/ref_rejected": -83.2470703125, "logps/rejected": -292.3554992675781, "loss": 1.0445, "margin_dpo/margin_mean": 65.47867584228516, "margin_dpo/margin_std": 119.60535430908203, "step": 605 }, { "epoch": 0.9160997732426304, "grad_norm": 17.30083465576172, "learning_rate": 1.0885186502381016e-08, "logits/chosen": 0.9336838722229004, "logits/rejected": 0.846339225769043, "logps/chosen": -190.03634643554688, "logps/ref_chosen": -53.08481979370117, "logps/ref_rejected": -80.11920166015625, "logps/rejected": -295.1481018066406, "loss": 0.989, "margin_dpo/margin_mean": 78.07736206054688, "margin_dpo/margin_std": 93.74771118164062, "step": 606 }, { "epoch": 0.9176114890400605, "grad_norm": 16.944414138793945, "learning_rate": 1.0502621921127774e-08, "logits/chosen": 0.7050179243087769, "logits/rejected": 0.8792567253112793, "logps/chosen": -264.5372009277344, "logps/ref_chosen": -97.28004455566406, "logps/ref_rejected": -86.22888946533203, "logps/rejected": -283.00103759765625, "loss": 1.0318, "margin_dpo/margin_mean": 29.514984130859375, "margin_dpo/margin_std": 90.30196380615234, "step": 607 }, { "epoch": 0.9191232048374905, "grad_norm": 18.15805435180664, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 0.8514116406440735, "logits/rejected": 0.8852680921554565, "logps/chosen": -181.86837768554688, "logps/ref_chosen": -66.10746765136719, "logps/ref_rejected": -59.404441833496094, "logps/rejected": -243.4190673828125, "loss": 1.0596, "margin_dpo/margin_mean": 68.25370788574219, "margin_dpo/margin_std": 68.52870178222656, "step": 608 }, { "epoch": 0.9206349206349206, "grad_norm": 14.3372163772583, "learning_rate": 9.757601041885694e-09, "logits/chosen": 1.0790916681289673, "logits/rejected": 0.9650440216064453, "logps/chosen": -173.4683380126953, "logps/ref_chosen": -49.93000030517578, "logps/ref_rejected": -98.36846923828125, "logps/rejected": -298.0489501953125, "loss": 0.9481, "margin_dpo/margin_mean": 76.14215087890625, "margin_dpo/margin_std": 91.03385925292969, "step": 609 }, { "epoch": 0.9221466364323507, "grad_norm": 31.100370407104492, "learning_rate": 9.395165583732379e-09, "logits/chosen": 0.9039211869239807, "logits/rejected": 0.9379135370254517, "logps/chosen": -220.49267578125, "logps/ref_chosen": -71.89997863769531, "logps/ref_rejected": -89.20841979980469, "logps/rejected": -285.78515625, "loss": 1.1377, "margin_dpo/margin_mean": 47.98402404785156, "margin_dpo/margin_std": 88.36325073242188, "step": 610 }, { "epoch": 0.9236583522297808, "grad_norm": 18.051315307617188, "learning_rate": 9.03946036001449e-09, "logits/chosen": 0.9056074619293213, "logits/rejected": 0.9064819812774658, "logps/chosen": -214.71875, "logps/ref_chosen": -67.77259826660156, "logps/ref_rejected": -66.90919494628906, "logps/rejected": -263.32421875, "loss": 1.1144, "margin_dpo/margin_mean": 49.4688606262207, "margin_dpo/margin_std": 66.27508544921875, "step": 611 }, { "epoch": 0.9251700680272109, "grad_norm": 15.253009796142578, "learning_rate": 8.690495320571839e-09, "logits/chosen": 0.6356790065765381, "logits/rejected": 0.6141253113746643, "logps/chosen": -254.52713012695312, "logps/ref_chosen": -98.77779388427734, "logps/ref_rejected": -129.40658569335938, "logps/rejected": -327.4943542480469, "loss": 0.9714, "margin_dpo/margin_mean": 42.33841323852539, "margin_dpo/margin_std": 53.188514709472656, "step": 612 }, { "epoch": 0.926681783824641, "grad_norm": 22.37784767150879, "learning_rate": 8.348280226706722e-09, "logits/chosen": 0.9218869209289551, "logits/rejected": 0.9058539867401123, "logps/chosen": -176.6582794189453, "logps/ref_chosen": -39.422340393066406, "logps/ref_rejected": -48.479248046875, "logps/rejected": -244.13925170898438, "loss": 0.9836, "margin_dpo/margin_mean": 58.42406463623047, "margin_dpo/margin_std": 89.93438720703125, "step": 613 }, { "epoch": 0.9281934996220711, "grad_norm": 24.711509704589844, "learning_rate": 8.012824650910937e-09, "logits/chosen": 1.0547163486480713, "logits/rejected": 0.9173270463943481, "logps/chosen": -167.89730834960938, "logps/ref_chosen": -52.9261474609375, "logps/ref_rejected": -68.24092102050781, "logps/rejected": -296.45574951171875, "loss": 1.0583, "margin_dpo/margin_mean": 113.24365234375, "margin_dpo/margin_std": 79.29691314697266, "step": 614 }, { "epoch": 0.9297052154195011, "grad_norm": 25.62435531616211, "learning_rate": 7.684137976598088e-09, "logits/chosen": 0.9779610633850098, "logits/rejected": 0.9523606896400452, "logps/chosen": -217.04598999023438, "logps/ref_chosen": -70.00674438476562, "logps/ref_rejected": -101.93522644042969, "logps/rejected": -298.4114074707031, "loss": 1.0525, "margin_dpo/margin_mean": 49.43693542480469, "margin_dpo/margin_std": 92.1063003540039, "step": 615 }, { "epoch": 0.9312169312169312, "grad_norm": 16.771865844726562, "learning_rate": 7.36222939784098e-09, "logits/chosen": 0.9289509057998657, "logits/rejected": 0.8705403804779053, "logps/chosen": -209.6519012451172, "logps/ref_chosen": -66.25517272949219, "logps/ref_rejected": -92.31936645507812, "logps/rejected": -286.00390625, "loss": 1.0395, "margin_dpo/margin_mean": 50.28779220581055, "margin_dpo/margin_std": 81.98731994628906, "step": 616 }, { "epoch": 0.9327286470143613, "grad_norm": 19.335580825805664, "learning_rate": 7.047107919114586e-09, "logits/chosen": 0.9605733752250671, "logits/rejected": 0.9251487255096436, "logps/chosen": -212.73922729492188, "logps/ref_chosen": -70.3993911743164, "logps/ref_rejected": -90.81809997558594, "logps/rejected": -292.81744384765625, "loss": 1.0436, "margin_dpo/margin_mean": 59.65951919555664, "margin_dpo/margin_std": 60.16530227661133, "step": 617 }, { "epoch": 0.9342403628117913, "grad_norm": 19.181501388549805, "learning_rate": 6.738782355044048e-09, "logits/chosen": 0.9526699781417847, "logits/rejected": 0.8337723016738892, "logps/chosen": -169.63429260253906, "logps/ref_chosen": -62.809661865234375, "logps/ref_rejected": -109.43612670898438, "logps/rejected": -310.1464538574219, "loss": 0.9829, "margin_dpo/margin_mean": 93.88569641113281, "margin_dpo/margin_std": 91.3293228149414, "step": 618 }, { "epoch": 0.9357520786092215, "grad_norm": 16.765586853027344, "learning_rate": 6.437261330158206e-09, "logits/chosen": 0.9776492118835449, "logits/rejected": 0.9457724094390869, "logps/chosen": -158.1160888671875, "logps/ref_chosen": -47.85075378417969, "logps/ref_rejected": -72.59744262695312, "logps/rejected": -258.400634765625, "loss": 0.9783, "margin_dpo/margin_mean": 75.53782653808594, "margin_dpo/margin_std": 94.00806427001953, "step": 619 }, { "epoch": 0.9372637944066515, "grad_norm": 19.020383834838867, "learning_rate": 6.142553278648238e-09, "logits/chosen": 0.8768226504325867, "logits/rejected": 0.8743495345115662, "logps/chosen": -224.13729858398438, "logps/ref_chosen": -79.60867309570312, "logps/ref_rejected": -86.96884155273438, "logps/rejected": -288.10784912109375, "loss": 1.1095, "margin_dpo/margin_mean": 56.610382080078125, "margin_dpo/margin_std": 97.45252990722656, "step": 620 }, { "epoch": 0.9387755102040817, "grad_norm": 17.923816680908203, "learning_rate": 5.854666444131934e-09, "logits/chosen": 0.7942125797271729, "logits/rejected": 0.8802160620689392, "logps/chosen": -215.23654174804688, "logps/ref_chosen": -68.8500747680664, "logps/ref_rejected": -53.94578552246094, "logps/rejected": -231.7372589111328, "loss": 1.0581, "margin_dpo/margin_mean": 31.405014038085938, "margin_dpo/margin_std": 76.86921691894531, "step": 621 }, { "epoch": 0.9402872260015117, "grad_norm": 20.861454010009766, "learning_rate": 5.573608879422875e-09, "logits/chosen": 0.8228497505187988, "logits/rejected": 0.8221659660339355, "logps/chosen": -211.20352172851562, "logps/ref_chosen": -49.52813720703125, "logps/ref_rejected": -56.0748291015625, "logps/rejected": -251.8733673095703, "loss": 1.0707, "margin_dpo/margin_mean": 34.12314987182617, "margin_dpo/margin_std": 85.17213439941406, "step": 622 }, { "epoch": 0.9417989417989417, "grad_norm": 18.039180755615234, "learning_rate": 5.299388446305342e-09, "logits/chosen": 0.9442222118377686, "logits/rejected": 0.8388766050338745, "logps/chosen": -213.2373809814453, "logps/ref_chosen": -51.98558807373047, "logps/ref_rejected": -89.86474609375, "logps/rejected": -364.82171630859375, "loss": 0.9715, "margin_dpo/margin_mean": 113.70519256591797, "margin_dpo/margin_std": 154.77015686035156, "step": 623 }, { "epoch": 0.9433106575963719, "grad_norm": 16.79059410095215, "learning_rate": 5.03201281531429e-09, "logits/chosen": 0.9908918142318726, "logits/rejected": 0.9922984838485718, "logps/chosen": -189.19586181640625, "logps/ref_chosen": -61.39537811279297, "logps/ref_rejected": -64.496826171875, "logps/rejected": -265.69671630859375, "loss": 0.9914, "margin_dpo/margin_mean": 73.39938354492188, "margin_dpo/margin_std": 75.53822326660156, "step": 624 }, { "epoch": 0.9448223733938019, "grad_norm": 16.281400680541992, "learning_rate": 4.7714894655209174e-09, "logits/chosen": 1.096935510635376, "logits/rejected": 0.9919509887695312, "logps/chosen": -173.40103149414062, "logps/ref_chosen": -49.35209274291992, "logps/ref_rejected": -75.47923278808594, "logps/rejected": -266.97930908203125, "loss": 1.1236, "margin_dpo/margin_mean": 67.45115661621094, "margin_dpo/margin_std": 78.00559997558594, "step": 625 }, { "epoch": 0.9463340891912321, "grad_norm": 16.243318557739258, "learning_rate": 4.517825684323323e-09, "logits/chosen": 1.0832741260528564, "logits/rejected": 1.023711085319519, "logps/chosen": -122.06949615478516, "logps/ref_chosen": -32.55897521972656, "logps/ref_rejected": -71.60407257080078, "logps/rejected": -270.57061767578125, "loss": 1.0152, "margin_dpo/margin_mean": 109.45604705810547, "margin_dpo/margin_std": 145.14871215820312, "step": 626 }, { "epoch": 0.9478458049886621, "grad_norm": 18.35875701904297, "learning_rate": 4.271028567242818e-09, "logits/chosen": 0.8411812782287598, "logits/rejected": 0.708235502243042, "logps/chosen": -198.689453125, "logps/ref_chosen": -65.68330383300781, "logps/ref_rejected": -124.0399169921875, "logps/rejected": -374.5428161621094, "loss": 0.9747, "margin_dpo/margin_mean": 117.49674224853516, "margin_dpo/margin_std": 99.51753234863281, "step": 627 }, { "epoch": 0.9493575207860923, "grad_norm": 17.412473678588867, "learning_rate": 4.0311050177251895e-09, "logits/chosen": 0.857243001461029, "logits/rejected": 0.9196332693099976, "logps/chosen": -188.25320434570312, "logps/ref_chosen": -70.50054168701172, "logps/ref_rejected": -69.05288696289062, "logps/rejected": -286.83795166015625, "loss": 1.0335, "margin_dpo/margin_mean": 100.03239440917969, "margin_dpo/margin_std": 102.7291259765625, "step": 628 }, { "epoch": 0.9508692365835223, "grad_norm": 17.979310989379883, "learning_rate": 3.798061746947995e-09, "logits/chosen": 0.9377484321594238, "logits/rejected": 1.008423089981079, "logps/chosen": -184.6234893798828, "logps/ref_chosen": -68.93040466308594, "logps/ref_rejected": -49.09862518310547, "logps/rejected": -198.7704620361328, "loss": 1.0083, "margin_dpo/margin_mean": 33.978755950927734, "margin_dpo/margin_std": 70.85810089111328, "step": 629 }, { "epoch": 0.9523809523809523, "grad_norm": 17.69316864013672, "learning_rate": 3.5719052736323806e-09, "logits/chosen": 1.0553034543991089, "logits/rejected": 0.9892318844795227, "logps/chosen": -134.7643585205078, "logps/ref_chosen": -36.853294372558594, "logps/ref_rejected": -71.96033477783203, "logps/rejected": -252.25209045410156, "loss": 0.9744, "margin_dpo/margin_mean": 82.38069152832031, "margin_dpo/margin_std": 89.0606460571289, "step": 630 }, { "epoch": 0.9538926681783825, "grad_norm": 18.01328468322754, "learning_rate": 3.352641923861144e-09, "logits/chosen": 0.9821938872337341, "logits/rejected": 0.938322901725769, "logps/chosen": -195.1949462890625, "logps/ref_chosen": -64.66175079345703, "logps/ref_rejected": -89.40802001953125, "logps/rejected": -301.36553955078125, "loss": 0.9122, "margin_dpo/margin_mean": 81.42431640625, "margin_dpo/margin_std": 62.148040771484375, "step": 631 }, { "epoch": 0.9554043839758125, "grad_norm": 18.51304054260254, "learning_rate": 3.140277830901428e-09, "logits/chosen": 0.9517176151275635, "logits/rejected": 0.9564006328582764, "logps/chosen": -203.13833618164062, "logps/ref_chosen": -74.05264282226562, "logps/ref_rejected": -66.81067657470703, "logps/rejected": -266.5632629394531, "loss": 0.93, "margin_dpo/margin_mean": 70.66688537597656, "margin_dpo/margin_std": 84.40769958496094, "step": 632 }, { "epoch": 0.9569160997732427, "grad_norm": 17.859493255615234, "learning_rate": 2.9348189350335007e-09, "logits/chosen": 0.9254288077354431, "logits/rejected": 0.87281334400177, "logps/chosen": -146.23092651367188, "logps/ref_chosen": -45.47814178466797, "logps/ref_rejected": -66.22723388671875, "logps/rejected": -234.7810516357422, "loss": 1.0317, "margin_dpo/margin_mean": 67.80101776123047, "margin_dpo/margin_std": 67.22663879394531, "step": 633 }, { "epoch": 0.9584278155706727, "grad_norm": 23.980148315429688, "learning_rate": 2.736270983384276e-09, "logits/chosen": 0.8979266285896301, "logits/rejected": 0.8862963914871216, "logps/chosen": -212.658203125, "logps/ref_chosen": -57.611724853515625, "logps/ref_rejected": -54.337623596191406, "logps/rejected": -215.5278778076172, "loss": 1.3386, "margin_dpo/margin_mean": 6.1437668800354, "margin_dpo/margin_std": 45.94617462158203, "step": 634 }, { "epoch": 0.9599395313681028, "grad_norm": 18.18239974975586, "learning_rate": 2.5446395297668287e-09, "logits/chosen": 0.945531964302063, "logits/rejected": 0.9129035472869873, "logps/chosen": -211.33154296875, "logps/ref_chosen": -49.714500427246094, "logps/ref_rejected": -67.80001831054688, "logps/rejected": -269.0682067871094, "loss": 1.1286, "margin_dpo/margin_mean": 39.65115737915039, "margin_dpo/margin_std": 108.38533020019531, "step": 635 }, { "epoch": 0.9614512471655329, "grad_norm": 15.897844314575195, "learning_rate": 2.359929934524829e-09, "logits/chosen": 0.9161902666091919, "logits/rejected": 0.8111059069633484, "logps/chosen": -202.0909423828125, "logps/ref_chosen": -50.605045318603516, "logps/ref_rejected": -77.53902435302734, "logps/rejected": -293.85711669921875, "loss": 0.9728, "margin_dpo/margin_mean": 64.83220672607422, "margin_dpo/margin_std": 55.20343017578125, "step": 636 }, { "epoch": 0.9629629629629629, "grad_norm": 18.38071060180664, "learning_rate": 2.1821473643827137e-09, "logits/chosen": 0.9871467351913452, "logits/rejected": 0.8986793160438538, "logps/chosen": -245.16378784179688, "logps/ref_chosen": -74.90892028808594, "logps/ref_rejected": -97.07221984863281, "logps/rejected": -350.8941650390625, "loss": 1.0848, "margin_dpo/margin_mean": 83.56706237792969, "margin_dpo/margin_std": 141.6215057373047, "step": 637 }, { "epoch": 0.9644746787603931, "grad_norm": 16.282268524169922, "learning_rate": 2.0112967923011646e-09, "logits/chosen": 0.8492100238800049, "logits/rejected": 0.823014497756958, "logps/chosen": -249.89724731445312, "logps/ref_chosen": -67.56559753417969, "logps/ref_rejected": -83.56005859375, "logps/rejected": -284.5699157714844, "loss": 0.998, "margin_dpo/margin_mean": 18.678207397460938, "margin_dpo/margin_std": 95.15095520019531, "step": 638 }, { "epoch": 0.9659863945578231, "grad_norm": 18.59486198425293, "learning_rate": 1.847382997337943e-09, "logits/chosen": 0.9617091417312622, "logits/rejected": 0.8375701308250427, "logps/chosen": -166.54019165039062, "logps/ref_chosen": -40.15863037109375, "logps/ref_rejected": -65.15992736816406, "logps/rejected": -268.45574951171875, "loss": 0.9745, "margin_dpo/margin_mean": 76.91429138183594, "margin_dpo/margin_std": 78.58573913574219, "step": 639 }, { "epoch": 0.9674981103552532, "grad_norm": 19.1875, "learning_rate": 1.690410564514244e-09, "logits/chosen": 0.8923076391220093, "logits/rejected": 0.7876547574996948, "logps/chosen": -186.8981475830078, "logps/ref_chosen": -56.694557189941406, "logps/ref_rejected": -100.86506652832031, "logps/rejected": -306.697265625, "loss": 1.0631, "margin_dpo/margin_mean": 75.62860870361328, "margin_dpo/margin_std": 84.25181579589844, "step": 640 }, { "epoch": 0.9690098261526833, "grad_norm": 16.5269775390625, "learning_rate": 1.5403838846864692e-09, "logits/chosen": 0.9277850389480591, "logits/rejected": 1.0083123445510864, "logps/chosen": -200.206298828125, "logps/ref_chosen": -67.92603302001953, "logps/ref_rejected": -50.31890106201172, "logps/rejected": -228.96484375, "loss": 1.0154, "margin_dpo/margin_mean": 46.365638732910156, "margin_dpo/margin_std": 66.46011352539062, "step": 641 }, { "epoch": 0.9705215419501134, "grad_norm": 26.048898696899414, "learning_rate": 1.3973071544233218e-09, "logits/chosen": 0.9023500084877014, "logits/rejected": 0.9192708134651184, "logps/chosen": -210.30873107910156, "logps/ref_chosen": -66.0007553100586, "logps/ref_rejected": -71.08174133300781, "logps/rejected": -260.4029235839844, "loss": 1.1862, "margin_dpo/margin_mean": 45.013214111328125, "margin_dpo/margin_std": 130.67271423339844, "step": 642 }, { "epoch": 0.9720332577475435, "grad_norm": 18.16506004333496, "learning_rate": 1.261184375888541e-09, "logits/chosen": 0.8603549599647522, "logits/rejected": 0.8541243076324463, "logps/chosen": -236.55422973632812, "logps/ref_chosen": -85.6345443725586, "logps/ref_rejected": -86.74612426757812, "logps/rejected": -362.2666320800781, "loss": 1.0099, "margin_dpo/margin_mean": 124.60084533691406, "margin_dpo/margin_std": 134.28097534179688, "step": 643 }, { "epoch": 0.9735449735449735, "grad_norm": 15.612887382507324, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 1.0640028715133667, "logits/rejected": 1.0360305309295654, "logps/chosen": -185.77064514160156, "logps/ref_chosen": -43.66929626464844, "logps/ref_rejected": -61.423343658447266, "logps/rejected": -251.08602905273438, "loss": 1.2055, "margin_dpo/margin_mean": 47.56130599975586, "margin_dpo/margin_std": 103.23611450195312, "step": 644 }, { "epoch": 0.9750566893424036, "grad_norm": 18.053247451782227, "learning_rate": 1.0098157099674987e-09, "logits/chosen": 0.9287192821502686, "logits/rejected": 0.9003345966339111, "logps/chosen": -219.2329864501953, "logps/ref_chosen": -58.112247467041016, "logps/ref_rejected": -62.135780334472656, "logps/rejected": -301.72528076171875, "loss": 1.0062, "margin_dpo/margin_mean": 78.46878051757812, "margin_dpo/margin_std": 99.51702880859375, "step": 645 }, { "epoch": 0.9765684051398337, "grad_norm": 19.156396865844727, "learning_rate": 8.945768539031783e-10, "logits/chosen": 0.8444064855575562, "logits/rejected": 0.7318651676177979, "logps/chosen": -187.85931396484375, "logps/ref_chosen": -48.80539321899414, "logps/ref_rejected": -79.34556579589844, "logps/rejected": -289.73748779296875, "loss": 1.1103, "margin_dpo/margin_mean": 71.3379898071289, "margin_dpo/margin_std": 87.06074523925781, "step": 646 }, { "epoch": 0.9780801209372638, "grad_norm": 22.503517150878906, "learning_rate": 7.863060120144316e-10, "logits/chosen": 0.9451459050178528, "logits/rejected": 0.7789862155914307, "logps/chosen": -220.54901123046875, "logps/ref_chosen": -63.22569274902344, "logps/ref_rejected": -149.20523071289062, "logps/rejected": -385.1798095703125, "loss": 0.8997, "margin_dpo/margin_mean": 78.65129089355469, "margin_dpo/margin_std": 98.49140930175781, "step": 647 }, { "epoch": 0.9795918367346939, "grad_norm": 15.99667739868164, "learning_rate": 6.850062128694045e-10, "logits/chosen": 0.8275370597839355, "logits/rejected": 0.7766451835632324, "logps/chosen": -226.41717529296875, "logps/ref_chosen": -67.55680084228516, "logps/ref_rejected": -78.43806457519531, "logps/rejected": -259.7667236328125, "loss": 1.0994, "margin_dpo/margin_mean": 22.468292236328125, "margin_dpo/margin_std": 122.872314453125, "step": 648 }, { "epoch": 0.981103552532124, "grad_norm": 26.316551208496094, "learning_rate": 5.906802900412788e-10, "logits/chosen": 0.9751644134521484, "logits/rejected": 0.926426887512207, "logps/chosen": -177.2069091796875, "logps/ref_chosen": -42.90663146972656, "logps/ref_rejected": -64.04318237304688, "logps/rejected": -287.71142578125, "loss": 1.1161, "margin_dpo/margin_mean": 89.36796569824219, "margin_dpo/margin_std": 82.30953979492188, "step": 649 }, { "epoch": 0.982615268329554, "grad_norm": 17.684829711914062, "learning_rate": 5.033308820289184e-10, "logits/chosen": 1.0313094854354858, "logits/rejected": 0.9337575435638428, "logps/chosen": -158.346923828125, "logps/ref_chosen": -44.43085479736328, "logps/ref_rejected": -88.01769256591797, "logps/rejected": -314.0736083984375, "loss": 1.0373, "margin_dpo/margin_mean": 112.13986206054688, "margin_dpo/margin_std": 143.96426391601562, "step": 650 }, { "epoch": 0.9841269841269841, "grad_norm": 16.09817886352539, "learning_rate": 4.2296043218295606e-10, "logits/chosen": 0.8436592221260071, "logits/rejected": 0.8099097013473511, "logps/chosen": -188.1514892578125, "logps/ref_chosen": -54.74213790893555, "logps/ref_rejected": -69.22908020019531, "logps/rejected": -218.41043090820312, "loss": 1.0272, "margin_dpo/margin_mean": 15.772006034851074, "margin_dpo/margin_std": 73.69939422607422, "step": 651 }, { "epoch": 0.9856386999244142, "grad_norm": 21.886402130126953, "learning_rate": 3.4957118863768176e-10, "logits/chosen": 0.9487487077713013, "logits/rejected": 0.9406764507293701, "logps/chosen": -158.75241088867188, "logps/ref_chosen": -39.43302917480469, "logps/ref_rejected": -45.17872619628906, "logps/rejected": -234.00820922851562, "loss": 1.1864, "margin_dpo/margin_mean": 69.51010131835938, "margin_dpo/margin_std": 92.36702728271484, "step": 652 }, { "epoch": 0.9871504157218443, "grad_norm": 16.707674026489258, "learning_rate": 2.831652042480093e-10, "logits/chosen": 0.7637461423873901, "logits/rejected": 0.8205589056015015, "logps/chosen": -223.41229248046875, "logps/ref_chosen": -68.25508117675781, "logps/ref_rejected": -78.01954650878906, "logps/rejected": -293.35223388671875, "loss": 0.9948, "margin_dpo/margin_mean": 60.175498962402344, "margin_dpo/margin_std": 88.66094970703125, "step": 653 }, { "epoch": 0.9886621315192744, "grad_norm": 18.35707664489746, "learning_rate": 2.2374433653205016e-10, "logits/chosen": 0.9188174605369568, "logits/rejected": 0.8119950890541077, "logps/chosen": -183.5347442626953, "logps/ref_chosen": -44.32666778564453, "logps/ref_rejected": -75.5877685546875, "logps/rejected": -248.86541748046875, "loss": 1.062, "margin_dpo/margin_mean": 34.06956100463867, "margin_dpo/margin_std": 89.29640197753906, "step": 654 }, { "epoch": 0.9901738473167044, "grad_norm": 15.2190580368042, "learning_rate": 1.7131024761923852e-10, "logits/chosen": 0.7755975723266602, "logits/rejected": 0.8196406364440918, "logps/chosen": -192.36354064941406, "logps/ref_chosen": -72.61821746826172, "logps/ref_rejected": -61.733245849609375, "logps/rejected": -256.1627197265625, "loss": 0.8594, "margin_dpo/margin_mean": 74.68414306640625, "margin_dpo/margin_std": 70.4334487915039, "step": 655 }, { "epoch": 0.9916855631141346, "grad_norm": 15.999876022338867, "learning_rate": 1.2586440420372934e-10, "logits/chosen": 0.7340442538261414, "logits/rejected": 0.7868098020553589, "logps/chosen": -193.5328369140625, "logps/ref_chosen": -88.81657409667969, "logps/ref_rejected": -67.2658462524414, "logps/rejected": -228.84466552734375, "loss": 0.9997, "margin_dpo/margin_mean": 56.862571716308594, "margin_dpo/margin_std": 70.70635986328125, "step": 656 }, { "epoch": 0.9931972789115646, "grad_norm": 20.399385452270508, "learning_rate": 8.740807750345913e-11, "logits/chosen": 1.0027674436569214, "logits/rejected": 0.8794127702713013, "logps/chosen": -157.93316650390625, "logps/ref_chosen": -34.980865478515625, "logps/ref_rejected": -85.61087036132812, "logps/rejected": -322.360595703125, "loss": 0.9962, "margin_dpo/margin_mean": 113.79745483398438, "margin_dpo/margin_std": 116.89727783203125, "step": 657 }, { "epoch": 0.9947089947089947, "grad_norm": 18.558809280395508, "learning_rate": 5.594234322453539e-11, "logits/chosen": 1.0621709823608398, "logits/rejected": 1.0080801248550415, "logps/chosen": -221.82943725585938, "logps/ref_chosen": -67.89546203613281, "logps/ref_rejected": -78.8687515258789, "logps/rejected": -280.9857177734375, "loss": 1.1266, "margin_dpo/margin_mean": 48.18297576904297, "margin_dpo/margin_std": 105.40860748291016, "step": 658 }, { "epoch": 0.9962207105064248, "grad_norm": 17.150558471679688, "learning_rate": 3.146808153123293e-11, "logits/chosen": 1.0832101106643677, "logits/rejected": 0.9403669834136963, "logps/chosen": -138.60635375976562, "logps/ref_chosen": -34.2476806640625, "logps/ref_rejected": -74.97065734863281, "logps/rejected": -273.552001953125, "loss": 1.091, "margin_dpo/margin_mean": 94.22268676757812, "margin_dpo/margin_std": 84.5680923461914, "step": 659 }, { "epoch": 0.9977324263038548, "grad_norm": 18.795955657958984, "learning_rate": 1.3985977021235829e-11, "logits/chosen": 0.9274425506591797, "logits/rejected": 0.9459260106086731, "logps/chosen": -189.62442016601562, "logps/ref_chosen": -57.3446044921875, "logps/ref_rejected": -62.27751922607422, "logps/rejected": -275.3188171386719, "loss": 0.8589, "margin_dpo/margin_mean": 80.76148223876953, "margin_dpo/margin_std": 117.38458251953125, "step": 660 }, { "epoch": 0.999244142101285, "grad_norm": 18.007423400878906, "learning_rate": 3.4965187065971735e-12, "logits/chosen": 0.883224368095398, "logits/rejected": 0.9031695127487183, "logps/chosen": -226.634521484375, "logps/ref_chosen": -63.84727478027344, "logps/ref_rejected": -49.75703430175781, "logps/rejected": -239.585205078125, "loss": 1.1426, "margin_dpo/margin_mean": 27.040931701660156, "margin_dpo/margin_std": 113.18049621582031, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.1443162902220294, "train_runtime": 1702.0491, "train_samples_per_second": 24.874, "train_steps_per_second": 0.388 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }