{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 100, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "grad_norm": 28.220060348510742, "learning_rate": 0.0, "logits/chosen": 0.13337239623069763, "logits/rejected": 0.12492948770523071, "logps/chosen": -64.5841293334961, "logps/ref_chosen": -64.61280822753906, "logps/ref_rejected": -64.17195129394531, "logps/rejected": -64.14192199707031, "loss": 1.3866, "margin_dpo/margin_mean": -0.0013527870178222656, "margin_dpo/margin_std": 0.2561596930027008, "step": 1 }, { "epoch": 0.0030234315948601664, "grad_norm": 27.82727813720703, "learning_rate": 7.462686567164179e-09, "logits/chosen": 0.09414851665496826, "logits/rejected": 0.07363267242908478, "logps/chosen": -56.101890563964844, "logps/ref_chosen": -56.0989990234375, "logps/ref_rejected": -66.59971618652344, "logps/rejected": -66.64006042480469, "loss": 1.3828, "margin_dpo/margin_mean": 0.03744968771934509, "margin_dpo/margin_std": 0.27811938524246216, "step": 2 }, { "epoch": 0.0045351473922902496, "grad_norm": 31.255678176879883, "learning_rate": 1.4925373134328357e-08, "logits/chosen": 0.09937217831611633, "logits/rejected": 0.061470769345760345, "logps/chosen": -65.42631530761719, "logps/ref_chosen": -65.45726013183594, "logps/ref_rejected": -90.82853698730469, "logps/rejected": -90.77711486816406, "loss": 1.3886, "margin_dpo/margin_mean": -0.02046513557434082, "margin_dpo/margin_std": 0.29388636350631714, "step": 3 }, { "epoch": 0.006046863189720333, "grad_norm": 34.38330841064453, "learning_rate": 2.2388059701492534e-08, "logits/chosen": 0.11287139356136322, "logits/rejected": 0.09666792303323746, "logps/chosen": -76.84639739990234, "logps/ref_chosen": -76.86018371582031, "logps/ref_rejected": -79.91523742675781, "logps/rejected": -79.90673828125, "loss": 1.386, "margin_dpo/margin_mean": 0.005287140607833862, "margin_dpo/margin_std": 0.30723485350608826, "step": 4 }, { "epoch": 0.007558578987150416, "grad_norm": 29.698986053466797, "learning_rate": 2.9850746268656714e-08, "logits/chosen": 0.08247077465057373, "logits/rejected": 0.04365617036819458, "logps/chosen": -62.99720764160156, "logps/ref_chosen": -62.97134017944336, "logps/ref_rejected": -79.91920471191406, "logps/rejected": -79.87831115722656, "loss": 1.3933, "margin_dpo/margin_mean": -0.06676921248435974, "margin_dpo/margin_std": 0.3261260688304901, "step": 5 }, { "epoch": 0.009070294784580499, "grad_norm": 29.868539810180664, "learning_rate": 3.731343283582089e-08, "logits/chosen": 0.14518634974956512, "logits/rejected": 0.10563785582780838, "logps/chosen": -51.37147521972656, "logps/ref_chosen": -51.30736541748047, "logps/ref_rejected": -82.77239227294922, "logps/rejected": -82.7464828491211, "loss": 1.3956, "margin_dpo/margin_mean": -0.09001976251602173, "margin_dpo/margin_std": 0.3415699601173401, "step": 6 }, { "epoch": 0.010582010582010581, "grad_norm": 27.209218978881836, "learning_rate": 4.477611940298507e-08, "logits/chosen": 0.015805965289473534, "logits/rejected": -0.027883023023605347, "logps/chosen": -51.44762420654297, "logps/ref_chosen": -51.45941162109375, "logps/ref_rejected": -66.3828125, "logps/rejected": -66.3854751586914, "loss": 1.385, "margin_dpo/margin_mean": 0.014449506998062134, "margin_dpo/margin_std": 0.22094310820102692, "step": 7 }, { "epoch": 0.012093726379440665, "grad_norm": 28.625896453857422, "learning_rate": 5.223880597014925e-08, "logits/chosen": 0.05074763670563698, "logits/rejected": 0.029051048681139946, "logps/chosen": -62.23204803466797, "logps/ref_chosen": -62.19754409790039, "logps/ref_rejected": -74.66180419921875, "logps/rejected": -74.68563842773438, "loss": 1.3877, "margin_dpo/margin_mean": -0.010671883821487427, "margin_dpo/margin_std": 0.3669050931930542, "step": 8 }, { "epoch": 0.013605442176870748, "grad_norm": 31.59478759765625, "learning_rate": 5.970149253731343e-08, "logits/chosen": 0.15643024444580078, "logits/rejected": 0.09815741330385208, "logps/chosen": -55.671485900878906, "logps/ref_chosen": -55.629722595214844, "logps/ref_rejected": -86.21221923828125, "logps/rejected": -86.2544937133789, "loss": 1.3865, "margin_dpo/margin_mean": 0.0005104541778564453, "margin_dpo/margin_std": 0.29080653190612793, "step": 9 }, { "epoch": 0.015117157974300832, "grad_norm": 29.54948616027832, "learning_rate": 6.71641791044776e-08, "logits/chosen": 0.15007850527763367, "logits/rejected": 0.1181110367178917, "logps/chosen": -62.665287017822266, "logps/ref_chosen": -62.69060134887695, "logps/ref_rejected": -90.61012268066406, "logps/rejected": -90.62495422363281, "loss": 1.3826, "margin_dpo/margin_mean": 0.040148526430130005, "margin_dpo/margin_std": 0.3635545074939728, "step": 10 }, { "epoch": 0.016628873771730914, "grad_norm": 29.36393928527832, "learning_rate": 7.462686567164178e-08, "logits/chosen": 0.10430046170949936, "logits/rejected": 0.09751109778881073, "logps/chosen": -65.7414321899414, "logps/ref_chosen": -65.76712036132812, "logps/ref_rejected": -72.4764633178711, "logps/rejected": -72.47653198242188, "loss": 1.384, "margin_dpo/margin_mean": 0.02575582265853882, "margin_dpo/margin_std": 0.2989434003829956, "step": 11 }, { "epoch": 0.018140589569160998, "grad_norm": 28.3092098236084, "learning_rate": 8.208955223880596e-08, "logits/chosen": 0.026789026334881783, "logits/rejected": 0.010548613965511322, "logps/chosen": -60.722389221191406, "logps/ref_chosen": -60.704891204833984, "logps/ref_rejected": -69.41564178466797, "logps/rejected": -69.4146957397461, "loss": 1.3883, "margin_dpo/margin_mean": -0.018443971872329712, "margin_dpo/margin_std": 0.2729582190513611, "step": 12 }, { "epoch": 0.019652305366591082, "grad_norm": 29.16897201538086, "learning_rate": 8.955223880597014e-08, "logits/chosen": 0.11780130863189697, "logits/rejected": 0.054500848054885864, "logps/chosen": -49.9007568359375, "logps/ref_chosen": -49.90925216674805, "logps/ref_rejected": -92.378173828125, "logps/rejected": -92.35137939453125, "loss": 1.3884, "margin_dpo/margin_mean": -0.0182991623878479, "margin_dpo/margin_std": 0.28390124440193176, "step": 13 }, { "epoch": 0.021164021164021163, "grad_norm": 29.623899459838867, "learning_rate": 9.701492537313432e-08, "logits/chosen": 0.08536244928836823, "logits/rejected": 0.06769540160894394, "logps/chosen": -60.600833892822266, "logps/ref_chosen": -60.61879348754883, "logps/ref_rejected": -71.79306030273438, "logps/rejected": -71.79493713378906, "loss": 1.3846, "margin_dpo/margin_mean": 0.019834458827972412, "margin_dpo/margin_std": 0.32009801268577576, "step": 14 }, { "epoch": 0.022675736961451247, "grad_norm": 33.00798797607422, "learning_rate": 1.044776119402985e-07, "logits/chosen": 0.0717407837510109, "logits/rejected": 0.02849598601460457, "logps/chosen": -63.45430374145508, "logps/ref_chosen": -63.46953582763672, "logps/ref_rejected": -88.88951110839844, "logps/rejected": -88.90917205810547, "loss": 1.383, "margin_dpo/margin_mean": 0.03488925099372864, "margin_dpo/margin_std": 0.29492419958114624, "step": 15 }, { "epoch": 0.02418745275888133, "grad_norm": 26.881074905395508, "learning_rate": 1.1194029850746268e-07, "logits/chosen": 0.10169962048530579, "logits/rejected": 0.06554323434829712, "logps/chosen": -46.536376953125, "logps/ref_chosen": -46.53229904174805, "logps/ref_rejected": -74.27534484863281, "logps/rejected": -74.26747131347656, "loss": 1.3877, "margin_dpo/margin_mean": -0.011950835585594177, "margin_dpo/margin_std": 0.2769685983657837, "step": 16 }, { "epoch": 0.025699168556311415, "grad_norm": 33.024173736572266, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 0.06722284853458405, "logits/rejected": 0.04799235984683037, "logps/chosen": -64.07486724853516, "logps/ref_chosen": -64.07783508300781, "logps/ref_rejected": -86.40876770019531, "logps/rejected": -86.42149353027344, "loss": 1.3851, "margin_dpo/margin_mean": 0.015699952840805054, "margin_dpo/margin_std": 0.3443329334259033, "step": 17 }, { "epoch": 0.027210884353741496, "grad_norm": 27.680784225463867, "learning_rate": 1.2686567164179106e-07, "logits/chosen": 0.09066110849380493, "logits/rejected": 0.04530249536037445, "logps/chosen": -44.830833435058594, "logps/ref_chosen": -44.87433624267578, "logps/ref_rejected": -70.9760513305664, "logps/rejected": -70.99610900878906, "loss": 1.3802, "margin_dpo/margin_mean": 0.06356379389762878, "margin_dpo/margin_std": 0.2836337685585022, "step": 18 }, { "epoch": 0.02872260015117158, "grad_norm": 30.68499183654785, "learning_rate": 1.343283582089552e-07, "logits/chosen": 0.0559040792286396, "logits/rejected": 0.04259665310382843, "logps/chosen": -68.11859130859375, "logps/ref_chosen": -68.1598129272461, "logps/ref_rejected": -81.17138671875, "logps/rejected": -81.19941711425781, "loss": 1.3796, "margin_dpo/margin_mean": 0.06925900280475616, "margin_dpo/margin_std": 0.27486366033554077, "step": 19 }, { "epoch": 0.030234315948601664, "grad_norm": 29.224321365356445, "learning_rate": 1.4179104477611938e-07, "logits/chosen": 0.13296857476234436, "logits/rejected": 0.11000999808311462, "logps/chosen": -53.684165954589844, "logps/ref_chosen": -53.678558349609375, "logps/ref_rejected": -74.16911315917969, "logps/rejected": -74.16539764404297, "loss": 1.3874, "margin_dpo/margin_mean": -0.009327858686447144, "margin_dpo/margin_std": 0.28060251474380493, "step": 20 }, { "epoch": 0.031746031746031744, "grad_norm": 29.44382667541504, "learning_rate": 1.4925373134328355e-07, "logits/chosen": 0.09574359655380249, "logits/rejected": 0.07080109417438507, "logps/chosen": -64.70553588867188, "logps/ref_chosen": -64.70155334472656, "logps/ref_rejected": -81.02095031738281, "logps/rejected": -81.00267028808594, "loss": 1.3887, "margin_dpo/margin_mean": -0.022250384092330933, "margin_dpo/margin_std": 0.29801231622695923, "step": 21 }, { "epoch": 0.03325774754346183, "grad_norm": 29.0031795501709, "learning_rate": 1.5671641791044775e-07, "logits/chosen": 0.010743018239736557, "logits/rejected": -0.010531796142458916, "logps/chosen": -58.05990219116211, "logps/ref_chosen": -58.03599548339844, "logps/ref_rejected": -80.72721862792969, "logps/rejected": -80.74945831298828, "loss": 1.3867, "margin_dpo/margin_mean": -0.0016689598560333252, "margin_dpo/margin_std": 0.27896028757095337, "step": 22 }, { "epoch": 0.03476946334089191, "grad_norm": 32.909122467041016, "learning_rate": 1.6417910447761193e-07, "logits/chosen": 0.1421518325805664, "logits/rejected": 0.11664065718650818, "logps/chosen": -66.34078216552734, "logps/ref_chosen": -66.35609436035156, "logps/ref_rejected": -93.02769470214844, "logps/rejected": -93.0291976928711, "loss": 1.3849, "margin_dpo/margin_mean": 0.01681581139564514, "margin_dpo/margin_std": 0.3285256624221802, "step": 23 }, { "epoch": 0.036281179138321996, "grad_norm": 26.222774505615234, "learning_rate": 1.716417910447761e-07, "logits/chosen": 0.13952991366386414, "logits/rejected": 0.10637363791465759, "logps/chosen": -54.47034454345703, "logps/ref_chosen": -54.461238861083984, "logps/ref_rejected": -68.33817291259766, "logps/rejected": -68.36705780029297, "loss": 1.3845, "margin_dpo/margin_mean": 0.019780874252319336, "margin_dpo/margin_std": 0.25748512148857117, "step": 24 }, { "epoch": 0.03779289493575208, "grad_norm": 29.700237274169922, "learning_rate": 1.7910447761194027e-07, "logits/chosen": 0.10331503301858902, "logits/rejected": 0.05197351798415184, "logps/chosen": -60.05360794067383, "logps/ref_chosen": -60.00420379638672, "logps/ref_rejected": -90.47376251220703, "logps/rejected": -90.52792358398438, "loss": 1.3861, "margin_dpo/margin_mean": 0.0047473907470703125, "margin_dpo/margin_std": 0.3563765287399292, "step": 25 }, { "epoch": 0.039304610733182165, "grad_norm": 29.708221435546875, "learning_rate": 1.8656716417910447e-07, "logits/chosen": 0.12899595499038696, "logits/rejected": 0.10993089526891708, "logps/chosen": -56.8546028137207, "logps/ref_chosen": -56.81915283203125, "logps/ref_rejected": -77.84333038330078, "logps/rejected": -77.90251159667969, "loss": 1.3842, "margin_dpo/margin_mean": 0.02373906970024109, "margin_dpo/margin_std": 0.30973872542381287, "step": 26 }, { "epoch": 0.04081632653061224, "grad_norm": 29.129623413085938, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 0.10666880756616592, "logits/rejected": 0.08166207373142242, "logps/chosen": -62.90531921386719, "logps/ref_chosen": -62.87702178955078, "logps/ref_rejected": -71.34437561035156, "logps/rejected": -71.36808776855469, "loss": 1.387, "margin_dpo/margin_mean": -0.004584580659866333, "margin_dpo/margin_std": 0.285133421421051, "step": 27 }, { "epoch": 0.042328042328042326, "grad_norm": 27.702333450317383, "learning_rate": 2.0149253731343282e-07, "logits/chosen": 0.07180536538362503, "logits/rejected": 0.06273311376571655, "logps/chosen": -59.86304473876953, "logps/ref_chosen": -59.833377838134766, "logps/ref_rejected": -70.39804077148438, "logps/rejected": -70.39251708984375, "loss": 1.39, "margin_dpo/margin_mean": -0.03518790006637573, "margin_dpo/margin_std": 0.2781359553337097, "step": 28 }, { "epoch": 0.04383975812547241, "grad_norm": 32.63071823120117, "learning_rate": 2.08955223880597e-07, "logits/chosen": 0.12691108882427216, "logits/rejected": 0.10950787365436554, "logps/chosen": -74.12796783447266, "logps/ref_chosen": -74.12020111083984, "logps/ref_rejected": -83.33098602294922, "logps/rejected": -83.3443603515625, "loss": 1.3859, "margin_dpo/margin_mean": 0.005606889724731445, "margin_dpo/margin_std": 0.2921278774738312, "step": 29 }, { "epoch": 0.045351473922902494, "grad_norm": 30.414403915405273, "learning_rate": 2.1641791044776117e-07, "logits/chosen": 0.1324155628681183, "logits/rejected": 0.0772828459739685, "logps/chosen": -50.75823211669922, "logps/ref_chosen": -50.75128936767578, "logps/ref_rejected": -89.29063415527344, "logps/rejected": -89.32701873779297, "loss": 1.3836, "margin_dpo/margin_mean": 0.02944222092628479, "margin_dpo/margin_std": 0.33608317375183105, "step": 30 }, { "epoch": 0.04686318972033258, "grad_norm": 34.32074737548828, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 0.11036145687103271, "logits/rejected": 0.06397197395563126, "logps/chosen": -65.36607360839844, "logps/ref_chosen": -65.33675384521484, "logps/ref_rejected": -100.76666259765625, "logps/rejected": -100.819091796875, "loss": 1.3842, "margin_dpo/margin_mean": 0.023108333349227905, "margin_dpo/margin_std": 0.31038618087768555, "step": 31 }, { "epoch": 0.04837490551776266, "grad_norm": 30.099220275878906, "learning_rate": 2.3134328358208954e-07, "logits/chosen": 0.08162057399749756, "logits/rejected": 0.07382632791996002, "logps/chosen": -67.17050170898438, "logps/ref_chosen": -67.18333435058594, "logps/ref_rejected": -82.80763244628906, "logps/rejected": -82.83109283447266, "loss": 1.3829, "margin_dpo/margin_mean": 0.036289215087890625, "margin_dpo/margin_std": 0.3070800304412842, "step": 32 }, { "epoch": 0.049886621315192746, "grad_norm": 31.10877227783203, "learning_rate": 2.388059701492537e-07, "logits/chosen": 0.043758779764175415, "logits/rejected": 0.01711263135075569, "logps/chosen": -64.09259033203125, "logps/ref_chosen": -64.03947448730469, "logps/ref_rejected": -75.68357849121094, "logps/rejected": -75.74140167236328, "loss": 1.3861, "margin_dpo/margin_mean": 0.004706323146820068, "margin_dpo/margin_std": 0.3460730314254761, "step": 33 }, { "epoch": 0.05139833711262283, "grad_norm": 28.492658615112305, "learning_rate": 2.4626865671641786e-07, "logits/chosen": 0.09808081388473511, "logits/rejected": 0.06829625368118286, "logps/chosen": -53.72270202636719, "logps/ref_chosen": -53.66429901123047, "logps/ref_rejected": -65.77989196777344, "logps/rejected": -65.87895202636719, "loss": 1.3825, "margin_dpo/margin_mean": 0.040650635957717896, "margin_dpo/margin_std": 0.2930639982223511, "step": 34 }, { "epoch": 0.05291005291005291, "grad_norm": 27.739458084106445, "learning_rate": 2.537313432835821e-07, "logits/chosen": 0.04755338653922081, "logits/rejected": 0.02539633959531784, "logps/chosen": -61.09136962890625, "logps/ref_chosen": -61.01686096191406, "logps/ref_rejected": -72.78598022460938, "logps/rejected": -72.928955078125, "loss": 1.3797, "margin_dpo/margin_mean": 0.06845930218696594, "margin_dpo/margin_std": 0.3211126923561096, "step": 35 }, { "epoch": 0.05442176870748299, "grad_norm": 28.59171485900879, "learning_rate": 2.611940298507462e-07, "logits/chosen": 0.10078567266464233, "logits/rejected": 0.04806087166070938, "logps/chosen": -50.616050720214844, "logps/ref_chosen": -50.53736114501953, "logps/ref_rejected": -78.11678314208984, "logps/rejected": -78.25202941894531, "loss": 1.381, "margin_dpo/margin_mean": 0.05655008554458618, "margin_dpo/margin_std": 0.3539975881576538, "step": 36 }, { "epoch": 0.055933484504913075, "grad_norm": 37.603458404541016, "learning_rate": 2.686567164179104e-07, "logits/chosen": 0.09838317334651947, "logits/rejected": 0.019203372299671173, "logps/chosen": -59.59420394897461, "logps/ref_chosen": -59.55394744873047, "logps/ref_rejected": -108.27703094482422, "logps/rejected": -108.44245910644531, "loss": 1.3742, "margin_dpo/margin_mean": 0.12516844272613525, "margin_dpo/margin_std": 0.38829922676086426, "step": 37 }, { "epoch": 0.05744520030234316, "grad_norm": 29.565021514892578, "learning_rate": 2.761194029850746e-07, "logits/chosen": 0.06575263291597366, "logits/rejected": 0.05181782692670822, "logps/chosen": -65.838134765625, "logps/ref_chosen": -65.7883529663086, "logps/ref_rejected": -76.1619873046875, "logps/rejected": -76.24898529052734, "loss": 1.383, "margin_dpo/margin_mean": 0.037221550941467285, "margin_dpo/margin_std": 0.39967113733291626, "step": 38 }, { "epoch": 0.05895691609977324, "grad_norm": 29.25869369506836, "learning_rate": 2.8358208955223876e-07, "logits/chosen": 0.13992644846439362, "logits/rejected": 0.1137220561504364, "logps/chosen": -57.26211166381836, "logps/ref_chosen": -57.17680358886719, "logps/ref_rejected": -79.486328125, "logps/rejected": -79.61747741699219, "loss": 1.3822, "margin_dpo/margin_mean": 0.04584622383117676, "margin_dpo/margin_std": 0.43921124935150146, "step": 39 }, { "epoch": 0.06046863189720333, "grad_norm": 31.404504776000977, "learning_rate": 2.9104477611940296e-07, "logits/chosen": 0.12540677189826965, "logits/rejected": 0.07498523592948914, "logps/chosen": -61.43571472167969, "logps/ref_chosen": -61.33416748046875, "logps/ref_rejected": -79.10697174072266, "logps/rejected": -79.22048950195312, "loss": 1.3853, "margin_dpo/margin_mean": 0.0119723379611969, "margin_dpo/margin_std": 0.2859070897102356, "step": 40 }, { "epoch": 0.06198034769463341, "grad_norm": 30.36787986755371, "learning_rate": 2.985074626865671e-07, "logits/chosen": 0.02323339134454727, "logits/rejected": 0.00391228124499321, "logps/chosen": -67.64244842529297, "logps/ref_chosen": -67.54672241210938, "logps/ref_rejected": -83.87788391113281, "logps/rejected": -84.09368133544922, "loss": 1.3747, "margin_dpo/margin_mean": 0.12007108330726624, "margin_dpo/margin_std": 0.36786937713623047, "step": 41 }, { "epoch": 0.06349206349206349, "grad_norm": 29.299169540405273, "learning_rate": 3.059701492537313e-07, "logits/chosen": 0.058651238679885864, "logits/rejected": 0.03681695833802223, "logps/chosen": -61.39659881591797, "logps/ref_chosen": -61.26485824584961, "logps/ref_rejected": -76.3629150390625, "logps/rejected": -76.49739074707031, "loss": 1.3865, "margin_dpo/margin_mean": 0.00273972749710083, "margin_dpo/margin_std": 0.4193479120731354, "step": 42 }, { "epoch": 0.06500377928949358, "grad_norm": 34.54471206665039, "learning_rate": 3.134328358208955e-07, "logits/chosen": 0.08210780471563339, "logits/rejected": 0.07124543190002441, "logps/chosen": -71.9305419921875, "logps/ref_chosen": -71.80902862548828, "logps/ref_rejected": -81.12464141845703, "logps/rejected": -81.28547668457031, "loss": 1.3827, "margin_dpo/margin_mean": 0.0393202006816864, "margin_dpo/margin_std": 0.37330394983291626, "step": 43 }, { "epoch": 0.06651549508692366, "grad_norm": 32.71466064453125, "learning_rate": 3.2089552238805965e-07, "logits/chosen": 0.022556209936738014, "logits/rejected": -0.006862609181553125, "logps/chosen": -66.73717498779297, "logps/ref_chosen": -66.55043029785156, "logps/ref_rejected": -85.06198120117188, "logps/rejected": -85.24290466308594, "loss": 1.3874, "margin_dpo/margin_mean": -0.005816161632537842, "margin_dpo/margin_std": 0.4439757466316223, "step": 44 }, { "epoch": 0.06802721088435375, "grad_norm": 31.814245223999023, "learning_rate": 3.2835820895522385e-07, "logits/chosen": 0.10233305394649506, "logits/rejected": 0.05074525997042656, "logps/chosen": -62.383018493652344, "logps/ref_chosen": -62.243858337402344, "logps/ref_rejected": -92.96665954589844, "logps/rejected": -93.20918273925781, "loss": 1.3763, "margin_dpo/margin_mean": 0.10335150361061096, "margin_dpo/margin_std": 0.3743368983268738, "step": 45 }, { "epoch": 0.06953892668178382, "grad_norm": 31.20781707763672, "learning_rate": 3.3582089552238805e-07, "logits/chosen": 0.13038724660873413, "logits/rejected": 0.08414691686630249, "logps/chosen": -61.61685562133789, "logps/ref_chosen": -61.498905181884766, "logps/ref_rejected": -78.91172790527344, "logps/rejected": -79.18026733398438, "loss": 1.3718, "margin_dpo/margin_mean": 0.1505853533744812, "margin_dpo/margin_std": 0.43144309520721436, "step": 46 }, { "epoch": 0.0710506424792139, "grad_norm": 28.37842559814453, "learning_rate": 3.432835820895522e-07, "logits/chosen": 0.030327381566166878, "logits/rejected": -0.012373650446534157, "logps/chosen": -51.69384002685547, "logps/ref_chosen": -51.578346252441406, "logps/ref_rejected": -68.2215576171875, "logps/rejected": -68.53022003173828, "loss": 1.3674, "margin_dpo/margin_mean": 0.19316792488098145, "margin_dpo/margin_std": 0.33303409814834595, "step": 47 }, { "epoch": 0.07256235827664399, "grad_norm": 26.839597702026367, "learning_rate": 3.507462686567164e-07, "logits/chosen": 0.15095138549804688, "logits/rejected": 0.12109607458114624, "logps/chosen": -52.03797912597656, "logps/ref_chosen": -51.79365158081055, "logps/ref_rejected": -64.22504425048828, "logps/rejected": -64.51231384277344, "loss": 1.3824, "margin_dpo/margin_mean": 0.042948633432388306, "margin_dpo/margin_std": 0.3989154100418091, "step": 48 }, { "epoch": 0.07407407407407407, "grad_norm": 27.287622451782227, "learning_rate": 3.5820895522388055e-07, "logits/chosen": 0.001452181488275528, "logits/rejected": -0.01914474181830883, "logps/chosen": -58.36638641357422, "logps/ref_chosen": -58.13460159301758, "logps/ref_rejected": -64.63206481933594, "logps/rejected": -64.96180725097656, "loss": 1.3769, "margin_dpo/margin_mean": 0.09795981645584106, "margin_dpo/margin_std": 0.40706026554107666, "step": 49 }, { "epoch": 0.07558578987150416, "grad_norm": 28.038366317749023, "learning_rate": 3.6567164179104475e-07, "logits/chosen": 0.10713882744312286, "logits/rejected": 0.07751593738794327, "logps/chosen": -53.1766357421875, "logps/ref_chosen": -52.85643768310547, "logps/ref_rejected": -72.17460632324219, "logps/rejected": -72.57747650146484, "loss": 1.3785, "margin_dpo/margin_mean": 0.0826747715473175, "margin_dpo/margin_std": 0.44336575269699097, "step": 50 }, { "epoch": 0.07709750566893424, "grad_norm": 31.035123825073242, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 0.1037454828619957, "logits/rejected": 0.07543984055519104, "logps/chosen": -63.92970657348633, "logps/ref_chosen": -63.65644073486328, "logps/ref_rejected": -86.1323013305664, "logps/rejected": -86.58238220214844, "loss": 1.3694, "margin_dpo/margin_mean": 0.17682453989982605, "margin_dpo/margin_std": 0.5223456621170044, "step": 51 }, { "epoch": 0.07860922146636433, "grad_norm": 32.71433639526367, "learning_rate": 3.805970149253731e-07, "logits/chosen": 0.10190063714981079, "logits/rejected": 0.0511334203183651, "logps/chosen": -68.15455627441406, "logps/ref_chosen": -67.8402099609375, "logps/ref_rejected": -96.97091674804688, "logps/rejected": -97.4761962890625, "loss": 1.3681, "margin_dpo/margin_mean": 0.19093959033489227, "margin_dpo/margin_std": 0.560549795627594, "step": 52 }, { "epoch": 0.0801209372637944, "grad_norm": 27.500490188598633, "learning_rate": 3.880597014925373e-07, "logits/chosen": 0.08714406192302704, "logits/rejected": 0.07636132836341858, "logps/chosen": -57.211326599121094, "logps/ref_chosen": -56.87813949584961, "logps/ref_rejected": -60.75569152832031, "logps/rejected": -61.279624938964844, "loss": 1.3678, "margin_dpo/margin_mean": 0.1907462179660797, "margin_dpo/margin_std": 0.4284651279449463, "step": 53 }, { "epoch": 0.08163265306122448, "grad_norm": 26.583499908447266, "learning_rate": 3.9552238805970144e-07, "logits/chosen": 0.07808351516723633, "logits/rejected": 0.06246686726808548, "logps/chosen": -47.68182373046875, "logps/ref_chosen": -47.26692199707031, "logps/ref_rejected": -62.19426727294922, "logps/rejected": -62.76362228393555, "loss": 1.3716, "margin_dpo/margin_mean": 0.15445497632026672, "margin_dpo/margin_std": 0.5264220237731934, "step": 54 }, { "epoch": 0.08314436885865457, "grad_norm": 31.259164810180664, "learning_rate": 4.0298507462686564e-07, "logits/chosen": 0.04221351444721222, "logits/rejected": -0.034855540841817856, "logps/chosen": -50.701995849609375, "logps/ref_chosen": -50.32619094848633, "logps/ref_rejected": -92.44389343261719, "logps/rejected": -93.12765502929688, "loss": 1.357, "margin_dpo/margin_mean": 0.30795878171920776, "margin_dpo/margin_std": 0.7030289173126221, "step": 55 }, { "epoch": 0.08465608465608465, "grad_norm": 27.52784538269043, "learning_rate": 4.1044776119402984e-07, "logits/chosen": 0.11271204054355621, "logits/rejected": 0.09056483209133148, "logps/chosen": -57.138084411621094, "logps/ref_chosen": -56.766971588134766, "logps/ref_rejected": -66.30503845214844, "logps/rejected": -66.86466979980469, "loss": 1.3686, "margin_dpo/margin_mean": 0.18851301074028015, "margin_dpo/margin_std": 0.6414389610290527, "step": 56 }, { "epoch": 0.08616780045351474, "grad_norm": 30.54404067993164, "learning_rate": 4.17910447761194e-07, "logits/chosen": 0.11222894489765167, "logits/rejected": 0.047266341745853424, "logps/chosen": -58.253807067871094, "logps/ref_chosen": -57.76774597167969, "logps/ref_rejected": -82.75698852539062, "logps/rejected": -83.54388427734375, "loss": 1.3576, "margin_dpo/margin_mean": 0.30083510279655457, "margin_dpo/margin_std": 0.6732007265090942, "step": 57 }, { "epoch": 0.08767951625094482, "grad_norm": 30.254480361938477, "learning_rate": 4.253731343283582e-07, "logits/chosen": 0.05982009693980217, "logits/rejected": 0.04435000568628311, "logps/chosen": -73.35845947265625, "logps/ref_chosen": -72.76408386230469, "logps/ref_rejected": -84.49275207519531, "logps/rejected": -85.32954406738281, "loss": 1.3651, "margin_dpo/margin_mean": 0.24241399765014648, "margin_dpo/margin_std": 1.0065019130706787, "step": 58 }, { "epoch": 0.08919123204837491, "grad_norm": 26.581438064575195, "learning_rate": 4.3283582089552234e-07, "logits/chosen": 0.1372973918914795, "logits/rejected": 0.0706198588013649, "logps/chosen": -50.35957336425781, "logps/ref_chosen": -49.82077407836914, "logps/ref_rejected": -77.14368438720703, "logps/rejected": -77.98661041259766, "loss": 1.3579, "margin_dpo/margin_mean": 0.30412447452545166, "margin_dpo/margin_std": 0.7922423481941223, "step": 59 }, { "epoch": 0.09070294784580499, "grad_norm": 29.44552993774414, "learning_rate": 4.4029850746268654e-07, "logits/chosen": 0.12459641695022583, "logits/rejected": 0.12324239313602448, "logps/chosen": -63.80072784423828, "logps/ref_chosen": -63.22477340698242, "logps/ref_rejected": -61.360477447509766, "logps/rejected": -62.06523132324219, "loss": 1.3751, "margin_dpo/margin_mean": 0.1287935972213745, "margin_dpo/margin_std": 0.8118811845779419, "step": 60 }, { "epoch": 0.09221466364323508, "grad_norm": 28.075557708740234, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 0.1363564431667328, "logits/rejected": 0.10384637117385864, "logps/chosen": -49.75391387939453, "logps/ref_chosen": -49.01679992675781, "logps/ref_rejected": -74.90817260742188, "logps/rejected": -75.72248840332031, "loss": 1.3813, "margin_dpo/margin_mean": 0.0772022008895874, "margin_dpo/margin_std": 1.0377774238586426, "step": 61 }, { "epoch": 0.09372637944066516, "grad_norm": 28.631183624267578, "learning_rate": 4.552238805970149e-07, "logits/chosen": 0.11217498779296875, "logits/rejected": 0.07311881333589554, "logps/chosen": -63.50098419189453, "logps/ref_chosen": -62.751869201660156, "logps/ref_rejected": -78.93360900878906, "logps/rejected": -79.89679718017578, "loss": 1.3671, "margin_dpo/margin_mean": 0.2140759527683258, "margin_dpo/margin_std": 0.912464439868927, "step": 62 }, { "epoch": 0.09523809523809523, "grad_norm": 31.452425003051758, "learning_rate": 4.626865671641791e-07, "logits/chosen": 0.1870669424533844, "logits/rejected": 0.1623249351978302, "logps/chosen": -61.0701904296875, "logps/ref_chosen": -60.51525115966797, "logps/ref_rejected": -85.11021423339844, "logps/rejected": -86.22477722167969, "loss": 1.3328, "margin_dpo/margin_mean": 0.5596264600753784, "margin_dpo/margin_std": 0.8306376934051514, "step": 63 }, { "epoch": 0.09674981103552532, "grad_norm": 26.20331573486328, "learning_rate": 4.701492537313433e-07, "logits/chosen": 0.0834483653306961, "logits/rejected": 0.05883026495575905, "logps/chosen": -52.03789520263672, "logps/ref_chosen": -51.20684814453125, "logps/ref_rejected": -66.93082427978516, "logps/rejected": -67.8565673828125, "loss": 1.3788, "margin_dpo/margin_mean": 0.09469178318977356, "margin_dpo/margin_std": 0.8809771537780762, "step": 64 }, { "epoch": 0.0982615268329554, "grad_norm": 30.83704376220703, "learning_rate": 4.776119402985074e-07, "logits/chosen": 0.1760270595550537, "logits/rejected": 0.14696374535560608, "logps/chosen": -68.1497802734375, "logps/ref_chosen": -67.2886962890625, "logps/ref_rejected": -74.44281005859375, "logps/rejected": -75.79031372070312, "loss": 1.3414, "margin_dpo/margin_mean": 0.48643139004707336, "margin_dpo/margin_std": 1.1251728534698486, "step": 65 }, { "epoch": 0.09977324263038549, "grad_norm": 29.31989860534668, "learning_rate": 4.850746268656717e-07, "logits/chosen": 0.09745411574840546, "logits/rejected": 0.07364879548549652, "logps/chosen": -71.69542694091797, "logps/ref_chosen": -70.743408203125, "logps/ref_rejected": -77.26499938964844, "logps/rejected": -78.53517150878906, "loss": 1.3582, "margin_dpo/margin_mean": 0.31815215945243835, "margin_dpo/margin_std": 1.1648637056350708, "step": 66 }, { "epoch": 0.10128495842781557, "grad_norm": 29.129894256591797, "learning_rate": 4.925373134328357e-07, "logits/chosen": 0.09226509928703308, "logits/rejected": 0.0366465225815773, "logps/chosen": -61.3585319519043, "logps/ref_chosen": -60.60260009765625, "logps/ref_rejected": -75.22235870361328, "logps/rejected": -76.31483459472656, "loss": 1.3551, "margin_dpo/margin_mean": 0.33653974533081055, "margin_dpo/margin_std": 0.923335611820221, "step": 67 }, { "epoch": 0.10279667422524566, "grad_norm": 31.375276565551758, "learning_rate": 5e-07, "logits/chosen": 0.060302793979644775, "logits/rejected": 0.030451811850070953, "logps/chosen": -78.77995300292969, "logps/ref_chosen": -77.52836608886719, "logps/ref_rejected": -93.17778015136719, "logps/rejected": -94.77074432373047, "loss": 1.3568, "margin_dpo/margin_mean": 0.3413764536380768, "margin_dpo/margin_std": 1.2794163227081299, "step": 68 }, { "epoch": 0.10430839002267574, "grad_norm": 31.02152442932129, "learning_rate": 4.999965034812934e-07, "logits/chosen": 0.09882189333438873, "logits/rejected": 0.05591355264186859, "logps/chosen": -67.13074493408203, "logps/ref_chosen": -65.94305419921875, "logps/ref_rejected": -89.7735595703125, "logps/rejected": -91.55255126953125, "loss": 1.3319, "margin_dpo/margin_mean": 0.5912973880767822, "margin_dpo/margin_std": 1.2093827724456787, "step": 69 }, { "epoch": 0.10582010582010581, "grad_norm": 28.711166381835938, "learning_rate": 4.999860140229787e-07, "logits/chosen": 0.13724008202552795, "logits/rejected": 0.11432601511478424, "logps/chosen": -63.15776824951172, "logps/ref_chosen": -61.957908630371094, "logps/ref_rejected": -75.80946350097656, "logps/rejected": -77.35932159423828, "loss": 1.3551, "margin_dpo/margin_mean": 0.35000741481781006, "margin_dpo/margin_std": 1.1883422136306763, "step": 70 }, { "epoch": 0.1073318216175359, "grad_norm": 28.578266143798828, "learning_rate": 4.999685319184688e-07, "logits/chosen": 0.07535459101200104, "logits/rejected": 0.060557231307029724, "logps/chosen": -64.78665161132812, "logps/ref_chosen": -63.34757995605469, "logps/ref_rejected": -67.49658203125, "logps/rejected": -69.17135620117188, "loss": 1.3678, "margin_dpo/margin_mean": 0.23569674789905548, "margin_dpo/margin_std": 1.394590139389038, "step": 71 }, { "epoch": 0.10884353741496598, "grad_norm": 30.338516235351562, "learning_rate": 4.999440576567755e-07, "logits/chosen": 0.13761760294437408, "logits/rejected": 0.07318543642759323, "logps/chosen": -57.080726623535156, "logps/ref_chosen": -55.85929870605469, "logps/ref_rejected": -68.45423889160156, "logps/rejected": -70.37165069580078, "loss": 1.3222, "margin_dpo/margin_mean": 0.6959859132766724, "margin_dpo/margin_std": 1.2814478874206543, "step": 72 }, { "epoch": 0.11035525321239607, "grad_norm": 31.91270637512207, "learning_rate": 4.999125919224965e-07, "logits/chosen": 0.09682485461235046, "logits/rejected": 0.0830550342798233, "logps/chosen": -70.92161560058594, "logps/ref_chosen": -69.13880920410156, "logps/ref_rejected": -79.04586791992188, "logps/rejected": -81.02145385742188, "loss": 1.3739, "margin_dpo/margin_mean": 0.19277739524841309, "margin_dpo/margin_std": 1.5637190341949463, "step": 73 }, { "epoch": 0.11186696900982615, "grad_norm": 27.909252166748047, "learning_rate": 4.998741355957963e-07, "logits/chosen": 0.12167972326278687, "logits/rejected": 0.07026355713605881, "logps/chosen": -51.18006134033203, "logps/ref_chosen": -49.923736572265625, "logps/ref_rejected": -81.73213958740234, "logps/rejected": -83.69065856933594, "loss": 1.3217, "margin_dpo/margin_mean": 0.7022018432617188, "margin_dpo/margin_std": 1.275315523147583, "step": 74 }, { "epoch": 0.11337868480725624, "grad_norm": 26.173181533813477, "learning_rate": 4.998286897523808e-07, "logits/chosen": 0.12206012010574341, "logits/rejected": 0.08930613100528717, "logps/chosen": -47.47442626953125, "logps/ref_chosen": -46.06875228881836, "logps/ref_rejected": -66.1181411743164, "logps/rejected": -68.2774658203125, "loss": 1.3168, "margin_dpo/margin_mean": 0.7536484003067017, "margin_dpo/margin_std": 1.3362785577774048, "step": 75 }, { "epoch": 0.11489040060468632, "grad_norm": 29.238500595092773, "learning_rate": 4.997762556634679e-07, "logits/chosen": 0.11465884745121002, "logits/rejected": 0.07164441794157028, "logps/chosen": -55.64056396484375, "logps/ref_chosen": -54.06275177001953, "logps/ref_rejected": -74.87464141845703, "logps/rejected": -76.78042602539062, "loss": 1.3597, "margin_dpo/margin_mean": 0.32796770334243774, "margin_dpo/margin_std": 1.5447661876678467, "step": 76 }, { "epoch": 0.1164021164021164, "grad_norm": 29.32517433166504, "learning_rate": 4.99716834795752e-07, "logits/chosen": 0.1788126826286316, "logits/rejected": 0.13609115779399872, "logps/chosen": -54.69091796875, "logps/ref_chosen": -53.07609176635742, "logps/ref_rejected": -74.45601654052734, "logps/rejected": -76.81924438476562, "loss": 1.3182, "margin_dpo/margin_mean": 0.748406171798706, "margin_dpo/margin_std": 1.4287664890289307, "step": 77 }, { "epoch": 0.11791383219954649, "grad_norm": 29.321949005126953, "learning_rate": 4.996504288113623e-07, "logits/chosen": 0.10718972980976105, "logits/rejected": 0.08687709271907806, "logps/chosen": -69.55426025390625, "logps/ref_chosen": -67.72541809082031, "logps/ref_rejected": -79.03927612304688, "logps/rejected": -81.33000183105469, "loss": 1.3456, "margin_dpo/margin_mean": 0.4618911147117615, "margin_dpo/margin_std": 1.4008138179779053, "step": 78 }, { "epoch": 0.11942554799697656, "grad_norm": 30.852628707885742, "learning_rate": 4.995770395678171e-07, "logits/chosen": 0.20388326048851013, "logits/rejected": 0.1409316062927246, "logps/chosen": -53.90932083129883, "logps/ref_chosen": -52.16064453125, "logps/ref_rejected": -83.31062316894531, "logps/rejected": -86.15391540527344, "loss": 1.29, "margin_dpo/margin_mean": 1.0946189165115356, "margin_dpo/margin_std": 2.0189208984375, "step": 79 }, { "epoch": 0.12093726379440665, "grad_norm": 28.484214782714844, "learning_rate": 4.994966691179711e-07, "logits/chosen": 0.1793014109134674, "logits/rejected": 0.11673000454902649, "logps/chosen": -63.449790954589844, "logps/ref_chosen": -61.410560607910156, "logps/ref_rejected": -78.66004943847656, "logps/rejected": -81.23995971679688, "loss": 1.3423, "margin_dpo/margin_mean": 0.5406800508499146, "margin_dpo/margin_std": 1.8861579895019531, "step": 80 }, { "epoch": 0.12244897959183673, "grad_norm": 29.091909408569336, "learning_rate": 4.994093197099587e-07, "logits/chosen": 0.1445290595293045, "logits/rejected": 0.11002925038337708, "logps/chosen": -65.95135498046875, "logps/ref_chosen": -63.80437088012695, "logps/ref_rejected": -79.34840393066406, "logps/rejected": -82.34586334228516, "loss": 1.3111, "margin_dpo/margin_mean": 0.8504737615585327, "margin_dpo/margin_std": 1.7258979082107544, "step": 81 }, { "epoch": 0.12396069538926682, "grad_norm": 26.700366973876953, "learning_rate": 4.993149937871306e-07, "logits/chosen": 0.12470388412475586, "logits/rejected": 0.0606868639588356, "logps/chosen": -50.59052276611328, "logps/ref_chosen": -48.817893981933594, "logps/ref_rejected": -70.31497955322266, "logps/rejected": -73.132080078125, "loss": 1.2905, "margin_dpo/margin_mean": 1.0444717407226562, "margin_dpo/margin_std": 1.529382586479187, "step": 82 }, { "epoch": 0.1254724111866969, "grad_norm": 29.110549926757812, "learning_rate": 4.992136939879856e-07, "logits/chosen": 0.2097686380147934, "logits/rejected": 0.15886147320270538, "logps/chosen": -59.19554138183594, "logps/ref_chosen": -57.15077209472656, "logps/ref_rejected": -75.1710205078125, "logps/rejected": -78.39938354492188, "loss": 1.2798, "margin_dpo/margin_mean": 1.183598518371582, "margin_dpo/margin_std": 1.8257801532745361, "step": 83 }, { "epoch": 0.12698412698412698, "grad_norm": 30.618595123291016, "learning_rate": 4.991054231460969e-07, "logits/chosen": 0.17933443188667297, "logits/rejected": 0.13655216991901398, "logps/chosen": -67.2566146850586, "logps/ref_chosen": -64.77730560302734, "logps/ref_rejected": -84.71949768066406, "logps/rejected": -88.12132263183594, "loss": 1.3053, "margin_dpo/margin_mean": 0.9225126504898071, "margin_dpo/margin_std": 1.9059677124023438, "step": 84 }, { "epoch": 0.12849584278155707, "grad_norm": 26.86264419555664, "learning_rate": 4.989901842900325e-07, "logits/chosen": 0.17478173971176147, "logits/rejected": 0.13025707006454468, "logps/chosen": -52.47950744628906, "logps/ref_chosen": -50.25169372558594, "logps/ref_rejected": -66.55438995361328, "logps/rejected": -70.04747009277344, "loss": 1.274, "margin_dpo/margin_mean": 1.265273094177246, "margin_dpo/margin_std": 1.9645485877990723, "step": 85 }, { "epoch": 0.13000755857898716, "grad_norm": 27.148221969604492, "learning_rate": 4.988679806432711e-07, "logits/chosen": 0.20728754997253418, "logits/rejected": 0.1880839616060257, "logps/chosen": -63.547637939453125, "logps/ref_chosen": -60.72917938232422, "logps/ref_rejected": -72.30960845947266, "logps/rejected": -76.11151123046875, "loss": 1.3012, "margin_dpo/margin_mean": 0.9834346771240234, "margin_dpo/margin_std": 2.0893325805664062, "step": 86 }, { "epoch": 0.13151927437641722, "grad_norm": 30.48325538635254, "learning_rate": 4.987388156241114e-07, "logits/chosen": 0.17650364339351654, "logits/rejected": 0.11666995286941528, "logps/chosen": -68.77723693847656, "logps/ref_chosen": -65.75796508789062, "logps/ref_rejected": -84.81159973144531, "logps/rejected": -88.91807556152344, "loss": 1.2962, "margin_dpo/margin_mean": 1.0872149467468262, "margin_dpo/margin_std": 2.498264789581299, "step": 87 }, { "epoch": 0.1330309901738473, "grad_norm": 30.30896759033203, "learning_rate": 4.986026928455767e-07, "logits/chosen": 0.22927185893058777, "logits/rejected": 0.20243728160858154, "logps/chosen": -65.86044311523438, "logps/ref_chosen": -62.82402801513672, "logps/ref_rejected": -74.9607162475586, "logps/rejected": -78.82502746582031, "loss": 1.3276, "margin_dpo/margin_mean": 0.8278965950012207, "margin_dpo/margin_std": 2.913116455078125, "step": 88 }, { "epoch": 0.1345427059712774, "grad_norm": 29.144811630249023, "learning_rate": 4.984596161153135e-07, "logits/chosen": 0.2610534727573395, "logits/rejected": 0.17740146815776825, "logps/chosen": -43.72743225097656, "logps/ref_chosen": -41.191436767578125, "logps/ref_rejected": -85.44769287109375, "logps/rejected": -89.39289855957031, "loss": 1.267, "margin_dpo/margin_mean": 1.409203290939331, "margin_dpo/margin_std": 2.509359359741211, "step": 89 }, { "epoch": 0.1360544217687075, "grad_norm": 29.26353645324707, "learning_rate": 4.983095894354857e-07, "logits/chosen": 0.18076924979686737, "logits/rejected": 0.12345144152641296, "logps/chosen": -59.70683670043945, "logps/ref_chosen": -56.58390808105469, "logps/ref_rejected": -86.86978149414062, "logps/rejected": -91.25166320800781, "loss": 1.2798, "margin_dpo/margin_mean": 1.2589483261108398, "margin_dpo/margin_std": 2.442626476287842, "step": 90 }, { "epoch": 0.13756613756613756, "grad_norm": 25.777061462402344, "learning_rate": 4.98152617002662e-07, "logits/chosen": 0.17539820075035095, "logits/rejected": 0.13064050674438477, "logps/chosen": -55.531593322753906, "logps/ref_chosen": -52.38234329223633, "logps/ref_rejected": -72.17642211914062, "logps/rejected": -76.45957946777344, "loss": 1.2953, "margin_dpo/margin_mean": 1.1339049339294434, "margin_dpo/margin_std": 2.7821226119995117, "step": 91 }, { "epoch": 0.13907785336356765, "grad_norm": 27.002519607543945, "learning_rate": 4.979887032076988e-07, "logits/chosen": 0.20402228832244873, "logits/rejected": 0.16468676924705505, "logps/chosen": -56.24298095703125, "logps/ref_chosen": -53.00870132446289, "logps/ref_rejected": -79.77813720703125, "logps/rejected": -84.57749938964844, "loss": 1.2601, "margin_dpo/margin_mean": 1.5650835037231445, "margin_dpo/margin_std": 3.127380847930908, "step": 92 }, { "epoch": 0.14058956916099774, "grad_norm": 24.539724349975586, "learning_rate": 4.978178526356172e-07, "logits/chosen": 0.18196739256381989, "logits/rejected": 0.1539272964000702, "logps/chosen": -48.484527587890625, "logps/ref_chosen": -44.90705108642578, "logps/ref_rejected": -58.7879524230957, "logps/rejected": -63.36863708496094, "loss": 1.3137, "margin_dpo/margin_mean": 1.0032141208648682, "margin_dpo/margin_std": 3.2164130210876465, "step": 93 }, { "epoch": 0.1421012849584278, "grad_norm": 27.276403427124023, "learning_rate": 4.976400700654751e-07, "logits/chosen": 0.2772451341152191, "logits/rejected": 0.23560284078121185, "logps/chosen": -63.25147247314453, "logps/ref_chosen": -59.93777084350586, "logps/ref_rejected": -79.3138427734375, "logps/rejected": -84.50923156738281, "loss": 1.2464, "margin_dpo/margin_mean": 1.8816919326782227, "margin_dpo/margin_std": 4.022270202636719, "step": 94 }, { "epoch": 0.1436130007558579, "grad_norm": 29.317035675048828, "learning_rate": 4.974553604702332e-07, "logits/chosen": 0.15013039112091064, "logits/rejected": 0.0862259641289711, "logps/chosen": -64.44532775878906, "logps/ref_chosen": -60.168487548828125, "logps/ref_rejected": -90.73665618896484, "logps/rejected": -96.60997009277344, "loss": 1.2614, "margin_dpo/margin_mean": 1.5964728593826294, "margin_dpo/margin_std": 3.3925909996032715, "step": 95 }, { "epoch": 0.14512471655328799, "grad_norm": 27.277469635009766, "learning_rate": 4.972637290166157e-07, "logits/chosen": 0.20983225107192993, "logits/rejected": 0.1660861372947693, "logps/chosen": -64.80738830566406, "logps/ref_chosen": -60.66877746582031, "logps/ref_rejected": -88.30673217773438, "logps/rejected": -94.34305572509766, "loss": 1.256, "margin_dpo/margin_mean": 1.8977141380310059, "margin_dpo/margin_std": 4.532830238342285, "step": 96 }, { "epoch": 0.14663643235071808, "grad_norm": 36.27639389038086, "learning_rate": 4.970651810649666e-07, "logits/chosen": 0.14166215062141418, "logits/rejected": 0.09780453145503998, "logps/chosen": -70.00448608398438, "logps/ref_chosen": -65.04412841796875, "logps/ref_rejected": -78.42092895507812, "logps/rejected": -84.42875671386719, "loss": 1.3444, "margin_dpo/margin_mean": 1.0474696159362793, "margin_dpo/margin_std": 4.991357803344727, "step": 97 }, { "epoch": 0.14814814814814814, "grad_norm": 30.31495475769043, "learning_rate": 4.968597221690985e-07, "logits/chosen": 0.22561600804328918, "logits/rejected": 0.1984190046787262, "logps/chosen": -60.15273666381836, "logps/ref_chosen": -55.503231048583984, "logps/ref_rejected": -72.81553649902344, "logps/rejected": -78.14527893066406, "loss": 1.3466, "margin_dpo/margin_mean": 0.680237352848053, "margin_dpo/margin_std": 3.2830731868743896, "step": 98 }, { "epoch": 0.14965986394557823, "grad_norm": 32.687068939208984, "learning_rate": 4.966473580761389e-07, "logits/chosen": 0.24951310455799103, "logits/rejected": 0.20980459451675415, "logps/chosen": -63.30138397216797, "logps/ref_chosen": -58.57563781738281, "logps/ref_rejected": -78.69361114501953, "logps/rejected": -84.39778137207031, "loss": 1.3425, "margin_dpo/margin_mean": 0.9784270524978638, "margin_dpo/margin_std": 4.579858779907227, "step": 99 }, { "epoch": 0.15117157974300832, "grad_norm": 33.35084533691406, "learning_rate": 4.964280947263676e-07, "logits/chosen": 0.22355619072914124, "logits/rejected": 0.21577070653438568, "logps/chosen": -84.90021514892578, "logps/ref_chosen": -79.58343505859375, "logps/ref_rejected": -92.152587890625, "logps/rejected": -98.91537475585938, "loss": 1.3342, "margin_dpo/margin_mean": 1.4460134506225586, "margin_dpo/margin_std": 5.88123893737793, "step": 100 }, { "epoch": 0.15117157974300832, "eval_logits/chosen": 0.2556447982788086, "eval_logits/rejected": 0.21831558644771576, "eval_logps/chosen": -79.70143127441406, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -85.81148529052734, "eval_loss": 0.6557220816612244, "eval_margin_dpo/margin_mean": 1.4205234050750732, "eval_margin_dpo/margin_std": 4.978596210479736, "eval_runtime": 38.9596, "eval_samples_per_second": 59.113, "eval_steps_per_second": 1.848, "step": 100 }, { "epoch": 0.15268329554043839, "grad_norm": 25.8350772857666, "learning_rate": 4.96201938253052e-07, "logits/chosen": 0.24167990684509277, "logits/rejected": 0.20206353068351746, "logps/chosen": -56.619720458984375, "logps/ref_chosen": -52.332786560058594, "logps/ref_rejected": -69.55589294433594, "logps/rejected": -75.89532470703125, "loss": 1.2349, "margin_dpo/margin_mean": 2.0524988174438477, "margin_dpo/margin_std": 4.153055667877197, "step": 101 }, { "epoch": 0.15419501133786848, "grad_norm": 32.661865234375, "learning_rate": 4.959688949822748e-07, "logits/chosen": 0.16603882610797882, "logits/rejected": 0.12703979015350342, "logps/chosen": -70.07479858398438, "logps/ref_chosen": -64.74348449707031, "logps/ref_rejected": -69.06133270263672, "logps/rejected": -75.20350646972656, "loss": 1.3772, "margin_dpo/margin_mean": 0.8108617067337036, "margin_dpo/margin_std": 5.474085807800293, "step": 102 }, { "epoch": 0.15570672713529857, "grad_norm": 29.438844680786133, "learning_rate": 4.957289714327572e-07, "logits/chosen": 0.2649979591369629, "logits/rejected": 0.23107215762138367, "logps/chosen": -68.64143371582031, "logps/ref_chosen": -63.836647033691406, "logps/ref_rejected": -79.3236312866211, "logps/rejected": -85.67254638671875, "loss": 1.2717, "margin_dpo/margin_mean": 1.5441327095031738, "margin_dpo/margin_std": 3.6750550270080566, "step": 103 }, { "epoch": 0.15721844293272866, "grad_norm": 30.968130111694336, "learning_rate": 4.954821743156767e-07, "logits/chosen": 0.2689138352870941, "logits/rejected": 0.18012598156929016, "logps/chosen": -65.74430847167969, "logps/ref_chosen": -60.99920654296875, "logps/ref_rejected": -98.8464584350586, "logps/rejected": -105.85238647460938, "loss": 1.2669, "margin_dpo/margin_mean": 2.2608296871185303, "margin_dpo/margin_std": 6.357419013977051, "step": 104 }, { "epoch": 0.15873015873015872, "grad_norm": 30.747453689575195, "learning_rate": 4.952285105344791e-07, "logits/chosen": 0.21757668256759644, "logits/rejected": 0.16157402098178864, "logps/chosen": -76.26542663574219, "logps/ref_chosen": -70.95027160644531, "logps/ref_rejected": -87.88340759277344, "logps/rejected": -95.04217529296875, "loss": 1.27, "margin_dpo/margin_mean": 1.8436135053634644, "margin_dpo/margin_std": 4.962711334228516, "step": 105 }, { "epoch": 0.1602418745275888, "grad_norm": 30.977380752563477, "learning_rate": 4.949679871846857e-07, "logits/chosen": 0.26043418049812317, "logits/rejected": 0.245887890458107, "logps/chosen": -67.49542236328125, "logps/ref_chosen": -62.45933151245117, "logps/ref_rejected": -67.00595092773438, "logps/rejected": -73.60855102539062, "loss": 1.3035, "margin_dpo/margin_mean": 1.566506266593933, "margin_dpo/margin_std": 5.295645713806152, "step": 106 }, { "epoch": 0.1617535903250189, "grad_norm": 42.034934997558594, "learning_rate": 4.947006115536947e-07, "logits/chosen": 0.198299378156662, "logits/rejected": 0.1734483540058136, "logps/chosen": -82.38461303710938, "logps/ref_chosen": -75.83796691894531, "logps/ref_rejected": -87.74038696289062, "logps/rejected": -95.11250305175781, "loss": 1.3998, "margin_dpo/margin_mean": 0.8254714012145996, "margin_dpo/margin_std": 6.266752243041992, "step": 107 }, { "epoch": 0.16326530612244897, "grad_norm": 28.688661575317383, "learning_rate": 4.944263911205772e-07, "logits/chosen": 0.20003950595855713, "logits/rejected": 0.16793784499168396, "logps/chosen": -73.90744018554688, "logps/ref_chosen": -68.39323425292969, "logps/ref_rejected": -83.24267578125, "logps/rejected": -90.59925842285156, "loss": 1.2797, "margin_dpo/margin_mean": 1.842378854751587, "margin_dpo/margin_std": 5.342957496643066, "step": 108 }, { "epoch": 0.16477702191987906, "grad_norm": 28.382535934448242, "learning_rate": 4.941453335558681e-07, "logits/chosen": 0.18867141008377075, "logits/rejected": 0.13436651229858398, "logps/chosen": -60.398643493652344, "logps/ref_chosen": -55.52748107910156, "logps/ref_rejected": -83.55218505859375, "logps/rejected": -91.43389129638672, "loss": 1.1746, "margin_dpo/margin_mean": 3.0105397701263428, "margin_dpo/margin_std": 5.49082088470459, "step": 109 }, { "epoch": 0.16628873771730915, "grad_norm": 44.29096984863281, "learning_rate": 4.938574467213517e-07, "logits/chosen": 0.16931723058223724, "logits/rejected": 0.17854322493076324, "logps/chosen": -87.40373229980469, "logps/ref_chosen": -81.15874481201172, "logps/ref_rejected": -72.56021118164062, "logps/rejected": -79.09624481201172, "loss": 1.4275, "margin_dpo/margin_mean": 0.2910418212413788, "margin_dpo/margin_std": 5.465949058532715, "step": 110 }, { "epoch": 0.16780045351473924, "grad_norm": 25.80181884765625, "learning_rate": 4.935627386698418e-07, "logits/chosen": 0.3088276982307434, "logits/rejected": 0.2699512839317322, "logps/chosen": -58.064239501953125, "logps/ref_chosen": -52.358985900878906, "logps/ref_rejected": -77.06150817871094, "logps/rejected": -84.72740936279297, "loss": 1.2645, "margin_dpo/margin_mean": 1.9606516361236572, "margin_dpo/margin_std": 5.206435203552246, "step": 111 }, { "epoch": 0.1693121693121693, "grad_norm": 32.21335220336914, "learning_rate": 4.932612176449559e-07, "logits/chosen": 0.19986756145954132, "logits/rejected": 0.13390694558620453, "logps/chosen": -68.38388061523438, "logps/ref_chosen": -63.02006912231445, "logps/ref_rejected": -111.36941528320312, "logps/rejected": -119.13533020019531, "loss": 1.2146, "margin_dpo/margin_mean": 2.4020986557006836, "margin_dpo/margin_std": 4.748931884765625, "step": 112 }, { "epoch": 0.1708238851095994, "grad_norm": 35.52518844604492, "learning_rate": 4.929528920808854e-07, "logits/chosen": 0.1938510537147522, "logits/rejected": 0.15514397621154785, "logps/chosen": -61.15318298339844, "logps/ref_chosen": -55.80766296386719, "logps/ref_rejected": -69.84014129638672, "logps/rejected": -76.59230041503906, "loss": 1.3038, "margin_dpo/margin_mean": 1.406644344329834, "margin_dpo/margin_std": 4.629127502441406, "step": 113 }, { "epoch": 0.17233560090702948, "grad_norm": 29.548250198364258, "learning_rate": 4.92637770602159e-07, "logits/chosen": 0.23794038593769073, "logits/rejected": 0.1779293566942215, "logps/chosen": -71.40888977050781, "logps/ref_chosen": -66.33277130126953, "logps/ref_rejected": -71.61489868164062, "logps/rejected": -79.04914855957031, "loss": 1.2177, "margin_dpo/margin_mean": 2.3581337928771973, "margin_dpo/margin_std": 4.735161781311035, "step": 114 }, { "epoch": 0.17384731670445955, "grad_norm": 28.59156608581543, "learning_rate": 4.923158620234019e-07, "logits/chosen": 0.24258871376514435, "logits/rejected": 0.17910319566726685, "logps/chosen": -61.24739074707031, "logps/ref_chosen": -55.74903869628906, "logps/ref_rejected": -79.59849548339844, "logps/rejected": -86.9781494140625, "loss": 1.2531, "margin_dpo/margin_mean": 1.8812994956970215, "margin_dpo/margin_std": 4.353668689727783, "step": 115 }, { "epoch": 0.17535903250188964, "grad_norm": 25.978452682495117, "learning_rate": 4.91987175349089e-07, "logits/chosen": 0.2222248762845993, "logits/rejected": 0.1548064798116684, "logps/chosen": -54.570945739746094, "logps/ref_chosen": -49.365169525146484, "logps/ref_rejected": -72.84671020507812, "logps/rejected": -80.85499572753906, "loss": 1.1655, "margin_dpo/margin_mean": 2.8025131225585938, "margin_dpo/margin_std": 4.0062384605407715, "step": 116 }, { "epoch": 0.17687074829931973, "grad_norm": 28.063459396362305, "learning_rate": 4.916517197732933e-07, "logits/chosen": 0.228042870759964, "logits/rejected": 0.1916525959968567, "logps/chosen": -62.87383270263672, "logps/ref_chosen": -57.710899353027344, "logps/ref_rejected": -69.77254486083984, "logps/rejected": -76.8323974609375, "loss": 1.2566, "margin_dpo/margin_mean": 1.8969154357910156, "margin_dpo/margin_std": 4.350034713745117, "step": 117 }, { "epoch": 0.17838246409674982, "grad_norm": 27.701557159423828, "learning_rate": 4.913095046794281e-07, "logits/chosen": 0.29179418087005615, "logits/rejected": 0.25057122111320496, "logps/chosen": -57.449615478515625, "logps/ref_chosen": -52.479896545410156, "logps/ref_rejected": -81.35912322998047, "logps/rejected": -88.98104858398438, "loss": 1.1754, "margin_dpo/margin_mean": 2.6522061824798584, "margin_dpo/margin_std": 3.8634443283081055, "step": 118 }, { "epoch": 0.17989417989417988, "grad_norm": 29.29024314880371, "learning_rate": 4.909605396399855e-07, "logits/chosen": 0.20849823951721191, "logits/rejected": 0.16951939463615417, "logps/chosen": -67.72459411621094, "logps/ref_chosen": -61.35767364501953, "logps/ref_rejected": -75.71510314941406, "logps/rejected": -83.93721771240234, "loss": 1.2795, "margin_dpo/margin_mean": 1.855197787284851, "margin_dpo/margin_std": 5.46286678314209, "step": 119 }, { "epoch": 0.18140589569160998, "grad_norm": 28.82642364501953, "learning_rate": 4.906048344162676e-07, "logits/chosen": 0.20952869951725006, "logits/rejected": 0.14921404421329498, "logps/chosen": -65.2104263305664, "logps/ref_chosen": -59.907569885253906, "logps/ref_rejected": -79.6910629272461, "logps/rejected": -87.75076293945312, "loss": 1.1657, "margin_dpo/margin_mean": 2.7568416595458984, "margin_dpo/margin_std": 3.812615394592285, "step": 120 }, { "epoch": 0.18291761148904007, "grad_norm": 26.929418563842773, "learning_rate": 4.902423989581143e-07, "logits/chosen": 0.34039121866226196, "logits/rejected": 0.25128114223480225, "logps/chosen": -61.47722244262695, "logps/ref_chosen": -55.666046142578125, "logps/ref_rejected": -101.56233978271484, "logps/rejected": -110.37584686279297, "loss": 1.1653, "margin_dpo/margin_mean": 3.0023269653320312, "margin_dpo/margin_std": 5.023721694946289, "step": 121 }, { "epoch": 0.18442932728647016, "grad_norm": 30.553749084472656, "learning_rate": 4.898732434036243e-07, "logits/chosen": 0.21363535523414612, "logits/rejected": 0.17875628173351288, "logps/chosen": -69.63571166992188, "logps/ref_chosen": -63.334373474121094, "logps/ref_rejected": -73.67523193359375, "logps/rejected": -82.67070007324219, "loss": 1.2097, "margin_dpo/margin_mean": 2.6941189765930176, "margin_dpo/margin_std": 5.76659631729126, "step": 122 }, { "epoch": 0.18594104308390022, "grad_norm": 27.00295066833496, "learning_rate": 4.894973780788722e-07, "logits/chosen": 0.2702626883983612, "logits/rejected": 0.22773674130439758, "logps/chosen": -62.96525955200195, "logps/ref_chosen": -56.89874267578125, "logps/ref_rejected": -78.97029113769531, "logps/rejected": -87.6001205444336, "loss": 1.1931, "margin_dpo/margin_mean": 2.5633208751678467, "margin_dpo/margin_std": 4.423187732696533, "step": 123 }, { "epoch": 0.1874527588813303, "grad_norm": 27.339513778686523, "learning_rate": 4.89114813497619e-07, "logits/chosen": 0.25164029002189636, "logits/rejected": 0.1909327208995819, "logps/chosen": -63.49843215942383, "logps/ref_chosen": -57.116085052490234, "logps/ref_rejected": -87.93074035644531, "logps/rejected": -97.9527587890625, "loss": 1.1111, "margin_dpo/margin_mean": 3.63966703414917, "margin_dpo/margin_std": 4.890292167663574, "step": 124 }, { "epoch": 0.1889644746787604, "grad_norm": 29.097652435302734, "learning_rate": 4.887255603610184e-07, "logits/chosen": 0.2958196997642517, "logits/rejected": 0.23451215028762817, "logps/chosen": -72.66911315917969, "logps/ref_chosen": -65.7061767578125, "logps/ref_rejected": -91.72711944580078, "logps/rejected": -101.81961059570312, "loss": 1.1585, "margin_dpo/margin_mean": 3.129549980163574, "margin_dpo/margin_std": 5.042257308959961, "step": 125 }, { "epoch": 0.19047619047619047, "grad_norm": 28.814516067504883, "learning_rate": 4.883296295573176e-07, "logits/chosen": 0.12805432081222534, "logits/rejected": 0.1223287507891655, "logps/chosen": -74.755126953125, "logps/ref_chosen": -68.17608642578125, "logps/ref_rejected": -65.1175537109375, "logps/rejected": -73.43314361572266, "loss": 1.2741, "margin_dpo/margin_mean": 1.7365505695343018, "margin_dpo/margin_std": 4.742924690246582, "step": 126 }, { "epoch": 0.19198790627362056, "grad_norm": 26.409942626953125, "learning_rate": 4.87927032161552e-07, "logits/chosen": 0.22645384073257446, "logits/rejected": 0.19578759372234344, "logps/chosen": -68.62709045410156, "logps/ref_chosen": -61.88023376464844, "logps/ref_rejected": -68.46012878417969, "logps/rejected": -78.52706146240234, "loss": 1.1273, "margin_dpo/margin_mean": 3.3200788497924805, "margin_dpo/margin_std": 4.245710372924805, "step": 127 }, { "epoch": 0.19349962207105065, "grad_norm": 27.905487060546875, "learning_rate": 4.875177794352363e-07, "logits/chosen": 0.25532764196395874, "logits/rejected": 0.19668430089950562, "logps/chosen": -74.02288818359375, "logps/ref_chosen": -66.708984375, "logps/ref_rejected": -94.97969055175781, "logps/rejected": -105.91804504394531, "loss": 1.1543, "margin_dpo/margin_mean": 3.6244349479675293, "margin_dpo/margin_std": 6.532609939575195, "step": 128 }, { "epoch": 0.19501133786848074, "grad_norm": 34.52588653564453, "learning_rate": 4.871018828260491e-07, "logits/chosen": 0.2155529260635376, "logits/rejected": 0.2097143977880478, "logps/chosen": -73.59596252441406, "logps/ref_chosen": -65.33882904052734, "logps/ref_rejected": -68.06109619140625, "logps/rejected": -78.3628921508789, "loss": 1.2761, "margin_dpo/margin_mean": 2.044658660888672, "margin_dpo/margin_std": 5.964291572570801, "step": 129 }, { "epoch": 0.1965230536659108, "grad_norm": 30.163326263427734, "learning_rate": 4.866793539675126e-07, "logits/chosen": 0.17354336380958557, "logits/rejected": 0.12331511080265045, "logps/chosen": -66.47359466552734, "logps/ref_chosen": -58.660743713378906, "logps/ref_rejected": -79.24510192871094, "logps/rejected": -90.50035095214844, "loss": 1.1303, "margin_dpo/margin_mean": 3.44240140914917, "margin_dpo/margin_std": 4.8902740478515625, "step": 130 }, { "epoch": 0.1980347694633409, "grad_norm": 25.82981300354004, "learning_rate": 4.86250204678667e-07, "logits/chosen": 0.20545755326747894, "logits/rejected": 0.14028334617614746, "logps/chosen": -59.99113464355469, "logps/ref_chosen": -52.51454162597656, "logps/ref_rejected": -85.18299865722656, "logps/rejected": -97.01617431640625, "loss": 1.0927, "margin_dpo/margin_mean": 4.3565826416015625, "margin_dpo/margin_std": 6.3896894454956055, "step": 131 }, { "epoch": 0.19954648526077098, "grad_norm": 27.41193962097168, "learning_rate": 4.858144469637408e-07, "logits/chosen": 0.275441974401474, "logits/rejected": 0.24331185221672058, "logps/chosen": -73.9671859741211, "logps/ref_chosen": -65.68513488769531, "logps/ref_rejected": -69.54120635986328, "logps/rejected": -81.4193344116211, "loss": 1.1275, "margin_dpo/margin_mean": 3.5960710048675537, "margin_dpo/margin_std": 5.342451095581055, "step": 132 }, { "epoch": 0.20105820105820105, "grad_norm": 31.791473388671875, "learning_rate": 4.853720930118138e-07, "logits/chosen": 0.219748854637146, "logits/rejected": 0.20972420275211334, "logps/chosen": -72.35598754882812, "logps/ref_chosen": -63.598114013671875, "logps/ref_rejected": -73.72798156738281, "logps/rejected": -85.32035827636719, "loss": 1.2049, "margin_dpo/margin_mean": 2.83449649810791, "margin_dpo/margin_std": 5.86693000793457, "step": 133 }, { "epoch": 0.20256991685563114, "grad_norm": 25.357341766357422, "learning_rate": 4.849231551964771e-07, "logits/chosen": 0.31225156784057617, "logits/rejected": 0.2546500563621521, "logps/chosen": -61.913734436035156, "logps/ref_chosen": -53.79457092285156, "logps/ref_rejected": -74.16741943359375, "logps/rejected": -87.17252349853516, "loss": 1.0432, "margin_dpo/margin_mean": 4.8859405517578125, "margin_dpo/margin_std": 6.1819233894348145, "step": 134 }, { "epoch": 0.20408163265306123, "grad_norm": 25.44196128845215, "learning_rate": 4.844676460754862e-07, "logits/chosen": 0.2653396427631378, "logits/rejected": 0.23224963247776031, "logps/chosen": -57.6514892578125, "logps/ref_chosen": -49.441078186035156, "logps/ref_rejected": -65.96878051757812, "logps/rejected": -77.29350280761719, "loss": 1.1794, "margin_dpo/margin_mean": 3.1143112182617188, "margin_dpo/margin_std": 5.866373062133789, "step": 135 }, { "epoch": 0.20559334845049132, "grad_norm": 31.02347755432129, "learning_rate": 4.840055783904106e-07, "logits/chosen": 0.23869748413562775, "logits/rejected": 0.16473013162612915, "logps/chosen": -76.93392944335938, "logps/ref_chosen": -66.75926208496094, "logps/ref_rejected": -94.61787414550781, "logps/rejected": -108.9141845703125, "loss": 1.1644, "margin_dpo/margin_mean": 4.121640205383301, "margin_dpo/margin_std": 8.036227226257324, "step": 136 }, { "epoch": 0.20710506424792138, "grad_norm": 26.123674392700195, "learning_rate": 4.835369650662767e-07, "logits/chosen": 0.26602545380592346, "logits/rejected": 0.23649254441261292, "logps/chosen": -66.29134368896484, "logps/ref_chosen": -56.78379821777344, "logps/ref_rejected": -69.89952087402344, "logps/rejected": -83.55348205566406, "loss": 1.1089, "margin_dpo/margin_mean": 4.146416664123535, "margin_dpo/margin_std": 6.458062171936035, "step": 137 }, { "epoch": 0.20861678004535147, "grad_norm": 27.785795211791992, "learning_rate": 4.830618192112065e-07, "logits/chosen": 0.28734517097473145, "logits/rejected": 0.2500568628311157, "logps/chosen": -70.0023422241211, "logps/ref_chosen": -58.766014099121094, "logps/ref_rejected": -68.12371826171875, "logps/rejected": -83.01116180419922, "loss": 1.1753, "margin_dpo/margin_mean": 3.6511096954345703, "margin_dpo/margin_std": 7.198888778686523, "step": 138 }, { "epoch": 0.21012849584278157, "grad_norm": 30.565093994140625, "learning_rate": 4.825801541160509e-07, "logits/chosen": 0.21461869776248932, "logits/rejected": 0.18480184674263, "logps/chosen": -83.13623046875, "logps/ref_chosen": -71.2255859375, "logps/ref_rejected": -82.1834716796875, "logps/rejected": -98.353759765625, "loss": 1.0982, "margin_dpo/margin_mean": 4.259641170501709, "margin_dpo/margin_std": 6.333117485046387, "step": 139 }, { "epoch": 0.21164021164021163, "grad_norm": 31.754594802856445, "learning_rate": 4.820919832540181e-07, "logits/chosen": 0.24357524514198303, "logits/rejected": 0.19461680948734283, "logps/chosen": -74.13949584960938, "logps/ref_chosen": -63.27766418457031, "logps/ref_rejected": -83.30647277832031, "logps/rejected": -100.13687133789062, "loss": 1.0653, "margin_dpo/margin_mean": 5.968560218811035, "margin_dpo/margin_std": 9.302078247070312, "step": 140 }, { "epoch": 0.21315192743764172, "grad_norm": 29.929162979125977, "learning_rate": 4.815973202802966e-07, "logits/chosen": 0.2960834503173828, "logits/rejected": 0.24834388494491577, "logps/chosen": -73.89019012451172, "logps/ref_chosen": -61.76676940917969, "logps/ref_rejected": -88.60601806640625, "logps/rejected": -106.03081512451172, "loss": 1.0978, "margin_dpo/margin_mean": 5.30136775970459, "margin_dpo/margin_std": 8.73034954071045, "step": 141 }, { "epoch": 0.2146636432350718, "grad_norm": 27.8828067779541, "learning_rate": 4.810961790316729e-07, "logits/chosen": 0.2557069659233093, "logits/rejected": 0.2308621108531952, "logps/chosen": -77.07974243164062, "logps/ref_chosen": -65.2747802734375, "logps/ref_rejected": -81.1378173828125, "logps/rejected": -97.32429504394531, "loss": 1.101, "margin_dpo/margin_mean": 4.381509304046631, "margin_dpo/margin_std": 6.8174028396606445, "step": 142 }, { "epoch": 0.2161753590325019, "grad_norm": 39.272682189941406, "learning_rate": 4.805885735261454e-07, "logits/chosen": 0.287389874458313, "logits/rejected": 0.2696545124053955, "logps/chosen": -75.27376556396484, "logps/ref_chosen": -62.61782455444336, "logps/ref_rejected": -70.39239501953125, "logps/rejected": -86.78271484375, "loss": 1.2447, "margin_dpo/margin_mean": 3.7343828678131104, "margin_dpo/margin_std": 9.431318283081055, "step": 143 }, { "epoch": 0.21768707482993196, "grad_norm": 30.782737731933594, "learning_rate": 4.800745179625307e-07, "logits/chosen": 0.29217907786369324, "logits/rejected": 0.25874489545822144, "logps/chosen": -73.63032531738281, "logps/ref_chosen": -60.80268859863281, "logps/ref_rejected": -79.07284545898438, "logps/rejected": -96.77928161621094, "loss": 1.1405, "margin_dpo/margin_mean": 4.878805637359619, "margin_dpo/margin_std": 9.18574333190918, "step": 144 }, { "epoch": 0.21919879062736206, "grad_norm": 34.85631561279297, "learning_rate": 4.795540267200686e-07, "logits/chosen": 0.23453988134860992, "logits/rejected": 0.2504653036594391, "logps/chosen": -86.54104614257812, "logps/ref_chosen": -74.61146545410156, "logps/ref_rejected": -83.24461364746094, "logps/rejected": -100.37863159179688, "loss": 1.1346, "margin_dpo/margin_mean": 5.204441547393799, "margin_dpo/margin_std": 9.736194610595703, "step": 145 }, { "epoch": 0.22071050642479215, "grad_norm": 30.479814529418945, "learning_rate": 4.790271143580173e-07, "logits/chosen": 0.2428048700094223, "logits/rejected": 0.22861063480377197, "logps/chosen": -69.46223449707031, "logps/ref_chosen": -57.84098434448242, "logps/ref_rejected": -67.47422790527344, "logps/rejected": -84.34223937988281, "loss": 1.0977, "margin_dpo/margin_mean": 5.246753215789795, "margin_dpo/margin_std": 8.723588943481445, "step": 146 }, { "epoch": 0.2222222222222222, "grad_norm": 36.99082565307617, "learning_rate": 4.784937956152489e-07, "logits/chosen": 0.2565242350101471, "logits/rejected": 0.2118988335132599, "logps/chosen": -80.17362976074219, "logps/ref_chosen": -66.8134765625, "logps/ref_rejected": -81.1796875, "logps/rejected": -98.7314224243164, "loss": 1.1908, "margin_dpo/margin_mean": 4.191573619842529, "margin_dpo/margin_std": 9.052894592285156, "step": 147 }, { "epoch": 0.2237339380196523, "grad_norm": 25.193017959594727, "learning_rate": 4.779540854098347e-07, "logits/chosen": 0.38067975640296936, "logits/rejected": 0.3060184121131897, "logps/chosen": -60.72514724731445, "logps/ref_chosen": -48.68775177001953, "logps/ref_rejected": -67.50503540039062, "logps/rejected": -85.04951477050781, "loss": 1.0795, "margin_dpo/margin_mean": 5.507093906402588, "margin_dpo/margin_std": 8.676559448242188, "step": 148 }, { "epoch": 0.2252456538170824, "grad_norm": 27.02715492248535, "learning_rate": 4.774079988386296e-07, "logits/chosen": 0.24912810325622559, "logits/rejected": 0.20178565382957458, "logps/chosen": -69.67440795898438, "logps/ref_chosen": -55.14377975463867, "logps/ref_rejected": -64.79888916015625, "logps/rejected": -85.38681030273438, "loss": 1.0067, "margin_dpo/margin_mean": 6.057290077209473, "margin_dpo/margin_std": 7.906113624572754, "step": 149 }, { "epoch": 0.22675736961451248, "grad_norm": 27.364370346069336, "learning_rate": 4.768555511768486e-07, "logits/chosen": 0.25342485308647156, "logits/rejected": 0.20886380970478058, "logps/chosen": -79.80754089355469, "logps/ref_chosen": -67.47074890136719, "logps/ref_rejected": -89.21170043945312, "logps/rejected": -109.59925842285156, "loss": 0.928, "margin_dpo/margin_mean": 8.050765037536621, "margin_dpo/margin_std": 9.564001083374023, "step": 150 }, { "epoch": 0.22826908541194255, "grad_norm": 24.586162567138672, "learning_rate": 4.762967578776406e-07, "logits/chosen": 0.27967917919158936, "logits/rejected": 0.22348374128341675, "logps/chosen": -62.881961822509766, "logps/ref_chosen": -52.45954132080078, "logps/ref_rejected": -79.06301879882812, "logps/rejected": -98.59596252441406, "loss": 0.8551, "margin_dpo/margin_mean": 9.110525131225586, "margin_dpo/margin_std": 9.856712341308594, "step": 151 }, { "epoch": 0.22978080120937264, "grad_norm": 30.191181182861328, "learning_rate": 4.757316345716553e-07, "logits/chosen": 0.3457234501838684, "logits/rejected": 0.29061341285705566, "logps/chosen": -69.90003967285156, "logps/ref_chosen": -56.5538330078125, "logps/ref_rejected": -76.55074310302734, "logps/rejected": -95.70962524414062, "loss": 1.126, "margin_dpo/margin_mean": 5.8126726150512695, "margin_dpo/margin_std": 10.448650360107422, "step": 152 }, { "epoch": 0.23129251700680273, "grad_norm": 27.93670654296875, "learning_rate": 4.751601970666064e-07, "logits/chosen": 0.23497727513313293, "logits/rejected": 0.1989767700433731, "logps/chosen": -80.46448516845703, "logps/ref_chosen": -68.00689697265625, "logps/ref_rejected": -74.83482360839844, "logps/rejected": -94.14203643798828, "loss": 0.9868, "margin_dpo/margin_mean": 6.849617958068848, "margin_dpo/margin_std": 8.963647842407227, "step": 153 }, { "epoch": 0.2328042328042328, "grad_norm": 31.00896453857422, "learning_rate": 4.745824613468292e-07, "logits/chosen": 0.32073622941970825, "logits/rejected": 0.31813472509384155, "logps/chosen": -72.668701171875, "logps/ref_chosen": -59.222537994384766, "logps/ref_rejected": -64.19132232666016, "logps/rejected": -82.69126892089844, "loss": 1.1691, "margin_dpo/margin_mean": 5.05378532409668, "margin_dpo/margin_std": 10.114835739135742, "step": 154 }, { "epoch": 0.23431594860166288, "grad_norm": 31.16225242614746, "learning_rate": 4.7399844357283393e-07, "logits/chosen": 0.31456199288368225, "logits/rejected": 0.2942940294742584, "logps/chosen": -82.74867248535156, "logps/ref_chosen": -68.45469665527344, "logps/ref_rejected": -77.91763305664062, "logps/rejected": -99.23988342285156, "loss": 1.1028, "margin_dpo/margin_mean": 7.028270721435547, "margin_dpo/margin_std": 12.166418075561523, "step": 155 }, { "epoch": 0.23582766439909297, "grad_norm": 27.807287216186523, "learning_rate": 4.7340816008085305e-07, "logits/chosen": 0.26738643646240234, "logits/rejected": 0.22227174043655396, "logps/chosen": -81.36599731445312, "logps/ref_chosen": -67.26959991455078, "logps/ref_rejected": -86.95914459228516, "logps/rejected": -109.32891845703125, "loss": 0.9253, "margin_dpo/margin_mean": 8.273383140563965, "margin_dpo/margin_std": 10.093043327331543, "step": 156 }, { "epoch": 0.23733938019652306, "grad_norm": 30.04600715637207, "learning_rate": 4.728116273823847e-07, "logits/chosen": 0.2983720600605011, "logits/rejected": 0.27970391511917114, "logps/chosen": -67.21517181396484, "logps/ref_chosen": -54.77287292480469, "logps/ref_rejected": -63.87866973876953, "logps/rejected": -82.85496520996094, "loss": 1.0662, "margin_dpo/margin_mean": 6.533998012542725, "margin_dpo/margin_std": 9.998207092285156, "step": 157 }, { "epoch": 0.23885109599395313, "grad_norm": 30.961042404174805, "learning_rate": 4.7220886216373085e-07, "logits/chosen": 0.3131396174430847, "logits/rejected": 0.274808406829834, "logps/chosen": -78.74344635009766, "logps/ref_chosen": -64.92271423339844, "logps/ref_rejected": -82.23789978027344, "logps/rejected": -103.00406646728516, "loss": 1.059, "margin_dpo/margin_mean": 6.945442199707031, "margin_dpo/margin_std": 11.043777465820312, "step": 158 }, { "epoch": 0.24036281179138322, "grad_norm": 32.95261001586914, "learning_rate": 4.715998812855304e-07, "logits/chosen": 0.3162548840045929, "logits/rejected": 0.27656984329223633, "logps/chosen": -70.92852783203125, "logps/ref_chosen": -57.04698944091797, "logps/ref_rejected": -73.32441711425781, "logps/rejected": -95.10652160644531, "loss": 1.0111, "margin_dpo/margin_mean": 7.900569915771484, "margin_dpo/margin_std": 11.508578300476074, "step": 159 }, { "epoch": 0.2418745275888133, "grad_norm": 29.758676528930664, "learning_rate": 4.7098470178228755e-07, "logits/chosen": 0.19076451659202576, "logits/rejected": 0.14449487626552582, "logps/chosen": -65.33329772949219, "logps/ref_chosen": -49.806915283203125, "logps/ref_rejected": -68.3370132446289, "logps/rejected": -90.4979248046875, "loss": 1.1061, "margin_dpo/margin_mean": 6.634532928466797, "margin_dpo/margin_std": 11.820215225219727, "step": 160 }, { "epoch": 0.24338624338624337, "grad_norm": 28.17973518371582, "learning_rate": 4.703633408618955e-07, "logits/chosen": 0.35479670763015747, "logits/rejected": 0.3140922486782074, "logps/chosen": -67.69532012939453, "logps/ref_chosen": -52.50048828125, "logps/ref_rejected": -66.04540252685547, "logps/rejected": -88.5848617553711, "loss": 1.0352, "margin_dpo/margin_mean": 7.344627857208252, "margin_dpo/margin_std": 11.086959838867188, "step": 161 }, { "epoch": 0.24489795918367346, "grad_norm": 30.024721145629883, "learning_rate": 4.697358159051549e-07, "logits/chosen": 0.3540419638156891, "logits/rejected": 0.3033027648925781, "logps/chosen": -85.53033447265625, "logps/ref_chosen": -69.46919250488281, "logps/ref_rejected": -92.00952911376953, "logps/rejected": -117.64090728759766, "loss": 0.8992, "margin_dpo/margin_mean": 9.570234298706055, "margin_dpo/margin_std": 11.462608337402344, "step": 162 }, { "epoch": 0.24640967498110355, "grad_norm": 27.456750869750977, "learning_rate": 4.691021444652876e-07, "logits/chosen": 0.2802005112171173, "logits/rejected": 0.2320232391357422, "logps/chosen": -64.46339416503906, "logps/ref_chosen": -50.613834381103516, "logps/ref_rejected": -74.62033081054688, "logps/rejected": -97.98037719726562, "loss": 0.9264, "margin_dpo/margin_mean": 9.510485649108887, "margin_dpo/margin_std": 11.316560745239258, "step": 163 }, { "epoch": 0.24792139077853365, "grad_norm": 27.12653350830078, "learning_rate": 4.6846234426744624e-07, "logits/chosen": 0.27665191888809204, "logits/rejected": 0.21187984943389893, "logps/chosen": -69.832275390625, "logps/ref_chosen": -54.848114013671875, "logps/ref_rejected": -79.0630111694336, "logps/rejected": -103.15208435058594, "loss": 0.9709, "margin_dpo/margin_mean": 9.104915618896484, "margin_dpo/margin_std": 12.152335166931152, "step": 164 }, { "epoch": 0.2494331065759637, "grad_norm": 29.926708221435547, "learning_rate": 4.678164332082175e-07, "logits/chosen": 0.3327620327472687, "logits/rejected": 0.2769315242767334, "logps/chosen": -67.74002838134766, "logps/ref_chosen": -51.089210510253906, "logps/ref_rejected": -71.23370361328125, "logps/rejected": -96.0595703125, "loss": 1.0398, "margin_dpo/margin_mean": 8.175054550170898, "margin_dpo/margin_std": 12.092546463012695, "step": 165 }, { "epoch": 0.2509448223733938, "grad_norm": 33.68288803100586, "learning_rate": 4.6716442935512214e-07, "logits/chosen": 0.3111993670463562, "logits/rejected": 0.216147780418396, "logps/chosen": -79.06806182861328, "logps/ref_chosen": -63.19081115722656, "logps/ref_rejected": -93.8402099609375, "logps/rejected": -116.00359344482422, "loss": 1.0843, "margin_dpo/margin_mean": 6.286128997802734, "margin_dpo/margin_std": 10.556337356567383, "step": 166 }, { "epoch": 0.25245653817082386, "grad_norm": 27.461366653442383, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 0.2629314363002777, "logits/rejected": 0.22724226117134094, "logps/chosen": -72.44942474365234, "logps/ref_chosen": -58.92427062988281, "logps/ref_rejected": -72.97377014160156, "logps/rejected": -96.02169036865234, "loss": 0.8741, "margin_dpo/margin_mean": 9.522764205932617, "margin_dpo/margin_std": 10.78989028930664, "step": 167 }, { "epoch": 0.25396825396825395, "grad_norm": 30.712583541870117, "learning_rate": 4.6584221638904767e-07, "logits/chosen": 0.24128466844558716, "logits/rejected": 0.20569536089897156, "logps/chosen": -81.29560089111328, "logps/ref_chosen": -65.65138244628906, "logps/ref_rejected": -79.71418762207031, "logps/rejected": -102.78042602539062, "loss": 1.0124, "margin_dpo/margin_mean": 7.422019004821777, "margin_dpo/margin_std": 10.804079055786133, "step": 168 }, { "epoch": 0.25547996976568405, "grad_norm": 34.614925384521484, "learning_rate": 4.651720442612075e-07, "logits/chosen": 0.3499833941459656, "logits/rejected": 0.31476855278015137, "logps/chosen": -74.23861694335938, "logps/ref_chosen": -61.425865173339844, "logps/ref_rejected": -76.09590148925781, "logps/rejected": -97.2747802734375, "loss": 1.0498, "margin_dpo/margin_mean": 8.366128921508789, "margin_dpo/margin_std": 13.607866287231445, "step": 169 }, { "epoch": 0.25699168556311414, "grad_norm": 28.536113739013672, "learning_rate": 4.6449585330874425e-07, "logits/chosen": 0.2757744789123535, "logits/rejected": 0.2731916606426239, "logps/chosen": -68.78987121582031, "logps/ref_chosen": -56.65319061279297, "logps/ref_rejected": -63.45965576171875, "logps/rejected": -83.94721984863281, "loss": 1.0727, "margin_dpo/margin_mean": 8.3508882522583, "margin_dpo/margin_std": 13.240031242370605, "step": 170 }, { "epoch": 0.2585034013605442, "grad_norm": 31.635536193847656, "learning_rate": 4.6381366244617224e-07, "logits/chosen": 0.3446548283100128, "logits/rejected": 0.2929956614971161, "logps/chosen": -77.2772216796875, "logps/ref_chosen": -63.734764099121094, "logps/ref_rejected": -78.50328063964844, "logps/rejected": -101.05764770507812, "loss": 1.0595, "margin_dpo/margin_mean": 9.011910438537598, "margin_dpo/margin_std": 13.920629501342773, "step": 171 }, { "epoch": 0.2600151171579743, "grad_norm": 30.46820640563965, "learning_rate": 4.631254907558365e-07, "logits/chosen": 0.3391519784927368, "logits/rejected": 0.2795373201370239, "logps/chosen": -66.91981506347656, "logps/ref_chosen": -52.201759338378906, "logps/ref_rejected": -82.85285949707031, "logps/rejected": -105.961181640625, "loss": 1.0236, "margin_dpo/margin_mean": 8.390253067016602, "margin_dpo/margin_std": 12.497659683227539, "step": 172 }, { "epoch": 0.2615268329554044, "grad_norm": 32.092742919921875, "learning_rate": 4.624313574873786e-07, "logits/chosen": 0.33410799503326416, "logits/rejected": 0.24394693970680237, "logps/chosen": -69.11787414550781, "logps/ref_chosen": -55.43472671508789, "logps/ref_rejected": -77.8196792602539, "logps/rejected": -100.34062194824219, "loss": 1.1421, "margin_dpo/margin_mean": 8.837798118591309, "margin_dpo/margin_std": 15.246070861816406, "step": 173 }, { "epoch": 0.26303854875283444, "grad_norm": 32.23822021484375, "learning_rate": 4.61731282057198e-07, "logits/chosen": 0.3190588355064392, "logits/rejected": 0.25044068694114685, "logps/chosen": -72.16645050048828, "logps/ref_chosen": -57.17195129394531, "logps/ref_rejected": -85.47578430175781, "logps/rejected": -109.65440368652344, "loss": 0.9885, "margin_dpo/margin_mean": 9.184125900268555, "margin_dpo/margin_std": 13.11090087890625, "step": 174 }, { "epoch": 0.26455026455026454, "grad_norm": 31.132305145263672, "learning_rate": 4.6102528404790965e-07, "logits/chosen": 0.35476261377334595, "logits/rejected": 0.3234173655509949, "logps/chosen": -80.94275665283203, "logps/ref_chosen": -67.6656265258789, "logps/ref_rejected": -84.36767578125, "logps/rejected": -107.86813354492188, "loss": 0.9698, "margin_dpo/margin_mean": 10.22334098815918, "margin_dpo/margin_std": 14.124544143676758, "step": 175 }, { "epoch": 0.2660619803476946, "grad_norm": 42.15481185913086, "learning_rate": 4.603133832077953e-07, "logits/chosen": 0.28306037187576294, "logits/rejected": 0.2561969459056854, "logps/chosen": -93.82209777832031, "logps/ref_chosen": -77.8587646484375, "logps/ref_rejected": -81.08732604980469, "logps/rejected": -103.56967163085938, "loss": 1.2243, "margin_dpo/margin_mean": 6.519006729125977, "margin_dpo/margin_std": 14.000919342041016, "step": 176 }, { "epoch": 0.2675736961451247, "grad_norm": 33.34611511230469, "learning_rate": 4.5959559945025183e-07, "logits/chosen": 0.40190625190734863, "logits/rejected": 0.3057857155799866, "logps/chosen": -67.558837890625, "logps/ref_chosen": -55.22039794921875, "logps/ref_rejected": -92.54974365234375, "logps/rejected": -117.42526245117188, "loss": 0.7731, "margin_dpo/margin_mean": 12.537084579467773, "margin_dpo/margin_std": 12.930008888244629, "step": 177 }, { "epoch": 0.2690854119425548, "grad_norm": 34.81660079956055, "learning_rate": 4.588719528532341e-07, "logits/chosen": 0.2513394057750702, "logits/rejected": 0.2027273029088974, "logps/chosen": -74.91337585449219, "logps/ref_chosen": -60.81048583984375, "logps/ref_rejected": -81.12973022460938, "logps/rejected": -102.42813110351562, "loss": 1.0615, "margin_dpo/margin_mean": 7.1955084800720215, "margin_dpo/margin_std": 11.241630554199219, "step": 178 }, { "epoch": 0.2705971277399849, "grad_norm": 36.181095123291016, "learning_rate": 4.581424636586928e-07, "logits/chosen": 0.31768059730529785, "logits/rejected": 0.30116671323776245, "logps/chosen": -80.27589416503906, "logps/ref_chosen": -65.67171478271484, "logps/ref_rejected": -75.32586669921875, "logps/rejected": -98.68672180175781, "loss": 1.0505, "margin_dpo/margin_mean": 8.756677627563477, "margin_dpo/margin_std": 13.856939315795898, "step": 179 }, { "epoch": 0.272108843537415, "grad_norm": 32.07508850097656, "learning_rate": 4.5740715227200897e-07, "logits/chosen": 0.1807074397802353, "logits/rejected": 0.1605818122625351, "logps/chosen": -67.91615295410156, "logps/ref_chosen": -56.68280792236328, "logps/ref_rejected": -64.94414520263672, "logps/rejected": -83.39351654052734, "loss": 1.1246, "margin_dpo/margin_mean": 7.216020107269287, "margin_dpo/margin_std": 13.130704879760742, "step": 180 }, { "epoch": 0.273620559334845, "grad_norm": 28.916379928588867, "learning_rate": 4.566660392614228e-07, "logits/chosen": 0.3331793546676636, "logits/rejected": 0.2920447587966919, "logps/chosen": -70.67239379882812, "logps/ref_chosen": -60.77604675292969, "logps/ref_rejected": -83.98361206054688, "logps/rejected": -104.28045654296875, "loss": 0.8325, "margin_dpo/margin_mean": 10.40049934387207, "margin_dpo/margin_std": 11.44849967956543, "step": 181 }, { "epoch": 0.2751322751322751, "grad_norm": 32.2235107421875, "learning_rate": 4.5591914535745817e-07, "logits/chosen": 0.2782343626022339, "logits/rejected": 0.1968703716993332, "logps/chosen": -72.92794036865234, "logps/ref_chosen": -60.2537841796875, "logps/ref_rejected": -89.7706298828125, "logps/rejected": -112.99656677246094, "loss": 0.9611, "margin_dpo/margin_mean": 10.551786422729492, "margin_dpo/margin_std": 14.136862754821777, "step": 182 }, { "epoch": 0.2766439909297052, "grad_norm": 35.716888427734375, "learning_rate": 4.551664914523433e-07, "logits/chosen": 0.2787359952926636, "logits/rejected": 0.25693994760513306, "logps/chosen": -77.08547973632812, "logps/ref_chosen": -61.76142120361328, "logps/ref_rejected": -72.54627990722656, "logps/rejected": -92.31723022460938, "loss": 1.2769, "margin_dpo/margin_mean": 4.446890830993652, "margin_dpo/margin_std": 11.614776611328125, "step": 183 }, { "epoch": 0.2781557067271353, "grad_norm": 26.859216690063477, "learning_rate": 4.544080985994258e-07, "logits/chosen": 0.3913435935974121, "logits/rejected": 0.3254718780517578, "logps/chosen": -56.61007308959961, "logps/ref_chosen": -46.840721130371094, "logps/ref_rejected": -69.3609390258789, "logps/rejected": -86.7547378540039, "loss": 0.9711, "margin_dpo/margin_mean": 7.624444007873535, "margin_dpo/margin_std": 10.045758247375488, "step": 184 }, { "epoch": 0.2796674225245654, "grad_norm": 30.3480224609375, "learning_rate": 4.5364398801258394e-07, "logits/chosen": 0.3209272027015686, "logits/rejected": 0.2729586958885193, "logps/chosen": -64.24158477783203, "logps/ref_chosen": -52.321136474609375, "logps/ref_rejected": -68.3885726928711, "logps/rejected": -88.5174560546875, "loss": 1.1302, "margin_dpo/margin_mean": 8.20844554901123, "margin_dpo/margin_std": 14.266172409057617, "step": 185 }, { "epoch": 0.2811791383219955, "grad_norm": 38.07506561279297, "learning_rate": 4.5287418106563354e-07, "logits/chosen": 0.2616552710533142, "logits/rejected": 0.21807625889778137, "logps/chosen": -77.84188079833984, "logps/ref_chosen": -67.42012786865234, "logps/ref_rejected": -82.50968933105469, "logps/rejected": -101.93424987792969, "loss": 1.0208, "margin_dpo/margin_mean": 9.002808570861816, "margin_dpo/margin_std": 13.614838600158691, "step": 186 }, { "epoch": 0.28269085411942557, "grad_norm": 38.66781234741211, "learning_rate": 4.520986992917297e-07, "logits/chosen": 0.274710476398468, "logits/rejected": 0.2203405201435089, "logps/chosen": -87.93611907958984, "logps/ref_chosen": -75.52549743652344, "logps/ref_rejected": -94.76289367675781, "logps/rejected": -115.23272705078125, "loss": 1.0829, "margin_dpo/margin_mean": 8.059209823608398, "margin_dpo/margin_std": 13.422420501708984, "step": 187 }, { "epoch": 0.2842025699168556, "grad_norm": 33.12516403198242, "learning_rate": 4.5131756438276466e-07, "logits/chosen": 0.2892908751964569, "logits/rejected": 0.24562746286392212, "logps/chosen": -82.09852600097656, "logps/ref_chosen": -71.52333068847656, "logps/ref_rejected": -78.29949951171875, "logps/rejected": -97.32029724121094, "loss": 1.0067, "margin_dpo/margin_mean": 8.44560432434082, "margin_dpo/margin_std": 12.50286865234375, "step": 188 }, { "epoch": 0.2857142857142857, "grad_norm": 34.14213943481445, "learning_rate": 4.5053079818876096e-07, "logits/chosen": 0.29462122917175293, "logits/rejected": 0.3075593411922455, "logps/chosen": -82.23419189453125, "logps/ref_chosen": -72.17626953125, "logps/ref_rejected": -75.26313781738281, "logps/rejected": -93.46210479736328, "loss": 1.0092, "margin_dpo/margin_mean": 8.141053199768066, "margin_dpo/margin_std": 11.562788963317871, "step": 189 }, { "epoch": 0.2872260015117158, "grad_norm": 36.18482208251953, "learning_rate": 4.4973842271726024e-07, "logits/chosen": 0.38970842957496643, "logits/rejected": 0.2420949637889862, "logps/chosen": -64.18421936035156, "logps/ref_chosen": -54.624267578125, "logps/ref_rejected": -101.47068786621094, "logps/rejected": -120.8863525390625, "loss": 0.8885, "margin_dpo/margin_mean": 9.855717658996582, "margin_dpo/margin_std": 11.999580383300781, "step": 190 }, { "epoch": 0.2887377173091459, "grad_norm": 40.0859260559082, "learning_rate": 4.48940460132708e-07, "logits/chosen": 0.355924129486084, "logits/rejected": 0.3264332413673401, "logps/chosen": -85.68058776855469, "logps/ref_chosen": -72.9325180053711, "logps/ref_rejected": -89.95103454589844, "logps/rejected": -110.76531982421875, "loss": 1.0575, "margin_dpo/margin_mean": 8.066211700439453, "margin_dpo/margin_std": 12.901345252990723, "step": 191 }, { "epoch": 0.29024943310657597, "grad_norm": 28.41532325744629, "learning_rate": 4.481369327558329e-07, "logits/chosen": 0.33127346634864807, "logits/rejected": 0.30464470386505127, "logps/chosen": -66.98524475097656, "logps/ref_chosen": -54.001121520996094, "logps/ref_rejected": -63.53154754638672, "logps/rejected": -81.61679077148438, "loss": 1.1598, "margin_dpo/margin_mean": 5.101117134094238, "margin_dpo/margin_std": 10.235267639160156, "step": 192 }, { "epoch": 0.29176114890400606, "grad_norm": 29.145444869995117, "learning_rate": 4.47327863063023e-07, "logits/chosen": 0.27890801429748535, "logits/rejected": 0.25508564710617065, "logps/chosen": -67.446044921875, "logps/ref_chosen": -56.74927520751953, "logps/ref_rejected": -58.80628967285156, "logps/rejected": -78.36878204345703, "loss": 0.9339, "margin_dpo/margin_mean": 8.865718841552734, "margin_dpo/margin_std": 11.468514442443848, "step": 193 }, { "epoch": 0.29327286470143615, "grad_norm": 28.73845863342285, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 0.3396986722946167, "logits/rejected": 0.30936378240585327, "logps/chosen": -66.97239685058594, "logps/ref_chosen": -56.649444580078125, "logps/ref_rejected": -69.98954772949219, "logps/rejected": -88.03752899169922, "loss": 1.0814, "margin_dpo/margin_mean": 7.725024700164795, "margin_dpo/margin_std": 12.432638168334961, "step": 194 }, { "epoch": 0.2947845804988662, "grad_norm": 35.004730224609375, "learning_rate": 4.4569318740967043e-07, "logits/chosen": 0.21706655621528625, "logits/rejected": 0.21985185146331787, "logps/chosen": -83.40870666503906, "logps/ref_chosen": -70.40978240966797, "logps/ref_rejected": -74.39448547363281, "logps/rejected": -96.0592041015625, "loss": 0.9816, "margin_dpo/margin_mean": 8.665790557861328, "margin_dpo/margin_std": 11.95964241027832, "step": 195 }, { "epoch": 0.2962962962962963, "grad_norm": 27.6475830078125, "learning_rate": 4.448676271745197e-07, "logits/chosen": 0.3428490161895752, "logits/rejected": 0.29989540576934814, "logps/chosen": -71.03530883789062, "logps/ref_chosen": -59.227577209472656, "logps/ref_rejected": -83.54757690429688, "logps/rejected": -102.79083251953125, "loss": 1.0041, "margin_dpo/margin_mean": 7.435524940490723, "margin_dpo/margin_std": 10.328752517700195, "step": 196 }, { "epoch": 0.29780801209372637, "grad_norm": 30.879732131958008, "learning_rate": 4.440366160729392e-07, "logits/chosen": 0.4135209619998932, "logits/rejected": 0.36199289560317993, "logps/chosen": -61.91220474243164, "logps/ref_chosen": -51.52912902832031, "logps/ref_rejected": -73.70631408691406, "logps/rejected": -92.97248840332031, "loss": 1.0631, "margin_dpo/margin_mean": 8.88310432434082, "margin_dpo/margin_std": 13.773177146911621, "step": 197 }, { "epoch": 0.29931972789115646, "grad_norm": 29.16172981262207, "learning_rate": 4.432001773500957e-07, "logits/chosen": 0.3768477439880371, "logits/rejected": 0.33511894941329956, "logps/chosen": -70.8149642944336, "logps/ref_chosen": -59.78268051147461, "logps/ref_rejected": -72.24533081054688, "logps/rejected": -92.16888427734375, "loss": 0.9079, "margin_dpo/margin_mean": 8.891267776489258, "margin_dpo/margin_std": 10.59740161895752, "step": 198 }, { "epoch": 0.30083144368858655, "grad_norm": 31.893878936767578, "learning_rate": 4.4235833440297856e-07, "logits/chosen": 0.33347535133361816, "logits/rejected": 0.24467086791992188, "logps/chosen": -69.01676940917969, "logps/ref_chosen": -56.38677215576172, "logps/ref_rejected": -74.56779479980469, "logps/rejected": -94.29923248291016, "loss": 1.1132, "margin_dpo/margin_mean": 7.101439476013184, "margin_dpo/margin_std": 12.053339958190918, "step": 199 }, { "epoch": 0.30234315948601664, "grad_norm": 28.685178756713867, "learning_rate": 4.415111107797445e-07, "logits/chosen": 0.36256861686706543, "logits/rejected": 0.289836585521698, "logps/chosen": -67.39482116699219, "logps/ref_chosen": -57.82432556152344, "logps/ref_rejected": -89.28246307373047, "logps/rejected": -109.56626892089844, "loss": 0.9165, "margin_dpo/margin_mean": 10.713313102722168, "margin_dpo/margin_std": 13.711478233337402, "step": 200 }, { "epoch": 0.30234315948601664, "eval_logits/chosen": 0.3345372676849365, "eval_logits/rejected": 0.28681638836860657, "eval_logps/chosen": -86.55070495605469, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -98.71231079101562, "eval_loss": 0.5447199940681458, "eval_margin_dpo/margin_mean": 7.472083568572998, "eval_margin_dpo/margin_std": 12.559971809387207, "eval_runtime": 38.8204, "eval_samples_per_second": 59.324, "eval_steps_per_second": 1.855, "step": 200 }, { "epoch": 0.30385487528344673, "grad_norm": 36.19288635253906, "learning_rate": 4.4065853017905953e-07, "logits/chosen": 0.37724757194519043, "logits/rejected": 0.3313670754432678, "logps/chosen": -72.42201232910156, "logps/ref_chosen": -58.999759674072266, "logps/ref_rejected": -84.67575073242188, "logps/rejected": -105.98941802978516, "loss": 1.0514, "margin_dpo/margin_mean": 7.891414165496826, "margin_dpo/margin_std": 12.470344543457031, "step": 201 }, { "epoch": 0.30536659108087677, "grad_norm": 30.247343063354492, "learning_rate": 4.3980061644943575e-07, "logits/chosen": 0.30372416973114014, "logits/rejected": 0.23039022088050842, "logps/chosen": -58.71697235107422, "logps/ref_chosen": -47.660648345947266, "logps/ref_rejected": -73.63249206542969, "logps/rejected": -93.1816635131836, "loss": 1.0008, "margin_dpo/margin_mean": 8.492842674255371, "margin_dpo/margin_std": 12.252424240112305, "step": 202 }, { "epoch": 0.30687830687830686, "grad_norm": 34.87704086303711, "learning_rate": 4.3893739358856455e-07, "logits/chosen": 0.34375059604644775, "logits/rejected": 0.26960641145706177, "logps/chosen": -75.08006286621094, "logps/ref_chosen": -62.32553482055664, "logps/ref_rejected": -99.37225341796875, "logps/rejected": -119.60014343261719, "loss": 1.0524, "margin_dpo/margin_mean": 7.473361968994141, "margin_dpo/margin_std": 11.997831344604492, "step": 203 }, { "epoch": 0.30839002267573695, "grad_norm": 28.913715362548828, "learning_rate": 4.380688857426449e-07, "logits/chosen": 0.30120253562927246, "logits/rejected": 0.23232108354568481, "logps/chosen": -62.70631790161133, "logps/ref_chosen": -50.62931442260742, "logps/ref_rejected": -66.60475158691406, "logps/rejected": -87.44784545898438, "loss": 0.9799, "margin_dpo/margin_mean": 8.766094207763672, "margin_dpo/margin_std": 11.864765167236328, "step": 204 }, { "epoch": 0.30990173847316704, "grad_norm": 41.798789978027344, "learning_rate": 4.3719511720570814e-07, "logits/chosen": 0.37054669857025146, "logits/rejected": 0.30491408705711365, "logps/chosen": -83.5518569946289, "logps/ref_chosen": -70.35617065429688, "logps/ref_rejected": -93.39848327636719, "logps/rejected": -114.78820037841797, "loss": 1.0827, "margin_dpo/margin_mean": 8.19404411315918, "margin_dpo/margin_std": 13.707481384277344, "step": 205 }, { "epoch": 0.31141345427059713, "grad_norm": 35.05681610107422, "learning_rate": 4.363161124189387e-07, "logits/chosen": 0.3611377775669098, "logits/rejected": 0.34499603509902954, "logps/chosen": -81.56375122070312, "logps/ref_chosen": -67.64547729492188, "logps/ref_rejected": -79.89584350585938, "logps/rejected": -100.60433959960938, "loss": 1.1781, "margin_dpo/margin_mean": 6.790228843688965, "margin_dpo/margin_std": 13.328733444213867, "step": 206 }, { "epoch": 0.3129251700680272, "grad_norm": 29.690143585205078, "learning_rate": 4.3543189596998986e-07, "logits/chosen": 0.287255197763443, "logits/rejected": 0.22083953022956848, "logps/chosen": -83.5641860961914, "logps/ref_chosen": -67.66419219970703, "logps/ref_rejected": -85.10249328613281, "logps/rejected": -110.21287536621094, "loss": 0.9536, "margin_dpo/margin_mean": 9.210400581359863, "margin_dpo/margin_std": 12.47515869140625, "step": 207 }, { "epoch": 0.3144368858654573, "grad_norm": 35.294986724853516, "learning_rate": 4.3454249259229664e-07, "logits/chosen": 0.3147898316383362, "logits/rejected": 0.28870663046836853, "logps/chosen": -70.04411315917969, "logps/ref_chosen": -57.731712341308594, "logps/ref_rejected": -74.19276428222656, "logps/rejected": -91.46826171875, "loss": 1.2561, "margin_dpo/margin_mean": 4.963096618652344, "margin_dpo/margin_std": 11.9405517578125, "step": 208 }, { "epoch": 0.31594860166288735, "grad_norm": 35.39768981933594, "learning_rate": 4.336479271643833e-07, "logits/chosen": 0.2884509563446045, "logits/rejected": 0.23318856954574585, "logps/chosen": -81.53282165527344, "logps/ref_chosen": -68.55007934570312, "logps/ref_rejected": -87.90542602539062, "logps/rejected": -110.58944702148438, "loss": 0.9925, "margin_dpo/margin_mean": 9.701295852661133, "margin_dpo/margin_std": 13.965337753295898, "step": 209 }, { "epoch": 0.31746031746031744, "grad_norm": 29.025083541870117, "learning_rate": 4.327482247091679e-07, "logits/chosen": 0.36582332849502563, "logits/rejected": 0.26993420720100403, "logps/chosen": -70.67583465576172, "logps/ref_chosen": -57.268272399902344, "logps/ref_rejected": -85.72807312011719, "logps/rejected": -108.93333435058594, "loss": 0.9766, "margin_dpo/margin_mean": 9.797691345214844, "margin_dpo/margin_std": 13.741132736206055, "step": 210 }, { "epoch": 0.31897203325774753, "grad_norm": 35.95180130004883, "learning_rate": 4.3184341039326217e-07, "logits/chosen": 0.373915433883667, "logits/rejected": 0.28178757429122925, "logps/chosen": -65.3688735961914, "logps/ref_chosen": -53.640708923339844, "logps/ref_rejected": -93.03880310058594, "logps/rejected": -113.27926635742188, "loss": 0.9593, "margin_dpo/margin_mean": 8.512306213378906, "margin_dpo/margin_std": 11.634811401367188, "step": 211 }, { "epoch": 0.3204837490551776, "grad_norm": 30.75922966003418, "learning_rate": 4.309335095262675e-07, "logits/chosen": 0.35718491673469543, "logits/rejected": 0.2856866717338562, "logps/chosen": -71.02579498291016, "logps/ref_chosen": -57.36674499511719, "logps/ref_rejected": -79.89643096923828, "logps/rejected": -102.65644836425781, "loss": 0.9821, "margin_dpo/margin_mean": 9.100960731506348, "margin_dpo/margin_std": 12.861712455749512, "step": 212 }, { "epoch": 0.3219954648526077, "grad_norm": 30.642589569091797, "learning_rate": 4.3001854756006724e-07, "logits/chosen": 0.3269196152687073, "logits/rejected": 0.30273616313934326, "logps/chosen": -75.70863342285156, "logps/ref_chosen": -65.22111511230469, "logps/ref_rejected": -80.1810302734375, "logps/rejected": -100.77503967285156, "loss": 0.9715, "margin_dpo/margin_mean": 10.106489181518555, "margin_dpo/margin_std": 14.099427223205566, "step": 213 }, { "epoch": 0.3235071806500378, "grad_norm": 36.855289459228516, "learning_rate": 4.290985500881143e-07, "logits/chosen": 0.22347499430179596, "logits/rejected": 0.2006388008594513, "logps/chosen": -73.53646850585938, "logps/ref_chosen": -61.292327880859375, "logps/ref_rejected": -67.69841003417969, "logps/rejected": -89.87754821777344, "loss": 0.9768, "margin_dpo/margin_mean": 9.935011863708496, "margin_dpo/margin_std": 13.721136093139648, "step": 214 }, { "epoch": 0.3250188964474679, "grad_norm": 32.00320053100586, "learning_rate": 4.281735428447157e-07, "logits/chosen": 0.25742843747138977, "logits/rejected": 0.15232022106647491, "logps/chosen": -77.86520385742188, "logps/ref_chosen": -63.86913299560547, "logps/ref_rejected": -98.7657241821289, "logps/rejected": -123.19509887695312, "loss": 0.9391, "margin_dpo/margin_mean": 10.43331241607666, "margin_dpo/margin_std": 13.504112243652344, "step": 215 }, { "epoch": 0.32653061224489793, "grad_norm": 29.31924819946289, "learning_rate": 4.2724355170431247e-07, "logits/chosen": 0.3576727509498596, "logits/rejected": 0.2741282284259796, "logps/chosen": -80.46449279785156, "logps/ref_chosen": -67.824951171875, "logps/ref_rejected": -96.40231323242188, "logps/rejected": -119.64827728271484, "loss": 0.9112, "margin_dpo/margin_mean": 10.606414794921875, "margin_dpo/margin_std": 13.692004203796387, "step": 216 }, { "epoch": 0.328042328042328, "grad_norm": 28.878402709960938, "learning_rate": 4.26308602680756e-07, "logits/chosen": 0.33748987317085266, "logits/rejected": 0.23344279825687408, "logps/chosen": -74.45117950439453, "logps/ref_chosen": -60.50499725341797, "logps/ref_rejected": -84.26618194580078, "logps/rejected": -109.54447937011719, "loss": 0.8522, "margin_dpo/margin_mean": 11.332113265991211, "margin_dpo/margin_std": 13.563920021057129, "step": 217 }, { "epoch": 0.3295540438397581, "grad_norm": 35.3365592956543, "learning_rate": 4.253687219265803e-07, "logits/chosen": 0.2013692557811737, "logits/rejected": 0.19567659497261047, "logps/chosen": -85.214111328125, "logps/ref_chosen": -70.59431457519531, "logps/ref_rejected": -73.89038848876953, "logps/rejected": -95.47663116455078, "loss": 1.2315, "margin_dpo/margin_mean": 6.9664506912231445, "margin_dpo/margin_std": 14.436254501342773, "step": 218 }, { "epoch": 0.3310657596371882, "grad_norm": 32.062076568603516, "learning_rate": 4.2442393573227043e-07, "logits/chosen": 0.2901223301887512, "logits/rejected": 0.24815303087234497, "logps/chosen": -73.95545959472656, "logps/ref_chosen": -60.490943908691406, "logps/ref_rejected": -75.85001373291016, "logps/rejected": -96.14393615722656, "loss": 1.0681, "margin_dpo/margin_mean": 6.829412460327148, "margin_dpo/margin_std": 11.092742919921875, "step": 219 }, { "epoch": 0.3325774754346183, "grad_norm": 29.108675003051758, "learning_rate": 4.234742705255272e-07, "logits/chosen": 0.3480263352394104, "logits/rejected": 0.28689658641815186, "logps/chosen": -56.813446044921875, "logps/ref_chosen": -45.013397216796875, "logps/ref_rejected": -70.49369812011719, "logps/rejected": -90.70354461669922, "loss": 1.0393, "margin_dpo/margin_mean": 8.409794807434082, "margin_dpo/margin_std": 13.006105422973633, "step": 220 }, { "epoch": 0.3340891912320484, "grad_norm": 30.952327728271484, "learning_rate": 4.22519752870528e-07, "logits/chosen": 0.33196067810058594, "logits/rejected": 0.26177024841308594, "logps/chosen": -70.25636291503906, "logps/ref_chosen": -59.09584045410156, "logps/ref_rejected": -88.64388275146484, "logps/rejected": -108.89505004882812, "loss": 1.0002, "margin_dpo/margin_mean": 9.09065055847168, "margin_dpo/margin_std": 13.789722442626953, "step": 221 }, { "epoch": 0.3356009070294785, "grad_norm": 29.369138717651367, "learning_rate": 4.2156040946718343e-07, "logits/chosen": 0.3795938193798065, "logits/rejected": 0.29519224166870117, "logps/chosen": -67.63729858398438, "logps/ref_chosen": -55.9976921081543, "logps/ref_rejected": -111.94727325439453, "logps/rejected": -135.4719696044922, "loss": 0.8308, "margin_dpo/margin_mean": 11.885089874267578, "margin_dpo/margin_std": 13.283910751342773, "step": 222 }, { "epoch": 0.3371126228269085, "grad_norm": 28.02263069152832, "learning_rate": 4.2059626715039065e-07, "logits/chosen": 0.3661195635795593, "logits/rejected": 0.30957603454589844, "logps/chosen": -72.39132690429688, "logps/ref_chosen": -59.891422271728516, "logps/ref_rejected": -86.28954315185547, "logps/rejected": -109.10917663574219, "loss": 0.8733, "margin_dpo/margin_mean": 10.319726943969727, "margin_dpo/margin_std": 12.016202926635742, "step": 223 }, { "epoch": 0.3386243386243386, "grad_norm": 36.042808532714844, "learning_rate": 4.1962735288928304e-07, "logits/chosen": 0.3896036744117737, "logits/rejected": 0.36814165115356445, "logps/chosen": -77.76066589355469, "logps/ref_chosen": -64.04463195800781, "logps/ref_rejected": -75.05450439453125, "logps/rejected": -94.96039581298828, "loss": 1.123, "margin_dpo/margin_mean": 6.189866065979004, "margin_dpo/margin_std": 11.54200267791748, "step": 224 }, { "epoch": 0.3401360544217687, "grad_norm": 39.249610900878906, "learning_rate": 4.186536937864752e-07, "logits/chosen": 0.3702365756034851, "logits/rejected": 0.257682740688324, "logps/chosen": -79.36314392089844, "logps/ref_chosen": -66.0958251953125, "logps/ref_rejected": -97.68675231933594, "logps/rejected": -120.77241516113281, "loss": 0.9436, "margin_dpo/margin_mean": 9.818339347839355, "margin_dpo/margin_std": 13.009382247924805, "step": 225 }, { "epoch": 0.3416477702191988, "grad_norm": 30.140186309814453, "learning_rate": 4.176753170773052e-07, "logits/chosen": 0.3666003942489624, "logits/rejected": 0.3200559914112091, "logps/chosen": -63.04680633544922, "logps/ref_chosen": -51.4168701171875, "logps/ref_rejected": -66.30068969726562, "logps/rejected": -85.97077941894531, "loss": 1.067, "margin_dpo/margin_mean": 8.040155410766602, "margin_dpo/margin_std": 12.97111701965332, "step": 226 }, { "epoch": 0.3431594860166289, "grad_norm": 37.78723907470703, "learning_rate": 4.166922501290729e-07, "logits/chosen": 0.4130534529685974, "logits/rejected": 0.3729804754257202, "logps/chosen": -70.11088562011719, "logps/ref_chosen": -57.98978042602539, "logps/ref_rejected": -75.05464172363281, "logps/rejected": -95.43928527832031, "loss": 1.0876, "margin_dpo/margin_mean": 8.263538360595703, "margin_dpo/margin_std": 13.883774757385254, "step": 227 }, { "epoch": 0.34467120181405897, "grad_norm": 29.51940155029297, "learning_rate": 4.1570452044027405e-07, "logits/chosen": 0.36011219024658203, "logits/rejected": 0.28398561477661133, "logps/chosen": -68.40899658203125, "logps/ref_chosen": -55.559364318847656, "logps/ref_rejected": -77.02364349365234, "logps/rejected": -98.55033874511719, "loss": 1.0238, "margin_dpo/margin_mean": 8.67706298828125, "margin_dpo/margin_std": 13.047750473022461, "step": 228 }, { "epoch": 0.34618291761148906, "grad_norm": 73.98247528076172, "learning_rate": 4.147121556398312e-07, "logits/chosen": 0.4597111940383911, "logits/rejected": 0.3937884569168091, "logps/chosen": -61.63206481933594, "logps/ref_chosen": -50.79466247558594, "logps/ref_rejected": -78.44740295410156, "logps/rejected": -96.77717590332031, "loss": 1.1032, "margin_dpo/margin_mean": 7.492367744445801, "margin_dpo/margin_std": 12.852076530456543, "step": 229 }, { "epoch": 0.3476946334089191, "grad_norm": 31.33769989013672, "learning_rate": 4.137151834863213e-07, "logits/chosen": 0.33958911895751953, "logits/rejected": 0.33543652296066284, "logps/chosen": -68.4496841430664, "logps/ref_chosen": -56.729225158691406, "logps/ref_rejected": -62.99180603027344, "logps/rejected": -83.51909637451172, "loss": 0.979, "margin_dpo/margin_mean": 8.806828498840332, "margin_dpo/margin_std": 12.270458221435547, "step": 230 }, { "epoch": 0.3492063492063492, "grad_norm": 29.804668426513672, "learning_rate": 4.1271363186719835e-07, "logits/chosen": 0.271785706281662, "logits/rejected": 0.2568974196910858, "logps/chosen": -85.11524200439453, "logps/ref_chosen": -72.59710693359375, "logps/ref_rejected": -86.2322998046875, "logps/rejected": -111.23246002197266, "loss": 0.7668, "margin_dpo/margin_mean": 12.482011795043945, "margin_dpo/margin_std": 12.127299308776855, "step": 231 }, { "epoch": 0.3507180650037793, "grad_norm": 36.537750244140625, "learning_rate": 4.1170752879801436e-07, "logits/chosen": 0.32862627506256104, "logits/rejected": 0.29829728603363037, "logps/chosen": -80.80549621582031, "logps/ref_chosen": -68.1185302734375, "logps/ref_rejected": -83.79415893554688, "logps/rejected": -105.02426147460938, "loss": 1.0913, "margin_dpo/margin_mean": 8.543130874633789, "margin_dpo/margin_std": 14.806516647338867, "step": 232 }, { "epoch": 0.35222978080120937, "grad_norm": 33.485225677490234, "learning_rate": 4.106969024216348e-07, "logits/chosen": 0.36027052998542786, "logits/rejected": 0.302791953086853, "logps/chosen": -69.43925476074219, "logps/ref_chosen": -55.070152282714844, "logps/ref_rejected": -66.61845397949219, "logps/rejected": -88.03739166259766, "loss": 1.1212, "margin_dpo/margin_mean": 7.049837589263916, "margin_dpo/margin_std": 12.219358444213867, "step": 233 }, { "epoch": 0.35374149659863946, "grad_norm": 34.852638244628906, "learning_rate": 4.09681781007452e-07, "logits/chosen": 0.27572065591812134, "logits/rejected": 0.26296335458755493, "logps/chosen": -68.66838073730469, "logps/ref_chosen": -55.92589569091797, "logps/ref_rejected": -51.11608123779297, "logps/rejected": -69.86701965332031, "loss": 1.2086, "margin_dpo/margin_mean": 6.008461952209473, "margin_dpo/margin_std": 12.386802673339844, "step": 234 }, { "epoch": 0.35525321239606955, "grad_norm": 27.907825469970703, "learning_rate": 4.08662192950594e-07, "logits/chosen": 0.3932538628578186, "logits/rejected": 0.3740085959434509, "logps/chosen": -74.77420806884766, "logps/ref_chosen": -64.53972625732422, "logps/ref_rejected": -77.69151306152344, "logps/rejected": -98.98680877685547, "loss": 0.7748, "margin_dpo/margin_mean": 11.060816764831543, "margin_dpo/margin_std": 10.737753868103027, "step": 235 }, { "epoch": 0.35676492819349964, "grad_norm": 33.75122833251953, "learning_rate": 4.076381667711306e-07, "logits/chosen": 0.3117947280406952, "logits/rejected": 0.2975386381149292, "logps/chosen": -86.66970825195312, "logps/ref_chosen": -71.15473937988281, "logps/ref_rejected": -84.88542175292969, "logps/rejected": -109.42959594726562, "loss": 1.0202, "margin_dpo/margin_mean": 9.029207229614258, "margin_dpo/margin_std": 13.528522491455078, "step": 236 }, { "epoch": 0.35827664399092973, "grad_norm": 35.89513397216797, "learning_rate": 4.066097311132753e-07, "logits/chosen": 0.37658143043518066, "logits/rejected": 0.3630974590778351, "logps/chosen": -88.70964050292969, "logps/ref_chosen": -76.14201354980469, "logps/ref_rejected": -80.88479614257812, "logps/rejected": -102.75782775878906, "loss": 1.012, "margin_dpo/margin_mean": 9.305397033691406, "margin_dpo/margin_std": 13.216224670410156, "step": 237 }, { "epoch": 0.35978835978835977, "grad_norm": 40.203250885009766, "learning_rate": 4.0557691474458414e-07, "logits/chosen": 0.3156086802482605, "logits/rejected": 0.30020976066589355, "logps/chosen": -80.70606231689453, "logps/ref_chosen": -68.88484954833984, "logps/ref_rejected": -75.8946304321289, "logps/rejected": -97.01763916015625, "loss": 0.9747, "margin_dpo/margin_mean": 9.301795959472656, "margin_dpo/margin_std": 12.781771659851074, "step": 238 }, { "epoch": 0.36130007558578986, "grad_norm": 30.92405128479004, "learning_rate": 4.045397465551513e-07, "logits/chosen": 0.4430525302886963, "logits/rejected": 0.32228922843933105, "logps/chosen": -71.54067993164062, "logps/ref_chosen": -56.771827697753906, "logps/ref_rejected": -116.23049926757812, "logps/rejected": -141.01368713378906, "loss": 0.952, "margin_dpo/margin_mean": 10.014330863952637, "margin_dpo/margin_std": 13.373289108276367, "step": 239 }, { "epoch": 0.36281179138321995, "grad_norm": 24.328351974487305, "learning_rate": 4.0349825555680045e-07, "logits/chosen": 0.3731088638305664, "logits/rejected": 0.28009554743766785, "logps/chosen": -67.007080078125, "logps/ref_chosen": -53.35411071777344, "logps/ref_rejected": -80.12019348144531, "logps/rejected": -105.83534240722656, "loss": 0.8245, "margin_dpo/margin_mean": 12.062172889709473, "margin_dpo/margin_std": 13.12973403930664, "step": 240 }, { "epoch": 0.36432350718065004, "grad_norm": 36.700382232666016, "learning_rate": 4.0245247088227377e-07, "logits/chosen": 0.3141937851905823, "logits/rejected": 0.2778211832046509, "logps/chosen": -85.28544616699219, "logps/ref_chosen": -71.89541625976562, "logps/ref_rejected": -83.03492736816406, "logps/rejected": -103.01599884033203, "loss": 1.1318, "margin_dpo/margin_mean": 6.591043472290039, "margin_dpo/margin_std": 12.219178199768066, "step": 241 }, { "epoch": 0.36583522297808013, "grad_norm": 26.824167251586914, "learning_rate": 4.0140242178441665e-07, "logits/chosen": 0.3006839156150818, "logits/rejected": 0.2773579955101013, "logps/chosen": -70.70115661621094, "logps/ref_chosen": -57.927433013916016, "logps/ref_rejected": -67.83861541748047, "logps/rejected": -91.4476318359375, "loss": 0.9275, "margin_dpo/margin_mean": 10.835296630859375, "margin_dpo/margin_std": 14.33999252319336, "step": 242 }, { "epoch": 0.3673469387755102, "grad_norm": 32.75960922241211, "learning_rate": 4.003481376353596e-07, "logits/chosen": 0.3232348561286926, "logits/rejected": 0.32300877571105957, "logps/chosen": -86.814453125, "logps/ref_chosen": -74.27667236328125, "logps/ref_rejected": -73.24340057373047, "logps/rejected": -95.0753402709961, "loss": 0.9749, "margin_dpo/margin_mean": 9.294158935546875, "margin_dpo/margin_std": 12.90059757232666, "step": 243 }, { "epoch": 0.3688586545729403, "grad_norm": 23.66632843017578, "learning_rate": 3.9928964792569654e-07, "logits/chosen": 0.36699485778808594, "logits/rejected": 0.2831515073776245, "logps/chosen": -66.01345825195312, "logps/ref_chosen": -53.36390686035156, "logps/ref_rejected": -71.10276794433594, "logps/rejected": -95.97030639648438, "loss": 0.7451, "margin_dpo/margin_mean": 12.21798324584961, "margin_dpo/margin_std": 11.687814712524414, "step": 244 }, { "epoch": 0.37037037037037035, "grad_norm": 80.20890808105469, "learning_rate": 3.982269822636601e-07, "logits/chosen": 0.3938656449317932, "logits/rejected": 0.36316561698913574, "logps/chosen": -84.29216003417969, "logps/ref_chosen": -71.19510650634766, "logps/ref_rejected": -80.76235961914062, "logps/rejected": -107.34429168701172, "loss": 0.7235, "margin_dpo/margin_mean": 13.484872817993164, "margin_dpo/margin_std": 12.965568542480469, "step": 245 }, { "epoch": 0.37188208616780044, "grad_norm": 36.538177490234375, "learning_rate": 3.971601703742932e-07, "logits/chosen": 0.3998722434043884, "logits/rejected": 0.33893048763275146, "logps/chosen": -87.93953704833984, "logps/ref_chosen": -71.62104797363281, "logps/ref_rejected": -94.03392028808594, "logps/rejected": -121.22987365722656, "loss": 1.0117, "margin_dpo/margin_mean": 10.877462387084961, "margin_dpo/margin_std": 15.941274642944336, "step": 246 }, { "epoch": 0.37339380196523053, "grad_norm": 37.14095687866211, "learning_rate": 3.960892420986177e-07, "logits/chosen": 0.36805200576782227, "logits/rejected": 0.3567737936973572, "logps/chosen": -96.70210266113281, "logps/ref_chosen": -80.02254486083984, "logps/ref_rejected": -89.22705078125, "logps/rejected": -112.31900024414062, "loss": 1.169, "margin_dpo/margin_mean": 6.412394046783447, "margin_dpo/margin_std": 12.312080383300781, "step": 247 }, { "epoch": 0.3749055177626606, "grad_norm": 39.509544372558594, "learning_rate": 3.9501422739279953e-07, "logits/chosen": 0.34632980823516846, "logits/rejected": 0.38421913981437683, "logps/chosen": -80.1478271484375, "logps/ref_chosen": -65.37796020507812, "logps/ref_rejected": -61.36579132080078, "logps/rejected": -86.59133911132812, "loss": 1.0258, "margin_dpo/margin_mean": 10.455678939819336, "margin_dpo/margin_std": 15.734882354736328, "step": 248 }, { "epoch": 0.3764172335600907, "grad_norm": 47.325138092041016, "learning_rate": 3.9393515632731094e-07, "logits/chosen": 0.34551554918289185, "logits/rejected": 0.3774801194667816, "logps/chosen": -92.70658874511719, "logps/ref_chosen": -74.60145568847656, "logps/ref_rejected": -63.79338455200195, "logps/rejected": -86.26434326171875, "loss": 1.3957, "margin_dpo/margin_mean": 4.365830421447754, "margin_dpo/margin_std": 13.425724029541016, "step": 249 }, { "epoch": 0.3779289493575208, "grad_norm": 30.818462371826172, "learning_rate": 3.9285205908608934e-07, "logits/chosen": 0.4377121031284332, "logits/rejected": 0.39241671562194824, "logps/chosen": -77.27799224853516, "logps/ref_chosen": -61.93821334838867, "logps/ref_rejected": -72.21602630615234, "logps/rejected": -98.83741760253906, "loss": 0.9205, "margin_dpo/margin_mean": 11.281606674194336, "margin_dpo/margin_std": 14.549354553222656, "step": 250 }, { "epoch": 0.3794406651549509, "grad_norm": 38.55635452270508, "learning_rate": 3.9176496596569265e-07, "logits/chosen": 0.41947078704833984, "logits/rejected": 0.37922918796539307, "logps/chosen": -83.08955383300781, "logps/ref_chosen": -66.85694122314453, "logps/ref_rejected": -84.83396911621094, "logps/rejected": -108.45306396484375, "loss": 1.1422, "margin_dpo/margin_mean": 7.386477470397949, "margin_dpo/margin_std": 13.823336601257324, "step": 251 }, { "epoch": 0.38095238095238093, "grad_norm": 37.60855484008789, "learning_rate": 3.9067390737445254e-07, "logits/chosen": 0.3326997756958008, "logits/rejected": 0.2792121469974518, "logps/chosen": -71.88542938232422, "logps/ref_chosen": -56.22393035888672, "logps/ref_rejected": -77.1136245727539, "logps/rejected": -99.84173583984375, "loss": 1.2423, "margin_dpo/margin_mean": 7.06661319732666, "margin_dpo/margin_std": 14.898574829101562, "step": 252 }, { "epoch": 0.382464096749811, "grad_norm": 30.489540100097656, "learning_rate": 3.8957891383162304e-07, "logits/chosen": 0.40060853958129883, "logits/rejected": 0.3583984971046448, "logps/chosen": -67.68878173828125, "logps/ref_chosen": -52.21001434326172, "logps/ref_rejected": -58.75764465332031, "logps/rejected": -81.91943359375, "loss": 1.0707, "margin_dpo/margin_mean": 7.683013439178467, "margin_dpo/margin_std": 12.151586532592773, "step": 253 }, { "epoch": 0.3839758125472411, "grad_norm": 35.47545623779297, "learning_rate": 3.884800159665276e-07, "logits/chosen": 0.33447444438934326, "logits/rejected": 0.28286612033843994, "logps/chosen": -81.57742309570312, "logps/ref_chosen": -65.63632202148438, "logps/ref_rejected": -82.34425354003906, "logps/rejected": -106.67283630371094, "loss": 1.0607, "margin_dpo/margin_mean": 8.387495040893555, "margin_dpo/margin_std": 13.46995735168457, "step": 254 }, { "epoch": 0.3854875283446712, "grad_norm": 30.077173233032227, "learning_rate": 3.873772445177015e-07, "logits/chosen": 0.33370327949523926, "logits/rejected": 0.301089346408844, "logps/chosen": -81.6827392578125, "logps/ref_chosen": -67.91109466552734, "logps/ref_rejected": -83.89114379882812, "logps/rejected": -108.51498413085938, "loss": 0.9371, "margin_dpo/margin_mean": 10.852192878723145, "margin_dpo/margin_std": 14.53363037109375, "step": 255 }, { "epoch": 0.3869992441421013, "grad_norm": 35.74851989746094, "learning_rate": 3.862706303320329e-07, "logits/chosen": 0.3504742980003357, "logits/rejected": 0.28951138257980347, "logps/chosen": -80.06915283203125, "logps/ref_chosen": -63.49998474121094, "logps/ref_rejected": -90.77104187011719, "logps/rejected": -116.5390625, "loss": 1.0522, "margin_dpo/margin_mean": 9.198848724365234, "margin_dpo/margin_std": 14.691661834716797, "step": 256 }, { "epoch": 0.3885109599395314, "grad_norm": 34.2720947265625, "learning_rate": 3.851602043638994e-07, "logits/chosen": 0.354351282119751, "logits/rejected": 0.2911589741706848, "logps/chosen": -88.06474304199219, "logps/ref_chosen": -70.60064697265625, "logps/ref_rejected": -108.5831298828125, "logps/rejected": -138.23046875, "loss": 0.9146, "margin_dpo/margin_mean": 12.183237075805664, "margin_dpo/margin_std": 16.567211151123047, "step": 257 }, { "epoch": 0.3900226757369615, "grad_norm": 33.438987731933594, "learning_rate": 3.840459976743023e-07, "logits/chosen": 0.3662058711051941, "logits/rejected": 0.31560778617858887, "logps/chosen": -76.77505493164062, "logps/ref_chosen": -59.25416564941406, "logps/ref_rejected": -85.58709716796875, "logps/rejected": -112.08135986328125, "loss": 0.9038, "margin_dpo/margin_mean": 8.97337818145752, "margin_dpo/margin_std": 10.622298240661621, "step": 258 }, { "epoch": 0.3915343915343915, "grad_norm": 29.57775115966797, "learning_rate": 3.8292804142999796e-07, "logits/chosen": 0.32210099697113037, "logits/rejected": 0.22113582491874695, "logps/chosen": -79.4823989868164, "logps/ref_chosen": -65.43487548828125, "logps/ref_rejected": -95.41731262207031, "logps/rejected": -123.13349151611328, "loss": 0.7817, "margin_dpo/margin_mean": 13.668659210205078, "margin_dpo/margin_std": 14.191095352172852, "step": 259 }, { "epoch": 0.3930461073318216, "grad_norm": 32.224021911621094, "learning_rate": 3.818063669026256e-07, "logits/chosen": 0.36096808314323425, "logits/rejected": 0.274898886680603, "logps/chosen": -64.27941131591797, "logps/ref_chosen": -49.08958435058594, "logps/ref_rejected": -79.01708221435547, "logps/rejected": -104.52508544921875, "loss": 0.9997, "margin_dpo/margin_mean": 10.318174362182617, "margin_dpo/margin_std": 14.65306568145752, "step": 260 }, { "epoch": 0.3945578231292517, "grad_norm": 40.651424407958984, "learning_rate": 3.806810054678331e-07, "logits/chosen": 0.2242593765258789, "logits/rejected": 0.24470031261444092, "logps/chosen": -86.40238952636719, "logps/ref_chosen": -70.87239074707031, "logps/ref_rejected": -65.01522064208984, "logps/rejected": -88.8436279296875, "loss": 1.0641, "margin_dpo/margin_mean": 8.298402786254883, "margin_dpo/margin_std": 13.585638999938965, "step": 261 }, { "epoch": 0.3960695389266818, "grad_norm": 33.025047302246094, "learning_rate": 3.7955198860439887e-07, "logits/chosen": 0.4122297167778015, "logits/rejected": 0.3500533998012543, "logps/chosen": -83.70536804199219, "logps/ref_chosen": -67.87063598632812, "logps/ref_rejected": -88.7205810546875, "logps/rejected": -114.37922668457031, "loss": 0.9181, "margin_dpo/margin_mean": 9.823917388916016, "margin_dpo/margin_std": 12.397453308105469, "step": 262 }, { "epoch": 0.3975812547241119, "grad_norm": 30.91775894165039, "learning_rate": 3.784193478933516e-07, "logits/chosen": 0.342675119638443, "logits/rejected": 0.24015334248542786, "logps/chosen": -70.62643432617188, "logps/ref_chosen": -55.194580078125, "logps/ref_rejected": -80.54048156738281, "logps/rejected": -103.75779724121094, "loss": 1.0746, "margin_dpo/margin_mean": 7.785467147827148, "margin_dpo/margin_std": 12.66977596282959, "step": 263 }, { "epoch": 0.39909297052154197, "grad_norm": 37.161075592041016, "learning_rate": 3.7728311501708674e-07, "logits/chosen": 0.2615211606025696, "logits/rejected": 0.21496251225471497, "logps/chosen": -99.06900024414062, "logps/ref_chosen": -83.17068481445312, "logps/ref_rejected": -88.33625793457031, "logps/rejected": -113.83183288574219, "loss": 1.0343, "margin_dpo/margin_mean": 9.59725570678711, "margin_dpo/margin_std": 14.79243278503418, "step": 264 }, { "epoch": 0.40060468631897206, "grad_norm": 38.972530364990234, "learning_rate": 3.7614332175848027e-07, "logits/chosen": 0.4157707691192627, "logits/rejected": 0.3489346504211426, "logps/chosen": -67.16413879394531, "logps/ref_chosen": -51.66284942626953, "logps/ref_rejected": -67.1720962524414, "logps/rejected": -92.74415588378906, "loss": 1.0821, "margin_dpo/margin_mean": 10.07077407836914, "margin_dpo/margin_std": 15.829389572143555, "step": 265 }, { "epoch": 0.4021164021164021, "grad_norm": 33.671661376953125, "learning_rate": 3.75e-07, "logits/chosen": 0.3509487807750702, "logits/rejected": 0.2787778973579407, "logps/chosen": -72.02786254882812, "logps/ref_chosen": -57.45049285888672, "logps/ref_rejected": -77.60826110839844, "logps/rejected": -101.64306640625, "loss": 0.9936, "margin_dpo/margin_mean": 9.457446098327637, "margin_dpo/margin_std": 13.773505210876465, "step": 266 }, { "epoch": 0.4036281179138322, "grad_norm": 31.01226234436035, "learning_rate": 3.738531817228131e-07, "logits/chosen": 0.39176273345947266, "logits/rejected": 0.3712225556373596, "logps/chosen": -68.095703125, "logps/ref_chosen": -55.03534698486328, "logps/ref_rejected": -66.0953369140625, "logps/rejected": -86.55458068847656, "loss": 1.1479, "margin_dpo/margin_mean": 7.398888111114502, "margin_dpo/margin_std": 13.48221206665039, "step": 267 }, { "epoch": 0.4051398337112623, "grad_norm": 28.24346160888672, "learning_rate": 3.7270289900589204e-07, "logits/chosen": 0.25299277901649475, "logits/rejected": 0.23204386234283447, "logps/chosen": -77.99839782714844, "logps/ref_chosen": -65.07174682617188, "logps/ref_rejected": -71.42486572265625, "logps/rejected": -93.17605590820312, "loss": 0.9574, "margin_dpo/margin_mean": 8.824535369873047, "margin_dpo/margin_std": 11.893662452697754, "step": 268 }, { "epoch": 0.40665154950869237, "grad_norm": 27.909160614013672, "learning_rate": 3.7154918402511714e-07, "logits/chosen": 0.4610741436481476, "logits/rejected": 0.41174769401550293, "logps/chosen": -81.76616668701172, "logps/ref_chosen": -67.1362075805664, "logps/ref_rejected": -82.55778503417969, "logps/rejected": -106.98124694824219, "loss": 0.9278, "margin_dpo/margin_mean": 9.793498992919922, "margin_dpo/margin_std": 12.174901962280273, "step": 269 }, { "epoch": 0.40816326530612246, "grad_norm": 34.2803840637207, "learning_rate": 3.7039206905237656e-07, "logits/chosen": 0.3973391056060791, "logits/rejected": 0.321039617061615, "logps/chosen": -80.73213195800781, "logps/ref_chosen": -66.6886978149414, "logps/ref_rejected": -85.16129302978516, "logps/rejected": -107.2292709350586, "loss": 1.0113, "margin_dpo/margin_mean": 8.024534225463867, "margin_dpo/margin_std": 11.851795196533203, "step": 270 }, { "epoch": 0.40967498110355255, "grad_norm": 38.43772888183594, "learning_rate": 3.692315864546635e-07, "logits/chosen": 0.3932819962501526, "logits/rejected": 0.33234933018684387, "logps/chosen": -85.6281967163086, "logps/ref_chosen": -72.40754699707031, "logps/ref_rejected": -92.0631103515625, "logps/rejected": -112.34785461425781, "loss": 1.2241, "margin_dpo/margin_mean": 7.064090728759766, "margin_dpo/margin_std": 15.192045211791992, "step": 271 }, { "epoch": 0.41118669690098264, "grad_norm": 27.955270767211914, "learning_rate": 3.6806776869317067e-07, "logits/chosen": 0.36552393436431885, "logits/rejected": 0.372539758682251, "logps/chosen": -77.8504867553711, "logps/ref_chosen": -66.60140228271484, "logps/ref_rejected": -67.74339294433594, "logps/rejected": -90.815185546875, "loss": 0.7762, "margin_dpo/margin_mean": 11.822696685791016, "margin_dpo/margin_std": 12.071565628051758, "step": 272 }, { "epoch": 0.4126984126984127, "grad_norm": 33.07332229614258, "learning_rate": 3.669006483223828e-07, "logits/chosen": 0.3804330825805664, "logits/rejected": 0.3152013123035431, "logps/chosen": -71.91949462890625, "logps/ref_chosen": -57.35487365722656, "logps/ref_rejected": -84.17168426513672, "logps/rejected": -108.44322204589844, "loss": 1.0126, "margin_dpo/margin_mean": 9.70692253112793, "margin_dpo/margin_std": 14.580245018005371, "step": 273 }, { "epoch": 0.41421012849584277, "grad_norm": 28.571704864501953, "learning_rate": 3.657302579891656e-07, "logits/chosen": 0.24608616530895233, "logits/rejected": 0.22161783277988434, "logps/chosen": -73.17286682128906, "logps/ref_chosen": -59.64149475097656, "logps/ref_rejected": -68.29348754882812, "logps/rejected": -91.27055358886719, "loss": 0.98, "margin_dpo/margin_mean": 9.445707321166992, "margin_dpo/margin_std": 13.195257186889648, "step": 274 }, { "epoch": 0.41572184429327286, "grad_norm": 28.784631729125977, "learning_rate": 3.645566304318526e-07, "logits/chosen": 0.339069664478302, "logits/rejected": 0.25784239172935486, "logps/chosen": -66.6173095703125, "logps/ref_chosen": -53.26664733886719, "logps/ref_rejected": -73.84062194824219, "logps/rejected": -97.59869384765625, "loss": 0.9142, "margin_dpo/margin_mean": 10.407403945922852, "margin_dpo/margin_std": 13.310070037841797, "step": 275 }, { "epoch": 0.41723356009070295, "grad_norm": 27.40554428100586, "learning_rate": 3.633797984793294e-07, "logits/chosen": 0.29572436213493347, "logits/rejected": 0.26014089584350586, "logps/chosen": -65.08883666992188, "logps/ref_chosen": -53.02079772949219, "logps/ref_rejected": -61.56678771972656, "logps/rejected": -83.73110961914062, "loss": 0.8431, "margin_dpo/margin_mean": 10.096281051635742, "margin_dpo/margin_std": 11.02065658569336, "step": 276 }, { "epoch": 0.41874527588813304, "grad_norm": 40.223724365234375, "learning_rate": 3.6219979505011555e-07, "logits/chosen": 0.4081869125366211, "logits/rejected": 0.42862510681152344, "logps/chosen": -85.9726333618164, "logps/ref_chosen": -71.43299102783203, "logps/ref_rejected": -67.65852355957031, "logps/rejected": -88.13728332519531, "loss": 1.1897, "margin_dpo/margin_mean": 5.939116477966309, "margin_dpo/margin_std": 12.596590995788574, "step": 277 }, { "epoch": 0.42025699168556313, "grad_norm": 33.17136001586914, "learning_rate": 3.6101665315144353e-07, "logits/chosen": 0.26065516471862793, "logits/rejected": 0.21105234324932098, "logps/chosen": -80.96624755859375, "logps/ref_chosen": -67.11076354980469, "logps/ref_rejected": -88.74851989746094, "logps/rejected": -112.27589416503906, "loss": 0.959, "margin_dpo/margin_mean": 9.671895980834961, "margin_dpo/margin_std": 12.61258316040039, "step": 278 }, { "epoch": 0.4217687074829932, "grad_norm": 25.243629455566406, "learning_rate": 3.5983040587833563e-07, "logits/chosen": 0.31561681628227234, "logits/rejected": 0.2724509835243225, "logps/chosen": -64.49711608886719, "logps/ref_chosen": -54.49748611450195, "logps/ref_rejected": -70.4237289428711, "logps/rejected": -92.49717712402344, "loss": 0.769, "margin_dpo/margin_mean": 12.073814392089844, "margin_dpo/margin_std": 11.994571685791016, "step": 279 }, { "epoch": 0.42328042328042326, "grad_norm": 24.20424461364746, "learning_rate": 3.586410864126781e-07, "logits/chosen": 0.33959245681762695, "logits/rejected": 0.2953893542289734, "logps/chosen": -71.14170837402344, "logps/ref_chosen": -60.43281173706055, "logps/ref_rejected": -78.39051818847656, "logps/rejected": -101.32532501220703, "loss": 0.724, "margin_dpo/margin_mean": 12.225910186767578, "margin_dpo/margin_std": 11.65771484375, "step": 280 }, { "epoch": 0.42479213907785335, "grad_norm": 26.259754180908203, "learning_rate": 3.574487280222929e-07, "logits/chosen": 0.32685786485671997, "logits/rejected": 0.33631807565689087, "logps/chosen": -72.1130142211914, "logps/ref_chosen": -60.2820930480957, "logps/ref_rejected": -62.04009246826172, "logps/rejected": -84.68453979492188, "loss": 0.8761, "margin_dpo/margin_mean": 10.813521385192871, "margin_dpo/margin_std": 12.737310409545898, "step": 281 }, { "epoch": 0.42630385487528344, "grad_norm": 31.841053009033203, "learning_rate": 3.562533640600075e-07, "logits/chosen": 0.2697226107120514, "logits/rejected": 0.21670247614383698, "logps/chosen": -73.82437133789062, "logps/ref_chosen": -60.623924255371094, "logps/ref_rejected": -68.67400360107422, "logps/rejected": -92.24623107910156, "loss": 0.9296, "margin_dpo/margin_mean": 10.371776580810547, "margin_dpo/margin_std": 13.209760665893555, "step": 282 }, { "epoch": 0.42781557067271353, "grad_norm": 37.73383712768555, "learning_rate": 3.550550279627215e-07, "logits/chosen": 0.33592864871025085, "logits/rejected": 0.24044758081436157, "logps/chosen": -81.87095642089844, "logps/ref_chosen": -67.64775085449219, "logps/ref_rejected": -99.96835327148438, "logps/rejected": -122.95320129394531, "loss": 1.0706, "margin_dpo/margin_mean": 8.761629104614258, "margin_dpo/margin_std": 14.105447769165039, "step": 283 }, { "epoch": 0.4293272864701436, "grad_norm": 28.0395450592041, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 0.3895118832588196, "logits/rejected": 0.325203537940979, "logps/chosen": -70.05673217773438, "logps/ref_chosen": -56.967430114746094, "logps/ref_rejected": -86.36236572265625, "logps/rejected": -109.27890014648438, "loss": 0.9022, "margin_dpo/margin_mean": 9.827235221862793, "margin_dpo/margin_std": 12.691057205200195, "step": 284 }, { "epoch": 0.4308390022675737, "grad_norm": 34.68986129760742, "learning_rate": 3.5264957352549375e-07, "logits/chosen": 0.39354732632637024, "logits/rejected": 0.3635793924331665, "logps/chosen": -87.55171203613281, "logps/ref_chosen": -71.65611267089844, "logps/ref_rejected": -81.63829803466797, "logps/rejected": -105.88143920898438, "loss": 1.0097, "margin_dpo/margin_mean": 8.347532272338867, "margin_dpo/margin_std": 12.33486557006836, "step": 285 }, { "epoch": 0.4323507180650038, "grad_norm": 27.683141708374023, "learning_rate": 3.514425224712835e-07, "logits/chosen": 0.3177586793899536, "logits/rejected": 0.22109857201576233, "logps/chosen": -75.74806213378906, "logps/ref_chosen": -61.07952117919922, "logps/ref_rejected": -91.28128051757812, "logps/rejected": -119.27302551269531, "loss": 0.7605, "margin_dpo/margin_mean": 13.323205947875977, "margin_dpo/margin_std": 13.627128601074219, "step": 286 }, { "epoch": 0.43386243386243384, "grad_norm": 28.568553924560547, "learning_rate": 3.502326338516534e-07, "logits/chosen": 0.36465156078338623, "logits/rejected": 0.3220062255859375, "logps/chosen": -58.37489318847656, "logps/ref_chosen": -46.035789489746094, "logps/ref_rejected": -59.95293426513672, "logps/rejected": -85.12361907958984, "loss": 0.7988, "margin_dpo/margin_mean": 12.83158016204834, "margin_dpo/margin_std": 14.233297348022461, "step": 287 }, { "epoch": 0.43537414965986393, "grad_norm": 38.184391021728516, "learning_rate": 3.490199415097892e-07, "logits/chosen": 0.2519870400428772, "logits/rejected": 0.1951802521944046, "logps/chosen": -81.74073791503906, "logps/ref_chosen": -65.3908462524414, "logps/ref_rejected": -88.53607177734375, "logps/rejected": -113.48297119140625, "loss": 1.0718, "margin_dpo/margin_mean": 8.597015380859375, "margin_dpo/margin_std": 14.126516342163086, "step": 288 }, { "epoch": 0.436885865457294, "grad_norm": 31.642969131469727, "learning_rate": 3.4780447936730247e-07, "logits/chosen": 0.42887574434280396, "logits/rejected": 0.38934090733528137, "logps/chosen": -71.02117156982422, "logps/ref_chosen": -54.5936279296875, "logps/ref_rejected": -67.20855712890625, "logps/rejected": -92.87224578857422, "loss": 1.0588, "margin_dpo/margin_mean": 9.236154556274414, "margin_dpo/margin_std": 14.629898071289062, "step": 289 }, { "epoch": 0.4383975812547241, "grad_norm": 38.783477783203125, "learning_rate": 3.465862814232821e-07, "logits/chosen": 0.42857229709625244, "logits/rejected": 0.3584096431732178, "logps/chosen": -79.49140930175781, "logps/ref_chosen": -61.38457489013672, "logps/ref_rejected": -91.92778015136719, "logps/rejected": -121.74188232421875, "loss": 0.9043, "margin_dpo/margin_mean": 11.707275390625, "margin_dpo/margin_std": 14.761146545410156, "step": 290 }, { "epoch": 0.4399092970521542, "grad_norm": 33.59368133544922, "learning_rate": 3.4536538175334343e-07, "logits/chosen": 0.5070397853851318, "logits/rejected": 0.4380100965499878, "logps/chosen": -67.4765853881836, "logps/ref_chosen": -50.863037109375, "logps/ref_rejected": -82.20868682861328, "logps/rejected": -110.57408142089844, "loss": 0.9453, "margin_dpo/margin_mean": 11.751852035522461, "margin_dpo/margin_std": 15.614330291748047, "step": 291 }, { "epoch": 0.4414210128495843, "grad_norm": 37.523006439208984, "learning_rate": 3.4414181450867465e-07, "logits/chosen": 0.3803209662437439, "logits/rejected": 0.32875359058380127, "logps/chosen": -80.26577758789062, "logps/ref_chosen": -64.34888458251953, "logps/ref_rejected": -72.86434936523438, "logps/rejected": -98.49430847167969, "loss": 0.9875, "margin_dpo/margin_mean": 9.71307373046875, "margin_dpo/margin_std": 14.086446762084961, "step": 292 }, { "epoch": 0.4429327286470144, "grad_norm": 25.93706512451172, "learning_rate": 3.4291561391508185e-07, "logits/chosen": 0.4410402476787567, "logits/rejected": 0.3540686368942261, "logps/chosen": -72.45366668701172, "logps/ref_chosen": -54.86946487426758, "logps/ref_rejected": -81.858642578125, "logps/rejected": -112.58206176757812, "loss": 0.8776, "margin_dpo/margin_mean": 13.139215469360352, "margin_dpo/margin_std": 15.717092514038086, "step": 293 }, { "epoch": 0.4444444444444444, "grad_norm": 26.823040008544922, "learning_rate": 3.4168681427203153e-07, "logits/chosen": 0.3839040994644165, "logits/rejected": 0.3379971981048584, "logps/chosen": -72.74606323242188, "logps/ref_chosen": -56.6708984375, "logps/ref_rejected": -70.32819366455078, "logps/rejected": -97.80169677734375, "loss": 0.865, "margin_dpo/margin_mean": 11.398344039916992, "margin_dpo/margin_std": 14.199283599853516, "step": 294 }, { "epoch": 0.4459561602418745, "grad_norm": 34.710105895996094, "learning_rate": 3.4045544995169125e-07, "logits/chosen": 0.3906914293766022, "logits/rejected": 0.2945278286933899, "logps/chosen": -68.47222900390625, "logps/ref_chosen": -50.40088653564453, "logps/ref_rejected": -83.43521881103516, "logps/rejected": -110.99464416503906, "loss": 1.0035, "margin_dpo/margin_mean": 9.488086700439453, "margin_dpo/margin_std": 14.339176177978516, "step": 295 }, { "epoch": 0.4474678760393046, "grad_norm": 36.2674560546875, "learning_rate": 3.392215553979679e-07, "logits/chosen": 0.3322068750858307, "logits/rejected": 0.2882440686225891, "logps/chosen": -87.16343688964844, "logps/ref_chosen": -69.15034484863281, "logps/ref_rejected": -89.60166931152344, "logps/rejected": -119.3226318359375, "loss": 0.8998, "margin_dpo/margin_mean": 11.70787239074707, "margin_dpo/margin_std": 14.860249519348145, "step": 296 }, { "epoch": 0.4489795918367347, "grad_norm": 28.834030151367188, "learning_rate": 3.3798516512554485e-07, "logits/chosen": 0.36532461643218994, "logits/rejected": 0.30405259132385254, "logps/chosen": -77.24577331542969, "logps/ref_chosen": -58.01630401611328, "logps/ref_rejected": -69.95780944824219, "logps/rejected": -101.18505859375, "loss": 0.8016, "margin_dpo/margin_mean": 11.997785568237305, "margin_dpo/margin_std": 12.494903564453125, "step": 297 }, { "epoch": 0.4504913076341648, "grad_norm": 32.918025970458984, "learning_rate": 3.367463137189156e-07, "logits/chosen": 0.5060819387435913, "logits/rejected": 0.44333118200302124, "logps/chosen": -75.56437683105469, "logps/ref_chosen": -56.1693115234375, "logps/ref_rejected": -68.55052185058594, "logps/rejected": -97.68669891357422, "loss": 1.0316, "margin_dpo/margin_mean": 9.741113662719727, "margin_dpo/margin_std": 14.936502456665039, "step": 298 }, { "epoch": 0.4520030234315949, "grad_norm": 34.70051574707031, "learning_rate": 3.355050358314172e-07, "logits/chosen": 0.27476340532302856, "logits/rejected": 0.24570544064044952, "logps/chosen": -80.7921142578125, "logps/ref_chosen": -62.31780242919922, "logps/ref_rejected": -72.60028839111328, "logps/rejected": -100.0041732788086, "loss": 1.1003, "margin_dpo/margin_mean": 8.929572105407715, "margin_dpo/margin_std": 15.097801208496094, "step": 299 }, { "epoch": 0.45351473922902497, "grad_norm": 33.734275817871094, "learning_rate": 3.3426136618426043e-07, "logits/chosen": 0.3949470818042755, "logits/rejected": 0.3290703594684601, "logps/chosen": -80.29096984863281, "logps/ref_chosen": -60.38157653808594, "logps/ref_rejected": -75.45442199707031, "logps/rejected": -105.5843734741211, "loss": 0.9692, "margin_dpo/margin_mean": 10.220561027526855, "margin_dpo/margin_std": 13.958076477050781, "step": 300 }, { "epoch": 0.45351473922902497, "eval_logits/chosen": 0.4017273187637329, "eval_logits/rejected": 0.35065409541130066, "eval_logps/chosen": -93.17941284179688, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -107.24836730957031, "eval_loss": 0.5345103144645691, "eval_margin_dpo/margin_mean": 9.37943172454834, "eval_margin_dpo/margin_std": 14.973759651184082, "eval_runtime": 38.6788, "eval_samples_per_second": 59.542, "eval_steps_per_second": 1.861, "step": 300 }, { "epoch": 0.455026455026455, "grad_norm": 31.209848403930664, "learning_rate": 3.3301533956555885e-07, "logits/chosen": 0.42101189494132996, "logits/rejected": 0.38934606313705444, "logps/chosen": -71.96283721923828, "logps/ref_chosen": -52.85089111328125, "logps/ref_rejected": -69.97584533691406, "logps/rejected": -98.6317138671875, "loss": 1.0621, "margin_dpo/margin_mean": 9.543924331665039, "margin_dpo/margin_std": 15.364656448364258, "step": 301 }, { "epoch": 0.4565381708238851, "grad_norm": 37.35209274291992, "learning_rate": 3.317669908293554e-07, "logits/chosen": 0.2666228413581848, "logits/rejected": 0.21263065934181213, "logps/chosen": -88.26080322265625, "logps/ref_chosen": -66.96651458740234, "logps/ref_rejected": -88.0951156616211, "logps/rejected": -115.92124938964844, "loss": 1.1827, "margin_dpo/margin_mean": 6.531844139099121, "margin_dpo/margin_std": 13.561601638793945, "step": 302 }, { "epoch": 0.4580498866213152, "grad_norm": 28.165313720703125, "learning_rate": 3.3051635489464793e-07, "logits/chosen": 0.3595314621925354, "logits/rejected": 0.2943815290927887, "logps/chosen": -80.27766418457031, "logps/ref_chosen": -62.12152862548828, "logps/ref_rejected": -90.31204223632812, "logps/rejected": -121.2498779296875, "loss": 0.9179, "margin_dpo/margin_mean": 12.781694412231445, "margin_dpo/margin_std": 16.26740264892578, "step": 303 }, { "epoch": 0.4595616024187453, "grad_norm": 28.933210372924805, "learning_rate": 3.292634667444117e-07, "logits/chosen": 0.38219448924064636, "logits/rejected": 0.3254969120025635, "logps/chosen": -78.41160583496094, "logps/ref_chosen": -60.69508361816406, "logps/ref_rejected": -78.25254821777344, "logps/rejected": -108.52848815917969, "loss": 0.7588, "margin_dpo/margin_mean": 12.559429168701172, "margin_dpo/margin_std": 12.42584228515625, "step": 304 }, { "epoch": 0.46107331821617537, "grad_norm": 42.87114334106445, "learning_rate": 3.280083614246217e-07, "logits/chosen": 0.32686659693717957, "logits/rejected": 0.3448963165283203, "logps/chosen": -93.79513549804688, "logps/ref_chosen": -72.69914245605469, "logps/ref_rejected": -65.65670776367188, "logps/rejected": -96.12918090820312, "loss": 1.0819, "margin_dpo/margin_mean": 9.376477241516113, "margin_dpo/margin_std": 15.228221893310547, "step": 305 }, { "epoch": 0.46258503401360546, "grad_norm": 28.915489196777344, "learning_rate": 3.267510740432719e-07, "logits/chosen": 0.4310336410999298, "logits/rejected": 0.31829434633255005, "logps/chosen": -73.72835540771484, "logps/ref_chosen": -53.97052764892578, "logps/ref_rejected": -71.02423095703125, "logps/rejected": -99.50276184082031, "loss": 0.9685, "margin_dpo/margin_mean": 8.720699310302734, "margin_dpo/margin_std": 11.995311737060547, "step": 306 }, { "epoch": 0.46409674981103555, "grad_norm": 35.531272888183594, "learning_rate": 3.2549163976939285e-07, "logits/chosen": 0.4187588095664978, "logits/rejected": 0.37263208627700806, "logps/chosen": -74.16632080078125, "logps/ref_chosen": -57.413108825683594, "logps/ref_rejected": -68.68011474609375, "logps/rejected": -92.7537612915039, "loss": 1.2241, "margin_dpo/margin_mean": 7.320440292358398, "margin_dpo/margin_std": 15.137401580810547, "step": 307 }, { "epoch": 0.4656084656084656, "grad_norm": 30.61981773376465, "learning_rate": 3.2423009383206874e-07, "logits/chosen": 0.36663228273391724, "logits/rejected": 0.34980136156082153, "logps/chosen": -84.69308471679688, "logps/ref_chosen": -66.59878540039062, "logps/ref_rejected": -74.337158203125, "logps/rejected": -101.78118133544922, "loss": 1.0013, "margin_dpo/margin_mean": 9.3497314453125, "margin_dpo/margin_std": 13.311956405639648, "step": 308 }, { "epoch": 0.4671201814058957, "grad_norm": 42.480690002441406, "learning_rate": 3.229664715194511e-07, "logits/chosen": 0.4416660666465759, "logits/rejected": 0.3822864294052124, "logps/chosen": -86.4417724609375, "logps/ref_chosen": -65.39474487304688, "logps/ref_rejected": -75.70930480957031, "logps/rejected": -105.63099670410156, "loss": 1.0418, "margin_dpo/margin_mean": 8.874654769897461, "margin_dpo/margin_std": 13.672810554504395, "step": 309 }, { "epoch": 0.46863189720332576, "grad_norm": 38.71232223510742, "learning_rate": 3.2170080817777257e-07, "logits/chosen": 0.42346304655075073, "logits/rejected": 0.4100581407546997, "logps/chosen": -95.80581665039062, "logps/ref_chosen": -74.66827392578125, "logps/ref_rejected": -80.5689697265625, "logps/rejected": -107.26795959472656, "loss": 1.2398, "margin_dpo/margin_mean": 5.561444282531738, "margin_dpo/margin_std": 13.021998405456543, "step": 310 }, { "epoch": 0.47014361300075586, "grad_norm": 32.81990432739258, "learning_rate": 3.204331392103574e-07, "logits/chosen": 0.3420078754425049, "logits/rejected": 0.21063414216041565, "logps/chosen": -76.67539978027344, "logps/ref_chosen": -59.73802947998047, "logps/ref_rejected": -93.60757446289062, "logps/rejected": -119.02238464355469, "loss": 1.072, "margin_dpo/margin_mean": 8.477436065673828, "margin_dpo/margin_std": 14.32551383972168, "step": 311 }, { "epoch": 0.47165532879818595, "grad_norm": 26.659669876098633, "learning_rate": 3.1916350007663176e-07, "logits/chosen": 0.38971781730651855, "logits/rejected": 0.2917502522468567, "logps/chosen": -71.44837951660156, "logps/ref_chosen": -53.816436767578125, "logps/ref_rejected": -68.6575698852539, "logps/rejected": -97.73336791992188, "loss": 0.8242, "margin_dpo/margin_mean": 11.443855285644531, "margin_dpo/margin_std": 12.68366813659668, "step": 312 }, { "epoch": 0.47316704459561604, "grad_norm": 34.83692932128906, "learning_rate": 3.178919262911314e-07, "logits/chosen": 0.45790788531303406, "logits/rejected": 0.43460702896118164, "logps/chosen": -77.33392333984375, "logps/ref_chosen": -59.957359313964844, "logps/ref_rejected": -69.31729888916016, "logps/rejected": -94.20777893066406, "loss": 1.158, "margin_dpo/margin_mean": 7.513920783996582, "margin_dpo/margin_std": 14.714834213256836, "step": 313 }, { "epoch": 0.47467876039304613, "grad_norm": 26.88656234741211, "learning_rate": 3.166184534225087e-07, "logits/chosen": 0.3927513360977173, "logits/rejected": 0.41145119071006775, "logps/chosen": -87.16189575195312, "logps/ref_chosen": -70.26815795898438, "logps/ref_rejected": -69.23971557617188, "logps/rejected": -97.7408447265625, "loss": 0.8053, "margin_dpo/margin_mean": 11.607396125793457, "margin_dpo/margin_std": 12.333427429199219, "step": 314 }, { "epoch": 0.47619047619047616, "grad_norm": 28.846769332885742, "learning_rate": 3.1534311709253723e-07, "logits/chosen": 0.3296147286891937, "logits/rejected": 0.28625988960266113, "logps/chosen": -85.16159057617188, "logps/ref_chosen": -67.79469299316406, "logps/ref_rejected": -74.55148315429688, "logps/rejected": -101.44076538085938, "loss": 0.9432, "margin_dpo/margin_mean": 9.52238655090332, "margin_dpo/margin_std": 11.987009048461914, "step": 315 }, { "epoch": 0.47770219198790626, "grad_norm": 29.60664939880371, "learning_rate": 3.1406595297511564e-07, "logits/chosen": 0.3064402639865875, "logits/rejected": 0.18259896337985992, "logps/chosen": -71.98179626464844, "logps/ref_chosen": -55.288482666015625, "logps/ref_rejected": -96.15723419189453, "logps/rejected": -123.8666000366211, "loss": 0.8814, "margin_dpo/margin_mean": 11.016056060791016, "margin_dpo/margin_std": 12.859323501586914, "step": 316 }, { "epoch": 0.47921390778533635, "grad_norm": 27.596532821655273, "learning_rate": 3.1278699679526975e-07, "logits/chosen": 0.4423693120479584, "logits/rejected": 0.3917258381843567, "logps/chosen": -68.88323974609375, "logps/ref_chosen": -54.58137512207031, "logps/ref_rejected": -72.77232360839844, "logps/rejected": -100.27496337890625, "loss": 0.7309, "margin_dpo/margin_mean": 13.200772285461426, "margin_dpo/margin_std": 12.760543823242188, "step": 317 }, { "epoch": 0.48072562358276644, "grad_norm": 37.705631256103516, "learning_rate": 3.1150628432815336e-07, "logits/chosen": 0.42552927136421204, "logits/rejected": 0.35953375697135925, "logps/chosen": -70.18506622314453, "logps/ref_chosen": -52.88822937011719, "logps/ref_rejected": -80.63988494873047, "logps/rejected": -105.99900817871094, "loss": 1.2682, "margin_dpo/margin_mean": 8.06229019165039, "margin_dpo/margin_std": 16.971851348876953, "step": 318 }, { "epoch": 0.48223733938019653, "grad_norm": 29.031652450561523, "learning_rate": 3.1022385139804707e-07, "logits/chosen": 0.3563545346260071, "logits/rejected": 0.33486121892929077, "logps/chosen": -79.50239562988281, "logps/ref_chosen": -64.36333465576172, "logps/ref_rejected": -79.47296142578125, "logps/rejected": -106.15999603271484, "loss": 0.8888, "margin_dpo/margin_mean": 11.5479736328125, "margin_dpo/margin_std": 14.676193237304688, "step": 319 }, { "epoch": 0.4837490551776266, "grad_norm": 32.68266296386719, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 0.2867361307144165, "logits/rejected": 0.24268287420272827, "logps/chosen": -64.48707580566406, "logps/ref_chosen": -49.558746337890625, "logps/ref_rejected": -71.23444366455078, "logps/rejected": -94.714599609375, "loss": 1.1087, "margin_dpo/margin_mean": 8.551826477050781, "margin_dpo/margin_std": 14.7891206741333, "step": 320 }, { "epoch": 0.4852607709750567, "grad_norm": 29.052352905273438, "learning_rate": 3.0765396768561004e-07, "logits/chosen": 0.3400368392467499, "logits/rejected": 0.3190291225910187, "logps/chosen": -68.13433837890625, "logps/ref_chosen": -52.085269927978516, "logps/ref_rejected": -55.58674621582031, "logps/rejected": -80.85602569580078, "loss": 1.0443, "margin_dpo/margin_mean": 9.220213890075684, "margin_dpo/margin_std": 13.879295349121094, "step": 321 }, { "epoch": 0.48677248677248675, "grad_norm": 26.432174682617188, "learning_rate": 3.063665887884511e-07, "logits/chosen": 0.4483844041824341, "logits/rejected": 0.36465245485305786, "logps/chosen": -63.66038513183594, "logps/ref_chosen": -47.404109954833984, "logps/ref_rejected": -73.4260025024414, "logps/rejected": -102.2420654296875, "loss": 0.7555, "margin_dpo/margin_mean": 12.559789657592773, "margin_dpo/margin_std": 12.248069763183594, "step": 322 }, { "epoch": 0.48828420256991684, "grad_norm": 36.914634704589844, "learning_rate": 3.0507763319663517e-07, "logits/chosen": 0.32914960384368896, "logits/rejected": 0.25222885608673096, "logps/chosen": -86.92568969726562, "logps/ref_chosen": -70.00630187988281, "logps/ref_rejected": -86.96690368652344, "logps/rejected": -112.45706176757812, "loss": 1.1786, "margin_dpo/margin_mean": 8.570770263671875, "margin_dpo/margin_std": 16.58449363708496, "step": 323 }, { "epoch": 0.4897959183673469, "grad_norm": 28.488908767700195, "learning_rate": 3.0378713696502097e-07, "logits/chosen": 0.4177021384239197, "logits/rejected": 0.35851603746414185, "logps/chosen": -69.78921508789062, "logps/ref_chosen": -55.88882064819336, "logps/ref_rejected": -75.23088073730469, "logps/rejected": -99.71137237548828, "loss": 0.8159, "margin_dpo/margin_mean": 10.580099105834961, "margin_dpo/margin_std": 11.190553665161133, "step": 324 }, { "epoch": 0.491307634164777, "grad_norm": 31.288358688354492, "learning_rate": 3.0249513619156206e-07, "logits/chosen": 0.3787188231945038, "logits/rejected": 0.3136671781539917, "logps/chosen": -81.69401550292969, "logps/ref_chosen": -64.14701843261719, "logps/ref_rejected": -79.91143035888672, "logps/rejected": -107.18046569824219, "loss": 0.9959, "margin_dpo/margin_mean": 9.72203254699707, "margin_dpo/margin_std": 14.314732551574707, "step": 325 }, { "epoch": 0.4928193499622071, "grad_norm": 42.53952407836914, "learning_rate": 3.012016670162977e-07, "logits/chosen": 0.32631510496139526, "logits/rejected": 0.32735714316368103, "logps/chosen": -95.90467834472656, "logps/ref_chosen": -75.53131103515625, "logps/ref_rejected": -76.5898666381836, "logps/rejected": -102.08723449707031, "loss": 1.2879, "margin_dpo/margin_mean": 5.123997688293457, "margin_dpo/margin_std": 12.83140754699707, "step": 326 }, { "epoch": 0.4943310657596372, "grad_norm": 34.51028060913086, "learning_rate": 2.99906765620341e-07, "logits/chosen": 0.2692107856273651, "logits/rejected": 0.22894282639026642, "logps/chosen": -87.27745056152344, "logps/ref_chosen": -69.337158203125, "logps/ref_rejected": -73.37751770019531, "logps/rejected": -99.87676239013672, "loss": 1.1168, "margin_dpo/margin_mean": 8.558960914611816, "margin_dpo/margin_std": 15.341290473937988, "step": 327 }, { "epoch": 0.4958427815570673, "grad_norm": 29.792686462402344, "learning_rate": 2.9861046822486766e-07, "logits/chosen": 0.3397352695465088, "logits/rejected": 0.3053368926048279, "logps/chosen": -76.30783081054688, "logps/ref_chosen": -61.70623016357422, "logps/ref_rejected": -83.73808288574219, "logps/rejected": -107.90557861328125, "loss": 0.8958, "margin_dpo/margin_mean": 9.565893173217773, "margin_dpo/margin_std": 11.804071426391602, "step": 328 }, { "epoch": 0.4973544973544973, "grad_norm": 35.93716049194336, "learning_rate": 2.9731281109010253e-07, "logits/chosen": 0.41245976090431213, "logits/rejected": 0.3557360768318176, "logps/chosen": -82.2403564453125, "logps/ref_chosen": -64.4984130859375, "logps/ref_rejected": -83.6591796875, "logps/rejected": -111.41017150878906, "loss": 0.9671, "margin_dpo/margin_mean": 10.009061813354492, "margin_dpo/margin_std": 14.135910987854004, "step": 329 }, { "epoch": 0.4988662131519274, "grad_norm": 29.736265182495117, "learning_rate": 2.9601383051430505e-07, "logits/chosen": 0.36417415738105774, "logits/rejected": 0.2935639023780823, "logps/chosen": -69.07024383544922, "logps/ref_chosen": -54.80464172363281, "logps/ref_rejected": -75.31942749023438, "logps/rejected": -99.49536895751953, "loss": 1.0317, "margin_dpo/margin_mean": 9.9103364944458, "margin_dpo/margin_std": 14.863494873046875, "step": 330 }, { "epoch": 0.5003779289493575, "grad_norm": 27.832918167114258, "learning_rate": 2.947135628327544e-07, "logits/chosen": 0.46214789152145386, "logits/rejected": 0.42954060435295105, "logps/chosen": -74.25878143310547, "logps/ref_chosen": -59.242576599121094, "logps/ref_rejected": -69.87483215332031, "logps/rejected": -98.79541778564453, "loss": 0.8271, "margin_dpo/margin_mean": 13.904380798339844, "margin_dpo/margin_std": 16.01378631591797, "step": 331 }, { "epoch": 0.5018896447467877, "grad_norm": 32.08829116821289, "learning_rate": 2.934120444167326e-07, "logits/chosen": 0.32679808139801025, "logits/rejected": 0.2778639793395996, "logps/chosen": -83.60955810546875, "logps/ref_chosen": -67.10975646972656, "logps/ref_rejected": -77.11839294433594, "logps/rejected": -103.84635925292969, "loss": 0.9184, "margin_dpo/margin_mean": 10.228164672851562, "margin_dpo/margin_std": 13.449007034301758, "step": 332 }, { "epoch": 0.5034013605442177, "grad_norm": 26.578086853027344, "learning_rate": 2.921093116725076e-07, "logits/chosen": 0.3908570408821106, "logits/rejected": 0.3197840452194214, "logps/chosen": -74.55105590820312, "logps/ref_chosen": -58.381126403808594, "logps/ref_rejected": -85.02839660644531, "logps/rejected": -114.37781524658203, "loss": 0.7668, "margin_dpo/margin_mean": 13.17949104309082, "margin_dpo/margin_std": 14.035030364990234, "step": 333 }, { "epoch": 0.5049130763416477, "grad_norm": 37.4295539855957, "learning_rate": 2.9080540104031484e-07, "logits/chosen": 0.4013047218322754, "logits/rejected": 0.35357779264450073, "logps/chosen": -83.73133850097656, "logps/ref_chosen": -66.89199829101562, "logps/ref_rejected": -91.83695220947266, "logps/rejected": -117.2813491821289, "loss": 1.1505, "margin_dpo/margin_mean": 8.60505485534668, "margin_dpo/margin_std": 15.63020133972168, "step": 334 }, { "epoch": 0.5064247921390779, "grad_norm": 31.525114059448242, "learning_rate": 2.895003489933375e-07, "logits/chosen": 0.4016830623149872, "logits/rejected": 0.3621571660041809, "logps/chosen": -77.64753723144531, "logps/ref_chosen": -61.51445770263672, "logps/ref_rejected": -75.68916320800781, "logps/rejected": -102.18275451660156, "loss": 0.9979, "margin_dpo/margin_mean": 10.360504150390625, "margin_dpo/margin_std": 15.029167175292969, "step": 335 }, { "epoch": 0.5079365079365079, "grad_norm": 37.780757904052734, "learning_rate": 2.8819419203668675e-07, "logits/chosen": 0.3117416501045227, "logits/rejected": 0.2896695137023926, "logps/chosen": -89.01457214355469, "logps/ref_chosen": -68.85006713867188, "logps/ref_rejected": -92.99603271484375, "logps/rejected": -122.92210388183594, "loss": 1.0539, "margin_dpo/margin_mean": 9.76156997680664, "margin_dpo/margin_std": 16.293773651123047, "step": 336 }, { "epoch": 0.509448223733938, "grad_norm": 34.2430419921875, "learning_rate": 2.8688696670638053e-07, "logits/chosen": 0.2840282917022705, "logits/rejected": 0.24821636080741882, "logps/chosen": -92.56698608398438, "logps/ref_chosen": -73.18783569335938, "logps/ref_rejected": -86.89118957519531, "logps/rejected": -114.37174987792969, "loss": 1.0807, "margin_dpo/margin_mean": 8.101402282714844, "margin_dpo/margin_std": 13.815885543823242, "step": 337 }, { "epoch": 0.5109599395313681, "grad_norm": 33.25615692138672, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 0.33388757705688477, "logits/rejected": 0.3053157329559326, "logps/chosen": -83.34864044189453, "logps/ref_chosen": -63.939613342285156, "logps/ref_rejected": -75.34243774414062, "logps/rejected": -103.55175018310547, "loss": 1.0324, "margin_dpo/margin_mean": 8.800281524658203, "margin_dpo/margin_std": 13.543548583984375, "step": 338 }, { "epoch": 0.5124716553287982, "grad_norm": 29.72187614440918, "learning_rate": 2.842694572172736e-07, "logits/chosen": 0.47476959228515625, "logits/rejected": 0.3901880979537964, "logps/chosen": -61.99988555908203, "logps/ref_chosen": -45.54913330078125, "logps/ref_rejected": -67.0482177734375, "logps/rejected": -93.39913940429688, "loss": 0.9283, "margin_dpo/margin_mean": 9.900163650512695, "margin_dpo/margin_std": 12.933232307434082, "step": 339 }, { "epoch": 0.5139833711262283, "grad_norm": 33.46787643432617, "learning_rate": 2.8295924627584004e-07, "logits/chosen": 0.36157703399658203, "logits/rejected": 0.3349456489086151, "logps/chosen": -72.63715362548828, "logps/ref_chosen": -54.00564956665039, "logps/ref_rejected": -61.314430236816406, "logps/rejected": -90.69908142089844, "loss": 1.0838, "margin_dpo/margin_mean": 10.753141403198242, "margin_dpo/margin_std": 16.928892135620117, "step": 340 }, { "epoch": 0.5154950869236583, "grad_norm": 36.91661834716797, "learning_rate": 2.816481133934373e-07, "logits/chosen": 0.3951025605201721, "logits/rejected": 0.34404298663139343, "logps/chosen": -80.79064178466797, "logps/ref_chosen": -63.39509582519531, "logps/ref_rejected": -76.20973205566406, "logps/rejected": -104.360107421875, "loss": 1.0046, "margin_dpo/margin_mean": 10.754829406738281, "margin_dpo/margin_std": 14.602158546447754, "step": 341 }, { "epoch": 0.5170068027210885, "grad_norm": 25.358030319213867, "learning_rate": 2.8033609524527046e-07, "logits/chosen": 0.4110845625400543, "logits/rejected": 0.3650810718536377, "logps/chosen": -70.498779296875, "logps/ref_chosen": -53.047813415527344, "logps/ref_rejected": -68.2854232788086, "logps/rejected": -98.16246795654297, "loss": 0.8504, "margin_dpo/margin_mean": 12.426074981689453, "margin_dpo/margin_std": 15.092041015625, "step": 342 }, { "epoch": 0.5185185185185185, "grad_norm": 32.1805534362793, "learning_rate": 2.7902322853130753e-07, "logits/chosen": 0.27944275736808777, "logits/rejected": 0.2696824073791504, "logps/chosen": -87.056640625, "logps/ref_chosen": -70.57853698730469, "logps/ref_rejected": -84.73873901367188, "logps/rejected": -108.2984848022461, "loss": 1.1287, "margin_dpo/margin_mean": 7.081644058227539, "margin_dpo/margin_std": 12.546673774719238, "step": 343 }, { "epoch": 0.5200302343159486, "grad_norm": 33.30143737792969, "learning_rate": 2.7770954997525274e-07, "logits/chosen": 0.37141865491867065, "logits/rejected": 0.3081679344177246, "logps/chosen": -75.291015625, "logps/ref_chosen": -55.811004638671875, "logps/ref_rejected": -84.7763671875, "logps/rejected": -114.64897918701172, "loss": 0.9329, "margin_dpo/margin_mean": 10.392600059509277, "margin_dpo/margin_std": 13.607931137084961, "step": 344 }, { "epoch": 0.5215419501133787, "grad_norm": 28.18509292602539, "learning_rate": 2.7639509632351927e-07, "logits/chosen": 0.461821585893631, "logits/rejected": 0.41458916664123535, "logps/chosen": -72.05027770996094, "logps/ref_chosen": -57.786094665527344, "logps/ref_rejected": -78.91847229003906, "logps/rejected": -103.32426452636719, "loss": 0.9874, "margin_dpo/margin_mean": 10.14161491394043, "margin_dpo/margin_std": 14.554746627807617, "step": 345 }, { "epoch": 0.5230536659108088, "grad_norm": 31.265893936157227, "learning_rate": 2.7507990434420123e-07, "logits/chosen": 0.43421846628189087, "logits/rejected": 0.3566897511482239, "logps/chosen": -72.08039855957031, "logps/ref_chosen": -56.285125732421875, "logps/ref_rejected": -91.15303039550781, "logps/rejected": -118.93693542480469, "loss": 0.9291, "margin_dpo/margin_mean": 11.988627433776855, "margin_dpo/margin_std": 15.381285667419434, "step": 346 }, { "epoch": 0.5245653817082389, "grad_norm": 31.849092483520508, "learning_rate": 2.737640108260456e-07, "logits/chosen": 0.49194493889808655, "logits/rejected": 0.4413068890571594, "logps/chosen": -71.65328979492188, "logps/ref_chosen": -53.499542236328125, "logps/ref_rejected": -72.52565002441406, "logps/rejected": -99.69212341308594, "loss": 1.0635, "margin_dpo/margin_mean": 9.012718200683594, "margin_dpo/margin_std": 15.008731842041016, "step": 347 }, { "epoch": 0.5260770975056689, "grad_norm": 27.368101119995117, "learning_rate": 2.724474525774229e-07, "logits/chosen": 0.5060254335403442, "logits/rejected": 0.47323817014694214, "logps/chosen": -65.67312622070312, "logps/ref_chosen": -50.78684997558594, "logps/ref_rejected": -68.63732147216797, "logps/rejected": -96.16216278076172, "loss": 0.9281, "margin_dpo/margin_mean": 12.63856315612793, "margin_dpo/margin_std": 17.71712875366211, "step": 348 }, { "epoch": 0.527588813303099, "grad_norm": 29.27608299255371, "learning_rate": 2.711302664252973e-07, "logits/chosen": 0.43929237127304077, "logits/rejected": 0.34969234466552734, "logps/chosen": -68.72640991210938, "logps/ref_chosen": -53.32501220703125, "logps/ref_rejected": -83.21235656738281, "logps/rejected": -110.11225891113281, "loss": 0.8832, "margin_dpo/margin_mean": 11.49850082397461, "margin_dpo/margin_std": 14.258995056152344, "step": 349 }, { "epoch": 0.5291005291005291, "grad_norm": 32.149715423583984, "learning_rate": 2.698124892141971e-07, "logits/chosen": 0.3800434470176697, "logits/rejected": 0.3004031777381897, "logps/chosen": -78.58746337890625, "logps/ref_chosen": -61.62577438354492, "logps/ref_rejected": -87.63627624511719, "logps/rejected": -118.50624084472656, "loss": 0.7994, "margin_dpo/margin_mean": 13.908271789550781, "margin_dpo/margin_std": 15.544957160949707, "step": 350 }, { "epoch": 0.5306122448979592, "grad_norm": 28.493574142456055, "learning_rate": 2.6849415780518357e-07, "logits/chosen": 0.33453550934791565, "logits/rejected": 0.25856178998947144, "logps/chosen": -71.84654235839844, "logps/ref_chosen": -56.2563362121582, "logps/ref_rejected": -79.11589813232422, "logps/rejected": -105.61781311035156, "loss": 1.0, "margin_dpo/margin_mean": 10.911720275878906, "margin_dpo/margin_std": 15.10361099243164, "step": 351 }, { "epoch": 0.5321239606953893, "grad_norm": 31.185754776000977, "learning_rate": 2.6717530907482024e-07, "logits/chosen": 0.41737568378448486, "logits/rejected": 0.36074936389923096, "logps/chosen": -78.22322082519531, "logps/ref_chosen": -63.05195617675781, "logps/ref_rejected": -85.52035522460938, "logps/rejected": -110.64308166503906, "loss": 0.9394, "margin_dpo/margin_mean": 9.951467514038086, "margin_dpo/margin_std": 13.785094261169434, "step": 352 }, { "epoch": 0.5336356764928194, "grad_norm": 27.732799530029297, "learning_rate": 2.658559799141411e-07, "logits/chosen": 0.3916947841644287, "logits/rejected": 0.38925182819366455, "logps/chosen": -82.88592529296875, "logps/ref_chosen": -69.00918579101562, "logps/ref_rejected": -72.65840148925781, "logps/rejected": -97.80364990234375, "loss": 0.9485, "margin_dpo/margin_mean": 11.268506050109863, "margin_dpo/margin_std": 15.41522216796875, "step": 353 }, { "epoch": 0.5351473922902494, "grad_norm": 31.069217681884766, "learning_rate": 2.6453620722761895e-07, "logits/chosen": 0.4487117528915405, "logits/rejected": 0.31808778643608093, "logps/chosen": -54.345149993896484, "logps/ref_chosen": -39.78833770751953, "logps/ref_rejected": -69.56885528564453, "logps/rejected": -96.2296371459961, "loss": 0.8994, "margin_dpo/margin_mean": 12.10396957397461, "margin_dpo/margin_std": 15.011777877807617, "step": 354 }, { "epoch": 0.5366591080876795, "grad_norm": 34.739498138427734, "learning_rate": 2.632160279321328e-07, "logits/chosen": 0.3989719748497009, "logits/rejected": 0.2739609479904175, "logps/chosen": -62.557098388671875, "logps/ref_chosen": -46.25537872314453, "logps/ref_rejected": -78.20236206054688, "logps/rejected": -106.247802734375, "loss": 0.9684, "margin_dpo/margin_mean": 11.743728637695312, "margin_dpo/margin_std": 15.869745254516602, "step": 355 }, { "epoch": 0.5381708238851096, "grad_norm": 33.271121978759766, "learning_rate": 2.618954789559356e-07, "logits/chosen": 0.3909762501716614, "logits/rejected": 0.31542685627937317, "logps/chosen": -62.46385955810547, "logps/ref_chosen": -47.906158447265625, "logps/ref_rejected": -74.29397583007812, "logps/rejected": -98.53050994873047, "loss": 1.2069, "margin_dpo/margin_mean": 9.678831100463867, "margin_dpo/margin_std": 17.947500228881836, "step": 356 }, { "epoch": 0.5396825396825397, "grad_norm": 36.11610412597656, "learning_rate": 2.6057459723762076e-07, "logits/chosen": 0.3990505039691925, "logits/rejected": 0.3684351444244385, "logps/chosen": -79.40770721435547, "logps/ref_chosen": -62.63499450683594, "logps/ref_rejected": -65.11400604248047, "logps/rejected": -90.53868103027344, "loss": 1.1123, "margin_dpo/margin_mean": 8.651975631713867, "margin_dpo/margin_std": 14.43873405456543, "step": 357 }, { "epoch": 0.5411942554799698, "grad_norm": 29.094547271728516, "learning_rate": 2.5925341972508954e-07, "logits/chosen": 0.3368394374847412, "logits/rejected": 0.3385453522205353, "logps/chosen": -82.24810791015625, "logps/ref_chosen": -67.20960998535156, "logps/ref_rejected": -69.34715270996094, "logps/rejected": -96.86444091796875, "loss": 0.8898, "margin_dpo/margin_mean": 12.47878646850586, "margin_dpo/margin_std": 15.653668403625488, "step": 358 }, { "epoch": 0.5427059712773998, "grad_norm": 36.99692153930664, "learning_rate": 2.579319833745169e-07, "logits/chosen": 0.3435346782207489, "logits/rejected": 0.3147648572921753, "logps/chosen": -79.03623962402344, "logps/ref_chosen": -62.52578353881836, "logps/ref_rejected": -76.63114929199219, "logps/rejected": -100.06315612792969, "loss": 1.1669, "margin_dpo/margin_mean": 6.921560764312744, "margin_dpo/margin_std": 12.897404670715332, "step": 359 }, { "epoch": 0.54421768707483, "grad_norm": 32.149078369140625, "learning_rate": 2.5661032514931834e-07, "logits/chosen": 0.281266987323761, "logits/rejected": 0.18974991142749786, "logps/chosen": -79.66699981689453, "logps/ref_chosen": -63.48772048950195, "logps/ref_rejected": -90.6891098022461, "logps/rejected": -116.84323120117188, "loss": 0.964, "margin_dpo/margin_mean": 9.97484016418457, "margin_dpo/margin_std": 14.54484748840332, "step": 360 }, { "epoch": 0.54572940287226, "grad_norm": 29.79141616821289, "learning_rate": 2.552884820191154e-07, "logits/chosen": 0.4189506769180298, "logits/rejected": 0.3658691644668579, "logps/chosen": -72.93891143798828, "logps/ref_chosen": -57.917144775390625, "logps/ref_rejected": -72.39089965820312, "logps/rejected": -99.44670104980469, "loss": 0.8476, "margin_dpo/margin_mean": 12.034027099609375, "margin_dpo/margin_std": 14.130279541015625, "step": 361 }, { "epoch": 0.54724111866969, "grad_norm": 28.21626853942871, "learning_rate": 2.53966490958702e-07, "logits/chosen": 0.41983890533447266, "logits/rejected": 0.31845760345458984, "logps/chosen": -78.23361206054688, "logps/ref_chosen": -63.4434700012207, "logps/ref_rejected": -103.45516967773438, "logps/rejected": -129.9938201904297, "loss": 0.8654, "margin_dpo/margin_mean": 11.748510360717773, "margin_dpo/margin_std": 13.821972846984863, "step": 362 }, { "epoch": 0.5487528344671202, "grad_norm": 26.874298095703125, "learning_rate": 2.526443889470099e-07, "logits/chosen": 0.4121060371398926, "logits/rejected": 0.2874605059623718, "logps/chosen": -65.51070404052734, "logps/ref_chosen": -48.65182876586914, "logps/ref_rejected": -88.65904235839844, "logps/rejected": -117.357421875, "loss": 0.8606, "margin_dpo/margin_mean": 11.839500427246094, "margin_dpo/margin_std": 14.284195899963379, "step": 363 }, { "epoch": 0.5502645502645502, "grad_norm": 27.861297607421875, "learning_rate": 2.513222129660744e-07, "logits/chosen": 0.2716137170791626, "logits/rejected": 0.17793412506580353, "logps/chosen": -72.13548278808594, "logps/ref_chosen": -57.87107467651367, "logps/ref_rejected": -80.95502471923828, "logps/rejected": -107.41427612304688, "loss": 0.9838, "margin_dpo/margin_mean": 12.194845199584961, "margin_dpo/margin_std": 17.369735717773438, "step": 364 }, { "epoch": 0.5517762660619804, "grad_norm": 24.756385803222656, "learning_rate": 2.5e-07, "logits/chosen": 0.3685954213142395, "logits/rejected": 0.35689201951026917, "logps/chosen": -77.160888671875, "logps/ref_chosen": -64.94217681884766, "logps/ref_rejected": -74.8599853515625, "logps/rejected": -98.56990051269531, "loss": 0.8058, "margin_dpo/margin_mean": 11.4912109375, "margin_dpo/margin_std": 12.382892608642578, "step": 365 }, { "epoch": 0.5532879818594104, "grad_norm": 31.192968368530273, "learning_rate": 2.486777870339255e-07, "logits/chosen": 0.314043253660202, "logits/rejected": 0.29208898544311523, "logps/chosen": -66.73348999023438, "logps/ref_chosen": -55.165985107421875, "logps/ref_rejected": -65.2612075805664, "logps/rejected": -86.15950012207031, "loss": 1.0801, "margin_dpo/margin_mean": 9.330782890319824, "margin_dpo/margin_std": 15.033758163452148, "step": 366 }, { "epoch": 0.5547996976568406, "grad_norm": 28.845701217651367, "learning_rate": 2.4735561105299014e-07, "logits/chosen": 0.33946436643600464, "logits/rejected": 0.23690305650234222, "logps/chosen": -70.849365234375, "logps/ref_chosen": -56.010467529296875, "logps/ref_rejected": -77.31010437011719, "logps/rejected": -101.05453491210938, "loss": 0.9823, "margin_dpo/margin_mean": 8.905525207519531, "margin_dpo/margin_std": 12.392328262329102, "step": 367 }, { "epoch": 0.5563114134542706, "grad_norm": 31.565885543823242, "learning_rate": 2.46033509041298e-07, "logits/chosen": 0.21036341786384583, "logits/rejected": 0.20005691051483154, "logps/chosen": -90.7908935546875, "logps/ref_chosen": -74.82928466796875, "logps/ref_rejected": -76.11680603027344, "logps/rejected": -100.37398529052734, "loss": 1.0675, "margin_dpo/margin_mean": 8.295562744140625, "margin_dpo/margin_std": 13.27570915222168, "step": 368 }, { "epoch": 0.5578231292517006, "grad_norm": 27.81777572631836, "learning_rate": 2.447115179808846e-07, "logits/chosen": 0.3587338328361511, "logits/rejected": 0.3078988790512085, "logps/chosen": -73.69477081298828, "logps/ref_chosen": -58.32621765136719, "logps/ref_rejected": -80.92184448242188, "logps/rejected": -104.86062622070312, "loss": 1.0359, "margin_dpo/margin_mean": 8.570233345031738, "margin_dpo/margin_std": 12.987571716308594, "step": 369 }, { "epoch": 0.5593348450491308, "grad_norm": 31.120532989501953, "learning_rate": 2.4338967485068164e-07, "logits/chosen": 0.4456462264060974, "logits/rejected": 0.37752196192741394, "logps/chosen": -67.46039581298828, "logps/ref_chosen": -52.88372039794922, "logps/ref_rejected": -79.43692016601562, "logps/rejected": -105.49575805664062, "loss": 1.0011, "margin_dpo/margin_mean": 11.482163429260254, "margin_dpo/margin_std": 16.541217803955078, "step": 370 }, { "epoch": 0.5608465608465608, "grad_norm": 28.67278480529785, "learning_rate": 2.420680166254831e-07, "logits/chosen": 0.46257519721984863, "logits/rejected": 0.42657119035720825, "logps/chosen": -63.5814094543457, "logps/ref_chosen": -49.224212646484375, "logps/ref_rejected": -63.348472595214844, "logps/rejected": -88.49848937988281, "loss": 0.9804, "margin_dpo/margin_mean": 10.792819023132324, "margin_dpo/margin_std": 14.90372085571289, "step": 371 }, { "epoch": 0.562358276643991, "grad_norm": 35.98383331298828, "learning_rate": 2.4074658027491044e-07, "logits/chosen": 0.39039164781570435, "logits/rejected": 0.30279237031936646, "logps/chosen": -67.90390014648438, "logps/ref_chosen": -52.26955032348633, "logps/ref_rejected": -72.99522399902344, "logps/rejected": -95.57124328613281, "loss": 1.3022, "margin_dpo/margin_mean": 6.941670894622803, "margin_dpo/margin_std": 16.000974655151367, "step": 372 }, { "epoch": 0.563869992441421, "grad_norm": 39.37256622314453, "learning_rate": 2.394254027623792e-07, "logits/chosen": 0.37247511744499207, "logits/rejected": 0.3031313419342041, "logps/chosen": -77.99808502197266, "logps/ref_chosen": -61.112998962402344, "logps/ref_rejected": -76.24851989746094, "logps/rejected": -102.57091522216797, "loss": 1.1096, "margin_dpo/margin_mean": 9.437311172485352, "margin_dpo/margin_std": 16.313791275024414, "step": 373 }, { "epoch": 0.5653817082388511, "grad_norm": 34.07474136352539, "learning_rate": 2.381045210440644e-07, "logits/chosen": 0.27694612741470337, "logits/rejected": 0.2643893361091614, "logps/chosen": -87.940673828125, "logps/ref_chosen": -72.66920471191406, "logps/ref_rejected": -76.83158874511719, "logps/rejected": -104.74732208251953, "loss": 0.7677, "margin_dpo/margin_mean": 12.644261360168457, "margin_dpo/margin_std": 12.625534057617188, "step": 374 }, { "epoch": 0.5668934240362812, "grad_norm": 36.280582427978516, "learning_rate": 2.3678397206786715e-07, "logits/chosen": 0.4151899218559265, "logits/rejected": 0.35815227031707764, "logps/chosen": -72.33064270019531, "logps/ref_chosen": -57.68330383300781, "logps/ref_rejected": -79.34097290039062, "logps/rejected": -101.60260009765625, "loss": 1.0982, "margin_dpo/margin_mean": 7.614285469055176, "margin_dpo/margin_std": 13.181623458862305, "step": 375 }, { "epoch": 0.5684051398337112, "grad_norm": 28.2639217376709, "learning_rate": 2.3546379277238103e-07, "logits/chosen": 0.4273167848587036, "logits/rejected": 0.35925012826919556, "logps/chosen": -67.07015228271484, "logps/ref_chosen": -51.674072265625, "logps/ref_rejected": -75.69713592529297, "logps/rejected": -104.54731750488281, "loss": 0.8639, "margin_dpo/margin_mean": 13.45411205291748, "margin_dpo/margin_std": 16.179851531982422, "step": 376 }, { "epoch": 0.5699168556311414, "grad_norm": 28.088367462158203, "learning_rate": 2.3414402008585886e-07, "logits/chosen": 0.3443489670753479, "logits/rejected": 0.31910431385040283, "logps/chosen": -62.237247467041016, "logps/ref_chosen": -46.17853546142578, "logps/ref_rejected": -57.756500244140625, "logps/rejected": -83.15531921386719, "loss": 0.9812, "margin_dpo/margin_mean": 9.340099334716797, "margin_dpo/margin_std": 12.798967361450195, "step": 377 }, { "epoch": 0.5714285714285714, "grad_norm": 32.793365478515625, "learning_rate": 2.3282469092517977e-07, "logits/chosen": 0.42863214015960693, "logits/rejected": 0.37801802158355713, "logps/chosen": -75.05768585205078, "logps/ref_chosen": -59.21887969970703, "logps/ref_rejected": -71.2481918334961, "logps/rejected": -95.27156066894531, "loss": 1.0859, "margin_dpo/margin_mean": 8.184557914733887, "margin_dpo/margin_std": 13.956628799438477, "step": 378 }, { "epoch": 0.5729402872260015, "grad_norm": 35.66096496582031, "learning_rate": 2.3150584219481643e-07, "logits/chosen": 0.3971962034702301, "logits/rejected": 0.330384761095047, "logps/chosen": -91.02781677246094, "logps/ref_chosen": -76.31658935546875, "logps/ref_rejected": -104.26200866699219, "logps/rejected": -129.94598388671875, "loss": 0.9102, "margin_dpo/margin_mean": 10.972752571105957, "margin_dpo/margin_std": 14.204010009765625, "step": 379 }, { "epoch": 0.5744520030234316, "grad_norm": 28.557945251464844, "learning_rate": 2.3018751078580283e-07, "logits/chosen": 0.40566498041152954, "logits/rejected": 0.3627912998199463, "logps/chosen": -74.88333129882812, "logps/ref_chosen": -61.283164978027344, "logps/ref_rejected": -72.38892364501953, "logps/rejected": -98.32823181152344, "loss": 0.8653, "margin_dpo/margin_mean": 12.339146614074707, "margin_dpo/margin_std": 14.47273063659668, "step": 380 }, { "epoch": 0.5759637188208617, "grad_norm": 36.389286041259766, "learning_rate": 2.288697335747027e-07, "logits/chosen": 0.34177953004837036, "logits/rejected": 0.31463104486465454, "logps/chosen": -75.02536010742188, "logps/ref_chosen": -58.2139892578125, "logps/ref_rejected": -60.78669357299805, "logps/rejected": -82.91322326660156, "loss": 1.2548, "margin_dpo/margin_mean": 5.315165996551514, "margin_dpo/margin_std": 12.719392776489258, "step": 381 }, { "epoch": 0.5774754346182918, "grad_norm": 29.557825088500977, "learning_rate": 2.2755254742257706e-07, "logits/chosen": 0.39128854870796204, "logits/rejected": 0.3420262336730957, "logps/chosen": -79.03041076660156, "logps/ref_chosen": -61.82532501220703, "logps/ref_rejected": -83.0452880859375, "logps/rejected": -108.56074523925781, "loss": 0.9752, "margin_dpo/margin_mean": 8.310381889343262, "margin_dpo/margin_std": 11.493642807006836, "step": 382 }, { "epoch": 0.5789871504157218, "grad_norm": 37.190284729003906, "learning_rate": 2.2623598917395436e-07, "logits/chosen": 0.2815375328063965, "logits/rejected": 0.29732605814933777, "logps/chosen": -96.2820053100586, "logps/ref_chosen": -80.56326293945312, "logps/ref_rejected": -74.62922668457031, "logps/rejected": -100.91593933105469, "loss": 0.9926, "margin_dpo/margin_mean": 10.567968368530273, "margin_dpo/margin_std": 15.280168533325195, "step": 383 }, { "epoch": 0.5804988662131519, "grad_norm": 31.593162536621094, "learning_rate": 2.2492009565579875e-07, "logits/chosen": 0.37002235651016235, "logits/rejected": 0.3238743543624878, "logps/chosen": -82.05486297607422, "logps/ref_chosen": -65.47514343261719, "logps/ref_rejected": -79.67378234863281, "logps/rejected": -106.28813171386719, "loss": 0.8973, "margin_dpo/margin_mean": 10.034626007080078, "margin_dpo/margin_std": 12.21096420288086, "step": 384 }, { "epoch": 0.582010582010582, "grad_norm": 31.312774658203125, "learning_rate": 2.2360490367648084e-07, "logits/chosen": 0.31117817759513855, "logits/rejected": 0.2695065438747406, "logps/chosen": -82.10106658935547, "logps/ref_chosen": -66.0565185546875, "logps/ref_rejected": -86.68023681640625, "logps/rejected": -113.80641174316406, "loss": 0.8523, "margin_dpo/margin_mean": 11.08163070678711, "margin_dpo/margin_std": 12.792245864868164, "step": 385 }, { "epoch": 0.5835222978080121, "grad_norm": 31.37440299987793, "learning_rate": 2.2229045002474724e-07, "logits/chosen": 0.2849174737930298, "logits/rejected": 0.2268781214952469, "logps/chosen": -93.83451843261719, "logps/ref_chosen": -75.6236572265625, "logps/ref_rejected": -92.62330627441406, "logps/rejected": -119.64865112304688, "loss": 1.0064, "margin_dpo/margin_mean": 8.81447982788086, "margin_dpo/margin_std": 12.923885345458984, "step": 386 }, { "epoch": 0.5850340136054422, "grad_norm": 25.713747024536133, "learning_rate": 2.209767714686924e-07, "logits/chosen": 0.412885844707489, "logits/rejected": 0.31256186962127686, "logps/chosen": -62.97633743286133, "logps/ref_chosen": -47.22170639038086, "logps/ref_rejected": -87.33814239501953, "logps/rejected": -114.74981689453125, "loss": 0.7702, "margin_dpo/margin_mean": 11.657045364379883, "margin_dpo/margin_std": 11.923395156860352, "step": 387 }, { "epoch": 0.5865457294028723, "grad_norm": 32.77546310424805, "learning_rate": 2.1966390475472954e-07, "logits/chosen": 0.39520263671875, "logits/rejected": 0.3797769546508789, "logps/chosen": -90.493408203125, "logps/ref_chosen": -74.5794677734375, "logps/ref_rejected": -79.92558288574219, "logps/rejected": -104.2813720703125, "loss": 1.1101, "margin_dpo/margin_mean": 8.441862106323242, "margin_dpo/margin_std": 14.776924133300781, "step": 388 }, { "epoch": 0.5880574452003023, "grad_norm": 45.588809967041016, "learning_rate": 2.1835188660656265e-07, "logits/chosen": 0.3767646551132202, "logits/rejected": 0.3376935124397278, "logps/chosen": -79.09071350097656, "logps/ref_chosen": -61.624366760253906, "logps/ref_rejected": -76.50978088378906, "logps/rejected": -104.31814575195312, "loss": 0.9563, "margin_dpo/margin_mean": 10.34201431274414, "margin_dpo/margin_std": 13.894803047180176, "step": 389 }, { "epoch": 0.5895691609977324, "grad_norm": 26.247488021850586, "learning_rate": 2.170407537241599e-07, "logits/chosen": 0.4315715730190277, "logits/rejected": 0.3645378351211548, "logps/chosen": -61.070472717285156, "logps/ref_chosen": -45.871864318847656, "logps/ref_rejected": -61.305999755859375, "logps/rejected": -86.49710083007812, "loss": 0.8893, "margin_dpo/margin_mean": 9.992494583129883, "margin_dpo/margin_std": 12.295760154724121, "step": 390 }, { "epoch": 0.5910808767951625, "grad_norm": 29.182357788085938, "learning_rate": 2.1573054278272636e-07, "logits/chosen": 0.3792092204093933, "logits/rejected": 0.31141602993011475, "logps/chosen": -73.98717498779297, "logps/ref_chosen": -58.18701171875, "logps/ref_rejected": -83.63443756103516, "logps/rejected": -110.81365966796875, "loss": 0.98, "margin_dpo/margin_mean": 11.379058837890625, "margin_dpo/margin_std": 15.968740463256836, "step": 391 }, { "epoch": 0.5925925925925926, "grad_norm": 36.35900115966797, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 0.48609426617622375, "logits/rejected": 0.4238324761390686, "logps/chosen": -83.52938842773438, "logps/ref_chosen": -69.74452209472656, "logps/ref_rejected": -94.05877685546875, "logps/rejected": -120.82524871826172, "loss": 0.9217, "margin_dpo/margin_mean": 12.981613159179688, "margin_dpo/margin_std": 17.47749137878418, "step": 392 }, { "epoch": 0.5941043083900227, "grad_norm": 27.474313735961914, "learning_rate": 2.131130332936195e-07, "logits/chosen": 0.41157129406929016, "logits/rejected": 0.36979395151138306, "logps/chosen": -69.56451416015625, "logps/ref_chosen": -52.33489990234375, "logps/ref_rejected": -74.33810424804688, "logps/rejected": -103.74046325683594, "loss": 0.8095, "margin_dpo/margin_mean": 12.172752380371094, "margin_dpo/margin_std": 13.218109130859375, "step": 393 }, { "epoch": 0.5956160241874527, "grad_norm": 29.2933349609375, "learning_rate": 2.1180580796331323e-07, "logits/chosen": 0.44927114248275757, "logits/rejected": 0.41426438093185425, "logps/chosen": -76.46290588378906, "logps/ref_chosen": -60.6761360168457, "logps/ref_rejected": -71.36075592041016, "logps/rejected": -97.13763427734375, "loss": 0.8872, "margin_dpo/margin_mean": 9.990107536315918, "margin_dpo/margin_std": 12.077360153198242, "step": 394 }, { "epoch": 0.5971277399848829, "grad_norm": 31.02004051208496, "learning_rate": 2.104996510066625e-07, "logits/chosen": 0.4154667258262634, "logits/rejected": 0.3240908682346344, "logps/chosen": -66.60691833496094, "logps/ref_chosen": -50.60432434082031, "logps/ref_rejected": -77.08731079101562, "logps/rejected": -102.46751403808594, "loss": 0.9324, "margin_dpo/margin_mean": 9.377607345581055, "margin_dpo/margin_std": 12.179868698120117, "step": 395 }, { "epoch": 0.5986394557823129, "grad_norm": 30.178119659423828, "learning_rate": 2.0919459895968517e-07, "logits/chosen": 0.38538146018981934, "logits/rejected": 0.2900884747505188, "logps/chosen": -66.71278381347656, "logps/ref_chosen": -51.35961151123047, "logps/ref_rejected": -79.89360046386719, "logps/rejected": -106.4371337890625, "loss": 0.8753, "margin_dpo/margin_mean": 11.190366744995117, "margin_dpo/margin_std": 12.479292869567871, "step": 396 }, { "epoch": 0.600151171579743, "grad_norm": 46.14140701293945, "learning_rate": 2.078906883274924e-07, "logits/chosen": 0.3269771933555603, "logits/rejected": 0.2809567153453827, "logps/chosen": -84.61856842041016, "logps/ref_chosen": -66.45622253417969, "logps/ref_rejected": -85.74736022949219, "logps/rejected": -109.89884948730469, "loss": 1.3166, "margin_dpo/margin_mean": 5.98914909362793, "margin_dpo/margin_std": 14.911293029785156, "step": 397 }, { "epoch": 0.6016628873771731, "grad_norm": 27.392459869384766, "learning_rate": 2.065879555832674e-07, "logits/chosen": 0.3728417158126831, "logits/rejected": 0.3049323558807373, "logps/chosen": -64.94625091552734, "logps/ref_chosen": -49.244239807128906, "logps/ref_rejected": -75.18949127197266, "logps/rejected": -102.7054672241211, "loss": 0.8319, "margin_dpo/margin_mean": 11.813968658447266, "margin_dpo/margin_std": 13.981929779052734, "step": 398 }, { "epoch": 0.6031746031746031, "grad_norm": 30.352577209472656, "learning_rate": 2.052864371672457e-07, "logits/chosen": 0.3121348023414612, "logits/rejected": 0.1789398044347763, "logps/chosen": -85.18453979492188, "logps/ref_chosen": -68.30679321289062, "logps/ref_rejected": -113.2708511352539, "logps/rejected": -143.73336791992188, "loss": 0.7977, "margin_dpo/margin_mean": 13.584760665893555, "margin_dpo/margin_std": 16.35565185546875, "step": 399 }, { "epoch": 0.6046863189720333, "grad_norm": 40.554786682128906, "learning_rate": 2.0398616948569493e-07, "logits/chosen": 0.397554486989975, "logits/rejected": 0.3408077657222748, "logps/chosen": -91.72099304199219, "logps/ref_chosen": -71.62649536132812, "logps/ref_rejected": -90.98765563964844, "logps/rejected": -119.32801055908203, "loss": 1.084, "margin_dpo/margin_mean": 8.245853424072266, "margin_dpo/margin_std": 14.028858184814453, "step": 400 }, { "epoch": 0.6046863189720333, "eval_logits/chosen": 0.391160786151886, "eval_logits/rejected": 0.3394322395324707, "eval_logps/chosen": -91.2626724243164, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -104.81573486328125, "eval_loss": 0.5336794853210449, "eval_margin_dpo/margin_mean": 8.86353588104248, "eval_margin_dpo/margin_std": 14.35659122467041, "eval_runtime": 38.6454, "eval_samples_per_second": 59.593, "eval_steps_per_second": 1.863, "step": 400 }, { "epoch": 0.6061980347694633, "grad_norm": 26.281349182128906, "learning_rate": 2.0268718890989752e-07, "logits/chosen": 0.4084246754646301, "logits/rejected": 0.31023353338241577, "logps/chosen": -68.73127746582031, "logps/ref_chosen": -53.72496032714844, "logps/ref_rejected": -75.06304931640625, "logps/rejected": -103.77824401855469, "loss": 0.7719, "margin_dpo/margin_mean": 13.708871841430664, "margin_dpo/margin_std": 14.911130905151367, "step": 401 }, { "epoch": 0.6077097505668935, "grad_norm": 32.728675842285156, "learning_rate": 2.013895317751323e-07, "logits/chosen": 0.40241509675979614, "logits/rejected": 0.3629066050052643, "logps/chosen": -77.94585418701172, "logps/ref_chosen": -61.873931884765625, "logps/ref_rejected": -66.1519775390625, "logps/rejected": -92.5583724975586, "loss": 0.9704, "margin_dpo/margin_mean": 10.334466934204102, "margin_dpo/margin_std": 14.114884376525879, "step": 402 }, { "epoch": 0.6092214663643235, "grad_norm": 26.115026473999023, "learning_rate": 2.0009323437965898e-07, "logits/chosen": 0.5070240497589111, "logits/rejected": 0.42596685886383057, "logps/chosen": -68.05751037597656, "logps/ref_chosen": -51.321502685546875, "logps/ref_rejected": -86.54010772705078, "logps/rejected": -117.83511352539062, "loss": 0.7949, "margin_dpo/margin_mean": 14.55899715423584, "margin_dpo/margin_std": 16.20275115966797, "step": 403 }, { "epoch": 0.6107331821617535, "grad_norm": 34.61177444458008, "learning_rate": 1.9879833298370237e-07, "logits/chosen": 0.3717535734176636, "logits/rejected": 0.2808607518672943, "logps/chosen": -77.73711395263672, "logps/ref_chosen": -62.26288604736328, "logps/ref_rejected": -95.19029998779297, "logps/rejected": -122.16180419921875, "loss": 0.925, "margin_dpo/margin_mean": 11.497272491455078, "margin_dpo/margin_std": 14.764816284179688, "step": 404 }, { "epoch": 0.6122448979591837, "grad_norm": 26.180768966674805, "learning_rate": 1.975048638084379e-07, "logits/chosen": 0.44532907009124756, "logits/rejected": 0.39605429768562317, "logps/chosen": -66.79632568359375, "logps/ref_chosen": -50.58434295654297, "logps/ref_rejected": -65.43156433105469, "logps/rejected": -92.28169250488281, "loss": 0.9244, "margin_dpo/margin_mean": 10.638147354125977, "margin_dpo/margin_std": 12.895776748657227, "step": 405 }, { "epoch": 0.6137566137566137, "grad_norm": 26.834482192993164, "learning_rate": 1.9621286303497914e-07, "logits/chosen": 0.4308337867259979, "logits/rejected": 0.2796477973461151, "logps/chosen": -64.9953842163086, "logps/ref_chosen": -48.99560546875, "logps/ref_rejected": -92.47773742675781, "logps/rejected": -120.5987777709961, "loss": 0.8967, "margin_dpo/margin_mean": 12.121261596679688, "margin_dpo/margin_std": 14.868024826049805, "step": 406 }, { "epoch": 0.6152683295540439, "grad_norm": 41.65665817260742, "learning_rate": 1.9492236680336483e-07, "logits/chosen": 0.29304084181785583, "logits/rejected": 0.21816039085388184, "logps/chosen": -109.28121948242188, "logps/ref_chosen": -89.40056610107422, "logps/ref_rejected": -99.28775024414062, "logps/rejected": -128.31704711914062, "loss": 1.0474, "margin_dpo/margin_mean": 9.148632049560547, "margin_dpo/margin_std": 14.953045845031738, "step": 407 }, { "epoch": 0.6167800453514739, "grad_norm": 24.90476417541504, "learning_rate": 1.9363341121154895e-07, "logits/chosen": 0.38886088132858276, "logits/rejected": 0.3072519898414612, "logps/chosen": -69.03826904296875, "logps/ref_chosen": -54.70391845703125, "logps/ref_rejected": -73.98648834228516, "logps/rejected": -101.77029418945312, "loss": 0.7863, "margin_dpo/margin_mean": 13.449457168579102, "margin_dpo/margin_std": 14.428291320800781, "step": 408 }, { "epoch": 0.618291761148904, "grad_norm": 33.50522232055664, "learning_rate": 1.9234603231438994e-07, "logits/chosen": 0.3977552652359009, "logits/rejected": 0.397859662771225, "logps/chosen": -79.27891540527344, "logps/ref_chosen": -62.11822509765625, "logps/ref_rejected": -61.933509826660156, "logps/rejected": -87.280517578125, "loss": 1.1203, "margin_dpo/margin_mean": 8.186317443847656, "margin_dpo/margin_std": 14.839698791503906, "step": 409 }, { "epoch": 0.6198034769463341, "grad_norm": 29.58933448791504, "learning_rate": 1.9106026612264315e-07, "logits/chosen": 0.4006178379058838, "logits/rejected": 0.3718770742416382, "logps/chosen": -77.56334686279297, "logps/ref_chosen": -61.80265808105469, "logps/ref_rejected": -76.60001373291016, "logps/rejected": -102.92352294921875, "loss": 0.809, "margin_dpo/margin_mean": 10.562816619873047, "margin_dpo/margin_std": 10.718137741088867, "step": 410 }, { "epoch": 0.6213151927437641, "grad_norm": 37.61003875732422, "learning_rate": 1.8977614860195296e-07, "logits/chosen": 0.38806748390197754, "logits/rejected": 0.3275112211704254, "logps/chosen": -73.58920288085938, "logps/ref_chosen": -54.445396423339844, "logps/ref_rejected": -74.56507873535156, "logps/rejected": -105.17032623291016, "loss": 1.0566, "margin_dpo/margin_mean": 11.461427688598633, "margin_dpo/margin_std": 17.844844818115234, "step": 411 }, { "epoch": 0.6228269085411943, "grad_norm": 30.01692008972168, "learning_rate": 1.8849371567184662e-07, "logits/chosen": 0.39307522773742676, "logits/rejected": 0.32400524616241455, "logps/chosen": -74.90364837646484, "logps/ref_chosen": -55.248085021972656, "logps/ref_rejected": -68.96623229980469, "logps/rejected": -99.93446350097656, "loss": 0.8748, "margin_dpo/margin_mean": 11.312671661376953, "margin_dpo/margin_std": 14.340400695800781, "step": 412 }, { "epoch": 0.6243386243386243, "grad_norm": 37.489013671875, "learning_rate": 1.872130032047302e-07, "logits/chosen": 0.2387150228023529, "logits/rejected": 0.20277327299118042, "logps/chosen": -88.35789489746094, "logps/ref_chosen": -68.72074890136719, "logps/ref_rejected": -78.76539611816406, "logps/rejected": -108.86207580566406, "loss": 1.0298, "margin_dpo/margin_mean": 10.459529876708984, "margin_dpo/margin_std": 16.46000099182129, "step": 413 }, { "epoch": 0.6258503401360545, "grad_norm": 30.536802291870117, "learning_rate": 1.8593404702488436e-07, "logits/chosen": 0.38499364256858826, "logits/rejected": 0.32137370109558105, "logps/chosen": -72.59126281738281, "logps/ref_chosen": -54.13821792602539, "logps/ref_rejected": -74.65741729736328, "logps/rejected": -105.45807647705078, "loss": 0.8818, "margin_dpo/margin_mean": 12.347614288330078, "margin_dpo/margin_std": 15.455865859985352, "step": 414 }, { "epoch": 0.6273620559334845, "grad_norm": 33.240840911865234, "learning_rate": 1.846568829074628e-07, "logits/chosen": 0.39433568716049194, "logits/rejected": 0.3705742359161377, "logps/chosen": -73.94824981689453, "logps/ref_chosen": -55.91856002807617, "logps/ref_rejected": -61.747703552246094, "logps/rejected": -90.88080596923828, "loss": 1.0148, "margin_dpo/margin_mean": 11.103410720825195, "margin_dpo/margin_std": 16.630678176879883, "step": 415 }, { "epoch": 0.6288737717309146, "grad_norm": 39.10771179199219, "learning_rate": 1.8338154657749128e-07, "logits/chosen": 0.36848437786102295, "logits/rejected": 0.316191703081131, "logps/chosen": -73.673095703125, "logps/ref_chosen": -54.72308349609375, "logps/ref_rejected": -69.17388916015625, "logps/rejected": -97.229248046875, "loss": 1.118, "margin_dpo/margin_mean": 9.105339050292969, "margin_dpo/margin_std": 14.868558883666992, "step": 416 }, { "epoch": 0.6303854875283447, "grad_norm": 35.6463508605957, "learning_rate": 1.8210807370886849e-07, "logits/chosen": 0.4883832335472107, "logits/rejected": 0.4186071753501892, "logps/chosen": -77.90934753417969, "logps/ref_chosen": -56.791259765625, "logps/ref_rejected": -68.7791748046875, "logps/rejected": -101.26014709472656, "loss": 0.9539, "margin_dpo/margin_mean": 11.362879753112793, "margin_dpo/margin_std": 15.073711395263672, "step": 417 }, { "epoch": 0.6318972033257747, "grad_norm": 48.182716369628906, "learning_rate": 1.8083649992336825e-07, "logits/chosen": 0.39419084787368774, "logits/rejected": 0.39882832765579224, "logps/chosen": -91.99015808105469, "logps/ref_chosen": -69.10798645019531, "logps/ref_rejected": -75.09132385253906, "logps/rejected": -105.7783203125, "loss": 1.2277, "margin_dpo/margin_mean": 7.804815292358398, "margin_dpo/margin_std": 16.62636375427246, "step": 418 }, { "epoch": 0.6334089191232048, "grad_norm": 29.898523330688477, "learning_rate": 1.7956686078964255e-07, "logits/chosen": 0.2999052107334137, "logits/rejected": 0.23983854055404663, "logps/chosen": -74.1788558959961, "logps/ref_chosen": -58.1717643737793, "logps/ref_rejected": -71.67066955566406, "logps/rejected": -100.13961791992188, "loss": 0.8586, "margin_dpo/margin_mean": 12.461854934692383, "margin_dpo/margin_std": 14.932546615600586, "step": 419 }, { "epoch": 0.6349206349206349, "grad_norm": 41.883358001708984, "learning_rate": 1.782991918222275e-07, "logits/chosen": 0.3532024621963501, "logits/rejected": 0.30437958240509033, "logps/chosen": -79.4993896484375, "logps/ref_chosen": -57.05351257324219, "logps/ref_rejected": -62.670982360839844, "logps/rejected": -92.65522766113281, "loss": 1.2803, "margin_dpo/margin_mean": 7.538368225097656, "margin_dpo/margin_std": 16.784019470214844, "step": 420 }, { "epoch": 0.636432350718065, "grad_norm": 41.40155792236328, "learning_rate": 1.7703352848054887e-07, "logits/chosen": 0.339724600315094, "logits/rejected": 0.27951130270957947, "logps/chosen": -78.44889068603516, "logps/ref_chosen": -57.32324981689453, "logps/ref_rejected": -75.33782958984375, "logps/rejected": -105.40597534179688, "loss": 1.2636, "margin_dpo/margin_mean": 8.942495346069336, "margin_dpo/margin_std": 18.20132064819336, "step": 421 }, { "epoch": 0.6379440665154951, "grad_norm": 32.80634689331055, "learning_rate": 1.7576990616793137e-07, "logits/chosen": 0.37151938676834106, "logits/rejected": 0.34015408158302307, "logps/chosen": -84.34286499023438, "logps/ref_chosen": -67.05757904052734, "logps/ref_rejected": -72.12803649902344, "logps/rejected": -100.574951171875, "loss": 0.8976, "margin_dpo/margin_mean": 11.161640167236328, "margin_dpo/margin_std": 14.631818771362305, "step": 422 }, { "epoch": 0.6394557823129252, "grad_norm": 30.248695373535156, "learning_rate": 1.745083602306071e-07, "logits/chosen": 0.4215930104255676, "logits/rejected": 0.35043540596961975, "logps/chosen": -72.92449951171875, "logps/ref_chosen": -54.061668395996094, "logps/ref_rejected": -76.64092254638672, "logps/rejected": -107.888671875, "loss": 0.9148, "margin_dpo/margin_mean": 12.38492202758789, "margin_dpo/margin_std": 16.511720657348633, "step": 423 }, { "epoch": 0.6409674981103552, "grad_norm": 32.00240707397461, "learning_rate": 1.7324892595672804e-07, "logits/chosen": 0.29672929644584656, "logits/rejected": 0.2571881115436554, "logps/chosen": -71.3875503540039, "logps/ref_chosen": -53.60887145996094, "logps/ref_rejected": -79.2139892578125, "logps/rejected": -109.96659851074219, "loss": 0.8388, "margin_dpo/margin_mean": 12.973922729492188, "margin_dpo/margin_std": 15.422065734863281, "step": 424 }, { "epoch": 0.6424792139077853, "grad_norm": 29.581701278686523, "learning_rate": 1.7199163857537824e-07, "logits/chosen": 0.4177130460739136, "logits/rejected": 0.38378778100013733, "logps/chosen": -77.00273895263672, "logps/ref_chosen": -58.41468048095703, "logps/ref_rejected": -66.59054565429688, "logps/rejected": -95.47838592529297, "loss": 0.9742, "margin_dpo/margin_mean": 10.299787521362305, "margin_dpo/margin_std": 14.727434158325195, "step": 425 }, { "epoch": 0.6439909297052154, "grad_norm": 44.239986419677734, "learning_rate": 1.7073653325558828e-07, "logits/chosen": 0.34781593084335327, "logits/rejected": 0.3494877219200134, "logps/chosen": -93.83209228515625, "logps/ref_chosen": -71.70822143554688, "logps/ref_rejected": -73.57725524902344, "logps/rejected": -102.10546875, "loss": 1.2957, "margin_dpo/margin_mean": 6.4043288230896, "margin_dpo/margin_std": 16.000539779663086, "step": 426 }, { "epoch": 0.6455026455026455, "grad_norm": 30.25602149963379, "learning_rate": 1.6948364510535218e-07, "logits/chosen": 0.4051620662212372, "logits/rejected": 0.3498576283454895, "logps/chosen": -78.96176147460938, "logps/ref_chosen": -58.64276885986328, "logps/ref_rejected": -86.25437927246094, "logps/rejected": -117.43075561523438, "loss": 0.9869, "margin_dpo/margin_mean": 10.857380867004395, "margin_dpo/margin_std": 16.25821304321289, "step": 427 }, { "epoch": 0.6470143613000756, "grad_norm": 35.63125991821289, "learning_rate": 1.6823300917064458e-07, "logits/chosen": 0.34857386350631714, "logits/rejected": 0.29594945907592773, "logps/chosen": -86.02629089355469, "logps/ref_chosen": -66.5960464477539, "logps/ref_rejected": -82.3941650390625, "logps/rejected": -114.90159606933594, "loss": 0.9136, "margin_dpo/margin_mean": 13.07719612121582, "margin_dpo/margin_std": 17.15648651123047, "step": 428 }, { "epoch": 0.6485260770975056, "grad_norm": 32.7097282409668, "learning_rate": 1.669846604344412e-07, "logits/chosen": 0.3295377492904663, "logits/rejected": 0.33729058504104614, "logps/chosen": -77.25363159179688, "logps/ref_chosen": -57.009700775146484, "logps/ref_rejected": -59.86549377441406, "logps/rejected": -90.89605712890625, "loss": 0.9879, "margin_dpo/margin_mean": 10.786637306213379, "margin_dpo/margin_std": 15.325776100158691, "step": 429 }, { "epoch": 0.6500377928949358, "grad_norm": 29.613990783691406, "learning_rate": 1.6573863381573954e-07, "logits/chosen": 0.25784656405448914, "logits/rejected": 0.24630099534988403, "logps/chosen": -78.46931457519531, "logps/ref_chosen": -59.563194274902344, "logps/ref_rejected": -70.52289581298828, "logps/rejected": -101.87786865234375, "loss": 0.8551, "margin_dpo/margin_mean": 12.448851585388184, "margin_dpo/margin_std": 15.234640121459961, "step": 430 }, { "epoch": 0.6515495086923658, "grad_norm": 31.293846130371094, "learning_rate": 1.6449496416858282e-07, "logits/chosen": 0.3527390658855438, "logits/rejected": 0.3057120442390442, "logps/chosen": -67.58097839355469, "logps/ref_chosen": -50.20032501220703, "logps/ref_rejected": -77.81680297851562, "logps/rejected": -105.64657592773438, "loss": 0.9619, "margin_dpo/margin_mean": 10.449111938476562, "margin_dpo/margin_std": 15.20313549041748, "step": 431 }, { "epoch": 0.6530612244897959, "grad_norm": 28.854806900024414, "learning_rate": 1.632536862810844e-07, "logits/chosen": 0.40595024824142456, "logits/rejected": 0.35618919134140015, "logps/chosen": -79.57620239257812, "logps/ref_chosen": -61.662757873535156, "logps/ref_rejected": -83.94496154785156, "logps/rejected": -112.93707275390625, "loss": 0.9282, "margin_dpo/margin_mean": 11.078676223754883, "margin_dpo/margin_std": 14.486236572265625, "step": 432 }, { "epoch": 0.654572940287226, "grad_norm": 28.713123321533203, "learning_rate": 1.6201483487445515e-07, "logits/chosen": 0.468322217464447, "logits/rejected": 0.45534592866897583, "logps/chosen": -82.63919067382812, "logps/ref_chosen": -63.72918701171875, "logps/ref_rejected": -65.8391342163086, "logps/rejected": -98.62335205078125, "loss": 0.8754, "margin_dpo/margin_mean": 13.874216079711914, "margin_dpo/margin_std": 17.385021209716797, "step": 433 }, { "epoch": 0.656084656084656, "grad_norm": 29.267993927001953, "learning_rate": 1.6077844460203204e-07, "logits/chosen": 0.4577040672302246, "logits/rejected": 0.3909740447998047, "logps/chosen": -64.70407104492188, "logps/ref_chosen": -47.97331619262695, "logps/ref_rejected": -72.51132202148438, "logps/rejected": -101.47706604003906, "loss": 0.988, "margin_dpo/margin_mean": 12.234983444213867, "margin_dpo/margin_std": 16.952194213867188, "step": 434 }, { "epoch": 0.6575963718820862, "grad_norm": 32.02671813964844, "learning_rate": 1.5954455004830878e-07, "logits/chosen": 0.4733126759529114, "logits/rejected": 0.43098342418670654, "logps/chosen": -76.56661987304688, "logps/ref_chosen": -57.06024932861328, "logps/ref_rejected": -71.69146728515625, "logps/rejected": -101.79170227050781, "loss": 0.9515, "margin_dpo/margin_mean": 10.593864440917969, "margin_dpo/margin_std": 14.23061752319336, "step": 435 }, { "epoch": 0.6591080876795162, "grad_norm": 30.705045700073242, "learning_rate": 1.5831318572796847e-07, "logits/chosen": 0.3775298297405243, "logits/rejected": 0.31887999176979065, "logps/chosen": -74.879150390625, "logps/ref_chosen": -56.158050537109375, "logps/ref_rejected": -67.63787841796875, "logps/rejected": -97.3807373046875, "loss": 0.9631, "margin_dpo/margin_mean": 11.02175521850586, "margin_dpo/margin_std": 14.938774108886719, "step": 436 }, { "epoch": 0.6606198034769464, "grad_norm": 35.06089782714844, "learning_rate": 1.5708438608491815e-07, "logits/chosen": 0.36842063069343567, "logits/rejected": 0.2398551106452942, "logps/chosen": -77.53937530517578, "logps/ref_chosen": -56.98578643798828, "logps/ref_rejected": -85.61524963378906, "logps/rejected": -116.64567565917969, "loss": 1.1462, "margin_dpo/margin_mean": 10.476823806762695, "margin_dpo/margin_std": 18.057945251464844, "step": 437 }, { "epoch": 0.6621315192743764, "grad_norm": 27.175416946411133, "learning_rate": 1.558581854913253e-07, "logits/chosen": 0.4072152376174927, "logits/rejected": 0.33879750967025757, "logps/chosen": -59.64152145385742, "logps/ref_chosen": -41.27777862548828, "logps/ref_rejected": -65.33840942382812, "logps/rejected": -96.728515625, "loss": 0.8997, "margin_dpo/margin_mean": 13.026363372802734, "margin_dpo/margin_std": 16.39688491821289, "step": 438 }, { "epoch": 0.6636432350718064, "grad_norm": 34.17799758911133, "learning_rate": 1.5463461824665658e-07, "logits/chosen": 0.3313141465187073, "logits/rejected": 0.29224908351898193, "logps/chosen": -99.81672668457031, "logps/ref_chosen": -81.41764831542969, "logps/ref_rejected": -94.72309875488281, "logps/rejected": -124.81297302246094, "loss": 0.8734, "margin_dpo/margin_mean": 11.690802574157715, "margin_dpo/margin_std": 14.000129699707031, "step": 439 }, { "epoch": 0.6651549508692366, "grad_norm": 35.477378845214844, "learning_rate": 1.534137185767178e-07, "logits/chosen": 0.3310989737510681, "logits/rejected": 0.23319561779499054, "logps/chosen": -59.91465759277344, "logps/ref_chosen": -42.538185119628906, "logps/ref_rejected": -69.78813934326172, "logps/rejected": -99.70895385742188, "loss": 0.89, "margin_dpo/margin_mean": 12.544342041015625, "margin_dpo/margin_std": 15.960411071777344, "step": 440 }, { "epoch": 0.6666666666666666, "grad_norm": 25.895111083984375, "learning_rate": 1.521955206326976e-07, "logits/chosen": 0.32143065333366394, "logits/rejected": 0.2299826741218567, "logps/chosen": -73.61805725097656, "logps/ref_chosen": -57.593223571777344, "logps/ref_rejected": -84.82878875732422, "logps/rejected": -114.32113647460938, "loss": 0.7072, "margin_dpo/margin_mean": 13.467504501342773, "margin_dpo/margin_std": 12.755413055419922, "step": 441 }, { "epoch": 0.6681783824640968, "grad_norm": 34.46872329711914, "learning_rate": 1.5098005849021078e-07, "logits/chosen": 0.38055524230003357, "logits/rejected": 0.3312191963195801, "logps/chosen": -88.37391662597656, "logps/ref_chosen": -67.46121978759766, "logps/ref_rejected": -89.0693588256836, "logps/rejected": -121.82472229003906, "loss": 0.9047, "margin_dpo/margin_mean": 11.842670440673828, "margin_dpo/margin_std": 15.179086685180664, "step": 442 }, { "epoch": 0.6696900982615268, "grad_norm": 27.29175567626953, "learning_rate": 1.4976736614834662e-07, "logits/chosen": 0.3942459225654602, "logits/rejected": 0.31970757246017456, "logps/chosen": -71.87895202636719, "logps/ref_chosen": -54.79609680175781, "logps/ref_rejected": -77.80782318115234, "logps/rejected": -110.1363525390625, "loss": 0.8146, "margin_dpo/margin_mean": 15.245677947998047, "margin_dpo/margin_std": 17.69448471069336, "step": 443 }, { "epoch": 0.671201814058957, "grad_norm": 40.294551849365234, "learning_rate": 1.4855747752871654e-07, "logits/chosen": 0.384095162153244, "logits/rejected": 0.30130642652511597, "logps/chosen": -80.62181091308594, "logps/ref_chosen": -58.749061584472656, "logps/ref_rejected": -86.87397003173828, "logps/rejected": -114.7911376953125, "loss": 1.3101, "margin_dpo/margin_mean": 6.04442024230957, "margin_dpo/margin_std": 14.843099594116211, "step": 444 }, { "epoch": 0.672713529856387, "grad_norm": 34.12118148803711, "learning_rate": 1.473504264745062e-07, "logits/chosen": 0.3511345088481903, "logits/rejected": 0.3304353356361389, "logps/chosen": -83.09930419921875, "logps/ref_chosen": -60.91743850708008, "logps/ref_rejected": -71.56373596191406, "logps/rejected": -105.06558990478516, "loss": 0.9353, "margin_dpo/margin_mean": 11.31997299194336, "margin_dpo/margin_std": 14.875, "step": 445 }, { "epoch": 0.674225245653817, "grad_norm": 23.161922454833984, "learning_rate": 1.461462467495284e-07, "logits/chosen": 0.4186308979988098, "logits/rejected": 0.3374045491218567, "logps/chosen": -66.56710052490234, "logps/ref_chosen": -48.79924774169922, "logps/ref_rejected": -71.87195587158203, "logps/rejected": -105.11751556396484, "loss": 0.6502, "margin_dpo/margin_mean": 15.477707862854004, "margin_dpo/margin_std": 13.376574516296387, "step": 446 }, { "epoch": 0.6757369614512472, "grad_norm": 27.70553207397461, "learning_rate": 1.4494497203727843e-07, "logits/chosen": 0.3202866315841675, "logits/rejected": 0.21970438957214355, "logps/chosen": -70.60762023925781, "logps/ref_chosen": -53.682716369628906, "logps/ref_rejected": -88.17315673828125, "logps/rejected": -120.407470703125, "loss": 0.7725, "margin_dpo/margin_mean": 15.309407234191895, "margin_dpo/margin_std": 16.02827262878418, "step": 447 }, { "epoch": 0.6772486772486772, "grad_norm": 26.229351043701172, "learning_rate": 1.4374663593999256e-07, "logits/chosen": 0.39596807956695557, "logits/rejected": 0.34193187952041626, "logps/chosen": -72.67399597167969, "logps/ref_chosen": -53.75125503540039, "logps/ref_rejected": -77.17623901367188, "logps/rejected": -108.64974975585938, "loss": 0.8676, "margin_dpo/margin_mean": 12.550762176513672, "margin_dpo/margin_std": 15.388051986694336, "step": 448 }, { "epoch": 0.6787603930461074, "grad_norm": 36.61587142944336, "learning_rate": 1.4255127197770707e-07, "logits/chosen": 0.2539058327674866, "logits/rejected": 0.24261929094791412, "logps/chosen": -99.29594421386719, "logps/ref_chosen": -75.82737731933594, "logps/ref_rejected": -82.20687103271484, "logps/rejected": -113.31431579589844, "loss": 1.0785, "margin_dpo/margin_mean": 7.638876914978027, "margin_dpo/margin_std": 12.820967674255371, "step": 449 }, { "epoch": 0.6802721088435374, "grad_norm": 33.98078536987305, "learning_rate": 1.4135891358732205e-07, "logits/chosen": 0.4526249170303345, "logits/rejected": 0.34345030784606934, "logps/chosen": -65.86204528808594, "logps/ref_chosen": -47.11572265625, "logps/ref_rejected": -78.7546615600586, "logps/rejected": -105.94377136230469, "loss": 1.0763, "margin_dpo/margin_mean": 8.44278335571289, "margin_dpo/margin_std": 14.431180953979492, "step": 450 }, { "epoch": 0.6817838246409675, "grad_norm": 35.303871154785156, "learning_rate": 1.4016959412166437e-07, "logits/chosen": 0.3524070382118225, "logits/rejected": 0.3001144528388977, "logps/chosen": -82.66523742675781, "logps/ref_chosen": -63.350440979003906, "logps/ref_rejected": -76.28530883789062, "logps/rejected": -104.85352325439453, "loss": 1.0359, "margin_dpo/margin_mean": 9.253421783447266, "margin_dpo/margin_std": 14.614404678344727, "step": 451 }, { "epoch": 0.6832955404383976, "grad_norm": 33.860225677490234, "learning_rate": 1.3898334684855645e-07, "logits/chosen": 0.34719571471214294, "logits/rejected": 0.26927071809768677, "logps/chosen": -74.74221801757812, "logps/ref_chosen": -55.585838317871094, "logps/ref_rejected": -77.68738555908203, "logps/rejected": -107.97444152832031, "loss": 0.9814, "margin_dpo/margin_mean": 11.130671501159668, "margin_dpo/margin_std": 15.483327865600586, "step": 452 }, { "epoch": 0.6848072562358276, "grad_norm": 32.40834045410156, "learning_rate": 1.3780020494988445e-07, "logits/chosen": 0.34208589792251587, "logits/rejected": 0.3065118193626404, "logps/chosen": -79.8968505859375, "logps/ref_chosen": -61.778202056884766, "logps/ref_rejected": -71.51402282714844, "logps/rejected": -100.97367858886719, "loss": 0.9782, "margin_dpo/margin_mean": 11.34100341796875, "margin_dpo/margin_std": 16.038299560546875, "step": 453 }, { "epoch": 0.6863189720332578, "grad_norm": 29.024457931518555, "learning_rate": 1.366202015206706e-07, "logits/chosen": 0.3748947083950043, "logits/rejected": 0.3294043242931366, "logps/chosen": -68.87191009521484, "logps/ref_chosen": -51.59515380859375, "logps/ref_rejected": -63.967323303222656, "logps/rejected": -93.42787170410156, "loss": 0.9702, "margin_dpo/margin_mean": 12.183794021606445, "margin_dpo/margin_std": 16.67722511291504, "step": 454 }, { "epoch": 0.6878306878306878, "grad_norm": 31.36719512939453, "learning_rate": 1.354433695681474e-07, "logits/chosen": 0.2365991175174713, "logits/rejected": 0.196326345205307, "logps/chosen": -90.68863677978516, "logps/ref_chosen": -70.65170288085938, "logps/ref_rejected": -77.44276428222656, "logps/rejected": -109.77125549316406, "loss": 0.8471, "margin_dpo/margin_mean": 12.291560173034668, "margin_dpo/margin_std": 15.06167984008789, "step": 455 }, { "epoch": 0.6893424036281179, "grad_norm": 31.750394821166992, "learning_rate": 1.3426974201083439e-07, "logits/chosen": 0.29783540964126587, "logits/rejected": 0.23817452788352966, "logps/chosen": -76.01242065429688, "logps/ref_chosen": -56.398284912109375, "logps/ref_rejected": -82.61642456054688, "logps/rejected": -112.9671630859375, "loss": 0.924, "margin_dpo/margin_mean": 10.736598014831543, "margin_dpo/margin_std": 14.164466857910156, "step": 456 }, { "epoch": 0.690854119425548, "grad_norm": 29.68619728088379, "learning_rate": 1.3309935167761717e-07, "logits/chosen": 0.42319661378860474, "logits/rejected": 0.35049009323120117, "logps/chosen": -65.1966552734375, "logps/ref_chosen": -44.72057342529297, "logps/ref_rejected": -68.11585998535156, "logps/rejected": -98.49119567871094, "loss": 0.9288, "margin_dpo/margin_mean": 9.899250984191895, "margin_dpo/margin_std": 12.776674270629883, "step": 457 }, { "epoch": 0.6923658352229781, "grad_norm": 32.18935775756836, "learning_rate": 1.3193223130682936e-07, "logits/chosen": 0.3539985418319702, "logits/rejected": 0.24194201827049255, "logps/chosen": -68.06039428710938, "logps/ref_chosen": -50.00569152832031, "logps/ref_rejected": -87.50015258789062, "logps/rejected": -117.12113189697266, "loss": 0.9265, "margin_dpo/margin_mean": 11.566278457641602, "margin_dpo/margin_std": 14.978096008300781, "step": 458 }, { "epoch": 0.6938775510204082, "grad_norm": 29.448610305786133, "learning_rate": 1.3076841354533658e-07, "logits/chosen": 0.4100639820098877, "logits/rejected": 0.3683810532093048, "logps/chosen": -83.21324157714844, "logps/ref_chosen": -65.37794494628906, "logps/ref_rejected": -88.19244384765625, "logps/rejected": -119.49298095703125, "loss": 0.819, "margin_dpo/margin_mean": 13.465246200561523, "margin_dpo/margin_std": 14.571124076843262, "step": 459 }, { "epoch": 0.6953892668178382, "grad_norm": 31.59938621520996, "learning_rate": 1.2960793094762345e-07, "logits/chosen": 0.4093438982963562, "logits/rejected": 0.2837139070034027, "logps/chosen": -83.06444549560547, "logps/ref_chosen": -64.5616683959961, "logps/ref_rejected": -88.67889404296875, "logps/rejected": -122.12751770019531, "loss": 0.7529, "margin_dpo/margin_mean": 14.945836067199707, "margin_dpo/margin_std": 16.06540298461914, "step": 460 }, { "epoch": 0.6969009826152683, "grad_norm": 27.819725036621094, "learning_rate": 1.2845081597488286e-07, "logits/chosen": 0.49810439348220825, "logits/rejected": 0.4051767587661743, "logps/chosen": -65.88737487792969, "logps/ref_chosen": -49.4779167175293, "logps/ref_rejected": -72.65262603759766, "logps/rejected": -101.95249938964844, "loss": 0.8445, "margin_dpo/margin_mean": 12.89040756225586, "margin_dpo/margin_std": 14.886733055114746, "step": 461 }, { "epoch": 0.6984126984126984, "grad_norm": 28.74095344543457, "learning_rate": 1.27297100994108e-07, "logits/chosen": 0.34951984882354736, "logits/rejected": 0.2860267162322998, "logps/chosen": -78.25369262695312, "logps/ref_chosen": -60.4951171875, "logps/ref_rejected": -74.82137298583984, "logps/rejected": -107.18858337402344, "loss": 0.7492, "margin_dpo/margin_mean": 14.608634948730469, "margin_dpo/margin_std": 15.107083320617676, "step": 462 }, { "epoch": 0.6999244142101285, "grad_norm": 32.04073715209961, "learning_rate": 1.2614681827718695e-07, "logits/chosen": 0.37092894315719604, "logits/rejected": 0.3595684766769409, "logps/chosen": -87.26763916015625, "logps/ref_chosen": -67.68511962890625, "logps/ref_rejected": -71.32196044921875, "logps/rejected": -100.9955062866211, "loss": 0.9124, "margin_dpo/margin_mean": 10.091035842895508, "margin_dpo/margin_std": 12.723739624023438, "step": 463 }, { "epoch": 0.7014361300075586, "grad_norm": 35.85340881347656, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 0.34320521354675293, "logits/rejected": 0.31048744916915894, "logps/chosen": -79.22843933105469, "logps/ref_chosen": -59.16564178466797, "logps/ref_rejected": -69.56146240234375, "logps/rejected": -101.13890838623047, "loss": 0.9691, "margin_dpo/margin_mean": 11.5146484375, "margin_dpo/margin_std": 15.515382766723633, "step": 464 }, { "epoch": 0.7029478458049887, "grad_norm": 34.352787017822266, "learning_rate": 1.238566782415197e-07, "logits/chosen": 0.44564637541770935, "logits/rejected": 0.3805859386920929, "logps/chosen": -79.09562683105469, "logps/ref_chosen": -58.513671875, "logps/ref_rejected": -84.31745910644531, "logps/rejected": -115.3167724609375, "loss": 1.0309, "margin_dpo/margin_mean": 10.417366981506348, "margin_dpo/margin_std": 15.731843948364258, "step": 465 }, { "epoch": 0.7044595616024187, "grad_norm": 42.33492660522461, "learning_rate": 1.2271688498291334e-07, "logits/chosen": 0.37938642501831055, "logits/rejected": 0.37962764501571655, "logps/chosen": -95.0290756225586, "logps/ref_chosen": -73.26580810546875, "logps/ref_rejected": -74.83621215820312, "logps/rejected": -103.54035949707031, "loss": 1.1424, "margin_dpo/margin_mean": 6.940883636474609, "margin_dpo/margin_std": 13.032196998596191, "step": 466 }, { "epoch": 0.7059712773998488, "grad_norm": 27.393531799316406, "learning_rate": 1.2158065210664848e-07, "logits/chosen": 0.41466957330703735, "logits/rejected": 0.2669060230255127, "logps/chosen": -67.52658081054688, "logps/ref_chosen": -47.57947540283203, "logps/ref_rejected": -78.68522644042969, "logps/rejected": -110.11405181884766, "loss": 0.8437, "margin_dpo/margin_mean": 11.481725692749023, "margin_dpo/margin_std": 13.719661712646484, "step": 467 }, { "epoch": 0.7074829931972789, "grad_norm": 30.220720291137695, "learning_rate": 1.204480113956011e-07, "logits/chosen": 0.3812910318374634, "logits/rejected": 0.3669815957546234, "logps/chosen": -80.45675659179688, "logps/ref_chosen": -63.92778778076172, "logps/ref_rejected": -76.51626586914062, "logps/rejected": -109.25390625, "loss": 0.6942, "margin_dpo/margin_mean": 16.208662033081055, "margin_dpo/margin_std": 16.242408752441406, "step": 468 }, { "epoch": 0.708994708994709, "grad_norm": 28.700984954833984, "learning_rate": 1.1931899453216697e-07, "logits/chosen": 0.45047247409820557, "logits/rejected": 0.43260282278060913, "logps/chosen": -76.3576889038086, "logps/ref_chosen": -59.05818176269531, "logps/ref_rejected": -75.67672729492188, "logps/rejected": -105.44158172607422, "loss": 0.8573, "margin_dpo/margin_mean": 12.465351104736328, "margin_dpo/margin_std": 14.824824333190918, "step": 469 }, { "epoch": 0.7105064247921391, "grad_norm": 27.37070083618164, "learning_rate": 1.1819363309737438e-07, "logits/chosen": 0.38742589950561523, "logits/rejected": 0.31535792350769043, "logps/chosen": -67.01020812988281, "logps/ref_chosen": -47.86743927001953, "logps/ref_rejected": -65.96858978271484, "logps/rejected": -96.77906036376953, "loss": 0.9114, "margin_dpo/margin_mean": 11.667705535888672, "margin_dpo/margin_std": 14.78006362915039, "step": 470 }, { "epoch": 0.7120181405895691, "grad_norm": 31.069612503051758, "learning_rate": 1.1707195857000215e-07, "logits/chosen": 0.38362210988998413, "logits/rejected": 0.31821680068969727, "logps/chosen": -74.50727081298828, "logps/ref_chosen": -57.77785110473633, "logps/ref_rejected": -73.81172180175781, "logps/rejected": -104.50235748291016, "loss": 0.8414, "margin_dpo/margin_mean": 13.961222648620605, "margin_dpo/margin_std": 16.028059005737305, "step": 471 }, { "epoch": 0.7135298563869993, "grad_norm": 32.531490325927734, "learning_rate": 1.1595400232569768e-07, "logits/chosen": 0.40967637300491333, "logits/rejected": 0.3588043749332428, "logps/chosen": -72.53768157958984, "logps/ref_chosen": -55.908668518066406, "logps/ref_rejected": -74.70294189453125, "logps/rejected": -103.69931030273438, "loss": 0.9363, "margin_dpo/margin_mean": 12.367351531982422, "margin_dpo/margin_std": 16.488346099853516, "step": 472 }, { "epoch": 0.7150415721844293, "grad_norm": 32.29591369628906, "learning_rate": 1.1483979563610069e-07, "logits/chosen": 0.46595680713653564, "logits/rejected": 0.356813907623291, "logps/chosen": -70.86893463134766, "logps/ref_chosen": -54.16088104248047, "logps/ref_rejected": -92.76789855957031, "logps/rejected": -121.89593505859375, "loss": 1.0278, "margin_dpo/margin_mean": 12.419994354248047, "margin_dpo/margin_std": 18.459060668945312, "step": 473 }, { "epoch": 0.7165532879818595, "grad_norm": 38.99330520629883, "learning_rate": 1.1372936966796709e-07, "logits/chosen": 0.4462272524833679, "logits/rejected": 0.37020203471183777, "logps/chosen": -66.63814544677734, "logps/ref_chosen": -46.685707092285156, "logps/ref_rejected": -71.44731140136719, "logps/rejected": -101.14683532714844, "loss": 1.1052, "margin_dpo/margin_mean": 9.747077941894531, "margin_dpo/margin_std": 16.308094024658203, "step": 474 }, { "epoch": 0.7180650037792895, "grad_norm": 27.097074508666992, "learning_rate": 1.126227554822985e-07, "logits/chosen": 0.36055099964141846, "logits/rejected": 0.30357635021209717, "logps/chosen": -77.57850646972656, "logps/ref_chosen": -58.4873046875, "logps/ref_rejected": -87.00187683105469, "logps/rejected": -122.87249755859375, "loss": 0.6529, "margin_dpo/margin_mean": 16.779415130615234, "margin_dpo/margin_std": 15.916208267211914, "step": 475 }, { "epoch": 0.7195767195767195, "grad_norm": 43.0082893371582, "learning_rate": 1.1151998403347243e-07, "logits/chosen": 0.3008124828338623, "logits/rejected": 0.288544237613678, "logps/chosen": -97.41394805908203, "logps/ref_chosen": -75.38162231445312, "logps/ref_rejected": -76.99822235107422, "logps/rejected": -109.19664764404297, "loss": 1.0678, "margin_dpo/margin_mean": 10.166099548339844, "margin_dpo/margin_std": 16.556289672851562, "step": 476 }, { "epoch": 0.7210884353741497, "grad_norm": 40.80556869506836, "learning_rate": 1.1042108616837692e-07, "logits/chosen": 0.41148385405540466, "logits/rejected": 0.36269134283065796, "logps/chosen": -82.53071594238281, "logps/ref_chosen": -61.073387145996094, "logps/ref_rejected": -81.34375, "logps/rejected": -114.1594009399414, "loss": 1.0709, "margin_dpo/margin_mean": 11.35832691192627, "margin_dpo/margin_std": 17.43365478515625, "step": 477 }, { "epoch": 0.7226001511715797, "grad_norm": 40.633033752441406, "learning_rate": 1.0932609262554746e-07, "logits/chosen": 0.3245035409927368, "logits/rejected": 0.3233986496925354, "logps/chosen": -75.4658432006836, "logps/ref_chosen": -57.16731643676758, "logps/ref_rejected": -53.309181213378906, "logps/rejected": -81.65347290039062, "loss": 1.1221, "margin_dpo/margin_mean": 10.045768737792969, "margin_dpo/margin_std": 17.326122283935547, "step": 478 }, { "epoch": 0.7241118669690099, "grad_norm": 30.868711471557617, "learning_rate": 1.0823503403430734e-07, "logits/chosen": 0.30822062492370605, "logits/rejected": 0.25296294689178467, "logps/chosen": -79.10305786132812, "logps/ref_chosen": -58.91331481933594, "logps/ref_rejected": -63.7403450012207, "logps/rejected": -93.48685455322266, "loss": 1.0759, "margin_dpo/margin_mean": 9.556758880615234, "margin_dpo/margin_std": 15.833057403564453, "step": 479 }, { "epoch": 0.7256235827664399, "grad_norm": 36.491539001464844, "learning_rate": 1.0714794091391072e-07, "logits/chosen": 0.31532788276672363, "logits/rejected": 0.29993510246276855, "logps/chosen": -81.86836242675781, "logps/ref_chosen": -62.80060577392578, "logps/ref_rejected": -67.58859252929688, "logps/rejected": -97.8862075805664, "loss": 1.0544, "margin_dpo/margin_mean": 11.229857444763184, "margin_dpo/margin_std": 16.638090133666992, "step": 480 }, { "epoch": 0.72713529856387, "grad_norm": 30.423425674438477, "learning_rate": 1.0606484367268906e-07, "logits/chosen": 0.30370086431503296, "logits/rejected": 0.29490453004837036, "logps/chosen": -83.44210815429688, "logps/ref_chosen": -65.28649139404297, "logps/ref_rejected": -70.78668212890625, "logps/rejected": -101.75835418701172, "loss": 0.9433, "margin_dpo/margin_mean": 12.816054344177246, "margin_dpo/margin_std": 17.707691192626953, "step": 481 }, { "epoch": 0.7286470143613001, "grad_norm": 43.65763473510742, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 0.3178885281085968, "logits/rejected": 0.1772965043783188, "logps/chosen": -81.17154693603516, "logps/ref_chosen": -60.906185150146484, "logps/ref_rejected": -103.44656372070312, "logps/rejected": -134.303955078125, "loss": 1.113, "margin_dpo/margin_mean": 10.592021942138672, "margin_dpo/margin_std": 18.282638549804688, "step": 482 }, { "epoch": 0.7301587301587301, "grad_norm": 32.36988067626953, "learning_rate": 1.0391075790138232e-07, "logits/chosen": 0.44268304109573364, "logits/rejected": 0.33734846115112305, "logps/chosen": -71.24640655517578, "logps/ref_chosen": -53.192012786865234, "logps/ref_rejected": -81.83927154541016, "logps/rejected": -112.96994018554688, "loss": 0.905, "margin_dpo/margin_mean": 13.076276779174805, "margin_dpo/margin_std": 16.73548126220703, "step": 483 }, { "epoch": 0.7316704459561603, "grad_norm": 34.262271881103516, "learning_rate": 1.0283982962570681e-07, "logits/chosen": 0.4280846118927002, "logits/rejected": 0.3853263854980469, "logps/chosen": -76.50611114501953, "logps/ref_chosen": -57.76945877075195, "logps/ref_rejected": -71.6829833984375, "logps/rejected": -100.66574096679688, "loss": 0.8924, "margin_dpo/margin_mean": 10.246103286743164, "margin_dpo/margin_std": 12.448812484741211, "step": 484 }, { "epoch": 0.7331821617535903, "grad_norm": 30.331209182739258, "learning_rate": 1.0177301773633992e-07, "logits/chosen": 0.40210530161857605, "logits/rejected": 0.3721107542514801, "logps/chosen": -73.7468032836914, "logps/ref_chosen": -56.63584899902344, "logps/ref_rejected": -70.85614013671875, "logps/rejected": -99.84536743164062, "loss": 0.921, "margin_dpo/margin_mean": 11.878273010253906, "margin_dpo/margin_std": 15.501483917236328, "step": 485 }, { "epoch": 0.7346938775510204, "grad_norm": 33.630470275878906, "learning_rate": 1.007103520743035e-07, "logits/chosen": 0.41097620129585266, "logits/rejected": 0.2941015958786011, "logps/chosen": -78.46426391601562, "logps/ref_chosen": -56.347023010253906, "logps/ref_rejected": -85.97221374511719, "logps/rejected": -119.07996368408203, "loss": 1.0505, "margin_dpo/margin_mean": 10.990509986877441, "margin_dpo/margin_std": 18.080223083496094, "step": 486 }, { "epoch": 0.7362055933484505, "grad_norm": 33.40716552734375, "learning_rate": 9.965186236464046e-08, "logits/chosen": 0.45882779359817505, "logits/rejected": 0.39584046602249146, "logps/chosen": -80.12818908691406, "logps/ref_chosen": -60.617218017578125, "logps/ref_rejected": -82.5097427368164, "logps/rejected": -113.12313079833984, "loss": 0.9295, "margin_dpo/margin_mean": 11.102409362792969, "margin_dpo/margin_std": 15.39416217803955, "step": 487 }, { "epoch": 0.7377173091458806, "grad_norm": 32.00619888305664, "learning_rate": 9.859757821558337e-08, "logits/chosen": 0.3710404634475708, "logits/rejected": 0.2971905469894409, "logps/chosen": -81.0091781616211, "logps/ref_chosen": -63.10905456542969, "logps/ref_rejected": -82.49348449707031, "logps/rejected": -114.04411315917969, "loss": 0.796, "margin_dpo/margin_mean": 13.650504112243652, "margin_dpo/margin_std": 15.383604049682617, "step": 488 }, { "epoch": 0.7392290249433107, "grad_norm": 35.907958984375, "learning_rate": 9.754752911772615e-08, "logits/chosen": 0.3739784359931946, "logits/rejected": 0.327509343624115, "logps/chosen": -84.74853515625, "logps/ref_chosen": -64.98896026611328, "logps/ref_rejected": -84.39607238769531, "logps/rejected": -111.33993530273438, "loss": 1.1867, "margin_dpo/margin_mean": 7.184290885925293, "margin_dpo/margin_std": 14.307548522949219, "step": 489 }, { "epoch": 0.7407407407407407, "grad_norm": 42.15243148803711, "learning_rate": 9.650174444319956e-08, "logits/chosen": 0.4511192739009857, "logits/rejected": 0.4217193126678467, "logps/chosen": -80.63301086425781, "logps/ref_chosen": -61.90874481201172, "logps/ref_rejected": -70.58566284179688, "logps/rejected": -99.12336730957031, "loss": 1.219, "margin_dpo/margin_mean": 9.813451766967773, "margin_dpo/margin_std": 18.48382568359375, "step": 490 }, { "epoch": 0.7422524565381708, "grad_norm": 30.005891799926758, "learning_rate": 9.546025344484868e-08, "logits/chosen": 0.2991289794445038, "logits/rejected": 0.24064157903194427, "logps/chosen": -73.79498291015625, "logps/ref_chosen": -55.47570037841797, "logps/ref_rejected": -78.70318603515625, "logps/rejected": -107.55183410644531, "loss": 0.93, "margin_dpo/margin_mean": 10.529365539550781, "margin_dpo/margin_std": 13.245063781738281, "step": 491 }, { "epoch": 0.7437641723356009, "grad_norm": 40.63232421875, "learning_rate": 9.442308525541589e-08, "logits/chosen": 0.32890427112579346, "logits/rejected": 0.25535351037979126, "logps/chosen": -90.1016616821289, "logps/ref_chosen": -67.28638458251953, "logps/ref_rejected": -82.78628540039062, "logps/rejected": -115.23426055908203, "loss": 1.1686, "margin_dpo/margin_mean": 9.632694244384766, "margin_dpo/margin_std": 16.95614242553711, "step": 492 }, { "epoch": 0.745275888133031, "grad_norm": 27.453155517578125, "learning_rate": 9.339026888672468e-08, "logits/chosen": 0.35031116008758545, "logits/rejected": 0.266485333442688, "logps/chosen": -74.11001586914062, "logps/ref_chosen": -55.92750549316406, "logps/ref_rejected": -79.12149810791016, "logps/rejected": -110.38975524902344, "loss": 0.8313, "margin_dpo/margin_mean": 13.085746765136719, "margin_dpo/margin_std": 15.100048065185547, "step": 493 }, { "epoch": 0.7467876039304611, "grad_norm": 41.737037658691406, "learning_rate": 9.236183322886945e-08, "logits/chosen": 0.2632497251033783, "logits/rejected": 0.20675988495349884, "logps/chosen": -86.74630737304688, "logps/ref_chosen": -67.95411682128906, "logps/ref_rejected": -90.50865936279297, "logps/rejected": -118.86773681640625, "loss": 1.1456, "margin_dpo/margin_mean": 9.566875457763672, "margin_dpo/margin_std": 17.215763092041016, "step": 494 }, { "epoch": 0.7482993197278912, "grad_norm": 33.81610870361328, "learning_rate": 9.133780704940594e-08, "logits/chosen": 0.4413723349571228, "logits/rejected": 0.37061169743537903, "logps/chosen": -70.81031036376953, "logps/ref_chosen": -52.625465393066406, "logps/ref_rejected": -72.06781005859375, "logps/rejected": -98.15983581542969, "loss": 1.1075, "margin_dpo/margin_mean": 7.907181262969971, "margin_dpo/margin_std": 14.21833324432373, "step": 495 }, { "epoch": 0.7498110355253212, "grad_norm": 34.30609130859375, "learning_rate": 9.031821899254797e-08, "logits/chosen": 0.39603427052497864, "logits/rejected": 0.28451234102249146, "logps/chosen": -77.51828002929688, "logps/ref_chosen": -57.597328186035156, "logps/ref_rejected": -94.36127471923828, "logps/rejected": -124.71854400634766, "loss": 1.0383, "margin_dpo/margin_mean": 10.436321258544922, "margin_dpo/margin_std": 17.503387451171875, "step": 496 }, { "epoch": 0.7513227513227513, "grad_norm": 33.988346099853516, "learning_rate": 8.930309757836516e-08, "logits/chosen": 0.41119682788848877, "logits/rejected": 0.37434881925582886, "logps/chosen": -92.74378967285156, "logps/ref_chosen": -72.78994750976562, "logps/ref_rejected": -89.48483276367188, "logps/rejected": -123.64321899414062, "loss": 0.844, "margin_dpo/margin_mean": 14.204545974731445, "margin_dpo/margin_std": 17.2962703704834, "step": 497 }, { "epoch": 0.7528344671201814, "grad_norm": 36.42383575439453, "learning_rate": 8.829247120198563e-08, "logits/chosen": 0.3917444944381714, "logits/rejected": 0.35288551449775696, "logps/chosen": -87.1715316772461, "logps/ref_chosen": -68.36572265625, "logps/ref_rejected": -71.28846740722656, "logps/rejected": -102.5860824584961, "loss": 0.8655, "margin_dpo/margin_mean": 12.491800308227539, "margin_dpo/margin_std": 15.737793922424316, "step": 498 }, { "epoch": 0.7543461829176115, "grad_norm": 34.57160568237305, "learning_rate": 8.728636813280163e-08, "logits/chosen": 0.36719441413879395, "logits/rejected": 0.29699230194091797, "logps/chosen": -79.09321594238281, "logps/ref_chosen": -61.90882873535156, "logps/ref_rejected": -91.9411392211914, "logps/rejected": -120.89878845214844, "loss": 1.0196, "margin_dpo/margin_mean": 11.773270606994629, "margin_dpo/margin_std": 17.18640899658203, "step": 499 }, { "epoch": 0.7558578987150416, "grad_norm": 35.90029525756836, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.30776458978652954, "logits/rejected": 0.2964794933795929, "logps/chosen": -89.29718017578125, "logps/ref_chosen": -70.225830078125, "logps/ref_rejected": -71.72203063964844, "logps/rejected": -101.60995483398438, "loss": 1.0037, "margin_dpo/margin_mean": 10.816570281982422, "margin_dpo/margin_std": 15.775973320007324, "step": 500 }, { "epoch": 0.7558578987150416, "eval_logits/chosen": 0.3937297463417053, "eval_logits/rejected": 0.3419411778450012, "eval_logps/chosen": -92.17249298095703, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -106.36981964111328, "eval_loss": 0.527696967124939, "eval_margin_dpo/margin_mean": 9.507804870605469, "eval_margin_dpo/margin_std": 15.06718635559082, "eval_runtime": 38.6653, "eval_samples_per_second": 59.562, "eval_steps_per_second": 1.862, "step": 500 }, { "epoch": 0.7573696145124716, "grad_norm": 26.93088722229004, "learning_rate": 8.528784436016878e-08, "logits/chosen": 0.359049528837204, "logits/rejected": 0.34849053621292114, "logps/chosen": -83.36741638183594, "logps/ref_chosen": -64.59880828857422, "logps/ref_rejected": -70.59329223632812, "logps/rejected": -99.85511779785156, "loss": 0.8434, "margin_dpo/margin_mean": 10.493215560913086, "margin_dpo/margin_std": 11.587947845458984, "step": 501 }, { "epoch": 0.7588813303099018, "grad_norm": 35.08143615722656, "learning_rate": 8.4295479559726e-08, "logits/chosen": 0.3733653128147125, "logits/rejected": 0.3248666822910309, "logps/chosen": -83.78016662597656, "logps/ref_chosen": -65.46662902832031, "logps/ref_rejected": -90.22233581542969, "logps/rejected": -118.43475341796875, "loss": 0.9262, "margin_dpo/margin_mean": 9.898885726928711, "margin_dpo/margin_std": 13.525036811828613, "step": 502 }, { "epoch": 0.7603930461073318, "grad_norm": 33.57307434082031, "learning_rate": 8.330774987092712e-08, "logits/chosen": 0.37700212001800537, "logits/rejected": 0.36113613843917847, "logps/chosen": -68.490966796875, "logps/ref_chosen": -51.83476257324219, "logps/ref_rejected": -57.62522506713867, "logps/rejected": -84.85618591308594, "loss": 1.0233, "margin_dpo/margin_mean": 10.574769020080566, "margin_dpo/margin_std": 15.259387969970703, "step": 503 }, { "epoch": 0.7619047619047619, "grad_norm": 32.33924865722656, "learning_rate": 8.232468292269479e-08, "logits/chosen": 0.33015936613082886, "logits/rejected": 0.3007563352584839, "logps/chosen": -85.83465576171875, "logps/ref_chosen": -68.65119934082031, "logps/ref_rejected": -77.91394805908203, "logps/rejected": -109.01856994628906, "loss": 0.71, "margin_dpo/margin_mean": 13.92115592956543, "margin_dpo/margin_std": 13.897747039794922, "step": 504 }, { "epoch": 0.763416477702192, "grad_norm": 38.17281723022461, "learning_rate": 8.134630621352483e-08, "logits/chosen": 0.3694714903831482, "logits/rejected": 0.3267279267311096, "logps/chosen": -78.07418823242188, "logps/ref_chosen": -59.99884796142578, "logps/ref_rejected": -76.88047790527344, "logps/rejected": -104.90641784667969, "loss": 1.0749, "margin_dpo/margin_mean": 9.950593948364258, "margin_dpo/margin_std": 16.37490463256836, "step": 505 }, { "epoch": 0.764928193499622, "grad_norm": 37.14749526977539, "learning_rate": 8.037264711071698e-08, "logits/chosen": 0.37326282262802124, "logits/rejected": 0.34700077772140503, "logps/chosen": -86.4708251953125, "logps/ref_chosen": -70.07130432128906, "logps/ref_rejected": -82.03775024414062, "logps/rejected": -108.28327941894531, "loss": 1.0496, "margin_dpo/margin_mean": 9.846002578735352, "margin_dpo/margin_std": 15.988225936889648, "step": 506 }, { "epoch": 0.7664399092970522, "grad_norm": 32.942909240722656, "learning_rate": 7.940373284960933e-08, "logits/chosen": 0.3433038592338562, "logits/rejected": 0.2900928556919098, "logps/chosen": -90.55781555175781, "logps/ref_chosen": -72.00703430175781, "logps/ref_rejected": -93.94987487792969, "logps/rejected": -123.90901947021484, "loss": 0.9739, "margin_dpo/margin_mean": 11.408361434936523, "margin_dpo/margin_std": 16.2240047454834, "step": 507 }, { "epoch": 0.7679516250944822, "grad_norm": 29.779918670654297, "learning_rate": 7.843959053281663e-08, "logits/chosen": 0.32127851247787476, "logits/rejected": 0.19979628920555115, "logps/chosen": -77.34425354003906, "logps/ref_chosen": -60.21992492675781, "logps/ref_rejected": -95.9200668334961, "logps/rejected": -125.66604614257812, "loss": 0.8772, "margin_dpo/margin_mean": 12.621658325195312, "margin_dpo/margin_std": 15.715794563293457, "step": 508 }, { "epoch": 0.7694633408919124, "grad_norm": 39.186466217041016, "learning_rate": 7.748024712947204e-08, "logits/chosen": 0.30487918853759766, "logits/rejected": 0.27709323167800903, "logps/chosen": -84.89524841308594, "logps/ref_chosen": -66.2701644897461, "logps/ref_rejected": -71.73065185546875, "logps/rejected": -100.41943359375, "loss": 0.9939, "margin_dpo/margin_mean": 10.063714981079102, "margin_dpo/margin_std": 14.282079696655273, "step": 509 }, { "epoch": 0.7709750566893424, "grad_norm": 31.86794090270996, "learning_rate": 7.652572947447272e-08, "logits/chosen": 0.48108965158462524, "logits/rejected": 0.3837711811065674, "logps/chosen": -72.2618179321289, "logps/ref_chosen": -53.54487609863281, "logps/ref_rejected": -91.36649322509766, "logps/rejected": -121.51242065429688, "loss": 0.9319, "margin_dpo/margin_mean": 11.42898941040039, "margin_dpo/margin_std": 15.287019729614258, "step": 510 }, { "epoch": 0.7724867724867724, "grad_norm": 32.75205993652344, "learning_rate": 7.557606426772961e-08, "logits/chosen": 0.39279526472091675, "logits/rejected": 0.3349749445915222, "logps/chosen": -75.0731201171875, "logps/ref_chosen": -55.844390869140625, "logps/ref_rejected": -86.49819946289062, "logps/rejected": -120.20086669921875, "loss": 0.8105, "margin_dpo/margin_mean": 14.473926544189453, "margin_dpo/margin_std": 16.571292877197266, "step": 511 }, { "epoch": 0.7739984882842026, "grad_norm": 35.257179260253906, "learning_rate": 7.463127807341966e-08, "logits/chosen": 0.25258108973503113, "logits/rejected": 0.23254463076591492, "logps/chosen": -80.03028106689453, "logps/ref_chosen": -61.653038024902344, "logps/ref_rejected": -72.83148193359375, "logps/rejected": -102.50459289550781, "loss": 1.0745, "margin_dpo/margin_mean": 11.295858383178711, "margin_dpo/margin_std": 20.00664520263672, "step": 512 }, { "epoch": 0.7755102040816326, "grad_norm": 25.621583938598633, "learning_rate": 7.369139731924401e-08, "logits/chosen": 0.49317803978919983, "logits/rejected": 0.43911677598953247, "logps/chosen": -67.57891082763672, "logps/ref_chosen": -50.852561950683594, "logps/ref_rejected": -69.21754455566406, "logps/rejected": -97.71588134765625, "loss": 0.8412, "margin_dpo/margin_mean": 11.771990776062012, "margin_dpo/margin_std": 14.104642868041992, "step": 513 }, { "epoch": 0.7770219198790628, "grad_norm": 34.901641845703125, "learning_rate": 7.275644829568747e-08, "logits/chosen": 0.4055827260017395, "logits/rejected": 0.3611418604850769, "logps/chosen": -88.5246810913086, "logps/ref_chosen": -69.38493347167969, "logps/ref_rejected": -83.32447814941406, "logps/rejected": -113.60870361328125, "loss": 0.9427, "margin_dpo/margin_mean": 11.14448356628418, "margin_dpo/margin_std": 14.856443405151367, "step": 514 }, { "epoch": 0.7785336356764928, "grad_norm": 31.39341926574707, "learning_rate": 7.182645715528435e-08, "logits/chosen": 0.3923872411251068, "logits/rejected": 0.30827796459198, "logps/chosen": -74.15292358398438, "logps/ref_chosen": -53.687034606933594, "logps/ref_rejected": -83.59614562988281, "logps/rejected": -116.03274536132812, "loss": 0.8749, "margin_dpo/margin_mean": 11.970718383789062, "margin_dpo/margin_std": 14.525394439697266, "step": 515 }, { "epoch": 0.780045351473923, "grad_norm": 33.65007781982422, "learning_rate": 7.090144991188568e-08, "logits/chosen": 0.35884907841682434, "logits/rejected": 0.30560222268104553, "logps/chosen": -73.98365783691406, "logps/ref_chosen": -56.9017219543457, "logps/ref_rejected": -67.83477783203125, "logps/rejected": -94.897705078125, "loss": 0.9889, "margin_dpo/margin_mean": 9.980987548828125, "margin_dpo/margin_std": 14.448881149291992, "step": 516 }, { "epoch": 0.781557067271353, "grad_norm": 31.95818519592285, "learning_rate": 6.998145243993284e-08, "logits/chosen": 0.40695106983184814, "logits/rejected": 0.3971271812915802, "logps/chosen": -82.69999694824219, "logps/ref_chosen": -61.775142669677734, "logps/ref_rejected": -62.88270950317383, "logps/rejected": -94.34140014648438, "loss": 0.9838, "margin_dpo/margin_mean": 10.533843994140625, "margin_dpo/margin_std": 15.69892692565918, "step": 517 }, { "epoch": 0.783068783068783, "grad_norm": 30.663984298706055, "learning_rate": 6.906649047373245e-08, "logits/chosen": 0.37528085708618164, "logits/rejected": 0.32002222537994385, "logps/chosen": -79.28578186035156, "logps/ref_chosen": -62.025230407714844, "logps/ref_rejected": -79.06085205078125, "logps/rejected": -105.47918701171875, "loss": 1.0703, "margin_dpo/margin_mean": 9.15778923034668, "margin_dpo/margin_std": 15.571455001831055, "step": 518 }, { "epoch": 0.7845804988662132, "grad_norm": 44.80562973022461, "learning_rate": 6.815658960673781e-08, "logits/chosen": 0.4019656777381897, "logits/rejected": 0.34249573945999146, "logps/chosen": -82.74702453613281, "logps/ref_chosen": -61.60636901855469, "logps/ref_rejected": -74.50727844238281, "logps/rejected": -102.14393615722656, "loss": 1.4077, "margin_dpo/margin_mean": 6.495992660522461, "margin_dpo/margin_std": 17.53131103515625, "step": 519 }, { "epoch": 0.7860922146636432, "grad_norm": 32.09324264526367, "learning_rate": 6.725177529083209e-08, "logits/chosen": 0.4463905394077301, "logits/rejected": 0.38504666090011597, "logps/chosen": -80.70368194580078, "logps/ref_chosen": -62.87343215942383, "logps/ref_rejected": -76.505615234375, "logps/rejected": -104.06520080566406, "loss": 0.927, "margin_dpo/margin_mean": 9.729334831237793, "margin_dpo/margin_std": 12.515774726867676, "step": 520 }, { "epoch": 0.7876039304610734, "grad_norm": 34.64705276489258, "learning_rate": 6.63520728356167e-08, "logits/chosen": 0.2728666663169861, "logits/rejected": 0.19291679561138153, "logps/chosen": -82.63397216796875, "logps/ref_chosen": -64.20668029785156, "logps/ref_rejected": -92.28083038330078, "logps/rejected": -121.74046325683594, "loss": 0.8833, "margin_dpo/margin_mean": 11.032337188720703, "margin_dpo/margin_std": 13.831937789916992, "step": 521 }, { "epoch": 0.7891156462585034, "grad_norm": 36.75823974609375, "learning_rate": 6.545750740770336e-08, "logits/chosen": 0.3487527370452881, "logits/rejected": 0.33449989557266235, "logps/chosen": -75.84567260742188, "logps/ref_chosen": -58.36972427368164, "logps/ref_rejected": -68.79248046875, "logps/rejected": -96.63177490234375, "loss": 1.1404, "margin_dpo/margin_mean": 10.363338470458984, "margin_dpo/margin_std": 17.992464065551758, "step": 522 }, { "epoch": 0.7906273620559335, "grad_norm": 41.78493881225586, "learning_rate": 6.456810403001012e-08, "logits/chosen": 0.4005553722381592, "logits/rejected": 0.2822886109352112, "logps/chosen": -85.6712417602539, "logps/ref_chosen": -65.71324157714844, "logps/ref_rejected": -91.98896789550781, "logps/rejected": -124.408447265625, "loss": 0.9277, "margin_dpo/margin_mean": 12.461475372314453, "margin_dpo/margin_std": 15.705230712890625, "step": 523 }, { "epoch": 0.7921390778533636, "grad_norm": 36.14333724975586, "learning_rate": 6.368388758106134e-08, "logits/chosen": 0.2999478578567505, "logits/rejected": 0.2743346095085144, "logps/chosen": -92.46823120117188, "logps/ref_chosen": -76.35124969482422, "logps/ref_rejected": -89.96072387695312, "logps/rejected": -114.48500061035156, "loss": 1.0375, "margin_dpo/margin_mean": 8.407294273376465, "margin_dpo/margin_std": 13.35383129119873, "step": 524 }, { "epoch": 0.7936507936507936, "grad_norm": 35.24806213378906, "learning_rate": 6.280488279429185e-08, "logits/chosen": 0.20016852021217346, "logits/rejected": 0.1862761676311493, "logps/chosen": -93.02354431152344, "logps/ref_chosen": -75.49578857421875, "logps/ref_rejected": -84.04852294921875, "logps/rejected": -109.72943115234375, "loss": 1.0159, "margin_dpo/margin_mean": 8.153154373168945, "margin_dpo/margin_std": 12.372773170471191, "step": 525 }, { "epoch": 0.7951625094482238, "grad_norm": 34.34667205810547, "learning_rate": 6.193111425735515e-08, "logits/chosen": 0.3684360086917877, "logits/rejected": 0.30142414569854736, "logps/chosen": -80.24913024902344, "logps/ref_chosen": -61.29241943359375, "logps/ref_rejected": -82.47763061523438, "logps/rejected": -109.65870666503906, "loss": 1.0505, "margin_dpo/margin_mean": 8.224379539489746, "margin_dpo/margin_std": 12.520376205444336, "step": 526 }, { "epoch": 0.7966742252456538, "grad_norm": 41.396583557128906, "learning_rate": 6.106260641143546e-08, "logits/chosen": 0.45549625158309937, "logits/rejected": 0.38120344281196594, "logps/chosen": -81.03947448730469, "logps/ref_chosen": -61.47262954711914, "logps/ref_rejected": -90.52831268310547, "logps/rejected": -117.31925964355469, "loss": 1.172, "margin_dpo/margin_mean": 7.224104881286621, "margin_dpo/margin_std": 14.417116165161133, "step": 527 }, { "epoch": 0.7981859410430839, "grad_norm": 36.54910659790039, "learning_rate": 6.019938355056422e-08, "logits/chosen": 0.26845669746398926, "logits/rejected": 0.18720799684524536, "logps/chosen": -77.49993896484375, "logps/ref_chosen": -58.792015075683594, "logps/ref_rejected": -71.82516479492188, "logps/rejected": -97.82785034179688, "loss": 1.2063, "margin_dpo/margin_mean": 7.294772148132324, "margin_dpo/margin_std": 14.19528579711914, "step": 528 }, { "epoch": 0.799697656840514, "grad_norm": 25.931058883666992, "learning_rate": 5.934146982094049e-08, "logits/chosen": 0.2855815887451172, "logits/rejected": 0.22274532914161682, "logps/chosen": -72.43391418457031, "logps/ref_chosen": -55.070960998535156, "logps/ref_rejected": -75.44007110595703, "logps/rejected": -108.48625946044922, "loss": 0.7267, "margin_dpo/margin_mean": 15.683242797851562, "margin_dpo/margin_std": 15.202524185180664, "step": 529 }, { "epoch": 0.8012093726379441, "grad_norm": 28.412240982055664, "learning_rate": 5.848888922025552e-08, "logits/chosen": 0.3692162334918976, "logits/rejected": 0.32501786947250366, "logps/chosen": -74.69463348388672, "logps/ref_chosen": -56.743812561035156, "logps/ref_rejected": -76.6692123413086, "logps/rejected": -104.58545684814453, "loss": 0.9058, "margin_dpo/margin_mean": 9.965425491333008, "margin_dpo/margin_std": 12.44455337524414, "step": 530 }, { "epoch": 0.8027210884353742, "grad_norm": 35.59648132324219, "learning_rate": 5.7641665597021435e-08, "logits/chosen": 0.3428173065185547, "logits/rejected": 0.2750357389450073, "logps/chosen": -69.8387680053711, "logps/ref_chosen": -51.116458892822266, "logps/ref_rejected": -79.52884674072266, "logps/rejected": -106.50172424316406, "loss": 1.084, "margin_dpo/margin_mean": 8.250574111938477, "margin_dpo/margin_std": 13.591650009155273, "step": 531 }, { "epoch": 0.8042328042328042, "grad_norm": 30.03083610534668, "learning_rate": 5.679982264990424e-08, "logits/chosen": 0.28468602895736694, "logits/rejected": 0.22941389679908752, "logps/chosen": -78.22216796875, "logps/ref_chosen": -58.279945373535156, "logps/ref_rejected": -78.05426788330078, "logps/rejected": -108.97996520996094, "loss": 0.9567, "margin_dpo/margin_mean": 10.983474731445312, "margin_dpo/margin_std": 15.372318267822266, "step": 532 }, { "epoch": 0.8057445200302343, "grad_norm": 30.32957649230957, "learning_rate": 5.596338392706076e-08, "logits/chosen": 0.4587961733341217, "logits/rejected": 0.39391863346099854, "logps/chosen": -71.87660217285156, "logps/ref_chosen": -56.41801452636719, "logps/ref_rejected": -73.89324951171875, "logps/rejected": -99.58336639404297, "loss": 0.9828, "margin_dpo/margin_mean": 10.23153305053711, "margin_dpo/margin_std": 14.516399383544922, "step": 533 }, { "epoch": 0.8072562358276644, "grad_norm": 32.21763229370117, "learning_rate": 5.513237282548033e-08, "logits/chosen": 0.35069578886032104, "logits/rejected": 0.3038621246814728, "logps/chosen": -77.76612091064453, "logps/ref_chosen": -60.748687744140625, "logps/ref_rejected": -73.8623046875, "logps/rejected": -100.79328155517578, "loss": 1.0124, "margin_dpo/margin_mean": 9.913549423217773, "margin_dpo/margin_std": 14.57766342163086, "step": 534 }, { "epoch": 0.8087679516250945, "grad_norm": 32.92207336425781, "learning_rate": 5.430681259032957e-08, "logits/chosen": 0.25479215383529663, "logits/rejected": 0.18896648287773132, "logps/chosen": -81.41650390625, "logps/ref_chosen": -61.637413024902344, "logps/ref_rejected": -80.93138122558594, "logps/rejected": -110.5506591796875, "loss": 1.0647, "margin_dpo/margin_mean": 9.84018325805664, "margin_dpo/margin_std": 16.032939910888672, "step": 535 }, { "epoch": 0.8102796674225246, "grad_norm": 27.200687408447266, "learning_rate": 5.3486726314303175e-08, "logits/chosen": 0.39205577969551086, "logits/rejected": 0.303525447845459, "logps/chosen": -69.04838562011719, "logps/ref_chosen": -51.888973236083984, "logps/ref_rejected": -73.34864044189453, "logps/rejected": -102.1460189819336, "loss": 0.7897, "margin_dpo/margin_mean": 11.637961387634277, "margin_dpo/margin_std": 12.476455688476562, "step": 536 }, { "epoch": 0.8117913832199547, "grad_norm": 30.757015228271484, "learning_rate": 5.267213693697695e-08, "logits/chosen": 0.43932461738586426, "logits/rejected": 0.3489525020122528, "logps/chosen": -73.9029769897461, "logps/ref_chosen": -54.248619079589844, "logps/ref_rejected": -94.94343566894531, "logps/rejected": -124.87551879882812, "loss": 1.0542, "margin_dpo/margin_mean": 10.277728080749512, "margin_dpo/margin_std": 15.779983520507812, "step": 537 }, { "epoch": 0.8133030990173847, "grad_norm": 33.307640075683594, "learning_rate": 5.1863067244167144e-08, "logits/chosen": 0.3416905403137207, "logits/rejected": 0.3030615448951721, "logps/chosen": -89.13497924804688, "logps/ref_chosen": -70.09354400634766, "logps/ref_rejected": -79.49833679199219, "logps/rejected": -111.39437866210938, "loss": 0.8039, "margin_dpo/margin_mean": 12.854602813720703, "margin_dpo/margin_std": 14.327655792236328, "step": 538 }, { "epoch": 0.8148148148148148, "grad_norm": 32.713741302490234, "learning_rate": 5.105953986729195e-08, "logits/chosen": 0.336628258228302, "logits/rejected": 0.2574174106121063, "logps/chosen": -80.40707397460938, "logps/ref_chosen": -61.93169403076172, "logps/ref_rejected": -84.08946228027344, "logps/rejected": -113.2166748046875, "loss": 0.8848, "margin_dpo/margin_mean": 10.651832580566406, "margin_dpo/margin_std": 13.286481857299805, "step": 539 }, { "epoch": 0.8163265306122449, "grad_norm": 31.55617332458496, "learning_rate": 5.026157728273966e-08, "logits/chosen": 0.38173389434814453, "logits/rejected": 0.28644853830337524, "logps/chosen": -80.31228637695312, "logps/ref_chosen": -62.70425033569336, "logps/ref_rejected": -95.63597106933594, "logps/rejected": -126.89846801757812, "loss": 0.7723, "margin_dpo/margin_mean": 13.654460906982422, "margin_dpo/margin_std": 14.235689163208008, "step": 540 }, { "epoch": 0.817838246409675, "grad_norm": 32.464759826660156, "learning_rate": 4.9469201811239035e-08, "logits/chosen": 0.3611023426055908, "logits/rejected": 0.36431825160980225, "logps/chosen": -79.27033996582031, "logps/ref_chosen": -62.48084259033203, "logps/ref_rejected": -57.55541229248047, "logps/rejected": -86.85520935058594, "loss": 0.8684, "margin_dpo/margin_mean": 12.510297775268555, "margin_dpo/margin_std": 14.803916931152344, "step": 541 }, { "epoch": 0.8193499622071051, "grad_norm": 27.37916374206543, "learning_rate": 4.868243561723534e-08, "logits/chosen": 0.41390740871429443, "logits/rejected": 0.35542088747024536, "logps/chosen": -64.57349395751953, "logps/ref_chosen": -49.454891204833984, "logps/ref_rejected": -65.33275604248047, "logps/rejected": -93.30146789550781, "loss": 0.8766, "margin_dpo/margin_mean": 12.850105285644531, "margin_dpo/margin_std": 15.956599235534668, "step": 542 }, { "epoch": 0.8208616780045351, "grad_norm": 25.401742935180664, "learning_rate": 4.790130070827028e-08, "logits/chosen": 0.3546418845653534, "logits/rejected": 0.26434922218322754, "logps/chosen": -68.28006744384766, "logps/ref_chosen": -51.10085678100586, "logps/ref_rejected": -76.06130981445312, "logps/rejected": -104.24705505371094, "loss": 0.9126, "margin_dpo/margin_mean": 11.006534576416016, "margin_dpo/margin_std": 14.493851661682129, "step": 543 }, { "epoch": 0.8223733938019653, "grad_norm": 26.264808654785156, "learning_rate": 4.7125818934366454e-08, "logits/chosen": 0.3526231348514557, "logits/rejected": 0.2759806513786316, "logps/chosen": -77.70940399169922, "logps/ref_chosen": -60.2772331237793, "logps/ref_rejected": -88.40553283691406, "logps/rejected": -120.80406188964844, "loss": 0.8164, "margin_dpo/margin_mean": 14.96635627746582, "margin_dpo/margin_std": 17.501577377319336, "step": 544 }, { "epoch": 0.8238851095993953, "grad_norm": 34.545066833496094, "learning_rate": 4.635601198741607e-08, "logits/chosen": 0.31289103627204895, "logits/rejected": 0.2516869604587555, "logps/chosen": -80.18997955322266, "logps/ref_chosen": -61.61524963378906, "logps/ref_rejected": -78.71266174316406, "logps/rejected": -107.26792907714844, "loss": 1.0111, "margin_dpo/margin_mean": 9.980533599853516, "margin_dpo/margin_std": 15.094192504882812, "step": 545 }, { "epoch": 0.8253968253968254, "grad_norm": 36.85138702392578, "learning_rate": 4.559190140057428e-08, "logits/chosen": 0.41092002391815186, "logits/rejected": 0.3929000794887543, "logps/chosen": -77.4332504272461, "logps/ref_chosen": -59.313262939453125, "logps/ref_rejected": -64.73631286621094, "logps/rejected": -92.99637603759766, "loss": 0.9744, "margin_dpo/margin_mean": 10.140082359313965, "margin_dpo/margin_std": 13.980596542358398, "step": 546 }, { "epoch": 0.8269085411942555, "grad_norm": 28.803686141967773, "learning_rate": 4.483350854765672e-08, "logits/chosen": 0.3148411214351654, "logits/rejected": 0.2440166473388672, "logps/chosen": -70.45671081542969, "logps/ref_chosen": -54.97674560546875, "logps/ref_rejected": -75.35922241210938, "logps/rejected": -102.77980041503906, "loss": 0.8617, "margin_dpo/margin_mean": 11.940618515014648, "margin_dpo/margin_std": 14.236207008361816, "step": 547 }, { "epoch": 0.8284202569916855, "grad_norm": 32.73234176635742, "learning_rate": 4.4080854642541826e-08, "logits/chosen": 0.2752930223941803, "logits/rejected": 0.20368722081184387, "logps/chosen": -82.19081115722656, "logps/ref_chosen": -63.21067810058594, "logps/ref_rejected": -81.23347473144531, "logps/rejected": -109.26484680175781, "loss": 1.0192, "margin_dpo/margin_mean": 9.05124282836914, "margin_dpo/margin_std": 14.42248249053955, "step": 548 }, { "epoch": 0.8299319727891157, "grad_norm": 38.426536560058594, "learning_rate": 4.333396073857723e-08, "logits/chosen": 0.4287411570549011, "logits/rejected": 0.36311283707618713, "logps/chosen": -81.94154357910156, "logps/ref_chosen": -64.27351379394531, "logps/ref_rejected": -92.31663513183594, "logps/rejected": -119.1673583984375, "loss": 1.0566, "margin_dpo/margin_mean": 9.182695388793945, "margin_dpo/margin_std": 14.17637825012207, "step": 549 }, { "epoch": 0.8314436885865457, "grad_norm": 30.786033630371094, "learning_rate": 4.259284772799099e-08, "logits/chosen": 0.39032119512557983, "logits/rejected": 0.35575926303863525, "logps/chosen": -75.25477600097656, "logps/ref_chosen": -56.230438232421875, "logps/ref_rejected": -62.59788513183594, "logps/rejected": -89.64376831054688, "loss": 1.0733, "margin_dpo/margin_mean": 8.02153205871582, "margin_dpo/margin_std": 13.34286117553711, "step": 550 }, { "epoch": 0.8329554043839759, "grad_norm": 30.042922973632812, "learning_rate": 4.1857536341307176e-08, "logits/chosen": 0.41295182704925537, "logits/rejected": 0.37483125925064087, "logps/chosen": -85.90719604492188, "logps/ref_chosen": -67.74720764160156, "logps/ref_rejected": -87.04285430908203, "logps/rejected": -114.77220916748047, "loss": 0.9587, "margin_dpo/margin_mean": 9.569366455078125, "margin_dpo/margin_std": 13.855855941772461, "step": 551 }, { "epoch": 0.8344671201814059, "grad_norm": 32.260215759277344, "learning_rate": 4.112804714676593e-08, "logits/chosen": 0.34446731209754944, "logits/rejected": 0.2876893877983093, "logps/chosen": -80.09131622314453, "logps/ref_chosen": -62.92626190185547, "logps/ref_rejected": -82.983642578125, "logps/rejected": -111.53181457519531, "loss": 0.869, "margin_dpo/margin_mean": 11.383108139038086, "margin_dpo/margin_std": 13.217905044555664, "step": 552 }, { "epoch": 0.8359788359788359, "grad_norm": 35.45354080200195, "learning_rate": 4.0404400549748144e-08, "logits/chosen": 0.33471113443374634, "logits/rejected": 0.22613003849983215, "logps/chosen": -76.17123413085938, "logps/ref_chosen": -56.038490295410156, "logps/ref_rejected": -84.48454284667969, "logps/rejected": -114.52032470703125, "loss": 1.1435, "margin_dpo/margin_mean": 9.903047561645508, "margin_dpo/margin_std": 18.003122329711914, "step": 553 }, { "epoch": 0.8374905517762661, "grad_norm": 36.615055084228516, "learning_rate": 3.968661679220467e-08, "logits/chosen": 0.3139854371547699, "logits/rejected": 0.2850970923900604, "logps/chosen": -83.22395324707031, "logps/ref_chosen": -64.53059387207031, "logps/ref_rejected": -71.21560668945312, "logps/rejected": -100.5650405883789, "loss": 0.9951, "margin_dpo/margin_mean": 10.656076431274414, "margin_dpo/margin_std": 15.991934776306152, "step": 554 }, { "epoch": 0.8390022675736961, "grad_norm": 36.042789459228516, "learning_rate": 3.89747159520904e-08, "logits/chosen": 0.3471308946609497, "logits/rejected": 0.3114258050918579, "logps/chosen": -87.32560729980469, "logps/ref_chosen": -66.65191650390625, "logps/ref_rejected": -68.6667251586914, "logps/rejected": -99.08800506591797, "loss": 1.0254, "margin_dpo/margin_mean": 9.74759292602539, "margin_dpo/margin_std": 13.942052841186523, "step": 555 }, { "epoch": 0.8405139833711263, "grad_norm": 32.9485969543457, "learning_rate": 3.826871794280192e-08, "logits/chosen": 0.38114964962005615, "logits/rejected": 0.3249025344848633, "logps/chosen": -73.74171447753906, "logps/ref_chosen": -52.832366943359375, "logps/ref_rejected": -64.49044036865234, "logps/rejected": -94.696533203125, "loss": 1.1283, "margin_dpo/margin_mean": 9.296748161315918, "margin_dpo/margin_std": 16.70484161376953, "step": 556 }, { "epoch": 0.8420256991685563, "grad_norm": 31.57485580444336, "learning_rate": 3.756864251262143e-08, "logits/chosen": 0.48908981680870056, "logits/rejected": 0.4027714729309082, "logps/chosen": -74.70928955078125, "logps/ref_chosen": -55.035980224609375, "logps/ref_rejected": -75.80644989013672, "logps/rejected": -107.88850402832031, "loss": 0.8341, "margin_dpo/margin_mean": 12.40875244140625, "margin_dpo/margin_std": 14.594932556152344, "step": 557 }, { "epoch": 0.8435374149659864, "grad_norm": 32.328819274902344, "learning_rate": 3.687450924416341e-08, "logits/chosen": 0.42616766691207886, "logits/rejected": 0.36569124460220337, "logps/chosen": -80.0509033203125, "logps/ref_chosen": -63.226348876953125, "logps/ref_rejected": -91.46881866455078, "logps/rejected": -120.33204650878906, "loss": 0.8683, "margin_dpo/margin_mean": 12.038671493530273, "margin_dpo/margin_std": 15.140554428100586, "step": 558 }, { "epoch": 0.8450491307634165, "grad_norm": 32.156707763671875, "learning_rate": 3.6186337553827743e-08, "logits/chosen": 0.3285496234893799, "logits/rejected": 0.2561686336994171, "logps/chosen": -80.550048828125, "logps/ref_chosen": -61.521644592285156, "logps/ref_rejected": -82.83859252929688, "logps/rejected": -113.366943359375, "loss": 0.9861, "margin_dpo/margin_mean": 11.49993896484375, "margin_dpo/margin_std": 16.700138092041016, "step": 559 }, { "epoch": 0.8465608465608465, "grad_norm": 32.59392166137695, "learning_rate": 3.550414669125573e-08, "logits/chosen": 0.36145317554473877, "logits/rejected": 0.318742573261261, "logps/chosen": -79.90272521972656, "logps/ref_chosen": -60.64122009277344, "logps/ref_rejected": -78.75474548339844, "logps/rejected": -108.12406921386719, "loss": 1.0135, "margin_dpo/margin_mean": 10.107817649841309, "margin_dpo/margin_std": 14.901373863220215, "step": 560 }, { "epoch": 0.8480725623582767, "grad_norm": 28.553300857543945, "learning_rate": 3.482795573879241e-08, "logits/chosen": 0.35189008712768555, "logits/rejected": 0.3156622648239136, "logps/chosen": -78.87603759765625, "logps/ref_chosen": -62.49860382080078, "logps/ref_rejected": -78.72064208984375, "logps/rejected": -106.23121643066406, "loss": 0.9285, "margin_dpo/margin_mean": 11.133148193359375, "margin_dpo/margin_std": 15.419248580932617, "step": 561 }, { "epoch": 0.8495842781557067, "grad_norm": 29.942720413208008, "learning_rate": 3.415778361095226e-08, "logits/chosen": 0.3795633912086487, "logits/rejected": 0.3341631591320038, "logps/chosen": -93.68280029296875, "logps/ref_chosen": -74.78173828125, "logps/ref_rejected": -92.63499450683594, "logps/rejected": -124.07745361328125, "loss": 0.8497, "margin_dpo/margin_mean": 12.541391372680664, "margin_dpo/margin_std": 14.615842819213867, "step": 562 }, { "epoch": 0.8510959939531368, "grad_norm": 34.505672454833984, "learning_rate": 3.349364905389032e-08, "logits/chosen": 0.4309301972389221, "logits/rejected": 0.373563289642334, "logps/chosen": -67.70144653320312, "logps/ref_chosen": -50.19850158691406, "logps/ref_rejected": -66.76687622070312, "logps/rejected": -95.26081848144531, "loss": 1.0219, "margin_dpo/margin_mean": 10.990997314453125, "margin_dpo/margin_std": 16.4418888092041, "step": 563 }, { "epoch": 0.8526077097505669, "grad_norm": 28.641733169555664, "learning_rate": 3.283557064487785e-08, "logits/chosen": 0.319988489151001, "logits/rejected": 0.284395694732666, "logps/chosen": -72.115478515625, "logps/ref_chosen": -55.7408447265625, "logps/ref_rejected": -74.8232421875, "logps/rejected": -104.1437759399414, "loss": 0.8346, "margin_dpo/margin_mean": 12.945907592773438, "margin_dpo/margin_std": 14.922258377075195, "step": 564 }, { "epoch": 0.854119425547997, "grad_norm": 35.68989944458008, "learning_rate": 3.218356679178252e-08, "logits/chosen": 0.40092119574546814, "logits/rejected": 0.34424033761024475, "logps/chosen": -79.51023864746094, "logps/ref_chosen": -58.33738327026367, "logps/ref_rejected": -78.31776428222656, "logps/rejected": -109.12625122070312, "loss": 1.0348, "margin_dpo/margin_mean": 9.63563346862793, "margin_dpo/margin_std": 14.547908782958984, "step": 565 }, { "epoch": 0.8556311413454271, "grad_norm": 40.661415100097656, "learning_rate": 3.1537655732553764e-08, "logits/chosen": 0.38034552335739136, "logits/rejected": 0.35432299971580505, "logps/chosen": -89.7762451171875, "logps/ref_chosen": -71.22373962402344, "logps/ref_rejected": -71.11601257324219, "logps/rejected": -99.64232635498047, "loss": 1.1097, "margin_dpo/margin_mean": 9.973814010620117, "margin_dpo/margin_std": 16.615154266357422, "step": 566 }, { "epoch": 0.8571428571428571, "grad_norm": 27.573196411132812, "learning_rate": 3.089785553471233e-08, "logits/chosen": 0.3880395293235779, "logits/rejected": 0.2916451096534729, "logps/chosen": -71.2606201171875, "logps/ref_chosen": -52.669273376464844, "logps/ref_rejected": -74.34785461425781, "logps/rejected": -104.5632553100586, "loss": 0.8334, "margin_dpo/margin_mean": 11.62405014038086, "margin_dpo/margin_std": 13.137116432189941, "step": 567 }, { "epoch": 0.8586545729402872, "grad_norm": 27.940860748291016, "learning_rate": 3.026418409484513e-08, "logits/chosen": 0.4033098816871643, "logits/rejected": 0.31375181674957275, "logps/chosen": -69.65806579589844, "logps/ref_chosen": -52.178001403808594, "logps/ref_rejected": -85.8277587890625, "logps/rejected": -116.09062194824219, "loss": 0.7742, "margin_dpo/margin_mean": 12.78278923034668, "margin_dpo/margin_std": 13.336824417114258, "step": 568 }, { "epoch": 0.8601662887377173, "grad_norm": 31.390438079833984, "learning_rate": 2.963665913810451e-08, "logits/chosen": 0.27315863966941833, "logits/rejected": 0.24073369801044464, "logps/chosen": -81.84532165527344, "logps/ref_chosen": -62.649253845214844, "logps/ref_rejected": -75.4298324584961, "logps/rejected": -104.8544921875, "loss": 0.9563, "margin_dpo/margin_mean": 10.228591918945312, "margin_dpo/margin_std": 13.502737998962402, "step": 569 }, { "epoch": 0.8616780045351474, "grad_norm": 26.328510284423828, "learning_rate": 2.9015298217712453e-08, "logits/chosen": 0.3102429509162903, "logits/rejected": 0.22179409861564636, "logps/chosen": -66.96505737304688, "logps/ref_chosen": -50.04179763793945, "logps/ref_rejected": -78.27146911621094, "logps/rejected": -109.94290161132812, "loss": 0.7166, "margin_dpo/margin_mean": 14.748177528381348, "margin_dpo/margin_std": 14.456729888916016, "step": 570 }, { "epoch": 0.8631897203325775, "grad_norm": 33.18550491333008, "learning_rate": 2.840011871446962e-08, "logits/chosen": 0.34274426102638245, "logits/rejected": 0.3042169213294983, "logps/chosen": -72.50775909423828, "logps/ref_chosen": -53.65681457519531, "logps/ref_rejected": -66.13298034667969, "logps/rejected": -93.20866394042969, "loss": 1.0793, "margin_dpo/margin_mean": 8.224736213684082, "margin_dpo/margin_std": 13.317925453186035, "step": 571 }, { "epoch": 0.8647014361300076, "grad_norm": 38.28977966308594, "learning_rate": 2.7791137836269158e-08, "logits/chosen": 0.38000768423080444, "logits/rejected": 0.4129163324832916, "logps/chosen": -93.44584655761719, "logps/ref_chosen": -74.81793212890625, "logps/ref_rejected": -65.88681030273438, "logps/rejected": -94.55876922607422, "loss": 0.9528, "margin_dpo/margin_mean": 10.044036865234375, "margin_dpo/margin_std": 13.885844230651855, "step": 572 }, { "epoch": 0.8662131519274376, "grad_norm": 39.81300735473633, "learning_rate": 2.718837261761528e-08, "logits/chosen": 0.35131800174713135, "logits/rejected": 0.30365169048309326, "logps/chosen": -89.20417785644531, "logps/ref_chosen": -68.72564697265625, "logps/ref_rejected": -88.16201782226562, "logps/rejected": -119.7973403930664, "loss": 1.0868, "margin_dpo/margin_mean": 11.156793594360352, "margin_dpo/margin_std": 18.632854461669922, "step": 573 }, { "epoch": 0.8677248677248677, "grad_norm": 27.587465286254883, "learning_rate": 2.659183991914696e-08, "logits/chosen": 0.4093438684940338, "logits/rejected": 0.3355673551559448, "logps/chosen": -75.17575073242188, "logps/ref_chosen": -56.31340026855469, "logps/ref_rejected": -83.91553497314453, "logps/rejected": -115.10252380371094, "loss": 0.763, "margin_dpo/margin_mean": 12.324638366699219, "margin_dpo/margin_std": 12.423215866088867, "step": 574 }, { "epoch": 0.8692365835222978, "grad_norm": 37.5849609375, "learning_rate": 2.600155642716606e-08, "logits/chosen": 0.4293455481529236, "logits/rejected": 0.34858548641204834, "logps/chosen": -82.68851470947266, "logps/ref_chosen": -64.5841293334961, "logps/ref_rejected": -93.47034454345703, "logps/rejected": -120.43289947509766, "loss": 1.1319, "margin_dpo/margin_mean": 8.858168601989746, "margin_dpo/margin_std": 15.557498931884766, "step": 575 }, { "epoch": 0.8707482993197279, "grad_norm": 33.85169219970703, "learning_rate": 2.5417538653170754e-08, "logits/chosen": 0.42376574873924255, "logits/rejected": 0.3154095411300659, "logps/chosen": -69.3154296875, "logps/ref_chosen": -53.28052520751953, "logps/ref_rejected": -84.20004272460938, "logps/rejected": -111.85845184326172, "loss": 0.8868, "margin_dpo/margin_mean": 11.62350082397461, "margin_dpo/margin_std": 14.447968482971191, "step": 576 }, { "epoch": 0.872260015117158, "grad_norm": 34.81472396850586, "learning_rate": 2.4839802933393607e-08, "logits/chosen": 0.3261626362800598, "logits/rejected": 0.303744912147522, "logps/chosen": -80.17160034179688, "logps/ref_chosen": -62.32469177246094, "logps/ref_rejected": -67.300537109375, "logps/rejected": -93.61477661132812, "loss": 1.1176, "margin_dpo/margin_mean": 8.467338562011719, "margin_dpo/margin_std": 15.134763717651367, "step": 577 }, { "epoch": 0.873771730914588, "grad_norm": 32.825374603271484, "learning_rate": 2.4268365428344733e-08, "logits/chosen": 0.42811548709869385, "logits/rejected": 0.39781272411346436, "logps/chosen": -74.832275390625, "logps/ref_chosen": -56.65557861328125, "logps/ref_rejected": -68.21835327148438, "logps/rejected": -94.33741760253906, "loss": 1.1023, "margin_dpo/margin_mean": 7.942363739013672, "margin_dpo/margin_std": 14.076078414916992, "step": 578 }, { "epoch": 0.8752834467120182, "grad_norm": 28.52338981628418, "learning_rate": 2.3703242122359357e-08, "logits/chosen": 0.30040955543518066, "logits/rejected": 0.26452547311782837, "logps/chosen": -75.03377532958984, "logps/ref_chosen": -56.809661865234375, "logps/ref_rejected": -68.09613037109375, "logps/rejected": -100.4896240234375, "loss": 0.7615, "margin_dpo/margin_mean": 14.169382095336914, "margin_dpo/margin_std": 15.500885009765625, "step": 579 }, { "epoch": 0.8767951625094482, "grad_norm": 33.52607345581055, "learning_rate": 2.3144448823151392e-08, "logits/chosen": 0.33850836753845215, "logits/rejected": 0.28215405344963074, "logps/chosen": -76.02555847167969, "logps/ref_chosen": -57.70011520385742, "logps/ref_rejected": -77.90664672851562, "logps/rejected": -106.16627502441406, "loss": 1.037, "margin_dpo/margin_mean": 9.934186935424805, "margin_dpo/margin_std": 15.274032592773438, "step": 580 }, { "epoch": 0.8783068783068783, "grad_norm": 37.067745208740234, "learning_rate": 2.259200116137039e-08, "logits/chosen": 0.38778138160705566, "logits/rejected": 0.32042139768600464, "logps/chosen": -79.5643539428711, "logps/ref_chosen": -59.332359313964844, "logps/ref_rejected": -83.64482116699219, "logps/rejected": -112.95232391357422, "loss": 1.0798, "margin_dpo/margin_mean": 9.075504302978516, "margin_dpo/margin_std": 15.379884719848633, "step": 581 }, { "epoch": 0.8798185941043084, "grad_norm": 32.96426010131836, "learning_rate": 2.204591459016525e-08, "logits/chosen": 0.3775950074195862, "logits/rejected": 0.3984692394733429, "logps/chosen": -82.11659240722656, "logps/ref_chosen": -64.16285705566406, "logps/ref_rejected": -58.632896423339844, "logps/rejected": -86.89315795898438, "loss": 0.9115, "margin_dpo/margin_mean": 10.306524276733398, "margin_dpo/margin_std": 13.311269760131836, "step": 582 }, { "epoch": 0.8813303099017384, "grad_norm": 35.847442626953125, "learning_rate": 2.1506204384751064e-08, "logits/chosen": 0.46830427646636963, "logits/rejected": 0.35817262530326843, "logps/chosen": -69.75498962402344, "logps/ref_chosen": -51.87239456176758, "logps/ref_rejected": -83.86331176757812, "logps/rejected": -111.9892807006836, "loss": 1.044, "margin_dpo/margin_mean": 10.24338150024414, "margin_dpo/margin_std": 16.19095230102539, "step": 583 }, { "epoch": 0.8828420256991686, "grad_norm": 33.298362731933594, "learning_rate": 2.09728856419826e-08, "logits/chosen": 0.48178231716156006, "logits/rejected": 0.3758727014064789, "logps/chosen": -63.06849670410156, "logps/ref_chosen": -46.571388244628906, "logps/ref_rejected": -80.67969512939453, "logps/rejected": -107.80296325683594, "loss": 1.053, "margin_dpo/margin_mean": 10.626161575317383, "margin_dpo/margin_std": 16.727203369140625, "step": 584 }, { "epoch": 0.8843537414965986, "grad_norm": 31.498868942260742, "learning_rate": 2.044597327993153e-08, "logits/chosen": 0.33858704566955566, "logits/rejected": 0.29580289125442505, "logps/chosen": -76.06266784667969, "logps/ref_chosen": -58.124534606933594, "logps/ref_rejected": -79.00538635253906, "logps/rejected": -106.02567291259766, "loss": 1.0402, "margin_dpo/margin_mean": 9.082149505615234, "margin_dpo/margin_std": 14.097719192504883, "step": 585 }, { "epoch": 0.8858654572940288, "grad_norm": 29.214014053344727, "learning_rate": 1.9925482037469187e-08, "logits/chosen": 0.4083937406539917, "logits/rejected": 0.35212117433547974, "logps/chosen": -72.71726989746094, "logps/ref_chosen": -54.10163879394531, "logps/ref_rejected": -63.72113037109375, "logps/rejected": -94.09498596191406, "loss": 0.8183, "margin_dpo/margin_mean": 11.758225440979004, "margin_dpo/margin_std": 13.04593276977539, "step": 586 }, { "epoch": 0.8873771730914588, "grad_norm": 40.19536209106445, "learning_rate": 1.9411426473854687e-08, "logits/chosen": 0.39255088567733765, "logits/rejected": 0.3729506731033325, "logps/chosen": -80.76512145996094, "logps/ref_chosen": -63.41719436645508, "logps/ref_rejected": -63.47003936767578, "logps/rejected": -91.57522583007812, "loss": 1.0638, "margin_dpo/margin_mean": 10.757262229919434, "margin_dpo/margin_std": 17.08779525756836, "step": 587 }, { "epoch": 0.8888888888888888, "grad_norm": 33.136600494384766, "learning_rate": 1.890382096832699e-08, "logits/chosen": 0.42146050930023193, "logits/rejected": 0.3690452575683594, "logps/chosen": -80.40387725830078, "logps/ref_chosen": -62.20103454589844, "logps/ref_rejected": -82.10250091552734, "logps/rejected": -110.79915618896484, "loss": 0.929, "margin_dpo/margin_mean": 10.493810653686523, "margin_dpo/margin_std": 14.043756484985352, "step": 588 }, { "epoch": 0.890400604686319, "grad_norm": 30.288169860839844, "learning_rate": 1.840267971970344e-08, "logits/chosen": 0.3641907572746277, "logits/rejected": 0.32878395915031433, "logps/chosen": -72.49046325683594, "logps/ref_chosen": -56.71361541748047, "logps/ref_rejected": -76.7366943359375, "logps/rejected": -106.24540710449219, "loss": 0.7499, "margin_dpo/margin_mean": 13.731870651245117, "margin_dpo/margin_std": 14.987432479858398, "step": 589 }, { "epoch": 0.891912320483749, "grad_norm": 31.437152862548828, "learning_rate": 1.7908016745981856e-08, "logits/chosen": 0.332854300737381, "logits/rejected": 0.29905757308006287, "logps/chosen": -85.63790893554688, "logps/ref_chosen": -66.5138168334961, "logps/ref_rejected": -85.70820617675781, "logps/rejected": -118.0446548461914, "loss": 0.8782, "margin_dpo/margin_mean": 13.212362289428711, "margin_dpo/margin_std": 16.211627960205078, "step": 590 }, { "epoch": 0.8934240362811792, "grad_norm": 31.040788650512695, "learning_rate": 1.7419845883949098e-08, "logits/chosen": 0.43007323145866394, "logits/rejected": 0.3634907603263855, "logps/chosen": -76.70008850097656, "logps/ref_chosen": -60.697181701660156, "logps/ref_rejected": -86.12278747558594, "logps/rejected": -114.46322631835938, "loss": 0.9683, "margin_dpo/margin_mean": 12.337522506713867, "margin_dpo/margin_std": 16.39020347595215, "step": 591 }, { "epoch": 0.8949357520786092, "grad_norm": 32.1717643737793, "learning_rate": 1.6938180788793556e-08, "logits/chosen": 0.4040185213088989, "logits/rejected": 0.28919947147369385, "logps/chosen": -68.08457946777344, "logps/ref_chosen": -51.237327575683594, "logps/ref_rejected": -81.60243225097656, "logps/rejected": -109.53164672851562, "loss": 0.905, "margin_dpo/margin_mean": 11.08197021484375, "margin_dpo/margin_std": 14.641545295715332, "step": 592 }, { "epoch": 0.8964474678760394, "grad_norm": 34.61304473876953, "learning_rate": 1.6463034933723336e-08, "logits/chosen": 0.3542747497558594, "logits/rejected": 0.2594214081764221, "logps/chosen": -57.7242546081543, "logps/ref_chosen": -42.08000183105469, "logps/ref_rejected": -68.47499084472656, "logps/rejected": -94.96978759765625, "loss": 1.0714, "margin_dpo/margin_mean": 10.850542068481445, "margin_dpo/margin_std": 17.000240325927734, "step": 593 }, { "epoch": 0.8979591836734694, "grad_norm": 29.87799835205078, "learning_rate": 1.5994421609589385e-08, "logits/chosen": 0.29593855142593384, "logits/rejected": 0.2723734378814697, "logps/chosen": -82.42706298828125, "logps/ref_chosen": -63.65867614746094, "logps/ref_rejected": -70.35597229003906, "logps/rejected": -97.96664428710938, "loss": 1.0163, "margin_dpo/margin_mean": 8.84228515625, "margin_dpo/margin_std": 13.406206130981445, "step": 594 }, { "epoch": 0.8994708994708994, "grad_norm": 30.320661544799805, "learning_rate": 1.553235392451377e-08, "logits/chosen": 0.41393527388572693, "logits/rejected": 0.31939074397087097, "logps/chosen": -74.23045349121094, "logps/ref_chosen": -56.21875762939453, "logps/ref_rejected": -83.95773315429688, "logps/rejected": -116.33284759521484, "loss": 0.8554, "margin_dpo/margin_mean": 14.363415718078613, "margin_dpo/margin_std": 17.355724334716797, "step": 595 }, { "epoch": 0.9009826152683296, "grad_norm": 36.3000373840332, "learning_rate": 1.507684480352292e-08, "logits/chosen": 0.2564888596534729, "logits/rejected": 0.2615135610103607, "logps/chosen": -87.64381408691406, "logps/ref_chosen": -68.48088073730469, "logps/ref_rejected": -61.732967376708984, "logps/rejected": -88.15812683105469, "loss": 1.1944, "margin_dpo/margin_mean": 7.262219429016113, "margin_dpo/margin_std": 15.237005233764648, "step": 596 }, { "epoch": 0.9024943310657596, "grad_norm": 26.79124641418457, "learning_rate": 1.4627906988186111e-08, "logits/chosen": 0.32503068447113037, "logits/rejected": 0.2960435152053833, "logps/chosen": -64.060791015625, "logps/ref_chosen": -48.85750961303711, "logps/ref_rejected": -55.068084716796875, "logps/rejected": -80.88280487060547, "loss": 0.9442, "margin_dpo/margin_mean": 10.611440658569336, "margin_dpo/margin_std": 14.508207321166992, "step": 597 }, { "epoch": 0.9040060468631897, "grad_norm": 41.459163665771484, "learning_rate": 1.4185553036259095e-08, "logits/chosen": 0.3746280074119568, "logits/rejected": 0.29736125469207764, "logps/chosen": -78.41563415527344, "logps/ref_chosen": -58.88715362548828, "logps/ref_rejected": -81.43145751953125, "logps/rejected": -108.66373443603516, "loss": 1.1658, "margin_dpo/margin_mean": 7.703801155090332, "margin_dpo/margin_std": 14.998854637145996, "step": 598 }, { "epoch": 0.9055177626606198, "grad_norm": 34.81147384643555, "learning_rate": 1.3749795321332885e-08, "logits/chosen": 0.453105092048645, "logits/rejected": 0.4056002199649811, "logps/chosen": -78.00206756591797, "logps/ref_chosen": -57.60719680786133, "logps/ref_rejected": -71.80469512939453, "logps/rejected": -101.34292602539062, "loss": 1.0565, "margin_dpo/margin_mean": 9.143360137939453, "margin_dpo/margin_std": 14.960365295410156, "step": 599 }, { "epoch": 0.9070294784580499, "grad_norm": 31.092208862304688, "learning_rate": 1.3320646032487393e-08, "logits/chosen": 0.4238000214099884, "logits/rejected": 0.3725815415382385, "logps/chosen": -77.0726547241211, "logps/ref_chosen": -58.44231414794922, "logps/ref_rejected": -83.64639282226562, "logps/rejected": -111.70862579345703, "loss": 1.0459, "margin_dpo/margin_mean": 9.431896209716797, "margin_dpo/margin_std": 14.470096588134766, "step": 600 }, { "epoch": 0.9070294784580499, "eval_logits/chosen": 0.37978631258010864, "eval_logits/rejected": 0.32850033044815063, "eval_logps/chosen": -92.03856658935547, "eval_logps/ref_chosen": -74.85946655273438, "eval_logps/ref_rejected": -79.54898834228516, "eval_logps/rejected": -106.09295654296875, "eval_loss": 0.5259022116661072, "eval_margin_dpo/margin_mean": 9.36485481262207, "eval_margin_dpo/margin_std": 14.809694290161133, "eval_runtime": 38.6664, "eval_samples_per_second": 59.561, "eval_steps_per_second": 1.862, "step": 600 }, { "epoch": 0.90854119425548, "grad_norm": 28.272085189819336, "learning_rate": 1.2898117173950868e-08, "logits/chosen": 0.3726602792739868, "logits/rejected": 0.29538896679878235, "logps/chosen": -70.87857055664062, "logps/ref_chosen": -55.59432601928711, "logps/ref_rejected": -83.68630981445312, "logps/rejected": -111.41099548339844, "loss": 0.8946, "margin_dpo/margin_mean": 12.440434455871582, "margin_dpo/margin_std": 15.828836441040039, "step": 601 }, { "epoch": 0.91005291005291, "grad_norm": 29.497037887573242, "learning_rate": 1.2482220564763667e-08, "logits/chosen": 0.36928558349609375, "logits/rejected": 0.3300653100013733, "logps/chosen": -70.74989318847656, "logps/ref_chosen": -56.349185943603516, "logps/ref_rejected": -71.9959716796875, "logps/rejected": -97.75289916992188, "loss": 0.857, "margin_dpo/margin_mean": 11.356219291687012, "margin_dpo/margin_std": 13.522655487060547, "step": 602 }, { "epoch": 0.9115646258503401, "grad_norm": 27.89227294921875, "learning_rate": 1.2072967838448051e-08, "logits/chosen": 0.318248450756073, "logits/rejected": 0.25470617413520813, "logps/chosen": -68.64102172851562, "logps/ref_chosen": -53.168392181396484, "logps/ref_rejected": -73.8604736328125, "logps/rejected": -100.43838500976562, "loss": 0.8921, "margin_dpo/margin_mean": 11.10527515411377, "margin_dpo/margin_std": 14.360272407531738, "step": 603 }, { "epoch": 0.9130763416477702, "grad_norm": 34.568702697753906, "learning_rate": 1.1670370442682459e-08, "logits/chosen": 0.3337470591068268, "logits/rejected": 0.32948338985443115, "logps/chosen": -88.53524780273438, "logps/ref_chosen": -72.64942169189453, "logps/ref_rejected": -69.87926483154297, "logps/rejected": -95.9818115234375, "loss": 1.0618, "margin_dpo/margin_mean": 10.216726303100586, "margin_dpo/margin_std": 16.702648162841797, "step": 604 }, { "epoch": 0.9145880574452003, "grad_norm": 32.900787353515625, "learning_rate": 1.1274439638981532e-08, "logits/chosen": 0.43235743045806885, "logits/rejected": 0.3695913553237915, "logps/chosen": -80.73637390136719, "logps/ref_chosen": -61.61284637451172, "logps/ref_rejected": -79.34398651123047, "logps/rejected": -108.1134033203125, "loss": 1.0151, "margin_dpo/margin_mean": 9.645885467529297, "margin_dpo/margin_std": 14.3195219039917, "step": 605 }, { "epoch": 0.9160997732426304, "grad_norm": 30.54977035522461, "learning_rate": 1.0885186502381016e-08, "logits/chosen": 0.36005428433418274, "logits/rejected": 0.2921282649040222, "logps/chosen": -71.12816619873047, "logps/ref_chosen": -54.464237213134766, "logps/ref_rejected": -79.6270751953125, "logps/rejected": -106.30125427246094, "loss": 0.9359, "margin_dpo/margin_mean": 10.010250091552734, "margin_dpo/margin_std": 13.337453842163086, "step": 606 }, { "epoch": 0.9176114890400605, "grad_norm": 32.681453704833984, "learning_rate": 1.0502621921127774e-08, "logits/chosen": 0.33145594596862793, "logits/rejected": 0.29535144567489624, "logps/chosen": -80.78804016113281, "logps/ref_chosen": -62.86086654663086, "logps/ref_rejected": -72.55020141601562, "logps/rejected": -102.04086303710938, "loss": 0.9373, "margin_dpo/margin_mean": 11.56348991394043, "margin_dpo/margin_std": 15.309553146362305, "step": 607 }, { "epoch": 0.9191232048374905, "grad_norm": 34.54768753051758, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 0.3337676525115967, "logits/rejected": 0.26411697268486023, "logps/chosen": -82.00431823730469, "logps/ref_chosen": -63.18071746826172, "logps/ref_rejected": -99.15888977050781, "logps/rejected": -126.51753234863281, "loss": 1.025, "margin_dpo/margin_mean": 8.535051345825195, "margin_dpo/margin_std": 13.468018531799316, "step": 608 }, { "epoch": 0.9206349206349206, "grad_norm": 25.379526138305664, "learning_rate": 9.757601041885694e-09, "logits/chosen": 0.4193066656589508, "logits/rejected": 0.3789142966270447, "logps/chosen": -64.20651245117188, "logps/ref_chosen": -48.62322235107422, "logps/ref_rejected": -68.28271484375, "logps/rejected": -96.82388305664062, "loss": 0.7597, "margin_dpo/margin_mean": 12.957870483398438, "margin_dpo/margin_std": 13.232034683227539, "step": 609 }, { "epoch": 0.9221466364323507, "grad_norm": 39.13202667236328, "learning_rate": 9.395165583732379e-09, "logits/chosen": 0.3527122139930725, "logits/rejected": 0.3498975336551666, "logps/chosen": -90.61177062988281, "logps/ref_chosen": -72.66513061523438, "logps/ref_rejected": -87.15311431884766, "logps/rejected": -116.95249938964844, "loss": 0.9629, "margin_dpo/margin_mean": 11.85274600982666, "margin_dpo/margin_std": 17.011281967163086, "step": 610 }, { "epoch": 0.9236583522297808, "grad_norm": 36.39158630371094, "learning_rate": 9.03946036001449e-09, "logits/chosen": 0.429378867149353, "logits/rejected": 0.3778746426105499, "logps/chosen": -66.14514923095703, "logps/ref_chosen": -48.30857849121094, "logps/ref_rejected": -70.6141128540039, "logps/rejected": -96.36630249023438, "loss": 1.0813, "margin_dpo/margin_mean": 7.9156084060668945, "margin_dpo/margin_std": 13.126340866088867, "step": 611 }, { "epoch": 0.9251700680272109, "grad_norm": 31.740697860717773, "learning_rate": 8.690495320571839e-09, "logits/chosen": 0.2883661389350891, "logits/rejected": 0.218246728181839, "logps/chosen": -79.38035583496094, "logps/ref_chosen": -61.23155975341797, "logps/ref_rejected": -94.37979888916016, "logps/rejected": -124.63994598388672, "loss": 0.9565, "margin_dpo/margin_mean": 12.111353874206543, "margin_dpo/margin_std": 17.107288360595703, "step": 612 }, { "epoch": 0.926681783824641, "grad_norm": 32.549964904785156, "learning_rate": 8.348280226706722e-09, "logits/chosen": 0.3040260076522827, "logits/rejected": 0.2911207675933838, "logps/chosen": -70.19365692138672, "logps/ref_chosen": -53.98310852050781, "logps/ref_rejected": -58.32208251953125, "logps/rejected": -88.9547348022461, "loss": 0.8032, "margin_dpo/margin_mean": 14.422100067138672, "margin_dpo/margin_std": 16.371356964111328, "step": 613 }, { "epoch": 0.9281934996220711, "grad_norm": 30.036380767822266, "learning_rate": 8.012824650910937e-09, "logits/chosen": 0.39648669958114624, "logits/rejected": 0.3797394633293152, "logps/chosen": -77.901123046875, "logps/ref_chosen": -60.24303436279297, "logps/ref_rejected": -72.26258850097656, "logps/rejected": -100.70169067382812, "loss": 0.8261, "margin_dpo/margin_mean": 10.781007766723633, "margin_dpo/margin_std": 11.667643547058105, "step": 614 }, { "epoch": 0.9297052154195011, "grad_norm": 36.60609436035156, "learning_rate": 7.684137976598088e-09, "logits/chosen": 0.3580027222633362, "logits/rejected": 0.30660200119018555, "logps/chosen": -90.10015106201172, "logps/ref_chosen": -72.09467315673828, "logps/ref_rejected": -104.02980041503906, "logps/rejected": -134.2967987060547, "loss": 0.937, "margin_dpo/margin_mean": 12.261512756347656, "margin_dpo/margin_std": 16.436660766601562, "step": 615 }, { "epoch": 0.9312169312169312, "grad_norm": 31.520265579223633, "learning_rate": 7.36222939784098e-09, "logits/chosen": 0.38119786977767944, "logits/rejected": 0.2980498969554901, "logps/chosen": -77.32060241699219, "logps/ref_chosen": -58.53071975708008, "logps/ref_rejected": -75.48025512695312, "logps/rejected": -102.96885681152344, "loss": 1.0323, "margin_dpo/margin_mean": 8.698728561401367, "margin_dpo/margin_std": 14.142110824584961, "step": 616 }, { "epoch": 0.9327286470143613, "grad_norm": 31.635847091674805, "learning_rate": 7.047107919114586e-09, "logits/chosen": 0.3622846007347107, "logits/rejected": 0.3139178454875946, "logps/chosen": -77.13575744628906, "logps/ref_chosen": -57.608673095703125, "logps/ref_rejected": -81.22109985351562, "logps/rejected": -111.34927368164062, "loss": 0.9236, "margin_dpo/margin_mean": 10.601092338562012, "margin_dpo/margin_std": 13.501839637756348, "step": 617 }, { "epoch": 0.9342403628117913, "grad_norm": 32.08859634399414, "learning_rate": 6.738782355044048e-09, "logits/chosen": 0.33811718225479126, "logits/rejected": 0.23761284351348877, "logps/chosen": -72.75361633300781, "logps/ref_chosen": -56.69594192504883, "logps/ref_rejected": -85.92362976074219, "logps/rejected": -112.8507080078125, "loss": 0.891, "margin_dpo/margin_mean": 10.869397163391113, "margin_dpo/margin_std": 13.591676712036133, "step": 618 }, { "epoch": 0.9357520786092215, "grad_norm": 30.435407638549805, "learning_rate": 6.437261330158206e-09, "logits/chosen": 0.4325089454650879, "logits/rejected": 0.3603595495223999, "logps/chosen": -70.07926940917969, "logps/ref_chosen": -54.05841827392578, "logps/ref_rejected": -83.55493927001953, "logps/rejected": -110.23355865478516, "loss": 0.9168, "margin_dpo/margin_mean": 10.657764434814453, "margin_dpo/margin_std": 14.015510559082031, "step": 619 }, { "epoch": 0.9372637944066515, "grad_norm": 35.472373962402344, "learning_rate": 6.142553278648238e-09, "logits/chosen": 0.36848458647727966, "logits/rejected": 0.3582582175731659, "logps/chosen": -79.79203796386719, "logps/ref_chosen": -63.36971664428711, "logps/ref_rejected": -65.68268585205078, "logps/rejected": -91.74382019042969, "loss": 0.9899, "margin_dpo/margin_mean": 9.638816833496094, "margin_dpo/margin_std": 13.876047134399414, "step": 620 }, { "epoch": 0.9387755102040817, "grad_norm": 35.464088439941406, "learning_rate": 5.854666444131934e-09, "logits/chosen": 0.4091821312904358, "logits/rejected": 0.30643418431282043, "logps/chosen": -69.613037109375, "logps/ref_chosen": -52.321224212646484, "logps/ref_rejected": -88.09001159667969, "logps/rejected": -115.10537719726562, "loss": 1.0238, "margin_dpo/margin_mean": 9.723562240600586, "margin_dpo/margin_std": 14.74032974243164, "step": 621 }, { "epoch": 0.9402872260015117, "grad_norm": 28.998062133789062, "learning_rate": 5.573608879422875e-09, "logits/chosen": 0.3346262276172638, "logits/rejected": 0.29250970482826233, "logps/chosen": -77.71116638183594, "logps/ref_chosen": -59.86545944213867, "logps/ref_rejected": -81.86668395996094, "logps/rejected": -109.695068359375, "loss": 0.8933, "margin_dpo/margin_mean": 9.982682228088379, "margin_dpo/margin_std": 12.632926940917969, "step": 622 }, { "epoch": 0.9417989417989417, "grad_norm": 27.194913864135742, "learning_rate": 5.299388446305342e-09, "logits/chosen": 0.338644802570343, "logits/rejected": 0.27187514305114746, "logps/chosen": -87.81857299804688, "logps/ref_chosen": -67.36846160888672, "logps/ref_rejected": -82.02734375, "logps/rejected": -113.73113250732422, "loss": 0.8523, "margin_dpo/margin_mean": 11.253677368164062, "margin_dpo/margin_std": 13.05603313446045, "step": 623 }, { "epoch": 0.9433106575963719, "grad_norm": 29.42872428894043, "learning_rate": 5.03201281531429e-09, "logits/chosen": 0.35232800245285034, "logits/rejected": 0.2513423562049866, "logps/chosen": -67.53258514404297, "logps/ref_chosen": -51.02655029296875, "logps/ref_rejected": -76.49203491210938, "logps/rejected": -103.98919677734375, "loss": 0.9442, "margin_dpo/margin_mean": 10.991124153137207, "margin_dpo/margin_std": 14.784875869750977, "step": 624 }, { "epoch": 0.9448223733938019, "grad_norm": 32.62732696533203, "learning_rate": 4.7714894655209174e-09, "logits/chosen": 0.4503011405467987, "logits/rejected": 0.3634708523750305, "logps/chosen": -72.8299560546875, "logps/ref_chosen": -54.207618713378906, "logps/ref_rejected": -84.93669891357422, "logps/rejected": -112.27094268798828, "loss": 1.0682, "margin_dpo/margin_mean": 8.711897850036621, "margin_dpo/margin_std": 14.378011703491211, "step": 625 }, { "epoch": 0.9463340891912321, "grad_norm": 30.09181022644043, "learning_rate": 4.517825684323323e-09, "logits/chosen": 0.46222013235092163, "logits/rejected": 0.3331487774848938, "logps/chosen": -61.51173400878906, "logps/ref_chosen": -45.06201934814453, "logps/ref_rejected": -89.66368103027344, "logps/rejected": -118.61964416503906, "loss": 0.9869, "margin_dpo/margin_mean": 12.506250381469727, "margin_dpo/margin_std": 17.837303161621094, "step": 626 }, { "epoch": 0.9478458049886621, "grad_norm": 31.055070877075195, "learning_rate": 4.271028567242818e-09, "logits/chosen": 0.33586978912353516, "logits/rejected": 0.21951913833618164, "logps/chosen": -76.3229751586914, "logps/ref_chosen": -58.791053771972656, "logps/ref_rejected": -94.90802001953125, "logps/rejected": -126.97459411621094, "loss": 0.7957, "margin_dpo/margin_mean": 14.53464412689209, "margin_dpo/margin_std": 16.625656127929688, "step": 627 }, { "epoch": 0.9493575207860923, "grad_norm": 32.846656799316406, "learning_rate": 4.0311050177251895e-09, "logits/chosen": 0.3936043083667755, "logits/rejected": 0.353672593832016, "logps/chosen": -68.23133850097656, "logps/ref_chosen": -52.8035774230957, "logps/ref_rejected": -76.49468994140625, "logps/rejected": -105.67950439453125, "loss": 0.9118, "margin_dpo/margin_mean": 13.75704288482666, "margin_dpo/margin_std": 16.944318771362305, "step": 628 }, { "epoch": 0.9508692365835223, "grad_norm": 28.255290985107422, "learning_rate": 3.798061746947995e-09, "logits/chosen": 0.3765157461166382, "logits/rejected": 0.36460188031196594, "logps/chosen": -87.0561294555664, "logps/ref_chosen": -70.71749877929688, "logps/ref_rejected": -78.9627456665039, "logps/rejected": -106.53677368164062, "loss": 0.7943, "margin_dpo/margin_mean": 11.235391616821289, "margin_dpo/margin_std": 11.854427337646484, "step": 629 }, { "epoch": 0.9523809523809523, "grad_norm": 27.017562866210938, "learning_rate": 3.5719052736323806e-09, "logits/chosen": 0.3252914547920227, "logits/rejected": 0.2737062871456146, "logps/chosen": -73.12852478027344, "logps/ref_chosen": -56.201412200927734, "logps/ref_rejected": -74.69807434082031, "logps/rejected": -103.19023132324219, "loss": 0.863, "margin_dpo/margin_mean": 11.56503963470459, "margin_dpo/margin_std": 13.917753219604492, "step": 630 }, { "epoch": 0.9538926681783825, "grad_norm": 33.7799072265625, "learning_rate": 3.352641923861144e-09, "logits/chosen": 0.46569541096687317, "logits/rejected": 0.3548794686794281, "logps/chosen": -74.5596923828125, "logps/ref_chosen": -58.820594787597656, "logps/ref_rejected": -96.51437377929688, "logps/rejected": -125.98038482666016, "loss": 0.8959, "margin_dpo/margin_mean": 13.726908683776855, "margin_dpo/margin_std": 17.125089645385742, "step": 631 }, { "epoch": 0.9554043839758125, "grad_norm": 29.515872955322266, "learning_rate": 3.140277830901428e-09, "logits/chosen": 0.4190269708633423, "logits/rejected": 0.38895538449287415, "logps/chosen": -75.45999908447266, "logps/ref_chosen": -58.786048889160156, "logps/ref_rejected": -67.21923828125, "logps/rejected": -97.81733703613281, "loss": 0.7872, "margin_dpo/margin_mean": 13.924144744873047, "margin_dpo/margin_std": 14.660362243652344, "step": 632 }, { "epoch": 0.9569160997732427, "grad_norm": 29.584177017211914, "learning_rate": 2.9348189350335007e-09, "logits/chosen": 0.3622671365737915, "logits/rejected": 0.29672741889953613, "logps/chosen": -67.86599731445312, "logps/ref_chosen": -52.13019561767578, "logps/ref_rejected": -67.23016357421875, "logps/rejected": -93.01034545898438, "loss": 1.0208, "margin_dpo/margin_mean": 10.044373512268066, "margin_dpo/margin_std": 15.52154541015625, "step": 633 }, { "epoch": 0.9584278155706727, "grad_norm": 42.10969161987305, "learning_rate": 2.736270983384276e-09, "logits/chosen": 0.4223307967185974, "logits/rejected": 0.4292535185813904, "logps/chosen": -78.71293640136719, "logps/ref_chosen": -60.97979736328125, "logps/ref_rejected": -58.50825119018555, "logps/rejected": -82.54983520507812, "loss": 1.2272, "margin_dpo/margin_mean": 6.308449745178223, "margin_dpo/margin_std": 14.052802085876465, "step": 634 }, { "epoch": 0.9599395313681028, "grad_norm": 39.1640739440918, "learning_rate": 2.5446395297668287e-09, "logits/chosen": 0.28009316325187683, "logits/rejected": 0.21607697010040283, "logps/chosen": -87.28462219238281, "logps/ref_chosen": -65.9730224609375, "logps/ref_rejected": -85.61316680908203, "logps/rejected": -115.56608581542969, "loss": 1.1864, "margin_dpo/margin_mean": 8.641317367553711, "margin_dpo/margin_std": 16.910058975219727, "step": 635 }, { "epoch": 0.9614512471655329, "grad_norm": 28.020477294921875, "learning_rate": 2.359929934524829e-09, "logits/chosen": 0.3389211595058441, "logits/rejected": 0.24626314640045166, "logps/chosen": -65.96224975585938, "logps/ref_chosen": -49.140167236328125, "logps/ref_rejected": -81.26970672607422, "logps/rejected": -110.5828857421875, "loss": 0.7771, "margin_dpo/margin_mean": 12.491098403930664, "margin_dpo/margin_std": 13.732412338256836, "step": 636 }, { "epoch": 0.9629629629629629, "grad_norm": 37.647308349609375, "learning_rate": 2.1821473643827137e-09, "logits/chosen": 0.32874488830566406, "logits/rejected": 0.25553327798843384, "logps/chosen": -95.09454345703125, "logps/ref_chosen": -73.69658660888672, "logps/ref_rejected": -83.01487731933594, "logps/rejected": -113.97785949707031, "loss": 1.0925, "margin_dpo/margin_mean": 9.565020561218262, "margin_dpo/margin_std": 17.08456802368164, "step": 637 }, { "epoch": 0.9644746787603931, "grad_norm": 31.366626739501953, "learning_rate": 2.0112967923011646e-09, "logits/chosen": 0.3579593896865845, "logits/rejected": 0.3046620488166809, "logps/chosen": -82.60899353027344, "logps/ref_chosen": -62.78158187866211, "logps/ref_rejected": -85.40478515625, "logps/rejected": -118.306396484375, "loss": 0.8519, "margin_dpo/margin_mean": 13.074191093444824, "margin_dpo/margin_std": 15.700433731079102, "step": 638 }, { "epoch": 0.9659863945578231, "grad_norm": 31.602121353149414, "learning_rate": 1.847382997337943e-09, "logits/chosen": 0.34738531708717346, "logits/rejected": 0.23237836360931396, "logps/chosen": -71.09160614013672, "logps/ref_chosen": -53.76658248901367, "logps/ref_rejected": -72.30009460449219, "logps/rejected": -101.43208312988281, "loss": 0.8877, "margin_dpo/margin_mean": 11.806966781616211, "margin_dpo/margin_std": 15.63563346862793, "step": 639 }, { "epoch": 0.9674981103552532, "grad_norm": 33.430606842041016, "learning_rate": 1.690410564514244e-09, "logits/chosen": 0.4351768493652344, "logits/rejected": 0.3735978901386261, "logps/chosen": -69.39120483398438, "logps/ref_chosen": -51.41777801513672, "logps/ref_rejected": -77.27879333496094, "logps/rejected": -104.54359436035156, "loss": 1.0784, "margin_dpo/margin_mean": 9.291372299194336, "margin_dpo/margin_std": 15.052839279174805, "step": 640 }, { "epoch": 0.9690098261526833, "grad_norm": 39.402584075927734, "learning_rate": 1.5403838846864692e-09, "logits/chosen": 0.3504864573478699, "logits/rejected": 0.31922876834869385, "logps/chosen": -89.49940490722656, "logps/ref_chosen": -71.0546646118164, "logps/ref_rejected": -82.2440185546875, "logps/rejected": -110.66531372070312, "loss": 0.91, "margin_dpo/margin_mean": 9.976564407348633, "margin_dpo/margin_std": 13.33207893371582, "step": 641 }, { "epoch": 0.9705215419501134, "grad_norm": 39.39484786987305, "learning_rate": 1.3973071544233218e-09, "logits/chosen": 0.31102094054222107, "logits/rejected": 0.31289154291152954, "logps/chosen": -88.44036865234375, "logps/ref_chosen": -68.92927551269531, "logps/ref_rejected": -70.85682678222656, "logps/rejected": -99.15159606933594, "loss": 1.1142, "margin_dpo/margin_mean": 8.783670425415039, "margin_dpo/margin_std": 15.272052764892578, "step": 642 }, { "epoch": 0.9720332577475435, "grad_norm": 99.86732482910156, "learning_rate": 1.261184375888541e-09, "logits/chosen": 0.29999518394470215, "logits/rejected": 0.20917370915412903, "logps/chosen": -84.09303283691406, "logps/ref_chosen": -65.30903625488281, "logps/ref_rejected": -83.61613464355469, "logps/rejected": -112.48616790771484, "loss": 0.9726, "margin_dpo/margin_mean": 10.086037635803223, "margin_dpo/margin_std": 14.042640686035156, "step": 643 }, { "epoch": 0.9735449735449735, "grad_norm": 35.0439338684082, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 0.4415176510810852, "logits/rejected": 0.40569156408309937, "logps/chosen": -68.61014556884766, "logps/ref_chosen": -51.002601623535156, "logps/ref_rejected": -64.46372985839844, "logps/rejected": -91.19786834716797, "loss": 1.1389, "margin_dpo/margin_mean": 9.126594543457031, "margin_dpo/margin_std": 16.37747573852539, "step": 644 }, { "epoch": 0.9750566893424036, "grad_norm": 32.50900650024414, "learning_rate": 1.0098157099674987e-09, "logits/chosen": 0.2861027121543884, "logits/rejected": 0.26147884130477905, "logps/chosen": -79.64277648925781, "logps/ref_chosen": -60.963409423828125, "logps/ref_rejected": -69.73353576660156, "logps/rejected": -98.8067626953125, "loss": 0.9447, "margin_dpo/margin_mean": 10.39387035369873, "margin_dpo/margin_std": 14.448659896850586, "step": 645 }, { "epoch": 0.9765684051398337, "grad_norm": 28.102170944213867, "learning_rate": 8.945768539031783e-10, "logits/chosen": 0.4046555757522583, "logits/rejected": 0.3490592837333679, "logps/chosen": -81.68882751464844, "logps/ref_chosen": -62.290069580078125, "logps/ref_rejected": -85.54812622070312, "logps/rejected": -116.33580017089844, "loss": 0.9153, "margin_dpo/margin_mean": 11.388922691345215, "margin_dpo/margin_std": 14.88244342803955, "step": 646 }, { "epoch": 0.9780801209372638, "grad_norm": 35.961692810058594, "learning_rate": 7.863060120144316e-10, "logits/chosen": 0.39894628524780273, "logits/rejected": 0.30425435304641724, "logps/chosen": -86.18168640136719, "logps/ref_chosen": -67.515869140625, "logps/ref_rejected": -101.50870513916016, "logps/rejected": -133.66326904296875, "loss": 0.8038, "margin_dpo/margin_mean": 13.488750457763672, "margin_dpo/margin_std": 15.609901428222656, "step": 647 }, { "epoch": 0.9795918367346939, "grad_norm": 29.62771224975586, "learning_rate": 6.850062128694045e-10, "logits/chosen": 0.33446913957595825, "logits/rejected": 0.2666047215461731, "logps/chosen": -84.09801483154297, "logps/ref_chosen": -64.59593963623047, "logps/ref_rejected": -83.384033203125, "logps/rejected": -114.47706604003906, "loss": 0.9224, "margin_dpo/margin_mean": 11.590965270996094, "margin_dpo/margin_std": 15.026180267333984, "step": 648 }, { "epoch": 0.981103552532124, "grad_norm": 36.72792434692383, "learning_rate": 5.906802900412788e-10, "logits/chosen": 0.38652855157852173, "logits/rejected": 0.32458633184432983, "logps/chosen": -67.03204345703125, "logps/ref_chosen": -49.30964660644531, "logps/ref_rejected": -73.73710632324219, "logps/rejected": -102.45809936523438, "loss": 1.0014, "margin_dpo/margin_mean": 10.998601913452148, "margin_dpo/margin_std": 15.62658405303955, "step": 649 }, { "epoch": 0.982615268329554, "grad_norm": 39.920772552490234, "learning_rate": 5.033308820289184e-10, "logits/chosen": 0.4805130958557129, "logits/rejected": 0.41330695152282715, "logps/chosen": -72.3466567993164, "logps/ref_chosen": -55.063262939453125, "logps/ref_rejected": -77.39610290527344, "logps/rejected": -105.56784057617188, "loss": 1.0476, "margin_dpo/margin_mean": 10.888345718383789, "margin_dpo/margin_std": 16.72048568725586, "step": 650 }, { "epoch": 0.9841269841269841, "grad_norm": 33.2890625, "learning_rate": 4.2296043218295606e-10, "logits/chosen": 0.45900759100914, "logits/rejected": 0.38366663455963135, "logps/chosen": -70.79644775390625, "logps/ref_chosen": -54.065162658691406, "logps/ref_rejected": -77.79080200195312, "logps/rejected": -104.88078308105469, "loss": 0.9555, "margin_dpo/margin_mean": 10.358694076538086, "margin_dpo/margin_std": 14.075986862182617, "step": 651 }, { "epoch": 0.9856386999244142, "grad_norm": 34.43223571777344, "learning_rate": 3.4957118863768176e-10, "logits/chosen": 0.3836783170700073, "logits/rejected": 0.3254081606864929, "logps/chosen": -83.17698669433594, "logps/ref_chosen": -63.64030456542969, "logps/ref_rejected": -78.86882019042969, "logps/rejected": -108.52143096923828, "loss": 0.9731, "margin_dpo/margin_mean": 10.115922927856445, "margin_dpo/margin_std": 14.506038665771484, "step": 652 }, { "epoch": 0.9871504157218443, "grad_norm": 34.71416473388672, "learning_rate": 2.831652042480093e-10, "logits/chosen": 0.35164594650268555, "logits/rejected": 0.3016093373298645, "logps/chosen": -79.4456558227539, "logps/ref_chosen": -61.668373107910156, "logps/ref_rejected": -73.83012390136719, "logps/rejected": -102.2838363647461, "loss": 1.0075, "margin_dpo/margin_mean": 10.676427841186523, "margin_dpo/margin_std": 16.090900421142578, "step": 653 }, { "epoch": 0.9886621315192744, "grad_norm": 34.91166305541992, "learning_rate": 2.2374433653205016e-10, "logits/chosen": 0.35892999172210693, "logits/rejected": 0.2633056640625, "logps/chosen": -75.29817962646484, "logps/ref_chosen": -57.568267822265625, "logps/ref_rejected": -87.74789428710938, "logps/rejected": -116.19950103759766, "loss": 0.9221, "margin_dpo/margin_mean": 10.721696853637695, "margin_dpo/margin_std": 14.811450958251953, "step": 654 }, { "epoch": 0.9901738473167044, "grad_norm": 27.576698303222656, "learning_rate": 1.7131024761923852e-10, "logits/chosen": 0.3685477375984192, "logits/rejected": 0.270671010017395, "logps/chosen": -67.61747741699219, "logps/ref_chosen": -52.14714813232422, "logps/ref_rejected": -80.85014343261719, "logps/rejected": -107.67730712890625, "loss": 0.8058, "margin_dpo/margin_mean": 11.356832504272461, "margin_dpo/margin_std": 11.787322998046875, "step": 655 }, { "epoch": 0.9916855631141346, "grad_norm": 28.61968994140625, "learning_rate": 1.2586440420372934e-10, "logits/chosen": 0.3276837468147278, "logits/rejected": 0.27165547013282776, "logps/chosen": -91.76471710205078, "logps/ref_chosen": -73.25672912597656, "logps/ref_rejected": -85.35127258300781, "logps/rejected": -116.37913513183594, "loss": 0.8199, "margin_dpo/margin_mean": 12.519876480102539, "margin_dpo/margin_std": 14.494292259216309, "step": 656 }, { "epoch": 0.9931972789115646, "grad_norm": 30.431232452392578, "learning_rate": 8.740807750345913e-11, "logits/chosen": 0.49412286281585693, "logits/rejected": 0.402765154838562, "logps/chosen": -67.19498443603516, "logps/ref_chosen": -49.72339630126953, "logps/ref_rejected": -75.15686798095703, "logps/rejected": -106.35232543945312, "loss": 0.8435, "margin_dpo/margin_mean": 13.723871231079102, "margin_dpo/margin_std": 16.22945213317871, "step": 657 }, { "epoch": 0.9947089947089947, "grad_norm": 35.37551498413086, "learning_rate": 5.594234322453539e-11, "logits/chosen": 0.4039853811264038, "logits/rejected": 0.36136534810066223, "logps/chosen": -82.059814453125, "logps/ref_chosen": -63.04634094238281, "logps/ref_rejected": -83.44963073730469, "logps/rejected": -112.26458740234375, "loss": 1.1246, "margin_dpo/margin_mean": 9.801492691040039, "margin_dpo/margin_std": 16.81917381286621, "step": 658 }, { "epoch": 0.9962207105064248, "grad_norm": 31.186094284057617, "learning_rate": 3.146808153123293e-11, "logits/chosen": 0.4496970474720001, "logits/rejected": 0.38089311122894287, "logps/chosen": -74.04220581054688, "logps/ref_chosen": -55.0802001953125, "logps/ref_rejected": -71.91049194335938, "logps/rejected": -99.90020751953125, "loss": 1.0572, "margin_dpo/margin_mean": 9.027709007263184, "margin_dpo/margin_std": 14.64944076538086, "step": 659 }, { "epoch": 0.9977324263038548, "grad_norm": 30.34589385986328, "learning_rate": 1.3985977021235829e-11, "logits/chosen": 0.47874438762664795, "logits/rejected": 0.40453845262527466, "logps/chosen": -72.7104721069336, "logps/ref_chosen": -54.52591323852539, "logps/ref_rejected": -81.23603820800781, "logps/rejected": -111.93832397460938, "loss": 0.815, "margin_dpo/margin_mean": 12.517721176147461, "margin_dpo/margin_std": 14.22152042388916, "step": 660 }, { "epoch": 0.999244142101285, "grad_norm": 32.01219177246094, "learning_rate": 3.4965187065971735e-12, "logits/chosen": 0.3397953510284424, "logits/rejected": 0.25590038299560547, "logps/chosen": -81.170166015625, "logps/ref_chosen": -60.372642517089844, "logps/ref_rejected": -77.42874908447266, "logps/rejected": -108.52937316894531, "loss": 1.0351, "margin_dpo/margin_mean": 10.303092956542969, "margin_dpo/margin_std": 15.487680435180664, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.049779979526185, "train_runtime": 1908.5591, "train_samples_per_second": 22.182, "train_steps_per_second": 0.346 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }