{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999244142101285, "eval_steps": 100, "global_step": 661, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015117157974300832, "grad_norm": 17.898500442504883, "learning_rate": 0.0, "logits/chosen": 1.6779730319976807, "logits/rejected": 1.8961677551269531, "logps/chosen": -83.44859313964844, "logps/ref_chosen": -83.51206970214844, "logps/ref_rejected": -63.188392639160156, "logps/rejected": -63.05577087402344, "loss": 1.387, "margin_dpo/margin_mean": -0.0691443681716919, "margin_dpo/margin_std": 0.32690051198005676, "step": 1 }, { "epoch": 0.0030234315948601664, "grad_norm": 21.4422550201416, "learning_rate": 7.462686567164179e-09, "logits/chosen": 1.873326063156128, "logits/rejected": 1.763237714767456, "logps/chosen": -64.11585998535156, "logps/ref_chosen": -64.09571838378906, "logps/ref_rejected": -81.48753356933594, "logps/rejected": -81.47525024414062, "loss": 1.3839, "margin_dpo/margin_mean": -0.032428622245788574, "margin_dpo/margin_std": 0.4314689636230469, "step": 2 }, { "epoch": 0.0045351473922902496, "grad_norm": 19.98535919189453, "learning_rate": 1.4925373134328357e-08, "logits/chosen": 1.8420765399932861, "logits/rejected": 1.5543999671936035, "logps/chosen": -72.33031463623047, "logps/ref_chosen": -72.22918701171875, "logps/ref_rejected": -144.1502685546875, "logps/rejected": -144.16009521484375, "loss": 1.3892, "margin_dpo/margin_mean": -0.09128785133361816, "margin_dpo/margin_std": 0.5284313559532166, "step": 3 }, { "epoch": 0.006046863189720333, "grad_norm": 19.896684646606445, "learning_rate": 2.2388059701492534e-08, "logits/chosen": 1.8040515184402466, "logits/rejected": 1.771704912185669, "logps/chosen": -90.63349914550781, "logps/ref_chosen": -90.63705444335938, "logps/ref_rejected": -94.29634094238281, "logps/rejected": -94.24974822998047, "loss": 1.3924, "margin_dpo/margin_mean": -0.04302978515625, "margin_dpo/margin_std": 0.4008348882198334, "step": 4 }, { "epoch": 0.007558578987150416, "grad_norm": 18.94373321533203, "learning_rate": 2.9850746268656714e-08, "logits/chosen": 1.7836873531341553, "logits/rejected": 1.5461623668670654, "logps/chosen": -71.84033203125, "logps/ref_chosen": -71.76103210449219, "logps/ref_rejected": -131.26585388183594, "logps/rejected": -131.27783203125, "loss": 1.3921, "margin_dpo/margin_mean": -0.06732475757598877, "margin_dpo/margin_std": 0.5110207796096802, "step": 5 }, { "epoch": 0.009070294784580499, "grad_norm": 18.11248207092285, "learning_rate": 3.731343283582089e-08, "logits/chosen": 1.605924129486084, "logits/rejected": 1.2734148502349854, "logps/chosen": -52.6131706237793, "logps/ref_chosen": -52.6322021484375, "logps/ref_rejected": -116.92510986328125, "logps/rejected": -117.07302856445312, "loss": 1.3756, "margin_dpo/margin_mean": 0.16695034503936768, "margin_dpo/margin_std": 0.2736660838127136, "step": 6 }, { "epoch": 0.010582010582010581, "grad_norm": 17.40144157409668, "learning_rate": 4.477611940298507e-08, "logits/chosen": 1.4343695640563965, "logits/rejected": 1.2503747940063477, "logps/chosen": -56.68077850341797, "logps/ref_chosen": -56.668373107910156, "logps/ref_rejected": -68.33268737792969, "logps/rejected": -68.36799621582031, "loss": 1.3953, "margin_dpo/margin_mean": 0.022906064987182617, "margin_dpo/margin_std": 0.34203869104385376, "step": 7 }, { "epoch": 0.012093726379440665, "grad_norm": 18.27359962463379, "learning_rate": 5.223880597014925e-08, "logits/chosen": 1.6712956428527832, "logits/rejected": 1.6307878494262695, "logps/chosen": -110.3510513305664, "logps/ref_chosen": -110.36569213867188, "logps/ref_rejected": -131.55218505859375, "logps/rejected": -131.60867309570312, "loss": 1.3824, "margin_dpo/margin_mean": 0.07113003730773926, "margin_dpo/margin_std": 0.3367266058921814, "step": 8 }, { "epoch": 0.013605442176870748, "grad_norm": 19.336389541625977, "learning_rate": 5.970149253731343e-08, "logits/chosen": 1.619309425354004, "logits/rejected": 1.5667970180511475, "logps/chosen": -105.69021606445312, "logps/ref_chosen": -105.6363754272461, "logps/ref_rejected": -96.51815795898438, "logps/rejected": -96.45393371582031, "loss": 1.384, "margin_dpo/margin_mean": -0.11805999279022217, "margin_dpo/margin_std": 0.6195100545883179, "step": 9 }, { "epoch": 0.015117157974300832, "grad_norm": 19.008155822753906, "learning_rate": 6.71641791044776e-08, "logits/chosen": 1.9410130977630615, "logits/rejected": 1.8286724090576172, "logps/chosen": -82.22754669189453, "logps/ref_chosen": -82.0704574584961, "logps/ref_rejected": -129.03460693359375, "logps/rejected": -129.04812622070312, "loss": 1.3876, "margin_dpo/margin_mean": -0.14356493949890137, "margin_dpo/margin_std": 0.5235624313354492, "step": 10 }, { "epoch": 0.016628873771730914, "grad_norm": 17.27744483947754, "learning_rate": 7.462686567164178e-08, "logits/chosen": 1.5072648525238037, "logits/rejected": 1.386045217514038, "logps/chosen": -60.070247650146484, "logps/ref_chosen": -60.04745101928711, "logps/ref_rejected": -82.62361145019531, "logps/rejected": -82.6066665649414, "loss": 1.3863, "margin_dpo/margin_mean": -0.0397411584854126, "margin_dpo/margin_std": 0.29999005794525146, "step": 11 }, { "epoch": 0.018140589569160998, "grad_norm": 17.409473419189453, "learning_rate": 8.208955223880596e-08, "logits/chosen": 1.499330759048462, "logits/rejected": 1.5519837141036987, "logps/chosen": -92.88629150390625, "logps/ref_chosen": -92.96957397460938, "logps/ref_rejected": -95.16693115234375, "logps/rejected": -95.20138549804688, "loss": 1.3847, "margin_dpo/margin_mean": 0.11772763729095459, "margin_dpo/margin_std": 0.2226865589618683, "step": 12 }, { "epoch": 0.019652305366591082, "grad_norm": 17.753454208374023, "learning_rate": 8.955223880597014e-08, "logits/chosen": 1.6453795433044434, "logits/rejected": 1.3737688064575195, "logps/chosen": -66.47361755371094, "logps/ref_chosen": -66.66192626953125, "logps/ref_rejected": -129.6320343017578, "logps/rejected": -129.6300048828125, "loss": 1.3765, "margin_dpo/margin_mean": 0.18626713752746582, "margin_dpo/margin_std": 0.4635215401649475, "step": 13 }, { "epoch": 0.021164021164021163, "grad_norm": 19.906005859375, "learning_rate": 9.701492537313432e-08, "logits/chosen": 1.969182014465332, "logits/rejected": 1.6678775548934937, "logps/chosen": -69.01173400878906, "logps/ref_chosen": -69.07022094726562, "logps/ref_rejected": -101.53227233886719, "logps/rejected": -101.49339294433594, "loss": 1.3892, "margin_dpo/margin_mean": 0.019611239433288574, "margin_dpo/margin_std": 0.3171485662460327, "step": 14 }, { "epoch": 0.022675736961451247, "grad_norm": 20.356098175048828, "learning_rate": 1.044776119402985e-07, "logits/chosen": 1.5167992115020752, "logits/rejected": 1.2239075899124146, "logps/chosen": -71.29359436035156, "logps/ref_chosen": -71.23948669433594, "logps/ref_rejected": -119.10147094726562, "logps/rejected": -119.09857177734375, "loss": 1.3882, "margin_dpo/margin_mean": -0.05700063705444336, "margin_dpo/margin_std": 0.24678370356559753, "step": 15 }, { "epoch": 0.02418745275888133, "grad_norm": 17.326297760009766, "learning_rate": 1.1194029850746268e-07, "logits/chosen": 1.7005953788757324, "logits/rejected": 1.641419529914856, "logps/chosen": -50.68073272705078, "logps/ref_chosen": -50.60328674316406, "logps/ref_rejected": -73.29092407226562, "logps/rejected": -73.24888610839844, "loss": 1.3938, "margin_dpo/margin_mean": -0.11948823928833008, "margin_dpo/margin_std": 0.3800099790096283, "step": 16 }, { "epoch": 0.025699168556311415, "grad_norm": 18.32792854309082, "learning_rate": 1.1940298507462686e-07, "logits/chosen": 1.9296047687530518, "logits/rejected": 1.856062412261963, "logps/chosen": -73.9635009765625, "logps/ref_chosen": -73.9170150756836, "logps/ref_rejected": -94.39315795898438, "logps/rejected": -94.3359375, "loss": 1.3869, "margin_dpo/margin_mean": -0.1037132740020752, "margin_dpo/margin_std": 0.3462451696395874, "step": 17 }, { "epoch": 0.027210884353741496, "grad_norm": 17.149362564086914, "learning_rate": 1.2686567164179106e-07, "logits/chosen": 1.6179213523864746, "logits/rejected": 1.4698092937469482, "logps/chosen": -61.83678436279297, "logps/ref_chosen": -61.758995056152344, "logps/ref_rejected": -87.37486267089844, "logps/rejected": -87.43037414550781, "loss": 1.3832, "margin_dpo/margin_mean": -0.022275805473327637, "margin_dpo/margin_std": 0.27115654945373535, "step": 18 }, { "epoch": 0.02872260015117158, "grad_norm": 19.716426849365234, "learning_rate": 1.343283582089552e-07, "logits/chosen": 1.7507288455963135, "logits/rejected": 1.6653666496276855, "logps/chosen": -72.43673706054688, "logps/ref_chosen": -72.33351135253906, "logps/ref_rejected": -117.39173889160156, "logps/rejected": -117.52894592285156, "loss": 1.3856, "margin_dpo/margin_mean": 0.033976078033447266, "margin_dpo/margin_std": 0.4243420362472534, "step": 19 }, { "epoch": 0.030234315948601664, "grad_norm": 18.186567306518555, "learning_rate": 1.4179104477611938e-07, "logits/chosen": 1.7312731742858887, "logits/rejected": 1.5466480255126953, "logps/chosen": -70.5750732421875, "logps/ref_chosen": -70.6292953491211, "logps/ref_rejected": -114.001953125, "logps/rejected": -114.04512786865234, "loss": 1.3868, "margin_dpo/margin_mean": 0.09739136695861816, "margin_dpo/margin_std": 0.32309839129447937, "step": 20 }, { "epoch": 0.031746031746031744, "grad_norm": 18.10664176940918, "learning_rate": 1.4925373134328355e-07, "logits/chosen": 1.1802387237548828, "logits/rejected": 1.3718340396881104, "logps/chosen": -96.27203369140625, "logps/ref_chosen": -96.39649963378906, "logps/ref_rejected": -66.3286361694336, "logps/rejected": -66.37677764892578, "loss": 1.3818, "margin_dpo/margin_mean": 0.17261171340942383, "margin_dpo/margin_std": 0.441084623336792, "step": 21 }, { "epoch": 0.03325774754346183, "grad_norm": 18.422239303588867, "learning_rate": 1.5671641791044775e-07, "logits/chosen": 1.3317618370056152, "logits/rejected": 1.1103053092956543, "logps/chosen": -53.521812438964844, "logps/ref_chosen": -53.4959716796875, "logps/ref_rejected": -72.43526458740234, "logps/rejected": -72.42125701904297, "loss": 1.3899, "margin_dpo/margin_mean": -0.03985464572906494, "margin_dpo/margin_std": 0.2670213580131531, "step": 22 }, { "epoch": 0.03476946334089191, "grad_norm": 19.182920455932617, "learning_rate": 1.6417910447761193e-07, "logits/chosen": 2.5757384300231934, "logits/rejected": 2.252424478530884, "logps/chosen": -102.28890228271484, "logps/ref_chosen": -102.3310546875, "logps/ref_rejected": -166.07086181640625, "logps/rejected": -165.99232482910156, "loss": 1.3821, "margin_dpo/margin_mean": -0.03638148307800293, "margin_dpo/margin_std": 0.2729584276676178, "step": 23 }, { "epoch": 0.036281179138321996, "grad_norm": 17.08799171447754, "learning_rate": 1.716417910447761e-07, "logits/chosen": 1.777854323387146, "logits/rejected": 1.5683221817016602, "logps/chosen": -61.035789489746094, "logps/ref_chosen": -61.087989807128906, "logps/ref_rejected": -87.37986755371094, "logps/rejected": -87.38560485839844, "loss": 1.3864, "margin_dpo/margin_mean": 0.057926058769226074, "margin_dpo/margin_std": 0.21351204812526703, "step": 24 }, { "epoch": 0.03779289493575208, "grad_norm": 19.546113967895508, "learning_rate": 1.7910447761194027e-07, "logits/chosen": 1.761453628540039, "logits/rejected": 1.6618437767028809, "logps/chosen": -83.33495330810547, "logps/ref_chosen": -83.45718383789062, "logps/ref_rejected": -108.64176177978516, "logps/rejected": -108.67863464355469, "loss": 1.3821, "margin_dpo/margin_mean": 0.15910708904266357, "margin_dpo/margin_std": 0.20546990633010864, "step": 25 }, { "epoch": 0.039304610733182165, "grad_norm": 17.80427360534668, "learning_rate": 1.8656716417910447e-07, "logits/chosen": 2.1396985054016113, "logits/rejected": 2.102566719055176, "logps/chosen": -84.14248657226562, "logps/ref_chosen": -84.1357421875, "logps/ref_rejected": -127.7447280883789, "logps/rejected": -127.87543487548828, "loss": 1.3757, "margin_dpo/margin_mean": 0.12395787239074707, "margin_dpo/margin_std": 0.3669869005680084, "step": 26 }, { "epoch": 0.04081632653061224, "grad_norm": 18.56528091430664, "learning_rate": 1.9402985074626865e-07, "logits/chosen": 2.218780994415283, "logits/rejected": 2.2191362380981445, "logps/chosen": -63.50068664550781, "logps/ref_chosen": -63.56757736206055, "logps/ref_rejected": -78.59384155273438, "logps/rejected": -78.58531188964844, "loss": 1.3791, "margin_dpo/margin_mean": 0.05835092067718506, "margin_dpo/margin_std": 0.4117608666419983, "step": 27 }, { "epoch": 0.042328042328042326, "grad_norm": 17.719024658203125, "learning_rate": 2.0149253731343282e-07, "logits/chosen": 1.2656989097595215, "logits/rejected": 1.298233985900879, "logps/chosen": -115.55604553222656, "logps/ref_chosen": -115.63783264160156, "logps/ref_rejected": -122.8431167602539, "logps/rejected": -122.87903594970703, "loss": 1.3857, "margin_dpo/margin_mean": 0.11769771575927734, "margin_dpo/margin_std": 0.347103476524353, "step": 28 }, { "epoch": 0.04383975812547241, "grad_norm": 21.243242263793945, "learning_rate": 2.08955223880597e-07, "logits/chosen": 1.8719220161437988, "logits/rejected": 1.925657033920288, "logps/chosen": -91.31411743164062, "logps/ref_chosen": -91.36831665039062, "logps/ref_rejected": -88.87538146972656, "logps/rejected": -88.75587463378906, "loss": 1.3864, "margin_dpo/margin_mean": -0.06529319286346436, "margin_dpo/margin_std": 0.48117566108703613, "step": 29 }, { "epoch": 0.045351473922902494, "grad_norm": 18.94402313232422, "learning_rate": 2.1641791044776117e-07, "logits/chosen": 1.9215854406356812, "logits/rejected": 1.817436695098877, "logps/chosen": -77.03451538085938, "logps/ref_chosen": -77.02618408203125, "logps/ref_rejected": -101.89169311523438, "logps/rejected": -101.98573303222656, "loss": 1.3853, "margin_dpo/margin_mean": 0.0857081413269043, "margin_dpo/margin_std": 0.24434763193130493, "step": 30 }, { "epoch": 0.04686318972033258, "grad_norm": 20.718582153320312, "learning_rate": 2.2388059701492537e-07, "logits/chosen": 1.9631857872009277, "logits/rejected": 1.6834317445755005, "logps/chosen": -87.51710510253906, "logps/ref_chosen": -87.44007873535156, "logps/ref_rejected": -146.84466552734375, "logps/rejected": -146.79762268066406, "loss": 1.3858, "margin_dpo/margin_mean": -0.12406027317047119, "margin_dpo/margin_std": 0.49897146224975586, "step": 31 }, { "epoch": 0.04837490551776266, "grad_norm": 18.561134338378906, "learning_rate": 2.3134328358208954e-07, "logits/chosen": 1.6212668418884277, "logits/rejected": 1.6185669898986816, "logps/chosen": -103.97479248046875, "logps/ref_chosen": -103.8337173461914, "logps/ref_rejected": -139.31954956054688, "logps/rejected": -139.34271240234375, "loss": 1.3826, "margin_dpo/margin_mean": -0.11791133880615234, "margin_dpo/margin_std": 0.37325161695480347, "step": 32 }, { "epoch": 0.049886621315192746, "grad_norm": 19.705949783325195, "learning_rate": 2.388059701492537e-07, "logits/chosen": 1.638818383216858, "logits/rejected": 1.5358712673187256, "logps/chosen": -82.58836364746094, "logps/ref_chosen": -82.53307342529297, "logps/ref_rejected": -99.98233032226562, "logps/rejected": -99.93173217773438, "loss": 1.3901, "margin_dpo/margin_mean": -0.10588181018829346, "margin_dpo/margin_std": 0.48520350456237793, "step": 33 }, { "epoch": 0.05139833711262283, "grad_norm": 18.05829429626465, "learning_rate": 2.4626865671641786e-07, "logits/chosen": 1.557397484779358, "logits/rejected": 1.4068231582641602, "logps/chosen": -67.19465637207031, "logps/ref_chosen": -67.17166137695312, "logps/ref_rejected": -92.30982971191406, "logps/rejected": -92.29624938964844, "loss": 1.3865, "margin_dpo/margin_mean": -0.03656578063964844, "margin_dpo/margin_std": 0.37222912907600403, "step": 34 }, { "epoch": 0.05291005291005291, "grad_norm": 18.105566024780273, "learning_rate": 2.537313432835821e-07, "logits/chosen": 1.2708134651184082, "logits/rejected": 1.3580116033554077, "logps/chosen": -84.13629150390625, "logps/ref_chosen": -84.29032135009766, "logps/ref_rejected": -74.5831527709961, "logps/rejected": -74.57353210449219, "loss": 1.3812, "margin_dpo/margin_mean": 0.1444075107574463, "margin_dpo/margin_std": 0.5001283288002014, "step": 35 }, { "epoch": 0.05442176870748299, "grad_norm": 18.062942504882812, "learning_rate": 2.611940298507462e-07, "logits/chosen": 1.844420313835144, "logits/rejected": 1.6128147840499878, "logps/chosen": -72.6304931640625, "logps/ref_chosen": -72.701416015625, "logps/ref_rejected": -115.04224395751953, "logps/rejected": -115.15150451660156, "loss": 1.3801, "margin_dpo/margin_mean": 0.18017816543579102, "margin_dpo/margin_std": 0.30819839239120483, "step": 36 }, { "epoch": 0.055933484504913075, "grad_norm": 23.642406463623047, "learning_rate": 2.686567164179104e-07, "logits/chosen": 1.5849182605743408, "logits/rejected": 1.3682098388671875, "logps/chosen": -78.53628540039062, "logps/ref_chosen": -78.48605346679688, "logps/ref_rejected": -124.95842742919922, "logps/rejected": -124.8517837524414, "loss": 1.3936, "margin_dpo/margin_mean": -0.1568678617477417, "margin_dpo/margin_std": 0.2898165285587311, "step": 37 }, { "epoch": 0.05744520030234316, "grad_norm": 18.487695693969727, "learning_rate": 2.761194029850746e-07, "logits/chosen": 1.5338054895401, "logits/rejected": 1.4404113292694092, "logps/chosen": -65.9412841796875, "logps/ref_chosen": -66.01943969726562, "logps/ref_rejected": -76.39341735839844, "logps/rejected": -76.41506958007812, "loss": 1.3918, "margin_dpo/margin_mean": 0.09981250762939453, "margin_dpo/margin_std": 0.36171671748161316, "step": 38 }, { "epoch": 0.05895691609977324, "grad_norm": 18.96533203125, "learning_rate": 2.8358208955223876e-07, "logits/chosen": 2.279639720916748, "logits/rejected": 2.1966304779052734, "logps/chosen": -61.78990936279297, "logps/ref_chosen": -61.86280059814453, "logps/ref_rejected": -82.48257446289062, "logps/rejected": -82.45930480957031, "loss": 1.3832, "margin_dpo/margin_mean": 0.04962599277496338, "margin_dpo/margin_std": 0.35977721214294434, "step": 39 }, { "epoch": 0.06046863189720333, "grad_norm": 17.937349319458008, "learning_rate": 2.9104477611940296e-07, "logits/chosen": 2.048196315765381, "logits/rejected": 1.960934042930603, "logps/chosen": -57.42654037475586, "logps/ref_chosen": -57.4611930847168, "logps/ref_rejected": -77.61997985839844, "logps/rejected": -77.71343994140625, "loss": 1.3804, "margin_dpo/margin_mean": 0.12811851501464844, "margin_dpo/margin_std": 0.3518058955669403, "step": 40 }, { "epoch": 0.06198034769463341, "grad_norm": 18.322587966918945, "learning_rate": 2.985074626865671e-07, "logits/chosen": 1.3730394840240479, "logits/rejected": 1.3903639316558838, "logps/chosen": -90.71647644042969, "logps/ref_chosen": -90.73774719238281, "logps/ref_rejected": -98.51454162597656, "logps/rejected": -98.5171127319336, "loss": 1.3821, "margin_dpo/margin_mean": 0.023851871490478516, "margin_dpo/margin_std": 0.3970358371734619, "step": 41 }, { "epoch": 0.06349206349206349, "grad_norm": 18.129680633544922, "learning_rate": 3.059701492537313e-07, "logits/chosen": 1.6288437843322754, "logits/rejected": 1.575326681137085, "logps/chosen": -81.23544311523438, "logps/ref_chosen": -81.25213623046875, "logps/ref_rejected": -95.20384216308594, "logps/rejected": -95.15198516845703, "loss": 1.3798, "margin_dpo/margin_mean": -0.03517186641693115, "margin_dpo/margin_std": 0.3057817816734314, "step": 42 }, { "epoch": 0.06500377928949358, "grad_norm": 19.412158966064453, "learning_rate": 3.134328358208955e-07, "logits/chosen": 1.4714133739471436, "logits/rejected": 1.2787561416625977, "logps/chosen": -79.71080017089844, "logps/ref_chosen": -79.79239654541016, "logps/ref_rejected": -103.97785949707031, "logps/rejected": -103.95821380615234, "loss": 1.3826, "margin_dpo/margin_mean": 0.061962008476257324, "margin_dpo/margin_std": 0.3317086398601532, "step": 43 }, { "epoch": 0.06651549508692366, "grad_norm": 18.9310302734375, "learning_rate": 3.2089552238805965e-07, "logits/chosen": 1.8219784498214722, "logits/rejected": 1.8434821367263794, "logps/chosen": -102.7157974243164, "logps/ref_chosen": -102.73938751220703, "logps/ref_rejected": -97.6513671875, "logps/rejected": -97.61663055419922, "loss": 1.3834, "margin_dpo/margin_mean": -0.011139988899230957, "margin_dpo/margin_std": 0.3221431076526642, "step": 44 }, { "epoch": 0.06802721088435375, "grad_norm": 19.376144409179688, "learning_rate": 3.2835820895522385e-07, "logits/chosen": 1.81675386428833, "logits/rejected": 1.8069252967834473, "logps/chosen": -92.9081802368164, "logps/ref_chosen": -92.98908996582031, "logps/ref_rejected": -104.06941223144531, "logps/rejected": -104.01454162597656, "loss": 1.3829, "margin_dpo/margin_mean": 0.02604234218597412, "margin_dpo/margin_std": 0.49965983629226685, "step": 45 }, { "epoch": 0.06953892668178382, "grad_norm": 18.400278091430664, "learning_rate": 3.3582089552238805e-07, "logits/chosen": 1.9280130863189697, "logits/rejected": 1.9604179859161377, "logps/chosen": -112.65037536621094, "logps/ref_chosen": -112.74551391601562, "logps/ref_rejected": -100.84678649902344, "logps/rejected": -100.83540344238281, "loss": 1.3795, "margin_dpo/margin_mean": 0.08375799655914307, "margin_dpo/margin_std": 0.4423407316207886, "step": 46 }, { "epoch": 0.0710506424792139, "grad_norm": 19.051105499267578, "learning_rate": 3.432835820895522e-07, "logits/chosen": 2.0528926849365234, "logits/rejected": 2.015890598297119, "logps/chosen": -70.34732055664062, "logps/ref_chosen": -70.36965942382812, "logps/ref_rejected": -71.17677307128906, "logps/rejected": -71.1390609741211, "loss": 1.3848, "margin_dpo/margin_mean": -0.015374064445495605, "margin_dpo/margin_std": 0.34213775396347046, "step": 47 }, { "epoch": 0.07256235827664399, "grad_norm": 18.34172248840332, "learning_rate": 3.507462686567164e-07, "logits/chosen": 1.851947546005249, "logits/rejected": 1.7987971305847168, "logps/chosen": -73.01834106445312, "logps/ref_chosen": -73.19611358642578, "logps/ref_rejected": -90.55521392822266, "logps/rejected": -90.53504943847656, "loss": 1.3808, "margin_dpo/margin_mean": 0.15761232376098633, "margin_dpo/margin_std": 0.29501742124557495, "step": 48 }, { "epoch": 0.07407407407407407, "grad_norm": 17.46572494506836, "learning_rate": 3.5820895522388055e-07, "logits/chosen": 1.5749485492706299, "logits/rejected": 1.4956673383712769, "logps/chosen": -54.538917541503906, "logps/ref_chosen": -54.60618591308594, "logps/ref_rejected": -64.04939270019531, "logps/rejected": -63.959739685058594, "loss": 1.3817, "margin_dpo/margin_mean": -0.022379517555236816, "margin_dpo/margin_std": 0.2070123553276062, "step": 49 }, { "epoch": 0.07558578987150416, "grad_norm": 17.93285369873047, "learning_rate": 3.6567164179104475e-07, "logits/chosen": 1.7781734466552734, "logits/rejected": 1.642246961593628, "logps/chosen": -79.31803131103516, "logps/ref_chosen": -79.22439575195312, "logps/ref_rejected": -101.6429443359375, "logps/rejected": -101.7005615234375, "loss": 1.3867, "margin_dpo/margin_mean": -0.03601944446563721, "margin_dpo/margin_std": 0.31964874267578125, "step": 50 }, { "epoch": 0.07709750566893424, "grad_norm": 18.864973068237305, "learning_rate": 3.7313432835820895e-07, "logits/chosen": 1.895892858505249, "logits/rejected": 1.764426827430725, "logps/chosen": -60.76810836791992, "logps/ref_chosen": -60.84101104736328, "logps/ref_rejected": -84.53193664550781, "logps/rejected": -84.66233825683594, "loss": 1.3903, "margin_dpo/margin_mean": 0.20329368114471436, "margin_dpo/margin_std": 0.522630512714386, "step": 51 }, { "epoch": 0.07860922146636433, "grad_norm": 20.314756393432617, "learning_rate": 3.805970149253731e-07, "logits/chosen": 1.752061128616333, "logits/rejected": 1.5397191047668457, "logps/chosen": -87.34332275390625, "logps/ref_chosen": -87.34712219238281, "logps/ref_rejected": -130.8834991455078, "logps/rejected": -131.06954956054688, "loss": 1.3797, "margin_dpo/margin_mean": 0.1898595094680786, "margin_dpo/margin_std": 0.4715108275413513, "step": 52 }, { "epoch": 0.0801209372637944, "grad_norm": 17.26243019104004, "learning_rate": 3.880597014925373e-07, "logits/chosen": 1.3963996171951294, "logits/rejected": 1.4448646306991577, "logps/chosen": -78.25464630126953, "logps/ref_chosen": -78.33966064453125, "logps/ref_rejected": -69.99455261230469, "logps/rejected": -69.97331237792969, "loss": 1.3743, "margin_dpo/margin_mean": 0.06377887725830078, "margin_dpo/margin_std": 0.3493618965148926, "step": 53 }, { "epoch": 0.08163265306122448, "grad_norm": 16.3485050201416, "learning_rate": 3.9552238805970144e-07, "logits/chosen": 1.4432241916656494, "logits/rejected": 1.4330339431762695, "logps/chosen": -62.21860122680664, "logps/ref_chosen": -62.2901611328125, "logps/ref_rejected": -66.09869384765625, "logps/rejected": -66.14897155761719, "loss": 1.3802, "margin_dpo/margin_mean": 0.12183797359466553, "margin_dpo/margin_std": 0.33962827920913696, "step": 54 }, { "epoch": 0.08314436885865457, "grad_norm": 19.248918533325195, "learning_rate": 4.0298507462686564e-07, "logits/chosen": 1.589186191558838, "logits/rejected": 1.3769021034240723, "logps/chosen": -63.20148849487305, "logps/ref_chosen": -63.309165954589844, "logps/ref_rejected": -106.18276977539062, "logps/rejected": -106.23402404785156, "loss": 1.3636, "margin_dpo/margin_mean": 0.15891790390014648, "margin_dpo/margin_std": 0.29119423031806946, "step": 55 }, { "epoch": 0.08465608465608465, "grad_norm": 16.991220474243164, "learning_rate": 4.1044776119402984e-07, "logits/chosen": 1.6669023036956787, "logits/rejected": 1.8024544715881348, "logps/chosen": -88.55504608154297, "logps/ref_chosen": -88.68550109863281, "logps/ref_rejected": -79.2552490234375, "logps/rejected": -79.27470397949219, "loss": 1.376, "margin_dpo/margin_mean": 0.1499018669128418, "margin_dpo/margin_std": 0.39563676714897156, "step": 56 }, { "epoch": 0.08616780045351474, "grad_norm": 18.93277359008789, "learning_rate": 4.17910447761194e-07, "logits/chosen": 1.9262073040008545, "logits/rejected": 1.8704383373260498, "logps/chosen": -74.62947082519531, "logps/ref_chosen": -74.7513427734375, "logps/ref_rejected": -109.95883178710938, "logps/rejected": -110.10675048828125, "loss": 1.3705, "margin_dpo/margin_mean": 0.2697904109954834, "margin_dpo/margin_std": 0.37658262252807617, "step": 57 }, { "epoch": 0.08767951625094482, "grad_norm": 19.255691528320312, "learning_rate": 4.253731343283582e-07, "logits/chosen": 1.4077178239822388, "logits/rejected": 1.257333755493164, "logps/chosen": -75.95296478271484, "logps/ref_chosen": -76.02732849121094, "logps/ref_rejected": -108.35926818847656, "logps/rejected": -108.50439453125, "loss": 1.3828, "margin_dpo/margin_mean": 0.2194993495941162, "margin_dpo/margin_std": 0.40784844756126404, "step": 58 }, { "epoch": 0.08919123204837491, "grad_norm": 16.736886978149414, "learning_rate": 4.3283582089552234e-07, "logits/chosen": 1.9333748817443848, "logits/rejected": 1.6212902069091797, "logps/chosen": -53.054588317871094, "logps/ref_chosen": -53.1632080078125, "logps/ref_rejected": -99.89010620117188, "logps/rejected": -100.01497650146484, "loss": 1.37, "margin_dpo/margin_mean": 0.2335038185119629, "margin_dpo/margin_std": 0.6068885922431946, "step": 59 }, { "epoch": 0.09070294784580499, "grad_norm": 18.47850799560547, "learning_rate": 4.4029850746268654e-07, "logits/chosen": 2.0820059776306152, "logits/rejected": 2.1231188774108887, "logps/chosen": -82.00350952148438, "logps/ref_chosen": -82.04094696044922, "logps/ref_rejected": -85.6044692993164, "logps/rejected": -85.51055908203125, "loss": 1.3753, "margin_dpo/margin_mean": -0.056465864181518555, "margin_dpo/margin_std": 0.5121511816978455, "step": 60 }, { "epoch": 0.09221466364323508, "grad_norm": 16.533052444458008, "learning_rate": 4.4776119402985074e-07, "logits/chosen": 1.287696123123169, "logits/rejected": 1.1252989768981934, "logps/chosen": -53.68785095214844, "logps/ref_chosen": -53.741973876953125, "logps/ref_rejected": -95.9638671875, "logps/rejected": -95.99526977539062, "loss": 1.3822, "margin_dpo/margin_mean": 0.0855402946472168, "margin_dpo/margin_std": 0.41517889499664307, "step": 61 }, { "epoch": 0.09372637944066516, "grad_norm": 17.853179931640625, "learning_rate": 4.552238805970149e-07, "logits/chosen": 1.9671781063079834, "logits/rejected": 1.7240625619888306, "logps/chosen": -61.32941436767578, "logps/ref_chosen": -61.45232009887695, "logps/ref_rejected": -104.53389739990234, "logps/rejected": -104.56465148925781, "loss": 1.3697, "margin_dpo/margin_mean": 0.15366590023040771, "margin_dpo/margin_std": 0.4133530259132385, "step": 62 }, { "epoch": 0.09523809523809523, "grad_norm": 19.25010108947754, "learning_rate": 4.626865671641791e-07, "logits/chosen": 2.477019786834717, "logits/rejected": 2.518435001373291, "logps/chosen": -93.96156311035156, "logps/ref_chosen": -93.94019317626953, "logps/ref_rejected": -125.72433471679688, "logps/rejected": -125.82534790039062, "loss": 1.3647, "margin_dpo/margin_mean": 0.07965302467346191, "margin_dpo/margin_std": 0.6282739043235779, "step": 63 }, { "epoch": 0.09674981103552532, "grad_norm": 16.468244552612305, "learning_rate": 4.701492537313433e-07, "logits/chosen": 1.5718178749084473, "logits/rejected": 1.3083699941635132, "logps/chosen": -60.390655517578125, "logps/ref_chosen": -60.620521545410156, "logps/ref_rejected": -92.48592376708984, "logps/rejected": -92.69236755371094, "loss": 1.3805, "margin_dpo/margin_mean": 0.4363136291503906, "margin_dpo/margin_std": 0.5674552917480469, "step": 64 }, { "epoch": 0.0982615268329554, "grad_norm": 18.87264633178711, "learning_rate": 4.776119402985074e-07, "logits/chosen": 2.0484414100646973, "logits/rejected": 2.146376609802246, "logps/chosen": -93.19002532958984, "logps/ref_chosen": -93.0993423461914, "logps/ref_rejected": -74.82710266113281, "logps/rejected": -74.88607025146484, "loss": 1.3796, "margin_dpo/margin_mean": -0.03171539306640625, "margin_dpo/margin_std": 0.6118614673614502, "step": 65 }, { "epoch": 0.09977324263038549, "grad_norm": 18.51951026916504, "learning_rate": 4.850746268656717e-07, "logits/chosen": 1.9283275604248047, "logits/rejected": 1.8140395879745483, "logps/chosen": -89.08029174804688, "logps/ref_chosen": -89.0787353515625, "logps/ref_rejected": -98.55683135986328, "logps/rejected": -98.80810546875, "loss": 1.3673, "margin_dpo/margin_mean": 0.24970781803131104, "margin_dpo/margin_std": 0.5531671047210693, "step": 66 }, { "epoch": 0.10128495842781557, "grad_norm": 18.96343421936035, "learning_rate": 4.925373134328357e-07, "logits/chosen": 1.8661693334579468, "logits/rejected": 1.7169712781906128, "logps/chosen": -80.62449645996094, "logps/ref_chosen": -80.75080871582031, "logps/ref_rejected": -92.10690307617188, "logps/rejected": -92.14776611328125, "loss": 1.3681, "margin_dpo/margin_mean": 0.16716492176055908, "margin_dpo/margin_std": 0.4882839322090149, "step": 67 }, { "epoch": 0.10279667422524566, "grad_norm": 19.684844970703125, "learning_rate": 5e-07, "logits/chosen": 1.8137176036834717, "logits/rejected": 1.5234215259552002, "logps/chosen": -100.9151840209961, "logps/ref_chosen": -100.94575500488281, "logps/ref_rejected": -131.35989379882812, "logps/rejected": -131.56048583984375, "loss": 1.3684, "margin_dpo/margin_mean": 0.23115086555480957, "margin_dpo/margin_std": 0.761376142501831, "step": 68 }, { "epoch": 0.10430839002267574, "grad_norm": 19.874393463134766, "learning_rate": 4.999965034812934e-07, "logits/chosen": 1.8117549419403076, "logits/rejected": 1.678621768951416, "logps/chosen": -73.54576873779297, "logps/ref_chosen": -73.87828063964844, "logps/ref_rejected": -85.08430480957031, "logps/rejected": -84.94237518310547, "loss": 1.3614, "margin_dpo/margin_mean": 0.1905810832977295, "margin_dpo/margin_std": 0.37505558133125305, "step": 69 }, { "epoch": 0.10582010582010581, "grad_norm": 18.305349349975586, "learning_rate": 4.999860140229787e-07, "logits/chosen": 1.5521442890167236, "logits/rejected": 1.6108993291854858, "logps/chosen": -91.65077209472656, "logps/ref_chosen": -91.87322235107422, "logps/ref_rejected": -82.92180633544922, "logps/rejected": -82.74408721923828, "loss": 1.3662, "margin_dpo/margin_mean": 0.04472362995147705, "margin_dpo/margin_std": 0.6186438798904419, "step": 70 }, { "epoch": 0.1073318216175359, "grad_norm": 17.6241397857666, "learning_rate": 4.999685319184688e-07, "logits/chosen": 1.6633296012878418, "logits/rejected": 1.6592857837677002, "logps/chosen": -59.971195220947266, "logps/ref_chosen": -60.23143768310547, "logps/ref_rejected": -76.09031677246094, "logps/rejected": -75.9674301147461, "loss": 1.3757, "margin_dpo/margin_mean": 0.13735723495483398, "margin_dpo/margin_std": 0.5450801253318787, "step": 71 }, { "epoch": 0.10884353741496598, "grad_norm": 20.622314453125, "learning_rate": 4.999440576567755e-07, "logits/chosen": 1.6190707683563232, "logits/rejected": 1.3904025554656982, "logps/chosen": -62.686309814453125, "logps/ref_chosen": -63.191131591796875, "logps/ref_rejected": -76.39436340332031, "logps/rejected": -76.22337341308594, "loss": 1.3638, "margin_dpo/margin_mean": 0.3338280916213989, "margin_dpo/margin_std": 0.5702110528945923, "step": 72 }, { "epoch": 0.11035525321239607, "grad_norm": 19.382307052612305, "learning_rate": 4.999125919224965e-07, "logits/chosen": 1.445723056793213, "logits/rejected": 1.4411249160766602, "logps/chosen": -76.63996887207031, "logps/ref_chosen": -76.78716278076172, "logps/ref_rejected": -67.70246887207031, "logps/rejected": -67.4369888305664, "loss": 1.3854, "margin_dpo/margin_mean": -0.1182854175567627, "margin_dpo/margin_std": 0.7074819803237915, "step": 73 }, { "epoch": 0.11186696900982615, "grad_norm": 18.078689575195312, "learning_rate": 4.998741355957963e-07, "logits/chosen": 2.0661163330078125, "logits/rejected": 1.8220714330673218, "logps/chosen": -77.66769409179688, "logps/ref_chosen": -77.9533920288086, "logps/ref_rejected": -128.09182739257812, "logps/rejected": -128.16378784179688, "loss": 1.3623, "margin_dpo/margin_mean": 0.3576490879058838, "margin_dpo/margin_std": 0.6111558079719543, "step": 74 }, { "epoch": 0.11337868480725624, "grad_norm": 16.98634910583496, "learning_rate": 4.998286897523808e-07, "logits/chosen": 1.8484798669815063, "logits/rejected": 1.7624216079711914, "logps/chosen": -56.08860778808594, "logps/ref_chosen": -56.33122253417969, "logps/ref_rejected": -70.73942565917969, "logps/rejected": -70.81351470947266, "loss": 1.3622, "margin_dpo/margin_mean": 0.31670188903808594, "margin_dpo/margin_std": 0.5669878721237183, "step": 75 }, { "epoch": 0.11489040060468632, "grad_norm": 17.737001419067383, "learning_rate": 4.997762556634679e-07, "logits/chosen": 1.139418363571167, "logits/rejected": 1.0257502794265747, "logps/chosen": -74.759033203125, "logps/ref_chosen": -74.93817138671875, "logps/ref_rejected": -107.31590270996094, "logps/rejected": -107.27798461914062, "loss": 1.3611, "margin_dpo/margin_mean": 0.14121675491333008, "margin_dpo/margin_std": 0.6016703844070435, "step": 76 }, { "epoch": 0.1164021164021164, "grad_norm": 18.354162216186523, "learning_rate": 4.99716834795752e-07, "logits/chosen": 0.7272888422012329, "logits/rejected": 0.8730248808860779, "logps/chosen": -74.24947357177734, "logps/ref_chosen": -74.51144409179688, "logps/ref_rejected": -67.98213195800781, "logps/rejected": -67.81912231445312, "loss": 1.3501, "margin_dpo/margin_mean": 0.09897100925445557, "margin_dpo/margin_std": 0.7666027545928955, "step": 77 }, { "epoch": 0.11791383219954649, "grad_norm": 17.864540100097656, "learning_rate": 4.996504288113623e-07, "logits/chosen": 1.55259370803833, "logits/rejected": 1.5353095531463623, "logps/chosen": -89.1446533203125, "logps/ref_chosen": -89.7486572265625, "logps/ref_rejected": -106.78245544433594, "logps/rejected": -106.82298278808594, "loss": 1.3534, "margin_dpo/margin_mean": 0.6445306539535522, "margin_dpo/margin_std": 0.6010682582855225, "step": 78 }, { "epoch": 0.11942554799697656, "grad_norm": 19.36505699157715, "learning_rate": 4.995770395678171e-07, "logits/chosen": 1.8799240589141846, "logits/rejected": 1.9199717044830322, "logps/chosen": -71.10166931152344, "logps/ref_chosen": -71.39693450927734, "logps/ref_rejected": -78.64067077636719, "logps/rejected": -78.40392303466797, "loss": 1.3473, "margin_dpo/margin_mean": 0.05852353572845459, "margin_dpo/margin_std": 0.9495965838432312, "step": 79 }, { "epoch": 0.12093726379440665, "grad_norm": 17.543731689453125, "learning_rate": 4.994966691179711e-07, "logits/chosen": 1.6827142238616943, "logits/rejected": 1.415197730064392, "logps/chosen": -72.31405639648438, "logps/ref_chosen": -72.54411315917969, "logps/ref_rejected": -90.85714721679688, "logps/rejected": -90.972900390625, "loss": 1.3598, "margin_dpo/margin_mean": 0.3458261489868164, "margin_dpo/margin_std": 0.9633051156997681, "step": 80 }, { "epoch": 0.12244897959183673, "grad_norm": 17.892345428466797, "learning_rate": 4.994093197099587e-07, "logits/chosen": 1.3688116073608398, "logits/rejected": 1.2893130779266357, "logps/chosen": -72.74852752685547, "logps/ref_chosen": -73.00962829589844, "logps/ref_rejected": -81.4422607421875, "logps/rejected": -81.30767822265625, "loss": 1.3537, "margin_dpo/margin_mean": 0.1265125274658203, "margin_dpo/margin_std": 0.7544887065887451, "step": 81 }, { "epoch": 0.12396069538926682, "grad_norm": 17.94148826599121, "learning_rate": 4.993149937871306e-07, "logits/chosen": 1.7386322021484375, "logits/rejected": 1.5193268060684204, "logps/chosen": -41.30272674560547, "logps/ref_chosen": -42.107208251953125, "logps/ref_rejected": -75.11695861816406, "logps/rejected": -75.17755889892578, "loss": 1.3334, "margin_dpo/margin_mean": 0.8650846481323242, "margin_dpo/margin_std": 0.6220800876617432, "step": 82 }, { "epoch": 0.1254724111866969, "grad_norm": 18.747753143310547, "learning_rate": 4.992136939879856e-07, "logits/chosen": 1.240645408630371, "logits/rejected": 0.9125269651412964, "logps/chosen": -72.11739349365234, "logps/ref_chosen": -72.58721160888672, "logps/ref_rejected": -103.82908630371094, "logps/rejected": -103.79228973388672, "loss": 1.349, "margin_dpo/margin_mean": 0.43301212787628174, "margin_dpo/margin_std": 0.7614338397979736, "step": 83 }, { "epoch": 0.12698412698412698, "grad_norm": 19.350725173950195, "learning_rate": 4.991054231460969e-07, "logits/chosen": 1.7762892246246338, "logits/rejected": 1.6091362237930298, "logps/chosen": -80.59298706054688, "logps/ref_chosen": -81.03014373779297, "logps/ref_rejected": -82.0133056640625, "logps/rejected": -82.28628540039062, "loss": 1.341, "margin_dpo/margin_mean": 0.7101401090621948, "margin_dpo/margin_std": 0.7795432806015015, "step": 84 }, { "epoch": 0.12849584278155707, "grad_norm": 17.555471420288086, "learning_rate": 4.989901842900325e-07, "logits/chosen": 1.923715353012085, "logits/rejected": 1.8312654495239258, "logps/chosen": -77.27107238769531, "logps/ref_chosen": -77.72187805175781, "logps/ref_rejected": -98.06354522705078, "logps/rejected": -97.93031311035156, "loss": 1.3362, "margin_dpo/margin_mean": 0.3175792694091797, "margin_dpo/margin_std": 0.8862060308456421, "step": 85 }, { "epoch": 0.13000755857898716, "grad_norm": 17.512916564941406, "learning_rate": 4.988679806432711e-07, "logits/chosen": 1.612224817276001, "logits/rejected": 1.5968964099884033, "logps/chosen": -77.68038940429688, "logps/ref_chosen": -77.72428131103516, "logps/ref_rejected": -81.12899780273438, "logps/rejected": -81.25384521484375, "loss": 1.3548, "margin_dpo/margin_mean": 0.16873645782470703, "margin_dpo/margin_std": 0.7599313855171204, "step": 86 }, { "epoch": 0.13151927437641722, "grad_norm": 18.990942001342773, "learning_rate": 4.987388156241114e-07, "logits/chosen": 1.6482123136520386, "logits/rejected": 1.47174072265625, "logps/chosen": -110.45002746582031, "logps/ref_chosen": -111.06234741210938, "logps/ref_rejected": -107.74360656738281, "logps/rejected": -107.89732360839844, "loss": 1.3232, "margin_dpo/margin_mean": 0.7660520076751709, "margin_dpo/margin_std": 1.299325704574585, "step": 87 }, { "epoch": 0.1330309901738473, "grad_norm": 18.259008407592773, "learning_rate": 4.986026928455767e-07, "logits/chosen": 1.1589796543121338, "logits/rejected": 1.1260058879852295, "logps/chosen": -60.53464126586914, "logps/ref_chosen": -60.9940185546875, "logps/ref_rejected": -69.72227478027344, "logps/rejected": -69.83525085449219, "loss": 1.3543, "margin_dpo/margin_mean": 0.5723496675491333, "margin_dpo/margin_std": 0.9446424841880798, "step": 88 }, { "epoch": 0.1345427059712774, "grad_norm": 18.004953384399414, "learning_rate": 4.984596161153135e-07, "logits/chosen": 1.8243141174316406, "logits/rejected": 1.5551257133483887, "logps/chosen": -62.47174835205078, "logps/ref_chosen": -63.249576568603516, "logps/ref_rejected": -96.9591293334961, "logps/rejected": -97.26958465576172, "loss": 1.3078, "margin_dpo/margin_mean": 1.0882878303527832, "margin_dpo/margin_std": 1.2574162483215332, "step": 89 }, { "epoch": 0.1360544217687075, "grad_norm": 19.679380416870117, "learning_rate": 4.983095894354857e-07, "logits/chosen": 1.8491549491882324, "logits/rejected": 1.449577808380127, "logps/chosen": -43.0953254699707, "logps/ref_chosen": -43.73698425292969, "logps/ref_rejected": -101.83931732177734, "logps/rejected": -101.9093246459961, "loss": 1.328, "margin_dpo/margin_mean": 0.711666464805603, "margin_dpo/margin_std": 1.2567521333694458, "step": 90 }, { "epoch": 0.13756613756613756, "grad_norm": 19.679807662963867, "learning_rate": 4.98152617002662e-07, "logits/chosen": 2.0511393547058105, "logits/rejected": 1.7253239154815674, "logps/chosen": -58.7183837890625, "logps/ref_chosen": -59.29620361328125, "logps/ref_rejected": -107.24990844726562, "logps/rejected": -107.39588165283203, "loss": 1.3379, "margin_dpo/margin_mean": 0.7237950563430786, "margin_dpo/margin_std": 1.2198774814605713, "step": 91 }, { "epoch": 0.13907785336356765, "grad_norm": 18.862180709838867, "learning_rate": 4.979887032076988e-07, "logits/chosen": 1.5705971717834473, "logits/rejected": 1.4581992626190186, "logps/chosen": -61.83405685424805, "logps/ref_chosen": -62.420440673828125, "logps/ref_rejected": -69.33434295654297, "logps/rejected": -69.13055419921875, "loss": 1.323, "margin_dpo/margin_mean": 0.38259196281433105, "margin_dpo/margin_std": 0.8398549556732178, "step": 92 }, { "epoch": 0.14058956916099774, "grad_norm": 16.00171661376953, "learning_rate": 4.978178526356172e-07, "logits/chosen": 1.8979381322860718, "logits/rejected": 1.6970547437667847, "logps/chosen": -62.854522705078125, "logps/ref_chosen": -63.689697265625, "logps/ref_rejected": -85.74089813232422, "logps/rejected": -85.52314758300781, "loss": 1.3393, "margin_dpo/margin_mean": 0.6174291372299194, "margin_dpo/margin_std": 1.9055800437927246, "step": 93 }, { "epoch": 0.1421012849584278, "grad_norm": 22.29401969909668, "learning_rate": 4.976400700654751e-07, "logits/chosen": 1.802495002746582, "logits/rejected": 1.880399227142334, "logps/chosen": -97.23482513427734, "logps/ref_chosen": -98.1005859375, "logps/ref_rejected": -85.92489624023438, "logps/rejected": -85.80846405029297, "loss": 1.3018, "margin_dpo/margin_mean": 0.7493376731872559, "margin_dpo/margin_std": 1.8231675624847412, "step": 94 }, { "epoch": 0.1436130007558579, "grad_norm": 18.579496383666992, "learning_rate": 4.974553604702332e-07, "logits/chosen": 1.119457721710205, "logits/rejected": 1.0089519023895264, "logps/chosen": -73.76954650878906, "logps/ref_chosen": -74.1230697631836, "logps/ref_rejected": -115.10316467285156, "logps/rejected": -115.4325180053711, "loss": 1.3236, "margin_dpo/margin_mean": 0.6828739643096924, "margin_dpo/margin_std": 1.5515177249908447, "step": 95 }, { "epoch": 0.14512471655328799, "grad_norm": 18.99415397644043, "learning_rate": 4.972637290166157e-07, "logits/chosen": 1.751003623008728, "logits/rejected": 1.4340192079544067, "logps/chosen": -89.32980346679688, "logps/ref_chosen": -89.7393569946289, "logps/ref_rejected": -124.8184814453125, "logps/rejected": -125.43689727783203, "loss": 1.3155, "margin_dpo/margin_mean": 1.0279643535614014, "margin_dpo/margin_std": 1.3005374670028687, "step": 96 }, { "epoch": 0.14663643235071808, "grad_norm": 19.33478355407715, "learning_rate": 4.970651810649666e-07, "logits/chosen": 1.2102348804473877, "logits/rejected": 1.4555165767669678, "logps/chosen": -111.822021484375, "logps/ref_chosen": -111.78030395507812, "logps/ref_rejected": -71.13333129882812, "logps/rejected": -70.88111877441406, "loss": 1.3681, "margin_dpo/margin_mean": -0.2939218282699585, "margin_dpo/margin_std": 2.1070876121520996, "step": 97 }, { "epoch": 0.14814814814814814, "grad_norm": 17.386751174926758, "learning_rate": 4.968597221690985e-07, "logits/chosen": 1.446455478668213, "logits/rejected": 1.4816169738769531, "logps/chosen": -81.63556671142578, "logps/ref_chosen": -81.82776641845703, "logps/ref_rejected": -85.55567932128906, "logps/rejected": -85.81871032714844, "loss": 1.3505, "margin_dpo/margin_mean": 0.45524585247039795, "margin_dpo/margin_std": 1.4239929914474487, "step": 98 }, { "epoch": 0.14965986394557823, "grad_norm": 18.084707260131836, "learning_rate": 4.966473580761389e-07, "logits/chosen": 2.1066040992736816, "logits/rejected": 1.8866024017333984, "logps/chosen": -79.00135803222656, "logps/ref_chosen": -79.82548522949219, "logps/ref_rejected": -121.76324462890625, "logps/rejected": -122.25846099853516, "loss": 1.3311, "margin_dpo/margin_mean": 1.319352626800537, "margin_dpo/margin_std": 1.8709536790847778, "step": 99 }, { "epoch": 0.15117157974300832, "grad_norm": 19.6969051361084, "learning_rate": 4.964280947263676e-07, "logits/chosen": 2.006331443786621, "logits/rejected": 1.9770634174346924, "logps/chosen": -117.69312286376953, "logps/ref_chosen": -118.92030334472656, "logps/ref_rejected": -121.2953872680664, "logps/rejected": -121.53430938720703, "loss": 1.3236, "margin_dpo/margin_mean": 1.4661006927490234, "margin_dpo/margin_std": 1.6880757808685303, "step": 100 }, { "epoch": 0.15117157974300832, "eval_logits/chosen": 1.6972841024398804, "eval_logits/rejected": 1.5878300666809082, "eval_logps/chosen": -86.64280700683594, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -95.478515625, "eval_loss": 0.6539920568466187, "eval_margin_dpo/margin_mean": 0.9205958247184753, "eval_margin_dpo/margin_std": 1.8427311182022095, "eval_runtime": 42.5856, "eval_samples_per_second": 54.079, "eval_steps_per_second": 1.691, "step": 100 }, { "epoch": 0.15268329554043839, "grad_norm": 16.30322265625, "learning_rate": 4.96201938253052e-07, "logits/chosen": 0.7374497652053833, "logits/rejected": 0.592478334903717, "logps/chosen": -83.25194549560547, "logps/ref_chosen": -83.76132202148438, "logps/ref_rejected": -95.51852416992188, "logps/rejected": -95.7083511352539, "loss": 1.2977, "margin_dpo/margin_mean": 0.6992102861404419, "margin_dpo/margin_std": 1.7295677661895752, "step": 101 }, { "epoch": 0.15419501133786848, "grad_norm": 17.33847999572754, "learning_rate": 4.959688949822748e-07, "logits/chosen": 1.1919291019439697, "logits/rejected": 1.2928612232208252, "logps/chosen": -79.9405517578125, "logps/ref_chosen": -80.03215789794922, "logps/ref_rejected": -83.39249420166016, "logps/rejected": -82.97933197021484, "loss": 1.3384, "margin_dpo/margin_mean": -0.3215571641921997, "margin_dpo/margin_std": 1.3972532749176025, "step": 102 }, { "epoch": 0.15570672713529857, "grad_norm": 17.533742904663086, "learning_rate": 4.957289714327572e-07, "logits/chosen": 1.7225117683410645, "logits/rejected": 1.7683024406433105, "logps/chosen": -70.19136047363281, "logps/ref_chosen": -71.04435729980469, "logps/ref_rejected": -74.17625427246094, "logps/rejected": -74.07469177246094, "loss": 1.2856, "margin_dpo/margin_mean": 0.7514312267303467, "margin_dpo/margin_std": 1.703355312347412, "step": 103 }, { "epoch": 0.15721844293272866, "grad_norm": 18.58421516418457, "learning_rate": 4.954821743156767e-07, "logits/chosen": 1.756042242050171, "logits/rejected": 1.475834608078003, "logps/chosen": -78.34564971923828, "logps/ref_chosen": -79.14974975585938, "logps/ref_rejected": -132.1797637939453, "logps/rejected": -132.19769287109375, "loss": 1.2865, "margin_dpo/margin_mean": 0.8220130205154419, "margin_dpo/margin_std": 2.4924440383911133, "step": 104 }, { "epoch": 0.15873015873015872, "grad_norm": 18.827482223510742, "learning_rate": 4.952285105344791e-07, "logits/chosen": 1.4627060890197754, "logits/rejected": 1.2466471195220947, "logps/chosen": -60.80680465698242, "logps/ref_chosen": -62.11750030517578, "logps/ref_rejected": -92.18594360351562, "logps/rejected": -91.99990844726562, "loss": 1.3087, "margin_dpo/margin_mean": 1.1246623992919922, "margin_dpo/margin_std": 2.7592616081237793, "step": 105 }, { "epoch": 0.1602418745275888, "grad_norm": 17.016199111938477, "learning_rate": 4.949679871846857e-07, "logits/chosen": 1.6649564504623413, "logits/rejected": 1.4698894023895264, "logps/chosen": -66.28129577636719, "logps/ref_chosen": -67.84797668457031, "logps/ref_rejected": -82.6180419921875, "logps/rejected": -82.52056884765625, "loss": 1.312, "margin_dpo/margin_mean": 1.469221591949463, "margin_dpo/margin_std": 2.3686070442199707, "step": 106 }, { "epoch": 0.1617535903250189, "grad_norm": 19.119508743286133, "learning_rate": 4.947006115536947e-07, "logits/chosen": 1.3240102529525757, "logits/rejected": 1.515006184577942, "logps/chosen": -111.12362670898438, "logps/ref_chosen": -111.48698425292969, "logps/ref_rejected": -90.05622863769531, "logps/rejected": -90.30194091796875, "loss": 1.3359, "margin_dpo/margin_mean": 0.6090667247772217, "margin_dpo/margin_std": 2.2777419090270996, "step": 107 }, { "epoch": 0.16326530612244897, "grad_norm": 17.164888381958008, "learning_rate": 4.944263911205772e-07, "logits/chosen": 1.2025978565216064, "logits/rejected": 0.9316722750663757, "logps/chosen": -77.45760345458984, "logps/ref_chosen": -78.80503845214844, "logps/ref_rejected": -109.21432495117188, "logps/rejected": -109.38449096679688, "loss": 1.311, "margin_dpo/margin_mean": 1.5176045894622803, "margin_dpo/margin_std": 1.843069076538086, "step": 108 }, { "epoch": 0.16477702191987906, "grad_norm": 18.004884719848633, "learning_rate": 4.941453335558681e-07, "logits/chosen": 1.7693791389465332, "logits/rejected": 1.5222632884979248, "logps/chosen": -98.57215881347656, "logps/ref_chosen": -99.75715637207031, "logps/ref_rejected": -122.0266342163086, "logps/rejected": -122.67662048339844, "loss": 1.2745, "margin_dpo/margin_mean": 1.8349872827529907, "margin_dpo/margin_std": 2.162475824356079, "step": 109 }, { "epoch": 0.16628873771730915, "grad_norm": 20.493450164794922, "learning_rate": 4.938574467213517e-07, "logits/chosen": 1.19692862033844, "logits/rejected": 1.260218858718872, "logps/chosen": -83.29539489746094, "logps/ref_chosen": -84.31623840332031, "logps/ref_rejected": -77.30545043945312, "logps/rejected": -76.93648529052734, "loss": 1.3781, "margin_dpo/margin_mean": 0.6518844366073608, "margin_dpo/margin_std": 2.516350507736206, "step": 110 }, { "epoch": 0.16780045351473924, "grad_norm": 16.46819496154785, "learning_rate": 4.935627386698418e-07, "logits/chosen": 1.463053822517395, "logits/rejected": 1.4688405990600586, "logps/chosen": -91.22607421875, "logps/ref_chosen": -91.90132141113281, "logps/ref_rejected": -91.20811462402344, "logps/rejected": -91.19815826416016, "loss": 1.3051, "margin_dpo/margin_mean": 0.6652882099151611, "margin_dpo/margin_std": 2.3565096855163574, "step": 111 }, { "epoch": 0.1693121693121693, "grad_norm": 19.188968658447266, "learning_rate": 4.932612176449559e-07, "logits/chosen": 1.3166104555130005, "logits/rejected": 1.1191773414611816, "logps/chosen": -79.05795288085938, "logps/ref_chosen": -80.13862609863281, "logps/ref_rejected": -122.18325805664062, "logps/rejected": -122.52186584472656, "loss": 1.2818, "margin_dpo/margin_mean": 1.4192678928375244, "margin_dpo/margin_std": 2.294623851776123, "step": 112 }, { "epoch": 0.1708238851095994, "grad_norm": 16.653589248657227, "learning_rate": 4.929528920808854e-07, "logits/chosen": 0.9543758034706116, "logits/rejected": 1.1388777494430542, "logps/chosen": -92.203369140625, "logps/ref_chosen": -92.391845703125, "logps/ref_rejected": -69.11553955078125, "logps/rejected": -68.32243347167969, "loss": 1.3255, "margin_dpo/margin_mean": -0.6046181917190552, "margin_dpo/margin_std": 1.8006412982940674, "step": 113 }, { "epoch": 0.17233560090702948, "grad_norm": 18.097597122192383, "learning_rate": 4.92637770602159e-07, "logits/chosen": 1.7595313787460327, "logits/rejected": 1.7576406002044678, "logps/chosen": -80.02515411376953, "logps/ref_chosen": -81.58061218261719, "logps/ref_rejected": -88.33343505859375, "logps/rejected": -88.24864959716797, "loss": 1.2755, "margin_dpo/margin_mean": 1.470676064491272, "margin_dpo/margin_std": 2.8956403732299805, "step": 114 }, { "epoch": 0.17384731670445955, "grad_norm": 16.87544822692871, "learning_rate": 4.923158620234019e-07, "logits/chosen": 2.099130153656006, "logits/rejected": 1.8026936054229736, "logps/chosen": -76.92066192626953, "logps/ref_chosen": -77.95787048339844, "logps/ref_rejected": -113.61511993408203, "logps/rejected": -113.61744689941406, "loss": 1.2889, "margin_dpo/margin_mean": 1.0395395755767822, "margin_dpo/margin_std": 3.4564995765686035, "step": 115 }, { "epoch": 0.17535903250188964, "grad_norm": 18.161304473876953, "learning_rate": 4.91987175349089e-07, "logits/chosen": 1.5769227743148804, "logits/rejected": 1.3392497301101685, "logps/chosen": -66.2005615234375, "logps/ref_chosen": -67.72766876220703, "logps/ref_rejected": -100.49800109863281, "logps/rejected": -101.14695739746094, "loss": 1.2475, "margin_dpo/margin_mean": 2.176055908203125, "margin_dpo/margin_std": 2.400941848754883, "step": 116 }, { "epoch": 0.17687074829931973, "grad_norm": 16.556798934936523, "learning_rate": 4.916517197732933e-07, "logits/chosen": 1.6647223234176636, "logits/rejected": 1.4812830686569214, "logps/chosen": -62.25347900390625, "logps/ref_chosen": -64.65423583984375, "logps/ref_rejected": -91.2808837890625, "logps/rejected": -90.77375030517578, "loss": 1.2514, "margin_dpo/margin_mean": 1.8936235904693604, "margin_dpo/margin_std": 2.7461957931518555, "step": 117 }, { "epoch": 0.17838246409674982, "grad_norm": 16.51125717163086, "learning_rate": 4.913095046794281e-07, "logits/chosen": 0.902362585067749, "logits/rejected": 0.8815510869026184, "logps/chosen": -79.27388000488281, "logps/ref_chosen": -79.91114044189453, "logps/ref_rejected": -84.76600646972656, "logps/rejected": -84.52806091308594, "loss": 1.2836, "margin_dpo/margin_mean": 0.39931559562683105, "margin_dpo/margin_std": 2.6755645275115967, "step": 118 }, { "epoch": 0.17989417989417988, "grad_norm": 16.9473934173584, "learning_rate": 4.909605396399855e-07, "logits/chosen": 1.8376327753067017, "logits/rejected": 2.0263562202453613, "logps/chosen": -72.18095397949219, "logps/ref_chosen": -73.43350982666016, "logps/ref_rejected": -56.48483657836914, "logps/rejected": -55.133934020996094, "loss": 1.2876, "margin_dpo/margin_mean": -0.09835445880889893, "margin_dpo/margin_std": 4.086493968963623, "step": 119 }, { "epoch": 0.18140589569160998, "grad_norm": 18.088138580322266, "learning_rate": 4.906048344162676e-07, "logits/chosen": 1.978576421737671, "logits/rejected": 1.7176882028579712, "logps/chosen": -72.7327651977539, "logps/ref_chosen": -74.42625427246094, "logps/ref_rejected": -131.71356201171875, "logps/rejected": -132.21151733398438, "loss": 1.2194, "margin_dpo/margin_mean": 2.191432476043701, "margin_dpo/margin_std": 3.389819383621216, "step": 120 }, { "epoch": 0.18291761148904007, "grad_norm": 18.553237915039062, "learning_rate": 4.902423989581143e-07, "logits/chosen": 1.7400338649749756, "logits/rejected": 1.4648916721343994, "logps/chosen": -79.81134033203125, "logps/ref_chosen": -81.15644836425781, "logps/ref_rejected": -146.83558654785156, "logps/rejected": -146.99688720703125, "loss": 1.2714, "margin_dpo/margin_mean": 1.5064103603363037, "margin_dpo/margin_std": 3.948129177093506, "step": 121 }, { "epoch": 0.18442932728647016, "grad_norm": 17.77837562561035, "learning_rate": 4.898732434036243e-07, "logits/chosen": 1.671454906463623, "logits/rejected": 1.3591837882995605, "logps/chosen": -71.13043975830078, "logps/ref_chosen": -73.17575073242188, "logps/ref_rejected": -116.53224182128906, "logps/rejected": -116.39794921875, "loss": 1.2759, "margin_dpo/margin_mean": 1.9110122919082642, "margin_dpo/margin_std": 4.886318206787109, "step": 122 }, { "epoch": 0.18594104308390022, "grad_norm": 17.19220733642578, "learning_rate": 4.894973780788722e-07, "logits/chosen": 1.6772491931915283, "logits/rejected": 1.2726120948791504, "logps/chosen": -59.18843078613281, "logps/ref_chosen": -61.08534240722656, "logps/ref_rejected": -122.7364730834961, "logps/rejected": -123.64124298095703, "loss": 1.2486, "margin_dpo/margin_mean": 2.8016741275787354, "margin_dpo/margin_std": 3.3240303993225098, "step": 123 }, { "epoch": 0.1874527588813303, "grad_norm": 17.891834259033203, "learning_rate": 4.89114813497619e-07, "logits/chosen": 1.9690539836883545, "logits/rejected": 1.493447184562683, "logps/chosen": -47.25743103027344, "logps/ref_chosen": -51.044490814208984, "logps/ref_rejected": -124.63666534423828, "logps/rejected": -124.76104736328125, "loss": 1.2373, "margin_dpo/margin_mean": 3.9114441871643066, "margin_dpo/margin_std": 4.223942756652832, "step": 124 }, { "epoch": 0.1889644746787604, "grad_norm": 18.371837615966797, "learning_rate": 4.887255603610184e-07, "logits/chosen": 2.0389623641967773, "logits/rejected": 1.6993787288665771, "logps/chosen": -78.25626373291016, "logps/ref_chosen": -81.12104797363281, "logps/ref_rejected": -129.35906982421875, "logps/rejected": -129.52630615234375, "loss": 1.2143, "margin_dpo/margin_mean": 3.032008409500122, "margin_dpo/margin_std": 3.482339382171631, "step": 125 }, { "epoch": 0.19047619047619047, "grad_norm": 18.194929122924805, "learning_rate": 4.883296295573176e-07, "logits/chosen": 1.1758207082748413, "logits/rejected": 1.118505835533142, "logps/chosen": -70.00385284423828, "logps/ref_chosen": -75.61920166015625, "logps/ref_rejected": -66.16270446777344, "logps/rejected": -63.88288879394531, "loss": 1.2725, "margin_dpo/margin_mean": 3.335521697998047, "margin_dpo/margin_std": 4.482481956481934, "step": 126 }, { "epoch": 0.19198790627362056, "grad_norm": 17.007150650024414, "learning_rate": 4.87927032161552e-07, "logits/chosen": 1.672834873199463, "logits/rejected": 1.5646876096725464, "logps/chosen": -89.33263397216797, "logps/ref_chosen": -92.87060546875, "logps/ref_rejected": -111.15403747558594, "logps/rejected": -110.98653411865234, "loss": 1.1998, "margin_dpo/margin_mean": 3.3704707622528076, "margin_dpo/margin_std": 3.2387959957122803, "step": 127 }, { "epoch": 0.19349962207105065, "grad_norm": 19.154495239257812, "learning_rate": 4.875177794352363e-07, "logits/chosen": 1.5523099899291992, "logits/rejected": 1.4306855201721191, "logps/chosen": -102.0582275390625, "logps/ref_chosen": -104.432373046875, "logps/ref_rejected": -121.4126968383789, "logps/rejected": -120.19316864013672, "loss": 1.2979, "margin_dpo/margin_mean": 1.1546133756637573, "margin_dpo/margin_std": 4.68147611618042, "step": 128 }, { "epoch": 0.19501133786848074, "grad_norm": 18.43849754333496, "learning_rate": 4.871018828260491e-07, "logits/chosen": 1.129783034324646, "logits/rejected": 1.065466284751892, "logps/chosen": -60.99585723876953, "logps/ref_chosen": -64.34329223632812, "logps/ref_rejected": -87.94223022460938, "logps/rejected": -86.94331359863281, "loss": 1.2748, "margin_dpo/margin_mean": 2.348515510559082, "margin_dpo/margin_std": 3.6324710845947266, "step": 129 }, { "epoch": 0.1965230536659108, "grad_norm": 18.230493545532227, "learning_rate": 4.866793539675126e-07, "logits/chosen": 1.6992638111114502, "logits/rejected": 1.6011861562728882, "logps/chosen": -82.06292724609375, "logps/ref_chosen": -85.94593048095703, "logps/ref_rejected": -111.10652160644531, "logps/rejected": -109.91690063476562, "loss": 1.2352, "margin_dpo/margin_mean": 2.6933817863464355, "margin_dpo/margin_std": 3.7233176231384277, "step": 130 }, { "epoch": 0.1980347694633409, "grad_norm": 16.926321029663086, "learning_rate": 4.86250204678667e-07, "logits/chosen": 1.4448974132537842, "logits/rejected": 1.2139010429382324, "logps/chosen": -35.73811340332031, "logps/ref_chosen": -40.8268928527832, "logps/ref_rejected": -66.36492919921875, "logps/rejected": -64.19065856933594, "loss": 1.194, "margin_dpo/margin_mean": 2.914515972137451, "margin_dpo/margin_std": 4.459476470947266, "step": 131 }, { "epoch": 0.19954648526077098, "grad_norm": 17.55213737487793, "learning_rate": 4.858144469637408e-07, "logits/chosen": 1.7195863723754883, "logits/rejected": 1.767103910446167, "logps/chosen": -65.18118286132812, "logps/ref_chosen": -70.0657730102539, "logps/ref_rejected": -57.25163650512695, "logps/rejected": -54.02313995361328, "loss": 1.2516, "margin_dpo/margin_mean": 1.6560922861099243, "margin_dpo/margin_std": 3.1351027488708496, "step": 132 }, { "epoch": 0.20105820105820105, "grad_norm": 17.497833251953125, "learning_rate": 4.853720930118138e-07, "logits/chosen": 1.5292476415634155, "logits/rejected": 1.4932136535644531, "logps/chosen": -64.2121810913086, "logps/ref_chosen": -68.21361541748047, "logps/ref_rejected": -76.20640563964844, "logps/rejected": -74.19707489013672, "loss": 1.2384, "margin_dpo/margin_mean": 1.9921071529388428, "margin_dpo/margin_std": 4.457557678222656, "step": 133 }, { "epoch": 0.20256991685563114, "grad_norm": 15.944921493530273, "learning_rate": 4.849231551964771e-07, "logits/chosen": 1.9915122985839844, "logits/rejected": 1.9212216138839722, "logps/chosen": -72.64193725585938, "logps/ref_chosen": -76.32701110839844, "logps/ref_rejected": -94.63996887207031, "logps/rejected": -93.01493072509766, "loss": 1.1653, "margin_dpo/margin_mean": 2.0600321292877197, "margin_dpo/margin_std": 2.2377896308898926, "step": 134 }, { "epoch": 0.20408163265306123, "grad_norm": 16.158836364746094, "learning_rate": 4.844676460754862e-07, "logits/chosen": 2.00465726852417, "logits/rejected": 2.0386109352111816, "logps/chosen": -71.79452514648438, "logps/ref_chosen": -76.53942108154297, "logps/ref_rejected": -82.97396850585938, "logps/rejected": -78.98849487304688, "loss": 1.235, "margin_dpo/margin_mean": 0.7594242691993713, "margin_dpo/margin_std": 3.577526569366455, "step": 135 }, { "epoch": 0.20559334845049132, "grad_norm": 18.05194664001465, "learning_rate": 4.840055783904106e-07, "logits/chosen": 1.3569614887237549, "logits/rejected": 1.530954122543335, "logps/chosen": -98.23109436035156, "logps/ref_chosen": -101.48860931396484, "logps/ref_rejected": -78.98468017578125, "logps/rejected": -75.04344177246094, "loss": 1.23, "margin_dpo/margin_mean": -0.6837238073348999, "margin_dpo/margin_std": 4.982443809509277, "step": 136 }, { "epoch": 0.20710506424792138, "grad_norm": 15.806459426879883, "learning_rate": 4.835369650662767e-07, "logits/chosen": 1.4854345321655273, "logits/rejected": 1.3372095823287964, "logps/chosen": -88.92061614990234, "logps/ref_chosen": -94.42204284667969, "logps/ref_rejected": -97.13616180419922, "logps/rejected": -96.58726501464844, "loss": 1.1929, "margin_dpo/margin_mean": 4.952520370483398, "margin_dpo/margin_std": 5.966933727264404, "step": 137 }, { "epoch": 0.20861678004535147, "grad_norm": 16.874452590942383, "learning_rate": 4.830618192112065e-07, "logits/chosen": 1.3104310035705566, "logits/rejected": 1.3656392097473145, "logps/chosen": -104.93235778808594, "logps/ref_chosen": -107.82279205322266, "logps/ref_rejected": -82.18955993652344, "logps/rejected": -79.95216369628906, "loss": 1.2658, "margin_dpo/margin_mean": 0.6530355215072632, "margin_dpo/margin_std": 4.397375583648682, "step": 138 }, { "epoch": 0.21012849584278157, "grad_norm": 20.055728912353516, "learning_rate": 4.825801541160509e-07, "logits/chosen": 1.2643449306488037, "logits/rejected": 1.2432670593261719, "logps/chosen": -73.41048431396484, "logps/ref_chosen": -77.69741821289062, "logps/ref_rejected": -72.66459655761719, "logps/rejected": -70.36746978759766, "loss": 1.2534, "margin_dpo/margin_mean": 1.9898031949996948, "margin_dpo/margin_std": 4.86124324798584, "step": 139 }, { "epoch": 0.21164021164021163, "grad_norm": 20.17422103881836, "learning_rate": 4.820919832540181e-07, "logits/chosen": 1.3549814224243164, "logits/rejected": 1.3895388841629028, "logps/chosen": -108.87055969238281, "logps/ref_chosen": -111.52936553955078, "logps/ref_rejected": -119.21971893310547, "logps/rejected": -118.42239379882812, "loss": 1.1586, "margin_dpo/margin_mean": 1.861487865447998, "margin_dpo/margin_std": 6.23953104019165, "step": 140 }, { "epoch": 0.21315192743764172, "grad_norm": 15.97652530670166, "learning_rate": 4.815973202802966e-07, "logits/chosen": 1.9773420095443726, "logits/rejected": 1.825194239616394, "logps/chosen": -60.81574249267578, "logps/ref_chosen": -64.82176208496094, "logps/ref_rejected": -103.89480590820312, "logps/rejected": -102.4252700805664, "loss": 1.1722, "margin_dpo/margin_mean": 2.5364794731140137, "margin_dpo/margin_std": 5.527141094207764, "step": 141 }, { "epoch": 0.2146636432350718, "grad_norm": 16.915544509887695, "learning_rate": 4.810961790316729e-07, "logits/chosen": 1.7422206401824951, "logits/rejected": 1.6755993366241455, "logps/chosen": -64.37892150878906, "logps/ref_chosen": -68.35072326660156, "logps/ref_rejected": -79.67445373535156, "logps/rejected": -77.58482360839844, "loss": 1.2352, "margin_dpo/margin_mean": 1.8821768760681152, "margin_dpo/margin_std": 5.143482208251953, "step": 142 }, { "epoch": 0.2161753590325019, "grad_norm": 20.867433547973633, "learning_rate": 4.805885735261454e-07, "logits/chosen": 1.949344515800476, "logits/rejected": 1.7887301445007324, "logps/chosen": -50.87135314941406, "logps/ref_chosen": -54.84930419921875, "logps/ref_rejected": -90.4713134765625, "logps/rejected": -90.53902435302734, "loss": 1.2805, "margin_dpo/margin_mean": 4.045658111572266, "margin_dpo/margin_std": 5.359964847564697, "step": 143 }, { "epoch": 0.21768707482993196, "grad_norm": 23.608102798461914, "learning_rate": 4.800745179625307e-07, "logits/chosen": 1.571341633796692, "logits/rejected": 1.5120694637298584, "logps/chosen": -60.12604522705078, "logps/ref_chosen": -62.28664016723633, "logps/ref_rejected": -69.8016357421875, "logps/rejected": -68.43238830566406, "loss": 1.3461, "margin_dpo/margin_mean": 0.7913510799407959, "margin_dpo/margin_std": 7.056728363037109, "step": 144 }, { "epoch": 0.21919879062736206, "grad_norm": 20.554088592529297, "learning_rate": 4.795540267200686e-07, "logits/chosen": 0.878929853439331, "logits/rejected": 0.8702086210250854, "logps/chosen": -90.45643615722656, "logps/ref_chosen": -92.72438049316406, "logps/ref_rejected": -90.39459228515625, "logps/rejected": -90.30056762695312, "loss": 1.2305, "margin_dpo/margin_mean": 2.173919916152954, "margin_dpo/margin_std": 6.899385452270508, "step": 145 }, { "epoch": 0.22071050642479215, "grad_norm": 18.752853393554688, "learning_rate": 4.790271143580173e-07, "logits/chosen": 1.2555538415908813, "logits/rejected": 1.3202041387557983, "logps/chosen": -81.19849395751953, "logps/ref_chosen": -81.83560180664062, "logps/ref_rejected": -78.86693572998047, "logps/rejected": -77.32716369628906, "loss": 1.2722, "margin_dpo/margin_mean": -0.902668833732605, "margin_dpo/margin_std": 5.9320068359375, "step": 146 }, { "epoch": 0.2222222222222222, "grad_norm": 20.7388916015625, "learning_rate": 4.784937956152489e-07, "logits/chosen": 1.2700107097625732, "logits/rejected": 1.2025418281555176, "logps/chosen": -86.18782043457031, "logps/ref_chosen": -87.66102600097656, "logps/ref_rejected": -109.97225952148438, "logps/rejected": -110.99523162841797, "loss": 1.2935, "margin_dpo/margin_mean": 2.4961836338043213, "margin_dpo/margin_std": 4.278651714324951, "step": 147 }, { "epoch": 0.2237339380196523, "grad_norm": 15.080001831054688, "learning_rate": 4.779540854098347e-07, "logits/chosen": 2.2249903678894043, "logits/rejected": 1.9831640720367432, "logps/chosen": -56.78398132324219, "logps/ref_chosen": -60.305946350097656, "logps/ref_rejected": -91.08148956298828, "logps/rejected": -90.39674377441406, "loss": 1.1726, "margin_dpo/margin_mean": 2.837217330932617, "margin_dpo/margin_std": 7.046616554260254, "step": 148 }, { "epoch": 0.2252456538170824, "grad_norm": 17.77558135986328, "learning_rate": 4.774079988386296e-07, "logits/chosen": 1.1505180597305298, "logits/rejected": 1.3012826442718506, "logps/chosen": -76.28553771972656, "logps/ref_chosen": -77.25098419189453, "logps/ref_rejected": -69.12332916259766, "logps/rejected": -68.49529266357422, "loss": 1.2551, "margin_dpo/margin_mean": 0.3374178409576416, "margin_dpo/margin_std": 5.971264839172363, "step": 149 }, { "epoch": 0.22675736961451248, "grad_norm": 18.043701171875, "learning_rate": 4.768555511768486e-07, "logits/chosen": 1.241929292678833, "logits/rejected": 1.2955282926559448, "logps/chosen": -89.82179260253906, "logps/ref_chosen": -91.05587768554688, "logps/ref_rejected": -101.52323913574219, "logps/rejected": -102.30039978027344, "loss": 1.1576, "margin_dpo/margin_mean": 2.0112457275390625, "margin_dpo/margin_std": 6.84970235824585, "step": 150 }, { "epoch": 0.22826908541194255, "grad_norm": 15.95711612701416, "learning_rate": 4.762967578776406e-07, "logits/chosen": 1.702085018157959, "logits/rejected": 1.47122061252594, "logps/chosen": -81.42080688476562, "logps/ref_chosen": -83.08059692382812, "logps/ref_rejected": -93.768310546875, "logps/rejected": -96.05892944335938, "loss": 1.0981, "margin_dpo/margin_mean": 3.95042085647583, "margin_dpo/margin_std": 6.472861289978027, "step": 151 }, { "epoch": 0.22978080120937264, "grad_norm": 18.49770164489746, "learning_rate": 4.757316345716553e-07, "logits/chosen": 1.429657220840454, "logits/rejected": 1.4212470054626465, "logps/chosen": -88.82245635986328, "logps/ref_chosen": -88.03974914550781, "logps/ref_rejected": -105.87544250488281, "logps/rejected": -109.1420669555664, "loss": 1.2167, "margin_dpo/margin_mean": 2.4839107990264893, "margin_dpo/margin_std": 7.031231880187988, "step": 152 }, { "epoch": 0.23129251700680273, "grad_norm": 16.97657585144043, "learning_rate": 4.751601970666064e-07, "logits/chosen": 0.8840553760528564, "logits/rejected": 0.7603949904441833, "logps/chosen": -99.16061401367188, "logps/ref_chosen": -98.43572998046875, "logps/ref_rejected": -96.981201171875, "logps/rejected": -102.25637817382812, "loss": 1.1709, "margin_dpo/margin_mean": 4.550297737121582, "margin_dpo/margin_std": 5.529523849487305, "step": 153 }, { "epoch": 0.2328042328042328, "grad_norm": 19.179428100585938, "learning_rate": 4.745824613468292e-07, "logits/chosen": 1.0210994482040405, "logits/rejected": 1.284121036529541, "logps/chosen": -98.15274810791016, "logps/ref_chosen": -96.41099548339844, "logps/ref_rejected": -66.02450561523438, "logps/rejected": -65.16929626464844, "loss": 1.2754, "margin_dpo/margin_mean": -2.596966505050659, "margin_dpo/margin_std": 7.598773956298828, "step": 154 }, { "epoch": 0.23431594860166288, "grad_norm": 23.84918785095215, "learning_rate": 4.7399844357283393e-07, "logits/chosen": 1.4591971635818481, "logits/rejected": 1.2901732921600342, "logps/chosen": -60.90550994873047, "logps/ref_chosen": -61.10433578491211, "logps/ref_rejected": -97.90666961669922, "logps/rejected": -101.56568908691406, "loss": 1.2091, "margin_dpo/margin_mean": 3.857840061187744, "margin_dpo/margin_std": 5.620019912719727, "step": 155 }, { "epoch": 0.23582766439909297, "grad_norm": 19.43852424621582, "learning_rate": 4.7340816008085305e-07, "logits/chosen": 1.3670084476470947, "logits/rejected": 1.6216658353805542, "logps/chosen": -116.94534301757812, "logps/ref_chosen": -117.28364562988281, "logps/ref_rejected": -100.88206481933594, "logps/rejected": -105.00944519042969, "loss": 1.1023, "margin_dpo/margin_mean": 4.465673446655273, "margin_dpo/margin_std": 7.877100467681885, "step": 156 }, { "epoch": 0.23733938019652306, "grad_norm": 16.724939346313477, "learning_rate": 4.728116273823847e-07, "logits/chosen": 1.1171612739562988, "logits/rejected": 1.076847791671753, "logps/chosen": -75.83224487304688, "logps/ref_chosen": -76.55464172363281, "logps/ref_rejected": -93.70323181152344, "logps/rejected": -95.55250549316406, "loss": 1.2364, "margin_dpo/margin_mean": 2.5716757774353027, "margin_dpo/margin_std": 6.660999298095703, "step": 157 }, { "epoch": 0.23885109599395313, "grad_norm": 18.57274627685547, "learning_rate": 4.7220886216373085e-07, "logits/chosen": 1.405949354171753, "logits/rejected": 1.3929505348205566, "logps/chosen": -82.99036407470703, "logps/ref_chosen": -85.79928588867188, "logps/ref_rejected": -84.39836120605469, "logps/rejected": -86.05177307128906, "loss": 1.2067, "margin_dpo/margin_mean": 4.462333679199219, "margin_dpo/margin_std": 6.65220832824707, "step": 158 }, { "epoch": 0.24036281179138322, "grad_norm": 16.5221004486084, "learning_rate": 4.715998812855304e-07, "logits/chosen": 1.5438390970230103, "logits/rejected": 1.5307790040969849, "logps/chosen": -75.2174072265625, "logps/ref_chosen": -77.95636749267578, "logps/ref_rejected": -118.15950775146484, "logps/rejected": -121.49700927734375, "loss": 1.1418, "margin_dpo/margin_mean": 6.076457977294922, "margin_dpo/margin_std": 7.538599967956543, "step": 159 }, { "epoch": 0.2418745275888133, "grad_norm": 16.059919357299805, "learning_rate": 4.7098470178228755e-07, "logits/chosen": 0.9960245490074158, "logits/rejected": 0.914771318435669, "logps/chosen": -64.31304931640625, "logps/ref_chosen": -66.919189453125, "logps/ref_rejected": -82.74856567382812, "logps/rejected": -85.53584289550781, "loss": 1.1785, "margin_dpo/margin_mean": 5.393423557281494, "margin_dpo/margin_std": 6.3595781326293945, "step": 160 }, { "epoch": 0.24338624338624337, "grad_norm": 17.40558624267578, "learning_rate": 4.703633408618955e-07, "logits/chosen": 1.7136876583099365, "logits/rejected": 1.541985034942627, "logps/chosen": -66.63119506835938, "logps/ref_chosen": -68.49119567871094, "logps/ref_rejected": -113.86795806884766, "logps/rejected": -114.09303283691406, "loss": 1.1743, "margin_dpo/margin_mean": 2.0850729942321777, "margin_dpo/margin_std": 6.261900901794434, "step": 161 }, { "epoch": 0.24489795918367346, "grad_norm": 17.64971351623535, "learning_rate": 4.697358159051549e-07, "logits/chosen": 1.6097979545593262, "logits/rejected": 1.636500597000122, "logps/chosen": -115.13954162597656, "logps/ref_chosen": -114.65072631835938, "logps/ref_rejected": -118.04731750488281, "logps/rejected": -123.51725006103516, "loss": 1.0305, "margin_dpo/margin_mean": 4.9811177253723145, "margin_dpo/margin_std": 7.989214897155762, "step": 162 }, { "epoch": 0.24640967498110355, "grad_norm": 16.740474700927734, "learning_rate": 4.691021444652876e-07, "logits/chosen": 1.5232549905776978, "logits/rejected": 1.0545259714126587, "logps/chosen": -66.47976684570312, "logps/ref_chosen": -69.32765197753906, "logps/ref_rejected": -125.4322509765625, "logps/rejected": -127.10546875, "loss": 1.1371, "margin_dpo/margin_mean": 4.521094799041748, "margin_dpo/margin_std": 6.330313682556152, "step": 163 }, { "epoch": 0.24792139077853365, "grad_norm": 17.046655654907227, "learning_rate": 4.6846234426744624e-07, "logits/chosen": 1.4960196018218994, "logits/rejected": 1.188324213027954, "logps/chosen": -71.30110931396484, "logps/ref_chosen": -74.86119842529297, "logps/ref_rejected": -110.70217895507812, "logps/rejected": -111.56536865234375, "loss": 1.0828, "margin_dpo/margin_mean": 4.423286437988281, "margin_dpo/margin_std": 6.5512800216674805, "step": 164 }, { "epoch": 0.2494331065759637, "grad_norm": 16.380630493164062, "learning_rate": 4.678164332082175e-07, "logits/chosen": 1.545209527015686, "logits/rejected": 1.6276023387908936, "logps/chosen": -70.11854553222656, "logps/ref_chosen": -73.35094451904297, "logps/ref_rejected": -72.454345703125, "logps/rejected": -75.42884826660156, "loss": 1.0834, "margin_dpo/margin_mean": 6.206894874572754, "margin_dpo/margin_std": 6.151324272155762, "step": 165 }, { "epoch": 0.2509448223733938, "grad_norm": 18.689327239990234, "learning_rate": 4.6716442935512214e-07, "logits/chosen": 1.5046651363372803, "logits/rejected": 1.3802664279937744, "logps/chosen": -89.84822082519531, "logps/ref_chosen": -92.13763427734375, "logps/ref_rejected": -116.38691711425781, "logps/rejected": -118.75823974609375, "loss": 1.1838, "margin_dpo/margin_mean": 4.660735130310059, "margin_dpo/margin_std": 7.852128982543945, "step": 166 }, { "epoch": 0.25245653817082386, "grad_norm": 16.379173278808594, "learning_rate": 4.6650635094610966e-07, "logits/chosen": 1.4401094913482666, "logits/rejected": 1.2367253303527832, "logps/chosen": -49.68608856201172, "logps/ref_chosen": -53.23297119140625, "logps/ref_rejected": -88.27882385253906, "logps/rejected": -88.45932006835938, "loss": 1.1119, "margin_dpo/margin_mean": 3.72739315032959, "margin_dpo/margin_std": 5.302708148956299, "step": 167 }, { "epoch": 0.25396825396825395, "grad_norm": 16.850658416748047, "learning_rate": 4.6584221638904767e-07, "logits/chosen": 1.336973786354065, "logits/rejected": 1.4287123680114746, "logps/chosen": -97.93656921386719, "logps/ref_chosen": -99.15953063964844, "logps/ref_rejected": -73.91177368164062, "logps/rejected": -74.542724609375, "loss": 1.1524, "margin_dpo/margin_mean": 1.8539037704467773, "margin_dpo/margin_std": 5.620620250701904, "step": 168 }, { "epoch": 0.25547996976568405, "grad_norm": 18.042985916137695, "learning_rate": 4.651720442612075e-07, "logits/chosen": 1.4673585891723633, "logits/rejected": 1.435117244720459, "logps/chosen": -79.04154968261719, "logps/ref_chosen": -81.87686157226562, "logps/ref_rejected": -94.59857177734375, "logps/rejected": -94.93849182128906, "loss": 1.1062, "margin_dpo/margin_mean": 3.1752333641052246, "margin_dpo/margin_std": 6.8811421394348145, "step": 169 }, { "epoch": 0.25699168556311414, "grad_norm": 19.400802612304688, "learning_rate": 4.6449585330874425e-07, "logits/chosen": 1.3476133346557617, "logits/rejected": 1.4811618328094482, "logps/chosen": -60.58789825439453, "logps/ref_chosen": -65.82147216796875, "logps/ref_rejected": -61.918582916259766, "logps/rejected": -61.75260925292969, "loss": 1.3282, "margin_dpo/margin_mean": 5.067604064941406, "margin_dpo/margin_std": 9.479719161987305, "step": 170 }, { "epoch": 0.2585034013605442, "grad_norm": 17.081016540527344, "learning_rate": 4.6381366244617224e-07, "logits/chosen": 2.3293089866638184, "logits/rejected": 2.250077247619629, "logps/chosen": -59.310665130615234, "logps/ref_chosen": -67.25495910644531, "logps/ref_rejected": -85.13330078125, "logps/rejected": -86.42543029785156, "loss": 1.1209, "margin_dpo/margin_mean": 9.236425399780273, "margin_dpo/margin_std": 5.1870927810668945, "step": 171 }, { "epoch": 0.2600151171579743, "grad_norm": 16.66016387939453, "learning_rate": 4.631254907558365e-07, "logits/chosen": 2.1986536979675293, "logits/rejected": 2.0402820110321045, "logps/chosen": -71.0000228881836, "logps/ref_chosen": -72.16639709472656, "logps/ref_rejected": -111.55525970458984, "logps/rejected": -116.75547790527344, "loss": 1.1317, "margin_dpo/margin_mean": 6.366595268249512, "margin_dpo/margin_std": 7.718157768249512, "step": 172 }, { "epoch": 0.2615268329554044, "grad_norm": 20.126100540161133, "learning_rate": 4.624313574873786e-07, "logits/chosen": 1.292959213256836, "logits/rejected": 1.320690631866455, "logps/chosen": -61.82426071166992, "logps/ref_chosen": -66.64970397949219, "logps/ref_rejected": -62.34739685058594, "logps/rejected": -60.505615234375, "loss": 1.1968, "margin_dpo/margin_mean": 2.983661651611328, "margin_dpo/margin_std": 9.148659706115723, "step": 173 }, { "epoch": 0.26303854875283444, "grad_norm": 17.00262451171875, "learning_rate": 4.61731282057198e-07, "logits/chosen": 1.2794065475463867, "logits/rejected": 0.9840250015258789, "logps/chosen": -78.53173828125, "logps/ref_chosen": -84.24971008300781, "logps/ref_rejected": -124.96263122558594, "logps/rejected": -127.81623840332031, "loss": 1.0793, "margin_dpo/margin_mean": 8.571582794189453, "margin_dpo/margin_std": 7.489891052246094, "step": 174 }, { "epoch": 0.26455026455026454, "grad_norm": 20.59722900390625, "learning_rate": 4.6102528404790965e-07, "logits/chosen": 2.1873984336853027, "logits/rejected": 1.9818472862243652, "logps/chosen": -53.501182556152344, "logps/ref_chosen": -55.41690444946289, "logps/ref_rejected": -99.29142761230469, "logps/rejected": -102.10591125488281, "loss": 1.1673, "margin_dpo/margin_mean": 4.730203628540039, "margin_dpo/margin_std": 7.9715728759765625, "step": 175 }, { "epoch": 0.2660619803476946, "grad_norm": 20.261892318725586, "learning_rate": 4.603133832077953e-07, "logits/chosen": 1.8204588890075684, "logits/rejected": 1.7608739137649536, "logps/chosen": -102.83901977539062, "logps/ref_chosen": -100.96086120605469, "logps/ref_rejected": -99.77841186523438, "logps/rejected": -103.91777038574219, "loss": 1.2888, "margin_dpo/margin_mean": 2.2612133026123047, "margin_dpo/margin_std": 10.7415771484375, "step": 176 }, { "epoch": 0.2675736961451247, "grad_norm": 20.35191535949707, "learning_rate": 4.5959559945025183e-07, "logits/chosen": 1.7997978925704956, "logits/rejected": 1.7066614627838135, "logps/chosen": -69.72757720947266, "logps/ref_chosen": -73.72810363769531, "logps/ref_rejected": -92.87637329101562, "logps/rejected": -96.79242706298828, "loss": 0.878, "margin_dpo/margin_mean": 7.916572570800781, "margin_dpo/margin_std": 9.964702606201172, "step": 177 }, { "epoch": 0.2690854119425548, "grad_norm": 17.19330596923828, "learning_rate": 4.588719528532341e-07, "logits/chosen": 1.5731549263000488, "logits/rejected": 1.3723223209381104, "logps/chosen": -48.62438201904297, "logps/ref_chosen": -50.617286682128906, "logps/ref_rejected": -86.31294250488281, "logps/rejected": -90.3222885131836, "loss": 1.0496, "margin_dpo/margin_mean": 6.0022501945495605, "margin_dpo/margin_std": 6.132457733154297, "step": 178 }, { "epoch": 0.2705971277399849, "grad_norm": 18.006078720092773, "learning_rate": 4.581424636586928e-07, "logits/chosen": 1.6818873882293701, "logits/rejected": 1.745295763015747, "logps/chosen": -104.23760223388672, "logps/ref_chosen": -106.63618469238281, "logps/ref_rejected": -91.5908203125, "logps/rejected": -93.96669006347656, "loss": 1.1744, "margin_dpo/margin_mean": 4.774440765380859, "margin_dpo/margin_std": 8.780207633972168, "step": 179 }, { "epoch": 0.272108843537415, "grad_norm": 16.850460052490234, "learning_rate": 4.5740715227200897e-07, "logits/chosen": 1.0431925058364868, "logits/rejected": 0.7515966892242432, "logps/chosen": -58.83776092529297, "logps/ref_chosen": -62.49567413330078, "logps/ref_rejected": -92.67909240722656, "logps/rejected": -92.8590316772461, "loss": 1.1516, "margin_dpo/margin_mean": 3.837852716445923, "margin_dpo/margin_std": 9.029447555541992, "step": 180 }, { "epoch": 0.273620559334845, "grad_norm": 20.78750991821289, "learning_rate": 4.566660392614228e-07, "logits/chosen": 1.161057472229004, "logits/rejected": 1.07468581199646, "logps/chosen": -76.5954360961914, "logps/ref_chosen": -81.58159637451172, "logps/ref_rejected": -88.62760925292969, "logps/rejected": -85.45721435546875, "loss": 1.0522, "margin_dpo/margin_mean": 1.8157556056976318, "margin_dpo/margin_std": 4.802523612976074, "step": 181 }, { "epoch": 0.2751322751322751, "grad_norm": 19.17080307006836, "learning_rate": 4.5591914535745817e-07, "logits/chosen": 1.5764813423156738, "logits/rejected": 1.2571051120758057, "logps/chosen": -63.258296966552734, "logps/ref_chosen": -66.75, "logps/ref_rejected": -119.2486343383789, "logps/rejected": -123.26068115234375, "loss": 0.9894, "margin_dpo/margin_mean": 7.503762245178223, "margin_dpo/margin_std": 7.691174030303955, "step": 182 }, { "epoch": 0.2766439909297052, "grad_norm": 21.05959701538086, "learning_rate": 4.551664914523433e-07, "logits/chosen": 1.2917943000793457, "logits/rejected": 1.3505566120147705, "logps/chosen": -78.94085693359375, "logps/ref_chosen": -79.91377258300781, "logps/ref_rejected": -79.8192138671875, "logps/rejected": -79.86100006103516, "loss": 1.315, "margin_dpo/margin_mean": 1.014700174331665, "margin_dpo/margin_std": 7.933160305023193, "step": 183 }, { "epoch": 0.2781557067271353, "grad_norm": 15.321456909179688, "learning_rate": 4.544080985994258e-07, "logits/chosen": 1.6006574630737305, "logits/rejected": 1.5478490591049194, "logps/chosen": -70.38871765136719, "logps/ref_chosen": -74.22654724121094, "logps/ref_rejected": -76.7347640991211, "logps/rejected": -75.71514892578125, "loss": 1.0574, "margin_dpo/margin_mean": 2.8182148933410645, "margin_dpo/margin_std": 5.403117656707764, "step": 184 }, { "epoch": 0.2796674225245654, "grad_norm": 16.406084060668945, "learning_rate": 4.5364398801258394e-07, "logits/chosen": 1.9769668579101562, "logits/rejected": 1.9131214618682861, "logps/chosen": -86.2106704711914, "logps/ref_chosen": -89.63931274414062, "logps/ref_rejected": -81.96051025390625, "logps/rejected": -85.78184509277344, "loss": 1.169, "margin_dpo/margin_mean": 7.249977111816406, "margin_dpo/margin_std": 11.581094741821289, "step": 185 }, { "epoch": 0.2811791383219955, "grad_norm": 18.755495071411133, "learning_rate": 4.5287418106563354e-07, "logits/chosen": 1.3081080913543701, "logits/rejected": 1.06687331199646, "logps/chosen": -81.57020568847656, "logps/ref_chosen": -83.2655029296875, "logps/ref_rejected": -117.07292938232422, "logps/rejected": -123.5093765258789, "loss": 1.1446, "margin_dpo/margin_mean": 8.131747245788574, "margin_dpo/margin_std": 10.391054153442383, "step": 186 }, { "epoch": 0.28269085411942557, "grad_norm": 21.23556900024414, "learning_rate": 4.520986992917297e-07, "logits/chosen": 1.4633792638778687, "logits/rejected": 1.2260875701904297, "logps/chosen": -101.74247741699219, "logps/ref_chosen": -101.21977233886719, "logps/ref_rejected": -144.55734252929688, "logps/rejected": -152.8934326171875, "loss": 1.1127, "margin_dpo/margin_mean": 7.813370227813721, "margin_dpo/margin_std": 8.304927825927734, "step": 187 }, { "epoch": 0.2842025699168556, "grad_norm": 19.057212829589844, "learning_rate": 4.5131756438276466e-07, "logits/chosen": 1.8055870532989502, "logits/rejected": 1.5104811191558838, "logps/chosen": -77.42826843261719, "logps/ref_chosen": -79.15013122558594, "logps/ref_rejected": -107.39227294921875, "logps/rejected": -112.58609008789062, "loss": 1.1009, "margin_dpo/margin_mean": 6.9156813621521, "margin_dpo/margin_std": 8.73320198059082, "step": 188 }, { "epoch": 0.2857142857142857, "grad_norm": 23.79570960998535, "learning_rate": 4.5053079818876096e-07, "logits/chosen": 1.2658448219299316, "logits/rejected": 1.3400977849960327, "logps/chosen": -108.75637817382812, "logps/ref_chosen": -106.874755859375, "logps/ref_rejected": -83.47657775878906, "logps/rejected": -85.76707458496094, "loss": 1.237, "margin_dpo/margin_mean": 0.40888702869415283, "margin_dpo/margin_std": 10.766637802124023, "step": 189 }, { "epoch": 0.2872260015117158, "grad_norm": 21.95711898803711, "learning_rate": 4.4973842271726024e-07, "logits/chosen": 1.7546024322509766, "logits/rejected": 1.3498448133468628, "logps/chosen": -60.736053466796875, "logps/ref_chosen": -61.45669174194336, "logps/ref_rejected": -102.12944030761719, "logps/rejected": -107.77656555175781, "loss": 0.975, "margin_dpo/margin_mean": 6.36777400970459, "margin_dpo/margin_std": 10.653533935546875, "step": 190 }, { "epoch": 0.2887377173091459, "grad_norm": 22.683448791503906, "learning_rate": 4.48940460132708e-07, "logits/chosen": 2.0132603645324707, "logits/rejected": 1.8457577228546143, "logps/chosen": -80.98448181152344, "logps/ref_chosen": -82.37984466552734, "logps/ref_rejected": -104.54153442382812, "logps/rejected": -111.88583374023438, "loss": 1.1948, "margin_dpo/margin_mean": 8.739660263061523, "margin_dpo/margin_std": 9.778307914733887, "step": 191 }, { "epoch": 0.29024943310657597, "grad_norm": 16.572105407714844, "learning_rate": 4.481369327558329e-07, "logits/chosen": 1.6232538223266602, "logits/rejected": 1.6212327480316162, "logps/chosen": -84.11581420898438, "logps/ref_chosen": -79.55016326904297, "logps/ref_rejected": -73.93505859375, "logps/rejected": -78.04134368896484, "loss": 1.2442, "margin_dpo/margin_mean": -0.4593625068664551, "margin_dpo/margin_std": 8.584783554077148, "step": 192 }, { "epoch": 0.29176114890400606, "grad_norm": 16.37459373474121, "learning_rate": 4.47327863063023e-07, "logits/chosen": 1.3832461833953857, "logits/rejected": 1.2441718578338623, "logps/chosen": -70.2186050415039, "logps/ref_chosen": -67.54620361328125, "logps/ref_rejected": -86.98448944091797, "logps/rejected": -92.91549682617188, "loss": 1.018, "margin_dpo/margin_mean": 3.2586112022399902, "margin_dpo/margin_std": 6.868709564208984, "step": 193 }, { "epoch": 0.29327286470143615, "grad_norm": 20.786998748779297, "learning_rate": 4.4651327368569684e-07, "logits/chosen": 1.2682421207427979, "logits/rejected": 1.3284062147140503, "logps/chosen": -100.77994537353516, "logps/ref_chosen": -95.66322326660156, "logps/ref_rejected": -84.61515808105469, "logps/rejected": -90.30882263183594, "loss": 1.3303, "margin_dpo/margin_mean": 0.5769485235214233, "margin_dpo/margin_std": 7.857730865478516, "step": 194 }, { "epoch": 0.2947845804988662, "grad_norm": 20.121692657470703, "learning_rate": 4.4569318740967043e-07, "logits/chosen": 1.1212238073349, "logits/rejected": 1.2974833250045776, "logps/chosen": -105.86001586914062, "logps/ref_chosen": -100.59367370605469, "logps/ref_rejected": -85.27893829345703, "logps/rejected": -94.16304016113281, "loss": 1.1038, "margin_dpo/margin_mean": 3.617755651473999, "margin_dpo/margin_std": 10.129316329956055, "step": 195 }, { "epoch": 0.2962962962962963, "grad_norm": 17.283586502075195, "learning_rate": 4.448676271745197e-07, "logits/chosen": 1.6002600193023682, "logits/rejected": 1.6138341426849365, "logps/chosen": -96.85749816894531, "logps/ref_chosen": -93.04997253417969, "logps/ref_rejected": -94.47862243652344, "logps/rejected": -99.78226470947266, "loss": 1.17, "margin_dpo/margin_mean": 1.496113657951355, "margin_dpo/margin_std": 8.110875129699707, "step": 196 }, { "epoch": 0.29780801209372637, "grad_norm": 19.724607467651367, "learning_rate": 4.440366160729392e-07, "logits/chosen": 2.238926887512207, "logits/rejected": 1.7660119533538818, "logps/chosen": -53.97135925292969, "logps/ref_chosen": -51.194610595703125, "logps/ref_rejected": -92.28016662597656, "logps/rejected": -98.43435668945312, "loss": 1.1567, "margin_dpo/margin_mean": 3.3774306774139404, "margin_dpo/margin_std": 11.828231811523438, "step": 197 }, { "epoch": 0.29931972789115646, "grad_norm": 16.924100875854492, "learning_rate": 4.432001773500957e-07, "logits/chosen": 1.7960355281829834, "logits/rejected": 1.6403706073760986, "logps/chosen": -62.66572952270508, "logps/ref_chosen": -62.816810607910156, "logps/ref_rejected": -81.2480697631836, "logps/rejected": -88.56864929199219, "loss": 1.0528, "margin_dpo/margin_mean": 7.4716596603393555, "margin_dpo/margin_std": 7.589710235595703, "step": 198 }, { "epoch": 0.30083144368858655, "grad_norm": 18.70924949645996, "learning_rate": 4.4235833440297856e-07, "logits/chosen": 1.4740800857543945, "logits/rejected": 1.1325812339782715, "logps/chosen": -76.22238159179688, "logps/ref_chosen": -77.17382049560547, "logps/ref_rejected": -90.44135284423828, "logps/rejected": -98.39042663574219, "loss": 1.1997, "margin_dpo/margin_mean": 8.900522232055664, "margin_dpo/margin_std": 8.892587661743164, "step": 199 }, { "epoch": 0.30234315948601664, "grad_norm": 18.741064071655273, "learning_rate": 4.415111107797445e-07, "logits/chosen": 1.5921845436096191, "logits/rejected": 0.9971798658370972, "logps/chosen": -60.24110794067383, "logps/ref_chosen": -63.23415756225586, "logps/ref_rejected": -136.13055419921875, "logps/rejected": -140.66961669921875, "loss": 1.1498, "margin_dpo/margin_mean": 7.532114505767822, "margin_dpo/margin_std": 8.64643383026123, "step": 200 }, { "epoch": 0.30234315948601664, "eval_logits/chosen": 1.412081241607666, "eval_logits/rejected": 1.2978190183639526, "eval_logps/chosen": -88.1021957397461, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -101.35804748535156, "eval_loss": 0.5566064119338989, "eval_margin_dpo/margin_mean": 5.340742111206055, "eval_margin_dpo/margin_std": 9.015287399291992, "eval_runtime": 42.8673, "eval_samples_per_second": 53.724, "eval_steps_per_second": 1.68, "step": 200 }, { "epoch": 0.30385487528344673, "grad_norm": 18.564964294433594, "learning_rate": 4.4065853017905953e-07, "logits/chosen": 1.8663321733474731, "logits/rejected": 1.916215181350708, "logps/chosen": -99.33221435546875, "logps/ref_chosen": -95.81477355957031, "logps/ref_rejected": -90.6773681640625, "logps/rejected": -98.91471862792969, "loss": 1.0373, "margin_dpo/margin_mean": 4.719921112060547, "margin_dpo/margin_std": 6.383031845092773, "step": 201 }, { "epoch": 0.30536659108087677, "grad_norm": 21.8380069732666, "learning_rate": 4.3980061644943575e-07, "logits/chosen": 1.2910232543945312, "logits/rejected": 0.8727235794067383, "logps/chosen": -42.05162048339844, "logps/ref_chosen": -46.55683898925781, "logps/ref_rejected": -86.56182098388672, "logps/rejected": -91.80551147460938, "loss": 1.0821, "margin_dpo/margin_mean": 9.748905181884766, "margin_dpo/margin_std": 7.802475452423096, "step": 202 }, { "epoch": 0.30687830687830686, "grad_norm": 19.430665969848633, "learning_rate": 4.3893739358856455e-07, "logits/chosen": 2.0921430587768555, "logits/rejected": 1.5788099765777588, "logps/chosen": -78.43504333496094, "logps/ref_chosen": -78.54730224609375, "logps/ref_rejected": -148.39633178710938, "logps/rejected": -154.70986938476562, "loss": 1.0561, "margin_dpo/margin_mean": 6.425786972045898, "margin_dpo/margin_std": 9.921998977661133, "step": 203 }, { "epoch": 0.30839002267573695, "grad_norm": 19.065616607666016, "learning_rate": 4.380688857426449e-07, "logits/chosen": 1.371058464050293, "logits/rejected": 1.0024534463882446, "logps/chosen": -56.5880241394043, "logps/ref_chosen": -57.55014419555664, "logps/ref_rejected": -111.8753890991211, "logps/rejected": -118.48810577392578, "loss": 1.0365, "margin_dpo/margin_mean": 7.5748443603515625, "margin_dpo/margin_std": 10.04115104675293, "step": 204 }, { "epoch": 0.30990173847316704, "grad_norm": 21.128253936767578, "learning_rate": 4.3719511720570814e-07, "logits/chosen": 2.1554577350616455, "logits/rejected": 1.9470274448394775, "logps/chosen": -74.37040710449219, "logps/ref_chosen": -77.85409545898438, "logps/ref_rejected": -117.59823608398438, "logps/rejected": -123.0240249633789, "loss": 1.1943, "margin_dpo/margin_mean": 8.909467697143555, "margin_dpo/margin_std": 10.569025993347168, "step": 205 }, { "epoch": 0.31141345427059713, "grad_norm": 20.427236557006836, "learning_rate": 4.363161124189387e-07, "logits/chosen": 2.162990093231201, "logits/rejected": 1.9166287183761597, "logps/chosen": -63.64014434814453, "logps/ref_chosen": -64.427734375, "logps/ref_rejected": -111.59812927246094, "logps/rejected": -114.05654907226562, "loss": 1.2628, "margin_dpo/margin_mean": 3.2460155487060547, "margin_dpo/margin_std": 7.409453392028809, "step": 206 }, { "epoch": 0.3129251700680272, "grad_norm": 20.021848678588867, "learning_rate": 4.3543189596998986e-07, "logits/chosen": 1.4798643589019775, "logits/rejected": 1.1525617837905884, "logps/chosen": -59.04290771484375, "logps/ref_chosen": -57.09748840332031, "logps/ref_rejected": -91.08357238769531, "logps/rejected": -99.3094482421875, "loss": 1.0718, "margin_dpo/margin_mean": 6.280452251434326, "margin_dpo/margin_std": 9.402422904968262, "step": 207 }, { "epoch": 0.3144368858654573, "grad_norm": 19.99827003479004, "learning_rate": 4.3454249259229664e-07, "logits/chosen": 1.5373902320861816, "logits/rejected": 1.4672069549560547, "logps/chosen": -89.19135284423828, "logps/ref_chosen": -89.90771484375, "logps/ref_rejected": -113.57611083984375, "logps/rejected": -115.72317504882812, "loss": 1.3043, "margin_dpo/margin_mean": 2.863447666168213, "margin_dpo/margin_std": 9.252091407775879, "step": 208 }, { "epoch": 0.31594860166288735, "grad_norm": 19.299758911132812, "learning_rate": 4.336479271643833e-07, "logits/chosen": 1.6641685962677002, "logits/rejected": 1.5046910047531128, "logps/chosen": -86.4522476196289, "logps/ref_chosen": -86.69661712646484, "logps/ref_rejected": -113.98966979980469, "logps/rejected": -121.7276382446289, "loss": 1.0143, "margin_dpo/margin_mean": 7.9823384284973145, "margin_dpo/margin_std": 12.627361297607422, "step": 209 }, { "epoch": 0.31746031746031744, "grad_norm": 18.67994499206543, "learning_rate": 4.327482247091679e-07, "logits/chosen": 1.7544140815734863, "logits/rejected": 1.201540470123291, "logps/chosen": -100.80335998535156, "logps/ref_chosen": -101.39966583251953, "logps/ref_rejected": -162.4595947265625, "logps/rejected": -173.04063415527344, "loss": 0.992, "margin_dpo/margin_mean": 11.177356719970703, "margin_dpo/margin_std": 7.491905212402344, "step": 210 }, { "epoch": 0.31897203325774753, "grad_norm": 17.943483352661133, "learning_rate": 4.3184341039326217e-07, "logits/chosen": 1.9597463607788086, "logits/rejected": 1.5422741174697876, "logps/chosen": -61.412994384765625, "logps/ref_chosen": -63.88182067871094, "logps/ref_rejected": -108.5023193359375, "logps/rejected": -109.9597396850586, "loss": 1.0679, "margin_dpo/margin_mean": 3.926234722137451, "margin_dpo/margin_std": 6.603672027587891, "step": 211 }, { "epoch": 0.3204837490551776, "grad_norm": 18.866775512695312, "learning_rate": 4.309335095262675e-07, "logits/chosen": 1.6322221755981445, "logits/rejected": 1.698617935180664, "logps/chosen": -72.1840591430664, "logps/ref_chosen": -76.98751068115234, "logps/ref_rejected": -86.16354370117188, "logps/rejected": -90.28890991210938, "loss": 1.0301, "margin_dpo/margin_mean": 8.92881965637207, "margin_dpo/margin_std": 11.511048316955566, "step": 212 }, { "epoch": 0.3219954648526077, "grad_norm": 18.231098175048828, "learning_rate": 4.3001854756006724e-07, "logits/chosen": 1.0812079906463623, "logits/rejected": 1.3234604597091675, "logps/chosen": -94.9277114868164, "logps/ref_chosen": -96.49127197265625, "logps/ref_rejected": -83.63804626464844, "logps/rejected": -84.5646743774414, "loss": 1.1382, "margin_dpo/margin_mean": 2.4901845455169678, "margin_dpo/margin_std": 10.574831008911133, "step": 213 }, { "epoch": 0.3235071806500378, "grad_norm": 25.323326110839844, "learning_rate": 4.290985500881143e-07, "logits/chosen": 1.247567892074585, "logits/rejected": 1.4265937805175781, "logps/chosen": -78.91569519042969, "logps/ref_chosen": -82.22032165527344, "logps/ref_rejected": -75.47291564941406, "logps/rejected": -76.83289337158203, "loss": 1.1106, "margin_dpo/margin_mean": 4.664595127105713, "margin_dpo/margin_std": 6.5264739990234375, "step": 214 }, { "epoch": 0.3250188964474679, "grad_norm": 17.00086784362793, "learning_rate": 4.281735428447157e-07, "logits/chosen": 1.1786690950393677, "logits/rejected": 0.9956706166267395, "logps/chosen": -76.308837890625, "logps/ref_chosen": -77.30160522460938, "logps/ref_rejected": -106.88172149658203, "logps/rejected": -110.83109283447266, "loss": 0.9753, "margin_dpo/margin_mean": 4.942141056060791, "margin_dpo/margin_std": 8.318867683410645, "step": 215 }, { "epoch": 0.32653061224489793, "grad_norm": 17.5762882232666, "learning_rate": 4.2724355170431247e-07, "logits/chosen": 2.2342000007629395, "logits/rejected": 2.109405755996704, "logps/chosen": -102.64369201660156, "logps/ref_chosen": -102.14741516113281, "logps/ref_rejected": -122.47714233398438, "logps/rejected": -131.19338989257812, "loss": 1.0855, "margin_dpo/margin_mean": 8.21996784210205, "margin_dpo/margin_std": 5.978784561157227, "step": 216 }, { "epoch": 0.328042328042328, "grad_norm": 18.058996200561523, "learning_rate": 4.26308602680756e-07, "logits/chosen": 2.0615530014038086, "logits/rejected": 1.7113773822784424, "logps/chosen": -81.50210571289062, "logps/ref_chosen": -78.81869506835938, "logps/ref_rejected": -129.14828491210938, "logps/rejected": -139.06570434570312, "loss": 1.0823, "margin_dpo/margin_mean": 7.234000205993652, "margin_dpo/margin_std": 12.859729766845703, "step": 217 }, { "epoch": 0.3295540438397581, "grad_norm": 19.657135009765625, "learning_rate": 4.253687219265803e-07, "logits/chosen": 1.4448516368865967, "logits/rejected": 1.2799259424209595, "logps/chosen": -114.81224822998047, "logps/ref_chosen": -112.6024398803711, "logps/ref_rejected": -119.45700073242188, "logps/rejected": -123.72541809082031, "loss": 1.2687, "margin_dpo/margin_mean": 2.058602809906006, "margin_dpo/margin_std": 7.4347453117370605, "step": 218 }, { "epoch": 0.3310657596371882, "grad_norm": 19.548885345458984, "learning_rate": 4.2442393573227043e-07, "logits/chosen": 1.3907126188278198, "logits/rejected": 1.2917289733886719, "logps/chosen": -76.83174896240234, "logps/ref_chosen": -77.13209533691406, "logps/ref_rejected": -103.83682250976562, "logps/rejected": -110.44512176513672, "loss": 1.0694, "margin_dpo/margin_mean": 6.908657550811768, "margin_dpo/margin_std": 9.424392700195312, "step": 219 }, { "epoch": 0.3325774754346183, "grad_norm": 20.653133392333984, "learning_rate": 4.234742705255272e-07, "logits/chosen": 1.7574130296707153, "logits/rejected": 1.4534964561462402, "logps/chosen": -67.72616577148438, "logps/ref_chosen": -68.90743255615234, "logps/ref_rejected": -90.64686584472656, "logps/rejected": -93.7580337524414, "loss": 1.1469, "margin_dpo/margin_mean": 4.292444705963135, "margin_dpo/margin_std": 8.068262100219727, "step": 220 }, { "epoch": 0.3340891912320484, "grad_norm": 18.93564224243164, "learning_rate": 4.22519752870528e-07, "logits/chosen": 1.5631134510040283, "logits/rejected": 1.2504678964614868, "logps/chosen": -70.94021606445312, "logps/ref_chosen": -70.40955352783203, "logps/ref_rejected": -106.12084197998047, "logps/rejected": -109.77740478515625, "loss": 1.1514, "margin_dpo/margin_mean": 3.125894546508789, "margin_dpo/margin_std": 7.104891777038574, "step": 221 }, { "epoch": 0.3356009070294785, "grad_norm": 19.71828269958496, "learning_rate": 4.2156040946718343e-07, "logits/chosen": 2.1557414531707764, "logits/rejected": 2.042093515396118, "logps/chosen": -71.88018798828125, "logps/ref_chosen": -74.62059020996094, "logps/ref_rejected": -108.63029479980469, "logps/rejected": -111.80680847167969, "loss": 0.9711, "margin_dpo/margin_mean": 5.91693115234375, "margin_dpo/margin_std": 7.537817001342773, "step": 222 }, { "epoch": 0.3371126228269085, "grad_norm": 15.82883071899414, "learning_rate": 4.2059626715039065e-07, "logits/chosen": 1.5775530338287354, "logits/rejected": 1.4491376876831055, "logps/chosen": -75.37213897705078, "logps/ref_chosen": -78.35737609863281, "logps/ref_rejected": -84.74276733398438, "logps/rejected": -88.35108947753906, "loss": 0.9625, "margin_dpo/margin_mean": 6.59356689453125, "margin_dpo/margin_std": 8.484833717346191, "step": 223 }, { "epoch": 0.3386243386243386, "grad_norm": 18.63389778137207, "learning_rate": 4.1962735288928304e-07, "logits/chosen": 2.2383055686950684, "logits/rejected": 2.084538459777832, "logps/chosen": -48.79112243652344, "logps/ref_chosen": -54.18625259399414, "logps/ref_rejected": -78.41352081298828, "logps/rejected": -80.50018310546875, "loss": 1.1009, "margin_dpo/margin_mean": 7.481797695159912, "margin_dpo/margin_std": 7.345212936401367, "step": 224 }, { "epoch": 0.3401360544217687, "grad_norm": 20.763126373291016, "learning_rate": 4.186536937864752e-07, "logits/chosen": 1.5578806400299072, "logits/rejected": 1.0302515029907227, "logps/chosen": -63.778770446777344, "logps/ref_chosen": -65.91146087646484, "logps/ref_rejected": -146.1568145751953, "logps/rejected": -149.50999450683594, "loss": 1.0993, "margin_dpo/margin_mean": 5.485866546630859, "margin_dpo/margin_std": 8.005363464355469, "step": 225 }, { "epoch": 0.3416477702191988, "grad_norm": 16.861278533935547, "learning_rate": 4.176753170773052e-07, "logits/chosen": 1.3274794816970825, "logits/rejected": 1.1296758651733398, "logps/chosen": -57.77510070800781, "logps/ref_chosen": -58.237091064453125, "logps/ref_rejected": -88.97468566894531, "logps/rejected": -95.00027465820312, "loss": 1.0873, "margin_dpo/margin_mean": 6.487582206726074, "margin_dpo/margin_std": 8.638077735900879, "step": 226 }, { "epoch": 0.3431594860166289, "grad_norm": 18.6321964263916, "learning_rate": 4.166922501290729e-07, "logits/chosen": 1.3306598663330078, "logits/rejected": 1.2453471422195435, "logps/chosen": -62.705299377441406, "logps/ref_chosen": -64.79974365234375, "logps/ref_rejected": -82.2812271118164, "logps/rejected": -86.73684692382812, "loss": 1.1476, "margin_dpo/margin_mean": 6.550061225891113, "margin_dpo/margin_std": 15.07986068725586, "step": 227 }, { "epoch": 0.34467120181405897, "grad_norm": 19.915695190429688, "learning_rate": 4.1570452044027405e-07, "logits/chosen": 2.0995073318481445, "logits/rejected": 1.9628493785858154, "logps/chosen": -81.85237121582031, "logps/ref_chosen": -82.53168487548828, "logps/ref_rejected": -92.82319641113281, "logps/rejected": -99.55656433105469, "loss": 1.1253, "margin_dpo/margin_mean": 7.412677764892578, "margin_dpo/margin_std": 9.029712677001953, "step": 228 }, { "epoch": 0.34618291761148906, "grad_norm": 16.886613845825195, "learning_rate": 4.147121556398312e-07, "logits/chosen": 1.8414063453674316, "logits/rejected": 1.6012749671936035, "logps/chosen": -59.281646728515625, "logps/ref_chosen": -64.06170654296875, "logps/ref_rejected": -100.86270141601562, "logps/rejected": -105.90546417236328, "loss": 1.033, "margin_dpo/margin_mean": 9.822824478149414, "margin_dpo/margin_std": 6.314662456512451, "step": 229 }, { "epoch": 0.3476946334089191, "grad_norm": 21.322444915771484, "learning_rate": 4.137151834863213e-07, "logits/chosen": 1.0890512466430664, "logits/rejected": 1.4050287008285522, "logps/chosen": -97.45503234863281, "logps/ref_chosen": -98.65325927734375, "logps/ref_rejected": -75.16404724121094, "logps/rejected": -78.0488510131836, "loss": 1.1711, "margin_dpo/margin_mean": 4.083024024963379, "margin_dpo/margin_std": 9.145401954650879, "step": 230 }, { "epoch": 0.3492063492063492, "grad_norm": 19.391799926757812, "learning_rate": 4.1271363186719835e-07, "logits/chosen": 0.8802238702774048, "logits/rejected": 0.6724132299423218, "logps/chosen": -82.00273132324219, "logps/ref_chosen": -80.71703338623047, "logps/ref_rejected": -105.674072265625, "logps/rejected": -112.16716003417969, "loss": 1.0061, "margin_dpo/margin_mean": 5.207390785217285, "margin_dpo/margin_std": 7.625702857971191, "step": 231 }, { "epoch": 0.3507180650037793, "grad_norm": 247.5164794921875, "learning_rate": 4.1170752879801436e-07, "logits/chosen": 1.3618415594100952, "logits/rejected": 1.4338455200195312, "logps/chosen": -88.09855651855469, "logps/ref_chosen": -90.6833267211914, "logps/ref_rejected": -76.02790832519531, "logps/rejected": -77.30497741699219, "loss": 1.2222, "margin_dpo/margin_mean": 3.8618435859680176, "margin_dpo/margin_std": 11.690113067626953, "step": 232 }, { "epoch": 0.35222978080120937, "grad_norm": 17.651288986206055, "learning_rate": 4.106969024216348e-07, "logits/chosen": 1.7870259284973145, "logits/rejected": 1.4745216369628906, "logps/chosen": -57.64254379272461, "logps/ref_chosen": -60.32892990112305, "logps/ref_rejected": -82.74418640136719, "logps/rejected": -90.13136291503906, "loss": 1.1639, "margin_dpo/margin_mean": 10.073554992675781, "margin_dpo/margin_std": 11.479612350463867, "step": 233 }, { "epoch": 0.35374149659863946, "grad_norm": 17.584056854248047, "learning_rate": 4.09681781007452e-07, "logits/chosen": 0.45771628618240356, "logits/rejected": 0.3212139308452606, "logps/chosen": -82.85360717773438, "logps/ref_chosen": -82.00396728515625, "logps/ref_rejected": -93.49868774414062, "logps/rejected": -98.31204223632812, "loss": 1.1358, "margin_dpo/margin_mean": 3.963721752166748, "margin_dpo/margin_std": 7.145815849304199, "step": 234 }, { "epoch": 0.35525321239606955, "grad_norm": 18.441320419311523, "learning_rate": 4.08662192950594e-07, "logits/chosen": 1.1290838718414307, "logits/rejected": 1.2310476303100586, "logps/chosen": -97.12908935546875, "logps/ref_chosen": -99.99212646484375, "logps/ref_rejected": -103.86991882324219, "logps/rejected": -107.80982971191406, "loss": 0.9894, "margin_dpo/margin_mean": 6.8029279708862305, "margin_dpo/margin_std": 5.952003479003906, "step": 235 }, { "epoch": 0.35676492819349964, "grad_norm": 19.695188522338867, "learning_rate": 4.076381667711306e-07, "logits/chosen": 1.5752960443496704, "logits/rejected": 1.472326397895813, "logps/chosen": -88.29005432128906, "logps/ref_chosen": -90.8012466430664, "logps/ref_rejected": -111.03831481933594, "logps/rejected": -114.74986267089844, "loss": 1.1663, "margin_dpo/margin_mean": 6.2227463722229, "margin_dpo/margin_std": 6.764309883117676, "step": 236 }, { "epoch": 0.35827664399092973, "grad_norm": 20.905710220336914, "learning_rate": 4.066097311132753e-07, "logits/chosen": 1.3112211227416992, "logits/rejected": 1.2118773460388184, "logps/chosen": -85.30001068115234, "logps/ref_chosen": -88.0330581665039, "logps/ref_rejected": -105.17771911621094, "logps/rejected": -108.61278533935547, "loss": 1.203, "margin_dpo/margin_mean": 6.168111801147461, "margin_dpo/margin_std": 8.725688934326172, "step": 237 }, { "epoch": 0.35978835978835977, "grad_norm": 17.324424743652344, "learning_rate": 4.0557691474458414e-07, "logits/chosen": 1.735201120376587, "logits/rejected": 1.7092080116271973, "logps/chosen": -73.12251281738281, "logps/ref_chosen": -75.1971206665039, "logps/ref_rejected": -77.96612548828125, "logps/rejected": -82.361572265625, "loss": 1.0525, "margin_dpo/margin_mean": 6.470047950744629, "margin_dpo/margin_std": 11.176639556884766, "step": 238 }, { "epoch": 0.36130007558578986, "grad_norm": 21.258893966674805, "learning_rate": 4.045397465551513e-07, "logits/chosen": 1.6649603843688965, "logits/rejected": 1.5152134895324707, "logps/chosen": -94.1319351196289, "logps/ref_chosen": -91.15692138671875, "logps/ref_rejected": -113.57862091064453, "logps/rejected": -120.50105285644531, "loss": 1.1188, "margin_dpo/margin_mean": 3.9474148750305176, "margin_dpo/margin_std": 11.459762573242188, "step": 239 }, { "epoch": 0.36281179138321995, "grad_norm": 21.705753326416016, "learning_rate": 4.0349825555680045e-07, "logits/chosen": 1.3406062126159668, "logits/rejected": 1.273798942565918, "logps/chosen": -80.3905029296875, "logps/ref_chosen": -81.4254150390625, "logps/ref_rejected": -114.05240631103516, "logps/rejected": -121.94851684570312, "loss": 0.9413, "margin_dpo/margin_mean": 8.931010246276855, "margin_dpo/margin_std": 10.8176851272583, "step": 240 }, { "epoch": 0.36432350718065004, "grad_norm": 20.70751190185547, "learning_rate": 4.0245247088227377e-07, "logits/chosen": 1.3239598274230957, "logits/rejected": 1.146188735961914, "logps/chosen": -81.96150970458984, "logps/ref_chosen": -81.69168090820312, "logps/ref_rejected": -95.57888793945312, "logps/rejected": -99.75829315185547, "loss": 1.1959, "margin_dpo/margin_mean": 3.9095780849456787, "margin_dpo/margin_std": 10.020241737365723, "step": 241 }, { "epoch": 0.36583522297808013, "grad_norm": 15.902716636657715, "learning_rate": 4.0140242178441665e-07, "logits/chosen": 0.6462745666503906, "logits/rejected": 0.496981680393219, "logps/chosen": -60.067909240722656, "logps/ref_chosen": -66.06492614746094, "logps/ref_rejected": -83.15866088867188, "logps/rejected": -86.033447265625, "loss": 0.988, "margin_dpo/margin_mean": 8.87181282043457, "margin_dpo/margin_std": 11.042675018310547, "step": 242 }, { "epoch": 0.3673469387755102, "grad_norm": 21.21453857421875, "learning_rate": 4.003481376353596e-07, "logits/chosen": 1.953173041343689, "logits/rejected": 1.9563779830932617, "logps/chosen": -113.06846618652344, "logps/ref_chosen": -114.31979370117188, "logps/ref_rejected": -90.88074493408203, "logps/rejected": -95.6974105834961, "loss": 1.1352, "margin_dpo/margin_mean": 6.067990779876709, "margin_dpo/margin_std": 10.036855697631836, "step": 243 }, { "epoch": 0.3688586545729403, "grad_norm": 18.53084945678711, "learning_rate": 3.9928964792569654e-07, "logits/chosen": 1.541309118270874, "logits/rejected": 1.4583971500396729, "logps/chosen": -83.95600891113281, "logps/ref_chosen": -83.78800964355469, "logps/ref_rejected": -114.07919311523438, "logps/rejected": -120.47505187988281, "loss": 0.8184, "margin_dpo/margin_mean": 6.227858066558838, "margin_dpo/margin_std": 8.322176933288574, "step": 244 }, { "epoch": 0.37037037037037035, "grad_norm": 19.363468170166016, "learning_rate": 3.982269822636601e-07, "logits/chosen": 1.4912075996398926, "logits/rejected": 1.3978639841079712, "logps/chosen": -93.45870971679688, "logps/ref_chosen": -93.51729583740234, "logps/ref_rejected": -92.6208267211914, "logps/rejected": -101.91989135742188, "loss": 0.9155, "margin_dpo/margin_mean": 9.357648849487305, "margin_dpo/margin_std": 9.205923080444336, "step": 245 }, { "epoch": 0.37188208616780044, "grad_norm": 20.757997512817383, "learning_rate": 3.971601703742932e-07, "logits/chosen": 1.9170830249786377, "logits/rejected": 1.9764997959136963, "logps/chosen": -73.34526824951172, "logps/ref_chosen": -74.48593139648438, "logps/ref_rejected": -88.08491516113281, "logps/rejected": -97.43938446044922, "loss": 1.0065, "margin_dpo/margin_mean": 10.495137214660645, "margin_dpo/margin_std": 10.436851501464844, "step": 246 }, { "epoch": 0.37339380196523053, "grad_norm": 29.047739028930664, "learning_rate": 3.960892420986177e-07, "logits/chosen": 1.0904462337493896, "logits/rejected": 0.8627911806106567, "logps/chosen": -110.49881744384766, "logps/ref_chosen": -109.89823150634766, "logps/ref_rejected": -141.33636474609375, "logps/rejected": -149.88587951660156, "loss": 1.2264, "margin_dpo/margin_mean": 7.948927879333496, "margin_dpo/margin_std": 9.069073677062988, "step": 247 }, { "epoch": 0.3749055177626606, "grad_norm": 25.44984245300293, "learning_rate": 3.9501422739279953e-07, "logits/chosen": 0.9584105014801025, "logits/rejected": 1.1548070907592773, "logps/chosen": -93.99226379394531, "logps/ref_chosen": -90.47887420654297, "logps/ref_rejected": -76.64949798583984, "logps/rejected": -86.36746978759766, "loss": 1.1195, "margin_dpo/margin_mean": 6.204570770263672, "margin_dpo/margin_std": 11.101892471313477, "step": 248 }, { "epoch": 0.3764172335600907, "grad_norm": 27.712949752807617, "learning_rate": 3.9393515632731094e-07, "logits/chosen": 2.226923942565918, "logits/rejected": 2.240403890609741, "logps/chosen": -89.89859008789062, "logps/ref_chosen": -88.28697204589844, "logps/ref_rejected": -86.9918212890625, "logps/rejected": -91.2759017944336, "loss": 1.4394, "margin_dpo/margin_mean": 2.6724724769592285, "margin_dpo/margin_std": 12.873514175415039, "step": 249 }, { "epoch": 0.3779289493575208, "grad_norm": 20.57770538330078, "learning_rate": 3.9285205908608934e-07, "logits/chosen": 1.6664156913757324, "logits/rejected": 1.522703766822815, "logps/chosen": -62.21220397949219, "logps/ref_chosen": -63.43910217285156, "logps/ref_rejected": -80.62076568603516, "logps/rejected": -89.03213500976562, "loss": 0.9687, "margin_dpo/margin_mean": 9.638264656066895, "margin_dpo/margin_std": 10.435930252075195, "step": 250 }, { "epoch": 0.3794406651549509, "grad_norm": 22.137779235839844, "learning_rate": 3.9176496596569265e-07, "logits/chosen": 1.6106665134429932, "logits/rejected": 1.6481736898422241, "logps/chosen": -101.96516418457031, "logps/ref_chosen": -99.75392150878906, "logps/ref_rejected": -83.60183715820312, "logps/rejected": -90.12034606933594, "loss": 1.2025, "margin_dpo/margin_mean": 4.307268142700195, "margin_dpo/margin_std": 14.127132415771484, "step": 251 }, { "epoch": 0.38095238095238093, "grad_norm": 18.247407913208008, "learning_rate": 3.9067390737445254e-07, "logits/chosen": 1.5461227893829346, "logits/rejected": 1.4235080480575562, "logps/chosen": -89.36070251464844, "logps/ref_chosen": -86.51172637939453, "logps/ref_rejected": -97.54434204101562, "logps/rejected": -107.46619415283203, "loss": 1.1887, "margin_dpo/margin_mean": 7.072881698608398, "margin_dpo/margin_std": 11.177294731140137, "step": 252 }, { "epoch": 0.382464096749811, "grad_norm": 21.47108268737793, "learning_rate": 3.8957891383162304e-07, "logits/chosen": 1.6541004180908203, "logits/rejected": 1.550170660018921, "logps/chosen": -48.02019500732422, "logps/ref_chosen": -46.728111267089844, "logps/ref_rejected": -58.3885498046875, "logps/rejected": -65.66362762451172, "loss": 1.1659, "margin_dpo/margin_mean": 5.982996463775635, "margin_dpo/margin_std": 10.698124885559082, "step": 253 }, { "epoch": 0.3839758125472411, "grad_norm": 22.817777633666992, "learning_rate": 3.884800159665276e-07, "logits/chosen": 1.3299825191497803, "logits/rejected": 1.1941395998001099, "logps/chosen": -65.04866027832031, "logps/ref_chosen": -63.599853515625, "logps/ref_rejected": -83.38310241699219, "logps/rejected": -92.31219482421875, "loss": 1.0953, "margin_dpo/margin_mean": 7.480283260345459, "margin_dpo/margin_std": 8.203174591064453, "step": 254 }, { "epoch": 0.3854875283446712, "grad_norm": 23.891443252563477, "learning_rate": 3.873772445177015e-07, "logits/chosen": 1.4129853248596191, "logits/rejected": 1.2499051094055176, "logps/chosen": -73.55607604980469, "logps/ref_chosen": -74.2440185546875, "logps/ref_rejected": -111.37633514404297, "logps/rejected": -117.14253997802734, "loss": 1.0827, "margin_dpo/margin_mean": 6.454158782958984, "margin_dpo/margin_std": 11.983856201171875, "step": 255 }, { "epoch": 0.3869992441421013, "grad_norm": 21.12157440185547, "learning_rate": 3.862706303320329e-07, "logits/chosen": 1.4924036264419556, "logits/rejected": 1.389431118965149, "logps/chosen": -102.51988220214844, "logps/ref_chosen": -98.66207885742188, "logps/ref_rejected": -107.10368347167969, "logps/rejected": -116.27766418457031, "loss": 1.1034, "margin_dpo/margin_mean": 5.316197395324707, "margin_dpo/margin_std": 13.185227394104004, "step": 256 }, { "epoch": 0.3885109599395314, "grad_norm": 27.625837326049805, "learning_rate": 3.851602043638994e-07, "logits/chosen": 1.493327260017395, "logits/rejected": 1.2727749347686768, "logps/chosen": -74.17803192138672, "logps/ref_chosen": -74.201171875, "logps/ref_rejected": -90.87617492675781, "logps/rejected": -103.25715637207031, "loss": 1.1439, "margin_dpo/margin_mean": 12.404125213623047, "margin_dpo/margin_std": 10.76992416381836, "step": 257 }, { "epoch": 0.3900226757369615, "grad_norm": 19.34490203857422, "learning_rate": 3.840459976743023e-07, "logits/chosen": 1.7154569625854492, "logits/rejected": 1.5629184246063232, "logps/chosen": -85.86290740966797, "logps/ref_chosen": -82.2788314819336, "logps/ref_rejected": -104.513916015625, "logps/rejected": -116.44546508789062, "loss": 0.9383, "margin_dpo/margin_mean": 8.347471237182617, "margin_dpo/margin_std": 8.179452896118164, "step": 258 }, { "epoch": 0.3915343915343915, "grad_norm": 17.89645767211914, "learning_rate": 3.8292804142999796e-07, "logits/chosen": 1.6310702562332153, "logits/rejected": 1.634556531906128, "logps/chosen": -106.26507568359375, "logps/ref_chosen": -104.06521606445312, "logps/ref_rejected": -124.18028259277344, "logps/rejected": -133.65753173828125, "loss": 0.9197, "margin_dpo/margin_mean": 7.2773895263671875, "margin_dpo/margin_std": 13.141748428344727, "step": 259 }, { "epoch": 0.3930461073318216, "grad_norm": 27.13736343383789, "learning_rate": 3.818063669026256e-07, "logits/chosen": 1.7721970081329346, "logits/rejected": 1.5861704349517822, "logps/chosen": -71.54519653320312, "logps/ref_chosen": -71.14646911621094, "logps/ref_rejected": -93.3581771850586, "logps/rejected": -102.21702575683594, "loss": 1.1841, "margin_dpo/margin_mean": 8.460118293762207, "margin_dpo/margin_std": 13.403702735900879, "step": 260 }, { "epoch": 0.3945578231292517, "grad_norm": 24.996944427490234, "learning_rate": 3.806810054678331e-07, "logits/chosen": 1.418328046798706, "logits/rejected": 1.8168785572052002, "logps/chosen": -103.96897888183594, "logps/ref_chosen": -103.2262191772461, "logps/ref_rejected": -63.876731872558594, "logps/rejected": -68.32272338867188, "loss": 1.2284, "margin_dpo/margin_mean": 3.7032363414764404, "margin_dpo/margin_std": 9.83879280090332, "step": 261 }, { "epoch": 0.3960695389266818, "grad_norm": 17.423860549926758, "learning_rate": 3.7955198860439887e-07, "logits/chosen": 2.0037453174591064, "logits/rejected": 1.8984181880950928, "logps/chosen": -73.5115737915039, "logps/ref_chosen": -72.2059097290039, "logps/ref_rejected": -88.14268493652344, "logps/rejected": -96.40874481201172, "loss": 0.9958, "margin_dpo/margin_mean": 6.960394859313965, "margin_dpo/margin_std": 9.70195198059082, "step": 262 }, { "epoch": 0.3975812547241119, "grad_norm": 18.932403564453125, "learning_rate": 3.784193478933516e-07, "logits/chosen": 1.5900702476501465, "logits/rejected": 1.446866512298584, "logps/chosen": -78.120849609375, "logps/ref_chosen": -78.23809814453125, "logps/ref_rejected": -93.85999298095703, "logps/rejected": -99.21015930175781, "loss": 1.0764, "margin_dpo/margin_mean": 5.467409133911133, "margin_dpo/margin_std": 8.456999778747559, "step": 263 }, { "epoch": 0.39909297052154197, "grad_norm": 18.468406677246094, "learning_rate": 3.7728311501708674e-07, "logits/chosen": 1.4506256580352783, "logits/rejected": 1.3569269180297852, "logps/chosen": -108.14775085449219, "logps/ref_chosen": -101.19283294677734, "logps/ref_rejected": -105.89726257324219, "logps/rejected": -116.51220703125, "loss": 0.9957, "margin_dpo/margin_mean": 3.6600213050842285, "margin_dpo/margin_std": 9.101996421813965, "step": 264 }, { "epoch": 0.40060468631897206, "grad_norm": 18.443918228149414, "learning_rate": 3.7614332175848027e-07, "logits/chosen": 1.259042739868164, "logits/rejected": 1.2162940502166748, "logps/chosen": -77.63226318359375, "logps/ref_chosen": -74.58479309082031, "logps/ref_rejected": -80.430419921875, "logps/rejected": -89.23411560058594, "loss": 0.9792, "margin_dpo/margin_mean": 5.756211280822754, "margin_dpo/margin_std": 10.183759689331055, "step": 265 }, { "epoch": 0.4021164021164021, "grad_norm": 18.000490188598633, "learning_rate": 3.75e-07, "logits/chosen": 2.1044368743896484, "logits/rejected": 1.843429684638977, "logps/chosen": -60.27152633666992, "logps/ref_chosen": -60.559478759765625, "logps/ref_rejected": -86.36062622070312, "logps/rejected": -92.10088348388672, "loss": 1.0177, "margin_dpo/margin_mean": 6.028217792510986, "margin_dpo/margin_std": 9.724032402038574, "step": 266 }, { "epoch": 0.4036281179138322, "grad_norm": 16.399856567382812, "learning_rate": 3.738531817228131e-07, "logits/chosen": 1.6247460842132568, "logits/rejected": 1.4053623676300049, "logps/chosen": -59.919795989990234, "logps/ref_chosen": -63.70441436767578, "logps/ref_rejected": -83.08932495117188, "logps/rejected": -85.2328872680664, "loss": 1.0683, "margin_dpo/margin_mean": 5.928180694580078, "margin_dpo/margin_std": 9.278671264648438, "step": 267 }, { "epoch": 0.4051398337112623, "grad_norm": 19.022281646728516, "learning_rate": 3.7270289900589204e-07, "logits/chosen": 1.4496238231658936, "logits/rejected": 1.3633354902267456, "logps/chosen": -80.97392272949219, "logps/ref_chosen": -81.20317840576172, "logps/ref_rejected": -88.78952026367188, "logps/rejected": -91.68635559082031, "loss": 1.2326, "margin_dpo/margin_mean": 3.126094341278076, "margin_dpo/margin_std": 11.293224334716797, "step": 268 }, { "epoch": 0.40665154950869237, "grad_norm": 18.44759178161621, "learning_rate": 3.7154918402511714e-07, "logits/chosen": 1.5952229499816895, "logits/rejected": 1.8033515214920044, "logps/chosen": -100.68822479248047, "logps/ref_chosen": -100.46163940429688, "logps/ref_rejected": -81.60676574707031, "logps/rejected": -88.81659698486328, "loss": 1.0632, "margin_dpo/margin_mean": 6.983246326446533, "margin_dpo/margin_std": 11.594640731811523, "step": 269 }, { "epoch": 0.40816326530612246, "grad_norm": 20.01340675354004, "learning_rate": 3.7039206905237656e-07, "logits/chosen": 1.4241218566894531, "logits/rejected": 1.435469150543213, "logps/chosen": -80.5250244140625, "logps/ref_chosen": -84.99429321289062, "logps/ref_rejected": -79.18400573730469, "logps/rejected": -86.43377685546875, "loss": 1.0912, "margin_dpo/margin_mean": 11.719035148620605, "margin_dpo/margin_std": 10.268095016479492, "step": 270 }, { "epoch": 0.40967498110355255, "grad_norm": 22.529584884643555, "learning_rate": 3.692315864546635e-07, "logits/chosen": 1.658376693725586, "logits/rejected": 1.288610577583313, "logps/chosen": -51.31718826293945, "logps/ref_chosen": -53.80478286743164, "logps/ref_rejected": -109.61439514160156, "logps/rejected": -114.45604705810547, "loss": 1.2696, "margin_dpo/margin_mean": 7.329244613647461, "margin_dpo/margin_std": 9.891286849975586, "step": 271 }, { "epoch": 0.41118669690098264, "grad_norm": 19.045961380004883, "learning_rate": 3.6806776869317067e-07, "logits/chosen": 1.8601175546646118, "logits/rejected": 1.847928524017334, "logps/chosen": -77.80955505371094, "logps/ref_chosen": -78.50799560546875, "logps/ref_rejected": -74.68629455566406, "logps/rejected": -83.61516571044922, "loss": 0.8955, "margin_dpo/margin_mean": 9.627310752868652, "margin_dpo/margin_std": 9.307546615600586, "step": 272 }, { "epoch": 0.4126984126984127, "grad_norm": 23.23846435546875, "learning_rate": 3.669006483223828e-07, "logits/chosen": 1.961982011795044, "logits/rejected": 1.6739972829818726, "logps/chosen": -79.21211242675781, "logps/ref_chosen": -77.14126586914062, "logps/ref_rejected": -136.02557373046875, "logps/rejected": -142.18235778808594, "loss": 1.0901, "margin_dpo/margin_mean": 4.085923671722412, "margin_dpo/margin_std": 9.875133514404297, "step": 273 }, { "epoch": 0.41421012849584277, "grad_norm": 18.79170036315918, "learning_rate": 3.657302579891656e-07, "logits/chosen": 1.6463165283203125, "logits/rejected": 1.8553088903427124, "logps/chosen": -101.32669830322266, "logps/ref_chosen": -99.45804595947266, "logps/ref_rejected": -83.81266784667969, "logps/rejected": -91.04222106933594, "loss": 1.006, "margin_dpo/margin_mean": 5.360896587371826, "margin_dpo/margin_std": 10.031536102294922, "step": 274 }, { "epoch": 0.41572184429327286, "grad_norm": 16.581546783447266, "learning_rate": 3.645566304318526e-07, "logits/chosen": 1.7676652669906616, "logits/rejected": 1.6896226406097412, "logps/chosen": -74.30751037597656, "logps/ref_chosen": -74.81565856933594, "logps/ref_rejected": -83.8819580078125, "logps/rejected": -92.18299102783203, "loss": 0.9012, "margin_dpo/margin_mean": 8.809182167053223, "margin_dpo/margin_std": 10.104835510253906, "step": 275 }, { "epoch": 0.41723356009070295, "grad_norm": 19.646080017089844, "learning_rate": 3.633797984793294e-07, "logits/chosen": 1.3757855892181396, "logits/rejected": 1.412289023399353, "logps/chosen": -56.50067138671875, "logps/ref_chosen": -58.68775939941406, "logps/ref_rejected": -54.94135284423828, "logps/rejected": -59.72188949584961, "loss": 1.0373, "margin_dpo/margin_mean": 6.967626094818115, "margin_dpo/margin_std": 7.913096904754639, "step": 276 }, { "epoch": 0.41874527588813304, "grad_norm": 20.964855194091797, "learning_rate": 3.6219979505011555e-07, "logits/chosen": 0.8726018071174622, "logits/rejected": 0.8032586574554443, "logps/chosen": -75.53360748291016, "logps/ref_chosen": -73.8536376953125, "logps/ref_rejected": -78.52608489990234, "logps/rejected": -83.75806427001953, "loss": 1.2893, "margin_dpo/margin_mean": 3.552006721496582, "margin_dpo/margin_std": 9.064078330993652, "step": 277 }, { "epoch": 0.42025699168556313, "grad_norm": 21.72730827331543, "learning_rate": 3.6101665315144353e-07, "logits/chosen": 1.4963252544403076, "logits/rejected": 1.3513015508651733, "logps/chosen": -85.50416564941406, "logps/ref_chosen": -86.38224792480469, "logps/ref_rejected": -89.47709655761719, "logps/rejected": -96.7026138305664, "loss": 1.1692, "margin_dpo/margin_mean": 8.10361099243164, "margin_dpo/margin_std": 9.67882251739502, "step": 278 }, { "epoch": 0.4217687074829932, "grad_norm": 18.24779510498047, "learning_rate": 3.5983040587833563e-07, "logits/chosen": 1.5333263874053955, "logits/rejected": 1.4491504430770874, "logps/chosen": -70.74869537353516, "logps/ref_chosen": -76.225341796875, "logps/ref_rejected": -84.96986389160156, "logps/rejected": -87.84549713134766, "loss": 0.9037, "margin_dpo/margin_mean": 8.352277755737305, "margin_dpo/margin_std": 12.723238945007324, "step": 279 }, { "epoch": 0.42328042328042326, "grad_norm": 23.532352447509766, "learning_rate": 3.586410864126781e-07, "logits/chosen": 1.3650166988372803, "logits/rejected": 1.235607624053955, "logps/chosen": -66.58361053466797, "logps/ref_chosen": -73.5118408203125, "logps/ref_rejected": -77.72732543945312, "logps/rejected": -80.86961364746094, "loss": 0.8786, "margin_dpo/margin_mean": 10.070512771606445, "margin_dpo/margin_std": 8.306930541992188, "step": 280 }, { "epoch": 0.42479213907785335, "grad_norm": 16.226192474365234, "learning_rate": 3.574487280222929e-07, "logits/chosen": 1.4685707092285156, "logits/rejected": 1.5052597522735596, "logps/chosen": -101.86863708496094, "logps/ref_chosen": -103.09918212890625, "logps/ref_rejected": -114.46480560302734, "logps/rejected": -121.94792938232422, "loss": 1.0269, "margin_dpo/margin_mean": 8.713654518127441, "margin_dpo/margin_std": 6.986821174621582, "step": 281 }, { "epoch": 0.42630385487528344, "grad_norm": 21.093856811523438, "learning_rate": 3.562533640600075e-07, "logits/chosen": 1.3166978359222412, "logits/rejected": 1.0546810626983643, "logps/chosen": -69.39125061035156, "logps/ref_chosen": -70.73341369628906, "logps/ref_rejected": -91.24656677246094, "logps/rejected": -98.08975219726562, "loss": 1.0192, "margin_dpo/margin_mean": 8.18535041809082, "margin_dpo/margin_std": 7.882116317749023, "step": 282 }, { "epoch": 0.42781557067271353, "grad_norm": 19.645465850830078, "learning_rate": 3.550550279627215e-07, "logits/chosen": 1.6532424688339233, "logits/rejected": 1.489319086074829, "logps/chosen": -66.90553283691406, "logps/ref_chosen": -67.81599426269531, "logps/ref_rejected": -86.90434265136719, "logps/rejected": -90.23529052734375, "loss": 1.0701, "margin_dpo/margin_mean": 4.241410255432129, "margin_dpo/margin_std": 10.861912727355957, "step": 283 }, { "epoch": 0.4293272864701436, "grad_norm": 16.481609344482422, "learning_rate": 3.5385375325047163e-07, "logits/chosen": 1.597827434539795, "logits/rejected": 1.6138107776641846, "logps/chosen": -72.65160369873047, "logps/ref_chosen": -76.16476440429688, "logps/ref_rejected": -106.04917907714844, "logps/rejected": -110.24042510986328, "loss": 0.9211, "margin_dpo/margin_mean": 7.704400062561035, "margin_dpo/margin_std": 10.970076560974121, "step": 284 }, { "epoch": 0.4308390022675737, "grad_norm": 23.24557876586914, "learning_rate": 3.5264957352549375e-07, "logits/chosen": 1.6011934280395508, "logits/rejected": 1.6163420677185059, "logps/chosen": -88.22721099853516, "logps/ref_chosen": -83.19266510009766, "logps/ref_rejected": -65.96772003173828, "logps/rejected": -73.61405181884766, "loss": 1.2283, "margin_dpo/margin_mean": 2.611786365509033, "margin_dpo/margin_std": 12.371804237365723, "step": 285 }, { "epoch": 0.4323507180650038, "grad_norm": 19.125213623046875, "learning_rate": 3.514425224712835e-07, "logits/chosen": 1.461625337600708, "logits/rejected": 1.588505744934082, "logps/chosen": -75.20333862304688, "logps/ref_chosen": -70.34590148925781, "logps/ref_rejected": -74.4199447631836, "logps/rejected": -87.5842514038086, "loss": 0.9909, "margin_dpo/margin_mean": 8.306873321533203, "margin_dpo/margin_std": 8.450860977172852, "step": 286 }, { "epoch": 0.43386243386243384, "grad_norm": 20.771451950073242, "learning_rate": 3.502326338516534e-07, "logits/chosen": 1.591284990310669, "logits/rejected": 1.0741746425628662, "logps/chosen": -54.821868896484375, "logps/ref_chosen": -59.434837341308594, "logps/ref_rejected": -90.43165588378906, "logps/rejected": -102.91751098632812, "loss": 0.8545, "margin_dpo/margin_mean": 17.098819732666016, "margin_dpo/margin_std": 10.990958213806152, "step": 287 }, { "epoch": 0.43537414965986393, "grad_norm": 21.281402587890625, "learning_rate": 3.490199415097892e-07, "logits/chosen": 0.7958072423934937, "logits/rejected": 0.7759866714477539, "logps/chosen": -88.11946105957031, "logps/ref_chosen": -85.36576843261719, "logps/ref_rejected": -74.72684478759766, "logps/rejected": -81.76447296142578, "loss": 1.1737, "margin_dpo/margin_mean": 4.283925533294678, "margin_dpo/margin_std": 9.337142944335938, "step": 288 }, { "epoch": 0.436885865457294, "grad_norm": 18.69247055053711, "learning_rate": 3.4780447936730247e-07, "logits/chosen": 1.1241439580917358, "logits/rejected": 1.3310633897781372, "logps/chosen": -85.63728332519531, "logps/ref_chosen": -80.17725372314453, "logps/ref_rejected": -85.6978988647461, "logps/rejected": -95.17655944824219, "loss": 1.0162, "margin_dpo/margin_mean": 4.01862907409668, "margin_dpo/margin_std": 6.578553199768066, "step": 289 }, { "epoch": 0.4383975812547241, "grad_norm": 19.738040924072266, "learning_rate": 3.465862814232821e-07, "logits/chosen": 1.3741858005523682, "logits/rejected": 1.2459540367126465, "logps/chosen": -86.7077407836914, "logps/ref_chosen": -83.58234405517578, "logps/ref_rejected": -87.88914489746094, "logps/rejected": -98.64165496826172, "loss": 1.0412, "margin_dpo/margin_mean": 7.627105712890625, "margin_dpo/margin_std": 8.719098091125488, "step": 290 }, { "epoch": 0.4399092970521542, "grad_norm": 18.554731369018555, "learning_rate": 3.4536538175334343e-07, "logits/chosen": 1.6371653079986572, "logits/rejected": 1.5445657968521118, "logps/chosen": -92.02778625488281, "logps/ref_chosen": -87.6347427368164, "logps/ref_rejected": -94.1658935546875, "logps/rejected": -110.18753814697266, "loss": 1.0766, "margin_dpo/margin_mean": 11.628591537475586, "margin_dpo/margin_std": 15.106453895568848, "step": 291 }, { "epoch": 0.4414210128495843, "grad_norm": 21.74319076538086, "learning_rate": 3.4414181450867465e-07, "logits/chosen": 1.5633352994918823, "logits/rejected": 1.4391751289367676, "logps/chosen": -82.2508544921875, "logps/ref_chosen": -80.37139892578125, "logps/ref_rejected": -82.4432601928711, "logps/rejected": -90.6778335571289, "loss": 1.0578, "margin_dpo/margin_mean": 6.355119705200195, "margin_dpo/margin_std": 9.330062866210938, "step": 292 }, { "epoch": 0.4429327286470144, "grad_norm": 18.310407638549805, "learning_rate": 3.4291561391508185e-07, "logits/chosen": 1.2951960563659668, "logits/rejected": 1.406134843826294, "logps/chosen": -71.5809326171875, "logps/ref_chosen": -69.68745422363281, "logps/ref_rejected": -72.62487030029297, "logps/rejected": -85.28450012207031, "loss": 1.0547, "margin_dpo/margin_mean": 10.76614761352539, "margin_dpo/margin_std": 9.987350463867188, "step": 293 }, { "epoch": 0.4444444444444444, "grad_norm": 16.895854949951172, "learning_rate": 3.4168681427203153e-07, "logits/chosen": 1.9306247234344482, "logits/rejected": 1.7847046852111816, "logps/chosen": -95.72657775878906, "logps/ref_chosen": -89.15940856933594, "logps/ref_rejected": -115.4771957397461, "logps/rejected": -129.94447326660156, "loss": 1.0374, "margin_dpo/margin_mean": 7.900099754333496, "margin_dpo/margin_std": 11.466743469238281, "step": 294 }, { "epoch": 0.4459561602418745, "grad_norm": 21.449647903442383, "learning_rate": 3.4045544995169125e-07, "logits/chosen": 1.6327093839645386, "logits/rejected": 1.3721890449523926, "logps/chosen": -60.76752853393555, "logps/ref_chosen": -54.67145538330078, "logps/ref_rejected": -90.24403381347656, "logps/rejected": -107.34832763671875, "loss": 1.0976, "margin_dpo/margin_mean": 11.008213996887207, "margin_dpo/margin_std": 11.988443374633789, "step": 295 }, { "epoch": 0.4474678760393046, "grad_norm": 19.9609432220459, "learning_rate": 3.392215553979679e-07, "logits/chosen": 1.4768846035003662, "logits/rejected": 1.1597115993499756, "logps/chosen": -71.57769775390625, "logps/ref_chosen": -69.27288055419922, "logps/ref_rejected": -97.12939453125, "logps/rejected": -111.13511657714844, "loss": 0.962, "margin_dpo/margin_mean": 11.700906753540039, "margin_dpo/margin_std": 12.796022415161133, "step": 296 }, { "epoch": 0.4489795918367347, "grad_norm": 22.09977912902832, "learning_rate": 3.3798516512554485e-07, "logits/chosen": 1.448041319847107, "logits/rejected": 1.2209464311599731, "logps/chosen": -63.543087005615234, "logps/ref_chosen": -58.717681884765625, "logps/ref_rejected": -81.23310852050781, "logps/rejected": -96.39411163330078, "loss": 0.8747, "margin_dpo/margin_mean": 10.335600852966309, "margin_dpo/margin_std": 10.154754638671875, "step": 297 }, { "epoch": 0.4504913076341648, "grad_norm": 23.58592414855957, "learning_rate": 3.367463137189156e-07, "logits/chosen": 1.6173958778381348, "logits/rejected": 1.621980905532837, "logps/chosen": -94.65084075927734, "logps/ref_chosen": -90.035888671875, "logps/ref_rejected": -95.54667663574219, "logps/rejected": -108.70011901855469, "loss": 1.1131, "margin_dpo/margin_mean": 8.538497924804688, "margin_dpo/margin_std": 16.286890029907227, "step": 298 }, { "epoch": 0.4520030234315949, "grad_norm": 20.364479064941406, "learning_rate": 3.355050358314172e-07, "logits/chosen": 1.3762767314910889, "logits/rejected": 1.331855058670044, "logps/chosen": -61.005455017089844, "logps/ref_chosen": -55.741477966308594, "logps/ref_rejected": -70.30194091796875, "logps/rejected": -77.63589477539062, "loss": 1.1851, "margin_dpo/margin_mean": 2.069971799850464, "margin_dpo/margin_std": 9.210464477539062, "step": 299 }, { "epoch": 0.45351473922902497, "grad_norm": 20.966903686523438, "learning_rate": 3.3426136618426043e-07, "logits/chosen": 1.3917086124420166, "logits/rejected": 1.473130226135254, "logps/chosen": -86.35111999511719, "logps/ref_chosen": -82.57180786132812, "logps/ref_rejected": -80.38890075683594, "logps/rejected": -93.6250228881836, "loss": 1.1522, "margin_dpo/margin_mean": 9.456819534301758, "margin_dpo/margin_std": 13.218378067016602, "step": 300 }, { "epoch": 0.45351473922902497, "eval_logits/chosen": 1.4997327327728271, "eval_logits/rejected": 1.3737715482711792, "eval_logps/chosen": -91.854248046875, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -107.06349182128906, "eval_loss": 0.5328395962715149, "eval_margin_dpo/margin_mean": 7.294130802154541, "eval_margin_dpo/margin_std": 11.505497932434082, "eval_runtime": 42.5675, "eval_samples_per_second": 54.102, "eval_steps_per_second": 1.691, "step": 300 }, { "epoch": 0.455026455026455, "grad_norm": 17.592514038085938, "learning_rate": 3.3301533956555885e-07, "logits/chosen": 1.691786766052246, "logits/rejected": 1.5312892198562622, "logps/chosen": -71.78817749023438, "logps/ref_chosen": -61.52805709838867, "logps/ref_rejected": -91.78837585449219, "logps/rejected": -108.56642150878906, "loss": 1.1213, "margin_dpo/margin_mean": 6.517922401428223, "margin_dpo/margin_std": 10.830015182495117, "step": 301 }, { "epoch": 0.4565381708238851, "grad_norm": 21.65961265563965, "learning_rate": 3.317669908293554e-07, "logits/chosen": 0.9764095544815063, "logits/rejected": 0.7545739412307739, "logps/chosen": -62.87278366088867, "logps/ref_chosen": -58.05803680419922, "logps/ref_rejected": -86.68899536132812, "logps/rejected": -101.33213806152344, "loss": 1.3069, "margin_dpo/margin_mean": 9.82839584350586, "margin_dpo/margin_std": 10.545026779174805, "step": 302 }, { "epoch": 0.4580498866213152, "grad_norm": 18.00571632385254, "learning_rate": 3.3051635489464793e-07, "logits/chosen": 1.8940259218215942, "logits/rejected": 1.8514759540557861, "logps/chosen": -69.81169128417969, "logps/ref_chosen": -66.48047637939453, "logps/ref_rejected": -93.66386413574219, "logps/rejected": -104.38250732421875, "loss": 0.9885, "margin_dpo/margin_mean": 7.3874335289001465, "margin_dpo/margin_std": 10.99864387512207, "step": 303 }, { "epoch": 0.4595616024187453, "grad_norm": 17.358747482299805, "learning_rate": 3.292634667444117e-07, "logits/chosen": 1.8361141681671143, "logits/rejected": 1.6589012145996094, "logps/chosen": -78.14120483398438, "logps/ref_chosen": -75.53591918945312, "logps/ref_rejected": -91.04656982421875, "logps/rejected": -102.43717956542969, "loss": 0.9562, "margin_dpo/margin_mean": 8.785329818725586, "margin_dpo/margin_std": 9.457986831665039, "step": 304 }, { "epoch": 0.46107331821617537, "grad_norm": 19.037128448486328, "learning_rate": 3.280083614246217e-07, "logits/chosen": 1.2751967906951904, "logits/rejected": 1.3313028812408447, "logps/chosen": -108.29340362548828, "logps/ref_chosen": -100.27046966552734, "logps/ref_rejected": -98.79679870605469, "logps/rejected": -113.17327117919922, "loss": 1.0945, "margin_dpo/margin_mean": 6.353545188903809, "margin_dpo/margin_std": 14.072526931762695, "step": 305 }, { "epoch": 0.46258503401360546, "grad_norm": 18.92023468017578, "learning_rate": 3.267510740432719e-07, "logits/chosen": 1.5249675512313843, "logits/rejected": 1.2982500791549683, "logps/chosen": -67.72550964355469, "logps/ref_chosen": -67.18881225585938, "logps/ref_rejected": -80.10235595703125, "logps/rejected": -86.44146728515625, "loss": 1.0397, "margin_dpo/margin_mean": 5.802410125732422, "margin_dpo/margin_std": 10.138044357299805, "step": 306 }, { "epoch": 0.46409674981103555, "grad_norm": 19.16119384765625, "learning_rate": 3.2549163976939285e-07, "logits/chosen": 1.7671325206756592, "logits/rejected": 1.733433485031128, "logps/chosen": -85.22543334960938, "logps/ref_chosen": -84.34929656982422, "logps/ref_rejected": -77.96357727050781, "logps/rejected": -87.06395721435547, "loss": 1.1699, "margin_dpo/margin_mean": 8.224258422851562, "margin_dpo/margin_std": 12.441762924194336, "step": 307 }, { "epoch": 0.4656084656084656, "grad_norm": 21.289731979370117, "learning_rate": 3.2423009383206874e-07, "logits/chosen": 1.0627365112304688, "logits/rejected": 1.2089284658432007, "logps/chosen": -96.02548217773438, "logps/ref_chosen": -91.96617126464844, "logps/ref_rejected": -110.55294799804688, "logps/rejected": -123.92672729492188, "loss": 1.0756, "margin_dpo/margin_mean": 9.31445598602295, "margin_dpo/margin_std": 12.990707397460938, "step": 308 }, { "epoch": 0.4671201814058957, "grad_norm": 17.65408706665039, "learning_rate": 3.229664715194511e-07, "logits/chosen": 1.3217294216156006, "logits/rejected": 1.3109736442565918, "logps/chosen": -62.925941467285156, "logps/ref_chosen": -58.02919006347656, "logps/ref_rejected": -76.69621276855469, "logps/rejected": -88.94454956054688, "loss": 0.968, "margin_dpo/margin_mean": 7.351587295532227, "margin_dpo/margin_std": 9.38610553741455, "step": 309 }, { "epoch": 0.46863189720332576, "grad_norm": 25.508787155151367, "learning_rate": 3.2170080817777257e-07, "logits/chosen": 1.760181188583374, "logits/rejected": 1.9183385372161865, "logps/chosen": -98.79780578613281, "logps/ref_chosen": -93.83892822265625, "logps/ref_rejected": -83.82522583007812, "logps/rejected": -92.55107116699219, "loss": 1.3837, "margin_dpo/margin_mean": 3.766974449157715, "margin_dpo/margin_std": 10.875737190246582, "step": 310 }, { "epoch": 0.47014361300075586, "grad_norm": 17.4613094329834, "learning_rate": 3.204331392103574e-07, "logits/chosen": 1.3921611309051514, "logits/rejected": 1.4458154439926147, "logps/chosen": -75.35536193847656, "logps/ref_chosen": -73.9760513305664, "logps/ref_rejected": -81.3685302734375, "logps/rejected": -86.09585571289062, "loss": 1.0767, "margin_dpo/margin_mean": 3.3480172157287598, "margin_dpo/margin_std": 8.556022644042969, "step": 311 }, { "epoch": 0.47165532879818595, "grad_norm": 15.76866340637207, "learning_rate": 3.1916350007663176e-07, "logits/chosen": 1.9186828136444092, "logits/rejected": 1.839728832244873, "logps/chosen": -73.713134765625, "logps/ref_chosen": -70.6661605834961, "logps/ref_rejected": -94.29641723632812, "logps/rejected": -104.29425048828125, "loss": 0.9773, "margin_dpo/margin_mean": 6.95084810256958, "margin_dpo/margin_std": 8.404745101928711, "step": 312 }, { "epoch": 0.47316704459561604, "grad_norm": 19.37696647644043, "learning_rate": 3.178919262911314e-07, "logits/chosen": 1.2092010974884033, "logits/rejected": 1.2804292440414429, "logps/chosen": -71.31523132324219, "logps/ref_chosen": -71.98353576660156, "logps/ref_rejected": -64.7225341796875, "logps/rejected": -69.02494812011719, "loss": 1.2548, "margin_dpo/margin_mean": 4.970721244812012, "margin_dpo/margin_std": 6.743402481079102, "step": 313 }, { "epoch": 0.47467876039304613, "grad_norm": 19.60079574584961, "learning_rate": 3.166184534225087e-07, "logits/chosen": 1.2932363748550415, "logits/rejected": 1.177966833114624, "logps/chosen": -77.23876190185547, "logps/ref_chosen": -79.37973022460938, "logps/ref_rejected": -93.06839752197266, "logps/rejected": -100.330810546875, "loss": 1.0147, "margin_dpo/margin_mean": 9.403375625610352, "margin_dpo/margin_std": 10.796396255493164, "step": 314 }, { "epoch": 0.47619047619047616, "grad_norm": 17.703235626220703, "learning_rate": 3.1534311709253723e-07, "logits/chosen": 1.1464133262634277, "logits/rejected": 1.1185472011566162, "logps/chosen": -72.10354614257812, "logps/ref_chosen": -72.04164123535156, "logps/ref_rejected": -68.40371704101562, "logps/rejected": -71.7393569946289, "loss": 1.0334, "margin_dpo/margin_mean": 3.2737433910369873, "margin_dpo/margin_std": 7.896455764770508, "step": 315 }, { "epoch": 0.47770219198790626, "grad_norm": 15.83440113067627, "learning_rate": 3.1406595297511564e-07, "logits/chosen": 1.1275134086608887, "logits/rejected": 0.6892586946487427, "logps/chosen": -78.31683349609375, "logps/ref_chosen": -79.02296447753906, "logps/ref_rejected": -151.3109588623047, "logps/rejected": -159.35098266601562, "loss": 0.9813, "margin_dpo/margin_mean": 8.746146202087402, "margin_dpo/margin_std": 9.00886344909668, "step": 316 }, { "epoch": 0.47921390778533635, "grad_norm": 18.562458038330078, "learning_rate": 3.1278699679526975e-07, "logits/chosen": 1.3129262924194336, "logits/rejected": 1.1802295446395874, "logps/chosen": -63.89330291748047, "logps/ref_chosen": -62.97270584106445, "logps/ref_rejected": -89.73057556152344, "logps/rejected": -95.73944091796875, "loss": 1.0018, "margin_dpo/margin_mean": 5.088255882263184, "margin_dpo/margin_std": 7.3550519943237305, "step": 317 }, { "epoch": 0.48072562358276644, "grad_norm": 18.138851165771484, "learning_rate": 3.1150628432815336e-07, "logits/chosen": 1.7153645753860474, "logits/rejected": 1.8351796865463257, "logps/chosen": -103.4206771850586, "logps/ref_chosen": -101.49869537353516, "logps/ref_rejected": -99.12153625488281, "logps/rejected": -108.7420654296875, "loss": 1.1738, "margin_dpo/margin_mean": 7.698549270629883, "margin_dpo/margin_std": 12.269194602966309, "step": 318 }, { "epoch": 0.48223733938019653, "grad_norm": 17.831144332885742, "learning_rate": 3.1022385139804707e-07, "logits/chosen": 1.0430963039398193, "logits/rejected": 0.8398488759994507, "logps/chosen": -106.6004638671875, "logps/ref_chosen": -105.831298828125, "logps/ref_rejected": -133.7858123779297, "logps/rejected": -143.59652709960938, "loss": 1.0431, "margin_dpo/margin_mean": 9.041553497314453, "margin_dpo/margin_std": 7.210862159729004, "step": 319 }, { "epoch": 0.4837490551776266, "grad_norm": 19.16101837158203, "learning_rate": 3.0893973387735683e-07, "logits/chosen": 1.1535930633544922, "logits/rejected": 1.082245111465454, "logps/chosen": -48.82525634765625, "logps/ref_chosen": -50.85547637939453, "logps/ref_rejected": -71.84254455566406, "logps/rejected": -73.27474975585938, "loss": 1.2741, "margin_dpo/margin_mean": 3.4624290466308594, "margin_dpo/margin_std": 9.584606170654297, "step": 320 }, { "epoch": 0.4852607709750567, "grad_norm": 18.38614273071289, "learning_rate": 3.0765396768561004e-07, "logits/chosen": 1.29036545753479, "logits/rejected": 1.2340288162231445, "logps/chosen": -55.51321792602539, "logps/ref_chosen": -60.787071228027344, "logps/ref_rejected": -61.15034484863281, "logps/rejected": -64.06893157958984, "loss": 1.1247, "margin_dpo/margin_mean": 8.19244384765625, "margin_dpo/margin_std": 10.663275718688965, "step": 321 }, { "epoch": 0.48677248677248675, "grad_norm": 20.19472312927246, "learning_rate": 3.063665887884511e-07, "logits/chosen": 1.7706817388534546, "logits/rejected": 1.4572110176086426, "logps/chosen": -42.1898078918457, "logps/ref_chosen": -46.855037689208984, "logps/ref_rejected": -86.90510559082031, "logps/rejected": -92.86859130859375, "loss": 0.9153, "margin_dpo/margin_mean": 10.628705978393555, "margin_dpo/margin_std": 9.396108627319336, "step": 322 }, { "epoch": 0.48828420256991684, "grad_norm": 19.782289505004883, "learning_rate": 3.0507763319663517e-07, "logits/chosen": 1.3513455390930176, "logits/rejected": 1.3065063953399658, "logps/chosen": -79.96217346191406, "logps/ref_chosen": -79.34443664550781, "logps/ref_rejected": -89.86846160888672, "logps/rejected": -100.29277801513672, "loss": 1.1441, "margin_dpo/margin_mean": 9.806586265563965, "margin_dpo/margin_std": 12.042539596557617, "step": 323 }, { "epoch": 0.4897959183673469, "grad_norm": 19.262086868286133, "learning_rate": 3.0378713696502097e-07, "logits/chosen": 1.3597164154052734, "logits/rejected": 1.350963830947876, "logps/chosen": -53.68002700805664, "logps/ref_chosen": -57.522865295410156, "logps/ref_rejected": -74.0947494506836, "logps/rejected": -77.91177368164062, "loss": 1.0392, "margin_dpo/margin_mean": 7.659856796264648, "margin_dpo/margin_std": 10.965751647949219, "step": 324 }, { "epoch": 0.491307634164777, "grad_norm": 19.02881622314453, "learning_rate": 3.0249513619156206e-07, "logits/chosen": 1.6591523885726929, "logits/rejected": 1.6345276832580566, "logps/chosen": -69.29843139648438, "logps/ref_chosen": -70.73209381103516, "logps/ref_rejected": -78.93636322021484, "logps/rejected": -85.91812133789062, "loss": 1.0967, "margin_dpo/margin_mean": 8.415424346923828, "margin_dpo/margin_std": 10.627230644226074, "step": 325 }, { "epoch": 0.4928193499622071, "grad_norm": 18.081501007080078, "learning_rate": 3.012016670162977e-07, "logits/chosen": 1.549687385559082, "logits/rejected": 1.4641222953796387, "logps/chosen": -97.66859436035156, "logps/ref_chosen": -93.03094482421875, "logps/ref_rejected": -94.15129089355469, "logps/rejected": -103.8548355102539, "loss": 1.1939, "margin_dpo/margin_mean": 5.065885543823242, "margin_dpo/margin_std": 9.203946113586426, "step": 326 }, { "epoch": 0.4943310657596372, "grad_norm": 23.913400650024414, "learning_rate": 2.99906765620341e-07, "logits/chosen": 1.1021369695663452, "logits/rejected": 0.9692546725273132, "logps/chosen": -107.79444885253906, "logps/ref_chosen": -105.44377136230469, "logps/ref_rejected": -96.36316680908203, "logps/rejected": -103.40213012695312, "loss": 1.2476, "margin_dpo/margin_mean": 4.688292026519775, "margin_dpo/margin_std": 13.185343742370605, "step": 327 }, { "epoch": 0.4958427815570673, "grad_norm": 17.27338409423828, "learning_rate": 2.9861046822486766e-07, "logits/chosen": 1.2270727157592773, "logits/rejected": 1.0908598899841309, "logps/chosen": -75.63455963134766, "logps/ref_chosen": -77.95569610595703, "logps/ref_rejected": -99.2158432006836, "logps/rejected": -104.21788024902344, "loss": 1.1346, "margin_dpo/margin_mean": 7.323174476623535, "margin_dpo/margin_std": 7.602724075317383, "step": 328 }, { "epoch": 0.4973544973544973, "grad_norm": 20.16090965270996, "learning_rate": 2.9731281109010253e-07, "logits/chosen": 1.5063313245773315, "logits/rejected": 1.2426934242248535, "logps/chosen": -63.84971618652344, "logps/ref_chosen": -64.19764709472656, "logps/ref_rejected": -100.44914245605469, "logps/rejected": -103.77317810058594, "loss": 1.0886, "margin_dpo/margin_mean": 3.671964406967163, "margin_dpo/margin_std": 7.202153205871582, "step": 329 }, { "epoch": 0.4988662131519274, "grad_norm": 15.157934188842773, "learning_rate": 2.9601383051430505e-07, "logits/chosen": 1.361910104751587, "logits/rejected": 1.322227120399475, "logps/chosen": -71.92733001708984, "logps/ref_chosen": -69.53826904296875, "logps/ref_rejected": -81.59492492675781, "logps/rejected": -86.19032287597656, "loss": 0.9709, "margin_dpo/margin_mean": 2.2063469886779785, "margin_dpo/margin_std": 8.829365730285645, "step": 330 }, { "epoch": 0.5003779289493575, "grad_norm": 16.568151473999023, "learning_rate": 2.947135628327544e-07, "logits/chosen": 1.2606093883514404, "logits/rejected": 1.0372728109359741, "logps/chosen": -79.93902587890625, "logps/ref_chosen": -80.13872528076172, "logps/ref_rejected": -105.05142974853516, "logps/rejected": -118.0305404663086, "loss": 0.9745, "margin_dpo/margin_mean": 13.17880630493164, "margin_dpo/margin_std": 9.773565292358398, "step": 331 }, { "epoch": 0.5018896447467877, "grad_norm": 19.85029411315918, "learning_rate": 2.934120444167326e-07, "logits/chosen": 1.0722594261169434, "logits/rejected": 0.986824631690979, "logps/chosen": -75.63909149169922, "logps/ref_chosen": -74.36831665039062, "logps/ref_rejected": -89.39633178710938, "logps/rejected": -97.04723358154297, "loss": 1.0042, "margin_dpo/margin_mean": 6.380122184753418, "margin_dpo/margin_std": 7.564399242401123, "step": 332 }, { "epoch": 0.5034013605442177, "grad_norm": 15.980324745178223, "learning_rate": 2.921093116725076e-07, "logits/chosen": 1.4670207500457764, "logits/rejected": 1.3264145851135254, "logps/chosen": -93.92146301269531, "logps/ref_chosen": -90.88027954101562, "logps/ref_rejected": -127.32582092285156, "logps/rejected": -140.1451416015625, "loss": 0.85, "margin_dpo/margin_mean": 9.778153419494629, "margin_dpo/margin_std": 7.708182334899902, "step": 333 }, { "epoch": 0.5049130763416477, "grad_norm": 18.896371841430664, "learning_rate": 2.9080540104031484e-07, "logits/chosen": 1.6582601070404053, "logits/rejected": 1.2269431352615356, "logps/chosen": -76.29214477539062, "logps/ref_chosen": -72.8953628540039, "logps/ref_rejected": -103.43342590332031, "logps/rejected": -108.40107727050781, "loss": 1.1728, "margin_dpo/margin_mean": 1.5708723068237305, "margin_dpo/margin_std": 16.17702865600586, "step": 334 }, { "epoch": 0.5064247921390779, "grad_norm": 25.68128776550293, "learning_rate": 2.895003489933375e-07, "logits/chosen": 1.801456093788147, "logits/rejected": 1.4856525659561157, "logps/chosen": -72.10084533691406, "logps/ref_chosen": -73.78713989257812, "logps/ref_rejected": -140.6279296875, "logps/rejected": -154.3983154296875, "loss": 1.1925, "margin_dpo/margin_mean": 15.456686019897461, "margin_dpo/margin_std": 13.234912872314453, "step": 335 }, { "epoch": 0.5079365079365079, "grad_norm": 17.304594039916992, "learning_rate": 2.8819419203668675e-07, "logits/chosen": 1.3642692565917969, "logits/rejected": 1.369320273399353, "logps/chosen": -80.25181579589844, "logps/ref_chosen": -77.84403991699219, "logps/ref_rejected": -101.08308410644531, "logps/rejected": -115.89515686035156, "loss": 0.9955, "margin_dpo/margin_mean": 12.404296875, "margin_dpo/margin_std": 11.414948463439941, "step": 336 }, { "epoch": 0.509448223733938, "grad_norm": 18.648195266723633, "learning_rate": 2.8688696670638053e-07, "logits/chosen": 0.9704691171646118, "logits/rejected": 0.9159014225006104, "logps/chosen": -127.52105712890625, "logps/ref_chosen": -119.64498901367188, "logps/ref_rejected": -112.64798736572266, "logps/rejected": -121.73474884033203, "loss": 1.2175, "margin_dpo/margin_mean": 1.2106828689575195, "margin_dpo/margin_std": 10.663457870483398, "step": 337 }, { "epoch": 0.5109599395313681, "grad_norm": 20.368419647216797, "learning_rate": 2.8557870956832133e-07, "logits/chosen": 1.2665528059005737, "logits/rejected": 1.159436821937561, "logps/chosen": -90.64863586425781, "logps/ref_chosen": -86.41075134277344, "logps/ref_rejected": -116.54347229003906, "logps/rejected": -126.70204162597656, "loss": 1.1131, "margin_dpo/margin_mean": 5.920680999755859, "margin_dpo/margin_std": 11.168838500976562, "step": 338 }, { "epoch": 0.5124716553287982, "grad_norm": 17.986751556396484, "learning_rate": 2.842694572172736e-07, "logits/chosen": 1.5119249820709229, "logits/rejected": 1.1131794452667236, "logps/chosen": -40.407535552978516, "logps/ref_chosen": -41.70279312133789, "logps/ref_rejected": -70.68670654296875, "logps/rejected": -74.76795196533203, "loss": 0.9431, "margin_dpo/margin_mean": 5.376494884490967, "margin_dpo/margin_std": 5.643701553344727, "step": 339 }, { "epoch": 0.5139833711262283, "grad_norm": 16.6380558013916, "learning_rate": 2.8295924627584004e-07, "logits/chosen": 1.3235818147659302, "logits/rejected": 1.1337850093841553, "logps/chosen": -59.04003143310547, "logps/ref_chosen": -55.31536865234375, "logps/ref_rejected": -84.75738525390625, "logps/rejected": -99.1771011352539, "loss": 1.1224, "margin_dpo/margin_mean": 10.695058822631836, "margin_dpo/margin_std": 10.906319618225098, "step": 340 }, { "epoch": 0.5154950869236583, "grad_norm": 18.72040367126465, "learning_rate": 2.816481133934373e-07, "logits/chosen": 1.583848476409912, "logits/rejected": 1.4855204820632935, "logps/chosen": -64.03955078125, "logps/ref_chosen": -64.97042083740234, "logps/ref_rejected": -75.17206573486328, "logps/rejected": -85.15873718261719, "loss": 1.0653, "margin_dpo/margin_mean": 10.917531967163086, "margin_dpo/margin_std": 11.370951652526855, "step": 341 }, { "epoch": 0.5170068027210885, "grad_norm": 17.226045608520508, "learning_rate": 2.8033609524527046e-07, "logits/chosen": 1.1822351217269897, "logits/rejected": 1.2254983186721802, "logps/chosen": -90.85719299316406, "logps/ref_chosen": -90.8426513671875, "logps/ref_rejected": -73.89909362792969, "logps/rejected": -79.44075775146484, "loss": 0.9963, "margin_dpo/margin_mean": 5.527122497558594, "margin_dpo/margin_std": 10.514375686645508, "step": 342 }, { "epoch": 0.5185185185185185, "grad_norm": 20.78652000427246, "learning_rate": 2.7902322853130753e-07, "logits/chosen": 1.3685609102249146, "logits/rejected": 1.4792490005493164, "logps/chosen": -104.35194396972656, "logps/ref_chosen": -100.00821685791016, "logps/ref_rejected": -92.94171905517578, "logps/rejected": -98.23381042480469, "loss": 1.2496, "margin_dpo/margin_mean": 0.9483753442764282, "margin_dpo/margin_std": 10.396292686462402, "step": 343 }, { "epoch": 0.5200302343159486, "grad_norm": 17.205974578857422, "learning_rate": 2.7770954997525274e-07, "logits/chosen": 1.650438904762268, "logits/rejected": 1.3696870803833008, "logps/chosen": -50.81757354736328, "logps/ref_chosen": -48.01771926879883, "logps/ref_rejected": -94.11082458496094, "logps/rejected": -105.68701171875, "loss": 0.9568, "margin_dpo/margin_mean": 8.776339530944824, "margin_dpo/margin_std": 10.478099822998047, "step": 344 }, { "epoch": 0.5215419501133787, "grad_norm": 16.233139038085938, "learning_rate": 2.7639509632351927e-07, "logits/chosen": 1.6912753582000732, "logits/rejected": 1.6366878747940063, "logps/chosen": -60.98334503173828, "logps/ref_chosen": -59.042606353759766, "logps/ref_rejected": -67.68496704101562, "logps/rejected": -73.27864074707031, "loss": 1.0825, "margin_dpo/margin_mean": 3.652935028076172, "margin_dpo/margin_std": 10.401659965515137, "step": 345 }, { "epoch": 0.5230536659108088, "grad_norm": 18.452098846435547, "learning_rate": 2.7507990434420123e-07, "logits/chosen": 1.2100446224212646, "logits/rejected": 1.1001601219177246, "logps/chosen": -74.1183090209961, "logps/ref_chosen": -78.06119537353516, "logps/ref_rejected": -116.67398071289062, "logps/rejected": -128.70452880859375, "loss": 1.088, "margin_dpo/margin_mean": 15.973433494567871, "margin_dpo/margin_std": 11.724790573120117, "step": 346 }, { "epoch": 0.5245653817082389, "grad_norm": 25.73068618774414, "learning_rate": 2.737640108260456e-07, "logits/chosen": 1.905352234840393, "logits/rejected": 1.7833974361419678, "logps/chosen": -85.78656768798828, "logps/ref_chosen": -82.30694580078125, "logps/ref_rejected": -103.71924591064453, "logps/rejected": -113.02760314941406, "loss": 1.1952, "margin_dpo/margin_mean": 5.828741550445557, "margin_dpo/margin_std": 14.944332122802734, "step": 347 }, { "epoch": 0.5260770975056689, "grad_norm": 16.72666358947754, "learning_rate": 2.724474525774229e-07, "logits/chosen": 1.4314143657684326, "logits/rejected": 1.3836251497268677, "logps/chosen": -91.54144287109375, "logps/ref_chosen": -93.23800659179688, "logps/ref_rejected": -103.54133605957031, "logps/rejected": -114.03265380859375, "loss": 1.0127, "margin_dpo/margin_mean": 12.187875747680664, "margin_dpo/margin_std": 13.77078628540039, "step": 348 }, { "epoch": 0.527588813303099, "grad_norm": 17.287527084350586, "learning_rate": 2.711302664252973e-07, "logits/chosen": 1.5420666933059692, "logits/rejected": 1.244254469871521, "logps/chosen": -51.156211853027344, "logps/ref_chosen": -51.72508239746094, "logps/ref_rejected": -91.75698852539062, "logps/rejected": -100.36253356933594, "loss": 1.0151, "margin_dpo/margin_mean": 9.174421310424805, "margin_dpo/margin_std": 9.410051345825195, "step": 349 }, { "epoch": 0.5291005291005291, "grad_norm": 15.985902786254883, "learning_rate": 2.698124892141971e-07, "logits/chosen": 1.5316600799560547, "logits/rejected": 1.4757490158081055, "logps/chosen": -90.90409851074219, "logps/ref_chosen": -88.76136779785156, "logps/ref_rejected": -111.48885345458984, "logps/rejected": -124.17880249023438, "loss": 0.8811, "margin_dpo/margin_mean": 10.547210693359375, "margin_dpo/margin_std": 10.40558910369873, "step": 350 }, { "epoch": 0.5306122448979592, "grad_norm": 17.085918426513672, "learning_rate": 2.6849415780518357e-07, "logits/chosen": 1.3141847848892212, "logits/rejected": 0.971895158290863, "logps/chosen": -70.03700256347656, "logps/ref_chosen": -66.01886749267578, "logps/ref_rejected": -126.26599884033203, "logps/rejected": -139.37261962890625, "loss": 1.0988, "margin_dpo/margin_mean": 9.088493347167969, "margin_dpo/margin_std": 11.360942840576172, "step": 351 }, { "epoch": 0.5321239606953893, "grad_norm": 19.062833786010742, "learning_rate": 2.6717530907482024e-07, "logits/chosen": 1.3318322896957397, "logits/rejected": 1.3409833908081055, "logps/chosen": -91.92908477783203, "logps/ref_chosen": -90.60063171386719, "logps/ref_rejected": -104.5595703125, "logps/rejected": -113.11971282958984, "loss": 1.0903, "margin_dpo/margin_mean": 7.231690406799316, "margin_dpo/margin_std": 10.331891059875488, "step": 352 }, { "epoch": 0.5336356764928194, "grad_norm": 17.326656341552734, "learning_rate": 2.658559799141411e-07, "logits/chosen": 1.5651016235351562, "logits/rejected": 1.2010442018508911, "logps/chosen": -85.78877258300781, "logps/ref_chosen": -82.65994262695312, "logps/ref_rejected": -119.38338470458984, "logps/rejected": -131.18006896972656, "loss": 1.0798, "margin_dpo/margin_mean": 8.667842864990234, "margin_dpo/margin_std": 12.725309371948242, "step": 353 }, { "epoch": 0.5351473922902494, "grad_norm": 16.332246780395508, "learning_rate": 2.6453620722761895e-07, "logits/chosen": 1.270531415939331, "logits/rejected": 1.2129976749420166, "logps/chosen": -39.61259460449219, "logps/ref_chosen": -41.430076599121094, "logps/ref_rejected": -67.54981994628906, "logps/rejected": -74.89820861816406, "loss": 1.04, "margin_dpo/margin_mean": 9.165868759155273, "margin_dpo/margin_std": 10.583430290222168, "step": 354 }, { "epoch": 0.5366591080876795, "grad_norm": 16.691301345825195, "learning_rate": 2.632160279321328e-07, "logits/chosen": 2.2866439819335938, "logits/rejected": 1.8712668418884277, "logps/chosen": -64.62295532226562, "logps/ref_chosen": -65.18504333496094, "logps/ref_rejected": -115.27197265625, "logps/rejected": -122.08123779296875, "loss": 0.9765, "margin_dpo/margin_mean": 7.371346950531006, "margin_dpo/margin_std": 7.073702335357666, "step": 355 }, { "epoch": 0.5381708238851096, "grad_norm": 18.533376693725586, "learning_rate": 2.618954789559356e-07, "logits/chosen": 1.8462498188018799, "logits/rejected": 1.4768980741500854, "logps/chosen": -49.76811981201172, "logps/ref_chosen": -49.94112777709961, "logps/ref_rejected": -89.28327178955078, "logps/rejected": -95.26639556884766, "loss": 1.1715, "margin_dpo/margin_mean": 6.156140327453613, "margin_dpo/margin_std": 17.53639030456543, "step": 356 }, { "epoch": 0.5396825396825397, "grad_norm": 19.41956329345703, "learning_rate": 2.6057459723762076e-07, "logits/chosen": 1.407848596572876, "logits/rejected": 1.011894702911377, "logps/chosen": -82.75194549560547, "logps/ref_chosen": -81.51641082763672, "logps/ref_rejected": -107.62196350097656, "logps/rejected": -118.98323822021484, "loss": 1.1118, "margin_dpo/margin_mean": 10.12573528289795, "margin_dpo/margin_std": 14.654556274414062, "step": 357 }, { "epoch": 0.5411942554799698, "grad_norm": 18.552204132080078, "learning_rate": 2.5925341972508954e-07, "logits/chosen": 0.823056161403656, "logits/rejected": 0.878453254699707, "logps/chosen": -77.65656280517578, "logps/ref_chosen": -77.7448501586914, "logps/ref_rejected": -72.35954284667969, "logps/rejected": -81.51641845703125, "loss": 0.9507, "margin_dpo/margin_mean": 9.245153427124023, "margin_dpo/margin_std": 9.211978912353516, "step": 358 }, { "epoch": 0.5427059712773998, "grad_norm": 23.570837020874023, "learning_rate": 2.579319833745169e-07, "logits/chosen": 1.549546480178833, "logits/rejected": 1.6890381574630737, "logps/chosen": -93.20405578613281, "logps/ref_chosen": -93.38333129882812, "logps/ref_rejected": -91.43463134765625, "logps/rejected": -96.86613464355469, "loss": 1.2416, "margin_dpo/margin_mean": 5.610783576965332, "margin_dpo/margin_std": 12.787965774536133, "step": 359 }, { "epoch": 0.54421768707483, "grad_norm": 16.336427688598633, "learning_rate": 2.5661032514931834e-07, "logits/chosen": 1.0054841041564941, "logits/rejected": 0.6383575797080994, "logps/chosen": -77.46910858154297, "logps/ref_chosen": -77.78421020507812, "logps/ref_rejected": -122.08389282226562, "logps/rejected": -129.6495361328125, "loss": 1.0194, "margin_dpo/margin_mean": 7.880744934082031, "margin_dpo/margin_std": 12.73130989074707, "step": 360 }, { "epoch": 0.54572940287226, "grad_norm": 16.655563354492188, "learning_rate": 2.552884820191154e-07, "logits/chosen": 1.5570170879364014, "logits/rejected": 1.4404245615005493, "logps/chosen": -67.04269409179688, "logps/ref_chosen": -65.98370361328125, "logps/ref_rejected": -72.98002624511719, "logps/rejected": -78.06330871582031, "loss": 0.9804, "margin_dpo/margin_mean": 4.024289608001709, "margin_dpo/margin_std": 8.366500854492188, "step": 361 }, { "epoch": 0.54724111866969, "grad_norm": 19.2061767578125, "learning_rate": 2.53966490958702e-07, "logits/chosen": 1.6192841529846191, "logits/rejected": 1.2942339181900024, "logps/chosen": -73.27638244628906, "logps/ref_chosen": -68.8250732421875, "logps/ref_rejected": -124.96432495117188, "logps/rejected": -135.93898010253906, "loss": 1.0967, "margin_dpo/margin_mean": 6.523345470428467, "margin_dpo/margin_std": 14.696924209594727, "step": 362 }, { "epoch": 0.5487528344671202, "grad_norm": 17.632171630859375, "learning_rate": 2.526443889470099e-07, "logits/chosen": 1.5779058933258057, "logits/rejected": 0.9689816236495972, "logps/chosen": -62.90093231201172, "logps/ref_chosen": -63.45079803466797, "logps/ref_rejected": -153.2322998046875, "logps/rejected": -164.6898651123047, "loss": 0.8967, "margin_dpo/margin_mean": 12.007436752319336, "margin_dpo/margin_std": 12.705961227416992, "step": 363 }, { "epoch": 0.5502645502645502, "grad_norm": 16.575843811035156, "learning_rate": 2.513222129660744e-07, "logits/chosen": 1.5237762928009033, "logits/rejected": 1.188504934310913, "logps/chosen": -59.034629821777344, "logps/ref_chosen": -60.04315185546875, "logps/ref_rejected": -105.874755859375, "logps/rejected": -118.5722885131836, "loss": 1.0026, "margin_dpo/margin_mean": 13.706052780151367, "margin_dpo/margin_std": 15.232439041137695, "step": 364 }, { "epoch": 0.5517762660619804, "grad_norm": 18.13707160949707, "learning_rate": 2.5e-07, "logits/chosen": 1.652744174003601, "logits/rejected": 1.66725754737854, "logps/chosen": -62.1462287902832, "logps/ref_chosen": -64.748291015625, "logps/ref_rejected": -62.80437469482422, "logps/rejected": -68.29106903076172, "loss": 0.9707, "margin_dpo/margin_mean": 8.088747024536133, "margin_dpo/margin_std": 7.421823501586914, "step": 365 }, { "epoch": 0.5532879818594104, "grad_norm": 18.038606643676758, "learning_rate": 2.486777870339255e-07, "logits/chosen": 1.6026561260223389, "logits/rejected": 1.5810301303863525, "logps/chosen": -85.81364440917969, "logps/ref_chosen": -84.550537109375, "logps/ref_rejected": -88.39691925048828, "logps/rejected": -94.60423278808594, "loss": 1.1318, "margin_dpo/margin_mean": 4.944221019744873, "margin_dpo/margin_std": 10.379396438598633, "step": 366 }, { "epoch": 0.5547996976568406, "grad_norm": 17.66802406311035, "learning_rate": 2.4735561105299014e-07, "logits/chosen": 1.4644160270690918, "logits/rejected": 1.1781034469604492, "logps/chosen": -74.05936431884766, "logps/ref_chosen": -73.39277648925781, "logps/ref_rejected": -96.41886901855469, "logps/rejected": -106.19867706298828, "loss": 1.0917, "margin_dpo/margin_mean": 9.11322021484375, "margin_dpo/margin_std": 9.061528205871582, "step": 367 }, { "epoch": 0.5563114134542706, "grad_norm": 18.270389556884766, "learning_rate": 2.46033509041298e-07, "logits/chosen": 0.9589509963989258, "logits/rejected": 1.162531852722168, "logps/chosen": -88.20960235595703, "logps/ref_chosen": -86.24443054199219, "logps/ref_rejected": -68.43778991699219, "logps/rejected": -75.57440948486328, "loss": 1.0945, "margin_dpo/margin_mean": 5.171448707580566, "margin_dpo/margin_std": 11.394579887390137, "step": 368 }, { "epoch": 0.5578231292517006, "grad_norm": 19.684722900390625, "learning_rate": 2.447115179808846e-07, "logits/chosen": 1.1577520370483398, "logits/rejected": 0.8543440103530884, "logps/chosen": -76.26089477539062, "logps/ref_chosen": -74.83604431152344, "logps/ref_rejected": -106.77117919921875, "logps/rejected": -118.65797424316406, "loss": 1.1515, "margin_dpo/margin_mean": 10.461954116821289, "margin_dpo/margin_std": 12.125584602355957, "step": 369 }, { "epoch": 0.5593348450491308, "grad_norm": 17.727333068847656, "learning_rate": 2.4338967485068164e-07, "logits/chosen": 1.8807153701782227, "logits/rejected": 1.8148707151412964, "logps/chosen": -60.55421829223633, "logps/ref_chosen": -64.84715270996094, "logps/ref_rejected": -95.09796142578125, "logps/rejected": -105.71687316894531, "loss": 1.0043, "margin_dpo/margin_mean": 14.911844253540039, "margin_dpo/margin_std": 13.269659042358398, "step": 370 }, { "epoch": 0.5608465608465608, "grad_norm": 17.64663314819336, "learning_rate": 2.420680166254831e-07, "logits/chosen": 2.1669180393218994, "logits/rejected": 2.04794979095459, "logps/chosen": -56.87687301635742, "logps/ref_chosen": -56.208343505859375, "logps/ref_rejected": -94.90827941894531, "logps/rejected": -106.34837341308594, "loss": 1.07, "margin_dpo/margin_mean": 10.771562576293945, "margin_dpo/margin_std": 15.61629867553711, "step": 371 }, { "epoch": 0.562358276643991, "grad_norm": 18.7775821685791, "learning_rate": 2.4074658027491044e-07, "logits/chosen": 1.2795021533966064, "logits/rejected": 0.9924272298812866, "logps/chosen": -69.81268310546875, "logps/ref_chosen": -70.21278381347656, "logps/ref_rejected": -107.09066772460938, "logps/rejected": -117.4203872680664, "loss": 1.2085, "margin_dpo/margin_mean": 10.729829788208008, "margin_dpo/margin_std": 11.587409973144531, "step": 372 }, { "epoch": 0.563869992441421, "grad_norm": 22.780662536621094, "learning_rate": 2.394254027623792e-07, "logits/chosen": 1.9176933765411377, "logits/rejected": 1.636462688446045, "logps/chosen": -90.99224853515625, "logps/ref_chosen": -86.43083190917969, "logps/ref_rejected": -85.3323745727539, "logps/rejected": -95.23503875732422, "loss": 1.039, "margin_dpo/margin_mean": 5.341238975524902, "margin_dpo/margin_std": 12.141375541687012, "step": 373 }, { "epoch": 0.5653817082388511, "grad_norm": 19.62753677368164, "learning_rate": 2.381045210440644e-07, "logits/chosen": 1.353756070137024, "logits/rejected": 0.9825633764266968, "logps/chosen": -81.65689086914062, "logps/ref_chosen": -78.68983459472656, "logps/ref_rejected": -98.20587158203125, "logps/rejected": -114.04132843017578, "loss": 0.8784, "margin_dpo/margin_mean": 12.868396759033203, "margin_dpo/margin_std": 13.804384231567383, "step": 374 }, { "epoch": 0.5668934240362812, "grad_norm": 18.845134735107422, "learning_rate": 2.3678397206786715e-07, "logits/chosen": 1.6013425588607788, "logits/rejected": 1.2837507724761963, "logps/chosen": -47.1046142578125, "logps/ref_chosen": -48.388282775878906, "logps/ref_rejected": -92.34618377685547, "logps/rejected": -98.99795532226562, "loss": 1.0784, "margin_dpo/margin_mean": 7.9354352951049805, "margin_dpo/margin_std": 8.361946105957031, "step": 375 }, { "epoch": 0.5684051398337112, "grad_norm": 16.70038414001465, "learning_rate": 2.3546379277238103e-07, "logits/chosen": 0.9974828958511353, "logits/rejected": 1.1140481233596802, "logps/chosen": -105.5245361328125, "logps/ref_chosen": -100.63041687011719, "logps/ref_rejected": -75.61399841308594, "logps/rejected": -84.88065338134766, "loss": 1.0079, "margin_dpo/margin_mean": 4.372528076171875, "margin_dpo/margin_std": 11.696916580200195, "step": 376 }, { "epoch": 0.5699168556311414, "grad_norm": 16.028461456298828, "learning_rate": 2.3414402008585886e-07, "logits/chosen": 1.9152958393096924, "logits/rejected": 1.869389295578003, "logps/chosen": -81.78144836425781, "logps/ref_chosen": -80.26454162597656, "logps/ref_rejected": -82.82815551757812, "logps/rejected": -92.58141326904297, "loss": 1.0578, "margin_dpo/margin_mean": 8.236349105834961, "margin_dpo/margin_std": 10.057723045349121, "step": 377 }, { "epoch": 0.5714285714285714, "grad_norm": 18.478181838989258, "learning_rate": 2.3282469092517977e-07, "logits/chosen": 1.2696528434753418, "logits/rejected": 1.135990858078003, "logps/chosen": -67.37728881835938, "logps/ref_chosen": -64.12151336669922, "logps/ref_rejected": -92.84522247314453, "logps/rejected": -101.94862365722656, "loss": 1.149, "margin_dpo/margin_mean": 5.847634792327881, "margin_dpo/margin_std": 10.592866897583008, "step": 378 }, { "epoch": 0.5729402872260015, "grad_norm": 21.347606658935547, "learning_rate": 2.3150584219481643e-07, "logits/chosen": 1.6456807851791382, "logits/rejected": 1.4840655326843262, "logps/chosen": -78.78755950927734, "logps/ref_chosen": -74.53916931152344, "logps/ref_rejected": -111.96742248535156, "logps/rejected": -122.64026641845703, "loss": 1.0323, "margin_dpo/margin_mean": 6.424446105957031, "margin_dpo/margin_std": 12.509603500366211, "step": 379 }, { "epoch": 0.5744520030234316, "grad_norm": 15.730467796325684, "learning_rate": 2.3018751078580283e-07, "logits/chosen": 0.9392193555831909, "logits/rejected": 1.0190677642822266, "logps/chosen": -68.32166290283203, "logps/ref_chosen": -65.65042114257812, "logps/ref_rejected": -57.875572204589844, "logps/rejected": -66.88612365722656, "loss": 0.9421, "margin_dpo/margin_mean": 6.339319229125977, "margin_dpo/margin_std": 12.235734939575195, "step": 380 }, { "epoch": 0.5759637188208617, "grad_norm": 19.18428611755371, "learning_rate": 2.288697335747027e-07, "logits/chosen": 1.821494698524475, "logits/rejected": 1.6652073860168457, "logps/chosen": -76.76895904541016, "logps/ref_chosen": -71.0316162109375, "logps/ref_rejected": -82.40918731689453, "logps/rejected": -93.10916137695312, "loss": 1.2166, "margin_dpo/margin_mean": 4.962637901306152, "margin_dpo/margin_std": 12.813655853271484, "step": 381 }, { "epoch": 0.5774754346182918, "grad_norm": 16.963834762573242, "learning_rate": 2.2755254742257706e-07, "logits/chosen": 1.7162137031555176, "logits/rejected": 1.4812861680984497, "logps/chosen": -84.69490051269531, "logps/ref_chosen": -79.11405944824219, "logps/ref_rejected": -122.008056640625, "logps/rejected": -136.27113342285156, "loss": 1.08, "margin_dpo/margin_mean": 8.682241439819336, "margin_dpo/margin_std": 12.171220779418945, "step": 382 }, { "epoch": 0.5789871504157218, "grad_norm": 22.514751434326172, "learning_rate": 2.2623598917395436e-07, "logits/chosen": 1.3546805381774902, "logits/rejected": 1.2783007621765137, "logps/chosen": -69.39689636230469, "logps/ref_chosen": -69.9166259765625, "logps/ref_rejected": -87.328857421875, "logps/rejected": -93.75856018066406, "loss": 1.0655, "margin_dpo/margin_mean": 6.949431896209717, "margin_dpo/margin_std": 9.882552146911621, "step": 383 }, { "epoch": 0.5804988662131519, "grad_norm": 17.967792510986328, "learning_rate": 2.2492009565579875e-07, "logits/chosen": 1.2994017601013184, "logits/rejected": 1.448273777961731, "logps/chosen": -103.30746459960938, "logps/ref_chosen": -99.48637390136719, "logps/ref_rejected": -101.33261108398438, "logps/rejected": -112.26524353027344, "loss": 1.0016, "margin_dpo/margin_mean": 7.111542701721191, "margin_dpo/margin_std": 9.658019065856934, "step": 384 }, { "epoch": 0.582010582010582, "grad_norm": 17.039796829223633, "learning_rate": 2.2360490367648084e-07, "logits/chosen": 1.527607798576355, "logits/rejected": 1.4134703874588013, "logps/chosen": -79.23987579345703, "logps/ref_chosen": -75.85514831542969, "logps/ref_rejected": -82.66242980957031, "logps/rejected": -94.84847259521484, "loss": 0.9526, "margin_dpo/margin_mean": 8.801328659057617, "margin_dpo/margin_std": 11.05715274810791, "step": 385 }, { "epoch": 0.5835222978080121, "grad_norm": 19.3527774810791, "learning_rate": 2.2229045002474724e-07, "logits/chosen": 1.235201120376587, "logits/rejected": 0.9587774872779846, "logps/chosen": -67.49232482910156, "logps/ref_chosen": -67.96981811523438, "logps/ref_rejected": -107.00527954101562, "logps/rejected": -112.47984313964844, "loss": 1.2317, "margin_dpo/margin_mean": 5.952061653137207, "margin_dpo/margin_std": 11.560074806213379, "step": 386 }, { "epoch": 0.5850340136054422, "grad_norm": 16.471900939941406, "learning_rate": 2.209767714686924e-07, "logits/chosen": 1.837686538696289, "logits/rejected": 1.6203699111938477, "logps/chosen": -47.08496856689453, "logps/ref_chosen": -46.420677185058594, "logps/ref_rejected": -87.33722686767578, "logps/rejected": -97.84181213378906, "loss": 0.8997, "margin_dpo/margin_mean": 9.840293884277344, "margin_dpo/margin_std": 13.550289154052734, "step": 387 }, { "epoch": 0.5865457294028723, "grad_norm": 24.3358154296875, "learning_rate": 2.1966390475472954e-07, "logits/chosen": 1.644033670425415, "logits/rejected": 1.3548939228057861, "logps/chosen": -99.03819274902344, "logps/ref_chosen": -94.71730041503906, "logps/ref_rejected": -130.7916259765625, "logps/rejected": -143.21759033203125, "loss": 1.2093, "margin_dpo/margin_mean": 8.105066299438477, "margin_dpo/margin_std": 9.479436874389648, "step": 388 }, { "epoch": 0.5880574452003023, "grad_norm": 16.21053123474121, "learning_rate": 2.1835188660656265e-07, "logits/chosen": 1.765403151512146, "logits/rejected": 1.5489118099212646, "logps/chosen": -82.00344848632812, "logps/ref_chosen": -77.13436889648438, "logps/ref_rejected": -118.94754028320312, "logps/rejected": -132.73802185058594, "loss": 0.9982, "margin_dpo/margin_mean": 8.921403884887695, "margin_dpo/margin_std": 11.89144515991211, "step": 389 }, { "epoch": 0.5895691609977324, "grad_norm": 16.69336700439453, "learning_rate": 2.170407537241599e-07, "logits/chosen": 1.124328374862671, "logits/rejected": 1.0310174226760864, "logps/chosen": -59.94728088378906, "logps/ref_chosen": -60.2330322265625, "logps/ref_rejected": -70.51981353759766, "logps/rejected": -77.9442138671875, "loss": 1.0075, "margin_dpo/margin_mean": 7.710161209106445, "margin_dpo/margin_std": 11.860628128051758, "step": 390 }, { "epoch": 0.5910808767951625, "grad_norm": 15.89005184173584, "learning_rate": 2.1573054278272636e-07, "logits/chosen": 1.4358313083648682, "logits/rejected": 1.3161406517028809, "logps/chosen": -85.41853332519531, "logps/ref_chosen": -85.9833984375, "logps/ref_rejected": -116.30288696289062, "logps/rejected": -127.33718872070312, "loss": 1.0165, "margin_dpo/margin_mean": 11.599164009094238, "margin_dpo/margin_std": 17.371217727661133, "step": 391 }, { "epoch": 0.5925925925925926, "grad_norm": 15.85769271850586, "learning_rate": 2.1442129043167873e-07, "logits/chosen": 1.8632386922836304, "logits/rejected": 1.8789622783660889, "logps/chosen": -77.12300109863281, "logps/ref_chosen": -79.13163757324219, "logps/ref_rejected": -91.26902770996094, "logps/rejected": -100.0257797241211, "loss": 0.8725, "margin_dpo/margin_mean": 10.765390396118164, "margin_dpo/margin_std": 13.134359359741211, "step": 392 }, { "epoch": 0.5941043083900227, "grad_norm": 16.14752769470215, "learning_rate": 2.131130332936195e-07, "logits/chosen": 1.0586001873016357, "logits/rejected": 1.0538451671600342, "logps/chosen": -64.10144805908203, "logps/ref_chosen": -61.6352653503418, "logps/ref_rejected": -68.62921142578125, "logps/rejected": -79.71250915527344, "loss": 0.9557, "margin_dpo/margin_mean": 8.617115020751953, "margin_dpo/margin_std": 11.661966323852539, "step": 393 }, { "epoch": 0.5956160241874527, "grad_norm": 18.7858829498291, "learning_rate": 2.1180580796331323e-07, "logits/chosen": 1.6424164772033691, "logits/rejected": 1.296222448348999, "logps/chosen": -60.317344665527344, "logps/ref_chosen": -58.820316314697266, "logps/ref_rejected": -106.39338684082031, "logps/rejected": -115.24707794189453, "loss": 0.9949, "margin_dpo/margin_mean": 7.356667518615723, "margin_dpo/margin_std": 12.168745040893555, "step": 394 }, { "epoch": 0.5971277399848829, "grad_norm": 17.745750427246094, "learning_rate": 2.104996510066625e-07, "logits/chosen": 1.937403678894043, "logits/rejected": 1.5673989057540894, "logps/chosen": -59.99589920043945, "logps/ref_chosen": -60.08242416381836, "logps/ref_rejected": -108.21504211425781, "logps/rejected": -119.75575256347656, "loss": 1.0411, "margin_dpo/margin_mean": 11.627223014831543, "margin_dpo/margin_std": 9.34223461151123, "step": 395 }, { "epoch": 0.5986394557823129, "grad_norm": 16.138158798217773, "learning_rate": 2.0919459895968517e-07, "logits/chosen": 1.7231922149658203, "logits/rejected": 1.6860530376434326, "logps/chosen": -64.44842529296875, "logps/ref_chosen": -62.94968795776367, "logps/ref_rejected": -74.56437683105469, "logps/rejected": -87.29389190673828, "loss": 0.9368, "margin_dpo/margin_mean": 11.230780601501465, "margin_dpo/margin_std": 8.197953224182129, "step": 396 }, { "epoch": 0.600151171579743, "grad_norm": 22.415613174438477, "learning_rate": 2.078906883274924e-07, "logits/chosen": 1.40608549118042, "logits/rejected": 1.364497184753418, "logps/chosen": -89.64517211914062, "logps/ref_chosen": -86.79851531982422, "logps/ref_rejected": -113.37101745605469, "logps/rejected": -124.23312377929688, "loss": 1.2085, "margin_dpo/margin_mean": 8.015443801879883, "margin_dpo/margin_std": 7.1273274421691895, "step": 397 }, { "epoch": 0.6016628873771731, "grad_norm": 15.636205673217773, "learning_rate": 2.065879555832674e-07, "logits/chosen": 1.3818635940551758, "logits/rejected": 1.0291097164154053, "logps/chosen": -67.93223571777344, "logps/ref_chosen": -65.900146484375, "logps/ref_rejected": -117.08538818359375, "logps/rejected": -128.13636779785156, "loss": 0.9702, "margin_dpo/margin_mean": 9.018890380859375, "margin_dpo/margin_std": 13.53309440612793, "step": 398 }, { "epoch": 0.6031746031746031, "grad_norm": 17.620872497558594, "learning_rate": 2.052864371672457e-07, "logits/chosen": 1.7343003749847412, "logits/rejected": 1.4701778888702393, "logps/chosen": -106.04431915283203, "logps/ref_chosen": -96.72502136230469, "logps/ref_rejected": -151.64376831054688, "logps/rejected": -165.40628051757812, "loss": 0.9853, "margin_dpo/margin_mean": 4.44322395324707, "margin_dpo/margin_std": 10.476129531860352, "step": 399 }, { "epoch": 0.6046863189720333, "grad_norm": 22.48299789428711, "learning_rate": 2.0398616948569493e-07, "logits/chosen": 1.5725784301757812, "logits/rejected": 1.5334935188293457, "logps/chosen": -125.30363464355469, "logps/ref_chosen": -122.26991271972656, "logps/ref_rejected": -103.23331451416016, "logps/rejected": -113.30017852783203, "loss": 1.2091, "margin_dpo/margin_mean": 7.033153057098389, "margin_dpo/margin_std": 7.89906120300293, "step": 400 }, { "epoch": 0.6046863189720333, "eval_logits/chosen": 1.4582184553146362, "eval_logits/rejected": 1.3356250524520874, "eval_logps/chosen": -89.28824615478516, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -104.48871612548828, "eval_loss": 0.5247963666915894, "eval_margin_dpo/margin_mean": 7.2853546142578125, "eval_margin_dpo/margin_std": 11.136839866638184, "eval_runtime": 42.851, "eval_samples_per_second": 53.744, "eval_steps_per_second": 1.68, "step": 400 }, { "epoch": 0.6061980347694633, "grad_norm": 14.39082145690918, "learning_rate": 2.0268718890989752e-07, "logits/chosen": 0.7350183725357056, "logits/rejected": 0.7028823494911194, "logps/chosen": -68.5293960571289, "logps/ref_chosen": -70.25821685791016, "logps/ref_rejected": -80.78495025634766, "logps/rejected": -86.06382751464844, "loss": 0.8404, "margin_dpo/margin_mean": 7.007689952850342, "margin_dpo/margin_std": 10.468082427978516, "step": 401 }, { "epoch": 0.6077097505668935, "grad_norm": 18.517383575439453, "learning_rate": 2.013895317751323e-07, "logits/chosen": 1.7012050151824951, "logits/rejected": 1.3150333166122437, "logps/chosen": -51.0611572265625, "logps/ref_chosen": -52.10100173950195, "logps/ref_rejected": -98.71633911132812, "logps/rejected": -112.17620086669922, "loss": 0.9903, "margin_dpo/margin_mean": 14.499706268310547, "margin_dpo/margin_std": 9.993553161621094, "step": 402 }, { "epoch": 0.6092214663643235, "grad_norm": 17.70839500427246, "learning_rate": 2.0009323437965898e-07, "logits/chosen": 1.7505762577056885, "logits/rejected": 1.4791232347488403, "logps/chosen": -79.9526138305664, "logps/ref_chosen": -80.85928344726562, "logps/ref_rejected": -110.012939453125, "logps/rejected": -122.6014404296875, "loss": 0.9479, "margin_dpo/margin_mean": 13.495168685913086, "margin_dpo/margin_std": 11.853385925292969, "step": 403 }, { "epoch": 0.6107331821617535, "grad_norm": 17.47926902770996, "learning_rate": 1.9879833298370237e-07, "logits/chosen": 1.4046885967254639, "logits/rejected": 1.5411865711212158, "logps/chosen": -108.6570816040039, "logps/ref_chosen": -106.20344543457031, "logps/ref_rejected": -106.75729370117188, "logps/rejected": -118.572265625, "loss": 0.9263, "margin_dpo/margin_mean": 9.361337661743164, "margin_dpo/margin_std": 11.436565399169922, "step": 404 }, { "epoch": 0.6122448979591837, "grad_norm": 15.739611625671387, "learning_rate": 1.975048638084379e-07, "logits/chosen": 1.7689330577850342, "logits/rejected": 1.5532318353652954, "logps/chosen": -63.42049789428711, "logps/ref_chosen": -64.43032836914062, "logps/ref_rejected": -82.855224609375, "logps/rejected": -86.1676025390625, "loss": 1.0393, "margin_dpo/margin_mean": 4.322210788726807, "margin_dpo/margin_std": 10.163808822631836, "step": 405 }, { "epoch": 0.6137566137566137, "grad_norm": 15.536432266235352, "learning_rate": 1.9621286303497914e-07, "logits/chosen": 1.3931666612625122, "logits/rejected": 0.874456524848938, "logps/chosen": -55.357154846191406, "logps/ref_chosen": -56.44206237792969, "logps/ref_rejected": -114.25556945800781, "logps/rejected": -125.14240264892578, "loss": 0.8991, "margin_dpo/margin_mean": 11.971742630004883, "margin_dpo/margin_std": 13.41472053527832, "step": 406 }, { "epoch": 0.6152683295540439, "grad_norm": 22.571401596069336, "learning_rate": 1.9492236680336483e-07, "logits/chosen": 1.9526389837265015, "logits/rejected": 1.9017415046691895, "logps/chosen": -92.21347045898438, "logps/ref_chosen": -89.26659393310547, "logps/ref_rejected": -99.39598846435547, "logps/rejected": -105.9415283203125, "loss": 1.0834, "margin_dpo/margin_mean": 3.5986623764038086, "margin_dpo/margin_std": 11.726037979125977, "step": 407 }, { "epoch": 0.6167800453514739, "grad_norm": 14.365006446838379, "learning_rate": 1.9363341121154895e-07, "logits/chosen": 1.7956515550613403, "logits/rejected": 1.5656187534332275, "logps/chosen": -70.03071594238281, "logps/ref_chosen": -73.04478454589844, "logps/ref_rejected": -100.50680541992188, "logps/rejected": -106.039794921875, "loss": 0.865, "margin_dpo/margin_mean": 8.547052383422852, "margin_dpo/margin_std": 9.32919692993164, "step": 408 }, { "epoch": 0.618291761148904, "grad_norm": 19.600969314575195, "learning_rate": 1.9234603231438994e-07, "logits/chosen": 1.5324292182922363, "logits/rejected": 1.6931285858154297, "logps/chosen": -87.93988037109375, "logps/ref_chosen": -85.88887786865234, "logps/ref_rejected": -71.50348663330078, "logps/rejected": -77.88395690917969, "loss": 1.2284, "margin_dpo/margin_mean": 4.329472541809082, "margin_dpo/margin_std": 10.474832534790039, "step": 409 }, { "epoch": 0.6198034769463341, "grad_norm": 20.610931396484375, "learning_rate": 1.9106026612264315e-07, "logits/chosen": 1.621500015258789, "logits/rejected": 1.6648962497711182, "logps/chosen": -59.008819580078125, "logps/ref_chosen": -58.368202209472656, "logps/ref_rejected": -66.27806091308594, "logps/rejected": -73.01986694335938, "loss": 0.9819, "margin_dpo/margin_mean": 6.101186752319336, "margin_dpo/margin_std": 11.882577896118164, "step": 410 }, { "epoch": 0.6213151927437641, "grad_norm": 25.92525863647461, "learning_rate": 1.8977614860195296e-07, "logits/chosen": 1.2717056274414062, "logits/rejected": 1.2971036434173584, "logps/chosen": -89.50897216796875, "logps/ref_chosen": -88.38739013671875, "logps/ref_rejected": -79.86773681640625, "logps/rejected": -91.71290588378906, "loss": 1.0022, "margin_dpo/margin_mean": 10.723580360412598, "margin_dpo/margin_std": 13.953067779541016, "step": 411 }, { "epoch": 0.6228269085411943, "grad_norm": 17.894712448120117, "learning_rate": 1.8849371567184662e-07, "logits/chosen": 1.5564298629760742, "logits/rejected": 1.70186185836792, "logps/chosen": -66.75880432128906, "logps/ref_chosen": -67.69011688232422, "logps/ref_rejected": -54.996158599853516, "logps/rejected": -64.54686737060547, "loss": 1.0023, "margin_dpo/margin_mean": 10.482020378112793, "margin_dpo/margin_std": 13.3348388671875, "step": 412 }, { "epoch": 0.6243386243386243, "grad_norm": 22.179824829101562, "learning_rate": 1.872130032047302e-07, "logits/chosen": 0.9876876473426819, "logits/rejected": 0.6877783536911011, "logps/chosen": -95.60763549804688, "logps/ref_chosen": -88.13890838623047, "logps/ref_rejected": -103.29592895507812, "logps/rejected": -116.02366638183594, "loss": 1.1014, "margin_dpo/margin_mean": 5.25900411605835, "margin_dpo/margin_std": 15.124969482421875, "step": 413 }, { "epoch": 0.6258503401360545, "grad_norm": 15.521639823913574, "learning_rate": 1.8593404702488436e-07, "logits/chosen": 1.2975716590881348, "logits/rejected": 1.1770504713058472, "logps/chosen": -88.85011291503906, "logps/ref_chosen": -83.41361999511719, "logps/ref_rejected": -109.53297424316406, "logps/rejected": -126.68208312988281, "loss": 0.9736, "margin_dpo/margin_mean": 11.712615966796875, "margin_dpo/margin_std": 11.007841110229492, "step": 414 }, { "epoch": 0.6273620559334845, "grad_norm": 19.120716094970703, "learning_rate": 1.846568829074628e-07, "logits/chosen": 1.0723220109939575, "logits/rejected": 1.2517869472503662, "logps/chosen": -74.63948059082031, "logps/ref_chosen": -72.93316650390625, "logps/ref_rejected": -56.376548767089844, "logps/rejected": -67.72654724121094, "loss": 1.1127, "margin_dpo/margin_mean": 9.643697738647461, "margin_dpo/margin_std": 14.639366149902344, "step": 415 }, { "epoch": 0.6288737717309146, "grad_norm": 19.14404296875, "learning_rate": 1.8338154657749128e-07, "logits/chosen": 1.4537324905395508, "logits/rejected": 1.2070437669754028, "logps/chosen": -76.9991226196289, "logps/ref_chosen": -72.59295654296875, "logps/ref_rejected": -118.85919189453125, "logps/rejected": -130.78631591796875, "loss": 1.1947, "margin_dpo/margin_mean": 7.520954608917236, "margin_dpo/margin_std": 15.016427993774414, "step": 416 }, { "epoch": 0.6303854875283447, "grad_norm": 17.6398868560791, "learning_rate": 1.8210807370886849e-07, "logits/chosen": 1.7790758609771729, "logits/rejected": 1.7406165599822998, "logps/chosen": -98.33168029785156, "logps/ref_chosen": -94.0817642211914, "logps/ref_rejected": -86.61659240722656, "logps/rejected": -99.73716735839844, "loss": 0.9483, "margin_dpo/margin_mean": 8.870662689208984, "margin_dpo/margin_std": 17.66168785095215, "step": 417 }, { "epoch": 0.6318972033257747, "grad_norm": 21.657602310180664, "learning_rate": 1.8083649992336825e-07, "logits/chosen": 2.0443997383117676, "logits/rejected": 1.9017527103424072, "logps/chosen": -74.12547302246094, "logps/ref_chosen": -72.66082763671875, "logps/ref_rejected": -96.8029556274414, "logps/rejected": -107.14457702636719, "loss": 1.2497, "margin_dpo/margin_mean": 8.87697982788086, "margin_dpo/margin_std": 13.649295806884766, "step": 418 }, { "epoch": 0.6334089191232048, "grad_norm": 15.388176918029785, "learning_rate": 1.7956686078964255e-07, "logits/chosen": 1.307974934577942, "logits/rejected": 0.9341150522232056, "logps/chosen": -78.28793334960938, "logps/ref_chosen": -76.53992462158203, "logps/ref_rejected": -96.74131774902344, "logps/rejected": -107.10330963134766, "loss": 0.8231, "margin_dpo/margin_mean": 8.61398983001709, "margin_dpo/margin_std": 10.405679702758789, "step": 419 }, { "epoch": 0.6349206349206349, "grad_norm": 19.34682273864746, "learning_rate": 1.782991918222275e-07, "logits/chosen": 1.333677053451538, "logits/rejected": 1.3203482627868652, "logps/chosen": -88.46221923828125, "logps/ref_chosen": -78.37210845947266, "logps/ref_rejected": -86.08318328857422, "logps/rejected": -98.54408264160156, "loss": 1.1776, "margin_dpo/margin_mean": 2.3707919120788574, "margin_dpo/margin_std": 12.83182144165039, "step": 420 }, { "epoch": 0.636432350718065, "grad_norm": 19.5356388092041, "learning_rate": 1.7703352848054887e-07, "logits/chosen": 2.035388946533203, "logits/rejected": 1.4979362487792969, "logps/chosen": -54.49224853515625, "logps/ref_chosen": -54.065673828125, "logps/ref_rejected": -114.31968688964844, "logps/rejected": -126.77404022216797, "loss": 1.2293, "margin_dpo/margin_mean": 12.027775764465332, "margin_dpo/margin_std": 16.751602172851562, "step": 421 }, { "epoch": 0.6379440665154951, "grad_norm": 19.82470703125, "learning_rate": 1.7576990616793137e-07, "logits/chosen": 2.170444965362549, "logits/rejected": 1.9144538640975952, "logps/chosen": -87.58007049560547, "logps/ref_chosen": -87.74325561523438, "logps/ref_rejected": -113.90839385986328, "logps/rejected": -124.56327819824219, "loss": 1.0623, "margin_dpo/margin_mean": 10.818069458007812, "margin_dpo/margin_std": 14.527170181274414, "step": 422 }, { "epoch": 0.6394557823129252, "grad_norm": 18.15706443786621, "learning_rate": 1.745083602306071e-07, "logits/chosen": 1.7937231063842773, "logits/rejected": 1.7060699462890625, "logps/chosen": -88.28042602539062, "logps/ref_chosen": -85.32998657226562, "logps/ref_rejected": -75.05231475830078, "logps/rejected": -84.42832946777344, "loss": 1.0096, "margin_dpo/margin_mean": 6.425585746765137, "margin_dpo/margin_std": 13.523710250854492, "step": 423 }, { "epoch": 0.6409674981103552, "grad_norm": 17.072818756103516, "learning_rate": 1.7324892595672804e-07, "logits/chosen": 1.5514662265777588, "logits/rejected": 1.530092477798462, "logps/chosen": -60.43510437011719, "logps/ref_chosen": -58.31544494628906, "logps/ref_rejected": -94.3969955444336, "logps/rejected": -104.16131591796875, "loss": 0.8908, "margin_dpo/margin_mean": 7.644657611846924, "margin_dpo/margin_std": 12.729293823242188, "step": 424 }, { "epoch": 0.6424792139077853, "grad_norm": 17.55316162109375, "learning_rate": 1.7199163857537824e-07, "logits/chosen": 1.324086308479309, "logits/rejected": 1.2915505170822144, "logps/chosen": -92.78572845458984, "logps/ref_chosen": -89.6885986328125, "logps/ref_rejected": -98.63832092285156, "logps/rejected": -110.48238372802734, "loss": 0.9377, "margin_dpo/margin_mean": 8.746915817260742, "margin_dpo/margin_std": 12.064696311950684, "step": 425 }, { "epoch": 0.6439909297052154, "grad_norm": 23.180673599243164, "learning_rate": 1.7073653325558828e-07, "logits/chosen": 1.3505516052246094, "logits/rejected": 1.342008113861084, "logps/chosen": -101.0322494506836, "logps/ref_chosen": -95.77278900146484, "logps/ref_rejected": -70.38717651367188, "logps/rejected": -81.29769897460938, "loss": 1.3358, "margin_dpo/margin_mean": 5.651068687438965, "margin_dpo/margin_std": 11.569957733154297, "step": 426 }, { "epoch": 0.6455026455026455, "grad_norm": 18.770654678344727, "learning_rate": 1.6948364510535218e-07, "logits/chosen": 1.204085111618042, "logits/rejected": 1.2493438720703125, "logps/chosen": -81.01364135742188, "logps/ref_chosen": -78.994140625, "logps/ref_rejected": -90.12332916259766, "logps/rejected": -97.1053237915039, "loss": 1.0118, "margin_dpo/margin_mean": 4.962490081787109, "margin_dpo/margin_std": 11.343255996704102, "step": 427 }, { "epoch": 0.6470143613000756, "grad_norm": 20.1258602142334, "learning_rate": 1.6823300917064458e-07, "logits/chosen": 1.3591469526290894, "logits/rejected": 1.594498634338379, "logps/chosen": -106.14566802978516, "logps/ref_chosen": -99.87548828125, "logps/ref_rejected": -86.27838134765625, "logps/rejected": -95.73759460449219, "loss": 1.0193, "margin_dpo/margin_mean": 3.1890363693237305, "margin_dpo/margin_std": 14.035361289978027, "step": 428 }, { "epoch": 0.6485260770975056, "grad_norm": 20.07024383544922, "learning_rate": 1.669846604344412e-07, "logits/chosen": 1.2506906986236572, "logits/rejected": 1.450724720954895, "logps/chosen": -120.5801773071289, "logps/ref_chosen": -113.89552307128906, "logps/ref_rejected": -69.47867584228516, "logps/rejected": -82.81956481933594, "loss": 1.0805, "margin_dpo/margin_mean": 6.6562347412109375, "margin_dpo/margin_std": 8.949975967407227, "step": 429 }, { "epoch": 0.6500377928949358, "grad_norm": 16.58482551574707, "learning_rate": 1.6573863381573954e-07, "logits/chosen": 1.2297159433364868, "logits/rejected": 1.2015564441680908, "logps/chosen": -71.74535369873047, "logps/ref_chosen": -71.97297668457031, "logps/ref_rejected": -74.14552307128906, "logps/rejected": -87.13224029541016, "loss": 0.9543, "margin_dpo/margin_mean": 13.214336395263672, "margin_dpo/margin_std": 14.50355339050293, "step": 430 }, { "epoch": 0.6515495086923658, "grad_norm": 19.001174926757812, "learning_rate": 1.6449496416858282e-07, "logits/chosen": 0.9143924713134766, "logits/rejected": 0.6924408674240112, "logps/chosen": -48.290382385253906, "logps/ref_chosen": -45.253562927246094, "logps/ref_rejected": -70.72871398925781, "logps/rejected": -80.62155151367188, "loss": 1.0729, "margin_dpo/margin_mean": 6.856022834777832, "margin_dpo/margin_std": 14.329547882080078, "step": 431 }, { "epoch": 0.6530612244897959, "grad_norm": 19.789640426635742, "learning_rate": 1.632536862810844e-07, "logits/chosen": 1.4070956707000732, "logits/rejected": 1.6601455211639404, "logps/chosen": -99.5880126953125, "logps/ref_chosen": -91.55174255371094, "logps/ref_rejected": -77.03479766845703, "logps/rejected": -84.5213394165039, "loss": 1.0904, "margin_dpo/margin_mean": -0.549723744392395, "margin_dpo/margin_std": 13.010431289672852, "step": 432 }, { "epoch": 0.654572940287226, "grad_norm": 16.187782287597656, "learning_rate": 1.6201483487445515e-07, "logits/chosen": 1.7537860870361328, "logits/rejected": 1.6819634437561035, "logps/chosen": -95.52337646484375, "logps/ref_chosen": -91.60700225830078, "logps/ref_rejected": -95.3456802368164, "logps/rejected": -105.0643310546875, "loss": 0.9705, "margin_dpo/margin_mean": 5.802274703979492, "margin_dpo/margin_std": 14.144033432006836, "step": 433 }, { "epoch": 0.656084656084656, "grad_norm": 16.793041229248047, "learning_rate": 1.6077844460203204e-07, "logits/chosen": 1.751226782798767, "logits/rejected": 1.6085643768310547, "logps/chosen": -74.38494873046875, "logps/ref_chosen": -72.4745101928711, "logps/ref_rejected": -86.73628997802734, "logps/rejected": -93.55900573730469, "loss": 0.9884, "margin_dpo/margin_mean": 4.912266731262207, "margin_dpo/margin_std": 16.100318908691406, "step": 434 }, { "epoch": 0.6575963718820862, "grad_norm": 17.793743133544922, "learning_rate": 1.5954455004830878e-07, "logits/chosen": 1.5350686311721802, "logits/rejected": 1.503602385520935, "logps/chosen": -75.05659484863281, "logps/ref_chosen": -72.75701904296875, "logps/ref_rejected": -71.77749633789062, "logps/rejected": -83.84210968017578, "loss": 1.0668, "margin_dpo/margin_mean": 9.76504135131836, "margin_dpo/margin_std": 10.812267303466797, "step": 435 }, { "epoch": 0.6591080876795162, "grad_norm": 17.49306869506836, "learning_rate": 1.5831318572796847e-07, "logits/chosen": 1.3119710683822632, "logits/rejected": 1.4872088432312012, "logps/chosen": -69.15885925292969, "logps/ref_chosen": -71.40719604492188, "logps/ref_rejected": -64.65544128417969, "logps/rejected": -73.23320770263672, "loss": 1.0535, "margin_dpo/margin_mean": 10.826096534729004, "margin_dpo/margin_std": 10.449188232421875, "step": 436 }, { "epoch": 0.6606198034769464, "grad_norm": 21.246532440185547, "learning_rate": 1.5708438608491815e-07, "logits/chosen": 1.5471603870391846, "logits/rejected": 1.2806159257888794, "logps/chosen": -88.1977310180664, "logps/ref_chosen": -81.06645202636719, "logps/ref_rejected": -126.76922607421875, "logps/rejected": -142.19287109375, "loss": 1.1814, "margin_dpo/margin_mean": 8.292359352111816, "margin_dpo/margin_std": 17.37850570678711, "step": 437 }, { "epoch": 0.6621315192743764, "grad_norm": 15.043370246887207, "learning_rate": 1.558581854913253e-07, "logits/chosen": 1.3914852142333984, "logits/rejected": 1.0753388404846191, "logps/chosen": -56.55994415283203, "logps/ref_chosen": -53.102054595947266, "logps/ref_rejected": -103.98554992675781, "logps/rejected": -114.4327392578125, "loss": 0.9652, "margin_dpo/margin_mean": 6.989290237426758, "margin_dpo/margin_std": 13.30164623260498, "step": 438 }, { "epoch": 0.6636432350718064, "grad_norm": 17.945087432861328, "learning_rate": 1.5463461824665658e-07, "logits/chosen": 1.9131124019622803, "logits/rejected": 1.7442913055419922, "logps/chosen": -74.43425750732422, "logps/ref_chosen": -73.38117980957031, "logps/ref_rejected": -103.42163848876953, "logps/rejected": -115.60191345214844, "loss": 0.9336, "margin_dpo/margin_mean": 11.12718391418457, "margin_dpo/margin_std": 11.015290260314941, "step": 439 }, { "epoch": 0.6651549508692366, "grad_norm": 15.192266464233398, "learning_rate": 1.534137185767178e-07, "logits/chosen": 1.0925568342208862, "logits/rejected": 0.6252709627151489, "logps/chosen": -75.656005859375, "logps/ref_chosen": -73.87025451660156, "logps/ref_rejected": -124.6444320678711, "logps/rejected": -143.79974365234375, "loss": 0.7814, "margin_dpo/margin_mean": 17.36956214904785, "margin_dpo/margin_std": 11.921714782714844, "step": 440 }, { "epoch": 0.6666666666666666, "grad_norm": 16.522050857543945, "learning_rate": 1.521955206326976e-07, "logits/chosen": 1.3481934070587158, "logits/rejected": 0.9532965421676636, "logps/chosen": -60.32987976074219, "logps/ref_chosen": -60.727577209472656, "logps/ref_rejected": -103.00782775878906, "logps/rejected": -112.31686401367188, "loss": 0.922, "margin_dpo/margin_mean": 9.706727981567383, "margin_dpo/margin_std": 8.411361694335938, "step": 441 }, { "epoch": 0.6681783824640968, "grad_norm": 19.764413833618164, "learning_rate": 1.5098005849021078e-07, "logits/chosen": 1.9329787492752075, "logits/rejected": 1.8003835678100586, "logps/chosen": -102.09831237792969, "logps/ref_chosen": -98.06529998779297, "logps/ref_rejected": -132.149169921875, "logps/rejected": -148.95257568359375, "loss": 1.0234, "margin_dpo/margin_mean": 12.77038288116455, "margin_dpo/margin_std": 12.934089660644531, "step": 442 }, { "epoch": 0.6696900982615268, "grad_norm": 18.58302116394043, "learning_rate": 1.4976736614834662e-07, "logits/chosen": 0.9382685422897339, "logits/rejected": 0.789872407913208, "logps/chosen": -100.74476623535156, "logps/ref_chosen": -94.45551300048828, "logps/ref_rejected": -105.468017578125, "logps/rejected": -119.6545181274414, "loss": 0.9576, "margin_dpo/margin_mean": 7.897246360778809, "margin_dpo/margin_std": 12.519614219665527, "step": 443 }, { "epoch": 0.671201814058957, "grad_norm": 21.96334457397461, "learning_rate": 1.4855747752871654e-07, "logits/chosen": 1.5991406440734863, "logits/rejected": 1.4715348482131958, "logps/chosen": -91.892333984375, "logps/ref_chosen": -84.40686798095703, "logps/ref_rejected": -104.80293273925781, "logps/rejected": -115.50108337402344, "loss": 1.2869, "margin_dpo/margin_mean": 3.2126736640930176, "margin_dpo/margin_std": 8.641006469726562, "step": 444 }, { "epoch": 0.672713529856387, "grad_norm": 20.579483032226562, "learning_rate": 1.473504264745062e-07, "logits/chosen": 1.5812016725540161, "logits/rejected": 1.6715682744979858, "logps/chosen": -111.17323303222656, "logps/ref_chosen": -101.3098373413086, "logps/ref_rejected": -82.13021850585938, "logps/rejected": -100.5653305053711, "loss": 0.9818, "margin_dpo/margin_mean": 8.571715354919434, "margin_dpo/margin_std": 14.228178977966309, "step": 445 }, { "epoch": 0.674225245653817, "grad_norm": 15.240901947021484, "learning_rate": 1.461462467495284e-07, "logits/chosen": 1.3102160692214966, "logits/rejected": 1.2892918586730957, "logps/chosen": -81.38726806640625, "logps/ref_chosen": -79.57562255859375, "logps/ref_rejected": -108.42478942871094, "logps/rejected": -122.22372436523438, "loss": 0.7873, "margin_dpo/margin_mean": 11.987289428710938, "margin_dpo/margin_std": 6.825370788574219, "step": 446 }, { "epoch": 0.6757369614512472, "grad_norm": 18.29388999938965, "learning_rate": 1.4494497203727843e-07, "logits/chosen": 1.6509873867034912, "logits/rejected": 1.2109224796295166, "logps/chosen": -84.00570678710938, "logps/ref_chosen": -81.28489685058594, "logps/ref_rejected": -103.32322692871094, "logps/rejected": -114.07003784179688, "loss": 0.9732, "margin_dpo/margin_mean": 8.025994300842285, "margin_dpo/margin_std": 11.31793212890625, "step": 447 }, { "epoch": 0.6772486772486772, "grad_norm": 16.292701721191406, "learning_rate": 1.4374663593999256e-07, "logits/chosen": 2.086494207382202, "logits/rejected": 1.8563368320465088, "logps/chosen": -67.54390716552734, "logps/ref_chosen": -66.38812255859375, "logps/ref_rejected": -105.30958557128906, "logps/rejected": -117.88861083984375, "loss": 1.0249, "margin_dpo/margin_mean": 11.423235893249512, "margin_dpo/margin_std": 10.760120391845703, "step": 448 }, { "epoch": 0.6787603930461074, "grad_norm": 24.82891845703125, "learning_rate": 1.4255127197770707e-07, "logits/chosen": 0.9788494110107422, "logits/rejected": 0.8106634616851807, "logps/chosen": -97.54696655273438, "logps/ref_chosen": -89.28851318359375, "logps/ref_rejected": -111.45301055908203, "logps/rejected": -124.34387969970703, "loss": 1.2574, "margin_dpo/margin_mean": 4.6324143409729, "margin_dpo/margin_std": 9.226218223571777, "step": 449 }, { "epoch": 0.6802721088435374, "grad_norm": 15.677437782287598, "learning_rate": 1.4135891358732205e-07, "logits/chosen": 1.2450978755950928, "logits/rejected": 0.767084538936615, "logps/chosen": -52.601375579833984, "logps/ref_chosen": -49.781455993652344, "logps/ref_rejected": -104.49571228027344, "logps/rejected": -117.5496826171875, "loss": 1.0703, "margin_dpo/margin_mean": 10.234039306640625, "margin_dpo/margin_std": 10.402769088745117, "step": 450 }, { "epoch": 0.6817838246409675, "grad_norm": 17.859636306762695, "learning_rate": 1.4016959412166437e-07, "logits/chosen": 1.4652822017669678, "logits/rejected": 1.0275336503982544, "logps/chosen": -95.54891967773438, "logps/ref_chosen": -91.53657531738281, "logps/ref_rejected": -132.09930419921875, "logps/rejected": -144.7137451171875, "loss": 1.0768, "margin_dpo/margin_mean": 8.602102279663086, "margin_dpo/margin_std": 9.885042190551758, "step": 451 }, { "epoch": 0.6832955404383976, "grad_norm": 19.282310485839844, "learning_rate": 1.3898334684855645e-07, "logits/chosen": 0.9785106182098389, "logits/rejected": 0.7253472805023193, "logps/chosen": -81.38229370117188, "logps/ref_chosen": -80.01558685302734, "logps/ref_rejected": -115.30209350585938, "logps/rejected": -130.41009521484375, "loss": 1.0413, "margin_dpo/margin_mean": 13.741291046142578, "margin_dpo/margin_std": 11.923042297363281, "step": 452 }, { "epoch": 0.6848072562358276, "grad_norm": 19.360755920410156, "learning_rate": 1.3780020494988445e-07, "logits/chosen": 1.247298240661621, "logits/rejected": 1.059384822845459, "logps/chosen": -68.79821014404297, "logps/ref_chosen": -69.91822814941406, "logps/ref_rejected": -84.69450378417969, "logps/rejected": -93.41853332519531, "loss": 1.107, "margin_dpo/margin_mean": 9.844054222106934, "margin_dpo/margin_std": 13.889093399047852, "step": 453 }, { "epoch": 0.6863189720332578, "grad_norm": 15.253005027770996, "learning_rate": 1.366202015206706e-07, "logits/chosen": 1.4422881603240967, "logits/rejected": 1.469926118850708, "logps/chosen": -63.08732986450195, "logps/ref_chosen": -63.27928161621094, "logps/ref_rejected": -66.56890869140625, "logps/rejected": -73.28570556640625, "loss": 1.0178, "margin_dpo/margin_mean": 6.908746719360352, "margin_dpo/margin_std": 14.997026443481445, "step": 454 }, { "epoch": 0.6878306878306878, "grad_norm": 17.045881271362305, "learning_rate": 1.354433695681474e-07, "logits/chosen": 1.3675647974014282, "logits/rejected": 1.2833292484283447, "logps/chosen": -93.13310241699219, "logps/ref_chosen": -89.64226531982422, "logps/ref_rejected": -99.01678466796875, "logps/rejected": -117.45805358886719, "loss": 0.871, "margin_dpo/margin_mean": 14.950428009033203, "margin_dpo/margin_std": 12.206085205078125, "step": 455 }, { "epoch": 0.6893424036281179, "grad_norm": 17.601259231567383, "learning_rate": 1.3426974201083439e-07, "logits/chosen": 1.3527313470840454, "logits/rejected": 1.0486574172973633, "logps/chosen": -61.5137939453125, "logps/ref_chosen": -59.692848205566406, "logps/ref_rejected": -104.20835876464844, "logps/rejected": -118.35098266601562, "loss": 0.952, "margin_dpo/margin_mean": 12.321682929992676, "margin_dpo/margin_std": 11.274205207824707, "step": 456 }, { "epoch": 0.690854119425548, "grad_norm": 16.536128997802734, "learning_rate": 1.3309935167761717e-07, "logits/chosen": 1.3329854011535645, "logits/rejected": 1.106091856956482, "logps/chosen": -66.14324951171875, "logps/ref_chosen": -56.836063385009766, "logps/ref_rejected": -85.02915954589844, "logps/rejected": -95.58209228515625, "loss": 1.0103, "margin_dpo/margin_mean": 1.2457318305969238, "margin_dpo/margin_std": 7.310519218444824, "step": 457 }, { "epoch": 0.6923658352229781, "grad_norm": 17.41718864440918, "learning_rate": 1.3193223130682936e-07, "logits/chosen": 1.1356405019760132, "logits/rejected": 0.9141072034835815, "logps/chosen": -81.12027740478516, "logps/ref_chosen": -78.43865966796875, "logps/ref_rejected": -89.72425079345703, "logps/rejected": -97.72470092773438, "loss": 0.9036, "margin_dpo/margin_mean": 5.318826198577881, "margin_dpo/margin_std": 10.594809532165527, "step": 458 }, { "epoch": 0.6938775510204082, "grad_norm": 18.433881759643555, "learning_rate": 1.3076841354533658e-07, "logits/chosen": 1.8431639671325684, "logits/rejected": 1.832322359085083, "logps/chosen": -92.86773681640625, "logps/ref_chosen": -89.75145721435547, "logps/ref_rejected": -127.85643005371094, "logps/rejected": -141.12796020507812, "loss": 0.9665, "margin_dpo/margin_mean": 10.155257225036621, "margin_dpo/margin_std": 11.597003936767578, "step": 459 }, { "epoch": 0.6953892668178382, "grad_norm": 17.372512817382812, "learning_rate": 1.2960793094762345e-07, "logits/chosen": 1.610877513885498, "logits/rejected": 1.0858798027038574, "logps/chosen": -96.94534301757812, "logps/ref_chosen": -92.05219268798828, "logps/ref_rejected": -121.02523803710938, "logps/rejected": -135.60598754882812, "loss": 0.8881, "margin_dpo/margin_mean": 9.687601089477539, "margin_dpo/margin_std": 12.625102996826172, "step": 460 }, { "epoch": 0.6969009826152683, "grad_norm": 17.028596878051758, "learning_rate": 1.2845081597488286e-07, "logits/chosen": 1.9178948402404785, "logits/rejected": 1.6595206260681152, "logps/chosen": -79.25119018554688, "logps/ref_chosen": -80.09500122070312, "logps/ref_rejected": -109.53080749511719, "logps/rejected": -117.13710021972656, "loss": 0.9182, "margin_dpo/margin_mean": 8.450096130371094, "margin_dpo/margin_std": 6.928763389587402, "step": 461 }, { "epoch": 0.6984126984126984, "grad_norm": 16.272708892822266, "learning_rate": 1.27297100994108e-07, "logits/chosen": 1.556300163269043, "logits/rejected": 1.5010604858398438, "logps/chosen": -79.9306640625, "logps/ref_chosen": -80.1484375, "logps/ref_rejected": -85.07381439208984, "logps/rejected": -100.82209014892578, "loss": 0.9328, "margin_dpo/margin_mean": 15.96605110168457, "margin_dpo/margin_std": 13.421406745910645, "step": 462 }, { "epoch": 0.6999244142101285, "grad_norm": 21.996517181396484, "learning_rate": 1.2614681827718695e-07, "logits/chosen": 1.8423978090286255, "logits/rejected": 1.8831135034561157, "logps/chosen": -82.12452697753906, "logps/ref_chosen": -79.72691345214844, "logps/ref_rejected": -66.49798583984375, "logps/rejected": -79.49771118164062, "loss": 1.0843, "margin_dpo/margin_mean": 10.602113723754883, "margin_dpo/margin_std": 12.234289169311523, "step": 463 }, { "epoch": 0.7014361300075586, "grad_norm": 19.246137619018555, "learning_rate": 1.2500000000000005e-07, "logits/chosen": 1.0019738674163818, "logits/rejected": 0.9494054913520813, "logps/chosen": -106.94095611572266, "logps/ref_chosen": -101.13998413085938, "logps/ref_rejected": -96.78703308105469, "logps/rejected": -112.37801361083984, "loss": 1.0795, "margin_dpo/margin_mean": 9.790007591247559, "margin_dpo/margin_std": 14.912128448486328, "step": 464 }, { "epoch": 0.7029478458049887, "grad_norm": 17.93642234802246, "learning_rate": 1.238566782415197e-07, "logits/chosen": 1.7674261331558228, "logits/rejected": 1.547837734222412, "logps/chosen": -90.09259033203125, "logps/ref_chosen": -86.97392272949219, "logps/ref_rejected": -120.5452880859375, "logps/rejected": -135.66293334960938, "loss": 1.0772, "margin_dpo/margin_mean": 11.998979568481445, "margin_dpo/margin_std": 15.716552734375, "step": 465 }, { "epoch": 0.7044595616024187, "grad_norm": 25.619155883789062, "learning_rate": 1.2271688498291334e-07, "logits/chosen": 0.9957696199417114, "logits/rejected": 1.1233830451965332, "logps/chosen": -142.93106079101562, "logps/ref_chosen": -135.1009521484375, "logps/ref_rejected": -91.93312072753906, "logps/rejected": -101.55802917480469, "loss": 1.1851, "margin_dpo/margin_mean": 1.7947821617126465, "margin_dpo/margin_std": 9.825563430786133, "step": 466 }, { "epoch": 0.7059712773998488, "grad_norm": 17.11458396911621, "learning_rate": 1.2158065210664848e-07, "logits/chosen": 1.2528045177459717, "logits/rejected": 0.782263994216919, "logps/chosen": -68.46549987792969, "logps/ref_chosen": -62.561187744140625, "logps/ref_rejected": -98.08023071289062, "logps/rejected": -110.28775024414062, "loss": 0.9298, "margin_dpo/margin_mean": 6.3032073974609375, "margin_dpo/margin_std": 8.435142517089844, "step": 467 }, { "epoch": 0.7074829931972789, "grad_norm": 20.385744094848633, "learning_rate": 1.204480113956011e-07, "logits/chosen": 1.747432827949524, "logits/rejected": 1.6361857652664185, "logps/chosen": -61.57317352294922, "logps/ref_chosen": -60.35050582885742, "logps/ref_rejected": -89.21257019042969, "logps/rejected": -100.70707702636719, "loss": 0.9071, "margin_dpo/margin_mean": 10.271842956542969, "margin_dpo/margin_std": 10.037555694580078, "step": 468 }, { "epoch": 0.708994708994709, "grad_norm": 18.414081573486328, "learning_rate": 1.1931899453216697e-07, "logits/chosen": 1.6912715435028076, "logits/rejected": 1.430666446685791, "logps/chosen": -78.16200256347656, "logps/ref_chosen": -74.3167724609375, "logps/ref_rejected": -107.75574493408203, "logps/rejected": -119.85853576660156, "loss": 0.994, "margin_dpo/margin_mean": 8.257554054260254, "margin_dpo/margin_std": 8.877249717712402, "step": 469 }, { "epoch": 0.7105064247921391, "grad_norm": 15.941197395324707, "learning_rate": 1.1819363309737438e-07, "logits/chosen": 1.3210291862487793, "logits/rejected": 1.0169076919555664, "logps/chosen": -74.15365600585938, "logps/ref_chosen": -71.33705139160156, "logps/ref_rejected": -104.70712280273438, "logps/rejected": -119.19931030273438, "loss": 1.0169, "margin_dpo/margin_mean": 11.675580978393555, "margin_dpo/margin_std": 12.775873184204102, "step": 470 }, { "epoch": 0.7120181405895691, "grad_norm": 17.501144409179688, "learning_rate": 1.1707195857000215e-07, "logits/chosen": 1.278343915939331, "logits/rejected": 1.1727805137634277, "logps/chosen": -67.31437683105469, "logps/ref_chosen": -66.2132797241211, "logps/ref_rejected": -79.41998291015625, "logps/rejected": -92.11048889160156, "loss": 0.9252, "margin_dpo/margin_mean": 11.589404106140137, "margin_dpo/margin_std": 14.581537246704102, "step": 471 }, { "epoch": 0.7135298563869993, "grad_norm": 17.502349853515625, "learning_rate": 1.1595400232569768e-07, "logits/chosen": 1.6343634128570557, "logits/rejected": 1.708991527557373, "logps/chosen": -98.43843841552734, "logps/ref_chosen": -93.6006088256836, "logps/ref_rejected": -98.41580963134766, "logps/rejected": -113.84051513671875, "loss": 1.0407, "margin_dpo/margin_mean": 10.586881637573242, "margin_dpo/margin_std": 11.680074691772461, "step": 472 }, { "epoch": 0.7150415721844293, "grad_norm": 20.486879348754883, "learning_rate": 1.1483979563610069e-07, "logits/chosen": 1.28770911693573, "logits/rejected": 0.8416398763656616, "logps/chosen": -52.20741271972656, "logps/ref_chosen": -52.449378967285156, "logps/ref_rejected": -97.75361633300781, "logps/rejected": -108.06135559082031, "loss": 1.0244, "margin_dpo/margin_mean": 10.54970645904541, "margin_dpo/margin_std": 14.598611831665039, "step": 473 }, { "epoch": 0.7165532879818595, "grad_norm": 17.808273315429688, "learning_rate": 1.1372936966796709e-07, "logits/chosen": 2.0512568950653076, "logits/rejected": 1.8590844869613647, "logps/chosen": -73.751708984375, "logps/ref_chosen": -63.751033782958984, "logps/ref_rejected": -87.05754852294922, "logps/rejected": -105.81466674804688, "loss": 1.1052, "margin_dpo/margin_mean": 8.756440162658691, "margin_dpo/margin_std": 14.645761489868164, "step": 474 }, { "epoch": 0.7180650037792895, "grad_norm": 18.614866256713867, "learning_rate": 1.126227554822985e-07, "logits/chosen": 0.8599485158920288, "logits/rejected": 0.9398992657661438, "logps/chosen": -81.05137634277344, "logps/ref_chosen": -77.45022583007812, "logps/ref_rejected": -94.36614990234375, "logps/rejected": -108.81474304199219, "loss": 0.8143, "margin_dpo/margin_mean": 10.847440719604492, "margin_dpo/margin_std": 10.613088607788086, "step": 475 }, { "epoch": 0.7195767195767195, "grad_norm": 18.886241912841797, "learning_rate": 1.1151998403347243e-07, "logits/chosen": 1.352895975112915, "logits/rejected": 1.2046875953674316, "logps/chosen": -74.26627349853516, "logps/ref_chosen": -72.13654327392578, "logps/ref_rejected": -92.82406616210938, "logps/rejected": -104.87606811523438, "loss": 1.0155, "margin_dpo/margin_mean": 9.922283172607422, "margin_dpo/margin_std": 11.191987991333008, "step": 476 }, { "epoch": 0.7210884353741497, "grad_norm": 21.929550170898438, "learning_rate": 1.1042108616837692e-07, "logits/chosen": 1.5145859718322754, "logits/rejected": 1.4401910305023193, "logps/chosen": -86.26606750488281, "logps/ref_chosen": -79.956787109375, "logps/ref_rejected": -88.42959594726562, "logps/rejected": -99.09489440917969, "loss": 1.1803, "margin_dpo/margin_mean": 4.356002330780029, "margin_dpo/margin_std": 10.943910598754883, "step": 477 }, { "epoch": 0.7226001511715797, "grad_norm": 24.98163414001465, "learning_rate": 1.0932609262554746e-07, "logits/chosen": 1.3828537464141846, "logits/rejected": 1.3153038024902344, "logps/chosen": -82.94097900390625, "logps/ref_chosen": -81.26200866699219, "logps/ref_rejected": -75.96963500976562, "logps/rejected": -90.4791259765625, "loss": 1.2257, "margin_dpo/margin_mean": 12.830511093139648, "margin_dpo/margin_std": 16.21420669555664, "step": 478 }, { "epoch": 0.7241118669690099, "grad_norm": 17.808778762817383, "learning_rate": 1.0823503403430734e-07, "logits/chosen": 0.7754037380218506, "logits/rejected": 0.20788192749023438, "logps/chosen": -69.4775390625, "logps/ref_chosen": -66.30494689941406, "logps/ref_rejected": -79.75753784179688, "logps/rejected": -89.80123138427734, "loss": 1.1554, "margin_dpo/margin_mean": 6.87109375, "margin_dpo/margin_std": 11.000585556030273, "step": 479 }, { "epoch": 0.7256235827664399, "grad_norm": 20.942508697509766, "learning_rate": 1.0714794091391072e-07, "logits/chosen": 1.5337204933166504, "logits/rejected": 1.5473401546478271, "logps/chosen": -92.85333251953125, "logps/ref_chosen": -85.79927062988281, "logps/ref_rejected": -88.14668273925781, "logps/rejected": -97.37765502929688, "loss": 1.1007, "margin_dpo/margin_mean": 2.1769046783447266, "margin_dpo/margin_std": 15.181905746459961, "step": 480 }, { "epoch": 0.72713529856387, "grad_norm": 18.731130599975586, "learning_rate": 1.0606484367268906e-07, "logits/chosen": 1.1598225831985474, "logits/rejected": 1.3063321113586426, "logps/chosen": -103.10277557373047, "logps/ref_chosen": -99.173828125, "logps/ref_rejected": -89.25907897949219, "logps/rejected": -101.21484375, "loss": 1.0364, "margin_dpo/margin_mean": 8.026813507080078, "margin_dpo/margin_std": 11.911779403686523, "step": 481 }, { "epoch": 0.7286470143613001, "grad_norm": 24.170635223388672, "learning_rate": 1.0498577260720048e-07, "logits/chosen": 1.5664019584655762, "logits/rejected": 1.4500259160995483, "logps/chosen": -76.21261596679688, "logps/ref_chosen": -72.07780456542969, "logps/ref_rejected": -112.81143951416016, "logps/rejected": -121.56095886230469, "loss": 1.1926, "margin_dpo/margin_mean": 4.614706993103027, "margin_dpo/margin_std": 10.684948921203613, "step": 482 }, { "epoch": 0.7301587301587301, "grad_norm": 17.4322452545166, "learning_rate": 1.0391075790138232e-07, "logits/chosen": 1.2500498294830322, "logits/rejected": 1.3257312774658203, "logps/chosen": -78.55180358886719, "logps/ref_chosen": -76.97505187988281, "logps/ref_rejected": -88.44538879394531, "logps/rejected": -103.33856964111328, "loss": 0.9625, "margin_dpo/margin_mean": 13.316431999206543, "margin_dpo/margin_std": 12.179356575012207, "step": 483 }, { "epoch": 0.7316704459561603, "grad_norm": 19.626110076904297, "learning_rate": 1.0283982962570681e-07, "logits/chosen": 1.6901671886444092, "logits/rejected": 1.663952112197876, "logps/chosen": -70.52226257324219, "logps/ref_chosen": -66.91915130615234, "logps/ref_rejected": -81.10409545898438, "logps/rejected": -88.86813354492188, "loss": 1.0212, "margin_dpo/margin_mean": 4.160930156707764, "margin_dpo/margin_std": 12.263092041015625, "step": 484 }, { "epoch": 0.7331821617535903, "grad_norm": 18.661724090576172, "learning_rate": 1.0177301773633992e-07, "logits/chosen": 1.689317226409912, "logits/rejected": 1.494809627532959, "logps/chosen": -70.6705093383789, "logps/ref_chosen": -65.09275817871094, "logps/ref_rejected": -80.40423583984375, "logps/rejected": -95.483154296875, "loss": 1.006, "margin_dpo/margin_mean": 9.501173973083496, "margin_dpo/margin_std": 13.067754745483398, "step": 485 }, { "epoch": 0.7346938775510204, "grad_norm": 16.752140045166016, "learning_rate": 1.007103520743035e-07, "logits/chosen": 1.509906530380249, "logits/rejected": 1.163289189338684, "logps/chosen": -93.05138397216797, "logps/ref_chosen": -84.84764862060547, "logps/ref_rejected": -133.550537109375, "logps/rejected": -152.564208984375, "loss": 1.0698, "margin_dpo/margin_mean": 10.809947967529297, "margin_dpo/margin_std": 12.410652160644531, "step": 486 }, { "epoch": 0.7362055933484505, "grad_norm": 18.650707244873047, "learning_rate": 9.965186236464046e-08, "logits/chosen": 1.281712532043457, "logits/rejected": 1.5488810539245605, "logps/chosen": -114.79739379882812, "logps/ref_chosen": -108.61647033691406, "logps/ref_rejected": -78.72598266601562, "logps/rejected": -93.0931167602539, "loss": 1.0009, "margin_dpo/margin_mean": 8.186208724975586, "margin_dpo/margin_std": 12.530467987060547, "step": 487 }, { "epoch": 0.7377173091458806, "grad_norm": 20.89183235168457, "learning_rate": 9.859757821558337e-08, "logits/chosen": 1.9154369831085205, "logits/rejected": 1.701824426651001, "logps/chosen": -62.53257369995117, "logps/ref_chosen": -59.21123504638672, "logps/ref_rejected": -76.8067626953125, "logps/rejected": -87.98098754882812, "loss": 1.0391, "margin_dpo/margin_mean": 7.852884292602539, "margin_dpo/margin_std": 16.462614059448242, "step": 488 }, { "epoch": 0.7392290249433107, "grad_norm": 20.71122169494629, "learning_rate": 9.754752911772615e-08, "logits/chosen": 1.587559461593628, "logits/rejected": 1.5583109855651855, "logps/chosen": -91.11749267578125, "logps/ref_chosen": -87.55021667480469, "logps/ref_rejected": -112.35884094238281, "logps/rejected": -120.99494934082031, "loss": 1.3057, "margin_dpo/margin_mean": 5.0688347816467285, "margin_dpo/margin_std": 13.321014404296875, "step": 489 }, { "epoch": 0.7407407407407407, "grad_norm": 19.836669921875, "learning_rate": 9.650174444319956e-08, "logits/chosen": 1.9370818138122559, "logits/rejected": 1.946554183959961, "logps/chosen": -85.51853942871094, "logps/ref_chosen": -80.47018432617188, "logps/ref_rejected": -95.76599884033203, "logps/rejected": -107.79928588867188, "loss": 1.147, "margin_dpo/margin_mean": 6.984920501708984, "margin_dpo/margin_std": 16.217628479003906, "step": 490 }, { "epoch": 0.7422524565381708, "grad_norm": 17.223388671875, "learning_rate": 9.546025344484868e-08, "logits/chosen": 1.0049835443496704, "logits/rejected": 1.0877690315246582, "logps/chosen": -82.58981323242188, "logps/ref_chosen": -77.64207458496094, "logps/ref_rejected": -73.8185043334961, "logps/rejected": -87.43759155273438, "loss": 0.9568, "margin_dpo/margin_mean": 8.671359062194824, "margin_dpo/margin_std": 10.907693862915039, "step": 491 }, { "epoch": 0.7437641723356009, "grad_norm": 21.896053314208984, "learning_rate": 9.442308525541589e-08, "logits/chosen": 1.5087008476257324, "logits/rejected": 0.9396347999572754, "logps/chosen": -69.9753646850586, "logps/ref_chosen": -63.29586410522461, "logps/ref_rejected": -136.92556762695312, "logps/rejected": -152.44723510742188, "loss": 1.2247, "margin_dpo/margin_mean": 8.842167854309082, "margin_dpo/margin_std": 9.512777328491211, "step": 492 }, { "epoch": 0.745275888133031, "grad_norm": 17.821565628051758, "learning_rate": 9.339026888672468e-08, "logits/chosen": 1.7605314254760742, "logits/rejected": 1.5902516841888428, "logps/chosen": -80.77244567871094, "logps/ref_chosen": -77.4769058227539, "logps/ref_rejected": -99.05404663085938, "logps/rejected": -109.08465576171875, "loss": 1.0315, "margin_dpo/margin_mean": 6.73507022857666, "margin_dpo/margin_std": 13.012711524963379, "step": 493 }, { "epoch": 0.7467876039304611, "grad_norm": 23.856151580810547, "learning_rate": 9.236183322886945e-08, "logits/chosen": 0.9068803787231445, "logits/rejected": 0.8410812020301819, "logps/chosen": -76.83076477050781, "logps/ref_chosen": -73.76473236083984, "logps/ref_rejected": -87.72662353515625, "logps/rejected": -95.38212585449219, "loss": 1.1202, "margin_dpo/margin_mean": 4.589466094970703, "margin_dpo/margin_std": 6.67095422744751, "step": 494 }, { "epoch": 0.7482993197278912, "grad_norm": 18.004785537719727, "learning_rate": 9.133780704940594e-08, "logits/chosen": 1.4808114767074585, "logits/rejected": 1.4466617107391357, "logps/chosen": -84.16793060302734, "logps/ref_chosen": -78.31684112548828, "logps/ref_rejected": -82.60205841064453, "logps/rejected": -87.97726440429688, "loss": 1.152, "margin_dpo/margin_mean": -0.47588586807250977, "margin_dpo/margin_std": 14.283245086669922, "step": 495 }, { "epoch": 0.7498110355253212, "grad_norm": 18.314136505126953, "learning_rate": 9.031821899254797e-08, "logits/chosen": 1.4718937873840332, "logits/rejected": 1.138891339302063, "logps/chosen": -63.24338912963867, "logps/ref_chosen": -61.20139694213867, "logps/ref_rejected": -128.4497528076172, "logps/rejected": -141.38265991210938, "loss": 1.0619, "margin_dpo/margin_mean": 10.890892028808594, "margin_dpo/margin_std": 12.704998016357422, "step": 496 }, { "epoch": 0.7513227513227513, "grad_norm": 17.068378448486328, "learning_rate": 8.930309757836516e-08, "logits/chosen": 1.5163381099700928, "logits/rejected": 1.214906096458435, "logps/chosen": -72.13016510009766, "logps/ref_chosen": -73.95780944824219, "logps/ref_rejected": -118.5865478515625, "logps/rejected": -130.9983673095703, "loss": 0.9075, "margin_dpo/margin_mean": 14.239460945129395, "margin_dpo/margin_std": 12.243886947631836, "step": 497 }, { "epoch": 0.7528344671201814, "grad_norm": 19.820772171020508, "learning_rate": 8.829247120198563e-08, "logits/chosen": 1.7457375526428223, "logits/rejected": 1.517223596572876, "logps/chosen": -87.77767944335938, "logps/ref_chosen": -82.978515625, "logps/ref_rejected": -121.03521728515625, "logps/rejected": -133.5142364501953, "loss": 1.0216, "margin_dpo/margin_mean": 7.679849624633789, "margin_dpo/margin_std": 11.738670349121094, "step": 498 }, { "epoch": 0.7543461829176115, "grad_norm": 21.134685516357422, "learning_rate": 8.728636813280163e-08, "logits/chosen": 1.3227453231811523, "logits/rejected": 1.0213969945907593, "logps/chosen": -82.8435287475586, "logps/ref_chosen": -77.29167175292969, "logps/ref_rejected": -122.06014251708984, "logps/rejected": -136.57225036621094, "loss": 1.1046, "margin_dpo/margin_mean": 8.960248947143555, "margin_dpo/margin_std": 13.603601455688477, "step": 499 }, { "epoch": 0.7558578987150416, "grad_norm": 19.003353118896484, "learning_rate": 8.628481651367875e-08, "logits/chosen": 0.8882439136505127, "logits/rejected": 0.9162918329238892, "logps/chosen": -103.67877197265625, "logps/ref_chosen": -96.82359313964844, "logps/ref_rejected": -103.96098327636719, "logps/rejected": -124.1900405883789, "loss": 1.0214, "margin_dpo/margin_mean": 13.37387466430664, "margin_dpo/margin_std": 9.533989906311035, "step": 500 }, { "epoch": 0.7558578987150416, "eval_logits/chosen": 1.7113724946975708, "eval_logits/rejected": 1.5744194984436035, "eval_logps/chosen": -90.40150451660156, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -106.3937759399414, "eval_loss": 0.5191938877105713, "eval_margin_dpo/margin_mean": 8.07715892791748, "eval_margin_dpo/margin_std": 11.990333557128906, "eval_runtime": 42.5794, "eval_samples_per_second": 54.087, "eval_steps_per_second": 1.691, "step": 500 }, { "epoch": 0.7573696145124716, "grad_norm": 18.370750427246094, "learning_rate": 8.528784436016878e-08, "logits/chosen": 1.4236221313476562, "logits/rejected": 1.4487783908843994, "logps/chosen": -88.52447509765625, "logps/ref_chosen": -84.77076721191406, "logps/ref_rejected": -96.5400390625, "logps/rejected": -111.22660827636719, "loss": 1.0468, "margin_dpo/margin_mean": 10.932865142822266, "margin_dpo/margin_std": 13.202751159667969, "step": 501 }, { "epoch": 0.7588813303099018, "grad_norm": 19.810033798217773, "learning_rate": 8.4295479559726e-08, "logits/chosen": 1.7618186473846436, "logits/rejected": 1.5462822914123535, "logps/chosen": -80.53328704833984, "logps/ref_chosen": -79.30207824707031, "logps/ref_rejected": -108.16116333007812, "logps/rejected": -113.54059600830078, "loss": 1.0673, "margin_dpo/margin_mean": 4.148220539093018, "margin_dpo/margin_std": 10.900373458862305, "step": 502 }, { "epoch": 0.7603930461073318, "grad_norm": 18.34208106994629, "learning_rate": 8.330774987092712e-08, "logits/chosen": 1.3788197040557861, "logits/rejected": 1.4502460956573486, "logps/chosen": -83.57225799560547, "logps/ref_chosen": -80.10978698730469, "logps/ref_rejected": -88.66081237792969, "logps/rejected": -99.5992431640625, "loss": 1.0207, "margin_dpo/margin_mean": 7.475958824157715, "margin_dpo/margin_std": 13.405517578125, "step": 503 }, { "epoch": 0.7619047619047619, "grad_norm": 16.681896209716797, "learning_rate": 8.232468292269479e-08, "logits/chosen": 1.6238125562667847, "logits/rejected": 1.4265142679214478, "logps/chosen": -83.1351318359375, "logps/ref_chosen": -79.96421813964844, "logps/ref_rejected": -104.044921875, "logps/rejected": -118.07717895507812, "loss": 0.7901, "margin_dpo/margin_mean": 10.86135196685791, "margin_dpo/margin_std": 12.178512573242188, "step": 504 }, { "epoch": 0.763416477702192, "grad_norm": 23.297094345092773, "learning_rate": 8.134630621352483e-08, "logits/chosen": 1.5399702787399292, "logits/rejected": 1.3151166439056396, "logps/chosen": -95.18058013916016, "logps/ref_chosen": -92.33998107910156, "logps/ref_rejected": -118.33450317382812, "logps/rejected": -129.932861328125, "loss": 1.1893, "margin_dpo/margin_mean": 8.757759094238281, "margin_dpo/margin_std": 12.796874046325684, "step": 505 }, { "epoch": 0.764928193499622, "grad_norm": 24.723825454711914, "learning_rate": 8.037264711071698e-08, "logits/chosen": 1.3498167991638184, "logits/rejected": 1.4767301082611084, "logps/chosen": -90.04280853271484, "logps/ref_chosen": -87.29638671875, "logps/ref_rejected": -89.43702697753906, "logps/rejected": -99.47492980957031, "loss": 1.2114, "margin_dpo/margin_mean": 7.291473865509033, "margin_dpo/margin_std": 7.812591552734375, "step": 506 }, { "epoch": 0.7664399092970522, "grad_norm": 18.891969680786133, "learning_rate": 7.940373284960933e-08, "logits/chosen": 1.2641713619232178, "logits/rejected": 1.3655802011489868, "logps/chosen": -125.11323547363281, "logps/ref_chosen": -117.73011779785156, "logps/ref_rejected": -119.16480255126953, "logps/rejected": -130.9937744140625, "loss": 1.0284, "margin_dpo/margin_mean": 4.445857048034668, "margin_dpo/margin_std": 10.777402877807617, "step": 507 }, { "epoch": 0.7679516250944822, "grad_norm": 17.53611946105957, "learning_rate": 7.843959053281663e-08, "logits/chosen": 1.5996830463409424, "logits/rejected": 1.165022850036621, "logps/chosen": -75.09086608886719, "logps/ref_chosen": -75.30168151855469, "logps/ref_rejected": -123.31394958496094, "logps/rejected": -137.30581665039062, "loss": 0.9909, "margin_dpo/margin_mean": 14.202698707580566, "margin_dpo/margin_std": 18.454662322998047, "step": 508 }, { "epoch": 0.7694633408919124, "grad_norm": 16.167238235473633, "learning_rate": 7.748024712947204e-08, "logits/chosen": 0.9779084920883179, "logits/rejected": 1.1555689573287964, "logps/chosen": -78.19929504394531, "logps/ref_chosen": -75.66693878173828, "logps/ref_rejected": -66.68931579589844, "logps/rejected": -76.95621490478516, "loss": 0.9238, "margin_dpo/margin_mean": 7.73454475402832, "margin_dpo/margin_std": 10.285099029541016, "step": 509 }, { "epoch": 0.7709750566893424, "grad_norm": 18.947065353393555, "learning_rate": 7.652572947447272e-08, "logits/chosen": 1.337875247001648, "logits/rejected": 1.208590030670166, "logps/chosen": -89.04047393798828, "logps/ref_chosen": -83.85684204101562, "logps/ref_rejected": -113.48956298828125, "logps/rejected": -125.53936004638672, "loss": 0.9869, "margin_dpo/margin_mean": 6.866161823272705, "margin_dpo/margin_std": 10.09292221069336, "step": 510 }, { "epoch": 0.7724867724867724, "grad_norm": 18.071434020996094, "learning_rate": 7.557606426772961e-08, "logits/chosen": 1.6594161987304688, "logits/rejected": 1.3299106359481812, "logps/chosen": -66.330810546875, "logps/ref_chosen": -63.364810943603516, "logps/ref_rejected": -97.9059066772461, "logps/rejected": -113.6323013305664, "loss": 0.9209, "margin_dpo/margin_mean": 12.760382652282715, "margin_dpo/margin_std": 12.243383407592773, "step": 511 }, { "epoch": 0.7739984882842026, "grad_norm": 21.141952514648438, "learning_rate": 7.463127807341966e-08, "logits/chosen": 1.265761137008667, "logits/rejected": 1.4011613130569458, "logps/chosen": -76.77413177490234, "logps/ref_chosen": -75.36632537841797, "logps/ref_rejected": -70.61337280273438, "logps/rejected": -77.54893493652344, "loss": 1.1503, "margin_dpo/margin_mean": 5.527750015258789, "margin_dpo/margin_std": 12.510340690612793, "step": 512 }, { "epoch": 0.7755102040816326, "grad_norm": 15.058732986450195, "learning_rate": 7.369139731924401e-08, "logits/chosen": 2.2177677154541016, "logits/rejected": 2.1079206466674805, "logps/chosen": -52.59001922607422, "logps/ref_chosen": -54.447452545166016, "logps/ref_rejected": -66.64134216308594, "logps/rejected": -77.21235656738281, "loss": 0.9279, "margin_dpo/margin_mean": 12.428445816040039, "margin_dpo/margin_std": 10.572999954223633, "step": 513 }, { "epoch": 0.7770219198790628, "grad_norm": 17.961933135986328, "learning_rate": 7.275644829568747e-08, "logits/chosen": 1.7214994430541992, "logits/rejected": 1.657637119293213, "logps/chosen": -83.87235260009766, "logps/ref_chosen": -77.21424865722656, "logps/ref_rejected": -88.79304504394531, "logps/rejected": -103.65969848632812, "loss": 0.9568, "margin_dpo/margin_mean": 8.208556175231934, "margin_dpo/margin_std": 8.400633811950684, "step": 514 }, { "epoch": 0.7785336356764928, "grad_norm": 19.01670265197754, "learning_rate": 7.182645715528435e-08, "logits/chosen": 2.269339084625244, "logits/rejected": 2.018411874771118, "logps/chosen": -57.86487579345703, "logps/ref_chosen": -53.61089324951172, "logps/ref_rejected": -84.93665313720703, "logps/rejected": -98.9819564819336, "loss": 1.0671, "margin_dpo/margin_mean": 9.791324615478516, "margin_dpo/margin_std": 10.679479598999023, "step": 515 }, { "epoch": 0.780045351473923, "grad_norm": 17.602020263671875, "learning_rate": 7.090144991188568e-08, "logits/chosen": 1.5840175151824951, "logits/rejected": 1.3921927213668823, "logps/chosen": -77.81848907470703, "logps/ref_chosen": -74.37448120117188, "logps/ref_rejected": -92.19779968261719, "logps/rejected": -106.61052703857422, "loss": 1.0446, "margin_dpo/margin_mean": 10.968722343444824, "margin_dpo/margin_std": 11.749760627746582, "step": 516 }, { "epoch": 0.781557067271353, "grad_norm": 19.337661743164062, "learning_rate": 6.998145243993284e-08, "logits/chosen": 1.4960018396377563, "logits/rejected": 1.4958319664001465, "logps/chosen": -77.3065185546875, "logps/ref_chosen": -66.6958236694336, "logps/ref_rejected": -57.935340881347656, "logps/rejected": -72.71110534667969, "loss": 1.1829, "margin_dpo/margin_mean": 4.165067195892334, "margin_dpo/margin_std": 10.648651123046875, "step": 517 }, { "epoch": 0.783068783068783, "grad_norm": 17.19664192199707, "learning_rate": 6.906649047373245e-08, "logits/chosen": 1.2923452854156494, "logits/rejected": 1.3025954961776733, "logps/chosen": -85.42341613769531, "logps/ref_chosen": -79.89225769042969, "logps/ref_rejected": -73.43104553222656, "logps/rejected": -82.86656951904297, "loss": 1.1025, "margin_dpo/margin_mean": 3.9043619632720947, "margin_dpo/margin_std": 9.182500839233398, "step": 518 }, { "epoch": 0.7845804988662132, "grad_norm": 24.619091033935547, "learning_rate": 6.815658960673781e-08, "logits/chosen": 1.4309927225112915, "logits/rejected": 1.3174998760223389, "logps/chosen": -86.82591247558594, "logps/ref_chosen": -79.05235290527344, "logps/ref_rejected": -98.82565307617188, "logps/rejected": -109.80943298339844, "loss": 1.4383, "margin_dpo/margin_mean": 3.2102420330047607, "margin_dpo/margin_std": 12.968755722045898, "step": 519 }, { "epoch": 0.7860922146636432, "grad_norm": 17.64301300048828, "learning_rate": 6.725177529083209e-08, "logits/chosen": 1.6076090335845947, "logits/rejected": 1.0972341299057007, "logps/chosen": -78.50565338134766, "logps/ref_chosen": -73.80180358886719, "logps/ref_rejected": -86.52413940429688, "logps/rejected": -98.18661499023438, "loss": 0.9752, "margin_dpo/margin_mean": 6.9586358070373535, "margin_dpo/margin_std": 6.212986946105957, "step": 520 }, { "epoch": 0.7876039304610734, "grad_norm": 20.074474334716797, "learning_rate": 6.63520728356167e-08, "logits/chosen": 1.3820008039474487, "logits/rejected": 1.3183671236038208, "logps/chosen": -110.52100372314453, "logps/ref_chosen": -110.50602722167969, "logps/ref_rejected": -112.4345474243164, "logps/rejected": -121.87361907958984, "loss": 1.0654, "margin_dpo/margin_mean": 9.424093246459961, "margin_dpo/margin_std": 9.943330764770508, "step": 521 }, { "epoch": 0.7891156462585034, "grad_norm": 25.567726135253906, "learning_rate": 6.545750740770336e-08, "logits/chosen": 1.6041221618652344, "logits/rejected": 1.4414212703704834, "logps/chosen": -55.25762176513672, "logps/ref_chosen": -53.32655334472656, "logps/ref_rejected": -83.27340698242188, "logps/rejected": -95.13212585449219, "loss": 1.2416, "margin_dpo/margin_mean": 9.927656173706055, "margin_dpo/margin_std": 12.694474220275879, "step": 522 }, { "epoch": 0.7906273620559335, "grad_norm": 18.995908737182617, "learning_rate": 6.456810403001012e-08, "logits/chosen": 1.8754998445510864, "logits/rejected": 1.0644989013671875, "logps/chosen": -60.10360336303711, "logps/ref_chosen": -57.306026458740234, "logps/ref_rejected": -117.778564453125, "logps/rejected": -133.0610809326172, "loss": 1.0006, "margin_dpo/margin_mean": 12.48494815826416, "margin_dpo/margin_std": 9.661229133605957, "step": 523 }, { "epoch": 0.7921390778533636, "grad_norm": 21.017295837402344, "learning_rate": 6.368388758106134e-08, "logits/chosen": 1.1499770879745483, "logits/rejected": 1.1621506214141846, "logps/chosen": -95.40074157714844, "logps/ref_chosen": -92.42146301269531, "logps/ref_rejected": -100.56710815429688, "logps/rejected": -108.34770202636719, "loss": 1.1296, "margin_dpo/margin_mean": 4.801305294036865, "margin_dpo/margin_std": 10.20431900024414, "step": 524 }, { "epoch": 0.7936507936507936, "grad_norm": 20.421939849853516, "learning_rate": 6.280488279429185e-08, "logits/chosen": 0.8107924461364746, "logits/rejected": 0.7142012119293213, "logps/chosen": -116.94172668457031, "logps/ref_chosen": -110.25018310546875, "logps/ref_rejected": -111.46438598632812, "logps/rejected": -122.10671997070312, "loss": 1.1939, "margin_dpo/margin_mean": 3.9507997035980225, "margin_dpo/margin_std": 8.627264022827148, "step": 525 }, { "epoch": 0.7951625094482238, "grad_norm": 17.86248207092285, "learning_rate": 6.193111425735515e-08, "logits/chosen": 1.3053174018859863, "logits/rejected": 0.9650485515594482, "logps/chosen": -86.19122314453125, "logps/ref_chosen": -82.32691955566406, "logps/ref_rejected": -104.9666748046875, "logps/rejected": -116.00468444824219, "loss": 1.1079, "margin_dpo/margin_mean": 7.173711776733398, "margin_dpo/margin_std": 10.565650939941406, "step": 526 }, { "epoch": 0.7966742252456538, "grad_norm": 22.50766944885254, "learning_rate": 6.106260641143546e-08, "logits/chosen": 1.9102357625961304, "logits/rejected": 1.5195531845092773, "logps/chosen": -55.809349060058594, "logps/ref_chosen": -54.391990661621094, "logps/ref_rejected": -114.67759704589844, "logps/rejected": -125.46134948730469, "loss": 1.2124, "margin_dpo/margin_mean": 9.366389274597168, "margin_dpo/margin_std": 9.0003080368042, "step": 527 }, { "epoch": 0.7981859410430839, "grad_norm": 18.06372833251953, "learning_rate": 6.019938355056422e-08, "logits/chosen": 0.9811175465583801, "logits/rejected": 1.313080072402954, "logps/chosen": -98.42125701904297, "logps/ref_chosen": -96.57173919677734, "logps/ref_rejected": -61.710731506347656, "logps/rejected": -66.19438171386719, "loss": 1.2084, "margin_dpo/margin_mean": 2.6341331005096436, "margin_dpo/margin_std": 12.884065628051758, "step": 528 }, { "epoch": 0.799697656840514, "grad_norm": 15.852145195007324, "learning_rate": 5.934146982094049e-08, "logits/chosen": 1.3661787509918213, "logits/rejected": 1.3078646659851074, "logps/chosen": -79.79299926757812, "logps/ref_chosen": -75.91831970214844, "logps/ref_rejected": -86.05809020996094, "logps/rejected": -99.61922454833984, "loss": 0.8205, "margin_dpo/margin_mean": 9.686440467834473, "margin_dpo/margin_std": 10.735424041748047, "step": 529 }, { "epoch": 0.8012093726379441, "grad_norm": 17.016653060913086, "learning_rate": 5.848888922025552e-08, "logits/chosen": 1.8464529514312744, "logits/rejected": 1.4458162784576416, "logps/chosen": -68.35203552246094, "logps/ref_chosen": -65.1891098022461, "logps/ref_rejected": -102.95738983154297, "logps/rejected": -119.57209777832031, "loss": 1.025, "margin_dpo/margin_mean": 13.451784133911133, "margin_dpo/margin_std": 13.144256591796875, "step": 530 }, { "epoch": 0.8027210884353742, "grad_norm": 19.747512817382812, "learning_rate": 5.7641665597021435e-08, "logits/chosen": 1.5875380039215088, "logits/rejected": 1.386962652206421, "logps/chosen": -68.38678741455078, "logps/ref_chosen": -65.18759155273438, "logps/ref_rejected": -106.05992889404297, "logps/rejected": -117.46918487548828, "loss": 1.0935, "margin_dpo/margin_mean": 8.210055351257324, "margin_dpo/margin_std": 9.990748405456543, "step": 531 }, { "epoch": 0.8042328042328042, "grad_norm": 18.090911865234375, "learning_rate": 5.679982264990424e-08, "logits/chosen": 1.3891462087631226, "logits/rejected": 0.9999946355819702, "logps/chosen": -81.65940856933594, "logps/ref_chosen": -77.56283569335938, "logps/ref_rejected": -112.9091796875, "logps/rejected": -129.6756591796875, "loss": 1.0529, "margin_dpo/margin_mean": 12.669902801513672, "margin_dpo/margin_std": 10.895479202270508, "step": 532 }, { "epoch": 0.8057445200302343, "grad_norm": 20.71051025390625, "learning_rate": 5.596338392706076e-08, "logits/chosen": 1.7654846906661987, "logits/rejected": 1.4767529964447021, "logps/chosen": -73.6165771484375, "logps/ref_chosen": -72.06595611572266, "logps/ref_rejected": -84.92366027832031, "logps/rejected": -94.09053039550781, "loss": 1.0637, "margin_dpo/margin_mean": 7.616242408752441, "margin_dpo/margin_std": 10.316248893737793, "step": 533 }, { "epoch": 0.8072562358276644, "grad_norm": 18.25124740600586, "learning_rate": 5.513237282548033e-08, "logits/chosen": 1.0457935333251953, "logits/rejected": 0.5739990472793579, "logps/chosen": -75.39283752441406, "logps/ref_chosen": -73.32286834716797, "logps/ref_rejected": -130.4866485595703, "logps/rejected": -138.80526733398438, "loss": 1.0636, "margin_dpo/margin_mean": 6.248659133911133, "margin_dpo/margin_std": 11.008050918579102, "step": 534 }, { "epoch": 0.8087679516250945, "grad_norm": 18.94314193725586, "learning_rate": 5.430681259032957e-08, "logits/chosen": 1.0625642538070679, "logits/rejected": 0.7707558274269104, "logps/chosen": -65.30772399902344, "logps/ref_chosen": -61.52978515625, "logps/ref_rejected": -83.76773071289062, "logps/rejected": -95.008544921875, "loss": 1.1441, "margin_dpo/margin_mean": 7.462870121002197, "margin_dpo/margin_std": 14.842193603515625, "step": 535 }, { "epoch": 0.8102796674225246, "grad_norm": 15.118937492370605, "learning_rate": 5.3486726314303175e-08, "logits/chosen": 1.6048871278762817, "logits/rejected": 1.4890940189361572, "logps/chosen": -83.21963500976562, "logps/ref_chosen": -80.38054656982422, "logps/ref_rejected": -93.82463073730469, "logps/rejected": -103.38377380371094, "loss": 0.8625, "margin_dpo/margin_mean": 6.720064640045166, "margin_dpo/margin_std": 9.501663208007812, "step": 536 }, { "epoch": 0.8117913832199547, "grad_norm": 18.49951934814453, "learning_rate": 5.267213693697695e-08, "logits/chosen": 1.4918584823608398, "logits/rejected": 1.1810901165008545, "logps/chosen": -58.49045181274414, "logps/ref_chosen": -57.22574234008789, "logps/ref_rejected": -135.7332763671875, "logps/rejected": -150.7767333984375, "loss": 1.0835, "margin_dpo/margin_mean": 13.778725624084473, "margin_dpo/margin_std": 9.245294570922852, "step": 537 }, { "epoch": 0.8133030990173847, "grad_norm": 19.65271759033203, "learning_rate": 5.1863067244167144e-08, "logits/chosen": 1.602089762687683, "logits/rejected": 1.7309138774871826, "logps/chosen": -71.2269287109375, "logps/ref_chosen": -67.96293640136719, "logps/ref_rejected": -56.81917190551758, "logps/rejected": -67.69879150390625, "loss": 0.9342, "margin_dpo/margin_mean": 7.615636825561523, "margin_dpo/margin_std": 10.339972496032715, "step": 538 }, { "epoch": 0.8148148148148148, "grad_norm": 19.1221981048584, "learning_rate": 5.105953986729195e-08, "logits/chosen": 1.293712854385376, "logits/rejected": 1.1432920694351196, "logps/chosen": -90.13014221191406, "logps/ref_chosen": -89.68031311035156, "logps/ref_rejected": -105.60565948486328, "logps/rejected": -116.22645568847656, "loss": 1.0204, "margin_dpo/margin_mean": 10.170965194702148, "margin_dpo/margin_std": 11.156034469604492, "step": 539 }, { "epoch": 0.8163265306122449, "grad_norm": 19.985137939453125, "learning_rate": 5.026157728273966e-08, "logits/chosen": 1.8261759281158447, "logits/rejected": 1.642942190170288, "logps/chosen": -75.65715026855469, "logps/ref_chosen": -70.51634979248047, "logps/ref_rejected": -97.39108276367188, "logps/rejected": -110.94168853759766, "loss": 0.9994, "margin_dpo/margin_mean": 8.409812927246094, "margin_dpo/margin_std": 10.951087951660156, "step": 540 }, { "epoch": 0.817838246409675, "grad_norm": 21.32786750793457, "learning_rate": 4.9469201811239035e-08, "logits/chosen": 1.3147519826889038, "logits/rejected": 1.4918153285980225, "logps/chosen": -118.90332794189453, "logps/ref_chosen": -115.25343322753906, "logps/ref_rejected": -71.04121398925781, "logps/rejected": -82.87217712402344, "loss": 0.9862, "margin_dpo/margin_mean": 8.181074142456055, "margin_dpo/margin_std": 10.864706039428711, "step": 541 }, { "epoch": 0.8193499622071051, "grad_norm": 17.46336555480957, "learning_rate": 4.868243561723534e-08, "logits/chosen": 1.5071132183074951, "logits/rejected": 1.2838053703308105, "logps/chosen": -72.99198913574219, "logps/ref_chosen": -73.7113265991211, "logps/ref_rejected": -109.87480163574219, "logps/rejected": -118.7210922241211, "loss": 0.9761, "margin_dpo/margin_mean": 9.565635681152344, "margin_dpo/margin_std": 8.672539710998535, "step": 542 }, { "epoch": 0.8208616780045351, "grad_norm": 14.991835594177246, "learning_rate": 4.790130070827028e-08, "logits/chosen": 1.6452085971832275, "logits/rejected": 1.4290642738342285, "logps/chosen": -68.71112060546875, "logps/ref_chosen": -65.55136108398438, "logps/ref_rejected": -105.42532348632812, "logps/rejected": -118.35376739501953, "loss": 0.9838, "margin_dpo/margin_mean": 9.768689155578613, "margin_dpo/margin_std": 11.122146606445312, "step": 543 }, { "epoch": 0.8223733938019653, "grad_norm": 16.761272430419922, "learning_rate": 4.7125818934366454e-08, "logits/chosen": 1.403899908065796, "logits/rejected": 1.471496343612671, "logps/chosen": -105.05339050292969, "logps/ref_chosen": -97.55657958984375, "logps/ref_rejected": -85.24616241455078, "logps/rejected": -99.89966583251953, "loss": 0.9523, "margin_dpo/margin_mean": 7.156683921813965, "margin_dpo/margin_std": 10.03597640991211, "step": 544 }, { "epoch": 0.8238851095993953, "grad_norm": 18.97000503540039, "learning_rate": 4.635601198741607e-08, "logits/chosen": 1.3821418285369873, "logits/rejected": 1.2542109489440918, "logps/chosen": -76.37702178955078, "logps/ref_chosen": -71.65219116210938, "logps/ref_rejected": -89.26495361328125, "logps/rejected": -99.81912231445312, "loss": 1.0731, "margin_dpo/margin_mean": 5.829347610473633, "margin_dpo/margin_std": 10.605819702148438, "step": 545 }, { "epoch": 0.8253968253968254, "grad_norm": 17.86812973022461, "learning_rate": 4.559190140057428e-08, "logits/chosen": 1.2848098278045654, "logits/rejected": 1.305572271347046, "logps/chosen": -99.93084716796875, "logps/ref_chosen": -94.3504867553711, "logps/ref_rejected": -95.88395690917969, "logps/rejected": -109.36778259277344, "loss": 0.9614, "margin_dpo/margin_mean": 7.903461456298828, "margin_dpo/margin_std": 10.53524398803711, "step": 546 }, { "epoch": 0.8269085411942555, "grad_norm": 16.171890258789062, "learning_rate": 4.483350854765672e-08, "logits/chosen": 1.1553101539611816, "logits/rejected": 0.7482352256774902, "logps/chosen": -48.18885040283203, "logps/ref_chosen": -44.66164779663086, "logps/ref_rejected": -87.94758605957031, "logps/rejected": -100.96206665039062, "loss": 0.901, "margin_dpo/margin_mean": 9.487278938293457, "margin_dpo/margin_std": 13.219003677368164, "step": 547 }, { "epoch": 0.8284202569916855, "grad_norm": 20.400054931640625, "learning_rate": 4.4080854642541826e-08, "logits/chosen": 1.1452207565307617, "logits/rejected": 1.07662034034729, "logps/chosen": -86.12905883789062, "logps/ref_chosen": -84.8812484741211, "logps/ref_rejected": -89.17807006835938, "logps/rejected": -98.5996322631836, "loss": 1.1308, "margin_dpo/margin_mean": 8.173754692077637, "margin_dpo/margin_std": 10.9679536819458, "step": 548 }, { "epoch": 0.8299319727891157, "grad_norm": 23.801851272583008, "learning_rate": 4.333396073857723e-08, "logits/chosen": 1.7808953523635864, "logits/rejected": 1.8159725666046143, "logps/chosen": -97.10274505615234, "logps/ref_chosen": -92.5716552734375, "logps/ref_rejected": -109.80947875976562, "logps/rejected": -120.0747299194336, "loss": 1.2453, "margin_dpo/margin_mean": 5.734167575836182, "margin_dpo/margin_std": 11.611507415771484, "step": 549 }, { "epoch": 0.8314436885865457, "grad_norm": 21.680681228637695, "learning_rate": 4.259284772799099e-08, "logits/chosen": 1.750056266784668, "logits/rejected": 1.787322759628296, "logps/chosen": -60.24526596069336, "logps/ref_chosen": -62.580467224121094, "logps/ref_rejected": -60.354148864746094, "logps/rejected": -65.9862060546875, "loss": 1.1958, "margin_dpo/margin_mean": 7.967259407043457, "margin_dpo/margin_std": 14.096725463867188, "step": 550 }, { "epoch": 0.8329554043839759, "grad_norm": 17.300020217895508, "learning_rate": 4.1857536341307176e-08, "logits/chosen": 1.6444084644317627, "logits/rejected": 1.4184859991073608, "logps/chosen": -59.297080993652344, "logps/ref_chosen": -57.222023010253906, "logps/ref_rejected": -98.87252807617188, "logps/rejected": -114.77069091796875, "loss": 0.976, "margin_dpo/margin_mean": 13.823116302490234, "margin_dpo/margin_std": 12.905557632446289, "step": 551 }, { "epoch": 0.8344671201814059, "grad_norm": 19.558116912841797, "learning_rate": 4.112804714676593e-08, "logits/chosen": 1.7145252227783203, "logits/rejected": 1.373504400253296, "logps/chosen": -71.43199920654297, "logps/ref_chosen": -67.35926818847656, "logps/ref_rejected": -114.29425048828125, "logps/rejected": -124.70507049560547, "loss": 1.052, "margin_dpo/margin_mean": 6.338090896606445, "margin_dpo/margin_std": 9.177579879760742, "step": 552 }, { "epoch": 0.8359788359788359, "grad_norm": 23.0908260345459, "learning_rate": 4.0404400549748144e-08, "logits/chosen": 1.7321324348449707, "logits/rejected": 1.201894998550415, "logps/chosen": -70.65605163574219, "logps/ref_chosen": -66.7867202758789, "logps/ref_rejected": -136.77378845214844, "logps/rejected": -151.6304931640625, "loss": 1.11, "margin_dpo/margin_mean": 10.987382888793945, "margin_dpo/margin_std": 13.33609390258789, "step": 553 }, { "epoch": 0.8374905517762661, "grad_norm": 18.269779205322266, "learning_rate": 3.968661679220467e-08, "logits/chosen": 1.1821682453155518, "logits/rejected": 1.099571704864502, "logps/chosen": -77.60135650634766, "logps/ref_chosen": -74.59046936035156, "logps/ref_rejected": -93.91989135742188, "logps/rejected": -102.0347900390625, "loss": 1.0654, "margin_dpo/margin_mean": 5.10401725769043, "margin_dpo/margin_std": 10.80300235748291, "step": 554 }, { "epoch": 0.8390022675736961, "grad_norm": 18.944839477539062, "learning_rate": 3.89747159520904e-08, "logits/chosen": 1.742308259010315, "logits/rejected": 1.5633643865585327, "logps/chosen": -108.03983306884766, "logps/ref_chosen": -104.49757385253906, "logps/ref_rejected": -108.89088439941406, "logps/rejected": -117.63859558105469, "loss": 1.0804, "margin_dpo/margin_mean": 5.205449104309082, "margin_dpo/margin_std": 12.613378524780273, "step": 555 }, { "epoch": 0.8405139833711263, "grad_norm": 17.373750686645508, "learning_rate": 3.826871794280192e-08, "logits/chosen": 1.2276897430419922, "logits/rejected": 1.111892580986023, "logps/chosen": -75.65097045898438, "logps/ref_chosen": -67.98088073730469, "logps/ref_rejected": -84.16119384765625, "logps/rejected": -97.19153594970703, "loss": 1.1205, "margin_dpo/margin_mean": 5.360258102416992, "margin_dpo/margin_std": 11.282686233520508, "step": 556 }, { "epoch": 0.8420256991685563, "grad_norm": 18.611671447753906, "learning_rate": 3.756864251262143e-08, "logits/chosen": 1.0393072366714478, "logits/rejected": 0.5406616926193237, "logps/chosen": -52.78868865966797, "logps/ref_chosen": -52.481388092041016, "logps/ref_rejected": -65.22357177734375, "logps/rejected": -72.37397766113281, "loss": 1.0019, "margin_dpo/margin_mean": 6.843094348907471, "margin_dpo/margin_std": 12.773565292358398, "step": 557 }, { "epoch": 0.8435374149659864, "grad_norm": 19.885276794433594, "learning_rate": 3.687450924416341e-08, "logits/chosen": 1.8202342987060547, "logits/rejected": 1.7157986164093018, "logps/chosen": -86.03704833984375, "logps/ref_chosen": -82.42589569091797, "logps/ref_rejected": -108.21171569824219, "logps/rejected": -122.34723663330078, "loss": 1.0, "margin_dpo/margin_mean": 10.524370193481445, "margin_dpo/margin_std": 8.647871017456055, "step": 558 }, { "epoch": 0.8450491307634165, "grad_norm": 20.51862144470215, "learning_rate": 3.6186337553827743e-08, "logits/chosen": 1.5360794067382812, "logits/rejected": 1.0735549926757812, "logps/chosen": -106.53812408447266, "logps/ref_chosen": -102.7005615234375, "logps/ref_rejected": -159.43634033203125, "logps/rejected": -172.47027587890625, "loss": 1.1693, "margin_dpo/margin_mean": 9.19637680053711, "margin_dpo/margin_std": 15.942785263061523, "step": 559 }, { "epoch": 0.8465608465608465, "grad_norm": 17.374521255493164, "learning_rate": 3.550414669125573e-08, "logits/chosen": 1.2343003749847412, "logits/rejected": 1.2184264659881592, "logps/chosen": -109.99679565429688, "logps/ref_chosen": -108.25234985351562, "logps/ref_rejected": -110.07012939453125, "logps/rejected": -118.95156860351562, "loss": 1.0505, "margin_dpo/margin_mean": 7.136996746063232, "margin_dpo/margin_std": 10.95610523223877, "step": 560 }, { "epoch": 0.8480725623582767, "grad_norm": 16.211755752563477, "learning_rate": 3.482795573879241e-08, "logits/chosen": 1.5161001682281494, "logits/rejected": 1.4198527336120605, "logps/chosen": -66.95407104492188, "logps/ref_chosen": -66.03121948242188, "logps/ref_rejected": -83.07962036132812, "logps/rejected": -91.74763488769531, "loss": 0.9798, "margin_dpo/margin_mean": 7.74515962600708, "margin_dpo/margin_std": 9.913259506225586, "step": 561 }, { "epoch": 0.8495842781557067, "grad_norm": 16.60391616821289, "learning_rate": 3.415778361095226e-08, "logits/chosen": 1.919426441192627, "logits/rejected": 1.5250730514526367, "logps/chosen": -97.28946685791016, "logps/ref_chosen": -91.13333129882812, "logps/ref_rejected": -141.3377227783203, "logps/rejected": -155.29470825195312, "loss": 0.9836, "margin_dpo/margin_mean": 7.800865173339844, "margin_dpo/margin_std": 13.37569808959961, "step": 562 }, { "epoch": 0.8510959939531368, "grad_norm": 17.424243927001953, "learning_rate": 3.349364905389032e-08, "logits/chosen": 1.1921464204788208, "logits/rejected": 0.9232168197631836, "logps/chosen": -60.38529586791992, "logps/ref_chosen": -61.78717803955078, "logps/ref_rejected": -96.05131530761719, "logps/rejected": -105.43507385253906, "loss": 1.0827, "margin_dpo/margin_mean": 10.785636901855469, "margin_dpo/margin_std": 13.370136260986328, "step": 563 }, { "epoch": 0.8526077097505669, "grad_norm": 19.076160430908203, "learning_rate": 3.283557064487785e-08, "logits/chosen": 1.4732539653778076, "logits/rejected": 1.4946141242980957, "logps/chosen": -69.32135009765625, "logps/ref_chosen": -69.37046813964844, "logps/ref_rejected": -65.46798706054688, "logps/rejected": -76.27815246582031, "loss": 1.0605, "margin_dpo/margin_mean": 10.85927963256836, "margin_dpo/margin_std": 16.79261016845703, "step": 564 }, { "epoch": 0.854119425547997, "grad_norm": 17.255325317382812, "learning_rate": 3.218356679178252e-08, "logits/chosen": 1.6891493797302246, "logits/rejected": 1.2135411500930786, "logps/chosen": -93.90524291992188, "logps/ref_chosen": -76.99365234375, "logps/ref_rejected": -154.01608276367188, "logps/rejected": -174.64259338378906, "loss": 1.0313, "margin_dpo/margin_mean": 3.714911937713623, "margin_dpo/margin_std": 12.842723846435547, "step": 565 }, { "epoch": 0.8556311413454271, "grad_norm": 20.989856719970703, "learning_rate": 3.1537655732553764e-08, "logits/chosen": 1.7889273166656494, "logits/rejected": 1.5454118251800537, "logps/chosen": -83.66606140136719, "logps/ref_chosen": -81.36064910888672, "logps/ref_rejected": -107.46647644042969, "logps/rejected": -121.88374328613281, "loss": 1.0831, "margin_dpo/margin_mean": 12.111852645874023, "margin_dpo/margin_std": 8.250154495239258, "step": 566 }, { "epoch": 0.8571428571428571, "grad_norm": 17.08823013305664, "learning_rate": 3.089785553471233e-08, "logits/chosen": 1.2035924196243286, "logits/rejected": 1.295198917388916, "logps/chosen": -85.9408187866211, "logps/ref_chosen": -82.7647705078125, "logps/ref_rejected": -71.20525360107422, "logps/rejected": -84.59428405761719, "loss": 0.943, "margin_dpo/margin_mean": 10.21297836303711, "margin_dpo/margin_std": 12.372262954711914, "step": 567 }, { "epoch": 0.8586545729402872, "grad_norm": 15.560006141662598, "learning_rate": 3.026418409484513e-08, "logits/chosen": 1.5644490718841553, "logits/rejected": 1.3717920780181885, "logps/chosen": -58.53541564941406, "logps/ref_chosen": -57.66379165649414, "logps/ref_rejected": -101.05614471435547, "logps/rejected": -107.21061706542969, "loss": 0.8891, "margin_dpo/margin_mean": 5.282842636108398, "margin_dpo/margin_std": 10.844523429870605, "step": 568 }, { "epoch": 0.8601662887377173, "grad_norm": 19.204389572143555, "learning_rate": 2.963665913810451e-08, "logits/chosen": 1.8129878044128418, "logits/rejected": 1.830693006515503, "logps/chosen": -98.49859619140625, "logps/ref_chosen": -96.91630554199219, "logps/ref_rejected": -103.29447937011719, "logps/rejected": -117.35980224609375, "loss": 1.1218, "margin_dpo/margin_mean": 12.4830322265625, "margin_dpo/margin_std": 14.160663604736328, "step": 569 }, { "epoch": 0.8616780045351474, "grad_norm": 14.261395454406738, "learning_rate": 2.9015298217712453e-08, "logits/chosen": 1.703674077987671, "logits/rejected": 1.599630355834961, "logps/chosen": -74.10055541992188, "logps/ref_chosen": -70.51353454589844, "logps/ref_rejected": -72.61724853515625, "logps/rejected": -82.44377136230469, "loss": 0.8116, "margin_dpo/margin_mean": 6.23950719833374, "margin_dpo/margin_std": 9.172955513000488, "step": 570 }, { "epoch": 0.8631897203325775, "grad_norm": 20.0264892578125, "learning_rate": 2.840011871446962e-08, "logits/chosen": 1.3966124057769775, "logits/rejected": 1.0823702812194824, "logps/chosen": -63.99068832397461, "logps/ref_chosen": -65.33963775634766, "logps/ref_rejected": -93.44989776611328, "logps/rejected": -100.00010681152344, "loss": 1.1862, "margin_dpo/margin_mean": 7.89915657043457, "margin_dpo/margin_std": 7.443321228027344, "step": 571 }, { "epoch": 0.8647014361300076, "grad_norm": 20.597599029541016, "learning_rate": 2.7791137836269158e-08, "logits/chosen": 1.2991360425949097, "logits/rejected": 1.1408284902572632, "logps/chosen": -63.88082504272461, "logps/ref_chosen": -61.96685791015625, "logps/ref_rejected": -82.47567749023438, "logps/rejected": -95.54309844970703, "loss": 1.0317, "margin_dpo/margin_mean": 11.153448104858398, "margin_dpo/margin_std": 13.577078819274902, "step": 572 }, { "epoch": 0.8662131519274376, "grad_norm": 19.121173858642578, "learning_rate": 2.718837261761528e-08, "logits/chosen": 1.855758786201477, "logits/rejected": 1.7033733129501343, "logps/chosen": -86.36241149902344, "logps/ref_chosen": -81.11073303222656, "logps/ref_rejected": -89.05966186523438, "logps/rejected": -94.73785400390625, "loss": 1.1345, "margin_dpo/margin_mean": 0.4265178442001343, "margin_dpo/margin_std": 13.555877685546875, "step": 573 }, { "epoch": 0.8677248677248677, "grad_norm": 16.195131301879883, "learning_rate": 2.659183991914696e-08, "logits/chosen": 1.4159319400787354, "logits/rejected": 1.5143362283706665, "logps/chosen": -79.60470581054688, "logps/ref_chosen": -70.46939086914062, "logps/ref_rejected": -65.94213104248047, "logps/rejected": -74.72261810302734, "loss": 0.8892, "margin_dpo/margin_mean": -0.3548187017440796, "margin_dpo/margin_std": 8.279047012329102, "step": 574 }, { "epoch": 0.8692365835222978, "grad_norm": 19.91777992248535, "learning_rate": 2.600155642716606e-08, "logits/chosen": 1.7439351081848145, "logits/rejected": 1.3420132398605347, "logps/chosen": -86.42735290527344, "logps/ref_chosen": -80.69290924072266, "logps/ref_rejected": -137.00990295410156, "logps/rejected": -145.45672607421875, "loss": 1.1895, "margin_dpo/margin_mean": 2.712390899658203, "margin_dpo/margin_std": 14.354545593261719, "step": 575 }, { "epoch": 0.8707482993197279, "grad_norm": 17.009872436523438, "learning_rate": 2.5417538653170754e-08, "logits/chosen": 1.9598956108093262, "logits/rejected": 1.705583095550537, "logps/chosen": -73.29476165771484, "logps/ref_chosen": -72.19505310058594, "logps/ref_rejected": -98.79043579101562, "logps/rejected": -109.89517974853516, "loss": 0.9331, "margin_dpo/margin_mean": 10.00504207611084, "margin_dpo/margin_std": 10.71027946472168, "step": 576 }, { "epoch": 0.872260015117158, "grad_norm": 17.953590393066406, "learning_rate": 2.4839802933393607e-08, "logits/chosen": 2.0896143913269043, "logits/rejected": 1.9234864711761475, "logps/chosen": -58.926387786865234, "logps/ref_chosen": -59.01421356201172, "logps/ref_rejected": -77.23394775390625, "logps/rejected": -82.68992614746094, "loss": 1.1756, "margin_dpo/margin_mean": 5.5438008308410645, "margin_dpo/margin_std": 11.082096099853516, "step": 577 }, { "epoch": 0.873771730914588, "grad_norm": 17.826337814331055, "learning_rate": 2.4268365428344733e-08, "logits/chosen": 1.8121038675308228, "logits/rejected": 1.4766490459442139, "logps/chosen": -76.66905975341797, "logps/ref_chosen": -70.22001647949219, "logps/ref_rejected": -101.47371673583984, "logps/rejected": -111.71737670898438, "loss": 1.1596, "margin_dpo/margin_mean": 3.7946228981018066, "margin_dpo/margin_std": 8.00619125366211, "step": 578 }, { "epoch": 0.8752834467120182, "grad_norm": 17.362586975097656, "learning_rate": 2.3703242122359357e-08, "logits/chosen": 1.3625679016113281, "logits/rejected": 1.224226713180542, "logps/chosen": -58.240657806396484, "logps/ref_chosen": -54.6666374206543, "logps/ref_rejected": -80.38136291503906, "logps/rejected": -95.12175750732422, "loss": 0.9753, "margin_dpo/margin_mean": 11.166372299194336, "margin_dpo/margin_std": 11.77037525177002, "step": 579 }, { "epoch": 0.8767951625094482, "grad_norm": 17.465675354003906, "learning_rate": 2.3144448823151392e-08, "logits/chosen": 1.2661099433898926, "logits/rejected": 1.150763988494873, "logps/chosen": -75.54026794433594, "logps/ref_chosen": -76.24860382080078, "logps/ref_rejected": -95.67335510253906, "logps/rejected": -100.74491882324219, "loss": 1.0789, "margin_dpo/margin_mean": 5.779911994934082, "margin_dpo/margin_std": 10.722372055053711, "step": 580 }, { "epoch": 0.8783068783068783, "grad_norm": 18.842321395874023, "learning_rate": 2.259200116137039e-08, "logits/chosen": 2.0160927772521973, "logits/rejected": 1.8844667673110962, "logps/chosen": -81.1261215209961, "logps/ref_chosen": -77.2040786743164, "logps/ref_rejected": -101.34941101074219, "logps/rejected": -111.83381652832031, "loss": 1.0693, "margin_dpo/margin_mean": 6.5623602867126465, "margin_dpo/margin_std": 10.058853149414062, "step": 581 }, { "epoch": 0.8798185941043084, "grad_norm": 16.665578842163086, "learning_rate": 2.204591459016525e-08, "logits/chosen": 1.1603500843048096, "logits/rejected": 0.8706477880477905, "logps/chosen": -69.9503173828125, "logps/ref_chosen": -68.70108795166016, "logps/ref_rejected": -101.86036682128906, "logps/rejected": -118.56051635742188, "loss": 1.1055, "margin_dpo/margin_mean": 15.450916290283203, "margin_dpo/margin_std": 13.473276138305664, "step": 582 }, { "epoch": 0.8813303099017384, "grad_norm": 23.580341339111328, "learning_rate": 2.1506204384751064e-08, "logits/chosen": 1.7272263765335083, "logits/rejected": 1.6504167318344116, "logps/chosen": -77.47309112548828, "logps/ref_chosen": -71.14523315429688, "logps/ref_rejected": -84.20356750488281, "logps/rejected": -91.38108825683594, "loss": 1.2151, "margin_dpo/margin_mean": 0.8496625423431396, "margin_dpo/margin_std": 14.993759155273438, "step": 583 }, { "epoch": 0.8828420256991686, "grad_norm": 20.39993667602539, "learning_rate": 2.09728856419826e-08, "logits/chosen": 1.7487156391143799, "logits/rejected": 1.4867665767669678, "logps/chosen": -54.972251892089844, "logps/ref_chosen": -54.96758270263672, "logps/ref_rejected": -99.0251235961914, "logps/rejected": -106.09762573242188, "loss": 1.0997, "margin_dpo/margin_mean": 7.067837238311768, "margin_dpo/margin_std": 12.54556655883789, "step": 584 }, { "epoch": 0.8843537414965986, "grad_norm": 16.980655670166016, "learning_rate": 2.044597327993153e-08, "logits/chosen": 1.5392262935638428, "logits/rejected": 1.205371379852295, "logps/chosen": -77.55860900878906, "logps/ref_chosen": -74.56783294677734, "logps/ref_rejected": -136.12681579589844, "logps/rejected": -144.6713104248047, "loss": 1.179, "margin_dpo/margin_mean": 5.553707122802734, "margin_dpo/margin_std": 10.252649307250977, "step": 585 }, { "epoch": 0.8858654572940288, "grad_norm": 15.427536964416504, "learning_rate": 1.9925482037469187e-08, "logits/chosen": 1.4193227291107178, "logits/rejected": 1.4874060153961182, "logps/chosen": -73.52600860595703, "logps/ref_chosen": -73.84326171875, "logps/ref_rejected": -64.78768920898438, "logps/rejected": -74.1087646484375, "loss": 0.949, "margin_dpo/margin_mean": 9.638315200805664, "margin_dpo/margin_std": 9.592960357666016, "step": 586 }, { "epoch": 0.8873771730914588, "grad_norm": 21.97846031188965, "learning_rate": 1.9411426473854687e-08, "logits/chosen": 1.8039758205413818, "logits/rejected": 1.6281602382659912, "logps/chosen": -77.53922271728516, "logps/ref_chosen": -72.15461730957031, "logps/ref_rejected": -86.22157287597656, "logps/rejected": -95.01545715332031, "loss": 1.0184, "margin_dpo/margin_mean": 3.409271478652954, "margin_dpo/margin_std": 9.201944351196289, "step": 587 }, { "epoch": 0.8888888888888888, "grad_norm": 23.550718307495117, "learning_rate": 1.890382096832699e-08, "logits/chosen": 1.2994554042816162, "logits/rejected": 1.1653039455413818, "logps/chosen": -84.40214538574219, "logps/ref_chosen": -83.18878936767578, "logps/ref_rejected": -111.43290710449219, "logps/rejected": -122.60812377929688, "loss": 0.9737, "margin_dpo/margin_mean": 9.961862564086914, "margin_dpo/margin_std": 7.418084144592285, "step": 588 }, { "epoch": 0.890400604686319, "grad_norm": 20.140161514282227, "learning_rate": 1.840267971970344e-08, "logits/chosen": 0.9855322241783142, "logits/rejected": 1.0252772569656372, "logps/chosen": -70.16638946533203, "logps/ref_chosen": -67.77902221679688, "logps/ref_rejected": -89.7593765258789, "logps/rejected": -100.51022338867188, "loss": 0.9343, "margin_dpo/margin_mean": 8.363478660583496, "margin_dpo/margin_std": 12.025894165039062, "step": 589 }, { "epoch": 0.891912320483749, "grad_norm": 18.417516708374023, "learning_rate": 1.7908016745981856e-08, "logits/chosen": 1.2558910846710205, "logits/rejected": 1.2042444944381714, "logps/chosen": -69.4395523071289, "logps/ref_chosen": -65.99527740478516, "logps/ref_rejected": -78.5193862915039, "logps/rejected": -85.49420166015625, "loss": 0.9678, "margin_dpo/margin_mean": 3.530546188354492, "margin_dpo/margin_std": 12.429353713989258, "step": 590 }, { "epoch": 0.8934240362811792, "grad_norm": 17.514497756958008, "learning_rate": 1.7419845883949098e-08, "logits/chosen": 1.6709749698638916, "logits/rejected": 1.6045269966125488, "logps/chosen": -66.72975158691406, "logps/ref_chosen": -66.46542358398438, "logps/ref_rejected": -80.44232177734375, "logps/rejected": -90.60441589355469, "loss": 0.9874, "margin_dpo/margin_mean": 9.897760391235352, "margin_dpo/margin_std": 11.023052215576172, "step": 591 }, { "epoch": 0.8949357520786092, "grad_norm": 15.969135284423828, "learning_rate": 1.6938180788793556e-08, "logits/chosen": 1.3078227043151855, "logits/rejected": 1.1886652708053589, "logps/chosen": -67.06452941894531, "logps/ref_chosen": -67.20004272460938, "logps/ref_rejected": -92.5379638671875, "logps/rejected": -104.81024169921875, "loss": 0.9836, "margin_dpo/margin_mean": 12.407782554626465, "margin_dpo/margin_std": 11.735429763793945, "step": 592 }, { "epoch": 0.8964474678760394, "grad_norm": 17.383333206176758, "learning_rate": 1.6463034933723336e-08, "logits/chosen": 1.35636568069458, "logits/rejected": 1.3236842155456543, "logps/chosen": -86.72758483886719, "logps/ref_chosen": -82.37186431884766, "logps/ref_rejected": -97.39662170410156, "logps/rejected": -111.61054992675781, "loss": 1.0598, "margin_dpo/margin_mean": 9.858206748962402, "margin_dpo/margin_std": 10.82987117767334, "step": 593 }, { "epoch": 0.8979591836734694, "grad_norm": 17.741548538208008, "learning_rate": 1.5994421609589385e-08, "logits/chosen": 1.5507720708847046, "logits/rejected": 1.4901726245880127, "logps/chosen": -81.3001480102539, "logps/ref_chosen": -77.17347717285156, "logps/ref_rejected": -85.09161376953125, "logps/rejected": -94.42967224121094, "loss": 1.0661, "margin_dpo/margin_mean": 5.21138334274292, "margin_dpo/margin_std": 7.391263961791992, "step": 594 }, { "epoch": 0.8994708994708994, "grad_norm": 18.9903564453125, "learning_rate": 1.553235392451377e-08, "logits/chosen": 1.7658567428588867, "logits/rejected": 1.4860146045684814, "logps/chosen": -52.50914001464844, "logps/ref_chosen": -51.691951751708984, "logps/ref_rejected": -93.16610717773438, "logps/rejected": -105.13961791992188, "loss": 0.9983, "margin_dpo/margin_mean": 11.156330108642578, "margin_dpo/margin_std": 12.697431564331055, "step": 595 }, { "epoch": 0.9009826152683296, "grad_norm": 21.64061737060547, "learning_rate": 1.507684480352292e-08, "logits/chosen": 1.375877857208252, "logits/rejected": 1.2652180194854736, "logps/chosen": -78.5640869140625, "logps/ref_chosen": -76.55400085449219, "logps/ref_rejected": -81.81349182128906, "logps/rejected": -89.6481704711914, "loss": 1.3009, "margin_dpo/margin_mean": 5.824599742889404, "margin_dpo/margin_std": 11.860013961791992, "step": 596 }, { "epoch": 0.9024943310657596, "grad_norm": 15.115272521972656, "learning_rate": 1.4627906988186111e-08, "logits/chosen": 1.380811095237732, "logits/rejected": 1.4001131057739258, "logps/chosen": -88.30099487304688, "logps/ref_chosen": -85.21321868896484, "logps/ref_rejected": -76.54679870605469, "logps/rejected": -82.6546630859375, "loss": 1.0121, "margin_dpo/margin_mean": 3.020108699798584, "margin_dpo/margin_std": 13.225191116333008, "step": 597 }, { "epoch": 0.9040060468631897, "grad_norm": 20.66960906982422, "learning_rate": 1.4185553036259095e-08, "logits/chosen": 0.8799390196800232, "logits/rejected": 0.8972224593162537, "logps/chosen": -67.20416259765625, "logps/ref_chosen": -63.86994171142578, "logps/ref_rejected": -81.32545471191406, "logps/rejected": -90.64559936523438, "loss": 1.1724, "margin_dpo/margin_mean": 5.985927581787109, "margin_dpo/margin_std": 11.441094398498535, "step": 598 }, { "epoch": 0.9055177626606198, "grad_norm": 17.244525909423828, "learning_rate": 1.3749795321332885e-08, "logits/chosen": 1.1204272508621216, "logits/rejected": 1.0863747596740723, "logps/chosen": -86.43338012695312, "logps/ref_chosen": -81.67704772949219, "logps/ref_rejected": -82.86469268798828, "logps/rejected": -91.4560775756836, "loss": 1.0619, "margin_dpo/margin_mean": 3.8350515365600586, "margin_dpo/margin_std": 11.17123031616211, "step": 599 }, { "epoch": 0.9070294784580499, "grad_norm": 17.85767364501953, "learning_rate": 1.3320646032487393e-08, "logits/chosen": 1.3827404975891113, "logits/rejected": 1.4633660316467285, "logps/chosen": -84.48431396484375, "logps/ref_chosen": -81.40211486816406, "logps/ref_rejected": -67.45731353759766, "logps/rejected": -79.35734558105469, "loss": 1.1318, "margin_dpo/margin_mean": 8.817838668823242, "margin_dpo/margin_std": 11.690530776977539, "step": 600 }, { "epoch": 0.9070294784580499, "eval_logits/chosen": 1.4432964324951172, "eval_logits/rejected": 1.3188092708587646, "eval_logps/chosen": -90.09376525878906, "eval_logps/ref_chosen": -87.31719970703125, "eval_logps/ref_rejected": -95.23231506347656, "eval_logps/rejected": -105.90369415283203, "eval_loss": 0.5179798007011414, "eval_margin_dpo/margin_mean": 7.8948211669921875, "eval_margin_dpo/margin_std": 11.682005882263184, "eval_runtime": 42.7882, "eval_samples_per_second": 53.823, "eval_steps_per_second": 1.683, "step": 600 }, { "epoch": 0.90854119425548, "grad_norm": 14.905508041381836, "learning_rate": 1.2898117173950868e-08, "logits/chosen": 1.3443217277526855, "logits/rejected": 1.2762761116027832, "logps/chosen": -88.09783935546875, "logps/ref_chosen": -87.0102310180664, "logps/ref_rejected": -80.25422668457031, "logps/rejected": -92.59102630615234, "loss": 0.93, "margin_dpo/margin_mean": 11.24919605255127, "margin_dpo/margin_std": 15.050134658813477, "step": 601 }, { "epoch": 0.91005291005291, "grad_norm": 17.121368408203125, "learning_rate": 1.2482220564763667e-08, "logits/chosen": 1.5085530281066895, "logits/rejected": 1.3384506702423096, "logps/chosen": -85.7445068359375, "logps/ref_chosen": -87.5465316772461, "logps/ref_rejected": -113.92408752441406, "logps/rejected": -125.63684844970703, "loss": 0.9347, "margin_dpo/margin_mean": 13.514793395996094, "margin_dpo/margin_std": 11.44156551361084, "step": 602 }, { "epoch": 0.9115646258503401, "grad_norm": 17.26368522644043, "learning_rate": 1.2072967838448051e-08, "logits/chosen": 1.3381762504577637, "logits/rejected": 0.9238216876983643, "logps/chosen": -82.68363952636719, "logps/ref_chosen": -77.85739135742188, "logps/ref_rejected": -131.91189575195312, "logps/rejected": -144.17015075683594, "loss": 1.0748, "margin_dpo/margin_mean": 7.431997776031494, "margin_dpo/margin_std": 11.796621322631836, "step": 603 }, { "epoch": 0.9130763416477702, "grad_norm": 18.104822158813477, "learning_rate": 1.1670370442682459e-08, "logits/chosen": 1.3699002265930176, "logits/rejected": 1.2324756383895874, "logps/chosen": -82.2676773071289, "logps/ref_chosen": -82.13948822021484, "logps/ref_rejected": -77.12074279785156, "logps/rejected": -83.83705139160156, "loss": 1.1335, "margin_dpo/margin_mean": 6.588123321533203, "margin_dpo/margin_std": 10.303499221801758, "step": 604 }, { "epoch": 0.9145880574452003, "grad_norm": 18.233715057373047, "learning_rate": 1.1274439638981532e-08, "logits/chosen": 1.688083291053772, "logits/rejected": 1.3717212677001953, "logps/chosen": -64.18658447265625, "logps/ref_chosen": -61.72200012207031, "logps/ref_rejected": -111.85667419433594, "logps/rejected": -120.89041137695312, "loss": 1.0935, "margin_dpo/margin_mean": 6.56913948059082, "margin_dpo/margin_std": 11.750870704650879, "step": 605 }, { "epoch": 0.9160997732426304, "grad_norm": 16.52940559387207, "learning_rate": 1.0885186502381016e-08, "logits/chosen": 1.523679494857788, "logits/rejected": 1.289475440979004, "logps/chosen": -73.48724365234375, "logps/ref_chosen": -70.04190063476562, "logps/ref_rejected": -97.20962524414062, "logps/rejected": -109.41299438476562, "loss": 1.0224, "margin_dpo/margin_mean": 8.75802993774414, "margin_dpo/margin_std": 10.746617317199707, "step": 606 }, { "epoch": 0.9176114890400605, "grad_norm": 18.193004608154297, "learning_rate": 1.0502621921127774e-08, "logits/chosen": 0.9303282499313354, "logits/rejected": 1.428611159324646, "logps/chosen": -120.57992553710938, "logps/ref_chosen": -116.22660064697266, "logps/ref_rejected": -99.57171630859375, "logps/rejected": -112.75103759765625, "loss": 0.9301, "margin_dpo/margin_mean": 8.825995445251465, "margin_dpo/margin_std": 11.593498229980469, "step": 607 }, { "epoch": 0.9191232048374905, "grad_norm": 21.648239135742188, "learning_rate": 1.0126756596375685e-08, "logits/chosen": 1.3450318574905396, "logits/rejected": 1.3037018775939941, "logps/chosen": -88.96870422363281, "logps/ref_chosen": -86.08586120605469, "logps/ref_rejected": -73.17562866210938, "logps/rejected": -80.0965576171875, "loss": 1.1659, "margin_dpo/margin_mean": 4.038094520568848, "margin_dpo/margin_std": 8.220069885253906, "step": 608 }, { "epoch": 0.9206349206349206, "grad_norm": 14.971814155578613, "learning_rate": 9.757601041885694e-09, "logits/chosen": 1.632197380065918, "logits/rejected": 1.3415696620941162, "logps/chosen": -67.89978790283203, "logps/ref_chosen": -66.4081802368164, "logps/ref_rejected": -109.58668518066406, "logps/rejected": -120.42801666259766, "loss": 0.9099, "margin_dpo/margin_mean": 9.3497314453125, "margin_dpo/margin_std": 8.75728988647461, "step": 609 }, { "epoch": 0.9221466364323507, "grad_norm": 22.184701919555664, "learning_rate": 9.395165583732379e-09, "logits/chosen": 1.481752634048462, "logits/rejected": 1.5864505767822266, "logps/chosen": -108.60702514648438, "logps/ref_chosen": -104.24087524414062, "logps/ref_rejected": -104.20893096923828, "logps/rejected": -115.11647033691406, "loss": 1.0641, "margin_dpo/margin_mean": 6.5413923263549805, "margin_dpo/margin_std": 11.715717315673828, "step": 610 }, { "epoch": 0.9236583522297808, "grad_norm": 17.456018447875977, "learning_rate": 9.03946036001449e-09, "logits/chosen": 1.4371566772460938, "logits/rejected": 1.4335110187530518, "logps/chosen": -97.08434295654297, "logps/ref_chosen": -91.26354217529297, "logps/ref_rejected": -90.4445571899414, "logps/rejected": -98.9804916381836, "loss": 1.1214, "margin_dpo/margin_mean": 2.7151358127593994, "margin_dpo/margin_std": 10.84466552734375, "step": 611 }, { "epoch": 0.9251700680272109, "grad_norm": 16.698745727539062, "learning_rate": 8.690495320571839e-09, "logits/chosen": 0.7719953656196594, "logits/rejected": 0.60256427526474, "logps/chosen": -118.38748168945312, "logps/ref_chosen": -114.47161102294922, "logps/ref_rejected": -142.44839477539062, "logps/rejected": -153.93727111816406, "loss": 0.9955, "margin_dpo/margin_mean": 7.573012351989746, "margin_dpo/margin_std": 9.594082832336426, "step": 612 }, { "epoch": 0.926681783824641, "grad_norm": 18.308046340942383, "learning_rate": 8.348280226706722e-09, "logits/chosen": 0.7473181486129761, "logits/rejected": 0.7265275120735168, "logps/chosen": -60.11067581176758, "logps/ref_chosen": -56.83968734741211, "logps/ref_rejected": -62.65274429321289, "logps/rejected": -72.4839859008789, "loss": 0.9079, "margin_dpo/margin_mean": 6.560257911682129, "margin_dpo/margin_std": 11.889326095581055, "step": 613 }, { "epoch": 0.9281934996220711, "grad_norm": 18.27388572692871, "learning_rate": 8.012824650910937e-09, "logits/chosen": 1.9573242664337158, "logits/rejected": 1.592354416847229, "logps/chosen": -66.1085433959961, "logps/ref_chosen": -65.70287322998047, "logps/ref_rejected": -91.19664001464844, "logps/rejected": -102.28840637207031, "loss": 0.997, "margin_dpo/margin_mean": 10.6860990524292, "margin_dpo/margin_std": 9.086450576782227, "step": 614 }, { "epoch": 0.9297052154195011, "grad_norm": 21.73447036743164, "learning_rate": 7.684137976598088e-09, "logits/chosen": 1.5744948387145996, "logits/rejected": 1.548459529876709, "logps/chosen": -88.8761215209961, "logps/ref_chosen": -85.73095703125, "logps/ref_rejected": -110.56735229492188, "logps/rejected": -120.92332458496094, "loss": 0.9742, "margin_dpo/margin_mean": 7.210803031921387, "margin_dpo/margin_std": 12.126655578613281, "step": 615 }, { "epoch": 0.9312169312169312, "grad_norm": 18.48801040649414, "learning_rate": 7.36222939784098e-09, "logits/chosen": 1.3054572343826294, "logits/rejected": 1.1838648319244385, "logps/chosen": -85.79578399658203, "logps/ref_chosen": -83.71074676513672, "logps/ref_rejected": -105.13561248779297, "logps/rejected": -111.24017333984375, "loss": 1.0574, "margin_dpo/margin_mean": 4.019524097442627, "margin_dpo/margin_std": 9.304898262023926, "step": 616 }, { "epoch": 0.9327286470143613, "grad_norm": 17.521886825561523, "learning_rate": 7.047107919114586e-09, "logits/chosen": 1.7559709548950195, "logits/rejected": 1.6681147813796997, "logps/chosen": -92.1893539428711, "logps/ref_chosen": -87.66813659667969, "logps/ref_rejected": -104.60226440429688, "logps/rejected": -115.18096160888672, "loss": 0.9979, "margin_dpo/margin_mean": 6.057486534118652, "margin_dpo/margin_std": 7.72330379486084, "step": 617 }, { "epoch": 0.9342403628117913, "grad_norm": 18.390331268310547, "learning_rate": 6.738782355044048e-09, "logits/chosen": 1.4357054233551025, "logits/rejected": 1.0602905750274658, "logps/chosen": -81.81050109863281, "logps/ref_chosen": -81.6343994140625, "logps/ref_rejected": -131.33901977539062, "logps/rejected": -143.23614501953125, "loss": 1.0827, "margin_dpo/margin_mean": 11.72103500366211, "margin_dpo/margin_std": 13.690505981445312, "step": 618 }, { "epoch": 0.9357520786092215, "grad_norm": 16.01395606994629, "learning_rate": 6.437261330158206e-09, "logits/chosen": 1.7714673280715942, "logits/rejected": 1.6839182376861572, "logps/chosen": -69.84000396728516, "logps/ref_chosen": -69.38705444335938, "logps/ref_rejected": -85.20960235595703, "logps/rejected": -96.47872924804688, "loss": 0.9616, "margin_dpo/margin_mean": 10.816186904907227, "margin_dpo/margin_std": 13.97287368774414, "step": 619 }, { "epoch": 0.9372637944066515, "grad_norm": 20.930551528930664, "learning_rate": 6.142553278648238e-09, "logits/chosen": 1.3915448188781738, "logits/rejected": 1.3259550333023071, "logps/chosen": -104.18832397460938, "logps/ref_chosen": -99.11640167236328, "logps/ref_rejected": -101.99665832519531, "logps/rejected": -115.02188873291016, "loss": 1.1532, "margin_dpo/margin_mean": 7.953309535980225, "margin_dpo/margin_std": 13.846855163574219, "step": 620 }, { "epoch": 0.9387755102040817, "grad_norm": 18.575275421142578, "learning_rate": 5.854666444131934e-09, "logits/chosen": 1.3157460689544678, "logits/rejected": 1.5444042682647705, "logps/chosen": -97.02737426757812, "logps/ref_chosen": -94.86390686035156, "logps/ref_rejected": -71.17986297607422, "logps/rejected": -80.39669799804688, "loss": 1.086, "margin_dpo/margin_mean": 7.053367614746094, "margin_dpo/margin_std": 13.099421501159668, "step": 621 }, { "epoch": 0.9402872260015117, "grad_norm": 16.817035675048828, "learning_rate": 5.573608879422875e-09, "logits/chosen": 1.4764742851257324, "logits/rejected": 1.4986090660095215, "logps/chosen": -77.92872619628906, "logps/ref_chosen": -72.72981262207031, "logps/ref_rejected": -73.24261474609375, "logps/rejected": -85.58164978027344, "loss": 1.0007, "margin_dpo/margin_mean": 7.140112400054932, "margin_dpo/margin_std": 11.715137481689453, "step": 622 }, { "epoch": 0.9417989417989417, "grad_norm": 18.876110076904297, "learning_rate": 5.299388446305342e-09, "logits/chosen": 1.6838326454162598, "logits/rejected": 1.3310813903808594, "logps/chosen": -72.85499572753906, "logps/ref_chosen": -66.93772888183594, "logps/ref_rejected": -117.25381469726562, "logps/rejected": -129.57510375976562, "loss": 0.9928, "margin_dpo/margin_mean": 6.404027938842773, "margin_dpo/margin_std": 10.223091125488281, "step": 623 }, { "epoch": 0.9433106575963719, "grad_norm": 17.289594650268555, "learning_rate": 5.03201281531429e-09, "logits/chosen": 1.159879446029663, "logits/rejected": 1.0869073867797852, "logps/chosen": -79.07652282714844, "logps/ref_chosen": -76.47087097167969, "logps/ref_rejected": -77.22698974609375, "logps/rejected": -85.85189819335938, "loss": 0.9579, "margin_dpo/margin_mean": 6.019251823425293, "margin_dpo/margin_std": 8.766650199890137, "step": 624 }, { "epoch": 0.9448223733938019, "grad_norm": 19.9653263092041, "learning_rate": 4.7714894655209174e-09, "logits/chosen": 2.0300965309143066, "logits/rejected": 1.7943127155303955, "logps/chosen": -67.40950012207031, "logps/ref_chosen": -66.40412902832031, "logps/ref_rejected": -98.97720336914062, "logps/rejected": -107.16277313232422, "loss": 1.1839, "margin_dpo/margin_mean": 7.180194854736328, "margin_dpo/margin_std": 8.925931930541992, "step": 625 }, { "epoch": 0.9463340891912321, "grad_norm": 18.206520080566406, "learning_rate": 4.517825684323323e-09, "logits/chosen": 1.9503434896469116, "logits/rejected": 1.7698050737380981, "logps/chosen": -47.13784408569336, "logps/ref_chosen": -49.676002502441406, "logps/ref_rejected": -91.54035949707031, "logps/rejected": -100.55081176757812, "loss": 1.0456, "margin_dpo/margin_mean": 11.548616409301758, "margin_dpo/margin_std": 16.879806518554688, "step": 626 }, { "epoch": 0.9478458049886621, "grad_norm": 18.35430908203125, "learning_rate": 4.271028567242818e-09, "logits/chosen": 1.461404800415039, "logits/rejected": 1.123425006866455, "logps/chosen": -82.38633728027344, "logps/ref_chosen": -81.1116943359375, "logps/ref_rejected": -146.22958374023438, "logps/rejected": -161.44476318359375, "loss": 0.8552, "margin_dpo/margin_mean": 13.94053840637207, "margin_dpo/margin_std": 11.15290355682373, "step": 627 }, { "epoch": 0.9493575207860923, "grad_norm": 19.281156539916992, "learning_rate": 4.0311050177251895e-09, "logits/chosen": 1.6440317630767822, "logits/rejected": 1.7929996252059937, "logps/chosen": -86.36823272705078, "logps/ref_chosen": -87.46820068359375, "logps/ref_rejected": -86.33444213867188, "logps/rejected": -100.18806457519531, "loss": 1.045, "margin_dpo/margin_mean": 14.953593254089355, "margin_dpo/margin_std": 15.51756763458252, "step": 628 }, { "epoch": 0.9508692365835223, "grad_norm": 19.004770278930664, "learning_rate": 3.798061746947995e-09, "logits/chosen": 1.8739081621170044, "logits/rejected": 2.0123767852783203, "logps/chosen": -89.065185546875, "logps/ref_chosen": -88.49932861328125, "logps/ref_rejected": -64.12482452392578, "logps/rejected": -69.10171508789062, "loss": 0.9637, "margin_dpo/margin_mean": 4.41103458404541, "margin_dpo/margin_std": 9.106287956237793, "step": 629 }, { "epoch": 0.9523809523809523, "grad_norm": 14.79022216796875, "learning_rate": 3.5719052736323806e-09, "logits/chosen": 1.4147577285766602, "logits/rejected": 1.2344114780426025, "logps/chosen": -51.37743377685547, "logps/ref_chosen": -53.218597412109375, "logps/ref_rejected": -85.27548217773438, "logps/rejected": -94.2133560180664, "loss": 0.9172, "margin_dpo/margin_mean": 10.779036521911621, "margin_dpo/margin_std": 12.927876472473145, "step": 630 }, { "epoch": 0.9538926681783825, "grad_norm": 15.910907745361328, "learning_rate": 3.352641923861144e-09, "logits/chosen": 1.8082447052001953, "logits/rejected": 1.6056909561157227, "logps/chosen": -92.31981658935547, "logps/ref_chosen": -87.3474349975586, "logps/ref_rejected": -109.71377563476562, "logps/rejected": -123.54084777832031, "loss": 0.8691, "margin_dpo/margin_mean": 8.854686737060547, "margin_dpo/margin_std": 8.26478385925293, "step": 631 }, { "epoch": 0.9554043839758125, "grad_norm": 15.58011531829834, "learning_rate": 3.140277830901428e-09, "logits/chosen": 1.8941532373428345, "logits/rejected": 1.744066834449768, "logps/chosen": -91.18122100830078, "logps/ref_chosen": -87.20173645019531, "logps/ref_rejected": -82.0855484008789, "logps/rejected": -93.52336883544922, "loss": 0.9676, "margin_dpo/margin_mean": 7.458335876464844, "margin_dpo/margin_std": 9.417821884155273, "step": 632 }, { "epoch": 0.9569160997732427, "grad_norm": 16.345203399658203, "learning_rate": 2.9348189350335007e-09, "logits/chosen": 1.3171489238739014, "logits/rejected": 1.1669178009033203, "logps/chosen": -60.66598892211914, "logps/ref_chosen": -61.20926284790039, "logps/ref_rejected": -81.50201416015625, "logps/rejected": -89.64041137695312, "loss": 0.9564, "margin_dpo/margin_mean": 8.681678771972656, "margin_dpo/margin_std": 10.067359924316406, "step": 633 }, { "epoch": 0.9584278155706727, "grad_norm": 23.774341583251953, "learning_rate": 2.736270983384276e-09, "logits/chosen": 1.162872076034546, "logits/rejected": 1.1319793462753296, "logps/chosen": -84.03133392333984, "logps/ref_chosen": -79.12847137451172, "logps/ref_rejected": -75.25579833984375, "logps/rejected": -81.34612274169922, "loss": 1.2778, "margin_dpo/margin_mean": 1.1874574422836304, "margin_dpo/margin_std": 7.0797038078308105, "step": 634 }, { "epoch": 0.9599395313681028, "grad_norm": 19.676300048828125, "learning_rate": 2.5446395297668287e-09, "logits/chosen": 1.548421859741211, "logits/rejected": 1.4762241840362549, "logps/chosen": -72.77522277832031, "logps/ref_chosen": -65.92240905761719, "logps/ref_rejected": -84.54975128173828, "logps/rejected": -97.1646728515625, "loss": 1.1816, "margin_dpo/margin_mean": 5.7621049880981445, "margin_dpo/margin_std": 15.856145858764648, "step": 635 }, { "epoch": 0.9614512471655329, "grad_norm": 15.20212173461914, "learning_rate": 2.359929934524829e-09, "logits/chosen": 1.4818272590637207, "logits/rejected": 1.1474237442016602, "logps/chosen": -75.02130889892578, "logps/ref_chosen": -70.1754150390625, "logps/ref_rejected": -96.32096099853516, "logps/rejected": -112.0989990234375, "loss": 0.8145, "margin_dpo/margin_mean": 10.932147979736328, "margin_dpo/margin_std": 11.150850296020508, "step": 636 }, { "epoch": 0.9629629629629629, "grad_norm": 17.8300724029541, "learning_rate": 2.1821473643827137e-09, "logits/chosen": 1.8633533716201782, "logits/rejected": 1.5756624937057495, "logps/chosen": -92.27776336669922, "logps/ref_chosen": -86.76708221435547, "logps/ref_rejected": -125.85480499267578, "logps/rejected": -140.7567138671875, "loss": 0.9679, "margin_dpo/margin_mean": 9.391212463378906, "margin_dpo/margin_std": 9.200068473815918, "step": 637 }, { "epoch": 0.9644746787603931, "grad_norm": 17.15129852294922, "learning_rate": 2.0112967923011646e-09, "logits/chosen": 0.8507182598114014, "logits/rejected": 0.8024640679359436, "logps/chosen": -98.4106216430664, "logps/ref_chosen": -91.50517272949219, "logps/ref_rejected": -98.6578140258789, "logps/rejected": -109.16825103759766, "loss": 0.9548, "margin_dpo/margin_mean": 3.6049978733062744, "margin_dpo/margin_std": 11.042359352111816, "step": 638 }, { "epoch": 0.9659863945578231, "grad_norm": 16.736635208129883, "learning_rate": 1.847382997337943e-09, "logits/chosen": 1.4646602869033813, "logits/rejected": 1.1195839643478394, "logps/chosen": -56.15229797363281, "logps/ref_chosen": -56.33502197265625, "logps/ref_rejected": -82.59422302246094, "logps/rejected": -95.43173217773438, "loss": 0.9509, "margin_dpo/margin_mean": 13.020225524902344, "margin_dpo/margin_std": 10.440542221069336, "step": 639 }, { "epoch": 0.9674981103552532, "grad_norm": 19.80340003967285, "learning_rate": 1.690410564514244e-09, "logits/chosen": 1.523207187652588, "logits/rejected": 1.152984619140625, "logps/chosen": -80.076171875, "logps/ref_chosen": -76.00202941894531, "logps/ref_rejected": -113.35598754882812, "logps/rejected": -123.30535888671875, "loss": 1.1823, "margin_dpo/margin_mean": 5.875223159790039, "margin_dpo/margin_std": 10.620429992675781, "step": 640 }, { "epoch": 0.9690098261526833, "grad_norm": 22.529207229614258, "learning_rate": 1.5403838846864692e-09, "logits/chosen": 1.6252977848052979, "logits/rejected": 1.810211181640625, "logps/chosen": -96.23757934570312, "logps/ref_chosen": -92.76283264160156, "logps/ref_rejected": -66.28691101074219, "logps/rejected": -74.52487182617188, "loss": 1.0508, "margin_dpo/margin_mean": 4.7632341384887695, "margin_dpo/margin_std": 11.673845291137695, "step": 641 }, { "epoch": 0.9705215419501134, "grad_norm": 29.48429298400879, "learning_rate": 1.3973071544233218e-09, "logits/chosen": 1.4339690208435059, "logits/rejected": 1.493814468383789, "logps/chosen": -89.35336303710938, "logps/ref_chosen": -86.39984130859375, "logps/ref_rejected": -85.24763488769531, "logps/rejected": -93.75303649902344, "loss": 1.0871, "margin_dpo/margin_mean": 5.55187463760376, "margin_dpo/margin_std": 13.663029670715332, "step": 642 }, { "epoch": 0.9720332577475435, "grad_norm": 21.092832565307617, "learning_rate": 1.261184375888541e-09, "logits/chosen": 1.4830613136291504, "logits/rejected": 1.2223186492919922, "logps/chosen": -114.34831237792969, "logps/ref_chosen": -105.88678741455078, "logps/ref_rejected": -115.27891540527344, "logps/rejected": -131.4384765625, "loss": 1.06, "margin_dpo/margin_mean": 7.698041915893555, "margin_dpo/margin_std": 7.342251777648926, "step": 643 }, { "epoch": 0.9735449735449735, "grad_norm": 18.641374588012695, "learning_rate": 1.1320193567288527e-09, "logits/chosen": 2.0806455612182617, "logits/rejected": 1.9463882446289062, "logps/chosen": -62.69779968261719, "logps/ref_chosen": -58.99338150024414, "logps/ref_rejected": -77.56077575683594, "logps/rejected": -87.70941162109375, "loss": 1.0957, "margin_dpo/margin_mean": 6.44422721862793, "margin_dpo/margin_std": 12.243961334228516, "step": 644 }, { "epoch": 0.9750566893424036, "grad_norm": 18.767841339111328, "learning_rate": 1.0098157099674987e-09, "logits/chosen": 1.5735886096954346, "logits/rejected": 1.6023855209350586, "logps/chosen": -77.97441864013672, "logps/ref_chosen": -73.263916015625, "logps/ref_rejected": -90.16896057128906, "logps/rejected": -107.21348571777344, "loss": 0.901, "margin_dpo/margin_mean": 12.334012031555176, "margin_dpo/margin_std": 13.413923263549805, "step": 645 }, { "epoch": 0.9765684051398337, "grad_norm": 15.216825485229492, "learning_rate": 8.945768539031783e-10, "logits/chosen": 1.3807857036590576, "logits/rejected": 1.1138019561767578, "logps/chosen": -71.50484466552734, "logps/ref_chosen": -67.53563690185547, "logps/ref_rejected": -93.27706146240234, "logps/rejected": -109.07088470458984, "loss": 0.9661, "margin_dpo/margin_mean": 11.82461929321289, "margin_dpo/margin_std": 11.310791015625, "step": 646 }, { "epoch": 0.9780801209372638, "grad_norm": 17.22197723388672, "learning_rate": 7.863060120144316e-10, "logits/chosen": 1.5823383331298828, "logits/rejected": 1.3332762718200684, "logps/chosen": -86.91207885742188, "logps/ref_chosen": -80.29917907714844, "logps/ref_rejected": -159.9084930419922, "logps/rejected": -174.36048889160156, "loss": 0.8157, "margin_dpo/margin_mean": 7.839094161987305, "margin_dpo/margin_std": 8.450590133666992, "step": 647 }, { "epoch": 0.9795918367346939, "grad_norm": 17.142921447753906, "learning_rate": 6.850062128694045e-10, "logits/chosen": 1.1645476818084717, "logits/rejected": 1.0589673519134521, "logps/chosen": -88.89888000488281, "logps/ref_chosen": -82.89018249511719, "logps/ref_rejected": -97.86979675292969, "logps/rejected": -108.09463500976562, "loss": 1.0128, "margin_dpo/margin_mean": 4.216141700744629, "margin_dpo/margin_std": 12.352148056030273, "step": 648 }, { "epoch": 0.981103552532124, "grad_norm": 20.739110946655273, "learning_rate": 5.906802900412788e-10, "logits/chosen": 1.5978401899337769, "logits/rejected": 1.4116055965423584, "logps/chosen": -63.5177001953125, "logps/ref_chosen": -60.08456039428711, "logps/ref_rejected": -79.35488891601562, "logps/rejected": -94.80354309082031, "loss": 1.0755, "margin_dpo/margin_mean": 12.015510559082031, "margin_dpo/margin_std": 11.092313766479492, "step": 649 }, { "epoch": 0.982615268329554, "grad_norm": 18.967485427856445, "learning_rate": 5.033308820289184e-10, "logits/chosen": 1.8647633790969849, "logits/rejected": 1.529266119003296, "logps/chosen": -60.950950622558594, "logps/ref_chosen": -60.14158630371094, "logps/ref_rejected": -110.77296447753906, "logps/rejected": -122.6583023071289, "loss": 0.9601, "margin_dpo/margin_mean": 11.0759859085083, "margin_dpo/margin_std": 10.035527229309082, "step": 650 }, { "epoch": 0.9841269841269841, "grad_norm": 18.412734985351562, "learning_rate": 4.2296043218295606e-10, "logits/chosen": 1.3484299182891846, "logits/rejected": 1.2784044742584229, "logps/chosen": -74.44709777832031, "logps/ref_chosen": -71.01092529296875, "logps/ref_rejected": -81.35868072509766, "logps/rejected": -87.52689361572266, "loss": 1.0279, "margin_dpo/margin_mean": 2.7320351600646973, "margin_dpo/margin_std": 11.313848495483398, "step": 651 }, { "epoch": 0.9856386999244142, "grad_norm": 23.452882766723633, "learning_rate": 3.4957118863768176e-10, "logits/chosen": 1.8453524112701416, "logits/rejected": 1.8005539178848267, "logps/chosen": -54.88365173339844, "logps/ref_chosen": -58.51313781738281, "logps/ref_rejected": -60.924888610839844, "logps/rejected": -70.33354187011719, "loss": 1.0983, "margin_dpo/margin_mean": 13.038140296936035, "margin_dpo/margin_std": 12.017801284790039, "step": 652 }, { "epoch": 0.9871504157218443, "grad_norm": 17.194881439208984, "learning_rate": 2.831652042480093e-10, "logits/chosen": 1.0439538955688477, "logits/rejected": 1.114800214767456, "logps/chosen": -92.80335998535156, "logps/ref_chosen": -91.18860626220703, "logps/ref_rejected": -92.69169616699219, "logps/rejected": -104.38011169433594, "loss": 0.9885, "margin_dpo/margin_mean": 10.073652267456055, "margin_dpo/margin_std": 11.544588088989258, "step": 653 }, { "epoch": 0.9886621315192744, "grad_norm": 19.679128646850586, "learning_rate": 2.2374433653205016e-10, "logits/chosen": 1.174835205078125, "logits/rejected": 0.917803943157196, "logps/chosen": -62.90448760986328, "logps/ref_chosen": -62.01698303222656, "logps/ref_rejected": -97.94729614257812, "logps/rejected": -105.63380432128906, "loss": 1.0991, "margin_dpo/margin_mean": 6.7990193367004395, "margin_dpo/margin_std": 12.262263298034668, "step": 654 }, { "epoch": 0.9901738473167044, "grad_norm": 15.820514678955078, "learning_rate": 1.7131024761923852e-10, "logits/chosen": 1.085647702217102, "logits/rejected": 1.0816160440444946, "logps/chosen": -87.82431030273438, "logps/ref_chosen": -84.88597869873047, "logps/ref_rejected": -79.70976257324219, "logps/rejected": -88.19711303710938, "loss": 0.947, "margin_dpo/margin_mean": 5.549028396606445, "margin_dpo/margin_std": 7.839292526245117, "step": 655 }, { "epoch": 0.9916855631141346, "grad_norm": 17.426054000854492, "learning_rate": 1.2586440420372934e-10, "logits/chosen": 0.9089465737342834, "logits/rejected": 1.1027709245681763, "logps/chosen": -104.47540283203125, "logps/ref_chosen": -106.60414123535156, "logps/ref_rejected": -91.9193115234375, "logps/rejected": -96.89473724365234, "loss": 0.9548, "margin_dpo/margin_mean": 7.104152679443359, "margin_dpo/margin_std": 10.047257423400879, "step": 656 }, { "epoch": 0.9931972789115646, "grad_norm": 18.658771514892578, "learning_rate": 8.740807750345913e-11, "logits/chosen": 1.542173981666565, "logits/rejected": 1.1845550537109375, "logps/chosen": -53.32915496826172, "logps/ref_chosen": -50.90999984741211, "logps/ref_rejected": -102.31036376953125, "logps/rejected": -118.38748168945312, "loss": 0.8496, "margin_dpo/margin_mean": 13.657959938049316, "margin_dpo/margin_std": 11.830829620361328, "step": 657 }, { "epoch": 0.9947089947089947, "grad_norm": 17.80472183227539, "learning_rate": 5.594234322453539e-11, "logits/chosen": 1.7548247575759888, "logits/rejected": 1.6277499198913574, "logps/chosen": -89.88461303710938, "logps/ref_chosen": -85.44100189208984, "logps/ref_rejected": -100.04646301269531, "logps/rejected": -111.47998809814453, "loss": 1.0964, "margin_dpo/margin_mean": 6.989911079406738, "margin_dpo/margin_std": 10.78244686126709, "step": 658 }, { "epoch": 0.9962207105064248, "grad_norm": 19.191669464111328, "learning_rate": 3.146808153123293e-11, "logits/chosen": 1.5873839855194092, "logits/rejected": 1.2246129512786865, "logps/chosen": -49.315711975097656, "logps/ref_chosen": -49.80256652832031, "logps/ref_rejected": -89.57881164550781, "logps/rejected": -97.83163452148438, "loss": 1.2121, "margin_dpo/margin_mean": 8.739677429199219, "margin_dpo/margin_std": 10.020225524902344, "step": 659 }, { "epoch": 0.9977324263038548, "grad_norm": 16.246614456176758, "learning_rate": 1.3985977021235829e-11, "logits/chosen": 1.4449957609176636, "logits/rejected": 1.5105805397033691, "logps/chosen": -77.48287200927734, "logps/ref_chosen": -74.09809875488281, "logps/ref_rejected": -83.458251953125, "logps/rejected": -96.87677001953125, "loss": 0.8144, "margin_dpo/margin_mean": 10.033748626708984, "margin_dpo/margin_std": 12.101351737976074, "step": 660 }, { "epoch": 0.999244142101285, "grad_norm": 18.737720489501953, "learning_rate": 3.4965187065971735e-12, "logits/chosen": 1.4176580905914307, "logits/rejected": 1.4212589263916016, "logps/chosen": -84.48843383789062, "logps/ref_chosen": -78.03362274169922, "logps/ref_rejected": -67.08574676513672, "logps/rejected": -79.0387954711914, "loss": 1.1616, "margin_dpo/margin_mean": 5.498239517211914, "margin_dpo/margin_std": 16.2996883392334, "step": 661 }, { "epoch": 0.999244142101285, "step": 661, "total_flos": 0.0, "train_loss": 1.122965409968516, "train_runtime": 3224.9347, "train_samples_per_second": 13.128, "train_steps_per_second": 0.205 } ], "logging_steps": 1, "max_steps": 661, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }