{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 681, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0014684287812041115, "grad_norm": 83.52447509765625, "learning_rate": 0.0, "logits/chosen": -0.4974287748336792, "logits/rejected": -0.43299180269241333, "logps/chosen": -50.1435661315918, "logps/ref_chosen": -50.14883804321289, "logps/ref_rejected": -74.1280517578125, "logps/rejected": -74.09991455078125, "loss": 1.389, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5005706548690796, "margin_dpo/beta_margin_grad_std": 0.0104739461094141, "margin_dpo/beta_margin_mean": -0.0022870064713060856, "margin_dpo/loss_margin_mean": -0.02287006378173828, "margin_dpo/margin_mean": -0.02287048101425171, "margin_dpo/margin_std": 0.41920793056488037, "step": 1 }, { "epoch": 0.002936857562408223, "grad_norm": 72.19432830810547, "learning_rate": 7.246376811594203e-09, "logits/chosen": -0.4953641891479492, "logits/rejected": -0.4594460129737854, "logps/chosen": -52.65569305419922, "logps/ref_chosen": -52.620704650878906, "logps/ref_rejected": -75.30413818359375, "logps/rejected": -75.27340698242188, "loss": 1.3932, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5016425848007202, "margin_dpo/beta_margin_grad_std": 0.008758805692195892, "margin_dpo/beta_margin_mean": -0.006572261452674866, "margin_dpo/loss_margin_mean": -0.06572261452674866, "margin_dpo/margin_mean": -0.06572240591049194, "margin_dpo/margin_std": 0.35048407316207886, "step": 2 }, { "epoch": 0.004405286343612335, "grad_norm": 70.83383178710938, "learning_rate": 1.4492753623188406e-08, "logits/chosen": -0.48161470890045166, "logits/rejected": -0.44217246770858765, "logps/chosen": -60.95429611206055, "logps/ref_chosen": -60.98159408569336, "logps/ref_rejected": -68.67259216308594, "logps/rejected": -68.64839935302734, "loss": 1.3863, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49992311000823975, "margin_dpo/beta_margin_grad_std": 0.008581075817346573, "margin_dpo/beta_margin_mean": 0.0003100454923696816, "margin_dpo/loss_margin_mean": 0.003100454807281494, "margin_dpo/margin_mean": 0.003100961446762085, "margin_dpo/margin_std": 0.3433571755886078, "step": 3 }, { "epoch": 0.005873715124816446, "grad_norm": 72.25827026367188, "learning_rate": 2.1739130434782606e-08, "logits/chosen": -0.46887677907943726, "logits/rejected": -0.44121015071868896, "logps/chosen": -56.833404541015625, "logps/ref_chosen": -56.76771545410156, "logps/ref_rejected": -86.64710998535156, "logps/rejected": -86.60629272460938, "loss": 1.3973, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5026620626449585, "margin_dpo/beta_margin_grad_std": 0.008479107171297073, "margin_dpo/beta_margin_mean": -0.010650942102074623, "margin_dpo/loss_margin_mean": -0.10650941729545593, "margin_dpo/margin_mean": -0.10650989413261414, "margin_dpo/margin_std": 0.33926206827163696, "step": 4 }, { "epoch": 0.007342143906020558, "grad_norm": 89.21666717529297, "learning_rate": 2.898550724637681e-08, "logits/chosen": -0.5145087242126465, "logits/rejected": -0.4707593023777008, "logps/chosen": -53.772743225097656, "logps/ref_chosen": -53.859375, "logps/ref_rejected": -84.14918518066406, "logps/rejected": -84.13954162597656, "loss": 1.3789, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49807578325271606, "margin_dpo/beta_margin_grad_std": 0.008384998887777328, "margin_dpo/beta_margin_mean": 0.007699114270508289, "margin_dpo/loss_margin_mean": 0.07699114084243774, "margin_dpo/margin_mean": 0.07699081301689148, "margin_dpo/margin_std": 0.3355046510696411, "step": 5 }, { "epoch": 0.00881057268722467, "grad_norm": 92.13448333740234, "learning_rate": 3.6231884057971014e-08, "logits/chosen": -0.5163406729698181, "logits/rejected": -0.475068598985672, "logps/chosen": -63.05199432373047, "logps/ref_chosen": -63.007484436035156, "logps/ref_rejected": -92.64534759521484, "logps/rejected": -92.68731689453125, "loss": 1.3869, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5000631809234619, "margin_dpo/beta_margin_grad_std": 0.008657192811369896, "margin_dpo/beta_margin_mean": -0.00025360879953950644, "margin_dpo/loss_margin_mean": -0.002536088228225708, "margin_dpo/margin_mean": -0.002536386251449585, "margin_dpo/margin_std": 0.3463857173919678, "step": 6 }, { "epoch": 0.010279001468428781, "grad_norm": 82.59510803222656, "learning_rate": 4.347826086956521e-08, "logits/chosen": -0.5038071274757385, "logits/rejected": -0.46995049715042114, "logps/chosen": -57.764461517333984, "logps/ref_chosen": -57.774818420410156, "logps/ref_rejected": -103.92059326171875, "logps/rejected": -103.89596557617188, "loss": 1.3881, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5003570914268494, "margin_dpo/beta_margin_grad_std": 0.009314555674791336, "margin_dpo/beta_margin_mean": -0.0014270306564867496, "margin_dpo/loss_margin_mean": -0.014270305633544922, "margin_dpo/margin_mean": -0.014270126819610596, "margin_dpo/margin_std": 0.37269771099090576, "step": 7 }, { "epoch": 0.011747430249632892, "grad_norm": 78.55260467529297, "learning_rate": 5.0724637681159424e-08, "logits/chosen": -0.5125592350959778, "logits/rejected": -0.48697221279144287, "logps/chosen": -58.67088317871094, "logps/ref_chosen": -58.716033935546875, "logps/ref_rejected": -79.3114242553711, "logps/rejected": -79.30046081542969, "loss": 1.3832, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4991455674171448, "margin_dpo/beta_margin_grad_std": 0.008225222118198872, "margin_dpo/beta_margin_mean": 0.0034186365082859993, "margin_dpo/loss_margin_mean": 0.034186363220214844, "margin_dpo/margin_mean": 0.03418651223182678, "margin_dpo/margin_std": 0.3291283845901489, "step": 8 }, { "epoch": 0.013215859030837005, "grad_norm": 84.95925903320312, "learning_rate": 5.797101449275362e-08, "logits/chosen": -0.518346905708313, "logits/rejected": -0.4730910360813141, "logps/chosen": -69.84893798828125, "logps/ref_chosen": -69.8668441772461, "logps/ref_rejected": -99.6026611328125, "logps/rejected": -99.63265991210938, "loss": 1.3819, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4988027811050415, "margin_dpo/beta_margin_grad_std": 0.010282458737492561, "margin_dpo/beta_margin_mean": 0.004790524020791054, "margin_dpo/loss_margin_mean": 0.04790523648262024, "margin_dpo/margin_mean": 0.04790511727333069, "margin_dpo/margin_std": 0.4114891588687897, "step": 9 }, { "epoch": 0.014684287812041116, "grad_norm": 70.49417877197266, "learning_rate": 6.521739130434782e-08, "logits/chosen": -0.4861105680465698, "logits/rejected": -0.44242680072784424, "logps/chosen": -48.30065155029297, "logps/ref_chosen": -48.35768508911133, "logps/ref_rejected": -80.37206268310547, "logps/rejected": -80.39839172363281, "loss": 1.3783, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4979170560836792, "margin_dpo/beta_margin_grad_std": 0.008738012053072453, "margin_dpo/beta_margin_mean": 0.00833646859973669, "margin_dpo/loss_margin_mean": 0.08336468040943146, "margin_dpo/margin_mean": 0.08336484432220459, "margin_dpo/margin_std": 0.3496713638305664, "step": 10 }, { "epoch": 0.016152716593245228, "grad_norm": 68.25067901611328, "learning_rate": 7.246376811594203e-08, "logits/chosen": -0.4707266092300415, "logits/rejected": -0.4461541175842285, "logps/chosen": -53.01072692871094, "logps/ref_chosen": -53.01685333251953, "logps/ref_rejected": -87.78038024902344, "logps/rejected": -87.81500244140625, "loss": 1.3825, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4989815652370453, "margin_dpo/beta_margin_grad_std": 0.008964480832219124, "margin_dpo/beta_margin_mean": 0.0040746452286839485, "margin_dpo/loss_margin_mean": 0.040746450424194336, "margin_dpo/margin_mean": 0.04074642062187195, "margin_dpo/margin_std": 0.35872533917427063, "step": 11 }, { "epoch": 0.01762114537444934, "grad_norm": 99.98358917236328, "learning_rate": 7.971014492753623e-08, "logits/chosen": -0.5403286814689636, "logits/rejected": -0.5041991472244263, "logps/chosen": -61.795372009277344, "logps/ref_chosen": -61.80543518066406, "logps/ref_rejected": -104.85826873779297, "logps/rejected": -104.8602294921875, "loss": 1.3855, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4997004270553589, "margin_dpo/beta_margin_grad_std": 0.009653432294726372, "margin_dpo/beta_margin_mean": 0.001203133026137948, "margin_dpo/loss_margin_mean": 0.012031331658363342, "margin_dpo/margin_mean": 0.012031003832817078, "margin_dpo/margin_std": 0.3863860070705414, "step": 12 }, { "epoch": 0.01908957415565345, "grad_norm": 79.62843322753906, "learning_rate": 8.695652173913042e-08, "logits/chosen": -0.47281551361083984, "logits/rejected": -0.44416356086730957, "logps/chosen": -64.23121643066406, "logps/ref_chosen": -64.26036071777344, "logps/ref_rejected": -87.20307922363281, "logps/rejected": -87.18215942382812, "loss": 1.3859, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4997946619987488, "margin_dpo/beta_margin_grad_std": 0.009803904220461845, "margin_dpo/beta_margin_mean": 0.0008225085912272334, "margin_dpo/loss_margin_mean": 0.008225083351135254, "margin_dpo/margin_mean": 0.008224427700042725, "margin_dpo/margin_std": 0.39235472679138184, "step": 13 }, { "epoch": 0.020558002936857563, "grad_norm": 85.54085540771484, "learning_rate": 9.420289855072464e-08, "logits/chosen": -0.4834981858730316, "logits/rejected": -0.4443725347518921, "logps/chosen": -58.135520935058594, "logps/ref_chosen": -58.11021423339844, "logps/ref_rejected": -104.04708099365234, "logps/rejected": -104.12353515625, "loss": 1.3816, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49872222542762756, "margin_dpo/beta_margin_grad_std": 0.010205242782831192, "margin_dpo/beta_margin_mean": 0.00511439424008131, "margin_dpo/loss_margin_mean": 0.05114394426345825, "margin_dpo/margin_mean": 0.051144301891326904, "margin_dpo/margin_std": 0.4083808958530426, "step": 14 }, { "epoch": 0.022026431718061675, "grad_norm": 64.28120422363281, "learning_rate": 1.0144927536231885e-07, "logits/chosen": -0.505402147769928, "logits/rejected": -0.4873759150505066, "logps/chosen": -57.00213623046875, "logps/ref_chosen": -56.96691131591797, "logps/ref_rejected": -80.80863952636719, "logps/rejected": -80.82938385009766, "loss": 1.3881, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.5003613233566284, "margin_dpo/beta_margin_grad_std": 0.008744737133383751, "margin_dpo/beta_margin_mean": -0.0014485123101621866, "margin_dpo/loss_margin_mean": -0.01448512077331543, "margin_dpo/margin_mean": -0.01448512077331543, "margin_dpo/margin_std": 0.34991174936294556, "step": 15 }, { "epoch": 0.023494860499265784, "grad_norm": 84.06546020507812, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -0.5580030083656311, "logits/rejected": -0.5204088687896729, "logps/chosen": -61.74095153808594, "logps/ref_chosen": -61.739891052246094, "logps/ref_rejected": -84.36947631835938, "logps/rejected": -84.42204284667969, "loss": 1.3816, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4987128973007202, "margin_dpo/beta_margin_grad_std": 0.009828168898820877, "margin_dpo/beta_margin_mean": 0.005151033401489258, "margin_dpo/loss_margin_mean": 0.05151033401489258, "margin_dpo/margin_mean": 0.051510006189346313, "margin_dpo/margin_std": 0.3933736979961395, "step": 16 }, { "epoch": 0.024963289280469897, "grad_norm": 78.63739013671875, "learning_rate": 1.1594202898550725e-07, "logits/chosen": -0.5074384212493896, "logits/rejected": -0.47103995084762573, "logps/chosen": -67.64342498779297, "logps/ref_chosen": -67.71033477783203, "logps/ref_rejected": -85.37865447998047, "logps/rejected": -85.41255187988281, "loss": 1.3766, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.497480571269989, "margin_dpo/beta_margin_grad_std": 0.00974523089826107, "margin_dpo/beta_margin_mean": 0.010080328211188316, "margin_dpo/loss_margin_mean": 0.10080328583717346, "margin_dpo/margin_mean": 0.10080331563949585, "margin_dpo/margin_std": 0.39002037048339844, "step": 17 }, { "epoch": 0.02643171806167401, "grad_norm": 82.34278869628906, "learning_rate": 1.2318840579710146e-07, "logits/chosen": -0.48814651370048523, "logits/rejected": -0.4320324659347534, "logps/chosen": -47.713279724121094, "logps/ref_chosen": -47.7394905090332, "logps/ref_rejected": -75.4722900390625, "logps/rejected": -75.48577880859375, "loss": 1.3826, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4990071952342987, "margin_dpo/beta_margin_grad_std": 0.008132295683026314, "margin_dpo/beta_margin_mean": 0.003970235586166382, "margin_dpo/loss_margin_mean": 0.03970235586166382, "margin_dpo/margin_mean": 0.03970211744308472, "margin_dpo/margin_std": 0.32538339495658875, "step": 18 }, { "epoch": 0.027900146842878122, "grad_norm": 73.4638900756836, "learning_rate": 1.3043478260869563e-07, "logits/chosen": -0.48973095417022705, "logits/rejected": -0.4396272301673889, "logps/chosen": -70.17350769042969, "logps/ref_chosen": -70.20535278320312, "logps/ref_rejected": -89.75758361816406, "logps/rejected": -89.85565948486328, "loss": 1.3737, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49675452709198, "margin_dpo/beta_margin_grad_std": 0.009917546063661575, "margin_dpo/beta_margin_mean": 0.012991649098694324, "margin_dpo/loss_margin_mean": 0.1299164891242981, "margin_dpo/margin_mean": 0.12991660833358765, "margin_dpo/margin_std": 0.3970108926296234, "step": 19 }, { "epoch": 0.02936857562408223, "grad_norm": 74.19491577148438, "learning_rate": 1.3768115942028986e-07, "logits/chosen": -0.5667568445205688, "logits/rejected": -0.5119162797927856, "logps/chosen": -50.822715759277344, "logps/ref_chosen": -50.80324172973633, "logps/ref_rejected": -78.8233413696289, "logps/rejected": -78.87236785888672, "loss": 1.3836, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4992612898349762, "margin_dpo/beta_margin_grad_std": 0.007525566965341568, "margin_dpo/beta_margin_mean": 0.0029547633603215218, "margin_dpo/loss_margin_mean": 0.02954763174057007, "margin_dpo/margin_mean": 0.029547661542892456, "margin_dpo/margin_std": 0.3011046051979065, "step": 20 }, { "epoch": 0.030837004405286344, "grad_norm": 77.03598022460938, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -0.5037728548049927, "logits/rejected": -0.48049020767211914, "logps/chosen": -50.014862060546875, "logps/ref_chosen": -50.063018798828125, "logps/ref_rejected": -77.86878967285156, "logps/rejected": -78.02366638183594, "loss": 1.3664, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4949270188808441, "margin_dpo/beta_margin_grad_std": 0.008798542432487011, "margin_dpo/beta_margin_mean": 0.020303059369325638, "margin_dpo/loss_margin_mean": 0.20303058624267578, "margin_dpo/margin_mean": 0.20303112268447876, "margin_dpo/margin_std": 0.3521846532821655, "step": 21 }, { "epoch": 0.032305433186490456, "grad_norm": 84.57589721679688, "learning_rate": 1.5217391304347825e-07, "logits/chosen": -0.49148398637771606, "logits/rejected": -0.44818878173828125, "logps/chosen": -58.99713897705078, "logps/ref_chosen": -59.05763626098633, "logps/ref_rejected": -97.50466918945312, "logps/rejected": -97.65492248535156, "loss": 1.3657, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4947338104248047, "margin_dpo/beta_margin_grad_std": 0.009125478565692902, "margin_dpo/beta_margin_mean": 0.021074719727039337, "margin_dpo/loss_margin_mean": 0.21074718236923218, "margin_dpo/margin_mean": 0.210746169090271, "margin_dpo/margin_std": 0.36520200967788696, "step": 22 }, { "epoch": 0.033773861967694566, "grad_norm": 80.40442657470703, "learning_rate": 1.5942028985507245e-07, "logits/chosen": -0.4710449278354645, "logits/rejected": -0.44750112295150757, "logps/chosen": -60.034095764160156, "logps/ref_chosen": -60.07769775390625, "logps/ref_rejected": -81.1395492553711, "logps/rejected": -81.3127212524414, "loss": 1.3652, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49458569288253784, "margin_dpo/beta_margin_grad_std": 0.010714245960116386, "margin_dpo/beta_margin_mean": 0.021676737815141678, "margin_dpo/loss_margin_mean": 0.21676737070083618, "margin_dpo/margin_mean": 0.21676787734031677, "margin_dpo/margin_std": 0.42905572056770325, "step": 23 }, { "epoch": 0.03524229074889868, "grad_norm": 86.11105346679688, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -0.515487790107727, "logits/rejected": -0.49895963072776794, "logps/chosen": -44.28882598876953, "logps/ref_chosen": -44.29103469848633, "logps/ref_rejected": -99.12521362304688, "logps/rejected": -99.3304443359375, "loss": 1.3661, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.49481743574142456, "margin_dpo/beta_margin_grad_std": 0.01028534211218357, "margin_dpo/beta_margin_mean": 0.020744048058986664, "margin_dpo/loss_margin_mean": 0.20744048058986664, "margin_dpo/margin_mean": 0.20744094252586365, "margin_dpo/margin_std": 0.4117741584777832, "step": 24 }, { "epoch": 0.03671071953010279, "grad_norm": 74.07949829101562, "learning_rate": 1.7391304347826085e-07, "logits/chosen": -0.5136522650718689, "logits/rejected": -0.4842616319656372, "logps/chosen": -52.5118408203125, "logps/ref_chosen": -52.537052154541016, "logps/ref_rejected": -89.34219360351562, "logps/rejected": -89.51565551757812, "loss": 1.367, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4950360655784607, "margin_dpo/beta_margin_grad_std": 0.010537989437580109, "margin_dpo/beta_margin_mean": 0.019867265596985817, "margin_dpo/loss_margin_mean": 0.19867265224456787, "margin_dpo/margin_mean": 0.1986721158027649, "margin_dpo/margin_std": 0.4217980206012726, "step": 25 }, { "epoch": 0.0381791483113069, "grad_norm": 87.3241195678711, "learning_rate": 1.8115942028985507e-07, "logits/chosen": -0.5391855239868164, "logits/rejected": -0.5075402855873108, "logps/chosen": -53.83518981933594, "logps/ref_chosen": -53.92280578613281, "logps/ref_rejected": -103.35971069335938, "logps/rejected": -103.70204162597656, "loss": 1.3445, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4892633557319641, "margin_dpo/beta_margin_grad_std": 0.013717170804738998, "margin_dpo/beta_margin_mean": 0.04299398139119148, "margin_dpo/loss_margin_mean": 0.42993980646133423, "margin_dpo/margin_mean": 0.42994067072868347, "margin_dpo/margin_std": 0.5494698286056519, "step": 26 }, { "epoch": 0.039647577092511016, "grad_norm": 93.3059310913086, "learning_rate": 1.8840579710144927e-07, "logits/chosen": -0.5076569318771362, "logits/rejected": -0.47098520398139954, "logps/chosen": -42.758522033691406, "logps/ref_chosen": -42.898529052734375, "logps/ref_rejected": -98.72420501708984, "logps/rejected": -99.09854125976562, "loss": 1.3364, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4871601164340973, "margin_dpo/beta_margin_grad_std": 0.014479693956673145, "margin_dpo/beta_margin_mean": 0.051434241235256195, "margin_dpo/loss_margin_mean": 0.5143424272537231, "margin_dpo/margin_mean": 0.514342188835144, "margin_dpo/margin_std": 0.5809046626091003, "step": 27 }, { "epoch": 0.041116005873715125, "grad_norm": 75.3113784790039, "learning_rate": 1.9565217391304347e-07, "logits/chosen": -0.5132657289505005, "logits/rejected": -0.4586002230644226, "logps/chosen": -60.55534362792969, "logps/ref_chosen": -60.55650329589844, "logps/ref_rejected": -91.40111541748047, "logps/rejected": -91.69779205322266, "loss": 1.3575, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.492561399936676, "margin_dpo/beta_margin_grad_std": 0.013777370564639568, "margin_dpo/beta_margin_mean": 0.029783397912979126, "margin_dpo/loss_margin_mean": 0.29783397912979126, "margin_dpo/margin_mean": 0.29783421754837036, "margin_dpo/margin_std": 0.5516640543937683, "step": 28 }, { "epoch": 0.042584434654919234, "grad_norm": 90.50589752197266, "learning_rate": 2.028985507246377e-07, "logits/chosen": -0.5619853734970093, "logits/rejected": -0.5164209008216858, "logps/chosen": -57.673362731933594, "logps/ref_chosen": -57.80778503417969, "logps/ref_rejected": -97.39434814453125, "logps/rejected": -97.85377502441406, "loss": 1.3285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4851696789264679, "margin_dpo/beta_margin_grad_std": 0.01294540986418724, "margin_dpo/beta_margin_mean": 0.0593840591609478, "margin_dpo/loss_margin_mean": 0.5938405990600586, "margin_dpo/margin_mean": 0.5938413739204407, "margin_dpo/margin_std": 0.5187057256698608, "step": 29 }, { "epoch": 0.04405286343612335, "grad_norm": 87.18180847167969, "learning_rate": 2.1014492753623187e-07, "logits/chosen": -0.5116697549819946, "logits/rejected": -0.4816800057888031, "logps/chosen": -52.425750732421875, "logps/ref_chosen": -52.57737350463867, "logps/ref_rejected": -98.48921203613281, "logps/rejected": -99.05937957763672, "loss": 1.3165, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4819870591163635, "margin_dpo/beta_margin_grad_std": 0.016035309061408043, "margin_dpo/beta_margin_mean": 0.07217944413423538, "margin_dpo/loss_margin_mean": 0.7217944860458374, "margin_dpo/margin_mean": 0.7217941880226135, "margin_dpo/margin_std": 0.6435875296592712, "step": 30 }, { "epoch": 0.04552129221732746, "grad_norm": 67.85016632080078, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -0.5148423910140991, "logits/rejected": -0.4710330367088318, "logps/chosen": -63.68492889404297, "logps/ref_chosen": -63.806922912597656, "logps/ref_rejected": -72.89400482177734, "logps/rejected": -73.29931640625, "loss": 1.3354, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4868418872356415, "margin_dpo/beta_margin_grad_std": 0.016791202127933502, "margin_dpo/beta_margin_mean": 0.052730634808540344, "margin_dpo/loss_margin_mean": 0.527306318283081, "margin_dpo/margin_mean": 0.527306318283081, "margin_dpo/margin_std": 0.6738239526748657, "step": 31 }, { "epoch": 0.04698972099853157, "grad_norm": 81.52291107177734, "learning_rate": 2.2463768115942027e-07, "logits/chosen": -0.5098748207092285, "logits/rejected": -0.46841973066329956, "logps/chosen": -62.55455017089844, "logps/ref_chosen": -62.739524841308594, "logps/ref_rejected": -89.3175048828125, "logps/rejected": -89.87690734863281, "loss": 1.3153, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48145878314971924, "margin_dpo/beta_margin_grad_std": 0.022345291450619698, "margin_dpo/beta_margin_mean": 0.07443846762180328, "margin_dpo/loss_margin_mean": 0.7443846464157104, "margin_dpo/margin_mean": 0.7443850040435791, "margin_dpo/margin_std": 0.9011361598968506, "step": 32 }, { "epoch": 0.048458149779735685, "grad_norm": 72.76732635498047, "learning_rate": 2.318840579710145e-07, "logits/chosen": -0.5056596994400024, "logits/rejected": -0.47998249530792236, "logps/chosen": -53.159671783447266, "logps/ref_chosen": -53.26097106933594, "logps/ref_rejected": -87.8851318359375, "logps/rejected": -88.34671020507812, "loss": 1.3315, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4859437346458435, "margin_dpo/beta_margin_grad_std": 0.013436902314424515, "margin_dpo/beta_margin_mean": 0.056287482380867004, "margin_dpo/loss_margin_mean": 0.5628747940063477, "margin_dpo/margin_mean": 0.5628749132156372, "margin_dpo/margin_std": 0.5385845899581909, "step": 33 }, { "epoch": 0.049926578560939794, "grad_norm": 77.6261978149414, "learning_rate": 2.391304347826087e-07, "logits/chosen": -0.49515533447265625, "logits/rejected": -0.4777703285217285, "logps/chosen": -50.73601531982422, "logps/ref_chosen": -50.81732940673828, "logps/ref_rejected": -101.92184448242188, "logps/rejected": -102.61337280273438, "loss": 1.3124, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.48073524236679077, "margin_dpo/beta_margin_grad_std": 0.02180512621998787, "margin_dpo/beta_margin_mean": 0.07728321105241776, "margin_dpo/loss_margin_mean": 0.7728320360183716, "margin_dpo/margin_mean": 0.7728322744369507, "margin_dpo/margin_std": 0.8760267496109009, "step": 34 }, { "epoch": 0.0513950073421439, "grad_norm": 82.47791290283203, "learning_rate": 2.463768115942029e-07, "logits/chosen": -0.5227484107017517, "logits/rejected": -0.48601728677749634, "logps/chosen": -50.88093948364258, "logps/ref_chosen": -51.02449035644531, "logps/ref_rejected": -106.82443237304688, "logps/rejected": -107.93114471435547, "loss": 1.2685, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4689541757106781, "margin_dpo/beta_margin_grad_std": 0.027965568006038666, "margin_dpo/beta_margin_mean": 0.12502656877040863, "margin_dpo/loss_margin_mean": 1.2502657175064087, "margin_dpo/margin_mean": 1.2502658367156982, "margin_dpo/margin_std": 1.1440428495407104, "step": 35 }, { "epoch": 0.05286343612334802, "grad_norm": 72.95713806152344, "learning_rate": 2.536231884057971e-07, "logits/chosen": -0.563947319984436, "logits/rejected": -0.5279806852340698, "logps/chosen": -51.93867492675781, "logps/ref_chosen": -51.991493225097656, "logps/ref_rejected": -86.04061889648438, "logps/rejected": -87.13275146484375, "loss": 1.2793, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4715506434440613, "margin_dpo/beta_margin_grad_std": 0.031429655849933624, "margin_dpo/beta_margin_mean": 0.11449373513460159, "margin_dpo/loss_margin_mean": 1.14493727684021, "margin_dpo/margin_mean": 1.144936442375183, "margin_dpo/margin_std": 1.2692325115203857, "step": 36 }, { "epoch": 0.05433186490455213, "grad_norm": 61.87527084350586, "learning_rate": 2.6086956521739126e-07, "logits/chosen": -0.5000085234642029, "logits/rejected": -0.4554196000099182, "logps/chosen": -62.77561950683594, "logps/ref_chosen": -62.807106018066406, "logps/ref_rejected": -77.89507293701172, "logps/rejected": -78.88422393798828, "loss": 1.2909, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.47465330362319946, "margin_dpo/beta_margin_grad_std": 0.0310398917645216, "margin_dpo/beta_margin_mean": 0.10206404328346252, "margin_dpo/loss_margin_mean": 1.0206403732299805, "margin_dpo/margin_mean": 1.0206403732299805, "margin_dpo/margin_std": 1.2562531232833862, "step": 37 }, { "epoch": 0.055800293685756244, "grad_norm": 69.35832977294922, "learning_rate": 2.681159420289855e-07, "logits/chosen": -0.5131621360778809, "logits/rejected": -0.4803985357284546, "logps/chosen": -48.25373077392578, "logps/ref_chosen": -48.39051818847656, "logps/ref_rejected": -97.91244506835938, "logps/rejected": -99.13421630859375, "loss": 1.262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.46648678183555603, "margin_dpo/beta_margin_grad_std": 0.0405726283788681, "margin_dpo/beta_margin_mean": 0.13585661351680756, "margin_dpo/loss_margin_mean": 1.3585660457611084, "margin_dpo/margin_mean": 1.3585660457611084, "margin_dpo/margin_std": 1.6711539030075073, "step": 38 }, { "epoch": 0.05726872246696035, "grad_norm": 73.56781768798828, "learning_rate": 2.753623188405797e-07, "logits/chosen": -0.5736282467842102, "logits/rejected": -0.534826934337616, "logps/chosen": -50.66197204589844, "logps/ref_chosen": -50.75046920776367, "logps/ref_rejected": -78.56951141357422, "logps/rejected": -80.15695190429688, "loss": 1.2309, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.458440363407135, "margin_dpo/beta_margin_grad_std": 0.035203345119953156, "margin_dpo/beta_margin_mean": 0.1675935983657837, "margin_dpo/loss_margin_mean": 1.675935983657837, "margin_dpo/margin_mean": 1.6759363412857056, "margin_dpo/margin_std": 1.4285030364990234, "step": 39 }, { "epoch": 0.05873715124816446, "grad_norm": 60.51735305786133, "learning_rate": 2.8260869565217386e-07, "logits/chosen": -0.527452826499939, "logits/rejected": -0.4978986382484436, "logps/chosen": -57.774688720703125, "logps/ref_chosen": -57.985069274902344, "logps/ref_rejected": -74.30007934570312, "logps/rejected": -75.63487243652344, "loss": 1.2454, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.46187901496887207, "margin_dpo/beta_margin_grad_std": 0.041983917355537415, "margin_dpo/beta_margin_mean": 0.15451756119728088, "margin_dpo/loss_margin_mean": 1.5451757907867432, "margin_dpo/margin_mean": 1.5451761484146118, "margin_dpo/margin_std": 1.721125602722168, "step": 40 }, { "epoch": 0.06020558002936858, "grad_norm": 68.02806091308594, "learning_rate": 2.898550724637681e-07, "logits/chosen": -0.522682785987854, "logits/rejected": -0.4852331280708313, "logps/chosen": -62.648956298828125, "logps/ref_chosen": -62.69581604003906, "logps/ref_rejected": -97.02352905273438, "logps/rejected": -98.86910247802734, "loss": 1.216, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4534452557563782, "margin_dpo/beta_margin_grad_std": 0.04771146923303604, "margin_dpo/beta_margin_mean": 0.1892436146736145, "margin_dpo/loss_margin_mean": 1.892436146736145, "margin_dpo/margin_mean": 1.8924363851547241, "margin_dpo/margin_std": 1.9684252738952637, "step": 41 }, { "epoch": 0.06167400881057269, "grad_norm": 79.33026123046875, "learning_rate": 2.971014492753623e-07, "logits/chosen": -0.5209932923316956, "logits/rejected": -0.4742482602596283, "logps/chosen": -58.71235275268555, "logps/ref_chosen": -58.96642303466797, "logps/ref_rejected": -109.90837097167969, "logps/rejected": -112.24879455566406, "loss": 1.1582, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.43676841259002686, "margin_dpo/beta_margin_grad_std": 0.05766534060239792, "margin_dpo/beta_margin_mean": 0.25944995880126953, "margin_dpo/loss_margin_mean": 2.5944995880126953, "margin_dpo/margin_mean": 2.5944998264312744, "margin_dpo/margin_std": 2.435802936553955, "step": 42 }, { "epoch": 0.0631424375917768, "grad_norm": 71.26874542236328, "learning_rate": 3.043478260869565e-07, "logits/chosen": -0.5625420808792114, "logits/rejected": -0.538593590259552, "logps/chosen": -53.63534927368164, "logps/ref_chosen": -54.15599822998047, "logps/ref_rejected": -96.48019409179688, "logps/rejected": -98.510009765625, "loss": 1.1584, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4374551773071289, "margin_dpo/beta_margin_grad_std": 0.05021943897008896, "margin_dpo/beta_margin_mean": 0.2550460994243622, "margin_dpo/loss_margin_mean": 2.5504610538482666, "margin_dpo/margin_mean": 2.5504608154296875, "margin_dpo/margin_std": 2.1022145748138428, "step": 43 }, { "epoch": 0.06461086637298091, "grad_norm": 78.6224136352539, "learning_rate": 3.115942028985507e-07, "logits/chosen": -0.46302688121795654, "logits/rejected": -0.443297415971756, "logps/chosen": -49.88066864013672, "logps/ref_chosen": -50.07849884033203, "logps/ref_rejected": -108.78376007080078, "logps/rejected": -111.412841796875, "loss": 1.1358, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.43084633350372314, "margin_dpo/beta_margin_grad_std": 0.053701795637607574, "margin_dpo/beta_margin_mean": 0.2826906740665436, "margin_dpo/loss_margin_mean": 2.826906681060791, "margin_dpo/margin_mean": 2.826906442642212, "margin_dpo/margin_std": 2.2519941329956055, "step": 44 }, { "epoch": 0.06607929515418502, "grad_norm": 61.787879943847656, "learning_rate": 3.188405797101449e-07, "logits/chosen": -0.4846153259277344, "logits/rejected": -0.47198039293289185, "logps/chosen": -48.231903076171875, "logps/ref_chosen": -48.41493225097656, "logps/ref_rejected": -77.93643188476562, "logps/rejected": -80.11711883544922, "loss": 1.181, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4426528513431549, "margin_dpo/beta_margin_grad_std": 0.061857253313064575, "margin_dpo/beta_margin_mean": 0.23637181520462036, "margin_dpo/loss_margin_mean": 2.363718032836914, "margin_dpo/margin_mean": 2.363717555999756, "margin_dpo/margin_std": 2.6244254112243652, "step": 45 }, { "epoch": 0.06754772393538913, "grad_norm": 69.09931945800781, "learning_rate": 3.260869565217391e-07, "logits/chosen": -0.5141834020614624, "logits/rejected": -0.4625147581100464, "logps/chosen": -55.74999237060547, "logps/ref_chosen": -55.999427795410156, "logps/ref_rejected": -95.652587890625, "logps/rejected": -98.34117126464844, "loss": 1.1376, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4294002056121826, "margin_dpo/beta_margin_grad_std": 0.07263202965259552, "margin_dpo/beta_margin_mean": 0.29380178451538086, "margin_dpo/loss_margin_mean": 2.9380178451538086, "margin_dpo/margin_mean": 2.9380173683166504, "margin_dpo/margin_std": 3.154534339904785, "step": 46 }, { "epoch": 0.06901615271659324, "grad_norm": 65.72870635986328, "learning_rate": 3.333333333333333e-07, "logits/chosen": -0.58119797706604, "logits/rejected": -0.5290583372116089, "logps/chosen": -57.503753662109375, "logps/ref_chosen": -57.92607879638672, "logps/ref_rejected": -94.67920684814453, "logps/rejected": -97.23452758789062, "loss": 1.1285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4276430606842041, "margin_dpo/beta_margin_grad_std": 0.0631469339132309, "margin_dpo/beta_margin_mean": 0.29776421189308167, "margin_dpo/loss_margin_mean": 2.977642059326172, "margin_dpo/margin_mean": 2.977642297744751, "margin_dpo/margin_std": 2.6595559120178223, "step": 47 }, { "epoch": 0.07048458149779736, "grad_norm": 72.27952575683594, "learning_rate": 3.4057971014492755e-07, "logits/chosen": -0.5920270681381226, "logits/rejected": -0.5339563488960266, "logps/chosen": -57.16640853881836, "logps/ref_chosen": -57.188072204589844, "logps/ref_rejected": -88.0166015625, "logps/rejected": -91.12606048583984, "loss": 1.1231, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4243152141571045, "margin_dpo/beta_margin_grad_std": 0.07117132842540741, "margin_dpo/beta_margin_mean": 0.3131124675273895, "margin_dpo/loss_margin_mean": 3.131124496459961, "margin_dpo/margin_mean": 3.131124496459961, "margin_dpo/margin_std": 3.016913890838623, "step": 48 }, { "epoch": 0.07195301027900147, "grad_norm": 63.71873092651367, "learning_rate": 3.478260869565217e-07, "logits/chosen": -0.5536686182022095, "logits/rejected": -0.49566274881362915, "logps/chosen": -61.38921356201172, "logps/ref_chosen": -61.685264587402344, "logps/ref_rejected": -83.76747131347656, "logps/rejected": -87.34431457519531, "loss": 1.074, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.40908271074295044, "margin_dpo/beta_margin_grad_std": 0.08748139441013336, "margin_dpo/beta_margin_mean": 0.387288898229599, "margin_dpo/loss_margin_mean": 3.8728890419006348, "margin_dpo/margin_mean": 3.8728885650634766, "margin_dpo/margin_std": 3.9563791751861572, "step": 49 }, { "epoch": 0.07342143906020558, "grad_norm": 62.7824592590332, "learning_rate": 3.5507246376811595e-07, "logits/chosen": -0.5670984387397766, "logits/rejected": -0.5319196581840515, "logps/chosen": -58.91963195800781, "logps/ref_chosen": -58.72413635253906, "logps/ref_rejected": -96.35814666748047, "logps/rejected": -100.67803955078125, "loss": 1.0538, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.40261325240135193, "margin_dpo/beta_margin_grad_std": 0.09164208173751831, "margin_dpo/beta_margin_mean": 0.4124397039413452, "margin_dpo/loss_margin_mean": 4.124396800994873, "margin_dpo/margin_mean": 4.124396800994873, "margin_dpo/margin_std": 4.026268005371094, "step": 50 }, { "epoch": 0.07488986784140969, "grad_norm": 52.12064743041992, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -0.5369248390197754, "logits/rejected": -0.5046299695968628, "logps/chosen": -61.671791076660156, "logps/ref_chosen": -61.3736686706543, "logps/ref_rejected": -76.00199890136719, "logps/rejected": -80.37809753417969, "loss": 1.0821, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.4077322781085968, "margin_dpo/beta_margin_grad_std": 0.11022845655679703, "margin_dpo/beta_margin_mean": 0.4077974557876587, "margin_dpo/loss_margin_mean": 4.077974796295166, "margin_dpo/margin_mean": 4.077974319458008, "margin_dpo/margin_std": 5.209657669067383, "step": 51 }, { "epoch": 0.0763582966226138, "grad_norm": 58.820133209228516, "learning_rate": 3.695652173913043e-07, "logits/chosen": -0.5732629299163818, "logits/rejected": -0.5190708637237549, "logps/chosen": -51.953399658203125, "logps/ref_chosen": -52.33735656738281, "logps/ref_rejected": -79.97391510009766, "logps/rejected": -85.85843658447266, "loss": 0.9142, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3579610586166382, "margin_dpo/beta_margin_grad_std": 0.10757434368133545, "margin_dpo/beta_margin_mean": 0.6268481016159058, "margin_dpo/loss_margin_mean": 6.26848030090332, "margin_dpo/margin_mean": 6.2684807777404785, "margin_dpo/margin_std": 5.199737548828125, "step": 52 }, { "epoch": 0.07782672540381791, "grad_norm": 57.97807693481445, "learning_rate": 3.7681159420289855e-07, "logits/chosen": -0.618739128112793, "logits/rejected": -0.5973125100135803, "logps/chosen": -53.48461151123047, "logps/ref_chosen": -53.31465530395508, "logps/ref_rejected": -91.7835922241211, "logps/rejected": -98.30101013183594, "loss": 0.9439, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.36459940671920776, "margin_dpo/beta_margin_grad_std": 0.11843107640743256, "margin_dpo/beta_margin_mean": 0.6347463130950928, "margin_dpo/loss_margin_mean": 6.347463130950928, "margin_dpo/margin_mean": 6.347464084625244, "margin_dpo/margin_std": 6.299587726593018, "step": 53 }, { "epoch": 0.07929515418502203, "grad_norm": 58.53452682495117, "learning_rate": 3.8405797101449274e-07, "logits/chosen": -0.6002248525619507, "logits/rejected": -0.5472081303596497, "logps/chosen": -51.132781982421875, "logps/ref_chosen": -50.68865966796875, "logps/ref_rejected": -91.71539306640625, "logps/rejected": -97.54826354980469, "loss": 0.9754, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.37753698229789734, "margin_dpo/beta_margin_grad_std": 0.10520176589488983, "margin_dpo/beta_margin_mean": 0.5388752818107605, "margin_dpo/loss_margin_mean": 5.3887529373168945, "margin_dpo/margin_mean": 5.3887529373168945, "margin_dpo/margin_std": 5.09660530090332, "step": 54 }, { "epoch": 0.08076358296622614, "grad_norm": 53.50847244262695, "learning_rate": 3.9130434782608694e-07, "logits/chosen": -0.6379266977310181, "logits/rejected": -0.5748265981674194, "logps/chosen": -63.582801818847656, "logps/ref_chosen": -62.615234375, "logps/ref_rejected": -88.99349975585938, "logps/rejected": -96.49349212646484, "loss": 0.9552, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.36137956380844116, "margin_dpo/beta_margin_grad_std": 0.1454969346523285, "margin_dpo/beta_margin_mean": 0.6532418727874756, "margin_dpo/loss_margin_mean": 6.532418727874756, "margin_dpo/margin_mean": 6.532418251037598, "margin_dpo/margin_std": 7.533010482788086, "step": 55 }, { "epoch": 0.08223201174743025, "grad_norm": 48.04698944091797, "learning_rate": 3.9855072463768114e-07, "logits/chosen": -0.6322102546691895, "logits/rejected": -0.5908021330833435, "logps/chosen": -58.65277862548828, "logps/ref_chosen": -57.93273162841797, "logps/ref_rejected": -94.1744384765625, "logps/rejected": -101.14324951171875, "loss": 0.9724, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3670324981212616, "margin_dpo/beta_margin_grad_std": 0.14493967592716217, "margin_dpo/beta_margin_mean": 0.6248764991760254, "margin_dpo/loss_margin_mean": 6.248764991760254, "margin_dpo/margin_mean": 6.248764991760254, "margin_dpo/margin_std": 7.392797470092773, "step": 56 }, { "epoch": 0.08370044052863436, "grad_norm": 53.7747688293457, "learning_rate": 4.057971014492754e-07, "logits/chosen": -0.5740865468978882, "logits/rejected": -0.5456082820892334, "logps/chosen": -71.21261596679688, "logps/ref_chosen": -70.49528503417969, "logps/ref_rejected": -95.56546020507812, "logps/rejected": -103.3371353149414, "loss": 0.8958, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3454797565937042, "margin_dpo/beta_margin_grad_std": 0.1326553225517273, "margin_dpo/beta_margin_mean": 0.7054347991943359, "margin_dpo/loss_margin_mean": 7.054348468780518, "margin_dpo/margin_mean": 7.054348945617676, "margin_dpo/margin_std": 6.582326889038086, "step": 57 }, { "epoch": 0.08516886930983847, "grad_norm": 58.93936538696289, "learning_rate": 4.1304347826086954e-07, "logits/chosen": -0.6123115420341492, "logits/rejected": -0.5382078886032104, "logps/chosen": -63.20277786254883, "logps/ref_chosen": -62.13294219970703, "logps/ref_rejected": -84.61729431152344, "logps/rejected": -93.39222717285156, "loss": 0.8958, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.34207093715667725, "margin_dpo/beta_margin_grad_std": 0.15293042361736298, "margin_dpo/beta_margin_mean": 0.7705095410346985, "margin_dpo/loss_margin_mean": 7.7050957679748535, "margin_dpo/margin_mean": 7.705096244812012, "margin_dpo/margin_std": 8.273210525512695, "step": 58 }, { "epoch": 0.08663729809104258, "grad_norm": 55.383514404296875, "learning_rate": 4.2028985507246374e-07, "logits/chosen": -0.6397849321365356, "logits/rejected": -0.6004974842071533, "logps/chosen": -53.41858673095703, "logps/ref_chosen": -51.932525634765625, "logps/ref_rejected": -88.88520050048828, "logps/rejected": -98.85914611816406, "loss": 0.857, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.32870835065841675, "margin_dpo/beta_margin_grad_std": 0.15062686800956726, "margin_dpo/beta_margin_mean": 0.8487890958786011, "margin_dpo/loss_margin_mean": 8.48789119720459, "margin_dpo/margin_mean": 8.487890243530273, "margin_dpo/margin_std": 8.594100952148438, "step": 59 }, { "epoch": 0.0881057268722467, "grad_norm": 63.981693267822266, "learning_rate": 4.2753623188405794e-07, "logits/chosen": -0.6227731704711914, "logits/rejected": -0.5636199712753296, "logps/chosen": -63.575462341308594, "logps/ref_chosen": -60.94218444824219, "logps/ref_rejected": -85.39340209960938, "logps/rejected": -94.78590393066406, "loss": 0.9511, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.35332685708999634, "margin_dpo/beta_margin_grad_std": 0.1562734991312027, "margin_dpo/beta_margin_mean": 0.6759233474731445, "margin_dpo/loss_margin_mean": 6.759233474731445, "margin_dpo/margin_mean": 6.759233474731445, "margin_dpo/margin_std": 7.703272819519043, "step": 60 }, { "epoch": 0.08957415565345081, "grad_norm": 54.136070251464844, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.589980959892273, "logits/rejected": -0.5553174018859863, "logps/chosen": -62.088226318359375, "logps/ref_chosen": -60.633522033691406, "logps/ref_rejected": -89.85249328613281, "logps/rejected": -99.72574615478516, "loss": 0.9274, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.34447312355041504, "margin_dpo/beta_margin_grad_std": 0.17768457531929016, "margin_dpo/beta_margin_mean": 0.8418547511100769, "margin_dpo/loss_margin_mean": 8.418547630310059, "margin_dpo/margin_mean": 8.418546676635742, "margin_dpo/margin_std": 11.459321975708008, "step": 61 }, { "epoch": 0.09104258443465492, "grad_norm": 56.41756057739258, "learning_rate": 4.420289855072464e-07, "logits/chosen": -0.6043162941932678, "logits/rejected": -0.5698095560073853, "logps/chosen": -57.790740966796875, "logps/ref_chosen": -56.15077209472656, "logps/ref_rejected": -75.56619262695312, "logps/rejected": -83.44951629638672, "loss": 0.9972, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.37032824754714966, "margin_dpo/beta_margin_grad_std": 0.15750956535339355, "margin_dpo/beta_margin_mean": 0.624334990978241, "margin_dpo/loss_margin_mean": 6.243350028991699, "margin_dpo/margin_mean": 6.243350028991699, "margin_dpo/margin_std": 8.166690826416016, "step": 62 }, { "epoch": 0.09251101321585903, "grad_norm": 56.643470764160156, "learning_rate": 4.4927536231884053e-07, "logits/chosen": -0.5830048322677612, "logits/rejected": -0.5372258424758911, "logps/chosen": -75.84552001953125, "logps/ref_chosen": -73.14739227294922, "logps/ref_rejected": -97.61006164550781, "logps/rejected": -108.71710205078125, "loss": 0.8745, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3277238607406616, "margin_dpo/beta_margin_grad_std": 0.1626659482717514, "margin_dpo/beta_margin_mean": 0.84089195728302, "margin_dpo/loss_margin_mean": 8.408918380737305, "margin_dpo/margin_mean": 8.408919334411621, "margin_dpo/margin_std": 8.86873722076416, "step": 63 }, { "epoch": 0.09397944199706314, "grad_norm": 51.555030822753906, "learning_rate": 4.5652173913043473e-07, "logits/chosen": -0.6049680113792419, "logits/rejected": -0.5739491581916809, "logps/chosen": -54.96660232543945, "logps/ref_chosen": -53.99859619140625, "logps/ref_rejected": -93.53020477294922, "logps/rejected": -104.44441223144531, "loss": 0.8416, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.31263962388038635, "margin_dpo/beta_margin_grad_std": 0.1735057830810547, "margin_dpo/beta_margin_mean": 0.994620680809021, "margin_dpo/loss_margin_mean": 9.946207046508789, "margin_dpo/margin_mean": 9.946207046508789, "margin_dpo/margin_std": 11.080026626586914, "step": 64 }, { "epoch": 0.09544787077826726, "grad_norm": 54.08346939086914, "learning_rate": 4.63768115942029e-07, "logits/chosen": -0.6963008642196655, "logits/rejected": -0.6837696433067322, "logps/chosen": -68.06695556640625, "logps/ref_chosen": -64.83599853515625, "logps/ref_rejected": -109.94645690917969, "logps/rejected": -123.10871124267578, "loss": 0.8597, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.31020408868789673, "margin_dpo/beta_margin_grad_std": 0.18927563726902008, "margin_dpo/beta_margin_mean": 0.9931299686431885, "margin_dpo/loss_margin_mean": 9.931299209594727, "margin_dpo/margin_mean": 9.93129825592041, "margin_dpo/margin_std": 11.138134002685547, "step": 65 }, { "epoch": 0.09691629955947137, "grad_norm": 52.64336013793945, "learning_rate": 4.7101449275362313e-07, "logits/chosen": -0.6431201100349426, "logits/rejected": -0.6103649139404297, "logps/chosen": -54.33839797973633, "logps/ref_chosen": -51.44352722167969, "logps/ref_rejected": -75.63629150390625, "logps/rejected": -87.60906219482422, "loss": 0.8852, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3300383687019348, "margin_dpo/beta_margin_grad_std": 0.17383669316768646, "margin_dpo/beta_margin_mean": 0.9077892303466797, "margin_dpo/loss_margin_mean": 9.077892303466797, "margin_dpo/margin_mean": 9.07789134979248, "margin_dpo/margin_std": 11.045241355895996, "step": 66 }, { "epoch": 0.09838472834067548, "grad_norm": 53.70967102050781, "learning_rate": 4.782608695652174e-07, "logits/chosen": -0.6037384271621704, "logits/rejected": -0.56143718957901, "logps/chosen": -61.83789825439453, "logps/ref_chosen": -59.34080505371094, "logps/ref_rejected": -72.78729248046875, "logps/rejected": -84.55035400390625, "loss": 0.8693, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3258131444454193, "margin_dpo/beta_margin_grad_std": 0.1767134666442871, "margin_dpo/beta_margin_mean": 0.9265965223312378, "margin_dpo/loss_margin_mean": 9.265965461730957, "margin_dpo/margin_mean": 9.26596450805664, "margin_dpo/margin_std": 10.946893692016602, "step": 67 }, { "epoch": 0.09985315712187959, "grad_norm": 51.866180419921875, "learning_rate": 4.855072463768116e-07, "logits/chosen": -0.6399117708206177, "logits/rejected": -0.5805681347846985, "logps/chosen": -68.01475524902344, "logps/ref_chosen": -65.2058334350586, "logps/ref_rejected": -77.20724487304688, "logps/rejected": -88.76637268066406, "loss": 0.8436, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3251006603240967, "margin_dpo/beta_margin_grad_std": 0.1536335051059723, "margin_dpo/beta_margin_mean": 0.875019907951355, "margin_dpo/loss_margin_mean": 8.750198364257812, "margin_dpo/margin_mean": 8.750198364257812, "margin_dpo/margin_std": 8.96760082244873, "step": 68 }, { "epoch": 0.1013215859030837, "grad_norm": 52.978328704833984, "learning_rate": 4.927536231884058e-07, "logits/chosen": -0.619906485080719, "logits/rejected": -0.5960003137588501, "logps/chosen": -63.03958511352539, "logps/ref_chosen": -59.81924057006836, "logps/ref_rejected": -103.38886260986328, "logps/rejected": -117.06353759765625, "loss": 0.7728, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30147287249565125, "margin_dpo/beta_margin_grad_std": 0.15879851579666138, "margin_dpo/beta_margin_mean": 1.0454329252243042, "margin_dpo/loss_margin_mean": 10.454328536987305, "margin_dpo/margin_mean": 10.454329490661621, "margin_dpo/margin_std": 10.263179779052734, "step": 69 }, { "epoch": 0.1027900146842878, "grad_norm": 58.304927825927734, "learning_rate": 5e-07, "logits/chosen": -0.6229462623596191, "logits/rejected": -0.5881924629211426, "logps/chosen": -66.45687103271484, "logps/ref_chosen": -61.930641174316406, "logps/ref_rejected": -91.060791015625, "logps/rejected": -106.82664489746094, "loss": 0.7921, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2991790771484375, "margin_dpo/beta_margin_grad_std": 0.1847190260887146, "margin_dpo/beta_margin_mean": 1.123962163925171, "margin_dpo/loss_margin_mean": 11.239620208740234, "margin_dpo/margin_mean": 11.239620208740234, "margin_dpo/margin_std": 11.950462341308594, "step": 70 }, { "epoch": 0.10425844346549193, "grad_norm": 50.17360305786133, "learning_rate": 4.999967061337492e-07, "logits/chosen": -0.6788771152496338, "logits/rejected": -0.6398866772651672, "logps/chosen": -65.67703247070312, "logps/ref_chosen": -61.750343322753906, "logps/ref_rejected": -97.33662414550781, "logps/rejected": -114.21321105957031, "loss": 0.6993, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2723275125026703, "margin_dpo/beta_margin_grad_std": 0.1644226610660553, "margin_dpo/beta_margin_mean": 1.2949903011322021, "margin_dpo/loss_margin_mean": 12.949902534484863, "margin_dpo/margin_mean": 12.949902534484863, "margin_dpo/margin_std": 12.493947982788086, "step": 71 }, { "epoch": 0.10572687224669604, "grad_norm": 59.9242057800293, "learning_rate": 4.999868246217933e-07, "logits/chosen": -0.6571969985961914, "logits/rejected": -0.6217666864395142, "logps/chosen": -70.40309143066406, "logps/ref_chosen": -66.05341339111328, "logps/ref_rejected": -95.2869873046875, "logps/rejected": -113.08145141601562, "loss": 0.7306, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26945406198501587, "margin_dpo/beta_margin_grad_std": 0.1959770768880844, "margin_dpo/beta_margin_mean": 1.3444783687591553, "margin_dpo/loss_margin_mean": 13.444782257080078, "margin_dpo/margin_mean": 13.444782257080078, "margin_dpo/margin_std": 13.743330955505371, "step": 72 }, { "epoch": 0.10719530102790015, "grad_norm": 75.93876647949219, "learning_rate": 4.999703557245192e-07, "logits/chosen": -0.6721267104148865, "logits/rejected": -0.6297430992126465, "logps/chosen": -72.05320739746094, "logps/ref_chosen": -66.25627136230469, "logps/ref_rejected": -90.45613861083984, "logps/rejected": -109.47367858886719, "loss": 0.9481, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3085705637931824, "margin_dpo/beta_margin_grad_std": 0.24887485802173615, "margin_dpo/beta_margin_mean": 1.3220614194869995, "margin_dpo/loss_margin_mean": 13.220613479614258, "margin_dpo/margin_mean": 13.220613479614258, "margin_dpo/margin_std": 18.805517196655273, "step": 73 }, { "epoch": 0.10866372980910426, "grad_norm": 73.18781280517578, "learning_rate": 4.999472998758977e-07, "logits/chosen": -0.6080462336540222, "logits/rejected": -0.5960662364959717, "logps/chosen": -59.563087463378906, "logps/ref_chosen": -53.42488098144531, "logps/ref_rejected": -95.94693756103516, "logps/rejected": -115.85839080810547, "loss": 0.8756, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2891170084476471, "margin_dpo/beta_margin_grad_std": 0.21858373284339905, "margin_dpo/beta_margin_mean": 1.3773247003555298, "margin_dpo/loss_margin_mean": 13.773246765136719, "margin_dpo/margin_mean": 13.773246765136719, "margin_dpo/margin_std": 20.172929763793945, "step": 74 }, { "epoch": 0.11013215859030837, "grad_norm": 50.57677459716797, "learning_rate": 4.999176576834721e-07, "logits/chosen": -0.6782846450805664, "logits/rejected": -0.6683961153030396, "logps/chosen": -57.562652587890625, "logps/ref_chosen": -51.861663818359375, "logps/ref_rejected": -111.25397491455078, "logps/rejected": -136.24032592773438, "loss": 0.6095, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2280743420124054, "margin_dpo/beta_margin_grad_std": 0.19971585273742676, "margin_dpo/beta_margin_mean": 1.9285348653793335, "margin_dpo/loss_margin_mean": 19.285348892211914, "margin_dpo/margin_mean": 19.28534698486328, "margin_dpo/margin_std": 18.741535186767578, "step": 75 }, { "epoch": 0.11160058737151249, "grad_norm": 64.94268035888672, "learning_rate": 4.998814299283415e-07, "logits/chosen": -0.7133210301399231, "logits/rejected": -0.6718661785125732, "logps/chosen": -59.98701095581055, "logps/ref_chosen": -53.26604080200195, "logps/ref_rejected": -78.21662139892578, "logps/rejected": -97.29232788085938, "loss": 0.8158, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28044480085372925, "margin_dpo/beta_margin_grad_std": 0.20121756196022034, "margin_dpo/beta_margin_mean": 1.235473871231079, "margin_dpo/loss_margin_mean": 12.354738235473633, "margin_dpo/margin_mean": 12.354738235473633, "margin_dpo/margin_std": 14.27847671508789, "step": 76 }, { "epoch": 0.1130690161527166, "grad_norm": 78.29557037353516, "learning_rate": 4.998386175651409e-07, "logits/chosen": -0.6657835245132446, "logits/rejected": -0.623427152633667, "logps/chosen": -63.632198333740234, "logps/ref_chosen": -58.0966796875, "logps/ref_rejected": -93.77361297607422, "logps/rejected": -118.64299774169922, "loss": 0.6806, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2214127629995346, "margin_dpo/beta_margin_grad_std": 0.2203107476234436, "margin_dpo/beta_margin_mean": 1.9333863258361816, "margin_dpo/loss_margin_mean": 19.3338623046875, "margin_dpo/margin_mean": 19.333864212036133, "margin_dpo/margin_std": 19.132383346557617, "step": 77 }, { "epoch": 0.1145374449339207, "grad_norm": 66.0775146484375, "learning_rate": 4.997892217220159e-07, "logits/chosen": -0.6555283069610596, "logits/rejected": -0.6280935406684875, "logps/chosen": -60.86896896362305, "logps/ref_chosen": -55.61378479003906, "logps/ref_rejected": -84.93436431884766, "logps/rejected": -105.0614013671875, "loss": 0.7256, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2643897533416748, "margin_dpo/beta_margin_grad_std": 0.20472703874111176, "margin_dpo/beta_margin_mean": 1.4871852397918701, "margin_dpo/loss_margin_mean": 14.871850967407227, "margin_dpo/margin_mean": 14.871851921081543, "margin_dpo/margin_std": 15.568973541259766, "step": 78 }, { "epoch": 0.11600587371512482, "grad_norm": 58.39738845825195, "learning_rate": 4.997332437005931e-07, "logits/chosen": -0.6440068483352661, "logits/rejected": -0.6116843819618225, "logps/chosen": -60.54931640625, "logps/ref_chosen": -55.45048522949219, "logps/ref_rejected": -87.64756774902344, "logps/rejected": -108.8857192993164, "loss": 0.777, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27868783473968506, "margin_dpo/beta_margin_grad_std": 0.22643005847930908, "margin_dpo/beta_margin_mean": 1.6139320135116577, "margin_dpo/loss_margin_mean": 16.139320373535156, "margin_dpo/margin_mean": 16.139320373535156, "margin_dpo/margin_std": 18.9587459564209, "step": 79 }, { "epoch": 0.11747430249632893, "grad_norm": 62.89072036743164, "learning_rate": 4.996706849759452e-07, "logits/chosen": -0.7126628160476685, "logits/rejected": -0.6654119491577148, "logps/chosen": -65.43215942382812, "logps/ref_chosen": -58.519290924072266, "logps/ref_rejected": -87.54750061035156, "logps/rejected": -108.79297637939453, "loss": 0.8315, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.29197126626968384, "margin_dpo/beta_margin_grad_std": 0.22850532829761505, "margin_dpo/beta_margin_mean": 1.4332611560821533, "margin_dpo/loss_margin_mean": 14.332611083984375, "margin_dpo/margin_mean": 14.332611083984375, "margin_dpo/margin_std": 17.499080657958984, "step": 80 }, { "epoch": 0.11894273127753303, "grad_norm": 72.54817199707031, "learning_rate": 4.996015471965529e-07, "logits/chosen": -0.7246617674827576, "logits/rejected": -0.6918442249298096, "logps/chosen": -72.08871459960938, "logps/ref_chosen": -66.44886779785156, "logps/ref_rejected": -129.66270446777344, "logps/rejected": -154.00892639160156, "loss": 0.6947, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24446864426136017, "margin_dpo/beta_margin_grad_std": 0.22534911334514618, "margin_dpo/beta_margin_mean": 1.8706377744674683, "margin_dpo/loss_margin_mean": 18.706378936767578, "margin_dpo/margin_mean": 18.706378936767578, "margin_dpo/margin_std": 20.739093780517578, "step": 81 }, { "epoch": 0.12041116005873716, "grad_norm": 88.24505615234375, "learning_rate": 4.995258321842611e-07, "logits/chosen": -0.6451495885848999, "logits/rejected": -0.6281242370605469, "logps/chosen": -59.3546142578125, "logps/ref_chosen": -52.232383728027344, "logps/ref_rejected": -90.74325561523438, "logps/rejected": -113.15169525146484, "loss": 0.9502, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28069868683815, "margin_dpo/beta_margin_grad_std": 0.24664762616157532, "margin_dpo/beta_margin_mean": 1.5286219120025635, "margin_dpo/loss_margin_mean": 15.286218643188477, "margin_dpo/margin_mean": 15.286218643188477, "margin_dpo/margin_std": 21.404075622558594, "step": 82 }, { "epoch": 0.12187958883994127, "grad_norm": 70.73540496826172, "learning_rate": 4.994435419342304e-07, "logits/chosen": -0.6840830445289612, "logits/rejected": -0.6387213468551636, "logps/chosen": -62.84056854248047, "logps/ref_chosen": -55.82738494873047, "logps/ref_rejected": -103.71590423583984, "logps/rejected": -127.5250015258789, "loss": 0.7455, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25954824686050415, "margin_dpo/beta_margin_grad_std": 0.22985972464084625, "margin_dpo/beta_margin_mean": 1.679591417312622, "margin_dpo/loss_margin_mean": 16.795913696289062, "margin_dpo/margin_mean": 16.795913696289062, "margin_dpo/margin_std": 18.532222747802734, "step": 83 }, { "epoch": 0.12334801762114538, "grad_norm": 58.62339401245117, "learning_rate": 4.993546786148857e-07, "logits/chosen": -0.6582399606704712, "logits/rejected": -0.6212340593338013, "logps/chosen": -72.36793518066406, "logps/ref_chosen": -67.1761703491211, "logps/ref_rejected": -87.29859924316406, "logps/rejected": -107.6661605834961, "loss": 0.6762, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24515578150749207, "margin_dpo/beta_margin_grad_std": 0.18955430388450623, "margin_dpo/beta_margin_mean": 1.5175797939300537, "margin_dpo/loss_margin_mean": 15.175796508789062, "margin_dpo/margin_mean": 15.175797462463379, "margin_dpo/margin_std": 13.974632263183594, "step": 84 }, { "epoch": 0.12481644640234948, "grad_norm": 65.68191528320312, "learning_rate": 4.992592445678582e-07, "logits/chosen": -0.6145851016044617, "logits/rejected": -0.5817907452583313, "logps/chosen": -64.20103454589844, "logps/ref_chosen": -58.406620025634766, "logps/ref_rejected": -78.63880157470703, "logps/rejected": -99.02200317382812, "loss": 0.7679, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27928727865219116, "margin_dpo/beta_margin_grad_std": 0.21024659276008606, "margin_dpo/beta_margin_mean": 1.458878993988037, "margin_dpo/loss_margin_mean": 14.588789939880371, "margin_dpo/margin_mean": 14.588790893554688, "margin_dpo/margin_std": 15.866073608398438, "step": 85 }, { "epoch": 0.1262848751835536, "grad_norm": 85.21753692626953, "learning_rate": 4.991572423079235e-07, "logits/chosen": -0.6728634238243103, "logits/rejected": -0.6558930277824402, "logps/chosen": -63.18061828613281, "logps/ref_chosen": -56.13746643066406, "logps/ref_rejected": -88.12165069580078, "logps/rejected": -110.33665466308594, "loss": 0.9179, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.29970669746398926, "margin_dpo/beta_margin_grad_std": 0.24453628063201904, "margin_dpo/beta_margin_mean": 1.5171852111816406, "margin_dpo/loss_margin_mean": 15.17185115814209, "margin_dpo/margin_mean": 15.171852111816406, "margin_dpo/margin_std": 21.81802749633789, "step": 86 }, { "epoch": 0.1277533039647577, "grad_norm": 66.58555603027344, "learning_rate": 4.990486745229364e-07, "logits/chosen": -0.7240000367164612, "logits/rejected": -0.6876901984214783, "logps/chosen": -62.49974060058594, "logps/ref_chosen": -55.63609313964844, "logps/ref_rejected": -95.46757507324219, "logps/rejected": -118.70195007324219, "loss": 0.7934, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2582399249076843, "margin_dpo/beta_margin_grad_std": 0.22514671087265015, "margin_dpo/beta_margin_mean": 1.6370728015899658, "margin_dpo/loss_margin_mean": 16.3707275390625, "margin_dpo/margin_mean": 16.3707275390625, "margin_dpo/margin_std": 19.043777465820312, "step": 87 }, { "epoch": 0.12922173274596183, "grad_norm": 75.37992095947266, "learning_rate": 4.989335440737586e-07, "logits/chosen": -0.6701527237892151, "logits/rejected": -0.6537374258041382, "logps/chosen": -82.11605072021484, "logps/ref_chosen": -73.67115020751953, "logps/ref_rejected": -106.70849609375, "logps/rejected": -127.71624755859375, "loss": 0.9174, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30198073387145996, "margin_dpo/beta_margin_grad_std": 0.23300787806510925, "margin_dpo/beta_margin_mean": 1.2562841176986694, "margin_dpo/loss_margin_mean": 12.562840461730957, "margin_dpo/margin_mean": 12.562841415405273, "margin_dpo/margin_std": 15.86634635925293, "step": 88 }, { "epoch": 0.13069016152716592, "grad_norm": 54.054622650146484, "learning_rate": 4.988118539941847e-07, "logits/chosen": -0.7244502902030945, "logits/rejected": -0.6862339973449707, "logps/chosen": -65.05143737792969, "logps/ref_chosen": -60.624916076660156, "logps/ref_rejected": -82.08354949951172, "logps/rejected": -99.46173095703125, "loss": 0.7405, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27611032128334045, "margin_dpo/beta_margin_grad_std": 0.17834323644638062, "margin_dpo/beta_margin_mean": 1.295165777206421, "margin_dpo/loss_margin_mean": 12.951656341552734, "margin_dpo/margin_mean": 12.95165729522705, "margin_dpo/margin_std": 13.88388442993164, "step": 89 }, { "epoch": 0.13215859030837004, "grad_norm": 66.94837188720703, "learning_rate": 4.986836074908615e-07, "logits/chosen": -0.6321258544921875, "logits/rejected": -0.6237634420394897, "logps/chosen": -59.42333221435547, "logps/ref_chosen": -53.285308837890625, "logps/ref_rejected": -111.54470825195312, "logps/rejected": -133.43154907226562, "loss": 0.8384, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.285112202167511, "margin_dpo/beta_margin_grad_std": 0.226267009973526, "margin_dpo/beta_margin_mean": 1.5748803615570068, "margin_dpo/loss_margin_mean": 15.748802185058594, "margin_dpo/margin_mean": 15.748802185058594, "margin_dpo/margin_std": 20.31298065185547, "step": 90 }, { "epoch": 0.13362701908957417, "grad_norm": 65.86888122558594, "learning_rate": 4.985488079432037e-07, "logits/chosen": -0.6855983734130859, "logits/rejected": -0.6458035707473755, "logps/chosen": -67.0127944946289, "logps/ref_chosen": -61.80295944213867, "logps/ref_rejected": -87.87395477294922, "logps/rejected": -108.96083068847656, "loss": 0.7585, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2720244526863098, "margin_dpo/beta_margin_grad_std": 0.22684738039970398, "margin_dpo/beta_margin_mean": 1.5877044200897217, "margin_dpo/loss_margin_mean": 15.877042770385742, "margin_dpo/margin_mean": 15.877042770385742, "margin_dpo/margin_std": 17.475290298461914, "step": 91 }, { "epoch": 0.13509544787077826, "grad_norm": 60.52584457397461, "learning_rate": 4.984074589033043e-07, "logits/chosen": -0.7122005224227905, "logits/rejected": -0.6839097738265991, "logps/chosen": -56.672237396240234, "logps/ref_chosen": -51.640769958496094, "logps/ref_rejected": -77.88117980957031, "logps/rejected": -97.52497100830078, "loss": 0.8096, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.28509509563446045, "margin_dpo/beta_margin_grad_std": 0.22501453757286072, "margin_dpo/beta_margin_mean": 1.4612317085266113, "margin_dpo/loss_margin_mean": 14.61231803894043, "margin_dpo/margin_mean": 14.61231803894043, "margin_dpo/margin_std": 17.27523422241211, "step": 92 }, { "epoch": 0.13656387665198239, "grad_norm": 47.008575439453125, "learning_rate": 4.982595640958425e-07, "logits/chosen": -0.7324954271316528, "logits/rejected": -0.671492874622345, "logps/chosen": -57.973655700683594, "logps/ref_chosen": -52.529239654541016, "logps/ref_rejected": -77.1607437133789, "logps/rejected": -97.39739990234375, "loss": 0.6889, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2600102424621582, "margin_dpo/beta_margin_grad_std": 0.18848371505737305, "margin_dpo/beta_margin_mean": 1.479223608970642, "margin_dpo/loss_margin_mean": 14.792236328125, "margin_dpo/margin_mean": 14.792236328125, "margin_dpo/margin_std": 15.35598087310791, "step": 93 }, { "epoch": 0.13803230543318648, "grad_norm": 51.4643669128418, "learning_rate": 4.98105127417984e-07, "logits/chosen": -0.6778910756111145, "logits/rejected": -0.649002730846405, "logps/chosen": -67.1570053100586, "logps/ref_chosen": -61.22261047363281, "logps/ref_rejected": -99.59902954101562, "logps/rejected": -121.3552474975586, "loss": 0.6464, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24860258400440216, "margin_dpo/beta_margin_grad_std": 0.19004377722740173, "margin_dpo/beta_margin_mean": 1.5821821689605713, "margin_dpo/loss_margin_mean": 15.821820259094238, "margin_dpo/margin_mean": 15.821819305419922, "margin_dpo/margin_std": 14.735492706298828, "step": 94 }, { "epoch": 0.1395007342143906, "grad_norm": 50.75383758544922, "learning_rate": 4.979441529392784e-07, "logits/chosen": -0.6933159828186035, "logits/rejected": -0.655129075050354, "logps/chosen": -57.09678649902344, "logps/ref_chosen": -52.52364730834961, "logps/ref_rejected": -75.88035583496094, "logps/rejected": -93.311767578125, "loss": 0.7209, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2710234224796295, "margin_dpo/beta_margin_grad_std": 0.1795656383037567, "margin_dpo/beta_margin_mean": 1.2858270406723022, "margin_dpo/loss_margin_mean": 12.858270645141602, "margin_dpo/margin_mean": 12.858270645141602, "margin_dpo/margin_std": 12.511711120605469, "step": 95 }, { "epoch": 0.14096916299559473, "grad_norm": 50.98220443725586, "learning_rate": 4.977766449015534e-07, "logits/chosen": -0.6764161586761475, "logits/rejected": -0.6342806816101074, "logps/chosen": -65.936279296875, "logps/ref_chosen": -62.15697479248047, "logps/ref_rejected": -96.59601593017578, "logps/rejected": -117.31523895263672, "loss": 0.6236, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23587027192115784, "margin_dpo/beta_margin_grad_std": 0.18438121676445007, "margin_dpo/beta_margin_mean": 1.6939918994903564, "margin_dpo/loss_margin_mean": 16.939918518066406, "margin_dpo/margin_mean": 16.939918518066406, "margin_dpo/margin_std": 16.764862060546875, "step": 96 }, { "epoch": 0.14243759177679882, "grad_norm": 52.73670959472656, "learning_rate": 4.976026077188012e-07, "logits/chosen": -0.6401921510696411, "logits/rejected": -0.5834782123565674, "logps/chosen": -59.18102264404297, "logps/ref_chosen": -54.64636993408203, "logps/ref_rejected": -76.96475219726562, "logps/rejected": -95.2552490234375, "loss": 0.6774, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25927746295928955, "margin_dpo/beta_margin_grad_std": 0.17485031485557556, "margin_dpo/beta_margin_mean": 1.375584363937378, "margin_dpo/loss_margin_mean": 13.755844116210938, "margin_dpo/margin_mean": 13.755844116210938, "margin_dpo/margin_std": 12.07811164855957, "step": 97 }, { "epoch": 0.14390602055800295, "grad_norm": 58.02984619140625, "learning_rate": 4.974220459770639e-07, "logits/chosen": -0.6708123683929443, "logits/rejected": -0.6468954086303711, "logps/chosen": -71.02387237548828, "logps/ref_chosen": -65.25862884521484, "logps/ref_rejected": -96.5274887084961, "logps/rejected": -117.00706481933594, "loss": 0.7512, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25962063670158386, "margin_dpo/beta_margin_grad_std": 0.2160511016845703, "margin_dpo/beta_margin_mean": 1.4714339971542358, "margin_dpo/loss_margin_mean": 14.714340209960938, "margin_dpo/margin_mean": 14.714340209960938, "margin_dpo/margin_std": 15.175495147705078, "step": 98 }, { "epoch": 0.14537444933920704, "grad_norm": 48.38506317138672, "learning_rate": 4.972349644343108e-07, "logits/chosen": -0.6916057467460632, "logits/rejected": -0.6791607737541199, "logps/chosen": -50.54969787597656, "logps/ref_chosen": -45.63848114013672, "logps/ref_rejected": -86.43792724609375, "logps/rejected": -107.18087768554688, "loss": 0.6459, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2525924742221832, "margin_dpo/beta_margin_grad_std": 0.17562821507453918, "margin_dpo/beta_margin_mean": 1.5831732749938965, "margin_dpo/loss_margin_mean": 15.831732749938965, "margin_dpo/margin_mean": 15.831733703613281, "margin_dpo/margin_std": 16.313186645507812, "step": 99 }, { "epoch": 0.14684287812041116, "grad_norm": 67.92232513427734, "learning_rate": 4.970413680203148e-07, "logits/chosen": -0.6955288648605347, "logits/rejected": -0.6533514857292175, "logps/chosen": -62.669090270996094, "logps/ref_chosen": -57.5939826965332, "logps/ref_rejected": -74.06021118164062, "logps/rejected": -90.62651062011719, "loss": 0.9037, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.30934613943099976, "margin_dpo/beta_margin_grad_std": 0.21438318490982056, "margin_dpo/beta_margin_mean": 1.1491191387176514, "margin_dpo/loss_margin_mean": 11.491190910339355, "margin_dpo/margin_mean": 11.491189956665039, "margin_dpo/margin_std": 15.003036499023438, "step": 100 }, { "epoch": 0.14684287812041116, "eval_logits/chosen": -0.6628317832946777, "eval_logits/rejected": -0.636573851108551, "eval_logps/chosen": -87.1427230834961, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -103.3295669555664, "eval_loss": 0.5592836737632751, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.36682140827178955, "eval_margin_dpo/beta_margin_grad_std": 0.23032358288764954, "eval_margin_dpo/beta_margin_mean": 0.8439961671829224, "eval_margin_dpo/loss_margin_mean": 8.439962387084961, "eval_margin_dpo/margin_mean": 8.439962387084961, "eval_margin_dpo/margin_std": 15.342604637145996, "eval_runtime": 39.9749, "eval_samples_per_second": 58.512, "eval_steps_per_second": 1.851, "step": 100 }, { "epoch": 0.14831130690161526, "grad_norm": 54.49692153930664, "learning_rate": 4.968412618365215e-07, "logits/chosen": -0.7001588344573975, "logits/rejected": -0.6611640453338623, "logps/chosen": -67.25942993164062, "logps/ref_chosen": -61.64884948730469, "logps/ref_rejected": -83.18968963623047, "logps/rejected": -102.17335510253906, "loss": 0.7842, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.288244366645813, "margin_dpo/beta_margin_grad_std": 0.20376016199588776, "margin_dpo/beta_margin_mean": 1.3373081684112549, "margin_dpo/loss_margin_mean": 13.373082160949707, "margin_dpo/margin_mean": 13.37308120727539, "margin_dpo/margin_std": 15.715073585510254, "step": 101 }, { "epoch": 0.14977973568281938, "grad_norm": 69.74262237548828, "learning_rate": 4.966346511559149e-07, "logits/chosen": -0.7373714447021484, "logits/rejected": -0.6927535533905029, "logps/chosen": -70.96074676513672, "logps/ref_chosen": -64.0788803100586, "logps/ref_rejected": -68.18707275390625, "logps/rejected": -85.39456176757812, "loss": 0.9365, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.3263900876045227, "margin_dpo/beta_margin_grad_std": 0.2226821929216385, "margin_dpo/beta_margin_mean": 1.0325615406036377, "margin_dpo/loss_margin_mean": 10.325615882873535, "margin_dpo/margin_mean": 10.325615882873535, "margin_dpo/margin_std": 14.067426681518555, "step": 102 }, { "epoch": 0.1512481644640235, "grad_norm": 46.19940185546875, "learning_rate": 4.964215414228785e-07, "logits/chosen": -0.6903908252716064, "logits/rejected": -0.6522761583328247, "logps/chosen": -64.90095520019531, "logps/ref_chosen": -61.299278259277344, "logps/ref_rejected": -93.57271575927734, "logps/rejected": -115.02561950683594, "loss": 0.5573, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21845993399620056, "margin_dpo/beta_margin_grad_std": 0.17656126618385315, "margin_dpo/beta_margin_mean": 1.7851228713989258, "margin_dpo/loss_margin_mean": 17.851226806640625, "margin_dpo/margin_mean": 17.851226806640625, "margin_dpo/margin_std": 15.277688026428223, "step": 103 }, { "epoch": 0.1527165932452276, "grad_norm": 52.796669006347656, "learning_rate": 4.96201938253052e-07, "logits/chosen": -0.7016171813011169, "logits/rejected": -0.6555418968200684, "logps/chosen": -59.226173400878906, "logps/ref_chosen": -54.37277603149414, "logps/ref_rejected": -89.5647201538086, "logps/rejected": -111.29109191894531, "loss": 0.6771, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2526766061782837, "margin_dpo/beta_margin_grad_std": 0.20172113180160522, "margin_dpo/beta_margin_mean": 1.6872971057891846, "margin_dpo/loss_margin_mean": 16.87297248840332, "margin_dpo/margin_mean": 16.872970581054688, "margin_dpo/margin_std": 17.26502227783203, "step": 104 }, { "epoch": 0.15418502202643172, "grad_norm": 39.37166976928711, "learning_rate": 4.959758474331832e-07, "logits/chosen": -0.7300401926040649, "logits/rejected": -0.6914358139038086, "logps/chosen": -58.295875549316406, "logps/ref_chosen": -54.638946533203125, "logps/ref_rejected": -97.97351837158203, "logps/rejected": -124.21298217773438, "loss": 0.4219, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16911230981349945, "margin_dpo/beta_margin_grad_std": 0.16213884949684143, "margin_dpo/beta_margin_mean": 2.2582526206970215, "margin_dpo/loss_margin_mean": 22.58252716064453, "margin_dpo/margin_mean": 22.58252716064453, "margin_dpo/margin_std": 16.65502166748047, "step": 105 }, { "epoch": 0.15565345080763582, "grad_norm": 49.93497848510742, "learning_rate": 4.957432749209755e-07, "logits/chosen": -0.6783395409584045, "logits/rejected": -0.6233980059623718, "logps/chosen": -59.64507293701172, "logps/ref_chosen": -54.83289337158203, "logps/ref_rejected": -85.22461700439453, "logps/rejected": -104.86808013916016, "loss": 0.6785, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25617170333862305, "margin_dpo/beta_margin_grad_std": 0.19661831855773926, "margin_dpo/beta_margin_mean": 1.483128547668457, "margin_dpo/loss_margin_mean": 14.83128547668457, "margin_dpo/margin_mean": 14.83128547668457, "margin_dpo/margin_std": 14.39011001586914, "step": 106 }, { "epoch": 0.15712187958883994, "grad_norm": 50.40999984741211, "learning_rate": 4.955042268449307e-07, "logits/chosen": -0.7118107676506042, "logits/rejected": -0.6568803787231445, "logps/chosen": -75.61688995361328, "logps/ref_chosen": -69.70780944824219, "logps/ref_rejected": -94.73950958251953, "logps/rejected": -117.43498229980469, "loss": 0.6451, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23818761110305786, "margin_dpo/beta_margin_grad_std": 0.204330176115036, "margin_dpo/beta_margin_mean": 1.6786396503448486, "margin_dpo/loss_margin_mean": 16.786396026611328, "margin_dpo/margin_mean": 16.786396026611328, "margin_dpo/margin_std": 15.435192108154297, "step": 107 }, { "epoch": 0.15859030837004406, "grad_norm": 58.914913177490234, "learning_rate": 4.952587095041881e-07, "logits/chosen": -0.7263258695602417, "logits/rejected": -0.6777476668357849, "logps/chosen": -61.758811950683594, "logps/ref_chosen": -56.0098876953125, "logps/ref_rejected": -95.79601287841797, "logps/rejected": -118.6368408203125, "loss": 0.7481, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26686227321624756, "margin_dpo/beta_margin_grad_std": 0.2259853482246399, "margin_dpo/beta_margin_mean": 1.7091913223266602, "margin_dpo/loss_margin_mean": 17.09191131591797, "margin_dpo/margin_mean": 17.09191131591797, "margin_dpo/margin_std": 19.03655433654785, "step": 108 }, { "epoch": 0.16005873715124816, "grad_norm": 45.8420295715332, "learning_rate": 4.95006729368358e-07, "logits/chosen": -0.6235396862030029, "logits/rejected": -0.5926010608673096, "logps/chosen": -67.99076080322266, "logps/ref_chosen": -62.88549041748047, "logps/ref_rejected": -98.68573760986328, "logps/rejected": -122.61830139160156, "loss": 0.545, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2126576006412506, "margin_dpo/beta_margin_grad_std": 0.18558171391487122, "margin_dpo/beta_margin_mean": 1.8827290534973145, "margin_dpo/loss_margin_mean": 18.827289581298828, "margin_dpo/margin_mean": 18.82729148864746, "margin_dpo/margin_std": 15.299284934997559, "step": 109 }, { "epoch": 0.16152716593245228, "grad_norm": 50.39557647705078, "learning_rate": 4.947482930773511e-07, "logits/chosen": -0.6504217386245728, "logits/rejected": -0.5953609347343445, "logps/chosen": -63.09541320800781, "logps/ref_chosen": -58.753684997558594, "logps/ref_rejected": -79.75001525878906, "logps/rejected": -101.90673828125, "loss": 0.6773, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23589923977851868, "margin_dpo/beta_margin_grad_std": 0.20386064052581787, "margin_dpo/beta_margin_mean": 1.7814992666244507, "margin_dpo/loss_margin_mean": 17.814992904663086, "margin_dpo/margin_mean": 17.814992904663086, "margin_dpo/margin_std": 18.05242919921875, "step": 110 }, { "epoch": 0.16299559471365638, "grad_norm": 53.93953323364258, "learning_rate": 4.944834074412042e-07, "logits/chosen": -0.6995693445205688, "logits/rejected": -0.6706931591033936, "logps/chosen": -74.98273468017578, "logps/ref_chosen": -68.62410736083984, "logps/ref_rejected": -98.42886352539062, "logps/rejected": -123.05096435546875, "loss": 0.663, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23302116990089417, "margin_dpo/beta_margin_grad_std": 0.2124572992324829, "margin_dpo/beta_margin_mean": 1.826347827911377, "margin_dpo/loss_margin_mean": 18.263477325439453, "margin_dpo/margin_mean": 18.263477325439453, "margin_dpo/margin_std": 17.97542953491211, "step": 111 }, { "epoch": 0.1644640234948605, "grad_norm": 58.261051177978516, "learning_rate": 4.942120794399002e-07, "logits/chosen": -0.6929997205734253, "logits/rejected": -0.6383606791496277, "logps/chosen": -56.543792724609375, "logps/ref_chosen": -50.24964141845703, "logps/ref_rejected": -64.77442932128906, "logps/rejected": -84.3316421508789, "loss": 0.8086, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2888134717941284, "margin_dpo/beta_margin_grad_std": 0.2119472473859787, "margin_dpo/beta_margin_mean": 1.3263057470321655, "margin_dpo/loss_margin_mean": 13.263057708740234, "margin_dpo/margin_mean": 13.263057708740234, "margin_dpo/margin_std": 15.071852684020996, "step": 112 }, { "epoch": 0.16593245227606462, "grad_norm": 52.767189025878906, "learning_rate": 4.939343162231841e-07, "logits/chosen": -0.6641270518302917, "logits/rejected": -0.6109206676483154, "logps/chosen": -72.74588012695312, "logps/ref_chosen": -66.71295166015625, "logps/ref_rejected": -77.96870422363281, "logps/rejected": -98.95388793945312, "loss": 0.659, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25107163190841675, "margin_dpo/beta_margin_grad_std": 0.19331878423690796, "margin_dpo/beta_margin_mean": 1.495226263999939, "margin_dpo/loss_margin_mean": 14.952262878417969, "margin_dpo/margin_mean": 14.952262878417969, "margin_dpo/margin_std": 13.45613956451416, "step": 113 }, { "epoch": 0.16740088105726872, "grad_norm": 48.202720642089844, "learning_rate": 4.936501251103751e-07, "logits/chosen": -0.6876777410507202, "logits/rejected": -0.6400505304336548, "logps/chosen": -63.42707824707031, "logps/ref_chosen": -57.78507995605469, "logps/ref_rejected": -87.10966491699219, "logps/rejected": -112.90645599365234, "loss": 0.5985, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22679130733013153, "margin_dpo/beta_margin_grad_std": 0.2024666965007782, "margin_dpo/beta_margin_mean": 2.0154800415039062, "margin_dpo/loss_margin_mean": 20.154800415039062, "margin_dpo/margin_mean": 20.154800415039062, "margin_dpo/margin_std": 20.26180076599121, "step": 114 }, { "epoch": 0.16886930983847284, "grad_norm": 73.78862762451172, "learning_rate": 4.933595135901732e-07, "logits/chosen": -0.7042691111564636, "logits/rejected": -0.6589173078536987, "logps/chosen": -73.75160217285156, "logps/ref_chosen": -65.5826416015625, "logps/ref_rejected": -98.56552124023438, "logps/rejected": -122.13522338867188, "loss": 0.7918, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.27233636379241943, "margin_dpo/beta_margin_grad_std": 0.21869003772735596, "margin_dpo/beta_margin_mean": 1.5400748252868652, "margin_dpo/loss_margin_mean": 15.400747299194336, "margin_dpo/margin_mean": 15.400747299194336, "margin_dpo/margin_std": 18.599172592163086, "step": 115 }, { "epoch": 0.17033773861967694, "grad_norm": 47.02722930908203, "learning_rate": 4.930624893204624e-07, "logits/chosen": -0.7148517370223999, "logits/rejected": -0.6804147958755493, "logps/chosen": -57.35455322265625, "logps/ref_chosen": -51.40031051635742, "logps/ref_rejected": -80.5218505859375, "logps/rejected": -101.68440246582031, "loss": 0.6214, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24444279074668884, "margin_dpo/beta_margin_grad_std": 0.17096129059791565, "margin_dpo/beta_margin_mean": 1.5208299160003662, "margin_dpo/loss_margin_mean": 15.208297729492188, "margin_dpo/margin_mean": 15.20829963684082, "margin_dpo/margin_std": 13.959308624267578, "step": 116 }, { "epoch": 0.17180616740088106, "grad_norm": 61.11030960083008, "learning_rate": 4.927590601281083e-07, "logits/chosen": -0.6608189344406128, "logits/rejected": -0.6192047595977783, "logps/chosen": -75.69219207763672, "logps/ref_chosen": -69.29840850830078, "logps/ref_rejected": -66.58399200439453, "logps/rejected": -87.99634552001953, "loss": 0.6968, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26184725761413574, "margin_dpo/beta_margin_grad_std": 0.19542691111564636, "margin_dpo/beta_margin_mean": 1.501856803894043, "margin_dpo/loss_margin_mean": 15.01856803894043, "margin_dpo/margin_mean": 15.01856803894043, "margin_dpo/margin_std": 15.984650611877441, "step": 117 }, { "epoch": 0.17327459618208516, "grad_norm": 48.049564361572266, "learning_rate": 4.924492340087524e-07, "logits/chosen": -0.6910693645477295, "logits/rejected": -0.6483018398284912, "logps/chosen": -62.306884765625, "logps/ref_chosen": -55.6409797668457, "logps/ref_rejected": -75.66905212402344, "logps/rejected": -96.35951232910156, "loss": 0.6673, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2580760717391968, "margin_dpo/beta_margin_grad_std": 0.17991520464420319, "margin_dpo/beta_margin_mean": 1.4024548530578613, "margin_dpo/loss_margin_mean": 14.024548530578613, "margin_dpo/margin_mean": 14.024547576904297, "margin_dpo/margin_std": 12.942065238952637, "step": 118 }, { "epoch": 0.17474302496328928, "grad_norm": 57.746402740478516, "learning_rate": 4.92133019126601e-07, "logits/chosen": -0.6886883974075317, "logits/rejected": -0.6644145250320435, "logps/chosen": -80.7379379272461, "logps/ref_chosen": -73.51019287109375, "logps/ref_rejected": -102.97728729248047, "logps/rejected": -125.08132934570312, "loss": 0.7405, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2667841613292694, "margin_dpo/beta_margin_grad_std": 0.21272984147071838, "margin_dpo/beta_margin_mean": 1.4876298904418945, "margin_dpo/loss_margin_mean": 14.876298904418945, "margin_dpo/margin_mean": 14.876298904418945, "margin_dpo/margin_std": 16.01374626159668, "step": 119 }, { "epoch": 0.1762114537444934, "grad_norm": 52.1450080871582, "learning_rate": 4.918104238142103e-07, "logits/chosen": -0.7026511430740356, "logits/rejected": -0.6608834266662598, "logps/chosen": -84.82908630371094, "logps/ref_chosen": -76.78083801269531, "logps/ref_rejected": -108.02374267578125, "logps/rejected": -134.74542236328125, "loss": 0.6004, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22407472133636475, "margin_dpo/beta_margin_grad_std": 0.20310235023498535, "margin_dpo/beta_margin_mean": 1.8673429489135742, "margin_dpo/loss_margin_mean": 18.673429489135742, "margin_dpo/margin_mean": 18.673429489135742, "margin_dpo/margin_std": 17.22457504272461, "step": 120 }, { "epoch": 0.1776798825256975, "grad_norm": 48.357093811035156, "learning_rate": 4.91481456572267e-07, "logits/chosen": -0.6549187898635864, "logits/rejected": -0.6369335651397705, "logps/chosen": -69.40840911865234, "logps/ref_chosen": -61.789894104003906, "logps/ref_rejected": -109.99456787109375, "logps/rejected": -137.22264099121094, "loss": 0.5936, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2275564819574356, "margin_dpo/beta_margin_grad_std": 0.19740846753120422, "margin_dpo/beta_margin_mean": 1.9609556198120117, "margin_dpo/loss_margin_mean": 19.609556198120117, "margin_dpo/margin_mean": 19.60955810546875, "margin_dpo/margin_std": 18.554580688476562, "step": 121 }, { "epoch": 0.17914831130690162, "grad_norm": 45.592864990234375, "learning_rate": 4.911461260693638e-07, "logits/chosen": -0.6858741044998169, "logits/rejected": -0.6766628623008728, "logps/chosen": -53.84559631347656, "logps/ref_chosen": -46.90221405029297, "logps/ref_rejected": -106.71418762207031, "logps/rejected": -138.1607666015625, "loss": 0.4337, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1684824675321579, "margin_dpo/beta_margin_grad_std": 0.18330231308937073, "margin_dpo/beta_margin_mean": 2.450319290161133, "margin_dpo/loss_margin_mean": 24.503192901611328, "margin_dpo/margin_mean": 24.503192901611328, "margin_dpo/margin_std": 18.173328399658203, "step": 122 }, { "epoch": 0.18061674008810572, "grad_norm": 66.03611755371094, "learning_rate": 4.908044411417711e-07, "logits/chosen": -0.6609284281730652, "logits/rejected": -0.6286982297897339, "logps/chosen": -68.30619812011719, "logps/ref_chosen": -61.33863830566406, "logps/ref_rejected": -87.77539825439453, "logps/rejected": -111.76436614990234, "loss": 0.7836, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2565336227416992, "margin_dpo/beta_margin_grad_std": 0.23589524626731873, "margin_dpo/beta_margin_mean": 1.7021416425704956, "margin_dpo/loss_margin_mean": 17.02141571044922, "margin_dpo/margin_mean": 17.02141571044922, "margin_dpo/margin_std": 19.722978591918945, "step": 123 }, { "epoch": 0.18208516886930984, "grad_norm": 62.63188934326172, "learning_rate": 4.904564107932048e-07, "logits/chosen": -0.664189338684082, "logits/rejected": -0.6542805433273315, "logps/chosen": -78.76295471191406, "logps/ref_chosen": -71.44833374023438, "logps/ref_rejected": -117.58056640625, "logps/rejected": -146.3335723876953, "loss": 0.6425, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22931857407093048, "margin_dpo/beta_margin_grad_std": 0.22194989025592804, "margin_dpo/beta_margin_mean": 2.1438369750976562, "margin_dpo/loss_margin_mean": 21.438369750976562, "margin_dpo/margin_mean": 21.438369750976562, "margin_dpo/margin_std": 23.726600646972656, "step": 124 }, { "epoch": 0.18355359765051396, "grad_norm": 45.74631881713867, "learning_rate": 4.90102044194588e-07, "logits/chosen": -0.6452882289886475, "logits/rejected": -0.6194664239883423, "logps/chosen": -55.687599182128906, "logps/ref_chosen": -50.136940002441406, "logps/ref_rejected": -83.98861694335938, "logps/rejected": -109.53338623046875, "loss": 0.5347, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2084668129682541, "margin_dpo/beta_margin_grad_std": 0.1783091127872467, "margin_dpo/beta_margin_mean": 1.9994112253189087, "margin_dpo/loss_margin_mean": 19.99411392211914, "margin_dpo/margin_mean": 19.994110107421875, "margin_dpo/margin_std": 17.385387420654297, "step": 125 }, { "epoch": 0.18502202643171806, "grad_norm": 55.11186599731445, "learning_rate": 4.897413506838102e-07, "logits/chosen": -0.6568164825439453, "logits/rejected": -0.6220812797546387, "logps/chosen": -61.971824645996094, "logps/ref_chosen": -55.66706848144531, "logps/ref_rejected": -98.1297607421875, "logps/rejected": -123.57440185546875, "loss": 0.5552, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21338176727294922, "margin_dpo/beta_margin_grad_std": 0.18551576137542725, "margin_dpo/beta_margin_mean": 1.9139891862869263, "margin_dpo/loss_margin_mean": 19.139890670776367, "margin_dpo/margin_mean": 19.139890670776367, "margin_dpo/margin_std": 16.969818115234375, "step": 126 }, { "epoch": 0.18649045521292218, "grad_norm": 46.06990432739258, "learning_rate": 4.89374339765481e-07, "logits/chosen": -0.638472318649292, "logits/rejected": -0.6041021347045898, "logps/chosen": -61.95406723022461, "logps/ref_chosen": -56.55467987060547, "logps/ref_rejected": -76.7957763671875, "logps/rejected": -98.31398010253906, "loss": 0.6303, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2392818182706833, "margin_dpo/beta_margin_grad_std": 0.19149260222911835, "margin_dpo/beta_margin_mean": 1.6118814945220947, "margin_dpo/loss_margin_mean": 16.118816375732422, "margin_dpo/margin_mean": 16.118816375732422, "margin_dpo/margin_std": 13.991384506225586, "step": 127 }, { "epoch": 0.18795888399412627, "grad_norm": 51.177642822265625, "learning_rate": 4.890010211106795e-07, "logits/chosen": -0.663079023361206, "logits/rejected": -0.616753876209259, "logps/chosen": -63.87889862060547, "logps/ref_chosen": -58.12095642089844, "logps/ref_rejected": -76.43896484375, "logps/rejected": -99.25593566894531, "loss": 0.6751, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25181353092193604, "margin_dpo/beta_margin_grad_std": 0.20399567484855652, "margin_dpo/beta_margin_mean": 1.7059035301208496, "margin_dpo/loss_margin_mean": 17.05903434753418, "margin_dpo/margin_mean": 17.05903434753418, "margin_dpo/margin_std": 17.72481346130371, "step": 128 }, { "epoch": 0.1894273127753304, "grad_norm": 72.67992401123047, "learning_rate": 4.88621404556699e-07, "logits/chosen": -0.6873067617416382, "logits/rejected": -0.6568499803543091, "logps/chosen": -75.6619644165039, "logps/ref_chosen": -66.91636657714844, "logps/ref_rejected": -96.6422119140625, "logps/rejected": -122.64834594726562, "loss": 0.7959, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26328974962234497, "margin_dpo/beta_margin_grad_std": 0.24150995910167694, "margin_dpo/beta_margin_mean": 1.7260537147521973, "margin_dpo/loss_margin_mean": 17.260536193847656, "margin_dpo/margin_mean": 17.26053810119629, "margin_dpo/margin_std": 20.107585906982422, "step": 129 }, { "epoch": 0.19089574155653452, "grad_norm": 50.73147964477539, "learning_rate": 4.882355001067891e-07, "logits/chosen": -0.6596213579177856, "logits/rejected": -0.6461096405982971, "logps/chosen": -51.04236602783203, "logps/ref_chosen": -44.666847229003906, "logps/ref_rejected": -82.78165435791016, "logps/rejected": -112.08168029785156, "loss": 0.5939, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20150384306907654, "margin_dpo/beta_margin_grad_std": 0.21593783795833588, "margin_dpo/beta_margin_mean": 2.2924509048461914, "margin_dpo/loss_margin_mean": 22.924509048461914, "margin_dpo/margin_mean": 22.924509048461914, "margin_dpo/margin_std": 19.672473907470703, "step": 130 }, { "epoch": 0.19236417033773862, "grad_norm": 43.14263916015625, "learning_rate": 4.878433179298909e-07, "logits/chosen": -0.6646705269813538, "logits/rejected": -0.6489601135253906, "logps/chosen": -49.25099182128906, "logps/ref_chosen": -44.92458724975586, "logps/ref_rejected": -88.44401550292969, "logps/rejected": -113.0731201171875, "loss": 0.5387, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20338965952396393, "margin_dpo/beta_margin_grad_std": 0.19119322299957275, "margin_dpo/beta_margin_mean": 2.0302700996398926, "margin_dpo/loss_margin_mean": 20.302701950073242, "margin_dpo/margin_mean": 20.302701950073242, "margin_dpo/margin_std": 17.324234008789062, "step": 131 }, { "epoch": 0.19383259911894274, "grad_norm": 48.75657272338867, "learning_rate": 4.874448683603694e-07, "logits/chosen": -0.6894493699073792, "logits/rejected": -0.6632376909255981, "logps/chosen": -65.58708953857422, "logps/ref_chosen": -59.00108337402344, "logps/ref_rejected": -87.89215087890625, "logps/rejected": -113.75344848632812, "loss": 0.539, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2136635035276413, "margin_dpo/beta_margin_grad_std": 0.17412179708480835, "margin_dpo/beta_margin_mean": 1.927529215812683, "margin_dpo/loss_margin_mean": 19.275291442871094, "margin_dpo/margin_mean": 19.275293350219727, "margin_dpo/margin_std": 17.327112197875977, "step": 132 }, { "epoch": 0.19530102790014683, "grad_norm": 57.14876937866211, "learning_rate": 4.870401618977415e-07, "logits/chosen": -0.6868765354156494, "logits/rejected": -0.6663703918457031, "logps/chosen": -74.35096740722656, "logps/ref_chosen": -66.60449981689453, "logps/ref_rejected": -96.33355712890625, "logps/rejected": -121.88394165039062, "loss": 0.711, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2557414174079895, "margin_dpo/beta_margin_grad_std": 0.22053731977939606, "margin_dpo/beta_margin_mean": 1.780390977859497, "margin_dpo/loss_margin_mean": 17.803909301757812, "margin_dpo/margin_mean": 17.803909301757812, "margin_dpo/margin_std": 19.555706024169922, "step": 133 }, { "epoch": 0.19676945668135096, "grad_norm": 48.30363845825195, "learning_rate": 4.866292092063986e-07, "logits/chosen": -0.6847056150436401, "logits/rejected": -0.6499172449111938, "logps/chosen": -57.004554748535156, "logps/ref_chosen": -52.06925582885742, "logps/ref_rejected": -87.6545181274414, "logps/rejected": -112.0121841430664, "loss": 0.4899, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20193609595298767, "margin_dpo/beta_margin_grad_std": 0.15019506216049194, "margin_dpo/beta_margin_mean": 1.942237377166748, "margin_dpo/loss_margin_mean": 19.422372817993164, "margin_dpo/margin_mean": 19.42237091064453, "margin_dpo/margin_std": 15.808595657348633, "step": 134 }, { "epoch": 0.19823788546255505, "grad_norm": 56.833290100097656, "learning_rate": 4.862120211153265e-07, "logits/chosen": -0.6657185554504395, "logits/rejected": -0.6646615862846375, "logps/chosen": -58.18457794189453, "logps/ref_chosen": -50.353858947753906, "logps/ref_rejected": -115.97975158691406, "logps/rejected": -144.62242126464844, "loss": 0.5819, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21536532044410706, "margin_dpo/beta_margin_grad_std": 0.20012225210666656, "margin_dpo/beta_margin_mean": 2.0811963081359863, "margin_dpo/loss_margin_mean": 20.811962127685547, "margin_dpo/margin_mean": 20.811962127685547, "margin_dpo/margin_std": 19.506851196289062, "step": 135 }, { "epoch": 0.19970631424375918, "grad_norm": 60.99176788330078, "learning_rate": 4.857886086178193e-07, "logits/chosen": -0.6759936809539795, "logits/rejected": -0.6444242596626282, "logps/chosen": -73.07025146484375, "logps/ref_chosen": -65.072509765625, "logps/ref_rejected": -96.32122802734375, "logps/rejected": -120.97657775878906, "loss": 0.6585, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.252665251493454, "margin_dpo/beta_margin_grad_std": 0.18508610129356384, "margin_dpo/beta_margin_mean": 1.6657602787017822, "margin_dpo/loss_margin_mean": 16.657604217529297, "margin_dpo/margin_mean": 16.657604217529297, "margin_dpo/margin_std": 18.530437469482422, "step": 136 }, { "epoch": 0.2011747430249633, "grad_norm": 61.030094146728516, "learning_rate": 4.853589828711902e-07, "logits/chosen": -0.6564372181892395, "logits/rejected": -0.6498109102249146, "logps/chosen": -58.350120544433594, "logps/ref_chosen": -48.759117126464844, "logps/ref_rejected": -113.86377716064453, "logps/rejected": -146.03843688964844, "loss": 0.6181, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.218036487698555, "margin_dpo/beta_margin_grad_std": 0.2112565040588379, "margin_dpo/beta_margin_mean": 2.258366346359253, "margin_dpo/loss_margin_mean": 22.583663940429688, "margin_dpo/margin_mean": 22.583663940429688, "margin_dpo/margin_std": 22.754310607910156, "step": 137 }, { "epoch": 0.2026431718061674, "grad_norm": 70.91644287109375, "learning_rate": 4.849231551964771e-07, "logits/chosen": -0.6515681147575378, "logits/rejected": -0.6243264675140381, "logps/chosen": -69.78646850585938, "logps/ref_chosen": -60.519649505615234, "logps/ref_rejected": -93.19694519042969, "logps/rejected": -121.13736724853516, "loss": 0.6843, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2272961288690567, "margin_dpo/beta_margin_grad_std": 0.22245639562606812, "margin_dpo/beta_margin_mean": 1.8673601150512695, "margin_dpo/loss_margin_mean": 18.673601150512695, "margin_dpo/margin_mean": 18.673603057861328, "margin_dpo/margin_std": 18.434284210205078, "step": 138 }, { "epoch": 0.20411160058737152, "grad_norm": 50.293697357177734, "learning_rate": 4.844811370781446e-07, "logits/chosen": -0.6459161639213562, "logits/rejected": -0.6150977611541748, "logps/chosen": -53.843475341796875, "logps/ref_chosen": -46.89138412475586, "logps/ref_rejected": -79.72798156738281, "logps/rejected": -107.27476501464844, "loss": 0.548, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21206048130989075, "margin_dpo/beta_margin_grad_std": 0.19148895144462585, "margin_dpo/beta_margin_mean": 2.059469699859619, "margin_dpo/loss_margin_mean": 20.594696044921875, "margin_dpo/margin_mean": 20.594696044921875, "margin_dpo/margin_std": 18.52047348022461, "step": 139 }, { "epoch": 0.2055800293685756, "grad_norm": 53.57754898071289, "learning_rate": 4.840329401637809e-07, "logits/chosen": -0.6656177639961243, "logits/rejected": -0.6380197405815125, "logps/chosen": -66.53479766845703, "logps/ref_chosen": -58.97471618652344, "logps/ref_rejected": -83.28411102294922, "logps/rejected": -110.21284484863281, "loss": 0.676, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23967662453651428, "margin_dpo/beta_margin_grad_std": 0.2281067818403244, "margin_dpo/beta_margin_mean": 1.9368653297424316, "margin_dpo/loss_margin_mean": 19.368654251098633, "margin_dpo/margin_mean": 19.36865234375, "margin_dpo/margin_std": 19.57999038696289, "step": 140 }, { "epoch": 0.20704845814977973, "grad_norm": 61.85297393798828, "learning_rate": 4.83578576263792e-07, "logits/chosen": -0.6568824052810669, "logits/rejected": -0.6319071650505066, "logps/chosen": -81.36563110351562, "logps/ref_chosen": -75.0756607055664, "logps/ref_rejected": -98.1922607421875, "logps/rejected": -123.78886413574219, "loss": 0.6184, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21810100972652435, "margin_dpo/beta_margin_grad_std": 0.2185245007276535, "margin_dpo/beta_margin_mean": 1.9306633472442627, "margin_dpo/loss_margin_mean": 19.30663299560547, "margin_dpo/margin_mean": 19.30663299560547, "margin_dpo/margin_std": 17.307022094726562, "step": 141 }, { "epoch": 0.20851688693098386, "grad_norm": 72.42768096923828, "learning_rate": 4.83118057351089e-07, "logits/chosen": -0.6577416658401489, "logits/rejected": -0.6425771117210388, "logps/chosen": -67.6053695678711, "logps/ref_chosen": -58.027931213378906, "logps/ref_rejected": -94.58222198486328, "logps/rejected": -124.15780639648438, "loss": 0.7595, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24029812216758728, "margin_dpo/beta_margin_grad_std": 0.23216235637664795, "margin_dpo/beta_margin_mean": 1.999813437461853, "margin_dpo/loss_margin_mean": 19.99813461303711, "margin_dpo/margin_mean": 19.99813461303711, "margin_dpo/margin_std": 21.703876495361328, "step": 142 }, { "epoch": 0.20998531571218795, "grad_norm": 73.55904388427734, "learning_rate": 4.826513955607734e-07, "logits/chosen": -0.6666814088821411, "logits/rejected": -0.6278376579284668, "logps/chosen": -66.27539825439453, "logps/ref_chosen": -57.59645080566406, "logps/ref_rejected": -78.99957275390625, "logps/rejected": -103.03237915039062, "loss": 0.8432, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.275440514087677, "margin_dpo/beta_margin_grad_std": 0.24454638361930847, "margin_dpo/beta_margin_mean": 1.535386323928833, "margin_dpo/loss_margin_mean": 15.353862762451172, "margin_dpo/margin_mean": 15.353862762451172, "margin_dpo/margin_std": 18.465240478515625, "step": 143 }, { "epoch": 0.21145374449339208, "grad_norm": 43.79653549194336, "learning_rate": 4.821786031898176e-07, "logits/chosen": -0.6671550869941711, "logits/rejected": -0.6211960315704346, "logps/chosen": -66.0051498413086, "logps/ref_chosen": -59.90636444091797, "logps/ref_rejected": -82.00025939941406, "logps/rejected": -107.91677856445312, "loss": 0.532, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20202696323394775, "margin_dpo/beta_margin_grad_std": 0.19462409615516663, "margin_dpo/beta_margin_mean": 1.9817728996276855, "margin_dpo/loss_margin_mean": 19.817729949951172, "margin_dpo/margin_mean": 19.817729949951172, "margin_dpo/margin_std": 16.256122589111328, "step": 144 }, { "epoch": 0.21292217327459617, "grad_norm": 47.29103469848633, "learning_rate": 4.816996926967401e-07, "logits/chosen": -0.6466660499572754, "logits/rejected": -0.6030235290527344, "logps/chosen": -64.20927429199219, "logps/ref_chosen": -56.60066604614258, "logps/ref_rejected": -77.86631774902344, "logps/rejected": -105.54521179199219, "loss": 0.5566, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2146737277507782, "margin_dpo/beta_margin_grad_std": 0.18967363238334656, "margin_dpo/beta_margin_mean": 2.0070290565490723, "margin_dpo/loss_margin_mean": 20.07029151916504, "margin_dpo/margin_mean": 20.070289611816406, "margin_dpo/margin_std": 18.223201751708984, "step": 145 }, { "epoch": 0.2143906020558003, "grad_norm": 70.13373565673828, "learning_rate": 4.812146767012779e-07, "logits/chosen": -0.680939793586731, "logits/rejected": -0.6254955530166626, "logps/chosen": -76.54412078857422, "logps/ref_chosen": -66.00045776367188, "logps/ref_rejected": -81.70278930664062, "logps/rejected": -108.8254623413086, "loss": 0.7131, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2562163472175598, "margin_dpo/beta_margin_grad_std": 0.22194227576255798, "margin_dpo/beta_margin_mean": 1.657900333404541, "margin_dpo/loss_margin_mean": 16.579002380371094, "margin_dpo/margin_mean": 16.579002380371094, "margin_dpo/margin_std": 17.49138641357422, "step": 146 }, { "epoch": 0.21585903083700442, "grad_norm": 57.6500244140625, "learning_rate": 4.807235679840536e-07, "logits/chosen": -0.6382741928100586, "logits/rejected": -0.5938813090324402, "logps/chosen": -61.75067138671875, "logps/ref_chosen": -53.405487060546875, "logps/ref_rejected": -71.39061737060547, "logps/rejected": -100.0536880493164, "loss": 0.5682, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21429036557674408, "margin_dpo/beta_margin_grad_std": 0.19528846442699432, "margin_dpo/beta_margin_mean": 2.0317890644073486, "margin_dpo/loss_margin_mean": 20.317890167236328, "margin_dpo/margin_mean": 20.317890167236328, "margin_dpo/margin_std": 19.069602966308594, "step": 147 }, { "epoch": 0.2173274596182085, "grad_norm": 50.89360809326172, "learning_rate": 4.802263794862384e-07, "logits/chosen": -0.6731536388397217, "logits/rejected": -0.6413577795028687, "logps/chosen": -71.63116455078125, "logps/ref_chosen": -64.93708038330078, "logps/ref_rejected": -103.09384155273438, "logps/rejected": -125.92637634277344, "loss": 0.6758, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.25298935174942017, "margin_dpo/beta_margin_grad_std": 0.19248813390731812, "margin_dpo/beta_margin_mean": 1.6138441562652588, "margin_dpo/loss_margin_mean": 16.13844108581543, "margin_dpo/margin_mean": 16.13844108581543, "margin_dpo/margin_std": 15.57655143737793, "step": 148 }, { "epoch": 0.21879588839941264, "grad_norm": 43.59981155395508, "learning_rate": 4.797231243092118e-07, "logits/chosen": -0.7026668787002563, "logits/rejected": -0.6738122701644897, "logps/chosen": -65.29115295410156, "logps/ref_chosen": -58.47376251220703, "logps/ref_rejected": -99.31474304199219, "logps/rejected": -126.91746520996094, "loss": 0.5037, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19538308680057526, "margin_dpo/beta_margin_grad_std": 0.18354260921478271, "margin_dpo/beta_margin_mean": 2.078533887863159, "margin_dpo/loss_margin_mean": 20.78533935546875, "margin_dpo/margin_mean": 20.78533935546875, "margin_dpo/margin_std": 16.75721549987793, "step": 149 }, { "epoch": 0.22026431718061673, "grad_norm": 55.560611724853516, "learning_rate": 4.792138157142157e-07, "logits/chosen": -0.6683057546615601, "logits/rejected": -0.6466302871704102, "logps/chosen": -52.58869934082031, "logps/ref_chosen": -45.705810546875, "logps/ref_rejected": -83.34759521484375, "logps/rejected": -109.74415588378906, "loss": 0.6181, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2259724736213684, "margin_dpo/beta_margin_grad_std": 0.19235166907310486, "margin_dpo/beta_margin_mean": 1.951366662979126, "margin_dpo/loss_margin_mean": 19.5136661529541, "margin_dpo/margin_mean": 19.5136661529541, "margin_dpo/margin_std": 19.547821044921875, "step": 150 }, { "epoch": 0.22173274596182085, "grad_norm": 53.323848724365234, "learning_rate": 4.786984671220053e-07, "logits/chosen": -0.6942344903945923, "logits/rejected": -0.6533582210540771, "logps/chosen": -78.0422592163086, "logps/ref_chosen": -70.57083129882812, "logps/ref_rejected": -100.46382141113281, "logps/rejected": -129.36167907714844, "loss": 0.5187, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19543182849884033, "margin_dpo/beta_margin_grad_std": 0.19356155395507812, "margin_dpo/beta_margin_mean": 2.142643690109253, "margin_dpo/loss_margin_mean": 21.426437377929688, "margin_dpo/margin_mean": 21.426435470581055, "margin_dpo/margin_std": 18.174488067626953, "step": 151 }, { "epoch": 0.22320117474302498, "grad_norm": 58.04679870605469, "learning_rate": 4.78177092112495e-07, "logits/chosen": -0.6981043815612793, "logits/rejected": -0.6723449230194092, "logps/chosen": -65.85396575927734, "logps/ref_chosen": -60.164390563964844, "logps/ref_rejected": -106.14045715332031, "logps/rejected": -133.99301147460938, "loss": 0.5073, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18981137871742249, "margin_dpo/beta_margin_grad_std": 0.18638314306735992, "margin_dpo/beta_margin_mean": 2.2162981033325195, "margin_dpo/loss_margin_mean": 22.162979125976562, "margin_dpo/margin_mean": 22.162979125976562, "margin_dpo/margin_std": 18.572938919067383, "step": 152 }, { "epoch": 0.22466960352422907, "grad_norm": 45.65426254272461, "learning_rate": 4.776497044244016e-07, "logits/chosen": -0.6792968511581421, "logits/rejected": -0.6601795554161072, "logps/chosen": -62.912200927734375, "logps/ref_chosen": -56.315277099609375, "logps/ref_rejected": -85.65583801269531, "logps/rejected": -111.13700866699219, "loss": 0.6482, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23670101165771484, "margin_dpo/beta_margin_grad_std": 0.2141384482383728, "margin_dpo/beta_margin_mean": 1.888425350189209, "margin_dpo/loss_margin_mean": 18.884254455566406, "margin_dpo/margin_mean": 18.884254455566406, "margin_dpo/margin_std": 19.149822235107422, "step": 153 }, { "epoch": 0.2261380323054332, "grad_norm": 70.82756805419922, "learning_rate": 4.771163179548808e-07, "logits/chosen": -0.6859003305435181, "logits/rejected": -0.6607710123062134, "logps/chosen": -71.26567077636719, "logps/ref_chosen": -62.74256896972656, "logps/ref_rejected": -104.24420166015625, "logps/rejected": -131.44509887695312, "loss": 0.7219, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2435159683227539, "margin_dpo/beta_margin_grad_std": 0.2322094589471817, "margin_dpo/beta_margin_mean": 1.8677783012390137, "margin_dpo/loss_margin_mean": 18.677783966064453, "margin_dpo/margin_mean": 18.677783966064453, "margin_dpo/margin_std": 18.939533233642578, "step": 154 }, { "epoch": 0.2276064610866373, "grad_norm": 54.700984954833984, "learning_rate": 4.7657694675916247e-07, "logits/chosen": -0.677458643913269, "logits/rejected": -0.650254487991333, "logps/chosen": -66.76405334472656, "logps/ref_chosen": -60.65318298339844, "logps/ref_rejected": -77.49220275878906, "logps/rejected": -103.33089447021484, "loss": 0.5788, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20778781175613403, "margin_dpo/beta_margin_grad_std": 0.1974395513534546, "margin_dpo/beta_margin_mean": 1.972782015800476, "margin_dpo/loss_margin_mean": 19.727819442749023, "margin_dpo/margin_mean": 19.72781753540039, "margin_dpo/margin_std": 17.581571578979492, "step": 155 }, { "epoch": 0.2290748898678414, "grad_norm": 88.04608154296875, "learning_rate": 4.7603160505017893e-07, "logits/chosen": -0.653121829032898, "logits/rejected": -0.622460126876831, "logps/chosen": -79.57420349121094, "logps/ref_chosen": -69.49188232421875, "logps/ref_rejected": -77.1692886352539, "logps/rejected": -102.35071563720703, "loss": 0.9578, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2811535894870758, "margin_dpo/beta_margin_grad_std": 0.2631846070289612, "margin_dpo/beta_margin_mean": 1.5099093914031982, "margin_dpo/loss_margin_mean": 15.09909439086914, "margin_dpo/margin_mean": 15.09909439086914, "margin_dpo/margin_std": 20.09097671508789, "step": 156 }, { "epoch": 0.2305433186490455, "grad_norm": 60.531410217285156, "learning_rate": 4.7548030719819154e-07, "logits/chosen": -0.7271685600280762, "logits/rejected": -0.6973283290863037, "logps/chosen": -71.76556396484375, "logps/ref_chosen": -61.368438720703125, "logps/ref_rejected": -107.64636993408203, "logps/rejected": -139.60733032226562, "loss": 0.5432, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20604455471038818, "margin_dpo/beta_margin_grad_std": 0.1988464891910553, "margin_dpo/beta_margin_mean": 2.1563832759857178, "margin_dpo/loss_margin_mean": 21.563831329345703, "margin_dpo/margin_mean": 21.563831329345703, "margin_dpo/margin_std": 19.046764373779297, "step": 157 }, { "epoch": 0.23201174743024963, "grad_norm": 46.93947219848633, "learning_rate": 4.7492306773041136e-07, "logits/chosen": -0.6574522256851196, "logits/rejected": -0.6339297294616699, "logps/chosen": -65.11067962646484, "logps/ref_chosen": -57.61292266845703, "logps/ref_rejected": -113.6946792602539, "logps/rejected": -143.18002319335938, "loss": 0.5655, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21865713596343994, "margin_dpo/beta_margin_grad_std": 0.1967364400625229, "margin_dpo/beta_margin_mean": 2.198759078979492, "margin_dpo/loss_margin_mean": 21.987590789794922, "margin_dpo/margin_mean": 21.987590789794922, "margin_dpo/margin_std": 21.580215454101562, "step": 158 }, { "epoch": 0.23348017621145375, "grad_norm": 52.955257415771484, "learning_rate": 4.743599013306165e-07, "logits/chosen": -0.6695908308029175, "logits/rejected": -0.628494143486023, "logps/chosen": -89.92948150634766, "logps/ref_chosen": -81.56034088134766, "logps/ref_rejected": -88.8987045288086, "logps/rejected": -116.74146270751953, "loss": 0.6625, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23920385539531708, "margin_dpo/beta_margin_grad_std": 0.21742284297943115, "margin_dpo/beta_margin_mean": 1.947361707687378, "margin_dpo/loss_margin_mean": 19.473617553710938, "margin_dpo/margin_mean": 19.473617553710938, "margin_dpo/margin_std": 20.372608184814453, "step": 159 }, { "epoch": 0.23494860499265785, "grad_norm": 58.06504440307617, "learning_rate": 4.737908228387656e-07, "logits/chosen": -0.6885409355163574, "logits/rejected": -0.6468052864074707, "logps/chosen": -74.30818939208984, "logps/ref_chosen": -65.73088073730469, "logps/ref_rejected": -97.21781921386719, "logps/rejected": -125.66719818115234, "loss": 0.6855, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24562396109104156, "margin_dpo/beta_margin_grad_std": 0.22771283984184265, "margin_dpo/beta_margin_mean": 1.9872074127197266, "margin_dpo/loss_margin_mean": 19.872074127197266, "margin_dpo/margin_mean": 19.872072219848633, "margin_dpo/margin_std": 21.958354949951172, "step": 160 }, { "epoch": 0.23641703377386197, "grad_norm": 53.40824508666992, "learning_rate": 4.7321584725060594e-07, "logits/chosen": -0.6996691226959229, "logits/rejected": -0.6711582541465759, "logps/chosen": -60.577247619628906, "logps/ref_chosen": -52.43647766113281, "logps/ref_rejected": -83.43095397949219, "logps/rejected": -111.64784240722656, "loss": 0.6506, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23106649518013, "margin_dpo/beta_margin_grad_std": 0.22650553286075592, "margin_dpo/beta_margin_mean": 2.0076117515563965, "margin_dpo/loss_margin_mean": 20.07611656188965, "margin_dpo/margin_mean": 20.07611846923828, "margin_dpo/margin_std": 19.994857788085938, "step": 161 }, { "epoch": 0.23788546255506607, "grad_norm": 46.01493835449219, "learning_rate": 4.7263498971727905e-07, "logits/chosen": -0.6546447277069092, "logits/rejected": -0.6173849105834961, "logps/chosen": -70.60856628417969, "logps/ref_chosen": -62.61058807373047, "logps/ref_rejected": -89.39057922363281, "logps/rejected": -116.71580505371094, "loss": 0.6419, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2332153171300888, "margin_dpo/beta_margin_grad_std": 0.20986613631248474, "margin_dpo/beta_margin_mean": 1.932724952697754, "margin_dpo/loss_margin_mean": 19.32724952697754, "margin_dpo/margin_mean": 19.32724952697754, "margin_dpo/margin_std": 19.705549240112305, "step": 162 }, { "epoch": 0.2393538913362702, "grad_norm": 42.86497116088867, "learning_rate": 4.720482655449212e-07, "logits/chosen": -0.65444415807724, "logits/rejected": -0.6160274744033813, "logps/chosen": -62.261444091796875, "logps/ref_chosen": -55.021629333496094, "logps/ref_rejected": -75.41822052001953, "logps/rejected": -101.32667541503906, "loss": 0.5951, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21980655193328857, "margin_dpo/beta_margin_grad_std": 0.20289334654808044, "margin_dpo/beta_margin_mean": 1.8668642044067383, "margin_dpo/loss_margin_mean": 18.668642044067383, "margin_dpo/margin_mean": 18.668642044067383, "margin_dpo/margin_std": 16.874954223632812, "step": 163 }, { "epoch": 0.24082232011747431, "grad_norm": 37.281150817871094, "learning_rate": 4.714556901942599e-07, "logits/chosen": -0.6877849102020264, "logits/rejected": -0.6435602903366089, "logps/chosen": -61.4527702331543, "logps/ref_chosen": -55.64066696166992, "logps/ref_rejected": -79.66463470458984, "logps/rejected": -106.46293640136719, "loss": 0.464, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18166950345039368, "margin_dpo/beta_margin_grad_std": 0.16736756265163422, "margin_dpo/beta_margin_mean": 2.0986199378967285, "margin_dpo/loss_margin_mean": 20.98619842529297, "margin_dpo/margin_mean": 20.98619842529297, "margin_dpo/margin_std": 15.089117050170898, "step": 164 }, { "epoch": 0.2422907488986784, "grad_norm": 65.39411163330078, "learning_rate": 4.708572792802069e-07, "logits/chosen": -0.7008275985717773, "logits/rejected": -0.6510541439056396, "logps/chosen": -69.8692398071289, "logps/ref_chosen": -61.310691833496094, "logps/ref_rejected": -73.67060852050781, "logps/rejected": -96.39982604980469, "loss": 0.7393, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26950976252555847, "margin_dpo/beta_margin_grad_std": 0.200277179479599, "margin_dpo/beta_margin_mean": 1.4170664548873901, "margin_dpo/loss_margin_mean": 14.170663833618164, "margin_dpo/margin_mean": 14.17066478729248, "margin_dpo/margin_std": 14.783781051635742, "step": 165 }, { "epoch": 0.24375917767988253, "grad_norm": 48.58378982543945, "learning_rate": 4.702530485714461e-07, "logits/chosen": -0.6584955453872681, "logits/rejected": -0.650241494178772, "logps/chosen": -59.29880905151367, "logps/ref_chosen": -50.98360061645508, "logps/ref_rejected": -98.09512329101562, "logps/rejected": -129.55075073242188, "loss": 0.5363, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2068173885345459, "margin_dpo/beta_margin_grad_std": 0.18966558575630188, "margin_dpo/beta_margin_mean": 2.3140416145324707, "margin_dpo/loss_margin_mean": 23.14041519165039, "margin_dpo/margin_mean": 23.14041519165039, "margin_dpo/margin_std": 23.364418029785156, "step": 166 }, { "epoch": 0.24522760646108663, "grad_norm": 52.58329391479492, "learning_rate": 4.6964301399001877e-07, "logits/chosen": -0.6375908851623535, "logits/rejected": -0.6227909922599792, "logps/chosen": -58.83246612548828, "logps/ref_chosen": -50.42409133911133, "logps/ref_rejected": -96.03042602539062, "logps/rejected": -128.48484802246094, "loss": 0.4969, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17307597398757935, "margin_dpo/beta_margin_grad_std": 0.1926582306623459, "margin_dpo/beta_margin_mean": 2.404604911804199, "margin_dpo/loss_margin_mean": 24.046049118041992, "margin_dpo/margin_mean": 24.04604721069336, "margin_dpo/margin_std": 18.615556716918945, "step": 167 }, { "epoch": 0.24669603524229075, "grad_norm": 46.499046325683594, "learning_rate": 4.690271916109034e-07, "logits/chosen": -0.7022169232368469, "logits/rejected": -0.6692053079605103, "logps/chosen": -57.07928466796875, "logps/ref_chosen": -49.46282196044922, "logps/ref_rejected": -75.30854797363281, "logps/rejected": -101.93223571777344, "loss": 0.5398, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20751315355300903, "margin_dpo/beta_margin_grad_std": 0.18526721000671387, "margin_dpo/beta_margin_mean": 1.9007223844528198, "margin_dpo/loss_margin_mean": 19.00722312927246, "margin_dpo/margin_mean": 19.007225036621094, "margin_dpo/margin_std": 15.336655616760254, "step": 168 }, { "epoch": 0.24816446402349487, "grad_norm": 55.35097122192383, "learning_rate": 4.6840559766159235e-07, "logits/chosen": -0.6735790967941284, "logits/rejected": -0.644471287727356, "logps/chosen": -67.24217224121094, "logps/ref_chosen": -59.803443908691406, "logps/ref_rejected": -83.34574890136719, "logps/rejected": -108.11927795410156, "loss": 0.7689, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2712000906467438, "margin_dpo/beta_margin_grad_std": 0.2246740311384201, "margin_dpo/beta_margin_mean": 1.73348069190979, "margin_dpo/loss_margin_mean": 17.334806442260742, "margin_dpo/margin_mean": 17.334806442260742, "margin_dpo/margin_std": 21.838268280029297, "step": 169 }, { "epoch": 0.24963289280469897, "grad_norm": 44.05381774902344, "learning_rate": 4.6777824852166437e-07, "logits/chosen": -0.6359131336212158, "logits/rejected": -0.6122620105743408, "logps/chosen": -55.612205505371094, "logps/ref_chosen": -49.471771240234375, "logps/ref_rejected": -75.91734313964844, "logps/rejected": -103.55111694335938, "loss": 0.5703, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2044505774974823, "margin_dpo/beta_margin_grad_std": 0.19848260283470154, "margin_dpo/beta_margin_mean": 2.149333953857422, "margin_dpo/loss_margin_mean": 21.49333953857422, "margin_dpo/margin_mean": 21.49333953857422, "margin_dpo/margin_std": 18.57598876953125, "step": 170 }, { "epoch": 0.2511013215859031, "grad_norm": 62.48033142089844, "learning_rate": 4.6714516072235273e-07, "logits/chosen": -0.6769453883171082, "logits/rejected": -0.6254656314849854, "logps/chosen": -92.27412414550781, "logps/ref_chosen": -84.49931335449219, "logps/ref_rejected": -109.38209533691406, "logps/rejected": -135.86390686035156, "loss": 0.6447, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23198726773262024, "margin_dpo/beta_margin_grad_std": 0.21067272126674652, "margin_dpo/beta_margin_mean": 1.8707005977630615, "margin_dpo/loss_margin_mean": 18.70700454711914, "margin_dpo/margin_mean": 18.70700454711914, "margin_dpo/margin_std": 19.6763916015625, "step": 171 }, { "epoch": 0.2525697503671072, "grad_norm": 65.7638168334961, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -0.7173855304718018, "logits/rejected": -0.6745563745498657, "logps/chosen": -78.89846801757812, "logps/ref_chosen": -68.65391540527344, "logps/ref_rejected": -85.43667602539062, "logps/rejected": -113.78555297851562, "loss": 0.7006, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23992997407913208, "margin_dpo/beta_margin_grad_std": 0.2309492528438568, "margin_dpo/beta_margin_mean": 1.8104331493377686, "margin_dpo/loss_margin_mean": 18.104331970214844, "margin_dpo/margin_mean": 18.104331970214844, "margin_dpo/margin_std": 18.842220306396484, "step": 172 }, { "epoch": 0.2540381791483113, "grad_norm": 49.16233444213867, "learning_rate": 4.6586183602616687e-07, "logits/chosen": -0.7287572026252747, "logits/rejected": -0.673369288444519, "logps/chosen": -70.7244873046875, "logps/ref_chosen": -63.050872802734375, "logps/ref_rejected": -78.68392944335938, "logps/rejected": -104.60391998291016, "loss": 0.6261, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22973671555519104, "margin_dpo/beta_margin_grad_std": 0.2065761685371399, "margin_dpo/beta_margin_mean": 1.824638843536377, "margin_dpo/loss_margin_mean": 18.246387481689453, "margin_dpo/margin_mean": 18.246387481689453, "margin_dpo/margin_std": 17.423324584960938, "step": 173 }, { "epoch": 0.2555066079295154, "grad_norm": 53.320858001708984, "learning_rate": 4.652116329460919e-07, "logits/chosen": -0.6692589521408081, "logits/rejected": -0.669571042060852, "logps/chosen": -61.95579528808594, "logps/ref_chosen": -53.36296844482422, "logps/ref_rejected": -101.91120910644531, "logps/rejected": -128.92747497558594, "loss": 0.6537, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2317759096622467, "margin_dpo/beta_margin_grad_std": 0.21807681024074554, "margin_dpo/beta_margin_mean": 1.842343807220459, "margin_dpo/loss_margin_mean": 18.423437118530273, "margin_dpo/margin_mean": 18.423437118530273, "margin_dpo/margin_std": 17.788700103759766, "step": 174 }, { "epoch": 0.25697503671071953, "grad_norm": 47.415931701660156, "learning_rate": 4.645557588393406e-07, "logits/chosen": -0.6318604946136475, "logits/rejected": -0.6075109243392944, "logps/chosen": -52.48356628417969, "logps/ref_chosen": -45.417762756347656, "logps/ref_rejected": -89.50579833984375, "logps/rejected": -120.28120422363281, "loss": 0.4447, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1811106652021408, "margin_dpo/beta_margin_grad_std": 0.15868915617465973, "margin_dpo/beta_margin_mean": 2.370960235595703, "margin_dpo/loss_margin_mean": 23.7096004486084, "margin_dpo/margin_mean": 23.7096004486084, "margin_dpo/margin_std": 19.452625274658203, "step": 175 }, { "epoch": 0.25844346549192365, "grad_norm": 43.29652786254883, "learning_rate": 4.638942309888058e-07, "logits/chosen": -0.7066097259521484, "logits/rejected": -0.6949463486671448, "logps/chosen": -57.918304443359375, "logps/ref_chosen": -50.45283889770508, "logps/ref_rejected": -95.55896759033203, "logps/rejected": -124.98609924316406, "loss": 0.4807, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18598869442939758, "margin_dpo/beta_margin_grad_std": 0.1866185963153839, "margin_dpo/beta_margin_mean": 2.196166515350342, "margin_dpo/loss_margin_mean": 21.96166229248047, "margin_dpo/margin_mean": 21.9616641998291, "margin_dpo/margin_std": 17.302711486816406, "step": 176 }, { "epoch": 0.2599118942731278, "grad_norm": 41.06880187988281, "learning_rate": 4.6322706682636137e-07, "logits/chosen": -0.6657878160476685, "logits/rejected": -0.6222826242446899, "logps/chosen": -70.47785186767578, "logps/ref_chosen": -61.21646499633789, "logps/ref_rejected": -95.89378356933594, "logps/rejected": -127.02268981933594, "loss": 0.5421, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2077367603778839, "margin_dpo/beta_margin_grad_std": 0.19527243077754974, "margin_dpo/beta_margin_mean": 2.1867523193359375, "margin_dpo/loss_margin_mean": 21.867523193359375, "margin_dpo/margin_mean": 21.867523193359375, "margin_dpo/margin_std": 20.43021011352539, "step": 177 }, { "epoch": 0.26138032305433184, "grad_norm": 56.99783706665039, "learning_rate": 4.6255428393240354e-07, "logits/chosen": -0.6359624862670898, "logits/rejected": -0.6415029764175415, "logps/chosen": -70.54104614257812, "logps/ref_chosen": -58.26478958129883, "logps/ref_rejected": -105.36532592773438, "logps/rejected": -141.9470672607422, "loss": 0.5463, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1983056217432022, "margin_dpo/beta_margin_grad_std": 0.21389643847942352, "margin_dpo/beta_margin_mean": 2.4305472373962402, "margin_dpo/loss_margin_mean": 24.30547332763672, "margin_dpo/margin_mean": 24.30547332763672, "margin_dpo/margin_std": 22.445331573486328, "step": 178 }, { "epoch": 0.26284875183553597, "grad_norm": 71.98530578613281, "learning_rate": 4.6187590003538724e-07, "logits/chosen": -0.6545614004135132, "logits/rejected": -0.629997730255127, "logps/chosen": -72.10971069335938, "logps/ref_chosen": -61.05832290649414, "logps/ref_rejected": -90.52782440185547, "logps/rejected": -126.18659973144531, "loss": 0.672, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21123456954956055, "margin_dpo/beta_margin_grad_std": 0.24784591794013977, "margin_dpo/beta_margin_mean": 2.460737705230713, "margin_dpo/loss_margin_mean": 24.607376098632812, "margin_dpo/margin_mean": 24.607376098632812, "margin_dpo/margin_std": 23.464365005493164, "step": 179 }, { "epoch": 0.2643171806167401, "grad_norm": 45.19013214111328, "learning_rate": 4.611919330113591e-07, "logits/chosen": -0.646220862865448, "logits/rejected": -0.6155471801757812, "logps/chosen": -62.99604034423828, "logps/ref_chosen": -54.34272003173828, "logps/ref_rejected": -98.21183776855469, "logps/rejected": -131.40283203125, "loss": 0.4572, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17428556084632874, "margin_dpo/beta_margin_grad_std": 0.19251887500286102, "margin_dpo/beta_margin_mean": 2.453767776489258, "margin_dpo/loss_margin_mean": 24.53767967224121, "margin_dpo/margin_mean": 24.53767967224121, "margin_dpo/margin_std": 19.19011688232422, "step": 180 }, { "epoch": 0.2657856093979442, "grad_norm": 53.57444381713867, "learning_rate": 4.605024008834863e-07, "logits/chosen": -0.684493899345398, "logits/rejected": -0.6450868844985962, "logps/chosen": -63.24270248413086, "logps/ref_chosen": -55.000457763671875, "logps/ref_rejected": -61.656166076660156, "logps/rejected": -87.48631286621094, "loss": 0.6668, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24293041229248047, "margin_dpo/beta_margin_grad_std": 0.21182817220687866, "margin_dpo/beta_margin_mean": 1.7587906122207642, "margin_dpo/loss_margin_mean": 17.587905883789062, "margin_dpo/margin_mean": 17.587905883789062, "margin_dpo/margin_std": 18.199848175048828, "step": 181 }, { "epoch": 0.26725403817914833, "grad_norm": 58.0034065246582, "learning_rate": 4.598073218215817e-07, "logits/chosen": -0.6252259016036987, "logits/rejected": -0.6054234504699707, "logps/chosen": -50.210166931152344, "logps/ref_chosen": -41.10784912109375, "logps/ref_rejected": -89.5215835571289, "logps/rejected": -125.56619262695312, "loss": 0.5722, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18403324484825134, "margin_dpo/beta_margin_grad_std": 0.22924652695655823, "margin_dpo/beta_margin_mean": 2.6942296028137207, "margin_dpo/loss_margin_mean": 26.94229507446289, "margin_dpo/margin_mean": 26.942296981811523, "margin_dpo/margin_std": 23.49362564086914, "step": 182 }, { "epoch": 0.2687224669603524, "grad_norm": 90.1989974975586, "learning_rate": 4.5910671414162484e-07, "logits/chosen": -0.664011538028717, "logits/rejected": -0.612758994102478, "logps/chosen": -69.1800765991211, "logps/ref_chosen": -57.524559020996094, "logps/ref_rejected": -75.97572326660156, "logps/rejected": -108.55187225341797, "loss": 0.5191, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2063201367855072, "margin_dpo/beta_margin_grad_std": 0.16726571321487427, "margin_dpo/beta_margin_mean": 2.0920636653900146, "margin_dpo/loss_margin_mean": 20.920635223388672, "margin_dpo/margin_mean": 20.920637130737305, "margin_dpo/margin_std": 16.628860473632812, "step": 183 }, { "epoch": 0.2701908957415565, "grad_norm": 63.455596923828125, "learning_rate": 4.5840059630527985e-07, "logits/chosen": -0.6686521768569946, "logits/rejected": -0.658584713935852, "logps/chosen": -67.59618377685547, "logps/ref_chosen": -58.544952392578125, "logps/ref_rejected": -76.63406372070312, "logps/rejected": -101.26787567138672, "loss": 0.7089, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2659500241279602, "margin_dpo/beta_margin_grad_std": 0.19632551074028015, "margin_dpo/beta_margin_mean": 1.5582584142684937, "margin_dpo/loss_margin_mean": 15.582584381103516, "margin_dpo/margin_mean": 15.582584381103516, "margin_dpo/margin_std": 17.532541275024414, "step": 184 }, { "epoch": 0.27165932452276065, "grad_norm": 59.450496673583984, "learning_rate": 4.5768898691940836e-07, "logits/chosen": -0.6913318634033203, "logits/rejected": -0.6381030678749084, "logps/chosen": -71.63302612304688, "logps/ref_chosen": -62.02584457397461, "logps/ref_rejected": -73.76260375976562, "logps/rejected": -98.8121337890625, "loss": 0.8254, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2810859680175781, "margin_dpo/beta_margin_grad_std": 0.23385247588157654, "margin_dpo/beta_margin_mean": 1.5442359447479248, "margin_dpo/loss_margin_mean": 15.442358016967773, "margin_dpo/margin_mean": 15.442358016967773, "margin_dpo/margin_std": 18.860673904418945, "step": 185 }, { "epoch": 0.27312775330396477, "grad_norm": 42.19928741455078, "learning_rate": 4.5697190473557947e-07, "logits/chosen": -0.6911958456039429, "logits/rejected": -0.635596752166748, "logps/chosen": -79.12220764160156, "logps/ref_chosen": -69.35346984863281, "logps/ref_rejected": -88.07244873046875, "logps/rejected": -123.28886413574219, "loss": 0.3942, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15830203890800476, "margin_dpo/beta_margin_grad_std": 0.16447928547859192, "margin_dpo/beta_margin_mean": 2.544766902923584, "margin_dpo/loss_margin_mean": 25.447669982910156, "margin_dpo/margin_mean": 25.447669982910156, "margin_dpo/margin_std": 18.903629302978516, "step": 186 }, { "epoch": 0.2745961820851689, "grad_norm": 66.2356185913086, "learning_rate": 4.5624936864957555e-07, "logits/chosen": -0.6642763614654541, "logits/rejected": -0.6538236141204834, "logps/chosen": -64.20773315429688, "logps/ref_chosen": -52.75646209716797, "logps/ref_rejected": -81.96910095214844, "logps/rejected": -112.33505249023438, "loss": 0.7275, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.24081790447235107, "margin_dpo/beta_margin_grad_std": 0.22335243225097656, "margin_dpo/beta_margin_mean": 1.8914674520492554, "margin_dpo/loss_margin_mean": 18.914674758911133, "margin_dpo/margin_mean": 18.914674758911133, "margin_dpo/margin_std": 20.249263763427734, "step": 187 }, { "epoch": 0.27606461086637296, "grad_norm": 57.380775451660156, "learning_rate": 4.5552139770089454e-07, "logits/chosen": -0.6732475757598877, "logits/rejected": -0.655211329460144, "logps/chosen": -58.014007568359375, "logps/ref_chosen": -49.415489196777344, "logps/ref_rejected": -89.54043579101562, "logps/rejected": -119.85971069335938, "loss": 0.5978, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20978838205337524, "margin_dpo/beta_margin_grad_std": 0.21329578757286072, "margin_dpo/beta_margin_mean": 2.1720757484436035, "margin_dpo/loss_margin_mean": 21.72075653076172, "margin_dpo/margin_mean": 21.72075653076172, "margin_dpo/margin_std": 20.222156524658203, "step": 188 }, { "epoch": 0.2775330396475771, "grad_norm": 59.92390441894531, "learning_rate": 4.5478801107224794e-07, "logits/chosen": -0.6710000038146973, "logits/rejected": -0.6149500012397766, "logps/chosen": -60.63805389404297, "logps/ref_chosen": -52.39896011352539, "logps/ref_rejected": -72.16735076904297, "logps/rejected": -102.7850341796875, "loss": 0.5266, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19314193725585938, "margin_dpo/beta_margin_grad_std": 0.18688946962356567, "margin_dpo/beta_margin_mean": 2.2378592491149902, "margin_dpo/loss_margin_mean": 22.378591537475586, "margin_dpo/margin_mean": 22.378589630126953, "margin_dpo/margin_std": 19.83676528930664, "step": 189 }, { "epoch": 0.2790014684287812, "grad_norm": 49.65851974487305, "learning_rate": 4.5404922808905543e-07, "logits/chosen": -0.6853049993515015, "logits/rejected": -0.6321940422058105, "logps/chosen": -73.16799926757812, "logps/ref_chosen": -64.68305969238281, "logps/ref_rejected": -102.55052185058594, "logps/rejected": -134.60731506347656, "loss": 0.5028, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19292610883712769, "margin_dpo/beta_margin_grad_std": 0.1859111189842224, "margin_dpo/beta_margin_mean": 2.357185125350952, "margin_dpo/loss_margin_mean": 23.57185173034668, "margin_dpo/margin_mean": 23.571849822998047, "margin_dpo/margin_std": 20.523746490478516, "step": 190 }, { "epoch": 0.28046989720998533, "grad_norm": 34.40500259399414, "learning_rate": 4.5330506821893565e-07, "logits/chosen": -0.6436042785644531, "logits/rejected": -0.6251427531242371, "logps/chosen": -75.3275146484375, "logps/ref_chosen": -68.65887451171875, "logps/ref_rejected": -110.1396713256836, "logps/rejected": -144.67477416992188, "loss": 0.4349, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16417238116264343, "margin_dpo/beta_margin_grad_std": 0.18410329520702362, "margin_dpo/beta_margin_mean": 2.7866461277008057, "margin_dpo/loss_margin_mean": 27.866458892822266, "margin_dpo/margin_mean": 27.866458892822266, "margin_dpo/margin_std": 24.24535369873047, "step": 191 }, { "epoch": 0.28193832599118945, "grad_norm": 54.248565673828125, "learning_rate": 4.5255555107119336e-07, "logits/chosen": -0.6639034748077393, "logits/rejected": -0.6402075290679932, "logps/chosen": -80.83169555664062, "logps/ref_chosen": -69.72691345214844, "logps/ref_rejected": -103.32135009765625, "logps/rejected": -138.1024627685547, "loss": 0.5458, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20180875062942505, "margin_dpo/beta_margin_grad_std": 0.21217647194862366, "margin_dpo/beta_margin_mean": 2.367633819580078, "margin_dpo/loss_margin_mean": 23.67633628845215, "margin_dpo/margin_mean": 23.67633819580078, "margin_dpo/margin_std": 21.766090393066406, "step": 192 }, { "epoch": 0.2834067547723935, "grad_norm": 72.17058563232422, "learning_rate": 4.5180069639630236e-07, "logits/chosen": -0.7117282152175903, "logits/rejected": -0.6623973846435547, "logps/chosen": -71.19960021972656, "logps/ref_chosen": -60.19049835205078, "logps/ref_rejected": -76.40755462646484, "logps/rejected": -104.30625915527344, "loss": 0.7828, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23411771655082703, "margin_dpo/beta_margin_grad_std": 0.23166634142398834, "margin_dpo/beta_margin_mean": 1.6889591217041016, "margin_dpo/loss_margin_mean": 16.889591217041016, "margin_dpo/margin_mean": 16.889591217041016, "margin_dpo/margin_std": 17.94813346862793, "step": 193 }, { "epoch": 0.28487518355359764, "grad_norm": 36.26988983154297, "learning_rate": 4.510405240853854e-07, "logits/chosen": -0.6696387529373169, "logits/rejected": -0.6456412076950073, "logps/chosen": -45.34961700439453, "logps/ref_chosen": -37.84037399291992, "logps/ref_rejected": -60.684783935546875, "logps/rejected": -90.98521423339844, "loss": 0.5049, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1981307864189148, "margin_dpo/beta_margin_grad_std": 0.1864275336265564, "margin_dpo/beta_margin_mean": 2.279118776321411, "margin_dpo/loss_margin_mean": 22.791187286376953, "margin_dpo/margin_mean": 22.791187286376953, "margin_dpo/margin_std": 19.600643157958984, "step": 194 }, { "epoch": 0.28634361233480177, "grad_norm": 52.49971008300781, "learning_rate": 4.5027505416968985e-07, "logits/chosen": -0.6493013501167297, "logits/rejected": -0.646949291229248, "logps/chosen": -66.55081939697266, "logps/ref_chosen": -54.891571044921875, "logps/ref_rejected": -96.77095794677734, "logps/rejected": -130.78152465820312, "loss": 0.4842, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18146619200706482, "margin_dpo/beta_margin_grad_std": 0.19031144678592682, "margin_dpo/beta_margin_mean": 2.2351322174072266, "margin_dpo/loss_margin_mean": 22.351322174072266, "margin_dpo/margin_mean": 22.351322174072266, "margin_dpo/margin_std": 18.570903778076172, "step": 195 }, { "epoch": 0.2878120411160059, "grad_norm": 42.67164993286133, "learning_rate": 4.495043068200599e-07, "logits/chosen": -0.6235789060592651, "logits/rejected": -0.5861127972602844, "logps/chosen": -61.65058135986328, "logps/ref_chosen": -53.245243072509766, "logps/ref_rejected": -76.05294799804688, "logps/rejected": -111.39912414550781, "loss": 0.442, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16935831308364868, "margin_dpo/beta_margin_grad_std": 0.18017151951789856, "margin_dpo/beta_margin_mean": 2.6940836906433105, "margin_dpo/loss_margin_mean": 26.940837860107422, "margin_dpo/margin_mean": 26.940837860107422, "margin_dpo/margin_std": 22.081172943115234, "step": 196 }, { "epoch": 0.28928046989721, "grad_norm": 47.60453414916992, "learning_rate": 4.4872830234640493e-07, "logits/chosen": -0.6239949464797974, "logits/rejected": -0.6122224926948547, "logps/chosen": -69.72303009033203, "logps/ref_chosen": -60.42033767700195, "logps/ref_rejected": -77.20890808105469, "logps/rejected": -107.96568298339844, "loss": 0.5113, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19921091198921204, "margin_dpo/beta_margin_grad_std": 0.17500488460063934, "margin_dpo/beta_margin_mean": 2.1454074382781982, "margin_dpo/loss_margin_mean": 21.45407485961914, "margin_dpo/margin_mean": 21.45407485961914, "margin_dpo/margin_std": 18.467487335205078, "step": 197 }, { "epoch": 0.2907488986784141, "grad_norm": 52.930660247802734, "learning_rate": 4.479470611971645e-07, "logits/chosen": -0.651822566986084, "logits/rejected": -0.6399349570274353, "logps/chosen": -64.81780242919922, "logps/ref_chosen": -55.03618621826172, "logps/ref_rejected": -97.24325561523438, "logps/rejected": -130.20297241210938, "loss": 0.5069, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19629308581352234, "margin_dpo/beta_margin_grad_std": 0.18192753195762634, "margin_dpo/beta_margin_mean": 2.3178110122680664, "margin_dpo/loss_margin_mean": 23.17810821533203, "margin_dpo/margin_mean": 23.17810821533203, "margin_dpo/margin_std": 22.717308044433594, "step": 198 }, { "epoch": 0.2922173274596182, "grad_norm": 54.65925216674805, "learning_rate": 4.471606039587695e-07, "logits/chosen": -0.6595567464828491, "logits/rejected": -0.6411717534065247, "logps/chosen": -66.6902847290039, "logps/ref_chosen": -56.828826904296875, "logps/ref_rejected": -84.64820861816406, "logps/rejected": -117.4836654663086, "loss": 0.5925, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21100641787052155, "margin_dpo/beta_margin_grad_std": 0.21913698315620422, "margin_dpo/beta_margin_mean": 2.2973995208740234, "margin_dpo/loss_margin_mean": 22.973995208740234, "margin_dpo/margin_mean": 22.973995208740234, "margin_dpo/margin_std": 22.114442825317383, "step": 199 }, { "epoch": 0.2936857562408223, "grad_norm": 69.58685302734375, "learning_rate": 4.4636895135509966e-07, "logits/chosen": -0.6506587862968445, "logits/rejected": -0.6131795644760132, "logps/chosen": -64.16134643554688, "logps/ref_chosen": -53.06706237792969, "logps/ref_rejected": -80.60843658447266, "logps/rejected": -116.44496154785156, "loss": 0.6607, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21007482707500458, "margin_dpo/beta_margin_grad_std": 0.22120529413223267, "margin_dpo/beta_margin_mean": 2.4742238521575928, "margin_dpo/loss_margin_mean": 24.742237091064453, "margin_dpo/margin_mean": 24.742237091064453, "margin_dpo/margin_std": 25.760879516601562, "step": 200 }, { "epoch": 0.2936857562408223, "eval_logits/chosen": -0.6426228284835815, "eval_logits/rejected": -0.6196746826171875, "eval_logps/chosen": -92.99790954589844, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -115.42741394042969, "eval_loss": 0.4791145920753479, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.31089797616004944, "eval_margin_dpo/beta_margin_grad_std": 0.24731215834617615, "eval_margin_dpo/beta_margin_mean": 1.4682612419128418, "eval_margin_dpo/loss_margin_mean": 14.682612419128418, "eval_margin_dpo/margin_mean": 14.682612419128418, "eval_margin_dpo/margin_std": 21.162776947021484, "eval_runtime": 39.8807, "eval_samples_per_second": 58.65, "eval_steps_per_second": 1.856, "step": 200 }, { "epoch": 0.29515418502202645, "grad_norm": 53.58722686767578, "learning_rate": 4.455721242469372e-07, "logits/chosen": -0.6264992356300354, "logits/rejected": -0.5962362289428711, "logps/chosen": -83.7801513671875, "logps/ref_chosen": -75.4022216796875, "logps/ref_rejected": -114.80821990966797, "logps/rejected": -148.85873413085938, "loss": 0.5285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1883956640958786, "margin_dpo/beta_margin_grad_std": 0.21361252665519714, "margin_dpo/beta_margin_mean": 2.567258834838867, "margin_dpo/loss_margin_mean": 25.672588348388672, "margin_dpo/margin_mean": 25.67258644104004, "margin_dpo/margin_std": 23.231678009033203, "step": 201 }, { "epoch": 0.2966226138032305, "grad_norm": 72.63829803466797, "learning_rate": 4.4477014363141755e-07, "logits/chosen": -0.6617487668991089, "logits/rejected": -0.6499176025390625, "logps/chosen": -60.97098922729492, "logps/ref_chosen": -50.101318359375, "logps/ref_rejected": -86.98503112792969, "logps/rejected": -116.73014831542969, "loss": 0.7735, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2517499327659607, "margin_dpo/beta_margin_grad_std": 0.24596986174583435, "margin_dpo/beta_margin_mean": 1.8875447511672974, "margin_dpo/loss_margin_mean": 18.875446319580078, "margin_dpo/margin_mean": 18.875446319580078, "margin_dpo/margin_std": 21.153244018554688, "step": 202 }, { "epoch": 0.29809104258443464, "grad_norm": 44.909637451171875, "learning_rate": 4.439630306414758e-07, "logits/chosen": -0.6598186492919922, "logits/rejected": -0.6150715351104736, "logps/chosen": -68.689453125, "logps/ref_chosen": -60.60969543457031, "logps/ref_rejected": -85.89596557617188, "logps/rejected": -114.92884826660156, "loss": 0.5325, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20222759246826172, "margin_dpo/beta_margin_grad_std": 0.1946883499622345, "margin_dpo/beta_margin_mean": 2.0953121185302734, "margin_dpo/loss_margin_mean": 20.953121185302734, "margin_dpo/margin_mean": 20.953121185302734, "margin_dpo/margin_std": 18.274738311767578, "step": 203 }, { "epoch": 0.29955947136563876, "grad_norm": 47.8790283203125, "learning_rate": 4.431508065452897e-07, "logits/chosen": -0.6964120864868164, "logits/rejected": -0.643916130065918, "logps/chosen": -89.8682861328125, "logps/ref_chosen": -80.16496276855469, "logps/ref_rejected": -87.69590759277344, "logps/rejected": -119.29830932617188, "loss": 0.522, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19900324940681458, "margin_dpo/beta_margin_grad_std": 0.1881289929151535, "margin_dpo/beta_margin_mean": 2.1899077892303467, "margin_dpo/loss_margin_mean": 21.899078369140625, "margin_dpo/margin_mean": 21.899078369140625, "margin_dpo/margin_std": 19.764657974243164, "step": 204 }, { "epoch": 0.3010279001468429, "grad_norm": 66.0072021484375, "learning_rate": 4.4233349274571974e-07, "logits/chosen": -0.7408896684646606, "logits/rejected": -0.7029110193252563, "logps/chosen": -70.63681030273438, "logps/ref_chosen": -59.384735107421875, "logps/ref_rejected": -85.12505340576172, "logps/rejected": -120.17679595947266, "loss": 0.636, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21140334010124207, "margin_dpo/beta_margin_grad_std": 0.22996652126312256, "margin_dpo/beta_margin_mean": 2.3799662590026855, "margin_dpo/loss_margin_mean": 23.799659729003906, "margin_dpo/margin_mean": 23.79966163635254, "margin_dpo/margin_std": 23.010438919067383, "step": 205 }, { "epoch": 0.302496328928047, "grad_norm": 45.65394592285156, "learning_rate": 4.415111107797445e-07, "logits/chosen": -0.650726318359375, "logits/rejected": -0.6484321355819702, "logps/chosen": -57.730751037597656, "logps/ref_chosen": -46.964500427246094, "logps/ref_rejected": -98.9534912109375, "logps/rejected": -136.9589080810547, "loss": 0.4357, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16246598958969116, "margin_dpo/beta_margin_grad_std": 0.19966889917850494, "margin_dpo/beta_margin_mean": 2.723916530609131, "margin_dpo/loss_margin_mean": 27.239166259765625, "margin_dpo/margin_mean": 27.239164352416992, "margin_dpo/margin_std": 21.334924697875977, "step": 206 }, { "epoch": 0.3039647577092511, "grad_norm": 52.66937255859375, "learning_rate": 4.4068368231789365e-07, "logits/chosen": -0.726109504699707, "logits/rejected": -0.70166015625, "logps/chosen": -64.216064453125, "logps/ref_chosen": -56.05625915527344, "logps/ref_rejected": -84.44779968261719, "logps/rejected": -120.97300720214844, "loss": 0.4566, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17621780931949615, "margin_dpo/beta_margin_grad_std": 0.18753397464752197, "margin_dpo/beta_margin_mean": 2.8365397453308105, "margin_dpo/loss_margin_mean": 28.36539649963379, "margin_dpo/margin_mean": 28.365394592285156, "margin_dpo/margin_std": 25.557231903076172, "step": 207 }, { "epoch": 0.3054331864904552, "grad_norm": 49.823787689208984, "learning_rate": 4.398512291636768e-07, "logits/chosen": -0.6845159530639648, "logits/rejected": -0.6554762125015259, "logps/chosen": -79.41966247558594, "logps/ref_chosen": -67.06761169433594, "logps/ref_rejected": -94.28689575195312, "logps/rejected": -129.97105407714844, "loss": 0.4757, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18872755765914917, "margin_dpo/beta_margin_grad_std": 0.1753954291343689, "margin_dpo/beta_margin_mean": 2.333209991455078, "margin_dpo/loss_margin_mean": 23.33209991455078, "margin_dpo/margin_mean": 23.33209991455078, "margin_dpo/margin_std": 20.38248062133789, "step": 208 }, { "epoch": 0.3069016152716593, "grad_norm": 56.87575912475586, "learning_rate": 4.3901377325300857e-07, "logits/chosen": -0.6603403687477112, "logits/rejected": -0.634404182434082, "logps/chosen": -65.81916809082031, "logps/ref_chosen": -56.18169403076172, "logps/ref_rejected": -80.94152069091797, "logps/rejected": -114.23938751220703, "loss": 0.6082, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2075938880443573, "margin_dpo/beta_margin_grad_std": 0.23247569799423218, "margin_dpo/beta_margin_mean": 2.366039752960205, "margin_dpo/loss_margin_mean": 23.660396575927734, "margin_dpo/margin_mean": 23.660396575927734, "margin_dpo/margin_std": 21.86727523803711, "step": 209 }, { "epoch": 0.30837004405286345, "grad_norm": 47.36671447753906, "learning_rate": 4.381713366536311e-07, "logits/chosen": -0.6590738296508789, "logits/rejected": -0.6278238296508789, "logps/chosen": -56.26042175292969, "logps/ref_chosen": -46.371822357177734, "logps/ref_rejected": -76.68162536621094, "logps/rejected": -108.72216796875, "loss": 0.5044, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1963951587677002, "margin_dpo/beta_margin_grad_std": 0.17726612091064453, "margin_dpo/beta_margin_mean": 2.2151942253112793, "margin_dpo/loss_margin_mean": 22.15194320678711, "margin_dpo/margin_mean": 22.15194320678711, "margin_dpo/margin_std": 18.77596664428711, "step": 210 }, { "epoch": 0.30983847283406757, "grad_norm": 60.94392776489258, "learning_rate": 4.373239415645323e-07, "logits/chosen": -0.6643211841583252, "logits/rejected": -0.6076558232307434, "logps/chosen": -91.72789764404297, "logps/ref_chosen": -78.93235778808594, "logps/ref_rejected": -86.82098388671875, "logps/rejected": -122.42787170410156, "loss": 0.5933, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2022809386253357, "margin_dpo/beta_margin_grad_std": 0.2218504548072815, "margin_dpo/beta_margin_mean": 2.2811341285705566, "margin_dpo/loss_margin_mean": 22.81134033203125, "margin_dpo/margin_mean": 22.81134033203125, "margin_dpo/margin_std": 20.594642639160156, "step": 211 }, { "epoch": 0.31130690161527164, "grad_norm": 55.531829833984375, "learning_rate": 4.3647161031536086e-07, "logits/chosen": -0.6789622902870178, "logits/rejected": -0.6477820873260498, "logps/chosen": -69.9522705078125, "logps/ref_chosen": -58.19701385498047, "logps/ref_rejected": -103.05784606933594, "logps/rejected": -143.61941528320312, "loss": 0.4546, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17285102605819702, "margin_dpo/beta_margin_grad_std": 0.19285747408866882, "margin_dpo/beta_margin_mean": 2.8806304931640625, "margin_dpo/loss_margin_mean": 28.806303024291992, "margin_dpo/margin_mean": 28.806303024291992, "margin_dpo/margin_std": 24.59879493713379, "step": 212 }, { "epoch": 0.31277533039647576, "grad_norm": 53.27421951293945, "learning_rate": 4.3561436536583774e-07, "logits/chosen": -0.6767191886901855, "logits/rejected": -0.6345890760421753, "logps/chosen": -77.456298828125, "logps/ref_chosen": -67.51271057128906, "logps/ref_rejected": -93.91471862792969, "logps/rejected": -132.91636657714844, "loss": 0.4835, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1712796837091446, "margin_dpo/beta_margin_grad_std": 0.19696003198623657, "margin_dpo/beta_margin_mean": 2.905806303024292, "margin_dpo/loss_margin_mean": 29.058063507080078, "margin_dpo/margin_mean": 29.058061599731445, "margin_dpo/margin_std": 25.737838745117188, "step": 213 }, { "epoch": 0.3142437591776799, "grad_norm": 63.7830696105957, "learning_rate": 4.3475222930516473e-07, "logits/chosen": -0.6705787181854248, "logits/rejected": -0.6549390554428101, "logps/chosen": -52.265167236328125, "logps/ref_chosen": -41.604888916015625, "logps/ref_rejected": -77.51741027832031, "logps/rejected": -111.19293212890625, "loss": 0.6303, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21721556782722473, "margin_dpo/beta_margin_grad_std": 0.21066680550575256, "margin_dpo/beta_margin_mean": 2.3015246391296387, "margin_dpo/loss_margin_mean": 23.015247344970703, "margin_dpo/margin_mean": 23.015247344970703, "margin_dpo/margin_std": 24.095165252685547, "step": 214 }, { "epoch": 0.315712187958884, "grad_norm": 53.305728912353516, "learning_rate": 4.3388522485142885e-07, "logits/chosen": -0.6503559350967407, "logits/rejected": -0.6214190721511841, "logps/chosen": -63.870384216308594, "logps/ref_chosen": -53.279266357421875, "logps/ref_rejected": -89.96464538574219, "logps/rejected": -125.34566497802734, "loss": 0.5335, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1932601034641266, "margin_dpo/beta_margin_grad_std": 0.2013830840587616, "margin_dpo/beta_margin_mean": 2.478990077972412, "margin_dpo/loss_margin_mean": 24.789897918701172, "margin_dpo/margin_mean": 24.789899826049805, "margin_dpo/margin_std": 23.55270767211914, "step": 215 }, { "epoch": 0.31718061674008813, "grad_norm": 62.44280242919922, "learning_rate": 4.330133748510036e-07, "logits/chosen": -0.6717164516448975, "logits/rejected": -0.639351487159729, "logps/chosen": -61.949493408203125, "logps/ref_chosen": -48.887794494628906, "logps/ref_rejected": -77.19892883300781, "logps/rejected": -117.35563659667969, "loss": 0.5894, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20755264163017273, "margin_dpo/beta_margin_grad_std": 0.22565214335918427, "margin_dpo/beta_margin_mean": 2.709501266479492, "margin_dpo/loss_margin_mean": 27.095012664794922, "margin_dpo/margin_mean": 27.095012664794922, "margin_dpo/margin_std": 26.17487335205078, "step": 216 }, { "epoch": 0.3186490455212922, "grad_norm": 42.385009765625, "learning_rate": 4.3213670227794757e-07, "logits/chosen": -0.6868363618850708, "logits/rejected": -0.6442810893058777, "logps/chosen": -60.769012451171875, "logps/ref_chosen": -49.845306396484375, "logps/ref_rejected": -100.07832336425781, "logps/rejected": -138.9866943359375, "loss": 0.4008, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1490134596824646, "margin_dpo/beta_margin_grad_std": 0.18446922302246094, "margin_dpo/beta_margin_mean": 2.798466205596924, "margin_dpo/loss_margin_mean": 27.984663009643555, "margin_dpo/margin_mean": 27.984663009643555, "margin_dpo/margin_std": 21.431766510009766, "step": 217 }, { "epoch": 0.3201174743024963, "grad_norm": 60.58324432373047, "learning_rate": 4.3125523023339815e-07, "logits/chosen": -0.659305214881897, "logits/rejected": -0.6280574798583984, "logps/chosen": -69.96195983886719, "logps/ref_chosen": -58.576683044433594, "logps/ref_rejected": -87.84639739990234, "logps/rejected": -124.12518310546875, "loss": 0.5344, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1986551582813263, "margin_dpo/beta_margin_grad_std": 0.19974969327449799, "margin_dpo/beta_margin_mean": 2.4893507957458496, "margin_dpo/loss_margin_mean": 24.893508911132812, "margin_dpo/margin_mean": 24.893508911132812, "margin_dpo/margin_std": 23.56732177734375, "step": 218 }, { "epoch": 0.32158590308370044, "grad_norm": 59.97367858886719, "learning_rate": 4.303689819449636e-07, "logits/chosen": -0.6612948775291443, "logits/rejected": -0.6297075152397156, "logps/chosen": -72.49765014648438, "logps/ref_chosen": -61.083858489990234, "logps/ref_rejected": -85.83042907714844, "logps/rejected": -119.2388687133789, "loss": 0.5264, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19783681631088257, "margin_dpo/beta_margin_grad_std": 0.18227900564670563, "margin_dpo/beta_margin_mean": 2.1994645595550537, "margin_dpo/loss_margin_mean": 21.994646072387695, "margin_dpo/margin_mean": 21.994644165039062, "margin_dpo/margin_std": 19.54847526550293, "step": 219 }, { "epoch": 0.32305433186490456, "grad_norm": 47.776611328125, "learning_rate": 4.2947798076611047e-07, "logits/chosen": -0.646489679813385, "logits/rejected": -0.605070948600769, "logps/chosen": -81.14366912841797, "logps/ref_chosen": -70.03128051757812, "logps/ref_rejected": -87.68551635742188, "logps/rejected": -119.57649230957031, "loss": 0.5065, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19959807395935059, "margin_dpo/beta_margin_grad_std": 0.17812803387641907, "margin_dpo/beta_margin_mean": 2.077859401702881, "margin_dpo/loss_margin_mean": 20.77859115600586, "margin_dpo/margin_mean": 20.778593063354492, "margin_dpo/margin_std": 16.526872634887695, "step": 220 }, { "epoch": 0.3245227606461087, "grad_norm": 52.991641998291016, "learning_rate": 4.285822501755485e-07, "logits/chosen": -0.6369597315788269, "logits/rejected": -0.6250277757644653, "logps/chosen": -64.38204193115234, "logps/ref_chosen": -52.15470886230469, "logps/ref_rejected": -106.46768188476562, "logps/rejected": -151.52133178710938, "loss": 0.3403, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12470635771751404, "margin_dpo/beta_margin_grad_std": 0.18095463514328003, "margin_dpo/beta_margin_mean": 3.2826321125030518, "margin_dpo/loss_margin_mean": 32.82632064819336, "margin_dpo/margin_mean": 32.82632064819336, "margin_dpo/margin_std": 22.725797653198242, "step": 221 }, { "epoch": 0.32599118942731276, "grad_norm": 72.23493957519531, "learning_rate": 4.276818137766118e-07, "logits/chosen": -0.7019423842430115, "logits/rejected": -0.6659786701202393, "logps/chosen": -74.65852355957031, "logps/ref_chosen": -60.971099853515625, "logps/ref_rejected": -100.00115203857422, "logps/rejected": -139.9058837890625, "loss": 0.5722, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19258952140808105, "margin_dpo/beta_margin_grad_std": 0.22169330716133118, "margin_dpo/beta_margin_mean": 2.621731758117676, "margin_dpo/loss_margin_mean": 26.217315673828125, "margin_dpo/margin_mean": 26.217315673828125, "margin_dpo/margin_std": 24.92258071899414, "step": 222 }, { "epoch": 0.3274596182085169, "grad_norm": 85.80632781982422, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.6935982704162598, "logits/rejected": -0.6445531249046326, "logps/chosen": -68.51580047607422, "logps/ref_chosen": -52.64057922363281, "logps/ref_rejected": -82.82502746582031, "logps/rejected": -120.95681762695312, "loss": 0.7341, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22147008776664734, "margin_dpo/beta_margin_grad_std": 0.2591605484485626, "margin_dpo/beta_margin_mean": 2.225656509399414, "margin_dpo/loss_margin_mean": 22.25656509399414, "margin_dpo/margin_mean": 22.25656509399414, "margin_dpo/margin_std": 22.70517349243164, "step": 223 }, { "epoch": 0.328928046989721, "grad_norm": 72.05899810791016, "learning_rate": 4.2586691858633747e-07, "logits/chosen": -0.6770994663238525, "logits/rejected": -0.6360162496566772, "logps/chosen": -61.67028045654297, "logps/ref_chosen": -48.59540939331055, "logps/ref_rejected": -77.11648559570312, "logps/rejected": -116.93687438964844, "loss": 0.5671, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19068878889083862, "margin_dpo/beta_margin_grad_std": 0.20871135592460632, "margin_dpo/beta_margin_mean": 2.674551486968994, "margin_dpo/loss_margin_mean": 26.745513916015625, "margin_dpo/margin_mean": 26.745513916015625, "margin_dpo/margin_std": 24.828662872314453, "step": 224 }, { "epoch": 0.3303964757709251, "grad_norm": 44.1650390625, "learning_rate": 4.249525076191759e-07, "logits/chosen": -0.6742178201675415, "logits/rejected": -0.6422642469406128, "logps/chosen": -72.62501525878906, "logps/ref_chosen": -58.000465393066406, "logps/ref_rejected": -99.90290832519531, "logps/rejected": -147.4707489013672, "loss": 0.4084, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14956694841384888, "margin_dpo/beta_margin_grad_std": 0.20272037386894226, "margin_dpo/beta_margin_mean": 3.2943289279937744, "margin_dpo/loss_margin_mean": 32.94328689575195, "margin_dpo/margin_mean": 32.94329071044922, "margin_dpo/margin_std": 26.7680606842041, "step": 225 }, { "epoch": 0.33186490455212925, "grad_norm": 52.37092208862305, "learning_rate": 4.2403348649073167e-07, "logits/chosen": -0.6877872943878174, "logits/rejected": -0.6356024146080017, "logps/chosen": -69.44877624511719, "logps/ref_chosen": -58.898799896240234, "logps/ref_rejected": -78.68775939941406, "logps/rejected": -114.89201354980469, "loss": 0.4828, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17930053174495697, "margin_dpo/beta_margin_grad_std": 0.18374590575695038, "margin_dpo/beta_margin_mean": 2.565427780151367, "margin_dpo/loss_margin_mean": 25.654277801513672, "margin_dpo/margin_mean": 25.654277801513672, "margin_dpo/margin_std": 21.256011962890625, "step": 226 }, { "epoch": 0.3333333333333333, "grad_norm": 47.082557678222656, "learning_rate": 4.2310987941806615e-07, "logits/chosen": -0.6890594959259033, "logits/rejected": -0.6593263745307922, "logps/chosen": -70.82875061035156, "logps/ref_chosen": -59.072181701660156, "logps/ref_rejected": -99.41236877441406, "logps/rejected": -142.82723999023438, "loss": 0.4196, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15451833605766296, "margin_dpo/beta_margin_grad_std": 0.19809511303901672, "margin_dpo/beta_margin_mean": 3.1658291816711426, "margin_dpo/loss_margin_mean": 31.65829086303711, "margin_dpo/margin_mean": 31.65829086303711, "margin_dpo/margin_std": 26.29955291748047, "step": 227 }, { "epoch": 0.33480176211453744, "grad_norm": 56.605804443359375, "learning_rate": 4.2218171073908463e-07, "logits/chosen": -0.7011622786521912, "logits/rejected": -0.6703910231590271, "logps/chosen": -78.96180725097656, "logps/ref_chosen": -65.89129638671875, "logps/ref_rejected": -91.04875183105469, "logps/rejected": -128.32534790039062, "loss": 0.5385, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18866901099681854, "margin_dpo/beta_margin_grad_std": 0.21488142013549805, "margin_dpo/beta_margin_mean": 2.4206089973449707, "margin_dpo/loss_margin_mean": 24.20608901977539, "margin_dpo/margin_mean": 24.20608901977539, "margin_dpo/margin_std": 20.969837188720703, "step": 228 }, { "epoch": 0.33627019089574156, "grad_norm": 63.40804672241211, "learning_rate": 4.212490049118951e-07, "logits/chosen": -0.7051106691360474, "logits/rejected": -0.6546026468276978, "logps/chosen": -84.97560119628906, "logps/ref_chosen": -70.70636749267578, "logps/ref_rejected": -84.52740478515625, "logps/rejected": -125.85427856445312, "loss": 0.6019, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19034332036972046, "margin_dpo/beta_margin_grad_std": 0.2341303527355194, "margin_dpo/beta_margin_mean": 2.7057645320892334, "margin_dpo/loss_margin_mean": 27.05764389038086, "margin_dpo/margin_mean": 27.05764389038086, "margin_dpo/margin_std": 25.40169906616211, "step": 229 }, { "epoch": 0.3377386196769457, "grad_norm": 53.56421661376953, "learning_rate": 4.203117865141635e-07, "logits/chosen": -0.6524708271026611, "logits/rejected": -0.6426960229873657, "logps/chosen": -51.435630798339844, "logps/ref_chosen": -39.282005310058594, "logps/ref_rejected": -85.62191009521484, "logps/rejected": -128.23431396484375, "loss": 0.5106, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16138073801994324, "margin_dpo/beta_margin_grad_std": 0.21171000599861145, "margin_dpo/beta_margin_mean": 3.0458788871765137, "margin_dpo/loss_margin_mean": 30.45878791809082, "margin_dpo/margin_mean": 30.45878791809082, "margin_dpo/margin_std": 27.84360694885254, "step": 230 }, { "epoch": 0.3392070484581498, "grad_norm": 42.17887878417969, "learning_rate": 4.1937008024246625e-07, "logits/chosen": -0.6823098063468933, "logits/rejected": -0.6389471292495728, "logps/chosen": -74.67344665527344, "logps/ref_chosen": -63.27644348144531, "logps/ref_rejected": -74.1239013671875, "logps/rejected": -111.6684799194336, "loss": 0.4691, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18743924796581268, "margin_dpo/beta_margin_grad_std": 0.17160117626190186, "margin_dpo/beta_margin_mean": 2.614757537841797, "margin_dpo/loss_margin_mean": 26.14757537841797, "margin_dpo/margin_mean": 26.14757537841797, "margin_dpo/margin_std": 25.1517333984375, "step": 231 }, { "epoch": 0.3406754772393539, "grad_norm": 70.57960510253906, "learning_rate": 4.1842391091163933e-07, "logits/chosen": -0.6195404529571533, "logits/rejected": -0.5588107109069824, "logps/chosen": -84.39373779296875, "logps/ref_chosen": -70.74876403808594, "logps/ref_rejected": -83.97706604003906, "logps/rejected": -118.90908813476562, "loss": 0.6901, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23599335551261902, "margin_dpo/beta_margin_grad_std": 0.24028193950653076, "margin_dpo/beta_margin_mean": 2.1287035942077637, "margin_dpo/loss_margin_mean": 21.287036895751953, "margin_dpo/margin_mean": 21.287036895751953, "margin_dpo/margin_std": 22.360965728759766, "step": 232 }, { "epoch": 0.342143906020558, "grad_norm": 63.01465606689453, "learning_rate": 4.174733034541245e-07, "logits/chosen": -0.6951059103012085, "logits/rejected": -0.6703442931175232, "logps/chosen": -67.92855834960938, "logps/ref_chosen": -54.8829345703125, "logps/ref_rejected": -107.48007202148438, "logps/rejected": -148.419921875, "loss": 0.5636, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18834185600280762, "margin_dpo/beta_margin_grad_std": 0.22877255082130432, "margin_dpo/beta_margin_mean": 2.7894234657287598, "margin_dpo/loss_margin_mean": 27.894235610961914, "margin_dpo/margin_mean": 27.89423370361328, "margin_dpo/margin_std": 26.236351013183594, "step": 233 }, { "epoch": 0.3436123348017621, "grad_norm": 59.044742584228516, "learning_rate": 4.165182829193126e-07, "logits/chosen": -0.6353030204772949, "logits/rejected": -0.6363640427589417, "logps/chosen": -55.01062774658203, "logps/ref_chosen": -44.09451675415039, "logps/ref_rejected": -100.00663757324219, "logps/rejected": -139.08529663085938, "loss": 0.4541, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15148046612739563, "margin_dpo/beta_margin_grad_std": 0.18851938843727112, "margin_dpo/beta_margin_mean": 2.8162550926208496, "margin_dpo/loss_margin_mean": 28.162551879882812, "margin_dpo/margin_mean": 28.16254997253418, "margin_dpo/margin_std": 21.774639129638672, "step": 234 }, { "epoch": 0.34508076358296624, "grad_norm": 64.13523864746094, "learning_rate": 4.1555887447288255e-07, "logits/chosen": -0.6782118082046509, "logits/rejected": -0.6378265619277954, "logps/chosen": -77.53021240234375, "logps/ref_chosen": -62.237911224365234, "logps/ref_rejected": -90.39505767822266, "logps/rejected": -128.7196502685547, "loss": 0.5952, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21459338068962097, "margin_dpo/beta_margin_grad_std": 0.21603041887283325, "margin_dpo/beta_margin_mean": 2.303229808807373, "margin_dpo/loss_margin_mean": 23.03229522705078, "margin_dpo/margin_mean": 23.03229522705078, "margin_dpo/margin_std": 23.028390884399414, "step": 235 }, { "epoch": 0.3465491923641703, "grad_norm": 64.90979766845703, "learning_rate": 4.1459510339613946e-07, "logits/chosen": -0.6209807395935059, "logits/rejected": -0.6189430356025696, "logps/chosen": -60.39289855957031, "logps/ref_chosen": -49.34136199951172, "logps/ref_rejected": -103.51162719726562, "logps/rejected": -139.95445251464844, "loss": 0.5672, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19896173477172852, "margin_dpo/beta_margin_grad_std": 0.22020728886127472, "margin_dpo/beta_margin_mean": 2.5391287803649902, "margin_dpo/loss_margin_mean": 25.391286849975586, "margin_dpo/margin_mean": 25.391284942626953, "margin_dpo/margin_std": 23.486440658569336, "step": 236 }, { "epoch": 0.34801762114537443, "grad_norm": 48.42905807495117, "learning_rate": 4.136269950853473e-07, "logits/chosen": -0.6719942092895508, "logits/rejected": -0.6377314329147339, "logps/chosen": -65.90644836425781, "logps/ref_chosen": -54.168121337890625, "logps/ref_rejected": -94.78036499023438, "logps/rejected": -133.96646118164062, "loss": 0.5145, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17442235350608826, "margin_dpo/beta_margin_grad_std": 0.21036501228809357, "margin_dpo/beta_margin_mean": 2.744776487350464, "margin_dpo/loss_margin_mean": 27.447765350341797, "margin_dpo/margin_mean": 27.447765350341797, "margin_dpo/margin_std": 24.072509765625, "step": 237 }, { "epoch": 0.34948604992657856, "grad_norm": 39.447776794433594, "learning_rate": 4.126545750510605e-07, "logits/chosen": -0.62393718957901, "logits/rejected": -0.6167235970497131, "logps/chosen": -64.80150604248047, "logps/ref_chosen": -53.973121643066406, "logps/ref_rejected": -89.41795349121094, "logps/rejected": -124.90143585205078, "loss": 0.4407, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17134535312652588, "margin_dpo/beta_margin_grad_std": 0.17862460017204285, "margin_dpo/beta_margin_mean": 2.465510368347168, "margin_dpo/loss_margin_mean": 24.65510368347168, "margin_dpo/margin_mean": 24.655105590820312, "margin_dpo/margin_std": 20.156238555908203, "step": 238 }, { "epoch": 0.3509544787077827, "grad_norm": 49.10955810546875, "learning_rate": 4.116778689174514e-07, "logits/chosen": -0.6934037208557129, "logits/rejected": -0.6648428440093994, "logps/chosen": -70.5438232421875, "logps/ref_chosen": -58.09782409667969, "logps/ref_rejected": -93.59294128417969, "logps/rejected": -131.54547119140625, "loss": 0.4402, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16609887778759003, "margin_dpo/beta_margin_grad_std": 0.17937105894088745, "margin_dpo/beta_margin_mean": 2.550652503967285, "margin_dpo/loss_margin_mean": 25.50652313232422, "margin_dpo/margin_mean": 25.50652313232422, "margin_dpo/margin_std": 19.751544952392578, "step": 239 }, { "epoch": 0.3524229074889868, "grad_norm": 59.94078063964844, "learning_rate": 4.106969024216348e-07, "logits/chosen": -0.7201815843582153, "logits/rejected": -0.6908845901489258, "logps/chosen": -73.39276885986328, "logps/ref_chosen": -60.6144905090332, "logps/ref_rejected": -74.1185302734375, "logps/rejected": -109.40141296386719, "loss": 0.6221, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20475083589553833, "margin_dpo/beta_margin_grad_std": 0.22877436876296997, "margin_dpo/beta_margin_mean": 2.250460624694824, "margin_dpo/loss_margin_mean": 22.50460433959961, "margin_dpo/margin_mean": 22.50460433959961, "margin_dpo/margin_std": 20.584453582763672, "step": 240 }, { "epoch": 0.35389133627019087, "grad_norm": 58.87882614135742, "learning_rate": 4.097117014129903e-07, "logits/chosen": -0.6401950120925903, "logits/rejected": -0.5843400955200195, "logps/chosen": -76.44609069824219, "logps/ref_chosen": -66.091064453125, "logps/ref_rejected": -88.06088256835938, "logps/rejected": -130.1419677734375, "loss": 0.5073, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16559036076068878, "margin_dpo/beta_margin_grad_std": 0.2104012668132782, "margin_dpo/beta_margin_mean": 3.172605514526367, "margin_dpo/loss_margin_mean": 31.726055145263672, "margin_dpo/margin_mean": 31.726055145263672, "margin_dpo/margin_std": 29.52655029296875, "step": 241 }, { "epoch": 0.355359765051395, "grad_norm": 53.561256408691406, "learning_rate": 4.087222918524807e-07, "logits/chosen": -0.6564736366271973, "logits/rejected": -0.6259176135063171, "logps/chosen": -79.38307189941406, "logps/ref_chosen": -67.86392211914062, "logps/ref_rejected": -83.36033630371094, "logps/rejected": -119.49470520019531, "loss": 0.4948, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18211215734481812, "margin_dpo/beta_margin_grad_std": 0.19208890199661255, "margin_dpo/beta_margin_mean": 2.4615235328674316, "margin_dpo/loss_margin_mean": 24.615234375, "margin_dpo/margin_mean": 24.615234375, "margin_dpo/margin_std": 21.698657989501953, "step": 242 }, { "epoch": 0.3568281938325991, "grad_norm": 33.901893615722656, "learning_rate": 4.07728699811968e-07, "logits/chosen": -0.6499842405319214, "logits/rejected": -0.5824156999588013, "logps/chosen": -73.99522399902344, "logps/ref_chosen": -63.08424377441406, "logps/ref_rejected": -76.33563232421875, "logps/rejected": -116.67811584472656, "loss": 0.3266, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13336455821990967, "margin_dpo/beta_margin_grad_std": 0.15137195587158203, "margin_dpo/beta_margin_mean": 2.943150043487549, "margin_dpo/loss_margin_mean": 29.43149757385254, "margin_dpo/margin_mean": 29.43149757385254, "margin_dpo/margin_std": 21.736125946044922, "step": 243 }, { "epoch": 0.35829662261380324, "grad_norm": 43.502723693847656, "learning_rate": 4.067309514735267e-07, "logits/chosen": -0.6875163316726685, "logits/rejected": -0.6771037578582764, "logps/chosen": -71.17987060546875, "logps/ref_chosen": -61.14069366455078, "logps/ref_rejected": -94.89193725585938, "logps/rejected": -130.1956787109375, "loss": 0.4962, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18511684238910675, "margin_dpo/beta_margin_grad_std": 0.20264488458633423, "margin_dpo/beta_margin_mean": 2.526456832885742, "margin_dpo/loss_margin_mean": 25.264570236206055, "margin_dpo/margin_mean": 25.264570236206055, "margin_dpo/margin_std": 21.366226196289062, "step": 244 }, { "epoch": 0.35976505139500736, "grad_norm": 75.58712768554688, "learning_rate": 4.057290731287531e-07, "logits/chosen": -0.6512210369110107, "logits/rejected": -0.5914992094039917, "logps/chosen": -78.87323760986328, "logps/ref_chosen": -67.26228332519531, "logps/ref_rejected": -87.64010620117188, "logps/rejected": -126.08024597167969, "loss": 0.5409, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1954166740179062, "margin_dpo/beta_margin_grad_std": 0.20244669914245605, "margin_dpo/beta_margin_mean": 2.68291974067688, "margin_dpo/loss_margin_mean": 26.82919692993164, "margin_dpo/margin_mean": 26.82919692993164, "margin_dpo/margin_std": 25.66305923461914, "step": 245 }, { "epoch": 0.36123348017621143, "grad_norm": 55.81852722167969, "learning_rate": 4.047230911780736e-07, "logits/chosen": -0.6650277972221375, "logits/rejected": -0.6220686435699463, "logps/chosen": -77.91656494140625, "logps/ref_chosen": -66.69696807861328, "logps/ref_rejected": -84.34634399414062, "logps/rejected": -118.67218017578125, "loss": 0.5257, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19744564592838287, "margin_dpo/beta_margin_grad_std": 0.1971253752708435, "margin_dpo/beta_margin_mean": 2.3106236457824707, "margin_dpo/loss_margin_mean": 23.10623550415039, "margin_dpo/margin_mean": 23.10623550415039, "margin_dpo/margin_std": 20.974063873291016, "step": 246 }, { "epoch": 0.36270190895741555, "grad_norm": 39.22549057006836, "learning_rate": 4.0371303213004814e-07, "logits/chosen": -0.7125513553619385, "logits/rejected": -0.6908072233200073, "logps/chosen": -68.05924224853516, "logps/ref_chosen": -56.6053466796875, "logps/ref_rejected": -106.29327392578125, "logps/rejected": -150.0380401611328, "loss": 0.4087, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1458946168422699, "margin_dpo/beta_margin_grad_std": 0.1995469033718109, "margin_dpo/beta_margin_mean": 3.229086399078369, "margin_dpo/loss_margin_mean": 32.29086685180664, "margin_dpo/margin_mean": 32.29086685180664, "margin_dpo/margin_std": 25.363780975341797, "step": 247 }, { "epoch": 0.3641703377386197, "grad_norm": 44.850181579589844, "learning_rate": 4.0269892260067197e-07, "logits/chosen": -0.65459144115448, "logits/rejected": -0.6384952068328857, "logps/chosen": -54.36640930175781, "logps/ref_chosen": -44.043216705322266, "logps/ref_rejected": -91.85687255859375, "logps/rejected": -126.57852172851562, "loss": 0.4079, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16769659519195557, "margin_dpo/beta_margin_grad_std": 0.15377435088157654, "margin_dpo/beta_margin_mean": 2.439844846725464, "margin_dpo/loss_margin_mean": 24.398448944091797, "margin_dpo/margin_mean": 24.398447036743164, "margin_dpo/margin_std": 19.091136932373047, "step": 248 }, { "epoch": 0.3656387665198238, "grad_norm": 64.64720916748047, "learning_rate": 4.0168078931267426e-07, "logits/chosen": -0.7210872173309326, "logits/rejected": -0.6843761205673218, "logps/chosen": -74.97351837158203, "logps/ref_chosen": -62.442352294921875, "logps/ref_rejected": -80.46806335449219, "logps/rejected": -113.79952239990234, "loss": 0.6559, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23226860165596008, "margin_dpo/beta_margin_grad_std": 0.22308656573295593, "margin_dpo/beta_margin_mean": 2.080028533935547, "margin_dpo/loss_margin_mean": 20.80028533935547, "margin_dpo/margin_mean": 20.80028533935547, "margin_dpo/margin_std": 20.590898513793945, "step": 249 }, { "epoch": 0.3671071953010279, "grad_norm": 32.35546875, "learning_rate": 4.006586590948141e-07, "logits/chosen": -0.6919997334480286, "logits/rejected": -0.621533215045929, "logps/chosen": -74.43826293945312, "logps/ref_chosen": -65.6366958618164, "logps/ref_rejected": -73.87183380126953, "logps/rejected": -108.39453125, "loss": 0.4355, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15557865798473358, "margin_dpo/beta_margin_grad_std": 0.1853388100862503, "margin_dpo/beta_margin_mean": 2.572113037109375, "margin_dpo/loss_margin_mean": 25.72113037109375, "margin_dpo/margin_mean": 25.72113037109375, "margin_dpo/margin_std": 18.065841674804688, "step": 250 }, { "epoch": 0.368575624082232, "grad_norm": 45.22731018066406, "learning_rate": 3.9963255888117325e-07, "logits/chosen": -0.7006521224975586, "logits/rejected": -0.645630955696106, "logps/chosen": -70.0533676147461, "logps/ref_chosen": -57.182716369628906, "logps/ref_rejected": -77.66343688964844, "logps/rejected": -116.09004211425781, "loss": 0.4648, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.179362490773201, "margin_dpo/beta_margin_grad_std": 0.19022953510284424, "margin_dpo/beta_margin_mean": 2.5555951595306396, "margin_dpo/loss_margin_mean": 25.555952072143555, "margin_dpo/margin_mean": 25.555952072143555, "margin_dpo/margin_std": 20.537967681884766, "step": 251 }, { "epoch": 0.3700440528634361, "grad_norm": 52.23085403442383, "learning_rate": 3.9860251571044666e-07, "logits/chosen": -0.68607497215271, "logits/rejected": -0.6413745880126953, "logps/chosen": -83.42814636230469, "logps/ref_chosen": -71.68563842773438, "logps/ref_rejected": -84.75798797607422, "logps/rejected": -122.11627197265625, "loss": 0.4332, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15921342372894287, "margin_dpo/beta_margin_grad_std": 0.18207845091819763, "margin_dpo/beta_margin_mean": 2.5615780353546143, "margin_dpo/loss_margin_mean": 25.615779876708984, "margin_dpo/margin_mean": 25.615779876708984, "margin_dpo/margin_std": 19.359203338623047, "step": 252 }, { "epoch": 0.37151248164464024, "grad_norm": 50.17756652832031, "learning_rate": 3.9756855672522986e-07, "logits/chosen": -0.7000366449356079, "logits/rejected": -0.6709662675857544, "logps/chosen": -79.15950012207031, "logps/ref_chosen": -69.13392639160156, "logps/ref_rejected": -98.70252990722656, "logps/rejected": -132.74795532226562, "loss": 0.6265, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2043215036392212, "margin_dpo/beta_margin_grad_std": 0.22261486947536469, "margin_dpo/beta_margin_mean": 2.401987075805664, "margin_dpo/loss_margin_mean": 24.01987075805664, "margin_dpo/margin_mean": 24.01987075805664, "margin_dpo/margin_std": 22.792251586914062, "step": 253 }, { "epoch": 0.37298091042584436, "grad_norm": 63.319210052490234, "learning_rate": 3.965307091713037e-07, "logits/chosen": -0.6818605065345764, "logits/rejected": -0.6327254772186279, "logps/chosen": -64.66595458984375, "logps/ref_chosen": -54.154998779296875, "logps/ref_rejected": -90.30764770507812, "logps/rejected": -125.43360900878906, "loss": 0.5555, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1997654289007187, "margin_dpo/beta_margin_grad_std": 0.21750159561634064, "margin_dpo/beta_margin_mean": 2.461501121520996, "margin_dpo/loss_margin_mean": 24.615013122558594, "margin_dpo/margin_mean": 24.615013122558594, "margin_dpo/margin_std": 22.605167388916016, "step": 254 }, { "epoch": 0.3744493392070485, "grad_norm": 67.16899871826172, "learning_rate": 3.954890003969163e-07, "logits/chosen": -0.6882792115211487, "logits/rejected": -0.6564103364944458, "logps/chosen": -70.3147201538086, "logps/ref_chosen": -57.14167022705078, "logps/ref_rejected": -90.2085952758789, "logps/rejected": -130.73675537109375, "loss": 0.6626, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19194266200065613, "margin_dpo/beta_margin_grad_std": 0.2271135002374649, "margin_dpo/beta_margin_mean": 2.7355103492736816, "margin_dpo/loss_margin_mean": 27.355106353759766, "margin_dpo/margin_mean": 27.355106353759766, "margin_dpo/margin_std": 28.053775787353516, "step": 255 }, { "epoch": 0.37591776798825255, "grad_norm": 58.15704345703125, "learning_rate": 3.944434578520628e-07, "logits/chosen": -0.6779786348342896, "logits/rejected": -0.6487682461738586, "logps/chosen": -68.2900619506836, "logps/ref_chosen": -55.163490295410156, "logps/ref_rejected": -92.56291961669922, "logps/rejected": -132.92449951171875, "loss": 0.5166, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17461168766021729, "margin_dpo/beta_margin_grad_std": 0.20105499029159546, "margin_dpo/beta_margin_mean": 2.7235007286071777, "margin_dpo/loss_margin_mean": 27.235008239746094, "margin_dpo/margin_mean": 27.235008239746094, "margin_dpo/margin_std": 25.091623306274414, "step": 256 }, { "epoch": 0.37738619676945667, "grad_norm": 45.471885681152344, "learning_rate": 3.933941090877615e-07, "logits/chosen": -0.647832453250885, "logits/rejected": -0.6230664253234863, "logps/chosen": -61.75330352783203, "logps/ref_chosen": -49.4236946105957, "logps/ref_rejected": -79.53791809082031, "logps/rejected": -121.87123107910156, "loss": 0.5048, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18101766705513, "margin_dpo/beta_margin_grad_std": 0.21384158730506897, "margin_dpo/beta_margin_mean": 3.0003700256347656, "margin_dpo/loss_margin_mean": 30.003700256347656, "margin_dpo/margin_mean": 30.003700256347656, "margin_dpo/margin_std": 25.782047271728516, "step": 257 }, { "epoch": 0.3788546255506608, "grad_norm": 87.32129669189453, "learning_rate": 3.923409817553284e-07, "logits/chosen": -0.6796102523803711, "logits/rejected": -0.6486295461654663, "logps/chosen": -75.19246673583984, "logps/ref_chosen": -59.384124755859375, "logps/ref_rejected": -95.9901123046875, "logps/rejected": -138.2916717529297, "loss": 0.7333, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20006248354911804, "margin_dpo/beta_margin_grad_std": 0.2458367645740509, "margin_dpo/beta_margin_mean": 2.649322748184204, "margin_dpo/loss_margin_mean": 26.493227005004883, "margin_dpo/margin_mean": 26.493227005004883, "margin_dpo/margin_std": 27.26435089111328, "step": 258 }, { "epoch": 0.3803230543318649, "grad_norm": 49.1059684753418, "learning_rate": 3.9128410360564793e-07, "logits/chosen": -0.6160672307014465, "logits/rejected": -0.5899853110313416, "logps/chosen": -67.10466003417969, "logps/ref_chosen": -52.828346252441406, "logps/ref_rejected": -89.19165802001953, "logps/rejected": -127.45698547363281, "loss": 0.5267, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1898474097251892, "margin_dpo/beta_margin_grad_std": 0.19210869073867798, "margin_dpo/beta_margin_mean": 2.3989009857177734, "margin_dpo/loss_margin_mean": 23.9890079498291, "margin_dpo/margin_mean": 23.9890079498291, "margin_dpo/margin_std": 20.4078369140625, "step": 259 }, { "epoch": 0.38179148311306904, "grad_norm": 57.559608459472656, "learning_rate": 3.9022350248844246e-07, "logits/chosen": -0.6179628372192383, "logits/rejected": -0.6178984045982361, "logps/chosen": -62.646522521972656, "logps/ref_chosen": -47.41767501831055, "logps/ref_rejected": -95.08979034423828, "logps/rejected": -137.5649871826172, "loss": 0.4996, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18264836072921753, "margin_dpo/beta_margin_grad_std": 0.2074463963508606, "margin_dpo/beta_margin_mean": 2.724635124206543, "margin_dpo/loss_margin_mean": 27.24635124206543, "margin_dpo/margin_mean": 27.24635124206543, "margin_dpo/margin_std": 25.009780883789062, "step": 260 }, { "epoch": 0.3832599118942731, "grad_norm": 45.6181755065918, "learning_rate": 3.891592063515376e-07, "logits/chosen": -0.6720625758171082, "logits/rejected": -0.6372575759887695, "logps/chosen": -65.10540008544922, "logps/ref_chosen": -53.03137969970703, "logps/ref_rejected": -88.51494598388672, "logps/rejected": -129.3517608642578, "loss": 0.4674, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1723802089691162, "margin_dpo/beta_margin_grad_std": 0.2052065134048462, "margin_dpo/beta_margin_mean": 2.876279354095459, "margin_dpo/loss_margin_mean": 28.762794494628906, "margin_dpo/margin_mean": 28.762794494628906, "margin_dpo/margin_std": 26.346603393554688, "step": 261 }, { "epoch": 0.38472834067547723, "grad_norm": 62.1253776550293, "learning_rate": 3.880912432401264e-07, "logits/chosen": -0.6411904096603394, "logits/rejected": -0.5941104888916016, "logps/chosen": -74.12861633300781, "logps/ref_chosen": -59.620140075683594, "logps/ref_rejected": -86.41853332519531, "logps/rejected": -126.77371215820312, "loss": 0.5264, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17031686007976532, "margin_dpo/beta_margin_grad_std": 0.22105002403259277, "margin_dpo/beta_margin_mean": 2.5846705436706543, "margin_dpo/loss_margin_mean": 25.84670639038086, "margin_dpo/margin_mean": 25.84670639038086, "margin_dpo/margin_std": 21.955059051513672, "step": 262 }, { "epoch": 0.38619676945668135, "grad_norm": 63.75021743774414, "learning_rate": 3.870196412960302e-07, "logits/chosen": -0.6712623834609985, "logits/rejected": -0.6137841939926147, "logps/chosen": -71.11343383789062, "logps/ref_chosen": -59.42094421386719, "logps/ref_rejected": -96.85720825195312, "logps/rejected": -139.07449340820312, "loss": 0.4316, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1605646014213562, "margin_dpo/beta_margin_grad_std": 0.19631603360176086, "margin_dpo/beta_margin_mean": 3.0524797439575195, "margin_dpo/loss_margin_mean": 30.524797439575195, "margin_dpo/margin_mean": 30.524799346923828, "margin_dpo/margin_std": 26.109458923339844, "step": 263 }, { "epoch": 0.3876651982378855, "grad_norm": 65.67262268066406, "learning_rate": 3.8594442875695665e-07, "logits/chosen": -0.664789080619812, "logits/rejected": -0.6371362805366516, "logps/chosen": -76.22811126708984, "logps/ref_chosen": -62.722084045410156, "logps/ref_rejected": -93.85621643066406, "logps/rejected": -131.24859619140625, "loss": 0.5618, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19636279344558716, "margin_dpo/beta_margin_grad_std": 0.201200932264328, "margin_dpo/beta_margin_mean": 2.3886351585388184, "margin_dpo/loss_margin_mean": 23.8863525390625, "margin_dpo/margin_mean": 23.8863525390625, "margin_dpo/margin_std": 21.996925354003906, "step": 264 }, { "epoch": 0.3891336270190896, "grad_norm": 68.70135498046875, "learning_rate": 3.848656339557562e-07, "logits/chosen": -0.6722284555435181, "logits/rejected": -0.6433699131011963, "logps/chosen": -76.23115539550781, "logps/ref_chosen": -61.971466064453125, "logps/ref_rejected": -88.02059936523438, "logps/rejected": -127.41754150390625, "loss": 0.5867, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20745858550071716, "margin_dpo/beta_margin_grad_std": 0.22185558080673218, "margin_dpo/beta_margin_mean": 2.5137252807617188, "margin_dpo/loss_margin_mean": 25.137252807617188, "margin_dpo/margin_mean": 25.137252807617188, "margin_dpo/margin_std": 25.626853942871094, "step": 265 }, { "epoch": 0.39060205580029367, "grad_norm": 54.78199768066406, "learning_rate": 3.8378328531967507e-07, "logits/chosen": -0.6929798126220703, "logits/rejected": -0.6306042075157166, "logps/chosen": -80.66532897949219, "logps/ref_chosen": -67.09967041015625, "logps/ref_rejected": -67.97122192382812, "logps/rejected": -106.28300476074219, "loss": 0.5605, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1999325454235077, "margin_dpo/beta_margin_grad_std": 0.21910326182842255, "margin_dpo/beta_margin_mean": 2.474611759185791, "margin_dpo/loss_margin_mean": 24.746116638183594, "margin_dpo/margin_mean": 24.746116638183594, "margin_dpo/margin_std": 22.58720588684082, "step": 266 }, { "epoch": 0.3920704845814978, "grad_norm": 50.09029769897461, "learning_rate": 3.8269741136960646e-07, "logits/chosen": -0.661482036113739, "logits/rejected": -0.6170265078544617, "logps/chosen": -82.08916473388672, "logps/ref_chosen": -68.97074890136719, "logps/ref_rejected": -90.16844940185547, "logps/rejected": -130.65118408203125, "loss": 0.4154, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16007937490940094, "margin_dpo/beta_margin_grad_std": 0.18461111187934875, "margin_dpo/beta_margin_mean": 2.7364323139190674, "margin_dpo/loss_margin_mean": 27.364322662353516, "margin_dpo/margin_mean": 27.364322662353516, "margin_dpo/margin_std": 22.23681640625, "step": 267 }, { "epoch": 0.3935389133627019, "grad_norm": 61.21752166748047, "learning_rate": 3.8160804071933894e-07, "logits/chosen": -0.6383650898933411, "logits/rejected": -0.6218883991241455, "logps/chosen": -68.4329833984375, "logps/ref_chosen": -55.900306701660156, "logps/ref_rejected": -101.64763641357422, "logps/rejected": -139.42156982421875, "loss": 0.4978, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18062824010849, "margin_dpo/beta_margin_grad_std": 0.20679926872253418, "margin_dpo/beta_margin_mean": 2.5241270065307617, "margin_dpo/loss_margin_mean": 25.241270065307617, "margin_dpo/margin_mean": 25.241270065307617, "margin_dpo/margin_std": 21.52151870727539, "step": 268 }, { "epoch": 0.39500734214390604, "grad_norm": 63.637428283691406, "learning_rate": 3.8051520207480204e-07, "logits/chosen": -0.6397280097007751, "logits/rejected": -0.5932068228721619, "logps/chosen": -82.90742492675781, "logps/ref_chosen": -70.03955078125, "logps/ref_rejected": -107.34937286376953, "logps/rejected": -153.24514770507812, "loss": 0.4033, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14418360590934753, "margin_dpo/beta_margin_grad_std": 0.21151122450828552, "margin_dpo/beta_margin_mean": 3.302790641784668, "margin_dpo/loss_margin_mean": 33.02790832519531, "margin_dpo/margin_mean": 33.02790832519531, "margin_dpo/margin_std": 23.7076416015625, "step": 269 }, { "epoch": 0.3964757709251101, "grad_norm": 42.040462493896484, "learning_rate": 3.794189242333106e-07, "logits/chosen": -0.6672676205635071, "logits/rejected": -0.6468701362609863, "logps/chosen": -80.33956146240234, "logps/ref_chosen": -69.53347778320312, "logps/ref_rejected": -109.92864990234375, "logps/rejected": -145.84991455078125, "loss": 0.506, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18614254891872406, "margin_dpo/beta_margin_grad_std": 0.20391584932804108, "margin_dpo/beta_margin_mean": 2.5115182399749756, "margin_dpo/loss_margin_mean": 25.11518096923828, "margin_dpo/margin_mean": 25.11518096923828, "margin_dpo/margin_std": 22.150074005126953, "step": 270 }, { "epoch": 0.39794419970631423, "grad_norm": 51.15155792236328, "learning_rate": 3.7831923608280514e-07, "logits/chosen": -0.6099239587783813, "logits/rejected": -0.5680840611457825, "logps/chosen": -70.89580535888672, "logps/ref_chosen": -56.76457214355469, "logps/ref_rejected": -92.51383209228516, "logps/rejected": -132.5828399658203, "loss": 0.5353, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1870347112417221, "margin_dpo/beta_margin_grad_std": 0.21322524547576904, "margin_dpo/beta_margin_mean": 2.5937767028808594, "margin_dpo/loss_margin_mean": 25.937767028808594, "margin_dpo/margin_mean": 25.937767028808594, "margin_dpo/margin_std": 23.366622924804688, "step": 271 }, { "epoch": 0.39941262848751835, "grad_norm": 51.133304595947266, "learning_rate": 3.772161666010912e-07, "logits/chosen": -0.6056419014930725, "logits/rejected": -0.5937180519104004, "logps/chosen": -62.401695251464844, "logps/ref_chosen": -49.49715805053711, "logps/ref_rejected": -105.54279327392578, "logps/rejected": -150.09609985351562, "loss": 0.5269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.168976292014122, "margin_dpo/beta_margin_grad_std": 0.23079171776771545, "margin_dpo/beta_margin_mean": 3.1648764610290527, "margin_dpo/loss_margin_mean": 31.648765563964844, "margin_dpo/margin_mean": 31.64876365661621, "margin_dpo/margin_std": 26.891244888305664, "step": 272 }, { "epoch": 0.4008810572687225, "grad_norm": 58.39497375488281, "learning_rate": 3.761097448550755e-07, "logits/chosen": -0.5882784128189087, "logits/rejected": -0.5531589984893799, "logps/chosen": -77.80586242675781, "logps/ref_chosen": -62.97539520263672, "logps/ref_rejected": -92.49858093261719, "logps/rejected": -137.8081817626953, "loss": 0.4583, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16255009174346924, "margin_dpo/beta_margin_grad_std": 0.20158691704273224, "margin_dpo/beta_margin_mean": 3.0479135513305664, "margin_dpo/loss_margin_mean": 30.479137420654297, "margin_dpo/margin_mean": 30.479137420654297, "margin_dpo/margin_std": 25.084999084472656, "step": 273 }, { "epoch": 0.4023494860499266, "grad_norm": 53.60417938232422, "learning_rate": 3.75e-07, "logits/chosen": -0.6290228366851807, "logits/rejected": -0.5923614501953125, "logps/chosen": -71.82681274414062, "logps/ref_chosen": -55.66770935058594, "logps/ref_rejected": -77.33308410644531, "logps/rejected": -119.93658447265625, "loss": 0.5139, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.179681196808815, "margin_dpo/beta_margin_grad_std": 0.21081170439720154, "margin_dpo/beta_margin_mean": 2.6444411277770996, "margin_dpo/loss_margin_mean": 26.44441032409668, "margin_dpo/margin_mean": 26.44441032409668, "margin_dpo/margin_std": 23.134292602539062, "step": 274 }, { "epoch": 0.40381791483113066, "grad_norm": 63.069358825683594, "learning_rate": 3.738869612786737e-07, "logits/chosen": -0.6466611623764038, "logits/rejected": -0.6279960870742798, "logps/chosen": -59.85034942626953, "logps/ref_chosen": -48.594703674316406, "logps/ref_rejected": -93.30369567871094, "logps/rejected": -132.1536407470703, "loss": 0.4693, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17479778826236725, "margin_dpo/beta_margin_grad_std": 0.19887307286262512, "margin_dpo/beta_margin_mean": 2.759430408477783, "margin_dpo/loss_margin_mean": 27.594303131103516, "margin_dpo/margin_mean": 27.594303131103516, "margin_dpo/margin_std": 24.204368591308594, "step": 275 }, { "epoch": 0.4052863436123348, "grad_norm": 62.213233947753906, "learning_rate": 3.7277065802070204e-07, "logits/chosen": -0.6649171113967896, "logits/rejected": -0.624710202217102, "logps/chosen": -70.10076141357422, "logps/ref_chosen": -56.57740783691406, "logps/ref_rejected": -70.36566925048828, "logps/rejected": -109.4853744506836, "loss": 0.5922, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20924188196659088, "margin_dpo/beta_margin_grad_std": 0.2318626046180725, "margin_dpo/beta_margin_mean": 2.5596346855163574, "margin_dpo/loss_margin_mean": 25.59634780883789, "margin_dpo/margin_mean": 25.596345901489258, "margin_dpo/margin_std": 24.71031951904297, "step": 276 }, { "epoch": 0.4067547723935389, "grad_norm": 40.95648193359375, "learning_rate": 3.71651119641714e-07, "logits/chosen": -0.6606760025024414, "logits/rejected": -0.6246751546859741, "logps/chosen": -68.44947814941406, "logps/ref_chosen": -56.27156066894531, "logps/ref_rejected": -92.88127136230469, "logps/rejected": -129.38565063476562, "loss": 0.4261, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17114606499671936, "margin_dpo/beta_margin_grad_std": 0.16686135530471802, "margin_dpo/beta_margin_mean": 2.432648181915283, "margin_dpo/loss_margin_mean": 24.326480865478516, "margin_dpo/margin_mean": 24.326480865478516, "margin_dpo/margin_std": 18.5596866607666, "step": 277 }, { "epoch": 0.40822320117474303, "grad_norm": 47.080055236816406, "learning_rate": 3.705283756425872e-07, "logits/chosen": -0.6622641682624817, "logits/rejected": -0.6517907381057739, "logps/chosen": -64.23191833496094, "logps/ref_chosen": -52.94194030761719, "logps/ref_rejected": -91.25357818603516, "logps/rejected": -132.509765625, "loss": 0.4854, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17969730496406555, "margin_dpo/beta_margin_grad_std": 0.21197958290576935, "margin_dpo/beta_margin_mean": 2.9966213703155518, "margin_dpo/loss_margin_mean": 29.96621322631836, "margin_dpo/margin_mean": 29.96621322631836, "margin_dpo/margin_std": 26.836585998535156, "step": 278 }, { "epoch": 0.40969162995594716, "grad_norm": 50.956111907958984, "learning_rate": 3.6940245560867e-07, "logits/chosen": -0.6270808577537537, "logits/rejected": -0.5984815955162048, "logps/chosen": -60.80306625366211, "logps/ref_chosen": -48.641319274902344, "logps/ref_rejected": -87.8514404296875, "logps/rejected": -129.2867889404297, "loss": 0.4865, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17512354254722595, "margin_dpo/beta_margin_grad_std": 0.21795479953289032, "margin_dpo/beta_margin_mean": 2.9273602962493896, "margin_dpo/loss_margin_mean": 29.273601531982422, "margin_dpo/margin_mean": 29.273601531982422, "margin_dpo/margin_std": 24.12653160095215, "step": 279 }, { "epoch": 0.4111600587371512, "grad_norm": 37.822959899902344, "learning_rate": 3.6827338920900253e-07, "logits/chosen": -0.6415497064590454, "logits/rejected": -0.6238715648651123, "logps/chosen": -72.27760314941406, "logps/ref_chosen": -58.797122955322266, "logps/ref_rejected": -98.61885070800781, "logps/rejected": -141.04861450195312, "loss": 0.3466, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1335011124610901, "margin_dpo/beta_margin_grad_std": 0.17966148257255554, "margin_dpo/beta_margin_mean": 2.8949294090270996, "margin_dpo/loss_margin_mean": 28.949295043945312, "margin_dpo/margin_mean": 28.94929313659668, "margin_dpo/margin_std": 18.588150024414062, "step": 280 }, { "epoch": 0.41262848751835535, "grad_norm": 71.57465362548828, "learning_rate": 3.6714120619553435e-07, "logits/chosen": -0.6258310675621033, "logits/rejected": -0.5813519358634949, "logps/chosen": -67.82090759277344, "logps/ref_chosen": -55.488521575927734, "logps/ref_rejected": -80.88258361816406, "logps/rejected": -118.60010528564453, "loss": 0.4834, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1572715938091278, "margin_dpo/beta_margin_grad_std": 0.1871940791606903, "margin_dpo/beta_margin_mean": 2.5385141372680664, "margin_dpo/loss_margin_mean": 25.38513946533203, "margin_dpo/margin_mean": 25.38513946533203, "margin_dpo/margin_std": 20.046871185302734, "step": 281 }, { "epoch": 0.41409691629955947, "grad_norm": 49.311336517333984, "learning_rate": 3.660059364023408e-07, "logits/chosen": -0.6380579471588135, "logits/rejected": -0.5905803442001343, "logps/chosen": -85.72900390625, "logps/ref_chosen": -73.07014465332031, "logps/ref_rejected": -95.35098266601562, "logps/rejected": -131.3958740234375, "loss": 0.4738, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18345773220062256, "margin_dpo/beta_margin_grad_std": 0.18459081649780273, "margin_dpo/beta_margin_mean": 2.3386025428771973, "margin_dpo/loss_margin_mean": 23.38602638244629, "margin_dpo/margin_mean": 23.386028289794922, "margin_dpo/margin_std": 20.714370727539062, "step": 282 }, { "epoch": 0.4155653450807636, "grad_norm": 48.578853607177734, "learning_rate": 3.6486760974483685e-07, "logits/chosen": -0.6443203091621399, "logits/rejected": -0.6160309314727783, "logps/chosen": -74.32984161376953, "logps/ref_chosen": -61.89844512939453, "logps/ref_rejected": -96.98655700683594, "logps/rejected": -137.47125244140625, "loss": 0.478, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16398155689239502, "margin_dpo/beta_margin_grad_std": 0.20954221487045288, "margin_dpo/beta_margin_mean": 2.8053293228149414, "margin_dpo/loss_margin_mean": 28.05329132080078, "margin_dpo/margin_mean": 28.05329132080078, "margin_dpo/margin_std": 23.433940887451172, "step": 283 }, { "epoch": 0.4170337738619677, "grad_norm": 43.66304016113281, "learning_rate": 3.6372625621898863e-07, "logits/chosen": -0.6190305948257446, "logits/rejected": -0.6051605939865112, "logps/chosen": -72.00505828857422, "logps/ref_chosen": -58.4355354309082, "logps/ref_rejected": -93.46926879882812, "logps/rejected": -136.72036743164062, "loss": 0.4128, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1565876454114914, "margin_dpo/beta_margin_grad_std": 0.18501752614974976, "margin_dpo/beta_margin_mean": 2.9681572914123535, "margin_dpo/loss_margin_mean": 29.68157196044922, "margin_dpo/margin_mean": 29.68157196044922, "margin_dpo/margin_std": 25.59400749206543, "step": 284 }, { "epoch": 0.4185022026431718, "grad_norm": 55.42721176147461, "learning_rate": 3.625819059005228e-07, "logits/chosen": -0.6967588663101196, "logits/rejected": -0.670876681804657, "logps/chosen": -81.565185546875, "logps/ref_chosen": -66.2322006225586, "logps/ref_rejected": -99.1268310546875, "logps/rejected": -141.13421630859375, "loss": 0.4222, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16293823719024658, "margin_dpo/beta_margin_grad_std": 0.1807018220424652, "margin_dpo/beta_margin_mean": 2.667440414428711, "margin_dpo/loss_margin_mean": 26.67440414428711, "margin_dpo/margin_mean": 26.674402236938477, "margin_dpo/margin_std": 20.944385528564453, "step": 285 }, { "epoch": 0.4199706314243759, "grad_norm": 55.817623138427734, "learning_rate": 3.614345889441346e-07, "logits/chosen": -0.6651911735534668, "logits/rejected": -0.6330760717391968, "logps/chosen": -86.78990173339844, "logps/ref_chosen": -72.95100402832031, "logps/ref_rejected": -88.58845520019531, "logps/rejected": -130.15069580078125, "loss": 0.551, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18454879522323608, "margin_dpo/beta_margin_grad_std": 0.22670768201351166, "margin_dpo/beta_margin_mean": 2.772334575653076, "margin_dpo/loss_margin_mean": 27.723342895507812, "margin_dpo/margin_mean": 27.723342895507812, "margin_dpo/margin_std": 25.246835708618164, "step": 286 }, { "epoch": 0.42143906020558003, "grad_norm": 53.33695602416992, "learning_rate": 3.6028433558269275e-07, "logits/chosen": -0.6670126914978027, "logits/rejected": -0.6226764917373657, "logps/chosen": -75.57640075683594, "logps/ref_chosen": -61.54115295410156, "logps/ref_rejected": -77.6960678100586, "logps/rejected": -118.70558166503906, "loss": 0.5322, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19202715158462524, "margin_dpo/beta_margin_grad_std": 0.21082551777362823, "margin_dpo/beta_margin_mean": 2.697427272796631, "margin_dpo/loss_margin_mean": 26.974271774291992, "margin_dpo/margin_mean": 26.974271774291992, "margin_dpo/margin_std": 26.020793914794922, "step": 287 }, { "epoch": 0.42290748898678415, "grad_norm": 58.48661804199219, "learning_rate": 3.5913117612644327e-07, "logits/chosen": -0.6334189176559448, "logits/rejected": -0.6017390489578247, "logps/chosen": -72.49565124511719, "logps/ref_chosen": -56.661224365234375, "logps/ref_rejected": -87.335693359375, "logps/rejected": -130.8236541748047, "loss": 0.4353, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16223303973674774, "margin_dpo/beta_margin_grad_std": 0.19457679986953735, "margin_dpo/beta_margin_mean": 2.765352249145508, "margin_dpo/loss_margin_mean": 27.653522491455078, "margin_dpo/margin_mean": 27.653522491455078, "margin_dpo/margin_std": 21.126564025878906, "step": 288 }, { "epoch": 0.4243759177679883, "grad_norm": 51.02857208251953, "learning_rate": 3.5797514096221024e-07, "logits/chosen": -0.6474858522415161, "logits/rejected": -0.6363035440444946, "logps/chosen": -61.39606857299805, "logps/ref_chosen": -45.23039245605469, "logps/ref_rejected": -87.64266967773438, "logps/rejected": -134.01449584960938, "loss": 0.5024, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18502648174762726, "margin_dpo/beta_margin_grad_std": 0.20525604486465454, "margin_dpo/beta_margin_mean": 3.020615339279175, "margin_dpo/loss_margin_mean": 30.206151962280273, "margin_dpo/margin_mean": 30.206153869628906, "margin_dpo/margin_std": 29.076404571533203, "step": 289 }, { "epoch": 0.42584434654919234, "grad_norm": 61.92301559448242, "learning_rate": 3.568162605525952e-07, "logits/chosen": -0.6007183194160461, "logits/rejected": -0.596926748752594, "logps/chosen": -71.86837768554688, "logps/ref_chosen": -55.47149658203125, "logps/ref_rejected": -116.70857238769531, "logps/rejected": -164.587646484375, "loss": 0.51, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17327924072742462, "margin_dpo/beta_margin_grad_std": 0.2234293818473816, "margin_dpo/beta_margin_mean": 3.1482198238372803, "margin_dpo/loss_margin_mean": 31.482196807861328, "margin_dpo/margin_mean": 31.482196807861328, "margin_dpo/margin_std": 29.341014862060547, "step": 290 }, { "epoch": 0.42731277533039647, "grad_norm": 56.78514099121094, "learning_rate": 3.5565456543517485e-07, "logits/chosen": -0.66310715675354, "logits/rejected": -0.6308495998382568, "logps/chosen": -75.83061981201172, "logps/ref_chosen": -63.26036834716797, "logps/ref_rejected": -89.29708862304688, "logps/rejected": -129.5008087158203, "loss": 0.4818, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16443225741386414, "margin_dpo/beta_margin_grad_std": 0.20096154510974884, "margin_dpo/beta_margin_mean": 2.7633466720581055, "margin_dpo/loss_margin_mean": 27.633464813232422, "margin_dpo/margin_mean": 27.633464813232422, "margin_dpo/margin_std": 22.616226196289062, "step": 291 }, { "epoch": 0.4287812041116006, "grad_norm": 53.309532165527344, "learning_rate": 3.5449008622169583e-07, "logits/chosen": -0.644907534122467, "logits/rejected": -0.6032723188400269, "logps/chosen": -70.52488708496094, "logps/ref_chosen": -53.91852951049805, "logps/ref_rejected": -89.96138000488281, "logps/rejected": -136.3544921875, "loss": 0.3955, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1559363305568695, "margin_dpo/beta_margin_grad_std": 0.17519932985305786, "margin_dpo/beta_margin_mean": 2.978675127029419, "margin_dpo/loss_margin_mean": 29.78675079345703, "margin_dpo/margin_mean": 29.78675079345703, "margin_dpo/margin_std": 24.420841217041016, "step": 292 }, { "epoch": 0.4302496328928047, "grad_norm": 54.48039245605469, "learning_rate": 3.5332285359726846e-07, "logits/chosen": -0.6540181040763855, "logits/rejected": -0.6253507137298584, "logps/chosen": -76.41354370117188, "logps/ref_chosen": -60.376033782958984, "logps/ref_rejected": -77.8524398803711, "logps/rejected": -118.03453063964844, "loss": 0.5943, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2110597789287567, "margin_dpo/beta_margin_grad_std": 0.21620804071426392, "margin_dpo/beta_margin_mean": 2.4144577980041504, "margin_dpo/loss_margin_mean": 24.144577026367188, "margin_dpo/margin_mean": 24.144577026367188, "margin_dpo/margin_std": 24.406749725341797, "step": 293 }, { "epoch": 0.43171806167400884, "grad_norm": 42.810264587402344, "learning_rate": 3.5215289831955786e-07, "logits/chosen": -0.6317086219787598, "logits/rejected": -0.6187810897827148, "logps/chosen": -62.521305084228516, "logps/ref_chosen": -48.0875358581543, "logps/ref_rejected": -81.89698791503906, "logps/rejected": -123.5872573852539, "loss": 0.5175, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1887100636959076, "margin_dpo/beta_margin_grad_std": 0.20870058238506317, "margin_dpo/beta_margin_mean": 2.725649833679199, "margin_dpo/loss_margin_mean": 27.256500244140625, "margin_dpo/margin_mean": 27.256500244140625, "margin_dpo/margin_std": 25.993709564208984, "step": 294 }, { "epoch": 0.4331864904552129, "grad_norm": 61.524410247802734, "learning_rate": 3.509802512179737e-07, "logits/chosen": -0.59177565574646, "logits/rejected": -0.5824375748634338, "logps/chosen": -68.55509185791016, "logps/ref_chosen": -49.92467498779297, "logps/ref_rejected": -87.45632934570312, "logps/rejected": -133.15512084960938, "loss": 0.595, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18381169438362122, "margin_dpo/beta_margin_grad_std": 0.22098243236541748, "margin_dpo/beta_margin_mean": 2.706838607788086, "margin_dpo/loss_margin_mean": 27.06838607788086, "margin_dpo/margin_mean": 27.06838607788086, "margin_dpo/margin_std": 24.90131378173828, "step": 295 }, { "epoch": 0.434654919236417, "grad_norm": 85.08708953857422, "learning_rate": 3.498049431928577e-07, "logits/chosen": -0.6830310821533203, "logits/rejected": -0.6430518627166748, "logps/chosen": -83.78386688232422, "logps/ref_chosen": -65.49124145507812, "logps/ref_rejected": -93.08908081054688, "logps/rejected": -134.91554260253906, "loss": 0.7283, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23585554957389832, "margin_dpo/beta_margin_grad_std": 0.2520889639854431, "margin_dpo/beta_margin_mean": 2.3533830642700195, "margin_dpo/loss_margin_mean": 23.533828735351562, "margin_dpo/margin_mean": 23.53382682800293, "margin_dpo/margin_std": 26.529495239257812, "step": 296 }, { "epoch": 0.43612334801762115, "grad_norm": 44.37043762207031, "learning_rate": 3.486270052146694e-07, "logits/chosen": -0.5455184578895569, "logits/rejected": -0.5094451904296875, "logps/chosen": -74.52252197265625, "logps/ref_chosen": -56.47694778442383, "logps/ref_rejected": -95.1385498046875, "logps/rejected": -141.77182006835938, "loss": 0.4232, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16087649762630463, "margin_dpo/beta_margin_grad_std": 0.18778559565544128, "margin_dpo/beta_margin_mean": 2.858769416809082, "margin_dpo/loss_margin_mean": 28.587692260742188, "margin_dpo/margin_mean": 28.587696075439453, "margin_dpo/margin_std": 23.822004318237305, "step": 297 }, { "epoch": 0.43759177679882527, "grad_norm": 43.25141143798828, "learning_rate": 3.474464683231698e-07, "logits/chosen": -0.642224907875061, "logits/rejected": -0.6368216276168823, "logps/chosen": -83.6908950805664, "logps/ref_chosen": -67.32516479492188, "logps/ref_rejected": -116.66217041015625, "logps/rejected": -162.75466918945312, "loss": 0.4123, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16109539568424225, "margin_dpo/beta_margin_grad_std": 0.18005360662937164, "margin_dpo/beta_margin_mean": 2.972676992416382, "margin_dpo/loss_margin_mean": 29.726768493652344, "margin_dpo/margin_mean": 29.726768493652344, "margin_dpo/margin_std": 26.26955795288086, "step": 298 }, { "epoch": 0.4390602055800294, "grad_norm": 59.36482238769531, "learning_rate": 3.462633636266041e-07, "logits/chosen": -0.5800520181655884, "logits/rejected": -0.5592917799949646, "logps/chosen": -64.02508544921875, "logps/ref_chosen": -48.96209716796875, "logps/ref_rejected": -84.32823944091797, "logps/rejected": -130.45758056640625, "loss": 0.5094, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1787450611591339, "margin_dpo/beta_margin_grad_std": 0.2245379388332367, "margin_dpo/beta_margin_mean": 3.1066365242004395, "margin_dpo/loss_margin_mean": 31.066364288330078, "margin_dpo/margin_mean": 31.066362380981445, "margin_dpo/margin_std": 27.945383071899414, "step": 299 }, { "epoch": 0.44052863436123346, "grad_norm": 81.1234359741211, "learning_rate": 3.4507772230088147e-07, "logits/chosen": -0.6008783578872681, "logits/rejected": -0.5817815065383911, "logps/chosen": -80.24883270263672, "logps/ref_chosen": -59.073707580566406, "logps/ref_rejected": -95.9664535522461, "logps/rejected": -146.92205810546875, "loss": 0.699, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20390301942825317, "margin_dpo/beta_margin_grad_std": 0.26888635754585266, "margin_dpo/beta_margin_mean": 2.9780476093292236, "margin_dpo/loss_margin_mean": 29.780475616455078, "margin_dpo/margin_mean": 29.780475616455078, "margin_dpo/margin_std": 29.694149017333984, "step": 300 }, { "epoch": 0.44052863436123346, "eval_logits/chosen": -0.6107151508331299, "eval_logits/rejected": -0.5844902992248535, "eval_logps/chosen": -99.96916198730469, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -125.81926727294922, "eval_loss": 0.4413561224937439, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.28278234601020813, "eval_margin_dpo/beta_margin_grad_std": 0.2516450583934784, "eval_margin_dpo/beta_margin_mean": 1.8103224039077759, "eval_margin_dpo/loss_margin_mean": 18.10322380065918, "eval_margin_dpo/margin_mean": 18.10322380065918, "eval_margin_dpo/margin_std": 23.78249168395996, "eval_runtime": 39.9127, "eval_samples_per_second": 58.603, "eval_steps_per_second": 1.854, "step": 300 }, { "epoch": 0.4419970631424376, "grad_norm": 47.328758239746094, "learning_rate": 3.4388957558875316e-07, "logits/chosen": -0.6326008439064026, "logits/rejected": -0.6015191078186035, "logps/chosen": -75.48579406738281, "logps/ref_chosen": -57.249366760253906, "logps/ref_rejected": -92.35354614257812, "logps/rejected": -141.67437744140625, "loss": 0.398, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15365365147590637, "margin_dpo/beta_margin_grad_std": 0.18743260204792023, "margin_dpo/beta_margin_mean": 3.1084399223327637, "margin_dpo/loss_margin_mean": 31.084396362304688, "margin_dpo/margin_mean": 31.084396362304688, "margin_dpo/margin_std": 25.64261245727539, "step": 301 }, { "epoch": 0.4434654919236417, "grad_norm": 68.61962127685547, "learning_rate": 3.426989547989902e-07, "logits/chosen": -0.5673216581344604, "logits/rejected": -0.5571401119232178, "logps/chosen": -66.45941925048828, "logps/ref_chosen": -51.19799041748047, "logps/ref_rejected": -97.22636413574219, "logps/rejected": -141.73390197753906, "loss": 0.5652, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18492794036865234, "margin_dpo/beta_margin_grad_std": 0.20831260085105896, "margin_dpo/beta_margin_mean": 2.9246113300323486, "margin_dpo/loss_margin_mean": 29.246112823486328, "margin_dpo/margin_mean": 29.246112823486328, "margin_dpo/margin_std": 29.20469093322754, "step": 302 }, { "epoch": 0.44493392070484583, "grad_norm": 67.56126403808594, "learning_rate": 3.4150589130555773e-07, "logits/chosen": -0.6228208541870117, "logits/rejected": -0.5857237577438354, "logps/chosen": -83.40345764160156, "logps/ref_chosen": -66.71394348144531, "logps/ref_rejected": -86.94542694091797, "logps/rejected": -131.71636962890625, "loss": 0.6418, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20844492316246033, "margin_dpo/beta_margin_grad_std": 0.25230517983436584, "margin_dpo/beta_margin_mean": 2.80814266204834, "margin_dpo/loss_margin_mean": 28.081424713134766, "margin_dpo/margin_mean": 28.081424713134766, "margin_dpo/margin_std": 28.703716278076172, "step": 303 }, { "epoch": 0.44640234948604995, "grad_norm": 58.02699661254883, "learning_rate": 3.403104165467883e-07, "logits/chosen": -0.6468909382820129, "logits/rejected": -0.6176923513412476, "logps/chosen": -86.12702178955078, "logps/ref_chosen": -71.95069885253906, "logps/ref_rejected": -90.47203063964844, "logps/rejected": -132.961181640625, "loss": 0.4589, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15100273489952087, "margin_dpo/beta_margin_grad_std": 0.2101380079984665, "margin_dpo/beta_margin_mean": 2.831282377243042, "margin_dpo/loss_margin_mean": 28.312822341918945, "margin_dpo/margin_mean": 28.312822341918945, "margin_dpo/margin_std": 20.519695281982422, "step": 304 }, { "epoch": 0.447870778267254, "grad_norm": 51.01959991455078, "learning_rate": 3.391125620245535e-07, "logits/chosen": -0.6300150156021118, "logits/rejected": -0.5907981395721436, "logps/chosen": -84.66513061523438, "logps/ref_chosen": -66.79523468017578, "logps/ref_rejected": -92.75459289550781, "logps/rejected": -139.4754638671875, "loss": 0.4261, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16771967709064484, "margin_dpo/beta_margin_grad_std": 0.1794486939907074, "margin_dpo/beta_margin_mean": 2.885097026824951, "margin_dpo/loss_margin_mean": 28.850971221923828, "margin_dpo/margin_mean": 28.850971221923828, "margin_dpo/margin_std": 26.399646759033203, "step": 305 }, { "epoch": 0.44933920704845814, "grad_norm": 74.01148986816406, "learning_rate": 3.3791235930343417e-07, "logits/chosen": -0.6759564876556396, "logits/rejected": -0.626822829246521, "logps/chosen": -85.26339721679688, "logps/ref_chosen": -69.68389892578125, "logps/ref_rejected": -85.15919494628906, "logps/rejected": -128.5599822998047, "loss": 0.5096, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16733743250370026, "margin_dpo/beta_margin_grad_std": 0.21074122190475464, "margin_dpo/beta_margin_mean": 2.78212833404541, "margin_dpo/loss_margin_mean": 27.821285247802734, "margin_dpo/margin_mean": 27.8212833404541, "margin_dpo/margin_std": 23.266021728515625, "step": 306 }, { "epoch": 0.45080763582966227, "grad_norm": 54.2598762512207, "learning_rate": 3.367098400098881e-07, "logits/chosen": -0.6196017265319824, "logits/rejected": -0.5918940305709839, "logps/chosen": -86.1854476928711, "logps/ref_chosen": -70.16542053222656, "logps/ref_rejected": -86.97230529785156, "logps/rejected": -128.44007873535156, "loss": 0.5398, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1930994689464569, "margin_dpo/beta_margin_grad_std": 0.21751633286476135, "margin_dpo/beta_margin_mean": 2.544773578643799, "margin_dpo/loss_margin_mean": 25.447734832763672, "margin_dpo/margin_mean": 25.447734832763672, "margin_dpo/margin_std": 24.255327224731445, "step": 307 }, { "epoch": 0.4522760646108664, "grad_norm": 42.99238967895508, "learning_rate": 3.355050358314172e-07, "logits/chosen": -0.6084394454956055, "logits/rejected": -0.581619381904602, "logps/chosen": -70.17066955566406, "logps/ref_chosen": -55.2449951171875, "logps/ref_rejected": -79.37226104736328, "logps/rejected": -123.38723754882812, "loss": 0.4913, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1686662882566452, "margin_dpo/beta_margin_grad_std": 0.1977521926164627, "margin_dpo/beta_margin_mean": 2.908930778503418, "margin_dpo/loss_margin_mean": 29.089309692382812, "margin_dpo/margin_mean": 29.089309692382812, "margin_dpo/margin_std": 26.31514549255371, "step": 308 }, { "epoch": 0.45374449339207046, "grad_norm": 54.909400939941406, "learning_rate": 3.3429797851573183e-07, "logits/chosen": -0.6183820366859436, "logits/rejected": -0.5834609866142273, "logps/chosen": -66.57117462158203, "logps/ref_chosen": -48.959083557128906, "logps/ref_rejected": -82.34072875976562, "logps/rejected": -128.1937713623047, "loss": 0.5046, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17868635058403015, "margin_dpo/beta_margin_grad_std": 0.21038393676280975, "margin_dpo/beta_margin_mean": 2.824094295501709, "margin_dpo/loss_margin_mean": 28.240943908691406, "margin_dpo/margin_mean": 28.240943908691406, "margin_dpo/margin_std": 24.293102264404297, "step": 309 }, { "epoch": 0.4552129221732746, "grad_norm": 50.08707809448242, "learning_rate": 3.3308869986991487e-07, "logits/chosen": -0.6874780058860779, "logits/rejected": -0.6412575244903564, "logps/chosen": -78.43482971191406, "logps/ref_chosen": -62.74177932739258, "logps/ref_rejected": -79.9302978515625, "logps/rejected": -120.06564331054688, "loss": 0.4422, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17856904864311218, "margin_dpo/beta_margin_grad_std": 0.16496190428733826, "margin_dpo/beta_margin_mean": 2.4442286491394043, "margin_dpo/loss_margin_mean": 24.44228744506836, "margin_dpo/margin_mean": 24.44228744506836, "margin_dpo/margin_std": 20.531139373779297, "step": 310 }, { "epoch": 0.4566813509544787, "grad_norm": 63.15940475463867, "learning_rate": 3.3187723175958346e-07, "logits/chosen": -0.5884615182876587, "logits/rejected": -0.54796302318573, "logps/chosen": -73.1263656616211, "logps/ref_chosen": -53.027976989746094, "logps/ref_rejected": -77.43820190429688, "logps/rejected": -131.83961486816406, "loss": 0.3482, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12926018238067627, "margin_dpo/beta_margin_grad_std": 0.1706952154636383, "margin_dpo/beta_margin_mean": 3.4303030967712402, "margin_dpo/loss_margin_mean": 34.30303192138672, "margin_dpo/margin_mean": 34.30303192138672, "margin_dpo/margin_std": 25.374624252319336, "step": 311 }, { "epoch": 0.4581497797356828, "grad_norm": 56.578857421875, "learning_rate": 3.306636061080487e-07, "logits/chosen": -0.5995860695838928, "logits/rejected": -0.555045485496521, "logps/chosen": -65.98387908935547, "logps/ref_chosen": -49.39221954345703, "logps/ref_rejected": -75.79280090332031, "logps/rejected": -122.10321807861328, "loss": 0.4842, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1694258749485016, "margin_dpo/beta_margin_grad_std": 0.21930669248104095, "margin_dpo/beta_margin_mean": 2.9718756675720215, "margin_dpo/loss_margin_mean": 29.7187557220459, "margin_dpo/margin_mean": 29.7187557220459, "margin_dpo/margin_std": 26.206937789916992, "step": 312 }, { "epoch": 0.45961820851688695, "grad_norm": 57.107025146484375, "learning_rate": 3.2944785489547537e-07, "logits/chosen": -0.6909008026123047, "logits/rejected": -0.6554454565048218, "logps/chosen": -64.720458984375, "logps/ref_chosen": -50.152740478515625, "logps/ref_rejected": -86.40620422363281, "logps/rejected": -126.65206909179688, "loss": 0.6326, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21527621150016785, "margin_dpo/beta_margin_grad_std": 0.22800129652023315, "margin_dpo/beta_margin_mean": 2.567814350128174, "margin_dpo/loss_margin_mean": 25.678142547607422, "margin_dpo/margin_mean": 25.678142547607422, "margin_dpo/margin_std": 26.8893985748291, "step": 313 }, { "epoch": 0.461086637298091, "grad_norm": 58.55873489379883, "learning_rate": 3.2823001015803857e-07, "logits/chosen": -0.6338675022125244, "logits/rejected": -0.609628438949585, "logps/chosen": -72.63245391845703, "logps/ref_chosen": -57.23758316040039, "logps/ref_rejected": -97.59652709960938, "logps/rejected": -138.97230529785156, "loss": 0.5754, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2068512737751007, "margin_dpo/beta_margin_grad_std": 0.22196519374847412, "margin_dpo/beta_margin_mean": 2.598090648651123, "margin_dpo/loss_margin_mean": 25.98090362548828, "margin_dpo/margin_mean": 25.980905532836914, "margin_dpo/margin_std": 25.633577346801758, "step": 314 }, { "epoch": 0.46255506607929514, "grad_norm": 47.718597412109375, "learning_rate": 3.270101039870797e-07, "logits/chosen": -0.6039018630981445, "logits/rejected": -0.5841349959373474, "logps/chosen": -64.1601791381836, "logps/ref_chosen": -49.06958770751953, "logps/ref_rejected": -85.68087768554688, "logps/rejected": -125.34097290039062, "loss": 0.4916, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18752586841583252, "margin_dpo/beta_margin_grad_std": 0.18681946396827698, "margin_dpo/beta_margin_mean": 2.456951141357422, "margin_dpo/loss_margin_mean": 24.56951141357422, "margin_dpo/margin_mean": 24.56951141357422, "margin_dpo/margin_std": 22.756999969482422, "step": 315 }, { "epoch": 0.46402349486049926, "grad_norm": 51.623634338378906, "learning_rate": 3.2578816852826086e-07, "logits/chosen": -0.6183241605758667, "logits/rejected": -0.6124423146247864, "logps/chosen": -71.89447784423828, "logps/ref_chosen": -54.26074981689453, "logps/ref_rejected": -101.2814712524414, "logps/rejected": -148.76461791992188, "loss": 0.4158, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15852956473827362, "margin_dpo/beta_margin_grad_std": 0.18403339385986328, "margin_dpo/beta_margin_mean": 2.984943389892578, "margin_dpo/loss_margin_mean": 29.84943389892578, "margin_dpo/margin_mean": 29.84943389892578, "margin_dpo/margin_std": 26.252422332763672, "step": 316 }, { "epoch": 0.4654919236417034, "grad_norm": 38.064273834228516, "learning_rate": 3.2456423598071783e-07, "logits/chosen": -0.6562374830245972, "logits/rejected": -0.6188434958457947, "logps/chosen": -69.55624389648438, "logps/ref_chosen": -56.094207763671875, "logps/ref_rejected": -100.69905090332031, "logps/rejected": -148.02902221679688, "loss": 0.3529, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13426414132118225, "margin_dpo/beta_margin_grad_std": 0.1786133348941803, "margin_dpo/beta_margin_mean": 3.386793375015259, "margin_dpo/loss_margin_mean": 33.86793518066406, "margin_dpo/margin_mean": 33.86793518066406, "margin_dpo/margin_std": 24.910192489624023, "step": 317 }, { "epoch": 0.4669603524229075, "grad_norm": 47.10121154785156, "learning_rate": 3.233383385962115e-07, "logits/chosen": -0.6792348623275757, "logits/rejected": -0.6371433138847351, "logps/chosen": -77.5732421875, "logps/ref_chosen": -64.64570617675781, "logps/ref_rejected": -82.76425170898438, "logps/rejected": -126.16979217529297, "loss": 0.4256, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15739941596984863, "margin_dpo/beta_margin_grad_std": 0.1940799355506897, "margin_dpo/beta_margin_mean": 3.047799587249756, "margin_dpo/loss_margin_mean": 30.47799301147461, "margin_dpo/margin_mean": 30.47799301147461, "margin_dpo/margin_std": 25.105358123779297, "step": 318 }, { "epoch": 0.4684287812041116, "grad_norm": 41.460880279541016, "learning_rate": 3.2211050867827805e-07, "logits/chosen": -0.6292097568511963, "logits/rejected": -0.6156477928161621, "logps/chosen": -62.311241149902344, "logps/ref_chosen": -49.383758544921875, "logps/ref_rejected": -113.90650939941406, "logps/rejected": -156.33575439453125, "loss": 0.363, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1437537968158722, "margin_dpo/beta_margin_grad_std": 0.17065343260765076, "margin_dpo/beta_margin_mean": 2.9501757621765137, "margin_dpo/loss_margin_mean": 29.501754760742188, "margin_dpo/margin_mean": 29.501754760742188, "margin_dpo/margin_std": 22.302837371826172, "step": 319 }, { "epoch": 0.4698972099853157, "grad_norm": 51.28620910644531, "learning_rate": 3.208807785813777e-07, "logits/chosen": -0.6504048109054565, "logits/rejected": -0.6378560066223145, "logps/chosen": -74.24742126464844, "logps/ref_chosen": -59.50489044189453, "logps/ref_rejected": -97.66716766357422, "logps/rejected": -139.2616729736328, "loss": 0.4858, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17591118812561035, "margin_dpo/beta_margin_grad_std": 0.1899009644985199, "margin_dpo/beta_margin_mean": 2.685196876525879, "margin_dpo/loss_margin_mean": 26.851966857910156, "margin_dpo/margin_mean": 26.85196876525879, "margin_dpo/margin_std": 23.467105865478516, "step": 320 }, { "epoch": 0.4713656387665198, "grad_norm": 72.78955078125, "learning_rate": 3.1964918071004217e-07, "logits/chosen": -0.6326063871383667, "logits/rejected": -0.5948277711868286, "logps/chosen": -80.62930297851562, "logps/ref_chosen": -61.548683166503906, "logps/ref_rejected": -91.64103698730469, "logps/rejected": -136.67556762695312, "loss": 0.7106, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22593584656715393, "margin_dpo/beta_margin_grad_std": 0.2594347894191742, "margin_dpo/beta_margin_mean": 2.595390558242798, "margin_dpo/loss_margin_mean": 25.95390510559082, "margin_dpo/margin_mean": 25.953907012939453, "margin_dpo/margin_std": 27.37790298461914, "step": 321 }, { "epoch": 0.47283406754772395, "grad_norm": 53.10894775390625, "learning_rate": 3.184157475180207e-07, "logits/chosen": -0.6191203594207764, "logits/rejected": -0.597158670425415, "logps/chosen": -72.91082763671875, "logps/ref_chosen": -57.29003143310547, "logps/ref_rejected": -95.74992370605469, "logps/rejected": -143.1072235107422, "loss": 0.4304, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16689737141132355, "margin_dpo/beta_margin_grad_std": 0.18274548649787903, "margin_dpo/beta_margin_mean": 3.173649311065674, "margin_dpo/loss_margin_mean": 31.736494064331055, "margin_dpo/margin_mean": 31.736492156982422, "margin_dpo/margin_std": 27.840457916259766, "step": 322 }, { "epoch": 0.47430249632892807, "grad_norm": 46.58567428588867, "learning_rate": 3.171805115074251e-07, "logits/chosen": -0.6087906360626221, "logits/rejected": -0.5820388197898865, "logps/chosen": -66.820556640625, "logps/ref_chosen": -51.23395919799805, "logps/ref_rejected": -75.06192016601562, "logps/rejected": -121.96331787109375, "loss": 0.422, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15601900219917297, "margin_dpo/beta_margin_grad_std": 0.19936603307724, "margin_dpo/beta_margin_mean": 3.1314802169799805, "margin_dpo/loss_margin_mean": 31.314802169799805, "margin_dpo/margin_mean": 31.314802169799805, "margin_dpo/margin_std": 25.376670837402344, "step": 323 }, { "epoch": 0.47577092511013214, "grad_norm": 56.77102279663086, "learning_rate": 3.1594350522787295e-07, "logits/chosen": -0.6091630458831787, "logits/rejected": -0.5578924417495728, "logps/chosen": -82.68987274169922, "logps/ref_chosen": -65.13516998291016, "logps/ref_rejected": -86.47750091552734, "logps/rejected": -133.79421997070312, "loss": 0.4592, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16746217012405396, "margin_dpo/beta_margin_grad_std": 0.20326107740402222, "margin_dpo/beta_margin_mean": 2.9762015342712402, "margin_dpo/loss_margin_mean": 29.762012481689453, "margin_dpo/margin_mean": 29.762012481689453, "margin_dpo/margin_std": 25.058231353759766, "step": 324 }, { "epoch": 0.47723935389133626, "grad_norm": 43.94253158569336, "learning_rate": 3.147047612756302e-07, "logits/chosen": -0.625763475894928, "logits/rejected": -0.5584316253662109, "logps/chosen": -70.6198501586914, "logps/ref_chosen": -56.215599060058594, "logps/ref_rejected": -70.0859375, "logps/rejected": -113.17784118652344, "loss": 0.4258, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16012780368328094, "margin_dpo/beta_margin_grad_std": 0.19097131490707397, "margin_dpo/beta_margin_mean": 2.868764877319336, "margin_dpo/loss_margin_mean": 28.68764877319336, "margin_dpo/margin_mean": 28.68764877319336, "margin_dpo/margin_std": 21.614681243896484, "step": 325 }, { "epoch": 0.4787077826725404, "grad_norm": 57.078155517578125, "learning_rate": 3.134643122927519e-07, "logits/chosen": -0.670049250125885, "logits/rejected": -0.6241730451583862, "logps/chosen": -90.87605285644531, "logps/ref_chosen": -72.72496032714844, "logps/ref_rejected": -79.84678649902344, "logps/rejected": -123.93955993652344, "loss": 0.5032, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19036465883255005, "margin_dpo/beta_margin_grad_std": 0.20020395517349243, "margin_dpo/beta_margin_mean": 2.5941686630249023, "margin_dpo/loss_margin_mean": 25.94168472290039, "margin_dpo/margin_mean": 25.941686630249023, "margin_dpo/margin_std": 24.112701416015625, "step": 326 }, { "epoch": 0.4801762114537445, "grad_norm": 48.156856536865234, "learning_rate": 3.1222219096622264e-07, "logits/chosen": -0.6242316365242004, "logits/rejected": -0.5801475048065186, "logps/chosen": -84.52735900878906, "logps/ref_chosen": -69.13441467285156, "logps/ref_rejected": -111.93377685546875, "logps/rejected": -164.81890869140625, "loss": 0.2858, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1086474135518074, "margin_dpo/beta_margin_grad_std": 0.16801781952381134, "margin_dpo/beta_margin_mean": 3.749218463897705, "margin_dpo/loss_margin_mean": 37.492183685302734, "margin_dpo/margin_mean": 37.492183685302734, "margin_dpo/margin_std": 24.61020278930664, "step": 327 }, { "epoch": 0.48164464023494863, "grad_norm": 53.24053192138672, "learning_rate": 3.1097843002709427e-07, "logits/chosen": -0.631726861000061, "logits/rejected": -0.6111768484115601, "logps/chosen": -78.70730590820312, "logps/ref_chosen": -59.68719482421875, "logps/ref_rejected": -90.85499572753906, "logps/rejected": -137.83163452148438, "loss": 0.476, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17879313230514526, "margin_dpo/beta_margin_grad_std": 0.1966237723827362, "margin_dpo/beta_margin_mean": 2.7956528663635254, "margin_dpo/loss_margin_mean": 27.956527709960938, "margin_dpo/margin_mean": 27.956527709960938, "margin_dpo/margin_std": 25.300691604614258, "step": 328 }, { "epoch": 0.4831130690161527, "grad_norm": 60.33453369140625, "learning_rate": 3.0973306224962437e-07, "logits/chosen": -0.6395320892333984, "logits/rejected": -0.5939961671829224, "logps/chosen": -82.35140991210938, "logps/ref_chosen": -65.2461929321289, "logps/ref_rejected": -100.69770812988281, "logps/rejected": -155.14166259765625, "loss": 0.3708, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12009057402610779, "margin_dpo/beta_margin_grad_std": 0.18349771201610565, "margin_dpo/beta_margin_mean": 3.733874797821045, "margin_dpo/loss_margin_mean": 37.3387451171875, "margin_dpo/margin_mean": 37.3387451171875, "margin_dpo/margin_std": 26.804096221923828, "step": 329 }, { "epoch": 0.4845814977973568, "grad_norm": 48.900360107421875, "learning_rate": 3.084861204504122e-07, "logits/chosen": -0.5813232660293579, "logits/rejected": -0.5687066316604614, "logps/chosen": -64.73994445800781, "logps/ref_chosen": -46.998348236083984, "logps/ref_rejected": -86.87684631347656, "logps/rejected": -136.06546020507812, "loss": 0.4066, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14167913794517517, "margin_dpo/beta_margin_grad_std": 0.186563640832901, "margin_dpo/beta_margin_mean": 3.1447031497955322, "margin_dpo/loss_margin_mean": 31.44702911376953, "margin_dpo/margin_mean": 31.44702911376953, "margin_dpo/margin_std": 24.540428161621094, "step": 330 }, { "epoch": 0.48604992657856094, "grad_norm": 37.78254699707031, "learning_rate": 3.072376374875335e-07, "logits/chosen": -0.6245772838592529, "logits/rejected": -0.5876287221908569, "logps/chosen": -66.91592407226562, "logps/ref_chosen": -50.52424621582031, "logps/ref_rejected": -89.01544189453125, "logps/rejected": -139.790283203125, "loss": 0.2587, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10614001750946045, "margin_dpo/beta_margin_grad_std": 0.14252358675003052, "margin_dpo/beta_margin_mean": 3.438317060470581, "margin_dpo/loss_margin_mean": 34.38317108154297, "margin_dpo/margin_mean": 34.38317108154297, "margin_dpo/margin_std": 23.31448745727539, "step": 331 }, { "epoch": 0.48751835535976507, "grad_norm": 50.14997100830078, "learning_rate": 3.059876462596758e-07, "logits/chosen": -0.648339033126831, "logits/rejected": -0.6188260316848755, "logps/chosen": -67.62520599365234, "logps/ref_chosen": -49.18028259277344, "logps/ref_rejected": -76.48515319824219, "logps/rejected": -120.72598266601562, "loss": 0.5397, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1846732795238495, "margin_dpo/beta_margin_grad_std": 0.21404841542243958, "margin_dpo/beta_margin_mean": 2.579591751098633, "margin_dpo/loss_margin_mean": 25.795917510986328, "margin_dpo/margin_mean": 25.795917510986328, "margin_dpo/margin_std": 22.294769287109375, "step": 332 }, { "epoch": 0.4889867841409692, "grad_norm": 64.34756469726562, "learning_rate": 3.0473617970527015e-07, "logits/chosen": -0.5994927883148193, "logits/rejected": -0.5866981744766235, "logps/chosen": -83.70030212402344, "logps/ref_chosen": -63.75574493408203, "logps/ref_rejected": -95.04411315917969, "logps/rejected": -147.88723754882812, "loss": 0.5269, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1775931566953659, "margin_dpo/beta_margin_grad_std": 0.22904759645462036, "margin_dpo/beta_margin_mean": 3.2898573875427246, "margin_dpo/loss_margin_mean": 32.8985710144043, "margin_dpo/margin_mean": 32.8985710144043, "margin_dpo/margin_std": 28.806324005126953, "step": 333 }, { "epoch": 0.49045521292217326, "grad_norm": 47.576019287109375, "learning_rate": 3.034832708016243e-07, "logits/chosen": -0.5982068777084351, "logits/rejected": -0.5783542394638062, "logps/chosen": -87.60667419433594, "logps/ref_chosen": -66.97975158691406, "logps/ref_rejected": -95.31692504882812, "logps/rejected": -147.35403442382812, "loss": 0.3524, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12977060675621033, "margin_dpo/beta_margin_grad_std": 0.18269598484039307, "margin_dpo/beta_margin_mean": 3.141017198562622, "margin_dpo/loss_margin_mean": 31.410171508789062, "margin_dpo/margin_mean": 31.410171508789062, "margin_dpo/margin_std": 22.10809326171875, "step": 334 }, { "epoch": 0.4919236417033774, "grad_norm": 60.33529281616211, "learning_rate": 3.022289525640531e-07, "logits/chosen": -0.6425787210464478, "logits/rejected": -0.6166863441467285, "logps/chosen": -80.82369995117188, "logps/ref_chosen": -62.54248046875, "logps/ref_rejected": -87.6176986694336, "logps/rejected": -133.60256958007812, "loss": 0.5288, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1897895336151123, "margin_dpo/beta_margin_grad_std": 0.2193058580160141, "margin_dpo/beta_margin_mean": 2.770364284515381, "margin_dpo/loss_margin_mean": 27.703643798828125, "margin_dpo/margin_mean": 27.703643798828125, "margin_dpo/margin_std": 25.8885555267334, "step": 335 }, { "epoch": 0.4933920704845815, "grad_norm": 64.78392791748047, "learning_rate": 3.009732580450086e-07, "logits/chosen": -0.6276768445968628, "logits/rejected": -0.6149314641952515, "logps/chosen": -74.1087646484375, "logps/ref_chosen": -54.531150817871094, "logps/ref_rejected": -104.40424346923828, "logps/rejected": -158.13856506347656, "loss": 0.4812, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13777320086956024, "margin_dpo/beta_margin_grad_std": 0.21476463973522186, "margin_dpo/beta_margin_mean": 3.4156715869903564, "margin_dpo/loss_margin_mean": 34.156715393066406, "margin_dpo/margin_mean": 34.156715393066406, "margin_dpo/margin_std": 28.737443923950195, "step": 336 }, { "epoch": 0.4948604992657856, "grad_norm": 56.005924224853516, "learning_rate": 2.9971622033320914e-07, "logits/chosen": -0.6466571092605591, "logits/rejected": -0.6224143505096436, "logps/chosen": -82.91951751708984, "logps/ref_chosen": -65.12869262695312, "logps/ref_rejected": -101.72701263427734, "logps/rejected": -150.4021759033203, "loss": 0.3742, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14107711613178253, "margin_dpo/beta_margin_grad_std": 0.1787952482700348, "margin_dpo/beta_margin_mean": 3.0884342193603516, "margin_dpo/loss_margin_mean": 30.88433837890625, "margin_dpo/margin_mean": 30.884340286254883, "margin_dpo/margin_std": 22.561031341552734, "step": 337 }, { "epoch": 0.49632892804698975, "grad_norm": 53.44011306762695, "learning_rate": 2.984578725527675e-07, "logits/chosen": -0.6315950155258179, "logits/rejected": -0.6056466102600098, "logps/chosen": -78.76591491699219, "logps/ref_chosen": -58.422706604003906, "logps/ref_rejected": -89.06854248046875, "logps/rejected": -140.06710815429688, "loss": 0.3915, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15517953038215637, "margin_dpo/beta_margin_grad_std": 0.1764228343963623, "margin_dpo/beta_margin_mean": 3.065535545349121, "margin_dpo/loss_margin_mean": 30.655353546142578, "margin_dpo/margin_mean": 30.655353546142578, "margin_dpo/margin_std": 24.606985092163086, "step": 338 }, { "epoch": 0.4977973568281938, "grad_norm": 42.09189987182617, "learning_rate": 2.9719824786231796e-07, "logits/chosen": -0.7080618143081665, "logits/rejected": -0.6741960048675537, "logps/chosen": -77.78138732910156, "logps/ref_chosen": -59.99531555175781, "logps/ref_rejected": -103.9109115600586, "logps/rejected": -156.21578979492188, "loss": 0.3539, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13084164261817932, "margin_dpo/beta_margin_grad_std": 0.18853557109832764, "margin_dpo/beta_margin_mean": 3.4518818855285645, "margin_dpo/loss_margin_mean": 34.51881790161133, "margin_dpo/margin_mean": 34.51881790161133, "margin_dpo/margin_std": 25.89126205444336, "step": 339 }, { "epoch": 0.49926578560939794, "grad_norm": 45.83633804321289, "learning_rate": 2.959373794541426e-07, "logits/chosen": -0.6209253072738647, "logits/rejected": -0.5888671875, "logps/chosen": -73.24740600585938, "logps/ref_chosen": -52.83022689819336, "logps/ref_rejected": -73.10723876953125, "logps/rejected": -127.31039428710938, "loss": 0.3856, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14801771938800812, "margin_dpo/beta_margin_grad_std": 0.17904043197631836, "margin_dpo/beta_margin_mean": 3.3785972595214844, "margin_dpo/loss_margin_mean": 33.78596878051758, "margin_dpo/margin_mean": 33.785972595214844, "margin_dpo/margin_std": 30.059484481811523, "step": 340 }, { "epoch": 0.5007342143906021, "grad_norm": 47.085533142089844, "learning_rate": 2.946753005532965e-07, "logits/chosen": -0.6055405735969543, "logits/rejected": -0.5906496047973633, "logps/chosen": -70.06444549560547, "logps/ref_chosen": -47.899803161621094, "logps/ref_rejected": -101.80987548828125, "logps/rejected": -161.025390625, "loss": 0.3145, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12136228382587433, "margin_dpo/beta_margin_grad_std": 0.17082762718200684, "margin_dpo/beta_margin_mean": 3.7050869464874268, "margin_dpo/loss_margin_mean": 37.050865173339844, "margin_dpo/margin_mean": 37.050865173339844, "margin_dpo/margin_std": 26.06426429748535, "step": 341 }, { "epoch": 0.5022026431718062, "grad_norm": 70.15333557128906, "learning_rate": 2.934120444167326e-07, "logits/chosen": -0.5826171040534973, "logits/rejected": -0.5372592210769653, "logps/chosen": -90.79473114013672, "logps/ref_chosen": -71.99664306640625, "logps/ref_rejected": -92.58959197998047, "logps/rejected": -143.8526611328125, "loss": 0.452, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1657615751028061, "margin_dpo/beta_margin_grad_std": 0.21393808722496033, "margin_dpo/beta_margin_mean": 3.2464988231658936, "margin_dpo/loss_margin_mean": 32.464988708496094, "margin_dpo/margin_mean": 32.464988708496094, "margin_dpo/margin_std": 28.956148147583008, "step": 342 }, { "epoch": 0.5036710719530103, "grad_norm": 59.99635696411133, "learning_rate": 2.9214764433242476e-07, "logits/chosen": -0.6327919363975525, "logits/rejected": -0.6083285808563232, "logps/chosen": -71.64889526367188, "logps/ref_chosen": -54.40562438964844, "logps/ref_rejected": -111.04141998291016, "logps/rejected": -162.94818115234375, "loss": 0.3762, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13154786825180054, "margin_dpo/beta_margin_grad_std": 0.1953406035900116, "margin_dpo/beta_margin_mean": 3.46634840965271, "margin_dpo/loss_margin_mean": 34.663482666015625, "margin_dpo/margin_mean": 34.663482666015625, "margin_dpo/margin_std": 24.739078521728516, "step": 343 }, { "epoch": 0.5051395007342144, "grad_norm": 60.24159622192383, "learning_rate": 2.9088213361849126e-07, "logits/chosen": -0.60174560546875, "logits/rejected": -0.5771138072013855, "logps/chosen": -74.28924560546875, "logps/ref_chosen": -53.96466827392578, "logps/ref_rejected": -90.62336730957031, "logps/rejected": -139.2759246826172, "loss": 0.5701, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19683772325515747, "margin_dpo/beta_margin_grad_std": 0.23028729856014252, "margin_dpo/beta_margin_mean": 2.8327980041503906, "margin_dpo/loss_margin_mean": 28.327980041503906, "margin_dpo/margin_mean": 28.327980041503906, "margin_dpo/margin_std": 28.41692543029785, "step": 344 }, { "epoch": 0.5066079295154186, "grad_norm": 52.972599029541016, "learning_rate": 2.896155456223163e-07, "logits/chosen": -0.6189597845077515, "logits/rejected": -0.5859960317611694, "logps/chosen": -81.26602172851562, "logps/ref_chosen": -61.685699462890625, "logps/ref_rejected": -99.49040985107422, "logps/rejected": -153.17733764648438, "loss": 0.385, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12680958211421967, "margin_dpo/beta_margin_grad_std": 0.19373470544815063, "margin_dpo/beta_margin_mean": 3.4106602668762207, "margin_dpo/loss_margin_mean": 34.10660171508789, "margin_dpo/margin_mean": 34.10660171508789, "margin_dpo/margin_std": 26.537147521972656, "step": 345 }, { "epoch": 0.5080763582966226, "grad_norm": 65.10262298583984, "learning_rate": 2.883479137196714e-07, "logits/chosen": -0.6390465497970581, "logits/rejected": -0.6188012361526489, "logps/chosen": -77.49059295654297, "logps/ref_chosen": -55.256263732910156, "logps/ref_rejected": -77.41532135009766, "logps/rejected": -130.3055877685547, "loss": 0.4883, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16579663753509521, "margin_dpo/beta_margin_grad_std": 0.21993526816368103, "margin_dpo/beta_margin_mean": 3.065593719482422, "margin_dpo/loss_margin_mean": 30.655935287475586, "margin_dpo/margin_mean": 30.655933380126953, "margin_dpo/margin_std": 26.671146392822266, "step": 346 }, { "epoch": 0.5095447870778267, "grad_norm": 58.82464599609375, "learning_rate": 2.8707927131383614e-07, "logits/chosen": -0.6383576393127441, "logits/rejected": -0.5973784923553467, "logps/chosen": -80.98310852050781, "logps/ref_chosen": -57.56624221801758, "logps/ref_rejected": -92.35508728027344, "logps/rejected": -146.76962280273438, "loss": 0.507, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15645217895507812, "margin_dpo/beta_margin_grad_std": 0.2233504056930542, "margin_dpo/beta_margin_mean": 3.099766731262207, "margin_dpo/loss_margin_mean": 30.99766731262207, "margin_dpo/margin_mean": 30.997665405273438, "margin_dpo/margin_std": 27.08733367919922, "step": 347 }, { "epoch": 0.5110132158590308, "grad_norm": 56.679996490478516, "learning_rate": 2.858096518347179e-07, "logits/chosen": -0.6223098635673523, "logits/rejected": -0.5989496111869812, "logps/chosen": -76.78868103027344, "logps/ref_chosen": -56.31770324707031, "logps/ref_rejected": -89.13837432861328, "logps/rejected": -139.9176025390625, "loss": 0.534, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18657389283180237, "margin_dpo/beta_margin_grad_std": 0.2263115644454956, "margin_dpo/beta_margin_mean": 3.0308241844177246, "margin_dpo/loss_margin_mean": 30.308242797851562, "margin_dpo/margin_mean": 30.308242797851562, "margin_dpo/margin_std": 26.598758697509766, "step": 348 }, { "epoch": 0.5124816446402349, "grad_norm": 74.86937713623047, "learning_rate": 2.845390887379706e-07, "logits/chosen": -0.6142607927322388, "logits/rejected": -0.6013126373291016, "logps/chosen": -76.69570922851562, "logps/ref_chosen": -58.0255126953125, "logps/ref_rejected": -97.50515747070312, "logps/rejected": -142.0050506591797, "loss": 0.7182, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.22203311324119568, "margin_dpo/beta_margin_grad_std": 0.25500667095184326, "margin_dpo/beta_margin_mean": 2.5829694271087646, "margin_dpo/loss_margin_mean": 25.829692840576172, "margin_dpo/margin_mean": 25.829696655273438, "margin_dpo/margin_std": 29.345046997070312, "step": 349 }, { "epoch": 0.5139500734214391, "grad_norm": 60.53803253173828, "learning_rate": 2.8326761550411346e-07, "logits/chosen": -0.6528929471969604, "logits/rejected": -0.6275583505630493, "logps/chosen": -83.62789916992188, "logps/ref_chosen": -64.33049011230469, "logps/ref_rejected": -89.87164306640625, "logps/rejected": -136.76258850097656, "loss": 0.6313, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20285803079605103, "margin_dpo/beta_margin_grad_std": 0.24381616711616516, "margin_dpo/beta_margin_mean": 2.7593541145324707, "margin_dpo/loss_margin_mean": 27.59354019165039, "margin_dpo/margin_mean": 27.59354019165039, "margin_dpo/margin_std": 27.823108673095703, "step": 350 }, { "epoch": 0.5154185022026432, "grad_norm": 45.59613800048828, "learning_rate": 2.819952656376487e-07, "logits/chosen": -0.5691178441047668, "logits/rejected": -0.5438896417617798, "logps/chosen": -77.93399810791016, "logps/ref_chosen": -60.6721305847168, "logps/ref_rejected": -101.5654296875, "logps/rejected": -152.97235107421875, "loss": 0.3518, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12643350660800934, "margin_dpo/beta_margin_grad_std": 0.19213062524795532, "margin_dpo/beta_margin_mean": 3.414506196975708, "margin_dpo/loss_margin_mean": 34.145057678222656, "margin_dpo/margin_mean": 34.14506149291992, "margin_dpo/margin_std": 24.413818359375, "step": 351 }, { "epoch": 0.5168869309838473, "grad_norm": 69.56964111328125, "learning_rate": 2.8072207266617854e-07, "logits/chosen": -0.6274293661117554, "logits/rejected": -0.5914252996444702, "logps/chosen": -88.6449203491211, "logps/ref_chosen": -70.9434585571289, "logps/ref_rejected": -76.6419677734375, "logps/rejected": -121.17511749267578, "loss": 0.5783, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20047959685325623, "margin_dpo/beta_margin_grad_std": 0.2212606817483902, "margin_dpo/beta_margin_mean": 2.683168888092041, "margin_dpo/loss_margin_mean": 26.831687927246094, "margin_dpo/margin_mean": 26.831687927246094, "margin_dpo/margin_std": 27.10396385192871, "step": 352 }, { "epoch": 0.5183553597650514, "grad_norm": 70.63945007324219, "learning_rate": 2.794480701395219e-07, "logits/chosen": -0.6379419565200806, "logits/rejected": -0.610392689704895, "logps/chosen": -78.57247924804688, "logps/ref_chosen": -58.39533996582031, "logps/ref_rejected": -80.33552551269531, "logps/rejected": -127.21942138671875, "loss": 0.7041, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2198677659034729, "margin_dpo/beta_margin_grad_std": 0.26175159215927124, "margin_dpo/beta_margin_mean": 2.6706738471984863, "margin_dpo/loss_margin_mean": 26.706737518310547, "margin_dpo/margin_mean": 26.706737518310547, "margin_dpo/margin_std": 27.502460479736328, "step": 353 }, { "epoch": 0.5198237885462555, "grad_norm": 39.9495964050293, "learning_rate": 2.781732916288303e-07, "logits/chosen": -0.6119546890258789, "logits/rejected": -0.5863783359527588, "logps/chosen": -76.71435546875, "logps/ref_chosen": -59.80299377441406, "logps/ref_rejected": -88.75750732421875, "logps/rejected": -137.52517700195312, "loss": 0.2944, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11870501935482025, "margin_dpo/beta_margin_grad_std": 0.15505337715148926, "margin_dpo/beta_margin_mean": 3.1856298446655273, "margin_dpo/loss_margin_mean": 31.85629653930664, "margin_dpo/margin_mean": 31.856294631958008, "margin_dpo/margin_std": 21.562454223632812, "step": 354 }, { "epoch": 0.5212922173274597, "grad_norm": 37.86518096923828, "learning_rate": 2.7689777072570284e-07, "logits/chosen": -0.6655494570732117, "logits/rejected": -0.6307432055473328, "logps/chosen": -70.75237274169922, "logps/ref_chosen": -54.128501892089844, "logps/ref_rejected": -82.40606689453125, "logps/rejected": -132.5486297607422, "loss": 0.3504, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1401645541191101, "margin_dpo/beta_margin_grad_std": 0.16528445482254028, "margin_dpo/beta_margin_mean": 3.351868152618408, "margin_dpo/loss_margin_mean": 33.518680572509766, "margin_dpo/margin_mean": 33.518680572509766, "margin_dpo/margin_std": 28.362560272216797, "step": 355 }, { "epoch": 0.5227606461086637, "grad_norm": 97.62612915039062, "learning_rate": 2.7562154104130176e-07, "logits/chosen": -0.6015282273292542, "logits/rejected": -0.5701065063476562, "logps/chosen": -86.73289489746094, "logps/ref_chosen": -64.67381286621094, "logps/ref_rejected": -75.89926147460938, "logps/rejected": -120.73099517822266, "loss": 0.8011, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.26123011112213135, "margin_dpo/beta_margin_grad_std": 0.2479068785905838, "margin_dpo/beta_margin_mean": 2.2772653102874756, "margin_dpo/loss_margin_mean": 22.772653579711914, "margin_dpo/margin_mean": 22.772653579711914, "margin_dpo/margin_std": 27.33060073852539, "step": 356 }, { "epoch": 0.5242290748898678, "grad_norm": 48.75430679321289, "learning_rate": 2.7434463620546594e-07, "logits/chosen": -0.6201961040496826, "logits/rejected": -0.5882294178009033, "logps/chosen": -70.5618896484375, "logps/ref_chosen": -52.725799560546875, "logps/ref_rejected": -86.84115600585938, "logps/rejected": -136.1475372314453, "loss": 0.3922, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13970120251178741, "margin_dpo/beta_margin_grad_std": 0.18796856701374054, "margin_dpo/beta_margin_mean": 3.147029399871826, "margin_dpo/loss_margin_mean": 31.470294952392578, "margin_dpo/margin_mean": 31.470294952392578, "margin_dpo/margin_std": 23.818038940429688, "step": 357 }, { "epoch": 0.5256975036710719, "grad_norm": 65.04510498046875, "learning_rate": 2.730670898658255e-07, "logits/chosen": -0.6239925622940063, "logits/rejected": -0.5800461173057556, "logps/chosen": -79.59387969970703, "logps/ref_chosen": -63.20543670654297, "logps/ref_rejected": -88.373291015625, "logps/rejected": -133.9655303955078, "loss": 0.4776, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17933997511863708, "margin_dpo/beta_margin_grad_std": 0.1934494972229004, "margin_dpo/beta_margin_mean": 2.920379161834717, "margin_dpo/loss_margin_mean": 29.20379066467285, "margin_dpo/margin_mean": 29.20378875732422, "margin_dpo/margin_std": 27.63866424560547, "step": 358 }, { "epoch": 0.527165932452276, "grad_norm": 62.99494934082031, "learning_rate": 2.717889356869146e-07, "logits/chosen": -0.5847325325012207, "logits/rejected": -0.5495982766151428, "logps/chosen": -78.43362426757812, "logps/ref_chosen": -56.370216369628906, "logps/ref_rejected": -82.17375183105469, "logps/rejected": -136.6361083984375, "loss": 0.4728, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15706944465637207, "margin_dpo/beta_margin_grad_std": 0.21352404356002808, "margin_dpo/beta_margin_mean": 3.2398953437805176, "margin_dpo/loss_margin_mean": 32.39895248413086, "margin_dpo/margin_mean": 32.39895248413086, "margin_dpo/margin_std": 27.692176818847656, "step": 359 }, { "epoch": 0.5286343612334802, "grad_norm": 44.829593658447266, "learning_rate": 2.7051020734928443e-07, "logits/chosen": -0.5741163492202759, "logits/rejected": -0.5485746264457703, "logps/chosen": -70.59879302978516, "logps/ref_chosen": -51.460384368896484, "logps/ref_rejected": -69.83892822265625, "logps/rejected": -118.67765808105469, "loss": 0.4042, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14649343490600586, "margin_dpo/beta_margin_grad_std": 0.1764645129442215, "margin_dpo/beta_margin_mean": 2.970031976699829, "margin_dpo/loss_margin_mean": 29.700321197509766, "margin_dpo/margin_mean": 29.700321197509766, "margin_dpo/margin_std": 23.251190185546875, "step": 360 }, { "epoch": 0.5301027900146843, "grad_norm": 60.9596061706543, "learning_rate": 2.6923093854861593e-07, "logits/chosen": -0.6095191240310669, "logits/rejected": -0.5885258316993713, "logps/chosen": -73.89006042480469, "logps/ref_chosen": -53.86951446533203, "logps/ref_rejected": -90.76925659179688, "logps/rejected": -139.11050415039062, "loss": 0.4981, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1759810894727707, "margin_dpo/beta_margin_grad_std": 0.20826107263565063, "margin_dpo/beta_margin_mean": 2.832070827484131, "margin_dpo/loss_margin_mean": 28.320707321166992, "margin_dpo/margin_mean": 28.320707321166992, "margin_dpo/margin_std": 25.021095275878906, "step": 361 }, { "epoch": 0.5315712187958884, "grad_norm": 54.746620178222656, "learning_rate": 2.679511629948319e-07, "logits/chosen": -0.6106045246124268, "logits/rejected": -0.5950082540512085, "logps/chosen": -78.94198608398438, "logps/ref_chosen": -58.639060974121094, "logps/ref_rejected": -105.58195495605469, "logps/rejected": -159.86471557617188, "loss": 0.4172, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14724624156951904, "margin_dpo/beta_margin_grad_std": 0.20302033424377441, "margin_dpo/beta_margin_mean": 3.3979835510253906, "margin_dpo/loss_margin_mean": 33.979835510253906, "margin_dpo/margin_mean": 33.97983169555664, "margin_dpo/margin_std": 27.78476333618164, "step": 362 }, { "epoch": 0.5330396475770925, "grad_norm": 81.81700897216797, "learning_rate": 2.6667091441120816e-07, "logits/chosen": -0.626789927482605, "logits/rejected": -0.5806140899658203, "logps/chosen": -62.119293212890625, "logps/ref_chosen": -44.558380126953125, "logps/ref_rejected": -74.69496154785156, "logps/rejected": -131.403076171875, "loss": 0.4021, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1342936009168625, "margin_dpo/beta_margin_grad_std": 0.2032202035188675, "margin_dpo/beta_margin_mean": 3.914720058441162, "margin_dpo/loss_margin_mean": 39.14720153808594, "margin_dpo/margin_mean": 39.14720153808594, "margin_dpo/margin_std": 32.773658752441406, "step": 363 }, { "epoch": 0.5345080763582967, "grad_norm": 70.45586395263672, "learning_rate": 2.6539022653348575e-07, "logits/chosen": -0.6278142929077148, "logits/rejected": -0.6232542991638184, "logps/chosen": -67.96247863769531, "logps/ref_chosen": -48.894622802734375, "logps/ref_rejected": -91.395751953125, "logps/rejected": -138.64413452148438, "loss": 0.5219, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18396377563476562, "margin_dpo/beta_margin_grad_std": 0.21280619502067566, "margin_dpo/beta_margin_mean": 2.818053722381592, "margin_dpo/loss_margin_mean": 28.180538177490234, "margin_dpo/margin_mean": 28.180538177490234, "margin_dpo/margin_std": 26.407032012939453, "step": 364 }, { "epoch": 0.5359765051395007, "grad_norm": 53.16273498535156, "learning_rate": 2.641091331089811e-07, "logits/chosen": -0.5910431146621704, "logits/rejected": -0.5750705003738403, "logps/chosen": -69.91105651855469, "logps/ref_chosen": -51.49274444580078, "logps/ref_rejected": -92.70166778564453, "logps/rejected": -138.62091064453125, "loss": 0.4635, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1818796843290329, "margin_dpo/beta_margin_grad_std": 0.18291005492210388, "margin_dpo/beta_margin_mean": 2.750092029571533, "margin_dpo/loss_margin_mean": 27.500919342041016, "margin_dpo/margin_mean": 27.500919342041016, "margin_dpo/margin_std": 25.346105575561523, "step": 365 }, { "epoch": 0.5374449339207048, "grad_norm": 41.03267288208008, "learning_rate": 2.6282766789569736e-07, "logits/chosen": -0.6280097365379333, "logits/rejected": -0.6183122992515564, "logps/chosen": -61.73809051513672, "logps/ref_chosen": -44.7205696105957, "logps/ref_rejected": -83.31040954589844, "logps/rejected": -129.70077514648438, "loss": 0.3847, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1519765853881836, "margin_dpo/beta_margin_grad_std": 0.17253069579601288, "margin_dpo/beta_margin_mean": 2.937284469604492, "margin_dpo/loss_margin_mean": 29.372844696044922, "margin_dpo/margin_mean": 29.372844696044922, "margin_dpo/margin_std": 23.400222778320312, "step": 366 }, { "epoch": 0.5389133627019089, "grad_norm": 55.751182556152344, "learning_rate": 2.615458646614349e-07, "logits/chosen": -0.5929805040359497, "logits/rejected": -0.5748361945152283, "logps/chosen": -77.30204010009766, "logps/ref_chosen": -58.405418395996094, "logps/ref_rejected": -76.75132751464844, "logps/rejected": -121.19309997558594, "loss": 0.5059, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1770220696926117, "margin_dpo/beta_margin_grad_std": 0.2071405053138733, "margin_dpo/beta_margin_mean": 2.5545148849487305, "margin_dpo/loss_margin_mean": 25.545148849487305, "margin_dpo/margin_mean": 25.545148849487305, "margin_dpo/margin_std": 22.493253707885742, "step": 367 }, { "epoch": 0.540381791483113, "grad_norm": 41.846797943115234, "learning_rate": 2.6026375718290083e-07, "logits/chosen": -0.6356101036071777, "logits/rejected": -0.6200574040412903, "logps/chosen": -61.09843444824219, "logps/ref_chosen": -44.452518463134766, "logps/ref_rejected": -98.55526733398438, "logps/rejected": -145.97296142578125, "loss": 0.3508, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13728934526443481, "margin_dpo/beta_margin_grad_std": 0.17335599660873413, "margin_dpo/beta_margin_mean": 3.0771780014038086, "margin_dpo/loss_margin_mean": 30.771780014038086, "margin_dpo/margin_mean": 30.771780014038086, "margin_dpo/margin_std": 23.21100616455078, "step": 368 }, { "epoch": 0.5418502202643172, "grad_norm": 67.84696960449219, "learning_rate": 2.589813792448196e-07, "logits/chosen": -0.6399098634719849, "logits/rejected": -0.60102778673172, "logps/chosen": -89.86668395996094, "logps/ref_chosen": -71.38150024414062, "logps/ref_rejected": -91.29582214355469, "logps/rejected": -134.7001190185547, "loss": 0.553, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19392737746238708, "margin_dpo/beta_margin_grad_std": 0.22195757925510406, "margin_dpo/beta_margin_mean": 2.4919116497039795, "margin_dpo/loss_margin_mean": 24.919116973876953, "margin_dpo/margin_mean": 24.91911506652832, "margin_dpo/margin_std": 22.250526428222656, "step": 369 }, { "epoch": 0.5433186490455213, "grad_norm": 53.89767074584961, "learning_rate": 2.5769876463904263e-07, "logits/chosen": -0.62095046043396, "logits/rejected": -0.5922361016273499, "logps/chosen": -90.54344177246094, "logps/ref_chosen": -71.60749816894531, "logps/ref_rejected": -97.25978088378906, "logps/rejected": -141.17263793945312, "loss": 0.5169, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1934899091720581, "margin_dpo/beta_margin_grad_std": 0.2069862186908722, "margin_dpo/beta_margin_mean": 2.4976892471313477, "margin_dpo/loss_margin_mean": 24.976890563964844, "margin_dpo/margin_mean": 24.976890563964844, "margin_dpo/margin_std": 21.94351577758789, "step": 370 }, { "epoch": 0.5447870778267254, "grad_norm": 65.36404418945312, "learning_rate": 2.5641594716365744e-07, "logits/chosen": -0.65543532371521, "logits/rejected": -0.6313973665237427, "logps/chosen": -89.23640441894531, "logps/ref_chosen": -69.41448974609375, "logps/ref_rejected": -99.17217254638672, "logps/rejected": -146.0745849609375, "loss": 0.6168, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19891506433486938, "margin_dpo/beta_margin_grad_std": 0.2336684763431549, "margin_dpo/beta_margin_mean": 2.7080492973327637, "margin_dpo/loss_margin_mean": 27.080493927001953, "margin_dpo/margin_mean": 27.08049201965332, "margin_dpo/margin_std": 28.510940551757812, "step": 371 }, { "epoch": 0.5462555066079295, "grad_norm": 55.08517074584961, "learning_rate": 2.551329606220976e-07, "logits/chosen": -0.6221505403518677, "logits/rejected": -0.5715365409851074, "logps/chosen": -81.12614440917969, "logps/ref_chosen": -61.8179931640625, "logps/ref_rejected": -78.53949737548828, "logps/rejected": -129.13607788085938, "loss": 0.5133, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17172211408615112, "margin_dpo/beta_margin_grad_std": 0.2211245894432068, "margin_dpo/beta_margin_mean": 3.1288440227508545, "margin_dpo/loss_margin_mean": 31.288440704345703, "margin_dpo/margin_mean": 31.288440704345703, "margin_dpo/margin_std": 29.940040588378906, "step": 372 }, { "epoch": 0.5477239353891337, "grad_norm": 59.45585250854492, "learning_rate": 2.538498388222517e-07, "logits/chosen": -0.6288785338401794, "logits/rejected": -0.5830151438713074, "logps/chosen": -85.13172912597656, "logps/ref_chosen": -64.21713256835938, "logps/ref_rejected": -85.95960998535156, "logps/rejected": -139.53817749023438, "loss": 0.4155, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1484554558992386, "margin_dpo/beta_margin_grad_std": 0.19429174065589905, "margin_dpo/beta_margin_mean": 3.266396999359131, "margin_dpo/loss_margin_mean": 32.663970947265625, "margin_dpo/margin_mean": 32.663970947265625, "margin_dpo/margin_std": 25.845104217529297, "step": 373 }, { "epoch": 0.5491923641703378, "grad_norm": 47.363040924072266, "learning_rate": 2.525666155755725e-07, "logits/chosen": -0.6621605157852173, "logits/rejected": -0.6285480856895447, "logps/chosen": -88.64697265625, "logps/ref_chosen": -70.65017700195312, "logps/ref_rejected": -93.64016723632812, "logps/rejected": -141.2034912109375, "loss": 0.4313, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1634691059589386, "margin_dpo/beta_margin_grad_std": 0.18810805678367615, "margin_dpo/beta_margin_mean": 2.9566543102264404, "margin_dpo/loss_margin_mean": 29.566543579101562, "margin_dpo/margin_mean": 29.566543579101562, "margin_dpo/margin_std": 25.27297019958496, "step": 374 }, { "epoch": 0.5506607929515418, "grad_norm": 53.13246536254883, "learning_rate": 2.512833246961859e-07, "logits/chosen": -0.5945202112197876, "logits/rejected": -0.580052375793457, "logps/chosen": -79.16407775878906, "logps/ref_chosen": -60.080223083496094, "logps/ref_rejected": -88.93830871582031, "logps/rejected": -137.2688446044922, "loss": 0.5153, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18427500128746033, "margin_dpo/beta_margin_grad_std": 0.21660040318965912, "margin_dpo/beta_margin_mean": 2.924668550491333, "margin_dpo/loss_margin_mean": 29.246685028076172, "margin_dpo/margin_mean": 29.246685028076172, "margin_dpo/margin_std": 24.21230697631836, "step": 375 }, { "epoch": 0.5521292217327459, "grad_norm": 48.868709564208984, "learning_rate": 2.5e-07, "logits/chosen": -0.5992149114608765, "logits/rejected": -0.5802311897277832, "logps/chosen": -81.73542785644531, "logps/ref_chosen": -62.660308837890625, "logps/ref_rejected": -105.526611328125, "logps/rejected": -156.72482299804688, "loss": 0.3949, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1482008844614029, "margin_dpo/beta_margin_grad_std": 0.17604166269302368, "margin_dpo/beta_margin_mean": 3.212308883666992, "margin_dpo/loss_margin_mean": 32.12308883666992, "margin_dpo/margin_mean": 32.12308883666992, "margin_dpo/margin_std": 27.578922271728516, "step": 376 }, { "epoch": 0.55359765051395, "grad_norm": 62.117244720458984, "learning_rate": 2.487166753038141e-07, "logits/chosen": -0.5650719404220581, "logits/rejected": -0.548367977142334, "logps/chosen": -76.00762939453125, "logps/ref_chosen": -54.478736877441406, "logps/ref_rejected": -98.70335388183594, "logps/rejected": -149.87831115722656, "loss": 0.5358, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18531596660614014, "margin_dpo/beta_margin_grad_std": 0.2207948863506317, "margin_dpo/beta_margin_mean": 2.9646058082580566, "margin_dpo/loss_margin_mean": 29.646059036254883, "margin_dpo/margin_mean": 29.646059036254883, "margin_dpo/margin_std": 26.692546844482422, "step": 377 }, { "epoch": 0.5550660792951542, "grad_norm": 44.018394470214844, "learning_rate": 2.4743338442442754e-07, "logits/chosen": -0.6136064529418945, "logits/rejected": -0.600831151008606, "logps/chosen": -62.556495666503906, "logps/ref_chosen": -45.02053451538086, "logps/ref_rejected": -88.0469741821289, "logps/rejected": -137.45811462402344, "loss": 0.4079, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13975557684898376, "margin_dpo/beta_margin_grad_std": 0.2044086456298828, "margin_dpo/beta_margin_mean": 3.1875176429748535, "margin_dpo/loss_margin_mean": 31.87517547607422, "margin_dpo/margin_mean": 31.87517547607422, "margin_dpo/margin_std": 25.208221435546875, "step": 378 }, { "epoch": 0.5565345080763583, "grad_norm": 55.18994903564453, "learning_rate": 2.461501611777483e-07, "logits/chosen": -0.6599289774894714, "logits/rejected": -0.6502236127853394, "logps/chosen": -72.3454818725586, "logps/ref_chosen": -53.182098388671875, "logps/ref_rejected": -114.30015563964844, "logps/rejected": -166.52252197265625, "loss": 0.4312, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1558556854724884, "margin_dpo/beta_margin_grad_std": 0.19769342243671417, "margin_dpo/beta_margin_mean": 3.305896759033203, "margin_dpo/loss_margin_mean": 33.05896759033203, "margin_dpo/margin_mean": 33.05896759033203, "margin_dpo/margin_std": 27.922954559326172, "step": 379 }, { "epoch": 0.5580029368575624, "grad_norm": 79.15997314453125, "learning_rate": 2.4486703937790243e-07, "logits/chosen": -0.5800139904022217, "logits/rejected": -0.5870028138160706, "logps/chosen": -74.11767578125, "logps/ref_chosen": -51.3530387878418, "logps/ref_rejected": -104.19169616699219, "logps/rejected": -161.47531127929688, "loss": 0.5708, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17511457204818726, "margin_dpo/beta_margin_grad_std": 0.2577556371688843, "margin_dpo/beta_margin_mean": 3.451897144317627, "margin_dpo/loss_margin_mean": 34.51897430419922, "margin_dpo/margin_mean": 34.51897430419922, "margin_dpo/margin_std": 30.276945114135742, "step": 380 }, { "epoch": 0.5594713656387665, "grad_norm": 63.70035171508789, "learning_rate": 2.435840528363426e-07, "logits/chosen": -0.5940126180648804, "logits/rejected": -0.5524269342422485, "logps/chosen": -79.10946655273438, "logps/ref_chosen": -57.80306625366211, "logps/ref_rejected": -79.21940612792969, "logps/rejected": -134.66360473632812, "loss": 0.5249, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1618008017539978, "margin_dpo/beta_margin_grad_std": 0.22977310419082642, "margin_dpo/beta_margin_mean": 3.4137802124023438, "margin_dpo/loss_margin_mean": 34.13780212402344, "margin_dpo/margin_mean": 34.13780212402344, "margin_dpo/margin_std": 30.301250457763672, "step": 381 }, { "epoch": 0.5609397944199707, "grad_norm": 55.583091735839844, "learning_rate": 2.4230123536095745e-07, "logits/chosen": -0.6578388214111328, "logits/rejected": -0.6236182451248169, "logps/chosen": -84.37736511230469, "logps/ref_chosen": -66.02030181884766, "logps/ref_rejected": -110.71015930175781, "logps/rejected": -164.35707092285156, "loss": 0.402, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13427521288394928, "margin_dpo/beta_margin_grad_std": 0.20077002048492432, "margin_dpo/beta_margin_mean": 3.528985023498535, "margin_dpo/loss_margin_mean": 35.28984832763672, "margin_dpo/margin_mean": 35.28984832763672, "margin_dpo/margin_std": 27.51814079284668, "step": 382 }, { "epoch": 0.5624082232011748, "grad_norm": 53.64909744262695, "learning_rate": 2.4101862075518037e-07, "logits/chosen": -0.621538519859314, "logits/rejected": -0.6095184087753296, "logps/chosen": -71.61962890625, "logps/ref_chosen": -50.39148712158203, "logps/ref_rejected": -93.71589660644531, "logps/rejected": -147.1136016845703, "loss": 0.3707, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1415613293647766, "margin_dpo/beta_margin_grad_std": 0.18154266476631165, "margin_dpo/beta_margin_mean": 3.2169556617736816, "margin_dpo/loss_margin_mean": 32.1695556640625, "margin_dpo/margin_mean": 32.1695556640625, "margin_dpo/margin_std": 25.299137115478516, "step": 383 }, { "epoch": 0.5638766519823789, "grad_norm": 49.92765426635742, "learning_rate": 2.397362428170992e-07, "logits/chosen": -0.6100102663040161, "logits/rejected": -0.5814231038093567, "logps/chosen": -73.15731811523438, "logps/ref_chosen": -52.046104431152344, "logps/ref_rejected": -85.76089477539062, "logps/rejected": -138.78662109375, "loss": 0.4752, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17313042283058167, "margin_dpo/beta_margin_grad_std": 0.20930971205234528, "margin_dpo/beta_margin_mean": 3.191450357437134, "margin_dpo/loss_margin_mean": 31.914501190185547, "margin_dpo/margin_mean": 31.914505004882812, "margin_dpo/margin_std": 31.282188415527344, "step": 384 }, { "epoch": 0.5653450807635829, "grad_norm": 57.458656311035156, "learning_rate": 2.3845413533856514e-07, "logits/chosen": -0.636741578578949, "logits/rejected": -0.5834276676177979, "logps/chosen": -83.70286560058594, "logps/ref_chosen": -65.55216217041016, "logps/ref_rejected": -77.82792663574219, "logps/rejected": -124.51187133789062, "loss": 0.4448, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1649201214313507, "margin_dpo/beta_margin_grad_std": 0.20112237334251404, "margin_dpo/beta_margin_mean": 2.8533244132995605, "margin_dpo/loss_margin_mean": 28.533245086669922, "margin_dpo/margin_mean": 28.533245086669922, "margin_dpo/margin_std": 21.985557556152344, "step": 385 }, { "epoch": 0.566813509544787, "grad_norm": 65.1162109375, "learning_rate": 2.3717233210430254e-07, "logits/chosen": -0.5937461853027344, "logits/rejected": -0.5639574527740479, "logps/chosen": -79.37002563476562, "logps/ref_chosen": -58.22185516357422, "logps/ref_rejected": -92.32742309570312, "logps/rejected": -147.26278686523438, "loss": 0.3465, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12259222567081451, "margin_dpo/beta_margin_grad_std": 0.18382999300956726, "margin_dpo/beta_margin_mean": 3.3787193298339844, "margin_dpo/loss_margin_mean": 33.787193298339844, "margin_dpo/margin_mean": 33.787193298339844, "margin_dpo/margin_std": 23.929697036743164, "step": 386 }, { "epoch": 0.5682819383259912, "grad_norm": 72.24935150146484, "learning_rate": 2.3589086689101889e-07, "logits/chosen": -0.6583748459815979, "logits/rejected": -0.6081060171127319, "logps/chosen": -84.46687316894531, "logps/ref_chosen": -66.41944885253906, "logps/ref_rejected": -92.16915893554688, "logps/rejected": -139.08279418945312, "loss": 0.4609, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16438427567481995, "margin_dpo/beta_margin_grad_std": 0.20714232325553894, "margin_dpo/beta_margin_mean": 2.886620044708252, "margin_dpo/loss_margin_mean": 28.866199493408203, "margin_dpo/margin_mean": 28.86620330810547, "margin_dpo/margin_std": 22.799579620361328, "step": 387 }, { "epoch": 0.5697503671071953, "grad_norm": 49.13148880004883, "learning_rate": 2.3460977346651428e-07, "logits/chosen": -0.6356014013290405, "logits/rejected": -0.6344074010848999, "logps/chosen": -70.69566345214844, "logps/ref_chosen": -50.129459381103516, "logps/ref_rejected": -104.43305969238281, "logps/rejected": -160.75782775878906, "loss": 0.3715, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13866804540157318, "margin_dpo/beta_margin_grad_std": 0.18035584688186646, "margin_dpo/beta_margin_mean": 3.5758562088012695, "margin_dpo/loss_margin_mean": 35.75856018066406, "margin_dpo/margin_mean": 35.75856018066406, "margin_dpo/margin_std": 29.00539779663086, "step": 388 }, { "epoch": 0.5712187958883994, "grad_norm": 39.877593994140625, "learning_rate": 2.3332908558879177e-07, "logits/chosen": -0.670194149017334, "logits/rejected": -0.6247744560241699, "logps/chosen": -76.81654357910156, "logps/ref_chosen": -57.906593322753906, "logps/ref_rejected": -77.91454315185547, "logps/rejected": -130.85861206054688, "loss": 0.3492, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13106092810630798, "margin_dpo/beta_margin_grad_std": 0.18352819979190826, "margin_dpo/beta_margin_mean": 3.403412103652954, "margin_dpo/loss_margin_mean": 34.03411865234375, "margin_dpo/margin_mean": 34.03411865234375, "margin_dpo/margin_std": 25.96773338317871, "step": 389 }, { "epoch": 0.5726872246696035, "grad_norm": 65.6811752319336, "learning_rate": 2.320488370051681e-07, "logits/chosen": -0.5949693322181702, "logits/rejected": -0.5706865787506104, "logps/chosen": -70.46009826660156, "logps/ref_chosen": -49.22591781616211, "logps/ref_rejected": -85.5281982421875, "logps/rejected": -137.94676208496094, "loss": 0.5262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16779480874538422, "margin_dpo/beta_margin_grad_std": 0.23410001397132874, "margin_dpo/beta_margin_mean": 3.118438243865967, "margin_dpo/loss_margin_mean": 31.18438148498535, "margin_dpo/margin_mean": 31.18438148498535, "margin_dpo/margin_std": 27.223957061767578, "step": 390 }, { "epoch": 0.5741556534508077, "grad_norm": 59.37446594238281, "learning_rate": 2.3076906145138405e-07, "logits/chosen": -0.6412761211395264, "logits/rejected": -0.6216508150100708, "logps/chosen": -86.92506408691406, "logps/ref_chosen": -64.32965087890625, "logps/ref_rejected": -86.73820495605469, "logps/rejected": -137.03587341308594, "loss": 0.5183, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18732406198978424, "margin_dpo/beta_margin_grad_std": 0.2125789225101471, "margin_dpo/beta_margin_mean": 2.7702255249023438, "margin_dpo/loss_margin_mean": 27.702255249023438, "margin_dpo/margin_mean": 27.702255249023438, "margin_dpo/margin_std": 26.598276138305664, "step": 391 }, { "epoch": 0.5756240822320118, "grad_norm": 42.18976974487305, "learning_rate": 2.294897926507156e-07, "logits/chosen": -0.6050982475280762, "logits/rejected": -0.5827013850212097, "logps/chosen": -71.68020629882812, "logps/ref_chosen": -53.50397872924805, "logps/ref_rejected": -102.34583282470703, "logps/rejected": -154.8188934326172, "loss": 0.2982, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11888954043388367, "margin_dpo/beta_margin_grad_std": 0.1541348695755005, "margin_dpo/beta_margin_mean": 3.429682493209839, "margin_dpo/loss_margin_mean": 34.29682540893555, "margin_dpo/margin_mean": 34.29682540893555, "margin_dpo/margin_std": 25.064613342285156, "step": 392 }, { "epoch": 0.5770925110132159, "grad_norm": 57.39728927612305, "learning_rate": 2.2821106431308543e-07, "logits/chosen": -0.572903037071228, "logits/rejected": -0.543228268623352, "logps/chosen": -65.63607788085938, "logps/ref_chosen": -46.473915100097656, "logps/ref_rejected": -71.96885681152344, "logps/rejected": -118.71408081054688, "loss": 0.5349, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19703662395477295, "margin_dpo/beta_margin_grad_std": 0.2085367888212204, "margin_dpo/beta_margin_mean": 2.7583065032958984, "margin_dpo/loss_margin_mean": 27.583065032958984, "margin_dpo/margin_mean": 27.583065032958984, "margin_dpo/margin_std": 27.609420776367188, "step": 393 }, { "epoch": 0.57856093979442, "grad_norm": 60.01694869995117, "learning_rate": 2.2693291013417452e-07, "logits/chosen": -0.6191369295120239, "logits/rejected": -0.5974385738372803, "logps/chosen": -71.70793151855469, "logps/ref_chosen": -52.91154479980469, "logps/ref_rejected": -90.82263946533203, "logps/rejected": -140.04214477539062, "loss": 0.577, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18772023916244507, "margin_dpo/beta_margin_grad_std": 0.22964736819267273, "margin_dpo/beta_margin_mean": 3.0423121452331543, "margin_dpo/loss_margin_mean": 30.42312240600586, "margin_dpo/margin_mean": 30.42312240600586, "margin_dpo/margin_std": 31.186908721923828, "step": 394 }, { "epoch": 0.580029368575624, "grad_norm": 45.10045623779297, "learning_rate": 2.2565536379453404e-07, "logits/chosen": -0.6933879852294922, "logits/rejected": -0.6763237714767456, "logps/chosen": -79.92889404296875, "logps/ref_chosen": -62.546112060546875, "logps/ref_rejected": -83.78262329101562, "logps/rejected": -133.17141723632812, "loss": 0.4329, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16615131497383118, "margin_dpo/beta_margin_grad_std": 0.19662132859230042, "margin_dpo/beta_margin_mean": 3.2006003856658936, "margin_dpo/loss_margin_mean": 32.006004333496094, "margin_dpo/margin_mean": 32.006004333496094, "margin_dpo/margin_std": 26.82199478149414, "step": 395 }, { "epoch": 0.5814977973568282, "grad_norm": 49.713191986083984, "learning_rate": 2.2437845895869825e-07, "logits/chosen": -0.6597648859024048, "logits/rejected": -0.6144574284553528, "logps/chosen": -88.35139465332031, "logps/ref_chosen": -68.99594116210938, "logps/ref_rejected": -88.64665985107422, "logps/rejected": -139.371337890625, "loss": 0.4051, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14218196272850037, "margin_dpo/beta_margin_grad_std": 0.207365944981575, "margin_dpo/beta_margin_mean": 3.1369237899780273, "margin_dpo/loss_margin_mean": 31.369239807128906, "margin_dpo/margin_mean": 31.369239807128906, "margin_dpo/margin_std": 23.291423797607422, "step": 396 }, { "epoch": 0.5829662261380323, "grad_norm": 45.640316009521484, "learning_rate": 2.2310222927429716e-07, "logits/chosen": -0.6247228384017944, "logits/rejected": -0.5786880254745483, "logps/chosen": -78.00132751464844, "logps/ref_chosen": -61.27716827392578, "logps/ref_rejected": -103.11612701416016, "logps/rejected": -155.62109375, "loss": 0.3668, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1343180537223816, "margin_dpo/beta_margin_grad_std": 0.19318078458309174, "margin_dpo/beta_margin_mean": 3.5780787467956543, "margin_dpo/loss_margin_mean": 35.78078842163086, "margin_dpo/margin_mean": 35.780784606933594, "margin_dpo/margin_std": 27.38970184326172, "step": 397 }, { "epoch": 0.5844346549192364, "grad_norm": 50.45404815673828, "learning_rate": 2.2182670837116972e-07, "logits/chosen": -0.6684058904647827, "logits/rejected": -0.6429616212844849, "logps/chosen": -87.1710205078125, "logps/ref_chosen": -68.15155029296875, "logps/ref_rejected": -108.52360534667969, "logps/rejected": -158.55606079101562, "loss": 0.3542, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13709759712219238, "margin_dpo/beta_margin_grad_std": 0.17260834574699402, "margin_dpo/beta_margin_mean": 3.1012983322143555, "margin_dpo/loss_margin_mean": 31.012981414794922, "margin_dpo/margin_mean": 31.012981414794922, "margin_dpo/margin_std": 25.8978328704834, "step": 398 }, { "epoch": 0.5859030837004405, "grad_norm": 56.667911529541016, "learning_rate": 2.2055192986047804e-07, "logits/chosen": -0.655005931854248, "logits/rejected": -0.5906921625137329, "logps/chosen": -77.88751220703125, "logps/ref_chosen": -60.889801025390625, "logps/ref_rejected": -77.96558380126953, "logps/rejected": -129.66696166992188, "loss": 0.4236, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14022482931613922, "margin_dpo/beta_margin_grad_std": 0.2172231674194336, "margin_dpo/beta_margin_mean": 3.470367431640625, "margin_dpo/loss_margin_mean": 34.70367431640625, "margin_dpo/margin_mean": 34.70367431640625, "margin_dpo/margin_std": 25.53824234008789, "step": 399 }, { "epoch": 0.5873715124816447, "grad_norm": 56.22902297973633, "learning_rate": 2.192779273338215e-07, "logits/chosen": -0.6668632626533508, "logits/rejected": -0.6306219100952148, "logps/chosen": -81.36758422851562, "logps/ref_chosen": -63.64359664916992, "logps/ref_rejected": -105.252685546875, "logps/rejected": -161.2772674560547, "loss": 0.4468, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1347055435180664, "margin_dpo/beta_margin_grad_std": 0.20773792266845703, "margin_dpo/beta_margin_mean": 3.830059289932251, "margin_dpo/loss_margin_mean": 38.300594329833984, "margin_dpo/margin_mean": 38.30059051513672, "margin_dpo/margin_std": 30.820228576660156, "step": 400 }, { "epoch": 0.5873715124816447, "eval_logits/chosen": -0.5647093653678894, "eval_logits/rejected": -0.5334640741348267, "eval_logps/chosen": -102.04676818847656, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -130.0719757080078, "eval_loss": 0.42134976387023926, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.2686771750450134, "eval_margin_dpo/beta_margin_grad_std": 0.2539796233177185, "eval_margin_dpo/beta_margin_mean": 2.0278308391571045, "eval_margin_dpo/loss_margin_mean": 20.27830696105957, "eval_margin_dpo/margin_mean": 20.27830696105957, "eval_margin_dpo/margin_std": 25.458209991455078, "eval_runtime": 39.9217, "eval_samples_per_second": 58.59, "eval_steps_per_second": 1.854, "step": 400 }, { "epoch": 0.5888399412628488, "grad_norm": 63.177490234375, "learning_rate": 2.1800473436235136e-07, "logits/chosen": -0.5337532758712769, "logits/rejected": -0.5123304724693298, "logps/chosen": -76.32991027832031, "logps/ref_chosen": -57.16303253173828, "logps/ref_rejected": -83.79249572753906, "logps/rejected": -132.53762817382812, "loss": 0.6636, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2089146077632904, "margin_dpo/beta_margin_grad_std": 0.2529540956020355, "margin_dpo/beta_margin_mean": 2.95782470703125, "margin_dpo/loss_margin_mean": 29.5782470703125, "margin_dpo/margin_mean": 29.5782470703125, "margin_dpo/margin_std": 31.70156478881836, "step": 401 }, { "epoch": 0.5903083700440529, "grad_norm": 25.056612014770508, "learning_rate": 2.1673238449588665e-07, "logits/chosen": -0.6563818454742432, "logits/rejected": -0.6099350452423096, "logps/chosen": -62.76488494873047, "logps/ref_chosen": -50.74037170410156, "logps/ref_rejected": -81.0460433959961, "logps/rejected": -132.43775939941406, "loss": 0.211, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.08416810631752014, "margin_dpo/beta_margin_grad_std": 0.13652239739894867, "margin_dpo/beta_margin_mean": 3.936720371246338, "margin_dpo/loss_margin_mean": 39.36720275878906, "margin_dpo/margin_mean": 39.36720275878906, "margin_dpo/margin_std": 24.1574649810791, "step": 402 }, { "epoch": 0.591776798825257, "grad_norm": 62.19951629638672, "learning_rate": 2.154609112620295e-07, "logits/chosen": -0.6823678016662598, "logits/rejected": -0.6667909622192383, "logps/chosen": -62.73232650756836, "logps/ref_chosen": -47.14731216430664, "logps/ref_rejected": -77.2666015625, "logps/rejected": -123.16378021240234, "loss": 0.5812, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1768760085105896, "margin_dpo/beta_margin_grad_std": 0.22503289580345154, "margin_dpo/beta_margin_mean": 3.0312163829803467, "margin_dpo/loss_margin_mean": 30.312164306640625, "margin_dpo/margin_mean": 30.312164306640625, "margin_dpo/margin_std": 28.440311431884766, "step": 403 }, { "epoch": 0.593245227606461, "grad_norm": 55.05302810668945, "learning_rate": 2.1419034816528218e-07, "logits/chosen": -0.6315656304359436, "logits/rejected": -0.5993084907531738, "logps/chosen": -63.717140197753906, "logps/ref_chosen": -47.875274658203125, "logps/ref_rejected": -77.15499877929688, "logps/rejected": -123.78963470458984, "loss": 0.5707, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18741407990455627, "margin_dpo/beta_margin_grad_std": 0.23549535870552063, "margin_dpo/beta_margin_mean": 3.0792763233184814, "margin_dpo/loss_margin_mean": 30.792762756347656, "margin_dpo/margin_mean": 30.792762756347656, "margin_dpo/margin_std": 29.048046112060547, "step": 404 }, { "epoch": 0.5947136563876652, "grad_norm": 68.80015563964844, "learning_rate": 2.129207286861638e-07, "logits/chosen": -0.5908911824226379, "logits/rejected": -0.5618330240249634, "logps/chosen": -84.89933776855469, "logps/ref_chosen": -65.16290283203125, "logps/ref_rejected": -87.18678283691406, "logps/rejected": -137.16378784179688, "loss": 0.5402, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18618471920490265, "margin_dpo/beta_margin_grad_std": 0.2252415120601654, "margin_dpo/beta_margin_mean": 3.024057149887085, "margin_dpo/loss_margin_mean": 30.240570068359375, "margin_dpo/margin_mean": 30.240571975708008, "margin_dpo/margin_std": 27.378923416137695, "step": 405 }, { "epoch": 0.5961820851688693, "grad_norm": 62.42378616333008, "learning_rate": 2.1165208628032861e-07, "logits/chosen": -0.633690357208252, "logits/rejected": -0.6194950342178345, "logps/chosen": -66.8834457397461, "logps/ref_chosen": -49.740814208984375, "logps/ref_rejected": -92.07862854003906, "logps/rejected": -141.23345947265625, "loss": 0.5398, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16698844730854034, "margin_dpo/beta_margin_grad_std": 0.21780461072921753, "margin_dpo/beta_margin_mean": 3.2012197971343994, "margin_dpo/loss_margin_mean": 32.0121955871582, "margin_dpo/margin_mean": 32.0121955871582, "margin_dpo/margin_std": 28.075244903564453, "step": 406 }, { "epoch": 0.5976505139500734, "grad_norm": 77.41171264648438, "learning_rate": 2.1038445437768375e-07, "logits/chosen": -0.6692545413970947, "logits/rejected": -0.6265490055084229, "logps/chosen": -72.6529541015625, "logps/ref_chosen": -56.33069610595703, "logps/ref_rejected": -77.5120849609375, "logps/rejected": -126.45954895019531, "loss": 0.6116, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1815359890460968, "margin_dpo/beta_margin_grad_std": 0.24493393301963806, "margin_dpo/beta_margin_mean": 3.2625207901000977, "margin_dpo/loss_margin_mean": 32.625205993652344, "margin_dpo/margin_mean": 32.625205993652344, "margin_dpo/margin_std": 29.34493637084961, "step": 407 }, { "epoch": 0.5991189427312775, "grad_norm": 63.60155487060547, "learning_rate": 2.0911786638150872e-07, "logits/chosen": -0.6971176862716675, "logits/rejected": -0.6451106071472168, "logps/chosen": -85.57447814941406, "logps/ref_chosen": -69.789306640625, "logps/ref_rejected": -90.09693908691406, "logps/rejected": -133.64590454101562, "loss": 0.6213, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20202887058258057, "margin_dpo/beta_margin_grad_std": 0.24348929524421692, "margin_dpo/beta_margin_mean": 2.7763803005218506, "margin_dpo/loss_margin_mean": 27.76380157470703, "margin_dpo/margin_mean": 27.76380157470703, "margin_dpo/margin_std": 27.04732894897461, "step": 408 }, { "epoch": 0.6005873715124816, "grad_norm": 49.4036750793457, "learning_rate": 2.0785235566757517e-07, "logits/chosen": -0.6140397787094116, "logits/rejected": -0.582785964012146, "logps/chosen": -84.53123474121094, "logps/ref_chosen": -67.31744384765625, "logps/ref_rejected": -84.904296875, "logps/rejected": -133.21397399902344, "loss": 0.4071, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15303273499011993, "margin_dpo/beta_margin_grad_std": 0.18726620078086853, "margin_dpo/beta_margin_mean": 3.109589099884033, "margin_dpo/loss_margin_mean": 31.095890045166016, "margin_dpo/margin_mean": 31.095890045166016, "margin_dpo/margin_std": 25.748220443725586, "step": 409 }, { "epoch": 0.6020558002936858, "grad_norm": 67.83360290527344, "learning_rate": 2.065879555832674e-07, "logits/chosen": -0.5768519043922424, "logits/rejected": -0.5736193656921387, "logps/chosen": -70.64789581298828, "logps/ref_chosen": -51.465354919433594, "logps/ref_rejected": -83.198974609375, "logps/rejected": -130.64712524414062, "loss": 0.585, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19843655824661255, "margin_dpo/beta_margin_grad_std": 0.23984801769256592, "margin_dpo/beta_margin_mean": 2.8265600204467773, "margin_dpo/loss_margin_mean": 28.265602111816406, "margin_dpo/margin_mean": 28.265600204467773, "margin_dpo/margin_std": 26.36197280883789, "step": 410 }, { "epoch": 0.6035242290748899, "grad_norm": 57.13195037841797, "learning_rate": 2.0532469944670343e-07, "logits/chosen": -0.6790816783905029, "logits/rejected": -0.6465529203414917, "logps/chosen": -71.7957992553711, "logps/ref_chosen": -52.30727005004883, "logps/ref_rejected": -80.69495391845703, "logps/rejected": -130.16925048828125, "loss": 0.5328, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18435493111610413, "margin_dpo/beta_margin_grad_std": 0.22915974259376526, "margin_dpo/beta_margin_mean": 2.9985756874084473, "margin_dpo/loss_margin_mean": 29.985755920410156, "margin_dpo/margin_mean": 29.985755920410156, "margin_dpo/margin_std": 27.710227966308594, "step": 411 }, { "epoch": 0.604992657856094, "grad_norm": 41.65260314941406, "learning_rate": 2.0406262054585738e-07, "logits/chosen": -0.7030426859855652, "logits/rejected": -0.6921846866607666, "logps/chosen": -69.07894897460938, "logps/ref_chosen": -53.144126892089844, "logps/ref_rejected": -100.06080627441406, "logps/rejected": -145.71115112304688, "loss": 0.5004, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1866704523563385, "margin_dpo/beta_margin_grad_std": 0.2087305784225464, "margin_dpo/beta_margin_mean": 2.9715518951416016, "margin_dpo/loss_margin_mean": 29.715518951416016, "margin_dpo/margin_mean": 29.715518951416016, "margin_dpo/margin_std": 27.474346160888672, "step": 412 }, { "epoch": 0.6064610866372981, "grad_norm": 57.669593811035156, "learning_rate": 2.0280175213768205e-07, "logits/chosen": -0.5741822719573975, "logits/rejected": -0.5401548147201538, "logps/chosen": -80.91738891601562, "logps/ref_chosen": -61.58196258544922, "logps/ref_rejected": -99.47340393066406, "logps/rejected": -148.91249084472656, "loss": 0.4797, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1573394238948822, "margin_dpo/beta_margin_grad_std": 0.1960790902376175, "margin_dpo/beta_margin_mean": 3.0103673934936523, "margin_dpo/loss_margin_mean": 30.10367202758789, "margin_dpo/margin_mean": 30.10367202758789, "margin_dpo/margin_std": 25.22928237915039, "step": 413 }, { "epoch": 0.6079295154185022, "grad_norm": 49.89177322387695, "learning_rate": 2.0154212744723247e-07, "logits/chosen": -0.6295123100280762, "logits/rejected": -0.5924926996231079, "logps/chosen": -62.89250183105469, "logps/ref_chosen": -46.63148880004883, "logps/ref_rejected": -87.64652252197266, "logps/rejected": -139.89688110351562, "loss": 0.3669, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13251307606697083, "margin_dpo/beta_margin_grad_std": 0.18466657400131226, "margin_dpo/beta_margin_mean": 3.598933219909668, "margin_dpo/loss_margin_mean": 35.98933410644531, "margin_dpo/margin_mean": 35.98933410644531, "margin_dpo/margin_std": 25.766937255859375, "step": 414 }, { "epoch": 0.6093979441997063, "grad_norm": 44.46585464477539, "learning_rate": 2.002837796667909e-07, "logits/chosen": -0.6287680268287659, "logits/rejected": -0.6043534278869629, "logps/chosen": -95.65867614746094, "logps/ref_chosen": -78.6182861328125, "logps/ref_rejected": -100.47752380371094, "logps/rejected": -147.51513671875, "loss": 0.3992, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15828433632850647, "margin_dpo/beta_margin_grad_std": 0.17664587497711182, "margin_dpo/beta_margin_mean": 2.9997239112854004, "margin_dpo/loss_margin_mean": 29.997238159179688, "margin_dpo/margin_mean": 29.997238159179688, "margin_dpo/margin_std": 25.054841995239258, "step": 415 }, { "epoch": 0.6108663729809104, "grad_norm": 48.96870040893555, "learning_rate": 1.990267419549914e-07, "logits/chosen": -0.629509449005127, "logits/rejected": -0.5954192876815796, "logps/chosen": -75.95622253417969, "logps/ref_chosen": -58.27912521362305, "logps/ref_rejected": -90.56871795654297, "logps/rejected": -145.17367553710938, "loss": 0.384, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13312619924545288, "margin_dpo/beta_margin_grad_std": 0.18159282207489014, "margin_dpo/beta_margin_mean": 3.692786693572998, "margin_dpo/loss_margin_mean": 36.92786407470703, "margin_dpo/margin_mean": 36.9278678894043, "margin_dpo/margin_std": 27.910099029541016, "step": 416 }, { "epoch": 0.6123348017621145, "grad_norm": 34.49977493286133, "learning_rate": 1.9777104743594686e-07, "logits/chosen": -0.6395463943481445, "logits/rejected": -0.5739086866378784, "logps/chosen": -66.991455078125, "logps/ref_chosen": -50.1987190246582, "logps/ref_rejected": -68.15184020996094, "logps/rejected": -120.10306549072266, "loss": 0.3145, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12301211804151535, "margin_dpo/beta_margin_grad_std": 0.17249628901481628, "margin_dpo/beta_margin_mean": 3.5158486366271973, "margin_dpo/loss_margin_mean": 35.158485412597656, "margin_dpo/margin_mean": 35.158485412597656, "margin_dpo/margin_std": 23.311870574951172, "step": 417 }, { "epoch": 0.6138032305433186, "grad_norm": 66.27497100830078, "learning_rate": 1.965167291983757e-07, "logits/chosen": -0.6523764133453369, "logits/rejected": -0.5884617567062378, "logps/chosen": -99.4321060180664, "logps/ref_chosen": -81.97846984863281, "logps/ref_rejected": -104.69148254394531, "logps/rejected": -156.43402099609375, "loss": 0.566, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1649475246667862, "margin_dpo/beta_margin_grad_std": 0.23172861337661743, "margin_dpo/beta_margin_mean": 3.4288902282714844, "margin_dpo/loss_margin_mean": 34.288902282714844, "margin_dpo/margin_mean": 34.288902282714844, "margin_dpo/margin_std": 31.480552673339844, "step": 418 }, { "epoch": 0.6152716593245228, "grad_norm": 47.13197708129883, "learning_rate": 1.9526382029472988e-07, "logits/chosen": -0.6016639471054077, "logits/rejected": -0.5594383478164673, "logps/chosen": -70.49934387207031, "logps/ref_chosen": -52.948646545410156, "logps/ref_rejected": -91.58309936523438, "logps/rejected": -143.46878051757812, "loss": 0.3025, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11779798567295074, "margin_dpo/beta_margin_grad_std": 0.15797148644924164, "margin_dpo/beta_margin_mean": 3.433500289916992, "margin_dpo/loss_margin_mean": 34.33500289916992, "margin_dpo/margin_mean": 34.33500289916992, "margin_dpo/margin_std": 24.259674072265625, "step": 419 }, { "epoch": 0.6167400881057269, "grad_norm": 63.46165466308594, "learning_rate": 1.9401235374032425e-07, "logits/chosen": -0.6578436493873596, "logits/rejected": -0.579310417175293, "logps/chosen": -96.29592895507812, "logps/ref_chosen": -77.7699203491211, "logps/ref_rejected": -69.31985473632812, "logps/rejected": -121.10002899169922, "loss": 0.4568, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1476997584104538, "margin_dpo/beta_margin_grad_std": 0.207474946975708, "margin_dpo/beta_margin_mean": 3.3254165649414062, "margin_dpo/loss_margin_mean": 33.25416564941406, "margin_dpo/margin_mean": 33.25416564941406, "margin_dpo/margin_std": 27.482261657714844, "step": 420 }, { "epoch": 0.618208516886931, "grad_norm": 75.73670959472656, "learning_rate": 1.9276236251246653e-07, "logits/chosen": -0.6206883192062378, "logits/rejected": -0.5857428908348083, "logps/chosen": -74.22445678710938, "logps/ref_chosen": -53.765865325927734, "logps/ref_rejected": -89.28144836425781, "logps/rejected": -137.7203826904297, "loss": 0.6307, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1979384869337082, "margin_dpo/beta_margin_grad_std": 0.2501598596572876, "margin_dpo/beta_margin_mean": 2.798034429550171, "margin_dpo/loss_margin_mean": 27.980342864990234, "margin_dpo/margin_mean": 27.980342864990234, "margin_dpo/margin_std": 26.982437133789062, "step": 421 }, { "epoch": 0.6196769456681351, "grad_norm": 69.67945098876953, "learning_rate": 1.9151387954958792e-07, "logits/chosen": -0.6548997163772583, "logits/rejected": -0.6131415367126465, "logps/chosen": -89.59654998779297, "logps/ref_chosen": -68.6337661743164, "logps/ref_rejected": -87.86351013183594, "logps/rejected": -139.065185546875, "loss": 0.5739, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18962985277175903, "margin_dpo/beta_margin_grad_std": 0.246720552444458, "margin_dpo/beta_margin_mean": 3.0238897800445557, "margin_dpo/loss_margin_mean": 30.2388973236084, "margin_dpo/margin_mean": 30.2388973236084, "margin_dpo/margin_std": 28.849193572998047, "step": 422 }, { "epoch": 0.6211453744493393, "grad_norm": 73.22090148925781, "learning_rate": 1.902669377503756e-07, "logits/chosen": -0.6237994432449341, "logits/rejected": -0.6053036451339722, "logps/chosen": -74.4217529296875, "logps/ref_chosen": -54.99030303955078, "logps/ref_rejected": -86.30654907226562, "logps/rejected": -137.05184936523438, "loss": 0.5645, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18330860137939453, "margin_dpo/beta_margin_grad_std": 0.2365390956401825, "margin_dpo/beta_margin_mean": 3.1313838958740234, "margin_dpo/loss_margin_mean": 31.313838958740234, "margin_dpo/margin_mean": 31.313838958740234, "margin_dpo/margin_std": 29.860960006713867, "step": 423 }, { "epoch": 0.6226138032305433, "grad_norm": 49.532413482666016, "learning_rate": 1.890215699729057e-07, "logits/chosen": -0.6332702040672302, "logits/rejected": -0.5856061577796936, "logps/chosen": -73.72906494140625, "logps/ref_chosen": -56.01191711425781, "logps/ref_rejected": -66.47896575927734, "logps/rejected": -118.37336730957031, "loss": 0.4253, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15173882246017456, "margin_dpo/beta_margin_grad_std": 0.20923829078674316, "margin_dpo/beta_margin_mean": 3.417725086212158, "margin_dpo/loss_margin_mean": 34.17725372314453, "margin_dpo/margin_mean": 34.17725372314453, "margin_dpo/margin_std": 30.512378692626953, "step": 424 }, { "epoch": 0.6240822320117474, "grad_norm": 60.32538604736328, "learning_rate": 1.8777780903377732e-07, "logits/chosen": -0.631500780582428, "logits/rejected": -0.6203855872154236, "logps/chosen": -65.76054382324219, "logps/ref_chosen": -46.868995666503906, "logps/ref_rejected": -95.92545318603516, "logps/rejected": -145.38247680664062, "loss": 0.5126, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16552463173866272, "margin_dpo/beta_margin_grad_std": 0.224747896194458, "margin_dpo/beta_margin_mean": 3.0565476417541504, "margin_dpo/loss_margin_mean": 30.565475463867188, "margin_dpo/margin_mean": 30.56547737121582, "margin_dpo/margin_std": 24.83243179321289, "step": 425 }, { "epoch": 0.6255506607929515, "grad_norm": 77.01701354980469, "learning_rate": 1.8653568770724803e-07, "logits/chosen": -0.6216360330581665, "logits/rejected": -0.5673133730888367, "logps/chosen": -93.9437255859375, "logps/ref_chosen": -76.58354187011719, "logps/ref_rejected": -81.26658630371094, "logps/rejected": -132.55589294433594, "loss": 0.4423, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13299913704395294, "margin_dpo/beta_margin_grad_std": 0.21135151386260986, "margin_dpo/beta_margin_mean": 3.392913341522217, "margin_dpo/loss_margin_mean": 33.929134368896484, "margin_dpo/margin_mean": 33.929134368896484, "margin_dpo/margin_std": 26.49199867248535, "step": 426 }, { "epoch": 0.6270190895741556, "grad_norm": 56.73555374145508, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -0.6535402536392212, "logits/rejected": -0.5980893969535828, "logps/chosen": -81.8448486328125, "logps/ref_chosen": -64.8538818359375, "logps/ref_rejected": -78.56600952148438, "logps/rejected": -120.1833267211914, "loss": 0.5899, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1911955028772354, "margin_dpo/beta_margin_grad_std": 0.20332689583301544, "margin_dpo/beta_margin_mean": 2.4626340866088867, "margin_dpo/loss_margin_mean": 24.626338958740234, "margin_dpo/margin_mean": 24.626338958740234, "margin_dpo/margin_std": 23.466392517089844, "step": 427 }, { "epoch": 0.6284875183553598, "grad_norm": 43.91977310180664, "learning_rate": 1.8405649477212697e-07, "logits/chosen": -0.626772403717041, "logits/rejected": -0.5905691385269165, "logps/chosen": -83.34781646728516, "logps/ref_chosen": -62.63666534423828, "logps/ref_rejected": -103.28182220458984, "logps/rejected": -159.70887756347656, "loss": 0.3243, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12279026210308075, "margin_dpo/beta_margin_grad_std": 0.17204001545906067, "margin_dpo/beta_margin_mean": 3.571589946746826, "margin_dpo/loss_margin_mean": 35.71589660644531, "margin_dpo/margin_mean": 35.71589660644531, "margin_dpo/margin_std": 27.326576232910156, "step": 428 }, { "epoch": 0.6299559471365639, "grad_norm": 64.35308837890625, "learning_rate": 1.828194884925749e-07, "logits/chosen": -0.5859851837158203, "logits/rejected": -0.5243451595306396, "logps/chosen": -101.322509765625, "logps/ref_chosen": -81.23401641845703, "logps/ref_rejected": -91.79493713378906, "logps/rejected": -141.50485229492188, "loss": 0.5977, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19384217262268066, "margin_dpo/beta_margin_grad_std": 0.23802496492862701, "margin_dpo/beta_margin_mean": 2.9621434211730957, "margin_dpo/loss_margin_mean": 29.621435165405273, "margin_dpo/margin_mean": 29.62143325805664, "margin_dpo/margin_std": 28.574806213378906, "step": 429 }, { "epoch": 0.631424375917768, "grad_norm": 52.39344787597656, "learning_rate": 1.8158425248197928e-07, "logits/chosen": -0.5943987369537354, "logits/rejected": -0.5758558511734009, "logps/chosen": -79.220458984375, "logps/ref_chosen": -60.92032241821289, "logps/ref_rejected": -104.42280578613281, "logps/rejected": -153.45037841796875, "loss": 0.4773, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16452635824680328, "margin_dpo/beta_margin_grad_std": 0.22081297636032104, "margin_dpo/beta_margin_mean": 3.0727434158325195, "margin_dpo/loss_margin_mean": 30.727432250976562, "margin_dpo/margin_mean": 30.727432250976562, "margin_dpo/margin_std": 26.309518814086914, "step": 430 }, { "epoch": 0.6328928046989721, "grad_norm": 44.553733825683594, "learning_rate": 1.8035081928995788e-07, "logits/chosen": -0.5974197387695312, "logits/rejected": -0.5811679363250732, "logps/chosen": -76.02676391601562, "logps/ref_chosen": -57.348751068115234, "logps/ref_rejected": -92.84022521972656, "logps/rejected": -146.17950439453125, "loss": 0.3371, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13372743129730225, "margin_dpo/beta_margin_grad_std": 0.17146742343902588, "margin_dpo/beta_margin_mean": 3.4661264419555664, "margin_dpo/loss_margin_mean": 34.66126251220703, "margin_dpo/margin_mean": 34.66126251220703, "margin_dpo/margin_std": 26.27811050415039, "step": 431 }, { "epoch": 0.6343612334801763, "grad_norm": 57.066585540771484, "learning_rate": 1.791192214186223e-07, "logits/chosen": -0.551721453666687, "logits/rejected": -0.5079036951065063, "logps/chosen": -89.14061737060547, "logps/ref_chosen": -71.07479095458984, "logps/ref_rejected": -98.57952880859375, "logps/rejected": -149.09951782226562, "loss": 0.4364, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14938993752002716, "margin_dpo/beta_margin_grad_std": 0.20730724930763245, "margin_dpo/beta_margin_mean": 3.2454161643981934, "margin_dpo/loss_margin_mean": 32.45416259765625, "margin_dpo/margin_mean": 32.45416259765625, "margin_dpo/margin_std": 27.234264373779297, "step": 432 }, { "epoch": 0.6358296622613803, "grad_norm": 72.30256652832031, "learning_rate": 1.7788949132172193e-07, "logits/chosen": -0.6330820322036743, "logits/rejected": -0.6014422178268433, "logps/chosen": -81.89974975585938, "logps/ref_chosen": -58.273193359375, "logps/ref_rejected": -95.95089721679688, "logps/rejected": -148.0189208984375, "loss": 0.5896, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1933947056531906, "margin_dpo/beta_margin_grad_std": 0.23659807443618774, "margin_dpo/beta_margin_mean": 2.844146966934204, "margin_dpo/loss_margin_mean": 28.441471099853516, "margin_dpo/margin_mean": 28.441471099853516, "margin_dpo/margin_std": 26.49103546142578, "step": 433 }, { "epoch": 0.6372980910425844, "grad_norm": 48.833492279052734, "learning_rate": 1.7666166140378853e-07, "logits/chosen": -0.6459417343139648, "logits/rejected": -0.6003463864326477, "logps/chosen": -79.62370300292969, "logps/ref_chosen": -61.97370147705078, "logps/ref_rejected": -78.49861145019531, "logps/rejected": -125.43734741210938, "loss": 0.4262, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1577899158000946, "margin_dpo/beta_margin_grad_std": 0.19978675246238708, "margin_dpo/beta_margin_mean": 2.9288740158081055, "margin_dpo/loss_margin_mean": 29.288738250732422, "margin_dpo/margin_mean": 29.288738250732422, "margin_dpo/margin_std": 24.996349334716797, "step": 434 }, { "epoch": 0.6387665198237885, "grad_norm": 64.77494812011719, "learning_rate": 1.7543576401928218e-07, "logits/chosen": -0.6737290620803833, "logits/rejected": -0.6396021842956543, "logps/chosen": -69.81366729736328, "logps/ref_chosen": -51.502052307128906, "logps/ref_rejected": -87.56689453125, "logps/rejected": -138.3524169921875, "loss": 0.5095, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16580714285373688, "margin_dpo/beta_margin_grad_std": 0.20909518003463745, "margin_dpo/beta_margin_mean": 3.247391700744629, "margin_dpo/loss_margin_mean": 32.473915100097656, "margin_dpo/margin_mean": 32.473915100097656, "margin_dpo/margin_std": 29.528972625732422, "step": 435 }, { "epoch": 0.6402349486049926, "grad_norm": 40.28781509399414, "learning_rate": 1.742118314717391e-07, "logits/chosen": -0.6202067136764526, "logits/rejected": -0.5589362978935242, "logps/chosen": -89.22311401367188, "logps/ref_chosen": -71.40371704101562, "logps/ref_rejected": -82.72775268554688, "logps/rejected": -132.42782592773438, "loss": 0.3515, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13569772243499756, "margin_dpo/beta_margin_grad_std": 0.17755961418151855, "margin_dpo/beta_margin_mean": 3.1880667209625244, "margin_dpo/loss_margin_mean": 31.880666732788086, "margin_dpo/margin_mean": 31.880664825439453, "margin_dpo/margin_std": 24.377714157104492, "step": 436 }, { "epoch": 0.6417033773861968, "grad_norm": 51.87274932861328, "learning_rate": 1.7298989601292036e-07, "logits/chosen": -0.6447381973266602, "logits/rejected": -0.6036201119422913, "logps/chosen": -82.27588653564453, "logps/ref_chosen": -64.7442626953125, "logps/ref_rejected": -82.04356384277344, "logps/rejected": -127.82572937011719, "loss": 0.5283, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17866836488246918, "margin_dpo/beta_margin_grad_std": 0.217972993850708, "margin_dpo/beta_margin_mean": 2.8250551223754883, "margin_dpo/loss_margin_mean": 28.25054931640625, "margin_dpo/margin_mean": 28.25054931640625, "margin_dpo/margin_std": 23.456018447875977, "step": 437 }, { "epoch": 0.6431718061674009, "grad_norm": 64.63465118408203, "learning_rate": 1.7176998984196144e-07, "logits/chosen": -0.6529127359390259, "logits/rejected": -0.5783262848854065, "logps/chosen": -78.42019653320312, "logps/ref_chosen": -59.0186653137207, "logps/ref_rejected": -83.07682037353516, "logps/rejected": -136.8445587158203, "loss": 0.3724, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13743507862091064, "margin_dpo/beta_margin_grad_std": 0.18105177581310272, "margin_dpo/beta_margin_mean": 3.4366211891174316, "margin_dpo/loss_margin_mean": 34.3662109375, "margin_dpo/margin_mean": 34.3662109375, "margin_dpo/margin_std": 26.907875061035156, "step": 438 }, { "epoch": 0.644640234948605, "grad_norm": 65.7437744140625, "learning_rate": 1.7055214510452458e-07, "logits/chosen": -0.6286749243736267, "logits/rejected": -0.607205867767334, "logps/chosen": -77.5360107421875, "logps/ref_chosen": -53.784080505371094, "logps/ref_rejected": -83.98545837402344, "logps/rejected": -134.8016357421875, "loss": 0.5171, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18228086829185486, "margin_dpo/beta_margin_grad_std": 0.19927145540714264, "margin_dpo/beta_margin_mean": 2.7064239978790283, "margin_dpo/loss_margin_mean": 27.064239501953125, "margin_dpo/margin_mean": 27.064241409301758, "margin_dpo/margin_std": 23.722930908203125, "step": 439 }, { "epoch": 0.6461086637298091, "grad_norm": 95.62813568115234, "learning_rate": 1.6933639389195134e-07, "logits/chosen": -0.6431401968002319, "logits/rejected": -0.6009776592254639, "logps/chosen": -97.17742919921875, "logps/ref_chosen": -78.56671905517578, "logps/ref_rejected": -96.49775695800781, "logps/rejected": -140.99290466308594, "loss": 0.6656, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21006713807582855, "margin_dpo/beta_margin_grad_std": 0.24986517429351807, "margin_dpo/beta_margin_mean": 2.5884432792663574, "margin_dpo/loss_margin_mean": 25.884429931640625, "margin_dpo/margin_mean": 25.884429931640625, "margin_dpo/margin_std": 27.127971649169922, "step": 440 }, { "epoch": 0.6475770925110133, "grad_norm": 52.33854293823242, "learning_rate": 1.681227682404166e-07, "logits/chosen": -0.587798535823822, "logits/rejected": -0.5523707866668701, "logps/chosen": -80.9710693359375, "logps/ref_chosen": -60.824440002441406, "logps/ref_rejected": -96.47080993652344, "logps/rejected": -147.42752075195312, "loss": 0.4425, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1385086476802826, "margin_dpo/beta_margin_grad_std": 0.18967705965042114, "margin_dpo/beta_margin_mean": 3.0810084342956543, "margin_dpo/loss_margin_mean": 30.81008529663086, "margin_dpo/margin_mean": 30.81008529663086, "margin_dpo/margin_std": 23.786081314086914, "step": 441 }, { "epoch": 0.6490455212922174, "grad_norm": 36.203887939453125, "learning_rate": 1.669113001300851e-07, "logits/chosen": -0.5871816873550415, "logits/rejected": -0.549630343914032, "logps/chosen": -65.28014373779297, "logps/ref_chosen": -47.01121520996094, "logps/ref_rejected": -76.53926086425781, "logps/rejected": -132.68634033203125, "loss": 0.2789, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10812409967184067, "margin_dpo/beta_margin_grad_std": 0.13728050887584686, "margin_dpo/beta_margin_mean": 3.787814140319824, "margin_dpo/loss_margin_mean": 37.878135681152344, "margin_dpo/margin_mean": 37.878135681152344, "margin_dpo/margin_std": 26.232383728027344, "step": 442 }, { "epoch": 0.6505139500734214, "grad_norm": 81.11394500732422, "learning_rate": 1.6570202148426815e-07, "logits/chosen": -0.6177343130111694, "logits/rejected": -0.5820919275283813, "logps/chosen": -93.88763427734375, "logps/ref_chosen": -71.27301788330078, "logps/ref_rejected": -86.679931640625, "logps/rejected": -138.04800415039062, "loss": 0.6542, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19925275444984436, "margin_dpo/beta_margin_grad_std": 0.2632359564304352, "margin_dpo/beta_margin_mean": 2.8753466606140137, "margin_dpo/loss_margin_mean": 28.75346565246582, "margin_dpo/margin_mean": 28.753463745117188, "margin_dpo/margin_std": 27.78663444519043, "step": 443 }, { "epoch": 0.6519823788546255, "grad_norm": 47.98015594482422, "learning_rate": 1.6449496416858282e-07, "logits/chosen": -0.593714714050293, "logits/rejected": -0.5685232877731323, "logps/chosen": -77.04825592041016, "logps/ref_chosen": -57.213706970214844, "logps/ref_rejected": -97.25489044189453, "logps/rejected": -151.35964965820312, "loss": 0.4406, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1492740362882614, "margin_dpo/beta_margin_grad_std": 0.21126282215118408, "margin_dpo/beta_margin_mean": 3.42702054977417, "margin_dpo/loss_margin_mean": 34.27020263671875, "margin_dpo/margin_mean": 34.27020263671875, "margin_dpo/margin_std": 28.456218719482422, "step": 444 }, { "epoch": 0.6534508076358296, "grad_norm": 63.52720260620117, "learning_rate": 1.6329015999011182e-07, "logits/chosen": -0.5531260967254639, "logits/rejected": -0.5164097547531128, "logps/chosen": -84.41445922851562, "logps/ref_chosen": -67.29979705810547, "logps/ref_rejected": -92.68267822265625, "logps/rejected": -141.507080078125, "loss": 0.4692, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1679629534482956, "margin_dpo/beta_margin_grad_std": 0.21126939356327057, "margin_dpo/beta_margin_mean": 3.170973300933838, "margin_dpo/loss_margin_mean": 31.709733963012695, "margin_dpo/margin_mean": 31.709733963012695, "margin_dpo/margin_std": 27.622833251953125, "step": 445 }, { "epoch": 0.6549192364170338, "grad_norm": 50.25477600097656, "learning_rate": 1.6208764069656578e-07, "logits/chosen": -0.6113117933273315, "logits/rejected": -0.5910245776176453, "logps/chosen": -76.8818359375, "logps/ref_chosen": -59.098487854003906, "logps/ref_rejected": -101.26419067382812, "logps/rejected": -149.34832763671875, "loss": 0.4369, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16404861211776733, "margin_dpo/beta_margin_grad_std": 0.18896964192390442, "margin_dpo/beta_margin_mean": 3.030078887939453, "margin_dpo/loss_margin_mean": 30.30078887939453, "margin_dpo/margin_mean": 30.300785064697266, "margin_dpo/margin_std": 26.338363647460938, "step": 446 }, { "epoch": 0.6563876651982379, "grad_norm": 50.45421600341797, "learning_rate": 1.608874379754465e-07, "logits/chosen": -0.636214017868042, "logits/rejected": -0.6372050046920776, "logps/chosen": -76.49604797363281, "logps/ref_chosen": -56.07533264160156, "logps/ref_rejected": -98.69475555419922, "logps/rejected": -151.05020141601562, "loss": 0.4496, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16537348926067352, "margin_dpo/beta_margin_grad_std": 0.20752978324890137, "margin_dpo/beta_margin_mean": 3.193472385406494, "margin_dpo/loss_margin_mean": 31.934722900390625, "margin_dpo/margin_mean": 31.934722900390625, "margin_dpo/margin_std": 28.57367515563965, "step": 447 }, { "epoch": 0.657856093979442, "grad_norm": 49.00764846801758, "learning_rate": 1.5968958345321177e-07, "logits/chosen": -0.618561863899231, "logits/rejected": -0.6021959185600281, "logps/chosen": -81.1510238647461, "logps/ref_chosen": -60.00384521484375, "logps/ref_rejected": -102.26465606689453, "logps/rejected": -155.61978149414062, "loss": 0.3919, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13892757892608643, "margin_dpo/beta_margin_grad_std": 0.17518764734268188, "margin_dpo/beta_margin_mean": 3.220794439315796, "margin_dpo/loss_margin_mean": 32.207942962646484, "margin_dpo/margin_mean": 32.207942962646484, "margin_dpo/margin_std": 25.56855010986328, "step": 448 }, { "epoch": 0.6593245227606461, "grad_norm": 81.82498931884766, "learning_rate": 1.584941086944423e-07, "logits/chosen": -0.6117278337478638, "logits/rejected": -0.5685479640960693, "logps/chosen": -89.69844055175781, "logps/ref_chosen": -67.52661895751953, "logps/ref_rejected": -88.59690856933594, "logps/rejected": -142.21090698242188, "loss": 0.5996, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17296102643013, "margin_dpo/beta_margin_grad_std": 0.23669497668743134, "margin_dpo/beta_margin_mean": 3.1442177295684814, "margin_dpo/loss_margin_mean": 31.442176818847656, "margin_dpo/margin_mean": 31.442176818847656, "margin_dpo/margin_std": 30.16796875, "step": 449 }, { "epoch": 0.6607929515418502, "grad_norm": 41.05678176879883, "learning_rate": 1.573010452010098e-07, "logits/chosen": -0.6542295217514038, "logits/rejected": -0.6270924806594849, "logps/chosen": -73.37232971191406, "logps/ref_chosen": -57.108116149902344, "logps/ref_rejected": -102.75494384765625, "logps/rejected": -153.49468994140625, "loss": 0.3193, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12820908427238464, "margin_dpo/beta_margin_grad_std": 0.16310177743434906, "margin_dpo/beta_margin_mean": 3.4475526809692383, "margin_dpo/loss_margin_mean": 34.47552490234375, "margin_dpo/margin_mean": 34.47552490234375, "margin_dpo/margin_std": 25.831031799316406, "step": 450 }, { "epoch": 0.6622613803230544, "grad_norm": 75.21393585205078, "learning_rate": 1.5611042441124687e-07, "logits/chosen": -0.643078625202179, "logits/rejected": -0.5938763618469238, "logps/chosen": -80.2492904663086, "logps/ref_chosen": -58.46883010864258, "logps/ref_rejected": -72.92941284179688, "logps/rejected": -124.17213439941406, "loss": 0.553, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17004984617233276, "margin_dpo/beta_margin_grad_std": 0.22876521944999695, "margin_dpo/beta_margin_mean": 2.946226119995117, "margin_dpo/loss_margin_mean": 29.46225929260254, "margin_dpo/margin_mean": 29.46225929260254, "margin_dpo/margin_std": 25.89090347290039, "step": 451 }, { "epoch": 0.6637298091042585, "grad_norm": 33.82390213012695, "learning_rate": 1.549222776991186e-07, "logits/chosen": -0.5697954297065735, "logits/rejected": -0.5710628628730774, "logps/chosen": -66.59547424316406, "logps/ref_chosen": -50.39055252075195, "logps/ref_rejected": -97.77143096923828, "logps/rejected": -144.08883666992188, "loss": 0.2885, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12178364396095276, "margin_dpo/beta_margin_grad_std": 0.13007774949073792, "margin_dpo/beta_margin_mean": 3.0112478733062744, "margin_dpo/loss_margin_mean": 30.11248016357422, "margin_dpo/margin_mean": 30.11248016357422, "margin_dpo/margin_std": 22.0058536529541, "step": 452 }, { "epoch": 0.6651982378854625, "grad_norm": 49.218753814697266, "learning_rate": 1.5373663637339584e-07, "logits/chosen": -0.6415982246398926, "logits/rejected": -0.5901994705200195, "logps/chosen": -77.15182495117188, "logps/ref_chosen": -57.71485137939453, "logps/ref_rejected": -82.20741271972656, "logps/rejected": -130.7767333984375, "loss": 0.4672, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1702961027622223, "margin_dpo/beta_margin_grad_std": 0.20113880932331085, "margin_dpo/beta_margin_mean": 2.9132347106933594, "margin_dpo/loss_margin_mean": 29.13234519958496, "margin_dpo/margin_mean": 29.132347106933594, "margin_dpo/margin_std": 25.473758697509766, "step": 453 }, { "epoch": 0.6666666666666666, "grad_norm": 56.509586334228516, "learning_rate": 1.5255353167683017e-07, "logits/chosen": -0.6178318858146667, "logits/rejected": -0.5745600461959839, "logps/chosen": -81.63310241699219, "logps/ref_chosen": -60.945648193359375, "logps/ref_rejected": -84.9507827758789, "logps/rejected": -138.04898071289062, "loss": 0.3975, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14186511933803558, "margin_dpo/beta_margin_grad_std": 0.19594962894916534, "margin_dpo/beta_margin_mean": 3.2410740852355957, "margin_dpo/loss_margin_mean": 32.41073989868164, "margin_dpo/margin_mean": 32.41073989868164, "margin_dpo/margin_std": 25.818143844604492, "step": 454 }, { "epoch": 0.6681350954478708, "grad_norm": 45.265987396240234, "learning_rate": 1.5137299478533064e-07, "logits/chosen": -0.6223077774047852, "logits/rejected": -0.5953476428985596, "logps/chosen": -65.11666870117188, "logps/ref_chosen": -44.88671112060547, "logps/ref_rejected": -115.30147552490234, "logps/rejected": -172.906982421875, "loss": 0.3707, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1233278438448906, "margin_dpo/beta_margin_grad_std": 0.19460362195968628, "margin_dpo/beta_margin_mean": 3.7375543117523193, "margin_dpo/loss_margin_mean": 37.37554168701172, "margin_dpo/margin_mean": 37.37554168701172, "margin_dpo/margin_std": 26.588571548461914, "step": 455 }, { "epoch": 0.6696035242290749, "grad_norm": 51.1346321105957, "learning_rate": 1.5019505680714232e-07, "logits/chosen": -0.6171753406524658, "logits/rejected": -0.6144955158233643, "logps/chosen": -74.41389465332031, "logps/ref_chosen": -57.036781311035156, "logps/ref_rejected": -105.21783447265625, "logps/rejected": -160.3103790283203, "loss": 0.3541, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13141584396362305, "margin_dpo/beta_margin_grad_std": 0.1724250167608261, "margin_dpo/beta_margin_mean": 3.771542549133301, "margin_dpo/loss_margin_mean": 37.715423583984375, "margin_dpo/margin_mean": 37.715423583984375, "margin_dpo/margin_std": 28.47699737548828, "step": 456 }, { "epoch": 0.671071953010279, "grad_norm": 58.4116096496582, "learning_rate": 1.4901974878202627e-07, "logits/chosen": -0.5911962985992432, "logits/rejected": -0.5620957612991333, "logps/chosen": -72.8665542602539, "logps/ref_chosen": -54.24253845214844, "logps/ref_rejected": -85.10956573486328, "logps/rejected": -136.763916015625, "loss": 0.3892, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13140106201171875, "margin_dpo/beta_margin_grad_std": 0.19598211348056793, "margin_dpo/beta_margin_mean": 3.3030338287353516, "margin_dpo/loss_margin_mean": 33.030338287353516, "margin_dpo/margin_mean": 33.030338287353516, "margin_dpo/margin_std": 24.58535385131836, "step": 457 }, { "epoch": 0.6725403817914831, "grad_norm": 60.80534362792969, "learning_rate": 1.4784710168044212e-07, "logits/chosen": -0.5890240669250488, "logits/rejected": -0.5499871969223022, "logps/chosen": -74.93452453613281, "logps/ref_chosen": -55.40888214111328, "logps/ref_rejected": -97.68325805664062, "logps/rejected": -155.25704956054688, "loss": 0.4472, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1399458944797516, "margin_dpo/beta_margin_grad_std": 0.23080742359161377, "margin_dpo/beta_margin_mean": 3.8048152923583984, "margin_dpo/loss_margin_mean": 38.04814910888672, "margin_dpo/margin_mean": 38.048152923583984, "margin_dpo/margin_std": 32.506038665771484, "step": 458 }, { "epoch": 0.6740088105726872, "grad_norm": 49.459625244140625, "learning_rate": 1.466771464027316e-07, "logits/chosen": -0.6109951138496399, "logits/rejected": -0.58476322889328, "logps/chosen": -67.23796081542969, "logps/ref_chosen": -46.55748748779297, "logps/ref_rejected": -86.16854095458984, "logps/rejected": -135.8137969970703, "loss": 0.4559, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16255170106887817, "margin_dpo/beta_margin_grad_std": 0.19391369819641113, "margin_dpo/beta_margin_mean": 2.8964788913726807, "margin_dpo/loss_margin_mean": 28.96478843688965, "margin_dpo/margin_mean": 28.96478843688965, "margin_dpo/margin_std": 23.446517944335938, "step": 459 }, { "epoch": 0.6754772393538914, "grad_norm": 60.67763900756836, "learning_rate": 1.4550991377830423e-07, "logits/chosen": -0.6025089621543884, "logits/rejected": -0.6064221858978271, "logps/chosen": -70.86332702636719, "logps/ref_chosen": -51.63489532470703, "logps/ref_rejected": -104.11935424804688, "logps/rejected": -156.27178955078125, "loss": 0.4214, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.151195228099823, "margin_dpo/beta_margin_grad_std": 0.2108098566532135, "margin_dpo/beta_margin_mean": 3.2924013137817383, "margin_dpo/loss_margin_mean": 32.92401123046875, "margin_dpo/margin_mean": 32.92401123046875, "margin_dpo/margin_std": 25.963363647460938, "step": 460 }, { "epoch": 0.6769456681350955, "grad_norm": 62.43408966064453, "learning_rate": 1.4434543456482518e-07, "logits/chosen": -0.6063634157180786, "logits/rejected": -0.591764509677887, "logps/chosen": -80.1798095703125, "logps/ref_chosen": -55.18195343017578, "logps/ref_rejected": -86.47689819335938, "logps/rejected": -139.15049743652344, "loss": 0.5542, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1909765750169754, "margin_dpo/beta_margin_grad_std": 0.22030548751354218, "margin_dpo/beta_margin_mean": 2.767573833465576, "margin_dpo/loss_margin_mean": 27.675739288330078, "margin_dpo/margin_mean": 27.675739288330078, "margin_dpo/margin_std": 27.114221572875977, "step": 461 }, { "epoch": 0.6784140969162996, "grad_norm": 65.849853515625, "learning_rate": 1.4318373944740484e-07, "logits/chosen": -0.6203492879867554, "logits/rejected": -0.5816408395767212, "logps/chosen": -93.56144714355469, "logps/ref_chosen": -69.92803955078125, "logps/ref_rejected": -78.84111785888672, "logps/rejected": -129.50086975097656, "loss": 0.5525, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1917511373758316, "margin_dpo/beta_margin_grad_std": 0.21951280534267426, "margin_dpo/beta_margin_mean": 2.702633857727051, "margin_dpo/loss_margin_mean": 27.026338577270508, "margin_dpo/margin_mean": 27.026338577270508, "margin_dpo/margin_std": 25.6932373046875, "step": 462 }, { "epoch": 0.6798825256975036, "grad_norm": 50.78045654296875, "learning_rate": 1.4202485903778976e-07, "logits/chosen": -0.5955685377120972, "logits/rejected": -0.5663818120956421, "logps/chosen": -76.08649444580078, "logps/ref_chosen": -55.27437210083008, "logps/ref_rejected": -89.02497863769531, "logps/rejected": -143.9271240234375, "loss": 0.3565, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12205320596694946, "margin_dpo/beta_margin_grad_std": 0.1792411506175995, "margin_dpo/beta_margin_mean": 3.4090020656585693, "margin_dpo/loss_margin_mean": 34.09001922607422, "margin_dpo/margin_mean": 34.09001922607422, "margin_dpo/margin_std": 23.93946075439453, "step": 463 }, { "epoch": 0.6813509544787077, "grad_norm": 57.22633743286133, "learning_rate": 1.4086882387355658e-07, "logits/chosen": -0.6078216433525085, "logits/rejected": -0.6137137413024902, "logps/chosen": -73.63619995117188, "logps/ref_chosen": -50.91230010986328, "logps/ref_rejected": -102.4893798828125, "logps/rejected": -160.0600128173828, "loss": 0.4538, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14801308512687683, "margin_dpo/beta_margin_grad_std": 0.2087215781211853, "margin_dpo/beta_margin_mean": 3.484673023223877, "margin_dpo/loss_margin_mean": 34.84673309326172, "margin_dpo/margin_mean": 34.84673309326172, "margin_dpo/margin_std": 30.08755874633789, "step": 464 }, { "epoch": 0.6828193832599119, "grad_norm": 50.34962844848633, "learning_rate": 1.3971566441730714e-07, "logits/chosen": -0.5805087089538574, "logits/rejected": -0.5592623949050903, "logps/chosen": -81.41581726074219, "logps/ref_chosen": -60.116851806640625, "logps/ref_rejected": -113.94602966308594, "logps/rejected": -173.1104736328125, "loss": 0.2796, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10604196786880493, "margin_dpo/beta_margin_grad_std": 0.16912564635276794, "margin_dpo/beta_margin_mean": 3.786548614501953, "margin_dpo/loss_margin_mean": 37.86548614501953, "margin_dpo/margin_mean": 37.86548614501953, "margin_dpo/margin_std": 25.48162841796875, "step": 465 }, { "epoch": 0.684287812041116, "grad_norm": 56.52486801147461, "learning_rate": 1.3856541105586545e-07, "logits/chosen": -0.6198223829269409, "logits/rejected": -0.5899391174316406, "logps/chosen": -75.75175476074219, "logps/ref_chosen": -52.920921325683594, "logps/ref_rejected": -90.3154296875, "logps/rejected": -147.3414306640625, "loss": 0.3836, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12490339577198029, "margin_dpo/beta_margin_grad_std": 0.17585688829421997, "margin_dpo/beta_margin_mean": 3.419515609741211, "margin_dpo/loss_margin_mean": 34.195152282714844, "margin_dpo/margin_mean": 34.195152282714844, "margin_dpo/margin_std": 23.578819274902344, "step": 466 }, { "epoch": 0.6857562408223201, "grad_norm": 47.07603073120117, "learning_rate": 1.3741809409947729e-07, "logits/chosen": -0.6401114463806152, "logits/rejected": -0.6112991571426392, "logps/chosen": -102.23968505859375, "logps/ref_chosen": -78.7158203125, "logps/ref_rejected": -102.86019897460938, "logps/rejected": -160.81512451171875, "loss": 0.3757, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1389492303133011, "margin_dpo/beta_margin_grad_std": 0.19067519903182983, "margin_dpo/beta_margin_mean": 3.443108081817627, "margin_dpo/loss_margin_mean": 34.43107986450195, "margin_dpo/margin_mean": 34.43107986450195, "margin_dpo/margin_std": 27.870590209960938, "step": 467 }, { "epoch": 0.6872246696035242, "grad_norm": 52.84998321533203, "learning_rate": 1.362737437810114e-07, "logits/chosen": -0.5987285375595093, "logits/rejected": -0.5710204243659973, "logps/chosen": -89.93443298339844, "logps/ref_chosen": -69.93536376953125, "logps/ref_rejected": -101.02881622314453, "logps/rejected": -152.99951171875, "loss": 0.3946, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14784805476665497, "margin_dpo/beta_margin_grad_std": 0.19148442149162292, "margin_dpo/beta_margin_mean": 3.1971635818481445, "margin_dpo/loss_margin_mean": 31.971633911132812, "margin_dpo/margin_mean": 31.971633911132812, "margin_dpo/margin_std": 26.962993621826172, "step": 468 }, { "epoch": 0.6886930983847284, "grad_norm": 57.32428741455078, "learning_rate": 1.351323902551631e-07, "logits/chosen": -0.6210588216781616, "logits/rejected": -0.5882803201675415, "logps/chosen": -91.5205078125, "logps/ref_chosen": -68.12469482421875, "logps/ref_rejected": -104.78640747070312, "logps/rejected": -161.24330139160156, "loss": 0.4417, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15189404785633087, "margin_dpo/beta_margin_grad_std": 0.21930459141731262, "margin_dpo/beta_margin_mean": 3.3061084747314453, "margin_dpo/loss_margin_mean": 33.06108474731445, "margin_dpo/margin_mean": 33.06108474731445, "margin_dpo/margin_std": 27.455984115600586, "step": 469 }, { "epoch": 0.6901615271659325, "grad_norm": 36.49016189575195, "learning_rate": 1.339940635976592e-07, "logits/chosen": -0.5721327066421509, "logits/rejected": -0.5457053184509277, "logps/chosen": -64.37504577636719, "logps/ref_chosen": -43.79193115234375, "logps/ref_rejected": -82.70285034179688, "logps/rejected": -141.7349853515625, "loss": 0.2322, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0932888612151146, "margin_dpo/beta_margin_grad_std": 0.1430518627166748, "margin_dpo/beta_margin_mean": 3.8449037075042725, "margin_dpo/loss_margin_mean": 38.44903564453125, "margin_dpo/margin_mean": 38.44903564453125, "margin_dpo/margin_std": 23.93124008178711, "step": 470 }, { "epoch": 0.6916299559471366, "grad_norm": 55.444793701171875, "learning_rate": 1.3285879380446563e-07, "logits/chosen": -0.5917923450469971, "logits/rejected": -0.5650200843811035, "logps/chosen": -87.83808898925781, "logps/ref_chosen": -63.33952331542969, "logps/ref_rejected": -83.61048126220703, "logps/rejected": -139.63504028320312, "loss": 0.4209, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15353405475616455, "margin_dpo/beta_margin_grad_std": 0.2025720775127411, "margin_dpo/beta_margin_mean": 3.1526002883911133, "margin_dpo/loss_margin_mean": 31.526004791259766, "margin_dpo/margin_mean": 31.526002883911133, "margin_dpo/margin_std": 24.51514434814453, "step": 471 }, { "epoch": 0.6930983847283406, "grad_norm": 51.19646453857422, "learning_rate": 1.317266107909975e-07, "logits/chosen": -0.6550266742706299, "logits/rejected": -0.6002498865127563, "logps/chosen": -105.19808959960938, "logps/ref_chosen": -83.66609954833984, "logps/ref_rejected": -117.20919799804688, "logps/rejected": -179.25303649902344, "loss": 0.3083, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11649411916732788, "margin_dpo/beta_margin_grad_std": 0.1778721958398819, "margin_dpo/beta_margin_mean": 4.051185607910156, "margin_dpo/loss_margin_mean": 40.51185607910156, "margin_dpo/margin_mean": 40.51185607910156, "margin_dpo/margin_std": 33.35724639892578, "step": 472 }, { "epoch": 0.6945668135095447, "grad_norm": 78.59127044677734, "learning_rate": 1.3059754439133002e-07, "logits/chosen": -0.5987892746925354, "logits/rejected": -0.5563715696334839, "logps/chosen": -87.65088653564453, "logps/ref_chosen": -63.49696731567383, "logps/ref_rejected": -81.14657592773438, "logps/rejected": -133.66065979003906, "loss": 0.4869, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.170325368642807, "margin_dpo/beta_margin_grad_std": 0.22086429595947266, "margin_dpo/beta_margin_mean": 2.8360166549682617, "margin_dpo/loss_margin_mean": 28.360164642333984, "margin_dpo/margin_mean": 28.360164642333984, "margin_dpo/margin_std": 22.68465805053711, "step": 473 }, { "epoch": 0.6960352422907489, "grad_norm": 73.43638610839844, "learning_rate": 1.2947162435741277e-07, "logits/chosen": -0.5953601598739624, "logits/rejected": -0.5836308598518372, "logps/chosen": -76.69085693359375, "logps/ref_chosen": -52.6119384765625, "logps/ref_rejected": -90.08041381835938, "logps/rejected": -145.01792907714844, "loss": 0.4665, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16552072763442993, "margin_dpo/beta_margin_grad_std": 0.22101813554763794, "margin_dpo/beta_margin_mean": 3.085860252380371, "margin_dpo/loss_margin_mean": 30.85860252380371, "margin_dpo/margin_mean": 30.85860252380371, "margin_dpo/margin_std": 25.63982582092285, "step": 474 }, { "epoch": 0.697503671071953, "grad_norm": 43.33028030395508, "learning_rate": 1.2834888035828596e-07, "logits/chosen": -0.634456992149353, "logits/rejected": -0.6314413547515869, "logps/chosen": -63.670257568359375, "logps/ref_chosen": -42.49519348144531, "logps/ref_rejected": -90.06295013427734, "logps/rejected": -145.98110961914062, "loss": 0.3832, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14322960376739502, "margin_dpo/beta_margin_grad_std": 0.19120553135871887, "margin_dpo/beta_margin_mean": 3.4743099212646484, "margin_dpo/loss_margin_mean": 34.74309539794922, "margin_dpo/margin_mean": 34.743099212646484, "margin_dpo/margin_std": 30.56637191772461, "step": 475 }, { "epoch": 0.6989720998531571, "grad_norm": 60.676177978515625, "learning_rate": 1.2722934197929802e-07, "logits/chosen": -0.6468064785003662, "logits/rejected": -0.6170526742935181, "logps/chosen": -64.93257141113281, "logps/ref_chosen": -42.949378967285156, "logps/ref_rejected": -73.71023559570312, "logps/rejected": -126.3614501953125, "loss": 0.506, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17712682485580444, "margin_dpo/beta_margin_grad_std": 0.22809603810310364, "margin_dpo/beta_margin_mean": 3.066802501678467, "margin_dpo/loss_margin_mean": 30.668025970458984, "margin_dpo/margin_mean": 30.668025970458984, "margin_dpo/margin_std": 26.836669921875, "step": 476 }, { "epoch": 0.7004405286343612, "grad_norm": 82.7061538696289, "learning_rate": 1.2611303872132631e-07, "logits/chosen": -0.6342014074325562, "logits/rejected": -0.5668247938156128, "logps/chosen": -96.3890151977539, "logps/ref_chosen": -70.77261352539062, "logps/ref_rejected": -76.13737487792969, "logps/rejected": -133.8131866455078, "loss": 0.6051, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15660977363586426, "margin_dpo/beta_margin_grad_std": 0.24074432253837585, "margin_dpo/beta_margin_mean": 3.2059414386749268, "margin_dpo/loss_margin_mean": 32.05941390991211, "margin_dpo/margin_mean": 32.05941390991211, "margin_dpo/margin_std": 27.974023818969727, "step": 477 }, { "epoch": 0.7019089574155654, "grad_norm": 48.793907165527344, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.6003127098083496, "logits/rejected": -0.5863485336303711, "logps/chosen": -61.834197998046875, "logps/ref_chosen": -41.440513610839844, "logps/ref_rejected": -85.36196899414062, "logps/rejected": -140.44876098632812, "loss": 0.3975, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14744707942008972, "margin_dpo/beta_margin_grad_std": 0.1875038743019104, "margin_dpo/beta_margin_mean": 3.469311237335205, "margin_dpo/loss_margin_mean": 34.693111419677734, "margin_dpo/margin_mean": 34.693111419677734, "margin_dpo/margin_std": 29.18410873413086, "step": 478 }, { "epoch": 0.7033773861967695, "grad_norm": 57.176082611083984, "learning_rate": 1.2389025514492456e-07, "logits/chosen": -0.5899140238761902, "logits/rejected": -0.5823123455047607, "logps/chosen": -79.59027099609375, "logps/ref_chosen": -53.907920837402344, "logps/ref_rejected": -95.1163330078125, "logps/rejected": -151.4071044921875, "loss": 0.4438, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1582869589328766, "margin_dpo/beta_margin_grad_std": 0.21056291460990906, "margin_dpo/beta_margin_mean": 3.0608415603637695, "margin_dpo/loss_margin_mean": 30.608417510986328, "margin_dpo/margin_mean": 30.608415603637695, "margin_dpo/margin_std": 22.303516387939453, "step": 479 }, { "epoch": 0.7048458149779736, "grad_norm": 74.97010040283203, "learning_rate": 1.227838333989088e-07, "logits/chosen": -0.5865793824195862, "logits/rejected": -0.5290813446044922, "logps/chosen": -85.32308959960938, "logps/ref_chosen": -58.682701110839844, "logps/ref_rejected": -82.93248748779297, "logps/rejected": -146.00466918945312, "loss": 0.5139, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15616512298583984, "margin_dpo/beta_margin_grad_std": 0.231448233127594, "margin_dpo/beta_margin_mean": 3.643179416656494, "margin_dpo/loss_margin_mean": 36.431793212890625, "margin_dpo/margin_mean": 36.431793212890625, "margin_dpo/margin_std": 32.21718978881836, "step": 480 }, { "epoch": 0.7063142437591777, "grad_norm": 53.85762023925781, "learning_rate": 1.2168076391719489e-07, "logits/chosen": -0.6352800130844116, "logits/rejected": -0.6003815531730652, "logps/chosen": -80.35116577148438, "logps/ref_chosen": -54.964271545410156, "logps/ref_rejected": -92.42044067382812, "logps/rejected": -152.66683959960938, "loss": 0.4429, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14123259484767914, "margin_dpo/beta_margin_grad_std": 0.2171190083026886, "margin_dpo/beta_margin_mean": 3.485950469970703, "margin_dpo/loss_margin_mean": 34.85950469970703, "margin_dpo/margin_mean": 34.85950469970703, "margin_dpo/margin_std": 26.85974884033203, "step": 481 }, { "epoch": 0.7077826725403817, "grad_norm": 56.575809478759766, "learning_rate": 1.2058107576668938e-07, "logits/chosen": -0.5973387956619263, "logits/rejected": -0.56673264503479, "logps/chosen": -90.35714721679688, "logps/ref_chosen": -67.55347442626953, "logps/ref_rejected": -87.58953857421875, "logps/rejected": -140.64810180664062, "loss": 0.4347, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16674280166625977, "margin_dpo/beta_margin_grad_std": 0.18362513184547424, "margin_dpo/beta_margin_mean": 3.0254898071289062, "margin_dpo/loss_margin_mean": 30.254898071289062, "margin_dpo/margin_mean": 30.254898071289062, "margin_dpo/margin_std": 26.179527282714844, "step": 482 }, { "epoch": 0.7092511013215859, "grad_norm": 71.42662048339844, "learning_rate": 1.194847979251979e-07, "logits/chosen": -0.5927727222442627, "logits/rejected": -0.5307378768920898, "logps/chosen": -89.05766296386719, "logps/ref_chosen": -63.32981872558594, "logps/ref_rejected": -95.78697204589844, "logps/rejected": -157.198974609375, "loss": 0.3982, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1320417821407318, "margin_dpo/beta_margin_grad_std": 0.2117423117160797, "margin_dpo/beta_margin_mean": 3.5684163570404053, "margin_dpo/loss_margin_mean": 35.68416213989258, "margin_dpo/margin_mean": 35.68416213989258, "margin_dpo/margin_std": 27.353343963623047, "step": 483 }, { "epoch": 0.71071953010279, "grad_norm": 52.49878692626953, "learning_rate": 1.1839195928066101e-07, "logits/chosen": -0.644550085067749, "logits/rejected": -0.6064622402191162, "logps/chosen": -81.20121002197266, "logps/ref_chosen": -59.13812255859375, "logps/ref_rejected": -84.37144470214844, "logps/rejected": -142.26454162597656, "loss": 0.3632, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14210814237594604, "margin_dpo/beta_margin_grad_std": 0.1774926483631134, "margin_dpo/beta_margin_mean": 3.583000659942627, "margin_dpo/loss_margin_mean": 35.83000564575195, "margin_dpo/margin_mean": 35.83000564575195, "margin_dpo/margin_std": 29.71619415283203, "step": 484 }, { "epoch": 0.7121879588839941, "grad_norm": 51.49626922607422, "learning_rate": 1.1730258863039347e-07, "logits/chosen": -0.5840227603912354, "logits/rejected": -0.5531511306762695, "logps/chosen": -77.98756408691406, "logps/ref_chosen": -58.849571228027344, "logps/ref_rejected": -103.36408996582031, "logps/rejected": -163.02682495117188, "loss": 0.4151, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14008283615112305, "margin_dpo/beta_margin_grad_std": 0.21648435294628143, "margin_dpo/beta_margin_mean": 4.052473545074463, "margin_dpo/loss_margin_mean": 40.52473449707031, "margin_dpo/margin_mean": 40.52473449707031, "margin_dpo/margin_std": 32.42699432373047, "step": 485 }, { "epoch": 0.7136563876651982, "grad_norm": 67.00348663330078, "learning_rate": 1.1621671468032493e-07, "logits/chosen": -0.6258925199508667, "logits/rejected": -0.5719567537307739, "logps/chosen": -78.4237060546875, "logps/ref_chosen": -55.25966262817383, "logps/ref_rejected": -92.13936614990234, "logps/rejected": -154.44952392578125, "loss": 0.4209, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14244696497917175, "margin_dpo/beta_margin_grad_std": 0.21729934215545654, "margin_dpo/beta_margin_mean": 3.9146108627319336, "margin_dpo/loss_margin_mean": 39.1461067199707, "margin_dpo/margin_mean": 39.1461067199707, "margin_dpo/margin_std": 30.911312103271484, "step": 486 }, { "epoch": 0.7151248164464024, "grad_norm": 57.11901092529297, "learning_rate": 1.1513436604424378e-07, "logits/chosen": -0.638907790184021, "logits/rejected": -0.6043581962585449, "logps/chosen": -75.46080017089844, "logps/ref_chosen": -53.06330871582031, "logps/ref_rejected": -92.4188232421875, "logps/rejected": -152.25311279296875, "loss": 0.3229, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12441418319940567, "margin_dpo/beta_margin_grad_std": 0.16795003414154053, "margin_dpo/beta_margin_mean": 3.743680477142334, "margin_dpo/loss_margin_mean": 37.436805725097656, "margin_dpo/margin_mean": 37.436805725097656, "margin_dpo/margin_std": 26.35199737548828, "step": 487 }, { "epoch": 0.7165932452276065, "grad_norm": 33.266990661621094, "learning_rate": 1.1405557124304335e-07, "logits/chosen": -0.591684877872467, "logits/rejected": -0.5600037574768066, "logps/chosen": -73.20890808105469, "logps/ref_chosen": -52.228153228759766, "logps/ref_rejected": -84.00656127929688, "logps/rejected": -137.1870880126953, "loss": 0.282, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11648823320865631, "margin_dpo/beta_margin_grad_std": 0.14420145750045776, "margin_dpo/beta_margin_mean": 3.219977378845215, "margin_dpo/loss_margin_mean": 32.199771881103516, "margin_dpo/margin_mean": 32.199771881103516, "margin_dpo/margin_std": 21.357444763183594, "step": 488 }, { "epoch": 0.7180616740088106, "grad_norm": 55.21537780761719, "learning_rate": 1.1298035870396985e-07, "logits/chosen": -0.6172722578048706, "logits/rejected": -0.5707902908325195, "logps/chosen": -78.21434020996094, "logps/ref_chosen": -55.989627838134766, "logps/ref_rejected": -79.39813232421875, "logps/rejected": -133.66220092773438, "loss": 0.4375, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1609182357788086, "margin_dpo/beta_margin_grad_std": 0.20739710330963135, "margin_dpo/beta_margin_mean": 3.2039356231689453, "margin_dpo/loss_margin_mean": 32.03935623168945, "margin_dpo/margin_mean": 32.03936004638672, "margin_dpo/margin_std": 27.642593383789062, "step": 489 }, { "epoch": 0.7195301027900147, "grad_norm": 69.54042053222656, "learning_rate": 1.1190875675987355e-07, "logits/chosen": -0.6238210201263428, "logits/rejected": -0.6131519079208374, "logps/chosen": -73.09518432617188, "logps/ref_chosen": -52.36639404296875, "logps/ref_rejected": -110.40904998779297, "logps/rejected": -162.64816284179688, "loss": 0.5726, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18633374571800232, "margin_dpo/beta_margin_grad_std": 0.23432117700576782, "margin_dpo/beta_margin_mean": 3.151031017303467, "margin_dpo/loss_margin_mean": 31.510311126708984, "margin_dpo/margin_mean": 31.510311126708984, "margin_dpo/margin_std": 29.851600646972656, "step": 490 }, { "epoch": 0.7209985315712188, "grad_norm": 71.87732696533203, "learning_rate": 1.1084079364846241e-07, "logits/chosen": -0.5947495698928833, "logits/rejected": -0.5508404970169067, "logps/chosen": -83.43572998046875, "logps/ref_chosen": -60.11626434326172, "logps/ref_rejected": -73.27278900146484, "logps/rejected": -124.82978820800781, "loss": 0.5807, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18778780102729797, "margin_dpo/beta_margin_grad_std": 0.22901329398155212, "margin_dpo/beta_margin_mean": 2.8237528800964355, "margin_dpo/loss_margin_mean": 28.237525939941406, "margin_dpo/margin_mean": 28.237525939941406, "margin_dpo/margin_std": 27.937530517578125, "step": 491 }, { "epoch": 0.7224669603524229, "grad_norm": 113.52478790283203, "learning_rate": 1.097764975115576e-07, "logits/chosen": -0.6196011304855347, "logits/rejected": -0.5757460594177246, "logps/chosen": -77.80059814453125, "logps/ref_chosen": -53.99418258666992, "logps/ref_rejected": -72.65962219238281, "logps/rejected": -122.739990234375, "loss": 0.9513, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.23549211025238037, "margin_dpo/beta_margin_grad_std": 0.311998188495636, "margin_dpo/beta_margin_mean": 2.6273956298828125, "margin_dpo/loss_margin_mean": 26.273958206176758, "margin_dpo/margin_mean": 26.273958206176758, "margin_dpo/margin_std": 30.309785842895508, "step": 492 }, { "epoch": 0.723935389133627, "grad_norm": 57.65090560913086, "learning_rate": 1.0871589639435203e-07, "logits/chosen": -0.6554695963859558, "logits/rejected": -0.5956501960754395, "logps/chosen": -95.95299530029297, "logps/ref_chosen": -75.49723815917969, "logps/ref_rejected": -87.32301330566406, "logps/rejected": -140.97837829589844, "loss": 0.4676, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1496451199054718, "margin_dpo/beta_margin_grad_std": 0.22746598720550537, "margin_dpo/beta_margin_mean": 3.319960117340088, "margin_dpo/loss_margin_mean": 33.19960021972656, "margin_dpo/margin_mean": 33.19960021972656, "margin_dpo/margin_std": 26.605464935302734, "step": 493 }, { "epoch": 0.7254038179148311, "grad_norm": 106.5522232055664, "learning_rate": 1.0765901824467166e-07, "logits/chosen": -0.5855438709259033, "logits/rejected": -0.5765562653541565, "logps/chosen": -63.644134521484375, "logps/ref_chosen": -41.35926818847656, "logps/ref_rejected": -86.09136962890625, "logps/rejected": -144.02566528320312, "loss": 0.5137, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1603744924068451, "margin_dpo/beta_margin_grad_std": 0.23930124938488007, "margin_dpo/beta_margin_mean": 3.5649423599243164, "margin_dpo/loss_margin_mean": 35.64942169189453, "margin_dpo/margin_mean": 35.64942169189453, "margin_dpo/margin_std": 29.729022979736328, "step": 494 }, { "epoch": 0.7268722466960352, "grad_norm": 68.87841033935547, "learning_rate": 1.0660589091223854e-07, "logits/chosen": -0.6375582218170166, "logits/rejected": -0.5974992513656616, "logps/chosen": -85.37813568115234, "logps/ref_chosen": -63.53507995605469, "logps/ref_rejected": -91.42443084716797, "logps/rejected": -145.79571533203125, "loss": 0.5198, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15870174765586853, "margin_dpo/beta_margin_grad_std": 0.22453130781650543, "margin_dpo/beta_margin_mean": 3.2528228759765625, "margin_dpo/loss_margin_mean": 32.528228759765625, "margin_dpo/margin_mean": 32.52822494506836, "margin_dpo/margin_std": 27.80425262451172, "step": 495 }, { "epoch": 0.7283406754772394, "grad_norm": 67.1712417602539, "learning_rate": 1.0555654214793722e-07, "logits/chosen": -0.6405035257339478, "logits/rejected": -0.583281397819519, "logps/chosen": -97.08042907714844, "logps/ref_chosen": -72.59192657470703, "logps/ref_rejected": -84.32933807373047, "logps/rejected": -137.63818359375, "loss": 0.538, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17619186639785767, "margin_dpo/beta_margin_grad_std": 0.22553400695323944, "margin_dpo/beta_margin_mean": 2.882033348083496, "margin_dpo/loss_margin_mean": 28.820335388183594, "margin_dpo/margin_mean": 28.82033348083496, "margin_dpo/margin_std": 26.039134979248047, "step": 496 }, { "epoch": 0.7298091042584435, "grad_norm": 76.36552429199219, "learning_rate": 1.0451099960308374e-07, "logits/chosen": -0.6118708848953247, "logits/rejected": -0.5635442733764648, "logps/chosen": -84.1969985961914, "logps/ref_chosen": -58.593971252441406, "logps/ref_rejected": -76.28836822509766, "logps/rejected": -130.5543670654297, "loss": 0.6066, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20490401983261108, "margin_dpo/beta_margin_grad_std": 0.22362196445465088, "margin_dpo/beta_margin_mean": 2.8662962913513184, "margin_dpo/loss_margin_mean": 28.6629638671875, "margin_dpo/margin_mean": 28.6629638671875, "margin_dpo/margin_std": 27.733150482177734, "step": 497 }, { "epoch": 0.7312775330396476, "grad_norm": 86.41475677490234, "learning_rate": 1.0346929082869641e-07, "logits/chosen": -0.617120623588562, "logits/rejected": -0.5855381488800049, "logps/chosen": -95.82791137695312, "logps/ref_chosen": -71.20565795898438, "logps/ref_rejected": -83.95803833007812, "logps/rejected": -139.6630859375, "loss": 0.5315, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1733829826116562, "margin_dpo/beta_margin_grad_std": 0.23494428396224976, "margin_dpo/beta_margin_mean": 3.1082797050476074, "margin_dpo/loss_margin_mean": 31.082794189453125, "margin_dpo/margin_mean": 31.082794189453125, "margin_dpo/margin_std": 28.04306411743164, "step": 498 }, { "epoch": 0.7327459618208517, "grad_norm": 80.22297668457031, "learning_rate": 1.0243144327477013e-07, "logits/chosen": -0.6173365116119385, "logits/rejected": -0.6094462275505066, "logps/chosen": -74.71839904785156, "logps/ref_chosen": -51.25519561767578, "logps/ref_rejected": -101.07870483398438, "logps/rejected": -156.43063354492188, "loss": 0.6745, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17869053781032562, "margin_dpo/beta_margin_grad_std": 0.25083693861961365, "margin_dpo/beta_margin_mean": 3.1888723373413086, "margin_dpo/loss_margin_mean": 31.888721466064453, "margin_dpo/margin_mean": 31.888721466064453, "margin_dpo/margin_std": 30.46820831298828, "step": 499 }, { "epoch": 0.7342143906020558, "grad_norm": 44.01335144042969, "learning_rate": 1.0139748428955333e-07, "logits/chosen": -0.5734531879425049, "logits/rejected": -0.5571717023849487, "logps/chosen": -82.99430847167969, "logps/ref_chosen": -57.027442932128906, "logps/ref_rejected": -93.93421173095703, "logps/rejected": -153.8037109375, "loss": 0.38, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1365164965391159, "margin_dpo/beta_margin_grad_std": 0.1890600323677063, "margin_dpo/beta_margin_mean": 3.390263557434082, "margin_dpo/loss_margin_mean": 33.90263366699219, "margin_dpo/margin_mean": 33.90263366699219, "margin_dpo/margin_std": 29.03835678100586, "step": 500 }, { "epoch": 0.7342143906020558, "eval_logits/chosen": -0.6236123442649841, "eval_logits/rejected": -0.5976437926292419, "eval_logps/chosen": -106.9358139038086, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -136.5065155029297, "eval_loss": 0.40981218218803406, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.25789907574653625, "eval_margin_dpo/beta_margin_grad_std": 0.2560845613479614, "eval_margin_dpo/beta_margin_mean": 2.1823792457580566, "eval_margin_dpo/loss_margin_mean": 21.823793411254883, "eval_margin_dpo/margin_mean": 21.823793411254883, "eval_margin_dpo/margin_std": 26.597421646118164, "eval_runtime": 39.8891, "eval_samples_per_second": 58.638, "eval_steps_per_second": 1.855, "step": 500 }, { "epoch": 0.73568281938326, "grad_norm": 51.43994140625, "learning_rate": 1.0036744111882672e-07, "logits/chosen": -0.6275873184204102, "logits/rejected": -0.5868571996688843, "logps/chosen": -76.56288146972656, "logps/ref_chosen": -54.359527587890625, "logps/ref_rejected": -80.15670776367188, "logps/rejected": -140.09364318847656, "loss": 0.3432, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1260565221309662, "margin_dpo/beta_margin_grad_std": 0.185085728764534, "margin_dpo/beta_margin_mean": 3.7733588218688965, "margin_dpo/loss_margin_mean": 37.733585357666016, "margin_dpo/margin_mean": 37.733585357666016, "margin_dpo/margin_std": 29.447450637817383, "step": 501 }, { "epoch": 0.737151248164464, "grad_norm": 45.80177688598633, "learning_rate": 9.934134090518592e-08, "logits/chosen": -0.6037914752960205, "logits/rejected": -0.542682945728302, "logps/chosen": -90.61296844482422, "logps/ref_chosen": -67.60050964355469, "logps/ref_rejected": -82.94876098632812, "logps/rejected": -139.87281799316406, "loss": 0.3237, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1189626082777977, "margin_dpo/beta_margin_grad_std": 0.18001875281333923, "margin_dpo/beta_margin_mean": 3.3911592960357666, "margin_dpo/loss_margin_mean": 33.911590576171875, "margin_dpo/margin_mean": 33.911590576171875, "margin_dpo/margin_std": 20.9443416595459, "step": 502 }, { "epoch": 0.7386196769456681, "grad_norm": 54.339813232421875, "learning_rate": 9.831921068732571e-08, "logits/chosen": -0.5617387294769287, "logits/rejected": -0.5207287073135376, "logps/chosen": -76.20591735839844, "logps/ref_chosen": -55.078407287597656, "logps/ref_rejected": -82.50544738769531, "logps/rejected": -137.9188690185547, "loss": 0.4001, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13650323450565338, "margin_dpo/beta_margin_grad_std": 0.21414102613925934, "margin_dpo/beta_margin_mean": 3.428591251373291, "margin_dpo/loss_margin_mean": 34.285911560058594, "margin_dpo/margin_mean": 34.285911560058594, "margin_dpo/margin_std": 23.53387451171875, "step": 503 }, { "epoch": 0.7400881057268722, "grad_norm": 36.07522964477539, "learning_rate": 9.730107739932805e-08, "logits/chosen": -0.6169658899307251, "logits/rejected": -0.5972700119018555, "logps/chosen": -79.52164459228516, "logps/ref_chosen": -59.96575164794922, "logps/ref_rejected": -103.76213073730469, "logps/rejected": -163.46487426757812, "loss": 0.2426, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0935632586479187, "margin_dpo/beta_margin_grad_std": 0.1528908908367157, "margin_dpo/beta_margin_mean": 4.014684677124023, "margin_dpo/loss_margin_mean": 40.1468505859375, "margin_dpo/margin_mean": 40.146846771240234, "margin_dpo/margin_std": 26.04753875732422, "step": 504 }, { "epoch": 0.7415565345080763, "grad_norm": 81.94815063476562, "learning_rate": 9.628696786995188e-08, "logits/chosen": -0.6530240774154663, "logits/rejected": -0.6016232967376709, "logps/chosen": -101.87925720214844, "logps/ref_chosen": -76.1549072265625, "logps/ref_rejected": -88.58537292480469, "logps/rejected": -142.79168701171875, "loss": 0.6762, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.2050047218799591, "margin_dpo/beta_margin_grad_std": 0.24811255931854248, "margin_dpo/beta_margin_mean": 2.8481969833374023, "margin_dpo/loss_margin_mean": 28.481971740722656, "margin_dpo/margin_mean": 28.481969833374023, "margin_dpo/margin_std": 29.522705078125, "step": 505 }, { "epoch": 0.7430249632892805, "grad_norm": 66.44819641113281, "learning_rate": 9.527690882192635e-08, "logits/chosen": -0.615195631980896, "logits/rejected": -0.579567551612854, "logps/chosen": -71.32270812988281, "logps/ref_chosen": -48.96050262451172, "logps/ref_rejected": -78.41505432128906, "logps/rejected": -134.9954376220703, "loss": 0.448, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15291953086853027, "margin_dpo/beta_margin_grad_std": 0.2167114019393921, "margin_dpo/beta_margin_mean": 3.421818733215332, "margin_dpo/loss_margin_mean": 34.21818542480469, "margin_dpo/margin_mean": 34.21818542480469, "margin_dpo/margin_std": 29.17880630493164, "step": 506 }, { "epoch": 0.7444933920704846, "grad_norm": 52.085636138916016, "learning_rate": 9.427092687124691e-08, "logits/chosen": -0.6226514577865601, "logits/rejected": -0.5839424133300781, "logps/chosen": -90.81254577636719, "logps/ref_chosen": -66.80150604248047, "logps/ref_rejected": -95.37289428710938, "logps/rejected": -151.70751953125, "loss": 0.3354, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1319025158882141, "margin_dpo/beta_margin_grad_std": 0.17619000375270844, "margin_dpo/beta_margin_mean": 3.2323567867279053, "margin_dpo/loss_margin_mean": 32.32356643676758, "margin_dpo/margin_mean": 32.32356262207031, "margin_dpo/margin_std": 22.592437744140625, "step": 507 }, { "epoch": 0.7459618208516887, "grad_norm": 64.97161865234375, "learning_rate": 9.326904852647344e-08, "logits/chosen": -0.6235086917877197, "logits/rejected": -0.5866918563842773, "logps/chosen": -93.52983093261719, "logps/ref_chosen": -71.303466796875, "logps/ref_rejected": -95.6275405883789, "logps/rejected": -150.2212371826172, "loss": 0.5048, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17342665791511536, "margin_dpo/beta_margin_grad_std": 0.22466185688972473, "margin_dpo/beta_margin_mean": 3.2367329597473145, "margin_dpo/loss_margin_mean": 32.36732864379883, "margin_dpo/margin_mean": 32.36732864379883, "margin_dpo/margin_std": 28.795747756958008, "step": 508 }, { "epoch": 0.7474302496328928, "grad_norm": 71.68479919433594, "learning_rate": 9.227130018803195e-08, "logits/chosen": -0.6284923553466797, "logits/rejected": -0.5883047580718994, "logps/chosen": -86.16942596435547, "logps/ref_chosen": -63.81895065307617, "logps/ref_rejected": -83.25643920898438, "logps/rejected": -138.71392822265625, "loss": 0.4825, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1474766582250595, "margin_dpo/beta_margin_grad_std": 0.23035329580307007, "margin_dpo/beta_margin_mean": 3.3107001781463623, "margin_dpo/loss_margin_mean": 33.10700225830078, "margin_dpo/margin_mean": 33.10700225830078, "margin_dpo/margin_std": 26.037105560302734, "step": 509 }, { "epoch": 0.748898678414097, "grad_norm": 57.0163459777832, "learning_rate": 9.127770814751932e-08, "logits/chosen": -0.5673776865005493, "logits/rejected": -0.551094114780426, "logps/chosen": -79.0858154296875, "logps/ref_chosen": -51.878448486328125, "logps/ref_rejected": -102.7651596069336, "logps/rejected": -170.52944946289062, "loss": 0.4056, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14309245347976685, "margin_dpo/beta_margin_grad_std": 0.21340487897396088, "margin_dpo/beta_margin_mean": 4.055694580078125, "margin_dpo/loss_margin_mean": 40.556941986083984, "margin_dpo/margin_mean": 40.556941986083984, "margin_dpo/margin_std": 32.51176452636719, "step": 510 }, { "epoch": 0.750367107195301, "grad_norm": 55.767330169677734, "learning_rate": 9.028829858700973e-08, "logits/chosen": -0.6472057104110718, "logits/rejected": -0.6147615909576416, "logps/chosen": -82.40501403808594, "logps/ref_chosen": -60.23811340332031, "logps/ref_rejected": -92.85676574707031, "logps/rejected": -151.13473510742188, "loss": 0.4616, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1584860384464264, "margin_dpo/beta_margin_grad_std": 0.22719457745552063, "margin_dpo/beta_margin_mean": 3.6111063957214355, "margin_dpo/loss_margin_mean": 36.11106491088867, "margin_dpo/margin_mean": 36.11106491088867, "margin_dpo/margin_std": 29.813339233398438, "step": 511 }, { "epoch": 0.7518355359765051, "grad_norm": 45.10017776489258, "learning_rate": 8.930309757836516e-08, "logits/chosen": -0.6407291889190674, "logits/rejected": -0.6046779155731201, "logps/chosen": -80.43107604980469, "logps/ref_chosen": -54.905494689941406, "logps/ref_rejected": -81.87586975097656, "logps/rejected": -142.87881469726562, "loss": 0.3367, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.132303386926651, "margin_dpo/beta_margin_grad_std": 0.1747945249080658, "margin_dpo/beta_margin_mean": 3.547736644744873, "margin_dpo/loss_margin_mean": 35.47736358642578, "margin_dpo/margin_mean": 35.47736740112305, "margin_dpo/margin_std": 27.888330459594727, "step": 512 }, { "epoch": 0.7533039647577092, "grad_norm": 74.60807800292969, "learning_rate": 8.832213108254863e-08, "logits/chosen": -0.6363452672958374, "logits/rejected": -0.5841466188430786, "logps/chosen": -89.99166870117188, "logps/ref_chosen": -64.91644287109375, "logps/ref_rejected": -76.06245422363281, "logps/rejected": -131.63497924804688, "loss": 0.5735, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16800448298454285, "margin_dpo/beta_margin_grad_std": 0.2321883887052536, "margin_dpo/beta_margin_mean": 3.049729347229004, "margin_dpo/loss_margin_mean": 30.49729347229004, "margin_dpo/margin_mean": 30.497295379638672, "margin_dpo/margin_std": 25.966400146484375, "step": 513 }, { "epoch": 0.7547723935389133, "grad_norm": 72.9621810913086, "learning_rate": 8.734542494893954e-08, "logits/chosen": -0.6219351291656494, "logits/rejected": -0.5745389461517334, "logps/chosen": -100.20359802246094, "logps/ref_chosen": -74.22957611083984, "logps/ref_rejected": -78.945556640625, "logps/rejected": -135.80770874023438, "loss": 0.7095, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.21395042538642883, "margin_dpo/beta_margin_grad_std": 0.27207687497138977, "margin_dpo/beta_margin_mean": 3.0888137817382812, "margin_dpo/loss_margin_mean": 30.88813591003418, "margin_dpo/margin_mean": 30.888137817382812, "margin_dpo/margin_std": 33.99193572998047, "step": 514 }, { "epoch": 0.7562408223201175, "grad_norm": 49.74563980102539, "learning_rate": 8.637300491465272e-08, "logits/chosen": -0.6339064836502075, "logits/rejected": -0.6179243326187134, "logps/chosen": -73.3636703491211, "logps/ref_chosen": -50.40156555175781, "logps/ref_rejected": -87.09774780273438, "logps/rejected": -143.69723510742188, "loss": 0.377, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14515420794487, "margin_dpo/beta_margin_grad_std": 0.17126330733299255, "margin_dpo/beta_margin_mean": 3.363739490509033, "margin_dpo/loss_margin_mean": 33.63739776611328, "margin_dpo/margin_mean": 33.63739776611328, "margin_dpo/margin_std": 27.569812774658203, "step": 515 }, { "epoch": 0.7577092511013216, "grad_norm": 51.14539337158203, "learning_rate": 8.540489660386064e-08, "logits/chosen": -0.642087459564209, "logits/rejected": -0.6219902038574219, "logps/chosen": -87.75926208496094, "logps/ref_chosen": -64.6495590209961, "logps/ref_rejected": -111.72238159179688, "logps/rejected": -170.44625854492188, "loss": 0.3628, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13847823441028595, "margin_dpo/beta_margin_grad_std": 0.18551796674728394, "margin_dpo/beta_margin_mean": 3.5614166259765625, "margin_dpo/loss_margin_mean": 35.614166259765625, "margin_dpo/margin_mean": 35.614166259765625, "margin_dpo/margin_std": 28.459064483642578, "step": 516 }, { "epoch": 0.7591776798825257, "grad_norm": 49.0867805480957, "learning_rate": 8.444112552711752e-08, "logits/chosen": -0.6216846704483032, "logits/rejected": -0.5751929879188538, "logps/chosen": -86.8254623413086, "logps/ref_chosen": -60.913551330566406, "logps/ref_rejected": -89.08308410644531, "logps/rejected": -150.07212829589844, "loss": 0.3873, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12766914069652557, "margin_dpo/beta_margin_grad_std": 0.20745864510536194, "margin_dpo/beta_margin_mean": 3.5077133178710938, "margin_dpo/loss_margin_mean": 35.07713317871094, "margin_dpo/margin_mean": 35.07713317871094, "margin_dpo/margin_std": 25.649383544921875, "step": 517 }, { "epoch": 0.7606461086637298, "grad_norm": 52.4042854309082, "learning_rate": 8.348171708068747e-08, "logits/chosen": -0.6181496381759644, "logits/rejected": -0.6017059087753296, "logps/chosen": -83.06076049804688, "logps/ref_chosen": -57.45589065551758, "logps/ref_rejected": -85.31269836425781, "logps/rejected": -142.11749267578125, "loss": 0.4583, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15641465783119202, "margin_dpo/beta_margin_grad_std": 0.2147601693868637, "margin_dpo/beta_margin_mean": 3.119992733001709, "margin_dpo/loss_margin_mean": 31.199928283691406, "margin_dpo/margin_mean": 31.199928283691406, "margin_dpo/margin_std": 26.132186889648438, "step": 518 }, { "epoch": 0.762114537444934, "grad_norm": 62.43038558959961, "learning_rate": 8.25266965458755e-08, "logits/chosen": -0.6043561697006226, "logits/rejected": -0.5701404213905334, "logps/chosen": -97.01838684082031, "logps/ref_chosen": -74.06330871582031, "logps/ref_rejected": -104.44416809082031, "logps/rejected": -159.92041015625, "loss": 0.4574, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16313457489013672, "margin_dpo/beta_margin_grad_std": 0.1897900104522705, "margin_dpo/beta_margin_mean": 3.2521166801452637, "margin_dpo/loss_margin_mean": 32.52116394042969, "margin_dpo/margin_mean": 32.52116775512695, "margin_dpo/margin_std": 30.377395629882812, "step": 519 }, { "epoch": 0.7635829662261381, "grad_norm": 50.88139343261719, "learning_rate": 8.15760890883607e-08, "logits/chosen": -0.5501081943511963, "logits/rejected": -0.5258777141571045, "logps/chosen": -93.9324951171875, "logps/ref_chosen": -70.2998275756836, "logps/ref_rejected": -99.98133850097656, "logps/rejected": -156.6881103515625, "loss": 0.3602, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13520291447639465, "margin_dpo/beta_margin_grad_std": 0.1813124269247055, "margin_dpo/beta_margin_mean": 3.307410717010498, "margin_dpo/loss_margin_mean": 33.0741081237793, "margin_dpo/margin_mean": 33.07410430908203, "margin_dpo/margin_std": 24.42025375366211, "step": 520 }, { "epoch": 0.7650513950073421, "grad_norm": 51.61702346801758, "learning_rate": 8.062991975753378e-08, "logits/chosen": -0.6081865429878235, "logits/rejected": -0.578285276889801, "logps/chosen": -80.80309295654297, "logps/ref_chosen": -58.14292907714844, "logps/ref_rejected": -83.28060913085938, "logps/rejected": -138.00160217285156, "loss": 0.4205, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15506964921951294, "margin_dpo/beta_margin_grad_std": 0.19557394087314606, "margin_dpo/beta_margin_mean": 3.2060821056365967, "margin_dpo/loss_margin_mean": 32.060821533203125, "margin_dpo/margin_mean": 32.060821533203125, "margin_dpo/margin_std": 26.212413787841797, "step": 521 }, { "epoch": 0.7665198237885462, "grad_norm": 49.603553771972656, "learning_rate": 7.968821348583643e-08, "logits/chosen": -0.610974133014679, "logits/rejected": -0.5809307098388672, "logps/chosen": -70.47454833984375, "logps/ref_chosen": -46.54766845703125, "logps/ref_rejected": -66.01388549804688, "logps/rejected": -118.23016357421875, "loss": 0.4643, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17351622879505157, "margin_dpo/beta_margin_grad_std": 0.19399023056030273, "margin_dpo/beta_margin_mean": 2.8289389610290527, "margin_dpo/loss_margin_mean": 28.289390563964844, "margin_dpo/margin_mean": 28.289390563964844, "margin_dpo/margin_std": 25.933521270751953, "step": 522 }, { "epoch": 0.7679882525697503, "grad_norm": 62.4161376953125, "learning_rate": 7.875099508810484e-08, "logits/chosen": -0.6249532699584961, "logits/rejected": -0.5845484733581543, "logps/chosen": -86.18218994140625, "logps/ref_chosen": -61.76960372924805, "logps/ref_rejected": -83.76141357421875, "logps/rejected": -140.2449951171875, "loss": 0.5486, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18028101325035095, "margin_dpo/beta_margin_grad_std": 0.23459823429584503, "margin_dpo/beta_margin_mean": 3.2071008682250977, "margin_dpo/loss_margin_mean": 32.071006774902344, "margin_dpo/margin_mean": 32.071006774902344, "margin_dpo/margin_std": 29.059005737304688, "step": 523 }, { "epoch": 0.7694566813509545, "grad_norm": 61.96669387817383, "learning_rate": 7.781828926091535e-08, "logits/chosen": -0.5891939401626587, "logits/rejected": -0.5522305965423584, "logps/chosen": -101.28225708007812, "logps/ref_chosen": -78.0720443725586, "logps/ref_rejected": -81.30198669433594, "logps/rejected": -134.13906860351562, "loss": 0.502, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1610349714756012, "margin_dpo/beta_margin_grad_std": 0.2138672173023224, "margin_dpo/beta_margin_mean": 2.962686538696289, "margin_dpo/loss_margin_mean": 29.62686538696289, "margin_dpo/margin_mean": 29.62686538696289, "margin_dpo/margin_std": 24.750131607055664, "step": 524 }, { "epoch": 0.7709251101321586, "grad_norm": 38.1711540222168, "learning_rate": 7.689012058193384e-08, "logits/chosen": -0.5875400304794312, "logits/rejected": -0.5771076679229736, "logps/chosen": -73.10057830810547, "logps/ref_chosen": -50.827857971191406, "logps/ref_rejected": -100.05293273925781, "logps/rejected": -157.6528778076172, "loss": 0.2666, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10849446058273315, "margin_dpo/beta_margin_grad_std": 0.14546433091163635, "margin_dpo/beta_margin_mean": 3.532721996307373, "margin_dpo/loss_margin_mean": 35.32722091674805, "margin_dpo/margin_mean": 35.32722091674805, "margin_dpo/margin_std": 23.876306533813477, "step": 525 }, { "epoch": 0.7723935389133627, "grad_norm": 69.49327850341797, "learning_rate": 7.596651350926836e-08, "logits/chosen": -0.6228262782096863, "logits/rejected": -0.5682265162467957, "logps/chosen": -88.6613540649414, "logps/ref_chosen": -63.167232513427734, "logps/ref_rejected": -86.30934143066406, "logps/rejected": -146.317138671875, "loss": 0.4345, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14551463723182678, "margin_dpo/beta_margin_grad_std": 0.21542373299598694, "margin_dpo/beta_margin_mean": 3.451366901397705, "margin_dpo/loss_margin_mean": 34.51366424560547, "margin_dpo/margin_mean": 34.51366424560547, "margin_dpo/margin_std": 27.885501861572266, "step": 526 }, { "epoch": 0.7738619676945668, "grad_norm": 59.04280471801758, "learning_rate": 7.504749238082414e-08, "logits/chosen": -0.6852065324783325, "logits/rejected": -0.6316944360733032, "logps/chosen": -94.76507568359375, "logps/ref_chosen": -71.12867736816406, "logps/ref_rejected": -78.3425521850586, "logps/rejected": -134.12088012695312, "loss": 0.4411, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1558818817138672, "margin_dpo/beta_margin_grad_std": 0.2005808800458908, "margin_dpo/beta_margin_mean": 3.2141916751861572, "margin_dpo/loss_margin_mean": 32.14191436767578, "margin_dpo/margin_mean": 32.14191436767578, "margin_dpo/margin_std": 28.21198272705078, "step": 527 }, { "epoch": 0.775330396475771, "grad_norm": 52.068695068359375, "learning_rate": 7.413308141366254e-08, "logits/chosen": -0.6312476396560669, "logits/rejected": -0.6082254648208618, "logps/chosen": -91.81694030761719, "logps/ref_chosen": -68.0894546508789, "logps/ref_rejected": -93.91006469726562, "logps/rejected": -148.40286254882812, "loss": 0.4254, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1584298014640808, "margin_dpo/beta_margin_grad_std": 0.2020604908466339, "margin_dpo/beta_margin_mean": 3.0765323638916016, "margin_dpo/loss_margin_mean": 30.765323638916016, "margin_dpo/margin_mean": 30.765323638916016, "margin_dpo/margin_std": 25.180191040039062, "step": 528 }, { "epoch": 0.7767988252569751, "grad_norm": 78.19168853759766, "learning_rate": 7.322330470336313e-08, "logits/chosen": -0.6202692985534668, "logits/rejected": -0.5999141931533813, "logps/chosen": -82.72918701171875, "logps/ref_chosen": -55.5749626159668, "logps/ref_rejected": -89.20909118652344, "logps/rejected": -144.00283813476562, "loss": 0.7184, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19822926819324493, "margin_dpo/beta_margin_grad_std": 0.24975398182868958, "margin_dpo/beta_margin_mean": 2.7639517784118652, "margin_dpo/loss_margin_mean": 27.63951873779297, "margin_dpo/margin_mean": 27.63951873779297, "margin_dpo/margin_std": 28.70267677307129, "step": 529 }, { "epoch": 0.7782672540381792, "grad_norm": 62.91647720336914, "learning_rate": 7.231818622338822e-08, "logits/chosen": -0.5722482204437256, "logits/rejected": -0.5528737902641296, "logps/chosen": -71.75675201416016, "logps/ref_chosen": -47.601417541503906, "logps/ref_rejected": -87.2845230102539, "logps/rejected": -148.30410766601562, "loss": 0.4394, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13412612676620483, "margin_dpo/beta_margin_grad_std": 0.2340560257434845, "margin_dpo/beta_margin_mean": 3.686424732208252, "margin_dpo/loss_margin_mean": 36.8642463684082, "margin_dpo/margin_mean": 36.8642463684082, "margin_dpo/margin_std": 27.135440826416016, "step": 530 }, { "epoch": 0.7797356828193832, "grad_norm": 56.84402084350586, "learning_rate": 7.141774982445147e-08, "logits/chosen": -0.6246213912963867, "logits/rejected": -0.583504319190979, "logps/chosen": -78.43968200683594, "logps/ref_chosen": -55.246063232421875, "logps/ref_rejected": -70.60598754882812, "logps/rejected": -126.05680847167969, "loss": 0.5239, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16395235061645508, "margin_dpo/beta_margin_grad_std": 0.2341843992471695, "margin_dpo/beta_margin_mean": 3.2257208824157715, "margin_dpo/loss_margin_mean": 32.25720977783203, "margin_dpo/margin_mean": 32.25720977783203, "margin_dpo/margin_std": 28.00493049621582, "step": 531 }, { "epoch": 0.7812041116005873, "grad_norm": 58.166229248046875, "learning_rate": 7.052201923388953e-08, "logits/chosen": -0.5881683826446533, "logits/rejected": -0.554157018661499, "logps/chosen": -94.40569305419922, "logps/ref_chosen": -70.28602600097656, "logps/ref_rejected": -86.5913314819336, "logps/rejected": -148.8220672607422, "loss": 0.3506, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12911692261695862, "margin_dpo/beta_margin_grad_std": 0.19371378421783447, "margin_dpo/beta_margin_mean": 3.8111071586608887, "margin_dpo/loss_margin_mean": 38.11106872558594, "margin_dpo/margin_mean": 38.11106872558594, "margin_dpo/margin_std": 27.990556716918945, "step": 532 }, { "epoch": 0.7826725403817915, "grad_norm": 70.82559204101562, "learning_rate": 6.963101805503646e-08, "logits/chosen": -0.6239089965820312, "logits/rejected": -0.5815380215644836, "logps/chosen": -88.10252380371094, "logps/ref_chosen": -64.8551025390625, "logps/ref_rejected": -76.58805847167969, "logps/rejected": -126.4080581665039, "loss": 0.5933, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18534313142299652, "margin_dpo/beta_margin_grad_std": 0.23213329911231995, "margin_dpo/beta_margin_mean": 2.657257080078125, "margin_dpo/loss_margin_mean": 26.57257080078125, "margin_dpo/margin_mean": 26.572572708129883, "margin_dpo/margin_std": 23.991806030273438, "step": 533 }, { "epoch": 0.7841409691629956, "grad_norm": 47.19809341430664, "learning_rate": 6.874476976660184e-08, "logits/chosen": -0.623961329460144, "logits/rejected": -0.5935629606246948, "logps/chosen": -82.6689453125, "logps/ref_chosen": -60.119388580322266, "logps/ref_rejected": -78.54347229003906, "logps/rejected": -133.5894012451172, "loss": 0.4033, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1534009873867035, "margin_dpo/beta_margin_grad_std": 0.18615348637104034, "margin_dpo/beta_margin_mean": 3.2496376037597656, "margin_dpo/loss_margin_mean": 32.496376037597656, "margin_dpo/margin_mean": 32.49637222290039, "margin_dpo/margin_std": 27.735137939453125, "step": 534 }, { "epoch": 0.7856093979441997, "grad_norm": 46.1759147644043, "learning_rate": 6.786329772205246e-08, "logits/chosen": -0.6095963716506958, "logits/rejected": -0.5779241919517517, "logps/chosen": -74.88334655761719, "logps/ref_chosen": -54.330238342285156, "logps/ref_rejected": -96.30763244628906, "logps/rejected": -152.88735961914062, "loss": 0.3929, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13403823971748352, "margin_dpo/beta_margin_grad_std": 0.19758474826812744, "margin_dpo/beta_margin_mean": 3.602663040161133, "margin_dpo/loss_margin_mean": 36.02663040161133, "margin_dpo/margin_mean": 36.02663040161133, "margin_dpo/margin_std": 26.887496948242188, "step": 535 }, { "epoch": 0.7870778267254038, "grad_norm": 33.667205810546875, "learning_rate": 6.698662514899638e-08, "logits/chosen": -0.6077243089675903, "logits/rejected": -0.5903106927871704, "logps/chosen": -67.83413696289062, "logps/ref_chosen": -47.08053207397461, "logps/ref_rejected": -89.09783935546875, "logps/rejected": -150.54098510742188, "loss": 0.2183, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0890052318572998, "margin_dpo/beta_margin_grad_std": 0.13288089632987976, "margin_dpo/beta_margin_mean": 4.068953990936279, "margin_dpo/loss_margin_mean": 40.689537048339844, "margin_dpo/margin_mean": 40.689537048339844, "margin_dpo/margin_std": 27.558616638183594, "step": 536 }, { "epoch": 0.788546255506608, "grad_norm": 60.72896957397461, "learning_rate": 6.611477514857114e-08, "logits/chosen": -0.6039552688598633, "logits/rejected": -0.5421825647354126, "logps/chosen": -78.5447998046875, "logps/ref_chosen": -57.747474670410156, "logps/ref_rejected": -70.43838500976562, "logps/rejected": -124.99288940429688, "loss": 0.4139, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1552681028842926, "margin_dpo/beta_margin_grad_std": 0.19483794271945953, "margin_dpo/beta_margin_mean": 3.375717878341675, "margin_dpo/loss_margin_mean": 33.757179260253906, "margin_dpo/margin_mean": 33.757179260253906, "margin_dpo/margin_std": 28.151874542236328, "step": 537 }, { "epoch": 0.7900146842878121, "grad_norm": 46.434898376464844, "learning_rate": 6.524777069483525e-08, "logits/chosen": -0.616761326789856, "logits/rejected": -0.5684964656829834, "logps/chosen": -89.30928039550781, "logps/ref_chosen": -66.41593933105469, "logps/ref_rejected": -84.22808837890625, "logps/rejected": -139.8426055908203, "loss": 0.3541, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12844222784042358, "margin_dpo/beta_margin_grad_std": 0.16971102356910706, "margin_dpo/beta_margin_mean": 3.2721188068389893, "margin_dpo/loss_margin_mean": 32.721187591552734, "margin_dpo/margin_mean": 32.721187591552734, "margin_dpo/margin_std": 25.067447662353516, "step": 538 }, { "epoch": 0.7914831130690162, "grad_norm": 55.15032196044922, "learning_rate": 6.438563463416221e-08, "logits/chosen": -0.6659849882125854, "logits/rejected": -0.6233581304550171, "logps/chosen": -79.83650207519531, "logps/ref_chosen": -58.49285125732422, "logps/ref_rejected": -91.85395812988281, "logps/rejected": -144.42047119140625, "loss": 0.4882, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16426563262939453, "margin_dpo/beta_margin_grad_std": 0.21655422449111938, "margin_dpo/beta_margin_mean": 3.122286319732666, "margin_dpo/loss_margin_mean": 31.22286605834961, "margin_dpo/margin_mean": 31.22286605834961, "margin_dpo/margin_std": 27.15618133544922, "step": 539 }, { "epoch": 0.7929515418502202, "grad_norm": 62.82166290283203, "learning_rate": 6.352838968463919e-08, "logits/chosen": -0.6381834149360657, "logits/rejected": -0.6113142967224121, "logps/chosen": -85.19060516357422, "logps/ref_chosen": -63.482513427734375, "logps/ref_rejected": -116.43000030517578, "logps/rejected": -173.4632110595703, "loss": 0.4628, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13866056501865387, "margin_dpo/beta_margin_grad_std": 0.2188330590724945, "margin_dpo/beta_margin_mean": 3.5325119495391846, "margin_dpo/loss_margin_mean": 35.32511901855469, "margin_dpo/margin_mean": 35.32511901855469, "margin_dpo/margin_std": 27.556922912597656, "step": 540 }, { "epoch": 0.7944199706314243, "grad_norm": 62.53669738769531, "learning_rate": 6.267605843546767e-08, "logits/chosen": -0.6469000577926636, "logits/rejected": -0.6038193702697754, "logps/chosen": -101.33219146728516, "logps/ref_chosen": -78.28035736083984, "logps/ref_rejected": -103.273681640625, "logps/rejected": -156.50372314453125, "loss": 0.4275, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15085425972938538, "margin_dpo/beta_margin_grad_std": 0.18950016796588898, "margin_dpo/beta_margin_mean": 3.017820358276367, "margin_dpo/loss_margin_mean": 30.17820167541504, "margin_dpo/margin_mean": 30.178203582763672, "margin_dpo/margin_std": 23.339244842529297, "step": 541 }, { "epoch": 0.7958883994126285, "grad_norm": 39.13835144042969, "learning_rate": 6.182866334636888e-08, "logits/chosen": -0.6534620523452759, "logits/rejected": -0.6460641622543335, "logps/chosen": -80.37567901611328, "logps/ref_chosen": -57.48497009277344, "logps/ref_rejected": -96.47506713867188, "logps/rejected": -153.80548095703125, "loss": 0.366, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13392959535121918, "margin_dpo/beta_margin_grad_std": 0.19380900263786316, "margin_dpo/beta_margin_mean": 3.4439687728881836, "margin_dpo/loss_margin_mean": 34.43968963623047, "margin_dpo/margin_mean": 34.43968963623047, "margin_dpo/margin_std": 25.000656127929688, "step": 542 }, { "epoch": 0.7973568281938326, "grad_norm": 80.92756652832031, "learning_rate": 6.098622674699147e-08, "logits/chosen": -0.5841265916824341, "logits/rejected": -0.5707241296768188, "logps/chosen": -84.22129821777344, "logps/ref_chosen": -60.61750793457031, "logps/ref_rejected": -105.59896850585938, "logps/rejected": -154.79116821289062, "loss": 0.6059, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19587799906730652, "margin_dpo/beta_margin_grad_std": 0.22574475407600403, "margin_dpo/beta_margin_mean": 2.5588417053222656, "margin_dpo/loss_margin_mean": 25.588415145874023, "margin_dpo/margin_mean": 25.588417053222656, "margin_dpo/margin_std": 25.410099029541016, "step": 543 }, { "epoch": 0.7988252569750367, "grad_norm": 46.70132064819336, "learning_rate": 6.01487708363232e-08, "logits/chosen": -0.6044985055923462, "logits/rejected": -0.5905438661575317, "logps/chosen": -85.07525634765625, "logps/ref_chosen": -59.642303466796875, "logps/ref_rejected": -100.95469665527344, "logps/rejected": -159.369873046875, "loss": 0.3149, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12085522711277008, "margin_dpo/beta_margin_grad_std": 0.16033101081848145, "margin_dpo/beta_margin_mean": 3.2982213497161865, "margin_dpo/loss_margin_mean": 32.98221206665039, "margin_dpo/margin_mean": 32.98221206665039, "margin_dpo/margin_std": 24.098819732666016, "step": 544 }, { "epoch": 0.8002936857562408, "grad_norm": 49.51677703857422, "learning_rate": 5.9316317682106294e-08, "logits/chosen": -0.5612127780914307, "logits/rejected": -0.5339560508728027, "logps/chosen": -91.53611755371094, "logps/ref_chosen": -67.64859771728516, "logps/ref_rejected": -95.90800476074219, "logps/rejected": -154.13796997070312, "loss": 0.3835, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14522799849510193, "margin_dpo/beta_margin_grad_std": 0.19207137823104858, "margin_dpo/beta_margin_mean": 3.43424654006958, "margin_dpo/loss_margin_mean": 34.34246826171875, "margin_dpo/margin_mean": 34.34246826171875, "margin_dpo/margin_std": 26.62921142578125, "step": 545 }, { "epoch": 0.801762114537445, "grad_norm": 49.70174789428711, "learning_rate": 5.848888922025552e-08, "logits/chosen": -0.5628246665000916, "logits/rejected": -0.5294591188430786, "logps/chosen": -73.03337097167969, "logps/ref_chosen": -50.744232177734375, "logps/ref_rejected": -81.86622619628906, "logps/rejected": -137.66339111328125, "loss": 0.3402, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12423272430896759, "margin_dpo/beta_margin_grad_std": 0.17410895228385925, "margin_dpo/beta_margin_mean": 3.350802421569824, "margin_dpo/loss_margin_mean": 33.50802230834961, "margin_dpo/margin_mean": 33.50802230834961, "margin_dpo/margin_std": 23.63653564453125, "step": 546 }, { "epoch": 0.8032305433186491, "grad_norm": 94.18778228759766, "learning_rate": 5.7666507254280265e-08, "logits/chosen": -0.5813489556312561, "logits/rejected": -0.5452552437782288, "logps/chosen": -99.18565368652344, "logps/ref_chosen": -73.6877212524414, "logps/ref_rejected": -90.76136779785156, "logps/rejected": -147.3549041748047, "loss": 0.5868, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18151536583900452, "margin_dpo/beta_margin_grad_std": 0.23811323940753937, "margin_dpo/beta_margin_mean": 3.109560966491699, "margin_dpo/loss_margin_mean": 31.09560775756836, "margin_dpo/margin_mean": 31.09560775756836, "margin_dpo/margin_std": 29.87148666381836, "step": 547 }, { "epoch": 0.8046989720998532, "grad_norm": 51.33172607421875, "learning_rate": 5.684919345471029e-08, "logits/chosen": -0.6642282009124756, "logits/rejected": -0.6327365040779114, "logps/chosen": -87.51785278320312, "logps/ref_chosen": -65.24634552001953, "logps/ref_rejected": -94.11807250976562, "logps/rejected": -150.5765380859375, "loss": 0.4329, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14642292261123657, "margin_dpo/beta_margin_grad_std": 0.21783213317394257, "margin_dpo/beta_margin_mean": 3.4186956882476807, "margin_dpo/loss_margin_mean": 34.186954498291016, "margin_dpo/margin_mean": 34.186954498291016, "margin_dpo/margin_std": 28.527481079101562, "step": 548 }, { "epoch": 0.8061674008810573, "grad_norm": 59.542022705078125, "learning_rate": 5.603696935852426e-08, "logits/chosen": -0.587199866771698, "logits/rejected": -0.5497395992279053, "logps/chosen": -70.22129821777344, "logps/ref_chosen": -49.21235656738281, "logps/ref_rejected": -73.91031646728516, "logps/rejected": -129.74290466308594, "loss": 0.3415, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12509265542030334, "margin_dpo/beta_margin_grad_std": 0.17894169688224792, "margin_dpo/beta_margin_mean": 3.4823646545410156, "margin_dpo/loss_margin_mean": 34.823646545410156, "margin_dpo/margin_mean": 34.823646545410156, "margin_dpo/margin_std": 25.530513763427734, "step": 549 }, { "epoch": 0.8076358296622613, "grad_norm": 69.98318481445312, "learning_rate": 5.5229856368582376e-08, "logits/chosen": -0.5780174732208252, "logits/rejected": -0.554786741733551, "logps/chosen": -81.68783569335938, "logps/ref_chosen": -56.80695343017578, "logps/ref_rejected": -95.12580871582031, "logps/rejected": -147.86605834960938, "loss": 0.512, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18082059919834137, "margin_dpo/beta_margin_grad_std": 0.22207701206207275, "margin_dpo/beta_margin_mean": 2.7859373092651367, "margin_dpo/loss_margin_mean": 27.859371185302734, "margin_dpo/margin_mean": 27.859375, "margin_dpo/margin_std": 24.073030471801758, "step": 550 }, { "epoch": 0.8091042584434655, "grad_norm": 68.77825164794922, "learning_rate": 5.4427875753062734e-08, "logits/chosen": -0.6016166806221008, "logits/rejected": -0.5792367458343506, "logps/chosen": -82.6038589477539, "logps/ref_chosen": -59.10633087158203, "logps/ref_rejected": -111.67280578613281, "logps/rejected": -170.34124755859375, "loss": 0.3618, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13445577025413513, "margin_dpo/beta_margin_grad_std": 0.18969406187534332, "margin_dpo/beta_margin_mean": 3.5170915126800537, "margin_dpo/loss_margin_mean": 35.17091369628906, "margin_dpo/margin_mean": 35.17091369628906, "margin_dpo/margin_std": 27.047245025634766, "step": 551 }, { "epoch": 0.8105726872246696, "grad_norm": 36.09619903564453, "learning_rate": 5.363104864490034e-08, "logits/chosen": -0.6584379076957703, "logits/rejected": -0.6297129392623901, "logps/chosen": -82.9939193725586, "logps/ref_chosen": -62.35459899902344, "logps/ref_rejected": -104.56210327148438, "logps/rejected": -164.9587860107422, "loss": 0.2475, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10201341658830643, "margin_dpo/beta_margin_grad_std": 0.1296149343252182, "margin_dpo/beta_margin_mean": 3.975735902786255, "margin_dpo/loss_margin_mean": 39.757354736328125, "margin_dpo/margin_mean": 39.757354736328125, "margin_dpo/margin_std": 30.61846923828125, "step": 552 }, { "epoch": 0.8120411160058737, "grad_norm": 62.299354553222656, "learning_rate": 5.2839396041230415e-08, "logits/chosen": -0.5835554599761963, "logits/rejected": -0.5560900568962097, "logps/chosen": -89.63333129882812, "logps/ref_chosen": -68.25881958007812, "logps/ref_rejected": -98.0971450805664, "logps/rejected": -150.1568603515625, "loss": 0.4084, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15132924914360046, "margin_dpo/beta_margin_grad_std": 0.1949571967124939, "margin_dpo/beta_margin_mean": 3.0685200691223145, "margin_dpo/loss_margin_mean": 30.685199737548828, "margin_dpo/margin_mean": 30.685199737548828, "margin_dpo/margin_std": 24.393556594848633, "step": 553 }, { "epoch": 0.8135095447870778, "grad_norm": 70.59496307373047, "learning_rate": 5.205293880283551e-08, "logits/chosen": -0.5978009104728699, "logits/rejected": -0.5454249382019043, "logps/chosen": -91.25200653076172, "logps/ref_chosen": -67.94767761230469, "logps/ref_rejected": -89.78272247314453, "logps/rejected": -154.95721435546875, "loss": 0.4373, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12750211358070374, "margin_dpo/beta_margin_grad_std": 0.22240933775901794, "margin_dpo/beta_margin_mean": 4.187016487121582, "margin_dpo/loss_margin_mean": 41.87016677856445, "margin_dpo/margin_mean": 41.87016677856445, "margin_dpo/margin_std": 30.95236587524414, "step": 554 }, { "epoch": 0.8149779735682819, "grad_norm": 61.74562454223633, "learning_rate": 5.127169765359515e-08, "logits/chosen": -0.5948277115821838, "logits/rejected": -0.5893919467926025, "logps/chosen": -75.4261245727539, "logps/ref_chosen": -53.33049011230469, "logps/ref_rejected": -108.47937774658203, "logps/rejected": -165.4979248046875, "loss": 0.4571, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14716145396232605, "margin_dpo/beta_margin_grad_std": 0.20644359290599823, "margin_dpo/beta_margin_mean": 3.4922895431518555, "margin_dpo/loss_margin_mean": 34.92289733886719, "margin_dpo/margin_mean": 34.92289733886719, "margin_dpo/margin_std": 27.98041534423828, "step": 555 }, { "epoch": 0.8164464023494861, "grad_norm": 72.64017486572266, "learning_rate": 5.049569317994012e-08, "logits/chosen": -0.5797896385192871, "logits/rejected": -0.5396873950958252, "logps/chosen": -80.73486328125, "logps/ref_chosen": -58.64447021484375, "logps/ref_rejected": -101.34040832519531, "logps/rejected": -154.17111206054688, "loss": 0.5343, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1822032928466797, "margin_dpo/beta_margin_grad_std": 0.2356235682964325, "margin_dpo/beta_margin_mean": 3.074030876159668, "margin_dpo/loss_margin_mean": 30.740306854248047, "margin_dpo/margin_mean": 30.740306854248047, "margin_dpo/margin_std": 27.73691177368164, "step": 556 }, { "epoch": 0.8179148311306902, "grad_norm": 52.41410446166992, "learning_rate": 4.9724945830310144e-08, "logits/chosen": -0.6446192264556885, "logits/rejected": -0.6262944936752319, "logps/chosen": -89.59944152832031, "logps/ref_chosen": -67.84066009521484, "logps/ref_rejected": -109.93966674804688, "logps/rejected": -162.2843475341797, "loss": 0.4552, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16416671872138977, "margin_dpo/beta_margin_grad_std": 0.19996830821037292, "margin_dpo/beta_margin_mean": 3.0585899353027344, "margin_dpo/loss_margin_mean": 30.585901260375977, "margin_dpo/margin_mean": 30.585901260375977, "margin_dpo/margin_std": 26.19734001159668, "step": 557 }, { "epoch": 0.8193832599118943, "grad_norm": 38.192787170410156, "learning_rate": 4.8959475914614554e-08, "logits/chosen": -0.6551119089126587, "logits/rejected": -0.6068642139434814, "logps/chosen": -81.69489288330078, "logps/ref_chosen": -62.36824035644531, "logps/ref_rejected": -102.16102600097656, "logps/rejected": -162.4652862548828, "loss": 0.2874, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10797977447509766, "margin_dpo/beta_margin_grad_std": 0.14597085118293762, "margin_dpo/beta_margin_mean": 4.0977606773376465, "margin_dpo/loss_margin_mean": 40.97760772705078, "margin_dpo/margin_mean": 40.97760772705078, "margin_dpo/margin_std": 30.455211639404297, "step": 558 }, { "epoch": 0.8208516886930984, "grad_norm": 55.569332122802734, "learning_rate": 4.8199303603697614e-08, "logits/chosen": -0.6832656860351562, "logits/rejected": -0.6301894187927246, "logps/chosen": -80.5696029663086, "logps/ref_chosen": -60.75232696533203, "logps/ref_rejected": -93.4422836303711, "logps/rejected": -146.19882202148438, "loss": 0.4325, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15588068962097168, "margin_dpo/beta_margin_grad_std": 0.20822836458683014, "margin_dpo/beta_margin_mean": 3.2939257621765137, "margin_dpo/loss_margin_mean": 32.93925857543945, "margin_dpo/margin_mean": 32.93925476074219, "margin_dpo/margin_std": 27.555404663085938, "step": 559 }, { "epoch": 0.8223201174743024, "grad_norm": 67.84832000732422, "learning_rate": 4.7444448928806615e-08, "logits/chosen": -0.5837658643722534, "logits/rejected": -0.5339952707290649, "logps/chosen": -79.17695617675781, "logps/ref_chosen": -58.10382080078125, "logps/ref_rejected": -79.99122619628906, "logps/rejected": -130.1254425048828, "loss": 0.4489, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16372855007648468, "margin_dpo/beta_margin_grad_std": 0.1933506578207016, "margin_dpo/beta_margin_mean": 2.906108856201172, "margin_dpo/loss_margin_mean": 29.061086654663086, "margin_dpo/margin_mean": 29.061086654663086, "margin_dpo/margin_std": 24.71479034423828, "step": 560 }, { "epoch": 0.8237885462555066, "grad_norm": 66.11046600341797, "learning_rate": 4.669493178106432e-08, "logits/chosen": -0.6318497657775879, "logits/rejected": -0.6243282556533813, "logps/chosen": -76.038330078125, "logps/ref_chosen": -50.91287612915039, "logps/ref_rejected": -99.06857299804688, "logps/rejected": -153.46937561035156, "loss": 0.4945, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17272219061851501, "margin_dpo/beta_margin_grad_std": 0.21409326791763306, "margin_dpo/beta_margin_mean": 2.927535057067871, "margin_dpo/loss_margin_mean": 29.275352478027344, "margin_dpo/margin_mean": 29.275352478027344, "margin_dpo/margin_std": 26.028850555419922, "step": 561 }, { "epoch": 0.8252569750367107, "grad_norm": 34.92936706542969, "learning_rate": 4.5950771910944596e-08, "logits/chosen": -0.651642382144928, "logits/rejected": -0.604433536529541, "logps/chosen": -78.28529357910156, "logps/ref_chosen": -59.46440124511719, "logps/ref_rejected": -96.54266357421875, "logps/rejected": -153.3458709716797, "loss": 0.2435, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0989568680524826, "margin_dpo/beta_margin_grad_std": 0.1261216104030609, "margin_dpo/beta_margin_mean": 3.7982311248779297, "margin_dpo/loss_margin_mean": 37.9823112487793, "margin_dpo/margin_mean": 37.9823112487793, "margin_dpo/margin_std": 26.726564407348633, "step": 562 }, { "epoch": 0.8267254038179148, "grad_norm": 63.81352233886719, "learning_rate": 4.521198892775202e-08, "logits/chosen": -0.5930050611495972, "logits/rejected": -0.5729939937591553, "logps/chosen": -83.12980651855469, "logps/ref_chosen": -60.60819625854492, "logps/ref_rejected": -94.56770324707031, "logps/rejected": -147.40249633789062, "loss": 0.4148, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15109089016914368, "margin_dpo/beta_margin_grad_std": 0.1935727894306183, "margin_dpo/beta_margin_mean": 3.031318426132202, "margin_dpo/loss_margin_mean": 30.313182830810547, "margin_dpo/margin_mean": 30.313182830810547, "margin_dpo/margin_std": 23.21819496154785, "step": 563 }, { "epoch": 0.8281938325991189, "grad_norm": 47.72722244262695, "learning_rate": 4.447860229910544e-08, "logits/chosen": -0.656052827835083, "logits/rejected": -0.5981060862541199, "logps/chosen": -96.48939514160156, "logps/ref_chosen": -74.26837921142578, "logps/ref_rejected": -93.2381820678711, "logps/rejected": -147.96966552734375, "loss": 0.368, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13481834530830383, "margin_dpo/beta_margin_grad_std": 0.1763157844543457, "margin_dpo/beta_margin_mean": 3.2510476112365723, "margin_dpo/loss_margin_mean": 32.510475158691406, "margin_dpo/margin_mean": 32.510475158691406, "margin_dpo/margin_std": 22.74962043762207, "step": 564 }, { "epoch": 0.8296622613803231, "grad_norm": 44.3295783996582, "learning_rate": 4.375063135042445e-08, "logits/chosen": -0.6097604036331177, "logits/rejected": -0.5671969652175903, "logps/chosen": -91.07102966308594, "logps/ref_chosen": -69.0199203491211, "logps/ref_rejected": -85.7789306640625, "logps/rejected": -143.09686279296875, "loss": 0.3731, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.141332745552063, "margin_dpo/beta_margin_grad_std": 0.18416938185691833, "margin_dpo/beta_margin_mean": 3.5266833305358887, "margin_dpo/loss_margin_mean": 35.2668342590332, "margin_dpo/margin_mean": 35.2668342590332, "margin_dpo/margin_std": 30.624713897705078, "step": 565 }, { "epoch": 0.8311306901615272, "grad_norm": 56.34800338745117, "learning_rate": 4.3028095264420525e-08, "logits/chosen": -0.5949935913085938, "logits/rejected": -0.5808389186859131, "logps/chosen": -87.20069885253906, "logps/ref_chosen": -66.5453109741211, "logps/ref_rejected": -103.86931610107422, "logps/rejected": -158.5188751220703, "loss": 0.4755, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16708871722221375, "margin_dpo/beta_margin_grad_std": 0.21157479286193848, "margin_dpo/beta_margin_mean": 3.399416923522949, "margin_dpo/loss_margin_mean": 33.994171142578125, "margin_dpo/margin_mean": 33.994171142578125, "margin_dpo/margin_std": 29.911640167236328, "step": 566 }, { "epoch": 0.8325991189427313, "grad_norm": 82.15995025634766, "learning_rate": 4.231101308059165e-08, "logits/chosen": -0.6804023385047913, "logits/rejected": -0.6269962787628174, "logps/chosen": -75.24916076660156, "logps/ref_chosen": -52.858299255371094, "logps/ref_rejected": -85.37095642089844, "logps/rejected": -140.0916748046875, "loss": 0.5883, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17573551833629608, "margin_dpo/beta_margin_grad_std": 0.24265018105506897, "margin_dpo/beta_margin_mean": 3.2329859733581543, "margin_dpo/loss_margin_mean": 32.329856872558594, "margin_dpo/margin_mean": 32.32986068725586, "margin_dpo/margin_std": 28.114917755126953, "step": 567 }, { "epoch": 0.8340675477239354, "grad_norm": 43.57807159423828, "learning_rate": 4.1599403694720145e-08, "logits/chosen": -0.580660343170166, "logits/rejected": -0.5636056065559387, "logps/chosen": -67.96955108642578, "logps/ref_chosen": -45.1923828125, "logps/ref_rejected": -89.09236145019531, "logps/rejected": -149.81170654296875, "loss": 0.3489, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12049505114555359, "margin_dpo/beta_margin_grad_std": 0.17438393831253052, "margin_dpo/beta_margin_mean": 3.7942161560058594, "margin_dpo/loss_margin_mean": 37.942161560058594, "margin_dpo/margin_mean": 37.942161560058594, "margin_dpo/margin_std": 26.538555145263672, "step": 568 }, { "epoch": 0.8355359765051396, "grad_norm": 63.59123229980469, "learning_rate": 4.089328585837512e-08, "logits/chosen": -0.6394084692001343, "logits/rejected": -0.6091455817222595, "logps/chosen": -86.40789794921875, "logps/ref_chosen": -63.72056198120117, "logps/ref_rejected": -79.10325622558594, "logps/rejected": -131.8647918701172, "loss": 0.5032, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17486035823822021, "margin_dpo/beta_margin_grad_std": 0.22121158242225647, "margin_dpo/beta_margin_mean": 3.007420063018799, "margin_dpo/loss_margin_mean": 30.074199676513672, "margin_dpo/margin_mean": 30.074199676513672, "margin_dpo/margin_std": 27.18084716796875, "step": 569 }, { "epoch": 0.8370044052863436, "grad_norm": 53.639320373535156, "learning_rate": 4.019267817841834e-08, "logits/chosen": -0.674132764339447, "logits/rejected": -0.6270936131477356, "logps/chosen": -82.65512084960938, "logps/ref_chosen": -61.61454772949219, "logps/ref_rejected": -82.1418685913086, "logps/rejected": -138.31561279296875, "loss": 0.329, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1217803955078125, "margin_dpo/beta_margin_grad_std": 0.18320615589618683, "margin_dpo/beta_margin_mean": 3.5133180618286133, "margin_dpo/loss_margin_mean": 35.1331787109375, "margin_dpo/margin_mean": 35.1331787109375, "margin_dpo/margin_std": 25.58907127380371, "step": 570 }, { "epoch": 0.8384728340675477, "grad_norm": 61.772037506103516, "learning_rate": 3.9497599116513705e-08, "logits/chosen": -0.5997041463851929, "logits/rejected": -0.5761772990226746, "logps/chosen": -75.53142547607422, "logps/ref_chosen": -53.05406188964844, "logps/ref_rejected": -91.33682250976562, "logps/rejected": -148.2672119140625, "loss": 0.3786, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1382187306880951, "margin_dpo/beta_margin_grad_std": 0.19462428987026215, "margin_dpo/beta_margin_mean": 3.445303440093994, "margin_dpo/loss_margin_mean": 34.453033447265625, "margin_dpo/margin_mean": 34.453033447265625, "margin_dpo/margin_std": 27.182416915893555, "step": 571 }, { "epoch": 0.8399412628487518, "grad_norm": 78.96542358398438, "learning_rate": 3.880806698864086e-08, "logits/chosen": -0.5895199775695801, "logits/rejected": -0.5727903246879578, "logps/chosen": -75.78829193115234, "logps/ref_chosen": -48.459285736083984, "logps/ref_rejected": -83.5570297241211, "logps/rejected": -143.24444580078125, "loss": 0.6978, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18673813343048096, "margin_dpo/beta_margin_grad_std": 0.25194963812828064, "margin_dpo/beta_margin_mean": 3.23583984375, "margin_dpo/loss_margin_mean": 32.3583984375, "margin_dpo/margin_mean": 32.3583984375, "margin_dpo/margin_std": 31.786035537719727, "step": 572 }, { "epoch": 0.8414096916299559, "grad_norm": 59.523719787597656, "learning_rate": 3.812409996461275e-08, "logits/chosen": -0.6645894050598145, "logits/rejected": -0.6301409602165222, "logps/chosen": -73.29808044433594, "logps/ref_chosen": -51.62262725830078, "logps/ref_rejected": -85.32499694824219, "logps/rejected": -141.5244903564453, "loss": 0.4423, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1509319394826889, "margin_dpo/beta_margin_grad_std": 0.21270796656608582, "margin_dpo/beta_margin_mean": 3.452404499053955, "margin_dpo/loss_margin_mean": 34.5240478515625, "margin_dpo/margin_mean": 34.5240478515625, "margin_dpo/margin_std": 26.213939666748047, "step": 573 }, { "epoch": 0.8428781204111601, "grad_norm": 71.15605926513672, "learning_rate": 3.74457160675965e-08, "logits/chosen": -0.6428389549255371, "logits/rejected": -0.6086920499801636, "logps/chosen": -74.59834289550781, "logps/ref_chosen": -51.04446029663086, "logps/ref_rejected": -92.80640411376953, "logps/rejected": -150.51730346679688, "loss": 0.4548, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14354148507118225, "margin_dpo/beta_margin_grad_std": 0.1961897611618042, "margin_dpo/beta_margin_mean": 3.415701389312744, "margin_dpo/loss_margin_mean": 34.157012939453125, "margin_dpo/margin_mean": 34.157012939453125, "margin_dpo/margin_std": 27.831592559814453, "step": 574 }, { "epoch": 0.8443465491923642, "grad_norm": 86.26287078857422, "learning_rate": 3.677293317363864e-08, "logits/chosen": -0.5673672556877136, "logits/rejected": -0.5325363874435425, "logps/chosen": -97.13941955566406, "logps/ref_chosen": -71.79014587402344, "logps/ref_rejected": -95.38619995117188, "logps/rejected": -157.06790161132812, "loss": 0.6399, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1683315932750702, "margin_dpo/beta_margin_grad_std": 0.2584590017795563, "margin_dpo/beta_margin_mean": 3.6332435607910156, "margin_dpo/loss_margin_mean": 36.332435607910156, "margin_dpo/margin_mean": 36.332435607910156, "margin_dpo/margin_std": 31.414226531982422, "step": 575 }, { "epoch": 0.8458149779735683, "grad_norm": 49.230098724365234, "learning_rate": 3.6105769011194224e-08, "logits/chosen": -0.5987046957015991, "logits/rejected": -0.5964124202728271, "logps/chosen": -77.82243347167969, "logps/ref_chosen": -54.262969970703125, "logps/ref_rejected": -100.7542724609375, "logps/rejected": -159.103515625, "loss": 0.444, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14397433400154114, "margin_dpo/beta_margin_grad_std": 0.19858963787555695, "margin_dpo/beta_margin_mean": 3.4789772033691406, "margin_dpo/loss_margin_mean": 34.789772033691406, "margin_dpo/margin_mean": 34.789772033691406, "margin_dpo/margin_std": 29.601974487304688, "step": 576 }, { "epoch": 0.8472834067547724, "grad_norm": 48.130558013916016, "learning_rate": 3.5444241160659304e-08, "logits/chosen": -0.6501774787902832, "logits/rejected": -0.6030783653259277, "logps/chosen": -81.96438598632812, "logps/ref_chosen": -61.909706115722656, "logps/ref_rejected": -84.07069396972656, "logps/rejected": -142.32540893554688, "loss": 0.348, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11993271112442017, "margin_dpo/beta_margin_grad_std": 0.1745702028274536, "margin_dpo/beta_margin_mean": 3.8200042247772217, "margin_dpo/loss_margin_mean": 38.200042724609375, "margin_dpo/margin_mean": 38.200042724609375, "margin_dpo/margin_std": 27.837221145629883, "step": 577 }, { "epoch": 0.8487518355359766, "grad_norm": 56.244651794433594, "learning_rate": 3.478836705390808e-08, "logits/chosen": -0.5622389912605286, "logits/rejected": -0.5450348854064941, "logps/chosen": -76.16979217529297, "logps/ref_chosen": -49.26368713378906, "logps/ref_rejected": -83.43626403808594, "logps/rejected": -145.69003295898438, "loss": 0.3651, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13420046865940094, "margin_dpo/beta_margin_grad_std": 0.1781080663204193, "margin_dpo/beta_margin_mean": 3.5347681045532227, "margin_dpo/loss_margin_mean": 35.347679138183594, "margin_dpo/margin_mean": 35.347679138183594, "margin_dpo/margin_std": 26.89832878112793, "step": 578 }, { "epoch": 0.8502202643171806, "grad_norm": 54.51181411743164, "learning_rate": 3.41381639738331e-08, "logits/chosen": -0.6210640668869019, "logits/rejected": -0.5942162871360779, "logps/chosen": -80.40229797363281, "logps/ref_chosen": -58.88581848144531, "logps/ref_rejected": -94.78762817382812, "logps/rejected": -147.00790405273438, "loss": 0.3713, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14311912655830383, "margin_dpo/beta_margin_grad_std": 0.1770986020565033, "margin_dpo/beta_margin_mean": 3.0703792572021484, "margin_dpo/loss_margin_mean": 30.70379066467285, "margin_dpo/margin_mean": 30.70379066467285, "margin_dpo/margin_std": 23.245943069458008, "step": 579 }, { "epoch": 0.8516886930983847, "grad_norm": 47.365726470947266, "learning_rate": 3.349364905389032e-08, "logits/chosen": -0.6056051254272461, "logits/rejected": -0.5679988265037537, "logps/chosen": -67.93992614746094, "logps/ref_chosen": -48.70684051513672, "logps/ref_rejected": -81.7583999633789, "logps/rejected": -141.37774658203125, "loss": 0.3507, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1173824667930603, "margin_dpo/beta_margin_grad_std": 0.2006831169128418, "margin_dpo/beta_margin_mean": 4.0386271476745605, "margin_dpo/loss_margin_mean": 40.38627243041992, "margin_dpo/margin_mean": 40.38627243041992, "margin_dpo/margin_std": 30.28713607788086, "step": 580 }, { "epoch": 0.8531571218795888, "grad_norm": 52.634037017822266, "learning_rate": 3.285483927764726e-08, "logits/chosen": -0.5740267634391785, "logits/rejected": -0.5509278774261475, "logps/chosen": -83.43389892578125, "logps/ref_chosen": -62.22235107421875, "logps/ref_rejected": -91.73568725585938, "logps/rejected": -144.02926635742188, "loss": 0.4291, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15440425276756287, "margin_dpo/beta_margin_grad_std": 0.203893780708313, "margin_dpo/beta_margin_mean": 3.1082029342651367, "margin_dpo/loss_margin_mean": 31.082029342651367, "margin_dpo/margin_mean": 31.082029342651367, "margin_dpo/margin_std": 24.809860229492188, "step": 581 }, { "epoch": 0.8546255506607929, "grad_norm": 66.31195068359375, "learning_rate": 3.222175147833556e-08, "logits/chosen": -0.605322003364563, "logits/rejected": -0.6055228114128113, "logps/chosen": -77.13755798339844, "logps/ref_chosen": -58.228660583496094, "logps/ref_rejected": -110.06959533691406, "logps/rejected": -164.6455078125, "loss": 0.4097, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14925749599933624, "margin_dpo/beta_margin_grad_std": 0.20233149826526642, "margin_dpo/beta_margin_mean": 3.5667009353637695, "margin_dpo/loss_margin_mean": 35.66700744628906, "margin_dpo/margin_mean": 35.66700744628906, "margin_dpo/margin_std": 28.87442398071289, "step": 582 }, { "epoch": 0.856093979441997, "grad_norm": 63.78881072998047, "learning_rate": 3.159440233840763e-08, "logits/chosen": -0.5751946568489075, "logits/rejected": -0.5564270615577698, "logps/chosen": -81.47348022460938, "logps/ref_chosen": -56.86286163330078, "logps/ref_rejected": -88.4039306640625, "logps/rejected": -142.409423828125, "loss": 0.5765, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18386687338352203, "margin_dpo/beta_margin_grad_std": 0.2354869246482849, "margin_dpo/beta_margin_mean": 2.939487934112549, "margin_dpo/loss_margin_mean": 29.394878387451172, "margin_dpo/margin_mean": 29.394878387451172, "margin_dpo/margin_std": 29.66604995727539, "step": 583 }, { "epoch": 0.8575624082232012, "grad_norm": 40.81183624267578, "learning_rate": 3.0972808389096635e-08, "logits/chosen": -0.6157029271125793, "logits/rejected": -0.5580540299415588, "logps/chosen": -74.85009002685547, "logps/ref_chosen": -56.90068054199219, "logps/ref_rejected": -97.63606262207031, "logps/rejected": -154.90313720703125, "loss": 0.2598, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10160969197750092, "margin_dpo/beta_margin_grad_std": 0.1572841852903366, "margin_dpo/beta_margin_mean": 3.931765556335449, "margin_dpo/loss_margin_mean": 39.317657470703125, "margin_dpo/margin_mean": 39.317657470703125, "margin_dpo/margin_std": 26.282012939453125, "step": 584 }, { "epoch": 0.8590308370044053, "grad_norm": 64.71321105957031, "learning_rate": 3.035698600998121e-08, "logits/chosen": -0.6199055314064026, "logits/rejected": -0.5935189723968506, "logps/chosen": -85.7191162109375, "logps/ref_chosen": -60.973968505859375, "logps/ref_rejected": -84.16952514648438, "logps/rejected": -141.8712921142578, "loss": 0.4802, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16080693900585175, "margin_dpo/beta_margin_grad_std": 0.21611681580543518, "margin_dpo/beta_margin_mean": 3.2956621646881104, "margin_dpo/loss_margin_mean": 32.95662307739258, "margin_dpo/margin_mean": 32.95662307739258, "margin_dpo/margin_std": 28.009883880615234, "step": 585 }, { "epoch": 0.8604992657856094, "grad_norm": 64.60726928710938, "learning_rate": 2.974695142855388e-08, "logits/chosen": -0.5863425731658936, "logits/rejected": -0.5751093626022339, "logps/chosen": -82.08023071289062, "logps/ref_chosen": -56.85559844970703, "logps/ref_rejected": -91.8026123046875, "logps/rejected": -149.82192993164062, "loss": 0.5533, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16516205668449402, "margin_dpo/beta_margin_grad_std": 0.2322179675102234, "margin_dpo/beta_margin_mean": 3.2794694900512695, "margin_dpo/loss_margin_mean": 32.79469299316406, "margin_dpo/margin_mean": 32.79469680786133, "margin_dpo/margin_std": 29.784767150878906, "step": 586 }, { "epoch": 0.8619676945668135, "grad_norm": 47.665504455566406, "learning_rate": 2.9142720719793122e-08, "logits/chosen": -0.6379122734069824, "logits/rejected": -0.6201504468917847, "logps/chosen": -62.888648986816406, "logps/ref_chosen": -44.69159698486328, "logps/ref_rejected": -82.62385559082031, "logps/rejected": -131.4063720703125, "loss": 0.501, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1841679811477661, "margin_dpo/beta_margin_grad_std": 0.21286147832870483, "margin_dpo/beta_margin_mean": 3.058547019958496, "margin_dpo/loss_margin_mean": 30.58547019958496, "margin_dpo/margin_mean": 30.585468292236328, "margin_dpo/margin_std": 27.52269744873047, "step": 587 }, { "epoch": 0.8634361233480177, "grad_norm": 63.907814025878906, "learning_rate": 2.8544309805740018e-08, "logits/chosen": -0.6512797474861145, "logits/rejected": -0.6356316804885864, "logps/chosen": -73.06784057617188, "logps/ref_chosen": -50.294952392578125, "logps/ref_rejected": -107.36988067626953, "logps/rejected": -162.28692626953125, "loss": 0.4775, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16899925470352173, "margin_dpo/beta_margin_grad_std": 0.21543928980827332, "margin_dpo/beta_margin_mean": 3.21441650390625, "margin_dpo/loss_margin_mean": 32.1441650390625, "margin_dpo/margin_mean": 32.1441650390625, "margin_dpo/margin_std": 28.336963653564453, "step": 588 }, { "epoch": 0.8649045521292217, "grad_norm": 42.07124710083008, "learning_rate": 2.7951734455078786e-08, "logits/chosen": -0.6398344039916992, "logits/rejected": -0.6136231422424316, "logps/chosen": -82.24392700195312, "logps/ref_chosen": -59.929908752441406, "logps/ref_rejected": -111.65534973144531, "logps/rejected": -178.29017639160156, "loss": 0.3172, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10654419660568237, "margin_dpo/beta_margin_grad_std": 0.19332361221313477, "margin_dpo/beta_margin_mean": 4.432080268859863, "margin_dpo/loss_margin_mean": 44.32080078125, "margin_dpo/margin_mean": 44.32080078125, "margin_dpo/margin_std": 32.961795806884766, "step": 589 }, { "epoch": 0.8663729809104258, "grad_norm": 38.59159851074219, "learning_rate": 2.736501028272095e-08, "logits/chosen": -0.6100300550460815, "logits/rejected": -0.5847848057746887, "logps/chosen": -77.71539306640625, "logps/ref_chosen": -55.80979537963867, "logps/ref_rejected": -106.06282043457031, "logps/rejected": -166.61837768554688, "loss": 0.2745, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10657128691673279, "margin_dpo/beta_margin_grad_std": 0.1513645052909851, "margin_dpo/beta_margin_mean": 3.8649959564208984, "margin_dpo/loss_margin_mean": 38.64995574951172, "margin_dpo/margin_mean": 38.64995574951172, "margin_dpo/margin_std": 28.00396728515625, "step": 590 }, { "epoch": 0.8678414096916299, "grad_norm": 63.60677719116211, "learning_rate": 2.678415274939408e-08, "logits/chosen": -0.6167633533477783, "logits/rejected": -0.5552696585655212, "logps/chosen": -81.19537353515625, "logps/ref_chosen": -56.24061965942383, "logps/ref_rejected": -83.78629302978516, "logps/rejected": -145.19476318359375, "loss": 0.3925, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12805438041687012, "margin_dpo/beta_margin_grad_std": 0.21181520819664001, "margin_dpo/beta_margin_mean": 3.645371913909912, "margin_dpo/loss_margin_mean": 36.45372009277344, "margin_dpo/margin_mean": 36.45372009277344, "margin_dpo/margin_std": 25.672313690185547, "step": 591 }, { "epoch": 0.869309838472834, "grad_norm": 86.11256408691406, "learning_rate": 2.6209177161234442e-08, "logits/chosen": -0.6086193323135376, "logits/rejected": -0.5861480236053467, "logps/chosen": -73.6089096069336, "logps/ref_chosen": -47.94025421142578, "logps/ref_rejected": -75.73287963867188, "logps/rejected": -137.11573791503906, "loss": 0.5921, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15364328026771545, "margin_dpo/beta_margin_grad_std": 0.24902772903442383, "margin_dpo/beta_margin_mean": 3.571420669555664, "margin_dpo/loss_margin_mean": 35.71420669555664, "margin_dpo/margin_mean": 35.71420669555664, "margin_dpo/margin_std": 28.337505340576172, "step": 592 }, { "epoch": 0.8707782672540382, "grad_norm": 82.06730651855469, "learning_rate": 2.564009866938349e-08, "logits/chosen": -0.5315680503845215, "logits/rejected": -0.5048198699951172, "logps/chosen": -72.25209045410156, "logps/ref_chosen": -48.690757751464844, "logps/ref_rejected": -60.90800476074219, "logps/rejected": -114.44489288330078, "loss": 0.6216, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18850964307785034, "margin_dpo/beta_margin_grad_std": 0.2500463128089905, "margin_dpo/beta_margin_mean": 2.9975552558898926, "margin_dpo/loss_margin_mean": 29.97555160522461, "margin_dpo/margin_mean": 29.97555160522461, "margin_dpo/margin_std": 28.288480758666992, "step": 593 }, { "epoch": 0.8722466960352423, "grad_norm": 60.2679443359375, "learning_rate": 2.5076932269588708e-08, "logits/chosen": -0.6228535175323486, "logits/rejected": -0.5754865407943726, "logps/chosen": -76.21943664550781, "logps/ref_chosen": -54.93488693237305, "logps/ref_rejected": -86.09967041015625, "logps/rejected": -147.49954223632812, "loss": 0.4954, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14612731337547302, "margin_dpo/beta_margin_grad_std": 0.2071676254272461, "margin_dpo/beta_margin_mean": 4.011531829833984, "margin_dpo/loss_margin_mean": 40.11532211303711, "margin_dpo/margin_mean": 40.115318298339844, "margin_dpo/margin_std": 34.15007781982422, "step": 594 }, { "epoch": 0.8737151248164464, "grad_norm": 43.721248626708984, "learning_rate": 2.451969280180849e-08, "logits/chosen": -0.5794812440872192, "logits/rejected": -0.544758141040802, "logps/chosen": -72.4796142578125, "logps/ref_chosen": -49.42041778564453, "logps/ref_rejected": -80.62731170654297, "logps/rejected": -135.92617797851562, "loss": 0.3821, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14666876196861267, "margin_dpo/beta_margin_grad_std": 0.18497151136398315, "margin_dpo/beta_margin_mean": 3.223968029022217, "margin_dpo/loss_margin_mean": 32.23967742919922, "margin_dpo/margin_mean": 32.23967742919922, "margin_dpo/margin_std": 26.86066436767578, "step": 595 }, { "epoch": 0.8751835535976505, "grad_norm": 64.42137908935547, "learning_rate": 2.396839494982103e-08, "logits/chosen": -0.5889699459075928, "logits/rejected": -0.5423535704612732, "logps/chosen": -81.71769714355469, "logps/ref_chosen": -59.791683197021484, "logps/ref_rejected": -80.09111785888672, "logps/rejected": -137.01409912109375, "loss": 0.4742, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16088539361953735, "margin_dpo/beta_margin_grad_std": 0.20914097130298615, "margin_dpo/beta_margin_mean": 3.49969744682312, "margin_dpo/loss_margin_mean": 34.99697494506836, "margin_dpo/margin_mean": 34.99697494506836, "margin_dpo/margin_std": 29.85952377319336, "step": 596 }, { "epoch": 0.8766519823788547, "grad_norm": 58.71807098388672, "learning_rate": 2.3423053240837514e-08, "logits/chosen": -0.5836566686630249, "logits/rejected": -0.579143762588501, "logps/chosen": -79.78302001953125, "logps/ref_chosen": -57.26078796386719, "logps/ref_rejected": -100.6937255859375, "logps/rejected": -158.7677459716797, "loss": 0.5176, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16815660893917084, "margin_dpo/beta_margin_grad_std": 0.23717570304870605, "margin_dpo/beta_margin_mean": 3.555178642272949, "margin_dpo/loss_margin_mean": 35.551788330078125, "margin_dpo/margin_mean": 35.55178451538086, "margin_dpo/margin_std": 31.711952209472656, "step": 597 }, { "epoch": 0.8781204111600588, "grad_norm": 66.43830871582031, "learning_rate": 2.2883682045119062e-08, "logits/chosen": -0.6284604072570801, "logits/rejected": -0.5999557375907898, "logps/chosen": -75.705078125, "logps/ref_chosen": -52.51850509643555, "logps/ref_rejected": -89.44385528564453, "logps/rejected": -145.19064331054688, "loss": 0.4578, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14189483225345612, "margin_dpo/beta_margin_grad_std": 0.1853117048740387, "margin_dpo/beta_margin_mean": 3.256021499633789, "margin_dpo/loss_margin_mean": 32.560211181640625, "margin_dpo/margin_mean": 32.560211181640625, "margin_dpo/margin_std": 24.856882095336914, "step": 598 }, { "epoch": 0.8795888399412628, "grad_norm": 59.694637298583984, "learning_rate": 2.2350295575598367e-08, "logits/chosen": -0.6019773483276367, "logits/rejected": -0.5818980932235718, "logps/chosen": -71.63743591308594, "logps/ref_chosen": -49.802677154541016, "logps/ref_rejected": -82.978515625, "logps/rejected": -137.6220245361328, "loss": 0.4546, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15810009837150574, "margin_dpo/beta_margin_grad_std": 0.22325119376182556, "margin_dpo/beta_margin_mean": 3.2808756828308105, "margin_dpo/loss_margin_mean": 32.80875778198242, "margin_dpo/margin_mean": 32.80875778198242, "margin_dpo/margin_std": 25.88229751586914, "step": 599 }, { "epoch": 0.8810572687224669, "grad_norm": 73.32799530029297, "learning_rate": 2.1822907887504932e-08, "logits/chosen": -0.6534677147865295, "logits/rejected": -0.6272458434104919, "logps/chosen": -88.00627899169922, "logps/ref_chosen": -66.43487548828125, "logps/ref_rejected": -85.45649719238281, "logps/rejected": -137.1350555419922, "loss": 0.4876, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.152080237865448, "margin_dpo/beta_margin_grad_std": 0.21173834800720215, "margin_dpo/beta_margin_mean": 3.0107154846191406, "margin_dpo/loss_margin_mean": 30.107154846191406, "margin_dpo/margin_mean": 30.107154846191406, "margin_dpo/margin_std": 25.551097869873047, "step": 600 }, { "epoch": 0.8810572687224669, "eval_logits/chosen": -0.6269975304603577, "eval_logits/rejected": -0.6013357043266296, "eval_logps/chosen": -105.93721771240234, "eval_logps/ref_chosen": -79.05104064941406, "eval_logps/ref_rejected": -86.79793548583984, "eval_logps/rejected": -135.44046020507812, "eval_loss": 0.4046096205711365, "eval_margin_dpo/beta": 0.10000000149011612, "eval_margin_dpo/beta_margin_grad_mean": -0.25697416067123413, "eval_margin_dpo/beta_margin_grad_std": 0.25375545024871826, "eval_margin_dpo/beta_margin_mean": 2.175632953643799, "eval_margin_dpo/loss_margin_mean": 21.756330490112305, "eval_margin_dpo/margin_mean": 21.756330490112305, "eval_margin_dpo/margin_std": 26.337753295898438, "eval_runtime": 39.8498, "eval_samples_per_second": 58.695, "eval_steps_per_second": 1.857, "step": 600 }, { "epoch": 0.882525697503671, "grad_norm": 83.50579071044922, "learning_rate": 2.1301532877994742e-08, "logits/chosen": -0.6308251619338989, "logits/rejected": -0.6028087139129639, "logps/chosen": -85.07262420654297, "logps/ref_chosen": -59.13360595703125, "logps/ref_rejected": -94.69093322753906, "logps/rejected": -154.66099548339844, "loss": 0.5224, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1595621407032013, "margin_dpo/beta_margin_grad_std": 0.24474212527275085, "margin_dpo/beta_margin_mean": 3.403104543685913, "margin_dpo/loss_margin_mean": 34.031044006347656, "margin_dpo/margin_mean": 34.031044006347656, "margin_dpo/margin_std": 28.710695266723633, "step": 601 }, { "epoch": 0.8839941262848752, "grad_norm": 67.36071014404297, "learning_rate": 2.0786184285784298e-08, "logits/chosen": -0.6085352897644043, "logits/rejected": -0.6077029705047607, "logps/chosen": -66.83834838867188, "logps/ref_chosen": -48.59352111816406, "logps/ref_rejected": -87.6685562133789, "logps/rejected": -143.52706909179688, "loss": 0.3598, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12709340453147888, "margin_dpo/beta_margin_grad_std": 0.19667461514472961, "margin_dpo/beta_margin_mean": 3.7613697052001953, "margin_dpo/loss_margin_mean": 37.61369705200195, "margin_dpo/margin_mean": 37.61369705200195, "margin_dpo/margin_std": 27.582782745361328, "step": 602 }, { "epoch": 0.8854625550660793, "grad_norm": 65.35578918457031, "learning_rate": 2.0276875690788204e-08, "logits/chosen": -0.6445978879928589, "logits/rejected": -0.6060948371887207, "logps/chosen": -90.91526794433594, "logps/ref_chosen": -70.41461944580078, "logps/ref_rejected": -100.32560729980469, "logps/rejected": -153.026611328125, "loss": 0.4681, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16203567385673523, "margin_dpo/beta_margin_grad_std": 0.22206860780715942, "margin_dpo/beta_margin_mean": 3.2200357913970947, "margin_dpo/loss_margin_mean": 32.200355529785156, "margin_dpo/margin_mean": 32.200355529785156, "margin_dpo/margin_std": 26.239582061767578, "step": 603 }, { "epoch": 0.8869309838472834, "grad_norm": 65.85285949707031, "learning_rate": 1.977362051376158e-08, "logits/chosen": -0.6049788594245911, "logits/rejected": -0.5948315858840942, "logps/chosen": -65.44568634033203, "logps/ref_chosen": -46.45808029174805, "logps/ref_rejected": -91.8544921875, "logps/rejected": -146.36270141601562, "loss": 0.4541, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14404089748859406, "margin_dpo/beta_margin_grad_std": 0.20777881145477295, "margin_dpo/beta_margin_mean": 3.552060604095459, "margin_dpo/loss_margin_mean": 35.520606994628906, "margin_dpo/margin_mean": 35.520606994628906, "margin_dpo/margin_std": 29.537954330444336, "step": 604 }, { "epoch": 0.8883994126284875, "grad_norm": 62.26566696166992, "learning_rate": 1.9276432015946446e-08, "logits/chosen": -0.5922667384147644, "logits/rejected": -0.5747475028038025, "logps/chosen": -90.86492919921875, "logps/ref_chosen": -66.24933624267578, "logps/ref_rejected": -102.30496978759766, "logps/rejected": -158.65435791015625, "loss": 0.4596, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1484421193599701, "margin_dpo/beta_margin_grad_std": 0.19956421852111816, "margin_dpo/beta_margin_mean": 3.1733784675598145, "margin_dpo/loss_margin_mean": 31.733783721923828, "margin_dpo/margin_mean": 31.733783721923828, "margin_dpo/margin_std": 29.25701141357422, "step": 605 }, { "epoch": 0.8898678414096917, "grad_norm": 40.07181167602539, "learning_rate": 1.8785323298722093e-08, "logits/chosen": -0.6057391166687012, "logits/rejected": -0.5747348070144653, "logps/chosen": -76.91024780273438, "logps/ref_chosen": -54.819122314453125, "logps/ref_rejected": -98.37147521972656, "logps/rejected": -157.16664123535156, "loss": 0.2922, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11917038261890411, "margin_dpo/beta_margin_grad_std": 0.14764106273651123, "margin_dpo/beta_margin_mean": 3.6704044342041016, "margin_dpo/loss_margin_mean": 36.704044342041016, "margin_dpo/margin_mean": 36.704044342041016, "margin_dpo/margin_std": 25.36406707763672, "step": 606 }, { "epoch": 0.8913362701908958, "grad_norm": 52.19849395751953, "learning_rate": 1.8300307303259904e-08, "logits/chosen": -0.5950828194618225, "logits/rejected": -0.560725212097168, "logps/chosen": -79.23565673828125, "logps/ref_chosen": -58.08403778076172, "logps/ref_rejected": -79.777099609375, "logps/rejected": -133.3597412109375, "loss": 0.3369, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12986359000205994, "margin_dpo/beta_margin_grad_std": 0.17091821134090424, "margin_dpo/beta_margin_mean": 3.243102550506592, "margin_dpo/loss_margin_mean": 32.43102264404297, "margin_dpo/margin_mean": 32.43102264404297, "margin_dpo/margin_std": 23.558351516723633, "step": 607 }, { "epoch": 0.8928046989720999, "grad_norm": 59.17472839355469, "learning_rate": 1.7821396810182437e-08, "logits/chosen": -0.6549203395843506, "logits/rejected": -0.6243371367454529, "logps/chosen": -78.31192016601562, "logps/ref_chosen": -57.450836181640625, "logps/ref_rejected": -94.77339172363281, "logps/rejected": -148.8663330078125, "loss": 0.4835, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15288802981376648, "margin_dpo/beta_margin_grad_std": 0.22536322474479675, "margin_dpo/beta_margin_mean": 3.3231868743896484, "margin_dpo/loss_margin_mean": 33.231868743896484, "margin_dpo/margin_mean": 33.23186492919922, "margin_dpo/margin_std": 26.378620147705078, "step": 608 }, { "epoch": 0.8942731277533039, "grad_norm": 66.138427734375, "learning_rate": 1.7348604439226617e-08, "logits/chosen": -0.6333421468734741, "logits/rejected": -0.5963184833526611, "logps/chosen": -82.09093475341797, "logps/ref_chosen": -58.805355072021484, "logps/ref_rejected": -88.81600952148438, "logps/rejected": -145.73898315429688, "loss": 0.3546, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12897904217243195, "margin_dpo/beta_margin_grad_std": 0.18817874789237976, "margin_dpo/beta_margin_mean": 3.363740921020508, "margin_dpo/loss_margin_mean": 33.63740539550781, "margin_dpo/margin_mean": 33.63740539550781, "margin_dpo/margin_std": 24.00457763671875, "step": 609 }, { "epoch": 0.895741556534508, "grad_norm": 75.1207275390625, "learning_rate": 1.6881942648911074e-08, "logits/chosen": -0.5928279161453247, "logits/rejected": -0.5319284200668335, "logps/chosen": -90.37582397460938, "logps/ref_chosen": -65.69503784179688, "logps/ref_rejected": -83.4053955078125, "logps/rejected": -140.68991088867188, "loss": 0.4574, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1538185179233551, "margin_dpo/beta_margin_grad_std": 0.21573176980018616, "margin_dpo/beta_margin_mean": 3.26037335395813, "margin_dpo/loss_margin_mean": 32.60373306274414, "margin_dpo/margin_mean": 32.60373306274414, "margin_dpo/margin_std": 24.9559326171875, "step": 610 }, { "epoch": 0.8972099853157122, "grad_norm": 48.684600830078125, "learning_rate": 1.6421423736208e-08, "logits/chosen": -0.6442773342132568, "logits/rejected": -0.6083732843399048, "logps/chosen": -74.56732177734375, "logps/ref_chosen": -52.59947204589844, "logps/ref_rejected": -86.33099365234375, "logps/rejected": -144.01742553710938, "loss": 0.3964, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14588133990764618, "margin_dpo/beta_margin_grad_std": 0.19904646277427673, "margin_dpo/beta_margin_mean": 3.5718588829040527, "margin_dpo/loss_margin_mean": 35.718589782714844, "margin_dpo/margin_mean": 35.718589782714844, "margin_dpo/margin_std": 28.007495880126953, "step": 611 }, { "epoch": 0.8986784140969163, "grad_norm": 45.877662658691406, "learning_rate": 1.5967059836219042e-08, "logits/chosen": -0.6410280466079712, "logits/rejected": -0.582598090171814, "logps/chosen": -80.21870422363281, "logps/ref_chosen": -59.32372283935547, "logps/ref_rejected": -88.31239318847656, "logps/rejected": -150.30587768554688, "loss": 0.2722, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10045182704925537, "margin_dpo/beta_margin_grad_std": 0.16613739728927612, "margin_dpo/beta_margin_mean": 4.109850883483887, "margin_dpo/loss_margin_mean": 41.0985107421875, "margin_dpo/margin_mean": 41.0985107421875, "margin_dpo/margin_std": 27.613842010498047, "step": 612 }, { "epoch": 0.9001468428781204, "grad_norm": 50.60771942138672, "learning_rate": 1.551886292185553e-08, "logits/chosen": -0.6397769451141357, "logits/rejected": -0.6355684995651245, "logps/chosen": -80.78131866455078, "logps/ref_chosen": -59.72996520996094, "logps/ref_rejected": -105.10753631591797, "logps/rejected": -161.82345581054688, "loss": 0.3682, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13177156448364258, "margin_dpo/beta_margin_grad_std": 0.19979646801948547, "margin_dpo/beta_margin_mean": 3.566455841064453, "margin_dpo/loss_margin_mean": 35.66455841064453, "margin_dpo/margin_mean": 35.66455841064453, "margin_dpo/margin_std": 27.262413024902344, "step": 613 }, { "epoch": 0.9016152716593245, "grad_norm": 46.63825988769531, "learning_rate": 1.507684480352292e-08, "logits/chosen": -0.58838951587677, "logits/rejected": -0.5823639035224915, "logps/chosen": -76.70652770996094, "logps/ref_chosen": -52.93898010253906, "logps/ref_rejected": -104.67938232421875, "logps/rejected": -164.14959716796875, "loss": 0.3003, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11585717648267746, "margin_dpo/beta_margin_grad_std": 0.16684673726558685, "margin_dpo/beta_margin_mean": 3.57026743888855, "margin_dpo/loss_margin_mean": 35.702674865722656, "margin_dpo/margin_mean": 35.702674865722656, "margin_dpo/margin_std": 25.349502563476562, "step": 614 }, { "epoch": 0.9030837004405287, "grad_norm": 42.50800323486328, "learning_rate": 1.4641017128809801e-08, "logits/chosen": -0.5662412047386169, "logits/rejected": -0.5331077575683594, "logps/chosen": -87.12669372558594, "logps/ref_chosen": -65.81727600097656, "logps/ref_rejected": -95.17749786376953, "logps/rejected": -146.67713928222656, "loss": 0.4066, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1578460931777954, "margin_dpo/beta_margin_grad_std": 0.1820681393146515, "margin_dpo/beta_margin_mean": 3.019021987915039, "margin_dpo/loss_margin_mean": 30.19021987915039, "margin_dpo/margin_mean": 30.19021987915039, "margin_dpo/margin_std": 23.05301284790039, "step": 615 }, { "epoch": 0.9045521292217328, "grad_norm": 77.02394104003906, "learning_rate": 1.4211391382180637e-08, "logits/chosen": -0.5957802534103394, "logits/rejected": -0.5418244004249573, "logps/chosen": -88.68885040283203, "logps/ref_chosen": -65.13285827636719, "logps/ref_rejected": -74.70050048828125, "logps/rejected": -130.87673950195312, "loss": 0.4978, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15978117287158966, "margin_dpo/beta_margin_grad_std": 0.22819003462791443, "margin_dpo/beta_margin_mean": 3.26202392578125, "margin_dpo/loss_margin_mean": 32.6202392578125, "margin_dpo/margin_mean": 32.6202392578125, "margin_dpo/margin_std": 29.516948699951172, "step": 616 }, { "epoch": 0.9060205580029369, "grad_norm": 49.953460693359375, "learning_rate": 1.378797888467345e-08, "logits/chosen": -0.5749341249465942, "logits/rejected": -0.53103107213974, "logps/chosen": -87.75241088867188, "logps/ref_chosen": -63.005550384521484, "logps/ref_rejected": -64.234130859375, "logps/rejected": -118.99295043945312, "loss": 0.3848, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14862868189811707, "margin_dpo/beta_margin_grad_std": 0.17751744389533997, "margin_dpo/beta_margin_mean": 3.0011959075927734, "margin_dpo/loss_margin_mean": 30.011959075927734, "margin_dpo/margin_mean": 30.011959075927734, "margin_dpo/margin_std": 23.59270477294922, "step": 617 }, { "epoch": 0.9074889867841409, "grad_norm": 66.36804962158203, "learning_rate": 1.3370790793601371e-08, "logits/chosen": -0.6147041320800781, "logits/rejected": -0.5852859616279602, "logps/chosen": -90.93468475341797, "logps/ref_chosen": -67.10135650634766, "logps/ref_rejected": -92.15339660644531, "logps/rejected": -146.77523803710938, "loss": 0.4572, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16413499414920807, "margin_dpo/beta_margin_grad_std": 0.19249024987220764, "margin_dpo/beta_margin_mean": 3.0788497924804688, "margin_dpo/loss_margin_mean": 30.78849983215332, "margin_dpo/margin_mean": 30.788501739501953, "margin_dpo/margin_std": 26.1810245513916, "step": 618 }, { "epoch": 0.908957415565345, "grad_norm": 60.24756622314453, "learning_rate": 1.2959838102258535e-08, "logits/chosen": -0.5955780744552612, "logits/rejected": -0.5634878873825073, "logps/chosen": -79.19235229492188, "logps/ref_chosen": -55.978233337402344, "logps/ref_rejected": -93.1854019165039, "logps/rejected": -149.55165100097656, "loss": 0.4659, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1654718816280365, "margin_dpo/beta_margin_grad_std": 0.21678967773914337, "margin_dpo/beta_margin_mean": 3.3152127265930176, "margin_dpo/loss_margin_mean": 33.152130126953125, "margin_dpo/margin_mean": 33.152130126953125, "margin_dpo/margin_std": 29.86014175415039, "step": 619 }, { "epoch": 0.9104258443465492, "grad_norm": 35.56299591064453, "learning_rate": 1.2555131639630567e-08, "logits/chosen": -0.6245037317276001, "logits/rejected": -0.5862281322479248, "logps/chosen": -79.94758605957031, "logps/ref_chosen": -59.79750061035156, "logps/ref_rejected": -78.41075134277344, "logps/rejected": -134.01303100585938, "loss": 0.2607, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1101793646812439, "margin_dpo/beta_margin_grad_std": 0.12655286490917206, "margin_dpo/beta_margin_mean": 3.5452189445495605, "margin_dpo/loss_margin_mean": 35.45219039916992, "margin_dpo/margin_mean": 35.45219039916992, "margin_dpo/margin_std": 25.871028900146484, "step": 620 }, { "epoch": 0.9118942731277533, "grad_norm": 37.061790466308594, "learning_rate": 1.2156682070109086e-08, "logits/chosen": -0.606106698513031, "logits/rejected": -0.5785382986068726, "logps/chosen": -72.80471801757812, "logps/ref_chosen": -53.933753967285156, "logps/ref_rejected": -88.36952209472656, "logps/rejected": -143.35723876953125, "loss": 0.3122, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10982675105333328, "margin_dpo/beta_margin_grad_std": 0.17607754468917847, "margin_dpo/beta_margin_mean": 3.61167573928833, "margin_dpo/loss_margin_mean": 36.11675262451172, "margin_dpo/margin_mean": 36.11675262451172, "margin_dpo/margin_std": 26.981311798095703, "step": 621 }, { "epoch": 0.9133627019089574, "grad_norm": 49.68582534790039, "learning_rate": 1.1764499893210878e-08, "logits/chosen": -0.5509716272354126, "logits/rejected": -0.49232321977615356, "logps/chosen": -82.90745544433594, "logps/ref_chosen": -60.28582000732422, "logps/ref_rejected": -85.51873779296875, "logps/rejected": -144.81259155273438, "loss": 0.3885, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13941837847232819, "margin_dpo/beta_margin_grad_std": 0.1975843608379364, "margin_dpo/beta_margin_mean": 3.6672213077545166, "margin_dpo/loss_margin_mean": 36.672210693359375, "margin_dpo/margin_mean": 36.672210693359375, "margin_dpo/margin_std": 28.6815185546875, "step": 622 }, { "epoch": 0.9148311306901615, "grad_norm": 73.91598510742188, "learning_rate": 1.1378595443300998e-08, "logits/chosen": -0.632436990737915, "logits/rejected": -0.5964562892913818, "logps/chosen": -88.94279479980469, "logps/ref_chosen": -64.15696716308594, "logps/ref_rejected": -85.08304595947266, "logps/rejected": -140.305419921875, "loss": 0.5623, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18874922394752502, "margin_dpo/beta_margin_grad_std": 0.23199373483657837, "margin_dpo/beta_margin_mean": 3.0436534881591797, "margin_dpo/loss_margin_mean": 30.436534881591797, "margin_dpo/margin_mean": 30.436534881591797, "margin_dpo/margin_std": 29.26456069946289, "step": 623 }, { "epoch": 0.9162995594713657, "grad_norm": 69.39524841308594, "learning_rate": 1.0998978889320582e-08, "logits/chosen": -0.6819274425506592, "logits/rejected": -0.6118913888931274, "logps/chosen": -94.78079986572266, "logps/ref_chosen": -71.91862487792969, "logps/ref_rejected": -97.13203430175781, "logps/rejected": -157.30023193359375, "loss": 0.4915, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14777633547782898, "margin_dpo/beta_margin_grad_std": 0.2476031631231308, "margin_dpo/beta_margin_mean": 3.730602741241455, "margin_dpo/loss_margin_mean": 37.30602264404297, "margin_dpo/margin_mean": 37.30602264404297, "margin_dpo/margin_std": 27.589237213134766, "step": 624 }, { "epoch": 0.9177679882525698, "grad_norm": 48.65541458129883, "learning_rate": 1.0625660234518913e-08, "logits/chosen": -0.5835287570953369, "logits/rejected": -0.5436596870422363, "logps/chosen": -81.64682006835938, "logps/ref_chosen": -58.342071533203125, "logps/ref_rejected": -86.09038543701172, "logps/rejected": -145.12460327148438, "loss": 0.3486, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13256171345710754, "margin_dpo/beta_margin_grad_std": 0.18188360333442688, "margin_dpo/beta_margin_mean": 3.5729477405548096, "margin_dpo/loss_margin_mean": 35.72947692871094, "margin_dpo/margin_mean": 35.72947692871094, "margin_dpo/margin_std": 28.54417610168457, "step": 625 }, { "epoch": 0.9192364170337739, "grad_norm": 63.34779739379883, "learning_rate": 1.0258649316189721e-08, "logits/chosen": -0.5459762811660767, "logits/rejected": -0.5098272562026978, "logps/chosen": -99.06941223144531, "logps/ref_chosen": -75.11260986328125, "logps/ref_rejected": -99.18872833251953, "logps/rejected": -153.42007446289062, "loss": 0.5183, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18455414474010468, "margin_dpo/beta_margin_grad_std": 0.21206972002983093, "margin_dpo/beta_margin_mean": 3.0274548530578613, "margin_dpo/loss_margin_mean": 30.274547576904297, "margin_dpo/margin_mean": 30.274547576904297, "margin_dpo/margin_std": 28.38648223876953, "step": 626 }, { "epoch": 0.920704845814978, "grad_norm": 79.35140228271484, "learning_rate": 9.897955805412e-09, "logits/chosen": -0.5999346971511841, "logits/rejected": -0.6070972681045532, "logps/chosen": -69.15809631347656, "logps/ref_chosen": -47.74314880371094, "logps/ref_rejected": -106.75448608398438, "logps/rejected": -162.14773559570312, "loss": 0.6092, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1894698441028595, "margin_dpo/beta_margin_grad_std": 0.2491467446088791, "margin_dpo/beta_margin_mean": 3.397829055786133, "margin_dpo/loss_margin_mean": 33.97829055786133, "margin_dpo/margin_mean": 33.97828674316406, "margin_dpo/margin_std": 34.131160736083984, "step": 627 }, { "epoch": 0.922173274596182, "grad_norm": 40.400997161865234, "learning_rate": 9.543589206795238e-09, "logits/chosen": -0.5975438356399536, "logits/rejected": -0.5776046514511108, "logps/chosen": -82.31864929199219, "logps/ref_chosen": -60.182945251464844, "logps/ref_rejected": -101.55467224121094, "logps/rejected": -159.41123962402344, "loss": 0.298, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1191520020365715, "margin_dpo/beta_margin_grad_std": 0.15398246049880981, "margin_dpo/beta_margin_mean": 3.572086811065674, "margin_dpo/loss_margin_mean": 35.72086715698242, "margin_dpo/margin_mean": 35.72086715698242, "margin_dpo/margin_std": 25.814367294311523, "step": 628 }, { "epoch": 0.9236417033773862, "grad_norm": 62.328609466552734, "learning_rate": 9.19555885822887e-09, "logits/chosen": -0.6410259008407593, "logits/rejected": -0.597222089767456, "logps/chosen": -86.5196533203125, "logps/ref_chosen": -64.21353912353516, "logps/ref_rejected": -91.65367126464844, "logps/rejected": -145.63623046875, "loss": 0.4052, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14122983813285828, "margin_dpo/beta_margin_grad_std": 0.1920766830444336, "margin_dpo/beta_margin_mean": 3.167644500732422, "margin_dpo/loss_margin_mean": 31.67644500732422, "margin_dpo/margin_mean": 31.67644500732422, "margin_dpo/margin_std": 24.94976043701172, "step": 629 }, { "epoch": 0.9251101321585903, "grad_norm": 61.22914505004883, "learning_rate": 8.85387393063622e-09, "logits/chosen": -0.662344217300415, "logits/rejected": -0.6153937578201294, "logps/chosen": -79.80152130126953, "logps/ref_chosen": -59.29100036621094, "logps/ref_rejected": -83.59829711914062, "logps/rejected": -134.3487091064453, "loss": 0.464, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16477835178375244, "margin_dpo/beta_margin_grad_std": 0.2072598934173584, "margin_dpo/beta_margin_mean": 3.023988962173462, "margin_dpo/loss_margin_mean": 30.23988914489746, "margin_dpo/margin_mean": 30.23988914489746, "margin_dpo/margin_std": 25.428054809570312, "step": 630 }, { "epoch": 0.9265785609397944, "grad_norm": 93.05696105957031, "learning_rate": 8.518543427732949e-09, "logits/chosen": -0.6291791200637817, "logits/rejected": -0.586702287197113, "logps/chosen": -84.07586669921875, "logps/ref_chosen": -59.45360565185547, "logps/ref_rejected": -80.95157623291016, "logps/rejected": -133.77267456054688, "loss": 0.7308, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.19751125574111938, "margin_dpo/beta_margin_grad_std": 0.26264500617980957, "margin_dpo/beta_margin_mean": 2.8198840618133545, "margin_dpo/loss_margin_mean": 28.198841094970703, "margin_dpo/margin_mean": 28.198841094970703, "margin_dpo/margin_std": 29.28610610961914, "step": 631 }, { "epoch": 0.9280469897209985, "grad_norm": 86.49762725830078, "learning_rate": 8.189576185789637e-09, "logits/chosen": -0.6317383050918579, "logits/rejected": -0.5975475311279297, "logps/chosen": -85.93399047851562, "logps/ref_chosen": -61.35155487060547, "logps/ref_rejected": -86.16017150878906, "logps/rejected": -143.37826538085938, "loss": 0.7104, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16609230637550354, "margin_dpo/beta_margin_grad_std": 0.26753705739974976, "margin_dpo/beta_margin_mean": 3.263566255569458, "margin_dpo/loss_margin_mean": 32.63566207885742, "margin_dpo/margin_mean": 32.63566207885742, "margin_dpo/margin_std": 29.368499755859375, "step": 632 }, { "epoch": 0.9295154185022027, "grad_norm": 59.74918746948242, "learning_rate": 7.866980873399015e-09, "logits/chosen": -0.6477575898170471, "logits/rejected": -0.6343536376953125, "logps/chosen": -80.77423095703125, "logps/ref_chosen": -57.278167724609375, "logps/ref_rejected": -91.58395385742188, "logps/rejected": -142.47764587402344, "loss": 0.5478, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1920245885848999, "margin_dpo/beta_margin_grad_std": 0.21717044711112976, "margin_dpo/beta_margin_mean": 2.739762783050537, "margin_dpo/loss_margin_mean": 27.397626876831055, "margin_dpo/margin_mean": 27.397626876831055, "margin_dpo/margin_std": 24.44476890563965, "step": 633 }, { "epoch": 0.9309838472834068, "grad_norm": 71.1202392578125, "learning_rate": 7.550765991247654e-09, "logits/chosen": -0.5574454069137573, "logits/rejected": -0.539508581161499, "logps/chosen": -93.28065490722656, "logps/ref_chosen": -66.61896514892578, "logps/ref_rejected": -107.12565612792969, "logps/rejected": -161.80874633789062, "loss": 0.6534, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.20834138989448547, "margin_dpo/beta_margin_grad_std": 0.24892690777778625, "margin_dpo/beta_margin_mean": 2.802140235900879, "margin_dpo/loss_margin_mean": 28.021400451660156, "margin_dpo/margin_mean": 28.021400451660156, "margin_dpo/margin_std": 29.189456939697266, "step": 634 }, { "epoch": 0.9324522760646109, "grad_norm": 49.81635665893555, "learning_rate": 7.240939871891699e-09, "logits/chosen": -0.627153754234314, "logits/rejected": -0.5792471170425415, "logps/chosen": -96.72550201416016, "logps/ref_chosen": -73.95551300048828, "logps/ref_rejected": -82.50045776367188, "logps/rejected": -133.87303161621094, "loss": 0.412, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.155021071434021, "margin_dpo/beta_margin_grad_std": 0.18796978890895844, "margin_dpo/beta_margin_mean": 2.860257387161255, "margin_dpo/loss_margin_mean": 28.60257339477539, "margin_dpo/margin_mean": 28.60257339477539, "margin_dpo/margin_std": 22.55862808227539, "step": 635 }, { "epoch": 0.933920704845815, "grad_norm": 49.317588806152344, "learning_rate": 6.937510679537628e-09, "logits/chosen": -0.5662115812301636, "logits/rejected": -0.5381814241409302, "logps/chosen": -82.45319366455078, "logps/ref_chosen": -59.628910064697266, "logps/ref_rejected": -81.97883605957031, "logps/rejected": -137.88497924804688, "loss": 0.3993, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13900163769721985, "margin_dpo/beta_margin_grad_std": 0.21276052296161652, "margin_dpo/beta_margin_mean": 3.308185577392578, "margin_dpo/loss_margin_mean": 33.08185958862305, "margin_dpo/margin_mean": 33.08185577392578, "margin_dpo/margin_std": 23.71514129638672, "step": 636 }, { "epoch": 0.9353891336270191, "grad_norm": 53.16542434692383, "learning_rate": 6.640486409826785e-09, "logits/chosen": -0.5961561799049377, "logits/rejected": -0.5736096501350403, "logps/chosen": -73.35490417480469, "logps/ref_chosen": -49.652687072753906, "logps/ref_rejected": -98.40513610839844, "logps/rejected": -155.1796112060547, "loss": 0.3585, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1328592598438263, "margin_dpo/beta_margin_grad_std": 0.18699194490909576, "margin_dpo/beta_margin_mean": 3.3072257041931152, "margin_dpo/loss_margin_mean": 33.0722541809082, "margin_dpo/margin_mean": 33.0722541809082, "margin_dpo/margin_std": 25.20583724975586, "step": 637 }, { "epoch": 0.9368575624082232, "grad_norm": 42.64322280883789, "learning_rate": 6.349874889624962e-09, "logits/chosen": -0.5751190185546875, "logits/rejected": -0.5282764434814453, "logps/chosen": -78.66455841064453, "logps/ref_chosen": -58.156646728515625, "logps/ref_rejected": -79.3014907836914, "logps/rejected": -136.99473571777344, "loss": 0.3226, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12046533823013306, "margin_dpo/beta_margin_grad_std": 0.1725090891122818, "margin_dpo/beta_margin_mean": 3.718533515930176, "margin_dpo/loss_margin_mean": 37.185333251953125, "margin_dpo/margin_mean": 37.185333251953125, "margin_dpo/margin_std": 27.281875610351562, "step": 638 }, { "epoch": 0.9383259911894273, "grad_norm": 57.3282470703125, "learning_rate": 6.065683776815933e-09, "logits/chosen": -0.5723918676376343, "logits/rejected": -0.5074343681335449, "logps/chosen": -97.81383514404297, "logps/ref_chosen": -72.32319641113281, "logps/ref_rejected": -74.2749252319336, "logps/rejected": -130.6868896484375, "loss": 0.4425, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14578017592430115, "margin_dpo/beta_margin_grad_std": 0.1988290250301361, "margin_dpo/beta_margin_mean": 3.092132568359375, "margin_dpo/loss_margin_mean": 30.921327590942383, "margin_dpo/margin_mean": 30.921327590942383, "margin_dpo/margin_std": 24.539897918701172, "step": 639 }, { "epoch": 0.9397944199706314, "grad_norm": 45.74781036376953, "learning_rate": 5.7879205600998296e-09, "logits/chosen": -0.5912868976593018, "logits/rejected": -0.5555776357650757, "logps/chosen": -78.59037780761719, "logps/ref_chosen": -56.13436508178711, "logps/ref_rejected": -108.60014343261719, "logps/rejected": -167.8488006591797, "loss": 0.3056, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12402527034282684, "margin_dpo/beta_margin_grad_std": 0.15019166469573975, "margin_dpo/beta_margin_mean": 3.6792640686035156, "margin_dpo/loss_margin_mean": 36.792640686035156, "margin_dpo/margin_mean": 36.792640686035156, "margin_dpo/margin_std": 29.762346267700195, "step": 640 }, { "epoch": 0.9412628487518355, "grad_norm": 51.42761993408203, "learning_rate": 5.516592558795746e-09, "logits/chosen": -0.6235780715942383, "logits/rejected": -0.5653523206710815, "logps/chosen": -88.91046142578125, "logps/ref_chosen": -64.99689483642578, "logps/ref_rejected": -86.99232482910156, "logps/rejected": -142.96499633789062, "loss": 0.3758, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14689452946186066, "margin_dpo/beta_margin_grad_std": 0.16710370779037476, "margin_dpo/beta_margin_mean": 3.2059097290039062, "margin_dpo/loss_margin_mean": 32.05909729003906, "margin_dpo/margin_mean": 32.05909729003906, "margin_dpo/margin_std": 29.564998626708984, "step": 641 }, { "epoch": 0.9427312775330396, "grad_norm": 79.5660629272461, "learning_rate": 5.251706922648868e-09, "logits/chosen": -0.5625093579292297, "logits/rejected": -0.5258715152740479, "logps/chosen": -90.38214111328125, "logps/ref_chosen": -65.68924713134766, "logps/ref_rejected": -110.24205017089844, "logps/rejected": -170.61810302734375, "loss": 0.4846, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15378239750862122, "margin_dpo/beta_margin_grad_std": 0.2159113883972168, "margin_dpo/beta_margin_mean": 3.5683140754699707, "margin_dpo/loss_margin_mean": 35.683143615722656, "margin_dpo/margin_mean": 35.68313980102539, "margin_dpo/margin_std": 30.738298416137695, "step": 642 }, { "epoch": 0.9441997063142438, "grad_norm": 50.391510009765625, "learning_rate": 4.993270631642038e-09, "logits/chosen": -0.6333717107772827, "logits/rejected": -0.6052130460739136, "logps/chosen": -71.46492004394531, "logps/ref_chosen": -51.94999694824219, "logps/ref_rejected": -87.46833801269531, "logps/rejected": -137.68312072753906, "loss": 0.4285, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14480799436569214, "margin_dpo/beta_margin_grad_std": 0.19451884925365448, "margin_dpo/beta_margin_mean": 3.069986581802368, "margin_dpo/loss_margin_mean": 30.699865341186523, "margin_dpo/margin_mean": 30.699865341186523, "margin_dpo/margin_std": 23.991680145263672, "step": 643 }, { "epoch": 0.9456681350954479, "grad_norm": 77.25302124023438, "learning_rate": 4.741290495811873e-09, "logits/chosen": -0.562663197517395, "logits/rejected": -0.5326156616210938, "logps/chosen": -79.98289489746094, "logps/ref_chosen": -59.017662048339844, "logps/ref_rejected": -87.13668823242188, "logps/rejected": -138.28848266601562, "loss": 0.5598, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1840367615222931, "margin_dpo/beta_margin_grad_std": 0.2329137921333313, "margin_dpo/beta_margin_mean": 3.018655776977539, "margin_dpo/loss_margin_mean": 30.18655776977539, "margin_dpo/margin_mean": 30.18655776977539, "margin_dpo/margin_std": 28.526702880859375, "step": 644 }, { "epoch": 0.947136563876652, "grad_norm": 75.70096588134766, "learning_rate": 4.495773155069299e-09, "logits/chosen": -0.5764358043670654, "logits/rejected": -0.5558615922927856, "logps/chosen": -79.81644439697266, "logps/ref_chosen": -55.87602233886719, "logps/ref_rejected": -97.78080749511719, "logps/rejected": -150.7290496826172, "loss": 0.5337, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18957525491714478, "margin_dpo/beta_margin_grad_std": 0.21459272503852844, "margin_dpo/beta_margin_mean": 2.9007816314697266, "margin_dpo/loss_margin_mean": 29.007814407348633, "margin_dpo/margin_mean": 29.007816314697266, "margin_dpo/margin_std": 27.730712890625, "step": 645 }, { "epoch": 0.9486049926578561, "grad_norm": 50.839752197265625, "learning_rate": 4.256725079024553e-09, "logits/chosen": -0.6054178476333618, "logits/rejected": -0.5551047325134277, "logps/chosen": -84.0325927734375, "logps/ref_chosen": -61.275787353515625, "logps/ref_rejected": -77.50580596923828, "logps/rejected": -133.4116668701172, "loss": 0.3167, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11927846819162369, "margin_dpo/beta_margin_grad_std": 0.160105362534523, "margin_dpo/beta_margin_mean": 3.3149056434631348, "margin_dpo/loss_margin_mean": 33.14905548095703, "margin_dpo/margin_mean": 33.14905548095703, "margin_dpo/margin_std": 22.489887237548828, "step": 646 }, { "epoch": 0.9500734214390602, "grad_norm": 81.40752410888672, "learning_rate": 4.024152566816791e-09, "logits/chosen": -0.5593730807304382, "logits/rejected": -0.5357339382171631, "logps/chosen": -78.91641235351562, "logps/ref_chosen": -54.852413177490234, "logps/ref_rejected": -93.5194091796875, "logps/rejected": -150.42044067382812, "loss": 0.4999, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16081605851650238, "margin_dpo/beta_margin_grad_std": 0.2314681112766266, "margin_dpo/beta_margin_mean": 3.2837038040161133, "margin_dpo/loss_margin_mean": 32.837039947509766, "margin_dpo/margin_mean": 32.837039947509766, "margin_dpo/margin_std": 26.819320678710938, "step": 647 }, { "epoch": 0.9515418502202643, "grad_norm": 48.46821212768555, "learning_rate": 3.798061746947995e-09, "logits/chosen": -0.61167973279953, "logits/rejected": -0.6027648448944092, "logps/chosen": -74.04869842529297, "logps/ref_chosen": -54.17146682739258, "logps/ref_rejected": -98.71279907226562, "logps/rejected": -159.05592346191406, "loss": 0.3695, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1377037763595581, "margin_dpo/beta_margin_grad_std": 0.1926315426826477, "margin_dpo/beta_margin_mean": 4.046590328216553, "margin_dpo/loss_margin_mean": 40.465904235839844, "margin_dpo/margin_mean": 40.465904235839844, "margin_dpo/margin_std": 34.20042037963867, "step": 648 }, { "epoch": 0.9530102790014684, "grad_norm": 50.36833572387695, "learning_rate": 3.5784585771215235e-09, "logits/chosen": -0.6533620357513428, "logits/rejected": -0.6218982934951782, "logps/chosen": -83.10621643066406, "logps/ref_chosen": -62.4803466796875, "logps/ref_rejected": -80.07717895507812, "logps/rejected": -129.4200897216797, "loss": 0.5299, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1947104036808014, "margin_dpo/beta_margin_grad_std": 0.2096787691116333, "margin_dpo/beta_margin_mean": 2.871704339981079, "margin_dpo/loss_margin_mean": 28.717042922973633, "margin_dpo/margin_mean": 28.717044830322266, "margin_dpo/margin_std": 28.58915138244629, "step": 649 }, { "epoch": 0.9544787077826725, "grad_norm": 59.41923522949219, "learning_rate": 3.3653488440851253e-09, "logits/chosen": -0.5570046901702881, "logits/rejected": -0.5465147495269775, "logps/chosen": -80.50581359863281, "logps/ref_chosen": -56.09281921386719, "logps/ref_rejected": -98.26483917236328, "logps/rejected": -159.12442016601562, "loss": 0.3573, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1336677372455597, "margin_dpo/beta_margin_grad_std": 0.1835484653711319, "margin_dpo/beta_margin_mean": 3.6446590423583984, "margin_dpo/loss_margin_mean": 36.44658660888672, "margin_dpo/margin_mean": 36.44658660888672, "margin_dpo/margin_std": 28.654094696044922, "step": 650 }, { "epoch": 0.9559471365638766, "grad_norm": 38.3771858215332, "learning_rate": 3.158738163478475e-09, "logits/chosen": -0.607953667640686, "logits/rejected": -0.6089369058609009, "logps/chosen": -63.09129333496094, "logps/ref_chosen": -43.42544937133789, "logps/ref_rejected": -99.9579086303711, "logps/rejected": -155.23358154296875, "loss": 0.3196, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1269427090883255, "margin_dpo/beta_margin_grad_std": 0.16582195460796356, "margin_dpo/beta_margin_mean": 3.560983657836914, "margin_dpo/loss_margin_mean": 35.609832763671875, "margin_dpo/margin_mean": 35.609832763671875, "margin_dpo/margin_std": 27.03875732421875, "step": 651 }, { "epoch": 0.9574155653450808, "grad_norm": 42.76301956176758, "learning_rate": 2.9586319796851555e-09, "logits/chosen": -0.641417384147644, "logits/rejected": -0.6177515983581543, "logps/chosen": -78.98847961425781, "logps/ref_chosen": -62.57680892944336, "logps/ref_rejected": -111.76779174804688, "logps/rejected": -163.6732177734375, "loss": 0.3394, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1356583833694458, "margin_dpo/beta_margin_grad_std": 0.16708874702453613, "margin_dpo/beta_margin_mean": 3.5493762493133545, "margin_dpo/loss_margin_mean": 35.49375915527344, "margin_dpo/margin_mean": 35.49375915527344, "margin_dpo/margin_std": 28.135786056518555, "step": 652 }, { "epoch": 0.9588839941262849, "grad_norm": 52.41037368774414, "learning_rate": 2.7650355656892166e-09, "logits/chosen": -0.6357418298721313, "logits/rejected": -0.6146754026412964, "logps/chosen": -84.56446838378906, "logps/ref_chosen": -61.11295700073242, "logps/ref_rejected": -103.24960327148438, "logps/rejected": -162.56381225585938, "loss": 0.3193, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11913929879665375, "margin_dpo/beta_margin_grad_std": 0.17990511655807495, "margin_dpo/beta_margin_mean": 3.5862698554992676, "margin_dpo/loss_margin_mean": 35.86269760131836, "margin_dpo/margin_mean": 35.862701416015625, "margin_dpo/margin_std": 25.911727905273438, "step": 653 }, { "epoch": 0.960352422907489, "grad_norm": 70.10198974609375, "learning_rate": 2.577954022936174e-09, "logits/chosen": -0.6373894810676575, "logits/rejected": -0.6333979368209839, "logps/chosen": -87.05609130859375, "logps/ref_chosen": -61.7281379699707, "logps/ref_rejected": -98.7738037109375, "logps/rejected": -153.52032470703125, "loss": 0.5245, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17562022805213928, "margin_dpo/beta_margin_grad_std": 0.2261282503604889, "margin_dpo/beta_margin_mean": 2.9418554306030273, "margin_dpo/loss_margin_mean": 29.418556213378906, "margin_dpo/margin_mean": 29.418556213378906, "margin_dpo/margin_std": 28.580772399902344, "step": 654 }, { "epoch": 0.9618208516886931, "grad_norm": 69.47087860107422, "learning_rate": 2.397392281198729e-09, "logits/chosen": -0.6162744760513306, "logits/rejected": -0.6169338226318359, "logps/chosen": -71.07903289794922, "logps/ref_chosen": -49.576812744140625, "logps/ref_rejected": -98.29183197021484, "logps/rejected": -150.40371704101562, "loss": 0.5102, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.18301187455654144, "margin_dpo/beta_margin_grad_std": 0.21452929079532623, "margin_dpo/beta_margin_mean": 3.060966968536377, "margin_dpo/loss_margin_mean": 30.609668731689453, "margin_dpo/margin_mean": 30.609668731689453, "margin_dpo/margin_std": 29.15388298034668, "step": 655 }, { "epoch": 0.9632892804698973, "grad_norm": 40.41490936279297, "learning_rate": 2.223355098446622e-09, "logits/chosen": -0.5592731237411499, "logits/rejected": -0.5631238222122192, "logps/chosen": -73.43663024902344, "logps/ref_chosen": -52.54943084716797, "logps/ref_rejected": -113.67464447021484, "logps/rejected": -176.38644409179688, "loss": 0.2391, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.0935378447175026, "margin_dpo/beta_margin_grad_std": 0.15634706616401672, "margin_dpo/beta_margin_mean": 4.182460784912109, "margin_dpo/loss_margin_mean": 41.824607849121094, "margin_dpo/margin_mean": 41.824607849121094, "margin_dpo/margin_std": 25.468910217285156, "step": 656 }, { "epoch": 0.9647577092511013, "grad_norm": 45.6422233581543, "learning_rate": 2.055847060721566e-09, "logits/chosen": -0.5981370210647583, "logits/rejected": -0.5761264562606812, "logps/chosen": -68.69126892089844, "logps/ref_chosen": -46.700538635253906, "logps/ref_rejected": -97.91487121582031, "logps/rejected": -157.28271484375, "loss": 0.3403, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11615358293056488, "margin_dpo/beta_margin_grad_std": 0.175856813788414, "margin_dpo/beta_margin_mean": 3.737710952758789, "margin_dpo/loss_margin_mean": 37.377105712890625, "margin_dpo/margin_mean": 37.377105712890625, "margin_dpo/margin_std": 28.65097427368164, "step": 657 }, { "epoch": 0.9662261380323054, "grad_norm": 57.90339660644531, "learning_rate": 1.8948725820160662e-09, "logits/chosen": -0.6264636516571045, "logits/rejected": -0.5879380702972412, "logps/chosen": -86.60824584960938, "logps/ref_chosen": -60.958213806152344, "logps/ref_rejected": -95.93949127197266, "logps/rejected": -156.69830322265625, "loss": 0.4473, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1491597294807434, "margin_dpo/beta_margin_grad_std": 0.21673035621643066, "margin_dpo/beta_margin_mean": 3.5108790397644043, "margin_dpo/loss_margin_mean": 35.10879135131836, "margin_dpo/margin_mean": 35.108787536621094, "margin_dpo/margin_std": 29.642911911010742, "step": 658 }, { "epoch": 0.9676945668135095, "grad_norm": 56.96932601928711, "learning_rate": 1.7404359041573723e-09, "logits/chosen": -0.5769931077957153, "logits/rejected": -0.5069276690483093, "logps/chosen": -96.17684936523438, "logps/ref_chosen": -76.74298095703125, "logps/ref_rejected": -87.4709701538086, "logps/rejected": -141.29806518554688, "loss": 0.4928, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16625207662582397, "margin_dpo/beta_margin_grad_std": 0.23161795735359192, "margin_dpo/beta_margin_mean": 3.4393229484558105, "margin_dpo/loss_margin_mean": 34.39323043823242, "margin_dpo/margin_mean": 34.39323043823242, "margin_dpo/margin_std": 29.151775360107422, "step": 659 }, { "epoch": 0.9691629955947136, "grad_norm": 51.775634765625, "learning_rate": 1.592541096695571e-09, "logits/chosen": -0.6120225191116333, "logits/rejected": -0.5639553070068359, "logps/chosen": -80.46331787109375, "logps/ref_chosen": -59.047882080078125, "logps/ref_rejected": -75.96005249023438, "logps/rejected": -135.1595458984375, "loss": 0.2938, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11358514428138733, "margin_dpo/beta_margin_grad_std": 0.17192693054676056, "margin_dpo/beta_margin_mean": 3.778407096862793, "margin_dpo/loss_margin_mean": 37.7840690612793, "margin_dpo/margin_mean": 37.7840690612793, "margin_dpo/margin_std": 27.653093338012695, "step": 660 }, { "epoch": 0.9706314243759178, "grad_norm": 64.97528839111328, "learning_rate": 1.4511920567963908e-09, "logits/chosen": -0.5847969055175781, "logits/rejected": -0.5379676818847656, "logps/chosen": -71.49467468261719, "logps/ref_chosen": -50.673973083496094, "logps/ref_rejected": -86.00569152832031, "logps/rejected": -141.72848510742188, "loss": 0.4565, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.14623390138149261, "margin_dpo/beta_margin_grad_std": 0.21268951892852783, "margin_dpo/beta_margin_mean": 3.4902100563049316, "margin_dpo/loss_margin_mean": 34.902099609375, "margin_dpo/margin_mean": 34.902099609375, "margin_dpo/margin_std": 29.28716278076172, "step": 661 }, { "epoch": 0.9720998531571219, "grad_norm": 51.50436782836914, "learning_rate": 1.3163925091384532e-09, "logits/chosen": -0.6173335313796997, "logits/rejected": -0.5659915208816528, "logps/chosen": -93.51567077636719, "logps/ref_chosen": -69.26106262207031, "logps/ref_rejected": -89.05593872070312, "logps/rejected": -144.10789489746094, "loss": 0.3806, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1421985775232315, "margin_dpo/beta_margin_grad_std": 0.1747194081544876, "margin_dpo/beta_margin_mean": 3.0797348022460938, "margin_dpo/loss_margin_mean": 30.797348022460938, "margin_dpo/margin_mean": 30.797348022460938, "margin_dpo/margin_std": 25.398778915405273, "step": 662 }, { "epoch": 0.973568281938326, "grad_norm": 38.44890594482422, "learning_rate": 1.1881460058152382e-09, "logits/chosen": -0.647502064704895, "logits/rejected": -0.6262093782424927, "logps/chosen": -83.27813720703125, "logps/ref_chosen": -64.87891387939453, "logps/ref_rejected": -113.92536926269531, "logps/rejected": -165.39273071289062, "loss": 0.3283, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12441954016685486, "margin_dpo/beta_margin_grad_std": 0.1589188277721405, "margin_dpo/beta_margin_mean": 3.306814670562744, "margin_dpo/loss_margin_mean": 33.068145751953125, "margin_dpo/margin_mean": 33.068145751953125, "margin_dpo/margin_std": 24.52547264099121, "step": 663 }, { "epoch": 0.9750367107195301, "grad_norm": 69.16484832763672, "learning_rate": 1.066455926241383e-09, "logits/chosen": -0.5968215465545654, "logits/rejected": -0.5680118799209595, "logps/chosen": -84.52749633789062, "logps/ref_chosen": -60.88847351074219, "logps/ref_rejected": -105.521728515625, "logps/rejected": -166.35784912109375, "loss": 0.426, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11887051165103912, "margin_dpo/beta_margin_grad_std": 0.19583344459533691, "margin_dpo/beta_margin_mean": 3.7197093963623047, "margin_dpo/loss_margin_mean": 37.19709014892578, "margin_dpo/margin_mean": 37.19709014892578, "margin_dpo/margin_std": 27.019386291503906, "step": 664 }, { "epoch": 0.9765051395007343, "grad_norm": 42.6618537902832, "learning_rate": 9.513254770636137e-10, "logits/chosen": -0.6599475145339966, "logits/rejected": -0.618366003036499, "logps/chosen": -81.63275146484375, "logps/ref_chosen": -60.56413269042969, "logps/ref_rejected": -84.8088150024414, "logps/rejected": -137.7115478515625, "loss": 0.3485, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13592152297496796, "margin_dpo/beta_margin_grad_std": 0.1685512661933899, "margin_dpo/beta_margin_mean": 3.183411121368408, "margin_dpo/loss_margin_mean": 31.834110260009766, "margin_dpo/margin_mean": 31.834110260009766, "margin_dpo/margin_std": 23.29065704345703, "step": 665 }, { "epoch": 0.9779735682819384, "grad_norm": 60.117515563964844, "learning_rate": 8.427576920763956e-10, "logits/chosen": -0.5868717432022095, "logits/rejected": -0.5477631688117981, "logps/chosen": -88.28842163085938, "logps/ref_chosen": -64.41996002197266, "logps/ref_rejected": -95.89163208007812, "logps/rejected": -154.99008178710938, "loss": 0.4224, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1320444643497467, "margin_dpo/beta_margin_grad_std": 0.18931497633457184, "margin_dpo/beta_margin_mean": 3.522998809814453, "margin_dpo/loss_margin_mean": 35.22998809814453, "margin_dpo/margin_mean": 35.22998809814453, "margin_dpo/margin_std": 25.960124969482422, "step": 666 }, { "epoch": 0.9794419970631424, "grad_norm": 56.65580749511719, "learning_rate": 7.407554321417764e-10, "logits/chosen": -0.6088787317276001, "logits/rejected": -0.5590524673461914, "logps/chosen": -94.58509063720703, "logps/ref_chosen": -69.27703094482422, "logps/ref_rejected": -87.83549499511719, "logps/rejected": -147.4503173828125, "loss": 0.3265, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12345196306705475, "margin_dpo/beta_margin_grad_std": 0.16755497455596924, "margin_dpo/beta_margin_mean": 3.4306764602661133, "margin_dpo/loss_margin_mean": 34.3067626953125, "margin_dpo/margin_mean": 34.3067626953125, "margin_dpo/margin_std": 24.47415542602539, "step": 667 }, { "epoch": 0.9809104258443465, "grad_norm": 70.59870910644531, "learning_rate": 6.453213851142225e-10, "logits/chosen": -0.6269364356994629, "logits/rejected": -0.5885031819343567, "logps/chosen": -96.2440185546875, "logps/ref_chosen": -72.60400390625, "logps/ref_rejected": -103.73905181884766, "logps/rejected": -160.45053100585938, "loss": 0.4488, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1537022888660431, "margin_dpo/beta_margin_grad_std": 0.21864807605743408, "margin_dpo/beta_margin_mean": 3.3071470260620117, "margin_dpo/loss_margin_mean": 33.07147216796875, "margin_dpo/margin_mean": 33.07147216796875, "margin_dpo/margin_std": 25.884002685546875, "step": 668 }, { "epoch": 0.9823788546255506, "grad_norm": 68.7284927368164, "learning_rate": 5.564580657695939e-10, "logits/chosen": -0.6374561786651611, "logits/rejected": -0.5934668183326721, "logps/chosen": -65.82669067382812, "logps/ref_chosen": -46.116416931152344, "logps/ref_rejected": -77.92434692382812, "logps/rejected": -135.83099365234375, "loss": 0.5109, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.15599049627780914, "margin_dpo/beta_margin_grad_std": 0.2399427890777588, "margin_dpo/beta_margin_mean": 3.8196370601654053, "margin_dpo/loss_margin_mean": 38.19636917114258, "margin_dpo/margin_mean": 38.19636917114258, "margin_dpo/margin_std": 32.66187286376953, "step": 669 }, { "epoch": 0.9838472834067548, "grad_norm": 44.16929626464844, "learning_rate": 4.741678157389739e-10, "logits/chosen": -0.6017849445343018, "logits/rejected": -0.5695161819458008, "logps/chosen": -83.34823608398438, "logps/ref_chosen": -62.34575653076172, "logps/ref_rejected": -96.9405517578125, "logps/rejected": -156.87435913085938, "loss": 0.2739, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.10903677344322205, "margin_dpo/beta_margin_grad_std": 0.15663883090019226, "margin_dpo/beta_margin_mean": 3.8931329250335693, "margin_dpo/loss_margin_mean": 38.93132781982422, "margin_dpo/margin_mean": 38.93132781982422, "margin_dpo/margin_std": 25.873010635375977, "step": 670 }, { "epoch": 0.9853157121879589, "grad_norm": 55.043914794921875, "learning_rate": 3.9845280344705245e-10, "logits/chosen": -0.6250673532485962, "logits/rejected": -0.5933674573898315, "logps/chosen": -72.50352478027344, "logps/ref_chosen": -48.00010681152344, "logps/ref_rejected": -83.81932067871094, "logps/rejected": -143.89370727539062, "loss": 0.3551, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13739252090454102, "margin_dpo/beta_margin_grad_std": 0.1742577999830246, "margin_dpo/beta_margin_mean": 3.5570971965789795, "margin_dpo/loss_margin_mean": 35.57096862792969, "margin_dpo/margin_mean": 35.57096862792969, "margin_dpo/margin_std": 28.424989700317383, "step": 671 }, { "epoch": 0.986784140969163, "grad_norm": 66.3819351196289, "learning_rate": 3.293150240547549e-10, "logits/chosen": -0.6075701117515564, "logits/rejected": -0.5688859820365906, "logps/chosen": -82.87799072265625, "logps/ref_chosen": -58.583290100097656, "logps/ref_rejected": -93.14014434814453, "logps/rejected": -149.91152954101562, "loss": 0.479, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.17206090688705444, "margin_dpo/beta_margin_grad_std": 0.21605950593948364, "margin_dpo/beta_margin_mean": 3.2476677894592285, "margin_dpo/loss_margin_mean": 32.47667694091797, "margin_dpo/margin_mean": 32.47667694091797, "margin_dpo/margin_std": 29.792341232299805, "step": 672 }, { "epoch": 0.9882525697503671, "grad_norm": 41.944400787353516, "learning_rate": 2.6675629940689504e-10, "logits/chosen": -0.6093118786811829, "logits/rejected": -0.5795783996582031, "logps/chosen": -68.00104522705078, "logps/ref_chosen": -46.72320556640625, "logps/ref_rejected": -85.29623413085938, "logps/rejected": -143.71388244628906, "loss": 0.3057, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.12082840502262115, "margin_dpo/beta_margin_grad_std": 0.16586080193519592, "margin_dpo/beta_margin_mean": 3.7139804363250732, "margin_dpo/loss_margin_mean": 37.139801025390625, "margin_dpo/margin_mean": 37.13980484008789, "margin_dpo/margin_std": 27.277408599853516, "step": 673 }, { "epoch": 0.9897209985315712, "grad_norm": 37.851444244384766, "learning_rate": 2.1077827798404725e-10, "logits/chosen": -0.56818687915802, "logits/rejected": -0.5401608943939209, "logps/chosen": -67.67996215820312, "logps/ref_chosen": -45.445526123046875, "logps/ref_rejected": -70.04593658447266, "logps/rejected": -130.27374267578125, "loss": 0.2841, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11634500324726105, "margin_dpo/beta_margin_grad_std": 0.15289074182510376, "margin_dpo/beta_margin_mean": 3.799337387084961, "margin_dpo/loss_margin_mean": 37.993370056152344, "margin_dpo/margin_mean": 37.993370056152344, "margin_dpo/margin_std": 28.400920867919922, "step": 674 }, { "epoch": 0.9911894273127754, "grad_norm": 61.00739669799805, "learning_rate": 1.6138243485910863e-10, "logits/chosen": -0.5657342672348022, "logits/rejected": -0.5385361909866333, "logps/chosen": -65.09626770019531, "logps/ref_chosen": -44.17628479003906, "logps/ref_rejected": -74.09197998046875, "logps/rejected": -134.56707763671875, "loss": 0.3851, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.11753740161657333, "margin_dpo/beta_margin_grad_std": 0.20709985494613647, "margin_dpo/beta_margin_mean": 3.9555118083953857, "margin_dpo/loss_margin_mean": 39.555118560791016, "margin_dpo/margin_mean": 39.555118560791016, "margin_dpo/margin_std": 27.490642547607422, "step": 675 }, { "epoch": 0.9926578560939795, "grad_norm": 76.93489837646484, "learning_rate": 1.1857007165852472e-10, "logits/chosen": -0.639100968837738, "logits/rejected": -0.6050753593444824, "logps/chosen": -96.79466247558594, "logps/ref_chosen": -71.39852142333984, "logps/ref_rejected": -88.3587646484375, "logps/rejected": -150.10198974609375, "loss": 0.4104, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13633592426776886, "margin_dpo/beta_margin_grad_std": 0.18744832277297974, "margin_dpo/beta_margin_mean": 3.6347103118896484, "margin_dpo/loss_margin_mean": 36.34709930419922, "margin_dpo/margin_mean": 36.34709930419922, "margin_dpo/margin_std": 28.50853729248047, "step": 676 }, { "epoch": 0.9941262848751835, "grad_norm": 66.0276107788086, "learning_rate": 8.23423165278725e-11, "logits/chosen": -0.6146172285079956, "logits/rejected": -0.565468966960907, "logps/chosen": -79.82300567626953, "logps/ref_chosen": -56.52743911743164, "logps/ref_rejected": -78.22654724121094, "logps/rejected": -138.8525390625, "loss": 0.4429, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1386735886335373, "margin_dpo/beta_margin_grad_std": 0.21394288539886475, "margin_dpo/beta_margin_mean": 3.733041286468506, "margin_dpo/loss_margin_mean": 37.330413818359375, "margin_dpo/margin_mean": 37.330413818359375, "margin_dpo/margin_std": 28.455005645751953, "step": 677 }, { "epoch": 0.9955947136563876, "grad_norm": 50.60527801513672, "learning_rate": 5.270012410216185e-11, "logits/chosen": -0.5987369418144226, "logits/rejected": -0.5752243995666504, "logps/chosen": -68.12297058105469, "logps/ref_chosen": -46.13447570800781, "logps/ref_rejected": -80.60462951660156, "logps/rejected": -139.23318481445312, "loss": 0.4505, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.1656286120414734, "margin_dpo/beta_margin_grad_std": 0.21251779794692993, "margin_dpo/beta_margin_mean": 3.6640052795410156, "margin_dpo/loss_margin_mean": 36.640052795410156, "margin_dpo/margin_mean": 36.640052795410156, "margin_dpo/margin_std": 31.15386390686035, "step": 678 }, { "epoch": 0.9970631424375918, "grad_norm": 47.600643157958984, "learning_rate": 2.9644275480772416e-11, "logits/chosen": -0.6062077283859253, "logits/rejected": -0.5733453035354614, "logps/chosen": -72.9251937866211, "logps/ref_chosen": -50.294921875, "logps/ref_rejected": -76.59813690185547, "logps/rejected": -136.07611083984375, "loss": 0.3291, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.112078458070755, "margin_dpo/beta_margin_grad_std": 0.17374977469444275, "margin_dpo/beta_margin_mean": 3.6847691535949707, "margin_dpo/loss_margin_mean": 36.84769058227539, "margin_dpo/margin_mean": 36.84769058227539, "margin_dpo/margin_std": 26.915252685546875, "step": 679 }, { "epoch": 0.9985315712187959, "grad_norm": 55.98041915893555, "learning_rate": 1.31753782067201e-11, "logits/chosen": -0.6170350313186646, "logits/rejected": -0.5841037034988403, "logps/chosen": -99.70010375976562, "logps/ref_chosen": -76.91569519042969, "logps/ref_rejected": -112.384765625, "logps/rejected": -171.31031799316406, "loss": 0.3861, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.13496683537960052, "margin_dpo/beta_margin_grad_std": 0.1988377571105957, "margin_dpo/beta_margin_mean": 3.6141152381896973, "margin_dpo/loss_margin_mean": 36.141151428222656, "margin_dpo/margin_mean": 36.141151428222656, "margin_dpo/margin_std": 29.207448959350586, "step": 680 }, { "epoch": 1.0, "grad_norm": 52.392547607421875, "learning_rate": 3.2938662507808745e-12, "logits/chosen": -0.6597200632095337, "logits/rejected": -0.6327718496322632, "logps/chosen": -84.40997314453125, "logps/ref_chosen": -60.957279205322266, "logps/ref_rejected": -88.5579833984375, "logps/rejected": -143.79640197753906, "loss": 0.4602, "margin_dpo/beta": 0.10000000149011612, "margin_dpo/beta_margin_grad_mean": -0.16065430641174316, "margin_dpo/beta_margin_grad_std": 0.20822513103485107, "margin_dpo/beta_margin_mean": 3.1785736083984375, "margin_dpo/loss_margin_mean": 31.785736083984375, "margin_dpo/margin_mean": 31.785736083984375, "margin_dpo/margin_std": 28.091190338134766, "step": 681 }, { "epoch": 1.0, "step": 681, "total_flos": 0.0, "train_loss": 0.572698849610295, "train_runtime": 1998.3785, "train_samples_per_second": 21.817, "train_steps_per_second": 0.341 } ], "logging_steps": 1, "max_steps": 681, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }