Files
llama-3-8b-base-margin-dpo-…/trainer_state.json
ModelHub XC e2d7fad87f 初始化项目,由ModelHub XC社区提供模型
Model: jackf857/llama-3-8b-base-margin-dpo-hh-helpful-batch-64
Source: Original Platform
2026-05-10 12:37:20 +08:00

13790 lines
558 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 681,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014684287812041115,
"grad_norm": 83.52447509765625,
"learning_rate": 0.0,
"logits/chosen": -0.4974287748336792,
"logits/rejected": -0.43299180269241333,
"logps/chosen": -50.1435661315918,
"logps/ref_chosen": -50.14883804321289,
"logps/ref_rejected": -74.1280517578125,
"logps/rejected": -74.09991455078125,
"loss": 1.389,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5005706548690796,
"margin_dpo/beta_margin_grad_std": 0.0104739461094141,
"margin_dpo/beta_margin_mean": -0.0022870064713060856,
"margin_dpo/loss_margin_mean": -0.02287006378173828,
"margin_dpo/margin_mean": -0.02287048101425171,
"margin_dpo/margin_std": 0.41920793056488037,
"step": 1
},
{
"epoch": 0.002936857562408223,
"grad_norm": 72.19432830810547,
"learning_rate": 7.246376811594203e-09,
"logits/chosen": -0.4953641891479492,
"logits/rejected": -0.4594460129737854,
"logps/chosen": -52.65569305419922,
"logps/ref_chosen": -52.620704650878906,
"logps/ref_rejected": -75.30413818359375,
"logps/rejected": -75.27340698242188,
"loss": 1.3932,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5016425848007202,
"margin_dpo/beta_margin_grad_std": 0.008758805692195892,
"margin_dpo/beta_margin_mean": -0.006572261452674866,
"margin_dpo/loss_margin_mean": -0.06572261452674866,
"margin_dpo/margin_mean": -0.06572240591049194,
"margin_dpo/margin_std": 0.35048407316207886,
"step": 2
},
{
"epoch": 0.004405286343612335,
"grad_norm": 70.83383178710938,
"learning_rate": 1.4492753623188406e-08,
"logits/chosen": -0.48161470890045166,
"logits/rejected": -0.44217246770858765,
"logps/chosen": -60.95429611206055,
"logps/ref_chosen": -60.98159408569336,
"logps/ref_rejected": -68.67259216308594,
"logps/rejected": -68.64839935302734,
"loss": 1.3863,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49992311000823975,
"margin_dpo/beta_margin_grad_std": 0.008581075817346573,
"margin_dpo/beta_margin_mean": 0.0003100454923696816,
"margin_dpo/loss_margin_mean": 0.003100454807281494,
"margin_dpo/margin_mean": 0.003100961446762085,
"margin_dpo/margin_std": 0.3433571755886078,
"step": 3
},
{
"epoch": 0.005873715124816446,
"grad_norm": 72.25827026367188,
"learning_rate": 2.1739130434782606e-08,
"logits/chosen": -0.46887677907943726,
"logits/rejected": -0.44121015071868896,
"logps/chosen": -56.833404541015625,
"logps/ref_chosen": -56.76771545410156,
"logps/ref_rejected": -86.64710998535156,
"logps/rejected": -86.60629272460938,
"loss": 1.3973,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5026620626449585,
"margin_dpo/beta_margin_grad_std": 0.008479107171297073,
"margin_dpo/beta_margin_mean": -0.010650942102074623,
"margin_dpo/loss_margin_mean": -0.10650941729545593,
"margin_dpo/margin_mean": -0.10650989413261414,
"margin_dpo/margin_std": 0.33926206827163696,
"step": 4
},
{
"epoch": 0.007342143906020558,
"grad_norm": 89.21666717529297,
"learning_rate": 2.898550724637681e-08,
"logits/chosen": -0.5145087242126465,
"logits/rejected": -0.4707593023777008,
"logps/chosen": -53.772743225097656,
"logps/ref_chosen": -53.859375,
"logps/ref_rejected": -84.14918518066406,
"logps/rejected": -84.13954162597656,
"loss": 1.3789,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49807578325271606,
"margin_dpo/beta_margin_grad_std": 0.008384998887777328,
"margin_dpo/beta_margin_mean": 0.007699114270508289,
"margin_dpo/loss_margin_mean": 0.07699114084243774,
"margin_dpo/margin_mean": 0.07699081301689148,
"margin_dpo/margin_std": 0.3355046510696411,
"step": 5
},
{
"epoch": 0.00881057268722467,
"grad_norm": 92.13448333740234,
"learning_rate": 3.6231884057971014e-08,
"logits/chosen": -0.5163406729698181,
"logits/rejected": -0.475068598985672,
"logps/chosen": -63.05199432373047,
"logps/ref_chosen": -63.007484436035156,
"logps/ref_rejected": -92.64534759521484,
"logps/rejected": -92.68731689453125,
"loss": 1.3869,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5000631809234619,
"margin_dpo/beta_margin_grad_std": 0.008657192811369896,
"margin_dpo/beta_margin_mean": -0.00025360879953950644,
"margin_dpo/loss_margin_mean": -0.002536088228225708,
"margin_dpo/margin_mean": -0.002536386251449585,
"margin_dpo/margin_std": 0.3463857173919678,
"step": 6
},
{
"epoch": 0.010279001468428781,
"grad_norm": 82.59510803222656,
"learning_rate": 4.347826086956521e-08,
"logits/chosen": -0.5038071274757385,
"logits/rejected": -0.46995049715042114,
"logps/chosen": -57.764461517333984,
"logps/ref_chosen": -57.774818420410156,
"logps/ref_rejected": -103.92059326171875,
"logps/rejected": -103.89596557617188,
"loss": 1.3881,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5003570914268494,
"margin_dpo/beta_margin_grad_std": 0.009314555674791336,
"margin_dpo/beta_margin_mean": -0.0014270306564867496,
"margin_dpo/loss_margin_mean": -0.014270305633544922,
"margin_dpo/margin_mean": -0.014270126819610596,
"margin_dpo/margin_std": 0.37269771099090576,
"step": 7
},
{
"epoch": 0.011747430249632892,
"grad_norm": 78.55260467529297,
"learning_rate": 5.0724637681159424e-08,
"logits/chosen": -0.5125592350959778,
"logits/rejected": -0.48697221279144287,
"logps/chosen": -58.67088317871094,
"logps/ref_chosen": -58.716033935546875,
"logps/ref_rejected": -79.3114242553711,
"logps/rejected": -79.30046081542969,
"loss": 1.3832,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4991455674171448,
"margin_dpo/beta_margin_grad_std": 0.008225222118198872,
"margin_dpo/beta_margin_mean": 0.0034186365082859993,
"margin_dpo/loss_margin_mean": 0.034186363220214844,
"margin_dpo/margin_mean": 0.03418651223182678,
"margin_dpo/margin_std": 0.3291283845901489,
"step": 8
},
{
"epoch": 0.013215859030837005,
"grad_norm": 84.95925903320312,
"learning_rate": 5.797101449275362e-08,
"logits/chosen": -0.518346905708313,
"logits/rejected": -0.4730910360813141,
"logps/chosen": -69.84893798828125,
"logps/ref_chosen": -69.8668441772461,
"logps/ref_rejected": -99.6026611328125,
"logps/rejected": -99.63265991210938,
"loss": 1.3819,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4988027811050415,
"margin_dpo/beta_margin_grad_std": 0.010282458737492561,
"margin_dpo/beta_margin_mean": 0.004790524020791054,
"margin_dpo/loss_margin_mean": 0.04790523648262024,
"margin_dpo/margin_mean": 0.04790511727333069,
"margin_dpo/margin_std": 0.4114891588687897,
"step": 9
},
{
"epoch": 0.014684287812041116,
"grad_norm": 70.49417877197266,
"learning_rate": 6.521739130434782e-08,
"logits/chosen": -0.4861105680465698,
"logits/rejected": -0.44242680072784424,
"logps/chosen": -48.30065155029297,
"logps/ref_chosen": -48.35768508911133,
"logps/ref_rejected": -80.37206268310547,
"logps/rejected": -80.39839172363281,
"loss": 1.3783,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4979170560836792,
"margin_dpo/beta_margin_grad_std": 0.008738012053072453,
"margin_dpo/beta_margin_mean": 0.00833646859973669,
"margin_dpo/loss_margin_mean": 0.08336468040943146,
"margin_dpo/margin_mean": 0.08336484432220459,
"margin_dpo/margin_std": 0.3496713638305664,
"step": 10
},
{
"epoch": 0.016152716593245228,
"grad_norm": 68.25067901611328,
"learning_rate": 7.246376811594203e-08,
"logits/chosen": -0.4707266092300415,
"logits/rejected": -0.4461541175842285,
"logps/chosen": -53.01072692871094,
"logps/ref_chosen": -53.01685333251953,
"logps/ref_rejected": -87.78038024902344,
"logps/rejected": -87.81500244140625,
"loss": 1.3825,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4989815652370453,
"margin_dpo/beta_margin_grad_std": 0.008964480832219124,
"margin_dpo/beta_margin_mean": 0.0040746452286839485,
"margin_dpo/loss_margin_mean": 0.040746450424194336,
"margin_dpo/margin_mean": 0.04074642062187195,
"margin_dpo/margin_std": 0.35872533917427063,
"step": 11
},
{
"epoch": 0.01762114537444934,
"grad_norm": 99.98358917236328,
"learning_rate": 7.971014492753623e-08,
"logits/chosen": -0.5403286814689636,
"logits/rejected": -0.5041991472244263,
"logps/chosen": -61.795372009277344,
"logps/ref_chosen": -61.80543518066406,
"logps/ref_rejected": -104.85826873779297,
"logps/rejected": -104.8602294921875,
"loss": 1.3855,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4997004270553589,
"margin_dpo/beta_margin_grad_std": 0.009653432294726372,
"margin_dpo/beta_margin_mean": 0.001203133026137948,
"margin_dpo/loss_margin_mean": 0.012031331658363342,
"margin_dpo/margin_mean": 0.012031003832817078,
"margin_dpo/margin_std": 0.3863860070705414,
"step": 12
},
{
"epoch": 0.01908957415565345,
"grad_norm": 79.62843322753906,
"learning_rate": 8.695652173913042e-08,
"logits/chosen": -0.47281551361083984,
"logits/rejected": -0.44416356086730957,
"logps/chosen": -64.23121643066406,
"logps/ref_chosen": -64.26036071777344,
"logps/ref_rejected": -87.20307922363281,
"logps/rejected": -87.18215942382812,
"loss": 1.3859,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4997946619987488,
"margin_dpo/beta_margin_grad_std": 0.009803904220461845,
"margin_dpo/beta_margin_mean": 0.0008225085912272334,
"margin_dpo/loss_margin_mean": 0.008225083351135254,
"margin_dpo/margin_mean": 0.008224427700042725,
"margin_dpo/margin_std": 0.39235472679138184,
"step": 13
},
{
"epoch": 0.020558002936857563,
"grad_norm": 85.54085540771484,
"learning_rate": 9.420289855072464e-08,
"logits/chosen": -0.4834981858730316,
"logits/rejected": -0.4443725347518921,
"logps/chosen": -58.135520935058594,
"logps/ref_chosen": -58.11021423339844,
"logps/ref_rejected": -104.04708099365234,
"logps/rejected": -104.12353515625,
"loss": 1.3816,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49872222542762756,
"margin_dpo/beta_margin_grad_std": 0.010205242782831192,
"margin_dpo/beta_margin_mean": 0.00511439424008131,
"margin_dpo/loss_margin_mean": 0.05114394426345825,
"margin_dpo/margin_mean": 0.051144301891326904,
"margin_dpo/margin_std": 0.4083808958530426,
"step": 14
},
{
"epoch": 0.022026431718061675,
"grad_norm": 64.28120422363281,
"learning_rate": 1.0144927536231885e-07,
"logits/chosen": -0.505402147769928,
"logits/rejected": -0.4873759150505066,
"logps/chosen": -57.00213623046875,
"logps/ref_chosen": -56.96691131591797,
"logps/ref_rejected": -80.80863952636719,
"logps/rejected": -80.82938385009766,
"loss": 1.3881,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.5003613233566284,
"margin_dpo/beta_margin_grad_std": 0.008744737133383751,
"margin_dpo/beta_margin_mean": -0.0014485123101621866,
"margin_dpo/loss_margin_mean": -0.01448512077331543,
"margin_dpo/margin_mean": -0.01448512077331543,
"margin_dpo/margin_std": 0.34991174936294556,
"step": 15
},
{
"epoch": 0.023494860499265784,
"grad_norm": 84.06546020507812,
"learning_rate": 1.0869565217391303e-07,
"logits/chosen": -0.5580030083656311,
"logits/rejected": -0.5204088687896729,
"logps/chosen": -61.74095153808594,
"logps/ref_chosen": -61.739891052246094,
"logps/ref_rejected": -84.36947631835938,
"logps/rejected": -84.42204284667969,
"loss": 1.3816,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4987128973007202,
"margin_dpo/beta_margin_grad_std": 0.009828168898820877,
"margin_dpo/beta_margin_mean": 0.005151033401489258,
"margin_dpo/loss_margin_mean": 0.05151033401489258,
"margin_dpo/margin_mean": 0.051510006189346313,
"margin_dpo/margin_std": 0.3933736979961395,
"step": 16
},
{
"epoch": 0.024963289280469897,
"grad_norm": 78.63739013671875,
"learning_rate": 1.1594202898550725e-07,
"logits/chosen": -0.5074384212493896,
"logits/rejected": -0.47103995084762573,
"logps/chosen": -67.64342498779297,
"logps/ref_chosen": -67.71033477783203,
"logps/ref_rejected": -85.37865447998047,
"logps/rejected": -85.41255187988281,
"loss": 1.3766,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.497480571269989,
"margin_dpo/beta_margin_grad_std": 0.00974523089826107,
"margin_dpo/beta_margin_mean": 0.010080328211188316,
"margin_dpo/loss_margin_mean": 0.10080328583717346,
"margin_dpo/margin_mean": 0.10080331563949585,
"margin_dpo/margin_std": 0.39002037048339844,
"step": 17
},
{
"epoch": 0.02643171806167401,
"grad_norm": 82.34278869628906,
"learning_rate": 1.2318840579710146e-07,
"logits/chosen": -0.48814651370048523,
"logits/rejected": -0.4320324659347534,
"logps/chosen": -47.713279724121094,
"logps/ref_chosen": -47.7394905090332,
"logps/ref_rejected": -75.4722900390625,
"logps/rejected": -75.48577880859375,
"loss": 1.3826,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4990071952342987,
"margin_dpo/beta_margin_grad_std": 0.008132295683026314,
"margin_dpo/beta_margin_mean": 0.003970235586166382,
"margin_dpo/loss_margin_mean": 0.03970235586166382,
"margin_dpo/margin_mean": 0.03970211744308472,
"margin_dpo/margin_std": 0.32538339495658875,
"step": 18
},
{
"epoch": 0.027900146842878122,
"grad_norm": 73.4638900756836,
"learning_rate": 1.3043478260869563e-07,
"logits/chosen": -0.48973095417022705,
"logits/rejected": -0.4396272301673889,
"logps/chosen": -70.17350769042969,
"logps/ref_chosen": -70.20535278320312,
"logps/ref_rejected": -89.75758361816406,
"logps/rejected": -89.85565948486328,
"loss": 1.3737,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49675452709198,
"margin_dpo/beta_margin_grad_std": 0.009917546063661575,
"margin_dpo/beta_margin_mean": 0.012991649098694324,
"margin_dpo/loss_margin_mean": 0.1299164891242981,
"margin_dpo/margin_mean": 0.12991660833358765,
"margin_dpo/margin_std": 0.3970108926296234,
"step": 19
},
{
"epoch": 0.02936857562408223,
"grad_norm": 74.19491577148438,
"learning_rate": 1.3768115942028986e-07,
"logits/chosen": -0.5667568445205688,
"logits/rejected": -0.5119162797927856,
"logps/chosen": -50.822715759277344,
"logps/ref_chosen": -50.80324172973633,
"logps/ref_rejected": -78.8233413696289,
"logps/rejected": -78.87236785888672,
"loss": 1.3836,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4992612898349762,
"margin_dpo/beta_margin_grad_std": 0.007525566965341568,
"margin_dpo/beta_margin_mean": 0.0029547633603215218,
"margin_dpo/loss_margin_mean": 0.02954763174057007,
"margin_dpo/margin_mean": 0.029547661542892456,
"margin_dpo/margin_std": 0.3011046051979065,
"step": 20
},
{
"epoch": 0.030837004405286344,
"grad_norm": 77.03598022460938,
"learning_rate": 1.4492753623188405e-07,
"logits/chosen": -0.5037728548049927,
"logits/rejected": -0.48049020767211914,
"logps/chosen": -50.014862060546875,
"logps/ref_chosen": -50.063018798828125,
"logps/ref_rejected": -77.86878967285156,
"logps/rejected": -78.02366638183594,
"loss": 1.3664,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4949270188808441,
"margin_dpo/beta_margin_grad_std": 0.008798542432487011,
"margin_dpo/beta_margin_mean": 0.020303059369325638,
"margin_dpo/loss_margin_mean": 0.20303058624267578,
"margin_dpo/margin_mean": 0.20303112268447876,
"margin_dpo/margin_std": 0.3521846532821655,
"step": 21
},
{
"epoch": 0.032305433186490456,
"grad_norm": 84.57589721679688,
"learning_rate": 1.5217391304347825e-07,
"logits/chosen": -0.49148398637771606,
"logits/rejected": -0.44818878173828125,
"logps/chosen": -58.99713897705078,
"logps/ref_chosen": -59.05763626098633,
"logps/ref_rejected": -97.50466918945312,
"logps/rejected": -97.65492248535156,
"loss": 1.3657,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4947338104248047,
"margin_dpo/beta_margin_grad_std": 0.009125478565692902,
"margin_dpo/beta_margin_mean": 0.021074719727039337,
"margin_dpo/loss_margin_mean": 0.21074718236923218,
"margin_dpo/margin_mean": 0.210746169090271,
"margin_dpo/margin_std": 0.36520200967788696,
"step": 22
},
{
"epoch": 0.033773861967694566,
"grad_norm": 80.40442657470703,
"learning_rate": 1.5942028985507245e-07,
"logits/chosen": -0.4710449278354645,
"logits/rejected": -0.44750112295150757,
"logps/chosen": -60.034095764160156,
"logps/ref_chosen": -60.07769775390625,
"logps/ref_rejected": -81.1395492553711,
"logps/rejected": -81.3127212524414,
"loss": 1.3652,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49458569288253784,
"margin_dpo/beta_margin_grad_std": 0.010714245960116386,
"margin_dpo/beta_margin_mean": 0.021676737815141678,
"margin_dpo/loss_margin_mean": 0.21676737070083618,
"margin_dpo/margin_mean": 0.21676787734031677,
"margin_dpo/margin_std": 0.42905572056770325,
"step": 23
},
{
"epoch": 0.03524229074889868,
"grad_norm": 86.11105346679688,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -0.515487790107727,
"logits/rejected": -0.49895963072776794,
"logps/chosen": -44.28882598876953,
"logps/ref_chosen": -44.29103469848633,
"logps/ref_rejected": -99.12521362304688,
"logps/rejected": -99.3304443359375,
"loss": 1.3661,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.49481743574142456,
"margin_dpo/beta_margin_grad_std": 0.01028534211218357,
"margin_dpo/beta_margin_mean": 0.020744048058986664,
"margin_dpo/loss_margin_mean": 0.20744048058986664,
"margin_dpo/margin_mean": 0.20744094252586365,
"margin_dpo/margin_std": 0.4117741584777832,
"step": 24
},
{
"epoch": 0.03671071953010279,
"grad_norm": 74.07949829101562,
"learning_rate": 1.7391304347826085e-07,
"logits/chosen": -0.5136522650718689,
"logits/rejected": -0.4842616319656372,
"logps/chosen": -52.5118408203125,
"logps/ref_chosen": -52.537052154541016,
"logps/ref_rejected": -89.34219360351562,
"logps/rejected": -89.51565551757812,
"loss": 1.367,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4950360655784607,
"margin_dpo/beta_margin_grad_std": 0.010537989437580109,
"margin_dpo/beta_margin_mean": 0.019867265596985817,
"margin_dpo/loss_margin_mean": 0.19867265224456787,
"margin_dpo/margin_mean": 0.1986721158027649,
"margin_dpo/margin_std": 0.4217980206012726,
"step": 25
},
{
"epoch": 0.0381791483113069,
"grad_norm": 87.3241195678711,
"learning_rate": 1.8115942028985507e-07,
"logits/chosen": -0.5391855239868164,
"logits/rejected": -0.5075402855873108,
"logps/chosen": -53.83518981933594,
"logps/ref_chosen": -53.92280578613281,
"logps/ref_rejected": -103.35971069335938,
"logps/rejected": -103.70204162597656,
"loss": 1.3445,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4892633557319641,
"margin_dpo/beta_margin_grad_std": 0.013717170804738998,
"margin_dpo/beta_margin_mean": 0.04299398139119148,
"margin_dpo/loss_margin_mean": 0.42993980646133423,
"margin_dpo/margin_mean": 0.42994067072868347,
"margin_dpo/margin_std": 0.5494698286056519,
"step": 26
},
{
"epoch": 0.039647577092511016,
"grad_norm": 93.3059310913086,
"learning_rate": 1.8840579710144927e-07,
"logits/chosen": -0.5076569318771362,
"logits/rejected": -0.47098520398139954,
"logps/chosen": -42.758522033691406,
"logps/ref_chosen": -42.898529052734375,
"logps/ref_rejected": -98.72420501708984,
"logps/rejected": -99.09854125976562,
"loss": 1.3364,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4871601164340973,
"margin_dpo/beta_margin_grad_std": 0.014479693956673145,
"margin_dpo/beta_margin_mean": 0.051434241235256195,
"margin_dpo/loss_margin_mean": 0.5143424272537231,
"margin_dpo/margin_mean": 0.514342188835144,
"margin_dpo/margin_std": 0.5809046626091003,
"step": 27
},
{
"epoch": 0.041116005873715125,
"grad_norm": 75.3113784790039,
"learning_rate": 1.9565217391304347e-07,
"logits/chosen": -0.5132657289505005,
"logits/rejected": -0.4586002230644226,
"logps/chosen": -60.55534362792969,
"logps/ref_chosen": -60.55650329589844,
"logps/ref_rejected": -91.40111541748047,
"logps/rejected": -91.69779205322266,
"loss": 1.3575,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.492561399936676,
"margin_dpo/beta_margin_grad_std": 0.013777370564639568,
"margin_dpo/beta_margin_mean": 0.029783397912979126,
"margin_dpo/loss_margin_mean": 0.29783397912979126,
"margin_dpo/margin_mean": 0.29783421754837036,
"margin_dpo/margin_std": 0.5516640543937683,
"step": 28
},
{
"epoch": 0.042584434654919234,
"grad_norm": 90.50589752197266,
"learning_rate": 2.028985507246377e-07,
"logits/chosen": -0.5619853734970093,
"logits/rejected": -0.5164209008216858,
"logps/chosen": -57.673362731933594,
"logps/ref_chosen": -57.80778503417969,
"logps/ref_rejected": -97.39434814453125,
"logps/rejected": -97.85377502441406,
"loss": 1.3285,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4851696789264679,
"margin_dpo/beta_margin_grad_std": 0.01294540986418724,
"margin_dpo/beta_margin_mean": 0.0593840591609478,
"margin_dpo/loss_margin_mean": 0.5938405990600586,
"margin_dpo/margin_mean": 0.5938413739204407,
"margin_dpo/margin_std": 0.5187057256698608,
"step": 29
},
{
"epoch": 0.04405286343612335,
"grad_norm": 87.18180847167969,
"learning_rate": 2.1014492753623187e-07,
"logits/chosen": -0.5116697549819946,
"logits/rejected": -0.4816800057888031,
"logps/chosen": -52.425750732421875,
"logps/ref_chosen": -52.57737350463867,
"logps/ref_rejected": -98.48921203613281,
"logps/rejected": -99.05937957763672,
"loss": 1.3165,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4819870591163635,
"margin_dpo/beta_margin_grad_std": 0.016035309061408043,
"margin_dpo/beta_margin_mean": 0.07217944413423538,
"margin_dpo/loss_margin_mean": 0.7217944860458374,
"margin_dpo/margin_mean": 0.7217941880226135,
"margin_dpo/margin_std": 0.6435875296592712,
"step": 30
},
{
"epoch": 0.04552129221732746,
"grad_norm": 67.85016632080078,
"learning_rate": 2.1739130434782607e-07,
"logits/chosen": -0.5148423910140991,
"logits/rejected": -0.4710330367088318,
"logps/chosen": -63.68492889404297,
"logps/ref_chosen": -63.806922912597656,
"logps/ref_rejected": -72.89400482177734,
"logps/rejected": -73.29931640625,
"loss": 1.3354,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4868418872356415,
"margin_dpo/beta_margin_grad_std": 0.016791202127933502,
"margin_dpo/beta_margin_mean": 0.052730634808540344,
"margin_dpo/loss_margin_mean": 0.527306318283081,
"margin_dpo/margin_mean": 0.527306318283081,
"margin_dpo/margin_std": 0.6738239526748657,
"step": 31
},
{
"epoch": 0.04698972099853157,
"grad_norm": 81.52291107177734,
"learning_rate": 2.2463768115942027e-07,
"logits/chosen": -0.5098748207092285,
"logits/rejected": -0.46841973066329956,
"logps/chosen": -62.55455017089844,
"logps/ref_chosen": -62.739524841308594,
"logps/ref_rejected": -89.3175048828125,
"logps/rejected": -89.87690734863281,
"loss": 1.3153,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.48145878314971924,
"margin_dpo/beta_margin_grad_std": 0.022345291450619698,
"margin_dpo/beta_margin_mean": 0.07443846762180328,
"margin_dpo/loss_margin_mean": 0.7443846464157104,
"margin_dpo/margin_mean": 0.7443850040435791,
"margin_dpo/margin_std": 0.9011361598968506,
"step": 32
},
{
"epoch": 0.048458149779735685,
"grad_norm": 72.76732635498047,
"learning_rate": 2.318840579710145e-07,
"logits/chosen": -0.5056596994400024,
"logits/rejected": -0.47998249530792236,
"logps/chosen": -53.159671783447266,
"logps/ref_chosen": -53.26097106933594,
"logps/ref_rejected": -87.8851318359375,
"logps/rejected": -88.34671020507812,
"loss": 1.3315,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4859437346458435,
"margin_dpo/beta_margin_grad_std": 0.013436902314424515,
"margin_dpo/beta_margin_mean": 0.056287482380867004,
"margin_dpo/loss_margin_mean": 0.5628747940063477,
"margin_dpo/margin_mean": 0.5628749132156372,
"margin_dpo/margin_std": 0.5385845899581909,
"step": 33
},
{
"epoch": 0.049926578560939794,
"grad_norm": 77.6261978149414,
"learning_rate": 2.391304347826087e-07,
"logits/chosen": -0.49515533447265625,
"logits/rejected": -0.4777703285217285,
"logps/chosen": -50.73601531982422,
"logps/ref_chosen": -50.81732940673828,
"logps/ref_rejected": -101.92184448242188,
"logps/rejected": -102.61337280273438,
"loss": 1.3124,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.48073524236679077,
"margin_dpo/beta_margin_grad_std": 0.02180512621998787,
"margin_dpo/beta_margin_mean": 0.07728321105241776,
"margin_dpo/loss_margin_mean": 0.7728320360183716,
"margin_dpo/margin_mean": 0.7728322744369507,
"margin_dpo/margin_std": 0.8760267496109009,
"step": 34
},
{
"epoch": 0.0513950073421439,
"grad_norm": 82.47791290283203,
"learning_rate": 2.463768115942029e-07,
"logits/chosen": -0.5227484107017517,
"logits/rejected": -0.48601728677749634,
"logps/chosen": -50.88093948364258,
"logps/ref_chosen": -51.02449035644531,
"logps/ref_rejected": -106.82443237304688,
"logps/rejected": -107.93114471435547,
"loss": 1.2685,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4689541757106781,
"margin_dpo/beta_margin_grad_std": 0.027965568006038666,
"margin_dpo/beta_margin_mean": 0.12502656877040863,
"margin_dpo/loss_margin_mean": 1.2502657175064087,
"margin_dpo/margin_mean": 1.2502658367156982,
"margin_dpo/margin_std": 1.1440428495407104,
"step": 35
},
{
"epoch": 0.05286343612334802,
"grad_norm": 72.95713806152344,
"learning_rate": 2.536231884057971e-07,
"logits/chosen": -0.563947319984436,
"logits/rejected": -0.5279806852340698,
"logps/chosen": -51.93867492675781,
"logps/ref_chosen": -51.991493225097656,
"logps/ref_rejected": -86.04061889648438,
"logps/rejected": -87.13275146484375,
"loss": 1.2793,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4715506434440613,
"margin_dpo/beta_margin_grad_std": 0.031429655849933624,
"margin_dpo/beta_margin_mean": 0.11449373513460159,
"margin_dpo/loss_margin_mean": 1.14493727684021,
"margin_dpo/margin_mean": 1.144936442375183,
"margin_dpo/margin_std": 1.2692325115203857,
"step": 36
},
{
"epoch": 0.05433186490455213,
"grad_norm": 61.87527084350586,
"learning_rate": 2.6086956521739126e-07,
"logits/chosen": -0.5000085234642029,
"logits/rejected": -0.4554196000099182,
"logps/chosen": -62.77561950683594,
"logps/ref_chosen": -62.807106018066406,
"logps/ref_rejected": -77.89507293701172,
"logps/rejected": -78.88422393798828,
"loss": 1.2909,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.47465330362319946,
"margin_dpo/beta_margin_grad_std": 0.0310398917645216,
"margin_dpo/beta_margin_mean": 0.10206404328346252,
"margin_dpo/loss_margin_mean": 1.0206403732299805,
"margin_dpo/margin_mean": 1.0206403732299805,
"margin_dpo/margin_std": 1.2562531232833862,
"step": 37
},
{
"epoch": 0.055800293685756244,
"grad_norm": 69.35832977294922,
"learning_rate": 2.681159420289855e-07,
"logits/chosen": -0.5131621360778809,
"logits/rejected": -0.4803985357284546,
"logps/chosen": -48.25373077392578,
"logps/ref_chosen": -48.39051818847656,
"logps/ref_rejected": -97.91244506835938,
"logps/rejected": -99.13421630859375,
"loss": 1.262,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.46648678183555603,
"margin_dpo/beta_margin_grad_std": 0.0405726283788681,
"margin_dpo/beta_margin_mean": 0.13585661351680756,
"margin_dpo/loss_margin_mean": 1.3585660457611084,
"margin_dpo/margin_mean": 1.3585660457611084,
"margin_dpo/margin_std": 1.6711539030075073,
"step": 38
},
{
"epoch": 0.05726872246696035,
"grad_norm": 73.56781768798828,
"learning_rate": 2.753623188405797e-07,
"logits/chosen": -0.5736282467842102,
"logits/rejected": -0.534826934337616,
"logps/chosen": -50.66197204589844,
"logps/ref_chosen": -50.75046920776367,
"logps/ref_rejected": -78.56951141357422,
"logps/rejected": -80.15695190429688,
"loss": 1.2309,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.458440363407135,
"margin_dpo/beta_margin_grad_std": 0.035203345119953156,
"margin_dpo/beta_margin_mean": 0.1675935983657837,
"margin_dpo/loss_margin_mean": 1.675935983657837,
"margin_dpo/margin_mean": 1.6759363412857056,
"margin_dpo/margin_std": 1.4285030364990234,
"step": 39
},
{
"epoch": 0.05873715124816446,
"grad_norm": 60.51735305786133,
"learning_rate": 2.8260869565217386e-07,
"logits/chosen": -0.527452826499939,
"logits/rejected": -0.4978986382484436,
"logps/chosen": -57.774688720703125,
"logps/ref_chosen": -57.985069274902344,
"logps/ref_rejected": -74.30007934570312,
"logps/rejected": -75.63487243652344,
"loss": 1.2454,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.46187901496887207,
"margin_dpo/beta_margin_grad_std": 0.041983917355537415,
"margin_dpo/beta_margin_mean": 0.15451756119728088,
"margin_dpo/loss_margin_mean": 1.5451757907867432,
"margin_dpo/margin_mean": 1.5451761484146118,
"margin_dpo/margin_std": 1.721125602722168,
"step": 40
},
{
"epoch": 0.06020558002936858,
"grad_norm": 68.02806091308594,
"learning_rate": 2.898550724637681e-07,
"logits/chosen": -0.522682785987854,
"logits/rejected": -0.4852331280708313,
"logps/chosen": -62.648956298828125,
"logps/ref_chosen": -62.69581604003906,
"logps/ref_rejected": -97.02352905273438,
"logps/rejected": -98.86910247802734,
"loss": 1.216,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4534452557563782,
"margin_dpo/beta_margin_grad_std": 0.04771146923303604,
"margin_dpo/beta_margin_mean": 0.1892436146736145,
"margin_dpo/loss_margin_mean": 1.892436146736145,
"margin_dpo/margin_mean": 1.8924363851547241,
"margin_dpo/margin_std": 1.9684252738952637,
"step": 41
},
{
"epoch": 0.06167400881057269,
"grad_norm": 79.33026123046875,
"learning_rate": 2.971014492753623e-07,
"logits/chosen": -0.5209932923316956,
"logits/rejected": -0.4742482602596283,
"logps/chosen": -58.71235275268555,
"logps/ref_chosen": -58.96642303466797,
"logps/ref_rejected": -109.90837097167969,
"logps/rejected": -112.24879455566406,
"loss": 1.1582,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.43676841259002686,
"margin_dpo/beta_margin_grad_std": 0.05766534060239792,
"margin_dpo/beta_margin_mean": 0.25944995880126953,
"margin_dpo/loss_margin_mean": 2.5944995880126953,
"margin_dpo/margin_mean": 2.5944998264312744,
"margin_dpo/margin_std": 2.435802936553955,
"step": 42
},
{
"epoch": 0.0631424375917768,
"grad_norm": 71.26874542236328,
"learning_rate": 3.043478260869565e-07,
"logits/chosen": -0.5625420808792114,
"logits/rejected": -0.538593590259552,
"logps/chosen": -53.63534927368164,
"logps/ref_chosen": -54.15599822998047,
"logps/ref_rejected": -96.48019409179688,
"logps/rejected": -98.510009765625,
"loss": 1.1584,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4374551773071289,
"margin_dpo/beta_margin_grad_std": 0.05021943897008896,
"margin_dpo/beta_margin_mean": 0.2550460994243622,
"margin_dpo/loss_margin_mean": 2.5504610538482666,
"margin_dpo/margin_mean": 2.5504608154296875,
"margin_dpo/margin_std": 2.1022145748138428,
"step": 43
},
{
"epoch": 0.06461086637298091,
"grad_norm": 78.6224136352539,
"learning_rate": 3.115942028985507e-07,
"logits/chosen": -0.46302688121795654,
"logits/rejected": -0.443297415971756,
"logps/chosen": -49.88066864013672,
"logps/ref_chosen": -50.07849884033203,
"logps/ref_rejected": -108.78376007080078,
"logps/rejected": -111.412841796875,
"loss": 1.1358,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.43084633350372314,
"margin_dpo/beta_margin_grad_std": 0.053701795637607574,
"margin_dpo/beta_margin_mean": 0.2826906740665436,
"margin_dpo/loss_margin_mean": 2.826906681060791,
"margin_dpo/margin_mean": 2.826906442642212,
"margin_dpo/margin_std": 2.2519941329956055,
"step": 44
},
{
"epoch": 0.06607929515418502,
"grad_norm": 61.787879943847656,
"learning_rate": 3.188405797101449e-07,
"logits/chosen": -0.4846153259277344,
"logits/rejected": -0.47198039293289185,
"logps/chosen": -48.231903076171875,
"logps/ref_chosen": -48.41493225097656,
"logps/ref_rejected": -77.93643188476562,
"logps/rejected": -80.11711883544922,
"loss": 1.181,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4426528513431549,
"margin_dpo/beta_margin_grad_std": 0.061857253313064575,
"margin_dpo/beta_margin_mean": 0.23637181520462036,
"margin_dpo/loss_margin_mean": 2.363718032836914,
"margin_dpo/margin_mean": 2.363717555999756,
"margin_dpo/margin_std": 2.6244254112243652,
"step": 45
},
{
"epoch": 0.06754772393538913,
"grad_norm": 69.09931945800781,
"learning_rate": 3.260869565217391e-07,
"logits/chosen": -0.5141834020614624,
"logits/rejected": -0.4625147581100464,
"logps/chosen": -55.74999237060547,
"logps/ref_chosen": -55.999427795410156,
"logps/ref_rejected": -95.652587890625,
"logps/rejected": -98.34117126464844,
"loss": 1.1376,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4294002056121826,
"margin_dpo/beta_margin_grad_std": 0.07263202965259552,
"margin_dpo/beta_margin_mean": 0.29380178451538086,
"margin_dpo/loss_margin_mean": 2.9380178451538086,
"margin_dpo/margin_mean": 2.9380173683166504,
"margin_dpo/margin_std": 3.154534339904785,
"step": 46
},
{
"epoch": 0.06901615271659324,
"grad_norm": 65.72870635986328,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -0.58119797706604,
"logits/rejected": -0.5290583372116089,
"logps/chosen": -57.503753662109375,
"logps/ref_chosen": -57.92607879638672,
"logps/ref_rejected": -94.67920684814453,
"logps/rejected": -97.23452758789062,
"loss": 1.1285,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4276430606842041,
"margin_dpo/beta_margin_grad_std": 0.0631469339132309,
"margin_dpo/beta_margin_mean": 0.29776421189308167,
"margin_dpo/loss_margin_mean": 2.977642059326172,
"margin_dpo/margin_mean": 2.977642297744751,
"margin_dpo/margin_std": 2.6595559120178223,
"step": 47
},
{
"epoch": 0.07048458149779736,
"grad_norm": 72.27952575683594,
"learning_rate": 3.4057971014492755e-07,
"logits/chosen": -0.5920270681381226,
"logits/rejected": -0.5339563488960266,
"logps/chosen": -57.16640853881836,
"logps/ref_chosen": -57.188072204589844,
"logps/ref_rejected": -88.0166015625,
"logps/rejected": -91.12606048583984,
"loss": 1.1231,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4243152141571045,
"margin_dpo/beta_margin_grad_std": 0.07117132842540741,
"margin_dpo/beta_margin_mean": 0.3131124675273895,
"margin_dpo/loss_margin_mean": 3.131124496459961,
"margin_dpo/margin_mean": 3.131124496459961,
"margin_dpo/margin_std": 3.016913890838623,
"step": 48
},
{
"epoch": 0.07195301027900147,
"grad_norm": 63.71873092651367,
"learning_rate": 3.478260869565217e-07,
"logits/chosen": -0.5536686182022095,
"logits/rejected": -0.49566274881362915,
"logps/chosen": -61.38921356201172,
"logps/ref_chosen": -61.685264587402344,
"logps/ref_rejected": -83.76747131347656,
"logps/rejected": -87.34431457519531,
"loss": 1.074,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.40908271074295044,
"margin_dpo/beta_margin_grad_std": 0.08748139441013336,
"margin_dpo/beta_margin_mean": 0.387288898229599,
"margin_dpo/loss_margin_mean": 3.8728890419006348,
"margin_dpo/margin_mean": 3.8728885650634766,
"margin_dpo/margin_std": 3.9563791751861572,
"step": 49
},
{
"epoch": 0.07342143906020558,
"grad_norm": 62.7824592590332,
"learning_rate": 3.5507246376811595e-07,
"logits/chosen": -0.5670984387397766,
"logits/rejected": -0.5319196581840515,
"logps/chosen": -58.91963195800781,
"logps/ref_chosen": -58.72413635253906,
"logps/ref_rejected": -96.35814666748047,
"logps/rejected": -100.67803955078125,
"loss": 1.0538,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.40261325240135193,
"margin_dpo/beta_margin_grad_std": 0.09164208173751831,
"margin_dpo/beta_margin_mean": 0.4124397039413452,
"margin_dpo/loss_margin_mean": 4.124396800994873,
"margin_dpo/margin_mean": 4.124396800994873,
"margin_dpo/margin_std": 4.026268005371094,
"step": 50
},
{
"epoch": 0.07488986784140969,
"grad_norm": 52.12064743041992,
"learning_rate": 3.6231884057971015e-07,
"logits/chosen": -0.5369248390197754,
"logits/rejected": -0.5046299695968628,
"logps/chosen": -61.671791076660156,
"logps/ref_chosen": -61.3736686706543,
"logps/ref_rejected": -76.00199890136719,
"logps/rejected": -80.37809753417969,
"loss": 1.0821,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.4077322781085968,
"margin_dpo/beta_margin_grad_std": 0.11022845655679703,
"margin_dpo/beta_margin_mean": 0.4077974557876587,
"margin_dpo/loss_margin_mean": 4.077974796295166,
"margin_dpo/margin_mean": 4.077974319458008,
"margin_dpo/margin_std": 5.209657669067383,
"step": 51
},
{
"epoch": 0.0763582966226138,
"grad_norm": 58.820133209228516,
"learning_rate": 3.695652173913043e-07,
"logits/chosen": -0.5732629299163818,
"logits/rejected": -0.5190708637237549,
"logps/chosen": -51.953399658203125,
"logps/ref_chosen": -52.33735656738281,
"logps/ref_rejected": -79.97391510009766,
"logps/rejected": -85.85843658447266,
"loss": 0.9142,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3579610586166382,
"margin_dpo/beta_margin_grad_std": 0.10757434368133545,
"margin_dpo/beta_margin_mean": 0.6268481016159058,
"margin_dpo/loss_margin_mean": 6.26848030090332,
"margin_dpo/margin_mean": 6.2684807777404785,
"margin_dpo/margin_std": 5.199737548828125,
"step": 52
},
{
"epoch": 0.07782672540381791,
"grad_norm": 57.97807693481445,
"learning_rate": 3.7681159420289855e-07,
"logits/chosen": -0.618739128112793,
"logits/rejected": -0.5973125100135803,
"logps/chosen": -53.48461151123047,
"logps/ref_chosen": -53.31465530395508,
"logps/ref_rejected": -91.7835922241211,
"logps/rejected": -98.30101013183594,
"loss": 0.9439,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.36459940671920776,
"margin_dpo/beta_margin_grad_std": 0.11843107640743256,
"margin_dpo/beta_margin_mean": 0.6347463130950928,
"margin_dpo/loss_margin_mean": 6.347463130950928,
"margin_dpo/margin_mean": 6.347464084625244,
"margin_dpo/margin_std": 6.299587726593018,
"step": 53
},
{
"epoch": 0.07929515418502203,
"grad_norm": 58.53452682495117,
"learning_rate": 3.8405797101449274e-07,
"logits/chosen": -0.6002248525619507,
"logits/rejected": -0.5472081303596497,
"logps/chosen": -51.132781982421875,
"logps/ref_chosen": -50.68865966796875,
"logps/ref_rejected": -91.71539306640625,
"logps/rejected": -97.54826354980469,
"loss": 0.9754,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.37753698229789734,
"margin_dpo/beta_margin_grad_std": 0.10520176589488983,
"margin_dpo/beta_margin_mean": 0.5388752818107605,
"margin_dpo/loss_margin_mean": 5.3887529373168945,
"margin_dpo/margin_mean": 5.3887529373168945,
"margin_dpo/margin_std": 5.09660530090332,
"step": 54
},
{
"epoch": 0.08076358296622614,
"grad_norm": 53.50847244262695,
"learning_rate": 3.9130434782608694e-07,
"logits/chosen": -0.6379266977310181,
"logits/rejected": -0.5748265981674194,
"logps/chosen": -63.582801818847656,
"logps/ref_chosen": -62.615234375,
"logps/ref_rejected": -88.99349975585938,
"logps/rejected": -96.49349212646484,
"loss": 0.9552,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.36137956380844116,
"margin_dpo/beta_margin_grad_std": 0.1454969346523285,
"margin_dpo/beta_margin_mean": 0.6532418727874756,
"margin_dpo/loss_margin_mean": 6.532418727874756,
"margin_dpo/margin_mean": 6.532418251037598,
"margin_dpo/margin_std": 7.533010482788086,
"step": 55
},
{
"epoch": 0.08223201174743025,
"grad_norm": 48.04698944091797,
"learning_rate": 3.9855072463768114e-07,
"logits/chosen": -0.6322102546691895,
"logits/rejected": -0.5908021330833435,
"logps/chosen": -58.65277862548828,
"logps/ref_chosen": -57.93273162841797,
"logps/ref_rejected": -94.1744384765625,
"logps/rejected": -101.14324951171875,
"loss": 0.9724,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3670324981212616,
"margin_dpo/beta_margin_grad_std": 0.14493967592716217,
"margin_dpo/beta_margin_mean": 0.6248764991760254,
"margin_dpo/loss_margin_mean": 6.248764991760254,
"margin_dpo/margin_mean": 6.248764991760254,
"margin_dpo/margin_std": 7.392797470092773,
"step": 56
},
{
"epoch": 0.08370044052863436,
"grad_norm": 53.7747688293457,
"learning_rate": 4.057971014492754e-07,
"logits/chosen": -0.5740865468978882,
"logits/rejected": -0.5456082820892334,
"logps/chosen": -71.21261596679688,
"logps/ref_chosen": -70.49528503417969,
"logps/ref_rejected": -95.56546020507812,
"logps/rejected": -103.3371353149414,
"loss": 0.8958,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3454797565937042,
"margin_dpo/beta_margin_grad_std": 0.1326553225517273,
"margin_dpo/beta_margin_mean": 0.7054347991943359,
"margin_dpo/loss_margin_mean": 7.054348468780518,
"margin_dpo/margin_mean": 7.054348945617676,
"margin_dpo/margin_std": 6.582326889038086,
"step": 57
},
{
"epoch": 0.08516886930983847,
"grad_norm": 58.93936538696289,
"learning_rate": 4.1304347826086954e-07,
"logits/chosen": -0.6123115420341492,
"logits/rejected": -0.5382078886032104,
"logps/chosen": -63.20277786254883,
"logps/ref_chosen": -62.13294219970703,
"logps/ref_rejected": -84.61729431152344,
"logps/rejected": -93.39222717285156,
"loss": 0.8958,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.34207093715667725,
"margin_dpo/beta_margin_grad_std": 0.15293042361736298,
"margin_dpo/beta_margin_mean": 0.7705095410346985,
"margin_dpo/loss_margin_mean": 7.7050957679748535,
"margin_dpo/margin_mean": 7.705096244812012,
"margin_dpo/margin_std": 8.273210525512695,
"step": 58
},
{
"epoch": 0.08663729809104258,
"grad_norm": 55.383514404296875,
"learning_rate": 4.2028985507246374e-07,
"logits/chosen": -0.6397849321365356,
"logits/rejected": -0.6004974842071533,
"logps/chosen": -53.41858673095703,
"logps/ref_chosen": -51.932525634765625,
"logps/ref_rejected": -88.88520050048828,
"logps/rejected": -98.85914611816406,
"loss": 0.857,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.32870835065841675,
"margin_dpo/beta_margin_grad_std": 0.15062686800956726,
"margin_dpo/beta_margin_mean": 0.8487890958786011,
"margin_dpo/loss_margin_mean": 8.48789119720459,
"margin_dpo/margin_mean": 8.487890243530273,
"margin_dpo/margin_std": 8.594100952148438,
"step": 59
},
{
"epoch": 0.0881057268722467,
"grad_norm": 63.981693267822266,
"learning_rate": 4.2753623188405794e-07,
"logits/chosen": -0.6227731704711914,
"logits/rejected": -0.5636199712753296,
"logps/chosen": -63.575462341308594,
"logps/ref_chosen": -60.94218444824219,
"logps/ref_rejected": -85.39340209960938,
"logps/rejected": -94.78590393066406,
"loss": 0.9511,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.35332685708999634,
"margin_dpo/beta_margin_grad_std": 0.1562734991312027,
"margin_dpo/beta_margin_mean": 0.6759233474731445,
"margin_dpo/loss_margin_mean": 6.759233474731445,
"margin_dpo/margin_mean": 6.759233474731445,
"margin_dpo/margin_std": 7.703272819519043,
"step": 60
},
{
"epoch": 0.08957415565345081,
"grad_norm": 54.136070251464844,
"learning_rate": 4.3478260869565214e-07,
"logits/chosen": -0.589980959892273,
"logits/rejected": -0.5553174018859863,
"logps/chosen": -62.088226318359375,
"logps/ref_chosen": -60.633522033691406,
"logps/ref_rejected": -89.85249328613281,
"logps/rejected": -99.72574615478516,
"loss": 0.9274,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.34447312355041504,
"margin_dpo/beta_margin_grad_std": 0.17768457531929016,
"margin_dpo/beta_margin_mean": 0.8418547511100769,
"margin_dpo/loss_margin_mean": 8.418547630310059,
"margin_dpo/margin_mean": 8.418546676635742,
"margin_dpo/margin_std": 11.459321975708008,
"step": 61
},
{
"epoch": 0.09104258443465492,
"grad_norm": 56.41756057739258,
"learning_rate": 4.420289855072464e-07,
"logits/chosen": -0.6043162941932678,
"logits/rejected": -0.5698095560073853,
"logps/chosen": -57.790740966796875,
"logps/ref_chosen": -56.15077209472656,
"logps/ref_rejected": -75.56619262695312,
"logps/rejected": -83.44951629638672,
"loss": 0.9972,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.37032824754714966,
"margin_dpo/beta_margin_grad_std": 0.15750956535339355,
"margin_dpo/beta_margin_mean": 0.624334990978241,
"margin_dpo/loss_margin_mean": 6.243350028991699,
"margin_dpo/margin_mean": 6.243350028991699,
"margin_dpo/margin_std": 8.166690826416016,
"step": 62
},
{
"epoch": 0.09251101321585903,
"grad_norm": 56.643470764160156,
"learning_rate": 4.4927536231884053e-07,
"logits/chosen": -0.5830048322677612,
"logits/rejected": -0.5372258424758911,
"logps/chosen": -75.84552001953125,
"logps/ref_chosen": -73.14739227294922,
"logps/ref_rejected": -97.61006164550781,
"logps/rejected": -108.71710205078125,
"loss": 0.8745,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3277238607406616,
"margin_dpo/beta_margin_grad_std": 0.1626659482717514,
"margin_dpo/beta_margin_mean": 0.84089195728302,
"margin_dpo/loss_margin_mean": 8.408918380737305,
"margin_dpo/margin_mean": 8.408919334411621,
"margin_dpo/margin_std": 8.86873722076416,
"step": 63
},
{
"epoch": 0.09397944199706314,
"grad_norm": 51.555030822753906,
"learning_rate": 4.5652173913043473e-07,
"logits/chosen": -0.6049680113792419,
"logits/rejected": -0.5739491581916809,
"logps/chosen": -54.96660232543945,
"logps/ref_chosen": -53.99859619140625,
"logps/ref_rejected": -93.53020477294922,
"logps/rejected": -104.44441223144531,
"loss": 0.8416,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.31263962388038635,
"margin_dpo/beta_margin_grad_std": 0.1735057830810547,
"margin_dpo/beta_margin_mean": 0.994620680809021,
"margin_dpo/loss_margin_mean": 9.946207046508789,
"margin_dpo/margin_mean": 9.946207046508789,
"margin_dpo/margin_std": 11.080026626586914,
"step": 64
},
{
"epoch": 0.09544787077826726,
"grad_norm": 54.08346939086914,
"learning_rate": 4.63768115942029e-07,
"logits/chosen": -0.6963008642196655,
"logits/rejected": -0.6837696433067322,
"logps/chosen": -68.06695556640625,
"logps/ref_chosen": -64.83599853515625,
"logps/ref_rejected": -109.94645690917969,
"logps/rejected": -123.10871124267578,
"loss": 0.8597,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.31020408868789673,
"margin_dpo/beta_margin_grad_std": 0.18927563726902008,
"margin_dpo/beta_margin_mean": 0.9931299686431885,
"margin_dpo/loss_margin_mean": 9.931299209594727,
"margin_dpo/margin_mean": 9.93129825592041,
"margin_dpo/margin_std": 11.138134002685547,
"step": 65
},
{
"epoch": 0.09691629955947137,
"grad_norm": 52.64336013793945,
"learning_rate": 4.7101449275362313e-07,
"logits/chosen": -0.6431201100349426,
"logits/rejected": -0.6103649139404297,
"logps/chosen": -54.33839797973633,
"logps/ref_chosen": -51.44352722167969,
"logps/ref_rejected": -75.63629150390625,
"logps/rejected": -87.60906219482422,
"loss": 0.8852,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3300383687019348,
"margin_dpo/beta_margin_grad_std": 0.17383669316768646,
"margin_dpo/beta_margin_mean": 0.9077892303466797,
"margin_dpo/loss_margin_mean": 9.077892303466797,
"margin_dpo/margin_mean": 9.07789134979248,
"margin_dpo/margin_std": 11.045241355895996,
"step": 66
},
{
"epoch": 0.09838472834067548,
"grad_norm": 53.70967102050781,
"learning_rate": 4.782608695652174e-07,
"logits/chosen": -0.6037384271621704,
"logits/rejected": -0.56143718957901,
"logps/chosen": -61.83789825439453,
"logps/ref_chosen": -59.34080505371094,
"logps/ref_rejected": -72.78729248046875,
"logps/rejected": -84.55035400390625,
"loss": 0.8693,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3258131444454193,
"margin_dpo/beta_margin_grad_std": 0.1767134666442871,
"margin_dpo/beta_margin_mean": 0.9265965223312378,
"margin_dpo/loss_margin_mean": 9.265965461730957,
"margin_dpo/margin_mean": 9.26596450805664,
"margin_dpo/margin_std": 10.946893692016602,
"step": 67
},
{
"epoch": 0.09985315712187959,
"grad_norm": 51.866180419921875,
"learning_rate": 4.855072463768116e-07,
"logits/chosen": -0.6399117708206177,
"logits/rejected": -0.5805681347846985,
"logps/chosen": -68.01475524902344,
"logps/ref_chosen": -65.2058334350586,
"logps/ref_rejected": -77.20724487304688,
"logps/rejected": -88.76637268066406,
"loss": 0.8436,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3251006603240967,
"margin_dpo/beta_margin_grad_std": 0.1536335051059723,
"margin_dpo/beta_margin_mean": 0.875019907951355,
"margin_dpo/loss_margin_mean": 8.750198364257812,
"margin_dpo/margin_mean": 8.750198364257812,
"margin_dpo/margin_std": 8.96760082244873,
"step": 68
},
{
"epoch": 0.1013215859030837,
"grad_norm": 52.978328704833984,
"learning_rate": 4.927536231884058e-07,
"logits/chosen": -0.619906485080719,
"logits/rejected": -0.5960003137588501,
"logps/chosen": -63.03958511352539,
"logps/ref_chosen": -59.81924057006836,
"logps/ref_rejected": -103.38886260986328,
"logps/rejected": -117.06353759765625,
"loss": 0.7728,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.30147287249565125,
"margin_dpo/beta_margin_grad_std": 0.15879851579666138,
"margin_dpo/beta_margin_mean": 1.0454329252243042,
"margin_dpo/loss_margin_mean": 10.454328536987305,
"margin_dpo/margin_mean": 10.454329490661621,
"margin_dpo/margin_std": 10.263179779052734,
"step": 69
},
{
"epoch": 0.1027900146842878,
"grad_norm": 58.304927825927734,
"learning_rate": 5e-07,
"logits/chosen": -0.6229462623596191,
"logits/rejected": -0.5881924629211426,
"logps/chosen": -66.45687103271484,
"logps/ref_chosen": -61.930641174316406,
"logps/ref_rejected": -91.060791015625,
"logps/rejected": -106.82664489746094,
"loss": 0.7921,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2991790771484375,
"margin_dpo/beta_margin_grad_std": 0.1847190260887146,
"margin_dpo/beta_margin_mean": 1.123962163925171,
"margin_dpo/loss_margin_mean": 11.239620208740234,
"margin_dpo/margin_mean": 11.239620208740234,
"margin_dpo/margin_std": 11.950462341308594,
"step": 70
},
{
"epoch": 0.10425844346549193,
"grad_norm": 50.17360305786133,
"learning_rate": 4.999967061337492e-07,
"logits/chosen": -0.6788771152496338,
"logits/rejected": -0.6398866772651672,
"logps/chosen": -65.67703247070312,
"logps/ref_chosen": -61.750343322753906,
"logps/ref_rejected": -97.33662414550781,
"logps/rejected": -114.21321105957031,
"loss": 0.6993,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2723275125026703,
"margin_dpo/beta_margin_grad_std": 0.1644226610660553,
"margin_dpo/beta_margin_mean": 1.2949903011322021,
"margin_dpo/loss_margin_mean": 12.949902534484863,
"margin_dpo/margin_mean": 12.949902534484863,
"margin_dpo/margin_std": 12.493947982788086,
"step": 71
},
{
"epoch": 0.10572687224669604,
"grad_norm": 59.9242057800293,
"learning_rate": 4.999868246217933e-07,
"logits/chosen": -0.6571969985961914,
"logits/rejected": -0.6217666864395142,
"logps/chosen": -70.40309143066406,
"logps/ref_chosen": -66.05341339111328,
"logps/ref_rejected": -95.2869873046875,
"logps/rejected": -113.08145141601562,
"loss": 0.7306,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26945406198501587,
"margin_dpo/beta_margin_grad_std": 0.1959770768880844,
"margin_dpo/beta_margin_mean": 1.3444783687591553,
"margin_dpo/loss_margin_mean": 13.444782257080078,
"margin_dpo/margin_mean": 13.444782257080078,
"margin_dpo/margin_std": 13.743330955505371,
"step": 72
},
{
"epoch": 0.10719530102790015,
"grad_norm": 75.93876647949219,
"learning_rate": 4.999703557245192e-07,
"logits/chosen": -0.6721267104148865,
"logits/rejected": -0.6297430992126465,
"logps/chosen": -72.05320739746094,
"logps/ref_chosen": -66.25627136230469,
"logps/ref_rejected": -90.45613861083984,
"logps/rejected": -109.47367858886719,
"loss": 0.9481,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3085705637931824,
"margin_dpo/beta_margin_grad_std": 0.24887485802173615,
"margin_dpo/beta_margin_mean": 1.3220614194869995,
"margin_dpo/loss_margin_mean": 13.220613479614258,
"margin_dpo/margin_mean": 13.220613479614258,
"margin_dpo/margin_std": 18.805517196655273,
"step": 73
},
{
"epoch": 0.10866372980910426,
"grad_norm": 73.18781280517578,
"learning_rate": 4.999472998758977e-07,
"logits/chosen": -0.6080462336540222,
"logits/rejected": -0.5960662364959717,
"logps/chosen": -59.563087463378906,
"logps/ref_chosen": -53.42488098144531,
"logps/ref_rejected": -95.94693756103516,
"logps/rejected": -115.85839080810547,
"loss": 0.8756,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2891170084476471,
"margin_dpo/beta_margin_grad_std": 0.21858373284339905,
"margin_dpo/beta_margin_mean": 1.3773247003555298,
"margin_dpo/loss_margin_mean": 13.773246765136719,
"margin_dpo/margin_mean": 13.773246765136719,
"margin_dpo/margin_std": 20.172929763793945,
"step": 74
},
{
"epoch": 0.11013215859030837,
"grad_norm": 50.57677459716797,
"learning_rate": 4.999176576834721e-07,
"logits/chosen": -0.6782846450805664,
"logits/rejected": -0.6683961153030396,
"logps/chosen": -57.562652587890625,
"logps/ref_chosen": -51.861663818359375,
"logps/ref_rejected": -111.25397491455078,
"logps/rejected": -136.24032592773438,
"loss": 0.6095,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2280743420124054,
"margin_dpo/beta_margin_grad_std": 0.19971585273742676,
"margin_dpo/beta_margin_mean": 1.9285348653793335,
"margin_dpo/loss_margin_mean": 19.285348892211914,
"margin_dpo/margin_mean": 19.28534698486328,
"margin_dpo/margin_std": 18.741535186767578,
"step": 75
},
{
"epoch": 0.11160058737151249,
"grad_norm": 64.94268035888672,
"learning_rate": 4.998814299283415e-07,
"logits/chosen": -0.7133210301399231,
"logits/rejected": -0.6718661785125732,
"logps/chosen": -59.98701095581055,
"logps/ref_chosen": -53.26604080200195,
"logps/ref_rejected": -78.21662139892578,
"logps/rejected": -97.29232788085938,
"loss": 0.8158,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.28044480085372925,
"margin_dpo/beta_margin_grad_std": 0.20121756196022034,
"margin_dpo/beta_margin_mean": 1.235473871231079,
"margin_dpo/loss_margin_mean": 12.354738235473633,
"margin_dpo/margin_mean": 12.354738235473633,
"margin_dpo/margin_std": 14.27847671508789,
"step": 76
},
{
"epoch": 0.1130690161527166,
"grad_norm": 78.29557037353516,
"learning_rate": 4.998386175651409e-07,
"logits/chosen": -0.6657835245132446,
"logits/rejected": -0.623427152633667,
"logps/chosen": -63.632198333740234,
"logps/ref_chosen": -58.0966796875,
"logps/ref_rejected": -93.77361297607422,
"logps/rejected": -118.64299774169922,
"loss": 0.6806,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2214127629995346,
"margin_dpo/beta_margin_grad_std": 0.2203107476234436,
"margin_dpo/beta_margin_mean": 1.9333863258361816,
"margin_dpo/loss_margin_mean": 19.3338623046875,
"margin_dpo/margin_mean": 19.333864212036133,
"margin_dpo/margin_std": 19.132383346557617,
"step": 77
},
{
"epoch": 0.1145374449339207,
"grad_norm": 66.0775146484375,
"learning_rate": 4.997892217220159e-07,
"logits/chosen": -0.6555283069610596,
"logits/rejected": -0.6280935406684875,
"logps/chosen": -60.86896896362305,
"logps/ref_chosen": -55.61378479003906,
"logps/ref_rejected": -84.93436431884766,
"logps/rejected": -105.0614013671875,
"loss": 0.7256,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2643897533416748,
"margin_dpo/beta_margin_grad_std": 0.20472703874111176,
"margin_dpo/beta_margin_mean": 1.4871852397918701,
"margin_dpo/loss_margin_mean": 14.871850967407227,
"margin_dpo/margin_mean": 14.871851921081543,
"margin_dpo/margin_std": 15.568973541259766,
"step": 78
},
{
"epoch": 0.11600587371512482,
"grad_norm": 58.39738845825195,
"learning_rate": 4.997332437005931e-07,
"logits/chosen": -0.6440068483352661,
"logits/rejected": -0.6116843819618225,
"logps/chosen": -60.54931640625,
"logps/ref_chosen": -55.45048522949219,
"logps/ref_rejected": -87.64756774902344,
"logps/rejected": -108.8857192993164,
"loss": 0.777,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.27868783473968506,
"margin_dpo/beta_margin_grad_std": 0.22643005847930908,
"margin_dpo/beta_margin_mean": 1.6139320135116577,
"margin_dpo/loss_margin_mean": 16.139320373535156,
"margin_dpo/margin_mean": 16.139320373535156,
"margin_dpo/margin_std": 18.9587459564209,
"step": 79
},
{
"epoch": 0.11747430249632893,
"grad_norm": 62.89072036743164,
"learning_rate": 4.996706849759452e-07,
"logits/chosen": -0.7126628160476685,
"logits/rejected": -0.6654119491577148,
"logps/chosen": -65.43215942382812,
"logps/ref_chosen": -58.519290924072266,
"logps/ref_rejected": -87.54750061035156,
"logps/rejected": -108.79297637939453,
"loss": 0.8315,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.29197126626968384,
"margin_dpo/beta_margin_grad_std": 0.22850532829761505,
"margin_dpo/beta_margin_mean": 1.4332611560821533,
"margin_dpo/loss_margin_mean": 14.332611083984375,
"margin_dpo/margin_mean": 14.332611083984375,
"margin_dpo/margin_std": 17.499080657958984,
"step": 80
},
{
"epoch": 0.11894273127753303,
"grad_norm": 72.54817199707031,
"learning_rate": 4.996015471965529e-07,
"logits/chosen": -0.7246617674827576,
"logits/rejected": -0.6918442249298096,
"logps/chosen": -72.08871459960938,
"logps/ref_chosen": -66.44886779785156,
"logps/ref_rejected": -129.66270446777344,
"logps/rejected": -154.00892639160156,
"loss": 0.6947,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24446864426136017,
"margin_dpo/beta_margin_grad_std": 0.22534911334514618,
"margin_dpo/beta_margin_mean": 1.8706377744674683,
"margin_dpo/loss_margin_mean": 18.706378936767578,
"margin_dpo/margin_mean": 18.706378936767578,
"margin_dpo/margin_std": 20.739093780517578,
"step": 81
},
{
"epoch": 0.12041116005873716,
"grad_norm": 88.24505615234375,
"learning_rate": 4.995258321842611e-07,
"logits/chosen": -0.6451495885848999,
"logits/rejected": -0.6281242370605469,
"logps/chosen": -59.3546142578125,
"logps/ref_chosen": -52.232383728027344,
"logps/ref_rejected": -90.74325561523438,
"logps/rejected": -113.15169525146484,
"loss": 0.9502,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.28069868683815,
"margin_dpo/beta_margin_grad_std": 0.24664762616157532,
"margin_dpo/beta_margin_mean": 1.5286219120025635,
"margin_dpo/loss_margin_mean": 15.286218643188477,
"margin_dpo/margin_mean": 15.286218643188477,
"margin_dpo/margin_std": 21.404075622558594,
"step": 82
},
{
"epoch": 0.12187958883994127,
"grad_norm": 70.73540496826172,
"learning_rate": 4.994435419342304e-07,
"logits/chosen": -0.6840830445289612,
"logits/rejected": -0.6387213468551636,
"logps/chosen": -62.84056854248047,
"logps/ref_chosen": -55.82738494873047,
"logps/ref_rejected": -103.71590423583984,
"logps/rejected": -127.5250015258789,
"loss": 0.7455,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25954824686050415,
"margin_dpo/beta_margin_grad_std": 0.22985972464084625,
"margin_dpo/beta_margin_mean": 1.679591417312622,
"margin_dpo/loss_margin_mean": 16.795913696289062,
"margin_dpo/margin_mean": 16.795913696289062,
"margin_dpo/margin_std": 18.532222747802734,
"step": 83
},
{
"epoch": 0.12334801762114538,
"grad_norm": 58.62339401245117,
"learning_rate": 4.993546786148857e-07,
"logits/chosen": -0.6582399606704712,
"logits/rejected": -0.6212340593338013,
"logps/chosen": -72.36793518066406,
"logps/ref_chosen": -67.1761703491211,
"logps/ref_rejected": -87.29859924316406,
"logps/rejected": -107.6661605834961,
"loss": 0.6762,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24515578150749207,
"margin_dpo/beta_margin_grad_std": 0.18955430388450623,
"margin_dpo/beta_margin_mean": 1.5175797939300537,
"margin_dpo/loss_margin_mean": 15.175796508789062,
"margin_dpo/margin_mean": 15.175797462463379,
"margin_dpo/margin_std": 13.974632263183594,
"step": 84
},
{
"epoch": 0.12481644640234948,
"grad_norm": 65.68191528320312,
"learning_rate": 4.992592445678582e-07,
"logits/chosen": -0.6145851016044617,
"logits/rejected": -0.5817907452583313,
"logps/chosen": -64.20103454589844,
"logps/ref_chosen": -58.406620025634766,
"logps/ref_rejected": -78.63880157470703,
"logps/rejected": -99.02200317382812,
"loss": 0.7679,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.27928727865219116,
"margin_dpo/beta_margin_grad_std": 0.21024659276008606,
"margin_dpo/beta_margin_mean": 1.458878993988037,
"margin_dpo/loss_margin_mean": 14.588789939880371,
"margin_dpo/margin_mean": 14.588790893554688,
"margin_dpo/margin_std": 15.866073608398438,
"step": 85
},
{
"epoch": 0.1262848751835536,
"grad_norm": 85.21753692626953,
"learning_rate": 4.991572423079235e-07,
"logits/chosen": -0.6728634238243103,
"logits/rejected": -0.6558930277824402,
"logps/chosen": -63.18061828613281,
"logps/ref_chosen": -56.13746643066406,
"logps/ref_rejected": -88.12165069580078,
"logps/rejected": -110.33665466308594,
"loss": 0.9179,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.29970669746398926,
"margin_dpo/beta_margin_grad_std": 0.24453628063201904,
"margin_dpo/beta_margin_mean": 1.5171852111816406,
"margin_dpo/loss_margin_mean": 15.17185115814209,
"margin_dpo/margin_mean": 15.171852111816406,
"margin_dpo/margin_std": 21.81802749633789,
"step": 86
},
{
"epoch": 0.1277533039647577,
"grad_norm": 66.58555603027344,
"learning_rate": 4.990486745229364e-07,
"logits/chosen": -0.7240000367164612,
"logits/rejected": -0.6876901984214783,
"logps/chosen": -62.49974060058594,
"logps/ref_chosen": -55.63609313964844,
"logps/ref_rejected": -95.46757507324219,
"logps/rejected": -118.70195007324219,
"loss": 0.7934,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2582399249076843,
"margin_dpo/beta_margin_grad_std": 0.22514671087265015,
"margin_dpo/beta_margin_mean": 1.6370728015899658,
"margin_dpo/loss_margin_mean": 16.3707275390625,
"margin_dpo/margin_mean": 16.3707275390625,
"margin_dpo/margin_std": 19.043777465820312,
"step": 87
},
{
"epoch": 0.12922173274596183,
"grad_norm": 75.37992095947266,
"learning_rate": 4.989335440737586e-07,
"logits/chosen": -0.6701527237892151,
"logits/rejected": -0.6537374258041382,
"logps/chosen": -82.11605072021484,
"logps/ref_chosen": -73.67115020751953,
"logps/ref_rejected": -106.70849609375,
"logps/rejected": -127.71624755859375,
"loss": 0.9174,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.30198073387145996,
"margin_dpo/beta_margin_grad_std": 0.23300787806510925,
"margin_dpo/beta_margin_mean": 1.2562841176986694,
"margin_dpo/loss_margin_mean": 12.562840461730957,
"margin_dpo/margin_mean": 12.562841415405273,
"margin_dpo/margin_std": 15.86634635925293,
"step": 88
},
{
"epoch": 0.13069016152716592,
"grad_norm": 54.054622650146484,
"learning_rate": 4.988118539941847e-07,
"logits/chosen": -0.7244502902030945,
"logits/rejected": -0.6862339973449707,
"logps/chosen": -65.05143737792969,
"logps/ref_chosen": -60.624916076660156,
"logps/ref_rejected": -82.08354949951172,
"logps/rejected": -99.46173095703125,
"loss": 0.7405,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.27611032128334045,
"margin_dpo/beta_margin_grad_std": 0.17834323644638062,
"margin_dpo/beta_margin_mean": 1.295165777206421,
"margin_dpo/loss_margin_mean": 12.951656341552734,
"margin_dpo/margin_mean": 12.95165729522705,
"margin_dpo/margin_std": 13.88388442993164,
"step": 89
},
{
"epoch": 0.13215859030837004,
"grad_norm": 66.94837188720703,
"learning_rate": 4.986836074908615e-07,
"logits/chosen": -0.6321258544921875,
"logits/rejected": -0.6237634420394897,
"logps/chosen": -59.42333221435547,
"logps/ref_chosen": -53.285308837890625,
"logps/ref_rejected": -111.54470825195312,
"logps/rejected": -133.43154907226562,
"loss": 0.8384,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.285112202167511,
"margin_dpo/beta_margin_grad_std": 0.226267009973526,
"margin_dpo/beta_margin_mean": 1.5748803615570068,
"margin_dpo/loss_margin_mean": 15.748802185058594,
"margin_dpo/margin_mean": 15.748802185058594,
"margin_dpo/margin_std": 20.31298065185547,
"step": 90
},
{
"epoch": 0.13362701908957417,
"grad_norm": 65.86888122558594,
"learning_rate": 4.985488079432037e-07,
"logits/chosen": -0.6855983734130859,
"logits/rejected": -0.6458035707473755,
"logps/chosen": -67.0127944946289,
"logps/ref_chosen": -61.80295944213867,
"logps/ref_rejected": -87.87395477294922,
"logps/rejected": -108.96083068847656,
"loss": 0.7585,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2720244526863098,
"margin_dpo/beta_margin_grad_std": 0.22684738039970398,
"margin_dpo/beta_margin_mean": 1.5877044200897217,
"margin_dpo/loss_margin_mean": 15.877042770385742,
"margin_dpo/margin_mean": 15.877042770385742,
"margin_dpo/margin_std": 17.475290298461914,
"step": 91
},
{
"epoch": 0.13509544787077826,
"grad_norm": 60.52584457397461,
"learning_rate": 4.984074589033043e-07,
"logits/chosen": -0.7122005224227905,
"logits/rejected": -0.6839097738265991,
"logps/chosen": -56.672237396240234,
"logps/ref_chosen": -51.640769958496094,
"logps/ref_rejected": -77.88117980957031,
"logps/rejected": -97.52497100830078,
"loss": 0.8096,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.28509509563446045,
"margin_dpo/beta_margin_grad_std": 0.22501453757286072,
"margin_dpo/beta_margin_mean": 1.4612317085266113,
"margin_dpo/loss_margin_mean": 14.61231803894043,
"margin_dpo/margin_mean": 14.61231803894043,
"margin_dpo/margin_std": 17.27523422241211,
"step": 92
},
{
"epoch": 0.13656387665198239,
"grad_norm": 47.008575439453125,
"learning_rate": 4.982595640958425e-07,
"logits/chosen": -0.7324954271316528,
"logits/rejected": -0.671492874622345,
"logps/chosen": -57.973655700683594,
"logps/ref_chosen": -52.529239654541016,
"logps/ref_rejected": -77.1607437133789,
"logps/rejected": -97.39739990234375,
"loss": 0.6889,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2600102424621582,
"margin_dpo/beta_margin_grad_std": 0.18848371505737305,
"margin_dpo/beta_margin_mean": 1.479223608970642,
"margin_dpo/loss_margin_mean": 14.792236328125,
"margin_dpo/margin_mean": 14.792236328125,
"margin_dpo/margin_std": 15.35598087310791,
"step": 93
},
{
"epoch": 0.13803230543318648,
"grad_norm": 51.4643669128418,
"learning_rate": 4.98105127417984e-07,
"logits/chosen": -0.6778910756111145,
"logits/rejected": -0.649002730846405,
"logps/chosen": -67.1570053100586,
"logps/ref_chosen": -61.22261047363281,
"logps/ref_rejected": -99.59902954101562,
"logps/rejected": -121.3552474975586,
"loss": 0.6464,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24860258400440216,
"margin_dpo/beta_margin_grad_std": 0.19004377722740173,
"margin_dpo/beta_margin_mean": 1.5821821689605713,
"margin_dpo/loss_margin_mean": 15.821820259094238,
"margin_dpo/margin_mean": 15.821819305419922,
"margin_dpo/margin_std": 14.735492706298828,
"step": 94
},
{
"epoch": 0.1395007342143906,
"grad_norm": 50.75383758544922,
"learning_rate": 4.979441529392784e-07,
"logits/chosen": -0.6933159828186035,
"logits/rejected": -0.655129075050354,
"logps/chosen": -57.09678649902344,
"logps/ref_chosen": -52.52364730834961,
"logps/ref_rejected": -75.88035583496094,
"logps/rejected": -93.311767578125,
"loss": 0.7209,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2710234224796295,
"margin_dpo/beta_margin_grad_std": 0.1795656383037567,
"margin_dpo/beta_margin_mean": 1.2858270406723022,
"margin_dpo/loss_margin_mean": 12.858270645141602,
"margin_dpo/margin_mean": 12.858270645141602,
"margin_dpo/margin_std": 12.511711120605469,
"step": 95
},
{
"epoch": 0.14096916299559473,
"grad_norm": 50.98220443725586,
"learning_rate": 4.977766449015534e-07,
"logits/chosen": -0.6764161586761475,
"logits/rejected": -0.6342806816101074,
"logps/chosen": -65.936279296875,
"logps/ref_chosen": -62.15697479248047,
"logps/ref_rejected": -96.59601593017578,
"logps/rejected": -117.31523895263672,
"loss": 0.6236,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23587027192115784,
"margin_dpo/beta_margin_grad_std": 0.18438121676445007,
"margin_dpo/beta_margin_mean": 1.6939918994903564,
"margin_dpo/loss_margin_mean": 16.939918518066406,
"margin_dpo/margin_mean": 16.939918518066406,
"margin_dpo/margin_std": 16.764862060546875,
"step": 96
},
{
"epoch": 0.14243759177679882,
"grad_norm": 52.73670959472656,
"learning_rate": 4.976026077188012e-07,
"logits/chosen": -0.6401921510696411,
"logits/rejected": -0.5834782123565674,
"logps/chosen": -59.18102264404297,
"logps/ref_chosen": -54.64636993408203,
"logps/ref_rejected": -76.96475219726562,
"logps/rejected": -95.2552490234375,
"loss": 0.6774,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25927746295928955,
"margin_dpo/beta_margin_grad_std": 0.17485031485557556,
"margin_dpo/beta_margin_mean": 1.375584363937378,
"margin_dpo/loss_margin_mean": 13.755844116210938,
"margin_dpo/margin_mean": 13.755844116210938,
"margin_dpo/margin_std": 12.07811164855957,
"step": 97
},
{
"epoch": 0.14390602055800295,
"grad_norm": 58.02984619140625,
"learning_rate": 4.974220459770639e-07,
"logits/chosen": -0.6708123683929443,
"logits/rejected": -0.6468954086303711,
"logps/chosen": -71.02387237548828,
"logps/ref_chosen": -65.25862884521484,
"logps/ref_rejected": -96.5274887084961,
"logps/rejected": -117.00706481933594,
"loss": 0.7512,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25962063670158386,
"margin_dpo/beta_margin_grad_std": 0.2160511016845703,
"margin_dpo/beta_margin_mean": 1.4714339971542358,
"margin_dpo/loss_margin_mean": 14.714340209960938,
"margin_dpo/margin_mean": 14.714340209960938,
"margin_dpo/margin_std": 15.175495147705078,
"step": 98
},
{
"epoch": 0.14537444933920704,
"grad_norm": 48.38506317138672,
"learning_rate": 4.972349644343108e-07,
"logits/chosen": -0.6916057467460632,
"logits/rejected": -0.6791607737541199,
"logps/chosen": -50.54969787597656,
"logps/ref_chosen": -45.63848114013672,
"logps/ref_rejected": -86.43792724609375,
"logps/rejected": -107.18087768554688,
"loss": 0.6459,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2525924742221832,
"margin_dpo/beta_margin_grad_std": 0.17562821507453918,
"margin_dpo/beta_margin_mean": 1.5831732749938965,
"margin_dpo/loss_margin_mean": 15.831732749938965,
"margin_dpo/margin_mean": 15.831733703613281,
"margin_dpo/margin_std": 16.313186645507812,
"step": 99
},
{
"epoch": 0.14684287812041116,
"grad_norm": 67.92232513427734,
"learning_rate": 4.970413680203148e-07,
"logits/chosen": -0.6955288648605347,
"logits/rejected": -0.6533514857292175,
"logps/chosen": -62.669090270996094,
"logps/ref_chosen": -57.5939826965332,
"logps/ref_rejected": -74.06021118164062,
"logps/rejected": -90.62651062011719,
"loss": 0.9037,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.30934613943099976,
"margin_dpo/beta_margin_grad_std": 0.21438318490982056,
"margin_dpo/beta_margin_mean": 1.1491191387176514,
"margin_dpo/loss_margin_mean": 11.491190910339355,
"margin_dpo/margin_mean": 11.491189956665039,
"margin_dpo/margin_std": 15.003036499023438,
"step": 100
},
{
"epoch": 0.14684287812041116,
"eval_logits/chosen": -0.6628317832946777,
"eval_logits/rejected": -0.636573851108551,
"eval_logps/chosen": -87.1427230834961,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -103.3295669555664,
"eval_loss": 0.5592836737632751,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.36682140827178955,
"eval_margin_dpo/beta_margin_grad_std": 0.23032358288764954,
"eval_margin_dpo/beta_margin_mean": 0.8439961671829224,
"eval_margin_dpo/loss_margin_mean": 8.439962387084961,
"eval_margin_dpo/margin_mean": 8.439962387084961,
"eval_margin_dpo/margin_std": 15.342604637145996,
"eval_runtime": 39.9749,
"eval_samples_per_second": 58.512,
"eval_steps_per_second": 1.851,
"step": 100
},
{
"epoch": 0.14831130690161526,
"grad_norm": 54.49692153930664,
"learning_rate": 4.968412618365215e-07,
"logits/chosen": -0.7001588344573975,
"logits/rejected": -0.6611640453338623,
"logps/chosen": -67.25942993164062,
"logps/ref_chosen": -61.64884948730469,
"logps/ref_rejected": -83.18968963623047,
"logps/rejected": -102.17335510253906,
"loss": 0.7842,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.288244366645813,
"margin_dpo/beta_margin_grad_std": 0.20376016199588776,
"margin_dpo/beta_margin_mean": 1.3373081684112549,
"margin_dpo/loss_margin_mean": 13.373082160949707,
"margin_dpo/margin_mean": 13.37308120727539,
"margin_dpo/margin_std": 15.715073585510254,
"step": 101
},
{
"epoch": 0.14977973568281938,
"grad_norm": 69.74262237548828,
"learning_rate": 4.966346511559149e-07,
"logits/chosen": -0.7373714447021484,
"logits/rejected": -0.6927535533905029,
"logps/chosen": -70.96074676513672,
"logps/ref_chosen": -64.0788803100586,
"logps/ref_rejected": -68.18707275390625,
"logps/rejected": -85.39456176757812,
"loss": 0.9365,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.3263900876045227,
"margin_dpo/beta_margin_grad_std": 0.2226821929216385,
"margin_dpo/beta_margin_mean": 1.0325615406036377,
"margin_dpo/loss_margin_mean": 10.325615882873535,
"margin_dpo/margin_mean": 10.325615882873535,
"margin_dpo/margin_std": 14.067426681518555,
"step": 102
},
{
"epoch": 0.1512481644640235,
"grad_norm": 46.19940185546875,
"learning_rate": 4.964215414228785e-07,
"logits/chosen": -0.6903908252716064,
"logits/rejected": -0.6522761583328247,
"logps/chosen": -64.90095520019531,
"logps/ref_chosen": -61.299278259277344,
"logps/ref_rejected": -93.57271575927734,
"logps/rejected": -115.02561950683594,
"loss": 0.5573,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21845993399620056,
"margin_dpo/beta_margin_grad_std": 0.17656126618385315,
"margin_dpo/beta_margin_mean": 1.7851228713989258,
"margin_dpo/loss_margin_mean": 17.851226806640625,
"margin_dpo/margin_mean": 17.851226806640625,
"margin_dpo/margin_std": 15.277688026428223,
"step": 103
},
{
"epoch": 0.1527165932452276,
"grad_norm": 52.796669006347656,
"learning_rate": 4.96201938253052e-07,
"logits/chosen": -0.7016171813011169,
"logits/rejected": -0.6555418968200684,
"logps/chosen": -59.226173400878906,
"logps/ref_chosen": -54.37277603149414,
"logps/ref_rejected": -89.5647201538086,
"logps/rejected": -111.29109191894531,
"loss": 0.6771,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2526766061782837,
"margin_dpo/beta_margin_grad_std": 0.20172113180160522,
"margin_dpo/beta_margin_mean": 1.6872971057891846,
"margin_dpo/loss_margin_mean": 16.87297248840332,
"margin_dpo/margin_mean": 16.872970581054688,
"margin_dpo/margin_std": 17.26502227783203,
"step": 104
},
{
"epoch": 0.15418502202643172,
"grad_norm": 39.37166976928711,
"learning_rate": 4.959758474331832e-07,
"logits/chosen": -0.7300401926040649,
"logits/rejected": -0.6914358139038086,
"logps/chosen": -58.295875549316406,
"logps/ref_chosen": -54.638946533203125,
"logps/ref_rejected": -97.97351837158203,
"logps/rejected": -124.21298217773438,
"loss": 0.4219,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16911230981349945,
"margin_dpo/beta_margin_grad_std": 0.16213884949684143,
"margin_dpo/beta_margin_mean": 2.2582526206970215,
"margin_dpo/loss_margin_mean": 22.58252716064453,
"margin_dpo/margin_mean": 22.58252716064453,
"margin_dpo/margin_std": 16.65502166748047,
"step": 105
},
{
"epoch": 0.15565345080763582,
"grad_norm": 49.93497848510742,
"learning_rate": 4.957432749209755e-07,
"logits/chosen": -0.6783395409584045,
"logits/rejected": -0.6233980059623718,
"logps/chosen": -59.64507293701172,
"logps/ref_chosen": -54.83289337158203,
"logps/ref_rejected": -85.22461700439453,
"logps/rejected": -104.86808013916016,
"loss": 0.6785,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25617170333862305,
"margin_dpo/beta_margin_grad_std": 0.19661831855773926,
"margin_dpo/beta_margin_mean": 1.483128547668457,
"margin_dpo/loss_margin_mean": 14.83128547668457,
"margin_dpo/margin_mean": 14.83128547668457,
"margin_dpo/margin_std": 14.39011001586914,
"step": 106
},
{
"epoch": 0.15712187958883994,
"grad_norm": 50.40999984741211,
"learning_rate": 4.955042268449307e-07,
"logits/chosen": -0.7118107676506042,
"logits/rejected": -0.6568803787231445,
"logps/chosen": -75.61688995361328,
"logps/ref_chosen": -69.70780944824219,
"logps/ref_rejected": -94.73950958251953,
"logps/rejected": -117.43498229980469,
"loss": 0.6451,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23818761110305786,
"margin_dpo/beta_margin_grad_std": 0.204330176115036,
"margin_dpo/beta_margin_mean": 1.6786396503448486,
"margin_dpo/loss_margin_mean": 16.786396026611328,
"margin_dpo/margin_mean": 16.786396026611328,
"margin_dpo/margin_std": 15.435192108154297,
"step": 107
},
{
"epoch": 0.15859030837004406,
"grad_norm": 58.914913177490234,
"learning_rate": 4.952587095041881e-07,
"logits/chosen": -0.7263258695602417,
"logits/rejected": -0.6777476668357849,
"logps/chosen": -61.758811950683594,
"logps/ref_chosen": -56.0098876953125,
"logps/ref_rejected": -95.79601287841797,
"logps/rejected": -118.6368408203125,
"loss": 0.7481,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26686227321624756,
"margin_dpo/beta_margin_grad_std": 0.2259853482246399,
"margin_dpo/beta_margin_mean": 1.7091913223266602,
"margin_dpo/loss_margin_mean": 17.09191131591797,
"margin_dpo/margin_mean": 17.09191131591797,
"margin_dpo/margin_std": 19.03655433654785,
"step": 108
},
{
"epoch": 0.16005873715124816,
"grad_norm": 45.8420295715332,
"learning_rate": 4.95006729368358e-07,
"logits/chosen": -0.6235396862030029,
"logits/rejected": -0.5926010608673096,
"logps/chosen": -67.99076080322266,
"logps/ref_chosen": -62.88549041748047,
"logps/ref_rejected": -98.68573760986328,
"logps/rejected": -122.61830139160156,
"loss": 0.545,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2126576006412506,
"margin_dpo/beta_margin_grad_std": 0.18558171391487122,
"margin_dpo/beta_margin_mean": 1.8827290534973145,
"margin_dpo/loss_margin_mean": 18.827289581298828,
"margin_dpo/margin_mean": 18.82729148864746,
"margin_dpo/margin_std": 15.299284934997559,
"step": 109
},
{
"epoch": 0.16152716593245228,
"grad_norm": 50.39557647705078,
"learning_rate": 4.947482930773511e-07,
"logits/chosen": -0.6504217386245728,
"logits/rejected": -0.5953609347343445,
"logps/chosen": -63.09541320800781,
"logps/ref_chosen": -58.753684997558594,
"logps/ref_rejected": -79.75001525878906,
"logps/rejected": -101.90673828125,
"loss": 0.6773,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23589923977851868,
"margin_dpo/beta_margin_grad_std": 0.20386064052581787,
"margin_dpo/beta_margin_mean": 1.7814992666244507,
"margin_dpo/loss_margin_mean": 17.814992904663086,
"margin_dpo/margin_mean": 17.814992904663086,
"margin_dpo/margin_std": 18.05242919921875,
"step": 110
},
{
"epoch": 0.16299559471365638,
"grad_norm": 53.93953323364258,
"learning_rate": 4.944834074412042e-07,
"logits/chosen": -0.6995693445205688,
"logits/rejected": -0.6706931591033936,
"logps/chosen": -74.98273468017578,
"logps/ref_chosen": -68.62410736083984,
"logps/ref_rejected": -98.42886352539062,
"logps/rejected": -123.05096435546875,
"loss": 0.663,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23302116990089417,
"margin_dpo/beta_margin_grad_std": 0.2124572992324829,
"margin_dpo/beta_margin_mean": 1.826347827911377,
"margin_dpo/loss_margin_mean": 18.263477325439453,
"margin_dpo/margin_mean": 18.263477325439453,
"margin_dpo/margin_std": 17.97542953491211,
"step": 111
},
{
"epoch": 0.1644640234948605,
"grad_norm": 58.261051177978516,
"learning_rate": 4.942120794399002e-07,
"logits/chosen": -0.6929997205734253,
"logits/rejected": -0.6383606791496277,
"logps/chosen": -56.543792724609375,
"logps/ref_chosen": -50.24964141845703,
"logps/ref_rejected": -64.77442932128906,
"logps/rejected": -84.3316421508789,
"loss": 0.8086,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2888134717941284,
"margin_dpo/beta_margin_grad_std": 0.2119472473859787,
"margin_dpo/beta_margin_mean": 1.3263057470321655,
"margin_dpo/loss_margin_mean": 13.263057708740234,
"margin_dpo/margin_mean": 13.263057708740234,
"margin_dpo/margin_std": 15.071852684020996,
"step": 112
},
{
"epoch": 0.16593245227606462,
"grad_norm": 52.767189025878906,
"learning_rate": 4.939343162231841e-07,
"logits/chosen": -0.6641270518302917,
"logits/rejected": -0.6109206676483154,
"logps/chosen": -72.74588012695312,
"logps/ref_chosen": -66.71295166015625,
"logps/ref_rejected": -77.96870422363281,
"logps/rejected": -98.95388793945312,
"loss": 0.659,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25107163190841675,
"margin_dpo/beta_margin_grad_std": 0.19331878423690796,
"margin_dpo/beta_margin_mean": 1.495226263999939,
"margin_dpo/loss_margin_mean": 14.952262878417969,
"margin_dpo/margin_mean": 14.952262878417969,
"margin_dpo/margin_std": 13.45613956451416,
"step": 113
},
{
"epoch": 0.16740088105726872,
"grad_norm": 48.202720642089844,
"learning_rate": 4.936501251103751e-07,
"logits/chosen": -0.6876777410507202,
"logits/rejected": -0.6400505304336548,
"logps/chosen": -63.42707824707031,
"logps/ref_chosen": -57.78507995605469,
"logps/ref_rejected": -87.10966491699219,
"logps/rejected": -112.90645599365234,
"loss": 0.5985,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22679130733013153,
"margin_dpo/beta_margin_grad_std": 0.2024666965007782,
"margin_dpo/beta_margin_mean": 2.0154800415039062,
"margin_dpo/loss_margin_mean": 20.154800415039062,
"margin_dpo/margin_mean": 20.154800415039062,
"margin_dpo/margin_std": 20.26180076599121,
"step": 114
},
{
"epoch": 0.16886930983847284,
"grad_norm": 73.78862762451172,
"learning_rate": 4.933595135901732e-07,
"logits/chosen": -0.7042691111564636,
"logits/rejected": -0.6589173078536987,
"logps/chosen": -73.75160217285156,
"logps/ref_chosen": -65.5826416015625,
"logps/ref_rejected": -98.56552124023438,
"logps/rejected": -122.13522338867188,
"loss": 0.7918,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.27233636379241943,
"margin_dpo/beta_margin_grad_std": 0.21869003772735596,
"margin_dpo/beta_margin_mean": 1.5400748252868652,
"margin_dpo/loss_margin_mean": 15.400747299194336,
"margin_dpo/margin_mean": 15.400747299194336,
"margin_dpo/margin_std": 18.599172592163086,
"step": 115
},
{
"epoch": 0.17033773861967694,
"grad_norm": 47.02722930908203,
"learning_rate": 4.930624893204624e-07,
"logits/chosen": -0.7148517370223999,
"logits/rejected": -0.6804147958755493,
"logps/chosen": -57.35455322265625,
"logps/ref_chosen": -51.40031051635742,
"logps/ref_rejected": -80.5218505859375,
"logps/rejected": -101.68440246582031,
"loss": 0.6214,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24444279074668884,
"margin_dpo/beta_margin_grad_std": 0.17096129059791565,
"margin_dpo/beta_margin_mean": 1.5208299160003662,
"margin_dpo/loss_margin_mean": 15.208297729492188,
"margin_dpo/margin_mean": 15.20829963684082,
"margin_dpo/margin_std": 13.959308624267578,
"step": 116
},
{
"epoch": 0.17180616740088106,
"grad_norm": 61.11030960083008,
"learning_rate": 4.927590601281083e-07,
"logits/chosen": -0.6608189344406128,
"logits/rejected": -0.6192047595977783,
"logps/chosen": -75.69219207763672,
"logps/ref_chosen": -69.29840850830078,
"logps/ref_rejected": -66.58399200439453,
"logps/rejected": -87.99634552001953,
"loss": 0.6968,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26184725761413574,
"margin_dpo/beta_margin_grad_std": 0.19542691111564636,
"margin_dpo/beta_margin_mean": 1.501856803894043,
"margin_dpo/loss_margin_mean": 15.01856803894043,
"margin_dpo/margin_mean": 15.01856803894043,
"margin_dpo/margin_std": 15.984650611877441,
"step": 117
},
{
"epoch": 0.17327459618208516,
"grad_norm": 48.049564361572266,
"learning_rate": 4.924492340087524e-07,
"logits/chosen": -0.6910693645477295,
"logits/rejected": -0.6483018398284912,
"logps/chosen": -62.306884765625,
"logps/ref_chosen": -55.6409797668457,
"logps/ref_rejected": -75.66905212402344,
"logps/rejected": -96.35951232910156,
"loss": 0.6673,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2580760717391968,
"margin_dpo/beta_margin_grad_std": 0.17991520464420319,
"margin_dpo/beta_margin_mean": 1.4024548530578613,
"margin_dpo/loss_margin_mean": 14.024548530578613,
"margin_dpo/margin_mean": 14.024547576904297,
"margin_dpo/margin_std": 12.942065238952637,
"step": 118
},
{
"epoch": 0.17474302496328928,
"grad_norm": 57.746402740478516,
"learning_rate": 4.92133019126601e-07,
"logits/chosen": -0.6886883974075317,
"logits/rejected": -0.6644145250320435,
"logps/chosen": -80.7379379272461,
"logps/ref_chosen": -73.51019287109375,
"logps/ref_rejected": -102.97728729248047,
"logps/rejected": -125.08132934570312,
"loss": 0.7405,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2667841613292694,
"margin_dpo/beta_margin_grad_std": 0.21272984147071838,
"margin_dpo/beta_margin_mean": 1.4876298904418945,
"margin_dpo/loss_margin_mean": 14.876298904418945,
"margin_dpo/margin_mean": 14.876298904418945,
"margin_dpo/margin_std": 16.01374626159668,
"step": 119
},
{
"epoch": 0.1762114537444934,
"grad_norm": 52.1450080871582,
"learning_rate": 4.918104238142103e-07,
"logits/chosen": -0.7026511430740356,
"logits/rejected": -0.6608834266662598,
"logps/chosen": -84.82908630371094,
"logps/ref_chosen": -76.78083801269531,
"logps/ref_rejected": -108.02374267578125,
"logps/rejected": -134.74542236328125,
"loss": 0.6004,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22407472133636475,
"margin_dpo/beta_margin_grad_std": 0.20310235023498535,
"margin_dpo/beta_margin_mean": 1.8673429489135742,
"margin_dpo/loss_margin_mean": 18.673429489135742,
"margin_dpo/margin_mean": 18.673429489135742,
"margin_dpo/margin_std": 17.22457504272461,
"step": 120
},
{
"epoch": 0.1776798825256975,
"grad_norm": 48.357093811035156,
"learning_rate": 4.91481456572267e-07,
"logits/chosen": -0.6549187898635864,
"logits/rejected": -0.6369335651397705,
"logps/chosen": -69.40840911865234,
"logps/ref_chosen": -61.789894104003906,
"logps/ref_rejected": -109.99456787109375,
"logps/rejected": -137.22264099121094,
"loss": 0.5936,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2275564819574356,
"margin_dpo/beta_margin_grad_std": 0.19740846753120422,
"margin_dpo/beta_margin_mean": 1.9609556198120117,
"margin_dpo/loss_margin_mean": 19.609556198120117,
"margin_dpo/margin_mean": 19.60955810546875,
"margin_dpo/margin_std": 18.554580688476562,
"step": 121
},
{
"epoch": 0.17914831130690162,
"grad_norm": 45.592864990234375,
"learning_rate": 4.911461260693638e-07,
"logits/chosen": -0.6858741044998169,
"logits/rejected": -0.6766628623008728,
"logps/chosen": -53.84559631347656,
"logps/ref_chosen": -46.90221405029297,
"logps/ref_rejected": -106.71418762207031,
"logps/rejected": -138.1607666015625,
"loss": 0.4337,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1684824675321579,
"margin_dpo/beta_margin_grad_std": 0.18330231308937073,
"margin_dpo/beta_margin_mean": 2.450319290161133,
"margin_dpo/loss_margin_mean": 24.503192901611328,
"margin_dpo/margin_mean": 24.503192901611328,
"margin_dpo/margin_std": 18.173328399658203,
"step": 122
},
{
"epoch": 0.18061674008810572,
"grad_norm": 66.03611755371094,
"learning_rate": 4.908044411417711e-07,
"logits/chosen": -0.6609284281730652,
"logits/rejected": -0.6286982297897339,
"logps/chosen": -68.30619812011719,
"logps/ref_chosen": -61.33863830566406,
"logps/ref_rejected": -87.77539825439453,
"logps/rejected": -111.76436614990234,
"loss": 0.7836,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2565336227416992,
"margin_dpo/beta_margin_grad_std": 0.23589524626731873,
"margin_dpo/beta_margin_mean": 1.7021416425704956,
"margin_dpo/loss_margin_mean": 17.02141571044922,
"margin_dpo/margin_mean": 17.02141571044922,
"margin_dpo/margin_std": 19.722978591918945,
"step": 123
},
{
"epoch": 0.18208516886930984,
"grad_norm": 62.63188934326172,
"learning_rate": 4.904564107932048e-07,
"logits/chosen": -0.664189338684082,
"logits/rejected": -0.6542805433273315,
"logps/chosen": -78.76295471191406,
"logps/ref_chosen": -71.44833374023438,
"logps/ref_rejected": -117.58056640625,
"logps/rejected": -146.3335723876953,
"loss": 0.6425,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22931857407093048,
"margin_dpo/beta_margin_grad_std": 0.22194989025592804,
"margin_dpo/beta_margin_mean": 2.1438369750976562,
"margin_dpo/loss_margin_mean": 21.438369750976562,
"margin_dpo/margin_mean": 21.438369750976562,
"margin_dpo/margin_std": 23.726600646972656,
"step": 124
},
{
"epoch": 0.18355359765051396,
"grad_norm": 45.74631881713867,
"learning_rate": 4.90102044194588e-07,
"logits/chosen": -0.6452882289886475,
"logits/rejected": -0.6194664239883423,
"logps/chosen": -55.687599182128906,
"logps/ref_chosen": -50.136940002441406,
"logps/ref_rejected": -83.98861694335938,
"logps/rejected": -109.53338623046875,
"loss": 0.5347,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2084668129682541,
"margin_dpo/beta_margin_grad_std": 0.1783091127872467,
"margin_dpo/beta_margin_mean": 1.9994112253189087,
"margin_dpo/loss_margin_mean": 19.99411392211914,
"margin_dpo/margin_mean": 19.994110107421875,
"margin_dpo/margin_std": 17.385387420654297,
"step": 125
},
{
"epoch": 0.18502202643171806,
"grad_norm": 55.11186599731445,
"learning_rate": 4.897413506838102e-07,
"logits/chosen": -0.6568164825439453,
"logits/rejected": -0.6220812797546387,
"logps/chosen": -61.971824645996094,
"logps/ref_chosen": -55.66706848144531,
"logps/ref_rejected": -98.1297607421875,
"logps/rejected": -123.57440185546875,
"loss": 0.5552,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21338176727294922,
"margin_dpo/beta_margin_grad_std": 0.18551576137542725,
"margin_dpo/beta_margin_mean": 1.9139891862869263,
"margin_dpo/loss_margin_mean": 19.139890670776367,
"margin_dpo/margin_mean": 19.139890670776367,
"margin_dpo/margin_std": 16.969818115234375,
"step": 126
},
{
"epoch": 0.18649045521292218,
"grad_norm": 46.06990432739258,
"learning_rate": 4.89374339765481e-07,
"logits/chosen": -0.638472318649292,
"logits/rejected": -0.6041021347045898,
"logps/chosen": -61.95406723022461,
"logps/ref_chosen": -56.55467987060547,
"logps/ref_rejected": -76.7957763671875,
"logps/rejected": -98.31398010253906,
"loss": 0.6303,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2392818182706833,
"margin_dpo/beta_margin_grad_std": 0.19149260222911835,
"margin_dpo/beta_margin_mean": 1.6118814945220947,
"margin_dpo/loss_margin_mean": 16.118816375732422,
"margin_dpo/margin_mean": 16.118816375732422,
"margin_dpo/margin_std": 13.991384506225586,
"step": 127
},
{
"epoch": 0.18795888399412627,
"grad_norm": 51.177642822265625,
"learning_rate": 4.890010211106795e-07,
"logits/chosen": -0.663079023361206,
"logits/rejected": -0.616753876209259,
"logps/chosen": -63.87889862060547,
"logps/ref_chosen": -58.12095642089844,
"logps/ref_rejected": -76.43896484375,
"logps/rejected": -99.25593566894531,
"loss": 0.6751,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25181353092193604,
"margin_dpo/beta_margin_grad_std": 0.20399567484855652,
"margin_dpo/beta_margin_mean": 1.7059035301208496,
"margin_dpo/loss_margin_mean": 17.05903434753418,
"margin_dpo/margin_mean": 17.05903434753418,
"margin_dpo/margin_std": 17.72481346130371,
"step": 128
},
{
"epoch": 0.1894273127753304,
"grad_norm": 72.67992401123047,
"learning_rate": 4.88621404556699e-07,
"logits/chosen": -0.6873067617416382,
"logits/rejected": -0.6568499803543091,
"logps/chosen": -75.6619644165039,
"logps/ref_chosen": -66.91636657714844,
"logps/ref_rejected": -96.6422119140625,
"logps/rejected": -122.64834594726562,
"loss": 0.7959,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26328974962234497,
"margin_dpo/beta_margin_grad_std": 0.24150995910167694,
"margin_dpo/beta_margin_mean": 1.7260537147521973,
"margin_dpo/loss_margin_mean": 17.260536193847656,
"margin_dpo/margin_mean": 17.26053810119629,
"margin_dpo/margin_std": 20.107585906982422,
"step": 129
},
{
"epoch": 0.19089574155653452,
"grad_norm": 50.73147964477539,
"learning_rate": 4.882355001067891e-07,
"logits/chosen": -0.6596213579177856,
"logits/rejected": -0.6461096405982971,
"logps/chosen": -51.04236602783203,
"logps/ref_chosen": -44.666847229003906,
"logps/ref_rejected": -82.78165435791016,
"logps/rejected": -112.08168029785156,
"loss": 0.5939,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20150384306907654,
"margin_dpo/beta_margin_grad_std": 0.21593783795833588,
"margin_dpo/beta_margin_mean": 2.2924509048461914,
"margin_dpo/loss_margin_mean": 22.924509048461914,
"margin_dpo/margin_mean": 22.924509048461914,
"margin_dpo/margin_std": 19.672473907470703,
"step": 130
},
{
"epoch": 0.19236417033773862,
"grad_norm": 43.14263916015625,
"learning_rate": 4.878433179298909e-07,
"logits/chosen": -0.6646705269813538,
"logits/rejected": -0.6489601135253906,
"logps/chosen": -49.25099182128906,
"logps/ref_chosen": -44.92458724975586,
"logps/ref_rejected": -88.44401550292969,
"logps/rejected": -113.0731201171875,
"loss": 0.5387,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20338965952396393,
"margin_dpo/beta_margin_grad_std": 0.19119322299957275,
"margin_dpo/beta_margin_mean": 2.0302700996398926,
"margin_dpo/loss_margin_mean": 20.302701950073242,
"margin_dpo/margin_mean": 20.302701950073242,
"margin_dpo/margin_std": 17.324234008789062,
"step": 131
},
{
"epoch": 0.19383259911894274,
"grad_norm": 48.75657272338867,
"learning_rate": 4.874448683603694e-07,
"logits/chosen": -0.6894493699073792,
"logits/rejected": -0.6632376909255981,
"logps/chosen": -65.58708953857422,
"logps/ref_chosen": -59.00108337402344,
"logps/ref_rejected": -87.89215087890625,
"logps/rejected": -113.75344848632812,
"loss": 0.539,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2136635035276413,
"margin_dpo/beta_margin_grad_std": 0.17412179708480835,
"margin_dpo/beta_margin_mean": 1.927529215812683,
"margin_dpo/loss_margin_mean": 19.275291442871094,
"margin_dpo/margin_mean": 19.275293350219727,
"margin_dpo/margin_std": 17.327112197875977,
"step": 132
},
{
"epoch": 0.19530102790014683,
"grad_norm": 57.14876937866211,
"learning_rate": 4.870401618977415e-07,
"logits/chosen": -0.6868765354156494,
"logits/rejected": -0.6663703918457031,
"logps/chosen": -74.35096740722656,
"logps/ref_chosen": -66.60449981689453,
"logps/ref_rejected": -96.33355712890625,
"logps/rejected": -121.88394165039062,
"loss": 0.711,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2557414174079895,
"margin_dpo/beta_margin_grad_std": 0.22053731977939606,
"margin_dpo/beta_margin_mean": 1.780390977859497,
"margin_dpo/loss_margin_mean": 17.803909301757812,
"margin_dpo/margin_mean": 17.803909301757812,
"margin_dpo/margin_std": 19.555706024169922,
"step": 133
},
{
"epoch": 0.19676945668135096,
"grad_norm": 48.30363845825195,
"learning_rate": 4.866292092063986e-07,
"logits/chosen": -0.6847056150436401,
"logits/rejected": -0.6499172449111938,
"logps/chosen": -57.004554748535156,
"logps/ref_chosen": -52.06925582885742,
"logps/ref_rejected": -87.6545181274414,
"logps/rejected": -112.0121841430664,
"loss": 0.4899,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20193609595298767,
"margin_dpo/beta_margin_grad_std": 0.15019506216049194,
"margin_dpo/beta_margin_mean": 1.942237377166748,
"margin_dpo/loss_margin_mean": 19.422372817993164,
"margin_dpo/margin_mean": 19.42237091064453,
"margin_dpo/margin_std": 15.808595657348633,
"step": 134
},
{
"epoch": 0.19823788546255505,
"grad_norm": 56.833290100097656,
"learning_rate": 4.862120211153265e-07,
"logits/chosen": -0.6657185554504395,
"logits/rejected": -0.6646615862846375,
"logps/chosen": -58.18457794189453,
"logps/ref_chosen": -50.353858947753906,
"logps/ref_rejected": -115.97975158691406,
"logps/rejected": -144.62242126464844,
"loss": 0.5819,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21536532044410706,
"margin_dpo/beta_margin_grad_std": 0.20012225210666656,
"margin_dpo/beta_margin_mean": 2.0811963081359863,
"margin_dpo/loss_margin_mean": 20.811962127685547,
"margin_dpo/margin_mean": 20.811962127685547,
"margin_dpo/margin_std": 19.506851196289062,
"step": 135
},
{
"epoch": 0.19970631424375918,
"grad_norm": 60.99176788330078,
"learning_rate": 4.857886086178193e-07,
"logits/chosen": -0.6759936809539795,
"logits/rejected": -0.6444242596626282,
"logps/chosen": -73.07025146484375,
"logps/ref_chosen": -65.072509765625,
"logps/ref_rejected": -96.32122802734375,
"logps/rejected": -120.97657775878906,
"loss": 0.6585,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.252665251493454,
"margin_dpo/beta_margin_grad_std": 0.18508610129356384,
"margin_dpo/beta_margin_mean": 1.6657602787017822,
"margin_dpo/loss_margin_mean": 16.657604217529297,
"margin_dpo/margin_mean": 16.657604217529297,
"margin_dpo/margin_std": 18.530437469482422,
"step": 136
},
{
"epoch": 0.2011747430249633,
"grad_norm": 61.030094146728516,
"learning_rate": 4.853589828711902e-07,
"logits/chosen": -0.6564372181892395,
"logits/rejected": -0.6498109102249146,
"logps/chosen": -58.350120544433594,
"logps/ref_chosen": -48.759117126464844,
"logps/ref_rejected": -113.86377716064453,
"logps/rejected": -146.03843688964844,
"loss": 0.6181,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.218036487698555,
"margin_dpo/beta_margin_grad_std": 0.2112565040588379,
"margin_dpo/beta_margin_mean": 2.258366346359253,
"margin_dpo/loss_margin_mean": 22.583663940429688,
"margin_dpo/margin_mean": 22.583663940429688,
"margin_dpo/margin_std": 22.754310607910156,
"step": 137
},
{
"epoch": 0.2026431718061674,
"grad_norm": 70.91644287109375,
"learning_rate": 4.849231551964771e-07,
"logits/chosen": -0.6515681147575378,
"logits/rejected": -0.6243264675140381,
"logps/chosen": -69.78646850585938,
"logps/ref_chosen": -60.519649505615234,
"logps/ref_rejected": -93.19694519042969,
"logps/rejected": -121.13736724853516,
"loss": 0.6843,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2272961288690567,
"margin_dpo/beta_margin_grad_std": 0.22245639562606812,
"margin_dpo/beta_margin_mean": 1.8673601150512695,
"margin_dpo/loss_margin_mean": 18.673601150512695,
"margin_dpo/margin_mean": 18.673603057861328,
"margin_dpo/margin_std": 18.434284210205078,
"step": 138
},
{
"epoch": 0.20411160058737152,
"grad_norm": 50.293697357177734,
"learning_rate": 4.844811370781446e-07,
"logits/chosen": -0.6459161639213562,
"logits/rejected": -0.6150977611541748,
"logps/chosen": -53.843475341796875,
"logps/ref_chosen": -46.89138412475586,
"logps/ref_rejected": -79.72798156738281,
"logps/rejected": -107.27476501464844,
"loss": 0.548,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21206048130989075,
"margin_dpo/beta_margin_grad_std": 0.19148895144462585,
"margin_dpo/beta_margin_mean": 2.059469699859619,
"margin_dpo/loss_margin_mean": 20.594696044921875,
"margin_dpo/margin_mean": 20.594696044921875,
"margin_dpo/margin_std": 18.52047348022461,
"step": 139
},
{
"epoch": 0.2055800293685756,
"grad_norm": 53.57754898071289,
"learning_rate": 4.840329401637809e-07,
"logits/chosen": -0.6656177639961243,
"logits/rejected": -0.6380197405815125,
"logps/chosen": -66.53479766845703,
"logps/ref_chosen": -58.97471618652344,
"logps/ref_rejected": -83.28411102294922,
"logps/rejected": -110.21284484863281,
"loss": 0.676,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23967662453651428,
"margin_dpo/beta_margin_grad_std": 0.2281067818403244,
"margin_dpo/beta_margin_mean": 1.9368653297424316,
"margin_dpo/loss_margin_mean": 19.368654251098633,
"margin_dpo/margin_mean": 19.36865234375,
"margin_dpo/margin_std": 19.57999038696289,
"step": 140
},
{
"epoch": 0.20704845814977973,
"grad_norm": 61.85297393798828,
"learning_rate": 4.83578576263792e-07,
"logits/chosen": -0.6568824052810669,
"logits/rejected": -0.6319071650505066,
"logps/chosen": -81.36563110351562,
"logps/ref_chosen": -75.0756607055664,
"logps/ref_rejected": -98.1922607421875,
"logps/rejected": -123.78886413574219,
"loss": 0.6184,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21810100972652435,
"margin_dpo/beta_margin_grad_std": 0.2185245007276535,
"margin_dpo/beta_margin_mean": 1.9306633472442627,
"margin_dpo/loss_margin_mean": 19.30663299560547,
"margin_dpo/margin_mean": 19.30663299560547,
"margin_dpo/margin_std": 17.307022094726562,
"step": 141
},
{
"epoch": 0.20851688693098386,
"grad_norm": 72.42768096923828,
"learning_rate": 4.83118057351089e-07,
"logits/chosen": -0.6577416658401489,
"logits/rejected": -0.6425771117210388,
"logps/chosen": -67.6053695678711,
"logps/ref_chosen": -58.027931213378906,
"logps/ref_rejected": -94.58222198486328,
"logps/rejected": -124.15780639648438,
"loss": 0.7595,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24029812216758728,
"margin_dpo/beta_margin_grad_std": 0.23216235637664795,
"margin_dpo/beta_margin_mean": 1.999813437461853,
"margin_dpo/loss_margin_mean": 19.99813461303711,
"margin_dpo/margin_mean": 19.99813461303711,
"margin_dpo/margin_std": 21.703876495361328,
"step": 142
},
{
"epoch": 0.20998531571218795,
"grad_norm": 73.55904388427734,
"learning_rate": 4.826513955607734e-07,
"logits/chosen": -0.6666814088821411,
"logits/rejected": -0.6278376579284668,
"logps/chosen": -66.27539825439453,
"logps/ref_chosen": -57.59645080566406,
"logps/ref_rejected": -78.99957275390625,
"logps/rejected": -103.03237915039062,
"loss": 0.8432,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.275440514087677,
"margin_dpo/beta_margin_grad_std": 0.24454638361930847,
"margin_dpo/beta_margin_mean": 1.535386323928833,
"margin_dpo/loss_margin_mean": 15.353862762451172,
"margin_dpo/margin_mean": 15.353862762451172,
"margin_dpo/margin_std": 18.465240478515625,
"step": 143
},
{
"epoch": 0.21145374449339208,
"grad_norm": 43.79653549194336,
"learning_rate": 4.821786031898176e-07,
"logits/chosen": -0.6671550869941711,
"logits/rejected": -0.6211960315704346,
"logps/chosen": -66.0051498413086,
"logps/ref_chosen": -59.90636444091797,
"logps/ref_rejected": -82.00025939941406,
"logps/rejected": -107.91677856445312,
"loss": 0.532,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20202696323394775,
"margin_dpo/beta_margin_grad_std": 0.19462409615516663,
"margin_dpo/beta_margin_mean": 1.9817728996276855,
"margin_dpo/loss_margin_mean": 19.817729949951172,
"margin_dpo/margin_mean": 19.817729949951172,
"margin_dpo/margin_std": 16.256122589111328,
"step": 144
},
{
"epoch": 0.21292217327459617,
"grad_norm": 47.29103469848633,
"learning_rate": 4.816996926967401e-07,
"logits/chosen": -0.6466660499572754,
"logits/rejected": -0.6030235290527344,
"logps/chosen": -64.20927429199219,
"logps/ref_chosen": -56.60066604614258,
"logps/ref_rejected": -77.86631774902344,
"logps/rejected": -105.54521179199219,
"loss": 0.5566,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2146737277507782,
"margin_dpo/beta_margin_grad_std": 0.18967363238334656,
"margin_dpo/beta_margin_mean": 2.0070290565490723,
"margin_dpo/loss_margin_mean": 20.07029151916504,
"margin_dpo/margin_mean": 20.070289611816406,
"margin_dpo/margin_std": 18.223201751708984,
"step": 145
},
{
"epoch": 0.2143906020558003,
"grad_norm": 70.13373565673828,
"learning_rate": 4.812146767012779e-07,
"logits/chosen": -0.680939793586731,
"logits/rejected": -0.6254955530166626,
"logps/chosen": -76.54412078857422,
"logps/ref_chosen": -66.00045776367188,
"logps/ref_rejected": -81.70278930664062,
"logps/rejected": -108.8254623413086,
"loss": 0.7131,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2562163472175598,
"margin_dpo/beta_margin_grad_std": 0.22194227576255798,
"margin_dpo/beta_margin_mean": 1.657900333404541,
"margin_dpo/loss_margin_mean": 16.579002380371094,
"margin_dpo/margin_mean": 16.579002380371094,
"margin_dpo/margin_std": 17.49138641357422,
"step": 146
},
{
"epoch": 0.21585903083700442,
"grad_norm": 57.6500244140625,
"learning_rate": 4.807235679840536e-07,
"logits/chosen": -0.6382741928100586,
"logits/rejected": -0.5938813090324402,
"logps/chosen": -61.75067138671875,
"logps/ref_chosen": -53.405487060546875,
"logps/ref_rejected": -71.39061737060547,
"logps/rejected": -100.0536880493164,
"loss": 0.5682,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21429036557674408,
"margin_dpo/beta_margin_grad_std": 0.19528846442699432,
"margin_dpo/beta_margin_mean": 2.0317890644073486,
"margin_dpo/loss_margin_mean": 20.317890167236328,
"margin_dpo/margin_mean": 20.317890167236328,
"margin_dpo/margin_std": 19.069602966308594,
"step": 147
},
{
"epoch": 0.2173274596182085,
"grad_norm": 50.89360809326172,
"learning_rate": 4.802263794862384e-07,
"logits/chosen": -0.6731536388397217,
"logits/rejected": -0.6413577795028687,
"logps/chosen": -71.63116455078125,
"logps/ref_chosen": -64.93708038330078,
"logps/ref_rejected": -103.09384155273438,
"logps/rejected": -125.92637634277344,
"loss": 0.6758,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.25298935174942017,
"margin_dpo/beta_margin_grad_std": 0.19248813390731812,
"margin_dpo/beta_margin_mean": 1.6138441562652588,
"margin_dpo/loss_margin_mean": 16.13844108581543,
"margin_dpo/margin_mean": 16.13844108581543,
"margin_dpo/margin_std": 15.57655143737793,
"step": 148
},
{
"epoch": 0.21879588839941264,
"grad_norm": 43.59981155395508,
"learning_rate": 4.797231243092118e-07,
"logits/chosen": -0.7026668787002563,
"logits/rejected": -0.6738122701644897,
"logps/chosen": -65.29115295410156,
"logps/ref_chosen": -58.47376251220703,
"logps/ref_rejected": -99.31474304199219,
"logps/rejected": -126.91746520996094,
"loss": 0.5037,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19538308680057526,
"margin_dpo/beta_margin_grad_std": 0.18354260921478271,
"margin_dpo/beta_margin_mean": 2.078533887863159,
"margin_dpo/loss_margin_mean": 20.78533935546875,
"margin_dpo/margin_mean": 20.78533935546875,
"margin_dpo/margin_std": 16.75721549987793,
"step": 149
},
{
"epoch": 0.22026431718061673,
"grad_norm": 55.560611724853516,
"learning_rate": 4.792138157142157e-07,
"logits/chosen": -0.6683057546615601,
"logits/rejected": -0.6466302871704102,
"logps/chosen": -52.58869934082031,
"logps/ref_chosen": -45.705810546875,
"logps/ref_rejected": -83.34759521484375,
"logps/rejected": -109.74415588378906,
"loss": 0.6181,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2259724736213684,
"margin_dpo/beta_margin_grad_std": 0.19235166907310486,
"margin_dpo/beta_margin_mean": 1.951366662979126,
"margin_dpo/loss_margin_mean": 19.5136661529541,
"margin_dpo/margin_mean": 19.5136661529541,
"margin_dpo/margin_std": 19.547821044921875,
"step": 150
},
{
"epoch": 0.22173274596182085,
"grad_norm": 53.323848724365234,
"learning_rate": 4.786984671220053e-07,
"logits/chosen": -0.6942344903945923,
"logits/rejected": -0.6533582210540771,
"logps/chosen": -78.0422592163086,
"logps/ref_chosen": -70.57083129882812,
"logps/ref_rejected": -100.46382141113281,
"logps/rejected": -129.36167907714844,
"loss": 0.5187,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19543182849884033,
"margin_dpo/beta_margin_grad_std": 0.19356155395507812,
"margin_dpo/beta_margin_mean": 2.142643690109253,
"margin_dpo/loss_margin_mean": 21.426437377929688,
"margin_dpo/margin_mean": 21.426435470581055,
"margin_dpo/margin_std": 18.174488067626953,
"step": 151
},
{
"epoch": 0.22320117474302498,
"grad_norm": 58.04679870605469,
"learning_rate": 4.78177092112495e-07,
"logits/chosen": -0.6981043815612793,
"logits/rejected": -0.6723449230194092,
"logps/chosen": -65.85396575927734,
"logps/ref_chosen": -60.164390563964844,
"logps/ref_rejected": -106.14045715332031,
"logps/rejected": -133.99301147460938,
"loss": 0.5073,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18981137871742249,
"margin_dpo/beta_margin_grad_std": 0.18638314306735992,
"margin_dpo/beta_margin_mean": 2.2162981033325195,
"margin_dpo/loss_margin_mean": 22.162979125976562,
"margin_dpo/margin_mean": 22.162979125976562,
"margin_dpo/margin_std": 18.572938919067383,
"step": 152
},
{
"epoch": 0.22466960352422907,
"grad_norm": 45.65426254272461,
"learning_rate": 4.776497044244016e-07,
"logits/chosen": -0.6792968511581421,
"logits/rejected": -0.6601795554161072,
"logps/chosen": -62.912200927734375,
"logps/ref_chosen": -56.315277099609375,
"logps/ref_rejected": -85.65583801269531,
"logps/rejected": -111.13700866699219,
"loss": 0.6482,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23670101165771484,
"margin_dpo/beta_margin_grad_std": 0.2141384482383728,
"margin_dpo/beta_margin_mean": 1.888425350189209,
"margin_dpo/loss_margin_mean": 18.884254455566406,
"margin_dpo/margin_mean": 18.884254455566406,
"margin_dpo/margin_std": 19.149822235107422,
"step": 153
},
{
"epoch": 0.2261380323054332,
"grad_norm": 70.82756805419922,
"learning_rate": 4.771163179548808e-07,
"logits/chosen": -0.6859003305435181,
"logits/rejected": -0.6607710123062134,
"logps/chosen": -71.26567077636719,
"logps/ref_chosen": -62.74256896972656,
"logps/ref_rejected": -104.24420166015625,
"logps/rejected": -131.44509887695312,
"loss": 0.7219,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2435159683227539,
"margin_dpo/beta_margin_grad_std": 0.2322094589471817,
"margin_dpo/beta_margin_mean": 1.8677783012390137,
"margin_dpo/loss_margin_mean": 18.677783966064453,
"margin_dpo/margin_mean": 18.677783966064453,
"margin_dpo/margin_std": 18.939533233642578,
"step": 154
},
{
"epoch": 0.2276064610866373,
"grad_norm": 54.700984954833984,
"learning_rate": 4.7657694675916247e-07,
"logits/chosen": -0.677458643913269,
"logits/rejected": -0.650254487991333,
"logps/chosen": -66.76405334472656,
"logps/ref_chosen": -60.65318298339844,
"logps/ref_rejected": -77.49220275878906,
"logps/rejected": -103.33089447021484,
"loss": 0.5788,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20778781175613403,
"margin_dpo/beta_margin_grad_std": 0.1974395513534546,
"margin_dpo/beta_margin_mean": 1.972782015800476,
"margin_dpo/loss_margin_mean": 19.727819442749023,
"margin_dpo/margin_mean": 19.72781753540039,
"margin_dpo/margin_std": 17.581571578979492,
"step": 155
},
{
"epoch": 0.2290748898678414,
"grad_norm": 88.04608154296875,
"learning_rate": 4.7603160505017893e-07,
"logits/chosen": -0.653121829032898,
"logits/rejected": -0.622460126876831,
"logps/chosen": -79.57420349121094,
"logps/ref_chosen": -69.49188232421875,
"logps/ref_rejected": -77.1692886352539,
"logps/rejected": -102.35071563720703,
"loss": 0.9578,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2811535894870758,
"margin_dpo/beta_margin_grad_std": 0.2631846070289612,
"margin_dpo/beta_margin_mean": 1.5099093914031982,
"margin_dpo/loss_margin_mean": 15.09909439086914,
"margin_dpo/margin_mean": 15.09909439086914,
"margin_dpo/margin_std": 20.09097671508789,
"step": 156
},
{
"epoch": 0.2305433186490455,
"grad_norm": 60.531410217285156,
"learning_rate": 4.7548030719819154e-07,
"logits/chosen": -0.7271685600280762,
"logits/rejected": -0.6973283290863037,
"logps/chosen": -71.76556396484375,
"logps/ref_chosen": -61.368438720703125,
"logps/ref_rejected": -107.64636993408203,
"logps/rejected": -139.60733032226562,
"loss": 0.5432,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20604455471038818,
"margin_dpo/beta_margin_grad_std": 0.1988464891910553,
"margin_dpo/beta_margin_mean": 2.1563832759857178,
"margin_dpo/loss_margin_mean": 21.563831329345703,
"margin_dpo/margin_mean": 21.563831329345703,
"margin_dpo/margin_std": 19.046764373779297,
"step": 157
},
{
"epoch": 0.23201174743024963,
"grad_norm": 46.93947219848633,
"learning_rate": 4.7492306773041136e-07,
"logits/chosen": -0.6574522256851196,
"logits/rejected": -0.6339297294616699,
"logps/chosen": -65.11067962646484,
"logps/ref_chosen": -57.61292266845703,
"logps/ref_rejected": -113.6946792602539,
"logps/rejected": -143.18002319335938,
"loss": 0.5655,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21865713596343994,
"margin_dpo/beta_margin_grad_std": 0.1967364400625229,
"margin_dpo/beta_margin_mean": 2.198759078979492,
"margin_dpo/loss_margin_mean": 21.987590789794922,
"margin_dpo/margin_mean": 21.987590789794922,
"margin_dpo/margin_std": 21.580215454101562,
"step": 158
},
{
"epoch": 0.23348017621145375,
"grad_norm": 52.955257415771484,
"learning_rate": 4.743599013306165e-07,
"logits/chosen": -0.6695908308029175,
"logits/rejected": -0.628494143486023,
"logps/chosen": -89.92948150634766,
"logps/ref_chosen": -81.56034088134766,
"logps/ref_rejected": -88.8987045288086,
"logps/rejected": -116.74146270751953,
"loss": 0.6625,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23920385539531708,
"margin_dpo/beta_margin_grad_std": 0.21742284297943115,
"margin_dpo/beta_margin_mean": 1.947361707687378,
"margin_dpo/loss_margin_mean": 19.473617553710938,
"margin_dpo/margin_mean": 19.473617553710938,
"margin_dpo/margin_std": 20.372608184814453,
"step": 159
},
{
"epoch": 0.23494860499265785,
"grad_norm": 58.06504440307617,
"learning_rate": 4.737908228387656e-07,
"logits/chosen": -0.6885409355163574,
"logits/rejected": -0.6468052864074707,
"logps/chosen": -74.30818939208984,
"logps/ref_chosen": -65.73088073730469,
"logps/ref_rejected": -97.21781921386719,
"logps/rejected": -125.66719818115234,
"loss": 0.6855,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24562396109104156,
"margin_dpo/beta_margin_grad_std": 0.22771283984184265,
"margin_dpo/beta_margin_mean": 1.9872074127197266,
"margin_dpo/loss_margin_mean": 19.872074127197266,
"margin_dpo/margin_mean": 19.872072219848633,
"margin_dpo/margin_std": 21.958354949951172,
"step": 160
},
{
"epoch": 0.23641703377386197,
"grad_norm": 53.40824508666992,
"learning_rate": 4.7321584725060594e-07,
"logits/chosen": -0.6996691226959229,
"logits/rejected": -0.6711582541465759,
"logps/chosen": -60.577247619628906,
"logps/ref_chosen": -52.43647766113281,
"logps/ref_rejected": -83.43095397949219,
"logps/rejected": -111.64784240722656,
"loss": 0.6506,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23106649518013,
"margin_dpo/beta_margin_grad_std": 0.22650553286075592,
"margin_dpo/beta_margin_mean": 2.0076117515563965,
"margin_dpo/loss_margin_mean": 20.07611656188965,
"margin_dpo/margin_mean": 20.07611846923828,
"margin_dpo/margin_std": 19.994857788085938,
"step": 161
},
{
"epoch": 0.23788546255506607,
"grad_norm": 46.01493835449219,
"learning_rate": 4.7263498971727905e-07,
"logits/chosen": -0.6546447277069092,
"logits/rejected": -0.6173849105834961,
"logps/chosen": -70.60856628417969,
"logps/ref_chosen": -62.61058807373047,
"logps/ref_rejected": -89.39057922363281,
"logps/rejected": -116.71580505371094,
"loss": 0.6419,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2332153171300888,
"margin_dpo/beta_margin_grad_std": 0.20986613631248474,
"margin_dpo/beta_margin_mean": 1.932724952697754,
"margin_dpo/loss_margin_mean": 19.32724952697754,
"margin_dpo/margin_mean": 19.32724952697754,
"margin_dpo/margin_std": 19.705549240112305,
"step": 162
},
{
"epoch": 0.2393538913362702,
"grad_norm": 42.86497116088867,
"learning_rate": 4.720482655449212e-07,
"logits/chosen": -0.65444415807724,
"logits/rejected": -0.6160274744033813,
"logps/chosen": -62.261444091796875,
"logps/ref_chosen": -55.021629333496094,
"logps/ref_rejected": -75.41822052001953,
"logps/rejected": -101.32667541503906,
"loss": 0.5951,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21980655193328857,
"margin_dpo/beta_margin_grad_std": 0.20289334654808044,
"margin_dpo/beta_margin_mean": 1.8668642044067383,
"margin_dpo/loss_margin_mean": 18.668642044067383,
"margin_dpo/margin_mean": 18.668642044067383,
"margin_dpo/margin_std": 16.874954223632812,
"step": 163
},
{
"epoch": 0.24082232011747431,
"grad_norm": 37.281150817871094,
"learning_rate": 4.714556901942599e-07,
"logits/chosen": -0.6877849102020264,
"logits/rejected": -0.6435602903366089,
"logps/chosen": -61.4527702331543,
"logps/ref_chosen": -55.64066696166992,
"logps/ref_rejected": -79.66463470458984,
"logps/rejected": -106.46293640136719,
"loss": 0.464,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18166950345039368,
"margin_dpo/beta_margin_grad_std": 0.16736756265163422,
"margin_dpo/beta_margin_mean": 2.0986199378967285,
"margin_dpo/loss_margin_mean": 20.98619842529297,
"margin_dpo/margin_mean": 20.98619842529297,
"margin_dpo/margin_std": 15.089117050170898,
"step": 164
},
{
"epoch": 0.2422907488986784,
"grad_norm": 65.39411163330078,
"learning_rate": 4.708572792802069e-07,
"logits/chosen": -0.7008275985717773,
"logits/rejected": -0.6510541439056396,
"logps/chosen": -69.8692398071289,
"logps/ref_chosen": -61.310691833496094,
"logps/ref_rejected": -73.67060852050781,
"logps/rejected": -96.39982604980469,
"loss": 0.7393,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26950976252555847,
"margin_dpo/beta_margin_grad_std": 0.200277179479599,
"margin_dpo/beta_margin_mean": 1.4170664548873901,
"margin_dpo/loss_margin_mean": 14.170663833618164,
"margin_dpo/margin_mean": 14.17066478729248,
"margin_dpo/margin_std": 14.783781051635742,
"step": 165
},
{
"epoch": 0.24375917767988253,
"grad_norm": 48.58378982543945,
"learning_rate": 4.702530485714461e-07,
"logits/chosen": -0.6584955453872681,
"logits/rejected": -0.650241494178772,
"logps/chosen": -59.29880905151367,
"logps/ref_chosen": -50.98360061645508,
"logps/ref_rejected": -98.09512329101562,
"logps/rejected": -129.55075073242188,
"loss": 0.5363,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2068173885345459,
"margin_dpo/beta_margin_grad_std": 0.18966558575630188,
"margin_dpo/beta_margin_mean": 2.3140416145324707,
"margin_dpo/loss_margin_mean": 23.14041519165039,
"margin_dpo/margin_mean": 23.14041519165039,
"margin_dpo/margin_std": 23.364418029785156,
"step": 166
},
{
"epoch": 0.24522760646108663,
"grad_norm": 52.58329391479492,
"learning_rate": 4.6964301399001877e-07,
"logits/chosen": -0.6375908851623535,
"logits/rejected": -0.6227909922599792,
"logps/chosen": -58.83246612548828,
"logps/ref_chosen": -50.42409133911133,
"logps/ref_rejected": -96.03042602539062,
"logps/rejected": -128.48484802246094,
"loss": 0.4969,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17307597398757935,
"margin_dpo/beta_margin_grad_std": 0.1926582306623459,
"margin_dpo/beta_margin_mean": 2.404604911804199,
"margin_dpo/loss_margin_mean": 24.046049118041992,
"margin_dpo/margin_mean": 24.04604721069336,
"margin_dpo/margin_std": 18.615556716918945,
"step": 167
},
{
"epoch": 0.24669603524229075,
"grad_norm": 46.499046325683594,
"learning_rate": 4.690271916109034e-07,
"logits/chosen": -0.7022169232368469,
"logits/rejected": -0.6692053079605103,
"logps/chosen": -57.07928466796875,
"logps/ref_chosen": -49.46282196044922,
"logps/ref_rejected": -75.30854797363281,
"logps/rejected": -101.93223571777344,
"loss": 0.5398,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20751315355300903,
"margin_dpo/beta_margin_grad_std": 0.18526721000671387,
"margin_dpo/beta_margin_mean": 1.9007223844528198,
"margin_dpo/loss_margin_mean": 19.00722312927246,
"margin_dpo/margin_mean": 19.007225036621094,
"margin_dpo/margin_std": 15.336655616760254,
"step": 168
},
{
"epoch": 0.24816446402349487,
"grad_norm": 55.35097122192383,
"learning_rate": 4.6840559766159235e-07,
"logits/chosen": -0.6735790967941284,
"logits/rejected": -0.644471287727356,
"logps/chosen": -67.24217224121094,
"logps/ref_chosen": -59.803443908691406,
"logps/ref_rejected": -83.34574890136719,
"logps/rejected": -108.11927795410156,
"loss": 0.7689,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2712000906467438,
"margin_dpo/beta_margin_grad_std": 0.2246740311384201,
"margin_dpo/beta_margin_mean": 1.73348069190979,
"margin_dpo/loss_margin_mean": 17.334806442260742,
"margin_dpo/margin_mean": 17.334806442260742,
"margin_dpo/margin_std": 21.838268280029297,
"step": 169
},
{
"epoch": 0.24963289280469897,
"grad_norm": 44.05381774902344,
"learning_rate": 4.6777824852166437e-07,
"logits/chosen": -0.6359131336212158,
"logits/rejected": -0.6122620105743408,
"logps/chosen": -55.612205505371094,
"logps/ref_chosen": -49.471771240234375,
"logps/ref_rejected": -75.91734313964844,
"logps/rejected": -103.55111694335938,
"loss": 0.5703,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2044505774974823,
"margin_dpo/beta_margin_grad_std": 0.19848260283470154,
"margin_dpo/beta_margin_mean": 2.149333953857422,
"margin_dpo/loss_margin_mean": 21.49333953857422,
"margin_dpo/margin_mean": 21.49333953857422,
"margin_dpo/margin_std": 18.57598876953125,
"step": 170
},
{
"epoch": 0.2511013215859031,
"grad_norm": 62.48033142089844,
"learning_rate": 4.6714516072235273e-07,
"logits/chosen": -0.6769453883171082,
"logits/rejected": -0.6254656314849854,
"logps/chosen": -92.27412414550781,
"logps/ref_chosen": -84.49931335449219,
"logps/ref_rejected": -109.38209533691406,
"logps/rejected": -135.86390686035156,
"loss": 0.6447,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23198726773262024,
"margin_dpo/beta_margin_grad_std": 0.21067272126674652,
"margin_dpo/beta_margin_mean": 1.8707005977630615,
"margin_dpo/loss_margin_mean": 18.70700454711914,
"margin_dpo/margin_mean": 18.70700454711914,
"margin_dpo/margin_std": 19.6763916015625,
"step": 171
},
{
"epoch": 0.2525697503671072,
"grad_norm": 65.7638168334961,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": -0.7173855304718018,
"logits/rejected": -0.6745563745498657,
"logps/chosen": -78.89846801757812,
"logps/ref_chosen": -68.65391540527344,
"logps/ref_rejected": -85.43667602539062,
"logps/rejected": -113.78555297851562,
"loss": 0.7006,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23992997407913208,
"margin_dpo/beta_margin_grad_std": 0.2309492528438568,
"margin_dpo/beta_margin_mean": 1.8104331493377686,
"margin_dpo/loss_margin_mean": 18.104331970214844,
"margin_dpo/margin_mean": 18.104331970214844,
"margin_dpo/margin_std": 18.842220306396484,
"step": 172
},
{
"epoch": 0.2540381791483113,
"grad_norm": 49.16233444213867,
"learning_rate": 4.6586183602616687e-07,
"logits/chosen": -0.7287572026252747,
"logits/rejected": -0.673369288444519,
"logps/chosen": -70.7244873046875,
"logps/ref_chosen": -63.050872802734375,
"logps/ref_rejected": -78.68392944335938,
"logps/rejected": -104.60391998291016,
"loss": 0.6261,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22973671555519104,
"margin_dpo/beta_margin_grad_std": 0.2065761685371399,
"margin_dpo/beta_margin_mean": 1.824638843536377,
"margin_dpo/loss_margin_mean": 18.246387481689453,
"margin_dpo/margin_mean": 18.246387481689453,
"margin_dpo/margin_std": 17.423324584960938,
"step": 173
},
{
"epoch": 0.2555066079295154,
"grad_norm": 53.320858001708984,
"learning_rate": 4.652116329460919e-07,
"logits/chosen": -0.6692589521408081,
"logits/rejected": -0.669571042060852,
"logps/chosen": -61.95579528808594,
"logps/ref_chosen": -53.36296844482422,
"logps/ref_rejected": -101.91120910644531,
"logps/rejected": -128.92747497558594,
"loss": 0.6537,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2317759096622467,
"margin_dpo/beta_margin_grad_std": 0.21807681024074554,
"margin_dpo/beta_margin_mean": 1.842343807220459,
"margin_dpo/loss_margin_mean": 18.423437118530273,
"margin_dpo/margin_mean": 18.423437118530273,
"margin_dpo/margin_std": 17.788700103759766,
"step": 174
},
{
"epoch": 0.25697503671071953,
"grad_norm": 47.415931701660156,
"learning_rate": 4.645557588393406e-07,
"logits/chosen": -0.6318604946136475,
"logits/rejected": -0.6075109243392944,
"logps/chosen": -52.48356628417969,
"logps/ref_chosen": -45.417762756347656,
"logps/ref_rejected": -89.50579833984375,
"logps/rejected": -120.28120422363281,
"loss": 0.4447,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1811106652021408,
"margin_dpo/beta_margin_grad_std": 0.15868915617465973,
"margin_dpo/beta_margin_mean": 2.370960235595703,
"margin_dpo/loss_margin_mean": 23.7096004486084,
"margin_dpo/margin_mean": 23.7096004486084,
"margin_dpo/margin_std": 19.452625274658203,
"step": 175
},
{
"epoch": 0.25844346549192365,
"grad_norm": 43.29652786254883,
"learning_rate": 4.638942309888058e-07,
"logits/chosen": -0.7066097259521484,
"logits/rejected": -0.6949463486671448,
"logps/chosen": -57.918304443359375,
"logps/ref_chosen": -50.45283889770508,
"logps/ref_rejected": -95.55896759033203,
"logps/rejected": -124.98609924316406,
"loss": 0.4807,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18598869442939758,
"margin_dpo/beta_margin_grad_std": 0.1866185963153839,
"margin_dpo/beta_margin_mean": 2.196166515350342,
"margin_dpo/loss_margin_mean": 21.96166229248047,
"margin_dpo/margin_mean": 21.9616641998291,
"margin_dpo/margin_std": 17.302711486816406,
"step": 176
},
{
"epoch": 0.2599118942731278,
"grad_norm": 41.06880187988281,
"learning_rate": 4.6322706682636137e-07,
"logits/chosen": -0.6657878160476685,
"logits/rejected": -0.6222826242446899,
"logps/chosen": -70.47785186767578,
"logps/ref_chosen": -61.21646499633789,
"logps/ref_rejected": -95.89378356933594,
"logps/rejected": -127.02268981933594,
"loss": 0.5421,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2077367603778839,
"margin_dpo/beta_margin_grad_std": 0.19527243077754974,
"margin_dpo/beta_margin_mean": 2.1867523193359375,
"margin_dpo/loss_margin_mean": 21.867523193359375,
"margin_dpo/margin_mean": 21.867523193359375,
"margin_dpo/margin_std": 20.43021011352539,
"step": 177
},
{
"epoch": 0.26138032305433184,
"grad_norm": 56.99783706665039,
"learning_rate": 4.6255428393240354e-07,
"logits/chosen": -0.6359624862670898,
"logits/rejected": -0.6415029764175415,
"logps/chosen": -70.54104614257812,
"logps/ref_chosen": -58.26478958129883,
"logps/ref_rejected": -105.36532592773438,
"logps/rejected": -141.9470672607422,
"loss": 0.5463,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1983056217432022,
"margin_dpo/beta_margin_grad_std": 0.21389643847942352,
"margin_dpo/beta_margin_mean": 2.4305472373962402,
"margin_dpo/loss_margin_mean": 24.30547332763672,
"margin_dpo/margin_mean": 24.30547332763672,
"margin_dpo/margin_std": 22.445331573486328,
"step": 178
},
{
"epoch": 0.26284875183553597,
"grad_norm": 71.98530578613281,
"learning_rate": 4.6187590003538724e-07,
"logits/chosen": -0.6545614004135132,
"logits/rejected": -0.629997730255127,
"logps/chosen": -72.10971069335938,
"logps/ref_chosen": -61.05832290649414,
"logps/ref_rejected": -90.52782440185547,
"logps/rejected": -126.18659973144531,
"loss": 0.672,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21123456954956055,
"margin_dpo/beta_margin_grad_std": 0.24784591794013977,
"margin_dpo/beta_margin_mean": 2.460737705230713,
"margin_dpo/loss_margin_mean": 24.607376098632812,
"margin_dpo/margin_mean": 24.607376098632812,
"margin_dpo/margin_std": 23.464365005493164,
"step": 179
},
{
"epoch": 0.2643171806167401,
"grad_norm": 45.19013214111328,
"learning_rate": 4.611919330113591e-07,
"logits/chosen": -0.646220862865448,
"logits/rejected": -0.6155471801757812,
"logps/chosen": -62.99604034423828,
"logps/ref_chosen": -54.34272003173828,
"logps/ref_rejected": -98.21183776855469,
"logps/rejected": -131.40283203125,
"loss": 0.4572,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17428556084632874,
"margin_dpo/beta_margin_grad_std": 0.19251887500286102,
"margin_dpo/beta_margin_mean": 2.453767776489258,
"margin_dpo/loss_margin_mean": 24.53767967224121,
"margin_dpo/margin_mean": 24.53767967224121,
"margin_dpo/margin_std": 19.19011688232422,
"step": 180
},
{
"epoch": 0.2657856093979442,
"grad_norm": 53.57444381713867,
"learning_rate": 4.605024008834863e-07,
"logits/chosen": -0.684493899345398,
"logits/rejected": -0.6450868844985962,
"logps/chosen": -63.24270248413086,
"logps/ref_chosen": -55.000457763671875,
"logps/ref_rejected": -61.656166076660156,
"logps/rejected": -87.48631286621094,
"loss": 0.6668,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24293041229248047,
"margin_dpo/beta_margin_grad_std": 0.21182817220687866,
"margin_dpo/beta_margin_mean": 1.7587906122207642,
"margin_dpo/loss_margin_mean": 17.587905883789062,
"margin_dpo/margin_mean": 17.587905883789062,
"margin_dpo/margin_std": 18.199848175048828,
"step": 181
},
{
"epoch": 0.26725403817914833,
"grad_norm": 58.0034065246582,
"learning_rate": 4.598073218215817e-07,
"logits/chosen": -0.6252259016036987,
"logits/rejected": -0.6054234504699707,
"logps/chosen": -50.210166931152344,
"logps/ref_chosen": -41.10784912109375,
"logps/ref_rejected": -89.5215835571289,
"logps/rejected": -125.56619262695312,
"loss": 0.5722,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18403324484825134,
"margin_dpo/beta_margin_grad_std": 0.22924652695655823,
"margin_dpo/beta_margin_mean": 2.6942296028137207,
"margin_dpo/loss_margin_mean": 26.94229507446289,
"margin_dpo/margin_mean": 26.942296981811523,
"margin_dpo/margin_std": 23.49362564086914,
"step": 182
},
{
"epoch": 0.2687224669603524,
"grad_norm": 90.1989974975586,
"learning_rate": 4.5910671414162484e-07,
"logits/chosen": -0.664011538028717,
"logits/rejected": -0.612758994102478,
"logps/chosen": -69.1800765991211,
"logps/ref_chosen": -57.524559020996094,
"logps/ref_rejected": -75.97572326660156,
"logps/rejected": -108.55187225341797,
"loss": 0.5191,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2063201367855072,
"margin_dpo/beta_margin_grad_std": 0.16726571321487427,
"margin_dpo/beta_margin_mean": 2.0920636653900146,
"margin_dpo/loss_margin_mean": 20.920635223388672,
"margin_dpo/margin_mean": 20.920637130737305,
"margin_dpo/margin_std": 16.628860473632812,
"step": 183
},
{
"epoch": 0.2701908957415565,
"grad_norm": 63.455596923828125,
"learning_rate": 4.5840059630527985e-07,
"logits/chosen": -0.6686521768569946,
"logits/rejected": -0.658584713935852,
"logps/chosen": -67.59618377685547,
"logps/ref_chosen": -58.544952392578125,
"logps/ref_rejected": -76.63406372070312,
"logps/rejected": -101.26787567138672,
"loss": 0.7089,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2659500241279602,
"margin_dpo/beta_margin_grad_std": 0.19632551074028015,
"margin_dpo/beta_margin_mean": 1.5582584142684937,
"margin_dpo/loss_margin_mean": 15.582584381103516,
"margin_dpo/margin_mean": 15.582584381103516,
"margin_dpo/margin_std": 17.532541275024414,
"step": 184
},
{
"epoch": 0.27165932452276065,
"grad_norm": 59.450496673583984,
"learning_rate": 4.5768898691940836e-07,
"logits/chosen": -0.6913318634033203,
"logits/rejected": -0.6381030678749084,
"logps/chosen": -71.63302612304688,
"logps/ref_chosen": -62.02584457397461,
"logps/ref_rejected": -73.76260375976562,
"logps/rejected": -98.8121337890625,
"loss": 0.8254,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2810859680175781,
"margin_dpo/beta_margin_grad_std": 0.23385247588157654,
"margin_dpo/beta_margin_mean": 1.5442359447479248,
"margin_dpo/loss_margin_mean": 15.442358016967773,
"margin_dpo/margin_mean": 15.442358016967773,
"margin_dpo/margin_std": 18.860673904418945,
"step": 185
},
{
"epoch": 0.27312775330396477,
"grad_norm": 42.19928741455078,
"learning_rate": 4.5697190473557947e-07,
"logits/chosen": -0.6911958456039429,
"logits/rejected": -0.635596752166748,
"logps/chosen": -79.12220764160156,
"logps/ref_chosen": -69.35346984863281,
"logps/ref_rejected": -88.07244873046875,
"logps/rejected": -123.28886413574219,
"loss": 0.3942,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15830203890800476,
"margin_dpo/beta_margin_grad_std": 0.16447928547859192,
"margin_dpo/beta_margin_mean": 2.544766902923584,
"margin_dpo/loss_margin_mean": 25.447669982910156,
"margin_dpo/margin_mean": 25.447669982910156,
"margin_dpo/margin_std": 18.903629302978516,
"step": 186
},
{
"epoch": 0.2745961820851689,
"grad_norm": 66.2356185913086,
"learning_rate": 4.5624936864957555e-07,
"logits/chosen": -0.6642763614654541,
"logits/rejected": -0.6538236141204834,
"logps/chosen": -64.20773315429688,
"logps/ref_chosen": -52.75646209716797,
"logps/ref_rejected": -81.96910095214844,
"logps/rejected": -112.33505249023438,
"loss": 0.7275,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.24081790447235107,
"margin_dpo/beta_margin_grad_std": 0.22335243225097656,
"margin_dpo/beta_margin_mean": 1.8914674520492554,
"margin_dpo/loss_margin_mean": 18.914674758911133,
"margin_dpo/margin_mean": 18.914674758911133,
"margin_dpo/margin_std": 20.249263763427734,
"step": 187
},
{
"epoch": 0.27606461086637296,
"grad_norm": 57.380775451660156,
"learning_rate": 4.5552139770089454e-07,
"logits/chosen": -0.6732475757598877,
"logits/rejected": -0.655211329460144,
"logps/chosen": -58.014007568359375,
"logps/ref_chosen": -49.415489196777344,
"logps/ref_rejected": -89.54043579101562,
"logps/rejected": -119.85971069335938,
"loss": 0.5978,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20978838205337524,
"margin_dpo/beta_margin_grad_std": 0.21329578757286072,
"margin_dpo/beta_margin_mean": 2.1720757484436035,
"margin_dpo/loss_margin_mean": 21.72075653076172,
"margin_dpo/margin_mean": 21.72075653076172,
"margin_dpo/margin_std": 20.222156524658203,
"step": 188
},
{
"epoch": 0.2775330396475771,
"grad_norm": 59.92390441894531,
"learning_rate": 4.5478801107224794e-07,
"logits/chosen": -0.6710000038146973,
"logits/rejected": -0.6149500012397766,
"logps/chosen": -60.63805389404297,
"logps/ref_chosen": -52.39896011352539,
"logps/ref_rejected": -72.16735076904297,
"logps/rejected": -102.7850341796875,
"loss": 0.5266,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19314193725585938,
"margin_dpo/beta_margin_grad_std": 0.18688946962356567,
"margin_dpo/beta_margin_mean": 2.2378592491149902,
"margin_dpo/loss_margin_mean": 22.378591537475586,
"margin_dpo/margin_mean": 22.378589630126953,
"margin_dpo/margin_std": 19.83676528930664,
"step": 189
},
{
"epoch": 0.2790014684287812,
"grad_norm": 49.65851974487305,
"learning_rate": 4.5404922808905543e-07,
"logits/chosen": -0.6853049993515015,
"logits/rejected": -0.6321940422058105,
"logps/chosen": -73.16799926757812,
"logps/ref_chosen": -64.68305969238281,
"logps/ref_rejected": -102.55052185058594,
"logps/rejected": -134.60731506347656,
"loss": 0.5028,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19292610883712769,
"margin_dpo/beta_margin_grad_std": 0.1859111189842224,
"margin_dpo/beta_margin_mean": 2.357185125350952,
"margin_dpo/loss_margin_mean": 23.57185173034668,
"margin_dpo/margin_mean": 23.571849822998047,
"margin_dpo/margin_std": 20.523746490478516,
"step": 190
},
{
"epoch": 0.28046989720998533,
"grad_norm": 34.40500259399414,
"learning_rate": 4.5330506821893565e-07,
"logits/chosen": -0.6436042785644531,
"logits/rejected": -0.6251427531242371,
"logps/chosen": -75.3275146484375,
"logps/ref_chosen": -68.65887451171875,
"logps/ref_rejected": -110.1396713256836,
"logps/rejected": -144.67477416992188,
"loss": 0.4349,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16417238116264343,
"margin_dpo/beta_margin_grad_std": 0.18410329520702362,
"margin_dpo/beta_margin_mean": 2.7866461277008057,
"margin_dpo/loss_margin_mean": 27.866458892822266,
"margin_dpo/margin_mean": 27.866458892822266,
"margin_dpo/margin_std": 24.24535369873047,
"step": 191
},
{
"epoch": 0.28193832599118945,
"grad_norm": 54.248565673828125,
"learning_rate": 4.5255555107119336e-07,
"logits/chosen": -0.6639034748077393,
"logits/rejected": -0.6402075290679932,
"logps/chosen": -80.83169555664062,
"logps/ref_chosen": -69.72691345214844,
"logps/ref_rejected": -103.32135009765625,
"logps/rejected": -138.1024627685547,
"loss": 0.5458,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20180875062942505,
"margin_dpo/beta_margin_grad_std": 0.21217647194862366,
"margin_dpo/beta_margin_mean": 2.367633819580078,
"margin_dpo/loss_margin_mean": 23.67633628845215,
"margin_dpo/margin_mean": 23.67633819580078,
"margin_dpo/margin_std": 21.766090393066406,
"step": 192
},
{
"epoch": 0.2834067547723935,
"grad_norm": 72.17058563232422,
"learning_rate": 4.5180069639630236e-07,
"logits/chosen": -0.7117282152175903,
"logits/rejected": -0.6623973846435547,
"logps/chosen": -71.19960021972656,
"logps/ref_chosen": -60.19049835205078,
"logps/ref_rejected": -76.40755462646484,
"logps/rejected": -104.30625915527344,
"loss": 0.7828,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23411771655082703,
"margin_dpo/beta_margin_grad_std": 0.23166634142398834,
"margin_dpo/beta_margin_mean": 1.6889591217041016,
"margin_dpo/loss_margin_mean": 16.889591217041016,
"margin_dpo/margin_mean": 16.889591217041016,
"margin_dpo/margin_std": 17.94813346862793,
"step": 193
},
{
"epoch": 0.28487518355359764,
"grad_norm": 36.26988983154297,
"learning_rate": 4.510405240853854e-07,
"logits/chosen": -0.6696387529373169,
"logits/rejected": -0.6456412076950073,
"logps/chosen": -45.34961700439453,
"logps/ref_chosen": -37.84037399291992,
"logps/ref_rejected": -60.684783935546875,
"logps/rejected": -90.98521423339844,
"loss": 0.5049,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1981307864189148,
"margin_dpo/beta_margin_grad_std": 0.1864275336265564,
"margin_dpo/beta_margin_mean": 2.279118776321411,
"margin_dpo/loss_margin_mean": 22.791187286376953,
"margin_dpo/margin_mean": 22.791187286376953,
"margin_dpo/margin_std": 19.600643157958984,
"step": 194
},
{
"epoch": 0.28634361233480177,
"grad_norm": 52.49971008300781,
"learning_rate": 4.5027505416968985e-07,
"logits/chosen": -0.6493013501167297,
"logits/rejected": -0.646949291229248,
"logps/chosen": -66.55081939697266,
"logps/ref_chosen": -54.891571044921875,
"logps/ref_rejected": -96.77095794677734,
"logps/rejected": -130.78152465820312,
"loss": 0.4842,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18146619200706482,
"margin_dpo/beta_margin_grad_std": 0.19031144678592682,
"margin_dpo/beta_margin_mean": 2.2351322174072266,
"margin_dpo/loss_margin_mean": 22.351322174072266,
"margin_dpo/margin_mean": 22.351322174072266,
"margin_dpo/margin_std": 18.570903778076172,
"step": 195
},
{
"epoch": 0.2878120411160059,
"grad_norm": 42.67164993286133,
"learning_rate": 4.495043068200599e-07,
"logits/chosen": -0.6235789060592651,
"logits/rejected": -0.5861127972602844,
"logps/chosen": -61.65058135986328,
"logps/ref_chosen": -53.245243072509766,
"logps/ref_rejected": -76.05294799804688,
"logps/rejected": -111.39912414550781,
"loss": 0.442,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16935831308364868,
"margin_dpo/beta_margin_grad_std": 0.18017151951789856,
"margin_dpo/beta_margin_mean": 2.6940836906433105,
"margin_dpo/loss_margin_mean": 26.940837860107422,
"margin_dpo/margin_mean": 26.940837860107422,
"margin_dpo/margin_std": 22.081172943115234,
"step": 196
},
{
"epoch": 0.28928046989721,
"grad_norm": 47.60453414916992,
"learning_rate": 4.4872830234640493e-07,
"logits/chosen": -0.6239949464797974,
"logits/rejected": -0.6122224926948547,
"logps/chosen": -69.72303009033203,
"logps/ref_chosen": -60.42033767700195,
"logps/ref_rejected": -77.20890808105469,
"logps/rejected": -107.96568298339844,
"loss": 0.5113,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19921091198921204,
"margin_dpo/beta_margin_grad_std": 0.17500488460063934,
"margin_dpo/beta_margin_mean": 2.1454074382781982,
"margin_dpo/loss_margin_mean": 21.45407485961914,
"margin_dpo/margin_mean": 21.45407485961914,
"margin_dpo/margin_std": 18.467487335205078,
"step": 197
},
{
"epoch": 0.2907488986784141,
"grad_norm": 52.930660247802734,
"learning_rate": 4.479470611971645e-07,
"logits/chosen": -0.651822566986084,
"logits/rejected": -0.6399349570274353,
"logps/chosen": -64.81780242919922,
"logps/ref_chosen": -55.03618621826172,
"logps/ref_rejected": -97.24325561523438,
"logps/rejected": -130.20297241210938,
"loss": 0.5069,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19629308581352234,
"margin_dpo/beta_margin_grad_std": 0.18192753195762634,
"margin_dpo/beta_margin_mean": 2.3178110122680664,
"margin_dpo/loss_margin_mean": 23.17810821533203,
"margin_dpo/margin_mean": 23.17810821533203,
"margin_dpo/margin_std": 22.717308044433594,
"step": 198
},
{
"epoch": 0.2922173274596182,
"grad_norm": 54.65925216674805,
"learning_rate": 4.471606039587695e-07,
"logits/chosen": -0.6595567464828491,
"logits/rejected": -0.6411717534065247,
"logps/chosen": -66.6902847290039,
"logps/ref_chosen": -56.828826904296875,
"logps/ref_rejected": -84.64820861816406,
"logps/rejected": -117.4836654663086,
"loss": 0.5925,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21100641787052155,
"margin_dpo/beta_margin_grad_std": 0.21913698315620422,
"margin_dpo/beta_margin_mean": 2.2973995208740234,
"margin_dpo/loss_margin_mean": 22.973995208740234,
"margin_dpo/margin_mean": 22.973995208740234,
"margin_dpo/margin_std": 22.114442825317383,
"step": 199
},
{
"epoch": 0.2936857562408223,
"grad_norm": 69.58685302734375,
"learning_rate": 4.4636895135509966e-07,
"logits/chosen": -0.6506587862968445,
"logits/rejected": -0.6131795644760132,
"logps/chosen": -64.16134643554688,
"logps/ref_chosen": -53.06706237792969,
"logps/ref_rejected": -80.60843658447266,
"logps/rejected": -116.44496154785156,
"loss": 0.6607,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21007482707500458,
"margin_dpo/beta_margin_grad_std": 0.22120529413223267,
"margin_dpo/beta_margin_mean": 2.4742238521575928,
"margin_dpo/loss_margin_mean": 24.742237091064453,
"margin_dpo/margin_mean": 24.742237091064453,
"margin_dpo/margin_std": 25.760879516601562,
"step": 200
},
{
"epoch": 0.2936857562408223,
"eval_logits/chosen": -0.6426228284835815,
"eval_logits/rejected": -0.6196746826171875,
"eval_logps/chosen": -92.99790954589844,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -115.42741394042969,
"eval_loss": 0.4791145920753479,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.31089797616004944,
"eval_margin_dpo/beta_margin_grad_std": 0.24731215834617615,
"eval_margin_dpo/beta_margin_mean": 1.4682612419128418,
"eval_margin_dpo/loss_margin_mean": 14.682612419128418,
"eval_margin_dpo/margin_mean": 14.682612419128418,
"eval_margin_dpo/margin_std": 21.162776947021484,
"eval_runtime": 39.8807,
"eval_samples_per_second": 58.65,
"eval_steps_per_second": 1.856,
"step": 200
},
{
"epoch": 0.29515418502202645,
"grad_norm": 53.58722686767578,
"learning_rate": 4.455721242469372e-07,
"logits/chosen": -0.6264992356300354,
"logits/rejected": -0.5962362289428711,
"logps/chosen": -83.7801513671875,
"logps/ref_chosen": -75.4022216796875,
"logps/ref_rejected": -114.80821990966797,
"logps/rejected": -148.85873413085938,
"loss": 0.5285,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1883956640958786,
"margin_dpo/beta_margin_grad_std": 0.21361252665519714,
"margin_dpo/beta_margin_mean": 2.567258834838867,
"margin_dpo/loss_margin_mean": 25.672588348388672,
"margin_dpo/margin_mean": 25.67258644104004,
"margin_dpo/margin_std": 23.231678009033203,
"step": 201
},
{
"epoch": 0.2966226138032305,
"grad_norm": 72.63829803466797,
"learning_rate": 4.4477014363141755e-07,
"logits/chosen": -0.6617487668991089,
"logits/rejected": -0.6499176025390625,
"logps/chosen": -60.97098922729492,
"logps/ref_chosen": -50.101318359375,
"logps/ref_rejected": -86.98503112792969,
"logps/rejected": -116.73014831542969,
"loss": 0.7735,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2517499327659607,
"margin_dpo/beta_margin_grad_std": 0.24596986174583435,
"margin_dpo/beta_margin_mean": 1.8875447511672974,
"margin_dpo/loss_margin_mean": 18.875446319580078,
"margin_dpo/margin_mean": 18.875446319580078,
"margin_dpo/margin_std": 21.153244018554688,
"step": 202
},
{
"epoch": 0.29809104258443464,
"grad_norm": 44.909637451171875,
"learning_rate": 4.439630306414758e-07,
"logits/chosen": -0.6598186492919922,
"logits/rejected": -0.6150715351104736,
"logps/chosen": -68.689453125,
"logps/ref_chosen": -60.60969543457031,
"logps/ref_rejected": -85.89596557617188,
"logps/rejected": -114.92884826660156,
"loss": 0.5325,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20222759246826172,
"margin_dpo/beta_margin_grad_std": 0.1946883499622345,
"margin_dpo/beta_margin_mean": 2.0953121185302734,
"margin_dpo/loss_margin_mean": 20.953121185302734,
"margin_dpo/margin_mean": 20.953121185302734,
"margin_dpo/margin_std": 18.274738311767578,
"step": 203
},
{
"epoch": 0.29955947136563876,
"grad_norm": 47.8790283203125,
"learning_rate": 4.431508065452897e-07,
"logits/chosen": -0.6964120864868164,
"logits/rejected": -0.643916130065918,
"logps/chosen": -89.8682861328125,
"logps/ref_chosen": -80.16496276855469,
"logps/ref_rejected": -87.69590759277344,
"logps/rejected": -119.29830932617188,
"loss": 0.522,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19900324940681458,
"margin_dpo/beta_margin_grad_std": 0.1881289929151535,
"margin_dpo/beta_margin_mean": 2.1899077892303467,
"margin_dpo/loss_margin_mean": 21.899078369140625,
"margin_dpo/margin_mean": 21.899078369140625,
"margin_dpo/margin_std": 19.764657974243164,
"step": 204
},
{
"epoch": 0.3010279001468429,
"grad_norm": 66.0072021484375,
"learning_rate": 4.4233349274571974e-07,
"logits/chosen": -0.7408896684646606,
"logits/rejected": -0.7029110193252563,
"logps/chosen": -70.63681030273438,
"logps/ref_chosen": -59.384735107421875,
"logps/ref_rejected": -85.12505340576172,
"logps/rejected": -120.17679595947266,
"loss": 0.636,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21140334010124207,
"margin_dpo/beta_margin_grad_std": 0.22996652126312256,
"margin_dpo/beta_margin_mean": 2.3799662590026855,
"margin_dpo/loss_margin_mean": 23.799659729003906,
"margin_dpo/margin_mean": 23.79966163635254,
"margin_dpo/margin_std": 23.010438919067383,
"step": 205
},
{
"epoch": 0.302496328928047,
"grad_norm": 45.65394592285156,
"learning_rate": 4.415111107797445e-07,
"logits/chosen": -0.650726318359375,
"logits/rejected": -0.6484321355819702,
"logps/chosen": -57.730751037597656,
"logps/ref_chosen": -46.964500427246094,
"logps/ref_rejected": -98.9534912109375,
"logps/rejected": -136.9589080810547,
"loss": 0.4357,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16246598958969116,
"margin_dpo/beta_margin_grad_std": 0.19966889917850494,
"margin_dpo/beta_margin_mean": 2.723916530609131,
"margin_dpo/loss_margin_mean": 27.239166259765625,
"margin_dpo/margin_mean": 27.239164352416992,
"margin_dpo/margin_std": 21.334924697875977,
"step": 206
},
{
"epoch": 0.3039647577092511,
"grad_norm": 52.66937255859375,
"learning_rate": 4.4068368231789365e-07,
"logits/chosen": -0.726109504699707,
"logits/rejected": -0.70166015625,
"logps/chosen": -64.216064453125,
"logps/ref_chosen": -56.05625915527344,
"logps/ref_rejected": -84.44779968261719,
"logps/rejected": -120.97300720214844,
"loss": 0.4566,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17621780931949615,
"margin_dpo/beta_margin_grad_std": 0.18753397464752197,
"margin_dpo/beta_margin_mean": 2.8365397453308105,
"margin_dpo/loss_margin_mean": 28.36539649963379,
"margin_dpo/margin_mean": 28.365394592285156,
"margin_dpo/margin_std": 25.557231903076172,
"step": 207
},
{
"epoch": 0.3054331864904552,
"grad_norm": 49.823787689208984,
"learning_rate": 4.398512291636768e-07,
"logits/chosen": -0.6845159530639648,
"logits/rejected": -0.6554762125015259,
"logps/chosen": -79.41966247558594,
"logps/ref_chosen": -67.06761169433594,
"logps/ref_rejected": -94.28689575195312,
"logps/rejected": -129.97105407714844,
"loss": 0.4757,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18872755765914917,
"margin_dpo/beta_margin_grad_std": 0.1753954291343689,
"margin_dpo/beta_margin_mean": 2.333209991455078,
"margin_dpo/loss_margin_mean": 23.33209991455078,
"margin_dpo/margin_mean": 23.33209991455078,
"margin_dpo/margin_std": 20.38248062133789,
"step": 208
},
{
"epoch": 0.3069016152716593,
"grad_norm": 56.87575912475586,
"learning_rate": 4.3901377325300857e-07,
"logits/chosen": -0.6603403687477112,
"logits/rejected": -0.634404182434082,
"logps/chosen": -65.81916809082031,
"logps/ref_chosen": -56.18169403076172,
"logps/ref_rejected": -80.94152069091797,
"logps/rejected": -114.23938751220703,
"loss": 0.6082,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2075938880443573,
"margin_dpo/beta_margin_grad_std": 0.23247569799423218,
"margin_dpo/beta_margin_mean": 2.366039752960205,
"margin_dpo/loss_margin_mean": 23.660396575927734,
"margin_dpo/margin_mean": 23.660396575927734,
"margin_dpo/margin_std": 21.86727523803711,
"step": 209
},
{
"epoch": 0.30837004405286345,
"grad_norm": 47.36671447753906,
"learning_rate": 4.381713366536311e-07,
"logits/chosen": -0.6590738296508789,
"logits/rejected": -0.6278238296508789,
"logps/chosen": -56.26042175292969,
"logps/ref_chosen": -46.371822357177734,
"logps/ref_rejected": -76.68162536621094,
"logps/rejected": -108.72216796875,
"loss": 0.5044,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1963951587677002,
"margin_dpo/beta_margin_grad_std": 0.17726612091064453,
"margin_dpo/beta_margin_mean": 2.2151942253112793,
"margin_dpo/loss_margin_mean": 22.15194320678711,
"margin_dpo/margin_mean": 22.15194320678711,
"margin_dpo/margin_std": 18.77596664428711,
"step": 210
},
{
"epoch": 0.30983847283406757,
"grad_norm": 60.94392776489258,
"learning_rate": 4.373239415645323e-07,
"logits/chosen": -0.6643211841583252,
"logits/rejected": -0.6076558232307434,
"logps/chosen": -91.72789764404297,
"logps/ref_chosen": -78.93235778808594,
"logps/ref_rejected": -86.82098388671875,
"logps/rejected": -122.42787170410156,
"loss": 0.5933,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2022809386253357,
"margin_dpo/beta_margin_grad_std": 0.2218504548072815,
"margin_dpo/beta_margin_mean": 2.2811341285705566,
"margin_dpo/loss_margin_mean": 22.81134033203125,
"margin_dpo/margin_mean": 22.81134033203125,
"margin_dpo/margin_std": 20.594642639160156,
"step": 211
},
{
"epoch": 0.31130690161527164,
"grad_norm": 55.531829833984375,
"learning_rate": 4.3647161031536086e-07,
"logits/chosen": -0.6789622902870178,
"logits/rejected": -0.6477820873260498,
"logps/chosen": -69.9522705078125,
"logps/ref_chosen": -58.19701385498047,
"logps/ref_rejected": -103.05784606933594,
"logps/rejected": -143.61941528320312,
"loss": 0.4546,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17285102605819702,
"margin_dpo/beta_margin_grad_std": 0.19285747408866882,
"margin_dpo/beta_margin_mean": 2.8806304931640625,
"margin_dpo/loss_margin_mean": 28.806303024291992,
"margin_dpo/margin_mean": 28.806303024291992,
"margin_dpo/margin_std": 24.59879493713379,
"step": 212
},
{
"epoch": 0.31277533039647576,
"grad_norm": 53.27421951293945,
"learning_rate": 4.3561436536583774e-07,
"logits/chosen": -0.6767191886901855,
"logits/rejected": -0.6345890760421753,
"logps/chosen": -77.456298828125,
"logps/ref_chosen": -67.51271057128906,
"logps/ref_rejected": -93.91471862792969,
"logps/rejected": -132.91636657714844,
"loss": 0.4835,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1712796837091446,
"margin_dpo/beta_margin_grad_std": 0.19696003198623657,
"margin_dpo/beta_margin_mean": 2.905806303024292,
"margin_dpo/loss_margin_mean": 29.058063507080078,
"margin_dpo/margin_mean": 29.058061599731445,
"margin_dpo/margin_std": 25.737838745117188,
"step": 213
},
{
"epoch": 0.3142437591776799,
"grad_norm": 63.7830696105957,
"learning_rate": 4.3475222930516473e-07,
"logits/chosen": -0.6705787181854248,
"logits/rejected": -0.6549390554428101,
"logps/chosen": -52.265167236328125,
"logps/ref_chosen": -41.604888916015625,
"logps/ref_rejected": -77.51741027832031,
"logps/rejected": -111.19293212890625,
"loss": 0.6303,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21721556782722473,
"margin_dpo/beta_margin_grad_std": 0.21066680550575256,
"margin_dpo/beta_margin_mean": 2.3015246391296387,
"margin_dpo/loss_margin_mean": 23.015247344970703,
"margin_dpo/margin_mean": 23.015247344970703,
"margin_dpo/margin_std": 24.095165252685547,
"step": 214
},
{
"epoch": 0.315712187958884,
"grad_norm": 53.305728912353516,
"learning_rate": 4.3388522485142885e-07,
"logits/chosen": -0.6503559350967407,
"logits/rejected": -0.6214190721511841,
"logps/chosen": -63.870384216308594,
"logps/ref_chosen": -53.279266357421875,
"logps/ref_rejected": -89.96464538574219,
"logps/rejected": -125.34566497802734,
"loss": 0.5335,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1932601034641266,
"margin_dpo/beta_margin_grad_std": 0.2013830840587616,
"margin_dpo/beta_margin_mean": 2.478990077972412,
"margin_dpo/loss_margin_mean": 24.789897918701172,
"margin_dpo/margin_mean": 24.789899826049805,
"margin_dpo/margin_std": 23.55270767211914,
"step": 215
},
{
"epoch": 0.31718061674008813,
"grad_norm": 62.44280242919922,
"learning_rate": 4.330133748510036e-07,
"logits/chosen": -0.6717164516448975,
"logits/rejected": -0.639351487159729,
"logps/chosen": -61.949493408203125,
"logps/ref_chosen": -48.887794494628906,
"logps/ref_rejected": -77.19892883300781,
"logps/rejected": -117.35563659667969,
"loss": 0.5894,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20755264163017273,
"margin_dpo/beta_margin_grad_std": 0.22565214335918427,
"margin_dpo/beta_margin_mean": 2.709501266479492,
"margin_dpo/loss_margin_mean": 27.095012664794922,
"margin_dpo/margin_mean": 27.095012664794922,
"margin_dpo/margin_std": 26.17487335205078,
"step": 216
},
{
"epoch": 0.3186490455212922,
"grad_norm": 42.385009765625,
"learning_rate": 4.3213670227794757e-07,
"logits/chosen": -0.6868363618850708,
"logits/rejected": -0.6442810893058777,
"logps/chosen": -60.769012451171875,
"logps/ref_chosen": -49.845306396484375,
"logps/ref_rejected": -100.07832336425781,
"logps/rejected": -138.9866943359375,
"loss": 0.4008,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1490134596824646,
"margin_dpo/beta_margin_grad_std": 0.18446922302246094,
"margin_dpo/beta_margin_mean": 2.798466205596924,
"margin_dpo/loss_margin_mean": 27.984663009643555,
"margin_dpo/margin_mean": 27.984663009643555,
"margin_dpo/margin_std": 21.431766510009766,
"step": 217
},
{
"epoch": 0.3201174743024963,
"grad_norm": 60.58324432373047,
"learning_rate": 4.3125523023339815e-07,
"logits/chosen": -0.659305214881897,
"logits/rejected": -0.6280574798583984,
"logps/chosen": -69.96195983886719,
"logps/ref_chosen": -58.576683044433594,
"logps/ref_rejected": -87.84639739990234,
"logps/rejected": -124.12518310546875,
"loss": 0.5344,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1986551582813263,
"margin_dpo/beta_margin_grad_std": 0.19974969327449799,
"margin_dpo/beta_margin_mean": 2.4893507957458496,
"margin_dpo/loss_margin_mean": 24.893508911132812,
"margin_dpo/margin_mean": 24.893508911132812,
"margin_dpo/margin_std": 23.56732177734375,
"step": 218
},
{
"epoch": 0.32158590308370044,
"grad_norm": 59.97367858886719,
"learning_rate": 4.303689819449636e-07,
"logits/chosen": -0.6612948775291443,
"logits/rejected": -0.6297075152397156,
"logps/chosen": -72.49765014648438,
"logps/ref_chosen": -61.083858489990234,
"logps/ref_rejected": -85.83042907714844,
"logps/rejected": -119.2388687133789,
"loss": 0.5264,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19783681631088257,
"margin_dpo/beta_margin_grad_std": 0.18227900564670563,
"margin_dpo/beta_margin_mean": 2.1994645595550537,
"margin_dpo/loss_margin_mean": 21.994646072387695,
"margin_dpo/margin_mean": 21.994644165039062,
"margin_dpo/margin_std": 19.54847526550293,
"step": 219
},
{
"epoch": 0.32305433186490456,
"grad_norm": 47.776611328125,
"learning_rate": 4.2947798076611047e-07,
"logits/chosen": -0.646489679813385,
"logits/rejected": -0.605070948600769,
"logps/chosen": -81.14366912841797,
"logps/ref_chosen": -70.03128051757812,
"logps/ref_rejected": -87.68551635742188,
"logps/rejected": -119.57649230957031,
"loss": 0.5065,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19959807395935059,
"margin_dpo/beta_margin_grad_std": 0.17812803387641907,
"margin_dpo/beta_margin_mean": 2.077859401702881,
"margin_dpo/loss_margin_mean": 20.77859115600586,
"margin_dpo/margin_mean": 20.778593063354492,
"margin_dpo/margin_std": 16.526872634887695,
"step": 220
},
{
"epoch": 0.3245227606461087,
"grad_norm": 52.991641998291016,
"learning_rate": 4.285822501755485e-07,
"logits/chosen": -0.6369597315788269,
"logits/rejected": -0.6250277757644653,
"logps/chosen": -64.38204193115234,
"logps/ref_chosen": -52.15470886230469,
"logps/ref_rejected": -106.46768188476562,
"logps/rejected": -151.52133178710938,
"loss": 0.3403,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12470635771751404,
"margin_dpo/beta_margin_grad_std": 0.18095463514328003,
"margin_dpo/beta_margin_mean": 3.2826321125030518,
"margin_dpo/loss_margin_mean": 32.82632064819336,
"margin_dpo/margin_mean": 32.82632064819336,
"margin_dpo/margin_std": 22.725797653198242,
"step": 221
},
{
"epoch": 0.32599118942731276,
"grad_norm": 72.23493957519531,
"learning_rate": 4.276818137766118e-07,
"logits/chosen": -0.7019423842430115,
"logits/rejected": -0.6659786701202393,
"logps/chosen": -74.65852355957031,
"logps/ref_chosen": -60.971099853515625,
"logps/ref_rejected": -100.00115203857422,
"logps/rejected": -139.9058837890625,
"loss": 0.5722,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19258952140808105,
"margin_dpo/beta_margin_grad_std": 0.22169330716133118,
"margin_dpo/beta_margin_mean": 2.621731758117676,
"margin_dpo/loss_margin_mean": 26.217315673828125,
"margin_dpo/margin_mean": 26.217315673828125,
"margin_dpo/margin_std": 24.92258071899414,
"step": 222
},
{
"epoch": 0.3274596182085169,
"grad_norm": 85.80632781982422,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -0.6935982704162598,
"logits/rejected": -0.6445531249046326,
"logps/chosen": -68.51580047607422,
"logps/ref_chosen": -52.64057922363281,
"logps/ref_rejected": -82.82502746582031,
"logps/rejected": -120.95681762695312,
"loss": 0.7341,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22147008776664734,
"margin_dpo/beta_margin_grad_std": 0.2591605484485626,
"margin_dpo/beta_margin_mean": 2.225656509399414,
"margin_dpo/loss_margin_mean": 22.25656509399414,
"margin_dpo/margin_mean": 22.25656509399414,
"margin_dpo/margin_std": 22.70517349243164,
"step": 223
},
{
"epoch": 0.328928046989721,
"grad_norm": 72.05899810791016,
"learning_rate": 4.2586691858633747e-07,
"logits/chosen": -0.6770994663238525,
"logits/rejected": -0.6360162496566772,
"logps/chosen": -61.67028045654297,
"logps/ref_chosen": -48.59540939331055,
"logps/ref_rejected": -77.11648559570312,
"logps/rejected": -116.93687438964844,
"loss": 0.5671,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19068878889083862,
"margin_dpo/beta_margin_grad_std": 0.20871135592460632,
"margin_dpo/beta_margin_mean": 2.674551486968994,
"margin_dpo/loss_margin_mean": 26.745513916015625,
"margin_dpo/margin_mean": 26.745513916015625,
"margin_dpo/margin_std": 24.828662872314453,
"step": 224
},
{
"epoch": 0.3303964757709251,
"grad_norm": 44.1650390625,
"learning_rate": 4.249525076191759e-07,
"logits/chosen": -0.6742178201675415,
"logits/rejected": -0.6422642469406128,
"logps/chosen": -72.62501525878906,
"logps/ref_chosen": -58.000465393066406,
"logps/ref_rejected": -99.90290832519531,
"logps/rejected": -147.4707489013672,
"loss": 0.4084,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14956694841384888,
"margin_dpo/beta_margin_grad_std": 0.20272037386894226,
"margin_dpo/beta_margin_mean": 3.2943289279937744,
"margin_dpo/loss_margin_mean": 32.94328689575195,
"margin_dpo/margin_mean": 32.94329071044922,
"margin_dpo/margin_std": 26.7680606842041,
"step": 225
},
{
"epoch": 0.33186490455212925,
"grad_norm": 52.37092208862305,
"learning_rate": 4.2403348649073167e-07,
"logits/chosen": -0.6877872943878174,
"logits/rejected": -0.6356024146080017,
"logps/chosen": -69.44877624511719,
"logps/ref_chosen": -58.898799896240234,
"logps/ref_rejected": -78.68775939941406,
"logps/rejected": -114.89201354980469,
"loss": 0.4828,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17930053174495697,
"margin_dpo/beta_margin_grad_std": 0.18374590575695038,
"margin_dpo/beta_margin_mean": 2.565427780151367,
"margin_dpo/loss_margin_mean": 25.654277801513672,
"margin_dpo/margin_mean": 25.654277801513672,
"margin_dpo/margin_std": 21.256011962890625,
"step": 226
},
{
"epoch": 0.3333333333333333,
"grad_norm": 47.082557678222656,
"learning_rate": 4.2310987941806615e-07,
"logits/chosen": -0.6890594959259033,
"logits/rejected": -0.6593263745307922,
"logps/chosen": -70.82875061035156,
"logps/ref_chosen": -59.072181701660156,
"logps/ref_rejected": -99.41236877441406,
"logps/rejected": -142.82723999023438,
"loss": 0.4196,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15451833605766296,
"margin_dpo/beta_margin_grad_std": 0.19809511303901672,
"margin_dpo/beta_margin_mean": 3.1658291816711426,
"margin_dpo/loss_margin_mean": 31.65829086303711,
"margin_dpo/margin_mean": 31.65829086303711,
"margin_dpo/margin_std": 26.29955291748047,
"step": 227
},
{
"epoch": 0.33480176211453744,
"grad_norm": 56.605804443359375,
"learning_rate": 4.2218171073908463e-07,
"logits/chosen": -0.7011622786521912,
"logits/rejected": -0.6703910231590271,
"logps/chosen": -78.96180725097656,
"logps/ref_chosen": -65.89129638671875,
"logps/ref_rejected": -91.04875183105469,
"logps/rejected": -128.32534790039062,
"loss": 0.5385,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18866901099681854,
"margin_dpo/beta_margin_grad_std": 0.21488142013549805,
"margin_dpo/beta_margin_mean": 2.4206089973449707,
"margin_dpo/loss_margin_mean": 24.20608901977539,
"margin_dpo/margin_mean": 24.20608901977539,
"margin_dpo/margin_std": 20.969837188720703,
"step": 228
},
{
"epoch": 0.33627019089574156,
"grad_norm": 63.40804672241211,
"learning_rate": 4.212490049118951e-07,
"logits/chosen": -0.7051106691360474,
"logits/rejected": -0.6546026468276978,
"logps/chosen": -84.97560119628906,
"logps/ref_chosen": -70.70636749267578,
"logps/ref_rejected": -84.52740478515625,
"logps/rejected": -125.85427856445312,
"loss": 0.6019,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19034332036972046,
"margin_dpo/beta_margin_grad_std": 0.2341303527355194,
"margin_dpo/beta_margin_mean": 2.7057645320892334,
"margin_dpo/loss_margin_mean": 27.05764389038086,
"margin_dpo/margin_mean": 27.05764389038086,
"margin_dpo/margin_std": 25.40169906616211,
"step": 229
},
{
"epoch": 0.3377386196769457,
"grad_norm": 53.56421661376953,
"learning_rate": 4.203117865141635e-07,
"logits/chosen": -0.6524708271026611,
"logits/rejected": -0.6426960229873657,
"logps/chosen": -51.435630798339844,
"logps/ref_chosen": -39.282005310058594,
"logps/ref_rejected": -85.62191009521484,
"logps/rejected": -128.23431396484375,
"loss": 0.5106,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16138073801994324,
"margin_dpo/beta_margin_grad_std": 0.21171000599861145,
"margin_dpo/beta_margin_mean": 3.0458788871765137,
"margin_dpo/loss_margin_mean": 30.45878791809082,
"margin_dpo/margin_mean": 30.45878791809082,
"margin_dpo/margin_std": 27.84360694885254,
"step": 230
},
{
"epoch": 0.3392070484581498,
"grad_norm": 42.17887878417969,
"learning_rate": 4.1937008024246625e-07,
"logits/chosen": -0.6823098063468933,
"logits/rejected": -0.6389471292495728,
"logps/chosen": -74.67344665527344,
"logps/ref_chosen": -63.27644348144531,
"logps/ref_rejected": -74.1239013671875,
"logps/rejected": -111.6684799194336,
"loss": 0.4691,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18743924796581268,
"margin_dpo/beta_margin_grad_std": 0.17160117626190186,
"margin_dpo/beta_margin_mean": 2.614757537841797,
"margin_dpo/loss_margin_mean": 26.14757537841797,
"margin_dpo/margin_mean": 26.14757537841797,
"margin_dpo/margin_std": 25.1517333984375,
"step": 231
},
{
"epoch": 0.3406754772393539,
"grad_norm": 70.57960510253906,
"learning_rate": 4.1842391091163933e-07,
"logits/chosen": -0.6195404529571533,
"logits/rejected": -0.5588107109069824,
"logps/chosen": -84.39373779296875,
"logps/ref_chosen": -70.74876403808594,
"logps/ref_rejected": -83.97706604003906,
"logps/rejected": -118.90908813476562,
"loss": 0.6901,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23599335551261902,
"margin_dpo/beta_margin_grad_std": 0.24028193950653076,
"margin_dpo/beta_margin_mean": 2.1287035942077637,
"margin_dpo/loss_margin_mean": 21.287036895751953,
"margin_dpo/margin_mean": 21.287036895751953,
"margin_dpo/margin_std": 22.360965728759766,
"step": 232
},
{
"epoch": 0.342143906020558,
"grad_norm": 63.01465606689453,
"learning_rate": 4.174733034541245e-07,
"logits/chosen": -0.6951059103012085,
"logits/rejected": -0.6703442931175232,
"logps/chosen": -67.92855834960938,
"logps/ref_chosen": -54.8829345703125,
"logps/ref_rejected": -107.48007202148438,
"logps/rejected": -148.419921875,
"loss": 0.5636,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18834185600280762,
"margin_dpo/beta_margin_grad_std": 0.22877255082130432,
"margin_dpo/beta_margin_mean": 2.7894234657287598,
"margin_dpo/loss_margin_mean": 27.894235610961914,
"margin_dpo/margin_mean": 27.89423370361328,
"margin_dpo/margin_std": 26.236351013183594,
"step": 233
},
{
"epoch": 0.3436123348017621,
"grad_norm": 59.044742584228516,
"learning_rate": 4.165182829193126e-07,
"logits/chosen": -0.6353030204772949,
"logits/rejected": -0.6363640427589417,
"logps/chosen": -55.01062774658203,
"logps/ref_chosen": -44.09451675415039,
"logps/ref_rejected": -100.00663757324219,
"logps/rejected": -139.08529663085938,
"loss": 0.4541,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15148046612739563,
"margin_dpo/beta_margin_grad_std": 0.18851938843727112,
"margin_dpo/beta_margin_mean": 2.8162550926208496,
"margin_dpo/loss_margin_mean": 28.162551879882812,
"margin_dpo/margin_mean": 28.16254997253418,
"margin_dpo/margin_std": 21.774639129638672,
"step": 234
},
{
"epoch": 0.34508076358296624,
"grad_norm": 64.13523864746094,
"learning_rate": 4.1555887447288255e-07,
"logits/chosen": -0.6782118082046509,
"logits/rejected": -0.6378265619277954,
"logps/chosen": -77.53021240234375,
"logps/ref_chosen": -62.237911224365234,
"logps/ref_rejected": -90.39505767822266,
"logps/rejected": -128.7196502685547,
"loss": 0.5952,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21459338068962097,
"margin_dpo/beta_margin_grad_std": 0.21603041887283325,
"margin_dpo/beta_margin_mean": 2.303229808807373,
"margin_dpo/loss_margin_mean": 23.03229522705078,
"margin_dpo/margin_mean": 23.03229522705078,
"margin_dpo/margin_std": 23.028390884399414,
"step": 235
},
{
"epoch": 0.3465491923641703,
"grad_norm": 64.90979766845703,
"learning_rate": 4.1459510339613946e-07,
"logits/chosen": -0.6209807395935059,
"logits/rejected": -0.6189430356025696,
"logps/chosen": -60.39289855957031,
"logps/ref_chosen": -49.34136199951172,
"logps/ref_rejected": -103.51162719726562,
"logps/rejected": -139.95445251464844,
"loss": 0.5672,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19896173477172852,
"margin_dpo/beta_margin_grad_std": 0.22020728886127472,
"margin_dpo/beta_margin_mean": 2.5391287803649902,
"margin_dpo/loss_margin_mean": 25.391286849975586,
"margin_dpo/margin_mean": 25.391284942626953,
"margin_dpo/margin_std": 23.486440658569336,
"step": 236
},
{
"epoch": 0.34801762114537443,
"grad_norm": 48.42905807495117,
"learning_rate": 4.136269950853473e-07,
"logits/chosen": -0.6719942092895508,
"logits/rejected": -0.6377314329147339,
"logps/chosen": -65.90644836425781,
"logps/ref_chosen": -54.168121337890625,
"logps/ref_rejected": -94.78036499023438,
"logps/rejected": -133.96646118164062,
"loss": 0.5145,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17442235350608826,
"margin_dpo/beta_margin_grad_std": 0.21036501228809357,
"margin_dpo/beta_margin_mean": 2.744776487350464,
"margin_dpo/loss_margin_mean": 27.447765350341797,
"margin_dpo/margin_mean": 27.447765350341797,
"margin_dpo/margin_std": 24.072509765625,
"step": 237
},
{
"epoch": 0.34948604992657856,
"grad_norm": 39.447776794433594,
"learning_rate": 4.126545750510605e-07,
"logits/chosen": -0.62393718957901,
"logits/rejected": -0.6167235970497131,
"logps/chosen": -64.80150604248047,
"logps/ref_chosen": -53.973121643066406,
"logps/ref_rejected": -89.41795349121094,
"logps/rejected": -124.90143585205078,
"loss": 0.4407,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17134535312652588,
"margin_dpo/beta_margin_grad_std": 0.17862460017204285,
"margin_dpo/beta_margin_mean": 2.465510368347168,
"margin_dpo/loss_margin_mean": 24.65510368347168,
"margin_dpo/margin_mean": 24.655105590820312,
"margin_dpo/margin_std": 20.156238555908203,
"step": 238
},
{
"epoch": 0.3509544787077827,
"grad_norm": 49.10955810546875,
"learning_rate": 4.116778689174514e-07,
"logits/chosen": -0.6934037208557129,
"logits/rejected": -0.6648428440093994,
"logps/chosen": -70.5438232421875,
"logps/ref_chosen": -58.09782409667969,
"logps/ref_rejected": -93.59294128417969,
"logps/rejected": -131.54547119140625,
"loss": 0.4402,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16609887778759003,
"margin_dpo/beta_margin_grad_std": 0.17937105894088745,
"margin_dpo/beta_margin_mean": 2.550652503967285,
"margin_dpo/loss_margin_mean": 25.50652313232422,
"margin_dpo/margin_mean": 25.50652313232422,
"margin_dpo/margin_std": 19.751544952392578,
"step": 239
},
{
"epoch": 0.3524229074889868,
"grad_norm": 59.94078063964844,
"learning_rate": 4.106969024216348e-07,
"logits/chosen": -0.7201815843582153,
"logits/rejected": -0.6908845901489258,
"logps/chosen": -73.39276885986328,
"logps/ref_chosen": -60.6144905090332,
"logps/ref_rejected": -74.1185302734375,
"logps/rejected": -109.40141296386719,
"loss": 0.6221,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20475083589553833,
"margin_dpo/beta_margin_grad_std": 0.22877436876296997,
"margin_dpo/beta_margin_mean": 2.250460624694824,
"margin_dpo/loss_margin_mean": 22.50460433959961,
"margin_dpo/margin_mean": 22.50460433959961,
"margin_dpo/margin_std": 20.584453582763672,
"step": 240
},
{
"epoch": 0.35389133627019087,
"grad_norm": 58.87882614135742,
"learning_rate": 4.097117014129903e-07,
"logits/chosen": -0.6401950120925903,
"logits/rejected": -0.5843400955200195,
"logps/chosen": -76.44609069824219,
"logps/ref_chosen": -66.091064453125,
"logps/ref_rejected": -88.06088256835938,
"logps/rejected": -130.1419677734375,
"loss": 0.5073,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16559036076068878,
"margin_dpo/beta_margin_grad_std": 0.2104012668132782,
"margin_dpo/beta_margin_mean": 3.172605514526367,
"margin_dpo/loss_margin_mean": 31.726055145263672,
"margin_dpo/margin_mean": 31.726055145263672,
"margin_dpo/margin_std": 29.52655029296875,
"step": 241
},
{
"epoch": 0.355359765051395,
"grad_norm": 53.561256408691406,
"learning_rate": 4.087222918524807e-07,
"logits/chosen": -0.6564736366271973,
"logits/rejected": -0.6259176135063171,
"logps/chosen": -79.38307189941406,
"logps/ref_chosen": -67.86392211914062,
"logps/ref_rejected": -83.36033630371094,
"logps/rejected": -119.49470520019531,
"loss": 0.4948,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18211215734481812,
"margin_dpo/beta_margin_grad_std": 0.19208890199661255,
"margin_dpo/beta_margin_mean": 2.4615235328674316,
"margin_dpo/loss_margin_mean": 24.615234375,
"margin_dpo/margin_mean": 24.615234375,
"margin_dpo/margin_std": 21.698657989501953,
"step": 242
},
{
"epoch": 0.3568281938325991,
"grad_norm": 33.901893615722656,
"learning_rate": 4.07728699811968e-07,
"logits/chosen": -0.6499842405319214,
"logits/rejected": -0.5824156999588013,
"logps/chosen": -73.99522399902344,
"logps/ref_chosen": -63.08424377441406,
"logps/ref_rejected": -76.33563232421875,
"logps/rejected": -116.67811584472656,
"loss": 0.3266,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13336455821990967,
"margin_dpo/beta_margin_grad_std": 0.15137195587158203,
"margin_dpo/beta_margin_mean": 2.943150043487549,
"margin_dpo/loss_margin_mean": 29.43149757385254,
"margin_dpo/margin_mean": 29.43149757385254,
"margin_dpo/margin_std": 21.736125946044922,
"step": 243
},
{
"epoch": 0.35829662261380324,
"grad_norm": 43.502723693847656,
"learning_rate": 4.067309514735267e-07,
"logits/chosen": -0.6875163316726685,
"logits/rejected": -0.6771037578582764,
"logps/chosen": -71.17987060546875,
"logps/ref_chosen": -61.14069366455078,
"logps/ref_rejected": -94.89193725585938,
"logps/rejected": -130.1956787109375,
"loss": 0.4962,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18511684238910675,
"margin_dpo/beta_margin_grad_std": 0.20264488458633423,
"margin_dpo/beta_margin_mean": 2.526456832885742,
"margin_dpo/loss_margin_mean": 25.264570236206055,
"margin_dpo/margin_mean": 25.264570236206055,
"margin_dpo/margin_std": 21.366226196289062,
"step": 244
},
{
"epoch": 0.35976505139500736,
"grad_norm": 75.58712768554688,
"learning_rate": 4.057290731287531e-07,
"logits/chosen": -0.6512210369110107,
"logits/rejected": -0.5914992094039917,
"logps/chosen": -78.87323760986328,
"logps/ref_chosen": -67.26228332519531,
"logps/ref_rejected": -87.64010620117188,
"logps/rejected": -126.08024597167969,
"loss": 0.5409,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1954166740179062,
"margin_dpo/beta_margin_grad_std": 0.20244669914245605,
"margin_dpo/beta_margin_mean": 2.68291974067688,
"margin_dpo/loss_margin_mean": 26.82919692993164,
"margin_dpo/margin_mean": 26.82919692993164,
"margin_dpo/margin_std": 25.66305923461914,
"step": 245
},
{
"epoch": 0.36123348017621143,
"grad_norm": 55.81852722167969,
"learning_rate": 4.047230911780736e-07,
"logits/chosen": -0.6650277972221375,
"logits/rejected": -0.6220686435699463,
"logps/chosen": -77.91656494140625,
"logps/ref_chosen": -66.69696807861328,
"logps/ref_rejected": -84.34634399414062,
"logps/rejected": -118.67218017578125,
"loss": 0.5257,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19744564592838287,
"margin_dpo/beta_margin_grad_std": 0.1971253752708435,
"margin_dpo/beta_margin_mean": 2.3106236457824707,
"margin_dpo/loss_margin_mean": 23.10623550415039,
"margin_dpo/margin_mean": 23.10623550415039,
"margin_dpo/margin_std": 20.974063873291016,
"step": 246
},
{
"epoch": 0.36270190895741555,
"grad_norm": 39.22549057006836,
"learning_rate": 4.0371303213004814e-07,
"logits/chosen": -0.7125513553619385,
"logits/rejected": -0.6908072233200073,
"logps/chosen": -68.05924224853516,
"logps/ref_chosen": -56.6053466796875,
"logps/ref_rejected": -106.29327392578125,
"logps/rejected": -150.0380401611328,
"loss": 0.4087,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1458946168422699,
"margin_dpo/beta_margin_grad_std": 0.1995469033718109,
"margin_dpo/beta_margin_mean": 3.229086399078369,
"margin_dpo/loss_margin_mean": 32.29086685180664,
"margin_dpo/margin_mean": 32.29086685180664,
"margin_dpo/margin_std": 25.363780975341797,
"step": 247
},
{
"epoch": 0.3641703377386197,
"grad_norm": 44.850181579589844,
"learning_rate": 4.0269892260067197e-07,
"logits/chosen": -0.65459144115448,
"logits/rejected": -0.6384952068328857,
"logps/chosen": -54.36640930175781,
"logps/ref_chosen": -44.043216705322266,
"logps/ref_rejected": -91.85687255859375,
"logps/rejected": -126.57852172851562,
"loss": 0.4079,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16769659519195557,
"margin_dpo/beta_margin_grad_std": 0.15377435088157654,
"margin_dpo/beta_margin_mean": 2.439844846725464,
"margin_dpo/loss_margin_mean": 24.398448944091797,
"margin_dpo/margin_mean": 24.398447036743164,
"margin_dpo/margin_std": 19.091136932373047,
"step": 248
},
{
"epoch": 0.3656387665198238,
"grad_norm": 64.64720916748047,
"learning_rate": 4.0168078931267426e-07,
"logits/chosen": -0.7210872173309326,
"logits/rejected": -0.6843761205673218,
"logps/chosen": -74.97351837158203,
"logps/ref_chosen": -62.442352294921875,
"logps/ref_rejected": -80.46806335449219,
"logps/rejected": -113.79952239990234,
"loss": 0.6559,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23226860165596008,
"margin_dpo/beta_margin_grad_std": 0.22308656573295593,
"margin_dpo/beta_margin_mean": 2.080028533935547,
"margin_dpo/loss_margin_mean": 20.80028533935547,
"margin_dpo/margin_mean": 20.80028533935547,
"margin_dpo/margin_std": 20.590898513793945,
"step": 249
},
{
"epoch": 0.3671071953010279,
"grad_norm": 32.35546875,
"learning_rate": 4.006586590948141e-07,
"logits/chosen": -0.6919997334480286,
"logits/rejected": -0.621533215045929,
"logps/chosen": -74.43826293945312,
"logps/ref_chosen": -65.6366958618164,
"logps/ref_rejected": -73.87183380126953,
"logps/rejected": -108.39453125,
"loss": 0.4355,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15557865798473358,
"margin_dpo/beta_margin_grad_std": 0.1853388100862503,
"margin_dpo/beta_margin_mean": 2.572113037109375,
"margin_dpo/loss_margin_mean": 25.72113037109375,
"margin_dpo/margin_mean": 25.72113037109375,
"margin_dpo/margin_std": 18.065841674804688,
"step": 250
},
{
"epoch": 0.368575624082232,
"grad_norm": 45.22731018066406,
"learning_rate": 3.9963255888117325e-07,
"logits/chosen": -0.7006521224975586,
"logits/rejected": -0.645630955696106,
"logps/chosen": -70.0533676147461,
"logps/ref_chosen": -57.182716369628906,
"logps/ref_rejected": -77.66343688964844,
"logps/rejected": -116.09004211425781,
"loss": 0.4648,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.179362490773201,
"margin_dpo/beta_margin_grad_std": 0.19022953510284424,
"margin_dpo/beta_margin_mean": 2.5555951595306396,
"margin_dpo/loss_margin_mean": 25.555952072143555,
"margin_dpo/margin_mean": 25.555952072143555,
"margin_dpo/margin_std": 20.537967681884766,
"step": 251
},
{
"epoch": 0.3700440528634361,
"grad_norm": 52.23085403442383,
"learning_rate": 3.9860251571044666e-07,
"logits/chosen": -0.68607497215271,
"logits/rejected": -0.6413745880126953,
"logps/chosen": -83.42814636230469,
"logps/ref_chosen": -71.68563842773438,
"logps/ref_rejected": -84.75798797607422,
"logps/rejected": -122.11627197265625,
"loss": 0.4332,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15921342372894287,
"margin_dpo/beta_margin_grad_std": 0.18207845091819763,
"margin_dpo/beta_margin_mean": 2.5615780353546143,
"margin_dpo/loss_margin_mean": 25.615779876708984,
"margin_dpo/margin_mean": 25.615779876708984,
"margin_dpo/margin_std": 19.359203338623047,
"step": 252
},
{
"epoch": 0.37151248164464024,
"grad_norm": 50.17756652832031,
"learning_rate": 3.9756855672522986e-07,
"logits/chosen": -0.7000366449356079,
"logits/rejected": -0.6709662675857544,
"logps/chosen": -79.15950012207031,
"logps/ref_chosen": -69.13392639160156,
"logps/ref_rejected": -98.70252990722656,
"logps/rejected": -132.74795532226562,
"loss": 0.6265,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2043215036392212,
"margin_dpo/beta_margin_grad_std": 0.22261486947536469,
"margin_dpo/beta_margin_mean": 2.401987075805664,
"margin_dpo/loss_margin_mean": 24.01987075805664,
"margin_dpo/margin_mean": 24.01987075805664,
"margin_dpo/margin_std": 22.792251586914062,
"step": 253
},
{
"epoch": 0.37298091042584436,
"grad_norm": 63.319210052490234,
"learning_rate": 3.965307091713037e-07,
"logits/chosen": -0.6818605065345764,
"logits/rejected": -0.6327254772186279,
"logps/chosen": -64.66595458984375,
"logps/ref_chosen": -54.154998779296875,
"logps/ref_rejected": -90.30764770507812,
"logps/rejected": -125.43360900878906,
"loss": 0.5555,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1997654289007187,
"margin_dpo/beta_margin_grad_std": 0.21750159561634064,
"margin_dpo/beta_margin_mean": 2.461501121520996,
"margin_dpo/loss_margin_mean": 24.615013122558594,
"margin_dpo/margin_mean": 24.615013122558594,
"margin_dpo/margin_std": 22.605167388916016,
"step": 254
},
{
"epoch": 0.3744493392070485,
"grad_norm": 67.16899871826172,
"learning_rate": 3.954890003969163e-07,
"logits/chosen": -0.6882792115211487,
"logits/rejected": -0.6564103364944458,
"logps/chosen": -70.3147201538086,
"logps/ref_chosen": -57.14167022705078,
"logps/ref_rejected": -90.2085952758789,
"logps/rejected": -130.73675537109375,
"loss": 0.6626,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19194266200065613,
"margin_dpo/beta_margin_grad_std": 0.2271135002374649,
"margin_dpo/beta_margin_mean": 2.7355103492736816,
"margin_dpo/loss_margin_mean": 27.355106353759766,
"margin_dpo/margin_mean": 27.355106353759766,
"margin_dpo/margin_std": 28.053775787353516,
"step": 255
},
{
"epoch": 0.37591776798825255,
"grad_norm": 58.15704345703125,
"learning_rate": 3.944434578520628e-07,
"logits/chosen": -0.6779786348342896,
"logits/rejected": -0.6487682461738586,
"logps/chosen": -68.2900619506836,
"logps/ref_chosen": -55.163490295410156,
"logps/ref_rejected": -92.56291961669922,
"logps/rejected": -132.92449951171875,
"loss": 0.5166,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17461168766021729,
"margin_dpo/beta_margin_grad_std": 0.20105499029159546,
"margin_dpo/beta_margin_mean": 2.7235007286071777,
"margin_dpo/loss_margin_mean": 27.235008239746094,
"margin_dpo/margin_mean": 27.235008239746094,
"margin_dpo/margin_std": 25.091623306274414,
"step": 256
},
{
"epoch": 0.37738619676945667,
"grad_norm": 45.471885681152344,
"learning_rate": 3.933941090877615e-07,
"logits/chosen": -0.647832453250885,
"logits/rejected": -0.6230664253234863,
"logps/chosen": -61.75330352783203,
"logps/ref_chosen": -49.4236946105957,
"logps/ref_rejected": -79.53791809082031,
"logps/rejected": -121.87123107910156,
"loss": 0.5048,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18101766705513,
"margin_dpo/beta_margin_grad_std": 0.21384158730506897,
"margin_dpo/beta_margin_mean": 3.0003700256347656,
"margin_dpo/loss_margin_mean": 30.003700256347656,
"margin_dpo/margin_mean": 30.003700256347656,
"margin_dpo/margin_std": 25.782047271728516,
"step": 257
},
{
"epoch": 0.3788546255506608,
"grad_norm": 87.32129669189453,
"learning_rate": 3.923409817553284e-07,
"logits/chosen": -0.6796102523803711,
"logits/rejected": -0.6486295461654663,
"logps/chosen": -75.19246673583984,
"logps/ref_chosen": -59.384124755859375,
"logps/ref_rejected": -95.9901123046875,
"logps/rejected": -138.2916717529297,
"loss": 0.7333,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20006248354911804,
"margin_dpo/beta_margin_grad_std": 0.2458367645740509,
"margin_dpo/beta_margin_mean": 2.649322748184204,
"margin_dpo/loss_margin_mean": 26.493227005004883,
"margin_dpo/margin_mean": 26.493227005004883,
"margin_dpo/margin_std": 27.26435089111328,
"step": 258
},
{
"epoch": 0.3803230543318649,
"grad_norm": 49.1059684753418,
"learning_rate": 3.9128410360564793e-07,
"logits/chosen": -0.6160672307014465,
"logits/rejected": -0.5899853110313416,
"logps/chosen": -67.10466003417969,
"logps/ref_chosen": -52.828346252441406,
"logps/ref_rejected": -89.19165802001953,
"logps/rejected": -127.45698547363281,
"loss": 0.5267,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1898474097251892,
"margin_dpo/beta_margin_grad_std": 0.19210869073867798,
"margin_dpo/beta_margin_mean": 2.3989009857177734,
"margin_dpo/loss_margin_mean": 23.9890079498291,
"margin_dpo/margin_mean": 23.9890079498291,
"margin_dpo/margin_std": 20.4078369140625,
"step": 259
},
{
"epoch": 0.38179148311306904,
"grad_norm": 57.559608459472656,
"learning_rate": 3.9022350248844246e-07,
"logits/chosen": -0.6179628372192383,
"logits/rejected": -0.6178984045982361,
"logps/chosen": -62.646522521972656,
"logps/ref_chosen": -47.41767501831055,
"logps/ref_rejected": -95.08979034423828,
"logps/rejected": -137.5649871826172,
"loss": 0.4996,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18264836072921753,
"margin_dpo/beta_margin_grad_std": 0.2074463963508606,
"margin_dpo/beta_margin_mean": 2.724635124206543,
"margin_dpo/loss_margin_mean": 27.24635124206543,
"margin_dpo/margin_mean": 27.24635124206543,
"margin_dpo/margin_std": 25.009780883789062,
"step": 260
},
{
"epoch": 0.3832599118942731,
"grad_norm": 45.6181755065918,
"learning_rate": 3.891592063515376e-07,
"logits/chosen": -0.6720625758171082,
"logits/rejected": -0.6372575759887695,
"logps/chosen": -65.10540008544922,
"logps/ref_chosen": -53.03137969970703,
"logps/ref_rejected": -88.51494598388672,
"logps/rejected": -129.3517608642578,
"loss": 0.4674,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1723802089691162,
"margin_dpo/beta_margin_grad_std": 0.2052065134048462,
"margin_dpo/beta_margin_mean": 2.876279354095459,
"margin_dpo/loss_margin_mean": 28.762794494628906,
"margin_dpo/margin_mean": 28.762794494628906,
"margin_dpo/margin_std": 26.346603393554688,
"step": 261
},
{
"epoch": 0.38472834067547723,
"grad_norm": 62.1253776550293,
"learning_rate": 3.880912432401264e-07,
"logits/chosen": -0.6411904096603394,
"logits/rejected": -0.5941104888916016,
"logps/chosen": -74.12861633300781,
"logps/ref_chosen": -59.620140075683594,
"logps/ref_rejected": -86.41853332519531,
"logps/rejected": -126.77371215820312,
"loss": 0.5264,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17031686007976532,
"margin_dpo/beta_margin_grad_std": 0.22105002403259277,
"margin_dpo/beta_margin_mean": 2.5846705436706543,
"margin_dpo/loss_margin_mean": 25.84670639038086,
"margin_dpo/margin_mean": 25.84670639038086,
"margin_dpo/margin_std": 21.955059051513672,
"step": 262
},
{
"epoch": 0.38619676945668135,
"grad_norm": 63.75021743774414,
"learning_rate": 3.870196412960302e-07,
"logits/chosen": -0.6712623834609985,
"logits/rejected": -0.6137841939926147,
"logps/chosen": -71.11343383789062,
"logps/ref_chosen": -59.42094421386719,
"logps/ref_rejected": -96.85720825195312,
"logps/rejected": -139.07449340820312,
"loss": 0.4316,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1605646014213562,
"margin_dpo/beta_margin_grad_std": 0.19631603360176086,
"margin_dpo/beta_margin_mean": 3.0524797439575195,
"margin_dpo/loss_margin_mean": 30.524797439575195,
"margin_dpo/margin_mean": 30.524799346923828,
"margin_dpo/margin_std": 26.109458923339844,
"step": 263
},
{
"epoch": 0.3876651982378855,
"grad_norm": 65.67262268066406,
"learning_rate": 3.8594442875695665e-07,
"logits/chosen": -0.664789080619812,
"logits/rejected": -0.6371362805366516,
"logps/chosen": -76.22811126708984,
"logps/ref_chosen": -62.722084045410156,
"logps/ref_rejected": -93.85621643066406,
"logps/rejected": -131.24859619140625,
"loss": 0.5618,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19636279344558716,
"margin_dpo/beta_margin_grad_std": 0.201200932264328,
"margin_dpo/beta_margin_mean": 2.3886351585388184,
"margin_dpo/loss_margin_mean": 23.8863525390625,
"margin_dpo/margin_mean": 23.8863525390625,
"margin_dpo/margin_std": 21.996925354003906,
"step": 264
},
{
"epoch": 0.3891336270190896,
"grad_norm": 68.70135498046875,
"learning_rate": 3.848656339557562e-07,
"logits/chosen": -0.6722284555435181,
"logits/rejected": -0.6433699131011963,
"logps/chosen": -76.23115539550781,
"logps/ref_chosen": -61.971466064453125,
"logps/ref_rejected": -88.02059936523438,
"logps/rejected": -127.41754150390625,
"loss": 0.5867,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20745858550071716,
"margin_dpo/beta_margin_grad_std": 0.22185558080673218,
"margin_dpo/beta_margin_mean": 2.5137252807617188,
"margin_dpo/loss_margin_mean": 25.137252807617188,
"margin_dpo/margin_mean": 25.137252807617188,
"margin_dpo/margin_std": 25.626853942871094,
"step": 265
},
{
"epoch": 0.39060205580029367,
"grad_norm": 54.78199768066406,
"learning_rate": 3.8378328531967507e-07,
"logits/chosen": -0.6929798126220703,
"logits/rejected": -0.6306042075157166,
"logps/chosen": -80.66532897949219,
"logps/ref_chosen": -67.09967041015625,
"logps/ref_rejected": -67.97122192382812,
"logps/rejected": -106.28300476074219,
"loss": 0.5605,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1999325454235077,
"margin_dpo/beta_margin_grad_std": 0.21910326182842255,
"margin_dpo/beta_margin_mean": 2.474611759185791,
"margin_dpo/loss_margin_mean": 24.746116638183594,
"margin_dpo/margin_mean": 24.746116638183594,
"margin_dpo/margin_std": 22.58720588684082,
"step": 266
},
{
"epoch": 0.3920704845814978,
"grad_norm": 50.09029769897461,
"learning_rate": 3.8269741136960646e-07,
"logits/chosen": -0.661482036113739,
"logits/rejected": -0.6170265078544617,
"logps/chosen": -82.08916473388672,
"logps/ref_chosen": -68.97074890136719,
"logps/ref_rejected": -90.16844940185547,
"logps/rejected": -130.65118408203125,
"loss": 0.4154,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16007937490940094,
"margin_dpo/beta_margin_grad_std": 0.18461111187934875,
"margin_dpo/beta_margin_mean": 2.7364323139190674,
"margin_dpo/loss_margin_mean": 27.364322662353516,
"margin_dpo/margin_mean": 27.364322662353516,
"margin_dpo/margin_std": 22.23681640625,
"step": 267
},
{
"epoch": 0.3935389133627019,
"grad_norm": 61.21752166748047,
"learning_rate": 3.8160804071933894e-07,
"logits/chosen": -0.6383650898933411,
"logits/rejected": -0.6218883991241455,
"logps/chosen": -68.4329833984375,
"logps/ref_chosen": -55.900306701660156,
"logps/ref_rejected": -101.64763641357422,
"logps/rejected": -139.42156982421875,
"loss": 0.4978,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18062824010849,
"margin_dpo/beta_margin_grad_std": 0.20679926872253418,
"margin_dpo/beta_margin_mean": 2.5241270065307617,
"margin_dpo/loss_margin_mean": 25.241270065307617,
"margin_dpo/margin_mean": 25.241270065307617,
"margin_dpo/margin_std": 21.52151870727539,
"step": 268
},
{
"epoch": 0.39500734214390604,
"grad_norm": 63.637428283691406,
"learning_rate": 3.8051520207480204e-07,
"logits/chosen": -0.6397280097007751,
"logits/rejected": -0.5932068228721619,
"logps/chosen": -82.90742492675781,
"logps/ref_chosen": -70.03955078125,
"logps/ref_rejected": -107.34937286376953,
"logps/rejected": -153.24514770507812,
"loss": 0.4033,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14418360590934753,
"margin_dpo/beta_margin_grad_std": 0.21151122450828552,
"margin_dpo/beta_margin_mean": 3.302790641784668,
"margin_dpo/loss_margin_mean": 33.02790832519531,
"margin_dpo/margin_mean": 33.02790832519531,
"margin_dpo/margin_std": 23.7076416015625,
"step": 269
},
{
"epoch": 0.3964757709251101,
"grad_norm": 42.040462493896484,
"learning_rate": 3.794189242333106e-07,
"logits/chosen": -0.6672676205635071,
"logits/rejected": -0.6468701362609863,
"logps/chosen": -80.33956146240234,
"logps/ref_chosen": -69.53347778320312,
"logps/ref_rejected": -109.92864990234375,
"logps/rejected": -145.84991455078125,
"loss": 0.506,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18614254891872406,
"margin_dpo/beta_margin_grad_std": 0.20391584932804108,
"margin_dpo/beta_margin_mean": 2.5115182399749756,
"margin_dpo/loss_margin_mean": 25.11518096923828,
"margin_dpo/margin_mean": 25.11518096923828,
"margin_dpo/margin_std": 22.150074005126953,
"step": 270
},
{
"epoch": 0.39794419970631423,
"grad_norm": 51.15155792236328,
"learning_rate": 3.7831923608280514e-07,
"logits/chosen": -0.6099239587783813,
"logits/rejected": -0.5680840611457825,
"logps/chosen": -70.89580535888672,
"logps/ref_chosen": -56.76457214355469,
"logps/ref_rejected": -92.51383209228516,
"logps/rejected": -132.5828399658203,
"loss": 0.5353,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1870347112417221,
"margin_dpo/beta_margin_grad_std": 0.21322524547576904,
"margin_dpo/beta_margin_mean": 2.5937767028808594,
"margin_dpo/loss_margin_mean": 25.937767028808594,
"margin_dpo/margin_mean": 25.937767028808594,
"margin_dpo/margin_std": 23.366622924804688,
"step": 271
},
{
"epoch": 0.39941262848751835,
"grad_norm": 51.133304595947266,
"learning_rate": 3.772161666010912e-07,
"logits/chosen": -0.6056419014930725,
"logits/rejected": -0.5937180519104004,
"logps/chosen": -62.401695251464844,
"logps/ref_chosen": -49.49715805053711,
"logps/ref_rejected": -105.54279327392578,
"logps/rejected": -150.09609985351562,
"loss": 0.5269,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.168976292014122,
"margin_dpo/beta_margin_grad_std": 0.23079171776771545,
"margin_dpo/beta_margin_mean": 3.1648764610290527,
"margin_dpo/loss_margin_mean": 31.648765563964844,
"margin_dpo/margin_mean": 31.64876365661621,
"margin_dpo/margin_std": 26.891244888305664,
"step": 272
},
{
"epoch": 0.4008810572687225,
"grad_norm": 58.39497375488281,
"learning_rate": 3.761097448550755e-07,
"logits/chosen": -0.5882784128189087,
"logits/rejected": -0.5531589984893799,
"logps/chosen": -77.80586242675781,
"logps/ref_chosen": -62.97539520263672,
"logps/ref_rejected": -92.49858093261719,
"logps/rejected": -137.8081817626953,
"loss": 0.4583,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16255009174346924,
"margin_dpo/beta_margin_grad_std": 0.20158691704273224,
"margin_dpo/beta_margin_mean": 3.0479135513305664,
"margin_dpo/loss_margin_mean": 30.479137420654297,
"margin_dpo/margin_mean": 30.479137420654297,
"margin_dpo/margin_std": 25.084999084472656,
"step": 273
},
{
"epoch": 0.4023494860499266,
"grad_norm": 53.60417938232422,
"learning_rate": 3.75e-07,
"logits/chosen": -0.6290228366851807,
"logits/rejected": -0.5923614501953125,
"logps/chosen": -71.82681274414062,
"logps/ref_chosen": -55.66770935058594,
"logps/ref_rejected": -77.33308410644531,
"logps/rejected": -119.93658447265625,
"loss": 0.5139,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.179681196808815,
"margin_dpo/beta_margin_grad_std": 0.21081170439720154,
"margin_dpo/beta_margin_mean": 2.6444411277770996,
"margin_dpo/loss_margin_mean": 26.44441032409668,
"margin_dpo/margin_mean": 26.44441032409668,
"margin_dpo/margin_std": 23.134292602539062,
"step": 274
},
{
"epoch": 0.40381791483113066,
"grad_norm": 63.069358825683594,
"learning_rate": 3.738869612786737e-07,
"logits/chosen": -0.6466611623764038,
"logits/rejected": -0.6279960870742798,
"logps/chosen": -59.85034942626953,
"logps/ref_chosen": -48.594703674316406,
"logps/ref_rejected": -93.30369567871094,
"logps/rejected": -132.1536407470703,
"loss": 0.4693,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17479778826236725,
"margin_dpo/beta_margin_grad_std": 0.19887307286262512,
"margin_dpo/beta_margin_mean": 2.759430408477783,
"margin_dpo/loss_margin_mean": 27.594303131103516,
"margin_dpo/margin_mean": 27.594303131103516,
"margin_dpo/margin_std": 24.204368591308594,
"step": 275
},
{
"epoch": 0.4052863436123348,
"grad_norm": 62.213233947753906,
"learning_rate": 3.7277065802070204e-07,
"logits/chosen": -0.6649171113967896,
"logits/rejected": -0.624710202217102,
"logps/chosen": -70.10076141357422,
"logps/ref_chosen": -56.57740783691406,
"logps/ref_rejected": -70.36566925048828,
"logps/rejected": -109.4853744506836,
"loss": 0.5922,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20924188196659088,
"margin_dpo/beta_margin_grad_std": 0.2318626046180725,
"margin_dpo/beta_margin_mean": 2.5596346855163574,
"margin_dpo/loss_margin_mean": 25.59634780883789,
"margin_dpo/margin_mean": 25.596345901489258,
"margin_dpo/margin_std": 24.71031951904297,
"step": 276
},
{
"epoch": 0.4067547723935389,
"grad_norm": 40.95648193359375,
"learning_rate": 3.71651119641714e-07,
"logits/chosen": -0.6606760025024414,
"logits/rejected": -0.6246751546859741,
"logps/chosen": -68.44947814941406,
"logps/ref_chosen": -56.27156066894531,
"logps/ref_rejected": -92.88127136230469,
"logps/rejected": -129.38565063476562,
"loss": 0.4261,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17114606499671936,
"margin_dpo/beta_margin_grad_std": 0.16686135530471802,
"margin_dpo/beta_margin_mean": 2.432648181915283,
"margin_dpo/loss_margin_mean": 24.326480865478516,
"margin_dpo/margin_mean": 24.326480865478516,
"margin_dpo/margin_std": 18.5596866607666,
"step": 277
},
{
"epoch": 0.40822320117474303,
"grad_norm": 47.080055236816406,
"learning_rate": 3.705283756425872e-07,
"logits/chosen": -0.6622641682624817,
"logits/rejected": -0.6517907381057739,
"logps/chosen": -64.23191833496094,
"logps/ref_chosen": -52.94194030761719,
"logps/ref_rejected": -91.25357818603516,
"logps/rejected": -132.509765625,
"loss": 0.4854,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17969730496406555,
"margin_dpo/beta_margin_grad_std": 0.21197958290576935,
"margin_dpo/beta_margin_mean": 2.9966213703155518,
"margin_dpo/loss_margin_mean": 29.96621322631836,
"margin_dpo/margin_mean": 29.96621322631836,
"margin_dpo/margin_std": 26.836585998535156,
"step": 278
},
{
"epoch": 0.40969162995594716,
"grad_norm": 50.956111907958984,
"learning_rate": 3.6940245560867e-07,
"logits/chosen": -0.6270808577537537,
"logits/rejected": -0.5984815955162048,
"logps/chosen": -60.80306625366211,
"logps/ref_chosen": -48.641319274902344,
"logps/ref_rejected": -87.8514404296875,
"logps/rejected": -129.2867889404297,
"loss": 0.4865,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17512354254722595,
"margin_dpo/beta_margin_grad_std": 0.21795479953289032,
"margin_dpo/beta_margin_mean": 2.9273602962493896,
"margin_dpo/loss_margin_mean": 29.273601531982422,
"margin_dpo/margin_mean": 29.273601531982422,
"margin_dpo/margin_std": 24.12653160095215,
"step": 279
},
{
"epoch": 0.4111600587371512,
"grad_norm": 37.822959899902344,
"learning_rate": 3.6827338920900253e-07,
"logits/chosen": -0.6415497064590454,
"logits/rejected": -0.6238715648651123,
"logps/chosen": -72.27760314941406,
"logps/ref_chosen": -58.797122955322266,
"logps/ref_rejected": -98.61885070800781,
"logps/rejected": -141.04861450195312,
"loss": 0.3466,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1335011124610901,
"margin_dpo/beta_margin_grad_std": 0.17966148257255554,
"margin_dpo/beta_margin_mean": 2.8949294090270996,
"margin_dpo/loss_margin_mean": 28.949295043945312,
"margin_dpo/margin_mean": 28.94929313659668,
"margin_dpo/margin_std": 18.588150024414062,
"step": 280
},
{
"epoch": 0.41262848751835535,
"grad_norm": 71.57465362548828,
"learning_rate": 3.6714120619553435e-07,
"logits/chosen": -0.6258310675621033,
"logits/rejected": -0.5813519358634949,
"logps/chosen": -67.82090759277344,
"logps/ref_chosen": -55.488521575927734,
"logps/ref_rejected": -80.88258361816406,
"logps/rejected": -118.60010528564453,
"loss": 0.4834,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1572715938091278,
"margin_dpo/beta_margin_grad_std": 0.1871940791606903,
"margin_dpo/beta_margin_mean": 2.5385141372680664,
"margin_dpo/loss_margin_mean": 25.38513946533203,
"margin_dpo/margin_mean": 25.38513946533203,
"margin_dpo/margin_std": 20.046871185302734,
"step": 281
},
{
"epoch": 0.41409691629955947,
"grad_norm": 49.311336517333984,
"learning_rate": 3.660059364023408e-07,
"logits/chosen": -0.6380579471588135,
"logits/rejected": -0.5905803442001343,
"logps/chosen": -85.72900390625,
"logps/ref_chosen": -73.07014465332031,
"logps/ref_rejected": -95.35098266601562,
"logps/rejected": -131.3958740234375,
"loss": 0.4738,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18345773220062256,
"margin_dpo/beta_margin_grad_std": 0.18459081649780273,
"margin_dpo/beta_margin_mean": 2.3386025428771973,
"margin_dpo/loss_margin_mean": 23.38602638244629,
"margin_dpo/margin_mean": 23.386028289794922,
"margin_dpo/margin_std": 20.714370727539062,
"step": 282
},
{
"epoch": 0.4155653450807636,
"grad_norm": 48.578853607177734,
"learning_rate": 3.6486760974483685e-07,
"logits/chosen": -0.6443203091621399,
"logits/rejected": -0.6160309314727783,
"logps/chosen": -74.32984161376953,
"logps/ref_chosen": -61.89844512939453,
"logps/ref_rejected": -96.98655700683594,
"logps/rejected": -137.47125244140625,
"loss": 0.478,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16398155689239502,
"margin_dpo/beta_margin_grad_std": 0.20954221487045288,
"margin_dpo/beta_margin_mean": 2.8053293228149414,
"margin_dpo/loss_margin_mean": 28.05329132080078,
"margin_dpo/margin_mean": 28.05329132080078,
"margin_dpo/margin_std": 23.433940887451172,
"step": 283
},
{
"epoch": 0.4170337738619677,
"grad_norm": 43.66304016113281,
"learning_rate": 3.6372625621898863e-07,
"logits/chosen": -0.6190305948257446,
"logits/rejected": -0.6051605939865112,
"logps/chosen": -72.00505828857422,
"logps/ref_chosen": -58.4355354309082,
"logps/ref_rejected": -93.46926879882812,
"logps/rejected": -136.72036743164062,
"loss": 0.4128,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1565876454114914,
"margin_dpo/beta_margin_grad_std": 0.18501752614974976,
"margin_dpo/beta_margin_mean": 2.9681572914123535,
"margin_dpo/loss_margin_mean": 29.68157196044922,
"margin_dpo/margin_mean": 29.68157196044922,
"margin_dpo/margin_std": 25.59400749206543,
"step": 284
},
{
"epoch": 0.4185022026431718,
"grad_norm": 55.42721176147461,
"learning_rate": 3.625819059005228e-07,
"logits/chosen": -0.6967588663101196,
"logits/rejected": -0.670876681804657,
"logps/chosen": -81.565185546875,
"logps/ref_chosen": -66.2322006225586,
"logps/ref_rejected": -99.1268310546875,
"logps/rejected": -141.13421630859375,
"loss": 0.4222,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16293823719024658,
"margin_dpo/beta_margin_grad_std": 0.1807018220424652,
"margin_dpo/beta_margin_mean": 2.667440414428711,
"margin_dpo/loss_margin_mean": 26.67440414428711,
"margin_dpo/margin_mean": 26.674402236938477,
"margin_dpo/margin_std": 20.944385528564453,
"step": 285
},
{
"epoch": 0.4199706314243759,
"grad_norm": 55.817623138427734,
"learning_rate": 3.614345889441346e-07,
"logits/chosen": -0.6651911735534668,
"logits/rejected": -0.6330760717391968,
"logps/chosen": -86.78990173339844,
"logps/ref_chosen": -72.95100402832031,
"logps/ref_rejected": -88.58845520019531,
"logps/rejected": -130.15069580078125,
"loss": 0.551,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18454879522323608,
"margin_dpo/beta_margin_grad_std": 0.22670768201351166,
"margin_dpo/beta_margin_mean": 2.772334575653076,
"margin_dpo/loss_margin_mean": 27.723342895507812,
"margin_dpo/margin_mean": 27.723342895507812,
"margin_dpo/margin_std": 25.246835708618164,
"step": 286
},
{
"epoch": 0.42143906020558003,
"grad_norm": 53.33695602416992,
"learning_rate": 3.6028433558269275e-07,
"logits/chosen": -0.6670126914978027,
"logits/rejected": -0.6226764917373657,
"logps/chosen": -75.57640075683594,
"logps/ref_chosen": -61.54115295410156,
"logps/ref_rejected": -77.6960678100586,
"logps/rejected": -118.70558166503906,
"loss": 0.5322,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19202715158462524,
"margin_dpo/beta_margin_grad_std": 0.21082551777362823,
"margin_dpo/beta_margin_mean": 2.697427272796631,
"margin_dpo/loss_margin_mean": 26.974271774291992,
"margin_dpo/margin_mean": 26.974271774291992,
"margin_dpo/margin_std": 26.020793914794922,
"step": 287
},
{
"epoch": 0.42290748898678415,
"grad_norm": 58.48661804199219,
"learning_rate": 3.5913117612644327e-07,
"logits/chosen": -0.6334189176559448,
"logits/rejected": -0.6017390489578247,
"logps/chosen": -72.49565124511719,
"logps/ref_chosen": -56.661224365234375,
"logps/ref_rejected": -87.335693359375,
"logps/rejected": -130.8236541748047,
"loss": 0.4353,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16223303973674774,
"margin_dpo/beta_margin_grad_std": 0.19457679986953735,
"margin_dpo/beta_margin_mean": 2.765352249145508,
"margin_dpo/loss_margin_mean": 27.653522491455078,
"margin_dpo/margin_mean": 27.653522491455078,
"margin_dpo/margin_std": 21.126564025878906,
"step": 288
},
{
"epoch": 0.4243759177679883,
"grad_norm": 51.02857208251953,
"learning_rate": 3.5797514096221024e-07,
"logits/chosen": -0.6474858522415161,
"logits/rejected": -0.6363035440444946,
"logps/chosen": -61.39606857299805,
"logps/ref_chosen": -45.23039245605469,
"logps/ref_rejected": -87.64266967773438,
"logps/rejected": -134.01449584960938,
"loss": 0.5024,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18502648174762726,
"margin_dpo/beta_margin_grad_std": 0.20525604486465454,
"margin_dpo/beta_margin_mean": 3.020615339279175,
"margin_dpo/loss_margin_mean": 30.206151962280273,
"margin_dpo/margin_mean": 30.206153869628906,
"margin_dpo/margin_std": 29.076404571533203,
"step": 289
},
{
"epoch": 0.42584434654919234,
"grad_norm": 61.92301559448242,
"learning_rate": 3.568162605525952e-07,
"logits/chosen": -0.6007183194160461,
"logits/rejected": -0.596926748752594,
"logps/chosen": -71.86837768554688,
"logps/ref_chosen": -55.47149658203125,
"logps/ref_rejected": -116.70857238769531,
"logps/rejected": -164.587646484375,
"loss": 0.51,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17327924072742462,
"margin_dpo/beta_margin_grad_std": 0.2234293818473816,
"margin_dpo/beta_margin_mean": 3.1482198238372803,
"margin_dpo/loss_margin_mean": 31.482196807861328,
"margin_dpo/margin_mean": 31.482196807861328,
"margin_dpo/margin_std": 29.341014862060547,
"step": 290
},
{
"epoch": 0.42731277533039647,
"grad_norm": 56.78514099121094,
"learning_rate": 3.5565456543517485e-07,
"logits/chosen": -0.66310715675354,
"logits/rejected": -0.6308495998382568,
"logps/chosen": -75.83061981201172,
"logps/ref_chosen": -63.26036834716797,
"logps/ref_rejected": -89.29708862304688,
"logps/rejected": -129.5008087158203,
"loss": 0.4818,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16443225741386414,
"margin_dpo/beta_margin_grad_std": 0.20096154510974884,
"margin_dpo/beta_margin_mean": 2.7633466720581055,
"margin_dpo/loss_margin_mean": 27.633464813232422,
"margin_dpo/margin_mean": 27.633464813232422,
"margin_dpo/margin_std": 22.616226196289062,
"step": 291
},
{
"epoch": 0.4287812041116006,
"grad_norm": 53.309532165527344,
"learning_rate": 3.5449008622169583e-07,
"logits/chosen": -0.644907534122467,
"logits/rejected": -0.6032723188400269,
"logps/chosen": -70.52488708496094,
"logps/ref_chosen": -53.91852951049805,
"logps/ref_rejected": -89.96138000488281,
"logps/rejected": -136.3544921875,
"loss": 0.3955,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1559363305568695,
"margin_dpo/beta_margin_grad_std": 0.17519932985305786,
"margin_dpo/beta_margin_mean": 2.978675127029419,
"margin_dpo/loss_margin_mean": 29.78675079345703,
"margin_dpo/margin_mean": 29.78675079345703,
"margin_dpo/margin_std": 24.420841217041016,
"step": 292
},
{
"epoch": 0.4302496328928047,
"grad_norm": 54.48039245605469,
"learning_rate": 3.5332285359726846e-07,
"logits/chosen": -0.6540181040763855,
"logits/rejected": -0.6253507137298584,
"logps/chosen": -76.41354370117188,
"logps/ref_chosen": -60.376033782958984,
"logps/ref_rejected": -77.8524398803711,
"logps/rejected": -118.03453063964844,
"loss": 0.5943,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2110597789287567,
"margin_dpo/beta_margin_grad_std": 0.21620804071426392,
"margin_dpo/beta_margin_mean": 2.4144577980041504,
"margin_dpo/loss_margin_mean": 24.144577026367188,
"margin_dpo/margin_mean": 24.144577026367188,
"margin_dpo/margin_std": 24.406749725341797,
"step": 293
},
{
"epoch": 0.43171806167400884,
"grad_norm": 42.810264587402344,
"learning_rate": 3.5215289831955786e-07,
"logits/chosen": -0.6317086219787598,
"logits/rejected": -0.6187810897827148,
"logps/chosen": -62.521305084228516,
"logps/ref_chosen": -48.0875358581543,
"logps/ref_rejected": -81.89698791503906,
"logps/rejected": -123.5872573852539,
"loss": 0.5175,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1887100636959076,
"margin_dpo/beta_margin_grad_std": 0.20870058238506317,
"margin_dpo/beta_margin_mean": 2.725649833679199,
"margin_dpo/loss_margin_mean": 27.256500244140625,
"margin_dpo/margin_mean": 27.256500244140625,
"margin_dpo/margin_std": 25.993709564208984,
"step": 294
},
{
"epoch": 0.4331864904552129,
"grad_norm": 61.524410247802734,
"learning_rate": 3.509802512179737e-07,
"logits/chosen": -0.59177565574646,
"logits/rejected": -0.5824375748634338,
"logps/chosen": -68.55509185791016,
"logps/ref_chosen": -49.92467498779297,
"logps/ref_rejected": -87.45632934570312,
"logps/rejected": -133.15512084960938,
"loss": 0.595,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18381169438362122,
"margin_dpo/beta_margin_grad_std": 0.22098243236541748,
"margin_dpo/beta_margin_mean": 2.706838607788086,
"margin_dpo/loss_margin_mean": 27.06838607788086,
"margin_dpo/margin_mean": 27.06838607788086,
"margin_dpo/margin_std": 24.90131378173828,
"step": 295
},
{
"epoch": 0.434654919236417,
"grad_norm": 85.08708953857422,
"learning_rate": 3.498049431928577e-07,
"logits/chosen": -0.6830310821533203,
"logits/rejected": -0.6430518627166748,
"logps/chosen": -83.78386688232422,
"logps/ref_chosen": -65.49124145507812,
"logps/ref_rejected": -93.08908081054688,
"logps/rejected": -134.91554260253906,
"loss": 0.7283,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23585554957389832,
"margin_dpo/beta_margin_grad_std": 0.2520889639854431,
"margin_dpo/beta_margin_mean": 2.3533830642700195,
"margin_dpo/loss_margin_mean": 23.533828735351562,
"margin_dpo/margin_mean": 23.53382682800293,
"margin_dpo/margin_std": 26.529495239257812,
"step": 296
},
{
"epoch": 0.43612334801762115,
"grad_norm": 44.37043762207031,
"learning_rate": 3.486270052146694e-07,
"logits/chosen": -0.5455184578895569,
"logits/rejected": -0.5094451904296875,
"logps/chosen": -74.52252197265625,
"logps/ref_chosen": -56.47694778442383,
"logps/ref_rejected": -95.1385498046875,
"logps/rejected": -141.77182006835938,
"loss": 0.4232,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16087649762630463,
"margin_dpo/beta_margin_grad_std": 0.18778559565544128,
"margin_dpo/beta_margin_mean": 2.858769416809082,
"margin_dpo/loss_margin_mean": 28.587692260742188,
"margin_dpo/margin_mean": 28.587696075439453,
"margin_dpo/margin_std": 23.822004318237305,
"step": 297
},
{
"epoch": 0.43759177679882527,
"grad_norm": 43.25141143798828,
"learning_rate": 3.474464683231698e-07,
"logits/chosen": -0.642224907875061,
"logits/rejected": -0.6368216276168823,
"logps/chosen": -83.6908950805664,
"logps/ref_chosen": -67.32516479492188,
"logps/ref_rejected": -116.66217041015625,
"logps/rejected": -162.75466918945312,
"loss": 0.4123,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16109539568424225,
"margin_dpo/beta_margin_grad_std": 0.18005360662937164,
"margin_dpo/beta_margin_mean": 2.972676992416382,
"margin_dpo/loss_margin_mean": 29.726768493652344,
"margin_dpo/margin_mean": 29.726768493652344,
"margin_dpo/margin_std": 26.26955795288086,
"step": 298
},
{
"epoch": 0.4390602055800294,
"grad_norm": 59.36482238769531,
"learning_rate": 3.462633636266041e-07,
"logits/chosen": -0.5800520181655884,
"logits/rejected": -0.5592917799949646,
"logps/chosen": -64.02508544921875,
"logps/ref_chosen": -48.96209716796875,
"logps/ref_rejected": -84.32823944091797,
"logps/rejected": -130.45758056640625,
"loss": 0.5094,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1787450611591339,
"margin_dpo/beta_margin_grad_std": 0.2245379388332367,
"margin_dpo/beta_margin_mean": 3.1066365242004395,
"margin_dpo/loss_margin_mean": 31.066364288330078,
"margin_dpo/margin_mean": 31.066362380981445,
"margin_dpo/margin_std": 27.945383071899414,
"step": 299
},
{
"epoch": 0.44052863436123346,
"grad_norm": 81.1234359741211,
"learning_rate": 3.4507772230088147e-07,
"logits/chosen": -0.6008783578872681,
"logits/rejected": -0.5817815065383911,
"logps/chosen": -80.24883270263672,
"logps/ref_chosen": -59.073707580566406,
"logps/ref_rejected": -95.9664535522461,
"logps/rejected": -146.92205810546875,
"loss": 0.699,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20390301942825317,
"margin_dpo/beta_margin_grad_std": 0.26888635754585266,
"margin_dpo/beta_margin_mean": 2.9780476093292236,
"margin_dpo/loss_margin_mean": 29.780475616455078,
"margin_dpo/margin_mean": 29.780475616455078,
"margin_dpo/margin_std": 29.694149017333984,
"step": 300
},
{
"epoch": 0.44052863436123346,
"eval_logits/chosen": -0.6107151508331299,
"eval_logits/rejected": -0.5844902992248535,
"eval_logps/chosen": -99.96916198730469,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -125.81926727294922,
"eval_loss": 0.4413561224937439,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.28278234601020813,
"eval_margin_dpo/beta_margin_grad_std": 0.2516450583934784,
"eval_margin_dpo/beta_margin_mean": 1.8103224039077759,
"eval_margin_dpo/loss_margin_mean": 18.10322380065918,
"eval_margin_dpo/margin_mean": 18.10322380065918,
"eval_margin_dpo/margin_std": 23.78249168395996,
"eval_runtime": 39.9127,
"eval_samples_per_second": 58.603,
"eval_steps_per_second": 1.854,
"step": 300
},
{
"epoch": 0.4419970631424376,
"grad_norm": 47.328758239746094,
"learning_rate": 3.4388957558875316e-07,
"logits/chosen": -0.6326008439064026,
"logits/rejected": -0.6015191078186035,
"logps/chosen": -75.48579406738281,
"logps/ref_chosen": -57.249366760253906,
"logps/ref_rejected": -92.35354614257812,
"logps/rejected": -141.67437744140625,
"loss": 0.398,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15365365147590637,
"margin_dpo/beta_margin_grad_std": 0.18743260204792023,
"margin_dpo/beta_margin_mean": 3.1084399223327637,
"margin_dpo/loss_margin_mean": 31.084396362304688,
"margin_dpo/margin_mean": 31.084396362304688,
"margin_dpo/margin_std": 25.64261245727539,
"step": 301
},
{
"epoch": 0.4434654919236417,
"grad_norm": 68.61962127685547,
"learning_rate": 3.426989547989902e-07,
"logits/chosen": -0.5673216581344604,
"logits/rejected": -0.5571401119232178,
"logps/chosen": -66.45941925048828,
"logps/ref_chosen": -51.19799041748047,
"logps/ref_rejected": -97.22636413574219,
"logps/rejected": -141.73390197753906,
"loss": 0.5652,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18492794036865234,
"margin_dpo/beta_margin_grad_std": 0.20831260085105896,
"margin_dpo/beta_margin_mean": 2.9246113300323486,
"margin_dpo/loss_margin_mean": 29.246112823486328,
"margin_dpo/margin_mean": 29.246112823486328,
"margin_dpo/margin_std": 29.20469093322754,
"step": 302
},
{
"epoch": 0.44493392070484583,
"grad_norm": 67.56126403808594,
"learning_rate": 3.4150589130555773e-07,
"logits/chosen": -0.6228208541870117,
"logits/rejected": -0.5857237577438354,
"logps/chosen": -83.40345764160156,
"logps/ref_chosen": -66.71394348144531,
"logps/ref_rejected": -86.94542694091797,
"logps/rejected": -131.71636962890625,
"loss": 0.6418,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20844492316246033,
"margin_dpo/beta_margin_grad_std": 0.25230517983436584,
"margin_dpo/beta_margin_mean": 2.80814266204834,
"margin_dpo/loss_margin_mean": 28.081424713134766,
"margin_dpo/margin_mean": 28.081424713134766,
"margin_dpo/margin_std": 28.703716278076172,
"step": 303
},
{
"epoch": 0.44640234948604995,
"grad_norm": 58.02699661254883,
"learning_rate": 3.403104165467883e-07,
"logits/chosen": -0.6468909382820129,
"logits/rejected": -0.6176923513412476,
"logps/chosen": -86.12702178955078,
"logps/ref_chosen": -71.95069885253906,
"logps/ref_rejected": -90.47203063964844,
"logps/rejected": -132.961181640625,
"loss": 0.4589,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15100273489952087,
"margin_dpo/beta_margin_grad_std": 0.2101380079984665,
"margin_dpo/beta_margin_mean": 2.831282377243042,
"margin_dpo/loss_margin_mean": 28.312822341918945,
"margin_dpo/margin_mean": 28.312822341918945,
"margin_dpo/margin_std": 20.519695281982422,
"step": 304
},
{
"epoch": 0.447870778267254,
"grad_norm": 51.01959991455078,
"learning_rate": 3.391125620245535e-07,
"logits/chosen": -0.6300150156021118,
"logits/rejected": -0.5907981395721436,
"logps/chosen": -84.66513061523438,
"logps/ref_chosen": -66.79523468017578,
"logps/ref_rejected": -92.75459289550781,
"logps/rejected": -139.4754638671875,
"loss": 0.4261,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16771967709064484,
"margin_dpo/beta_margin_grad_std": 0.1794486939907074,
"margin_dpo/beta_margin_mean": 2.885097026824951,
"margin_dpo/loss_margin_mean": 28.850971221923828,
"margin_dpo/margin_mean": 28.850971221923828,
"margin_dpo/margin_std": 26.399646759033203,
"step": 305
},
{
"epoch": 0.44933920704845814,
"grad_norm": 74.01148986816406,
"learning_rate": 3.3791235930343417e-07,
"logits/chosen": -0.6759564876556396,
"logits/rejected": -0.626822829246521,
"logps/chosen": -85.26339721679688,
"logps/ref_chosen": -69.68389892578125,
"logps/ref_rejected": -85.15919494628906,
"logps/rejected": -128.5599822998047,
"loss": 0.5096,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16733743250370026,
"margin_dpo/beta_margin_grad_std": 0.21074122190475464,
"margin_dpo/beta_margin_mean": 2.78212833404541,
"margin_dpo/loss_margin_mean": 27.821285247802734,
"margin_dpo/margin_mean": 27.8212833404541,
"margin_dpo/margin_std": 23.266021728515625,
"step": 306
},
{
"epoch": 0.45080763582966227,
"grad_norm": 54.2598762512207,
"learning_rate": 3.367098400098881e-07,
"logits/chosen": -0.6196017265319824,
"logits/rejected": -0.5918940305709839,
"logps/chosen": -86.1854476928711,
"logps/ref_chosen": -70.16542053222656,
"logps/ref_rejected": -86.97230529785156,
"logps/rejected": -128.44007873535156,
"loss": 0.5398,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1930994689464569,
"margin_dpo/beta_margin_grad_std": 0.21751633286476135,
"margin_dpo/beta_margin_mean": 2.544773578643799,
"margin_dpo/loss_margin_mean": 25.447734832763672,
"margin_dpo/margin_mean": 25.447734832763672,
"margin_dpo/margin_std": 24.255327224731445,
"step": 307
},
{
"epoch": 0.4522760646108664,
"grad_norm": 42.99238967895508,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": -0.6084394454956055,
"logits/rejected": -0.581619381904602,
"logps/chosen": -70.17066955566406,
"logps/ref_chosen": -55.2449951171875,
"logps/ref_rejected": -79.37226104736328,
"logps/rejected": -123.38723754882812,
"loss": 0.4913,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1686662882566452,
"margin_dpo/beta_margin_grad_std": 0.1977521926164627,
"margin_dpo/beta_margin_mean": 2.908930778503418,
"margin_dpo/loss_margin_mean": 29.089309692382812,
"margin_dpo/margin_mean": 29.089309692382812,
"margin_dpo/margin_std": 26.31514549255371,
"step": 308
},
{
"epoch": 0.45374449339207046,
"grad_norm": 54.909400939941406,
"learning_rate": 3.3429797851573183e-07,
"logits/chosen": -0.6183820366859436,
"logits/rejected": -0.5834609866142273,
"logps/chosen": -66.57117462158203,
"logps/ref_chosen": -48.959083557128906,
"logps/ref_rejected": -82.34072875976562,
"logps/rejected": -128.1937713623047,
"loss": 0.5046,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17868635058403015,
"margin_dpo/beta_margin_grad_std": 0.21038393676280975,
"margin_dpo/beta_margin_mean": 2.824094295501709,
"margin_dpo/loss_margin_mean": 28.240943908691406,
"margin_dpo/margin_mean": 28.240943908691406,
"margin_dpo/margin_std": 24.293102264404297,
"step": 309
},
{
"epoch": 0.4552129221732746,
"grad_norm": 50.08707809448242,
"learning_rate": 3.3308869986991487e-07,
"logits/chosen": -0.6874780058860779,
"logits/rejected": -0.6412575244903564,
"logps/chosen": -78.43482971191406,
"logps/ref_chosen": -62.74177932739258,
"logps/ref_rejected": -79.9302978515625,
"logps/rejected": -120.06564331054688,
"loss": 0.4422,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17856904864311218,
"margin_dpo/beta_margin_grad_std": 0.16496190428733826,
"margin_dpo/beta_margin_mean": 2.4442286491394043,
"margin_dpo/loss_margin_mean": 24.44228744506836,
"margin_dpo/margin_mean": 24.44228744506836,
"margin_dpo/margin_std": 20.531139373779297,
"step": 310
},
{
"epoch": 0.4566813509544787,
"grad_norm": 63.15940475463867,
"learning_rate": 3.3187723175958346e-07,
"logits/chosen": -0.5884615182876587,
"logits/rejected": -0.54796302318573,
"logps/chosen": -73.1263656616211,
"logps/ref_chosen": -53.027976989746094,
"logps/ref_rejected": -77.43820190429688,
"logps/rejected": -131.83961486816406,
"loss": 0.3482,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12926018238067627,
"margin_dpo/beta_margin_grad_std": 0.1706952154636383,
"margin_dpo/beta_margin_mean": 3.4303030967712402,
"margin_dpo/loss_margin_mean": 34.30303192138672,
"margin_dpo/margin_mean": 34.30303192138672,
"margin_dpo/margin_std": 25.374624252319336,
"step": 311
},
{
"epoch": 0.4581497797356828,
"grad_norm": 56.578857421875,
"learning_rate": 3.306636061080487e-07,
"logits/chosen": -0.5995860695838928,
"logits/rejected": -0.555045485496521,
"logps/chosen": -65.98387908935547,
"logps/ref_chosen": -49.39221954345703,
"logps/ref_rejected": -75.79280090332031,
"logps/rejected": -122.10321807861328,
"loss": 0.4842,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1694258749485016,
"margin_dpo/beta_margin_grad_std": 0.21930669248104095,
"margin_dpo/beta_margin_mean": 2.9718756675720215,
"margin_dpo/loss_margin_mean": 29.7187557220459,
"margin_dpo/margin_mean": 29.7187557220459,
"margin_dpo/margin_std": 26.206937789916992,
"step": 312
},
{
"epoch": 0.45961820851688695,
"grad_norm": 57.107025146484375,
"learning_rate": 3.2944785489547537e-07,
"logits/chosen": -0.6909008026123047,
"logits/rejected": -0.6554454565048218,
"logps/chosen": -64.720458984375,
"logps/ref_chosen": -50.152740478515625,
"logps/ref_rejected": -86.40620422363281,
"logps/rejected": -126.65206909179688,
"loss": 0.6326,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21527621150016785,
"margin_dpo/beta_margin_grad_std": 0.22800129652023315,
"margin_dpo/beta_margin_mean": 2.567814350128174,
"margin_dpo/loss_margin_mean": 25.678142547607422,
"margin_dpo/margin_mean": 25.678142547607422,
"margin_dpo/margin_std": 26.8893985748291,
"step": 313
},
{
"epoch": 0.461086637298091,
"grad_norm": 58.55873489379883,
"learning_rate": 3.2823001015803857e-07,
"logits/chosen": -0.6338675022125244,
"logits/rejected": -0.609628438949585,
"logps/chosen": -72.63245391845703,
"logps/ref_chosen": -57.23758316040039,
"logps/ref_rejected": -97.59652709960938,
"logps/rejected": -138.97230529785156,
"loss": 0.5754,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2068512737751007,
"margin_dpo/beta_margin_grad_std": 0.22196519374847412,
"margin_dpo/beta_margin_mean": 2.598090648651123,
"margin_dpo/loss_margin_mean": 25.98090362548828,
"margin_dpo/margin_mean": 25.980905532836914,
"margin_dpo/margin_std": 25.633577346801758,
"step": 314
},
{
"epoch": 0.46255506607929514,
"grad_norm": 47.718597412109375,
"learning_rate": 3.270101039870797e-07,
"logits/chosen": -0.6039018630981445,
"logits/rejected": -0.5841349959373474,
"logps/chosen": -64.1601791381836,
"logps/ref_chosen": -49.06958770751953,
"logps/ref_rejected": -85.68087768554688,
"logps/rejected": -125.34097290039062,
"loss": 0.4916,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18752586841583252,
"margin_dpo/beta_margin_grad_std": 0.18681946396827698,
"margin_dpo/beta_margin_mean": 2.456951141357422,
"margin_dpo/loss_margin_mean": 24.56951141357422,
"margin_dpo/margin_mean": 24.56951141357422,
"margin_dpo/margin_std": 22.756999969482422,
"step": 315
},
{
"epoch": 0.46402349486049926,
"grad_norm": 51.623634338378906,
"learning_rate": 3.2578816852826086e-07,
"logits/chosen": -0.6183241605758667,
"logits/rejected": -0.6124423146247864,
"logps/chosen": -71.89447784423828,
"logps/ref_chosen": -54.26074981689453,
"logps/ref_rejected": -101.2814712524414,
"logps/rejected": -148.76461791992188,
"loss": 0.4158,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15852956473827362,
"margin_dpo/beta_margin_grad_std": 0.18403339385986328,
"margin_dpo/beta_margin_mean": 2.984943389892578,
"margin_dpo/loss_margin_mean": 29.84943389892578,
"margin_dpo/margin_mean": 29.84943389892578,
"margin_dpo/margin_std": 26.252422332763672,
"step": 316
},
{
"epoch": 0.4654919236417034,
"grad_norm": 38.064273834228516,
"learning_rate": 3.2456423598071783e-07,
"logits/chosen": -0.6562374830245972,
"logits/rejected": -0.6188434958457947,
"logps/chosen": -69.55624389648438,
"logps/ref_chosen": -56.094207763671875,
"logps/ref_rejected": -100.69905090332031,
"logps/rejected": -148.02902221679688,
"loss": 0.3529,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13426414132118225,
"margin_dpo/beta_margin_grad_std": 0.1786133348941803,
"margin_dpo/beta_margin_mean": 3.386793375015259,
"margin_dpo/loss_margin_mean": 33.86793518066406,
"margin_dpo/margin_mean": 33.86793518066406,
"margin_dpo/margin_std": 24.910192489624023,
"step": 317
},
{
"epoch": 0.4669603524229075,
"grad_norm": 47.10121154785156,
"learning_rate": 3.233383385962115e-07,
"logits/chosen": -0.6792348623275757,
"logits/rejected": -0.6371433138847351,
"logps/chosen": -77.5732421875,
"logps/ref_chosen": -64.64570617675781,
"logps/ref_rejected": -82.76425170898438,
"logps/rejected": -126.16979217529297,
"loss": 0.4256,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15739941596984863,
"margin_dpo/beta_margin_grad_std": 0.1940799355506897,
"margin_dpo/beta_margin_mean": 3.047799587249756,
"margin_dpo/loss_margin_mean": 30.47799301147461,
"margin_dpo/margin_mean": 30.47799301147461,
"margin_dpo/margin_std": 25.105358123779297,
"step": 318
},
{
"epoch": 0.4684287812041116,
"grad_norm": 41.460880279541016,
"learning_rate": 3.2211050867827805e-07,
"logits/chosen": -0.6292097568511963,
"logits/rejected": -0.6156477928161621,
"logps/chosen": -62.311241149902344,
"logps/ref_chosen": -49.383758544921875,
"logps/ref_rejected": -113.90650939941406,
"logps/rejected": -156.33575439453125,
"loss": 0.363,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1437537968158722,
"margin_dpo/beta_margin_grad_std": 0.17065343260765076,
"margin_dpo/beta_margin_mean": 2.9501757621765137,
"margin_dpo/loss_margin_mean": 29.501754760742188,
"margin_dpo/margin_mean": 29.501754760742188,
"margin_dpo/margin_std": 22.302837371826172,
"step": 319
},
{
"epoch": 0.4698972099853157,
"grad_norm": 51.28620910644531,
"learning_rate": 3.208807785813777e-07,
"logits/chosen": -0.6504048109054565,
"logits/rejected": -0.6378560066223145,
"logps/chosen": -74.24742126464844,
"logps/ref_chosen": -59.50489044189453,
"logps/ref_rejected": -97.66716766357422,
"logps/rejected": -139.2616729736328,
"loss": 0.4858,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17591118812561035,
"margin_dpo/beta_margin_grad_std": 0.1899009644985199,
"margin_dpo/beta_margin_mean": 2.685196876525879,
"margin_dpo/loss_margin_mean": 26.851966857910156,
"margin_dpo/margin_mean": 26.85196876525879,
"margin_dpo/margin_std": 23.467105865478516,
"step": 320
},
{
"epoch": 0.4713656387665198,
"grad_norm": 72.78955078125,
"learning_rate": 3.1964918071004217e-07,
"logits/chosen": -0.6326063871383667,
"logits/rejected": -0.5948277711868286,
"logps/chosen": -80.62930297851562,
"logps/ref_chosen": -61.548683166503906,
"logps/ref_rejected": -91.64103698730469,
"logps/rejected": -136.67556762695312,
"loss": 0.7106,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22593584656715393,
"margin_dpo/beta_margin_grad_std": 0.2594347894191742,
"margin_dpo/beta_margin_mean": 2.595390558242798,
"margin_dpo/loss_margin_mean": 25.95390510559082,
"margin_dpo/margin_mean": 25.953907012939453,
"margin_dpo/margin_std": 27.37790298461914,
"step": 321
},
{
"epoch": 0.47283406754772395,
"grad_norm": 53.10894775390625,
"learning_rate": 3.184157475180207e-07,
"logits/chosen": -0.6191203594207764,
"logits/rejected": -0.597158670425415,
"logps/chosen": -72.91082763671875,
"logps/ref_chosen": -57.29003143310547,
"logps/ref_rejected": -95.74992370605469,
"logps/rejected": -143.1072235107422,
"loss": 0.4304,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16689737141132355,
"margin_dpo/beta_margin_grad_std": 0.18274548649787903,
"margin_dpo/beta_margin_mean": 3.173649311065674,
"margin_dpo/loss_margin_mean": 31.736494064331055,
"margin_dpo/margin_mean": 31.736492156982422,
"margin_dpo/margin_std": 27.840457916259766,
"step": 322
},
{
"epoch": 0.47430249632892807,
"grad_norm": 46.58567428588867,
"learning_rate": 3.171805115074251e-07,
"logits/chosen": -0.6087906360626221,
"logits/rejected": -0.5820388197898865,
"logps/chosen": -66.820556640625,
"logps/ref_chosen": -51.23395919799805,
"logps/ref_rejected": -75.06192016601562,
"logps/rejected": -121.96331787109375,
"loss": 0.422,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15601900219917297,
"margin_dpo/beta_margin_grad_std": 0.19936603307724,
"margin_dpo/beta_margin_mean": 3.1314802169799805,
"margin_dpo/loss_margin_mean": 31.314802169799805,
"margin_dpo/margin_mean": 31.314802169799805,
"margin_dpo/margin_std": 25.376670837402344,
"step": 323
},
{
"epoch": 0.47577092511013214,
"grad_norm": 56.77102279663086,
"learning_rate": 3.1594350522787295e-07,
"logits/chosen": -0.6091630458831787,
"logits/rejected": -0.5578924417495728,
"logps/chosen": -82.68987274169922,
"logps/ref_chosen": -65.13516998291016,
"logps/ref_rejected": -86.47750091552734,
"logps/rejected": -133.79421997070312,
"loss": 0.4592,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16746217012405396,
"margin_dpo/beta_margin_grad_std": 0.20326107740402222,
"margin_dpo/beta_margin_mean": 2.9762015342712402,
"margin_dpo/loss_margin_mean": 29.762012481689453,
"margin_dpo/margin_mean": 29.762012481689453,
"margin_dpo/margin_std": 25.058231353759766,
"step": 324
},
{
"epoch": 0.47723935389133626,
"grad_norm": 43.94253158569336,
"learning_rate": 3.147047612756302e-07,
"logits/chosen": -0.625763475894928,
"logits/rejected": -0.5584316253662109,
"logps/chosen": -70.6198501586914,
"logps/ref_chosen": -56.215599060058594,
"logps/ref_rejected": -70.0859375,
"logps/rejected": -113.17784118652344,
"loss": 0.4258,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16012780368328094,
"margin_dpo/beta_margin_grad_std": 0.19097131490707397,
"margin_dpo/beta_margin_mean": 2.868764877319336,
"margin_dpo/loss_margin_mean": 28.68764877319336,
"margin_dpo/margin_mean": 28.68764877319336,
"margin_dpo/margin_std": 21.614681243896484,
"step": 325
},
{
"epoch": 0.4787077826725404,
"grad_norm": 57.078155517578125,
"learning_rate": 3.134643122927519e-07,
"logits/chosen": -0.670049250125885,
"logits/rejected": -0.6241730451583862,
"logps/chosen": -90.87605285644531,
"logps/ref_chosen": -72.72496032714844,
"logps/ref_rejected": -79.84678649902344,
"logps/rejected": -123.93955993652344,
"loss": 0.5032,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19036465883255005,
"margin_dpo/beta_margin_grad_std": 0.20020395517349243,
"margin_dpo/beta_margin_mean": 2.5941686630249023,
"margin_dpo/loss_margin_mean": 25.94168472290039,
"margin_dpo/margin_mean": 25.941686630249023,
"margin_dpo/margin_std": 24.112701416015625,
"step": 326
},
{
"epoch": 0.4801762114537445,
"grad_norm": 48.156856536865234,
"learning_rate": 3.1222219096622264e-07,
"logits/chosen": -0.6242316365242004,
"logits/rejected": -0.5801475048065186,
"logps/chosen": -84.52735900878906,
"logps/ref_chosen": -69.13441467285156,
"logps/ref_rejected": -111.93377685546875,
"logps/rejected": -164.81890869140625,
"loss": 0.2858,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1086474135518074,
"margin_dpo/beta_margin_grad_std": 0.16801781952381134,
"margin_dpo/beta_margin_mean": 3.749218463897705,
"margin_dpo/loss_margin_mean": 37.492183685302734,
"margin_dpo/margin_mean": 37.492183685302734,
"margin_dpo/margin_std": 24.61020278930664,
"step": 327
},
{
"epoch": 0.48164464023494863,
"grad_norm": 53.24053192138672,
"learning_rate": 3.1097843002709427e-07,
"logits/chosen": -0.631726861000061,
"logits/rejected": -0.6111768484115601,
"logps/chosen": -78.70730590820312,
"logps/ref_chosen": -59.68719482421875,
"logps/ref_rejected": -90.85499572753906,
"logps/rejected": -137.83163452148438,
"loss": 0.476,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17879313230514526,
"margin_dpo/beta_margin_grad_std": 0.1966237723827362,
"margin_dpo/beta_margin_mean": 2.7956528663635254,
"margin_dpo/loss_margin_mean": 27.956527709960938,
"margin_dpo/margin_mean": 27.956527709960938,
"margin_dpo/margin_std": 25.300691604614258,
"step": 328
},
{
"epoch": 0.4831130690161527,
"grad_norm": 60.33453369140625,
"learning_rate": 3.0973306224962437e-07,
"logits/chosen": -0.6395320892333984,
"logits/rejected": -0.5939961671829224,
"logps/chosen": -82.35140991210938,
"logps/ref_chosen": -65.2461929321289,
"logps/ref_rejected": -100.69770812988281,
"logps/rejected": -155.14166259765625,
"loss": 0.3708,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12009057402610779,
"margin_dpo/beta_margin_grad_std": 0.18349771201610565,
"margin_dpo/beta_margin_mean": 3.733874797821045,
"margin_dpo/loss_margin_mean": 37.3387451171875,
"margin_dpo/margin_mean": 37.3387451171875,
"margin_dpo/margin_std": 26.804096221923828,
"step": 329
},
{
"epoch": 0.4845814977973568,
"grad_norm": 48.900360107421875,
"learning_rate": 3.084861204504122e-07,
"logits/chosen": -0.5813232660293579,
"logits/rejected": -0.5687066316604614,
"logps/chosen": -64.73994445800781,
"logps/ref_chosen": -46.998348236083984,
"logps/ref_rejected": -86.87684631347656,
"logps/rejected": -136.06546020507812,
"loss": 0.4066,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14167913794517517,
"margin_dpo/beta_margin_grad_std": 0.186563640832901,
"margin_dpo/beta_margin_mean": 3.1447031497955322,
"margin_dpo/loss_margin_mean": 31.44702911376953,
"margin_dpo/margin_mean": 31.44702911376953,
"margin_dpo/margin_std": 24.540428161621094,
"step": 330
},
{
"epoch": 0.48604992657856094,
"grad_norm": 37.78254699707031,
"learning_rate": 3.072376374875335e-07,
"logits/chosen": -0.6245772838592529,
"logits/rejected": -0.5876287221908569,
"logps/chosen": -66.91592407226562,
"logps/ref_chosen": -50.52424621582031,
"logps/ref_rejected": -89.01544189453125,
"logps/rejected": -139.790283203125,
"loss": 0.2587,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10614001750946045,
"margin_dpo/beta_margin_grad_std": 0.14252358675003052,
"margin_dpo/beta_margin_mean": 3.438317060470581,
"margin_dpo/loss_margin_mean": 34.38317108154297,
"margin_dpo/margin_mean": 34.38317108154297,
"margin_dpo/margin_std": 23.31448745727539,
"step": 331
},
{
"epoch": 0.48751835535976507,
"grad_norm": 50.14997100830078,
"learning_rate": 3.059876462596758e-07,
"logits/chosen": -0.648339033126831,
"logits/rejected": -0.6188260316848755,
"logps/chosen": -67.62520599365234,
"logps/ref_chosen": -49.18028259277344,
"logps/ref_rejected": -76.48515319824219,
"logps/rejected": -120.72598266601562,
"loss": 0.5397,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1846732795238495,
"margin_dpo/beta_margin_grad_std": 0.21404841542243958,
"margin_dpo/beta_margin_mean": 2.579591751098633,
"margin_dpo/loss_margin_mean": 25.795917510986328,
"margin_dpo/margin_mean": 25.795917510986328,
"margin_dpo/margin_std": 22.294769287109375,
"step": 332
},
{
"epoch": 0.4889867841409692,
"grad_norm": 64.34756469726562,
"learning_rate": 3.0473617970527015e-07,
"logits/chosen": -0.5994927883148193,
"logits/rejected": -0.5866981744766235,
"logps/chosen": -83.70030212402344,
"logps/ref_chosen": -63.75574493408203,
"logps/ref_rejected": -95.04411315917969,
"logps/rejected": -147.88723754882812,
"loss": 0.5269,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1775931566953659,
"margin_dpo/beta_margin_grad_std": 0.22904759645462036,
"margin_dpo/beta_margin_mean": 3.2898573875427246,
"margin_dpo/loss_margin_mean": 32.8985710144043,
"margin_dpo/margin_mean": 32.8985710144043,
"margin_dpo/margin_std": 28.806324005126953,
"step": 333
},
{
"epoch": 0.49045521292217326,
"grad_norm": 47.576019287109375,
"learning_rate": 3.034832708016243e-07,
"logits/chosen": -0.5982068777084351,
"logits/rejected": -0.5783542394638062,
"logps/chosen": -87.60667419433594,
"logps/ref_chosen": -66.97975158691406,
"logps/ref_rejected": -95.31692504882812,
"logps/rejected": -147.35403442382812,
"loss": 0.3524,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12977060675621033,
"margin_dpo/beta_margin_grad_std": 0.18269598484039307,
"margin_dpo/beta_margin_mean": 3.141017198562622,
"margin_dpo/loss_margin_mean": 31.410171508789062,
"margin_dpo/margin_mean": 31.410171508789062,
"margin_dpo/margin_std": 22.10809326171875,
"step": 334
},
{
"epoch": 0.4919236417033774,
"grad_norm": 60.33529281616211,
"learning_rate": 3.022289525640531e-07,
"logits/chosen": -0.6425787210464478,
"logits/rejected": -0.6166863441467285,
"logps/chosen": -80.82369995117188,
"logps/ref_chosen": -62.54248046875,
"logps/ref_rejected": -87.6176986694336,
"logps/rejected": -133.60256958007812,
"loss": 0.5288,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1897895336151123,
"margin_dpo/beta_margin_grad_std": 0.2193058580160141,
"margin_dpo/beta_margin_mean": 2.770364284515381,
"margin_dpo/loss_margin_mean": 27.703643798828125,
"margin_dpo/margin_mean": 27.703643798828125,
"margin_dpo/margin_std": 25.8885555267334,
"step": 335
},
{
"epoch": 0.4933920704845815,
"grad_norm": 64.78392791748047,
"learning_rate": 3.009732580450086e-07,
"logits/chosen": -0.6276768445968628,
"logits/rejected": -0.6149314641952515,
"logps/chosen": -74.1087646484375,
"logps/ref_chosen": -54.531150817871094,
"logps/ref_rejected": -104.40424346923828,
"logps/rejected": -158.13856506347656,
"loss": 0.4812,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13777320086956024,
"margin_dpo/beta_margin_grad_std": 0.21476463973522186,
"margin_dpo/beta_margin_mean": 3.4156715869903564,
"margin_dpo/loss_margin_mean": 34.156715393066406,
"margin_dpo/margin_mean": 34.156715393066406,
"margin_dpo/margin_std": 28.737443923950195,
"step": 336
},
{
"epoch": 0.4948604992657856,
"grad_norm": 56.005924224853516,
"learning_rate": 2.9971622033320914e-07,
"logits/chosen": -0.6466571092605591,
"logits/rejected": -0.6224143505096436,
"logps/chosen": -82.91951751708984,
"logps/ref_chosen": -65.12869262695312,
"logps/ref_rejected": -101.72701263427734,
"logps/rejected": -150.4021759033203,
"loss": 0.3742,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14107711613178253,
"margin_dpo/beta_margin_grad_std": 0.1787952482700348,
"margin_dpo/beta_margin_mean": 3.0884342193603516,
"margin_dpo/loss_margin_mean": 30.88433837890625,
"margin_dpo/margin_mean": 30.884340286254883,
"margin_dpo/margin_std": 22.561031341552734,
"step": 337
},
{
"epoch": 0.49632892804698975,
"grad_norm": 53.44011306762695,
"learning_rate": 2.984578725527675e-07,
"logits/chosen": -0.6315950155258179,
"logits/rejected": -0.6056466102600098,
"logps/chosen": -78.76591491699219,
"logps/ref_chosen": -58.422706604003906,
"logps/ref_rejected": -89.06854248046875,
"logps/rejected": -140.06710815429688,
"loss": 0.3915,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15517953038215637,
"margin_dpo/beta_margin_grad_std": 0.1764228343963623,
"margin_dpo/beta_margin_mean": 3.065535545349121,
"margin_dpo/loss_margin_mean": 30.655353546142578,
"margin_dpo/margin_mean": 30.655353546142578,
"margin_dpo/margin_std": 24.606985092163086,
"step": 338
},
{
"epoch": 0.4977973568281938,
"grad_norm": 42.09189987182617,
"learning_rate": 2.9719824786231796e-07,
"logits/chosen": -0.7080618143081665,
"logits/rejected": -0.6741960048675537,
"logps/chosen": -77.78138732910156,
"logps/ref_chosen": -59.99531555175781,
"logps/ref_rejected": -103.9109115600586,
"logps/rejected": -156.21578979492188,
"loss": 0.3539,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13084164261817932,
"margin_dpo/beta_margin_grad_std": 0.18853557109832764,
"margin_dpo/beta_margin_mean": 3.4518818855285645,
"margin_dpo/loss_margin_mean": 34.51881790161133,
"margin_dpo/margin_mean": 34.51881790161133,
"margin_dpo/margin_std": 25.89126205444336,
"step": 339
},
{
"epoch": 0.49926578560939794,
"grad_norm": 45.83633804321289,
"learning_rate": 2.959373794541426e-07,
"logits/chosen": -0.6209253072738647,
"logits/rejected": -0.5888671875,
"logps/chosen": -73.24740600585938,
"logps/ref_chosen": -52.83022689819336,
"logps/ref_rejected": -73.10723876953125,
"logps/rejected": -127.31039428710938,
"loss": 0.3856,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14801771938800812,
"margin_dpo/beta_margin_grad_std": 0.17904043197631836,
"margin_dpo/beta_margin_mean": 3.3785972595214844,
"margin_dpo/loss_margin_mean": 33.78596878051758,
"margin_dpo/margin_mean": 33.785972595214844,
"margin_dpo/margin_std": 30.059484481811523,
"step": 340
},
{
"epoch": 0.5007342143906021,
"grad_norm": 47.085533142089844,
"learning_rate": 2.946753005532965e-07,
"logits/chosen": -0.6055405735969543,
"logits/rejected": -0.5906496047973633,
"logps/chosen": -70.06444549560547,
"logps/ref_chosen": -47.899803161621094,
"logps/ref_rejected": -101.80987548828125,
"logps/rejected": -161.025390625,
"loss": 0.3145,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12136228382587433,
"margin_dpo/beta_margin_grad_std": 0.17082762718200684,
"margin_dpo/beta_margin_mean": 3.7050869464874268,
"margin_dpo/loss_margin_mean": 37.050865173339844,
"margin_dpo/margin_mean": 37.050865173339844,
"margin_dpo/margin_std": 26.06426429748535,
"step": 341
},
{
"epoch": 0.5022026431718062,
"grad_norm": 70.15333557128906,
"learning_rate": 2.934120444167326e-07,
"logits/chosen": -0.5826171040534973,
"logits/rejected": -0.5372592210769653,
"logps/chosen": -90.79473114013672,
"logps/ref_chosen": -71.99664306640625,
"logps/ref_rejected": -92.58959197998047,
"logps/rejected": -143.8526611328125,
"loss": 0.452,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1657615751028061,
"margin_dpo/beta_margin_grad_std": 0.21393808722496033,
"margin_dpo/beta_margin_mean": 3.2464988231658936,
"margin_dpo/loss_margin_mean": 32.464988708496094,
"margin_dpo/margin_mean": 32.464988708496094,
"margin_dpo/margin_std": 28.956148147583008,
"step": 342
},
{
"epoch": 0.5036710719530103,
"grad_norm": 59.99635696411133,
"learning_rate": 2.9214764433242476e-07,
"logits/chosen": -0.6327919363975525,
"logits/rejected": -0.6083285808563232,
"logps/chosen": -71.64889526367188,
"logps/ref_chosen": -54.40562438964844,
"logps/ref_rejected": -111.04141998291016,
"logps/rejected": -162.94818115234375,
"loss": 0.3762,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13154786825180054,
"margin_dpo/beta_margin_grad_std": 0.1953406035900116,
"margin_dpo/beta_margin_mean": 3.46634840965271,
"margin_dpo/loss_margin_mean": 34.663482666015625,
"margin_dpo/margin_mean": 34.663482666015625,
"margin_dpo/margin_std": 24.739078521728516,
"step": 343
},
{
"epoch": 0.5051395007342144,
"grad_norm": 60.24159622192383,
"learning_rate": 2.9088213361849126e-07,
"logits/chosen": -0.60174560546875,
"logits/rejected": -0.5771138072013855,
"logps/chosen": -74.28924560546875,
"logps/ref_chosen": -53.96466827392578,
"logps/ref_rejected": -90.62336730957031,
"logps/rejected": -139.2759246826172,
"loss": 0.5701,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19683772325515747,
"margin_dpo/beta_margin_grad_std": 0.23028729856014252,
"margin_dpo/beta_margin_mean": 2.8327980041503906,
"margin_dpo/loss_margin_mean": 28.327980041503906,
"margin_dpo/margin_mean": 28.327980041503906,
"margin_dpo/margin_std": 28.41692543029785,
"step": 344
},
{
"epoch": 0.5066079295154186,
"grad_norm": 52.972599029541016,
"learning_rate": 2.896155456223163e-07,
"logits/chosen": -0.6189597845077515,
"logits/rejected": -0.5859960317611694,
"logps/chosen": -81.26602172851562,
"logps/ref_chosen": -61.685699462890625,
"logps/ref_rejected": -99.49040985107422,
"logps/rejected": -153.17733764648438,
"loss": 0.385,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12680958211421967,
"margin_dpo/beta_margin_grad_std": 0.19373470544815063,
"margin_dpo/beta_margin_mean": 3.4106602668762207,
"margin_dpo/loss_margin_mean": 34.10660171508789,
"margin_dpo/margin_mean": 34.10660171508789,
"margin_dpo/margin_std": 26.537147521972656,
"step": 345
},
{
"epoch": 0.5080763582966226,
"grad_norm": 65.10262298583984,
"learning_rate": 2.883479137196714e-07,
"logits/chosen": -0.6390465497970581,
"logits/rejected": -0.6188012361526489,
"logps/chosen": -77.49059295654297,
"logps/ref_chosen": -55.256263732910156,
"logps/ref_rejected": -77.41532135009766,
"logps/rejected": -130.3055877685547,
"loss": 0.4883,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16579663753509521,
"margin_dpo/beta_margin_grad_std": 0.21993526816368103,
"margin_dpo/beta_margin_mean": 3.065593719482422,
"margin_dpo/loss_margin_mean": 30.655935287475586,
"margin_dpo/margin_mean": 30.655933380126953,
"margin_dpo/margin_std": 26.671146392822266,
"step": 346
},
{
"epoch": 0.5095447870778267,
"grad_norm": 58.82464599609375,
"learning_rate": 2.8707927131383614e-07,
"logits/chosen": -0.6383576393127441,
"logits/rejected": -0.5973784923553467,
"logps/chosen": -80.98310852050781,
"logps/ref_chosen": -57.56624221801758,
"logps/ref_rejected": -92.35508728027344,
"logps/rejected": -146.76962280273438,
"loss": 0.507,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15645217895507812,
"margin_dpo/beta_margin_grad_std": 0.2233504056930542,
"margin_dpo/beta_margin_mean": 3.099766731262207,
"margin_dpo/loss_margin_mean": 30.99766731262207,
"margin_dpo/margin_mean": 30.997665405273438,
"margin_dpo/margin_std": 27.08733367919922,
"step": 347
},
{
"epoch": 0.5110132158590308,
"grad_norm": 56.679996490478516,
"learning_rate": 2.858096518347179e-07,
"logits/chosen": -0.6223098635673523,
"logits/rejected": -0.5989496111869812,
"logps/chosen": -76.78868103027344,
"logps/ref_chosen": -56.31770324707031,
"logps/ref_rejected": -89.13837432861328,
"logps/rejected": -139.9176025390625,
"loss": 0.534,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18657389283180237,
"margin_dpo/beta_margin_grad_std": 0.2263115644454956,
"margin_dpo/beta_margin_mean": 3.0308241844177246,
"margin_dpo/loss_margin_mean": 30.308242797851562,
"margin_dpo/margin_mean": 30.308242797851562,
"margin_dpo/margin_std": 26.598758697509766,
"step": 348
},
{
"epoch": 0.5124816446402349,
"grad_norm": 74.86937713623047,
"learning_rate": 2.845390887379706e-07,
"logits/chosen": -0.6142607927322388,
"logits/rejected": -0.6013126373291016,
"logps/chosen": -76.69570922851562,
"logps/ref_chosen": -58.0255126953125,
"logps/ref_rejected": -97.50515747070312,
"logps/rejected": -142.0050506591797,
"loss": 0.7182,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.22203311324119568,
"margin_dpo/beta_margin_grad_std": 0.25500667095184326,
"margin_dpo/beta_margin_mean": 2.5829694271087646,
"margin_dpo/loss_margin_mean": 25.829692840576172,
"margin_dpo/margin_mean": 25.829696655273438,
"margin_dpo/margin_std": 29.345046997070312,
"step": 349
},
{
"epoch": 0.5139500734214391,
"grad_norm": 60.53803253173828,
"learning_rate": 2.8326761550411346e-07,
"logits/chosen": -0.6528929471969604,
"logits/rejected": -0.6275583505630493,
"logps/chosen": -83.62789916992188,
"logps/ref_chosen": -64.33049011230469,
"logps/ref_rejected": -89.87164306640625,
"logps/rejected": -136.76258850097656,
"loss": 0.6313,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20285803079605103,
"margin_dpo/beta_margin_grad_std": 0.24381616711616516,
"margin_dpo/beta_margin_mean": 2.7593541145324707,
"margin_dpo/loss_margin_mean": 27.59354019165039,
"margin_dpo/margin_mean": 27.59354019165039,
"margin_dpo/margin_std": 27.823108673095703,
"step": 350
},
{
"epoch": 0.5154185022026432,
"grad_norm": 45.59613800048828,
"learning_rate": 2.819952656376487e-07,
"logits/chosen": -0.5691178441047668,
"logits/rejected": -0.5438896417617798,
"logps/chosen": -77.93399810791016,
"logps/ref_chosen": -60.6721305847168,
"logps/ref_rejected": -101.5654296875,
"logps/rejected": -152.97235107421875,
"loss": 0.3518,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12643350660800934,
"margin_dpo/beta_margin_grad_std": 0.19213062524795532,
"margin_dpo/beta_margin_mean": 3.414506196975708,
"margin_dpo/loss_margin_mean": 34.145057678222656,
"margin_dpo/margin_mean": 34.14506149291992,
"margin_dpo/margin_std": 24.413818359375,
"step": 351
},
{
"epoch": 0.5168869309838473,
"grad_norm": 69.56964111328125,
"learning_rate": 2.8072207266617854e-07,
"logits/chosen": -0.6274293661117554,
"logits/rejected": -0.5914252996444702,
"logps/chosen": -88.6449203491211,
"logps/ref_chosen": -70.9434585571289,
"logps/ref_rejected": -76.6419677734375,
"logps/rejected": -121.17511749267578,
"loss": 0.5783,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20047959685325623,
"margin_dpo/beta_margin_grad_std": 0.2212606817483902,
"margin_dpo/beta_margin_mean": 2.683168888092041,
"margin_dpo/loss_margin_mean": 26.831687927246094,
"margin_dpo/margin_mean": 26.831687927246094,
"margin_dpo/margin_std": 27.10396385192871,
"step": 352
},
{
"epoch": 0.5183553597650514,
"grad_norm": 70.63945007324219,
"learning_rate": 2.794480701395219e-07,
"logits/chosen": -0.6379419565200806,
"logits/rejected": -0.610392689704895,
"logps/chosen": -78.57247924804688,
"logps/ref_chosen": -58.39533996582031,
"logps/ref_rejected": -80.33552551269531,
"logps/rejected": -127.21942138671875,
"loss": 0.7041,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2198677659034729,
"margin_dpo/beta_margin_grad_std": 0.26175159215927124,
"margin_dpo/beta_margin_mean": 2.6706738471984863,
"margin_dpo/loss_margin_mean": 26.706737518310547,
"margin_dpo/margin_mean": 26.706737518310547,
"margin_dpo/margin_std": 27.502460479736328,
"step": 353
},
{
"epoch": 0.5198237885462555,
"grad_norm": 39.9495964050293,
"learning_rate": 2.781732916288303e-07,
"logits/chosen": -0.6119546890258789,
"logits/rejected": -0.5863783359527588,
"logps/chosen": -76.71435546875,
"logps/ref_chosen": -59.80299377441406,
"logps/ref_rejected": -88.75750732421875,
"logps/rejected": -137.52517700195312,
"loss": 0.2944,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11870501935482025,
"margin_dpo/beta_margin_grad_std": 0.15505337715148926,
"margin_dpo/beta_margin_mean": 3.1856298446655273,
"margin_dpo/loss_margin_mean": 31.85629653930664,
"margin_dpo/margin_mean": 31.856294631958008,
"margin_dpo/margin_std": 21.562454223632812,
"step": 354
},
{
"epoch": 0.5212922173274597,
"grad_norm": 37.86518096923828,
"learning_rate": 2.7689777072570284e-07,
"logits/chosen": -0.6655494570732117,
"logits/rejected": -0.6307432055473328,
"logps/chosen": -70.75237274169922,
"logps/ref_chosen": -54.128501892089844,
"logps/ref_rejected": -82.40606689453125,
"logps/rejected": -132.5486297607422,
"loss": 0.3504,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1401645541191101,
"margin_dpo/beta_margin_grad_std": 0.16528445482254028,
"margin_dpo/beta_margin_mean": 3.351868152618408,
"margin_dpo/loss_margin_mean": 33.518680572509766,
"margin_dpo/margin_mean": 33.518680572509766,
"margin_dpo/margin_std": 28.362560272216797,
"step": 355
},
{
"epoch": 0.5227606461086637,
"grad_norm": 97.62612915039062,
"learning_rate": 2.7562154104130176e-07,
"logits/chosen": -0.6015282273292542,
"logits/rejected": -0.5701065063476562,
"logps/chosen": -86.73289489746094,
"logps/ref_chosen": -64.67381286621094,
"logps/ref_rejected": -75.89926147460938,
"logps/rejected": -120.73099517822266,
"loss": 0.8011,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.26123011112213135,
"margin_dpo/beta_margin_grad_std": 0.2479068785905838,
"margin_dpo/beta_margin_mean": 2.2772653102874756,
"margin_dpo/loss_margin_mean": 22.772653579711914,
"margin_dpo/margin_mean": 22.772653579711914,
"margin_dpo/margin_std": 27.33060073852539,
"step": 356
},
{
"epoch": 0.5242290748898678,
"grad_norm": 48.75430679321289,
"learning_rate": 2.7434463620546594e-07,
"logits/chosen": -0.6201961040496826,
"logits/rejected": -0.5882294178009033,
"logps/chosen": -70.5618896484375,
"logps/ref_chosen": -52.725799560546875,
"logps/ref_rejected": -86.84115600585938,
"logps/rejected": -136.1475372314453,
"loss": 0.3922,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13970120251178741,
"margin_dpo/beta_margin_grad_std": 0.18796856701374054,
"margin_dpo/beta_margin_mean": 3.147029399871826,
"margin_dpo/loss_margin_mean": 31.470294952392578,
"margin_dpo/margin_mean": 31.470294952392578,
"margin_dpo/margin_std": 23.818038940429688,
"step": 357
},
{
"epoch": 0.5256975036710719,
"grad_norm": 65.04510498046875,
"learning_rate": 2.730670898658255e-07,
"logits/chosen": -0.6239925622940063,
"logits/rejected": -0.5800461173057556,
"logps/chosen": -79.59387969970703,
"logps/ref_chosen": -63.20543670654297,
"logps/ref_rejected": -88.373291015625,
"logps/rejected": -133.9655303955078,
"loss": 0.4776,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17933997511863708,
"margin_dpo/beta_margin_grad_std": 0.1934494972229004,
"margin_dpo/beta_margin_mean": 2.920379161834717,
"margin_dpo/loss_margin_mean": 29.20379066467285,
"margin_dpo/margin_mean": 29.20378875732422,
"margin_dpo/margin_std": 27.63866424560547,
"step": 358
},
{
"epoch": 0.527165932452276,
"grad_norm": 62.99494934082031,
"learning_rate": 2.717889356869146e-07,
"logits/chosen": -0.5847325325012207,
"logits/rejected": -0.5495982766151428,
"logps/chosen": -78.43362426757812,
"logps/ref_chosen": -56.370216369628906,
"logps/ref_rejected": -82.17375183105469,
"logps/rejected": -136.6361083984375,
"loss": 0.4728,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15706944465637207,
"margin_dpo/beta_margin_grad_std": 0.21352404356002808,
"margin_dpo/beta_margin_mean": 3.2398953437805176,
"margin_dpo/loss_margin_mean": 32.39895248413086,
"margin_dpo/margin_mean": 32.39895248413086,
"margin_dpo/margin_std": 27.692176818847656,
"step": 359
},
{
"epoch": 0.5286343612334802,
"grad_norm": 44.829593658447266,
"learning_rate": 2.7051020734928443e-07,
"logits/chosen": -0.5741163492202759,
"logits/rejected": -0.5485746264457703,
"logps/chosen": -70.59879302978516,
"logps/ref_chosen": -51.460384368896484,
"logps/ref_rejected": -69.83892822265625,
"logps/rejected": -118.67765808105469,
"loss": 0.4042,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14649343490600586,
"margin_dpo/beta_margin_grad_std": 0.1764645129442215,
"margin_dpo/beta_margin_mean": 2.970031976699829,
"margin_dpo/loss_margin_mean": 29.700321197509766,
"margin_dpo/margin_mean": 29.700321197509766,
"margin_dpo/margin_std": 23.251190185546875,
"step": 360
},
{
"epoch": 0.5301027900146843,
"grad_norm": 60.9596061706543,
"learning_rate": 2.6923093854861593e-07,
"logits/chosen": -0.6095191240310669,
"logits/rejected": -0.5885258316993713,
"logps/chosen": -73.89006042480469,
"logps/ref_chosen": -53.86951446533203,
"logps/ref_rejected": -90.76925659179688,
"logps/rejected": -139.11050415039062,
"loss": 0.4981,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1759810894727707,
"margin_dpo/beta_margin_grad_std": 0.20826107263565063,
"margin_dpo/beta_margin_mean": 2.832070827484131,
"margin_dpo/loss_margin_mean": 28.320707321166992,
"margin_dpo/margin_mean": 28.320707321166992,
"margin_dpo/margin_std": 25.021095275878906,
"step": 361
},
{
"epoch": 0.5315712187958884,
"grad_norm": 54.746620178222656,
"learning_rate": 2.679511629948319e-07,
"logits/chosen": -0.6106045246124268,
"logits/rejected": -0.5950082540512085,
"logps/chosen": -78.94198608398438,
"logps/ref_chosen": -58.639060974121094,
"logps/ref_rejected": -105.58195495605469,
"logps/rejected": -159.86471557617188,
"loss": 0.4172,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14724624156951904,
"margin_dpo/beta_margin_grad_std": 0.20302033424377441,
"margin_dpo/beta_margin_mean": 3.3979835510253906,
"margin_dpo/loss_margin_mean": 33.979835510253906,
"margin_dpo/margin_mean": 33.97983169555664,
"margin_dpo/margin_std": 27.78476333618164,
"step": 362
},
{
"epoch": 0.5330396475770925,
"grad_norm": 81.81700897216797,
"learning_rate": 2.6667091441120816e-07,
"logits/chosen": -0.626789927482605,
"logits/rejected": -0.5806140899658203,
"logps/chosen": -62.119293212890625,
"logps/ref_chosen": -44.558380126953125,
"logps/ref_rejected": -74.69496154785156,
"logps/rejected": -131.403076171875,
"loss": 0.4021,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1342936009168625,
"margin_dpo/beta_margin_grad_std": 0.2032202035188675,
"margin_dpo/beta_margin_mean": 3.914720058441162,
"margin_dpo/loss_margin_mean": 39.14720153808594,
"margin_dpo/margin_mean": 39.14720153808594,
"margin_dpo/margin_std": 32.773658752441406,
"step": 363
},
{
"epoch": 0.5345080763582967,
"grad_norm": 70.45586395263672,
"learning_rate": 2.6539022653348575e-07,
"logits/chosen": -0.6278142929077148,
"logits/rejected": -0.6232542991638184,
"logps/chosen": -67.96247863769531,
"logps/ref_chosen": -48.894622802734375,
"logps/ref_rejected": -91.395751953125,
"logps/rejected": -138.64413452148438,
"loss": 0.5219,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18396377563476562,
"margin_dpo/beta_margin_grad_std": 0.21280619502067566,
"margin_dpo/beta_margin_mean": 2.818053722381592,
"margin_dpo/loss_margin_mean": 28.180538177490234,
"margin_dpo/margin_mean": 28.180538177490234,
"margin_dpo/margin_std": 26.407032012939453,
"step": 364
},
{
"epoch": 0.5359765051395007,
"grad_norm": 53.16273498535156,
"learning_rate": 2.641091331089811e-07,
"logits/chosen": -0.5910431146621704,
"logits/rejected": -0.5750705003738403,
"logps/chosen": -69.91105651855469,
"logps/ref_chosen": -51.49274444580078,
"logps/ref_rejected": -92.70166778564453,
"logps/rejected": -138.62091064453125,
"loss": 0.4635,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1818796843290329,
"margin_dpo/beta_margin_grad_std": 0.18291005492210388,
"margin_dpo/beta_margin_mean": 2.750092029571533,
"margin_dpo/loss_margin_mean": 27.500919342041016,
"margin_dpo/margin_mean": 27.500919342041016,
"margin_dpo/margin_std": 25.346105575561523,
"step": 365
},
{
"epoch": 0.5374449339207048,
"grad_norm": 41.03267288208008,
"learning_rate": 2.6282766789569736e-07,
"logits/chosen": -0.6280097365379333,
"logits/rejected": -0.6183122992515564,
"logps/chosen": -61.73809051513672,
"logps/ref_chosen": -44.7205696105957,
"logps/ref_rejected": -83.31040954589844,
"logps/rejected": -129.70077514648438,
"loss": 0.3847,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1519765853881836,
"margin_dpo/beta_margin_grad_std": 0.17253069579601288,
"margin_dpo/beta_margin_mean": 2.937284469604492,
"margin_dpo/loss_margin_mean": 29.372844696044922,
"margin_dpo/margin_mean": 29.372844696044922,
"margin_dpo/margin_std": 23.400222778320312,
"step": 366
},
{
"epoch": 0.5389133627019089,
"grad_norm": 55.751182556152344,
"learning_rate": 2.615458646614349e-07,
"logits/chosen": -0.5929805040359497,
"logits/rejected": -0.5748361945152283,
"logps/chosen": -77.30204010009766,
"logps/ref_chosen": -58.405418395996094,
"logps/ref_rejected": -76.75132751464844,
"logps/rejected": -121.19309997558594,
"loss": 0.5059,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1770220696926117,
"margin_dpo/beta_margin_grad_std": 0.2071405053138733,
"margin_dpo/beta_margin_mean": 2.5545148849487305,
"margin_dpo/loss_margin_mean": 25.545148849487305,
"margin_dpo/margin_mean": 25.545148849487305,
"margin_dpo/margin_std": 22.493253707885742,
"step": 367
},
{
"epoch": 0.540381791483113,
"grad_norm": 41.846797943115234,
"learning_rate": 2.6026375718290083e-07,
"logits/chosen": -0.6356101036071777,
"logits/rejected": -0.6200574040412903,
"logps/chosen": -61.09843444824219,
"logps/ref_chosen": -44.452518463134766,
"logps/ref_rejected": -98.55526733398438,
"logps/rejected": -145.97296142578125,
"loss": 0.3508,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13728934526443481,
"margin_dpo/beta_margin_grad_std": 0.17335599660873413,
"margin_dpo/beta_margin_mean": 3.0771780014038086,
"margin_dpo/loss_margin_mean": 30.771780014038086,
"margin_dpo/margin_mean": 30.771780014038086,
"margin_dpo/margin_std": 23.21100616455078,
"step": 368
},
{
"epoch": 0.5418502202643172,
"grad_norm": 67.84696960449219,
"learning_rate": 2.589813792448196e-07,
"logits/chosen": -0.6399098634719849,
"logits/rejected": -0.60102778673172,
"logps/chosen": -89.86668395996094,
"logps/ref_chosen": -71.38150024414062,
"logps/ref_rejected": -91.29582214355469,
"logps/rejected": -134.7001190185547,
"loss": 0.553,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19392737746238708,
"margin_dpo/beta_margin_grad_std": 0.22195757925510406,
"margin_dpo/beta_margin_mean": 2.4919116497039795,
"margin_dpo/loss_margin_mean": 24.919116973876953,
"margin_dpo/margin_mean": 24.91911506652832,
"margin_dpo/margin_std": 22.250526428222656,
"step": 369
},
{
"epoch": 0.5433186490455213,
"grad_norm": 53.89767074584961,
"learning_rate": 2.5769876463904263e-07,
"logits/chosen": -0.62095046043396,
"logits/rejected": -0.5922361016273499,
"logps/chosen": -90.54344177246094,
"logps/ref_chosen": -71.60749816894531,
"logps/ref_rejected": -97.25978088378906,
"logps/rejected": -141.17263793945312,
"loss": 0.5169,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1934899091720581,
"margin_dpo/beta_margin_grad_std": 0.2069862186908722,
"margin_dpo/beta_margin_mean": 2.4976892471313477,
"margin_dpo/loss_margin_mean": 24.976890563964844,
"margin_dpo/margin_mean": 24.976890563964844,
"margin_dpo/margin_std": 21.94351577758789,
"step": 370
},
{
"epoch": 0.5447870778267254,
"grad_norm": 65.36404418945312,
"learning_rate": 2.5641594716365744e-07,
"logits/chosen": -0.65543532371521,
"logits/rejected": -0.6313973665237427,
"logps/chosen": -89.23640441894531,
"logps/ref_chosen": -69.41448974609375,
"logps/ref_rejected": -99.17217254638672,
"logps/rejected": -146.0745849609375,
"loss": 0.6168,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19891506433486938,
"margin_dpo/beta_margin_grad_std": 0.2336684763431549,
"margin_dpo/beta_margin_mean": 2.7080492973327637,
"margin_dpo/loss_margin_mean": 27.080493927001953,
"margin_dpo/margin_mean": 27.08049201965332,
"margin_dpo/margin_std": 28.510940551757812,
"step": 371
},
{
"epoch": 0.5462555066079295,
"grad_norm": 55.08517074584961,
"learning_rate": 2.551329606220976e-07,
"logits/chosen": -0.6221505403518677,
"logits/rejected": -0.5715365409851074,
"logps/chosen": -81.12614440917969,
"logps/ref_chosen": -61.8179931640625,
"logps/ref_rejected": -78.53949737548828,
"logps/rejected": -129.13607788085938,
"loss": 0.5133,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17172211408615112,
"margin_dpo/beta_margin_grad_std": 0.2211245894432068,
"margin_dpo/beta_margin_mean": 3.1288440227508545,
"margin_dpo/loss_margin_mean": 31.288440704345703,
"margin_dpo/margin_mean": 31.288440704345703,
"margin_dpo/margin_std": 29.940040588378906,
"step": 372
},
{
"epoch": 0.5477239353891337,
"grad_norm": 59.45585250854492,
"learning_rate": 2.538498388222517e-07,
"logits/chosen": -0.6288785338401794,
"logits/rejected": -0.5830151438713074,
"logps/chosen": -85.13172912597656,
"logps/ref_chosen": -64.21713256835938,
"logps/ref_rejected": -85.95960998535156,
"logps/rejected": -139.53817749023438,
"loss": 0.4155,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1484554558992386,
"margin_dpo/beta_margin_grad_std": 0.19429174065589905,
"margin_dpo/beta_margin_mean": 3.266396999359131,
"margin_dpo/loss_margin_mean": 32.663970947265625,
"margin_dpo/margin_mean": 32.663970947265625,
"margin_dpo/margin_std": 25.845104217529297,
"step": 373
},
{
"epoch": 0.5491923641703378,
"grad_norm": 47.363040924072266,
"learning_rate": 2.525666155755725e-07,
"logits/chosen": -0.6621605157852173,
"logits/rejected": -0.6285480856895447,
"logps/chosen": -88.64697265625,
"logps/ref_chosen": -70.65017700195312,
"logps/ref_rejected": -93.64016723632812,
"logps/rejected": -141.2034912109375,
"loss": 0.4313,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1634691059589386,
"margin_dpo/beta_margin_grad_std": 0.18810805678367615,
"margin_dpo/beta_margin_mean": 2.9566543102264404,
"margin_dpo/loss_margin_mean": 29.566543579101562,
"margin_dpo/margin_mean": 29.566543579101562,
"margin_dpo/margin_std": 25.27297019958496,
"step": 374
},
{
"epoch": 0.5506607929515418,
"grad_norm": 53.13246536254883,
"learning_rate": 2.512833246961859e-07,
"logits/chosen": -0.5945202112197876,
"logits/rejected": -0.580052375793457,
"logps/chosen": -79.16407775878906,
"logps/ref_chosen": -60.080223083496094,
"logps/ref_rejected": -88.93830871582031,
"logps/rejected": -137.2688446044922,
"loss": 0.5153,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18427500128746033,
"margin_dpo/beta_margin_grad_std": 0.21660040318965912,
"margin_dpo/beta_margin_mean": 2.924668550491333,
"margin_dpo/loss_margin_mean": 29.246685028076172,
"margin_dpo/margin_mean": 29.246685028076172,
"margin_dpo/margin_std": 24.21230697631836,
"step": 375
},
{
"epoch": 0.5521292217327459,
"grad_norm": 48.868709564208984,
"learning_rate": 2.5e-07,
"logits/chosen": -0.5992149114608765,
"logits/rejected": -0.5802311897277832,
"logps/chosen": -81.73542785644531,
"logps/ref_chosen": -62.660308837890625,
"logps/ref_rejected": -105.526611328125,
"logps/rejected": -156.72482299804688,
"loss": 0.3949,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1482008844614029,
"margin_dpo/beta_margin_grad_std": 0.17604166269302368,
"margin_dpo/beta_margin_mean": 3.212308883666992,
"margin_dpo/loss_margin_mean": 32.12308883666992,
"margin_dpo/margin_mean": 32.12308883666992,
"margin_dpo/margin_std": 27.578922271728516,
"step": 376
},
{
"epoch": 0.55359765051395,
"grad_norm": 62.117244720458984,
"learning_rate": 2.487166753038141e-07,
"logits/chosen": -0.5650719404220581,
"logits/rejected": -0.548367977142334,
"logps/chosen": -76.00762939453125,
"logps/ref_chosen": -54.478736877441406,
"logps/ref_rejected": -98.70335388183594,
"logps/rejected": -149.87831115722656,
"loss": 0.5358,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18531596660614014,
"margin_dpo/beta_margin_grad_std": 0.2207948863506317,
"margin_dpo/beta_margin_mean": 2.9646058082580566,
"margin_dpo/loss_margin_mean": 29.646059036254883,
"margin_dpo/margin_mean": 29.646059036254883,
"margin_dpo/margin_std": 26.692546844482422,
"step": 377
},
{
"epoch": 0.5550660792951542,
"grad_norm": 44.018394470214844,
"learning_rate": 2.4743338442442754e-07,
"logits/chosen": -0.6136064529418945,
"logits/rejected": -0.600831151008606,
"logps/chosen": -62.556495666503906,
"logps/ref_chosen": -45.02053451538086,
"logps/ref_rejected": -88.0469741821289,
"logps/rejected": -137.45811462402344,
"loss": 0.4079,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13975557684898376,
"margin_dpo/beta_margin_grad_std": 0.2044086456298828,
"margin_dpo/beta_margin_mean": 3.1875176429748535,
"margin_dpo/loss_margin_mean": 31.87517547607422,
"margin_dpo/margin_mean": 31.87517547607422,
"margin_dpo/margin_std": 25.208221435546875,
"step": 378
},
{
"epoch": 0.5565345080763583,
"grad_norm": 55.18994903564453,
"learning_rate": 2.461501611777483e-07,
"logits/chosen": -0.6599289774894714,
"logits/rejected": -0.6502236127853394,
"logps/chosen": -72.3454818725586,
"logps/ref_chosen": -53.182098388671875,
"logps/ref_rejected": -114.30015563964844,
"logps/rejected": -166.52252197265625,
"loss": 0.4312,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1558556854724884,
"margin_dpo/beta_margin_grad_std": 0.19769342243671417,
"margin_dpo/beta_margin_mean": 3.305896759033203,
"margin_dpo/loss_margin_mean": 33.05896759033203,
"margin_dpo/margin_mean": 33.05896759033203,
"margin_dpo/margin_std": 27.922954559326172,
"step": 379
},
{
"epoch": 0.5580029368575624,
"grad_norm": 79.15997314453125,
"learning_rate": 2.4486703937790243e-07,
"logits/chosen": -0.5800139904022217,
"logits/rejected": -0.5870028138160706,
"logps/chosen": -74.11767578125,
"logps/ref_chosen": -51.3530387878418,
"logps/ref_rejected": -104.19169616699219,
"logps/rejected": -161.47531127929688,
"loss": 0.5708,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17511457204818726,
"margin_dpo/beta_margin_grad_std": 0.2577556371688843,
"margin_dpo/beta_margin_mean": 3.451897144317627,
"margin_dpo/loss_margin_mean": 34.51897430419922,
"margin_dpo/margin_mean": 34.51897430419922,
"margin_dpo/margin_std": 30.276945114135742,
"step": 380
},
{
"epoch": 0.5594713656387665,
"grad_norm": 63.70035171508789,
"learning_rate": 2.435840528363426e-07,
"logits/chosen": -0.5940126180648804,
"logits/rejected": -0.5524269342422485,
"logps/chosen": -79.10946655273438,
"logps/ref_chosen": -57.80306625366211,
"logps/ref_rejected": -79.21940612792969,
"logps/rejected": -134.66360473632812,
"loss": 0.5249,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1618008017539978,
"margin_dpo/beta_margin_grad_std": 0.22977310419082642,
"margin_dpo/beta_margin_mean": 3.4137802124023438,
"margin_dpo/loss_margin_mean": 34.13780212402344,
"margin_dpo/margin_mean": 34.13780212402344,
"margin_dpo/margin_std": 30.301250457763672,
"step": 381
},
{
"epoch": 0.5609397944199707,
"grad_norm": 55.583091735839844,
"learning_rate": 2.4230123536095745e-07,
"logits/chosen": -0.6578388214111328,
"logits/rejected": -0.6236182451248169,
"logps/chosen": -84.37736511230469,
"logps/ref_chosen": -66.02030181884766,
"logps/ref_rejected": -110.71015930175781,
"logps/rejected": -164.35707092285156,
"loss": 0.402,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13427521288394928,
"margin_dpo/beta_margin_grad_std": 0.20077002048492432,
"margin_dpo/beta_margin_mean": 3.528985023498535,
"margin_dpo/loss_margin_mean": 35.28984832763672,
"margin_dpo/margin_mean": 35.28984832763672,
"margin_dpo/margin_std": 27.51814079284668,
"step": 382
},
{
"epoch": 0.5624082232011748,
"grad_norm": 53.64909744262695,
"learning_rate": 2.4101862075518037e-07,
"logits/chosen": -0.621538519859314,
"logits/rejected": -0.6095184087753296,
"logps/chosen": -71.61962890625,
"logps/ref_chosen": -50.39148712158203,
"logps/ref_rejected": -93.71589660644531,
"logps/rejected": -147.1136016845703,
"loss": 0.3707,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1415613293647766,
"margin_dpo/beta_margin_grad_std": 0.18154266476631165,
"margin_dpo/beta_margin_mean": 3.2169556617736816,
"margin_dpo/loss_margin_mean": 32.1695556640625,
"margin_dpo/margin_mean": 32.1695556640625,
"margin_dpo/margin_std": 25.299137115478516,
"step": 383
},
{
"epoch": 0.5638766519823789,
"grad_norm": 49.92765426635742,
"learning_rate": 2.397362428170992e-07,
"logits/chosen": -0.6100102663040161,
"logits/rejected": -0.5814231038093567,
"logps/chosen": -73.15731811523438,
"logps/ref_chosen": -52.046104431152344,
"logps/ref_rejected": -85.76089477539062,
"logps/rejected": -138.78662109375,
"loss": 0.4752,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17313042283058167,
"margin_dpo/beta_margin_grad_std": 0.20930971205234528,
"margin_dpo/beta_margin_mean": 3.191450357437134,
"margin_dpo/loss_margin_mean": 31.914501190185547,
"margin_dpo/margin_mean": 31.914505004882812,
"margin_dpo/margin_std": 31.282188415527344,
"step": 384
},
{
"epoch": 0.5653450807635829,
"grad_norm": 57.458656311035156,
"learning_rate": 2.3845413533856514e-07,
"logits/chosen": -0.636741578578949,
"logits/rejected": -0.5834276676177979,
"logps/chosen": -83.70286560058594,
"logps/ref_chosen": -65.55216217041016,
"logps/ref_rejected": -77.82792663574219,
"logps/rejected": -124.51187133789062,
"loss": 0.4448,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1649201214313507,
"margin_dpo/beta_margin_grad_std": 0.20112237334251404,
"margin_dpo/beta_margin_mean": 2.8533244132995605,
"margin_dpo/loss_margin_mean": 28.533245086669922,
"margin_dpo/margin_mean": 28.533245086669922,
"margin_dpo/margin_std": 21.985557556152344,
"step": 385
},
{
"epoch": 0.566813509544787,
"grad_norm": 65.1162109375,
"learning_rate": 2.3717233210430254e-07,
"logits/chosen": -0.5937461853027344,
"logits/rejected": -0.5639574527740479,
"logps/chosen": -79.37002563476562,
"logps/ref_chosen": -58.22185516357422,
"logps/ref_rejected": -92.32742309570312,
"logps/rejected": -147.26278686523438,
"loss": 0.3465,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12259222567081451,
"margin_dpo/beta_margin_grad_std": 0.18382999300956726,
"margin_dpo/beta_margin_mean": 3.3787193298339844,
"margin_dpo/loss_margin_mean": 33.787193298339844,
"margin_dpo/margin_mean": 33.787193298339844,
"margin_dpo/margin_std": 23.929697036743164,
"step": 386
},
{
"epoch": 0.5682819383259912,
"grad_norm": 72.24935150146484,
"learning_rate": 2.3589086689101889e-07,
"logits/chosen": -0.6583748459815979,
"logits/rejected": -0.6081060171127319,
"logps/chosen": -84.46687316894531,
"logps/ref_chosen": -66.41944885253906,
"logps/ref_rejected": -92.16915893554688,
"logps/rejected": -139.08279418945312,
"loss": 0.4609,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16438427567481995,
"margin_dpo/beta_margin_grad_std": 0.20714232325553894,
"margin_dpo/beta_margin_mean": 2.886620044708252,
"margin_dpo/loss_margin_mean": 28.866199493408203,
"margin_dpo/margin_mean": 28.86620330810547,
"margin_dpo/margin_std": 22.799579620361328,
"step": 387
},
{
"epoch": 0.5697503671071953,
"grad_norm": 49.13148880004883,
"learning_rate": 2.3460977346651428e-07,
"logits/chosen": -0.6356014013290405,
"logits/rejected": -0.6344074010848999,
"logps/chosen": -70.69566345214844,
"logps/ref_chosen": -50.129459381103516,
"logps/ref_rejected": -104.43305969238281,
"logps/rejected": -160.75782775878906,
"loss": 0.3715,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13866804540157318,
"margin_dpo/beta_margin_grad_std": 0.18035584688186646,
"margin_dpo/beta_margin_mean": 3.5758562088012695,
"margin_dpo/loss_margin_mean": 35.75856018066406,
"margin_dpo/margin_mean": 35.75856018066406,
"margin_dpo/margin_std": 29.00539779663086,
"step": 388
},
{
"epoch": 0.5712187958883994,
"grad_norm": 39.877593994140625,
"learning_rate": 2.3332908558879177e-07,
"logits/chosen": -0.670194149017334,
"logits/rejected": -0.6247744560241699,
"logps/chosen": -76.81654357910156,
"logps/ref_chosen": -57.906593322753906,
"logps/ref_rejected": -77.91454315185547,
"logps/rejected": -130.85861206054688,
"loss": 0.3492,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13106092810630798,
"margin_dpo/beta_margin_grad_std": 0.18352819979190826,
"margin_dpo/beta_margin_mean": 3.403412103652954,
"margin_dpo/loss_margin_mean": 34.03411865234375,
"margin_dpo/margin_mean": 34.03411865234375,
"margin_dpo/margin_std": 25.96773338317871,
"step": 389
},
{
"epoch": 0.5726872246696035,
"grad_norm": 65.6811752319336,
"learning_rate": 2.320488370051681e-07,
"logits/chosen": -0.5949693322181702,
"logits/rejected": -0.5706865787506104,
"logps/chosen": -70.46009826660156,
"logps/ref_chosen": -49.22591781616211,
"logps/ref_rejected": -85.5281982421875,
"logps/rejected": -137.94676208496094,
"loss": 0.5262,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16779480874538422,
"margin_dpo/beta_margin_grad_std": 0.23410001397132874,
"margin_dpo/beta_margin_mean": 3.118438243865967,
"margin_dpo/loss_margin_mean": 31.18438148498535,
"margin_dpo/margin_mean": 31.18438148498535,
"margin_dpo/margin_std": 27.223957061767578,
"step": 390
},
{
"epoch": 0.5741556534508077,
"grad_norm": 59.37446594238281,
"learning_rate": 2.3076906145138405e-07,
"logits/chosen": -0.6412761211395264,
"logits/rejected": -0.6216508150100708,
"logps/chosen": -86.92506408691406,
"logps/ref_chosen": -64.32965087890625,
"logps/ref_rejected": -86.73820495605469,
"logps/rejected": -137.03587341308594,
"loss": 0.5183,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18732406198978424,
"margin_dpo/beta_margin_grad_std": 0.2125789225101471,
"margin_dpo/beta_margin_mean": 2.7702255249023438,
"margin_dpo/loss_margin_mean": 27.702255249023438,
"margin_dpo/margin_mean": 27.702255249023438,
"margin_dpo/margin_std": 26.598276138305664,
"step": 391
},
{
"epoch": 0.5756240822320118,
"grad_norm": 42.18976974487305,
"learning_rate": 2.294897926507156e-07,
"logits/chosen": -0.6050982475280762,
"logits/rejected": -0.5827013850212097,
"logps/chosen": -71.68020629882812,
"logps/ref_chosen": -53.50397872924805,
"logps/ref_rejected": -102.34583282470703,
"logps/rejected": -154.8188934326172,
"loss": 0.2982,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11888954043388367,
"margin_dpo/beta_margin_grad_std": 0.1541348695755005,
"margin_dpo/beta_margin_mean": 3.429682493209839,
"margin_dpo/loss_margin_mean": 34.29682540893555,
"margin_dpo/margin_mean": 34.29682540893555,
"margin_dpo/margin_std": 25.064613342285156,
"step": 392
},
{
"epoch": 0.5770925110132159,
"grad_norm": 57.39728927612305,
"learning_rate": 2.2821106431308543e-07,
"logits/chosen": -0.572903037071228,
"logits/rejected": -0.543228268623352,
"logps/chosen": -65.63607788085938,
"logps/ref_chosen": -46.473915100097656,
"logps/ref_rejected": -71.96885681152344,
"logps/rejected": -118.71408081054688,
"loss": 0.5349,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19703662395477295,
"margin_dpo/beta_margin_grad_std": 0.2085367888212204,
"margin_dpo/beta_margin_mean": 2.7583065032958984,
"margin_dpo/loss_margin_mean": 27.583065032958984,
"margin_dpo/margin_mean": 27.583065032958984,
"margin_dpo/margin_std": 27.609420776367188,
"step": 393
},
{
"epoch": 0.57856093979442,
"grad_norm": 60.01694869995117,
"learning_rate": 2.2693291013417452e-07,
"logits/chosen": -0.6191369295120239,
"logits/rejected": -0.5974385738372803,
"logps/chosen": -71.70793151855469,
"logps/ref_chosen": -52.91154479980469,
"logps/ref_rejected": -90.82263946533203,
"logps/rejected": -140.04214477539062,
"loss": 0.577,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18772023916244507,
"margin_dpo/beta_margin_grad_std": 0.22964736819267273,
"margin_dpo/beta_margin_mean": 3.0423121452331543,
"margin_dpo/loss_margin_mean": 30.42312240600586,
"margin_dpo/margin_mean": 30.42312240600586,
"margin_dpo/margin_std": 31.186908721923828,
"step": 394
},
{
"epoch": 0.580029368575624,
"grad_norm": 45.10045623779297,
"learning_rate": 2.2565536379453404e-07,
"logits/chosen": -0.6933879852294922,
"logits/rejected": -0.6763237714767456,
"logps/chosen": -79.92889404296875,
"logps/ref_chosen": -62.546112060546875,
"logps/ref_rejected": -83.78262329101562,
"logps/rejected": -133.17141723632812,
"loss": 0.4329,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16615131497383118,
"margin_dpo/beta_margin_grad_std": 0.19662132859230042,
"margin_dpo/beta_margin_mean": 3.2006003856658936,
"margin_dpo/loss_margin_mean": 32.006004333496094,
"margin_dpo/margin_mean": 32.006004333496094,
"margin_dpo/margin_std": 26.82199478149414,
"step": 395
},
{
"epoch": 0.5814977973568282,
"grad_norm": 49.713191986083984,
"learning_rate": 2.2437845895869825e-07,
"logits/chosen": -0.6597648859024048,
"logits/rejected": -0.6144574284553528,
"logps/chosen": -88.35139465332031,
"logps/ref_chosen": -68.99594116210938,
"logps/ref_rejected": -88.64665985107422,
"logps/rejected": -139.371337890625,
"loss": 0.4051,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14218196272850037,
"margin_dpo/beta_margin_grad_std": 0.207365944981575,
"margin_dpo/beta_margin_mean": 3.1369237899780273,
"margin_dpo/loss_margin_mean": 31.369239807128906,
"margin_dpo/margin_mean": 31.369239807128906,
"margin_dpo/margin_std": 23.291423797607422,
"step": 396
},
{
"epoch": 0.5829662261380323,
"grad_norm": 45.640316009521484,
"learning_rate": 2.2310222927429716e-07,
"logits/chosen": -0.6247228384017944,
"logits/rejected": -0.5786880254745483,
"logps/chosen": -78.00132751464844,
"logps/ref_chosen": -61.27716827392578,
"logps/ref_rejected": -103.11612701416016,
"logps/rejected": -155.62109375,
"loss": 0.3668,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1343180537223816,
"margin_dpo/beta_margin_grad_std": 0.19318078458309174,
"margin_dpo/beta_margin_mean": 3.5780787467956543,
"margin_dpo/loss_margin_mean": 35.78078842163086,
"margin_dpo/margin_mean": 35.780784606933594,
"margin_dpo/margin_std": 27.38970184326172,
"step": 397
},
{
"epoch": 0.5844346549192364,
"grad_norm": 50.45404815673828,
"learning_rate": 2.2182670837116972e-07,
"logits/chosen": -0.6684058904647827,
"logits/rejected": -0.6429616212844849,
"logps/chosen": -87.1710205078125,
"logps/ref_chosen": -68.15155029296875,
"logps/ref_rejected": -108.52360534667969,
"logps/rejected": -158.55606079101562,
"loss": 0.3542,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13709759712219238,
"margin_dpo/beta_margin_grad_std": 0.17260834574699402,
"margin_dpo/beta_margin_mean": 3.1012983322143555,
"margin_dpo/loss_margin_mean": 31.012981414794922,
"margin_dpo/margin_mean": 31.012981414794922,
"margin_dpo/margin_std": 25.8978328704834,
"step": 398
},
{
"epoch": 0.5859030837004405,
"grad_norm": 56.667911529541016,
"learning_rate": 2.2055192986047804e-07,
"logits/chosen": -0.655005931854248,
"logits/rejected": -0.5906921625137329,
"logps/chosen": -77.88751220703125,
"logps/ref_chosen": -60.889801025390625,
"logps/ref_rejected": -77.96558380126953,
"logps/rejected": -129.66696166992188,
"loss": 0.4236,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14022482931613922,
"margin_dpo/beta_margin_grad_std": 0.2172231674194336,
"margin_dpo/beta_margin_mean": 3.470367431640625,
"margin_dpo/loss_margin_mean": 34.70367431640625,
"margin_dpo/margin_mean": 34.70367431640625,
"margin_dpo/margin_std": 25.53824234008789,
"step": 399
},
{
"epoch": 0.5873715124816447,
"grad_norm": 56.22902297973633,
"learning_rate": 2.192779273338215e-07,
"logits/chosen": -0.6668632626533508,
"logits/rejected": -0.6306219100952148,
"logps/chosen": -81.36758422851562,
"logps/ref_chosen": -63.64359664916992,
"logps/ref_rejected": -105.252685546875,
"logps/rejected": -161.2772674560547,
"loss": 0.4468,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1347055435180664,
"margin_dpo/beta_margin_grad_std": 0.20773792266845703,
"margin_dpo/beta_margin_mean": 3.830059289932251,
"margin_dpo/loss_margin_mean": 38.300594329833984,
"margin_dpo/margin_mean": 38.30059051513672,
"margin_dpo/margin_std": 30.820228576660156,
"step": 400
},
{
"epoch": 0.5873715124816447,
"eval_logits/chosen": -0.5647093653678894,
"eval_logits/rejected": -0.5334640741348267,
"eval_logps/chosen": -102.04676818847656,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -130.0719757080078,
"eval_loss": 0.42134976387023926,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.2686771750450134,
"eval_margin_dpo/beta_margin_grad_std": 0.2539796233177185,
"eval_margin_dpo/beta_margin_mean": 2.0278308391571045,
"eval_margin_dpo/loss_margin_mean": 20.27830696105957,
"eval_margin_dpo/margin_mean": 20.27830696105957,
"eval_margin_dpo/margin_std": 25.458209991455078,
"eval_runtime": 39.9217,
"eval_samples_per_second": 58.59,
"eval_steps_per_second": 1.854,
"step": 400
},
{
"epoch": 0.5888399412628488,
"grad_norm": 63.177490234375,
"learning_rate": 2.1800473436235136e-07,
"logits/chosen": -0.5337532758712769,
"logits/rejected": -0.5123304724693298,
"logps/chosen": -76.32991027832031,
"logps/ref_chosen": -57.16303253173828,
"logps/ref_rejected": -83.79249572753906,
"logps/rejected": -132.53762817382812,
"loss": 0.6636,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2089146077632904,
"margin_dpo/beta_margin_grad_std": 0.2529540956020355,
"margin_dpo/beta_margin_mean": 2.95782470703125,
"margin_dpo/loss_margin_mean": 29.5782470703125,
"margin_dpo/margin_mean": 29.5782470703125,
"margin_dpo/margin_std": 31.70156478881836,
"step": 401
},
{
"epoch": 0.5903083700440529,
"grad_norm": 25.056612014770508,
"learning_rate": 2.1673238449588665e-07,
"logits/chosen": -0.6563818454742432,
"logits/rejected": -0.6099350452423096,
"logps/chosen": -62.76488494873047,
"logps/ref_chosen": -50.74037170410156,
"logps/ref_rejected": -81.0460433959961,
"logps/rejected": -132.43775939941406,
"loss": 0.211,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.08416810631752014,
"margin_dpo/beta_margin_grad_std": 0.13652239739894867,
"margin_dpo/beta_margin_mean": 3.936720371246338,
"margin_dpo/loss_margin_mean": 39.36720275878906,
"margin_dpo/margin_mean": 39.36720275878906,
"margin_dpo/margin_std": 24.1574649810791,
"step": 402
},
{
"epoch": 0.591776798825257,
"grad_norm": 62.19951629638672,
"learning_rate": 2.154609112620295e-07,
"logits/chosen": -0.6823678016662598,
"logits/rejected": -0.6667909622192383,
"logps/chosen": -62.73232650756836,
"logps/ref_chosen": -47.14731216430664,
"logps/ref_rejected": -77.2666015625,
"logps/rejected": -123.16378021240234,
"loss": 0.5812,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1768760085105896,
"margin_dpo/beta_margin_grad_std": 0.22503289580345154,
"margin_dpo/beta_margin_mean": 3.0312163829803467,
"margin_dpo/loss_margin_mean": 30.312164306640625,
"margin_dpo/margin_mean": 30.312164306640625,
"margin_dpo/margin_std": 28.440311431884766,
"step": 403
},
{
"epoch": 0.593245227606461,
"grad_norm": 55.05302810668945,
"learning_rate": 2.1419034816528218e-07,
"logits/chosen": -0.6315656304359436,
"logits/rejected": -0.5993084907531738,
"logps/chosen": -63.717140197753906,
"logps/ref_chosen": -47.875274658203125,
"logps/ref_rejected": -77.15499877929688,
"logps/rejected": -123.78963470458984,
"loss": 0.5707,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18741407990455627,
"margin_dpo/beta_margin_grad_std": 0.23549535870552063,
"margin_dpo/beta_margin_mean": 3.0792763233184814,
"margin_dpo/loss_margin_mean": 30.792762756347656,
"margin_dpo/margin_mean": 30.792762756347656,
"margin_dpo/margin_std": 29.048046112060547,
"step": 404
},
{
"epoch": 0.5947136563876652,
"grad_norm": 68.80015563964844,
"learning_rate": 2.129207286861638e-07,
"logits/chosen": -0.5908911824226379,
"logits/rejected": -0.5618330240249634,
"logps/chosen": -84.89933776855469,
"logps/ref_chosen": -65.16290283203125,
"logps/ref_rejected": -87.18678283691406,
"logps/rejected": -137.16378784179688,
"loss": 0.5402,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18618471920490265,
"margin_dpo/beta_margin_grad_std": 0.2252415120601654,
"margin_dpo/beta_margin_mean": 3.024057149887085,
"margin_dpo/loss_margin_mean": 30.240570068359375,
"margin_dpo/margin_mean": 30.240571975708008,
"margin_dpo/margin_std": 27.378923416137695,
"step": 405
},
{
"epoch": 0.5961820851688693,
"grad_norm": 62.42378616333008,
"learning_rate": 2.1165208628032861e-07,
"logits/chosen": -0.633690357208252,
"logits/rejected": -0.6194950342178345,
"logps/chosen": -66.8834457397461,
"logps/ref_chosen": -49.740814208984375,
"logps/ref_rejected": -92.07862854003906,
"logps/rejected": -141.23345947265625,
"loss": 0.5398,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16698844730854034,
"margin_dpo/beta_margin_grad_std": 0.21780461072921753,
"margin_dpo/beta_margin_mean": 3.2012197971343994,
"margin_dpo/loss_margin_mean": 32.0121955871582,
"margin_dpo/margin_mean": 32.0121955871582,
"margin_dpo/margin_std": 28.075244903564453,
"step": 406
},
{
"epoch": 0.5976505139500734,
"grad_norm": 77.41171264648438,
"learning_rate": 2.1038445437768375e-07,
"logits/chosen": -0.6692545413970947,
"logits/rejected": -0.6265490055084229,
"logps/chosen": -72.6529541015625,
"logps/ref_chosen": -56.33069610595703,
"logps/ref_rejected": -77.5120849609375,
"logps/rejected": -126.45954895019531,
"loss": 0.6116,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1815359890460968,
"margin_dpo/beta_margin_grad_std": 0.24493393301963806,
"margin_dpo/beta_margin_mean": 3.2625207901000977,
"margin_dpo/loss_margin_mean": 32.625205993652344,
"margin_dpo/margin_mean": 32.625205993652344,
"margin_dpo/margin_std": 29.34493637084961,
"step": 407
},
{
"epoch": 0.5991189427312775,
"grad_norm": 63.60155487060547,
"learning_rate": 2.0911786638150872e-07,
"logits/chosen": -0.6971176862716675,
"logits/rejected": -0.6451106071472168,
"logps/chosen": -85.57447814941406,
"logps/ref_chosen": -69.789306640625,
"logps/ref_rejected": -90.09693908691406,
"logps/rejected": -133.64590454101562,
"loss": 0.6213,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20202887058258057,
"margin_dpo/beta_margin_grad_std": 0.24348929524421692,
"margin_dpo/beta_margin_mean": 2.7763803005218506,
"margin_dpo/loss_margin_mean": 27.76380157470703,
"margin_dpo/margin_mean": 27.76380157470703,
"margin_dpo/margin_std": 27.04732894897461,
"step": 408
},
{
"epoch": 0.6005873715124816,
"grad_norm": 49.4036750793457,
"learning_rate": 2.0785235566757517e-07,
"logits/chosen": -0.6140397787094116,
"logits/rejected": -0.582785964012146,
"logps/chosen": -84.53123474121094,
"logps/ref_chosen": -67.31744384765625,
"logps/ref_rejected": -84.904296875,
"logps/rejected": -133.21397399902344,
"loss": 0.4071,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15303273499011993,
"margin_dpo/beta_margin_grad_std": 0.18726620078086853,
"margin_dpo/beta_margin_mean": 3.109589099884033,
"margin_dpo/loss_margin_mean": 31.095890045166016,
"margin_dpo/margin_mean": 31.095890045166016,
"margin_dpo/margin_std": 25.748220443725586,
"step": 409
},
{
"epoch": 0.6020558002936858,
"grad_norm": 67.83360290527344,
"learning_rate": 2.065879555832674e-07,
"logits/chosen": -0.5768519043922424,
"logits/rejected": -0.5736193656921387,
"logps/chosen": -70.64789581298828,
"logps/ref_chosen": -51.465354919433594,
"logps/ref_rejected": -83.198974609375,
"logps/rejected": -130.64712524414062,
"loss": 0.585,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19843655824661255,
"margin_dpo/beta_margin_grad_std": 0.23984801769256592,
"margin_dpo/beta_margin_mean": 2.8265600204467773,
"margin_dpo/loss_margin_mean": 28.265602111816406,
"margin_dpo/margin_mean": 28.265600204467773,
"margin_dpo/margin_std": 26.36197280883789,
"step": 410
},
{
"epoch": 0.6035242290748899,
"grad_norm": 57.13195037841797,
"learning_rate": 2.0532469944670343e-07,
"logits/chosen": -0.6790816783905029,
"logits/rejected": -0.6465529203414917,
"logps/chosen": -71.7957992553711,
"logps/ref_chosen": -52.30727005004883,
"logps/ref_rejected": -80.69495391845703,
"logps/rejected": -130.16925048828125,
"loss": 0.5328,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18435493111610413,
"margin_dpo/beta_margin_grad_std": 0.22915974259376526,
"margin_dpo/beta_margin_mean": 2.9985756874084473,
"margin_dpo/loss_margin_mean": 29.985755920410156,
"margin_dpo/margin_mean": 29.985755920410156,
"margin_dpo/margin_std": 27.710227966308594,
"step": 411
},
{
"epoch": 0.604992657856094,
"grad_norm": 41.65260314941406,
"learning_rate": 2.0406262054585738e-07,
"logits/chosen": -0.7030426859855652,
"logits/rejected": -0.6921846866607666,
"logps/chosen": -69.07894897460938,
"logps/ref_chosen": -53.144126892089844,
"logps/ref_rejected": -100.06080627441406,
"logps/rejected": -145.71115112304688,
"loss": 0.5004,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1866704523563385,
"margin_dpo/beta_margin_grad_std": 0.2087305784225464,
"margin_dpo/beta_margin_mean": 2.9715518951416016,
"margin_dpo/loss_margin_mean": 29.715518951416016,
"margin_dpo/margin_mean": 29.715518951416016,
"margin_dpo/margin_std": 27.474346160888672,
"step": 412
},
{
"epoch": 0.6064610866372981,
"grad_norm": 57.669593811035156,
"learning_rate": 2.0280175213768205e-07,
"logits/chosen": -0.5741822719573975,
"logits/rejected": -0.5401548147201538,
"logps/chosen": -80.91738891601562,
"logps/ref_chosen": -61.58196258544922,
"logps/ref_rejected": -99.47340393066406,
"logps/rejected": -148.91249084472656,
"loss": 0.4797,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1573394238948822,
"margin_dpo/beta_margin_grad_std": 0.1960790902376175,
"margin_dpo/beta_margin_mean": 3.0103673934936523,
"margin_dpo/loss_margin_mean": 30.10367202758789,
"margin_dpo/margin_mean": 30.10367202758789,
"margin_dpo/margin_std": 25.22928237915039,
"step": 413
},
{
"epoch": 0.6079295154185022,
"grad_norm": 49.89177322387695,
"learning_rate": 2.0154212744723247e-07,
"logits/chosen": -0.6295123100280762,
"logits/rejected": -0.5924926996231079,
"logps/chosen": -62.89250183105469,
"logps/ref_chosen": -46.63148880004883,
"logps/ref_rejected": -87.64652252197266,
"logps/rejected": -139.89688110351562,
"loss": 0.3669,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13251307606697083,
"margin_dpo/beta_margin_grad_std": 0.18466657400131226,
"margin_dpo/beta_margin_mean": 3.598933219909668,
"margin_dpo/loss_margin_mean": 35.98933410644531,
"margin_dpo/margin_mean": 35.98933410644531,
"margin_dpo/margin_std": 25.766937255859375,
"step": 414
},
{
"epoch": 0.6093979441997063,
"grad_norm": 44.46585464477539,
"learning_rate": 2.002837796667909e-07,
"logits/chosen": -0.6287680268287659,
"logits/rejected": -0.6043534278869629,
"logps/chosen": -95.65867614746094,
"logps/ref_chosen": -78.6182861328125,
"logps/ref_rejected": -100.47752380371094,
"logps/rejected": -147.51513671875,
"loss": 0.3992,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15828433632850647,
"margin_dpo/beta_margin_grad_std": 0.17664587497711182,
"margin_dpo/beta_margin_mean": 2.9997239112854004,
"margin_dpo/loss_margin_mean": 29.997238159179688,
"margin_dpo/margin_mean": 29.997238159179688,
"margin_dpo/margin_std": 25.054841995239258,
"step": 415
},
{
"epoch": 0.6108663729809104,
"grad_norm": 48.96870040893555,
"learning_rate": 1.990267419549914e-07,
"logits/chosen": -0.629509449005127,
"logits/rejected": -0.5954192876815796,
"logps/chosen": -75.95622253417969,
"logps/ref_chosen": -58.27912521362305,
"logps/ref_rejected": -90.56871795654297,
"logps/rejected": -145.17367553710938,
"loss": 0.384,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13312619924545288,
"margin_dpo/beta_margin_grad_std": 0.18159282207489014,
"margin_dpo/beta_margin_mean": 3.692786693572998,
"margin_dpo/loss_margin_mean": 36.92786407470703,
"margin_dpo/margin_mean": 36.9278678894043,
"margin_dpo/margin_std": 27.910099029541016,
"step": 416
},
{
"epoch": 0.6123348017621145,
"grad_norm": 34.49977493286133,
"learning_rate": 1.9777104743594686e-07,
"logits/chosen": -0.6395463943481445,
"logits/rejected": -0.5739086866378784,
"logps/chosen": -66.991455078125,
"logps/ref_chosen": -50.1987190246582,
"logps/ref_rejected": -68.15184020996094,
"logps/rejected": -120.10306549072266,
"loss": 0.3145,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12301211804151535,
"margin_dpo/beta_margin_grad_std": 0.17249628901481628,
"margin_dpo/beta_margin_mean": 3.5158486366271973,
"margin_dpo/loss_margin_mean": 35.158485412597656,
"margin_dpo/margin_mean": 35.158485412597656,
"margin_dpo/margin_std": 23.311870574951172,
"step": 417
},
{
"epoch": 0.6138032305433186,
"grad_norm": 66.27497100830078,
"learning_rate": 1.965167291983757e-07,
"logits/chosen": -0.6523764133453369,
"logits/rejected": -0.5884617567062378,
"logps/chosen": -99.4321060180664,
"logps/ref_chosen": -81.97846984863281,
"logps/ref_rejected": -104.69148254394531,
"logps/rejected": -156.43402099609375,
"loss": 0.566,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1649475246667862,
"margin_dpo/beta_margin_grad_std": 0.23172861337661743,
"margin_dpo/beta_margin_mean": 3.4288902282714844,
"margin_dpo/loss_margin_mean": 34.288902282714844,
"margin_dpo/margin_mean": 34.288902282714844,
"margin_dpo/margin_std": 31.480552673339844,
"step": 418
},
{
"epoch": 0.6152716593245228,
"grad_norm": 47.13197708129883,
"learning_rate": 1.9526382029472988e-07,
"logits/chosen": -0.6016639471054077,
"logits/rejected": -0.5594383478164673,
"logps/chosen": -70.49934387207031,
"logps/ref_chosen": -52.948646545410156,
"logps/ref_rejected": -91.58309936523438,
"logps/rejected": -143.46878051757812,
"loss": 0.3025,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11779798567295074,
"margin_dpo/beta_margin_grad_std": 0.15797148644924164,
"margin_dpo/beta_margin_mean": 3.433500289916992,
"margin_dpo/loss_margin_mean": 34.33500289916992,
"margin_dpo/margin_mean": 34.33500289916992,
"margin_dpo/margin_std": 24.259674072265625,
"step": 419
},
{
"epoch": 0.6167400881057269,
"grad_norm": 63.46165466308594,
"learning_rate": 1.9401235374032425e-07,
"logits/chosen": -0.6578436493873596,
"logits/rejected": -0.579310417175293,
"logps/chosen": -96.29592895507812,
"logps/ref_chosen": -77.7699203491211,
"logps/ref_rejected": -69.31985473632812,
"logps/rejected": -121.10002899169922,
"loss": 0.4568,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1476997584104538,
"margin_dpo/beta_margin_grad_std": 0.207474946975708,
"margin_dpo/beta_margin_mean": 3.3254165649414062,
"margin_dpo/loss_margin_mean": 33.25416564941406,
"margin_dpo/margin_mean": 33.25416564941406,
"margin_dpo/margin_std": 27.482261657714844,
"step": 420
},
{
"epoch": 0.618208516886931,
"grad_norm": 75.73670959472656,
"learning_rate": 1.9276236251246653e-07,
"logits/chosen": -0.6206883192062378,
"logits/rejected": -0.5857428908348083,
"logps/chosen": -74.22445678710938,
"logps/ref_chosen": -53.765865325927734,
"logps/ref_rejected": -89.28144836425781,
"logps/rejected": -137.7203826904297,
"loss": 0.6307,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1979384869337082,
"margin_dpo/beta_margin_grad_std": 0.2501598596572876,
"margin_dpo/beta_margin_mean": 2.798034429550171,
"margin_dpo/loss_margin_mean": 27.980342864990234,
"margin_dpo/margin_mean": 27.980342864990234,
"margin_dpo/margin_std": 26.982437133789062,
"step": 421
},
{
"epoch": 0.6196769456681351,
"grad_norm": 69.67945098876953,
"learning_rate": 1.9151387954958792e-07,
"logits/chosen": -0.6548997163772583,
"logits/rejected": -0.6131415367126465,
"logps/chosen": -89.59654998779297,
"logps/ref_chosen": -68.6337661743164,
"logps/ref_rejected": -87.86351013183594,
"logps/rejected": -139.065185546875,
"loss": 0.5739,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18962985277175903,
"margin_dpo/beta_margin_grad_std": 0.246720552444458,
"margin_dpo/beta_margin_mean": 3.0238897800445557,
"margin_dpo/loss_margin_mean": 30.2388973236084,
"margin_dpo/margin_mean": 30.2388973236084,
"margin_dpo/margin_std": 28.849193572998047,
"step": 422
},
{
"epoch": 0.6211453744493393,
"grad_norm": 73.22090148925781,
"learning_rate": 1.902669377503756e-07,
"logits/chosen": -0.6237994432449341,
"logits/rejected": -0.6053036451339722,
"logps/chosen": -74.4217529296875,
"logps/ref_chosen": -54.99030303955078,
"logps/ref_rejected": -86.30654907226562,
"logps/rejected": -137.05184936523438,
"loss": 0.5645,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18330860137939453,
"margin_dpo/beta_margin_grad_std": 0.2365390956401825,
"margin_dpo/beta_margin_mean": 3.1313838958740234,
"margin_dpo/loss_margin_mean": 31.313838958740234,
"margin_dpo/margin_mean": 31.313838958740234,
"margin_dpo/margin_std": 29.860960006713867,
"step": 423
},
{
"epoch": 0.6226138032305433,
"grad_norm": 49.532413482666016,
"learning_rate": 1.890215699729057e-07,
"logits/chosen": -0.6332702040672302,
"logits/rejected": -0.5856061577796936,
"logps/chosen": -73.72906494140625,
"logps/ref_chosen": -56.01191711425781,
"logps/ref_rejected": -66.47896575927734,
"logps/rejected": -118.37336730957031,
"loss": 0.4253,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15173882246017456,
"margin_dpo/beta_margin_grad_std": 0.20923829078674316,
"margin_dpo/beta_margin_mean": 3.417725086212158,
"margin_dpo/loss_margin_mean": 34.17725372314453,
"margin_dpo/margin_mean": 34.17725372314453,
"margin_dpo/margin_std": 30.512378692626953,
"step": 424
},
{
"epoch": 0.6240822320117474,
"grad_norm": 60.32538604736328,
"learning_rate": 1.8777780903377732e-07,
"logits/chosen": -0.631500780582428,
"logits/rejected": -0.6203855872154236,
"logps/chosen": -65.76054382324219,
"logps/ref_chosen": -46.868995666503906,
"logps/ref_rejected": -95.92545318603516,
"logps/rejected": -145.38247680664062,
"loss": 0.5126,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16552463173866272,
"margin_dpo/beta_margin_grad_std": 0.224747896194458,
"margin_dpo/beta_margin_mean": 3.0565476417541504,
"margin_dpo/loss_margin_mean": 30.565475463867188,
"margin_dpo/margin_mean": 30.56547737121582,
"margin_dpo/margin_std": 24.83243179321289,
"step": 425
},
{
"epoch": 0.6255506607929515,
"grad_norm": 77.01701354980469,
"learning_rate": 1.8653568770724803e-07,
"logits/chosen": -0.6216360330581665,
"logits/rejected": -0.5673133730888367,
"logps/chosen": -93.9437255859375,
"logps/ref_chosen": -76.58354187011719,
"logps/ref_rejected": -81.26658630371094,
"logps/rejected": -132.55589294433594,
"loss": 0.4423,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13299913704395294,
"margin_dpo/beta_margin_grad_std": 0.21135151386260986,
"margin_dpo/beta_margin_mean": 3.392913341522217,
"margin_dpo/loss_margin_mean": 33.929134368896484,
"margin_dpo/margin_mean": 33.929134368896484,
"margin_dpo/margin_std": 26.49199867248535,
"step": 426
},
{
"epoch": 0.6270190895741556,
"grad_norm": 56.73555374145508,
"learning_rate": 1.8529523872436977e-07,
"logits/chosen": -0.6535402536392212,
"logits/rejected": -0.5980893969535828,
"logps/chosen": -81.8448486328125,
"logps/ref_chosen": -64.8538818359375,
"logps/ref_rejected": -78.56600952148438,
"logps/rejected": -120.1833267211914,
"loss": 0.5899,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1911955028772354,
"margin_dpo/beta_margin_grad_std": 0.20332689583301544,
"margin_dpo/beta_margin_mean": 2.4626340866088867,
"margin_dpo/loss_margin_mean": 24.626338958740234,
"margin_dpo/margin_mean": 24.626338958740234,
"margin_dpo/margin_std": 23.466392517089844,
"step": 427
},
{
"epoch": 0.6284875183553598,
"grad_norm": 43.91977310180664,
"learning_rate": 1.8405649477212697e-07,
"logits/chosen": -0.626772403717041,
"logits/rejected": -0.5905691385269165,
"logps/chosen": -83.34781646728516,
"logps/ref_chosen": -62.63666534423828,
"logps/ref_rejected": -103.28182220458984,
"logps/rejected": -159.70887756347656,
"loss": 0.3243,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12279026210308075,
"margin_dpo/beta_margin_grad_std": 0.17204001545906067,
"margin_dpo/beta_margin_mean": 3.571589946746826,
"margin_dpo/loss_margin_mean": 35.71589660644531,
"margin_dpo/margin_mean": 35.71589660644531,
"margin_dpo/margin_std": 27.326576232910156,
"step": 428
},
{
"epoch": 0.6299559471365639,
"grad_norm": 64.35308837890625,
"learning_rate": 1.828194884925749e-07,
"logits/chosen": -0.5859851837158203,
"logits/rejected": -0.5243451595306396,
"logps/chosen": -101.322509765625,
"logps/ref_chosen": -81.23401641845703,
"logps/ref_rejected": -91.79493713378906,
"logps/rejected": -141.50485229492188,
"loss": 0.5977,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19384217262268066,
"margin_dpo/beta_margin_grad_std": 0.23802496492862701,
"margin_dpo/beta_margin_mean": 2.9621434211730957,
"margin_dpo/loss_margin_mean": 29.621435165405273,
"margin_dpo/margin_mean": 29.62143325805664,
"margin_dpo/margin_std": 28.574806213378906,
"step": 429
},
{
"epoch": 0.631424375917768,
"grad_norm": 52.39344787597656,
"learning_rate": 1.8158425248197928e-07,
"logits/chosen": -0.5943987369537354,
"logits/rejected": -0.5758558511734009,
"logps/chosen": -79.220458984375,
"logps/ref_chosen": -60.92032241821289,
"logps/ref_rejected": -104.42280578613281,
"logps/rejected": -153.45037841796875,
"loss": 0.4773,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16452635824680328,
"margin_dpo/beta_margin_grad_std": 0.22081297636032104,
"margin_dpo/beta_margin_mean": 3.0727434158325195,
"margin_dpo/loss_margin_mean": 30.727432250976562,
"margin_dpo/margin_mean": 30.727432250976562,
"margin_dpo/margin_std": 26.309518814086914,
"step": 430
},
{
"epoch": 0.6328928046989721,
"grad_norm": 44.553733825683594,
"learning_rate": 1.8035081928995788e-07,
"logits/chosen": -0.5974197387695312,
"logits/rejected": -0.5811679363250732,
"logps/chosen": -76.02676391601562,
"logps/ref_chosen": -57.348751068115234,
"logps/ref_rejected": -92.84022521972656,
"logps/rejected": -146.17950439453125,
"loss": 0.3371,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13372743129730225,
"margin_dpo/beta_margin_grad_std": 0.17146742343902588,
"margin_dpo/beta_margin_mean": 3.4661264419555664,
"margin_dpo/loss_margin_mean": 34.66126251220703,
"margin_dpo/margin_mean": 34.66126251220703,
"margin_dpo/margin_std": 26.27811050415039,
"step": 431
},
{
"epoch": 0.6343612334801763,
"grad_norm": 57.066585540771484,
"learning_rate": 1.791192214186223e-07,
"logits/chosen": -0.551721453666687,
"logits/rejected": -0.5079036951065063,
"logps/chosen": -89.14061737060547,
"logps/ref_chosen": -71.07479095458984,
"logps/ref_rejected": -98.57952880859375,
"logps/rejected": -149.09951782226562,
"loss": 0.4364,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14938993752002716,
"margin_dpo/beta_margin_grad_std": 0.20730724930763245,
"margin_dpo/beta_margin_mean": 3.2454161643981934,
"margin_dpo/loss_margin_mean": 32.45416259765625,
"margin_dpo/margin_mean": 32.45416259765625,
"margin_dpo/margin_std": 27.234264373779297,
"step": 432
},
{
"epoch": 0.6358296622613803,
"grad_norm": 72.30256652832031,
"learning_rate": 1.7788949132172193e-07,
"logits/chosen": -0.6330820322036743,
"logits/rejected": -0.6014422178268433,
"logps/chosen": -81.89974975585938,
"logps/ref_chosen": -58.273193359375,
"logps/ref_rejected": -95.95089721679688,
"logps/rejected": -148.0189208984375,
"loss": 0.5896,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1933947056531906,
"margin_dpo/beta_margin_grad_std": 0.23659807443618774,
"margin_dpo/beta_margin_mean": 2.844146966934204,
"margin_dpo/loss_margin_mean": 28.441471099853516,
"margin_dpo/margin_mean": 28.441471099853516,
"margin_dpo/margin_std": 26.49103546142578,
"step": 433
},
{
"epoch": 0.6372980910425844,
"grad_norm": 48.833492279052734,
"learning_rate": 1.7666166140378853e-07,
"logits/chosen": -0.6459417343139648,
"logits/rejected": -0.6003463864326477,
"logps/chosen": -79.62370300292969,
"logps/ref_chosen": -61.97370147705078,
"logps/ref_rejected": -78.49861145019531,
"logps/rejected": -125.43734741210938,
"loss": 0.4262,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1577899158000946,
"margin_dpo/beta_margin_grad_std": 0.19978675246238708,
"margin_dpo/beta_margin_mean": 2.9288740158081055,
"margin_dpo/loss_margin_mean": 29.288738250732422,
"margin_dpo/margin_mean": 29.288738250732422,
"margin_dpo/margin_std": 24.996349334716797,
"step": 434
},
{
"epoch": 0.6387665198237885,
"grad_norm": 64.77494812011719,
"learning_rate": 1.7543576401928218e-07,
"logits/chosen": -0.6737290620803833,
"logits/rejected": -0.6396021842956543,
"logps/chosen": -69.81366729736328,
"logps/ref_chosen": -51.502052307128906,
"logps/ref_rejected": -87.56689453125,
"logps/rejected": -138.3524169921875,
"loss": 0.5095,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16580714285373688,
"margin_dpo/beta_margin_grad_std": 0.20909518003463745,
"margin_dpo/beta_margin_mean": 3.247391700744629,
"margin_dpo/loss_margin_mean": 32.473915100097656,
"margin_dpo/margin_mean": 32.473915100097656,
"margin_dpo/margin_std": 29.528972625732422,
"step": 435
},
{
"epoch": 0.6402349486049926,
"grad_norm": 40.28781509399414,
"learning_rate": 1.742118314717391e-07,
"logits/chosen": -0.6202067136764526,
"logits/rejected": -0.5589362978935242,
"logps/chosen": -89.22311401367188,
"logps/ref_chosen": -71.40371704101562,
"logps/ref_rejected": -82.72775268554688,
"logps/rejected": -132.42782592773438,
"loss": 0.3515,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13569772243499756,
"margin_dpo/beta_margin_grad_std": 0.17755961418151855,
"margin_dpo/beta_margin_mean": 3.1880667209625244,
"margin_dpo/loss_margin_mean": 31.880666732788086,
"margin_dpo/margin_mean": 31.880664825439453,
"margin_dpo/margin_std": 24.377714157104492,
"step": 436
},
{
"epoch": 0.6417033773861968,
"grad_norm": 51.87274932861328,
"learning_rate": 1.7298989601292036e-07,
"logits/chosen": -0.6447381973266602,
"logits/rejected": -0.6036201119422913,
"logps/chosen": -82.27588653564453,
"logps/ref_chosen": -64.7442626953125,
"logps/ref_rejected": -82.04356384277344,
"logps/rejected": -127.82572937011719,
"loss": 0.5283,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17866836488246918,
"margin_dpo/beta_margin_grad_std": 0.217972993850708,
"margin_dpo/beta_margin_mean": 2.8250551223754883,
"margin_dpo/loss_margin_mean": 28.25054931640625,
"margin_dpo/margin_mean": 28.25054931640625,
"margin_dpo/margin_std": 23.456018447875977,
"step": 437
},
{
"epoch": 0.6431718061674009,
"grad_norm": 64.63465118408203,
"learning_rate": 1.7176998984196144e-07,
"logits/chosen": -0.6529127359390259,
"logits/rejected": -0.5783262848854065,
"logps/chosen": -78.42019653320312,
"logps/ref_chosen": -59.0186653137207,
"logps/ref_rejected": -83.07682037353516,
"logps/rejected": -136.8445587158203,
"loss": 0.3724,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13743507862091064,
"margin_dpo/beta_margin_grad_std": 0.18105177581310272,
"margin_dpo/beta_margin_mean": 3.4366211891174316,
"margin_dpo/loss_margin_mean": 34.3662109375,
"margin_dpo/margin_mean": 34.3662109375,
"margin_dpo/margin_std": 26.907875061035156,
"step": 438
},
{
"epoch": 0.644640234948605,
"grad_norm": 65.7437744140625,
"learning_rate": 1.7055214510452458e-07,
"logits/chosen": -0.6286749243736267,
"logits/rejected": -0.607205867767334,
"logps/chosen": -77.5360107421875,
"logps/ref_chosen": -53.784080505371094,
"logps/ref_rejected": -83.98545837402344,
"logps/rejected": -134.8016357421875,
"loss": 0.5171,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18228086829185486,
"margin_dpo/beta_margin_grad_std": 0.19927145540714264,
"margin_dpo/beta_margin_mean": 2.7064239978790283,
"margin_dpo/loss_margin_mean": 27.064239501953125,
"margin_dpo/margin_mean": 27.064241409301758,
"margin_dpo/margin_std": 23.722930908203125,
"step": 439
},
{
"epoch": 0.6461086637298091,
"grad_norm": 95.62813568115234,
"learning_rate": 1.6933639389195134e-07,
"logits/chosen": -0.6431401968002319,
"logits/rejected": -0.6009776592254639,
"logps/chosen": -97.17742919921875,
"logps/ref_chosen": -78.56671905517578,
"logps/ref_rejected": -96.49775695800781,
"logps/rejected": -140.99290466308594,
"loss": 0.6656,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21006713807582855,
"margin_dpo/beta_margin_grad_std": 0.24986517429351807,
"margin_dpo/beta_margin_mean": 2.5884432792663574,
"margin_dpo/loss_margin_mean": 25.884429931640625,
"margin_dpo/margin_mean": 25.884429931640625,
"margin_dpo/margin_std": 27.127971649169922,
"step": 440
},
{
"epoch": 0.6475770925110133,
"grad_norm": 52.33854293823242,
"learning_rate": 1.681227682404166e-07,
"logits/chosen": -0.587798535823822,
"logits/rejected": -0.5523707866668701,
"logps/chosen": -80.9710693359375,
"logps/ref_chosen": -60.824440002441406,
"logps/ref_rejected": -96.47080993652344,
"logps/rejected": -147.42752075195312,
"loss": 0.4425,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1385086476802826,
"margin_dpo/beta_margin_grad_std": 0.18967705965042114,
"margin_dpo/beta_margin_mean": 3.0810084342956543,
"margin_dpo/loss_margin_mean": 30.81008529663086,
"margin_dpo/margin_mean": 30.81008529663086,
"margin_dpo/margin_std": 23.786081314086914,
"step": 441
},
{
"epoch": 0.6490455212922174,
"grad_norm": 36.203887939453125,
"learning_rate": 1.669113001300851e-07,
"logits/chosen": -0.5871816873550415,
"logits/rejected": -0.549630343914032,
"logps/chosen": -65.28014373779297,
"logps/ref_chosen": -47.01121520996094,
"logps/ref_rejected": -76.53926086425781,
"logps/rejected": -132.68634033203125,
"loss": 0.2789,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10812409967184067,
"margin_dpo/beta_margin_grad_std": 0.13728050887584686,
"margin_dpo/beta_margin_mean": 3.787814140319824,
"margin_dpo/loss_margin_mean": 37.878135681152344,
"margin_dpo/margin_mean": 37.878135681152344,
"margin_dpo/margin_std": 26.232383728027344,
"step": 442
},
{
"epoch": 0.6505139500734214,
"grad_norm": 81.11394500732422,
"learning_rate": 1.6570202148426815e-07,
"logits/chosen": -0.6177343130111694,
"logits/rejected": -0.5820919275283813,
"logps/chosen": -93.88763427734375,
"logps/ref_chosen": -71.27301788330078,
"logps/ref_rejected": -86.679931640625,
"logps/rejected": -138.04800415039062,
"loss": 0.6542,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19925275444984436,
"margin_dpo/beta_margin_grad_std": 0.2632359564304352,
"margin_dpo/beta_margin_mean": 2.8753466606140137,
"margin_dpo/loss_margin_mean": 28.75346565246582,
"margin_dpo/margin_mean": 28.753463745117188,
"margin_dpo/margin_std": 27.78663444519043,
"step": 443
},
{
"epoch": 0.6519823788546255,
"grad_norm": 47.98015594482422,
"learning_rate": 1.6449496416858282e-07,
"logits/chosen": -0.593714714050293,
"logits/rejected": -0.5685232877731323,
"logps/chosen": -77.04825592041016,
"logps/ref_chosen": -57.213706970214844,
"logps/ref_rejected": -97.25489044189453,
"logps/rejected": -151.35964965820312,
"loss": 0.4406,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1492740362882614,
"margin_dpo/beta_margin_grad_std": 0.21126282215118408,
"margin_dpo/beta_margin_mean": 3.42702054977417,
"margin_dpo/loss_margin_mean": 34.27020263671875,
"margin_dpo/margin_mean": 34.27020263671875,
"margin_dpo/margin_std": 28.456218719482422,
"step": 444
},
{
"epoch": 0.6534508076358296,
"grad_norm": 63.52720260620117,
"learning_rate": 1.6329015999011182e-07,
"logits/chosen": -0.5531260967254639,
"logits/rejected": -0.5164097547531128,
"logps/chosen": -84.41445922851562,
"logps/ref_chosen": -67.29979705810547,
"logps/ref_rejected": -92.68267822265625,
"logps/rejected": -141.507080078125,
"loss": 0.4692,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1679629534482956,
"margin_dpo/beta_margin_grad_std": 0.21126939356327057,
"margin_dpo/beta_margin_mean": 3.170973300933838,
"margin_dpo/loss_margin_mean": 31.709733963012695,
"margin_dpo/margin_mean": 31.709733963012695,
"margin_dpo/margin_std": 27.622833251953125,
"step": 445
},
{
"epoch": 0.6549192364170338,
"grad_norm": 50.25477600097656,
"learning_rate": 1.6208764069656578e-07,
"logits/chosen": -0.6113117933273315,
"logits/rejected": -0.5910245776176453,
"logps/chosen": -76.8818359375,
"logps/ref_chosen": -59.098487854003906,
"logps/ref_rejected": -101.26419067382812,
"logps/rejected": -149.34832763671875,
"loss": 0.4369,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16404861211776733,
"margin_dpo/beta_margin_grad_std": 0.18896964192390442,
"margin_dpo/beta_margin_mean": 3.030078887939453,
"margin_dpo/loss_margin_mean": 30.30078887939453,
"margin_dpo/margin_mean": 30.300785064697266,
"margin_dpo/margin_std": 26.338363647460938,
"step": 446
},
{
"epoch": 0.6563876651982379,
"grad_norm": 50.45421600341797,
"learning_rate": 1.608874379754465e-07,
"logits/chosen": -0.636214017868042,
"logits/rejected": -0.6372050046920776,
"logps/chosen": -76.49604797363281,
"logps/ref_chosen": -56.07533264160156,
"logps/ref_rejected": -98.69475555419922,
"logps/rejected": -151.05020141601562,
"loss": 0.4496,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16537348926067352,
"margin_dpo/beta_margin_grad_std": 0.20752978324890137,
"margin_dpo/beta_margin_mean": 3.193472385406494,
"margin_dpo/loss_margin_mean": 31.934722900390625,
"margin_dpo/margin_mean": 31.934722900390625,
"margin_dpo/margin_std": 28.57367515563965,
"step": 447
},
{
"epoch": 0.657856093979442,
"grad_norm": 49.00764846801758,
"learning_rate": 1.5968958345321177e-07,
"logits/chosen": -0.618561863899231,
"logits/rejected": -0.6021959185600281,
"logps/chosen": -81.1510238647461,
"logps/ref_chosen": -60.00384521484375,
"logps/ref_rejected": -102.26465606689453,
"logps/rejected": -155.61978149414062,
"loss": 0.3919,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13892757892608643,
"margin_dpo/beta_margin_grad_std": 0.17518764734268188,
"margin_dpo/beta_margin_mean": 3.220794439315796,
"margin_dpo/loss_margin_mean": 32.207942962646484,
"margin_dpo/margin_mean": 32.207942962646484,
"margin_dpo/margin_std": 25.56855010986328,
"step": 448
},
{
"epoch": 0.6593245227606461,
"grad_norm": 81.82498931884766,
"learning_rate": 1.584941086944423e-07,
"logits/chosen": -0.6117278337478638,
"logits/rejected": -0.5685479640960693,
"logps/chosen": -89.69844055175781,
"logps/ref_chosen": -67.52661895751953,
"logps/ref_rejected": -88.59690856933594,
"logps/rejected": -142.21090698242188,
"loss": 0.5996,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17296102643013,
"margin_dpo/beta_margin_grad_std": 0.23669497668743134,
"margin_dpo/beta_margin_mean": 3.1442177295684814,
"margin_dpo/loss_margin_mean": 31.442176818847656,
"margin_dpo/margin_mean": 31.442176818847656,
"margin_dpo/margin_std": 30.16796875,
"step": 449
},
{
"epoch": 0.6607929515418502,
"grad_norm": 41.05678176879883,
"learning_rate": 1.573010452010098e-07,
"logits/chosen": -0.6542295217514038,
"logits/rejected": -0.6270924806594849,
"logps/chosen": -73.37232971191406,
"logps/ref_chosen": -57.108116149902344,
"logps/ref_rejected": -102.75494384765625,
"logps/rejected": -153.49468994140625,
"loss": 0.3193,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12820908427238464,
"margin_dpo/beta_margin_grad_std": 0.16310177743434906,
"margin_dpo/beta_margin_mean": 3.4475526809692383,
"margin_dpo/loss_margin_mean": 34.47552490234375,
"margin_dpo/margin_mean": 34.47552490234375,
"margin_dpo/margin_std": 25.831031799316406,
"step": 450
},
{
"epoch": 0.6622613803230544,
"grad_norm": 75.21393585205078,
"learning_rate": 1.5611042441124687e-07,
"logits/chosen": -0.643078625202179,
"logits/rejected": -0.5938763618469238,
"logps/chosen": -80.2492904663086,
"logps/ref_chosen": -58.46883010864258,
"logps/ref_rejected": -72.92941284179688,
"logps/rejected": -124.17213439941406,
"loss": 0.553,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17004984617233276,
"margin_dpo/beta_margin_grad_std": 0.22876521944999695,
"margin_dpo/beta_margin_mean": 2.946226119995117,
"margin_dpo/loss_margin_mean": 29.46225929260254,
"margin_dpo/margin_mean": 29.46225929260254,
"margin_dpo/margin_std": 25.89090347290039,
"step": 451
},
{
"epoch": 0.6637298091042585,
"grad_norm": 33.82390213012695,
"learning_rate": 1.549222776991186e-07,
"logits/chosen": -0.5697954297065735,
"logits/rejected": -0.5710628628730774,
"logps/chosen": -66.59547424316406,
"logps/ref_chosen": -50.39055252075195,
"logps/ref_rejected": -97.77143096923828,
"logps/rejected": -144.08883666992188,
"loss": 0.2885,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12178364396095276,
"margin_dpo/beta_margin_grad_std": 0.13007774949073792,
"margin_dpo/beta_margin_mean": 3.0112478733062744,
"margin_dpo/loss_margin_mean": 30.11248016357422,
"margin_dpo/margin_mean": 30.11248016357422,
"margin_dpo/margin_std": 22.0058536529541,
"step": 452
},
{
"epoch": 0.6651982378854625,
"grad_norm": 49.218753814697266,
"learning_rate": 1.5373663637339584e-07,
"logits/chosen": -0.6415982246398926,
"logits/rejected": -0.5901994705200195,
"logps/chosen": -77.15182495117188,
"logps/ref_chosen": -57.71485137939453,
"logps/ref_rejected": -82.20741271972656,
"logps/rejected": -130.7767333984375,
"loss": 0.4672,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1702961027622223,
"margin_dpo/beta_margin_grad_std": 0.20113880932331085,
"margin_dpo/beta_margin_mean": 2.9132347106933594,
"margin_dpo/loss_margin_mean": 29.13234519958496,
"margin_dpo/margin_mean": 29.132347106933594,
"margin_dpo/margin_std": 25.473758697509766,
"step": 453
},
{
"epoch": 0.6666666666666666,
"grad_norm": 56.509586334228516,
"learning_rate": 1.5255353167683017e-07,
"logits/chosen": -0.6178318858146667,
"logits/rejected": -0.5745600461959839,
"logps/chosen": -81.63310241699219,
"logps/ref_chosen": -60.945648193359375,
"logps/ref_rejected": -84.9507827758789,
"logps/rejected": -138.04898071289062,
"loss": 0.3975,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14186511933803558,
"margin_dpo/beta_margin_grad_std": 0.19594962894916534,
"margin_dpo/beta_margin_mean": 3.2410740852355957,
"margin_dpo/loss_margin_mean": 32.41073989868164,
"margin_dpo/margin_mean": 32.41073989868164,
"margin_dpo/margin_std": 25.818143844604492,
"step": 454
},
{
"epoch": 0.6681350954478708,
"grad_norm": 45.265987396240234,
"learning_rate": 1.5137299478533064e-07,
"logits/chosen": -0.6223077774047852,
"logits/rejected": -0.5953476428985596,
"logps/chosen": -65.11666870117188,
"logps/ref_chosen": -44.88671112060547,
"logps/ref_rejected": -115.30147552490234,
"logps/rejected": -172.906982421875,
"loss": 0.3707,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1233278438448906,
"margin_dpo/beta_margin_grad_std": 0.19460362195968628,
"margin_dpo/beta_margin_mean": 3.7375543117523193,
"margin_dpo/loss_margin_mean": 37.37554168701172,
"margin_dpo/margin_mean": 37.37554168701172,
"margin_dpo/margin_std": 26.588571548461914,
"step": 455
},
{
"epoch": 0.6696035242290749,
"grad_norm": 51.1346321105957,
"learning_rate": 1.5019505680714232e-07,
"logits/chosen": -0.6171753406524658,
"logits/rejected": -0.6144955158233643,
"logps/chosen": -74.41389465332031,
"logps/ref_chosen": -57.036781311035156,
"logps/ref_rejected": -105.21783447265625,
"logps/rejected": -160.3103790283203,
"loss": 0.3541,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13141584396362305,
"margin_dpo/beta_margin_grad_std": 0.1724250167608261,
"margin_dpo/beta_margin_mean": 3.771542549133301,
"margin_dpo/loss_margin_mean": 37.715423583984375,
"margin_dpo/margin_mean": 37.715423583984375,
"margin_dpo/margin_std": 28.47699737548828,
"step": 456
},
{
"epoch": 0.671071953010279,
"grad_norm": 58.4116096496582,
"learning_rate": 1.4901974878202627e-07,
"logits/chosen": -0.5911962985992432,
"logits/rejected": -0.5620957612991333,
"logps/chosen": -72.8665542602539,
"logps/ref_chosen": -54.24253845214844,
"logps/ref_rejected": -85.10956573486328,
"logps/rejected": -136.763916015625,
"loss": 0.3892,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13140106201171875,
"margin_dpo/beta_margin_grad_std": 0.19598211348056793,
"margin_dpo/beta_margin_mean": 3.3030338287353516,
"margin_dpo/loss_margin_mean": 33.030338287353516,
"margin_dpo/margin_mean": 33.030338287353516,
"margin_dpo/margin_std": 24.58535385131836,
"step": 457
},
{
"epoch": 0.6725403817914831,
"grad_norm": 60.80534362792969,
"learning_rate": 1.4784710168044212e-07,
"logits/chosen": -0.5890240669250488,
"logits/rejected": -0.5499871969223022,
"logps/chosen": -74.93452453613281,
"logps/ref_chosen": -55.40888214111328,
"logps/ref_rejected": -97.68325805664062,
"logps/rejected": -155.25704956054688,
"loss": 0.4472,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1399458944797516,
"margin_dpo/beta_margin_grad_std": 0.23080742359161377,
"margin_dpo/beta_margin_mean": 3.8048152923583984,
"margin_dpo/loss_margin_mean": 38.04814910888672,
"margin_dpo/margin_mean": 38.048152923583984,
"margin_dpo/margin_std": 32.506038665771484,
"step": 458
},
{
"epoch": 0.6740088105726872,
"grad_norm": 49.459625244140625,
"learning_rate": 1.466771464027316e-07,
"logits/chosen": -0.6109951138496399,
"logits/rejected": -0.58476322889328,
"logps/chosen": -67.23796081542969,
"logps/ref_chosen": -46.55748748779297,
"logps/ref_rejected": -86.16854095458984,
"logps/rejected": -135.8137969970703,
"loss": 0.4559,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16255170106887817,
"margin_dpo/beta_margin_grad_std": 0.19391369819641113,
"margin_dpo/beta_margin_mean": 2.8964788913726807,
"margin_dpo/loss_margin_mean": 28.96478843688965,
"margin_dpo/margin_mean": 28.96478843688965,
"margin_dpo/margin_std": 23.446517944335938,
"step": 459
},
{
"epoch": 0.6754772393538914,
"grad_norm": 60.67763900756836,
"learning_rate": 1.4550991377830423e-07,
"logits/chosen": -0.6025089621543884,
"logits/rejected": -0.6064221858978271,
"logps/chosen": -70.86332702636719,
"logps/ref_chosen": -51.63489532470703,
"logps/ref_rejected": -104.11935424804688,
"logps/rejected": -156.27178955078125,
"loss": 0.4214,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.151195228099823,
"margin_dpo/beta_margin_grad_std": 0.2108098566532135,
"margin_dpo/beta_margin_mean": 3.2924013137817383,
"margin_dpo/loss_margin_mean": 32.92401123046875,
"margin_dpo/margin_mean": 32.92401123046875,
"margin_dpo/margin_std": 25.963363647460938,
"step": 460
},
{
"epoch": 0.6769456681350955,
"grad_norm": 62.43408966064453,
"learning_rate": 1.4434543456482518e-07,
"logits/chosen": -0.6063634157180786,
"logits/rejected": -0.591764509677887,
"logps/chosen": -80.1798095703125,
"logps/ref_chosen": -55.18195343017578,
"logps/ref_rejected": -86.47689819335938,
"logps/rejected": -139.15049743652344,
"loss": 0.5542,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1909765750169754,
"margin_dpo/beta_margin_grad_std": 0.22030548751354218,
"margin_dpo/beta_margin_mean": 2.767573833465576,
"margin_dpo/loss_margin_mean": 27.675739288330078,
"margin_dpo/margin_mean": 27.675739288330078,
"margin_dpo/margin_std": 27.114221572875977,
"step": 461
},
{
"epoch": 0.6784140969162996,
"grad_norm": 65.849853515625,
"learning_rate": 1.4318373944740484e-07,
"logits/chosen": -0.6203492879867554,
"logits/rejected": -0.5816408395767212,
"logps/chosen": -93.56144714355469,
"logps/ref_chosen": -69.92803955078125,
"logps/ref_rejected": -78.84111785888672,
"logps/rejected": -129.50086975097656,
"loss": 0.5525,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1917511373758316,
"margin_dpo/beta_margin_grad_std": 0.21951280534267426,
"margin_dpo/beta_margin_mean": 2.702633857727051,
"margin_dpo/loss_margin_mean": 27.026338577270508,
"margin_dpo/margin_mean": 27.026338577270508,
"margin_dpo/margin_std": 25.6932373046875,
"step": 462
},
{
"epoch": 0.6798825256975036,
"grad_norm": 50.78045654296875,
"learning_rate": 1.4202485903778976e-07,
"logits/chosen": -0.5955685377120972,
"logits/rejected": -0.5663818120956421,
"logps/chosen": -76.08649444580078,
"logps/ref_chosen": -55.27437210083008,
"logps/ref_rejected": -89.02497863769531,
"logps/rejected": -143.9271240234375,
"loss": 0.3565,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12205320596694946,
"margin_dpo/beta_margin_grad_std": 0.1792411506175995,
"margin_dpo/beta_margin_mean": 3.4090020656585693,
"margin_dpo/loss_margin_mean": 34.09001922607422,
"margin_dpo/margin_mean": 34.09001922607422,
"margin_dpo/margin_std": 23.93946075439453,
"step": 463
},
{
"epoch": 0.6813509544787077,
"grad_norm": 57.22633743286133,
"learning_rate": 1.4086882387355658e-07,
"logits/chosen": -0.6078216433525085,
"logits/rejected": -0.6137137413024902,
"logps/chosen": -73.63619995117188,
"logps/ref_chosen": -50.91230010986328,
"logps/ref_rejected": -102.4893798828125,
"logps/rejected": -160.0600128173828,
"loss": 0.4538,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14801308512687683,
"margin_dpo/beta_margin_grad_std": 0.2087215781211853,
"margin_dpo/beta_margin_mean": 3.484673023223877,
"margin_dpo/loss_margin_mean": 34.84673309326172,
"margin_dpo/margin_mean": 34.84673309326172,
"margin_dpo/margin_std": 30.08755874633789,
"step": 464
},
{
"epoch": 0.6828193832599119,
"grad_norm": 50.34962844848633,
"learning_rate": 1.3971566441730714e-07,
"logits/chosen": -0.5805087089538574,
"logits/rejected": -0.5592623949050903,
"logps/chosen": -81.41581726074219,
"logps/ref_chosen": -60.116851806640625,
"logps/ref_rejected": -113.94602966308594,
"logps/rejected": -173.1104736328125,
"loss": 0.2796,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10604196786880493,
"margin_dpo/beta_margin_grad_std": 0.16912564635276794,
"margin_dpo/beta_margin_mean": 3.786548614501953,
"margin_dpo/loss_margin_mean": 37.86548614501953,
"margin_dpo/margin_mean": 37.86548614501953,
"margin_dpo/margin_std": 25.48162841796875,
"step": 465
},
{
"epoch": 0.684287812041116,
"grad_norm": 56.52486801147461,
"learning_rate": 1.3856541105586545e-07,
"logits/chosen": -0.6198223829269409,
"logits/rejected": -0.5899391174316406,
"logps/chosen": -75.75175476074219,
"logps/ref_chosen": -52.920921325683594,
"logps/ref_rejected": -90.3154296875,
"logps/rejected": -147.3414306640625,
"loss": 0.3836,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12490339577198029,
"margin_dpo/beta_margin_grad_std": 0.17585688829421997,
"margin_dpo/beta_margin_mean": 3.419515609741211,
"margin_dpo/loss_margin_mean": 34.195152282714844,
"margin_dpo/margin_mean": 34.195152282714844,
"margin_dpo/margin_std": 23.578819274902344,
"step": 466
},
{
"epoch": 0.6857562408223201,
"grad_norm": 47.07603073120117,
"learning_rate": 1.3741809409947729e-07,
"logits/chosen": -0.6401114463806152,
"logits/rejected": -0.6112991571426392,
"logps/chosen": -102.23968505859375,
"logps/ref_chosen": -78.7158203125,
"logps/ref_rejected": -102.86019897460938,
"logps/rejected": -160.81512451171875,
"loss": 0.3757,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1389492303133011,
"margin_dpo/beta_margin_grad_std": 0.19067519903182983,
"margin_dpo/beta_margin_mean": 3.443108081817627,
"margin_dpo/loss_margin_mean": 34.43107986450195,
"margin_dpo/margin_mean": 34.43107986450195,
"margin_dpo/margin_std": 27.870590209960938,
"step": 467
},
{
"epoch": 0.6872246696035242,
"grad_norm": 52.84998321533203,
"learning_rate": 1.362737437810114e-07,
"logits/chosen": -0.5987285375595093,
"logits/rejected": -0.5710204243659973,
"logps/chosen": -89.93443298339844,
"logps/ref_chosen": -69.93536376953125,
"logps/ref_rejected": -101.02881622314453,
"logps/rejected": -152.99951171875,
"loss": 0.3946,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14784805476665497,
"margin_dpo/beta_margin_grad_std": 0.19148442149162292,
"margin_dpo/beta_margin_mean": 3.1971635818481445,
"margin_dpo/loss_margin_mean": 31.971633911132812,
"margin_dpo/margin_mean": 31.971633911132812,
"margin_dpo/margin_std": 26.962993621826172,
"step": 468
},
{
"epoch": 0.6886930983847284,
"grad_norm": 57.32428741455078,
"learning_rate": 1.351323902551631e-07,
"logits/chosen": -0.6210588216781616,
"logits/rejected": -0.5882803201675415,
"logps/chosen": -91.5205078125,
"logps/ref_chosen": -68.12469482421875,
"logps/ref_rejected": -104.78640747070312,
"logps/rejected": -161.24330139160156,
"loss": 0.4417,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15189404785633087,
"margin_dpo/beta_margin_grad_std": 0.21930459141731262,
"margin_dpo/beta_margin_mean": 3.3061084747314453,
"margin_dpo/loss_margin_mean": 33.06108474731445,
"margin_dpo/margin_mean": 33.06108474731445,
"margin_dpo/margin_std": 27.455984115600586,
"step": 469
},
{
"epoch": 0.6901615271659325,
"grad_norm": 36.49016189575195,
"learning_rate": 1.339940635976592e-07,
"logits/chosen": -0.5721327066421509,
"logits/rejected": -0.5457053184509277,
"logps/chosen": -64.37504577636719,
"logps/ref_chosen": -43.79193115234375,
"logps/ref_rejected": -82.70285034179688,
"logps/rejected": -141.7349853515625,
"loss": 0.2322,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.0932888612151146,
"margin_dpo/beta_margin_grad_std": 0.1430518627166748,
"margin_dpo/beta_margin_mean": 3.8449037075042725,
"margin_dpo/loss_margin_mean": 38.44903564453125,
"margin_dpo/margin_mean": 38.44903564453125,
"margin_dpo/margin_std": 23.93124008178711,
"step": 470
},
{
"epoch": 0.6916299559471366,
"grad_norm": 55.444793701171875,
"learning_rate": 1.3285879380446563e-07,
"logits/chosen": -0.5917923450469971,
"logits/rejected": -0.5650200843811035,
"logps/chosen": -87.83808898925781,
"logps/ref_chosen": -63.33952331542969,
"logps/ref_rejected": -83.61048126220703,
"logps/rejected": -139.63504028320312,
"loss": 0.4209,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15353405475616455,
"margin_dpo/beta_margin_grad_std": 0.2025720775127411,
"margin_dpo/beta_margin_mean": 3.1526002883911133,
"margin_dpo/loss_margin_mean": 31.526004791259766,
"margin_dpo/margin_mean": 31.526002883911133,
"margin_dpo/margin_std": 24.51514434814453,
"step": 471
},
{
"epoch": 0.6930983847283406,
"grad_norm": 51.19646453857422,
"learning_rate": 1.317266107909975e-07,
"logits/chosen": -0.6550266742706299,
"logits/rejected": -0.6002498865127563,
"logps/chosen": -105.19808959960938,
"logps/ref_chosen": -83.66609954833984,
"logps/ref_rejected": -117.20919799804688,
"logps/rejected": -179.25303649902344,
"loss": 0.3083,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11649411916732788,
"margin_dpo/beta_margin_grad_std": 0.1778721958398819,
"margin_dpo/beta_margin_mean": 4.051185607910156,
"margin_dpo/loss_margin_mean": 40.51185607910156,
"margin_dpo/margin_mean": 40.51185607910156,
"margin_dpo/margin_std": 33.35724639892578,
"step": 472
},
{
"epoch": 0.6945668135095447,
"grad_norm": 78.59127044677734,
"learning_rate": 1.3059754439133002e-07,
"logits/chosen": -0.5987892746925354,
"logits/rejected": -0.5563715696334839,
"logps/chosen": -87.65088653564453,
"logps/ref_chosen": -63.49696731567383,
"logps/ref_rejected": -81.14657592773438,
"logps/rejected": -133.66065979003906,
"loss": 0.4869,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.170325368642807,
"margin_dpo/beta_margin_grad_std": 0.22086429595947266,
"margin_dpo/beta_margin_mean": 2.8360166549682617,
"margin_dpo/loss_margin_mean": 28.360164642333984,
"margin_dpo/margin_mean": 28.360164642333984,
"margin_dpo/margin_std": 22.68465805053711,
"step": 473
},
{
"epoch": 0.6960352422907489,
"grad_norm": 73.43638610839844,
"learning_rate": 1.2947162435741277e-07,
"logits/chosen": -0.5953601598739624,
"logits/rejected": -0.5836308598518372,
"logps/chosen": -76.69085693359375,
"logps/ref_chosen": -52.6119384765625,
"logps/ref_rejected": -90.08041381835938,
"logps/rejected": -145.01792907714844,
"loss": 0.4665,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16552072763442993,
"margin_dpo/beta_margin_grad_std": 0.22101813554763794,
"margin_dpo/beta_margin_mean": 3.085860252380371,
"margin_dpo/loss_margin_mean": 30.85860252380371,
"margin_dpo/margin_mean": 30.85860252380371,
"margin_dpo/margin_std": 25.63982582092285,
"step": 474
},
{
"epoch": 0.697503671071953,
"grad_norm": 43.33028030395508,
"learning_rate": 1.2834888035828596e-07,
"logits/chosen": -0.634456992149353,
"logits/rejected": -0.6314413547515869,
"logps/chosen": -63.670257568359375,
"logps/ref_chosen": -42.49519348144531,
"logps/ref_rejected": -90.06295013427734,
"logps/rejected": -145.98110961914062,
"loss": 0.3832,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14322960376739502,
"margin_dpo/beta_margin_grad_std": 0.19120553135871887,
"margin_dpo/beta_margin_mean": 3.4743099212646484,
"margin_dpo/loss_margin_mean": 34.74309539794922,
"margin_dpo/margin_mean": 34.743099212646484,
"margin_dpo/margin_std": 30.56637191772461,
"step": 475
},
{
"epoch": 0.6989720998531571,
"grad_norm": 60.676177978515625,
"learning_rate": 1.2722934197929802e-07,
"logits/chosen": -0.6468064785003662,
"logits/rejected": -0.6170526742935181,
"logps/chosen": -64.93257141113281,
"logps/ref_chosen": -42.949378967285156,
"logps/ref_rejected": -73.71023559570312,
"logps/rejected": -126.3614501953125,
"loss": 0.506,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17712682485580444,
"margin_dpo/beta_margin_grad_std": 0.22809603810310364,
"margin_dpo/beta_margin_mean": 3.066802501678467,
"margin_dpo/loss_margin_mean": 30.668025970458984,
"margin_dpo/margin_mean": 30.668025970458984,
"margin_dpo/margin_std": 26.836669921875,
"step": 476
},
{
"epoch": 0.7004405286343612,
"grad_norm": 82.7061538696289,
"learning_rate": 1.2611303872132631e-07,
"logits/chosen": -0.6342014074325562,
"logits/rejected": -0.5668247938156128,
"logps/chosen": -96.3890151977539,
"logps/ref_chosen": -70.77261352539062,
"logps/ref_rejected": -76.13737487792969,
"logps/rejected": -133.8131866455078,
"loss": 0.6051,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15660977363586426,
"margin_dpo/beta_margin_grad_std": 0.24074432253837585,
"margin_dpo/beta_margin_mean": 3.2059414386749268,
"margin_dpo/loss_margin_mean": 32.05941390991211,
"margin_dpo/margin_mean": 32.05941390991211,
"margin_dpo/margin_std": 27.974023818969727,
"step": 477
},
{
"epoch": 0.7019089574155654,
"grad_norm": 48.793907165527344,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -0.6003127098083496,
"logits/rejected": -0.5863485336303711,
"logps/chosen": -61.834197998046875,
"logps/ref_chosen": -41.440513610839844,
"logps/ref_rejected": -85.36196899414062,
"logps/rejected": -140.44876098632812,
"loss": 0.3975,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14744707942008972,
"margin_dpo/beta_margin_grad_std": 0.1875038743019104,
"margin_dpo/beta_margin_mean": 3.469311237335205,
"margin_dpo/loss_margin_mean": 34.693111419677734,
"margin_dpo/margin_mean": 34.693111419677734,
"margin_dpo/margin_std": 29.18410873413086,
"step": 478
},
{
"epoch": 0.7033773861967695,
"grad_norm": 57.176082611083984,
"learning_rate": 1.2389025514492456e-07,
"logits/chosen": -0.5899140238761902,
"logits/rejected": -0.5823123455047607,
"logps/chosen": -79.59027099609375,
"logps/ref_chosen": -53.907920837402344,
"logps/ref_rejected": -95.1163330078125,
"logps/rejected": -151.4071044921875,
"loss": 0.4438,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1582869589328766,
"margin_dpo/beta_margin_grad_std": 0.21056291460990906,
"margin_dpo/beta_margin_mean": 3.0608415603637695,
"margin_dpo/loss_margin_mean": 30.608417510986328,
"margin_dpo/margin_mean": 30.608415603637695,
"margin_dpo/margin_std": 22.303516387939453,
"step": 479
},
{
"epoch": 0.7048458149779736,
"grad_norm": 74.97010040283203,
"learning_rate": 1.227838333989088e-07,
"logits/chosen": -0.5865793824195862,
"logits/rejected": -0.5290813446044922,
"logps/chosen": -85.32308959960938,
"logps/ref_chosen": -58.682701110839844,
"logps/ref_rejected": -82.93248748779297,
"logps/rejected": -146.00466918945312,
"loss": 0.5139,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15616512298583984,
"margin_dpo/beta_margin_grad_std": 0.231448233127594,
"margin_dpo/beta_margin_mean": 3.643179416656494,
"margin_dpo/loss_margin_mean": 36.431793212890625,
"margin_dpo/margin_mean": 36.431793212890625,
"margin_dpo/margin_std": 32.21718978881836,
"step": 480
},
{
"epoch": 0.7063142437591777,
"grad_norm": 53.85762023925781,
"learning_rate": 1.2168076391719489e-07,
"logits/chosen": -0.6352800130844116,
"logits/rejected": -0.6003815531730652,
"logps/chosen": -80.35116577148438,
"logps/ref_chosen": -54.964271545410156,
"logps/ref_rejected": -92.42044067382812,
"logps/rejected": -152.66683959960938,
"loss": 0.4429,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14123259484767914,
"margin_dpo/beta_margin_grad_std": 0.2171190083026886,
"margin_dpo/beta_margin_mean": 3.485950469970703,
"margin_dpo/loss_margin_mean": 34.85950469970703,
"margin_dpo/margin_mean": 34.85950469970703,
"margin_dpo/margin_std": 26.85974884033203,
"step": 481
},
{
"epoch": 0.7077826725403817,
"grad_norm": 56.575809478759766,
"learning_rate": 1.2058107576668938e-07,
"logits/chosen": -0.5973387956619263,
"logits/rejected": -0.56673264503479,
"logps/chosen": -90.35714721679688,
"logps/ref_chosen": -67.55347442626953,
"logps/ref_rejected": -87.58953857421875,
"logps/rejected": -140.64810180664062,
"loss": 0.4347,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16674280166625977,
"margin_dpo/beta_margin_grad_std": 0.18362513184547424,
"margin_dpo/beta_margin_mean": 3.0254898071289062,
"margin_dpo/loss_margin_mean": 30.254898071289062,
"margin_dpo/margin_mean": 30.254898071289062,
"margin_dpo/margin_std": 26.179527282714844,
"step": 482
},
{
"epoch": 0.7092511013215859,
"grad_norm": 71.42662048339844,
"learning_rate": 1.194847979251979e-07,
"logits/chosen": -0.5927727222442627,
"logits/rejected": -0.5307378768920898,
"logps/chosen": -89.05766296386719,
"logps/ref_chosen": -63.32981872558594,
"logps/ref_rejected": -95.78697204589844,
"logps/rejected": -157.198974609375,
"loss": 0.3982,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1320417821407318,
"margin_dpo/beta_margin_grad_std": 0.2117423117160797,
"margin_dpo/beta_margin_mean": 3.5684163570404053,
"margin_dpo/loss_margin_mean": 35.68416213989258,
"margin_dpo/margin_mean": 35.68416213989258,
"margin_dpo/margin_std": 27.353343963623047,
"step": 483
},
{
"epoch": 0.71071953010279,
"grad_norm": 52.49878692626953,
"learning_rate": 1.1839195928066101e-07,
"logits/chosen": -0.644550085067749,
"logits/rejected": -0.6064622402191162,
"logps/chosen": -81.20121002197266,
"logps/ref_chosen": -59.13812255859375,
"logps/ref_rejected": -84.37144470214844,
"logps/rejected": -142.26454162597656,
"loss": 0.3632,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14210814237594604,
"margin_dpo/beta_margin_grad_std": 0.1774926483631134,
"margin_dpo/beta_margin_mean": 3.583000659942627,
"margin_dpo/loss_margin_mean": 35.83000564575195,
"margin_dpo/margin_mean": 35.83000564575195,
"margin_dpo/margin_std": 29.71619415283203,
"step": 484
},
{
"epoch": 0.7121879588839941,
"grad_norm": 51.49626922607422,
"learning_rate": 1.1730258863039347e-07,
"logits/chosen": -0.5840227603912354,
"logits/rejected": -0.5531511306762695,
"logps/chosen": -77.98756408691406,
"logps/ref_chosen": -58.849571228027344,
"logps/ref_rejected": -103.36408996582031,
"logps/rejected": -163.02682495117188,
"loss": 0.4151,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14008283615112305,
"margin_dpo/beta_margin_grad_std": 0.21648435294628143,
"margin_dpo/beta_margin_mean": 4.052473545074463,
"margin_dpo/loss_margin_mean": 40.52473449707031,
"margin_dpo/margin_mean": 40.52473449707031,
"margin_dpo/margin_std": 32.42699432373047,
"step": 485
},
{
"epoch": 0.7136563876651982,
"grad_norm": 67.00348663330078,
"learning_rate": 1.1621671468032493e-07,
"logits/chosen": -0.6258925199508667,
"logits/rejected": -0.5719567537307739,
"logps/chosen": -78.4237060546875,
"logps/ref_chosen": -55.25966262817383,
"logps/ref_rejected": -92.13936614990234,
"logps/rejected": -154.44952392578125,
"loss": 0.4209,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14244696497917175,
"margin_dpo/beta_margin_grad_std": 0.21729934215545654,
"margin_dpo/beta_margin_mean": 3.9146108627319336,
"margin_dpo/loss_margin_mean": 39.1461067199707,
"margin_dpo/margin_mean": 39.1461067199707,
"margin_dpo/margin_std": 30.911312103271484,
"step": 486
},
{
"epoch": 0.7151248164464024,
"grad_norm": 57.11901092529297,
"learning_rate": 1.1513436604424378e-07,
"logits/chosen": -0.638907790184021,
"logits/rejected": -0.6043581962585449,
"logps/chosen": -75.46080017089844,
"logps/ref_chosen": -53.06330871582031,
"logps/ref_rejected": -92.4188232421875,
"logps/rejected": -152.25311279296875,
"loss": 0.3229,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12441418319940567,
"margin_dpo/beta_margin_grad_std": 0.16795003414154053,
"margin_dpo/beta_margin_mean": 3.743680477142334,
"margin_dpo/loss_margin_mean": 37.436805725097656,
"margin_dpo/margin_mean": 37.436805725097656,
"margin_dpo/margin_std": 26.35199737548828,
"step": 487
},
{
"epoch": 0.7165932452276065,
"grad_norm": 33.266990661621094,
"learning_rate": 1.1405557124304335e-07,
"logits/chosen": -0.591684877872467,
"logits/rejected": -0.5600037574768066,
"logps/chosen": -73.20890808105469,
"logps/ref_chosen": -52.228153228759766,
"logps/ref_rejected": -84.00656127929688,
"logps/rejected": -137.1870880126953,
"loss": 0.282,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11648823320865631,
"margin_dpo/beta_margin_grad_std": 0.14420145750045776,
"margin_dpo/beta_margin_mean": 3.219977378845215,
"margin_dpo/loss_margin_mean": 32.199771881103516,
"margin_dpo/margin_mean": 32.199771881103516,
"margin_dpo/margin_std": 21.357444763183594,
"step": 488
},
{
"epoch": 0.7180616740088106,
"grad_norm": 55.21537780761719,
"learning_rate": 1.1298035870396985e-07,
"logits/chosen": -0.6172722578048706,
"logits/rejected": -0.5707902908325195,
"logps/chosen": -78.21434020996094,
"logps/ref_chosen": -55.989627838134766,
"logps/ref_rejected": -79.39813232421875,
"logps/rejected": -133.66220092773438,
"loss": 0.4375,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1609182357788086,
"margin_dpo/beta_margin_grad_std": 0.20739710330963135,
"margin_dpo/beta_margin_mean": 3.2039356231689453,
"margin_dpo/loss_margin_mean": 32.03935623168945,
"margin_dpo/margin_mean": 32.03936004638672,
"margin_dpo/margin_std": 27.642593383789062,
"step": 489
},
{
"epoch": 0.7195301027900147,
"grad_norm": 69.54042053222656,
"learning_rate": 1.1190875675987355e-07,
"logits/chosen": -0.6238210201263428,
"logits/rejected": -0.6131519079208374,
"logps/chosen": -73.09518432617188,
"logps/ref_chosen": -52.36639404296875,
"logps/ref_rejected": -110.40904998779297,
"logps/rejected": -162.64816284179688,
"loss": 0.5726,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18633374571800232,
"margin_dpo/beta_margin_grad_std": 0.23432117700576782,
"margin_dpo/beta_margin_mean": 3.151031017303467,
"margin_dpo/loss_margin_mean": 31.510311126708984,
"margin_dpo/margin_mean": 31.510311126708984,
"margin_dpo/margin_std": 29.851600646972656,
"step": 490
},
{
"epoch": 0.7209985315712188,
"grad_norm": 71.87732696533203,
"learning_rate": 1.1084079364846241e-07,
"logits/chosen": -0.5947495698928833,
"logits/rejected": -0.5508404970169067,
"logps/chosen": -83.43572998046875,
"logps/ref_chosen": -60.11626434326172,
"logps/ref_rejected": -73.27278900146484,
"logps/rejected": -124.82978820800781,
"loss": 0.5807,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18778780102729797,
"margin_dpo/beta_margin_grad_std": 0.22901329398155212,
"margin_dpo/beta_margin_mean": 2.8237528800964355,
"margin_dpo/loss_margin_mean": 28.237525939941406,
"margin_dpo/margin_mean": 28.237525939941406,
"margin_dpo/margin_std": 27.937530517578125,
"step": 491
},
{
"epoch": 0.7224669603524229,
"grad_norm": 113.52478790283203,
"learning_rate": 1.097764975115576e-07,
"logits/chosen": -0.6196011304855347,
"logits/rejected": -0.5757460594177246,
"logps/chosen": -77.80059814453125,
"logps/ref_chosen": -53.99418258666992,
"logps/ref_rejected": -72.65962219238281,
"logps/rejected": -122.739990234375,
"loss": 0.9513,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.23549211025238037,
"margin_dpo/beta_margin_grad_std": 0.311998188495636,
"margin_dpo/beta_margin_mean": 2.6273956298828125,
"margin_dpo/loss_margin_mean": 26.273958206176758,
"margin_dpo/margin_mean": 26.273958206176758,
"margin_dpo/margin_std": 30.309785842895508,
"step": 492
},
{
"epoch": 0.723935389133627,
"grad_norm": 57.65090560913086,
"learning_rate": 1.0871589639435203e-07,
"logits/chosen": -0.6554695963859558,
"logits/rejected": -0.5956501960754395,
"logps/chosen": -95.95299530029297,
"logps/ref_chosen": -75.49723815917969,
"logps/ref_rejected": -87.32301330566406,
"logps/rejected": -140.97837829589844,
"loss": 0.4676,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1496451199054718,
"margin_dpo/beta_margin_grad_std": 0.22746598720550537,
"margin_dpo/beta_margin_mean": 3.319960117340088,
"margin_dpo/loss_margin_mean": 33.19960021972656,
"margin_dpo/margin_mean": 33.19960021972656,
"margin_dpo/margin_std": 26.605464935302734,
"step": 493
},
{
"epoch": 0.7254038179148311,
"grad_norm": 106.5522232055664,
"learning_rate": 1.0765901824467166e-07,
"logits/chosen": -0.5855438709259033,
"logits/rejected": -0.5765562653541565,
"logps/chosen": -63.644134521484375,
"logps/ref_chosen": -41.35926818847656,
"logps/ref_rejected": -86.09136962890625,
"logps/rejected": -144.02566528320312,
"loss": 0.5137,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1603744924068451,
"margin_dpo/beta_margin_grad_std": 0.23930124938488007,
"margin_dpo/beta_margin_mean": 3.5649423599243164,
"margin_dpo/loss_margin_mean": 35.64942169189453,
"margin_dpo/margin_mean": 35.64942169189453,
"margin_dpo/margin_std": 29.729022979736328,
"step": 494
},
{
"epoch": 0.7268722466960352,
"grad_norm": 68.87841033935547,
"learning_rate": 1.0660589091223854e-07,
"logits/chosen": -0.6375582218170166,
"logits/rejected": -0.5974992513656616,
"logps/chosen": -85.37813568115234,
"logps/ref_chosen": -63.53507995605469,
"logps/ref_rejected": -91.42443084716797,
"logps/rejected": -145.79571533203125,
"loss": 0.5198,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15870174765586853,
"margin_dpo/beta_margin_grad_std": 0.22453130781650543,
"margin_dpo/beta_margin_mean": 3.2528228759765625,
"margin_dpo/loss_margin_mean": 32.528228759765625,
"margin_dpo/margin_mean": 32.52822494506836,
"margin_dpo/margin_std": 27.80425262451172,
"step": 495
},
{
"epoch": 0.7283406754772394,
"grad_norm": 67.1712417602539,
"learning_rate": 1.0555654214793722e-07,
"logits/chosen": -0.6405035257339478,
"logits/rejected": -0.583281397819519,
"logps/chosen": -97.08042907714844,
"logps/ref_chosen": -72.59192657470703,
"logps/ref_rejected": -84.32933807373047,
"logps/rejected": -137.63818359375,
"loss": 0.538,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17619186639785767,
"margin_dpo/beta_margin_grad_std": 0.22553400695323944,
"margin_dpo/beta_margin_mean": 2.882033348083496,
"margin_dpo/loss_margin_mean": 28.820335388183594,
"margin_dpo/margin_mean": 28.82033348083496,
"margin_dpo/margin_std": 26.039134979248047,
"step": 496
},
{
"epoch": 0.7298091042584435,
"grad_norm": 76.36552429199219,
"learning_rate": 1.0451099960308374e-07,
"logits/chosen": -0.6118708848953247,
"logits/rejected": -0.5635442733764648,
"logps/chosen": -84.1969985961914,
"logps/ref_chosen": -58.593971252441406,
"logps/ref_rejected": -76.28836822509766,
"logps/rejected": -130.5543670654297,
"loss": 0.6066,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20490401983261108,
"margin_dpo/beta_margin_grad_std": 0.22362196445465088,
"margin_dpo/beta_margin_mean": 2.8662962913513184,
"margin_dpo/loss_margin_mean": 28.6629638671875,
"margin_dpo/margin_mean": 28.6629638671875,
"margin_dpo/margin_std": 27.733150482177734,
"step": 497
},
{
"epoch": 0.7312775330396476,
"grad_norm": 86.41475677490234,
"learning_rate": 1.0346929082869641e-07,
"logits/chosen": -0.617120623588562,
"logits/rejected": -0.5855381488800049,
"logps/chosen": -95.82791137695312,
"logps/ref_chosen": -71.20565795898438,
"logps/ref_rejected": -83.95803833007812,
"logps/rejected": -139.6630859375,
"loss": 0.5315,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1733829826116562,
"margin_dpo/beta_margin_grad_std": 0.23494428396224976,
"margin_dpo/beta_margin_mean": 3.1082797050476074,
"margin_dpo/loss_margin_mean": 31.082794189453125,
"margin_dpo/margin_mean": 31.082794189453125,
"margin_dpo/margin_std": 28.04306411743164,
"step": 498
},
{
"epoch": 0.7327459618208517,
"grad_norm": 80.22297668457031,
"learning_rate": 1.0243144327477013e-07,
"logits/chosen": -0.6173365116119385,
"logits/rejected": -0.6094462275505066,
"logps/chosen": -74.71839904785156,
"logps/ref_chosen": -51.25519561767578,
"logps/ref_rejected": -101.07870483398438,
"logps/rejected": -156.43063354492188,
"loss": 0.6745,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17869053781032562,
"margin_dpo/beta_margin_grad_std": 0.25083693861961365,
"margin_dpo/beta_margin_mean": 3.1888723373413086,
"margin_dpo/loss_margin_mean": 31.888721466064453,
"margin_dpo/margin_mean": 31.888721466064453,
"margin_dpo/margin_std": 30.46820831298828,
"step": 499
},
{
"epoch": 0.7342143906020558,
"grad_norm": 44.01335144042969,
"learning_rate": 1.0139748428955333e-07,
"logits/chosen": -0.5734531879425049,
"logits/rejected": -0.5571717023849487,
"logps/chosen": -82.99430847167969,
"logps/ref_chosen": -57.027442932128906,
"logps/ref_rejected": -93.93421173095703,
"logps/rejected": -153.8037109375,
"loss": 0.38,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1365164965391159,
"margin_dpo/beta_margin_grad_std": 0.1890600323677063,
"margin_dpo/beta_margin_mean": 3.390263557434082,
"margin_dpo/loss_margin_mean": 33.90263366699219,
"margin_dpo/margin_mean": 33.90263366699219,
"margin_dpo/margin_std": 29.03835678100586,
"step": 500
},
{
"epoch": 0.7342143906020558,
"eval_logits/chosen": -0.6236123442649841,
"eval_logits/rejected": -0.5976437926292419,
"eval_logps/chosen": -106.9358139038086,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -136.5065155029297,
"eval_loss": 0.40981218218803406,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.25789907574653625,
"eval_margin_dpo/beta_margin_grad_std": 0.2560845613479614,
"eval_margin_dpo/beta_margin_mean": 2.1823792457580566,
"eval_margin_dpo/loss_margin_mean": 21.823793411254883,
"eval_margin_dpo/margin_mean": 21.823793411254883,
"eval_margin_dpo/margin_std": 26.597421646118164,
"eval_runtime": 39.8891,
"eval_samples_per_second": 58.638,
"eval_steps_per_second": 1.855,
"step": 500
},
{
"epoch": 0.73568281938326,
"grad_norm": 51.43994140625,
"learning_rate": 1.0036744111882672e-07,
"logits/chosen": -0.6275873184204102,
"logits/rejected": -0.5868571996688843,
"logps/chosen": -76.56288146972656,
"logps/ref_chosen": -54.359527587890625,
"logps/ref_rejected": -80.15670776367188,
"logps/rejected": -140.09364318847656,
"loss": 0.3432,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1260565221309662,
"margin_dpo/beta_margin_grad_std": 0.185085728764534,
"margin_dpo/beta_margin_mean": 3.7733588218688965,
"margin_dpo/loss_margin_mean": 37.733585357666016,
"margin_dpo/margin_mean": 37.733585357666016,
"margin_dpo/margin_std": 29.447450637817383,
"step": 501
},
{
"epoch": 0.737151248164464,
"grad_norm": 45.80177688598633,
"learning_rate": 9.934134090518592e-08,
"logits/chosen": -0.6037914752960205,
"logits/rejected": -0.542682945728302,
"logps/chosen": -90.61296844482422,
"logps/ref_chosen": -67.60050964355469,
"logps/ref_rejected": -82.94876098632812,
"logps/rejected": -139.87281799316406,
"loss": 0.3237,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1189626082777977,
"margin_dpo/beta_margin_grad_std": 0.18001875281333923,
"margin_dpo/beta_margin_mean": 3.3911592960357666,
"margin_dpo/loss_margin_mean": 33.911590576171875,
"margin_dpo/margin_mean": 33.911590576171875,
"margin_dpo/margin_std": 20.9443416595459,
"step": 502
},
{
"epoch": 0.7386196769456681,
"grad_norm": 54.339813232421875,
"learning_rate": 9.831921068732571e-08,
"logits/chosen": -0.5617387294769287,
"logits/rejected": -0.5207287073135376,
"logps/chosen": -76.20591735839844,
"logps/ref_chosen": -55.078407287597656,
"logps/ref_rejected": -82.50544738769531,
"logps/rejected": -137.9188690185547,
"loss": 0.4001,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13650323450565338,
"margin_dpo/beta_margin_grad_std": 0.21414102613925934,
"margin_dpo/beta_margin_mean": 3.428591251373291,
"margin_dpo/loss_margin_mean": 34.285911560058594,
"margin_dpo/margin_mean": 34.285911560058594,
"margin_dpo/margin_std": 23.53387451171875,
"step": 503
},
{
"epoch": 0.7400881057268722,
"grad_norm": 36.07522964477539,
"learning_rate": 9.730107739932805e-08,
"logits/chosen": -0.6169658899307251,
"logits/rejected": -0.5972700119018555,
"logps/chosen": -79.52164459228516,
"logps/ref_chosen": -59.96575164794922,
"logps/ref_rejected": -103.76213073730469,
"logps/rejected": -163.46487426757812,
"loss": 0.2426,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.0935632586479187,
"margin_dpo/beta_margin_grad_std": 0.1528908908367157,
"margin_dpo/beta_margin_mean": 4.014684677124023,
"margin_dpo/loss_margin_mean": 40.1468505859375,
"margin_dpo/margin_mean": 40.146846771240234,
"margin_dpo/margin_std": 26.04753875732422,
"step": 504
},
{
"epoch": 0.7415565345080763,
"grad_norm": 81.94815063476562,
"learning_rate": 9.628696786995188e-08,
"logits/chosen": -0.6530240774154663,
"logits/rejected": -0.6016232967376709,
"logps/chosen": -101.87925720214844,
"logps/ref_chosen": -76.1549072265625,
"logps/ref_rejected": -88.58537292480469,
"logps/rejected": -142.79168701171875,
"loss": 0.6762,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.2050047218799591,
"margin_dpo/beta_margin_grad_std": 0.24811255931854248,
"margin_dpo/beta_margin_mean": 2.8481969833374023,
"margin_dpo/loss_margin_mean": 28.481971740722656,
"margin_dpo/margin_mean": 28.481969833374023,
"margin_dpo/margin_std": 29.522705078125,
"step": 505
},
{
"epoch": 0.7430249632892805,
"grad_norm": 66.44819641113281,
"learning_rate": 9.527690882192635e-08,
"logits/chosen": -0.615195631980896,
"logits/rejected": -0.579567551612854,
"logps/chosen": -71.32270812988281,
"logps/ref_chosen": -48.96050262451172,
"logps/ref_rejected": -78.41505432128906,
"logps/rejected": -134.9954376220703,
"loss": 0.448,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15291953086853027,
"margin_dpo/beta_margin_grad_std": 0.2167114019393921,
"margin_dpo/beta_margin_mean": 3.421818733215332,
"margin_dpo/loss_margin_mean": 34.21818542480469,
"margin_dpo/margin_mean": 34.21818542480469,
"margin_dpo/margin_std": 29.17880630493164,
"step": 506
},
{
"epoch": 0.7444933920704846,
"grad_norm": 52.085636138916016,
"learning_rate": 9.427092687124691e-08,
"logits/chosen": -0.6226514577865601,
"logits/rejected": -0.5839424133300781,
"logps/chosen": -90.81254577636719,
"logps/ref_chosen": -66.80150604248047,
"logps/ref_rejected": -95.37289428710938,
"logps/rejected": -151.70751953125,
"loss": 0.3354,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1319025158882141,
"margin_dpo/beta_margin_grad_std": 0.17619000375270844,
"margin_dpo/beta_margin_mean": 3.2323567867279053,
"margin_dpo/loss_margin_mean": 32.32356643676758,
"margin_dpo/margin_mean": 32.32356262207031,
"margin_dpo/margin_std": 22.592437744140625,
"step": 507
},
{
"epoch": 0.7459618208516887,
"grad_norm": 64.97161865234375,
"learning_rate": 9.326904852647344e-08,
"logits/chosen": -0.6235086917877197,
"logits/rejected": -0.5866918563842773,
"logps/chosen": -93.52983093261719,
"logps/ref_chosen": -71.303466796875,
"logps/ref_rejected": -95.6275405883789,
"logps/rejected": -150.2212371826172,
"loss": 0.5048,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17342665791511536,
"margin_dpo/beta_margin_grad_std": 0.22466185688972473,
"margin_dpo/beta_margin_mean": 3.2367329597473145,
"margin_dpo/loss_margin_mean": 32.36732864379883,
"margin_dpo/margin_mean": 32.36732864379883,
"margin_dpo/margin_std": 28.795747756958008,
"step": 508
},
{
"epoch": 0.7474302496328928,
"grad_norm": 71.68479919433594,
"learning_rate": 9.227130018803195e-08,
"logits/chosen": -0.6284923553466797,
"logits/rejected": -0.5883047580718994,
"logps/chosen": -86.16942596435547,
"logps/ref_chosen": -63.81895065307617,
"logps/ref_rejected": -83.25643920898438,
"logps/rejected": -138.71392822265625,
"loss": 0.4825,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1474766582250595,
"margin_dpo/beta_margin_grad_std": 0.23035329580307007,
"margin_dpo/beta_margin_mean": 3.3107001781463623,
"margin_dpo/loss_margin_mean": 33.10700225830078,
"margin_dpo/margin_mean": 33.10700225830078,
"margin_dpo/margin_std": 26.037105560302734,
"step": 509
},
{
"epoch": 0.748898678414097,
"grad_norm": 57.0163459777832,
"learning_rate": 9.127770814751932e-08,
"logits/chosen": -0.5673776865005493,
"logits/rejected": -0.551094114780426,
"logps/chosen": -79.0858154296875,
"logps/ref_chosen": -51.878448486328125,
"logps/ref_rejected": -102.7651596069336,
"logps/rejected": -170.52944946289062,
"loss": 0.4056,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14309245347976685,
"margin_dpo/beta_margin_grad_std": 0.21340487897396088,
"margin_dpo/beta_margin_mean": 4.055694580078125,
"margin_dpo/loss_margin_mean": 40.556941986083984,
"margin_dpo/margin_mean": 40.556941986083984,
"margin_dpo/margin_std": 32.51176452636719,
"step": 510
},
{
"epoch": 0.750367107195301,
"grad_norm": 55.767330169677734,
"learning_rate": 9.028829858700973e-08,
"logits/chosen": -0.6472057104110718,
"logits/rejected": -0.6147615909576416,
"logps/chosen": -82.40501403808594,
"logps/ref_chosen": -60.23811340332031,
"logps/ref_rejected": -92.85676574707031,
"logps/rejected": -151.13473510742188,
"loss": 0.4616,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1584860384464264,
"margin_dpo/beta_margin_grad_std": 0.22719457745552063,
"margin_dpo/beta_margin_mean": 3.6111063957214355,
"margin_dpo/loss_margin_mean": 36.11106491088867,
"margin_dpo/margin_mean": 36.11106491088867,
"margin_dpo/margin_std": 29.813339233398438,
"step": 511
},
{
"epoch": 0.7518355359765051,
"grad_norm": 45.10017776489258,
"learning_rate": 8.930309757836516e-08,
"logits/chosen": -0.6407291889190674,
"logits/rejected": -0.6046779155731201,
"logps/chosen": -80.43107604980469,
"logps/ref_chosen": -54.905494689941406,
"logps/ref_rejected": -81.87586975097656,
"logps/rejected": -142.87881469726562,
"loss": 0.3367,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.132303386926651,
"margin_dpo/beta_margin_grad_std": 0.1747945249080658,
"margin_dpo/beta_margin_mean": 3.547736644744873,
"margin_dpo/loss_margin_mean": 35.47736358642578,
"margin_dpo/margin_mean": 35.47736740112305,
"margin_dpo/margin_std": 27.888330459594727,
"step": 512
},
{
"epoch": 0.7533039647577092,
"grad_norm": 74.60807800292969,
"learning_rate": 8.832213108254863e-08,
"logits/chosen": -0.6363452672958374,
"logits/rejected": -0.5841466188430786,
"logps/chosen": -89.99166870117188,
"logps/ref_chosen": -64.91644287109375,
"logps/ref_rejected": -76.06245422363281,
"logps/rejected": -131.63497924804688,
"loss": 0.5735,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16800448298454285,
"margin_dpo/beta_margin_grad_std": 0.2321883887052536,
"margin_dpo/beta_margin_mean": 3.049729347229004,
"margin_dpo/loss_margin_mean": 30.49729347229004,
"margin_dpo/margin_mean": 30.497295379638672,
"margin_dpo/margin_std": 25.966400146484375,
"step": 513
},
{
"epoch": 0.7547723935389133,
"grad_norm": 72.9621810913086,
"learning_rate": 8.734542494893954e-08,
"logits/chosen": -0.6219351291656494,
"logits/rejected": -0.5745389461517334,
"logps/chosen": -100.20359802246094,
"logps/ref_chosen": -74.22957611083984,
"logps/ref_rejected": -78.945556640625,
"logps/rejected": -135.80770874023438,
"loss": 0.7095,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.21395042538642883,
"margin_dpo/beta_margin_grad_std": 0.27207687497138977,
"margin_dpo/beta_margin_mean": 3.0888137817382812,
"margin_dpo/loss_margin_mean": 30.88813591003418,
"margin_dpo/margin_mean": 30.888137817382812,
"margin_dpo/margin_std": 33.99193572998047,
"step": 514
},
{
"epoch": 0.7562408223201175,
"grad_norm": 49.74563980102539,
"learning_rate": 8.637300491465272e-08,
"logits/chosen": -0.6339064836502075,
"logits/rejected": -0.6179243326187134,
"logps/chosen": -73.3636703491211,
"logps/ref_chosen": -50.40156555175781,
"logps/ref_rejected": -87.09774780273438,
"logps/rejected": -143.69723510742188,
"loss": 0.377,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14515420794487,
"margin_dpo/beta_margin_grad_std": 0.17126330733299255,
"margin_dpo/beta_margin_mean": 3.363739490509033,
"margin_dpo/loss_margin_mean": 33.63739776611328,
"margin_dpo/margin_mean": 33.63739776611328,
"margin_dpo/margin_std": 27.569812774658203,
"step": 515
},
{
"epoch": 0.7577092511013216,
"grad_norm": 51.14539337158203,
"learning_rate": 8.540489660386064e-08,
"logits/chosen": -0.642087459564209,
"logits/rejected": -0.6219902038574219,
"logps/chosen": -87.75926208496094,
"logps/ref_chosen": -64.6495590209961,
"logps/ref_rejected": -111.72238159179688,
"logps/rejected": -170.44625854492188,
"loss": 0.3628,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13847823441028595,
"margin_dpo/beta_margin_grad_std": 0.18551796674728394,
"margin_dpo/beta_margin_mean": 3.5614166259765625,
"margin_dpo/loss_margin_mean": 35.614166259765625,
"margin_dpo/margin_mean": 35.614166259765625,
"margin_dpo/margin_std": 28.459064483642578,
"step": 516
},
{
"epoch": 0.7591776798825257,
"grad_norm": 49.0867805480957,
"learning_rate": 8.444112552711752e-08,
"logits/chosen": -0.6216846704483032,
"logits/rejected": -0.5751929879188538,
"logps/chosen": -86.8254623413086,
"logps/ref_chosen": -60.913551330566406,
"logps/ref_rejected": -89.08308410644531,
"logps/rejected": -150.07212829589844,
"loss": 0.3873,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12766914069652557,
"margin_dpo/beta_margin_grad_std": 0.20745864510536194,
"margin_dpo/beta_margin_mean": 3.5077133178710938,
"margin_dpo/loss_margin_mean": 35.07713317871094,
"margin_dpo/margin_mean": 35.07713317871094,
"margin_dpo/margin_std": 25.649383544921875,
"step": 517
},
{
"epoch": 0.7606461086637298,
"grad_norm": 52.4042854309082,
"learning_rate": 8.348171708068747e-08,
"logits/chosen": -0.6181496381759644,
"logits/rejected": -0.6017059087753296,
"logps/chosen": -83.06076049804688,
"logps/ref_chosen": -57.45589065551758,
"logps/ref_rejected": -85.31269836425781,
"logps/rejected": -142.11749267578125,
"loss": 0.4583,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15641465783119202,
"margin_dpo/beta_margin_grad_std": 0.2147601693868637,
"margin_dpo/beta_margin_mean": 3.119992733001709,
"margin_dpo/loss_margin_mean": 31.199928283691406,
"margin_dpo/margin_mean": 31.199928283691406,
"margin_dpo/margin_std": 26.132186889648438,
"step": 518
},
{
"epoch": 0.762114537444934,
"grad_norm": 62.43038558959961,
"learning_rate": 8.25266965458755e-08,
"logits/chosen": -0.6043561697006226,
"logits/rejected": -0.5701404213905334,
"logps/chosen": -97.01838684082031,
"logps/ref_chosen": -74.06330871582031,
"logps/ref_rejected": -104.44416809082031,
"logps/rejected": -159.92041015625,
"loss": 0.4574,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16313457489013672,
"margin_dpo/beta_margin_grad_std": 0.1897900104522705,
"margin_dpo/beta_margin_mean": 3.2521166801452637,
"margin_dpo/loss_margin_mean": 32.52116394042969,
"margin_dpo/margin_mean": 32.52116775512695,
"margin_dpo/margin_std": 30.377395629882812,
"step": 519
},
{
"epoch": 0.7635829662261381,
"grad_norm": 50.88139343261719,
"learning_rate": 8.15760890883607e-08,
"logits/chosen": -0.5501081943511963,
"logits/rejected": -0.5258777141571045,
"logps/chosen": -93.9324951171875,
"logps/ref_chosen": -70.2998275756836,
"logps/ref_rejected": -99.98133850097656,
"logps/rejected": -156.6881103515625,
"loss": 0.3602,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13520291447639465,
"margin_dpo/beta_margin_grad_std": 0.1813124269247055,
"margin_dpo/beta_margin_mean": 3.307410717010498,
"margin_dpo/loss_margin_mean": 33.0741081237793,
"margin_dpo/margin_mean": 33.07410430908203,
"margin_dpo/margin_std": 24.42025375366211,
"step": 520
},
{
"epoch": 0.7650513950073421,
"grad_norm": 51.61702346801758,
"learning_rate": 8.062991975753378e-08,
"logits/chosen": -0.6081865429878235,
"logits/rejected": -0.578285276889801,
"logps/chosen": -80.80309295654297,
"logps/ref_chosen": -58.14292907714844,
"logps/ref_rejected": -83.28060913085938,
"logps/rejected": -138.00160217285156,
"loss": 0.4205,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15506964921951294,
"margin_dpo/beta_margin_grad_std": 0.19557394087314606,
"margin_dpo/beta_margin_mean": 3.2060821056365967,
"margin_dpo/loss_margin_mean": 32.060821533203125,
"margin_dpo/margin_mean": 32.060821533203125,
"margin_dpo/margin_std": 26.212413787841797,
"step": 521
},
{
"epoch": 0.7665198237885462,
"grad_norm": 49.603553771972656,
"learning_rate": 7.968821348583643e-08,
"logits/chosen": -0.610974133014679,
"logits/rejected": -0.5809307098388672,
"logps/chosen": -70.47454833984375,
"logps/ref_chosen": -46.54766845703125,
"logps/ref_rejected": -66.01388549804688,
"logps/rejected": -118.23016357421875,
"loss": 0.4643,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17351622879505157,
"margin_dpo/beta_margin_grad_std": 0.19399023056030273,
"margin_dpo/beta_margin_mean": 2.8289389610290527,
"margin_dpo/loss_margin_mean": 28.289390563964844,
"margin_dpo/margin_mean": 28.289390563964844,
"margin_dpo/margin_std": 25.933521270751953,
"step": 522
},
{
"epoch": 0.7679882525697503,
"grad_norm": 62.4161376953125,
"learning_rate": 7.875099508810484e-08,
"logits/chosen": -0.6249532699584961,
"logits/rejected": -0.5845484733581543,
"logps/chosen": -86.18218994140625,
"logps/ref_chosen": -61.76960372924805,
"logps/ref_rejected": -83.76141357421875,
"logps/rejected": -140.2449951171875,
"loss": 0.5486,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18028101325035095,
"margin_dpo/beta_margin_grad_std": 0.23459823429584503,
"margin_dpo/beta_margin_mean": 3.2071008682250977,
"margin_dpo/loss_margin_mean": 32.071006774902344,
"margin_dpo/margin_mean": 32.071006774902344,
"margin_dpo/margin_std": 29.059005737304688,
"step": 523
},
{
"epoch": 0.7694566813509545,
"grad_norm": 61.96669387817383,
"learning_rate": 7.781828926091535e-08,
"logits/chosen": -0.5891939401626587,
"logits/rejected": -0.5522305965423584,
"logps/chosen": -101.28225708007812,
"logps/ref_chosen": -78.0720443725586,
"logps/ref_rejected": -81.30198669433594,
"logps/rejected": -134.13906860351562,
"loss": 0.502,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1610349714756012,
"margin_dpo/beta_margin_grad_std": 0.2138672173023224,
"margin_dpo/beta_margin_mean": 2.962686538696289,
"margin_dpo/loss_margin_mean": 29.62686538696289,
"margin_dpo/margin_mean": 29.62686538696289,
"margin_dpo/margin_std": 24.750131607055664,
"step": 524
},
{
"epoch": 0.7709251101321586,
"grad_norm": 38.1711540222168,
"learning_rate": 7.689012058193384e-08,
"logits/chosen": -0.5875400304794312,
"logits/rejected": -0.5771076679229736,
"logps/chosen": -73.10057830810547,
"logps/ref_chosen": -50.827857971191406,
"logps/ref_rejected": -100.05293273925781,
"logps/rejected": -157.6528778076172,
"loss": 0.2666,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10849446058273315,
"margin_dpo/beta_margin_grad_std": 0.14546433091163635,
"margin_dpo/beta_margin_mean": 3.532721996307373,
"margin_dpo/loss_margin_mean": 35.32722091674805,
"margin_dpo/margin_mean": 35.32722091674805,
"margin_dpo/margin_std": 23.876306533813477,
"step": 525
},
{
"epoch": 0.7723935389133627,
"grad_norm": 69.49327850341797,
"learning_rate": 7.596651350926836e-08,
"logits/chosen": -0.6228262782096863,
"logits/rejected": -0.5682265162467957,
"logps/chosen": -88.6613540649414,
"logps/ref_chosen": -63.167232513427734,
"logps/ref_rejected": -86.30934143066406,
"logps/rejected": -146.317138671875,
"loss": 0.4345,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14551463723182678,
"margin_dpo/beta_margin_grad_std": 0.21542373299598694,
"margin_dpo/beta_margin_mean": 3.451366901397705,
"margin_dpo/loss_margin_mean": 34.51366424560547,
"margin_dpo/margin_mean": 34.51366424560547,
"margin_dpo/margin_std": 27.885501861572266,
"step": 526
},
{
"epoch": 0.7738619676945668,
"grad_norm": 59.04280471801758,
"learning_rate": 7.504749238082414e-08,
"logits/chosen": -0.6852065324783325,
"logits/rejected": -0.6316944360733032,
"logps/chosen": -94.76507568359375,
"logps/ref_chosen": -71.12867736816406,
"logps/ref_rejected": -78.3425521850586,
"logps/rejected": -134.12088012695312,
"loss": 0.4411,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1558818817138672,
"margin_dpo/beta_margin_grad_std": 0.2005808800458908,
"margin_dpo/beta_margin_mean": 3.2141916751861572,
"margin_dpo/loss_margin_mean": 32.14191436767578,
"margin_dpo/margin_mean": 32.14191436767578,
"margin_dpo/margin_std": 28.21198272705078,
"step": 527
},
{
"epoch": 0.775330396475771,
"grad_norm": 52.068695068359375,
"learning_rate": 7.413308141366254e-08,
"logits/chosen": -0.6312476396560669,
"logits/rejected": -0.6082254648208618,
"logps/chosen": -91.81694030761719,
"logps/ref_chosen": -68.0894546508789,
"logps/ref_rejected": -93.91006469726562,
"logps/rejected": -148.40286254882812,
"loss": 0.4254,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1584298014640808,
"margin_dpo/beta_margin_grad_std": 0.2020604908466339,
"margin_dpo/beta_margin_mean": 3.0765323638916016,
"margin_dpo/loss_margin_mean": 30.765323638916016,
"margin_dpo/margin_mean": 30.765323638916016,
"margin_dpo/margin_std": 25.180191040039062,
"step": 528
},
{
"epoch": 0.7767988252569751,
"grad_norm": 78.19168853759766,
"learning_rate": 7.322330470336313e-08,
"logits/chosen": -0.6202692985534668,
"logits/rejected": -0.5999141931533813,
"logps/chosen": -82.72918701171875,
"logps/ref_chosen": -55.5749626159668,
"logps/ref_rejected": -89.20909118652344,
"logps/rejected": -144.00283813476562,
"loss": 0.7184,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19822926819324493,
"margin_dpo/beta_margin_grad_std": 0.24975398182868958,
"margin_dpo/beta_margin_mean": 2.7639517784118652,
"margin_dpo/loss_margin_mean": 27.63951873779297,
"margin_dpo/margin_mean": 27.63951873779297,
"margin_dpo/margin_std": 28.70267677307129,
"step": 529
},
{
"epoch": 0.7782672540381792,
"grad_norm": 62.91647720336914,
"learning_rate": 7.231818622338822e-08,
"logits/chosen": -0.5722482204437256,
"logits/rejected": -0.5528737902641296,
"logps/chosen": -71.75675201416016,
"logps/ref_chosen": -47.601417541503906,
"logps/ref_rejected": -87.2845230102539,
"logps/rejected": -148.30410766601562,
"loss": 0.4394,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13412612676620483,
"margin_dpo/beta_margin_grad_std": 0.2340560257434845,
"margin_dpo/beta_margin_mean": 3.686424732208252,
"margin_dpo/loss_margin_mean": 36.8642463684082,
"margin_dpo/margin_mean": 36.8642463684082,
"margin_dpo/margin_std": 27.135440826416016,
"step": 530
},
{
"epoch": 0.7797356828193832,
"grad_norm": 56.84402084350586,
"learning_rate": 7.141774982445147e-08,
"logits/chosen": -0.6246213912963867,
"logits/rejected": -0.583504319190979,
"logps/chosen": -78.43968200683594,
"logps/ref_chosen": -55.246063232421875,
"logps/ref_rejected": -70.60598754882812,
"logps/rejected": -126.05680847167969,
"loss": 0.5239,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16395235061645508,
"margin_dpo/beta_margin_grad_std": 0.2341843992471695,
"margin_dpo/beta_margin_mean": 3.2257208824157715,
"margin_dpo/loss_margin_mean": 32.25720977783203,
"margin_dpo/margin_mean": 32.25720977783203,
"margin_dpo/margin_std": 28.00493049621582,
"step": 531
},
{
"epoch": 0.7812041116005873,
"grad_norm": 58.166229248046875,
"learning_rate": 7.052201923388953e-08,
"logits/chosen": -0.5881683826446533,
"logits/rejected": -0.554157018661499,
"logps/chosen": -94.40569305419922,
"logps/ref_chosen": -70.28602600097656,
"logps/ref_rejected": -86.5913314819336,
"logps/rejected": -148.8220672607422,
"loss": 0.3506,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12911692261695862,
"margin_dpo/beta_margin_grad_std": 0.19371378421783447,
"margin_dpo/beta_margin_mean": 3.8111071586608887,
"margin_dpo/loss_margin_mean": 38.11106872558594,
"margin_dpo/margin_mean": 38.11106872558594,
"margin_dpo/margin_std": 27.990556716918945,
"step": 532
},
{
"epoch": 0.7826725403817915,
"grad_norm": 70.82559204101562,
"learning_rate": 6.963101805503646e-08,
"logits/chosen": -0.6239089965820312,
"logits/rejected": -0.5815380215644836,
"logps/chosen": -88.10252380371094,
"logps/ref_chosen": -64.8551025390625,
"logps/ref_rejected": -76.58805847167969,
"logps/rejected": -126.4080581665039,
"loss": 0.5933,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18534313142299652,
"margin_dpo/beta_margin_grad_std": 0.23213329911231995,
"margin_dpo/beta_margin_mean": 2.657257080078125,
"margin_dpo/loss_margin_mean": 26.57257080078125,
"margin_dpo/margin_mean": 26.572572708129883,
"margin_dpo/margin_std": 23.991806030273438,
"step": 533
},
{
"epoch": 0.7841409691629956,
"grad_norm": 47.19809341430664,
"learning_rate": 6.874476976660184e-08,
"logits/chosen": -0.623961329460144,
"logits/rejected": -0.5935629606246948,
"logps/chosen": -82.6689453125,
"logps/ref_chosen": -60.119388580322266,
"logps/ref_rejected": -78.54347229003906,
"logps/rejected": -133.5894012451172,
"loss": 0.4033,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1534009873867035,
"margin_dpo/beta_margin_grad_std": 0.18615348637104034,
"margin_dpo/beta_margin_mean": 3.2496376037597656,
"margin_dpo/loss_margin_mean": 32.496376037597656,
"margin_dpo/margin_mean": 32.49637222290039,
"margin_dpo/margin_std": 27.735137939453125,
"step": 534
},
{
"epoch": 0.7856093979441997,
"grad_norm": 46.1759147644043,
"learning_rate": 6.786329772205246e-08,
"logits/chosen": -0.6095963716506958,
"logits/rejected": -0.5779241919517517,
"logps/chosen": -74.88334655761719,
"logps/ref_chosen": -54.330238342285156,
"logps/ref_rejected": -96.30763244628906,
"logps/rejected": -152.88735961914062,
"loss": 0.3929,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13403823971748352,
"margin_dpo/beta_margin_grad_std": 0.19758474826812744,
"margin_dpo/beta_margin_mean": 3.602663040161133,
"margin_dpo/loss_margin_mean": 36.02663040161133,
"margin_dpo/margin_mean": 36.02663040161133,
"margin_dpo/margin_std": 26.887496948242188,
"step": 535
},
{
"epoch": 0.7870778267254038,
"grad_norm": 33.667205810546875,
"learning_rate": 6.698662514899638e-08,
"logits/chosen": -0.6077243089675903,
"logits/rejected": -0.5903106927871704,
"logps/chosen": -67.83413696289062,
"logps/ref_chosen": -47.08053207397461,
"logps/ref_rejected": -89.09783935546875,
"logps/rejected": -150.54098510742188,
"loss": 0.2183,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.0890052318572998,
"margin_dpo/beta_margin_grad_std": 0.13288089632987976,
"margin_dpo/beta_margin_mean": 4.068953990936279,
"margin_dpo/loss_margin_mean": 40.689537048339844,
"margin_dpo/margin_mean": 40.689537048339844,
"margin_dpo/margin_std": 27.558616638183594,
"step": 536
},
{
"epoch": 0.788546255506608,
"grad_norm": 60.72896957397461,
"learning_rate": 6.611477514857114e-08,
"logits/chosen": -0.6039552688598633,
"logits/rejected": -0.5421825647354126,
"logps/chosen": -78.5447998046875,
"logps/ref_chosen": -57.747474670410156,
"logps/ref_rejected": -70.43838500976562,
"logps/rejected": -124.99288940429688,
"loss": 0.4139,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1552681028842926,
"margin_dpo/beta_margin_grad_std": 0.19483794271945953,
"margin_dpo/beta_margin_mean": 3.375717878341675,
"margin_dpo/loss_margin_mean": 33.757179260253906,
"margin_dpo/margin_mean": 33.757179260253906,
"margin_dpo/margin_std": 28.151874542236328,
"step": 537
},
{
"epoch": 0.7900146842878121,
"grad_norm": 46.434898376464844,
"learning_rate": 6.524777069483525e-08,
"logits/chosen": -0.616761326789856,
"logits/rejected": -0.5684964656829834,
"logps/chosen": -89.30928039550781,
"logps/ref_chosen": -66.41593933105469,
"logps/ref_rejected": -84.22808837890625,
"logps/rejected": -139.8426055908203,
"loss": 0.3541,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12844222784042358,
"margin_dpo/beta_margin_grad_std": 0.16971102356910706,
"margin_dpo/beta_margin_mean": 3.2721188068389893,
"margin_dpo/loss_margin_mean": 32.721187591552734,
"margin_dpo/margin_mean": 32.721187591552734,
"margin_dpo/margin_std": 25.067447662353516,
"step": 538
},
{
"epoch": 0.7914831130690162,
"grad_norm": 55.15032196044922,
"learning_rate": 6.438563463416221e-08,
"logits/chosen": -0.6659849882125854,
"logits/rejected": -0.6233581304550171,
"logps/chosen": -79.83650207519531,
"logps/ref_chosen": -58.49285125732422,
"logps/ref_rejected": -91.85395812988281,
"logps/rejected": -144.42047119140625,
"loss": 0.4882,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16426563262939453,
"margin_dpo/beta_margin_grad_std": 0.21655422449111938,
"margin_dpo/beta_margin_mean": 3.122286319732666,
"margin_dpo/loss_margin_mean": 31.22286605834961,
"margin_dpo/margin_mean": 31.22286605834961,
"margin_dpo/margin_std": 27.15618133544922,
"step": 539
},
{
"epoch": 0.7929515418502202,
"grad_norm": 62.82166290283203,
"learning_rate": 6.352838968463919e-08,
"logits/chosen": -0.6381834149360657,
"logits/rejected": -0.6113142967224121,
"logps/chosen": -85.19060516357422,
"logps/ref_chosen": -63.482513427734375,
"logps/ref_rejected": -116.43000030517578,
"logps/rejected": -173.4632110595703,
"loss": 0.4628,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13866056501865387,
"margin_dpo/beta_margin_grad_std": 0.2188330590724945,
"margin_dpo/beta_margin_mean": 3.5325119495391846,
"margin_dpo/loss_margin_mean": 35.32511901855469,
"margin_dpo/margin_mean": 35.32511901855469,
"margin_dpo/margin_std": 27.556922912597656,
"step": 540
},
{
"epoch": 0.7944199706314243,
"grad_norm": 62.53669738769531,
"learning_rate": 6.267605843546767e-08,
"logits/chosen": -0.6469000577926636,
"logits/rejected": -0.6038193702697754,
"logps/chosen": -101.33219146728516,
"logps/ref_chosen": -78.28035736083984,
"logps/ref_rejected": -103.273681640625,
"logps/rejected": -156.50372314453125,
"loss": 0.4275,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15085425972938538,
"margin_dpo/beta_margin_grad_std": 0.18950016796588898,
"margin_dpo/beta_margin_mean": 3.017820358276367,
"margin_dpo/loss_margin_mean": 30.17820167541504,
"margin_dpo/margin_mean": 30.178203582763672,
"margin_dpo/margin_std": 23.339244842529297,
"step": 541
},
{
"epoch": 0.7958883994126285,
"grad_norm": 39.13835144042969,
"learning_rate": 6.182866334636888e-08,
"logits/chosen": -0.6534620523452759,
"logits/rejected": -0.6460641622543335,
"logps/chosen": -80.37567901611328,
"logps/ref_chosen": -57.48497009277344,
"logps/ref_rejected": -96.47506713867188,
"logps/rejected": -153.80548095703125,
"loss": 0.366,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13392959535121918,
"margin_dpo/beta_margin_grad_std": 0.19380900263786316,
"margin_dpo/beta_margin_mean": 3.4439687728881836,
"margin_dpo/loss_margin_mean": 34.43968963623047,
"margin_dpo/margin_mean": 34.43968963623047,
"margin_dpo/margin_std": 25.000656127929688,
"step": 542
},
{
"epoch": 0.7973568281938326,
"grad_norm": 80.92756652832031,
"learning_rate": 6.098622674699147e-08,
"logits/chosen": -0.5841265916824341,
"logits/rejected": -0.5707241296768188,
"logps/chosen": -84.22129821777344,
"logps/ref_chosen": -60.61750793457031,
"logps/ref_rejected": -105.59896850585938,
"logps/rejected": -154.79116821289062,
"loss": 0.6059,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19587799906730652,
"margin_dpo/beta_margin_grad_std": 0.22574475407600403,
"margin_dpo/beta_margin_mean": 2.5588417053222656,
"margin_dpo/loss_margin_mean": 25.588415145874023,
"margin_dpo/margin_mean": 25.588417053222656,
"margin_dpo/margin_std": 25.410099029541016,
"step": 543
},
{
"epoch": 0.7988252569750367,
"grad_norm": 46.70132064819336,
"learning_rate": 6.01487708363232e-08,
"logits/chosen": -0.6044985055923462,
"logits/rejected": -0.5905438661575317,
"logps/chosen": -85.07525634765625,
"logps/ref_chosen": -59.642303466796875,
"logps/ref_rejected": -100.95469665527344,
"logps/rejected": -159.369873046875,
"loss": 0.3149,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12085522711277008,
"margin_dpo/beta_margin_grad_std": 0.16033101081848145,
"margin_dpo/beta_margin_mean": 3.2982213497161865,
"margin_dpo/loss_margin_mean": 32.98221206665039,
"margin_dpo/margin_mean": 32.98221206665039,
"margin_dpo/margin_std": 24.098819732666016,
"step": 544
},
{
"epoch": 0.8002936857562408,
"grad_norm": 49.51677703857422,
"learning_rate": 5.9316317682106294e-08,
"logits/chosen": -0.5612127780914307,
"logits/rejected": -0.5339560508728027,
"logps/chosen": -91.53611755371094,
"logps/ref_chosen": -67.64859771728516,
"logps/ref_rejected": -95.90800476074219,
"logps/rejected": -154.13796997070312,
"loss": 0.3835,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14522799849510193,
"margin_dpo/beta_margin_grad_std": 0.19207137823104858,
"margin_dpo/beta_margin_mean": 3.43424654006958,
"margin_dpo/loss_margin_mean": 34.34246826171875,
"margin_dpo/margin_mean": 34.34246826171875,
"margin_dpo/margin_std": 26.62921142578125,
"step": 545
},
{
"epoch": 0.801762114537445,
"grad_norm": 49.70174789428711,
"learning_rate": 5.848888922025552e-08,
"logits/chosen": -0.5628246665000916,
"logits/rejected": -0.5294591188430786,
"logps/chosen": -73.03337097167969,
"logps/ref_chosen": -50.744232177734375,
"logps/ref_rejected": -81.86622619628906,
"logps/rejected": -137.66339111328125,
"loss": 0.3402,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12423272430896759,
"margin_dpo/beta_margin_grad_std": 0.17410895228385925,
"margin_dpo/beta_margin_mean": 3.350802421569824,
"margin_dpo/loss_margin_mean": 33.50802230834961,
"margin_dpo/margin_mean": 33.50802230834961,
"margin_dpo/margin_std": 23.63653564453125,
"step": 546
},
{
"epoch": 0.8032305433186491,
"grad_norm": 94.18778228759766,
"learning_rate": 5.7666507254280265e-08,
"logits/chosen": -0.5813489556312561,
"logits/rejected": -0.5452552437782288,
"logps/chosen": -99.18565368652344,
"logps/ref_chosen": -73.6877212524414,
"logps/ref_rejected": -90.76136779785156,
"logps/rejected": -147.3549041748047,
"loss": 0.5868,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18151536583900452,
"margin_dpo/beta_margin_grad_std": 0.23811323940753937,
"margin_dpo/beta_margin_mean": 3.109560966491699,
"margin_dpo/loss_margin_mean": 31.09560775756836,
"margin_dpo/margin_mean": 31.09560775756836,
"margin_dpo/margin_std": 29.87148666381836,
"step": 547
},
{
"epoch": 0.8046989720998532,
"grad_norm": 51.33172607421875,
"learning_rate": 5.684919345471029e-08,
"logits/chosen": -0.6642282009124756,
"logits/rejected": -0.6327365040779114,
"logps/chosen": -87.51785278320312,
"logps/ref_chosen": -65.24634552001953,
"logps/ref_rejected": -94.11807250976562,
"logps/rejected": -150.5765380859375,
"loss": 0.4329,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14642292261123657,
"margin_dpo/beta_margin_grad_std": 0.21783213317394257,
"margin_dpo/beta_margin_mean": 3.4186956882476807,
"margin_dpo/loss_margin_mean": 34.186954498291016,
"margin_dpo/margin_mean": 34.186954498291016,
"margin_dpo/margin_std": 28.527481079101562,
"step": 548
},
{
"epoch": 0.8061674008810573,
"grad_norm": 59.542022705078125,
"learning_rate": 5.603696935852426e-08,
"logits/chosen": -0.587199866771698,
"logits/rejected": -0.5497395992279053,
"logps/chosen": -70.22129821777344,
"logps/ref_chosen": -49.21235656738281,
"logps/ref_rejected": -73.91031646728516,
"logps/rejected": -129.74290466308594,
"loss": 0.3415,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12509265542030334,
"margin_dpo/beta_margin_grad_std": 0.17894169688224792,
"margin_dpo/beta_margin_mean": 3.4823646545410156,
"margin_dpo/loss_margin_mean": 34.823646545410156,
"margin_dpo/margin_mean": 34.823646545410156,
"margin_dpo/margin_std": 25.530513763427734,
"step": 549
},
{
"epoch": 0.8076358296622613,
"grad_norm": 69.98318481445312,
"learning_rate": 5.5229856368582376e-08,
"logits/chosen": -0.5780174732208252,
"logits/rejected": -0.554786741733551,
"logps/chosen": -81.68783569335938,
"logps/ref_chosen": -56.80695343017578,
"logps/ref_rejected": -95.12580871582031,
"logps/rejected": -147.86605834960938,
"loss": 0.512,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18082059919834137,
"margin_dpo/beta_margin_grad_std": 0.22207701206207275,
"margin_dpo/beta_margin_mean": 2.7859373092651367,
"margin_dpo/loss_margin_mean": 27.859371185302734,
"margin_dpo/margin_mean": 27.859375,
"margin_dpo/margin_std": 24.073030471801758,
"step": 550
},
{
"epoch": 0.8091042584434655,
"grad_norm": 68.77825164794922,
"learning_rate": 5.4427875753062734e-08,
"logits/chosen": -0.6016166806221008,
"logits/rejected": -0.5792367458343506,
"logps/chosen": -82.6038589477539,
"logps/ref_chosen": -59.10633087158203,
"logps/ref_rejected": -111.67280578613281,
"logps/rejected": -170.34124755859375,
"loss": 0.3618,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13445577025413513,
"margin_dpo/beta_margin_grad_std": 0.18969406187534332,
"margin_dpo/beta_margin_mean": 3.5170915126800537,
"margin_dpo/loss_margin_mean": 35.17091369628906,
"margin_dpo/margin_mean": 35.17091369628906,
"margin_dpo/margin_std": 27.047245025634766,
"step": 551
},
{
"epoch": 0.8105726872246696,
"grad_norm": 36.09619903564453,
"learning_rate": 5.363104864490034e-08,
"logits/chosen": -0.6584379076957703,
"logits/rejected": -0.6297129392623901,
"logps/chosen": -82.9939193725586,
"logps/ref_chosen": -62.35459899902344,
"logps/ref_rejected": -104.56210327148438,
"logps/rejected": -164.9587860107422,
"loss": 0.2475,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10201341658830643,
"margin_dpo/beta_margin_grad_std": 0.1296149343252182,
"margin_dpo/beta_margin_mean": 3.975735902786255,
"margin_dpo/loss_margin_mean": 39.757354736328125,
"margin_dpo/margin_mean": 39.757354736328125,
"margin_dpo/margin_std": 30.61846923828125,
"step": 552
},
{
"epoch": 0.8120411160058737,
"grad_norm": 62.299354553222656,
"learning_rate": 5.2839396041230415e-08,
"logits/chosen": -0.5835554599761963,
"logits/rejected": -0.5560900568962097,
"logps/chosen": -89.63333129882812,
"logps/ref_chosen": -68.25881958007812,
"logps/ref_rejected": -98.0971450805664,
"logps/rejected": -150.1568603515625,
"loss": 0.4084,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15132924914360046,
"margin_dpo/beta_margin_grad_std": 0.1949571967124939,
"margin_dpo/beta_margin_mean": 3.0685200691223145,
"margin_dpo/loss_margin_mean": 30.685199737548828,
"margin_dpo/margin_mean": 30.685199737548828,
"margin_dpo/margin_std": 24.393556594848633,
"step": 553
},
{
"epoch": 0.8135095447870778,
"grad_norm": 70.59496307373047,
"learning_rate": 5.205293880283551e-08,
"logits/chosen": -0.5978009104728699,
"logits/rejected": -0.5454249382019043,
"logps/chosen": -91.25200653076172,
"logps/ref_chosen": -67.94767761230469,
"logps/ref_rejected": -89.78272247314453,
"logps/rejected": -154.95721435546875,
"loss": 0.4373,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12750211358070374,
"margin_dpo/beta_margin_grad_std": 0.22240933775901794,
"margin_dpo/beta_margin_mean": 4.187016487121582,
"margin_dpo/loss_margin_mean": 41.87016677856445,
"margin_dpo/margin_mean": 41.87016677856445,
"margin_dpo/margin_std": 30.95236587524414,
"step": 554
},
{
"epoch": 0.8149779735682819,
"grad_norm": 61.74562454223633,
"learning_rate": 5.127169765359515e-08,
"logits/chosen": -0.5948277115821838,
"logits/rejected": -0.5893919467926025,
"logps/chosen": -75.4261245727539,
"logps/ref_chosen": -53.33049011230469,
"logps/ref_rejected": -108.47937774658203,
"logps/rejected": -165.4979248046875,
"loss": 0.4571,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14716145396232605,
"margin_dpo/beta_margin_grad_std": 0.20644359290599823,
"margin_dpo/beta_margin_mean": 3.4922895431518555,
"margin_dpo/loss_margin_mean": 34.92289733886719,
"margin_dpo/margin_mean": 34.92289733886719,
"margin_dpo/margin_std": 27.98041534423828,
"step": 555
},
{
"epoch": 0.8164464023494861,
"grad_norm": 72.64017486572266,
"learning_rate": 5.049569317994012e-08,
"logits/chosen": -0.5797896385192871,
"logits/rejected": -0.5396873950958252,
"logps/chosen": -80.73486328125,
"logps/ref_chosen": -58.64447021484375,
"logps/ref_rejected": -101.34040832519531,
"logps/rejected": -154.17111206054688,
"loss": 0.5343,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1822032928466797,
"margin_dpo/beta_margin_grad_std": 0.2356235682964325,
"margin_dpo/beta_margin_mean": 3.074030876159668,
"margin_dpo/loss_margin_mean": 30.740306854248047,
"margin_dpo/margin_mean": 30.740306854248047,
"margin_dpo/margin_std": 27.73691177368164,
"step": 556
},
{
"epoch": 0.8179148311306902,
"grad_norm": 52.41410446166992,
"learning_rate": 4.9724945830310144e-08,
"logits/chosen": -0.6446192264556885,
"logits/rejected": -0.6262944936752319,
"logps/chosen": -89.59944152832031,
"logps/ref_chosen": -67.84066009521484,
"logps/ref_rejected": -109.93966674804688,
"logps/rejected": -162.2843475341797,
"loss": 0.4552,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16416671872138977,
"margin_dpo/beta_margin_grad_std": 0.19996830821037292,
"margin_dpo/beta_margin_mean": 3.0585899353027344,
"margin_dpo/loss_margin_mean": 30.585901260375977,
"margin_dpo/margin_mean": 30.585901260375977,
"margin_dpo/margin_std": 26.19734001159668,
"step": 557
},
{
"epoch": 0.8193832599118943,
"grad_norm": 38.192787170410156,
"learning_rate": 4.8959475914614554e-08,
"logits/chosen": -0.6551119089126587,
"logits/rejected": -0.6068642139434814,
"logps/chosen": -81.69489288330078,
"logps/ref_chosen": -62.36824035644531,
"logps/ref_rejected": -102.16102600097656,
"logps/rejected": -162.4652862548828,
"loss": 0.2874,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10797977447509766,
"margin_dpo/beta_margin_grad_std": 0.14597085118293762,
"margin_dpo/beta_margin_mean": 4.0977606773376465,
"margin_dpo/loss_margin_mean": 40.97760772705078,
"margin_dpo/margin_mean": 40.97760772705078,
"margin_dpo/margin_std": 30.455211639404297,
"step": 558
},
{
"epoch": 0.8208516886930984,
"grad_norm": 55.569332122802734,
"learning_rate": 4.8199303603697614e-08,
"logits/chosen": -0.6832656860351562,
"logits/rejected": -0.6301894187927246,
"logps/chosen": -80.5696029663086,
"logps/ref_chosen": -60.75232696533203,
"logps/ref_rejected": -93.4422836303711,
"logps/rejected": -146.19882202148438,
"loss": 0.4325,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15588068962097168,
"margin_dpo/beta_margin_grad_std": 0.20822836458683014,
"margin_dpo/beta_margin_mean": 3.2939257621765137,
"margin_dpo/loss_margin_mean": 32.93925857543945,
"margin_dpo/margin_mean": 32.93925476074219,
"margin_dpo/margin_std": 27.555404663085938,
"step": 559
},
{
"epoch": 0.8223201174743024,
"grad_norm": 67.84832000732422,
"learning_rate": 4.7444448928806615e-08,
"logits/chosen": -0.5837658643722534,
"logits/rejected": -0.5339952707290649,
"logps/chosen": -79.17695617675781,
"logps/ref_chosen": -58.10382080078125,
"logps/ref_rejected": -79.99122619628906,
"logps/rejected": -130.1254425048828,
"loss": 0.4489,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16372855007648468,
"margin_dpo/beta_margin_grad_std": 0.1933506578207016,
"margin_dpo/beta_margin_mean": 2.906108856201172,
"margin_dpo/loss_margin_mean": 29.061086654663086,
"margin_dpo/margin_mean": 29.061086654663086,
"margin_dpo/margin_std": 24.71479034423828,
"step": 560
},
{
"epoch": 0.8237885462555066,
"grad_norm": 66.11046600341797,
"learning_rate": 4.669493178106432e-08,
"logits/chosen": -0.6318497657775879,
"logits/rejected": -0.6243282556533813,
"logps/chosen": -76.038330078125,
"logps/ref_chosen": -50.91287612915039,
"logps/ref_rejected": -99.06857299804688,
"logps/rejected": -153.46937561035156,
"loss": 0.4945,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17272219061851501,
"margin_dpo/beta_margin_grad_std": 0.21409326791763306,
"margin_dpo/beta_margin_mean": 2.927535057067871,
"margin_dpo/loss_margin_mean": 29.275352478027344,
"margin_dpo/margin_mean": 29.275352478027344,
"margin_dpo/margin_std": 26.028850555419922,
"step": 561
},
{
"epoch": 0.8252569750367107,
"grad_norm": 34.92936706542969,
"learning_rate": 4.5950771910944596e-08,
"logits/chosen": -0.651642382144928,
"logits/rejected": -0.604433536529541,
"logps/chosen": -78.28529357910156,
"logps/ref_chosen": -59.46440124511719,
"logps/ref_rejected": -96.54266357421875,
"logps/rejected": -153.3458709716797,
"loss": 0.2435,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.0989568680524826,
"margin_dpo/beta_margin_grad_std": 0.1261216104030609,
"margin_dpo/beta_margin_mean": 3.7982311248779297,
"margin_dpo/loss_margin_mean": 37.9823112487793,
"margin_dpo/margin_mean": 37.9823112487793,
"margin_dpo/margin_std": 26.726564407348633,
"step": 562
},
{
"epoch": 0.8267254038179148,
"grad_norm": 63.81352233886719,
"learning_rate": 4.521198892775202e-08,
"logits/chosen": -0.5930050611495972,
"logits/rejected": -0.5729939937591553,
"logps/chosen": -83.12980651855469,
"logps/ref_chosen": -60.60819625854492,
"logps/ref_rejected": -94.56770324707031,
"logps/rejected": -147.40249633789062,
"loss": 0.4148,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15109089016914368,
"margin_dpo/beta_margin_grad_std": 0.1935727894306183,
"margin_dpo/beta_margin_mean": 3.031318426132202,
"margin_dpo/loss_margin_mean": 30.313182830810547,
"margin_dpo/margin_mean": 30.313182830810547,
"margin_dpo/margin_std": 23.21819496154785,
"step": 563
},
{
"epoch": 0.8281938325991189,
"grad_norm": 47.72722244262695,
"learning_rate": 4.447860229910544e-08,
"logits/chosen": -0.656052827835083,
"logits/rejected": -0.5981060862541199,
"logps/chosen": -96.48939514160156,
"logps/ref_chosen": -74.26837921142578,
"logps/ref_rejected": -93.2381820678711,
"logps/rejected": -147.96966552734375,
"loss": 0.368,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13481834530830383,
"margin_dpo/beta_margin_grad_std": 0.1763157844543457,
"margin_dpo/beta_margin_mean": 3.2510476112365723,
"margin_dpo/loss_margin_mean": 32.510475158691406,
"margin_dpo/margin_mean": 32.510475158691406,
"margin_dpo/margin_std": 22.74962043762207,
"step": 564
},
{
"epoch": 0.8296622613803231,
"grad_norm": 44.3295783996582,
"learning_rate": 4.375063135042445e-08,
"logits/chosen": -0.6097604036331177,
"logits/rejected": -0.5671969652175903,
"logps/chosen": -91.07102966308594,
"logps/ref_chosen": -69.0199203491211,
"logps/ref_rejected": -85.7789306640625,
"logps/rejected": -143.09686279296875,
"loss": 0.3731,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.141332745552063,
"margin_dpo/beta_margin_grad_std": 0.18416938185691833,
"margin_dpo/beta_margin_mean": 3.5266833305358887,
"margin_dpo/loss_margin_mean": 35.2668342590332,
"margin_dpo/margin_mean": 35.2668342590332,
"margin_dpo/margin_std": 30.624713897705078,
"step": 565
},
{
"epoch": 0.8311306901615272,
"grad_norm": 56.34800338745117,
"learning_rate": 4.3028095264420525e-08,
"logits/chosen": -0.5949935913085938,
"logits/rejected": -0.5808389186859131,
"logps/chosen": -87.20069885253906,
"logps/ref_chosen": -66.5453109741211,
"logps/ref_rejected": -103.86931610107422,
"logps/rejected": -158.5188751220703,
"loss": 0.4755,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16708871722221375,
"margin_dpo/beta_margin_grad_std": 0.21157479286193848,
"margin_dpo/beta_margin_mean": 3.399416923522949,
"margin_dpo/loss_margin_mean": 33.994171142578125,
"margin_dpo/margin_mean": 33.994171142578125,
"margin_dpo/margin_std": 29.911640167236328,
"step": 566
},
{
"epoch": 0.8325991189427313,
"grad_norm": 82.15995025634766,
"learning_rate": 4.231101308059165e-08,
"logits/chosen": -0.6804023385047913,
"logits/rejected": -0.6269962787628174,
"logps/chosen": -75.24916076660156,
"logps/ref_chosen": -52.858299255371094,
"logps/ref_rejected": -85.37095642089844,
"logps/rejected": -140.0916748046875,
"loss": 0.5883,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17573551833629608,
"margin_dpo/beta_margin_grad_std": 0.24265018105506897,
"margin_dpo/beta_margin_mean": 3.2329859733581543,
"margin_dpo/loss_margin_mean": 32.329856872558594,
"margin_dpo/margin_mean": 32.32986068725586,
"margin_dpo/margin_std": 28.114917755126953,
"step": 567
},
{
"epoch": 0.8340675477239354,
"grad_norm": 43.57807159423828,
"learning_rate": 4.1599403694720145e-08,
"logits/chosen": -0.580660343170166,
"logits/rejected": -0.5636056065559387,
"logps/chosen": -67.96955108642578,
"logps/ref_chosen": -45.1923828125,
"logps/ref_rejected": -89.09236145019531,
"logps/rejected": -149.81170654296875,
"loss": 0.3489,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12049505114555359,
"margin_dpo/beta_margin_grad_std": 0.17438393831253052,
"margin_dpo/beta_margin_mean": 3.7942161560058594,
"margin_dpo/loss_margin_mean": 37.942161560058594,
"margin_dpo/margin_mean": 37.942161560058594,
"margin_dpo/margin_std": 26.538555145263672,
"step": 568
},
{
"epoch": 0.8355359765051396,
"grad_norm": 63.59123229980469,
"learning_rate": 4.089328585837512e-08,
"logits/chosen": -0.6394084692001343,
"logits/rejected": -0.6091455817222595,
"logps/chosen": -86.40789794921875,
"logps/ref_chosen": -63.72056198120117,
"logps/ref_rejected": -79.10325622558594,
"logps/rejected": -131.8647918701172,
"loss": 0.5032,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17486035823822021,
"margin_dpo/beta_margin_grad_std": 0.22121158242225647,
"margin_dpo/beta_margin_mean": 3.007420063018799,
"margin_dpo/loss_margin_mean": 30.074199676513672,
"margin_dpo/margin_mean": 30.074199676513672,
"margin_dpo/margin_std": 27.18084716796875,
"step": 569
},
{
"epoch": 0.8370044052863436,
"grad_norm": 53.639320373535156,
"learning_rate": 4.019267817841834e-08,
"logits/chosen": -0.674132764339447,
"logits/rejected": -0.6270936131477356,
"logps/chosen": -82.65512084960938,
"logps/ref_chosen": -61.61454772949219,
"logps/ref_rejected": -82.1418685913086,
"logps/rejected": -138.31561279296875,
"loss": 0.329,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1217803955078125,
"margin_dpo/beta_margin_grad_std": 0.18320615589618683,
"margin_dpo/beta_margin_mean": 3.5133180618286133,
"margin_dpo/loss_margin_mean": 35.1331787109375,
"margin_dpo/margin_mean": 35.1331787109375,
"margin_dpo/margin_std": 25.58907127380371,
"step": 570
},
{
"epoch": 0.8384728340675477,
"grad_norm": 61.772037506103516,
"learning_rate": 3.9497599116513705e-08,
"logits/chosen": -0.5997041463851929,
"logits/rejected": -0.5761772990226746,
"logps/chosen": -75.53142547607422,
"logps/ref_chosen": -53.05406188964844,
"logps/ref_rejected": -91.33682250976562,
"logps/rejected": -148.2672119140625,
"loss": 0.3786,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1382187306880951,
"margin_dpo/beta_margin_grad_std": 0.19462428987026215,
"margin_dpo/beta_margin_mean": 3.445303440093994,
"margin_dpo/loss_margin_mean": 34.453033447265625,
"margin_dpo/margin_mean": 34.453033447265625,
"margin_dpo/margin_std": 27.182416915893555,
"step": 571
},
{
"epoch": 0.8399412628487518,
"grad_norm": 78.96542358398438,
"learning_rate": 3.880806698864086e-08,
"logits/chosen": -0.5895199775695801,
"logits/rejected": -0.5727903246879578,
"logps/chosen": -75.78829193115234,
"logps/ref_chosen": -48.459285736083984,
"logps/ref_rejected": -83.5570297241211,
"logps/rejected": -143.24444580078125,
"loss": 0.6978,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18673813343048096,
"margin_dpo/beta_margin_grad_std": 0.25194963812828064,
"margin_dpo/beta_margin_mean": 3.23583984375,
"margin_dpo/loss_margin_mean": 32.3583984375,
"margin_dpo/margin_mean": 32.3583984375,
"margin_dpo/margin_std": 31.786035537719727,
"step": 572
},
{
"epoch": 0.8414096916299559,
"grad_norm": 59.523719787597656,
"learning_rate": 3.812409996461275e-08,
"logits/chosen": -0.6645894050598145,
"logits/rejected": -0.6301409602165222,
"logps/chosen": -73.29808044433594,
"logps/ref_chosen": -51.62262725830078,
"logps/ref_rejected": -85.32499694824219,
"logps/rejected": -141.5244903564453,
"loss": 0.4423,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1509319394826889,
"margin_dpo/beta_margin_grad_std": 0.21270796656608582,
"margin_dpo/beta_margin_mean": 3.452404499053955,
"margin_dpo/loss_margin_mean": 34.5240478515625,
"margin_dpo/margin_mean": 34.5240478515625,
"margin_dpo/margin_std": 26.213939666748047,
"step": 573
},
{
"epoch": 0.8428781204111601,
"grad_norm": 71.15605926513672,
"learning_rate": 3.74457160675965e-08,
"logits/chosen": -0.6428389549255371,
"logits/rejected": -0.6086920499801636,
"logps/chosen": -74.59834289550781,
"logps/ref_chosen": -51.04446029663086,
"logps/ref_rejected": -92.80640411376953,
"logps/rejected": -150.51730346679688,
"loss": 0.4548,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14354148507118225,
"margin_dpo/beta_margin_grad_std": 0.1961897611618042,
"margin_dpo/beta_margin_mean": 3.415701389312744,
"margin_dpo/loss_margin_mean": 34.157012939453125,
"margin_dpo/margin_mean": 34.157012939453125,
"margin_dpo/margin_std": 27.831592559814453,
"step": 574
},
{
"epoch": 0.8443465491923642,
"grad_norm": 86.26287078857422,
"learning_rate": 3.677293317363864e-08,
"logits/chosen": -0.5673672556877136,
"logits/rejected": -0.5325363874435425,
"logps/chosen": -97.13941955566406,
"logps/ref_chosen": -71.79014587402344,
"logps/ref_rejected": -95.38619995117188,
"logps/rejected": -157.06790161132812,
"loss": 0.6399,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1683315932750702,
"margin_dpo/beta_margin_grad_std": 0.2584590017795563,
"margin_dpo/beta_margin_mean": 3.6332435607910156,
"margin_dpo/loss_margin_mean": 36.332435607910156,
"margin_dpo/margin_mean": 36.332435607910156,
"margin_dpo/margin_std": 31.414226531982422,
"step": 575
},
{
"epoch": 0.8458149779735683,
"grad_norm": 49.230098724365234,
"learning_rate": 3.6105769011194224e-08,
"logits/chosen": -0.5987046957015991,
"logits/rejected": -0.5964124202728271,
"logps/chosen": -77.82243347167969,
"logps/ref_chosen": -54.262969970703125,
"logps/ref_rejected": -100.7542724609375,
"logps/rejected": -159.103515625,
"loss": 0.444,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14397433400154114,
"margin_dpo/beta_margin_grad_std": 0.19858963787555695,
"margin_dpo/beta_margin_mean": 3.4789772033691406,
"margin_dpo/loss_margin_mean": 34.789772033691406,
"margin_dpo/margin_mean": 34.789772033691406,
"margin_dpo/margin_std": 29.601974487304688,
"step": 576
},
{
"epoch": 0.8472834067547724,
"grad_norm": 48.130558013916016,
"learning_rate": 3.5444241160659304e-08,
"logits/chosen": -0.6501774787902832,
"logits/rejected": -0.6030783653259277,
"logps/chosen": -81.96438598632812,
"logps/ref_chosen": -61.909706115722656,
"logps/ref_rejected": -84.07069396972656,
"logps/rejected": -142.32540893554688,
"loss": 0.348,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11993271112442017,
"margin_dpo/beta_margin_grad_std": 0.1745702028274536,
"margin_dpo/beta_margin_mean": 3.8200042247772217,
"margin_dpo/loss_margin_mean": 38.200042724609375,
"margin_dpo/margin_mean": 38.200042724609375,
"margin_dpo/margin_std": 27.837221145629883,
"step": 577
},
{
"epoch": 0.8487518355359766,
"grad_norm": 56.244651794433594,
"learning_rate": 3.478836705390808e-08,
"logits/chosen": -0.5622389912605286,
"logits/rejected": -0.5450348854064941,
"logps/chosen": -76.16979217529297,
"logps/ref_chosen": -49.26368713378906,
"logps/ref_rejected": -83.43626403808594,
"logps/rejected": -145.69003295898438,
"loss": 0.3651,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13420046865940094,
"margin_dpo/beta_margin_grad_std": 0.1781080663204193,
"margin_dpo/beta_margin_mean": 3.5347681045532227,
"margin_dpo/loss_margin_mean": 35.347679138183594,
"margin_dpo/margin_mean": 35.347679138183594,
"margin_dpo/margin_std": 26.89832878112793,
"step": 578
},
{
"epoch": 0.8502202643171806,
"grad_norm": 54.51181411743164,
"learning_rate": 3.41381639738331e-08,
"logits/chosen": -0.6210640668869019,
"logits/rejected": -0.5942162871360779,
"logps/chosen": -80.40229797363281,
"logps/ref_chosen": -58.88581848144531,
"logps/ref_rejected": -94.78762817382812,
"logps/rejected": -147.00790405273438,
"loss": 0.3713,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14311912655830383,
"margin_dpo/beta_margin_grad_std": 0.1770986020565033,
"margin_dpo/beta_margin_mean": 3.0703792572021484,
"margin_dpo/loss_margin_mean": 30.70379066467285,
"margin_dpo/margin_mean": 30.70379066467285,
"margin_dpo/margin_std": 23.245943069458008,
"step": 579
},
{
"epoch": 0.8516886930983847,
"grad_norm": 47.365726470947266,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": -0.6056051254272461,
"logits/rejected": -0.5679988265037537,
"logps/chosen": -67.93992614746094,
"logps/ref_chosen": -48.70684051513672,
"logps/ref_rejected": -81.7583999633789,
"logps/rejected": -141.37774658203125,
"loss": 0.3507,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1173824667930603,
"margin_dpo/beta_margin_grad_std": 0.2006831169128418,
"margin_dpo/beta_margin_mean": 4.0386271476745605,
"margin_dpo/loss_margin_mean": 40.38627243041992,
"margin_dpo/margin_mean": 40.38627243041992,
"margin_dpo/margin_std": 30.28713607788086,
"step": 580
},
{
"epoch": 0.8531571218795888,
"grad_norm": 52.634037017822266,
"learning_rate": 3.285483927764726e-08,
"logits/chosen": -0.5740267634391785,
"logits/rejected": -0.5509278774261475,
"logps/chosen": -83.43389892578125,
"logps/ref_chosen": -62.22235107421875,
"logps/ref_rejected": -91.73568725585938,
"logps/rejected": -144.02926635742188,
"loss": 0.4291,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15440425276756287,
"margin_dpo/beta_margin_grad_std": 0.203893780708313,
"margin_dpo/beta_margin_mean": 3.1082029342651367,
"margin_dpo/loss_margin_mean": 31.082029342651367,
"margin_dpo/margin_mean": 31.082029342651367,
"margin_dpo/margin_std": 24.809860229492188,
"step": 581
},
{
"epoch": 0.8546255506607929,
"grad_norm": 66.31195068359375,
"learning_rate": 3.222175147833556e-08,
"logits/chosen": -0.605322003364563,
"logits/rejected": -0.6055228114128113,
"logps/chosen": -77.13755798339844,
"logps/ref_chosen": -58.228660583496094,
"logps/ref_rejected": -110.06959533691406,
"logps/rejected": -164.6455078125,
"loss": 0.4097,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14925749599933624,
"margin_dpo/beta_margin_grad_std": 0.20233149826526642,
"margin_dpo/beta_margin_mean": 3.5667009353637695,
"margin_dpo/loss_margin_mean": 35.66700744628906,
"margin_dpo/margin_mean": 35.66700744628906,
"margin_dpo/margin_std": 28.87442398071289,
"step": 582
},
{
"epoch": 0.856093979441997,
"grad_norm": 63.78881072998047,
"learning_rate": 3.159440233840763e-08,
"logits/chosen": -0.5751946568489075,
"logits/rejected": -0.5564270615577698,
"logps/chosen": -81.47348022460938,
"logps/ref_chosen": -56.86286163330078,
"logps/ref_rejected": -88.4039306640625,
"logps/rejected": -142.409423828125,
"loss": 0.5765,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18386687338352203,
"margin_dpo/beta_margin_grad_std": 0.2354869246482849,
"margin_dpo/beta_margin_mean": 2.939487934112549,
"margin_dpo/loss_margin_mean": 29.394878387451172,
"margin_dpo/margin_mean": 29.394878387451172,
"margin_dpo/margin_std": 29.66604995727539,
"step": 583
},
{
"epoch": 0.8575624082232012,
"grad_norm": 40.81183624267578,
"learning_rate": 3.0972808389096635e-08,
"logits/chosen": -0.6157029271125793,
"logits/rejected": -0.5580540299415588,
"logps/chosen": -74.85009002685547,
"logps/ref_chosen": -56.90068054199219,
"logps/ref_rejected": -97.63606262207031,
"logps/rejected": -154.90313720703125,
"loss": 0.2598,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10160969197750092,
"margin_dpo/beta_margin_grad_std": 0.1572841852903366,
"margin_dpo/beta_margin_mean": 3.931765556335449,
"margin_dpo/loss_margin_mean": 39.317657470703125,
"margin_dpo/margin_mean": 39.317657470703125,
"margin_dpo/margin_std": 26.282012939453125,
"step": 584
},
{
"epoch": 0.8590308370044053,
"grad_norm": 64.71321105957031,
"learning_rate": 3.035698600998121e-08,
"logits/chosen": -0.6199055314064026,
"logits/rejected": -0.5935189723968506,
"logps/chosen": -85.7191162109375,
"logps/ref_chosen": -60.973968505859375,
"logps/ref_rejected": -84.16952514648438,
"logps/rejected": -141.8712921142578,
"loss": 0.4802,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16080693900585175,
"margin_dpo/beta_margin_grad_std": 0.21611681580543518,
"margin_dpo/beta_margin_mean": 3.2956621646881104,
"margin_dpo/loss_margin_mean": 32.95662307739258,
"margin_dpo/margin_mean": 32.95662307739258,
"margin_dpo/margin_std": 28.009883880615234,
"step": 585
},
{
"epoch": 0.8604992657856094,
"grad_norm": 64.60726928710938,
"learning_rate": 2.974695142855388e-08,
"logits/chosen": -0.5863425731658936,
"logits/rejected": -0.5751093626022339,
"logps/chosen": -82.08023071289062,
"logps/ref_chosen": -56.85559844970703,
"logps/ref_rejected": -91.8026123046875,
"logps/rejected": -149.82192993164062,
"loss": 0.5533,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16516205668449402,
"margin_dpo/beta_margin_grad_std": 0.2322179675102234,
"margin_dpo/beta_margin_mean": 3.2794694900512695,
"margin_dpo/loss_margin_mean": 32.79469299316406,
"margin_dpo/margin_mean": 32.79469680786133,
"margin_dpo/margin_std": 29.784767150878906,
"step": 586
},
{
"epoch": 0.8619676945668135,
"grad_norm": 47.665504455566406,
"learning_rate": 2.9142720719793122e-08,
"logits/chosen": -0.6379122734069824,
"logits/rejected": -0.6201504468917847,
"logps/chosen": -62.888648986816406,
"logps/ref_chosen": -44.69159698486328,
"logps/ref_rejected": -82.62385559082031,
"logps/rejected": -131.4063720703125,
"loss": 0.501,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1841679811477661,
"margin_dpo/beta_margin_grad_std": 0.21286147832870483,
"margin_dpo/beta_margin_mean": 3.058547019958496,
"margin_dpo/loss_margin_mean": 30.58547019958496,
"margin_dpo/margin_mean": 30.585468292236328,
"margin_dpo/margin_std": 27.52269744873047,
"step": 587
},
{
"epoch": 0.8634361233480177,
"grad_norm": 63.907814025878906,
"learning_rate": 2.8544309805740018e-08,
"logits/chosen": -0.6512797474861145,
"logits/rejected": -0.6356316804885864,
"logps/chosen": -73.06784057617188,
"logps/ref_chosen": -50.294952392578125,
"logps/ref_rejected": -107.36988067626953,
"logps/rejected": -162.28692626953125,
"loss": 0.4775,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16899925470352173,
"margin_dpo/beta_margin_grad_std": 0.21543928980827332,
"margin_dpo/beta_margin_mean": 3.21441650390625,
"margin_dpo/loss_margin_mean": 32.1441650390625,
"margin_dpo/margin_mean": 32.1441650390625,
"margin_dpo/margin_std": 28.336963653564453,
"step": 588
},
{
"epoch": 0.8649045521292217,
"grad_norm": 42.07124710083008,
"learning_rate": 2.7951734455078786e-08,
"logits/chosen": -0.6398344039916992,
"logits/rejected": -0.6136231422424316,
"logps/chosen": -82.24392700195312,
"logps/ref_chosen": -59.929908752441406,
"logps/ref_rejected": -111.65534973144531,
"logps/rejected": -178.29017639160156,
"loss": 0.3172,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10654419660568237,
"margin_dpo/beta_margin_grad_std": 0.19332361221313477,
"margin_dpo/beta_margin_mean": 4.432080268859863,
"margin_dpo/loss_margin_mean": 44.32080078125,
"margin_dpo/margin_mean": 44.32080078125,
"margin_dpo/margin_std": 32.961795806884766,
"step": 589
},
{
"epoch": 0.8663729809104258,
"grad_norm": 38.59159851074219,
"learning_rate": 2.736501028272095e-08,
"logits/chosen": -0.6100300550460815,
"logits/rejected": -0.5847848057746887,
"logps/chosen": -77.71539306640625,
"logps/ref_chosen": -55.80979537963867,
"logps/ref_rejected": -106.06282043457031,
"logps/rejected": -166.61837768554688,
"loss": 0.2745,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10657128691673279,
"margin_dpo/beta_margin_grad_std": 0.1513645052909851,
"margin_dpo/beta_margin_mean": 3.8649959564208984,
"margin_dpo/loss_margin_mean": 38.64995574951172,
"margin_dpo/margin_mean": 38.64995574951172,
"margin_dpo/margin_std": 28.00396728515625,
"step": 590
},
{
"epoch": 0.8678414096916299,
"grad_norm": 63.60677719116211,
"learning_rate": 2.678415274939408e-08,
"logits/chosen": -0.6167633533477783,
"logits/rejected": -0.5552696585655212,
"logps/chosen": -81.19537353515625,
"logps/ref_chosen": -56.24061965942383,
"logps/ref_rejected": -83.78629302978516,
"logps/rejected": -145.19476318359375,
"loss": 0.3925,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12805438041687012,
"margin_dpo/beta_margin_grad_std": 0.21181520819664001,
"margin_dpo/beta_margin_mean": 3.645371913909912,
"margin_dpo/loss_margin_mean": 36.45372009277344,
"margin_dpo/margin_mean": 36.45372009277344,
"margin_dpo/margin_std": 25.672313690185547,
"step": 591
},
{
"epoch": 0.869309838472834,
"grad_norm": 86.11256408691406,
"learning_rate": 2.6209177161234442e-08,
"logits/chosen": -0.6086193323135376,
"logits/rejected": -0.5861480236053467,
"logps/chosen": -73.6089096069336,
"logps/ref_chosen": -47.94025421142578,
"logps/ref_rejected": -75.73287963867188,
"logps/rejected": -137.11573791503906,
"loss": 0.5921,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15364328026771545,
"margin_dpo/beta_margin_grad_std": 0.24902772903442383,
"margin_dpo/beta_margin_mean": 3.571420669555664,
"margin_dpo/loss_margin_mean": 35.71420669555664,
"margin_dpo/margin_mean": 35.71420669555664,
"margin_dpo/margin_std": 28.337505340576172,
"step": 592
},
{
"epoch": 0.8707782672540382,
"grad_norm": 82.06730651855469,
"learning_rate": 2.564009866938349e-08,
"logits/chosen": -0.5315680503845215,
"logits/rejected": -0.5048198699951172,
"logps/chosen": -72.25209045410156,
"logps/ref_chosen": -48.690757751464844,
"logps/ref_rejected": -60.90800476074219,
"logps/rejected": -114.44489288330078,
"loss": 0.6216,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18850964307785034,
"margin_dpo/beta_margin_grad_std": 0.2500463128089905,
"margin_dpo/beta_margin_mean": 2.9975552558898926,
"margin_dpo/loss_margin_mean": 29.97555160522461,
"margin_dpo/margin_mean": 29.97555160522461,
"margin_dpo/margin_std": 28.288480758666992,
"step": 593
},
{
"epoch": 0.8722466960352423,
"grad_norm": 60.2679443359375,
"learning_rate": 2.5076932269588708e-08,
"logits/chosen": -0.6228535175323486,
"logits/rejected": -0.5754865407943726,
"logps/chosen": -76.21943664550781,
"logps/ref_chosen": -54.93488693237305,
"logps/ref_rejected": -86.09967041015625,
"logps/rejected": -147.49954223632812,
"loss": 0.4954,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14612731337547302,
"margin_dpo/beta_margin_grad_std": 0.2071676254272461,
"margin_dpo/beta_margin_mean": 4.011531829833984,
"margin_dpo/loss_margin_mean": 40.11532211303711,
"margin_dpo/margin_mean": 40.115318298339844,
"margin_dpo/margin_std": 34.15007781982422,
"step": 594
},
{
"epoch": 0.8737151248164464,
"grad_norm": 43.721248626708984,
"learning_rate": 2.451969280180849e-08,
"logits/chosen": -0.5794812440872192,
"logits/rejected": -0.544758141040802,
"logps/chosen": -72.4796142578125,
"logps/ref_chosen": -49.42041778564453,
"logps/ref_rejected": -80.62731170654297,
"logps/rejected": -135.92617797851562,
"loss": 0.3821,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14666876196861267,
"margin_dpo/beta_margin_grad_std": 0.18497151136398315,
"margin_dpo/beta_margin_mean": 3.223968029022217,
"margin_dpo/loss_margin_mean": 32.23967742919922,
"margin_dpo/margin_mean": 32.23967742919922,
"margin_dpo/margin_std": 26.86066436767578,
"step": 595
},
{
"epoch": 0.8751835535976505,
"grad_norm": 64.42137908935547,
"learning_rate": 2.396839494982103e-08,
"logits/chosen": -0.5889699459075928,
"logits/rejected": -0.5423535704612732,
"logps/chosen": -81.71769714355469,
"logps/ref_chosen": -59.791683197021484,
"logps/ref_rejected": -80.09111785888672,
"logps/rejected": -137.01409912109375,
"loss": 0.4742,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16088539361953735,
"margin_dpo/beta_margin_grad_std": 0.20914097130298615,
"margin_dpo/beta_margin_mean": 3.49969744682312,
"margin_dpo/loss_margin_mean": 34.99697494506836,
"margin_dpo/margin_mean": 34.99697494506836,
"margin_dpo/margin_std": 29.85952377319336,
"step": 596
},
{
"epoch": 0.8766519823788547,
"grad_norm": 58.71807098388672,
"learning_rate": 2.3423053240837514e-08,
"logits/chosen": -0.5836566686630249,
"logits/rejected": -0.579143762588501,
"logps/chosen": -79.78302001953125,
"logps/ref_chosen": -57.26078796386719,
"logps/ref_rejected": -100.6937255859375,
"logps/rejected": -158.7677459716797,
"loss": 0.5176,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16815660893917084,
"margin_dpo/beta_margin_grad_std": 0.23717570304870605,
"margin_dpo/beta_margin_mean": 3.555178642272949,
"margin_dpo/loss_margin_mean": 35.551788330078125,
"margin_dpo/margin_mean": 35.55178451538086,
"margin_dpo/margin_std": 31.711952209472656,
"step": 597
},
{
"epoch": 0.8781204111600588,
"grad_norm": 66.43830871582031,
"learning_rate": 2.2883682045119062e-08,
"logits/chosen": -0.6284604072570801,
"logits/rejected": -0.5999557375907898,
"logps/chosen": -75.705078125,
"logps/ref_chosen": -52.51850509643555,
"logps/ref_rejected": -89.44385528564453,
"logps/rejected": -145.19064331054688,
"loss": 0.4578,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14189483225345612,
"margin_dpo/beta_margin_grad_std": 0.1853117048740387,
"margin_dpo/beta_margin_mean": 3.256021499633789,
"margin_dpo/loss_margin_mean": 32.560211181640625,
"margin_dpo/margin_mean": 32.560211181640625,
"margin_dpo/margin_std": 24.856882095336914,
"step": 598
},
{
"epoch": 0.8795888399412628,
"grad_norm": 59.694637298583984,
"learning_rate": 2.2350295575598367e-08,
"logits/chosen": -0.6019773483276367,
"logits/rejected": -0.5818980932235718,
"logps/chosen": -71.63743591308594,
"logps/ref_chosen": -49.802677154541016,
"logps/ref_rejected": -82.978515625,
"logps/rejected": -137.6220245361328,
"loss": 0.4546,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15810009837150574,
"margin_dpo/beta_margin_grad_std": 0.22325119376182556,
"margin_dpo/beta_margin_mean": 3.2808756828308105,
"margin_dpo/loss_margin_mean": 32.80875778198242,
"margin_dpo/margin_mean": 32.80875778198242,
"margin_dpo/margin_std": 25.88229751586914,
"step": 599
},
{
"epoch": 0.8810572687224669,
"grad_norm": 73.32799530029297,
"learning_rate": 2.1822907887504932e-08,
"logits/chosen": -0.6534677147865295,
"logits/rejected": -0.6272458434104919,
"logps/chosen": -88.00627899169922,
"logps/ref_chosen": -66.43487548828125,
"logps/ref_rejected": -85.45649719238281,
"logps/rejected": -137.1350555419922,
"loss": 0.4876,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.152080237865448,
"margin_dpo/beta_margin_grad_std": 0.21173834800720215,
"margin_dpo/beta_margin_mean": 3.0107154846191406,
"margin_dpo/loss_margin_mean": 30.107154846191406,
"margin_dpo/margin_mean": 30.107154846191406,
"margin_dpo/margin_std": 25.551097869873047,
"step": 600
},
{
"epoch": 0.8810572687224669,
"eval_logits/chosen": -0.6269975304603577,
"eval_logits/rejected": -0.6013357043266296,
"eval_logps/chosen": -105.93721771240234,
"eval_logps/ref_chosen": -79.05104064941406,
"eval_logps/ref_rejected": -86.79793548583984,
"eval_logps/rejected": -135.44046020507812,
"eval_loss": 0.4046096205711365,
"eval_margin_dpo/beta": 0.10000000149011612,
"eval_margin_dpo/beta_margin_grad_mean": -0.25697416067123413,
"eval_margin_dpo/beta_margin_grad_std": 0.25375545024871826,
"eval_margin_dpo/beta_margin_mean": 2.175632953643799,
"eval_margin_dpo/loss_margin_mean": 21.756330490112305,
"eval_margin_dpo/margin_mean": 21.756330490112305,
"eval_margin_dpo/margin_std": 26.337753295898438,
"eval_runtime": 39.8498,
"eval_samples_per_second": 58.695,
"eval_steps_per_second": 1.857,
"step": 600
},
{
"epoch": 0.882525697503671,
"grad_norm": 83.50579071044922,
"learning_rate": 2.1301532877994742e-08,
"logits/chosen": -0.6308251619338989,
"logits/rejected": -0.6028087139129639,
"logps/chosen": -85.07262420654297,
"logps/ref_chosen": -59.13360595703125,
"logps/ref_rejected": -94.69093322753906,
"logps/rejected": -154.66099548339844,
"loss": 0.5224,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1595621407032013,
"margin_dpo/beta_margin_grad_std": 0.24474212527275085,
"margin_dpo/beta_margin_mean": 3.403104543685913,
"margin_dpo/loss_margin_mean": 34.031044006347656,
"margin_dpo/margin_mean": 34.031044006347656,
"margin_dpo/margin_std": 28.710695266723633,
"step": 601
},
{
"epoch": 0.8839941262848752,
"grad_norm": 67.36071014404297,
"learning_rate": 2.0786184285784298e-08,
"logits/chosen": -0.6085352897644043,
"logits/rejected": -0.6077029705047607,
"logps/chosen": -66.83834838867188,
"logps/ref_chosen": -48.59352111816406,
"logps/ref_rejected": -87.6685562133789,
"logps/rejected": -143.52706909179688,
"loss": 0.3598,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12709340453147888,
"margin_dpo/beta_margin_grad_std": 0.19667461514472961,
"margin_dpo/beta_margin_mean": 3.7613697052001953,
"margin_dpo/loss_margin_mean": 37.61369705200195,
"margin_dpo/margin_mean": 37.61369705200195,
"margin_dpo/margin_std": 27.582782745361328,
"step": 602
},
{
"epoch": 0.8854625550660793,
"grad_norm": 65.35578918457031,
"learning_rate": 2.0276875690788204e-08,
"logits/chosen": -0.6445978879928589,
"logits/rejected": -0.6060948371887207,
"logps/chosen": -90.91526794433594,
"logps/ref_chosen": -70.41461944580078,
"logps/ref_rejected": -100.32560729980469,
"logps/rejected": -153.026611328125,
"loss": 0.4681,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16203567385673523,
"margin_dpo/beta_margin_grad_std": 0.22206860780715942,
"margin_dpo/beta_margin_mean": 3.2200357913970947,
"margin_dpo/loss_margin_mean": 32.200355529785156,
"margin_dpo/margin_mean": 32.200355529785156,
"margin_dpo/margin_std": 26.239582061767578,
"step": 603
},
{
"epoch": 0.8869309838472834,
"grad_norm": 65.85285949707031,
"learning_rate": 1.977362051376158e-08,
"logits/chosen": -0.6049788594245911,
"logits/rejected": -0.5948315858840942,
"logps/chosen": -65.44568634033203,
"logps/ref_chosen": -46.45808029174805,
"logps/ref_rejected": -91.8544921875,
"logps/rejected": -146.36270141601562,
"loss": 0.4541,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14404089748859406,
"margin_dpo/beta_margin_grad_std": 0.20777881145477295,
"margin_dpo/beta_margin_mean": 3.552060604095459,
"margin_dpo/loss_margin_mean": 35.520606994628906,
"margin_dpo/margin_mean": 35.520606994628906,
"margin_dpo/margin_std": 29.537954330444336,
"step": 604
},
{
"epoch": 0.8883994126284875,
"grad_norm": 62.26566696166992,
"learning_rate": 1.9276432015946446e-08,
"logits/chosen": -0.5922667384147644,
"logits/rejected": -0.5747475028038025,
"logps/chosen": -90.86492919921875,
"logps/ref_chosen": -66.24933624267578,
"logps/ref_rejected": -102.30496978759766,
"logps/rejected": -158.65435791015625,
"loss": 0.4596,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1484421193599701,
"margin_dpo/beta_margin_grad_std": 0.19956421852111816,
"margin_dpo/beta_margin_mean": 3.1733784675598145,
"margin_dpo/loss_margin_mean": 31.733783721923828,
"margin_dpo/margin_mean": 31.733783721923828,
"margin_dpo/margin_std": 29.25701141357422,
"step": 605
},
{
"epoch": 0.8898678414096917,
"grad_norm": 40.07181167602539,
"learning_rate": 1.8785323298722093e-08,
"logits/chosen": -0.6057391166687012,
"logits/rejected": -0.5747348070144653,
"logps/chosen": -76.91024780273438,
"logps/ref_chosen": -54.819122314453125,
"logps/ref_rejected": -98.37147521972656,
"logps/rejected": -157.16664123535156,
"loss": 0.2922,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11917038261890411,
"margin_dpo/beta_margin_grad_std": 0.14764106273651123,
"margin_dpo/beta_margin_mean": 3.6704044342041016,
"margin_dpo/loss_margin_mean": 36.704044342041016,
"margin_dpo/margin_mean": 36.704044342041016,
"margin_dpo/margin_std": 25.36406707763672,
"step": 606
},
{
"epoch": 0.8913362701908958,
"grad_norm": 52.19849395751953,
"learning_rate": 1.8300307303259904e-08,
"logits/chosen": -0.5950828194618225,
"logits/rejected": -0.560725212097168,
"logps/chosen": -79.23565673828125,
"logps/ref_chosen": -58.08403778076172,
"logps/ref_rejected": -79.777099609375,
"logps/rejected": -133.3597412109375,
"loss": 0.3369,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12986359000205994,
"margin_dpo/beta_margin_grad_std": 0.17091821134090424,
"margin_dpo/beta_margin_mean": 3.243102550506592,
"margin_dpo/loss_margin_mean": 32.43102264404297,
"margin_dpo/margin_mean": 32.43102264404297,
"margin_dpo/margin_std": 23.558351516723633,
"step": 607
},
{
"epoch": 0.8928046989720999,
"grad_norm": 59.17472839355469,
"learning_rate": 1.7821396810182437e-08,
"logits/chosen": -0.6549203395843506,
"logits/rejected": -0.6243371367454529,
"logps/chosen": -78.31192016601562,
"logps/ref_chosen": -57.450836181640625,
"logps/ref_rejected": -94.77339172363281,
"logps/rejected": -148.8663330078125,
"loss": 0.4835,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15288802981376648,
"margin_dpo/beta_margin_grad_std": 0.22536322474479675,
"margin_dpo/beta_margin_mean": 3.3231868743896484,
"margin_dpo/loss_margin_mean": 33.231868743896484,
"margin_dpo/margin_mean": 33.23186492919922,
"margin_dpo/margin_std": 26.378620147705078,
"step": 608
},
{
"epoch": 0.8942731277533039,
"grad_norm": 66.138427734375,
"learning_rate": 1.7348604439226617e-08,
"logits/chosen": -0.6333421468734741,
"logits/rejected": -0.5963184833526611,
"logps/chosen": -82.09093475341797,
"logps/ref_chosen": -58.805355072021484,
"logps/ref_rejected": -88.81600952148438,
"logps/rejected": -145.73898315429688,
"loss": 0.3546,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12897904217243195,
"margin_dpo/beta_margin_grad_std": 0.18817874789237976,
"margin_dpo/beta_margin_mean": 3.363740921020508,
"margin_dpo/loss_margin_mean": 33.63740539550781,
"margin_dpo/margin_mean": 33.63740539550781,
"margin_dpo/margin_std": 24.00457763671875,
"step": 609
},
{
"epoch": 0.895741556534508,
"grad_norm": 75.1207275390625,
"learning_rate": 1.6881942648911074e-08,
"logits/chosen": -0.5928279161453247,
"logits/rejected": -0.5319284200668335,
"logps/chosen": -90.37582397460938,
"logps/ref_chosen": -65.69503784179688,
"logps/ref_rejected": -83.4053955078125,
"logps/rejected": -140.68991088867188,
"loss": 0.4574,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1538185179233551,
"margin_dpo/beta_margin_grad_std": 0.21573176980018616,
"margin_dpo/beta_margin_mean": 3.26037335395813,
"margin_dpo/loss_margin_mean": 32.60373306274414,
"margin_dpo/margin_mean": 32.60373306274414,
"margin_dpo/margin_std": 24.9559326171875,
"step": 610
},
{
"epoch": 0.8972099853157122,
"grad_norm": 48.684600830078125,
"learning_rate": 1.6421423736208e-08,
"logits/chosen": -0.6442773342132568,
"logits/rejected": -0.6083732843399048,
"logps/chosen": -74.56732177734375,
"logps/ref_chosen": -52.59947204589844,
"logps/ref_rejected": -86.33099365234375,
"logps/rejected": -144.01742553710938,
"loss": 0.3964,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14588133990764618,
"margin_dpo/beta_margin_grad_std": 0.19904646277427673,
"margin_dpo/beta_margin_mean": 3.5718588829040527,
"margin_dpo/loss_margin_mean": 35.718589782714844,
"margin_dpo/margin_mean": 35.718589782714844,
"margin_dpo/margin_std": 28.007495880126953,
"step": 611
},
{
"epoch": 0.8986784140969163,
"grad_norm": 45.877662658691406,
"learning_rate": 1.5967059836219042e-08,
"logits/chosen": -0.6410280466079712,
"logits/rejected": -0.582598090171814,
"logps/chosen": -80.21870422363281,
"logps/ref_chosen": -59.32372283935547,
"logps/ref_rejected": -88.31239318847656,
"logps/rejected": -150.30587768554688,
"loss": 0.2722,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10045182704925537,
"margin_dpo/beta_margin_grad_std": 0.16613739728927612,
"margin_dpo/beta_margin_mean": 4.109850883483887,
"margin_dpo/loss_margin_mean": 41.0985107421875,
"margin_dpo/margin_mean": 41.0985107421875,
"margin_dpo/margin_std": 27.613842010498047,
"step": 612
},
{
"epoch": 0.9001468428781204,
"grad_norm": 50.60771942138672,
"learning_rate": 1.551886292185553e-08,
"logits/chosen": -0.6397769451141357,
"logits/rejected": -0.6355684995651245,
"logps/chosen": -80.78131866455078,
"logps/ref_chosen": -59.72996520996094,
"logps/ref_rejected": -105.10753631591797,
"logps/rejected": -161.82345581054688,
"loss": 0.3682,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13177156448364258,
"margin_dpo/beta_margin_grad_std": 0.19979646801948547,
"margin_dpo/beta_margin_mean": 3.566455841064453,
"margin_dpo/loss_margin_mean": 35.66455841064453,
"margin_dpo/margin_mean": 35.66455841064453,
"margin_dpo/margin_std": 27.262413024902344,
"step": 613
},
{
"epoch": 0.9016152716593245,
"grad_norm": 46.63825988769531,
"learning_rate": 1.507684480352292e-08,
"logits/chosen": -0.58838951587677,
"logits/rejected": -0.5823639035224915,
"logps/chosen": -76.70652770996094,
"logps/ref_chosen": -52.93898010253906,
"logps/ref_rejected": -104.67938232421875,
"logps/rejected": -164.14959716796875,
"loss": 0.3003,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11585717648267746,
"margin_dpo/beta_margin_grad_std": 0.16684673726558685,
"margin_dpo/beta_margin_mean": 3.57026743888855,
"margin_dpo/loss_margin_mean": 35.702674865722656,
"margin_dpo/margin_mean": 35.702674865722656,
"margin_dpo/margin_std": 25.349502563476562,
"step": 614
},
{
"epoch": 0.9030837004405287,
"grad_norm": 42.50800323486328,
"learning_rate": 1.4641017128809801e-08,
"logits/chosen": -0.5662412047386169,
"logits/rejected": -0.5331077575683594,
"logps/chosen": -87.12669372558594,
"logps/ref_chosen": -65.81727600097656,
"logps/ref_rejected": -95.17749786376953,
"logps/rejected": -146.67713928222656,
"loss": 0.4066,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1578460931777954,
"margin_dpo/beta_margin_grad_std": 0.1820681393146515,
"margin_dpo/beta_margin_mean": 3.019021987915039,
"margin_dpo/loss_margin_mean": 30.19021987915039,
"margin_dpo/margin_mean": 30.19021987915039,
"margin_dpo/margin_std": 23.05301284790039,
"step": 615
},
{
"epoch": 0.9045521292217328,
"grad_norm": 77.02394104003906,
"learning_rate": 1.4211391382180637e-08,
"logits/chosen": -0.5957802534103394,
"logits/rejected": -0.5418244004249573,
"logps/chosen": -88.68885040283203,
"logps/ref_chosen": -65.13285827636719,
"logps/ref_rejected": -74.70050048828125,
"logps/rejected": -130.87673950195312,
"loss": 0.4978,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15978117287158966,
"margin_dpo/beta_margin_grad_std": 0.22819003462791443,
"margin_dpo/beta_margin_mean": 3.26202392578125,
"margin_dpo/loss_margin_mean": 32.6202392578125,
"margin_dpo/margin_mean": 32.6202392578125,
"margin_dpo/margin_std": 29.516948699951172,
"step": 616
},
{
"epoch": 0.9060205580029369,
"grad_norm": 49.953460693359375,
"learning_rate": 1.378797888467345e-08,
"logits/chosen": -0.5749341249465942,
"logits/rejected": -0.53103107213974,
"logps/chosen": -87.75241088867188,
"logps/ref_chosen": -63.005550384521484,
"logps/ref_rejected": -64.234130859375,
"logps/rejected": -118.99295043945312,
"loss": 0.3848,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14862868189811707,
"margin_dpo/beta_margin_grad_std": 0.17751744389533997,
"margin_dpo/beta_margin_mean": 3.0011959075927734,
"margin_dpo/loss_margin_mean": 30.011959075927734,
"margin_dpo/margin_mean": 30.011959075927734,
"margin_dpo/margin_std": 23.59270477294922,
"step": 617
},
{
"epoch": 0.9074889867841409,
"grad_norm": 66.36804962158203,
"learning_rate": 1.3370790793601371e-08,
"logits/chosen": -0.6147041320800781,
"logits/rejected": -0.5852859616279602,
"logps/chosen": -90.93468475341797,
"logps/ref_chosen": -67.10135650634766,
"logps/ref_rejected": -92.15339660644531,
"logps/rejected": -146.77523803710938,
"loss": 0.4572,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16413499414920807,
"margin_dpo/beta_margin_grad_std": 0.19249024987220764,
"margin_dpo/beta_margin_mean": 3.0788497924804688,
"margin_dpo/loss_margin_mean": 30.78849983215332,
"margin_dpo/margin_mean": 30.788501739501953,
"margin_dpo/margin_std": 26.1810245513916,
"step": 618
},
{
"epoch": 0.908957415565345,
"grad_norm": 60.24756622314453,
"learning_rate": 1.2959838102258535e-08,
"logits/chosen": -0.5955780744552612,
"logits/rejected": -0.5634878873825073,
"logps/chosen": -79.19235229492188,
"logps/ref_chosen": -55.978233337402344,
"logps/ref_rejected": -93.1854019165039,
"logps/rejected": -149.55165100097656,
"loss": 0.4659,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1654718816280365,
"margin_dpo/beta_margin_grad_std": 0.21678967773914337,
"margin_dpo/beta_margin_mean": 3.3152127265930176,
"margin_dpo/loss_margin_mean": 33.152130126953125,
"margin_dpo/margin_mean": 33.152130126953125,
"margin_dpo/margin_std": 29.86014175415039,
"step": 619
},
{
"epoch": 0.9104258443465492,
"grad_norm": 35.56299591064453,
"learning_rate": 1.2555131639630567e-08,
"logits/chosen": -0.6245037317276001,
"logits/rejected": -0.5862281322479248,
"logps/chosen": -79.94758605957031,
"logps/ref_chosen": -59.79750061035156,
"logps/ref_rejected": -78.41075134277344,
"logps/rejected": -134.01303100585938,
"loss": 0.2607,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1101793646812439,
"margin_dpo/beta_margin_grad_std": 0.12655286490917206,
"margin_dpo/beta_margin_mean": 3.5452189445495605,
"margin_dpo/loss_margin_mean": 35.45219039916992,
"margin_dpo/margin_mean": 35.45219039916992,
"margin_dpo/margin_std": 25.871028900146484,
"step": 620
},
{
"epoch": 0.9118942731277533,
"grad_norm": 37.061790466308594,
"learning_rate": 1.2156682070109086e-08,
"logits/chosen": -0.606106698513031,
"logits/rejected": -0.5785382986068726,
"logps/chosen": -72.80471801757812,
"logps/ref_chosen": -53.933753967285156,
"logps/ref_rejected": -88.36952209472656,
"logps/rejected": -143.35723876953125,
"loss": 0.3122,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10982675105333328,
"margin_dpo/beta_margin_grad_std": 0.17607754468917847,
"margin_dpo/beta_margin_mean": 3.61167573928833,
"margin_dpo/loss_margin_mean": 36.11675262451172,
"margin_dpo/margin_mean": 36.11675262451172,
"margin_dpo/margin_std": 26.981311798095703,
"step": 621
},
{
"epoch": 0.9133627019089574,
"grad_norm": 49.68582534790039,
"learning_rate": 1.1764499893210878e-08,
"logits/chosen": -0.5509716272354126,
"logits/rejected": -0.49232321977615356,
"logps/chosen": -82.90745544433594,
"logps/ref_chosen": -60.28582000732422,
"logps/ref_rejected": -85.51873779296875,
"logps/rejected": -144.81259155273438,
"loss": 0.3885,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13941837847232819,
"margin_dpo/beta_margin_grad_std": 0.1975843608379364,
"margin_dpo/beta_margin_mean": 3.6672213077545166,
"margin_dpo/loss_margin_mean": 36.672210693359375,
"margin_dpo/margin_mean": 36.672210693359375,
"margin_dpo/margin_std": 28.6815185546875,
"step": 622
},
{
"epoch": 0.9148311306901615,
"grad_norm": 73.91598510742188,
"learning_rate": 1.1378595443300998e-08,
"logits/chosen": -0.632436990737915,
"logits/rejected": -0.5964562892913818,
"logps/chosen": -88.94279479980469,
"logps/ref_chosen": -64.15696716308594,
"logps/ref_rejected": -85.08304595947266,
"logps/rejected": -140.305419921875,
"loss": 0.5623,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18874922394752502,
"margin_dpo/beta_margin_grad_std": 0.23199373483657837,
"margin_dpo/beta_margin_mean": 3.0436534881591797,
"margin_dpo/loss_margin_mean": 30.436534881591797,
"margin_dpo/margin_mean": 30.436534881591797,
"margin_dpo/margin_std": 29.26456069946289,
"step": 623
},
{
"epoch": 0.9162995594713657,
"grad_norm": 69.39524841308594,
"learning_rate": 1.0998978889320582e-08,
"logits/chosen": -0.6819274425506592,
"logits/rejected": -0.6118913888931274,
"logps/chosen": -94.78079986572266,
"logps/ref_chosen": -71.91862487792969,
"logps/ref_rejected": -97.13203430175781,
"logps/rejected": -157.30023193359375,
"loss": 0.4915,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14777633547782898,
"margin_dpo/beta_margin_grad_std": 0.2476031631231308,
"margin_dpo/beta_margin_mean": 3.730602741241455,
"margin_dpo/loss_margin_mean": 37.30602264404297,
"margin_dpo/margin_mean": 37.30602264404297,
"margin_dpo/margin_std": 27.589237213134766,
"step": 624
},
{
"epoch": 0.9177679882525698,
"grad_norm": 48.65541458129883,
"learning_rate": 1.0625660234518913e-08,
"logits/chosen": -0.5835287570953369,
"logits/rejected": -0.5436596870422363,
"logps/chosen": -81.64682006835938,
"logps/ref_chosen": -58.342071533203125,
"logps/ref_rejected": -86.09038543701172,
"logps/rejected": -145.12460327148438,
"loss": 0.3486,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13256171345710754,
"margin_dpo/beta_margin_grad_std": 0.18188360333442688,
"margin_dpo/beta_margin_mean": 3.5729477405548096,
"margin_dpo/loss_margin_mean": 35.72947692871094,
"margin_dpo/margin_mean": 35.72947692871094,
"margin_dpo/margin_std": 28.54417610168457,
"step": 625
},
{
"epoch": 0.9192364170337739,
"grad_norm": 63.34779739379883,
"learning_rate": 1.0258649316189721e-08,
"logits/chosen": -0.5459762811660767,
"logits/rejected": -0.5098272562026978,
"logps/chosen": -99.06941223144531,
"logps/ref_chosen": -75.11260986328125,
"logps/ref_rejected": -99.18872833251953,
"logps/rejected": -153.42007446289062,
"loss": 0.5183,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18455414474010468,
"margin_dpo/beta_margin_grad_std": 0.21206972002983093,
"margin_dpo/beta_margin_mean": 3.0274548530578613,
"margin_dpo/loss_margin_mean": 30.274547576904297,
"margin_dpo/margin_mean": 30.274547576904297,
"margin_dpo/margin_std": 28.38648223876953,
"step": 626
},
{
"epoch": 0.920704845814978,
"grad_norm": 79.35140228271484,
"learning_rate": 9.897955805412e-09,
"logits/chosen": -0.5999346971511841,
"logits/rejected": -0.6070972681045532,
"logps/chosen": -69.15809631347656,
"logps/ref_chosen": -47.74314880371094,
"logps/ref_rejected": -106.75448608398438,
"logps/rejected": -162.14773559570312,
"loss": 0.6092,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1894698441028595,
"margin_dpo/beta_margin_grad_std": 0.2491467446088791,
"margin_dpo/beta_margin_mean": 3.397829055786133,
"margin_dpo/loss_margin_mean": 33.97829055786133,
"margin_dpo/margin_mean": 33.97828674316406,
"margin_dpo/margin_std": 34.131160736083984,
"step": 627
},
{
"epoch": 0.922173274596182,
"grad_norm": 40.400997161865234,
"learning_rate": 9.543589206795238e-09,
"logits/chosen": -0.5975438356399536,
"logits/rejected": -0.5776046514511108,
"logps/chosen": -82.31864929199219,
"logps/ref_chosen": -60.182945251464844,
"logps/ref_rejected": -101.55467224121094,
"logps/rejected": -159.41123962402344,
"loss": 0.298,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1191520020365715,
"margin_dpo/beta_margin_grad_std": 0.15398246049880981,
"margin_dpo/beta_margin_mean": 3.572086811065674,
"margin_dpo/loss_margin_mean": 35.72086715698242,
"margin_dpo/margin_mean": 35.72086715698242,
"margin_dpo/margin_std": 25.814367294311523,
"step": 628
},
{
"epoch": 0.9236417033773862,
"grad_norm": 62.328609466552734,
"learning_rate": 9.19555885822887e-09,
"logits/chosen": -0.6410259008407593,
"logits/rejected": -0.597222089767456,
"logps/chosen": -86.5196533203125,
"logps/ref_chosen": -64.21353912353516,
"logps/ref_rejected": -91.65367126464844,
"logps/rejected": -145.63623046875,
"loss": 0.4052,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14122983813285828,
"margin_dpo/beta_margin_grad_std": 0.1920766830444336,
"margin_dpo/beta_margin_mean": 3.167644500732422,
"margin_dpo/loss_margin_mean": 31.67644500732422,
"margin_dpo/margin_mean": 31.67644500732422,
"margin_dpo/margin_std": 24.94976043701172,
"step": 629
},
{
"epoch": 0.9251101321585903,
"grad_norm": 61.22914505004883,
"learning_rate": 8.85387393063622e-09,
"logits/chosen": -0.662344217300415,
"logits/rejected": -0.6153937578201294,
"logps/chosen": -79.80152130126953,
"logps/ref_chosen": -59.29100036621094,
"logps/ref_rejected": -83.59829711914062,
"logps/rejected": -134.3487091064453,
"loss": 0.464,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16477835178375244,
"margin_dpo/beta_margin_grad_std": 0.2072598934173584,
"margin_dpo/beta_margin_mean": 3.023988962173462,
"margin_dpo/loss_margin_mean": 30.23988914489746,
"margin_dpo/margin_mean": 30.23988914489746,
"margin_dpo/margin_std": 25.428054809570312,
"step": 630
},
{
"epoch": 0.9265785609397944,
"grad_norm": 93.05696105957031,
"learning_rate": 8.518543427732949e-09,
"logits/chosen": -0.6291791200637817,
"logits/rejected": -0.586702287197113,
"logps/chosen": -84.07586669921875,
"logps/ref_chosen": -59.45360565185547,
"logps/ref_rejected": -80.95157623291016,
"logps/rejected": -133.77267456054688,
"loss": 0.7308,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.19751125574111938,
"margin_dpo/beta_margin_grad_std": 0.26264500617980957,
"margin_dpo/beta_margin_mean": 2.8198840618133545,
"margin_dpo/loss_margin_mean": 28.198841094970703,
"margin_dpo/margin_mean": 28.198841094970703,
"margin_dpo/margin_std": 29.28610610961914,
"step": 631
},
{
"epoch": 0.9280469897209985,
"grad_norm": 86.49762725830078,
"learning_rate": 8.189576185789637e-09,
"logits/chosen": -0.6317383050918579,
"logits/rejected": -0.5975475311279297,
"logps/chosen": -85.93399047851562,
"logps/ref_chosen": -61.35155487060547,
"logps/ref_rejected": -86.16017150878906,
"logps/rejected": -143.37826538085938,
"loss": 0.7104,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16609230637550354,
"margin_dpo/beta_margin_grad_std": 0.26753705739974976,
"margin_dpo/beta_margin_mean": 3.263566255569458,
"margin_dpo/loss_margin_mean": 32.63566207885742,
"margin_dpo/margin_mean": 32.63566207885742,
"margin_dpo/margin_std": 29.368499755859375,
"step": 632
},
{
"epoch": 0.9295154185022027,
"grad_norm": 59.74918746948242,
"learning_rate": 7.866980873399015e-09,
"logits/chosen": -0.6477575898170471,
"logits/rejected": -0.6343536376953125,
"logps/chosen": -80.77423095703125,
"logps/ref_chosen": -57.278167724609375,
"logps/ref_rejected": -91.58395385742188,
"logps/rejected": -142.47764587402344,
"loss": 0.5478,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1920245885848999,
"margin_dpo/beta_margin_grad_std": 0.21717044711112976,
"margin_dpo/beta_margin_mean": 2.739762783050537,
"margin_dpo/loss_margin_mean": 27.397626876831055,
"margin_dpo/margin_mean": 27.397626876831055,
"margin_dpo/margin_std": 24.44476890563965,
"step": 633
},
{
"epoch": 0.9309838472834068,
"grad_norm": 71.1202392578125,
"learning_rate": 7.550765991247654e-09,
"logits/chosen": -0.5574454069137573,
"logits/rejected": -0.539508581161499,
"logps/chosen": -93.28065490722656,
"logps/ref_chosen": -66.61896514892578,
"logps/ref_rejected": -107.12565612792969,
"logps/rejected": -161.80874633789062,
"loss": 0.6534,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.20834138989448547,
"margin_dpo/beta_margin_grad_std": 0.24892690777778625,
"margin_dpo/beta_margin_mean": 2.802140235900879,
"margin_dpo/loss_margin_mean": 28.021400451660156,
"margin_dpo/margin_mean": 28.021400451660156,
"margin_dpo/margin_std": 29.189456939697266,
"step": 634
},
{
"epoch": 0.9324522760646109,
"grad_norm": 49.81635665893555,
"learning_rate": 7.240939871891699e-09,
"logits/chosen": -0.627153754234314,
"logits/rejected": -0.5792471170425415,
"logps/chosen": -96.72550201416016,
"logps/ref_chosen": -73.95551300048828,
"logps/ref_rejected": -82.50045776367188,
"logps/rejected": -133.87303161621094,
"loss": 0.412,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.155021071434021,
"margin_dpo/beta_margin_grad_std": 0.18796978890895844,
"margin_dpo/beta_margin_mean": 2.860257387161255,
"margin_dpo/loss_margin_mean": 28.60257339477539,
"margin_dpo/margin_mean": 28.60257339477539,
"margin_dpo/margin_std": 22.55862808227539,
"step": 635
},
{
"epoch": 0.933920704845815,
"grad_norm": 49.317588806152344,
"learning_rate": 6.937510679537628e-09,
"logits/chosen": -0.5662115812301636,
"logits/rejected": -0.5381814241409302,
"logps/chosen": -82.45319366455078,
"logps/ref_chosen": -59.628910064697266,
"logps/ref_rejected": -81.97883605957031,
"logps/rejected": -137.88497924804688,
"loss": 0.3993,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13900163769721985,
"margin_dpo/beta_margin_grad_std": 0.21276052296161652,
"margin_dpo/beta_margin_mean": 3.308185577392578,
"margin_dpo/loss_margin_mean": 33.08185958862305,
"margin_dpo/margin_mean": 33.08185577392578,
"margin_dpo/margin_std": 23.71514129638672,
"step": 636
},
{
"epoch": 0.9353891336270191,
"grad_norm": 53.16542434692383,
"learning_rate": 6.640486409826785e-09,
"logits/chosen": -0.5961561799049377,
"logits/rejected": -0.5736096501350403,
"logps/chosen": -73.35490417480469,
"logps/ref_chosen": -49.652687072753906,
"logps/ref_rejected": -98.40513610839844,
"logps/rejected": -155.1796112060547,
"loss": 0.3585,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1328592598438263,
"margin_dpo/beta_margin_grad_std": 0.18699194490909576,
"margin_dpo/beta_margin_mean": 3.3072257041931152,
"margin_dpo/loss_margin_mean": 33.0722541809082,
"margin_dpo/margin_mean": 33.0722541809082,
"margin_dpo/margin_std": 25.20583724975586,
"step": 637
},
{
"epoch": 0.9368575624082232,
"grad_norm": 42.64322280883789,
"learning_rate": 6.349874889624962e-09,
"logits/chosen": -0.5751190185546875,
"logits/rejected": -0.5282764434814453,
"logps/chosen": -78.66455841064453,
"logps/ref_chosen": -58.156646728515625,
"logps/ref_rejected": -79.3014907836914,
"logps/rejected": -136.99473571777344,
"loss": 0.3226,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12046533823013306,
"margin_dpo/beta_margin_grad_std": 0.1725090891122818,
"margin_dpo/beta_margin_mean": 3.718533515930176,
"margin_dpo/loss_margin_mean": 37.185333251953125,
"margin_dpo/margin_mean": 37.185333251953125,
"margin_dpo/margin_std": 27.281875610351562,
"step": 638
},
{
"epoch": 0.9383259911894273,
"grad_norm": 57.3282470703125,
"learning_rate": 6.065683776815933e-09,
"logits/chosen": -0.5723918676376343,
"logits/rejected": -0.5074343681335449,
"logps/chosen": -97.81383514404297,
"logps/ref_chosen": -72.32319641113281,
"logps/ref_rejected": -74.2749252319336,
"logps/rejected": -130.6868896484375,
"loss": 0.4425,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14578017592430115,
"margin_dpo/beta_margin_grad_std": 0.1988290250301361,
"margin_dpo/beta_margin_mean": 3.092132568359375,
"margin_dpo/loss_margin_mean": 30.921327590942383,
"margin_dpo/margin_mean": 30.921327590942383,
"margin_dpo/margin_std": 24.539897918701172,
"step": 639
},
{
"epoch": 0.9397944199706314,
"grad_norm": 45.74781036376953,
"learning_rate": 5.7879205600998296e-09,
"logits/chosen": -0.5912868976593018,
"logits/rejected": -0.5555776357650757,
"logps/chosen": -78.59037780761719,
"logps/ref_chosen": -56.13436508178711,
"logps/ref_rejected": -108.60014343261719,
"logps/rejected": -167.8488006591797,
"loss": 0.3056,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12402527034282684,
"margin_dpo/beta_margin_grad_std": 0.15019166469573975,
"margin_dpo/beta_margin_mean": 3.6792640686035156,
"margin_dpo/loss_margin_mean": 36.792640686035156,
"margin_dpo/margin_mean": 36.792640686035156,
"margin_dpo/margin_std": 29.762346267700195,
"step": 640
},
{
"epoch": 0.9412628487518355,
"grad_norm": 51.42761993408203,
"learning_rate": 5.516592558795746e-09,
"logits/chosen": -0.6235780715942383,
"logits/rejected": -0.5653523206710815,
"logps/chosen": -88.91046142578125,
"logps/ref_chosen": -64.99689483642578,
"logps/ref_rejected": -86.99232482910156,
"logps/rejected": -142.96499633789062,
"loss": 0.3758,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14689452946186066,
"margin_dpo/beta_margin_grad_std": 0.16710370779037476,
"margin_dpo/beta_margin_mean": 3.2059097290039062,
"margin_dpo/loss_margin_mean": 32.05909729003906,
"margin_dpo/margin_mean": 32.05909729003906,
"margin_dpo/margin_std": 29.564998626708984,
"step": 641
},
{
"epoch": 0.9427312775330396,
"grad_norm": 79.5660629272461,
"learning_rate": 5.251706922648868e-09,
"logits/chosen": -0.5625093579292297,
"logits/rejected": -0.5258715152740479,
"logps/chosen": -90.38214111328125,
"logps/ref_chosen": -65.68924713134766,
"logps/ref_rejected": -110.24205017089844,
"logps/rejected": -170.61810302734375,
"loss": 0.4846,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15378239750862122,
"margin_dpo/beta_margin_grad_std": 0.2159113883972168,
"margin_dpo/beta_margin_mean": 3.5683140754699707,
"margin_dpo/loss_margin_mean": 35.683143615722656,
"margin_dpo/margin_mean": 35.68313980102539,
"margin_dpo/margin_std": 30.738298416137695,
"step": 642
},
{
"epoch": 0.9441997063142438,
"grad_norm": 50.391510009765625,
"learning_rate": 4.993270631642038e-09,
"logits/chosen": -0.6333717107772827,
"logits/rejected": -0.6052130460739136,
"logps/chosen": -71.46492004394531,
"logps/ref_chosen": -51.94999694824219,
"logps/ref_rejected": -87.46833801269531,
"logps/rejected": -137.68312072753906,
"loss": 0.4285,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14480799436569214,
"margin_dpo/beta_margin_grad_std": 0.19451884925365448,
"margin_dpo/beta_margin_mean": 3.069986581802368,
"margin_dpo/loss_margin_mean": 30.699865341186523,
"margin_dpo/margin_mean": 30.699865341186523,
"margin_dpo/margin_std": 23.991680145263672,
"step": 643
},
{
"epoch": 0.9456681350954479,
"grad_norm": 77.25302124023438,
"learning_rate": 4.741290495811873e-09,
"logits/chosen": -0.562663197517395,
"logits/rejected": -0.5326156616210938,
"logps/chosen": -79.98289489746094,
"logps/ref_chosen": -59.017662048339844,
"logps/ref_rejected": -87.13668823242188,
"logps/rejected": -138.28848266601562,
"loss": 0.5598,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1840367615222931,
"margin_dpo/beta_margin_grad_std": 0.2329137921333313,
"margin_dpo/beta_margin_mean": 3.018655776977539,
"margin_dpo/loss_margin_mean": 30.18655776977539,
"margin_dpo/margin_mean": 30.18655776977539,
"margin_dpo/margin_std": 28.526702880859375,
"step": 644
},
{
"epoch": 0.947136563876652,
"grad_norm": 75.70096588134766,
"learning_rate": 4.495773155069299e-09,
"logits/chosen": -0.5764358043670654,
"logits/rejected": -0.5558615922927856,
"logps/chosen": -79.81644439697266,
"logps/ref_chosen": -55.87602233886719,
"logps/ref_rejected": -97.78080749511719,
"logps/rejected": -150.7290496826172,
"loss": 0.5337,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18957525491714478,
"margin_dpo/beta_margin_grad_std": 0.21459272503852844,
"margin_dpo/beta_margin_mean": 2.9007816314697266,
"margin_dpo/loss_margin_mean": 29.007814407348633,
"margin_dpo/margin_mean": 29.007816314697266,
"margin_dpo/margin_std": 27.730712890625,
"step": 645
},
{
"epoch": 0.9486049926578561,
"grad_norm": 50.839752197265625,
"learning_rate": 4.256725079024553e-09,
"logits/chosen": -0.6054178476333618,
"logits/rejected": -0.5551047325134277,
"logps/chosen": -84.0325927734375,
"logps/ref_chosen": -61.275787353515625,
"logps/ref_rejected": -77.50580596923828,
"logps/rejected": -133.4116668701172,
"loss": 0.3167,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11927846819162369,
"margin_dpo/beta_margin_grad_std": 0.160105362534523,
"margin_dpo/beta_margin_mean": 3.3149056434631348,
"margin_dpo/loss_margin_mean": 33.14905548095703,
"margin_dpo/margin_mean": 33.14905548095703,
"margin_dpo/margin_std": 22.489887237548828,
"step": 646
},
{
"epoch": 0.9500734214390602,
"grad_norm": 81.40752410888672,
"learning_rate": 4.024152566816791e-09,
"logits/chosen": -0.5593730807304382,
"logits/rejected": -0.5357339382171631,
"logps/chosen": -78.91641235351562,
"logps/ref_chosen": -54.852413177490234,
"logps/ref_rejected": -93.5194091796875,
"logps/rejected": -150.42044067382812,
"loss": 0.4999,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16081605851650238,
"margin_dpo/beta_margin_grad_std": 0.2314681112766266,
"margin_dpo/beta_margin_mean": 3.2837038040161133,
"margin_dpo/loss_margin_mean": 32.837039947509766,
"margin_dpo/margin_mean": 32.837039947509766,
"margin_dpo/margin_std": 26.819320678710938,
"step": 647
},
{
"epoch": 0.9515418502202643,
"grad_norm": 48.46821212768555,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": -0.61167973279953,
"logits/rejected": -0.6027648448944092,
"logps/chosen": -74.04869842529297,
"logps/ref_chosen": -54.17146682739258,
"logps/ref_rejected": -98.71279907226562,
"logps/rejected": -159.05592346191406,
"loss": 0.3695,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1377037763595581,
"margin_dpo/beta_margin_grad_std": 0.1926315426826477,
"margin_dpo/beta_margin_mean": 4.046590328216553,
"margin_dpo/loss_margin_mean": 40.465904235839844,
"margin_dpo/margin_mean": 40.465904235839844,
"margin_dpo/margin_std": 34.20042037963867,
"step": 648
},
{
"epoch": 0.9530102790014684,
"grad_norm": 50.36833572387695,
"learning_rate": 3.5784585771215235e-09,
"logits/chosen": -0.6533620357513428,
"logits/rejected": -0.6218982934951782,
"logps/chosen": -83.10621643066406,
"logps/ref_chosen": -62.4803466796875,
"logps/ref_rejected": -80.07717895507812,
"logps/rejected": -129.4200897216797,
"loss": 0.5299,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1947104036808014,
"margin_dpo/beta_margin_grad_std": 0.2096787691116333,
"margin_dpo/beta_margin_mean": 2.871704339981079,
"margin_dpo/loss_margin_mean": 28.717042922973633,
"margin_dpo/margin_mean": 28.717044830322266,
"margin_dpo/margin_std": 28.58915138244629,
"step": 649
},
{
"epoch": 0.9544787077826725,
"grad_norm": 59.41923522949219,
"learning_rate": 3.3653488440851253e-09,
"logits/chosen": -0.5570046901702881,
"logits/rejected": -0.5465147495269775,
"logps/chosen": -80.50581359863281,
"logps/ref_chosen": -56.09281921386719,
"logps/ref_rejected": -98.26483917236328,
"logps/rejected": -159.12442016601562,
"loss": 0.3573,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1336677372455597,
"margin_dpo/beta_margin_grad_std": 0.1835484653711319,
"margin_dpo/beta_margin_mean": 3.6446590423583984,
"margin_dpo/loss_margin_mean": 36.44658660888672,
"margin_dpo/margin_mean": 36.44658660888672,
"margin_dpo/margin_std": 28.654094696044922,
"step": 650
},
{
"epoch": 0.9559471365638766,
"grad_norm": 38.3771858215332,
"learning_rate": 3.158738163478475e-09,
"logits/chosen": -0.607953667640686,
"logits/rejected": -0.6089369058609009,
"logps/chosen": -63.09129333496094,
"logps/ref_chosen": -43.42544937133789,
"logps/ref_rejected": -99.9579086303711,
"logps/rejected": -155.23358154296875,
"loss": 0.3196,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1269427090883255,
"margin_dpo/beta_margin_grad_std": 0.16582195460796356,
"margin_dpo/beta_margin_mean": 3.560983657836914,
"margin_dpo/loss_margin_mean": 35.609832763671875,
"margin_dpo/margin_mean": 35.609832763671875,
"margin_dpo/margin_std": 27.03875732421875,
"step": 651
},
{
"epoch": 0.9574155653450808,
"grad_norm": 42.76301956176758,
"learning_rate": 2.9586319796851555e-09,
"logits/chosen": -0.641417384147644,
"logits/rejected": -0.6177515983581543,
"logps/chosen": -78.98847961425781,
"logps/ref_chosen": -62.57680892944336,
"logps/ref_rejected": -111.76779174804688,
"logps/rejected": -163.6732177734375,
"loss": 0.3394,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1356583833694458,
"margin_dpo/beta_margin_grad_std": 0.16708874702453613,
"margin_dpo/beta_margin_mean": 3.5493762493133545,
"margin_dpo/loss_margin_mean": 35.49375915527344,
"margin_dpo/margin_mean": 35.49375915527344,
"margin_dpo/margin_std": 28.135786056518555,
"step": 652
},
{
"epoch": 0.9588839941262849,
"grad_norm": 52.41037368774414,
"learning_rate": 2.7650355656892166e-09,
"logits/chosen": -0.6357418298721313,
"logits/rejected": -0.6146754026412964,
"logps/chosen": -84.56446838378906,
"logps/ref_chosen": -61.11295700073242,
"logps/ref_rejected": -103.24960327148438,
"logps/rejected": -162.56381225585938,
"loss": 0.3193,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11913929879665375,
"margin_dpo/beta_margin_grad_std": 0.17990511655807495,
"margin_dpo/beta_margin_mean": 3.5862698554992676,
"margin_dpo/loss_margin_mean": 35.86269760131836,
"margin_dpo/margin_mean": 35.862701416015625,
"margin_dpo/margin_std": 25.911727905273438,
"step": 653
},
{
"epoch": 0.960352422907489,
"grad_norm": 70.10198974609375,
"learning_rate": 2.577954022936174e-09,
"logits/chosen": -0.6373894810676575,
"logits/rejected": -0.6333979368209839,
"logps/chosen": -87.05609130859375,
"logps/ref_chosen": -61.7281379699707,
"logps/ref_rejected": -98.7738037109375,
"logps/rejected": -153.52032470703125,
"loss": 0.5245,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17562022805213928,
"margin_dpo/beta_margin_grad_std": 0.2261282503604889,
"margin_dpo/beta_margin_mean": 2.9418554306030273,
"margin_dpo/loss_margin_mean": 29.418556213378906,
"margin_dpo/margin_mean": 29.418556213378906,
"margin_dpo/margin_std": 28.580772399902344,
"step": 654
},
{
"epoch": 0.9618208516886931,
"grad_norm": 69.47087860107422,
"learning_rate": 2.397392281198729e-09,
"logits/chosen": -0.6162744760513306,
"logits/rejected": -0.6169338226318359,
"logps/chosen": -71.07903289794922,
"logps/ref_chosen": -49.576812744140625,
"logps/ref_rejected": -98.29183197021484,
"logps/rejected": -150.40371704101562,
"loss": 0.5102,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.18301187455654144,
"margin_dpo/beta_margin_grad_std": 0.21452929079532623,
"margin_dpo/beta_margin_mean": 3.060966968536377,
"margin_dpo/loss_margin_mean": 30.609668731689453,
"margin_dpo/margin_mean": 30.609668731689453,
"margin_dpo/margin_std": 29.15388298034668,
"step": 655
},
{
"epoch": 0.9632892804698973,
"grad_norm": 40.41490936279297,
"learning_rate": 2.223355098446622e-09,
"logits/chosen": -0.5592731237411499,
"logits/rejected": -0.5631238222122192,
"logps/chosen": -73.43663024902344,
"logps/ref_chosen": -52.54943084716797,
"logps/ref_rejected": -113.67464447021484,
"logps/rejected": -176.38644409179688,
"loss": 0.2391,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.0935378447175026,
"margin_dpo/beta_margin_grad_std": 0.15634706616401672,
"margin_dpo/beta_margin_mean": 4.182460784912109,
"margin_dpo/loss_margin_mean": 41.824607849121094,
"margin_dpo/margin_mean": 41.824607849121094,
"margin_dpo/margin_std": 25.468910217285156,
"step": 656
},
{
"epoch": 0.9647577092511013,
"grad_norm": 45.6422233581543,
"learning_rate": 2.055847060721566e-09,
"logits/chosen": -0.5981370210647583,
"logits/rejected": -0.5761264562606812,
"logps/chosen": -68.69126892089844,
"logps/ref_chosen": -46.700538635253906,
"logps/ref_rejected": -97.91487121582031,
"logps/rejected": -157.28271484375,
"loss": 0.3403,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11615358293056488,
"margin_dpo/beta_margin_grad_std": 0.175856813788414,
"margin_dpo/beta_margin_mean": 3.737710952758789,
"margin_dpo/loss_margin_mean": 37.377105712890625,
"margin_dpo/margin_mean": 37.377105712890625,
"margin_dpo/margin_std": 28.65097427368164,
"step": 657
},
{
"epoch": 0.9662261380323054,
"grad_norm": 57.90339660644531,
"learning_rate": 1.8948725820160662e-09,
"logits/chosen": -0.6264636516571045,
"logits/rejected": -0.5879380702972412,
"logps/chosen": -86.60824584960938,
"logps/ref_chosen": -60.958213806152344,
"logps/ref_rejected": -95.93949127197266,
"logps/rejected": -156.69830322265625,
"loss": 0.4473,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1491597294807434,
"margin_dpo/beta_margin_grad_std": 0.21673035621643066,
"margin_dpo/beta_margin_mean": 3.5108790397644043,
"margin_dpo/loss_margin_mean": 35.10879135131836,
"margin_dpo/margin_mean": 35.108787536621094,
"margin_dpo/margin_std": 29.642911911010742,
"step": 658
},
{
"epoch": 0.9676945668135095,
"grad_norm": 56.96932601928711,
"learning_rate": 1.7404359041573723e-09,
"logits/chosen": -0.5769931077957153,
"logits/rejected": -0.5069276690483093,
"logps/chosen": -96.17684936523438,
"logps/ref_chosen": -76.74298095703125,
"logps/ref_rejected": -87.4709701538086,
"logps/rejected": -141.29806518554688,
"loss": 0.4928,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16625207662582397,
"margin_dpo/beta_margin_grad_std": 0.23161795735359192,
"margin_dpo/beta_margin_mean": 3.4393229484558105,
"margin_dpo/loss_margin_mean": 34.39323043823242,
"margin_dpo/margin_mean": 34.39323043823242,
"margin_dpo/margin_std": 29.151775360107422,
"step": 659
},
{
"epoch": 0.9691629955947136,
"grad_norm": 51.775634765625,
"learning_rate": 1.592541096695571e-09,
"logits/chosen": -0.6120225191116333,
"logits/rejected": -0.5639553070068359,
"logps/chosen": -80.46331787109375,
"logps/ref_chosen": -59.047882080078125,
"logps/ref_rejected": -75.96005249023438,
"logps/rejected": -135.1595458984375,
"loss": 0.2938,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11358514428138733,
"margin_dpo/beta_margin_grad_std": 0.17192693054676056,
"margin_dpo/beta_margin_mean": 3.778407096862793,
"margin_dpo/loss_margin_mean": 37.7840690612793,
"margin_dpo/margin_mean": 37.7840690612793,
"margin_dpo/margin_std": 27.653093338012695,
"step": 660
},
{
"epoch": 0.9706314243759178,
"grad_norm": 64.97528839111328,
"learning_rate": 1.4511920567963908e-09,
"logits/chosen": -0.5847969055175781,
"logits/rejected": -0.5379676818847656,
"logps/chosen": -71.49467468261719,
"logps/ref_chosen": -50.673973083496094,
"logps/ref_rejected": -86.00569152832031,
"logps/rejected": -141.72848510742188,
"loss": 0.4565,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.14623390138149261,
"margin_dpo/beta_margin_grad_std": 0.21268951892852783,
"margin_dpo/beta_margin_mean": 3.4902100563049316,
"margin_dpo/loss_margin_mean": 34.902099609375,
"margin_dpo/margin_mean": 34.902099609375,
"margin_dpo/margin_std": 29.28716278076172,
"step": 661
},
{
"epoch": 0.9720998531571219,
"grad_norm": 51.50436782836914,
"learning_rate": 1.3163925091384532e-09,
"logits/chosen": -0.6173335313796997,
"logits/rejected": -0.5659915208816528,
"logps/chosen": -93.51567077636719,
"logps/ref_chosen": -69.26106262207031,
"logps/ref_rejected": -89.05593872070312,
"logps/rejected": -144.10789489746094,
"loss": 0.3806,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1421985775232315,
"margin_dpo/beta_margin_grad_std": 0.1747194081544876,
"margin_dpo/beta_margin_mean": 3.0797348022460938,
"margin_dpo/loss_margin_mean": 30.797348022460938,
"margin_dpo/margin_mean": 30.797348022460938,
"margin_dpo/margin_std": 25.398778915405273,
"step": 662
},
{
"epoch": 0.973568281938326,
"grad_norm": 38.44890594482422,
"learning_rate": 1.1881460058152382e-09,
"logits/chosen": -0.647502064704895,
"logits/rejected": -0.6262093782424927,
"logps/chosen": -83.27813720703125,
"logps/ref_chosen": -64.87891387939453,
"logps/ref_rejected": -113.92536926269531,
"logps/rejected": -165.39273071289062,
"loss": 0.3283,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12441954016685486,
"margin_dpo/beta_margin_grad_std": 0.1589188277721405,
"margin_dpo/beta_margin_mean": 3.306814670562744,
"margin_dpo/loss_margin_mean": 33.068145751953125,
"margin_dpo/margin_mean": 33.068145751953125,
"margin_dpo/margin_std": 24.52547264099121,
"step": 663
},
{
"epoch": 0.9750367107195301,
"grad_norm": 69.16484832763672,
"learning_rate": 1.066455926241383e-09,
"logits/chosen": -0.5968215465545654,
"logits/rejected": -0.5680118799209595,
"logps/chosen": -84.52749633789062,
"logps/ref_chosen": -60.88847351074219,
"logps/ref_rejected": -105.521728515625,
"logps/rejected": -166.35784912109375,
"loss": 0.426,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11887051165103912,
"margin_dpo/beta_margin_grad_std": 0.19583344459533691,
"margin_dpo/beta_margin_mean": 3.7197093963623047,
"margin_dpo/loss_margin_mean": 37.19709014892578,
"margin_dpo/margin_mean": 37.19709014892578,
"margin_dpo/margin_std": 27.019386291503906,
"step": 664
},
{
"epoch": 0.9765051395007343,
"grad_norm": 42.6618537902832,
"learning_rate": 9.513254770636137e-10,
"logits/chosen": -0.6599475145339966,
"logits/rejected": -0.618366003036499,
"logps/chosen": -81.63275146484375,
"logps/ref_chosen": -60.56413269042969,
"logps/ref_rejected": -84.8088150024414,
"logps/rejected": -137.7115478515625,
"loss": 0.3485,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13592152297496796,
"margin_dpo/beta_margin_grad_std": 0.1685512661933899,
"margin_dpo/beta_margin_mean": 3.183411121368408,
"margin_dpo/loss_margin_mean": 31.834110260009766,
"margin_dpo/margin_mean": 31.834110260009766,
"margin_dpo/margin_std": 23.29065704345703,
"step": 665
},
{
"epoch": 0.9779735682819384,
"grad_norm": 60.117515563964844,
"learning_rate": 8.427576920763956e-10,
"logits/chosen": -0.5868717432022095,
"logits/rejected": -0.5477631688117981,
"logps/chosen": -88.28842163085938,
"logps/ref_chosen": -64.41996002197266,
"logps/ref_rejected": -95.89163208007812,
"logps/rejected": -154.99008178710938,
"loss": 0.4224,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1320444643497467,
"margin_dpo/beta_margin_grad_std": 0.18931497633457184,
"margin_dpo/beta_margin_mean": 3.522998809814453,
"margin_dpo/loss_margin_mean": 35.22998809814453,
"margin_dpo/margin_mean": 35.22998809814453,
"margin_dpo/margin_std": 25.960124969482422,
"step": 666
},
{
"epoch": 0.9794419970631424,
"grad_norm": 56.65580749511719,
"learning_rate": 7.407554321417764e-10,
"logits/chosen": -0.6088787317276001,
"logits/rejected": -0.5590524673461914,
"logps/chosen": -94.58509063720703,
"logps/ref_chosen": -69.27703094482422,
"logps/ref_rejected": -87.83549499511719,
"logps/rejected": -147.4503173828125,
"loss": 0.3265,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12345196306705475,
"margin_dpo/beta_margin_grad_std": 0.16755497455596924,
"margin_dpo/beta_margin_mean": 3.4306764602661133,
"margin_dpo/loss_margin_mean": 34.3067626953125,
"margin_dpo/margin_mean": 34.3067626953125,
"margin_dpo/margin_std": 24.47415542602539,
"step": 667
},
{
"epoch": 0.9809104258443465,
"grad_norm": 70.59870910644531,
"learning_rate": 6.453213851142225e-10,
"logits/chosen": -0.6269364356994629,
"logits/rejected": -0.5885031819343567,
"logps/chosen": -96.2440185546875,
"logps/ref_chosen": -72.60400390625,
"logps/ref_rejected": -103.73905181884766,
"logps/rejected": -160.45053100585938,
"loss": 0.4488,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1537022888660431,
"margin_dpo/beta_margin_grad_std": 0.21864807605743408,
"margin_dpo/beta_margin_mean": 3.3071470260620117,
"margin_dpo/loss_margin_mean": 33.07147216796875,
"margin_dpo/margin_mean": 33.07147216796875,
"margin_dpo/margin_std": 25.884002685546875,
"step": 668
},
{
"epoch": 0.9823788546255506,
"grad_norm": 68.7284927368164,
"learning_rate": 5.564580657695939e-10,
"logits/chosen": -0.6374561786651611,
"logits/rejected": -0.5934668183326721,
"logps/chosen": -65.82669067382812,
"logps/ref_chosen": -46.116416931152344,
"logps/ref_rejected": -77.92434692382812,
"logps/rejected": -135.83099365234375,
"loss": 0.5109,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.15599049627780914,
"margin_dpo/beta_margin_grad_std": 0.2399427890777588,
"margin_dpo/beta_margin_mean": 3.8196370601654053,
"margin_dpo/loss_margin_mean": 38.19636917114258,
"margin_dpo/margin_mean": 38.19636917114258,
"margin_dpo/margin_std": 32.66187286376953,
"step": 669
},
{
"epoch": 0.9838472834067548,
"grad_norm": 44.16929626464844,
"learning_rate": 4.741678157389739e-10,
"logits/chosen": -0.6017849445343018,
"logits/rejected": -0.5695161819458008,
"logps/chosen": -83.34823608398438,
"logps/ref_chosen": -62.34575653076172,
"logps/ref_rejected": -96.9405517578125,
"logps/rejected": -156.87435913085938,
"loss": 0.2739,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.10903677344322205,
"margin_dpo/beta_margin_grad_std": 0.15663883090019226,
"margin_dpo/beta_margin_mean": 3.8931329250335693,
"margin_dpo/loss_margin_mean": 38.93132781982422,
"margin_dpo/margin_mean": 38.93132781982422,
"margin_dpo/margin_std": 25.873010635375977,
"step": 670
},
{
"epoch": 0.9853157121879589,
"grad_norm": 55.043914794921875,
"learning_rate": 3.9845280344705245e-10,
"logits/chosen": -0.6250673532485962,
"logits/rejected": -0.5933674573898315,
"logps/chosen": -72.50352478027344,
"logps/ref_chosen": -48.00010681152344,
"logps/ref_rejected": -83.81932067871094,
"logps/rejected": -143.89370727539062,
"loss": 0.3551,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13739252090454102,
"margin_dpo/beta_margin_grad_std": 0.1742577999830246,
"margin_dpo/beta_margin_mean": 3.5570971965789795,
"margin_dpo/loss_margin_mean": 35.57096862792969,
"margin_dpo/margin_mean": 35.57096862792969,
"margin_dpo/margin_std": 28.424989700317383,
"step": 671
},
{
"epoch": 0.986784140969163,
"grad_norm": 66.3819351196289,
"learning_rate": 3.293150240547549e-10,
"logits/chosen": -0.6075701117515564,
"logits/rejected": -0.5688859820365906,
"logps/chosen": -82.87799072265625,
"logps/ref_chosen": -58.583290100097656,
"logps/ref_rejected": -93.14014434814453,
"logps/rejected": -149.91152954101562,
"loss": 0.479,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.17206090688705444,
"margin_dpo/beta_margin_grad_std": 0.21605950593948364,
"margin_dpo/beta_margin_mean": 3.2476677894592285,
"margin_dpo/loss_margin_mean": 32.47667694091797,
"margin_dpo/margin_mean": 32.47667694091797,
"margin_dpo/margin_std": 29.792341232299805,
"step": 672
},
{
"epoch": 0.9882525697503671,
"grad_norm": 41.944400787353516,
"learning_rate": 2.6675629940689504e-10,
"logits/chosen": -0.6093118786811829,
"logits/rejected": -0.5795783996582031,
"logps/chosen": -68.00104522705078,
"logps/ref_chosen": -46.72320556640625,
"logps/ref_rejected": -85.29623413085938,
"logps/rejected": -143.71388244628906,
"loss": 0.3057,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.12082840502262115,
"margin_dpo/beta_margin_grad_std": 0.16586080193519592,
"margin_dpo/beta_margin_mean": 3.7139804363250732,
"margin_dpo/loss_margin_mean": 37.139801025390625,
"margin_dpo/margin_mean": 37.13980484008789,
"margin_dpo/margin_std": 27.277408599853516,
"step": 673
},
{
"epoch": 0.9897209985315712,
"grad_norm": 37.851444244384766,
"learning_rate": 2.1077827798404725e-10,
"logits/chosen": -0.56818687915802,
"logits/rejected": -0.5401608943939209,
"logps/chosen": -67.67996215820312,
"logps/ref_chosen": -45.445526123046875,
"logps/ref_rejected": -70.04593658447266,
"logps/rejected": -130.27374267578125,
"loss": 0.2841,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11634500324726105,
"margin_dpo/beta_margin_grad_std": 0.15289074182510376,
"margin_dpo/beta_margin_mean": 3.799337387084961,
"margin_dpo/loss_margin_mean": 37.993370056152344,
"margin_dpo/margin_mean": 37.993370056152344,
"margin_dpo/margin_std": 28.400920867919922,
"step": 674
},
{
"epoch": 0.9911894273127754,
"grad_norm": 61.00739669799805,
"learning_rate": 1.6138243485910863e-10,
"logits/chosen": -0.5657342672348022,
"logits/rejected": -0.5385361909866333,
"logps/chosen": -65.09626770019531,
"logps/ref_chosen": -44.17628479003906,
"logps/ref_rejected": -74.09197998046875,
"logps/rejected": -134.56707763671875,
"loss": 0.3851,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.11753740161657333,
"margin_dpo/beta_margin_grad_std": 0.20709985494613647,
"margin_dpo/beta_margin_mean": 3.9555118083953857,
"margin_dpo/loss_margin_mean": 39.555118560791016,
"margin_dpo/margin_mean": 39.555118560791016,
"margin_dpo/margin_std": 27.490642547607422,
"step": 675
},
{
"epoch": 0.9926578560939795,
"grad_norm": 76.93489837646484,
"learning_rate": 1.1857007165852472e-10,
"logits/chosen": -0.639100968837738,
"logits/rejected": -0.6050753593444824,
"logps/chosen": -96.79466247558594,
"logps/ref_chosen": -71.39852142333984,
"logps/ref_rejected": -88.3587646484375,
"logps/rejected": -150.10198974609375,
"loss": 0.4104,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13633592426776886,
"margin_dpo/beta_margin_grad_std": 0.18744832277297974,
"margin_dpo/beta_margin_mean": 3.6347103118896484,
"margin_dpo/loss_margin_mean": 36.34709930419922,
"margin_dpo/margin_mean": 36.34709930419922,
"margin_dpo/margin_std": 28.50853729248047,
"step": 676
},
{
"epoch": 0.9941262848751835,
"grad_norm": 66.0276107788086,
"learning_rate": 8.23423165278725e-11,
"logits/chosen": -0.6146172285079956,
"logits/rejected": -0.565468966960907,
"logps/chosen": -79.82300567626953,
"logps/ref_chosen": -56.52743911743164,
"logps/ref_rejected": -78.22654724121094,
"logps/rejected": -138.8525390625,
"loss": 0.4429,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1386735886335373,
"margin_dpo/beta_margin_grad_std": 0.21394288539886475,
"margin_dpo/beta_margin_mean": 3.733041286468506,
"margin_dpo/loss_margin_mean": 37.330413818359375,
"margin_dpo/margin_mean": 37.330413818359375,
"margin_dpo/margin_std": 28.455005645751953,
"step": 677
},
{
"epoch": 0.9955947136563876,
"grad_norm": 50.60527801513672,
"learning_rate": 5.270012410216185e-11,
"logits/chosen": -0.5987369418144226,
"logits/rejected": -0.5752243995666504,
"logps/chosen": -68.12297058105469,
"logps/ref_chosen": -46.13447570800781,
"logps/ref_rejected": -80.60462951660156,
"logps/rejected": -139.23318481445312,
"loss": 0.4505,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.1656286120414734,
"margin_dpo/beta_margin_grad_std": 0.21251779794692993,
"margin_dpo/beta_margin_mean": 3.6640052795410156,
"margin_dpo/loss_margin_mean": 36.640052795410156,
"margin_dpo/margin_mean": 36.640052795410156,
"margin_dpo/margin_std": 31.15386390686035,
"step": 678
},
{
"epoch": 0.9970631424375918,
"grad_norm": 47.600643157958984,
"learning_rate": 2.9644275480772416e-11,
"logits/chosen": -0.6062077283859253,
"logits/rejected": -0.5733453035354614,
"logps/chosen": -72.9251937866211,
"logps/ref_chosen": -50.294921875,
"logps/ref_rejected": -76.59813690185547,
"logps/rejected": -136.07611083984375,
"loss": 0.3291,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.112078458070755,
"margin_dpo/beta_margin_grad_std": 0.17374977469444275,
"margin_dpo/beta_margin_mean": 3.6847691535949707,
"margin_dpo/loss_margin_mean": 36.84769058227539,
"margin_dpo/margin_mean": 36.84769058227539,
"margin_dpo/margin_std": 26.915252685546875,
"step": 679
},
{
"epoch": 0.9985315712187959,
"grad_norm": 55.98041915893555,
"learning_rate": 1.31753782067201e-11,
"logits/chosen": -0.6170350313186646,
"logits/rejected": -0.5841037034988403,
"logps/chosen": -99.70010375976562,
"logps/ref_chosen": -76.91569519042969,
"logps/ref_rejected": -112.384765625,
"logps/rejected": -171.31031799316406,
"loss": 0.3861,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.13496683537960052,
"margin_dpo/beta_margin_grad_std": 0.1988377571105957,
"margin_dpo/beta_margin_mean": 3.6141152381896973,
"margin_dpo/loss_margin_mean": 36.141151428222656,
"margin_dpo/margin_mean": 36.141151428222656,
"margin_dpo/margin_std": 29.207448959350586,
"step": 680
},
{
"epoch": 1.0,
"grad_norm": 52.392547607421875,
"learning_rate": 3.2938662507808745e-12,
"logits/chosen": -0.6597200632095337,
"logits/rejected": -0.6327718496322632,
"logps/chosen": -84.40997314453125,
"logps/ref_chosen": -60.957279205322266,
"logps/ref_rejected": -88.5579833984375,
"logps/rejected": -143.79640197753906,
"loss": 0.4602,
"margin_dpo/beta": 0.10000000149011612,
"margin_dpo/beta_margin_grad_mean": -0.16065430641174316,
"margin_dpo/beta_margin_grad_std": 0.20822513103485107,
"margin_dpo/beta_margin_mean": 3.1785736083984375,
"margin_dpo/loss_margin_mean": 31.785736083984375,
"margin_dpo/margin_mean": 31.785736083984375,
"margin_dpo/margin_std": 28.091190338134766,
"step": 681
},
{
"epoch": 1.0,
"step": 681,
"total_flos": 0.0,
"train_loss": 0.572698849610295,
"train_runtime": 1998.3785,
"train_samples_per_second": 21.817,
"train_steps_per_second": 0.341
}
],
"logging_steps": 1,
"max_steps": 681,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}