Files
llama-3-8b-base-epsilon-dpo…/trainer_state.json
ModelHub XC 121c6f2962 初始化项目,由ModelHub XC社区提供模型
Model: W-61/llama-3-8b-base-epsilon-dpo-hh-harmless-4xh200-batch-64-20260418-003215
Source: Original Platform
2026-06-13 14:08:31 +08:00

18047 lines
724 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.999244142101285,
"eval_steps": 100,
"global_step": 661,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0015117157974300832,
"epsilon_dpo/beta": 0.0998849868774414,
"epsilon_dpo/beta_margin_grad_mean": -0.5000842809677124,
"epsilon_dpo/beta_margin_grad_std": 0.006420796271413565,
"epsilon_dpo/beta_margin_mean": -0.0003377889806870371,
"epsilon_dpo/beta_margin_std": 0.02568790502846241,
"epsilon_dpo/loss_margin_mean": -0.0013527870178222656,
"grad_norm": 28.214866638183594,
"kl/avg_steps": 0.125,
"kl/beta": 0.10000000149011612,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 0.0,
"logits/chosen": -0.293241411447525,
"logits/rejected": -0.34447842836380005,
"logps/chosen": -64.5841293334961,
"logps/ref_chosen": -64.61280822753906,
"logps/ref_rejected": -64.17195129394531,
"logps/rejected": -64.14192199707031,
"loss": 1.3868,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.0027694925665855408,
"rewards/margins": -0.0003378365363460034,
"rewards/rejected": 0.0031073291320353746,
"step": 1
},
{
"epoch": 0.0030234315948601664,
"epsilon_dpo/beta": 0.09976029396057129,
"epsilon_dpo/beta_margin_grad_mean": -0.49912163615226746,
"epsilon_dpo/beta_margin_grad_std": 0.007172735407948494,
"epsilon_dpo/beta_margin_mean": 0.0035140058025717735,
"epsilon_dpo/beta_margin_std": 0.028697991743683815,
"epsilon_dpo/loss_margin_mean": 0.03744968771934509,
"grad_norm": 27.765911102294922,
"kl/avg_steps": 0.125,
"kl/beta": 0.09987515956163406,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 7.462686567164179e-09,
"logits/chosen": -0.2665444612503052,
"logits/rejected": -0.3357340097427368,
"logps/chosen": -56.101890563964844,
"logps/ref_chosen": -56.0989990234375,
"logps/ref_rejected": -66.59971618652344,
"logps/rejected": -66.64006042480469,
"loss": 1.383,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0004388358211144805,
"rewards/margins": 0.003514041192829609,
"rewards/rejected": -0.003952877130359411,
"step": 2
},
{
"epoch": 0.0045351473922902496,
"epsilon_dpo/beta": 0.09991631656885147,
"epsilon_dpo/beta_margin_grad_mean": -0.5003474354743958,
"epsilon_dpo/beta_margin_grad_std": 0.007928181439638138,
"epsilon_dpo/beta_margin_mean": -0.0013886871747672558,
"epsilon_dpo/beta_margin_std": 0.03172110393643379,
"epsilon_dpo/loss_margin_mean": -0.011415421962738037,
"grad_norm": 31.248964309692383,
"kl/avg_steps": -0.15625,
"kl/beta": 0.09975046664476395,
"kl/n_epsilon_steps": 0.578125,
"kl/p_epsilon_steps": 0.421875,
"learning_rate": 1.4925373134328357e-08,
"logits/chosen": -0.3116225004196167,
"logits/rejected": -0.3542691767215729,
"logps/chosen": -65.43191528320312,
"logps/ref_chosen": -65.45726013183594,
"logps/ref_rejected": -90.82853698730469,
"logps/rejected": -90.7917709350586,
"loss": 1.3879,
"rewards/accuracies": 0.421875,
"rewards/chosen": 0.0024507236666977406,
"rewards/margins": -0.00138873141258955,
"rewards/rejected": 0.0038394550792872906,
"step": 3
},
{
"epoch": 0.006046863189720333,
"epsilon_dpo/beta": 0.09979166090488434,
"epsilon_dpo/beta_margin_grad_mean": -0.4990536868572235,
"epsilon_dpo/beta_margin_grad_std": 0.008320465683937073,
"epsilon_dpo/beta_margin_mean": 0.00378626910969615,
"epsilon_dpo/beta_margin_std": 0.0332907997071743,
"epsilon_dpo/loss_margin_mean": 0.04052528738975525,
"grad_norm": 34.140968322753906,
"kl/avg_steps": 0.125,
"kl/beta": 0.0999065712094307,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 2.2388059701492534e-08,
"logits/chosen": -0.3732798099517822,
"logits/rejected": -0.38962864875793457,
"logps/chosen": -76.84223937988281,
"logps/ref_chosen": -76.86018371582031,
"logps/ref_rejected": -79.91523742675781,
"logps/rejected": -79.93782043457031,
"loss": 1.3828,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.0016458019381389022,
"rewards/margins": 0.003786304732784629,
"rewards/rejected": -0.0021405029110610485,
"step": 4
},
{
"epoch": 0.007558578987150416,
"epsilon_dpo/beta": 0.09979181736707687,
"epsilon_dpo/beta_margin_grad_mean": -0.4996488094329834,
"epsilon_dpo/beta_margin_grad_std": 0.006971836555749178,
"epsilon_dpo/beta_margin_mean": 0.0014054944040253758,
"epsilon_dpo/beta_margin_std": 0.02789238840341568,
"epsilon_dpo/loss_margin_mean": 0.0163441002368927,
"grad_norm": 29.427160263061523,
"kl/avg_steps": 0.0,
"kl/beta": 0.09978184103965759,
"kl/n_epsilon_steps": 0.5,
"kl/p_epsilon_steps": 0.5,
"learning_rate": 2.9850746268656714e-08,
"logits/chosen": -0.31111201643943787,
"logits/rejected": -0.42863184213638306,
"logps/chosen": -62.99342727661133,
"logps/ref_chosen": -62.97134017944336,
"logps/ref_rejected": -79.91920471191406,
"logps/rejected": -79.9576416015625,
"loss": 1.3851,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.0023576724343001842,
"rewards/margins": 0.0014054615749046206,
"rewards/rejected": -0.0037631341256201267,
"step": 5
},
{
"epoch": 0.009070294784580499,
"epsilon_dpo/beta": 0.09976062923669815,
"epsilon_dpo/beta_margin_grad_mean": -0.5020826458930969,
"epsilon_dpo/beta_margin_grad_std": 0.010993240401148796,
"epsilon_dpo/beta_margin_mean": -0.00834981445223093,
"epsilon_dpo/beta_margin_std": 0.04405822604894638,
"epsilon_dpo/loss_margin_mean": -0.08078205585479736,
"grad_norm": 29.794363021850586,
"kl/avg_steps": 0.03125,
"kl/beta": 0.09978184103965759,
"kl/n_epsilon_steps": 0.484375,
"kl/p_epsilon_steps": 0.515625,
"learning_rate": 3.731343283582089e-08,
"logits/chosen": -0.2843635678291321,
"logits/rejected": -0.3435862958431244,
"logps/chosen": -51.349830627441406,
"logps/ref_chosen": -51.30736541748047,
"logps/ref_rejected": -82.77239227294922,
"logps/rejected": -82.73407745361328,
"loss": 1.3951,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.0043645575642585754,
"rewards/margins": -0.008349799551069736,
"rewards/rejected": 0.003985242452472448,
"step": 6
},
{
"epoch": 0.010582010582010581,
"epsilon_dpo/beta": 0.09977608174085617,
"epsilon_dpo/beta_margin_grad_mean": -0.49996793270111084,
"epsilon_dpo/beta_margin_grad_std": 0.00833675917237997,
"epsilon_dpo/beta_margin_mean": 0.00012787683226633817,
"epsilon_dpo/beta_margin_std": 0.033355168998241425,
"epsilon_dpo/loss_margin_mean": 0.003915518522262573,
"grad_norm": 27.13857650756836,
"kl/avg_steps": -0.015625,
"kl/beta": 0.09975067526102066,
"kl/n_epsilon_steps": 0.5,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 4.477611940298507e-08,
"logits/chosen": -0.34914782643318176,
"logits/rejected": -0.4351033568382263,
"logps/chosen": -51.442935943603516,
"logps/ref_chosen": -51.45941162109375,
"logps/ref_rejected": -66.3828125,
"logps/rejected": -66.37024688720703,
"loss": 1.3864,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.001505495049059391,
"rewards/margins": 0.00012786738807335496,
"rewards/rejected": 0.0013776274863630533,
"step": 7
},
{
"epoch": 0.012093726379440665,
"epsilon_dpo/beta": 0.09980741888284683,
"epsilon_dpo/beta_margin_grad_mean": -0.5002568364143372,
"epsilon_dpo/beta_margin_grad_std": 0.008262201212346554,
"epsilon_dpo/beta_margin_mean": -0.0010270840721204877,
"epsilon_dpo/beta_margin_std": 0.03305831924080849,
"epsilon_dpo/loss_margin_mean": -0.0077308714389801025,
"grad_norm": 28.532468795776367,
"kl/avg_steps": -0.03125,
"kl/beta": 0.09976626187562943,
"kl/n_epsilon_steps": 0.515625,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 5.223880597014925e-08,
"logits/chosen": -0.30369192361831665,
"logits/rejected": -0.38484492897987366,
"logps/chosen": -62.208282470703125,
"logps/ref_chosen": -62.19754409790039,
"logps/ref_rejected": -74.66180419921875,
"logps/rejected": -74.6648178100586,
"loss": 1.3876,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0011976377572864294,
"rewards/margins": -0.0010270420461893082,
"rewards/rejected": -0.00017059571109712124,
"step": 8
},
{
"epoch": 0.013605442176870748,
"epsilon_dpo/beta": 0.09983861446380615,
"epsilon_dpo/beta_margin_grad_mean": -0.49962690472602844,
"epsilon_dpo/beta_margin_grad_std": 0.007648429833352566,
"epsilon_dpo/beta_margin_mean": 0.0014941173139959574,
"epsilon_dpo/beta_margin_std": 0.030601851642131805,
"epsilon_dpo/loss_margin_mean": 0.017356693744659424,
"grad_norm": 31.47663116455078,
"kl/avg_steps": -0.03125,
"kl/beta": 0.09979745000600815,
"kl/n_epsilon_steps": 0.515625,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 5.970149253731343e-08,
"logits/chosen": -0.26175159215927124,
"logits/rejected": -0.36549025774002075,
"logps/chosen": -55.64149856567383,
"logps/ref_chosen": -55.629722595214844,
"logps/ref_rejected": -86.21221923828125,
"logps/rejected": -86.2413558959961,
"loss": 1.385,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.0012760079698637128,
"rewards/margins": 0.0014940700493752956,
"rewards/rejected": -0.0027700779028236866,
"step": 9
},
{
"epoch": 0.015117157974300832,
"epsilon_dpo/beta": 0.0999322235584259,
"epsilon_dpo/beta_margin_grad_mean": -0.5011972188949585,
"epsilon_dpo/beta_margin_grad_std": 0.008651547133922577,
"epsilon_dpo/beta_margin_mean": -0.004788341000676155,
"epsilon_dpo/beta_margin_std": 0.03462748974561691,
"epsilon_dpo/loss_margin_mean": -0.04551097750663757,
"grad_norm": 29.798023223876953,
"kl/avg_steps": -0.09375,
"kl/beta": 0.09982864558696747,
"kl/n_epsilon_steps": 0.546875,
"kl/p_epsilon_steps": 0.453125,
"learning_rate": 6.71641791044776e-08,
"logits/chosen": -0.268494188785553,
"logits/rejected": -0.3035653233528137,
"logps/chosen": -62.68584060668945,
"logps/ref_chosen": -62.69060134887695,
"logps/ref_rejected": -90.61012268066406,
"logps/rejected": -90.55984497070312,
"loss": 1.3914,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.00036463316064327955,
"rewards/margins": -0.004788305144757032,
"rewards/rejected": 0.005152938421815634,
"step": 10
},
{
"epoch": 0.016628873771730914,
"epsilon_dpo/beta": 0.09977616369724274,
"epsilon_dpo/beta_margin_grad_mean": -0.4987374544143677,
"epsilon_dpo/beta_margin_grad_std": 0.00888054259121418,
"epsilon_dpo/beta_margin_mean": 0.005049114115536213,
"epsilon_dpo/beta_margin_std": 0.03554675728082657,
"epsilon_dpo/loss_margin_mean": 0.05307146906852722,
"grad_norm": 29.118450164794922,
"kl/avg_steps": 0.15625,
"kl/beta": 0.09992232173681259,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 7.462686567164178e-08,
"logits/chosen": -0.29443594813346863,
"logits/rejected": -0.31589585542678833,
"logps/chosen": -65.7430419921875,
"logps/ref_chosen": -65.76712036132812,
"logps/ref_rejected": -72.4764633178711,
"logps/rejected": -72.50544738769531,
"loss": 1.3816,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.00232205493375659,
"rewards/margins": 0.00504904892295599,
"rewards/rejected": -0.0027269939891994,
"step": 11
},
{
"epoch": 0.018140589569160998,
"epsilon_dpo/beta": 0.09971404820680618,
"epsilon_dpo/beta_margin_grad_mean": -0.5000174641609192,
"epsilon_dpo/beta_margin_grad_std": 0.006055078003555536,
"epsilon_dpo/beta_margin_mean": -7.042424840619788e-05,
"epsilon_dpo/beta_margin_std": 0.024224182590842247,
"epsilon_dpo/loss_margin_mean": 0.0012585818767547607,
"grad_norm": 28.209169387817383,
"kl/avg_steps": 0.0625,
"kl/beta": 0.09976643323898315,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 8.208955223880596e-08,
"logits/chosen": -0.34568387269973755,
"logits/rejected": -0.38922828435897827,
"logps/chosen": -60.716941833496094,
"logps/ref_chosen": -60.704891204833984,
"logps/ref_rejected": -69.41564178466797,
"logps/rejected": -69.42894744873047,
"loss": 1.3865,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.0013101967051625252,
"rewards/margins": -7.043101504677907e-05,
"rewards/rejected": -0.0012397656682878733,
"step": 12
},
{
"epoch": 0.019652305366591082,
"epsilon_dpo/beta": 0.09968262165784836,
"epsilon_dpo/beta_margin_grad_mean": -0.5011368989944458,
"epsilon_dpo/beta_margin_grad_std": 0.007047805469483137,
"epsilon_dpo/beta_margin_mean": -0.0045489720068871975,
"epsilon_dpo/beta_margin_std": 0.028197508305311203,
"epsilon_dpo/loss_margin_mean": -0.0434664785861969,
"grad_norm": 29.133708953857422,
"kl/avg_steps": 0.03125,
"kl/beta": 0.09970412403345108,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.5,
"learning_rate": 8.955223880597014e-08,
"logits/chosen": -0.2935143709182739,
"logits/rejected": -0.3819401264190674,
"logps/chosen": -49.920982360839844,
"logps/ref_chosen": -49.90925216674805,
"logps/ref_rejected": -92.378173828125,
"logps/rejected": -92.346435546875,
"loss": 1.391,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0012526975478976965,
"rewards/margins": -0.0045489538460969925,
"rewards/rejected": 0.003296256298199296,
"step": 13
},
{
"epoch": 0.021164021164021163,
"epsilon_dpo/beta": 0.0997452363371849,
"epsilon_dpo/beta_margin_grad_mean": -0.4997658133506775,
"epsilon_dpo/beta_margin_grad_std": 0.006996911950409412,
"epsilon_dpo/beta_margin_mean": 0.0009388642502017319,
"epsilon_dpo/beta_margin_std": 0.0279961246997118,
"epsilon_dpo/loss_margin_mean": 0.011530548334121704,
"grad_norm": 29.414230346679688,
"kl/avg_steps": -0.0625,
"kl/beta": 0.09967297315597534,
"kl/n_epsilon_steps": 0.53125,
"kl/p_epsilon_steps": 0.46875,
"learning_rate": 9.701492537313432e-08,
"logits/chosen": -0.3997393250465393,
"logits/rejected": -0.39330822229385376,
"logps/chosen": -60.60332107543945,
"logps/ref_chosen": -60.61879348754883,
"logps/ref_rejected": -71.79306030273438,
"logps/rejected": -71.78912353515625,
"loss": 1.3856,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.0014628882054239511,
"rewards/margins": 0.0009388620383106172,
"rewards/rejected": 0.0005240262253209949,
"step": 14
},
{
"epoch": 0.022675736961451247,
"epsilon_dpo/beta": 0.09986995905637741,
"epsilon_dpo/beta_margin_grad_mean": -0.5013818144798279,
"epsilon_dpo/beta_margin_grad_std": 0.007960259914398193,
"epsilon_dpo/beta_margin_mean": -0.005531555972993374,
"epsilon_dpo/beta_margin_std": 0.031855810433626175,
"epsilon_dpo/loss_margin_mean": -0.05313822627067566,
"grad_norm": 33.27139663696289,
"kl/avg_steps": -0.125,
"kl/beta": 0.0997353047132492,
"kl/n_epsilon_steps": 0.5625,
"kl/p_epsilon_steps": 0.4375,
"learning_rate": 1.044776119402985e-07,
"logits/chosen": -0.29406124353408813,
"logits/rejected": -0.35813331604003906,
"logps/chosen": -63.495731353759766,
"logps/ref_chosen": -63.46953582763672,
"logps/ref_rejected": -88.88951110839844,
"logps/rejected": -88.8625717163086,
"loss": 1.3921,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0027354268822818995,
"rewards/margins": -0.00553159462288022,
"rewards/rejected": 0.002796167740598321,
"step": 15
},
{
"epoch": 0.02418745275888133,
"epsilon_dpo/beta": 0.09971407055854797,
"epsilon_dpo/beta_margin_grad_mean": -0.4989100992679596,
"epsilon_dpo/beta_margin_grad_std": 0.006902648136019707,
"epsilon_dpo/beta_margin_mean": 0.004361429717391729,
"epsilon_dpo/beta_margin_std": 0.02761976607143879,
"epsilon_dpo/loss_margin_mean": 0.0457233190536499,
"grad_norm": 26.702556610107422,
"kl/avg_steps": 0.15625,
"kl/beta": 0.09986013174057007,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 1.1194029850746268e-07,
"logits/chosen": -0.3057270646095276,
"logits/rejected": -0.3239745497703552,
"logps/chosen": -46.53052520751953,
"logps/ref_chosen": -46.53229904174805,
"logps/ref_rejected": -74.27534484863281,
"logps/rejected": -74.31929016113281,
"loss": 1.3821,
"rewards/accuracies": 0.5625,
"rewards/chosen": 8.973665535449982e-05,
"rewards/margins": 0.004361418075859547,
"rewards/rejected": -0.004271681420505047,
"step": 16
},
{
"epoch": 0.025699168556311415,
"epsilon_dpo/beta": 0.09962083399295807,
"epsilon_dpo/beta_margin_grad_mean": -0.4984602928161621,
"epsilon_dpo/beta_margin_grad_std": 0.009348109364509583,
"epsilon_dpo/beta_margin_mean": 0.006162949372082949,
"epsilon_dpo/beta_margin_std": 0.037412647157907486,
"epsilon_dpo/loss_margin_mean": 0.0646277666091919,
"grad_norm": 32.68865203857422,
"kl/avg_steps": 0.09375,
"kl/beta": 0.09970434755086899,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 1.1940298507462686e-07,
"logits/chosen": -0.33725497126579285,
"logits/rejected": -0.35533463954925537,
"logps/chosen": -64.07317352294922,
"logps/ref_chosen": -64.07783508300781,
"logps/ref_rejected": -86.40876770019531,
"logps/rejected": -86.46873474121094,
"loss": 1.3805,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0003336211375426501,
"rewards/margins": 0.00616296473890543,
"rewards/rejected": -0.005829343572258949,
"step": 17
},
{
"epoch": 0.027210884353741496,
"epsilon_dpo/beta": 0.09971431642770767,
"epsilon_dpo/beta_margin_grad_mean": -0.5000079274177551,
"epsilon_dpo/beta_margin_grad_std": 0.006709072273224592,
"epsilon_dpo/beta_margin_mean": -3.1369447242468596e-05,
"epsilon_dpo/beta_margin_std": 0.026841431856155396,
"epsilon_dpo/loss_margin_mean": 0.0017310678958892822,
"grad_norm": 27.74285316467285,
"kl/avg_steps": -0.09375,
"kl/beta": 0.09961096197366714,
"kl/n_epsilon_steps": 0.546875,
"kl/p_epsilon_steps": 0.453125,
"learning_rate": 1.2686567164179106e-07,
"logits/chosen": -0.3130100667476654,
"logits/rejected": -0.3496634364128113,
"logps/chosen": -44.86057662963867,
"logps/ref_chosen": -44.87433624267578,
"logps/ref_rejected": -70.9760513305664,
"logps/rejected": -70.96401977539062,
"loss": 1.3865,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0012917739804834127,
"rewards/margins": -3.139290492981672e-05,
"rewards/rejected": 0.0013231671182438731,
"step": 18
},
{
"epoch": 0.02872260015117158,
"epsilon_dpo/beta": 0.09971439838409424,
"epsilon_dpo/beta_margin_grad_mean": -0.5000306963920593,
"epsilon_dpo/beta_margin_grad_std": 0.007233525160700083,
"epsilon_dpo/beta_margin_mean": -0.00012327870354056358,
"epsilon_dpo/beta_margin_std": 0.028940344229340553,
"epsilon_dpo/loss_margin_mean": 0.0011038780212402344,
"grad_norm": 30.739639282226562,
"kl/avg_steps": 0.0,
"kl/beta": 0.09970442950725555,
"kl/n_epsilon_steps": 0.5,
"kl/p_epsilon_steps": 0.5,
"learning_rate": 1.343283582089552e-07,
"logits/chosen": -0.28195369243621826,
"logits/rejected": -0.34346824884414673,
"logps/chosen": -68.14604949951172,
"logps/ref_chosen": -68.1598129272461,
"logps/ref_rejected": -81.17138671875,
"logps/rejected": -81.15872955322266,
"loss": 1.3866,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0013030236586928368,
"rewards/margins": -0.00012326962314546108,
"rewards/rejected": 0.0014262932818382978,
"step": 19
},
{
"epoch": 0.030234315948601664,
"epsilon_dpo/beta": 0.09974555671215057,
"epsilon_dpo/beta_margin_grad_mean": -0.5000939965248108,
"epsilon_dpo/beta_margin_grad_std": 0.006312840152531862,
"epsilon_dpo/beta_margin_mean": -0.0003763908171094954,
"epsilon_dpo/beta_margin_std": 0.025255702435970306,
"epsilon_dpo/loss_margin_mean": -0.0018305182456970215,
"grad_norm": 29.221317291259766,
"kl/avg_steps": -0.03125,
"kl/beta": 0.09970442950725555,
"kl/n_epsilon_steps": 0.515625,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 1.4179104477611938e-07,
"logits/chosen": -0.3637614846229553,
"logits/rejected": -0.35907772183418274,
"logps/chosen": -53.66650390625,
"logps/ref_chosen": -53.678558349609375,
"logps/ref_rejected": -74.16911315917969,
"logps/rejected": -74.15522766113281,
"loss": 1.3868,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.001081271329894662,
"rewards/margins": -0.00037639960646629333,
"rewards/rejected": 0.0014576709363609552,
"step": 20
},
{
"epoch": 0.031746031746031744,
"epsilon_dpo/beta": 0.09965206682682037,
"epsilon_dpo/beta_margin_grad_mean": -0.5000557899475098,
"epsilon_dpo/beta_margin_grad_std": 0.008477938361465931,
"epsilon_dpo/beta_margin_mean": -0.00022252913913689554,
"epsilon_dpo/beta_margin_std": 0.03392705321311951,
"epsilon_dpo/loss_margin_mean": 0.0002441704273223877,
"grad_norm": 29.078224182128906,
"kl/avg_steps": 0.09375,
"kl/beta": 0.09973560273647308,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 1.4925373134328355e-07,
"logits/chosen": -0.2857532501220703,
"logits/rejected": -0.33214303851127625,
"logps/chosen": -64.68922424316406,
"logps/ref_chosen": -64.70155334472656,
"logps/ref_rejected": -81.02095031738281,
"logps/rejected": -81.00885009765625,
"loss": 1.3868,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0011181639274582267,
"rewards/margins": -0.0002224706404376775,
"rewards/rejected": 0.001340634422376752,
"step": 21
},
{
"epoch": 0.03325774754346183,
"epsilon_dpo/beta": 0.09943416714668274,
"epsilon_dpo/beta_margin_grad_mean": -0.4989608824253082,
"epsilon_dpo/beta_margin_grad_std": 0.007762262597680092,
"epsilon_dpo/beta_margin_mean": 0.004156144801527262,
"epsilon_dpo/beta_margin_std": 0.03105759806931019,
"epsilon_dpo/loss_margin_mean": 0.04423174262046814,
"grad_norm": 28.78575325012207,
"kl/avg_steps": 0.21875,
"kl/beta": 0.09964218735694885,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 1.5671641791044775e-07,
"logits/chosen": -0.32038193941116333,
"logits/rejected": -0.32221364974975586,
"logps/chosen": -58.03137969970703,
"logps/ref_chosen": -58.03599548339844,
"logps/ref_rejected": -80.72721862792969,
"logps/rejected": -80.76683044433594,
"loss": 1.3824,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.0003732939367182553,
"rewards/margins": 0.004156148061156273,
"rewards/rejected": -0.003782853949815035,
"step": 22
},
{
"epoch": 0.03476946334089191,
"epsilon_dpo/beta": 0.09915497899055481,
"epsilon_dpo/beta_margin_grad_mean": -0.4985734820365906,
"epsilon_dpo/beta_margin_grad_std": 0.007610122673213482,
"epsilon_dpo/beta_margin_mean": 0.005706463940441608,
"epsilon_dpo/beta_margin_std": 0.030449943616986275,
"epsilon_dpo/loss_margin_mean": 0.059777408838272095,
"grad_norm": 32.48927688598633,
"kl/avg_steps": 0.28125,
"kl/beta": 0.09942469745874405,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 1.6417910447761193e-07,
"logits/chosen": -0.2952424883842468,
"logits/rejected": -0.2977880835533142,
"logps/chosen": -66.321044921875,
"logps/ref_chosen": -66.35609436035156,
"logps/ref_rejected": -93.02769470214844,
"logps/rejected": -93.05242156982422,
"loss": 1.3808,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.003371128113940358,
"rewards/margins": 0.005706476047635078,
"rewards/rejected": -0.0023353479336947203,
"step": 23
},
{
"epoch": 0.036281179138321996,
"epsilon_dpo/beta": 0.09909378737211227,
"epsilon_dpo/beta_margin_grad_mean": -0.500180184841156,
"epsilon_dpo/beta_margin_grad_std": 0.006747873965650797,
"epsilon_dpo/beta_margin_mean": -0.0007218060200102627,
"epsilon_dpo/beta_margin_std": 0.026996400207281113,
"epsilon_dpo/loss_margin_mean": -0.005073219537734985,
"grad_norm": 26.146747589111328,
"kl/avg_steps": 0.0625,
"kl/beta": 0.09914584457874298,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 1.716417910447761e-07,
"logits/chosen": -0.27315062284469604,
"logits/rejected": -0.38406500220298767,
"logps/chosen": -54.475669860839844,
"logps/ref_chosen": -54.461238861083984,
"logps/ref_rejected": -68.33817291259766,
"logps/rejected": -68.34752655029297,
"loss": 1.3872,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0015315038617700338,
"rewards/margins": -0.0007217684760689735,
"rewards/rejected": -0.0008097353274933994,
"step": 24
},
{
"epoch": 0.03779289493575208,
"epsilon_dpo/beta": 0.09917108714580536,
"epsilon_dpo/beta_margin_grad_mean": -0.5001992583274841,
"epsilon_dpo/beta_margin_grad_std": 0.009507820941507816,
"epsilon_dpo/beta_margin_mean": -0.0007946694386191666,
"epsilon_dpo/beta_margin_std": 0.038071826100349426,
"epsilon_dpo/loss_margin_mean": -0.005500108003616333,
"grad_norm": 29.377212524414062,
"kl/avg_steps": -0.078125,
"kl/beta": 0.09908391535282135,
"kl/n_epsilon_steps": 0.53125,
"kl/p_epsilon_steps": 0.453125,
"learning_rate": 1.7910447761194027e-07,
"logits/chosen": -0.24233002960681915,
"logits/rejected": -0.36202138662338257,
"logps/chosen": -60.047935485839844,
"logps/ref_chosen": -60.00420379638672,
"logps/ref_rejected": -90.47376251220703,
"logps/rejected": -90.51200103759766,
"loss": 1.3875,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.0044132559560239315,
"rewards/margins": -0.0007946339319460094,
"rewards/rejected": -0.0036186217330396175,
"step": 25
},
{
"epoch": 0.039304610733182165,
"epsilon_dpo/beta": 0.09910932183265686,
"epsilon_dpo/beta_margin_grad_mean": -0.5002905130386353,
"epsilon_dpo/beta_margin_grad_std": 0.008865254931151867,
"epsilon_dpo/beta_margin_mean": -0.0011616774136200547,
"epsilon_dpo/beta_margin_std": 0.03548089787364006,
"epsilon_dpo/loss_margin_mean": -0.009196758270263672,
"grad_norm": 29.478923797607422,
"kl/avg_steps": 0.0625,
"kl/beta": 0.09916138648986816,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 1.8656716417910447e-07,
"logits/chosen": -0.33379530906677246,
"logits/rejected": -0.36592623591423035,
"logps/chosen": -56.83445739746094,
"logps/ref_chosen": -56.81915283203125,
"logps/ref_rejected": -77.84333038330078,
"logps/rejected": -77.84943389892578,
"loss": 1.3878,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0016438440652564168,
"rewards/margins": -0.0011616243282333016,
"rewards/rejected": -0.00048221962060779333,
"step": 26
},
{
"epoch": 0.04081632653061224,
"epsilon_dpo/beta": 0.09907838702201843,
"epsilon_dpo/beta_margin_grad_mean": -0.5005349516868591,
"epsilon_dpo/beta_margin_grad_std": 0.007867163978517056,
"epsilon_dpo/beta_margin_mean": -0.002141261473298073,
"epsilon_dpo/beta_margin_std": 0.031479235738515854,
"epsilon_dpo/loss_margin_mean": -0.019195079803466797,
"grad_norm": 28.842924118041992,
"kl/avg_steps": 0.03125,
"kl/beta": 0.09909944981336594,
"kl/n_epsilon_steps": 0.484375,
"kl/p_epsilon_steps": 0.515625,
"learning_rate": 1.9402985074626865e-07,
"logits/chosen": -0.34893810749053955,
"logits/rejected": -0.3658442795276642,
"logps/chosen": -62.88542175292969,
"logps/ref_chosen": -62.87702178955078,
"logps/ref_rejected": -71.34437561035156,
"logps/rejected": -71.33357238769531,
"loss": 1.3887,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0009422144503332675,
"rewards/margins": -0.0021412523929029703,
"rewards/rejected": 0.001199037884362042,
"step": 27
},
{
"epoch": 0.042328042328042326,
"epsilon_dpo/beta": 0.09900084137916565,
"epsilon_dpo/beta_margin_grad_mean": -0.5003678798675537,
"epsilon_dpo/beta_margin_grad_std": 0.008399988524615765,
"epsilon_dpo/beta_margin_mean": -0.0014723996864631772,
"epsilon_dpo/beta_margin_std": 0.033611465245485306,
"epsilon_dpo/loss_margin_mean": -0.01223665475845337,
"grad_norm": 27.418651580810547,
"kl/avg_steps": 0.078125,
"kl/beta": 0.09906849265098572,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 2.0149253731343282e-07,
"logits/chosen": -0.361447274684906,
"logits/rejected": -0.3206895589828491,
"logps/chosen": -59.86574172973633,
"logps/ref_chosen": -59.833377838134766,
"logps/ref_rejected": -70.39804077148438,
"logps/rejected": -70.41816711425781,
"loss": 1.388,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.003342903219163418,
"rewards/margins": -0.0014724235516041517,
"rewards/rejected": -0.0018704799003899097,
"step": 28
},
{
"epoch": 0.04383975812547241,
"epsilon_dpo/beta": 0.09893918037414551,
"epsilon_dpo/beta_margin_grad_mean": -0.500007688999176,
"epsilon_dpo/beta_margin_grad_std": 0.007880612276494503,
"epsilon_dpo/beta_margin_mean": -3.106631993432529e-05,
"epsilon_dpo/beta_margin_std": 0.03153260052204132,
"epsilon_dpo/loss_margin_mean": 0.0019943714141845703,
"grad_norm": 32.391754150390625,
"kl/avg_steps": 0.0625,
"kl/beta": 0.09899115562438965,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 2.08955223880597e-07,
"logits/chosen": -0.30471086502075195,
"logits/rejected": -0.315449059009552,
"logps/chosen": -74.17647552490234,
"logps/ref_chosen": -74.12020111083984,
"logps/ref_rejected": -83.33098602294922,
"logps/rejected": -83.3892593383789,
"loss": 1.3866,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.005664899479597807,
"rewards/margins": -3.107072552666068e-05,
"rewards/rejected": -0.005633828695863485,
"step": 29
},
{
"epoch": 0.045351473922902494,
"epsilon_dpo/beta": 0.09884645789861679,
"epsilon_dpo/beta_margin_grad_mean": -0.4979906976222992,
"epsilon_dpo/beta_margin_grad_std": 0.008478553965687752,
"epsilon_dpo/beta_margin_mean": 0.008043105714023113,
"epsilon_dpo/beta_margin_std": 0.03393545001745224,
"epsilon_dpo/loss_margin_mean": 0.08371976017951965,
"grad_norm": 29.81890869140625,
"kl/avg_steps": 0.09375,
"kl/beta": 0.09892932325601578,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 2.1641791044776117e-07,
"logits/chosen": -0.27538198232650757,
"logits/rejected": -0.3717191815376282,
"logps/chosen": -50.739891052246094,
"logps/ref_chosen": -50.75128936767578,
"logps/ref_rejected": -89.29063415527344,
"logps/rejected": -89.36295318603516,
"loss": 1.3786,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.001058907713741064,
"rewards/margins": 0.008043129928410053,
"rewards/rejected": -0.006984221749007702,
"step": 30
},
{
"epoch": 0.04686318972033258,
"epsilon_dpo/beta": 0.09873828291893005,
"epsilon_dpo/beta_margin_grad_mean": -0.49814411997795105,
"epsilon_dpo/beta_margin_grad_std": 0.010129507631063461,
"epsilon_dpo/beta_margin_mean": 0.007430542726069689,
"epsilon_dpo/beta_margin_std": 0.04054348170757294,
"epsilon_dpo/loss_margin_mean": 0.07811975479125977,
"grad_norm": 33.668331146240234,
"kl/avg_steps": 0.109375,
"kl/beta": 0.09883666783571243,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 2.2388059701492537e-07,
"logits/chosen": -0.2762707471847534,
"logits/rejected": -0.3536580801010132,
"logps/chosen": -65.345458984375,
"logps/ref_chosen": -65.33675384521484,
"logps/ref_rejected": -100.76666259765625,
"logps/rejected": -100.85348510742188,
"loss": 1.3793,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.0009668983984738588,
"rewards/margins": 0.0074305459856987,
"rewards/rejected": -0.00839744508266449,
"step": 31
},
{
"epoch": 0.04837490551776266,
"epsilon_dpo/beta": 0.09886197745800018,
"epsilon_dpo/beta_margin_grad_mean": -0.49965915083885193,
"epsilon_dpo/beta_margin_grad_std": 0.009040210396051407,
"epsilon_dpo/beta_margin_mean": 0.0013648051535710692,
"epsilon_dpo/beta_margin_std": 0.03617309778928757,
"epsilon_dpo/loss_margin_mean": 0.016734689474105835,
"grad_norm": 29.715906143188477,
"kl/avg_steps": -0.125,
"kl/beta": 0.09872867912054062,
"kl/n_epsilon_steps": 0.5625,
"kl/p_epsilon_steps": 0.4375,
"learning_rate": 2.3134328358208954e-07,
"logits/chosen": -0.3281136155128479,
"logits/rejected": -0.3593684434890747,
"logps/chosen": -67.18722534179688,
"logps/ref_chosen": -67.18333435058594,
"logps/ref_rejected": -82.80763244628906,
"logps/rejected": -82.82826232910156,
"loss": 1.3853,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.000504728639498353,
"rewards/margins": 0.001364781754091382,
"rewards/rejected": -0.0018695106264203787,
"step": 32
},
{
"epoch": 0.049886621315192746,
"epsilon_dpo/beta": 0.09873855113983154,
"epsilon_dpo/beta_margin_grad_mean": -0.49806874990463257,
"epsilon_dpo/beta_margin_grad_std": 0.009414257481694221,
"epsilon_dpo/beta_margin_mean": 0.007726335898041725,
"epsilon_dpo/beta_margin_std": 0.03767699748277664,
"epsilon_dpo/loss_margin_mean": 0.08104704320430756,
"grad_norm": 30.580888748168945,
"kl/avg_steps": 0.125,
"kl/beta": 0.0988522469997406,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 2.388059701492537e-07,
"logits/chosen": -0.3929429352283478,
"logits/rejected": -0.3888055384159088,
"logps/chosen": -64.02083587646484,
"logps/ref_chosen": -64.03947448730469,
"logps/ref_rejected": -75.68357849121094,
"logps/rejected": -75.74598693847656,
"loss": 1.3789,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.001717576989904046,
"rewards/margins": 0.007726255338639021,
"rewards/rejected": -0.006008678115904331,
"step": 33
},
{
"epoch": 0.05139833711262283,
"epsilon_dpo/beta": 0.09843014925718307,
"epsilon_dpo/beta_margin_grad_mean": -0.4978667199611664,
"epsilon_dpo/beta_margin_grad_std": 0.006962464656680822,
"epsilon_dpo/beta_margin_mean": 0.008533835411071777,
"epsilon_dpo/beta_margin_std": 0.027855342254042625,
"epsilon_dpo/loss_margin_mean": 0.08876317739486694,
"grad_norm": 27.95029067993164,
"kl/avg_steps": 0.3125,
"kl/beta": 0.09872883558273315,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 2.4626865671641786e-07,
"logits/chosen": -0.3121333122253418,
"logits/rejected": -0.3687829375267029,
"logps/chosen": -53.67481994628906,
"logps/ref_chosen": -53.66429901123047,
"logps/ref_rejected": -65.77989196777344,
"logps/rejected": -65.87918853759766,
"loss": 1.378,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.0011011587921530008,
"rewards/margins": 0.00853382982313633,
"rewards/rejected": -0.009634988382458687,
"step": 34
},
{
"epoch": 0.05291005291005291,
"epsilon_dpo/beta": 0.09846186637878418,
"epsilon_dpo/beta_margin_grad_mean": -0.5003045201301575,
"epsilon_dpo/beta_margin_grad_std": 0.008296910673379898,
"epsilon_dpo/beta_margin_mean": -0.0012179145123809576,
"epsilon_dpo/beta_margin_std": 0.03319939225912094,
"epsilon_dpo/loss_margin_mean": -0.009852796792984009,
"grad_norm": 27.398624420166016,
"kl/avg_steps": -0.03125,
"kl/beta": 0.09842126816511154,
"kl/n_epsilon_steps": 0.515625,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 2.537313432835821e-07,
"logits/chosen": -0.3273148536682129,
"logits/rejected": -0.3661719262599945,
"logps/chosen": -61.13897705078125,
"logps/ref_chosen": -61.01686096191406,
"logps/ref_rejected": -72.78598022460938,
"logps/rejected": -72.89823913574219,
"loss": 1.3878,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.01214178092777729,
"rewards/margins": -0.0012179468758404255,
"rewards/rejected": -0.010923834517598152,
"step": 35
},
{
"epoch": 0.05442176870748299,
"epsilon_dpo/beta": 0.09858494997024536,
"epsilon_dpo/beta_margin_grad_mean": -0.5004087090492249,
"epsilon_dpo/beta_margin_grad_std": 0.009157510474324226,
"epsilon_dpo/beta_margin_mean": -0.0016357137355953455,
"epsilon_dpo/beta_margin_std": 0.03664514049887657,
"epsilon_dpo/loss_margin_mean": -0.013788998126983643,
"grad_norm": 28.394075393676758,
"kl/avg_steps": -0.125,
"kl/beta": 0.09845203161239624,
"kl/n_epsilon_steps": 0.5625,
"kl/p_epsilon_steps": 0.4375,
"learning_rate": 2.611940298507462e-07,
"logits/chosen": -0.3019469380378723,
"logits/rejected": -0.38744592666625977,
"logps/chosen": -50.620140075683594,
"logps/ref_chosen": -50.53736114501953,
"logps/ref_rejected": -78.11678314208984,
"logps/rejected": -78.18577575683594,
"loss": 1.3883,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.008260859176516533,
"rewards/margins": -0.0016357424901798368,
"rewards/rejected": -0.006625116337090731,
"step": 36
},
{
"epoch": 0.055933484504913075,
"epsilon_dpo/beta": 0.09830784797668457,
"epsilon_dpo/beta_margin_grad_mean": -0.49664080142974854,
"epsilon_dpo/beta_margin_grad_std": 0.009693044237792492,
"epsilon_dpo/beta_margin_mean": 0.013446959666907787,
"epsilon_dpo/beta_margin_std": 0.038813989609479904,
"epsilon_dpo/loss_margin_mean": 0.1392996609210968,
"grad_norm": 37.030452728271484,
"kl/avg_steps": 0.28125,
"kl/beta": 0.09857525676488876,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 2.686567164179104e-07,
"logits/chosen": -0.28968507051467896,
"logits/rejected": -0.39959055185317993,
"logps/chosen": -59.57615661621094,
"logps/ref_chosen": -59.55394744873047,
"logps/ref_rejected": -108.27703094482422,
"logps/rejected": -108.43853759765625,
"loss": 1.3733,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.0022825594060122967,
"rewards/margins": 0.013446968980133533,
"rewards/rejected": -0.015729527920484543,
"step": 37
},
{
"epoch": 0.05744520030234316,
"epsilon_dpo/beta": 0.09812428802251816,
"epsilon_dpo/beta_margin_grad_mean": -0.4994020164012909,
"epsilon_dpo/beta_margin_grad_std": 0.00942437443882227,
"epsilon_dpo/beta_margin_mean": 0.0023924780543893576,
"epsilon_dpo/beta_margin_std": 0.037712108343839645,
"epsilon_dpo/loss_margin_mean": 0.02735239267349243,
"grad_norm": 29.14921760559082,
"kl/avg_steps": 0.1875,
"kl/beta": 0.09829878807067871,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 2.761194029850746e-07,
"logits/chosen": -0.2621217966079712,
"logits/rejected": -0.3265727758407593,
"logps/chosen": -65.86236572265625,
"logps/ref_chosen": -65.7883529663086,
"logps/ref_rejected": -76.1619873046875,
"logps/rejected": -76.26335906982422,
"loss": 1.3843,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.007381693460047245,
"rewards/margins": 0.0023924654815346003,
"rewards/rejected": -0.009774158708751202,
"step": 38
},
{
"epoch": 0.05895691609977324,
"epsilon_dpo/beta": 0.09797131270170212,
"epsilon_dpo/beta_margin_grad_mean": -0.4989696741104126,
"epsilon_dpo/beta_margin_grad_std": 0.009285034611821175,
"epsilon_dpo/beta_margin_mean": 0.004120942205190659,
"epsilon_dpo/beta_margin_std": 0.037152983248233795,
"epsilon_dpo/loss_margin_mean": 0.045042961835861206,
"grad_norm": 28.734718322753906,
"kl/avg_steps": 0.15625,
"kl/beta": 0.09811482578516006,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 2.8358208955223876e-07,
"logits/chosen": -0.2983561158180237,
"logits/rejected": -0.3871016502380371,
"logps/chosen": -57.26594924926758,
"logps/ref_chosen": -57.17680358886719,
"logps/ref_rejected": -79.486328125,
"logps/rejected": -79.62051391601562,
"loss": 1.3825,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.00887388177216053,
"rewards/margins": 0.004120939411222935,
"rewards/rejected": -0.01299482211470604,
"step": 39
},
{
"epoch": 0.06046863189720333,
"epsilon_dpo/beta": 0.09778755903244019,
"epsilon_dpo/beta_margin_grad_mean": -0.49924176931381226,
"epsilon_dpo/beta_margin_grad_std": 0.007707234937697649,
"epsilon_dpo/beta_margin_mean": 0.0030321883969008923,
"epsilon_dpo/beta_margin_std": 0.030836397781968117,
"epsilon_dpo/loss_margin_mean": 0.03347122669219971,
"grad_norm": 30.860240936279297,
"kl/avg_steps": 0.1875,
"kl/beta": 0.09796176105737686,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 2.9104477611940296e-07,
"logits/chosen": -0.2660544216632843,
"logits/rejected": -0.4195551872253418,
"logps/chosen": -61.44474411010742,
"logps/ref_chosen": -61.33416748046875,
"logps/ref_rejected": -79.10697174072266,
"logps/rejected": -79.25102233886719,
"loss": 1.3835,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.010938970372080803,
"rewards/margins": 0.003032212145626545,
"rewards/rejected": -0.013971181586384773,
"step": 40
},
{
"epoch": 0.06198034769463341,
"epsilon_dpo/beta": 0.09769652038812637,
"epsilon_dpo/beta_margin_grad_mean": -0.4976310431957245,
"epsilon_dpo/beta_margin_grad_std": 0.008914729580283165,
"epsilon_dpo/beta_margin_mean": 0.009479942731559277,
"epsilon_dpo/beta_margin_std": 0.03567349910736084,
"epsilon_dpo/loss_margin_mean": 0.0999109148979187,
"grad_norm": 29.658599853515625,
"kl/avg_steps": 0.09375,
"kl/beta": 0.09777842462062836,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 2.985074626865671e-07,
"logits/chosen": -0.36961716413497925,
"logits/rejected": -0.39740079641342163,
"logps/chosen": -67.65887451171875,
"logps/ref_chosen": -67.54672241210938,
"logps/ref_rejected": -83.87788391113281,
"logps/rejected": -84.0899429321289,
"loss": 1.3772,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.011106956750154495,
"rewards/margins": 0.009479934349656105,
"rewards/rejected": -0.0205868910998106,
"step": 41
},
{
"epoch": 0.06349206349206349,
"epsilon_dpo/beta": 0.09775766730308533,
"epsilon_dpo/beta_margin_grad_mean": -0.5014545321464539,
"epsilon_dpo/beta_margin_grad_std": 0.010774490423500538,
"epsilon_dpo/beta_margin_mean": -0.005826249718666077,
"epsilon_dpo/beta_margin_std": 0.04312598705291748,
"epsilon_dpo/loss_margin_mean": -0.05621953308582306,
"grad_norm": 28.796390533447266,
"kl/avg_steps": -0.0625,
"kl/beta": 0.09768684208393097,
"kl/n_epsilon_steps": 0.53125,
"kl/p_epsilon_steps": 0.46875,
"learning_rate": 3.059701492537313e-07,
"logits/chosen": -0.3224967122077942,
"logits/rejected": -0.35755455493927,
"logps/chosen": -61.40879821777344,
"logps/ref_chosen": -61.26485824584961,
"logps/ref_rejected": -76.3629150390625,
"logps/rejected": -76.45063781738281,
"loss": 1.3926,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.014219951815903187,
"rewards/margins": -0.0058262646198272705,
"rewards/rejected": -0.008393687196075916,
"step": 42
},
{
"epoch": 0.06500377928949358,
"epsilon_dpo/beta": 0.0976811870932579,
"epsilon_dpo/beta_margin_grad_mean": -0.499348908662796,
"epsilon_dpo/beta_margin_grad_std": 0.009806429967284203,
"epsilon_dpo/beta_margin_mean": 0.0026009411085397005,
"epsilon_dpo/beta_margin_std": 0.0392548032104969,
"epsilon_dpo/loss_margin_mean": 0.029582887887954712,
"grad_norm": 33.69056701660156,
"kl/avg_steps": 0.078125,
"kl/beta": 0.09774793684482574,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 3.134328358208955e-07,
"logits/chosen": -0.3330717086791992,
"logits/rejected": -0.3579285740852356,
"logps/chosen": -71.92140197753906,
"logps/ref_chosen": -71.80902862548828,
"logps/ref_rejected": -81.12464141845703,
"logps/rejected": -81.26659393310547,
"loss": 1.3841,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.011149590834975243,
"rewards/margins": 0.002601020270958543,
"rewards/rejected": -0.013750611804425716,
"step": 43
},
{
"epoch": 0.06651549508692366,
"epsilon_dpo/beta": 0.09760492295026779,
"epsilon_dpo/beta_margin_grad_mean": -0.49933645129203796,
"epsilon_dpo/beta_margin_grad_std": 0.010744070634245872,
"epsilon_dpo/beta_margin_mean": 0.0026634575333446264,
"epsilon_dpo/beta_margin_std": 0.043017566204071045,
"epsilon_dpo/loss_margin_mean": 0.030479639768600464,
"grad_norm": 31.798940658569336,
"kl/avg_steps": 0.078125,
"kl/beta": 0.09767162799835205,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 3.2089552238805965e-07,
"logits/chosen": -0.3427223563194275,
"logits/rejected": -0.40159872174263,
"logps/chosen": -66.72885131835938,
"logps/ref_chosen": -66.55043029785156,
"logps/ref_rejected": -85.06198120117188,
"logps/rejected": -85.27088165283203,
"loss": 1.3841,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.017551973462104797,
"rewards/margins": 0.0026634635869413614,
"rewards/rejected": -0.020215436816215515,
"step": 44
},
{
"epoch": 0.06802721088435375,
"epsilon_dpo/beta": 0.09742213785648346,
"epsilon_dpo/beta_margin_grad_mean": -0.4976644515991211,
"epsilon_dpo/beta_margin_grad_std": 0.010346302762627602,
"epsilon_dpo/beta_margin_mean": 0.009346517734229565,
"epsilon_dpo/beta_margin_std": 0.041404642164707184,
"epsilon_dpo/loss_margin_mean": 0.09919825196266174,
"grad_norm": 30.93644905090332,
"kl/avg_steps": 0.1875,
"kl/beta": 0.09759538620710373,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 3.2835820895522385e-07,
"logits/chosen": -0.3158496618270874,
"logits/rejected": -0.4154477119445801,
"logps/chosen": -62.401817321777344,
"logps/ref_chosen": -62.243858337402344,
"logps/ref_rejected": -92.96665954589844,
"logps/rejected": -93.22382354736328,
"loss": 1.3774,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.015503356233239174,
"rewards/margins": 0.009346544742584229,
"rewards/rejected": -0.024849899113178253,
"step": 45
},
{
"epoch": 0.06953892668178382,
"epsilon_dpo/beta": 0.09723980724811554,
"epsilon_dpo/beta_margin_grad_mean": -0.49653923511505127,
"epsilon_dpo/beta_margin_grad_std": 0.0095536969602108,
"epsilon_dpo/beta_margin_mean": 0.013848591595888138,
"epsilon_dpo/beta_margin_std": 0.03823241591453552,
"epsilon_dpo/loss_margin_mean": 0.14542287588119507,
"grad_norm": 30.165502548217773,
"kl/avg_steps": 0.1875,
"kl/beta": 0.09741273522377014,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 3.3582089552238805e-07,
"logits/chosen": -0.2545163035392761,
"logits/rejected": -0.42722511291503906,
"logps/chosen": -61.611412048339844,
"logps/ref_chosen": -61.498905181884766,
"logps/ref_rejected": -78.91172790527344,
"logps/rejected": -79.16966247558594,
"loss": 1.3729,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.011050897650420666,
"rewards/margins": 0.013848556205630302,
"rewards/rejected": -0.024899452924728394,
"step": 46
},
{
"epoch": 0.0710506424792139,
"epsilon_dpo/beta": 0.09687550365924835,
"epsilon_dpo/beta_margin_grad_mean": -0.49549174308776855,
"epsilon_dpo/beta_margin_grad_std": 0.010052971541881561,
"epsilon_dpo/beta_margin_mean": 0.018040889874100685,
"epsilon_dpo/beta_margin_std": 0.040230199694633484,
"epsilon_dpo/loss_margin_mean": 0.18935969471931458,
"grad_norm": 27.54783821105957,
"kl/avg_steps": 0.375,
"kl/beta": 0.09723042696714401,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.432835820895522e-07,
"logits/chosen": -0.3111526370048523,
"logits/rejected": -0.40701138973236084,
"logps/chosen": -51.70506286621094,
"logps/ref_chosen": -51.578346252441406,
"logps/ref_rejected": -68.2215576171875,
"logps/rejected": -68.53763580322266,
"loss": 1.3687,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.012445923872292042,
"rewards/margins": 0.018040882423520088,
"rewards/rejected": -0.030486807227134705,
"step": 47
},
{
"epoch": 0.07256235827664399,
"epsilon_dpo/beta": 0.09684658795595169,
"epsilon_dpo/beta_margin_grad_mean": -0.4997785985469818,
"epsilon_dpo/beta_margin_grad_std": 0.009725292213261127,
"epsilon_dpo/beta_margin_mean": 0.0008865180425345898,
"epsilon_dpo/beta_margin_std": 0.03891964256763458,
"epsilon_dpo/loss_margin_mean": 0.012155205011367798,
"grad_norm": 26.05389976501465,
"kl/avg_steps": 0.03125,
"kl/beta": 0.09686717391014099,
"kl/n_epsilon_steps": 0.484375,
"kl/p_epsilon_steps": 0.515625,
"learning_rate": 3.507462686567164e-07,
"logits/chosen": -0.2208203375339508,
"logits/rejected": -0.3506305515766144,
"logps/chosen": -52.0263671875,
"logps/ref_chosen": -51.79365158081055,
"logps/ref_rejected": -64.22504425048828,
"logps/rejected": -64.46990966796875,
"loss": 1.3858,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.0227007158100605,
"rewards/margins": 0.0008864859119057655,
"rewards/rejected": -0.02358720451593399,
"step": 48
},
{
"epoch": 0.07407407407407407,
"epsilon_dpo/beta": 0.09643787145614624,
"epsilon_dpo/beta_margin_grad_mean": -0.496853232383728,
"epsilon_dpo/beta_margin_grad_std": 0.010868191719055176,
"epsilon_dpo/beta_margin_mean": 0.012585949152708054,
"epsilon_dpo/beta_margin_std": 0.04351355507969856,
"epsilon_dpo/loss_margin_mean": 0.13341808319091797,
"grad_norm": 26.346975326538086,
"kl/avg_steps": 0.421875,
"kl/beta": 0.09683690965175629,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.5820895522388055e-07,
"logits/chosen": -0.2635442018508911,
"logits/rejected": -0.3202664256095886,
"logps/chosen": -58.3313102722168,
"logps/ref_chosen": -58.13460159301758,
"logps/ref_rejected": -64.63206481933594,
"logps/rejected": -64.96219635009766,
"loss": 1.3742,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01911630481481552,
"rewards/margins": 0.012585877440869808,
"rewards/rejected": -0.031702183187007904,
"step": 49
},
{
"epoch": 0.07558578987150416,
"epsilon_dpo/beta": 0.09619864076375961,
"epsilon_dpo/beta_margin_grad_mean": -0.49748218059539795,
"epsilon_dpo/beta_margin_grad_std": 0.009904789738357067,
"epsilon_dpo/beta_margin_mean": 0.010073556564748287,
"epsilon_dpo/beta_margin_std": 0.039639923721551895,
"epsilon_dpo/loss_margin_mean": 0.1077713668346405,
"grad_norm": 27.031532287597656,
"kl/avg_steps": 0.25,
"kl/beta": 0.09643010050058365,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 3.6567164179104475e-07,
"logits/chosen": -0.3586847186088562,
"logits/rejected": -0.3919578790664673,
"logps/chosen": -53.15693283081055,
"logps/ref_chosen": -52.85643768310547,
"logps/ref_rejected": -72.17460632324219,
"logps/rejected": -72.58287048339844,
"loss": 1.3766,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029071442782878876,
"rewards/margins": 0.010073533281683922,
"rewards/rejected": -0.03914497792720795,
"step": 50
},
{
"epoch": 0.07709750566893424,
"epsilon_dpo/beta": 0.09598881006240845,
"epsilon_dpo/beta_margin_grad_mean": -0.495978444814682,
"epsilon_dpo/beta_margin_grad_std": 0.010665152221918106,
"epsilon_dpo/beta_margin_mean": 0.01609669253230095,
"epsilon_dpo/beta_margin_std": 0.042686909437179565,
"epsilon_dpo/loss_margin_mean": 0.17097631096839905,
"grad_norm": 29.876943588256836,
"kl/avg_steps": 0.21875,
"kl/beta": 0.09618962556123734,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 3.7313432835820895e-07,
"logits/chosen": -0.3726983964443207,
"logits/rejected": -0.4622589349746704,
"logps/chosen": -63.9547119140625,
"logps/ref_chosen": -63.65644073486328,
"logps/ref_rejected": -86.1323013305664,
"logps/rejected": -86.60154724121094,
"loss": 1.3707,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.028727885335683823,
"rewards/margins": 0.016096722334623337,
"rewards/rejected": -0.04482460767030716,
"step": 51
},
{
"epoch": 0.07860922146636433,
"epsilon_dpo/beta": 0.09576413780450821,
"epsilon_dpo/beta_margin_grad_mean": -0.495856910943985,
"epsilon_dpo/beta_margin_grad_std": 0.01338079571723938,
"epsilon_dpo/beta_margin_mean": 0.016596658155322075,
"epsilon_dpo/beta_margin_std": 0.05361338332295418,
"epsilon_dpo/loss_margin_mean": 0.17717164754867554,
"grad_norm": 31.279869079589844,
"kl/avg_steps": 0.234375,
"kl/beta": 0.09597966820001602,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 3.805970149253731e-07,
"logits/chosen": -0.3090393543243408,
"logits/rejected": -0.3384135365486145,
"logps/chosen": -68.15504455566406,
"logps/ref_chosen": -67.8402099609375,
"logps/ref_rejected": -96.97091674804688,
"logps/rejected": -97.46290588378906,
"loss": 1.3705,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.030326515436172485,
"rewards/margins": 0.016596663743257523,
"rewards/rejected": -0.04692317917943001,
"step": 52
},
{
"epoch": 0.0801209372637944,
"epsilon_dpo/beta": 0.09534584730863571,
"epsilon_dpo/beta_margin_grad_mean": -0.4960094392299652,
"epsilon_dpo/beta_margin_grad_std": 0.011819392442703247,
"epsilon_dpo/beta_margin_mean": 0.01596810296177864,
"epsilon_dpo/beta_margin_std": 0.047310467809438705,
"epsilon_dpo/loss_margin_mean": 0.17082881927490234,
"grad_norm": 26.208026885986328,
"kl/avg_steps": 0.4375,
"kl/beta": 0.09575524181127548,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.880597014925373e-07,
"logits/chosen": -0.31404581665992737,
"logits/rejected": -0.3455438017845154,
"logps/chosen": -57.198753356933594,
"logps/ref_chosen": -56.87813949584961,
"logps/ref_rejected": -60.75569152832031,
"logps/rejected": -61.24713897705078,
"loss": 1.3709,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.030741358175873756,
"rewards/margins": 0.015968114137649536,
"rewards/rejected": -0.04670947045087814,
"step": 53
},
{
"epoch": 0.08163265306122448,
"epsilon_dpo/beta": 0.09522848576307297,
"epsilon_dpo/beta_margin_grad_mean": -0.4960388243198395,
"epsilon_dpo/beta_margin_grad_std": 0.012931020930409431,
"epsilon_dpo/beta_margin_mean": 0.015868177637457848,
"epsilon_dpo/beta_margin_std": 0.05182372406125069,
"epsilon_dpo/loss_margin_mean": 0.17040961980819702,
"grad_norm": 25.257896423339844,
"kl/avg_steps": 0.125,
"kl/beta": 0.0953381359577179,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 3.9552238805970144e-07,
"logits/chosen": -0.26633220911026,
"logits/rejected": -0.3327806293964386,
"logps/chosen": -47.65915298461914,
"logps/ref_chosen": -47.26692199707031,
"logps/ref_rejected": -62.19426727294922,
"logps/rejected": -62.75690841674805,
"loss": 1.3712,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.03754565119743347,
"rewards/margins": 0.0158681683242321,
"rewards/rejected": -0.05341381952166557,
"step": 54
},
{
"epoch": 0.08314436885865457,
"epsilon_dpo/beta": 0.09487152099609375,
"epsilon_dpo/beta_margin_grad_mean": -0.4924120008945465,
"epsilon_dpo/beta_margin_grad_std": 0.01731080375611782,
"epsilon_dpo/beta_margin_mean": 0.030413687229156494,
"epsilon_dpo/beta_margin_std": 0.06939557194709778,
"epsilon_dpo/loss_margin_mean": 0.32537999749183655,
"grad_norm": 29.639198303222656,
"kl/avg_steps": 0.375,
"kl/beta": 0.09521911293268204,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.0298507462686564e-07,
"logits/chosen": -0.3583253026008606,
"logits/rejected": -0.43026989698410034,
"logps/chosen": -50.658485412597656,
"logps/ref_chosen": -50.32619094848633,
"logps/ref_rejected": -92.44389343261719,
"logps/rejected": -93.1015625,
"loss": 1.3573,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.03171641379594803,
"rewards/margins": 0.030413739383220673,
"rewards/rejected": -0.0621301531791687,
"step": 55
},
{
"epoch": 0.08465608465608465,
"epsilon_dpo/beta": 0.09454673528671265,
"epsilon_dpo/beta_margin_grad_mean": -0.4957652688026428,
"epsilon_dpo/beta_margin_grad_std": 0.015593883581459522,
"epsilon_dpo/beta_margin_mean": 0.01693076640367508,
"epsilon_dpo/beta_margin_std": 0.06251642107963562,
"epsilon_dpo/loss_margin_mean": 0.18337374925613403,
"grad_norm": 26.008502960205078,
"kl/avg_steps": 0.34375,
"kl/beta": 0.09486337751150131,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.1044776119402984e-07,
"logits/chosen": -0.2539837062358856,
"logits/rejected": -0.3539873957633972,
"logps/chosen": -57.122779846191406,
"logps/ref_chosen": -56.766971588134766,
"logps/ref_rejected": -66.30503845214844,
"logps/rejected": -66.84422302246094,
"loss": 1.3704,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.03385629132390022,
"rewards/margins": 0.016930758953094482,
"rewards/rejected": -0.050787050276994705,
"step": 56
},
{
"epoch": 0.08616780045351474,
"epsilon_dpo/beta": 0.09413420408964157,
"epsilon_dpo/beta_margin_grad_mean": -0.49317413568496704,
"epsilon_dpo/beta_margin_grad_std": 0.014798992313444614,
"epsilon_dpo/beta_margin_mean": 0.027332819998264313,
"epsilon_dpo/beta_margin_std": 0.05926959589123726,
"epsilon_dpo/loss_margin_mean": 0.2945442795753479,
"grad_norm": 28.721080780029297,
"kl/avg_steps": 0.4375,
"kl/beta": 0.09453839808702469,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.17910447761194e-07,
"logits/chosen": -0.3537985682487488,
"logits/rejected": -0.5109343528747559,
"logps/chosen": -58.26600646972656,
"logps/ref_chosen": -57.76774597167969,
"logps/ref_rejected": -82.75698852539062,
"logps/rejected": -83.54979705810547,
"loss": 1.36,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.047103650867938995,
"rewards/margins": 0.027332860976457596,
"rewards/rejected": -0.07443651556968689,
"step": 57
},
{
"epoch": 0.08767951625094482,
"epsilon_dpo/beta": 0.09410659223794937,
"epsilon_dpo/beta_margin_grad_mean": -0.4949421286582947,
"epsilon_dpo/beta_margin_grad_std": 0.023574965074658394,
"epsilon_dpo/beta_margin_mean": 0.02041253261268139,
"epsilon_dpo/beta_margin_std": 0.09501735866069794,
"epsilon_dpo/loss_margin_mean": 0.22355195879936218,
"grad_norm": 28.49271583557129,
"kl/avg_steps": 0.03125,
"kl/beta": 0.09412659704685211,
"kl/n_epsilon_steps": 0.484375,
"kl/p_epsilon_steps": 0.515625,
"learning_rate": 4.253731343283582e-07,
"logits/chosen": -0.341006875038147,
"logits/rejected": -0.32765746116638184,
"logps/chosen": -73.33562469482422,
"logps/ref_chosen": -72.76408386230469,
"logps/ref_rejected": -84.49275207519531,
"logps/rejected": -85.287841796875,
"loss": 1.3682,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.05411393940448761,
"rewards/margins": 0.020412495359778404,
"rewards/rejected": -0.07452643662691116,
"step": 58
},
{
"epoch": 0.08919123204837491,
"epsilon_dpo/beta": 0.09404777735471725,
"epsilon_dpo/beta_margin_grad_mean": -0.49456143379211426,
"epsilon_dpo/beta_margin_grad_std": 0.020266661420464516,
"epsilon_dpo/beta_margin_mean": 0.021841851994395256,
"epsilon_dpo/beta_margin_std": 0.08135965466499329,
"epsilon_dpo/loss_margin_mean": 0.23829877376556396,
"grad_norm": 25.092540740966797,
"kl/avg_steps": 0.0625,
"kl/beta": 0.09409718960523605,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 4.3283582089552234e-07,
"logits/chosen": -0.22025075554847717,
"logits/rejected": -0.3547680974006653,
"logps/chosen": -50.37342834472656,
"logps/ref_chosen": -49.82077407836914,
"logps/ref_rejected": -77.14368438720703,
"logps/rejected": -77.93464660644531,
"loss": 1.3662,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.05218241363763809,
"rewards/margins": 0.02184188924729824,
"rewards/rejected": -0.07402430474758148,
"step": 59
},
{
"epoch": 0.09070294784580499,
"epsilon_dpo/beta": 0.09384208917617798,
"epsilon_dpo/beta_margin_grad_mean": -0.4990495443344116,
"epsilon_dpo/beta_margin_grad_std": 0.01902272365987301,
"epsilon_dpo/beta_margin_mean": 0.003781897248700261,
"epsilon_dpo/beta_margin_std": 0.07621411979198456,
"epsilon_dpo/loss_margin_mean": 0.046735942363739014,
"grad_norm": 27.941707611083984,
"kl/avg_steps": 0.21875,
"kl/beta": 0.09403841942548752,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 4.4029850746268654e-07,
"logits/chosen": -0.2731139659881592,
"logits/rejected": -0.2829166352748871,
"logps/chosen": -63.85133361816406,
"logps/ref_chosen": -63.22477340698242,
"logps/ref_rejected": -61.360477447509766,
"logps/rejected": -62.03376770019531,
"loss": 1.384,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.059199295938014984,
"rewards/margins": 0.0037818802520632744,
"rewards/rejected": -0.06298117339611053,
"step": 60
},
{
"epoch": 0.09221466364323508,
"epsilon_dpo/beta": 0.09372523427009583,
"epsilon_dpo/beta_margin_grad_mean": -0.4979851543903351,
"epsilon_dpo/beta_margin_grad_std": 0.023453911766409874,
"epsilon_dpo/beta_margin_mean": 0.008055842481553555,
"epsilon_dpo/beta_margin_std": 0.09404861181974411,
"epsilon_dpo/loss_margin_mean": 0.09340301156044006,
"grad_norm": 26.400217056274414,
"kl/avg_steps": 0.125,
"kl/beta": 0.09383315593004227,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 4.4776119402985074e-07,
"logits/chosen": -0.27581292390823364,
"logits/rejected": -0.3091738820075989,
"logps/chosen": -49.7232666015625,
"logps/ref_chosen": -49.01679992675781,
"logps/ref_rejected": -74.90817260742188,
"logps/rejected": -75.70803833007812,
"loss": 1.3805,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.06659521907567978,
"rewards/margins": 0.008055852726101875,
"rewards/rejected": -0.0746510699391365,
"step": 61
},
{
"epoch": 0.09372637944066516,
"epsilon_dpo/beta": 0.0934617817401886,
"epsilon_dpo/beta_margin_grad_mean": -0.4950014650821686,
"epsilon_dpo/beta_margin_grad_std": 0.021385950967669487,
"epsilon_dpo/beta_margin_mean": 0.020017653703689575,
"epsilon_dpo/beta_margin_std": 0.08577166497707367,
"epsilon_dpo/loss_margin_mean": 0.2207047939300537,
"grad_norm": 26.785049438476562,
"kl/avg_steps": 0.28125,
"kl/beta": 0.09371601045131683,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.552238805970149e-07,
"logits/chosen": -0.27624958753585815,
"logits/rejected": -0.40080103278160095,
"logps/chosen": -63.499183654785156,
"logps/ref_chosen": -62.751869201660156,
"logps/ref_rejected": -78.93360900878906,
"logps/rejected": -79.90162658691406,
"loss": 1.3682,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.0701717734336853,
"rewards/margins": 0.020017648115754128,
"rewards/rejected": -0.09018941223621368,
"step": 62
},
{
"epoch": 0.09523809523809523,
"epsilon_dpo/beta": 0.09287837892770767,
"epsilon_dpo/beta_margin_grad_mean": -0.487263023853302,
"epsilon_dpo/beta_margin_grad_std": 0.019023440778255463,
"epsilon_dpo/beta_margin_mean": 0.05103456601500511,
"epsilon_dpo/beta_margin_std": 0.07637037336826324,
"epsilon_dpo/loss_margin_mean": 0.5532166361808777,
"grad_norm": 29.184829711914062,
"kl/avg_steps": 0.625,
"kl/beta": 0.09345317631959915,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 4.626865671641791e-07,
"logits/chosen": -0.3608902096748352,
"logits/rejected": -0.34171557426452637,
"logps/chosen": -61.08062744140625,
"logps/ref_chosen": -60.51525115966797,
"logps/ref_rejected": -85.11021423339844,
"logps/rejected": -86.22881317138672,
"loss": 1.3374,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.05265050381422043,
"rewards/margins": 0.05103456974029541,
"rewards/rejected": -0.10368506610393524,
"step": 63
},
{
"epoch": 0.09674981103552532,
"epsilon_dpo/beta": 0.09282395988702774,
"epsilon_dpo/beta_margin_grad_mean": -0.49743354320526123,
"epsilon_dpo/beta_margin_grad_std": 0.02004072815179825,
"epsilon_dpo/beta_margin_mean": 0.010281778872013092,
"epsilon_dpo/beta_margin_std": 0.0804191380739212,
"epsilon_dpo/loss_margin_mean": 0.11681100726127625,
"grad_norm": 24.240947723388672,
"kl/avg_steps": 0.0625,
"kl/beta": 0.0928727239370346,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 4.701492537313433e-07,
"logits/chosen": -0.2798372209072113,
"logits/rejected": -0.34699270129203796,
"logps/chosen": -51.999481201171875,
"logps/ref_chosen": -51.20684814453125,
"logps/ref_rejected": -66.93082427978516,
"logps/rejected": -67.84027099609375,
"loss": 1.3777,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07380862534046173,
"rewards/margins": 0.010281761176884174,
"rewards/rejected": -0.08409038931131363,
"step": 64
},
{
"epoch": 0.0982615268329554,
"epsilon_dpo/beta": 0.09238888323307037,
"epsilon_dpo/beta_margin_grad_mean": -0.48824411630630493,
"epsilon_dpo/beta_margin_grad_std": 0.026403291150927544,
"epsilon_dpo/beta_margin_mean": 0.04717652499675751,
"epsilon_dpo/beta_margin_std": 0.10614392161369324,
"epsilon_dpo/loss_margin_mean": 0.5177453756332397,
"grad_norm": 28.654985427856445,
"kl/avg_steps": 0.46875,
"kl/beta": 0.09281471371650696,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.776119402985074e-07,
"logits/chosen": -0.3122035264968872,
"logits/rejected": -0.39129406213760376,
"logps/chosen": -68.14698028564453,
"logps/ref_chosen": -67.2886962890625,
"logps/ref_rejected": -74.44281005859375,
"logps/rejected": -75.81883239746094,
"loss": 1.3425,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.07980494201183319,
"rewards/margins": 0.04717659205198288,
"rewards/rejected": -0.12698152661323547,
"step": 65
},
{
"epoch": 0.09977324263038549,
"epsilon_dpo/beta": 0.09207331389188766,
"epsilon_dpo/beta_margin_grad_mean": -0.49234625697135925,
"epsilon_dpo/beta_margin_grad_std": 0.024242157116532326,
"epsilon_dpo/beta_margin_mean": 0.030687103047966957,
"epsilon_dpo/beta_margin_std": 0.09728584438562393,
"epsilon_dpo/loss_margin_mean": 0.3405768573284149,
"grad_norm": 27.06910514831543,
"kl/avg_steps": 0.34375,
"kl/beta": 0.09238167107105255,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.850746268656717e-07,
"logits/chosen": -0.28761962056159973,
"logits/rejected": -0.34669753909111023,
"logps/chosen": -71.6399154663086,
"logps/ref_chosen": -70.743408203125,
"logps/ref_rejected": -77.26499938964844,
"logps/rejected": -78.5020751953125,
"loss": 1.3582,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.0828639566898346,
"rewards/margins": 0.030687125399708748,
"rewards/rejected": -0.1135510802268982,
"step": 66
},
{
"epoch": 0.10128495842781557,
"epsilon_dpo/beta": 0.09170035272836685,
"epsilon_dpo/beta_margin_grad_mean": -0.492803156375885,
"epsilon_dpo/beta_margin_grad_std": 0.021091420203447342,
"epsilon_dpo/beta_margin_mean": 0.028854617848992348,
"epsilon_dpo/beta_margin_std": 0.08459162712097168,
"epsilon_dpo/loss_margin_mean": 0.32074975967407227,
"grad_norm": 26.752805709838867,
"kl/avg_steps": 0.40625,
"kl/beta": 0.09206520020961761,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.925373134328357e-07,
"logits/chosen": -0.3209341764450073,
"logits/rejected": -0.4727107286453247,
"logps/chosen": -61.37024688720703,
"logps/ref_chosen": -60.60260009765625,
"logps/ref_rejected": -75.22235870361328,
"logps/rejected": -76.31076049804688,
"loss": 1.3594,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.07072115689516068,
"rewards/margins": 0.028854595497250557,
"rewards/rejected": -0.09957575798034668,
"step": 67
},
{
"epoch": 0.10279667422524566,
"epsilon_dpo/beta": 0.09147261828184128,
"epsilon_dpo/beta_margin_grad_mean": -0.4939153492450714,
"epsilon_dpo/beta_margin_grad_std": 0.027705803513526917,
"epsilon_dpo/beta_margin_mean": 0.02442844770848751,
"epsilon_dpo/beta_margin_std": 0.11143834888935089,
"epsilon_dpo/loss_margin_mean": 0.27541279792785645,
"grad_norm": 28.854305267333984,
"kl/avg_steps": 0.25,
"kl/beta": 0.09169270098209381,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 5e-07,
"logits/chosen": -0.334445595741272,
"logits/rejected": -0.39260703325271606,
"logps/chosen": -78.7845458984375,
"logps/ref_chosen": -77.52836608886719,
"logps/ref_rejected": -93.17778015136719,
"logps/rejected": -94.70936584472656,
"loss": 1.3651,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11520832777023315,
"rewards/margins": 0.024428434669971466,
"rewards/rejected": -0.13963675498962402,
"step": 68
},
{
"epoch": 0.10430839002267574,
"epsilon_dpo/beta": 0.09104440361261368,
"epsilon_dpo/beta_margin_grad_mean": -0.4860967993736267,
"epsilon_dpo/beta_margin_grad_std": 0.026079317554831505,
"epsilon_dpo/beta_margin_mean": 0.05584343895316124,
"epsilon_dpo/beta_margin_std": 0.10497574508190155,
"epsilon_dpo/loss_margin_mean": 0.6200288534164429,
"grad_norm": 28.353233337402344,
"kl/avg_steps": 0.46875,
"kl/beta": 0.09146403521299362,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.999965034812934e-07,
"logits/chosen": -0.33460840582847595,
"logits/rejected": -0.42138153314590454,
"logps/chosen": -67.07247924804688,
"logps/ref_chosen": -65.94305419921875,
"logps/ref_rejected": -89.7735595703125,
"logps/rejected": -91.52301025390625,
"loss": 1.334,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10303386300802231,
"rewards/margins": 0.05584348365664482,
"rewards/rejected": -0.15887734293937683,
"step": 69
},
{
"epoch": 0.10582010582010581,
"epsilon_dpo/beta": 0.09087570011615753,
"epsilon_dpo/beta_margin_grad_mean": -0.49287936091423035,
"epsilon_dpo/beta_margin_grad_std": 0.027025269344449043,
"epsilon_dpo/beta_margin_mean": 0.02853131853044033,
"epsilon_dpo/beta_margin_std": 0.10846278071403503,
"epsilon_dpo/loss_margin_mean": 0.32296425104141235,
"grad_norm": 26.23570442199707,
"kl/avg_steps": 0.1875,
"kl/beta": 0.09103730320930481,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 4.999860140229787e-07,
"logits/chosen": -0.3370419144630432,
"logits/rejected": -0.41479384899139404,
"logps/chosen": -63.15817642211914,
"logps/ref_chosen": -61.957908630371094,
"logps/ref_rejected": -75.80946350097656,
"logps/rejected": -77.33268737792969,
"loss": 1.3609,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.10949172079563141,
"rewards/margins": 0.02853127010166645,
"rewards/rejected": -0.1380229890346527,
"step": 70
},
{
"epoch": 0.1073318216175359,
"epsilon_dpo/beta": 0.09079081565141678,
"epsilon_dpo/beta_margin_grad_mean": -0.4949309825897217,
"epsilon_dpo/beta_margin_grad_std": 0.031317904591560364,
"epsilon_dpo/beta_margin_mean": 0.020275531336665154,
"epsilon_dpo/beta_margin_std": 0.125793918967247,
"epsilon_dpo/loss_margin_mean": 0.23358842730522156,
"grad_norm": 25.90142059326172,
"kl/avg_steps": 0.09375,
"kl/beta": 0.09086692333221436,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 4.999685319184688e-07,
"logits/chosen": -0.3241754472255707,
"logits/rejected": -0.3807687759399414,
"logps/chosen": -64.7705078125,
"logps/ref_chosen": -63.34757995605469,
"logps/ref_rejected": -67.49658203125,
"logps/rejected": -69.1530990600586,
"loss": 1.3701,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.1295222043991089,
"rewards/margins": 0.020275503396987915,
"rewards/rejected": -0.1497977077960968,
"step": 71
},
{
"epoch": 0.10884353741496598,
"epsilon_dpo/beta": 0.09036531299352646,
"epsilon_dpo/beta_margin_grad_mean": -0.4837353825569153,
"epsilon_dpo/beta_margin_grad_std": 0.028116153553128242,
"epsilon_dpo/beta_margin_mean": 0.06528313457965851,
"epsilon_dpo/beta_margin_std": 0.11307370662689209,
"epsilon_dpo/loss_margin_mean": 0.7303054332733154,
"grad_norm": 27.492733001708984,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0907818153500557,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.999440576567755e-07,
"logits/chosen": -0.34430789947509766,
"logits/rejected": -0.4970097541809082,
"logps/chosen": -57.018646240234375,
"logps/ref_chosen": -55.85929870605469,
"logps/ref_rejected": -68.45423889160156,
"logps/rejected": -70.3438949584961,
"loss": 1.3253,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10510388016700745,
"rewards/margins": 0.06528313457965851,
"rewards/rejected": -0.17038701474666595,
"step": 72
},
{
"epoch": 0.11035525321239607,
"epsilon_dpo/beta": 0.09039553999900818,
"epsilon_dpo/beta_margin_grad_mean": -0.4975738823413849,
"epsilon_dpo/beta_margin_grad_std": 0.035635244101285934,
"epsilon_dpo/beta_margin_mean": 0.010131915099918842,
"epsilon_dpo/beta_margin_std": 0.14464719593524933,
"epsilon_dpo/loss_margin_mean": 0.12331095337867737,
"grad_norm": 28.97447967529297,
"kl/avg_steps": -0.03125,
"kl/beta": 0.09035826474428177,
"kl/n_epsilon_steps": 0.515625,
"kl/p_epsilon_steps": 0.484375,
"learning_rate": 4.999125919224965e-07,
"logits/chosen": -0.3634873032569885,
"logits/rejected": -0.38062894344329834,
"logps/chosen": -70.92495727539062,
"logps/ref_chosen": -69.13880920410156,
"logps/ref_rejected": -79.04586791992188,
"logps/rejected": -80.95533752441406,
"loss": 1.3814,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.16201280057430267,
"rewards/margins": 0.010131875053048134,
"rewards/rejected": -0.17214468121528625,
"step": 73
},
{
"epoch": 0.11186696900982615,
"epsilon_dpo/beta": 0.09005656093358994,
"epsilon_dpo/beta_margin_grad_mean": -0.4857005774974823,
"epsilon_dpo/beta_margin_grad_std": 0.028445864096283913,
"epsilon_dpo/beta_margin_mean": 0.057448577135801315,
"epsilon_dpo/beta_margin_std": 0.11448825150728226,
"epsilon_dpo/loss_margin_mean": 0.6469835042953491,
"grad_norm": 25.312898635864258,
"kl/avg_steps": 0.375,
"kl/beta": 0.09038650989532471,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.998741355957963e-07,
"logits/chosen": -0.2756233811378479,
"logits/rejected": -0.281640887260437,
"logps/chosen": -51.166038513183594,
"logps/ref_chosen": -49.923736572265625,
"logps/ref_rejected": -81.73213958740234,
"logps/rejected": -83.62142944335938,
"loss": 1.3329,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11220179498195648,
"rewards/margins": 0.05744864046573639,
"rewards/rejected": -0.16965043544769287,
"step": 74
},
{
"epoch": 0.11337868480725624,
"epsilon_dpo/beta": 0.08966382592916489,
"epsilon_dpo/beta_margin_grad_mean": -0.48363542556762695,
"epsilon_dpo/beta_margin_grad_std": 0.02883969061076641,
"epsilon_dpo/beta_margin_mean": 0.0657411590218544,
"epsilon_dpo/beta_margin_std": 0.11607305705547333,
"epsilon_dpo/loss_margin_mean": 0.740928053855896,
"grad_norm": 23.65488052368164,
"kl/avg_steps": 0.4375,
"kl/beta": 0.09004882723093033,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.998286897523808e-07,
"logits/chosen": -0.33889278769493103,
"logits/rejected": -0.309769868850708,
"logps/chosen": -47.43863296508789,
"logps/ref_chosen": -46.06875228881836,
"logps/ref_rejected": -66.1181411743164,
"logps/rejected": -68.22895050048828,
"loss": 1.325,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.1231279969215393,
"rewards/margins": 0.0657411515712738,
"rewards/rejected": -0.1888691484928131,
"step": 75
},
{
"epoch": 0.11489040060468632,
"epsilon_dpo/beta": 0.08952543884515762,
"epsilon_dpo/beta_margin_grad_mean": -0.4931797385215759,
"epsilon_dpo/beta_margin_grad_std": 0.03402528539299965,
"epsilon_dpo/beta_margin_mean": 0.027378061786293983,
"epsilon_dpo/beta_margin_std": 0.1369428187608719,
"epsilon_dpo/loss_margin_mean": 0.3166384696960449,
"grad_norm": 26.26421356201172,
"kl/avg_steps": 0.15625,
"kl/beta": 0.08965657651424408,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 4.997762556634679e-07,
"logits/chosen": -0.3610483407974243,
"logits/rejected": -0.39478594064712524,
"logps/chosen": -55.626991271972656,
"logps/ref_chosen": -54.06275177001953,
"logps/ref_rejected": -74.87464141845703,
"logps/rejected": -76.75552368164062,
"loss": 1.3638,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.14047299325466156,
"rewards/margins": 0.027378087863326073,
"rewards/rejected": -0.16785109043121338,
"step": 76
},
{
"epoch": 0.1164021164021164,
"epsilon_dpo/beta": 0.08918993175029755,
"epsilon_dpo/beta_margin_grad_mean": -0.4837992489337921,
"epsilon_dpo/beta_margin_grad_std": 0.03146786242723465,
"epsilon_dpo/beta_margin_mean": 0.06510339677333832,
"epsilon_dpo/beta_margin_std": 0.12656398117542267,
"epsilon_dpo/loss_margin_mean": 0.7392587065696716,
"grad_norm": 26.23760223388672,
"kl/avg_steps": 0.375,
"kl/beta": 0.08951670676469803,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.99716834795752e-07,
"logits/chosen": -0.29933983087539673,
"logits/rejected": -0.3533180356025696,
"logps/chosen": -54.68768310546875,
"logps/ref_chosen": -53.07609176635742,
"logps/ref_rejected": -74.45601654052734,
"logps/rejected": -76.8068618774414,
"loss": 1.3262,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.1440524160861969,
"rewards/margins": 0.06510336697101593,
"rewards/rejected": -0.20915578305721283,
"step": 77
},
{
"epoch": 0.11791383219954649,
"epsilon_dpo/beta": 0.0889403447508812,
"epsilon_dpo/beta_margin_grad_mean": -0.49066993594169617,
"epsilon_dpo/beta_margin_grad_std": 0.030180798843503,
"epsilon_dpo/beta_margin_mean": 0.03743256628513336,
"epsilon_dpo/beta_margin_std": 0.12118643522262573,
"epsilon_dpo/loss_margin_mean": 0.4303058981895447,
"grad_norm": 26.07628631591797,
"kl/avg_steps": 0.28125,
"kl/beta": 0.08918227255344391,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.996504288113623e-07,
"logits/chosen": -0.2836863398551941,
"logits/rejected": -0.35981813073158264,
"logps/chosen": -69.51696014404297,
"logps/ref_chosen": -67.72541809082031,
"logps/ref_rejected": -79.03927612304688,
"logps/rejected": -81.26111602783203,
"loss": 1.3529,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.1598249077796936,
"rewards/margins": 0.037432536482810974,
"rewards/rejected": -0.19725742936134338,
"step": 78
},
{
"epoch": 0.11942554799697656,
"epsilon_dpo/beta": 0.08849633485078812,
"epsilon_dpo/beta_margin_grad_mean": -0.47606751322746277,
"epsilon_dpo/beta_margin_grad_std": 0.042190127074718475,
"epsilon_dpo/beta_margin_mean": 0.09710415452718735,
"epsilon_dpo/beta_margin_std": 0.1726681888103485,
"epsilon_dpo/loss_margin_mean": 1.1065272092819214,
"grad_norm": 27.684852600097656,
"kl/avg_steps": 0.5,
"kl/beta": 0.08893214911222458,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.995770395678171e-07,
"logits/chosen": -0.2766944169998169,
"logits/rejected": -0.3576112985610962,
"logps/chosen": -53.86768341064453,
"logps/ref_chosen": -52.16064453125,
"logps/ref_rejected": -83.31062316894531,
"logps/rejected": -86.12418365478516,
"loss": 1.2989,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.15135186910629272,
"rewards/margins": 0.09710416942834854,
"rewards/rejected": -0.24845603108406067,
"step": 79
},
{
"epoch": 0.12093726379440665,
"epsilon_dpo/beta": 0.08844324201345444,
"epsilon_dpo/beta_margin_grad_mean": -0.487560510635376,
"epsilon_dpo/beta_margin_grad_std": 0.041432999074459076,
"epsilon_dpo/beta_margin_mean": 0.05020918697118759,
"epsilon_dpo/beta_margin_std": 0.16699855029582977,
"epsilon_dpo/loss_margin_mean": 0.5821816921234131,
"grad_norm": 25.254793167114258,
"kl/avg_steps": 0.0625,
"kl/beta": 0.08848970383405685,
"kl/n_epsilon_steps": 0.46875,
"kl/p_epsilon_steps": 0.53125,
"learning_rate": 4.994966691179711e-07,
"logits/chosen": -0.23642706871032715,
"logits/rejected": -0.36239850521087646,
"logps/chosen": -63.40154266357422,
"logps/ref_chosen": -61.410560607910156,
"logps/ref_rejected": -78.66004943847656,
"logps/rejected": -81.23321533203125,
"loss": 1.3437,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.17663419246673584,
"rewards/margins": 0.05020918324589729,
"rewards/rejected": -0.22684337198734283,
"step": 80
},
{
"epoch": 0.12244897959183673,
"epsilon_dpo/beta": 0.08822217583656311,
"epsilon_dpo/beta_margin_grad_mean": -0.48244184255599976,
"epsilon_dpo/beta_margin_grad_std": 0.0381772443652153,
"epsilon_dpo/beta_margin_mean": 0.07098691165447235,
"epsilon_dpo/beta_margin_std": 0.15491938591003418,
"epsilon_dpo/loss_margin_mean": 0.8161328434944153,
"grad_norm": 25.950115203857422,
"kl/avg_steps": 0.25,
"kl/beta": 0.08843443542718887,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 4.994093197099587e-07,
"logits/chosen": -0.3112892508506775,
"logits/rejected": -0.36429500579833984,
"logps/chosen": -65.90744018554688,
"logps/ref_chosen": -63.80437088012695,
"logps/ref_rejected": -79.34840393066406,
"logps/rejected": -82.26761627197266,
"loss": 1.3225,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.1859210729598999,
"rewards/margins": 0.07098691910505295,
"rewards/rejected": -0.25690799951553345,
"step": 81
},
{
"epoch": 0.12396069538926682,
"epsilon_dpo/beta": 0.08775404095649719,
"epsilon_dpo/beta_margin_grad_mean": -0.47642040252685547,
"epsilon_dpo/beta_margin_grad_std": 0.03442943096160889,
"epsilon_dpo/beta_margin_mean": 0.09483001381158829,
"epsilon_dpo/beta_margin_std": 0.13907304406166077,
"epsilon_dpo/loss_margin_mean": 1.0893311500549316,
"grad_norm": 23.46103286743164,
"kl/avg_steps": 0.53125,
"kl/beta": 0.08821389824151993,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.993149937871306e-07,
"logits/chosen": -0.3112872838973999,
"logits/rejected": -0.4451986253261566,
"logps/chosen": -50.527034759521484,
"logps/ref_chosen": -48.817893981933594,
"logps/ref_rejected": -70.31497955322266,
"logps/rejected": -73.11345672607422,
"loss": 1.2985,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.1504351645708084,
"rewards/margins": 0.0948299989104271,
"rewards/rejected": -0.2452651560306549,
"step": 82
},
{
"epoch": 0.1254724111866969,
"epsilon_dpo/beta": 0.08726288378238678,
"epsilon_dpo/beta_margin_grad_mean": -0.4754730761051178,
"epsilon_dpo/beta_margin_grad_std": 0.038948871195316315,
"epsilon_dpo/beta_margin_mean": 0.09912940859794617,
"epsilon_dpo/beta_margin_std": 0.15893737971782684,
"epsilon_dpo/loss_margin_mean": 1.1451313495635986,
"grad_norm": 25.848224639892578,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0877477377653122,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.992136939879856e-07,
"logits/chosen": -0.17561104893684387,
"logits/rejected": -0.30185046792030334,
"logps/chosen": -59.18461608886719,
"logps/ref_chosen": -57.15077209472656,
"logps/ref_rejected": -75.1710205078125,
"logps/rejected": -78.34999084472656,
"loss": 1.2959,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.17770114541053772,
"rewards/margins": 0.09912942349910736,
"rewards/rejected": -0.2768305540084839,
"step": 83
},
{
"epoch": 0.12698412698412698,
"epsilon_dpo/beta": 0.08685658127069473,
"epsilon_dpo/beta_margin_grad_mean": -0.48180902004241943,
"epsilon_dpo/beta_margin_grad_std": 0.040308646857738495,
"epsilon_dpo/beta_margin_mean": 0.0733170136809349,
"epsilon_dpo/beta_margin_std": 0.16251438856124878,
"epsilon_dpo/loss_margin_mean": 0.8559837341308594,
"grad_norm": 26.832216262817383,
"kl/avg_steps": 0.46875,
"kl/beta": 0.08725691586732864,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.991054231460969e-07,
"logits/chosen": -0.34390878677368164,
"logits/rejected": -0.3491112291812897,
"logps/chosen": -67.22178649902344,
"logps/ref_chosen": -64.77730560302734,
"logps/ref_rejected": -84.71949768066406,
"logps/rejected": -88.01997375488281,
"loss": 1.3209,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.21294060349464417,
"rewards/margins": 0.07331700623035431,
"rewards/rejected": -0.28625762462615967,
"step": 84
},
{
"epoch": 0.12849584278155707,
"epsilon_dpo/beta": 0.08636991679668427,
"epsilon_dpo/beta_margin_grad_mean": -0.4734337031841278,
"epsilon_dpo/beta_margin_grad_std": 0.04253039509057999,
"epsilon_dpo/beta_margin_mean": 0.10724300891160965,
"epsilon_dpo/beta_margin_std": 0.17241978645324707,
"epsilon_dpo/loss_margin_mean": 1.252614140510559,
"grad_norm": 23.521873474121094,
"kl/avg_steps": 0.5625,
"kl/beta": 0.08684980869293213,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.989901842900325e-07,
"logits/chosen": -0.27682313323020935,
"logits/rejected": -0.3715432584285736,
"logps/chosen": -52.452110290527344,
"logps/ref_chosen": -50.25169372558594,
"logps/ref_rejected": -66.55438995361328,
"logps/rejected": -70.00741577148438,
"loss": 1.2893,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.1906118392944336,
"rewards/margins": 0.10724307596683502,
"rewards/rejected": -0.2978549003601074,
"step": 85
},
{
"epoch": 0.13000755857898716,
"epsilon_dpo/beta": 0.08610273152589798,
"epsilon_dpo/beta_margin_grad_mean": -0.4800806939601898,
"epsilon_dpo/beta_margin_grad_std": 0.04230509698390961,
"epsilon_dpo/beta_margin_mean": 0.08032626658678055,
"epsilon_dpo/beta_margin_std": 0.17068816721439362,
"epsilon_dpo/loss_margin_mean": 0.94707190990448,
"grad_norm": 23.847671508789062,
"kl/avg_steps": 0.3125,
"kl/beta": 0.0863640084862709,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.988679806432711e-07,
"logits/chosen": -0.31426382064819336,
"logits/rejected": -0.34176743030548096,
"logps/chosen": -63.52363204956055,
"logps/ref_chosen": -60.72917938232422,
"logps/ref_rejected": -72.30960845947266,
"logps/rejected": -76.05113220214844,
"loss": 1.3148,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.24124664068222046,
"rewards/margins": 0.08032624423503876,
"rewards/rejected": -0.3215728998184204,
"step": 86
},
{
"epoch": 0.13151927437641722,
"epsilon_dpo/beta": 0.08583450317382812,
"epsilon_dpo/beta_margin_grad_mean": -0.47823014855384827,
"epsilon_dpo/beta_margin_grad_std": 0.05162518098950386,
"epsilon_dpo/beta_margin_mean": 0.08801626414060593,
"epsilon_dpo/beta_margin_std": 0.2094600796699524,
"epsilon_dpo/loss_margin_mean": 1.0425142049789429,
"grad_norm": 26.36173439025879,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08609496802091599,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.987388156241114e-07,
"logits/chosen": -0.3468170464038849,
"logits/rejected": -0.3861439824104309,
"logps/chosen": -68.70194244384766,
"logps/ref_chosen": -65.75796508789062,
"logps/ref_rejected": -84.81159973144531,
"logps/rejected": -88.79808044433594,
"loss": 1.3111,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.25342974066734314,
"rewards/margins": 0.08801624923944473,
"rewards/rejected": -0.34144601225852966,
"step": 87
},
{
"epoch": 0.1330309901738473,
"epsilon_dpo/beta": 0.08562074601650238,
"epsilon_dpo/beta_margin_grad_mean": -0.48272082209587097,
"epsilon_dpo/beta_margin_grad_std": 0.061211053282022476,
"epsilon_dpo/beta_margin_mean": 0.07055442035198212,
"epsilon_dpo/beta_margin_std": 0.2521108388900757,
"epsilon_dpo/loss_margin_mean": 0.8428503274917603,
"grad_norm": 25.902055740356445,
"kl/avg_steps": 0.25,
"kl/beta": 0.08582675457000732,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 4.986026928455767e-07,
"logits/chosen": -0.2521975040435791,
"logits/rejected": -0.32689160108566284,
"logps/chosen": -65.82767486572266,
"logps/ref_chosen": -62.82402801513672,
"logps/ref_rejected": -74.9607162475586,
"logps/rejected": -78.80721282958984,
"loss": 1.3326,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.25794392824172974,
"rewards/margins": 0.07055442780256271,
"rewards/rejected": -0.32849836349487305,
"step": 88
},
{
"epoch": 0.1345427059712774,
"epsilon_dpo/beta": 0.08519317209720612,
"epsilon_dpo/beta_margin_grad_mean": -0.4714643955230713,
"epsilon_dpo/beta_margin_grad_std": 0.05181068181991577,
"epsilon_dpo/beta_margin_mean": 0.11607452481985092,
"epsilon_dpo/beta_margin_std": 0.2124992311000824,
"epsilon_dpo/loss_margin_mean": 1.3756203651428223,
"grad_norm": 25.38898277282715,
"kl/avg_steps": 0.5,
"kl/beta": 0.0856127217411995,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.984596161153135e-07,
"logits/chosen": -0.19296492636203766,
"logits/rejected": -0.3612852096557617,
"logps/chosen": -43.72795867919922,
"logps/ref_chosen": -41.191436767578125,
"logps/ref_rejected": -85.44769287109375,
"logps/rejected": -89.3598403930664,
"loss": 1.2847,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.21668045222759247,
"rewards/margins": 0.11607441306114197,
"rewards/rejected": -0.33275485038757324,
"step": 89
},
{
"epoch": 0.1360544217687075,
"epsilon_dpo/beta": 0.08479595184326172,
"epsilon_dpo/beta_margin_grad_mean": -0.4741279184818268,
"epsilon_dpo/beta_margin_grad_std": 0.05333807319402695,
"epsilon_dpo/beta_margin_mean": 0.1047237440943718,
"epsilon_dpo/beta_margin_std": 0.21627415716648102,
"epsilon_dpo/loss_margin_mean": 1.251347303390503,
"grad_norm": 25.27131462097168,
"kl/avg_steps": 0.46875,
"kl/beta": 0.08518678694963455,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.983095894354857e-07,
"logits/chosen": -0.25080180168151855,
"logits/rejected": -0.2997087240219116,
"logps/chosen": -59.681705474853516,
"logps/ref_chosen": -56.58390808105469,
"logps/ref_rejected": -86.86978149414062,
"logps/rejected": -91.21892547607422,
"loss": 1.2959,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.263161838054657,
"rewards/margins": 0.10472376644611359,
"rewards/rejected": -0.3678855895996094,
"step": 90
},
{
"epoch": 0.13756613756613756,
"epsilon_dpo/beta": 0.08453282713890076,
"epsilon_dpo/beta_margin_grad_mean": -0.47601643204689026,
"epsilon_dpo/beta_margin_grad_std": 0.05607705935835838,
"epsilon_dpo/beta_margin_mean": 0.09787020087242126,
"epsilon_dpo/beta_margin_std": 0.23042532801628113,
"epsilon_dpo/loss_margin_mean": 1.1759456396102905,
"grad_norm": 21.97345542907715,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08478934317827225,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.98152617002662e-07,
"logits/chosen": -0.2475605607032776,
"logits/rejected": -0.3375104069709778,
"logps/chosen": -55.47486877441406,
"logps/ref_chosen": -52.38234329223633,
"logps/ref_rejected": -72.17642211914062,
"logps/rejected": -76.44489288330078,
"loss": 1.3039,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2622981369495392,
"rewards/margins": 0.09787018597126007,
"rewards/rejected": -0.36016833782196045,
"step": 91
},
{
"epoch": 0.13907785336356765,
"epsilon_dpo/beta": 0.0842430591583252,
"epsilon_dpo/beta_margin_grad_mean": -0.4685448706150055,
"epsilon_dpo/beta_margin_grad_std": 0.06363333016633987,
"epsilon_dpo/beta_margin_mean": 0.12894244492053986,
"epsilon_dpo/beta_margin_std": 0.2619534730911255,
"epsilon_dpo/loss_margin_mean": 1.550490379333496,
"grad_norm": 23.472488403320312,
"kl/avg_steps": 0.34375,
"kl/beta": 0.0845251977443695,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.979887032076988e-07,
"logits/chosen": -0.2954648733139038,
"logits/rejected": -0.28664323687553406,
"logps/chosen": -56.24725341796875,
"logps/ref_chosen": -53.00870132446289,
"logps/ref_rejected": -79.77813720703125,
"logps/rejected": -84.56717681884766,
"loss": 1.2784,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.2737791836261749,
"rewards/margins": 0.12894247472286224,
"rewards/rejected": -0.402721643447876,
"step": 92
},
{
"epoch": 0.14058956916099774,
"epsilon_dpo/beta": 0.0840861052274704,
"epsilon_dpo/beta_margin_grad_mean": -0.4796208143234253,
"epsilon_dpo/beta_margin_grad_std": 0.06429679691791534,
"epsilon_dpo/beta_margin_mean": 0.08413957059383392,
"epsilon_dpo/beta_margin_std": 0.26654988527297974,
"epsilon_dpo/loss_margin_mean": 1.022035837173462,
"grad_norm": 20.898527145385742,
"kl/avg_steps": 0.1875,
"kl/beta": 0.08423563838005066,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 4.978178526356172e-07,
"logits/chosen": -0.28997743129730225,
"logits/rejected": -0.2752231955528259,
"logps/chosen": -48.44411849975586,
"logps/ref_chosen": -44.90705108642578,
"logps/ref_rejected": -58.7879524230957,
"logps/rejected": -63.34705352783203,
"loss": 1.3213,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.29818129539489746,
"rewards/margins": 0.08413958549499512,
"rewards/rejected": -0.3823208808898926,
"step": 93
},
{
"epoch": 0.1421012849584278,
"epsilon_dpo/beta": 0.08366596698760986,
"epsilon_dpo/beta_margin_grad_mean": -0.46264174580574036,
"epsilon_dpo/beta_margin_grad_std": 0.079315185546875,
"epsilon_dpo/beta_margin_mean": 0.15383951365947723,
"epsilon_dpo/beta_margin_std": 0.33543291687965393,
"epsilon_dpo/loss_margin_mean": 1.8606209754943848,
"grad_norm": 23.870315551757812,
"kl/avg_steps": 0.5,
"kl/beta": 0.08407799154520035,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.976400700654751e-07,
"logits/chosen": -0.2582881450653076,
"logits/rejected": -0.2828645706176758,
"logps/chosen": -63.228511810302734,
"logps/ref_chosen": -59.93777084350586,
"logps/ref_rejected": -79.3138427734375,
"logps/rejected": -84.46520233154297,
"loss": 1.2656,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.27621322870254517,
"rewards/margins": 0.15383949875831604,
"rewards/rejected": -0.4300526976585388,
"step": 94
},
{
"epoch": 0.1436130007558579,
"epsilon_dpo/beta": 0.08340659737586975,
"epsilon_dpo/beta_margin_grad_mean": -0.4681813716888428,
"epsilon_dpo/beta_margin_grad_std": 0.06947793811559677,
"epsilon_dpo/beta_margin_mean": 0.13020388782024384,
"epsilon_dpo/beta_margin_std": 0.28408244252204895,
"epsilon_dpo/loss_margin_mean": 1.5844521522521973,
"grad_norm": 24.884397506713867,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08365969359874725,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.974553604702332e-07,
"logits/chosen": -0.26567769050598145,
"logits/rejected": -0.38477879762649536,
"logps/chosen": -64.4288558959961,
"logps/ref_chosen": -60.168487548828125,
"logps/ref_rejected": -90.73665618896484,
"logps/rejected": -96.58148193359375,
"loss": 1.2802,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.35575753450393677,
"rewards/margins": 0.1302039623260498,
"rewards/rejected": -0.4859614968299866,
"step": 95
},
{
"epoch": 0.14512471655328799,
"epsilon_dpo/beta": 0.08312069624662399,
"epsilon_dpo/beta_margin_grad_mean": -0.46429139375686646,
"epsilon_dpo/beta_margin_grad_std": 0.09034043550491333,
"epsilon_dpo/beta_margin_mean": 0.15051960945129395,
"epsilon_dpo/beta_margin_std": 0.38265353441238403,
"epsilon_dpo/loss_margin_mean": 1.8390917778015137,
"grad_norm": 23.29682731628418,
"kl/avg_steps": 0.34375,
"kl/beta": 0.08339907228946686,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.972637290166157e-07,
"logits/chosen": -0.23969542980194092,
"logits/rejected": -0.29741132259368896,
"logps/chosen": -64.8504867553711,
"logps/ref_chosen": -60.66877746582031,
"logps/ref_rejected": -88.30673217773438,
"logps/rejected": -94.32752990722656,
"loss": 1.2768,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.34894490242004395,
"rewards/margins": 0.15051962435245514,
"rewards/rejected": -0.4994645118713379,
"step": 96
},
{
"epoch": 0.14663643235071808,
"epsilon_dpo/beta": 0.08293985575437546,
"epsilon_dpo/beta_margin_grad_mean": -0.48031315207481384,
"epsilon_dpo/beta_margin_grad_std": 0.09844296425580978,
"epsilon_dpo/beta_margin_mean": 0.08118901401758194,
"epsilon_dpo/beta_margin_std": 0.43013235926628113,
"epsilon_dpo/loss_margin_mean": 1.0124452114105225,
"grad_norm": 29.83897590637207,
"kl/avg_steps": 0.21875,
"kl/beta": 0.08311337232589722,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 4.970651810649666e-07,
"logits/chosen": -0.25200527906417847,
"logits/rejected": -0.3504447937011719,
"logps/chosen": -70.09553527832031,
"logps/ref_chosen": -65.04412841796875,
"logps/ref_rejected": -78.42092895507812,
"logps/rejected": -84.48478698730469,
"loss": 1.3509,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.42023158073425293,
"rewards/margins": 0.08118899166584015,
"rewards/rejected": -0.5014206171035767,
"step": 97
},
{
"epoch": 0.14814814814814814,
"epsilon_dpo/beta": 0.08266797661781311,
"epsilon_dpo/beta_margin_grad_mean": -0.4853890538215637,
"epsilon_dpo/beta_margin_grad_std": 0.06719968467950821,
"epsilon_dpo/beta_margin_mean": 0.05843065306544304,
"epsilon_dpo/beta_margin_std": 0.27545446157455444,
"epsilon_dpo/loss_margin_mean": 0.729252278804779,
"grad_norm": 24.91126251220703,
"kl/avg_steps": 0.328125,
"kl/beta": 0.08293195813894272,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.968597221690985e-07,
"logits/chosen": -0.1983109712600708,
"logits/rejected": -0.286150723695755,
"logps/chosen": -60.181182861328125,
"logps/ref_chosen": -55.503231048583984,
"logps/ref_rejected": -72.81553649902344,
"logps/rejected": -78.22274780273438,
"loss": 1.3475,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.3883008360862732,
"rewards/margins": 0.058430641889572144,
"rewards/rejected": -0.44673144817352295,
"step": 98
},
{
"epoch": 0.14965986394557823,
"epsilon_dpo/beta": 0.08251398801803589,
"epsilon_dpo/beta_margin_grad_mean": -0.4797271490097046,
"epsilon_dpo/beta_margin_grad_std": 0.09218871593475342,
"epsilon_dpo/beta_margin_mean": 0.08427122235298157,
"epsilon_dpo/beta_margin_std": 0.38877469301223755,
"epsilon_dpo/loss_margin_mean": 1.052944540977478,
"grad_norm": 26.6590633392334,
"kl/avg_steps": 0.1875,
"kl/beta": 0.0826607272028923,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 4.966473580761389e-07,
"logits/chosen": -0.3002532422542572,
"logits/rejected": -0.34748363494873047,
"logps/chosen": -63.325279235839844,
"logps/ref_chosen": -58.57563781738281,
"logps/ref_rejected": -78.69361114501953,
"logps/rejected": -84.4961929321289,
"loss": 1.3405,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.39335766434669495,
"rewards/margins": 0.08427121490240097,
"rewards/rejected": -0.4776288866996765,
"step": 99
},
{
"epoch": 0.15117157974300832,
"epsilon_dpo/beta": 0.08226918429136276,
"epsilon_dpo/beta_margin_grad_mean": -0.47224295139312744,
"epsilon_dpo/beta_margin_grad_std": 0.11228302866220474,
"epsilon_dpo/beta_margin_mean": 0.12067549675703049,
"epsilon_dpo/beta_margin_std": 0.5142140984535217,
"epsilon_dpo/loss_margin_mean": 1.502497673034668,
"grad_norm": 27.125337600708008,
"kl/avg_steps": 0.296875,
"kl/beta": 0.08250602334737778,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.964280947263676e-07,
"logits/chosen": -0.25578558444976807,
"logits/rejected": -0.24466118216514587,
"logps/chosen": -84.99995422363281,
"logps/ref_chosen": -79.58343505859375,
"logps/ref_rejected": -92.152587890625,
"logps/rejected": -99.07160949707031,
"loss": 1.3305,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.44687801599502563,
"rewards/margins": 0.1206754669547081,
"rewards/rejected": -0.5675535202026367,
"step": 100
},
{
"epoch": 0.15117157974300832,
"eval_epsilon_dpo/beta": 0.08208680897951126,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.47245821356773376,
"eval_epsilon_dpo/beta_margin_grad_std": 0.09693592041730881,
"eval_epsilon_dpo/beta_margin_mean": 0.11633308976888657,
"eval_epsilon_dpo/beta_margin_std": 0.4194590151309967,
"eval_epsilon_dpo/loss_margin_mean": 1.4523829221725464,
"eval_kl/n_epsilon_steps": 0.3882042169570923,
"eval_kl/p_epsilon_steps": 0.61091548204422,
"eval_logits/chosen": -0.21332350373268127,
"eval_logits/rejected": -0.29548704624176025,
"eval_logps/chosen": -79.80242156982422,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -85.94432830810547,
"eval_loss": 0.6591081023216248,
"eval_rewards/accuracies": 0.6298415660858154,
"eval_rewards/chosen": -0.4072907269001007,
"eval_rewards/margins": 0.11633308976888657,
"eval_rewards/rejected": -0.5236237645149231,
"eval_runtime": 41.9343,
"eval_samples_per_second": 54.919,
"eval_steps_per_second": 1.717,
"step": 100
},
{
"epoch": 0.15268329554043839,
"epsilon_dpo/beta": 0.08188439905643463,
"epsilon_dpo/beta_margin_grad_mean": -0.4580801725387573,
"epsilon_dpo/beta_margin_grad_std": 0.08454929292201996,
"epsilon_dpo/beta_margin_mean": 0.17178460955619812,
"epsilon_dpo/beta_margin_std": 0.3538208305835724,
"epsilon_dpo/loss_margin_mean": 2.123854875564575,
"grad_norm": 21.476987838745117,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0822618156671524,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.96201938253052e-07,
"logits/chosen": -0.3077951967716217,
"logits/rejected": -0.3784474730491638,
"logps/chosen": -56.707183837890625,
"logps/ref_chosen": -52.332786560058594,
"logps/ref_rejected": -69.55589294433594,
"logps/rejected": -76.05415344238281,
"loss": 1.2524,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3596314787864685,
"rewards/margins": 0.1717846393585205,
"rewards/rejected": -0.531416118144989,
"step": 101
},
{
"epoch": 0.15419501133786848,
"epsilon_dpo/beta": 0.0819629579782486,
"epsilon_dpo/beta_margin_grad_mean": -0.4849558472633362,
"epsilon_dpo/beta_margin_grad_std": 0.10686981678009033,
"epsilon_dpo/beta_margin_mean": 0.07028303295373917,
"epsilon_dpo/beta_margin_std": 0.46668335795402527,
"epsilon_dpo/loss_margin_mean": 0.897625207901001,
"grad_norm": 26.247426986694336,
"kl/avg_steps": -0.09375,
"kl/beta": 0.08187800645828247,
"kl/n_epsilon_steps": 0.546875,
"kl/p_epsilon_steps": 0.453125,
"learning_rate": 4.959688949822748e-07,
"logits/chosen": -0.26697877049446106,
"logits/rejected": -0.37949949502944946,
"logps/chosen": -70.19974517822266,
"logps/ref_chosen": -64.74348449707031,
"logps/ref_rejected": -69.06133270263672,
"logps/rejected": -75.41522216796875,
"loss": 1.3691,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.448466032743454,
"rewards/margins": 0.07028304040431976,
"rewards/rejected": -0.5187490582466125,
"step": 102
},
{
"epoch": 0.15570672713529857,
"epsilon_dpo/beta": 0.08170690387487411,
"epsilon_dpo/beta_margin_grad_mean": -0.4686320722103119,
"epsilon_dpo/beta_margin_grad_std": 0.07682781666517258,
"epsilon_dpo/beta_margin_mean": 0.12938132882118225,
"epsilon_dpo/beta_margin_std": 0.3163187503814697,
"epsilon_dpo/loss_margin_mean": 1.610579490661621,
"grad_norm": 24.425413131713867,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08195484429597855,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.957289714327572e-07,
"logits/chosen": -0.22854191064834595,
"logits/rejected": -0.3113904297351837,
"logps/chosen": -68.79859161376953,
"logps/ref_chosen": -63.836647033691406,
"logps/ref_rejected": -79.3236312866211,
"logps/rejected": -85.89615631103516,
"loss": 1.2857,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.40643417835235596,
"rewards/margins": 0.12938141822814941,
"rewards/rejected": -0.5358155965805054,
"step": 103
},
{
"epoch": 0.15721844293272866,
"epsilon_dpo/beta": 0.08147788792848587,
"epsilon_dpo/beta_margin_grad_mean": -0.4568372964859009,
"epsilon_dpo/beta_margin_grad_std": 0.12087428569793701,
"epsilon_dpo/beta_margin_mean": 0.19555088877677917,
"epsilon_dpo/beta_margin_std": 0.5607696771621704,
"epsilon_dpo/loss_margin_mean": 2.441429615020752,
"grad_norm": 25.43903350830078,
"kl/avg_steps": 0.28125,
"kl/beta": 0.08169952780008316,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.954821743156767e-07,
"logits/chosen": -0.2984537184238434,
"logits/rejected": -0.3845409154891968,
"logps/chosen": -65.84523010253906,
"logps/ref_chosen": -60.99920654296875,
"logps/ref_rejected": -98.8464584350586,
"logps/rejected": -106.13390350341797,
"loss": 1.2719,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.39637333154678345,
"rewards/margins": 0.19555078446865082,
"rewards/rejected": -0.5919241309165955,
"step": 104
},
{
"epoch": 0.15873015873015872,
"epsilon_dpo/beta": 0.08122391998767853,
"epsilon_dpo/beta_margin_grad_mean": -0.465393602848053,
"epsilon_dpo/beta_margin_grad_std": 0.09943027794361115,
"epsilon_dpo/beta_margin_mean": 0.14796897768974304,
"epsilon_dpo/beta_margin_std": 0.42563724517822266,
"epsilon_dpo/loss_margin_mean": 1.8564667701721191,
"grad_norm": 25.51272201538086,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08147039264440536,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.952285105344791e-07,
"logits/chosen": -0.2829214334487915,
"logits/rejected": -0.3911677598953247,
"logps/chosen": -76.53125,
"logps/ref_chosen": -70.95027160644531,
"logps/ref_rejected": -87.88340759277344,
"logps/rejected": -95.32086181640625,
"loss": 1.2873,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.45500004291534424,
"rewards/margins": 0.14796897768974304,
"rewards/rejected": -0.6029690504074097,
"step": 105
},
{
"epoch": 0.1602418745275888,
"epsilon_dpo/beta": 0.08094550669193268,
"epsilon_dpo/beta_margin_grad_mean": -0.4687478542327881,
"epsilon_dpo/beta_margin_grad_std": 0.10816415399312973,
"epsilon_dpo/beta_margin_mean": 0.12886309623718262,
"epsilon_dpo/beta_margin_std": 0.4681468605995178,
"epsilon_dpo/loss_margin_mean": 1.6295931339263916,
"grad_norm": 25.131946563720703,
"kl/avg_steps": 0.34375,
"kl/beta": 0.08121659606695175,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.949679871846857e-07,
"logits/chosen": -0.314166784286499,
"logits/rejected": -0.3277107775211334,
"logps/chosen": -67.71137237548828,
"logps/ref_chosen": -62.45933151245117,
"logps/ref_rejected": -67.00595092773438,
"logps/rejected": -73.88758850097656,
"loss": 1.3141,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.42657342553138733,
"rewards/margins": 0.1288631111383438,
"rewards/rejected": -0.5554364919662476,
"step": 106
},
{
"epoch": 0.1617535903250189,
"epsilon_dpo/beta": 0.08074409514665604,
"epsilon_dpo/beta_margin_grad_mean": -0.48306161165237427,
"epsilon_dpo/beta_margin_grad_std": 0.12286480516195297,
"epsilon_dpo/beta_margin_mean": 0.06496331840753555,
"epsilon_dpo/beta_margin_std": 0.5415164828300476,
"epsilon_dpo/loss_margin_mean": 0.8508073687553406,
"grad_norm": 33.21994400024414,
"kl/avg_steps": 0.25,
"kl/beta": 0.08093836903572083,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 4.947006115536947e-07,
"logits/chosen": -0.3120744228363037,
"logits/rejected": -0.2999732196331024,
"logps/chosen": -82.71369934082031,
"logps/ref_chosen": -75.83796691894531,
"logps/ref_rejected": -87.74038696289062,
"logps/rejected": -95.4669189453125,
"loss": 1.3921,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5573137998580933,
"rewards/margins": 0.06496329605579376,
"rewards/rejected": -0.6222771406173706,
"step": 107
},
{
"epoch": 0.16326530612244897,
"epsilon_dpo/beta": 0.08056797087192535,
"epsilon_dpo/beta_margin_grad_mean": -0.46542906761169434,
"epsilon_dpo/beta_margin_grad_std": 0.10896296054124832,
"epsilon_dpo/beta_margin_mean": 0.1475972682237625,
"epsilon_dpo/beta_margin_std": 0.4702468514442444,
"epsilon_dpo/loss_margin_mean": 1.8718154430389404,
"grad_norm": 23.493183135986328,
"kl/avg_steps": 0.21875,
"kl/beta": 0.08073652535676956,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 4.944263911205772e-07,
"logits/chosen": -0.2802099883556366,
"logits/rejected": -0.3116492033004761,
"logps/chosen": -74.2529296875,
"logps/ref_chosen": -68.39323425292969,
"logps/ref_rejected": -83.24267578125,
"logps/rejected": -90.97418212890625,
"loss": 1.2971,
"rewards/accuracies": 0.578125,
"rewards/chosen": -0.4733844995498657,
"rewards/margins": 0.1475972682237625,
"rewards/rejected": -0.6209816932678223,
"step": 108
},
{
"epoch": 0.16477702191987906,
"epsilon_dpo/beta": 0.08031658083200455,
"epsilon_dpo/beta_margin_grad_mean": -0.4409874975681305,
"epsilon_dpo/beta_margin_grad_std": 0.10245691239833832,
"epsilon_dpo/beta_margin_mean": 0.26134350895881653,
"epsilon_dpo/beta_margin_std": 0.4796282649040222,
"epsilon_dpo/loss_margin_mean": 3.286970615386963,
"grad_norm": 23.221473693847656,
"kl/avg_steps": 0.3125,
"kl/beta": 0.08056030422449112,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.941453335558681e-07,
"logits/chosen": -0.2664685845375061,
"logits/rejected": -0.3282352387905121,
"logps/chosen": -60.65399169921875,
"logps/ref_chosen": -55.52748107910156,
"logps/ref_rejected": -83.55218505859375,
"logps/rejected": -91.96566772460938,
"loss": 1.1938,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.41318178176879883,
"rewards/margins": 0.2613435387611389,
"rewards/rejected": -0.6745253205299377,
"step": 109
},
{
"epoch": 0.16628873771730915,
"epsilon_dpo/beta": 0.08024206757545471,
"epsilon_dpo/beta_margin_grad_mean": -0.4952909052371979,
"epsilon_dpo/beta_margin_grad_std": 0.10888107866048813,
"epsilon_dpo/beta_margin_mean": 0.012063674628734589,
"epsilon_dpo/beta_margin_std": 0.4786463975906372,
"epsilon_dpo/loss_margin_mean": 0.1897306740283966,
"grad_norm": 35.83425521850586,
"kl/avg_steps": 0.09375,
"kl/beta": 0.08030933141708374,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 4.938574467213517e-07,
"logits/chosen": -0.34494251012802124,
"logits/rejected": -0.37497127056121826,
"logps/chosen": -87.87454223632812,
"logps/ref_chosen": -81.15874481201172,
"logps/ref_rejected": -72.56021118164062,
"logps/rejected": -79.46573638916016,
"loss": 1.4287,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5410492420196533,
"rewards/margins": 0.012063663452863693,
"rewards/rejected": -0.5531128644943237,
"step": 110
},
{
"epoch": 0.16780045351473924,
"epsilon_dpo/beta": 0.08016691356897354,
"epsilon_dpo/beta_margin_grad_mean": -0.4615313708782196,
"epsilon_dpo/beta_margin_grad_std": 0.10746827721595764,
"epsilon_dpo/beta_margin_mean": 0.1679041087627411,
"epsilon_dpo/beta_margin_std": 0.46593236923217773,
"epsilon_dpo/loss_margin_mean": 2.137284994125366,
"grad_norm": 20.94742774963379,
"kl/avg_steps": 0.09375,
"kl/beta": 0.0802341178059578,
"kl/n_epsilon_steps": 0.453125,
"kl/p_epsilon_steps": 0.546875,
"learning_rate": 4.935627386698418e-07,
"logits/chosen": -0.24417856335639954,
"logits/rejected": -0.2752796411514282,
"logps/chosen": -58.40732192993164,
"logps/ref_chosen": -52.358985900878906,
"logps/ref_rejected": -77.06150817871094,
"logps/rejected": -85.24712371826172,
"loss": 1.2771,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.4860393702983856,
"rewards/margins": 0.16790412366390228,
"rewards/rejected": -0.6539434790611267,
"step": 111
},
{
"epoch": 0.1693121693121693,
"epsilon_dpo/beta": 0.07976614683866501,
"epsilon_dpo/beta_margin_grad_mean": -0.45119598507881165,
"epsilon_dpo/beta_margin_grad_std": 0.0995359718799591,
"epsilon_dpo/beta_margin_mean": 0.20377379655838013,
"epsilon_dpo/beta_margin_std": 0.4216992259025574,
"epsilon_dpo/loss_margin_mean": 2.5878348350524902,
"grad_norm": 26.345199584960938,
"kl/avg_steps": 0.5,
"kl/beta": 0.08015896379947662,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.932612176449559e-07,
"logits/chosen": -0.2106427252292633,
"logits/rejected": -0.2692835330963135,
"logps/chosen": -68.82554626464844,
"logps/ref_chosen": -63.02006912231445,
"logps/ref_rejected": -111.36941528320312,
"logps/rejected": -119.76272583007812,
"loss": 1.2358,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.46483007073402405,
"rewards/margins": 0.20377382636070251,
"rewards/rejected": -0.6686038970947266,
"step": 112
},
{
"epoch": 0.1708238851095994,
"epsilon_dpo/beta": 0.07959365099668503,
"epsilon_dpo/beta_margin_grad_mean": -0.47365984320640564,
"epsilon_dpo/beta_margin_grad_std": 0.09783027321100235,
"epsilon_dpo/beta_margin_mean": 0.10828649252653122,
"epsilon_dpo/beta_margin_std": 0.4176913797855377,
"epsilon_dpo/loss_margin_mean": 1.3970348834991455,
"grad_norm": 28.65127182006836,
"kl/avg_steps": 0.21875,
"kl/beta": 0.07976016402244568,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 4.929528920808854e-07,
"logits/chosen": -0.30682387948036194,
"logits/rejected": -0.37203821539878845,
"logps/chosen": -61.56267547607422,
"logps/ref_chosen": -55.80766296386719,
"logps/ref_rejected": -69.84014129638672,
"logps/rejected": -76.9921875,
"loss": 1.323,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.45990556478500366,
"rewards/margins": 0.10828644037246704,
"rewards/rejected": -0.5681920051574707,
"step": 113
},
{
"epoch": 0.17233560090702948,
"epsilon_dpo/beta": 0.0793701708316803,
"epsilon_dpo/beta_margin_grad_mean": -0.4555838406085968,
"epsilon_dpo/beta_margin_grad_std": 0.09699393063783646,
"epsilon_dpo/beta_margin_mean": 0.1890346109867096,
"epsilon_dpo/beta_margin_std": 0.41326647996902466,
"epsilon_dpo/loss_margin_mean": 2.418400526046753,
"grad_norm": 24.078357696533203,
"kl/avg_steps": 0.28125,
"kl/beta": 0.07958607375621796,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.92637770602159e-07,
"logits/chosen": -0.22454284131526947,
"logits/rejected": -0.4311988353729248,
"logps/chosen": -71.88116455078125,
"logps/ref_chosen": -66.33277130126953,
"logps/ref_rejected": -71.61489868164062,
"logps/rejected": -79.58169555664062,
"loss": 1.2472,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.44173866510391235,
"rewards/margins": 0.1890345811843872,
"rewards/rejected": -0.6307732462882996,
"step": 114
},
{
"epoch": 0.17384731670445955,
"epsilon_dpo/beta": 0.07909796386957169,
"epsilon_dpo/beta_margin_grad_mean": -0.46361038088798523,
"epsilon_dpo/beta_margin_grad_std": 0.09177512675523758,
"epsilon_dpo/beta_margin_mean": 0.15344859659671783,
"epsilon_dpo/beta_margin_std": 0.38923752307891846,
"epsilon_dpo/loss_margin_mean": 1.971751093864441,
"grad_norm": 22.831642150878906,
"kl/avg_steps": 0.34375,
"kl/beta": 0.07936286181211472,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.923158620234019e-07,
"logits/chosen": -0.26415345072746277,
"logits/rejected": -0.36682528257369995,
"logps/chosen": -61.73161697387695,
"logps/ref_chosen": -55.74903869628906,
"logps/ref_rejected": -79.59849548339844,
"logps/rejected": -87.55282592773438,
"loss": 1.2753,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.4741614758968353,
"rewards/margins": 0.15344858169555664,
"rewards/rejected": -0.6276100873947144,
"step": 115
},
{
"epoch": 0.17535903250188964,
"epsilon_dpo/beta": 0.07877755910158157,
"epsilon_dpo/beta_margin_grad_mean": -0.4412730038166046,
"epsilon_dpo/beta_margin_grad_std": 0.0889286994934082,
"epsilon_dpo/beta_margin_mean": 0.24470652639865875,
"epsilon_dpo/beta_margin_std": 0.37284860014915466,
"epsilon_dpo/loss_margin_mean": 3.137737274169922,
"grad_norm": 21.381956100463867,
"kl/avg_steps": 0.40625,
"kl/beta": 0.07909099012613297,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.91987175349089e-07,
"logits/chosen": -0.281636506319046,
"logits/rejected": -0.3857163190841675,
"logps/chosen": -54.97114944458008,
"logps/ref_chosen": -49.365169525146484,
"logps/ref_rejected": -72.84671020507812,
"logps/rejected": -81.59042358398438,
"loss": 1.1902,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.442451536655426,
"rewards/margins": 0.24470648169517517,
"rewards/rejected": -0.6871579885482788,
"step": 116
},
{
"epoch": 0.17687074829931973,
"epsilon_dpo/beta": 0.07850805670022964,
"epsilon_dpo/beta_margin_grad_mean": -0.4636790156364441,
"epsilon_dpo/beta_margin_grad_std": 0.0962023064494133,
"epsilon_dpo/beta_margin_mean": 0.150344118475914,
"epsilon_dpo/beta_margin_std": 0.4104200601577759,
"epsilon_dpo/loss_margin_mean": 1.949401617050171,
"grad_norm": 22.659311294555664,
"kl/avg_steps": 0.34375,
"kl/beta": 0.07877098023891449,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.916517197732933e-07,
"logits/chosen": -0.2831147015094757,
"logits/rejected": -0.3588418960571289,
"logps/chosen": -63.401885986328125,
"logps/ref_chosen": -57.710899353027344,
"logps/ref_rejected": -69.77254486083984,
"logps/rejected": -77.41293334960938,
"loss": 1.2822,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.44801393151283264,
"rewards/margins": 0.1503440886735916,
"rewards/rejected": -0.5983580350875854,
"step": 117
},
{
"epoch": 0.17838246409674982,
"epsilon_dpo/beta": 0.07821457833051682,
"epsilon_dpo/beta_margin_grad_mean": -0.44928446412086487,
"epsilon_dpo/beta_margin_grad_std": 0.08310793340206146,
"epsilon_dpo/beta_margin_mean": 0.2098846435546875,
"epsilon_dpo/beta_margin_std": 0.34501397609710693,
"epsilon_dpo/loss_margin_mean": 2.714132070541382,
"grad_norm": 22.579818725585938,
"kl/avg_steps": 0.375,
"kl/beta": 0.0785011351108551,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.913095046794281e-07,
"logits/chosen": -0.27549389004707336,
"logits/rejected": -0.36111223697662354,
"logps/chosen": -58.01707458496094,
"logps/ref_chosen": -52.479896545410156,
"logps/ref_rejected": -81.35912322998047,
"logps/rejected": -89.61042785644531,
"loss": 1.2164,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.4339134693145752,
"rewards/margins": 0.20988470315933228,
"rewards/rejected": -0.6437982320785522,
"step": 118
},
{
"epoch": 0.17989417989417988,
"epsilon_dpo/beta": 0.07806901633739471,
"epsilon_dpo/beta_margin_grad_mean": -0.4660235345363617,
"epsilon_dpo/beta_margin_grad_std": 0.10985706746578217,
"epsilon_dpo/beta_margin_mean": 0.14664211869239807,
"epsilon_dpo/beta_margin_std": 0.48551133275032043,
"epsilon_dpo/loss_margin_mean": 1.9220465421676636,
"grad_norm": 23.248210906982422,
"kl/avg_steps": 0.1875,
"kl/beta": 0.07820785045623779,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 4.909605396399855e-07,
"logits/chosen": -0.32200682163238525,
"logits/rejected": -0.3813210129737854,
"logps/chosen": -68.35606384277344,
"logps/ref_chosen": -61.35767364501953,
"logps/ref_rejected": -75.71510314941406,
"logps/rejected": -84.63553619384766,
"loss": 1.3006,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5482773184776306,
"rewards/margins": 0.14664211869239807,
"rewards/rejected": -0.6949194073677063,
"step": 119
},
{
"epoch": 0.18140589569160998,
"epsilon_dpo/beta": 0.07765454798936844,
"epsilon_dpo/beta_margin_grad_mean": -0.4431649148464203,
"epsilon_dpo/beta_margin_grad_std": 0.08065138012170792,
"epsilon_dpo/beta_margin_mean": 0.23651456832885742,
"epsilon_dpo/beta_margin_std": 0.34066343307495117,
"epsilon_dpo/loss_margin_mean": 3.0703203678131104,
"grad_norm": 23.334104537963867,
"kl/avg_steps": 0.53125,
"kl/beta": 0.07806148380041122,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.906048344162676e-07,
"logits/chosen": -0.28048282861709595,
"logits/rejected": -0.4235890507698059,
"logps/chosen": -65.68623352050781,
"logps/ref_chosen": -59.907569885253906,
"logps/ref_rejected": -79.6910629272461,
"logps/rejected": -88.54005432128906,
"loss": 1.1917,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.44951748847961426,
"rewards/margins": 0.23651456832885742,
"rewards/rejected": -0.6860320568084717,
"step": 120
},
{
"epoch": 0.18291761148904007,
"epsilon_dpo/beta": 0.07734125852584839,
"epsilon_dpo/beta_margin_grad_mean": -0.44082891941070557,
"epsilon_dpo/beta_margin_grad_std": 0.0972229614853859,
"epsilon_dpo/beta_margin_mean": 0.2585199773311615,
"epsilon_dpo/beta_margin_std": 0.44502782821655273,
"epsilon_dpo/loss_margin_mean": 3.3688769340515137,
"grad_norm": 21.977359771728516,
"kl/avg_steps": 0.40625,
"kl/beta": 0.07764897495508194,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.902423989581143e-07,
"logits/chosen": -0.28811100125312805,
"logits/rejected": -0.3832913637161255,
"logps/chosen": -62.00624084472656,
"logps/ref_chosen": -55.666046142578125,
"logps/ref_rejected": -101.56233978271484,
"logps/rejected": -111.27141571044922,
"loss": 1.1898,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.4912344813346863,
"rewards/margins": 0.2585200369358063,
"rewards/rejected": -0.7497545480728149,
"step": 121
},
{
"epoch": 0.18442932728647016,
"epsilon_dpo/beta": 0.07712501287460327,
"epsilon_dpo/beta_margin_grad_mean": -0.45105886459350586,
"epsilon_dpo/beta_margin_grad_std": 0.11214206367731094,
"epsilon_dpo/beta_margin_mean": 0.2194489687681198,
"epsilon_dpo/beta_margin_std": 0.5113435983657837,
"epsilon_dpo/loss_margin_mean": 2.885754108428955,
"grad_norm": 24.13726234436035,
"kl/avg_steps": 0.28125,
"kl/beta": 0.07733480632305145,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.898732434036243e-07,
"logits/chosen": -0.3566771447658539,
"logits/rejected": -0.38670986890792847,
"logps/chosen": -70.29400634765625,
"logps/ref_chosen": -63.334373474121094,
"logps/ref_rejected": -73.67523193359375,
"logps/rejected": -83.52062225341797,
"loss": 1.239,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.5386757850646973,
"rewards/margins": 0.219448983669281,
"rewards/rejected": -0.7581247687339783,
"step": 122
},
{
"epoch": 0.18594104308390022,
"epsilon_dpo/beta": 0.07676409929990768,
"epsilon_dpo/beta_margin_grad_mean": -0.4485389292240143,
"epsilon_dpo/beta_margin_grad_std": 0.0899190604686737,
"epsilon_dpo/beta_margin_mean": 0.2186276763677597,
"epsilon_dpo/beta_margin_std": 0.39453816413879395,
"epsilon_dpo/loss_margin_mean": 2.8759868144989014,
"grad_norm": 21.403152465820312,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0771179124712944,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.894973780788722e-07,
"logits/chosen": -0.2101168930530548,
"logits/rejected": -0.2997229993343353,
"logps/chosen": -63.53388214111328,
"logps/ref_chosen": -56.89874267578125,
"logps/ref_rejected": -78.97029113769531,
"logps/rejected": -88.48141479492188,
"loss": 1.2162,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5101251602172852,
"rewards/margins": 0.2186277210712433,
"rewards/rejected": -0.7287529110908508,
"step": 123
},
{
"epoch": 0.1874527588813303,
"epsilon_dpo/beta": 0.07638195157051086,
"epsilon_dpo/beta_margin_grad_mean": -0.43010213971138,
"epsilon_dpo/beta_margin_grad_std": 0.10084889829158783,
"epsilon_dpo/beta_margin_mean": 0.2960425913333893,
"epsilon_dpo/beta_margin_std": 0.434106707572937,
"epsilon_dpo/loss_margin_mean": 3.910611391067505,
"grad_norm": 22.229591369628906,
"kl/avg_steps": 0.5,
"kl/beta": 0.07675810903310776,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.89114813497619e-07,
"logits/chosen": -0.3432326018810272,
"logits/rejected": -0.4263257384300232,
"logps/chosen": -64.15696716308594,
"logps/ref_chosen": -57.116085052490234,
"logps/ref_rejected": -87.93074035644531,
"logps/rejected": -98.88223266601562,
"loss": 1.1569,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5391653180122375,
"rewards/margins": 0.29604262113571167,
"rewards/rejected": -0.8352078795433044,
"step": 124
},
{
"epoch": 0.1889644746787604,
"epsilon_dpo/beta": 0.07600194215774536,
"epsilon_dpo/beta_margin_grad_mean": -0.43823862075805664,
"epsilon_dpo/beta_margin_grad_std": 0.1043824702501297,
"epsilon_dpo/beta_margin_mean": 0.26712024211883545,
"epsilon_dpo/beta_margin_std": 0.46725982427597046,
"epsilon_dpo/loss_margin_mean": 3.549048900604248,
"grad_norm": 22.98603057861328,
"kl/avg_steps": 0.5,
"kl/beta": 0.07637622207403183,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.887255603610184e-07,
"logits/chosen": -0.27146202325820923,
"logits/rejected": -0.3609638512134552,
"logps/chosen": -73.373291015625,
"logps/ref_chosen": -65.7061767578125,
"logps/ref_rejected": -91.72711944580078,
"logps/rejected": -102.94328308105469,
"loss": 1.1876,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5836365818977356,
"rewards/margins": 0.26712027192115784,
"rewards/rejected": -0.8507568836212158,
"step": 125
},
{
"epoch": 0.19047619047619047,
"epsilon_dpo/beta": 0.07583758234977722,
"epsilon_dpo/beta_margin_grad_mean": -0.46983328461647034,
"epsilon_dpo/beta_margin_grad_std": 0.09309153258800507,
"epsilon_dpo/beta_margin_mean": 0.12513495981693268,
"epsilon_dpo/beta_margin_std": 0.38970738649368286,
"epsilon_dpo/loss_margin_mean": 1.688839077949524,
"grad_norm": 22.87610626220703,
"kl/avg_steps": 0.21875,
"kl/beta": 0.07599624246358871,
"kl/n_epsilon_steps": 0.390625,
"kl/p_epsilon_steps": 0.609375,
"learning_rate": 4.883296295573176e-07,
"logits/chosen": -0.34933096170425415,
"logits/rejected": -0.3598957657814026,
"logps/chosen": -75.55615234375,
"logps/ref_chosen": -68.17608642578125,
"logps/ref_rejected": -65.1175537109375,
"logps/rejected": -74.18645477294922,
"loss": 1.3021,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.5615495443344116,
"rewards/margins": 0.1251349151134491,
"rewards/rejected": -0.6866844892501831,
"step": 126
},
{
"epoch": 0.19198790627362056,
"epsilon_dpo/beta": 0.07548245787620544,
"epsilon_dpo/beta_margin_grad_mean": -0.4357949495315552,
"epsilon_dpo/beta_margin_grad_std": 0.08713133633136749,
"epsilon_dpo/beta_margin_mean": 0.2675861120223999,
"epsilon_dpo/beta_margin_std": 0.36765769124031067,
"epsilon_dpo/loss_margin_mean": 3.574740171432495,
"grad_norm": 20.705974578857422,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0758303627371788,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.87927032161552e-07,
"logits/chosen": -0.33182287216186523,
"logits/rejected": -0.4110308289527893,
"logps/chosen": -69.33089447021484,
"logps/ref_chosen": -61.88023376464844,
"logps/ref_rejected": -68.46012878417969,
"logps/rejected": -79.48552703857422,
"loss": 1.1691,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.563042163848877,
"rewards/margins": 0.2675861418247223,
"rewards/rejected": -0.8306283354759216,
"step": 127
},
{
"epoch": 0.19349962207105065,
"epsilon_dpo/beta": 0.07510669529438019,
"epsilon_dpo/beta_margin_grad_mean": -0.4305736720561981,
"epsilon_dpo/beta_margin_grad_std": 0.12716034054756165,
"epsilon_dpo/beta_margin_mean": 0.3063305616378784,
"epsilon_dpo/beta_margin_std": 0.5747892260551453,
"epsilon_dpo/loss_margin_mean": 4.1219940185546875,
"grad_norm": 22.07100486755371,
"kl/avg_steps": 0.5,
"kl/beta": 0.07547657191753387,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.875177794352363e-07,
"logits/chosen": -0.2974826395511627,
"logits/rejected": -0.39164555072784424,
"logps/chosen": -74.81300354003906,
"logps/ref_chosen": -66.708984375,
"logps/ref_rejected": -94.97969055175781,
"logps/rejected": -107.2057113647461,
"loss": 1.1796,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6093688011169434,
"rewards/margins": 0.3063305616378784,
"rewards/rejected": -0.9156993627548218,
"step": 128
},
{
"epoch": 0.19501133786848074,
"epsilon_dpo/beta": 0.07496774196624756,
"epsilon_dpo/beta_margin_grad_mean": -0.4631972908973694,
"epsilon_dpo/beta_margin_grad_std": 0.12010663747787476,
"epsilon_dpo/beta_margin_mean": 0.1557939052581787,
"epsilon_dpo/beta_margin_std": 0.5243479609489441,
"epsilon_dpo/loss_margin_mean": 2.1298911571502686,
"grad_norm": 26.728836059570312,
"kl/avg_steps": 0.1875,
"kl/beta": 0.07510106265544891,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 4.871018828260491e-07,
"logits/chosen": -0.35696378350257874,
"logits/rejected": -0.3766302466392517,
"logps/chosen": -74.52942657470703,
"logps/ref_chosen": -65.33882904052734,
"logps/ref_rejected": -68.06109619140625,
"logps/rejected": -79.381591796875,
"loss": 1.302,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.6905369758605957,
"rewards/margins": 0.1557939052581787,
"rewards/rejected": -0.8463308811187744,
"step": 129
},
{
"epoch": 0.1965230536659108,
"epsilon_dpo/beta": 0.07461659610271454,
"epsilon_dpo/beta_margin_grad_mean": -0.43477219343185425,
"epsilon_dpo/beta_margin_grad_std": 0.0998179018497467,
"epsilon_dpo/beta_margin_mean": 0.2750687599182129,
"epsilon_dpo/beta_margin_std": 0.4250439703464508,
"epsilon_dpo/loss_margin_mean": 3.723066806793213,
"grad_norm": 23.3736572265625,
"kl/avg_steps": 0.46875,
"kl/beta": 0.07496051490306854,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.866793539675126e-07,
"logits/chosen": -0.3007310628890991,
"logits/rejected": -0.3654765188694,
"logps/chosen": -67.38555908203125,
"logps/ref_chosen": -58.660743713378906,
"logps/ref_rejected": -79.24510192871094,
"logps/rejected": -91.69297790527344,
"loss": 1.1733,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6524736881256104,
"rewards/margins": 0.2750687599182129,
"rewards/rejected": -0.9275424480438232,
"step": 130
},
{
"epoch": 0.1980347694633409,
"epsilon_dpo/beta": 0.07430332899093628,
"epsilon_dpo/beta_margin_grad_mean": -0.4191279113292694,
"epsilon_dpo/beta_margin_grad_std": 0.12402309477329254,
"epsilon_dpo/beta_margin_mean": 0.35492897033691406,
"epsilon_dpo/beta_margin_std": 0.5491863489151001,
"epsilon_dpo/loss_margin_mean": 4.822208404541016,
"grad_norm": 20.391130447387695,
"kl/avg_steps": 0.421875,
"kl/beta": 0.07461077719926834,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.86250204678667e-07,
"logits/chosen": -0.1885560303926468,
"logits/rejected": -0.30237945914268494,
"logps/chosen": -60.834678649902344,
"logps/ref_chosen": -52.51454162597656,
"logps/ref_rejected": -85.18299865722656,
"logps/rejected": -98.32534790039062,
"loss": 1.1329,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6196655035018921,
"rewards/margins": 0.35492897033691406,
"rewards/rejected": -0.9745944738388062,
"step": 131
},
{
"epoch": 0.19954648526077098,
"epsilon_dpo/beta": 0.07391001284122467,
"epsilon_dpo/beta_margin_grad_mean": -0.43144744634628296,
"epsilon_dpo/beta_margin_grad_std": 0.10492806136608124,
"epsilon_dpo/beta_margin_mean": 0.28503015637397766,
"epsilon_dpo/beta_margin_std": 0.4498443305492401,
"epsilon_dpo/loss_margin_mean": 3.894169330596924,
"grad_norm": 21.18486976623535,
"kl/avg_steps": 0.53125,
"kl/beta": 0.07429733127355576,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.858144469637408e-07,
"logits/chosen": -0.3224283456802368,
"logits/rejected": -0.35772061347961426,
"logps/chosen": -75.00634765625,
"logps/ref_chosen": -65.68513488769531,
"logps/ref_rejected": -69.54120635986328,
"logps/rejected": -82.756591796875,
"loss": 1.17,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.690468430519104,
"rewards/margins": 0.28503015637397766,
"rewards/rejected": -0.9754985570907593,
"step": 132
},
{
"epoch": 0.20105820105820105,
"epsilon_dpo/beta": 0.07370422780513763,
"epsilon_dpo/beta_margin_grad_mean": -0.4493582248687744,
"epsilon_dpo/beta_margin_grad_std": 0.11538718640804291,
"epsilon_dpo/beta_margin_mean": 0.21408821642398834,
"epsilon_dpo/beta_margin_std": 0.5006569623947144,
"epsilon_dpo/loss_margin_mean": 2.9492056369781494,
"grad_norm": 24.301433563232422,
"kl/avg_steps": 0.28125,
"kl/beta": 0.07390471547842026,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.853720930118138e-07,
"logits/chosen": -0.3723902106285095,
"logits/rejected": -0.3735864758491516,
"logps/chosen": -73.47770690917969,
"logps/ref_chosen": -63.598114013671875,
"logps/ref_rejected": -73.72798156738281,
"logps/rejected": -86.5567855834961,
"loss": 1.2434,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7299602031707764,
"rewards/margins": 0.21408820152282715,
"rewards/rejected": -0.9440484046936035,
"step": 133
},
{
"epoch": 0.20256991685563114,
"epsilon_dpo/beta": 0.07329022139310837,
"epsilon_dpo/beta_margin_grad_mean": -0.41012582182884216,
"epsilon_dpo/beta_margin_grad_std": 0.11700302362442017,
"epsilon_dpo/beta_margin_mean": 0.3908378779888153,
"epsilon_dpo/beta_margin_std": 0.5314415693283081,
"epsilon_dpo/loss_margin_mean": 5.370856285095215,
"grad_norm": 19.890684127807617,
"kl/avg_steps": 0.5625,
"kl/beta": 0.07369744032621384,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.849231551964771e-07,
"logits/chosen": -0.26751643419265747,
"logits/rejected": -0.3765121400356293,
"logps/chosen": -62.95500183105469,
"logps/ref_chosen": -53.79457092285156,
"logps/ref_rejected": -74.16741943359375,
"logps/rejected": -88.69869995117188,
"loss": 1.098,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.672065019607544,
"rewards/margins": 0.3908378481864929,
"rewards/rejected": -1.0629029273986816,
"step": 134
},
{
"epoch": 0.20408163265306123,
"epsilon_dpo/beta": 0.07294897735118866,
"epsilon_dpo/beta_margin_grad_mean": -0.44255179166793823,
"epsilon_dpo/beta_margin_grad_std": 0.11139528453350067,
"epsilon_dpo/beta_margin_mean": 0.24292528629302979,
"epsilon_dpo/beta_margin_std": 0.48473989963531494,
"epsilon_dpo/loss_margin_mean": 3.3710861206054688,
"grad_norm": 19.40846061706543,
"kl/avg_steps": 0.46875,
"kl/beta": 0.07328520715236664,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.844676460754862e-07,
"logits/chosen": -0.258120596408844,
"logits/rejected": -0.3430369794368744,
"logps/chosen": -58.67110061645508,
"logps/ref_chosen": -49.441078186035156,
"logps/ref_rejected": -65.96878051757812,
"logps/rejected": -78.56989288330078,
"loss": 1.2139,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6748960018157959,
"rewards/margins": 0.24292531609535217,
"rewards/rejected": -0.9178212881088257,
"step": 135
},
{
"epoch": 0.20559334845049132,
"epsilon_dpo/beta": 0.07267702370882034,
"epsilon_dpo/beta_margin_grad_mean": -0.42554789781570435,
"epsilon_dpo/beta_margin_grad_std": 0.1490720808506012,
"epsilon_dpo/beta_margin_mean": 0.3313453495502472,
"epsilon_dpo/beta_margin_std": 0.6657735705375671,
"epsilon_dpo/loss_margin_mean": 4.620194911956787,
"grad_norm": 23.152406692504883,
"kl/avg_steps": 0.375,
"kl/beta": 0.0729432925581932,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.840055783904106e-07,
"logits/chosen": -0.3157322406768799,
"logits/rejected": -0.3505600690841675,
"logps/chosen": -78.24725341796875,
"logps/ref_chosen": -66.75926208496094,
"logps/ref_rejected": -94.61787414550781,
"logps/rejected": -110.72605895996094,
"loss": 1.1854,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.8365979194641113,
"rewards/margins": 0.33134526014328003,
"rewards/rejected": -1.1679432392120361,
"step": 136
},
{
"epoch": 0.20710506424792138,
"epsilon_dpo/beta": 0.07233736664056778,
"epsilon_dpo/beta_margin_grad_mean": -0.4249022305011749,
"epsilon_dpo/beta_margin_grad_std": 0.11909965425729752,
"epsilon_dpo/beta_margin_mean": 0.3281296491622925,
"epsilon_dpo/beta_margin_std": 0.5356920957565308,
"epsilon_dpo/loss_margin_mean": 4.578954219818115,
"grad_norm": 20.099393844604492,
"kl/avg_steps": 0.46875,
"kl/beta": 0.07267077267169952,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.835369650662767e-07,
"logits/chosen": -0.29257553815841675,
"logits/rejected": -0.3296911418437958,
"logps/chosen": -67.43355560302734,
"logps/ref_chosen": -56.78379821777344,
"logps/ref_rejected": -69.89952087402344,
"logps/rejected": -85.12823486328125,
"loss": 1.1511,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7721371054649353,
"rewards/margins": 0.3281296193599701,
"rewards/rejected": -1.100266695022583,
"step": 137
},
{
"epoch": 0.20861678004535147,
"epsilon_dpo/beta": 0.07206767797470093,
"epsilon_dpo/beta_margin_grad_mean": -0.43408605456352234,
"epsilon_dpo/beta_margin_grad_std": 0.1350470930337906,
"epsilon_dpo/beta_margin_mean": 0.2828466296195984,
"epsilon_dpo/beta_margin_std": 0.6074637770652771,
"epsilon_dpo/loss_margin_mean": 3.9784443378448486,
"grad_norm": 20.9215087890625,
"kl/avg_steps": 0.375,
"kl/beta": 0.07233171910047531,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.830618192112065e-07,
"logits/chosen": -0.2662360966205597,
"logits/rejected": -0.372766375541687,
"logps/chosen": -71.4655990600586,
"logps/ref_chosen": -58.766014099121094,
"logps/ref_rejected": -68.12371826171875,
"logps/rejected": -84.80175018310547,
"loss": 1.2094,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9172662496566772,
"rewards/margins": 0.2828466296195984,
"rewards/rejected": -1.20011305809021,
"step": 138
},
{
"epoch": 0.21012849584278157,
"epsilon_dpo/beta": 0.07173064351081848,
"epsilon_dpo/beta_margin_grad_mean": -0.42348769307136536,
"epsilon_dpo/beta_margin_grad_std": 0.11816025525331497,
"epsilon_dpo/beta_margin_mean": 0.3260977864265442,
"epsilon_dpo/beta_margin_std": 0.5176202058792114,
"epsilon_dpo/loss_margin_mean": 4.59393835067749,
"grad_norm": 23.283512115478516,
"kl/avg_steps": 0.46875,
"kl/beta": 0.07206148654222488,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.825801541160509e-07,
"logits/chosen": -0.2882693409919739,
"logits/rejected": -0.34237560629844666,
"logps/chosen": -84.75924682617188,
"logps/ref_chosen": -71.2255859375,
"logps/ref_rejected": -82.1834716796875,
"logps/rejected": -100.31106567382812,
"loss": 1.1498,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9718290567398071,
"rewards/margins": 0.3260977566242218,
"rewards/rejected": -1.297926664352417,
"step": 139
},
{
"epoch": 0.21164021164021163,
"epsilon_dpo/beta": 0.07139620184898376,
"epsilon_dpo/beta_margin_grad_mean": -0.39723992347717285,
"epsilon_dpo/beta_margin_grad_std": 0.16468670964241028,
"epsilon_dpo/beta_margin_mean": 0.462647944688797,
"epsilon_dpo/beta_margin_std": 0.7564051151275635,
"epsilon_dpo/loss_margin_mean": 6.550485134124756,
"grad_norm": 23.505237579345703,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0717252790927887,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": -0.23829999566078186,
"logits/rejected": -0.3623167872428894,
"logps/chosen": -75.56968688964844,
"logps/ref_chosen": -63.27766418457031,
"logps/ref_rejected": -83.30647277832031,
"logps/rejected": -102.14898681640625,
"loss": 1.1069,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8810849189758301,
"rewards/margins": 0.4626480042934418,
"rewards/rejected": -1.3437328338623047,
"step": 140
},
{
"epoch": 0.21315192743764172,
"epsilon_dpo/beta": 0.0710630938410759,
"epsilon_dpo/beta_margin_grad_mean": -0.40966731309890747,
"epsilon_dpo/beta_margin_grad_std": 0.15519315004348755,
"epsilon_dpo/beta_margin_mean": 0.41022515296936035,
"epsilon_dpo/beta_margin_std": 0.7178983092308044,
"epsilon_dpo/loss_margin_mean": 5.8360395431518555,
"grad_norm": 22.21529769897461,
"kl/avg_steps": 0.46875,
"kl/beta": 0.07139062881469727,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.815973202802966e-07,
"logits/chosen": -0.2697054147720337,
"logits/rejected": -0.3415735960006714,
"logps/chosen": -75.64986419677734,
"logps/ref_chosen": -61.76676940917969,
"logps/ref_rejected": -88.60601806640625,
"logps/rejected": -108.32515716552734,
"loss": 1.1343,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9880951642990112,
"rewards/margins": 0.4102250337600708,
"rewards/rejected": -1.398320198059082,
"step": 141
},
{
"epoch": 0.2146636432350718,
"epsilon_dpo/beta": 0.07076474279165268,
"epsilon_dpo/beta_margin_grad_mean": -0.4206353425979614,
"epsilon_dpo/beta_margin_grad_std": 0.12228359282016754,
"epsilon_dpo/beta_margin_mean": 0.34394702315330505,
"epsilon_dpo/beta_margin_std": 0.5437850952148438,
"epsilon_dpo/loss_margin_mean": 4.905487537384033,
"grad_norm": 21.119140625,
"kl/avg_steps": 0.421875,
"kl/beta": 0.07105755060911179,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.810961790316729e-07,
"logits/chosen": -0.30424752831459045,
"logits/rejected": -0.36931002140045166,
"logps/chosen": -78.66854858398438,
"logps/ref_chosen": -65.2747802734375,
"logps/ref_rejected": -81.1378173828125,
"logps/rejected": -99.43707275390625,
"loss": 1.1406,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9488117694854736,
"rewards/margins": 0.34394705295562744,
"rewards/rejected": -1.2927587032318115,
"step": 142
},
{
"epoch": 0.2161753590325019,
"epsilon_dpo/beta": 0.07050073891878128,
"epsilon_dpo/beta_margin_grad_mean": -0.4335528314113617,
"epsilon_dpo/beta_margin_grad_std": 0.16585581004619598,
"epsilon_dpo/beta_margin_mean": 0.2821354568004608,
"epsilon_dpo/beta_margin_std": 0.7709394097328186,
"epsilon_dpo/loss_margin_mean": 4.07772970199585,
"grad_norm": 29.214380264282227,
"kl/avg_steps": 0.375,
"kl/beta": 0.07075903564691544,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.805885735261454e-07,
"logits/chosen": -0.2855129837989807,
"logits/rejected": -0.34903484582901,
"logps/chosen": -77.00143432617188,
"logps/ref_chosen": -62.61782455444336,
"logps/ref_rejected": -70.39239501953125,
"logps/rejected": -88.85372924804688,
"loss": 1.2604,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.0170958042144775,
"rewards/margins": 0.2821354269981384,
"rewards/rejected": -1.2992312908172607,
"step": 143
},
{
"epoch": 0.21768707482993196,
"epsilon_dpo/beta": 0.07030344009399414,
"epsilon_dpo/beta_margin_grad_mean": -0.41849595308303833,
"epsilon_dpo/beta_margin_grad_std": 0.16150552034378052,
"epsilon_dpo/beta_margin_mean": 0.37938806414604187,
"epsilon_dpo/beta_margin_std": 0.7529569864273071,
"epsilon_dpo/loss_margin_mean": 5.4711689949035645,
"grad_norm": 22.511993408203125,
"kl/avg_steps": 0.28125,
"kl/beta": 0.07049468159675598,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.800745179625307e-07,
"logits/chosen": -0.28277039527893066,
"logits/rejected": -0.29437702894210815,
"logps/chosen": -75.4214859008789,
"logps/ref_chosen": -60.80268859863281,
"logps/ref_rejected": -79.07284545898438,
"logps/rejected": -99.16281127929688,
"loss": 1.1706,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0307759046554565,
"rewards/margins": 0.3793880343437195,
"rewards/rejected": -1.4101638793945312,
"step": 144
},
{
"epoch": 0.21919879062736206,
"epsilon_dpo/beta": 0.07004036009311676,
"epsilon_dpo/beta_margin_grad_mean": -0.41011685132980347,
"epsilon_dpo/beta_margin_grad_std": 0.16874288022518158,
"epsilon_dpo/beta_margin_mean": 0.40650680661201477,
"epsilon_dpo/beta_margin_std": 0.814594566822052,
"epsilon_dpo/loss_margin_mean": 5.882437705993652,
"grad_norm": 26.169166564941406,
"kl/avg_steps": 0.375,
"kl/beta": 0.07029697299003601,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.795540267200686e-07,
"logits/chosen": -0.45094913244247437,
"logits/rejected": -0.3485727906227112,
"logps/chosen": -88.29129028320312,
"logps/ref_chosen": -74.61146545410156,
"logps/ref_rejected": -83.24461364746094,
"logps/rejected": -102.80686950683594,
"loss": 1.1681,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9619247913360596,
"rewards/margins": 0.40650683641433716,
"rewards/rejected": -1.368431568145752,
"step": 145
},
{
"epoch": 0.22071050642479215,
"epsilon_dpo/beta": 0.0697786882519722,
"epsilon_dpo/beta_margin_grad_mean": -0.41254398226737976,
"epsilon_dpo/beta_margin_grad_std": 0.1563834547996521,
"epsilon_dpo/beta_margin_mean": 0.396220326423645,
"epsilon_dpo/beta_margin_std": 0.7105116844177246,
"epsilon_dpo/loss_margin_mean": 5.748117923736572,
"grad_norm": 22.814542770385742,
"kl/avg_steps": 0.375,
"kl/beta": 0.07003434002399445,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.790271143580173e-07,
"logits/chosen": -0.3010927140712738,
"logits/rejected": -0.3338754177093506,
"logps/chosen": -71.25645446777344,
"logps/ref_chosen": -57.84098434448242,
"logps/ref_rejected": -67.47422790527344,
"logps/rejected": -86.6378173828125,
"loss": 1.1447,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.938737154006958,
"rewards/margins": 0.3962203860282898,
"rewards/rejected": -1.3349575996398926,
"step": 146
},
{
"epoch": 0.2222222222222222,
"epsilon_dpo/beta": 0.06953980028629303,
"epsilon_dpo/beta_margin_grad_mean": -0.4271882176399231,
"epsilon_dpo/beta_margin_grad_std": 0.15932345390319824,
"epsilon_dpo/beta_margin_mean": 0.32222291827201843,
"epsilon_dpo/beta_margin_std": 0.7378240823745728,
"epsilon_dpo/loss_margin_mean": 4.707395553588867,
"grad_norm": 26.77433967590332,
"kl/avg_steps": 0.34375,
"kl/beta": 0.06977269053459167,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.784937956152489e-07,
"logits/chosen": -0.28852128982543945,
"logits/rejected": -0.37690430879592896,
"logps/chosen": -82.10720825195312,
"logps/ref_chosen": -66.8134765625,
"logps/ref_rejected": -81.1796875,
"logps/rejected": -101.18082427978516,
"loss": 1.2144,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.065216064453125,
"rewards/margins": 0.32222288846969604,
"rewards/rejected": -1.3874390125274658,
"step": 147
},
{
"epoch": 0.2237339380196523,
"epsilon_dpo/beta": 0.06917119026184082,
"epsilon_dpo/beta_margin_grad_mean": -0.40103206038475037,
"epsilon_dpo/beta_margin_grad_std": 0.15802228450775146,
"epsilon_dpo/beta_margin_mean": 0.44464221596717834,
"epsilon_dpo/beta_margin_std": 0.7200804352760315,
"epsilon_dpo/loss_margin_mean": 6.493539810180664,
"grad_norm": 18.844465255737305,
"kl/avg_steps": 0.53125,
"kl/beta": 0.06953366845846176,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.779540854098347e-07,
"logits/chosen": -0.2599055767059326,
"logits/rejected": -0.4167260229587555,
"logps/chosen": -62.261810302734375,
"logps/ref_chosen": -48.68775177001953,
"logps/ref_rejected": -67.50503540039062,
"logps/rejected": -87.5726318359375,
"loss": 1.1091,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9408795833587646,
"rewards/margins": 0.44464218616485596,
"rewards/rejected": -1.3855218887329102,
"step": 148
},
{
"epoch": 0.2252456538170824,
"epsilon_dpo/beta": 0.06882727891206741,
"epsilon_dpo/beta_margin_grad_mean": -0.3923902213573456,
"epsilon_dpo/beta_margin_grad_std": 0.14097757637500763,
"epsilon_dpo/beta_margin_mean": 0.4799686074256897,
"epsilon_dpo/beta_margin_std": 0.6410298943519592,
"epsilon_dpo/loss_margin_mean": 7.03225040435791,
"grad_norm": 20.45415496826172,
"kl/avg_steps": 0.5,
"kl/beta": 0.06916622817516327,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.774079988386296e-07,
"logits/chosen": -0.2811169922351837,
"logits/rejected": -0.4123772978782654,
"logps/chosen": -71.6675796508789,
"logps/ref_chosen": -55.14377975463867,
"logps/ref_rejected": -64.79888916015625,
"logps/rejected": -88.35494995117188,
"loss": 1.0567,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1394243240356445,
"rewards/margins": 0.4799686074256897,
"rewards/rejected": -1.6193928718566895,
"step": 149
},
{
"epoch": 0.22675736961451248,
"epsilon_dpo/beta": 0.06846334040164948,
"epsilon_dpo/beta_margin_grad_mean": -0.3678981363773346,
"epsilon_dpo/beta_margin_grad_std": 0.16524246335029602,
"epsilon_dpo/beta_margin_mean": 0.628822386264801,
"epsilon_dpo/beta_margin_std": 0.8186566233634949,
"epsilon_dpo/loss_margin_mean": 9.253426551818848,
"grad_norm": 20.30909538269043,
"kl/avg_steps": 0.53125,
"kl/beta": 0.06882211565971375,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.768555511768486e-07,
"logits/chosen": -0.2793060839176178,
"logits/rejected": -0.37656548619270325,
"logps/chosen": -81.79039001464844,
"logps/ref_chosen": -67.47074890136719,
"logps/ref_rejected": -89.21170043945312,
"logps/rejected": -112.78477478027344,
"loss": 0.9964,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9830636978149414,
"rewards/margins": 0.628822386264801,
"rewards/rejected": -1.6118860244750977,
"step": 150
},
{
"epoch": 0.22826908541194255,
"epsilon_dpo/beta": 0.06808015704154968,
"epsilon_dpo/beta_margin_grad_mean": -0.34604471921920776,
"epsilon_dpo/beta_margin_grad_std": 0.15563137829303741,
"epsilon_dpo/beta_margin_mean": 0.7367002367973328,
"epsilon_dpo/beta_margin_std": 0.8065953850746155,
"epsilon_dpo/loss_margin_mean": 10.885876655578613,
"grad_norm": 18.89447021484375,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0684584304690361,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.762967578776406e-07,
"logits/chosen": -0.23645813763141632,
"logits/rejected": -0.30803561210632324,
"logps/chosen": -64.03060913085938,
"logps/ref_chosen": -52.45954132080078,
"logps/ref_rejected": -79.06301879882812,
"logps/rejected": -101.51995849609375,
"loss": 0.9116,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.7910324931144714,
"rewards/margins": 0.7367002367973328,
"rewards/rejected": -1.5277327299118042,
"step": 151
},
{
"epoch": 0.22978080120937264,
"epsilon_dpo/beta": 0.06772062927484512,
"epsilon_dpo/beta_margin_grad_mean": -0.40272659063339233,
"epsilon_dpo/beta_margin_grad_std": 0.18340329825878143,
"epsilon_dpo/beta_margin_mean": 0.45768681168556213,
"epsilon_dpo/beta_margin_std": 0.8623914122581482,
"epsilon_dpo/loss_margin_mean": 6.835455894470215,
"grad_norm": 21.6560115814209,
"kl/avg_steps": 0.53125,
"kl/beta": 0.0680755078792572,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.757316345716553e-07,
"logits/chosen": -0.2873806655406952,
"logits/rejected": -0.332497239112854,
"logps/chosen": -71.64530944824219,
"logps/ref_chosen": -56.5538330078125,
"logps/ref_rejected": -76.55074310302734,
"logps/rejected": -98.47767639160156,
"loss": 1.1472,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.0243415832519531,
"rewards/margins": 0.4576868414878845,
"rewards/rejected": -1.4820284843444824,
"step": 152
},
{
"epoch": 0.23129251700680273,
"epsilon_dpo/beta": 0.06738392263650894,
"epsilon_dpo/beta_margin_grad_mean": -0.3816215991973877,
"epsilon_dpo/beta_margin_grad_std": 0.1576320230960846,
"epsilon_dpo/beta_margin_mean": 0.5424678921699524,
"epsilon_dpo/beta_margin_std": 0.730301022529602,
"epsilon_dpo/loss_margin_mean": 8.119546890258789,
"grad_norm": 20.802284240722656,
"kl/avg_steps": 0.5,
"kl/beta": 0.06771576404571533,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.751601970666064e-07,
"logits/chosen": -0.3340821862220764,
"logits/rejected": -0.3825557231903076,
"logps/chosen": -82.14995574951172,
"logps/ref_chosen": -68.00689697265625,
"logps/ref_rejected": -74.83482360839844,
"logps/rejected": -97.09742736816406,
"loss": 1.0356,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9562482833862305,
"rewards/margins": 0.5424678921699524,
"rewards/rejected": -1.498716115951538,
"step": 153
},
{
"epoch": 0.2328042328042328,
"epsilon_dpo/beta": 0.06715396791696548,
"epsilon_dpo/beta_margin_grad_mean": -0.4110979437828064,
"epsilon_dpo/beta_margin_grad_std": 0.17948567867279053,
"epsilon_dpo/beta_margin_mean": 0.40163373947143555,
"epsilon_dpo/beta_margin_std": 0.8289053440093994,
"epsilon_dpo/loss_margin_mean": 6.070918560028076,
"grad_norm": 22.871055603027344,
"kl/avg_steps": 0.34375,
"kl/beta": 0.06737887114286423,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.745824613468292e-07,
"logits/chosen": -0.2625642418861389,
"logits/rejected": -0.25436925888061523,
"logps/chosen": -74.19328308105469,
"logps/ref_chosen": -59.222537994384766,
"logps/ref_rejected": -64.19132232666016,
"logps/rejected": -85.23298645019531,
"loss": 1.1816,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.007482886314392,
"rewards/margins": 0.4016337990760803,
"rewards/rejected": -1.4091167449951172,
"step": 154
},
{
"epoch": 0.23431594860166288,
"epsilon_dpo/beta": 0.0668819472193718,
"epsilon_dpo/beta_margin_grad_mean": -0.3854452073574066,
"epsilon_dpo/beta_margin_grad_std": 0.19765251874923706,
"epsilon_dpo/beta_margin_mean": 0.5586703419685364,
"epsilon_dpo/beta_margin_std": 0.9932601451873779,
"epsilon_dpo/loss_margin_mean": 8.452394485473633,
"grad_norm": 22.700172424316406,
"kl/avg_steps": 0.40625,
"kl/beta": 0.06714805215597153,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.7399844357283393e-07,
"logits/chosen": -0.26384520530700684,
"logits/rejected": -0.30866539478302,
"logps/chosen": -84.74057006835938,
"logps/ref_chosen": -68.45469665527344,
"logps/ref_rejected": -77.91763305664062,
"logps/rejected": -102.65591430664062,
"loss": 1.1153,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0931569337844849,
"rewards/margins": 0.5586704015731812,
"rewards/rejected": -1.651827335357666,
"step": 155
},
{
"epoch": 0.23582766439909297,
"epsilon_dpo/beta": 0.06650684028863907,
"epsilon_dpo/beta_margin_grad_mean": -0.36289355158805847,
"epsilon_dpo/beta_margin_grad_std": 0.16767385601997375,
"epsilon_dpo/beta_margin_mean": 0.6588570475578308,
"epsilon_dpo/beta_margin_std": 0.8301137685775757,
"epsilon_dpo/loss_margin_mean": 9.974431991577148,
"grad_norm": 20.73874282836914,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0668763667345047,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.7340816008085305e-07,
"logits/chosen": -0.34579187631607056,
"logits/rejected": -0.4123355746269226,
"logps/chosen": -83.38964080810547,
"logps/ref_chosen": -67.26959991455078,
"logps/ref_rejected": -86.95914459228516,
"logps/rejected": -113.05361938476562,
"loss": 0.9792,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0740737915039062,
"rewards/margins": 0.658856987953186,
"rewards/rejected": -1.7329306602478027,
"step": 156
},
{
"epoch": 0.23733938019652306,
"epsilon_dpo/beta": 0.06623874604701996,
"epsilon_dpo/beta_margin_grad_mean": -0.3899654746055603,
"epsilon_dpo/beta_margin_grad_std": 0.17842227220535278,
"epsilon_dpo/beta_margin_mean": 0.5179902911186218,
"epsilon_dpo/beta_margin_std": 0.8619597554206848,
"epsilon_dpo/loss_margin_mean": 7.9072465896606445,
"grad_norm": 21.97333335876465,
"kl/avg_steps": 0.40625,
"kl/beta": 0.06650228798389435,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.728116273823847e-07,
"logits/chosen": -0.22052708268165588,
"logits/rejected": -0.30261218547821045,
"logps/chosen": -68.69932556152344,
"logps/ref_chosen": -54.77287292480469,
"logps/ref_rejected": -63.87866973876953,
"logps/rejected": -85.71237182617188,
"loss": 1.0974,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.926228940486908,
"rewards/margins": 0.517990231513977,
"rewards/rejected": -1.4442192316055298,
"step": 157
},
{
"epoch": 0.23885109599395313,
"epsilon_dpo/beta": 0.06597074121236801,
"epsilon_dpo/beta_margin_grad_mean": -0.3853689432144165,
"epsilon_dpo/beta_margin_grad_std": 0.1895613968372345,
"epsilon_dpo/beta_margin_mean": 0.5653907060623169,
"epsilon_dpo/beta_margin_std": 0.9349595904350281,
"epsilon_dpo/loss_margin_mean": 8.663420677185059,
"grad_norm": 22.90645980834961,
"kl/avg_steps": 0.40625,
"kl/beta": 0.06623321771621704,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.7220886216373085e-07,
"logits/chosen": -0.31424635648727417,
"logits/rejected": -0.3887185752391815,
"logps/chosen": -80.49627685546875,
"logps/ref_chosen": -64.92271423339844,
"logps/ref_rejected": -82.23789978027344,
"logps/rejected": -106.47486877441406,
"loss": 1.0869,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0310311317443848,
"rewards/margins": 0.5653907060623169,
"rewards/rejected": -1.5964219570159912,
"step": 158
},
{
"epoch": 0.24036281179138322,
"epsilon_dpo/beta": 0.06566259264945984,
"epsilon_dpo/beta_margin_grad_mean": -0.3695249855518341,
"epsilon_dpo/beta_margin_grad_std": 0.18520978093147278,
"epsilon_dpo/beta_margin_mean": 0.6269689798355103,
"epsilon_dpo/beta_margin_std": 0.9548689126968384,
"epsilon_dpo/loss_margin_mean": 9.634745597839355,
"grad_norm": 23.45098114013672,
"kl/avg_steps": 0.46875,
"kl/beta": 0.06596523523330688,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.715998812855304e-07,
"logits/chosen": -0.20362374186515808,
"logits/rejected": -0.27223098278045654,
"logps/chosen": -72.87303924560547,
"logps/ref_chosen": -57.04698944091797,
"logps/ref_rejected": -73.32441711425781,
"logps/rejected": -98.78520965576172,
"loss": 1.0466,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0435682535171509,
"rewards/margins": 0.6269689798355103,
"rewards/rejected": -1.6705372333526611,
"step": 159
},
{
"epoch": 0.2418745275888133,
"epsilon_dpo/beta": 0.06537675857543945,
"epsilon_dpo/beta_margin_grad_mean": -0.3901420533657074,
"epsilon_dpo/beta_margin_grad_std": 0.19423310458660126,
"epsilon_dpo/beta_margin_mean": 0.550905704498291,
"epsilon_dpo/beta_margin_std": 0.9735690951347351,
"epsilon_dpo/loss_margin_mean": 8.523533821105957,
"grad_norm": 22.86724090576172,
"kl/avg_steps": 0.4375,
"kl/beta": 0.06565746665000916,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.7098470178228755e-07,
"logits/chosen": -0.34587207436561584,
"logits/rejected": -0.37707728147506714,
"logps/chosen": -67.38560485839844,
"logps/ref_chosen": -49.806915283203125,
"logps/ref_rejected": -68.3370132446289,
"logps/rejected": -94.43923950195312,
"loss": 1.111,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.151965618133545,
"rewards/margins": 0.550905704498291,
"rewards/rejected": -1.702871322631836,
"step": 160
},
{
"epoch": 0.24338624338624337,
"epsilon_dpo/beta": 0.06507153809070587,
"epsilon_dpo/beta_margin_grad_mean": -0.37438496947288513,
"epsilon_dpo/beta_margin_grad_std": 0.18452827632427216,
"epsilon_dpo/beta_margin_mean": 0.5988555550575256,
"epsilon_dpo/beta_margin_std": 0.9330669045448303,
"epsilon_dpo/loss_margin_mean": 9.295365333557129,
"grad_norm": 21.15367889404297,
"kl/avg_steps": 0.46875,
"kl/beta": 0.06537146121263504,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.703633408618955e-07,
"logits/chosen": -0.2439931333065033,
"logits/rejected": -0.29970598220825195,
"logps/chosen": -69.84889221191406,
"logps/ref_chosen": -52.50048828125,
"logps/ref_rejected": -66.04540252685547,
"logps/rejected": -92.68916320800781,
"loss": 1.0598,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1320596933364868,
"rewards/margins": 0.5988555550575256,
"rewards/rejected": -1.7309153079986572,
"step": 161
},
{
"epoch": 0.24489795918367346,
"epsilon_dpo/beta": 0.06462559103965759,
"epsilon_dpo/beta_margin_grad_mean": -0.34233224391937256,
"epsilon_dpo/beta_margin_grad_std": 0.19140274822711945,
"epsilon_dpo/beta_margin_mean": 0.7702382206916809,
"epsilon_dpo/beta_margin_std": 0.9879153370857239,
"epsilon_dpo/loss_margin_mean": 11.991604804992676,
"grad_norm": 24.1027889251709,
"kl/avg_steps": 0.6875,
"kl/beta": 0.06506646424531937,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 4.697358159051549e-07,
"logits/chosen": -0.3745758533477783,
"logits/rejected": -0.42245256900787354,
"logps/chosen": -88.05198669433594,
"logps/ref_chosen": -69.46919250488281,
"logps/ref_rejected": -92.00952911376953,
"logps/rejected": -122.58391571044922,
"loss": 0.9594,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2028251886367798,
"rewards/margins": 0.7702381610870361,
"rewards/rejected": -1.9730634689331055,
"step": 162
},
{
"epoch": 0.24640967498110355,
"epsilon_dpo/beta": 0.06428530812263489,
"epsilon_dpo/beta_margin_grad_mean": -0.34151625633239746,
"epsilon_dpo/beta_margin_grad_std": 0.1972528100013733,
"epsilon_dpo/beta_margin_mean": 0.7887895703315735,
"epsilon_dpo/beta_margin_std": 0.9846240282058716,
"epsilon_dpo/loss_margin_mean": 12.36876392364502,
"grad_norm": 20.49311065673828,
"kl/avg_steps": 0.53125,
"kl/beta": 0.0646221861243248,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.691021444652876e-07,
"logits/chosen": -0.2664734125137329,
"logits/rejected": -0.2773905098438263,
"logps/chosen": -65.9134521484375,
"logps/ref_chosen": -50.613834381103516,
"logps/ref_rejected": -74.62033081054688,
"logps/rejected": -102.28871154785156,
"loss": 0.9507,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9869455695152283,
"rewards/margins": 0.7887895107269287,
"rewards/rejected": -1.7757351398468018,
"step": 163
},
{
"epoch": 0.24792139077853365,
"epsilon_dpo/beta": 0.06390541791915894,
"epsilon_dpo/beta_margin_grad_mean": -0.35187870264053345,
"epsilon_dpo/beta_margin_grad_std": 0.20197324454784393,
"epsilon_dpo/beta_margin_mean": 0.7465708255767822,
"epsilon_dpo/beta_margin_std": 1.052926778793335,
"epsilon_dpo/loss_margin_mean": 11.773000717163086,
"grad_norm": 20.55039405822754,
"kl/avg_steps": 0.59375,
"kl/beta": 0.06428069621324539,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.6846234426744624e-07,
"logits/chosen": -0.25759801268577576,
"logits/rejected": -0.35514870285987854,
"logps/chosen": -71.70374298095703,
"logps/ref_chosen": -54.848114013671875,
"logps/ref_rejected": -79.0630111694336,
"logps/rejected": -107.69164276123047,
"loss": 1.0021,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0796349048614502,
"rewards/margins": 0.7465708255767822,
"rewards/rejected": -1.8262057304382324,
"step": 164
},
{
"epoch": 0.2494331065759637,
"epsilon_dpo/beta": 0.06356815993785858,
"epsilon_dpo/beta_margin_grad_mean": -0.3637421131134033,
"epsilon_dpo/beta_margin_grad_std": 0.20940490067005157,
"epsilon_dpo/beta_margin_mean": 0.6675106287002563,
"epsilon_dpo/beta_margin_std": 1.057544231414795,
"epsilon_dpo/loss_margin_mean": 10.60260009765625,
"grad_norm": 21.606149673461914,
"kl/avg_steps": 0.53125,
"kl/beta": 0.06390128284692764,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.678164332082175e-07,
"logits/chosen": -0.188014417886734,
"logits/rejected": -0.3162604868412018,
"logps/chosen": -70.30587768554688,
"logps/ref_chosen": -51.089210510253906,
"logps/ref_rejected": -71.23370361328125,
"logps/rejected": -101.05296325683594,
"loss": 1.0653,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.22507643699646,
"rewards/margins": 0.6675106883049011,
"rewards/rejected": -1.8925870656967163,
"step": 165
},
{
"epoch": 0.2509448223733938,
"epsilon_dpo/beta": 0.06335143744945526,
"epsilon_dpo/beta_margin_grad_mean": -0.39574819803237915,
"epsilon_dpo/beta_margin_grad_std": 0.1846621185541153,
"epsilon_dpo/beta_margin_mean": 0.515212893486023,
"epsilon_dpo/beta_margin_std": 0.894351601600647,
"epsilon_dpo/loss_margin_mean": 8.232608795166016,
"grad_norm": 24.80101203918457,
"kl/avg_steps": 0.34375,
"kl/beta": 0.06356360018253326,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.6716442935512214e-07,
"logits/chosen": -0.2625441551208496,
"logits/rejected": -0.4250110387802124,
"logps/chosen": -81.37533569335938,
"logps/ref_chosen": -63.19081115722656,
"logps/ref_rejected": -93.8402099609375,
"logps/rejected": -120.25733947753906,
"loss": 1.1104,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.1550203561782837,
"rewards/margins": 0.515212893486023,
"rewards/rejected": -1.6702332496643066,
"step": 166
},
{
"epoch": 0.25245653817082386,
"epsilon_dpo/beta": 0.06299582868814468,
"epsilon_dpo/beta_margin_grad_mean": -0.3437648415565491,
"epsilon_dpo/beta_margin_grad_std": 0.1770849972963333,
"epsilon_dpo/beta_margin_mean": 0.7756779789924622,
"epsilon_dpo/beta_margin_std": 0.9134400486946106,
"epsilon_dpo/loss_margin_mean": 12.391538619995117,
"grad_norm": 23.047231674194336,
"kl/avg_steps": 0.5625,
"kl/beta": 0.06334584951400757,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": -0.2523866891860962,
"logits/rejected": -0.34412816166877747,
"logps/chosen": -73.8267593383789,
"logps/ref_chosen": -58.92427062988281,
"logps/ref_rejected": -72.97377014160156,
"logps/rejected": -100.2677993774414,
"loss": 0.9259,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9413843154907227,
"rewards/margins": 0.7756779789924622,
"rewards/rejected": -1.71706223487854,
"step": 167
},
{
"epoch": 0.25396825396825395,
"epsilon_dpo/beta": 0.06280095130205154,
"epsilon_dpo/beta_margin_grad_mean": -0.37643855810165405,
"epsilon_dpo/beta_margin_grad_std": 0.18603515625,
"epsilon_dpo/beta_margin_mean": 0.6265664100646973,
"epsilon_dpo/beta_margin_std": 0.9391864538192749,
"epsilon_dpo/loss_margin_mean": 10.08084774017334,
"grad_norm": 22.81222915649414,
"kl/avg_steps": 0.3125,
"kl/beta": 0.06299152225255966,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.6584221638904767e-07,
"logits/chosen": -0.2895227074623108,
"logits/rejected": -0.34218716621398926,
"logps/chosen": -83.47079467773438,
"logps/ref_chosen": -65.65138244628906,
"logps/ref_rejected": -79.71418762207031,
"logps/rejected": -107.61444854736328,
"loss": 1.0404,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.1223618984222412,
"rewards/margins": 0.6265664100646973,
"rewards/rejected": -1.7489283084869385,
"step": 168
},
{
"epoch": 0.25547996976568405,
"epsilon_dpo/beta": 0.06246793270111084,
"epsilon_dpo/beta_margin_grad_mean": -0.3688127398490906,
"epsilon_dpo/beta_margin_grad_std": 0.20474793016910553,
"epsilon_dpo/beta_margin_mean": 0.6704166531562805,
"epsilon_dpo/beta_margin_std": 1.130202054977417,
"epsilon_dpo/loss_margin_mean": 10.832911491394043,
"grad_norm": 26.35257339477539,
"kl/avg_steps": 0.53125,
"kl/beta": 0.06279528886079788,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.651720442612075e-07,
"logits/chosen": -0.28726431727409363,
"logits/rejected": -0.4055023789405823,
"logps/chosen": -75.50286865234375,
"logps/ref_chosen": -61.425865173339844,
"logps/ref_rejected": -76.09590148925781,
"logps/rejected": -101.00581359863281,
"loss": 1.0779,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8822125196456909,
"rewards/margins": 0.6704165935516357,
"rewards/rejected": -1.5526292324066162,
"step": 169
},
{
"epoch": 0.25699168556311414,
"epsilon_dpo/beta": 0.062137823551893234,
"epsilon_dpo/beta_margin_grad_mean": -0.36316975951194763,
"epsilon_dpo/beta_margin_grad_std": 0.2232203632593155,
"epsilon_dpo/beta_margin_mean": 0.6876026391983032,
"epsilon_dpo/beta_margin_std": 1.1640148162841797,
"epsilon_dpo/loss_margin_mean": 11.184520721435547,
"grad_norm": 21.732114791870117,
"kl/avg_steps": 0.53125,
"kl/beta": 0.06246344745159149,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.6449585330874425e-07,
"logits/chosen": -0.28552019596099854,
"logits/rejected": -0.2265230417251587,
"logps/chosen": -69.56861877441406,
"logps/ref_chosen": -56.65319061279297,
"logps/ref_rejected": -63.45965576171875,
"logps/rejected": -87.55960083007812,
"loss": 1.0952,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.8059422969818115,
"rewards/margins": 0.6876026391983032,
"rewards/rejected": -1.4935450553894043,
"step": 170
},
{
"epoch": 0.2585034013605442,
"epsilon_dpo/beta": 0.061857908964157104,
"epsilon_dpo/beta_margin_grad_mean": -0.35556191205978394,
"epsilon_dpo/beta_margin_grad_std": 0.22085915505886078,
"epsilon_dpo/beta_margin_mean": 0.7404354214668274,
"epsilon_dpo/beta_margin_std": 1.2024332284927368,
"epsilon_dpo/loss_margin_mean": 12.090255737304688,
"grad_norm": 22.402751922607422,
"kl/avg_steps": 0.453125,
"kl/beta": 0.062133364379405975,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.6381366244617224e-07,
"logits/chosen": -0.3162384033203125,
"logits/rejected": -0.34521257877349854,
"logps/chosen": -79.16874694824219,
"logps/ref_chosen": -63.734764099121094,
"logps/ref_rejected": -78.50328063964844,
"logps/rejected": -106.02751922607422,
"loss": 1.0712,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9579418301582336,
"rewards/margins": 0.7404354810714722,
"rewards/rejected": -1.698377251625061,
"step": 171
},
{
"epoch": 0.2600151171579743,
"epsilon_dpo/beta": 0.0615113191306591,
"epsilon_dpo/beta_margin_grad_mean": -0.3682391345500946,
"epsilon_dpo/beta_margin_grad_std": 0.20234271883964539,
"epsilon_dpo/beta_margin_mean": 0.6789873838424683,
"epsilon_dpo/beta_margin_std": 1.0842193365097046,
"epsilon_dpo/loss_margin_mean": 11.13126277923584,
"grad_norm": 24.113933563232422,
"kl/avg_steps": 0.5625,
"kl/beta": 0.06185309216380119,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.631254907558365e-07,
"logits/chosen": -0.21430940926074982,
"logits/rejected": -0.3421577215194702,
"logps/chosen": -69.283447265625,
"logps/ref_chosen": -52.201759338378906,
"logps/ref_rejected": -82.85285949707031,
"logps/rejected": -111.06581115722656,
"loss": 1.0562,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0531675815582275,
"rewards/margins": 0.678987443447113,
"rewards/rejected": -1.7321550846099854,
"step": 172
},
{
"epoch": 0.2615268329554044,
"epsilon_dpo/beta": 0.06128259375691414,
"epsilon_dpo/beta_margin_grad_mean": -0.36865609884262085,
"epsilon_dpo/beta_margin_grad_std": 0.23911400139331818,
"epsilon_dpo/beta_margin_mean": 0.7229339480400085,
"epsilon_dpo/beta_margin_std": 1.3027911186218262,
"epsilon_dpo/loss_margin_mean": 11.942514419555664,
"grad_norm": 23.799114227294922,
"kl/avg_steps": 0.375,
"kl/beta": 0.06150711700320244,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.624313574873786e-07,
"logits/chosen": -0.21749958395957947,
"logits/rejected": -0.35434597730636597,
"logps/chosen": -70.81362915039062,
"logps/ref_chosen": -55.43472671508789,
"logps/ref_rejected": -77.8196792602539,
"logps/rejected": -105.14109802246094,
"loss": 1.1294,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.9442607164382935,
"rewards/margins": 0.7229339480400085,
"rewards/rejected": -1.6671946048736572,
"step": 173
},
{
"epoch": 0.26303854875283444,
"epsilon_dpo/beta": 0.060919586569070816,
"epsilon_dpo/beta_margin_grad_mean": -0.3483433723449707,
"epsilon_dpo/beta_margin_grad_std": 0.2087724804878235,
"epsilon_dpo/beta_margin_mean": 0.7738977074623108,
"epsilon_dpo/beta_margin_std": 1.104463815689087,
"epsilon_dpo/loss_margin_mean": 12.800680160522461,
"grad_norm": 23.94753074645996,
"kl/avg_steps": 0.59375,
"kl/beta": 0.061277326196432114,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.61731282057198e-07,
"logits/chosen": -0.35545074939727783,
"logits/rejected": -0.4043503403663635,
"logps/chosen": -74.4474868774414,
"logps/ref_chosen": -57.17195129394531,
"logps/ref_rejected": -85.47578430175781,
"logps/rejected": -115.55198669433594,
"loss": 1.0049,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0548806190490723,
"rewards/margins": 0.7738977670669556,
"rewards/rejected": -1.8287782669067383,
"step": 174
},
{
"epoch": 0.26455026455026454,
"epsilon_dpo/beta": 0.060655198991298676,
"epsilon_dpo/beta_margin_grad_mean": -0.3453494608402252,
"epsilon_dpo/beta_margin_grad_std": 0.21968674659729004,
"epsilon_dpo/beta_margin_mean": 0.8563733696937561,
"epsilon_dpo/beta_margin_std": 1.2030829191207886,
"epsilon_dpo/loss_margin_mean": 14.246023178100586,
"grad_norm": 22.649356842041016,
"kl/avg_steps": 0.4375,
"kl/beta": 0.060915641486644745,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.6102528404790965e-07,
"logits/chosen": -0.25328904390335083,
"logits/rejected": -0.2955777049064636,
"logps/chosen": -82.35508728027344,
"logps/ref_chosen": -67.6656265258789,
"logps/ref_rejected": -84.36767578125,
"logps/rejected": -113.30315399169922,
"loss": 0.9863,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8948940634727478,
"rewards/margins": 0.8563734292984009,
"rewards/rejected": -1.751267433166504,
"step": 175
},
{
"epoch": 0.2660619803476946,
"epsilon_dpo/beta": 0.06048576161265373,
"epsilon_dpo/beta_margin_grad_mean": -0.3979288637638092,
"epsilon_dpo/beta_margin_grad_std": 0.22770652174949646,
"epsilon_dpo/beta_margin_mean": 0.5396315455436707,
"epsilon_dpo/beta_margin_std": 1.2365338802337646,
"epsilon_dpo/loss_margin_mean": 9.062606811523438,
"grad_norm": 30.601669311523438,
"kl/avg_steps": 0.28125,
"kl/beta": 0.0606502927839756,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": -0.34826281666755676,
"logits/rejected": -0.3899797201156616,
"logps/chosen": -96.5220947265625,
"logps/ref_chosen": -77.8587646484375,
"logps/ref_rejected": -81.08732604980469,
"logps/rejected": -108.81326293945312,
"loss": 1.2306,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1332380771636963,
"rewards/margins": 0.5396316051483154,
"rewards/rejected": -1.6728696823120117,
"step": 176
},
{
"epoch": 0.2675736961451247,
"epsilon_dpo/beta": 0.06005149334669113,
"epsilon_dpo/beta_margin_grad_mean": -0.3008078336715698,
"epsilon_dpo/beta_margin_grad_std": 0.1859234869480133,
"epsilon_dpo/beta_margin_mean": 1.0505074262619019,
"epsilon_dpo/beta_margin_std": 1.0782644748687744,
"epsilon_dpo/loss_margin_mean": 17.564970016479492,
"grad_norm": 26.424150466918945,
"kl/avg_steps": 0.71875,
"kl/beta": 0.06048019230365753,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 4.5959559945025183e-07,
"logits/chosen": -0.3523577153682709,
"logits/rejected": -0.4300538897514343,
"logps/chosen": -68.95022583007812,
"logps/ref_chosen": -55.22039794921875,
"logps/ref_rejected": -92.54974365234375,
"logps/rejected": -123.84454345703125,
"loss": 0.8084,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.825654149055481,
"rewards/margins": 1.0505074262619019,
"rewards/rejected": -1.8761615753173828,
"step": 177
},
{
"epoch": 0.2690854119425548,
"epsilon_dpo/beta": 0.05979185923933983,
"epsilon_dpo/beta_margin_grad_mean": -0.37746661901474,
"epsilon_dpo/beta_margin_grad_std": 0.1945168524980545,
"epsilon_dpo/beta_margin_mean": 0.6076987385749817,
"epsilon_dpo/beta_margin_std": 0.9756598472595215,
"epsilon_dpo/loss_margin_mean": 10.267746925354004,
"grad_norm": 28.44964599609375,
"kl/avg_steps": 0.4375,
"kl/beta": 0.06004859507083893,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.588719528532341e-07,
"logits/chosen": -0.22682592272758484,
"logits/rejected": -0.37198418378829956,
"logps/chosen": -77.29765319824219,
"logps/ref_chosen": -60.81048583984375,
"logps/ref_rejected": -81.12973022460938,
"logps/rejected": -107.8846435546875,
"loss": 1.0708,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.986926794052124,
"rewards/margins": 0.6076987981796265,
"rewards/rejected": -1.594625473022461,
"step": 178
},
{
"epoch": 0.2705971277399849,
"epsilon_dpo/beta": 0.059475354850292206,
"epsilon_dpo/beta_margin_grad_mean": -0.3549274504184723,
"epsilon_dpo/beta_margin_grad_std": 0.21680504083633423,
"epsilon_dpo/beta_margin_mean": 0.7543036341667175,
"epsilon_dpo/beta_margin_std": 1.1721493005752563,
"epsilon_dpo/loss_margin_mean": 12.802788734436035,
"grad_norm": 26.032310485839844,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05978702753782272,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.581424636586928e-07,
"logits/chosen": -0.3475063443183899,
"logits/rejected": -0.3340640068054199,
"logps/chosen": -82.29966735839844,
"logps/ref_chosen": -65.67171478271484,
"logps/ref_rejected": -75.32586669921875,
"logps/rejected": -104.75660705566406,
"loss": 1.0429,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9936603307723999,
"rewards/margins": 0.7543036341667175,
"rewards/rejected": -1.7479639053344727,
"step": 179
},
{
"epoch": 0.272108843537415,
"epsilon_dpo/beta": 0.059235405176877975,
"epsilon_dpo/beta_margin_grad_mean": -0.387634813785553,
"epsilon_dpo/beta_margin_grad_std": 0.2040894329547882,
"epsilon_dpo/beta_margin_mean": 0.5790908932685852,
"epsilon_dpo/beta_margin_std": 1.086661696434021,
"epsilon_dpo/loss_margin_mean": 9.889979362487793,
"grad_norm": 22.61466407775879,
"kl/avg_steps": 0.40625,
"kl/beta": 0.05947108566761017,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.5740715227200897e-07,
"logits/chosen": -0.2834322452545166,
"logits/rejected": -0.3228118121623993,
"logps/chosen": -69.12271118164062,
"logps/ref_chosen": -56.68280792236328,
"logps/ref_rejected": -64.94414520263672,
"logps/rejected": -87.27401733398438,
"loss": 1.1323,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7392200231552124,
"rewards/margins": 0.5790908932685852,
"rewards/rejected": -1.3183109760284424,
"step": 180
},
{
"epoch": 0.273620559334845,
"epsilon_dpo/beta": 0.058884669095277786,
"epsilon_dpo/beta_margin_grad_mean": -0.3379547894001007,
"epsilon_dpo/beta_margin_grad_std": 0.17709705233573914,
"epsilon_dpo/beta_margin_mean": 0.8369985818862915,
"epsilon_dpo/beta_margin_std": 0.9841684699058533,
"epsilon_dpo/loss_margin_mean": 14.290465354919434,
"grad_norm": 22.397214889526367,
"kl/avg_steps": 0.59375,
"kl/beta": 0.059230461716651917,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.566660392614228e-07,
"logits/chosen": -0.25201842188835144,
"logits/rejected": -0.3360195755958557,
"logps/chosen": -71.43923950195312,
"logps/ref_chosen": -60.77604675292969,
"logps/ref_rejected": -83.98361206054688,
"logps/rejected": -108.93727111816406,
"loss": 0.9025,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.6288473606109619,
"rewards/margins": 0.8369985818862915,
"rewards/rejected": -1.4658459424972534,
"step": 181
},
{
"epoch": 0.2751322751322751,
"epsilon_dpo/beta": 0.05855550989508629,
"epsilon_dpo/beta_margin_grad_mean": -0.34649306535720825,
"epsilon_dpo/beta_margin_grad_std": 0.2180139720439911,
"epsilon_dpo/beta_margin_mean": 0.815775454044342,
"epsilon_dpo/beta_margin_std": 1.1701534986495972,
"epsilon_dpo/loss_margin_mean": 14.04555892944336,
"grad_norm": 24.7940616607666,
"kl/avg_steps": 0.5625,
"kl/beta": 0.05888085812330246,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.5591914535745817e-07,
"logits/chosen": -0.2540717124938965,
"logits/rejected": -0.4515971839427948,
"logps/chosen": -75.76898193359375,
"logps/ref_chosen": -60.2537841796875,
"logps/ref_rejected": -89.7706298828125,
"logps/rejected": -119.33139038085938,
"loss": 1.0043,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9136762619018555,
"rewards/margins": 0.8157755136489868,
"rewards/rejected": -1.7294516563415527,
"step": 182
},
{
"epoch": 0.2766439909297052,
"epsilon_dpo/beta": 0.05841096490621567,
"epsilon_dpo/beta_margin_grad_mean": -0.42652302980422974,
"epsilon_dpo/beta_margin_grad_std": 0.1989770382642746,
"epsilon_dpo/beta_margin_mean": 0.344012051820755,
"epsilon_dpo/beta_margin_std": 1.000016689300537,
"epsilon_dpo/loss_margin_mean": 6.008331775665283,
"grad_norm": 24.96488380432129,
"kl/avg_steps": 0.25,
"kl/beta": 0.05855150520801544,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 4.551664914523433e-07,
"logits/chosen": -0.28982096910476685,
"logits/rejected": -0.3066185712814331,
"logps/chosen": -80.36439514160156,
"logps/ref_chosen": -61.76142120361328,
"logps/ref_rejected": -72.54627990722656,
"logps/rejected": -97.1575927734375,
"loss": 1.29,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0891584157943726,
"rewards/margins": 0.3440120816230774,
"rewards/rejected": -1.4331705570220947,
"step": 183
},
{
"epoch": 0.2781557067271353,
"epsilon_dpo/beta": 0.05819229036569595,
"epsilon_dpo/beta_margin_grad_mean": -0.37999579310417175,
"epsilon_dpo/beta_margin_grad_std": 0.1717667579650879,
"epsilon_dpo/beta_margin_mean": 0.5970726013183594,
"epsilon_dpo/beta_margin_std": 0.873263418674469,
"epsilon_dpo/loss_margin_mean": 10.349804878234863,
"grad_norm": 20.948413848876953,
"kl/avg_steps": 0.375,
"kl/beta": 0.05840549245476723,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.544080985994258e-07,
"logits/chosen": -0.1806359887123108,
"logits/rejected": -0.3026115894317627,
"logps/chosen": -57.57061767578125,
"logps/ref_chosen": -46.840721130371094,
"logps/ref_rejected": -69.3609390258789,
"logps/rejected": -90.44064331054688,
"loss": 1.0356,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.6264686584472656,
"rewards/margins": 0.5970726013183594,
"rewards/rejected": -1.223541259765625,
"step": 184
},
{
"epoch": 0.2796674225245654,
"epsilon_dpo/beta": 0.0580112561583519,
"epsilon_dpo/beta_margin_grad_mean": -0.37028154730796814,
"epsilon_dpo/beta_margin_grad_std": 0.2305072844028473,
"epsilon_dpo/beta_margin_mean": 0.6763496994972229,
"epsilon_dpo/beta_margin_std": 1.1983596086502075,
"epsilon_dpo/loss_margin_mean": 11.817178726196289,
"grad_norm": 21.605913162231445,
"kl/avg_steps": 0.3125,
"kl/beta": 0.05818729102611542,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.5364398801258394e-07,
"logits/chosen": -0.24417275190353394,
"logits/rejected": -0.3371928632259369,
"logps/chosen": -66.188232421875,
"logps/ref_chosen": -52.321136474609375,
"logps/ref_rejected": -68.3885726928711,
"logps/rejected": -94.07284545898438,
"loss": 1.1193,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.808593213558197,
"rewards/margins": 0.6763496994972229,
"rewards/rejected": -1.48494291305542,
"step": 185
},
{
"epoch": 0.2811791383219955,
"epsilon_dpo/beta": 0.05777614936232567,
"epsilon_dpo/beta_margin_grad_mean": -0.3594328463077545,
"epsilon_dpo/beta_margin_grad_std": 0.20834676921367645,
"epsilon_dpo/beta_margin_mean": 0.7274589538574219,
"epsilon_dpo/beta_margin_std": 1.147803783416748,
"epsilon_dpo/loss_margin_mean": 12.723150253295898,
"grad_norm": 26.340944290161133,
"kl/avg_steps": 0.40625,
"kl/beta": 0.05800602212548256,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.5287418106563354e-07,
"logits/chosen": -0.409564346075058,
"logits/rejected": -0.3969210386276245,
"logps/chosen": -79.36711883544922,
"logps/ref_chosen": -67.42012786865234,
"logps/ref_rejected": -82.50968933105469,
"logps/rejected": -107.1798324584961,
"loss": 1.0473,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6928529739379883,
"rewards/margins": 0.7274588942527771,
"rewards/rejected": -1.4203118085861206,
"step": 186
},
{
"epoch": 0.28269085411942557,
"epsilon_dpo/beta": 0.05753326416015625,
"epsilon_dpo/beta_margin_grad_mean": -0.38152551651000977,
"epsilon_dpo/beta_margin_grad_std": 0.21516606211662292,
"epsilon_dpo/beta_margin_mean": 0.6407138109207153,
"epsilon_dpo/beta_margin_std": 1.1479554176330566,
"epsilon_dpo/loss_margin_mean": 11.262286186218262,
"grad_norm": 28.630285263061523,
"kl/avg_steps": 0.421875,
"kl/beta": 0.05777132511138916,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.520986992917297e-07,
"logits/chosen": -0.3924351632595062,
"logits/rejected": -0.46077537536621094,
"logps/chosen": -90.86955261230469,
"logps/ref_chosen": -75.52549743652344,
"logps/ref_rejected": -94.76289367675781,
"logps/rejected": -121.3692398071289,
"loss": 1.1118,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.8866924047470093,
"rewards/margins": 0.6407138109207153,
"rewards/rejected": -1.5274062156677246,
"step": 187
},
{
"epoch": 0.2842025699168556,
"epsilon_dpo/beta": 0.057228729128837585,
"epsilon_dpo/beta_margin_grad_mean": -0.364745169878006,
"epsilon_dpo/beta_margin_grad_std": 0.19652563333511353,
"epsilon_dpo/beta_margin_mean": 0.7049745321273804,
"epsilon_dpo/beta_margin_std": 1.0635921955108643,
"epsilon_dpo/loss_margin_mean": 12.413689613342285,
"grad_norm": 23.41449546813965,
"kl/avg_steps": 0.53125,
"kl/beta": 0.057528626173734665,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.5131756438276466e-07,
"logits/chosen": -0.29929056763648987,
"logits/rejected": -0.3593160808086395,
"logps/chosen": -84.41511535644531,
"logps/ref_chosen": -71.52333068847656,
"logps/ref_rejected": -78.29949951171875,
"logps/rejected": -103.60497283935547,
"loss": 1.0281,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.740180253982544,
"rewards/margins": 0.7049745321273804,
"rewards/rejected": -1.4451546669006348,
"step": 188
},
{
"epoch": 0.2857142857142857,
"epsilon_dpo/beta": 0.056944191455841064,
"epsilon_dpo/beta_margin_grad_mean": -0.3733265995979309,
"epsilon_dpo/beta_margin_grad_std": 0.1941097378730774,
"epsilon_dpo/beta_margin_mean": 0.6423479914665222,
"epsilon_dpo/beta_margin_std": 1.0260517597198486,
"epsilon_dpo/loss_margin_mean": 11.383039474487305,
"grad_norm": 24.319408416748047,
"kl/avg_steps": 0.5,
"kl/beta": 0.05722462013363838,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.5053079818876096e-07,
"logits/chosen": -0.33385777473449707,
"logits/rejected": -0.30858296155929565,
"logps/chosen": -84.73298645019531,
"logps/ref_chosen": -72.17626953125,
"logps/ref_rejected": -75.26313781738281,
"logps/rejected": -99.20288848876953,
"loss": 1.0595,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.7164562940597534,
"rewards/margins": 0.6423479318618774,
"rewards/rejected": -1.3588042259216309,
"step": 189
},
{
"epoch": 0.2872260015117158,
"epsilon_dpo/beta": 0.05662529170513153,
"epsilon_dpo/beta_margin_grad_mean": -0.3427947163581848,
"epsilon_dpo/beta_margin_grad_std": 0.18773041665554047,
"epsilon_dpo/beta_margin_mean": 0.8246808052062988,
"epsilon_dpo/beta_margin_std": 1.0402835607528687,
"epsilon_dpo/loss_margin_mean": 14.65948486328125,
"grad_norm": 28.0584774017334,
"kl/avg_steps": 0.5625,
"kl/beta": 0.05693991854786873,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.4973842271726024e-07,
"logits/chosen": -0.19765062630176544,
"logits/rejected": -0.5322472453117371,
"logps/chosen": -66.28235626220703,
"logps/ref_chosen": -54.624267578125,
"logps/ref_rejected": -101.47068786621094,
"logps/rejected": -127.78825378417969,
"loss": 0.9324,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.664069652557373,
"rewards/margins": 0.8246808052062988,
"rewards/rejected": -1.4887504577636719,
"step": 190
},
{
"epoch": 0.2887377173091459,
"epsilon_dpo/beta": 0.05634395033121109,
"epsilon_dpo/beta_margin_grad_mean": -0.3676350712776184,
"epsilon_dpo/beta_margin_grad_std": 0.21277813613414764,
"epsilon_dpo/beta_margin_mean": 0.6711294651031494,
"epsilon_dpo/beta_margin_std": 1.1578779220581055,
"epsilon_dpo/loss_margin_mean": 12.032771110534668,
"grad_norm": 28.362462997436523,
"kl/avg_steps": 0.5,
"kl/beta": 0.05662142485380173,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.48940460132708e-07,
"logits/chosen": -0.3306986689567566,
"logits/rejected": -0.30516481399536133,
"logps/chosen": -89.927978515625,
"logps/ref_chosen": -72.9325180053711,
"logps/ref_rejected": -89.95103454589844,
"logps/rejected": -118.97926330566406,
"loss": 1.0967,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9603179693222046,
"rewards/margins": 0.6711294651031494,
"rewards/rejected": -1.631447434425354,
"step": 191
},
{
"epoch": 0.29024943310657597,
"epsilon_dpo/beta": 0.056134067475795746,
"epsilon_dpo/beta_margin_grad_mean": -0.4142908453941345,
"epsilon_dpo/beta_margin_grad_std": 0.17809294164180756,
"epsilon_dpo/beta_margin_mean": 0.4027373194694519,
"epsilon_dpo/beta_margin_std": 0.8817051649093628,
"epsilon_dpo/loss_margin_mean": 7.276228427886963,
"grad_norm": 21.971487045288086,
"kl/avg_steps": 0.375,
"kl/beta": 0.056339725852012634,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.481369327558329e-07,
"logits/chosen": -0.23263001441955566,
"logits/rejected": -0.31310129165649414,
"logps/chosen": -72.01651000976562,
"logps/ref_chosen": -54.001121520996094,
"logps/ref_rejected": -63.53154754638672,
"logps/rejected": -88.82317352294922,
"loss": 1.1932,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0131219625473022,
"rewards/margins": 0.4027373194694519,
"rewards/rejected": -1.4158592224121094,
"step": 192
},
{
"epoch": 0.29176114890400606,
"epsilon_dpo/beta": 0.05583664029836655,
"epsilon_dpo/beta_margin_grad_mean": -0.3544656038284302,
"epsilon_dpo/beta_margin_grad_std": 0.18689486384391785,
"epsilon_dpo/beta_margin_mean": 0.7465012073516846,
"epsilon_dpo/beta_margin_std": 1.0022002458572388,
"epsilon_dpo/loss_margin_mean": 13.461745262145996,
"grad_norm": 22.3031063079834,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05612924322485924,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.47327863063023e-07,
"logits/chosen": -0.27325087785720825,
"logits/rejected": -0.36695626378059387,
"logps/chosen": -70.99406433105469,
"logps/ref_chosen": -56.74927520751953,
"logps/ref_rejected": -58.80628967285156,
"logps/rejected": -86.51283264160156,
"loss": 0.9748,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7992120385169983,
"rewards/margins": 0.7465012073516846,
"rewards/rejected": -1.545713186264038,
"step": 193
},
{
"epoch": 0.29327286470143615,
"epsilon_dpo/beta": 0.05559392273426056,
"epsilon_dpo/beta_margin_grad_mean": -0.3749810457229614,
"epsilon_dpo/beta_margin_grad_std": 0.21410410106182098,
"epsilon_dpo/beta_margin_mean": 0.645567774772644,
"epsilon_dpo/beta_margin_std": 1.1327472925186157,
"epsilon_dpo/loss_margin_mean": 11.74362564086914,
"grad_norm": 22.297475814819336,
"kl/avg_steps": 0.4375,
"kl/beta": 0.0558326318860054,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.4651327368569684e-07,
"logits/chosen": -0.2819008529186249,
"logits/rejected": -0.3600131869316101,
"logps/chosen": -71.22317504882812,
"logps/ref_chosen": -56.649444580078125,
"logps/ref_rejected": -69.98954772949219,
"logps/rejected": -96.30690002441406,
"loss": 1.1055,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.8121296167373657,
"rewards/margins": 0.645567774772644,
"rewards/rejected": -1.4576973915100098,
"step": 194
},
{
"epoch": 0.2947845804988662,
"epsilon_dpo/beta": 0.05537772923707962,
"epsilon_dpo/beta_margin_grad_mean": -0.3532843291759491,
"epsilon_dpo/beta_margin_grad_std": 0.20875737071037292,
"epsilon_dpo/beta_margin_mean": 0.7541001439094543,
"epsilon_dpo/beta_margin_std": 1.0787545442581177,
"epsilon_dpo/loss_margin_mean": 13.755513191223145,
"grad_norm": 25.483095169067383,
"kl/avg_steps": 0.390625,
"kl/beta": 0.055589426308870316,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.4569318740967043e-07,
"logits/chosen": -0.357321560382843,
"logits/rejected": -0.34259867668151855,
"logps/chosen": -89.14146423339844,
"logps/ref_chosen": -70.40978240966797,
"logps/ref_rejected": -74.39448547363281,
"logps/rejected": -106.88168334960938,
"loss": 1.0087,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0407756567001343,
"rewards/margins": 0.7541001439094543,
"rewards/rejected": -1.7948757410049438,
"step": 195
},
{
"epoch": 0.2962962962962963,
"epsilon_dpo/beta": 0.05506715923547745,
"epsilon_dpo/beta_margin_grad_mean": -0.369242399930954,
"epsilon_dpo/beta_margin_grad_std": 0.17825055122375488,
"epsilon_dpo/beta_margin_mean": 0.6246627569198608,
"epsilon_dpo/beta_margin_std": 0.9467172622680664,
"epsilon_dpo/loss_margin_mean": 11.432047843933105,
"grad_norm": 22.514299392700195,
"kl/avg_steps": 0.5625,
"kl/beta": 0.05537312477827072,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.448676271745197e-07,
"logits/chosen": -0.2100377380847931,
"logits/rejected": -0.2828536033630371,
"logps/chosen": -76.97757720947266,
"logps/ref_chosen": -59.227577209472656,
"logps/ref_rejected": -83.54757690429688,
"logps/rejected": -112.72962188720703,
"loss": 1.042,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9788019061088562,
"rewards/margins": 0.6246627569198608,
"rewards/rejected": -1.6034646034240723,
"step": 196
},
{
"epoch": 0.29780801209372637,
"epsilon_dpo/beta": 0.05475913733243942,
"epsilon_dpo/beta_margin_grad_mean": -0.3558703362941742,
"epsilon_dpo/beta_margin_grad_std": 0.23059363663196564,
"epsilon_dpo/beta_margin_mean": 0.7524293661117554,
"epsilon_dpo/beta_margin_std": 1.2486003637313843,
"epsilon_dpo/loss_margin_mean": 13.876150131225586,
"grad_norm": 24.459840774536133,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0550633929669857,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.440366160729392e-07,
"logits/chosen": -0.22216857969760895,
"logits/rejected": -0.28093230724334717,
"logps/chosen": -67.17655944824219,
"logps/ref_chosen": -51.52912902832031,
"logps/ref_rejected": -73.70631408691406,
"logps/rejected": -103.22988891601562,
"loss": 1.0858,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.8599985837936401,
"rewards/margins": 0.7524293661117554,
"rewards/rejected": -1.6124279499053955,
"step": 197
},
{
"epoch": 0.29931972789115646,
"epsilon_dpo/beta": 0.05441861227154732,
"epsilon_dpo/beta_margin_grad_mean": -0.3470156192779541,
"epsilon_dpo/beta_margin_grad_std": 0.1880028247833252,
"epsilon_dpo/beta_margin_mean": 0.7554615139961243,
"epsilon_dpo/beta_margin_std": 0.9644250869750977,
"epsilon_dpo/loss_margin_mean": 13.977117538452148,
"grad_norm": 22.188901901245117,
"kl/avg_steps": 0.625,
"kl/beta": 0.05475539341568947,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 4.432001773500957e-07,
"logits/chosen": -0.28294283151626587,
"logits/rejected": -0.3606169819831848,
"logps/chosen": -76.99325561523438,
"logps/ref_chosen": -59.78268051147461,
"logps/ref_rejected": -72.24533081054688,
"logps/rejected": -103.43302154541016,
"loss": 0.96,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9392465949058533,
"rewards/margins": 0.7554615139961243,
"rewards/rejected": -1.6947081089019775,
"step": 198
},
{
"epoch": 0.30083144368858655,
"epsilon_dpo/beta": 0.05419965833425522,
"epsilon_dpo/beta_margin_grad_mean": -0.3772837817668915,
"epsilon_dpo/beta_margin_grad_std": 0.21967315673828125,
"epsilon_dpo/beta_margin_mean": 0.6050155162811279,
"epsilon_dpo/beta_margin_std": 1.1063847541809082,
"epsilon_dpo/loss_margin_mean": 11.31169319152832,
"grad_norm": 22.75516128540039,
"kl/avg_steps": 0.40625,
"kl/beta": 0.054415300488471985,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.4235833440297856e-07,
"logits/chosen": -0.24749316275119781,
"logits/rejected": -0.4613002836704254,
"logps/chosen": -76.83978271484375,
"logps/ref_chosen": -56.38677215576172,
"logps/ref_rejected": -74.56779479980469,
"logps/rejected": -106.3324966430664,
"loss": 1.1321,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.1123127937316895,
"rewards/margins": 0.6050155162811279,
"rewards/rejected": -1.7173283100128174,
"step": 199
},
{
"epoch": 0.30234315948601664,
"epsilon_dpo/beta": 0.053912609815597534,
"epsilon_dpo/beta_margin_grad_mean": -0.3339231014251709,
"epsilon_dpo/beta_margin_grad_std": 0.2144888937473297,
"epsilon_dpo/beta_margin_mean": 0.9012787938117981,
"epsilon_dpo/beta_margin_std": 1.2188260555267334,
"epsilon_dpo/loss_margin_mean": 16.847806930541992,
"grad_norm": 21.172880172729492,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05419513210654259,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.415111107797445e-07,
"logits/chosen": -0.20614995062351227,
"logits/rejected": -0.3295978903770447,
"logps/chosen": -73.48313903808594,
"logps/ref_chosen": -57.82432556152344,
"logps/ref_rejected": -89.28246307373047,
"logps/rejected": -121.78907775878906,
"loss": 0.9619,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.8482496738433838,
"rewards/margins": 0.9012788534164429,
"rewards/rejected": -1.7495285272598267,
"step": 200
},
{
"epoch": 0.30234315948601664,
"eval_epsilon_dpo/beta": 0.05367208644747734,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.3725771903991699,
"eval_epsilon_dpo/beta_margin_grad_std": 0.20947444438934326,
"eval_epsilon_dpo/beta_margin_mean": 0.6642249226570129,
"eval_epsilon_dpo/beta_margin_std": 1.1184816360473633,
"eval_epsilon_dpo/loss_margin_mean": 12.508957862854004,
"eval_kl/n_epsilon_steps": 0.27508804202079773,
"eval_kl/p_epsilon_steps": 0.7240316867828369,
"eval_logits/chosen": -0.1899249106645584,
"eval_logits/rejected": -0.3033720552921295,
"eval_logps/chosen": -93.88325500488281,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -111.08172607421875,
"eval_loss": 0.5463831424713135,
"eval_rewards/accuracies": 0.7183098793029785,
"eval_rewards/chosen": -1.024414300918579,
"eval_rewards/margins": 0.6642249226570129,
"eval_rewards/rejected": -1.6886391639709473,
"eval_runtime": 41.7874,
"eval_samples_per_second": 55.112,
"eval_steps_per_second": 1.723,
"step": 200
},
{
"epoch": 0.30385487528344673,
"epsilon_dpo/beta": 0.05366141349077225,
"epsilon_dpo/beta_margin_grad_mean": -0.37062400579452515,
"epsilon_dpo/beta_margin_grad_std": 0.21090182662010193,
"epsilon_dpo/beta_margin_mean": 0.671442985534668,
"epsilon_dpo/beta_margin_std": 1.1181495189666748,
"epsilon_dpo/loss_margin_mean": 12.643129348754883,
"grad_norm": 24.668813705444336,
"kl/avg_steps": 0.46875,
"kl/beta": 0.05390874296426773,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.4065853017905953e-07,
"logits/chosen": -0.21364565193653107,
"logits/rejected": -0.3193379044532776,
"logps/chosen": -81.317626953125,
"logps/ref_chosen": -58.999759674072266,
"logps/ref_rejected": -84.67575073242188,
"logps/rejected": -119.63674926757812,
"loss": 1.0788,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2012832164764404,
"rewards/margins": 0.671442985534668,
"rewards/rejected": -1.8727260828018188,
"step": 201
},
{
"epoch": 0.30536659108087677,
"epsilon_dpo/beta": 0.053343966603279114,
"epsilon_dpo/beta_margin_grad_mean": -0.3508089780807495,
"epsilon_dpo/beta_margin_grad_std": 0.2011328935623169,
"epsilon_dpo/beta_margin_mean": 0.7409186363220215,
"epsilon_dpo/beta_margin_std": 1.0971077680587769,
"epsilon_dpo/loss_margin_mean": 14.002457618713379,
"grad_norm": 24.13290023803711,
"kl/avg_steps": 0.59375,
"kl/beta": 0.05365722253918648,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.3980061644943575e-07,
"logits/chosen": -0.2312248796224594,
"logits/rejected": -0.3955553472042084,
"logps/chosen": -66.12772369384766,
"logps/ref_chosen": -47.660648345947266,
"logps/ref_rejected": -73.63249206542969,
"logps/rejected": -106.1020278930664,
"loss": 1.0172,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9882045984268188,
"rewards/margins": 0.7409186363220215,
"rewards/rejected": -1.7291233539581299,
"step": 202
},
{
"epoch": 0.30687830687830686,
"epsilon_dpo/beta": 0.05306244641542435,
"epsilon_dpo/beta_margin_grad_mean": -0.3689170777797699,
"epsilon_dpo/beta_margin_grad_std": 0.20343144237995148,
"epsilon_dpo/beta_margin_mean": 0.6760145425796509,
"epsilon_dpo/beta_margin_std": 1.079183578491211,
"epsilon_dpo/loss_margin_mean": 12.856017112731934,
"grad_norm": 24.65911865234375,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05334051325917244,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.3893739358856455e-07,
"logits/chosen": -0.16740179061889648,
"logits/rejected": -0.2672940492630005,
"logps/chosen": -84.31963348388672,
"logps/ref_chosen": -62.32553482055664,
"logps/ref_rejected": -99.37225341796875,
"logps/rejected": -134.22238159179688,
"loss": 1.0573,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.170931100845337,
"rewards/margins": 0.6760145425796509,
"rewards/rejected": -1.8469456434249878,
"step": 203
},
{
"epoch": 0.30839002267573695,
"epsilon_dpo/beta": 0.052806831896305084,
"epsilon_dpo/beta_margin_grad_mean": -0.3536710739135742,
"epsilon_dpo/beta_margin_grad_std": 0.19887302815914154,
"epsilon_dpo/beta_margin_mean": 0.7577495574951172,
"epsilon_dpo/beta_margin_std": 1.06210196018219,
"epsilon_dpo/loss_margin_mean": 14.47249698638916,
"grad_norm": 21.671335220336914,
"kl/avg_steps": 0.484375,
"kl/beta": 0.05305863916873932,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.380688857426449e-07,
"logits/chosen": -0.13354623317718506,
"logits/rejected": -0.26746487617492676,
"logps/chosen": -70.49836730957031,
"logps/ref_chosen": -50.62931442260742,
"logps/ref_rejected": -66.60475158691406,
"logps/rejected": -100.94629669189453,
"loss": 0.9926,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0505276918411255,
"rewards/margins": 0.7577494978904724,
"rewards/rejected": -1.8082772493362427,
"step": 204
},
{
"epoch": 0.30990173847316704,
"epsilon_dpo/beta": 0.05252761393785477,
"epsilon_dpo/beta_margin_grad_mean": -0.36570826172828674,
"epsilon_dpo/beta_margin_grad_std": 0.21349091827869415,
"epsilon_dpo/beta_margin_mean": 0.7072960138320923,
"epsilon_dpo/beta_margin_std": 1.2182862758636475,
"epsilon_dpo/loss_margin_mean": 13.59742546081543,
"grad_norm": 29.41587257385254,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05280287563800812,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.3719511720570814e-07,
"logits/chosen": -0.32566794753074646,
"logits/rejected": -0.3880379796028137,
"logps/chosen": -93.26892852783203,
"logps/ref_chosen": -70.35617065429688,
"logps/ref_rejected": -93.39848327636719,
"logps/rejected": -129.9086456298828,
"loss": 1.0866,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2062199115753174,
"rewards/margins": 0.7072960138320923,
"rewards/rejected": -1.9135158061981201,
"step": 205
},
{
"epoch": 0.31141345427059713,
"epsilon_dpo/beta": 0.05236494168639183,
"epsilon_dpo/beta_margin_grad_mean": -0.39649882912635803,
"epsilon_dpo/beta_margin_grad_std": 0.2292843461036682,
"epsilon_dpo/beta_margin_mean": 0.5444038510322571,
"epsilon_dpo/beta_margin_std": 1.1895228624343872,
"epsilon_dpo/loss_margin_mean": 10.55942440032959,
"grad_norm": 24.95809555053711,
"kl/avg_steps": 0.3125,
"kl/beta": 0.05252384394407272,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.363161124189387e-07,
"logits/chosen": -0.3284838795661926,
"logits/rejected": -0.35329103469848633,
"logps/chosen": -91.85841369628906,
"logps/ref_chosen": -67.64547729492188,
"logps/ref_rejected": -79.89584350585938,
"logps/rejected": -114.66819763183594,
"loss": 1.2121,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.271415114402771,
"rewards/margins": 0.5444037914276123,
"rewards/rejected": -1.8158189058303833,
"step": 206
},
{
"epoch": 0.3129251700680272,
"epsilon_dpo/beta": 0.05210362374782562,
"epsilon_dpo/beta_margin_grad_mean": -0.359841525554657,
"epsilon_dpo/beta_margin_grad_std": 0.2003079652786255,
"epsilon_dpo/beta_margin_mean": 0.7392714619636536,
"epsilon_dpo/beta_margin_std": 1.0833410024642944,
"epsilon_dpo/loss_margin_mean": 14.311153411865234,
"grad_norm": 22.56675148010254,
"kl/avg_steps": 0.5,
"kl/beta": 0.05236021801829338,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.3543189596998986e-07,
"logits/chosen": -0.2962496280670166,
"logits/rejected": -0.4314347505569458,
"logps/chosen": -95.94314575195312,
"logps/ref_chosen": -67.66419219970703,
"logps/ref_rejected": -85.10249328613281,
"logps/rejected": -127.69259643554688,
"loss": 1.0112,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4758737087249756,
"rewards/margins": 0.7392715215682983,
"rewards/rejected": -2.2151451110839844,
"step": 207
},
{
"epoch": 0.3144368858654573,
"epsilon_dpo/beta": 0.05195838585495949,
"epsilon_dpo/beta_margin_grad_mean": -0.42607802152633667,
"epsilon_dpo/beta_margin_grad_std": 0.21802197396755219,
"epsilon_dpo/beta_margin_mean": 0.3730897903442383,
"epsilon_dpo/beta_margin_std": 1.0580157041549683,
"epsilon_dpo/loss_margin_mean": 7.333087921142578,
"grad_norm": 28.304237365722656,
"kl/avg_steps": 0.28125,
"kl/beta": 0.052099719643592834,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 4.3454249259229664e-07,
"logits/chosen": -0.2553904950618744,
"logits/rejected": -0.28176793456077576,
"logps/chosen": -80.06737518310547,
"logps/ref_chosen": -57.731712341308594,
"logps/ref_rejected": -74.19276428222656,
"logps/rejected": -103.86151885986328,
"loss": 1.2958,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.1618506908416748,
"rewards/margins": 0.3730897903442383,
"rewards/rejected": -1.5349406003952026,
"step": 208
},
{
"epoch": 0.31594860166288735,
"epsilon_dpo/beta": 0.05169900134205818,
"epsilon_dpo/beta_margin_grad_mean": -0.3398745656013489,
"epsilon_dpo/beta_margin_grad_std": 0.21661755442619324,
"epsilon_dpo/beta_margin_mean": 0.8418377637863159,
"epsilon_dpo/beta_margin_std": 1.241960048675537,
"epsilon_dpo/loss_margin_mean": 16.422760009765625,
"grad_norm": 26.676044464111328,
"kl/avg_steps": 0.5,
"kl/beta": 0.051953598856925964,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.336479271643833e-07,
"logits/chosen": -0.2835359573364258,
"logits/rejected": -0.3975231647491455,
"logps/chosen": -91.62300109863281,
"logps/ref_chosen": -68.55007934570312,
"logps/ref_rejected": -87.90542602539062,
"logps/rejected": -127.40109252929688,
"loss": 1.0114,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1974425315856934,
"rewards/margins": 0.8418377041816711,
"rewards/rejected": -2.039280414581299,
"step": 209
},
{
"epoch": 0.31746031746031744,
"epsilon_dpo/beta": 0.05142563581466675,
"epsilon_dpo/beta_margin_grad_mean": -0.34444931149482727,
"epsilon_dpo/beta_margin_grad_std": 0.21232710778713226,
"epsilon_dpo/beta_margin_mean": 0.8279846906661987,
"epsilon_dpo/beta_margin_std": 1.1546531915664673,
"epsilon_dpo/loss_margin_mean": 16.233200073242188,
"grad_norm": 21.39499282836914,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05169512331485748,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.327482247091679e-07,
"logits/chosen": -0.1698862612247467,
"logits/rejected": -0.33817726373672485,
"logps/chosen": -80.65475463867188,
"logps/ref_chosen": -57.268272399902344,
"logps/ref_rejected": -85.72807312011719,
"logps/rejected": -125.34776306152344,
"loss": 0.985,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.2052545547485352,
"rewards/margins": 0.8279846906661987,
"rewards/rejected": -2.0332393646240234,
"step": 210
},
{
"epoch": 0.31897203325774753,
"epsilon_dpo/beta": 0.051137808710336685,
"epsilon_dpo/beta_margin_grad_mean": -0.3561786115169525,
"epsilon_dpo/beta_margin_grad_std": 0.18420453369617462,
"epsilon_dpo/beta_margin_mean": 0.7483376264572144,
"epsilon_dpo/beta_margin_std": 1.028225064277649,
"epsilon_dpo/loss_margin_mean": 14.734661102294922,
"grad_norm": 25.736953735351562,
"kl/avg_steps": 0.5625,
"kl/beta": 0.05142194405198097,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.3184341039326217e-07,
"logits/chosen": -0.12346768379211426,
"logits/rejected": -0.3688925504684448,
"logps/chosen": -74.39219665527344,
"logps/ref_chosen": -53.640708923339844,
"logps/ref_rejected": -93.03880310058594,
"logps/rejected": -128.5249481201172,
"loss": 0.9751,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.065348744392395,
"rewards/margins": 0.7483376264572144,
"rewards/rejected": -1.8136863708496094,
"step": 211
},
{
"epoch": 0.3204837490551776,
"epsilon_dpo/beta": 0.05088372901082039,
"epsilon_dpo/beta_margin_grad_mean": -0.34854933619499207,
"epsilon_dpo/beta_margin_grad_std": 0.20713090896606445,
"epsilon_dpo/beta_margin_mean": 0.781592845916748,
"epsilon_dpo/beta_margin_std": 1.1313918828964233,
"epsilon_dpo/loss_margin_mean": 15.493827819824219,
"grad_norm": 21.113059997558594,
"kl/avg_steps": 0.5,
"kl/beta": 0.05113431438803673,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.309335095262675e-07,
"logits/chosen": -0.1305130422115326,
"logits/rejected": -0.2613433301448822,
"logps/chosen": -81.18766784667969,
"logps/ref_chosen": -57.36674499511719,
"logps/ref_rejected": -79.89643096923828,
"logps/rejected": -119.21118927001953,
"loss": 1.0056,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.215775966644287,
"rewards/margins": 0.781592845916748,
"rewards/rejected": -1.9973688125610352,
"step": 212
},
{
"epoch": 0.3219954648526077,
"epsilon_dpo/beta": 0.050582874566316605,
"epsilon_dpo/beta_margin_grad_mean": -0.3371620774269104,
"epsilon_dpo/beta_margin_grad_std": 0.21343198418617249,
"epsilon_dpo/beta_margin_mean": 0.8663833141326904,
"epsilon_dpo/beta_margin_std": 1.2144683599472046,
"epsilon_dpo/loss_margin_mean": 17.25967025756836,
"grad_norm": 19.573049545288086,
"kl/avg_steps": 0.59375,
"kl/beta": 0.050879914313554764,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.3001854756006724e-07,
"logits/chosen": -0.2590155601501465,
"logits/rejected": -0.3154826760292053,
"logps/chosen": -83.90384674072266,
"logps/ref_chosen": -65.22111511230469,
"logps/ref_rejected": -80.1810302734375,
"logps/rejected": -116.12342834472656,
"loss": 0.9794,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9479498863220215,
"rewards/margins": 0.8663833141326904,
"rewards/rejected": -1.814333200454712,
"step": 213
},
{
"epoch": 0.3235071806500378,
"epsilon_dpo/beta": 0.050379153341054916,
"epsilon_dpo/beta_margin_grad_mean": -0.342753529548645,
"epsilon_dpo/beta_margin_grad_std": 0.22161895036697388,
"epsilon_dpo/beta_margin_mean": 0.8594990968704224,
"epsilon_dpo/beta_margin_std": 1.2116649150848389,
"epsilon_dpo/loss_margin_mean": 17.224977493286133,
"grad_norm": 25.820276260375977,
"kl/avg_steps": 0.40625,
"kl/beta": 0.05057959631085396,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.290985500881143e-07,
"logits/chosen": -0.2213762253522873,
"logits/rejected": -0.29440993070602417,
"logps/chosen": -83.03913116455078,
"logps/ref_chosen": -61.292327880859375,
"logps/ref_rejected": -67.69841003417969,
"logps/rejected": -106.67018127441406,
"loss": 0.9879,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1000583171844482,
"rewards/margins": 0.8594990968704224,
"rewards/rejected": -1.9595574140548706,
"step": 214
},
{
"epoch": 0.3250188964474679,
"epsilon_dpo/beta": 0.050128087401390076,
"epsilon_dpo/beta_margin_grad_mean": -0.3391542136669159,
"epsilon_dpo/beta_margin_grad_std": 0.22373604774475098,
"epsilon_dpo/beta_margin_mean": 0.886252760887146,
"epsilon_dpo/beta_margin_std": 1.218886137008667,
"epsilon_dpo/loss_margin_mean": 17.829500198364258,
"grad_norm": 22.375837326049805,
"kl/avg_steps": 0.5,
"kl/beta": 0.05037495121359825,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.281735428447157e-07,
"logits/chosen": -0.1783505380153656,
"logits/rejected": -0.42138317227363586,
"logps/chosen": -89.10653686523438,
"logps/ref_chosen": -63.86913299560547,
"logps/ref_rejected": -98.7657241821289,
"logps/rejected": -141.83261108398438,
"loss": 0.9768,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2677936553955078,
"rewards/margins": 0.886252760887146,
"rewards/rejected": -2.1540462970733643,
"step": 215
},
{
"epoch": 0.32653061224489793,
"epsilon_dpo/beta": 0.04986302927136421,
"epsilon_dpo/beta_margin_grad_mean": -0.3254110515117645,
"epsilon_dpo/beta_margin_grad_std": 0.20005854964256287,
"epsilon_dpo/beta_margin_mean": 0.9090076088905334,
"epsilon_dpo/beta_margin_std": 1.0917046070098877,
"epsilon_dpo/loss_margin_mean": 18.35765838623047,
"grad_norm": 23.445894241333008,
"kl/avg_steps": 0.53125,
"kl/beta": 0.05012432858347893,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.2724355170431247e-07,
"logits/chosen": -0.24750207364559174,
"logits/rejected": -0.42292094230651855,
"logps/chosen": -89.51116180419922,
"logps/ref_chosen": -67.824951171875,
"logps/ref_rejected": -96.40231323242188,
"logps/rejected": -136.44618225097656,
"loss": 0.9057,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.083980679512024,
"rewards/margins": 0.9090075492858887,
"rewards/rejected": -1.9929883480072021,
"step": 216
},
{
"epoch": 0.328042328042328,
"epsilon_dpo/beta": 0.04956836625933647,
"epsilon_dpo/beta_margin_grad_mean": -0.3257940411567688,
"epsilon_dpo/beta_margin_grad_std": 0.20377041399478912,
"epsilon_dpo/beta_margin_mean": 0.9509314298629761,
"epsilon_dpo/beta_margin_std": 1.1743782758712769,
"epsilon_dpo/loss_margin_mean": 19.306211471557617,
"grad_norm": 19.750581741333008,
"kl/avg_steps": 0.59375,
"kl/beta": 0.04985944926738739,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.26308602680756e-07,
"logits/chosen": -0.2603002190589905,
"logits/rejected": -0.461361825466156,
"logps/chosen": -85.08186340332031,
"logps/ref_chosen": -60.50499725341797,
"logps/ref_rejected": -84.26618194580078,
"logps/rejected": -128.14926147460938,
"loss": 0.9027,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.222327470779419,
"rewards/margins": 0.9509314298629761,
"rewards/rejected": -2.1732587814331055,
"step": 217
},
{
"epoch": 0.3295540438397581,
"epsilon_dpo/beta": 0.0493842251598835,
"epsilon_dpo/beta_margin_grad_mean": -0.3916711211204529,
"epsilon_dpo/beta_margin_grad_std": 0.23450112342834473,
"epsilon_dpo/beta_margin_mean": 0.5639125108718872,
"epsilon_dpo/beta_margin_std": 1.2866250276565552,
"epsilon_dpo/loss_margin_mean": 11.593125343322754,
"grad_norm": 24.170324325561523,
"kl/avg_steps": 0.375,
"kl/beta": 0.0495651550590992,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.253687219265803e-07,
"logits/chosen": -0.35200822353363037,
"logits/rejected": -0.3244783878326416,
"logps/chosen": -96.97987365722656,
"logps/ref_chosen": -70.59431457519531,
"logps/ref_rejected": -73.89038848876953,
"logps/rejected": -111.86907196044922,
"loss": 1.2353,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3072826862335205,
"rewards/margins": 0.563912570476532,
"rewards/rejected": -1.8711950778961182,
"step": 218
},
{
"epoch": 0.3310657596371882,
"epsilon_dpo/beta": 0.049199726432561874,
"epsilon_dpo/beta_margin_grad_mean": -0.3830212950706482,
"epsilon_dpo/beta_margin_grad_std": 0.19721731543540955,
"epsilon_dpo/beta_margin_mean": 0.5809386372566223,
"epsilon_dpo/beta_margin_std": 0.9904532432556152,
"epsilon_dpo/loss_margin_mean": 11.946596145629883,
"grad_norm": 21.35806655883789,
"kl/avg_steps": 0.375,
"kl/beta": 0.04937998205423355,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.2442393573227043e-07,
"logits/chosen": -0.2186792492866516,
"logits/rejected": -0.287276029586792,
"logps/chosen": -84.83613586425781,
"logps/ref_chosen": -60.490943908691406,
"logps/ref_rejected": -75.85001373291016,
"logps/rejected": -112.14179992675781,
"loss": 1.0965,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1999635696411133,
"rewards/margins": 0.5809386372566223,
"rewards/rejected": -1.7809021472930908,
"step": 219
},
{
"epoch": 0.3325774754346183,
"epsilon_dpo/beta": 0.04903129115700722,
"epsilon_dpo/beta_margin_grad_mean": -0.367901474237442,
"epsilon_dpo/beta_margin_grad_std": 0.2149849534034729,
"epsilon_dpo/beta_margin_mean": 0.7109902501106262,
"epsilon_dpo/beta_margin_std": 1.164394736289978,
"epsilon_dpo/loss_margin_mean": 14.667479515075684,
"grad_norm": 21.353673934936523,
"kl/avg_steps": 0.34375,
"kl/beta": 0.04919549822807312,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.234742705255272e-07,
"logits/chosen": -0.11274047940969467,
"logits/rejected": -0.2561477720737457,
"logps/chosen": -65.19322204589844,
"logps/ref_chosen": -45.013397216796875,
"logps/ref_rejected": -70.49369812011719,
"logps/rejected": -105.34100341796875,
"loss": 1.0668,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.9930375814437866,
"rewards/margins": 0.7109901905059814,
"rewards/rejected": -1.704027771949768,
"step": 220
},
{
"epoch": 0.3340891912320484,
"epsilon_dpo/beta": 0.048786710947752,
"epsilon_dpo/beta_margin_grad_mean": -0.3601202368736267,
"epsilon_dpo/beta_margin_grad_std": 0.20719635486602783,
"epsilon_dpo/beta_margin_mean": 0.7726308703422546,
"epsilon_dpo/beta_margin_std": 1.192143201828003,
"epsilon_dpo/loss_margin_mean": 15.972743034362793,
"grad_norm": 23.348731994628906,
"kl/avg_steps": 0.5,
"kl/beta": 0.049026969820261,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.22519752870528e-07,
"logits/chosen": -0.24676984548568726,
"logits/rejected": -0.32545697689056396,
"logps/chosen": -78.79469299316406,
"logps/ref_chosen": -59.09584045410156,
"logps/ref_rejected": -88.64388275146484,
"logps/rejected": -124.31547546386719,
"loss": 1.0239,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9637718796730042,
"rewards/margins": 0.7726308703422546,
"rewards/rejected": -1.7364027500152588,
"step": 221
},
{
"epoch": 0.3356009070294785,
"epsilon_dpo/beta": 0.048498254269361496,
"epsilon_dpo/beta_margin_grad_mean": -0.3126097321510315,
"epsilon_dpo/beta_margin_grad_std": 0.20452351868152618,
"epsilon_dpo/beta_margin_mean": 1.0146393775939941,
"epsilon_dpo/beta_margin_std": 1.1660168170928955,
"epsilon_dpo/loss_margin_mean": 21.046466827392578,
"grad_norm": 20.287765502929688,
"kl/avg_steps": 0.59375,
"kl/beta": 0.04878305271267891,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.2156040946718343e-07,
"logits/chosen": -0.31022557616233826,
"logits/rejected": -0.3927844762802124,
"logps/chosen": -76.32540893554688,
"logps/ref_chosen": -55.9976921081543,
"logps/ref_rejected": -111.94727325439453,
"logps/rejected": -153.3214569091797,
"loss": 0.8638,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9881341457366943,
"rewards/margins": 1.0146393775939941,
"rewards/rejected": -2.0027735233306885,
"step": 222
},
{
"epoch": 0.3371126228269085,
"epsilon_dpo/beta": 0.04830293357372284,
"epsilon_dpo/beta_margin_grad_mean": -0.3402611017227173,
"epsilon_dpo/beta_margin_grad_std": 0.19977925717830658,
"epsilon_dpo/beta_margin_mean": 0.8388980627059937,
"epsilon_dpo/beta_margin_std": 1.0658760070800781,
"epsilon_dpo/loss_margin_mean": 17.520381927490234,
"grad_norm": 20.322734832763672,
"kl/avg_steps": 0.40625,
"kl/beta": 0.04849511384963989,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.2059626715039065e-07,
"logits/chosen": -0.2663101553916931,
"logits/rejected": -0.3712068200111389,
"logps/chosen": -82.51565551757812,
"logps/ref_chosen": -59.891422271728516,
"logps/ref_rejected": -86.28954315185547,
"logps/rejected": -126.43417358398438,
"loss": 0.939,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.0962058305740356,
"rewards/margins": 0.8388980627059937,
"rewards/rejected": -1.9351038932800293,
"step": 223
},
{
"epoch": 0.3386243386243386,
"epsilon_dpo/beta": 0.048122588545084,
"epsilon_dpo/beta_margin_grad_mean": -0.40934452414512634,
"epsilon_dpo/beta_margin_grad_std": 0.20357008278369904,
"epsilon_dpo/beta_margin_mean": 0.47357648611068726,
"epsilon_dpo/beta_margin_std": 1.0665119886398315,
"epsilon_dpo/loss_margin_mean": 9.9871244430542,
"grad_norm": 26.544706344604492,
"kl/avg_steps": 0.375,
"kl/beta": 0.048298899084329605,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.1962735288928304e-07,
"logits/chosen": -0.20113852620124817,
"logits/rejected": -0.2587093114852905,
"logps/chosen": -89.50384521484375,
"logps/ref_chosen": -64.04463195800781,
"logps/ref_rejected": -75.05450439453125,
"logps/rejected": -110.5008316040039,
"loss": 1.2036,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.22823166847229,
"rewards/margins": 0.47357648611068726,
"rewards/rejected": -1.701808214187622,
"step": 224
},
{
"epoch": 0.3401360544217687,
"epsilon_dpo/beta": 0.047882650047540665,
"epsilon_dpo/beta_margin_grad_mean": -0.34000715613365173,
"epsilon_dpo/beta_margin_grad_std": 0.20214377343654633,
"epsilon_dpo/beta_margin_mean": 0.8546714186668396,
"epsilon_dpo/beta_margin_std": 1.1385964155197144,
"epsilon_dpo/loss_margin_mean": 17.97957992553711,
"grad_norm": 24.81421661376953,
"kl/avg_steps": 0.5,
"kl/beta": 0.04811845347285271,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.186536937864752e-07,
"logits/chosen": -0.3176313042640686,
"logits/rejected": -0.5447347164154053,
"logps/chosen": -90.762939453125,
"logps/ref_chosen": -66.0958251953125,
"logps/ref_rejected": -97.68675231933594,
"logps/rejected": -140.33343505859375,
"loss": 0.9532,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.1839871406555176,
"rewards/margins": 0.8546714782714844,
"rewards/rejected": -2.038658618927002,
"step": 225
},
{
"epoch": 0.3416477702191988,
"epsilon_dpo/beta": 0.04768931865692139,
"epsilon_dpo/beta_margin_grad_mean": -0.37059932947158813,
"epsilon_dpo/beta_margin_grad_std": 0.2239595502614975,
"epsilon_dpo/beta_margin_mean": 0.6670401096343994,
"epsilon_dpo/beta_margin_std": 1.1524150371551514,
"epsilon_dpo/loss_margin_mean": 14.158507347106934,
"grad_norm": 22.927324295043945,
"kl/avg_steps": 0.40625,
"kl/beta": 0.04787905886769295,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.176753170773052e-07,
"logits/chosen": -0.1265522688627243,
"logits/rejected": -0.18893226981163025,
"logps/chosen": -72.90589141845703,
"logps/ref_chosen": -51.4168701171875,
"logps/ref_rejected": -66.30068969726562,
"logps/rejected": -101.94821166992188,
"loss": 1.1049,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0277118682861328,
"rewards/margins": 0.6670401692390442,
"rewards/rejected": -1.6947519779205322,
"step": 226
},
{
"epoch": 0.3431594860166289,
"epsilon_dpo/beta": 0.047481462359428406,
"epsilon_dpo/beta_margin_grad_mean": -0.37029850482940674,
"epsilon_dpo/beta_margin_grad_std": 0.22259338200092316,
"epsilon_dpo/beta_margin_mean": 0.6982373595237732,
"epsilon_dpo/beta_margin_std": 1.2246394157409668,
"epsilon_dpo/loss_margin_mean": 14.864490509033203,
"grad_norm": 25.389001846313477,
"kl/avg_steps": 0.4375,
"kl/beta": 0.04768533632159233,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.166922501290729e-07,
"logits/chosen": -0.16581010818481445,
"logits/rejected": -0.2899661362171173,
"logps/chosen": -80.6136474609375,
"logps/ref_chosen": -57.98978042602539,
"logps/ref_rejected": -75.05464172363281,
"logps/rejected": -112.54299926757812,
"loss": 1.1047,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0783476829528809,
"rewards/margins": 0.698237419128418,
"rewards/rejected": -1.7765851020812988,
"step": 227
},
{
"epoch": 0.34467120181405897,
"epsilon_dpo/beta": 0.04724495857954025,
"epsilon_dpo/beta_margin_grad_mean": -0.35448509454727173,
"epsilon_dpo/beta_margin_grad_std": 0.21162235736846924,
"epsilon_dpo/beta_margin_mean": 0.7630500793457031,
"epsilon_dpo/beta_margin_std": 1.125166893005371,
"epsilon_dpo/loss_margin_mean": 16.300472259521484,
"grad_norm": 20.90208625793457,
"kl/avg_steps": 0.5,
"kl/beta": 0.04747762158513069,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 4.1570452044027405e-07,
"logits/chosen": -0.16706092655658722,
"logits/rejected": -0.3283839821815491,
"logps/chosen": -78.76603698730469,
"logps/ref_chosen": -55.559364318847656,
"logps/ref_rejected": -77.02364349365234,
"logps/rejected": -116.5307846069336,
"loss": 1.017,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0988540649414062,
"rewards/margins": 0.7630500793457031,
"rewards/rejected": -1.8619041442871094,
"step": 228
},
{
"epoch": 0.34618291761148906,
"epsilon_dpo/beta": 0.046995144337415695,
"epsilon_dpo/beta_margin_grad_mean": -0.37700027227401733,
"epsilon_dpo/beta_margin_grad_std": 0.22120419144630432,
"epsilon_dpo/beta_margin_mean": 0.6218365430831909,
"epsilon_dpo/beta_margin_std": 1.1692832708358765,
"epsilon_dpo/loss_margin_mean": 13.381778717041016,
"grad_norm": 43.009891510009766,
"kl/avg_steps": 0.53125,
"kl/beta": 0.047241415828466415,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.147121556398312e-07,
"logits/chosen": -0.09967577457427979,
"logits/rejected": -0.20441506803035736,
"logps/chosen": -71.13024139404297,
"logps/ref_chosen": -50.79466247558594,
"logps/ref_rejected": -78.44740295410156,
"logps/rejected": -112.16476440429688,
"loss": 1.1416,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9604513645172119,
"rewards/margins": 0.6218365430831909,
"rewards/rejected": -1.5822877883911133,
"step": 229
},
{
"epoch": 0.3476946334089191,
"epsilon_dpo/beta": 0.04667336866259575,
"epsilon_dpo/beta_margin_grad_mean": -0.3549221456050873,
"epsilon_dpo/beta_margin_grad_std": 0.19611062109470367,
"epsilon_dpo/beta_margin_mean": 0.7466577887535095,
"epsilon_dpo/beta_margin_std": 1.1324728727340698,
"epsilon_dpo/loss_margin_mean": 16.100971221923828,
"grad_norm": 22.320335388183594,
"kl/avg_steps": 0.6875,
"kl/beta": 0.04699177294969559,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -0.24332374334335327,
"logits/rejected": -0.24807855486869812,
"logps/chosen": -78.63899230957031,
"logps/ref_chosen": -56.729225158691406,
"logps/ref_rejected": -62.99180603027344,
"logps/rejected": -101.0025405883789,
"loss": 1.0203,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.0256309509277344,
"rewards/margins": 0.7466577887535095,
"rewards/rejected": -1.7722887992858887,
"step": 230
},
{
"epoch": 0.3492063492063492,
"epsilon_dpo/beta": 0.04639843851327896,
"epsilon_dpo/beta_margin_grad_mean": -0.29492706060409546,
"epsilon_dpo/beta_margin_grad_std": 0.1999235302209854,
"epsilon_dpo/beta_margin_mean": 1.0802605152130127,
"epsilon_dpo/beta_margin_std": 1.099169135093689,
"epsilon_dpo/loss_margin_mean": 23.418697357177734,
"grad_norm": 21.897851943969727,
"kl/avg_steps": 0.59375,
"kl/beta": 0.046670909970998764,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.1271363186719835e-07,
"logits/chosen": -0.2738434672355652,
"logits/rejected": -0.3240780234336853,
"logps/chosen": -96.62458801269531,
"logps/ref_chosen": -72.59710693359375,
"logps/ref_rejected": -86.2322998046875,
"logps/rejected": -133.67848205566406,
"loss": 0.8092,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1179986000061035,
"rewards/margins": 1.0802605152130127,
"rewards/rejected": -2.198259115219116,
"step": 231
},
{
"epoch": 0.3507180650037793,
"epsilon_dpo/beta": 0.0461970753967762,
"epsilon_dpo/beta_margin_grad_mean": -0.37141913175582886,
"epsilon_dpo/beta_margin_grad_std": 0.21335488557815552,
"epsilon_dpo/beta_margin_mean": 0.7099602818489075,
"epsilon_dpo/beta_margin_std": 1.3296102285385132,
"epsilon_dpo/loss_margin_mean": 15.5336332321167,
"grad_norm": 26.136768341064453,
"kl/avg_steps": 0.4375,
"kl/beta": 0.0463954359292984,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.1170752879801436e-07,
"logits/chosen": -0.2462497353553772,
"logits/rejected": -0.25309091806411743,
"logps/chosen": -92.60012817382812,
"logps/ref_chosen": -68.1185302734375,
"logps/ref_rejected": -83.79415893554688,
"logps/rejected": -123.80940246582031,
"loss": 1.116,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.133660912513733,
"rewards/margins": 0.7099602222442627,
"rewards/rejected": -1.8436212539672852,
"step": 232
},
{
"epoch": 0.35222978080120937,
"epsilon_dpo/beta": 0.04605359211564064,
"epsilon_dpo/beta_margin_grad_mean": -0.3957999348640442,
"epsilon_dpo/beta_margin_grad_std": 0.21426290273666382,
"epsilon_dpo/beta_margin_mean": 0.5684574842453003,
"epsilon_dpo/beta_margin_std": 1.135840892791748,
"epsilon_dpo/loss_margin_mean": 12.512956619262695,
"grad_norm": 23.499263763427734,
"kl/avg_steps": 0.3125,
"kl/beta": 0.04619334265589714,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 4.106969024216348e-07,
"logits/chosen": -0.1567138284444809,
"logits/rejected": -0.29733848571777344,
"logps/chosen": -82.85314178466797,
"logps/ref_chosen": -55.070152282714844,
"logps/ref_rejected": -66.61845397949219,
"logps/rejected": -106.91439819335938,
"loss": 1.1591,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.286712646484375,
"rewards/margins": 0.5684574842453003,
"rewards/rejected": -1.8551700115203857,
"step": 233
},
{
"epoch": 0.35374149659863946,
"epsilon_dpo/beta": 0.04588134214282036,
"epsilon_dpo/beta_margin_grad_mean": -0.4026263356208801,
"epsilon_dpo/beta_margin_grad_std": 0.22269965708255768,
"epsilon_dpo/beta_margin_mean": 0.5063302516937256,
"epsilon_dpo/beta_margin_std": 1.1445589065551758,
"epsilon_dpo/loss_margin_mean": 11.206908226013184,
"grad_norm": 26.901485443115234,
"kl/avg_steps": 0.375,
"kl/beta": 0.046049438416957855,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.09681781007452e-07,
"logits/chosen": -0.10691481828689575,
"logits/rejected": -0.14320091903209686,
"logps/chosen": -80.59999084472656,
"logps/ref_chosen": -55.92589569091797,
"logps/ref_rejected": -51.11608123779297,
"logps/rejected": -86.99708557128906,
"loss": 1.2204,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1371861696243286,
"rewards/margins": 0.5063302516937256,
"rewards/rejected": -1.6435164213180542,
"step": 234
},
{
"epoch": 0.35525321239606955,
"epsilon_dpo/beta": 0.04560955986380577,
"epsilon_dpo/beta_margin_grad_mean": -0.31382739543914795,
"epsilon_dpo/beta_margin_grad_std": 0.18148332834243774,
"epsilon_dpo/beta_margin_mean": 0.9531666040420532,
"epsilon_dpo/beta_margin_std": 0.9881489276885986,
"epsilon_dpo/loss_margin_mean": 21.01175308227539,
"grad_norm": 18.951379776000977,
"kl/avg_steps": 0.59375,
"kl/beta": 0.04587739706039429,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.08662192950594e-07,
"logits/chosen": -0.2972600758075714,
"logits/rejected": -0.3329446315765381,
"logps/chosen": -85.34906005859375,
"logps/ref_chosen": -64.53972625732422,
"logps/ref_rejected": -77.69151306152344,
"logps/rejected": -119.5125961303711,
"loss": 0.8368,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.950025200843811,
"rewards/margins": 0.9531666040420532,
"rewards/rejected": -1.9031918048858643,
"step": 235
},
{
"epoch": 0.35676492819349964,
"epsilon_dpo/beta": 0.04542586952447891,
"epsilon_dpo/beta_margin_grad_mean": -0.3612869083881378,
"epsilon_dpo/beta_margin_grad_std": 0.21417368948459625,
"epsilon_dpo/beta_margin_mean": 0.7317869663238525,
"epsilon_dpo/beta_margin_std": 1.1905665397644043,
"epsilon_dpo/loss_margin_mean": 16.277965545654297,
"grad_norm": 24.175756454467773,
"kl/avg_steps": 0.40625,
"kl/beta": 0.04560660570859909,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.076381667711306e-07,
"logits/chosen": -0.2971087098121643,
"logits/rejected": -0.24580159783363342,
"logps/chosen": -103.0522232055664,
"logps/ref_chosen": -71.15473937988281,
"logps/ref_rejected": -84.88542175292969,
"logps/rejected": -133.0608673095703,
"loss": 1.0627,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.45292329788208,
"rewards/margins": 0.7317869663238525,
"rewards/rejected": -2.1847102642059326,
"step": 236
},
{
"epoch": 0.35827664399092973,
"epsilon_dpo/beta": 0.0451710969209671,
"epsilon_dpo/beta_margin_grad_mean": -0.33493107557296753,
"epsilon_dpo/beta_margin_grad_std": 0.21735528111457825,
"epsilon_dpo/beta_margin_mean": 0.8445869088172913,
"epsilon_dpo/beta_margin_std": 1.2118748426437378,
"epsilon_dpo/loss_margin_mean": 18.857500076293945,
"grad_norm": 26.047622680664062,
"kl/avg_steps": 0.5625,
"kl/beta": 0.04542208090424538,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.066097311132753e-07,
"logits/chosen": -0.26094740629196167,
"logits/rejected": -0.25581249594688416,
"logps/chosen": -101.80400085449219,
"logps/ref_chosen": -76.14201354980469,
"logps/ref_rejected": -80.88479614257812,
"logps/rejected": -125.40428161621094,
"loss": 1.0,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1612461805343628,
"rewards/margins": 0.844586968421936,
"rewards/rejected": -2.005833148956299,
"step": 237
},
{
"epoch": 0.35978835978835977,
"epsilon_dpo/beta": 0.04497489705681801,
"epsilon_dpo/beta_margin_grad_mean": -0.349046915769577,
"epsilon_dpo/beta_margin_grad_std": 0.22108124196529388,
"epsilon_dpo/beta_margin_mean": 0.812792181968689,
"epsilon_dpo/beta_margin_std": 1.158914566040039,
"epsilon_dpo/loss_margin_mean": 18.24932098388672,
"grad_norm": 27.500316619873047,
"kl/avg_steps": 0.4375,
"kl/beta": 0.04516800865530968,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.0557691474458414e-07,
"logits/chosen": -0.1964595466852188,
"logits/rejected": -0.19806668162345886,
"logps/chosen": -93.53779602050781,
"logps/ref_chosen": -68.88484954833984,
"logps/ref_rejected": -75.8946304321289,
"logps/rejected": -118.79689025878906,
"loss": 1.0027,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1110514402389526,
"rewards/margins": 0.812792181968689,
"rewards/rejected": -1.9238436222076416,
"step": 238
},
{
"epoch": 0.36130007558578986,
"epsilon_dpo/beta": 0.044722769409418106,
"epsilon_dpo/beta_margin_grad_mean": -0.33949601650238037,
"epsilon_dpo/beta_margin_grad_std": 0.21600690484046936,
"epsilon_dpo/beta_margin_mean": 0.8546693921089172,
"epsilon_dpo/beta_margin_std": 1.1973682641983032,
"epsilon_dpo/loss_margin_mean": 19.267606735229492,
"grad_norm": 22.895824432373047,
"kl/avg_steps": 0.5625,
"kl/beta": 0.04497126117348671,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.045397465551513e-07,
"logits/chosen": -0.13500560820102692,
"logits/rejected": -0.34600499272346497,
"logps/chosen": -87.53263854980469,
"logps/ref_chosen": -56.771827697753906,
"logps/ref_rejected": -116.23049926757812,
"logps/rejected": -166.2589111328125,
"loss": 0.9834,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3770864009857178,
"rewards/margins": 0.8546693325042725,
"rewards/rejected": -2.2317557334899902,
"step": 239
},
{
"epoch": 0.36281179138321995,
"epsilon_dpo/beta": 0.04448658600449562,
"epsilon_dpo/beta_margin_grad_mean": -0.3048049211502075,
"epsilon_dpo/beta_margin_grad_std": 0.21014252305030823,
"epsilon_dpo/beta_margin_mean": 1.0467398166656494,
"epsilon_dpo/beta_margin_std": 1.1823891401290894,
"epsilon_dpo/loss_margin_mean": 23.692577362060547,
"grad_norm": 17.359264373779297,
"kl/avg_steps": 0.53125,
"kl/beta": 0.04471971094608307,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.0349825555680045e-07,
"logits/chosen": -0.16031520068645477,
"logits/rejected": -0.3085278272628784,
"logps/chosen": -82.06173706054688,
"logps/ref_chosen": -53.35411071777344,
"logps/ref_rejected": -80.12019348144531,
"logps/rejected": -132.5203857421875,
"loss": 0.8605,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.279559850692749,
"rewards/margins": 1.0467398166656494,
"rewards/rejected": -2.3262996673583984,
"step": 240
},
{
"epoch": 0.36432350718065004,
"epsilon_dpo/beta": 0.04433491453528404,
"epsilon_dpo/beta_margin_grad_mean": -0.3852992355823517,
"epsilon_dpo/beta_margin_grad_std": 0.21619254350662231,
"epsilon_dpo/beta_margin_mean": 0.6136118769645691,
"epsilon_dpo/beta_margin_std": 1.1264688968658447,
"epsilon_dpo/loss_margin_mean": 14.015726089477539,
"grad_norm": 24.875,
"kl/avg_steps": 0.34375,
"kl/beta": 0.04448339343070984,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 4.0245247088227377e-07,
"logits/chosen": -0.2391211986541748,
"logits/rejected": -0.3339824080467224,
"logps/chosen": -100.33395385742188,
"logps/ref_chosen": -71.89541625976562,
"logps/ref_rejected": -83.03492736816406,
"logps/rejected": -125.48918151855469,
"loss": 1.126,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2645165920257568,
"rewards/margins": 0.6136118173599243,
"rewards/rejected": -1.8781282901763916,
"step": 241
},
{
"epoch": 0.36583522297808013,
"epsilon_dpo/beta": 0.04407219961285591,
"epsilon_dpo/beta_margin_grad_mean": -0.3359421193599701,
"epsilon_dpo/beta_margin_grad_std": 0.21481238305568695,
"epsilon_dpo/beta_margin_mean": 0.9087969064712524,
"epsilon_dpo/beta_margin_std": 1.2901452779769897,
"epsilon_dpo/loss_margin_mean": 20.765674591064453,
"grad_norm": 20.7161808013916,
"kl/avg_steps": 0.59375,
"kl/beta": 0.044331006705760956,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.0140242178441665e-07,
"logits/chosen": -0.12015914916992188,
"logits/rejected": -0.18805427849292755,
"logps/chosen": -87.11502075195312,
"logps/ref_chosen": -57.927433013916016,
"logps/ref_rejected": -67.83861541748047,
"logps/rejected": -117.79188537597656,
"loss": 0.9734,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.288856029510498,
"rewards/margins": 0.9087969064712524,
"rewards/rejected": -2.197652816772461,
"step": 242
},
{
"epoch": 0.3673469387755102,
"epsilon_dpo/beta": 0.04387397691607475,
"epsilon_dpo/beta_margin_grad_mean": -0.34642571210861206,
"epsilon_dpo/beta_margin_grad_std": 0.21472449600696564,
"epsilon_dpo/beta_margin_mean": 0.8148993253707886,
"epsilon_dpo/beta_margin_std": 1.1465483903884888,
"epsilon_dpo/loss_margin_mean": 18.741283416748047,
"grad_norm": 22.113584518432617,
"kl/avg_steps": 0.453125,
"kl/beta": 0.04406934604048729,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 4.003481376353596e-07,
"logits/chosen": -0.34190136194229126,
"logits/rejected": -0.2832658886909485,
"logps/chosen": -102.57917785644531,
"logps/ref_chosen": -74.27667236328125,
"logps/ref_rejected": -73.24340057373047,
"logps/rejected": -120.28718566894531,
"loss": 0.9932,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2449101209640503,
"rewards/margins": 0.8148993253707886,
"rewards/rejected": -2.059809446334839,
"step": 243
},
{
"epoch": 0.3688586545729403,
"epsilon_dpo/beta": 0.043559592217206955,
"epsilon_dpo/beta_margin_grad_mean": -0.2976473569869995,
"epsilon_dpo/beta_margin_grad_std": 0.1875329166650772,
"epsilon_dpo/beta_margin_mean": 1.0726085901260376,
"epsilon_dpo/beta_margin_std": 1.1004550457000732,
"epsilon_dpo/loss_margin_mean": 24.721460342407227,
"grad_norm": 18.40015983581543,
"kl/avg_steps": 0.71875,
"kl/beta": 0.043870557099580765,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 3.9928964792569654e-07,
"logits/chosen": -0.10484915226697922,
"logits/rejected": -0.3584839403629303,
"logps/chosen": -82.0526123046875,
"logps/ref_chosen": -53.36390686035156,
"logps/ref_rejected": -71.10276794433594,
"logps/rejected": -124.512939453125,
"loss": 0.8047,
"rewards/accuracies": 0.890625,
"rewards/chosen": -1.2519149780273438,
"rewards/margins": 1.0726085901260376,
"rewards/rejected": -2.324523448944092,
"step": 244
},
{
"epoch": 0.37037037037037035,
"epsilon_dpo/beta": 0.04320790246129036,
"epsilon_dpo/beta_margin_grad_mean": -0.27778851985931396,
"epsilon_dpo/beta_margin_grad_std": 0.19265861809253693,
"epsilon_dpo/beta_margin_mean": 1.1953248977661133,
"epsilon_dpo/beta_margin_std": 1.1307498216629028,
"epsilon_dpo/loss_margin_mean": 27.74947166442871,
"grad_norm": 41.73692321777344,
"kl/avg_steps": 0.8125,
"kl/beta": 0.043557487428188324,
"kl/n_epsilon_steps": 0.09375,
"kl/p_epsilon_steps": 0.90625,
"learning_rate": 3.982269822636601e-07,
"logits/chosen": -0.258206307888031,
"logits/rejected": -0.30017420649528503,
"logps/chosen": -101.29631042480469,
"logps/ref_chosen": -71.19510650634766,
"logps/ref_rejected": -80.76235961914062,
"logps/rejected": -138.613037109375,
"loss": 0.7503,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.3017505407333374,
"rewards/margins": 1.1953248977661133,
"rewards/rejected": -2.4970755577087402,
"step": 245
},
{
"epoch": 0.37188208616780044,
"epsilon_dpo/beta": 0.042994700372219086,
"epsilon_dpo/beta_margin_grad_mean": -0.33950769901275635,
"epsilon_dpo/beta_margin_grad_std": 0.24362608790397644,
"epsilon_dpo/beta_margin_mean": 0.9245045781135559,
"epsilon_dpo/beta_margin_std": 1.4429113864898682,
"epsilon_dpo/loss_margin_mean": 21.705978393554688,
"grad_norm": 26.83746337890625,
"kl/avg_steps": 0.5,
"kl/beta": 0.043206434696912766,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.971601703742932e-07,
"logits/chosen": -0.29668131470680237,
"logits/rejected": -0.35561996698379517,
"logps/chosen": -109.19932556152344,
"logps/ref_chosen": -71.62104797363281,
"logps/ref_rejected": -94.03392028808594,
"logps/rejected": -153.31817626953125,
"loss": 1.0493,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.6204333305358887,
"rewards/margins": 0.9245046377182007,
"rewards/rejected": -2.5449378490448,
"step": 246
},
{
"epoch": 0.37339380196523053,
"epsilon_dpo/beta": 0.04280766844749451,
"epsilon_dpo/beta_margin_grad_mean": -0.38589486479759216,
"epsilon_dpo/beta_margin_grad_std": 0.22010691463947296,
"epsilon_dpo/beta_margin_mean": 0.570848822593689,
"epsilon_dpo/beta_margin_std": 1.1592731475830078,
"epsilon_dpo/loss_margin_mean": 13.519911766052246,
"grad_norm": 25.601972579956055,
"kl/avg_steps": 0.4375,
"kl/beta": 0.042991477996110916,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.960892420986177e-07,
"logits/chosen": -0.33667704463005066,
"logits/rejected": -0.3145361542701721,
"logps/chosen": -119.41648864746094,
"logps/ref_chosen": -80.02254486083984,
"logps/ref_rejected": -89.22705078125,
"logps/rejected": -142.1409149169922,
"loss": 1.1719,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.6904618740081787,
"rewards/margins": 0.570848822593689,
"rewards/rejected": -2.261310577392578,
"step": 247
},
{
"epoch": 0.3749055177626606,
"epsilon_dpo/beta": 0.042621202766895294,
"epsilon_dpo/beta_margin_grad_mean": -0.3455793857574463,
"epsilon_dpo/beta_margin_grad_std": 0.24640627205371857,
"epsilon_dpo/beta_margin_mean": 0.9159837365150452,
"epsilon_dpo/beta_margin_std": 1.4213393926620483,
"epsilon_dpo/loss_margin_mean": 21.710309982299805,
"grad_norm": 28.24690818786621,
"kl/avg_steps": 0.4375,
"kl/beta": 0.042804207652807236,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.9501422739279953e-07,
"logits/chosen": -0.22308963537216187,
"logits/rejected": -0.1411212533712387,
"logps/chosen": -100.60003662109375,
"logps/ref_chosen": -65.37796020507812,
"logps/ref_rejected": -61.36579132080078,
"logps/rejected": -118.29817199707031,
"loss": 1.0472,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.5076146125793457,
"rewards/margins": 0.9159836769104004,
"rewards/rejected": -2.423598289489746,
"step": 248
},
{
"epoch": 0.3764172335600907,
"epsilon_dpo/beta": 0.04247550666332245,
"epsilon_dpo/beta_margin_grad_mean": -0.42465880513191223,
"epsilon_dpo/beta_margin_grad_std": 0.24674171209335327,
"epsilon_dpo/beta_margin_mean": 0.37750500440597534,
"epsilon_dpo/beta_margin_std": 1.2677173614501953,
"epsilon_dpo/loss_margin_mean": 9.114813804626465,
"grad_norm": 35.48320770263672,
"kl/avg_steps": 0.34375,
"kl/beta": 0.04261775687336922,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 3.9393515632731094e-07,
"logits/chosen": -0.2901439666748047,
"logits/rejected": -0.18192759156227112,
"logps/chosen": -117.25718688964844,
"logps/ref_chosen": -74.60145568847656,
"logps/ref_rejected": -63.79338455200195,
"logps/rejected": -115.56392669677734,
"loss": 1.3898,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.817288875579834,
"rewards/margins": 0.37750497460365295,
"rewards/rejected": -2.194793701171875,
"step": 249
},
{
"epoch": 0.3779289493575208,
"epsilon_dpo/beta": 0.04227690026164055,
"epsilon_dpo/beta_margin_grad_mean": -0.32008102536201477,
"epsilon_dpo/beta_margin_grad_std": 0.21534791588783264,
"epsilon_dpo/beta_margin_mean": 0.9775921702384949,
"epsilon_dpo/beta_margin_std": 1.295398235321045,
"epsilon_dpo/loss_margin_mean": 23.30982780456543,
"grad_norm": 22.5899600982666,
"kl/avg_steps": 0.46875,
"kl/beta": 0.042471759021282196,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.9285205908608934e-07,
"logits/chosen": -0.1831701546907425,
"logits/rejected": -0.32014960050582886,
"logps/chosen": -99.19203186035156,
"logps/ref_chosen": -61.93821334838867,
"logps/ref_rejected": -72.21602630615234,
"logps/rejected": -132.77967834472656,
"loss": 0.9399,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.5776634216308594,
"rewards/margins": 0.9775921702384949,
"rewards/rejected": -2.55525541305542,
"step": 250
},
{
"epoch": 0.3794406651549509,
"epsilon_dpo/beta": 0.04209286347031593,
"epsilon_dpo/beta_margin_grad_mean": -0.38185030221939087,
"epsilon_dpo/beta_margin_grad_std": 0.22797048091888428,
"epsilon_dpo/beta_margin_mean": 0.6124684810638428,
"epsilon_dpo/beta_margin_std": 1.2211238145828247,
"epsilon_dpo/loss_margin_mean": 14.747017860412598,
"grad_norm": 28.806964874267578,
"kl/avg_steps": 0.4375,
"kl/beta": 0.04227360337972641,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.9176496596569265e-07,
"logits/chosen": -0.1576414406299591,
"logits/rejected": -0.2465716153383255,
"logps/chosen": -106.44970703125,
"logps/ref_chosen": -66.85694122314453,
"logps/ref_rejected": -84.83396911621094,
"logps/rejected": -139.17376708984375,
"loss": 1.1694,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.669304609298706,
"rewards/margins": 0.6124684810638428,
"rewards/rejected": -2.281773090362549,
"step": 251
},
{
"epoch": 0.38095238095238093,
"epsilon_dpo/beta": 0.041909512132406235,
"epsilon_dpo/beta_margin_grad_mean": -0.3917834460735321,
"epsilon_dpo/beta_margin_grad_std": 0.23721851408481598,
"epsilon_dpo/beta_margin_mean": 0.5502148866653442,
"epsilon_dpo/beta_margin_std": 1.3598700761795044,
"epsilon_dpo/loss_margin_mean": 13.339332580566406,
"grad_norm": 32.72731399536133,
"kl/avg_steps": 0.4375,
"kl/beta": 0.04208946228027344,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.9067390737445254e-07,
"logits/chosen": -0.20090891420841217,
"logits/rejected": -0.2901458442211151,
"logps/chosen": -94.52253723144531,
"logps/ref_chosen": -56.22393035888672,
"logps/ref_rejected": -77.1136245727539,
"logps/rejected": -128.75157165527344,
"loss": 1.2776,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.6071038246154785,
"rewards/margins": 0.5502148866653442,
"rewards/rejected": -2.157318592071533,
"step": 252
},
{
"epoch": 0.382464096749811,
"epsilon_dpo/beta": 0.0417400524020195,
"epsilon_dpo/beta_margin_grad_mean": -0.3715527057647705,
"epsilon_dpo/beta_margin_grad_std": 0.21816174685955048,
"epsilon_dpo/beta_margin_mean": 0.6735394597053528,
"epsilon_dpo/beta_margin_std": 1.1490751504898071,
"epsilon_dpo/loss_margin_mean": 16.325931549072266,
"grad_norm": 21.263883590698242,
"kl/avg_steps": 0.40625,
"kl/beta": 0.041906122118234634,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.8957891383162304e-07,
"logits/chosen": -0.061061084270477295,
"logits/rejected": -0.17727521061897278,
"logps/chosen": -89.08055114746094,
"logps/ref_chosen": -52.21001434326172,
"logps/ref_rejected": -58.75764465332031,
"logps/rejected": -111.95411682128906,
"loss": 1.0921,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.541445255279541,
"rewards/margins": 0.6735395193099976,
"rewards/rejected": -2.214984893798828,
"step": 253
},
{
"epoch": 0.3839758125472411,
"epsilon_dpo/beta": 0.041584212332963943,
"epsilon_dpo/beta_margin_grad_mean": -0.369428426027298,
"epsilon_dpo/beta_margin_grad_std": 0.21766522526741028,
"epsilon_dpo/beta_margin_mean": 0.7099707126617432,
"epsilon_dpo/beta_margin_std": 1.1634279489517212,
"epsilon_dpo/loss_margin_mean": 17.256244659423828,
"grad_norm": 22.009788513183594,
"kl/avg_steps": 0.375,
"kl/beta": 0.04173656553030014,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.884800159665276e-07,
"logits/chosen": -0.13468728959560394,
"logits/rejected": -0.261949747800827,
"logps/chosen": -104.312255859375,
"logps/ref_chosen": -65.63632202148438,
"logps/ref_rejected": -82.34425354003906,
"logps/rejected": -138.27642822265625,
"loss": 1.0701,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.613656759262085,
"rewards/margins": 0.7099707126617432,
"rewards/rejected": -2.323627471923828,
"step": 254
},
{
"epoch": 0.3854875283446712,
"epsilon_dpo/beta": 0.041389867663383484,
"epsilon_dpo/beta_margin_grad_mean": -0.33623260259628296,
"epsilon_dpo/beta_margin_grad_std": 0.22076046466827393,
"epsilon_dpo/beta_margin_mean": 0.9280020594596863,
"epsilon_dpo/beta_margin_std": 1.284578800201416,
"epsilon_dpo/loss_margin_mean": 22.61232566833496,
"grad_norm": 22.385225296020508,
"kl/avg_steps": 0.46875,
"kl/beta": 0.04158063977956772,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.873772445177015e-07,
"logits/chosen": -0.29942619800567627,
"logits/rejected": -0.2981582581996918,
"logps/chosen": -102.49989318847656,
"logps/ref_chosen": -67.91109466552734,
"logps/ref_rejected": -83.89114379882812,
"logps/rejected": -141.09228515625,
"loss": 0.9675,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4344618320465088,
"rewards/margins": 0.928002119064331,
"rewards/rejected": -2.3624637126922607,
"step": 255
},
{
"epoch": 0.3869992441421013,
"epsilon_dpo/beta": 0.04126143082976341,
"epsilon_dpo/beta_margin_grad_mean": -0.36181482672691345,
"epsilon_dpo/beta_margin_grad_std": 0.23323839902877808,
"epsilon_dpo/beta_margin_mean": 0.7811844944953918,
"epsilon_dpo/beta_margin_std": 1.3121854066848755,
"epsilon_dpo/loss_margin_mean": 19.161640167236328,
"grad_norm": 24.99724578857422,
"kl/avg_steps": 0.3125,
"kl/beta": 0.04138663783669472,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 3.862706303320329e-07,
"logits/chosen": -0.22274255752563477,
"logits/rejected": -0.26259535551071167,
"logps/chosen": -104.70640563964844,
"logps/ref_chosen": -63.49998474121094,
"logps/ref_rejected": -90.77104187011719,
"logps/rejected": -151.13909912109375,
"loss": 1.0839,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.7056180238723755,
"rewards/margins": 0.7811845541000366,
"rewards/rejected": -2.486802577972412,
"step": 256
},
{
"epoch": 0.3885109599395314,
"epsilon_dpo/beta": 0.04104262962937355,
"epsilon_dpo/beta_margin_grad_mean": -0.3379192054271698,
"epsilon_dpo/beta_margin_grad_std": 0.21875940263271332,
"epsilon_dpo/beta_margin_mean": 0.9559639692306519,
"epsilon_dpo/beta_margin_std": 1.3791394233703613,
"epsilon_dpo/loss_margin_mean": 23.468908309936523,
"grad_norm": 22.31130027770996,
"kl/avg_steps": 0.53125,
"kl/beta": 0.04125770926475525,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.851602043638994e-07,
"logits/chosen": -0.3219867944717407,
"logits/rejected": -0.3862881660461426,
"logps/chosen": -112.60818481445312,
"logps/ref_chosen": -70.60064697265625,
"logps/ref_rejected": -108.5831298828125,
"logps/rejected": -174.0595703125,
"loss": 0.9738,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.7274171113967896,
"rewards/margins": 0.9559639692306519,
"rewards/rejected": -2.6833810806274414,
"step": 257
},
{
"epoch": 0.3900226757369615,
"epsilon_dpo/beta": 0.04072948545217514,
"epsilon_dpo/beta_margin_grad_mean": -0.34492138028144836,
"epsilon_dpo/beta_margin_grad_std": 0.17867842316627502,
"epsilon_dpo/beta_margin_mean": 0.75054931640625,
"epsilon_dpo/beta_margin_std": 0.9416071176528931,
"epsilon_dpo/loss_margin_mean": 18.523508071899414,
"grad_norm": 22.837146759033203,
"kl/avg_steps": 0.765625,
"kl/beta": 0.041039686650037766,
"kl/n_epsilon_steps": 0.109375,
"kl/p_epsilon_steps": 0.875,
"learning_rate": 3.840459976743023e-07,
"logits/chosen": -0.19208520650863647,
"logits/rejected": -0.2627413272857666,
"logps/chosen": -100.40775299072266,
"logps/ref_chosen": -59.25416564941406,
"logps/ref_rejected": -85.58709716796875,
"logps/rejected": -145.26419067382812,
"loss": 0.9519,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.6784533262252808,
"rewards/margins": 0.75054931640625,
"rewards/rejected": -2.4290027618408203,
"step": 258
},
{
"epoch": 0.3915343915343915,
"epsilon_dpo/beta": 0.04047735780477524,
"epsilon_dpo/beta_margin_grad_mean": -0.29619985818862915,
"epsilon_dpo/beta_margin_grad_std": 0.22371140122413635,
"epsilon_dpo/beta_margin_mean": 1.1150555610656738,
"epsilon_dpo/beta_margin_std": 1.2491729259490967,
"epsilon_dpo/loss_margin_mean": 27.723583221435547,
"grad_norm": 20.26732063293457,
"kl/avg_steps": 0.625,
"kl/beta": 0.04072786122560501,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.8292804142999796e-07,
"logits/chosen": -0.10857418924570084,
"logits/rejected": -0.3421369194984436,
"logps/chosen": -99.74125671386719,
"logps/ref_chosen": -65.43487548828125,
"logps/ref_rejected": -95.41731262207031,
"logps/rejected": -157.447265625,
"loss": 0.8539,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3928723335266113,
"rewards/margins": 1.1150554418563843,
"rewards/rejected": -2.507927894592285,
"step": 259
},
{
"epoch": 0.3930461073318216,
"epsilon_dpo/beta": 0.04027654975652695,
"epsilon_dpo/beta_margin_grad_mean": -0.3476658761501312,
"epsilon_dpo/beta_margin_grad_std": 0.23531104624271393,
"epsilon_dpo/beta_margin_mean": 0.8362375497817993,
"epsilon_dpo/beta_margin_std": 1.260378360748291,
"epsilon_dpo/loss_margin_mean": 20.95905113220215,
"grad_norm": 23.808887481689453,
"kl/avg_steps": 0.5,
"kl/beta": 0.040474895387887955,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.818063669026256e-07,
"logits/chosen": -0.16017837822437286,
"logits/rejected": -0.21371683478355408,
"logps/chosen": -84.45217895507812,
"logps/ref_chosen": -49.08958435058594,
"logps/ref_rejected": -79.01708221435547,
"logps/rejected": -135.33872985839844,
"loss": 1.034,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4267231225967407,
"rewards/margins": 0.8362375497817993,
"rewards/rejected": -2.26296067237854,
"step": 260
},
{
"epoch": 0.3945578231292517,
"epsilon_dpo/beta": 0.04012651368975639,
"epsilon_dpo/beta_margin_grad_mean": -0.3732527494430542,
"epsilon_dpo/beta_margin_grad_std": 0.21205906569957733,
"epsilon_dpo/beta_margin_mean": 0.6821871995925903,
"epsilon_dpo/beta_margin_std": 1.1377960443496704,
"epsilon_dpo/loss_margin_mean": 17.190061569213867,
"grad_norm": 26.759939193725586,
"kl/avg_steps": 0.375,
"kl/beta": 0.040273528546094894,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.806810054678331e-07,
"logits/chosen": -0.2861855626106262,
"logits/rejected": -0.20148369669914246,
"logps/chosen": -107.46412658691406,
"logps/ref_chosen": -70.87239074707031,
"logps/ref_rejected": -65.01522064208984,
"logps/rejected": -118.7970199584961,
"loss": 1.0758,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.4706168174743652,
"rewards/margins": 0.6821871995925903,
"rewards/rejected": -2.152804136276245,
"step": 261
},
{
"epoch": 0.3960695389266818,
"epsilon_dpo/beta": 0.03988882154226303,
"epsilon_dpo/beta_margin_grad_mean": -0.3503076434135437,
"epsilon_dpo/beta_margin_grad_std": 0.20528076589107513,
"epsilon_dpo/beta_margin_mean": 0.7828584313392639,
"epsilon_dpo/beta_margin_std": 1.07047438621521,
"epsilon_dpo/loss_margin_mean": 19.77398109436035,
"grad_norm": 21.7348690032959,
"kl/avg_steps": 0.59375,
"kl/beta": 0.04012306407094002,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 3.7955198860439887e-07,
"logits/chosen": -0.25513583421707153,
"logits/rejected": -0.3391590118408203,
"logps/chosen": -105.1324462890625,
"logps/ref_chosen": -67.87063598632812,
"logps/ref_rejected": -88.7205810546875,
"logps/rejected": -145.75637817382812,
"loss": 0.9828,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4890307188034058,
"rewards/margins": 0.7828584313392639,
"rewards/rejected": -2.2718892097473145,
"step": 262
},
{
"epoch": 0.3975812547241119,
"epsilon_dpo/beta": 0.039715707302093506,
"epsilon_dpo/beta_margin_grad_mean": -0.3735382854938507,
"epsilon_dpo/beta_margin_grad_std": 0.2114417850971222,
"epsilon_dpo/beta_margin_mean": 0.639028012752533,
"epsilon_dpo/beta_margin_std": 1.0829418897628784,
"epsilon_dpo/loss_margin_mean": 16.26985740661621,
"grad_norm": 19.9456729888916,
"kl/avg_steps": 0.4375,
"kl/beta": 0.0398862399160862,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.784193478933516e-07,
"logits/chosen": -0.061840981245040894,
"logits/rejected": -0.3313126564025879,
"logps/chosen": -91.1656265258789,
"logps/ref_chosen": -55.194580078125,
"logps/ref_rejected": -80.54048156738281,
"logps/rejected": -132.78138732910156,
"loss": 1.0928,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.432045817375183,
"rewards/margins": 0.6390280723571777,
"rewards/rejected": -2.0710740089416504,
"step": 263
},
{
"epoch": 0.39909297052154197,
"epsilon_dpo/beta": 0.03951788693666458,
"epsilon_dpo/beta_margin_grad_mean": -0.35467609763145447,
"epsilon_dpo/beta_margin_grad_std": 0.22027695178985596,
"epsilon_dpo/beta_margin_mean": 0.7902923226356506,
"epsilon_dpo/beta_margin_std": 1.210559368133545,
"epsilon_dpo/loss_margin_mean": 20.185094833374023,
"grad_norm": 23.44506072998047,
"kl/avg_steps": 0.5,
"kl/beta": 0.03971249982714653,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.7728311501708674e-07,
"logits/chosen": -0.31946539878845215,
"logits/rejected": -0.3810897171497345,
"logps/chosen": -120.71807861328125,
"logps/ref_chosen": -83.17068481445312,
"logps/ref_rejected": -88.33625793457031,
"logps/rejected": -146.06875610351562,
"loss": 1.0311,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4880857467651367,
"rewards/margins": 0.7902923822402954,
"rewards/rejected": -2.2783780097961426,
"step": 264
},
{
"epoch": 0.40060468631897206,
"epsilon_dpo/beta": 0.039308931678533554,
"epsilon_dpo/beta_margin_grad_mean": -0.3492180109024048,
"epsilon_dpo/beta_margin_grad_std": 0.24312558770179749,
"epsilon_dpo/beta_margin_mean": 0.8246368169784546,
"epsilon_dpo/beta_margin_std": 1.3355547189712524,
"epsilon_dpo/loss_margin_mean": 21.1899356842041,
"grad_norm": 22.772716522216797,
"kl/avg_steps": 0.53125,
"kl/beta": 0.039514925330877304,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.7614332175848027e-07,
"logits/chosen": -0.09426143765449524,
"logits/rejected": -0.2781641185283661,
"logps/chosen": -86.9837417602539,
"logps/ref_chosen": -51.66284942626953,
"logps/ref_rejected": -67.1720962524414,
"logps/rejected": -123.68292236328125,
"loss": 1.0789,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.3934438228607178,
"rewards/margins": 0.8246368169784546,
"rewards/rejected": -2.218080520629883,
"step": 265
},
{
"epoch": 0.4021164021164021,
"epsilon_dpo/beta": 0.03908892348408699,
"epsilon_dpo/beta_margin_grad_mean": -0.35088396072387695,
"epsilon_dpo/beta_margin_grad_std": 0.20357009768486023,
"epsilon_dpo/beta_margin_mean": 0.7827740907669067,
"epsilon_dpo/beta_margin_std": 1.1340844631195068,
"epsilon_dpo/loss_margin_mean": 20.18011474609375,
"grad_norm": 20.76192855834961,
"kl/avg_steps": 0.5625,
"kl/beta": 0.03930611163377762,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.75e-07,
"logits/chosen": -0.12414835393428802,
"logits/rejected": -0.3246955871582031,
"logps/chosen": -91.22309875488281,
"logps/ref_chosen": -57.45049285888672,
"logps/ref_rejected": -77.60826110839844,
"logps/rejected": -131.56097412109375,
"loss": 1.0013,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.323702335357666,
"rewards/margins": 0.7827740907669067,
"rewards/rejected": -2.106476306915283,
"step": 266
},
{
"epoch": 0.4036281179138322,
"epsilon_dpo/beta": 0.03893135488033295,
"epsilon_dpo/beta_margin_grad_mean": -0.3826831579208374,
"epsilon_dpo/beta_margin_grad_std": 0.2206048220396042,
"epsilon_dpo/beta_margin_mean": 0.6011332273483276,
"epsilon_dpo/beta_margin_std": 1.1812732219696045,
"epsilon_dpo/loss_margin_mean": 15.641103744506836,
"grad_norm": 20.94510841369629,
"kl/avg_steps": 0.40625,
"kl/beta": 0.039086248725652695,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.738531817228131e-07,
"logits/chosen": -0.1702580600976944,
"logits/rejected": -0.21223387122154236,
"logps/chosen": -84.27885437011719,
"logps/ref_chosen": -55.03534698486328,
"logps/ref_rejected": -66.0953369140625,
"logps/rejected": -110.97994995117188,
"loss": 1.1599,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1412169933319092,
"rewards/margins": 0.6011332273483276,
"rewards/rejected": -1.7423501014709473,
"step": 267
},
{
"epoch": 0.4051398337112623,
"epsilon_dpo/beta": 0.03873734176158905,
"epsilon_dpo/beta_margin_grad_mean": -0.3630383014678955,
"epsilon_dpo/beta_margin_grad_std": 0.1842850148677826,
"epsilon_dpo/beta_margin_mean": 0.7043724060058594,
"epsilon_dpo/beta_margin_std": 0.9809404611587524,
"epsilon_dpo/loss_margin_mean": 18.31964874267578,
"grad_norm": 16.800960540771484,
"kl/avg_steps": 0.5,
"kl/beta": 0.03892810642719269,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.7270289900589204e-07,
"logits/chosen": -0.24320363998413086,
"logits/rejected": -0.25359469652175903,
"logps/chosen": -94.5784912109375,
"logps/ref_chosen": -65.07174682617188,
"logps/ref_rejected": -71.42486572265625,
"logps/rejected": -119.25125122070312,
"loss": 0.9947,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1449675559997559,
"rewards/margins": 0.7043724060058594,
"rewards/rejected": -1.8493399620056152,
"step": 268
},
{
"epoch": 0.40665154950869237,
"epsilon_dpo/beta": 0.038508299738168716,
"epsilon_dpo/beta_margin_grad_mean": -0.34540772438049316,
"epsilon_dpo/beta_margin_grad_std": 0.1915276050567627,
"epsilon_dpo/beta_margin_mean": 0.7910067439079285,
"epsilon_dpo/beta_margin_std": 1.0360459089279175,
"epsilon_dpo/loss_margin_mean": 20.67135238647461,
"grad_norm": 16.313074111938477,
"kl/avg_steps": 0.59375,
"kl/beta": 0.03873443230986595,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 3.7154918402511714e-07,
"logits/chosen": -0.16979868710041046,
"logits/rejected": -0.2855718731880188,
"logps/chosen": -99.54581451416016,
"logps/ref_chosen": -67.1362075805664,
"logps/ref_rejected": -82.55778503417969,
"logps/rejected": -135.63873291015625,
"loss": 0.9594,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.250057578086853,
"rewards/margins": 0.7910068035125732,
"rewards/rejected": -2.0410642623901367,
"step": 269
},
{
"epoch": 0.40816326530612246,
"epsilon_dpo/beta": 0.03826896846294403,
"epsilon_dpo/beta_margin_grad_mean": -0.3660072386264801,
"epsilon_dpo/beta_margin_grad_std": 0.1895051896572113,
"epsilon_dpo/beta_margin_mean": 0.6512601971626282,
"epsilon_dpo/beta_margin_std": 1.0007617473602295,
"epsilon_dpo/loss_margin_mean": 17.152385711669922,
"grad_norm": 20.79635238647461,
"kl/avg_steps": 0.625,
"kl/beta": 0.03850580379366875,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.7039206905237656e-07,
"logits/chosen": -0.23422327637672424,
"logits/rejected": -0.32264748215675354,
"logps/chosen": -98.90885925292969,
"logps/ref_chosen": -66.6886978149414,
"logps/ref_rejected": -85.16129302978516,
"logps/rejected": -134.53384399414062,
"loss": 1.0446,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2359251976013184,
"rewards/margins": 0.651260256767273,
"rewards/rejected": -1.8871853351593018,
"step": 270
},
{
"epoch": 0.40967498110355255,
"epsilon_dpo/beta": 0.03812694922089577,
"epsilon_dpo/beta_margin_grad_mean": -0.40505045652389526,
"epsilon_dpo/beta_margin_grad_std": 0.22268527746200562,
"epsilon_dpo/beta_margin_mean": 0.5694870948791504,
"epsilon_dpo/beta_margin_std": 1.2615535259246826,
"epsilon_dpo/loss_margin_mean": 15.136260032653809,
"grad_norm": 22.018741607666016,
"kl/avg_steps": 0.375,
"kl/beta": 0.038266636431217194,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.692315864546635e-07,
"logits/chosen": -0.20422720909118652,
"logits/rejected": -0.347294420003891,
"logps/chosen": -102.64226531982422,
"logps/ref_chosen": -72.40754699707031,
"logps/ref_rejected": -92.0631103515625,
"logps/rejected": -137.43408203125,
"loss": 1.2052,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.1559407711029053,
"rewards/margins": 0.5694870948791504,
"rewards/rejected": -1.7254277467727661,
"step": 271
},
{
"epoch": 0.41118669690098264,
"epsilon_dpo/beta": 0.03785344585776329,
"epsilon_dpo/beta_margin_grad_mean": -0.31559768319129944,
"epsilon_dpo/beta_margin_grad_std": 0.17588132619857788,
"epsilon_dpo/beta_margin_mean": 0.9508032202720642,
"epsilon_dpo/beta_margin_std": 0.9953068494796753,
"epsilon_dpo/loss_margin_mean": 25.219436645507812,
"grad_norm": 19.028003692626953,
"kl/avg_steps": 0.71875,
"kl/beta": 0.03812367469072342,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 3.6806776869317067e-07,
"logits/chosen": -0.26148316264152527,
"logits/rejected": -0.18833643198013306,
"logps/chosen": -92.54074096679688,
"logps/ref_chosen": -66.60140228271484,
"logps/ref_rejected": -67.74339294433594,
"logps/rejected": -118.90216827392578,
"loss": 0.8348,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.9836262464523315,
"rewards/margins": 0.9508031606674194,
"rewards/rejected": -1.934429407119751,
"step": 272
},
{
"epoch": 0.4126984126984127,
"epsilon_dpo/beta": 0.03768978640437126,
"epsilon_dpo/beta_margin_grad_mean": -0.3645114600658417,
"epsilon_dpo/beta_margin_grad_std": 0.2196216732263565,
"epsilon_dpo/beta_margin_mean": 0.720595121383667,
"epsilon_dpo/beta_margin_std": 1.2089626789093018,
"epsilon_dpo/loss_margin_mean": 19.329578399658203,
"grad_norm": 21.885257720947266,
"kl/avg_steps": 0.4375,
"kl/beta": 0.037851616740226746,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.669006483223828e-07,
"logits/chosen": -0.11959455162286758,
"logits/rejected": -0.29678627848625183,
"logps/chosen": -91.24102783203125,
"logps/ref_chosen": -57.35487365722656,
"logps/ref_rejected": -84.17168426513672,
"logps/rejected": -137.38742065429688,
"loss": 1.0756,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2816635370254517,
"rewards/margins": 0.7205950617790222,
"rewards/rejected": -2.002258539199829,
"step": 273
},
{
"epoch": 0.41421012849584277,
"epsilon_dpo/beta": 0.03750205039978027,
"epsilon_dpo/beta_margin_grad_mean": -0.35682639479637146,
"epsilon_dpo/beta_margin_grad_std": 0.20721961557865143,
"epsilon_dpo/beta_margin_mean": 0.7384819984436035,
"epsilon_dpo/beta_margin_std": 1.095733404159546,
"epsilon_dpo/loss_margin_mean": 19.86833381652832,
"grad_norm": 17.513832092285156,
"kl/avg_steps": 0.5,
"kl/beta": 0.03768673539161682,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.657302579891656e-07,
"logits/chosen": -0.1848251223564148,
"logits/rejected": -0.24779653549194336,
"logps/chosen": -91.3389892578125,
"logps/ref_chosen": -59.64149475097656,
"logps/ref_rejected": -68.29348754882812,
"logps/rejected": -119.85931396484375,
"loss": 1.0236,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1923532485961914,
"rewards/margins": 0.7384819984436035,
"rewards/rejected": -1.930835247039795,
"step": 274
},
{
"epoch": 0.41572184429327286,
"epsilon_dpo/beta": 0.037303756922483444,
"epsilon_dpo/beta_margin_grad_mean": -0.34078559279441833,
"epsilon_dpo/beta_margin_grad_std": 0.1952379047870636,
"epsilon_dpo/beta_margin_mean": 0.8146477937698364,
"epsilon_dpo/beta_margin_std": 1.0776177644729614,
"epsilon_dpo/loss_margin_mean": 21.99654197692871,
"grad_norm": 18.516475677490234,
"kl/avg_steps": 0.53125,
"kl/beta": 0.03749924153089523,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.645566304318526e-07,
"logits/chosen": -0.180327907204628,
"logits/rejected": -0.3625754117965698,
"logps/chosen": -84.18186950683594,
"logps/ref_chosen": -53.26664733886719,
"logps/ref_rejected": -73.84062194824219,
"logps/rejected": -126.75238037109375,
"loss": 0.9586,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.1543972492218018,
"rewards/margins": 0.8146477937698364,
"rewards/rejected": -1.9690450429916382,
"step": 275
},
{
"epoch": 0.41723356009070295,
"epsilon_dpo/beta": 0.037036679685115814,
"epsilon_dpo/beta_margin_grad_mean": -0.3456708490848541,
"epsilon_dpo/beta_margin_grad_std": 0.1778232604265213,
"epsilon_dpo/beta_margin_mean": 0.7665261030197144,
"epsilon_dpo/beta_margin_std": 0.9438202977180481,
"epsilon_dpo/loss_margin_mean": 20.800600051879883,
"grad_norm": 17.534378051757812,
"kl/avg_steps": 0.71875,
"kl/beta": 0.03730107843875885,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 3.633797984793294e-07,
"logits/chosen": -0.09510757774114609,
"logits/rejected": -0.17241308093070984,
"logps/chosen": -82.1324462890625,
"logps/ref_chosen": -53.02079772949219,
"logps/ref_rejected": -61.56678771972656,
"logps/rejected": -111.47903442382812,
"loss": 0.9401,
"rewards/accuracies": 0.859375,
"rewards/chosen": -1.080058217048645,
"rewards/margins": 0.7665261030197144,
"rewards/rejected": -1.8465843200683594,
"step": 276
},
{
"epoch": 0.41874527588813304,
"epsilon_dpo/beta": 0.03691127523779869,
"epsilon_dpo/beta_margin_grad_mean": -0.40965455770492554,
"epsilon_dpo/beta_margin_grad_std": 0.19617639482021332,
"epsilon_dpo/beta_margin_mean": 0.46141234040260315,
"epsilon_dpo/beta_margin_std": 0.9829038381576538,
"epsilon_dpo/loss_margin_mean": 12.678837776184082,
"grad_norm": 24.3159236907959,
"kl/avg_steps": 0.34375,
"kl/beta": 0.03703489154577255,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 3.6219979505011555e-07,
"logits/chosen": -0.24780939519405365,
"logits/rejected": -0.2727198898792267,
"logps/chosen": -104.44639587402344,
"logps/ref_chosen": -71.43299102783203,
"logps/ref_rejected": -67.65852355957031,
"logps/rejected": -113.35076904296875,
"loss": 1.1839,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.222653865814209,
"rewards/margins": 0.46141237020492554,
"rewards/rejected": -1.6840662956237793,
"step": 277
},
{
"epoch": 0.42025699168556313,
"epsilon_dpo/beta": 0.036704082041978836,
"epsilon_dpo/beta_margin_grad_mean": -0.35146912932395935,
"epsilon_dpo/beta_margin_grad_std": 0.20363350212574005,
"epsilon_dpo/beta_margin_mean": 0.7368968725204468,
"epsilon_dpo/beta_margin_std": 1.0814954042434692,
"epsilon_dpo/loss_margin_mean": 20.25160789489746,
"grad_norm": 22.260601043701172,
"kl/avg_steps": 0.5625,
"kl/beta": 0.036908019334077835,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.6101665315144353e-07,
"logits/chosen": -0.21412307024002075,
"logits/rejected": -0.2813052535057068,
"logps/chosen": -101.09065246582031,
"logps/ref_chosen": -67.11076354980469,
"logps/ref_rejected": -88.74851989746094,
"logps/rejected": -142.9800262451172,
"loss": 1.0192,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.2513198852539062,
"rewards/margins": 0.7368968725204468,
"rewards/rejected": -1.988216757774353,
"step": 278
},
{
"epoch": 0.4217687074829932,
"epsilon_dpo/beta": 0.036452893167734146,
"epsilon_dpo/beta_margin_grad_mean": -0.31533199548721313,
"epsilon_dpo/beta_margin_grad_std": 0.1786704957485199,
"epsilon_dpo/beta_margin_mean": 0.940020740032196,
"epsilon_dpo/beta_margin_std": 0.9835910201072693,
"epsilon_dpo/loss_margin_mean": 25.90764617919922,
"grad_norm": 19.41154670715332,
"kl/avg_steps": 0.6875,
"kl/beta": 0.03670157119631767,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 3.5983040587833563e-07,
"logits/chosen": -0.1814892441034317,
"logits/rejected": -0.2673693597316742,
"logps/chosen": -79.43757629394531,
"logps/ref_chosen": -54.49748611450195,
"logps/ref_rejected": -70.4237289428711,
"logps/rejected": -121.27146911621094,
"loss": 0.8423,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.9104586839675903,
"rewards/margins": 0.940020740032196,
"rewards/rejected": -1.8504793643951416,
"step": 279
},
{
"epoch": 0.42328042328042326,
"epsilon_dpo/beta": 0.03619259595870972,
"epsilon_dpo/beta_margin_grad_mean": -0.3112720847129822,
"epsilon_dpo/beta_margin_grad_std": 0.161660835146904,
"epsilon_dpo/beta_margin_mean": 0.9645206332206726,
"epsilon_dpo/beta_margin_std": 0.9408534169197083,
"epsilon_dpo/loss_margin_mean": 26.740095138549805,
"grad_norm": 18.046972274780273,
"kl/avg_steps": 0.71875,
"kl/beta": 0.03645097091794014,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 3.586410864126781e-07,
"logits/chosen": -0.1912391185760498,
"logits/rejected": -0.1932620406150818,
"logps/chosen": -87.58761596679688,
"logps/ref_chosen": -60.43281173706055,
"logps/ref_rejected": -78.39051818847656,
"logps/rejected": -132.28541564941406,
"loss": 0.8042,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.9834344387054443,
"rewards/margins": 0.9645205736160278,
"rewards/rejected": -1.9479551315307617,
"step": 280
},
{
"epoch": 0.42479213907785335,
"epsilon_dpo/beta": 0.0359908752143383,
"epsilon_dpo/beta_margin_grad_mean": -0.33162885904312134,
"epsilon_dpo/beta_margin_grad_std": 0.1964201033115387,
"epsilon_dpo/beta_margin_mean": 0.8618521094322205,
"epsilon_dpo/beta_margin_std": 1.0393693447113037,
"epsilon_dpo/loss_margin_mean": 24.112550735473633,
"grad_norm": 16.716167449951172,
"kl/avg_steps": 0.5625,
"kl/beta": 0.03619084879755974,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.574487280222929e-07,
"logits/chosen": -0.21409659087657928,
"logits/rejected": -0.08143429458141327,
"logps/chosen": -90.40107727050781,
"logps/ref_chosen": -60.2820930480957,
"logps/ref_rejected": -62.04009246826172,
"logps/rejected": -116.27163696289062,
"loss": 0.918,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0871957540512085,
"rewards/margins": 0.8618521094322205,
"rewards/rejected": -1.9490478038787842,
"step": 281
},
{
"epoch": 0.42630385487528344,
"epsilon_dpo/beta": 0.03582330420613289,
"epsilon_dpo/beta_margin_grad_mean": -0.35065773129463196,
"epsilon_dpo/beta_margin_grad_std": 0.21036547422409058,
"epsilon_dpo/beta_margin_mean": 0.8097745180130005,
"epsilon_dpo/beta_margin_std": 1.1492936611175537,
"epsilon_dpo/loss_margin_mean": 22.809677124023438,
"grad_norm": 21.746095657348633,
"kl/avg_steps": 0.46875,
"kl/beta": 0.03598841652274132,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.562533640600075e-07,
"logits/chosen": -0.13356426358222961,
"logits/rejected": -0.23361456394195557,
"logps/chosen": -94.24847412109375,
"logps/ref_chosen": -60.623924255371094,
"logps/ref_rejected": -68.67400360107422,
"logps/rejected": -125.10823822021484,
"loss": 0.9908,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2076689004898071,
"rewards/margins": 0.8097745180130005,
"rewards/rejected": -2.0174434185028076,
"step": 282
},
{
"epoch": 0.42781557067271353,
"epsilon_dpo/beta": 0.035622578114271164,
"epsilon_dpo/beta_margin_grad_mean": -0.367683082818985,
"epsilon_dpo/beta_margin_grad_std": 0.2163701206445694,
"epsilon_dpo/beta_margin_mean": 0.7181314826011658,
"epsilon_dpo/beta_margin_std": 1.1614021062850952,
"epsilon_dpo/loss_margin_mean": 20.334840774536133,
"grad_norm": 22.46578598022461,
"kl/avg_steps": 0.5625,
"kl/beta": 0.035820506513118744,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.550550279627215e-07,
"logits/chosen": -0.18255124986171722,
"logits/rejected": -0.3613712191581726,
"logps/chosen": -103.27592468261719,
"logps/ref_chosen": -67.64775085449219,
"logps/ref_rejected": -99.96835327148438,
"logps/rejected": -155.93136596679688,
"loss": 1.0633,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.271193027496338,
"rewards/margins": 0.718131422996521,
"rewards/rejected": -1.9893245697021484,
"step": 283
},
{
"epoch": 0.4293272864701436,
"epsilon_dpo/beta": 0.03538992255926132,
"epsilon_dpo/beta_margin_grad_mean": -0.35381340980529785,
"epsilon_dpo/beta_margin_grad_std": 0.18310926854610443,
"epsilon_dpo/beta_margin_mean": 0.7518596649169922,
"epsilon_dpo/beta_margin_std": 1.051992654800415,
"epsilon_dpo/loss_margin_mean": 21.368831634521484,
"grad_norm": 20.21920394897461,
"kl/avg_steps": 0.65625,
"kl/beta": 0.03562014177441597,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 3.5385375325047163e-07,
"logits/chosen": -0.17809271812438965,
"logits/rejected": -0.3383534252643585,
"logps/chosen": -90.8304214477539,
"logps/ref_chosen": -56.967430114746094,
"logps/ref_rejected": -86.36236572265625,
"logps/rejected": -141.5941925048828,
"loss": 0.982,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.200179934501648,
"rewards/margins": 0.7518596649169922,
"rewards/rejected": -1.9520395994186401,
"step": 284
},
{
"epoch": 0.4308390022675737,
"epsilon_dpo/beta": 0.03522555157542229,
"epsilon_dpo/beta_margin_grad_mean": -0.3701721429824829,
"epsilon_dpo/beta_margin_grad_std": 0.2017168402671814,
"epsilon_dpo/beta_margin_mean": 0.6842705607414246,
"epsilon_dpo/beta_margin_std": 1.0585107803344727,
"epsilon_dpo/loss_margin_mean": 19.605499267578125,
"grad_norm": 21.92341423034668,
"kl/avg_steps": 0.46875,
"kl/beta": 0.03538791090250015,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.5264957352549375e-07,
"logits/chosen": -0.19132962822914124,
"logits/rejected": -0.17737413942813873,
"logps/chosen": -111.88589477539062,
"logps/ref_chosen": -71.65611267089844,
"logps/ref_rejected": -81.63829803466797,
"logps/rejected": -141.4735870361328,
"loss": 1.0434,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.4212801456451416,
"rewards/margins": 0.6842705607414246,
"rewards/rejected": -2.105550765991211,
"step": 285
},
{
"epoch": 0.4323507180650038,
"epsilon_dpo/beta": 0.03495112061500549,
"epsilon_dpo/beta_margin_grad_mean": -0.3013160526752472,
"epsilon_dpo/beta_margin_grad_std": 0.1912383735179901,
"epsilon_dpo/beta_margin_mean": 1.0564639568328857,
"epsilon_dpo/beta_margin_std": 1.1228727102279663,
"epsilon_dpo/loss_margin_mean": 30.341690063476562,
"grad_norm": 18.68132209777832,
"kl/avg_steps": 0.78125,
"kl/beta": 0.03522280231118202,
"kl/n_epsilon_steps": 0.109375,
"kl/p_epsilon_steps": 0.890625,
"learning_rate": 3.514425224712835e-07,
"logits/chosen": -0.17713254690170288,
"logits/rejected": -0.27768129110336304,
"logps/chosen": -100.56034851074219,
"logps/ref_chosen": -61.07952117919922,
"logps/ref_rejected": -91.28128051757812,
"logps/rejected": -161.10379028320312,
"loss": 0.8217,
"rewards/accuracies": 0.890625,
"rewards/chosen": -1.3821173906326294,
"rewards/margins": 1.0564639568328857,
"rewards/rejected": -2.4385814666748047,
"step": 286
},
{
"epoch": 0.43386243386243384,
"epsilon_dpo/beta": 0.03471294790506363,
"epsilon_dpo/beta_margin_grad_mean": -0.3082159459590912,
"epsilon_dpo/beta_margin_grad_std": 0.19621455669403076,
"epsilon_dpo/beta_margin_mean": 1.0580402612686157,
"epsilon_dpo/beta_margin_std": 1.178950309753418,
"epsilon_dpo/loss_margin_mean": 30.61455726623535,
"grad_norm": 18.646347045898438,
"kl/avg_steps": 0.6875,
"kl/beta": 0.034949757158756256,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 3.502326338516534e-07,
"logits/chosen": 0.0036140456795692444,
"logits/rejected": -0.11716046184301376,
"logps/chosen": -79.26083374023438,
"logps/ref_chosen": -46.035789489746094,
"logps/ref_rejected": -59.95293426513672,
"logps/rejected": -123.79253387451172,
"loss": 0.8376,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1553316116333008,
"rewards/margins": 1.0580402612686157,
"rewards/rejected": -2.213371753692627,
"step": 287
},
{
"epoch": 0.43537414965986393,
"epsilon_dpo/beta": 0.0345844104886055,
"epsilon_dpo/beta_margin_grad_mean": -0.3722850978374481,
"epsilon_dpo/beta_margin_grad_std": 0.21924489736557007,
"epsilon_dpo/beta_margin_mean": 0.667302668094635,
"epsilon_dpo/beta_margin_std": 1.1633450984954834,
"epsilon_dpo/loss_margin_mean": 19.533231735229492,
"grad_norm": 23.35117530822754,
"kl/avg_steps": 0.375,
"kl/beta": 0.034711118787527084,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.490199415097892e-07,
"logits/chosen": -0.27200770378112793,
"logits/rejected": -0.33584830164909363,
"logps/chosen": -108.50701904296875,
"logps/ref_chosen": -65.3908462524414,
"logps/ref_rejected": -88.53607177734375,
"logps/rejected": -151.1854705810547,
"loss": 1.1016,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.4953001737594604,
"rewards/margins": 0.6673027276992798,
"rewards/rejected": -2.1626029014587402,
"step": 288
},
{
"epoch": 0.436885865457294,
"epsilon_dpo/beta": 0.03443358466029167,
"epsilon_dpo/beta_margin_grad_mean": -0.3693796992301941,
"epsilon_dpo/beta_margin_grad_std": 0.22231638431549072,
"epsilon_dpo/beta_margin_mean": 0.7420229315757751,
"epsilon_dpo/beta_margin_std": 1.2633713483810425,
"epsilon_dpo/loss_margin_mean": 21.776105880737305,
"grad_norm": 20.75128173828125,
"kl/avg_steps": 0.4375,
"kl/beta": 0.03458143770694733,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.4780447936730247e-07,
"logits/chosen": -0.10027895122766495,
"logits/rejected": -0.2047712355852127,
"logps/chosen": -98.62236022949219,
"logps/ref_chosen": -54.5936279296875,
"logps/ref_rejected": -67.20855712890625,
"logps/rejected": -133.01339721679688,
"loss": 1.0824,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.5208301544189453,
"rewards/margins": 0.7420229315757751,
"rewards/rejected": -2.2628531455993652,
"step": 289
},
{
"epoch": 0.4383975812547241,
"epsilon_dpo/beta": 0.03421903774142265,
"epsilon_dpo/beta_margin_grad_mean": -0.3281201720237732,
"epsilon_dpo/beta_margin_grad_std": 0.21743208169937134,
"epsilon_dpo/beta_margin_mean": 0.9275364279747009,
"epsilon_dpo/beta_margin_std": 1.241027593612671,
"epsilon_dpo/loss_margin_mean": 27.293519973754883,
"grad_norm": 25.378704071044922,
"kl/avg_steps": 0.625,
"kl/beta": 0.03443080559372902,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.465862814232821e-07,
"logits/chosen": -0.012030299752950668,
"logits/rejected": -0.26927900314331055,
"logps/chosen": -110.91470336914062,
"logps/ref_chosen": -61.38457489013672,
"logps/ref_rejected": -91.92778015136719,
"logps/rejected": -168.75143432617188,
"loss": 0.9558,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.696755051612854,
"rewards/margins": 0.9275364875793457,
"rewards/rejected": -2.62429141998291,
"step": 290
},
{
"epoch": 0.4399092970521542,
"epsilon_dpo/beta": 0.03403857350349426,
"epsilon_dpo/beta_margin_grad_mean": -0.33269572257995605,
"epsilon_dpo/beta_margin_grad_std": 0.21903719007968903,
"epsilon_dpo/beta_margin_mean": 0.9532585144042969,
"epsilon_dpo/beta_margin_std": 1.3030027151107788,
"epsilon_dpo/loss_margin_mean": 28.213741302490234,
"grad_norm": 22.436275482177734,
"kl/avg_steps": 0.53125,
"kl/beta": 0.034216947853565216,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.4536538175334343e-07,
"logits/chosen": 0.014822449535131454,
"logits/rejected": -0.14718475937843323,
"logps/chosen": -97.53120422363281,
"logps/ref_chosen": -50.863037109375,
"logps/ref_rejected": -82.20868682861328,
"logps/rejected": -157.09060668945312,
"loss": 0.9556,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.5923006534576416,
"rewards/margins": 0.9532584547996521,
"rewards/rejected": -2.5455589294433594,
"step": 291
},
{
"epoch": 0.4414210128495843,
"epsilon_dpo/beta": 0.03389061242341995,
"epsilon_dpo/beta_margin_grad_mean": -0.35319986939430237,
"epsilon_dpo/beta_margin_grad_std": 0.21678273379802704,
"epsilon_dpo/beta_margin_mean": 0.7712721228599548,
"epsilon_dpo/beta_margin_std": 1.1954424381256104,
"epsilon_dpo/loss_margin_mean": 22.987010955810547,
"grad_norm": 22.653987884521484,
"kl/avg_steps": 0.4375,
"kl/beta": 0.03403612971305847,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.4414181450867465e-07,
"logits/chosen": -0.062173761427402496,
"logits/rejected": -0.27091309428215027,
"logps/chosen": -109.98258972167969,
"logps/ref_chosen": -64.34888458251953,
"logps/ref_rejected": -72.86434936523438,
"logps/rejected": -141.4850616455078,
"loss": 1.0382,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5515226125717163,
"rewards/margins": 0.7712721824645996,
"rewards/rejected": -2.3227949142456055,
"step": 292
},
{
"epoch": 0.4429327286470144,
"epsilon_dpo/beta": 0.033700622618198395,
"epsilon_dpo/beta_margin_grad_mean": -0.3102937936782837,
"epsilon_dpo/beta_margin_grad_std": 0.2274078130722046,
"epsilon_dpo/beta_margin_mean": 1.0588384866714478,
"epsilon_dpo/beta_margin_std": 1.3324589729309082,
"epsilon_dpo/loss_margin_mean": 31.65074920654297,
"grad_norm": 17.412572860717773,
"kl/avg_steps": 0.5625,
"kl/beta": 0.033887870609760284,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.4291561391508185e-07,
"logits/chosen": -0.03904179483652115,
"logits/rejected": -0.28950247168540955,
"logps/chosen": -103.18391418457031,
"logps/ref_chosen": -54.86946487426758,
"logps/ref_rejected": -81.858642578125,
"logps/rejected": -161.82383728027344,
"loss": 0.9154,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.6325865983963013,
"rewards/margins": 1.0588384866714478,
"rewards/rejected": -2.69142484664917,
"step": 293
},
{
"epoch": 0.4444444444444444,
"epsilon_dpo/beta": 0.03351211920380592,
"epsilon_dpo/beta_margin_grad_mean": -0.3480183780193329,
"epsilon_dpo/beta_margin_grad_std": 0.19195351004600525,
"epsilon_dpo/beta_margin_mean": 0.8353626132011414,
"epsilon_dpo/beta_margin_std": 1.139838695526123,
"epsilon_dpo/loss_margin_mean": 25.088672637939453,
"grad_norm": 19.270187377929688,
"kl/avg_steps": 0.5625,
"kl/beta": 0.033698320388793945,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.4168681427203153e-07,
"logits/chosen": 0.008149133995175362,
"logits/rejected": -0.030063778162002563,
"logps/chosen": -103.8961181640625,
"logps/ref_chosen": -56.6708984375,
"logps/ref_rejected": -70.32819366455078,
"logps/rejected": -142.64208984375,
"loss": 0.9541,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.5827089548110962,
"rewards/margins": 0.8353626728057861,
"rewards/rejected": -2.4180715084075928,
"step": 294
},
{
"epoch": 0.4459561602418745,
"epsilon_dpo/beta": 0.033387500792741776,
"epsilon_dpo/beta_margin_grad_mean": -0.3709297478199005,
"epsilon_dpo/beta_margin_grad_std": 0.2090597152709961,
"epsilon_dpo/beta_margin_mean": 0.7060700058937073,
"epsilon_dpo/beta_margin_std": 1.1487157344818115,
"epsilon_dpo/loss_margin_mean": 21.37085723876953,
"grad_norm": 24.286874771118164,
"kl/avg_steps": 0.375,
"kl/beta": 0.03350982442498207,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.4045544995169125e-07,
"logits/chosen": 0.03945862129330635,
"logits/rejected": -0.244182288646698,
"logps/chosen": -103.69340515136719,
"logps/ref_chosen": -50.40088653564453,
"logps/ref_rejected": -83.43521881103516,
"logps/rejected": -158.0985870361328,
"loss": 1.059,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7828543186187744,
"rewards/margins": 0.7060699462890625,
"rewards/rejected": -2.488924264907837,
"step": 295
},
{
"epoch": 0.4474678760393046,
"epsilon_dpo/beta": 0.033210597932338715,
"epsilon_dpo/beta_margin_grad_mean": -0.33759990334510803,
"epsilon_dpo/beta_margin_grad_std": 0.2221236228942871,
"epsilon_dpo/beta_margin_mean": 0.8879937529563904,
"epsilon_dpo/beta_margin_std": 1.2552028894424438,
"epsilon_dpo/loss_margin_mean": 26.967525482177734,
"grad_norm": 22.84150505065918,
"kl/avg_steps": 0.53125,
"kl/beta": 0.03338463231921196,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.392215553979679e-07,
"logits/chosen": -0.16981446743011475,
"logits/rejected": -0.22287404537200928,
"logps/chosen": -121.29803466796875,
"logps/ref_chosen": -69.15034484863281,
"logps/ref_rejected": -89.60166931152344,
"logps/rejected": -168.7168731689453,
"loss": 0.9854,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.7358465194702148,
"rewards/margins": 0.8879938125610352,
"rewards/rejected": -2.62384033203125,
"step": 296
},
{
"epoch": 0.4489795918367347,
"epsilon_dpo/beta": 0.03300396353006363,
"epsilon_dpo/beta_margin_grad_mean": -0.32552599906921387,
"epsilon_dpo/beta_margin_grad_std": 0.199081152677536,
"epsilon_dpo/beta_margin_mean": 0.9136903285980225,
"epsilon_dpo/beta_margin_std": 1.0769370794296265,
"epsilon_dpo/loss_margin_mean": 27.85042381286621,
"grad_norm": 21.35120964050293,
"kl/avg_steps": 0.625,
"kl/beta": 0.0332082137465477,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.3798516512554485e-07,
"logits/chosen": -0.04735187068581581,
"logits/rejected": -0.19100773334503174,
"logps/chosen": -113.34721374511719,
"logps/ref_chosen": -58.01630401611328,
"logps/ref_rejected": -69.95780944824219,
"logps/rejected": -153.13912963867188,
"loss": 0.8976,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.8296699523925781,
"rewards/margins": 0.9136903285980225,
"rewards/rejected": -2.7433602809906006,
"step": 297
},
{
"epoch": 0.4504913076341648,
"epsilon_dpo/beta": 0.03284022584557533,
"epsilon_dpo/beta_margin_grad_mean": -0.3704070448875427,
"epsilon_dpo/beta_margin_grad_std": 0.22375141084194183,
"epsilon_dpo/beta_margin_mean": 0.7068548202514648,
"epsilon_dpo/beta_margin_std": 1.218665599822998,
"epsilon_dpo/loss_margin_mean": 21.750486373901367,
"grad_norm": 22.46474266052246,
"kl/avg_steps": 0.5,
"kl/beta": 0.03300195187330246,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.367463137189156e-07,
"logits/chosen": -0.0493951290845871,
"logits/rejected": -0.21405287086963654,
"logps/chosen": -111.58262634277344,
"logps/ref_chosen": -56.1693115234375,
"logps/ref_rejected": -68.55052185058594,
"logps/rejected": -145.71432495117188,
"loss": 1.0957,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.8223876953125,
"rewards/margins": 0.7068548202514648,
"rewards/rejected": -2.529242515563965,
"step": 298
},
{
"epoch": 0.4520030234315949,
"epsilon_dpo/beta": 0.03270244970917702,
"epsilon_dpo/beta_margin_grad_mean": -0.3751116693019867,
"epsilon_dpo/beta_margin_grad_std": 0.22815139591693878,
"epsilon_dpo/beta_margin_mean": 0.6882988810539246,
"epsilon_dpo/beta_margin_std": 1.2340561151504517,
"epsilon_dpo/loss_margin_mean": 21.294403076171875,
"grad_norm": 22.5549373626709,
"kl/avg_steps": 0.421875,
"kl/beta": 0.03283776342868805,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.355050358314172e-07,
"logits/chosen": -0.036409709602594376,
"logits/rejected": -0.10160522162914276,
"logps/chosen": -115.97065734863281,
"logps/ref_chosen": -62.31780242919922,
"logps/ref_rejected": -72.60028839111328,
"logps/rejected": -147.54754638671875,
"loss": 1.1165,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.7570571899414062,
"rewards/margins": 0.6882988214492798,
"rewards/rejected": -2.4453561305999756,
"step": 299
},
{
"epoch": 0.45351473922902497,
"epsilon_dpo/beta": 0.03252934664487839,
"epsilon_dpo/beta_margin_grad_mean": -0.35380449891090393,
"epsilon_dpo/beta_margin_grad_std": 0.2140689194202423,
"epsilon_dpo/beta_margin_mean": 0.7571981549263,
"epsilon_dpo/beta_margin_std": 1.1229230165481567,
"epsilon_dpo/loss_margin_mean": 23.49399185180664,
"grad_norm": 20.840009689331055,
"kl/avg_steps": 0.53125,
"kl/beta": 0.032699812203645706,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.3426136618426043e-07,
"logits/chosen": -0.0998692736029625,
"logits/rejected": -0.21543878316879272,
"logps/chosen": -116.08995819091797,
"logps/ref_chosen": -60.38157653808594,
"logps/ref_rejected": -75.45442199707031,
"logps/rejected": -154.65679931640625,
"loss": 1.0235,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.815822720527649,
"rewards/margins": 0.7571982145309448,
"rewards/rejected": -2.5730209350585938,
"step": 300
},
{
"epoch": 0.45351473922902497,
"eval_epsilon_dpo/beta": 0.032380782067775726,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.3618065118789673,
"eval_epsilon_dpo/beta_margin_grad_std": 0.21422915160655975,
"eval_epsilon_dpo/beta_margin_mean": 0.7358340620994568,
"eval_epsilon_dpo/beta_margin_std": 1.17220139503479,
"eval_epsilon_dpo/loss_margin_mean": 22.956846237182617,
"eval_kl/n_epsilon_steps": 0.2698063254356384,
"eval_kl/p_epsilon_steps": 0.7293133735656738,
"eval_logits/chosen": -0.04810130596160889,
"eval_logits/rejected": -0.17416058480739594,
"eval_logps/chosen": -124.96621704101562,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -152.61257934570312,
"eval_loss": 0.5322726368904114,
"eval_rewards/accuracies": 0.7284330725669861,
"eval_rewards/chosen": -1.6264508962631226,
"eval_rewards/margins": 0.7358340620994568,
"eval_rewards/rejected": -2.3622844219207764,
"eval_runtime": 41.8164,
"eval_samples_per_second": 55.074,
"eval_steps_per_second": 1.722,
"step": 300
},
{
"epoch": 0.455026455026455,
"epsilon_dpo/beta": 0.03237777575850487,
"epsilon_dpo/beta_margin_grad_mean": -0.3747977912425995,
"epsilon_dpo/beta_margin_grad_std": 0.2182133048772812,
"epsilon_dpo/beta_margin_mean": 0.667656421661377,
"epsilon_dpo/beta_margin_std": 1.203988790512085,
"epsilon_dpo/loss_margin_mean": 20.853044509887695,
"grad_norm": 18.300973892211914,
"kl/avg_steps": 0.46875,
"kl/beta": 0.03252701088786125,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.3301533956555885e-07,
"logits/chosen": -0.003833577036857605,
"logits/rejected": -0.12104681879281998,
"logps/chosen": -106.81861877441406,
"logps/ref_chosen": -52.85089111328125,
"logps/ref_rejected": -69.97584533691406,
"logps/rejected": -144.79661560058594,
"loss": 1.1116,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.7513763904571533,
"rewards/margins": 0.6676563620567322,
"rewards/rejected": -2.4190328121185303,
"step": 301
},
{
"epoch": 0.4565381708238851,
"epsilon_dpo/beta": 0.03224685415625572,
"epsilon_dpo/beta_margin_grad_mean": -0.4092699885368347,
"epsilon_dpo/beta_margin_grad_std": 0.20309504866600037,
"epsilon_dpo/beta_margin_mean": 0.464851975440979,
"epsilon_dpo/beta_margin_std": 1.077169418334961,
"epsilon_dpo/loss_margin_mean": 14.629032135009766,
"grad_norm": 23.397573471069336,
"kl/avg_steps": 0.40625,
"kl/beta": 0.03237525373697281,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.317669908293554e-07,
"logits/chosen": -0.126109316945076,
"logits/rejected": -0.29567620158195496,
"logps/chosen": -126.37675476074219,
"logps/ref_chosen": -66.96651458740234,
"logps/ref_rejected": -88.0951156616211,
"logps/rejected": -162.1343994140625,
"loss": 1.2144,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.9191679954528809,
"rewards/margins": 0.464851975440979,
"rewards/rejected": -2.3840200901031494,
"step": 302
},
{
"epoch": 0.4580498866213152,
"epsilon_dpo/beta": 0.03210131451487541,
"epsilon_dpo/beta_margin_grad_mean": -0.3311282992362976,
"epsilon_dpo/beta_margin_grad_std": 0.23038393259048462,
"epsilon_dpo/beta_margin_mean": 0.9265754818916321,
"epsilon_dpo/beta_margin_std": 1.2422301769256592,
"epsilon_dpo/loss_margin_mean": 29.129186630249023,
"grad_norm": 16.80988121032715,
"kl/avg_steps": 0.453125,
"kl/beta": 0.03224426135420799,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.3051635489464793e-07,
"logits/chosen": -0.14565876126289368,
"logits/rejected": -0.22940771281719208,
"logps/chosen": -111.89651489257812,
"logps/ref_chosen": -62.12152862548828,
"logps/ref_rejected": -90.31204223632812,
"logps/rejected": -169.21621704101562,
"loss": 0.9654,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6028306484222412,
"rewards/margins": 0.9265754222869873,
"rewards/rejected": -2.5294060707092285,
"step": 303
},
{
"epoch": 0.4595616024187453,
"epsilon_dpo/beta": 0.031871289014816284,
"epsilon_dpo/beta_margin_grad_mean": -0.31583327054977417,
"epsilon_dpo/beta_margin_grad_std": 0.18183431029319763,
"epsilon_dpo/beta_margin_mean": 0.9324640035629272,
"epsilon_dpo/beta_margin_std": 0.9843049645423889,
"epsilon_dpo/loss_margin_mean": 29.384740829467773,
"grad_norm": 18.028322219848633,
"kl/avg_steps": 0.71875,
"kl/beta": 0.032098811119794846,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 3.292634667444117e-07,
"logits/chosen": -0.1078590601682663,
"logits/rejected": -0.2193235456943512,
"logps/chosen": -108.30549621582031,
"logps/ref_chosen": -60.69508361816406,
"logps/ref_rejected": -78.25254821777344,
"logps/rejected": -155.24769592285156,
"loss": 0.8495,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.5200098752975464,
"rewards/margins": 0.9324639439582825,
"rewards/rejected": -2.4524738788604736,
"step": 304
},
{
"epoch": 0.46107331821617537,
"epsilon_dpo/beta": 0.03172353282570839,
"epsilon_dpo/beta_margin_grad_mean": -0.37009990215301514,
"epsilon_dpo/beta_margin_grad_std": 0.22368741035461426,
"epsilon_dpo/beta_margin_mean": 0.6786959767341614,
"epsilon_dpo/beta_margin_std": 1.179148554801941,
"epsilon_dpo/loss_margin_mean": 21.638240814208984,
"grad_norm": 23.151081085205078,
"kl/avg_steps": 0.46875,
"kl/beta": 0.03186975046992302,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.280083614246217e-07,
"logits/chosen": -0.24610410630702972,
"logits/rejected": -0.1328166276216507,
"logps/chosen": -127.88774871826172,
"logps/ref_chosen": -72.69914245605469,
"logps/ref_rejected": -65.65670776367188,
"logps/rejected": -142.48355102539062,
"loss": 1.1048,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.756119728088379,
"rewards/margins": 0.6786960363388062,
"rewards/rejected": -2.4348158836364746,
"step": 305
},
{
"epoch": 0.46258503401360546,
"epsilon_dpo/beta": 0.03153586760163307,
"epsilon_dpo/beta_margin_grad_mean": -0.3657355010509491,
"epsilon_dpo/beta_margin_grad_std": 0.17301443219184875,
"epsilon_dpo/beta_margin_mean": 0.6566804647445679,
"epsilon_dpo/beta_margin_std": 0.9133028984069824,
"epsilon_dpo/loss_margin_mean": 20.97022247314453,
"grad_norm": 17.611703872680664,
"kl/avg_steps": 0.59375,
"kl/beta": 0.03172105550765991,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 3.267510740432719e-07,
"logits/chosen": -0.04794091358780861,
"logits/rejected": -0.31678956747055054,
"logps/chosen": -106.12442016601562,
"logps/ref_chosen": -53.97052764892578,
"logps/ref_rejected": -71.02423095703125,
"logps/rejected": -144.14834594726562,
"loss": 1.0025,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6451444625854492,
"rewards/margins": 0.6566804647445679,
"rewards/rejected": -2.3018250465393066,
"step": 306
},
{
"epoch": 0.46409674981103555,
"epsilon_dpo/beta": 0.03148770332336426,
"epsilon_dpo/beta_margin_grad_mean": -0.4137803316116333,
"epsilon_dpo/beta_margin_grad_std": 0.2295747846364975,
"epsilon_dpo/beta_margin_mean": 0.4782416522502899,
"epsilon_dpo/beta_margin_std": 1.1538732051849365,
"epsilon_dpo/loss_margin_mean": 15.484113693237305,
"grad_norm": 20.8459415435791,
"kl/avg_steps": 0.15625,
"kl/beta": 0.031533826142549515,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 3.2549163976939285e-07,
"logits/chosen": -0.09461039304733276,
"logits/rejected": -0.21562448143959045,
"logps/chosen": -102.38115692138672,
"logps/ref_chosen": -57.413108825683594,
"logps/ref_rejected": -68.68011474609375,
"logps/rejected": -129.13226318359375,
"loss": 1.2484,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4213111400604248,
"rewards/margins": 0.4782416820526123,
"rewards/rejected": -1.899552822113037,
"step": 307
},
{
"epoch": 0.4656084656084656,
"epsilon_dpo/beta": 0.031350020319223404,
"epsilon_dpo/beta_margin_grad_mean": -0.37121155858039856,
"epsilon_dpo/beta_margin_grad_std": 0.21223023533821106,
"epsilon_dpo/beta_margin_mean": 0.6495247483253479,
"epsilon_dpo/beta_margin_std": 1.077079176902771,
"epsilon_dpo/loss_margin_mean": 20.942363739013672,
"grad_norm": 17.589200973510742,
"kl/avg_steps": 0.4375,
"kl/beta": 0.03148462995886803,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.2423009383206874e-07,
"logits/chosen": -0.22774261236190796,
"logits/rejected": -0.2404198795557022,
"logps/chosen": -115.05911254882812,
"logps/ref_chosen": -66.59878540039062,
"logps/ref_rejected": -74.337158203125,
"logps/rejected": -143.73983764648438,
"loss": 1.0843,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.523284912109375,
"rewards/margins": 0.6495247483253479,
"rewards/rejected": -2.172809600830078,
"step": 308
},
{
"epoch": 0.4671201814058957,
"epsilon_dpo/beta": 0.031213458627462387,
"epsilon_dpo/beta_margin_grad_mean": -0.3729403018951416,
"epsilon_dpo/beta_margin_grad_std": 0.19963404536247253,
"epsilon_dpo/beta_margin_mean": 0.6403841376304626,
"epsilon_dpo/beta_margin_std": 1.0186731815338135,
"epsilon_dpo/loss_margin_mean": 20.727285385131836,
"grad_norm": 20.43776512145996,
"kl/avg_steps": 0.4375,
"kl/beta": 0.03134748339653015,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.229664715194511e-07,
"logits/chosen": -0.06137102097272873,
"logits/rejected": -0.2520020008087158,
"logps/chosen": -119.87788391113281,
"logps/ref_chosen": -65.39474487304688,
"logps/ref_rejected": -75.70930480957031,
"logps/rejected": -150.91973876953125,
"loss": 1.0623,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7041327953338623,
"rewards/margins": 0.6403840780258179,
"rewards/rejected": -2.3445167541503906,
"step": 309
},
{
"epoch": 0.46863189720332576,
"epsilon_dpo/beta": 0.031126268208026886,
"epsilon_dpo/beta_margin_grad_mean": -0.42556193470954895,
"epsilon_dpo/beta_margin_grad_std": 0.2021602839231491,
"epsilon_dpo/beta_margin_mean": 0.3931906819343567,
"epsilon_dpo/beta_margin_std": 1.007182002067566,
"epsilon_dpo/loss_margin_mean": 12.860428810119629,
"grad_norm": 20.93350601196289,
"kl/avg_steps": 0.28125,
"kl/beta": 0.03121093660593033,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 3.2170080817777257e-07,
"logits/chosen": -0.32511359453201294,
"logits/rejected": -0.30830568075180054,
"logps/chosen": -129.52880859375,
"logps/ref_chosen": -74.66827392578125,
"logps/ref_rejected": -80.5689697265625,
"logps/rejected": -148.28994750976562,
"loss": 1.2497,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.7097536325454712,
"rewards/margins": 0.3931906521320343,
"rewards/rejected": -2.1029443740844727,
"step": 310
},
{
"epoch": 0.47014361300075586,
"epsilon_dpo/beta": 0.030970880761742592,
"epsilon_dpo/beta_margin_grad_mean": -0.3730762004852295,
"epsilon_dpo/beta_margin_grad_std": 0.18645299971103668,
"epsilon_dpo/beta_margin_mean": 0.6560811996459961,
"epsilon_dpo/beta_margin_std": 1.0826748609542847,
"epsilon_dpo/loss_margin_mean": 21.375320434570312,
"grad_norm": 18.205066680908203,
"kl/avg_steps": 0.5,
"kl/beta": 0.03112340159714222,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.204331392103574e-07,
"logits/chosen": -0.09656871855258942,
"logits/rejected": -0.35739874839782715,
"logps/chosen": -103.98506927490234,
"logps/ref_chosen": -59.73802947998047,
"logps/ref_rejected": -93.60757446289062,
"logps/rejected": -159.2299346923828,
"loss": 1.0528,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3724453449249268,
"rewards/margins": 0.6560811996459961,
"rewards/rejected": -2.028526544570923,
"step": 311
},
{
"epoch": 0.47165532879818595,
"epsilon_dpo/beta": 0.030758727341890335,
"epsilon_dpo/beta_margin_grad_mean": -0.33074235916137695,
"epsilon_dpo/beta_margin_grad_std": 0.17869852483272552,
"epsilon_dpo/beta_margin_mean": 0.8504709601402283,
"epsilon_dpo/beta_margin_std": 0.967566192150116,
"epsilon_dpo/loss_margin_mean": 27.792688369750977,
"grad_norm": 15.504898071289062,
"kl/avg_steps": 0.6875,
"kl/beta": 0.03096855990588665,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 3.1916350007663176e-07,
"logits/chosen": -0.03940066695213318,
"logits/rejected": -0.26339995861053467,
"logps/chosen": -99.76426696777344,
"logps/ref_chosen": -53.816436767578125,
"logps/ref_rejected": -68.6575698852539,
"logps/rejected": -142.39810180664062,
"loss": 0.8924,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.4157965183258057,
"rewards/margins": 0.8504709601402283,
"rewards/rejected": -2.2662672996520996,
"step": 312
},
{
"epoch": 0.47316704459561604,
"epsilon_dpo/beta": 0.030654441565275192,
"epsilon_dpo/beta_margin_grad_mean": -0.40436407923698425,
"epsilon_dpo/beta_margin_grad_std": 0.1993086040019989,
"epsilon_dpo/beta_margin_mean": 0.5042582750320435,
"epsilon_dpo/beta_margin_std": 1.0631064176559448,
"epsilon_dpo/loss_margin_mean": 16.674205780029297,
"grad_norm": 19.160459518432617,
"kl/avg_steps": 0.34375,
"kl/beta": 0.030757104977965355,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 3.178919262911314e-07,
"logits/chosen": -0.13525782525539398,
"logits/rejected": -0.19805659353733063,
"logps/chosen": -105.85040283203125,
"logps/ref_chosen": -59.957359313964844,
"logps/ref_rejected": -69.31729888916016,
"logps/rejected": -131.88455200195312,
"loss": 1.1747,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.4092702865600586,
"rewards/margins": 0.5042582750320435,
"rewards/rejected": -1.9135286808013916,
"step": 313
},
{
"epoch": 0.47467876039304613,
"epsilon_dpo/beta": 0.03049194999039173,
"epsilon_dpo/beta_margin_grad_mean": -0.336773157119751,
"epsilon_dpo/beta_margin_grad_std": 0.17656877636909485,
"epsilon_dpo/beta_margin_mean": 0.8239002823829651,
"epsilon_dpo/beta_margin_std": 0.9447659850120544,
"epsilon_dpo/loss_margin_mean": 27.19158172607422,
"grad_norm": 16.053180694580078,
"kl/avg_steps": 0.53125,
"kl/beta": 0.030651738867163658,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.166184534225087e-07,
"logits/chosen": -0.2134556770324707,
"logits/rejected": -0.1250694841146469,
"logps/chosen": -113.34410095214844,
"logps/ref_chosen": -70.26815795898438,
"logps/ref_rejected": -69.23971557617188,
"logps/rejected": -139.50723266601562,
"loss": 0.9013,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.3145241737365723,
"rewards/margins": 0.8239003419876099,
"rewards/rejected": -2.1384243965148926,
"step": 314
},
{
"epoch": 0.47619047619047616,
"epsilon_dpo/beta": 0.030359404161572456,
"epsilon_dpo/beta_margin_grad_mean": -0.36150962114334106,
"epsilon_dpo/beta_margin_grad_std": 0.1857806146144867,
"epsilon_dpo/beta_margin_mean": 0.6679502725601196,
"epsilon_dpo/beta_margin_std": 0.9110891819000244,
"epsilon_dpo/loss_margin_mean": 22.209096908569336,
"grad_norm": 16.2976131439209,
"kl/avg_steps": 0.4375,
"kl/beta": 0.030489761382341385,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.1534311709253723e-07,
"logits/chosen": -0.2032572478055954,
"logits/rejected": -0.28827083110809326,
"logps/chosen": -112.1826171875,
"logps/ref_chosen": -67.79469299316406,
"logps/ref_rejected": -74.55148315429688,
"logps/rejected": -141.14849853515625,
"loss": 1.0045,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3489515781402588,
"rewards/margins": 0.6679502725601196,
"rewards/rejected": -2.016901969909668,
"step": 315
},
{
"epoch": 0.47770219198790626,
"epsilon_dpo/beta": 0.030198698863387108,
"epsilon_dpo/beta_margin_grad_mean": -0.33820757269859314,
"epsilon_dpo/beta_margin_grad_std": 0.18698331713676453,
"epsilon_dpo/beta_margin_mean": 0.8183820247650146,
"epsilon_dpo/beta_margin_std": 0.9690468907356262,
"epsilon_dpo/loss_margin_mean": 27.29484748840332,
"grad_norm": 16.733016967773438,
"kl/avg_steps": 0.53125,
"kl/beta": 0.03035695105791092,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.1406595297511564e-07,
"logits/chosen": -0.11085818707942963,
"logits/rejected": -0.2800089120864868,
"logps/chosen": -98.94679260253906,
"logps/ref_chosen": -55.288482666015625,
"logps/ref_rejected": -96.15723419189453,
"logps/rejected": -167.1103973388672,
"loss": 0.9183,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3192174434661865,
"rewards/margins": 0.8183820247650146,
"rewards/rejected": -2.137599468231201,
"step": 316
},
{
"epoch": 0.47921390778533635,
"epsilon_dpo/beta": 0.030001366510987282,
"epsilon_dpo/beta_margin_grad_mean": -0.31859520077705383,
"epsilon_dpo/beta_margin_grad_std": 0.1786729246377945,
"epsilon_dpo/beta_margin_mean": 0.917066752910614,
"epsilon_dpo/beta_margin_std": 0.9616928100585938,
"epsilon_dpo/loss_margin_mean": 30.723278045654297,
"grad_norm": 17.434093475341797,
"kl/avg_steps": 0.65625,
"kl/beta": 0.030196530744433403,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 3.1278699679526975e-07,
"logits/chosen": -0.15586476027965546,
"logits/rejected": -0.23516878485679626,
"logps/chosen": -92.65066528320312,
"logps/ref_chosen": -54.58137512207031,
"logps/ref_rejected": -72.77232360839844,
"logps/rejected": -141.56488037109375,
"loss": 0.8498,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1454646587371826,
"rewards/margins": 0.9170666933059692,
"rewards/rejected": -2.0625312328338623,
"step": 317
},
{
"epoch": 0.48072562358276644,
"epsilon_dpo/beta": 0.0298901479691267,
"epsilon_dpo/beta_margin_grad_mean": -0.388794481754303,
"epsilon_dpo/beta_margin_grad_std": 0.23832088708877563,
"epsilon_dpo/beta_margin_mean": 0.6110854744911194,
"epsilon_dpo/beta_margin_std": 1.267026424407959,
"epsilon_dpo/loss_margin_mean": 20.742528915405273,
"grad_norm": 18.73935890197754,
"kl/avg_steps": 0.375,
"kl/beta": 0.029999658465385437,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.1150628432815336e-07,
"logits/chosen": -0.0694267749786377,
"logits/rejected": -0.24108974635601044,
"logps/chosen": -97.48424530029297,
"logps/ref_chosen": -52.88822937011719,
"logps/ref_rejected": -80.63988494873047,
"logps/rejected": -145.97842407226562,
"loss": 1.1939,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.3360154628753662,
"rewards/margins": 0.6110855340957642,
"rewards/rejected": -1.94710111618042,
"step": 318
},
{
"epoch": 0.48223733938019653,
"epsilon_dpo/beta": 0.029741115868091583,
"epsilon_dpo/beta_margin_grad_mean": -0.3368452191352844,
"epsilon_dpo/beta_margin_grad_std": 0.19771793484687805,
"epsilon_dpo/beta_margin_mean": 0.8668777942657471,
"epsilon_dpo/beta_margin_std": 1.0975935459136963,
"epsilon_dpo/loss_margin_mean": 29.372663497924805,
"grad_norm": 16.870088577270508,
"kl/avg_steps": 0.5,
"kl/beta": 0.029887579381465912,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.1022385139804707e-07,
"logits/chosen": -0.20363673567771912,
"logits/rejected": -0.15058369934558868,
"logps/chosen": -105.547119140625,
"logps/ref_chosen": -64.36333465576172,
"logps/ref_rejected": -79.47296142578125,
"logps/rejected": -150.0294189453125,
"loss": 0.9279,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2278856039047241,
"rewards/margins": 0.8668777942657471,
"rewards/rejected": -2.0947632789611816,
"step": 319
},
{
"epoch": 0.4837490551776266,
"epsilon_dpo/beta": 0.029621032997965813,
"epsilon_dpo/beta_margin_grad_mean": -0.383794903755188,
"epsilon_dpo/beta_margin_grad_std": 0.20442117750644684,
"epsilon_dpo/beta_margin_mean": 0.6200546026229858,
"epsilon_dpo/beta_margin_std": 1.122438669204712,
"epsilon_dpo/loss_margin_mean": 21.16710662841797,
"grad_norm": 19.105484008789062,
"kl/avg_steps": 0.40625,
"kl/beta": 0.029738886281847954,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": -0.11793963611125946,
"logits/rejected": -0.16425363719463348,
"logps/chosen": -90.87725067138672,
"logps/ref_chosen": -49.558746337890625,
"logps/ref_rejected": -71.23444366455078,
"logps/rejected": -133.72006225585938,
"loss": 1.1081,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.225843906402588,
"rewards/margins": 0.6200546026229858,
"rewards/rejected": -1.8458983898162842,
"step": 320
},
{
"epoch": 0.4852607709750567,
"epsilon_dpo/beta": 0.0294363871216774,
"epsilon_dpo/beta_margin_grad_mean": -0.3642115890979767,
"epsilon_dpo/beta_margin_grad_std": 0.2081499844789505,
"epsilon_dpo/beta_margin_mean": 0.6765070557594299,
"epsilon_dpo/beta_margin_std": 1.0760635137557983,
"epsilon_dpo/loss_margin_mean": 23.185911178588867,
"grad_norm": 19.51809310913086,
"kl/avg_steps": 0.625,
"kl/beta": 0.029618559405207634,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.0765396768561004e-07,
"logits/chosen": 0.006262287497520447,
"logits/rejected": -0.04578740894794464,
"logps/chosen": -94.43731689453125,
"logps/ref_chosen": -52.085269927978516,
"logps/ref_rejected": -55.58674621582031,
"logps/rejected": -121.12471008300781,
"loss": 1.0622,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2502509355545044,
"rewards/margins": 0.6765070557594299,
"rewards/rejected": -1.926758050918579,
"step": 321
},
{
"epoch": 0.48677248677248675,
"epsilon_dpo/beta": 0.02921675518155098,
"epsilon_dpo/beta_margin_grad_mean": -0.31855690479278564,
"epsilon_dpo/beta_margin_grad_std": 0.17890101671218872,
"epsilon_dpo/beta_margin_mean": 0.9084843993186951,
"epsilon_dpo/beta_margin_std": 0.9611436128616333,
"epsilon_dpo/loss_margin_mean": 31.230024337768555,
"grad_norm": 17.022132873535156,
"kl/avg_steps": 0.75,
"kl/beta": 0.02943459339439869,
"kl/n_epsilon_steps": 0.125,
"kl/p_epsilon_steps": 0.875,
"learning_rate": 3.063665887884511e-07,
"logits/chosen": 0.022591251879930496,
"logits/rejected": -0.22565415501594543,
"logps/chosen": -93.83444213867188,
"logps/ref_chosen": -47.404109954833984,
"logps/ref_rejected": -73.4260025024414,
"logps/rejected": -151.08636474609375,
"loss": 0.8564,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.3588275909423828,
"rewards/margins": 0.9084843397140503,
"rewards/rejected": -2.2673120498657227,
"step": 322
},
{
"epoch": 0.48828420256991684,
"epsilon_dpo/beta": 0.029113350436091423,
"epsilon_dpo/beta_margin_grad_mean": -0.38590797781944275,
"epsilon_dpo/beta_margin_grad_std": 0.21527589857578278,
"epsilon_dpo/beta_margin_mean": 0.6405409574508667,
"epsilon_dpo/beta_margin_std": 1.216580867767334,
"epsilon_dpo/loss_margin_mean": 22.26471710205078,
"grad_norm": 18.42991065979004,
"kl/avg_steps": 0.359375,
"kl/beta": 0.029215477406978607,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 3.0507763319663517e-07,
"logits/chosen": -0.20102056860923767,
"logits/rejected": -0.3409814238548279,
"logps/chosen": -117.82904052734375,
"logps/ref_chosen": -70.00630187988281,
"logps/ref_rejected": -86.96690368652344,
"logps/rejected": -157.05435180664062,
"loss": 1.1328,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3946311473846436,
"rewards/margins": 0.6405409574508667,
"rewards/rejected": -2.0351719856262207,
"step": 323
},
{
"epoch": 0.4897959183673469,
"epsilon_dpo/beta": 0.028922712430357933,
"epsilon_dpo/beta_margin_grad_mean": -0.34377309679985046,
"epsilon_dpo/beta_margin_grad_std": 0.17404742538928986,
"epsilon_dpo/beta_margin_mean": 0.771388828754425,
"epsilon_dpo/beta_margin_std": 0.9316695332527161,
"epsilon_dpo/loss_margin_mean": 26.821897506713867,
"grad_norm": 19.947586059570312,
"kl/avg_steps": 0.65625,
"kl/beta": 0.0291108600795269,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 3.0378713696502097e-07,
"logits/chosen": -0.18514229357242584,
"logits/rejected": -0.20308475196361542,
"logps/chosen": -97.69032287597656,
"logps/ref_chosen": -55.88882064819336,
"logps/ref_rejected": -75.23088073730469,
"logps/rejected": -143.85427856445312,
"loss": 0.9311,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.2128124237060547,
"rewards/margins": 0.771388828754425,
"rewards/rejected": -1.984201192855835,
"step": 324
},
{
"epoch": 0.491307634164777,
"epsilon_dpo/beta": 0.02877933904528618,
"epsilon_dpo/beta_margin_grad_mean": -0.3556049168109894,
"epsilon_dpo/beta_margin_grad_std": 0.20257484912872314,
"epsilon_dpo/beta_margin_mean": 0.7505987882614136,
"epsilon_dpo/beta_margin_std": 1.1391657590866089,
"epsilon_dpo/loss_margin_mean": 26.31403923034668,
"grad_norm": 18.61254119873047,
"kl/avg_steps": 0.5,
"kl/beta": 0.028921065852046013,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.0249513619156206e-07,
"logits/chosen": -0.10339093953371048,
"logits/rejected": -0.14916850626468658,
"logps/chosen": -115.03617095947266,
"logps/ref_chosen": -64.14701843261719,
"logps/ref_rejected": -79.91143035888672,
"logps/rejected": -157.1146240234375,
"loss": 1.0236,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.4673625230789185,
"rewards/margins": 0.7505987882614136,
"rewards/rejected": -2.217961311340332,
"step": 325
},
{
"epoch": 0.4928193499622071,
"epsilon_dpo/beta": 0.0287350881844759,
"epsilon_dpo/beta_margin_grad_mean": -0.4206501841545105,
"epsilon_dpo/beta_margin_grad_std": 0.21349987387657166,
"epsilon_dpo/beta_margin_mean": 0.3843631148338318,
"epsilon_dpo/beta_margin_std": 1.0303096771240234,
"epsilon_dpo/loss_margin_mean": 13.668975830078125,
"grad_norm": 22.55351448059082,
"kl/avg_steps": 0.15625,
"kl/beta": 0.02877718023955822,
"kl/n_epsilon_steps": 0.421875,
"kl/p_epsilon_steps": 0.578125,
"learning_rate": 3.012016670162977e-07,
"logits/chosen": -0.1851939707994461,
"logits/rejected": -0.19780233502388,
"logps/chosen": -135.22189331054688,
"logps/ref_chosen": -75.53131103515625,
"logps/ref_rejected": -76.5898666381836,
"logps/rejected": -149.94943237304688,
"loss": 1.2747,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.7200180292129517,
"rewards/margins": 0.3843631148338318,
"rewards/rejected": -2.1043810844421387,
"step": 326
},
{
"epoch": 0.4943310657596372,
"epsilon_dpo/beta": 0.02862740308046341,
"epsilon_dpo/beta_margin_grad_mean": -0.3837631940841675,
"epsilon_dpo/beta_margin_grad_std": 0.21749018132686615,
"epsilon_dpo/beta_margin_mean": 0.6076876521110535,
"epsilon_dpo/beta_margin_std": 1.158327579498291,
"epsilon_dpo/loss_margin_mean": 21.49993896484375,
"grad_norm": 21.54072380065918,
"kl/avg_steps": 0.375,
"kl/beta": 0.028732286766171455,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.99906765620341e-07,
"logits/chosen": -0.08407838642597198,
"logits/rejected": -0.2020251303911209,
"logps/chosen": -124.540771484375,
"logps/ref_chosen": -69.337158203125,
"logps/ref_rejected": -73.37751770019531,
"logps/rejected": -150.08106994628906,
"loss": 1.1426,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5829294919967651,
"rewards/margins": 0.6076875925064087,
"rewards/rejected": -2.190617084503174,
"step": 327
},
{
"epoch": 0.4958427815570673,
"epsilon_dpo/beta": 0.02846677415072918,
"epsilon_dpo/beta_margin_grad_mean": -0.3519129753112793,
"epsilon_dpo/beta_margin_grad_std": 0.17671814560890198,
"epsilon_dpo/beta_margin_mean": 0.7486301064491272,
"epsilon_dpo/beta_margin_std": 0.9373253583908081,
"epsilon_dpo/loss_margin_mean": 26.463184356689453,
"grad_norm": 17.0780086517334,
"kl/avg_steps": 0.5625,
"kl/beta": 0.028624942526221275,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.9861046822486766e-07,
"logits/chosen": -0.12927740812301636,
"logits/rejected": -0.09088870882987976,
"logps/chosen": -107.67019653320312,
"logps/ref_chosen": -61.70623016357422,
"logps/ref_rejected": -83.73808288574219,
"logps/rejected": -156.16522216796875,
"loss": 0.9485,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.3091254234313965,
"rewards/margins": 0.7486301064491272,
"rewards/rejected": -2.057755470275879,
"step": 328
},
{
"epoch": 0.4973544973544973,
"epsilon_dpo/beta": 0.02834312804043293,
"epsilon_dpo/beta_margin_grad_mean": -0.3533529043197632,
"epsilon_dpo/beta_margin_grad_std": 0.1981075257062912,
"epsilon_dpo/beta_margin_mean": 0.7818435430526733,
"epsilon_dpo/beta_margin_std": 1.078006625175476,
"epsilon_dpo/loss_margin_mean": 27.817201614379883,
"grad_norm": 19.720523834228516,
"kl/avg_steps": 0.4375,
"kl/beta": 0.028464827686548233,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.9731281109010253e-07,
"logits/chosen": -0.07322809100151062,
"logits/rejected": -0.23954272270202637,
"logps/chosen": -118.35223388671875,
"logps/ref_chosen": -64.4984130859375,
"logps/ref_rejected": -83.6591796875,
"logps/rejected": -165.3302001953125,
"loss": 0.9788,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.5281262397766113,
"rewards/margins": 0.7818435430526733,
"rewards/rejected": -2.309969902038574,
"step": 329
},
{
"epoch": 0.4988662131519274,
"epsilon_dpo/beta": 0.028193095698952675,
"epsilon_dpo/beta_margin_grad_mean": -0.35263553261756897,
"epsilon_dpo/beta_margin_grad_std": 0.22436949610710144,
"epsilon_dpo/beta_margin_mean": 0.7705467343330383,
"epsilon_dpo/beta_margin_std": 1.224207878112793,
"epsilon_dpo/loss_margin_mean": 27.591312408447266,
"grad_norm": 18.109106063842773,
"kl/avg_steps": 0.53125,
"kl/beta": 0.028340836986899376,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.9601383051430505e-07,
"logits/chosen": -0.015815619379281998,
"logits/rejected": -0.18343853950500488,
"logps/chosen": -101.00935363769531,
"logps/ref_chosen": -54.80464172363281,
"logps/ref_rejected": -75.31942749023438,
"logps/rejected": -149.11544799804688,
"loss": 1.0588,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3063124418258667,
"rewards/margins": 0.7705467939376831,
"rewards/rejected": -2.07685923576355,
"step": 330
},
{
"epoch": 0.5003779289493575,
"epsilon_dpo/beta": 0.028026489540934563,
"epsilon_dpo/beta_margin_grad_mean": -0.30794641375541687,
"epsilon_dpo/beta_margin_grad_std": 0.2166793793439865,
"epsilon_dpo/beta_margin_mean": 1.048171877861023,
"epsilon_dpo/beta_margin_std": 1.2668702602386475,
"epsilon_dpo/loss_margin_mean": 37.6468391418457,
"grad_norm": 16.974321365356445,
"kl/avg_steps": 0.59375,
"kl/beta": 0.02819107100367546,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.947135628327544e-07,
"logits/chosen": -0.04081210866570473,
"logits/rejected": -0.07118724286556244,
"logps/chosen": -109.1402816772461,
"logps/ref_chosen": -59.242576599121094,
"logps/ref_rejected": -69.87483215332031,
"logps/rejected": -157.41937255859375,
"loss": 0.8909,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.4016971588134766,
"rewards/margins": 1.048171877861023,
"rewards/rejected": -2.449869155883789,
"step": 331
},
{
"epoch": 0.5018896447467877,
"epsilon_dpo/beta": 0.027861064299941063,
"epsilon_dpo/beta_margin_grad_mean": -0.34692201018333435,
"epsilon_dpo/beta_margin_grad_std": 0.18728528916835785,
"epsilon_dpo/beta_margin_mean": 0.8180192708969116,
"epsilon_dpo/beta_margin_std": 1.0675290822982788,
"epsilon_dpo/loss_margin_mean": 29.532556533813477,
"grad_norm": 18.015535354614258,
"kl/avg_steps": 0.59375,
"kl/beta": 0.02802467532455921,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.934120444167326e-07,
"logits/chosen": -0.11017828434705734,
"logits/rejected": -0.1367471069097519,
"logps/chosen": -119.58549499511719,
"logps/ref_chosen": -67.10975646972656,
"logps/ref_rejected": -77.11839294433594,
"logps/rejected": -159.12667846679688,
"loss": 0.9432,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.463294506072998,
"rewards/margins": 0.8180192708969116,
"rewards/rejected": -2.28131365776062,
"step": 332
},
{
"epoch": 0.5034013605442177,
"epsilon_dpo/beta": 0.02767920307815075,
"epsilon_dpo/beta_margin_grad_mean": -0.3167070150375366,
"epsilon_dpo/beta_margin_grad_std": 0.18079084157943726,
"epsilon_dpo/beta_margin_mean": 0.955777108669281,
"epsilon_dpo/beta_margin_std": 1.0292044878005981,
"epsilon_dpo/loss_margin_mean": 34.69361877441406,
"grad_norm": 14.943379402160645,
"kl/avg_steps": 0.65625,
"kl/beta": 0.027859261259436607,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 2.921093116725076e-07,
"logits/chosen": -0.00956184696406126,
"logits/rejected": -0.20060396194458008,
"logps/chosen": -113.64372253417969,
"logps/ref_chosen": -58.381126403808594,
"logps/ref_rejected": -85.02839660644531,
"logps/rejected": -174.98460388183594,
"loss": 0.8442,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.5324262380599976,
"rewards/margins": 0.9557771682739258,
"rewards/rejected": -2.488203525543213,
"step": 333
},
{
"epoch": 0.5049130763416477,
"epsilon_dpo/beta": 0.027576593682169914,
"epsilon_dpo/beta_margin_grad_mean": -0.3798303008079529,
"epsilon_dpo/beta_margin_grad_std": 0.22335129976272583,
"epsilon_dpo/beta_margin_mean": 0.6433828473091125,
"epsilon_dpo/beta_margin_std": 1.2143069505691528,
"epsilon_dpo/loss_margin_mean": 23.620746612548828,
"grad_norm": 21.258296966552734,
"kl/avg_steps": 0.375,
"kl/beta": 0.0276776272803545,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.9080540104031484e-07,
"logits/chosen": -0.12896177172660828,
"logits/rejected": -0.12486004084348679,
"logps/chosen": -121.84860229492188,
"logps/ref_chosen": -66.89199829101562,
"logps/ref_rejected": -91.83695220947266,
"logps/rejected": -170.414306640625,
"loss": 1.1403,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5188663005828857,
"rewards/margins": 0.6433828473091125,
"rewards/rejected": -2.1622490882873535,
"step": 334
},
{
"epoch": 0.5064247921390779,
"epsilon_dpo/beta": 0.027447713539004326,
"epsilon_dpo/beta_margin_grad_mean": -0.3620632588863373,
"epsilon_dpo/beta_margin_grad_std": 0.22339656949043274,
"epsilon_dpo/beta_margin_mean": 0.7682227492332458,
"epsilon_dpo/beta_margin_std": 1.228237271308899,
"epsilon_dpo/loss_margin_mean": 28.264835357666016,
"grad_norm": 23.812440872192383,
"kl/avg_steps": 0.46875,
"kl/beta": 0.027574222534894943,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.895003489933375e-07,
"logits/chosen": -0.03605717793107033,
"logits/rejected": -0.08819030225276947,
"logps/chosen": -117.11300659179688,
"logps/ref_chosen": -61.51445770263672,
"logps/ref_rejected": -75.68916320800781,
"logps/rejected": -159.55255126953125,
"loss": 1.0553,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.5294873714447021,
"rewards/margins": 0.7682227492332458,
"rewards/rejected": -2.2977099418640137,
"step": 335
},
{
"epoch": 0.5079365079365079,
"epsilon_dpo/beta": 0.027328230440616608,
"epsilon_dpo/beta_margin_grad_mean": -0.3672873079776764,
"epsilon_dpo/beta_margin_grad_std": 0.21067845821380615,
"epsilon_dpo/beta_margin_mean": 0.7602779865264893,
"epsilon_dpo/beta_margin_std": 1.2374534606933594,
"epsilon_dpo/loss_margin_mean": 28.086387634277344,
"grad_norm": 20.6002197265625,
"kl/avg_steps": 0.4375,
"kl/beta": 0.02744557149708271,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.8819419203668675e-07,
"logits/chosen": -0.143601655960083,
"logits/rejected": -0.14409543573856354,
"logps/chosen": -135.72813415527344,
"logps/ref_chosen": -68.85006713867188,
"logps/ref_rejected": -92.99603271484375,
"logps/rejected": -187.96047973632812,
"loss": 1.0467,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8305745124816895,
"rewards/margins": 0.7602779865264893,
"rewards/rejected": -2.590852737426758,
"step": 336
},
{
"epoch": 0.509448223733938,
"epsilon_dpo/beta": 0.027226271107792854,
"epsilon_dpo/beta_margin_grad_mean": -0.3840291500091553,
"epsilon_dpo/beta_margin_grad_std": 0.2105308622121811,
"epsilon_dpo/beta_margin_mean": 0.5929294228553772,
"epsilon_dpo/beta_margin_std": 1.099222183227539,
"epsilon_dpo/loss_margin_mean": 22.05531883239746,
"grad_norm": 19.38318634033203,
"kl/avg_steps": 0.375,
"kl/beta": 0.02732602134346962,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.8688696670638053e-07,
"logits/chosen": -0.13203132152557373,
"logits/rejected": -0.2529122531414032,
"logps/chosen": -140.16940307617188,
"logps/ref_chosen": -73.18783569335938,
"logps/ref_rejected": -86.89118957519531,
"logps/rejected": -175.92807006835938,
"loss": 1.1286,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.827904462814331,
"rewards/margins": 0.5929294228553772,
"rewards/rejected": -2.4208340644836426,
"step": 337
},
{
"epoch": 0.5109599395313681,
"epsilon_dpo/beta": 0.027090521529316902,
"epsilon_dpo/beta_margin_grad_mean": -0.3820020854473114,
"epsilon_dpo/beta_margin_grad_std": 0.21057164669036865,
"epsilon_dpo/beta_margin_mean": 0.6138112545013428,
"epsilon_dpo/beta_margin_std": 1.1201114654541016,
"epsilon_dpo/loss_margin_mean": 22.898605346679688,
"grad_norm": 21.866863250732422,
"kl/avg_steps": 0.5,
"kl/beta": 0.02722393162548542,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.8557870956832133e-07,
"logits/chosen": -0.0695226639509201,
"logits/rejected": -0.13510388135910034,
"logps/chosen": -131.22802734375,
"logps/ref_chosen": -63.939613342285156,
"logps/ref_rejected": -75.34243774414062,
"logps/rejected": -165.52944946289062,
"loss": 1.1218,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.825699806213379,
"rewards/margins": 0.6138112545013428,
"rewards/rejected": -2.439511299133301,
"step": 338
},
{
"epoch": 0.5124716553287982,
"epsilon_dpo/beta": 0.026947276666760445,
"epsilon_dpo/beta_margin_grad_mean": -0.35879501700401306,
"epsilon_dpo/beta_margin_grad_std": 0.19893522560596466,
"epsilon_dpo/beta_margin_mean": 0.7257330417633057,
"epsilon_dpo/beta_margin_std": 1.0469053983688354,
"epsilon_dpo/loss_margin_mean": 27.157066345214844,
"grad_norm": 19.962738037109375,
"kl/avg_steps": 0.53125,
"kl/beta": 0.02708848938345909,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.842694572172736e-07,
"logits/chosen": 0.16238835453987122,
"logits/rejected": -0.054334595799446106,
"logps/chosen": -101.20484161376953,
"logps/ref_chosen": -45.54913330078125,
"logps/ref_rejected": -67.0482177734375,
"logps/rejected": -149.86099243164062,
"loss": 1.0097,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.504643201828003,
"rewards/margins": 0.7257329821586609,
"rewards/rejected": -2.2303762435913086,
"step": 339
},
{
"epoch": 0.5139833711262283,
"epsilon_dpo/beta": 0.026872245594859123,
"epsilon_dpo/beta_margin_grad_mean": -0.36537620425224304,
"epsilon_dpo/beta_margin_grad_std": 0.23597703874111176,
"epsilon_dpo/beta_margin_mean": 0.7736579775810242,
"epsilon_dpo/beta_margin_std": 1.3091695308685303,
"epsilon_dpo/loss_margin_mean": 29.150344848632812,
"grad_norm": 20.396921157836914,
"kl/avg_steps": 0.28125,
"kl/beta": 0.026945341378450394,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 2.8295924627584004e-07,
"logits/chosen": 0.028462864458560944,
"logits/rejected": -0.02581612765789032,
"logps/chosen": -117.3265380859375,
"logps/ref_chosen": -54.00564956665039,
"logps/ref_rejected": -61.314430236816406,
"logps/rejected": -153.78567504882812,
"loss": 1.0917,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.7069659233093262,
"rewards/margins": 0.7736579179763794,
"rewards/rejected": -2.480623960494995,
"step": 340
},
{
"epoch": 0.5154950869236583,
"epsilon_dpo/beta": 0.02673809416592121,
"epsilon_dpo/beta_margin_grad_mean": -0.34740331768989563,
"epsilon_dpo/beta_margin_grad_std": 0.2228785902261734,
"epsilon_dpo/beta_margin_mean": 0.8176708817481995,
"epsilon_dpo/beta_margin_std": 1.1942744255065918,
"epsilon_dpo/loss_margin_mean": 30.86228370666504,
"grad_norm": 21.178123474121094,
"kl/avg_steps": 0.5,
"kl/beta": 0.026869770139455795,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.816481133934373e-07,
"logits/chosen": -0.05700066313147545,
"logits/rejected": -0.11208242177963257,
"logps/chosen": -125.3892822265625,
"logps/ref_chosen": -63.39509582519531,
"logps/ref_rejected": -76.20973205566406,
"logps/rejected": -169.06619262695312,
"loss": 1.0143,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6606955528259277,
"rewards/margins": 0.8176709413528442,
"rewards/rejected": -2.4783666133880615,
"step": 341
},
{
"epoch": 0.5170068027210885,
"epsilon_dpo/beta": 0.026605069637298584,
"epsilon_dpo/beta_margin_grad_mean": -0.33578258752822876,
"epsilon_dpo/beta_margin_grad_std": 0.20211291313171387,
"epsilon_dpo/beta_margin_mean": 0.9122076630592346,
"epsilon_dpo/beta_margin_std": 1.1753519773483276,
"epsilon_dpo/loss_margin_mean": 34.535587310791016,
"grad_norm": 16.719194412231445,
"kl/avg_steps": 0.5,
"kl/beta": 0.02673608995974064,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.8033609524527046e-07,
"logits/chosen": -0.002955283969640732,
"logits/rejected": -0.09421958774328232,
"logps/chosen": -113.54861450195312,
"logps/ref_chosen": -53.047813415527344,
"logps/ref_rejected": -68.2854232788086,
"logps/rejected": -163.32180786132812,
"loss": 0.9252,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.6117675304412842,
"rewards/margins": 0.9122077226638794,
"rewards/rejected": -2.523975372314453,
"step": 342
},
{
"epoch": 0.5185185185185185,
"epsilon_dpo/beta": 0.026497649028897285,
"epsilon_dpo/beta_margin_grad_mean": -0.3930060863494873,
"epsilon_dpo/beta_margin_grad_std": 0.19550399482250214,
"epsilon_dpo/beta_margin_mean": 0.5287180542945862,
"epsilon_dpo/beta_margin_std": 1.0360126495361328,
"epsilon_dpo/loss_margin_mean": 20.19304656982422,
"grad_norm": 18.811838150024414,
"kl/avg_steps": 0.40625,
"kl/beta": 0.026603074744343758,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.7902322853130753e-07,
"logits/chosen": -0.15556937456130981,
"logits/rejected": -0.12188497185707092,
"logps/chosen": -129.45437622070312,
"logps/ref_chosen": -70.57853698730469,
"logps/ref_rejected": -84.73873901367188,
"logps/rejected": -163.8076171875,
"loss": 1.1473,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.5627158880233765,
"rewards/margins": 0.5287179946899414,
"rewards/rejected": -2.0914340019226074,
"step": 343
},
{
"epoch": 0.5200302343159486,
"epsilon_dpo/beta": 0.026357313618063927,
"epsilon_dpo/beta_margin_grad_mean": -0.3436361253261566,
"epsilon_dpo/beta_margin_grad_std": 0.2023583948612213,
"epsilon_dpo/beta_margin_mean": 0.8134778141975403,
"epsilon_dpo/beta_margin_std": 1.0867211818695068,
"epsilon_dpo/loss_margin_mean": 31.103286743164062,
"grad_norm": 18.871267318725586,
"kl/avg_steps": 0.53125,
"kl/beta": 0.02649543620646,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.7770954997525274e-07,
"logits/chosen": 0.09255300462245941,
"logits/rejected": -0.08929741382598877,
"logps/chosen": -121.63165283203125,
"logps/ref_chosen": -55.811004638671875,
"logps/ref_rejected": -84.7763671875,
"logps/rejected": -181.70030212402344,
"loss": 0.9649,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7377839088439941,
"rewards/margins": 0.8134778738021851,
"rewards/rejected": -2.5512619018554688,
"step": 344
},
{
"epoch": 0.5215419501133787,
"epsilon_dpo/beta": 0.026209793984889984,
"epsilon_dpo/beta_margin_grad_mean": -0.3536202311515808,
"epsilon_dpo/beta_margin_grad_std": 0.213411346077919,
"epsilon_dpo/beta_margin_mean": 0.7738087177276611,
"epsilon_dpo/beta_margin_std": 1.1662368774414062,
"epsilon_dpo/loss_margin_mean": 29.77425765991211,
"grad_norm": 18.995004653930664,
"kl/avg_steps": 0.5625,
"kl/beta": 0.026355423033237457,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.7639509632351927e-07,
"logits/chosen": -0.017737407237291336,
"logits/rejected": -0.11551772058010101,
"logps/chosen": -108.07044219970703,
"logps/ref_chosen": -57.786094665527344,
"logps/ref_rejected": -78.91847229003906,
"logps/rejected": -158.97708129882812,
"loss": 1.0245,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3206021785736084,
"rewards/margins": 0.7738087177276611,
"rewards/rejected": -2.0944108963012695,
"step": 345
},
{
"epoch": 0.5230536659108088,
"epsilon_dpo/beta": 0.02607138082385063,
"epsilon_dpo/beta_margin_grad_mean": -0.33412283658981323,
"epsilon_dpo/beta_margin_grad_std": 0.22004300355911255,
"epsilon_dpo/beta_margin_mean": 0.8977053165435791,
"epsilon_dpo/beta_margin_std": 1.2167853116989136,
"epsilon_dpo/loss_margin_mean": 34.712406158447266,
"grad_norm": 18.345523834228516,
"kl/avg_steps": 0.53125,
"kl/beta": 0.026208003982901573,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.7507990434420123e-07,
"logits/chosen": 0.05169302597641945,
"logits/rejected": -0.16830721497535706,
"logps/chosen": -111.11968994140625,
"logps/ref_chosen": -56.285125732421875,
"logps/ref_rejected": -91.15303039550781,
"logps/rejected": -180.70001220703125,
"loss": 0.9684,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4319815635681152,
"rewards/margins": 0.8977053165435791,
"rewards/rejected": -2.3296866416931152,
"step": 346
},
{
"epoch": 0.5245653817082389,
"epsilon_dpo/beta": 0.025949902832508087,
"epsilon_dpo/beta_margin_grad_mean": -0.3737473785877228,
"epsilon_dpo/beta_margin_grad_std": 0.2025209665298462,
"epsilon_dpo/beta_margin_mean": 0.6699243187904358,
"epsilon_dpo/beta_margin_std": 1.0823490619659424,
"epsilon_dpo/loss_margin_mean": 26.06598663330078,
"grad_norm": 22.090608596801758,
"kl/avg_steps": 0.46875,
"kl/beta": 0.026069508865475655,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.737640108260456e-07,
"logits/chosen": 0.059030018746852875,
"logits/rejected": -0.10736814886331558,
"logps/chosen": -113.51923370361328,
"logps/ref_chosen": -53.499542236328125,
"logps/ref_rejected": -72.52565002441406,
"logps/rejected": -158.611328125,
"loss": 1.06,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.5608642101287842,
"rewards/margins": 0.6699243187904358,
"rewards/rejected": -2.230788469314575,
"step": 347
},
{
"epoch": 0.5260770975056689,
"epsilon_dpo/beta": 0.025869376957416534,
"epsilon_dpo/beta_margin_grad_mean": -0.3531711995601654,
"epsilon_dpo/beta_margin_grad_std": 0.21784502267837524,
"epsilon_dpo/beta_margin_mean": 0.8533730506896973,
"epsilon_dpo/beta_margin_std": 1.2607651948928833,
"epsilon_dpo/loss_margin_mean": 33.320106506347656,
"grad_norm": 19.78038787841797,
"kl/avg_steps": 0.3125,
"kl/beta": 0.025947878137230873,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 2.724474525774229e-07,
"logits/chosen": -0.012447573244571686,
"logits/rejected": -0.04512844607234001,
"logps/chosen": -103.84272003173828,
"logps/ref_chosen": -50.78684997558594,
"logps/ref_rejected": -68.63732147216797,
"logps/rejected": -155.01329040527344,
"loss": 1.0015,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.3761167526245117,
"rewards/margins": 0.8533730506896973,
"rewards/rejected": -2.229489803314209,
"step": 348
},
{
"epoch": 0.527588813303099,
"epsilon_dpo/beta": 0.025748364627361298,
"epsilon_dpo/beta_margin_grad_mean": -0.3487337827682495,
"epsilon_dpo/beta_margin_grad_std": 0.1987219899892807,
"epsilon_dpo/beta_margin_mean": 0.8085266351699829,
"epsilon_dpo/beta_margin_std": 1.120718002319336,
"epsilon_dpo/loss_margin_mean": 31.64775276184082,
"grad_norm": 17.94879913330078,
"kl/avg_steps": 0.46875,
"kl/beta": 0.025867043063044548,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.711302664252973e-07,
"logits/chosen": 0.04031102731823921,
"logits/rejected": -0.18329352140426636,
"logps/chosen": -106.57583618164062,
"logps/ref_chosen": -53.32501220703125,
"logps/ref_rejected": -83.21235656738281,
"logps/rejected": -168.11093139648438,
"loss": 0.9751,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.3722143173217773,
"rewards/margins": 0.8085266351699829,
"rewards/rejected": -2.18074107170105,
"step": 349
},
{
"epoch": 0.5291005291005291,
"epsilon_dpo/beta": 0.025588002055883408,
"epsilon_dpo/beta_margin_grad_mean": -0.31332096457481384,
"epsilon_dpo/beta_margin_grad_std": 0.20124055445194244,
"epsilon_dpo/beta_margin_mean": 0.9967000484466553,
"epsilon_dpo/beta_margin_std": 1.1194626092910767,
"epsilon_dpo/loss_margin_mean": 39.17788314819336,
"grad_norm": 21.083372116088867,
"kl/avg_steps": 0.625,
"kl/beta": 0.025746358558535576,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 2.698124892141971e-07,
"logits/chosen": -0.07673121988773346,
"logits/rejected": -0.25874873995780945,
"logps/chosen": -119.18470764160156,
"logps/ref_chosen": -61.62577438354492,
"logps/ref_rejected": -87.63627624511719,
"logps/rejected": -184.37307739257812,
"loss": 0.8604,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.475480556488037,
"rewards/margins": 0.9967001080513,
"rewards/rejected": -2.4721806049346924,
"step": 350
},
{
"epoch": 0.5306122448979592,
"epsilon_dpo/beta": 0.025445064529776573,
"epsilon_dpo/beta_margin_grad_mean": -0.33756786584854126,
"epsilon_dpo/beta_margin_grad_std": 0.22831334173679352,
"epsilon_dpo/beta_margin_mean": 0.8140925765037537,
"epsilon_dpo/beta_margin_std": 1.2352745532989502,
"epsilon_dpo/loss_margin_mean": 32.29942321777344,
"grad_norm": 16.99067497253418,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0255864430218935,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.6849415780518357e-07,
"logits/chosen": -0.00764252245426178,
"logits/rejected": -0.1301419734954834,
"logps/chosen": -109.30411529541016,
"logps/ref_chosen": -56.2563362121582,
"logps/ref_rejected": -79.11589813232422,
"logps/rejected": -164.46310424804688,
"loss": 1.042,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.353240966796875,
"rewards/margins": 0.8140926361083984,
"rewards/rejected": -2.1673336029052734,
"step": 351
},
{
"epoch": 0.5321239606953893,
"epsilon_dpo/beta": 0.025310687720775604,
"epsilon_dpo/beta_margin_grad_mean": -0.355891615152359,
"epsilon_dpo/beta_margin_grad_std": 0.19265837967395782,
"epsilon_dpo/beta_margin_mean": 0.7581871747970581,
"epsilon_dpo/beta_margin_std": 1.0667455196380615,
"epsilon_dpo/loss_margin_mean": 30.180667877197266,
"grad_norm": 18.76468276977539,
"kl/avg_steps": 0.53125,
"kl/beta": 0.02544332481920719,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.6717530907482024e-07,
"logits/chosen": -0.13559405505657196,
"logits/rejected": -0.24246063828468323,
"logps/chosen": -114.82362365722656,
"logps/ref_chosen": -63.05195617675781,
"logps/ref_rejected": -85.52035522460938,
"logps/rejected": -167.47268676757812,
"loss": 0.9863,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.312227725982666,
"rewards/margins": 0.7581871747970581,
"rewards/rejected": -2.0704147815704346,
"step": 352
},
{
"epoch": 0.5336356764928194,
"epsilon_dpo/beta": 0.02517693303525448,
"epsilon_dpo/beta_margin_grad_mean": -0.33823856711387634,
"epsilon_dpo/beta_margin_grad_std": 0.2085641473531723,
"epsilon_dpo/beta_margin_mean": 0.8513762354850769,
"epsilon_dpo/beta_margin_std": 1.168091058731079,
"epsilon_dpo/loss_margin_mean": 34.07402420043945,
"grad_norm": 16.548805236816406,
"kl/avg_steps": 0.53125,
"kl/beta": 0.025308869779109955,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.658559799141411e-07,
"logits/chosen": -0.12417947500944138,
"logits/rejected": -0.05545546114444733,
"logps/chosen": -117.10000610351562,
"logps/ref_chosen": -69.00918579101562,
"logps/ref_rejected": -72.65840148925781,
"logps/rejected": -154.8232421875,
"loss": 0.9738,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2114300727844238,
"rewards/margins": 0.8513762354850769,
"rewards/rejected": -2.0628063678741455,
"step": 353
},
{
"epoch": 0.5351473922902494,
"epsilon_dpo/beta": 0.02504388988018036,
"epsilon_dpo/beta_margin_grad_mean": -0.33561262488365173,
"epsilon_dpo/beta_margin_grad_std": 0.2188667356967926,
"epsilon_dpo/beta_margin_mean": 0.8719602227210999,
"epsilon_dpo/beta_margin_std": 1.1586198806762695,
"epsilon_dpo/loss_margin_mean": 35.108421325683594,
"grad_norm": 19.405967712402344,
"kl/avg_steps": 0.53125,
"kl/beta": 0.025175128132104874,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.6453620722761895e-07,
"logits/chosen": 0.16506893932819366,
"logits/rejected": -0.15294471383094788,
"logps/chosen": -90.18260955810547,
"logps/ref_chosen": -39.78833770751953,
"logps/ref_rejected": -69.56885528564453,
"logps/rejected": -155.07154846191406,
"loss": 0.9639,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.265305519104004,
"rewards/margins": 0.8719602227210999,
"rewards/rejected": -2.137265682220459,
"step": 354
},
{
"epoch": 0.5366591080876795,
"epsilon_dpo/beta": 0.02491154707968235,
"epsilon_dpo/beta_margin_grad_mean": -0.3334660828113556,
"epsilon_dpo/beta_margin_grad_std": 0.2218456268310547,
"epsilon_dpo/beta_margin_mean": 0.8903090357780457,
"epsilon_dpo/beta_margin_std": 1.1928634643554688,
"epsilon_dpo/loss_margin_mean": 36.034576416015625,
"grad_norm": 20.067537307739258,
"kl/avg_steps": 0.53125,
"kl/beta": 0.02504209242761135,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.632160279321328e-07,
"logits/chosen": 0.019428424537181854,
"logits/rejected": -0.26601749658584595,
"logps/chosen": -101.98774719238281,
"logps/ref_chosen": -46.25537872314453,
"logps/ref_rejected": -78.20236206054688,
"logps/rejected": -169.96929931640625,
"loss": 0.9668,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.391890287399292,
"rewards/margins": 0.8903090357780457,
"rewards/rejected": -2.2821993827819824,
"step": 355
},
{
"epoch": 0.5381708238851096,
"epsilon_dpo/beta": 0.024795474484562874,
"epsilon_dpo/beta_margin_grad_mean": -0.3692854642868042,
"epsilon_dpo/beta_margin_grad_std": 0.24712832272052765,
"epsilon_dpo/beta_margin_mean": 0.6950345635414124,
"epsilon_dpo/beta_margin_std": 1.3489798307418823,
"epsilon_dpo/loss_margin_mean": 28.390382766723633,
"grad_norm": 18.33978271484375,
"kl/avg_steps": 0.46875,
"kl/beta": 0.02490975894033909,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.618954789559356e-07,
"logits/chosen": 0.08699241280555725,
"logits/rejected": -0.14003297686576843,
"logps/chosen": -97.93515014648438,
"logps/ref_chosen": -47.906158447265625,
"logps/ref_rejected": -74.29397583007812,
"logps/rejected": -152.71334838867188,
"loss": 1.1782,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2455909252166748,
"rewards/margins": 0.6950346231460571,
"rewards/rejected": -1.940625548362732,
"step": 356
},
{
"epoch": 0.5396825396825397,
"epsilon_dpo/beta": 0.02467978745698929,
"epsilon_dpo/beta_margin_grad_mean": -0.3811408579349518,
"epsilon_dpo/beta_margin_grad_std": 0.2140098512172699,
"epsilon_dpo/beta_margin_mean": 0.5947501063346863,
"epsilon_dpo/beta_margin_std": 1.108279824256897,
"epsilon_dpo/loss_margin_mean": 24.386058807373047,
"grad_norm": 18.42845344543457,
"kl/avg_steps": 0.46875,
"kl/beta": 0.024793539196252823,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.6057459723762076e-07,
"logits/chosen": -0.1337180733680725,
"logits/rejected": -0.21349212527275085,
"logps/chosen": -121.88738250732422,
"logps/ref_chosen": -62.63499450683594,
"logps/ref_rejected": -65.11400604248047,
"logps/rejected": -148.75244140625,
"loss": 1.1361,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.4662628173828125,
"rewards/margins": 0.5947501063346863,
"rewards/rejected": -2.0610129833221436,
"step": 357
},
{
"epoch": 0.5411942554799698,
"epsilon_dpo/beta": 0.024549216032028198,
"epsilon_dpo/beta_margin_grad_mean": -0.3436722755432129,
"epsilon_dpo/beta_margin_grad_std": 0.20899824798107147,
"epsilon_dpo/beta_margin_mean": 0.8488820791244507,
"epsilon_dpo/beta_margin_std": 1.1528469324111938,
"epsilon_dpo/loss_margin_mean": 34.84442901611328,
"grad_norm": 22.171615600585938,
"kl/avg_steps": 0.53125,
"kl/beta": 0.024677861481904984,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.5925341972508954e-07,
"logits/chosen": -0.142683744430542,
"logits/rejected": -0.04371079057455063,
"logps/chosen": -122.4355697631836,
"logps/ref_chosen": -67.20960998535156,
"logps/ref_rejected": -69.34715270996094,
"logps/rejected": -159.41754150390625,
"loss": 0.966,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.3573343753814697,
"rewards/margins": 0.8488820791244507,
"rewards/rejected": -2.206216335296631,
"step": 358
},
{
"epoch": 0.5427059712773998,
"epsilon_dpo/beta": 0.024465516209602356,
"epsilon_dpo/beta_margin_grad_mean": -0.41350895166397095,
"epsilon_dpo/beta_margin_grad_std": 0.202595055103302,
"epsilon_dpo/beta_margin_mean": 0.43923673033714294,
"epsilon_dpo/beta_margin_std": 1.0158089399337769,
"epsilon_dpo/loss_margin_mean": 18.245548248291016,
"grad_norm": 21.56581687927246,
"kl/avg_steps": 0.34375,
"kl/beta": 0.024547452107071877,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 2.579319833745169e-07,
"logits/chosen": -0.08289580792188644,
"logits/rejected": -0.18635423481464386,
"logps/chosen": -123.06568145751953,
"logps/ref_chosen": -62.52578353881836,
"logps/ref_rejected": -76.63114929199219,
"logps/rejected": -155.41659545898438,
"loss": 1.2164,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.4838988780975342,
"rewards/margins": 0.43923673033714294,
"rewards/rejected": -1.9231356382369995,
"step": 359
},
{
"epoch": 0.54421768707483,
"epsilon_dpo/beta": 0.024351121857762337,
"epsilon_dpo/beta_margin_grad_mean": -0.36273613572120667,
"epsilon_dpo/beta_margin_grad_std": 0.1914779394865036,
"epsilon_dpo/beta_margin_mean": 0.712982177734375,
"epsilon_dpo/beta_margin_std": 1.0266094207763672,
"epsilon_dpo/loss_margin_mean": 29.523704528808594,
"grad_norm": 18.596818923950195,
"kl/avg_steps": 0.46875,
"kl/beta": 0.024463359266519547,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.5661032514931834e-07,
"logits/chosen": -0.07461874932050705,
"logits/rejected": -0.2994999289512634,
"logps/chosen": -122.07127380371094,
"logps/ref_chosen": -63.48772048950195,
"logps/ref_rejected": -90.6891098022461,
"logps/rejected": -178.79637145996094,
"loss": 1.0065,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4296650886535645,
"rewards/margins": 0.712982177734375,
"rewards/rejected": -2.1426472663879395,
"step": 360
},
{
"epoch": 0.54572940287226,
"epsilon_dpo/beta": 0.024207070469856262,
"epsilon_dpo/beta_margin_grad_mean": -0.33131957054138184,
"epsilon_dpo/beta_margin_grad_std": 0.1978544145822525,
"epsilon_dpo/beta_margin_mean": 0.8593066930770874,
"epsilon_dpo/beta_margin_std": 1.042729377746582,
"epsilon_dpo/loss_margin_mean": 35.735504150390625,
"grad_norm": 16.4859676361084,
"kl/avg_steps": 0.59375,
"kl/beta": 0.02434922382235527,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.552884820191154e-07,
"logits/chosen": -0.03401462361216545,
"logits/rejected": -0.12106764316558838,
"logps/chosen": -114.06637573242188,
"logps/ref_chosen": -57.917144775390625,
"logps/ref_rejected": -72.39089965820312,
"logps/rejected": -164.275634765625,
"loss": 0.9213,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.3623058795928955,
"rewards/margins": 0.8593066930770874,
"rewards/rejected": -2.2216124534606934,
"step": 361
},
{
"epoch": 0.54724111866969,
"epsilon_dpo/beta": 0.02407175302505493,
"epsilon_dpo/beta_margin_grad_mean": -0.3352404832839966,
"epsilon_dpo/beta_margin_grad_std": 0.19922372698783875,
"epsilon_dpo/beta_margin_mean": 0.8902791142463684,
"epsilon_dpo/beta_margin_std": 1.1847198009490967,
"epsilon_dpo/loss_margin_mean": 37.23998260498047,
"grad_norm": 17.215599060058594,
"kl/avg_steps": 0.5625,
"kl/beta": 0.02420550212264061,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.53966490958702e-07,
"logits/chosen": -0.16901442408561707,
"logits/rejected": -0.30185365676879883,
"logps/chosen": -118.09026336669922,
"logps/ref_chosen": -63.4434700012207,
"logps/ref_rejected": -103.45516967773438,
"logps/rejected": -195.34194946289062,
"loss": 0.9329,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.316558837890625,
"rewards/margins": 0.8902791738510132,
"rewards/rejected": -2.2068378925323486,
"step": 362
},
{
"epoch": 0.5487528344671202,
"epsilon_dpo/beta": 0.023907016962766647,
"epsilon_dpo/beta_margin_grad_mean": -0.33447110652923584,
"epsilon_dpo/beta_margin_grad_std": 0.18823032081127167,
"epsilon_dpo/beta_margin_mean": 0.8569877743721008,
"epsilon_dpo/beta_margin_std": 1.0583666563034058,
"epsilon_dpo/loss_margin_mean": 36.04021072387695,
"grad_norm": 15.949338912963867,
"kl/avg_steps": 0.6875,
"kl/beta": 0.02407010830938816,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 2.526443889470099e-07,
"logits/chosen": 0.08449310064315796,
"logits/rejected": -0.2509921193122864,
"logps/chosen": -107.6585464477539,
"logps/ref_chosen": -48.65182876586914,
"logps/ref_rejected": -88.65904235839844,
"logps/rejected": -183.70596313476562,
"loss": 0.919,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.411900520324707,
"rewards/margins": 0.8569878339767456,
"rewards/rejected": -2.268888473510742,
"step": 363
},
{
"epoch": 0.5502645502645502,
"epsilon_dpo/beta": 0.023811019957065582,
"epsilon_dpo/beta_margin_grad_mean": -0.3396543562412262,
"epsilon_dpo/beta_margin_grad_std": 0.22338302433490753,
"epsilon_dpo/beta_margin_mean": 0.884149968624115,
"epsilon_dpo/beta_margin_std": 1.2586935758590698,
"epsilon_dpo/loss_margin_mean": 37.478702545166016,
"grad_norm": 17.371923446655273,
"kl/avg_steps": 0.40625,
"kl/beta": 0.023905755952000618,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.513222129660744e-07,
"logits/chosen": -0.04727925360202789,
"logits/rejected": -0.27311673760414124,
"logps/chosen": -111.36739349365234,
"logps/ref_chosen": -57.87107467651367,
"logps/ref_rejected": -80.95502471923828,
"logps/rejected": -171.9300537109375,
"loss": 0.991,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2763241529464722,
"rewards/margins": 0.884149968624115,
"rewards/rejected": -2.1604743003845215,
"step": 364
},
{
"epoch": 0.5517762660619804,
"epsilon_dpo/beta": 0.023670032620429993,
"epsilon_dpo/beta_margin_grad_mean": -0.33620965480804443,
"epsilon_dpo/beta_margin_grad_std": 0.17516224086284637,
"epsilon_dpo/beta_margin_mean": 0.8213182091712952,
"epsilon_dpo/beta_margin_std": 0.9406766891479492,
"epsilon_dpo/loss_margin_mean": 34.89070510864258,
"grad_norm": 16.950355529785156,
"kl/avg_steps": 0.59375,
"kl/beta": 0.023809032514691353,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.5e-07,
"logits/chosen": -0.11987531930208206,
"logits/rejected": -0.08955763280391693,
"logps/chosen": -112.06727600097656,
"logps/ref_chosen": -64.94217681884766,
"logps/ref_rejected": -74.8599853515625,
"logps/rejected": -156.87579345703125,
"loss": 0.9005,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.117061734199524,
"rewards/margins": 0.8213182091712952,
"rewards/rejected": -1.9383800029754639,
"step": 365
},
{
"epoch": 0.5532879818594104,
"epsilon_dpo/beta": 0.02357470616698265,
"epsilon_dpo/beta_margin_grad_mean": -0.36424005031585693,
"epsilon_dpo/beta_margin_grad_std": 0.2140674591064453,
"epsilon_dpo/beta_margin_mean": 0.7208508849143982,
"epsilon_dpo/beta_margin_std": 1.128444790840149,
"epsilon_dpo/loss_margin_mean": 30.900943756103516,
"grad_norm": 17.26808738708496,
"kl/avg_steps": 0.40625,
"kl/beta": 0.023668501526117325,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.486777870339255e-07,
"logits/chosen": -0.054251138120889664,
"logits/rejected": -0.1293611079454422,
"logps/chosen": -99.60520935058594,
"logps/ref_chosen": -55.165985107421875,
"logps/ref_rejected": -65.2612075805664,
"logps/rejected": -140.60137939453125,
"loss": 1.049,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.049750566482544,
"rewards/margins": 0.7208508849143982,
"rewards/rejected": -1.770601511001587,
"step": 366
},
{
"epoch": 0.5547996976568406,
"epsilon_dpo/beta": 0.023427749052643776,
"epsilon_dpo/beta_margin_grad_mean": -0.36557790637016296,
"epsilon_dpo/beta_margin_grad_std": 0.19309480488300323,
"epsilon_dpo/beta_margin_mean": 0.6505571007728577,
"epsilon_dpo/beta_margin_std": 0.994044840335846,
"epsilon_dpo/loss_margin_mean": 27.995878219604492,
"grad_norm": 17.34241485595703,
"kl/avg_steps": 0.625,
"kl/beta": 0.023572735488414764,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 2.4735561105299014e-07,
"logits/chosen": -0.0672166645526886,
"logits/rejected": -0.26583585143089294,
"logps/chosen": -110.31550598144531,
"logps/ref_chosen": -56.010467529296875,
"logps/ref_rejected": -77.31010437011719,
"logps/rejected": -159.61102294921875,
"loss": 1.0466,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2740150690078735,
"rewards/margins": 0.6505571007728577,
"rewards/rejected": -1.924572229385376,
"step": 367
},
{
"epoch": 0.5563114134542706,
"epsilon_dpo/beta": 0.023326164111495018,
"epsilon_dpo/beta_margin_grad_mean": -0.3718603849411011,
"epsilon_dpo/beta_margin_grad_std": 0.20834578573703766,
"epsilon_dpo/beta_margin_mean": 0.627814769744873,
"epsilon_dpo/beta_margin_std": 1.0270978212356567,
"epsilon_dpo/loss_margin_mean": 27.22341537475586,
"grad_norm": 16.82701873779297,
"kl/avg_steps": 0.4375,
"kl/beta": 0.023426322266459465,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.46033509041298e-07,
"logits/chosen": -0.21781033277511597,
"logits/rejected": -0.06027165800333023,
"logps/chosen": -134.59854125976562,
"logps/ref_chosen": -74.82928466796875,
"logps/ref_rejected": -76.11680603027344,
"logps/rejected": -163.10946655273438,
"loss": 1.0821,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.3972914218902588,
"rewards/margins": 0.627814769744873,
"rewards/rejected": -2.025106430053711,
"step": 368
},
{
"epoch": 0.5578231292517006,
"epsilon_dpo/beta": 0.02323545515537262,
"epsilon_dpo/beta_margin_grad_mean": -0.3783518373966217,
"epsilon_dpo/beta_margin_grad_std": 0.197055846452713,
"epsilon_dpo/beta_margin_mean": 0.6090226173400879,
"epsilon_dpo/beta_margin_std": 1.0001606941223145,
"epsilon_dpo/loss_margin_mean": 26.504749298095703,
"grad_norm": 16.288339614868164,
"kl/avg_steps": 0.390625,
"kl/beta": 0.023324277251958847,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.447115179808846e-07,
"logits/chosen": -0.06415815651416779,
"logits/rejected": -0.25171515345573425,
"logps/chosen": -114.86565399169922,
"logps/ref_chosen": -58.32621765136719,
"logps/ref_rejected": -80.92184448242188,
"logps/rejected": -163.96603393554688,
"loss": 1.0775,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3156427145004272,
"rewards/margins": 0.6090226173400879,
"rewards/rejected": -1.9246652126312256,
"step": 369
},
{
"epoch": 0.5593348450491308,
"epsilon_dpo/beta": 0.023105142638087273,
"epsilon_dpo/beta_margin_grad_mean": -0.3424655497074127,
"epsilon_dpo/beta_margin_grad_std": 0.21819935739040375,
"epsilon_dpo/beta_margin_mean": 0.825592041015625,
"epsilon_dpo/beta_margin_std": 1.2170312404632568,
"epsilon_dpo/loss_margin_mean": 36.02720260620117,
"grad_norm": 16.507875442504883,
"kl/avg_steps": 0.5625,
"kl/beta": 0.023233521729707718,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.4338967485068164e-07,
"logits/chosen": 0.039311882108449936,
"logits/rejected": -0.12402039766311646,
"logps/chosen": -106.84638977050781,
"logps/ref_chosen": -52.88372039794922,
"logps/ref_rejected": -79.43692016601562,
"logps/rejected": -169.42678833007812,
"loss": 1.0153,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2500584125518799,
"rewards/margins": 0.825592041015625,
"rewards/rejected": -2.075650453567505,
"step": 370
},
{
"epoch": 0.5608465608465608,
"epsilon_dpo/beta": 0.022997567430138588,
"epsilon_dpo/beta_margin_grad_mean": -0.34857356548309326,
"epsilon_dpo/beta_margin_grad_std": 0.20019613206386566,
"epsilon_dpo/beta_margin_mean": 0.7864258885383606,
"epsilon_dpo/beta_margin_std": 1.0434191226959229,
"epsilon_dpo/loss_margin_mean": 34.48577880859375,
"grad_norm": 15.30681037902832,
"kl/avg_steps": 0.46875,
"kl/beta": 0.023103564977645874,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.420680166254831e-07,
"logits/chosen": 0.05145767703652382,
"logits/rejected": -0.06251757591962814,
"logps/chosen": -100.85081481933594,
"logps/ref_chosen": -49.224212646484375,
"logps/ref_rejected": -63.348472595214844,
"logps/rejected": -149.4608612060547,
"loss": 0.9679,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.1888883113861084,
"rewards/margins": 0.7864259481430054,
"rewards/rejected": -1.9753141403198242,
"step": 371
},
{
"epoch": 0.562358276643991,
"epsilon_dpo/beta": 0.022904641926288605,
"epsilon_dpo/beta_margin_grad_mean": -0.40844154357910156,
"epsilon_dpo/beta_margin_grad_std": 0.2289167046546936,
"epsilon_dpo/beta_margin_mean": 0.46895912289619446,
"epsilon_dpo/beta_margin_std": 1.2005647420883179,
"epsilon_dpo/loss_margin_mean": 20.829545974731445,
"grad_norm": 20.411462783813477,
"kl/avg_steps": 0.40625,
"kl/beta": 0.02299577184021473,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.4074658027491044e-07,
"logits/chosen": 0.03478096425533295,
"logits/rejected": -0.18025296926498413,
"logps/chosen": -109.72450256347656,
"logps/ref_chosen": -52.26955032348633,
"logps/ref_rejected": -72.99522399902344,
"logps/rejected": -151.2797088623047,
"loss": 1.2751,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.3205981254577637,
"rewards/margins": 0.46895912289619446,
"rewards/rejected": -1.7895572185516357,
"step": 372
},
{
"epoch": 0.563869992441421,
"epsilon_dpo/beta": 0.022833440452814102,
"epsilon_dpo/beta_margin_grad_mean": -0.3846432566642761,
"epsilon_dpo/beta_margin_grad_std": 0.2319236695766449,
"epsilon_dpo/beta_margin_mean": 0.6094071269035339,
"epsilon_dpo/beta_margin_std": 1.1913042068481445,
"epsilon_dpo/loss_margin_mean": 27.08026123046875,
"grad_norm": 23.150869369506836,
"kl/avg_steps": 0.3125,
"kl/beta": 0.022902728989720345,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 2.394254027623792e-07,
"logits/chosen": -0.20109140872955322,
"logits/rejected": -0.2932276725769043,
"logps/chosen": -124.26031494140625,
"logps/ref_chosen": -61.112998962402344,
"logps/ref_rejected": -76.24851989746094,
"logps/rejected": -166.47610473632812,
"loss": 1.1647,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4463003873825073,
"rewards/margins": 0.6094071865081787,
"rewards/rejected": -2.0557074546813965,
"step": 373
},
{
"epoch": 0.5653817082388511,
"epsilon_dpo/beta": 0.022662414237856865,
"epsilon_dpo/beta_margin_grad_mean": -0.31355804204940796,
"epsilon_dpo/beta_margin_grad_std": 0.182713583111763,
"epsilon_dpo/beta_margin_mean": 0.9390335083007812,
"epsilon_dpo/beta_margin_std": 0.9844923615455627,
"epsilon_dpo/loss_margin_mean": 41.61290740966797,
"grad_norm": 24.740949630737305,
"kl/avg_steps": 0.75,
"kl/beta": 0.022831382229924202,
"kl/n_epsilon_steps": 0.125,
"kl/p_epsilon_steps": 0.875,
"learning_rate": 2.381045210440644e-07,
"logits/chosen": -0.2289101779460907,
"logits/rejected": -0.1994692087173462,
"logps/chosen": -128.3408660888672,
"logps/ref_chosen": -72.66920471191406,
"logps/ref_rejected": -76.83158874511719,
"logps/rejected": -174.1161651611328,
"loss": 0.846,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2642393112182617,
"rewards/margins": 0.9390335083007812,
"rewards/rejected": -2.203272819519043,
"step": 374
},
{
"epoch": 0.5668934240362812,
"epsilon_dpo/beta": 0.022571615874767303,
"epsilon_dpo/beta_margin_grad_mean": -0.3776033818721771,
"epsilon_dpo/beta_margin_grad_std": 0.2066648155450821,
"epsilon_dpo/beta_margin_mean": 0.6234402656555176,
"epsilon_dpo/beta_margin_std": 1.056839942932129,
"epsilon_dpo/loss_margin_mean": 27.937467575073242,
"grad_norm": 19.269590377807617,
"kl/avg_steps": 0.40625,
"kl/beta": 0.022661421447992325,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.3678397206786715e-07,
"logits/chosen": -0.060030847787857056,
"logits/rejected": -0.2464950829744339,
"logps/chosen": -111.95555114746094,
"logps/ref_chosen": -57.68330383300781,
"logps/ref_rejected": -79.34097290039062,
"logps/rejected": -161.55068969726562,
"loss": 1.0916,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2278056144714355,
"rewards/margins": 0.6234402656555176,
"rewards/rejected": -1.8512458801269531,
"step": 375
},
{
"epoch": 0.5684051398337112,
"epsilon_dpo/beta": 0.022445019334554672,
"epsilon_dpo/beta_margin_grad_mean": -0.3363484740257263,
"epsilon_dpo/beta_margin_grad_std": 0.21801850199699402,
"epsilon_dpo/beta_margin_mean": 0.9019449353218079,
"epsilon_dpo/beta_margin_std": 1.2155145406723022,
"epsilon_dpo/loss_margin_mean": 40.487979888916016,
"grad_norm": 16.357463836669922,
"kl/avg_steps": 0.5625,
"kl/beta": 0.022569730877876282,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.3546379277238103e-07,
"logits/chosen": 0.05979081615805626,
"logits/rejected": -0.13624578714370728,
"logps/chosen": -110.30464172363281,
"logps/ref_chosen": -51.674072265625,
"logps/ref_rejected": -75.69713592529297,
"logps/rejected": -174.815673828125,
"loss": 0.9591,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3192379474639893,
"rewards/margins": 0.9019448757171631,
"rewards/rejected": -2.2211828231811523,
"step": 376
},
{
"epoch": 0.5699168556311414,
"epsilon_dpo/beta": 0.02234051562845707,
"epsilon_dpo/beta_margin_grad_mean": -0.371245801448822,
"epsilon_dpo/beta_margin_grad_std": 0.19891570508480072,
"epsilon_dpo/beta_margin_mean": 0.6351776123046875,
"epsilon_dpo/beta_margin_std": 0.9869452118873596,
"epsilon_dpo/loss_margin_mean": 28.719274520874023,
"grad_norm": 17.512327194213867,
"kl/avg_steps": 0.46875,
"kl/beta": 0.022443486377596855,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.3414402008585886e-07,
"logits/chosen": 0.04614192247390747,
"logits/rejected": -0.01711181551218033,
"logps/chosen": -104.1654052734375,
"logps/ref_chosen": -46.17853546142578,
"logps/ref_rejected": -57.756500244140625,
"logps/rejected": -144.462646484375,
"loss": 1.0573,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2976436614990234,
"rewards/margins": 0.6351776719093323,
"rewards/rejected": -1.9328213930130005,
"step": 377
},
{
"epoch": 0.5714285714285714,
"epsilon_dpo/beta": 0.022243265062570572,
"epsilon_dpo/beta_margin_grad_mean": -0.3872436583042145,
"epsilon_dpo/beta_margin_grad_std": 0.19555704295635223,
"epsilon_dpo/beta_margin_mean": 0.5645663738250732,
"epsilon_dpo/beta_margin_std": 1.0182288885116577,
"epsilon_dpo/loss_margin_mean": 25.658349990844727,
"grad_norm": 17.64137840270996,
"kl/avg_steps": 0.4375,
"kl/beta": 0.02233877405524254,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.3282469092517977e-07,
"logits/chosen": -0.06279049813747406,
"logits/rejected": -0.18278297781944275,
"logps/chosen": -115.47160339355469,
"logps/ref_chosen": -59.21887969970703,
"logps/ref_rejected": -71.2481918334961,
"logps/rejected": -153.15927124023438,
"loss": 1.1158,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.253143548965454,
"rewards/margins": 0.5645663738250732,
"rewards/rejected": -1.8177099227905273,
"step": 378
},
{
"epoch": 0.5729402872260015,
"epsilon_dpo/beta": 0.022090766578912735,
"epsilon_dpo/beta_margin_grad_mean": -0.3519314229488373,
"epsilon_dpo/beta_margin_grad_std": 0.18667548894882202,
"epsilon_dpo/beta_margin_mean": 0.7616754770278931,
"epsilon_dpo/beta_margin_std": 1.0172090530395508,
"epsilon_dpo/loss_margin_mean": 34.67833709716797,
"grad_norm": 16.588329315185547,
"kl/avg_steps": 0.6875,
"kl/beta": 0.022241467610001564,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 2.3150584219481643e-07,
"logits/chosen": -0.19441550970077515,
"logits/rejected": -0.3250526785850525,
"logps/chosen": -130.19581604003906,
"logps/ref_chosen": -76.31658935546875,
"logps/ref_rejected": -104.26200866699219,
"logps/rejected": -192.81956481933594,
"loss": 0.9667,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.1934432983398438,
"rewards/margins": 0.7616754770278931,
"rewards/rejected": -1.9551188945770264,
"step": 379
},
{
"epoch": 0.5744520030234316,
"epsilon_dpo/beta": 0.021953735500574112,
"epsilon_dpo/beta_margin_grad_mean": -0.32952001690864563,
"epsilon_dpo/beta_margin_grad_std": 0.20268180966377258,
"epsilon_dpo/beta_margin_mean": 0.8697534799575806,
"epsilon_dpo/beta_margin_std": 1.0578638315200806,
"epsilon_dpo/loss_margin_mean": 39.87781524658203,
"grad_norm": 15.790874481201172,
"kl/avg_steps": 0.625,
"kl/beta": 0.022089600563049316,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 2.3018751078580283e-07,
"logits/chosen": -0.15327207744121552,
"logits/rejected": -0.10459771752357483,
"logps/chosen": -109.94285583496094,
"logps/ref_chosen": -61.283164978027344,
"logps/ref_rejected": -72.38892364501953,
"logps/rejected": -160.92642211914062,
"loss": 0.9242,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.0711722373962402,
"rewards/margins": 0.8697534203529358,
"rewards/rejected": -1.9409257173538208,
"step": 380
},
{
"epoch": 0.5759637188208617,
"epsilon_dpo/beta": 0.021885985508561134,
"epsilon_dpo/beta_margin_grad_mean": -0.4288911819458008,
"epsilon_dpo/beta_margin_grad_std": 0.19800609350204468,
"epsilon_dpo/beta_margin_mean": 0.36180296540260315,
"epsilon_dpo/beta_margin_std": 1.0440819263458252,
"epsilon_dpo/loss_margin_mean": 16.834692001342773,
"grad_norm": 17.530067443847656,
"kl/avg_steps": 0.3125,
"kl/beta": 0.021952398121356964,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 2.288697335747027e-07,
"logits/chosen": -0.08915866911411285,
"logits/rejected": -0.16119176149368286,
"logps/chosen": -117.79042053222656,
"logps/ref_chosen": -58.2139892578125,
"logps/ref_rejected": -60.78669357299805,
"logps/rejected": -137.19781494140625,
"loss": 1.2859,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.3063081502914429,
"rewards/margins": 0.36180296540260315,
"rewards/rejected": -1.6681110858917236,
"step": 381
},
{
"epoch": 0.5774754346182918,
"epsilon_dpo/beta": 0.021783607080578804,
"epsilon_dpo/beta_margin_grad_mean": -0.38216355443000793,
"epsilon_dpo/beta_margin_grad_std": 0.17063254117965698,
"epsilon_dpo/beta_margin_mean": 0.5790287256240845,
"epsilon_dpo/beta_margin_std": 0.8841366171836853,
"epsilon_dpo/loss_margin_mean": 26.817768096923828,
"grad_norm": 16.4626407623291,
"kl/avg_steps": 0.46875,
"kl/beta": 0.02188401110470295,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.2755254742257706e-07,
"logits/chosen": -0.11755181849002838,
"logits/rejected": -0.19758224487304688,
"logps/chosen": -122.74113464355469,
"logps/ref_chosen": -61.82532501220703,
"logps/ref_rejected": -83.0452880859375,
"logps/rejected": -170.77886962890625,
"loss": 1.051,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.328293800354004,
"rewards/margins": 0.5790287256240845,
"rewards/rejected": -1.907322645187378,
"step": 382
},
{
"epoch": 0.5789871504157218,
"epsilon_dpo/beta": 0.021695587784051895,
"epsilon_dpo/beta_margin_grad_mean": -0.365491658449173,
"epsilon_dpo/beta_margin_grad_std": 0.2085854709148407,
"epsilon_dpo/beta_margin_mean": 0.7124947309494019,
"epsilon_dpo/beta_margin_std": 1.1057060956954956,
"epsilon_dpo/loss_margin_mean": 33.18313980102539,
"grad_norm": 19.305198669433594,
"kl/avg_steps": 0.40625,
"kl/beta": 0.021781908348202705,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.2623598917395436e-07,
"logits/chosen": -0.24549967050552368,
"logits/rejected": -0.22099743783473969,
"logps/chosen": -137.58901977539062,
"logps/ref_chosen": -80.56326293945312,
"logps/ref_rejected": -74.62922668457031,
"logps/rejected": -164.83811950683594,
"loss": 1.043,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2404372692108154,
"rewards/margins": 0.7124947309494019,
"rewards/rejected": -1.9529321193695068,
"step": 383
},
{
"epoch": 0.5804988662131519,
"epsilon_dpo/beta": 0.02154678851366043,
"epsilon_dpo/beta_margin_grad_mean": -0.35505884885787964,
"epsilon_dpo/beta_margin_grad_std": 0.17465253174304962,
"epsilon_dpo/beta_margin_mean": 0.7003733515739441,
"epsilon_dpo/beta_margin_std": 0.893462061882019,
"epsilon_dpo/loss_margin_mean": 32.69243621826172,
"grad_norm": 15.199735641479492,
"kl/avg_steps": 0.6875,
"kl/beta": 0.021693777292966843,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 2.2492009565579875e-07,
"logits/chosen": -0.15723174810409546,
"logits/rejected": -0.23061436414718628,
"logps/chosen": -121.34182739257812,
"logps/ref_chosen": -65.47514343261719,
"logps/ref_rejected": -79.67378234863281,
"logps/rejected": -168.23291015625,
"loss": 0.9706,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.2048842906951904,
"rewards/margins": 0.7003734111785889,
"rewards/rejected": -1.9052577018737793,
"step": 384
},
{
"epoch": 0.582010582010582,
"epsilon_dpo/beta": 0.021440066397190094,
"epsilon_dpo/beta_margin_grad_mean": -0.34055736660957336,
"epsilon_dpo/beta_margin_grad_std": 0.18236930668354034,
"epsilon_dpo/beta_margin_mean": 0.7946664094924927,
"epsilon_dpo/beta_margin_std": 0.9285534024238586,
"epsilon_dpo/loss_margin_mean": 37.33354568481445,
"grad_norm": 15.285587310791016,
"kl/avg_steps": 0.5,
"kl/beta": 0.02154565043747425,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.2360490367648084e-07,
"logits/chosen": -0.23100626468658447,
"logits/rejected": -0.2713785171508789,
"logps/chosen": -123.45042419433594,
"logps/ref_chosen": -66.0565185546875,
"logps/ref_rejected": -86.68023681640625,
"logps/rejected": -181.40768432617188,
"loss": 0.9204,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2333240509033203,
"rewards/margins": 0.7946664094924927,
"rewards/rejected": -2.0279903411865234,
"step": 385
},
{
"epoch": 0.5835222978080121,
"epsilon_dpo/beta": 0.021353499963879585,
"epsilon_dpo/beta_margin_grad_mean": -0.3857010304927826,
"epsilon_dpo/beta_margin_grad_std": 0.1887744516134262,
"epsilon_dpo/beta_margin_mean": 0.5480641722679138,
"epsilon_dpo/beta_margin_std": 0.9241737127304077,
"epsilon_dpo/loss_margin_mean": 25.96361541748047,
"grad_norm": 18.356056213378906,
"kl/avg_steps": 0.40625,
"kl/beta": 0.021438458934426308,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.2229045002474724e-07,
"logits/chosen": -0.22354364395141602,
"logits/rejected": -0.31517553329467773,
"logps/chosen": -141.54310607910156,
"logps/ref_chosen": -75.6236572265625,
"logps/ref_rejected": -92.62330627441406,
"logps/rejected": -184.50637817382812,
"loss": 1.0976,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.409054160118103,
"rewards/margins": 0.5480641722679138,
"rewards/rejected": -1.9571183919906616,
"step": 386
},
{
"epoch": 0.5850340136054422,
"epsilon_dpo/beta": 0.021227063611149788,
"epsilon_dpo/beta_margin_grad_mean": -0.3486242890357971,
"epsilon_dpo/beta_margin_grad_std": 0.16586869955062866,
"epsilon_dpo/beta_margin_mean": 0.7582774758338928,
"epsilon_dpo/beta_margin_std": 0.9101418256759644,
"epsilon_dpo/loss_margin_mean": 35.92195510864258,
"grad_norm": 16.27162742614746,
"kl/avg_steps": 0.59375,
"kl/beta": 0.02135171741247177,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.209767714686924e-07,
"logits/chosen": 0.03768094629049301,
"logits/rejected": -0.1885252147912979,
"logps/chosen": -104.50468444824219,
"logps/ref_chosen": -47.22170639038086,
"logps/ref_rejected": -87.33814239501953,
"logps/rejected": -180.54307556152344,
"loss": 0.9274,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.2186527252197266,
"rewards/margins": 0.7582774758338928,
"rewards/rejected": -1.9769301414489746,
"step": 387
},
{
"epoch": 0.5865457294028723,
"epsilon_dpo/beta": 0.021161476150155067,
"epsilon_dpo/beta_margin_grad_mean": -0.39690154790878296,
"epsilon_dpo/beta_margin_grad_std": 0.2020420879125595,
"epsilon_dpo/beta_margin_mean": 0.5378429293632507,
"epsilon_dpo/beta_margin_std": 1.0457186698913574,
"epsilon_dpo/loss_margin_mean": 25.749526977539062,
"grad_norm": 16.6661376953125,
"kl/avg_steps": 0.3125,
"kl/beta": 0.021225690841674805,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 2.1966390475472954e-07,
"logits/chosen": -0.17634549736976624,
"logits/rejected": -0.26564857363700867,
"logps/chosen": -132.0753173828125,
"logps/ref_chosen": -74.5794677734375,
"logps/ref_rejected": -79.92558288574219,
"logps/rejected": -163.17095947265625,
"loss": 1.1468,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.219231367111206,
"rewards/margins": 0.5378429293632507,
"rewards/rejected": -1.7570743560791016,
"step": 388
},
{
"epoch": 0.5880574452003023,
"epsilon_dpo/beta": 0.021082326769828796,
"epsilon_dpo/beta_margin_grad_mean": -0.35466712713241577,
"epsilon_dpo/beta_margin_grad_std": 0.19740672409534454,
"epsilon_dpo/beta_margin_mean": 0.7312471270561218,
"epsilon_dpo/beta_margin_std": 1.0218209028244019,
"epsilon_dpo/loss_margin_mean": 35.024688720703125,
"grad_norm": 43.114139556884766,
"kl/avg_steps": 0.375,
"kl/beta": 0.021159566938877106,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.1835188660656265e-07,
"logits/chosen": -0.10400611907243729,
"logits/rejected": -0.15589158236980438,
"logps/chosen": -121.51643371582031,
"logps/ref_chosen": -61.624366760253906,
"logps/ref_rejected": -76.50978088378906,
"logps/rejected": -171.42654418945312,
"loss": 0.9993,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2657912969589233,
"rewards/margins": 0.7312471270561218,
"rewards/rejected": -1.9970383644104004,
"step": 389
},
{
"epoch": 0.5895691609977324,
"epsilon_dpo/beta": 0.020977208390831947,
"epsilon_dpo/beta_margin_grad_mean": -0.37068212032318115,
"epsilon_dpo/beta_margin_grad_std": 0.18008175492286682,
"epsilon_dpo/beta_margin_mean": 0.6301229000091553,
"epsilon_dpo/beta_margin_std": 0.9000543355941772,
"epsilon_dpo/loss_margin_mean": 30.297916412353516,
"grad_norm": 15.200876235961914,
"kl/avg_steps": 0.5,
"kl/beta": 0.021080514416098595,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.170407537241599e-07,
"logits/chosen": 0.10825499892234802,
"logits/rejected": 0.01285313069820404,
"logps/chosen": -98.9400863647461,
"logps/ref_chosen": -45.871864318847656,
"logps/ref_rejected": -61.305999755859375,
"logps/rejected": -144.67214965820312,
"loss": 1.0242,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1162071228027344,
"rewards/margins": 0.6301229000091553,
"rewards/rejected": -1.7463300228118896,
"step": 390
},
{
"epoch": 0.5910808767951625,
"epsilon_dpo/beta": 0.02087940089404583,
"epsilon_dpo/beta_margin_grad_mean": -0.3506031632423401,
"epsilon_dpo/beta_margin_grad_std": 0.2078220397233963,
"epsilon_dpo/beta_margin_mean": 0.7664510011672974,
"epsilon_dpo/beta_margin_std": 1.0937607288360596,
"epsilon_dpo/loss_margin_mean": 37.051856994628906,
"grad_norm": 16.012065887451172,
"kl/avg_steps": 0.46875,
"kl/beta": 0.020975636318325996,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.1573054278272636e-07,
"logits/chosen": -0.12658485770225525,
"logits/rejected": -0.2928740382194519,
"logps/chosen": -113.88916015625,
"logps/ref_chosen": -58.18701171875,
"logps/ref_rejected": -83.63443756103516,
"logps/rejected": -176.388427734375,
"loss": 1.0032,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.1650164127349854,
"rewards/margins": 0.7664510011672974,
"rewards/rejected": -1.9314674139022827,
"step": 391
},
{
"epoch": 0.5925925925925926,
"epsilon_dpo/beta": 0.020755885168910027,
"epsilon_dpo/beta_margin_grad_mean": -0.34486162662506104,
"epsilon_dpo/beta_margin_grad_std": 0.2070373296737671,
"epsilon_dpo/beta_margin_mean": 0.8322276473045349,
"epsilon_dpo/beta_margin_std": 1.1633617877960205,
"epsilon_dpo/loss_margin_mean": 40.383235931396484,
"grad_norm": 15.344311714172363,
"kl/avg_steps": 0.59375,
"kl/beta": 0.020877771079540253,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.1442129043167873e-07,
"logits/chosen": -0.2770148515701294,
"logits/rejected": -0.35691946744918823,
"logps/chosen": -118.74815368652344,
"logps/ref_chosen": -69.74452209472656,
"logps/ref_rejected": -94.05877685546875,
"logps/rejected": -183.44564819335938,
"loss": 0.9778,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0196120738983154,
"rewards/margins": 0.8322277069091797,
"rewards/rejected": -1.8518397808074951,
"step": 392
},
{
"epoch": 0.5941043083900227,
"epsilon_dpo/beta": 0.02063986100256443,
"epsilon_dpo/beta_margin_grad_mean": -0.33578401803970337,
"epsilon_dpo/beta_margin_grad_std": 0.17865769565105438,
"epsilon_dpo/beta_margin_mean": 0.8281677961349487,
"epsilon_dpo/beta_margin_std": 0.9427415132522583,
"epsilon_dpo/loss_margin_mean": 40.36809539794922,
"grad_norm": 14.897909164428711,
"kl/avg_steps": 0.5625,
"kl/beta": 0.02075454220175743,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.131130332936195e-07,
"logits/chosen": -0.13843950629234314,
"logits/rejected": -0.17297746241092682,
"logps/chosen": -110.48135375976562,
"logps/ref_chosen": -52.33489990234375,
"logps/ref_rejected": -74.33810424804688,
"logps/rejected": -172.85264587402344,
"loss": 0.9,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2015691995620728,
"rewards/margins": 0.8281677961349487,
"rewards/rejected": -2.0297369956970215,
"step": 393
},
{
"epoch": 0.5956160241874527,
"epsilon_dpo/beta": 0.020524412393569946,
"epsilon_dpo/beta_margin_grad_mean": -0.3643406927585602,
"epsilon_dpo/beta_margin_grad_std": 0.1668926328420639,
"epsilon_dpo/beta_margin_mean": 0.6361644864082336,
"epsilon_dpo/beta_margin_std": 0.824968695640564,
"epsilon_dpo/loss_margin_mean": 31.230648040771484,
"grad_norm": 18.56803321838379,
"kl/avg_steps": 0.5625,
"kl/beta": 0.020638450980186462,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.1180580796331323e-07,
"logits/chosen": -0.06011414900422096,
"logits/rejected": -0.1274970918893814,
"logps/chosen": -116.40287780761719,
"logps/ref_chosen": -60.6761360168457,
"logps/ref_rejected": -71.36075592041016,
"logps/rejected": -158.31814575195312,
"loss": 0.9942,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1463639736175537,
"rewards/margins": 0.6361645460128784,
"rewards/rejected": -1.7825285196304321,
"step": 394
},
{
"epoch": 0.5971277399848829,
"epsilon_dpo/beta": 0.020409606397151947,
"epsilon_dpo/beta_margin_grad_mean": -0.38182270526885986,
"epsilon_dpo/beta_margin_grad_std": 0.17407265305519104,
"epsilon_dpo/beta_margin_mean": 0.5839479565620422,
"epsilon_dpo/beta_margin_std": 0.8922612071037292,
"epsilon_dpo/loss_margin_mean": 28.82781410217285,
"grad_norm": 18.611263275146484,
"kl/avg_steps": 0.5625,
"kl/beta": 0.020523007959127426,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.104996510066625e-07,
"logits/chosen": 0.03072867915034294,
"logits/rejected": -0.23621629178524017,
"logps/chosen": -109.64920043945312,
"logps/ref_chosen": -50.60432434082031,
"logps/ref_rejected": -77.08731079101562,
"logps/rejected": -164.95999145507812,
"loss": 1.0515,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2070801258087158,
"rewards/margins": 0.5839479565620422,
"rewards/rejected": -1.7910280227661133,
"step": 395
},
{
"epoch": 0.5986394557823129,
"epsilon_dpo/beta": 0.020289067178964615,
"epsilon_dpo/beta_margin_grad_mean": -0.35646292567253113,
"epsilon_dpo/beta_margin_grad_std": 0.1791132390499115,
"epsilon_dpo/beta_margin_mean": 0.7119321227073669,
"epsilon_dpo/beta_margin_std": 0.9423914551734924,
"epsilon_dpo/loss_margin_mean": 35.326683044433594,
"grad_norm": 15.465738296508789,
"kl/avg_steps": 0.59375,
"kl/beta": 0.020408213138580322,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.0919459895968517e-07,
"logits/chosen": 0.002484539058059454,
"logits/rejected": -0.1747826337814331,
"logps/chosen": -108.60047912597656,
"logps/ref_chosen": -51.35961151123047,
"logps/ref_rejected": -79.89360046386719,
"logps/rejected": -172.46115112304688,
"loss": 0.9772,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1632931232452393,
"rewards/margins": 0.7119321227073669,
"rewards/rejected": -1.8752251863479614,
"step": 396
},
{
"epoch": 0.600151171579743,
"epsilon_dpo/beta": 0.020213695243000984,
"epsilon_dpo/beta_margin_grad_mean": -0.4233687222003937,
"epsilon_dpo/beta_margin_grad_std": 0.2098219394683838,
"epsilon_dpo/beta_margin_mean": 0.35051825642585754,
"epsilon_dpo/beta_margin_std": 1.0821709632873535,
"epsilon_dpo/loss_margin_mean": 17.70000648498535,
"grad_norm": 19.940303802490234,
"kl/avg_steps": 0.375,
"kl/beta": 0.020287754014134407,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.078906883274924e-07,
"logits/chosen": -0.2025710642337799,
"logits/rejected": -0.20071834325790405,
"logps/chosen": -132.37310791015625,
"logps/ref_chosen": -66.45622253417969,
"logps/ref_rejected": -85.74736022949219,
"logps/rejected": -169.3642578125,
"loss": 1.3183,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.3366742134094238,
"rewards/margins": 0.35051822662353516,
"rewards/rejected": -1.687192440032959,
"step": 397
},
{
"epoch": 0.6016628873771731,
"epsilon_dpo/beta": 0.020106593146920204,
"epsilon_dpo/beta_margin_grad_mean": -0.3422844111919403,
"epsilon_dpo/beta_margin_grad_std": 0.17670224606990814,
"epsilon_dpo/beta_margin_mean": 0.8147029280662537,
"epsilon_dpo/beta_margin_std": 0.9878636002540588,
"epsilon_dpo/loss_margin_mean": 40.769081115722656,
"grad_norm": 14.102545738220215,
"kl/avg_steps": 0.53125,
"kl/beta": 0.0202119592577219,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.065879555832674e-07,
"logits/chosen": 0.008489780128002167,
"logits/rejected": -0.1300819367170334,
"logps/chosen": -105.1488265991211,
"logps/ref_chosen": -49.244239807128906,
"logps/ref_rejected": -75.18949127197266,
"logps/rejected": -171.8631591796875,
"loss": 0.9154,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1251747608184814,
"rewards/margins": 0.8147029280662537,
"rewards/rejected": -1.9398777484893799,
"step": 398
},
{
"epoch": 0.6031746031746031,
"epsilon_dpo/beta": 0.019994057714939117,
"epsilon_dpo/beta_margin_grad_mean": -0.3294134736061096,
"epsilon_dpo/beta_margin_grad_std": 0.1800222396850586,
"epsilon_dpo/beta_margin_mean": 0.9007197618484497,
"epsilon_dpo/beta_margin_std": 1.0671669244766235,
"epsilon_dpo/loss_margin_mean": 45.319000244140625,
"grad_norm": 15.830536842346191,
"kl/avg_steps": 0.5625,
"kl/beta": 0.02010514959692955,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.052864371672457e-07,
"logits/chosen": -0.21466538310050964,
"logits/rejected": -0.443739116191864,
"logps/chosen": -133.56298828125,
"logps/ref_chosen": -68.30679321289062,
"logps/ref_rejected": -113.2708511352539,
"logps/rejected": -223.84605407714844,
"loss": 0.8803,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.3076364994049072,
"rewards/margins": 0.9007197618484497,
"rewards/rejected": -2.2083563804626465,
"step": 399
},
{
"epoch": 0.6046863189720333,
"epsilon_dpo/beta": 0.019894717261195183,
"epsilon_dpo/beta_margin_grad_mean": -0.39825156331062317,
"epsilon_dpo/beta_margin_grad_std": 0.18856297433376312,
"epsilon_dpo/beta_margin_mean": 0.5103837847709656,
"epsilon_dpo/beta_margin_std": 0.9503202438354492,
"epsilon_dpo/loss_margin_mean": 25.913150787353516,
"grad_norm": 25.672508239746094,
"kl/avg_steps": 0.5,
"kl/beta": 0.019992690533399582,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.0398616948569493e-07,
"logits/chosen": -0.13366559147834778,
"logits/rejected": -0.2674522399902344,
"logps/chosen": -144.44911193847656,
"logps/ref_chosen": -71.62649536132812,
"logps/ref_rejected": -90.98765563964844,
"logps/rejected": -189.72341918945312,
"loss": 1.132,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4507603645324707,
"rewards/margins": 0.5103837251663208,
"rewards/rejected": -1.9611440896987915,
"step": 400
},
{
"epoch": 0.6046863189720333,
"eval_epsilon_dpo/beta": 0.019803792238235474,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.37993547320365906,
"eval_epsilon_dpo/beta_margin_grad_std": 0.18947526812553406,
"eval_epsilon_dpo/beta_margin_mean": 0.599367082118988,
"eval_epsilon_dpo/beta_margin_std": 0.9730914235115051,
"eval_epsilon_dpo/loss_margin_mean": 30.570838928222656,
"eval_kl/n_epsilon_steps": 0.2698063254356384,
"eval_kl/p_epsilon_steps": 0.7293133735656738,
"eval_logits/chosen": -0.04321199655532837,
"eval_logits/rejected": -0.16142940521240234,
"eval_logps/chosen": -135.41810607910156,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -170.67848205566406,
"eval_loss": 0.5401765704154968,
"eval_rewards/accuracies": 0.7301936745643616,
"eval_rewards/chosen": -1.2023439407348633,
"eval_rewards/margins": 0.599367082118988,
"eval_rewards/rejected": -1.801710844039917,
"eval_runtime": 41.8307,
"eval_samples_per_second": 55.055,
"eval_steps_per_second": 1.721,
"step": 400
},
{
"epoch": 0.6061980347694633,
"epsilon_dpo/beta": 0.01977708749473095,
"epsilon_dpo/beta_margin_grad_mean": -0.33867183327674866,
"epsilon_dpo/beta_margin_grad_std": 0.1728515326976776,
"epsilon_dpo/beta_margin_mean": 0.8172554969787598,
"epsilon_dpo/beta_margin_std": 0.9354393482208252,
"epsilon_dpo/loss_margin_mean": 41.55567169189453,
"grad_norm": 12.292644500732422,
"kl/avg_steps": 0.59375,
"kl/beta": 0.01989322528243065,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.0268718890989752e-07,
"logits/chosen": 0.029777199029922485,
"logits/rejected": -0.23572467267513275,
"logps/chosen": -108.26029968261719,
"logps/ref_chosen": -53.72496032714844,
"logps/ref_rejected": -75.06304931640625,
"logps/rejected": -171.154052734375,
"loss": 0.9005,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0800408124923706,
"rewards/margins": 0.8172554969787598,
"rewards/rejected": -1.8972963094711304,
"step": 401
},
{
"epoch": 0.6077097505668935,
"epsilon_dpo/beta": 0.019691256806254387,
"epsilon_dpo/beta_margin_grad_mean": -0.3722436726093292,
"epsilon_dpo/beta_margin_grad_std": 0.19271717965602875,
"epsilon_dpo/beta_margin_mean": 0.6294482350349426,
"epsilon_dpo/beta_margin_std": 0.957283079624176,
"epsilon_dpo/loss_margin_mean": 32.287208557128906,
"grad_norm": 16.305931091308594,
"kl/avg_steps": 0.4375,
"kl/beta": 0.01977580599486828,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.013895317751323e-07,
"logits/chosen": -0.04498763009905815,
"logits/rejected": -0.08883590996265411,
"logps/chosen": -120.49742126464844,
"logps/ref_chosen": -61.873931884765625,
"logps/ref_rejected": -66.1519775390625,
"logps/rejected": -157.06268310546875,
"loss": 1.0486,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.158579707145691,
"rewards/margins": 0.6294482946395874,
"rewards/rejected": -1.7880280017852783,
"step": 402
},
{
"epoch": 0.6092214663643235,
"epsilon_dpo/beta": 0.019562408328056335,
"epsilon_dpo/beta_margin_grad_mean": -0.33596885204315186,
"epsilon_dpo/beta_margin_grad_std": 0.18883143365383148,
"epsilon_dpo/beta_margin_mean": 0.8536834120750427,
"epsilon_dpo/beta_margin_std": 1.0500301122665405,
"epsilon_dpo/loss_margin_mean": 43.880340576171875,
"grad_norm": 16.045747756958008,
"kl/avg_steps": 0.65625,
"kl/beta": 0.019689664244651794,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 2.0009323437965898e-07,
"logits/chosen": 0.07565954327583313,
"logits/rejected": -0.15804286301136017,
"logps/chosen": -113.56201171875,
"logps/ref_chosen": -51.321502685546875,
"logps/ref_rejected": -86.54010772705078,
"logps/rejected": -192.66094970703125,
"loss": 0.9177,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2192585468292236,
"rewards/margins": 0.8536834120750427,
"rewards/rejected": -2.072942018508911,
"step": 403
},
{
"epoch": 0.6107331821617535,
"epsilon_dpo/beta": 0.019483773037791252,
"epsilon_dpo/beta_margin_grad_mean": -0.3493500053882599,
"epsilon_dpo/beta_margin_grad_std": 0.20133227109909058,
"epsilon_dpo/beta_margin_mean": 0.7820363640785217,
"epsilon_dpo/beta_margin_std": 1.0519464015960693,
"epsilon_dpo/loss_margin_mean": 40.49776840209961,
"grad_norm": 19.05116081237793,
"kl/avg_steps": 0.40625,
"kl/beta": 0.019561292603611946,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.9879833298370237e-07,
"logits/chosen": -0.131773442029953,
"logits/rejected": -0.3354595899581909,
"logps/chosen": -121.96896362304688,
"logps/ref_chosen": -62.26288604736328,
"logps/ref_rejected": -95.19029998779297,
"logps/rejected": -195.39413452148438,
"loss": 0.975,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1677515506744385,
"rewards/margins": 0.7820363640785217,
"rewards/rejected": -1.9497878551483154,
"step": 404
},
{
"epoch": 0.6122448979591837,
"epsilon_dpo/beta": 0.019417118281126022,
"epsilon_dpo/beta_margin_grad_mean": -0.3749491572380066,
"epsilon_dpo/beta_margin_grad_std": 0.1802622377872467,
"epsilon_dpo/beta_margin_mean": 0.6299278140068054,
"epsilon_dpo/beta_margin_std": 0.9201330542564392,
"epsilon_dpo/loss_margin_mean": 32.75624084472656,
"grad_norm": 14.629277229309082,
"kl/avg_steps": 0.34375,
"kl/beta": 0.019482146948575974,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.975048638084379e-07,
"logits/chosen": 0.00706704705953598,
"logits/rejected": -0.06672985851764679,
"logps/chosen": -112.21296691894531,
"logps/ref_chosen": -50.58434295654297,
"logps/ref_rejected": -65.43156433105469,
"logps/rejected": -159.81643676757812,
"loss": 1.029,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.199660301208496,
"rewards/margins": 0.6299278140068054,
"rewards/rejected": -1.8295881748199463,
"step": 405
},
{
"epoch": 0.6137566137566137,
"epsilon_dpo/beta": 0.019314192235469818,
"epsilon_dpo/beta_margin_grad_mean": -0.3451802432537079,
"epsilon_dpo/beta_margin_grad_std": 0.19972553849220276,
"epsilon_dpo/beta_margin_mean": 0.7781849503517151,
"epsilon_dpo/beta_margin_std": 1.0301129817962646,
"epsilon_dpo/loss_margin_mean": 40.611534118652344,
"grad_norm": 16.12744903564453,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01941540651023388,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.9621286303497914e-07,
"logits/chosen": 0.07635320723056793,
"logits/rejected": -0.1688210666179657,
"logps/chosen": -109.36135864257812,
"logps/ref_chosen": -48.99560546875,
"logps/ref_rejected": -92.47773742675781,
"logps/rejected": -193.4550323486328,
"loss": 0.9728,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1673238277435303,
"rewards/margins": 0.7781850099563599,
"rewards/rejected": -1.9455087184906006,
"step": 406
},
{
"epoch": 0.6152683295540439,
"epsilon_dpo/beta": 0.01923627220094204,
"epsilon_dpo/beta_margin_grad_mean": -0.371547669172287,
"epsilon_dpo/beta_margin_grad_std": 0.19633881747722626,
"epsilon_dpo/beta_margin_mean": 0.64608234167099,
"epsilon_dpo/beta_margin_std": 0.998077392578125,
"epsilon_dpo/loss_margin_mean": 33.92042541503906,
"grad_norm": 16.994611740112305,
"kl/avg_steps": 0.40625,
"kl/beta": 0.01931280642747879,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.9492236680336483e-07,
"logits/chosen": -0.19888855516910553,
"logits/rejected": -0.371822714805603,
"logps/chosen": -164.33045959472656,
"logps/ref_chosen": -89.40056610107422,
"logps/ref_rejected": -99.28775024414062,
"logps/rejected": -208.13807678222656,
"loss": 1.0507,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4450275897979736,
"rewards/margins": 0.6460822820663452,
"rewards/rejected": -2.0911097526550293,
"step": 407
},
{
"epoch": 0.6167800453514739,
"epsilon_dpo/beta": 0.01910433918237686,
"epsilon_dpo/beta_margin_grad_mean": -0.32324710488319397,
"epsilon_dpo/beta_margin_grad_std": 0.1788894534111023,
"epsilon_dpo/beta_margin_mean": 0.8831892609596252,
"epsilon_dpo/beta_margin_std": 0.9331621527671814,
"epsilon_dpo/loss_margin_mean": 46.45234298706055,
"grad_norm": 13.703137397766113,
"kl/avg_steps": 0.6875,
"kl/beta": 0.019234666600823402,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 1.9363341121154895e-07,
"logits/chosen": -0.01748759299516678,
"logits/rejected": -0.1468869149684906,
"logps/chosen": -110.12002563476562,
"logps/ref_chosen": -54.70391845703125,
"logps/ref_rejected": -73.98648834228516,
"logps/rejected": -175.85494995117188,
"loss": 0.8643,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.059541940689087,
"rewards/margins": 0.8831892609596252,
"rewards/rejected": -1.9427311420440674,
"step": 408
},
{
"epoch": 0.618291761148904,
"epsilon_dpo/beta": 0.019051508978009224,
"epsilon_dpo/beta_margin_grad_mean": -0.4060458540916443,
"epsilon_dpo/beta_margin_grad_std": 0.18802158534526825,
"epsilon_dpo/beta_margin_mean": 0.47828155755996704,
"epsilon_dpo/beta_margin_std": 0.9415356516838074,
"epsilon_dpo/loss_margin_mean": 25.44247817993164,
"grad_norm": 18.018136978149414,
"kl/avg_steps": 0.28125,
"kl/beta": 0.019103331491351128,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 1.9234603231438994e-07,
"logits/chosen": -0.10915550589561462,
"logits/rejected": -0.032916419208049774,
"logps/chosen": -130.06460571289062,
"logps/ref_chosen": -62.11822509765625,
"logps/ref_rejected": -61.933509826660156,
"logps/rejected": -155.32237243652344,
"loss": 1.1538,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.2979209423065186,
"rewards/margins": 0.47828155755996704,
"rewards/rejected": -1.7762024402618408,
"step": 409
},
{
"epoch": 0.6198034769463341,
"epsilon_dpo/beta": 0.018926633521914482,
"epsilon_dpo/beta_margin_grad_mean": -0.35163354873657227,
"epsilon_dpo/beta_margin_grad_std": 0.15604017674922943,
"epsilon_dpo/beta_margin_mean": 0.7063899040222168,
"epsilon_dpo/beta_margin_std": 0.7969531416893005,
"epsilon_dpo/loss_margin_mean": 37.51892852783203,
"grad_norm": 15.377472877502441,
"kl/avg_steps": 0.65625,
"kl/beta": 0.01904975436627865,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.9106026612264315e-07,
"logits/chosen": -0.1137542873620987,
"logits/rejected": -0.11894262582063675,
"logps/chosen": -125.42486572265625,
"logps/ref_chosen": -61.80265808105469,
"logps/ref_rejected": -76.60001373291016,
"logps/rejected": -177.74114990234375,
"loss": 0.9315,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.2060956954956055,
"rewards/margins": 0.7063899040222168,
"rewards/rejected": -1.9124855995178223,
"step": 410
},
{
"epoch": 0.6213151927437641,
"epsilon_dpo/beta": 0.018850553780794144,
"epsilon_dpo/beta_margin_grad_mean": -0.35760369896888733,
"epsilon_dpo/beta_margin_grad_std": 0.19247713685035706,
"epsilon_dpo/beta_margin_mean": 0.725860059261322,
"epsilon_dpo/beta_margin_std": 1.0220859050750732,
"epsilon_dpo/loss_margin_mean": 38.86330795288086,
"grad_norm": 16.35286521911621,
"kl/avg_steps": 0.40625,
"kl/beta": 0.018925555050373077,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.8977614860195296e-07,
"logits/chosen": 0.011134624481201172,
"logits/rejected": -0.16456930339336395,
"logps/chosen": -126.23455810546875,
"logps/ref_chosen": -54.445396423339844,
"logps/ref_rejected": -74.56507873535156,
"logps/rejected": -185.21755981445312,
"loss": 0.9977,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3571560382843018,
"rewards/margins": 0.7258599996566772,
"rewards/rejected": -2.0830161571502686,
"step": 411
},
{
"epoch": 0.6228269085411943,
"epsilon_dpo/beta": 0.018750719726085663,
"epsilon_dpo/beta_margin_grad_mean": -0.36582618951797485,
"epsilon_dpo/beta_margin_grad_std": 0.16970573365688324,
"epsilon_dpo/beta_margin_mean": 0.659077525138855,
"epsilon_dpo/beta_margin_std": 0.8630571365356445,
"epsilon_dpo/loss_margin_mean": 35.399478912353516,
"grad_norm": 15.528914451599121,
"kl/avg_steps": 0.53125,
"kl/beta": 0.018848979845643044,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.8849371567184662e-07,
"logits/chosen": -0.04830653965473175,
"logits/rejected": -0.12663593888282776,
"logps/chosen": -128.50949096679688,
"logps/ref_chosen": -55.248085021972656,
"logps/ref_rejected": -68.96623229980469,
"logps/rejected": -177.6271209716797,
"loss": 0.9873,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.37641179561615,
"rewards/margins": 0.659077525138855,
"rewards/rejected": -2.035489320755005,
"step": 412
},
{
"epoch": 0.6243386243386243,
"epsilon_dpo/beta": 0.018669214099645615,
"epsilon_dpo/beta_margin_grad_mean": -0.37798234820365906,
"epsilon_dpo/beta_margin_grad_std": 0.2066570222377777,
"epsilon_dpo/beta_margin_mean": 0.6125777959823608,
"epsilon_dpo/beta_margin_std": 1.0806788206100464,
"epsilon_dpo/loss_margin_mean": 33.19569396972656,
"grad_norm": 18.219341278076172,
"kl/avg_steps": 0.4375,
"kl/beta": 0.018749374896287918,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.872130032047302e-07,
"logits/chosen": -0.1416029930114746,
"logits/rejected": -0.18682757019996643,
"logps/chosen": -148.57882690429688,
"logps/ref_chosen": -68.72074890136719,
"logps/ref_rejected": -78.76539611816406,
"logps/rejected": -191.81918334960938,
"loss": 1.106,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.4942833185195923,
"rewards/margins": 0.6125777959823608,
"rewards/rejected": -2.106861114501953,
"step": 413
},
{
"epoch": 0.6258503401360545,
"epsilon_dpo/beta": 0.018570387735962868,
"epsilon_dpo/beta_margin_grad_mean": -0.3514930009841919,
"epsilon_dpo/beta_margin_grad_std": 0.18316827714443207,
"epsilon_dpo/beta_margin_mean": 0.7291079163551331,
"epsilon_dpo/beta_margin_std": 0.9344438314437866,
"epsilon_dpo/loss_margin_mean": 39.55892562866211,
"grad_norm": 16.12960433959961,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01866770349442959,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.8593404702488436e-07,
"logits/chosen": 0.013908982276916504,
"logits/rejected": -0.06706319749355316,
"logps/chosen": -127.59284973144531,
"logps/ref_chosen": -54.13821792602539,
"logps/ref_rejected": -74.65741729736328,
"logps/rejected": -187.67098999023438,
"loss": 0.967,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.3656686544418335,
"rewards/margins": 0.7291079163551331,
"rewards/rejected": -2.0947766304016113,
"step": 414
},
{
"epoch": 0.6273620559334845,
"epsilon_dpo/beta": 0.018489664420485497,
"epsilon_dpo/beta_margin_grad_mean": -0.3665994107723236,
"epsilon_dpo/beta_margin_grad_std": 0.19567281007766724,
"epsilon_dpo/beta_margin_mean": 0.661335289478302,
"epsilon_dpo/beta_margin_std": 0.9977084994316101,
"epsilon_dpo/loss_margin_mean": 36.11844253540039,
"grad_norm": 15.833710670471191,
"kl/avg_steps": 0.4375,
"kl/beta": 0.018569055944681168,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.846568829074628e-07,
"logits/chosen": 0.041580211371183395,
"logits/rejected": 0.1119493693113327,
"logps/chosen": -126.4255142211914,
"logps/ref_chosen": -55.91856002807617,
"logps/ref_rejected": -61.747703552246094,
"logps/rejected": -168.37310791015625,
"loss": 1.0397,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.305060625076294,
"rewards/margins": 0.661335289478302,
"rewards/rejected": -1.9663958549499512,
"step": 415
},
{
"epoch": 0.6288737717309146,
"epsilon_dpo/beta": 0.018432235345244408,
"epsilon_dpo/beta_margin_grad_mean": -0.3978129029273987,
"epsilon_dpo/beta_margin_grad_std": 0.19987753033638,
"epsilon_dpo/beta_margin_mean": 0.5286079049110413,
"epsilon_dpo/beta_margin_std": 1.0472856760025024,
"epsilon_dpo/loss_margin_mean": 29.06624412536621,
"grad_norm": 17.572973251342773,
"kl/avg_steps": 0.3125,
"kl/beta": 0.018488168716430664,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 1.8338154657749128e-07,
"logits/chosen": -0.038118891417980194,
"logits/rejected": -0.1409626454114914,
"logps/chosen": -131.90078735351562,
"logps/ref_chosen": -54.72308349609375,
"logps/ref_rejected": -69.17388916015625,
"logps/rejected": -175.4178466796875,
"loss": 1.1524,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.4267981052398682,
"rewards/margins": 0.5286079049110413,
"rewards/rejected": -1.9554059505462646,
"step": 416
},
{
"epoch": 0.6303854875283447,
"epsilon_dpo/beta": 0.018334494903683662,
"epsilon_dpo/beta_margin_grad_mean": -0.3512361943721771,
"epsilon_dpo/beta_margin_grad_std": 0.1943528652191162,
"epsilon_dpo/beta_margin_mean": 0.7189047336578369,
"epsilon_dpo/beta_margin_std": 1.0374675989151,
"epsilon_dpo/loss_margin_mean": 39.55641174316406,
"grad_norm": 16.48633575439453,
"kl/avg_steps": 0.53125,
"kl/beta": 0.018430573865771294,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.8210807370886849e-07,
"logits/chosen": 0.012499801814556122,
"logits/rejected": -0.19840557873249054,
"logps/chosen": -137.2838592529297,
"logps/ref_chosen": -56.791259765625,
"logps/ref_rejected": -68.7791748046875,
"logps/rejected": -188.82818603515625,
"loss": 1.0119,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.478604793548584,
"rewards/margins": 0.7189047336578369,
"rewards/rejected": -2.197509527206421,
"step": 417
},
{
"epoch": 0.6318972033257747,
"epsilon_dpo/beta": 0.018266255035996437,
"epsilon_dpo/beta_margin_grad_mean": -0.39510443806648254,
"epsilon_dpo/beta_margin_grad_std": 0.19843092560768127,
"epsilon_dpo/beta_margin_mean": 0.5324282050132751,
"epsilon_dpo/beta_margin_std": 1.030791997909546,
"epsilon_dpo/loss_margin_mean": 29.519411087036133,
"grad_norm": 19.738357543945312,
"kl/avg_steps": 0.375,
"kl/beta": 0.018333178013563156,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 1.8083649992336825e-07,
"logits/chosen": -0.14377397298812866,
"logits/rejected": -0.09100518375635147,
"logps/chosen": -156.55502319335938,
"logps/ref_chosen": -69.10798645019531,
"logps/ref_rejected": -75.09132385253906,
"logps/rejected": -192.05780029296875,
"loss": 1.1428,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.6020665168762207,
"rewards/margins": 0.5324281454086304,
"rewards/rejected": -2.1344945430755615,
"step": 418
},
{
"epoch": 0.6334089191232048,
"epsilon_dpo/beta": 0.018163764849305153,
"epsilon_dpo/beta_margin_grad_mean": -0.34459593892097473,
"epsilon_dpo/beta_margin_grad_std": 0.18837900459766388,
"epsilon_dpo/beta_margin_mean": 0.8002488017082214,
"epsilon_dpo/beta_margin_std": 1.0125706195831299,
"epsilon_dpo/loss_margin_mean": 44.349056243896484,
"grad_norm": 15.949722290039062,
"kl/avg_steps": 0.5625,
"kl/beta": 0.018264686688780785,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.7956686078964255e-07,
"logits/chosen": -0.032163530588150024,
"logits/rejected": -0.14619705080986023,
"logps/chosen": -125.9248046875,
"logps/ref_chosen": -58.1717643737793,
"logps/ref_rejected": -71.67066955566406,
"logps/rejected": -183.77276611328125,
"loss": 0.9427,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.2329981327056885,
"rewards/margins": 0.8002488017082214,
"rewards/rejected": -2.0332469940185547,
"step": 419
},
{
"epoch": 0.6349206349206349,
"epsilon_dpo/beta": 0.01811325177550316,
"epsilon_dpo/beta_margin_grad_mean": -0.4163946509361267,
"epsilon_dpo/beta_margin_grad_std": 0.21044054627418518,
"epsilon_dpo/beta_margin_mean": 0.42256245017051697,
"epsilon_dpo/beta_margin_std": 1.079357624053955,
"epsilon_dpo/loss_margin_mean": 23.748287200927734,
"grad_norm": 17.98488998413086,
"kl/avg_steps": 0.28125,
"kl/beta": 0.018162522464990616,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 1.782991918222275e-07,
"logits/chosen": 0.046790819615125656,
"logits/rejected": -0.01735183410346508,
"logps/chosen": -144.91510009765625,
"logps/ref_chosen": -57.05351257324219,
"logps/ref_rejected": -62.670982360839844,
"logps/rejected": -174.28085327148438,
"loss": 1.257,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.5956722497940063,
"rewards/margins": 0.42256245017051697,
"rewards/rejected": -2.0182347297668457,
"step": 420
},
{
"epoch": 0.636432350718065,
"epsilon_dpo/beta": 0.018022827804088593,
"epsilon_dpo/beta_margin_grad_mean": -0.3885970711708069,
"epsilon_dpo/beta_margin_grad_std": 0.2243383824825287,
"epsilon_dpo/beta_margin_mean": 0.5735925436019897,
"epsilon_dpo/beta_margin_std": 1.1769942045211792,
"epsilon_dpo/loss_margin_mean": 32.23101043701172,
"grad_norm": 19.791738510131836,
"kl/avg_steps": 0.5,
"kl/beta": 0.018111582845449448,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.7703352848054887e-07,
"logits/chosen": -0.024503352120518684,
"logits/rejected": -0.1419539600610733,
"logps/chosen": -139.7303466796875,
"logps/ref_chosen": -57.32324981689453,
"logps/ref_rejected": -75.33782958984375,
"logps/rejected": -189.97593688964844,
"loss": 1.1813,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.4896764755249023,
"rewards/margins": 0.5735925436019897,
"rewards/rejected": -2.0632691383361816,
"step": 421
},
{
"epoch": 0.6379440665154951,
"epsilon_dpo/beta": 0.01792752929031849,
"epsilon_dpo/beta_margin_grad_mean": -0.34745872020721436,
"epsilon_dpo/beta_margin_grad_std": 0.18449221551418304,
"epsilon_dpo/beta_margin_mean": 0.7565876245498657,
"epsilon_dpo/beta_margin_std": 0.9432625770568848,
"epsilon_dpo/loss_margin_mean": 42.52012634277344,
"grad_norm": 17.226760864257812,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01802147552371025,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.7576990616793137e-07,
"logits/chosen": -0.13847726583480835,
"logits/rejected": -0.12428702414035797,
"logps/chosen": -138.095703125,
"logps/ref_chosen": -67.05757904052734,
"logps/ref_rejected": -72.12803649902344,
"logps/rejected": -185.686279296875,
"loss": 0.9506,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2754526138305664,
"rewards/margins": 0.7565876245498657,
"rewards/rejected": -2.0320403575897217,
"step": 422
},
{
"epoch": 0.6394557823129252,
"epsilon_dpo/beta": 0.017827188596129417,
"epsilon_dpo/beta_margin_grad_mean": -0.34686267375946045,
"epsilon_dpo/beta_margin_grad_std": 0.18482360243797302,
"epsilon_dpo/beta_margin_mean": 0.7790398001670837,
"epsilon_dpo/beta_margin_std": 0.9722562432289124,
"epsilon_dpo/loss_margin_mean": 43.979835510253906,
"grad_norm": 15.439717292785645,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01792624220252037,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.745083602306071e-07,
"logits/chosen": 0.06741990894079208,
"logits/rejected": -0.18852515518665314,
"logps/chosen": -128.6971893310547,
"logps/ref_chosen": -54.061668395996094,
"logps/ref_rejected": -76.64092254638672,
"logps/rejected": -195.25628662109375,
"loss": 0.9438,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3332774639129639,
"rewards/margins": 0.7790398597717285,
"rewards/rejected": -2.1123170852661133,
"step": 423
},
{
"epoch": 0.6409674981103552,
"epsilon_dpo/beta": 0.017716331407427788,
"epsilon_dpo/beta_margin_grad_mean": -0.33768096566200256,
"epsilon_dpo/beta_margin_grad_std": 0.1860429048538208,
"epsilon_dpo/beta_margin_mean": 0.8251336216926575,
"epsilon_dpo/beta_margin_std": 0.9804246425628662,
"epsilon_dpo/loss_margin_mean": 46.85561752319336,
"grad_norm": 17.873279571533203,
"kl/avg_steps": 0.625,
"kl/beta": 0.01782597228884697,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.7324892595672804e-07,
"logits/chosen": 0.04498608037829399,
"logits/rejected": -0.03568783774971962,
"logps/chosen": -130.06663513183594,
"logps/ref_chosen": -53.60887145996094,
"logps/ref_rejected": -79.2139892578125,
"logps/rejected": -202.52737426757812,
"loss": 0.9163,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3556125164031982,
"rewards/margins": 0.8251335620880127,
"rewards/rejected": -2.180746078491211,
"step": 424
},
{
"epoch": 0.6424792139077853,
"epsilon_dpo/beta": 0.017628438770771027,
"epsilon_dpo/beta_margin_grad_mean": -0.3782404363155365,
"epsilon_dpo/beta_margin_grad_std": 0.19027520716190338,
"epsilon_dpo/beta_margin_mean": 0.6106665134429932,
"epsilon_dpo/beta_margin_std": 0.9608878493309021,
"epsilon_dpo/loss_margin_mean": 34.963016510009766,
"grad_norm": 17.07148551940918,
"kl/avg_steps": 0.5,
"kl/beta": 0.017715251073241234,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.7199163857537824e-07,
"logits/chosen": 0.011417558416724205,
"logits/rejected": -0.04729383438825607,
"logps/chosen": -135.05638122558594,
"logps/ref_chosen": -58.41468048095703,
"logps/ref_rejected": -66.59054565429688,
"logps/rejected": -178.19525146484375,
"loss": 1.0614,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3536860942840576,
"rewards/margins": 0.6106665134429932,
"rewards/rejected": -1.9643526077270508,
"step": 425
},
{
"epoch": 0.6439909297052154,
"epsilon_dpo/beta": 0.01757378876209259,
"epsilon_dpo/beta_margin_grad_mean": -0.41993066668510437,
"epsilon_dpo/beta_margin_grad_std": 0.20855309069156647,
"epsilon_dpo/beta_margin_mean": 0.3883659839630127,
"epsilon_dpo/beta_margin_std": 1.0724225044250488,
"epsilon_dpo/loss_margin_mean": 22.518798828125,
"grad_norm": 23.373611450195312,
"kl/avg_steps": 0.3125,
"kl/beta": 0.017627116292715073,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 1.7073653325558828e-07,
"logits/chosen": -0.17499208450317383,
"logits/rejected": -0.05495479702949524,
"logps/chosen": -164.74636840820312,
"logps/ref_chosen": -71.70822143554688,
"logps/ref_rejected": -73.57725524902344,
"logps/rejected": -189.13421630859375,
"loss": 1.2818,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.6391658782958984,
"rewards/margins": 0.3883659839630127,
"rewards/rejected": -2.027531623840332,
"step": 426
},
{
"epoch": 0.6455026455026455,
"epsilon_dpo/beta": 0.017497073858976364,
"epsilon_dpo/beta_margin_grad_mean": -0.37794923782348633,
"epsilon_dpo/beta_margin_grad_std": 0.20528697967529297,
"epsilon_dpo/beta_margin_mean": 0.6316925883293152,
"epsilon_dpo/beta_margin_std": 1.0771772861480713,
"epsilon_dpo/loss_margin_mean": 36.494667053222656,
"grad_norm": 17.935665130615234,
"kl/avg_steps": 0.4375,
"kl/beta": 0.017572201788425446,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.6948364510535218e-07,
"logits/chosen": 0.0007263254374265671,
"logits/rejected": -0.05288812518119812,
"logps/chosen": -147.6334228515625,
"logps/ref_chosen": -58.64276885986328,
"logps/ref_rejected": -86.25437927246094,
"logps/rejected": -211.73968505859375,
"loss": 1.0889,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.560344934463501,
"rewards/margins": 0.6316925287246704,
"rewards/rejected": -2.192037582397461,
"step": 427
},
{
"epoch": 0.6470143613000756,
"epsilon_dpo/beta": 0.017404451966285706,
"epsilon_dpo/beta_margin_grad_mean": -0.35697662830352783,
"epsilon_dpo/beta_margin_grad_std": 0.20317521691322327,
"epsilon_dpo/beta_margin_mean": 0.7461524605751038,
"epsilon_dpo/beta_margin_std": 1.0847582817077637,
"epsilon_dpo/loss_margin_mean": 43.21971130371094,
"grad_norm": 15.791139602661133,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01749565824866295,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.6823300917064458e-07,
"logits/chosen": -0.09639132022857666,
"logits/rejected": -0.2238897979259491,
"logps/chosen": -149.83340454101562,
"logps/ref_chosen": -66.5960464477539,
"logps/ref_rejected": -82.3941650390625,
"logps/rejected": -208.85122680664062,
"loss": 1.008,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.4517005681991577,
"rewards/margins": 0.7461525201797485,
"rewards/rejected": -2.1978530883789062,
"step": 428
},
{
"epoch": 0.6485260770975056,
"epsilon_dpo/beta": 0.017323357984423637,
"epsilon_dpo/beta_margin_grad_mean": -0.3801988661289215,
"epsilon_dpo/beta_margin_grad_std": 0.19638431072235107,
"epsilon_dpo/beta_margin_mean": 0.5862859487533569,
"epsilon_dpo/beta_margin_std": 0.9739435315132141,
"epsilon_dpo/loss_margin_mean": 34.19995880126953,
"grad_norm": 17.651044845581055,
"kl/avg_steps": 0.46875,
"kl/beta": 0.01740320399403572,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.669846604344412e-07,
"logits/chosen": 0.015349796041846275,
"logits/rejected": 0.04521708935499191,
"logps/chosen": -145.02532958984375,
"logps/ref_chosen": -57.009700775146484,
"logps/ref_rejected": -59.86549377441406,
"logps/rejected": -182.08108520507812,
"loss": 1.0882,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.5289678573608398,
"rewards/margins": 0.5862859487533569,
"rewards/rejected": -2.1152536869049072,
"step": 429
},
{
"epoch": 0.6500377928949358,
"epsilon_dpo/beta": 0.017215466126799583,
"epsilon_dpo/beta_margin_grad_mean": -0.3352311849594116,
"epsilon_dpo/beta_margin_grad_std": 0.1836235374212265,
"epsilon_dpo/beta_margin_mean": 0.8115749359130859,
"epsilon_dpo/beta_margin_std": 0.9579644203186035,
"epsilon_dpo/loss_margin_mean": 47.41362762451172,
"grad_norm": 15.476751327514648,
"kl/avg_steps": 0.625,
"kl/beta": 0.017322007566690445,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.6573863381573954e-07,
"logits/chosen": 0.029597945511341095,
"logits/rejected": -0.035756662487983704,
"logps/chosen": -139.48040771484375,
"logps/ref_chosen": -59.563194274902344,
"logps/ref_rejected": -70.52289581298828,
"logps/rejected": -197.85372924804688,
"loss": 0.9205,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.3787541389465332,
"rewards/margins": 0.8115749359130859,
"rewards/rejected": -2.190329074859619,
"step": 430
},
{
"epoch": 0.6515495086923658,
"epsilon_dpo/beta": 0.01711391843855381,
"epsilon_dpo/beta_margin_grad_mean": -0.3737926483154297,
"epsilon_dpo/beta_margin_grad_std": 0.18102532625198364,
"epsilon_dpo/beta_margin_mean": 0.6166346669197083,
"epsilon_dpo/beta_margin_std": 0.9225481152534485,
"epsilon_dpo/loss_margin_mean": 36.305259704589844,
"grad_norm": 14.876779556274414,
"kl/avg_steps": 0.59375,
"kl/beta": 0.017214417457580566,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.6449496416858282e-07,
"logits/chosen": 0.18211492896080017,
"logits/rejected": 0.013401351869106293,
"logps/chosen": -124.92241668701172,
"logps/ref_chosen": -50.20032501220703,
"logps/ref_rejected": -77.81680297851562,
"logps/rejected": -188.8441619873047,
"loss": 1.0412,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.2807905673980713,
"rewards/margins": 0.6166346669197083,
"rewards/rejected": -1.8974252939224243,
"step": 431
},
{
"epoch": 0.6530612244897959,
"epsilon_dpo/beta": 0.017044993117451668,
"epsilon_dpo/beta_margin_grad_mean": -0.37144941091537476,
"epsilon_dpo/beta_margin_grad_std": 0.192764014005661,
"epsilon_dpo/beta_margin_mean": 0.6323844194412231,
"epsilon_dpo/beta_margin_std": 0.9669424891471863,
"epsilon_dpo/loss_margin_mean": 37.45354080200195,
"grad_norm": 15.999431610107422,
"kl/avg_steps": 0.40625,
"kl/beta": 0.017112810164690018,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.632536862810844e-07,
"logits/chosen": -0.1155528575181961,
"logits/rejected": -0.09565869718790054,
"logps/chosen": -138.04913330078125,
"logps/ref_chosen": -61.662757873535156,
"logps/ref_rejected": -83.94496154785156,
"logps/rejected": -197.78488159179688,
"loss": 1.0504,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.3053680658340454,
"rewards/margins": 0.6323844194412231,
"rewards/rejected": -1.9377524852752686,
"step": 432
},
{
"epoch": 0.654572940287226,
"epsilon_dpo/beta": 0.016949394717812538,
"epsilon_dpo/beta_margin_grad_mean": -0.3422141373157501,
"epsilon_dpo/beta_margin_grad_std": 0.19215217232704163,
"epsilon_dpo/beta_margin_mean": 0.7830297946929932,
"epsilon_dpo/beta_margin_std": 0.9888100624084473,
"epsilon_dpo/loss_margin_mean": 46.536128997802734,
"grad_norm": 15.502524375915527,
"kl/avg_steps": 0.5625,
"kl/beta": 0.017043570056557655,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.6201483487445515e-07,
"logits/chosen": -0.028168167918920517,
"logits/rejected": 0.03759019821882248,
"logps/chosen": -140.24765014648438,
"logps/ref_chosen": -63.72918701171875,
"logps/ref_rejected": -65.8391342163086,
"logps/rejected": -188.8937225341797,
"loss": 0.9531,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.2998216152191162,
"rewards/margins": 0.7830297946929932,
"rewards/rejected": -2.0828514099121094,
"step": 433
},
{
"epoch": 0.656084656084656,
"epsilon_dpo/beta": 0.01684929057955742,
"epsilon_dpo/beta_margin_grad_mean": -0.34700122475624084,
"epsilon_dpo/beta_margin_grad_std": 0.20768187940120697,
"epsilon_dpo/beta_margin_mean": 0.7490586638450623,
"epsilon_dpo/beta_margin_std": 1.0590612888336182,
"epsilon_dpo/loss_margin_mean": 44.82007598876953,
"grad_norm": 14.662243843078613,
"kl/avg_steps": 0.59375,
"kl/beta": 0.016948236152529716,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.6077844460203204e-07,
"logits/chosen": 0.1233740970492363,
"logits/rejected": -0.008235976099967957,
"logps/chosen": -116.51690673828125,
"logps/ref_chosen": -47.97331619262695,
"logps/ref_rejected": -72.51132202148438,
"logps/rejected": -185.875,
"loss": 1.009,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1575994491577148,
"rewards/margins": 0.7490586042404175,
"rewards/rejected": -1.9066579341888428,
"step": 434
},
{
"epoch": 0.6575963718820862,
"epsilon_dpo/beta": 0.016781434416770935,
"epsilon_dpo/beta_margin_grad_mean": -0.3677992820739746,
"epsilon_dpo/beta_margin_grad_std": 0.19565437734127045,
"epsilon_dpo/beta_margin_mean": 0.6558788418769836,
"epsilon_dpo/beta_margin_std": 0.9668457508087158,
"epsilon_dpo/loss_margin_mean": 39.47189712524414,
"grad_norm": 17.670854568481445,
"kl/avg_steps": 0.40625,
"kl/beta": 0.01684820093214512,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.5954455004830878e-07,
"logits/chosen": 0.016416650265455246,
"logits/rejected": -0.07025650888681412,
"logps/chosen": -136.03875732421875,
"logps/ref_chosen": -57.06024932861328,
"logps/ref_rejected": -71.69146728515625,
"logps/rejected": -190.14187622070312,
"loss": 1.0347,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.3289178609848022,
"rewards/margins": 0.6558787822723389,
"rewards/rejected": -1.9847967624664307,
"step": 435
},
{
"epoch": 0.6591080876795162,
"epsilon_dpo/beta": 0.0167030468583107,
"epsilon_dpo/beta_margin_grad_mean": -0.380288302898407,
"epsilon_dpo/beta_margin_grad_std": 0.1991802155971527,
"epsilon_dpo/beta_margin_mean": 0.5946735143661499,
"epsilon_dpo/beta_margin_std": 0.9789596796035767,
"epsilon_dpo/loss_margin_mean": 35.97812271118164,
"grad_norm": 16.151695251464844,
"kl/avg_steps": 0.46875,
"kl/beta": 0.016780031844973564,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.5831318572796847e-07,
"logits/chosen": -0.05882483348250389,
"logits/rejected": -0.1354832947254181,
"logps/chosen": -132.9088897705078,
"logps/ref_chosen": -56.158050537109375,
"logps/ref_rejected": -67.63787841796875,
"logps/rejected": -180.3668212890625,
"loss": 1.0848,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.284294843673706,
"rewards/margins": 0.5946735143661499,
"rewards/rejected": -1.878968358039856,
"step": 436
},
{
"epoch": 0.6606198034769464,
"epsilon_dpo/beta": 0.01665121503174305,
"epsilon_dpo/beta_margin_grad_mean": -0.38308537006378174,
"epsilon_dpo/beta_margin_grad_std": 0.21530242264270782,
"epsilon_dpo/beta_margin_mean": 0.59850013256073,
"epsilon_dpo/beta_margin_std": 1.1140815019607544,
"epsilon_dpo/loss_margin_mean": 36.433902740478516,
"grad_norm": 18.0877742767334,
"kl/avg_steps": 0.3125,
"kl/beta": 0.016701743006706238,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 1.5708438608491815e-07,
"logits/chosen": -0.0194876566529274,
"logits/rejected": -0.22391854226589203,
"logps/chosen": -140.8734893798828,
"logps/ref_chosen": -56.98578643798828,
"logps/ref_rejected": -85.61524963378906,
"logps/rejected": -205.93685913085938,
"loss": 1.1347,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4000355005264282,
"rewards/margins": 0.59850013256073,
"rewards/rejected": -1.9985356330871582,
"step": 437
},
{
"epoch": 0.6621315192743764,
"epsilon_dpo/beta": 0.016557713970541954,
"epsilon_dpo/beta_margin_grad_mean": -0.3412574231624603,
"epsilon_dpo/beta_margin_grad_std": 0.19060277938842773,
"epsilon_dpo/beta_margin_mean": 0.8231085538864136,
"epsilon_dpo/beta_margin_std": 1.0279713869094849,
"epsilon_dpo/loss_margin_mean": 50.02913284301758,
"grad_norm": 16.26816749572754,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01664971187710762,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.558581854913253e-07,
"logits/chosen": 0.15658235549926758,
"logits/rejected": 0.014725517481565475,
"logps/chosen": -114.3604965209961,
"logps/ref_chosen": -41.27777862548828,
"logps/ref_rejected": -65.33840942382812,
"logps/rejected": -188.45025634765625,
"loss": 0.9318,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2123374938964844,
"rewards/margins": 0.8231085538864136,
"rewards/rejected": -2.0354461669921875,
"step": 438
},
{
"epoch": 0.6636432350718064,
"epsilon_dpo/beta": 0.01645992323756218,
"epsilon_dpo/beta_margin_grad_mean": -0.35800254344940186,
"epsilon_dpo/beta_margin_grad_std": 0.17168234288692474,
"epsilon_dpo/beta_margin_mean": 0.6950428485870361,
"epsilon_dpo/beta_margin_std": 0.8970963358879089,
"epsilon_dpo/loss_margin_mean": 42.48575973510742,
"grad_norm": 15.933945655822754,
"kl/avg_steps": 0.59375,
"kl/beta": 0.01655658148229122,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.5463461824665658e-07,
"logits/chosen": -0.144636332988739,
"logits/rejected": -0.1910811960697174,
"logps/chosen": -161.16375732421875,
"logps/ref_chosen": -81.41764831542969,
"logps/ref_rejected": -94.72309875488281,
"logps/rejected": -216.9549560546875,
"loss": 0.9737,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.3161146640777588,
"rewards/margins": 0.6950427889823914,
"rewards/rejected": -2.011157512664795,
"step": 439
},
{
"epoch": 0.6651549508692366,
"epsilon_dpo/beta": 0.016373056918382645,
"epsilon_dpo/beta_margin_grad_mean": -0.3605916500091553,
"epsilon_dpo/beta_margin_grad_std": 0.19463224709033966,
"epsilon_dpo/beta_margin_mean": 0.7015513181686401,
"epsilon_dpo/beta_margin_std": 0.9930663704872131,
"epsilon_dpo/loss_margin_mean": 43.208988189697266,
"grad_norm": 23.889144897460938,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01645885780453682,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.534137185767178e-07,
"logits/chosen": 0.18619604408740997,
"logits/rejected": -0.023444700986146927,
"logps/chosen": -115.22410583496094,
"logps/ref_chosen": -42.538185119628906,
"logps/ref_rejected": -69.78813934326172,
"logps/rejected": -185.68304443359375,
"loss": 1.0091,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1933023929595947,
"rewards/margins": 0.7015513181686401,
"rewards/rejected": -1.8948535919189453,
"step": 440
},
{
"epoch": 0.6666666666666666,
"epsilon_dpo/beta": 0.01626606658101082,
"epsilon_dpo/beta_margin_grad_mean": -0.33554089069366455,
"epsilon_dpo/beta_margin_grad_std": 0.16279840469360352,
"epsilon_dpo/beta_margin_mean": 0.7940685153007507,
"epsilon_dpo/beta_margin_std": 0.8216511011123657,
"epsilon_dpo/loss_margin_mean": 49.05491638183594,
"grad_norm": 16.21149444580078,
"kl/avg_steps": 0.65625,
"kl/beta": 0.01637188158929348,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.521955206326976e-07,
"logits/chosen": 0.01356169581413269,
"logits/rejected": -0.16893848776817322,
"logps/chosen": -125.37178802490234,
"logps/ref_chosen": -57.593223571777344,
"logps/ref_rejected": -84.82878875732422,
"logps/rejected": -201.66226196289062,
"loss": 0.8838,
"rewards/accuracies": 0.828125,
"rewards/chosen": -1.1033220291137695,
"rewards/margins": 0.7940685153007507,
"rewards/rejected": -1.897390604019165,
"step": 441
},
{
"epoch": 0.6681783824640968,
"epsilon_dpo/beta": 0.016175266355276108,
"epsilon_dpo/beta_margin_grad_mean": -0.3511590361595154,
"epsilon_dpo/beta_margin_grad_std": 0.18610098958015442,
"epsilon_dpo/beta_margin_mean": 0.7301819324493408,
"epsilon_dpo/beta_margin_std": 0.9202057719230652,
"epsilon_dpo/loss_margin_mean": 45.47758865356445,
"grad_norm": 16.281423568725586,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01626514084637165,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.5098005849021078e-07,
"logits/chosen": -0.0566435307264328,
"logits/rejected": -0.1968882828950882,
"logps/chosen": -154.2303466796875,
"logps/ref_chosen": -67.46121978759766,
"logps/ref_rejected": -89.0693588256836,
"logps/rejected": -221.31607055664062,
"loss": 0.9644,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.4060349464416504,
"rewards/margins": 0.7301819324493408,
"rewards/rejected": -2.136216878890991,
"step": 442
},
{
"epoch": 0.6696900982615268,
"epsilon_dpo/beta": 0.016074679791927338,
"epsilon_dpo/beta_margin_grad_mean": -0.3204159736633301,
"epsilon_dpo/beta_margin_grad_std": 0.19219955801963806,
"epsilon_dpo/beta_margin_mean": 0.9077427983283997,
"epsilon_dpo/beta_margin_std": 1.033774495124817,
"epsilon_dpo/loss_margin_mean": 56.80141067504883,
"grad_norm": 16.50442123413086,
"kl/avg_steps": 0.625,
"kl/beta": 0.016174161806702614,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.4976736614834662e-07,
"logits/chosen": 0.005667464341968298,
"logits/rejected": -0.14641402661800385,
"logps/chosen": -125.54917907714844,
"logps/ref_chosen": -54.79609680175781,
"logps/ref_rejected": -77.80782318115234,
"logps/rejected": -205.3623046875,
"loss": 0.8848,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.1396013498306274,
"rewards/margins": 0.9077427387237549,
"rewards/rejected": -2.047344207763672,
"step": 443
},
{
"epoch": 0.671201814058957,
"epsilon_dpo/beta": 0.016020050272345543,
"epsilon_dpo/beta_margin_grad_mean": -0.4323629140853882,
"epsilon_dpo/beta_margin_grad_std": 0.19742116332054138,
"epsilon_dpo/beta_margin_mean": 0.3304816484451294,
"epsilon_dpo/beta_margin_std": 0.950802206993103,
"epsilon_dpo/loss_margin_mean": 21.030738830566406,
"grad_norm": 22.008689880371094,
"kl/avg_steps": 0.34375,
"kl/beta": 0.01607370190322399,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.4855747752871654e-07,
"logits/chosen": 0.015946775674819946,
"logits/rejected": -0.22191354632377625,
"logps/chosen": -149.5116729736328,
"logps/ref_chosen": -58.749061584472656,
"logps/ref_rejected": -86.87397003173828,
"logps/rejected": -198.66732788085938,
"loss": 1.2849,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.4570257663726807,
"rewards/margins": 0.3304816484451294,
"rewards/rejected": -1.78750741481781,
"step": 444
},
{
"epoch": 0.672713529856387,
"epsilon_dpo/beta": 0.015920111909508705,
"epsilon_dpo/beta_margin_grad_mean": -0.34919169545173645,
"epsilon_dpo/beta_margin_grad_std": 0.18469832837581635,
"epsilon_dpo/beta_margin_mean": 0.7448247075080872,
"epsilon_dpo/beta_margin_std": 0.9299260377883911,
"epsilon_dpo/loss_margin_mean": 47.08465576171875,
"grad_norm": 16.90455436706543,
"kl/avg_steps": 0.625,
"kl/beta": 0.016018636524677277,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.473504264745062e-07,
"logits/chosen": -0.047003570944070816,
"logits/rejected": -0.044804759323596954,
"logps/chosen": -146.5565948486328,
"logps/ref_chosen": -60.91743850708008,
"logps/ref_rejected": -71.56373596191406,
"logps/rejected": -204.2875518798828,
"loss": 0.9563,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.3663113117218018,
"rewards/margins": 0.7448246479034424,
"rewards/rejected": -2.111135959625244,
"step": 445
},
{
"epoch": 0.674225245653817,
"epsilon_dpo/beta": 0.01582122966647148,
"epsilon_dpo/beta_margin_grad_mean": -0.3263431489467621,
"epsilon_dpo/beta_margin_grad_std": 0.16399583220481873,
"epsilon_dpo/beta_margin_mean": 0.8518962860107422,
"epsilon_dpo/beta_margin_std": 0.8549835085868835,
"epsilon_dpo/loss_margin_mean": 54.100502014160156,
"grad_norm": 12.506586074829102,
"kl/avg_steps": 0.625,
"kl/beta": 0.015919141471385956,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": 0.24987921118736267,
"logits/rejected": -0.026499008759856224,
"logps/chosen": -123.0771484375,
"logps/ref_chosen": -48.79924774169922,
"logps/ref_rejected": -71.87195587158203,
"logps/rejected": -200.25035095214844,
"loss": 0.8549,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.1769723892211914,
"rewards/margins": 0.8518962264060974,
"rewards/rejected": -2.0288686752319336,
"step": 446
},
{
"epoch": 0.6757369614512472,
"epsilon_dpo/beta": 0.015727905556559563,
"epsilon_dpo/beta_margin_grad_mean": -0.3208446800708771,
"epsilon_dpo/beta_margin_grad_std": 0.19948740303516388,
"epsilon_dpo/beta_margin_mean": 0.8883765935897827,
"epsilon_dpo/beta_margin_std": 1.0160611867904663,
"epsilon_dpo/loss_margin_mean": 56.877159118652344,
"grad_norm": 16.29720687866211,
"kl/avg_steps": 0.59375,
"kl/beta": 0.01582026481628418,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.4494497203727843e-07,
"logits/chosen": 0.04142617806792259,
"logits/rejected": -0.1503468155860901,
"logps/chosen": -125.75507354736328,
"logps/ref_chosen": -53.682716369628906,
"logps/ref_rejected": -88.17315673828125,
"logps/rejected": -217.1226806640625,
"loss": 0.9004,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.1368310451507568,
"rewards/margins": 0.8883765935897827,
"rewards/rejected": -2.025207757949829,
"step": 447
},
{
"epoch": 0.6772486772486772,
"epsilon_dpo/beta": 0.0156252421438694,
"epsilon_dpo/beta_margin_grad_mean": -0.35464411973953247,
"epsilon_dpo/beta_margin_grad_std": 0.18226809799671173,
"epsilon_dpo/beta_margin_mean": 0.6988101005554199,
"epsilon_dpo/beta_margin_std": 0.9170963168144226,
"epsilon_dpo/loss_margin_mean": 45.00851821899414,
"grad_norm": 13.480257034301758,
"kl/avg_steps": 0.65625,
"kl/beta": 0.015726886689662933,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.4374663593999256e-07,
"logits/chosen": -0.06806192547082901,
"logits/rejected": -0.1533464789390564,
"logps/chosen": -133.1595916748047,
"logps/ref_chosen": -53.75125503540039,
"logps/ref_rejected": -77.17623901367188,
"logps/rejected": -201.59307861328125,
"loss": 0.984,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2408463954925537,
"rewards/margins": 0.6988101005554199,
"rewards/rejected": -1.9396564960479736,
"step": 448
},
{
"epoch": 0.6787603930461074,
"epsilon_dpo/beta": 0.0155722014605999,
"epsilon_dpo/beta_margin_grad_mean": -0.42443975806236267,
"epsilon_dpo/beta_margin_grad_std": 0.17208167910575867,
"epsilon_dpo/beta_margin_mean": 0.3604929745197296,
"epsilon_dpo/beta_margin_std": 0.8101447820663452,
"epsilon_dpo/loss_margin_mean": 23.50426483154297,
"grad_norm": 21.256118774414062,
"kl/avg_steps": 0.34375,
"kl/beta": 0.015624352730810642,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.4255127197770707e-07,
"logits/chosen": -0.19139866530895233,
"logits/rejected": -0.10306224226951599,
"logps/chosen": -173.113037109375,
"logps/ref_chosen": -75.82737731933594,
"logps/ref_rejected": -82.20687103271484,
"logps/rejected": -202.99679565429688,
"loss": 1.2055,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.516629934310913,
"rewards/margins": 0.360493004322052,
"rewards/rejected": -1.8771228790283203,
"step": 449
},
{
"epoch": 0.6802721088435374,
"epsilon_dpo/beta": 0.015518855303525925,
"epsilon_dpo/beta_margin_grad_mean": -0.4014636278152466,
"epsilon_dpo/beta_margin_grad_std": 0.18613174557685852,
"epsilon_dpo/beta_margin_mean": 0.4953595995903015,
"epsilon_dpo/beta_margin_std": 0.9263343214988708,
"epsilon_dpo/loss_margin_mean": 32.31964874267578,
"grad_norm": 17.194866180419922,
"kl/avg_steps": 0.34375,
"kl/beta": 0.015570827759802341,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.4135891358732205e-07,
"logits/chosen": 0.2150343358516693,
"logits/rejected": -0.1200256198644638,
"logps/chosen": -124.6307144165039,
"logps/ref_chosen": -47.11572265625,
"logps/ref_rejected": -78.7546615600586,
"logps/rejected": -188.58929443359375,
"loss": 1.1352,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.2055349349975586,
"rewards/margins": 0.4953596293926239,
"rewards/rejected": -1.7008945941925049,
"step": 450
},
{
"epoch": 0.6817838246409675,
"epsilon_dpo/beta": 0.015460841357707977,
"epsilon_dpo/beta_margin_grad_mean": -0.3902263641357422,
"epsilon_dpo/beta_margin_grad_std": 0.1784912347793579,
"epsilon_dpo/beta_margin_mean": 0.529263436794281,
"epsilon_dpo/beta_margin_std": 0.8688886165618896,
"epsilon_dpo/loss_margin_mean": 34.60202407836914,
"grad_norm": 16.85890769958496,
"kl/avg_steps": 0.375,
"kl/beta": 0.01551748625934124,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 1.4016959412166437e-07,
"logits/chosen": -0.008814550004899502,
"logits/rejected": -0.12918636202812195,
"logps/chosen": -141.05792236328125,
"logps/ref_chosen": -63.350440979003906,
"logps/ref_rejected": -76.28530883789062,
"logps/rejected": -188.59481811523438,
"loss": 1.09,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.203305721282959,
"rewards/margins": 0.529263436794281,
"rewards/rejected": -1.7325690984725952,
"step": 451
},
{
"epoch": 0.6832955404383976,
"epsilon_dpo/beta": 0.0153789222240448,
"epsilon_dpo/beta_margin_grad_mean": -0.37822604179382324,
"epsilon_dpo/beta_margin_grad_std": 0.19475796818733215,
"epsilon_dpo/beta_margin_mean": 0.5874215960502625,
"epsilon_dpo/beta_margin_std": 0.9609581232070923,
"epsilon_dpo/loss_margin_mean": 38.5745735168457,
"grad_norm": 16.045228958129883,
"kl/avg_steps": 0.53125,
"kl/beta": 0.015459513291716576,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.3898334684855645e-07,
"logits/chosen": 0.044762223958969116,
"logits/rejected": -0.13101793825626373,
"logps/chosen": -134.6223907470703,
"logps/ref_chosen": -55.585838317871094,
"logps/ref_rejected": -77.68738555908203,
"logps/rejected": -195.29852294921875,
"loss": 1.0828,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2179051637649536,
"rewards/margins": 0.5874216556549072,
"rewards/rejected": -1.8053267002105713,
"step": 452
},
{
"epoch": 0.6848072562358276,
"epsilon_dpo/beta": 0.0153120718896389,
"epsilon_dpo/beta_margin_grad_mean": -0.36968758702278137,
"epsilon_dpo/beta_margin_grad_std": 0.1911747306585312,
"epsilon_dpo/beta_margin_mean": 0.6542624235153198,
"epsilon_dpo/beta_margin_std": 0.9553431868553162,
"epsilon_dpo/loss_margin_mean": 43.13039779663086,
"grad_norm": 20.01287078857422,
"kl/avg_steps": 0.4375,
"kl/beta": 0.01537781860679388,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.3780020494988445e-07,
"logits/chosen": -0.12334619462490082,
"logits/rejected": -0.11555872857570648,
"logps/chosen": -137.42770385742188,
"logps/ref_chosen": -61.778202056884766,
"logps/ref_rejected": -71.51402282714844,
"logps/rejected": -190.29393005371094,
"loss": 1.029,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.1602611541748047,
"rewards/margins": 0.6542624235153198,
"rewards/rejected": -1.814523696899414,
"step": 453
},
{
"epoch": 0.6863189720332578,
"epsilon_dpo/beta": 0.015221447683870792,
"epsilon_dpo/beta_margin_grad_mean": -0.3616742789745331,
"epsilon_dpo/beta_margin_grad_std": 0.1899394989013672,
"epsilon_dpo/beta_margin_mean": 0.6655922532081604,
"epsilon_dpo/beta_margin_std": 0.9803752303123474,
"epsilon_dpo/loss_margin_mean": 44.06951141357422,
"grad_norm": 13.59394645690918,
"kl/avg_steps": 0.59375,
"kl/beta": 0.015310833230614662,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.366202015206706e-07,
"logits/chosen": 0.05573238432407379,
"logits/rejected": -0.008013417944312096,
"logps/chosen": -123.97268676757812,
"logps/ref_chosen": -51.59515380859375,
"logps/ref_rejected": -63.967323303222656,
"logps/rejected": -180.41436767578125,
"loss": 1.03,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1037403345108032,
"rewards/margins": 0.6655922532081604,
"rewards/rejected": -1.7693325281143188,
"step": 454
},
{
"epoch": 0.6878306878306878,
"epsilon_dpo/beta": 0.015143472701311111,
"epsilon_dpo/beta_margin_grad_mean": -0.36568930745124817,
"epsilon_dpo/beta_margin_grad_std": 0.1735781878232956,
"epsilon_dpo/beta_margin_mean": 0.6613116264343262,
"epsilon_dpo/beta_margin_std": 0.8814070820808411,
"epsilon_dpo/loss_margin_mean": 43.99155044555664,
"grad_norm": 15.623213768005371,
"kl/avg_steps": 0.515625,
"kl/beta": 0.01522046234458685,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.354433695681474e-07,
"logits/chosen": -0.13695141673088074,
"logits/rejected": -0.18374785780906677,
"logps/chosen": -151.96755981445312,
"logps/ref_chosen": -70.65170288085938,
"logps/ref_rejected": -77.44276428222656,
"logps/rejected": -202.75015258789062,
"loss": 0.9931,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2335149049758911,
"rewards/margins": 0.6613115668296814,
"rewards/rejected": -1.8948265314102173,
"step": 455
},
{
"epoch": 0.6893424036281179,
"epsilon_dpo/beta": 0.015063446946442127,
"epsilon_dpo/beta_margin_grad_mean": -0.3790503740310669,
"epsilon_dpo/beta_margin_grad_std": 0.17220792174339294,
"epsilon_dpo/beta_margin_mean": 0.5817795991897583,
"epsilon_dpo/beta_margin_std": 0.855984628200531,
"epsilon_dpo/loss_margin_mean": 38.93759536743164,
"grad_norm": 18.083881378173828,
"kl/avg_steps": 0.53125,
"kl/beta": 0.015142383985221386,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.3426974201083439e-07,
"logits/chosen": -0.034226901829242706,
"logits/rejected": -0.16684238612651825,
"logps/chosen": -136.68612670898438,
"logps/ref_chosen": -56.398284912109375,
"logps/ref_rejected": -82.61642456054688,
"logps/rejected": -201.84185791015625,
"loss": 1.0434,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2117884159088135,
"rewards/margins": 0.5817795991897583,
"rewards/rejected": -1.7935680150985718,
"step": 456
},
{
"epoch": 0.690854119425548,
"epsilon_dpo/beta": 0.014969722367823124,
"epsilon_dpo/beta_margin_grad_mean": -0.37519484758377075,
"epsilon_dpo/beta_margin_grad_std": 0.15988659858703613,
"epsilon_dpo/beta_margin_mean": 0.5802949666976929,
"epsilon_dpo/beta_margin_std": 0.7644326090812683,
"epsilon_dpo/loss_margin_mean": 39.0189323425293,
"grad_norm": 13.618075370788574,
"kl/avg_steps": 0.625,
"kl/beta": 0.015062365680932999,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.3309935167761717e-07,
"logits/chosen": 0.17056697607040405,
"logits/rejected": -0.1440895050764084,
"logps/chosen": -128.39015197753906,
"logps/ref_chosen": -44.72057342529297,
"logps/ref_rejected": -68.11585998535156,
"logps/rejected": -190.80438232421875,
"loss": 1.0165,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.254011869430542,
"rewards/margins": 0.5802949666976929,
"rewards/rejected": -1.8343068361282349,
"step": 457
},
{
"epoch": 0.6923658352229781,
"epsilon_dpo/beta": 0.014886099845170975,
"epsilon_dpo/beta_margin_grad_mean": -0.36608678102493286,
"epsilon_dpo/beta_margin_grad_std": 0.17732404172420502,
"epsilon_dpo/beta_margin_mean": 0.6359953880310059,
"epsilon_dpo/beta_margin_std": 0.8864153623580933,
"epsilon_dpo/loss_margin_mean": 43.05495071411133,
"grad_norm": 13.512868881225586,
"kl/avg_steps": 0.5625,
"kl/beta": 0.014968810603022575,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.3193223130682936e-07,
"logits/chosen": 0.057013943791389465,
"logits/rejected": -0.260597825050354,
"logps/chosen": -125.82892608642578,
"logps/ref_chosen": -50.00569152832031,
"logps/ref_rejected": -87.50015258789062,
"logps/rejected": -206.37832641601562,
"loss": 1.0169,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1297353506088257,
"rewards/margins": 0.6359953880310059,
"rewards/rejected": -1.7657307386398315,
"step": 458
},
{
"epoch": 0.6938775510204082,
"epsilon_dpo/beta": 0.014784225262701511,
"epsilon_dpo/beta_margin_grad_mean": -0.3394293189048767,
"epsilon_dpo/beta_margin_grad_std": 0.18479669094085693,
"epsilon_dpo/beta_margin_mean": 0.7821560502052307,
"epsilon_dpo/beta_margin_std": 0.9194261431694031,
"epsilon_dpo/loss_margin_mean": 53.202415466308594,
"grad_norm": 15.157086372375488,
"kl/avg_steps": 0.6875,
"kl/beta": 0.01488508190959692,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 1.3076841354533658e-07,
"logits/chosen": -0.16891610622406006,
"logits/rejected": -0.07677589356899261,
"logps/chosen": -137.9151153564453,
"logps/ref_chosen": -65.37794494628906,
"logps/ref_rejected": -88.19244384765625,
"logps/rejected": -213.93202209472656,
"loss": 0.9297,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.07389235496521,
"rewards/margins": 0.7821560502052307,
"rewards/rejected": -1.8560484647750854,
"step": 459
},
{
"epoch": 0.6953892668178382,
"epsilon_dpo/beta": 0.014697139151394367,
"epsilon_dpo/beta_margin_grad_mean": -0.3432950973510742,
"epsilon_dpo/beta_margin_grad_std": 0.1727389097213745,
"epsilon_dpo/beta_margin_mean": 0.7734363079071045,
"epsilon_dpo/beta_margin_std": 0.8833531737327576,
"epsilon_dpo/loss_margin_mean": 52.92116928100586,
"grad_norm": 14.610294342041016,
"kl/avg_steps": 0.59375,
"kl/beta": 0.014783445745706558,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.2960793094762345e-07,
"logits/chosen": -0.06524206697940826,
"logits/rejected": -0.30612558126449585,
"logps/chosen": -144.34349060058594,
"logps/ref_chosen": -64.5616683959961,
"logps/ref_rejected": -88.67889404296875,
"logps/rejected": -221.38189697265625,
"loss": 0.9173,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1751583814620972,
"rewards/margins": 0.7734363079071045,
"rewards/rejected": -1.9485946893692017,
"step": 460
},
{
"epoch": 0.6969009826152683,
"epsilon_dpo/beta": 0.014619575813412666,
"epsilon_dpo/beta_margin_grad_mean": -0.3441314399242401,
"epsilon_dpo/beta_margin_grad_std": 0.16998042166233063,
"epsilon_dpo/beta_margin_mean": 0.7581866979598999,
"epsilon_dpo/beta_margin_std": 0.8434395790100098,
"epsilon_dpo/loss_margin_mean": 52.199790954589844,
"grad_norm": 13.166162490844727,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01469618733972311,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.2845081597488286e-07,
"logits/chosen": 0.07498809695243835,
"logits/rejected": -0.14307263493537903,
"logps/chosen": -115.65630340576172,
"logps/ref_chosen": -49.4779167175293,
"logps/ref_rejected": -72.65262603759766,
"logps/rejected": -191.03082275390625,
"loss": 0.916,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9694328308105469,
"rewards/margins": 0.7581866979598999,
"rewards/rejected": -1.7276195287704468,
"step": 461
},
{
"epoch": 0.6984126984126984,
"epsilon_dpo/beta": 0.014524044468998909,
"epsilon_dpo/beta_margin_grad_mean": -0.3293147385120392,
"epsilon_dpo/beta_margin_grad_std": 0.1750185787677765,
"epsilon_dpo/beta_margin_mean": 0.8249046206474304,
"epsilon_dpo/beta_margin_std": 0.8749585747718811,
"epsilon_dpo/loss_margin_mean": 57.090484619140625,
"grad_norm": 13.01965045928955,
"kl/avg_steps": 0.65625,
"kl/beta": 0.014618526212871075,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.27297100994108e-07,
"logits/chosen": 0.03168656677007675,
"logits/rejected": -0.0870504230260849,
"logps/chosen": -134.78570556640625,
"logps/ref_chosen": -60.4951171875,
"logps/ref_rejected": -74.82137298583984,
"logps/rejected": -206.20245361328125,
"loss": 0.8851,
"rewards/accuracies": 0.84375,
"rewards/chosen": -1.0809142589569092,
"rewards/margins": 0.8249046206474304,
"rewards/rejected": -1.9058189392089844,
"step": 462
},
{
"epoch": 0.6999244142101285,
"epsilon_dpo/beta": 0.014456585049629211,
"epsilon_dpo/beta_margin_grad_mean": -0.3923404812812805,
"epsilon_dpo/beta_margin_grad_std": 0.1637619286775589,
"epsilon_dpo/beta_margin_mean": 0.49296700954437256,
"epsilon_dpo/beta_margin_std": 0.7714128494262695,
"epsilon_dpo/loss_margin_mean": 34.431114196777344,
"grad_norm": 18.325389862060547,
"kl/avg_steps": 0.46875,
"kl/beta": 0.01452321745455265,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.2614681827718695e-07,
"logits/chosen": -0.138390451669693,
"logits/rejected": -0.07783595472574234,
"logps/chosen": -154.4484405517578,
"logps/ref_chosen": -67.68511962890625,
"logps/ref_rejected": -71.32196044921875,
"logps/rejected": -192.51638793945312,
"loss": 1.0863,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2572212219238281,
"rewards/margins": 0.49296700954437256,
"rewards/rejected": -1.7501882314682007,
"step": 463
},
{
"epoch": 0.7014361300075586,
"epsilon_dpo/beta": 0.014384618028998375,
"epsilon_dpo/beta_margin_grad_mean": -0.35438039898872375,
"epsilon_dpo/beta_margin_grad_std": 0.19094325602054596,
"epsilon_dpo/beta_margin_mean": 0.7150393128395081,
"epsilon_dpo/beta_margin_std": 0.9422991275787354,
"epsilon_dpo/loss_margin_mean": 50.12311935424805,
"grad_norm": 15.382913589477539,
"kl/avg_steps": 0.5,
"kl/beta": 0.014455457217991352,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -0.011155502870678902,
"logits/rejected": -0.02159612998366356,
"logps/chosen": -142.5160675048828,
"logps/ref_chosen": -59.16564178466797,
"logps/ref_rejected": -69.56146240234375,
"logps/rejected": -203.03500366210938,
"loss": 0.9842,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.2018377780914307,
"rewards/margins": 0.7150393128395081,
"rewards/rejected": -1.916877031326294,
"step": 464
},
{
"epoch": 0.7029478458049887,
"epsilon_dpo/beta": 0.014331034384667873,
"epsilon_dpo/beta_margin_grad_mean": -0.384671688079834,
"epsilon_dpo/beta_margin_grad_std": 0.19037847220897675,
"epsilon_dpo/beta_margin_mean": 0.560808002948761,
"epsilon_dpo/beta_margin_std": 0.9024878740310669,
"epsilon_dpo/loss_margin_mean": 39.585304260253906,
"grad_norm": 18.341367721557617,
"kl/avg_steps": 0.375,
"kl/beta": 0.01438353955745697,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 1.238566782415197e-07,
"logits/chosen": 0.04115644842386246,
"logits/rejected": -0.1283925622701645,
"logps/chosen": -146.37130737304688,
"logps/ref_chosen": -58.513671875,
"logps/ref_rejected": -84.31745910644531,
"logps/rejected": -211.76039123535156,
"loss": 1.0824,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2623660564422607,
"rewards/margins": 0.560808002948761,
"rewards/rejected": -1.823173999786377,
"step": 465
},
{
"epoch": 0.7044595616024187,
"epsilon_dpo/beta": 0.01428197231143713,
"epsilon_dpo/beta_margin_grad_mean": -0.41949617862701416,
"epsilon_dpo/beta_margin_grad_std": 0.1664215475320816,
"epsilon_dpo/beta_margin_mean": 0.38105759024620056,
"epsilon_dpo/beta_margin_std": 0.8050876259803772,
"epsilon_dpo/loss_margin_mean": 27.044538497924805,
"grad_norm": 19.665855407714844,
"kl/avg_steps": 0.34375,
"kl/beta": 0.014329803176224232,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.2271688498291334e-07,
"logits/chosen": -0.022899843752384186,
"logits/rejected": -0.05235084146261215,
"logps/chosen": -164.94752502441406,
"logps/ref_chosen": -73.26580810546875,
"logps/ref_rejected": -74.83621215820312,
"logps/rejected": -193.56246948242188,
"loss": 1.1845,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.3135833740234375,
"rewards/margins": 0.38105762004852295,
"rewards/rejected": -1.69464111328125,
"step": 466
},
{
"epoch": 0.7059712773998488,
"epsilon_dpo/beta": 0.01420180406421423,
"epsilon_dpo/beta_margin_grad_mean": -0.38733479380607605,
"epsilon_dpo/beta_margin_grad_std": 0.1536182165145874,
"epsilon_dpo/beta_margin_mean": 0.53230881690979,
"epsilon_dpo/beta_margin_std": 0.7820398807525635,
"epsilon_dpo/loss_margin_mean": 37.7493896484375,
"grad_norm": 14.837164878845215,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01428071316331625,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.2158065210664848e-07,
"logits/chosen": 0.09981206059455872,
"logits/rejected": -0.288411021232605,
"logps/chosen": -133.34310913085938,
"logps/ref_chosen": -47.57947540283203,
"logps/ref_rejected": -78.68522644042969,
"logps/rejected": -202.1982421875,
"loss": 1.0504,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.219231367111206,
"rewards/margins": 0.53230881690979,
"rewards/rejected": -1.751540184020996,
"step": 467
},
{
"epoch": 0.7074829931972789,
"epsilon_dpo/beta": 0.01410905085504055,
"epsilon_dpo/beta_margin_grad_mean": -0.32862117886543274,
"epsilon_dpo/beta_margin_grad_std": 0.1745777279138565,
"epsilon_dpo/beta_margin_mean": 0.8363341093063354,
"epsilon_dpo/beta_margin_std": 0.8792763352394104,
"epsilon_dpo/loss_margin_mean": 59.58964157104492,
"grad_norm": 15.589622497558594,
"kl/avg_steps": 0.65625,
"kl/beta": 0.01420083362609148,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.204480113956011e-07,
"logits/chosen": -0.10584881901741028,
"logits/rejected": -0.023347195237874985,
"logps/chosen": -139.39561462402344,
"logps/ref_chosen": -63.92778778076172,
"logps/ref_rejected": -76.51626586914062,
"logps/rejected": -211.57373046875,
"loss": 0.8779,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0657360553741455,
"rewards/margins": 0.8363341093063354,
"rewards/rejected": -1.9020702838897705,
"step": 468
},
{
"epoch": 0.708994708994709,
"epsilon_dpo/beta": 0.014039110392332077,
"epsilon_dpo/beta_margin_grad_mean": -0.3573172986507416,
"epsilon_dpo/beta_margin_grad_std": 0.159497931599617,
"epsilon_dpo/beta_margin_mean": 0.6745405793190002,
"epsilon_dpo/beta_margin_std": 0.7813970446586609,
"epsilon_dpo/loss_margin_mean": 48.362491607666016,
"grad_norm": 17.070011138916016,
"kl/avg_steps": 0.5,
"kl/beta": 0.014108248054981232,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.1931899453216697e-07,
"logits/chosen": -0.06523493677377701,
"logits/rejected": -0.023180361837148666,
"logps/chosen": -137.62957763671875,
"logps/ref_chosen": -59.05818176269531,
"logps/ref_rejected": -75.67672729492188,
"logps/rejected": -202.610595703125,
"loss": 0.9527,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1058293581008911,
"rewards/margins": 0.674540638923645,
"rewards/rejected": -1.7803699970245361,
"step": 469
},
{
"epoch": 0.7105064247921391,
"epsilon_dpo/beta": 0.013969264924526215,
"epsilon_dpo/beta_margin_grad_mean": -0.3630892336368561,
"epsilon_dpo/beta_margin_grad_std": 0.1783372461795807,
"epsilon_dpo/beta_margin_mean": 0.645988404750824,
"epsilon_dpo/beta_margin_std": 0.8631333708763123,
"epsilon_dpo/loss_margin_mean": 46.6306266784668,
"grad_norm": 13.700272560119629,
"kl/avg_steps": 0.5,
"kl/beta": 0.014038057997822762,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.1819363309737438e-07,
"logits/chosen": 0.13314181566238403,
"logits/rejected": -0.04929421842098236,
"logps/chosen": -128.344482421875,
"logps/ref_chosen": -47.86743927001953,
"logps/ref_rejected": -65.96858978271484,
"logps/rejected": -193.07626342773438,
"loss": 1.0042,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.127017617225647,
"rewards/margins": 0.6459884643554688,
"rewards/rejected": -1.7730062007904053,
"step": 470
},
{
"epoch": 0.7120181405895691,
"epsilon_dpo/beta": 0.013882302679121494,
"epsilon_dpo/beta_margin_grad_mean": -0.3426089286804199,
"epsilon_dpo/beta_margin_grad_std": 0.18046066164970398,
"epsilon_dpo/beta_margin_mean": 0.7614877223968506,
"epsilon_dpo/beta_margin_std": 0.9007505178451538,
"epsilon_dpo/loss_margin_mean": 55.18793869018555,
"grad_norm": 14.91497802734375,
"kl/avg_steps": 0.625,
"kl/beta": 0.013968216255307198,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.1707195857000215e-07,
"logits/chosen": -0.034831296652555466,
"logits/rejected": -0.114321768283844,
"logps/chosen": -131.39889526367188,
"logps/ref_chosen": -57.77785110473633,
"logps/ref_rejected": -73.81172180175781,
"logps/rejected": -202.62069702148438,
"loss": 0.9359,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0248136520385742,
"rewards/margins": 0.7614877223968506,
"rewards/rejected": -1.7863013744354248,
"step": 471
},
{
"epoch": 0.7135298563869993,
"epsilon_dpo/beta": 0.013809092342853546,
"epsilon_dpo/beta_margin_grad_mean": -0.37804096937179565,
"epsilon_dpo/beta_margin_grad_std": 0.19511382281780243,
"epsilon_dpo/beta_margin_mean": 0.6068339943885803,
"epsilon_dpo/beta_margin_std": 0.9758617877960205,
"epsilon_dpo/loss_margin_mean": 44.36206817626953,
"grad_norm": 15.633830070495605,
"kl/avg_steps": 0.53125,
"kl/beta": 0.013881457038223743,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.1595400232569768e-07,
"logits/chosen": -0.07246048748493195,
"logits/rejected": -0.10945230722427368,
"logps/chosen": -131.42694091796875,
"logps/ref_chosen": -55.908668518066406,
"logps/ref_rejected": -74.70294189453125,
"logps/rejected": -194.5832977294922,
"loss": 1.0719,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0450999736785889,
"rewards/margins": 0.6068340539932251,
"rewards/rejected": -1.6519339084625244,
"step": 472
},
{
"epoch": 0.7150415721844293,
"epsilon_dpo/beta": 0.013744750991463661,
"epsilon_dpo/beta_margin_grad_mean": -0.3755965828895569,
"epsilon_dpo/beta_margin_grad_std": 0.20187832415103912,
"epsilon_dpo/beta_margin_mean": 0.6269193291664124,
"epsilon_dpo/beta_margin_std": 1.0167042016983032,
"epsilon_dpo/loss_margin_mean": 46.08133316040039,
"grad_norm": 17.12528419494629,
"kl/avg_steps": 0.46875,
"kl/beta": 0.01380810234695673,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.1483979563610069e-07,
"logits/chosen": 0.09769396483898163,
"logits/rejected": -0.32806509733200073,
"logps/chosen": -129.93846130371094,
"logps/ref_chosen": -54.16088104248047,
"logps/ref_rejected": -92.76789855957031,
"logps/rejected": -214.62680053710938,
"loss": 1.0739,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.04404616355896,
"rewards/margins": 0.6269192695617676,
"rewards/rejected": -1.6709654331207275,
"step": 473
},
{
"epoch": 0.7165532879818595,
"epsilon_dpo/beta": 0.013689213432371616,
"epsilon_dpo/beta_margin_grad_mean": -0.39714428782463074,
"epsilon_dpo/beta_margin_grad_std": 0.19270876049995422,
"epsilon_dpo/beta_margin_mean": 0.5017508864402771,
"epsilon_dpo/beta_margin_std": 0.9321697950363159,
"epsilon_dpo/loss_margin_mean": 37.101463317871094,
"grad_norm": 21.937671661376953,
"kl/avg_steps": 0.40625,
"kl/beta": 0.013743678107857704,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.1372936966796709e-07,
"logits/chosen": 0.10181444883346558,
"logits/rejected": -0.13675163686275482,
"logps/chosen": -132.3435821533203,
"logps/ref_chosen": -46.685707092285156,
"logps/ref_rejected": -71.44731140136719,
"logps/rejected": -194.20664978027344,
"loss": 1.1374,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.1751048564910889,
"rewards/margins": 0.5017508864402771,
"rewards/rejected": -1.6768558025360107,
"step": 474
},
{
"epoch": 0.7180650037792895,
"epsilon_dpo/beta": 0.01359960250556469,
"epsilon_dpo/beta_margin_grad_mean": -0.3274012804031372,
"epsilon_dpo/beta_margin_grad_std": 0.16534771025180817,
"epsilon_dpo/beta_margin_mean": 0.8390607833862305,
"epsilon_dpo/beta_margin_std": 0.8349558115005493,
"epsilon_dpo/loss_margin_mean": 61.976905822753906,
"grad_norm": 11.486339569091797,
"kl/avg_steps": 0.65625,
"kl/beta": 0.013688070699572563,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.126227554822985e-07,
"logits/chosen": -0.13027337193489075,
"logits/rejected": -0.17294619977474213,
"logps/chosen": -136.9379425048828,
"logps/ref_chosen": -58.4873046875,
"logps/ref_rejected": -87.00187683105469,
"logps/rejected": -227.42942810058594,
"loss": 0.8597,
"rewards/accuracies": 0.859375,
"rewards/chosen": -1.068725824356079,
"rewards/margins": 0.8390607833862305,
"rewards/rejected": -1.9077866077423096,
"step": 475
},
{
"epoch": 0.7195767195767195,
"epsilon_dpo/beta": 0.013536437414586544,
"epsilon_dpo/beta_margin_grad_mean": -0.38742998242378235,
"epsilon_dpo/beta_margin_grad_std": 0.18115252256393433,
"epsilon_dpo/beta_margin_mean": 0.5441389679908752,
"epsilon_dpo/beta_margin_std": 0.8730788230895996,
"epsilon_dpo/loss_margin_mean": 40.6083984375,
"grad_norm": 16.15215492248535,
"kl/avg_steps": 0.46875,
"kl/beta": 0.013598828576505184,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.1151998403347243e-07,
"logits/chosen": -0.17047910392284393,
"logits/rejected": -0.1555979698896408,
"logps/chosen": -169.92681884765625,
"logps/ref_chosen": -75.38162231445312,
"logps/ref_rejected": -76.99822235107422,
"logps/rejected": -212.15182495117188,
"loss": 1.0812,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2838433980941772,
"rewards/margins": 0.54413902759552,
"rewards/rejected": -1.8279824256896973,
"step": 476
},
{
"epoch": 0.7210884353741497,
"epsilon_dpo/beta": 0.013469051569700241,
"epsilon_dpo/beta_margin_grad_mean": -0.3848317563533783,
"epsilon_dpo/beta_margin_grad_std": 0.2032901793718338,
"epsilon_dpo/beta_margin_mean": 0.5557514429092407,
"epsilon_dpo/beta_margin_std": 0.9845151305198669,
"epsilon_dpo/loss_margin_mean": 41.73151397705078,
"grad_norm": 17.519760131835938,
"kl/avg_steps": 0.5,
"kl/beta": 0.01353538129478693,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.1042108616837692e-07,
"logits/chosen": 0.03746385499835014,
"logits/rejected": -0.127159982919693,
"logps/chosen": -155.70777893066406,
"logps/ref_chosen": -61.073387145996094,
"logps/ref_rejected": -81.34375,
"logps/rejected": -217.70965576171875,
"loss": 1.1192,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.278522253036499,
"rewards/margins": 0.5557514429092407,
"rewards/rejected": -1.8342738151550293,
"step": 477
},
{
"epoch": 0.7226001511715797,
"epsilon_dpo/beta": 0.013418877497315407,
"epsilon_dpo/beta_margin_grad_mean": -0.4020891487598419,
"epsilon_dpo/beta_margin_grad_std": 0.1974584460258484,
"epsilon_dpo/beta_margin_mean": 0.46954864263534546,
"epsilon_dpo/beta_margin_std": 0.9394667744636536,
"epsilon_dpo/loss_margin_mean": 35.48004913330078,
"grad_norm": 16.75385093688965,
"kl/avg_steps": 0.375,
"kl/beta": 0.013468041084706783,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 1.0932609262554746e-07,
"logits/chosen": -0.044872041791677475,
"logits/rejected": 0.034967903047800064,
"logps/chosen": -140.67608642578125,
"logps/ref_chosen": -57.16731643676758,
"logps/ref_rejected": -53.309181213378906,
"logps/rejected": -172.29800415039062,
"loss": 1.1681,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1226396560668945,
"rewards/margins": 0.46954867243766785,
"rewards/rejected": -1.5921882390975952,
"step": 478
},
{
"epoch": 0.7241118669690099,
"epsilon_dpo/beta": 0.013364551588892937,
"epsilon_dpo/beta_margin_grad_mean": -0.4109058976173401,
"epsilon_dpo/beta_margin_grad_std": 0.16968494653701782,
"epsilon_dpo/beta_margin_mean": 0.4123218059539795,
"epsilon_dpo/beta_margin_std": 0.8247367739677429,
"epsilon_dpo/loss_margin_mean": 31.247920989990234,
"grad_norm": 16.825471878051758,
"kl/avg_steps": 0.40625,
"kl/beta": 0.013417724519968033,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.0823503403430734e-07,
"logits/chosen": 0.006298096850514412,
"logits/rejected": -0.10416960716247559,
"logps/chosen": -145.26657104492188,
"logps/ref_chosen": -58.91331481933594,
"logps/ref_rejected": -63.7403450012207,
"logps/rejected": -181.34152221679688,
"loss": 1.1658,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.155928611755371,
"rewards/margins": 0.4123218059539795,
"rewards/rejected": -1.5682504177093506,
"step": 479
},
{
"epoch": 0.7256235827664399,
"epsilon_dpo/beta": 0.013277065940201283,
"epsilon_dpo/beta_margin_grad_mean": -0.36171403527259827,
"epsilon_dpo/beta_margin_grad_std": 0.1862575113773346,
"epsilon_dpo/beta_margin_mean": 0.6519140005111694,
"epsilon_dpo/beta_margin_std": 0.9125059247016907,
"epsilon_dpo/loss_margin_mean": 49.46089172363281,
"grad_norm": 17.718896865844727,
"kl/avg_steps": 0.65625,
"kl/beta": 0.013363435864448547,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.0714794091391072e-07,
"logits/chosen": -0.057031869888305664,
"logits/rejected": 0.05434707552194595,
"logps/chosen": -148.45436096191406,
"logps/ref_chosen": -62.80060577392578,
"logps/ref_rejected": -67.58859252929688,
"logps/rejected": -202.70323181152344,
"loss": 1.0178,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1397075653076172,
"rewards/margins": 0.6519140005111694,
"rewards/rejected": -1.7916215658187866,
"step": 480
},
{
"epoch": 0.72713529856387,
"epsilon_dpo/beta": 0.01321954745799303,
"epsilon_dpo/beta_margin_grad_mean": -0.3718944191932678,
"epsilon_dpo/beta_margin_grad_std": 0.17709095776081085,
"epsilon_dpo/beta_margin_mean": 0.6156706809997559,
"epsilon_dpo/beta_margin_std": 0.8768129944801331,
"epsilon_dpo/loss_margin_mean": 46.98955535888672,
"grad_norm": 14.313308715820312,
"kl/avg_steps": 0.4375,
"kl/beta": 0.013276309706270695,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.0606484367268906e-07,
"logits/chosen": -0.07522377371788025,
"logits/rejected": -0.045152582228183746,
"logps/chosen": -150.4244384765625,
"logps/ref_chosen": -65.28649139404297,
"logps/ref_rejected": -70.78668212890625,
"logps/rejected": -202.9141845703125,
"loss": 1.0275,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.1279120445251465,
"rewards/margins": 0.6156706809997559,
"rewards/rejected": -1.7435826063156128,
"step": 481
},
{
"epoch": 0.7286470143613001,
"epsilon_dpo/beta": 0.013155747205018997,
"epsilon_dpo/beta_margin_grad_mean": -0.39164191484451294,
"epsilon_dpo/beta_margin_grad_std": 0.20249617099761963,
"epsilon_dpo/beta_margin_mean": 0.5469822883605957,
"epsilon_dpo/beta_margin_std": 1.0474293231964111,
"epsilon_dpo/loss_margin_mean": 42.05437088012695,
"grad_norm": 18.229524612426758,
"kl/avg_steps": 0.484375,
"kl/beta": 0.013218479230999947,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.0498577260720048e-07,
"logits/chosen": 0.0013577770441770554,
"logits/rejected": -0.28564998507499695,
"logps/chosen": -157.193359375,
"logps/ref_chosen": -60.906185150146484,
"logps/ref_rejected": -103.44656372070312,
"logps/rejected": -241.78811645507812,
"loss": 1.142,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.2702873945236206,
"rewards/margins": 0.5469822883605957,
"rewards/rejected": -1.8172696828842163,
"step": 482
},
{
"epoch": 0.7301587301587301,
"epsilon_dpo/beta": 0.01308207307010889,
"epsilon_dpo/beta_margin_grad_mean": -0.3583935797214508,
"epsilon_dpo/beta_margin_grad_std": 0.1710551530122757,
"epsilon_dpo/beta_margin_mean": 0.6657871603965759,
"epsilon_dpo/beta_margin_std": 0.8378815054893494,
"epsilon_dpo/loss_margin_mean": 51.241188049316406,
"grad_norm": 14.750130653381348,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01315476093441248,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.0391075790138232e-07,
"logits/chosen": 0.11684601753950119,
"logits/rejected": -0.061854369938373566,
"logps/chosen": -134.9259033203125,
"logps/ref_chosen": -53.192012786865234,
"logps/ref_rejected": -81.83927154541016,
"logps/rejected": -214.81436157226562,
"loss": 0.98,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0708601474761963,
"rewards/margins": 0.6657872200012207,
"rewards/rejected": -1.736647367477417,
"step": 483
},
{
"epoch": 0.7316704459561603,
"epsilon_dpo/beta": 0.01301298663020134,
"epsilon_dpo/beta_margin_grad_mean": -0.386461466550827,
"epsilon_dpo/beta_margin_grad_std": 0.1438293755054474,
"epsilon_dpo/beta_margin_mean": 0.5141505599021912,
"epsilon_dpo/beta_margin_std": 0.6710637211799622,
"epsilon_dpo/loss_margin_mean": 39.80889129638672,
"grad_norm": 18.822967529296875,
"kl/avg_steps": 0.53125,
"kl/beta": 0.01308117900043726,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.0283982962570681e-07,
"logits/chosen": -0.009363815188407898,
"logits/rejected": -0.08153226226568222,
"logps/chosen": -141.2456817626953,
"logps/ref_chosen": -57.76945877075195,
"logps/ref_rejected": -71.6829833984375,
"logps/rejected": -194.96810913085938,
"loss": 1.0377,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0872807502746582,
"rewards/margins": 0.5141505599021912,
"rewards/rejected": -1.6014313697814941,
"step": 484
},
{
"epoch": 0.7331821617535903,
"epsilon_dpo/beta": 0.012944220565259457,
"epsilon_dpo/beta_margin_grad_mean": -0.3859536945819855,
"epsilon_dpo/beta_margin_grad_std": 0.16117826104164124,
"epsilon_dpo/beta_margin_mean": 0.5364850163459778,
"epsilon_dpo/beta_margin_std": 0.7896075248718262,
"epsilon_dpo/loss_margin_mean": 41.76102066040039,
"grad_norm": 14.586181640625,
"kl/avg_steps": 0.53125,
"kl/beta": 0.013012052513659,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.0177301773633992e-07,
"logits/chosen": -0.09010796993970871,
"logits/rejected": -0.05839370936155319,
"logps/chosen": -139.747802734375,
"logps/ref_chosen": -56.63584899902344,
"logps/ref_rejected": -70.85614013671875,
"logps/rejected": -195.72911071777344,
"loss": 1.0556,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0769765377044678,
"rewards/margins": 0.536484956741333,
"rewards/rejected": -1.6134614944458008,
"step": 485
},
{
"epoch": 0.7346938775510204,
"epsilon_dpo/beta": 0.012900088913738728,
"epsilon_dpo/beta_margin_grad_mean": -0.4082988202571869,
"epsilon_dpo/beta_margin_grad_std": 0.1839274764060974,
"epsilon_dpo/beta_margin_mean": 0.4521438777446747,
"epsilon_dpo/beta_margin_std": 0.9206711053848267,
"epsilon_dpo/loss_margin_mean": 35.52356719970703,
"grad_norm": 14.925753593444824,
"kl/avg_steps": 0.34375,
"kl/beta": 0.01294329110532999,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.007103520743035e-07,
"logits/chosen": 0.1375911682844162,
"logits/rejected": -0.1547698974609375,
"logps/chosen": -158.2030029296875,
"logps/ref_chosen": -56.347023010253906,
"logps/ref_rejected": -85.97221374511719,
"logps/rejected": -223.35177612304688,
"loss": 1.1662,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.3167333602905273,
"rewards/margins": 0.45214390754699707,
"rewards/rejected": -1.7688772678375244,
"step": 486
},
{
"epoch": 0.7362055933484505,
"epsilon_dpo/beta": 0.012835739180445671,
"epsilon_dpo/beta_margin_grad_mean": -0.3854285478591919,
"epsilon_dpo/beta_margin_grad_std": 0.1614953726530075,
"epsilon_dpo/beta_margin_mean": 0.5375082492828369,
"epsilon_dpo/beta_margin_std": 0.8063103556632996,
"epsilon_dpo/loss_margin_mean": 42.232818603515625,
"grad_norm": 17.51795196533203,
"kl/avg_steps": 0.5,
"kl/beta": 0.012898950837552547,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 9.965186236464046e-08,
"logits/chosen": 0.034427061676979065,
"logits/rejected": -0.15124982595443726,
"logps/chosen": -152.9064483642578,
"logps/ref_chosen": -60.617218017578125,
"logps/ref_rejected": -82.5097427368164,
"logps/rejected": -217.03179931640625,
"loss": 1.0581,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1869494915008545,
"rewards/margins": 0.5375082492828369,
"rewards/rejected": -1.7244577407836914,
"step": 487
},
{
"epoch": 0.7377173091458806,
"epsilon_dpo/beta": 0.0127638578414917,
"epsilon_dpo/beta_margin_grad_mean": -0.36659687757492065,
"epsilon_dpo/beta_margin_grad_std": 0.1597447246313095,
"epsilon_dpo/beta_margin_mean": 0.6339870691299438,
"epsilon_dpo/beta_margin_std": 0.7843267917633057,
"epsilon_dpo/loss_margin_mean": 49.98493957519531,
"grad_norm": 17.00872802734375,
"kl/avg_steps": 0.5625,
"kl/beta": 0.012834777124226093,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 9.859757821558337e-08,
"logits/chosen": -0.09179073572158813,
"logits/rejected": -0.2324819415807724,
"logps/chosen": -145.92892456054688,
"logps/ref_chosen": -63.10905456542969,
"logps/ref_rejected": -82.49348449707031,
"logps/rejected": -215.2982940673828,
"loss": 0.9819,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.058620810508728,
"rewards/margins": 0.6339870691299438,
"rewards/rejected": -1.6926078796386719,
"step": 488
},
{
"epoch": 0.7392290249433107,
"epsilon_dpo/beta": 0.012724372558295727,
"epsilon_dpo/beta_margin_grad_mean": -0.42152947187423706,
"epsilon_dpo/beta_margin_grad_std": 0.1758275330066681,
"epsilon_dpo/beta_margin_mean": 0.3680368959903717,
"epsilon_dpo/beta_margin_std": 0.8166444301605225,
"epsilon_dpo/loss_margin_mean": 29.376705169677734,
"grad_norm": 15.848600387573242,
"kl/avg_steps": 0.3125,
"kl/beta": 0.012762985192239285,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 9.754752911772615e-08,
"logits/chosen": -0.06368907541036606,
"logits/rejected": -0.17668470740318298,
"logps/chosen": -161.76226806640625,
"logps/ref_chosen": -64.98896026611328,
"logps/ref_rejected": -84.39607238769531,
"logps/rejected": -210.54608154296875,
"loss": 1.2034,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2333425283432007,
"rewards/margins": 0.3680368661880493,
"rewards/rejected": -1.60137939453125,
"step": 489
},
{
"epoch": 0.7407407407407407,
"epsilon_dpo/beta": 0.012684733606874943,
"epsilon_dpo/beta_margin_grad_mean": -0.40088528394699097,
"epsilon_dpo/beta_margin_grad_std": 0.20213083922863007,
"epsilon_dpo/beta_margin_mean": 0.47017014026641846,
"epsilon_dpo/beta_margin_std": 0.9762210845947266,
"epsilon_dpo/loss_margin_mean": 37.632205963134766,
"grad_norm": 13.472871780395508,
"kl/avg_steps": 0.3125,
"kl/beta": 0.0127232251688838,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 9.650174444319956e-08,
"logits/chosen": 0.0033436529338359833,
"logits/rejected": -0.06486822664737701,
"logps/chosen": -148.0041046142578,
"logps/ref_chosen": -61.90874481201172,
"logps/ref_rejected": -70.58566284179688,
"logps/rejected": -194.313232421875,
"loss": 1.1822,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0953683853149414,
"rewards/margins": 0.47017017006874084,
"rewards/rejected": -1.5655385255813599,
"step": 490
},
{
"epoch": 0.7422524565381708,
"epsilon_dpo/beta": 0.012621432542800903,
"epsilon_dpo/beta_margin_grad_mean": -0.3884417414665222,
"epsilon_dpo/beta_margin_grad_std": 0.1630062758922577,
"epsilon_dpo/beta_margin_mean": 0.5196285247802734,
"epsilon_dpo/beta_margin_std": 0.773684561252594,
"epsilon_dpo/loss_margin_mean": 41.5296745300293,
"grad_norm": 13.528923034667969,
"kl/avg_steps": 0.5,
"kl/beta": 0.01268358901143074,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 9.546025344484868e-08,
"logits/chosen": 0.02480892837047577,
"logits/rejected": -0.04510752111673355,
"logps/chosen": -141.18540954589844,
"logps/ref_chosen": -55.47570037841797,
"logps/ref_rejected": -78.70318603515625,
"logps/rejected": -205.94256591796875,
"loss": 1.0654,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0840129852294922,
"rewards/margins": 0.5196285247802734,
"rewards/rejected": -1.6036415100097656,
"step": 491
},
{
"epoch": 0.7437641723356009,
"epsilon_dpo/beta": 0.012590194121003151,
"epsilon_dpo/beta_margin_grad_mean": -0.4135659337043762,
"epsilon_dpo/beta_margin_grad_std": 0.18651123344898224,
"epsilon_dpo/beta_margin_mean": 0.4122660756111145,
"epsilon_dpo/beta_margin_std": 0.8830878138542175,
"epsilon_dpo/loss_margin_mean": 33.266136169433594,
"grad_norm": 17.39338493347168,
"kl/avg_steps": 0.25,
"kl/beta": 0.012620486319065094,
"kl/n_epsilon_steps": 0.375,
"kl/p_epsilon_steps": 0.625,
"learning_rate": 9.442308525541589e-08,
"logits/chosen": -0.06004483997821808,
"logits/rejected": -0.22294960916042328,
"logps/chosen": -171.03338623046875,
"logps/ref_chosen": -67.28638458251953,
"logps/ref_rejected": -82.78628540039062,
"logps/rejected": -219.7994384765625,
"loss": 1.1909,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.3084372282028198,
"rewards/margins": 0.4122660756111145,
"rewards/rejected": -1.720703363418579,
"step": 492
},
{
"epoch": 0.745275888133031,
"epsilon_dpo/beta": 0.012515518814325333,
"epsilon_dpo/beta_margin_grad_mean": -0.3640889525413513,
"epsilon_dpo/beta_margin_grad_std": 0.16848108172416687,
"epsilon_dpo/beta_margin_mean": 0.6339695453643799,
"epsilon_dpo/beta_margin_std": 0.8145210146903992,
"epsilon_dpo/loss_margin_mean": 51.00349426269531,
"grad_norm": 13.730269432067871,
"kl/avg_steps": 0.59375,
"kl/beta": 0.012589014135301113,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 9.339026888672468e-08,
"logits/chosen": -0.011049837805330753,
"logits/rejected": -0.15957790613174438,
"logps/chosen": -138.27645874023438,
"logps/ref_chosen": -55.92750549316406,
"logps/ref_rejected": -79.12149810791016,
"logps/rejected": -212.47393798828125,
"loss": 0.9951,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0317935943603516,
"rewards/margins": 0.6339695453643799,
"rewards/rejected": -1.6657631397247314,
"step": 493
},
{
"epoch": 0.7467876039304611,
"epsilon_dpo/beta": 0.012465112842619419,
"epsilon_dpo/beta_margin_grad_mean": -0.3992280066013336,
"epsilon_dpo/beta_margin_grad_std": 0.18988929688930511,
"epsilon_dpo/beta_margin_mean": 0.47358325123786926,
"epsilon_dpo/beta_margin_std": 0.9350878000259399,
"epsilon_dpo/loss_margin_mean": 38.49197769165039,
"grad_norm": 17.38585090637207,
"kl/avg_steps": 0.40625,
"kl/beta": 0.012514707632362843,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 9.236183322886945e-08,
"logits/chosen": -0.1552366465330124,
"logits/rejected": -0.24309870600700378,
"logps/chosen": -154.8963623046875,
"logps/ref_chosen": -67.95411682128906,
"logps/ref_rejected": -90.50865936279297,
"logps/rejected": -215.94288635253906,
"loss": 1.1592,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.0862911939620972,
"rewards/margins": 0.47358325123786926,
"rewards/rejected": -1.559874415397644,
"step": 494
},
{
"epoch": 0.7482993197278912,
"epsilon_dpo/beta": 0.012426364235579967,
"epsilon_dpo/beta_margin_grad_mean": -0.4139139950275421,
"epsilon_dpo/beta_margin_grad_std": 0.16601787507534027,
"epsilon_dpo/beta_margin_mean": 0.41505637764930725,
"epsilon_dpo/beta_margin_std": 0.8066527843475342,
"epsilon_dpo/loss_margin_mean": 33.823486328125,
"grad_norm": 16.146984100341797,
"kl/avg_steps": 0.3125,
"kl/beta": 0.012464072555303574,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 9.133780704940594e-08,
"logits/chosen": 0.12984301149845123,
"logits/rejected": -0.12017878890037537,
"logps/chosen": -136.327392578125,
"logps/ref_chosen": -52.625465393066406,
"logps/ref_rejected": -72.06781005859375,
"logps/rejected": -189.59323120117188,
"loss": 1.1562,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0412909984588623,
"rewards/margins": 0.41505637764930725,
"rewards/rejected": -1.4563474655151367,
"step": 495
},
{
"epoch": 0.7498110355253212,
"epsilon_dpo/beta": 0.012383770197629929,
"epsilon_dpo/beta_margin_grad_mean": -0.3938678205013275,
"epsilon_dpo/beta_margin_grad_std": 0.17601278424263,
"epsilon_dpo/beta_margin_mean": 0.5370790362358093,
"epsilon_dpo/beta_margin_std": 0.9416787624359131,
"epsilon_dpo/loss_margin_mean": 43.853919982910156,
"grad_norm": 13.699808120727539,
"kl/avg_steps": 0.34375,
"kl/beta": 0.012425243854522705,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 9.031821899254797e-08,
"logits/chosen": 0.08078505098819733,
"logits/rejected": -0.25226420164108276,
"logps/chosen": -150.21319580078125,
"logps/ref_chosen": -57.597328186035156,
"logps/ref_rejected": -94.36127471923828,
"logps/rejected": -230.8310546875,
"loss": 1.095,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1493494510650635,
"rewards/margins": 0.5370790362358093,
"rewards/rejected": -1.6864285469055176,
"step": 496
},
{
"epoch": 0.7513227513227513,
"epsilon_dpo/beta": 0.012329736724495888,
"epsilon_dpo/beta_margin_grad_mean": -0.3691116273403168,
"epsilon_dpo/beta_margin_grad_std": 0.16679774224758148,
"epsilon_dpo/beta_margin_mean": 0.6055059432983398,
"epsilon_dpo/beta_margin_std": 0.7912006974220276,
"epsilon_dpo/loss_margin_mean": 49.53929138183594,
"grad_norm": 13.69522476196289,
"kl/avg_steps": 0.4375,
"kl/beta": 0.01238267868757248,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 8.930309757836516e-08,
"logits/chosen": -0.0926516056060791,
"logits/rejected": -0.10358744114637375,
"logps/chosen": -166.04916381835938,
"logps/ref_chosen": -72.78994750976562,
"logps/ref_rejected": -89.48483276367188,
"logps/rejected": -232.28335571289062,
"loss": 1.0085,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1533386707305908,
"rewards/margins": 0.6055059432983398,
"rewards/rejected": -1.7588446140289307,
"step": 497
},
{
"epoch": 0.7528344671201814,
"epsilon_dpo/beta": 0.012252910993993282,
"epsilon_dpo/beta_margin_grad_mean": -0.37015247344970703,
"epsilon_dpo/beta_margin_grad_std": 0.14894452691078186,
"epsilon_dpo/beta_margin_mean": 0.597204864025116,
"epsilon_dpo/beta_margin_std": 0.7268882989883423,
"epsilon_dpo/loss_margin_mean": 49.007041931152344,
"grad_norm": 14.688222885131836,
"kl/avg_steps": 0.625,
"kl/beta": 0.01232874020934105,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 8.829247120198563e-08,
"logits/chosen": -0.04416649788618088,
"logits/rejected": -0.060835979878902435,
"logps/chosen": -151.98118591308594,
"logps/ref_chosen": -68.36572265625,
"logps/ref_rejected": -71.28846740722656,
"logps/rejected": -203.91098022460938,
"loss": 0.9908,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.026533603668213,
"rewards/margins": 0.5972048044204712,
"rewards/rejected": -1.6237385272979736,
"step": 498
},
{
"epoch": 0.7543461829176115,
"epsilon_dpo/beta": 0.0121959513053298,
"epsilon_dpo/beta_margin_grad_mean": -0.3769929111003876,
"epsilon_dpo/beta_margin_grad_std": 0.19435811042785645,
"epsilon_dpo/beta_margin_mean": 0.587293803691864,
"epsilon_dpo/beta_margin_std": 0.9322723150253296,
"epsilon_dpo/loss_margin_mean": 48.67766189575195,
"grad_norm": 16.318862915039062,
"kl/avg_steps": 0.46875,
"kl/beta": 0.01225216407328844,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 8.728636813280163e-08,
"logits/chosen": -0.12170986086130142,
"logits/rejected": -0.2458992302417755,
"logps/chosen": -142.9403533935547,
"logps/ref_chosen": -61.90882873535156,
"logps/ref_rejected": -91.9411392211914,
"logps/rejected": -221.65032958984375,
"loss": 1.0743,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9905064105987549,
"rewards/margins": 0.587293803691864,
"rewards/rejected": -1.5778002738952637,
"step": 499
},
{
"epoch": 0.7558578987150416,
"epsilon_dpo/beta": 0.01213142741471529,
"epsilon_dpo/beta_margin_grad_mean": -0.3868768513202667,
"epsilon_dpo/beta_margin_grad_std": 0.1750926375389099,
"epsilon_dpo/beta_margin_mean": 0.518576979637146,
"epsilon_dpo/beta_margin_std": 0.814042866230011,
"epsilon_dpo/loss_margin_mean": 43.169132232666016,
"grad_norm": 17.110469818115234,
"kl/avg_steps": 0.53125,
"kl/beta": 0.012195000424981117,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -0.06761372089385986,
"logits/rejected": -0.023700576275587082,
"logps/chosen": -158.58779907226562,
"logps/ref_chosen": -70.225830078125,
"logps/ref_rejected": -71.72203063964844,
"logps/rejected": -203.25314331054688,
"loss": 1.0834,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0752438306808472,
"rewards/margins": 0.518576979637146,
"rewards/rejected": -1.5938208103179932,
"step": 500
},
{
"epoch": 0.7558578987150416,
"eval_epsilon_dpo/beta": 0.012077624909579754,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.39524921774864197,
"eval_epsilon_dpo/beta_margin_grad_std": 0.16700725257396698,
"eval_epsilon_dpo/beta_margin_mean": 0.48942071199417114,
"eval_epsilon_dpo/beta_margin_std": 0.7939777374267578,
"eval_epsilon_dpo/loss_margin_mean": 40.94007110595703,
"eval_kl/n_epsilon_steps": 0.27640846371650696,
"eval_kl/p_epsilon_steps": 0.7227112650871277,
"eval_logits/chosen": 0.021525084972381592,
"eval_logits/rejected": -0.08719652146100998,
"eval_logps/chosen": -158.35911560058594,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -203.9887237548828,
"eval_loss": 0.5506576895713806,
"eval_rewards/accuracies": 0.720950722694397,
"eval_rewards/chosen": -1.0110900402069092,
"eval_rewards/margins": 0.48942071199417114,
"eval_rewards/rejected": -1.500510573387146,
"eval_runtime": 41.7874,
"eval_samples_per_second": 55.112,
"eval_steps_per_second": 1.723,
"step": 500
},
{
"epoch": 0.7573696145124716,
"epsilon_dpo/beta": 0.012059737928211689,
"epsilon_dpo/beta_margin_grad_mean": -0.3916279673576355,
"epsilon_dpo/beta_margin_grad_std": 0.12956352531909943,
"epsilon_dpo/beta_margin_mean": 0.4797613024711609,
"epsilon_dpo/beta_margin_std": 0.6012536883354187,
"epsilon_dpo/loss_margin_mean": 40.0147705078125,
"grad_norm": 11.337005615234375,
"kl/avg_steps": 0.59375,
"kl/beta": 0.012130556628108025,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 8.528784436016878e-08,
"logits/chosen": -0.09394572675228119,
"logits/rejected": -0.06278929114341736,
"logps/chosen": -151.153076171875,
"logps/ref_chosen": -64.59880828857422,
"logps/ref_rejected": -70.59329223632812,
"logps/rejected": -197.16232299804688,
"loss": 1.0448,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0444523096084595,
"rewards/margins": 0.4797613024711609,
"rewards/rejected": -1.5242135524749756,
"step": 501
},
{
"epoch": 0.7588813303099018,
"epsilon_dpo/beta": 0.01199986133724451,
"epsilon_dpo/beta_margin_grad_mean": -0.38776537775993347,
"epsilon_dpo/beta_margin_grad_std": 0.14369182288646698,
"epsilon_dpo/beta_margin_mean": 0.509573757648468,
"epsilon_dpo/beta_margin_std": 0.6684770584106445,
"epsilon_dpo/loss_margin_mean": 42.791404724121094,
"grad_norm": 15.403996467590332,
"kl/avg_steps": 0.5,
"kl/beta": 0.012058956548571587,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 8.4295479559726e-08,
"logits/chosen": -0.027420366182923317,
"logits/rejected": -0.18210762739181519,
"logps/chosen": -151.18763732910156,
"logps/ref_chosen": -65.46662902832031,
"logps/ref_rejected": -90.22233581542969,
"logps/rejected": -218.7347412109375,
"loss": 1.0404,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0299993753433228,
"rewards/margins": 0.509573757648468,
"rewards/rejected": -1.5395731925964355,
"step": 502
},
{
"epoch": 0.7603930461073318,
"epsilon_dpo/beta": 0.011928911320865154,
"epsilon_dpo/beta_margin_grad_mean": -0.38798028230667114,
"epsilon_dpo/beta_margin_grad_std": 0.17388485372066498,
"epsilon_dpo/beta_margin_mean": 0.5134563446044922,
"epsilon_dpo/beta_margin_std": 0.8138607740402222,
"epsilon_dpo/loss_margin_mean": 43.42601013183594,
"grad_norm": 12.801469802856445,
"kl/avg_steps": 0.59375,
"kl/beta": 0.011998961679637432,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 8.330774987092712e-08,
"logits/chosen": 0.09116241335868835,
"logits/rejected": 0.16091768443584442,
"logps/chosen": -131.15020751953125,
"logps/ref_chosen": -51.83476257324219,
"logps/ref_rejected": -57.62522506713867,
"logps/rejected": -180.36666870117188,
"loss": 1.0867,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.948855996131897,
"rewards/margins": 0.5134563446044922,
"rewards/rejected": -1.4623124599456787,
"step": 503
},
{
"epoch": 0.7619047619047619,
"epsilon_dpo/beta": 0.011851045303046703,
"epsilon_dpo/beta_margin_grad_mean": -0.35295844078063965,
"epsilon_dpo/beta_margin_grad_std": 0.14565306901931763,
"epsilon_dpo/beta_margin_mean": 0.6760881543159485,
"epsilon_dpo/beta_margin_std": 0.6993220448493958,
"epsilon_dpo/loss_margin_mean": 57.321773529052734,
"grad_norm": 15.422840118408203,
"kl/avg_steps": 0.65625,
"kl/beta": 0.011928138323128223,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 8.232468292269479e-08,
"logits/chosen": -0.11517796665430069,
"logits/rejected": -0.15213042497634888,
"logps/chosen": -150.25379943847656,
"logps/ref_chosen": -68.65119934082031,
"logps/ref_rejected": -77.91394805908203,
"logps/rejected": -216.83831787109375,
"loss": 0.9273,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.9680932760238647,
"rewards/margins": 0.6760881543159485,
"rewards/rejected": -1.644181489944458,
"step": 504
},
{
"epoch": 0.763416477702192,
"epsilon_dpo/beta": 0.01181822270154953,
"epsilon_dpo/beta_margin_grad_mean": -0.4092603623867035,
"epsilon_dpo/beta_margin_grad_std": 0.1841905564069748,
"epsilon_dpo/beta_margin_mean": 0.4375028610229492,
"epsilon_dpo/beta_margin_std": 0.8769425749778748,
"epsilon_dpo/loss_margin_mean": 37.54341506958008,
"grad_norm": 15.376502990722656,
"kl/avg_steps": 0.28125,
"kl/beta": 0.011850370094180107,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 8.134630621352483e-08,
"logits/chosen": -0.11766748875379562,
"logits/rejected": -0.1298036277294159,
"logps/chosen": -144.8924560546875,
"logps/ref_chosen": -59.99884796142578,
"logps/ref_rejected": -76.88047790527344,
"logps/rejected": -199.3175048828125,
"loss": 1.1673,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.006791353225708,
"rewards/margins": 0.4375028610229492,
"rewards/rejected": -1.4442942142486572,
"step": 505
},
{
"epoch": 0.764928193499622,
"epsilon_dpo/beta": 0.011766611598432064,
"epsilon_dpo/beta_margin_grad_mean": -0.39573776721954346,
"epsilon_dpo/beta_margin_grad_std": 0.17577704787254333,
"epsilon_dpo/beta_margin_mean": 0.47944560647010803,
"epsilon_dpo/beta_margin_std": 0.8339173197746277,
"epsilon_dpo/loss_margin_mean": 41.21306610107422,
"grad_norm": 15.163553237915039,
"kl/avg_steps": 0.4375,
"kl/beta": 0.01181713491678238,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 8.037264711071698e-08,
"logits/chosen": -0.22006317973136902,
"logits/rejected": -0.22004762291908264,
"logps/chosen": -152.54010009765625,
"logps/ref_chosen": -70.07130432128906,
"logps/ref_rejected": -82.03775024414062,
"logps/rejected": -205.71961975097656,
"loss": 1.1188,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9732955694198608,
"rewards/margins": 0.47944557666778564,
"rewards/rejected": -1.4527411460876465,
"step": 506
},
{
"epoch": 0.7664399092970522,
"epsilon_dpo/beta": 0.011715356260538101,
"epsilon_dpo/beta_margin_grad_mean": -0.3934823274612427,
"epsilon_dpo/beta_margin_grad_std": 0.1719146966934204,
"epsilon_dpo/beta_margin_mean": 0.5199995636940002,
"epsilon_dpo/beta_margin_std": 0.8467255234718323,
"epsilon_dpo/loss_margin_mean": 44.81961441040039,
"grad_norm": 14.491789817810059,
"kl/avg_steps": 0.4375,
"kl/beta": 0.011765659786760807,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 7.940373284960933e-08,
"logits/chosen": -0.18367737531661987,
"logits/rejected": -0.20366671681404114,
"logps/chosen": -163.86795043945312,
"logps/ref_chosen": -72.00703430175781,
"logps/ref_rejected": -93.94987487792969,
"logps/rejected": -230.63040161132812,
"loss": 1.0866,
"rewards/accuracies": 0.703125,
"rewards/chosen": -1.0778284072875977,
"rewards/margins": 0.5199995636940002,
"rewards/rejected": -1.5978279113769531,
"step": 507
},
{
"epoch": 0.7679516250944822,
"epsilon_dpo/beta": 0.01165700238198042,
"epsilon_dpo/beta_margin_grad_mean": -0.3724977970123291,
"epsilon_dpo/beta_margin_grad_std": 0.1666453778743744,
"epsilon_dpo/beta_margin_mean": 0.5963394641876221,
"epsilon_dpo/beta_margin_std": 0.7914450764656067,
"epsilon_dpo/loss_margin_mean": 51.5748176574707,
"grad_norm": 14.877209663391113,
"kl/avg_steps": 0.5,
"kl/beta": 0.011714409105479717,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 7.843959053281663e-08,
"logits/chosen": -0.03610976040363312,
"logits/rejected": -0.3166627287864685,
"logps/chosen": -143.24783325195312,
"logps/ref_chosen": -60.21992492675781,
"logps/ref_rejected": -95.9200668334961,
"logps/rejected": -230.52279663085938,
"loss": 1.0147,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9691613912582397,
"rewards/margins": 0.5963395237922668,
"rewards/rejected": -1.5655009746551514,
"step": 508
},
{
"epoch": 0.7694633408919124,
"epsilon_dpo/beta": 0.011606293730437756,
"epsilon_dpo/beta_margin_grad_mean": -0.39447611570358276,
"epsilon_dpo/beta_margin_grad_std": 0.16088837385177612,
"epsilon_dpo/beta_margin_mean": 0.4742947816848755,
"epsilon_dpo/beta_margin_std": 0.7261459827423096,
"epsilon_dpo/loss_margin_mean": 41.306663513183594,
"grad_norm": 15.453798294067383,
"kl/avg_steps": 0.4375,
"kl/beta": 0.01165612880140543,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 7.748024712947204e-08,
"logits/chosen": 0.01344912126660347,
"logits/rejected": -0.06524886190891266,
"logps/chosen": -154.87969970703125,
"logps/ref_chosen": -66.2701644897461,
"logps/ref_rejected": -71.73065185546875,
"logps/rejected": -201.6468505859375,
"loss": 1.0887,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.0299005508422852,
"rewards/margins": 0.4742947816848755,
"rewards/rejected": -1.5041954517364502,
"step": 509
},
{
"epoch": 0.7709750566893424,
"epsilon_dpo/beta": 0.011541228741407394,
"epsilon_dpo/beta_margin_grad_mean": -0.38125261664390564,
"epsilon_dpo/beta_margin_grad_std": 0.17376817762851715,
"epsilon_dpo/beta_margin_mean": 0.5642380118370056,
"epsilon_dpo/beta_margin_std": 0.8332074284553528,
"epsilon_dpo/loss_margin_mean": 49.29336929321289,
"grad_norm": 13.736610412597656,
"kl/avg_steps": 0.5625,
"kl/beta": 0.011605354957282543,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 7.652572947447272e-08,
"logits/chosen": 0.09537425637245178,
"logits/rejected": -0.20089438557624817,
"logps/chosen": -141.93206787109375,
"logps/ref_chosen": -53.54487609863281,
"logps/ref_rejected": -91.36649322509766,
"logps/rejected": -229.04705810546875,
"loss": 1.0522,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.0215578079223633,
"rewards/margins": 0.5642380118370056,
"rewards/rejected": -1.5857958793640137,
"step": 510
},
{
"epoch": 0.7724867724867724,
"epsilon_dpo/beta": 0.011473065242171288,
"epsilon_dpo/beta_margin_grad_mean": -0.36531829833984375,
"epsilon_dpo/beta_margin_grad_std": 0.17535652220249176,
"epsilon_dpo/beta_margin_mean": 0.6306586861610413,
"epsilon_dpo/beta_margin_std": 0.8256549835205078,
"epsilon_dpo/loss_margin_mean": 55.38315963745117,
"grad_norm": 14.992674827575684,
"kl/avg_steps": 0.59375,
"kl/beta": 0.011540439911186695,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 7.557606426772961e-08,
"logits/chosen": -0.013582116924226284,
"logits/rejected": -0.18654537200927734,
"logps/chosen": -142.97750854492188,
"logps/ref_chosen": -55.844390869140625,
"logps/ref_rejected": -86.49819946289062,
"logps/rejected": -229.01449584960938,
"loss": 1.0036,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.002798318862915,
"rewards/margins": 0.6306586265563965,
"rewards/rejected": -1.6334569454193115,
"step": 511
},
{
"epoch": 0.7739984882842026,
"epsilon_dpo/beta": 0.01142685953527689,
"epsilon_dpo/beta_margin_grad_mean": -0.3992239534854889,
"epsilon_dpo/beta_margin_grad_std": 0.17349795997142792,
"epsilon_dpo/beta_margin_mean": 0.46488142013549805,
"epsilon_dpo/beta_margin_std": 0.7966252565383911,
"epsilon_dpo/loss_margin_mean": 41.146881103515625,
"grad_norm": 17.237150192260742,
"kl/avg_steps": 0.40625,
"kl/beta": 0.011472322978079319,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 7.463127807341966e-08,
"logits/chosen": -0.018309108912944794,
"logits/rejected": -0.0311952643096447,
"logps/chosen": -147.0172576904297,
"logps/ref_chosen": -61.653038024902344,
"logps/ref_rejected": -72.83148193359375,
"logps/rejected": -199.34259033203125,
"loss": 1.1192,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9793570041656494,
"rewards/margins": 0.46488142013549805,
"rewards/rejected": -1.4442384243011475,
"step": 512
},
{
"epoch": 0.7755102040816326,
"epsilon_dpo/beta": 0.01135919988155365,
"epsilon_dpo/beta_margin_grad_mean": -0.38892507553100586,
"epsilon_dpo/beta_margin_grad_std": 0.1450594812631607,
"epsilon_dpo/beta_margin_mean": 0.5117185711860657,
"epsilon_dpo/beta_margin_std": 0.7006902694702148,
"epsilon_dpo/loss_margin_mean": 45.34425354003906,
"grad_norm": 12.44005298614502,
"kl/avg_steps": 0.59375,
"kl/beta": 0.011425905860960484,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 7.369139731924401e-08,
"logits/chosen": 0.06506671011447906,
"logits/rejected": 0.001617439091205597,
"logps/chosen": -128.57168579101562,
"logps/ref_chosen": -50.852561950683594,
"logps/ref_rejected": -69.21754455566406,
"logps/rejected": -192.28091430664062,
"loss": 1.0461,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8842555284500122,
"rewards/margins": 0.5117185711860657,
"rewards/rejected": -1.3959741592407227,
"step": 513
},
{
"epoch": 0.7770219198790628,
"epsilon_dpo/beta": 0.011302802711725235,
"epsilon_dpo/beta_margin_grad_mean": -0.3687097132205963,
"epsilon_dpo/beta_margin_grad_std": 0.15901188552379608,
"epsilon_dpo/beta_margin_mean": 0.6013322472572327,
"epsilon_dpo/beta_margin_std": 0.7520633935928345,
"epsilon_dpo/loss_margin_mean": 53.62897491455078,
"grad_norm": 13.720666885375977,
"kl/avg_steps": 0.5,
"kl/beta": 0.011358465068042278,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 7.275644829568747e-08,
"logits/chosen": -0.1292281597852707,
"logits/rejected": -0.16890010237693787,
"logps/chosen": -156.04302978515625,
"logps/ref_chosen": -69.38493347167969,
"logps/ref_rejected": -83.32447814941406,
"logps/rejected": -223.61154174804688,
"loss": 0.9981,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.981688380241394,
"rewards/margins": 0.6013321876525879,
"rewards/rejected": -1.5830204486846924,
"step": 514
},
{
"epoch": 0.7785336356764928,
"epsilon_dpo/beta": 0.011246570385992527,
"epsilon_dpo/beta_margin_grad_mean": -0.3916643261909485,
"epsilon_dpo/beta_margin_grad_std": 0.16084180772304535,
"epsilon_dpo/beta_margin_mean": 0.4969915449619293,
"epsilon_dpo/beta_margin_std": 0.7405228614807129,
"epsilon_dpo/loss_margin_mean": 44.5915641784668,
"grad_norm": 14.838274002075195,
"kl/avg_steps": 0.5,
"kl/beta": 0.011301955208182335,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 7.182645715528435e-08,
"logits/chosen": 0.055826883763074875,
"logits/rejected": -0.09414568543434143,
"logps/chosen": -148.4252166748047,
"logps/ref_chosen": -53.687034606933594,
"logps/ref_rejected": -83.59614562988281,
"logps/rejected": -222.92588806152344,
"loss": 1.0741,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0675394535064697,
"rewards/margins": 0.49699151515960693,
"rewards/rejected": -1.5645309686660767,
"step": 515
},
{
"epoch": 0.780045351473923,
"epsilon_dpo/beta": 0.011194132268428802,
"epsilon_dpo/beta_margin_grad_mean": -0.4029303789138794,
"epsilon_dpo/beta_margin_grad_std": 0.16609422862529755,
"epsilon_dpo/beta_margin_mean": 0.4301670789718628,
"epsilon_dpo/beta_margin_std": 0.7569416761398315,
"epsilon_dpo/loss_margin_mean": 38.874961853027344,
"grad_norm": 16.539331436157227,
"kl/avg_steps": 0.46875,
"kl/beta": 0.011245726607739925,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 7.090144991188568e-08,
"logits/chosen": -0.14461268484592438,
"logits/rejected": -0.08606170862913132,
"logps/chosen": -139.4183349609375,
"logps/ref_chosen": -56.9017219543457,
"logps/ref_rejected": -67.83477783203125,
"logps/rejected": -189.22634887695312,
"loss": 1.1338,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9271007776260376,
"rewards/margins": 0.4301670789718628,
"rewards/rejected": -1.3572678565979004,
"step": 516
},
{
"epoch": 0.781557067271353,
"epsilon_dpo/beta": 0.011152397841215134,
"epsilon_dpo/beta_margin_grad_mean": -0.4203470051288605,
"epsilon_dpo/beta_margin_grad_std": 0.1519647240638733,
"epsilon_dpo/beta_margin_mean": 0.3701581656932831,
"epsilon_dpo/beta_margin_std": 0.7016948461532593,
"epsilon_dpo/loss_margin_mean": 33.60012435913086,
"grad_norm": 16.598073959350586,
"kl/avg_steps": 0.375,
"kl/beta": 0.011193257756531239,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 6.998145243993284e-08,
"logits/chosen": 0.0344880111515522,
"logits/rejected": 0.036995913833379745,
"logps/chosen": -161.7857666015625,
"logps/ref_chosen": -61.775142669677734,
"logps/ref_rejected": -62.88270950317383,
"logps/rejected": -196.49346923828125,
"loss": 1.1618,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1175148487091064,
"rewards/margins": 0.3701581656932831,
"rewards/rejected": -1.487673044204712,
"step": 517
},
{
"epoch": 0.783068783068783,
"epsilon_dpo/beta": 0.011110733263194561,
"epsilon_dpo/beta_margin_grad_mean": -0.4045083522796631,
"epsilon_dpo/beta_margin_grad_std": 0.15338724851608276,
"epsilon_dpo/beta_margin_mean": 0.43878626823425293,
"epsilon_dpo/beta_margin_std": 0.7061228156089783,
"epsilon_dpo/loss_margin_mean": 39.908721923828125,
"grad_norm": 12.619616508483887,
"kl/avg_steps": 0.375,
"kl/beta": 0.011151440441608429,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 6.906649047373245e-08,
"logits/chosen": -0.06493734568357468,
"logits/rejected": -0.08891810476779938,
"logps/chosen": -143.70635986328125,
"logps/ref_chosen": -62.025230407714844,
"logps/ref_rejected": -79.06085205078125,
"logps/rejected": -200.6507110595703,
"loss": 1.1078,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9089415073394775,
"rewards/margins": 0.43878626823425293,
"rewards/rejected": -1.3477277755737305,
"step": 518
},
{
"epoch": 0.7845804988662132,
"epsilon_dpo/beta": 0.011079640127718449,
"epsilon_dpo/beta_margin_grad_mean": -0.4377845227718353,
"epsilon_dpo/beta_margin_grad_std": 0.1885075569152832,
"epsilon_dpo/beta_margin_mean": 0.2836650013923645,
"epsilon_dpo/beta_margin_std": 0.877540111541748,
"epsilon_dpo/loss_margin_mean": 26.19062042236328,
"grad_norm": 19.991710662841797,
"kl/avg_steps": 0.28125,
"kl/beta": 0.01110977865755558,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 6.815658960673781e-08,
"logits/chosen": -0.005118317902088165,
"logits/rejected": -0.17630475759506226,
"logps/chosen": -161.46450805664062,
"logps/ref_chosen": -61.60636901855469,
"logps/ref_rejected": -74.50727844238281,
"logps/rejected": -200.5560302734375,
"loss": 1.299,
"rewards/accuracies": 0.609375,
"rewards/chosen": -1.1086468696594238,
"rewards/margins": 0.2836650013923645,
"rewards/rejected": -1.3923118114471436,
"step": 519
},
{
"epoch": 0.7860922146636432,
"epsilon_dpo/beta": 0.011017403565347195,
"epsilon_dpo/beta_margin_grad_mean": -0.4108213186264038,
"epsilon_dpo/beta_margin_grad_std": 0.14979180693626404,
"epsilon_dpo/beta_margin_mean": 0.4028027653694153,
"epsilon_dpo/beta_margin_std": 0.6876090168952942,
"epsilon_dpo/loss_margin_mean": 36.90079116821289,
"grad_norm": 14.898476600646973,
"kl/avg_steps": 0.5625,
"kl/beta": 0.01107861939817667,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 6.725177529083209e-08,
"logits/chosen": 0.10064268857240677,
"logits/rejected": -0.16475823521614075,
"logps/chosen": -154.83224487304688,
"logps/ref_chosen": -62.87343215942383,
"logps/ref_rejected": -76.505615234375,
"logps/rejected": -205.36521911621094,
"loss": 1.1314,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.0145667791366577,
"rewards/margins": 0.4028027653694153,
"rewards/rejected": -1.4173696041107178,
"step": 520
},
{
"epoch": 0.7876039304610734,
"epsilon_dpo/beta": 0.010955777950584888,
"epsilon_dpo/beta_margin_grad_mean": -0.3722280263900757,
"epsilon_dpo/beta_margin_grad_std": 0.14417913556098938,
"epsilon_dpo/beta_margin_mean": 0.5786331295967102,
"epsilon_dpo/beta_margin_std": 0.6789863109588623,
"epsilon_dpo/loss_margin_mean": 53.15705490112305,
"grad_norm": 16.011079788208008,
"kl/avg_steps": 0.5625,
"kl/beta": 0.011016651056706905,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 6.63520728356167e-08,
"logits/chosen": -0.07822868227958679,
"logits/rejected": -0.20976831018924713,
"logps/chosen": -154.4457550048828,
"logps/ref_chosen": -64.20668029785156,
"logps/ref_rejected": -92.28083038330078,
"logps/rejected": -235.6769561767578,
"loss": 0.9915,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.9909406304359436,
"rewards/margins": 0.5786331295967102,
"rewards/rejected": -1.5695737600326538,
"step": 521
},
{
"epoch": 0.7891156462585034,
"epsilon_dpo/beta": 0.01090819202363491,
"epsilon_dpo/beta_margin_grad_mean": -0.41209661960601807,
"epsilon_dpo/beta_margin_grad_std": 0.1886807233095169,
"epsilon_dpo/beta_margin_mean": 0.41103124618530273,
"epsilon_dpo/beta_margin_std": 0.8884212970733643,
"epsilon_dpo/loss_margin_mean": 38.21929168701172,
"grad_norm": 15.265345573425293,
"kl/avg_steps": 0.4375,
"kl/beta": 0.010955029167234898,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 6.545750740770336e-08,
"logits/chosen": -0.06968595087528229,
"logits/rejected": 0.002111488953232765,
"logps/chosen": -148.2176513671875,
"logps/ref_chosen": -58.36972427368164,
"logps/ref_rejected": -68.79248046875,
"logps/rejected": -196.8596954345703,
"loss": 1.1952,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9822707176208496,
"rewards/margins": 0.41103124618530273,
"rewards/rejected": -1.3933019638061523,
"step": 522
},
{
"epoch": 0.7906273620559335,
"epsilon_dpo/beta": 0.010843630880117416,
"epsilon_dpo/beta_margin_grad_mean": -0.39532458782196045,
"epsilon_dpo/beta_margin_grad_std": 0.16449421644210815,
"epsilon_dpo/beta_margin_mean": 0.459897518157959,
"epsilon_dpo/beta_margin_std": 0.7425405383110046,
"epsilon_dpo/loss_margin_mean": 42.8420524597168,
"grad_norm": 16.413509368896484,
"kl/avg_steps": 0.59375,
"kl/beta": 0.010907309129834175,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 6.456810403001012e-08,
"logits/chosen": 0.02888753078877926,
"logits/rejected": -0.28321683406829834,
"logps/chosen": -160.22735595703125,
"logps/ref_chosen": -65.71324157714844,
"logps/ref_rejected": -91.98896789550781,
"logps/rejected": -229.3451385498047,
"loss": 1.1062,
"rewards/accuracies": 0.796875,
"rewards/chosen": -1.0269458293914795,
"rewards/margins": 0.459897518157959,
"rewards/rejected": -1.4868433475494385,
"step": 523
},
{
"epoch": 0.7921390778533636,
"epsilon_dpo/beta": 0.010783016681671143,
"epsilon_dpo/beta_margin_grad_mean": -0.40032902359962463,
"epsilon_dpo/beta_margin_grad_std": 0.1398220956325531,
"epsilon_dpo/beta_margin_mean": 0.4393196403980255,
"epsilon_dpo/beta_margin_std": 0.6374650597572327,
"epsilon_dpo/loss_margin_mean": 41.07111740112305,
"grad_norm": 13.701552391052246,
"kl/avg_steps": 0.5625,
"kl/beta": 0.010842929594218731,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 6.368388758106134e-08,
"logits/chosen": -0.17534656822681427,
"logits/rejected": -0.16570016741752625,
"logps/chosen": -155.463134765625,
"logps/ref_chosen": -76.35124969482422,
"logps/ref_rejected": -89.96072387695312,
"logps/rejected": -210.14373779296875,
"loss": 1.0877,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.8538601398468018,
"rewards/margins": 0.4393196702003479,
"rewards/rejected": -1.2931797504425049,
"step": 524
},
{
"epoch": 0.7936507936507936,
"epsilon_dpo/beta": 0.01074965950101614,
"epsilon_dpo/beta_margin_grad_mean": -0.42374032735824585,
"epsilon_dpo/beta_margin_grad_std": 0.1512511968612671,
"epsilon_dpo/beta_margin_mean": 0.3471807837486267,
"epsilon_dpo/beta_margin_std": 0.6909745335578918,
"epsilon_dpo/loss_margin_mean": 32.73856735229492,
"grad_norm": 17.13736343383789,
"kl/avg_steps": 0.3125,
"kl/beta": 0.010782279074192047,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 6.280488279429185e-08,
"logits/chosen": -0.2273256778717041,
"logits/rejected": -0.1729271560907364,
"logps/chosen": -166.95675659179688,
"logps/ref_chosen": -75.49578857421875,
"logps/ref_rejected": -84.04852294921875,
"logps/rejected": -208.248046875,
"loss": 1.1785,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.9856960773468018,
"rewards/margins": 0.3471807837486267,
"rewards/rejected": -1.3328769207000732,
"step": 525
},
{
"epoch": 0.7951625094482238,
"epsilon_dpo/beta": 0.010699375532567501,
"epsilon_dpo/beta_margin_grad_mean": -0.42020729184150696,
"epsilon_dpo/beta_margin_grad_std": 0.1444169133901596,
"epsilon_dpo/beta_margin_mean": 0.34961146116256714,
"epsilon_dpo/beta_margin_std": 0.6499526500701904,
"epsilon_dpo/loss_margin_mean": 33.06034469604492,
"grad_norm": 13.86230182647705,
"kl/avg_steps": 0.46875,
"kl/beta": 0.010748689994215965,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 6.193111425735515e-08,
"logits/chosen": 0.051716819405555725,
"logits/rejected": -0.19310572743415833,
"logps/chosen": -157.715576171875,
"logps/ref_chosen": -61.29241943359375,
"logps/ref_rejected": -82.47763061523438,
"logps/rejected": -211.96112060546875,
"loss": 1.165,
"rewards/accuracies": 0.734375,
"rewards/chosen": -1.0342143774032593,
"rewards/margins": 0.34961146116256714,
"rewards/rejected": -1.3838257789611816,
"step": 526
},
{
"epoch": 0.7966742252456538,
"epsilon_dpo/beta": 0.01066283043473959,
"epsilon_dpo/beta_margin_grad_mean": -0.43542996048927307,
"epsilon_dpo/beta_margin_grad_std": 0.15423430502414703,
"epsilon_dpo/beta_margin_mean": 0.29155755043029785,
"epsilon_dpo/beta_margin_std": 0.6924352049827576,
"epsilon_dpo/loss_margin_mean": 27.787824630737305,
"grad_norm": 15.987152099609375,
"kl/avg_steps": 0.34375,
"kl/beta": 0.010698540136218071,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 6.106260641143546e-08,
"logits/chosen": 0.01460237056016922,
"logits/rejected": -0.19241218268871307,
"logps/chosen": -162.69764709472656,
"logps/ref_chosen": -61.47262954711914,
"logps/ref_rejected": -90.52831268310547,
"logps/rejected": -219.54116821289062,
"loss": 1.2275,
"rewards/accuracies": 0.640625,
"rewards/chosen": -1.0818819999694824,
"rewards/margins": 0.29155758023262024,
"rewards/rejected": -1.3734395503997803,
"step": 527
},
{
"epoch": 0.7981859410430839,
"epsilon_dpo/beta": 0.010622969828546047,
"epsilon_dpo/beta_margin_grad_mean": -0.43023064732551575,
"epsilon_dpo/beta_margin_grad_std": 0.16522420942783356,
"epsilon_dpo/beta_margin_mean": 0.30544036626815796,
"epsilon_dpo/beta_margin_std": 0.754958987236023,
"epsilon_dpo/loss_margin_mean": 29.22115135192871,
"grad_norm": 17.425264358520508,
"kl/avg_steps": 0.375,
"kl/beta": 0.010661889798939228,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 6.019938355056422e-08,
"logits/chosen": 0.04494815692305565,
"logits/rejected": -0.17464430630207062,
"logps/chosen": -150.1697235107422,
"logps/ref_chosen": -58.792015075683594,
"logps/ref_rejected": -71.82516479492188,
"logps/rejected": -192.4240264892578,
"loss": 1.2361,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.9725919961929321,
"rewards/margins": 0.30544042587280273,
"rewards/rejected": -1.2780323028564453,
"step": 528
},
{
"epoch": 0.799697656840514,
"epsilon_dpo/beta": 0.010560045018792152,
"epsilon_dpo/beta_margin_grad_mean": -0.35183483362197876,
"epsilon_dpo/beta_margin_grad_std": 0.15599550306797028,
"epsilon_dpo/beta_margin_mean": 0.6755702495574951,
"epsilon_dpo/beta_margin_std": 0.722866415977478,
"epsilon_dpo/loss_margin_mean": 64.36847686767578,
"grad_norm": 12.666074752807617,
"kl/avg_steps": 0.59375,
"kl/beta": 0.010622057132422924,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 5.934146982094049e-08,
"logits/chosen": 0.0297603290528059,
"logits/rejected": -0.09386517852544785,
"logps/chosen": -142.72621154785156,
"logps/ref_chosen": -55.070960998535156,
"logps/ref_rejected": -75.44007110595703,
"logps/rejected": -227.46380615234375,
"loss": 0.9382,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9280588030815125,
"rewards/margins": 0.6755702495574951,
"rewards/rejected": -1.6036291122436523,
"step": 529
},
{
"epoch": 0.8012093726379441,
"epsilon_dpo/beta": 0.0105142155662179,
"epsilon_dpo/beta_margin_grad_mean": -0.4084755480289459,
"epsilon_dpo/beta_margin_grad_std": 0.14044035971164703,
"epsilon_dpo/beta_margin_mean": 0.4130297899246216,
"epsilon_dpo/beta_margin_std": 0.6400982737541199,
"epsilon_dpo/loss_margin_mean": 39.6524543762207,
"grad_norm": 15.055916786193848,
"kl/avg_steps": 0.4375,
"kl/beta": 0.010559361428022385,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 5.848888922025552e-08,
"logits/chosen": 0.02155611664056778,
"logits/rejected": -0.11885523796081543,
"logps/chosen": -145.18594360351562,
"logps/ref_chosen": -56.743812561035156,
"logps/ref_rejected": -76.6692123413086,
"logps/rejected": -204.7637939453125,
"loss": 1.1089,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9313051700592041,
"rewards/margins": 0.4130297899246216,
"rewards/rejected": -1.3443349599838257,
"step": 530
},
{
"epoch": 0.8027210884353742,
"epsilon_dpo/beta": 0.01047498732805252,
"epsilon_dpo/beta_margin_grad_mean": -0.4068000912666321,
"epsilon_dpo/beta_margin_grad_std": 0.15221014618873596,
"epsilon_dpo/beta_margin_mean": 0.42143553495407104,
"epsilon_dpo/beta_margin_std": 0.6951009035110474,
"epsilon_dpo/loss_margin_mean": 40.66737747192383,
"grad_norm": 14.889087677001953,
"kl/avg_steps": 0.375,
"kl/beta": 0.010513365268707275,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 5.7641665597021435e-08,
"logits/chosen": 0.07515759021043777,
"logits/rejected": -0.07046963274478912,
"logps/chosen": -140.56666564941406,
"logps/ref_chosen": -51.116458892822266,
"logps/ref_rejected": -79.52884674072266,
"logps/rejected": -209.64642333984375,
"loss": 1.1193,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9387943744659424,
"rewards/margins": 0.42143553495407104,
"rewards/rejected": -1.3602299690246582,
"step": 531
},
{
"epoch": 0.8042328042328042,
"epsilon_dpo/beta": 0.010419486090540886,
"epsilon_dpo/beta_margin_grad_mean": -0.3947012424468994,
"epsilon_dpo/beta_margin_grad_std": 0.15935707092285156,
"epsilon_dpo/beta_margin_mean": 0.4751250743865967,
"epsilon_dpo/beta_margin_std": 0.7214094996452332,
"epsilon_dpo/loss_margin_mean": 46.03975296020508,
"grad_norm": 13.558876037597656,
"kl/avg_steps": 0.53125,
"kl/beta": 0.010474087670445442,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 5.679982264990424e-08,
"logits/chosen": -0.011013902723789215,
"logits/rejected": -0.06333242356777191,
"logps/chosen": -162.0532989501953,
"logps/ref_chosen": -58.279945373535156,
"logps/ref_rejected": -78.05426788330078,
"logps/rejected": -227.86737060546875,
"loss": 1.0861,
"rewards/accuracies": 0.765625,
"rewards/chosen": -1.082663655281067,
"rewards/margins": 0.4751250743865967,
"rewards/rejected": -1.5577887296676636,
"step": 532
},
{
"epoch": 0.8057445200302343,
"epsilon_dpo/beta": 0.010364423505961895,
"epsilon_dpo/beta_margin_grad_mean": -0.3996739983558655,
"epsilon_dpo/beta_margin_grad_std": 0.1479678601026535,
"epsilon_dpo/beta_margin_mean": 0.4454460144042969,
"epsilon_dpo/beta_margin_std": 0.670646607875824,
"epsilon_dpo/loss_margin_mean": 43.33749008178711,
"grad_norm": 12.718405723571777,
"kl/avg_steps": 0.53125,
"kl/beta": 0.010418737307190895,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 5.596338392706076e-08,
"logits/chosen": 0.05036419630050659,
"logits/rejected": -0.13193204998970032,
"logps/chosen": -129.54742431640625,
"logps/ref_chosen": -56.41801452636719,
"logps/ref_rejected": -73.89324951171875,
"logps/rejected": -190.36013793945312,
"loss": 1.0932,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7596213817596436,
"rewards/margins": 0.4454460144042969,
"rewards/rejected": -1.2050673961639404,
"step": 533
},
{
"epoch": 0.8072562358276644,
"epsilon_dpo/beta": 0.010329088196158409,
"epsilon_dpo/beta_margin_grad_mean": -0.3971126079559326,
"epsilon_dpo/beta_margin_grad_std": 0.1607603281736374,
"epsilon_dpo/beta_margin_mean": 0.4707995355129242,
"epsilon_dpo/beta_margin_std": 0.745704710483551,
"epsilon_dpo/loss_margin_mean": 46.0926628112793,
"grad_norm": 11.885769844055176,
"kl/avg_steps": 0.34375,
"kl/beta": 0.010363680310547352,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 5.513237282548033e-08,
"logits/chosen": 0.01309187337756157,
"logits/rejected": -0.07911329716444016,
"logps/chosen": -147.35385131835938,
"logps/ref_chosen": -60.748687744140625,
"logps/ref_rejected": -73.8623046875,
"logps/rejected": -206.56011962890625,
"loss": 1.0956,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.8963637948036194,
"rewards/margins": 0.4707995057106018,
"rewards/rejected": -1.3671634197235107,
"step": 534
},
{
"epoch": 0.8087679516250945,
"epsilon_dpo/beta": 0.010293703526258469,
"epsilon_dpo/beta_margin_grad_mean": -0.41606441140174866,
"epsilon_dpo/beta_margin_grad_std": 0.1568082720041275,
"epsilon_dpo/beta_margin_mean": 0.37637367844581604,
"epsilon_dpo/beta_margin_std": 0.7163012623786926,
"epsilon_dpo/loss_margin_mean": 37.04615020751953,
"grad_norm": 14.706581115722656,
"kl/avg_steps": 0.34375,
"kl/beta": 0.010328177362680435,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 5.430681259032957e-08,
"logits/chosen": 0.01764669455587864,
"logits/rejected": -0.15716047585010529,
"logps/chosen": -160.3616943359375,
"logps/ref_chosen": -61.637413024902344,
"logps/ref_rejected": -80.93138122558594,
"logps/rejected": -216.70181274414062,
"loss": 1.1628,
"rewards/accuracies": 0.671875,
"rewards/chosen": -1.0196473598480225,
"rewards/margins": 0.3763737082481384,
"rewards/rejected": -1.3960211277008057,
"step": 535
},
{
"epoch": 0.8102796674225246,
"epsilon_dpo/beta": 0.01022627204656601,
"epsilon_dpo/beta_margin_grad_mean": -0.37227851152420044,
"epsilon_dpo/beta_margin_grad_std": 0.12902818620204926,
"epsilon_dpo/beta_margin_mean": 0.5732942223548889,
"epsilon_dpo/beta_margin_std": 0.6022374629974365,
"epsilon_dpo/loss_margin_mean": 56.31126022338867,
"grad_norm": 10.136542320251465,
"kl/avg_steps": 0.65625,
"kl/beta": 0.010292796418070793,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 5.3486726314303175e-08,
"logits/chosen": 0.20993435382843018,
"logits/rejected": -0.09832259267568588,
"logps/chosen": -139.06715393066406,
"logps/ref_chosen": -51.888973236083984,
"logps/ref_rejected": -73.34864044189453,
"logps/rejected": -216.8380889892578,
"loss": 0.974,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.8925957083702087,
"rewards/margins": 0.5732941627502441,
"rewards/rejected": -1.4658899307250977,
"step": 536
},
{
"epoch": 0.8117913832199547,
"epsilon_dpo/beta": 0.010191557928919792,
"epsilon_dpo/beta_margin_grad_mean": -0.4022936522960663,
"epsilon_dpo/beta_margin_grad_std": 0.1535758227109909,
"epsilon_dpo/beta_margin_mean": 0.4453662931919098,
"epsilon_dpo/beta_margin_std": 0.6968430280685425,
"epsilon_dpo/loss_margin_mean": 44.184513092041016,
"grad_norm": 12.227337837219238,
"kl/avg_steps": 0.34375,
"kl/beta": 0.010225689969956875,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 5.267213693697695e-08,
"logits/chosen": 0.09206399321556091,
"logits/rejected": -0.20483046770095825,
"logps/chosen": -152.02105712890625,
"logps/ref_chosen": -54.248619079589844,
"logps/ref_rejected": -94.94343566894531,
"logps/rejected": -236.900390625,
"loss": 1.1008,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.9980409145355225,
"rewards/margins": 0.4453662633895874,
"rewards/rejected": -1.4434072971343994,
"step": 537
},
{
"epoch": 0.8133030990173847,
"epsilon_dpo/beta": 0.010124795138835907,
"epsilon_dpo/beta_margin_grad_mean": -0.3865533769130707,
"epsilon_dpo/beta_margin_grad_std": 0.14259770512580872,
"epsilon_dpo/beta_margin_mean": 0.5082729458808899,
"epsilon_dpo/beta_margin_std": 0.6586771607398987,
"epsilon_dpo/loss_margin_mean": 50.5070915222168,
"grad_norm": 13.094738960266113,
"kl/avg_steps": 0.65625,
"kl/beta": 0.010190659202635288,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 5.1863067244167144e-08,
"logits/chosen": -0.14467468857765198,
"logits/rejected": -0.1057022213935852,
"logps/chosen": -164.96591186523438,
"logps/ref_chosen": -70.09354400634766,
"logps/ref_rejected": -79.49833679199219,
"logps/rejected": -224.8778076171875,
"loss": 1.0392,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9625347852706909,
"rewards/margins": 0.5082728862762451,
"rewards/rejected": -1.470807671546936,
"step": 538
},
{
"epoch": 0.8148148148148148,
"epsilon_dpo/beta": 0.010074606165289879,
"epsilon_dpo/beta_margin_grad_mean": -0.41831132769584656,
"epsilon_dpo/beta_margin_grad_std": 0.13271506130695343,
"epsilon_dpo/beta_margin_mean": 0.3641396462917328,
"epsilon_dpo/beta_margin_std": 0.6018718481063843,
"epsilon_dpo/loss_margin_mean": 36.46466827392578,
"grad_norm": 13.052766799926758,
"kl/avg_steps": 0.5,
"kl/beta": 0.010124219581484795,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 5.105953986729195e-08,
"logits/chosen": 0.08030635863542557,
"logits/rejected": -0.12439245730638504,
"logps/chosen": -160.74258422851562,
"logps/ref_chosen": -61.93169403076172,
"logps/ref_rejected": -84.08946228027344,
"logps/rejected": -219.36502075195312,
"loss": 1.1383,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9971802234649658,
"rewards/margins": 0.3641396760940552,
"rewards/rejected": -1.361319899559021,
"step": 539
},
{
"epoch": 0.8163265306122449,
"epsilon_dpo/beta": 0.010011889971792698,
"epsilon_dpo/beta_margin_grad_mean": -0.38009002804756165,
"epsilon_dpo/beta_margin_grad_std": 0.13908350467681885,
"epsilon_dpo/beta_margin_mean": 0.5382365584373474,
"epsilon_dpo/beta_margin_std": 0.6392221450805664,
"epsilon_dpo/loss_margin_mean": 54.07574462890625,
"grad_norm": 12.857253074645996,
"kl/avg_steps": 0.625,
"kl/beta": 0.010073849931359291,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 5.026157728273966e-08,
"logits/chosen": -0.011615540832281113,
"logits/rejected": -0.2316288948059082,
"logps/chosen": -154.4329833984375,
"logps/ref_chosen": -62.70425033569336,
"logps/ref_rejected": -95.63597106933594,
"logps/rejected": -241.44046020507812,
"loss": 1.0113,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9192986488342285,
"rewards/margins": 0.5382365584373474,
"rewards/rejected": -1.4575352668762207,
"step": 540
},
{
"epoch": 0.817838246409675,
"epsilon_dpo/beta": 0.009965348988771439,
"epsilon_dpo/beta_margin_grad_mean": -0.3810591399669647,
"epsilon_dpo/beta_margin_grad_std": 0.13710962235927582,
"epsilon_dpo/beta_margin_mean": 0.5287958383560181,
"epsilon_dpo/beta_margin_std": 0.6138408184051514,
"epsilon_dpo/loss_margin_mean": 53.45842742919922,
"grad_norm": 13.600028991699219,
"kl/avg_steps": 0.46875,
"kl/beta": 0.010011279955506325,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.9469201811239035e-08,
"logits/chosen": -0.02240005135536194,
"logits/rejected": 0.09297636896371841,
"logps/chosen": -150.23812866210938,
"logps/ref_chosen": -62.48084259033203,
"logps/ref_rejected": -57.55541229248047,
"logps/rejected": -198.77113342285156,
"loss": 1.0126,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.87642902135849,
"rewards/margins": 0.5287958979606628,
"rewards/rejected": -1.4052249193191528,
"step": 541
},
{
"epoch": 0.8193499622071051,
"epsilon_dpo/beta": 0.009903281927108765,
"epsilon_dpo/beta_margin_grad_mean": -0.3769392967224121,
"epsilon_dpo/beta_margin_grad_std": 0.1581653356552124,
"epsilon_dpo/beta_margin_mean": 0.5590870380401611,
"epsilon_dpo/beta_margin_std": 0.7264112830162048,
"epsilon_dpo/loss_margin_mean": 56.84881591796875,
"grad_norm": 11.680880546569824,
"kl/avg_steps": 0.625,
"kl/beta": 0.00996457040309906,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 4.868243561723534e-08,
"logits/chosen": 0.12971803545951843,
"logits/rejected": 0.05990251153707504,
"logps/chosen": -125.817138671875,
"logps/ref_chosen": -49.454891204833984,
"logps/ref_rejected": -65.33275604248047,
"logps/rejected": -198.5438232421875,
"loss": 1.0231,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7576043605804443,
"rewards/margins": 0.5590870380401611,
"rewards/rejected": -1.3166913986206055,
"step": 542
},
{
"epoch": 0.8208616780045351,
"epsilon_dpo/beta": 0.00984177179634571,
"epsilon_dpo/beta_margin_grad_mean": -0.3877067267894745,
"epsilon_dpo/beta_margin_grad_std": 0.14030489325523376,
"epsilon_dpo/beta_margin_mean": 0.4917500615119934,
"epsilon_dpo/beta_margin_std": 0.640770435333252,
"epsilon_dpo/loss_margin_mean": 50.29764938354492,
"grad_norm": 10.045071601867676,
"kl/avg_steps": 0.625,
"kl/beta": 0.009902679361402988,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 4.790130070827028e-08,
"logits/chosen": 0.15234971046447754,
"logits/rejected": 0.03943786770105362,
"logps/chosen": -136.27597045898438,
"logps/ref_chosen": -51.10085678100586,
"logps/ref_rejected": -76.06130981445312,
"logps/rejected": -211.5340576171875,
"loss": 1.0481,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.8387240171432495,
"rewards/margins": 0.4917500615119934,
"rewards/rejected": -1.3304740190505981,
"step": 543
},
{
"epoch": 0.8223733938019653,
"epsilon_dpo/beta": 0.00978986918926239,
"epsilon_dpo/beta_margin_grad_mean": -0.3752652704715729,
"epsilon_dpo/beta_margin_grad_std": 0.15228131413459778,
"epsilon_dpo/beta_margin_mean": 0.5659483671188354,
"epsilon_dpo/beta_margin_std": 0.7061290144920349,
"epsilon_dpo/loss_margin_mean": 58.25019454956055,
"grad_norm": 12.454544067382812,
"kl/avg_steps": 0.53125,
"kl/beta": 0.009841172024607658,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.7125818934366454e-08,
"logits/chosen": -0.03965529054403305,
"logits/rejected": -0.18958096206188202,
"logps/chosen": -153.03073120117188,
"logps/ref_chosen": -60.2772331237793,
"logps/ref_rejected": -88.40553283691406,
"logps/rejected": -239.4092254638672,
"loss": 1.0105,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.9090437293052673,
"rewards/margins": 0.5659483671188354,
"rewards/rejected": -1.474992036819458,
"step": 544
},
{
"epoch": 0.8238851095993953,
"epsilon_dpo/beta": 0.009753433056175709,
"epsilon_dpo/beta_margin_grad_mean": -0.4201149642467499,
"epsilon_dpo/beta_margin_grad_std": 0.14904241263866425,
"epsilon_dpo/beta_margin_mean": 0.3560601472854614,
"epsilon_dpo/beta_margin_std": 0.6699649095535278,
"epsilon_dpo/loss_margin_mean": 36.96013641357422,
"grad_norm": 12.590239524841309,
"kl/avg_steps": 0.375,
"kl/beta": 0.00978916697204113,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.635601198741607e-08,
"logits/chosen": 0.010761722922325134,
"logits/rejected": -0.13790488243103027,
"logps/chosen": -158.50006103515625,
"logps/ref_chosen": -61.61524963378906,
"logps/ref_rejected": -78.71266174316406,
"logps/rejected": -212.5576171875,
"loss": 1.1658,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9473348259925842,
"rewards/margins": 0.3560601472854614,
"rewards/rejected": -1.3033950328826904,
"step": 545
},
{
"epoch": 0.8253968253968254,
"epsilon_dpo/beta": 0.00969870574772358,
"epsilon_dpo/beta_margin_grad_mean": -0.39806199073791504,
"epsilon_dpo/beta_margin_grad_std": 0.14555932581424713,
"epsilon_dpo/beta_margin_mean": 0.4468782842159271,
"epsilon_dpo/beta_margin_std": 0.6423460841178894,
"epsilon_dpo/loss_margin_mean": 46.49570083618164,
"grad_norm": 14.601067543029785,
"kl/avg_steps": 0.5625,
"kl/beta": 0.009752594865858555,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.559190140057428e-08,
"logits/chosen": 0.06842926144599915,
"logits/rejected": 0.1060560941696167,
"logps/chosen": -150.25686645507812,
"logps/ref_chosen": -59.313262939453125,
"logps/ref_rejected": -64.73631286621094,
"logps/rejected": -202.17562866210938,
"loss": 1.0851,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.8845122456550598,
"rewards/margins": 0.44687825441360474,
"rewards/rejected": -1.3313905000686646,
"step": 546
},
{
"epoch": 0.8269085411942555,
"epsilon_dpo/beta": 0.009632331319153309,
"epsilon_dpo/beta_margin_grad_mean": -0.3813442885875702,
"epsilon_dpo/beta_margin_grad_std": 0.14514042437076569,
"epsilon_dpo/beta_margin_mean": 0.525467038154602,
"epsilon_dpo/beta_margin_std": 0.6635622382164001,
"epsilon_dpo/loss_margin_mean": 54.88117980957031,
"grad_norm": 14.89316463470459,
"kl/avg_steps": 0.6875,
"kl/beta": 0.009698042646050453,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.84375,
"learning_rate": 4.483350854765672e-08,
"logits/chosen": 0.1364162117242813,
"logits/rejected": -0.03356347605586052,
"logps/chosen": -137.76309204101562,
"logps/ref_chosen": -54.97674560546875,
"logps/ref_rejected": -75.35922241210938,
"logps/rejected": -213.02674865722656,
"loss": 1.0289,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.7987380027770996,
"rewards/margins": 0.525467038154602,
"rewards/rejected": -1.3242050409317017,
"step": 547
},
{
"epoch": 0.8284202569916855,
"epsilon_dpo/beta": 0.009593653492629528,
"epsilon_dpo/beta_margin_grad_mean": -0.4259827435016632,
"epsilon_dpo/beta_margin_grad_std": 0.14797894656658173,
"epsilon_dpo/beta_margin_mean": 0.3331350088119507,
"epsilon_dpo/beta_margin_std": 0.6586005687713623,
"epsilon_dpo/loss_margin_mean": 35.179237365722656,
"grad_norm": 13.712616920471191,
"kl/avg_steps": 0.40625,
"kl/beta": 0.009631823748350143,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 4.4080854642541826e-08,
"logits/chosen": 0.02758133038878441,
"logits/rejected": -0.18227389454841614,
"logps/chosen": -160.00942993164062,
"logps/ref_chosen": -63.21067810058594,
"logps/ref_rejected": -81.23347473144531,
"logps/rejected": -213.21145629882812,
"loss": 1.1818,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.930591344833374,
"rewards/margins": 0.3331350088119507,
"rewards/rejected": -1.2637262344360352,
"step": 548
},
{
"epoch": 0.8299319727891157,
"epsilon_dpo/beta": 0.009557835757732391,
"epsilon_dpo/beta_margin_grad_mean": -0.41547831892967224,
"epsilon_dpo/beta_margin_grad_std": 0.14878982305526733,
"epsilon_dpo/beta_margin_mean": 0.3703382611274719,
"epsilon_dpo/beta_margin_std": 0.657423198223114,
"epsilon_dpo/loss_margin_mean": 39.23622131347656,
"grad_norm": 13.625334739685059,
"kl/avg_steps": 0.375,
"kl/beta": 0.009592853486537933,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.333396073857723e-08,
"logits/chosen": -0.06155749410390854,
"logits/rejected": -0.23194348812103271,
"logps/chosen": -158.0437774658203,
"logps/ref_chosen": -64.27351379394531,
"logps/ref_rejected": -92.31663513183594,
"logps/rejected": -225.3231201171875,
"loss": 1.1513,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8983582258224487,
"rewards/margins": 0.3703382611274719,
"rewards/rejected": -1.2686965465545654,
"step": 549
},
{
"epoch": 0.8314436885865457,
"epsilon_dpo/beta": 0.00951316673308611,
"epsilon_dpo/beta_margin_grad_mean": -0.4365103840827942,
"epsilon_dpo/beta_margin_grad_std": 0.1384592056274414,
"epsilon_dpo/beta_margin_mean": 0.2764241397380829,
"epsilon_dpo/beta_margin_std": 0.6107503771781921,
"epsilon_dpo/loss_margin_mean": 29.465194702148438,
"grad_norm": 13.058089256286621,
"kl/avg_steps": 0.46875,
"kl/beta": 0.00955701433122158,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.259284772799099e-08,
"logits/chosen": 0.09267033636569977,
"logits/rejected": 0.007480932399630547,
"logps/chosen": -158.31451416015625,
"logps/ref_chosen": -56.230438232421875,
"logps/ref_rejected": -62.59788513183594,
"logps/rejected": -194.14715576171875,
"loss": 1.2168,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.9727519750595093,
"rewards/margins": 0.2764241099357605,
"rewards/rejected": -1.249176025390625,
"step": 550
},
{
"epoch": 0.8329554043839759,
"epsilon_dpo/beta": 0.00946283619850874,
"epsilon_dpo/beta_margin_grad_mean": -0.4175582826137543,
"epsilon_dpo/beta_margin_grad_std": 0.1236472949385643,
"epsilon_dpo/beta_margin_mean": 0.37026816606521606,
"epsilon_dpo/beta_margin_std": 0.5902994275093079,
"epsilon_dpo/loss_margin_mean": 39.40168762207031,
"grad_norm": 13.020936012268066,
"kl/avg_steps": 0.53125,
"kl/beta": 0.009512425400316715,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.1857536341307176e-08,
"logits/chosen": -0.06622433662414551,
"logits/rejected": -0.1261545717716217,
"logps/chosen": -165.08934020996094,
"logps/ref_chosen": -67.74720764160156,
"logps/ref_rejected": -87.04285430908203,
"logps/rejected": -223.7866668701172,
"loss": 1.1269,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.921585202217102,
"rewards/margins": 0.37026816606521606,
"rewards/rejected": -1.291853427886963,
"step": 551
},
{
"epoch": 0.8344671201814059,
"epsilon_dpo/beta": 0.009403958916664124,
"epsilon_dpo/beta_margin_grad_mean": -0.398048996925354,
"epsilon_dpo/beta_margin_grad_std": 0.12981706857681274,
"epsilon_dpo/beta_margin_mean": 0.43422242999076843,
"epsilon_dpo/beta_margin_std": 0.5744320750236511,
"epsilon_dpo/loss_margin_mean": 46.5087890625,
"grad_norm": 14.701848030090332,
"kl/avg_steps": 0.625,
"kl/beta": 0.009462157264351845,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 4.112804714676593e-08,
"logits/chosen": -0.07371262460947037,
"logits/rejected": -0.19621533155441284,
"logps/chosen": -154.55172729492188,
"logps/ref_chosen": -62.92626190185547,
"logps/ref_rejected": -82.983642578125,
"logps/rejected": -221.11788940429688,
"loss": 1.076,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8627252578735352,
"rewards/margins": 0.43422240018844604,
"rewards/rejected": -1.296947717666626,
"step": 552
},
{
"epoch": 0.8359788359788359,
"epsilon_dpo/beta": 0.009369060397148132,
"epsilon_dpo/beta_margin_grad_mean": -0.4151637852191925,
"epsilon_dpo/beta_margin_grad_std": 0.15181565284729004,
"epsilon_dpo/beta_margin_mean": 0.383619487285614,
"epsilon_dpo/beta_margin_std": 0.6985273361206055,
"epsilon_dpo/loss_margin_mean": 41.42930221557617,
"grad_norm": 15.29831314086914,
"kl/avg_steps": 0.375,
"kl/beta": 0.009403386153280735,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 4.0404400549748144e-08,
"logits/chosen": 0.07058180123567581,
"logits/rejected": -0.21693959832191467,
"logps/chosen": -156.7476806640625,
"logps/ref_chosen": -56.038490295410156,
"logps/ref_rejected": -84.48454284667969,
"logps/rejected": -226.62301635742188,
"loss": 1.1504,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.9436930418014526,
"rewards/margins": 0.383619487285614,
"rewards/rejected": -1.3273124694824219,
"step": 553
},
{
"epoch": 0.8374905517762661,
"epsilon_dpo/beta": 0.00932234525680542,
"epsilon_dpo/beta_margin_grad_mean": -0.39552679657936096,
"epsilon_dpo/beta_margin_grad_std": 0.14172735810279846,
"epsilon_dpo/beta_margin_mean": 0.46062296628952026,
"epsilon_dpo/beta_margin_std": 0.6384677886962891,
"epsilon_dpo/loss_margin_mean": 49.81555938720703,
"grad_norm": 12.362720489501953,
"kl/avg_steps": 0.5,
"kl/beta": 0.00936825480312109,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.968661679220467e-08,
"logits/chosen": -0.002938609104603529,
"logits/rejected": -0.06545370817184448,
"logps/chosen": -156.66513061523438,
"logps/ref_chosen": -64.53059387207031,
"logps/ref_rejected": -71.21560668945312,
"logps/rejected": -213.16567993164062,
"loss": 1.0719,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.860846221446991,
"rewards/margins": 0.46062296628952026,
"rewards/rejected": -1.3214691877365112,
"step": 554
},
{
"epoch": 0.8390022675736961,
"epsilon_dpo/beta": 0.00927013996988535,
"epsilon_dpo/beta_margin_grad_mean": -0.40857529640197754,
"epsilon_dpo/beta_margin_grad_std": 0.157921701669693,
"epsilon_dpo/beta_margin_mean": 0.3956778943538666,
"epsilon_dpo/beta_margin_std": 0.7081512808799744,
"epsilon_dpo/loss_margin_mean": 43.16078186035156,
"grad_norm": 13.322372436523438,
"kl/avg_steps": 0.5625,
"kl/beta": 0.009321646764874458,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 3.89747159520904e-08,
"logits/chosen": -0.020282667130231857,
"logits/rejected": -0.055424001067876816,
"logps/chosen": -168.455810546875,
"logps/ref_chosen": -66.65191650390625,
"logps/ref_rejected": -68.6667251586914,
"logps/rejected": -213.63140869140625,
"loss": 1.1464,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.9466332197189331,
"rewards/margins": 0.39567792415618896,
"rewards/rejected": -1.342311143875122,
"step": 555
},
{
"epoch": 0.8405139833711263,
"epsilon_dpo/beta": 0.009235668927431107,
"epsilon_dpo/beta_margin_grad_mean": -0.4244295656681061,
"epsilon_dpo/beta_margin_grad_std": 0.1527853012084961,
"epsilon_dpo/beta_margin_mean": 0.3409133553504944,
"epsilon_dpo/beta_margin_std": 0.6857250332832336,
"epsilon_dpo/loss_margin_mean": 37.412601470947266,
"grad_norm": 11.735703468322754,
"kl/avg_steps": 0.375,
"kl/beta": 0.009269505739212036,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.826871794280192e-08,
"logits/chosen": 0.1422003209590912,
"logits/rejected": -0.04843950644135475,
"logps/chosen": -155.98561096191406,
"logps/ref_chosen": -52.832366943359375,
"logps/ref_rejected": -64.49044036865234,
"logps/rejected": -205.0562744140625,
"loss": 1.1833,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9544154405593872,
"rewards/margins": 0.340913325548172,
"rewards/rejected": -1.2953287363052368,
"step": 556
},
{
"epoch": 0.8420256991685563,
"epsilon_dpo/beta": 0.009198278188705444,
"epsilon_dpo/beta_margin_grad_mean": -0.388275146484375,
"epsilon_dpo/beta_margin_grad_std": 0.1277274489402771,
"epsilon_dpo/beta_margin_mean": 0.49317213892936707,
"epsilon_dpo/beta_margin_std": 0.5701685547828674,
"epsilon_dpo/loss_margin_mean": 54.0263557434082,
"grad_norm": 12.588410377502441,
"kl/avg_steps": 0.40625,
"kl/beta": 0.009234875440597534,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.756864251262143e-08,
"logits/chosen": 0.06715458631515503,
"logits/rejected": -0.06625057756900787,
"logps/chosen": -155.78225708007812,
"logps/ref_chosen": -55.035980224609375,
"logps/ref_rejected": -75.80644989013672,
"logps/rejected": -230.57908630371094,
"loss": 1.0277,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.929233193397522,
"rewards/margins": 0.49317213892936707,
"rewards/rejected": -1.4224053621292114,
"step": 557
},
{
"epoch": 0.8435374149659864,
"epsilon_dpo/beta": 0.009143813513219357,
"epsilon_dpo/beta_margin_grad_mean": -0.3770783245563507,
"epsilon_dpo/beta_margin_grad_std": 0.13442471623420715,
"epsilon_dpo/beta_margin_mean": 0.5506212115287781,
"epsilon_dpo/beta_margin_std": 0.6153272986412048,
"epsilon_dpo/loss_margin_mean": 60.54936599731445,
"grad_norm": 10.583367347717285,
"kl/avg_steps": 0.59375,
"kl/beta": 0.009197509847581387,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 3.687450924416341e-08,
"logits/chosen": -0.061662137508392334,
"logits/rejected": -0.20276546478271484,
"logps/chosen": -151.16940307617188,
"logps/ref_chosen": -63.226348876953125,
"logps/ref_rejected": -91.46881866455078,
"logps/rejected": -239.96124267578125,
"loss": 0.9953,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8052123785018921,
"rewards/margins": 0.5506211519241333,
"rewards/rejected": -1.355833649635315,
"step": 558
},
{
"epoch": 0.8450491307634165,
"epsilon_dpo/beta": 0.009104130789637566,
"epsilon_dpo/beta_margin_grad_mean": -0.3989567756652832,
"epsilon_dpo/beta_margin_grad_std": 0.15116558969020844,
"epsilon_dpo/beta_margin_mean": 0.46307721734046936,
"epsilon_dpo/beta_margin_std": 0.7055156230926514,
"epsilon_dpo/loss_margin_mean": 51.3341064453125,
"grad_norm": 10.420401573181152,
"kl/avg_steps": 0.4375,
"kl/beta": 0.009143222123384476,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.6186337553827743e-08,
"logits/chosen": -0.01063072681427002,
"logits/rejected": -0.13921178877353668,
"logps/chosen": -157.12667846679688,
"logps/ref_chosen": -61.521644592285156,
"logps/ref_rejected": -82.83859252929688,
"logps/rejected": -229.77774047851562,
"loss": 1.0873,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8718199729919434,
"rewards/margins": 0.46307721734046936,
"rewards/rejected": -1.3348971605300903,
"step": 559
},
{
"epoch": 0.8465608465608465,
"epsilon_dpo/beta": 0.009067319333553314,
"epsilon_dpo/beta_margin_grad_mean": -0.4019600749015808,
"epsilon_dpo/beta_margin_grad_std": 0.13447844982147217,
"epsilon_dpo/beta_margin_mean": 0.4334515929222107,
"epsilon_dpo/beta_margin_std": 0.593961775302887,
"epsilon_dpo/loss_margin_mean": 48.244808197021484,
"grad_norm": 12.462697982788086,
"kl/avg_steps": 0.40625,
"kl/beta": 0.00910339504480362,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.550414669125573e-08,
"logits/chosen": 0.038852326571941376,
"logits/rejected": -0.022575462237000465,
"logps/chosen": -161.799072265625,
"logps/ref_chosen": -60.64122009277344,
"logps/ref_rejected": -78.75474548339844,
"logps/rejected": -228.1573944091797,
"loss": 1.0812,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.9195099472999573,
"rewards/margins": 0.4334515929222107,
"rewards/rejected": -1.352961540222168,
"step": 560
},
{
"epoch": 0.8480725623582767,
"epsilon_dpo/beta": 0.009019297547638416,
"epsilon_dpo/beta_margin_grad_mean": -0.40206801891326904,
"epsilon_dpo/beta_margin_grad_std": 0.1390688717365265,
"epsilon_dpo/beta_margin_mean": 0.4403170645236969,
"epsilon_dpo/beta_margin_std": 0.6293224096298218,
"epsilon_dpo/loss_margin_mean": 49.202247619628906,
"grad_norm": 11.104375839233398,
"kl/avg_steps": 0.53125,
"kl/beta": 0.009066562168300152,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.482795573879241e-08,
"logits/chosen": 0.017888018861413002,
"logits/rejected": -0.07850607484579086,
"logps/chosen": -153.1875762939453,
"logps/ref_chosen": -62.49860382080078,
"logps/ref_rejected": -78.72064208984375,
"logps/rejected": -218.6118621826172,
"loss": 1.0844,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8194180727005005,
"rewards/margins": 0.4403170645236969,
"rewards/rejected": -1.259735107421875,
"step": 561
},
{
"epoch": 0.8495842781557067,
"epsilon_dpo/beta": 0.008977273479104042,
"epsilon_dpo/beta_margin_grad_mean": -0.3885160982608795,
"epsilon_dpo/beta_margin_grad_std": 0.14038877189159393,
"epsilon_dpo/beta_margin_mean": 0.49948006868362427,
"epsilon_dpo/beta_margin_std": 0.6345898509025574,
"epsilon_dpo/loss_margin_mean": 56.07931900024414,
"grad_norm": 13.873201370239258,
"kl/avg_steps": 0.46875,
"kl/beta": 0.009018650278449059,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.415778361095226e-08,
"logits/chosen": -0.1397266685962677,
"logits/rejected": -0.20345699787139893,
"logps/chosen": -173.15899658203125,
"logps/ref_chosen": -74.78173828125,
"logps/ref_rejected": -92.63499450683594,
"logps/rejected": -247.09158325195312,
"loss": 1.04,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8846384286880493,
"rewards/margins": 0.49948006868362427,
"rewards/rejected": -1.3841185569763184,
"step": 562
},
{
"epoch": 0.8510959939531368,
"epsilon_dpo/beta": 0.008932583034038544,
"epsilon_dpo/beta_margin_grad_mean": -0.40113911032676697,
"epsilon_dpo/beta_margin_grad_std": 0.1470586657524109,
"epsilon_dpo/beta_margin_mean": 0.4358147978782654,
"epsilon_dpo/beta_margin_std": 0.6505940556526184,
"epsilon_dpo/loss_margin_mean": 49.272884368896484,
"grad_norm": 15.256080627441406,
"kl/avg_steps": 0.5,
"kl/beta": 0.008976573124527931,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": 0.127085343003273,
"logits/rejected": 0.07558880746364594,
"logps/chosen": -135.4530029296875,
"logps/ref_chosen": -50.19850158691406,
"logps/ref_rejected": -66.76687622070312,
"logps/rejected": -201.29428100585938,
"loss": 1.0961,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7633324265480042,
"rewards/margins": 0.4358147978782654,
"rewards/rejected": -1.1991472244262695,
"step": 563
},
{
"epoch": 0.8526077097505669,
"epsilon_dpo/beta": 0.008876976557075977,
"epsilon_dpo/beta_margin_grad_mean": -0.37654897570610046,
"epsilon_dpo/beta_margin_grad_std": 0.1465965360403061,
"epsilon_dpo/beta_margin_mean": 0.5490842461585999,
"epsilon_dpo/beta_margin_std": 0.6591891050338745,
"epsilon_dpo/loss_margin_mean": 62.2650032043457,
"grad_norm": 10.8076810836792,
"kl/avg_steps": 0.625,
"kl/beta": 0.008931913413107395,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.283557064487785e-08,
"logits/chosen": 0.08980629593133926,
"logits/rejected": -0.0049457848072052,
"logps/chosen": -139.7255096435547,
"logps/ref_chosen": -55.7408447265625,
"logps/ref_rejected": -74.8232421875,
"logps/rejected": -221.07290649414062,
"loss": 1.0108,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7465522289276123,
"rewards/margins": 0.5490843057632446,
"rewards/rejected": -1.2956364154815674,
"step": 564
},
{
"epoch": 0.854119425547997,
"epsilon_dpo/beta": 0.008832937106490135,
"epsilon_dpo/beta_margin_grad_mean": -0.41519609093666077,
"epsilon_dpo/beta_margin_grad_std": 0.13157862424850464,
"epsilon_dpo/beta_margin_mean": 0.3718837797641754,
"epsilon_dpo/beta_margin_std": 0.5843960046768188,
"epsilon_dpo/loss_margin_mean": 42.49930191040039,
"grad_norm": 11.922967910766602,
"kl/avg_steps": 0.5,
"kl/beta": 0.008876435458660126,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.218356679178252e-08,
"logits/chosen": -0.02302563190460205,
"logits/rejected": -0.11473672837018967,
"logps/chosen": -164.09243774414062,
"logps/ref_chosen": -58.33738327026367,
"logps/ref_rejected": -78.31776428222656,
"logps/rejected": -226.57211303710938,
"loss": 1.1282,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9360243678092957,
"rewards/margins": 0.3718837797641754,
"rewards/rejected": -1.3079081773757935,
"step": 565
},
{
"epoch": 0.8556311413454271,
"epsilon_dpo/beta": 0.008802792988717556,
"epsilon_dpo/beta_margin_grad_mean": -0.41178324818611145,
"epsilon_dpo/beta_margin_grad_std": 0.15758198499679565,
"epsilon_dpo/beta_margin_mean": 0.3910994529724121,
"epsilon_dpo/beta_margin_std": 0.7065132260322571,
"epsilon_dpo/loss_margin_mean": 45.01554870605469,
"grad_norm": 13.528739929199219,
"kl/avg_steps": 0.34375,
"kl/beta": 0.008832274004817009,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 3.1537655732553764e-08,
"logits/chosen": -0.12171538919210434,
"logits/rejected": -0.08658839762210846,
"logps/chosen": -165.40048217773438,
"logps/ref_chosen": -71.22373962402344,
"logps/ref_rejected": -71.11601257324219,
"logps/rejected": -210.3083038330078,
"loss": 1.149,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.83110111951828,
"rewards/margins": 0.3910994529724121,
"rewards/rejected": -1.222200632095337,
"step": 566
},
{
"epoch": 0.8571428571428571,
"epsilon_dpo/beta": 0.008756131865084171,
"epsilon_dpo/beta_margin_grad_mean": -0.4015049338340759,
"epsilon_dpo/beta_margin_grad_std": 0.1300589144229889,
"epsilon_dpo/beta_margin_mean": 0.43171557784080505,
"epsilon_dpo/beta_margin_std": 0.573701024055481,
"epsilon_dpo/loss_margin_mean": 49.686893463134766,
"grad_norm": 10.687217712402344,
"kl/avg_steps": 0.53125,
"kl/beta": 0.008802017197012901,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.089785553471233e-08,
"logits/chosen": 0.17192617058753967,
"logits/rejected": -0.10820268839597702,
"logps/chosen": -145.16360473632812,
"logps/ref_chosen": -52.669273376464844,
"logps/ref_rejected": -74.34785461425781,
"logps/rejected": -216.52906799316406,
"loss": 1.0772,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.8114957809448242,
"rewards/margins": 0.43171560764312744,
"rewards/rejected": -1.243211269378662,
"step": 567
},
{
"epoch": 0.8586545729402872,
"epsilon_dpo/beta": 0.008709860034286976,
"epsilon_dpo/beta_margin_grad_mean": -0.3866046667098999,
"epsilon_dpo/beta_margin_grad_std": 0.12950319051742554,
"epsilon_dpo/beta_margin_mean": 0.5051518678665161,
"epsilon_dpo/beta_margin_std": 0.597504734992981,
"epsilon_dpo/loss_margin_mean": 58.37926483154297,
"grad_norm": 11.621685981750488,
"kl/avg_steps": 0.53125,
"kl/beta": 0.008755503222346306,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 3.026418409484513e-08,
"logits/chosen": 0.031522877514362335,
"logits/rejected": -0.18789838254451752,
"logps/chosen": -139.25909423828125,
"logps/ref_chosen": -52.178001403808594,
"logps/ref_rejected": -85.8277587890625,
"logps/rejected": -231.28811645507812,
"loss": 1.024,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.759665310382843,
"rewards/margins": 0.5051518678665161,
"rewards/rejected": -1.264817237854004,
"step": 568
},
{
"epoch": 0.8601662887377173,
"epsilon_dpo/beta": 0.008671999908983707,
"epsilon_dpo/beta_margin_grad_mean": -0.4265301823616028,
"epsilon_dpo/beta_margin_grad_std": 0.13363845646381378,
"epsilon_dpo/beta_margin_mean": 0.32311898469924927,
"epsilon_dpo/beta_margin_std": 0.5893939733505249,
"epsilon_dpo/loss_margin_mean": 37.690120697021484,
"grad_norm": 12.444249153137207,
"kl/avg_steps": 0.4375,
"kl/beta": 0.008709236048161983,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.963665913810451e-08,
"logits/chosen": -0.03436426818370819,
"logits/rejected": -0.1318507045507431,
"logps/chosen": -160.3306121826172,
"logps/ref_chosen": -62.649253845214844,
"logps/ref_rejected": -75.4298324584961,
"logps/rejected": -210.8013153076172,
"loss": 1.1706,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.8486992716789246,
"rewards/margins": 0.32311898469924927,
"rewards/rejected": -1.1718182563781738,
"step": 569
},
{
"epoch": 0.8616780045351474,
"epsilon_dpo/beta": 0.00862338487058878,
"epsilon_dpo/beta_margin_grad_mean": -0.37432458996772766,
"epsilon_dpo/beta_margin_grad_std": 0.13357394933700562,
"epsilon_dpo/beta_margin_mean": 0.5587610602378845,
"epsilon_dpo/beta_margin_std": 0.6137102246284485,
"epsilon_dpo/loss_margin_mean": 65.20036315917969,
"grad_norm": 10.535733222961426,
"kl/avg_steps": 0.5625,
"kl/beta": 0.008671298623085022,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.9015298217712453e-08,
"logits/chosen": -0.00015204772353172302,
"logits/rejected": -0.17118829488754272,
"logps/chosen": -136.68362426757812,
"logps/ref_chosen": -50.04179763793945,
"logps/ref_rejected": -78.27146911621094,
"logps/rejected": -230.1136474609375,
"loss": 0.9886,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7490814924240112,
"rewards/margins": 0.5587610006332397,
"rewards/rejected": -1.307842493057251,
"step": 570
},
{
"epoch": 0.8631897203325775,
"epsilon_dpo/beta": 0.008588624186813831,
"epsilon_dpo/beta_margin_grad_mean": -0.42722076177597046,
"epsilon_dpo/beta_margin_grad_std": 0.13514843583106995,
"epsilon_dpo/beta_margin_mean": 0.3095320463180542,
"epsilon_dpo/beta_margin_std": 0.5791841149330139,
"epsilon_dpo/loss_margin_mean": 36.52940368652344,
"grad_norm": 11.776948928833008,
"kl/avg_steps": 0.40625,
"kl/beta": 0.008622795343399048,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.840011871446962e-08,
"logits/chosen": 0.005588196218013763,
"logits/rejected": -0.08289927989244461,
"logps/chosen": -149.7568817138672,
"logps/ref_chosen": -53.65681457519531,
"logps/ref_rejected": -66.13298034667969,
"logps/rejected": -198.762451171875,
"loss": 1.1809,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.827346682548523,
"rewards/margins": 0.3095320463180542,
"rewards/rejected": -1.1368787288665771,
"step": 571
},
{
"epoch": 0.8647014361300076,
"epsilon_dpo/beta": 0.00854850560426712,
"epsilon_dpo/beta_margin_grad_mean": -0.41007423400878906,
"epsilon_dpo/beta_margin_grad_std": 0.11853473633527756,
"epsilon_dpo/beta_margin_mean": 0.388029009103775,
"epsilon_dpo/beta_margin_std": 0.5144620537757874,
"epsilon_dpo/loss_margin_mean": 45.78092575073242,
"grad_norm": 12.78712272644043,
"kl/avg_steps": 0.46875,
"kl/beta": 0.008587907068431377,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.7791137836269158e-08,
"logits/chosen": -0.11244659870862961,
"logits/rejected": 0.0900236964225769,
"logps/chosen": -169.36062622070312,
"logps/ref_chosen": -74.81793212890625,
"logps/ref_rejected": -65.88681030273438,
"logps/rejected": -206.21041870117188,
"loss": 1.0979,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8102878332138062,
"rewards/margins": 0.388029009103775,
"rewards/rejected": -1.1983168125152588,
"step": 572
},
{
"epoch": 0.8662131519274376,
"epsilon_dpo/beta": 0.008511293679475784,
"epsilon_dpo/beta_margin_grad_mean": -0.41562026739120483,
"epsilon_dpo/beta_margin_grad_std": 0.1512221097946167,
"epsilon_dpo/beta_margin_mean": 0.37745651602745056,
"epsilon_dpo/beta_margin_std": 0.689054548740387,
"epsilon_dpo/loss_margin_mean": 44.865699768066406,
"grad_norm": 12.692795753479004,
"kl/avg_steps": 0.4375,
"kl/beta": 0.00854783970862627,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 2.718837261761528e-08,
"logits/chosen": -0.12519502639770508,
"logits/rejected": -0.21198636293411255,
"logps/chosen": -173.47674560546875,
"logps/ref_chosen": -68.72564697265625,
"logps/ref_rejected": -88.16201782226562,
"logps/rejected": -237.77880859375,
"loss": 1.1532,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.8933955430984497,
"rewards/margins": 0.3774564862251282,
"rewards/rejected": -1.2708520889282227,
"step": 573
},
{
"epoch": 0.8677248677248677,
"epsilon_dpo/beta": 0.008460920304059982,
"epsilon_dpo/beta_margin_grad_mean": -0.3916386067867279,
"epsilon_dpo/beta_margin_grad_std": 0.11366511136293411,
"epsilon_dpo/beta_margin_mean": 0.46974778175354004,
"epsilon_dpo/beta_margin_std": 0.5089936256408691,
"epsilon_dpo/loss_margin_mean": 55.810508728027344,
"grad_norm": 9.733078956604004,
"kl/avg_steps": 0.59375,
"kl/beta": 0.008510605432093143,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.659183991914696e-08,
"logits/chosen": 0.0009604603983461857,
"logits/rejected": -0.15071141719818115,
"logps/chosen": -148.11769104003906,
"logps/ref_chosen": -56.31340026855469,
"logps/ref_rejected": -83.91553497314453,
"logps/rejected": -231.53033447265625,
"loss": 1.0305,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7779324650764465,
"rewards/margins": 0.46974778175354004,
"rewards/rejected": -1.2476803064346313,
"step": 574
},
{
"epoch": 0.8692365835222978,
"epsilon_dpo/beta": 0.008425508625805378,
"epsilon_dpo/beta_margin_grad_mean": -0.429740846157074,
"epsilon_dpo/beta_margin_grad_std": 0.14508263766765594,
"epsilon_dpo/beta_margin_mean": 0.31341859698295593,
"epsilon_dpo/beta_margin_std": 0.6467012763023376,
"epsilon_dpo/loss_margin_mean": 37.692832946777344,
"grad_norm": 12.099715232849121,
"kl/avg_steps": 0.421875,
"kl/beta": 0.008460371755063534,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.600155642716606e-08,
"logits/chosen": 0.07123199850320816,
"logits/rejected": -0.14407536387443542,
"logps/chosen": -159.65185546875,
"logps/ref_chosen": -64.5841293334961,
"logps/ref_rejected": -93.47034454345703,
"logps/rejected": -226.2309112548828,
"loss": 1.1949,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.8030122518539429,
"rewards/margins": 0.3134186267852783,
"rewards/rejected": -1.1164307594299316,
"step": 575
},
{
"epoch": 0.8707482993197279,
"epsilon_dpo/beta": 0.008383544161915779,
"epsilon_dpo/beta_margin_grad_mean": -0.3934468626976013,
"epsilon_dpo/beta_margin_grad_std": 0.13522404432296753,
"epsilon_dpo/beta_margin_mean": 0.470424622297287,
"epsilon_dpo/beta_margin_std": 0.6084267497062683,
"epsilon_dpo/loss_margin_mean": 56.54560852050781,
"grad_norm": 11.731398582458496,
"kl/avg_steps": 0.5,
"kl/beta": 0.008424829691648483,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.5417538653170754e-08,
"logits/chosen": 0.10168327391147614,
"logits/rejected": -0.2125554233789444,
"logps/chosen": -137.221435546875,
"logps/ref_chosen": -53.28052520751953,
"logps/ref_rejected": -84.20004272460938,
"logps/rejected": -224.68655395507812,
"loss": 1.0555,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7051520943641663,
"rewards/margins": 0.470424622297287,
"rewards/rejected": -1.1755766868591309,
"step": 576
},
{
"epoch": 0.872260015117158,
"epsilon_dpo/beta": 0.008341834880411625,
"epsilon_dpo/beta_margin_grad_mean": -0.4219464361667633,
"epsilon_dpo/beta_margin_grad_std": 0.1365426778793335,
"epsilon_dpo/beta_margin_mean": 0.3374972343444824,
"epsilon_dpo/beta_margin_std": 0.6000775694847107,
"epsilon_dpo/loss_margin_mean": 40.90005874633789,
"grad_norm": 11.904960632324219,
"kl/avg_steps": 0.5,
"kl/beta": 0.008382915519177914,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 2.4839802933393607e-08,
"logits/chosen": -0.007333159446716309,
"logits/rejected": -0.04042524844408035,
"logps/chosen": -155.2171173095703,
"logps/ref_chosen": -62.32469177246094,
"logps/ref_rejected": -67.300537109375,
"logps/rejected": -201.093017578125,
"loss": 1.1619,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7761048078536987,
"rewards/margins": 0.33749720454216003,
"rewards/rejected": -1.1136019229888916,
"step": 577
},
{
"epoch": 0.873771730914588,
"epsilon_dpo/beta": 0.008318581618368626,
"epsilon_dpo/beta_margin_grad_mean": -0.4327985644340515,
"epsilon_dpo/beta_margin_grad_std": 0.13032911717891693,
"epsilon_dpo/beta_margin_mean": 0.2953794300556183,
"epsilon_dpo/beta_margin_std": 0.5692952275276184,
"epsilon_dpo/loss_margin_mean": 35.99378204345703,
"grad_norm": 10.861734390258789,
"kl/avg_steps": 0.28125,
"kl/beta": 0.008341209031641483,
"kl/n_epsilon_steps": 0.359375,
"kl/p_epsilon_steps": 0.640625,
"learning_rate": 2.4268365428344733e-08,
"logits/chosen": 0.09147296100854874,
"logits/rejected": -0.018801851198077202,
"logps/chosen": -146.52703857421875,
"logps/ref_chosen": -56.65557861328125,
"logps/ref_rejected": -68.21835327148438,
"logps/rejected": -194.08358764648438,
"loss": 1.1891,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.7487365007400513,
"rewards/margins": 0.2953794300556183,
"rewards/rejected": -1.0441159009933472,
"step": 578
},
{
"epoch": 0.8752834467120182,
"epsilon_dpo/beta": 0.008269255049526691,
"epsilon_dpo/beta_margin_grad_mean": -0.3884841203689575,
"epsilon_dpo/beta_margin_grad_std": 0.1199883297085762,
"epsilon_dpo/beta_margin_mean": 0.4820398688316345,
"epsilon_dpo/beta_margin_std": 0.5268412828445435,
"epsilon_dpo/loss_margin_mean": 58.65264129638672,
"grad_norm": 10.335455894470215,
"kl/avg_steps": 0.59375,
"kl/beta": 0.008317815139889717,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 2.3703242122359357e-08,
"logits/chosen": 0.06139584630727768,
"logits/rejected": 0.03873599320650101,
"logps/chosen": -151.46800231933594,
"logps/ref_chosen": -56.809661865234375,
"logps/ref_rejected": -68.09613037109375,
"logps/rejected": -221.4071044921875,
"loss": 1.0262,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7837352752685547,
"rewards/margins": 0.4820398688316345,
"rewards/rejected": -1.265775203704834,
"step": 579
},
{
"epoch": 0.8767951625094482,
"epsilon_dpo/beta": 0.008238535374403,
"epsilon_dpo/beta_margin_grad_mean": -0.4189753532409668,
"epsilon_dpo/beta_margin_grad_std": 0.14445728063583374,
"epsilon_dpo/beta_margin_mean": 0.3580981194972992,
"epsilon_dpo/beta_margin_std": 0.639573872089386,
"epsilon_dpo/loss_margin_mean": 44.00116729736328,
"grad_norm": 11.668439865112305,
"kl/avg_steps": 0.375,
"kl/beta": 0.008268719539046288,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.3144448823151392e-08,
"logits/chosen": 0.0028475839644670486,
"logits/rejected": -0.1367034763097763,
"logps/chosen": -151.4437713623047,
"logps/ref_chosen": -57.70011520385742,
"logps/ref_rejected": -77.90664672851562,
"logps/rejected": -215.65147399902344,
"loss": 1.1557,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7745280861854553,
"rewards/margins": 0.3580981492996216,
"rewards/rejected": -1.1326262950897217,
"step": 580
},
{
"epoch": 0.8783068783068783,
"epsilon_dpo/beta": 0.008205181919038296,
"epsilon_dpo/beta_margin_grad_mean": -0.42340657114982605,
"epsilon_dpo/beta_margin_grad_std": 0.13469408452510834,
"epsilon_dpo/beta_margin_mean": 0.33468595147132874,
"epsilon_dpo/beta_margin_std": 0.5863233804702759,
"epsilon_dpo/loss_margin_mean": 41.283748626708984,
"grad_norm": 13.026628494262695,
"kl/avg_steps": 0.40625,
"kl/beta": 0.008237827569246292,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.259200116137039e-08,
"logits/chosen": 0.09341280162334442,
"logits/rejected": -0.13382352888584137,
"logps/chosen": -164.80697631835938,
"logps/ref_chosen": -59.332359313964844,
"logps/ref_rejected": -83.64482116699219,
"logps/rejected": -230.40318298339844,
"loss": 1.1606,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.8671630024909973,
"rewards/margins": 0.33468595147132874,
"rewards/rejected": -1.2018489837646484,
"step": 581
},
{
"epoch": 0.8798185941043084,
"epsilon_dpo/beta": 0.008166855201125145,
"epsilon_dpo/beta_margin_grad_mean": -0.41138020157814026,
"epsilon_dpo/beta_margin_grad_std": 0.1309242844581604,
"epsilon_dpo/beta_margin_mean": 0.38620275259017944,
"epsilon_dpo/beta_margin_std": 0.5747610330581665,
"epsilon_dpo/loss_margin_mean": 47.71682357788086,
"grad_norm": 11.733780860900879,
"kl/avg_steps": 0.46875,
"kl/beta": 0.008204497396945953,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.204591459016525e-08,
"logits/chosen": 0.019137922674417496,
"logits/rejected": 0.18065905570983887,
"logps/chosen": -157.9627685546875,
"logps/ref_chosen": -64.16285705566406,
"logps/ref_rejected": -58.632896423339844,
"logps/rejected": -200.14962768554688,
"loss": 1.1144,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.7678611278533936,
"rewards/margins": 0.38620275259017944,
"rewards/rejected": -1.1540639400482178,
"step": 582
},
{
"epoch": 0.8813303099017384,
"epsilon_dpo/beta": 0.008133855648338795,
"epsilon_dpo/beta_margin_grad_mean": -0.40232956409454346,
"epsilon_dpo/beta_margin_grad_std": 0.14534710347652435,
"epsilon_dpo/beta_margin_mean": 0.42777982354164124,
"epsilon_dpo/beta_margin_std": 0.6396336555480957,
"epsilon_dpo/loss_margin_mean": 53.14875411987305,
"grad_norm": 12.845998764038086,
"kl/avg_steps": 0.40625,
"kl/beta": 0.008166218176484108,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 2.1506204384751064e-08,
"logits/chosen": 0.11665484309196472,
"logits/rejected": -0.16138173639774323,
"logps/chosen": -144.72470092773438,
"logps/ref_chosen": -51.87239456176758,
"logps/ref_rejected": -83.86331176757812,
"logps/rejected": -229.8643798828125,
"loss": 1.0996,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7562511563301086,
"rewards/margins": 0.4277798533439636,
"rewards/rejected": -1.1840310096740723,
"step": 583
},
{
"epoch": 0.8828420256991686,
"epsilon_dpo/beta": 0.008106029592454433,
"epsilon_dpo/beta_margin_grad_mean": -0.4164838194847107,
"epsilon_dpo/beta_margin_grad_std": 0.14666490256786346,
"epsilon_dpo/beta_margin_mean": 0.3699907958507538,
"epsilon_dpo/beta_margin_std": 0.6473835110664368,
"epsilon_dpo/loss_margin_mean": 46.214813232421875,
"grad_norm": 11.324408531188965,
"kl/avg_steps": 0.34375,
"kl/beta": 0.0081331767141819,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 2.09728856419826e-08,
"logits/chosen": 0.18188504874706268,
"logits/rejected": -0.15276563167572021,
"logps/chosen": -130.79421997070312,
"logps/ref_chosen": -46.571388244628906,
"logps/ref_rejected": -80.67969512939453,
"logps/rejected": -211.11734008789062,
"loss": 1.1483,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.6845394372940063,
"rewards/margins": 0.3699907660484314,
"rewards/rejected": -1.054530143737793,
"step": 584
},
{
"epoch": 0.8843537414965986,
"epsilon_dpo/beta": 0.008063062094151974,
"epsilon_dpo/beta_margin_grad_mean": -0.43578481674194336,
"epsilon_dpo/beta_margin_grad_std": 0.125066339969635,
"epsilon_dpo/beta_margin_mean": 0.2729918956756592,
"epsilon_dpo/beta_margin_std": 0.5514092445373535,
"epsilon_dpo/loss_margin_mean": 34.23734664916992,
"grad_norm": 11.248820304870605,
"kl/avg_steps": 0.53125,
"kl/beta": 0.008105315268039703,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 2.044597327993153e-08,
"logits/chosen": 0.04138148948550224,
"logits/rejected": -0.006087362766265869,
"logps/chosen": -156.91653442382812,
"logps/ref_chosen": -58.124534606933594,
"logps/ref_rejected": -79.00538635253906,
"logps/rejected": -212.03472900390625,
"loss": 1.2036,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7980189323425293,
"rewards/margins": 0.2729918956756592,
"rewards/rejected": -1.0710108280181885,
"step": 585
},
{
"epoch": 0.8858654572940288,
"epsilon_dpo/beta": 0.008017932996153831,
"epsilon_dpo/beta_margin_grad_mean": -0.40787455439567566,
"epsilon_dpo/beta_margin_grad_std": 0.10913902521133423,
"epsilon_dpo/beta_margin_mean": 0.39562439918518066,
"epsilon_dpo/beta_margin_std": 0.47668689489364624,
"epsilon_dpo/loss_margin_mean": 49.635353088378906,
"grad_norm": 12.100872993469238,
"kl/avg_steps": 0.5625,
"kl/beta": 0.008062482811510563,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.9925482037469187e-08,
"logits/chosen": 0.1364174783229828,
"logits/rejected": 0.026796605437994003,
"logps/chosen": -146.84213256835938,
"logps/ref_chosen": -54.10163879394531,
"logps/ref_rejected": -63.72113037109375,
"logps/rejected": -206.09698486328125,
"loss": 1.0827,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.7444946765899658,
"rewards/margins": 0.39562439918518066,
"rewards/rejected": -1.1401190757751465,
"step": 586
},
{
"epoch": 0.8873771730914588,
"epsilon_dpo/beta": 0.007980601862072945,
"epsilon_dpo/beta_margin_grad_mean": -0.407537579536438,
"epsilon_dpo/beta_margin_grad_std": 0.1508595496416092,
"epsilon_dpo/beta_margin_mean": 0.39538565278053284,
"epsilon_dpo/beta_margin_std": 0.691368579864502,
"epsilon_dpo/loss_margin_mean": 50.10236358642578,
"grad_norm": 11.429548263549805,
"kl/avg_steps": 0.46875,
"kl/beta": 0.008017385378479958,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.9411426473854687e-08,
"logits/chosen": 0.025330090895295143,
"logits/rejected": 0.037366680800914764,
"logps/chosen": -156.08505249023438,
"logps/ref_chosen": -63.41719436645508,
"logps/ref_rejected": -63.47003936767578,
"logps/rejected": -206.24024963378906,
"loss": 1.1398,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7418072819709778,
"rewards/margins": 0.39538562297821045,
"rewards/rejected": -1.137192964553833,
"step": 587
},
{
"epoch": 0.8888888888888888,
"epsilon_dpo/beta": 0.007938378490507603,
"epsilon_dpo/beta_margin_grad_mean": -0.40604764223098755,
"epsilon_dpo/beta_margin_grad_std": 0.1312301903963089,
"epsilon_dpo/beta_margin_mean": 0.4103603661060333,
"epsilon_dpo/beta_margin_std": 0.5784146189689636,
"epsilon_dpo/loss_margin_mean": 52.14124298095703,
"grad_norm": 13.238972663879395,
"kl/avg_steps": 0.53125,
"kl/beta": 0.007979978807270527,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.890382096832699e-08,
"logits/chosen": 0.06427621841430664,
"logits/rejected": -0.108741894364357,
"logps/chosen": -159.0752410888672,
"logps/ref_chosen": -62.20103454589844,
"logps/ref_rejected": -82.10250091552734,
"logps/rejected": -231.11795043945312,
"loss": 1.0956,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.7718620300292969,
"rewards/margins": 0.4103603959083557,
"rewards/rejected": -1.1822223663330078,
"step": 588
},
{
"epoch": 0.890400604686319,
"epsilon_dpo/beta": 0.007891467772424221,
"epsilon_dpo/beta_margin_grad_mean": -0.4017091989517212,
"epsilon_dpo/beta_margin_grad_std": 0.10897976160049438,
"epsilon_dpo/beta_margin_mean": 0.422450453042984,
"epsilon_dpo/beta_margin_std": 0.4800112247467041,
"epsilon_dpo/loss_margin_mean": 53.84581756591797,
"grad_norm": 10.671985626220703,
"kl/avg_steps": 0.59375,
"kl/beta": 0.007937809452414513,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.840267971970344e-08,
"logits/chosen": -0.002718959003686905,
"logits/rejected": -0.015046834945678711,
"logps/chosen": -146.4038543701172,
"logps/ref_chosen": -56.71361541748047,
"logps/ref_rejected": -76.7366943359375,
"logps/rejected": -220.27276611328125,
"loss": 1.0616,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.709425687789917,
"rewards/margins": 0.422450453042984,
"rewards/rejected": -1.1318762302398682,
"step": 589
},
{
"epoch": 0.891912320483749,
"epsilon_dpo/beta": 0.007849819958209991,
"epsilon_dpo/beta_margin_grad_mean": -0.4037237763404846,
"epsilon_dpo/beta_margin_grad_std": 0.12907341122627258,
"epsilon_dpo/beta_margin_mean": 0.4160190522670746,
"epsilon_dpo/beta_margin_std": 0.5652801990509033,
"epsilon_dpo/loss_margin_mean": 53.41786575317383,
"grad_norm": 10.42196273803711,
"kl/avg_steps": 0.53125,
"kl/beta": 0.007890956476330757,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.7908016745981856e-08,
"logits/chosen": -0.06222187355160713,
"logits/rejected": -0.11079029738903046,
"logps/chosen": -166.63790893554688,
"logps/ref_chosen": -66.5138168334961,
"logps/ref_rejected": -85.70820617675781,
"logps/rejected": -239.2501678466797,
"loss": 1.0881,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7874312996864319,
"rewards/margins": 0.4160190224647522,
"rewards/rejected": -1.203450322151184,
"step": 590
},
{
"epoch": 0.8934240362811792,
"epsilon_dpo/beta": 0.007803432643413544,
"epsilon_dpo/beta_margin_grad_mean": -0.39000368118286133,
"epsilon_dpo/beta_margin_grad_std": 0.14685600996017456,
"epsilon_dpo/beta_margin_mean": 0.48694413900375366,
"epsilon_dpo/beta_margin_std": 0.6528546214103699,
"epsilon_dpo/loss_margin_mean": 62.883907318115234,
"grad_norm": 13.084833145141602,
"kl/avg_steps": 0.59375,
"kl/beta": 0.007849257439374924,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 1.7419845883949098e-08,
"logits/chosen": 0.08892872929573059,
"logits/rejected": -0.08392320573329926,
"logps/chosen": -144.78277587890625,
"logps/ref_chosen": -60.697181701660156,
"logps/ref_rejected": -86.12278747558594,
"logps/rejected": -233.09228515625,
"loss": 1.0564,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6579023599624634,
"rewards/margins": 0.48694413900375366,
"rewards/rejected": -1.1448464393615723,
"step": 591
},
{
"epoch": 0.8949357520786092,
"epsilon_dpo/beta": 0.0077598122879862785,
"epsilon_dpo/beta_margin_grad_mean": -0.41340371966362,
"epsilon_dpo/beta_margin_grad_std": 0.12167234718799591,
"epsilon_dpo/beta_margin_mean": 0.3811741769313812,
"epsilon_dpo/beta_margin_std": 0.5453034043312073,
"epsilon_dpo/loss_margin_mean": 49.46284484863281,
"grad_norm": 11.433758735656738,
"kl/avg_steps": 0.5625,
"kl/beta": 0.007802927866578102,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.6938180788793556e-08,
"logits/chosen": 0.06828460097312927,
"logits/rejected": -0.1545882523059845,
"logps/chosen": -138.826416015625,
"logps/ref_chosen": -51.237327575683594,
"logps/ref_rejected": -81.60243225097656,
"logps/rejected": -218.65435791015625,
"loss": 1.1097,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6808744072914124,
"rewards/margins": 0.3811741769313812,
"rewards/rejected": -1.0620485544204712,
"step": 592
},
{
"epoch": 0.8964474678760394,
"epsilon_dpo/beta": 0.007723682560026646,
"epsilon_dpo/beta_margin_grad_mean": -0.4153226613998413,
"epsilon_dpo/beta_margin_grad_std": 0.13811911642551422,
"epsilon_dpo/beta_margin_mean": 0.3676586449146271,
"epsilon_dpo/beta_margin_std": 0.6002876162528992,
"epsilon_dpo/loss_margin_mean": 48.10410690307617,
"grad_norm": 11.860922813415527,
"kl/avg_steps": 0.46875,
"kl/beta": 0.007759281899780035,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.6463034933723336e-08,
"logits/chosen": 0.24936816096305847,
"logits/rejected": -0.07551784813404083,
"logps/chosen": -122.47266387939453,
"logps/ref_chosen": -42.08000183105469,
"logps/ref_rejected": -68.47499084472656,
"logps/rejected": -196.97177124023438,
"loss": 1.1374,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6228024959564209,
"rewards/margins": 0.36765867471694946,
"rewards/rejected": -0.9904611706733704,
"step": 593
},
{
"epoch": 0.8979591836734694,
"epsilon_dpo/beta": 0.007685232907533646,
"epsilon_dpo/beta_margin_grad_mean": -0.42152139544487,
"epsilon_dpo/beta_margin_grad_std": 0.1151556745171547,
"epsilon_dpo/beta_margin_mean": 0.33433103561401367,
"epsilon_dpo/beta_margin_std": 0.49525129795074463,
"epsilon_dpo/loss_margin_mean": 43.89540481567383,
"grad_norm": 11.084526062011719,
"kl/avg_steps": 0.5,
"kl/beta": 0.0077230799943208694,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.5994421609589385e-08,
"logits/chosen": -0.02860281616449356,
"logits/rejected": -0.049613598734140396,
"logps/chosen": -162.7176971435547,
"logps/ref_chosen": -63.65867614746094,
"logps/ref_rejected": -70.35597229003906,
"logps/rejected": -213.31040954589844,
"loss": 1.1381,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.7621855735778809,
"rewards/margins": 0.33433103561401367,
"rewards/rejected": -1.0965166091918945,
"step": 594
},
{
"epoch": 0.8994708994708994,
"epsilon_dpo/beta": 0.007630185689777136,
"epsilon_dpo/beta_margin_grad_mean": -0.3984719514846802,
"epsilon_dpo/beta_margin_grad_std": 0.1360938400030136,
"epsilon_dpo/beta_margin_mean": 0.44250786304473877,
"epsilon_dpo/beta_margin_std": 0.60281902551651,
"epsilon_dpo/loss_margin_mean": 58.3497314453125,
"grad_norm": 10.344907760620117,
"kl/avg_steps": 0.71875,
"kl/beta": 0.007684656418859959,
"kl/n_epsilon_steps": 0.140625,
"kl/p_epsilon_steps": 0.859375,
"learning_rate": 1.553235392451377e-08,
"logits/chosen": 0.07310786098241806,
"logits/rejected": -0.151771679520607,
"logps/chosen": -147.03689575195312,
"logps/ref_chosen": -56.21875762939453,
"logps/ref_rejected": -83.95773315429688,
"logps/rejected": -233.1256103515625,
"loss": 1.0768,
"rewards/accuracies": 0.859375,
"rewards/chosen": -0.6941012144088745,
"rewards/margins": 0.44250786304473877,
"rewards/rejected": -1.1366090774536133,
"step": 595
},
{
"epoch": 0.9009826152683296,
"epsilon_dpo/beta": 0.007616272661834955,
"epsilon_dpo/beta_margin_grad_mean": -0.4501587450504303,
"epsilon_dpo/beta_margin_grad_std": 0.11889635771512985,
"epsilon_dpo/beta_margin_mean": 0.21836520731449127,
"epsilon_dpo/beta_margin_std": 0.5180902481079102,
"epsilon_dpo/loss_margin_mean": 29.141748428344727,
"grad_norm": 10.07887077331543,
"kl/avg_steps": 0.1875,
"kl/beta": 0.007629817351698875,
"kl/n_epsilon_steps": 0.40625,
"kl/p_epsilon_steps": 0.59375,
"learning_rate": 1.507684480352292e-08,
"logits/chosen": -0.08291863650083542,
"logits/rejected": -0.03285349905490875,
"logps/chosen": -172.1246337890625,
"logps/ref_chosen": -68.48088073730469,
"logps/ref_rejected": -61.732967376708984,
"logps/rejected": -194.51846313476562,
"loss": 1.2435,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.7914004325866699,
"rewards/margins": 0.21836520731449127,
"rewards/rejected": -1.009765625,
"step": 596
},
{
"epoch": 0.9024943310657596,
"epsilon_dpo/beta": 0.0075829788111150265,
"epsilon_dpo/beta_margin_grad_mean": -0.42246386408805847,
"epsilon_dpo/beta_margin_grad_std": 0.11843694001436234,
"epsilon_dpo/beta_margin_mean": 0.33559679985046387,
"epsilon_dpo/beta_margin_std": 0.5121856927871704,
"epsilon_dpo/loss_margin_mean": 44.67052459716797,
"grad_norm": 9.06714916229248,
"kl/avg_steps": 0.4375,
"kl/beta": 0.007615538313984871,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.4627906988186111e-08,
"logits/chosen": 0.1500977873802185,
"logits/rejected": 0.11747360229492188,
"logps/chosen": -128.927734375,
"logps/ref_chosen": -48.85750961303711,
"logps/ref_rejected": -55.068084716796875,
"logps/rejected": -179.808837890625,
"loss": 1.1407,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6085903644561768,
"rewards/margins": 0.33559679985046387,
"rewards/rejected": -0.9441871643066406,
"step": 597
},
{
"epoch": 0.9040060468631897,
"epsilon_dpo/beta": 0.007557056378573179,
"epsilon_dpo/beta_margin_grad_mean": -0.44960156083106995,
"epsilon_dpo/beta_margin_grad_std": 0.12332341074943542,
"epsilon_dpo/beta_margin_mean": 0.2178182452917099,
"epsilon_dpo/beta_margin_std": 0.5318365693092346,
"epsilon_dpo/loss_margin_mean": 29.29388999938965,
"grad_norm": 13.698641777038574,
"kl/avg_steps": 0.34375,
"kl/beta": 0.007582365069538355,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.4185553036259095e-08,
"logits/chosen": 0.04286496341228485,
"logits/rejected": -0.13238824903964996,
"logps/chosen": -165.68869018554688,
"logps/ref_chosen": -58.88715362548828,
"logps/ref_rejected": -81.43145751953125,
"logps/rejected": -217.52688598632812,
"loss": 1.2479,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8088340759277344,
"rewards/margins": 0.2178182601928711,
"rewards/rejected": -1.0266523361206055,
"step": 598
},
{
"epoch": 0.9055177626606198,
"epsilon_dpo/beta": 0.007533529307693243,
"epsilon_dpo/beta_margin_grad_mean": -0.43888407945632935,
"epsilon_dpo/beta_margin_grad_std": 0.12328551709651947,
"epsilon_dpo/beta_margin_mean": 0.26401567459106445,
"epsilon_dpo/beta_margin_std": 0.5303941369056702,
"epsilon_dpo/loss_margin_mean": 35.55364990234375,
"grad_norm": 11.04465389251709,
"kl/avg_steps": 0.3125,
"kl/beta": 0.007556390017271042,
"kl/n_epsilon_steps": 0.34375,
"kl/p_epsilon_steps": 0.65625,
"learning_rate": 1.3749795321332885e-08,
"logits/chosen": 0.11935050785541534,
"logits/rejected": 0.021623089909553528,
"logps/chosen": -165.39453125,
"logps/ref_chosen": -57.60719680786133,
"logps/ref_rejected": -71.80469512939453,
"logps/rejected": -215.1456756591797,
"loss": 1.2068,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.814131498336792,
"rewards/margins": 0.26401567459106445,
"rewards/rejected": -1.0781471729278564,
"step": 599
},
{
"epoch": 0.9070294784580499,
"epsilon_dpo/beta": 0.007498289458453655,
"epsilon_dpo/beta_margin_grad_mean": -0.42788735032081604,
"epsilon_dpo/beta_margin_grad_std": 0.12347178161144257,
"epsilon_dpo/beta_margin_mean": 0.3114525079727173,
"epsilon_dpo/beta_margin_std": 0.5340651273727417,
"epsilon_dpo/loss_margin_mean": 41.97848129272461,
"grad_norm": 12.164484977722168,
"kl/avg_steps": 0.46875,
"kl/beta": 0.007532849907875061,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.3320646032487393e-08,
"logits/chosen": -0.01866793818771839,
"logits/rejected": -0.09087549149990082,
"logps/chosen": -155.71920776367188,
"logps/ref_chosen": -58.44231414794922,
"logps/ref_rejected": -83.64639282226562,
"logps/rejected": -222.90176391601562,
"loss": 1.1666,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7313058376312256,
"rewards/margins": 0.3114525079727173,
"rewards/rejected": -1.0427583456039429,
"step": 600
},
{
"epoch": 0.9070294784580499,
"eval_epsilon_dpo/beta": 0.007465483620762825,
"eval_epsilon_dpo/beta_margin_grad_mean": -0.42372190952301025,
"eval_epsilon_dpo/beta_margin_grad_std": 0.12393853068351746,
"eval_epsilon_dpo/beta_margin_mean": 0.32965075969696045,
"eval_epsilon_dpo/beta_margin_std": 0.5411303043365479,
"eval_epsilon_dpo/loss_margin_mean": 44.62019348144531,
"eval_kl/n_epsilon_steps": 0.27992957830429077,
"eval_kl/p_epsilon_steps": 0.7196303009986877,
"eval_logits/chosen": 0.03963098302483559,
"eval_logits/rejected": -0.06409834325313568,
"eval_logps/chosen": -168.1115264892578,
"eval_logps/ref_chosen": -74.85946655273438,
"eval_logps/ref_rejected": -79.54898834228516,
"eval_logps/rejected": -217.42123413085938,
"eval_loss": 0.5777515769004822,
"eval_rewards/accuracies": 0.7191901206970215,
"eval_rewards/chosen": -0.698003888130188,
"eval_rewards/margins": 0.32965072989463806,
"eval_rewards/rejected": -1.027654767036438,
"eval_runtime": 42.0712,
"eval_samples_per_second": 54.741,
"eval_steps_per_second": 1.711,
"step": 600
},
{
"epoch": 0.90854119425548,
"epsilon_dpo/beta": 0.007460962049663067,
"epsilon_dpo/beta_margin_grad_mean": -0.4058528542518616,
"epsilon_dpo/beta_margin_grad_std": 0.13406476378440857,
"epsilon_dpo/beta_margin_mean": 0.410722553730011,
"epsilon_dpo/beta_margin_std": 0.5876964926719666,
"epsilon_dpo/loss_margin_mean": 55.532718658447266,
"grad_norm": 10.722939491271973,
"kl/avg_steps": 0.5,
"kl/beta": 0.007497704587876797,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.2898117173950868e-08,
"logits/chosen": 0.055346377193927765,
"logits/rejected": -0.11516669392585754,
"logps/chosen": -140.23919677734375,
"logps/ref_chosen": -55.59432601928711,
"logps/ref_rejected": -83.68630981445312,
"logps/rejected": -223.86390686035156,
"loss": 1.0982,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.6332917213439941,
"rewards/margins": 0.410722553730011,
"rewards/rejected": -1.04401433467865,
"step": 601
},
{
"epoch": 0.91005291005291,
"epsilon_dpo/beta": 0.007411006838083267,
"epsilon_dpo/beta_margin_grad_mean": -0.4077316224575043,
"epsilon_dpo/beta_margin_grad_std": 0.11194012314081192,
"epsilon_dpo/beta_margin_mean": 0.3954795002937317,
"epsilon_dpo/beta_margin_std": 0.4895583391189575,
"epsilon_dpo/loss_margin_mean": 53.647003173828125,
"grad_norm": 9.719115257263184,
"kl/avg_steps": 0.671875,
"kl/beta": 0.007460402324795723,
"kl/n_epsilon_steps": 0.15625,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 1.2482220564763667e-08,
"logits/chosen": 0.026923656463623047,
"logits/rejected": -0.05092637240886688,
"logps/chosen": -134.0385284423828,
"logps/ref_chosen": -56.349185943603516,
"logps/ref_rejected": -71.9959716796875,
"logps/rejected": -203.33230590820312,
"loss": 1.0858,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.5762945413589478,
"rewards/margins": 0.3954795002937317,
"rewards/rejected": -0.9717740416526794,
"step": 602
},
{
"epoch": 0.9115646258503401,
"epsilon_dpo/beta": 0.0073696644976735115,
"epsilon_dpo/beta_margin_grad_mean": -0.41273772716522217,
"epsilon_dpo/beta_margin_grad_std": 0.12069539725780487,
"epsilon_dpo/beta_margin_mean": 0.3755446672439575,
"epsilon_dpo/beta_margin_std": 0.5277886390686035,
"epsilon_dpo/loss_margin_mean": 51.373809814453125,
"grad_norm": 11.57040786743164,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0074106124229729176,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.2072967838448051e-08,
"logits/chosen": 0.12180892378091812,
"logits/rejected": 0.038878731429576874,
"logps/chosen": -140.5664825439453,
"logps/ref_chosen": -53.168392181396484,
"logps/ref_rejected": -73.8604736328125,
"logps/rejected": -212.6323699951172,
"loss": 1.1111,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6450119018554688,
"rewards/margins": 0.3755446672439575,
"rewards/rejected": -1.0205565690994263,
"step": 603
},
{
"epoch": 0.9130763416477702,
"epsilon_dpo/beta": 0.007335351314395666,
"epsilon_dpo/beta_margin_grad_mean": -0.4238927364349365,
"epsilon_dpo/beta_margin_grad_std": 0.12766797840595245,
"epsilon_dpo/beta_margin_mean": 0.32990798354148865,
"epsilon_dpo/beta_margin_std": 0.5593236684799194,
"epsilon_dpo/loss_margin_mean": 45.43312072753906,
"grad_norm": 11.01646614074707,
"kl/avg_steps": 0.46875,
"kl/beta": 0.007369161117821932,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 1.1670370442682459e-08,
"logits/chosen": -0.05858701467514038,
"logits/rejected": -0.05502926558256149,
"logps/chosen": -156.8004150390625,
"logps/ref_chosen": -72.64942169189453,
"logps/ref_rejected": -69.87926483154297,
"logps/rejected": -199.46337890625,
"loss": 1.1571,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6187934279441833,
"rewards/margins": 0.32990801334381104,
"rewards/rejected": -0.9487013816833496,
"step": 604
},
{
"epoch": 0.9145880574452003,
"epsilon_dpo/beta": 0.0072965421713888645,
"epsilon_dpo/beta_margin_grad_mean": -0.4235232174396515,
"epsilon_dpo/beta_margin_grad_std": 0.12755514681339264,
"epsilon_dpo/beta_margin_mean": 0.32660332322120667,
"epsilon_dpo/beta_margin_std": 0.5486314296722412,
"epsilon_dpo/loss_margin_mean": 45.23341369628906,
"grad_norm": 10.986143112182617,
"kl/avg_steps": 0.53125,
"kl/beta": 0.007334779016673565,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.1274439638981532e-08,
"logits/chosen": 0.04533889517188072,
"logits/rejected": -0.15611541271209717,
"logps/chosen": -160.96194458007812,
"logps/ref_chosen": -61.61284637451172,
"logps/ref_rejected": -79.34398651123047,
"logps/rejected": -223.926513671875,
"loss": 1.158,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7269819378852844,
"rewards/margins": 0.3266032934188843,
"rewards/rejected": -1.0535852909088135,
"step": 605
},
{
"epoch": 0.9160997732426304,
"epsilon_dpo/beta": 0.007267105858772993,
"epsilon_dpo/beta_margin_grad_mean": -0.4112081527709961,
"epsilon_dpo/beta_margin_grad_std": 0.12008678168058395,
"epsilon_dpo/beta_margin_mean": 0.3835746943950653,
"epsilon_dpo/beta_margin_std": 0.5218533873558044,
"epsilon_dpo/loss_margin_mean": 53.274147033691406,
"grad_norm": 11.376989364624023,
"kl/avg_steps": 0.40625,
"kl/beta": 0.007296019233763218,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 1.0885186502381016e-08,
"logits/chosen": 0.04323825612664223,
"logits/rejected": -0.1311364471912384,
"logps/chosen": -141.42286682128906,
"logps/ref_chosen": -54.464237213134766,
"logps/ref_rejected": -79.6270751953125,
"logps/rejected": -219.85986328125,
"loss": 1.1033,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.6330260038375854,
"rewards/margins": 0.3835746645927429,
"rewards/rejected": -1.0166006088256836,
"step": 606
},
{
"epoch": 0.9176114890400605,
"epsilon_dpo/beta": 0.007235431578010321,
"epsilon_dpo/beta_margin_grad_mean": -0.4161735475063324,
"epsilon_dpo/beta_margin_grad_std": 0.12085915356874466,
"epsilon_dpo/beta_margin_mean": 0.36265861988067627,
"epsilon_dpo/beta_margin_std": 0.525148868560791,
"epsilon_dpo/loss_margin_mean": 50.57728958129883,
"grad_norm": 9.329792022705078,
"kl/avg_steps": 0.4375,
"kl/beta": 0.007266499102115631,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 1.0502621921127774e-08,
"logits/chosen": 0.009375464171171188,
"logits/rejected": -0.04926396533846855,
"logps/chosen": -158.64186096191406,
"logps/ref_chosen": -62.86086654663086,
"logps/ref_rejected": -72.55020141601562,
"logps/rejected": -218.90847778320312,
"loss": 1.1213,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6937291026115417,
"rewards/margins": 0.36265861988067627,
"rewards/rejected": -1.0563877820968628,
"step": 607
},
{
"epoch": 0.9191232048374905,
"epsilon_dpo/beta": 0.007195988669991493,
"epsilon_dpo/beta_margin_grad_mean": -0.4237441420555115,
"epsilon_dpo/beta_margin_grad_std": 0.10979735851287842,
"epsilon_dpo/beta_margin_mean": 0.32675349712371826,
"epsilon_dpo/beta_margin_std": 0.47603684663772583,
"epsilon_dpo/loss_margin_mean": 45.74920654296875,
"grad_norm": 10.672455787658691,
"kl/avg_steps": 0.546875,
"kl/beta": 0.007234846241772175,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 1.0126756596375685e-08,
"logits/chosen": 0.009275710210204124,
"logits/rejected": -0.16440889239311218,
"logps/chosen": -163.0616455078125,
"logps/ref_chosen": -63.18071746826172,
"logps/ref_rejected": -99.15888977050781,
"logps/rejected": -244.78903198242188,
"loss": 1.1396,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7204186916351318,
"rewards/margins": 0.32675349712371826,
"rewards/rejected": -1.0471720695495605,
"step": 608
},
{
"epoch": 0.9206349206349206,
"epsilon_dpo/beta": 0.007155736908316612,
"epsilon_dpo/beta_margin_grad_mean": -0.4017157554626465,
"epsilon_dpo/beta_margin_grad_std": 0.10842680931091309,
"epsilon_dpo/beta_margin_mean": 0.41583919525146484,
"epsilon_dpo/beta_margin_std": 0.4688786566257477,
"epsilon_dpo/loss_margin_mean": 58.50373458862305,
"grad_norm": 9.007486343383789,
"kl/avg_steps": 0.5625,
"kl/beta": 0.007195496000349522,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 9.757601041885694e-09,
"logits/chosen": 0.10885617136955261,
"logits/rejected": 0.06380142271518707,
"logps/chosen": -133.4068603515625,
"logps/ref_chosen": -48.62322235107422,
"logps/ref_rejected": -68.28271484375,
"logps/rejected": -211.57008361816406,
"loss": 1.0653,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6078311204910278,
"rewards/margins": 0.41583919525146484,
"rewards/rejected": -1.0236703157424927,
"step": 609
},
{
"epoch": 0.9221466364323507,
"epsilon_dpo/beta": 0.007124656345695257,
"epsilon_dpo/beta_margin_grad_mean": -0.41456207633018494,
"epsilon_dpo/beta_margin_grad_std": 0.12071671336889267,
"epsilon_dpo/beta_margin_mean": 0.37014347314834595,
"epsilon_dpo/beta_margin_std": 0.5261032581329346,
"epsilon_dpo/loss_margin_mean": 52.425601959228516,
"grad_norm": 9.742947578430176,
"kl/avg_steps": 0.4375,
"kl/beta": 0.007155247963964939,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 9.395165583732379e-09,
"logits/chosen": -0.21154795587062836,
"logits/rejected": -0.167566180229187,
"logps/chosen": -167.72744750976562,
"logps/ref_chosen": -72.66513061523438,
"logps/ref_rejected": -87.15311431884766,
"logps/rejected": -234.64105224609375,
"loss": 1.1152,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.6791345477104187,
"rewards/margins": 0.37014347314834595,
"rewards/rejected": -1.0492780208587646,
"step": 610
},
{
"epoch": 0.9236583522297808,
"epsilon_dpo/beta": 0.007091394625604153,
"epsilon_dpo/beta_margin_grad_mean": -0.4277513027191162,
"epsilon_dpo/beta_margin_grad_std": 0.10382693260908127,
"epsilon_dpo/beta_margin_mean": 0.3041156232357025,
"epsilon_dpo/beta_margin_std": 0.4398960769176483,
"epsilon_dpo/loss_margin_mean": 43.281211853027344,
"grad_norm": 9.944867134094238,
"kl/avg_steps": 0.46875,
"kl/beta": 0.00712407985702157,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 9.03946036001449e-09,
"logits/chosen": 0.10931895673274994,
"logits/rejected": -0.06344226002693176,
"logps/chosen": -134.90826416015625,
"logps/ref_chosen": -48.30857849121094,
"logps/ref_rejected": -70.6141128540039,
"logps/rejected": -200.49502563476562,
"loss": 1.1517,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6150293946266174,
"rewards/margins": 0.3041156232357025,
"rewards/rejected": -0.9191450476646423,
"step": 611
},
{
"epoch": 0.9251700680272109,
"epsilon_dpo/beta": 0.007051660679280758,
"epsilon_dpo/beta_margin_grad_mean": -0.40716853737831116,
"epsilon_dpo/beta_margin_grad_std": 0.11803495138883591,
"epsilon_dpo/beta_margin_mean": 0.3968473970890045,
"epsilon_dpo/beta_margin_std": 0.5169604420661926,
"epsilon_dpo/loss_margin_mean": 56.697147369384766,
"grad_norm": 9.84080982208252,
"kl/avg_steps": 0.5625,
"kl/beta": 0.007090841419994831,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 8.690495320571839e-09,
"logits/chosen": -0.015712738037109375,
"logits/rejected": -0.12880679965019226,
"logps/chosen": -158.4119110107422,
"logps/ref_chosen": -61.23155975341797,
"logps/ref_rejected": -94.37979888916016,
"logps/rejected": -248.25729370117188,
"loss": 1.0912,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6870585680007935,
"rewards/margins": 0.3968474268913269,
"rewards/rejected": -1.0839059352874756,
"step": 612
},
{
"epoch": 0.926681783824641,
"epsilon_dpo/beta": 0.007014420814812183,
"epsilon_dpo/beta_margin_grad_mean": -0.40466073155403137,
"epsilon_dpo/beta_margin_grad_std": 0.11337319016456604,
"epsilon_dpo/beta_margin_mean": 0.4067113697528839,
"epsilon_dpo/beta_margin_std": 0.491860955953598,
"epsilon_dpo/loss_margin_mean": 58.391170501708984,
"grad_norm": 8.444968223571777,
"kl/avg_steps": 0.53125,
"kl/beta": 0.007051178719848394,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 8.348280226706722e-09,
"logits/chosen": 0.04089689999818802,
"logits/rejected": 0.09657607972621918,
"logps/chosen": -136.03172302246094,
"logps/ref_chosen": -53.98310852050781,
"logps/ref_rejected": -58.32208251953125,
"logps/rejected": -198.76187133789062,
"loss": 1.0776,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5768507719039917,
"rewards/margins": 0.4067113697528839,
"rewards/rejected": -0.9835621118545532,
"step": 613
},
{
"epoch": 0.9281934996220711,
"epsilon_dpo/beta": 0.006972969509661198,
"epsilon_dpo/beta_margin_grad_mean": -0.4126204550266266,
"epsilon_dpo/beta_margin_grad_std": 0.10445983707904816,
"epsilon_dpo/beta_margin_mean": 0.36727526783943176,
"epsilon_dpo/beta_margin_std": 0.4458141326904297,
"epsilon_dpo/loss_margin_mean": 53.03390884399414,
"grad_norm": 10.596892356872559,
"kl/avg_steps": 0.59375,
"kl/beta": 0.007013917434960604,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 8.012824650910937e-09,
"logits/chosen": -0.031832028180360794,
"logits/rejected": 0.109224334359169,
"logps/chosen": -155.0105438232422,
"logps/ref_chosen": -60.24303436279297,
"logps/ref_rejected": -72.26258850097656,
"logps/rejected": -220.06402587890625,
"loss": 1.1,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6618989706039429,
"rewards/margins": 0.3672752380371094,
"rewards/rejected": -1.0291742086410522,
"step": 614
},
{
"epoch": 0.9297052154195011,
"epsilon_dpo/beta": 0.006936169695109129,
"epsilon_dpo/beta_margin_grad_mean": -0.4119343161582947,
"epsilon_dpo/beta_margin_grad_std": 0.13213057816028595,
"epsilon_dpo/beta_margin_mean": 0.37970831990242004,
"epsilon_dpo/beta_margin_std": 0.5761052966117859,
"epsilon_dpo/loss_margin_mean": 55.22464370727539,
"grad_norm": 9.393294334411621,
"kl/avg_steps": 0.53125,
"kl/beta": 0.006972517818212509,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 7.684137976598088e-09,
"logits/chosen": -0.19200178980827332,
"logits/rejected": -0.1479618400335312,
"logps/chosen": -171.401611328125,
"logps/ref_chosen": -72.09467315673828,
"logps/ref_rejected": -104.02980041503906,
"logps/rejected": -258.5614013671875,
"loss": 1.1207,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6904047131538391,
"rewards/margins": 0.37970834970474243,
"rewards/rejected": -1.0701130628585815,
"step": 615
},
{
"epoch": 0.9312169312169312,
"epsilon_dpo/beta": 0.0069103543646633625,
"epsilon_dpo/beta_margin_grad_mean": -0.4270230233669281,
"epsilon_dpo/beta_margin_grad_std": 0.11226309090852737,
"epsilon_dpo/beta_margin_mean": 0.31189143657684326,
"epsilon_dpo/beta_margin_std": 0.4825916886329651,
"epsilon_dpo/loss_margin_mean": 45.58794021606445,
"grad_norm": 9.05135440826416,
"kl/avg_steps": 0.375,
"kl/beta": 0.006935672368854284,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 7.36222939784098e-09,
"logits/chosen": 0.13671629130840302,
"logits/rejected": -0.03375185281038284,
"logps/chosen": -153.08074951171875,
"logps/ref_chosen": -58.53071975708008,
"logps/ref_rejected": -75.48025512695312,
"logps/rejected": -215.61822509765625,
"loss": 1.154,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6540100574493408,
"rewards/margins": 0.31189143657684326,
"rewards/rejected": -0.9659014940261841,
"step": 616
},
{
"epoch": 0.9327286470143613,
"epsilon_dpo/beta": 0.006870489567518234,
"epsilon_dpo/beta_margin_grad_mean": -0.4331701695919037,
"epsilon_dpo/beta_margin_grad_std": 0.10922081023454666,
"epsilon_dpo/beta_margin_mean": 0.28335583209991455,
"epsilon_dpo/beta_margin_std": 0.4693763554096222,
"epsilon_dpo/loss_margin_mean": 41.612857818603516,
"grad_norm": 11.716795921325684,
"kl/avg_steps": 0.578125,
"kl/beta": 0.006909760646522045,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 7.047107919114586e-09,
"logits/chosen": -0.015370100736618042,
"logits/rejected": -0.13238976895809174,
"logps/chosen": -161.82861328125,
"logps/ref_chosen": -57.608673095703125,
"logps/ref_rejected": -81.22109985351562,
"logps/rejected": -227.05389404296875,
"loss": 1.1755,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.7172625064849854,
"rewards/margins": 0.28335583209991455,
"rewards/rejected": -1.0006183385849,
"step": 617
},
{
"epoch": 0.9342403628117913,
"epsilon_dpo/beta": 0.006827787961810827,
"epsilon_dpo/beta_margin_grad_mean": -0.4208817481994629,
"epsilon_dpo/beta_margin_grad_std": 0.10455264896154404,
"epsilon_dpo/beta_margin_mean": 0.3381859362125397,
"epsilon_dpo/beta_margin_std": 0.453767865896225,
"epsilon_dpo/loss_margin_mean": 49.83127212524414,
"grad_norm": 11.439993858337402,
"kl/avg_steps": 0.625,
"kl/beta": 0.006870042998343706,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 6.738782355044048e-09,
"logits/chosen": 0.06061525270342827,
"logits/rejected": -0.2049265205860138,
"logps/chosen": -146.90414428710938,
"logps/ref_chosen": -56.69594192504883,
"logps/ref_rejected": -85.92362976074219,
"logps/rejected": -225.96310424804688,
"loss": 1.1251,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.6173588633537292,
"rewards/margins": 0.3381859064102173,
"rewards/rejected": -0.9555448293685913,
"step": 618
},
{
"epoch": 0.9357520786092215,
"epsilon_dpo/beta": 0.0067939143627882,
"epsilon_dpo/beta_margin_grad_mean": -0.4224725067615509,
"epsilon_dpo/beta_margin_grad_std": 0.10785052180290222,
"epsilon_dpo/beta_margin_mean": 0.3281177878379822,
"epsilon_dpo/beta_margin_std": 0.46251538395881653,
"epsilon_dpo/loss_margin_mean": 48.69709777832031,
"grad_norm": 10.064620018005371,
"kl/avg_steps": 0.5,
"kl/beta": 0.0068273721262812614,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 6.437261330158206e-09,
"logits/chosen": 0.058192264288663864,
"logits/rejected": -0.11182879656553268,
"logps/chosen": -142.52651977539062,
"logps/ref_chosen": -54.05841827392578,
"logps/ref_rejected": -83.55493927001953,
"logps/rejected": -220.72015380859375,
"loss": 1.1359,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6021831631660461,
"rewards/margins": 0.3281177878379822,
"rewards/rejected": -0.9303009510040283,
"step": 619
},
{
"epoch": 0.9372637944066515,
"epsilon_dpo/beta": 0.006768606137484312,
"epsilon_dpo/beta_margin_grad_mean": -0.43207570910453796,
"epsilon_dpo/beta_margin_grad_std": 0.10796722024679184,
"epsilon_dpo/beta_margin_mean": 0.2891670763492584,
"epsilon_dpo/beta_margin_std": 0.45973464846611023,
"epsilon_dpo/loss_margin_mean": 43.17377471923828,
"grad_norm": 10.38392448425293,
"kl/avg_steps": 0.375,
"kl/beta": 0.006793404929339886,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 6.142553278648238e-09,
"logits/chosen": -0.009588861837983131,
"logits/rejected": -0.016385123133659363,
"logps/chosen": -150.55735778808594,
"logps/ref_chosen": -63.36971664428711,
"logps/ref_rejected": -65.68268585205078,
"logps/rejected": -196.04409790039062,
"loss": 1.1685,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5913649201393127,
"rewards/margins": 0.2891670763492584,
"rewards/rejected": -0.8805320262908936,
"step": 620
},
{
"epoch": 0.9387755102040817,
"epsilon_dpo/beta": 0.006736973766237497,
"epsilon_dpo/beta_margin_grad_mean": -0.43110784888267517,
"epsilon_dpo/beta_margin_grad_std": 0.11181029677391052,
"epsilon_dpo/beta_margin_mean": 0.29307910799980164,
"epsilon_dpo/beta_margin_std": 0.4820668697357178,
"epsilon_dpo/loss_margin_mean": 43.935176849365234,
"grad_norm": 10.178037643432617,
"kl/avg_steps": 0.46875,
"kl/beta": 0.006768024992197752,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 5.854666444131934e-09,
"logits/chosen": 0.09242188930511475,
"logits/rejected": -0.16551537811756134,
"logps/chosen": -142.29522705078125,
"logps/ref_chosen": -52.321224212646484,
"logps/ref_rejected": -88.09001159667969,
"logps/rejected": -221.99917602539062,
"loss": 1.1699,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6069018244743347,
"rewards/margins": 0.29307910799980164,
"rewards/rejected": -0.899980902671814,
"step": 621
},
{
"epoch": 0.9402872260015117,
"epsilon_dpo/beta": 0.006707646884024143,
"epsilon_dpo/beta_margin_grad_mean": -0.42685550451278687,
"epsilon_dpo/beta_margin_grad_std": 0.10311096906661987,
"epsilon_dpo/beta_margin_mean": 0.30909091234207153,
"epsilon_dpo/beta_margin_std": 0.43712377548217773,
"epsilon_dpo/loss_margin_mean": 46.49442672729492,
"grad_norm": 11.078190803527832,
"kl/avg_steps": 0.4375,
"kl/beta": 0.006736448034644127,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 5.573608879422875e-09,
"logits/chosen": -0.04872158169746399,
"logits/rejected": -0.07477103918790817,
"logps/chosen": -159.86968994140625,
"logps/ref_chosen": -59.86545944213867,
"logps/ref_rejected": -81.86668395996094,
"logps/rejected": -228.36534118652344,
"loss": 1.1468,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6726903915405273,
"rewards/margins": 0.30909091234207153,
"rewards/rejected": -0.9817812442779541,
"step": 622
},
{
"epoch": 0.9417989417989417,
"epsilon_dpo/beta": 0.006676332093775272,
"epsilon_dpo/beta_margin_grad_mean": -0.4220459461212158,
"epsilon_dpo/beta_margin_grad_std": 0.10684069991111755,
"epsilon_dpo/beta_margin_mean": 0.33005592226982117,
"epsilon_dpo/beta_margin_std": 0.45437002182006836,
"epsilon_dpo/loss_margin_mean": 49.880836486816406,
"grad_norm": 9.438215255737305,
"kl/avg_steps": 0.46875,
"kl/beta": 0.00670710438862443,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 5.299388446305342e-09,
"logits/chosen": -0.07584099471569061,
"logits/rejected": -0.1416528820991516,
"logps/chosen": -174.12042236328125,
"logps/ref_chosen": -67.36846160888672,
"logps/ref_rejected": -82.02734375,
"logps/rejected": -238.66014099121094,
"loss": 1.1327,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.7145916819572449,
"rewards/margins": 0.33005592226982117,
"rewards/rejected": -1.0446476936340332,
"step": 623
},
{
"epoch": 0.9433106575963719,
"epsilon_dpo/beta": 0.006649355869740248,
"epsilon_dpo/beta_margin_grad_mean": -0.4162905812263489,
"epsilon_dpo/beta_margin_grad_std": 0.11670554429292679,
"epsilon_dpo/beta_margin_mean": 0.3594978451728821,
"epsilon_dpo/beta_margin_std": 0.5036519765853882,
"epsilon_dpo/loss_margin_mean": 54.56452560424805,
"grad_norm": 9.69974136352539,
"kl/avg_steps": 0.40625,
"kl/beta": 0.006675811484456062,
"kl/n_epsilon_steps": 0.296875,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 5.03201281531429e-09,
"logits/chosen": 0.11633279174566269,
"logits/rejected": -0.06985671818256378,
"logps/chosen": -137.498779296875,
"logps/ref_chosen": -51.02655029296875,
"logps/ref_rejected": -76.49203491210938,
"logps/rejected": -217.52879333496094,
"loss": 1.1189,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.5761100053787231,
"rewards/margins": 0.3594978451728821,
"rewards/rejected": -0.93560791015625,
"step": 624
},
{
"epoch": 0.9448223733938019,
"epsilon_dpo/beta": 0.0066182962618768215,
"epsilon_dpo/beta_margin_grad_mean": -0.43883904814720154,
"epsilon_dpo/beta_margin_grad_std": 0.11473709344863892,
"epsilon_dpo/beta_margin_mean": 0.261360764503479,
"epsilon_dpo/beta_margin_std": 0.49338194727897644,
"epsilon_dpo/loss_margin_mean": 39.940860748291016,
"grad_norm": 9.432180404663086,
"kl/avg_steps": 0.46875,
"kl/beta": 0.006648800801485777,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.7714894655209174e-09,
"logits/chosen": 0.012524990364909172,
"logits/rejected": -0.14943927526474,
"logps/chosen": -145.58840942382812,
"logps/ref_chosen": -54.207618713378906,
"logps/ref_rejected": -84.93669891357422,
"logps/rejected": -216.25836181640625,
"loss": 1.2001,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6059010028839111,
"rewards/margins": 0.261360764503479,
"rewards/rejected": -0.8672617673873901,
"step": 625
},
{
"epoch": 0.9463340891912321,
"epsilon_dpo/beta": 0.006583280861377716,
"epsilon_dpo/beta_margin_grad_mean": -0.41852301359176636,
"epsilon_dpo/beta_margin_grad_std": 0.1270623356103897,
"epsilon_dpo/beta_margin_mean": 0.3543926477432251,
"epsilon_dpo/beta_margin_std": 0.553898274898529,
"epsilon_dpo/loss_margin_mean": 54.345157623291016,
"grad_norm": 9.763165473937988,
"kl/avg_steps": 0.53125,
"kl/beta": 0.006617779843509197,
"kl/n_epsilon_steps": 0.234375,
"kl/p_epsilon_steps": 0.765625,
"learning_rate": 4.517825684323323e-09,
"logits/chosen": 0.21380025148391724,
"logits/rejected": -0.05563541501760483,
"logps/chosen": -131.3229217529297,
"logps/ref_chosen": -45.06201934814453,
"logps/ref_rejected": -89.66368103027344,
"logps/rejected": -230.26974487304688,
"loss": 1.1353,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5695881843566895,
"rewards/margins": 0.3543926477432251,
"rewards/rejected": -0.9239808320999146,
"step": 626
},
{
"epoch": 0.9478458049886621,
"epsilon_dpo/beta": 0.006546434946358204,
"epsilon_dpo/beta_margin_grad_mean": -0.40834441781044006,
"epsilon_dpo/beta_margin_grad_std": 0.1129455491900444,
"epsilon_dpo/beta_margin_mean": 0.39005330204963684,
"epsilon_dpo/beta_margin_std": 0.48856836557388306,
"epsilon_dpo/loss_margin_mean": 60.00200271606445,
"grad_norm": 9.57247543334961,
"kl/avg_steps": 0.5625,
"kl/beta": 0.006582808680832386,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 4.271028567242818e-09,
"logits/chosen": -0.024713603779673576,
"logits/rejected": -0.19905048608779907,
"logps/chosen": -154.45907592773438,
"logps/ref_chosen": -58.791053771972656,
"logps/ref_rejected": -94.90802001953125,
"logps/rejected": -250.57803344726562,
"loss": 1.0904,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6272003650665283,
"rewards/margins": 0.39005327224731445,
"rewards/rejected": -1.0172536373138428,
"step": 627
},
{
"epoch": 0.9493575207860923,
"epsilon_dpo/beta": 0.00650777155533433,
"epsilon_dpo/beta_margin_grad_mean": -0.40838801860809326,
"epsilon_dpo/beta_margin_grad_std": 0.12563160061836243,
"epsilon_dpo/beta_margin_mean": 0.3874700665473938,
"epsilon_dpo/beta_margin_std": 0.5390621423721313,
"epsilon_dpo/loss_margin_mean": 60.05453872680664,
"grad_norm": 11.243673324584961,
"kl/avg_steps": 0.59375,
"kl/beta": 0.006545987445861101,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 4.0311050177251895e-09,
"logits/chosen": -0.037690669298172,
"logits/rejected": -0.04482515528798103,
"logps/chosen": -139.09957885742188,
"logps/ref_chosen": -52.8035774230957,
"logps/ref_rejected": -76.49468994140625,
"logps/rejected": -222.84524536132812,
"loss": 1.1054,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.5632553696632385,
"rewards/margins": 0.3874700665473938,
"rewards/rejected": -0.9507254362106323,
"step": 628
},
{
"epoch": 0.9508692365835223,
"epsilon_dpo/beta": 0.006467325612902641,
"epsilon_dpo/beta_margin_grad_mean": -0.428668349981308,
"epsilon_dpo/beta_margin_grad_std": 0.09284396469593048,
"epsilon_dpo/beta_margin_mean": 0.3013315200805664,
"epsilon_dpo/beta_margin_std": 0.398568332195282,
"epsilon_dpo/loss_margin_mean": 46.86011505126953,
"grad_norm": 9.765864372253418,
"kl/avg_steps": 0.625,
"kl/beta": 0.006507350131869316,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 3.798061746947995e-09,
"logits/chosen": -0.13407738506793976,
"logits/rejected": -0.06686470657587051,
"logps/chosen": -160.96990966796875,
"logps/ref_chosen": -70.71749877929688,
"logps/ref_rejected": -78.9627456665039,
"logps/rejected": -216.0752716064453,
"loss": 1.1453,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5841171741485596,
"rewards/margins": 0.3013315200805664,
"rewards/rejected": -0.885448694229126,
"step": 629
},
{
"epoch": 0.9523809523809523,
"epsilon_dpo/beta": 0.0064352406188845634,
"epsilon_dpo/beta_margin_grad_mean": -0.4158555269241333,
"epsilon_dpo/beta_margin_grad_std": 0.10698544979095459,
"epsilon_dpo/beta_margin_mean": 0.3580004870891571,
"epsilon_dpo/beta_margin_std": 0.4584910571575165,
"epsilon_dpo/loss_margin_mean": 56.05823516845703,
"grad_norm": 7.748056888580322,
"kl/avg_steps": 0.5,
"kl/beta": 0.006466931663453579,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 3.5719052736323806e-09,
"logits/chosen": 0.09015575796365738,
"logits/rejected": -0.006706856191158295,
"logps/chosen": -146.3076171875,
"logps/ref_chosen": -56.201412200927734,
"logps/ref_rejected": -74.69807434082031,
"logps/rejected": -220.8625030517578,
"loss": 1.1101,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5804177522659302,
"rewards/margins": 0.3580004870891571,
"rewards/rejected": -0.9384182691574097,
"step": 630
},
{
"epoch": 0.9538926681783825,
"epsilon_dpo/beta": 0.006407246924936771,
"epsilon_dpo/beta_margin_grad_mean": -0.40450039505958557,
"epsilon_dpo/beta_margin_grad_std": 0.1196960061788559,
"epsilon_dpo/beta_margin_mean": 0.4121440052986145,
"epsilon_dpo/beta_margin_std": 0.520190954208374,
"epsilon_dpo/loss_margin_mean": 64.84634399414062,
"grad_norm": 9.96324348449707,
"kl/avg_steps": 0.4375,
"kl/beta": 0.006434758193790913,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 3.352641923861144e-09,
"logits/chosen": -0.03412717580795288,
"logits/rejected": -0.2696327567100525,
"logps/chosen": -142.77847290039062,
"logps/ref_chosen": -58.820594787597656,
"logps/ref_rejected": -96.51437377929688,
"logps/rejected": -245.318603515625,
"loss": 1.0798,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5395218133926392,
"rewards/margins": 0.4121440052986145,
"rewards/rejected": -0.9516658186912537,
"step": 631
},
{
"epoch": 0.9554043839758125,
"epsilon_dpo/beta": 0.006369325798004866,
"epsilon_dpo/beta_margin_grad_mean": -0.4071274995803833,
"epsilon_dpo/beta_margin_grad_std": 0.1029144823551178,
"epsilon_dpo/beta_margin_mean": 0.39231714606285095,
"epsilon_dpo/beta_margin_std": 0.43879544734954834,
"epsilon_dpo/loss_margin_mean": 61.99010467529297,
"grad_norm": 8.712691307067871,
"kl/avg_steps": 0.59375,
"kl/beta": 0.006406728643923998,
"kl/n_epsilon_steps": 0.203125,
"kl/p_epsilon_steps": 0.796875,
"learning_rate": 3.140277830901428e-09,
"logits/chosen": -0.020818855613470078,
"logits/rejected": 0.010319981724023819,
"logps/chosen": -146.97023010253906,
"logps/ref_chosen": -58.786048889160156,
"logps/ref_rejected": -67.21923828125,
"logps/rejected": -217.39352416992188,
"loss": 1.078,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.562757134437561,
"rewards/margins": 0.39231714606285095,
"rewards/rejected": -0.9550743103027344,
"step": 632
},
{
"epoch": 0.9569160997732427,
"epsilon_dpo/beta": 0.00633969297632575,
"epsilon_dpo/beta_margin_grad_mean": -0.43209657073020935,
"epsilon_dpo/beta_margin_grad_std": 0.10680217295885086,
"epsilon_dpo/beta_margin_mean": 0.28936246037483215,
"epsilon_dpo/beta_margin_std": 0.45695391297340393,
"epsilon_dpo/loss_margin_mean": 46.07304763793945,
"grad_norm": 9.39486026763916,
"kl/avg_steps": 0.46875,
"kl/beta": 0.006368913222104311,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.9348189350335007e-09,
"logits/chosen": 0.14489537477493286,
"logits/rejected": -0.019632523879408836,
"logps/chosen": -135.10983276367188,
"logps/ref_chosen": -52.13019561767578,
"logps/ref_rejected": -67.23016357421875,
"logps/rejected": -196.2828369140625,
"loss": 1.1676,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5274480581283569,
"rewards/margins": 0.28936249017715454,
"rewards/rejected": -0.8168105483055115,
"step": 633
},
{
"epoch": 0.9584278155706727,
"epsilon_dpo/beta": 0.006331907119601965,
"epsilon_dpo/beta_margin_grad_mean": -0.46683964133262634,
"epsilon_dpo/beta_margin_grad_std": 0.10544212907552719,
"epsilon_dpo/beta_margin_mean": 0.14130796492099762,
"epsilon_dpo/beta_margin_std": 0.4424091875553131,
"epsilon_dpo/loss_margin_mean": 22.82888412475586,
"grad_norm": 11.229324340820312,
"kl/avg_steps": 0.125,
"kl/beta": 0.006339197978377342,
"kl/n_epsilon_steps": 0.4375,
"kl/p_epsilon_steps": 0.5625,
"learning_rate": 2.736270983384276e-09,
"logits/chosen": 0.08808039873838425,
"logits/rejected": 0.013496596366167068,
"logps/chosen": -160.33682250976562,
"logps/ref_chosen": -60.97979736328125,
"logps/ref_rejected": -58.50825119018555,
"logps/rejected": -180.69415283203125,
"loss": 1.2976,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6300903558731079,
"rewards/margins": 0.1413079798221588,
"rewards/rejected": -0.7713983058929443,
"step": 634
},
{
"epoch": 0.9599395313681028,
"epsilon_dpo/beta": 0.0063081723637878895,
"epsilon_dpo/beta_margin_grad_mean": -0.44409170746803284,
"epsilon_dpo/beta_margin_grad_std": 0.11822935938835144,
"epsilon_dpo/beta_margin_mean": 0.23734334111213684,
"epsilon_dpo/beta_margin_std": 0.499663770198822,
"epsilon_dpo/loss_margin_mean": 38.173614501953125,
"grad_norm": 8.142550468444824,
"kl/avg_steps": 0.375,
"kl/beta": 0.006331284064799547,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.5446395297668287e-09,
"logits/chosen": -0.0561397448182106,
"logits/rejected": -0.21694621443748474,
"logps/chosen": -178.86697387695312,
"logps/ref_chosen": -65.9730224609375,
"logps/ref_rejected": -85.61316680908203,
"logps/rejected": -236.68072509765625,
"loss": 1.2233,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.7139409780502319,
"rewards/margins": 0.23734335601329803,
"rewards/rejected": -0.9512842893600464,
"step": 635
},
{
"epoch": 0.9614512471655329,
"epsilon_dpo/beta": 0.006266863085329533,
"epsilon_dpo/beta_margin_grad_mean": -0.42008447647094727,
"epsilon_dpo/beta_margin_grad_std": 0.10251911729574203,
"epsilon_dpo/beta_margin_mean": 0.3363209068775177,
"epsilon_dpo/beta_margin_std": 0.43979865312576294,
"epsilon_dpo/loss_margin_mean": 54.00802230834961,
"grad_norm": 7.920770645141602,
"kl/avg_steps": 0.65625,
"kl/beta": 0.006307630334049463,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 2.359929934524829e-09,
"logits/chosen": 0.13026434183120728,
"logits/rejected": -0.10571019351482391,
"logps/chosen": -136.9871368408203,
"logps/ref_chosen": -49.140167236328125,
"logps/ref_rejected": -81.26970672607422,
"logps/rejected": -223.12469482421875,
"loss": 1.1242,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.5517352819442749,
"rewards/margins": 0.3363208770751953,
"rewards/rejected": -0.8880561590194702,
"step": 636
},
{
"epoch": 0.9629629629629629,
"epsilon_dpo/beta": 0.0062436312437057495,
"epsilon_dpo/beta_margin_grad_mean": -0.4420214891433716,
"epsilon_dpo/beta_margin_grad_std": 0.11377973854541779,
"epsilon_dpo/beta_margin_mean": 0.24802474677562714,
"epsilon_dpo/beta_margin_std": 0.4900610148906708,
"epsilon_dpo/loss_margin_mean": 40.22936248779297,
"grad_norm": 9.328143119812012,
"kl/avg_steps": 0.375,
"kl/beta": 0.006266506388783455,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 2.1821473643827137e-09,
"logits/chosen": 0.04495641961693764,
"logits/rejected": -0.15657472610473633,
"logps/chosen": -190.55067443847656,
"logps/ref_chosen": -73.69658660888672,
"logps/ref_rejected": -83.01487731933594,
"logps/rejected": -240.09832763671875,
"loss": 1.2109,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.7313704490661621,
"rewards/margins": 0.24802476167678833,
"rewards/rejected": -0.9793952107429504,
"step": 637
},
{
"epoch": 0.9644746787603931,
"epsilon_dpo/beta": 0.006214451510459185,
"epsilon_dpo/beta_margin_grad_mean": -0.43167394399642944,
"epsilon_dpo/beta_margin_grad_std": 0.10392957180738449,
"epsilon_dpo/beta_margin_mean": 0.2886382043361664,
"epsilon_dpo/beta_margin_std": 0.43859899044036865,
"epsilon_dpo/loss_margin_mean": 46.859130859375,
"grad_norm": 9.426673889160156,
"kl/avg_steps": 0.46875,
"kl/beta": 0.006243094801902771,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.0112967923011646e-09,
"logits/chosen": -0.0664314478635788,
"logits/rejected": -0.18105250597000122,
"logps/chosen": -165.05059814453125,
"logps/ref_chosen": -62.78158187866211,
"logps/ref_rejected": -85.40478515625,
"logps/rejected": -234.53292846679688,
"loss": 1.1647,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6368111371994019,
"rewards/margins": 0.2886382043361664,
"rewards/rejected": -0.9254493117332458,
"step": 638
},
{
"epoch": 0.9659863945578231,
"epsilon_dpo/beta": 0.006183515302836895,
"epsilon_dpo/beta_margin_grad_mean": -0.42163699865341187,
"epsilon_dpo/beta_margin_grad_std": 0.10470977425575256,
"epsilon_dpo/beta_margin_mean": 0.33371710777282715,
"epsilon_dpo/beta_margin_std": 0.449085533618927,
"epsilon_dpo/loss_margin_mean": 54.38595199584961,
"grad_norm": 9.339580535888672,
"kl/avg_steps": 0.5,
"kl/beta": 0.006213966757059097,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.847382997337943e-09,
"logits/chosen": 0.12869128584861755,
"logits/rejected": -0.11718625575304031,
"logps/chosen": -145.54495239257812,
"logps/ref_chosen": -53.76658248901367,
"logps/ref_rejected": -72.30009460449219,
"logps/rejected": -218.46441650390625,
"loss": 1.1282,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5689350366592407,
"rewards/margins": 0.33371710777282715,
"rewards/rejected": -0.9026521444320679,
"step": 639
},
{
"epoch": 0.9674981103552532,
"epsilon_dpo/beta": 0.006148886866867542,
"epsilon_dpo/beta_margin_grad_mean": -0.4303390085697174,
"epsilon_dpo/beta_margin_grad_std": 0.10404349118471146,
"epsilon_dpo/beta_margin_mean": 0.2902457118034363,
"epsilon_dpo/beta_margin_std": 0.4400752782821655,
"epsilon_dpo/loss_margin_mean": 47.627437591552734,
"grad_norm": 9.325575828552246,
"kl/avg_steps": 0.5625,
"kl/beta": 0.0061830515041947365,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.690410564514244e-09,
"logits/chosen": 0.1595688760280609,
"logits/rejected": -0.028746701776981354,
"logps/chosen": -147.52455139160156,
"logps/ref_chosen": -51.41777801513672,
"logps/ref_rejected": -77.27879333496094,
"logps/rejected": -221.01300048828125,
"loss": 1.1637,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5918633937835693,
"rewards/margins": 0.2902457118034363,
"rewards/rejected": -0.8821091055870056,
"step": 640
},
{
"epoch": 0.9690098261526833,
"epsilon_dpo/beta": 0.0061106495559215546,
"epsilon_dpo/beta_margin_grad_mean": -0.43354716897010803,
"epsilon_dpo/beta_margin_grad_std": 0.0875125303864479,
"epsilon_dpo/beta_margin_mean": 0.27646568417549133,
"epsilon_dpo/beta_margin_std": 0.3727814853191376,
"epsilon_dpo/loss_margin_mean": 45.52855682373047,
"grad_norm": 9.120386123657227,
"kl/avg_steps": 0.625,
"kl/beta": 0.006148466374725103,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.5403838846864692e-09,
"logits/chosen": -0.19152021408081055,
"logits/rejected": -0.19406136870384216,
"logps/chosen": -169.15484619140625,
"logps/ref_chosen": -71.0546646118164,
"logps/ref_rejected": -82.2440185546875,
"logps/rejected": -225.87274169921875,
"loss": 1.1622,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.6000571250915527,
"rewards/margins": 0.27646568417549133,
"rewards/rejected": -0.8765227794647217,
"step": 641
},
{
"epoch": 0.9705215419501134,
"epsilon_dpo/beta": 0.006089882459491491,
"epsilon_dpo/beta_margin_grad_mean": -0.44763198494911194,
"epsilon_dpo/beta_margin_grad_std": 0.10747512429952621,
"epsilon_dpo/beta_margin_mean": 0.22062529623508453,
"epsilon_dpo/beta_margin_std": 0.4535558521747589,
"epsilon_dpo/loss_margin_mean": 36.75855255126953,
"grad_norm": 10.424286842346191,
"kl/avg_steps": 0.34375,
"kl/beta": 0.006110277492552996,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 1.3973071544233218e-09,
"logits/chosen": -0.11139755696058273,
"logits/rejected": -0.0013796687126159668,
"logps/chosen": -173.82308959960938,
"logps/ref_chosen": -68.92927551269531,
"logps/ref_rejected": -70.85682678222656,
"logps/rejected": -212.50918579101562,
"loss": 1.2276,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.6407560110092163,
"rewards/margins": 0.22062531113624573,
"rewards/rejected": -0.8613812923431396,
"step": 642
},
{
"epoch": 0.9720332577475435,
"epsilon_dpo/beta": 0.006059504114091396,
"epsilon_dpo/beta_margin_grad_mean": -0.4308704137802124,
"epsilon_dpo/beta_margin_grad_std": 0.10768142342567444,
"epsilon_dpo/beta_margin_mean": 0.29134950041770935,
"epsilon_dpo/beta_margin_std": 0.45370638370513916,
"epsilon_dpo/loss_margin_mean": 48.56650161743164,
"grad_norm": 14.203643798828125,
"kl/avg_steps": 0.5,
"kl/beta": 0.006089345086365938,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.261184375888541e-09,
"logits/chosen": -0.07608947157859802,
"logits/rejected": -0.18557631969451904,
"logps/chosen": -162.5103759765625,
"logps/ref_chosen": -65.30903625488281,
"logps/ref_rejected": -83.61613464355469,
"logps/rejected": -229.3839874267578,
"loss": 1.1657,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.590510368347168,
"rewards/margins": 0.29134950041770935,
"rewards/rejected": -0.8818598985671997,
"step": 643
},
{
"epoch": 0.9735449735449735,
"epsilon_dpo/beta": 0.006029357668012381,
"epsilon_dpo/beta_margin_grad_mean": -0.4443458318710327,
"epsilon_dpo/beta_margin_grad_std": 0.1099080890417099,
"epsilon_dpo/beta_margin_mean": 0.23319894075393677,
"epsilon_dpo/beta_margin_std": 0.4639197289943695,
"epsilon_dpo/loss_margin_mean": 39.140132904052734,
"grad_norm": 7.422348976135254,
"kl/avg_steps": 0.5,
"kl/beta": 0.006059050094336271,
"kl/n_epsilon_steps": 0.25,
"kl/p_epsilon_steps": 0.75,
"learning_rate": 1.1320193567288527e-09,
"logits/chosen": 0.1682870090007782,
"logits/rejected": 0.06035337597131729,
"logps/chosen": -138.48989868164062,
"logps/ref_chosen": -51.002601623535156,
"logps/ref_rejected": -64.46372985839844,
"logps/rejected": -191.09115600585938,
"loss": 1.2187,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5294017791748047,
"rewards/margins": 0.23319892585277557,
"rewards/rejected": -0.7626007199287415,
"step": 644
},
{
"epoch": 0.9750566893424036,
"epsilon_dpo/beta": 0.005991823971271515,
"epsilon_dpo/beta_margin_grad_mean": -0.43631860613822937,
"epsilon_dpo/beta_margin_grad_std": 0.0921812355518341,
"epsilon_dpo/beta_margin_mean": 0.2657393515110016,
"epsilon_dpo/beta_margin_std": 0.38939616084098816,
"epsilon_dpo/loss_margin_mean": 44.66914367675781,
"grad_norm": 9.235962867736816,
"kl/avg_steps": 0.625,
"kl/beta": 0.006028905510902405,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.0098157099674987e-09,
"logits/chosen": -0.05183897912502289,
"logits/rejected": -0.06940633058547974,
"logps/chosen": -157.7047576904297,
"logps/ref_chosen": -60.963409423828125,
"logps/ref_rejected": -69.73353576660156,
"logps/rejected": -211.14404296875,
"loss": 1.1747,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5809457302093506,
"rewards/margins": 0.2657393515110016,
"rewards/rejected": -0.8466850519180298,
"step": 645
},
{
"epoch": 0.9765684051398337,
"epsilon_dpo/beta": 0.00597146013751626,
"epsilon_dpo/beta_margin_grad_mean": -0.44182419776916504,
"epsilon_dpo/beta_margin_grad_std": 0.10553637892007828,
"epsilon_dpo/beta_margin_mean": 0.2444935441017151,
"epsilon_dpo/beta_margin_std": 0.4472936987876892,
"epsilon_dpo/loss_margin_mean": 41.44504165649414,
"grad_norm": 8.651565551757812,
"kl/avg_steps": 0.34375,
"kl/beta": 0.005991458892822266,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 8.945768539031783e-10,
"logits/chosen": 0.010120227932929993,
"logits/rejected": -0.12322086095809937,
"logps/chosen": -169.28077697753906,
"logps/ref_chosen": -62.290069580078125,
"logps/ref_rejected": -85.54812622070312,
"logps/rejected": -233.98387145996094,
"loss": 1.2049,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6408818960189819,
"rewards/margins": 0.2444935441017151,
"rewards/rejected": -0.8853753805160522,
"step": 646
},
{
"epoch": 0.9780801209372638,
"epsilon_dpo/beta": 0.0059379409067332745,
"epsilon_dpo/beta_margin_grad_mean": -0.41022220253944397,
"epsilon_dpo/beta_margin_grad_std": 0.09677625447511673,
"epsilon_dpo/beta_margin_mean": 0.3789673447608948,
"epsilon_dpo/beta_margin_std": 0.41520026326179504,
"epsilon_dpo/loss_margin_mean": 64.20808410644531,
"grad_norm": 9.409882545471191,
"kl/avg_steps": 0.5625,
"kl/beta": 0.005970933474600315,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 7.863060120144316e-10,
"logits/chosen": 0.005722839385271072,
"logits/rejected": -0.20680958032608032,
"logps/chosen": -170.15533447265625,
"logps/ref_chosen": -67.515869140625,
"logps/ref_rejected": -101.50870513916016,
"logps/rejected": -268.35626220703125,
"loss": 1.0838,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.6111813187599182,
"rewards/margins": 0.3789673447608948,
"rewards/rejected": -0.990148663520813,
"step": 647
},
{
"epoch": 0.9795918367346939,
"epsilon_dpo/beta": 0.005915860645473003,
"epsilon_dpo/beta_margin_grad_mean": -0.43487077951431274,
"epsilon_dpo/beta_margin_grad_std": 0.10711178928613663,
"epsilon_dpo/beta_margin_mean": 0.2727292478084564,
"epsilon_dpo/beta_margin_std": 0.45299896597862244,
"epsilon_dpo/loss_margin_mean": 46.654109954833984,
"grad_norm": 8.898358345031738,
"kl/avg_steps": 0.375,
"kl/beta": 0.005937534850090742,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 6.850062128694045e-10,
"logits/chosen": -0.00845257192850113,
"logits/rejected": -0.14589478075504303,
"logps/chosen": -168.66171264648438,
"logps/ref_chosen": -64.59593963623047,
"logps/ref_rejected": -83.384033203125,
"logps/rejected": -234.10391235351562,
"loss": 1.1816,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.6165870428085327,
"rewards/margins": 0.2727292478084564,
"rewards/rejected": -0.8893162608146667,
"step": 648
},
{
"epoch": 0.981103552532124,
"epsilon_dpo/beta": 0.005895608104765415,
"epsilon_dpo/beta_margin_grad_mean": -0.4329400062561035,
"epsilon_dpo/beta_margin_grad_std": 0.10516858845949173,
"epsilon_dpo/beta_margin_mean": 0.2821688950061798,
"epsilon_dpo/beta_margin_std": 0.4424353837966919,
"epsilon_dpo/loss_margin_mean": 48.406612396240234,
"grad_norm": 13.302745819091797,
"kl/avg_steps": 0.34375,
"kl/beta": 0.005915352609008551,
"kl/n_epsilon_steps": 0.328125,
"kl/p_epsilon_steps": 0.671875,
"learning_rate": 5.906802900412788e-10,
"logits/chosen": 0.13613608479499817,
"logits/rejected": -0.032275184988975525,
"logps/chosen": -145.61695861816406,
"logps/ref_chosen": -49.30964660644531,
"logps/ref_rejected": -73.73710632324219,
"logps/rejected": -218.45101928710938,
"loss": 1.1712,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.5696060657501221,
"rewards/margins": 0.2821689248085022,
"rewards/rejected": -0.8517749309539795,
"step": 649
},
{
"epoch": 0.982615268329554,
"epsilon_dpo/beta": 0.005869883578270674,
"epsilon_dpo/beta_margin_grad_mean": -0.43242478370666504,
"epsilon_dpo/beta_margin_grad_std": 0.10855328291654587,
"epsilon_dpo/beta_margin_mean": 0.2853633165359497,
"epsilon_dpo/beta_margin_std": 0.45738157629966736,
"epsilon_dpo/loss_margin_mean": 49.12057113647461,
"grad_norm": 9.074873924255371,
"kl/avg_steps": 0.4375,
"kl/beta": 0.0058950879611074924,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.71875,
"learning_rate": 5.033308820289184e-10,
"logits/chosen": 0.19129210710525513,
"logits/rejected": 0.010112637653946877,
"logps/chosen": -146.40533447265625,
"logps/ref_chosen": -55.063262939453125,
"logps/ref_rejected": -77.39610290527344,
"logps/rejected": -217.85873413085938,
"loss": 1.1717,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5381441116333008,
"rewards/margins": 0.2853633165359497,
"rewards/rejected": -0.8235074281692505,
"step": 650
},
{
"epoch": 0.9841269841269841,
"epsilon_dpo/beta": 0.005842480808496475,
"epsilon_dpo/beta_margin_grad_mean": -0.4416995942592621,
"epsilon_dpo/beta_margin_grad_std": 0.09576379507780075,
"epsilon_dpo/beta_margin_mean": 0.24522654712200165,
"epsilon_dpo/beta_margin_std": 0.4043896496295929,
"epsilon_dpo/loss_margin_mean": 42.36793518066406,
"grad_norm": 9.8654146194458,
"kl/avg_steps": 0.46875,
"kl/beta": 0.0058694095350801945,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 4.2296043218295606e-10,
"logits/chosen": 0.03646399453282356,
"logits/rejected": -0.15612871944904327,
"logps/chosen": -145.41241455078125,
"logps/ref_chosen": -54.065162658691406,
"logps/ref_rejected": -77.79080200195312,
"logps/rejected": -211.50599670410156,
"loss": 1.1955,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5349258184432983,
"rewards/margins": 0.24522654712200165,
"rewards/rejected": -0.7801523208618164,
"step": 651
},
{
"epoch": 0.9856386999244142,
"epsilon_dpo/beta": 0.005820699501782656,
"epsilon_dpo/beta_margin_grad_mean": -0.4430865943431854,
"epsilon_dpo/beta_margin_grad_std": 0.10328911244869232,
"epsilon_dpo/beta_margin_mean": 0.24002714455127716,
"epsilon_dpo/beta_margin_std": 0.43497779965400696,
"epsilon_dpo/loss_margin_mean": 41.7305908203125,
"grad_norm": 9.020734786987305,
"kl/avg_steps": 0.375,
"kl/beta": 0.005842024926096201,
"kl/n_epsilon_steps": 0.3125,
"kl/p_epsilon_steps": 0.6875,
"learning_rate": 3.4957118863768176e-10,
"logits/chosen": 0.025039512664079666,
"logits/rejected": -0.11147890985012054,
"logps/chosen": -171.20289611816406,
"logps/ref_chosen": -63.64030456542969,
"logps/ref_rejected": -78.86882019042969,
"logps/rejected": -228.16201782226562,
"loss": 1.2064,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6275283098220825,
"rewards/margins": 0.24002712965011597,
"rewards/rejected": -0.8675554990768433,
"step": 652
},
{
"epoch": 0.9871504157218443,
"epsilon_dpo/beta": 0.005788039416074753,
"epsilon_dpo/beta_margin_grad_mean": -0.4308336079120636,
"epsilon_dpo/beta_margin_grad_std": 0.1006578877568245,
"epsilon_dpo/beta_margin_mean": 0.29018673300743103,
"epsilon_dpo/beta_margin_std": 0.4255160391330719,
"epsilon_dpo/loss_margin_mean": 50.542213439941406,
"grad_norm": 9.18583869934082,
"kl/avg_steps": 0.5625,
"kl/beta": 0.005820199381560087,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 2.831652042480093e-10,
"logits/chosen": -0.06760972738265991,
"logits/rejected": -0.08101306855678558,
"logps/chosen": -156.74609375,
"logps/ref_chosen": -61.668373107910156,
"logps/ref_rejected": -73.83012390136719,
"logps/rejected": -219.45004272460938,
"loss": 1.1607,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5514776706695557,
"rewards/margins": 0.29018670320510864,
"rewards/rejected": -0.8416643738746643,
"step": 653
},
{
"epoch": 0.9886621315192744,
"epsilon_dpo/beta": 0.005761090200394392,
"epsilon_dpo/beta_margin_grad_mean": -0.4454101026058197,
"epsilon_dpo/beta_margin_grad_std": 0.0978466123342514,
"epsilon_dpo/beta_margin_mean": 0.2307618260383606,
"epsilon_dpo/beta_margin_std": 0.4153948426246643,
"epsilon_dpo/loss_margin_mean": 40.47480392456055,
"grad_norm": 9.93721866607666,
"kl/avg_steps": 0.46875,
"kl/beta": 0.005787643603980541,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 2.2374433653205016e-10,
"logits/chosen": 0.05396203696727753,
"logits/rejected": -0.205628901720047,
"logps/chosen": -157.49917602539062,
"logps/ref_chosen": -57.568267822265625,
"logps/ref_rejected": -87.74789428710938,
"logps/rejected": -228.15362548828125,
"loss": 1.2103,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5768218040466309,
"rewards/margins": 0.2307618260383606,
"rewards/rejected": -0.8075836896896362,
"step": 654
},
{
"epoch": 0.9901738473167044,
"epsilon_dpo/beta": 0.0057252091355621815,
"epsilon_dpo/beta_margin_grad_mean": -0.4167655408382416,
"epsilon_dpo/beta_margin_grad_std": 0.0851697325706482,
"epsilon_dpo/beta_margin_mean": 0.3463904857635498,
"epsilon_dpo/beta_margin_std": 0.3574642539024353,
"epsilon_dpo/loss_margin_mean": 60.82048034667969,
"grad_norm": 8.491602897644043,
"kl/avg_steps": 0.625,
"kl/beta": 0.00576064083725214,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.7131024761923852e-10,
"logits/chosen": 0.1099543422460556,
"logits/rejected": -0.13104557991027832,
"logps/chosen": -132.05859375,
"logps/ref_chosen": -52.14714813232422,
"logps/ref_rejected": -80.85014343261719,
"logps/rejected": -221.5820770263672,
"loss": 1.1005,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.4581993818283081,
"rewards/margins": 0.3463904559612274,
"rewards/rejected": -0.8045898675918579,
"step": 655
},
{
"epoch": 0.9916855631141346,
"epsilon_dpo/beta": 0.005693227518349886,
"epsilon_dpo/beta_margin_grad_mean": -0.4306899905204773,
"epsilon_dpo/beta_margin_grad_std": 0.09381554275751114,
"epsilon_dpo/beta_margin_mean": 0.2897292375564575,
"epsilon_dpo/beta_margin_std": 0.3953634798526764,
"epsilon_dpo/loss_margin_mean": 51.26100540161133,
"grad_norm": 7.579216957092285,
"kl/avg_steps": 0.5625,
"kl/beta": 0.005724860355257988,
"kl/n_epsilon_steps": 0.21875,
"kl/p_epsilon_steps": 0.78125,
"learning_rate": 1.2586440420372934e-10,
"logits/chosen": -0.06490539014339447,
"logits/rejected": -0.1359993815422058,
"logps/chosen": -173.83035278320312,
"logps/ref_chosen": -73.25672912597656,
"logps/ref_rejected": -85.35127258300781,
"logps/rejected": -237.1859130859375,
"loss": 1.1551,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5735797882080078,
"rewards/margins": 0.2897292375564575,
"rewards/rejected": -0.8633090257644653,
"step": 656
},
{
"epoch": 0.9931972789115646,
"epsilon_dpo/beta": 0.005656044464558363,
"epsilon_dpo/beta_margin_grad_mean": -0.42085394263267517,
"epsilon_dpo/beta_margin_grad_std": 0.10377608239650726,
"epsilon_dpo/beta_margin_mean": 0.3315429091453552,
"epsilon_dpo/beta_margin_std": 0.4385998249053955,
"epsilon_dpo/loss_margin_mean": 58.998226165771484,
"grad_norm": 8.478572845458984,
"kl/avg_steps": 0.65625,
"kl/beta": 0.005692838225513697,
"kl/n_epsilon_steps": 0.171875,
"kl/p_epsilon_steps": 0.828125,
"learning_rate": 8.740807750345913e-11,
"logits/chosen": 0.1666242778301239,
"logits/rejected": -0.060559555888175964,
"logps/chosen": -141.16455078125,
"logps/ref_chosen": -49.72339630126953,
"logps/ref_rejected": -75.15686798095703,
"logps/rejected": -225.59625244140625,
"loss": 1.1284,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5178928375244141,
"rewards/margins": 0.3315429091453552,
"rewards/rejected": -0.8494357466697693,
"step": 657
},
{
"epoch": 0.9947089947089947,
"epsilon_dpo/beta": 0.005629774183034897,
"epsilon_dpo/beta_margin_grad_mean": -0.4411674439907074,
"epsilon_dpo/beta_margin_grad_std": 0.11220408231019974,
"epsilon_dpo/beta_margin_mean": 0.24884627759456635,
"epsilon_dpo/beta_margin_std": 0.47748851776123047,
"epsilon_dpo/loss_margin_mean": 44.730255126953125,
"grad_norm": 8.150103569030762,
"kl/avg_steps": 0.46875,
"kl/beta": 0.005655722226947546,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 5.594234322453539e-11,
"logits/chosen": -0.02786184474825859,
"logits/rejected": -0.12675124406814575,
"logps/chosen": -161.846435546875,
"logps/ref_chosen": -63.04634094238281,
"logps/ref_rejected": -83.44963073730469,
"logps/rejected": -226.97998046875,
"loss": 1.2077,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5570150017738342,
"rewards/margins": 0.24884626269340515,
"rewards/rejected": -0.8058612942695618,
"step": 658
},
{
"epoch": 0.9962207105064248,
"epsilon_dpo/beta": 0.0056035080924630165,
"epsilon_dpo/beta_margin_grad_mean": -0.4466729164123535,
"epsilon_dpo/beta_margin_grad_std": 0.09939718246459961,
"epsilon_dpo/beta_margin_mean": 0.2224022001028061,
"epsilon_dpo/beta_margin_std": 0.4150841534137726,
"epsilon_dpo/loss_margin_mean": 40.16176223754883,
"grad_norm": 9.186923027038574,
"kl/avg_steps": 0.46875,
"kl/beta": 0.005629335064440966,
"kl/n_epsilon_steps": 0.265625,
"kl/p_epsilon_steps": 0.734375,
"learning_rate": 3.146808153123293e-11,
"logits/chosen": 0.09668943285942078,
"logits/rejected": -0.12483270466327667,
"logps/chosen": -154.25985717773438,
"logps/ref_chosen": -55.0802001953125,
"logps/ref_rejected": -71.91049194335938,
"logps/rejected": -211.25192260742188,
"loss": 1.2181,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5565832853317261,
"rewards/margins": 0.22240221500396729,
"rewards/rejected": -0.7789855003356934,
"step": 659
},
{
"epoch": 0.9977324263038548,
"epsilon_dpo/beta": 0.005568607710301876,
"epsilon_dpo/beta_margin_grad_mean": -0.4257502555847168,
"epsilon_dpo/beta_margin_grad_std": 0.08676893264055252,
"epsilon_dpo/beta_margin_mean": 0.3090023696422577,
"epsilon_dpo/beta_margin_std": 0.3653712570667267,
"epsilon_dpo/loss_margin_mean": 55.83287048339844,
"grad_norm": 9.039102554321289,
"kl/avg_steps": 0.625,
"kl/beta": 0.005603070370852947,
"kl/n_epsilon_steps": 0.1875,
"kl/p_epsilon_steps": 0.8125,
"learning_rate": 1.3985977021235829e-11,
"logits/chosen": 0.10768848657608032,
"logits/rejected": -0.08223304152488708,
"logps/chosen": -153.62896728515625,
"logps/ref_chosen": -54.52591323852539,
"logps/ref_rejected": -81.23603820800781,
"logps/rejected": -236.17196655273438,
"loss": 1.1332,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.5523468255996704,
"rewards/margins": 0.3090023398399353,
"rewards/rejected": -0.8613492250442505,
"step": 660
},
{
"epoch": 0.999244142101285,
"epsilon_dpo/beta": 0.005545323248952627,
"epsilon_dpo/beta_margin_grad_mean": -0.4474778175354004,
"epsilon_dpo/beta_margin_grad_std": 0.10211808234453201,
"epsilon_dpo/beta_margin_mean": 0.21895082294940948,
"epsilon_dpo/beta_margin_std": 0.42851126194000244,
"epsilon_dpo/loss_margin_mean": 39.99132537841797,
"grad_norm": 7.923947334289551,
"kl/avg_steps": 0.421875,
"kl/beta": 0.005568268708884716,
"kl/n_epsilon_steps": 0.28125,
"kl/p_epsilon_steps": 0.703125,
"learning_rate": 3.4965187065971735e-12,
"logits/chosen": 0.03417160362005234,
"logits/rejected": -0.1054316833615303,
"logps/chosen": -170.80929565429688,
"logps/ref_chosen": -60.372642517089844,
"logps/ref_rejected": -77.42874908447266,
"logps/rejected": -227.85675048828125,
"loss": 1.2239,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6141051054000854,
"rewards/margins": 0.21895083785057068,
"rewards/rejected": -0.8330559730529785,
"step": 661
},
{
"epoch": 0.999244142101285,
"step": 661,
"total_flos": 0.0,
"train_loss": 1.1175190241903112,
"train_runtime": 3196.4458,
"train_samples_per_second": 13.245,
"train_steps_per_second": 0.207
}
],
"logging_steps": 1,
"max_steps": 661,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}